; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512DQ

;
; add
;

define <4 x i32> @trunc_add_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; SSE-LABEL: trunc_add_v4i64_v4i32:
; SSE: # BB#0:
; SSE-NEXT: paddq %xmm3, %xmm1
; SSE-NEXT: paddq %xmm2, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_v4i64_v4i32:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_v4i64_v4i32:
; AVX2: # BB#0:
; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_add_v4i64_v4i32:
; AVX512: # BB#0:
; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = add <4 x i64> %a0, %a1
  %2 = trunc <4 x i64> %1 to <4 x i32>
  ret <4 x i32> %2
}

define <8 x i16> @trunc_add_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; SSE-LABEL: trunc_add_v8i64_v8i16:
; SSE: # BB#0:
; SSE-NEXT: paddq %xmm4, %xmm0
; SSE-NEXT: paddq %xmm5, %xmm1
; SSE-NEXT: paddq %xmm6, %xmm2
; SSE-NEXT: paddq %xmm7, %xmm3
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; SSE-NEXT: movapd %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_v8i64_v8i16:
; AVX1: # BB#0:
; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7]
; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_v8i64_v8i16:
; AVX2: # BB#0:
; AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_add_v8i64_v8i16:
; AVX512: # BB#0:
; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = add <8 x i64> %a0, %a1
  %2 = trunc <8 x i64> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <8 x i16> @trunc_add_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; SSE-LABEL: trunc_add_v8i32_v8i16:
; SSE: # BB#0:
; SSE-NEXT: paddd %xmm2, %xmm0
; SSE-NEXT: paddd %xmm3, %xmm1
; SSE-NEXT: pslld $16, %xmm1
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_v8i32_v8i16:
; AVX1: # BB#0:
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_v8i32_v8i16:
; AVX2: # BB#0:
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_add_v8i32_v8i16:
; AVX512: # BB#0:
; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = add <8 x i32> %a0, %a1
  %2 = trunc <8 x i32> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <16 x i8> @trunc_add_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
; SSE-LABEL: trunc_add_v16i64_v16i8:
; SSE: # BB#0:
; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm5
; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm6
; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm7
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT: pand %xmm8, %xmm7
; SSE-NEXT: pand %xmm8, %xmm6
; SSE-NEXT: packuswb %xmm7, %xmm6
; SSE-NEXT: pand %xmm8, %xmm5
; SSE-NEXT: pand %xmm8, %xmm4
; SSE-NEXT: packuswb %xmm5, %xmm4
; SSE-NEXT: packuswb %xmm6, %xmm4
; SSE-NEXT: pand %xmm8, %xmm3
; SSE-NEXT: pand %xmm8, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm8, %xmm1
; SSE-NEXT: pand %xmm8, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm4, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_v16i64_v16i8:
; AVX1: # BB#0:
; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm8
; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpaddq %xmm6, %xmm2, %xmm5
; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm6
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vpaddq %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vpaddq %xmm7, %xmm3, %xmm6
; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vpaddq %xmm7, %xmm3, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6
; AVX1-NEXT: vpackuswb %xmm3, %xmm6, %xmm3
; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5
; AVX1-NEXT: vpackuswb %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm3
; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpand %xmm7, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm3
; AVX1-NEXT: vpackuswb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_v16i64_v16i8:
; AVX2: # BB#0:
; AVX2-NEXT: vpaddq %ymm5, %ymm1, %ymm1
; AVX2-NEXT: vpaddq %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpaddq %ymm7, %ymm3, %ymm3
; AVX2-NEXT: vpaddq %ymm6, %ymm2, %ymm2
; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_add_v16i64_v16i8:
; AVX512: # BB#0:
; AVX512-NEXT: vpaddq %zmm3, %zmm1, %zmm1
; AVX512-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm1, %ymm1
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = add <16 x i64> %a0, %a1
  %2 = trunc <16 x i64> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_add_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
; SSE-LABEL: trunc_add_v16i32_v16i8:
; SSE: # BB#0:
; SSE-NEXT: paddd %xmm4, %xmm0
; SSE-NEXT: paddd %xmm5, %xmm1
; SSE-NEXT: paddd %xmm6, %xmm2
; SSE-NEXT: paddd %xmm7, %xmm3
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: pand %xmm4, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm4, %xmm1
; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_v16i32_v16i8:
; AVX1: # BB#0:
; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm2
; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_v16i32_v16i8:
; AVX2: # BB#0:
; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_add_v16i32_v16i8:
; AVX512: # BB#0:
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = add <16 x i32> %a0, %a1
  %2 = trunc <16 x i32> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_add_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE-LABEL: trunc_add_v16i16_v16i8:
; SSE: # BB#0:
; SSE-NEXT: paddw %xmm2, %xmm0
; SSE-NEXT: paddw %xmm3, %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_v16i16_v16i8:
; AVX1: # BB#0:
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_v16i16_v16i8:
; AVX2: # BB#0:
; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_add_v16i16_v16i8:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_add_v16i16_v16i8:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_add_v16i16_v16i8:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
  %1 = add <16 x i16> %a0, %a1
  %2 = trunc <16 x i16> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <8 x i16> @trunc_add_v8i32_v8i16_sext_8i8(<16 x i8> %a0, <8 x i32> %a1) {
; SSE-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
; SSE: # BB#0:
; SSE-NEXT: pslld $16, %xmm2
; SSE-NEXT: psrad $16, %xmm2
; SSE-NEXT: pslld $16, %xmm1
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: packssdw %xmm2, %xmm1
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT: psraw $8, %xmm0
; SSE-NEXT: paddw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
; AVX2: # BB#0:
; AVX2-NEXT: vpmovsxbw %xmm0, %xmm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
; AVX512: # BB#0:
; AVX512-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
; AVX512-NEXT: vpmovdw %zmm1, %ymm1
; AVX512-NEXT: vpmovsxbw %xmm0, %xmm0
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %2 = sext <8 x i8> %1 to <8 x i32>
  %3 = add <8 x i32> %2, %a1
  %4 = trunc <8 x i32> %3 to <8 x i16>
  ret <8 x i16> %4
}

;
; add to constant
;

define <4 x i32> @trunc_add_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v4i64_v4i32:
; SSE: # BB#0:
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: paddd {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_const_v4i64_v4i32:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_const_v4i64_v4i32:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_add_const_v4i64_v4i32:
; AVX512: # BB#0:
; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = add <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  ret <4 x i32> %2
}

define <8 x i16> @trunc_add_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v8i64_v8i16:
; SSE: # BB#0:
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; SSE-NEXT: paddw {{.*}}(%rip), %xmm2
; SSE-NEXT: movdqa %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_const_v8i64_v8i16:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_const_v8i64_v8i16:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_add_const_v8i64_v8i16:
; AVX512: # BB#0:
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = add <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
  %2 = trunc <8 x i64> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <8 x i16> @trunc_add_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v8i32_v8i16:
; SSE: # BB#0:
; SSE-NEXT: pslld $16, %xmm1
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: paddw {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_const_v8i32_v8i16:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_const_v8i32_v8i16:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_add_const_v8i32_v8i16:
; AVX512: # BB#0:
; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = add <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %2 = trunc <8 x i32> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v16i64_v16i8:
; SSE: # BB#0:
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT: pand %xmm8, %xmm7
; SSE-NEXT: pand %xmm8, %xmm6
; SSE-NEXT: packuswb %xmm7, %xmm6
; SSE-NEXT: pand %xmm8, %xmm5
; SSE-NEXT: pand %xmm8, %xmm4
; SSE-NEXT: packuswb %xmm5, %xmm4
; SSE-NEXT: packuswb %xmm6, %xmm4
; SSE-NEXT: pand %xmm8, %xmm3
; SSE-NEXT: pand %xmm8, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm8, %xmm1
; SSE-NEXT: pand %xmm8, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm4, %xmm0
; SSE-NEXT: paddb {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_const_v16i64_v16i8:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_const_v16i64_v16i8:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_add_const_v16i64_v16i8:
; AVX512: # BB#0:
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm1, %ymm1
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = add <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
  %2 = trunc <16 x i64> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v16i32_v16i8:
; SSE: # BB#0:
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: pand %xmm4, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm4, %xmm1
; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: paddb {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_const_v16i32_v16i8:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_const_v16i32_v16i8:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_add_const_v16i32_v16i8:
; AVX512: # BB#0:
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = add <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %2 = trunc <16 x i32> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_add_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v16i16_v16i8:
; SSE: # BB#0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: paddb {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_const_v16i16_v16i8:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_const_v16i16_v16i8:
; AVX2: # BB#0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_add_const_v16i16_v16i8:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_add_const_v16i16_v16i8:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_add_const_v16i16_v16i8:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
  %1 = add <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
  %2 = trunc <16 x i16> %1 to <16 x i8>
  ret <16 x i8> %2
}

;
; sub
;

define <4 x i32> @trunc_sub_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; SSE-LABEL: trunc_sub_v4i64_v4i32:
; SSE: # BB#0:
; SSE-NEXT: psubq %xmm3, %xmm1
; SSE-NEXT: psubq %xmm2, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_v4i64_v4i32:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_sub_v4i64_v4i32:
; AVX2: # BB#0:
; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_sub_v4i64_v4i32:
; AVX512: # BB#0:
; AVX512-NEXT: vpsubq %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = sub <4 x i64> %a0, %a1
  %2 = trunc <4 x i64> %1 to <4 x i32>
  ret <4 x i32> %2
}

define <8 x i16> @trunc_sub_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; SSE-LABEL: trunc_sub_v8i64_v8i16:
; SSE: # BB#0:
; SSE-NEXT: psubq %xmm4, %xmm0
; SSE-NEXT: psubq %xmm5, %xmm1
; SSE-NEXT: psubq %xmm6, %xmm2
; SSE-NEXT: psubq %xmm7, %xmm3
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; SSE-NEXT: movapd %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_v8i64_v8i16:
; AVX1: # BB#0:
; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7]
; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_sub_v8i64_v8i16:
; AVX2: # BB#0:
; AVX2-NEXT: vpsubq %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_sub_v8i64_v8i16:
; AVX512: # BB#0:
; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = sub <8 x i64> %a0, %a1
  %2 = trunc <8 x i64> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <8 x i16> @trunc_sub_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; SSE-LABEL: trunc_sub_v8i32_v8i16:
; SSE: # BB#0:
; SSE-NEXT: psubd %xmm2, %xmm0
; SSE-NEXT: psubd %xmm3, %xmm1
; SSE-NEXT: pslld $16, %xmm1
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_v8i32_v8i16:
; AVX1: # BB#0:
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_sub_v8i32_v8i16:
; AVX2: # BB#0:
; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_sub_v8i32_v8i16:
; AVX512: # BB#0:
; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = sub <8 x i32> %a0, %a1
  %2 = trunc <8 x i32> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <16 x i8> @trunc_sub_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
; SSE-LABEL: trunc_sub_v16i64_v16i8:
; SSE: # BB#0:
; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm5
; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm6
; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm7
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT: pand %xmm8, %xmm7
; SSE-NEXT: pand %xmm8, %xmm6
; SSE-NEXT: packuswb %xmm7, %xmm6
; SSE-NEXT: pand %xmm8, %xmm5
; SSE-NEXT: pand %xmm8, %xmm4
; SSE-NEXT: packuswb %xmm5, %xmm4
; SSE-NEXT: packuswb %xmm6, %xmm4
; SSE-NEXT: pand %xmm8, %xmm3
; SSE-NEXT: pand %xmm8, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm8, %xmm1
; SSE-NEXT: pand %xmm8, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm4, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_v16i64_v16i8:
; AVX1: # BB#0:
; AVX1-NEXT: vpsubq %xmm4, %xmm0, %xmm8
; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsubq %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpsubq %xmm5, %xmm1, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpsubq %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpsubq %xmm6, %xmm2, %xmm5
; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm6
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vpsubq %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vpsubq %xmm7, %xmm3, %xmm6
; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vpsubq %xmm7, %xmm3, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6
; AVX1-NEXT: vpackuswb %xmm3, %xmm6, %xmm3
; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5
; AVX1-NEXT: vpackuswb %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm3
; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpand %xmm7, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm3
; AVX1-NEXT: vpackuswb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_sub_v16i64_v16i8:
; AVX2: # BB#0:
; AVX2-NEXT: vpsubq %ymm5, %ymm1, %ymm1
; AVX2-NEXT: vpsubq %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpsubq %ymm7, %ymm3, %ymm3
; AVX2-NEXT: vpsubq %ymm6, %ymm2, %ymm2
; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_sub_v16i64_v16i8:
; AVX512: # BB#0:
; AVX512-NEXT: vpsubq %zmm3, %zmm1, %zmm1
; AVX512-NEXT: vpsubq %zmm2, %zmm0, %zmm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm1, %ymm1
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = sub <16 x i64> %a0, %a1
%2 = trunc <16 x i64> %1 to <16 x i8>
ret <16 x i8> %2
}

define <16 x i8> @trunc_sub_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
; SSE-LABEL: trunc_sub_v16i32_v16i8:
; SSE: # BB#0:
; SSE-NEXT: psubd %xmm4, %xmm0
; SSE-NEXT: psubd %xmm5, %xmm1
; SSE-NEXT: psubd %xmm6, %xmm2
; SSE-NEXT: psubd %xmm7, %xmm3
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: pand %xmm4, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm4, %xmm1
; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_v16i32_v16i8:
; AVX1: # BB#0:
; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm2
; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_sub_v16i32_v16i8:
; AVX2: # BB#0:
; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsubd %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_sub_v16i32_v16i8:
; AVX512: # BB#0:
; AVX512-NEXT: vpsubd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = sub <16 x i32> %a0, %a1
%2 = trunc <16 x i32> %1 to <16 x i8>
ret <16 x i8> %2
}

define <16 x i8> @trunc_sub_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE-LABEL: trunc_sub_v16i16_v16i8:
; SSE: # BB#0:
; SSE-NEXT: psubw %xmm2, %xmm0
; SSE-NEXT: psubw %xmm3, %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_v16i16_v16i8:
; AVX1: # BB#0:
; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_sub_v16i16_v16i8:
; AVX2: # BB#0:
; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_sub_v16i16_v16i8:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpsubw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_sub_v16i16_v16i8:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vpsubw %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_sub_v16i16_v16i8:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: vpsubw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%1 = sub <16 x i16> %a0, %a1
%2 = trunc <16 x i16> %1 to <16 x i8>
ret <16 x i8> %2
}

;
; sub to constant
;

define <4 x i32> @trunc_sub_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
; SSE-LABEL: trunc_sub_const_v4i64_v4i32:
; SSE: # BB#0:
; SSE-NEXT: movl $1, %eax
; SSE-NEXT: movq %rax, %xmm2
; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
; SSE-NEXT: psubq %xmm2, %xmm0
; SSE-NEXT: psubq {{.*}}(%rip), %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_const_v4i64_v4i32:
; AVX1: # BB#0:
; AVX1-NEXT: movl $1, %eax
; AVX1-NEXT: vmovq %rax, %xmm1
; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[0,2]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_sub_const_v4i64_v4i32:
; AVX2: # BB#0:
; AVX2-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_sub_const_v4i64_v4i32:
; AVX512: # BB#0:
; AVX512-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = sub <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
%2 = trunc <4 x i64> %1 to <4 x i32>
ret <4 x i32> %2
}

define <8 x i16> @trunc_sub_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
; SSE-LABEL: trunc_sub_const_v8i64_v8i16:
; SSE: # BB#0:
; SSE-NEXT: movl $1, %eax
; SSE-NEXT: movq %rax, %xmm4
; SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
; SSE-NEXT: psubq %xmm4, %xmm0
; SSE-NEXT: psubq {{.*}}(%rip), %xmm1
; SSE-NEXT: psubq {{.*}}(%rip), %xmm2
; SSE-NEXT: psubq {{.*}}(%rip), %xmm3
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; SSE-NEXT: movapd %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_const_v8i64_v8i16:
; AVX1: # BB#0:
; AVX1-NEXT: movl $1, %eax
; AVX1-NEXT: vmovq %rax, %xmm2
; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm1, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7]
; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7]
; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_sub_const_v8i64_v8i16:
; AVX2: # BB#0:
; AVX2-NEXT: vpsubq {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_sub_const_v8i64_v8i16:
; AVX512: # BB#0:
; AVX512-NEXT: vpsubq {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = sub <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
%2 = trunc <8 x i64> %1 to <8 x i16>
ret <8 x i16> %2
}

define <8 x i16> @trunc_sub_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
; SSE-LABEL: trunc_sub_const_v8i32_v8i16:
; SSE: # BB#0:
; SSE-NEXT: psubd {{.*}}(%rip), %xmm0
; SSE-NEXT: psubd {{.*}}(%rip), %xmm1
; SSE-NEXT: pslld $16, %xmm1
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_const_v8i32_v8i16:
; AVX1: # BB#0:
; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_sub_const_v8i32_v8i16:
; AVX2: # BB#0:
; AVX2-NEXT: vpsubd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_sub_const_v8i32_v8i16:
; AVX512: # BB#0:
; AVX512-NEXT: vpsubd {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = sub <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%2 = trunc <8 x i32> %1 to <8 x i16>
ret <8 x i16> %2
}

define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; SSE-LABEL: trunc_sub_const_v16i64_v16i8:
; SSE: # BB#0:
; SSE-NEXT: movl $1, %eax
; SSE-NEXT: movq %rax, %xmm8
; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7]
; SSE-NEXT: psubq %xmm8, %xmm0
; SSE-NEXT: psubq {{.*}}(%rip), %xmm1
; SSE-NEXT: psubq {{.*}}(%rip), %xmm2
; SSE-NEXT: psubq {{.*}}(%rip), %xmm3
; SSE-NEXT: psubq {{.*}}(%rip), %xmm4
; SSE-NEXT: psubq {{.*}}(%rip), %xmm5
; SSE-NEXT: psubq {{.*}}(%rip), %xmm6
; SSE-NEXT: psubq {{.*}}(%rip), %xmm7
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT: pand %xmm8, %xmm7
; SSE-NEXT: pand %xmm8, %xmm6
; SSE-NEXT: packuswb %xmm7, %xmm6
; SSE-NEXT: pand %xmm8, %xmm5
; SSE-NEXT: pand %xmm8, %xmm4
; SSE-NEXT: packuswb %xmm5, %xmm4
; SSE-NEXT: packuswb %xmm6, %xmm4
; SSE-NEXT: pand %xmm8, %xmm3
; SSE-NEXT: pand %xmm8, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm8, %xmm1
; SSE-NEXT: pand %xmm8, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm4, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_const_v16i64_v16i8:
; AVX1: # BB#0:
; AVX1-NEXT: movl $1, %eax
; AVX1-NEXT: vmovq %rax, %xmm4
; AVX1-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
; AVX1-NEXT: vpsubq %xmm4, %xmm0, %xmm8
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm1, %xmm5
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm2, %xmm6
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm3, %xmm7
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm3, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm7
; AVX1-NEXT: vpackuswb %xmm3, %xmm7, %xmm3
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm4, %xmm6, %xmm6
; AVX1-NEXT: vpackuswb %xmm2, %xmm6, %xmm2
; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm3
; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm8, %xmm3
; AVX1-NEXT: vpackuswb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_sub_const_v16i64_v16i8:
; AVX2: # BB#0:
; AVX2-NEXT: vpsubq {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpsubq {{.*}}(%rip), %ymm3, %ymm3
; AVX2-NEXT: vpsubq {{.*}}(%rip), %ymm2, %ymm2
; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_sub_const_v16i64_v16i8:
; AVX512: # BB#0:
; AVX512-NEXT: vpsubq {{.*}}(%rip), %zmm1, %zmm1
; AVX512-NEXT: vpsubq {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm1, %ymm1
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = sub <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
%2 = trunc <16 x i64> %1 to <16 x i8>
ret <16 x i8> %2
}

define <16 x i8> @trunc_sub_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
; SSE-LABEL: trunc_sub_const_v16i32_v16i8:
; SSE: # BB#0:
; SSE-NEXT: psubd {{.*}}(%rip), %xmm0
; SSE-NEXT: psubd {{.*}}(%rip), %xmm1
; SSE-NEXT: psubd {{.*}}(%rip), %xmm2
; SSE-NEXT: psubd {{.*}}(%rip), %xmm3
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: pand %xmm4, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm4, %xmm1
; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_const_v16i32_v16i8:
; AVX1: # BB#0:
; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm1, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_sub_const_v16i32_v16i8:
; AVX2: # BB#0:
; AVX2-NEXT: vpsubd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpsubd {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_sub_const_v16i32_v16i8:
; AVX512: # BB#0:
; AVX512-NEXT: vpsubd {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = sub <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%2 = trunc <16 x i32> %1 to <16 x i8>
ret <16 x i8> %2
}

define <16 x i8> @trunc_sub_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; SSE-LABEL: trunc_sub_const_v16i16_v16i8:
; SSE: # BB#0:
; SSE-NEXT: psubw {{.*}}(%rip), %xmm0
; SSE-NEXT: psubw {{.*}}(%rip), %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_const_v16i16_v16i8:
; AVX1: # BB#0:
; AVX1-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_sub_const_v16i16_v16i8:
; AVX2: # BB#0:
; AVX2-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_sub_const_v16i16_v16i8:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_sub_const_v16i16_v16i8:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_sub_const_v16i16_v16i8:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%1 = sub <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
%2 = trunc <16 x i16> %1 to <16 x i8>
ret <16 x i8> %2
}

;
; mul
;

define <4 x i32> @trunc_mul_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; SSE-LABEL: trunc_mul_v4i64_v4i32:
; SSE: # BB#0:
; SSE-NEXT: movdqa %xmm1, %xmm4
; SSE-NEXT: psrlq $32, %xmm4
; SSE-NEXT: pmuludq %xmm3, %xmm4
; SSE-NEXT: movdqa %xmm3, %xmm5
; SSE-NEXT: psrlq $32, %xmm5
; SSE-NEXT: pmuludq %xmm1, %xmm5
; SSE-NEXT: paddq %xmm4, %xmm5
; SSE-NEXT: psllq $32, %xmm5
; SSE-NEXT: pmuludq %xmm3, %xmm1
; SSE-NEXT: paddq %xmm5, %xmm1
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: psrlq $32, %xmm3
; SSE-NEXT: pmuludq %xmm2, %xmm3
; SSE-NEXT: movdqa %xmm2, %xmm4
; SSE-NEXT: psrlq $32, %xmm4
; SSE-NEXT: pmuludq %xmm0, %xmm4
; SSE-NEXT: paddq %xmm3, %xmm4
; SSE-NEXT: psllq $32, %xmm4
; SSE-NEXT: pmuludq %xmm2, %xmm0
; SSE-NEXT: paddq %xmm4, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_mul_v4i64_v4i32:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_mul_v4i64_v4i32:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_mul_v4i64_v4i32:
; AVX512F: # BB#0:
; AVX512F-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
; AVX512F-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
; AVX512F-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_mul_v4i64_v4i32:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_mul_v4i64_v4i32:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%1 = mul <4 x i64> %a0, %a1
%2 = trunc <4 x i64> %1 to <4 x i32>
ret <4 x i32> %2
}

define <8 x i16> @trunc_mul_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; SSE-LABEL: trunc_mul_v8i64_v8i16:
; SSE: # BB#0:
; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,0,2,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,0,2,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; SSE-NEXT: pmullw %xmm6, %xmm2
; SSE-NEXT: movdqa %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_mul_v8i64_v8i16:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4],xmm5[5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1,2,3],xmm3[4],xmm5[5,6,7]
; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4],xmm5[5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1,2,3],xmm2[4],xmm5[5,6,7]
; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1,2,3],xmm3[4],xmm5[5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1,2,3],xmm1[4],xmm5[5,6,7]
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1,2,3],xmm3[4],xmm5[5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1,2,3],xmm0[4],xmm5[5,6,7]
; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_mul_v8i64_v8i16:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_mul_v8i64_v8i16:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpmovqw %zmm1, %xmm1
; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
; AVX512F-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_mul_v8i64_v8i16:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vpmovqw %zmm1, %xmm1
; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_mul_v8i64_v8i16:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovqw %zmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%1 = mul <8 x i64> %a0, %a1
%2 = trunc <8 x i64> %1 to <8 x i16>
ret <8 x i16> %2
}

define <8 x i16> @trunc_mul_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; SSE-LABEL: trunc_mul_v8i32_v8i16:
; SSE: # BB#0:
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; SSE-NEXT: pmuludq %xmm2, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE-NEXT: pmuludq %xmm4, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE-NEXT: pmuludq %xmm3, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE-NEXT: pmuludq %xmm2, %xmm3
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE-NEXT: pslld $16, %xmm1
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_mul_v8i32_v8i16:
; AVX1: # BB#0:
; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_mul_v8i32_v8i16:
; AVX2: # BB#0:
; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_mul_v8i32_v8i16:
; AVX512: # BB#0:
; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = mul <8 x i32> %a0, %a1
  %2 = trunc <8 x i32> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
; SSE-LABEL: trunc_mul_v16i64_v16i8:
; SSE: # BB#0:
; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
; SSE-NEXT: movdqa %xmm0, %xmm9
; SSE-NEXT: psrlq $32, %xmm9
; SSE-NEXT: pmuludq %xmm8, %xmm9
; SSE-NEXT: movdqa %xmm8, %xmm10
; SSE-NEXT: psrlq $32, %xmm10
; SSE-NEXT: pmuludq %xmm0, %xmm10
; SSE-NEXT: paddq %xmm9, %xmm10
; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
; SSE-NEXT: psllq $32, %xmm10
; SSE-NEXT: pmuludq %xmm8, %xmm0
; SSE-NEXT: paddq %xmm10, %xmm0
; SSE-NEXT: movdqa %xmm1, %xmm8
; SSE-NEXT: psrlq $32, %xmm8
; SSE-NEXT: pmuludq %xmm9, %xmm8
; SSE-NEXT: movdqa %xmm9, %xmm10
; SSE-NEXT: psrlq $32, %xmm10
; SSE-NEXT: pmuludq %xmm1, %xmm10
; SSE-NEXT: paddq %xmm8, %xmm10
; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
; SSE-NEXT: psllq $32, %xmm10
; SSE-NEXT: pmuludq %xmm9, %xmm1
; SSE-NEXT: paddq %xmm10, %xmm1
; SSE-NEXT: movdqa %xmm2, %xmm9
; SSE-NEXT: psrlq $32, %xmm9
; SSE-NEXT: pmuludq %xmm8, %xmm9
; SSE-NEXT: movdqa %xmm8, %xmm10
; SSE-NEXT: psrlq $32, %xmm10
; SSE-NEXT: pmuludq %xmm2, %xmm10
; SSE-NEXT: paddq %xmm9, %xmm10
; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
; SSE-NEXT: psllq $32, %xmm10
; SSE-NEXT: pmuludq %xmm8, %xmm2
; SSE-NEXT: paddq %xmm10, %xmm2
; SSE-NEXT: movdqa %xmm3, %xmm8
; SSE-NEXT: psrlq $32, %xmm8
; SSE-NEXT: pmuludq %xmm9, %xmm8
; SSE-NEXT: movdqa %xmm9, %xmm10
; SSE-NEXT: psrlq $32, %xmm10
; SSE-NEXT: pmuludq %xmm3, %xmm10
; SSE-NEXT: paddq %xmm8, %xmm10
; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
; SSE-NEXT: psllq $32, %xmm10
; SSE-NEXT: pmuludq %xmm9, %xmm3
; SSE-NEXT: paddq %xmm10, %xmm3
; SSE-NEXT: movdqa %xmm4, %xmm9
; SSE-NEXT: psrlq $32, %xmm9
; SSE-NEXT: pmuludq %xmm8, %xmm9
; SSE-NEXT: movdqa %xmm8, %xmm10
; SSE-NEXT: psrlq $32, %xmm10
; SSE-NEXT: pmuludq %xmm4, %xmm10
; SSE-NEXT: paddq %xmm9, %xmm10
; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
; SSE-NEXT: psllq $32, %xmm10
; SSE-NEXT: pmuludq %xmm8, %xmm4
; SSE-NEXT: paddq %xmm10, %xmm4
; SSE-NEXT: movdqa %xmm5, %xmm8
; SSE-NEXT: psrlq $32, %xmm8
; SSE-NEXT: pmuludq %xmm9, %xmm8
; SSE-NEXT: movdqa %xmm9, %xmm10
; SSE-NEXT: psrlq $32, %xmm10
; SSE-NEXT: pmuludq %xmm5, %xmm10
; SSE-NEXT: paddq %xmm8, %xmm10
; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
; SSE-NEXT: psllq $32, %xmm10
; SSE-NEXT: pmuludq %xmm9, %xmm5
; SSE-NEXT: paddq %xmm10, %xmm5
; SSE-NEXT: movdqa %xmm6, %xmm9
; SSE-NEXT: psrlq $32, %xmm9
; SSE-NEXT: pmuludq %xmm8, %xmm9
; SSE-NEXT: movdqa %xmm8, %xmm10
; SSE-NEXT: psrlq $32, %xmm10
; SSE-NEXT: pmuludq %xmm6, %xmm10
; SSE-NEXT: paddq %xmm9, %xmm10
; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
; SSE-NEXT: psllq $32, %xmm10
; SSE-NEXT: pmuludq %xmm8, %xmm6
; SSE-NEXT: paddq %xmm10, %xmm6
; SSE-NEXT: movdqa %xmm7, %xmm8
; SSE-NEXT: psrlq $32, %xmm8
; SSE-NEXT: pmuludq %xmm9, %xmm8
; SSE-NEXT: movdqa %xmm9, %xmm10
; SSE-NEXT: psrlq $32, %xmm10
; SSE-NEXT: pmuludq %xmm7, %xmm10
; SSE-NEXT: paddq %xmm8, %xmm10
; SSE-NEXT: pmuludq %xmm9, %xmm7
; SSE-NEXT: psllq $32, %xmm10
; SSE-NEXT: paddq %xmm10, %xmm7
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT: pand %xmm8, %xmm7
; SSE-NEXT: pand %xmm8, %xmm6
; SSE-NEXT: packuswb %xmm7, %xmm6
; SSE-NEXT: pand %xmm8, %xmm5
; SSE-NEXT: pand %xmm8, %xmm4
; SSE-NEXT: packuswb %xmm5, %xmm4
; SSE-NEXT: packuswb %xmm6, %xmm4
; SSE-NEXT: pand %xmm8, %xmm3
; SSE-NEXT: pand %xmm8, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm8, %xmm1
; SSE-NEXT: pand %xmm8, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm4, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_mul_v16i64_v16i8:
; AVX1: # BB#0:
; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm8
; AVX1-NEXT: vpmuludq %xmm4, %xmm8, %xmm8
; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm9
; AVX1-NEXT: vpmuludq %xmm9, %xmm0, %xmm9
; AVX1-NEXT: vpaddq %xmm8, %xmm9, %xmm8
; AVX1-NEXT: vpsllq $32, %xmm8, %xmm8
; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm9
; AVX1-NEXT: vpaddq %xmm8, %xmm9, %xmm8
; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm9
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm4
; AVX1-NEXT: vpmuludq %xmm9, %xmm4, %xmm10
; AVX1-NEXT: vpsrlq $32, %xmm9, %xmm4
; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm4
; AVX1-NEXT: vpaddq %xmm10, %xmm4, %xmm4
; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
; AVX1-NEXT: vpmuludq %xmm9, %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm9
; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4
; AVX1-NEXT: vpmuludq %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm0
; AVX1-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm4
; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm10
; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm5
; AVX1-NEXT: vpmuludq %xmm0, %xmm5, %xmm5
; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm4
; AVX1-NEXT: vpmuludq %xmm4, %xmm1, %xmm4
; AVX1-NEXT: vpaddq %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
; AVX1-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm1
; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm0
; AVX1-NEXT: vpmuludq %xmm6, %xmm0, %xmm0
; AVX1-NEXT: vpsrlq $32, %xmm6, %xmm4
; AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm4
; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm4
; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm5
; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm4
; AVX1-NEXT: vpmuludq %xmm0, %xmm4, %xmm4
; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm6
; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm6
; AVX1-NEXT: vpaddq %xmm4, %xmm6, %xmm4
; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
; AVX1-NEXT: vpmuludq %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm2
; AVX1-NEXT: vpmuludq %xmm7, %xmm2, %xmm2
; AVX1-NEXT: vpsrlq $32, %xmm7, %xmm4
; AVX1-NEXT: vpmuludq %xmm4, %xmm3, %xmm4
; AVX1-NEXT: vpaddq %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
; AVX1-NEXT: vpmuludq %xmm7, %xmm3, %xmm4
; AVX1-NEXT: vpaddq %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm6
; AVX1-NEXT: vpmuludq %xmm4, %xmm6, %xmm6
; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm7
; AVX1-NEXT: vpmuludq %xmm7, %xmm3, %xmm7
; AVX1-NEXT: vpaddq %xmm6, %xmm7, %xmm6
; AVX1-NEXT: vpsllq $32, %xmm6, %xmm6
; AVX1-NEXT: vpmuludq %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpaddq %xmm6, %xmm3, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm3
; AVX1-NEXT: vpackuswb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm4, %xmm10, %xmm2
; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpand %xmm4, %xmm9, %xmm2
; AVX1-NEXT: vpand %xmm4, %xmm8, %xmm3
; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_mul_v16i64_v16i8:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
; AVX2-NEXT: vpmulld %xmm7, %xmm3, %xmm3
; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-NEXT: vpmulld %xmm6, %xmm2, %xmm2
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm2
; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vpmulld %xmm5, %xmm1, %xmm1
; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpmulld %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_mul_v16i64_v16i8:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpmovqd %zmm3, %ymm3
; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
; AVX512F-NEXT: vpmulld %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpmovqd %zmm2, %ymm2
; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
; AVX512F-NEXT: vpmulld %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_mul_v16i64_v16i8:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vpmovqd %zmm3, %ymm3
; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT: vpmulld %ymm3, %ymm1, %ymm1
; AVX512BW-NEXT: vpmovqd %zmm2, %ymm2
; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT: vpmulld %ymm2, %ymm0, %ymm0
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_mul_v16i64_v16i8:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: vpmullq %zmm3, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmullq %zmm2, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
  %1 = mul <16 x i64> %a0, %a1
  %2 = trunc <16 x i64> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_mul_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
; SSE-LABEL: trunc_mul_v16i32_v16i8:
; SSE: # BB#0:
; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3]
; SSE-NEXT: pmuludq %xmm4, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE-NEXT: pmuludq %xmm8, %xmm4
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSE-NEXT: pmuludq %xmm5, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; SSE-NEXT: pmuludq %xmm4, %xmm5
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
; SSE-NEXT: pmuludq %xmm6, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
; SSE-NEXT: pmuludq %xmm4, %xmm5
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
; SSE-NEXT: pmuludq %xmm7, %xmm3
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
; SSE-NEXT: pmuludq %xmm4, %xmm5
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: pand %xmm4, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm4, %xmm1
; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_mul_v16i32_v16i8:
; AVX1: # BB#0:
; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm2
; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_mul_v16i32_v16i8:
; AVX2: # BB#0:
; AVX2-NEXT: vpmulld %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpmulld %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_mul_v16i32_v16i8:
; AVX512: # BB#0:
; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = mul <16 x i32> %a0, %a1
  %2 = trunc <16 x i32> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_mul_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE-LABEL: trunc_mul_v16i16_v16i8:
; SSE: # BB#0:
; SSE-NEXT: pmullw %xmm2, %xmm0
; SSE-NEXT: pmullw %xmm3, %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_mul_v16i16_v16i8:
; AVX1: # BB#0:
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_mul_v16i16_v16i8:
; AVX2: # BB#0:
; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_mul_v16i16_v16i8:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_mul_v16i16_v16i8:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_mul_v16i16_v16i8:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
  %1 = mul <16 x i16> %a0, %a1
  %2 = trunc <16 x i16> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <8 x i16> @trunc_mul_v8i32_v8i16_zext_8i8(<16 x i8> %a0, <8 x i32> %a1) {
; SSE-LABEL: trunc_mul_v8i32_v8i16_zext_8i8:
; SSE: # BB#0:
; SSE-NEXT: pxor %xmm3, %xmm3
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
; SSE-NEXT: pslld $16, %xmm2
; SSE-NEXT: psrad $16, %xmm2
; SSE-NEXT: pslld $16, %xmm1
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: packssdw %xmm2, %xmm1
; SSE-NEXT: pmullw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_mul_v8i32_v8i16_zext_8i8:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_mul_v8i32_v8i16_zext_8i8:
; AVX2: # BB#0:
; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_mul_v8i32_v8i16_zext_8i8:
; AVX512: # BB#0:
; AVX512-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
; AVX512-NEXT: vpmovdw %zmm1, %ymm1
; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %2 = zext <8 x i8> %1 to <8 x i32>
  %3 = mul <8 x i32> %2, %a1
  %4 = trunc <8 x i32> %3 to <8 x i16>
  ret <8 x i16> %4
}

;
; mul to constant
;

define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
; SSE-LABEL: trunc_mul_const_v4i64_v4i32:
; SSE: # BB#0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2,3]
; SSE-NEXT: movdqa %xmm1, %xmm3
; SSE-NEXT: pmuludq %xmm2, %xmm3
; SSE-NEXT: psrlq $32, %xmm1
; SSE-NEXT: pmuludq %xmm2, %xmm1
; SSE-NEXT: psllq $32, %xmm1
; SSE-NEXT: paddq %xmm3, %xmm1
; SSE-NEXT: movl $1, %eax
; SSE-NEXT: movq %rax, %xmm2
; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: pmuludq %xmm2, %xmm3
; SSE-NEXT: psrlq $32, %xmm0
; SSE-NEXT: pmuludq %xmm2, %xmm0
; SSE-NEXT: psllq $32, %xmm0
; SSE-NEXT: paddq %xmm3, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_mul_const_v4i64_v4i32:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_mul_const_v4i64_v4i32:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_mul_const_v4i64_v4i32:
; AVX512: # BB#0:
; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = mul <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  ret <4 x i32> %2
}

define <8 x i16> @trunc_mul_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
|
|
|
|
; SSE-LABEL: trunc_mul_const_v8i64_v8i16:
|
|
|
|
; SSE: # BB#0:
|
2016-10-07 02:58:24 +08:00
|
|
|
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
|
|
|
|
; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
|
|
|
|
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
|
|
|
|
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
|
|
|
|
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
|
|
|
|
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
|
|
|
|
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
|
|
|
|
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
|
|
|
|
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
|
|
|
|
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
|
|
|
|
; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
|
2017-01-04 16:05:42 +08:00
|
|
|
; SSE-NEXT: pmullw {{.*}}(%rip), %xmm2
|
|
|
|
; SSE-NEXT: movdqa %xmm2, %xmm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; SSE-NEXT: retq
|
|
|
|
;
|
2016-12-31 06:40:32 +08:00
|
|
|
; AVX1-LABEL: trunc_mul_const_v8i64_v8i16:
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX1: # BB#0:
|
2017-01-04 16:05:42 +08:00
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
|
|
|
|
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
|
|
|
|
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
|
|
|
|
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
|
|
|
|
; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
|
|
|
|
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
|
|
|
|
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
|
|
|
|
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
|
2017-01-04 16:05:42 +08:00
|
|
|
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX1-NEXT: vzeroupper
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
2016-12-31 06:40:32 +08:00
|
|
|
; AVX2-LABEL: trunc_mul_const_v8i64_v8i16:
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2: # BB#0:
|
2016-07-15 17:49:12 +08:00
|
|
|
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
|
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
|
|
|
|
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
|
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
|
2017-02-06 02:33:14 +08:00
|
|
|
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
|
2017-01-04 16:05:42 +08:00
|
|
|
; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2-NEXT: vzeroupper
|
|
|
|
; AVX2-NEXT: retq
|
|
|
|
;
|
2017-01-04 16:05:42 +08:00
|
|
|
; AVX512-LABEL: trunc_mul_const_v8i64_v8i16:
|
|
|
|
; AVX512: # BB#0:
|
|
|
|
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
|
|
|
|
; AVX512-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
|
2017-03-03 17:03:24 +08:00
|
|
|
; AVX512-NEXT: vzeroupper
|
2017-01-04 16:05:42 +08:00
|
|
|
; AVX512-NEXT: retq
|
2016-03-14 03:08:01 +08:00
|
|
|
%1 = mul <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
|
|
|
|
%2 = trunc <8 x i64> %1 to <8 x i16>
|
|
|
|
ret <8 x i16> %2
|
|
|
|
}
|
|
|
|
|
2016-12-31 06:40:32 +08:00
|
|
|
define <8 x i16> @trunc_mul_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
|
|
|
|
; SSE-LABEL: trunc_mul_const_v8i32_v8i16:
|
2016-03-14 03:08:01 +08:00
|
|
|
; SSE: # BB#0:
|
|
|
|
; SSE-NEXT: pslld $16, %xmm1
|
|
|
|
; SSE-NEXT: psrad $16, %xmm1
|
|
|
|
; SSE-NEXT: pslld $16, %xmm0
|
|
|
|
; SSE-NEXT: psrad $16, %xmm0
|
|
|
|
; SSE-NEXT: packssdw %xmm1, %xmm0
|
2017-01-04 16:05:42 +08:00
|
|
|
; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; SSE-NEXT: retq
|
|
|
|
;
|
2016-12-31 06:40:32 +08:00
|
|
|
; AVX1-LABEL: trunc_mul_const_v8i32_v8i16:
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX1: # BB#0:
|
2017-01-04 16:05:42 +08:00
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
|
|
|
|
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
|
2017-01-04 16:05:42 +08:00
|
|
|
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
|
|
|
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX1-NEXT: vzeroupper
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
2016-12-31 06:40:32 +08:00
|
|
|
; AVX2-LABEL: trunc_mul_const_v8i32_v8i16:
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2: # BB#0:
|
2017-02-06 02:33:14 +08:00
|
|
|
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
|
2017-01-04 16:05:42 +08:00
|
|
|
; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2-NEXT: vzeroupper
|
|
|
|
; AVX2-NEXT: retq
|
|
|
|
;
|
2016-12-31 06:40:32 +08:00
|
|
|
; AVX512-LABEL: trunc_mul_const_v8i32_v8i16:
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX512: # BB#0:
|
2017-01-04 16:05:42 +08:00
|
|
|
; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
|
2017-01-04 16:05:42 +08:00
|
|
|
; AVX512-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
|
2017-03-03 17:03:24 +08:00
|
|
|
; AVX512-NEXT: vzeroupper
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX512-NEXT: retq
|
|
|
|
%1 = mul <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
|
|
|
%2 = trunc <8 x i32> %1 to <8 x i16>
|
|
|
|
ret <8 x i16> %2
|
|
|
|
}
|
|
|
|
|
|
|
|
define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
|
|
|
|
; SSE-LABEL: trunc_mul_const_v16i64_v16i8:
|
|
|
|
; SSE: # BB#0:
|
|
|
|
; SSE-NEXT: movl $1, %eax
|
2017-04-26 15:08:44 +08:00
|
|
|
; SSE-NEXT: movq %rax, %xmm8
|
2016-03-14 03:08:01 +08:00
|
|
|
; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7]
|
|
|
|
; SSE-NEXT: movdqa %xmm0, %xmm9
|
|
|
|
; SSE-NEXT: pmuludq %xmm8, %xmm9
|
|
|
|
; SSE-NEXT: psrlq $32, %xmm0
|
|
|
|
; SSE-NEXT: pmuludq %xmm8, %xmm0
|
|
|
|
; SSE-NEXT: psllq $32, %xmm0
|
|
|
|
; SSE-NEXT: paddq %xmm9, %xmm0
|
|
|
|
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [2,3]
|
|
|
|
; SSE-NEXT: movdqa %xmm1, %xmm9
|
|
|
|
; SSE-NEXT: pmuludq %xmm8, %xmm9
|
|
|
|
; SSE-NEXT: psrlq $32, %xmm1
|
|
|
|
; SSE-NEXT: pmuludq %xmm8, %xmm1
|
|
|
|
; SSE-NEXT: psllq $32, %xmm1
|
|
|
|
; SSE-NEXT: paddq %xmm9, %xmm1
|
|
|
|
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [4,5]
|
|
|
|
; SSE-NEXT: movdqa %xmm2, %xmm9
|
|
|
|
; SSE-NEXT: pmuludq %xmm8, %xmm9
|
|
|
|
; SSE-NEXT: psrlq $32, %xmm2
|
|
|
|
; SSE-NEXT: pmuludq %xmm8, %xmm2
|
|
|
|
; SSE-NEXT: psllq $32, %xmm2
|
|
|
|
; SSE-NEXT: paddq %xmm9, %xmm2
|
|
|
|
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [6,7]
|
|
|
|
; SSE-NEXT: movdqa %xmm3, %xmm9
|
|
|
|
; SSE-NEXT: pmuludq %xmm8, %xmm9
|
|
|
|
; SSE-NEXT: psrlq $32, %xmm3
|
|
|
|
; SSE-NEXT: pmuludq %xmm8, %xmm3
|
|
|
|
; SSE-NEXT: psllq $32, %xmm3
|
|
|
|
; SSE-NEXT: paddq %xmm9, %xmm3
|
|
|
|
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [8,9]
|
|
|
|
; SSE-NEXT: movdqa %xmm4, %xmm9
|
|
|
|
; SSE-NEXT: pmuludq %xmm8, %xmm9
|
|
|
|
; SSE-NEXT: psrlq $32, %xmm4
|
|
|
|
; SSE-NEXT: pmuludq %xmm8, %xmm4
|
|
|
|
; SSE-NEXT: psllq $32, %xmm4
|
|
|
|
; SSE-NEXT: paddq %xmm9, %xmm4
|
|
|
|
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [10,11]
|
|
|
|
; SSE-NEXT: movdqa %xmm5, %xmm9
|
|
|
|
; SSE-NEXT: pmuludq %xmm8, %xmm9
|
|
|
|
; SSE-NEXT: psrlq $32, %xmm5
|
|
|
|
; SSE-NEXT: pmuludq %xmm8, %xmm5
|
|
|
|
; SSE-NEXT: psllq $32, %xmm5
|
|
|
|
; SSE-NEXT: paddq %xmm9, %xmm5
|
|
|
|
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [12,13]
|
|
|
|
; SSE-NEXT: movdqa %xmm6, %xmm9
|
|
|
|
; SSE-NEXT: pmuludq %xmm8, %xmm9
|
|
|
|
; SSE-NEXT: psrlq $32, %xmm6
|
|
|
|
; SSE-NEXT: pmuludq %xmm8, %xmm6
|
|
|
|
; SSE-NEXT: psllq $32, %xmm6
|
|
|
|
; SSE-NEXT: paddq %xmm9, %xmm6
|
|
|
|
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [14,15]
|
|
|
|
; SSE-NEXT: movdqa %xmm7, %xmm9
|
|
|
|
; SSE-NEXT: pmuludq %xmm8, %xmm9
|
|
|
|
; SSE-NEXT: psrlq $32, %xmm7
|
|
|
|
; SSE-NEXT: pmuludq %xmm8, %xmm7
|
|
|
|
; SSE-NEXT: psllq $32, %xmm7
|
|
|
|
; SSE-NEXT: paddq %xmm9, %xmm7
|
|
|
|
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
|
|
|
|
; SSE-NEXT: pand %xmm8, %xmm7
|
|
|
|
; SSE-NEXT: pand %xmm8, %xmm6
|
|
|
|
; SSE-NEXT: packuswb %xmm7, %xmm6
|
|
|
|
; SSE-NEXT: pand %xmm8, %xmm5
|
|
|
|
; SSE-NEXT: pand %xmm8, %xmm4
|
|
|
|
; SSE-NEXT: packuswb %xmm5, %xmm4
|
|
|
|
; SSE-NEXT: packuswb %xmm6, %xmm4
|
|
|
|
; SSE-NEXT: pand %xmm8, %xmm3
|
|
|
|
; SSE-NEXT: pand %xmm8, %xmm2
|
|
|
|
; SSE-NEXT: packuswb %xmm3, %xmm2
|
|
|
|
; SSE-NEXT: pand %xmm8, %xmm1
|
|
|
|
; SSE-NEXT: pand %xmm8, %xmm0
|
|
|
|
; SSE-NEXT: packuswb %xmm1, %xmm0
|
|
|
|
; SSE-NEXT: packuswb %xmm2, %xmm0
|
|
|
|
; SSE-NEXT: packuswb %xmm4, %xmm0
|
|
|
|
; SSE-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX1-LABEL: trunc_mul_const_v16i64_v16i8:
|
|
|
|
; AVX1: # BB#0:
|
|
|
|
; AVX1-NEXT: movl $1, %eax
|
|
|
|
; AVX1-NEXT: vmovq %rax, %xmm4
|
|
|
|
; AVX1-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
|
|
|
|
; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm5
|
|
|
|
; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm6
|
|
|
|
; AVX1-NEXT: vpmuludq %xmm4, %xmm6, %xmm4
|
|
|
|
; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
|
|
|
|
; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm8
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
|
|
|
|
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [2,3]
|
|
|
|
; AVX1-NEXT: vpmuludq %xmm5, %xmm0, %xmm6
|
|
|
|
; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vpmuludq %xmm5, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vpaddq %xmm0, %xmm6, %xmm9
|
|
|
|
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5]
|
|
|
|
; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm6
|
|
|
|
; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm7
|
|
|
|
; AVX1-NEXT: vpmuludq %xmm5, %xmm7, %xmm5
|
|
|
|
; AVX1-NEXT: vpsllq $32, %xmm5, %xmm5
|
|
|
|
; AVX1-NEXT: vpaddq %xmm5, %xmm6, %xmm5
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
|
|
|
|
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [6,7]
|
|
|
|
; AVX1-NEXT: vpmuludq %xmm6, %xmm1, %xmm7
|
|
|
|
; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vpmuludq %xmm6, %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vpaddq %xmm1, %xmm7, %xmm1
|
|
|
|
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9]
|
|
|
|
; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm7
|
|
|
|
; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm4
|
|
|
|
; AVX1-NEXT: vpmuludq %xmm6, %xmm4, %xmm4
|
|
|
|
; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
|
|
|
|
; AVX1-NEXT: vpaddq %xmm4, %xmm7, %xmm4
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
|
|
|
|
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [10,11]
|
|
|
|
; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm7
|
|
|
|
; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm2
|
|
|
|
; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm2
|
|
|
|
; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
|
|
|
|
; AVX1-NEXT: vpaddq %xmm2, %xmm7, %xmm2
|
|
|
|
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [12,13]
|
|
|
|
; AVX1-NEXT: vpmuludq %xmm6, %xmm3, %xmm7
|
|
|
|
; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm0
|
|
|
|
; AVX1-NEXT: vpmuludq %xmm6, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vpaddq %xmm0, %xmm7, %xmm0
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
|
|
|
|
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [14,15]
|
|
|
|
; AVX1-NEXT: vpmuludq %xmm6, %xmm3, %xmm7
|
|
|
|
; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm3
|
|
|
|
; AVX1-NEXT: vpmuludq %xmm6, %xmm3, %xmm3
|
|
|
|
; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3
|
|
|
|
; AVX1-NEXT: vpaddq %xmm3, %xmm7, %xmm3
|
|
|
|
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
|
|
|
|
; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
|
|
|
|
; AVX1-NEXT: vpand %xmm6, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vpand %xmm6, %xmm2, %xmm2
|
|
|
|
; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm3
|
|
|
|
; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
|
|
|
|
; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0
|
|
|
|
; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vpand %xmm6, %xmm5, %xmm2
|
|
|
|
; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
|
|
|
|
; AVX1-NEXT: vpand %xmm6, %xmm9, %xmm2
|
|
|
|
; AVX1-NEXT: vpand %xmm6, %xmm8, %xmm3
|
|
|
|
; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
|
|
|
|
; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
|
|
|
|
; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
|
|
|
|
; AVX1-NEXT: vzeroupper
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX2-LABEL: trunc_mul_const_v16i64_v16i8:
|
|
|
|
; AVX2: # BB#0:
|
2016-07-15 17:49:12 +08:00
|
|
|
; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
|
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
|
2017-01-04 16:05:42 +08:00
|
|
|
; AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm2, %xmm2
|
2016-07-15 17:49:12 +08:00
|
|
|
; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
|
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
|
2017-01-04 16:05:42 +08:00
|
|
|
; AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm3, %xmm3
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
|
2017-02-06 02:33:14 +08:00
|
|
|
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
|
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
|
|
|
|
; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
|
|
|
|
; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
|
2016-07-15 17:49:12 +08:00
|
|
|
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
|
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
|
2017-01-04 16:05:42 +08:00
|
|
|
; AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
|
2016-07-15 17:49:12 +08:00
|
|
|
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
|
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
|
2017-01-04 16:05:42 +08:00
|
|
|
; AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
|
|
|
|
; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
|
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
|
|
|
|
; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
|
|
|
|
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
|
|
|
|
; AVX2-NEXT: vzeroupper
|
|
|
|
; AVX2-NEXT: retq
|
|
|
|
;
|
2017-08-17 23:40:25 +08:00
|
|
|
; AVX512-LABEL: trunc_mul_const_v16i64_v16i8:
|
|
|
|
; AVX512: # BB#0:
|
|
|
|
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
|
|
|
|
; AVX512-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0
|
|
|
|
; AVX512-NEXT: vpmovqd %zmm1, %ymm1
|
|
|
|
; AVX512-NEXT: vpmulld {{.*}}(%rip), %ymm1, %ymm1
|
|
|
|
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
|
|
|
|
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
|
|
|
|
; AVX512-NEXT: vzeroupper
|
|
|
|
; AVX512-NEXT: retq
|
2016-03-14 03:08:01 +08:00
|
|
|
%1 = mul <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
|
|
|
|
%2 = trunc <16 x i64> %1 to <16 x i8>
|
|
|
|
ret <16 x i8> %2
|
|
|
|
}
|
|
|
|
|
|
|
|
define <16 x i8> @trunc_mul_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
|
|
|
|
; SSE-LABEL: trunc_mul_const_v16i32_v16i8:
|
|
|
|
; SSE: # BB#0:
|
|
|
|
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,1,2,3]
|
|
|
|
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
|
|
|
|
; SSE-NEXT: pmuludq %xmm4, %xmm0
|
|
|
|
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
|
|
|
|
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
|
|
|
|
; SSE-NEXT: pmuludq %xmm5, %xmm4
|
|
|
|
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
|
|
|
|
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
|
|
|
|
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [4,5,6,7]
|
|
|
|
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
|
|
|
|
; SSE-NEXT: pmuludq %xmm4, %xmm1
|
|
|
|
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
|
|
|
|
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
|
|
|
|
; SSE-NEXT: pmuludq %xmm5, %xmm4
|
|
|
|
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
|
|
|
|
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
|
|
|
|
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [8,9,10,11]
|
|
|
|
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
|
|
|
|
; SSE-NEXT: pmuludq %xmm4, %xmm2
|
|
|
|
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
|
|
|
|
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
|
|
|
|
; SSE-NEXT: pmuludq %xmm5, %xmm4
|
|
|
|
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
|
|
|
|
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
|
|
|
|
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [12,13,14,15]
|
|
|
|
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
|
|
|
|
; SSE-NEXT: pmuludq %xmm4, %xmm3
|
|
|
|
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
|
|
|
|
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
|
|
|
|
; SSE-NEXT: pmuludq %xmm5, %xmm4
|
|
|
|
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
|
|
|
|
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
|
|
|
|
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
|
|
|
|
; SSE-NEXT: pand %xmm4, %xmm3
|
|
|
|
; SSE-NEXT: pand %xmm4, %xmm2
|
|
|
|
; SSE-NEXT: packuswb %xmm3, %xmm2
|
|
|
|
; SSE-NEXT: pand %xmm4, %xmm1
|
|
|
|
; SSE-NEXT: pand %xmm4, %xmm0
|
|
|
|
; SSE-NEXT: packuswb %xmm1, %xmm0
|
|
|
|
; SSE-NEXT: packuswb %xmm2, %xmm0
|
|
|
|
; SSE-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX1-LABEL: trunc_mul_const_v16i32_v16i8:
|
|
|
|
; AVX1: # BB#0:
|
|
|
|
; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
|
|
|
|
; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm3
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
|
|
|
|
; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
|
|
|
|
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
|
|
|
|
; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
|
|
|
|
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
|
|
|
|
; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0
|
|
|
|
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vzeroupper
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX2-LABEL: trunc_mul_const_v16i32_v16i8:
|
|
|
|
; AVX2: # BB#0:
|
2017-02-06 02:33:14 +08:00
|
|
|
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
|
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
|
2017-01-04 16:05:42 +08:00
|
|
|
; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
|
|
|
|
; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
|
|
|
|
; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
|
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
|
2017-01-04 16:05:42 +08:00
|
|
|
; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
|
|
|
|
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
|
|
|
; AVX2-NEXT: vzeroupper
|
|
|
|
; AVX2-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX512-LABEL: trunc_mul_const_v16i32_v16i8:
|
|
|
|
; AVX512: # BB#0:
|
|
|
|
; AVX512-NEXT: vpmulld {{.*}}(%rip), %zmm0, %zmm0
|
|
|
|
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
|
2017-03-03 17:03:24 +08:00
|
|
|
; AVX512-NEXT: vzeroupper
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX512-NEXT: retq
|
|
|
|
%1 = mul <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
|
|
|
%2 = trunc <16 x i32> %1 to <16 x i8>
|
|
|
|
ret <16 x i8> %2
|
|
|
|
}
|
|
|
|
|
|
|
|
define <16 x i8> @trunc_mul_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
|
|
|
|
; SSE-LABEL: trunc_mul_const_v16i16_v16i8:
|
|
|
|
; SSE: # BB#0:
|
|
|
|
; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
|
|
|
|
; SSE-NEXT: pmullw {{.*}}(%rip), %xmm1
|
|
|
|
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
|
|
|
|
; SSE-NEXT: pand %xmm2, %xmm1
|
|
|
|
; SSE-NEXT: pand %xmm2, %xmm0
|
|
|
|
; SSE-NEXT: packuswb %xmm1, %xmm0
|
|
|
|
; SSE-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX1-LABEL: trunc_mul_const_v16i16_v16i8:
|
|
|
|
; AVX1: # BB#0:
|
|
|
|
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
|
|
|
|
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
|
|
|
|
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
|
|
|
|
; AVX1-NEXT: vzeroupper
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX2-LABEL: trunc_mul_const_v16i16_v16i8:
|
|
|
|
; AVX2: # BB#0:
|
|
|
|
; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
|
|
|
|
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
|
|
|
|
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
|
|
|
|
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
|
|
|
|
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
|
|
|
|
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
|
|
|
; AVX2-NEXT: vzeroupper
|
|
|
|
; AVX2-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX512F-LABEL: trunc_mul_const_v16i16_v16i8:
|
|
|
|
; AVX512F: # BB#0:
|
|
|
|
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
|
|
|
|
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
|
|
|
|
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
|
2017-03-03 17:03:24 +08:00
|
|
|
; AVX512F-NEXT: vzeroupper
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX512F-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX512BW-LABEL: trunc_mul_const_v16i16_v16i8:
|
|
|
|
; AVX512BW: # BB#0:
|
|
|
|
; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
|
|
|
|
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
|
2016-07-09 08:19:07 +08:00
|
|
|
; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
|
2017-03-03 17:03:24 +08:00
|
|
|
; AVX512BW-NEXT: vzeroupper
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX512BW-NEXT: retq
|
2016-12-31 06:43:41 +08:00
|
|
|
;
|
|
|
|
; AVX512DQ-LABEL: trunc_mul_const_v16i16_v16i8:
|
|
|
|
; AVX512DQ: # BB#0:
|
|
|
|
; AVX512DQ-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
|
|
|
|
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
|
|
|
|
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
|
2017-03-03 17:03:24 +08:00
|
|
|
; AVX512DQ-NEXT: vzeroupper
|
2016-12-31 06:43:41 +08:00
|
|
|
; AVX512DQ-NEXT: retq
|
2016-03-14 03:08:01 +08:00
|
|
|
%1 = mul <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
|
|
|
|
%2 = trunc <16 x i16> %1 to <16 x i8>
|
|
|
|
ret <16 x i8> %2
|
|
|
|
}
|
|
|
|
|
|
|
|
;
|
|
|
|
; and
|
|
|
|
;
|
|
|
|
|
2016-12-31 06:40:32 +08:00
|
|
|
define <4 x i32> @trunc_and_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
|
|
|
|
; SSE-LABEL: trunc_and_v4i64_v4i32:
|
2016-03-14 03:08:01 +08:00
|
|
|
; SSE: # BB#0:
; SSE-NEXT: andps %xmm3, %xmm1
|
|
|
|
; SSE-NEXT: andps %xmm2, %xmm0
|
|
|
|
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
|
2016-03-14 03:08:01 +08:00
|
|
|
; SSE-NEXT: retq
|
|
|
|
;
|
2016-12-31 06:40:32 +08:00
|
|
|
; AVX1-LABEL: trunc_and_v4i64_v4i32:
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX1: # BB#0:
|
|
|
|
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX1-NEXT: vzeroupper
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
2016-12-31 06:40:32 +08:00
|
|
|
; AVX2-LABEL: trunc_and_v4i64_v4i32:
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2: # BB#0:
|
2017-09-19 12:39:55 +08:00
|
|
|
; AVX2-NEXT: vandps %ymm1, %ymm0, %ymm0
|
|
|
|
; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
|
|
|
|
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
|
2016-07-09 08:19:07 +08:00
|
|
|
; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2-NEXT: vzeroupper
|
|
|
|
; AVX2-NEXT: retq
|
|
|
|
;
|
2016-12-31 06:40:32 +08:00
|
|
|
; AVX512-LABEL: trunc_and_v4i64_v4i32:
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX512: # BB#0:
|
2016-07-22 13:46:44 +08:00
|
|
|
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
|
2016-07-09 08:19:07 +08:00
|
|
|
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
|
2017-03-03 17:03:24 +08:00
|
|
|
; AVX512-NEXT: vzeroupper
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX512-NEXT: retq
|
|
|
|
%1 = and <4 x i64> %a0, %a1
|
|
|
|
%2 = trunc <4 x i64> %1 to <4 x i32>
|
|
|
|
ret <4 x i32> %2
|
|
|
|
}
|
|
|
|
|
2016-12-31 06:40:32 +08:00
|
|
|
define <8 x i16> @trunc_and_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
|
|
|
|
; SSE-LABEL: trunc_and_v8i64_v8i16:
|
2016-03-14 03:08:01 +08:00
|
|
|
; SSE: # BB#0:
|
|
|
|
; SSE-NEXT: pand %xmm4, %xmm0
|
|
|
|
; SSE-NEXT: pand %xmm5, %xmm1
|
2016-10-07 02:58:24 +08:00
|
|
|
; SSE-NEXT: pand %xmm6, %xmm2
|
|
|
|
; SSE-NEXT: pand %xmm7, %xmm3
|
|
|
|
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
|
|
|
|
; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
|
|
|
|
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
|
|
|
|
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
|
|
|
|
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
|
|
|
|
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
|
|
|
|
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
|
|
|
|
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
|
|
|
|
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
|
|
|
|
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
|
|
|
|
; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
|
|
|
|
; SSE-NEXT: movapd %xmm2, %xmm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; SSE-NEXT: retq
|
|
|
|
;
|
2016-12-31 06:40:32 +08:00
|
|
|
; AVX1-LABEL: trunc_and_v8i64_v8i16:
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX1: # BB#0:
|
|
|
|
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
|
|
|
|
; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
|
|
|
|
; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3
|
|
|
|
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
|
|
|
|
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
|
|
|
|
; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
|
|
|
|
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
|
|
|
|
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
|
|
|
|
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vzeroupper
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
2016-12-31 06:40:32 +08:00
|
|
|
; AVX2-LABEL: trunc_and_v8i64_v8i16:
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2: # BB#0:
|
|
|
|
; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
|
|
|
|
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
|
2016-07-15 17:49:12 +08:00
|
|
|
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
|
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
|
|
|
|
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
|
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
|
2017-02-06 02:33:14 +08:00
|
|
|
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
|
2016-07-09 08:19:07 +08:00
|
|
|
; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2-NEXT: vzeroupper
|
|
|
|
; AVX2-NEXT: retq
|
|
|
|
;
|
2016-12-31 06:40:32 +08:00
|
|
|
; AVX512-LABEL: trunc_and_v8i64_v8i16:
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX512: # BB#0:
|
2016-07-22 13:46:44 +08:00
|
|
|
; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
|
2017-03-03 17:03:24 +08:00
|
|
|
; AVX512-NEXT: vzeroupper
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX512-NEXT: retq
|
|
|
|
%1 = and <8 x i64> %a0, %a1
|
|
|
|
%2 = trunc <8 x i64> %1 to <8 x i16>
|
|
|
|
ret <8 x i16> %2
|
|
|
|
}
|
|
|
|
|
2016-12-31 06:40:32 +08:00
|
|
|
define <8 x i16> @trunc_and_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
|
|
|
|
; SSE-LABEL: trunc_and_v8i32_v8i16:
|
2016-03-14 03:08:01 +08:00
|
|
|
; SSE: # BB#0:
|
|
|
|
; SSE-NEXT: pand %xmm3, %xmm1
|
|
|
|
; SSE-NEXT: pslld $16, %xmm1
|
|
|
|
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; SSE-NEXT: pslld $16, %xmm0
|
|
|
|
; SSE-NEXT: psrad $16, %xmm0
|
|
|
|
; SSE-NEXT: packssdw %xmm1, %xmm0
|
|
|
|
; SSE-NEXT: retq
|
|
|
|
;
|
2016-12-31 06:40:32 +08:00
|
|
|
; AVX1-LABEL: trunc_and_v8i32_v8i16:
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX1: # BB#0:
|
|
|
|
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
|
|
|
|
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
|
|
|
|
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
|
|
|
; AVX1-NEXT: vzeroupper
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
2016-12-31 06:40:32 +08:00
|
|
|
; AVX2-LABEL: trunc_and_v8i32_v8i16:
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2: # BB#0:
|
|
|
|
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
|
2017-02-06 02:33:14 +08:00
|
|
|
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
|
2016-07-09 08:19:07 +08:00
|
|
|
; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2-NEXT: vzeroupper
|
|
|
|
; AVX2-NEXT: retq
|
|
|
|
;
|
2016-12-31 06:40:32 +08:00
|
|
|
; AVX512-LABEL: trunc_and_v8i32_v8i16:
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX512: # BB#0:
|
2016-07-22 13:46:44 +08:00
|
|
|
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
|
2016-07-09 08:19:07 +08:00
|
|
|
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
|
2017-03-03 17:03:24 +08:00
|
|
|
; AVX512-NEXT: vzeroupper
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX512-NEXT: retq
|
|
|
|
%1 = and <8 x i32> %a0, %a1
|
|
|
|
%2 = trunc <8 x i32> %1 to <8 x i16>
|
|
|
|
ret <8 x i16> %2
|
|
|
|
}
|
|
|
|
|
|
|
|
define <16 x i8> @trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
|
|
|
|
; SSE-LABEL: trunc_and_v16i64_v16i8:
|
|
|
|
; SSE: # BB#0:
|
|
|
|
; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm0
|
|
|
|
; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm1
|
|
|
|
; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm2
|
|
|
|
; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm3
|
|
|
|
; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm4
|
|
|
|
; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm5
|
|
|
|
; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm6
|
|
|
|
; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm7
|
|
|
|
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
|
|
|
|
; SSE-NEXT: pand %xmm8, %xmm7
|
|
|
|
; SSE-NEXT: pand %xmm8, %xmm6
|
|
|
|
; SSE-NEXT: packuswb %xmm7, %xmm6
|
|
|
|
; SSE-NEXT: pand %xmm8, %xmm5
|
|
|
|
; SSE-NEXT: pand %xmm8, %xmm4
|
|
|
|
; SSE-NEXT: packuswb %xmm5, %xmm4
|
|
|
|
; SSE-NEXT: packuswb %xmm6, %xmm4
|
|
|
|
; SSE-NEXT: pand %xmm8, %xmm3
|
|
|
|
; SSE-NEXT: pand %xmm8, %xmm2
|
|
|
|
; SSE-NEXT: packuswb %xmm3, %xmm2
|
|
|
|
; SSE-NEXT: pand %xmm8, %xmm1
|
|
|
|
; SSE-NEXT: pand %xmm8, %xmm0
|
|
|
|
; SSE-NEXT: packuswb %xmm1, %xmm0
|
|
|
|
; SSE-NEXT: packuswb %xmm2, %xmm0
|
|
|
|
; SSE-NEXT: packuswb %xmm4, %xmm0
|
|
|
|
; SSE-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX1-LABEL: trunc_and_v16i64_v16i8:
|
|
|
|
; AVX1: # BB#0:
|
|
|
|
; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
|
|
|
|
; AVX1-NEXT: vandps %ymm5, %ymm1, %ymm1
|
|
|
|
; AVX1-NEXT: vandps %ymm6, %ymm2, %ymm2
|
|
|
|
; AVX1-NEXT: vandps %ymm7, %ymm3, %ymm3
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
|
|
|
|
; AVX1-NEXT: vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
|
|
|
|
; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
|
|
|
|
; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
|
|
|
|
; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
|
|
|
|
; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
|
|
|
|
; AVX1-NEXT: vandps %xmm5, %xmm2, %xmm2
|
|
|
|
; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2
|
|
|
|
; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
|
|
|
|
; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
|
|
|
|
; AVX1-NEXT: vandps %xmm5, %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
|
|
|
|
; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
|
|
|
|
; AVX1-NEXT: vandps %xmm5, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vzeroupper
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX2-LABEL: trunc_and_v16i64_v16i8:
|
|
|
|
; AVX2: # BB#0:
|
|
|
|
; AVX2-NEXT: vpand %ymm5, %ymm1, %ymm1
|
|
|
|
; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
|
|
|
|
; AVX2-NEXT: vpand %ymm7, %ymm3, %ymm3
|
|
|
|
; AVX2-NEXT: vpand %ymm6, %ymm2, %ymm2
|
2016-07-15 17:49:12 +08:00
|
|
|
; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
|
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
|
|
|
|
; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
|
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
|
2017-02-06 02:33:14 +08:00
|
|
|
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
|
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
|
|
|
|
; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
|
|
|
|
; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
|
2016-07-15 17:49:12 +08:00
|
|
|
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
|
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
|
|
|
|
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
|
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
|
|
|
|
; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
|
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
|
|
|
|
; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
|
|
|
|
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
|
|
|
|
; AVX2-NEXT: vzeroupper
|
|
|
|
; AVX2-NEXT: retq
|
|
|
|
;
|
2017-08-17 23:40:25 +08:00
|
|
|
; AVX512-LABEL: trunc_and_v16i64_v16i8:
|
|
|
|
; AVX512: # BB#0:
|
|
|
|
; AVX512-NEXT: vpandq %zmm3, %zmm1, %zmm1
|
|
|
|
; AVX512-NEXT: vpandq %zmm2, %zmm0, %zmm0
|
|
|
|
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
|
|
|
|
; AVX512-NEXT: vpmovqd %zmm1, %ymm1
|
|
|
|
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
|
|
|
|
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
|
|
|
|
; AVX512-NEXT: vzeroupper
|
|
|
|
; AVX512-NEXT: retq
|
2016-03-14 03:08:01 +08:00
|
|
|
%1 = and <16 x i64> %a0, %a1
|
|
|
|
%2 = trunc <16 x i64> %1 to <16 x i8>
|
|
|
|
ret <16 x i8> %2
|
|
|
|
}
|
|
|
|
|
|
|
|
define <16 x i8> @trunc_and_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
|
|
|
|
; SSE-LABEL: trunc_and_v16i32_v16i8:
|
|
|
|
; SSE: # BB#0:
|
|
|
|
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
|
|
|
|
; SSE-NEXT: pand %xmm8, %xmm7
|
|
|
|
; SSE-NEXT: pand %xmm3, %xmm7
|
|
|
|
; SSE-NEXT: pand %xmm8, %xmm6
|
|
|
|
; SSE-NEXT: pand %xmm2, %xmm6
|
|
|
|
; SSE-NEXT: packuswb %xmm7, %xmm6
|
|
|
|
; SSE-NEXT: pand %xmm8, %xmm5
|
|
|
|
; SSE-NEXT: pand %xmm1, %xmm5
|
|
|
|
; SSE-NEXT: pand %xmm8, %xmm4
|
|
|
|
; SSE-NEXT: pand %xmm4, %xmm0
|
|
|
|
; SSE-NEXT: packuswb %xmm5, %xmm0
|
|
|
|
; SSE-NEXT: packuswb %xmm6, %xmm0
|
|
|
|
; SSE-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX1-LABEL: trunc_and_v16i32_v16i8:
|
|
|
|
; AVX1: # BB#0:
|
|
|
|
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
|
|
|
|
; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
|
|
|
|
; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
|
|
|
|
; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
|
|
|
|
; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
|
|
|
|
; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
|
|
|
|
; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vzeroupper
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX2-LABEL: trunc_and_v16i32_v16i8:
|
|
|
|
; AVX2: # BB#0:
|
|
|
|
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
|
|
|
|
; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
|
2017-02-06 02:33:14 +08:00
|
|
|
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
|
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
|
|
|
|
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
|
|
|
|
; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
|
|
|
|
; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
|
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
|
|
|
|
; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
|
|
|
|
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
|
|
|
; AVX2-NEXT: vzeroupper
|
|
|
|
; AVX2-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX512-LABEL: trunc_and_v16i32_v16i8:
|
|
|
|
; AVX512: # BB#0:
|
2017-08-31 00:38:33 +08:00
|
|
|
; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
|
2017-03-03 17:03:24 +08:00
|
|
|
; AVX512-NEXT: vzeroupper
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX512-NEXT: retq
|
|
|
|
%1 = and <16 x i32> %a0, %a1
|
|
|
|
%2 = trunc <16 x i32> %1 to <16 x i8>
|
|
|
|
ret <16 x i8> %2
|
|
|
|
}
|
|
|
|
|
|
|
|
define <16 x i8> @trunc_and_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
|
|
|
|
; SSE-LABEL: trunc_and_v16i16_v16i8:
|
|
|
|
; SSE: # BB#0:
|
|
|
|
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
|
|
|
|
; SSE-NEXT: pand %xmm4, %xmm3
|
|
|
|
; SSE-NEXT: pand %xmm1, %xmm3
|
|
|
|
; SSE-NEXT: pand %xmm4, %xmm2
|
|
|
|
; SSE-NEXT: pand %xmm2, %xmm0
|
|
|
|
; SSE-NEXT: packuswb %xmm3, %xmm0
|
|
|
|
; SSE-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX1-LABEL: trunc_and_v16i16_v16i8:
|
|
|
|
; AVX1: # BB#0:
|
|
|
|
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
|
|
|
|
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
|
|
|
|
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
|
|
|
; AVX1-NEXT: vzeroupper
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX2-LABEL: trunc_and_v16i16_v16i8:
|
|
|
|
; AVX2: # BB#0:
|
|
|
|
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
|
|
|
|
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
|
|
|
|
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
|
|
|
|
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
|
|
|
|
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
|
|
|
|
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
|
|
|
; AVX2-NEXT: vzeroupper
|
|
|
|
; AVX2-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX512F-LABEL: trunc_and_v16i16_v16i8:
|
|
|
|
; AVX512F: # BB#0:
|
2016-07-22 13:46:44 +08:00
|
|
|
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
|
|
|
|
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
|
2017-03-03 17:03:24 +08:00
|
|
|
; AVX512F-NEXT: vzeroupper
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX512F-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX512BW-LABEL: trunc_and_v16i16_v16i8:
|
|
|
|
; AVX512BW: # BB#0:
|
2016-07-22 13:46:44 +08:00
|
|
|
; AVX512BW-NEXT: vpand %ymm1, %ymm0, %ymm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
|
2016-07-09 08:19:07 +08:00
|
|
|
; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
|
2017-03-03 17:03:24 +08:00
|
|
|
; AVX512BW-NEXT: vzeroupper
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX512BW-NEXT: retq
|
2016-12-31 06:43:41 +08:00
|
|
|
;
|
|
|
|
; AVX512DQ-LABEL: trunc_and_v16i16_v16i8:
|
|
|
|
; AVX512DQ: # BB#0:
|
|
|
|
; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0
|
|
|
|
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
|
|
|
|
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
|
2017-03-03 17:03:24 +08:00
|
|
|
; AVX512DQ-NEXT: vzeroupper
|
2016-12-31 06:43:41 +08:00
|
|
|
; AVX512DQ-NEXT: retq
|
2016-03-14 03:08:01 +08:00
|
|
|
%1 = and <16 x i16> %a0, %a1
|
|
|
|
%2 = trunc <16 x i16> %1 to <16 x i8>
|
|
|
|
ret <16 x i8> %2
|
|
|
|
}
|
|
|
|
|
|
|
|
;
|
|
|
|
; and to constant
|
|
|
|
;
|
|
|
|
|
2016-12-31 06:40:32 +08:00
|
|
|
define <4 x i32> @trunc_and_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
|
|
|
|
; SSE-LABEL: trunc_and_const_v4i64_v4i32:
|
2016-03-14 03:08:01 +08:00
|
|
|
; SSE: # BB#0:
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
|
2017-01-04 16:05:42 +08:00
|
|
|
; SSE-NEXT: andps {{.*}}(%rip), %xmm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; SSE-NEXT: retq
|
|
|
|
;
|
2016-12-31 06:40:32 +08:00
|
|
|
; AVX1-LABEL: trunc_and_const_v4i64_v4i32:
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX1: # BB#0:
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
|
2017-01-04 16:05:42 +08:00
|
|
|
; AVX1-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX1-NEXT: vzeroupper
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
2016-12-31 06:40:32 +08:00
|
|
|
; AVX2-LABEL: trunc_and_const_v4i64_v4i32:
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2: # BB#0:
|
2017-09-19 12:39:55 +08:00
|
|
|
; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
|
|
|
|
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
|
|
|
|
; AVX2-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2-NEXT: vzeroupper
|
|
|
|
; AVX2-NEXT: retq
|
|
|
|
;
|
2016-12-31 06:40:32 +08:00
|
|
|
; AVX512-LABEL: trunc_and_const_v4i64_v4i32:
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX512: # BB#0:
|
2017-01-04 16:05:42 +08:00
|
|
|
; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
|
2017-01-04 16:05:42 +08:00
|
|
|
; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
|
2017-03-03 17:03:24 +08:00
|
|
|
; AVX512-NEXT: vzeroupper
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX512-NEXT: retq
|
|
|
|
%1 = and <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
|
|
|
|
%2 = trunc <4 x i64> %1 to <4 x i32>
|
|
|
|
ret <4 x i32> %2
|
|
|
|
}
|
|
|
|
|
2016-12-31 06:40:32 +08:00
|
|
|
define <8 x i16> @trunc_and_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
|
|
|
|
; SSE-LABEL: trunc_and_const_v8i64_v8i16:
|
2016-03-14 03:08:01 +08:00
|
|
|
; SSE: # BB#0:
|
2017-01-04 16:05:42 +08:00
|
|
|
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
|
|
|
|
; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
|
|
|
|
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
|
|
|
|
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
|
|
|
|
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
|
2016-10-07 02:58:24 +08:00
|
|
|
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
|
|
|
|
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
|
2017-01-04 16:05:42 +08:00
|
|
|
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
|
|
|
|
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
|
|
|
|
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
|
|
|
|
; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
|
|
|
|
; SSE-NEXT: andpd {{.*}}(%rip), %xmm2
|
|
|
|
; SSE-NEXT: movapd %xmm2, %xmm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; SSE-NEXT: retq
|
|
|
|
;
|
2016-12-31 06:40:32 +08:00
|
|
|
; AVX1-LABEL: trunc_and_const_v8i64_v8i16:
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX1: # BB#0:
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
|
|
|
|
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
|
|
|
|
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
|
|
|
|
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
|
|
|
|
; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
|
|
|
|
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
|
|
|
|
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
|
|
|
|
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
|
2017-01-04 16:05:42 +08:00
|
|
|
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX1-NEXT: vzeroupper
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
2016-12-31 06:40:32 +08:00
|
|
|
; AVX2-LABEL: trunc_and_const_v8i64_v8i16:
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2: # BB#0:
|
2016-07-15 17:49:12 +08:00
|
|
|
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
|
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
|
|
|
|
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
|
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
|
2017-02-06 02:33:14 +08:00
|
|
|
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
|
2017-01-04 16:05:42 +08:00
|
|
|
; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2-NEXT: vzeroupper
|
|
|
|
; AVX2-NEXT: retq
|
|
|
|
;
|
2016-12-31 06:40:32 +08:00
|
|
|
; AVX512-LABEL: trunc_and_const_v8i64_v8i16:
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX512: # BB#0:
|
|
|
|
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
|
2017-01-04 16:05:42 +08:00
|
|
|
; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
|
2017-03-03 17:03:24 +08:00
|
|
|
; AVX512-NEXT: vzeroupper
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX512-NEXT: retq
|
|
|
|
%1 = and <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
|
|
|
|
%2 = trunc <8 x i64> %1 to <8 x i16>
|
|
|
|
ret <8 x i16> %2
|
|
|
|
}
|
|
|
|
|
2016-12-31 06:40:32 +08:00
|
|
|
define <8 x i16> @trunc_and_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
|
|
|
|
; SSE-LABEL: trunc_and_const_v8i32_v8i16:
|
2016-03-14 03:08:01 +08:00
|
|
|
; SSE: # BB#0:
|
|
|
|
; SSE-NEXT: pslld $16, %xmm1
|
|
|
|
; SSE-NEXT: psrad $16, %xmm1
|
|
|
|
; SSE-NEXT: pslld $16, %xmm0
|
|
|
|
; SSE-NEXT: psrad $16, %xmm0
|
|
|
|
; SSE-NEXT: packssdw %xmm1, %xmm0
|
2017-01-04 16:05:42 +08:00
|
|
|
; SSE-NEXT: pand {{.*}}(%rip), %xmm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; SSE-NEXT: retq
|
|
|
|
;
|
2016-12-31 06:40:32 +08:00
|
|
|
; AVX1-LABEL: trunc_and_const_v8i32_v8i16:
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX1: # BB#0:
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
|
|
|
|
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
|
|
|
|
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
2017-01-04 16:05:42 +08:00
|
|
|
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX1-NEXT: vzeroupper
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
2016-12-31 06:40:32 +08:00
|
|
|
; AVX2-LABEL: trunc_and_const_v8i32_v8i16:
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2: # BB#0:
|
2017-02-06 02:33:14 +08:00
|
|
|
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
|
2017-01-04 16:05:42 +08:00
|
|
|
; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2-NEXT: vzeroupper
|
|
|
|
; AVX2-NEXT: retq
|
|
|
|
;
|
2016-12-31 06:40:32 +08:00
|
|
|
; AVX512-LABEL: trunc_and_const_v8i32_v8i16:
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX512: # BB#0:
|
2017-01-04 16:05:42 +08:00
|
|
|
; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
|
2017-01-04 16:05:42 +08:00
|
|
|
; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
|
2017-03-03 17:03:24 +08:00
|
|
|
; AVX512-NEXT: vzeroupper
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX512-NEXT: retq
|
|
|
|
%1 = and <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
|
|
|
%2 = trunc <8 x i32> %1 to <8 x i16>
|
|
|
|
ret <8 x i16> %2
|
|
|
|
}
|
|
|
|
|
|
|
|
define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
|
|
|
|
; SSE-LABEL: trunc_and_const_v16i64_v16i8:
|
|
|
|
; SSE: # BB#0:
|
2017-01-04 16:05:42 +08:00
|
|
|
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
|
|
|
|
; SSE-NEXT: pand %xmm8, %xmm7
|
|
|
|
; SSE-NEXT: pand %xmm8, %xmm6
|
2016-03-14 03:08:01 +08:00
|
|
|
; SSE-NEXT: packuswb %xmm7, %xmm6
|
2017-01-04 16:05:42 +08:00
|
|
|
; SSE-NEXT: pand %xmm8, %xmm5
|
|
|
|
; SSE-NEXT: pand %xmm8, %xmm4
|
2016-03-14 03:08:01 +08:00
|
|
|
; SSE-NEXT: packuswb %xmm5, %xmm4
|
|
|
|
; SSE-NEXT: packuswb %xmm6, %xmm4
|
2017-01-04 16:05:42 +08:00
|
|
|
; SSE-NEXT: pand %xmm8, %xmm3
|
|
|
|
; SSE-NEXT: pand %xmm8, %xmm2
|
2016-03-14 03:08:01 +08:00
|
|
|
; SSE-NEXT: packuswb %xmm3, %xmm2
|
2017-01-04 16:05:42 +08:00
|
|
|
; SSE-NEXT: pand %xmm8, %xmm1
|
2016-03-14 03:08:01 +08:00
|
|
|
; SSE-NEXT: pand %xmm8, %xmm0
|
|
|
|
; SSE-NEXT: packuswb %xmm1, %xmm0
|
|
|
|
; SSE-NEXT: packuswb %xmm2, %xmm0
|
|
|
|
; SSE-NEXT: packuswb %xmm4, %xmm0
|
2017-01-04 16:05:42 +08:00
|
|
|
; SSE-NEXT: pand {{.*}}(%rip), %xmm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; SSE-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX1-LABEL: trunc_and_const_v16i64_v16i8:
|
|
|
|
; AVX1: # BB#0:
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
|
2017-02-11 13:32:57 +08:00
|
|
|
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
|
|
|
|
; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4
|
|
|
|
; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
|
2017-02-11 13:32:57 +08:00
|
|
|
; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4
|
|
|
|
; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2
|
|
|
|
; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
|
2017-02-11 13:32:57 +08:00
|
|
|
; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
|
|
|
|
; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
|
2017-02-11 13:32:57 +08:00
|
|
|
; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
|
|
|
|
; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
|
2017-01-04 16:05:42 +08:00
|
|
|
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX1-NEXT: vzeroupper
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX2-LABEL: trunc_and_const_v16i64_v16i8:
|
|
|
|
; AVX2: # BB#0:
|
2016-07-15 17:49:12 +08:00
|
|
|
; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
|
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
|
|
|
|
; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
|
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
|
2017-02-06 02:33:14 +08:00
|
|
|
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
|
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
|
|
|
|
; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
|
|
|
|
; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
|
2016-07-15 17:49:12 +08:00
|
|
|
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
|
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
|
|
|
|
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
|
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
|
|
|
|
; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
|
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
|
|
|
|
; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
|
|
|
|
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
|
2017-01-04 16:05:42 +08:00
|
|
|
; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2-NEXT: vzeroupper
|
|
|
|
; AVX2-NEXT: retq
|
|
|
|
;
|
2017-08-17 23:40:25 +08:00
|
|
|
; AVX512-LABEL: trunc_and_const_v16i64_v16i8:
|
|
|
|
; AVX512: # BB#0:
|
|
|
|
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
|
|
|
|
; AVX512-NEXT: vpmovqd %zmm1, %ymm1
|
|
|
|
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
|
|
|
|
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
|
|
|
|
; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
|
|
|
|
; AVX512-NEXT: vzeroupper
|
|
|
|
; AVX512-NEXT: retq
|
2016-03-14 03:08:01 +08:00
|
|
|
%1 = and <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
|
|
|
|
%2 = trunc <16 x i64> %1 to <16 x i8>
|
|
|
|
ret <16 x i8> %2
|
|
|
|
}
|
|
|
|
|
|
|
|
define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
|
|
|
|
; SSE-LABEL: trunc_and_const_v16i32_v16i8:
|
|
|
|
; SSE: # BB#0:
|
|
|
|
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
|
|
|
|
; SSE-NEXT: pand %xmm4, %xmm3
|
|
|
|
; SSE-NEXT: pand %xmm4, %xmm2
|
|
|
|
; SSE-NEXT: packuswb %xmm3, %xmm2
|
|
|
|
; SSE-NEXT: pand %xmm4, %xmm1
|
|
|
|
; SSE-NEXT: pand %xmm4, %xmm0
|
|
|
|
; SSE-NEXT: packuswb %xmm1, %xmm0
|
|
|
|
; SSE-NEXT: packuswb %xmm2, %xmm0
|
2017-01-04 16:05:42 +08:00
|
|
|
; SSE-NEXT: pand {{.*}}(%rip), %xmm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; SSE-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX1-LABEL: trunc_and_const_v16i32_v16i8:
|
|
|
|
; AVX1: # BB#0:
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
|
2017-02-11 13:32:57 +08:00
|
|
|
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
|
|
|
|
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
|
|
|
|
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
|
2017-02-11 13:32:57 +08:00
|
|
|
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
|
|
|
|
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
|
2017-01-04 16:05:42 +08:00
|
|
|
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX1-NEXT: vzeroupper
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX2-LABEL: trunc_and_const_v16i32_v16i8:
|
|
|
|
; AVX2: # BB#0:
|
2017-02-06 02:33:14 +08:00
|
|
|
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
|
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
|
|
|
|
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
|
|
|
|
; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
|
|
|
|
; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
|
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
|
|
|
|
; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
|
|
|
|
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
2017-01-04 16:05:42 +08:00
|
|
|
; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2-NEXT: vzeroupper
|
|
|
|
; AVX2-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX512-LABEL: trunc_and_const_v16i32_v16i8:
|
|
|
|
; AVX512: # BB#0:
|
|
|
|
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
|
2017-01-04 16:05:42 +08:00
|
|
|
; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
|
2017-03-03 17:03:24 +08:00
|
|
|
; AVX512-NEXT: vzeroupper
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX512-NEXT: retq
|
|
|
|
%1 = and <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
|
|
|
%2 = trunc <16 x i32> %1 to <16 x i8>
|
|
|
|
ret <16 x i8> %2
|
|
|
|
}
|
|
|
|
|
|
|
|
define <16 x i8> @trunc_and_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
|
|
|
|
; SSE-LABEL: trunc_and_const_v16i16_v16i8:
|
|
|
|
; SSE: # BB#0:
|
|
|
|
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
|
|
|
|
; SSE-NEXT: pand %xmm2, %xmm1
|
|
|
|
; SSE-NEXT: pand %xmm2, %xmm0
|
|
|
|
; SSE-NEXT: packuswb %xmm1, %xmm0
|
2017-01-04 16:05:42 +08:00
|
|
|
; SSE-NEXT: pand {{.*}}(%rip), %xmm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; SSE-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX1-LABEL: trunc_and_const_v16i16_v16i8:
|
|
|
|
; AVX1: # BB#0:
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
|
|
|
|
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
|
|
|
|
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
2017-01-04 16:05:42 +08:00
|
|
|
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX1-NEXT: vzeroupper
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX2-LABEL: trunc_and_const_v16i16_v16i8:
|
|
|
|
; AVX2: # BB#0:
|
|
|
|
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
|
|
|
|
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
|
|
|
|
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
|
|
|
|
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
|
|
|
|
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
2017-01-04 16:05:42 +08:00
|
|
|
; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2-NEXT: vzeroupper
|
|
|
|
; AVX2-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX512F-LABEL: trunc_and_const_v16i16_v16i8:
|
|
|
|
; AVX512F: # BB#0:
|
|
|
|
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
|
|
|
|
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
|
2017-01-04 16:05:42 +08:00
|
|
|
; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
|
2017-03-03 17:03:24 +08:00
|
|
|
; AVX512F-NEXT: vzeroupper
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX512F-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX512BW-LABEL: trunc_and_const_v16i16_v16i8:
|
|
|
|
; AVX512BW: # BB#0:
|
2017-01-04 16:05:42 +08:00
|
|
|
; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
|
2017-01-04 16:05:42 +08:00
|
|
|
; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
|
2017-03-03 17:03:24 +08:00
|
|
|
; AVX512BW-NEXT: vzeroupper
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX512BW-NEXT: retq
|
2016-12-31 06:43:41 +08:00
|
|
|
;
|
|
|
|
; AVX512DQ-LABEL: trunc_and_const_v16i16_v16i8:
|
|
|
|
; AVX512DQ: # BB#0:
|
|
|
|
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
|
|
|
|
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
|
2017-01-04 16:05:42 +08:00
|
|
|
; AVX512DQ-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
|
2017-03-03 17:03:24 +08:00
|
|
|
; AVX512DQ-NEXT: vzeroupper
|
2016-12-31 06:43:41 +08:00
|
|
|
; AVX512DQ-NEXT: retq
|
2016-03-14 03:08:01 +08:00
|
|
|
%1 = and <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
|
|
|
|
%2 = trunc <16 x i16> %1 to <16 x i8>
|
|
|
|
ret <16 x i8> %2
|
|
|
|
}
|
|
|
|
|
|
|
|
;
|
|
|
|
; xor
|
|
|
|
;
|
|
|
|
|
2016-12-31 06:40:32 +08:00
|
|
|
define <4 x i32> @trunc_xor_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
|
|
|
|
; SSE-LABEL: trunc_xor_v4i64_v4i32:
|
2016-03-14 03:08:01 +08:00
|
|
|
; SSE: # BB#0:
|
2016-12-16 02:03:38 +08:00
|
|
|
; SSE-NEXT: xorps %xmm3, %xmm1
|
|
|
|
; SSE-NEXT: xorps %xmm2, %xmm0
|
|
|
|
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
|
2016-03-14 03:08:01 +08:00
|
|
|
; SSE-NEXT: retq
|
|
|
|
;
|
2016-12-31 06:40:32 +08:00
|
|
|
; AVX1-LABEL: trunc_xor_v4i64_v4i32:
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX1: # BB#0:
|
|
|
|
; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
|
2016-12-16 02:03:38 +08:00
|
|
|
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX1-NEXT: vzeroupper
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
2016-12-31 06:40:32 +08:00
|
|
|
; AVX2-LABEL: trunc_xor_v4i64_v4i32:
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2: # BB#0:
|
2017-09-19 12:39:55 +08:00
|
|
|
; AVX2-NEXT: vxorps %ymm1, %ymm0, %ymm0
|
|
|
|
; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
|
|
|
|
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
|
2016-07-09 08:19:07 +08:00
|
|
|
; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2-NEXT: vzeroupper
|
|
|
|
; AVX2-NEXT: retq
|
|
|
|
;
|
2016-12-31 06:40:32 +08:00
|
|
|
; AVX512-LABEL: trunc_xor_v4i64_v4i32:
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX512: # BB#0:
|
2016-07-22 13:46:44 +08:00
|
|
|
; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
|
2016-07-09 08:19:07 +08:00
|
|
|
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
|
2017-03-03 17:03:24 +08:00
|
|
|
; AVX512-NEXT: vzeroupper
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX512-NEXT: retq
|
|
|
|
%1 = xor <4 x i64> %a0, %a1
|
|
|
|
%2 = trunc <4 x i64> %1 to <4 x i32>
|
|
|
|
ret <4 x i32> %2
|
|
|
|
}
|
|
|
|
|
2016-12-31 06:40:32 +08:00
|
|
|
define <8 x i16> @trunc_xor_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
|
|
|
|
; SSE-LABEL: trunc_xor_v8i64_v8i16:
|
2016-03-14 03:08:01 +08:00
|
|
|
; SSE: # BB#0:
|
|
|
|
; SSE-NEXT: pxor %xmm4, %xmm0
|
|
|
|
; SSE-NEXT: pxor %xmm5, %xmm1
|
2016-10-07 02:58:24 +08:00
|
|
|
; SSE-NEXT: pxor %xmm6, %xmm2
|
|
|
|
; SSE-NEXT: pxor %xmm7, %xmm3
|
|
|
|
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
|
|
|
|
; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
|
|
|
|
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
|
|
|
|
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
|
|
|
|
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
|
|
|
|
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
|
|
|
|
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
|
|
|
|
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
|
|
|
|
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
|
|
|
|
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
|
|
|
|
; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
|
|
|
|
; SSE-NEXT: movapd %xmm2, %xmm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; SSE-NEXT: retq
|
|
|
|
;
|
2016-12-31 06:40:32 +08:00
|
|
|
; AVX1-LABEL: trunc_xor_v8i64_v8i16:
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX1: # BB#0:
|
|
|
|
; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0
|
|
|
|
; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
|
|
|
|
; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3
|
|
|
|
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
|
|
|
|
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
|
|
|
|
; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
|
|
|
|
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
|
|
|
|
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
|
|
|
|
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vzeroupper
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
2016-12-31 06:40:32 +08:00
|
|
|
; AVX2-LABEL: trunc_xor_v8i64_v8i16:
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2: # BB#0:
|
|
|
|
; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1
|
|
|
|
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
|
2016-07-15 17:49:12 +08:00
|
|
|
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
|
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
|
|
|
|
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
|
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
|
2017-02-06 02:33:14 +08:00
|
|
|
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
|
2016-07-09 08:19:07 +08:00
|
|
|
; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2-NEXT: vzeroupper
|
|
|
|
; AVX2-NEXT: retq
|
|
|
|
;
|
2016-12-31 06:40:32 +08:00
|
|
|
; AVX512-LABEL: trunc_xor_v8i64_v8i16:
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX512: # BB#0:
|
2016-07-22 13:46:44 +08:00
|
|
|
; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
|
2017-03-03 17:03:24 +08:00
|
|
|
; AVX512-NEXT: vzeroupper
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX512-NEXT: retq
|
|
|
|
%1 = xor <8 x i64> %a0, %a1
|
|
|
|
%2 = trunc <8 x i64> %1 to <8 x i16>
|
|
|
|
ret <8 x i16> %2
|
|
|
|
}
|
|
|
|
|
2016-12-31 06:40:32 +08:00
|
|
|
define <8 x i16> @trunc_xor_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
|
|
|
|
; SSE-LABEL: trunc_xor_v8i32_v8i16:
|
2016-03-14 03:08:01 +08:00
|
|
|
; SSE: # BB#0:
|
|
|
|
; SSE-NEXT: pxor %xmm3, %xmm1
|
|
|
|
; SSE-NEXT: pslld $16, %xmm1
|
|
|
|
; SSE-NEXT: psrad $16, %xmm1
|
2017-05-26 23:33:18 +08:00
|
|
|
; SSE-NEXT: pxor %xmm2, %xmm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; SSE-NEXT: pslld $16, %xmm0
|
|
|
|
; SSE-NEXT: psrad $16, %xmm0
|
|
|
|
; SSE-NEXT: packssdw %xmm1, %xmm0
|
|
|
|
; SSE-NEXT: retq
|
|
|
|
;
|
2016-12-31 06:40:32 +08:00
|
|
|
; AVX1-LABEL: trunc_xor_v8i32_v8i16:
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX1: # BB#0:
|
|
|
|
; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
|
|
|
|
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
|
|
|
|
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
|
|
|
; AVX1-NEXT: vzeroupper
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
2016-12-31 06:40:32 +08:00
|
|
|
; AVX2-LABEL: trunc_xor_v8i32_v8i16:
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2: # BB#0:
|
|
|
|
; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
|
2017-02-06 02:33:14 +08:00
|
|
|
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
|
2016-07-09 08:19:07 +08:00
|
|
|
; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2-NEXT: vzeroupper
|
|
|
|
; AVX2-NEXT: retq
|
|
|
|
;
|
2016-12-31 06:40:32 +08:00
|
|
|
; AVX512-LABEL: trunc_xor_v8i32_v8i16:
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX512: # BB#0:
|
2016-07-22 13:46:44 +08:00
|
|
|
; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
|
2016-07-09 08:19:07 +08:00
|
|
|
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
|
2017-03-03 17:03:24 +08:00
|
|
|
; AVX512-NEXT: vzeroupper
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX512-NEXT: retq
|
|
|
|
%1 = xor <8 x i32> %a0, %a1
|
|
|
|
%2 = trunc <8 x i32> %1 to <8 x i16>
|
|
|
|
ret <8 x i16> %2
|
|
|
|
}
|
|
|
|
|
|
|
|
define <16 x i8> @trunc_xor_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
|
|
|
|
; SSE-LABEL: trunc_xor_v16i64_v16i8:
|
|
|
|
; SSE: # BB#0:
|
|
|
|
; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm0
|
|
|
|
; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm1
|
|
|
|
; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm2
|
|
|
|
; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm3
|
|
|
|
; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm4
|
|
|
|
; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm5
|
|
|
|
; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm6
|
|
|
|
; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm7
|
|
|
|
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
|
|
|
|
; SSE-NEXT: pand %xmm8, %xmm7
|
|
|
|
; SSE-NEXT: pand %xmm8, %xmm6
|
|
|
|
; SSE-NEXT: packuswb %xmm7, %xmm6
|
|
|
|
; SSE-NEXT: pand %xmm8, %xmm5
|
|
|
|
; SSE-NEXT: pand %xmm8, %xmm4
|
|
|
|
; SSE-NEXT: packuswb %xmm5, %xmm4
|
|
|
|
; SSE-NEXT: packuswb %xmm6, %xmm4
|
|
|
|
; SSE-NEXT: pand %xmm8, %xmm3
|
|
|
|
; SSE-NEXT: pand %xmm8, %xmm2
|
|
|
|
; SSE-NEXT: packuswb %xmm3, %xmm2
|
|
|
|
; SSE-NEXT: pand %xmm8, %xmm1
|
|
|
|
; SSE-NEXT: pand %xmm8, %xmm0
|
|
|
|
; SSE-NEXT: packuswb %xmm1, %xmm0
|
|
|
|
; SSE-NEXT: packuswb %xmm2, %xmm0
|
|
|
|
; SSE-NEXT: packuswb %xmm4, %xmm0
|
|
|
|
; SSE-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX1-LABEL: trunc_xor_v16i64_v16i8:
|
|
|
|
; AVX1: # BB#0:
|
|
|
|
; AVX1-NEXT: vxorps %ymm4, %ymm0, %ymm0
|
|
|
|
; AVX1-NEXT: vxorps %ymm5, %ymm1, %ymm1
|
|
|
|
; AVX1-NEXT: vxorps %ymm6, %ymm2, %ymm2
|
|
|
|
; AVX1-NEXT: vxorps %ymm7, %ymm3, %ymm3
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
|
|
|
|
; AVX1-NEXT: vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
|
|
|
|
; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
|
|
|
|
; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
|
|
|
|
; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
|
|
|
|
; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
|
|
|
|
; AVX1-NEXT: vandps %xmm5, %xmm2, %xmm2
|
|
|
|
; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2
|
|
|
|
; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
|
|
|
|
; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
|
|
|
|
; AVX1-NEXT: vandps %xmm5, %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
|
|
|
|
; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
|
|
|
|
; AVX1-NEXT: vandps %xmm5, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vzeroupper
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX2-LABEL: trunc_xor_v16i64_v16i8:
|
|
|
|
; AVX2: # BB#0:
|
|
|
|
; AVX2-NEXT: vpxor %ymm5, %ymm1, %ymm1
|
|
|
|
; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0
|
|
|
|
; AVX2-NEXT: vpxor %ymm7, %ymm3, %ymm3
|
|
|
|
; AVX2-NEXT: vpxor %ymm6, %ymm2, %ymm2
|
2016-07-15 17:49:12 +08:00
|
|
|
; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
|
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
|
|
|
|
; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
|
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
|
2017-02-06 02:33:14 +08:00
|
|
|
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
|
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
|
|
|
|
; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
|
|
|
|
; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
|
2016-07-15 17:49:12 +08:00
|
|
|
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
|
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
|
|
|
|
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
|
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
|
|
|
|
; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
|
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
|
|
|
|
; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
|
|
|
|
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
|
|
|
|
; AVX2-NEXT: vzeroupper
|
|
|
|
; AVX2-NEXT: retq
|
|
|
|
;
|
2017-08-17 23:40:25 +08:00
|
|
|
; AVX512-LABEL: trunc_xor_v16i64_v16i8:
|
|
|
|
; AVX512: # BB#0:
|
|
|
|
; AVX512-NEXT: vpxorq %zmm3, %zmm1, %zmm1
|
|
|
|
; AVX512-NEXT: vpxorq %zmm2, %zmm0, %zmm0
|
|
|
|
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
|
|
|
|
; AVX512-NEXT: vpmovqd %zmm1, %ymm1
|
|
|
|
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
|
|
|
|
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
|
|
|
|
; AVX512-NEXT: vzeroupper
|
|
|
|
; AVX512-NEXT: retq
|
2016-03-14 03:08:01 +08:00
|
|
|
%1 = xor <16 x i64> %a0, %a1
|
|
|
|
%2 = trunc <16 x i64> %1 to <16 x i8>
|
|
|
|
ret <16 x i8> %2
|
|
|
|
}
|
|
|
|
|
|
|
|
define <16 x i8> @trunc_xor_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
|
|
|
|
; SSE-LABEL: trunc_xor_v16i32_v16i8:
|
|
|
|
; SSE: # BB#0:
|
|
|
|
; SSE-NEXT: pxor %xmm4, %xmm0
|
|
|
|
; SSE-NEXT: pxor %xmm5, %xmm1
|
|
|
|
; SSE-NEXT: pxor %xmm6, %xmm2
|
|
|
|
; SSE-NEXT: pxor %xmm7, %xmm3
|
|
|
|
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
|
|
|
|
; SSE-NEXT: pand %xmm4, %xmm3
|
|
|
|
; SSE-NEXT: pand %xmm4, %xmm2
|
|
|
|
; SSE-NEXT: packuswb %xmm3, %xmm2
|
|
|
|
; SSE-NEXT: pand %xmm4, %xmm1
|
|
|
|
; SSE-NEXT: pand %xmm4, %xmm0
|
|
|
|
; SSE-NEXT: packuswb %xmm1, %xmm0
|
|
|
|
; SSE-NEXT: packuswb %xmm2, %xmm0
|
|
|
|
; SSE-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX1-LABEL: trunc_xor_v16i32_v16i8:
|
|
|
|
; AVX1: # BB#0:
|
|
|
|
; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0
|
|
|
|
; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
|
|
|
|
; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
|
|
|
|
; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
|
|
|
|
; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
|
|
|
|
; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
|
|
|
|
; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vzeroupper
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX2-LABEL: trunc_xor_v16i32_v16i8:
|
|
|
|
; AVX2: # BB#0:
|
|
|
|
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
|
|
|
|
; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1
|
2017-02-06 02:33:14 +08:00
|
|
|
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
|
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
|
|
|
|
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
|
|
|
|
; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
|
|
|
|
; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
|
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
|
|
|
|
; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
|
|
|
|
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
|
|
|
; AVX2-NEXT: vzeroupper
|
|
|
|
; AVX2-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX512-LABEL: trunc_xor_v16i32_v16i8:
|
|
|
|
; AVX512: # BB#0:
|
2017-08-31 00:38:33 +08:00
|
|
|
; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
|
2017-03-03 17:03:24 +08:00
|
|
|
; AVX512-NEXT: vzeroupper
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX512-NEXT: retq
|
|
|
|
%1 = xor <16 x i32> %a0, %a1
|
|
|
|
%2 = trunc <16 x i32> %1 to <16 x i8>
|
|
|
|
ret <16 x i8> %2
|
|
|
|
}
|
|
|
|
|
|
|
|
define <16 x i8> @trunc_xor_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
|
|
|
|
; SSE-LABEL: trunc_xor_v16i16_v16i8:
|
|
|
|
; SSE: # BB#0:
|
|
|
|
; SSE-NEXT: pxor %xmm2, %xmm0
|
|
|
|
; SSE-NEXT: pxor %xmm3, %xmm1
|
|
|
|
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
|
|
|
|
; SSE-NEXT: pand %xmm2, %xmm1
|
|
|
|
; SSE-NEXT: pand %xmm2, %xmm0
|
|
|
|
; SSE-NEXT: packuswb %xmm1, %xmm0
|
|
|
|
; SSE-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX1-LABEL: trunc_xor_v16i16_v16i8:
|
|
|
|
; AVX1: # BB#0:
|
|
|
|
; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
|
|
|
|
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
|
|
|
|
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
|
|
|
; AVX1-NEXT: vzeroupper
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX2-LABEL: trunc_xor_v16i16_v16i8:
|
|
|
|
; AVX2: # BB#0:
|
|
|
|
; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
|
|
|
|
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
|
|
|
|
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
|
|
|
|
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
|
|
|
|
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
|
|
|
|
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
|
|
|
; AVX2-NEXT: vzeroupper
|
|
|
|
; AVX2-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX512F-LABEL: trunc_xor_v16i16_v16i8:
|
|
|
|
; AVX512F: # BB#0:
|
2016-07-22 13:46:44 +08:00
|
|
|
; AVX512F-NEXT: vpxor %ymm1, %ymm0, %ymm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
|
|
|
|
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
|
2017-03-03 17:03:24 +08:00
|
|
|
; AVX512F-NEXT: vzeroupper
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX512F-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX512BW-LABEL: trunc_xor_v16i16_v16i8:
|
|
|
|
; AVX512BW: # BB#0:
|
2016-07-22 13:46:44 +08:00
|
|
|
; AVX512BW-NEXT: vpxor %ymm1, %ymm0, %ymm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
|
2016-07-09 08:19:07 +08:00
|
|
|
; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
|
2017-03-03 17:03:24 +08:00
|
|
|
; AVX512BW-NEXT: vzeroupper
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX512BW-NEXT: retq
|
2016-12-31 06:43:41 +08:00
|
|
|
;
|
|
|
|
; AVX512DQ-LABEL: trunc_xor_v16i16_v16i8:
|
|
|
|
; AVX512DQ: # BB#0:
|
|
|
|
; AVX512DQ-NEXT: vpxor %ymm1, %ymm0, %ymm0
|
|
|
|
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
|
|
|
|
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
|
2017-03-03 17:03:24 +08:00
|
|
|
; AVX512DQ-NEXT: vzeroupper
|
2016-12-31 06:43:41 +08:00
|
|
|
; AVX512DQ-NEXT: retq
|
2016-03-14 03:08:01 +08:00
|
|
|
%1 = xor <16 x i16> %a0, %a1
|
|
|
|
%2 = trunc <16 x i16> %1 to <16 x i8>
|
|
|
|
ret <16 x i8> %2
|
|
|
|
}
|
|
|
|
|
|
|
|
;
|
|
|
|
; xor to constant
|
|
|
|
;
|
|
|
|
|
2016-12-31 06:40:32 +08:00
|
|
|
define <4 x i32> @trunc_xor_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
|
|
|
|
; SSE-LABEL: trunc_xor_const_v4i64_v4i32:
|
2016-03-14 03:08:01 +08:00
|
|
|
; SSE: # BB#0:
|
2016-12-16 02:03:38 +08:00
|
|
|
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
|
2017-01-04 16:05:42 +08:00
|
|
|
; SSE-NEXT: xorps {{.*}}(%rip), %xmm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; SSE-NEXT: retq
|
|
|
|
;
|
2016-12-31 06:40:32 +08:00
|
|
|
; AVX1-LABEL: trunc_xor_const_v4i64_v4i32:
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX1: # BB#0:
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
|
2016-12-16 02:03:38 +08:00
|
|
|
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
|
2017-01-04 16:05:42 +08:00
|
|
|
; AVX1-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX1-NEXT: vzeroupper
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
2016-12-31 06:40:32 +08:00
|
|
|
; AVX2-LABEL: trunc_xor_const_v4i64_v4i32:
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2: # BB#0:
|
2017-09-19 12:39:55 +08:00
|
|
|
; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
|
|
|
|
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
|
|
|
|
; AVX2-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2-NEXT: vzeroupper
|
|
|
|
; AVX2-NEXT: retq
|
|
|
|
;
|
2016-12-31 06:40:32 +08:00
|
|
|
; AVX512-LABEL: trunc_xor_const_v4i64_v4i32:
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX512: # BB#0:
|
2017-01-04 16:05:42 +08:00
|
|
|
; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
|
2017-01-04 16:05:42 +08:00
|
|
|
; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
|
2017-03-03 17:03:24 +08:00
|
|
|
; AVX512-NEXT: vzeroupper
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX512-NEXT: retq
|
|
|
|
%1 = xor <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
|
|
|
|
%2 = trunc <4 x i64> %1 to <4 x i32>
|
|
|
|
ret <4 x i32> %2
|
|
|
|
}
|
|
|
|
|
2016-12-31 06:40:32 +08:00
|
|
|
define <8 x i16> @trunc_xor_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
|
|
|
|
; SSE-LABEL: trunc_xor_const_v8i64_v8i16:
|
2016-03-14 03:08:01 +08:00
|
|
|
; SSE: # BB#0:
|
2017-01-04 16:05:42 +08:00
|
|
|
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
|
|
|
|
; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
|
|
|
|
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
|
|
|
|
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
|
|
|
|
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
|
2016-10-07 02:58:24 +08:00
|
|
|
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
|
|
|
|
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
|
2017-01-04 16:05:42 +08:00
|
|
|
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
|
|
|
|
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
|
|
|
|
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
|
|
|
|
; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
|
|
|
|
; SSE-NEXT: xorpd {{.*}}(%rip), %xmm2
|
|
|
|
; SSE-NEXT: movapd %xmm2, %xmm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; SSE-NEXT: retq
|
|
|
|
;
|
2016-12-31 06:40:32 +08:00
|
|
|
; AVX1-LABEL: trunc_xor_const_v8i64_v8i16:
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX1: # BB#0:
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
|
|
|
|
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
|
|
|
|
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
|
|
|
|
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
|
|
|
|
; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
|
|
|
|
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
|
|
|
|
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
|
|
|
|
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
|
2017-01-04 16:05:42 +08:00
|
|
|
; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX1-NEXT: vzeroupper
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
2016-12-31 06:40:32 +08:00
|
|
|
; AVX2-LABEL: trunc_xor_const_v8i64_v8i16:
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2: # BB#0:
|
2016-07-15 17:49:12 +08:00
|
|
|
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
|
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
|
|
|
|
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
|
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
|
2017-02-06 02:33:14 +08:00
|
|
|
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
|
2017-01-04 16:05:42 +08:00
|
|
|
; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2-NEXT: vzeroupper
|
|
|
|
; AVX2-NEXT: retq
|
|
|
|
;
|
2016-12-31 06:40:32 +08:00
|
|
|
; AVX512-LABEL: trunc_xor_const_v8i64_v8i16:
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX512: # BB#0:
|
|
|
|
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
|
2017-01-04 16:05:42 +08:00
|
|
|
; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
|
2017-03-03 17:03:24 +08:00
|
|
|
; AVX512-NEXT: vzeroupper
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX512-NEXT: retq
|
|
|
|
%1 = xor <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
|
|
|
|
%2 = trunc <8 x i64> %1 to <8 x i16>
|
|
|
|
ret <8 x i16> %2
|
|
|
|
}
|
|
|
|
|
2016-12-31 06:40:32 +08:00
|
|
|
define <8 x i16> @trunc_xor_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
|
|
|
|
; SSE-LABEL: trunc_xor_const_v8i32_v8i16:
|
2016-03-14 03:08:01 +08:00
|
|
|
; SSE: # BB#0:
|
|
|
|
; SSE-NEXT: pslld $16, %xmm1
|
|
|
|
; SSE-NEXT: psrad $16, %xmm1
|
|
|
|
; SSE-NEXT: pslld $16, %xmm0
|
|
|
|
; SSE-NEXT: psrad $16, %xmm0
|
|
|
|
; SSE-NEXT: packssdw %xmm1, %xmm0
|
2017-01-04 16:05:42 +08:00
|
|
|
; SSE-NEXT: pxor {{.*}}(%rip), %xmm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; SSE-NEXT: retq
|
|
|
|
;
|
2016-12-31 06:40:32 +08:00
|
|
|
; AVX1-LABEL: trunc_xor_const_v8i32_v8i16:
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX1: # BB#0:
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
|
|
|
|
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
|
|
|
|
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
2017-01-04 16:05:42 +08:00
|
|
|
; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX1-NEXT: vzeroupper
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
2016-12-31 06:40:32 +08:00
|
|
|
; AVX2-LABEL: trunc_xor_const_v8i32_v8i16:
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2: # BB#0:
|
2017-02-06 02:33:14 +08:00
|
|
|
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
|
2017-01-04 16:05:42 +08:00
|
|
|
; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2-NEXT: vzeroupper
|
|
|
|
; AVX2-NEXT: retq
|
|
|
|
;
|
2016-12-31 06:40:32 +08:00
|
|
|
; AVX512-LABEL: trunc_xor_const_v8i32_v8i16:
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX512: # BB#0:
|
2017-01-04 16:05:42 +08:00
|
|
|
; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
|
2017-01-04 16:05:42 +08:00
|
|
|
; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
|
2017-03-03 17:03:24 +08:00
|
|
|
; AVX512-NEXT: vzeroupper
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX512-NEXT: retq
|
|
|
|
%1 = xor <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
|
|
|
%2 = trunc <8 x i32> %1 to <8 x i16>
|
|
|
|
ret <8 x i16> %2
|
|
|
|
}
|
|
|
|
|
|
|
|
define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
|
|
|
|
; SSE-LABEL: trunc_xor_const_v16i64_v16i8:
|
|
|
|
; SSE: # BB#0:
|
|
|
|
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
|
|
|
|
; SSE-NEXT: pand %xmm8, %xmm7
|
|
|
|
; SSE-NEXT: pand %xmm8, %xmm6
|
|
|
|
; SSE-NEXT: packuswb %xmm7, %xmm6
|
|
|
|
; SSE-NEXT: pand %xmm8, %xmm5
|
|
|
|
; SSE-NEXT: pand %xmm8, %xmm4
|
|
|
|
; SSE-NEXT: packuswb %xmm5, %xmm4
|
|
|
|
; SSE-NEXT: packuswb %xmm6, %xmm4
|
|
|
|
; SSE-NEXT: pand %xmm8, %xmm3
|
|
|
|
; SSE-NEXT: pand %xmm8, %xmm2
|
|
|
|
; SSE-NEXT: packuswb %xmm3, %xmm2
|
|
|
|
; SSE-NEXT: pand %xmm8, %xmm1
|
|
|
|
; SSE-NEXT: pand %xmm8, %xmm0
|
|
|
|
; SSE-NEXT: packuswb %xmm1, %xmm0
|
|
|
|
; SSE-NEXT: packuswb %xmm2, %xmm0
|
|
|
|
; SSE-NEXT: packuswb %xmm4, %xmm0
|
2017-01-04 16:05:42 +08:00
|
|
|
; SSE-NEXT: pxor {{.*}}(%rip), %xmm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; SSE-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX1-LABEL: trunc_xor_const_v16i64_v16i8:
|
|
|
|
; AVX1: # BB#0:
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
|
2017-02-11 13:32:57 +08:00
|
|
|
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
|
|
|
|
; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4
|
|
|
|
; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
|
2017-02-11 13:32:57 +08:00
|
|
|
; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4
|
|
|
|
; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2
|
|
|
|
; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
|
2017-02-11 13:32:57 +08:00
|
|
|
; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
|
|
|
|
; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
|
2017-02-11 13:32:57 +08:00
|
|
|
; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
|
|
|
|
; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
|
2017-01-04 16:05:42 +08:00
|
|
|
; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX1-NEXT: vzeroupper
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX2-LABEL: trunc_xor_const_v16i64_v16i8:
|
|
|
|
; AVX2: # BB#0:
|
2016-07-15 17:49:12 +08:00
|
|
|
; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
|
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
|
|
|
|
; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
|
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
|
2017-02-06 02:33:14 +08:00
|
|
|
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
|
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
|
|
|
|
; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
|
|
|
|
; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
|
2016-07-15 17:49:12 +08:00
|
|
|
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
|
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
|
|
|
|
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
|
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
|
|
|
|
; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
|
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
|
|
|
|
; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
|
|
|
|
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
|
2017-01-04 16:05:42 +08:00
|
|
|
; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2-NEXT: vzeroupper
|
|
|
|
; AVX2-NEXT: retq
|
|
|
|
;
|
2017-08-17 23:40:25 +08:00
|
|
|
; AVX512-LABEL: trunc_xor_const_v16i64_v16i8:
|
|
|
|
; AVX512: # BB#0:
|
|
|
|
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
|
|
|
|
; AVX512-NEXT: vpmovqd %zmm1, %ymm1
|
|
|
|
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
|
|
|
|
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
|
|
|
|
; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
|
|
|
|
; AVX512-NEXT: vzeroupper
|
|
|
|
; AVX512-NEXT: retq
|
2016-03-14 03:08:01 +08:00
|
|
|
%1 = xor <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
|
|
|
|
%2 = trunc <16 x i64> %1 to <16 x i8>
|
|
|
|
ret <16 x i8> %2
|
|
|
|
}
|
|
|
|
|
|
|
|
define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
|
|
|
|
; SSE-LABEL: trunc_xor_const_v16i32_v16i8:
|
|
|
|
; SSE: # BB#0:
|
|
|
|
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
|
|
|
|
; SSE-NEXT: pand %xmm4, %xmm3
|
|
|
|
; SSE-NEXT: pand %xmm4, %xmm2
|
|
|
|
; SSE-NEXT: packuswb %xmm3, %xmm2
|
|
|
|
; SSE-NEXT: pand %xmm4, %xmm1
|
|
|
|
; SSE-NEXT: pand %xmm4, %xmm0
|
|
|
|
; SSE-NEXT: packuswb %xmm1, %xmm0
|
|
|
|
; SSE-NEXT: packuswb %xmm2, %xmm0
|
2017-01-04 16:05:42 +08:00
|
|
|
; SSE-NEXT: pxor {{.*}}(%rip), %xmm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; SSE-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX1-LABEL: trunc_xor_const_v16i32_v16i8:
|
|
|
|
; AVX1: # BB#0:
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
|
2017-02-11 13:32:57 +08:00
|
|
|
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
|
|
|
|
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
|
|
|
|
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
|
2017-02-11 13:32:57 +08:00
|
|
|
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
|
|
|
|
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
|
2017-01-04 16:05:42 +08:00
|
|
|
; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX1-NEXT: vzeroupper
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX2-LABEL: trunc_xor_const_v16i32_v16i8:
|
|
|
|
; AVX2: # BB#0:
|
2017-02-06 02:33:14 +08:00
|
|
|
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
|
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
|
|
|
|
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
|
|
|
|
; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
|
|
|
|
; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
|
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
|
|
|
|
; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
|
|
|
|
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
2017-01-04 16:05:42 +08:00
|
|
|
; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2-NEXT: vzeroupper
|
|
|
|
; AVX2-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX512-LABEL: trunc_xor_const_v16i32_v16i8:
|
|
|
|
; AVX512: # BB#0:
|
|
|
|
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
|
2017-01-04 16:05:42 +08:00
|
|
|
; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
|
2017-03-03 17:03:24 +08:00
|
|
|
; AVX512-NEXT: vzeroupper
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX512-NEXT: retq
|
|
|
|
%1 = xor <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
|
|
|
%2 = trunc <16 x i32> %1 to <16 x i8>
|
|
|
|
ret <16 x i8> %2
|
|
|
|
}
|
|
|
|
|
|
|
|
define <16 x i8> @trunc_xor_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
|
|
|
|
; SSE-LABEL: trunc_xor_const_v16i16_v16i8:
|
|
|
|
; SSE: # BB#0:
|
|
|
|
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
|
|
|
|
; SSE-NEXT: pand %xmm2, %xmm1
|
|
|
|
; SSE-NEXT: pand %xmm2, %xmm0
|
|
|
|
; SSE-NEXT: packuswb %xmm1, %xmm0
|
2017-01-04 16:05:42 +08:00
|
|
|
; SSE-NEXT: pxor {{.*}}(%rip), %xmm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; SSE-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX1-LABEL: trunc_xor_const_v16i16_v16i8:
|
|
|
|
; AVX1: # BB#0:
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
|
|
|
|
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
|
|
|
|
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
2017-01-04 16:05:42 +08:00
|
|
|
; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX1-NEXT: vzeroupper
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX2-LABEL: trunc_xor_const_v16i16_v16i8:
|
|
|
|
; AVX2: # BB#0:
|
|
|
|
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
|
|
|
|
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
|
|
|
|
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
|
|
|
|
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
|
|
|
|
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
2017-01-04 16:05:42 +08:00
|
|
|
; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2-NEXT: vzeroupper
|
|
|
|
; AVX2-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX512F-LABEL: trunc_xor_const_v16i16_v16i8:
|
|
|
|
; AVX512F: # BB#0:
|
|
|
|
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
|
|
|
|
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
|
2017-01-04 16:05:42 +08:00
|
|
|
; AVX512F-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
|
2017-03-03 17:03:24 +08:00
|
|
|
; AVX512F-NEXT: vzeroupper
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX512F-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX512BW-LABEL: trunc_xor_const_v16i16_v16i8:
|
|
|
|
; AVX512BW: # BB#0:
|
2017-01-04 16:05:42 +08:00
|
|
|
; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
|
2017-01-04 16:05:42 +08:00
|
|
|
; AVX512BW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
|
2017-03-03 17:03:24 +08:00
|
|
|
; AVX512BW-NEXT: vzeroupper
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX512BW-NEXT: retq
|
2016-12-31 06:43:41 +08:00
|
|
|
;
|
|
|
|
; AVX512DQ-LABEL: trunc_xor_const_v16i16_v16i8:
|
|
|
|
; AVX512DQ: # BB#0:
|
|
|
|
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
|
|
|
|
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
|
2017-01-04 16:05:42 +08:00
|
|
|
; AVX512DQ-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
|
2017-03-03 17:03:24 +08:00
|
|
|
; AVX512DQ-NEXT: vzeroupper
|
2016-12-31 06:43:41 +08:00
|
|
|
; AVX512DQ-NEXT: retq
|
2016-03-14 03:08:01 +08:00
|
|
|
%1 = xor <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
|
|
|
|
%2 = trunc <16 x i16> %1 to <16 x i8>
|
|
|
|
ret <16 x i8> %2
|
|
|
|
}
|
|
|
|
|
|
|
|
;
|
|
|
|
; or
|
|
|
|
;
|
|
|
|
|
2016-12-31 06:40:32 +08:00
|
|
|
define <4 x i32> @trunc_or_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
|
|
|
|
; SSE-LABEL: trunc_or_v4i64_v4i32:
|
2016-03-14 03:08:01 +08:00
|
|
|
; SSE: # BB#0:
|
2016-12-16 02:03:38 +08:00
|
|
|
; SSE-NEXT: orps %xmm3, %xmm1
|
|
|
|
; SSE-NEXT: orps %xmm2, %xmm0
|
|
|
|
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
|
2016-03-14 03:08:01 +08:00
|
|
|
; SSE-NEXT: retq
|
|
|
|
;
|
2016-12-31 06:40:32 +08:00
|
|
|
; AVX1-LABEL: trunc_or_v4i64_v4i32:
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX1: # BB#0:
|
|
|
|
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
|
[x86] use a single shufps when it can save instructions
This is a tiny patch with a big pile of test changes.
This partially fixes PR27885:
https://llvm.org/bugs/show_bug.cgi?id=27885
My motivating case looks like this:
- vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
- vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
- vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+ vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
And this happens several times in the diffs. For chips with domain-crossing penalties,
the instruction count and size reduction should usually overcome any potential
domain-crossing penalty due to using an FP op in a sequence of int ops. For chips such
as recent Intel big cores and Atom, there is no domain-crossing penalty for shufps, so
using shufps is a pure win.
So the test case diffs all appear to be improvements except one test in
vector-shuffle-combining.ll where we miss an opportunity to use a shift to generate
zero elements and one test in combine-sra.ll where multiple uses prevent the expected
shuffle combining.
Differential Revision: https://reviews.llvm.org/D27692
llvm-svn: 289837
2016-12-16 02:03:38 +08:00
|
|
|
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX1-NEXT: vzeroupper
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
2016-12-31 06:40:32 +08:00
|
|
|
; AVX2-LABEL: trunc_or_v4i64_v4i32:
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2: # BB#0:
|
2017-09-19 12:39:55 +08:00
|
|
|
; AVX2-NEXT: vorps %ymm1, %ymm0, %ymm0
|
|
|
|
; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
|
|
|
|
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
|
2016-07-09 08:19:07 +08:00
|
|
|
; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX2-NEXT: vzeroupper
|
|
|
|
; AVX2-NEXT: retq
|
|
|
|
;
|
2016-12-31 06:40:32 +08:00
|
|
|
; AVX512-LABEL: trunc_or_v4i64_v4i32:
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX512: # BB#0:
|
2016-07-22 13:46:44 +08:00
|
|
|
; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
|
2016-07-09 08:19:07 +08:00
|
|
|
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
|
2017-03-03 17:03:24 +08:00
|
|
|
; AVX512-NEXT: vzeroupper
|
2016-03-14 03:08:01 +08:00
|
|
|
; AVX512-NEXT: retq
|
|
|
|
%1 = or <4 x i64> %a0, %a1
|
|
|
|
%2 = trunc <4 x i64> %1 to <4 x i32>
|
|
|
|
ret <4 x i32> %2
|
|
|
|
}
|
|
|
|
|
2016-12-31 06:40:32 +08:00
|
|
|
define <8 x i16> @trunc_or_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; SSE-LABEL: trunc_or_v8i64_v8i16:
; SSE: # BB#0:
; SSE-NEXT: por %xmm4, %xmm0
; SSE-NEXT: por %xmm5, %xmm1
; SSE-NEXT: por %xmm6, %xmm2
; SSE-NEXT: por %xmm7, %xmm3
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; SSE-NEXT: movapd %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_v8i64_v8i16:
; AVX1: # BB#0:
; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_v8i64_v8i16:
; AVX2: # BB#0:
; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_or_v8i64_v8i16:
; AVX512: # BB#0:
; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = or <8 x i64> %a0, %a1
%2 = trunc <8 x i64> %1 to <8 x i16>
ret <8 x i16> %2
}

define <8 x i16> @trunc_or_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; SSE-LABEL: trunc_or_v8i32_v8i16:
; SSE: # BB#0:
; SSE-NEXT: por %xmm3, %xmm1
; SSE-NEXT: pslld $16, %xmm1
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: por %xmm2, %xmm0
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_v8i32_v8i16:
; AVX1: # BB#0:
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_v8i32_v8i16:
; AVX2: # BB#0:
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_or_v8i32_v8i16:
; AVX512: # BB#0:
; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = or <8 x i32> %a0, %a1
%2 = trunc <8 x i32> %1 to <8 x i16>
ret <8 x i16> %2
}

define <16 x i8> @trunc_or_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
; SSE-LABEL: trunc_or_v16i64_v16i8:
; SSE: # BB#0:
; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm5
; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm6
; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm7
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT: pand %xmm8, %xmm7
; SSE-NEXT: pand %xmm8, %xmm6
; SSE-NEXT: packuswb %xmm7, %xmm6
; SSE-NEXT: pand %xmm8, %xmm5
; SSE-NEXT: pand %xmm8, %xmm4
; SSE-NEXT: packuswb %xmm5, %xmm4
; SSE-NEXT: packuswb %xmm6, %xmm4
; SSE-NEXT: pand %xmm8, %xmm3
; SSE-NEXT: pand %xmm8, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm8, %xmm1
; SSE-NEXT: pand %xmm8, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm4, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_v16i64_v16i8:
; AVX1: # BB#0:
; AVX1-NEXT: vorps %ymm4, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm5, %ymm1, %ymm1
; AVX1-NEXT: vorps %ymm6, %ymm2, %ymm2
; AVX1-NEXT: vorps %ymm7, %ymm3, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
; AVX1-NEXT: vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vandps %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vandps %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vandps %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_v16i64_v16i8:
; AVX2: # BB#0:
; AVX2-NEXT: vpor %ymm5, %ymm1, %ymm1
; AVX2-NEXT: vpor %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm7, %ymm3, %ymm3
; AVX2-NEXT: vpor %ymm6, %ymm2, %ymm2
; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_or_v16i64_v16i8:
; AVX512: # BB#0:
; AVX512-NEXT: vporq %zmm3, %zmm1, %zmm1
; AVX512-NEXT: vporq %zmm2, %zmm0, %zmm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm1, %ymm1
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = or <16 x i64> %a0, %a1
%2 = trunc <16 x i64> %1 to <16 x i8>
ret <16 x i8> %2
}

define <16 x i8> @trunc_or_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
; SSE-LABEL: trunc_or_v16i32_v16i8:
; SSE: # BB#0:
; SSE-NEXT: por %xmm4, %xmm0
; SSE-NEXT: por %xmm5, %xmm1
; SSE-NEXT: por %xmm6, %xmm2
; SSE-NEXT: por %xmm7, %xmm3
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: pand %xmm4, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm4, %xmm1
; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_v16i32_v16i8:
; AVX1: # BB#0:
; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_v16i32_v16i8:
; AVX2: # BB#0:
; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_or_v16i32_v16i8:
; AVX512: # BB#0:
; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = or <16 x i32> %a0, %a1
%2 = trunc <16 x i32> %1 to <16 x i8>
ret <16 x i8> %2
}

define <16 x i8> @trunc_or_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE-LABEL: trunc_or_v16i16_v16i8:
; SSE: # BB#0:
; SSE-NEXT: por %xmm2, %xmm0
; SSE-NEXT: por %xmm3, %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_v16i16_v16i8:
; AVX1: # BB#0:
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_v16i16_v16i8:
; AVX2: # BB#0:
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_or_v16i16_v16i8:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_or_v16i16_v16i8:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_or_v16i16_v16i8:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%1 = or <16 x i16> %a0, %a1
%2 = trunc <16 x i16> %1 to <16 x i8>
ret <16 x i8> %2
}

;
; or to constant
;

define <4 x i32> @trunc_or_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
; SSE-LABEL: trunc_or_const_v4i64_v4i32:
; SSE: # BB#0:
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: orps {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_const_v4i64_v4i32:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX1-NEXT: vorps {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_const_v4i64_v4i32:
; AVX2: # BB#0:
; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vorps {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_or_const_v4i64_v4i32:
; AVX512: # BB#0:
; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = or <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
%2 = trunc <4 x i64> %1 to <4 x i32>
ret <4 x i32> %2
}

define <8 x i16> @trunc_or_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
; SSE-LABEL: trunc_or_const_v8i64_v8i16:
; SSE: # BB#0:
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; SSE-NEXT: orpd {{.*}}(%rip), %xmm2
; SSE-NEXT: movapd %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_const_v8i64_v8i16:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_const_v8i64_v8i16:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_or_const_v8i64_v8i16:
; AVX512: # BB#0:
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = or <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
%2 = trunc <8 x i64> %1 to <8 x i16>
ret <8 x i16> %2
}

define <8 x i16> @trunc_or_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
; SSE-LABEL: trunc_or_const_v8i32_v8i16:
; SSE: # BB#0:
; SSE-NEXT: pslld $16, %xmm1
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: por {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_const_v8i32_v8i16:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_const_v8i32_v8i16:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_or_const_v8i32_v8i16:
; AVX512: # BB#0:
; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = or <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%2 = trunc <8 x i32> %1 to <8 x i16>
ret <8 x i16> %2
}

define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; SSE-LABEL: trunc_or_const_v16i64_v16i8:
; SSE: # BB#0:
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT: pand %xmm8, %xmm7
; SSE-NEXT: pand %xmm8, %xmm6
; SSE-NEXT: packuswb %xmm7, %xmm6
; SSE-NEXT: pand %xmm8, %xmm5
; SSE-NEXT: pand %xmm8, %xmm4
; SSE-NEXT: packuswb %xmm5, %xmm4
; SSE-NEXT: packuswb %xmm6, %xmm4
; SSE-NEXT: pand %xmm8, %xmm3
; SSE-NEXT: pand %xmm8, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm8, %xmm1
; SSE-NEXT: pand %xmm8, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm4, %xmm0
; SSE-NEXT: por {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_const_v16i64_v16i8:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_const_v16i64_v16i8:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_or_const_v16i64_v16i8:
; AVX512: # BB#0:
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm1, %ymm1
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = or <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
%2 = trunc <16 x i64> %1 to <16 x i8>
ret <16 x i8> %2
}

define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
; SSE-LABEL: trunc_or_const_v16i32_v16i8:
; SSE: # BB#0:
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: pand %xmm4, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm4, %xmm1
; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: por {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_const_v16i32_v16i8:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_const_v16i32_v16i8:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_or_const_v16i32_v16i8:
; AVX512: # BB#0:
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = or <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%2 = trunc <16 x i32> %1 to <16 x i8>
ret <16 x i8> %2
}

define <16 x i8> @trunc_or_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; SSE-LABEL: trunc_or_const_v16i16_v16i8:
; SSE: # BB#0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: por {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_const_v16i16_v16i8:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_const_v16i16_v16i8:
; AVX2: # BB#0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_or_const_v16i16_v16i8:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_or_const_v16i16_v16i8:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_or_const_v16i16_v16i8:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%1 = or <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
%2 = trunc <16 x i16> %1 to <16 x i8>
ret <16 x i8> %2
}

;
; complex patterns - often created by vectorizer
;

define <4 x i32> @mul_add_const_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; SSE-LABEL: mul_add_const_v4i64_v4i32:
; SSE: # BB#0:
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,1,3]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,1,1,3]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
; SSE-NEXT: movdqa %xmm2, %xmm4
; SSE-NEXT: psrlq $32, %xmm4
; SSE-NEXT: pmuludq %xmm1, %xmm4
; SSE-NEXT: movdqa %xmm1, %xmm5
; SSE-NEXT: psrlq $32, %xmm5
; SSE-NEXT: pmuludq %xmm2, %xmm5
; SSE-NEXT: paddq %xmm4, %xmm5
; SSE-NEXT: psllq $32, %xmm5
; SSE-NEXT: pmuludq %xmm1, %xmm2
; SSE-NEXT: paddq %xmm5, %xmm2
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrlq $32, %xmm1
; SSE-NEXT: pmuludq %xmm3, %xmm1
; SSE-NEXT: movdqa %xmm3, %xmm4
; SSE-NEXT: psrlq $32, %xmm4
; SSE-NEXT: pmuludq %xmm0, %xmm4
; SSE-NEXT: paddq %xmm1, %xmm4
; SSE-NEXT: psllq $32, %xmm4
; SSE-NEXT: pmuludq %xmm3, %xmm0
; SSE-NEXT: paddq %xmm4, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; SSE-NEXT: paddd {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: mul_add_const_v4i64_v4i32:
; AVX: # BB#0:
; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
%1 = sext <4 x i32> %a0 to <4 x i64>
%2 = sext <4 x i32> %a1 to <4 x i64>
%3 = mul <4 x i64> %1, %2
%4 = add <4 x i64> %3, <i64 -3, i64 -1, i64 1, i64 3>
%5 = trunc <4 x i64> %4 to <4 x i32>
ret <4 x i32> %5
}

define <4 x i32> @mul_add_self_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; SSE-LABEL: mul_add_self_v4i64_v4i32:
; SSE: # BB#0:
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE-NEXT: movdqa %xmm2, %xmm3
; SSE-NEXT: psrad $31, %xmm3
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: psrad $31, %xmm3
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
; SSE-NEXT: movdqa %xmm3, %xmm4
; SSE-NEXT: psrad $31, %xmm4
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE-NEXT: movdqa %xmm1, %xmm4
; SSE-NEXT: psrad $31, %xmm4
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; SSE-NEXT: movdqa %xmm0, %xmm4
; SSE-NEXT: psrlq $32, %xmm4
; SSE-NEXT: pmuludq %xmm1, %xmm4
; SSE-NEXT: movdqa %xmm1, %xmm5
; SSE-NEXT: psrlq $32, %xmm5
; SSE-NEXT: pmuludq %xmm0, %xmm5
; SSE-NEXT: paddq %xmm4, %xmm5
; SSE-NEXT: psllq $32, %xmm5
; SSE-NEXT: pmuludq %xmm0, %xmm1
; SSE-NEXT: paddq %xmm5, %xmm1
; SSE-NEXT: movdqa %xmm2, %xmm0
; SSE-NEXT: psrlq $32, %xmm0
; SSE-NEXT: pmuludq %xmm3, %xmm0
; SSE-NEXT: movdqa %xmm3, %xmm4
; SSE-NEXT: psrlq $32, %xmm4
; SSE-NEXT: pmuludq %xmm2, %xmm4
; SSE-NEXT: paddq %xmm0, %xmm4
; SSE-NEXT: psllq $32, %xmm4
; SSE-NEXT: pmuludq %xmm2, %xmm3
; SSE-NEXT: paddq %xmm4, %xmm3
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2]
; SSE-NEXT: paddd %xmm1, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: mul_add_self_v4i64_v4i32:
; AVX: # BB#0:
; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = sext <4 x i32> %a0 to <4 x i64>
%2 = sext <4 x i32> %a1 to <4 x i64>
%3 = mul <4 x i64> %1, %2
%4 = add <4 x i64> %3, %3
%5 = trunc <4 x i64> %4 to <4 x i32>
ret <4 x i32> %5
}

define <4 x i32> @mul_add_multiuse_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; SSE-LABEL: mul_add_multiuse_v4i64_v4i32:
; SSE: # BB#0:
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,1,1,3]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
; SSE-NEXT: movdqa %xmm3, %xmm5
; SSE-NEXT: psrlq $32, %xmm5
; SSE-NEXT: pmuludq %xmm1, %xmm5
; SSE-NEXT: movdqa %xmm1, %xmm6
; SSE-NEXT: psrlq $32, %xmm6
; SSE-NEXT: pmuludq %xmm3, %xmm6
; SSE-NEXT: paddq %xmm5, %xmm6
; SSE-NEXT: psllq $32, %xmm6
; SSE-NEXT: pmuludq %xmm1, %xmm3
; SSE-NEXT: paddq %xmm6, %xmm3
; SSE-NEXT: movdqa %xmm2, %xmm1
; SSE-NEXT: psrlq $32, %xmm1
; SSE-NEXT: pmuludq %xmm4, %xmm1
; SSE-NEXT: movdqa %xmm4, %xmm5
; SSE-NEXT: psrlq $32, %xmm5
; SSE-NEXT: pmuludq %xmm2, %xmm5
; SSE-NEXT: paddq %xmm1, %xmm5
; SSE-NEXT: psllq $32, %xmm5
; SSE-NEXT: pmuludq %xmm4, %xmm2
; SSE-NEXT: paddq %xmm5, %xmm2
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
; SSE-NEXT: paddd %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: mul_add_multiuse_v4i64_v4i32:
; AVX: # BB#0:
; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm1
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = sext <4 x i32> %a0 to <4 x i64>
%2 = sext <4 x i32> %a1 to <4 x i64>
%3 = mul <4 x i64> %1, %2
%4 = add <4 x i64> %1, %3
%5 = trunc <4 x i64> %4 to <4 x i32>
ret <4 x i32> %5
}