; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=VEX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=VEX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512VLDQ
;
; 32-bit tests to make sure we're not doing anything stupid.
; RUN: llc < %s -mtriple=i686-unknown-unknown
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2

;
; Double to Signed Integer
;
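; Without AVX512DQ there is no packed double-to-i64 truncating conversion, so
; the 2i64/4i64 cases below expand to per-lane (v)cvttsd2si plus repacking;
; the AVX512DQ/AVX512VLDQ runs select vcvttpd2qq directly.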

define <2 x i64> @fptosi_2f64_to_2i64(<2 x double> %a) {
; SSE-LABEL: fptosi_2f64_to_2i64:
; SSE: # BB#0:
; SSE-NEXT: cvttsd2si %xmm0, %rax
; SSE-NEXT: movq %rax, %xmm1
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: cvttsd2si %xmm0, %rax
; SSE-NEXT: movq %rax, %xmm0
; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; VEX-LABEL: fptosi_2f64_to_2i64:
; VEX: # BB#0:
; VEX-NEXT: vcvttsd2si %xmm0, %rax
; VEX-NEXT: vmovq %rax, %xmm1
; VEX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; VEX-NEXT: vcvttsd2si %xmm0, %rax
; VEX-NEXT: vmovq %rax, %xmm0
; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; VEX-NEXT: retq
;
; AVX512F-LABEL: fptosi_2f64_to_2i64:
; AVX512F: # BB#0:
; AVX512F-NEXT: vcvttsd2si %xmm0, %rax
; AVX512F-NEXT: vmovq %rax, %xmm1
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512F-NEXT: vcvttsd2si %xmm0, %rax
; AVX512F-NEXT: vmovq %rax, %xmm0
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fptosi_2f64_to_2i64:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vcvttsd2si %xmm0, %rax
; AVX512VL-NEXT: vmovq %rax, %xmm1
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512VL-NEXT: vcvttsd2si %xmm0, %rax
; AVX512VL-NEXT: vmovq %rax, %xmm0
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: fptosi_2f64_to_2i64:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvttpd2qq %zmm0, %zmm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptosi_2f64_to_2i64:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: vcvttpd2qq %xmm0, %xmm0
; AVX512VLDQ-NEXT: retq
  %cvt = fptosi <2 x double> %a to <2 x i64>
  ret <2 x i64> %cvt
}

define <4 x i32> @fptosi_2f64_to_4i32(<2 x double> %a) {
; SSE-LABEL: fptosi_2f64_to_4i32:
; SSE: # BB#0:
; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: fptosi_2f64_to_4i32:
; AVX: # BB#0:
; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0
; AVX-NEXT: retq
  %cvt = fptosi <2 x double> %a to <2 x i32>
  %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i32> %ext
}

define <2 x i32> @fptosi_2f64_to_2i32(<2 x double> %a) {
; SSE-LABEL: fptosi_2f64_to_2i32:
; SSE: # BB#0:
; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; SSE-NEXT: retq
;
; AVX-LABEL: fptosi_2f64_to_2i32:
; AVX: # BB#0:
; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0
; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX-NEXT: retq
  %cvt = fptosi <2 x double> %a to <2 x i32>
  ret <2 x i32> %cvt
}

define <4 x i32> @fptosi_4f64_to_2i32(<2 x double> %a) {
; SSE-LABEL: fptosi_4f64_to_2i32:
; SSE: # BB#0:
; SSE-NEXT: cvttpd2dq %xmm0, %xmm1
; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: retq
;
; AVX-LABEL: fptosi_4f64_to_2i32:
; AVX: # BB#0:
; AVX-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; AVX-NEXT: vcvttpd2dq %ymm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
  %ext = shufflevector <2 x double> %a, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %cvt = fptosi <4 x double> %ext to <4 x i32>
  ret <4 x i32> %cvt
}

define <4 x i64> @fptosi_4f64_to_4i64(<4 x double> %a) {
; SSE-LABEL: fptosi_4f64_to_4i64:
; SSE: # BB#0:
; SSE-NEXT: cvttsd2si %xmm0, %rax
; SSE-NEXT: movq %rax, %xmm2
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: cvttsd2si %xmm0, %rax
; SSE-NEXT: movq %rax, %xmm0
; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE-NEXT: cvttsd2si %xmm1, %rax
; SSE-NEXT: movq %rax, %xmm3
; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT: cvttsd2si %xmm1, %rax
; SSE-NEXT: movq %rax, %xmm0
; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
; SSE-NEXT: movdqa %xmm2, %xmm0
; SSE-NEXT: movdqa %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: fptosi_4f64_to_4i64:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vcvttsd2si %xmm1, %rax
; AVX1-NEXT: vmovq %rax, %xmm2
; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX1-NEXT: vcvttsd2si %xmm1, %rax
; AVX1-NEXT: vmovq %rax, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX1-NEXT: vcvttsd2si %xmm0, %rax
; AVX1-NEXT: vmovq %rax, %xmm2
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vcvttsd2si %xmm0, %rax
; AVX1-NEXT: vmovq %rax, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: fptosi_4f64_to_4i64:
; AVX2: # BB#0:
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vcvttsd2si %xmm1, %rax
; AVX2-NEXT: vmovq %rax, %xmm2
; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX2-NEXT: vcvttsd2si %xmm1, %rax
; AVX2-NEXT: vmovq %rax, %xmm1
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX2-NEXT: vcvttsd2si %xmm0, %rax
; AVX2-NEXT: vmovq %rax, %xmm2
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vcvttsd2si %xmm0, %rax
; AVX2-NEXT: vmovq %rax, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: fptosi_4f64_to_4i64:
; AVX512F: # BB#0:
; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vcvttsd2si %xmm1, %rax
; AVX512F-NEXT: vmovq %rax, %xmm2
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512F-NEXT: vcvttsd2si %xmm1, %rax
; AVX512F-NEXT: vmovq %rax, %xmm1
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512F-NEXT: vcvttsd2si %xmm0, %rax
; AVX512F-NEXT: vmovq %rax, %xmm2
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512F-NEXT: vcvttsd2si %xmm0, %rax
; AVX512F-NEXT: vmovq %rax, %xmm0
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fptosi_4f64_to_4i64:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vcvttsd2si %xmm1, %rax
; AVX512VL-NEXT: vmovq %rax, %xmm2
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512VL-NEXT: vcvttsd2si %xmm1, %rax
; AVX512VL-NEXT: vmovq %rax, %xmm1
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512VL-NEXT: vcvttsd2si %xmm0, %rax
; AVX512VL-NEXT: vmovq %rax, %xmm2
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512VL-NEXT: vcvttsd2si %xmm0, %rax
; AVX512VL-NEXT: vmovq %rax, %xmm0
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: fptosi_4f64_to_4i64:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvttpd2qq %zmm0, %zmm0
; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptosi_4f64_to_4i64:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: vcvttpd2qq %ymm0, %ymm0
; AVX512VLDQ-NEXT: retq
  %cvt = fptosi <4 x double> %a to <4 x i64>
  ret <4 x i64> %cvt
}

define <4 x i32> @fptosi_4f64_to_4i32(<4 x double> %a) {
; SSE-LABEL: fptosi_4f64_to_4i32:
; SSE: # BB#0:
; SSE-NEXT: cvttpd2dq %xmm1, %xmm1
; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: retq
;
; AVX-LABEL: fptosi_4f64_to_4i32:
; AVX: # BB#0:
; AVX-NEXT: vcvttpd2dq %ymm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
  %cvt = fptosi <4 x double> %a to <4 x i32>
  ret <4 x i32> %cvt
}

;
; Double to Unsigned Integer
;
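; Targets without AVX512F's vcvttsd2usi lack an unsigned scalar conversion, so
; the expansions below subtract 2^63, convert signed, XOR the sign bit back in,
; and use ucomisd/cmovae to pick between that and the plain signed conversion.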

define <2 x i64> @fptoui_2f64_to_2i64(<2 x double> %a) {
; SSE-LABEL: fptoui_2f64_to_2i64:
; SSE: # BB#0:
; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: subsd %xmm2, %xmm1
; SSE-NEXT: cvttsd2si %xmm1, %rax
; SSE-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
; SSE-NEXT: xorq %rcx, %rax
; SSE-NEXT: cvttsd2si %xmm0, %rdx
; SSE-NEXT: ucomisd %xmm2, %xmm0
; SSE-NEXT: cmovaeq %rax, %rdx
; SSE-NEXT: movq %rdx, %xmm1
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: movaps %xmm0, %xmm3
; SSE-NEXT: subsd %xmm2, %xmm3
; SSE-NEXT: cvttsd2si %xmm3, %rax
; SSE-NEXT: xorq %rcx, %rax
; SSE-NEXT: cvttsd2si %xmm0, %rcx
; SSE-NEXT: ucomisd %xmm2, %xmm0
; SSE-NEXT: cmovaeq %rax, %rcx
; SSE-NEXT: movq %rcx, %xmm0
; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; VEX-LABEL: fptoui_2f64_to_2i64:
; VEX: # BB#0:
; VEX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; VEX-NEXT: vsubsd %xmm1, %xmm0, %xmm2
; VEX-NEXT: vcvttsd2si %xmm2, %rax
; VEX-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
; VEX-NEXT: xorq %rcx, %rax
; VEX-NEXT: vcvttsd2si %xmm0, %rdx
; VEX-NEXT: vucomisd %xmm1, %xmm0
; VEX-NEXT: cmovaeq %rax, %rdx
; VEX-NEXT: vmovq %rdx, %xmm2
; VEX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; VEX-NEXT: vsubsd %xmm1, %xmm0, %xmm3
; VEX-NEXT: vcvttsd2si %xmm3, %rax
; VEX-NEXT: xorq %rcx, %rax
; VEX-NEXT: vcvttsd2si %xmm0, %rcx
; VEX-NEXT: vucomisd %xmm1, %xmm0
; VEX-NEXT: cmovaeq %rax, %rcx
; VEX-NEXT: vmovq %rcx, %xmm0
; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
; VEX-NEXT: retq
;
; AVX512F-LABEL: fptoui_2f64_to_2i64:
; AVX512F: # BB#0:
; AVX512F-NEXT: vcvttsd2usi %xmm0, %rax
; AVX512F-NEXT: vmovq %rax, %xmm1
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512F-NEXT: vcvttsd2usi %xmm0, %rax
; AVX512F-NEXT: vmovq %rax, %xmm0
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fptoui_2f64_to_2i64:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vcvttsd2usi %xmm0, %rax
; AVX512VL-NEXT: vmovq %rax, %xmm1
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512VL-NEXT: vcvttsd2usi %xmm0, %rax
; AVX512VL-NEXT: vmovq %rax, %xmm0
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: fptoui_2f64_to_2i64:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvttpd2uqq %zmm0, %zmm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptoui_2f64_to_2i64:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: vcvttpd2uqq %xmm0, %xmm0
; AVX512VLDQ-NEXT: retq
  %cvt = fptoui <2 x double> %a to <2 x i64>
  ret <2 x i64> %cvt
}

define <4 x i32> @fptoui_2f64_to_4i32(<2 x double> %a) {
; SSE-LABEL: fptoui_2f64_to_4i32:
; SSE: # BB#0:
; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: subsd %xmm2, %xmm1
; SSE-NEXT: cvttsd2si %xmm1, %rax
; SSE-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
; SSE-NEXT: xorq %rcx, %rax
; SSE-NEXT: cvttsd2si %xmm0, %rdx
; SSE-NEXT: ucomisd %xmm2, %xmm0
; SSE-NEXT: cmovaeq %rax, %rdx
; SSE-NEXT: movq %rdx, %xmm1
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: movaps %xmm0, %xmm3
; SSE-NEXT: subsd %xmm2, %xmm3
; SSE-NEXT: cvttsd2si %xmm3, %rax
; SSE-NEXT: xorq %rcx, %rax
; SSE-NEXT: cvttsd2si %xmm0, %rcx
; SSE-NEXT: ucomisd %xmm2, %xmm0
; SSE-NEXT: cmovaeq %rax, %rcx
; SSE-NEXT: movq %rcx, %xmm0
; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: pxor %xmm0, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; VEX-LABEL: fptoui_2f64_to_4i32:
; VEX: # BB#0:
; VEX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; VEX-NEXT: vsubsd %xmm1, %xmm0, %xmm2
; VEX-NEXT: vcvttsd2si %xmm2, %rax
; VEX-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
; VEX-NEXT: xorq %rcx, %rax
; VEX-NEXT: vcvttsd2si %xmm0, %rdx
; VEX-NEXT: vucomisd %xmm1, %xmm0
; VEX-NEXT: cmovaeq %rax, %rdx
; VEX-NEXT: vmovq %rdx, %xmm2
; VEX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; VEX-NEXT: vsubsd %xmm1, %xmm0, %xmm3
; VEX-NEXT: vcvttsd2si %xmm3, %rax
; VEX-NEXT: xorq %rcx, %rax
; VEX-NEXT: vcvttsd2si %xmm0, %rcx
; VEX-NEXT: vucomisd %xmm1, %xmm0
; VEX-NEXT: cmovaeq %rax, %rcx
; VEX-NEXT: vmovq %rcx, %xmm0
; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; VEX-NEXT: retq
;
; AVX512F-LABEL: fptoui_2f64_to_4i32:
; AVX512F: # BB#0:
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512F-NEXT: vcvttpd2udq %zmm0, %ymm0
; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fptoui_2f64_to_4i32:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vcvttpd2udq %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: fptoui_2f64_to_4i32:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvttpd2udq %zmm0, %ymm0
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptoui_2f64_to_4i32:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: vcvttpd2udq %xmm0, %xmm0
; AVX512VLDQ-NEXT: retq
  %cvt = fptoui <2 x double> %a to <2 x i32>
  %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i32> %ext
}

define <4 x i32> @fptoui_2f64_to_2i32(<2 x double> %a) {
; SSE-LABEL: fptoui_2f64_to_2i32:
; SSE: # BB#0:
; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT: movapd %xmm0, %xmm2
; SSE-NEXT: subsd %xmm1, %xmm2
; SSE-NEXT: cvttsd2si %xmm2, %rax
; SSE-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
; SSE-NEXT: xorq %rcx, %rax
; SSE-NEXT: cvttsd2si %xmm0, %rdx
; SSE-NEXT: ucomisd %xmm1, %xmm0
; SSE-NEXT: cmovaeq %rax, %rdx
; SSE-NEXT: movq %rdx, %xmm2
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: movaps %xmm0, %xmm3
; SSE-NEXT: subsd %xmm1, %xmm3
; SSE-NEXT: cvttsd2si %xmm3, %rax
; SSE-NEXT: xorq %rcx, %rax
; SSE-NEXT: cvttsd2si %xmm0, %rcx
; SSE-NEXT: ucomisd %xmm1, %xmm0
; SSE-NEXT: cmovaeq %rax, %rcx
; SSE-NEXT: movq %rcx, %xmm0
; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE-NEXT: retq
;
; VEX-LABEL: fptoui_2f64_to_2i32:
; VEX: # BB#0:
; VEX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; VEX-NEXT: vsubsd %xmm1, %xmm0, %xmm2
; VEX-NEXT: vcvttsd2si %xmm2, %rax
; VEX-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
; VEX-NEXT: xorq %rcx, %rax
; VEX-NEXT: vcvttsd2si %xmm0, %rdx
; VEX-NEXT: vucomisd %xmm1, %xmm0
; VEX-NEXT: cmovaeq %rax, %rdx
; VEX-NEXT: vmovq %rdx, %xmm2
; VEX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; VEX-NEXT: vsubsd %xmm1, %xmm0, %xmm3
; VEX-NEXT: vcvttsd2si %xmm3, %rax
; VEX-NEXT: xorq %rcx, %rax
; VEX-NEXT: vcvttsd2si %xmm0, %rcx
; VEX-NEXT: vucomisd %xmm1, %xmm0
; VEX-NEXT: cmovaeq %rax, %rcx
; VEX-NEXT: vmovq %rcx, %xmm0
; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
; VEX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; VEX-NEXT: retq
;
; AVX512F-LABEL: fptoui_2f64_to_2i32:
; AVX512F: # BB#0:
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512F-NEXT: vcvttpd2udq %zmm0, %ymm0
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fptoui_2f64_to_2i32:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vcvttpd2udq %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: fptoui_2f64_to_2i32:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvttpd2udq %zmm0, %ymm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptoui_2f64_to_2i32:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: vcvttpd2udq %xmm0, %xmm0
; AVX512VLDQ-NEXT: retq
  %cvt = fptoui <2 x double> %a to <2 x i32>
  %ext = shufflevector <2 x i32> %cvt, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  ret <4 x i32> %ext
}

define <4 x i32> @fptoui_4f64_to_2i32(<2 x double> %a) {
; SSE-LABEL: fptoui_4f64_to_2i32:
; SSE: # BB#0:
; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: subsd %xmm2, %xmm1
; SSE-NEXT: cvttsd2si %xmm1, %rax
; SSE-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
; SSE-NEXT: xorq %rcx, %rax
; SSE-NEXT: cvttsd2si %xmm0, %rdx
; SSE-NEXT: ucomisd %xmm2, %xmm0
; SSE-NEXT: cmovaeq %rax, %rdx
; SSE-NEXT: movq %rdx, %xmm1
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: movaps %xmm0, %xmm3
; SSE-NEXT: subsd %xmm2, %xmm3
; SSE-NEXT: cvttsd2si %xmm3, %rax
; SSE-NEXT: xorq %rcx, %rax
; SSE-NEXT: cvttsd2si %xmm0, %rdx
; SSE-NEXT: ucomisd %xmm2, %xmm0
; SSE-NEXT: cmovaeq %rax, %rdx
; SSE-NEXT: movq %rdx, %xmm0
; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: cvttsd2si %xmm0, %rax
; SSE-NEXT: xorq %rax, %rcx
; SSE-NEXT: ucomisd %xmm2, %xmm0
; SSE-NEXT: cmovbq %rax, %rcx
; SSE-NEXT: movq %rcx, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2]
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; VEX-LABEL: fptoui_4f64_to_2i32:
; VEX: # BB#0:
; VEX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; VEX-NEXT: vcvttsd2si %xmm1, %rax
; VEX-NEXT: vcvttsd2si %xmm0, %rcx
; VEX-NEXT: vmovd %ecx, %xmm0
; VEX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
; VEX-NEXT: vcvttsd2si %xmm0, %rax
; VEX-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
; VEX-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
; VEX-NEXT: retq
;
; AVX512F-LABEL: fptoui_4f64_to_2i32:
; AVX512F: # BB#0:
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512F-NEXT: vcvttpd2udq %zmm0, %ymm0
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fptoui_4f64_to_2i32:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; AVX512VL-NEXT: vcvttpd2udq %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: fptoui_4f64_to_2i32:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvttpd2udq %zmm0, %ymm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptoui_4f64_to_2i32:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; AVX512VLDQ-NEXT: vcvttpd2udq %ymm0, %xmm0
; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
  %ext = shufflevector <2 x double> %a, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %cvt = fptoui <4 x double> %ext to <4 x i32>
  ret <4 x i32> %cvt
}

define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) {
; SSE-LABEL: fptoui_4f64_to_4i64:
; SSE: # BB#0:
; SSE-NEXT: movapd %xmm0, %xmm2
; SSE-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero
; SSE-NEXT: subsd %xmm3, %xmm0
; SSE-NEXT: cvttsd2si %xmm0, %rcx
; SSE-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
; SSE-NEXT: xorq %rax, %rcx
; SSE-NEXT: cvttsd2si %xmm2, %rdx
; SSE-NEXT: ucomisd %xmm3, %xmm2
; SSE-NEXT: cmovaeq %rcx, %rdx
; SSE-NEXT: movq %rdx, %xmm0
; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT: movaps %xmm2, %xmm4
; SSE-NEXT: subsd %xmm3, %xmm4
; SSE-NEXT: cvttsd2si %xmm4, %rcx
; SSE-NEXT: xorq %rax, %rcx
; SSE-NEXT: cvttsd2si %xmm2, %rdx
; SSE-NEXT: ucomisd %xmm3, %xmm2
; SSE-NEXT: cmovaeq %rcx, %rdx
; SSE-NEXT: movq %rdx, %xmm2
; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE-NEXT: movapd %xmm1, %xmm2
; SSE-NEXT: subsd %xmm3, %xmm2
; SSE-NEXT: cvttsd2si %xmm2, %rcx
; SSE-NEXT: xorq %rax, %rcx
; SSE-NEXT: cvttsd2si %xmm1, %rdx
; SSE-NEXT: ucomisd %xmm3, %xmm1
; SSE-NEXT: cmovaeq %rcx, %rdx
; SSE-NEXT: movq %rdx, %xmm2
; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT: movaps %xmm1, %xmm4
; SSE-NEXT: subsd %xmm3, %xmm4
; SSE-NEXT: cvttsd2si %xmm4, %rcx
; SSE-NEXT: xorq %rax, %rcx
; SSE-NEXT: cvttsd2si %xmm1, %rax
; SSE-NEXT: ucomisd %xmm3, %xmm1
; SSE-NEXT: cmovaeq %rcx, %rax
; SSE-NEXT: movq %rax, %xmm1
; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE-NEXT: movdqa %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: fptoui_4f64_to_4i64:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX1-NEXT: vsubsd %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vcvttsd2si %xmm3, %rax
; AVX1-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
; AVX1-NEXT: xorq %rcx, %rax
; AVX1-NEXT: vcvttsd2si %xmm2, %rdx
; AVX1-NEXT: vucomisd %xmm1, %xmm2
; AVX1-NEXT: cmovaeq %rax, %rdx
; AVX1-NEXT: vmovq %rdx, %xmm3
; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX1-NEXT: vsubsd %xmm1, %xmm2, %xmm4
; AVX1-NEXT: vcvttsd2si %xmm4, %rax
; AVX1-NEXT: xorq %rcx, %rax
; AVX1-NEXT: vcvttsd2si %xmm2, %rdx
; AVX1-NEXT: vucomisd %xmm1, %xmm2
; AVX1-NEXT: cmovaeq %rax, %rdx
; AVX1-NEXT: vmovq %rdx, %xmm2
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX1-NEXT: vsubsd %xmm1, %xmm0, %xmm3
; AVX1-NEXT: vcvttsd2si %xmm3, %rax
; AVX1-NEXT: xorq %rcx, %rax
; AVX1-NEXT: vcvttsd2si %xmm0, %rdx
; AVX1-NEXT: vucomisd %xmm1, %xmm0
; AVX1-NEXT: cmovaeq %rax, %rdx
; AVX1-NEXT: vmovq %rdx, %xmm3
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vsubsd %xmm1, %xmm0, %xmm4
; AVX1-NEXT: vcvttsd2si %xmm4, %rax
; AVX1-NEXT: xorq %rcx, %rax
; AVX1-NEXT: vcvttsd2si %xmm0, %rcx
; AVX1-NEXT: vucomisd %xmm1, %xmm0
; AVX1-NEXT: cmovaeq %rax, %rcx
; AVX1-NEXT: vmovq %rcx, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: fptoui_4f64_to_4i64:
; AVX2: # BB#0:
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT: vsubsd %xmm1, %xmm2, %xmm3
; AVX2-NEXT: vcvttsd2si %xmm3, %rax
; AVX2-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
; AVX2-NEXT: xorq %rcx, %rax
; AVX2-NEXT: vcvttsd2si %xmm2, %rdx
; AVX2-NEXT: vucomisd %xmm1, %xmm2
; AVX2-NEXT: cmovaeq %rax, %rdx
; AVX2-NEXT: vmovq %rdx, %xmm3
; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX2-NEXT: vsubsd %xmm1, %xmm2, %xmm4
; AVX2-NEXT: vcvttsd2si %xmm4, %rax
; AVX2-NEXT: xorq %rcx, %rax
; AVX2-NEXT: vcvttsd2si %xmm2, %rdx
; AVX2-NEXT: vucomisd %xmm1, %xmm2
; AVX2-NEXT: cmovaeq %rax, %rdx
; AVX2-NEXT: vmovq %rdx, %xmm2
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX2-NEXT: vsubsd %xmm1, %xmm0, %xmm3
; AVX2-NEXT: vcvttsd2si %xmm3, %rax
; AVX2-NEXT: xorq %rcx, %rax
; AVX2-NEXT: vcvttsd2si %xmm0, %rdx
; AVX2-NEXT: vucomisd %xmm1, %xmm0
; AVX2-NEXT: cmovaeq %rax, %rdx
; AVX2-NEXT: vmovq %rdx, %xmm3
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vsubsd %xmm1, %xmm0, %xmm4
; AVX2-NEXT: vcvttsd2si %xmm4, %rax
; AVX2-NEXT: xorq %rcx, %rax
; AVX2-NEXT: vcvttsd2si %xmm0, %rcx
; AVX2-NEXT: vucomisd %xmm1, %xmm0
; AVX2-NEXT: cmovaeq %rax, %rcx
; AVX2-NEXT: vmovq %rcx, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: fptoui_4f64_to_4i64:
; AVX512F: # BB#0:
; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vcvttsd2usi %xmm1, %rax
; AVX512F-NEXT: vmovq %rax, %xmm2
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512F-NEXT: vcvttsd2usi %xmm1, %rax
; AVX512F-NEXT: vmovq %rax, %xmm1
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512F-NEXT: vcvttsd2usi %xmm0, %rax
; AVX512F-NEXT: vmovq %rax, %xmm2
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512F-NEXT: vcvttsd2usi %xmm0, %rax
; AVX512F-NEXT: vmovq %rax, %xmm0
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fptoui_4f64_to_4i64:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vcvttsd2usi %xmm1, %rax
; AVX512VL-NEXT: vmovq %rax, %xmm2
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512VL-NEXT: vcvttsd2usi %xmm1, %rax
; AVX512VL-NEXT: vmovq %rax, %xmm1
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512VL-NEXT: vcvttsd2usi %xmm0, %rax
; AVX512VL-NEXT: vmovq %rax, %xmm2
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512VL-NEXT: vcvttsd2usi %xmm0, %rax
; AVX512VL-NEXT: vmovq %rax, %xmm0
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: fptoui_4f64_to_4i64:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvttpd2uqq %zmm0, %zmm0
; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptoui_4f64_to_4i64:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: vcvttpd2uqq %ymm0, %ymm0
; AVX512VLDQ-NEXT: retq
  %cvt = fptoui <4 x double> %a to <4 x i64>
  ret <4 x i64> %cvt
}

define <4 x i32> @fptoui_4f64_to_4i32(<4 x double> %a) {
; SSE-LABEL: fptoui_4f64_to_4i32:
; SSE: # BB#0:
; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
; SSE-NEXT: movapd %xmm1, %xmm3
; SSE-NEXT: subsd %xmm2, %xmm3
; SSE-NEXT: cvttsd2si %xmm3, %rcx
; SSE-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
; SSE-NEXT: xorq %rax, %rcx
; SSE-NEXT: cvttsd2si %xmm1, %rdx
; SSE-NEXT: ucomisd %xmm2, %xmm1
; SSE-NEXT: cmovaeq %rcx, %rdx
; SSE-NEXT: movq %rdx, %xmm3
; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT: movaps %xmm1, %xmm4
; SSE-NEXT: subsd %xmm2, %xmm4
; SSE-NEXT: cvttsd2si %xmm4, %rcx
; SSE-NEXT: xorq %rax, %rcx
; SSE-NEXT: cvttsd2si %xmm1, %rdx
; SSE-NEXT: ucomisd %xmm2, %xmm1
; SSE-NEXT: cmovaeq %rcx, %rdx
; SSE-NEXT: movq %rdx, %xmm1
; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: subsd %xmm2, %xmm1
; SSE-NEXT: cvttsd2si %xmm1, %rcx
; SSE-NEXT: xorq %rax, %rcx
; SSE-NEXT: cvttsd2si %xmm0, %rdx
; SSE-NEXT: ucomisd %xmm2, %xmm0
; SSE-NEXT: cmovaeq %rcx, %rdx
; SSE-NEXT: movq %rdx, %xmm1
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: movaps %xmm0, %xmm4
; SSE-NEXT: subsd %xmm2, %xmm4
; SSE-NEXT: cvttsd2si %xmm4, %rcx
; SSE-NEXT: xorq %rax, %rcx
; SSE-NEXT: cvttsd2si %xmm0, %rax
; SSE-NEXT: ucomisd %xmm2, %xmm0
; SSE-NEXT: cmovaeq %rcx, %rax
; SSE-NEXT: movq %rax, %xmm0
; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2]
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
|
2015-05-02 19:18:47 +08:00
|
|
|
;
|
2016-11-23 06:04:50 +08:00
|
|
|
; VEX-LABEL: fptoui_4f64_to_4i32:
|
|
|
|
; VEX: # BB#0:
|
|
|
|
; VEX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
|
|
|
|
; VEX-NEXT: vcvttsd2si %xmm1, %rax
|
|
|
|
; VEX-NEXT: vcvttsd2si %xmm0, %rcx
|
|
|
|
; VEX-NEXT: vmovd %ecx, %xmm1
|
|
|
|
; VEX-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
|
|
|
|
; VEX-NEXT: vextractf128 $1, %ymm0, %xmm0
|
|
|
|
; VEX-NEXT: vcvttsd2si %xmm0, %rax
|
|
|
|
; VEX-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
|
|
|
|
; VEX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
|
|
|
|
; VEX-NEXT: vcvttsd2si %xmm0, %rax
|
|
|
|
; VEX-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0
|
|
|
|
; VEX-NEXT: vzeroupper
|
|
|
|
; VEX-NEXT: retq
|
2016-08-23 22:37:35 +08:00
|
|
|
;
|
2016-11-08 14:58:53 +08:00
|
|
|
; AVX512F-LABEL: fptoui_4f64_to_4i32:
|
|
|
|
; AVX512F: # BB#0:
|
|
|
|
; AVX512F-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
|
|
|
|
; AVX512F-NEXT: vcvttpd2udq %zmm0, %ymm0
|
|
|
|
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
|
2017-03-03 17:03:24 +08:00
|
|
|
; AVX512F-NEXT: vzeroupper
|
2016-11-08 14:58:53 +08:00
|
|
|
; AVX512F-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX512VL-LABEL: fptoui_4f64_to_4i32:
|
|
|
|
; AVX512VL: # BB#0:
|
|
|
|
; AVX512VL-NEXT: vcvttpd2udq %ymm0, %xmm0
|
2017-03-03 17:03:24 +08:00
|
|
|
; AVX512VL-NEXT: vzeroupper
|
2016-11-08 14:58:53 +08:00
|
|
|
; AVX512VL-NEXT: retq
|
|
|
|
;
|
2016-11-23 06:04:50 +08:00
|
|
|
; AVX512DQ-LABEL: fptoui_4f64_to_4i32:
|
|
|
|
; AVX512DQ: # BB#0:
|
|
|
|
; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
|
|
|
|
; AVX512DQ-NEXT: vcvttpd2udq %zmm0, %ymm0
|
|
|
|
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
|
2017-03-03 17:03:24 +08:00
|
|
|
; AVX512DQ-NEXT: vzeroupper
|
2016-11-23 06:04:50 +08:00
|
|
|
; AVX512DQ-NEXT: retq
|
|
|
|
;
|
2016-11-08 14:58:53 +08:00
|
|
|
; AVX512VLDQ-LABEL: fptoui_4f64_to_4i32:
|
|
|
|
; AVX512VLDQ: # BB#0:
|
|
|
|
; AVX512VLDQ-NEXT: vcvttpd2udq %ymm0, %xmm0
|
2017-03-03 17:03:24 +08:00
|
|
|
; AVX512VLDQ-NEXT: vzeroupper
|
2016-11-08 14:58:53 +08:00
|
|
|
; AVX512VLDQ-NEXT: retq
|
2015-05-02 19:18:47 +08:00
|
|
|
%cvt = fptoui <4 x double> %a to <4 x i32>
|
|
|
|
ret <4 x i32> %cvt
|
|
|
|
}
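; NOTE: Without AVX512's native unsigned conversions, the SSE and VEX
; lowerings above must synthesize fptoui from the signed cvttsd2si. The
; subsd/xorq/ucomisd/cmovaeq sequence checked above corresponds roughly to
; the branchless select sketched below (illustrative C, not compiler output;
; the helper name is made up, and the in-range convert models the hardware
; behaviour rather than strict C semantics):
;
;   uint64_t fptoui_f64_lane(double x) {
;     const double bias = 9223372036854775808.0;   /* 2^63 */
;     uint64_t lo = (uint64_t)(int64_t)x;          /* valid if x < 2^63 */
;     uint64_t hi = (uint64_t)(int64_t)(x - bias)
;                   ^ 0x8000000000000000ULL;       /* re-add 2^63 via bit 63 */
;     return x >= bias ? hi : lo;                  /* ucomisd + cmovaeq */
;   }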

;
; Float to Signed Integer
;

define <2 x i32> @fptosi_2f32_to_2i32(<2 x float> %a) {
; SSE-LABEL: fptosi_2f32_to_2i32:
; SSE: # BB#0:
; SSE-NEXT: cvttps2dq %xmm0, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; SSE-NEXT: retq
;
; AVX-LABEL: fptosi_2f32_to_2i32:
; AVX: # BB#0:
; AVX-NEXT: vcvttps2dq %xmm0, %xmm0
; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX-NEXT: retq
  %cvt = fptosi <2 x float> %a to <2 x i32>
  ret <2 x i32> %cvt
}
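; NOTE: <2 x i32> is not a legal result type here, so the result is widened
; into the two 64-bit lanes of an xmm register. After the packed cvttps2dq
; both i32 results sit in lanes 0 and 1; the pshufd (SSE) and vpmovzxdq (AVX)
; checked above respread them so each value occupies its own 64-bit lane.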

define <4 x i32> @fptosi_4f32_to_4i32(<4 x float> %a) {
; SSE-LABEL: fptosi_4f32_to_4i32:
; SSE: # BB#0:
; SSE-NEXT: cvttps2dq %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: fptosi_4f32_to_4i32:
; AVX: # BB#0:
; AVX-NEXT: vcvttps2dq %xmm0, %xmm0
; AVX-NEXT: retq
  %cvt = fptosi <4 x float> %a to <4 x i32>
  ret <4 x i32> %cvt
}
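; NOTE: Signed f32 to i32 is the easy case: cvttps2dq converts four lanes in
; one instruction on every tested subtarget, and wider inputs (see
; fptosi_8f32_to_8i32 below) simply split into 128-bit halves on SSE or use
; the 256-bit form on AVX.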

define <2 x i64> @fptosi_2f32_to_2i64(<4 x float> %a) {
; SSE-LABEL: fptosi_2f32_to_2i64:
; SSE: # BB#0:
; SSE-NEXT: cvttss2si %xmm0, %rax
; SSE-NEXT: movq %rax, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE-NEXT: cvttss2si %xmm0, %rax
; SSE-NEXT: movq %rax, %xmm0
; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; VEX-LABEL: fptosi_2f32_to_2i64:
; VEX: # BB#0:
; VEX-NEXT: vcvttss2si %xmm0, %rax
; VEX-NEXT: vmovq %rax, %xmm1
; VEX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; VEX-NEXT: vcvttss2si %xmm0, %rax
; VEX-NEXT: vmovq %rax, %xmm0
; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; VEX-NEXT: retq
;
; AVX512F-LABEL: fptosi_2f32_to_2i64:
; AVX512F: # BB#0:
; AVX512F-NEXT: vcvttss2si %xmm0, %rax
; AVX512F-NEXT: vmovq %rax, %xmm1
; AVX512F-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX512F-NEXT: vcvttss2si %xmm0, %rax
; AVX512F-NEXT: vmovq %rax, %xmm0
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fptosi_2f32_to_2i64:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vcvttss2si %xmm0, %rax
; AVX512VL-NEXT: vmovq %rax, %xmm1
; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX512VL-NEXT: vcvttss2si %xmm0, %rax
; AVX512VL-NEXT: vmovq %rax, %xmm0
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: fptosi_2f32_to_2i64:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: vcvttss2si %xmm0, %rax
; AVX512DQ-NEXT: vmovq %rax, %xmm1
; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX512DQ-NEXT: vcvttss2si %xmm0, %rax
; AVX512DQ-NEXT: vmovq %rax, %xmm0
; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptosi_2f32_to_2i64:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: vcvttps2qq %xmm0, %xmm0
; AVX512VLDQ-NEXT: retq
  %shuf = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 1>
  %cvt = fptosi <2 x float> %shuf to <2 x i64>
  ret <2 x i64> %cvt
}
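; NOTE: Before AVX512DQ there is no packed f32 to i64 conversion, so each
; lane is extracted, converted with the scalar cvttss2si, and re-packed with
; movq/punpcklqdq. A rough reference loop (illustrative C, not compiler
; output; the helper name is made up):
;
;   void fptosi_2f32_to_2i64_ref(const float *src, int64_t *dst) {
;     for (int i = 0; i < 2; ++i)
;       dst[i] = (int64_t)src[i];   /* one cvttss2si per lane */
;   }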

define <2 x i64> @fptosi_4f32_to_2i64(<4 x float> %a) {
; SSE-LABEL: fptosi_4f32_to_2i64:
; SSE: # BB#0:
; SSE-NEXT: cvttss2si %xmm0, %rax
; SSE-NEXT: movq %rax, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE-NEXT: cvttss2si %xmm0, %rax
; SSE-NEXT: movq %rax, %xmm0
; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; VEX-LABEL: fptosi_4f32_to_2i64:
; VEX: # BB#0:
; VEX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; VEX-NEXT: vcvttss2si %xmm1, %rax
; VEX-NEXT: vcvttss2si %xmm0, %rcx
; VEX-NEXT: vmovq %rcx, %xmm0
; VEX-NEXT: vmovq %rax, %xmm1
; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; VEX-NEXT: retq
;
; AVX512F-LABEL: fptosi_4f32_to_2i64:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512F-NEXT: vcvttss2si %xmm1, %rax
; AVX512F-NEXT: vcvttss2si %xmm0, %rcx
; AVX512F-NEXT: vmovq %rcx, %xmm0
; AVX512F-NEXT: vmovq %rax, %xmm1
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fptosi_4f32_to_2i64:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512VL-NEXT: vcvttss2si %xmm1, %rax
; AVX512VL-NEXT: vcvttss2si %xmm0, %rcx
; AVX512VL-NEXT: vmovq %rcx, %xmm0
; AVX512VL-NEXT: vmovq %rax, %xmm1
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: fptosi_4f32_to_2i64:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; AVX512DQ-NEXT: vcvttps2qq %ymm0, %zmm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptosi_4f32_to_2i64:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: vcvttps2qq %xmm0, %ymm0
; AVX512VLDQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
  %cvt = fptosi <4 x float> %a to <4 x i64>
  %shuf = shufflevector <4 x i64> %cvt, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  ret <2 x i64> %shuf
}
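; NOTE: The "# kill" comments in the DQ runs above are autogenerated notes
; about register-class copies: the 128-bit input is read as the wider
; ymm/zmm register that vcvttps2qq operates on, and only the low 128 bits of
; the result are used. vzeroupper is emitted before returning because the
; upper ymm/zmm state was touched.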

define <8 x i32> @fptosi_8f32_to_8i32(<8 x float> %a) {
; SSE-LABEL: fptosi_8f32_to_8i32:
; SSE: # BB#0:
; SSE-NEXT: cvttps2dq %xmm0, %xmm0
; SSE-NEXT: cvttps2dq %xmm1, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: fptosi_8f32_to_8i32:
; AVX: # BB#0:
; AVX-NEXT: vcvttps2dq %ymm0, %ymm0
; AVX-NEXT: retq
  %cvt = fptosi <8 x float> %a to <8 x i32>
  ret <8 x i32> %cvt
}

define <4 x i64> @fptosi_4f32_to_4i64(<8 x float> %a) {
; SSE-LABEL: fptosi_4f32_to_4i64:
; SSE: # BB#0:
; SSE-NEXT: cvttss2si %xmm0, %rax
; SSE-NEXT: movq %rax, %xmm2
; SSE-NEXT: movaps %xmm0, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
; SSE-NEXT: cvttss2si %xmm1, %rax
; SSE-NEXT: movq %rax, %xmm1
; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE-NEXT: movaps %xmm0, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; SSE-NEXT: cvttss2si %xmm1, %rax
; SSE-NEXT: movq %rax, %xmm3
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: cvttss2si %xmm0, %rax
; SSE-NEXT: movq %rax, %xmm1
; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; SSE-NEXT: movdqa %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: fptosi_4f32_to_4i64:
; AVX1: # BB#0:
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
; AVX1-NEXT: vcvttss2si %xmm1, %rax
; AVX1-NEXT: vmovq %rax, %xmm1
; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX1-NEXT: vcvttss2si %xmm2, %rax
; AVX1-NEXT: vmovq %rax, %xmm2
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX1-NEXT: vcvttss2si %xmm0, %rax
; AVX1-NEXT: vmovq %rax, %xmm2
; AVX1-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX1-NEXT: vcvttss2si %xmm0, %rax
; AVX1-NEXT: vmovq %rax, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: fptosi_4f32_to_4i64:
; AVX2: # BB#0:
; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
; AVX2-NEXT: vcvttss2si %xmm1, %rax
; AVX2-NEXT: vmovq %rax, %xmm1
; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX2-NEXT: vcvttss2si %xmm2, %rax
; AVX2-NEXT: vmovq %rax, %xmm2
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX2-NEXT: vcvttss2si %xmm0, %rax
; AVX2-NEXT: vmovq %rax, %xmm2
; AVX2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX2-NEXT: vcvttss2si %xmm0, %rax
; AVX2-NEXT: vmovq %rax, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: fptosi_4f32_to_4i64:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
; AVX512F-NEXT: vcvttss2si %xmm1, %rax
; AVX512F-NEXT: vmovq %rax, %xmm1
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512F-NEXT: vcvttss2si %xmm2, %rax
; AVX512F-NEXT: vmovq %rax, %xmm2
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512F-NEXT: vcvttss2si %xmm0, %rax
; AVX512F-NEXT: vmovq %rax, %xmm2
; AVX512F-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX512F-NEXT: vcvttss2si %xmm0, %rax
; AVX512F-NEXT: vmovq %rax, %xmm0
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fptosi_4f32_to_4i64:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
; AVX512VL-NEXT: vcvttss2si %xmm1, %rax
; AVX512VL-NEXT: vmovq %rax, %xmm1
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512VL-NEXT: vcvttss2si %xmm2, %rax
; AVX512VL-NEXT: vmovq %rax, %xmm2
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512VL-NEXT: vcvttss2si %xmm0, %rax
; AVX512VL-NEXT: vmovq %rax, %xmm2
; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX512VL-NEXT: vcvttss2si %xmm0, %rax
; AVX512VL-NEXT: vmovq %rax, %xmm0
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: fptosi_4f32_to_4i64:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: vcvttps2qq %ymm0, %zmm0
; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptosi_4f32_to_4i64:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: vcvttps2qq %xmm0, %ymm0
; AVX512VLDQ-NEXT: retq
  %shuf = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %cvt = fptosi <4 x float> %shuf to <4 x i64>
  ret <4 x i64> %cvt
}
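; NOTE: For a 256-bit <4 x i64> result the pre-DQ AVX lowerings convert the
; lanes pairwise into two xmm registers and then glue the halves together
; with vinsertf128 (AVX1 float domain) or vinserti128 (AVX2/AVX512 integer
; domain).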

define <4 x i64> @fptosi_8f32_to_4i64(<8 x float> %a) {
; SSE-LABEL: fptosi_8f32_to_4i64:
; SSE: # BB#0:
; SSE-NEXT: cvttss2si %xmm0, %rax
; SSE-NEXT: movq %rax, %xmm2
; SSE-NEXT: movaps %xmm0, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
; SSE-NEXT: cvttss2si %xmm1, %rax
; SSE-NEXT: movq %rax, %xmm1
; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE-NEXT: movaps %xmm0, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; SSE-NEXT: cvttss2si %xmm1, %rax
; SSE-NEXT: movq %rax, %xmm3
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: cvttss2si %xmm0, %rax
; SSE-NEXT: movq %rax, %xmm1
; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; SSE-NEXT: movdqa %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: fptosi_8f32_to_4i64:
; AVX1: # BB#0:
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
; AVX1-NEXT: vcvttss2si %xmm1, %rax
; AVX1-NEXT: vmovq %rax, %xmm1
; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX1-NEXT: vcvttss2si %xmm2, %rax
; AVX1-NEXT: vmovq %rax, %xmm2
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX1-NEXT: vcvttss2si %xmm0, %rax
; AVX1-NEXT: vmovq %rax, %xmm2
; AVX1-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX1-NEXT: vcvttss2si %xmm0, %rax
; AVX1-NEXT: vmovq %rax, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: fptosi_8f32_to_4i64:
; AVX2: # BB#0:
; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
; AVX2-NEXT: vcvttss2si %xmm1, %rax
; AVX2-NEXT: vmovq %rax, %xmm1
; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX2-NEXT: vcvttss2si %xmm2, %rax
; AVX2-NEXT: vmovq %rax, %xmm2
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX2-NEXT: vcvttss2si %xmm0, %rax
; AVX2-NEXT: vmovq %rax, %xmm2
; AVX2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX2-NEXT: vcvttss2si %xmm0, %rax
; AVX2-NEXT: vmovq %rax, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: fptosi_8f32_to_4i64:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512F-NEXT: vcvttss2si %xmm1, %rax
; AVX512F-NEXT: vcvttss2si %xmm0, %rcx
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512F-NEXT: vcvttss2si %xmm1, %rdx
; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512F-NEXT: vcvttss2si %xmm0, %rsi
; AVX512F-NEXT: vmovq %rsi, %xmm0
; AVX512F-NEXT: vmovq %rdx, %xmm1
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512F-NEXT: vmovq %rcx, %xmm1
; AVX512F-NEXT: vmovq %rax, %xmm2
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fptosi_8f32_to_4i64:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512VL-NEXT: vcvttss2si %xmm1, %rax
; AVX512VL-NEXT: vcvttss2si %xmm0, %rcx
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512VL-NEXT: vcvttss2si %xmm1, %rdx
; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512VL-NEXT: vcvttss2si %xmm0, %rsi
; AVX512VL-NEXT: vmovq %rsi, %xmm0
; AVX512VL-NEXT: vmovq %rdx, %xmm1
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512VL-NEXT: vmovq %rcx, %xmm1
; AVX512VL-NEXT: vmovq %rax, %xmm2
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: fptosi_8f32_to_4i64:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: vcvttps2qq %ymm0, %zmm0
; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptosi_8f32_to_4i64:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: vcvttps2qq %ymm0, %zmm0
; AVX512VLDQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512VLDQ-NEXT: retq
  %cvt = fptosi <8 x float> %a to <8 x i64>
  %shuf = shufflevector <8 x i64> %cvt, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i64> %shuf
}

;
; Float to Unsigned Integer
;

define <2 x i32> @fptoui_2f32_to_2i32(<2 x float> %a) {
; SSE-LABEL: fptoui_2f32_to_2i32:
; SSE: # BB#0:
; SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE-NEXT: movaps %xmm0, %xmm1
; SSE-NEXT: subss %xmm2, %xmm1
; SSE-NEXT: cvttss2si %xmm1, %rax
; SSE-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
; SSE-NEXT: xorq %rcx, %rax
; SSE-NEXT: cvttss2si %xmm0, %rdx
; SSE-NEXT: ucomiss %xmm2, %xmm0
; SSE-NEXT: cmovaeq %rax, %rdx
; SSE-NEXT: movq %rdx, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE-NEXT: movaps %xmm0, %xmm3
; SSE-NEXT: subss %xmm2, %xmm3
; SSE-NEXT: cvttss2si %xmm3, %rax
; SSE-NEXT: xorq %rcx, %rax
; SSE-NEXT: cvttss2si %xmm0, %rcx
; SSE-NEXT: ucomiss %xmm2, %xmm0
; SSE-NEXT: cmovaeq %rax, %rcx
; SSE-NEXT: movq %rcx, %xmm0
; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; VEX-LABEL: fptoui_2f32_to_2i32:
; VEX: # BB#0:
; VEX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; VEX-NEXT: vsubss %xmm1, %xmm0, %xmm2
; VEX-NEXT: vcvttss2si %xmm2, %rax
; VEX-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
; VEX-NEXT: xorq %rcx, %rax
; VEX-NEXT: vcvttss2si %xmm0, %rdx
; VEX-NEXT: vucomiss %xmm1, %xmm0
; VEX-NEXT: cmovaeq %rax, %rdx
; VEX-NEXT: vmovq %rdx, %xmm2
; VEX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; VEX-NEXT: vsubss %xmm1, %xmm0, %xmm3
; VEX-NEXT: vcvttss2si %xmm3, %rax
; VEX-NEXT: xorq %rcx, %rax
; VEX-NEXT: vcvttss2si %xmm0, %rcx
; VEX-NEXT: vucomiss %xmm1, %xmm0
; VEX-NEXT: cmovaeq %rax, %rcx
; VEX-NEXT: vmovq %rcx, %xmm0
; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
; VEX-NEXT: retq
;
; AVX512F-LABEL: fptoui_2f32_to_2i32:
; AVX512F: # BB#0:
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512F-NEXT: vcvttps2udq %zmm0, %zmm0
; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fptoui_2f32_to_2i32:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vcvttps2udq %xmm0, %xmm0
; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: fptoui_2f32_to_2i32:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvttps2udq %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptoui_2f32_to_2i32:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: vcvttps2udq %xmm0, %xmm0
; AVX512VLDQ-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX512VLDQ-NEXT: retq
  %cvt = fptoui <2 x float> %a to <2 x i32>
  ret <2 x i32> %cvt
}
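; NOTE: AVX512F adds a packed unsigned convert (vcvttps2udq), which removes
; the whole compare-and-select dance. Without AVX512VL only the 512-bit form
; is legal, so the xmm input is widened to zmm and vzeroupper is needed on
; return; the VL runs use the natural 128-bit form directly.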

define <4 x i32> @fptoui_4f32_to_4i32(<4 x float> %a) {
; SSE-LABEL: fptoui_4f32_to_4i32:
; SSE: # BB#0:
; SSE-NEXT: movaps %xmm0, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; SSE-NEXT: cvttss2si %xmm1, %rax
; SSE-NEXT: movd %eax, %xmm1
; SSE-NEXT: movaps %xmm0, %xmm2
; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT: cvttss2si %xmm2, %rax
; SSE-NEXT: movd %eax, %xmm2
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE-NEXT: cvttss2si %xmm0, %rax
; SSE-NEXT: movd %eax, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE-NEXT: cvttss2si %xmm0, %rax
; SSE-NEXT: movd %eax, %xmm0
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; VEX-LABEL: fptoui_4f32_to_4i32:
; VEX: # BB#0:
; VEX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; VEX-NEXT: vcvttss2si %xmm1, %rax
; VEX-NEXT: vcvttss2si %xmm0, %rcx
; VEX-NEXT: vmovd %ecx, %xmm1
; VEX-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
; VEX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; VEX-NEXT: vcvttss2si %xmm2, %rax
; VEX-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
; VEX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; VEX-NEXT: vcvttss2si %xmm0, %rax
; VEX-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0
; VEX-NEXT: retq
;
; AVX512F-LABEL: fptoui_4f32_to_4i32:
; AVX512F: # BB#0:
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512F-NEXT: vcvttps2udq %zmm0, %zmm0
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fptoui_4f32_to_4i32:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vcvttps2udq %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: fptoui_4f32_to_4i32:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvttps2udq %zmm0, %zmm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptoui_4f32_to_4i32:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: vcvttps2udq %xmm0, %xmm0
; AVX512VLDQ-NEXT: retq
  %cvt = fptoui <4 x float> %a to <4 x i32>
  ret <4 x i32> %cvt
}
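; NOTE: For a u32 result the SSE/VEX code above can skip the bias-and-select
; sequence: every u32 value fits in a signed i64, so one 64-bit cvttss2si
; per lane plus a truncation to 32 bits is enough. Roughly (illustrative C,
; not compiler output; the helper name is made up):
;
;   uint32_t fptoui_f32_to_u32_lane(float x) {
;     return (uint32_t)(int64_t)x;   /* cvttss2si into a 64-bit register,
;                                       then keep the low 32 bits */
;   }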

define <2 x i64> @fptoui_2f32_to_2i64(<4 x float> %a) {
; SSE-LABEL: fptoui_2f32_to_2i64:
; SSE: # BB#0:
; SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE-NEXT: movaps %xmm0, %xmm1
; SSE-NEXT: subss %xmm2, %xmm1
; SSE-NEXT: cvttss2si %xmm1, %rax
; SSE-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
; SSE-NEXT: xorq %rcx, %rax
; SSE-NEXT: cvttss2si %xmm0, %rdx
; SSE-NEXT: ucomiss %xmm2, %xmm0
; SSE-NEXT: cmovaeq %rax, %rdx
; SSE-NEXT: movq %rdx, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE-NEXT: movaps %xmm0, %xmm3
; SSE-NEXT: subss %xmm2, %xmm3
; SSE-NEXT: cvttss2si %xmm3, %rax
; SSE-NEXT: xorq %rcx, %rax
; SSE-NEXT: cvttss2si %xmm0, %rcx
; SSE-NEXT: ucomiss %xmm2, %xmm0
; SSE-NEXT: cmovaeq %rax, %rcx
; SSE-NEXT: movq %rcx, %xmm0
; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; VEX-LABEL: fptoui_2f32_to_2i64:
; VEX: # BB#0:
; VEX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; VEX-NEXT: vsubss %xmm1, %xmm0, %xmm2
; VEX-NEXT: vcvttss2si %xmm2, %rax
; VEX-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
; VEX-NEXT: xorq %rcx, %rax
; VEX-NEXT: vcvttss2si %xmm0, %rdx
; VEX-NEXT: vucomiss %xmm1, %xmm0
; VEX-NEXT: cmovaeq %rax, %rdx
; VEX-NEXT: vmovq %rdx, %xmm2
; VEX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; VEX-NEXT: vsubss %xmm1, %xmm0, %xmm3
; VEX-NEXT: vcvttss2si %xmm3, %rax
; VEX-NEXT: xorq %rcx, %rax
; VEX-NEXT: vcvttss2si %xmm0, %rcx
; VEX-NEXT: vucomiss %xmm1, %xmm0
; VEX-NEXT: cmovaeq %rax, %rcx
; VEX-NEXT: vmovq %rcx, %xmm0
; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
; VEX-NEXT: retq
;
; AVX512F-LABEL: fptoui_2f32_to_2i64:
; AVX512F: # BB#0:
; AVX512F-NEXT: vcvttss2usi %xmm0, %rax
; AVX512F-NEXT: vmovq %rax, %xmm1
; AVX512F-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX512F-NEXT: vcvttss2usi %xmm0, %rax
; AVX512F-NEXT: vmovq %rax, %xmm0
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fptoui_2f32_to_2i64:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vcvttss2usi %xmm0, %rax
; AVX512VL-NEXT: vmovq %rax, %xmm1
; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX512VL-NEXT: vcvttss2usi %xmm0, %rax
; AVX512VL-NEXT: vmovq %rax, %xmm0
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: fptoui_2f32_to_2i64:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: vcvttss2usi %xmm0, %rax
; AVX512DQ-NEXT: vmovq %rax, %xmm1
; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX512DQ-NEXT: vcvttss2usi %xmm0, %rax
; AVX512DQ-NEXT: vmovq %rax, %xmm0
; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptoui_2f32_to_2i64:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: vcvttps2uqq %xmm0, %xmm0
; AVX512VLDQ-NEXT: retq
  %shuf = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 1>
  %cvt = fptoui <2 x float> %shuf to <2 x i64>
  ret <2 x i64> %cvt
}
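; NOTE: AVX512F also provides a scalar unsigned convert (vcvttss2usi), so the
; per-lane bias/compare/cmov sequence collapses to one instruction per lane;
; with AVX512DQ plus VL the whole <2 x i64> conversion is a single
; vcvttps2uqq.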

define <2 x i64> @fptoui_4f32_to_2i64(<4 x float> %a) {
; SSE-LABEL: fptoui_4f32_to_2i64:
; SSE: # BB#0:
; SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE-NEXT: movaps %xmm0, %xmm1
; SSE-NEXT: subss %xmm2, %xmm1
; SSE-NEXT: cvttss2si %xmm1, %rax
; SSE-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
; SSE-NEXT: xorq %rcx, %rax
; SSE-NEXT: cvttss2si %xmm0, %rdx
; SSE-NEXT: ucomiss %xmm2, %xmm0
; SSE-NEXT: cmovaeq %rax, %rdx
; SSE-NEXT: movq %rdx, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE-NEXT: movaps %xmm0, %xmm3
; SSE-NEXT: subss %xmm2, %xmm3
; SSE-NEXT: cvttss2si %xmm3, %rax
; SSE-NEXT: xorq %rcx, %rax
; SSE-NEXT: cvttss2si %xmm0, %rcx
; SSE-NEXT: ucomiss %xmm2, %xmm0
; SSE-NEXT: cmovaeq %rax, %rcx
; SSE-NEXT: movq %rcx, %xmm0
; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; VEX-LABEL: fptoui_4f32_to_2i64:
; VEX: # BB#0:
; VEX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; VEX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; VEX-NEXT: vsubss %xmm2, %xmm1, %xmm3
; VEX-NEXT: vcvttss2si %xmm3, %rax
; VEX-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
; VEX-NEXT: xorq %rcx, %rax
; VEX-NEXT: vcvttss2si %xmm1, %rdx
; VEX-NEXT: vucomiss %xmm2, %xmm1
; VEX-NEXT: cmovaeq %rax, %rdx
; VEX-NEXT: vsubss %xmm2, %xmm0, %xmm1
; VEX-NEXT: vcvttss2si %xmm1, %rax
; VEX-NEXT: xorq %rcx, %rax
; VEX-NEXT: vcvttss2si %xmm0, %rcx
; VEX-NEXT: vucomiss %xmm2, %xmm0
; VEX-NEXT: cmovaeq %rax, %rcx
; VEX-NEXT: vmovq %rcx, %xmm0
; VEX-NEXT: vmovq %rdx, %xmm1
; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; VEX-NEXT: retq
;
; AVX512F-LABEL: fptoui_4f32_to_2i64:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512F-NEXT: vcvttss2usi %xmm1, %rax
; AVX512F-NEXT: vcvttss2usi %xmm0, %rcx
; AVX512F-NEXT: vmovq %rcx, %xmm0
; AVX512F-NEXT: vmovq %rax, %xmm1
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fptoui_4f32_to_2i64:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512VL-NEXT: vcvttss2usi %xmm1, %rax
; AVX512VL-NEXT: vcvttss2usi %xmm0, %rcx
; AVX512VL-NEXT: vmovq %rcx, %xmm0
; AVX512VL-NEXT: vmovq %rax, %xmm1
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: fptoui_4f32_to_2i64:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; AVX512DQ-NEXT: vcvttps2uqq %ymm0, %zmm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptoui_4f32_to_2i64:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: vcvttps2uqq %xmm0, %ymm0
; AVX512VLDQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
  %cvt = fptoui <4 x float> %a to <4 x i64>
  %shuf = shufflevector <4 x i64> %cvt, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  ret <2 x i64> %shuf
}

define <8 x i32> @fptoui_8f32_to_8i32(<8 x float> %a) {
; SSE-LABEL: fptoui_8f32_to_8i32:
; SSE: # BB#0:
; SSE-NEXT: movaps %xmm0, %xmm2
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE-NEXT: cvttss2si %xmm0, %rax
; SSE-NEXT: movd %eax, %xmm0
; SSE-NEXT: movaps %xmm2, %xmm3
; SSE-NEXT: movhlps {{.*#+}} xmm3 = xmm3[1,1]
; SSE-NEXT: cvttss2si %xmm3, %rax
; SSE-NEXT: movd %eax, %xmm3
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; SSE-NEXT: cvttss2si %xmm2, %rax
; SSE-NEXT: movd %eax, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,2,3]
; SSE-NEXT: cvttss2si %xmm2, %rax
; SSE-NEXT: movd %eax, %xmm2
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; SSE-NEXT: movaps %xmm1, %xmm2
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1,2,3]
; SSE-NEXT: cvttss2si %xmm2, %rax
; SSE-NEXT: movd %eax, %xmm2
; SSE-NEXT: movaps %xmm1, %xmm3
; SSE-NEXT: movhlps {{.*#+}} xmm3 = xmm3[1,1]
; SSE-NEXT: cvttss2si %xmm3, %rax
; SSE-NEXT: movd %eax, %xmm3
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSE-NEXT: cvttss2si %xmm1, %rax
; SSE-NEXT: movd %eax, %xmm2
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
; SSE-NEXT: cvttss2si %xmm1, %rax
; SSE-NEXT: movd %eax, %xmm1
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSE-NEXT: movdqa %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: fptoui_8f32_to_8i32:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-NEXT: vcvttss2si %xmm2, %rax
; AVX1-NEXT: vcvttss2si %xmm1, %rcx
; AVX1-NEXT: vmovd %ecx, %xmm2
; AVX1-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
; AVX1-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
; AVX1-NEXT: vcvttss2si %xmm3, %rax
; AVX1-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX1-NEXT: vcvttss2si %xmm1, %rax
; AVX1-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1
; AVX1-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX1-NEXT: vcvttss2si %xmm2, %rax
; AVX1-NEXT: vcvttss2si %xmm0, %rcx
; AVX1-NEXT: vmovd %ecx, %xmm2
; AVX1-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
; AVX1-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX1-NEXT: vcvttss2si %xmm3, %rax
; AVX1-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX1-NEXT: vcvttss2si %xmm0, %rax
; AVX1-NEXT: vpinsrd $3, %eax, %xmm2, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: fptoui_8f32_to_8i32:
; AVX2: # BB#0:
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX2-NEXT: vcvttss2si %xmm2, %rax
; AVX2-NEXT: vcvttss2si %xmm1, %rcx
; AVX2-NEXT: vmovd %ecx, %xmm2
; AVX2-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
; AVX2-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
; AVX2-NEXT: vcvttss2si %xmm3, %rax
; AVX2-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX2-NEXT: vcvttss2si %xmm1, %rax
; AVX2-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1
; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX2-NEXT: vcvttss2si %xmm2, %rax
; AVX2-NEXT: vcvttss2si %xmm0, %rcx
; AVX2-NEXT: vmovd %ecx, %xmm2
; AVX2-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
; AVX2-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX2-NEXT: vcvttss2si %xmm3, %rax
; AVX2-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX2-NEXT: vcvttss2si %xmm0, %rax
; AVX2-NEXT: vpinsrd $3, %eax, %xmm2, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: fptoui_8f32_to_8i32:
; AVX512F: # BB#0:
; AVX512F-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512F-NEXT: vcvttps2udq %zmm0, %zmm0
; AVX512F-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fptoui_8f32_to_8i32:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vcvttps2udq %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: fptoui_8f32_to_8i32:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvttps2udq %zmm0, %zmm0
; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptoui_8f32_to_8i32:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: vcvttps2udq %ymm0, %ymm0
; AVX512VLDQ-NEXT: retq
  %cvt = fptoui <8 x float> %a to <8 x i32>
  ret <8 x i32> %cvt
}
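; NOTE: The cost gap is largest here: AVX1/AVX2 scalarize all eight unsigned
; lanes (eight 64-bit cvttss2si plus vmovd/vpinsrd to rebuild the vectors),
; roughly two dozen instructions, where AVX512 needs a single vcvttps2udq.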
|
|
|
|
|
2015-07-17 05:00:57 +08:00
|
|
|
define <4 x i64> @fptoui_4f32_to_4i64(<8 x float> %a) {
|
2015-07-19 00:53:51 +08:00
|
|
|
; SSE-LABEL: fptoui_4f32_to_4i64:
|
|
|
|
; SSE: # BB#0:
|
|
|
|
; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
|
|
|
|
; SSE-NEXT: movaps %xmm0, %xmm2
|
|
|
|
; SSE-NEXT: subss %xmm1, %xmm2
|
|
|
|
; SSE-NEXT: cvttss2si %xmm2, %rcx
|
|
|
|
; SSE-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
|
|
|
|
; SSE-NEXT: xorq %rax, %rcx
|
|
|
|
; SSE-NEXT: cvttss2si %xmm0, %rdx
|
|
|
|
; SSE-NEXT: ucomiss %xmm1, %xmm0
|
|
|
|
; SSE-NEXT: cmovaeq %rcx, %rdx
|
2017-04-26 15:08:44 +08:00
|
|
|
; SSE-NEXT: movq %rdx, %xmm2
|
2015-07-19 00:53:51 +08:00
|
|
|
; SSE-NEXT: movaps %xmm0, %xmm3
|
|
|
|
; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,2,3]
|
|
|
|
; SSE-NEXT: movaps %xmm3, %xmm4
|
|
|
|
; SSE-NEXT: subss %xmm1, %xmm4
|
|
|
|
; SSE-NEXT: cvttss2si %xmm4, %rcx
|
|
|
|
; SSE-NEXT: xorq %rax, %rcx
|
|
|
|
; SSE-NEXT: cvttss2si %xmm3, %rdx
|
|
|
|
; SSE-NEXT: ucomiss %xmm1, %xmm3
|
|
|
|
; SSE-NEXT: cmovaeq %rcx, %rdx
|
2017-04-26 15:08:44 +08:00
|
|
|
; SSE-NEXT: movq %rdx, %xmm3
|
2015-07-19 00:53:51 +08:00
|
|
|
; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
|
|
|
|
; SSE-NEXT: movaps %xmm0, %xmm3
|
|
|
|
; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1,2,3]
|
|
|
|
; SSE-NEXT: movaps %xmm3, %xmm4
|
|
|
|
; SSE-NEXT: subss %xmm1, %xmm4
|
|
|
|
; SSE-NEXT: cvttss2si %xmm4, %rcx
|
|
|
|
; SSE-NEXT: xorq %rax, %rcx
|
|
|
|
; SSE-NEXT: cvttss2si %xmm3, %rdx
|
2016-08-23 22:37:35 +08:00
|
|
|
; SSE-NEXT: ucomiss %xmm1, %xmm3
|
|
|
|
; SSE-NEXT: cmovaeq %rcx, %rdx
|
2017-04-26 15:08:44 +08:00
|
|
|
; SSE-NEXT: movq %rdx, %xmm3
|
2016-08-23 22:37:35 +08:00
|
|
|
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
|
|
|
|
; SSE-NEXT: movaps %xmm0, %xmm4
|
|
|
|
; SSE-NEXT: subss %xmm1, %xmm4
|
|
|
|
; SSE-NEXT: cvttss2si %xmm4, %rcx
|
|
|
|
; SSE-NEXT: xorq %rax, %rcx
|
2015-07-19 00:53:51 +08:00
|
|
|
; SSE-NEXT: cvttss2si %xmm0, %rax
|
|
|
|
; SSE-NEXT: ucomiss %xmm1, %xmm0
|
|
|
|
; SSE-NEXT: cmovaeq %rcx, %rax
|
2017-04-26 15:08:44 +08:00
|
|
|
; SSE-NEXT: movq %rax, %xmm1
|
2015-07-19 00:53:51 +08:00
|
|
|
; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
|
|
|
|
; SSE-NEXT: movdqa %xmm2, %xmm0
|
|
|
|
; SSE-NEXT: retq
|
2015-05-02 19:18:47 +08:00
|
|
|
;
|
2016-08-23 22:37:35 +08:00
|
|
|
; AVX1-LABEL: fptoui_4f32_to_4i64:
|
|
|
|
; AVX1: # BB#0:
|
|
|
|
; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3]
|
|
|
|
; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
|
|
|
|
; AVX1-NEXT: vsubss %xmm1, %xmm2, %xmm3
|
|
|
|
; AVX1-NEXT: vcvttss2si %xmm3, %rax
|
|
|
|
; AVX1-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
|
|
|
|
; AVX1-NEXT: xorq %rcx, %rax
|
|
|
|
; AVX1-NEXT: vcvttss2si %xmm2, %rdx
|
|
|
|
; AVX1-NEXT: vucomiss %xmm1, %xmm2
|
|
|
|
; AVX1-NEXT: cmovaeq %rax, %rdx
|
|
|
|
; AVX1-NEXT: vmovq %rdx, %xmm2
|
|
|
|
; AVX1-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
|
|
|
|
; AVX1-NEXT: vsubss %xmm1, %xmm3, %xmm4
|
|
|
|
; AVX1-NEXT: vcvttss2si %xmm4, %rax
|
|
|
|
; AVX1-NEXT: xorq %rcx, %rax
|
|
|
|
; AVX1-NEXT: vcvttss2si %xmm3, %rdx
|
|
|
|
; AVX1-NEXT: vucomiss %xmm1, %xmm3
|
|
|
|
; AVX1-NEXT: cmovaeq %rax, %rdx
|
|
|
|
; AVX1-NEXT: vmovq %rdx, %xmm3
|
|
|
|
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
|
|
|
|
; AVX1-NEXT: vsubss %xmm1, %xmm0, %xmm3
|
|
|
|
; AVX1-NEXT: vcvttss2si %xmm3, %rax
|
|
|
|
; AVX1-NEXT: xorq %rcx, %rax
|
|
|
|
; AVX1-NEXT: vcvttss2si %xmm0, %rdx
|
|
|
|
; AVX1-NEXT: vucomiss %xmm1, %xmm0
|
|
|
|
; AVX1-NEXT: cmovaeq %rax, %rdx
|
|
|
|
; AVX1-NEXT: vmovq %rdx, %xmm3
|
|
|
|
; AVX1-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
|
|
|
|
; AVX1-NEXT: vsubss %xmm1, %xmm0, %xmm4
|
|
|
|
; AVX1-NEXT: vcvttss2si %xmm4, %rax
|
|
|
|
; AVX1-NEXT: xorq %rcx, %rax
|
|
|
|
; AVX1-NEXT: vcvttss2si %xmm0, %rcx
|
|
|
|
; AVX1-NEXT: vucomiss %xmm1, %xmm0
|
|
|
|
; AVX1-NEXT: cmovaeq %rax, %rcx
|
|
|
|
; AVX1-NEXT: vmovq %rcx, %xmm0
|
|
|
|
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
; AVX2-LABEL: fptoui_4f32_to_4i64:
; AVX2: # BB#0:
; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3]
; AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX2-NEXT: vsubss %xmm1, %xmm2, %xmm3
; AVX2-NEXT: vcvttss2si %xmm3, %rax
; AVX2-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
; AVX2-NEXT: xorq %rcx, %rax
; AVX2-NEXT: vcvttss2si %xmm2, %rdx
; AVX2-NEXT: vucomiss %xmm1, %xmm2
; AVX2-NEXT: cmovaeq %rax, %rdx
; AVX2-NEXT: vmovq %rdx, %xmm2
; AVX2-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX2-NEXT: vsubss %xmm1, %xmm3, %xmm4
; AVX2-NEXT: vcvttss2si %xmm4, %rax
; AVX2-NEXT: xorq %rcx, %rax
; AVX2-NEXT: vcvttss2si %xmm3, %rdx
; AVX2-NEXT: vucomiss %xmm1, %xmm3
; AVX2-NEXT: cmovaeq %rax, %rdx
; AVX2-NEXT: vmovq %rdx, %xmm3
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX2-NEXT: vsubss %xmm1, %xmm0, %xmm3
; AVX2-NEXT: vcvttss2si %xmm3, %rax
; AVX2-NEXT: xorq %rcx, %rax
; AVX2-NEXT: vcvttss2si %xmm0, %rdx
; AVX2-NEXT: vucomiss %xmm1, %xmm0
; AVX2-NEXT: cmovaeq %rax, %rdx
; AVX2-NEXT: vmovq %rdx, %xmm3
; AVX2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX2-NEXT: vsubss %xmm1, %xmm0, %xmm4
; AVX2-NEXT: vcvttss2si %xmm4, %rax
; AVX2-NEXT: xorq %rcx, %rax
; AVX2-NEXT: vcvttss2si %xmm0, %rcx
; AVX2-NEXT: vucomiss %xmm1, %xmm0
; AVX2-NEXT: cmovaeq %rax, %rcx
; AVX2-NEXT: vmovq %rcx, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: fptoui_4f32_to_4i64:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
; AVX512F-NEXT: vcvttss2usi %xmm1, %rax
; AVX512F-NEXT: vmovq %rax, %xmm1
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512F-NEXT: vcvttss2usi %xmm2, %rax
; AVX512F-NEXT: vmovq %rax, %xmm2
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512F-NEXT: vcvttss2usi %xmm0, %rax
; AVX512F-NEXT: vmovq %rax, %xmm2
; AVX512F-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX512F-NEXT: vcvttss2usi %xmm0, %rax
; AVX512F-NEXT: vmovq %rax, %xmm0
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fptoui_4f32_to_4i64:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
; AVX512VL-NEXT: vcvttss2usi %xmm1, %rax
; AVX512VL-NEXT: vmovq %rax, %xmm1
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512VL-NEXT: vcvttss2usi %xmm2, %rax
; AVX512VL-NEXT: vmovq %rax, %xmm2
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512VL-NEXT: vcvttss2usi %xmm0, %rax
; AVX512VL-NEXT: vmovq %rax, %xmm2
; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX512VL-NEXT: vcvttss2usi %xmm0, %rax
; AVX512VL-NEXT: vmovq %rax, %xmm0
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: fptoui_4f32_to_4i64:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: vcvttps2uqq %ymm0, %zmm0
; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptoui_4f32_to_4i64:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: vcvttps2uqq %xmm0, %ymm0
; AVX512VLDQ-NEXT: retq
  %shuf = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %cvt = fptoui <4 x float> %shuf to <4 x i64>
  ret <4 x i64> %cvt
}
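
; NOTE: Pre-AVX512 targets have no unsigned vector conversion, so each lane
; above is converted with a branchless range split. A minimal C sketch of the
; identity (illustration only, assuming <stdint.h>; not part of the checks):
;
;   uint64_t fptoui64(float x) {             // defined for x in [0, 2^64)
;     if (x < 0x1p63f)                       // ucomiss + cmovaeq pick the path
;       return (uint64_t)(int64_t)x;         // cvttss2si is exact below 2^63
;     int64_t lo = (int64_t)(x - 0x1p63f);   // subss biases into signed range
;     return (uint64_t)lo ^ 0x8000000000000000ULL; // xorq restores bit 63
;   }
;
; e.g. x = 1.5 * 2^63: cvttss2si(x - 2^63) = 2^62, and
; 2^62 ^ 0x8000000000000000 = 0xC000000000000000 = 1.5 * 2^63.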

define <4 x i64> @fptoui_8f32_to_4i64(<8 x float> %a) {
; SSE-LABEL: fptoui_8f32_to_4i64:
; SSE: # BB#0:
; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE-NEXT: movaps %xmm0, %xmm2
; SSE-NEXT: subss %xmm1, %xmm2
; SSE-NEXT: cvttss2si %xmm2, %rcx
; SSE-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
; SSE-NEXT: xorq %rax, %rcx
; SSE-NEXT: cvttss2si %xmm0, %rdx
; SSE-NEXT: ucomiss %xmm1, %xmm0
; SSE-NEXT: cmovaeq %rcx, %rdx
; SSE-NEXT: movq %rdx, %xmm2
; SSE-NEXT: movaps %xmm0, %xmm3
; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,2,3]
; SSE-NEXT: movaps %xmm3, %xmm4
; SSE-NEXT: subss %xmm1, %xmm4
; SSE-NEXT: cvttss2si %xmm4, %rcx
; SSE-NEXT: xorq %rax, %rcx
; SSE-NEXT: cvttss2si %xmm3, %rdx
; SSE-NEXT: ucomiss %xmm1, %xmm3
; SSE-NEXT: cmovaeq %rcx, %rdx
; SSE-NEXT: movq %rdx, %xmm3
; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSE-NEXT: movaps %xmm0, %xmm3
; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1,2,3]
; SSE-NEXT: movaps %xmm3, %xmm4
; SSE-NEXT: subss %xmm1, %xmm4
; SSE-NEXT: cvttss2si %xmm4, %rcx
; SSE-NEXT: xorq %rax, %rcx
; SSE-NEXT: cvttss2si %xmm3, %rdx
; SSE-NEXT: ucomiss %xmm1, %xmm3
; SSE-NEXT: cmovaeq %rcx, %rdx
; SSE-NEXT: movq %rdx, %xmm3
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: movaps %xmm0, %xmm4
; SSE-NEXT: subss %xmm1, %xmm4
; SSE-NEXT: cvttss2si %xmm4, %rcx
; SSE-NEXT: xorq %rax, %rcx
; SSE-NEXT: cvttss2si %xmm0, %rax
; SSE-NEXT: ucomiss %xmm1, %xmm0
; SSE-NEXT: cmovaeq %rcx, %rax
; SSE-NEXT: movq %rax, %xmm1
; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; SSE-NEXT: movdqa %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: fptoui_8f32_to_4i64:
; AVX1: # BB#0:
; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3]
; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX1-NEXT: vsubss %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vcvttss2si %xmm3, %rax
; AVX1-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
; AVX1-NEXT: xorq %rcx, %rax
; AVX1-NEXT: vcvttss2si %xmm2, %rdx
; AVX1-NEXT: vucomiss %xmm1, %xmm2
; AVX1-NEXT: cmovaeq %rax, %rdx
; AVX1-NEXT: vmovq %rdx, %xmm2
; AVX1-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX1-NEXT: vsubss %xmm1, %xmm3, %xmm4
; AVX1-NEXT: vcvttss2si %xmm4, %rax
; AVX1-NEXT: xorq %rcx, %rax
; AVX1-NEXT: vcvttss2si %xmm3, %rdx
; AVX1-NEXT: vucomiss %xmm1, %xmm3
; AVX1-NEXT: cmovaeq %rax, %rdx
; AVX1-NEXT: vmovq %rdx, %xmm3
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX1-NEXT: vsubss %xmm1, %xmm0, %xmm3
; AVX1-NEXT: vcvttss2si %xmm3, %rax
; AVX1-NEXT: xorq %rcx, %rax
; AVX1-NEXT: vcvttss2si %xmm0, %rdx
; AVX1-NEXT: vucomiss %xmm1, %xmm0
; AVX1-NEXT: cmovaeq %rax, %rdx
; AVX1-NEXT: vmovq %rdx, %xmm3
; AVX1-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX1-NEXT: vsubss %xmm1, %xmm0, %xmm4
; AVX1-NEXT: vcvttss2si %xmm4, %rax
; AVX1-NEXT: xorq %rcx, %rax
; AVX1-NEXT: vcvttss2si %xmm0, %rcx
; AVX1-NEXT: vucomiss %xmm1, %xmm0
; AVX1-NEXT: cmovaeq %rax, %rcx
; AVX1-NEXT: vmovq %rcx, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: fptoui_8f32_to_4i64:
; AVX2: # BB#0:
; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3]
; AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX2-NEXT: vsubss %xmm1, %xmm2, %xmm3
; AVX2-NEXT: vcvttss2si %xmm3, %rax
; AVX2-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
; AVX2-NEXT: xorq %rcx, %rax
; AVX2-NEXT: vcvttss2si %xmm2, %rdx
; AVX2-NEXT: vucomiss %xmm1, %xmm2
; AVX2-NEXT: cmovaeq %rax, %rdx
; AVX2-NEXT: vmovq %rdx, %xmm2
; AVX2-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX2-NEXT: vsubss %xmm1, %xmm3, %xmm4
; AVX2-NEXT: vcvttss2si %xmm4, %rax
; AVX2-NEXT: xorq %rcx, %rax
; AVX2-NEXT: vcvttss2si %xmm3, %rdx
; AVX2-NEXT: vucomiss %xmm1, %xmm3
; AVX2-NEXT: cmovaeq %rax, %rdx
; AVX2-NEXT: vmovq %rdx, %xmm3
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX2-NEXT: vsubss %xmm1, %xmm0, %xmm3
; AVX2-NEXT: vcvttss2si %xmm3, %rax
; AVX2-NEXT: xorq %rcx, %rax
; AVX2-NEXT: vcvttss2si %xmm0, %rdx
; AVX2-NEXT: vucomiss %xmm1, %xmm0
; AVX2-NEXT: cmovaeq %rax, %rdx
; AVX2-NEXT: vmovq %rdx, %xmm3
; AVX2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX2-NEXT: vsubss %xmm1, %xmm0, %xmm4
; AVX2-NEXT: vcvttss2si %xmm4, %rax
; AVX2-NEXT: xorq %rcx, %rax
; AVX2-NEXT: vcvttss2si %xmm0, %rcx
; AVX2-NEXT: vucomiss %xmm1, %xmm0
; AVX2-NEXT: cmovaeq %rax, %rcx
; AVX2-NEXT: vmovq %rcx, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: fptoui_8f32_to_4i64:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512F-NEXT: vcvttss2usi %xmm1, %rax
; AVX512F-NEXT: vcvttss2usi %xmm0, %rcx
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512F-NEXT: vcvttss2usi %xmm1, %rdx
; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512F-NEXT: vcvttss2usi %xmm0, %rsi
; AVX512F-NEXT: vmovq %rsi, %xmm0
; AVX512F-NEXT: vmovq %rdx, %xmm1
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512F-NEXT: vmovq %rcx, %xmm1
; AVX512F-NEXT: vmovq %rax, %xmm2
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fptoui_8f32_to_4i64:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512VL-NEXT: vcvttss2usi %xmm1, %rax
; AVX512VL-NEXT: vcvttss2usi %xmm0, %rcx
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512VL-NEXT: vcvttss2usi %xmm1, %rdx
; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512VL-NEXT: vcvttss2usi %xmm0, %rsi
; AVX512VL-NEXT: vmovq %rsi, %xmm0
; AVX512VL-NEXT: vmovq %rdx, %xmm1
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512VL-NEXT: vmovq %rcx, %xmm1
; AVX512VL-NEXT: vmovq %rax, %xmm2
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: fptoui_8f32_to_4i64:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: vcvttps2uqq %ymm0, %zmm0
; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptoui_8f32_to_4i64:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: vcvttps2uqq %ymm0, %zmm0
; AVX512VLDQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512VLDQ-NEXT: retq
  %cvt = fptoui <8 x float> %a to <8 x i64>
  %shuf = shufflevector <8 x i64> %cvt, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i64> %shuf
}
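
; NOTE: With AVX512DQ the whole fptoui collapses into a single vcvttps2uqq;
; per lane it is simply out[i] = (uint64_t)in[i] (C truncation toward zero).
; Without AVX512VL the instruction only exists at 512-bit width, so the
; operand is implicitly widened and the "# kill" annotation records that only
; the ymm half of zmm0 stays live afterwards; the VL runs use the narrower
; encoding directly where it exists.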

;
; Constant Folding
;
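;
; The folded results below are printed as unsigned decimals, so negative
; values appear in two's complement. Worked examples:
;   i64: fptosi -1.0 -> 2^64 - 1 = 18446744073709551615
;        fptosi -3.0 -> 2^64 - 3 = 18446744073709551613
;   i32: fptosi -1.0 -> 4294967295, -2.0 -> 4294967294, -8.0 -> 4294967288
;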

define <2 x i64> @fptosi_2f64_to_2i64_const() {
; SSE-LABEL: fptosi_2f64_to_2i64_const:
; SSE: # BB#0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,18446744073709551615]
; SSE-NEXT: retq
;
; AVX-LABEL: fptosi_2f64_to_2i64_const:
; AVX: # BB#0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,18446744073709551615]
; AVX-NEXT: retq
  %cvt = fptosi <2 x double> <double 1.0, double -1.0> to <2 x i64>
  ret <2 x i64> %cvt
}

define <4 x i32> @fptosi_2f64_to_2i32_const() {
; SSE-LABEL: fptosi_2f64_to_2i32_const:
; SSE: # BB#0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = <4294967295,1,u,u>
; SSE-NEXT: retq
;
; AVX-LABEL: fptosi_2f64_to_2i32_const:
; AVX: # BB#0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = <4294967295,1,u,u>
; AVX-NEXT: retq
  %cvt = fptosi <2 x double> <double -1.0, double 1.0> to <2 x i32>
  %ext = shufflevector <2 x i32> %cvt, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  ret <4 x i32> %ext
}

define <4 x i64> @fptosi_4f64_to_4i64_const() {
; SSE-LABEL: fptosi_4f64_to_4i64_const:
; SSE: # BB#0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,18446744073709551615]
; SSE-NEXT: movaps {{.*#+}} xmm1 = [2,18446744073709551613]
; SSE-NEXT: retq
;
; AVX-LABEL: fptosi_4f64_to_4i64_const:
; AVX: # BB#0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,18446744073709551615,2,18446744073709551613]
; AVX-NEXT: retq
  %cvt = fptosi <4 x double> <double 1.0, double -1.0, double 2.0, double -3.0> to <4 x i64>
  ret <4 x i64> %cvt
}

define <4 x i32> @fptosi_4f64_to_4i32_const() {
; SSE-LABEL: fptosi_4f64_to_4i32_const:
; SSE: # BB#0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967295,1,4294967294,3]
; SSE-NEXT: retq
;
; AVX-LABEL: fptosi_4f64_to_4i32_const:
; AVX: # BB#0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,1,4294967294,3]
; AVX-NEXT: retq
  %cvt = fptosi <4 x double> <double -1.0, double 1.0, double -2.0, double 3.0> to <4 x i32>
  ret <4 x i32> %cvt
}

define <2 x i64> @fptoui_2f64_to_2i64_const() {
; SSE-LABEL: fptoui_2f64_to_2i64_const:
; SSE: # BB#0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [2,4]
; SSE-NEXT: retq
;
; AVX-LABEL: fptoui_2f64_to_2i64_const:
; AVX: # BB#0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [2,4]
; AVX-NEXT: retq
  %cvt = fptoui <2 x double> <double 2.0, double 4.0> to <2 x i64>
  ret <2 x i64> %cvt
}

define <4 x i32> @fptoui_2f64_to_2i32_const(<2 x double> %a) {
; SSE-LABEL: fptoui_2f64_to_2i32_const:
; SSE: # BB#0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = <2,4,u,u>
; SSE-NEXT: retq
;
; AVX-LABEL: fptoui_2f64_to_2i32_const:
; AVX: # BB#0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = <2,4,u,u>
; AVX-NEXT: retq
  %cvt = fptoui <2 x double> <double 2.0, double 4.0> to <2 x i32>
  %ext = shufflevector <2 x i32> %cvt, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  ret <4 x i32> %ext
}

define <4 x i64> @fptoui_4f64_to_4i64_const(<4 x double> %a) {
; SSE-LABEL: fptoui_4f64_to_4i64_const:
; SSE: # BB#0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [2,4]
; SSE-NEXT: movaps {{.*#+}} xmm1 = [6,8]
; SSE-NEXT: retq
;
; AVX-LABEL: fptoui_4f64_to_4i64_const:
; AVX: # BB#0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [2,4,6,8]
; AVX-NEXT: retq
  %cvt = fptoui <4 x double> <double 2.0, double 4.0, double 6.0, double 8.0> to <4 x i64>
  ret <4 x i64> %cvt
}

define <4 x i32> @fptoui_4f64_to_4i32_const(<4 x double> %a) {
; SSE-LABEL: fptoui_4f64_to_4i32_const:
; SSE: # BB#0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [2,4,6,8]
; SSE-NEXT: retq
;
; AVX-LABEL: fptoui_4f64_to_4i32_const:
; AVX: # BB#0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [2,4,6,8]
; AVX-NEXT: retq
  %cvt = fptoui <4 x double> <double 2.0, double 4.0, double 6.0, double 8.0> to <4 x i32>
  ret <4 x i32> %cvt
}

define <4 x i32> @fptosi_4f32_to_4i32_const() {
; SSE-LABEL: fptosi_4f32_to_4i32_const:
; SSE: # BB#0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,4294967295,2,3]
; SSE-NEXT: retq
;
; AVX-LABEL: fptosi_4f32_to_4i32_const:
; AVX: # BB#0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,4294967295,2,3]
; AVX-NEXT: retq
  %cvt = fptosi <4 x float> <float 1.0, float -1.0, float 2.0, float 3.0> to <4 x i32>
  ret <4 x i32> %cvt
}

define <4 x i64> @fptosi_4f32_to_4i64_const() {
; SSE-LABEL: fptosi_4f32_to_4i64_const:
; SSE: # BB#0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,18446744073709551615]
; SSE-NEXT: movaps {{.*#+}} xmm1 = [2,3]
; SSE-NEXT: retq
;
; AVX-LABEL: fptosi_4f32_to_4i64_const:
; AVX: # BB#0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,18446744073709551615,2,3]
; AVX-NEXT: retq
  %cvt = fptosi <4 x float> <float 1.0, float -1.0, float 2.0, float 3.0> to <4 x i64>
  ret <4 x i64> %cvt
}

define <8 x i32> @fptosi_8f32_to_8i32_const(<8 x float> %a) {
; SSE-LABEL: fptosi_8f32_to_8i32_const:
; SSE: # BB#0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,4294967295,2,3]
; SSE-NEXT: movaps {{.*#+}} xmm1 = [6,4294967288,2,4294967295]
; SSE-NEXT: retq
;
; AVX-LABEL: fptosi_8f32_to_8i32_const:
; AVX: # BB#0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,4294967295,2,3,6,4294967288,2,4294967295]
; AVX-NEXT: retq
  %cvt = fptosi <8 x float> <float 1.0, float -1.0, float 2.0, float 3.0, float 6.0, float -8.0, float 2.0, float -1.0> to <8 x i32>
  ret <8 x i32> %cvt
}

define <4 x i32> @fptoui_4f32_to_4i32_const(<4 x float> %a) {
; SSE-LABEL: fptoui_4f32_to_4i32_const:
; SSE: # BB#0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,2,4,6]
; SSE-NEXT: retq
;
; AVX-LABEL: fptoui_4f32_to_4i32_const:
; AVX: # BB#0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,2,4,6]
; AVX-NEXT: retq
  %cvt = fptoui <4 x float> <float 1.0, float 2.0, float 4.0, float 6.0> to <4 x i32>
  ret <4 x i32> %cvt
}

define <4 x i64> @fptoui_4f32_to_4i64_const() {
; SSE-LABEL: fptoui_4f32_to_4i64_const:
; SSE: # BB#0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,2]
; SSE-NEXT: movaps {{.*#+}} xmm1 = [4,8]
; SSE-NEXT: retq
;
; AVX-LABEL: fptoui_4f32_to_4i64_const:
; AVX: # BB#0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,2,4,8]
; AVX-NEXT: retq
  %cvt = fptoui <4 x float> <float 1.0, float 2.0, float 4.0, float 8.0> to <4 x i64>
  ret <4 x i64> %cvt
}

define <8 x i32> @fptoui_8f32_to_8i32_const(<8 x float> %a) {
; SSE-LABEL: fptoui_8f32_to_8i32_const:
; SSE: # BB#0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,2,4,6]
; SSE-NEXT: movaps {{.*#+}} xmm1 = [8,6,4,1]
; SSE-NEXT: retq
;
; AVX-LABEL: fptoui_8f32_to_8i32_const:
; AVX: # BB#0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,2,4,6,8,6,4,1]
; AVX-NEXT: retq
  %cvt = fptoui <8 x float> <float 1.0, float 2.0, float 4.0, float 6.0, float 8.0, float 6.0, float 4.0, float 1.0> to <8 x i32>
  ret <8 x i32> %cvt
}

;
; Special Cases
;

define <4 x i32> @fptosi_2f16_to_4i32(<2 x half> %a) nounwind {
; SSE-LABEL: fptosi_2f16_to_4i32:
; SSE: # BB#0:
; SSE-NEXT: pushq %rax
; SSE-NEXT: movss %xmm1, {{[0-9]+}}(%rsp) # 4-byte Spill
; SSE-NEXT: callq __gnu_f2h_ieee
; SSE-NEXT: movzwl %ax, %edi
; SSE-NEXT: callq __gnu_h2f_ieee
; SSE-NEXT: movss %xmm0, (%rsp) # 4-byte Spill
; SSE-NEXT: movss {{[0-9]+}}(%rsp), %xmm0 # 4-byte Reload
; SSE-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE-NEXT: callq __gnu_f2h_ieee
; SSE-NEXT: movzwl %ax, %edi
; SSE-NEXT: callq __gnu_h2f_ieee
; SSE-NEXT: cvttss2si %xmm0, %rax
; SSE-NEXT: movq %rax, %xmm1
; SSE-NEXT: cvttss2si (%rsp), %rax # 4-byte Folded Reload
; SSE-NEXT: movq %rax, %xmm0
; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
; SSE-NEXT: popq %rax
; SSE-NEXT: retq
;
; VEX-LABEL: fptosi_2f16_to_4i32:
; VEX: # BB#0:
; VEX-NEXT: pushq %rax
; VEX-NEXT: vmovss %xmm1, {{[0-9]+}}(%rsp) # 4-byte Spill
; VEX-NEXT: callq __gnu_f2h_ieee
; VEX-NEXT: movzwl %ax, %edi
; VEX-NEXT: callq __gnu_h2f_ieee
; VEX-NEXT: vmovss %xmm0, (%rsp) # 4-byte Spill
; VEX-NEXT: vmovss {{[0-9]+}}(%rsp), %xmm0 # 4-byte Reload
; VEX-NEXT: # xmm0 = mem[0],zero,zero,zero
; VEX-NEXT: callq __gnu_f2h_ieee
; VEX-NEXT: movzwl %ax, %edi
; VEX-NEXT: callq __gnu_h2f_ieee
; VEX-NEXT: vcvttss2si %xmm0, %rax
; VEX-NEXT: vmovq %rax, %xmm0
; VEX-NEXT: vcvttss2si (%rsp), %rax # 4-byte Folded Reload
; VEX-NEXT: vmovq %rax, %xmm1
; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; VEX-NEXT: popq %rax
; VEX-NEXT: retq
;
; AVX512F-LABEL: fptosi_2f16_to_4i32:
; AVX512F: # BB#0:
; AVX512F-NEXT: # kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
; AVX512F-NEXT: vcvttss2si %xmm1, %rax
; AVX512F-NEXT: vmovq %rax, %xmm1
; AVX512F-NEXT: vcvttss2si %xmm0, %rax
; AVX512F-NEXT: vmovq %rax, %xmm0
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fptosi_2f16_to_4i32:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512VL-NEXT: vcvttss2si %xmm1, %rax
; AVX512VL-NEXT: vmovq %rax, %xmm1
; AVX512VL-NEXT: vcvttss2si %xmm0, %rax
; AVX512VL-NEXT: vmovq %rax, %xmm0
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: fptosi_2f16_to_4i32:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: # kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvtps2ph $4, %zmm0, %ymm0
; AVX512DQ-NEXT: vcvtph2ps %ymm0, %zmm0
; AVX512DQ-NEXT: vcvtps2ph $4, %zmm1, %ymm1
; AVX512DQ-NEXT: vcvtph2ps %ymm1, %zmm1
; AVX512DQ-NEXT: vcvttss2si %xmm1, %rax
; AVX512DQ-NEXT: vmovq %rax, %xmm1
; AVX512DQ-NEXT: vcvttss2si %xmm0, %rax
; AVX512DQ-NEXT: vmovq %rax, %xmm0
; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512DQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptosi_2f16_to_4i32:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX512VLDQ-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512VLDQ-NEXT: vcvtps2ph $4, %xmm1, %xmm1
; AVX512VLDQ-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512VLDQ-NEXT: vcvttss2si %xmm1, %rax
; AVX512VLDQ-NEXT: vmovq %rax, %xmm1
; AVX512VLDQ-NEXT: vcvttss2si %xmm0, %rax
; AVX512VLDQ-NEXT: vmovq %rax, %xmm0
; AVX512VLDQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VLDQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; AVX512VLDQ-NEXT: retq
  %cvt = fptosi <2 x half> %a to <2 x i32>
  %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i32> %ext
}
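
; NOTE: Lacking native f16 arithmetic, the SSE/VEX paths round each argument
; through the compiler-rt helpers (__gnu_f2h_ieee truncates to half,
; __gnu_h2f_ieee extends back) before the usual cvttss2si; the AVX512 paths
; use vcvtps2ph/vcvtph2ps instead, and the non-VL forms work on zmm, hence
; the trailing vzeroupper. Per lane this is roughly (C sketch, helper
; signatures as in compiler-rt; illustration only):
;   float h = __gnu_h2f_ieee(__gnu_f2h_ieee(x)); // round-trip through half
;   int32_t lane = (int32_t)h;                   // then ordinary truncation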

define <4 x i32> @fptosi_2f80_to_4i32(<2 x x86_fp80> %a) nounwind {
; SSE-LABEL: fptosi_2f80_to_4i32:
; SSE: # BB#0:
; SSE-NEXT: fldt {{[0-9]+}}(%rsp)
; SSE-NEXT: fldt {{[0-9]+}}(%rsp)
; SSE-NEXT: fnstcw -{{[0-9]+}}(%rsp)
; SSE-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
; SSE-NEXT: movw $3199, -{{[0-9]+}}(%rsp) # imm = 0xC7F
; SSE-NEXT: fldcw -{{[0-9]+}}(%rsp)
; SSE-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
; SSE-NEXT: fistpll -{{[0-9]+}}(%rsp)
; SSE-NEXT: fldcw -{{[0-9]+}}(%rsp)
; SSE-NEXT: fnstcw -{{[0-9]+}}(%rsp)
; SSE-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
; SSE-NEXT: movw $3199, -{{[0-9]+}}(%rsp) # imm = 0xC7F
; SSE-NEXT: fldcw -{{[0-9]+}}(%rsp)
; SSE-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
; SSE-NEXT: fistpll -{{[0-9]+}}(%rsp)
; SSE-NEXT: fldcw -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
; SSE-NEXT: retq
;
; AVX-LABEL: fptosi_2f80_to_4i32:
; AVX: # BB#0:
; AVX-NEXT: fldt {{[0-9]+}}(%rsp)
; AVX-NEXT: fldt {{[0-9]+}}(%rsp)
; AVX-NEXT: fisttpll -{{[0-9]+}}(%rsp)
; AVX-NEXT: fisttpll -{{[0-9]+}}(%rsp)
; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; AVX-NEXT: retq
  %cvt = fptosi <2 x x86_fp80> %a to <2 x i32>
  %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i32> %ext
}
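
; NOTE: The SSE path must emulate truncation on x87: fistpll rounds using the
; current control word, so the word is saved, forced to round-to-zero (0xC7F
; sets RC=11b), and restored around each store. The AVX run lines imply SSE3,
; whose fisttpll always truncates. Pseudo-C for the SSE dance (hypothetical
; wrappers around the x87 instructions, illustration only):
;   uint16_t cw = fnstcw();   // save control word
;   fldcw(0x0C7F);            // RC = 11b: round toward zero
;   int64_t v = fistpll();    // store-and-pop now truncates
;   fldcw(cw);                // restore caller's rounding mode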

define <4 x i32> @fptosi_2f128_to_4i32(<2 x fp128> %a) nounwind {
; SSE-LABEL: fptosi_2f128_to_4i32:
; SSE: # BB#0:
; SSE-NEXT: pushq %r14
; SSE-NEXT: pushq %rbx
; SSE-NEXT: subq $24, %rsp
; SSE-NEXT: movq %rsi, %r14
; SSE-NEXT: movq %rdi, %rbx
; SSE-NEXT: movq %rdx, %rdi
; SSE-NEXT: movq %rcx, %rsi
; SSE-NEXT: callq __fixtfdi
; SSE-NEXT: movq %rax, %xmm0
; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
; SSE-NEXT: movq %rbx, %rdi
; SSE-NEXT: movq %r14, %rsi
; SSE-NEXT: callq __fixtfdi
; SSE-NEXT: movq %rax, %xmm0
; SSE-NEXT: punpcklqdq (%rsp), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: # xmm0 = xmm0[0],mem[0]
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
; SSE-NEXT: addq $24, %rsp
; SSE-NEXT: popq %rbx
; SSE-NEXT: popq %r14
; SSE-NEXT: retq
;
; AVX-LABEL: fptosi_2f128_to_4i32:
; AVX: # BB#0:
; AVX-NEXT: pushq %r14
; AVX-NEXT: pushq %rbx
; AVX-NEXT: subq $24, %rsp
; AVX-NEXT: movq %rsi, %r14
; AVX-NEXT: movq %rdi, %rbx
; AVX-NEXT: movq %rdx, %rdi
; AVX-NEXT: movq %rcx, %rsi
; AVX-NEXT: callq __fixtfdi
; AVX-NEXT: vmovq %rax, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: movq %rbx, %rdi
; AVX-NEXT: movq %r14, %rsi
; AVX-NEXT: callq __fixtfdi
; AVX-NEXT: vmovq %rax, %xmm0
; AVX-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; AVX-NEXT: addq $24, %rsp
; AVX-NEXT: popq %rbx
; AVX-NEXT: popq %r14
; AVX-NEXT: retq
  %cvt = fptosi <2 x fp128> %a to <2 x i32>
  %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i32> %ext
}
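
; NOTE: fp128 has no hardware conversion, so each element is lowered to a
; libcall; the i32 result is the low half of the returned i64, extracted by
; the final shufps/vinsertps. Roughly, per element (assuming compiler-rt's
; usual prototype for the helper):
;   extern int64_t __fixtfdi(__float128 x);  // fp128 -> i64, truncating
;   int32_t lane = (int32_t)__fixtfdi(x);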