From cb96142fb82c00dd705e44d83af7c54118592bb4 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 23 Aug 2016 14:37:35 +0000 Subject: [PATCH] [X86][AVX] Add AVX2/AVX512 fp to int conversion tests llvm-svn: 279518 --- llvm/test/CodeGen/X86/vec_fp_to_int.ll | 1275 +++++++++++++++++------- 1 file changed, 924 insertions(+), 351 deletions(-) diff --git a/llvm/test/CodeGen/X86/vec_fp_to_int.ll b/llvm/test/CodeGen/X86/vec_fp_to_int.ll index 5e32f2c89c73..8d1567e38130 100644 --- a/llvm/test/CodeGen/X86/vec_fp_to_int.ll +++ b/llvm/test/CodeGen/X86/vec_fp_to_int.ll @@ -1,5 +1,8 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ ; ; 32-bit tests to make sure we're not doing anything stupid. ; RUN: llc < %s -mtriple=i686-unknown-unknown @@ -12,13 +15,13 @@ define <2 x i64> @fptosi_2f64_to_2i64(<2 x double> %a) { ; SSE-LABEL: fptosi_2f64_to_2i64: -; SSE: # BB#0: -; SSE-NEXT: cvttsd2si %xmm0, %rax -; SSE-NEXT: movd %rax, %xmm1 -; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: cvttsd2si %xmm0, %rax -; SSE-NEXT: movd %rax, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE: # BB#0: +; SSE-NEXT: cvttsd2si %xmm0, %rax +; SSE-NEXT: movd %rax, %xmm1 +; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: cvttsd2si %xmm0, %rax +; SSE-NEXT: movd %rax, %xmm0 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq ; @@ -31,19 +34,34 @@ define <2 x i64> @fptosi_2f64_to_2i64(<2 x double> %a) { ; AVX-NEXT: vmovq %rax, %xmm0 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX-NEXT: retq +; +; AVX512F-LABEL: fptosi_2f64_to_2i64: +; AVX512F: # BB#0: +; AVX512F-NEXT: vcvttsd2si %xmm0, %rax +; AVX512F-NEXT: vmovq %rax, %xmm1 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512F-NEXT: vcvttsd2si %xmm0, %rax +; AVX512F-NEXT: vmovq %rax, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: fptosi_2f64_to_2i64: +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vcvttpd2qq %xmm0, %xmm0 +; AVX512DQ-NEXT: retq %cvt = fptosi <2 x double> %a to <2 x i64> ret <2 x i64> %cvt } define <4 x i32> @fptosi_2f64_to_2i32(<2 x double> %a) { ; SSE-LABEL: fptosi_2f64_to_2i32: -; SSE: # BB#0: -; SSE-NEXT: cvttsd2si %xmm0, %rax -; SSE-NEXT: movd %rax, %xmm1 -; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: cvttsd2si %xmm0, %rax -; SSE-NEXT: movd %rax, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE: # BB#0: +; SSE-NEXT: cvttsd2si %xmm0, %rax +; SSE-NEXT: movd %rax, %xmm1 +; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: cvttsd2si %xmm0, %rax +; SSE-NEXT: movd %rax, %xmm0 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] ; SSE-NEXT: retq ; @@ -57,6 +75,23 @@ define <4 x i32> @fptosi_2f64_to_2i32(<2 x double> %a) { ; AVX-NEXT: vpunpcklqdq 
{{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX-NEXT: retq
+;
+; AVX512F-LABEL: fptosi_2f64_to_2i32:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vcvttsd2si %xmm0, %rax
+; AVX512F-NEXT: vmovq %rax, %xmm1
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT: vcvttsd2si %xmm0, %rax
+; AVX512F-NEXT: vmovq %rax, %xmm0
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512F-NEXT: retq
+;
+; AVX512DQ-LABEL: fptosi_2f64_to_2i32:
+; AVX512DQ: # BB#0:
+; AVX512DQ-NEXT: vcvttpd2qq %xmm0, %xmm0
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512DQ-NEXT: retq
 %cvt = fptosi <2 x double> %a to <2 x i32>
 %ext = shufflevector <2 x i32> %cvt, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
 ret <4 x i32> %ext
@@ -64,13 +99,13 @@ define <4 x i32> @fptosi_2f64_to_2i32(<2 x double> %a) {
define <4 x i32> @fptosi_4f64_to_2i32(<2 x double> %a) {
; SSE-LABEL: fptosi_4f64_to_2i32:
-; SSE: # BB#0:
-; SSE-NEXT: cvttsd2si %xmm0, %rax
-; SSE-NEXT: movd %rax, %xmm1
-; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
-; SSE-NEXT: cvttsd2si %xmm0, %rax
-; SSE-NEXT: movd %rax, %xmm0
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE: # BB#0:
+; SSE-NEXT: cvttsd2si %xmm0, %rax
+; SSE-NEXT: movd %rax, %xmm1
+; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT: cvttsd2si %xmm0, %rax
+; SSE-NEXT: movd %rax, %xmm0
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; SSE-NEXT: cvttsd2si %xmm0, %rax
; SSE-NEXT: movd %rax, %xmm1
@@ -81,10 +116,16 @@ define <4 x i32> @fptosi_4f64_to_2i32(<2 x double> %a) {
;
; AVX-LABEL: fptosi_4f64_to_2i32:
; AVX: # BB#0:
-; AVX-NEXT: # kill
+; AVX-NEXT: # kill: %XMM0 %XMM0 %YMM0
; AVX-NEXT: vcvttpd2dqy %ymm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
+;
+; AVX512-LABEL: fptosi_4f64_to_2i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0
+; AVX512-NEXT: vcvttpd2dq %ymm0, %xmm0
+; AVX512-NEXT: retq
 %ext = shufflevector <2 x double> %a, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
 %cvt = fptosi <4 x double> %ext to <4 x i32>
 ret <4 x i32> %cvt
@@ -92,60 +133,101 @@ define <4 x i32> @fptosi_4f64_to_2i32(<2 x double> %a) {
define <4 x i64> @fptosi_4f64_to_4i64(<4 x double> %a) {
; SSE-LABEL: fptosi_4f64_to_4i64:
-; SSE: # BB#0:
-; SSE-NEXT: cvttsd2si %xmm0, %rax
-; SSE-NEXT: movd %rax, %xmm2
-; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
-; SSE-NEXT: cvttsd2si %xmm0, %rax
-; SSE-NEXT: movd %rax, %xmm0
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
-; SSE-NEXT: cvttsd2si %xmm1, %rax
-; SSE-NEXT: movd %rax, %xmm3
-; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
-; SSE-NEXT: cvttsd2si %xmm1, %rax
-; SSE-NEXT: movd %rax, %xmm0
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
+; SSE: # BB#0:
+; SSE-NEXT: cvttsd2si %xmm0, %rax
+; SSE-NEXT: movd %rax, %xmm2
+; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT: cvttsd2si %xmm0, %rax
+; SSE-NEXT: movd %rax, %xmm0
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
+; SSE-NEXT: cvttsd2si %xmm1, %rax
+; SSE-NEXT: movd %rax, %xmm3
+; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
+; SSE-NEXT: cvttsd2si %xmm1, %rax
+; SSE-NEXT: movd %rax, %xmm0
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
; SSE-NEXT: movdqa %xmm2, %xmm0
; SSE-NEXT: movdqa %xmm3, %xmm1
; SSE-NEXT: retq
;
-; AVX-LABEL: fptosi_4f64_to_4i64:
-; AVX: # BB#0:
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vcvttsd2si %xmm1, %rax
-; AVX-NEXT: vmovq %rax, %xmm2
-; AVX-NEXT: vpermilpd
{{.*#+}} xmm1 = xmm1[1,0] -; AVX-NEXT: vcvttsd2si %xmm1, %rax -; AVX-NEXT: vmovq %rax, %xmm1 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX-NEXT: vcvttsd2si %xmm0, %rax -; AVX-NEXT: vmovq %rax, %xmm2 -; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX-NEXT: vcvttsd2si %xmm0, %rax -; AVX-NEXT: vmovq %rax, %xmm0 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: fptosi_4f64_to_4i64: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vcvttsd2si %xmm1, %rax +; AVX1-NEXT: vmovq %rax, %xmm2 +; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX1-NEXT: vcvttsd2si %xmm1, %rax +; AVX1-NEXT: vmovq %rax, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: vcvttsd2si %xmm0, %rax +; AVX1-NEXT: vmovq %rax, %xmm2 +; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX1-NEXT: vcvttsd2si %xmm0, %rax +; AVX1-NEXT: vmovq %rax, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: fptosi_4f64_to_4i64: +; AVX2: # BB#0: +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vcvttsd2si %xmm1, %rax +; AVX2-NEXT: vmovq %rax, %xmm2 +; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX2-NEXT: vcvttsd2si %xmm1, %rax +; AVX2-NEXT: vmovq %rax, %xmm1 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-NEXT: vcvttsd2si %xmm0, %rax +; AVX2-NEXT: vmovq %rax, %xmm2 +; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX2-NEXT: vcvttsd2si %xmm0, %rax +; AVX2-NEXT: vmovq %rax, %xmm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: fptosi_4f64_to_4i64: +; AVX512F: # BB#0: +; AVX512F-NEXT: vextractf32x4 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vcvttsd2si %xmm1, %rax +; AVX512F-NEXT: vmovq %rax, %xmm2 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512F-NEXT: vcvttsd2si %xmm1, %rax +; AVX512F-NEXT: vmovq %rax, %xmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512F-NEXT: vcvttsd2si %xmm0, %rax +; AVX512F-NEXT: vmovq %rax, %xmm2 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512F-NEXT: vcvttsd2si %xmm0, %rax +; AVX512F-NEXT: vmovq %rax, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX512F-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: fptosi_4f64_to_4i64: +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vcvttpd2qq %ymm0, %ymm0 +; AVX512DQ-NEXT: retq %cvt = fptosi <4 x double> %a to <4 x i64> ret <4 x i64> %cvt } define <4 x i32> @fptosi_4f64_to_4i32(<4 x double> %a) { ; SSE-LABEL: fptosi_4f64_to_4i32: -; SSE: # BB#0: -; SSE-NEXT: cvttsd2si %xmm1, %rax -; SSE-NEXT: movd %rax, %xmm2 -; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] -; SSE-NEXT: cvttsd2si %xmm1, %rax -; SSE-NEXT: movd %rax, %xmm1 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; SSE-NEXT: cvttsd2si %xmm0, %rax -; SSE-NEXT: movd %rax, %xmm2 -; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: cvttsd2si %xmm0, %rax -; SSE-NEXT: movd %rax, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE: # BB#0: +; SSE-NEXT: cvttsd2si %xmm1, %rax +; SSE-NEXT: movd %rax, %xmm2 +; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] +; SSE-NEXT: cvttsd2si %xmm1, %rax +; SSE-NEXT: movd %rax, %xmm1 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] 
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; SSE-NEXT: cvttsd2si %xmm0, %rax +; SSE-NEXT: movd %rax, %xmm2 +; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: cvttsd2si %xmm0, %rax +; SSE-NEXT: movd %rax, %xmm0 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: retq @@ -155,6 +237,11 @@ define <4 x i32> @fptosi_4f64_to_4i32(<4 x double> %a) { ; AVX-NEXT: vcvttpd2dqy %ymm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq +; +; AVX512-LABEL: fptosi_4f64_to_4i32: +; AVX512: # BB#0: +; AVX512-NEXT: vcvttpd2dq %ymm0, %xmm0 +; AVX512-NEXT: retq %cvt = fptosi <4 x double> %a to <4 x i32> ret <4 x i32> %cvt } @@ -173,14 +260,14 @@ define <2 x i64> @fptoui_2f64_to_2i64(<2 x double> %a) { ; SSE-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 ; SSE-NEXT: xorq %rcx, %rax ; SSE-NEXT: cvttsd2si %xmm0, %rdx -; SSE-NEXT: ucomisd %xmm2, %xmm0 -; SSE-NEXT: cmovaeq %rax, %rdx -; SSE-NEXT: movd %rdx, %xmm1 -; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: subsd %xmm2, %xmm3 -; SSE-NEXT: cvttsd2si %xmm3, %rax -; SSE-NEXT: xorq %rcx, %rax +; SSE-NEXT: ucomisd %xmm2, %xmm0 +; SSE-NEXT: cmovaeq %rax, %rdx +; SSE-NEXT: movd %rdx, %xmm1 +; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: subsd %xmm2, %xmm3 +; SSE-NEXT: cvttsd2si %xmm3, %rax +; SSE-NEXT: xorq %rcx, %rax ; SSE-NEXT: cvttsd2si %xmm0, %rcx ; SSE-NEXT: ucomisd %xmm2, %xmm0 ; SSE-NEXT: cmovaeq %rax, %rcx @@ -210,6 +297,21 @@ define <2 x i64> @fptoui_2f64_to_2i64(<2 x double> %a) { ; AVX-NEXT: vmovq %rcx, %xmm0 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] ; AVX-NEXT: retq +; +; AVX512F-LABEL: fptoui_2f64_to_2i64: +; AVX512F: # BB#0: +; AVX512F-NEXT: vcvttsd2usi %xmm0, %rax +; AVX512F-NEXT: vmovq %rax, %xmm1 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512F-NEXT: vcvttsd2usi %xmm0, %rax +; AVX512F-NEXT: vmovq %rax, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: fptoui_2f64_to_2i64: +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vcvttpd2uqq %xmm0, %xmm0 +; AVX512DQ-NEXT: retq %cvt = fptoui <2 x double> %a to <2 x i64> ret <2 x i64> %cvt } @@ -224,14 +326,14 @@ define <4 x i32> @fptoui_2f64_to_2i32(<2 x double> %a) { ; SSE-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 ; SSE-NEXT: xorq %rcx, %rax ; SSE-NEXT: cvttsd2si %xmm0, %rdx -; SSE-NEXT: ucomisd %xmm1, %xmm0 -; SSE-NEXT: cmovaeq %rax, %rdx -; SSE-NEXT: movd %rdx, %xmm2 -; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: subsd %xmm1, %xmm3 -; SSE-NEXT: cvttsd2si %xmm3, %rax -; SSE-NEXT: xorq %rcx, %rax +; SSE-NEXT: ucomisd %xmm1, %xmm0 +; SSE-NEXT: cmovaeq %rax, %rdx +; SSE-NEXT: movd %rdx, %xmm2 +; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: subsd %xmm1, %xmm3 +; SSE-NEXT: cvttsd2si %xmm3, %rax +; SSE-NEXT: xorq %rcx, %rax ; SSE-NEXT: cvttsd2si %xmm0, %rcx ; SSE-NEXT: ucomisd %xmm1, %xmm0 ; SSE-NEXT: cmovaeq %rax, %rcx @@ -262,6 +364,23 @@ define <4 x i32> @fptoui_2f64_to_2i32(<2 x double> %a) { ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX-NEXT: retq +; +; AVX512F-LABEL: fptoui_2f64_to_2i32: +; AVX512F: # BB#0: +; AVX512F-NEXT: vcvttsd2usi %xmm0, %rax +; AVX512F-NEXT: vmovq %rax, %xmm1 +; AVX512F-NEXT: vpermilpd 
{{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT: vcvttsd2usi %xmm0, %rax
+; AVX512F-NEXT: vmovq %rax, %xmm0
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512F-NEXT: retq
+;
+; AVX512DQ-LABEL: fptoui_2f64_to_2i32:
+; AVX512DQ: # BB#0:
+; AVX512DQ-NEXT: vcvttpd2uqq %xmm0, %xmm0
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512DQ-NEXT: retq
 %cvt = fptoui <2 x double> %a to <2 x i32>
 %ext = shufflevector <2 x i32> %cvt, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
 ret <4 x i32> %ext
@@ -277,14 +396,14 @@ define <4 x i32> @fptoui_4f64_to_2i32(<2 x double> %a) {
; SSE-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
; SSE-NEXT: xorq %rcx, %rax
; SSE-NEXT: cvttsd2si %xmm0, %rdx
-; SSE-NEXT: ucomisd %xmm1, %xmm0
-; SSE-NEXT: cmovaeq %rax, %rdx
-; SSE-NEXT: movd %rdx, %xmm2
-; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
-; SSE-NEXT: movaps %xmm0, %xmm3
-; SSE-NEXT: subsd %xmm1, %xmm3
-; SSE-NEXT: cvttsd2si %xmm3, %rax
-; SSE-NEXT: xorq %rcx, %rax
+; SSE-NEXT: ucomisd %xmm1, %xmm0
+; SSE-NEXT: cmovaeq %rax, %rdx
+; SSE-NEXT: movd %rdx, %xmm2
+; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT: movaps %xmm0, %xmm3
+; SSE-NEXT: subsd %xmm1, %xmm3
+; SSE-NEXT: cvttsd2si %xmm3, %rax
+; SSE-NEXT: xorq %rcx, %rax
; SSE-NEXT: cvttsd2si %xmm0, %rdx
; SSE-NEXT: ucomisd %xmm1, %xmm0
; SSE-NEXT: cmovaeq %rax, %rdx
@@ -312,6 +431,12 @@ define <4 x i32> @fptoui_4f64_to_2i32(<2 x double> %a) {
; AVX-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
; AVX-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
; AVX-NEXT: retq
+;
+; AVX512-LABEL: fptoui_4f64_to_2i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0
+; AVX512-NEXT: vcvttpd2udq %ymm0, %xmm0
+; AVX512-NEXT: retq
 %ext = shufflevector <2 x double> %a, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
 %cvt = fptoui <4 x double> %ext to <4 x i32>
 ret <4 x i32> %cvt
@@ -327,14 +452,14 @@ define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) {
; SSE-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
; SSE-NEXT: xorq %rax, %rcx
; SSE-NEXT: cvttsd2si %xmm2, %rdx
-; SSE-NEXT: ucomisd %xmm3, %xmm2
-; SSE-NEXT: cmovaeq %rcx, %rdx
-; SSE-NEXT: movd %rdx, %xmm0
-; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
-; SSE-NEXT: movaps %xmm2, %xmm4
-; SSE-NEXT: subsd %xmm3, %xmm4
-; SSE-NEXT: cvttsd2si %xmm4, %rcx
-; SSE-NEXT: xorq %rax, %rcx
+; SSE-NEXT: ucomisd %xmm3, %xmm2
+; SSE-NEXT: cmovaeq %rcx, %rdx
+; SSE-NEXT: movd %rdx, %xmm0
+; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
+; SSE-NEXT: movaps %xmm2, %xmm4
+; SSE-NEXT: subsd %xmm3, %xmm4
+; SSE-NEXT: cvttsd2si %xmm4, %rcx
+; SSE-NEXT: xorq %rax, %rcx
; SSE-NEXT: cvttsd2si %xmm2, %rdx
; SSE-NEXT: ucomisd %xmm3, %xmm2
; SSE-NEXT: cmovaeq %rcx, %rdx
@@ -345,14 +470,14 @@ define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) {
; SSE-NEXT: cvttsd2si %xmm2, %rcx
; SSE-NEXT: xorq %rax, %rcx
; SSE-NEXT: cvttsd2si %xmm1, %rdx
-; SSE-NEXT: ucomisd %xmm3, %xmm1
-; SSE-NEXT: cmovaeq %rcx, %rdx
-; SSE-NEXT: movd %rdx, %xmm2
-; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
-; SSE-NEXT: movaps %xmm1, %xmm4
-; SSE-NEXT: subsd %xmm3, %xmm4
-; SSE-NEXT: cvttsd2si %xmm4, %rcx
-; SSE-NEXT: xorq %rax, %rcx
+; SSE-NEXT: ucomisd %xmm3, %xmm1
+; SSE-NEXT: cmovaeq %rcx, %rdx
+; SSE-NEXT: movd %rdx, %xmm2
+; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
+; SSE-NEXT: movaps %xmm1, %xmm4
+; SSE-NEXT: subsd %xmm3, %xmm4
+; SSE-NEXT: cvttsd2si %xmm4, %rcx
+; SSE-NEXT: xorq %rax, %rcx
; SSE-NEXT: cvttsd2si %xmm1, %rax
; SSE-NEXT: ucomisd %xmm3, %xmm1
; SSE-NEXT: cmovaeq
%rcx, %rax @@ -361,45 +486,108 @@ define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) { ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: retq ; -; AVX-LABEL: fptoui_4f64_to_4i64: -; AVX: # BB#0: -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vsubsd %xmm1, %xmm2, %xmm3 -; AVX-NEXT: vcvttsd2si %xmm3, %rax -; AVX-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; AVX-NEXT: xorq %rcx, %rax -; AVX-NEXT: vcvttsd2si %xmm2, %rdx -; AVX-NEXT: vucomisd %xmm1, %xmm2 -; AVX-NEXT: cmovaeq %rax, %rdx -; AVX-NEXT: vmovq %rdx, %xmm3 -; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] -; AVX-NEXT: vsubsd %xmm1, %xmm2, %xmm4 -; AVX-NEXT: vcvttsd2si %xmm4, %rax -; AVX-NEXT: xorq %rcx, %rax -; AVX-NEXT: vcvttsd2si %xmm2, %rdx -; AVX-NEXT: vucomisd %xmm1, %xmm2 -; AVX-NEXT: cmovaeq %rax, %rdx -; AVX-NEXT: vmovq %rdx, %xmm2 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX-NEXT: vsubsd %xmm1, %xmm0, %xmm3 -; AVX-NEXT: vcvttsd2si %xmm3, %rax -; AVX-NEXT: xorq %rcx, %rax -; AVX-NEXT: vcvttsd2si %xmm0, %rdx -; AVX-NEXT: vucomisd %xmm1, %xmm0 -; AVX-NEXT: cmovaeq %rax, %rdx -; AVX-NEXT: vmovq %rdx, %xmm3 -; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX-NEXT: vsubsd %xmm1, %xmm0, %xmm4 -; AVX-NEXT: vcvttsd2si %xmm4, %rax -; AVX-NEXT: xorq %rcx, %rax -; AVX-NEXT: vcvttsd2si %xmm0, %rcx -; AVX-NEXT: vucomisd %xmm1, %xmm0 -; AVX-NEXT: cmovaeq %rax, %rcx -; AVX-NEXT: vmovq %rcx, %xmm0 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: fptoui_4f64_to_4i64: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: vsubsd %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vcvttsd2si %xmm3, %rax +; AVX1-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 +; AVX1-NEXT: xorq %rcx, %rax +; AVX1-NEXT: vcvttsd2si %xmm2, %rdx +; AVX1-NEXT: vucomisd %xmm1, %xmm2 +; AVX1-NEXT: cmovaeq %rax, %rdx +; AVX1-NEXT: vmovq %rdx, %xmm3 +; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX1-NEXT: vsubsd %xmm1, %xmm2, %xmm4 +; AVX1-NEXT: vcvttsd2si %xmm4, %rax +; AVX1-NEXT: xorq %rcx, %rax +; AVX1-NEXT: vcvttsd2si %xmm2, %rdx +; AVX1-NEXT: vucomisd %xmm1, %xmm2 +; AVX1-NEXT: cmovaeq %rax, %rdx +; AVX1-NEXT: vmovq %rdx, %xmm2 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX1-NEXT: vsubsd %xmm1, %xmm0, %xmm3 +; AVX1-NEXT: vcvttsd2si %xmm3, %rax +; AVX1-NEXT: xorq %rcx, %rax +; AVX1-NEXT: vcvttsd2si %xmm0, %rdx +; AVX1-NEXT: vucomisd %xmm1, %xmm0 +; AVX1-NEXT: cmovaeq %rax, %rdx +; AVX1-NEXT: vmovq %rdx, %xmm3 +; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX1-NEXT: vsubsd %xmm1, %xmm0, %xmm4 +; AVX1-NEXT: vcvttsd2si %xmm4, %rax +; AVX1-NEXT: xorq %rcx, %rax +; AVX1-NEXT: vcvttsd2si %xmm0, %rcx +; AVX1-NEXT: vucomisd %xmm1, %xmm0 +; AVX1-NEXT: cmovaeq %rax, %rcx +; AVX1-NEXT: vmovq %rcx, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: fptoui_4f64_to_4i64: +; AVX2: # BB#0: +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: vsubsd %xmm1, %xmm2, %xmm3 +; AVX2-NEXT: vcvttsd2si %xmm3, %rax +; AVX2-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 +; AVX2-NEXT: xorq %rcx, %rax +; AVX2-NEXT: vcvttsd2si %xmm2, %rdx +; AVX2-NEXT: vucomisd %xmm1, %xmm2 +; AVX2-NEXT: cmovaeq %rax, %rdx +; AVX2-NEXT: vmovq %rdx, 
%xmm3 +; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX2-NEXT: vsubsd %xmm1, %xmm2, %xmm4 +; AVX2-NEXT: vcvttsd2si %xmm4, %rax +; AVX2-NEXT: xorq %rcx, %rax +; AVX2-NEXT: vcvttsd2si %xmm2, %rdx +; AVX2-NEXT: vucomisd %xmm1, %xmm2 +; AVX2-NEXT: cmovaeq %rax, %rdx +; AVX2-NEXT: vmovq %rdx, %xmm2 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX2-NEXT: vsubsd %xmm1, %xmm0, %xmm3 +; AVX2-NEXT: vcvttsd2si %xmm3, %rax +; AVX2-NEXT: xorq %rcx, %rax +; AVX2-NEXT: vcvttsd2si %xmm0, %rdx +; AVX2-NEXT: vucomisd %xmm1, %xmm0 +; AVX2-NEXT: cmovaeq %rax, %rdx +; AVX2-NEXT: vmovq %rdx, %xmm3 +; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX2-NEXT: vsubsd %xmm1, %xmm0, %xmm4 +; AVX2-NEXT: vcvttsd2si %xmm4, %rax +; AVX2-NEXT: xorq %rcx, %rax +; AVX2-NEXT: vcvttsd2si %xmm0, %rcx +; AVX2-NEXT: vucomisd %xmm1, %xmm0 +; AVX2-NEXT: cmovaeq %rax, %rcx +; AVX2-NEXT: vmovq %rcx, %xmm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: fptoui_4f64_to_4i64: +; AVX512F: # BB#0: +; AVX512F-NEXT: vextractf32x4 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vcvttsd2usi %xmm1, %rax +; AVX512F-NEXT: vmovq %rax, %xmm2 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512F-NEXT: vcvttsd2usi %xmm1, %rax +; AVX512F-NEXT: vmovq %rax, %xmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512F-NEXT: vcvttsd2usi %xmm0, %rax +; AVX512F-NEXT: vmovq %rax, %xmm2 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512F-NEXT: vcvttsd2usi %xmm0, %rax +; AVX512F-NEXT: vmovq %rax, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX512F-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: fptoui_4f64_to_4i64: +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vcvttpd2uqq %ymm0, %ymm0 +; AVX512DQ-NEXT: retq %cvt = fptoui <4 x double> %a to <4 x i64> ret <4 x i64> %cvt } @@ -414,14 +602,14 @@ define <4 x i32> @fptoui_4f64_to_4i32(<4 x double> %a) { ; SSE-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000 ; SSE-NEXT: xorq %rax, %rcx ; SSE-NEXT: cvttsd2si %xmm1, %rdx -; SSE-NEXT: ucomisd %xmm2, %xmm1 -; SSE-NEXT: cmovaeq %rcx, %rdx -; SSE-NEXT: movd %rdx, %xmm3 -; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: subsd %xmm2, %xmm4 -; SSE-NEXT: cvttsd2si %xmm4, %rcx -; SSE-NEXT: xorq %rax, %rcx +; SSE-NEXT: ucomisd %xmm2, %xmm1 +; SSE-NEXT: cmovaeq %rcx, %rdx +; SSE-NEXT: movd %rdx, %xmm3 +; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: subsd %xmm2, %xmm4 +; SSE-NEXT: cvttsd2si %xmm4, %rcx +; SSE-NEXT: xorq %rax, %rcx ; SSE-NEXT: cvttsd2si %xmm1, %rdx ; SSE-NEXT: ucomisd %xmm2, %xmm1 ; SSE-NEXT: cmovaeq %rcx, %rdx @@ -433,14 +621,14 @@ define <4 x i32> @fptoui_4f64_to_4i32(<4 x double> %a) { ; SSE-NEXT: cvttsd2si %xmm3, %rcx ; SSE-NEXT: xorq %rax, %rcx ; SSE-NEXT: cvttsd2si %xmm0, %rdx -; SSE-NEXT: ucomisd %xmm2, %xmm0 -; SSE-NEXT: cmovaeq %rcx, %rdx -; SSE-NEXT: movd %rdx, %xmm3 -; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: movaps %xmm0, %xmm4 -; SSE-NEXT: subsd %xmm2, %xmm4 -; SSE-NEXT: cvttsd2si %xmm4, %rcx -; SSE-NEXT: xorq %rax, %rcx +; SSE-NEXT: ucomisd %xmm2, %xmm0 +; SSE-NEXT: cmovaeq %rcx, %rdx +; SSE-NEXT: movd %rdx, %xmm3 +; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: subsd %xmm2, %xmm4 +; SSE-NEXT: cvttsd2si %xmm4, %rcx +; SSE-NEXT: xorq %rax, %rcx ; SSE-NEXT: cvttsd2si %xmm0, %rax ; 
SSE-NEXT: ucomisd %xmm2, %xmm0
; SSE-NEXT: cmovaeq %rcx, %rax
@@ -465,6 +653,11 @@ define <4 x i32> @fptoui_4f64_to_4i32(<4 x double> %a) {
; AVX-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
+;
+; AVX512-LABEL: fptoui_4f64_to_4i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vcvttpd2udq %ymm0, %xmm0
+; AVX512-NEXT: retq
 %cvt = fptoui <4 x double> %a to <4 x i32>
 ret <4 x i32> %cvt
}
@@ -483,6 +676,11 @@ define <4 x i32> @fptosi_4f32_to_4i32(<4 x float> %a) {
; AVX: # BB#0:
; AVX-NEXT: vcvttps2dq %xmm0, %xmm0
; AVX-NEXT: retq
+;
+; AVX512-LABEL: fptosi_4f32_to_4i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vcvttps2dq %xmm0, %xmm0
+; AVX512-NEXT: retq
 %cvt = fptosi <4 x float> %a to <4 x i32>
 ret <4 x i32> %cvt
}
@@ -508,6 +706,16 @@ define <2 x i64> @fptosi_2f32_to_2i64(<4 x float> %a) {
; AVX-NEXT: vmovq %rax, %xmm0
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX-NEXT: retq
+;
+; AVX512-LABEL: fptosi_2f32_to_2i64:
+; AVX512: # BB#0:
+; AVX512-NEXT: vcvttss2si %xmm0, %rax
+; AVX512-NEXT: vmovq %rax, %xmm1
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX512-NEXT: vcvttss2si %xmm0, %rax
+; AVX512-NEXT: vmovq %rax, %xmm0
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT: retq
 %shuf = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 1>
 %cvt = fptosi <2 x float> %shuf to <2 x i64>
 ret <2 x i64> %cvt
@@ -534,6 +742,22 @@ define <2 x i64> @fptosi_4f32_to_2i64(<4 x float> %a) {
; AVX-NEXT: vmovq %rax, %xmm1
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: retq
+;
+; AVX512F-LABEL: fptosi_4f32_to_2i64:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512F-NEXT: vcvttss2si %xmm1, %rax
+; AVX512F-NEXT: vcvttss2si %xmm0, %rcx
+; AVX512F-NEXT: vmovq %rcx, %xmm0
+; AVX512F-NEXT: vmovq %rax, %xmm1
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512F-NEXT: retq
+;
+; AVX512DQ-LABEL: fptosi_4f32_to_2i64:
+; AVX512DQ: # BB#0:
+; AVX512DQ-NEXT: vcvttps2qq %xmm0, %ymm0
+; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %YMM0
+; AVX512DQ-NEXT: retq
 %cvt = fptosi <4 x float> %a to <4 x i64>
 %shuf = shufflevector <4 x i64> %cvt, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
 ret <2 x i64> %shuf
@@ -550,6 +774,11 @@ define <8 x i32> @fptosi_8f32_to_8i32(<8 x float> %a) {
; AVX: # BB#0:
; AVX-NEXT: vcvttps2dq %ymm0, %ymm0
; AVX-NEXT: retq
+;
+; AVX512-LABEL: fptosi_8f32_to_8i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vcvttps2dq %ymm0, %ymm0
+; AVX512-NEXT: retq
 %cvt = fptosi <8 x float> %a to <8 x i32>
 ret <8 x i32> %cvt
}
@@ -565,33 +794,74 @@ define <4 x i64> @fptosi_4f32_to_4i64(<8 x float> %a) {
; SSE-NEXT: movd %rax, %xmm1
; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE-NEXT: movaps %xmm0, %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; SSE-NEXT: cvttss2si %xmm1, %rax
-; SSE-NEXT: movd %rax, %xmm3
-; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
-; SSE-NEXT: cvttss2si %xmm0, %rax
-; SSE-NEXT: movd %rax, %xmm1
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; SSE-NEXT: cvttss2si %xmm1, %rax
+; SSE-NEXT: movd %rax, %xmm3
+; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT: cvttss2si %xmm0, %rax
+; SSE-NEXT: movd %rax, %xmm1
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; SSE-NEXT: movdqa %xmm2, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: fptosi_4f32_to_4i64:
-; AVX: # BB#0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX-NEXT: vcvttss2si %xmm1, %rax
-; AVX-NEXT: vmovq %rax, %xmm1
-; AVX-NEXT:
vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
-; AVX-NEXT: vcvttss2si %xmm2, %rax
-; AVX-NEXT: vmovq %rax, %xmm2
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX-NEXT: vcvttss2si %xmm0, %rax
-; AVX-NEXT: vmovq %rax, %xmm2
-; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX-NEXT: vcvttss2si %xmm0, %rax
-; AVX-NEXT: vmovq %rax, %xmm0
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX-NEXT: retq
+; AVX1-LABEL: fptosi_4f32_to_4i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX1-NEXT: vcvttss2si %xmm1, %rax
+; AVX1-NEXT: vmovq %rax, %xmm1
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX1-NEXT: vcvttss2si %xmm2, %rax
+; AVX1-NEXT: vmovq %rax, %xmm2
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX1-NEXT: vcvttss2si %xmm0, %rax
+; AVX1-NEXT: vmovq %rax, %xmm2
+; AVX1-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX1-NEXT: vcvttss2si %xmm0, %rax
+; AVX1-NEXT: vmovq %rax, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: fptosi_4f32_to_4i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX2-NEXT: vcvttss2si %xmm1, %rax
+; AVX2-NEXT: vmovq %rax, %xmm1
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX2-NEXT: vcvttss2si %xmm2, %rax
+; AVX2-NEXT: vmovq %rax, %xmm2
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX2-NEXT: vcvttss2si %xmm0, %rax
+; AVX2-NEXT: vmovq %rax, %xmm2
+; AVX2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX2-NEXT: vcvttss2si %xmm0, %rax
+; AVX2-NEXT: vmovq %rax, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: fptosi_4f32_to_4i64:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX512F-NEXT: vcvttss2si %xmm1, %rax
+; AVX512F-NEXT: vmovq %rax, %xmm1
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX512F-NEXT: vcvttss2si %xmm2, %rax
+; AVX512F-NEXT: vmovq %rax, %xmm2
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512F-NEXT: vcvttss2si %xmm0, %rax
+; AVX512F-NEXT: vmovq %rax, %xmm2
+; AVX512F-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX512F-NEXT: vcvttss2si %xmm0, %rax
+; AVX512F-NEXT: vmovq %rax, %xmm0
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX512F-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: retq
+;
+; AVX512DQ-LABEL: fptosi_4f32_to_4i64:
+; AVX512DQ: # BB#0:
+; AVX512DQ-NEXT: vcvttps2qq %xmm0, %ymm0
+; AVX512DQ-NEXT: retq
 %shuf = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 %cvt = fptosi <4 x float> %shuf to <4 x i64>
 ret <4 x i64> %cvt
@@ -608,33 +878,75 @@ define <4 x i64> @fptosi_8f32_to_4i64(<8 x float> %a) {
; SSE-NEXT: movd %rax, %xmm1
; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE-NEXT: movaps %xmm0, %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; SSE-NEXT: cvttss2si %xmm1, %rax
-; SSE-NEXT: movd %rax, %xmm3
-; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
-; SSE-NEXT: cvttss2si %xmm0, %rax
-; SSE-NEXT: movd %rax, %xmm1
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; SSE-NEXT: cvttss2si %xmm1, %rax
+; SSE-NEXT: movd %rax, %xmm3
+; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT: cvttss2si %xmm0, %rax
+; SSE-NEXT: movd %rax, %xmm1
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; SSE-NEXT: movdqa %xmm2, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: fptosi_8f32_to_4i64:
-; AVX: # BB#0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX-NEXT: vcvttss2si %xmm1, %rax
-; AVX-NEXT: vmovq %rax, %xmm1
-; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
-; AVX-NEXT: vcvttss2si %xmm2, %rax
-; AVX-NEXT: vmovq %rax, %xmm2
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX-NEXT: vcvttss2si %xmm0, %rax
-; AVX-NEXT: vmovq %rax, %xmm2
-; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX-NEXT: vcvttss2si %xmm0, %rax
-; AVX-NEXT: vmovq %rax, %xmm0
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX-NEXT: retq
+; AVX1-LABEL: fptosi_8f32_to_4i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX1-NEXT: vcvttss2si %xmm1, %rax
+; AVX1-NEXT: vmovq %rax, %xmm1
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX1-NEXT: vcvttss2si %xmm2, %rax
+; AVX1-NEXT: vmovq %rax, %xmm2
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX1-NEXT: vcvttss2si %xmm0, %rax
+; AVX1-NEXT: vmovq %rax, %xmm2
+; AVX1-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX1-NEXT: vcvttss2si %xmm0, %rax
+; AVX1-NEXT: vmovq %rax, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: fptosi_8f32_to_4i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX2-NEXT: vcvttss2si %xmm1, %rax
+; AVX2-NEXT: vmovq %rax, %xmm1
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX2-NEXT: vcvttss2si %xmm2, %rax
+; AVX2-NEXT: vmovq %rax, %xmm2
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX2-NEXT: vcvttss2si %xmm0, %rax
+; AVX2-NEXT: vmovq %rax, %xmm2
+; AVX2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX2-NEXT: vcvttss2si %xmm0, %rax
+; AVX2-NEXT: vmovq %rax, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: fptosi_8f32_to_4i64:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512F-NEXT: vcvttss2si %xmm1, %rax
+; AVX512F-NEXT: vcvttss2si %xmm0, %rcx
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512F-NEXT: vcvttss2si %xmm1, %rdx
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512F-NEXT: vcvttss2si %xmm0, %rsi
+; AVX512F-NEXT: vmovq %rsi, %xmm0
+; AVX512F-NEXT: vmovq %rdx, %xmm1
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512F-NEXT: vmovq %rcx, %xmm1
+; AVX512F-NEXT: vmovq %rax, %xmm2
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512F-NEXT: vinserti32x4 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT: retq
+;
+; AVX512DQ-LABEL: fptosi_8f32_to_4i64:
+; AVX512DQ: # BB#0:
+; AVX512DQ-NEXT: vcvttps2qq %ymm0, %zmm0
+; AVX512DQ-NEXT: # kill: %YMM0 %YMM0 %ZMM0
+; AVX512DQ-NEXT: retq
 %cvt = fptosi <8 x float> %a to <8 x i64>
 %shuf = shufflevector <8 x i64> %cvt, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ret <4 x i64> %shuf
@@ -655,13 +967,13 @@ define <4 x i32> @fptoui_4f32_to_4i32(<4 x float> %a) {
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,2,3]
; SSE-NEXT: cvttss2si %xmm2, %rax
; SSE-NEXT: movd %eax, %xmm2
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE-NEXT: cvttss2si %xmm0, %rax
-; SSE-NEXT: movd %eax, %xmm1
-; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
-; SSE-NEXT: cvttss2si %xmm0, %rax
-; SSE-NEXT: movd %eax, %xmm0
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE-NEXT: cvttss2si %xmm0, %rax
+; SSE-NEXT: movd %eax, %xmm1
+; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT: cvttss2si %xmm0, %rax
+; SSE-NEXT: movd %eax, %xmm0
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
@@ -680,6 +992,11 @@ define <4 x i32> @fptoui_4f32_to_4i32(<4 x float> %a) {
; AVX-NEXT: vcvttss2si %xmm0, %rax
; AVX-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0
; AVX-NEXT: retq
+;
+; AVX512-LABEL: fptoui_4f32_to_4i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vcvttps2udq %xmm0, %xmm0
+; AVX512-NEXT: retq
 %cvt = fptoui <4 x float> %a to <4 x i32>
 ret <4 x i32> %cvt
}
@@ -731,6 +1048,16 @@ define <2 x i64> @fptoui_2f32_to_2i64(<4 x float> %a) {
; AVX-NEXT: vmovq %rcx, %xmm0
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
; AVX-NEXT: retq
+;
+; AVX512-LABEL: fptoui_2f32_to_2i64:
+; AVX512: # BB#0:
+; AVX512-NEXT: vcvttss2usi %xmm0, %rax
+; AVX512-NEXT: vmovq %rax, %xmm1
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX512-NEXT: vcvttss2usi %xmm0, %rax
+; AVX512-NEXT: vmovq %rax, %xmm0
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT: retq
 %shuf = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 1>
 %cvt = fptoui <2 x float> %shuf to <2 x i64>
 ret <2 x i64> %cvt
@@ -783,6 +1110,22 @@ define <2 x i64> @fptoui_4f32_to_2i64(<4 x float> %a) {
; AVX-NEXT: vmovq %rdx, %xmm1
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: retq
+;
+; AVX512F-LABEL: fptoui_4f32_to_2i64:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512F-NEXT: vcvttss2usi %xmm1, %rax
+; AVX512F-NEXT: vcvttss2usi %xmm0, %rcx
+; AVX512F-NEXT: vmovq %rcx, %xmm0
+; AVX512F-NEXT: vmovq %rax, %xmm1
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512F-NEXT: retq
+;
+; AVX512DQ-LABEL: fptoui_4f32_to_2i64:
+; AVX512DQ: # BB#0:
+; AVX512DQ-NEXT: vcvttps2uqq %xmm0, %ymm0
+; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %YMM0
+; AVX512DQ-NEXT: retq
 %cvt = fptoui <4 x float> %a to <4 x i64>
 %shuf = shufflevector <4 x i64> %cvt, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
 ret <2 x i64> %shuf
@@ -799,13 +1142,13 @@ define <8 x i32> @fptoui_8f32_to_8i32(<8 x float> %a) {
; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,2,3]
; SSE-NEXT: cvttss2si %xmm3, %rax
; SSE-NEXT: movd %eax, %xmm3
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
-; SSE-NEXT: cvttss2si %xmm2, %rax
-; SSE-NEXT: movd %eax, %xmm0
-; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
-; SSE-NEXT: cvttss2si %xmm2, %rax
-; SSE-NEXT: movd %eax, %xmm2
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; SSE-NEXT: cvttss2si %xmm2, %rax
+; SSE-NEXT: movd %eax, %xmm0
+; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
+; SSE-NEXT: cvttss2si %xmm2, %rax
+; SSE-NEXT: movd %eax, %xmm2
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE-NEXT: movaps %xmm1, %xmm2
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1,2,3]
; SSE-NEXT: cvttss2si %xmm2, %rax
; SSE-NEXT: movd %eax, %xmm2
; SSE-NEXT: movaps %xmm1, %xmm3
; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,2,3]
; SSE-NEXT: cvttss2si %xmm3, %rax
; SSE-NEXT: movd %eax, %xmm3
-; SSE-NEXT: punpckldq {{.*#+}} xmm3
= xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: cvttss2si %xmm1, %rax -; SSE-NEXT: movd %eax, %xmm2 -; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] -; SSE-NEXT: cvttss2si %xmm1, %rax -; SSE-NEXT: movd %eax, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: cvttss2si %xmm1, %rax +; SSE-NEXT: movd %eax, %xmm2 +; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] +; SSE-NEXT: cvttss2si %xmm1, %rax +; SSE-NEXT: movd %eax, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: retq ; -; AVX-LABEL: fptoui_8f32_to_8i32: -; AVX: # BB#0: -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] -; AVX-NEXT: vcvttss2si %xmm2, %rax -; AVX-NEXT: vcvttss2si %xmm1, %rcx -; AVX-NEXT: vmovd %ecx, %xmm2 -; AVX-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 -; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] -; AVX-NEXT: vcvttss2si %xmm3, %rax -; AVX-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX-NEXT: vcvttss2si %xmm1, %rax -; AVX-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1 -; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] -; AVX-NEXT: vcvttss2si %xmm2, %rax -; AVX-NEXT: vcvttss2si %xmm0, %rcx -; AVX-NEXT: vmovd %ecx, %xmm2 -; AVX-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 -; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] -; AVX-NEXT: vcvttss2si %xmm3, %rax -; AVX-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX-NEXT: vcvttss2si %xmm0, %rax -; AVX-NEXT: vpinsrd $3, %eax, %xmm2, %xmm0 -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: fptoui_8f32_to_8i32: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX1-NEXT: vcvttss2si %xmm2, %rax +; AVX1-NEXT: vcvttss2si %xmm1, %rcx +; AVX1-NEXT: vmovd %ecx, %xmm2 +; AVX1-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX1-NEXT: vcvttss2si %xmm3, %rax +; AVX1-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3] +; AVX1-NEXT: vcvttss2si %xmm1, %rax +; AVX1-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1 +; AVX1-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] +; AVX1-NEXT: vcvttss2si %xmm2, %rax +; AVX1-NEXT: vcvttss2si %xmm0, %rcx +; AVX1-NEXT: vmovd %ecx, %xmm2 +; AVX1-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX1-NEXT: vcvttss2si %xmm3, %rax +; AVX1-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX1-NEXT: vcvttss2si %xmm0, %rax +; AVX1-NEXT: vpinsrd $3, %eax, %xmm2, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: fptoui_8f32_to_8i32: +; AVX2: # BB#0: +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX2-NEXT: vcvttss2si %xmm2, %rax +; AVX2-NEXT: vcvttss2si %xmm1, %rcx +; AVX2-NEXT: vmovd %ecx, %xmm2 +; AVX2-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX2-NEXT: vcvttss2si %xmm3, %rax +; AVX2-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3] +; AVX2-NEXT: vcvttss2si %xmm1, %rax +; AVX2-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1 +; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] +; 
AVX2-NEXT: vcvttss2si %xmm2, %rax +; AVX2-NEXT: vcvttss2si %xmm0, %rcx +; AVX2-NEXT: vmovd %ecx, %xmm2 +; AVX2-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX2-NEXT: vcvttss2si %xmm3, %rax +; AVX2-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX2-NEXT: vcvttss2si %xmm0, %rax +; AVX2-NEXT: vpinsrd $3, %eax, %xmm2, %xmm0 +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: fptoui_8f32_to_8i32: +; AVX512: # BB#0: +; AVX512-NEXT: vcvttps2udq %ymm0, %ymm0 +; AVX512-NEXT: retq %cvt = fptoui <8 x float> %a to <8 x i32> ret <8 x i32> %cvt } @@ -888,14 +1264,14 @@ define <4 x i64> @fptoui_4f32_to_4i64(<8 x float> %a) { ; SSE-NEXT: cvttss2si %xmm4, %rcx ; SSE-NEXT: xorq %rax, %rcx ; SSE-NEXT: cvttss2si %xmm3, %rdx -; SSE-NEXT: ucomiss %xmm1, %xmm3 -; SSE-NEXT: cmovaeq %rcx, %rdx -; SSE-NEXT: movd %rdx, %xmm3 -; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: movaps %xmm0, %xmm4 -; SSE-NEXT: subss %xmm1, %xmm4 -; SSE-NEXT: cvttss2si %xmm4, %rcx -; SSE-NEXT: xorq %rax, %rcx +; SSE-NEXT: ucomiss %xmm1, %xmm3 +; SSE-NEXT: cmovaeq %rcx, %rdx +; SSE-NEXT: movd %rdx, %xmm3 +; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: subss %xmm1, %xmm4 +; SSE-NEXT: cvttss2si %xmm4, %rcx +; SSE-NEXT: xorq %rax, %rcx ; SSE-NEXT: cvttss2si %xmm0, %rax ; SSE-NEXT: ucomiss %xmm1, %xmm0 ; SSE-NEXT: cmovaeq %rcx, %rax @@ -904,45 +1280,108 @@ define <4 x i64> @fptoui_4f32_to_4i64(<8 x float> %a) { ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: fptoui_4f32_to_4i64: -; AVX: # BB#0: -; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3] -; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-NEXT: vsubss %xmm1, %xmm2, %xmm3 -; AVX-NEXT: vcvttss2si %xmm3, %rax -; AVX-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; AVX-NEXT: xorq %rcx, %rax -; AVX-NEXT: vcvttss2si %xmm2, %rdx -; AVX-NEXT: vucomiss %xmm1, %xmm2 -; AVX-NEXT: cmovaeq %rax, %rdx -; AVX-NEXT: vmovq %rdx, %xmm2 -; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] -; AVX-NEXT: vsubss %xmm1, %xmm3, %xmm4 -; AVX-NEXT: vcvttss2si %xmm4, %rax -; AVX-NEXT: xorq %rcx, %rax -; AVX-NEXT: vcvttss2si %xmm3, %rdx -; AVX-NEXT: vucomiss %xmm1, %xmm3 -; AVX-NEXT: cmovaeq %rax, %rdx -; AVX-NEXT: vmovq %rdx, %xmm3 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm3 -; AVX-NEXT: vcvttss2si %xmm3, %rax -; AVX-NEXT: xorq %rcx, %rax -; AVX-NEXT: vcvttss2si %xmm0, %rdx -; AVX-NEXT: vucomiss %xmm1, %xmm0 -; AVX-NEXT: cmovaeq %rax, %rdx -; AVX-NEXT: vmovq %rdx, %xmm3 -; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm4 -; AVX-NEXT: vcvttss2si %xmm4, %rax -; AVX-NEXT: xorq %rcx, %rax -; AVX-NEXT: vcvttss2si %xmm0, %rcx -; AVX-NEXT: vucomiss %xmm1, %xmm0 -; AVX-NEXT: cmovaeq %rax, %rcx -; AVX-NEXT: vmovq %rcx, %xmm0 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: fptoui_4f32_to_4i64: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3] +; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX1-NEXT: vsubss %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vcvttss2si %xmm3, %rax +; AVX1-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 +; AVX1-NEXT: xorq %rcx, %rax +; AVX1-NEXT: vcvttss2si %xmm2, %rdx +; AVX1-NEXT: vucomiss %xmm1, %xmm2 +; AVX1-NEXT: cmovaeq %rax, %rdx +; 
AVX1-NEXT: vmovq %rdx, %xmm2
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
+; AVX1-NEXT: vsubss %xmm1, %xmm3, %xmm4
+; AVX1-NEXT: vcvttss2si %xmm4, %rax
+; AVX1-NEXT: xorq %rcx, %rax
+; AVX1-NEXT: vcvttss2si %xmm3, %rdx
+; AVX1-NEXT: vucomiss %xmm1, %xmm3
+; AVX1-NEXT: cmovaeq %rax, %rdx
+; AVX1-NEXT: vmovq %rdx, %xmm3
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX1-NEXT: vsubss %xmm1, %xmm0, %xmm3
+; AVX1-NEXT: vcvttss2si %xmm3, %rax
+; AVX1-NEXT: xorq %rcx, %rax
+; AVX1-NEXT: vcvttss2si %xmm0, %rdx
+; AVX1-NEXT: vucomiss %xmm1, %xmm0
+; AVX1-NEXT: cmovaeq %rax, %rdx
+; AVX1-NEXT: vmovq %rdx, %xmm3
+; AVX1-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX1-NEXT: vsubss %xmm1, %xmm0, %xmm4
+; AVX1-NEXT: vcvttss2si %xmm4, %rax
+; AVX1-NEXT: xorq %rcx, %rax
+; AVX1-NEXT: vcvttss2si %xmm0, %rcx
+; AVX1-NEXT: vucomiss %xmm1, %xmm0
+; AVX1-NEXT: cmovaeq %rax, %rcx
+; AVX1-NEXT: vmovq %rcx, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: fptoui_4f32_to_4i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3]
+; AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX2-NEXT: vsubss %xmm1, %xmm2, %xmm3
+; AVX2-NEXT: vcvttss2si %xmm3, %rax
+; AVX2-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
+; AVX2-NEXT: xorq %rcx, %rax
+; AVX2-NEXT: vcvttss2si %xmm2, %rdx
+; AVX2-NEXT: vucomiss %xmm1, %xmm2
+; AVX2-NEXT: cmovaeq %rax, %rdx
+; AVX2-NEXT: vmovq %rdx, %xmm2
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
+; AVX2-NEXT: vsubss %xmm1, %xmm3, %xmm4
+; AVX2-NEXT: vcvttss2si %xmm4, %rax
+; AVX2-NEXT: xorq %rcx, %rax
+; AVX2-NEXT: vcvttss2si %xmm3, %rdx
+; AVX2-NEXT: vucomiss %xmm1, %xmm3
+; AVX2-NEXT: cmovaeq %rax, %rdx
+; AVX2-NEXT: vmovq %rdx, %xmm3
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX2-NEXT: vsubss %xmm1, %xmm0, %xmm3
+; AVX2-NEXT: vcvttss2si %xmm3, %rax
+; AVX2-NEXT: xorq %rcx, %rax
+; AVX2-NEXT: vcvttss2si %xmm0, %rdx
+; AVX2-NEXT: vucomiss %xmm1, %xmm0
+; AVX2-NEXT: cmovaeq %rax, %rdx
+; AVX2-NEXT: vmovq %rdx, %xmm3
+; AVX2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX2-NEXT: vsubss %xmm1, %xmm0, %xmm4
+; AVX2-NEXT: vcvttss2si %xmm4, %rax
+; AVX2-NEXT: xorq %rcx, %rax
+; AVX2-NEXT: vcvttss2si %xmm0, %rcx
+; AVX2-NEXT: vucomiss %xmm1, %xmm0
+; AVX2-NEXT: cmovaeq %rax, %rcx
+; AVX2-NEXT: vmovq %rcx, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: fptoui_4f32_to_4i64:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX512F-NEXT: vcvttss2usi %xmm1, %rax
+; AVX512F-NEXT: vmovq %rax, %xmm1
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX512F-NEXT: vcvttss2usi %xmm2, %rax
+; AVX512F-NEXT: vmovq %rax, %xmm2
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512F-NEXT: vcvttss2usi %xmm0, %rax
+; AVX512F-NEXT: vmovq %rax, %xmm2
+; AVX512F-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX512F-NEXT: vcvttss2usi %xmm0, %rax
+; AVX512F-NEXT: vmovq %rax, %xmm0
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX512F-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: retq
+;
+; AVX512DQ-LABEL: fptoui_4f32_to_4i64:
+; AVX512DQ: # BB#0:
+; AVX512DQ-NEXT: vcvttps2uqq %xmm0, %ymm0
+; AVX512DQ-NEXT: retq
 %shuf = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 %cvt = fptoui <4 x float> %shuf to <4 x i64>
 ret <4
x i64> %cvt @@ -979,14 +1418,14 @@ define <4 x i64> @fptoui_8f32_to_4i64(<8 x float> %a) { ; SSE-NEXT: cvttss2si %xmm4, %rcx ; SSE-NEXT: xorq %rax, %rcx ; SSE-NEXT: cvttss2si %xmm3, %rdx -; SSE-NEXT: ucomiss %xmm1, %xmm3 -; SSE-NEXT: cmovaeq %rcx, %rdx -; SSE-NEXT: movd %rdx, %xmm3 -; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: movaps %xmm0, %xmm4 -; SSE-NEXT: subss %xmm1, %xmm4 -; SSE-NEXT: cvttss2si %xmm4, %rcx -; SSE-NEXT: xorq %rax, %rcx +; SSE-NEXT: ucomiss %xmm1, %xmm3 +; SSE-NEXT: cmovaeq %rcx, %rdx +; SSE-NEXT: movd %rdx, %xmm3 +; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: subss %xmm1, %xmm4 +; SSE-NEXT: cvttss2si %xmm4, %rcx +; SSE-NEXT: xorq %rax, %rcx ; SSE-NEXT: cvttss2si %xmm0, %rax ; SSE-NEXT: ucomiss %xmm1, %xmm0 ; SSE-NEXT: cmovaeq %rcx, %rax @@ -995,45 +1434,109 @@ define <4 x i64> @fptoui_8f32_to_4i64(<8 x float> %a) { ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: fptoui_8f32_to_4i64: -; AVX: # BB#0: -; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3] -; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-NEXT: vsubss %xmm1, %xmm2, %xmm3 -; AVX-NEXT: vcvttss2si %xmm3, %rax -; AVX-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; AVX-NEXT: xorq %rcx, %rax -; AVX-NEXT: vcvttss2si %xmm2, %rdx -; AVX-NEXT: vucomiss %xmm1, %xmm2 -; AVX-NEXT: cmovaeq %rax, %rdx -; AVX-NEXT: vmovq %rdx, %xmm2 -; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] -; AVX-NEXT: vsubss %xmm1, %xmm3, %xmm4 -; AVX-NEXT: vcvttss2si %xmm4, %rax -; AVX-NEXT: xorq %rcx, %rax -; AVX-NEXT: vcvttss2si %xmm3, %rdx -; AVX-NEXT: vucomiss %xmm1, %xmm3 -; AVX-NEXT: cmovaeq %rax, %rdx -; AVX-NEXT: vmovq %rdx, %xmm3 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm3 -; AVX-NEXT: vcvttss2si %xmm3, %rax -; AVX-NEXT: xorq %rcx, %rax -; AVX-NEXT: vcvttss2si %xmm0, %rdx -; AVX-NEXT: vucomiss %xmm1, %xmm0 -; AVX-NEXT: cmovaeq %rax, %rdx -; AVX-NEXT: vmovq %rdx, %xmm3 -; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm4 -; AVX-NEXT: vcvttss2si %xmm4, %rax -; AVX-NEXT: xorq %rcx, %rax -; AVX-NEXT: vcvttss2si %xmm0, %rcx -; AVX-NEXT: vucomiss %xmm1, %xmm0 -; AVX-NEXT: cmovaeq %rax, %rcx -; AVX-NEXT: vmovq %rcx, %xmm0 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: fptoui_8f32_to_4i64: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3] +; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX1-NEXT: vsubss %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vcvttss2si %xmm3, %rax +; AVX1-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 +; AVX1-NEXT: xorq %rcx, %rax +; AVX1-NEXT: vcvttss2si %xmm2, %rdx +; AVX1-NEXT: vucomiss %xmm1, %xmm2 +; AVX1-NEXT: cmovaeq %rax, %rdx +; AVX1-NEXT: vmovq %rdx, %xmm2 +; AVX1-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX1-NEXT: vsubss %xmm1, %xmm3, %xmm4 +; AVX1-NEXT: vcvttss2si %xmm4, %rax +; AVX1-NEXT: xorq %rcx, %rax +; AVX1-NEXT: vcvttss2si %xmm3, %rdx +; AVX1-NEXT: vucomiss %xmm1, %xmm3 +; AVX1-NEXT: cmovaeq %rax, %rdx +; AVX1-NEXT: vmovq %rdx, %xmm3 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX1-NEXT: vsubss %xmm1, %xmm0, %xmm3 +; AVX1-NEXT: vcvttss2si %xmm3, %rax +; AVX1-NEXT: xorq %rcx, %rax +; AVX1-NEXT: vcvttss2si %xmm0, %rdx +; AVX1-NEXT: vucomiss %xmm1, %xmm0 +; AVX1-NEXT: cmovaeq %rax, %rdx +; AVX1-NEXT: vmovq %rdx, %xmm3 +; AVX1-NEXT: vmovshdup 
{{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX1-NEXT: vsubss %xmm1, %xmm0, %xmm4
+; AVX1-NEXT: vcvttss2si %xmm4, %rax
+; AVX1-NEXT: xorq %rcx, %rax
+; AVX1-NEXT: vcvttss2si %xmm0, %rcx
+; AVX1-NEXT: vucomiss %xmm1, %xmm0
+; AVX1-NEXT: cmovaeq %rax, %rcx
+; AVX1-NEXT: vmovq %rcx, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: fptoui_8f32_to_4i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3]
+; AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX2-NEXT: vsubss %xmm1, %xmm2, %xmm3
+; AVX2-NEXT: vcvttss2si %xmm3, %rax
+; AVX2-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
+; AVX2-NEXT: xorq %rcx, %rax
+; AVX2-NEXT: vcvttss2si %xmm2, %rdx
+; AVX2-NEXT: vucomiss %xmm1, %xmm2
+; AVX2-NEXT: cmovaeq %rax, %rdx
+; AVX2-NEXT: vmovq %rdx, %xmm2
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
+; AVX2-NEXT: vsubss %xmm1, %xmm3, %xmm4
+; AVX2-NEXT: vcvttss2si %xmm4, %rax
+; AVX2-NEXT: xorq %rcx, %rax
+; AVX2-NEXT: vcvttss2si %xmm3, %rdx
+; AVX2-NEXT: vucomiss %xmm1, %xmm3
+; AVX2-NEXT: cmovaeq %rax, %rdx
+; AVX2-NEXT: vmovq %rdx, %xmm3
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX2-NEXT: vsubss %xmm1, %xmm0, %xmm3
+; AVX2-NEXT: vcvttss2si %xmm3, %rax
+; AVX2-NEXT: xorq %rcx, %rax
+; AVX2-NEXT: vcvttss2si %xmm0, %rdx
+; AVX2-NEXT: vucomiss %xmm1, %xmm0
+; AVX2-NEXT: cmovaeq %rax, %rdx
+; AVX2-NEXT: vmovq %rdx, %xmm3
+; AVX2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX2-NEXT: vsubss %xmm1, %xmm0, %xmm4
+; AVX2-NEXT: vcvttss2si %xmm4, %rax
+; AVX2-NEXT: xorq %rcx, %rax
+; AVX2-NEXT: vcvttss2si %xmm0, %rcx
+; AVX2-NEXT: vucomiss %xmm1, %xmm0
+; AVX2-NEXT: cmovaeq %rax, %rcx
+; AVX2-NEXT: vmovq %rcx, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: fptoui_8f32_to_4i64:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512F-NEXT: vcvttss2usi %xmm1, %rax
+; AVX512F-NEXT: vcvttss2usi %xmm0, %rcx
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512F-NEXT: vcvttss2usi %xmm1, %rdx
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512F-NEXT: vcvttss2usi %xmm0, %rsi
+; AVX512F-NEXT: vmovq %rsi, %xmm0
+; AVX512F-NEXT: vmovq %rdx, %xmm1
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512F-NEXT: vmovq %rcx, %xmm1
+; AVX512F-NEXT: vmovq %rax, %xmm2
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512F-NEXT: vinserti32x4 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT: retq
+;
+; AVX512DQ-LABEL: fptoui_8f32_to_4i64:
+; AVX512DQ: # BB#0:
+; AVX512DQ-NEXT: vcvttps2uqq %ymm0, %zmm0
+; AVX512DQ-NEXT: # kill: %YMM0 %YMM0 %ZMM0
+; AVX512DQ-NEXT: retq
 %cvt = fptoui <8 x float> %a to <8 x i64>
 %shuf = shufflevector <8 x i64> %cvt, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ret <4 x i64> %shuf
@@ -1053,6 +1556,11 @@ define <2 x i64> @fptosi_2f64_to_2i64_const() {
; AVX: # BB#0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,18446744073709551615]
; AVX-NEXT: retq
+;
+; AVX512-LABEL: fptosi_2f64_to_2i64_const:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,18446744073709551615]
+; AVX512-NEXT: retq
 %cvt = fptosi <2 x double> <double 1.0, double -1.0> to <2 x i64>
 ret <2 x i64> %cvt
}
@@ -1067,6 +1575,11 @@ define <4 x i32> @fptosi_2f64_to_2i32_const() {
; AVX: # BB#0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = <4294967295,1,u,u>
; AVX-NEXT: retq
+;
+; AVX512-LABEL: fptosi_2f64_to_2i32_const:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = <4294967295,1,u,u>
+; AVX512-NEXT: retq
 %cvt = fptosi <2 x double> <double -1.0, double 1.0> to <2 x i32>
 %ext = shufflevector <2 x i32> %cvt, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
 ret <4 x i32> %ext
@@ -1083,6 +1596,11 @@ define <4 x i64> @fptosi_4f64_to_4i64_const() {
; AVX: # BB#0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,18446744073709551615,2,18446744073709551613]
; AVX-NEXT: retq
+;
+; AVX512-LABEL: fptosi_4f64_to_4i64_const:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = [1,18446744073709551615,2,18446744073709551613]
+; AVX512-NEXT: retq
 %cvt = fptosi <4 x double> <double 1.0, double -1.0, double 2.0, double -3.0> to <4 x i64>
 ret <4 x i64> %cvt
}
@@ -1097,6 +1615,11 @@ define <4 x i32> @fptosi_4f64_to_4i32_const() {
; AVX: # BB#0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,1,4294967294,3]
; AVX-NEXT: retq
+;
+; AVX512-LABEL: fptosi_4f64_to_4i32_const:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,1,4294967294,3]
+; AVX512-NEXT: retq
 %cvt = fptosi <4 x double> <double -1.0, double 1.0, double -2.0, double 3.0> to <4 x i32>
 ret <4 x i32> %cvt
}
@@ -1111,6 +1634,11 @@ define <2 x i64> @fptoui_2f64_to_2i64_const() {
; AVX: # BB#0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [2,4]
; AVX-NEXT: retq
+;
+; AVX512-LABEL: fptoui_2f64_to_2i64_const:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [2,4]
+; AVX512-NEXT: retq
 %cvt = fptoui <2 x double> <double 2.0, double 4.0> to <2 x i64>
 ret <2 x i64> %cvt
}
@@ -1125,6 +1653,11 @@ define <4 x i32> @fptoui_2f64_to_2i32_const(<2 x double> %a) {
; AVX: # BB#0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = <2,4,u,u>
; AVX-NEXT: retq
+;
+; AVX512-LABEL: fptoui_2f64_to_2i32_const:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = <2,4,u,u>
+; AVX512-NEXT: retq
 %cvt = fptoui <2 x double> <double 2.0, double 4.0> to <2 x i32>
 %ext = shufflevector <2 x i32> %cvt, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
 ret <4 x i32> %ext
@@ -1141,6 +1674,11 @@ define <4 x i64> @fptoui_4f64_to_4i64_const(<4 x double> %a) {
; AVX: # BB#0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [2,4,6,8]
; AVX-NEXT: retq
+;
+; AVX512-LABEL: fptoui_4f64_to_4i64_const:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = [2,4,6,8]
+; AVX512-NEXT: retq
 %cvt = fptoui <4 x double> <double 2.0, double 4.0, double 6.0, double 8.0> to <4 x i64>
 ret <4 x i64> %cvt
}
@@ -1155,6 +1693,11 @@ define <4 x i32> @fptoui_4f64_to_4i32_const(<4 x double> %a) {
; AVX: # BB#0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [2,4,6,8]
; AVX-NEXT: retq
+;
+; AVX512-LABEL: fptoui_4f64_to_4i32_const:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [2,4,6,8]
+; AVX512-NEXT: retq
 %cvt = fptoui <4 x double> <double 2.0, double 4.0, double 6.0, double 8.0> to <4 x i32>
 ret <4 x i32> %cvt
}
@@ -1169,6 +1712,11 @@ define <4 x i32> @fptosi_4f32_to_4i32_const() {
; AVX: # BB#0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,4294967295,2,3]
; AVX-NEXT: retq
+;
+; AVX512-LABEL: fptosi_4f32_to_4i32_const:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,4294967295,2,3]
+; AVX512-NEXT: retq
 %cvt = fptosi <4 x float> <float 1.0, float -1.0, float 2.0, float 3.0> to <4 x i32>
 ret <4 x i32> %cvt
}
@@ -1184,6 +1732,11 @@ define <4 x i64> @fptosi_4f32_to_4i64_const() {
; AVX: # BB#0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,18446744073709551615,2,3]
; AVX-NEXT: retq
+;
+; AVX512-LABEL: fptosi_4f32_to_4i64_const:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = [1,18446744073709551615,2,3]
+; AVX512-NEXT: retq
 %cvt = fptosi <4 x float> <float 1.0, float -1.0, float 2.0, float 3.0> to <4 x i64>
 ret <4 x i64> %cvt
}
@@ -1199,6 +1752,11 @@ define <8 x i32> @fptosi_8f32_to_8i32_const(<8 x float> %a) {
; AVX: # BB#0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,4294967295,2,3,6,4294967288,2,4294967295]
; AVX-NEXT: retq
+;
+; AVX512-LABEL: fptosi_8f32_to_8i32_const:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = [1,4294967295,2,3,6,4294967288,2,4294967295]
+; AVX512-NEXT: retq
 %cvt = fptosi <8 x float> <float 1.0, float -1.0, float 2.0, float 3.0, float 6.0, float -8.0, float 2.0, float -1.0> to <8 x i32>
 ret <8 x i32> %cvt
}
@@ -1213,6 +1771,11 @@ define <4 x i32> @fptoui_4f32_to_4i32_const(<4 x float> %a) {
; AVX: # BB#0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,2,4,6]
; AVX-NEXT: retq
+;
+; AVX512-LABEL: fptoui_4f32_to_4i32_const:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,2,4,6]
+; AVX512-NEXT: retq
 %cvt = fptoui <4 x float> <float 1.0, float 2.0, float 4.0, float 6.0> to <4 x i32>
 ret <4 x i32> %cvt
}
@@ -1228,6 +1791,11 @@ define <4 x i64> @fptoui_4f32_to_4i64_const() {
; AVX: # BB#0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,2,4,8]
; AVX-NEXT: retq
+;
+; AVX512-LABEL: fptoui_4f32_to_4i64_const:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = [1,2,4,8]
+; AVX512-NEXT: retq
 %cvt = fptoui <4 x float> <float 1.0, float 2.0, float 4.0, float 8.0> to <4 x i64>
 ret <4 x i64> %cvt
}
@@ -1243,6 +1811,11 @@ define <8 x i32> @fptoui_8f32_to_8i32_const(<8 x float> %a) {
; AVX: # BB#0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,2,4,6,8,6,4,1]
; AVX-NEXT: retq
+;
+; AVX512-LABEL: fptoui_8f32_to_8i32_const:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = [1,2,4,6,8,6,4,1]
+; AVX512-NEXT: retq
 %cvt = fptoui <8 x float> <float 1.0, float 2.0, float 4.0, float 6.0, float 8.0, float 6.0, float 4.0, float 1.0> to <8 x i32>
 ret <8 x i32> %cvt
}
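
For readers unfamiliar with the format above: each function in the patched test pairs LLVM IR with per-prefix FileCheck assertions, and every RUN line at the top of the file recompiles the whole test with a different -mattr feature set. A minimal sketch of such a test, reusing the AVX512DQ RUN line and the vcvttpd2qq pattern from this patch (the function name here is hypothetical and not part of the commit):

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512dq,+avx512vl | FileCheck %s --check-prefix=AVX512DQ

; With AVX512DQ+VL available, a <2 x double> -> <2 x i64> fptosi lowers to a
; single truncating packed conversion instead of two scalar cvttsd2si round-trips.
define <2 x i64> @fptosi_2f64_to_2i64_sketch(<2 x double> %a) {
; AVX512DQ-LABEL: fptosi_2f64_to_2i64_sketch:
; AVX512DQ:       vcvttpd2qq %xmm0, %xmm0
; AVX512DQ-NEXT:  retq
  %cvt = fptosi <2 x double> %a to <2 x i64>
  ret <2 x i64> %cvt
}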