; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=VEX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=VEX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512VLDQ
;
; 32-bit tests to make sure we're not doing anything stupid.
; RUN: llc < %s -mtriple=i686-unknown-unknown
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2

;
; Signed Integer to Double
;
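; Targets without AVX512DQ have no packed i64 -> f64 conversion, so each
; element is moved to a GPR and converted with scalar cvtsi2sdq; with
; AVX512DQ the whole vector converts in one vcvtqq2pd. The '# kill'
; comments in the checks below are register-liveness annotations noting
; that an xmm value is really the low part of a wider ymm/zmm register.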
define <2 x double> @sitofp_2i64_to_2f64(<2 x i64> %a) {
; SSE-LABEL: sitofp_2i64_to_2f64:
; SSE: # BB#0:
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: cvtsi2sdq %rax, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2sdq %rax, %xmm0
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; VEX-LABEL: sitofp_2i64_to_2f64:
; VEX: # BB#0:
; VEX-NEXT: vpextrq $1, %xmm0, %rax
; VEX-NEXT: vcvtsi2sdq %rax, %xmm1, %xmm1
; VEX-NEXT: vmovq %xmm0, %rax
; VEX-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm0
; VEX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; VEX-NEXT: retq
;
; AVX512F-LABEL: sitofp_2i64_to_2f64:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
; AVX512F-NEXT: vcvtsi2sdq %rax, %xmm1, %xmm1
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm0
; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: sitofp_2i64_to_2f64:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm1, %xmm1
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm0
; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: sitofp_2i64_to_2f64:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvtqq2pd %zmm0, %zmm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: sitofp_2i64_to_2f64:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: vcvtqq2pd %xmm0, %xmm0
; AVX512VLDQ-NEXT: retq
%cvt = sitofp <2 x i64> %a to <2 x double>
ret <2 x double> %cvt
}
define <2 x double> @sitofp_2i32_to_2f64(<4 x i32> %a) {
; SSE-LABEL: sitofp_2i32_to_2f64:
; SSE: # BB#0:
; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: sitofp_2i32_to_2f64:
; AVX: # BB#0:
; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
; AVX-NEXT: retq
%shuf = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
%cvt = sitofp <2 x i32> %shuf to <2 x double>
ret <2 x double> %cvt
}
define <2 x double> @sitofp_4i32_to_2f64(<4 x i32> %a) {
; SSE-LABEL: sitofp_4i32_to_2f64:
; SSE: # BB#0:
; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: sitofp_4i32_to_2f64:
; AVX: # BB#0:
; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
%cvt = sitofp <4 x i32> %a to <4 x double>
%shuf = shufflevector <4 x double> %cvt, <4 x double> undef, <2 x i32> <i32 0, i32 1>
ret <2 x double> %shuf
}
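; SSE2 has no pmovsx instructions (those arrived with SSE4.1), so in the
; checks below the i16/i8 elements are sign-extended by interleaving them
; into the high part of each i32 lane (punpcklwd/punpcklbw) and then
; arithmetic-shifting back down (psrad $16 / psrad $24) before cvtdq2pd;
; the AVX targets use vpmovsxwd/vpmovsxbd directly.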
define <2 x double> @sitofp_2i16_to_2f64(<8 x i16> %a) {
; SSE-LABEL: sitofp_2i16_to_2f64:
; SSE: # BB#0:
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: sitofp_2i16_to_2f64:
; AVX: # BB#0:
; AVX-NEXT: vpmovsxwd %xmm0, %xmm0
; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
; AVX-NEXT: retq
%shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
%cvt = sitofp <2 x i16> %shuf to <2 x double>
ret <2 x double> %cvt
}
define <2 x double> @sitofp_8i16_to_2f64(<8 x i16> %a) {
; SSE-LABEL: sitofp_8i16_to_2f64:
; SSE: # BB#0:
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: sitofp_8i16_to_2f64:
; AVX1: # BB#0:
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: sitofp_8i16_to_2f64:
; AVX2: # BB#0:
; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: sitofp_8i16_to_2f64:
; AVX512: # BB#0:
; AVX512-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX512-NEXT: vcvtdq2pd %ymm0, %zmm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%cvt = sitofp <8 x i16> %a to <8 x double>
%shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <2 x i32> <i32 0, i32 1>
ret <2 x double> %shuf
}
define <2 x double> @sitofp_2i8_to_2f64(<16 x i8> %a) {
; SSE-LABEL: sitofp_2i8_to_2f64:
; SSE: # BB#0:
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT: psrad $24, %xmm0
; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: sitofp_2i8_to_2f64:
; AVX: # BB#0:
; AVX-NEXT: vpmovsxbd %xmm0, %xmm0
; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
; AVX-NEXT: retq
%shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
%cvt = sitofp <2 x i8> %shuf to <2 x double>
ret <2 x double> %cvt
}
define <2 x double> @sitofp_16i8_to_2f64(<16 x i8> %a) {
; SSE-LABEL: sitofp_16i8_to_2f64:
; SSE: # BB#0:
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT: psrad $24, %xmm0
; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: sitofp_16i8_to_2f64:
; AVX1: # BB#0:
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: sitofp_16i8_to_2f64:
; AVX2: # BB#0:
; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0
; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: sitofp_16i8_to_2f64:
; AVX512: # BB#0:
; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512-NEXT: vcvtdq2pd %ymm0, %zmm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%cvt = sitofp <16 x i8> %a to <16 x double>
%shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <2 x i32> <i32 0, i32 1>
ret <2 x double> %shuf
}
define <4 x double> @sitofp_4i64_to_4f64(<4 x i64> %a) {
; SSE-LABEL: sitofp_4i64_to_4f64:
; SSE: # BB#0:
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: cvtsi2sdq %rax, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2sdq %rax, %xmm0
; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: cvtsi2sdq %rax, %xmm3
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2sdq %rax, %xmm0
; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0]
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: movaps %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: sitofp_4i64_to_4f64:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpextrq $1, %xmm1, %rax
; AVX1-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2
; AVX1-NEXT: vmovq %xmm1, %rax
; AVX1-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1
; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0
; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: sitofp_4i64_to_4f64:
; AVX2: # BB#0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpextrq $1, %xmm1, %rax
; AVX2-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2
; AVX2-NEXT: vmovq %xmm1, %rax
; AVX2-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1
; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
; AVX2-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0
; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: sitofp_4i64_to_4f64:
; AVX512F: # BB#0:
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpextrq $1, %xmm1, %rax
; AVX512F-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2
; AVX512F-NEXT: vmovq %xmm1, %rax
; AVX512F-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1
; AVX512F-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
; AVX512F-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0
; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: sitofp_4i64_to_4f64:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax
; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2
; AVX512VL-NEXT: vmovq %xmm1, %rax
; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1
; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0
; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: sitofp_4i64_to_4f64:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvtqq2pd %zmm0, %zmm0
; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: sitofp_4i64_to_4f64:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: vcvtqq2pd %ymm0, %ymm0
; AVX512VLDQ-NEXT: retq
%cvt = sitofp <4 x i64> %a to <4 x double>
ret <4 x double> %cvt
}
define <4 x double> @sitofp_4i32_to_4f64(<4 x i32> %a) {
; SSE-LABEL: sitofp_4i32_to_4f64:
; SSE: # BB#0:
; SSE-NEXT: cvtdq2pd %xmm0, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT: cvtdq2pd %xmm0, %xmm1
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: sitofp_4i32_to_4f64:
; AVX: # BB#0:
; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX-NEXT: retq
%cvt = sitofp <4 x i32> %a to <4 x double>
ret <4 x double> %cvt
}
define <4 x double> @sitofp_4i16_to_4f64(<8 x i16> %a) {
; SSE-LABEL: sitofp_4i16_to_4f64:
; SSE: # BB#0:
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: cvtdq2pd %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: sitofp_4i16_to_4f64:
; AVX: # BB#0:
; AVX-NEXT: vpmovsxwd %xmm0, %xmm0
; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX-NEXT: retq
%shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%cvt = sitofp <4 x i16> %shuf to <4 x double>
ret <4 x double> %cvt
}
define <4 x double> @sitofp_8i16_to_4f64(<8 x i16> %a) {
; SSE-LABEL: sitofp_8i16_to_4f64:
; SSE: # BB#0:
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: cvtdq2pd %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: sitofp_8i16_to_4f64:
; AVX1: # BB#0:
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: sitofp_8i16_to_4f64:
; AVX2: # BB#0:
; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: sitofp_8i16_to_4f64:
; AVX512: # BB#0:
; AVX512-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX512-NEXT: vcvtdq2pd %ymm0, %zmm0
; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512-NEXT: retq
%cvt = sitofp <8 x i16> %a to <8 x double>
%shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x double> %shuf
}
define <4 x double> @sitofp_4i8_to_4f64(<16 x i8> %a) {
; SSE-LABEL: sitofp_4i8_to_4f64:
; SSE: # BB#0:
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT: psrad $24, %xmm1
; SSE-NEXT: cvtdq2pd %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: sitofp_4i8_to_4f64:
; AVX: # BB#0:
; AVX-NEXT: vpmovsxbd %xmm0, %xmm0
; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX-NEXT: retq
%shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%cvt = sitofp <4 x i8> %shuf to <4 x double>
ret <4 x double> %cvt
}
define <4 x double> @sitofp_16i8_to_4f64(<16 x i8> %a) {
; SSE-LABEL: sitofp_16i8_to_4f64:
; SSE: # BB#0:
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT: psrad $24, %xmm1
; SSE-NEXT: cvtdq2pd %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: sitofp_16i8_to_4f64:
; AVX1: # BB#0:
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: sitofp_16i8_to_4f64:
; AVX2: # BB#0:
; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0
; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: sitofp_16i8_to_4f64:
; AVX512: # BB#0:
; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512-NEXT: vcvtdq2pd %ymm0, %zmm0
; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512-NEXT: retq
%cvt = sitofp <16 x i8> %a to <16 x double>
%shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x double> %shuf
}
;
; Unsigned Integer to Double
;
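; The SSE/VEX u64 -> f64 expansion below uses the classic magic-number trick:
; 1127219200 and 1160773632 are 0x43300000 and 0x45300000, the high words of
; the doubles 2^52 (4.503600e+15) and 2^84 (1.934281e+25). Interleaving the
; low/high 32-bit halves of each input element under those exponents and
; subtracting the same constants yields lo and hi*2^32 exactly; the final
; horizontal add recombines them into the full unsigned value.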
define <2 x double> @uitofp_2i64_to_2f64(<2 x i64> %a) {
; SSE-LABEL: uitofp_2i64_to_2f64:
; SSE: # BB#0:
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: movapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
; SSE-NEXT: subpd %xmm3, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
; SSE-NEXT: addpd %xmm4, %xmm0
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE-NEXT: subpd %xmm3, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
; SSE-NEXT: addpd %xmm2, %xmm1
; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: retq
;
; VEX-LABEL: uitofp_2i64_to_2f64:
; VEX: # BB#0:
; VEX-NEXT: vmovapd {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
; VEX-NEXT: vunpcklps {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; VEX-NEXT: vmovapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
; VEX-NEXT: vsubpd %xmm3, %xmm2, %xmm2
; VEX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
; VEX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; VEX-NEXT: vsubpd %xmm3, %xmm0, %xmm0
; VEX-NEXT: vhaddpd %xmm0, %xmm2, %xmm0
; VEX-NEXT: retq
;
; AVX512F-LABEL: uitofp_2i64_to_2f64:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm1, %xmm1
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm0
; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: uitofp_2i64_to_2f64:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm1, %xmm1
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm0
; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: uitofp_2i64_to_2f64:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvtuqq2pd %zmm0, %zmm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: uitofp_2i64_to_2f64:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: vcvtuqq2pd %xmm0, %xmm0
; AVX512VLDQ-NEXT: retq
%cvt = uitofp <2 x i64> %a to <2 x double>
ret <2 x double> %cvt
}
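; For u32 -> f64 the input is split into 16-bit halves: the low half (masked
; with 65535) and the high half (psrld $16) are each converted with the
; signed cvtdq2pd, which is exact since both fit in 16 bits; the high half
; is then scaled by the 6.553600e+04 (2^16) constant loaded from memory and
; the two partial results are added.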
define <2 x double> @uitofp_2i32_to_2f64(<4 x i32> %a) {
; SSE-LABEL: uitofp_2i32_to_2f64:
; SSE: # BB#0:
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
; SSE-NEXT: psrld $16, %xmm0
; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE-NEXT: mulpd {{.*}}(%rip), %xmm0
; SSE-NEXT: addpd %xmm1, %xmm0
; SSE-NEXT: retq
;
; VEX-LABEL: uitofp_2i32_to_2f64:
; VEX: # BB#0:
; VEX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; VEX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; VEX-NEXT: vcvtdq2pd %xmm1, %xmm1
; VEX-NEXT: vpsrld $16, %xmm0, %xmm0
; VEX-NEXT: vcvtdq2pd %xmm0, %xmm0
; VEX-NEXT: vmulpd {{.*}}(%rip), %xmm0, %xmm0
; VEX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; VEX-NEXT: retq
;
; AVX512F-LABEL: uitofp_2i32_to_2f64:
; AVX512F: # BB#0:
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: uitofp_2i32_to_2f64:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vcvtudq2pd %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: uitofp_2i32_to_2f64:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: uitofp_2i32_to_2f64:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: vcvtudq2pd %xmm0, %xmm0
; AVX512VLDQ-NEXT: retq
%shuf = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
%cvt = uitofp <2 x i32> %shuf to <2 x double>
ret <2 x double> %cvt
}
define <2 x double> @uitofp_4i32_to_2f64(<4 x i32> %a) {
; SSE-LABEL: uitofp_4i32_to_2f64:
; SSE: # BB#0:
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
; SSE-NEXT: psrld $16, %xmm0
; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE-NEXT: mulpd {{.*}}(%rip), %xmm0
; SSE-NEXT: addpd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_4i32_to_2f64:
; AVX1: # BB#0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX1-NEXT: vcvtdq2pd %xmm1, %ymm1
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT: vmulpd {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_4i32_to_2f64:
; AVX2: # BB#0:
; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT: vcvtdq2pd %xmm1, %ymm1
; AVX2-NEXT: vbroadcastsd {{.*}}(%rip), %ymm2
; AVX2-NEXT: vmulpd %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT: vaddpd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: uitofp_4i32_to_2f64:
; AVX512F: # BB#0:
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: uitofp_4i32_to_2f64:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vcvtudq2pd %xmm0, %ymm0
; AVX512VL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: uitofp_4i32_to_2f64:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: uitofp_4i32_to_2f64:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: vcvtudq2pd %xmm0, %ymm0
; AVX512VLDQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
%cvt = uitofp <4 x i32> %a to <4 x double>
%shuf = shufflevector <4 x double> %cvt, <4 x double> undef, <2 x i32> <i32 0, i32 1>
ret <2 x double> %shuf
}
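; Unsigned i16/i8 inputs are zero-extended instead: SSE2 interleaves with a
; zeroed register (pxor + punpcklwd/punpcklbw) while AVX uses pmovzx, and
; the signed cvtdq2pd is then exact because the widened values are
; non-negative.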
define <2 x double> @uitofp_2i16_to_2f64(<8 x i16> %a) {
; SSE-LABEL: uitofp_2i16_to_2f64:
; SSE: # BB#0:
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: uitofp_2i16_to_2f64:
; AVX: # BB#0:
; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
; AVX-NEXT: retq
%shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
%cvt = uitofp <2 x i16> %shuf to <2 x double>
ret <2 x double> %cvt
}
define <2 x double> @uitofp_8i16_to_2f64(<8 x i16> %a) {
; SSE-LABEL: uitofp_8i16_to_2f64:
; SSE: # BB#0:
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_8i16_to_2f64:
; AVX1: # BB#0:
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_8i16_to_2f64:
; AVX2: # BB#0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: uitofp_8i16_to_2f64:
; AVX512: # BB#0:
; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512-NEXT: vcvtdq2pd %ymm0, %zmm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%cvt = uitofp <8 x i16> %a to <8 x double>
%shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <2 x i32> <i32 0, i32 1>
ret <2 x double> %shuf
}
define <2 x double> @uitofp_2i8_to_2f64(<16 x i8> %a) {
; SSE-LABEL: uitofp_2i8_to_2f64:
; SSE: # BB#0:
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: uitofp_2i8_to_2f64:
; AVX: # BB#0:
; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
; AVX-NEXT: retq
%shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
%cvt = uitofp <2 x i8> %shuf to <2 x double>
ret <2 x double> %cvt
}
define <2 x double> @uitofp_16i8_to_2f64(<16 x i8> %a) {
; SSE-LABEL: uitofp_16i8_to_2f64:
; SSE: # BB#0:
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_16i8_to_2f64:
; AVX1: # BB#0:
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_16i8_to_2f64:
; AVX2: # BB#0:
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: uitofp_16i8_to_2f64:
; AVX512: # BB#0:
; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512-NEXT: vcvtdq2pd %ymm0, %zmm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%cvt = uitofp <16 x i8> %a to <16 x double>
%shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <2 x i32> <i32 0, i32 1>
ret <2 x double> %shuf
}
define <4 x double> @uitofp_4i64_to_4f64(<4 x i64> %a) {
; SSE-LABEL: uitofp_4i64_to_4f64:
; SSE: # BB#0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT: movapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
; SSE-NEXT: subpd %xmm4, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,0,1]
; SSE-NEXT: addpd %xmm5, %xmm0
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSE-NEXT: subpd %xmm4, %xmm3
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,3,0,1]
; SSE-NEXT: addpd %xmm3, %xmm5
; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm5[0]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE-NEXT: subpd %xmm4, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,0,1]
; SSE-NEXT: addpd %xmm5, %xmm1
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSE-NEXT: subpd %xmm4, %xmm3
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
; SSE-NEXT: addpd %xmm3, %xmm2
; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE-NEXT: retq
;
; VEX-LABEL: uitofp_4i64_to_4f64:
; VEX: # BB#0:
; VEX-NEXT: vextractf128 $1, %ymm0, %xmm1
; VEX-NEXT: vmovapd {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
; VEX-NEXT: vunpcklps {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; VEX-NEXT: vmovapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
; VEX-NEXT: vsubpd %xmm4, %xmm3, %xmm3
; VEX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,0,1]
; VEX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; VEX-NEXT: vsubpd %xmm4, %xmm1, %xmm1
; VEX-NEXT: vhaddpd %xmm1, %xmm3, %xmm1
; VEX-NEXT: vunpcklps {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; VEX-NEXT: vsubpd %xmm4, %xmm3, %xmm3
; VEX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
; VEX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; VEX-NEXT: vsubpd %xmm4, %xmm0, %xmm0
; VEX-NEXT: vhaddpd %xmm0, %xmm3, %xmm0
; VEX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; VEX-NEXT: retq
;
; AVX512F-LABEL: uitofp_4i64_to_4f64:
; AVX512F: # BB#0:
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpextrq $1, %xmm1, %rax
; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm2
; AVX512F-NEXT: vmovq %xmm1, %rax
; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm1
; AVX512F-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm2
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm0
; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: uitofp_4i64_to_4f64:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax
; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm2
; AVX512VL-NEXT: vmovq %xmm1, %rax
; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm1
; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm2
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm0
; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: uitofp_4i64_to_4f64:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvtuqq2pd %zmm0, %zmm0
; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: uitofp_4i64_to_4f64:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: vcvtuqq2pd %ymm0, %ymm0
; AVX512VLDQ-NEXT: retq
%cvt = uitofp <4 x i64> %a to <4 x double>
ret <4 x double> %cvt
}
define <4 x double> @uitofp_4i32_to_4f64(<4 x i32> %a) {
; SSE-LABEL: uitofp_4i32_to_4f64:
; SSE: # BB#0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrld $16, %xmm1
; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
; SSE-NEXT: movapd {{.*#+}} xmm2 = [6.553600e+04,6.553600e+04]
; SSE-NEXT: mulpd %xmm2, %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
; SSE-NEXT: pand %xmm3, %xmm0
; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE-NEXT: addpd %xmm1, %xmm0
; SSE-NEXT: movdqa %xmm4, %xmm1
; SSE-NEXT: psrld $16, %xmm1
; SSE-NEXT: cvtdq2pd %xmm1, %xmm5
; SSE-NEXT: mulpd %xmm2, %xmm5
; SSE-NEXT: pand %xmm3, %xmm4
; SSE-NEXT: cvtdq2pd %xmm4, %xmm1
; SSE-NEXT: addpd %xmm5, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_4i32_to_4f64:
; AVX1: # BB#0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX1-NEXT: vcvtdq2pd %xmm1, %ymm1
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT: vmulpd {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_4i32_to_4f64:
; AVX2: # BB#0:
; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT: vcvtdq2pd %xmm1, %ymm1
; AVX2-NEXT: vbroadcastsd {{.*}}(%rip), %ymm2
; AVX2-NEXT: vmulpd %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT: vaddpd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: uitofp_4i32_to_4f64:
; AVX512F: # BB#0:
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0
; AVX512F-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: uitofp_4i32_to_4f64:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vcvtudq2pd %xmm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: uitofp_4i32_to_4f64:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0
; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: uitofp_4i32_to_4f64:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: vcvtudq2pd %xmm0, %ymm0
; AVX512VLDQ-NEXT: retq
%cvt = uitofp <4 x i32> %a to <4 x double>
ret <4 x double> %cvt
}
define <4 x double> @uitofp_4i16_to_4f64(<8 x i16> %a) {
; SSE-LABEL: uitofp_4i16_to_4f64:
; SSE: # BB#0:
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: cvtdq2pd %xmm0, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT: cvtdq2pd %xmm0, %xmm1
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: uitofp_4i16_to_4f64:
; AVX: # BB#0:
; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX-NEXT: retq
%shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%cvt = uitofp <4 x i16> %shuf to <4 x double>
ret <4 x double> %cvt
}
define <4 x double> @uitofp_8i16_to_4f64(<8 x i16> %a) {
; SSE-LABEL: uitofp_8i16_to_4f64:
; SSE: # BB#0:
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: cvtdq2pd %xmm0, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT: cvtdq2pd %xmm0, %xmm1
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_8i16_to_4f64:
; AVX1: # BB#0:
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_8i16_to_4f64:
; AVX2: # BB#0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: uitofp_8i16_to_4f64:
; AVX512: # BB#0:
; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512-NEXT: vcvtdq2pd %ymm0, %zmm0
; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512-NEXT: retq
%cvt = uitofp <8 x i16> %a to <8 x double>
%shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x double> %shuf
}
define <4 x double> @uitofp_4i8_to_4f64(<16 x i8> %a) {
; SSE-LABEL: uitofp_4i8_to_4f64:
; SSE: # BB#0:
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: cvtdq2pd %xmm0, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT: cvtdq2pd %xmm0, %xmm1
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: uitofp_4i8_to_4f64:
; AVX: # BB#0:
; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX-NEXT: retq
%shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%cvt = uitofp <4 x i8> %shuf to <4 x double>
ret <4 x double> %cvt
}
define <4 x double> @uitofp_16i8_to_4f64(<16 x i8> %a) {
; SSE-LABEL: uitofp_16i8_to_4f64:
; SSE: # BB#0:
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: cvtdq2pd %xmm0, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT: cvtdq2pd %xmm0, %xmm1
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_16i8_to_4f64:
; AVX1: # BB#0:
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_16i8_to_4f64:
; AVX2: # BB#0:
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: uitofp_16i8_to_4f64:
; AVX512: # BB#0:
; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512-NEXT: vcvtdq2pd %ymm0, %zmm0
; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512-NEXT: retq
%cvt = uitofp <16 x i8> %a to <16 x double>
%shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x double> %shuf
}
;
; Signed Integer to Float
;
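; i64 -> f32 likewise goes element-by-element through scalar cvtsi2ssq when
; AVX512DQ is unavailable. Only the low two result lanes are defined by the
; IR here, so the exact placement of the upper lanes may differ between the
; check prefixes.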
define <4 x float> @sitofp_2i64_to_4f32(<2 x i64> %a) {
; SSE-LABEL: sitofp_2i64_to_4f32:
; SSE: # BB#0:
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: cvtsi2ssq %rax, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2ssq %rax, %xmm0
; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; VEX-LABEL: sitofp_2i64_to_4f32:
; VEX: # BB#0:
; VEX-NEXT: vpextrq $1, %xmm0, %rax
; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; VEX-NEXT: vmovq %xmm0, %rax
; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0
; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1
; VEX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; VEX-NEXT: retq
;
; AVX512F-LABEL: sitofp_2i64_to_4f32:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1
; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: sitofp_2i64_to_4f32:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1
; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: sitofp_2i64_to_4f32:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: sitofp_2i64_to_4f32:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: vcvtqq2ps %xmm0, %xmm0
; AVX512VLDQ-NEXT: retq
%cvt = sitofp <2 x i64> %a to <2 x float>
%ext = shufflevector <2 x float> %cvt, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
ret <4 x float> %ext
}
define <4 x float> @sitofp_2i64_to_4f32_zero(<2 x i64> %a) {
; SSE-LABEL: sitofp_2i64_to_4f32_zero:
; SSE: # BB#0:
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: cvtsi2ssq %rax, %xmm1
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2ssq %rax, %xmm0
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; SSE-NEXT: retq
;
; VEX-LABEL: sitofp_2i64_to_4f32_zero:
; VEX: # BB#0:
; VEX-NEXT: vpextrq $1, %xmm0, %rax
; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; VEX-NEXT: vmovq %xmm0, %rax
; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0
; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
; VEX-NEXT: retq
;
; AVX512F-LABEL: sitofp_2i64_to_4f32_zero:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: sitofp_2i64_to_4f32_zero:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: sitofp_2i64_to_4f32_zero:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: sitofp_2i64_to_4f32_zero:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: vcvtqq2ps %xmm0, %xmm0
; AVX512VLDQ-NEXT: retq
  %cvt = sitofp <2 x i64> %a to <2 x float>
  %ext = shufflevector <2 x float> %cvt, <2 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x float> %ext
}

define <4 x float> @sitofp_4i64_to_4f32_undef(<2 x i64> %a) {
; SSE-LABEL: sitofp_4i64_to_4f32_undef:
; SSE: # BB#0:
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: cvtsi2ssq %rax, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2ssq %rax, %xmm0
; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2ssq %rax, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,0]
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; VEX-LABEL: sitofp_4i64_to_4f32_undef:
; VEX: # BB#0:
; VEX-NEXT: vpextrq $1, %xmm0, %rax
; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; VEX-NEXT: vmovq %xmm0, %rax
; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0
; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1
; VEX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; VEX-NEXT: retq
;
; AVX512F-LABEL: sitofp_4i64_to_4f32_undef:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1
; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: sitofp_4i64_to_4f32_undef:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1
; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: sitofp_4i64_to_4f32_undef:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: sitofp_4i64_to_4f32_undef:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; AVX512VLDQ-NEXT: vcvtqq2ps %ymm0, %xmm0
; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
  %ext = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %cvt = sitofp <4 x i64> %ext to <4 x float>
  ret <4 x float> %cvt
}

define <4 x float> @sitofp_4i32_to_4f32(<4 x i32> %a) {
; SSE-LABEL: sitofp_4i32_to_4f32:
; SSE: # BB#0:
; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: sitofp_4i32_to_4f32:
; AVX: # BB#0:
; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
; AVX-NEXT: retq
  %cvt = sitofp <4 x i32> %a to <4 x float>
  ret <4 x float> %cvt
}

define <4 x float> @sitofp_4i16_to_4f32(<8 x i16> %a) {
; SSE-LABEL: sitofp_4i16_to_4f32:
; SSE: # BB#0:
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: sitofp_4i16_to_4f32:
; AVX: # BB#0:
; AVX-NEXT: vpmovsxwd %xmm0, %xmm0
; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
; AVX-NEXT: retq
  %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %cvt = sitofp <4 x i16> %shuf to <4 x float>
  ret <4 x float> %cvt
}

define <4 x float> @sitofp_8i16_to_4f32(<8 x i16> %a) {
; SSE-LABEL: sitofp_8i16_to_4f32:
; SSE: # BB#0:
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: sitofp_8i16_to_4f32:
; AVX1: # BB#0:
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: sitofp_8i16_to_4f32:
; AVX2: # BB#0:
; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: sitofp_8i16_to_4f32:
; AVX512: # BB#0:
; AVX512-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %cvt = sitofp <8 x i16> %a to <8 x float>
  %shuf = shufflevector <8 x float> %cvt, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x float> %shuf
}

define <4 x float> @sitofp_4i8_to_4f32(<16 x i8> %a) {
; SSE-LABEL: sitofp_4i8_to_4f32:
; SSE: # BB#0:
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT: psrad $24, %xmm0
; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: sitofp_4i8_to_4f32:
; AVX: # BB#0:
; AVX-NEXT: vpmovsxbd %xmm0, %xmm0
; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
; AVX-NEXT: retq
  %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %cvt = sitofp <4 x i8> %shuf to <4 x float>
  ret <4 x float> %cvt
}

define <4 x float> @sitofp_16i8_to_4f32(<16 x i8> %a) {
; SSE-LABEL: sitofp_16i8_to_4f32:
; SSE: # BB#0:
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT: psrad $24, %xmm0
; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: sitofp_16i8_to_4f32:
; AVX1: # BB#0:
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: sitofp_16i8_to_4f32:
; AVX2: # BB#0:
; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: sitofp_16i8_to_4f32:
; AVX512: # BB#0:
; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512-NEXT: vcvtdq2ps %zmm0, %zmm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %cvt = sitofp <16 x i8> %a to <16 x float>
  %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x float> %shuf
}

define <4 x float> @sitofp_4i64_to_4f32(<4 x i64> %a) {
; SSE-LABEL: sitofp_4i64_to_4f32:
; SSE: # BB#0:
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: cvtsi2ssq %rax, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: cvtsi2ssq %rax, %xmm1
; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: cvtsi2ssq %rax, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2ssq %rax, %xmm0
; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: sitofp_4i64_to_4f32:
; AVX1: # BB#0:
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: sitofp_4i64_to_4f32:
; AVX2: # BB#0:
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: sitofp_4i64_to_4f32:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: sitofp_4i64_to_4f32:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: sitofp_4i64_to_4f32:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: sitofp_4i64_to_4f32:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: vcvtqq2ps %ymm0, %xmm0
; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
  %cvt = sitofp <4 x i64> %a to <4 x float>
  ret <4 x float> %cvt
}

define <8 x float> @sitofp_8i32_to_8f32(<8 x i32> %a) {
; SSE-LABEL: sitofp_8i32_to_8f32:
; SSE: # BB#0:
; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; SSE-NEXT: cvtdq2ps %xmm1, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: sitofp_8i32_to_8f32:
; AVX: # BB#0:
; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX-NEXT: retq
  %cvt = sitofp <8 x i32> %a to <8 x float>
  ret <8 x float> %cvt
}

define <8 x float> @sitofp_8i16_to_8f32(<8 x i16> %a) {
; SSE-LABEL: sitofp_8i16_to_8f32:
; SSE: # BB#0:
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: cvtdq2ps %xmm1, %xmm2
; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: cvtdq2ps %xmm0, %xmm1
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: sitofp_8i16_to_8f32:
; AVX1: # BB#0:
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: sitofp_8i16_to_8f32:
; AVX2: # BB#0:
; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: sitofp_8i16_to_8f32:
; AVX512: # BB#0:
; AVX512-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX512-NEXT: retq
  %cvt = sitofp <8 x i16> %a to <8 x float>
  ret <8 x float> %cvt
}

define <8 x float> @sitofp_8i8_to_8f32(<16 x i8> %a) {
; SSE-LABEL: sitofp_8i8_to_8f32:
; SSE: # BB#0:
; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
; SSE-NEXT: psrad $24, %xmm1
; SSE-NEXT: cvtdq2ps %xmm1, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT: psrad $24, %xmm0
; SSE-NEXT: cvtdq2ps %xmm0, %xmm1
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: sitofp_8i8_to_8f32:
; AVX1: # BB#0:
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: sitofp_8i8_to_8f32:
; AVX2: # BB#0:
; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: sitofp_8i8_to_8f32:
; AVX512: # BB#0:
; AVX512-NEXT: vpmovsxbd %xmm0, %ymm0
; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX512-NEXT: retq
  %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %cvt = sitofp <8 x i8> %shuf to <8 x float>
  ret <8 x float> %cvt
}

define <8 x float> @sitofp_16i8_to_8f32(<16 x i8> %a) {
; SSE-LABEL: sitofp_16i8_to_8f32:
; SSE: # BB#0:
; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
; SSE-NEXT: psrad $24, %xmm1
; SSE-NEXT: cvtdq2ps %xmm1, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT: psrad $24, %xmm0
; SSE-NEXT: cvtdq2ps %xmm0, %xmm1
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: sitofp_16i8_to_8f32:
; AVX1: # BB#0:
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: sitofp_16i8_to_8f32:
; AVX2: # BB#0:
; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: sitofp_16i8_to_8f32:
; AVX512: # BB#0:
; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512-NEXT: vcvtdq2ps %zmm0, %zmm0
; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512-NEXT: retq
  %cvt = sitofp <16 x i8> %a to <16 x float>
  %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %shuf
}

;
; Unsigned Integer to Float
;

define <4 x float> @uitofp_2i64_to_4f32(<2 x i64> %a) {
; SSE-LABEL: uitofp_2i64_to_4f32:
; SSE: # BB#0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB39_1
; SSE-NEXT: # BB#2:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2ssq %rax, %xmm0
; SSE-NEXT: jmp .LBB39_3
; SSE-NEXT: .LBB39_1:
; SSE-NEXT: movq %rax, %rcx
; SSE-NEXT: shrq %rcx
; SSE-NEXT: andl $1, %eax
; SSE-NEXT: orq %rcx, %rax
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2ssq %rax, %xmm0
; SSE-NEXT: addss %xmm0, %xmm0
; SSE-NEXT: .LBB39_3:
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB39_4
; SSE-NEXT: # BB#5:
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: cvtsi2ssq %rax, %xmm1
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: retq
; SSE-NEXT: .LBB39_4:
; SSE-NEXT: movq %rax, %rcx
; SSE-NEXT: shrq %rcx
; SSE-NEXT: andl $1, %eax
; SSE-NEXT: orq %rcx, %rax
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: cvtsi2ssq %rax, %xmm1
; SSE-NEXT: addss %xmm1, %xmm1
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: retq
;
; VEX-LABEL: uitofp_2i64_to_4f32:
; VEX: # BB#0:
; VEX-NEXT: vpextrq $1, %xmm0, %rax
; VEX-NEXT: testq %rax, %rax
; VEX-NEXT: js .LBB39_1
; VEX-NEXT: # BB#2:
; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; VEX-NEXT: jmp .LBB39_3
; VEX-NEXT: .LBB39_1:
; VEX-NEXT: movq %rax, %rcx
; VEX-NEXT: shrq %rcx
; VEX-NEXT: andl $1, %eax
; VEX-NEXT: orq %rcx, %rax
; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; VEX-NEXT: vaddss %xmm1, %xmm1, %xmm1
; VEX-NEXT: .LBB39_3:
; VEX-NEXT: vmovq %xmm0, %rax
; VEX-NEXT: testq %rax, %rax
; VEX-NEXT: js .LBB39_4
; VEX-NEXT: # BB#5:
; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0
; VEX-NEXT: jmp .LBB39_6
; VEX-NEXT: .LBB39_4:
; VEX-NEXT: movq %rax, %rcx
; VEX-NEXT: shrq %rcx
; VEX-NEXT: andl $1, %eax
; VEX-NEXT: orq %rcx, %rax
; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0
; VEX-NEXT: vaddss %xmm0, %xmm0, %xmm0
; VEX-NEXT: .LBB39_6:
; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; VEX-NEXT: testq %rax, %rax
; VEX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; VEX-NEXT: js .LBB39_8
; VEX-NEXT: # BB#7:
; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1
; VEX-NEXT: .LBB39_8:
; VEX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; VEX-NEXT: retq
;
; AVX512F-LABEL: uitofp_2i64_to_4f32:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm0
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm1
; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: uitofp_2i64_to_4f32:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm0
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm1
; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: uitofp_2i64_to_4f32:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: uitofp_2i64_to_4f32:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: vcvtuqq2ps %xmm0, %xmm0
; AVX512VLDQ-NEXT: retq
  %cvt = uitofp <2 x i64> %a to <2 x float>
  %ext = shufflevector <2 x float> %cvt, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  ret <4 x float> %ext
}

define <4 x float> @uitofp_2i64_to_2f32(<2 x i64> %a) {
; SSE-LABEL: uitofp_2i64_to_2f32:
; SSE: # BB#0:
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB40_1
; SSE-NEXT: # BB#2:
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: cvtsi2ssq %rax, %xmm1
; SSE-NEXT: jmp .LBB40_3
; SSE-NEXT: .LBB40_1:
; SSE-NEXT: movq %rax, %rcx
; SSE-NEXT: shrq %rcx
; SSE-NEXT: andl $1, %eax
; SSE-NEXT: orq %rcx, %rax
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: cvtsi2ssq %rax, %xmm1
; SSE-NEXT: addss %xmm1, %xmm1
; SSE-NEXT: .LBB40_3:
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB40_4
; SSE-NEXT: # BB#5:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2ssq %rax, %xmm0
; SSE-NEXT: jmp .LBB40_6
; SSE-NEXT: .LBB40_4:
; SSE-NEXT: movq %rax, %rcx
; SSE-NEXT: shrq %rcx
; SSE-NEXT: andl $1, %eax
; SSE-NEXT: orq %rcx, %rax
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2ssq %rax, %xmm0
; SSE-NEXT: addss %xmm0, %xmm0
; SSE-NEXT: .LBB40_6:
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; SSE-NEXT: retq
;
; VEX-LABEL: uitofp_2i64_to_2f32:
; VEX: # BB#0:
; VEX-NEXT: vpextrq $1, %xmm0, %rax
; VEX-NEXT: testq %rax, %rax
; VEX-NEXT: js .LBB40_1
; VEX-NEXT: # BB#2:
; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; VEX-NEXT: jmp .LBB40_3
; VEX-NEXT: .LBB40_1:
; VEX-NEXT: movq %rax, %rcx
; VEX-NEXT: shrq %rcx
; VEX-NEXT: andl $1, %eax
; VEX-NEXT: orq %rcx, %rax
; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; VEX-NEXT: vaddss %xmm1, %xmm1, %xmm1
; VEX-NEXT: .LBB40_3:
; VEX-NEXT: vmovq %xmm0, %rax
; VEX-NEXT: testq %rax, %rax
; VEX-NEXT: js .LBB40_4
; VEX-NEXT: # BB#5:
; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0
; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
; VEX-NEXT: retq
; VEX-NEXT: .LBB40_4:
; VEX-NEXT: movq %rax, %rcx
; VEX-NEXT: shrq %rcx
; VEX-NEXT: andl $1, %eax
; VEX-NEXT: orq %rcx, %rax
; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0
; VEX-NEXT: vaddss %xmm0, %xmm0, %xmm0
; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
; VEX-NEXT: retq
;
; AVX512F-LABEL: uitofp_2i64_to_2f32:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm0
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: uitofp_2i64_to_2f32:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm0
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: uitofp_2i64_to_2f32:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: uitofp_2i64_to_2f32:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: vcvtuqq2ps %xmm0, %xmm0
; AVX512VLDQ-NEXT: retq
  %cvt = uitofp <2 x i64> %a to <2 x float>
  %ext = shufflevector <2 x float> %cvt, <2 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x float> %ext
}

define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) {
; SSE-LABEL: uitofp_4i64_to_4f32_undef:
; SSE: # BB#0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB41_1
; SSE-NEXT: # BB#2:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2ssq %rax, %xmm0
; SSE-NEXT: jmp .LBB41_3
; SSE-NEXT: .LBB41_1:
; SSE-NEXT: movq %rax, %rcx
; SSE-NEXT: shrq %rcx
; SSE-NEXT: andl $1, %eax
; SSE-NEXT: orq %rcx, %rax
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2ssq %rax, %xmm0
; SSE-NEXT: addss %xmm0, %xmm0
; SSE-NEXT: .LBB41_3:
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB41_4
; SSE-NEXT: # BB#5:
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: cvtsi2ssq %rax, %xmm1
; SSE-NEXT: jmp .LBB41_6
; SSE-NEXT: .LBB41_4:
; SSE-NEXT: movq %rax, %rcx
; SSE-NEXT: shrq %rcx
; SSE-NEXT: andl $1, %eax
; SSE-NEXT: orq %rcx, %rax
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: cvtsi2ssq %rax, %xmm1
; SSE-NEXT: addss %xmm1, %xmm1
; SSE-NEXT: .LBB41_6:
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: js .LBB41_8
; SSE-NEXT: # BB#7:
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: cvtsi2ssq %rax, %xmm1
; SSE-NEXT: .LBB41_8:
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; SSE-NEXT: retq
;
; VEX-LABEL: uitofp_4i64_to_4f32_undef:
; VEX: # BB#0:
; VEX-NEXT: vpextrq $1, %xmm0, %rax
; VEX-NEXT: testq %rax, %rax
; VEX-NEXT: js .LBB41_1
; VEX-NEXT: # BB#2:
; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; VEX-NEXT: jmp .LBB41_3
; VEX-NEXT: .LBB41_1:
; VEX-NEXT: movq %rax, %rcx
; VEX-NEXT: shrq %rcx
; VEX-NEXT: andl $1, %eax
; VEX-NEXT: orq %rcx, %rax
; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; VEX-NEXT: vaddss %xmm1, %xmm1, %xmm1
; VEX-NEXT: .LBB41_3:
; VEX-NEXT: vmovq %xmm0, %rax
; VEX-NEXT: testq %rax, %rax
; VEX-NEXT: js .LBB41_4
; VEX-NEXT: # BB#5:
; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0
; VEX-NEXT: jmp .LBB41_6
; VEX-NEXT: .LBB41_4:
; VEX-NEXT: movq %rax, %rcx
; VEX-NEXT: shrq %rcx
; VEX-NEXT: andl $1, %eax
; VEX-NEXT: orq %rcx, %rax
; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0
; VEX-NEXT: vaddss %xmm0, %xmm0, %xmm0
; VEX-NEXT: .LBB41_6:
; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; VEX-NEXT: testq %rax, %rax
; VEX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; VEX-NEXT: js .LBB41_8
; VEX-NEXT: # BB#7:
; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1
; VEX-NEXT: .LBB41_8:
; VEX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; VEX-NEXT: retq
;
; AVX512F-LABEL: uitofp_4i64_to_4f32_undef:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm0
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm1
; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: uitofp_4i64_to_4f32_undef:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm0
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm1
; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: uitofp_4i64_to_4f32_undef:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: uitofp_4i64_to_4f32_undef:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; AVX512VLDQ-NEXT: vcvtuqq2ps %ymm0, %xmm0
; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
  %ext = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %cvt = uitofp <4 x i64> %ext to <4 x float>
  ret <4 x float> %cvt
}

define <4 x float> @uitofp_4i32_to_4f32(<4 x i32> %a) {
; SSE-LABEL: uitofp_4i32_to_4f32:
; SSE: # BB#0:
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: por {{.*}}(%rip), %xmm1
; SSE-NEXT: psrld $16, %xmm0
; SSE-NEXT: por {{.*}}(%rip), %xmm0
; SSE-NEXT: addps {{.*}}(%rip), %xmm0
; SSE-NEXT: addps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_4i32_to_4f32:
; AVX1: # BB#0:
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
; AVX1-NEXT: vaddps {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vaddps %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_4i32_to_4f32:
; AVX2: # BB#0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1258291200,1258291200,1258291200,1258291200]
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1392508928,1392508928,1392508928,1392508928]
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %xmm2
; AVX2-NEXT: vaddps %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vaddps %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: uitofp_4i32_to_4f32:
; AVX512F: # BB#0:
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: uitofp_4i32_to_4f32:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vcvtudq2ps %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: uitofp_4i32_to_4f32:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvtudq2ps %zmm0, %zmm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: uitofp_4i32_to_4f32:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: vcvtudq2ps %xmm0, %xmm0
; AVX512VLDQ-NEXT: retq
  %cvt = uitofp <4 x i32> %a to <4 x float>
  ret <4 x float> %cvt
}

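; NOTE (editor): a sketch of the magic-constant trick checked above, not
; autogenerated. 1258291200 = 0x4B000000 is the float 2^23 with an all-zero
; mantissa, so OR'ing the low 16 bits of x into it yields exactly
; 2^23 + (x & 0xffff); 1392508928 = 0x53000000 is 2^39, whose mantissa steps
; are 2^16, so OR'ing in (x >> 16) yields 2^39 + (x >> 16)*2^16. Adding the
; correction constant -(2^39 + 2^23) = -5.49764202496e+11 to the high part
; and then adding the low part reconstructs x with a single rounding step.
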
define <4 x float> @uitofp_4i16_to_4f32(<8 x i16> %a) {
; SSE-LABEL: uitofp_4i16_to_4f32:
; SSE: # BB#0:
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: uitofp_4i16_to_4f32:
; AVX: # BB#0:
; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
; AVX-NEXT: retq
  %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %cvt = uitofp <4 x i16> %shuf to <4 x float>
  ret <4 x float> %cvt
}

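; NOTE (editor): not autogenerated. Unsigned i16 (and i8) values always fit
; in the non-negative range of i32, so these lowerings simply zero-extend
; and reuse the signed cvtdq2ps; the conversion is exact and needs no
; unsigned fixup. The same applies to the 8i16/4i8/16i8 cases below.
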
define <4 x float> @uitofp_8i16_to_4f32(<8 x i16> %a) {
; SSE-LABEL: uitofp_8i16_to_4f32:
; SSE: # BB#0:
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_8i16_to_4f32:
; AVX1: # BB#0:
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_8i16_to_4f32:
; AVX2: # BB#0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: uitofp_8i16_to_4f32:
; AVX512: # BB#0:
; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %cvt = uitofp <8 x i16> %a to <8 x float>
  %shuf = shufflevector <8 x float> %cvt, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x float> %shuf
}

define <4 x float> @uitofp_4i8_to_4f32(<16 x i8> %a) {
; SSE-LABEL: uitofp_4i8_to_4f32:
; SSE: # BB#0:
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: uitofp_4i8_to_4f32:
; AVX: # BB#0:
; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
; AVX-NEXT: retq
  %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %cvt = uitofp <4 x i8> %shuf to <4 x float>
  ret <4 x float> %cvt
}

define <4 x float> @uitofp_16i8_to_4f32(<16 x i8> %a) {
; SSE-LABEL: uitofp_16i8_to_4f32:
; SSE: # BB#0:
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_16i8_to_4f32:
; AVX1: # BB#0:
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_16i8_to_4f32:
; AVX2: # BB#0:
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: uitofp_16i8_to_4f32:
; AVX512: # BB#0:
; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512-NEXT: vcvtdq2ps %zmm0, %zmm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %cvt = uitofp <16 x i8> %a to <16 x float>
  %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x float> %shuf
}

define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) {
; SSE-LABEL: uitofp_4i64_to_4f32:
; SSE: # BB#0:
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB47_1
; SSE-NEXT: # BB#2:
; SSE-NEXT: cvtsi2ssq %rax, %xmm2
; SSE-NEXT: jmp .LBB47_3
; SSE-NEXT: .LBB47_1:
; SSE-NEXT: movq %rax, %rcx
; SSE-NEXT: shrq %rcx
; SSE-NEXT: andl $1, %eax
; SSE-NEXT: orq %rcx, %rax
; SSE-NEXT: cvtsi2ssq %rax, %xmm2
; SSE-NEXT: addss %xmm2, %xmm2
; SSE-NEXT: .LBB47_3:
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB47_4
; SSE-NEXT: # BB#5:
; SSE-NEXT: cvtsi2ssq %rax, %xmm3
; SSE-NEXT: jmp .LBB47_6
; SSE-NEXT: .LBB47_4:
; SSE-NEXT: movq %rax, %rcx
; SSE-NEXT: shrq %rcx
; SSE-NEXT: andl $1, %eax
; SSE-NEXT: orq %rcx, %rax
; SSE-NEXT: cvtsi2ssq %rax, %xmm3
; SSE-NEXT: addss %xmm3, %xmm3
; SSE-NEXT: .LBB47_6:
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB47_7
; SSE-NEXT: # BB#8:
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: cvtsi2ssq %rax, %xmm1
; SSE-NEXT: jmp .LBB47_9
; SSE-NEXT: .LBB47_7:
; SSE-NEXT: movq %rax, %rcx
; SSE-NEXT: shrq %rcx
; SSE-NEXT: andl $1, %eax
; SSE-NEXT: orq %rcx, %rax
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: cvtsi2ssq %rax, %xmm1
; SSE-NEXT: addss %xmm1, %xmm1
; SSE-NEXT: .LBB47_9:
; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB47_10
; SSE-NEXT: # BB#11:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2ssq %rax, %xmm0
; SSE-NEXT: jmp .LBB47_12
; SSE-NEXT: .LBB47_10:
; SSE-NEXT: movq %rax, %rcx
; SSE-NEXT: shrq %rcx
; SSE-NEXT: andl $1, %eax
; SSE-NEXT: orq %rcx, %rax
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2ssq %rax, %xmm0
; SSE-NEXT: addss %xmm0, %xmm0
; SSE-NEXT: .LBB47_12:
; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_4i64_to_4f32:
; AVX1: # BB#0:
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB47_1
; AVX1-NEXT: # BB#2:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX1-NEXT: jmp .LBB47_3
; AVX1-NEXT: .LBB47_1:
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: shrq %rcx
; AVX1-NEXT: andl $1, %eax
; AVX1-NEXT: orq %rcx, %rax
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1
; AVX1-NEXT: .LBB47_3:
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB47_4
; AVX1-NEXT: # BB#5:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; AVX1-NEXT: jmp .LBB47_6
; AVX1-NEXT: .LBB47_4:
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: shrq %rcx
; AVX1-NEXT: andl $1, %eax
; AVX1-NEXT: orq %rcx, %rax
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2
; AVX1-NEXT: .LBB47_6:
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB47_7
; AVX1-NEXT: # BB#8:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
; AVX1-NEXT: jmp .LBB47_9
; AVX1-NEXT: .LBB47_7:
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: shrq %rcx
; AVX1-NEXT: andl $1, %eax
; AVX1-NEXT: orq %rcx, %rax
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2
; AVX1-NEXT: .LBB47_9:
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB47_10
; AVX1-NEXT: # BB#11:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
; AVX1-NEXT: .LBB47_10:
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: shrq %rcx
; AVX1-NEXT: andl $1, %eax
; AVX1-NEXT: orq %rcx, %rax
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_4i64_to_4f32:
; AVX2: # BB#0:
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB47_1
; AVX2-NEXT: # BB#2:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX2-NEXT: jmp .LBB47_3
; AVX2-NEXT: .LBB47_1:
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: shrq %rcx
; AVX2-NEXT: andl $1, %eax
; AVX2-NEXT: orq %rcx, %rax
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX2-NEXT: vaddss %xmm1, %xmm1, %xmm1
; AVX2-NEXT: .LBB47_3:
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB47_4
; AVX2-NEXT: # BB#5:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; AVX2-NEXT: jmp .LBB47_6
; AVX2-NEXT: .LBB47_4:
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: shrq %rcx
; AVX2-NEXT: andl $1, %eax
; AVX2-NEXT: orq %rcx, %rax
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2
; AVX2-NEXT: .LBB47_6:
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB47_7
; AVX2-NEXT: # BB#8:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
; AVX2-NEXT: jmp .LBB47_9
; AVX2-NEXT: .LBB47_7:
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: shrq %rcx
; AVX2-NEXT: andl $1, %eax
; AVX2-NEXT: orq %rcx, %rax
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2
; AVX2-NEXT: .LBB47_9:
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB47_10
; AVX2-NEXT: # BB#11:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
; AVX2-NEXT: .LBB47_10:
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: shrq %rcx
; AVX2-NEXT: andl $1, %eax
; AVX2-NEXT: orq %rcx, %rax
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: uitofp_4i64_to_4f32:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2
; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2
; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm0
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: uitofp_4i64_to_4f32:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm0
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: uitofp_4i64_to_4f32:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: uitofp_4i64_to_4f32:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: vcvtuqq2ps %ymm0, %xmm0
; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
  %cvt = uitofp <4 x i64> %a to <4 x float>
  ret <4 x float> %cvt
}

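; NOTE (editor): a sketch of the branchy scalar lowering above, not
; autogenerated. A non-negative i64 bit pattern converts directly with
; cvtsi2ssq. When the sign bit is set (the unsigned value is >= 2^63), the
; input is halved with its low bit kept sticky ((x >> 1) | (x & 1)),
; converted, and doubled with addss; keeping the shifted-out bit OR'd into
; bit 0 preserves the round-to-nearest-even result of converting x itself.
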
define <8 x float> @uitofp_8i32_to_8f32(<8 x i32> %a) {
; SSE-LABEL: uitofp_8i32_to_8f32:
; SSE: # BB#0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: pand %xmm2, %xmm3
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [1258291200,1258291200,1258291200,1258291200]
; SSE-NEXT: por %xmm4, %xmm3
; SSE-NEXT: psrld $16, %xmm0
; SSE-NEXT: movdqa {{.*#+}} xmm5 = [1392508928,1392508928,1392508928,1392508928]
; SSE-NEXT: por %xmm5, %xmm0
; SSE-NEXT: movaps {{.*#+}} xmm6 = [-5.497642e+11,-5.497642e+11,-5.497642e+11,-5.497642e+11]
; SSE-NEXT: addps %xmm6, %xmm0
; SSE-NEXT: addps %xmm3, %xmm0
; SSE-NEXT: pand %xmm1, %xmm2
; SSE-NEXT: por %xmm4, %xmm2
; SSE-NEXT: psrld $16, %xmm1
; SSE-NEXT: por %xmm5, %xmm1
; SSE-NEXT: addps %xmm6, %xmm1
; SSE-NEXT: addps %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_8i32_to_8f32:
; AVX1: # BB#0:
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1
; AVX1-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1
; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT: vaddps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_8i32_to_8f32:
; AVX2: # BB#0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200]
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %ymm2
; AVX2-NEXT: vaddps %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vaddps %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: uitofp_8i32_to_8f32:
; AVX512F: # BB#0:
; AVX512F-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0
; AVX512F-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: uitofp_8i32_to_8f32:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vcvtudq2ps %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: uitofp_8i32_to_8f32:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvtudq2ps %zmm0, %zmm0
; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: uitofp_8i32_to_8f32:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: vcvtudq2ps %ymm0, %ymm0
; AVX512VLDQ-NEXT: retq
  %cvt = uitofp <8 x i32> %a to <8 x float>
  ret <8 x float> %cvt
}

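; NOTE (editor): not autogenerated. The AVX1 form above converts the two
; 16-bit halves separately: (x >> 16) is below 2^16, so the signed
; vcvtdq2ps is safe; that result is scaled by the vmulps memory constant
; (presumably 65536.0 = 2^16) and added to the converted low half
; (x & 0xffff). Both partial conversions are exact in f32, so only the
; final vaddps rounds.
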
define <8 x float> @uitofp_8i16_to_8f32(<8 x i16> %a) {
; SSE-LABEL: uitofp_8i16_to_8f32:
; SSE: # BB#0:
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE-NEXT: cvtdq2ps %xmm2, %xmm2
; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT: cvtdq2ps %xmm0, %xmm1
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_8i16_to_8f32:
; AVX1: # BB#0:
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_8i16_to_8f32:
; AVX2: # BB#0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: uitofp_8i16_to_8f32:
; AVX512: # BB#0:
; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX512-NEXT: retq
  %cvt = uitofp <8 x i16> %a to <8 x float>
  ret <8 x float> %cvt
}

define <8 x float> @uitofp_8i8_to_8f32(<16 x i8> %a) {
; SSE-LABEL: uitofp_8i8_to_8f32:
; SSE: # BB#0:
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE-NEXT: cvtdq2ps %xmm2, %xmm2
; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT: cvtdq2ps %xmm0, %xmm1
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_8i8_to_8f32:
; AVX1: # BB#0:
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_8i8_to_8f32:
; AVX2: # BB#0:
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: uitofp_8i8_to_8f32:
; AVX512: # BB#0:
; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX512-NEXT: retq
  %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %cvt = uitofp <8 x i8> %shuf to <8 x float>
  ret <8 x float> %cvt
}

define <8 x float> @uitofp_16i8_to_8f32(<16 x i8> %a) {
; SSE-LABEL: uitofp_16i8_to_8f32:
; SSE: # BB#0:
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE-NEXT: cvtdq2ps %xmm2, %xmm2
; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT: cvtdq2ps %xmm0, %xmm1
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_16i8_to_8f32:
; AVX1: # BB#0:
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_16i8_to_8f32:
; AVX2: # BB#0:
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: uitofp_16i8_to_8f32:
; AVX512: # BB#0:
; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512-NEXT: vcvtdq2ps %zmm0, %zmm0
; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512-NEXT: retq
  %cvt = uitofp <16 x i8> %a to <16 x float>
  %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %shuf
}

;
; Load Signed Integer to Double
;

define <2 x double> @sitofp_load_2i64_to_2f64(<2 x i64> *%a) {
; SSE-LABEL: sitofp_load_2i64_to_2f64:
; SSE: # BB#0:
; SSE-NEXT: movdqa (%rdi), %xmm1
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: cvtsi2sdq %rax, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: cvtsi2sdq %rax, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: retq
;
; VEX-LABEL: sitofp_load_2i64_to_2f64:
; VEX: # BB#0:
; VEX-NEXT: vmovdqa (%rdi), %xmm0
; VEX-NEXT: vpextrq $1, %xmm0, %rax
; VEX-NEXT: vcvtsi2sdq %rax, %xmm1, %xmm1
; VEX-NEXT: vmovq %xmm0, %rax
; VEX-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm0
; VEX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; VEX-NEXT: retq
;
; AVX512F-LABEL: sitofp_load_2i64_to_2f64:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
; AVX512F-NEXT: vcvtsi2sdq %rax, %xmm1, %xmm1
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm0
; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: sitofp_load_2i64_to_2f64:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm1, %xmm1
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm0
; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: sitofp_load_2i64_to_2f64:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0
; AVX512DQ-NEXT: vcvtqq2pd %zmm0, %zmm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: sitofp_load_2i64_to_2f64:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: vcvtqq2pd (%rdi), %xmm0
; AVX512VLDQ-NEXT: retq
  %ld = load <2 x i64>, <2 x i64> *%a
  %cvt = sitofp <2 x i64> %ld to <2 x double>
  ret <2 x double> %cvt
}

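; NOTE (editor): not autogenerated. Before AVX512DQ there is no packed
; i64 -> f64 conversion, so the pre-DQ lowerings move each element to a GPR
; and use scalar cvtsi2sdq; the DQ targets collapse to a single vcvtqq2pd.
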
define <2 x double> @sitofp_load_2i32_to_2f64(<2 x i32> *%a) {
; SSE-LABEL: sitofp_load_2i32_to_2f64:
; SSE: # BB#0:
; SSE-NEXT: cvtdq2pd (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: sitofp_load_2i32_to_2f64:
; AVX: # BB#0:
; AVX-NEXT: vcvtdq2pd (%rdi), %xmm0
; AVX-NEXT: retq
  %ld = load <2 x i32>, <2 x i32> *%a
  %cvt = sitofp <2 x i32> %ld to <2 x double>
  ret <2 x double> %cvt
}

define <2 x double> @sitofp_load_2i16_to_2f64(<2 x i16> *%a) {
; SSE-LABEL: sitofp_load_2i16_to_2f64:
; SSE: # BB#0:
; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: sitofp_load_2i16_to_2f64:
; AVX: # BB#0:
; AVX-NEXT: vpmovsxwq (%rdi), %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
; AVX-NEXT: retq
  %ld = load <2 x i16>, <2 x i16> *%a
  %cvt = sitofp <2 x i16> %ld to <2 x double>
  ret <2 x double> %cvt
}

define <2 x double> @sitofp_load_2i8_to_2f64(<2 x i8> *%a) {
; SSE-LABEL: sitofp_load_2i8_to_2f64:
; SSE: # BB#0:
; SSE-NEXT: movzwl (%rdi), %eax
; SSE-NEXT: movd %eax, %xmm0
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT: psrad $24, %xmm0
; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: sitofp_load_2i8_to_2f64:
; AVX: # BB#0:
; AVX-NEXT: vpmovsxbq (%rdi), %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
; AVX-NEXT: retq
  %ld = load <2 x i8>, <2 x i8> *%a
  %cvt = sitofp <2 x i8> %ld to <2 x double>
  ret <2 x double> %cvt
}

define <4 x double> @sitofp_load_4i64_to_4f64(<4 x i64> *%a) {
; SSE-LABEL: sitofp_load_4i64_to_4f64:
; SSE: # BB#0:
; SSE-NEXT: movdqa (%rdi), %xmm1
; SSE-NEXT: movdqa 16(%rdi), %xmm2
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: cvtsi2sdq %rax, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: cvtsi2sdq %rax, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: movq %xmm2, %rax
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: cvtsi2sdq %rax, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; SSE-NEXT: movq %xmm2, %rax
; SSE-NEXT: xorps %xmm2, %xmm2
; SSE-NEXT: cvtsi2sdq %rax, %xmm2
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE-NEXT: retq
;
; AVX1-LABEL: sitofp_load_4i64_to_4f64:
; AVX1: # BB#0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpextrq $1, %xmm1, %rax
; AVX1-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2
; AVX1-NEXT: vmovq %xmm1, %rax
; AVX1-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1
; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0
; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: sitofp_load_4i64_to_4f64:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpextrq $1, %xmm1, %rax
; AVX2-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2
; AVX2-NEXT: vmovq %xmm1, %rax
; AVX2-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1
; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
; AVX2-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0
; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: sitofp_load_4i64_to_4f64:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpextrq $1, %xmm1, %rax
; AVX512F-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2
; AVX512F-NEXT: vmovq %xmm1, %rax
; AVX512F-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1
; AVX512F-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
; AVX512F-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0
; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: sitofp_load_4i64_to_4f64:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax
; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2
; AVX512VL-NEXT: vmovq %xmm1, %rax
; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1
; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0
; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: sitofp_load_4i64_to_4f64:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: vmovaps (%rdi), %ymm0
; AVX512DQ-NEXT: vcvtqq2pd %zmm0, %zmm0
; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: sitofp_load_4i64_to_4f64:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: vcvtqq2pd (%rdi), %ymm0
; AVX512VLDQ-NEXT: retq
  %ld = load <4 x i64>, <4 x i64> *%a
  %cvt = sitofp <4 x i64> %ld to <4 x double>
  ret <4 x double> %cvt
}

define <4 x double> @sitofp_load_4i32_to_4f64(<4 x i32> *%a) {
; SSE-LABEL: sitofp_load_4i32_to_4f64:
; SSE: # BB#0:
; SSE-NEXT: movdqa (%rdi), %xmm1
; SSE-NEXT: cvtdq2pd %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: sitofp_load_4i32_to_4f64:
; AVX: # BB#0:
; AVX-NEXT: vcvtdq2pd (%rdi), %ymm0
; AVX-NEXT: retq
  %ld = load <4 x i32>, <4 x i32> *%a
  %cvt = sitofp <4 x i32> %ld to <4 x double>
  ret <4 x double> %cvt
}

define <4 x double> @sitofp_load_4i16_to_4f64(<4 x i16> *%a) {
; SSE-LABEL: sitofp_load_4i16_to_4f64:
; SSE: # BB#0:
; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: cvtdq2pd %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: sitofp_load_4i16_to_4f64:
; AVX: # BB#0:
; AVX-NEXT: vpmovsxwd (%rdi), %xmm0
; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX-NEXT: retq
  %ld = load <4 x i16>, <4 x i16> *%a
  %cvt = sitofp <4 x i16> %ld to <4 x double>
  ret <4 x double> %cvt
}

define <4 x double> @sitofp_load_4i8_to_4f64(<4 x i8> *%a) {
; SSE-LABEL: sitofp_load_4i8_to_4f64:
; SSE: # BB#0:
; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT: psrad $24, %xmm1
; SSE-NEXT: cvtdq2pd %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: sitofp_load_4i8_to_4f64:
; AVX: # BB#0:
; AVX-NEXT: vpmovsxbd (%rdi), %xmm0
; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX-NEXT: retq
  %ld = load <4 x i8>, <4 x i8> *%a
  %cvt = sitofp <4 x i8> %ld to <4 x double>
  ret <4 x double> %cvt
}

;
; Load Unsigned Integer to Double
;

define <2 x double> @uitofp_load_2i64_to_2f64(<2 x i64> *%a) {
; SSE-LABEL: uitofp_load_2i64_to_2f64:
; SSE: # BB#0:
; SSE-NEXT: movdqa (%rdi), %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE-NEXT: movapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
; SSE-NEXT: subpd %xmm4, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE-NEXT: addpd %xmm1, %xmm0
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSE-NEXT: subpd %xmm4, %xmm3
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
; SSE-NEXT: addpd %xmm3, %xmm1
; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: retq
;
; VEX-LABEL: uitofp_load_2i64_to_2f64:
; VEX: # BB#0:
; VEX-NEXT: vmovapd (%rdi), %xmm0
; VEX-NEXT: vmovapd {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
; VEX-NEXT: vunpcklps {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; VEX-NEXT: vmovapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
; VEX-NEXT: vsubpd %xmm3, %xmm2, %xmm2
; VEX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
; VEX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; VEX-NEXT: vsubpd %xmm3, %xmm0, %xmm0
; VEX-NEXT: vhaddpd %xmm0, %xmm2, %xmm0
; VEX-NEXT: retq
;
; AVX512F-LABEL: uitofp_load_2i64_to_2f64:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm1, %xmm1
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm0
; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: uitofp_load_2i64_to_2f64:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm1, %xmm1
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm0
; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: uitofp_load_2i64_to_2f64:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0
; AVX512DQ-NEXT: vcvtuqq2pd %zmm0, %zmm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: uitofp_load_2i64_to_2f64:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: vcvtuqq2pd (%rdi), %xmm0
; AVX512VLDQ-NEXT: retq
  %ld = load <2 x i64>, <2 x i64> *%a
  %cvt = uitofp <2 x i64> %ld to <2 x double>
  ret <2 x double> %cvt
}

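; NOTE: With no unsigned i32 conversion available, u32 -> f64 is done by
; splitting each lane into its low and high 16-bit halves (pand / psrld $16),
; converting both halves with the signed cvtdq2pd, scaling the high half by
; 65536, and adding the two results.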
define <2 x double> @uitofp_load_2i32_to_2f64(<2 x i32> *%a) {
; SSE-LABEL: uitofp_load_2i32_to_2f64:
; SSE: # BB#0:
; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
; SSE-NEXT: psrld $16, %xmm0
; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE-NEXT: mulpd {{.*}}(%rip), %xmm0
; SSE-NEXT: addpd %xmm1, %xmm0
; SSE-NEXT: retq
;
; VEX-LABEL: uitofp_load_2i32_to_2f64:
; VEX: # BB#0:
; VEX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; VEX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; VEX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; VEX-NEXT: vcvtdq2pd %xmm1, %xmm1
; VEX-NEXT: vpsrld $16, %xmm0, %xmm0
; VEX-NEXT: vcvtdq2pd %xmm0, %xmm0
; VEX-NEXT: vmulpd {{.*}}(%rip), %xmm0, %xmm0
; VEX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; VEX-NEXT: retq
;
; AVX512F-LABEL: uitofp_load_2i32_to_2f64:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: uitofp_load_2i32_to_2f64:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vcvtudq2pd (%rdi), %xmm0
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: uitofp_load_2i32_to_2f64:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: uitofp_load_2i32_to_2f64:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: vcvtudq2pd (%rdi), %xmm0
; AVX512VLDQ-NEXT: retq
  %ld = load <2 x i32>, <2 x i32> *%a
  %cvt = uitofp <2 x i32> %ld to <2 x double>
  ret <2 x double> %cvt
}

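; NOTE: u16 and u8 sources always fit in a non-negative i32, so a plain
; zero-extension (punpckl* against zero, or vpmovzx*) followed by the signed
; cvtdq2pd conversion is already exact.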
define <2 x double> @uitofp_load_2i16_to_2f64(<2 x i16> *%a) {
; SSE-LABEL: uitofp_load_2i16_to_2f64:
; SSE: # BB#0:
; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: uitofp_load_2i16_to_2f64:
; AVX: # BB#0:
; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
; AVX-NEXT: retq
  %ld = load <2 x i16>, <2 x i16> *%a
  %cvt = uitofp <2 x i16> %ld to <2 x double>
  ret <2 x double> %cvt
}

define <2 x double> @uitofp_load_2i8_to_2f64(<2 x i8> *%a) {
; SSE-LABEL: uitofp_load_2i8_to_2f64:
; SSE: # BB#0:
; SSE-NEXT: movzwl (%rdi), %eax
; SSE-NEXT: movd %eax, %xmm0
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: uitofp_load_2i8_to_2f64:
; AVX: # BB#0:
; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
; AVX-NEXT: retq
  %ld = load <2 x i8>, <2 x i8> *%a
  %cvt = uitofp <2 x i8> %ld to <2 x double>
  ret <2 x double> %cvt
}

define <4 x double> @uitofp_load_4i64_to_4f64(<4 x i64> *%a) {
; SSE-LABEL: uitofp_load_4i64_to_4f64:
; SSE: # BB#0:
; SSE-NEXT: movdqa (%rdi), %xmm1
; SSE-NEXT: movdqa 16(%rdi), %xmm2
; SSE-NEXT: movdqa {{.*#+}} xmm3 = [1127219200,1160773632,0,0]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE-NEXT: movapd {{.*#+}} xmm5 = [4.503600e+15,1.934281e+25]
; SSE-NEXT: subpd %xmm5, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE-NEXT: addpd %xmm1, %xmm0
; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSE-NEXT: subpd %xmm5, %xmm4
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,0,1]
; SSE-NEXT: addpd %xmm4, %xmm1
; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT: subpd %xmm5, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
; SSE-NEXT: addpd %xmm2, %xmm1
; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSE-NEXT: subpd %xmm5, %xmm4
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,0,1]
; SSE-NEXT: addpd %xmm4, %xmm2
; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE-NEXT: retq
;
; VEX-LABEL: uitofp_load_4i64_to_4f64:
; VEX: # BB#0:
; VEX-NEXT: vmovapd (%rdi), %ymm0
; VEX-NEXT: vextractf128 $1, %ymm0, %xmm1
; VEX-NEXT: vmovapd {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
; VEX-NEXT: vunpcklps {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; VEX-NEXT: vmovapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
; VEX-NEXT: vsubpd %xmm4, %xmm3, %xmm3
; VEX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,0,1]
; VEX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; VEX-NEXT: vsubpd %xmm4, %xmm1, %xmm1
; VEX-NEXT: vhaddpd %xmm1, %xmm3, %xmm1
; VEX-NEXT: vunpcklps {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; VEX-NEXT: vsubpd %xmm4, %xmm3, %xmm3
; VEX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
; VEX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; VEX-NEXT: vsubpd %xmm4, %xmm0, %xmm0
; VEX-NEXT: vhaddpd %xmm0, %xmm3, %xmm0
; VEX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; VEX-NEXT: retq
;
; AVX512F-LABEL: uitofp_load_4i64_to_4f64:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpextrq $1, %xmm1, %rax
; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm2
; AVX512F-NEXT: vmovq %xmm1, %rax
; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm1
; AVX512F-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm2
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm0
; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: uitofp_load_4i64_to_4f64:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax
; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm2
; AVX512VL-NEXT: vmovq %xmm1, %rax
; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm1
; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm2
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm0
; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: uitofp_load_4i64_to_4f64:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: vmovaps (%rdi), %ymm0
; AVX512DQ-NEXT: vcvtuqq2pd %zmm0, %zmm0
; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: uitofp_load_4i64_to_4f64:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: vcvtuqq2pd (%rdi), %ymm0
; AVX512VLDQ-NEXT: retq
  %ld = load <4 x i64>, <4 x i64> *%a
  %cvt = uitofp <4 x i64> %ld to <4 x double>
  ret <4 x double> %cvt
}

define <4 x double> @uitofp_load_4i32_to_4f64(<4 x i32> *%a) {
; SSE-LABEL: uitofp_load_4i32_to_4f64:
; SSE: # BB#0:
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrld $16, %xmm1
; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
; SSE-NEXT: movapd {{.*#+}} xmm2 = [6.553600e+04,6.553600e+04]
; SSE-NEXT: mulpd %xmm2, %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
; SSE-NEXT: pand %xmm3, %xmm0
; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE-NEXT: addpd %xmm1, %xmm0
; SSE-NEXT: movdqa %xmm4, %xmm1
; SSE-NEXT: psrld $16, %xmm1
; SSE-NEXT: cvtdq2pd %xmm1, %xmm5
; SSE-NEXT: mulpd %xmm2, %xmm5
; SSE-NEXT: pand %xmm3, %xmm4
; SSE-NEXT: cvtdq2pd %xmm4, %xmm1
; SSE-NEXT: addpd %xmm5, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_load_4i32_to_4f64:
; AVX1: # BB#0:
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX1-NEXT: vcvtdq2pd %xmm1, %ymm1
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT: vmulpd {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_load_4i32_to_4f64:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT: vcvtdq2pd %xmm1, %ymm1
; AVX2-NEXT: vbroadcastsd {{.*}}(%rip), %ymm2
; AVX2-NEXT: vmulpd %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT: vaddpd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: uitofp_load_4i32_to_4f64:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovaps (%rdi), %xmm0
; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0
; AVX512F-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: uitofp_load_4i32_to_4f64:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vcvtudq2pd (%rdi), %ymm0
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: uitofp_load_4i32_to_4f64:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0
; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0
; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: uitofp_load_4i32_to_4f64:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: vcvtudq2pd (%rdi), %ymm0
; AVX512VLDQ-NEXT: retq
  %ld = load <4 x i32>, <4 x i32> *%a
  %cvt = uitofp <4 x i32> %ld to <4 x double>
  ret <4 x double> %cvt
}

define <4 x double> @uitofp_load_4i16_to_4f64(<4 x i16> *%a) {
; SSE-LABEL: uitofp_load_4i16_to_4f64:
; SSE: # BB#0:
; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT: pxor %xmm0, %xmm0
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT: cvtdq2pd %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: uitofp_load_4i16_to_4f64:
; AVX: # BB#0:
; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX-NEXT: retq
  %ld = load <4 x i16>, <4 x i16> *%a
  %cvt = uitofp <4 x i16> %ld to <4 x double>
  ret <4 x double> %cvt
}

define <4 x double> @uitofp_load_4i8_to_4f64(<4 x i8> *%a) {
; SSE-LABEL: uitofp_load_4i8_to_4f64:
; SSE: # BB#0:
; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE-NEXT: pxor %xmm0, %xmm0
; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT: cvtdq2pd %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: uitofp_load_4i8_to_4f64:
; AVX: # BB#0:
; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX-NEXT: retq
  %ld = load <4 x i8>, <4 x i8> *%a
  %cvt = uitofp <4 x i8> %ld to <4 x double>
  ret <4 x double> %cvt
}

;
; Load Signed Integer to Float
;

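; NOTE: Targets without AVX512DQ have no packed i64 -> fp conversion, so each
; i64 lane is moved to a GPR and converted with the scalar cvtsi2ssq, then the
; scalars are reassembled with unpcklps/insertps; AVX512DQ collapses this to a
; single vcvtqq2ps.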
define <4 x float> @sitofp_load_4i64_to_4f32(<4 x i64> *%a) {
; SSE-LABEL: sitofp_load_4i64_to_4f32:
; SSE: # BB#0:
; SSE-NEXT: movdqa (%rdi), %xmm1
; SSE-NEXT: movdqa 16(%rdi), %xmm0
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: cvtsi2ssq %rax, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2ssq %rax, %xmm0
; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2ssq %rax, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: cvtsi2ssq %rax, %xmm1
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE-NEXT: retq
;
; AVX1-LABEL: sitofp_load_4i64_to_4f32:
; AVX1: # BB#0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: sitofp_load_4i64_to_4f32:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: sitofp_load_4i64_to_4f32:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: sitofp_load_4i64_to_4f32:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: sitofp_load_4i64_to_4f32:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: vmovaps (%rdi), %ymm0
; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: sitofp_load_4i64_to_4f32:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: vcvtqq2psy (%rdi), %xmm0
; AVX512VLDQ-NEXT: retq
  %ld = load <4 x i64>, <4 x i64> *%a
  %cvt = sitofp <4 x i64> %ld to <4 x float>
  ret <4 x float> %cvt
}

define <4 x float> @sitofp_load_4i32_to_4f32(<4 x i32> *%a) {
; SSE-LABEL: sitofp_load_4i32_to_4f32:
; SSE: # BB#0:
; SSE-NEXT: cvtdq2ps (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: sitofp_load_4i32_to_4f32:
; AVX: # BB#0:
; AVX-NEXT: vcvtdq2ps (%rdi), %xmm0
; AVX-NEXT: retq
  %ld = load <4 x i32>, <4 x i32> *%a
  %cvt = sitofp <4 x i32> %ld to <4 x float>
  ret <4 x float> %cvt
}

define <4 x float> @sitofp_load_4i16_to_4f32(<4 x i16> *%a) {
; SSE-LABEL: sitofp_load_4i16_to_4f32:
; SSE: # BB#0:
; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: sitofp_load_4i16_to_4f32:
; AVX: # BB#0:
; AVX-NEXT: vpmovsxwd (%rdi), %xmm0
; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
; AVX-NEXT: retq
  %ld = load <4 x i16>, <4 x i16> *%a
  %cvt = sitofp <4 x i16> %ld to <4 x float>
  ret <4 x float> %cvt
}

define <4 x float> @sitofp_load_4i8_to_4f32(<4 x i8> *%a) {
; SSE-LABEL: sitofp_load_4i8_to_4f32:
; SSE: # BB#0:
; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT: psrad $24, %xmm0
; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: sitofp_load_4i8_to_4f32:
; AVX: # BB#0:
; AVX-NEXT: vpmovsxbd (%rdi), %xmm0
; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
; AVX-NEXT: retq
  %ld = load <4 x i8>, <4 x i8> *%a
  %cvt = sitofp <4 x i8> %ld to <4 x float>
  ret <4 x float> %cvt
}

define <8 x float> @sitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; SSE-LABEL: sitofp_load_8i64_to_8f32:
; SSE: # BB#0:
; SSE-NEXT: movdqa (%rdi), %xmm1
; SSE-NEXT: movdqa 16(%rdi), %xmm0
; SSE-NEXT: movdqa 32(%rdi), %xmm2
; SSE-NEXT: movdqa 48(%rdi), %xmm3
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: cvtsi2ssq %rax, %xmm4
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2ssq %rax, %xmm0
; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2ssq %rax, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: cvtsi2ssq %rax, %xmm1
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; SSE-NEXT: movq %xmm3, %rax
; SSE-NEXT: xorps %xmm4, %xmm4
; SSE-NEXT: cvtsi2ssq %rax, %xmm4
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: cvtsi2ssq %rax, %xmm1
; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE-NEXT: movq %xmm2, %rax
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: cvtsi2ssq %rax, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; SSE-NEXT: movq %xmm2, %rax
; SSE-NEXT: xorps %xmm2, %xmm2
; SSE-NEXT: cvtsi2ssq %rax, %xmm2
; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0]
; SSE-NEXT: retq
;
; AVX1-LABEL: sitofp_load_8i64_to_8f32:
; AVX1: # BB#0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX1-NEXT: vpextrq $1, %xmm1, %rax
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; AVX1-NEXT: vmovq %xmm1, %rax
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3
; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vmovq %xmm1, %rax
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3
; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
; AVX1-NEXT: vpextrq $1, %xmm1, %rax
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm1
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0]
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3
; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3
; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm0
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: sitofp_load_8i64_to_8f32:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX2-NEXT: vpextrq $1, %xmm1, %rax
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; AVX2-NEXT: vmovq %xmm1, %rax
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3
; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX2-NEXT: vmovq %xmm1, %rax
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3
; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
; AVX2-NEXT: vpextrq $1, %xmm1, %rax
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm1
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0]
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3
; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3
; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm0
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: sitofp_load_8i64_to_8f32:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm1
; AVX512F-NEXT: vpextrq $1, %xmm1, %rax
; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; AVX512F-NEXT: vmovq %xmm1, %rax
; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm1
; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
; AVX512F-NEXT: vextracti32x4 $3, %zmm0, %xmm2
; AVX512F-NEXT: vmovq %xmm2, %rax
; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3
; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
; AVX512F-NEXT: vpextrq $1, %xmm2, %rax
; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2
; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3
; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3
; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm0
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: sitofp_load_8i64_to_8f32:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512VL-NEXT: vextracti32x4 $2, %zmm0, %xmm1
; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax
; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; AVX512VL-NEXT: vmovq %xmm1, %rax
; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm1
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
; AVX512VL-NEXT: vextracti32x4 $3, %zmm0, %xmm2
; AVX512VL-NEXT: vmovq %xmm2, %rax
; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
; AVX512VL-NEXT: vpextrq $1, %xmm2, %rax
; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm0
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: sitofp_load_8i64_to_8f32:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: vcvtqq2ps (%rdi), %ymm0
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: sitofp_load_8i64_to_8f32:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: vcvtqq2ps (%rdi), %ymm0
; AVX512VLDQ-NEXT: retq
  %ld = load <8 x i64>, <8 x i64> *%a
  %cvt = sitofp <8 x i64> %ld to <8 x float>
  ret <8 x float> %cvt
}

define <8 x float> @sitofp_load_8i32_to_8f32(<8 x i32> *%a) {
; SSE-LABEL: sitofp_load_8i32_to_8f32:
; SSE: # BB#0:
; SSE-NEXT: cvtdq2ps (%rdi), %xmm0
; SSE-NEXT: cvtdq2ps 16(%rdi), %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: sitofp_load_8i32_to_8f32:
; AVX: # BB#0:
; AVX-NEXT: vcvtdq2ps (%rdi), %ymm0
; AVX-NEXT: retq
  %ld = load <8 x i32>, <8 x i32> *%a
  %cvt = sitofp <8 x i32> %ld to <8 x float>
  ret <8 x float> %cvt
}

define <8 x float> @sitofp_load_8i16_to_8f32(<8 x i16> *%a) {
; SSE-LABEL: sitofp_load_8i16_to_8f32:
; SSE: # BB#0:
; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: cvtdq2ps %xmm1, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: sitofp_load_8i16_to_8f32:
; AVX1: # BB#0:
; AVX1-NEXT: vpmovsxwd (%rdi), %xmm0
; AVX1-NEXT: vpmovsxwd 8(%rdi), %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: sitofp_load_8i16_to_8f32:
; AVX2: # BB#0:
; AVX2-NEXT: vpmovsxwd (%rdi), %ymm0
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: sitofp_load_8i16_to_8f32:
; AVX512: # BB#0:
; AVX512-NEXT: vpmovsxwd (%rdi), %ymm0
; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX512-NEXT: retq
  %ld = load <8 x i16>, <8 x i16> *%a
  %cvt = sitofp <8 x i16> %ld to <8 x float>
  ret <8 x float> %cvt
}

define <8 x float> @sitofp_load_8i8_to_8f32(<8 x i8> *%a) {
; SSE-LABEL: sitofp_load_8i8_to_8f32:
; SSE: # BB#0:
; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT: psrad $24, %xmm0
; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
; SSE-NEXT: psrad $24, %xmm1
; SSE-NEXT: cvtdq2ps %xmm1, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: sitofp_load_8i8_to_8f32:
; AVX1: # BB#0:
; AVX1-NEXT: vpmovsxbw (%rdi), %xmm0
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: sitofp_load_8i8_to_8f32:
; AVX2: # BB#0:
; AVX2-NEXT: vpmovsxbd (%rdi), %ymm0
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: sitofp_load_8i8_to_8f32:
; AVX512: # BB#0:
; AVX512-NEXT: vpmovsxbd (%rdi), %ymm0
; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX512-NEXT: retq
  %ld = load <8 x i8>, <8 x i8> *%a
  %cvt = sitofp <8 x i8> %ld to <8 x float>
  ret <8 x float> %cvt
}

;
; Load Unsigned Integer to Float
;

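; NOTE: u64 -> f32 needs sign-bit handling on top of the signed conversion:
; when the top bit of a lane is set, the value is halved while preserving the
; rounding (sticky) bit (shrq / andl $1 / orq), converted with cvtsi2ssq, and
; then doubled again with addss.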
define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) {
; SSE-LABEL: uitofp_load_4i64_to_4f32:
; SSE: # BB#0:
; SSE-NEXT: movdqa (%rdi), %xmm2
; SSE-NEXT: movdqa 16(%rdi), %xmm0
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB76_1
; SSE-NEXT: # BB#2:
; SSE-NEXT: cvtsi2ssq %rax, %xmm1
; SSE-NEXT: jmp .LBB76_3
; SSE-NEXT: .LBB76_1:
; SSE-NEXT: movq %rax, %rcx
; SSE-NEXT: shrq %rcx
; SSE-NEXT: andl $1, %eax
; SSE-NEXT: orq %rcx, %rax
; SSE-NEXT: cvtsi2ssq %rax, %xmm1
; SSE-NEXT: addss %xmm1, %xmm1
; SSE-NEXT: .LBB76_3:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB76_4
; SSE-NEXT: # BB#5:
; SSE-NEXT: cvtsi2ssq %rax, %xmm3
; SSE-NEXT: jmp .LBB76_6
; SSE-NEXT: .LBB76_4:
; SSE-NEXT: movq %rax, %rcx
; SSE-NEXT: shrq %rcx
; SSE-NEXT: andl $1, %eax
; SSE-NEXT: orq %rcx, %rax
; SSE-NEXT: cvtsi2ssq %rax, %xmm3
; SSE-NEXT: addss %xmm3, %xmm3
; SSE-NEXT: .LBB76_6:
; SSE-NEXT: movq %xmm2, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB76_7
; SSE-NEXT: # BB#8:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2ssq %rax, %xmm0
; SSE-NEXT: jmp .LBB76_9
; SSE-NEXT: .LBB76_7:
; SSE-NEXT: movq %rax, %rcx
; SSE-NEXT: shrq %rcx
; SSE-NEXT: andl $1, %eax
; SSE-NEXT: orq %rcx, %rax
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2ssq %rax, %xmm0
; SSE-NEXT: addss %xmm0, %xmm0
; SSE-NEXT: .LBB76_9:
; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; SSE-NEXT: movq %xmm2, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB76_10
; SSE-NEXT: # BB#11:
; SSE-NEXT: xorps %xmm2, %xmm2
; SSE-NEXT: cvtsi2ssq %rax, %xmm2
; SSE-NEXT: jmp .LBB76_12
; SSE-NEXT: .LBB76_10:
; SSE-NEXT: movq %rax, %rcx
; SSE-NEXT: shrq %rcx
; SSE-NEXT: andl $1, %eax
; SSE-NEXT: orq %rcx, %rax
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; SSE-NEXT: xorps %xmm2, %xmm2
|
|
|
|
; SSE-NEXT: cvtsi2ssq %rax, %xmm2
|
|
|
|
; SSE-NEXT: addss %xmm2, %xmm2
|
2016-11-24 05:19:57 +08:00
|
|
|
; SSE-NEXT: .LBB76_12:
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
|
2017-09-18 12:40:58 +08:00
|
|
|
; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
2016-05-24 21:07:23 +08:00
|
|
|
; SSE-NEXT: retq
|
|
|
|
;
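; NOTE: Without AVX512 each unsigned i64 element goes through a branchy
; sequence: testq checks the sign bit, and values >= 2^63 are halved with the
; shifted-out bit folded back in (shrq/andl $1/orq, i.e. round-to-odd) so the
; signed cvtsi2ssq can be used, after which an addss doubles the result back.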
; AVX1-LABEL: uitofp_load_4i64_to_4f32:
; AVX1: # BB#0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB76_1
; AVX1-NEXT: # BB#2:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX1-NEXT: jmp .LBB76_3
; AVX1-NEXT: .LBB76_1:
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: shrq %rcx
; AVX1-NEXT: andl $1, %eax
; AVX1-NEXT: orq %rcx, %rax
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1
; AVX1-NEXT: .LBB76_3:
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB76_4
; AVX1-NEXT: # BB#5:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; AVX1-NEXT: jmp .LBB76_6
; AVX1-NEXT: .LBB76_4:
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: shrq %rcx
; AVX1-NEXT: andl $1, %eax
; AVX1-NEXT: orq %rcx, %rax
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2
; AVX1-NEXT: .LBB76_6:
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB76_7
; AVX1-NEXT: # BB#8:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
; AVX1-NEXT: jmp .LBB76_9
; AVX1-NEXT: .LBB76_7:
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: shrq %rcx
; AVX1-NEXT: andl $1, %eax
; AVX1-NEXT: orq %rcx, %rax
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2
; AVX1-NEXT: .LBB76_9:
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB76_10
; AVX1-NEXT: # BB#11:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
; AVX1-NEXT: .LBB76_10:
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: shrq %rcx
; AVX1-NEXT: andl $1, %eax
; AVX1-NEXT: orq %rcx, %rax
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_load_4i64_to_4f32:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB76_1
; AVX2-NEXT: # BB#2:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX2-NEXT: jmp .LBB76_3
; AVX2-NEXT: .LBB76_1:
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: shrq %rcx
; AVX2-NEXT: andl $1, %eax
; AVX2-NEXT: orq %rcx, %rax
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX2-NEXT: vaddss %xmm1, %xmm1, %xmm1
; AVX2-NEXT: .LBB76_3:
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB76_4
; AVX2-NEXT: # BB#5:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; AVX2-NEXT: jmp .LBB76_6
; AVX2-NEXT: .LBB76_4:
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: shrq %rcx
; AVX2-NEXT: andl $1, %eax
; AVX2-NEXT: orq %rcx, %rax
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2
; AVX2-NEXT: .LBB76_6:
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB76_7
; AVX2-NEXT: # BB#8:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
; AVX2-NEXT: jmp .LBB76_9
; AVX2-NEXT: .LBB76_7:
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: shrq %rcx
; AVX2-NEXT: andl $1, %eax
; AVX2-NEXT: orq %rcx, %rax
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2
; AVX2-NEXT: .LBB76_9:
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB76_10
; AVX2-NEXT: # BB#11:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
; AVX2-NEXT: .LBB76_10:
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: shrq %rcx
; AVX2-NEXT: andl $1, %eax
; AVX2-NEXT: orq %rcx, %rax
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
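; NOTE: AVX512F provides vcvtusi2ssq, which removes the sign test below, but
; without DQ the conversion is still performed one element at a time.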
; AVX512F-LABEL: uitofp_load_4i64_to_4f32:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2
; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2
; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm0
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: uitofp_load_4i64_to_4f32:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm0
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
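; NOTE: With AVX512DQ the whole vector is converted by a single vcvtuqq2ps;
; adding VL as well lets the load fold into the conversion.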
; AVX512DQ-LABEL: uitofp_load_4i64_to_4f32:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: vmovaps (%rdi), %ymm0
; AVX512DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: uitofp_load_4i64_to_4f32:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: vcvtuqq2psy (%rdi), %xmm0
; AVX512VLDQ-NEXT: retq
  %ld = load <4 x i64>, <4 x i64> *%a
  %cvt = uitofp <4 x i64> %ld to <4 x float>
  ret <4 x float> %cvt
}

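; Unsigned i32 -> f32 below uses the branchless bias trick: the input is split
; into 16-bit halves, each half is OR'd into the mantissa of a float bias
; constant (0x4B000000 = 1258291200 and 0x53000000 = 1392508928, visible in
; the AVX2 broadcasts), and a final addps cancels the biases and recombines
; the halves.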
define <4 x float> @uitofp_load_4i32_to_4f32(<4 x i32> *%a) {
; SSE-LABEL: uitofp_load_4i32_to_4f32:
; SSE: # BB#0:
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: por {{.*}}(%rip), %xmm1
; SSE-NEXT: psrld $16, %xmm0
; SSE-NEXT: por {{.*}}(%rip), %xmm0
; SSE-NEXT: addps {{.*}}(%rip), %xmm0
; SSE-NEXT: addps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_load_4i32_to_4f32:
; AVX1: # BB#0:
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
; AVX1-NEXT: vaddps {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vaddps %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_load_4i32_to_4f32:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1258291200,1258291200,1258291200,1258291200]
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1392508928,1392508928,1392508928,1392508928]
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %xmm2
; AVX2-NEXT: vaddps %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vaddps %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: uitofp_load_4i32_to_4f32:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovaps (%rdi), %xmm0
; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: uitofp_load_4i32_to_4f32:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vcvtudq2ps (%rdi), %xmm0
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: uitofp_load_4i32_to_4f32:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0
; AVX512DQ-NEXT: vcvtudq2ps %zmm0, %zmm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: uitofp_load_4i32_to_4f32:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: vcvtudq2ps (%rdi), %xmm0
; AVX512VLDQ-NEXT: retq
  %ld = load <4 x i32>, <4 x i32> *%a
  %cvt = uitofp <4 x i32> %ld to <4 x float>
  ret <4 x float> %cvt
}

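; i16 and i8 values always fit in the non-negative range of i32, so the
; unsigned conversions below simply zero-extend and reuse the signed
; cvtdq2ps.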
define <4 x float> @uitofp_load_4i16_to_4f32(<4 x i16> *%a) {
; SSE-LABEL: uitofp_load_4i16_to_4f32:
; SSE: # BB#0:
; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: uitofp_load_4i16_to_4f32:
; AVX: # BB#0:
; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
; AVX-NEXT: retq
  %ld = load <4 x i16>, <4 x i16> *%a
  %cvt = uitofp <4 x i16> %ld to <4 x float>
  ret <4 x float> %cvt
}

define <4 x float> @uitofp_load_4i8_to_4f32(<4 x i8> *%a) {
; SSE-LABEL: uitofp_load_4i8_to_4f32:
; SSE: # BB#0:
; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: uitofp_load_4i8_to_4f32:
; AVX: # BB#0:
; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
; AVX-NEXT: retq
  %ld = load <4 x i8>, <4 x i8> *%a
  %cvt = uitofp <4 x i8> %ld to <4 x float>
  ret <4 x float> %cvt
}

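; The 8 x i64 case repeats the per-element sign-test sequence eight times and
; reassembles the two 4-element halves with unpcklps/movlhps (SSE) or
; vinsertps/vinsertf128 (AVX).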
define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; SSE-LABEL: uitofp_load_8i64_to_8f32:
; SSE: # BB#0:
; SSE-NEXT: movdqa (%rdi), %xmm5
; SSE-NEXT: movdqa 16(%rdi), %xmm0
; SSE-NEXT: movdqa 32(%rdi), %xmm2
; SSE-NEXT: movdqa 48(%rdi), %xmm1
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB80_1
; SSE-NEXT: # BB#2:
; SSE-NEXT: cvtsi2ssq %rax, %xmm3
; SSE-NEXT: jmp .LBB80_3
; SSE-NEXT: .LBB80_1:
; SSE-NEXT: movq %rax, %rcx
; SSE-NEXT: shrq %rcx
; SSE-NEXT: andl $1, %eax
; SSE-NEXT: orq %rcx, %rax
; SSE-NEXT: cvtsi2ssq %rax, %xmm3
; SSE-NEXT: addss %xmm3, %xmm3
; SSE-NEXT: .LBB80_3:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB80_4
; SSE-NEXT: # BB#5:
; SSE-NEXT: cvtsi2ssq %rax, %xmm4
; SSE-NEXT: jmp .LBB80_6
; SSE-NEXT: .LBB80_4:
; SSE-NEXT: movq %rax, %rcx
; SSE-NEXT: shrq %rcx
; SSE-NEXT: andl $1, %eax
; SSE-NEXT: orq %rcx, %rax
; SSE-NEXT: cvtsi2ssq %rax, %xmm4
; SSE-NEXT: addss %xmm4, %xmm4
; SSE-NEXT: .LBB80_6:
; SSE-NEXT: movq %xmm5, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB80_7
; SSE-NEXT: # BB#8:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2ssq %rax, %xmm0
; SSE-NEXT: jmp .LBB80_9
; SSE-NEXT: .LBB80_7:
; SSE-NEXT: movq %rax, %rcx
; SSE-NEXT: shrq %rcx
; SSE-NEXT: andl $1, %eax
; SSE-NEXT: orq %rcx, %rax
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2ssq %rax, %xmm0
; SSE-NEXT: addss %xmm0, %xmm0
; SSE-NEXT: .LBB80_9:
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
; SSE-NEXT: movq %xmm5, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB80_10
; SSE-NEXT: # BB#11:
; SSE-NEXT: cvtsi2ssq %rax, %xmm6
; SSE-NEXT: jmp .LBB80_12
; SSE-NEXT: .LBB80_10:
; SSE-NEXT: movq %rax, %rcx
; SSE-NEXT: shrq %rcx
; SSE-NEXT: andl $1, %eax
; SSE-NEXT: orq %rcx, %rax
; SSE-NEXT: cvtsi2ssq %rax, %xmm6
; SSE-NEXT: addss %xmm6, %xmm6
; SSE-NEXT: .LBB80_12:
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB80_13
; SSE-NEXT: # BB#14:
; SSE-NEXT: xorps %xmm5, %xmm5
; SSE-NEXT: cvtsi2ssq %rax, %xmm5
; SSE-NEXT: jmp .LBB80_15
; SSE-NEXT: .LBB80_13:
; SSE-NEXT: movq %rax, %rcx
; SSE-NEXT: shrq %rcx
; SSE-NEXT: andl $1, %eax
; SSE-NEXT: orq %rcx, %rax
; SSE-NEXT: xorps %xmm5, %xmm5
; SSE-NEXT: cvtsi2ssq %rax, %xmm5
; SSE-NEXT: addss %xmm5, %xmm5
; SSE-NEXT: .LBB80_15:
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB80_16
; SSE-NEXT: # BB#17:
; SSE-NEXT: cvtsi2ssq %rax, %xmm7
; SSE-NEXT: jmp .LBB80_18
; SSE-NEXT: .LBB80_16:
; SSE-NEXT: movq %rax, %rcx
; SSE-NEXT: shrq %rcx
; SSE-NEXT: andl $1, %eax
; SSE-NEXT: orq %rcx, %rax
; SSE-NEXT: cvtsi2ssq %rax, %xmm7
; SSE-NEXT: addss %xmm7, %xmm7
; SSE-NEXT: .LBB80_18:
; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
; SSE-NEXT: movq %xmm2, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB80_19
; SSE-NEXT: # BB#20:
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: cvtsi2ssq %rax, %xmm1
; SSE-NEXT: jmp .LBB80_21
; SSE-NEXT: .LBB80_19:
; SSE-NEXT: movq %rax, %rcx
; SSE-NEXT: shrq %rcx
; SSE-NEXT: andl $1, %eax
; SSE-NEXT: orq %rcx, %rax
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: cvtsi2ssq %rax, %xmm1
; SSE-NEXT: addss %xmm1, %xmm1
; SSE-NEXT: .LBB80_21:
; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; SSE-NEXT: movq %xmm2, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB80_22
; SSE-NEXT: # BB#23:
; SSE-NEXT: xorps %xmm2, %xmm2
; SSE-NEXT: cvtsi2ssq %rax, %xmm2
; SSE-NEXT: jmp .LBB80_24
; SSE-NEXT: .LBB80_22:
; SSE-NEXT: movq %rax, %rcx
; SSE-NEXT: shrq %rcx
; SSE-NEXT: andl $1, %eax
; SSE-NEXT: orq %rcx, %rax
; SSE-NEXT: xorps %xmm2, %xmm2
; SSE-NEXT: cvtsi2ssq %rax, %xmm2
; SSE-NEXT: addss %xmm2, %xmm2
; SSE-NEXT: .LBB80_24:
; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0]
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_load_8i64_to_8f32:
; AVX1: # BB#0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vmovdqa 32(%rdi), %ymm2
; AVX1-NEXT: vpextrq $1, %xmm2, %rax
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB80_1
; AVX1-NEXT: # BB#2:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX1-NEXT: jmp .LBB80_3
; AVX1-NEXT: .LBB80_1:
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: shrq %rcx
; AVX1-NEXT: andl $1, %eax
; AVX1-NEXT: orq %rcx, %rax
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1
; AVX1-NEXT: .LBB80_3:
; AVX1-NEXT: vmovq %xmm2, %rax
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB80_4
; AVX1-NEXT: # BB#5:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3
; AVX1-NEXT: jmp .LBB80_6
; AVX1-NEXT: .LBB80_4:
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: shrq %rcx
; AVX1-NEXT: andl $1, %eax
; AVX1-NEXT: orq %rcx, %rax
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3
; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm3
; AVX1-NEXT: .LBB80_6:
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vmovq %xmm2, %rax
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB80_7
; AVX1-NEXT: # BB#8:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4
; AVX1-NEXT: jmp .LBB80_9
; AVX1-NEXT: .LBB80_7:
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: shrq %rcx
; AVX1-NEXT: andl $1, %eax
; AVX1-NEXT: orq %rcx, %rax
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4
; AVX1-NEXT: vaddss %xmm4, %xmm4, %xmm4
; AVX1-NEXT: .LBB80_9:
; AVX1-NEXT: vpextrq $1, %xmm2, %rax
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB80_10
; AVX1-NEXT: # BB#11:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm2
; AVX1-NEXT: jmp .LBB80_12
; AVX1-NEXT: .LBB80_10:
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: shrq %rcx
; AVX1-NEXT: andl $1, %eax
; AVX1-NEXT: orq %rcx, %rax
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm2
; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2
; AVX1-NEXT: .LBB80_12:
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB80_13
; AVX1-NEXT: # BB#14:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm5
; AVX1-NEXT: jmp .LBB80_15
; AVX1-NEXT: .LBB80_13:
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: shrq %rcx
; AVX1-NEXT: andl $1, %eax
; AVX1-NEXT: orq %rcx, %rax
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm5
; AVX1-NEXT: vaddss %xmm5, %xmm5, %xmm5
; AVX1-NEXT: .LBB80_15:
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[2,3]
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB80_16
; AVX1-NEXT: # BB#17:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3
; AVX1-NEXT: jmp .LBB80_18
; AVX1-NEXT: .LBB80_16:
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: shrq %rcx
; AVX1-NEXT: andl $1, %eax
; AVX1-NEXT: orq %rcx, %rax
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3
; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm3
; AVX1-NEXT: .LBB80_18:
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3]
; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[2,3]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vmovq %xmm4, %rax
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB80_19
; AVX1-NEXT: # BB#20:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm5
; AVX1-NEXT: jmp .LBB80_21
; AVX1-NEXT: .LBB80_19:
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: shrq %rcx
; AVX1-NEXT: andl $1, %eax
; AVX1-NEXT: orq %rcx, %rax
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm0
; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm5
; AVX1-NEXT: .LBB80_21:
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm2[0]
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1],xmm5[0],xmm3[3]
; AVX1-NEXT: vpextrq $1, %xmm4, %rax
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB80_22
; AVX1-NEXT: # BB#23:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2
; AVX1-NEXT: jmp .LBB80_24
; AVX1-NEXT: .LBB80_22:
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: shrq %rcx
; AVX1-NEXT: andl $1, %eax
; AVX1-NEXT: orq %rcx, %rax
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2
; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2
; AVX1-NEXT: .LBB80_24:
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_load_8i64_to_8f32:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm2
; AVX2-NEXT: vpextrq $1, %xmm2, %rax
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB80_1
; AVX2-NEXT: # BB#2:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX2-NEXT: jmp .LBB80_3
; AVX2-NEXT: .LBB80_1:
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: shrq %rcx
; AVX2-NEXT: andl $1, %eax
; AVX2-NEXT: orq %rcx, %rax
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX2-NEXT: vaddss %xmm1, %xmm1, %xmm1
; AVX2-NEXT: .LBB80_3:
; AVX2-NEXT: vmovq %xmm2, %rax
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB80_4
; AVX2-NEXT: # BB#5:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3
; AVX2-NEXT: jmp .LBB80_6
; AVX2-NEXT: .LBB80_4:
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: shrq %rcx
; AVX2-NEXT: andl $1, %eax
; AVX2-NEXT: orq %rcx, %rax
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3
; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm3
; AVX2-NEXT: .LBB80_6:
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2
; AVX2-NEXT: vmovq %xmm2, %rax
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB80_7
; AVX2-NEXT: # BB#8:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4
; AVX2-NEXT: jmp .LBB80_9
; AVX2-NEXT: .LBB80_7:
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: shrq %rcx
; AVX2-NEXT: andl $1, %eax
; AVX2-NEXT: orq %rcx, %rax
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4
; AVX2-NEXT: vaddss %xmm4, %xmm4, %xmm4
; AVX2-NEXT: .LBB80_9:
; AVX2-NEXT: vpextrq $1, %xmm2, %rax
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB80_10
; AVX2-NEXT: # BB#11:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm2
; AVX2-NEXT: jmp .LBB80_12
; AVX2-NEXT: .LBB80_10:
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: shrq %rcx
; AVX2-NEXT: andl $1, %eax
; AVX2-NEXT: orq %rcx, %rax
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm2
; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2
; AVX2-NEXT: .LBB80_12:
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB80_13
; AVX2-NEXT: # BB#14:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm5
; AVX2-NEXT: jmp .LBB80_15
; AVX2-NEXT: .LBB80_13:
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: shrq %rcx
; AVX2-NEXT: andl $1, %eax
; AVX2-NEXT: orq %rcx, %rax
|
|
|
|
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm5
|
2016-05-24 21:07:23 +08:00
|
|
|
; AVX2-NEXT: vaddss %xmm5, %xmm5, %xmm5
|
2016-11-24 05:19:57 +08:00
|
|
|
; AVX2-NEXT: .LBB80_15:
|
2017-05-19 02:50:05 +08:00
|
|
|
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[2,3]
|
2016-05-24 21:07:23 +08:00
|
|
|
; AVX2-NEXT: vmovq %xmm0, %rax
|
|
|
|
; AVX2-NEXT: testq %rax, %rax
|
2016-11-24 05:19:57 +08:00
|
|
|
; AVX2-NEXT: js .LBB80_16
|
2016-05-24 21:07:23 +08:00
|
|
|
; AVX2-NEXT: # BB#17:
|
2017-05-19 02:50:05 +08:00
|
|
|
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3
|
2016-11-24 05:19:57 +08:00
|
|
|
; AVX2-NEXT: jmp .LBB80_18
|
|
|
|
; AVX2-NEXT: .LBB80_16:
|
2016-08-12 11:33:22 +08:00
|
|
|
; AVX2-NEXT: movq %rax, %rcx
|
|
|
|
; AVX2-NEXT: shrq %rcx
|
|
|
|
; AVX2-NEXT: andl $1, %eax
|
|
|
|
; AVX2-NEXT: orq %rcx, %rax
|
2017-05-19 02:50:05 +08:00
|
|
|
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3
|
|
|
|
; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm3
|
2016-11-24 05:19:57 +08:00
|
|
|
; AVX2-NEXT: .LBB80_18:
|
2017-05-19 02:50:05 +08:00
|
|
|
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3]
|
|
|
|
; AVX2-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[2,3]
|
|
|
|
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
|
|
|
|
; AVX2-NEXT: vmovq %xmm4, %rax
|
2016-05-24 21:07:23 +08:00
|
|
|
; AVX2-NEXT: testq %rax, %rax
|
2016-11-24 05:19:57 +08:00
|
|
|
; AVX2-NEXT: js .LBB80_19
|
2016-05-24 21:07:23 +08:00
|
|
|
; AVX2-NEXT: # BB#20:
|
2017-05-19 02:50:05 +08:00
|
|
|
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm5
|
2016-11-24 05:19:57 +08:00
|
|
|
; AVX2-NEXT: jmp .LBB80_21
|
|
|
|
; AVX2-NEXT: .LBB80_19:
|
2016-08-12 11:33:22 +08:00
|
|
|
; AVX2-NEXT: movq %rax, %rcx
|
|
|
|
; AVX2-NEXT: shrq %rcx
|
|
|
|
; AVX2-NEXT: andl $1, %eax
|
|
|
|
; AVX2-NEXT: orq %rcx, %rax
|
|
|
|
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm0
|
2017-05-19 02:50:05 +08:00
|
|
|
; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm5
|
2016-11-24 05:19:57 +08:00
|
|
|
; AVX2-NEXT: .LBB80_21:
|
2016-05-24 21:07:23 +08:00
|
|
|
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm2[0]
|
2017-05-19 02:50:05 +08:00
|
|
|
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1],xmm5[0],xmm3[3]
|
|
|
|
; AVX2-NEXT: vpextrq $1, %xmm4, %rax
|
2016-05-24 21:07:23 +08:00
|
|
|
; AVX2-NEXT: testq %rax, %rax
|
2016-11-24 05:19:57 +08:00
|
|
|
; AVX2-NEXT: js .LBB80_22
|
2016-05-24 21:07:23 +08:00
|
|
|
; AVX2-NEXT: # BB#23:
|
2017-05-19 02:50:05 +08:00
|
|
|
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2
|
2016-11-24 05:19:57 +08:00
|
|
|
; AVX2-NEXT: jmp .LBB80_24
|
|
|
|
; AVX2-NEXT: .LBB80_22:
|
2016-08-12 11:33:22 +08:00
|
|
|
; AVX2-NEXT: movq %rax, %rcx
|
|
|
|
; AVX2-NEXT: shrq %rcx
|
|
|
|
; AVX2-NEXT: andl $1, %eax
|
|
|
|
; AVX2-NEXT: orq %rcx, %rax
|
2017-05-19 02:50:05 +08:00
|
|
|
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2
|
|
|
|
; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2
|
2016-11-24 05:19:57 +08:00
|
|
|
; AVX2-NEXT: .LBB80_24:
|
2017-05-19 02:50:05 +08:00
|
|
|
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
|
2016-05-24 21:07:23 +08:00
|
|
|
; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
|
|
|
|
; AVX2-NEXT: retq
|
2016-08-07 03:31:47 +08:00
|
|
|
;
|
2016-11-18 23:31:36 +08:00
|
|
|
; AVX512F-LABEL: uitofp_load_8i64_to_8f32:
|
|
|
|
; AVX512F: # BB#0:
|
|
|
|
; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0
|
|
|
|
; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm1
|
|
|
|
; AVX512F-NEXT: vpextrq $1, %xmm1, %rax
|
|
|
|
; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2
|
|
|
|
; AVX512F-NEXT: vmovq %xmm1, %rax
|
|
|
|
; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm1
|
|
|
|
; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
|
|
|
|
; AVX512F-NEXT: vextracti32x4 $3, %zmm0, %xmm2
|
|
|
|
; AVX512F-NEXT: vmovq %xmm2, %rax
|
|
|
|
; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm3
|
|
|
|
; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
|
|
|
|
; AVX512F-NEXT: vpextrq $1, %xmm2, %rax
|
|
|
|
; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm2
|
|
|
|
; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
|
|
|
|
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
|
|
|
|
; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm2
|
|
|
|
; AVX512F-NEXT: vmovq %xmm0, %rax
|
|
|
|
; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm3
|
|
|
|
; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
|
2017-08-30 15:26:12 +08:00
|
|
|
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
|
2016-11-18 23:31:36 +08:00
|
|
|
; AVX512F-NEXT: vmovq %xmm0, %rax
|
|
|
|
; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm3
|
|
|
|
; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
|
|
|
|
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
|
|
|
|
; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm0
|
|
|
|
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
|
|
|
|
; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
|
|
|
|
; AVX512F-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX512VL-LABEL: uitofp_load_8i64_to_8f32:
|
|
|
|
; AVX512VL: # BB#0:
|
|
|
|
; AVX512VL-NEXT: vmovdqa64 (%rdi), %zmm0
|
|
|
|
; AVX512VL-NEXT: vextracti32x4 $2, %zmm0, %xmm1
|
|
|
|
; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax
|
|
|
|
; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2
|
|
|
|
; AVX512VL-NEXT: vmovq %xmm1, %rax
|
|
|
|
; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm1
|
|
|
|
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
|
|
|
|
; AVX512VL-NEXT: vextracti32x4 $3, %zmm0, %xmm2
|
|
|
|
; AVX512VL-NEXT: vmovq %xmm2, %rax
|
|
|
|
; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm3
|
|
|
|
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
|
|
|
|
; AVX512VL-NEXT: vpextrq $1, %xmm2, %rax
|
|
|
|
; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm2
|
|
|
|
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
|
|
|
|
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
|
|
|
|
; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm2
|
|
|
|
; AVX512VL-NEXT: vmovq %xmm0, %rax
|
|
|
|
; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm3
|
|
|
|
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
|
2017-08-30 15:26:12 +08:00
|
|
|
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
|
2016-11-18 23:31:36 +08:00
|
|
|
; AVX512VL-NEXT: vmovq %xmm0, %rax
|
|
|
|
; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm3
|
|
|
|
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
|
|
|
|
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
|
|
|
|
; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm0
|
|
|
|
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
|
2017-01-03 13:46:18 +08:00
|
|
|
; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
|
2016-11-18 23:31:36 +08:00
|
|
|
; AVX512VL-NEXT: retq
|
2016-11-23 06:04:50 +08:00
|
|
|
;
|
|
|
|
; AVX512DQ-LABEL: uitofp_load_8i64_to_8f32:
|
|
|
|
; AVX512DQ: # BB#0:
|
|
|
|
; AVX512DQ-NEXT: vcvtuqq2ps (%rdi), %ymm0
|
|
|
|
; AVX512DQ-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX512VLDQ-LABEL: uitofp_load_8i64_to_8f32:
|
|
|
|
; AVX512VLDQ: # BB#0:
|
|
|
|
; AVX512VLDQ-NEXT: vcvtuqq2ps (%rdi), %ymm0
|
|
|
|
; AVX512VLDQ-NEXT: retq
|
2016-05-24 21:07:23 +08:00
|
|
|
%ld = load <8 x i64>, <8 x i64> *%a
|
|
|
|
%cvt = uitofp <8 x i64> %ld to <8 x float>
|
|
|
|
ret <8 x float> %cvt
|
|
|
|
}
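; NOTE: Without a native unsigned i64 conversion (AVX1/AVX2 above), an
; element whose sign bit is set is first halved with round-to-odd
; (shrq + andl $1 + orq keeps the shifted-out bit sticky), converted with
; the *signed* cvtsi2ssq, and then doubled with vaddss; the sticky bit
; preserves correct rounding of the final float. AVX512F instead uses the
; unsigned scalar vcvtusi2ssq per element, and AVX512DQ has a packed
; vcvtuqq2ps. A scalar sketch of the halving trick in IR (illustrative
; only, not part of the autogenerated checks):
;   %half = lshr i64 %x, 1
;   %bit  = and i64 %x, 1
;   %odd  = or i64 %half, %bit
;   %f    = sitofp i64 %odd to float
;   %res  = fadd float %f, %f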

define <8 x float> @uitofp_load_8i32_to_8f32(<8 x i32> *%a) {
; SSE-LABEL: uitofp_load_8i32_to_8f32:
; SSE: # BB#0:
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: movdqa 16(%rdi), %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: pand %xmm2, %xmm3
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [1258291200,1258291200,1258291200,1258291200]
; SSE-NEXT: por %xmm4, %xmm3
; SSE-NEXT: psrld $16, %xmm0
; SSE-NEXT: movdqa {{.*#+}} xmm5 = [1392508928,1392508928,1392508928,1392508928]
; SSE-NEXT: por %xmm5, %xmm0
; SSE-NEXT: movaps {{.*#+}} xmm6 = [-5.497642e+11,-5.497642e+11,-5.497642e+11,-5.497642e+11]
; SSE-NEXT: addps %xmm6, %xmm0
; SSE-NEXT: addps %xmm3, %xmm0
; SSE-NEXT: pand %xmm1, %xmm2
; SSE-NEXT: por %xmm4, %xmm2
; SSE-NEXT: psrld $16, %xmm1
; SSE-NEXT: por %xmm5, %xmm1
; SSE-NEXT: addps %xmm6, %xmm1
; SSE-NEXT: addps %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_load_8i32_to_8f32:
; AVX1: # BB#0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1
; AVX1-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1
; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT: vaddps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_load_8i32_to_8f32:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200]
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %ymm2
; AVX2-NEXT: vaddps %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vaddps %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: uitofp_load_8i32_to_8f32:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovaps (%rdi), %ymm0
; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0
; AVX512F-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: uitofp_load_8i32_to_8f32:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vcvtudq2ps (%rdi), %ymm0
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: uitofp_load_8i32_to_8f32:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: vmovaps (%rdi), %ymm0
; AVX512DQ-NEXT: vcvtudq2ps %zmm0, %zmm0
; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: uitofp_load_8i32_to_8f32:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: vcvtudq2ps (%rdi), %ymm0
; AVX512VLDQ-NEXT: retq
  %ld = load <8 x i32>, <8 x i32> *%a
  %cvt = uitofp <8 x i32> %ld to <8 x float>
  ret <8 x float> %cvt
}
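; NOTE: With no unsigned i32 conversion before AVX512, the lowerings above
; build the result as float(hi16)*2^16 + float(lo16). The SSE/AVX2 forms do
; it entirely with bit tricks: OR-ing 0x4B000000 (1258291200) into the low
; half yields the float 2^23 + lo16, OR-ing 0x53000000 (1392508928) into the
; shifted high half yields 2^39 + hi16*2^16, and adding the magic constant
; -5.497642e+11 = -(2^39 + 2^23) cancels both biases. For example,
; x = 0x00010003 splits into hi16 = 1 and lo16 = 3, giving
; (2^39 + 65536.0) + (2^23 + 3.0) - (2^39 + 2^23) = 65539.0. AVX1 instead
; converts the two halves with vcvtdq2ps and rescales the high half via the
; vmulps from the constant pool (presumably 65536.0). AVX512 gets a single
; vcvtudq2ps.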

define <8 x float> @uitofp_load_8i16_to_8f32(<8 x i16> *%a) {
; SSE-LABEL: uitofp_load_8i16_to_8f32:
; SSE: # BB#0:
; SSE-NEXT: movdqa (%rdi), %xmm1
; SSE-NEXT: pxor %xmm2, %xmm2
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE-NEXT: cvtdq2ps %xmm1, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_load_8i16_to_8f32:
; AVX1: # BB#0:
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_load_8i16_to_8f32:
; AVX2: # BB#0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: uitofp_load_8i16_to_8f32:
; AVX512: # BB#0:
; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX512-NEXT: retq
  %ld = load <8 x i16>, <8 x i16> *%a
  %cvt = uitofp <8 x i16> %ld to <8 x float>
  ret <8 x float> %cvt
}
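; NOTE: u16 -> f32 needs no magic constants: each i16 value zero-extends
; into i32 (vpmovzxwd, or punpcklwd/punpckhwd against zero on SSE) and the
; signed vcvtdq2ps/cvtdq2ps is then exact, since every value fits in 31
; bits. The 8i8 case below is the same pattern one step wider (vpmovzxbd).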

define <8 x float> @uitofp_load_8i8_to_8f32(<8 x i8> *%a) {
; SSE-LABEL: uitofp_load_8i8_to_8f32:
; SSE: # BB#0:
; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT: pxor %xmm2, %xmm2
; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE-NEXT: cvtdq2ps %xmm1, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_load_8i8_to_8f32:
; AVX1: # BB#0:
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_load_8i8_to_8f32:
; AVX2: # BB#0:
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: uitofp_load_8i8_to_8f32:
; AVX512: # BB#0:
; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX512-NEXT: retq
  %ld = load <8 x i8>, <8 x i8> *%a
  %cvt = uitofp <8 x i8> %ld to <8 x float>
  ret <8 x float> %cvt
}
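; NOTE: SSE2 has no pmovzx, so the 8i8 case above zero-extends in two
; steps instead: punpcklbw against a zero register widens i8 -> i16, then
; punpcklwd/punpckhwd against zero widen i16 -> i32 before cvtdq2ps.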

;
; Aggregates
;

%Arguments = type <{ <8 x i8>, <8 x i16>, <8 x float>* }>

define void @aggregate_sitofp_8i16_to_8f32(%Arguments* nocapture readonly %a0) {
; SSE-LABEL: aggregate_sitofp_8i16_to_8f32:
; SSE: # BB#0:
; SSE-NEXT: movq 24(%rdi), %rax
; SSE-NEXT: movdqu 8(%rdi), %xmm0
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: cvtdq2ps %xmm1, %xmm1
; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; SSE-NEXT: movaps %xmm0, 16(%rax)
; SSE-NEXT: movaps %xmm1, (%rax)
; SSE-NEXT: retq
;
; AVX1-LABEL: aggregate_sitofp_8i16_to_8f32:
; AVX1: # BB#0:
; AVX1-NEXT: movq 24(%rdi), %rax
; AVX1-NEXT: vmovdqu 8(%rdi), %xmm0
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT: vmovaps %ymm0, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: aggregate_sitofp_8i16_to_8f32:
; AVX2: # BB#0:
; AVX2-NEXT: movq 24(%rdi), %rax
; AVX2-NEXT: vpmovsxwd 8(%rdi), %ymm0
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT: vmovaps %ymm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: aggregate_sitofp_8i16_to_8f32:
; AVX512: # BB#0:
; AVX512-NEXT: movq 24(%rdi), %rax
; AVX512-NEXT: vpmovsxwd 8(%rdi), %ymm0
; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX512-NEXT: vmovaps %ymm0, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = load %Arguments, %Arguments* %a0, align 1
  %2 = extractvalue %Arguments %1, 1
  %3 = extractvalue %Arguments %1, 2
  %4 = sitofp <8 x i16> %2 to <8 x float>
  store <8 x float> %4, <8 x float>* %3, align 32
  ret void
}
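; NOTE: In the SSE form above, sign extension of i16 -> i32 is done by
; unpacking the words into the high half of each dword and then shifting
; right arithmetically with psrad $16, since SSE2 lacks pmovsx. The
; remaining tests check that a scalar sitofp whose result is inserted into
; element 0 of an existing vector folds to a single (v)cvtsi2sd/(v)cvtsi2ss
; into the destination register.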

define <2 x double> @sitofp_i32_to_2f64(<2 x double> %a0, i32 %a1) nounwind {
; SSE-LABEL: sitofp_i32_to_2f64:
; SSE: # BB#0:
; SSE-NEXT: cvtsi2sdl %edi, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: sitofp_i32_to_2f64:
; AVX: # BB#0:
; AVX-NEXT: vcvtsi2sdl %edi, %xmm0, %xmm0
; AVX-NEXT: retq
  %cvt = sitofp i32 %a1 to double
  %res = insertelement <2 x double> %a0, double %cvt, i32 0
  ret <2 x double> %res
}

define <4 x float> @sitofp_i32_to_4f32(<4 x float> %a0, i32 %a1) nounwind {
; SSE-LABEL: sitofp_i32_to_4f32:
; SSE: # BB#0:
; SSE-NEXT: cvtsi2ssl %edi, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: sitofp_i32_to_4f32:
; AVX: # BB#0:
; AVX-NEXT: vcvtsi2ssl %edi, %xmm0, %xmm0
; AVX-NEXT: retq
  %cvt = sitofp i32 %a1 to float
  %res = insertelement <4 x float> %a0, float %cvt, i32 0
  ret <4 x float> %res
}

define <2 x double> @sitofp_i64_to_2f64(<2 x double> %a0, i64 %a1) nounwind {
; SSE-LABEL: sitofp_i64_to_2f64:
; SSE: # BB#0:
; SSE-NEXT: cvtsi2sdq %rdi, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: sitofp_i64_to_2f64:
; AVX: # BB#0:
; AVX-NEXT: vcvtsi2sdq %rdi, %xmm0, %xmm0
; AVX-NEXT: retq
  %cvt = sitofp i64 %a1 to double
  %res = insertelement <2 x double> %a0, double %cvt, i32 0
  ret <2 x double> %res
}

define <4 x float> @sitofp_i64_to_4f32(<4 x float> %a0, i64 %a1) nounwind {
; SSE-LABEL: sitofp_i64_to_4f32:
; SSE: # BB#0:
; SSE-NEXT: cvtsi2ssq %rdi, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: sitofp_i64_to_4f32:
; AVX: # BB#0:
; AVX-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0
; AVX-NEXT: retq
  %cvt = sitofp i64 %a1 to float
  %res = insertelement <4 x float> %a0, float %cvt, i32 0
  ret <4 x float> %res
}