[X86] Custom type legalize v2i32/v4i16/v8i8->i64 bitcasts in 64-bit mode similar to what's done when the destination is f64.

The generic legalizer will fall back to a stack spill that uses a truncating store. That store will get expanded into a shuffle and a non-truncating store on pre-AVX512 targets. Once that happens, the stack store/load pair will be combined away, leaving behind the shuffle and bitcasts. On AVX512 targets the truncating store is legal, so it doesn't get folded away.

By custom legalizing it, we can avoid this churn and possibly produce better code.
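
As a concrete example of the affected pattern, taken from the trunc2i64_i64 test updated below, the bitcast in question looks like this in IR:

    %0 = trunc <2 x i64> %inval to <2 x i32>
    %1 = bitcast <2 x i32> %0 to i64

Before this change the AVX512VL check lines round-tripped through the stack (a vpmovqd truncating store to a stack slot followed by a movq reload); with the custom legalization they become the same vpshufd + vmovq sequence the other targets already produced.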

llvm-svn: 348085
Craig Topper 2018-12-02 05:46:48 +00:00
parent 0ff50d49d1
commit ec096a1dae
5 changed files with 183 additions and 747 deletions


@@ -25221,7 +25221,7 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
SrcVT == MVT::i64) {
assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
if (DstVT != MVT::f64)
if (DstVT != MVT::f64 && DstVT != MVT::i64)
// This conversion needs to be expanded.
return SDValue();
@@ -25253,8 +25253,9 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
SDValue BV = DAG.getBuildVector(NewVT, dl, Elts);
SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
MVT V2X64VT = MVT::getVectorVT(DstVT, 2);
SDValue ToV2X64 = DAG.getBitcast(V2X64VT, BV);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, ToV2X64,
DAG.getIntPtrConstant(0, dl));
}


@@ -19,7 +19,7 @@ define x86_mmx @mmx_movzl(x86_mmx %x) nounwind {
; X64-LABEL: mmx_movzl:
; X64: ## %bb.0:
; X64-NEXT: movl $32, %eax
; X64-NEXT: movq %rax, %xmm0
; X64-NEXT: movd %eax, %xmm0
; X64-NEXT: retq
%tmp = bitcast x86_mmx %x to <2 x i32>
%tmp3 = insertelement <2 x i32> %tmp, i32 32, i32 0


@@ -13,10 +13,8 @@ define x86_mmx @t0(i32 %A) nounwind {
;
; X64-LABEL: t0:
; X64: ## %bb.0:
; X64-NEXT: ## kill: def $edi killed $edi def $rdi
; X64-NEXT: movq %rdi, %xmm0
; X64-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-NEXT: movd %edi, %xmm0
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
; X64-NEXT: retq
%tmp3 = insertelement <2 x i32> < i32 0, i32 undef >, i32 %A, i32 1
%tmp4 = bitcast <2 x i32> %tmp3 to x86_mmx


@@ -22,224 +22,63 @@ define float @cvt_i16_to_f32(i16 %a0) nounwind {
}
define <4 x float> @cvt_4i16_to_4f32(<4 x i16> %a0) nounwind {
; AVX1-LABEL: cvt_4i16_to_4f32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: movq %rax, %rdx
; AVX1-NEXT: movswl %ax, %esi
; AVX1-NEXT: # kill: def $eax killed $eax killed $rax
; AVX1-NEXT: shrl $16, %eax
; AVX1-NEXT: shrq $32, %rcx
; AVX1-NEXT: shrq $48, %rdx
; AVX1-NEXT: movswl %dx, %edx
; AVX1-NEXT: vmovd %edx, %xmm0
; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX1-NEXT: movswl %cx, %ecx
; AVX1-NEXT: vmovd %ecx, %xmm1
; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX1-NEXT: cwtl
; AVX1-NEXT: vmovd %eax, %xmm2
; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX1-NEXT: vmovd %esi, %xmm3
; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT: retq
;
; AVX2-LABEL: cvt_4i16_to_4f32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: movq %rax, %rdx
; AVX2-NEXT: movswl %ax, %esi
; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
; AVX2-NEXT: shrl $16, %eax
; AVX2-NEXT: shrq $32, %rcx
; AVX2-NEXT: shrq $48, %rdx
; AVX2-NEXT: movswl %dx, %edx
; AVX2-NEXT: vmovd %edx, %xmm0
; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX2-NEXT: movswl %cx, %ecx
; AVX2-NEXT: vmovd %ecx, %xmm1
; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX2-NEXT: cwtl
; AVX2-NEXT: vmovd %eax, %xmm2
; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX2-NEXT: vmovd %esi, %xmm3
; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT: retq
;
; AVX512F-LABEL: cvt_4i16_to_4f32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: movq %rax, %rcx
; AVX512F-NEXT: movq %rax, %rdx
; AVX512F-NEXT: movswl %ax, %esi
; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax
; AVX512F-NEXT: shrl $16, %eax
; AVX512F-NEXT: shrq $32, %rcx
; AVX512F-NEXT: shrq $48, %rdx
; AVX512F-NEXT: movswl %dx, %edx
; AVX512F-NEXT: vmovd %edx, %xmm0
; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT: movswl %cx, %ecx
; AVX512F-NEXT: vmovd %ecx, %xmm1
; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512F-NEXT: cwtl
; AVX512F-NEXT: vmovd %eax, %xmm2
; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512F-NEXT: vmovd %esi, %xmm3
; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_4i16_to_4f32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovdw %xmm0, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; AVX512VL-NEXT: movq %rax, %rcx
; AVX512VL-NEXT: movq %rax, %rdx
; AVX512VL-NEXT: movswl %ax, %esi
; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax
; AVX512VL-NEXT: shrl $16, %eax
; AVX512VL-NEXT: shrq $32, %rcx
; AVX512VL-NEXT: shrq $48, %rdx
; AVX512VL-NEXT: movswl %dx, %edx
; AVX512VL-NEXT: vmovd %edx, %xmm0
; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512VL-NEXT: movswl %cx, %ecx
; AVX512VL-NEXT: vmovd %ecx, %xmm1
; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512VL-NEXT: cwtl
; AVX512VL-NEXT: vmovd %eax, %xmm2
; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512VL-NEXT: vmovd %esi, %xmm3
; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX512VL-NEXT: retq
; ALL-LABEL: cvt_4i16_to_4f32:
; ALL: # %bb.0:
; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; ALL-NEXT: vmovq %xmm0, %rax
; ALL-NEXT: movq %rax, %rcx
; ALL-NEXT: movq %rax, %rdx
; ALL-NEXT: movswl %ax, %esi
; ALL-NEXT: # kill: def $eax killed $eax killed $rax
; ALL-NEXT: shrl $16, %eax
; ALL-NEXT: shrq $32, %rcx
; ALL-NEXT: shrq $48, %rdx
; ALL-NEXT: movswl %dx, %edx
; ALL-NEXT: vmovd %edx, %xmm0
; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
; ALL-NEXT: movswl %cx, %ecx
; ALL-NEXT: vmovd %ecx, %xmm1
; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
; ALL-NEXT: cwtl
; ALL-NEXT: vmovd %eax, %xmm2
; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
; ALL-NEXT: vmovd %esi, %xmm3
; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; ALL-NEXT: retq
%1 = bitcast <4 x i16> %a0 to <4 x half>
%2 = fpext <4 x half> %1 to <4 x float>
ret <4 x float> %2
}
define <4 x float> @cvt_8i16_to_4f32(<8 x i16> %a0) nounwind {
; AVX1-LABEL: cvt_8i16_to_4f32:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: movq %rax, %rdx
; AVX1-NEXT: movswl %ax, %esi
; AVX1-NEXT: # kill: def $eax killed $eax killed $rax
; AVX1-NEXT: shrl $16, %eax
; AVX1-NEXT: shrq $32, %rcx
; AVX1-NEXT: shrq $48, %rdx
; AVX1-NEXT: movswl %dx, %edx
; AVX1-NEXT: vmovd %edx, %xmm0
; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX1-NEXT: movswl %cx, %ecx
; AVX1-NEXT: vmovd %ecx, %xmm1
; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX1-NEXT: cwtl
; AVX1-NEXT: vmovd %eax, %xmm2
; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX1-NEXT: vmovd %esi, %xmm3
; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT: retq
;
; AVX2-LABEL: cvt_8i16_to_4f32:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: movq %rax, %rdx
; AVX2-NEXT: movswl %ax, %esi
; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
; AVX2-NEXT: shrl $16, %eax
; AVX2-NEXT: shrq $32, %rcx
; AVX2-NEXT: shrq $48, %rdx
; AVX2-NEXT: movswl %dx, %edx
; AVX2-NEXT: vmovd %edx, %xmm0
; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX2-NEXT: movswl %cx, %ecx
; AVX2-NEXT: vmovd %ecx, %xmm1
; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX2-NEXT: cwtl
; AVX2-NEXT: vmovd %eax, %xmm2
; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX2-NEXT: vmovd %esi, %xmm3
; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT: retq
;
; AVX512F-LABEL: cvt_8i16_to_4f32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: movq %rax, %rcx
; AVX512F-NEXT: movq %rax, %rdx
; AVX512F-NEXT: movswl %ax, %esi
; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax
; AVX512F-NEXT: shrl $16, %eax
; AVX512F-NEXT: shrq $32, %rcx
; AVX512F-NEXT: shrq $48, %rdx
; AVX512F-NEXT: movswl %dx, %edx
; AVX512F-NEXT: vmovd %edx, %xmm0
; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT: movswl %cx, %ecx
; AVX512F-NEXT: vmovd %ecx, %xmm1
; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512F-NEXT: cwtl
; AVX512F-NEXT: vmovd %eax, %xmm2
; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512F-NEXT: vmovd %esi, %xmm3
; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_8i16_to_4f32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512VL-NEXT: vpmovdw %xmm0, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; AVX512VL-NEXT: movq %rax, %rcx
; AVX512VL-NEXT: movq %rax, %rdx
; AVX512VL-NEXT: movswl %ax, %esi
; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax
; AVX512VL-NEXT: shrl $16, %eax
; AVX512VL-NEXT: shrq $32, %rcx
; AVX512VL-NEXT: shrq $48, %rdx
; AVX512VL-NEXT: movswl %dx, %edx
; AVX512VL-NEXT: vmovd %edx, %xmm0
; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512VL-NEXT: movswl %cx, %ecx
; AVX512VL-NEXT: vmovd %ecx, %xmm1
; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512VL-NEXT: cwtl
; AVX512VL-NEXT: vmovd %eax, %xmm2
; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512VL-NEXT: vmovd %esi, %xmm3
; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX512VL-NEXT: retq
; ALL-LABEL: cvt_8i16_to_4f32:
; ALL: # %bb.0:
; ALL-NEXT: vmovq %xmm0, %rax
; ALL-NEXT: movq %rax, %rcx
; ALL-NEXT: movq %rax, %rdx
; ALL-NEXT: movswl %ax, %esi
; ALL-NEXT: # kill: def $eax killed $eax killed $rax
; ALL-NEXT: shrl $16, %eax
; ALL-NEXT: shrq $32, %rcx
; ALL-NEXT: shrq $48, %rdx
; ALL-NEXT: movswl %dx, %edx
; ALL-NEXT: vmovd %edx, %xmm0
; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
; ALL-NEXT: movswl %cx, %ecx
; ALL-NEXT: vmovd %ecx, %xmm1
; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
; ALL-NEXT: cwtl
; ALL-NEXT: vmovd %eax, %xmm2
; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
; ALL-NEXT: vmovd %esi, %xmm3
; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; ALL-NEXT: retq
%1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = bitcast <4 x i16> %1 to <4 x half>
%3 = fpext <4 x half> %2 to <4 x float>
@@ -730,111 +569,31 @@ define <4 x float> @load_cvt_4i16_to_4f32(<4 x i16>* %a0) nounwind {
}
define <4 x float> @load_cvt_8i16_to_4f32(<8 x i16>* %a0) nounwind {
; AVX1-LABEL: load_cvt_8i16_to_4f32:
; AVX1: # %bb.0:
; AVX1-NEXT: movq (%rdi), %rax
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: movq %rax, %rdx
; AVX1-NEXT: movswl %ax, %esi
; AVX1-NEXT: # kill: def $eax killed $eax killed $rax
; AVX1-NEXT: shrl $16, %eax
; AVX1-NEXT: shrq $32, %rcx
; AVX1-NEXT: shrq $48, %rdx
; AVX1-NEXT: movswl %dx, %edx
; AVX1-NEXT: vmovd %edx, %xmm0
; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX1-NEXT: movswl %cx, %ecx
; AVX1-NEXT: vmovd %ecx, %xmm1
; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX1-NEXT: cwtl
; AVX1-NEXT: vmovd %eax, %xmm2
; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX1-NEXT: vmovd %esi, %xmm3
; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_cvt_8i16_to_4f32:
; AVX2: # %bb.0:
; AVX2-NEXT: movq (%rdi), %rax
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: movq %rax, %rdx
; AVX2-NEXT: movswl %ax, %esi
; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
; AVX2-NEXT: shrl $16, %eax
; AVX2-NEXT: shrq $32, %rcx
; AVX2-NEXT: shrq $48, %rdx
; AVX2-NEXT: movswl %dx, %edx
; AVX2-NEXT: vmovd %edx, %xmm0
; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX2-NEXT: movswl %cx, %ecx
; AVX2-NEXT: vmovd %ecx, %xmm1
; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX2-NEXT: cwtl
; AVX2-NEXT: vmovd %eax, %xmm2
; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX2-NEXT: vmovd %esi, %xmm3
; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT: retq
;
; AVX512F-LABEL: load_cvt_8i16_to_4f32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: movq (%rdi), %rax
; AVX512F-NEXT: movq %rax, %rcx
; AVX512F-NEXT: movq %rax, %rdx
; AVX512F-NEXT: movswl %ax, %esi
; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax
; AVX512F-NEXT: shrl $16, %eax
; AVX512F-NEXT: shrq $32, %rcx
; AVX512F-NEXT: shrq $48, %rdx
; AVX512F-NEXT: movswl %dx, %edx
; AVX512F-NEXT: vmovd %edx, %xmm0
; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT: movswl %cx, %ecx
; AVX512F-NEXT: vmovd %ecx, %xmm1
; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512F-NEXT: cwtl
; AVX512F-NEXT: vmovd %eax, %xmm2
; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512F-NEXT: vmovd %esi, %xmm3
; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: load_cvt_8i16_to_4f32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX512VL-NEXT: vpmovdw %xmm0, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; AVX512VL-NEXT: movq %rax, %rcx
; AVX512VL-NEXT: movq %rax, %rdx
; AVX512VL-NEXT: movswl %ax, %esi
; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax
; AVX512VL-NEXT: shrl $16, %eax
; AVX512VL-NEXT: shrq $32, %rcx
; AVX512VL-NEXT: shrq $48, %rdx
; AVX512VL-NEXT: movswl %dx, %edx
; AVX512VL-NEXT: vmovd %edx, %xmm0
; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512VL-NEXT: movswl %cx, %ecx
; AVX512VL-NEXT: vmovd %ecx, %xmm1
; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512VL-NEXT: cwtl
; AVX512VL-NEXT: vmovd %eax, %xmm2
; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512VL-NEXT: vmovd %esi, %xmm3
; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX512VL-NEXT: retq
; ALL-LABEL: load_cvt_8i16_to_4f32:
; ALL: # %bb.0:
; ALL-NEXT: movq (%rdi), %rax
; ALL-NEXT: movq %rax, %rcx
; ALL-NEXT: movq %rax, %rdx
; ALL-NEXT: movswl %ax, %esi
; ALL-NEXT: # kill: def $eax killed $eax killed $rax
; ALL-NEXT: shrl $16, %eax
; ALL-NEXT: shrq $32, %rcx
; ALL-NEXT: shrq $48, %rdx
; ALL-NEXT: movswl %dx, %edx
; ALL-NEXT: vmovd %edx, %xmm0
; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
; ALL-NEXT: movswl %cx, %ecx
; ALL-NEXT: vmovd %ecx, %xmm1
; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
; ALL-NEXT: cwtl
; ALL-NEXT: vmovd %eax, %xmm2
; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
; ALL-NEXT: vmovd %esi, %xmm3
; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; ALL-NEXT: retq
%1 = load <8 x i16>, <8 x i16>* %a0
%2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%3 = bitcast <4 x i16> %2 to <4 x half>
@@ -1261,125 +1020,35 @@ define <2 x double> @cvt_2i16_to_2f64(<2 x i16> %a0) nounwind {
}
define <4 x double> @cvt_4i16_to_4f64(<4 x i16> %a0) nounwind {
; AVX1-LABEL: cvt_4i16_to_4f64:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: movl %eax, %edx
; AVX1-NEXT: movswl %ax, %esi
; AVX1-NEXT: shrq $48, %rax
; AVX1-NEXT: shrq $32, %rcx
; AVX1-NEXT: shrl $16, %edx
; AVX1-NEXT: movswl %dx, %edx
; AVX1-NEXT: vmovd %edx, %xmm0
; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX1-NEXT: vmovd %esi, %xmm1
; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX1-NEXT: movswl %cx, %ecx
; AVX1-NEXT: vmovd %ecx, %xmm2
; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX1-NEXT: cwtl
; AVX1-NEXT: vmovd %eax, %xmm3
; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX1-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: cvt_4i16_to_4f64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: movl %eax, %edx
; AVX2-NEXT: movswl %ax, %esi
; AVX2-NEXT: shrq $48, %rax
; AVX2-NEXT: shrq $32, %rcx
; AVX2-NEXT: shrl $16, %edx
; AVX2-NEXT: movswl %dx, %edx
; AVX2-NEXT: vmovd %edx, %xmm0
; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX2-NEXT: vmovd %esi, %xmm1
; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX2-NEXT: movswl %cx, %ecx
; AVX2-NEXT: vmovd %ecx, %xmm2
; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX2-NEXT: cwtl
; AVX2-NEXT: vmovd %eax, %xmm3
; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX2-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: cvt_4i16_to_4f64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: movq %rax, %rcx
; AVX512F-NEXT: movl %eax, %edx
; AVX512F-NEXT: movswl %ax, %esi
; AVX512F-NEXT: shrq $48, %rax
; AVX512F-NEXT: shrq $32, %rcx
; AVX512F-NEXT: shrl $16, %edx
; AVX512F-NEXT: movswl %dx, %edx
; AVX512F-NEXT: vmovd %edx, %xmm0
; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT: vmovd %esi, %xmm1
; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512F-NEXT: movswl %cx, %ecx
; AVX512F-NEXT: vmovd %ecx, %xmm2
; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512F-NEXT: cwtl
; AVX512F-NEXT: vmovd %eax, %xmm3
; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX512F-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
; AVX512F-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_4i16_to_4f64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovdw %xmm0, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; AVX512VL-NEXT: movq %rax, %rcx
; AVX512VL-NEXT: movl %eax, %edx
; AVX512VL-NEXT: movswl %ax, %esi
; AVX512VL-NEXT: shrq $48, %rax
; AVX512VL-NEXT: shrq $32, %rcx
; AVX512VL-NEXT: shrl $16, %edx
; AVX512VL-NEXT: movswl %dx, %edx
; AVX512VL-NEXT: vmovd %edx, %xmm0
; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512VL-NEXT: vmovd %esi, %xmm1
; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512VL-NEXT: movswl %cx, %ecx
; AVX512VL-NEXT: vmovd %ecx, %xmm2
; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512VL-NEXT: cwtl
; AVX512VL-NEXT: vmovd %eax, %xmm3
; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX512VL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
; AVX512VL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX512VL-NEXT: retq
; ALL-LABEL: cvt_4i16_to_4f64:
; ALL: # %bb.0:
; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; ALL-NEXT: vmovq %xmm0, %rax
; ALL-NEXT: movq %rax, %rcx
; ALL-NEXT: movl %eax, %edx
; ALL-NEXT: movswl %ax, %esi
; ALL-NEXT: shrq $48, %rax
; ALL-NEXT: shrq $32, %rcx
; ALL-NEXT: shrl $16, %edx
; ALL-NEXT: movswl %dx, %edx
; ALL-NEXT: vmovd %edx, %xmm0
; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
; ALL-NEXT: vmovd %esi, %xmm1
; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
; ALL-NEXT: movswl %cx, %ecx
; ALL-NEXT: vmovd %ecx, %xmm2
; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
; ALL-NEXT: cwtl
; ALL-NEXT: vmovd %eax, %xmm3
; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
; ALL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
; ALL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
; ALL-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; ALL-NEXT: retq
%1 = bitcast <4 x i16> %a0 to <4 x half>
%2 = fpext <4 x half> %1 to <4 x double>
ret <4 x double> %2
@@ -1454,123 +1123,34 @@ define <2 x double> @cvt_8i16_to_2f64(<8 x i16> %a0) nounwind {
}
define <4 x double> @cvt_8i16_to_4f64(<8 x i16> %a0) nounwind {
; AVX1-LABEL: cvt_8i16_to_4f64:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: movl %eax, %edx
; AVX1-NEXT: movswl %ax, %esi
; AVX1-NEXT: shrq $48, %rax
; AVX1-NEXT: shrq $32, %rcx
; AVX1-NEXT: shrl $16, %edx
; AVX1-NEXT: movswl %dx, %edx
; AVX1-NEXT: vmovd %edx, %xmm0
; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX1-NEXT: vmovd %esi, %xmm1
; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX1-NEXT: movswl %cx, %ecx
; AVX1-NEXT: vmovd %ecx, %xmm2
; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX1-NEXT: cwtl
; AVX1-NEXT: vmovd %eax, %xmm3
; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX1-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: cvt_8i16_to_4f64:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: movl %eax, %edx
; AVX2-NEXT: movswl %ax, %esi
; AVX2-NEXT: shrq $48, %rax
; AVX2-NEXT: shrq $32, %rcx
; AVX2-NEXT: shrl $16, %edx
; AVX2-NEXT: movswl %dx, %edx
; AVX2-NEXT: vmovd %edx, %xmm0
; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX2-NEXT: vmovd %esi, %xmm1
; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX2-NEXT: movswl %cx, %ecx
; AVX2-NEXT: vmovd %ecx, %xmm2
; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX2-NEXT: cwtl
; AVX2-NEXT: vmovd %eax, %xmm3
; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX2-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: cvt_8i16_to_4f64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: movq %rax, %rcx
; AVX512F-NEXT: movl %eax, %edx
; AVX512F-NEXT: movswl %ax, %esi
; AVX512F-NEXT: shrq $48, %rax
; AVX512F-NEXT: shrq $32, %rcx
; AVX512F-NEXT: shrl $16, %edx
; AVX512F-NEXT: movswl %dx, %edx
; AVX512F-NEXT: vmovd %edx, %xmm0
; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT: vmovd %esi, %xmm1
; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512F-NEXT: movswl %cx, %ecx
; AVX512F-NEXT: vmovd %ecx, %xmm2
; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512F-NEXT: cwtl
; AVX512F-NEXT: vmovd %eax, %xmm3
; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX512F-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
; AVX512F-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_8i16_to_4f64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512VL-NEXT: vpmovdw %xmm0, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; AVX512VL-NEXT: movq %rax, %rcx
; AVX512VL-NEXT: movl %eax, %edx
; AVX512VL-NEXT: movswl %ax, %esi
; AVX512VL-NEXT: shrq $48, %rax
; AVX512VL-NEXT: shrq $32, %rcx
; AVX512VL-NEXT: shrl $16, %edx
; AVX512VL-NEXT: movswl %dx, %edx
; AVX512VL-NEXT: vmovd %edx, %xmm0
; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512VL-NEXT: vmovd %esi, %xmm1
; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512VL-NEXT: movswl %cx, %ecx
; AVX512VL-NEXT: vmovd %ecx, %xmm2
; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512VL-NEXT: cwtl
; AVX512VL-NEXT: vmovd %eax, %xmm3
; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX512VL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
; AVX512VL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX512VL-NEXT: retq
; ALL-LABEL: cvt_8i16_to_4f64:
; ALL: # %bb.0:
; ALL-NEXT: vmovq %xmm0, %rax
; ALL-NEXT: movq %rax, %rcx
; ALL-NEXT: movl %eax, %edx
; ALL-NEXT: movswl %ax, %esi
; ALL-NEXT: shrq $48, %rax
; ALL-NEXT: shrq $32, %rcx
; ALL-NEXT: shrl $16, %edx
; ALL-NEXT: movswl %dx, %edx
; ALL-NEXT: vmovd %edx, %xmm0
; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
; ALL-NEXT: vmovd %esi, %xmm1
; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
; ALL-NEXT: movswl %cx, %ecx
; ALL-NEXT: vmovd %ecx, %xmm2
; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
; ALL-NEXT: cwtl
; ALL-NEXT: vmovd %eax, %xmm3
; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
; ALL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
; ALL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
; ALL-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; ALL-NEXT: retq
%1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = bitcast <4 x i16> %1 to <4 x half>
%3 = fpext <4 x half> %2 to <4 x double>
@@ -1812,123 +1392,34 @@ define <4 x double> @load_cvt_4i16_to_4f64(<4 x i16>* %a0) nounwind {
}
define <4 x double> @load_cvt_8i16_to_4f64(<8 x i16>* %a0) nounwind {
; AVX1-LABEL: load_cvt_8i16_to_4f64:
; AVX1: # %bb.0:
; AVX1-NEXT: movq (%rdi), %rax
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: movl %eax, %edx
; AVX1-NEXT: movswl %ax, %esi
; AVX1-NEXT: shrq $48, %rax
; AVX1-NEXT: shrq $32, %rcx
; AVX1-NEXT: shrl $16, %edx
; AVX1-NEXT: movswl %dx, %edx
; AVX1-NEXT: vmovd %edx, %xmm0
; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX1-NEXT: vmovd %esi, %xmm1
; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX1-NEXT: movswl %cx, %ecx
; AVX1-NEXT: vmovd %ecx, %xmm2
; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX1-NEXT: cwtl
; AVX1-NEXT: vmovd %eax, %xmm3
; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX1-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_cvt_8i16_to_4f64:
; AVX2: # %bb.0:
; AVX2-NEXT: movq (%rdi), %rax
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: movl %eax, %edx
; AVX2-NEXT: movswl %ax, %esi
; AVX2-NEXT: shrq $48, %rax
; AVX2-NEXT: shrq $32, %rcx
; AVX2-NEXT: shrl $16, %edx
; AVX2-NEXT: movswl %dx, %edx
; AVX2-NEXT: vmovd %edx, %xmm0
; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX2-NEXT: vmovd %esi, %xmm1
; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX2-NEXT: movswl %cx, %ecx
; AVX2-NEXT: vmovd %ecx, %xmm2
; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX2-NEXT: cwtl
; AVX2-NEXT: vmovd %eax, %xmm3
; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX2-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: load_cvt_8i16_to_4f64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: movq (%rdi), %rax
; AVX512F-NEXT: movq %rax, %rcx
; AVX512F-NEXT: movl %eax, %edx
; AVX512F-NEXT: movswl %ax, %esi
; AVX512F-NEXT: shrq $48, %rax
; AVX512F-NEXT: shrq $32, %rcx
; AVX512F-NEXT: shrl $16, %edx
; AVX512F-NEXT: movswl %dx, %edx
; AVX512F-NEXT: vmovd %edx, %xmm0
; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT: vmovd %esi, %xmm1
; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512F-NEXT: movswl %cx, %ecx
; AVX512F-NEXT: vmovd %ecx, %xmm2
; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512F-NEXT: cwtl
; AVX512F-NEXT: vmovd %eax, %xmm3
; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX512F-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
; AVX512F-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: load_cvt_8i16_to_4f64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX512VL-NEXT: vpmovdw %xmm0, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; AVX512VL-NEXT: movq %rax, %rcx
; AVX512VL-NEXT: movl %eax, %edx
; AVX512VL-NEXT: movswl %ax, %esi
; AVX512VL-NEXT: shrq $48, %rax
; AVX512VL-NEXT: shrq $32, %rcx
; AVX512VL-NEXT: shrl $16, %edx
; AVX512VL-NEXT: movswl %dx, %edx
; AVX512VL-NEXT: vmovd %edx, %xmm0
; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512VL-NEXT: vmovd %esi, %xmm1
; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512VL-NEXT: movswl %cx, %ecx
; AVX512VL-NEXT: vmovd %ecx, %xmm2
; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512VL-NEXT: cwtl
; AVX512VL-NEXT: vmovd %eax, %xmm3
; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX512VL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
; AVX512VL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX512VL-NEXT: retq
; ALL-LABEL: load_cvt_8i16_to_4f64:
; ALL: # %bb.0:
; ALL-NEXT: movq (%rdi), %rax
; ALL-NEXT: movq %rax, %rcx
; ALL-NEXT: movl %eax, %edx
; ALL-NEXT: movswl %ax, %esi
; ALL-NEXT: shrq $48, %rax
; ALL-NEXT: shrq $32, %rcx
; ALL-NEXT: shrl $16, %edx
; ALL-NEXT: movswl %dx, %edx
; ALL-NEXT: vmovd %edx, %xmm0
; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
; ALL-NEXT: vmovd %esi, %xmm1
; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
; ALL-NEXT: movswl %cx, %ecx
; ALL-NEXT: vmovd %ecx, %xmm2
; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
; ALL-NEXT: cwtl
; ALL-NEXT: vmovd %eax, %xmm3
; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
; ALL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
; ALL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
; ALL-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; ALL-NEXT: retq
%1 = load <8 x i16>, <8 x i16>* %a0
%2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%3 = bitcast <4 x i16> %2 to <4 x half>


@@ -1639,29 +1639,11 @@ define i64 @trunc2i64_i64(<2 x i64> %inval) {
; AVX-NEXT: vmovq %xmm0, %rax
; AVX-NEXT: retq
;
; AVX512F-LABEL: trunc2i64_i64:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc2i64_i64:
; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: vpmovqd %xmm0, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc2i64_i64:
; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512BW-NEXT: vmovq %xmm0, %rax
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc2i64_i64:
; AVX512BWVL: # %bb.0: # %entry
; AVX512BWVL-NEXT: vpmovqd %xmm0, -{{[0-9]+}}(%rsp)
; AVX512BWVL-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; AVX512BWVL-NEXT: retq
; AVX512-LABEL: trunc2i64_i64:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: retq
entry:
%0 = trunc <2 x i64> %inval to <2 x i32>
%1 = bitcast <2 x i32> %0 to i64
@@ -1746,29 +1728,11 @@ define i64 @trunc4i32_i64(<4 x i32> %inval) {
; AVX-NEXT: vmovq %xmm0, %rax
; AVX-NEXT: retq
;
; AVX512F-LABEL: trunc4i32_i64:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc4i32_i64:
; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: vpmovdw %xmm0, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc4i32_i64:
; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512BW-NEXT: vmovq %xmm0, %rax
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc4i32_i64:
; AVX512BWVL: # %bb.0: # %entry
; AVX512BWVL-NEXT: vpmovdw %xmm0, -{{[0-9]+}}(%rsp)
; AVX512BWVL-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; AVX512BWVL-NEXT: retq
; AVX512-LABEL: trunc4i32_i64:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: retq
entry:
%0 = trunc <4 x i32> %inval to <4 x i16>
%1 = bitcast <4 x i16> %0 to i64
@@ -1849,29 +1813,11 @@ define i64 @trunc8i16_i64(<8 x i16> %inval) {
; AVX-NEXT: vmovq %xmm0, %rax
; AVX-NEXT: retq
;
; AVX512F-LABEL: trunc8i16_i64:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc8i16_i64:
; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc8i16_i64:
; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vmovq %xmm0, %rax
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc8i16_i64:
; AVX512BWVL: # %bb.0: # %entry
; AVX512BWVL-NEXT: vpmovwb %xmm0, -{{[0-9]+}}(%rsp)
; AVX512BWVL-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; AVX512BWVL-NEXT: retq
; AVX512-LABEL: trunc8i16_i64:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: retq
entry:
%0 = trunc <8 x i16> %inval to <8 x i8>
%1 = bitcast <8 x i8> %0 to i64