[FPEnv][X86][SystemZ] Use different algorithms for i64->double uint_to_fp under strictfp to avoid producing -0.0 when rounding toward negative infinity

Some of our conversion algorithms produce -0.0 when converting unsigned i64 to double while the rounding mode is round-toward-negative. This switches them to other algorithms that don't have this problem. Since it is undefined behavior to change the rounding mode with the non-strict nodes, this patch only changes the behavior for strict nodes.
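
For context, the sign comes straight from IEEE-754: when a sum or difference of two nonzero operands cancels exactly, the zero result is +0.0 in every rounding direction except round-toward-negative, where it is -0.0. A minimal standalone C++ sketch of that rule (purely illustrative, not part of the patch; it assumes an IEEE-754 host and should be built without optimizations or with FENV access enabled so the arithmetic isn't folded away):

#include <cfenv>
#include <cstdio>

int main() {
  std::fesetround(FE_DOWNWARD);            // round toward negative infinity
  volatile double A = 4503599627370496.0;  // 2^52; volatile blocks constant folding
  volatile double B = -4503599627370496.0; // -2^52
  double Z = A + B;                        // exact cancellation -> -0.0 in this mode
  std::printf("%g\n", Z);                  // prints -0 on an IEEE-754 host
  return 0;
}

The old expansions reconstruct uitofp(0) through exactly such a cancellation, which is why they hand back -0.0 instead of +0.0.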

There are still problems with unsigned i32 conversions as well, which I'll try to fix in another patch.

Fixes part of PR47393

Reviewed By: efriedma

Differential Revision: https://reviews.llvm.org/D87115
Craig Topper 2020-10-21 16:45:23 -07:00
parent e04ba2bc05
commit 9e884169a2
10 changed files with 808 additions and 366 deletions


@@ -2460,12 +2460,19 @@ SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(SDNode *Node,
assert(!isSigned && "Legalize cannot Expand SINT_TO_FP for i64 yet");
// TODO: Generalize this for use with other types.
if ((SrcVT == MVT::i32 || SrcVT == MVT::i64) && DestVT == MVT::f32) {
LLVM_DEBUG(dbgs() << "Converting unsigned i32/i64 to f32\n");
if (((SrcVT == MVT::i32 || SrcVT == MVT::i64) && DestVT == MVT::f32) ||
(SrcVT == MVT::i64 && DestVT == MVT::f64)) {
LLVM_DEBUG(dbgs() << "Converting unsigned i32/i64 to f32/f64\n");
// For unsigned conversions, convert them to signed conversions using the
// algorithm from the x86_64 __floatundisf in compiler_rt. That method
// should be valid for i32->f32 as well.
// More generally this transform should be valid if there are 3 more bits
// in the integer type than the significand. Rounding uses the first bit
// after the width of the significand and the OR of all bits after that. So
// we need to be able to OR the shifted out bit into one of the bits that
// participate in the OR.
// TODO: This really should be implemented using a branch rather than a
// select. We happen to get lucky and machinesink does the right
// thing most of the time. This would be a good candidate for a
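
As a rough scalar model of the expansion this hunk now also uses for i64->f64 (illustrative C++ only, not the DAG code; the helper name is made up), the value is halved with the shifted-out bit kept as a sticky bit, converted as a signed integer, and then doubled, so no step can manufacture a negative zero:

#include <cstdint>

// Scalar model of the __floatundisf-style signed fallback, applied to u64 -> f64.
static double U64ToF64ViaSigned(uint64_t X) {
  if (static_cast<int64_t>(X) >= 0)                    // already fits the signed range
    return static_cast<double>(static_cast<int64_t>(X));
  // Halve the value, OR-ing the shifted-out bit back in as a sticky bit so the
  // conversion rounds the same way the full 64-bit value would (this relies on
  // the "3 more bits than the significand" slack described above), then double
  // the result; the doubling is exact.
  uint64_t Halved = (X >> 1) | (X & 1);
  double D = static_cast<double>(static_cast<int64_t>(Halved));
  return D + D;
}

This corresponds to the shrq / andl $1 / orq / cmovnsq / cvtsi2sd / addsd sequences in the updated X86 tests further down, where the value select becomes a cmovns and the doubling is guarded by a jns branch.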


@@ -6528,8 +6528,13 @@ bool TargetLowering::expandFP_TO_UINT(SDNode *Node, SDValue &Result,
bool TargetLowering::expandUINT_TO_FP(SDNode *Node, SDValue &Result,
SDValue &Chain,
SelectionDAG &DAG) const {
unsigned OpNo = Node->isStrictFPOpcode() ? 1 : 0;
SDValue Src = Node->getOperand(OpNo);
// This transform is not correct for converting 0 when rounding mode is set
// to round toward negative infinity which will produce -0.0. So disable under
// strictfp.
if (Node->isStrictFPOpcode())
return false;
SDValue Src = Node->getOperand(0);
EVT SrcVT = Src.getValueType();
EVT DstVT = Node->getValueType(0);
@@ -6548,9 +6553,10 @@ bool TargetLowering::expandUINT_TO_FP(SDNode *Node, SDValue &Result,
EVT ShiftVT = getShiftAmountTy(SrcVT, DAG.getDataLayout());
// Implementation of unsigned i64 to f64 following the algorithm in
// __floatundidf in compiler_rt. This implementation has the advantage
// of performing rounding correctly, both in the default rounding mode
// and in all alternate rounding modes.
// __floatundidf in compiler_rt. This implementation performs rounding
// correctly in all rounding modes with the exception of converting 0
// when rounding toward negative infinity. In that case the fsub will produce
// -0.0. This will be added to +0.0 and produce -0.0 which is incorrect.
SDValue TwoP52 = DAG.getConstant(UINT64_C(0x4330000000000000), dl, SrcVT);
SDValue TwoP84PlusTwoP52 = DAG.getConstantFP(
BitsToDouble(UINT64_C(0x4530000000100000)), dl, DstVT);
@@ -6564,18 +6570,9 @@ bool TargetLowering::expandUINT_TO_FP(SDNode *Node, SDValue &Result,
SDValue HiOr = DAG.getNode(ISD::OR, dl, SrcVT, Hi, TwoP84);
SDValue LoFlt = DAG.getBitcast(DstVT, LoOr);
SDValue HiFlt = DAG.getBitcast(DstVT, HiOr);
if (Node->isStrictFPOpcode()) {
SDValue HiSub =
DAG.getNode(ISD::STRICT_FSUB, dl, {DstVT, MVT::Other},
{Node->getOperand(0), HiFlt, TwoP84PlusTwoP52});
Result = DAG.getNode(ISD::STRICT_FADD, dl, {DstVT, MVT::Other},
{HiSub.getValue(1), LoFlt, HiSub});
Chain = Result.getValue(1);
} else {
SDValue HiSub =
DAG.getNode(ISD::FSUB, dl, DstVT, HiFlt, TwoP84PlusTwoP52);
Result = DAG.getNode(ISD::FADD, dl, DstVT, LoFlt, HiSub);
}
SDValue HiSub =
DAG.getNode(ISD::FSUB, dl, DstVT, HiFlt, TwoP84PlusTwoP52);
Result = DAG.getNode(ISD::FADD, dl, DstVT, LoFlt, HiSub);
return true;
}
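
For reference, a scalar C++ model of the bit-trick expansion that this hunk keeps for the non-strict case (illustrative only; the helper name and the memcpy bit-casts are not from the patch):

#include <cstdint>
#include <cstring>

// Scalar model of the __floatundidf-style expansion kept for non-strict nodes.
static double U64ToF64BitTrick(uint64_t X) {
  uint64_t LoBits = (X & UINT64_C(0xFFFFFFFF)) | UINT64_C(0x4330000000000000); // 2^52 + lo32(X)
  uint64_t HiBits = (X >> 32)                  | UINT64_C(0x4530000000000000); // 2^84 + hi32(X)*2^32
  uint64_t KBits  = UINT64_C(0x4530000000100000);                              // 2^84 + 2^52
  double Lo, Hi, K;
  std::memcpy(&Lo, &LoBits, sizeof(double));
  std::memcpy(&Hi, &HiBits, sizeof(double));
  std::memcpy(&K,  &KBits,  sizeof(double));
  // Mathematically (Hi - K) + Lo == X; the single final add does the rounding.
  // But for X == 0 that add is (-2^52) + 2^52, an exact cancellation, and under
  // round-toward-negative an exact cancellation yields -0.0, not +0.0.
  return (Hi - K) + Lo;
}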


@@ -19885,6 +19885,10 @@ static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
/// 64-bit unsigned integer to double expansion.
static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// We can't use this algorithm for strict fp. It produces -0.0 instead of +0.0
// when converting 0 when rounding toward negative infinity. Caller will
// fall back to Expand when i64 is legal, or use FILD in 32-bit mode.
assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
// This algorithm is not obvious. Here is what we're trying to output:
/*
movq %rax, %xmm0
@@ -19898,8 +19902,6 @@ static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
#endif
*/
bool IsStrict = Op->isStrictFPOpcode();
unsigned OpNo = IsStrict ? 1 : 0;
SDLoc dl(Op);
LLVMContext *Context = DAG.getContext();
@@ -19921,7 +19923,7 @@ static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
// Load the 64-bit value into an XMM register.
SDValue XR1 =
DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(OpNo));
DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0));
SDValue CLod0 = DAG.getLoad(
MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
@@ -19932,35 +19934,19 @@ static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
SDValue Sub;
SDValue Chain;
// TODO: Are there any fast-math-flags to propagate here?
if (IsStrict) {
Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
{Op.getOperand(0), XR2F, CLod1});
Chain = Sub.getValue(1);
} else
Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
SDValue Result;
if (!IsStrict && Subtarget.hasSSE3() &&
if (Subtarget.hasSSE3() &&
shouldUseHorizontalOp(true, DAG, Subtarget)) {
// FIXME: Do we need a STRICT version of FHADD?
Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
} else {
SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
if (IsStrict) {
Result = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v2f64, MVT::Other},
{Chain, Shuffle, Sub});
Chain = Result.getValue(1);
} else
Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
}
Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
DAG.getIntPtrConstant(0, dl));
if (IsStrict)
return DAG.getMergeValues({Result, Chain}, dl);
return Result;
}
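
To see why the vector sequence above needs the strict-fp guard, work through x = 0 by hand with rounding toward negative infinity (a hand computation, not compiler output):

  lane0  = (2^52 + lo32(x)) - 2^52        = 2^52 - 2^52   ->  -0.0 (exact cancellation)
  lane1  = (2^84 + hi32(x)*2^32) - 2^84   = 2^84 - 2^84   ->  -0.0
  result = lane0 + lane1                  = -0.0 + -0.0   ->  -0.0, whereas uitofp(0) must be +0.0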
@@ -20286,11 +20272,14 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
return V;
if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
// The transform for i64->f64 isn't correct for 0 when rounding to negative
// infinity. It produces -0.0, so disable under strictfp.
if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64 && !IsStrict)
return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
if (SrcVT == MVT::i32 && X86ScalarSSEf64 && DstVT != MVT::f80)
return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
(DstVT == MVT::f32 || DstVT == MVT::f64))
return SDValue();
// Make a 64-bit buffer, and use it to build an FILD.


@@ -22,7 +22,7 @@ define float @f1(i64 %i) #0 {
; Test i64->f64.
define double @f2(i64 %i) #0 {
; CHECK-LABEL: f2:
; CHECK: ldgr
; CHECK: cdgbr
; CHECK: adbr
; CHECK: br %r14
%conv = call double @llvm.experimental.constrained.uitofp.f64.i64(i64 %i,


@@ -2492,38 +2492,54 @@ define double @uifdl(i64 %x) #0 {
;
; X86-SSE-LABEL: uifdl:
; X86-SSE: # %bb.0: # %entry
; X86-SSE-NEXT: subl $12, %esp
; X86-SSE-NEXT: .cfi_def_cfa_offset 16
; X86-SSE-NEXT: subl $28, %esp
; X86-SSE-NEXT: .cfi_def_cfa_offset 32
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X86-SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; X86-SSE-NEXT: subpd {{\.LCPI.*}}, %xmm0
; X86-SSE-NEXT: movapd %xmm0, %xmm1
; X86-SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; X86-SSE-NEXT: addpd %xmm0, %xmm1
; X86-SSE-NEXT: movlpd %xmm1, (%esp)
; X86-SSE-NEXT: movlps %xmm0, {{[0-9]+}}(%esp)
; X86-SSE-NEXT: shrl $31, %eax
; X86-SSE-NEXT: fildll {{[0-9]+}}(%esp)
; X86-SSE-NEXT: fadds {{\.LCPI.*}}(,%eax,4)
; X86-SSE-NEXT: fstpl {{[0-9]+}}(%esp)
; X86-SSE-NEXT: wait
; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X86-SSE-NEXT: movsd %xmm0, (%esp)
; X86-SSE-NEXT: fldl (%esp)
; X86-SSE-NEXT: wait
; X86-SSE-NEXT: addl $12, %esp
; X86-SSE-NEXT: addl $28, %esp
; X86-SSE-NEXT: .cfi_def_cfa_offset 4
; X86-SSE-NEXT: retl
;
; SSE-LABEL: uifdl:
; SSE: # %bb.0: # %entry
; SSE-NEXT: movq %rdi, %xmm1
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; SSE-NEXT: subpd {{.*}}(%rip), %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-NEXT: addpd %xmm1, %xmm0
; SSE-NEXT: movq %rdi, %rax
; SSE-NEXT: shrq %rax
; SSE-NEXT: movl %edi, %ecx
; SSE-NEXT: andl $1, %ecx
; SSE-NEXT: orq %rax, %rcx
; SSE-NEXT: testq %rdi, %rdi
; SSE-NEXT: cmovnsq %rdi, %rcx
; SSE-NEXT: cvtsi2sd %rcx, %xmm0
; SSE-NEXT: jns .LBB48_2
; SSE-NEXT: # %bb.1:
; SSE-NEXT: addsd %xmm0, %xmm0
; SSE-NEXT: .LBB48_2: # %entry
; SSE-NEXT: retq
;
; AVX1-LABEL: uifdl:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovq %rdi, %xmm0
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX1-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-NEXT: vaddpd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: movq %rdi, %rax
; AVX1-NEXT: shrq %rax
; AVX1-NEXT: movl %edi, %ecx
; AVX1-NEXT: andl $1, %ecx
; AVX1-NEXT: orq %rax, %rcx
; AVX1-NEXT: testq %rdi, %rdi
; AVX1-NEXT: cmovnsq %rdi, %rcx
; AVX1-NEXT: vcvtsi2sd %rcx, %xmm0, %xmm0
; AVX1-NEXT: jns .LBB48_2
; AVX1-NEXT: # %bb.1:
; AVX1-NEXT: vaddsd %xmm0, %xmm0, %xmm0
; AVX1-NEXT: .LBB48_2: # %entry
; AVX1-NEXT: retq
;
; AVX512-LABEL: uifdl:


@@ -1262,14 +1262,17 @@ define double @uitofp_i64tof64(i64 %x) #0 {
; SSE-X86-NEXT: movl %esp, %ebp
; SSE-X86-NEXT: .cfi_def_cfa_register %ebp
; SSE-X86-NEXT: andl $-8, %esp
; SSE-X86-NEXT: subl $8, %esp
; SSE-X86-NEXT: subl $24, %esp
; SSE-X86-NEXT: movl 12(%ebp), %eax
; SSE-X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-X86-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE-X86-NEXT: subpd {{\.LCPI.*}}, %xmm0
; SSE-X86-NEXT: movapd %xmm0, %xmm1
; SSE-X86-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-X86-NEXT: addpd %xmm0, %xmm1
; SSE-X86-NEXT: movlpd %xmm1, (%esp)
; SSE-X86-NEXT: movlps %xmm0, {{[0-9]+}}(%esp)
; SSE-X86-NEXT: shrl $31, %eax
; SSE-X86-NEXT: fildll {{[0-9]+}}(%esp)
; SSE-X86-NEXT: fadds {{\.LCPI.*}}(,%eax,4)
; SSE-X86-NEXT: fstpl {{[0-9]+}}(%esp)
; SSE-X86-NEXT: wait
; SSE-X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-X86-NEXT: movsd %xmm0, (%esp)
; SSE-X86-NEXT: fldl (%esp)
; SSE-X86-NEXT: wait
; SSE-X86-NEXT: movl %ebp, %esp
@@ -1279,12 +1282,18 @@ define double @uitofp_i64tof64(i64 %x) #0 {
;
; SSE-X64-LABEL: uitofp_i64tof64:
; SSE-X64: # %bb.0:
; SSE-X64-NEXT: movq %rdi, %xmm1
; SSE-X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; SSE-X64-NEXT: subpd {{.*}}(%rip), %xmm1
; SSE-X64-NEXT: movapd %xmm1, %xmm0
; SSE-X64-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-X64-NEXT: addpd %xmm1, %xmm0
; SSE-X64-NEXT: movq %rdi, %rax
; SSE-X64-NEXT: shrq %rax
; SSE-X64-NEXT: movl %edi, %ecx
; SSE-X64-NEXT: andl $1, %ecx
; SSE-X64-NEXT: orq %rax, %rcx
; SSE-X64-NEXT: testq %rdi, %rdi
; SSE-X64-NEXT: cmovnsq %rdi, %rcx
; SSE-X64-NEXT: cvtsi2sd %rcx, %xmm0
; SSE-X64-NEXT: jns .LBB18_2
; SSE-X64-NEXT: # %bb.1:
; SSE-X64-NEXT: addsd %xmm0, %xmm0
; SSE-X64-NEXT: .LBB18_2:
; SSE-X64-NEXT: retq
;
; AVX-X86-LABEL: uitofp_i64tof64:
@@ -1295,13 +1304,17 @@ define double @uitofp_i64tof64(i64 %x) #0 {
; AVX-X86-NEXT: movl %esp, %ebp
; AVX-X86-NEXT: .cfi_def_cfa_register %ebp
; AVX-X86-NEXT: andl $-8, %esp
; AVX-X86-NEXT: subl $8, %esp
; AVX-X86-NEXT: subl $24, %esp
; AVX-X86-NEXT: movl 12(%ebp), %eax
; AVX-X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-X86-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX-X86-NEXT: vsubpd {{\.LCPI.*}}, %xmm0, %xmm0
; AVX-X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-X86-NEXT: vaddpd %xmm0, %xmm1, %xmm0
; AVX-X86-NEXT: vmovlpd %xmm0, (%esp)
; AVX-X86-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
; AVX-X86-NEXT: shrl $31, %eax
; AVX-X86-NEXT: fildll {{[0-9]+}}(%esp)
; AVX-X86-NEXT: fadds {{\.LCPI.*}}(,%eax,4)
; AVX-X86-NEXT: fstpl {{[0-9]+}}(%esp)
; AVX-X86-NEXT: wait
; AVX-X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-X86-NEXT: vmovsd %xmm0, (%esp)
; AVX-X86-NEXT: fldl (%esp)
; AVX-X86-NEXT: wait
; AVX-X86-NEXT: movl %ebp, %esp
@@ -1311,11 +1324,18 @@ define double @uitofp_i64tof64(i64 %x) #0 {
;
; AVX1-X64-LABEL: uitofp_i64tof64:
; AVX1-X64: # %bb.0:
; AVX1-X64-NEXT: vmovq %rdi, %xmm0
; AVX1-X64-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX1-X64-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0
; AVX1-X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-X64-NEXT: vaddpd %xmm0, %xmm1, %xmm0
; AVX1-X64-NEXT: movq %rdi, %rax
; AVX1-X64-NEXT: shrq %rax
; AVX1-X64-NEXT: movl %edi, %ecx
; AVX1-X64-NEXT: andl $1, %ecx
; AVX1-X64-NEXT: orq %rax, %rcx
; AVX1-X64-NEXT: testq %rdi, %rdi
; AVX1-X64-NEXT: cmovnsq %rdi, %rcx
; AVX1-X64-NEXT: vcvtsi2sd %rcx, %xmm0, %xmm0
; AVX1-X64-NEXT: jns .LBB18_2
; AVX1-X64-NEXT: # %bb.1:
; AVX1-X64-NEXT: vaddsd %xmm0, %xmm0, %xmm0
; AVX1-X64-NEXT: .LBB18_2:
; AVX1-X64-NEXT: retq
;
; AVX512-X64-LABEL: uitofp_i64tof64:


@@ -1262,112 +1262,218 @@ define <2 x double> @sitofp_v2i64_v2f64(<2 x i64> %x) #0 {
define <2 x double> @uitofp_v2i64_v2f64(<2 x i64> %x) #0 {
; SSE-32-LABEL: uitofp_v2i64_v2f64:
; SSE-32: # %bb.0:
; SSE-32-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,0,4294967295,0]
; SSE-32-NEXT: pand %xmm0, %xmm1
; SSE-32-NEXT: por {{\.LCPI.*}}, %xmm1
; SSE-32-NEXT: psrlq $32, %xmm0
; SSE-32-NEXT: por {{\.LCPI.*}}, %xmm0
; SSE-32-NEXT: subpd {{\.LCPI.*}}, %xmm0
; SSE-32-NEXT: addpd %xmm1, %xmm0
; SSE-32-NEXT: pushl %ebp
; SSE-32-NEXT: .cfi_def_cfa_offset 8
; SSE-32-NEXT: .cfi_offset %ebp, -8
; SSE-32-NEXT: movl %esp, %ebp
; SSE-32-NEXT: .cfi_def_cfa_register %ebp
; SSE-32-NEXT: andl $-8, %esp
; SSE-32-NEXT: subl $32, %esp
; SSE-32-NEXT: movq %xmm0, {{[0-9]+}}(%esp)
; SSE-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-32-NEXT: movq %xmm1, {{[0-9]+}}(%esp)
; SSE-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE-32-NEXT: movd %xmm1, %eax
; SSE-32-NEXT: shrl $31, %eax
; SSE-32-NEXT: fildll {{[0-9]+}}(%esp)
; SSE-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4)
; SSE-32-NEXT: fstpl {{[0-9]+}}(%esp)
; SSE-32-NEXT: wait
; SSE-32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE-32-NEXT: movd %xmm0, %eax
; SSE-32-NEXT: shrl $31, %eax
; SSE-32-NEXT: fildll {{[0-9]+}}(%esp)
; SSE-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4)
; SSE-32-NEXT: fstpl (%esp)
; SSE-32-NEXT: wait
; SSE-32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-32-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; SSE-32-NEXT: movl %ebp, %esp
; SSE-32-NEXT: popl %ebp
; SSE-32-NEXT: .cfi_def_cfa %esp, 4
; SSE-32-NEXT: retl
;
; SSE-64-LABEL: uitofp_v2i64_v2f64:
; SSE-64: # %bb.0:
; SSE-64-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,4294967295]
; SSE-64-NEXT: pand %xmm0, %xmm1
; SSE-64-NEXT: por {{.*}}(%rip), %xmm1
; SSE-64-NEXT: psrlq $32, %xmm0
; SSE-64-NEXT: por {{.*}}(%rip), %xmm0
; SSE-64-NEXT: subpd {{.*}}(%rip), %xmm0
; SSE-64-NEXT: addpd %xmm1, %xmm0
; SSE-64-NEXT: movdqa %xmm0, %xmm1
; SSE-64-NEXT: movq %xmm0, %rax
; SSE-64-NEXT: movq %rax, %rcx
; SSE-64-NEXT: shrq %rcx
; SSE-64-NEXT: movl %eax, %edx
; SSE-64-NEXT: andl $1, %edx
; SSE-64-NEXT: orq %rcx, %rdx
; SSE-64-NEXT: testq %rax, %rax
; SSE-64-NEXT: cmovnsq %rax, %rdx
; SSE-64-NEXT: xorps %xmm0, %xmm0
; SSE-64-NEXT: cvtsi2sd %rdx, %xmm0
; SSE-64-NEXT: jns .LBB21_2
; SSE-64-NEXT: # %bb.1:
; SSE-64-NEXT: addsd %xmm0, %xmm0
; SSE-64-NEXT: .LBB21_2:
; SSE-64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE-64-NEXT: movq %xmm1, %rax
; SSE-64-NEXT: movq %rax, %rcx
; SSE-64-NEXT: shrq %rcx
; SSE-64-NEXT: movl %eax, %edx
; SSE-64-NEXT: andl $1, %edx
; SSE-64-NEXT: orq %rcx, %rdx
; SSE-64-NEXT: testq %rax, %rax
; SSE-64-NEXT: cmovnsq %rax, %rdx
; SSE-64-NEXT: xorps %xmm1, %xmm1
; SSE-64-NEXT: cvtsi2sd %rdx, %xmm1
; SSE-64-NEXT: jns .LBB21_4
; SSE-64-NEXT: # %bb.3:
; SSE-64-NEXT: addsd %xmm1, %xmm1
; SSE-64-NEXT: .LBB21_4:
; SSE-64-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-64-NEXT: retq
;
; SSE41-32-LABEL: uitofp_v2i64_v2f64:
; SSE41-32: # %bb.0:
; SSE41-32-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,0,4294967295,0]
; SSE41-32-NEXT: pand %xmm0, %xmm1
; SSE41-32-NEXT: por {{\.LCPI.*}}, %xmm1
; SSE41-32-NEXT: psrlq $32, %xmm0
; SSE41-32-NEXT: por {{\.LCPI.*}}, %xmm0
; SSE41-32-NEXT: subpd {{\.LCPI.*}}, %xmm0
; SSE41-32-NEXT: addpd %xmm1, %xmm0
; SSE41-32-NEXT: pushl %ebp
; SSE41-32-NEXT: .cfi_def_cfa_offset 8
; SSE41-32-NEXT: .cfi_offset %ebp, -8
; SSE41-32-NEXT: movl %esp, %ebp
; SSE41-32-NEXT: .cfi_def_cfa_register %ebp
; SSE41-32-NEXT: andl $-8, %esp
; SSE41-32-NEXT: subl $32, %esp
; SSE41-32-NEXT: movq %xmm0, {{[0-9]+}}(%esp)
; SSE41-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE41-32-NEXT: movq %xmm1, {{[0-9]+}}(%esp)
; SSE41-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE41-32-NEXT: movd %xmm1, %eax
; SSE41-32-NEXT: shrl $31, %eax
; SSE41-32-NEXT: fildll {{[0-9]+}}(%esp)
; SSE41-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4)
; SSE41-32-NEXT: fstpl {{[0-9]+}}(%esp)
; SSE41-32-NEXT: wait
; SSE41-32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE41-32-NEXT: movd %xmm0, %eax
; SSE41-32-NEXT: shrl $31, %eax
; SSE41-32-NEXT: fildll {{[0-9]+}}(%esp)
; SSE41-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4)
; SSE41-32-NEXT: fstpl (%esp)
; SSE41-32-NEXT: wait
; SSE41-32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; SSE41-32-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; SSE41-32-NEXT: movl %ebp, %esp
; SSE41-32-NEXT: popl %ebp
; SSE41-32-NEXT: .cfi_def_cfa %esp, 4
; SSE41-32-NEXT: retl
;
; SSE41-64-LABEL: uitofp_v2i64_v2f64:
; SSE41-64: # %bb.0:
; SSE41-64-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,4294967295]
; SSE41-64-NEXT: pand %xmm0, %xmm1
; SSE41-64-NEXT: por {{.*}}(%rip), %xmm1
; SSE41-64-NEXT: psrlq $32, %xmm0
; SSE41-64-NEXT: por {{.*}}(%rip), %xmm0
; SSE41-64-NEXT: subpd {{.*}}(%rip), %xmm0
; SSE41-64-NEXT: addpd %xmm1, %xmm0
; SSE41-64-NEXT: movdqa %xmm0, %xmm1
; SSE41-64-NEXT: movq %xmm0, %rax
; SSE41-64-NEXT: movq %rax, %rcx
; SSE41-64-NEXT: shrq %rcx
; SSE41-64-NEXT: movl %eax, %edx
; SSE41-64-NEXT: andl $1, %edx
; SSE41-64-NEXT: orq %rcx, %rdx
; SSE41-64-NEXT: testq %rax, %rax
; SSE41-64-NEXT: cmovnsq %rax, %rdx
; SSE41-64-NEXT: xorps %xmm0, %xmm0
; SSE41-64-NEXT: cvtsi2sd %rdx, %xmm0
; SSE41-64-NEXT: jns .LBB21_2
; SSE41-64-NEXT: # %bb.1:
; SSE41-64-NEXT: addsd %xmm0, %xmm0
; SSE41-64-NEXT: .LBB21_2:
; SSE41-64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE41-64-NEXT: movq %xmm1, %rax
; SSE41-64-NEXT: movq %rax, %rcx
; SSE41-64-NEXT: shrq %rcx
; SSE41-64-NEXT: movl %eax, %edx
; SSE41-64-NEXT: andl $1, %edx
; SSE41-64-NEXT: orq %rcx, %rdx
; SSE41-64-NEXT: testq %rax, %rax
; SSE41-64-NEXT: cmovnsq %rax, %rdx
; SSE41-64-NEXT: xorps %xmm1, %xmm1
; SSE41-64-NEXT: cvtsi2sd %rdx, %xmm1
; SSE41-64-NEXT: jns .LBB21_4
; SSE41-64-NEXT: # %bb.3:
; SSE41-64-NEXT: addsd %xmm1, %xmm1
; SSE41-64-NEXT: .LBB21_4:
; SSE41-64-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-64-NEXT: retq
;
; AVX1-32-LABEL: uitofp_v2i64_v2f64:
; AVX1-32: # %bb.0:
; AVX1-32-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-32-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX1-32-NEXT: vpor {{\.LCPI.*}}, %xmm1, %xmm1
; AVX1-32-NEXT: vpsrlq $32, %xmm0, %xmm0
; AVX1-32-NEXT: vpor {{\.LCPI.*}}, %xmm0, %xmm0
; AVX1-32-NEXT: vsubpd {{\.LCPI.*}}, %xmm0, %xmm0
; AVX1-32-NEXT: vaddpd %xmm0, %xmm1, %xmm0
; AVX1-32-NEXT: retl
; AVX-32-LABEL: uitofp_v2i64_v2f64:
; AVX-32: # %bb.0:
; AVX-32-NEXT: pushl %ebp
; AVX-32-NEXT: .cfi_def_cfa_offset 8
; AVX-32-NEXT: .cfi_offset %ebp, -8
; AVX-32-NEXT: movl %esp, %ebp
; AVX-32-NEXT: .cfi_def_cfa_register %ebp
; AVX-32-NEXT: andl $-8, %esp
; AVX-32-NEXT: subl $32, %esp
; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
; AVX-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
; AVX-32-NEXT: vextractps $1, %xmm0, %eax
; AVX-32-NEXT: shrl $31, %eax
; AVX-32-NEXT: fildll {{[0-9]+}}(%esp)
; AVX-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4)
; AVX-32-NEXT: fstpl {{[0-9]+}}(%esp)
; AVX-32-NEXT: wait
; AVX-32-NEXT: vextractps $3, %xmm0, %eax
; AVX-32-NEXT: shrl $31, %eax
; AVX-32-NEXT: fildll {{[0-9]+}}(%esp)
; AVX-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4)
; AVX-32-NEXT: fstpl (%esp)
; AVX-32-NEXT: wait
; AVX-32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-32-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; AVX-32-NEXT: movl %ebp, %esp
; AVX-32-NEXT: popl %ebp
; AVX-32-NEXT: .cfi_def_cfa %esp, 4
; AVX-32-NEXT: retl
;
; AVX1-64-LABEL: uitofp_v2i64_v2f64:
; AVX1-64: # %bb.0:
; AVX1-64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-64-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX1-64-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1
; AVX1-64-NEXT: vpsrlq $32, %xmm0, %xmm0
; AVX1-64-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX1-64-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0
; AVX1-64-NEXT: vaddpd %xmm0, %xmm1, %xmm0
; AVX1-64-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-64-NEXT: movq %rax, %rcx
; AVX1-64-NEXT: shrq %rcx
; AVX1-64-NEXT: movl %eax, %edx
; AVX1-64-NEXT: andl $1, %edx
; AVX1-64-NEXT: orq %rcx, %rdx
; AVX1-64-NEXT: testq %rax, %rax
; AVX1-64-NEXT: cmovnsq %rax, %rdx
; AVX1-64-NEXT: vcvtsi2sd %rdx, %xmm1, %xmm1
; AVX1-64-NEXT: jns .LBB21_2
; AVX1-64-NEXT: # %bb.1:
; AVX1-64-NEXT: vaddsd %xmm1, %xmm1, %xmm1
; AVX1-64-NEXT: .LBB21_2:
; AVX1-64-NEXT: vmovq %xmm0, %rax
; AVX1-64-NEXT: movq %rax, %rcx
; AVX1-64-NEXT: shrq %rcx
; AVX1-64-NEXT: movl %eax, %edx
; AVX1-64-NEXT: andl $1, %edx
; AVX1-64-NEXT: orq %rcx, %rdx
; AVX1-64-NEXT: testq %rax, %rax
; AVX1-64-NEXT: cmovnsq %rax, %rdx
; AVX1-64-NEXT: vcvtsi2sd %rdx, %xmm2, %xmm0
; AVX1-64-NEXT: jns .LBB21_4
; AVX1-64-NEXT: # %bb.3:
; AVX1-64-NEXT: vaddsd %xmm0, %xmm0, %xmm0
; AVX1-64-NEXT: .LBB21_4:
; AVX1-64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-64-NEXT: retq
;
; AVX512F-32-LABEL: uitofp_v2i64_v2f64:
; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-32-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX512F-32-NEXT: vpor {{\.LCPI.*}}, %xmm1, %xmm1
; AVX512F-32-NEXT: vpsrlq $32, %xmm0, %xmm0
; AVX512F-32-NEXT: vpor {{\.LCPI.*}}, %xmm0, %xmm0
; AVX512F-32-NEXT: vsubpd {{\.LCPI.*}}, %xmm0, %xmm0
; AVX512F-32-NEXT: vaddpd %xmm0, %xmm1, %xmm0
; AVX512F-32-NEXT: retl
;
; AVX512F-64-LABEL: uitofp_v2i64_v2f64:
; AVX512F-64: # %bb.0:
; AVX512F-64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-64-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX512F-64-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1
; AVX512F-64-NEXT: vpsrlq $32, %xmm0, %xmm0
; AVX512F-64-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX512F-64-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0
; AVX512F-64-NEXT: vaddpd %xmm0, %xmm1, %xmm0
; AVX512F-64-NEXT: vpextrq $1, %xmm0, %rax
; AVX512F-64-NEXT: vcvtusi2sd %rax, %xmm1, %xmm1
; AVX512F-64-NEXT: vmovq %xmm0, %rax
; AVX512F-64-NEXT: vcvtusi2sd %rax, %xmm2, %xmm0
; AVX512F-64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-64-NEXT: retq
;
; AVX512VL-32-LABEL: uitofp_v2i64_v2f64:
; AVX512VL-32: # %bb.0:
; AVX512VL-32-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-32-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX512VL-32-NEXT: vpor {{\.LCPI.*}}, %xmm1, %xmm1
; AVX512VL-32-NEXT: vpsrlq $32, %xmm0, %xmm0
; AVX512VL-32-NEXT: vpor {{\.LCPI.*}}, %xmm0, %xmm0
; AVX512VL-32-NEXT: vsubpd {{\.LCPI.*}}, %xmm0, %xmm0
; AVX512VL-32-NEXT: vaddpd %xmm0, %xmm1, %xmm0
; AVX512VL-32-NEXT: retl
;
; AVX512VL-64-LABEL: uitofp_v2i64_v2f64:
; AVX512VL-64: # %bb.0:
; AVX512VL-64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-64-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX512VL-64-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1
; AVX512VL-64-NEXT: vpsrlq $32, %xmm0, %xmm0
; AVX512VL-64-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-64-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-64-NEXT: vaddpd %xmm0, %xmm1, %xmm0
; AVX512VL-64-NEXT: vpextrq $1, %xmm0, %rax
; AVX512VL-64-NEXT: vcvtusi2sd %rax, %xmm1, %xmm1
; AVX512VL-64-NEXT: vmovq %xmm0, %rax
; AVX512VL-64-NEXT: vcvtusi2sd %rax, %xmm2, %xmm0
; AVX512VL-64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-64-NEXT: retq
;
; AVX512DQ-LABEL: uitofp_v2i64_v2f64:


@@ -748,106 +748,154 @@ define <4 x double> @sitofp_v4i64_v4f64(<4 x i64> %x) #0 {
}
define <4 x double> @uitofp_v4i64_v4f64(<4 x i64> %x) #0 {
; AVX1-32-LABEL: uitofp_v4i64_v4f64:
; AVX1-32: # %bb.0:
; AVX1-32-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX1-32-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX1-32-NEXT: vorps {{\.LCPI.*}}, %ymm1, %ymm1
; AVX1-32-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX1-32-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-32-NEXT: vpsrlq $32, %xmm0, %xmm0
; AVX1-32-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-32-NEXT: vorpd {{\.LCPI.*}}, %ymm0, %ymm0
; AVX1-32-NEXT: vsubpd {{\.LCPI.*}}, %ymm0, %ymm0
; AVX1-32-NEXT: vaddpd %ymm0, %ymm1, %ymm0
; AVX1-32-NEXT: retl
; AVX-32-LABEL: uitofp_v4i64_v4f64:
; AVX-32: # %bb.0:
; AVX-32-NEXT: pushl %ebp
; AVX-32-NEXT: .cfi_def_cfa_offset 8
; AVX-32-NEXT: .cfi_offset %ebp, -8
; AVX-32-NEXT: movl %esp, %ebp
; AVX-32-NEXT: .cfi_def_cfa_register %ebp
; AVX-32-NEXT: andl $-8, %esp
; AVX-32-NEXT: subl $64, %esp
; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
; AVX-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
; AVX-32-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
; AVX-32-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[2,3,2,3]
; AVX-32-NEXT: vmovlps %xmm2, {{[0-9]+}}(%esp)
; AVX-32-NEXT: vextractps $1, %xmm0, %eax
; AVX-32-NEXT: shrl $31, %eax
; AVX-32-NEXT: fildll {{[0-9]+}}(%esp)
; AVX-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4)
; AVX-32-NEXT: fstpl (%esp)
; AVX-32-NEXT: wait
; AVX-32-NEXT: vextractps $3, %xmm0, %eax
; AVX-32-NEXT: shrl $31, %eax
; AVX-32-NEXT: fildll {{[0-9]+}}(%esp)
; AVX-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4)
; AVX-32-NEXT: fstpl {{[0-9]+}}(%esp)
; AVX-32-NEXT: wait
; AVX-32-NEXT: vextractps $1, %xmm1, %eax
; AVX-32-NEXT: shrl $31, %eax
; AVX-32-NEXT: fildll {{[0-9]+}}(%esp)
; AVX-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4)
; AVX-32-NEXT: fstpl {{[0-9]+}}(%esp)
; AVX-32-NEXT: wait
; AVX-32-NEXT: vextractps $3, %xmm1, %eax
; AVX-32-NEXT: shrl $31, %eax
; AVX-32-NEXT: fildll {{[0-9]+}}(%esp)
; AVX-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4)
; AVX-32-NEXT: fstpl {{[0-9]+}}(%esp)
; AVX-32-NEXT: wait
; AVX-32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-32-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; AVX-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-32-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
; AVX-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-32-NEXT: movl %ebp, %esp
; AVX-32-NEXT: popl %ebp
; AVX-32-NEXT: .cfi_def_cfa %esp, 4
; AVX-32-NEXT: retl
;
; AVX1-64-LABEL: uitofp_v4i64_v4f64:
; AVX1-64: # %bb.0:
; AVX1-64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX1-64-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX1-64-NEXT: vorps {{.*}}(%rip), %ymm1, %ymm1
; AVX1-64-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX1-64-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-64-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-64-NEXT: vpextrq $1, %xmm2, %rax
; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm3, %xmm3
; AVX1-64-NEXT: vmovq %xmm2, %rax
; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm2
; AVX1-64-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX1-64-NEXT: vpextrq $1, %xmm1, %rax
; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3
; AVX1-64-NEXT: vmovq %xmm1, %rax
; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm1
; AVX1-64-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; AVX1-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-64-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-64-NEXT: vpsrlq $32, %xmm2, %xmm2
; AVX1-64-NEXT: vpextrq $1, %xmm2, %rax
; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3
; AVX1-64-NEXT: vmovq %xmm2, %rax
; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm2
; AVX1-64-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX1-64-NEXT: vpsrlq $32, %xmm0, %xmm0
; AVX1-64-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-64-NEXT: vorpd {{.*}}(%rip), %ymm0, %ymm0
; AVX1-64-NEXT: vsubpd {{.*}}(%rip), %ymm0, %ymm0
; AVX1-64-NEXT: vaddpd %ymm0, %ymm1, %ymm0
; AVX1-64-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3
; AVX1-64-NEXT: vmovq %xmm0, %rax
; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm0
; AVX1-64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; AVX1-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-64-NEXT: vmulpd {{.*}}(%rip), %ymm0, %ymm0
; AVX1-64-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX1-64-NEXT: retq
;
; AVX2-32-LABEL: uitofp_v4i64_v4f64:
; AVX2-32: # %bb.0:
; AVX2-32-NEXT: vpsrlq $32, %ymm0, %ymm1
; AVX2-32-NEXT: vpor {{\.LCPI.*}}, %ymm1, %ymm1
; AVX2-32-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25]
; AVX2-32-NEXT: vsubpd %ymm2, %ymm1, %ymm1
; AVX2-32-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; AVX2-32-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
; AVX2-32-NEXT: vpor {{\.LCPI.*}}, %ymm0, %ymm0
; AVX2-32-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX2-32-NEXT: retl
;
; AVX2-64-LABEL: uitofp_v4i64_v4f64:
; AVX2-64: # %bb.0:
; AVX2-64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-64-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX2-64-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200]
; AVX2-64-NEXT: vpor %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT: vpsrlq $32, %ymm0, %ymm0
; AVX2-64-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072]
; AVX2-64-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25]
; AVX2-64-NEXT: vsubpd %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT: vpsrlq $32, %ymm0, %ymm1
; AVX2-64-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-64-NEXT: vpextrq $1, %xmm2, %rax
; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm3, %xmm3
; AVX2-64-NEXT: vmovq %xmm2, %rax
; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm2
; AVX2-64-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX2-64-NEXT: vpextrq $1, %xmm1, %rax
; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3
; AVX2-64-NEXT: vmovq %xmm1, %rax
; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm1
; AVX2-64-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; AVX2-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX2-64-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4.294967296E+9,4.294967296E+9,4.294967296E+9,4.294967296E+9]
; AVX2-64-NEXT: vmulpd %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; AVX2-64-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
; AVX2-64-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-64-NEXT: vpextrq $1, %xmm2, %rax
; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3
; AVX2-64-NEXT: vmovq %xmm2, %rax
; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm2
; AVX2-64-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX2-64-NEXT: vpextrq $1, %xmm0, %rax
; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3
; AVX2-64-NEXT: vmovq %xmm0, %rax
; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm0
; AVX2-64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; AVX2-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX2-64-NEXT: vaddpd %ymm0, %ymm1, %ymm0
; AVX2-64-NEXT: retq
;
; AVX512F-32-LABEL: uitofp_v4i64_v4f64:
; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpsrlq $32, %ymm0, %ymm1
; AVX512F-32-NEXT: vpor {{\.LCPI.*}}, %ymm1, %ymm1
; AVX512F-32-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25]
; AVX512F-32-NEXT: vsubpd %ymm2, %ymm1, %ymm1
; AVX512F-32-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; AVX512F-32-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
; AVX512F-32-NEXT: vpor {{\.LCPI.*}}, %ymm0, %ymm0
; AVX512F-32-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX512F-32-NEXT: retl
;
; AVX512F-64-LABEL: uitofp_v4i64_v4f64:
; AVX512F-64: # %bb.0:
; AVX512F-64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-64-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX512F-64-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200]
; AVX512F-64-NEXT: vpor %ymm2, %ymm1, %ymm1
; AVX512F-64-NEXT: vpsrlq $32, %ymm0, %ymm0
; AVX512F-64-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072]
; AVX512F-64-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX512F-64-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25]
; AVX512F-64-NEXT: vsubpd %ymm2, %ymm0, %ymm0
; AVX512F-64-NEXT: vaddpd %ymm0, %ymm1, %ymm0
; AVX512F-64-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-64-NEXT: vpextrq $1, %xmm1, %rax
; AVX512F-64-NEXT: vcvtusi2sd %rax, %xmm2, %xmm2
; AVX512F-64-NEXT: vmovq %xmm1, %rax
; AVX512F-64-NEXT: vcvtusi2sd %rax, %xmm3, %xmm1
; AVX512F-64-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX512F-64-NEXT: vpextrq $1, %xmm0, %rax
; AVX512F-64-NEXT: vcvtusi2sd %rax, %xmm3, %xmm2
; AVX512F-64-NEXT: vmovq %xmm0, %rax
; AVX512F-64-NEXT: vcvtusi2sd %rax, %xmm3, %xmm0
; AVX512F-64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX512F-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-64-NEXT: retq
;
; AVX512VL-32-LABEL: uitofp_v4i64_v4f64:
; AVX512VL-32: # %bb.0:
; AVX512VL-32-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-32-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX512VL-32-NEXT: vpor {{\.LCPI.*}}, %ymm1, %ymm1
; AVX512VL-32-NEXT: vpsrlq $32, %ymm0, %ymm0
; AVX512VL-32-NEXT: vpor {{\.LCPI.*}}, %ymm0, %ymm0
; AVX512VL-32-NEXT: vsubpd {{\.LCPI.*}}{1to4}, %ymm0, %ymm0
; AVX512VL-32-NEXT: vaddpd %ymm0, %ymm1, %ymm0
; AVX512VL-32-NEXT: retl
;
; AVX512VL-64-LABEL: uitofp_v4i64_v4f64:
; AVX512VL-64: # %bb.0:
; AVX512VL-64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-64-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX512VL-64-NEXT: vporq {{.*}}(%rip){1to4}, %ymm1, %ymm1
; AVX512VL-64-NEXT: vpsrlq $32, %ymm0, %ymm0
; AVX512VL-64-NEXT: vporq {{.*}}(%rip){1to4}, %ymm0, %ymm0
; AVX512VL-64-NEXT: vsubpd {{.*}}(%rip){1to4}, %ymm0, %ymm0
; AVX512VL-64-NEXT: vaddpd %ymm0, %ymm1, %ymm0
; AVX512VL-64-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-64-NEXT: vpextrq $1, %xmm1, %rax
; AVX512VL-64-NEXT: vcvtusi2sd %rax, %xmm2, %xmm2
; AVX512VL-64-NEXT: vmovq %xmm1, %rax
; AVX512VL-64-NEXT: vcvtusi2sd %rax, %xmm3, %xmm1
; AVX512VL-64-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX512VL-64-NEXT: vpextrq $1, %xmm0, %rax
; AVX512VL-64-NEXT: vcvtusi2sd %rax, %xmm3, %xmm2
; AVX512VL-64-NEXT: vmovq %xmm0, %rax
; AVX512VL-64-NEXT: vcvtusi2sd %rax, %xmm3, %xmm0
; AVX512VL-64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX512VL-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-64-NEXT: retq
;
; AVX512DQ-LABEL: uitofp_v4i64_v4f64:


@@ -362,22 +362,120 @@ define <8 x double> @sitofp_v8i64_v8f64(<8 x i64> %x) #0 {
define <8 x double> @uitofp_v8i64_v8f64(<8 x i64> %x) #0 {
; NODQ-32-LABEL: uitofp_v8i64_v8f64:
; NODQ-32: # %bb.0:
; NODQ-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1127219200,0,1127219200,0,1127219200,0,1127219200,0,1127219200,0,1127219200,0,1127219200,0,1127219200]
; NODQ-32-NEXT: vpternlogq $248, {{\.LCPI.*}}, %zmm0, %zmm1
; NODQ-32-NEXT: vpsrlq $32, %zmm0, %zmm0
; NODQ-32-NEXT: vporq {{\.LCPI.*}}, %zmm0, %zmm0
; NODQ-32-NEXT: vsubpd {{\.LCPI.*}}{1to8}, %zmm0, %zmm0
; NODQ-32-NEXT: vaddpd %zmm0, %zmm1, %zmm0
; NODQ-32-NEXT: pushl %ebp
; NODQ-32-NEXT: .cfi_def_cfa_offset 8
; NODQ-32-NEXT: .cfi_offset %ebp, -8
; NODQ-32-NEXT: movl %esp, %ebp
; NODQ-32-NEXT: .cfi_def_cfa_register %ebp
; NODQ-32-NEXT: andl $-8, %esp
; NODQ-32-NEXT: subl $128, %esp
; NODQ-32-NEXT: vextractf32x4 $2, %zmm0, %xmm2
; NODQ-32-NEXT: vmovlps %xmm2, {{[0-9]+}}(%esp)
; NODQ-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[2,3,2,3]
; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
; NODQ-32-NEXT: vextractf32x4 $3, %zmm0, %xmm3
; NODQ-32-NEXT: vmovlps %xmm3, {{[0-9]+}}(%esp)
; NODQ-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[2,3,2,3]
; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
; NODQ-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
; NODQ-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
; NODQ-32-NEXT: vextractf128 $1, %ymm0, %xmm1
; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
; NODQ-32-NEXT: vpermilps {{.*#+}} xmm4 = xmm1[2,3,2,3]
; NODQ-32-NEXT: vmovlps %xmm4, {{[0-9]+}}(%esp)
; NODQ-32-NEXT: vextractps $1, %xmm2, %eax
; NODQ-32-NEXT: shrl $31, %eax
; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp)
; NODQ-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4)
; NODQ-32-NEXT: fstpl {{[0-9]+}}(%esp)
; NODQ-32-NEXT: wait
; NODQ-32-NEXT: vextractps $3, %xmm2, %eax
; NODQ-32-NEXT: shrl $31, %eax
; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp)
; NODQ-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4)
; NODQ-32-NEXT: fstpl {{[0-9]+}}(%esp)
; NODQ-32-NEXT: wait
; NODQ-32-NEXT: vextractps $1, %xmm3, %eax
; NODQ-32-NEXT: shrl $31, %eax
; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp)
; NODQ-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4)
; NODQ-32-NEXT: fstpl {{[0-9]+}}(%esp)
; NODQ-32-NEXT: wait
; NODQ-32-NEXT: vextractps $3, %xmm3, %eax
; NODQ-32-NEXT: shrl $31, %eax
; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp)
; NODQ-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4)
; NODQ-32-NEXT: fstpl {{[0-9]+}}(%esp)
; NODQ-32-NEXT: wait
; NODQ-32-NEXT: vextractps $1, %xmm0, %eax
; NODQ-32-NEXT: shrl $31, %eax
; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp)
; NODQ-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4)
; NODQ-32-NEXT: fstpl (%esp)
; NODQ-32-NEXT: wait
; NODQ-32-NEXT: vextractps $3, %xmm0, %eax
; NODQ-32-NEXT: shrl $31, %eax
; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp)
; NODQ-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4)
; NODQ-32-NEXT: fstpl {{[0-9]+}}(%esp)
; NODQ-32-NEXT: wait
; NODQ-32-NEXT: vextractps $1, %xmm1, %eax
; NODQ-32-NEXT: shrl $31, %eax
; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp)
; NODQ-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4)
; NODQ-32-NEXT: fstpl {{[0-9]+}}(%esp)
; NODQ-32-NEXT: wait
; NODQ-32-NEXT: vextractps $3, %xmm1, %eax
; NODQ-32-NEXT: shrl $31, %eax
; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp)
; NODQ-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4)
; NODQ-32-NEXT: fstpl {{[0-9]+}}(%esp)
; NODQ-32-NEXT: wait
; NODQ-32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; NODQ-32-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; NODQ-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; NODQ-32-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
; NODQ-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; NODQ-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; NODQ-32-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
; NODQ-32-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
; NODQ-32-NEXT: vmovhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1]
; NODQ-32-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; NODQ-32-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; NODQ-32-NEXT: movl %ebp, %esp
; NODQ-32-NEXT: popl %ebp
; NODQ-32-NEXT: .cfi_def_cfa %esp, 4
; NODQ-32-NEXT: retl
;
; NODQ-64-LABEL: uitofp_v8i64_v8f64:
; NODQ-64: # %bb.0:
; NODQ-64-NEXT: vpbroadcastq {{.*#+}} zmm1 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200]
; NODQ-64-NEXT: vpternlogq $248, {{.*}}(%rip){1to8}, %zmm0, %zmm1
; NODQ-64-NEXT: vpsrlq $32, %zmm0, %zmm0
; NODQ-64-NEXT: vporq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; NODQ-64-NEXT: vsubpd {{.*}}(%rip){1to8}, %zmm0, %zmm0
; NODQ-64-NEXT: vaddpd %zmm0, %zmm1, %zmm0
; NODQ-64-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; NODQ-64-NEXT: vpextrq $1, %xmm1, %rax
; NODQ-64-NEXT: vcvtusi2sd %rax, %xmm2, %xmm2
; NODQ-64-NEXT: vmovq %xmm1, %rax
; NODQ-64-NEXT: vcvtusi2sd %rax, %xmm3, %xmm1
; NODQ-64-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; NODQ-64-NEXT: vextracti32x4 $2, %zmm0, %xmm2
; NODQ-64-NEXT: vpextrq $1, %xmm2, %rax
; NODQ-64-NEXT: vcvtusi2sd %rax, %xmm3, %xmm3
; NODQ-64-NEXT: vmovq %xmm2, %rax
; NODQ-64-NEXT: vcvtusi2sd %rax, %xmm4, %xmm2
; NODQ-64-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; NODQ-64-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; NODQ-64-NEXT: vextracti128 $1, %ymm0, %xmm2
; NODQ-64-NEXT: vpextrq $1, %xmm2, %rax
; NODQ-64-NEXT: vcvtusi2sd %rax, %xmm4, %xmm3
; NODQ-64-NEXT: vmovq %xmm2, %rax
; NODQ-64-NEXT: vcvtusi2sd %rax, %xmm4, %xmm2
; NODQ-64-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; NODQ-64-NEXT: vpextrq $1, %xmm0, %rax
; NODQ-64-NEXT: vcvtusi2sd %rax, %xmm4, %xmm3
; NODQ-64-NEXT: vmovq %xmm0, %rax
; NODQ-64-NEXT: vcvtusi2sd %rax, %xmm4, %xmm0
; NODQ-64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; NODQ-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; NODQ-64-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; NODQ-64-NEXT: retq
;
; DQ-LABEL: uitofp_v8i64_v8f64:


@@ -6761,21 +6761,34 @@ entry:
define <1 x double> @constrained_vector_uitofp_v1f64_v1i64(<1 x i64> %x) #0 {
; CHECK-LABEL: constrained_vector_uitofp_v1f64_v1i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movq %rdi, %xmm1
; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; CHECK-NEXT: subpd {{.*}}(%rip), %xmm1
; CHECK-NEXT: movapd %xmm1, %xmm0
; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; CHECK-NEXT: addpd %xmm1, %xmm0
; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: shrq %rax
; CHECK-NEXT: movl %edi, %ecx
; CHECK-NEXT: andl $1, %ecx
; CHECK-NEXT: orq %rax, %rcx
; CHECK-NEXT: testq %rdi, %rdi
; CHECK-NEXT: cmovnsq %rdi, %rcx
; CHECK-NEXT: cvtsi2sd %rcx, %xmm0
; CHECK-NEXT: jns .LBB169_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: addsd %xmm0, %xmm0
; CHECK-NEXT: .LBB169_2: # %entry
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_uitofp_v1f64_v1i64:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovq %rdi, %xmm0
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX1-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-NEXT: vaddpd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: movq %rdi, %rax
; AVX1-NEXT: shrq %rax
; AVX1-NEXT: movl %edi, %ecx
; AVX1-NEXT: andl $1, %ecx
; AVX1-NEXT: orq %rax, %rcx
; AVX1-NEXT: testq %rdi, %rdi
; AVX1-NEXT: cmovnsq %rdi, %rcx
; AVX1-NEXT: vcvtsi2sd %rcx, %xmm0, %xmm0
; AVX1-NEXT: jns .LBB169_2
; AVX1-NEXT: # %bb.1:
; AVX1-NEXT: vaddsd %xmm0, %xmm0, %xmm0
; AVX1-NEXT: .LBB169_2: # %entry
; AVX1-NEXT: retq
;
; AVX512-LABEL: constrained_vector_uitofp_v1f64_v1i64:
@@ -6906,35 +6919,77 @@ entry:
define <2 x double> @constrained_vector_uitofp_v2f64_v2i64(<2 x i64> %x) #0 {
; CHECK-LABEL: constrained_vector_uitofp_v2f64_v2i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,4294967295]
; CHECK-NEXT: pand %xmm0, %xmm1
; CHECK-NEXT: por {{.*}}(%rip), %xmm1
; CHECK-NEXT: psrlq $32, %xmm0
; CHECK-NEXT: por {{.*}}(%rip), %xmm0
; CHECK-NEXT: subpd {{.*}}(%rip), %xmm0
; CHECK-NEXT: addpd %xmm1, %xmm0
; CHECK-NEXT: movdqa %xmm0, %xmm1
; CHECK-NEXT: movq %xmm0, %rax
; CHECK-NEXT: movq %rax, %rcx
; CHECK-NEXT: shrq %rcx
; CHECK-NEXT: movl %eax, %edx
; CHECK-NEXT: andl $1, %edx
; CHECK-NEXT: orq %rcx, %rdx
; CHECK-NEXT: testq %rax, %rax
; CHECK-NEXT: cmovnsq %rax, %rdx
; CHECK-NEXT: xorps %xmm0, %xmm0
; CHECK-NEXT: cvtsi2sd %rdx, %xmm0
; CHECK-NEXT: jns .LBB173_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: addsd %xmm0, %xmm0
; CHECK-NEXT: .LBB173_2: # %entry
; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; CHECK-NEXT: movq %xmm1, %rax
; CHECK-NEXT: movq %rax, %rcx
; CHECK-NEXT: shrq %rcx
; CHECK-NEXT: movl %eax, %edx
; CHECK-NEXT: andl $1, %edx
; CHECK-NEXT: orq %rcx, %rdx
; CHECK-NEXT: testq %rax, %rax
; CHECK-NEXT: cmovnsq %rax, %rdx
; CHECK-NEXT: xorps %xmm1, %xmm1
; CHECK-NEXT: cvtsi2sd %rdx, %xmm1
; CHECK-NEXT: jns .LBB173_4
; CHECK-NEXT: # %bb.3:
; CHECK-NEXT: addsd %xmm1, %xmm1
; CHECK-NEXT: .LBB173_4: # %entry
; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_uitofp_v2f64_v2i64:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX1-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vaddpd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: shrq %rcx
; AVX1-NEXT: movl %eax, %edx
; AVX1-NEXT: andl $1, %edx
; AVX1-NEXT: orq %rcx, %rdx
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: cmovnsq %rax, %rdx
; AVX1-NEXT: vcvtsi2sd %rdx, %xmm1, %xmm1
; AVX1-NEXT: jns .LBB173_2
; AVX1-NEXT: # %bb.1:
; AVX1-NEXT: vaddsd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: .LBB173_2: # %entry
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: shrq %rcx
; AVX1-NEXT: movl %eax, %edx
; AVX1-NEXT: andl $1, %edx
; AVX1-NEXT: orq %rcx, %rdx
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: cmovnsq %rax, %rdx
; AVX1-NEXT: vcvtsi2sd %rdx, %xmm2, %xmm0
; AVX1-NEXT: jns .LBB173_4
; AVX1-NEXT: # %bb.3:
; AVX1-NEXT: vaddsd %xmm0, %xmm0, %xmm0
; AVX1-NEXT: .LBB173_4: # %entry
; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: retq
;
; AVX512F-LABEL: constrained_vector_uitofp_v2f64_v2i64:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT: vpsrlq $32, %xmm0, %xmm0
; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT: vaddpd %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
; AVX512F-NEXT: vcvtusi2sd %rax, %xmm1, %xmm1
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: vcvtusi2sd %rax, %xmm2, %xmm0
; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: constrained_vector_uitofp_v2f64_v2i64:
@@ -7124,51 +7179,91 @@ entry:
define <3 x double> @constrained_vector_uitofp_v3f64_v3i64(<3 x i64> %x) #0 {
; CHECK-LABEL: constrained_vector_uitofp_v3f64_v3i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movq %rdi, %xmm1
; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; CHECK-NEXT: movapd {{.*#+}} xmm3 = [4.503599627370496E+15,1.9342813113834067E+25]
; CHECK-NEXT: subpd %xmm3, %xmm1
; CHECK-NEXT: movapd %xmm1, %xmm0
; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; CHECK-NEXT: addpd %xmm1, %xmm0
; CHECK-NEXT: movq %rsi, %xmm4
; CHECK-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
; CHECK-NEXT: subpd %xmm3, %xmm4
; CHECK-NEXT: movapd %xmm4, %xmm1
; CHECK-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1]
; CHECK-NEXT: addpd %xmm4, %xmm1
; CHECK-NEXT: movq %rdx, %xmm4
; CHECK-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
; CHECK-NEXT: subpd %xmm3, %xmm4
; CHECK-NEXT: movapd %xmm4, %xmm2
; CHECK-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1]
; CHECK-NEXT: addpd %xmm4, %xmm2
; CHECK-NEXT: movlpd %xmm2, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: shrq %rax
; CHECK-NEXT: movl %edi, %ecx
; CHECK-NEXT: andl $1, %ecx
; CHECK-NEXT: orq %rax, %rcx
; CHECK-NEXT: testq %rdi, %rdi
; CHECK-NEXT: cmovnsq %rdi, %rcx
; CHECK-NEXT: cvtsi2sd %rcx, %xmm0
; CHECK-NEXT: jns .LBB177_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: addsd %xmm0, %xmm0
; CHECK-NEXT: .LBB177_2: # %entry
; CHECK-NEXT: movq %rsi, %rax
; CHECK-NEXT: shrq %rax
; CHECK-NEXT: movl %esi, %ecx
; CHECK-NEXT: andl $1, %ecx
; CHECK-NEXT: orq %rax, %rcx
; CHECK-NEXT: testq %rsi, %rsi
; CHECK-NEXT: cmovnsq %rsi, %rcx
; CHECK-NEXT: cvtsi2sd %rcx, %xmm1
; CHECK-NEXT: jns .LBB177_4
; CHECK-NEXT: # %bb.3:
; CHECK-NEXT: addsd %xmm1, %xmm1
; CHECK-NEXT: .LBB177_4: # %entry
; CHECK-NEXT: movq %rdx, %rax
; CHECK-NEXT: shrq %rax
; CHECK-NEXT: movl %edx, %ecx
; CHECK-NEXT: andl $1, %ecx
; CHECK-NEXT: orq %rax, %rcx
; CHECK-NEXT: testq %rdx, %rdx
; CHECK-NEXT: cmovnsq %rdx, %rcx
; CHECK-NEXT: cvtsi2sd %rcx, %xmm2
; CHECK-NEXT: jns .LBB177_6
; CHECK-NEXT: # %bb.5:
; CHECK-NEXT: addsd %xmm2, %xmm2
; CHECK-NEXT: .LBB177_6: # %entry
; CHECK-NEXT: movsd %xmm2, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: fldl -{{[0-9]+}}(%rsp)
; CHECK-NEXT: wait
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_uitofp_v3f64_v3i64:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovapd {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
; AVX1-NEXT: vunpcklps {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT: vmovapd {{.*#+}} xmm3 = [4.503599627370496E+15,1.9342813113834067E+25]
; AVX1-NEXT: vsubpd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0]
; AVX1-NEXT: vaddpd %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm0[2,3,2,3]
; AVX1-NEXT: vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; AVX1-NEXT: vsubpd %xmm3, %xmm4, %xmm4
; AVX1-NEXT: vpermilpd {{.*#+}} xmm5 = xmm4[1,0]
; AVX1-NEXT: vaddpd %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm4[0]
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: shrq %rcx
; AVX1-NEXT: movl %eax, %edx
; AVX1-NEXT: andl $1, %edx
; AVX1-NEXT: orq %rcx, %rdx
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: cmovnsq %rax, %rdx
; AVX1-NEXT: vcvtsi2sd %rdx, %xmm1, %xmm1
; AVX1-NEXT: jns .LBB177_2
; AVX1-NEXT: # %bb.1:
; AVX1-NEXT: vaddsd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: .LBB177_2: # %entry
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: shrq %rcx
; AVX1-NEXT: movl %eax, %edx
; AVX1-NEXT: andl $1, %edx
; AVX1-NEXT: orq %rcx, %rdx
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: cmovnsq %rax, %rdx
; AVX1-NEXT: vcvtsi2sd %rdx, %xmm2, %xmm2
; AVX1-NEXT: jns .LBB177_4
; AVX1-NEXT: # %bb.3:
; AVX1-NEXT: vaddsd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: .LBB177_4: # %entry
; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT: vsubpd %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-NEXT: vaddpd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: shrq %rcx
; AVX1-NEXT: movl %eax, %edx
; AVX1-NEXT: andl $1, %edx
; AVX1-NEXT: orq %rcx, %rdx
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: cmovnsq %rax, %rdx
; AVX1-NEXT: vcvtsi2sd %rdx, %xmm3, %xmm0
; AVX1-NEXT: jns .LBB177_6
; AVX1-NEXT: # %bb.5:
; AVX1-NEXT: vaddsd %xmm0, %xmm0, %xmm0
; AVX1-NEXT: .LBB177_6: # %entry
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: constrained_vector_uitofp_v3f64_v3i64:
@@ -7381,51 +7476,117 @@ entry:
define <4 x double> @constrained_vector_uitofp_v4f64_v4i64(<4 x i64> %x) #0 {
; CHECK-LABEL: constrained_vector_uitofp_v4f64_v4i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295]
; CHECK-NEXT: movdqa %xmm1, %xmm3
; CHECK-NEXT: pand %xmm2, %xmm3
; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200]
; CHECK-NEXT: por %xmm4, %xmm3
; CHECK-NEXT: psrlq $32, %xmm1
; CHECK-NEXT: movdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072]
; CHECK-NEXT: por %xmm5, %xmm1
; CHECK-NEXT: movapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
; CHECK-NEXT: subpd %xmm6, %xmm1
; CHECK-NEXT: addpd %xmm3, %xmm1
; CHECK-NEXT: pand %xmm0, %xmm2
; CHECK-NEXT: por %xmm4, %xmm2
; CHECK-NEXT: psrlq $32, %xmm0
; CHECK-NEXT: por %xmm5, %xmm0
; CHECK-NEXT: subpd %xmm6, %xmm0
; CHECK-NEXT: addpd %xmm2, %xmm0
; CHECK-NEXT: movdqa %xmm0, %xmm2
; CHECK-NEXT: movq %xmm0, %rax
; CHECK-NEXT: movq %rax, %rcx
; CHECK-NEXT: shrq %rcx
; CHECK-NEXT: movl %eax, %edx
; CHECK-NEXT: andl $1, %edx
; CHECK-NEXT: orq %rcx, %rdx
; CHECK-NEXT: testq %rax, %rax
; CHECK-NEXT: cmovnsq %rax, %rdx
; CHECK-NEXT: xorps %xmm0, %xmm0
; CHECK-NEXT: cvtsi2sd %rdx, %xmm0
; CHECK-NEXT: jns .LBB181_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: addsd %xmm0, %xmm0
; CHECK-NEXT: .LBB181_2: # %entry
; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; CHECK-NEXT: movq %xmm2, %rax
; CHECK-NEXT: movq %rax, %rcx
; CHECK-NEXT: shrq %rcx
; CHECK-NEXT: movl %eax, %edx
; CHECK-NEXT: andl $1, %edx
; CHECK-NEXT: orq %rcx, %rdx
; CHECK-NEXT: testq %rax, %rax
; CHECK-NEXT: cmovnsq %rax, %rdx
; CHECK-NEXT: cvtsi2sd %rdx, %xmm3
; CHECK-NEXT: jns .LBB181_4
; CHECK-NEXT: # %bb.3:
; CHECK-NEXT: addsd %xmm3, %xmm3
; CHECK-NEXT: .LBB181_4: # %entry
; CHECK-NEXT: movq %xmm1, %rax
; CHECK-NEXT: movq %rax, %rcx
; CHECK-NEXT: shrq %rcx
; CHECK-NEXT: movl %eax, %edx
; CHECK-NEXT: andl $1, %edx
; CHECK-NEXT: orq %rcx, %rdx
; CHECK-NEXT: testq %rax, %rax
; CHECK-NEXT: cmovnsq %rax, %rdx
; CHECK-NEXT: xorps %xmm2, %xmm2
; CHECK-NEXT: cvtsi2sd %rdx, %xmm2
; CHECK-NEXT: jns .LBB181_6
; CHECK-NEXT: # %bb.5:
; CHECK-NEXT: addsd %xmm2, %xmm2
; CHECK-NEXT: .LBB181_6: # %entry
; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; CHECK-NEXT: movq %xmm1, %rax
; CHECK-NEXT: movq %rax, %rcx
; CHECK-NEXT: shrq %rcx
; CHECK-NEXT: movl %eax, %edx
; CHECK-NEXT: andl $1, %edx
; CHECK-NEXT: orq %rcx, %rdx
; CHECK-NEXT: testq %rax, %rax
; CHECK-NEXT: cmovnsq %rax, %rdx
; CHECK-NEXT: xorps %xmm1, %xmm1
; CHECK-NEXT: cvtsi2sd %rdx, %xmm1
; CHECK-NEXT: jns .LBB181_8
; CHECK-NEXT: # %bb.7:
; CHECK-NEXT: addsd %xmm1, %xmm1
; CHECK-NEXT: .LBB181_8: # %entry
; CHECK-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; CHECK-NEXT: movapd %xmm2, %xmm1
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_uitofp_v4f64_v4i64:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX1-NEXT: vorps {{.*}}(%rip), %ymm1, %ymm1
; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpextrq $1, %xmm2, %rax
; AVX1-NEXT: vcvtsi2sd %rax, %xmm3, %xmm3
; AVX1-NEXT: vmovq %xmm2, %rax
; AVX1-NEXT: vcvtsi2sd %rax, %xmm4, %xmm2
; AVX1-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX1-NEXT: vpextrq $1, %xmm1, %rax
; AVX1-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3
; AVX1-NEXT: vmovq %xmm1, %rax
; AVX1-NEXT: vcvtsi2sd %rax, %xmm4, %xmm1
; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm2
; AVX1-NEXT: vpextrq $1, %xmm2, %rax
; AVX1-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3
; AVX1-NEXT: vmovq %xmm2, %rax
; AVX1-NEXT: vcvtsi2sd %rax, %xmm4, %xmm2
; AVX1-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: vorpd {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vsubpd {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vaddpd %ymm0, %ymm1, %ymm0
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vcvtsi2sd %rax, %xmm4, %xmm0
; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vmulpd {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX512F-LABEL: constrained_vector_uitofp_v4f64_v4i64:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200]
; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlq $32, %ymm0, %ymm0
; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072]
; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25]
; AVX512F-NEXT: vsubpd %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vaddpd %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpextrq $1, %xmm1, %rax
; AVX512F-NEXT: vcvtusi2sd %rax, %xmm2, %xmm2
; AVX512F-NEXT: vmovq %xmm1, %rax
; AVX512F-NEXT: vcvtusi2sd %rax, %xmm3, %xmm1
; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
; AVX512F-NEXT: vcvtusi2sd %rax, %xmm3, %xmm2
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: vcvtusi2sd %rax, %xmm3, %xmm0
; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: constrained_vector_uitofp_v4f64_v4i64: