[X86] Enable the use of movlps for i64 atomic load on 32-bit targets with sse1.

There is still a little room for improvement: movlps could also be
used to store to the stack temporary that is needed to move the data
out of the xmm register after the load.
This commit is contained in:
Craig Topper 2020-02-23 10:46:34 -08:00
parent 2a10f8019d
commit 15b6aa7448
3 changed files with 70 additions and 67 deletions

View File

@ -27544,7 +27544,7 @@ X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat); LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() && if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
!Subtarget.useSoftFloat() && !NoImplicitFloatOps && !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
(Subtarget.hasSSE2() || Subtarget.hasX87())) (Subtarget.hasSSE1() || Subtarget.hasX87()))
return AtomicExpansionKind::None; return AtomicExpansionKind::None;
return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
@ -29667,15 +29667,27 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
Attribute::NoImplicitFloat); Attribute::NoImplicitFloat);
if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) { if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
auto *Node = cast<AtomicSDNode>(N); auto *Node = cast<AtomicSDNode>(N);
if (Subtarget.hasSSE2()) { if (Subtarget.hasSSE1()) {
// Use a VZEXT_LOAD which will be selected as MOVQ. Then extract the // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
// lower 64-bits. // Then extract the lower 64-bits.
SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);
SDValue Ops[] = { Node->getChain(), Node->getBasePtr() }; SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
MVT::i64, Node->getMemOperand()); MVT::i64, Node->getMemOperand());
SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld, if (Subtarget.hasSSE2()) {
SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
DAG.getIntPtrConstant(0, dl));
Results.push_back(Res);
Results.push_back(Ld.getValue(1));
return;
}
// We use an alternative sequence for SSE1 that extracts as v2f32 and
// then casts to i64. This avoids a 128-bit stack temporary being
// created by type legalization if we were to cast v4f32->v2i64.
SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld,
DAG.getIntPtrConstant(0, dl)); DAG.getIntPtrConstant(0, dl));
Res = DAG.getBitcast(MVT::i64, Res);
Results.push_back(Res); Results.push_back(Res);
Results.push_back(Ld.getValue(1)); Results.push_back(Ld.getValue(1));
return; return;

View File

@ -107,18 +107,17 @@ define void @fadd_64r(double* %loc, double %val) nounwind {
; X86-SSE1-NEXT: pushl %ebp ; X86-SSE1-NEXT: pushl %ebp
; X86-SSE1-NEXT: movl %esp, %ebp ; X86-SSE1-NEXT: movl %esp, %ebp
; X86-SSE1-NEXT: andl $-8, %esp ; X86-SSE1-NEXT: andl $-8, %esp
; X86-SSE1-NEXT: subl $24, %esp ; X86-SSE1-NEXT: subl $16, %esp
; X86-SSE1-NEXT: movl 8(%ebp), %eax ; X86-SSE1-NEXT: movl 8(%ebp), %eax
; X86-SSE1-NEXT: fildll (%eax) ; X86-SSE1-NEXT: xorps %xmm0, %xmm0
; X86-SSE1-NEXT: fistpll {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: xorps %xmm1, %xmm1
; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-SSE1-NEXT: movss %xmm1, (%esp)
; X86-SSE1-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
; X86-SSE1-NEXT: movl %ecx, (%esp) ; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp)
; X86-SSE1-NEXT: fldl (%esp) ; X86-SSE1-NEXT: fldl (%esp)
; X86-SSE1-NEXT: faddl 12(%ebp) ; X86-SSE1-NEXT: faddl 12(%ebp)
; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp)
; X86-SSE1-NEXT: xorps %xmm0, %xmm0
; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] ; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
; X86-SSE1-NEXT: movlps %xmm0, (%eax) ; X86-SSE1-NEXT: movlps %xmm0, (%eax)
; X86-SSE1-NEXT: movl %ebp, %esp ; X86-SSE1-NEXT: movl %ebp, %esp
@ -274,17 +273,16 @@ define void @fadd_64g() nounwind {
; X86-SSE1-NEXT: pushl %ebp ; X86-SSE1-NEXT: pushl %ebp
; X86-SSE1-NEXT: movl %esp, %ebp ; X86-SSE1-NEXT: movl %esp, %ebp
; X86-SSE1-NEXT: andl $-8, %esp ; X86-SSE1-NEXT: andl $-8, %esp
; X86-SSE1-NEXT: subl $24, %esp ; X86-SSE1-NEXT: subl $16, %esp
; X86-SSE1-NEXT: fildll glob64 ; X86-SSE1-NEXT: xorps %xmm0, %xmm0
; X86-SSE1-NEXT: fistpll {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: xorps %xmm1, %xmm1
; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE1-NEXT: movss %xmm1, (%esp)
; X86-SSE1-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
; X86-SSE1-NEXT: movl %eax, (%esp) ; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp)
; X86-SSE1-NEXT: fld1 ; X86-SSE1-NEXT: fld1
; X86-SSE1-NEXT: faddl (%esp) ; X86-SSE1-NEXT: faddl (%esp)
; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp)
; X86-SSE1-NEXT: xorps %xmm0, %xmm0
; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] ; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
; X86-SSE1-NEXT: movlps %xmm0, glob64 ; X86-SSE1-NEXT: movlps %xmm0, glob64
; X86-SSE1-NEXT: movl %ebp, %esp ; X86-SSE1-NEXT: movl %ebp, %esp
@ -438,17 +436,16 @@ define void @fadd_64imm() nounwind {
; X86-SSE1-NEXT: pushl %ebp ; X86-SSE1-NEXT: pushl %ebp
; X86-SSE1-NEXT: movl %esp, %ebp ; X86-SSE1-NEXT: movl %esp, %ebp
; X86-SSE1-NEXT: andl $-8, %esp ; X86-SSE1-NEXT: andl $-8, %esp
; X86-SSE1-NEXT: subl $24, %esp ; X86-SSE1-NEXT: subl $16, %esp
; X86-SSE1-NEXT: fildll -559038737 ; X86-SSE1-NEXT: xorps %xmm0, %xmm0
; X86-SSE1-NEXT: fistpll {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: xorps %xmm1, %xmm1
; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE1-NEXT: movss %xmm1, (%esp)
; X86-SSE1-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
; X86-SSE1-NEXT: movl %eax, (%esp) ; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp)
; X86-SSE1-NEXT: fld1 ; X86-SSE1-NEXT: fld1
; X86-SSE1-NEXT: faddl (%esp) ; X86-SSE1-NEXT: faddl (%esp)
; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp)
; X86-SSE1-NEXT: xorps %xmm0, %xmm0
; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] ; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
; X86-SSE1-NEXT: movlps %xmm0, -559038737 ; X86-SSE1-NEXT: movlps %xmm0, -559038737
; X86-SSE1-NEXT: movl %ebp, %esp ; X86-SSE1-NEXT: movl %ebp, %esp
@ -608,17 +605,16 @@ define void @fadd_64stack() nounwind {
; X86-SSE1-NEXT: pushl %ebp ; X86-SSE1-NEXT: pushl %ebp
; X86-SSE1-NEXT: movl %esp, %ebp ; X86-SSE1-NEXT: movl %esp, %ebp
; X86-SSE1-NEXT: andl $-8, %esp ; X86-SSE1-NEXT: andl $-8, %esp
; X86-SSE1-NEXT: subl $32, %esp ; X86-SSE1-NEXT: subl $24, %esp
; X86-SSE1-NEXT: fildll {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: xorps %xmm0, %xmm0
; X86-SSE1-NEXT: fistpll {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: xorps %xmm1, %xmm1
; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE1-NEXT: movss %xmm1, (%esp)
; X86-SSE1-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
; X86-SSE1-NEXT: movl %eax, (%esp) ; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp)
; X86-SSE1-NEXT: fld1 ; X86-SSE1-NEXT: fld1
; X86-SSE1-NEXT: faddl (%esp) ; X86-SSE1-NEXT: faddl (%esp)
; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp)
; X86-SSE1-NEXT: xorps %xmm0, %xmm0
; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] ; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
; X86-SSE1-NEXT: movlps %xmm0, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: movlps %xmm0, {{[0-9]+}}(%esp)
; X86-SSE1-NEXT: movl %ebp, %esp ; X86-SSE1-NEXT: movl %ebp, %esp
@ -712,25 +708,22 @@ define void @fadd_array(i64* %arg, double %arg1, i64 %arg2) nounwind {
; X86-SSE1: # %bb.0: # %bb ; X86-SSE1: # %bb.0: # %bb
; X86-SSE1-NEXT: pushl %ebp ; X86-SSE1-NEXT: pushl %ebp
; X86-SSE1-NEXT: movl %esp, %ebp ; X86-SSE1-NEXT: movl %esp, %ebp
; X86-SSE1-NEXT: pushl %esi
; X86-SSE1-NEXT: andl $-8, %esp ; X86-SSE1-NEXT: andl $-8, %esp
; X86-SSE1-NEXT: subl $32, %esp ; X86-SSE1-NEXT: subl $16, %esp
; X86-SSE1-NEXT: movl 20(%ebp), %eax ; X86-SSE1-NEXT: movl 20(%ebp), %eax
; X86-SSE1-NEXT: movl 8(%ebp), %ecx ; X86-SSE1-NEXT: movl 8(%ebp), %ecx
; X86-SSE1-NEXT: fildll (%ecx,%eax,8) ; X86-SSE1-NEXT: xorps %xmm0, %xmm0
; X86-SSE1-NEXT: fistpll {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: xorps %xmm1, %xmm1
; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-SSE1-NEXT: movss %xmm1, (%esp)
; X86-SSE1-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
; X86-SSE1-NEXT: movl %edx, (%esp) ; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp)
; X86-SSE1-NEXT: fldl (%esp) ; X86-SSE1-NEXT: fldl (%esp)
; X86-SSE1-NEXT: faddl 12(%ebp) ; X86-SSE1-NEXT: faddl 12(%ebp)
; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp)
; X86-SSE1-NEXT: xorps %xmm0, %xmm0
; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] ; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
; X86-SSE1-NEXT: movlps %xmm0, (%ecx,%eax,8) ; X86-SSE1-NEXT: movlps %xmm0, (%ecx,%eax,8)
; X86-SSE1-NEXT: leal -4(%ebp), %esp ; X86-SSE1-NEXT: movl %ebp, %esp
; X86-SSE1-NEXT: popl %esi
; X86-SSE1-NEXT: popl %ebp ; X86-SSE1-NEXT: popl %ebp
; X86-SSE1-NEXT: retl ; X86-SSE1-NEXT: retl
; ;

View File

@ -272,17 +272,16 @@ define float @load_float(float* %fptr) {
define double @load_double(double* %fptr) { define double @load_double(double* %fptr) {
; X86-SSE1-LABEL: load_double: ; X86-SSE1-LABEL: load_double:
; X86-SSE1: # %bb.0: ; X86-SSE1: # %bb.0:
; X86-SSE1-NEXT: subl $20, %esp ; X86-SSE1-NEXT: subl $12, %esp
; X86-SSE1-NEXT: .cfi_def_cfa_offset 24 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 16
; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE1-NEXT: fildll (%eax) ; X86-SSE1-NEXT: xorps %xmm0, %xmm0
; X86-SSE1-NEXT: fistpll {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE1-NEXT: movss %xmm0, (%esp)
; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
; X86-SSE1-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
; X86-SSE1-NEXT: movl %eax, (%esp)
; X86-SSE1-NEXT: fldl (%esp) ; X86-SSE1-NEXT: fldl (%esp)
; X86-SSE1-NEXT: addl $20, %esp ; X86-SSE1-NEXT: addl $12, %esp
; X86-SSE1-NEXT: .cfi_def_cfa_offset 4 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 4
; X86-SSE1-NEXT: retl ; X86-SSE1-NEXT: retl
; ;
@ -660,17 +659,16 @@ define float @load_float_seq_cst(float* %fptr) {
define double @load_double_seq_cst(double* %fptr) { define double @load_double_seq_cst(double* %fptr) {
; X86-SSE1-LABEL: load_double_seq_cst: ; X86-SSE1-LABEL: load_double_seq_cst:
; X86-SSE1: # %bb.0: ; X86-SSE1: # %bb.0:
; X86-SSE1-NEXT: subl $20, %esp ; X86-SSE1-NEXT: subl $12, %esp
; X86-SSE1-NEXT: .cfi_def_cfa_offset 24 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 16
; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE1-NEXT: fildll (%eax) ; X86-SSE1-NEXT: xorps %xmm0, %xmm0
; X86-SSE1-NEXT: fistpll {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE1-NEXT: movss %xmm0, (%esp)
; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
; X86-SSE1-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
; X86-SSE1-NEXT: movl %eax, (%esp)
; X86-SSE1-NEXT: fldl (%esp) ; X86-SSE1-NEXT: fldl (%esp)
; X86-SSE1-NEXT: addl $20, %esp ; X86-SSE1-NEXT: addl $12, %esp
; X86-SSE1-NEXT: .cfi_def_cfa_offset 4 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 4
; X86-SSE1-NEXT: retl ; X86-SSE1-NEXT: retl
; ;