diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 2de737787938..f15acd73078e 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -8285,14 +8285,21 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
 
 /// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
 static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
-                                   const X86Subtarget *Subtarget) {
+                                   const X86Subtarget *Subtarget) {
   // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
   // the FP state in cases where an emms may be missing.
   // A preferable solution to the general problem is to figure out the right
   // places to insert EMMS.  This qualifies as a quick hack.
+
+  // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
   StoreSDNode *St = cast<StoreSDNode>(N);
-  if (St->getValue().getValueType().isVector() &&
-      St->getValue().getValueType().getSizeInBits() == 64 &&
+  MVT VT = St->getValue().getValueType();
+  if (VT.getSizeInBits() != 64)
+    return SDValue();
+
+  bool F64IsLegal = !UseSoftFloat && !NoImplicitFloat && Subtarget->hasSSE2();
+  if ((VT.isVector() ||
+      (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
       isa<LoadSDNode>(St->getValue()) &&
       !cast<LoadSDNode>(St->getValue())->isVolatile() &&
       St->getChain().hasOneUse() && !St->isVolatile()) {
@@ -8316,60 +8323,72 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
           Ops.push_back(ChainVal->getOperand(i));
       }
     }
 
-    if (Ld) {
-      DebugLoc DL = N->getDebugLoc();
-      // If we are a 64-bit capable x86, lower to a single movq load/store pair.
-      if (Subtarget->is64Bit()) {
-        SDValue NewLd = DAG.getLoad(MVT::i64, DL, Ld->getChain(),
-                                    Ld->getBasePtr(), Ld->getSrcValue(),
-                                    Ld->getSrcValueOffset(), Ld->isVolatile(),
-                                    Ld->getAlignment());
-        SDValue NewChain = NewLd.getValue(1);
-        if (TokenFactorIndex != -1) {
-          Ops.push_back(NewChain);
-          NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &Ops[0],
-                                 Ops.size());
-        }
-        return DAG.getStore(NewChain, DL, NewLd, St->getBasePtr(),
-                            St->getSrcValue(), St->getSrcValueOffset(),
-                            St->isVolatile(), St->getAlignment());
-      }
-      // Otherwise, lower to two 32-bit copies.
-      SDValue LoAddr = Ld->getBasePtr();
-      SDValue HiAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, LoAddr,
-                                   DAG.getConstant(4, MVT::i32));
+    if (!Ld || !ISD::isNormalLoad(Ld))
+      return SDValue();
 
-      SDValue LoLd = DAG.getLoad(MVT::i32, DL, Ld->getChain(), LoAddr,
-                                 Ld->getSrcValue(), Ld->getSrcValueOffset(),
-                                 Ld->isVolatile(), Ld->getAlignment());
-      SDValue HiLd = DAG.getLoad(MVT::i32, DL, Ld->getChain(), HiAddr,
-                                 Ld->getSrcValue(), Ld->getSrcValueOffset()+4,
-                                 Ld->isVolatile(),
-                                 MinAlign(Ld->getAlignment(), 4));
+    // If this is not the MMX case, i.e. we are just turning i64 load/store
+    // into f64 load/store, avoid the transformation if there are multiple
+    // uses of the loaded value.
+    if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
+      return SDValue();
 
-      SDValue NewChain = LoLd.getValue(1);
+    DebugLoc LdDL = Ld->getDebugLoc();
+    DebugLoc StDL = N->getDebugLoc();
+    // If we are a 64-bit capable x86, lower to a single movq load/store pair.
+    // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
+    // pair instead.
+    if (Subtarget->is64Bit() || F64IsLegal) {
+      MVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64;
+      SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(),
+                                  Ld->getBasePtr(), Ld->getSrcValue(),
+                                  Ld->getSrcValueOffset(), Ld->isVolatile(),
+                                  Ld->getAlignment());
+      SDValue NewChain = NewLd.getValue(1);
       if (TokenFactorIndex != -1) {
-        Ops.push_back(LoLd);
-        Ops.push_back(HiLd);
-        NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &Ops[0],
+        Ops.push_back(NewChain);
+        NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
                                Ops.size());
       }
-
-      LoAddr = St->getBasePtr();
-      HiAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, LoAddr,
-                           DAG.getConstant(4, MVT::i32));
-
-      SDValue LoSt = DAG.getStore(NewChain, DL, LoLd, LoAddr,
+      return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
                           St->getSrcValue(), St->getSrcValueOffset(),
                           St->isVolatile(), St->getAlignment());
-      SDValue HiSt = DAG.getStore(NewChain, DL, HiLd, HiAddr,
-                                  St->getSrcValue(),
-                                  St->getSrcValueOffset() + 4,
-                                  St->isVolatile(),
-                                  MinAlign(St->getAlignment(), 4));
-      return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, LoSt, HiSt);
     }
+
+    // Otherwise, lower to two pairs of 32-bit loads / stores.
+    SDValue LoAddr = Ld->getBasePtr();
+    SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr,
+                                 DAG.getConstant(4, MVT::i32));
+
+    SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
+                               Ld->getSrcValue(), Ld->getSrcValueOffset(),
+                               Ld->isVolatile(), Ld->getAlignment());
+    SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
+                               Ld->getSrcValue(), Ld->getSrcValueOffset()+4,
+                               Ld->isVolatile(),
+                               MinAlign(Ld->getAlignment(), 4));
+
+    SDValue NewChain = LoLd.getValue(1);
+    if (TokenFactorIndex != -1) {
+      Ops.push_back(LoLd);
+      Ops.push_back(HiLd);
+      NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
+                             Ops.size());
+    }
+
+    LoAddr = St->getBasePtr();
+    HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr,
+                         DAG.getConstant(4, MVT::i32));
+
+    SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr,
+                                St->getSrcValue(), St->getSrcValueOffset(),
+                                St->isVolatile(), St->getAlignment());
+    SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr,
+                                St->getSrcValue(),
+                                St->getSrcValueOffset() + 4,
+                                St->isVolatile(),
+                                MinAlign(St->getAlignment(), 4));
+    return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
   }
   return SDValue();
 }
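Note: the guards above mean the i64 -> f64 rewrite only fires when the loaded
value feeds the store and nothing else. A hypothetical test for that guard (not
part of this patch; the function name and the expected behavior noted in the
comments are illustrative), in the same .ll style as the tests below:

	; Sketch only: %tmp1 has a second use (the return), so the combine's
	; hasNUsesOfValue(1, 0) check should leave this copy in integer
	; registers rather than rewriting it into a movsd load/store pair.
	define i64 @multi_use(i64* %x, i64* %y) nounwind {
	entry:
		%tmp1 = load i64* %y, align 8
		store i64 %tmp1, i64* %x, align 8
		ret i64 %tmp1
	}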
diff --git a/llvm/test/CodeGen/X86/i64-mem-copy.ll b/llvm/test/CodeGen/X86/i64-mem-copy.ll
new file mode 100644
index 000000000000..ce540112087c
--- /dev/null
+++ b/llvm/test/CodeGen/X86/i64-mem-copy.ll
@@ -0,0 +1,13 @@
+; RUN: llvm-as < %s | llc -march=x86-64 | grep {movq.*(%rsi), %rax}
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep {movsd.*(%eax),}
+
+; Uses movsd to load / store i64 values if sse2 is available.
+
+; rdar://6659858
+
+define void @foo(i64* %x, i64* %y) nounwind {
+entry:
+	%tmp1 = load i64* %y, align 8		; <i64> [#uses=1]
+	store i64 %tmp1, i64* %x, align 8
+	ret void
+}
diff --git a/llvm/test/CodeGen/X86/mmx-copy-gprs.ll b/llvm/test/CodeGen/X86/mmx-copy-gprs.ll
index da17a04a466c..2047ce75e570 100644
--- a/llvm/test/CodeGen/X86/mmx-copy-gprs.ll
+++ b/llvm/test/CodeGen/X86/mmx-copy-gprs.ll
@@ -1,11 +1,11 @@
-; RUN: llvm-as < %s | llc -march=x86-64 | grep {movq.*(%rsi), %rax}
-; RUN: llvm-as < %s | llc -march=x86 | grep {movl.*4(%eax),}
+; RUN: llvm-as < %s | llc -march=x86-64 | grep {movq.*(%rsi), %rax}
+; RUN: llvm-as < %s | llc -march=x86 -mattr=-sse2 | grep {movl.*4(%eax),}
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep {movsd.(%eax),}
 
 ; This test should use GPRs to copy the mmx value, not MMX regs.  Using mmx regs,
 ; increases the places that need to use emms.
 
 ; rdar://5741668
-target triple = "x86_64-apple-darwin8"
 
 define void @foo(<1 x i64>* %x, <1 x i64>* %y) nounwind {
 entry:
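Note: the RUN lines above pin down the three lowering strategies for a 64-bit
memory-to-memory copy. A sketch of the expected code for @foo in
i64-mem-copy.ll (register choices beyond what the greps check are
illustrative):

	; x86-64:        movq  (%rsi), %rax      ; single 64-bit GPR copy
	; x86, +sse2:    movsd (%eax), %xmm0     ; single f64 load/store pair;
	;                                        ; no x87/MMX state is touched
	; x86, -sse2:    movl  (%eax), ...       ; two 32-bit GPR copies:
	;                movl  4(%eax), ...      ; low half, then high half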