diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b5e91ce71449..96a58c11ef7a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -704,6 +704,7 @@ X86TargetLowering::X86TargetLowering(TargetMachine &TM)
   // We have target-specific dag combine patterns for the following nodes:
   setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
   setTargetDAGCombine(ISD::SELECT);
+  setTargetDAGCombine(ISD::STORE);
 
   computeRegisterProperties();
 
@@ -5872,6 +5873,35 @@ static SDOperand PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
   return SDOperand();
 }
 
+/// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
+static SDOperand PerformSTORECombine(StoreSDNode *St, SelectionDAG &DAG,
+                                     const X86Subtarget *Subtarget) {
+  // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
+  // the FP state in cases where an emms may be missing.
+  if (MVT::isVector(St->getValue().getValueType()) &&
+      MVT::getSizeInBits(St->getValue().getValueType()) == 64 &&
+      // Must be a store of a load.
+      isa<LoadSDNode>(St->getChain()) &&
+      St->getChain().Val == St->getValue().Val &&
+      St->getValue().hasOneUse() && St->getChain().hasOneUse() &&
+      !St->isVolatile() && !cast<LoadSDNode>(St->getChain())->isVolatile()) {
+    LoadSDNode *Ld = cast<LoadSDNode>(St->getChain());
+
+    // If we are a 64-bit capable x86, lower to a single movq load/store pair.
+    if (Subtarget->is64Bit()) {
+      SDOperand NewLd = DAG.getLoad(MVT::i64, Ld->getChain(), Ld->getBasePtr(),
+                                    Ld->getSrcValue(), Ld->getSrcValueOffset(),
+                                    Ld->isVolatile(), Ld->getAlignment());
+      return DAG.getStore(NewLd.getValue(1), NewLd, St->getBasePtr(),
+                          St->getSrcValue(), St->getSrcValueOffset(),
+                          St->isVolatile(), St->getAlignment());
+    }
+
+    // TODO: 2 32-bit copies.
+  }
+  return SDOperand();
+}
+
 /// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and
 /// X86ISD::FXOR nodes.
 static SDOperand PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
@@ -5908,6 +5938,8 @@ SDOperand X86TargetLowering::PerformDAGCombine(SDNode *N,
   default: break;
   case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, Subtarget);
   case ISD::SELECT:         return PerformSELECTCombine(N, DAG, Subtarget);
+  case ISD::STORE:
+    return PerformSTORECombine(cast<StoreSDNode>(N), DAG, Subtarget);
   case X86ISD::FXOR:
   case X86ISD::FOR:         return PerformFORCombine(N, DAG);
   case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
diff --git a/llvm/test/CodeGen/X86/mmx-copy-gprs.ll b/llvm/test/CodeGen/X86/mmx-copy-gprs.ll
new file mode 100644
index 000000000000..8cf36e05a837
--- /dev/null
+++ b/llvm/test/CodeGen/X86/mmx-copy-gprs.ll
@@ -0,0 +1,14 @@
+; RUN: llvm-as < %s | llc -march=x86-64 | grep {movq.*(%rsi), %rax}
+
+; This test should use GPRs to copy the mmx value, not MMX regs.  Using mmx
+; regs increases the places that need to use emms.
+
+; rdar://5741668
+target triple = "x86_64-apple-darwin8"
+
+define i32 @foo(<1 x i64>* %x, <1 x i64>* %y) nounwind {
+entry:
+  %tmp1 = load <1 x i64>* %y, align 8   ; <<1 x i64>> [#uses=1]
+  store <1 x i64> %tmp1, <1 x i64>* %x, align 8
+  ret i32 undef
+}
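
For context, here is a minimal user-level sketch of the pattern the combine targets. It is not part of the patch; the helper name copy_v1i64 is made up, and it assumes the usual MMX intrinsics header.

// A 64-bit MMX-typed copy. Without the combine, the load/store round-trips
// through an MMX register, so surrounding x87 FP code would need an emms.
// With the combine, on x86-64 the copy is lowered as an i64 GPR load/store
// pair, which is what the new test greps for ("movq (%rsi), %rax").
#include <mmintrin.h>

void copy_v1i64(__m64 *dst, const __m64 *src) {
  *dst = *src;  // v1i64 load feeding a v1i64 store; rewritten to i64 by the combine
}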