diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 24954d75d2e5..2d2ae069b55d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -2695,8 +2695,8 @@ static SDOperand getMemcpyLoadsAndStores(SelectionDAG &DAG,
                                          const Value *SrcSV, uint64_t SrcSVOff){
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
 
-  // Expand memcpy to a series of store ops if the size operand falls below
-  // a certain threshold.
+  // Expand memcpy to a series of load and store ops if the size operand falls
+  // below a certain threshold.
   std::vector<MVT::ValueType> MemOps;
   uint64_t Limit = -1;
   if (!AlwaysInline)
@@ -2743,6 +2743,69 @@ static SDOperand getMemcpyLoadsAndStores(SelectionDAG &DAG,
                      &OutChains[0], OutChains.size());
 }
 
+/// getMemmoveLoadsAndStores - Expand a memmove of constant size into inline
+/// loads and stores.  All loads are emitted ahead of all stores, so the copy
+/// stays correct when the source and destination overlap.  Returns an empty
+/// SDOperand when MeetsMaxMemopRequirement rejects the expansion.
+static SDOperand getMemmoveLoadsAndStores(SelectionDAG &DAG,
+                                          SDOperand Chain, SDOperand Dst,
+                                          SDOperand Src, uint64_t Size,
+                                          unsigned Align, bool AlwaysInline,
+                                          const Value *DstSV, uint64_t DstSVOff,
+                                          const Value *SrcSV, uint64_t SrcSVOff){
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  // Expand memmove to a series of load and store ops if the size operand falls
+  // below a certain threshold.
+  std::vector<MVT::ValueType> MemOps;
+  uint64_t Limit = -1;
+  if (!AlwaysInline)
+    Limit = TLI.getMaxStoresPerMemmove();
+  unsigned DstAlign = Align; // Destination alignment can change.
+  if (!MeetsMaxMemopRequirement(MemOps, Dst, Src, Limit, Size, DstAlign,
+                                DAG, TLI))
+    return SDOperand();
+
+  uint64_t SrcOff = 0, DstOff = 0;
+
+  SmallVector<SDOperand, 8> LoadValues;
+  SmallVector<SDOperand, 8> LoadChains;
+  SmallVector<SDOperand, 8> OutChains;
+  unsigned NumMemOps = MemOps.size();
+
+  // Issue every load off the incoming chain so that none of them is ordered
+  // after a store.
+  for (unsigned i = 0; i < NumMemOps; i++) {
+    MVT::ValueType VT = MemOps[i];
+    unsigned VTSize = MVT::getSizeInBits(VT) / 8;
+
+    SDOperand Value = DAG.getLoad(VT, Chain,
+                                  getMemBasePlusOffset(Src, SrcOff, DAG),
+                                  SrcSV, SrcSVOff + SrcOff, false, Align);
+    LoadValues.push_back(Value);
+    LoadChains.push_back(Value.getValue(1));
+    SrcOff += VTSize;
+  }
+
+  // Join the load chains; every store below hangs off this token and thus
+  // depends on all of the loads.
+  Chain = DAG.getNode(ISD::TokenFactor, MVT::Other,
+                      &LoadChains[0], LoadChains.size());
+  for (unsigned i = 0; i < NumMemOps; i++) {
+    MVT::ValueType VT = MemOps[i];
+    unsigned VTSize = MVT::getSizeInBits(VT) / 8;
+
+    SDOperand Store = DAG.getStore(Chain, LoadValues[i],
+                                   getMemBasePlusOffset(Dst, DstOff, DAG),
+                                   DstSV, DstSVOff + DstOff, false, DstAlign);
+    OutChains.push_back(Store);
+    DstOff += VTSize;
+  }
+
+  return DAG.getNode(ISD::TokenFactor, MVT::Other,
+                     &OutChains[0], OutChains.size());
+}
+
 static SDOperand getMemsetStores(SelectionDAG &DAG,
                                  SDOperand Chain, SDOperand Dst,
                                  SDOperand Src, uint64_t Size,
@@ -2836,9 +2899,20 @@ SDOperand SelectionDAG::getMemmove(SDOperand Chain, SDOperand Dst,
                                    const Value *DstSV, uint64_t DstSVOff,
                                    const Value *SrcSV, uint64_t SrcSVOff) {
 
-  // TODO: Optimize small memmove cases with simple loads and stores,
-  // ensuring that all loads precede all stores. This can cause severe
-  // register pressure, so targets should be careful with the size limit.
+  // Check to see if we should lower the memmove to loads and stores first.
+  // For cases within the target-specified limits, this is the best choice.
+  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+  if (ConstantSize) {
+    // Memmove with size zero? Just return the original chain.
+    if (ConstantSize->isNullValue())
+      return Chain;
+
+    SDOperand Result =
+      getMemmoveLoadsAndStores(*this, Chain, Dst, Src, ConstantSize->getValue(),
+                               Align, false, DstSV, DstSVOff, SrcSV, SrcSVOff);
+    if (Result.Val)
+      return Result;
+  }
 
   // Then check to see if we should lower the memmove with target-specific
   // code. If the target chooses to do this, this is the next best.
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index d194d38e1ce6..c4307b881a41 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -737,7 +737,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
   // be smaller when we are in optimizing for size mode.
   maxStoresPerMemset = 16; // For %llvm.memset -> sequence of stores
   maxStoresPerMemcpy = 16; // For %llvm.memcpy -> sequence of stores
-  maxStoresPerMemmove = 16; // For %llvm.memmove -> sequence of stores
+  maxStoresPerMemmove = 3; // For %llvm.memmove -> sequence of stores
   allowUnalignedMemoryAccesses = true; // x86 supports it!
   setPrefLoopAlignment(16);
 }
diff --git a/llvm/test/CodeGen/X86/memmove-4.ll b/llvm/test/CodeGen/X86/memmove-4.ll
new file mode 100644
index 000000000000..f23c7d5cb854
--- /dev/null
+++ b/llvm/test/CodeGen/X86/memmove-4.ll
@@ -0,0 +1,12 @@
+; RUN: llvm-as < %s | llc | not grep call
+
+target triple = "i686-pc-linux-gnu"
+
+define void @a(i8* %a, i8* %b) nounwind {
+        %tmp2 = bitcast i8* %a to i8*
+        %tmp3 = bitcast i8* %b to i8*
+        tail call void @llvm.memmove.i32( i8* %tmp2, i8* %tmp3, i32 12, i32 4 )
+        ret void
+}
+
+declare void @llvm.memmove.i32(i8*, i8*, i32, i32)