diff --git a/llvm/include/llvm/LinkAllPasses.h b/llvm/include/llvm/LinkAllPasses.h index 097849d64739..360f9dff94a7 100644 --- a/llvm/include/llvm/LinkAllPasses.h +++ b/llvm/include/llvm/LinkAllPasses.h @@ -112,6 +112,7 @@ namespace { (void) llvm::createPredicateSimplifierPass(); (void) llvm::createCodeGenPreparePass(); (void) llvm::createGVNPass(); + (void) llvm::createMemCpyOptPass(); (void)new llvm::IntervalPartition(); (void)new llvm::FindUsedTypes(); diff --git a/llvm/include/llvm/Transforms/Scalar.h b/llvm/include/llvm/Transforms/Scalar.h index f07877b15a71..c7e87239c4f9 100644 --- a/llvm/include/llvm/Transforms/Scalar.h +++ b/llvm/include/llvm/Transforms/Scalar.h @@ -303,6 +303,13 @@ FunctionPass *createGVNPREPass(); // FunctionPass *createGVNPass(); +//===----------------------------------------------------------------------===// +// +// MemCpyOpt - This pass performs optimizations related to eliminating memcpy +// calls and/or combining multiple stores into memset's. +// +FunctionPass *createMemCpyOptPass(); + //===----------------------------------------------------------------------===// // // CodeGenPrepare - This pass prepares a function for instruction selection. diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp index 082c36192ae0..a231e56405f7 100644 --- a/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/llvm/lib/Transforms/Scalar/GVN.cpp @@ -42,14 +42,6 @@ using namespace llvm; STATISTIC(NumGVNInstr, "Number of instructions deleted"); STATISTIC(NumGVNLoad, "Number of loads deleted"); -STATISTIC(NumMemSetInfer, "Number of memsets inferred"); - -namespace { - cl::opt - FormMemSet("form-memset-from-stores", - cl::desc("Transform straight-line stores to memsets"), - cl::init(true), cl::Hidden); -} //===----------------------------------------------------------------------===// // ValueTable Class @@ -668,17 +660,12 @@ namespace { bool processLoad(LoadInst* L, DenseMap &lastLoad, SmallVectorImpl &toErase); - bool processStore(StoreInst *SI, SmallVectorImpl &toErase); bool processInstruction(Instruction* I, ValueNumberedSet& currAvail, DenseMap& lastSeenLoad, SmallVectorImpl &toErase); bool processNonLocalLoad(LoadInst* L, SmallVectorImpl &toErase); - bool processMemCpy(MemCpyInst* M, MemCpyInst* MDep, - SmallVectorImpl &toErase); - bool performCallSlotOptzn(MemCpyInst* cpy, CallInst* C, - SmallVectorImpl &toErase); Value *GetValueForBlock(BasicBlock *BB, LoadInst* orig, DenseMap &Phis, bool top_level = false); @@ -983,593 +970,6 @@ bool GVN::processLoad(LoadInst *L, DenseMap &lastLoad, return deletedLoad; } -/// isBytewiseValue - If the specified value can be set by repeating the same -/// byte in memory, return the i8 value that it is represented with. This is -/// true for all i8 values obviously, but is also true for i32 0, i32 -1, -/// i16 0xF0F0, double 0.0 etc. If the value can't be handled with a repeated -/// byte store (e.g. i16 0x1234), return null. -static Value *isBytewiseValue(Value *V) { - // All byte-wide stores are splatable, even of arbitrary variables. - if (V->getType() == Type::Int8Ty) return V; - - // Constant float and double values can be handled as integer values if the - // corresponding integer value is "byteable". An important case is 0.0. - if (ConstantFP *CFP = dyn_cast(V)) { - if (CFP->getType() == Type::FloatTy) - V = ConstantExpr::getBitCast(CFP, Type::Int32Ty); - if (CFP->getType() == Type::DoubleTy) - V = ConstantExpr::getBitCast(CFP, Type::Int64Ty); - // Don't handle long double formats, which have strange constraints. - } - - // We can handle constant integers that are power of two in size and a - // multiple of 8 bits. - if (ConstantInt *CI = dyn_cast(V)) { - unsigned Width = CI->getBitWidth(); - if (isPowerOf2_32(Width) && Width > 8) { - // We can handle this value if the recursive binary decomposition is the - // same at all levels. - APInt Val = CI->getValue(); - APInt Val2; - while (Val.getBitWidth() != 8) { - unsigned NextWidth = Val.getBitWidth()/2; - Val2 = Val.lshr(NextWidth); - Val2.trunc(Val.getBitWidth()/2); - Val.trunc(Val.getBitWidth()/2); - - // If the top/bottom halves aren't the same, reject it. - if (Val != Val2) - return 0; - } - return ConstantInt::get(Val); - } - } - - // Conceptually, we could handle things like: - // %a = zext i8 %X to i16 - // %b = shl i16 %a, 8 - // %c = or i16 %a, %b - // but until there is an example that actually needs this, it doesn't seem - // worth worrying about. - return 0; -} - -static int64_t GetOffsetFromIndex(const GetElementPtrInst *GEP, unsigned Idx, - bool &VariableIdxFound, TargetData &TD) { - // Skip over the first indices. - gep_type_iterator GTI = gep_type_begin(GEP); - for (unsigned i = 1; i != Idx; ++i, ++GTI) - /*skip along*/; - - // Compute the offset implied by the rest of the indices. - int64_t Offset = 0; - for (unsigned i = Idx, e = GEP->getNumOperands(); i != e; ++i, ++GTI) { - ConstantInt *OpC = dyn_cast(GEP->getOperand(i)); - if (OpC == 0) - return VariableIdxFound = true; - if (OpC->isZero()) continue; // No offset. - - // Handle struct indices, which add their field offset to the pointer. - if (const StructType *STy = dyn_cast(*GTI)) { - Offset += TD.getStructLayout(STy)->getElementOffset(OpC->getZExtValue()); - continue; - } - - // Otherwise, we have a sequential type like an array or vector. Multiply - // the index by the ElementSize. - uint64_t Size = TD.getABITypeSize(GTI.getIndexedType()); - Offset += Size*OpC->getSExtValue(); - } - - return Offset; -} - -/// IsPointerOffset - Return true if Ptr1 is provably equal to Ptr2 plus a -/// constant offset, and return that constant offset. For example, Ptr1 might -/// be &A[42], and Ptr2 might be &A[40]. In this case offset would be -8. -static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset, - TargetData &TD) { - // Right now we handle the case when Ptr1/Ptr2 are both GEPs with an identical - // base. After that base, they may have some number of common (and - // potentially variable) indices. After that they handle some constant - // offset, which determines their offset from each other. At this point, we - // handle no other case. - GetElementPtrInst *GEP1 = dyn_cast(Ptr1); - GetElementPtrInst *GEP2 = dyn_cast(Ptr2); - if (!GEP1 || !GEP2 || GEP1->getOperand(0) != GEP2->getOperand(0)) - return false; - - // Skip any common indices and track the GEP types. - unsigned Idx = 1; - for (; Idx != GEP1->getNumOperands() && Idx != GEP2->getNumOperands(); ++Idx) - if (GEP1->getOperand(Idx) != GEP2->getOperand(Idx)) - break; - - bool VariableIdxFound = false; - int64_t Offset1 = GetOffsetFromIndex(GEP1, Idx, VariableIdxFound, TD); - int64_t Offset2 = GetOffsetFromIndex(GEP2, Idx, VariableIdxFound, TD); - if (VariableIdxFound) return false; - - Offset = Offset2-Offset1; - return true; -} - - -/// MemsetRange - Represents a range of memset'd bytes with the ByteVal value. -/// This allows us to analyze stores like: -/// store 0 -> P+1 -/// store 0 -> P+0 -/// store 0 -> P+3 -/// store 0 -> P+2 -/// which sometimes happens with stores to arrays of structs etc. When we see -/// the first store, we make a range [1, 2). The second store extends the range -/// to [0, 2). The third makes a new range [2, 3). The fourth store joins the -/// two ranges into [0, 3) which is memset'able. -namespace { -struct MemsetRange { - // Start/End - A semi range that describes the span that this range covers. - // The range is closed at the start and open at the end: [Start, End). - int64_t Start, End; - - /// StartPtr - The getelementptr instruction that points to the start of the - /// range. - Value *StartPtr; - - /// Alignment - The known alignment of the first store. - unsigned Alignment; - - /// TheStores - The actual stores that make up this range. - SmallVector TheStores; - - bool isProfitableToUseMemset(const TargetData &TD) const; - -}; -} // end anon namespace - -bool MemsetRange::isProfitableToUseMemset(const TargetData &TD) const { - // If we found more than 8 stores to merge or 64 bytes, use memset. - if (TheStores.size() >= 8 || End-Start >= 64) return true; - - // Assume that the code generator is capable of merging pairs of stores - // together if it wants to. - if (TheStores.size() <= 2) return false; - - // If we have fewer than 8 stores, it can still be worthwhile to do this. - // For example, merging 4 i8 stores into an i32 store is useful almost always. - // However, merging 2 32-bit stores isn't useful on a 32-bit architecture (the - // memset will be split into 2 32-bit stores anyway) and doing so can - // pessimize the llvm optimizer. - // - // Since we don't have perfect knowledge here, make some assumptions: assume - // the maximum GPR width is the same size as the pointer size and assume that - // this width can be stored. If so, check to see whether we will end up - // actually reducing the number of stores used. - unsigned Bytes = unsigned(End-Start); - unsigned NumPointerStores = Bytes/TD.getPointerSize(); - - // Assume the remaining bytes if any are done a byte at a time. - unsigned NumByteStores = Bytes - NumPointerStores*TD.getPointerSize(); - - // If we will reduce the # stores (according to this heuristic), do the - // transformation. This encourages merging 4 x i8 -> i32 and 2 x i16 -> i32 - // etc. - return TheStores.size() > NumPointerStores+NumByteStores; -} - - -namespace { -class MemsetRanges { - /// Ranges - A sorted list of the memset ranges. We use std::list here - /// because each element is relatively large and expensive to copy. - std::list Ranges; - typedef std::list::iterator range_iterator; - TargetData &TD; -public: - MemsetRanges(TargetData &td) : TD(td) {} - - typedef std::list::const_iterator const_iterator; - const_iterator begin() const { return Ranges.begin(); } - const_iterator end() const { return Ranges.end(); } - bool empty() const { return Ranges.empty(); } - - void addStore(int64_t OffsetFromFirst, StoreInst *SI); -}; - -} // end anon namespace - - -/// addStore - Add a new store to the MemsetRanges data structure. This adds a -/// new range for the specified store at the specified offset, merging into -/// existing ranges as appropriate. -void MemsetRanges::addStore(int64_t Start, StoreInst *SI) { - int64_t End = Start+TD.getTypeStoreSize(SI->getOperand(0)->getType()); - - // Do a linear search of the ranges to see if this can be joined and/or to - // find the insertion point in the list. We keep the ranges sorted for - // simplicity here. This is a linear search of a linked list, which is ugly, - // however the number of ranges is limited, so this won't get crazy slow. - range_iterator I = Ranges.begin(), E = Ranges.end(); - - while (I != E && Start > I->End) - ++I; - - // We now know that I == E, in which case we didn't find anything to merge - // with, or that Start <= I->End. If End < I->Start or I == E, then we need - // to insert a new range. Handle this now. - if (I == E || End < I->Start) { - MemsetRange &R = *Ranges.insert(I, MemsetRange()); - R.Start = Start; - R.End = End; - R.StartPtr = SI->getPointerOperand(); - R.Alignment = SI->getAlignment(); - R.TheStores.push_back(SI); - return; - } - - // This store overlaps with I, add it. - I->TheStores.push_back(SI); - - // At this point, we may have an interval that completely contains our store. - // If so, just add it to the interval and return. - if (I->Start <= Start && I->End >= End) - return; - - // Now we know that Start <= I->End and End >= I->Start so the range overlaps - // but is not entirely contained within the range. - - // See if the range extends the start of the range. In this case, it couldn't - // possibly cause it to join the prior range, because otherwise we would have - // stopped on *it*. - if (Start < I->Start) { - I->Start = Start; - I->StartPtr = SI->getPointerOperand(); - } - - // Now we know that Start <= I->End and Start >= I->Start (so the startpoint - // is in or right at the end of I), and that End >= I->Start. Extend I out to - // End. - if (End > I->End) { - I->End = End; - range_iterator NextI = I;; - while (++NextI != E && End >= NextI->Start) { - // Merge the range in. - I->TheStores.append(NextI->TheStores.begin(), NextI->TheStores.end()); - if (NextI->End > I->End) - I->End = NextI->End; - Ranges.erase(NextI); - NextI = I; - } - } -} - - - -/// processStore - When GVN is scanning forward over instructions, we look for -/// some other patterns to fold away. In particular, this looks for stores to -/// neighboring locations of memory. If it sees enough consequtive ones -/// (currently 4) it attempts to merge them together into a memcpy/memset. -bool GVN::processStore(StoreInst *SI, SmallVectorImpl &toErase) { - if (!FormMemSet) return false; - if (SI->isVolatile()) return false; - - // There are two cases that are interesting for this code to handle: memcpy - // and memset. Right now we only handle memset. - - // Ensure that the value being stored is something that can be memset'able a - // byte at a time like "0" or "-1" or any width, as well as things like - // 0xA0A0A0A0 and 0.0. - Value *ByteVal = isBytewiseValue(SI->getOperand(0)); - if (!ByteVal) - return false; - - TargetData &TD = getAnalysis(); - AliasAnalysis &AA = getAnalysis(); - - // Okay, so we now have a single store that can be splatable. Scan to find - // all subsequent stores of the same value to offset from the same pointer. - // Join these together into ranges, so we can decide whether contiguous blocks - // are stored. - MemsetRanges Ranges(TD); - - Value *StartPtr = SI->getPointerOperand(); - - BasicBlock::iterator BI = SI; - for (++BI; !isa(BI); ++BI) { - if (isa(BI) || isa(BI)) { - // If the call is readnone, ignore it, otherwise bail out. We don't even - // allow readonly here because we don't want something like: - // A[1] = 2; strlen(A); A[2] = 2; -> memcpy(A, ...); strlen(A). - if (AA.getModRefBehavior(CallSite::get(BI)) == - AliasAnalysis::DoesNotAccessMemory) - continue; - - // TODO: If this is a memset, try to join it in. - - break; - } else if (isa(BI) || isa(BI)) - break; - - // If this is a non-store instruction it is fine, ignore it. - StoreInst *NextStore = dyn_cast(BI); - if (NextStore == 0) continue; - - // If this is a store, see if we can merge it in. - if (NextStore->isVolatile()) break; - - // Check to see if this stored value is of the same byte-splattable value. - if (ByteVal != isBytewiseValue(NextStore->getOperand(0))) - break; - - // Check to see if this store is to a constant offset from the start ptr. - int64_t Offset; - if (!IsPointerOffset(StartPtr, NextStore->getPointerOperand(), Offset, TD)) - break; - - Ranges.addStore(Offset, NextStore); - } - - // If we have no ranges, then we just had a single store with nothing that - // could be merged in. This is a very common case of course. - if (Ranges.empty()) - return false; - - // If we had at least one store that could be merged in, add the starting - // store as well. We try to avoid this unless there is at least something - // interesting as a small compile-time optimization. - Ranges.addStore(0, SI); - - - Function *MemSetF = 0; - - // Now that we have full information about ranges, loop over the ranges and - // emit memset's for anything big enough to be worthwhile. - bool MadeChange = false; - for (MemsetRanges::const_iterator I = Ranges.begin(), E = Ranges.end(); - I != E; ++I) { - const MemsetRange &Range = *I; - - if (Range.TheStores.size() == 1) continue; - - // If it is profitable to lower this range to memset, do so now. - if (!Range.isProfitableToUseMemset(TD)) - continue; - - // Otherwise, we do want to transform this! Create a new memset. We put - // the memset right before the first instruction that isn't part of this - // memset block. This ensure that the memset is dominated by any addressing - // instruction needed by the start of the block. - BasicBlock::iterator InsertPt = BI; - - if (MemSetF == 0) - MemSetF = Intrinsic::getDeclaration(SI->getParent()->getParent() - ->getParent(), Intrinsic::memset_i64); - - // Get the starting pointer of the block. - StartPtr = Range.StartPtr; - - // Cast the start ptr to be i8* as memset requires. - const Type *i8Ptr = PointerType::getUnqual(Type::Int8Ty); - if (StartPtr->getType() != i8Ptr) - StartPtr = new BitCastInst(StartPtr, i8Ptr, StartPtr->getNameStart(), - InsertPt); - - Value *Ops[] = { - StartPtr, ByteVal, // Start, value - ConstantInt::get(Type::Int64Ty, Range.End-Range.Start), // size - ConstantInt::get(Type::Int32Ty, Range.Alignment) // align - }; - Value *C = CallInst::Create(MemSetF, Ops, Ops+4, "", InsertPt); - DEBUG(cerr << "Replace stores:\n"; - for (unsigned i = 0, e = Range.TheStores.size(); i != e; ++i) - cerr << *Range.TheStores[i]; - cerr << "With: " << *C); C=C; - - // Zap all the stores. - toErase.append(Range.TheStores.begin(), Range.TheStores.end()); - ++NumMemSetInfer; - MadeChange = true; - } - - return MadeChange; -} - - -/// performCallSlotOptzn - takes a memcpy and a call that it depends on, -/// and checks for the possibility of a call slot optimization by having -/// the call write its result directly into the destination of the memcpy. -bool GVN::performCallSlotOptzn(MemCpyInst *cpy, CallInst *C, - SmallVectorImpl &toErase) { - // The general transformation to keep in mind is - // - // call @func(..., src, ...) - // memcpy(dest, src, ...) - // - // -> - // - // memcpy(dest, src, ...) - // call @func(..., dest, ...) - // - // Since moving the memcpy is technically awkward, we additionally check that - // src only holds uninitialized values at the moment of the call, meaning that - // the memcpy can be discarded rather than moved. - - // Deliberately get the source and destination with bitcasts stripped away, - // because we'll need to do type comparisons based on the underlying type. - Value* cpyDest = cpy->getDest(); - Value* cpySrc = cpy->getSource(); - CallSite CS = CallSite::get(C); - - // We need to be able to reason about the size of the memcpy, so we require - // that it be a constant. - ConstantInt* cpyLength = dyn_cast(cpy->getLength()); - if (!cpyLength) - return false; - - // Require that src be an alloca. This simplifies the reasoning considerably. - AllocaInst* srcAlloca = dyn_cast(cpySrc); - if (!srcAlloca) - return false; - - // Check that all of src is copied to dest. - TargetData& TD = getAnalysis(); - - ConstantInt* srcArraySize = dyn_cast(srcAlloca->getArraySize()); - if (!srcArraySize) - return false; - - uint64_t srcSize = TD.getABITypeSize(srcAlloca->getAllocatedType()) * - srcArraySize->getZExtValue(); - - if (cpyLength->getZExtValue() < srcSize) - return false; - - // Check that accessing the first srcSize bytes of dest will not cause a - // trap. Otherwise the transform is invalid since it might cause a trap - // to occur earlier than it otherwise would. - if (AllocaInst* A = dyn_cast(cpyDest)) { - // The destination is an alloca. Check it is larger than srcSize. - ConstantInt* destArraySize = dyn_cast(A->getArraySize()); - if (!destArraySize) - return false; - - uint64_t destSize = TD.getABITypeSize(A->getAllocatedType()) * - destArraySize->getZExtValue(); - - if (destSize < srcSize) - return false; - } else if (Argument* A = dyn_cast(cpyDest)) { - // If the destination is an sret parameter then only accesses that are - // outside of the returned struct type can trap. - if (!A->hasStructRetAttr()) - return false; - - const Type* StructTy = cast(A->getType())->getElementType(); - uint64_t destSize = TD.getABITypeSize(StructTy); - - if (destSize < srcSize) - return false; - } else { - return false; - } - - // Check that src is not accessed except via the call and the memcpy. This - // guarantees that it holds only undefined values when passed in (so the final - // memcpy can be dropped), that it is not read or written between the call and - // the memcpy, and that writing beyond the end of it is undefined. - SmallVector srcUseList(srcAlloca->use_begin(), - srcAlloca->use_end()); - while (!srcUseList.empty()) { - User* UI = srcUseList.back(); - srcUseList.pop_back(); - - if (isa(UI) || isa(UI)) { - for (User::use_iterator I = UI->use_begin(), E = UI->use_end(); - I != E; ++I) - srcUseList.push_back(*I); - } else if (UI != C && UI != cpy) { - return false; - } - } - - // Since we're changing the parameter to the callsite, we need to make sure - // that what would be the new parameter dominates the callsite. - DominatorTree& DT = getAnalysis(); - if (Instruction* cpyDestInst = dyn_cast(cpyDest)) - if (!DT.dominates(cpyDestInst, C)) - return false; - - // In addition to knowing that the call does not access src in some - // unexpected manner, for example via a global, which we deduce from - // the use analysis, we also need to know that it does not sneakily - // access dest. We rely on AA to figure this out for us. - AliasAnalysis& AA = getAnalysis(); - if (AA.getModRefInfo(C, cpy->getRawDest(), srcSize) != - AliasAnalysis::NoModRef) - return false; - - // All the checks have passed, so do the transformation. - for (unsigned i = 0; i < CS.arg_size(); ++i) - if (CS.getArgument(i) == cpySrc) { - if (cpySrc->getType() != cpyDest->getType()) - cpyDest = CastInst::createPointerCast(cpyDest, cpySrc->getType(), - cpyDest->getName(), C); - CS.setArgument(i, cpyDest); - } - - // Drop any cached information about the call, because we may have changed - // its dependence information by changing its parameter. - MemoryDependenceAnalysis& MD = getAnalysis(); - MD.dropInstruction(C); - - // Remove the memcpy - MD.removeInstruction(cpy); - toErase.push_back(cpy); - - return true; -} - -/// processMemCpy - perform simplication of memcpy's. If we have memcpy A which -/// copies X to Y, and memcpy B which copies Y to Z, then we can rewrite B to be -/// a memcpy from X to Z (or potentially a memmove, depending on circumstances). -/// This allows later passes to remove the first memcpy altogether. -bool GVN::processMemCpy(MemCpyInst* M, MemCpyInst* MDep, - SmallVectorImpl &toErase) { - // We can only transforms memcpy's where the dest of one is the source of the - // other - if (M->getSource() != MDep->getDest()) - return false; - - // Second, the length of the memcpy's must be the same, or the preceeding one - // must be larger than the following one. - ConstantInt* C1 = dyn_cast(MDep->getLength()); - ConstantInt* C2 = dyn_cast(M->getLength()); - if (!C1 || !C2) - return false; - - uint64_t DepSize = C1->getValue().getZExtValue(); - uint64_t CpySize = C2->getValue().getZExtValue(); - - if (DepSize < CpySize) - return false; - - // Finally, we have to make sure that the dest of the second does not - // alias the source of the first - AliasAnalysis& AA = getAnalysis(); - if (AA.alias(M->getRawDest(), CpySize, MDep->getRawSource(), DepSize) != - AliasAnalysis::NoAlias) - return false; - else if (AA.alias(M->getRawDest(), CpySize, M->getRawSource(), CpySize) != - AliasAnalysis::NoAlias) - return false; - else if (AA.alias(MDep->getRawDest(), DepSize, MDep->getRawSource(), DepSize) - != AliasAnalysis::NoAlias) - return false; - - // If all checks passed, then we can transform these memcpy's - Function* MemCpyFun = Intrinsic::getDeclaration( - M->getParent()->getParent()->getParent(), - M->getIntrinsicID()); - - std::vector args; - args.push_back(M->getRawDest()); - args.push_back(MDep->getRawSource()); - args.push_back(M->getLength()); - args.push_back(M->getAlignment()); - - CallInst* C = CallInst::Create(MemCpyFun, args.begin(), args.end(), "", M); - - MemoryDependenceAnalysis& MD = getAnalysis(); - if (MD.getDependency(C) == MDep) { - MD.dropInstruction(M); - toErase.push_back(M); - return true; - } - - MD.removeInstruction(C); - toErase.push_back(C); - return false; -} - /// processInstruction - When calculating availability, handle an instruction /// by inserting it into the appropriate sets bool GVN::processInstruction(Instruction *I, ValueNumberedSet &currAvail, @@ -1578,31 +978,11 @@ bool GVN::processInstruction(Instruction *I, ValueNumberedSet &currAvail, if (LoadInst* L = dyn_cast(I)) return processLoad(L, lastSeenLoad, toErase); - if (StoreInst *SI = dyn_cast(I)) - return processStore(SI, toErase); - // Allocations are always uniquely numbered, so we can save time and memory // by fast failing them. if (isa(I)) return false; - if (MemCpyInst* M = dyn_cast(I)) { - MemoryDependenceAnalysis& MD = getAnalysis(); - - // The are two possible optimizations we can do for memcpy: - // a) memcpy-memcpy xform which exposes redundance for DSE - // b) call-memcpy xform for return slot optimization - Instruction* dep = MD.getDependency(M); - if (dep == MemoryDependenceAnalysis::None || - dep == MemoryDependenceAnalysis::NonLocal) - return false; - if (MemCpyInst *MemCpy = dyn_cast(dep)) - return processMemCpy(M, MemCpy, toErase); - if (CallInst* C = dyn_cast(dep)) - return performCallSlotOptzn(M, C, toErase); - return false; - } - unsigned num = VN.lookup_or_add(I); // Collapse PHI nodes diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp new file mode 100644 index 000000000000..f990ba870ec1 --- /dev/null +++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -0,0 +1,769 @@ +//===- MemCpyOptimizer.cpp - Optimize use of memcpy and friends -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass performs various transformations related to eliminating memcpy +// calls, or transforming sets of stores into memset's. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "memcpyopt" +#include "llvm/Transforms/Scalar.h" +#include "llvm/BasicBlock.h" +#include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Function.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/Instructions.h" +#include "llvm/ParameterAttributes.h" +#include "llvm/Value.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/Dominators.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/MemoryDependenceAnalysis.h" +#include "llvm/Support/CFG.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/GetElementPtrTypeIterator.h" +#include "llvm/Target/TargetData.h" +#include +using namespace llvm; + +STATISTIC(NumMemCpyInstr, "Number of memcpy instructions deleted"); +STATISTIC(NumMemSetInfer, "Number of memsets inferred"); + +namespace { + cl::opt + FormMemSet("form-memset-from-stores", + cl::desc("Transform straight-line stores to memsets"), + cl::init(true), cl::Hidden); +} + +/// isBytewiseValue - If the specified value can be set by repeating the same +/// byte in memory, return the i8 value that it is represented with. This is +/// true for all i8 values obviously, but is also true for i32 0, i32 -1, +/// i16 0xF0F0, double 0.0 etc. If the value can't be handled with a repeated +/// byte store (e.g. i16 0x1234), return null. +static Value *isBytewiseValue(Value *V) { + // All byte-wide stores are splatable, even of arbitrary variables. + if (V->getType() == Type::Int8Ty) return V; + + // Constant float and double values can be handled as integer values if the + // corresponding integer value is "byteable". An important case is 0.0. + if (ConstantFP *CFP = dyn_cast(V)) { + if (CFP->getType() == Type::FloatTy) + V = ConstantExpr::getBitCast(CFP, Type::Int32Ty); + if (CFP->getType() == Type::DoubleTy) + V = ConstantExpr::getBitCast(CFP, Type::Int64Ty); + // Don't handle long double formats, which have strange constraints. + } + + // We can handle constant integers that are power of two in size and a + // multiple of 8 bits. + if (ConstantInt *CI = dyn_cast(V)) { + unsigned Width = CI->getBitWidth(); + if (isPowerOf2_32(Width) && Width > 8) { + // We can handle this value if the recursive binary decomposition is the + // same at all levels. + APInt Val = CI->getValue(); + APInt Val2; + while (Val.getBitWidth() != 8) { + unsigned NextWidth = Val.getBitWidth()/2; + Val2 = Val.lshr(NextWidth); + Val2.trunc(Val.getBitWidth()/2); + Val.trunc(Val.getBitWidth()/2); + + // If the top/bottom halves aren't the same, reject it. + if (Val != Val2) + return 0; + } + return ConstantInt::get(Val); + } + } + + // Conceptually, we could handle things like: + // %a = zext i8 %X to i16 + // %b = shl i16 %a, 8 + // %c = or i16 %a, %b + // but until there is an example that actually needs this, it doesn't seem + // worth worrying about. + return 0; +} + +static int64_t GetOffsetFromIndex(const GetElementPtrInst *GEP, unsigned Idx, + bool &VariableIdxFound, TargetData &TD) { + // Skip over the first indices. + gep_type_iterator GTI = gep_type_begin(GEP); + for (unsigned i = 1; i != Idx; ++i, ++GTI) + /*skip along*/; + + // Compute the offset implied by the rest of the indices. + int64_t Offset = 0; + for (unsigned i = Idx, e = GEP->getNumOperands(); i != e; ++i, ++GTI) { + ConstantInt *OpC = dyn_cast(GEP->getOperand(i)); + if (OpC == 0) + return VariableIdxFound = true; + if (OpC->isZero()) continue; // No offset. + + // Handle struct indices, which add their field offset to the pointer. + if (const StructType *STy = dyn_cast(*GTI)) { + Offset += TD.getStructLayout(STy)->getElementOffset(OpC->getZExtValue()); + continue; + } + + // Otherwise, we have a sequential type like an array or vector. Multiply + // the index by the ElementSize. + uint64_t Size = TD.getABITypeSize(GTI.getIndexedType()); + Offset += Size*OpC->getSExtValue(); + } + + return Offset; +} + +/// IsPointerOffset - Return true if Ptr1 is provably equal to Ptr2 plus a +/// constant offset, and return that constant offset. For example, Ptr1 might +/// be &A[42], and Ptr2 might be &A[40]. In this case offset would be -8. +static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset, + TargetData &TD) { + // Right now we handle the case when Ptr1/Ptr2 are both GEPs with an identical + // base. After that base, they may have some number of common (and + // potentially variable) indices. After that they handle some constant + // offset, which determines their offset from each other. At this point, we + // handle no other case. + GetElementPtrInst *GEP1 = dyn_cast(Ptr1); + GetElementPtrInst *GEP2 = dyn_cast(Ptr2); + if (!GEP1 || !GEP2 || GEP1->getOperand(0) != GEP2->getOperand(0)) + return false; + + // Skip any common indices and track the GEP types. + unsigned Idx = 1; + for (; Idx != GEP1->getNumOperands() && Idx != GEP2->getNumOperands(); ++Idx) + if (GEP1->getOperand(Idx) != GEP2->getOperand(Idx)) + break; + + bool VariableIdxFound = false; + int64_t Offset1 = GetOffsetFromIndex(GEP1, Idx, VariableIdxFound, TD); + int64_t Offset2 = GetOffsetFromIndex(GEP2, Idx, VariableIdxFound, TD); + if (VariableIdxFound) return false; + + Offset = Offset2-Offset1; + return true; +} + + +/// MemsetRange - Represents a range of memset'd bytes with the ByteVal value. +/// This allows us to analyze stores like: +/// store 0 -> P+1 +/// store 0 -> P+0 +/// store 0 -> P+3 +/// store 0 -> P+2 +/// which sometimes happens with stores to arrays of structs etc. When we see +/// the first store, we make a range [1, 2). The second store extends the range +/// to [0, 2). The third makes a new range [2, 3). The fourth store joins the +/// two ranges into [0, 3) which is memset'able. +namespace { +struct MemsetRange { + // Start/End - A semi range that describes the span that this range covers. + // The range is closed at the start and open at the end: [Start, End). + int64_t Start, End; + + /// StartPtr - The getelementptr instruction that points to the start of the + /// range. + Value *StartPtr; + + /// Alignment - The known alignment of the first store. + unsigned Alignment; + + /// TheStores - The actual stores that make up this range. + SmallVector TheStores; + + bool isProfitableToUseMemset(const TargetData &TD) const; + +}; +} // end anon namespace + +bool MemsetRange::isProfitableToUseMemset(const TargetData &TD) const { + // If we found more than 8 stores to merge or 64 bytes, use memset. + if (TheStores.size() >= 8 || End-Start >= 64) return true; + + // Assume that the code generator is capable of merging pairs of stores + // together if it wants to. + if (TheStores.size() <= 2) return false; + + // If we have fewer than 8 stores, it can still be worthwhile to do this. + // For example, merging 4 i8 stores into an i32 store is useful almost always. + // However, merging 2 32-bit stores isn't useful on a 32-bit architecture (the + // memset will be split into 2 32-bit stores anyway) and doing so can + // pessimize the llvm optimizer. + // + // Since we don't have perfect knowledge here, make some assumptions: assume + // the maximum GPR width is the same size as the pointer size and assume that + // this width can be stored. If so, check to see whether we will end up + // actually reducing the number of stores used. + unsigned Bytes = unsigned(End-Start); + unsigned NumPointerStores = Bytes/TD.getPointerSize(); + + // Assume the remaining bytes if any are done a byte at a time. + unsigned NumByteStores = Bytes - NumPointerStores*TD.getPointerSize(); + + // If we will reduce the # stores (according to this heuristic), do the + // transformation. This encourages merging 4 x i8 -> i32 and 2 x i16 -> i32 + // etc. + return TheStores.size() > NumPointerStores+NumByteStores; +} + + +namespace { +class MemsetRanges { + /// Ranges - A sorted list of the memset ranges. We use std::list here + /// because each element is relatively large and expensive to copy. + std::list Ranges; + typedef std::list::iterator range_iterator; + TargetData &TD; +public: + MemsetRanges(TargetData &td) : TD(td) {} + + typedef std::list::const_iterator const_iterator; + const_iterator begin() const { return Ranges.begin(); } + const_iterator end() const { return Ranges.end(); } + bool empty() const { return Ranges.empty(); } + + void addStore(int64_t OffsetFromFirst, StoreInst *SI); +}; + +} // end anon namespace + + +/// addStore - Add a new store to the MemsetRanges data structure. This adds a +/// new range for the specified store at the specified offset, merging into +/// existing ranges as appropriate. +void MemsetRanges::addStore(int64_t Start, StoreInst *SI) { + int64_t End = Start+TD.getTypeStoreSize(SI->getOperand(0)->getType()); + + // Do a linear search of the ranges to see if this can be joined and/or to + // find the insertion point in the list. We keep the ranges sorted for + // simplicity here. This is a linear search of a linked list, which is ugly, + // however the number of ranges is limited, so this won't get crazy slow. + range_iterator I = Ranges.begin(), E = Ranges.end(); + + while (I != E && Start > I->End) + ++I; + + // We now know that I == E, in which case we didn't find anything to merge + // with, or that Start <= I->End. If End < I->Start or I == E, then we need + // to insert a new range. Handle this now. + if (I == E || End < I->Start) { + MemsetRange &R = *Ranges.insert(I, MemsetRange()); + R.Start = Start; + R.End = End; + R.StartPtr = SI->getPointerOperand(); + R.Alignment = SI->getAlignment(); + R.TheStores.push_back(SI); + return; + } + + // This store overlaps with I, add it. + I->TheStores.push_back(SI); + + // At this point, we may have an interval that completely contains our store. + // If so, just add it to the interval and return. + if (I->Start <= Start && I->End >= End) + return; + + // Now we know that Start <= I->End and End >= I->Start so the range overlaps + // but is not entirely contained within the range. + + // See if the range extends the start of the range. In this case, it couldn't + // possibly cause it to join the prior range, because otherwise we would have + // stopped on *it*. + if (Start < I->Start) { + I->Start = Start; + I->StartPtr = SI->getPointerOperand(); + } + + // Now we know that Start <= I->End and Start >= I->Start (so the startpoint + // is in or right at the end of I), and that End >= I->Start. Extend I out to + // End. + if (End > I->End) { + I->End = End; + range_iterator NextI = I;; + while (++NextI != E && End >= NextI->Start) { + // Merge the range in. + I->TheStores.append(NextI->TheStores.begin(), NextI->TheStores.end()); + if (NextI->End > I->End) + I->End = NextI->End; + Ranges.erase(NextI); + NextI = I; + } + } +} + +//===----------------------------------------------------------------------===// +// MemCpyOpt Pass +//===----------------------------------------------------------------------===// + +namespace { + + class VISIBILITY_HIDDEN MemCpyOpt : public FunctionPass { + bool runOnFunction(Function &F); + public: + static char ID; // Pass identification, replacement for typeid + MemCpyOpt() : FunctionPass((intptr_t)&ID) { } + + private: + // This transformation requires dominator postdominator info + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + } + + // Helper fuctions + bool processInstruction(Instruction* I, + SmallVectorImpl &toErase); + bool processStore(StoreInst *SI, SmallVectorImpl &toErase); + bool processMemCpy(MemCpyInst* M, MemCpyInst* MDep, + SmallVectorImpl &toErase); + bool performCallSlotOptzn(MemCpyInst* cpy, CallInst* C, + SmallVectorImpl &toErase); + bool iterateOnFunction(Function &F); + }; + + char MemCpyOpt::ID = 0; +} + +// createMemCpyOptPass - The public interface to this file... +FunctionPass *llvm::createMemCpyOptPass() { return new MemCpyOpt(); } + +static RegisterPass X("memcpyopt", + "MemCpy Optimization"); + + + +/// processStore - When GVN is scanning forward over instructions, we look for +/// some other patterns to fold away. In particular, this looks for stores to +/// neighboring locations of memory. If it sees enough consequtive ones +/// (currently 4) it attempts to merge them together into a memcpy/memset. +bool MemCpyOpt::processStore(StoreInst *SI, SmallVectorImpl &toErase) { + if (!FormMemSet) return false; + if (SI->isVolatile()) return false; + + // There are two cases that are interesting for this code to handle: memcpy + // and memset. Right now we only handle memset. + + // Ensure that the value being stored is something that can be memset'able a + // byte at a time like "0" or "-1" or any width, as well as things like + // 0xA0A0A0A0 and 0.0. + Value *ByteVal = isBytewiseValue(SI->getOperand(0)); + if (!ByteVal) + return false; + + TargetData &TD = getAnalysis(); + AliasAnalysis &AA = getAnalysis(); + + // Okay, so we now have a single store that can be splatable. Scan to find + // all subsequent stores of the same value to offset from the same pointer. + // Join these together into ranges, so we can decide whether contiguous blocks + // are stored. + MemsetRanges Ranges(TD); + + Value *StartPtr = SI->getPointerOperand(); + + BasicBlock::iterator BI = SI; + for (++BI; !isa(BI); ++BI) { + if (isa(BI) || isa(BI)) { + // If the call is readnone, ignore it, otherwise bail out. We don't even + // allow readonly here because we don't want something like: + // A[1] = 2; strlen(A); A[2] = 2; -> memcpy(A, ...); strlen(A). + if (AA.getModRefBehavior(CallSite::get(BI)) == + AliasAnalysis::DoesNotAccessMemory) + continue; + + // TODO: If this is a memset, try to join it in. + + break; + } else if (isa(BI) || isa(BI)) + break; + + // If this is a non-store instruction it is fine, ignore it. + StoreInst *NextStore = dyn_cast(BI); + if (NextStore == 0) continue; + + // If this is a store, see if we can merge it in. + if (NextStore->isVolatile()) break; + + // Check to see if this stored value is of the same byte-splattable value. + if (ByteVal != isBytewiseValue(NextStore->getOperand(0))) + break; + + // Check to see if this store is to a constant offset from the start ptr. + int64_t Offset; + if (!IsPointerOffset(StartPtr, NextStore->getPointerOperand(), Offset, TD)) + break; + + Ranges.addStore(Offset, NextStore); + } + + // If we have no ranges, then we just had a single store with nothing that + // could be merged in. This is a very common case of course. + if (Ranges.empty()) + return false; + + // If we had at least one store that could be merged in, add the starting + // store as well. We try to avoid this unless there is at least something + // interesting as a small compile-time optimization. + Ranges.addStore(0, SI); + + + Function *MemSetF = 0; + + // Now that we have full information about ranges, loop over the ranges and + // emit memset's for anything big enough to be worthwhile. + bool MadeChange = false; + for (MemsetRanges::const_iterator I = Ranges.begin(), E = Ranges.end(); + I != E; ++I) { + const MemsetRange &Range = *I; + + if (Range.TheStores.size() == 1) continue; + + // If it is profitable to lower this range to memset, do so now. + if (!Range.isProfitableToUseMemset(TD)) + continue; + + // Otherwise, we do want to transform this! Create a new memset. We put + // the memset right before the first instruction that isn't part of this + // memset block. This ensure that the memset is dominated by any addressing + // instruction needed by the start of the block. + BasicBlock::iterator InsertPt = BI; + + if (MemSetF == 0) + MemSetF = Intrinsic::getDeclaration(SI->getParent()->getParent() + ->getParent(), Intrinsic::memset_i64); + + // Get the starting pointer of the block. + StartPtr = Range.StartPtr; + + // Cast the start ptr to be i8* as memset requires. + const Type *i8Ptr = PointerType::getUnqual(Type::Int8Ty); + if (StartPtr->getType() != i8Ptr) + StartPtr = new BitCastInst(StartPtr, i8Ptr, StartPtr->getNameStart(), + InsertPt); + + Value *Ops[] = { + StartPtr, ByteVal, // Start, value + ConstantInt::get(Type::Int64Ty, Range.End-Range.Start), // size + ConstantInt::get(Type::Int32Ty, Range.Alignment) // align + }; + Value *C = CallInst::Create(MemSetF, Ops, Ops+4, "", InsertPt); + DEBUG(cerr << "Replace stores:\n"; + for (unsigned i = 0, e = Range.TheStores.size(); i != e; ++i) + cerr << *Range.TheStores[i]; + cerr << "With: " << *C); C=C; + + // Zap all the stores. + toErase.append(Range.TheStores.begin(), Range.TheStores.end()); + ++NumMemSetInfer; + MadeChange = true; + } + + return MadeChange; +} + + +/// performCallSlotOptzn - takes a memcpy and a call that it depends on, +/// and checks for the possibility of a call slot optimization by having +/// the call write its result directly into the destination of the memcpy. +bool MemCpyOpt::performCallSlotOptzn(MemCpyInst *cpy, CallInst *C, + SmallVectorImpl &toErase) { + // The general transformation to keep in mind is + // + // call @func(..., src, ...) + // memcpy(dest, src, ...) + // + // -> + // + // memcpy(dest, src, ...) + // call @func(..., dest, ...) + // + // Since moving the memcpy is technically awkward, we additionally check that + // src only holds uninitialized values at the moment of the call, meaning that + // the memcpy can be discarded rather than moved. + + // Deliberately get the source and destination with bitcasts stripped away, + // because we'll need to do type comparisons based on the underlying type. + Value* cpyDest = cpy->getDest(); + Value* cpySrc = cpy->getSource(); + CallSite CS = CallSite::get(C); + + // We need to be able to reason about the size of the memcpy, so we require + // that it be a constant. + ConstantInt* cpyLength = dyn_cast(cpy->getLength()); + if (!cpyLength) + return false; + + // Require that src be an alloca. This simplifies the reasoning considerably. + AllocaInst* srcAlloca = dyn_cast(cpySrc); + if (!srcAlloca) + return false; + + // Check that all of src is copied to dest. + TargetData& TD = getAnalysis(); + + ConstantInt* srcArraySize = dyn_cast(srcAlloca->getArraySize()); + if (!srcArraySize) + return false; + + uint64_t srcSize = TD.getABITypeSize(srcAlloca->getAllocatedType()) * + srcArraySize->getZExtValue(); + + if (cpyLength->getZExtValue() < srcSize) + return false; + + // Check that accessing the first srcSize bytes of dest will not cause a + // trap. Otherwise the transform is invalid since it might cause a trap + // to occur earlier than it otherwise would. + if (AllocaInst* A = dyn_cast(cpyDest)) { + // The destination is an alloca. Check it is larger than srcSize. + ConstantInt* destArraySize = dyn_cast(A->getArraySize()); + if (!destArraySize) + return false; + + uint64_t destSize = TD.getABITypeSize(A->getAllocatedType()) * + destArraySize->getZExtValue(); + + if (destSize < srcSize) + return false; + } else if (Argument* A = dyn_cast(cpyDest)) { + // If the destination is an sret parameter then only accesses that are + // outside of the returned struct type can trap. + if (!A->hasStructRetAttr()) + return false; + + const Type* StructTy = cast(A->getType())->getElementType(); + uint64_t destSize = TD.getABITypeSize(StructTy); + + if (destSize < srcSize) + return false; + } else { + return false; + } + + // Check that src is not accessed except via the call and the memcpy. This + // guarantees that it holds only undefined values when passed in (so the final + // memcpy can be dropped), that it is not read or written between the call and + // the memcpy, and that writing beyond the end of it is undefined. + SmallVector srcUseList(srcAlloca->use_begin(), + srcAlloca->use_end()); + while (!srcUseList.empty()) { + User* UI = srcUseList.back(); + srcUseList.pop_back(); + + if (isa(UI) || isa(UI)) { + for (User::use_iterator I = UI->use_begin(), E = UI->use_end(); + I != E; ++I) + srcUseList.push_back(*I); + } else if (UI != C && UI != cpy) { + return false; + } + } + + // Since we're changing the parameter to the callsite, we need to make sure + // that what would be the new parameter dominates the callsite. + DominatorTree& DT = getAnalysis(); + if (Instruction* cpyDestInst = dyn_cast(cpyDest)) + if (!DT.dominates(cpyDestInst, C)) + return false; + + // In addition to knowing that the call does not access src in some + // unexpected manner, for example via a global, which we deduce from + // the use analysis, we also need to know that it does not sneakily + // access dest. We rely on AA to figure this out for us. + AliasAnalysis& AA = getAnalysis(); + if (AA.getModRefInfo(C, cpy->getRawDest(), srcSize) != + AliasAnalysis::NoModRef) + return false; + + // All the checks have passed, so do the transformation. + for (unsigned i = 0; i < CS.arg_size(); ++i) + if (CS.getArgument(i) == cpySrc) { + if (cpySrc->getType() != cpyDest->getType()) + cpyDest = CastInst::createPointerCast(cpyDest, cpySrc->getType(), + cpyDest->getName(), C); + CS.setArgument(i, cpyDest); + } + + // Drop any cached information about the call, because we may have changed + // its dependence information by changing its parameter. + MemoryDependenceAnalysis& MD = getAnalysis(); + MD.dropInstruction(C); + + // Remove the memcpy + MD.removeInstruction(cpy); + toErase.push_back(cpy); + + return true; +} + +/// processMemCpy - perform simplication of memcpy's. If we have memcpy A which +/// copies X to Y, and memcpy B which copies Y to Z, then we can rewrite B to be +/// a memcpy from X to Z (or potentially a memmove, depending on circumstances). +/// This allows later passes to remove the first memcpy altogether. +bool MemCpyOpt::processMemCpy(MemCpyInst* M, MemCpyInst* MDep, + SmallVectorImpl &toErase) { + // We can only transforms memcpy's where the dest of one is the source of the + // other + if (M->getSource() != MDep->getDest()) + return false; + + // Second, the length of the memcpy's must be the same, or the preceeding one + // must be larger than the following one. + ConstantInt* C1 = dyn_cast(MDep->getLength()); + ConstantInt* C2 = dyn_cast(M->getLength()); + if (!C1 || !C2) + return false; + + uint64_t DepSize = C1->getValue().getZExtValue(); + uint64_t CpySize = C2->getValue().getZExtValue(); + + if (DepSize < CpySize) + return false; + + // Finally, we have to make sure that the dest of the second does not + // alias the source of the first + AliasAnalysis& AA = getAnalysis(); + if (AA.alias(M->getRawDest(), CpySize, MDep->getRawSource(), DepSize) != + AliasAnalysis::NoAlias) + return false; + else if (AA.alias(M->getRawDest(), CpySize, M->getRawSource(), CpySize) != + AliasAnalysis::NoAlias) + return false; + else if (AA.alias(MDep->getRawDest(), DepSize, MDep->getRawSource(), DepSize) + != AliasAnalysis::NoAlias) + return false; + + // If all checks passed, then we can transform these memcpy's + Function* MemCpyFun = Intrinsic::getDeclaration( + M->getParent()->getParent()->getParent(), + M->getIntrinsicID()); + + std::vector args; + args.push_back(M->getRawDest()); + args.push_back(MDep->getRawSource()); + args.push_back(M->getLength()); + args.push_back(M->getAlignment()); + + CallInst* C = CallInst::Create(MemCpyFun, args.begin(), args.end(), "", M); + + MemoryDependenceAnalysis& MD = getAnalysis(); + if (MD.getDependency(C) == MDep) { + MD.dropInstruction(M); + toErase.push_back(M); + return true; + } + + MD.removeInstruction(C); + toErase.push_back(C); + return false; +} + +/// processInstruction - When calculating availability, handle an instruction +/// by inserting it into the appropriate sets +bool MemCpyOpt::processInstruction(Instruction *I, + SmallVectorImpl &toErase) { + if (StoreInst *SI = dyn_cast(I)) + return processStore(SI, toErase); + + if (MemCpyInst* M = dyn_cast(I)) { + MemoryDependenceAnalysis& MD = getAnalysis(); + + // The are two possible optimizations we can do for memcpy: + // a) memcpy-memcpy xform which exposes redundance for DSE + // b) call-memcpy xform for return slot optimization + Instruction* dep = MD.getDependency(M); + if (dep == MemoryDependenceAnalysis::None || + dep == MemoryDependenceAnalysis::NonLocal) + return false; + if (MemCpyInst *MemCpy = dyn_cast(dep)) + return processMemCpy(M, MemCpy, toErase); + if (CallInst* C = dyn_cast(dep)) + return performCallSlotOptzn(M, C, toErase); + return false; + } + + return false; +} + +// MemCpyOpt::runOnFunction - This is the main transformation entry point for a +// function. +// +bool MemCpyOpt::runOnFunction(Function& F) { + + bool changed = false; + bool shouldContinue = true; + + while (shouldContinue) { + shouldContinue = iterateOnFunction(F); + changed |= shouldContinue; + } + + return changed; +} + + +// MemCpyOpt::iterateOnFunction - Executes one iteration of GVN +bool MemCpyOpt::iterateOnFunction(Function &F) { + bool changed_function = false; + + DominatorTree &DT = getAnalysis(); + + SmallVector toErase; + + // Top-down walk of the dominator tree + for (df_iterator DI = df_begin(DT.getRootNode()), + E = df_end(DT.getRootNode()); DI != E; ++DI) { + + BasicBlock* BB = DI->getBlock(); + for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); + BI != BE;) { + changed_function |= processInstruction(BI, toErase); + if (toErase.empty()) { + ++BI; + continue; + } + + // If we need some instructions deleted, do it now. + NumMemCpyInstr += toErase.size(); + + // Avoid iterator invalidation. + bool AtStart = BI == BB->begin(); + if (!AtStart) + --BI; + + for (SmallVector::iterator I = toErase.begin(), + E = toErase.end(); I != E; ++I) + (*I)->eraseFromParent(); + + if (AtStart) + BI = BB->begin(); + else + ++BI; + + toErase.clear(); + } + } + + return changed_function; +} diff --git a/llvm/test/Transforms/GVN/2008-02-24-MultipleUseofSRet.ll b/llvm/test/Transforms/MemCpyOpt/2008-02-24-MultipleUseofSRet.ll similarity index 94% rename from llvm/test/Transforms/GVN/2008-02-24-MultipleUseofSRet.ll rename to llvm/test/Transforms/MemCpyOpt/2008-02-24-MultipleUseofSRet.ll index 21dff98b61b7..30d0a6dbebb5 100644 --- a/llvm/test/Transforms/GVN/2008-02-24-MultipleUseofSRet.ll +++ b/llvm/test/Transforms/MemCpyOpt/2008-02-24-MultipleUseofSRet.ll @@ -1,4 +1,4 @@ -; RUN: llvm-as < %s | opt -gvn -dse | llvm-dis | grep {call.*initialize} | not grep memtmp +; RUN: llvm-as < %s | opt -memcpyopt -dse | llvm-dis | grep {call.*initialize} | not grep memtmp ; PR2077 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32" diff --git a/llvm/test/Transforms/GVN/2008-03-13-ReturnSlotBitcast.ll b/llvm/test/Transforms/MemCpyOpt/2008-03-13-ReturnSlotBitcast.ll similarity index 85% rename from llvm/test/Transforms/GVN/2008-03-13-ReturnSlotBitcast.ll rename to llvm/test/Transforms/MemCpyOpt/2008-03-13-ReturnSlotBitcast.ll index 7757a92b240d..12c9a9d23e53 100644 --- a/llvm/test/Transforms/GVN/2008-03-13-ReturnSlotBitcast.ll +++ b/llvm/test/Transforms/MemCpyOpt/2008-03-13-ReturnSlotBitcast.ll @@ -1,4 +1,4 @@ -; RUN: llvm-as < %s | opt -gvn | llvm-dis | not grep {call.*memcpy.} +; RUN: llvm-as < %s | opt -memcpyopt | llvm-dis | not grep {call.*memcpy.} %a = type { i32 } %b = type { float } diff --git a/llvm/test/Transforms/MemCpyOpt/dg.exp b/llvm/test/Transforms/MemCpyOpt/dg.exp new file mode 100644 index 000000000000..879685ca879d --- /dev/null +++ b/llvm/test/Transforms/MemCpyOpt/dg.exp @@ -0,0 +1,3 @@ +load_lib llvm.exp + +RunLLVMTests [lsort [glob -nocomplain $srcdir/$subdir/*.{ll,llx,c,cpp,tr}]] diff --git a/llvm/test/Transforms/GVN/form-memset.ll b/llvm/test/Transforms/MemCpyOpt/form-memset.ll similarity index 92% rename from llvm/test/Transforms/GVN/form-memset.ll rename to llvm/test/Transforms/MemCpyOpt/form-memset.ll index 3bc557a74f49..cdcd00697ab4 100644 --- a/llvm/test/Transforms/GVN/form-memset.ll +++ b/llvm/test/Transforms/MemCpyOpt/form-memset.ll @@ -1,5 +1,5 @@ -; RUN: llvm-as < %s | opt -gvn -form-memset-from-stores | llvm-dis | not grep store -; RUN: llvm-as < %s | opt -gvn -form-memset-from-stores | llvm-dis | grep {call.*llvm.memset} +; RUN: llvm-as < %s | opt -memcpyopt -form-memset-from-stores | llvm-dis | not grep store +; RUN: llvm-as < %s | opt -memcpyopt -form-memset-from-stores | llvm-dis | grep {call.*llvm.memset} ; All the stores in this example should be merged into a single memset. diff --git a/llvm/test/Transforms/GVN/form-memset2.ll b/llvm/test/Transforms/MemCpyOpt/form-memset2.ll similarity index 96% rename from llvm/test/Transforms/GVN/form-memset2.ll rename to llvm/test/Transforms/MemCpyOpt/form-memset2.ll index f4e7702eb714..46eb6b479d13 100644 --- a/llvm/test/Transforms/GVN/form-memset2.ll +++ b/llvm/test/Transforms/MemCpyOpt/form-memset2.ll @@ -1,5 +1,5 @@ -; RUN: llvm-as < %s | opt -gvn -form-memset-from-stores | llvm-dis | not grep store -; RUN: llvm-as < %s | opt -gvn -form-memset-from-stores | llvm-dis | grep {call.*llvm.memset} | count 3 +; RUN: llvm-as < %s | opt -memcpyopt -form-memset-from-stores | llvm-dis | not grep store +; RUN: llvm-as < %s | opt -memcpyopt -form-memset-from-stores | llvm-dis | grep {call.*llvm.memset} | count 3 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" target triple = "i386-apple-darwin8" diff --git a/llvm/test/Transforms/GVN/memcpy.ll b/llvm/test/Transforms/MemCpyOpt/memcpy.ll similarity index 93% rename from llvm/test/Transforms/GVN/memcpy.ll rename to llvm/test/Transforms/MemCpyOpt/memcpy.ll index f24a81f50d9f..c5cdc29a5cf2 100644 --- a/llvm/test/Transforms/GVN/memcpy.ll +++ b/llvm/test/Transforms/MemCpyOpt/memcpy.ll @@ -1,4 +1,4 @@ -; RUN: llvm-as < %s | opt -gvn -dse | llvm-dis | grep {call.*memcpy} | count 1 +; RUN: llvm-as < %s | opt -memcpyopt -dse | llvm-dis | grep {call.*memcpy} | count 1 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" target triple = "i686-apple-darwin9" diff --git a/llvm/test/Transforms/GVN/sret.ll b/llvm/test/Transforms/MemCpyOpt/sret.ll similarity index 95% rename from llvm/test/Transforms/GVN/sret.ll rename to llvm/test/Transforms/MemCpyOpt/sret.ll index b907b041adb0..1ac11aa959f9 100644 --- a/llvm/test/Transforms/GVN/sret.ll +++ b/llvm/test/Transforms/MemCpyOpt/sret.ll @@ -1,4 +1,4 @@ -; RUN: llvm-as < %s | opt -gvn | llvm-dis | not grep {call.*memcpy} +; RUN: llvm-as < %s | opt -memcpyopt | llvm-dis | not grep {call.*memcpy} target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" target triple = "i686-apple-darwin9" diff --git a/llvm/tools/llvm-ld/Optimize.cpp b/llvm/tools/llvm-ld/Optimize.cpp index 31b0bf0a594e..aff228d3ea42 100644 --- a/llvm/tools/llvm-ld/Optimize.cpp +++ b/llvm/tools/llvm-ld/Optimize.cpp @@ -169,6 +169,7 @@ void Optimize(Module* M) { addPass(Passes, createGlobalsModRefPass()); // IP alias analysis addPass(Passes, createLICMPass()); // Hoist loop invariants + addPass(Passes, createMemCpyOptPass()); // Remove dead memcpy's addPass(Passes, createGVNPass()); // Remove redundancies addPass(Passes, createDeadStoreEliminationPass()); // Nuke dead stores diff --git a/llvm/tools/opt/opt.cpp b/llvm/tools/opt/opt.cpp index cc09642d574c..6155386e6ade 100644 --- a/llvm/tools/opt/opt.cpp +++ b/llvm/tools/opt/opt.cpp @@ -282,6 +282,7 @@ void AddStandardCompilePasses(PassManager &PM) { addPass(PM, createIndVarSimplifyPass()); // Canonicalize indvars addPass(PM, createLoopUnrollPass()); // Unroll small loops addPass(PM, createInstructionCombiningPass()); // Clean up after the unroller + addPass(PM, createMemCpyOptPass()); // Remove unneeded memcpy's addPass(PM, createGVNPass()); // Remove redundancies addPass(PM, createSCCPPass()); // Constant prop with SCCP