[MemCpyOpt] Port to MemorySSA

This is a straightforward port of MemCpyOpt to MemorySSA following
the approach of D26739. MemDep queries are replaced with MSSA queries
without changing the overall structure of the pass. Some care has
to be taken to account for differences between these APIs
(MemDep also returns reads, MSSA doesn't).
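
For illustration only (this sketch is not from the patch; the helper name
findClobber and the UseMSSA flag are invented here), the query translation
looks roughly like this:

  #include "llvm/Analysis/MemoryDependenceAnalysis.h"
  #include "llvm/Analysis/MemorySSA.h"
  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  // Find the instruction clobbering the memory read by LI, in both styles.
  static Instruction *findClobber(LoadInst *LI, MemoryDependenceResults *MD,
                                  MemorySSA *MSSA, bool UseMSSA) {
    if (!UseMSSA) {
      // MemDep: one cached query. Its local scan stops at any instruction
      // that reads or writes the location, so the result kind is checked.
      MemDepResult Dep = MD->getDependency(LI);
      return Dep.isClobber() ? Dep.getInst() : nullptr;
    }
    // MemorySSA: walk to the clobbering access. Plain reads are never
    // returned as clobbers, which is the API difference noted above.
    MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess(LI);
    if (auto *UD = dyn_cast<MemoryUseOrDef>(Clobber))
      return UD->getMemoryInst(); // Null for the live-on-entry def.
    return nullptr;
  }

Unlike a MemDepResult, the walker may also return a MemoryPhi or the
live-on-entry def, which have no underlying instruction, so MemoryUseOrDef
results have to be unwrapped explicitly.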

Differential Revision: https://reviews.llvm.org/D89207
Nikita Popov 2020-10-02 21:41:19 +02:00
parent 941e9336d0
commit 624af932a8
10 changed files with 638 additions and 269 deletions


@@ -43,6 +43,7 @@ class MemCpyOptPass : public PassInfoMixin<MemCpyOptPass> {
AliasAnalysis *AA = nullptr;
AssumptionCache *AC = nullptr;
DominatorTree *DT = nullptr;
MemorySSA *MSSA = nullptr;
MemorySSAUpdater *MSSAU = nullptr;
public:


@@ -67,7 +67,6 @@ using namespace llvm;
#define DEBUG_TYPE "memcpyopt"
// TODO: Actually implement MemorySSA-based MemCpyOpt.
static cl::opt<bool>
EnableMemorySSA("enable-memcpyopt-memoryssa", cl::init(false), cl::Hidden,
cl::desc("Use MemorySSA-backed MemCpyOpt."));
@@ -283,7 +282,8 @@ private:
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
AU.addRequired<MemoryDependenceWrapperPass>();
if (!EnableMemorySSA)
AU.addRequired<MemoryDependenceWrapperPass>();
AU.addPreserved<MemoryDependenceWrapperPass>();
AU.addRequired<AAResultsWrapperPass>();
AU.addPreserved<AAResultsWrapperPass>();
@@ -330,10 +330,37 @@ static bool mayBeVisibleThroughUnwinding(Value *V, Instruction *Start,
void MemCpyOptPass::eraseInstruction(Instruction *I) {
if (MSSAU)
MSSAU->removeMemoryAccess(I);
MD->removeInstruction(I);
if (MD)
MD->removeInstruction(I);
I->eraseFromParent();
}
// Check for mod or ref of Loc between Start and End, excluding both boundaries.
// Start and End must be in the same block
static bool accessedBetween(AliasAnalysis &AA, MemoryLocation Loc,
const MemoryUseOrDef *Start,
const MemoryUseOrDef *End) {
assert(Start->getBlock() == End->getBlock() && "Only local supported");
for (const MemoryAccess &MA :
make_range(++Start->getIterator(), End->getIterator())) {
if (isModOrRefSet(AA.getModRefInfo(cast<MemoryUseOrDef>(MA).getMemoryInst(),
Loc)))
return true;
}
return false;
}
// Check for mod of Loc between Start and End, excluding both boundaries.
// Start and End can be in different blocks.
static bool writtenBetween(MemorySSA *MSSA, MemoryLocation Loc,
const MemoryUseOrDef *Start,
const MemoryUseOrDef *End) {
// TODO: Only walk until we hit Start.
MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess(
End->getDefiningAccess(), Loc);
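// The walker returned the nearest write that clobbers Loc above End. If
// that write dominates Start, it happened at or before Start, so nothing
// wrote to Loc strictly between the two accesses.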
return !MSSA->dominates(Clobber, Start);
}
/// When scanning forward over instructions, we look for some other patterns to
/// fold away. In particular, this looks for stores to neighboring locations of
/// memory. If it sees enough consecutive ones, it attempts to merge them
@@ -645,6 +672,7 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
// the memory we load from in between the load and the store. If
// such an instruction is found, we try to promote there instead
// of at the store position.
// TODO: Can use MSSA for this.
Instruction *P = SI;
for (auto &I : make_range(++LI->getIterator(), SI->getIterator())) {
if (isModSet(AA->getModRefInfo(&I, LoadLoc))) {
@@ -709,20 +737,37 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
// Detect cases where we're performing call slot forwarding, but
// happen to be using a load-store pair to implement it, rather than
// a memcpy.
MemDepResult ldep = MD->getDependency(LI);
CallInst *C = nullptr;
if (ldep.isClobber() && !isa<MemCpyInst>(ldep.getInst()))
C = dyn_cast<CallInst>(ldep.getInst());
if (EnableMemorySSA) {
if (auto *LoadClobber = dyn_cast<MemoryUseOrDef>(
MSSA->getWalker()->getClobberingMemoryAccess(LI))) {
// The load must post-dominate the call. Limit to the same block for now.
// TODO: Support non-local call-slot optimization?
if (LoadClobber->getBlock() == SI->getParent())
C = dyn_cast_or_null<CallInst>(LoadClobber->getMemoryInst());
}
} else {
MemDepResult ldep = MD->getDependency(LI);
if (ldep.isClobber() && !isa<MemCpyInst>(ldep.getInst()))
C = dyn_cast<CallInst>(ldep.getInst());
}
if (C) {
// Check that nothing touches the dest of the "copy" between
// the call and the store.
MemoryLocation StoreLoc = MemoryLocation::get(SI);
for (BasicBlock::iterator I = --SI->getIterator(), E = C->getIterator();
I != E; --I) {
if (isModOrRefSet(AA->getModRefInfo(&*I, StoreLoc))) {
if (EnableMemorySSA) {
if (accessedBetween(*AA, StoreLoc, MSSA->getMemoryAccess(C),
MSSA->getMemoryAccess(SI)))
C = nullptr;
break;
} else {
for (BasicBlock::iterator I = --SI->getIterator(),
E = C->getIterator();
I != E; --I) {
if (isModOrRefSet(AA->getModRefInfo(&*I, StoreLoc))) {
C = nullptr;
break;
}
}
}
}
@@ -972,7 +1017,8 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad,
// Drop any cached information about the call, because we may have changed
// its dependence information by changing its parameter.
MD->removeInstruction(C);
if (MD)
MD->removeInstruction(C);
// Update AA metadata
// FIXME: MD_tbaa_struct and MD_mem_parallel_loop_access should also be
@@ -1020,14 +1066,21 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
//
// TODO: If the code between M and MDep is transparent to the destination "c",
// then we could still perform the xform by moving M up to the first memcpy.
//
// NOTE: This is conservative: it will stop on any read from the source loc,
// not just the defining memcpy.
MemDepResult SourceDep =
MD->getPointerDependencyFrom(MemoryLocation::getForSource(MDep), false,
M->getIterator(), M->getParent());
if (!SourceDep.isClobber() || SourceDep.getInst() != MDep)
return false;
if (EnableMemorySSA) {
// TODO: It would be sufficient to check the MDep source up to the memcpy
// size of M, rather than MDep.
if (writtenBetween(MSSA, MemoryLocation::getForSource(MDep),
MSSA->getMemoryAccess(MDep), MSSA->getMemoryAccess(M)))
return false;
} else {
// NOTE: This is conservative: it will stop on any read from the source loc,
// not just the defining memcpy.
MemDepResult SourceDep =
MD->getPointerDependencyFrom(MemoryLocation::getForSource(MDep), false,
M->getIterator(), M->getParent());
if (!SourceDep.isClobber() || SourceDep.getInst() != MDep)
return false;
}
// If the dest of the second might alias the source of the first, then the
// source and dest might overlap. We still want to eliminate the intermediate
@@ -1095,12 +1148,24 @@ bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy,
LocationSize::precise(1))))
return false;
// Check that there are no other dependencies on the memset destination.
MemDepResult DstDepInfo =
MD->getPointerDependencyFrom(MemoryLocation::getForDest(MemSet), false,
MemCpy->getIterator(), MemCpy->getParent());
if (DstDepInfo.getInst() != MemSet)
return false;
if (EnableMemorySSA) {
// We know that dst up to src_size is not written. We now need to make sure
// that dst up to dst_size is not accessed. (If we did not move the memset,
// checking for reads would be sufficient.)
if (accessedBetween(*AA, MemoryLocation::getForDest(MemSet),
MSSA->getMemoryAccess(MemSet),
MSSA->getMemoryAccess(MemCpy))) {
return false;
}
} else {
// We have already checked that dst up to src_size is not accessed. We
// need to make sure that there are no accesses up to dst_size either.
MemDepResult DstDepInfo = MD->getPointerDependencyFrom(
MemoryLocation::getForDest(MemSet), false, MemCpy->getIterator(),
MemCpy->getParent());
if (DstDepInfo.getInst() != MemSet)
return false;
}
// Use the same i8* dest as the memcpy, killing the memset dest if different.
Value *Dest = MemCpy->getRawDest();
@@ -1172,6 +1237,24 @@ static bool hasUndefContents(Instruction *I, ConstantInt *Size) {
return false;
}
static bool hasUndefContentsMSSA(MemorySSA *MSSA, AliasAnalysis *AA, Value *V,
MemoryDef *Def, ConstantInt *Size) {
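// If the def is MemorySSA's live-on-entry def, nothing in the function
// writes this memory before the copy; a fresh alloca is therefore undef.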
if (MSSA->isLiveOnEntryDef(Def))
return isa<AllocaInst>(getUnderlyingObject(V));
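// A lifetime.start that must-aliases V and covers at least Size bytes
// likewise guarantees undefined contents at this point.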
if (IntrinsicInst *II =
dyn_cast_or_null<IntrinsicInst>(Def->getMemoryInst())) {
if (II->getIntrinsicID() == Intrinsic::lifetime_start) {
ConstantInt *LTSize = cast<ConstantInt>(II->getArgOperand(0));
if (AA->isMustAlias(V, II->getArgOperand(1)) &&
LTSize->getZExtValue() >= Size->getZExtValue())
return true;
}
}
return false;
}
/// Transform memcpy to memset when its source was just memset.
/// In other words, turn:
/// \code
@@ -1207,12 +1290,24 @@ bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy,
// interested in the bytes from MemSetSize..CopySize here, but as we can't
// easily represent this location, we use the full 0..CopySize range.
MemoryLocation MemCpyLoc = MemoryLocation::getForSource(MemCpy);
MemDepResult DepInfo = MD->getPointerDependencyFrom(
MemCpyLoc, true, MemSet->getIterator(), MemSet->getParent());
if (DepInfo.isDef() && hasUndefContents(DepInfo.getInst(), CopySize))
CopySize = MemSetSize;
else
bool CanReduceSize = false;
if (EnableMemorySSA) {
MemoryUseOrDef *MemSetAccess = MSSA->getMemoryAccess(MemSet);
MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess(
MemSetAccess->getDefiningAccess(), MemCpyLoc);
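// Find the write of the memcpy source that is visible just above the
// memset; if that def leaves the source undefined, the tail bytes past
// the memset size are undef and the copy can be shrunk.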
if (auto *MD = dyn_cast<MemoryDef>(Clobber))
if (hasUndefContentsMSSA(MSSA, AA, MemCpy->getSource(), MD, CopySize))
CanReduceSize = true;
} else {
MemDepResult DepInfo = MD->getPointerDependencyFrom(
MemCpyLoc, true, MemSet->getIterator(), MemSet->getParent());
if (DepInfo.isDef() && hasUndefContents(DepInfo.getInst(), CopySize))
CanReduceSize = true;
}
if (!CanReduceSize)
return false;
CopySize = MemSetSize;
}
IRBuilder<> Builder(MemCpy);
@@ -1267,63 +1362,140 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
return true;
}
MemDepResult DepInfo = MD->getDependency(M);
if (EnableMemorySSA) {
MemoryUseOrDef *MA = MSSA->getMemoryAccess(M);
MemoryAccess *AnyClobber = MSSA->getWalker()->getClobberingMemoryAccess(MA);
MemoryLocation DestLoc = MemoryLocation::getForDest(M);
const MemoryAccess *DestClobber =
MSSA->getWalker()->getClobberingMemoryAccess(AnyClobber, DestLoc);
// Try to turn a partially redundant memset + memcpy into
// memcpy + smaller memset. We don't need the memcpy size for this.
if (DepInfo.isClobber())
if (MemSetInst *MDep = dyn_cast<MemSetInst>(DepInfo.getInst()))
if (processMemSetMemCpyDependence(M, MDep))
return true;
// Try to turn a partially redundant memset + memcpy into
// memcpy + smaller memset. We don't need the memcpy size for this.
// The memcpy must post-dominate the memset, so limit this to the same basic
// block. A non-local generalization is likely not worthwhile.
if (auto *MD = dyn_cast<MemoryDef>(DestClobber))
if (auto *MDep = dyn_cast_or_null<MemSetInst>(MD->getMemoryInst()))
if (DestClobber->getBlock() == M->getParent())
if (processMemSetMemCpyDependence(M, MDep))
return true;
// The optimizations after this point require the memcpy size.
ConstantInt *CopySize = dyn_cast<ConstantInt>(M->getLength());
if (!CopySize) return false;
// The optimizations after this point require the memcpy size.
ConstantInt *CopySize = dyn_cast<ConstantInt>(M->getLength());
if (!CopySize) return false;
// There are four possible optimizations we can do for memcpy:
// a) memcpy-memcpy xform which exposes redundancy for DSE.
// b) call-memcpy xform for return slot optimization.
// c) memcpy from freshly alloca'd space or space that has just started its
// lifetime copies undefined data, and we can therefore eliminate the
// memcpy in favor of the data that was already at the destination.
// d) memcpy from a just-memset'd source can be turned into memset.
if (DepInfo.isClobber()) {
if (CallInst *C = dyn_cast<CallInst>(DepInfo.getInst())) {
// FIXME: Can we pass in either of dest/src alignment here instead
// of conservatively taking the minimum?
Align Alignment = std::min(M->getDestAlign().valueOrOne(),
M->getSourceAlign().valueOrOne());
if (performCallSlotOptzn(M, M, M->getDest(), M->getSource(),
CopySize->getZExtValue(), Alignment, C)) {
MemoryAccess *SrcClobber = MSSA->getWalker()->getClobberingMemoryAccess(
AnyClobber, MemoryLocation::getForSource(M));
// There are four possible optimizations we can do for memcpy:
// a) memcpy-memcpy xform which exposes redundancy for DSE.
// b) call-memcpy xform for return slot optimization.
// c) memcpy from freshly alloca'd space or space that has just started
// its lifetime copies undefined data, and we can therefore eliminate
// the memcpy in favor of the data that was already at the destination.
// d) memcpy from a just-memset'd source can be turned into memset.
if (auto *MD = dyn_cast<MemoryDef>(SrcClobber)) {
if (Instruction *MI = MD->getMemoryInst()) {
if (auto *C = dyn_cast<CallInst>(MI)) {
// The memcpy must post-dominate the call. Limit to the same block for now.
// Additionally, we need to ensure that there are no accesses to dest
// between the call and the memcpy. Accesses to src will be checked
// by performCallSlotOptzn().
// TODO: Support non-local call-slot optimization?
if (C->getParent() == M->getParent() &&
!accessedBetween(*AA, DestLoc, MD, MA)) {
// FIXME: Can we pass in either of dest/src alignment here instead
// of conservatively taking the minimum?
Align Alignment = std::min(M->getDestAlign().valueOrOne(),
M->getSourceAlign().valueOrOne());
if (performCallSlotOptzn(M, M, M->getDest(), M->getSource(),
CopySize->getZExtValue(), Alignment, C)) {
LLVM_DEBUG(dbgs() << "Performed call slot optimization:\n"
<< " call: " << *C << "\n"
<< " memcpy: " << *M << "\n");
eraseInstruction(M);
++NumMemCpyInstr;
return true;
}
}
}
if (auto *MDep = dyn_cast<MemCpyInst>(MI))
return processMemCpyMemCpyDependence(M, MDep);
if (auto *MDep = dyn_cast<MemSetInst>(MI)) {
if (performMemCpyToMemSetOptzn(M, MDep)) {
LLVM_DEBUG(dbgs() << "Converted memcpy to memset\n");
eraseInstruction(M);
++NumCpyToSet;
return true;
}
}
}
if (hasUndefContentsMSSA(MSSA, AA, M->getSource(), MD, CopySize)) {
LLVM_DEBUG(dbgs() << "Removed memcpy from undef\n");
eraseInstruction(M);
++NumMemCpyInstr;
return true;
}
}
}
} else {
MemDepResult DepInfo = MD->getDependency(M);
MemoryLocation SrcLoc = MemoryLocation::getForSource(M);
MemDepResult SrcDepInfo = MD->getPointerDependencyFrom(
SrcLoc, true, M->getIterator(), M->getParent());
// Try to turn a partially redundant memset + memcpy into
// memcpy + smaller memset. We don't need the memcpy size for this.
if (DepInfo.isClobber())
if (MemSetInst *MDep = dyn_cast<MemSetInst>(DepInfo.getInst()))
if (processMemSetMemCpyDependence(M, MDep))
return true;
if (SrcDepInfo.isClobber()) {
if (MemCpyInst *MDep = dyn_cast<MemCpyInst>(SrcDepInfo.getInst()))
return processMemCpyMemCpyDependence(M, MDep);
} else if (SrcDepInfo.isDef()) {
if (hasUndefContents(SrcDepInfo.getInst(), CopySize)) {
eraseInstruction(M);
++NumMemCpyInstr;
return true;
// The optimizations after this point require the memcpy size.
ConstantInt *CopySize = dyn_cast<ConstantInt>(M->getLength());
if (!CopySize) return false;
// There are four possible optimizations we can do for memcpy:
// a) memcpy-memcpy xform which exposes redundancy for DSE.
// b) call-memcpy xform for return slot optimization.
// c) memcpy from freshly alloca'd space or space that has just started
// its lifetime copies undefined data, and we can therefore eliminate
// the memcpy in favor of the data that was already at the destination.
// d) memcpy from a just-memset'd source can be turned into memset.
if (DepInfo.isClobber()) {
if (CallInst *C = dyn_cast<CallInst>(DepInfo.getInst())) {
// FIXME: Can we pass in either of dest/src alignment here instead
// of conservatively taking the minimum?
Align Alignment = std::min(M->getDestAlign().valueOrOne(),
M->getSourceAlign().valueOrOne());
if (performCallSlotOptzn(M, M, M->getDest(), M->getSource(),
CopySize->getZExtValue(), Alignment, C)) {
eraseInstruction(M);
++NumMemCpyInstr;
return true;
}
}
}
}
if (SrcDepInfo.isClobber())
if (MemSetInst *MDep = dyn_cast<MemSetInst>(SrcDepInfo.getInst()))
if (performMemCpyToMemSetOptzn(M, MDep)) {
MemoryLocation SrcLoc = MemoryLocation::getForSource(M);
MemDepResult SrcDepInfo = MD->getPointerDependencyFrom(
SrcLoc, true, M->getIterator(), M->getParent());
if (SrcDepInfo.isClobber()) {
if (MemCpyInst *MDep = dyn_cast<MemCpyInst>(SrcDepInfo.getInst()))
return processMemCpyMemCpyDependence(M, MDep);
} else if (SrcDepInfo.isDef()) {
if (hasUndefContents(SrcDepInfo.getInst(), CopySize)) {
eraseInstruction(M);
++NumCpyToSet;
++NumMemCpyInstr;
return true;
}
}
if (SrcDepInfo.isClobber())
if (MemSetInst *MDep = dyn_cast<MemSetInst>(SrcDepInfo.getInst()))
if (performMemCpyToMemSetOptzn(M, MDep)) {
eraseInstruction(M);
++NumCpyToSet;
return true;
}
}
return false;
}
@@ -1354,7 +1526,8 @@ bool MemCpyOptPass::processMemMove(MemMoveInst *M) {
// MemDep may have overly conservative information about this instruction;
// just conservatively flush it from the cache.
MD->removeInstruction(M);
if (MD)
MD->removeInstruction(M);
++NumMoveToCpy;
return true;
@@ -1367,16 +1540,25 @@ bool MemCpyOptPass::processByValArgument(CallBase &CB, unsigned ArgNo) {
Value *ByValArg = CB.getArgOperand(ArgNo);
Type *ByValTy = cast<PointerType>(ByValArg->getType())->getElementType();
uint64_t ByValSize = DL.getTypeAllocSize(ByValTy);
MemDepResult DepInfo = MD->getPointerDependencyFrom(
MemoryLocation(ByValArg, LocationSize::precise(ByValSize)), true,
CB.getIterator(), CB.getParent());
if (!DepInfo.isClobber())
return false;
MemoryLocation Loc(ByValArg, LocationSize::precise(ByValSize));
MemCpyInst *MDep = nullptr;
if (EnableMemorySSA) {
MemoryUseOrDef *CallAccess = MSSA->getMemoryAccess(&CB);
MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess(
CallAccess->getDefiningAccess(), Loc);
if (auto *MD = dyn_cast<MemoryDef>(Clobber))
MDep = dyn_cast_or_null<MemCpyInst>(MD->getMemoryInst());
} else {
MemDepResult DepInfo = MD->getPointerDependencyFrom(
Loc, true, CB.getIterator(), CB.getParent());
if (!DepInfo.isClobber())
return false;
MDep = dyn_cast<MemCpyInst>(DepInfo.getInst());
}
// If the byval argument isn't fed by a memcpy, ignore it. If it is fed by
// a memcpy, see if we can byval from the source of the memcpy instead of the
// result.
MemCpyInst *MDep = dyn_cast<MemCpyInst>(DepInfo.getInst());
if (!MDep || MDep->isVolatile() ||
ByValArg->stripPointerCasts() != MDep->getDest())
return false;
@@ -1410,14 +1592,19 @@ bool MemCpyOptPass::processByValArgument(CallBase &CB, unsigned ArgNo) {
// *b = 42;
// foo(*a)
// It would be invalid to transform the second memcpy into foo(*b).
//
// NOTE: This is conservative: it will stop on any read from the source loc,
// not just the defining memcpy.
MemDepResult SourceDep = MD->getPointerDependencyFrom(
MemoryLocation::getForSource(MDep), false,
CB.getIterator(), MDep->getParent());
if (!SourceDep.isClobber() || SourceDep.getInst() != MDep)
return false;
if (EnableMemorySSA) {
if (writtenBetween(MSSA, MemoryLocation::getForSource(MDep),
MSSA->getMemoryAccess(MDep), MSSA->getMemoryAccess(&CB)))
return false;
} else {
// NOTE: This is conservative: it will stop on any read from the source loc,
// not just the defining memcpy.
MemDepResult SourceDep = MD->getPointerDependencyFrom(
MemoryLocation::getForSource(MDep), false,
CB.getIterator(), MDep->getParent());
if (!SourceDep.isClobber() || SourceDep.getInst() != MDep)
return false;
}
Value *TmpCast = MDep->getSource();
if (MDep->getSource()->getType() != ByValArg->getType()) {
@@ -1484,7 +1671,8 @@ bool MemCpyOptPass::iterateOnFunction(Function &F) {
}
PreservedAnalyses MemCpyOptPass::run(Function &F, FunctionAnalysisManager &AM) {
auto &MD = AM.getResult<MemoryDependenceAnalysis>(F);
auto *MD = !EnableMemorySSA ? &AM.getResult<MemoryDependenceAnalysis>(F)
: AM.getCachedResult<MemoryDependenceAnalysis>(F);
auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
auto *AA = &AM.getResult<AAManager>(F);
auto *AC = &AM.getResult<AssumptionAnalysis>(F);
@@ -1493,14 +1681,15 @@ PreservedAnalyses MemCpyOptPass::run(Function &F, FunctionAnalysisManager &AM) {
: AM.getCachedResult<MemorySSAAnalysis>(F);
bool MadeChange =
runImpl(F, &MD, &TLI, AA, AC, DT, MSSA ? &MSSA->getMSSA() : nullptr);
runImpl(F, MD, &TLI, AA, AC, DT, MSSA ? &MSSA->getMSSA() : nullptr);
if (!MadeChange)
return PreservedAnalyses::all();
PreservedAnalyses PA;
PA.preserveSet<CFGAnalyses>();
PA.preserve<GlobalsAA>();
PA.preserve<MemoryDependenceAnalysis>();
if (MD)
PA.preserve<MemoryDependenceAnalysis>();
if (MSSA)
PA.preserve<MemorySSAAnalysis>();
return PA;
@@ -1516,6 +1705,7 @@ bool MemCpyOptPass::runImpl(Function &F, MemoryDependenceResults *MD_,
AA = AA_;
AC = AC_;
DT = DT_;
MSSA = MSSA_;
MemorySSAUpdater MSSAU_(MSSA_);
MSSAU = MSSA_ ? &MSSAU_ : nullptr;
// If we don't have at least memset and memcpy, there is little point of doing
@@ -1542,7 +1732,9 @@ bool MemCpyOptLegacyPass::runOnFunction(Function &F) {
if (skipFunction(F))
return false;
auto *MD = &getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
auto *MDWP = !EnableMemorySSA
? &getAnalysis<MemoryDependenceWrapperPass>()
: getAnalysisIfAvailable<MemoryDependenceWrapperPass>();
auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
@@ -1551,6 +1743,6 @@ bool MemCpyOptLegacyPass::runOnFunction(Function &F) {
? &getAnalysis<MemorySSAWrapperPass>()
: getAnalysisIfAvailable<MemorySSAWrapperPass>();
return Impl.runImpl(F, MD, TLI, AA, AC, DT,
return Impl.runImpl(F, MDWP ? &MDWP->getMemDep() : nullptr, TLI, AA, AC, DT,
MSSAWP ? &MSSAWP->getMSSA() : nullptr);
}


@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -memcpyopt < %s -enable-memcpyopt-memoryssa=0 | FileCheck %s
; RUN: opt -S -memcpyopt < %s -enable-memcpyopt-memoryssa=1 -verify-memoryssa | FileCheck %s
; RUN: opt -S -memcpyopt < %s -enable-memcpyopt-memoryssa=0 | FileCheck %s --check-prefixes=CHECK,NO_MSSA
; RUN: opt -S -memcpyopt < %s -enable-memcpyopt-memoryssa=1 -verify-memoryssa | FileCheck %s --check-prefixes=CHECK,MSSA
define i8 @read_dest_between_call_and_memcpy() {
; CHECK-LABEL: @read_dest_between_call_and_memcpy(
@@ -26,15 +26,25 @@ define i8 @read_dest_between_call_and_memcpy() {
}
define i8 @read_src_between_call_and_memcpy() {
; CHECK-LABEL: @read_src_between_call_and_memcpy(
; CHECK-NEXT: [[DEST:%.*]] = alloca [16 x i8], align 1
; CHECK-NEXT: [[SRC:%.*]] = alloca [16 x i8], align 1
; CHECK-NEXT: [[DEST_I8:%.*]] = bitcast [16 x i8]* [[DEST]] to i8*
; CHECK-NEXT: [[SRC_I8:%.*]] = bitcast [16 x i8]* [[SRC]] to i8*
; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* [[SRC_I8]], i8 0, i64 16, i1 false)
; CHECK-NEXT: [[X:%.*]] = load i8, i8* [[SRC_I8]], align 1
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[DEST_I8]], i8* [[SRC_I8]], i64 16, i1 false)
; CHECK-NEXT: ret i8 [[X]]
; NO_MSSA-LABEL: @read_src_between_call_and_memcpy(
; NO_MSSA-NEXT: [[DEST:%.*]] = alloca [16 x i8], align 1
; NO_MSSA-NEXT: [[SRC:%.*]] = alloca [16 x i8], align 1
; NO_MSSA-NEXT: [[DEST_I8:%.*]] = bitcast [16 x i8]* [[DEST]] to i8*
; NO_MSSA-NEXT: [[SRC_I8:%.*]] = bitcast [16 x i8]* [[SRC]] to i8*
; NO_MSSA-NEXT: call void @llvm.memset.p0i8.i64(i8* [[SRC_I8]], i8 0, i64 16, i1 false)
; NO_MSSA-NEXT: [[X:%.*]] = load i8, i8* [[SRC_I8]], align 1
; NO_MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[DEST_I8]], i8* [[SRC_I8]], i64 16, i1 false)
; NO_MSSA-NEXT: ret i8 [[X]]
;
; MSSA-LABEL: @read_src_between_call_and_memcpy(
; MSSA-NEXT: [[DEST:%.*]] = alloca [16 x i8], align 1
; MSSA-NEXT: [[SRC:%.*]] = alloca [16 x i8], align 1
; MSSA-NEXT: [[DEST_I8:%.*]] = bitcast [16 x i8]* [[DEST]] to i8*
; MSSA-NEXT: [[SRC_I8:%.*]] = bitcast [16 x i8]* [[SRC]] to i8*
; MSSA-NEXT: call void @llvm.memset.p0i8.i64(i8* [[SRC_I8]], i8 0, i64 16, i1 false)
; MSSA-NEXT: [[X:%.*]] = load i8, i8* [[SRC_I8]], align 1
; MSSA-NEXT: call void @llvm.memset.p0i8.i64(i8* [[DEST_I8]], i8 0, i64 16, i1 false)
; MSSA-NEXT: ret i8 [[X]]
;
%dest = alloca [16 x i8]
%src = alloca [16 x i8]


@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; MemCpy optimizations should take place even in the presence of invariant.start
; RUN: opt < %s -basic-aa -memcpyopt -S -enable-memcpyopt-memoryssa=0 | FileCheck %s
; RUN: opt < %s -basic-aa -memcpyopt -S -enable-memcpyopt-memoryssa=1 -verify-memoryssa | FileCheck %s
; RUN: opt < %s -basic-aa -memcpyopt -S -enable-memcpyopt-memoryssa=0 | FileCheck %s --check-prefixes=CHECK,NO_MSSA
; RUN: opt < %s -basic-aa -memcpyopt -S -enable-memcpyopt-memoryssa=1 -verify-memoryssa | FileCheck %s --check-prefixes=CHECK,MSSA
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
@@ -18,13 +18,21 @@ declare {}* @llvm.invariant.start.p0i8(i64, i8* nocapture) nounwind readonly
; The intermediate alloca and one of the memcpys should be eliminated; the
; other should be transformed to a memmove.
define void @test1(i8* %P, i8* %Q) nounwind {
; CHECK-LABEL: @test1(
; CHECK-NEXT: [[MEMTMP:%.*]] = alloca [[TMP0:%.*]], align 16
; CHECK-NEXT: [[R:%.*]] = bitcast %0* [[MEMTMP]] to i8*
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[R]], i8* align 16 [[P:%.*]], i32 32, i1 false)
; CHECK-NEXT: [[I:%.*]] = call {}* @llvm.invariant.start.p0i8(i64 32, i8* [[P]])
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[Q:%.*]], i8* align 16 [[R]], i32 32, i1 false)
; CHECK-NEXT: ret void
; NO_MSSA-LABEL: @test1(
; NO_MSSA-NEXT: [[MEMTMP:%.*]] = alloca [[TMP0:%.*]], align 16
; NO_MSSA-NEXT: [[R:%.*]] = bitcast %0* [[MEMTMP]] to i8*
; NO_MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[R]], i8* align 16 [[P:%.*]], i32 32, i1 false)
; NO_MSSA-NEXT: [[I:%.*]] = call {}* @llvm.invariant.start.p0i8(i64 32, i8* [[P]])
; NO_MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[Q:%.*]], i8* align 16 [[R]], i32 32, i1 false)
; NO_MSSA-NEXT: ret void
;
; MSSA-LABEL: @test1(
; MSSA-NEXT: [[MEMTMP:%.*]] = alloca [[TMP0:%.*]], align 16
; MSSA-NEXT: [[R:%.*]] = bitcast %0* [[MEMTMP]] to i8*
; MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[R]], i8* align 16 [[P:%.*]], i32 32, i1 false)
; MSSA-NEXT: [[I:%.*]] = call {}* @llvm.invariant.start.p0i8(i64 32, i8* [[P]])
; MSSA-NEXT: call void @llvm.memmove.p0i8.p0i8.i32(i8* align 16 [[Q:%.*]], i8* align 16 [[P]], i32 32, i1 false)
; MSSA-NEXT: ret void
;
%memtmp = alloca %0, align 16
%R = bitcast %0* %memtmp to i8*


@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -memcpyopt -S -enable-memcpyopt-memoryssa=0 | FileCheck %s
; RUN: opt < %s -memcpyopt -S -enable-memcpyopt-memoryssa=1 -verify-memoryssa | FileCheck %s
; RUN: opt < %s -memcpyopt -S -enable-memcpyopt-memoryssa=0 | FileCheck %s --check-prefixes=CHECK,NO_MSSA
; RUN: opt < %s -memcpyopt -S -enable-memcpyopt-memoryssa=1 -verify-memoryssa | FileCheck %s --check-prefixes=CHECK,MSSA
; Test memcpy-memcpy dependencies across invoke edges.
@@ -8,19 +8,33 @@
; TODO: Not supported yet.
define hidden void @test_normal(i8* noalias %dst, i8* %src) personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
; CHECK-LABEL: @test_normal(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TEMP:%.*]] = alloca i8, i32 64, align 1
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TEMP]], i8* nonnull align 8 [[SRC:%.*]], i64 64, i1 false)
; CHECK-NEXT: invoke void @invoke_me()
; CHECK-NEXT: to label [[TRY_CONT:%.*]] unwind label [[LPAD:%.*]]
; CHECK: lpad:
; CHECK-NEXT: [[TMP0:%.*]] = landingpad { i8*, i32 }
; CHECK-NEXT: catch i8* null
; CHECK-NEXT: ret void
; CHECK: try.cont:
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[DST:%.*]], i8* align 8 [[TEMP]], i64 64, i1 false)
; CHECK-NEXT: ret void
; NO_MSSA-LABEL: @test_normal(
; NO_MSSA-NEXT: entry:
; NO_MSSA-NEXT: [[TEMP:%.*]] = alloca i8, i32 64, align 1
; NO_MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TEMP]], i8* nonnull align 8 [[SRC:%.*]], i64 64, i1 false)
; NO_MSSA-NEXT: invoke void @invoke_me()
; NO_MSSA-NEXT: to label [[TRY_CONT:%.*]] unwind label [[LPAD:%.*]]
; NO_MSSA: lpad:
; NO_MSSA-NEXT: [[TMP0:%.*]] = landingpad { i8*, i32 }
; NO_MSSA-NEXT: catch i8* null
; NO_MSSA-NEXT: ret void
; NO_MSSA: try.cont:
; NO_MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[DST:%.*]], i8* align 8 [[TEMP]], i64 64, i1 false)
; NO_MSSA-NEXT: ret void
;
; MSSA-LABEL: @test_normal(
; MSSA-NEXT: entry:
; MSSA-NEXT: [[TEMP:%.*]] = alloca i8, i32 64, align 1
; MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TEMP]], i8* nonnull align 8 [[SRC:%.*]], i64 64, i1 false)
; MSSA-NEXT: invoke void @invoke_me()
; MSSA-NEXT: to label [[TRY_CONT:%.*]] unwind label [[LPAD:%.*]]
; MSSA: lpad:
; MSSA-NEXT: [[TMP0:%.*]] = landingpad { i8*, i32 }
; MSSA-NEXT: catch i8* null
; MSSA-NEXT: ret void
; MSSA: try.cont:
; MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[DST:%.*]], i8* align 8 [[SRC]], i64 64, i1 false)
; MSSA-NEXT: ret void
;
entry:
%temp = alloca i8, i32 64
@@ -42,19 +56,33 @@ try.cont:
; TODO: Not supported yet.
define hidden void @test_unwind(i8* noalias %dst, i8* %src) personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
; CHECK-LABEL: @test_unwind(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TEMP:%.*]] = alloca i8, i32 64, align 1
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TEMP]], i8* nonnull align 8 [[SRC:%.*]], i64 64, i1 false)
; CHECK-NEXT: invoke void @invoke_me()
; CHECK-NEXT: to label [[TRY_CONT:%.*]] unwind label [[LPAD:%.*]]
; CHECK: lpad:
; CHECK-NEXT: [[TMP0:%.*]] = landingpad { i8*, i32 }
; CHECK-NEXT: catch i8* null
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[DST:%.*]], i8* align 8 [[TEMP]], i64 64, i1 false)
; CHECK-NEXT: ret void
; CHECK: try.cont:
; CHECK-NEXT: ret void
; NO_MSSA-LABEL: @test_unwind(
; NO_MSSA-NEXT: entry:
; NO_MSSA-NEXT: [[TEMP:%.*]] = alloca i8, i32 64, align 1
; NO_MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TEMP]], i8* nonnull align 8 [[SRC:%.*]], i64 64, i1 false)
; NO_MSSA-NEXT: invoke void @invoke_me()
; NO_MSSA-NEXT: to label [[TRY_CONT:%.*]] unwind label [[LPAD:%.*]]
; NO_MSSA: lpad:
; NO_MSSA-NEXT: [[TMP0:%.*]] = landingpad { i8*, i32 }
; NO_MSSA-NEXT: catch i8* null
; NO_MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[DST:%.*]], i8* align 8 [[TEMP]], i64 64, i1 false)
; NO_MSSA-NEXT: ret void
; NO_MSSA: try.cont:
; NO_MSSA-NEXT: ret void
;
; MSSA-LABEL: @test_unwind(
; MSSA-NEXT: entry:
; MSSA-NEXT: [[TEMP:%.*]] = alloca i8, i32 64, align 1
; MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TEMP]], i8* nonnull align 8 [[SRC:%.*]], i64 64, i1 false)
; MSSA-NEXT: invoke void @invoke_me()
; MSSA-NEXT: to label [[TRY_CONT:%.*]] unwind label [[LPAD:%.*]]
; MSSA: lpad:
; MSSA-NEXT: [[TMP0:%.*]] = landingpad { i8*, i32 }
; MSSA-NEXT: catch i8* null
; MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[DST:%.*]], i8* align 8 [[SRC]], i64 64, i1 false)
; MSSA-NEXT: ret void
; MSSA: try.cont:
; MSSA-NEXT: ret void
;
entry:
%temp = alloca i8, i32 64


@@ -141,13 +141,21 @@ define void @test4_write_between(i8 *%P) {
}
define i8 @test4_read_between(i8 *%P) {
; CHECK-LABEL: @test4_read_between(
; CHECK-NEXT: [[A1:%.*]] = alloca [[TMP1:%.*]], align 8
; CHECK-NEXT: [[A2:%.*]] = bitcast %1* [[A1]] to i8*
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[A2]], i8* align 4 [[P:%.*]], i64 8, i1 false)
; CHECK-NEXT: [[X:%.*]] = load i8, i8* [[A2]], align 1
; CHECK-NEXT: call void @test4a(i8* byval(i8) align 1 [[A2]])
; CHECK-NEXT: ret i8 [[X]]
; NO_MSSA-LABEL: @test4_read_between(
; NO_MSSA-NEXT: [[A1:%.*]] = alloca [[TMP1:%.*]], align 8
; NO_MSSA-NEXT: [[A2:%.*]] = bitcast %1* [[A1]] to i8*
; NO_MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[A2]], i8* align 4 [[P:%.*]], i64 8, i1 false)
; NO_MSSA-NEXT: [[X:%.*]] = load i8, i8* [[A2]], align 1
; NO_MSSA-NEXT: call void @test4a(i8* byval align 1 [[A2]])
; NO_MSSA-NEXT: ret i8 [[X]]
;
; MSSA-LABEL: @test4_read_between(
; MSSA-NEXT: [[A1:%.*]] = alloca [[TMP1:%.*]], align 8
; MSSA-NEXT: [[A2:%.*]] = bitcast %1* [[A1]] to i8*
; MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[A2]], i8* align 4 [[P:%.*]], i64 8, i1 false)
; MSSA-NEXT: [[X:%.*]] = load i8, i8* [[A2]], align 1
; MSSA-NEXT: call void @test4a(i8* byval align 1 [[P]])
; MSSA-NEXT: ret i8 [[X]]
;
%a1 = alloca %1
%a2 = bitcast %1* %a1 to i8*
@@ -158,16 +166,27 @@ define i8 @test4_read_between(i8 *%P) {
}
define void @test4_non_local(i8 *%P, i1 %c) {
; CHECK-LABEL: @test4_non_local(
; CHECK-NEXT: [[A1:%.*]] = alloca [[TMP1:%.*]], align 8
; CHECK-NEXT: [[A2:%.*]] = bitcast %1* [[A1]] to i8*
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[A2]], i8* align 4 [[P:%.*]], i64 8, i1 false)
; CHECK-NEXT: br i1 [[C:%.*]], label [[CALL:%.*]], label [[EXIT:%.*]]
; CHECK: call:
; CHECK-NEXT: call void @test4a(i8* byval(i8) align 1 [[A2]])
; CHECK-NEXT: br label [[EXIT]]
; CHECK: exit:
; CHECK-NEXT: ret void
; NO_MSSA-LABEL: @test4_non_local(
; NO_MSSA-NEXT: [[A1:%.*]] = alloca [[TMP1:%.*]], align 8
; NO_MSSA-NEXT: [[A2:%.*]] = bitcast %1* [[A1]] to i8*
; NO_MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[A2]], i8* align 4 [[P:%.*]], i64 8, i1 false)
; NO_MSSA-NEXT: br i1 [[C:%.*]], label [[CALL:%.*]], label [[EXIT:%.*]]
; NO_MSSA: call:
; NO_MSSA-NEXT: call void @test4a(i8* byval align 1 [[A2]])
; NO_MSSA-NEXT: br label [[EXIT]]
; NO_MSSA: exit:
; NO_MSSA-NEXT: ret void
;
; MSSA-LABEL: @test4_non_local(
; MSSA-NEXT: [[A1:%.*]] = alloca [[TMP1:%.*]], align 8
; MSSA-NEXT: [[A2:%.*]] = bitcast %1* [[A1]] to i8*
; MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[A2]], i8* align 4 [[P:%.*]], i64 8, i1 false)
; MSSA-NEXT: br i1 [[C:%.*]], label [[CALL:%.*]], label [[EXIT:%.*]]
; MSSA: call:
; MSSA-NEXT: call void @test4a(i8* byval align 1 [[P]])
; MSSA-NEXT: br label [[EXIT]]
; MSSA: exit:
; MSSA-NEXT: ret void
;
%a1 = alloca %1
%a2 = bitcast %1* %a1 to i8*


@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -memcpyopt -S -enable-memcpyopt-memoryssa=0 | FileCheck %s
; RUN: opt < %s -memcpyopt -S -enable-memcpyopt-memoryssa=1 -verify-memoryssa | FileCheck %s
; RUN: opt < %s -memcpyopt -S -enable-memcpyopt-memoryssa=0 | FileCheck %s --check-prefix=NO_MSSA
; RUN: opt < %s -memcpyopt -S -enable-memcpyopt-memoryssa=1 -verify-memoryssa | FileCheck %s --check-prefix=MSSA
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -8,19 +8,33 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
; which will be deleted.
define void @foo(i1 %c, i8* %d, i8* %e, i8* %f) {
; CHECK-LABEL: @foo(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP:%.*]] = alloca [50 x i8], align 8
; CHECK-NEXT: [[TMP4:%.*]] = bitcast [50 x i8]* [[TMP]] to i8*
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i64 1
; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* nonnull [[D:%.*]], i8 0, i64 10, i1 false)
; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 [[TMP4]], i8 0, i64 11, i1 false)
; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[EXIT:%.*]]
; CHECK: if.then:
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[F:%.*]], i8* nonnull align 8 [[TMP4]], i64 30, i1 false)
; CHECK-NEXT: br label [[EXIT]]
; CHECK: exit:
; CHECK-NEXT: ret void
; NO_MSSA-LABEL: @foo(
; NO_MSSA-NEXT: entry:
; NO_MSSA-NEXT: [[TMP:%.*]] = alloca [50 x i8], align 8
; NO_MSSA-NEXT: [[TMP4:%.*]] = bitcast [50 x i8]* [[TMP]] to i8*
; NO_MSSA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i64 1
; NO_MSSA-NEXT: call void @llvm.memset.p0i8.i64(i8* nonnull [[D:%.*]], i8 0, i64 10, i1 false)
; NO_MSSA-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 [[TMP4]], i8 0, i64 11, i1 false)
; NO_MSSA-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[EXIT:%.*]]
; NO_MSSA: if.then:
; NO_MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[F:%.*]], i8* nonnull align 8 [[TMP4]], i64 30, i1 false)
; NO_MSSA-NEXT: br label [[EXIT]]
; NO_MSSA: exit:
; NO_MSSA-NEXT: ret void
;
; MSSA-LABEL: @foo(
; MSSA-NEXT: entry:
; MSSA-NEXT: [[TMP:%.*]] = alloca [50 x i8], align 8
; MSSA-NEXT: [[TMP4:%.*]] = bitcast [50 x i8]* [[TMP]] to i8*
; MSSA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i64 1
; MSSA-NEXT: call void @llvm.memset.p0i8.i64(i8* nonnull [[D:%.*]], i8 0, i64 10, i1 false)
; MSSA-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 [[TMP4]], i8 0, i64 11, i1 false)
; MSSA-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[EXIT:%.*]]
; MSSA: if.then:
; MSSA-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 [[F:%.*]], i8 0, i64 11, i1 false)
; MSSA-NEXT: br label [[EXIT]]
; MSSA: exit:
; MSSA-NEXT: ret void
;
entry:
%tmp = alloca [50 x i8], align 8


@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -memcpyopt -S -enable-memcpyopt-memoryssa=0 | FileCheck %s
; RUN: opt < %s -memcpyopt -S -enable-memcpyopt-memoryssa=1 -verify-memoryssa | FileCheck %s
; RUN: opt < %s -memcpyopt -S -enable-memcpyopt-memoryssa=0 | FileCheck %s --check-prefix=NO_MSSA
; RUN: opt < %s -memcpyopt -S -enable-memcpyopt-memoryssa=1 -verify-memoryssa | FileCheck %s --check-prefix=MSSA
; Handle memcpy-memcpy dependencies of differing sizes correctly.
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -9,25 +9,44 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
; memcpy with a larger size from the same address.
define i32 @foo(i1 %z) {
; CHECK-LABEL: @foo(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[A:%.*]] = alloca [10 x i32], align 4
; CHECK-NEXT: [[S:%.*]] = alloca [10 x i32], align 4
; CHECK-NEXT: [[TMP0:%.*]] = bitcast [10 x i32]* [[A]] to i8*
; CHECK-NEXT: [[TMP1:%.*]] = bitcast [10 x i32]* [[S]] to i8*
; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* nonnull align 16 [[TMP1]], i8 0, i64 40, i1 false)
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[A]], i64 0, i64 0
; CHECK-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4
; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr [10 x i32], [10 x i32]* [[S]], i64 0, i64 1
; CHECK-NEXT: [[SCEVGEP7:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
; CHECK-NEXT: br i1 [[Z:%.*]], label [[FOR_BODY3_LR_PH:%.*]], label [[FOR_INC7_1:%.*]]
; CHECK: for.body3.lr.ph:
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP0]], i8* align 4 [[SCEVGEP7]], i64 17179869180, i1 false)
; CHECK-NEXT: br label [[FOR_INC7_1]]
; CHECK: for.inc7.1:
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP0]], i8* align 4 [[SCEVGEP7]], i64 4, i1 false)
; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; CHECK-NEXT: ret i32 [[TMP2]]
; NO_MSSA-LABEL: @foo(
; NO_MSSA-NEXT: entry:
; NO_MSSA-NEXT: [[A:%.*]] = alloca [10 x i32], align 4
; NO_MSSA-NEXT: [[S:%.*]] = alloca [10 x i32], align 4
; NO_MSSA-NEXT: [[TMP0:%.*]] = bitcast [10 x i32]* [[A]] to i8*
; NO_MSSA-NEXT: [[TMP1:%.*]] = bitcast [10 x i32]* [[S]] to i8*
; NO_MSSA-NEXT: call void @llvm.memset.p0i8.i64(i8* nonnull align 16 [[TMP1]], i8 0, i64 40, i1 false)
; NO_MSSA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[A]], i64 0, i64 0
; NO_MSSA-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4
; NO_MSSA-NEXT: [[SCEVGEP:%.*]] = getelementptr [10 x i32], [10 x i32]* [[S]], i64 0, i64 1
; NO_MSSA-NEXT: [[SCEVGEP7:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
; NO_MSSA-NEXT: br i1 [[Z:%.*]], label [[FOR_BODY3_LR_PH:%.*]], label [[FOR_INC7_1:%.*]]
; NO_MSSA: for.body3.lr.ph:
; NO_MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP0]], i8* align 4 [[SCEVGEP7]], i64 17179869180, i1 false)
; NO_MSSA-NEXT: br label [[FOR_INC7_1]]
; NO_MSSA: for.inc7.1:
; NO_MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP0]], i8* align 4 [[SCEVGEP7]], i64 4, i1 false)
; NO_MSSA-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; NO_MSSA-NEXT: ret i32 [[TMP2]]
;
; MSSA-LABEL: @foo(
; MSSA-NEXT: entry:
; MSSA-NEXT: [[A:%.*]] = alloca [10 x i32], align 4
; MSSA-NEXT: [[S:%.*]] = alloca [10 x i32], align 4
; MSSA-NEXT: [[TMP0:%.*]] = bitcast [10 x i32]* [[A]] to i8*
; MSSA-NEXT: [[TMP1:%.*]] = bitcast [10 x i32]* [[S]] to i8*
; MSSA-NEXT: call void @llvm.memset.p0i8.i64(i8* nonnull align 16 [[TMP1]], i8 0, i64 40, i1 false)
; MSSA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[A]], i64 0, i64 0
; MSSA-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4
; MSSA-NEXT: [[SCEVGEP:%.*]] = getelementptr [10 x i32], [10 x i32]* [[S]], i64 0, i64 1
; MSSA-NEXT: [[SCEVGEP7:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
; MSSA-NEXT: br i1 [[Z:%.*]], label [[FOR_BODY3_LR_PH:%.*]], label [[FOR_INC7_1:%.*]]
; MSSA: for.body3.lr.ph:
; MSSA-NEXT: br label [[FOR_INC7_1]]
; MSSA: for.inc7.1:
; MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP0]], i8* align 4 [[SCEVGEP7]], i64 4, i1 false)
; MSSA-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; MSSA-NEXT: ret i32 [[TMP2]]
;
entry:
%a = alloca [10 x i32]


@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -memcpyopt -S -enable-memcpyopt-memoryssa=0 | FileCheck %s
; RUN: opt < %s -memcpyopt -S -enable-memcpyopt-memoryssa=1 -verify-memoryssa | FileCheck %s
; RUN: opt < %s -memcpyopt -S -enable-memcpyopt-memoryssa=0 | FileCheck %s --check-prefix=NO_MSSA
; RUN: opt < %s -memcpyopt -S -enable-memcpyopt-memoryssa=1 -verify-memoryssa | FileCheck %s --check-prefix=MSSA
; Test whether memcpy-memcpy dependence is optimized across
; basic blocks (conditional branches and invokes).
@@ -22,17 +22,29 @@ declare i8* @__cxa_begin_catch(i8*)
; to copy directly from the original source rather than from the temporary.
define void @wobble(i8* noalias %dst, i8* %src, i1 %some_condition) {
; CHECK-LABEL: @wobble(
; CHECK-NEXT: bb:
; CHECK-NEXT: [[TEMP:%.*]] = alloca i8, i32 64, align 1
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TEMP]], i8* nonnull align 8 [[SRC:%.*]], i64 64, i1 false)
; CHECK-NEXT: br i1 [[SOME_CONDITION:%.*]], label [[MORE:%.*]], label [[OUT:%.*]]
; CHECK: out:
; CHECK-NEXT: call void @qux()
; CHECK-NEXT: unreachable
; CHECK: more:
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[DST:%.*]], i8* align 8 [[TEMP]], i64 64, i1 false)
; CHECK-NEXT: ret void
; NO_MSSA-LABEL: @wobble(
; NO_MSSA-NEXT: bb:
; NO_MSSA-NEXT: [[TEMP:%.*]] = alloca i8, i32 64, align 1
; NO_MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TEMP]], i8* nonnull align 8 [[SRC:%.*]], i64 64, i1 false)
; NO_MSSA-NEXT: br i1 [[SOME_CONDITION:%.*]], label [[MORE:%.*]], label [[OUT:%.*]]
; NO_MSSA: out:
; NO_MSSA-NEXT: call void @qux()
; NO_MSSA-NEXT: unreachable
; NO_MSSA: more:
; NO_MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[DST:%.*]], i8* align 8 [[TEMP]], i64 64, i1 false)
; NO_MSSA-NEXT: ret void
;
; MSSA-LABEL: @wobble(
; MSSA-NEXT: bb:
; MSSA-NEXT: [[TEMP:%.*]] = alloca i8, i32 64, align 1
; MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TEMP]], i8* nonnull align 8 [[SRC:%.*]], i64 64, i1 false)
; MSSA-NEXT: br i1 [[SOME_CONDITION:%.*]], label [[MORE:%.*]], label [[OUT:%.*]]
; MSSA: out:
; MSSA-NEXT: call void @qux()
; MSSA-NEXT: unreachable
; MSSA: more:
; MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[DST:%.*]], i8* align 8 [[SRC]], i64 64, i1 false)
; MSSA-NEXT: ret void
;
bb:
%temp = alloca i8, i32 64
@@ -53,25 +65,45 @@ more:
; source rather than from the temporary.
define i32 @foo(i1 %t3) {
; CHECK-LABEL: @foo(
; CHECK-NEXT: bb:
; CHECK-NEXT: [[S:%.*]] = alloca [[STRUCT_S:%.*]], align 4
; CHECK-NEXT: [[T:%.*]] = alloca [[STRUCT_S]], align 4
; CHECK-NEXT: [[S1:%.*]] = bitcast %struct.s* [[S]] to i8*
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[S1]], i8* align 4 bitcast (%struct.s* @s_foo to i8*), i64 8, i1 false)
; CHECK-NEXT: br i1 [[T3:%.*]], label [[BB4:%.*]], label [[BB7:%.*]]
; CHECK: bb4:
; CHECK-NEXT: [[T5:%.*]] = bitcast %struct.s* [[T]] to i8*
; CHECK-NEXT: [[S6:%.*]] = bitcast %struct.s* [[S]] to i8*
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[T5]], i8* align 4 [[S6]], i64 8, i1 false)
; CHECK-NEXT: br label [[BB7]]
; CHECK: bb7:
; CHECK-NEXT: [[T8:%.*]] = getelementptr [[STRUCT_S]], %struct.s* [[T]], i32 0, i32 0
; CHECK-NEXT: [[T9:%.*]] = load i32, i32* [[T8]], align 4
; CHECK-NEXT: [[T10:%.*]] = getelementptr [[STRUCT_S]], %struct.s* [[T]], i32 0, i32 1
; CHECK-NEXT: [[T11:%.*]] = load i32, i32* [[T10]], align 4
; CHECK-NEXT: [[T12:%.*]] = add i32 [[T9]], [[T11]]
; CHECK-NEXT: ret i32 [[T12]]
; NO_MSSA-LABEL: @foo(
; NO_MSSA-NEXT: bb:
; NO_MSSA-NEXT: [[S:%.*]] = alloca [[STRUCT_S:%.*]], align 4
; NO_MSSA-NEXT: [[T:%.*]] = alloca [[STRUCT_S]], align 4
; NO_MSSA-NEXT: [[S1:%.*]] = bitcast %struct.s* [[S]] to i8*
; NO_MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[S1]], i8* align 4 bitcast (%struct.s* @s_foo to i8*), i64 8, i1 false)
; NO_MSSA-NEXT: br i1 [[T3:%.*]], label [[BB4:%.*]], label [[BB7:%.*]]
; NO_MSSA: bb4:
; NO_MSSA-NEXT: [[T5:%.*]] = bitcast %struct.s* [[T]] to i8*
; NO_MSSA-NEXT: [[S6:%.*]] = bitcast %struct.s* [[S]] to i8*
; NO_MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[T5]], i8* align 4 [[S6]], i64 8, i1 false)
; NO_MSSA-NEXT: br label [[BB7]]
; NO_MSSA: bb7:
; NO_MSSA-NEXT: [[T8:%.*]] = getelementptr [[STRUCT_S]], %struct.s* [[T]], i32 0, i32 0
; NO_MSSA-NEXT: [[T9:%.*]] = load i32, i32* [[T8]], align 4
; NO_MSSA-NEXT: [[T10:%.*]] = getelementptr [[STRUCT_S]], %struct.s* [[T]], i32 0, i32 1
; NO_MSSA-NEXT: [[T11:%.*]] = load i32, i32* [[T10]], align 4
; NO_MSSA-NEXT: [[T12:%.*]] = add i32 [[T9]], [[T11]]
; NO_MSSA-NEXT: ret i32 [[T12]]
;
; MSSA-LABEL: @foo(
; MSSA-NEXT: bb:
; MSSA-NEXT: [[S:%.*]] = alloca [[STRUCT_S:%.*]], align 4
; MSSA-NEXT: [[T:%.*]] = alloca [[STRUCT_S]], align 4
; MSSA-NEXT: [[S1:%.*]] = bitcast %struct.s* [[S]] to i8*
; MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[S1]], i8* align 4 bitcast (%struct.s* @s_foo to i8*), i64 8, i1 false)
; MSSA-NEXT: br i1 [[T3:%.*]], label [[BB4:%.*]], label [[BB7:%.*]]
; MSSA: bb4:
; MSSA-NEXT: [[T5:%.*]] = bitcast %struct.s* [[T]] to i8*
; MSSA-NEXT: [[S6:%.*]] = bitcast %struct.s* [[S]] to i8*
; MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[T5]], i8* align 4 bitcast (%struct.s* @s_foo to i8*), i64 8, i1 false)
; MSSA-NEXT: br label [[BB7]]
; MSSA: bb7:
; MSSA-NEXT: [[T8:%.*]] = getelementptr [[STRUCT_S]], %struct.s* [[T]], i32 0, i32 0
; MSSA-NEXT: [[T9:%.*]] = load i32, i32* [[T8]], align 4
; MSSA-NEXT: [[T10:%.*]] = getelementptr [[STRUCT_S]], %struct.s* [[T]], i32 0, i32 1
; MSSA-NEXT: [[T11:%.*]] = load i32, i32* [[T10]], align 4
; MSSA-NEXT: [[T12:%.*]] = add i32 [[T9]], [[T11]]
; MSSA-NEXT: ret i32 [[T12]]
;
bb:
%s = alloca %struct.s, align 4
@@ -102,37 +134,69 @@ bb7: ; preds = %bb4, %bb
; pattern.
define i32 @baz(i1 %t5) personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
; CHECK-LABEL: @baz(
; CHECK-NEXT: bb:
; CHECK-NEXT: [[S:%.*]] = alloca [[STRUCT_S:%.*]], align 4
; CHECK-NEXT: [[T:%.*]] = alloca [[STRUCT_S]], align 4
; CHECK-NEXT: [[S3:%.*]] = bitcast %struct.s* [[S]] to i8*
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[S3]], i8* align 4 bitcast (%struct.s* @s_baz to i8*), i64 8, i1 false)
; CHECK-NEXT: br i1 [[T5:%.*]], label [[BB6:%.*]], label [[BB22:%.*]]
; CHECK: bb6:
; CHECK-NEXT: invoke void @__cxa_throw(i8* null, i8* bitcast (i8** @i to i8*), i8* null)
; CHECK-NEXT: to label [[BB25:%.*]] unwind label [[BB9:%.*]]
; CHECK: bb9:
; CHECK-NEXT: [[T10:%.*]] = landingpad { i8*, i32 }
; CHECK-NEXT: catch i8* null
; CHECK-NEXT: br label [[BB13:%.*]]
; CHECK: bb13:
; CHECK-NEXT: [[T15:%.*]] = call i8* @__cxa_begin_catch(i8* null)
; CHECK-NEXT: br label [[BB23:%.*]]
; CHECK: bb22:
; CHECK-NEXT: [[T23:%.*]] = bitcast %struct.s* [[T]] to i8*
; CHECK-NEXT: [[S24:%.*]] = bitcast %struct.s* [[S]] to i8*
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[T23]], i8* align 4 [[S24]], i64 8, i1 false)
; CHECK-NEXT: br label [[BB23]]
; CHECK: bb23:
; CHECK-NEXT: [[T17:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.s* [[T]], i32 0, i32 0
; CHECK-NEXT: [[T18:%.*]] = load i32, i32* [[T17]], align 4
; CHECK-NEXT: [[T19:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.s* [[T]], i32 0, i32 1
; CHECK-NEXT: [[T20:%.*]] = load i32, i32* [[T19]], align 4
; CHECK-NEXT: [[T21:%.*]] = add nsw i32 [[T18]], [[T20]]
; CHECK-NEXT: ret i32 [[T21]]
; CHECK: bb25:
; CHECK-NEXT: unreachable
; NO_MSSA-LABEL: @baz(
; NO_MSSA-NEXT: bb:
; NO_MSSA-NEXT: [[S:%.*]] = alloca [[STRUCT_S:%.*]], align 4
; NO_MSSA-NEXT: [[T:%.*]] = alloca [[STRUCT_S]], align 4
; NO_MSSA-NEXT: [[S3:%.*]] = bitcast %struct.s* [[S]] to i8*
; NO_MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[S3]], i8* align 4 bitcast (%struct.s* @s_baz to i8*), i64 8, i1 false)
; NO_MSSA-NEXT: br i1 [[T5:%.*]], label [[BB6:%.*]], label [[BB22:%.*]]
; NO_MSSA: bb6:
; NO_MSSA-NEXT: invoke void @__cxa_throw(i8* null, i8* bitcast (i8** @i to i8*), i8* null)
; NO_MSSA-NEXT: to label [[BB25:%.*]] unwind label [[BB9:%.*]]
; NO_MSSA: bb9:
; NO_MSSA-NEXT: [[T10:%.*]] = landingpad { i8*, i32 }
; NO_MSSA-NEXT: catch i8* null
; NO_MSSA-NEXT: br label [[BB13:%.*]]
; NO_MSSA: bb13:
; NO_MSSA-NEXT: [[T15:%.*]] = call i8* @__cxa_begin_catch(i8* null)
; NO_MSSA-NEXT: br label [[BB23:%.*]]
; NO_MSSA: bb22:
; NO_MSSA-NEXT: [[T23:%.*]] = bitcast %struct.s* [[T]] to i8*
; NO_MSSA-NEXT: [[S24:%.*]] = bitcast %struct.s* [[S]] to i8*
; NO_MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[T23]], i8* align 4 [[S24]], i64 8, i1 false)
; NO_MSSA-NEXT: br label [[BB23]]
; NO_MSSA: bb23:
; NO_MSSA-NEXT: [[T17:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.s* [[T]], i32 0, i32 0
; NO_MSSA-NEXT: [[T18:%.*]] = load i32, i32* [[T17]], align 4
; NO_MSSA-NEXT: [[T19:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.s* [[T]], i32 0, i32 1
; NO_MSSA-NEXT: [[T20:%.*]] = load i32, i32* [[T19]], align 4
; NO_MSSA-NEXT: [[T21:%.*]] = add nsw i32 [[T18]], [[T20]]
; NO_MSSA-NEXT: ret i32 [[T21]]
; NO_MSSA: bb25:
; NO_MSSA-NEXT: unreachable
;
; MSSA-LABEL: @baz(
; MSSA-NEXT: bb:
; MSSA-NEXT: [[S:%.*]] = alloca [[STRUCT_S:%.*]], align 4
; MSSA-NEXT: [[T:%.*]] = alloca [[STRUCT_S]], align 4
; MSSA-NEXT: [[S3:%.*]] = bitcast %struct.s* [[S]] to i8*
; MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[S3]], i8* align 4 bitcast (%struct.s* @s_baz to i8*), i64 8, i1 false)
; MSSA-NEXT: br i1 [[T5:%.*]], label [[BB6:%.*]], label [[BB22:%.*]]
; MSSA: bb6:
; MSSA-NEXT: invoke void @__cxa_throw(i8* null, i8* bitcast (i8** @i to i8*), i8* null)
; MSSA-NEXT: to label [[BB25:%.*]] unwind label [[BB9:%.*]]
; MSSA: bb9:
; MSSA-NEXT: [[T10:%.*]] = landingpad { i8*, i32 }
; MSSA-NEXT: catch i8* null
; MSSA-NEXT: br label [[BB13:%.*]]
; MSSA: bb13:
; MSSA-NEXT: [[T15:%.*]] = call i8* @__cxa_begin_catch(i8* null)
; MSSA-NEXT: br label [[BB23:%.*]]
; MSSA: bb22:
; MSSA-NEXT: [[T23:%.*]] = bitcast %struct.s* [[T]] to i8*
; MSSA-NEXT: [[S24:%.*]] = bitcast %struct.s* [[S]] to i8*
; MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[T23]], i8* align 4 bitcast (%struct.s* @s_baz to i8*), i64 8, i1 false)
; MSSA-NEXT: br label [[BB23]]
; MSSA: bb23:
; MSSA-NEXT: [[T17:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.s* [[T]], i32 0, i32 0
; MSSA-NEXT: [[T18:%.*]] = load i32, i32* [[T17]], align 4
; MSSA-NEXT: [[T19:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.s* [[T]], i32 0, i32 1
; MSSA-NEXT: [[T20:%.*]] = load i32, i32* [[T19]], align 4
; MSSA-NEXT: [[T21:%.*]] = add nsw i32 [[T18]], [[T20]]
; MSSA-NEXT: ret i32 [[T21]]
; MSSA: bb25:
; MSSA-NEXT: unreachable
;
bb:
%s = alloca %struct.s, align 4


@@ -16,19 +16,33 @@ target triple = "i686-unknown-windows-msvc19.14.26433"
; a call to @external.
define i32 @test_norestore(i32 %n) {
; CHECK-LABEL: @test_norestore(
; CHECK-NEXT: [[TMPMEM:%.*]] = alloca [10 x i8], align 4
; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds [10 x i8], [10 x i8]* [[TMPMEM]], i32 0, i32 0
; CHECK-NEXT: [[P:%.*]] = alloca i8, i32 [[N:%.*]], align 4
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[P]], i8* align 1 getelementptr inbounds ([9 x i8], [9 x i8]* @str, i32 0, i32 0), i32 9, i1 false)
; CHECK-NEXT: [[P10:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 9
; CHECK-NEXT: store i8 0, i8* [[P10]], align 1
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP]], i8* [[P]], i32 10, i1 false)
; CHECK-NEXT: call void @external()
; CHECK-NEXT: [[HEAP:%.*]] = call i8* @malloc(i32 9)
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[HEAP]], i8* [[P]], i32 9, i1 false)
; CHECK-NEXT: call void @useit(i8* [[HEAP]])
; CHECK-NEXT: ret i32 0
; NO_MSSA-LABEL: @test_norestore(
; NO_MSSA-NEXT: [[TMPMEM:%.*]] = alloca [10 x i8], align 4
; NO_MSSA-NEXT: [[TMP:%.*]] = getelementptr inbounds [10 x i8], [10 x i8]* [[TMPMEM]], i32 0, i32 0
; NO_MSSA-NEXT: [[P:%.*]] = alloca i8, i32 [[N:%.*]], align 4
; NO_MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[P]], i8* align 1 getelementptr inbounds ([9 x i8], [9 x i8]* @str, i32 0, i32 0), i32 9, i1 false)
; NO_MSSA-NEXT: [[P10:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 9
; NO_MSSA-NEXT: store i8 0, i8* [[P10]], align 1
; NO_MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP]], i8* [[P]], i32 10, i1 false)
; NO_MSSA-NEXT: call void @external()
; NO_MSSA-NEXT: [[HEAP:%.*]] = call i8* @malloc(i32 9)
; NO_MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[HEAP]], i8* [[P]], i32 9, i1 false)
; NO_MSSA-NEXT: call void @useit(i8* [[HEAP]])
; NO_MSSA-NEXT: ret i32 0
;
; MSSA-LABEL: @test_norestore(
; MSSA-NEXT: [[TMPMEM:%.*]] = alloca [10 x i8], align 4
; MSSA-NEXT: [[TMP:%.*]] = getelementptr inbounds [10 x i8], [10 x i8]* [[TMPMEM]], i32 0, i32 0
; MSSA-NEXT: [[P:%.*]] = alloca i8, i32 [[N:%.*]], align 4
; MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[P]], i8* align 1 getelementptr inbounds ([9 x i8], [9 x i8]* @str, i32 0, i32 0), i32 9, i1 false)
; MSSA-NEXT: [[P10:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 9
; MSSA-NEXT: store i8 0, i8* [[P10]], align 1
; MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP]], i8* [[P]], i32 10, i1 false)
; MSSA-NEXT: call void @external()
; MSSA-NEXT: [[HEAP:%.*]] = call i8* @malloc(i32 9)
; MSSA-NEXT: call void @llvm.memmove.p0i8.p0i8.i32(i8* [[HEAP]], i8* align 1 getelementptr inbounds ([9 x i8], [9 x i8]* @str, i32 0, i32 0), i32 9, i1 false)
; MSSA-NEXT: call void @useit(i8* [[HEAP]])
; MSSA-NEXT: ret i32 0
;
%tmpmem = alloca [10 x i8], align 4
%tmp = getelementptr inbounds [10 x i8], [10 x i8]* %tmpmem, i32 0, i32 0