[DSE,MemorySSA] Check if Current is valid for elimination first.
This changes getDomMemoryDef to check whether Current is a valid candidate for elimination before checking for reads. Before this change, we spent a lot of compile time checking for read accesses of a Current that might not even be removable. This patch flips the logic, so we skip Current if it cannot be removed before checking all its uses. This is much more efficient in practice.

It also adds a more aggressive limit for checking partially overlapping stores. The main problem with overlapping stores is that we do not know whether they will lead to an elimination until we have seen all of them. This patch adds a new limit for overlapping store candidates, which keeps the number of modified overlapping stores roughly the same. This is another substantial compile-time improvement, while also increasing the number of stores eliminated. Geomean -O3 -0.67%, ReleaseThinLTO -0.97%.

http://llvm-compile-time-tracker.com/compare.php?from=0a929b6978a068af8ddb02d0d4714a2843dd8ba9&to=2e630629b43f64b60b282e90f0d96082fde2dacc&stat=instructions

Reviewed By: asbirlea

Differential Revision: https://reviews.llvm.org/D86487
This commit is contained in: parent 9b50546b0b, commit 43aa7227df
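The effect of the reordering can be illustrated with a minimal standalone sketch. The Candidate struct and findDeadDef helper below are hypothetical stand-ins for the MemorySSA walk, not the LLVM API; they only model "cheap validity test before expensive read scan":

#include <cstddef>
#include <optional>
#include <vector>

// Hypothetical stand-in for the chain of MemoryDefs visited while walking
// upwards from the killing store.
struct Candidate {
  bool Removable;      // cheap to test: an analyzable, removable write
  bool ReadBeforeKill; // expensive to test: requires scanning uses for reads
};

// Old order: pay the expensive read check for every candidate, even ones
// that could never be removed. New order (this patch): skip invalid
// candidates via the cheap test first, so the read check only runs for
// defs that are actually worth eliminating.
std::optional<std::size_t> findDeadDef(const std::vector<Candidate> &Defs) {
  for (std::size_t I = 0; I < Defs.size(); ++I) {
    if (!Defs[I].Removable)
      continue; // cheap skip; no read scan spent on this def
    if (Defs[I].ReadBeforeKill)
      return std::nullopt; // expensive check, now only for valid candidates
    return I; // valid candidate with no intervening read: can be eliminated
  }
  return std::nullopt;
}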
@@ -87,6 +87,8 @@ STATISTIC(NumModifiedStores, "Number of stores modified");
 STATISTIC(NumCFGChecks, "Number of stores modified");
 STATISTIC(NumCFGTries, "Number of stores modified");
 STATISTIC(NumCFGSuccess, "Number of stores modified");
+STATISTIC(NumGetDomMemoryDefPassed,
+          "Number of times a valid candidate is returned from getDomMemoryDef");
+STATISTIC(NumDomMemDefChecks,
+          "Number iterations check for reads in getDomMemoryDef");
@@ -116,6 +118,12 @@ static cl::opt<unsigned> MemorySSAUpwardsStepLimit(
     cl::desc("The maximum number of steps while walking upwards to find "
              "MemoryDefs that may be killed (default = 70)"));

+static cl::opt<unsigned> MemorySSAPartialStoreLimit(
+    "dse-memoryssa-partial-store-limit", cl::init(5), cl::Hidden,
+    cl::desc("The maximum number candidates that only partially overwrite the "
+             "killing MemoryDef to consider"
+             " (default = 5)"));
+
 static cl::opt<unsigned> MemorySSADefsPerBlockLimit(
     "dse-memoryssa-defs-per-block-limit", cl::init(5000), cl::Hidden,
     cl::desc("The number of MemoryDefs we consider as candidates to eliminated "
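Because the new cut-off is a cl::opt, the trade-off can be explored from the command line; for example (input.ll is a placeholder file name; the flag matches the declaration above and the RUN lines in the updated tests below):

opt -S -dse -enable-dse-memoryssa -dse-memoryssa-partial-store-limit=10 input.ll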
@@ -1464,12 +1472,12 @@ namespace {
 // 2. Check that there are no reads between EarlierAccess and the StartDef by
 //    checking all uses starting at EarlierAccess and walking until we see
 //    StartDef.
-// 3. For each found EarlierDef, check that:
-//   1. There are no barrier instructions between EarlierDef and StartDef (like
+// 3. For each found CurrentDef, check that:
+//   1. There are no barrier instructions between CurrentDef and StartDef (like
 //      throws or stores with ordering constraints).
-//   2. StartDef is executed whenever EarlierDef is executed.
-//   3. StartDef completely overwrites EarlierDef.
-// 4. Erase EarlierDef from the function and MemorySSA.
+//   2. StartDef is executed whenever CurrentDef is executed.
+//   3. StartDef completely overwrites CurrentDef.
+// 4. Erase CurrentDef from the function and MemorySSA.

 // Returns true if \p M is an intrisnic that does not read or write memory.
 bool isNoopIntrinsic(MemoryUseOrDef *M) {
@@ -1801,26 +1809,29 @@ struct DSEState {
     return isRefSet(BatchAA.getModRefInfo(UseInst, DefLoc));
   }

-  // Find a MemoryDef writing to \p DefLoc and dominating \p Current, with no
-  // read access between them or on any other path to a function exit block if
-  // \p DefLoc is not accessible after the function returns. If there is no such
-  // MemoryDef, return None. The returned value may not (completely) overwrite
-  // \p DefLoc. Currently we bail out when we encounter an aliasing MemoryUse
-  // (read).
+  // Find a MemoryDef writing to \p DefLoc and dominating \p StartAccess, with
+  // no read access between them or on any other path to a function exit block
+  // if \p DefLoc is not accessible after the function returns. If there is no
+  // such MemoryDef, return None. The returned value may not (completely)
+  // overwrite \p DefLoc. Currently we bail out when we encounter an aliasing
+  // MemoryUse (read).
   Optional<MemoryAccess *>
-  getDomMemoryDef(MemoryDef *KillingDef, MemoryAccess *Current,
+  getDomMemoryDef(MemoryDef *KillingDef, MemoryAccess *StartAccess,
                   MemoryLocation DefLoc, const Value *DefUO, CheckCache &Cache,
-                  unsigned &ScanLimit, unsigned &WalkerStepLimit) {
+                  unsigned &ScanLimit, unsigned &WalkerStepLimit,
+                  bool IsMemTerm, unsigned &PartialLimit) {
     if (ScanLimit == 0 || WalkerStepLimit == 0) {
       LLVM_DEBUG(dbgs() << "\n    ... hit scan limit\n");
       return None;
     }

-    MemoryAccess *StartAccess = Current;
+    MemoryAccess *Current = StartAccess;
     Instruction *KillingI = KillingDef->getMemoryInst();
     bool StepAgain;
-    LLVM_DEBUG(dbgs() << "  trying to get dominating access for " << *Current
-                      << "\n");
-    // Find the next clobbering Mod access for DefLoc, starting at Current.
+    LLVM_DEBUG(dbgs() << "  trying to get dominating access for "
+                      << *StartAccess << "\n");
+
+    // Find the next clobbering Mod access for DefLoc, starting at StartAccess.
     do {
       StepAgain = false;
       // Reached TOP.
@@ -1839,12 +1850,86 @@ struct DSEState {
       if (isa<MemoryPhi>(Current))
         break;

-      // Check if we can skip EarlierDef for DSE.
-      MemoryDef *CurrentDef = dyn_cast<MemoryDef>(Current);
-      if (CurrentDef &&
-          canSkipDef(CurrentDef, !isInvisibleToCallerBeforeRet(DefUO))) {
+      // Below, check if CurrentDef is a valid candidate to be eliminated by
+      // KillingDef. If it is not, check the next candidate.
+      MemoryDef *CurrentDef = cast<MemoryDef>(Current);
+      Instruction *CurrentI = CurrentDef->getMemoryInst();
+
+      if (canSkipDef(CurrentDef, !isInvisibleToCallerBeforeRet(DefUO))) {
         StepAgain = true;
         Current = CurrentDef->getDefiningAccess();
         continue;
       }
+
+      // Before we try to remove anything, check for any extra throwing
+      // instructions that block us from DSEing
+      if (mayThrowBetween(KillingI, CurrentI, DefUO)) {
+        LLVM_DEBUG(dbgs() << "  ... skip, may throw!\n");
+        return None;
+      }
+
+      // Check for anything that looks like it will be a barrier to further
+      // removal
+      if (isDSEBarrier(DefUO, CurrentI)) {
+        LLVM_DEBUG(dbgs() << "  ... skip, barrier\n");
+        return None;
+      }
+
+      // If Current is known to be on path that reads DefLoc or is a read
+      // clobber, bail out, as the path is not profitable. We skip this check
+      // for intrinsic calls, because the code knows how to handle memcpy
+      // intrinsics.
+      if (!isa<IntrinsicInst>(CurrentI) &&
+          (Cache.KnownReads.contains(Current) ||
+           isReadClobber(DefLoc, CurrentI))) {
+        Cache.KnownReads.insert(Current);
+        return None;
+      }
+
+      // If Current cannot be analyzed or is not removable, check the next
+      // candidate.
+      if (!hasAnalyzableMemoryWrite(CurrentI, TLI) || !isRemovable(CurrentI)) {
+        StepAgain = true;
+        Current = CurrentDef->getDefiningAccess();
+        continue;
+      }
+
+      auto CurrentLoc = getLocForWriteEx(CurrentI);
+      if (!CurrentLoc)
+        break;
+
+      if (IsMemTerm) {
+        // If the killing def is a memory terminator (e.g. lifetime.end), check
+        // the next candidate if the current Current does not write the same
+        // underlying object as the terminator.
+        const Value *NIUnd = getUnderlyingObject(CurrentLoc->Ptr);
+        if (DefUO != NIUnd) {
+          StepAgain = true;
+          Current = CurrentDef->getDefiningAccess();
+        }
+        continue;
+      } else {
+        int64_t InstWriteOffset, DepWriteOffset;
+        auto OR = isOverwrite(DefLoc, *CurrentLoc, DL, TLI, DepWriteOffset,
+                              InstWriteOffset, BatchAA, &F);
+        // If Current does not write to the same object as KillingDef, check
+        // the next candidate.
+        if (OR == OW_Unknown) {
+          StepAgain = true;
+          Current = CurrentDef->getDefiningAccess();
+        } else if (OR == OW_MaybePartial) {
+          // If KillingDef only partially overwrites Current, check the next
+          // candidate if the partial step limit is exceeded. This aggressively
+          // limits the number of candidates for partial store elimination,
+          // which are less likely to be removable in the end.
+          if (PartialLimit <= 1) {
+            StepAgain = true;
+            Current = CurrentDef->getDefiningAccess();
+            WalkerStepLimit -= 1;
+            continue;
+          }
+          PartialLimit -= 1;
+        }
+      }
     } while (StepAgain);
@@ -2260,10 +2345,14 @@ bool eliminateDeadStoresMemorySSA(Function &F, AliasAnalysis &AA,

     unsigned ScanLimit = MemorySSAScanLimit;
     unsigned WalkerStepLimit = MemorySSAUpwardsStepLimit;
+    unsigned PartialLimit = MemorySSAPartialStoreLimit;
     // Worklist of MemoryAccesses that may be killed by KillingDef.
     SetVector<MemoryAccess *> ToCheck;
     ToCheck.insert(KillingDef->getDefiningAccess());

+    if (!SILocUnd)
+      continue;
+    bool IsMemTerm = State.isMemTerminatorInst(SI);
     DSEState::CheckCache Cache;
     // Check if MemoryAccesses in the worklist are killed by KillingDef.
     for (unsigned I = 0; I < ToCheck.size(); I++) {
@@ -2271,9 +2360,9 @@ bool eliminateDeadStoresMemorySSA(Function &F, AliasAnalysis &AA,
       if (State.SkipStores.count(Current))
         continue;

-      Optional<MemoryAccess *> Next =
-          State.getDomMemoryDef(KillingDef, Current, SILoc, SILocUnd, Cache,
-                                ScanLimit, WalkerStepLimit);
+      Optional<MemoryAccess *> Next = State.getDomMemoryDef(
+          KillingDef, Current, SILoc, SILocUnd, Cache, ScanLimit,
+          WalkerStepLimit, IsMemTerm, PartialLimit);

       if (!Next) {
         LLVM_DEBUG(dbgs() << "  finished walk\n");
@@ -2301,41 +2390,17 @@ bool eliminateDeadStoresMemorySSA(Function &F, AliasAnalysis &AA,
       MemoryDef *NextDef = dyn_cast<MemoryDef>(EarlierAccess);
       Instruction *NI = NextDef->getMemoryInst();
       LLVM_DEBUG(dbgs() << " (" << *NI << ")\n");
-
-      // Before we try to remove anything, check for any extra throwing
-      // instructions that block us from DSEing
-      if (State.mayThrowBetween(SI, NI, SILocUnd)) {
-        LLVM_DEBUG(dbgs() << "  ... skip, may throw!\n");
-        break;
-      }
-
-      // Check for anything that looks like it will be a barrier to further
-      // removal
-      if (State.isDSEBarrier(SILocUnd, NI)) {
-        LLVM_DEBUG(dbgs() << "  ... skip, barrier\n");
-        continue;
-      }
-
       ToCheck.insert(NextDef->getDefiningAccess());
-
-      if (!hasAnalyzableMemoryWrite(NI, TLI)) {
-        LLVM_DEBUG(dbgs() << "  ... skip, cannot analyze def\n");
-        continue;
-      }
-
-      if (!isRemovable(NI)) {
-        LLVM_DEBUG(dbgs() << "  ... skip, cannot remove def\n");
-        continue;
-      }
+      NumGetDomMemoryDefPassed++;

       if (!DebugCounter::shouldExecute(MemorySSACounter))
         continue;

       MemoryLocation NILoc = *State.getLocForWriteEx(NI);

-      if (State.isMemTerminatorInst(SI)) {
+      if (IsMemTerm) {
         const Value *NIUnd = getUnderlyingObject(NILoc.Ptr);
-        if (!SILocUnd || SILocUnd != NIUnd)
+        if (SILocUnd != NIUnd)
           continue;
         LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n  DEAD: " << *NI
                           << "\n  KILLER: " << *SI << '\n');
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -dse -enable-dse-memoryssa -enable-dse-partial-store-merging=false < %s | FileCheck %s
+; RUN: opt -S -dse -enable-dse-memoryssa -enable-dse-partial-store-merging=false < %s | FileCheck --check-prefixes=CHECK,DEFAULT-LIMIT %s
+; RUN: opt -S -dse -enable-dse-memoryssa -enable-dse-partial-store-merging=false -dse-memoryssa-partial-store-limit=10 < %s | FileCheck --check-prefixes=CHECK,LARGER-LIMIT %s
 target datalayout = "E-m:e-i64:64-n32:64"
 target triple = "powerpc64le-unknown-linux"
@@ -209,22 +210,43 @@ declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1)
 declare void @goFunc(%struct.foostruct*)
 declare i32 @fa(i8*, i8**, i32, i8, i8*)

+; We miss this case, because of an aggressive limit of partial overlap analysis.
+; With a larger partial store limit, we remove the memset.
 define void @test4() {
-; CHECK-LABEL: @test4(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[BANG:%.*]] = alloca [[STRUCT_FOOSTRUCT:%.*]], align 8
-; CHECK-NEXT:    [[V2:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 0
-; CHECK-NEXT:    store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V2]], align 8
-; CHECK-NEXT:    [[V3:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 1
-; CHECK-NEXT:    store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V3]], align 8
-; CHECK-NEXT:    [[V4:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 2
-; CHECK-NEXT:    store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V4]], align 8
-; CHECK-NEXT:    [[V5:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 3
-; CHECK-NEXT:    store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V5]], align 8
-; CHECK-NEXT:    [[V6:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 4
-; CHECK-NEXT:    store void (i8*, i32, i32)* null, void (i8*, i32, i32)** [[V6]], align 8
-; CHECK-NEXT:    call void @goFunc(%struct.foostruct* [[BANG]])
-; CHECK-NEXT:    ret void
+; DEFAULT-LIMIT-LABEL: @test4(
+; DEFAULT-LIMIT-NEXT:  entry:
+; DEFAULT-LIMIT-NEXT:    [[BANG:%.*]] = alloca [[STRUCT_FOOSTRUCT:%.*]], align 8
+; DEFAULT-LIMIT-NEXT:    [[V1:%.*]] = bitcast %struct.foostruct* [[BANG]] to i8*
+; DEFAULT-LIMIT-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[V1]], i64 32
+; DEFAULT-LIMIT-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 8 [[TMP0]], i8 0, i64 8, i1 false)
+; DEFAULT-LIMIT-NEXT:    [[V2:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 0
+; DEFAULT-LIMIT-NEXT:    store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V2]], align 8
+; DEFAULT-LIMIT-NEXT:    [[V3:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 1
+; DEFAULT-LIMIT-NEXT:    store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V3]], align 8
+; DEFAULT-LIMIT-NEXT:    [[V4:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 2
+; DEFAULT-LIMIT-NEXT:    store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V4]], align 8
+; DEFAULT-LIMIT-NEXT:    [[V5:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 3
+; DEFAULT-LIMIT-NEXT:    store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V5]], align 8
+; DEFAULT-LIMIT-NEXT:    [[V6:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 4
+; DEFAULT-LIMIT-NEXT:    store void (i8*, i32, i32)* null, void (i8*, i32, i32)** [[V6]], align 8
+; DEFAULT-LIMIT-NEXT:    call void @goFunc(%struct.foostruct* [[BANG]])
+; DEFAULT-LIMIT-NEXT:    ret void
+;
+; LARGER-LIMIT-LABEL: @test4(
+; LARGER-LIMIT-NEXT:  entry:
+; LARGER-LIMIT-NEXT:    [[BANG:%.*]] = alloca [[STRUCT_FOOSTRUCT:%.*]], align 8
+; LARGER-LIMIT-NEXT:    [[V2:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 0
+; LARGER-LIMIT-NEXT:    store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V2]], align 8
+; LARGER-LIMIT-NEXT:    [[V3:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 1
+; LARGER-LIMIT-NEXT:    store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V3]], align 8
+; LARGER-LIMIT-NEXT:    [[V4:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 2
+; LARGER-LIMIT-NEXT:    store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V4]], align 8
+; LARGER-LIMIT-NEXT:    [[V5:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 3
+; LARGER-LIMIT-NEXT:    store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V5]], align 8
+; LARGER-LIMIT-NEXT:    [[V6:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 4
+; LARGER-LIMIT-NEXT:    store void (i8*, i32, i32)* null, void (i8*, i32, i32)** [[V6]], align 8
+; LARGER-LIMIT-NEXT:    call void @goFunc(%struct.foostruct* [[BANG]])
+; LARGER-LIMIT-NEXT:    ret void
 ;
 entry:
@@ -1,7 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-
-; XFAIL: *

 ; REQUIRES: asserts

 ; Eliminates store to %R in the entry block.
@@ -1,7 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-
-; XFAIL: *

 ; RUN: opt < %s -basic-aa -dse -enable-dse-memoryssa -S | FileCheck --check-prefix=NO-LIMIT %s
 ; RUN: opt < %s -basic-aa -dse -enable-dse-memoryssa -dse-memoryssa-scanlimit=0 -S | FileCheck --check-prefix=LIMIT-0 %s
 ; RUN: opt < %s -basic-aa -dse -enable-dse-memoryssa -dse-memoryssa-scanlimit=2 -S | FileCheck --check-prefix=LIMIT-2 %s
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -dse -enable-dse-memoryssa %s -S | FileCheck %s
+; RUN: opt -dse -enable-dse-memoryssa %s -S | FileCheck --check-prefixes=CHECK,DEFAULT-LIMIT %s
+; RUN: opt -dse -enable-dse-memoryssa -dse-memoryssa-partial-store-limit=10 %s -S | FileCheck --check-prefixes=CHECK,LARGER-LIMIT %s

 %struct.ham = type { [3 x double], [3 x double]}
@@ -7,28 +8,55 @@
 declare void @may_throw()
 declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1 immarg)

+; We miss this case, because of an aggressive limit of partial overlap analysis.
+; With a larger partial store limit, we remove the memset.
 define void @overlap1(%struct.ham* %arg, i1 %cond) {
-; CHECK-LABEL: @overlap1(
-; CHECK-NEXT:  bb:
-; CHECK-NEXT:    [[TMP:%.*]] = getelementptr inbounds [[STRUCT_HAM:%.*]], %struct.ham* [[ARG:%.*]], i64 0, i32 0, i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 0, i64 1
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 0, i64 0
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i64 2
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i64 1
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i32 0
-; CHECK-NEXT:    br i1 [[COND:%.*]], label [[BB7:%.*]], label [[BB8:%.*]]
-; CHECK:       bb7:
-; CHECK-NEXT:    br label [[BB9:%.*]]
-; CHECK:       bb8:
-; CHECK-NEXT:    br label [[BB9]]
-; CHECK:       bb9:
-; CHECK-NEXT:    store double 1.000000e+00, double* [[TMP2]], align 8
-; CHECK-NEXT:    store double 2.000000e+00, double* [[TMP1]], align 8
-; CHECK-NEXT:    store double 3.000000e+00, double* [[TMP]], align 8
-; CHECK-NEXT:    store double 4.000000e+00, double* [[TMP5]], align 8
-; CHECK-NEXT:    store double 5.000000e+00, double* [[TMP4]], align 8
-; CHECK-NEXT:    store double 6.000000e+00, double* [[TMP3]], align 8
-; CHECK-NEXT:    ret void
+; DEFAULT-LIMIT-LABEL: @overlap1(
+; DEFAULT-LIMIT-NEXT:  bb:
+; DEFAULT-LIMIT-NEXT:    [[TMP:%.*]] = getelementptr inbounds [[STRUCT_HAM:%.*]], %struct.ham* [[ARG:%.*]], i64 0, i32 0, i64 2
+; DEFAULT-LIMIT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 0, i64 1
+; DEFAULT-LIMIT-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 0, i64 0
+; DEFAULT-LIMIT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i64 2
+; DEFAULT-LIMIT-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i64 1
+; DEFAULT-LIMIT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i32 0
+; DEFAULT-LIMIT-NEXT:    [[TMP6:%.*]] = bitcast double* [[TMP2]] to i8*
+; DEFAULT-LIMIT-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[TMP6]], i64 32
+; DEFAULT-LIMIT-NEXT:    call void @llvm.memset.p0i8.i64(i8* nonnull align 8 dereferenceable(48) [[TMP0]], i8 0, i64 16, i1 false)
+; DEFAULT-LIMIT-NEXT:    br i1 [[COND:%.*]], label [[BB7:%.*]], label [[BB8:%.*]]
+; DEFAULT-LIMIT:       bb7:
+; DEFAULT-LIMIT-NEXT:    br label [[BB9:%.*]]
+; DEFAULT-LIMIT:       bb8:
+; DEFAULT-LIMIT-NEXT:    br label [[BB9]]
+; DEFAULT-LIMIT:       bb9:
+; DEFAULT-LIMIT-NEXT:    store double 1.000000e+00, double* [[TMP2]], align 8
+; DEFAULT-LIMIT-NEXT:    store double 2.000000e+00, double* [[TMP1]], align 8
+; DEFAULT-LIMIT-NEXT:    store double 3.000000e+00, double* [[TMP]], align 8
+; DEFAULT-LIMIT-NEXT:    store double 4.000000e+00, double* [[TMP5]], align 8
+; DEFAULT-LIMIT-NEXT:    store double 5.000000e+00, double* [[TMP4]], align 8
+; DEFAULT-LIMIT-NEXT:    store double 6.000000e+00, double* [[TMP3]], align 8
+; DEFAULT-LIMIT-NEXT:    ret void
+;
+; LARGER-LIMIT-LABEL: @overlap1(
+; LARGER-LIMIT-NEXT:  bb:
+; LARGER-LIMIT-NEXT:    [[TMP:%.*]] = getelementptr inbounds [[STRUCT_HAM:%.*]], %struct.ham* [[ARG:%.*]], i64 0, i32 0, i64 2
+; LARGER-LIMIT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 0, i64 1
+; LARGER-LIMIT-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 0, i64 0
+; LARGER-LIMIT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i64 2
+; LARGER-LIMIT-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i64 1
+; LARGER-LIMIT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i32 0
+; LARGER-LIMIT-NEXT:    br i1 [[COND:%.*]], label [[BB7:%.*]], label [[BB8:%.*]]
+; LARGER-LIMIT:       bb7:
+; LARGER-LIMIT-NEXT:    br label [[BB9:%.*]]
+; LARGER-LIMIT:       bb8:
+; LARGER-LIMIT-NEXT:    br label [[BB9]]
+; LARGER-LIMIT:       bb9:
+; LARGER-LIMIT-NEXT:    store double 1.000000e+00, double* [[TMP2]], align 8
+; LARGER-LIMIT-NEXT:    store double 2.000000e+00, double* [[TMP1]], align 8
+; LARGER-LIMIT-NEXT:    store double 3.000000e+00, double* [[TMP]], align 8
+; LARGER-LIMIT-NEXT:    store double 4.000000e+00, double* [[TMP5]], align 8
+; LARGER-LIMIT-NEXT:    store double 5.000000e+00, double* [[TMP4]], align 8
+; LARGER-LIMIT-NEXT:    store double 6.000000e+00, double* [[TMP3]], align 8
+; LARGER-LIMIT-NEXT:    ret void
 ;
 bb:
   %tmp = getelementptr inbounds %struct.ham, %struct.ham* %arg, i64 0, i32 0, i64 2
@@ -477,10 +477,8 @@ bb2:
   ret i32 0
 }

-; TODO
-; We can remove redundant store, as noalias %p guarantees that the function does
-; only access it via %p. This also holds for the call to unknown_func even though
-; it could unwind
+; We cannot remove any stores, because @unknown_func may unwind and the caller
+; may read %p while unwinding.
 define void @test34(i32* noalias %p) {
 ; CHECK-LABEL: @test34(
 ; CHECK-NEXT:    store i32 1, i32* [[P:%.*]], align 4
@@ -636,9 +634,10 @@ entry:
   ret void
 }

-; I think this case is currently handled incorrectly by memdeps dse
-; throwing should leave store i32 1, not remove from the free.
 declare void @free(i8* nocapture)

+; We cannot remove `store i32 1, i32* %p`, because @unknown_func may unwind
+; and the caller may read %p while unwinding.
 define void @test41(i32* noalias %P) {
 ; CHECK-LABEL: @test41(
 ; CHECK-NEXT:    [[P2:%.*]] = bitcast i32* [[P:%.*]] to i8*