[DSE,MemorySSA] Traverse use-def chain without MemSSA Walker.

For DSE with MemorySSA it is beneficial to manually traverse the chain of
defining accesses, instead of using a MemorySSA walker, so we can better
control the number of steps, combine the traversal with other limits, and
weed out invalid/unprofitable paths early on.
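In essence, the change replaces the walker's getClobberingMemoryAccess query
with an explicit loop over getDefiningAccess that charges every step against
a budget. Below is a minimal, self-contained sketch of that loop; Access,
walkUpwards, and the boolean flags are illustrative stand-ins for MemorySSA
concepts, not the actual LLVM types:

#include <optional>

// Simplified stand-in for a MemoryAccess (illustrative only).
struct Access {
  Access *DefiningAccess = nullptr; // next access up the use-def chain
  int Block = 0;                    // id of the containing basic block
  bool IsLiveOnEntry = false;       // models MSSA.isLiveOnEntryDef()
  bool IsPhi = false;               // models isa<MemoryPhi>()
  bool Skippable = false;           // models canSkipDef()
};

// Walk the defining accesses upwards from Current, charging each step
// against WalkBudget. Same-block steps are cheaper because they are more
// likely to yield a removable store.
std::optional<Access *> walkUpwards(const Access *Killing, Access *Current,
                                    unsigned &WalkBudget,
                                    unsigned SameBBCost = 1,
                                    unsigned OtherBBCost = 5) {
  while (true) {
    if (Current->IsLiveOnEntry)
      return std::nullopt; // reached the top of the chain, nothing to kill

    unsigned Cost = Killing->Block == Current->Block ? SameBBCost : OtherBBCost;
    if (WalkBudget <= Cost)
      return std::nullopt; // budget exhausted, bail out early
    WalkBudget -= Cost;

    if (Current->IsPhi)
      return Current; // caller explores the incoming values separately

    if (!Current->Skippable)
      return Current; // candidate def for the elimination checks

    Current = Current->DefiningAccess; // weed out unprofitable defs early
  }
}

With the defaults introduced below (same-block cost 1, other-block cost 5,
budget 70), the walk can take up to 70 same-block steps but only 14
cross-block steps, biasing it towards nearby candidates that are more likely
to be removable.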

To be most effective, this patch requires a follow-up patch, which I will
share soon after putting this one up.

This temporarily XFAILs the limit tests, because we now explore more
MemoryDefs that may not alias/clobber the killing def. This will be
improved/fixed by the follow-up patch.
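To make this concrete, continuing the hypothetical sketch above: the walker's
clobber query used to skip a non-aliasing def and return the real clobber
directly, whereas the manual walk surfaces the non-aliasing def as a
candidate first, so it now counts against the scan limit:

// Chain: Killing -> A -> B, where A does not clobber the killing store.
int main() {
  Access B;                 // the def the walker would have returned
  Access A;                 // non-aliasing def, previously skipped
  A.DefiningAccess = &B;
  Access Killing;
  Killing.DefiningAccess = &A;

  unsigned Budget = 70; // mirrors the dse-memoryssa-walklimit default
  // Returns A, not B: alias-based weeding only arrives with the follow-up
  // patch, so non-clobbering candidates now consume the limits.
  auto Candidate = walkUpwards(&Killing, Killing.DefiningAccess, Budget);
  return Candidate ? 0 : 1;
}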

This patch also renames some `Dom*` variables to `Earlier*`, because the
dominance relation is not actually used or important here, and the old names
were potentially confusing.

This patch allows us to aggressively cut down compile time (geomean
-O3 -0.64%, ReleaseThinLTO -1.65%), at the expense of removing fewer stores.
Subsequent patches will increase the number of removed stores again, while
keeping compile time in check.

http://llvm-compile-time-tracker.com/compare.php?from=d8e3294118a8c5f3f97688a704d5a05b67646012&to=0a929b6978a068af8ddb02d0d4714a2843dd8ba9&stat=instructions

Reviewed By: asbirlea

Differential Revision: https://reviews.llvm.org/D86486
Florian Hahn 2020-08-27 09:52:19 +01:00
parent 26d659bbe0
commit e717fdb0f1
4 changed files with 98 additions and 72 deletions


@@ -111,12 +111,29 @@ static cl::opt<unsigned>
     MemorySSAScanLimit("dse-memoryssa-scanlimit", cl::init(150), cl::Hidden,
                        cl::desc("The number of memory instructions to scan for "
                                 "dead store elimination (default = 150)"));
 
+static cl::opt<unsigned> MemorySSAUpwardsStepLimit(
+    "dse-memoryssa-walklimit", cl::init(70), cl::Hidden,
+    cl::desc("The maximum number of steps while walking upwards to find "
+             "MemoryDefs that may be killed (default = 70)"));
+
 static cl::opt<unsigned> MemorySSADefsPerBlockLimit(
     "dse-memoryssa-defs-per-block-limit", cl::init(5000), cl::Hidden,
     cl::desc("The number of MemoryDefs we consider as candidates to eliminate "
              "other stores per basic block (default = 5000)"));
 
+static cl::opt<unsigned> MemorySSASameBBStepCost(
+    "dse-memoryssa-samebb-cost", cl::init(1), cl::Hidden,
+    cl::desc(
+        "The cost of a step in the same basic block as the killing MemoryDef "
+        "(default = 1)"));
+
+static cl::opt<unsigned>
+    MemorySSAOtherBBStepCost("dse-memoryssa-otherbb-cost", cl::init(5),
+                             cl::Hidden,
+                             cl::desc("The cost of a step in a different basic "
+                                      "block than the killing MemoryDef "
+                                      "(default = 5)"));
+
 static cl::opt<unsigned> MemorySSAPathCheckLimit(
     "dse-memoryssa-path-check-limit", cl::init(50), cl::Hidden,
     cl::desc("The maximum number of blocks to check when trying to prove that "
@@ -1442,16 +1459,17 @@ namespace {
 // in between both MemoryDefs. A bit more concretely:
 //
 // For all MemoryDefs StartDef:
-// 1. Get the next dominating clobbering MemoryDef (DomAccess) by walking
+// 1. Get the next dominating clobbering MemoryDef (EarlierAccess) by walking
 //    upwards.
-// 2. Check that there are no reads between DomAccess and the StartDef by
-//    checking all uses starting at DomAccess and walking until we see StartDef.
-// 3. For each found DomDef, check that:
-//   1. There are no barrier instructions between DomDef and StartDef (like
+// 2. Check that there are no reads between EarlierAccess and the StartDef by
+//    checking all uses starting at EarlierAccess and walking until we see
+//    StartDef.
+// 3. For each found EarlierDef, check that:
+//   1. There are no barrier instructions between EarlierDef and StartDef (like
 //      throws or stores with ordering constraints).
-//   2. StartDef is executed whenever DomDef is executed.
-//   3. StartDef completely overwrites DomDef.
-// 4. Erase DomDef from the function and MemorySSA.
+//   2. StartDef is executed whenever EarlierDef is executed.
+//   3. StartDef completely overwrites EarlierDef.
+// 4. Erase EarlierDef from the function and MemorySSA.
 
 // Returns true if \p M is an intrinsic that does not read or write memory.
 bool isNoopIntrinsic(MemoryUseOrDef *M) {
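Step 2 of the comment above amounts to a worklist scan over all MemorySSA
uses reachable from EarlierAccess. Sketched under the same simplified model
as before (UseNode and MayReadLoc are stand-ins, not the LLVM API; unlike the
real implementation, this version scans all reachable uses rather than
stopping early):

#include <cstddef>
#include <unordered_set>
#include <vector>

// Stand-in for a MemoryAccess and its use list; MayReadLoc models
// isReadClobber() for the location written by the earlier def.
struct UseNode {
  bool MayReadLoc = false;
  std::vector<UseNode *> Users; // models Acc->uses()
};

// Returns true if no access between Earlier and Start reads the location;
// Earlier and Start themselves are exempt from the read check.
bool noReadsBetween(UseNode *Earlier, UseNode *Start) {
  std::vector<UseNode *> WorkList{Earlier};
  std::unordered_set<UseNode *> Seen{Earlier};
  for (std::size_t I = 0; I < WorkList.size(); ++I) {
    UseNode *U = WorkList[I];
    if (U != Earlier && U != Start && U->MayReadLoc)
      return false; // a read clobber keeps the earlier def alive
    for (UseNode *User : U->Users)
      if (Seen.insert(User).second)
        WorkList.push_back(User);
  }
  return true;
}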
@@ -1792,13 +1810,12 @@ struct DSEState {
   Optional<MemoryAccess *>
   getDomMemoryDef(MemoryDef *KillingDef, MemoryAccess *Current,
                   MemoryLocation DefLoc, const Value *DefUO, CheckCache &Cache,
-                  unsigned &ScanLimit) {
-    if (ScanLimit == 0) {
+                  unsigned &ScanLimit, unsigned &WalkerStepLimit) {
+    if (ScanLimit == 0 || WalkerStepLimit == 0) {
       LLVM_DEBUG(dbgs() << "\n    ...  hit scan limit\n");
       return None;
     }
 
-    MemoryAccess *DomAccess;
     MemoryAccess *StartAccess = Current;
     bool StepAgain;
     LLVM_DEBUG(dbgs() << "  trying to get dominating access for " << *Current
@@ -1810,42 +1827,42 @@ struct DSEState {
       if (MSSA.isLiveOnEntryDef(Current))
         return None;
 
-      if (isa<MemoryPhi>(Current)) {
-        DomAccess = Current;
-        break;
-      }
-      MemoryUseOrDef *CurrentUD = cast<MemoryUseOrDef>(Current);
-      // Look for access that clobber DefLoc.
-      DomAccess = MSSA.getSkipSelfWalker()->getClobberingMemoryAccess(CurrentUD,
-                                                                      DefLoc);
-      if (MSSA.isLiveOnEntryDef(DomAccess))
+      // Cost of a step. Accesses in the same block are more likely to be valid
+      // candidates for elimination, hence consider them cheaper.
+      unsigned StepCost = KillingDef->getBlock() == Current->getBlock()
+                              ? MemorySSASameBBStepCost
+                              : MemorySSAOtherBBStepCost;
+      if (WalkerStepLimit <= StepCost)
         return None;
+      WalkerStepLimit -= StepCost;
 
-      if (isa<MemoryPhi>(DomAccess))
+      if (isa<MemoryPhi>(Current))
         break;
 
-      // Check if we can skip DomDef for DSE.
-      MemoryDef *DomDef = dyn_cast<MemoryDef>(DomAccess);
-      if (DomDef && canSkipDef(DomDef, !isInvisibleToCallerBeforeRet(DefUO))) {
+      // Check if we can skip EarlierDef for DSE.
+      MemoryDef *CurrentDef = dyn_cast<MemoryDef>(Current);
+      if (CurrentDef &&
+          canSkipDef(CurrentDef, !isInvisibleToCallerBeforeRet(DefUO))) {
         StepAgain = true;
-        Current = DomDef->getDefiningAccess();
+        Current = CurrentDef->getDefiningAccess();
       }
     } while (StepAgain);
 
+    MemoryAccess *EarlierAccess = Current;
+
     // Accesses to objects accessible after the function returns can only be
     // eliminated if the access is killed along all paths to the exit. Collect
     // the blocks with killing (=completely overwriting MemoryDefs) and check if
-    // they cover all paths from DomAccess to any function exit.
+    // they cover all paths from EarlierAccess to any function exit.
     SmallPtrSet<Instruction *, 16> KillingDefs;
     KillingDefs.insert(KillingDef->getMemoryInst());
-    Instruction *DomMemInst = isa<MemoryDef>(DomAccess)
-                                  ? cast<MemoryDef>(DomAccess)->getMemoryInst()
-                                  : nullptr;
+    Instruction *EarlierMemInst =
+        isa<MemoryDef>(EarlierAccess)
+            ? cast<MemoryDef>(EarlierAccess)->getMemoryInst()
+            : nullptr;
     LLVM_DEBUG({
-      dbgs() << "  Checking for reads of " << *DomAccess;
-      if (DomMemInst)
-        dbgs() << " (" << *DomMemInst << ")\n";
+      dbgs() << "  Checking for reads of " << *EarlierAccess;
+      if (EarlierMemInst)
+        dbgs() << " (" << *EarlierMemInst << ")\n";
       else
         dbgs() << ")\n";
     });
@@ -1855,14 +1872,14 @@ struct DSEState {
       for (Use &U : Acc->uses())
         WorkList.insert(cast<MemoryAccess>(U.getUser()));
     };
-    PushMemUses(DomAccess);
+    PushMemUses(EarlierAccess);
 
     // Optimistically collect all accesses for reads. If we do not find any
     // read clobbers, add them to the cache.
     SmallPtrSet<MemoryAccess *, 16> KnownNoReads;
-    if (!DomMemInst || !DomMemInst->mayReadFromMemory())
-      KnownNoReads.insert(DomAccess);
-    // Check if DomDef may be read.
+    if (!EarlierMemInst || !EarlierMemInst->mayReadFromMemory())
+      KnownNoReads.insert(EarlierAccess);
+    // Check if EarlierDef may be read.
     for (unsigned I = 0; I < WorkList.size(); I++) {
       MemoryAccess *UseAccess = WorkList[I];
@@ -1928,15 +1945,15 @@ struct DSEState {
         LLVM_DEBUG(dbgs() << "    ... found read clobber\n");
         Cache.KnownReads.insert(UseAccess);
         Cache.KnownReads.insert(StartAccess);
-        Cache.KnownReads.insert(DomAccess);
+        Cache.KnownReads.insert(EarlierAccess);
         return None;
       }
 
-      // For the KillingDef and DomAccess we only have to check if it reads the
-      // memory location.
+      // For the KillingDef and EarlierAccess we only have to check if it reads
+      // the memory location.
       // TODO: It would probably be better to check for self-reads before
       // calling the function.
-      if (KillingDef == UseAccess || DomAccess == UseAccess) {
+      if (KillingDef == UseAccess || EarlierAccess == UseAccess) {
         LLVM_DEBUG(dbgs() << "    ... skipping killing def/dom access\n");
         continue;
       }
@@ -1945,7 +1962,7 @@ struct DSEState {
       // the original location. Otherwise we have to check uses of *all*
       // MemoryDefs we discover, including non-aliasing ones. Otherwise we might
       // miss cases like the following
-      //   1 = Def(LoE) ; <----- DomDef stores [0,1]
+      //   1 = Def(LoE) ; <----- EarlierDef stores [0,1]
       //   2 = Def(1)   ; (2, 1) = NoAlias,   stores [2,3]
       //   Use(2)       ; MayAlias 2 *and* 1, loads [0, 3].
       //   (The Use points to the *first* Def it may alias)
@@ -1953,10 +1970,11 @@ struct DSEState {
       //    stores [0,1]
       if (MemoryDef *UseDef = dyn_cast<MemoryDef>(UseAccess)) {
         if (isCompleteOverwrite(DefLoc, UseInst)) {
-          if (!isInvisibleToCallerAfterRet(DefUO) && UseAccess != DomAccess) {
+          if (!isInvisibleToCallerAfterRet(DefUO) &&
+              UseAccess != EarlierAccess) {
             BasicBlock *MaybeKillingBlock = UseInst->getParent();
             if (PostOrderNumbers.find(MaybeKillingBlock)->second <
-                PostOrderNumbers.find(DomAccess->getBlock())->second) {
+                PostOrderNumbers.find(EarlierAccess->getBlock())->second) {
               LLVM_DEBUG(dbgs()
                          << "    ... found killing def " << *UseInst << "\n");
@@ -1969,8 +1987,8 @@ struct DSEState {
     }
 
     // For accesses to locations visible after the function returns, make sure
-    // that the location is killed (=overwritten) along all paths from DomAccess
-    // to the exit.
+    // that the location is killed (=overwritten) along all paths from
+    // EarlierAccess to the exit.
     if (!isInvisibleToCallerAfterRet(DefUO)) {
       SmallPtrSet<BasicBlock *, 16> KillingBlocks;
       for (Instruction *KD : KillingDefs)
@@ -1988,22 +2006,23 @@ struct DSEState {
       }
 
       // If CommonPred is in the set of killing blocks, just check if it
-      // post-dominates DomAccess.
+      // post-dominates EarlierAccess.
      if (KillingBlocks.count(CommonPred)) {
-        if (PDT.dominates(CommonPred, DomAccess->getBlock()))
-          return {DomAccess};
+        if (PDT.dominates(CommonPred, EarlierAccess->getBlock()))
+          return {EarlierAccess};
         return None;
       }
 
-      // If the common post-dominator does not post-dominate DomAccess, there
-      // is a path from DomAccess to an exit not going through a killing block.
-      if (PDT.dominates(CommonPred, DomAccess->getBlock())) {
+      // If the common post-dominator does not post-dominate EarlierAccess,
+      // there is a path from EarlierAccess to an exit not going through a
+      // killing block.
+      if (PDT.dominates(CommonPred, EarlierAccess->getBlock())) {
         SetVector<BasicBlock *> WorkList;
 
-        // DomAccess's post-order number provides an upper bound of the blocks
-        // on a path starting at DomAccess.
+        // EarlierAccess's post-order number provides an upper bound of the
+        // blocks on a path starting at EarlierAccess.
         unsigned UpperBound =
-            PostOrderNumbers.find(DomAccess->getBlock())->second;
+            PostOrderNumbers.find(EarlierAccess->getBlock())->second;
 
         // If CommonPred is null, there are multiple exits from the function.
         // They all have to be added to the worklist.
@@ -2015,22 +2034,22 @@ struct DSEState {
         NumCFGTries++;
 
         // Check if all paths starting from an exit node go through one of the
-        // killing blocks before reaching DomAccess.
+        // killing blocks before reaching EarlierAccess.
         for (unsigned I = 0; I < WorkList.size(); I++) {
           NumCFGChecks++;
           BasicBlock *Current = WorkList[I];
           if (KillingBlocks.count(Current))
             continue;
-          if (Current == DomAccess->getBlock())
+          if (Current == EarlierAccess->getBlock())
             return None;
 
-          // DomAccess is reachable from the entry, so we don't have to explore
-          // unreachable blocks further.
+          // EarlierAccess is reachable from the entry, so we don't have to
+          // explore unreachable blocks further.
           if (!DT.isReachableFromEntry(Current))
            continue;
 
           unsigned CPO = PostOrderNumbers.find(Current)->second;
-          // Current block is not on a path starting at DomAccess.
+          // Current block is not on a path starting at EarlierAccess.
           if (CPO > UpperBound)
             continue;
           for (BasicBlock *Pred : predecessors(Current))
@@ -2040,14 +2059,15 @@ struct DSEState {
            return None;
        }
        NumCFGSuccess++;
-        return {DomAccess};
+        return {EarlierAccess};
      }
      return None;
    }
 
-    // No aliasing MemoryUses of DomAccess found, DomAccess is potentially dead.
+    // No aliasing MemoryUses of EarlierAccess found, EarlierAccess is
+    // potentially dead.
    Cache.KnownNoReads.insert(KnownNoReads.begin(), KnownNoReads.end());
-    return {DomAccess};
+    return {EarlierAccess};
  }
 
  // Delete dead memory defs
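The CFG fallback above, which proves that every path from EarlierAccess to a
function exit crosses a killing block by walking predecessors from the exits
and pruning with post-order numbers, can be modeled in isolation. In the
sketch below, Block, Preds, and PostOrder are stand-ins for the LLVM CFG and
the pass's PostOrderNumbers map, not real LLVM types:

#include <cstddef>
#include <unordered_set>
#include <vector>

// Simplified CFG node; PostOrder models the pass's PostOrderNumbers map.
struct Block {
  std::vector<Block *> Preds; // models predecessors(Current)
  unsigned PostOrder = 0;
};

// Returns true if every path from EarlierBB to an exit passes through a
// killing block. EarlierBB's post-order number is an upper bound for blocks
// on a path starting at EarlierBB, so anything above it can be pruned.
bool killedAlongAllExitPaths(
    const Block *EarlierBB,
    const std::unordered_set<const Block *> &KillingBlocks,
    const std::vector<const Block *> &Exits) {
  unsigned UpperBound = EarlierBB->PostOrder;
  std::vector<const Block *> WorkList(Exits.begin(), Exits.end());
  std::unordered_set<const Block *> Seen(Exits.begin(), Exits.end());
  for (std::size_t I = 0; I < WorkList.size(); ++I) {
    const Block *Current = WorkList[I];
    if (KillingBlocks.count(Current))
      continue; // this path is already covered by a killing block
    if (Current == EarlierBB)
      return false; // found an uncovered path from EarlierBB to an exit
    if (Current->PostOrder > UpperBound)
      continue; // not on a path starting at EarlierBB
    for (const Block *Pred : Current->Preds)
      if (Seen.insert(Pred).second)
        WorkList.push_back(Pred);
  }
  return true;
}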
@@ -2248,6 +2268,7 @@ bool eliminateDeadStoresMemorySSA(Function &F, AliasAnalysis &AA,
                       << *KillingDef << " (" << *SI << ")\n");
     unsigned ScanLimit = MemorySSAScanLimit;
+    unsigned WalkerStepLimit = MemorySSAUpwardsStepLimit;
     // Worklist of MemoryAccesses that may be killed by KillingDef.
     SetVector<MemoryAccess *> ToCheck;
     ToCheck.insert(KillingDef->getDefiningAccess());
@@ -2259,22 +2280,23 @@ bool eliminateDeadStoresMemorySSA(Function &F, AliasAnalysis &AA,
       if (State.SkipStores.count(Current))
         continue;
 
-      Optional<MemoryAccess *> Next = State.getDomMemoryDef(
-          KillingDef, Current, SILoc, SILocUnd, Cache, ScanLimit);
+      Optional<MemoryAccess *> Next =
+          State.getDomMemoryDef(KillingDef, Current, SILoc, SILocUnd, Cache,
+                                ScanLimit, WalkerStepLimit);
 
       if (!Next) {
         LLVM_DEBUG(dbgs() << "  finished walk\n");
         continue;
       }
 
-      MemoryAccess *DomAccess = *Next;
-      LLVM_DEBUG(dbgs() << " Checking if we can kill " << *DomAccess);
-      if (isa<MemoryPhi>(DomAccess)) {
+      MemoryAccess *EarlierAccess = *Next;
+      LLVM_DEBUG(dbgs() << " Checking if we can kill " << *EarlierAccess);
+      if (isa<MemoryPhi>(EarlierAccess)) {
         LLVM_DEBUG(dbgs() << "\n  ... adding incoming values to worklist\n");
-        for (Value *V : cast<MemoryPhi>(DomAccess)->incoming_values()) {
+        for (Value *V : cast<MemoryPhi>(EarlierAccess)->incoming_values()) {
           MemoryAccess *IncomingAccess = cast<MemoryAccess>(V);
           BasicBlock *IncomingBlock = IncomingAccess->getBlock();
-          BasicBlock *PhiBlock = DomAccess->getBlock();
+          BasicBlock *PhiBlock = EarlierAccess->getBlock();
 
           // We only consider incoming MemoryAccesses that come before the
           // MemoryPhi. Otherwise we could discover candidates that do not
@@ -2285,7 +2307,7 @@ bool eliminateDeadStoresMemorySSA(Function &F, AliasAnalysis &AA,
         }
         continue;
       }
-      MemoryDef *NextDef = dyn_cast<MemoryDef>(DomAccess);
+      MemoryDef *NextDef = dyn_cast<MemoryDef>(EarlierAccess);
       Instruction *NI = NextDef->getMemoryInst();
       LLVM_DEBUG(dbgs() << "  (" << *NI << ")\n");


@@ -1,5 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; XFAIL: *
+; REQUIRES: asserts
 ; Eliminates store to %R in the entry block.


@@ -1,4 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; XFAIL: *
 ; RUN: opt < %s -basic-aa -dse -enable-dse-memoryssa -S | FileCheck --check-prefix=NO-LIMIT %s
 ; RUN: opt < %s -basic-aa -dse -enable-dse-memoryssa -dse-memoryssa-scanlimit=0 -S | FileCheck --check-prefix=LIMIT-0 %s
 ; RUN: opt < %s -basic-aa -dse -enable-dse-memoryssa -dse-memoryssa-scanlimit=2 -S | FileCheck --check-prefix=LIMIT-2 %s


@@ -406,7 +406,6 @@ define void @accessible_after_return11_loop() {
 ; CHECK-NEXT:    [[C_1:%.*]] = call i1 @cond()
 ; CHECK-NEXT:    br i1 [[C_1]], label [[FOR_BODY_I]], label [[INIT_PARSE_EXIT:%.*]]
 ; CHECK:       init_parse.exit:
-; CHECK-NEXT:    store i32 0, i32* @linenum, align 4
 ; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 16, i8* nonnull undef)
 ; CHECK-NEXT:    store i32 0, i32* @linenum, align 4
 ; CHECK-NEXT:    br label [[FOR_BODY_I20:%.*]]