Revert "[Unroll] Implement a conservative and monotonically increasing cost tracking system during the full unroll heuristic analysis that avoids counting any instruction cost until that instruction becomes "live" through a side-effect or use outside the..."

This reverts commit r269388. It caused some bots to fail, I'm reverting it until I investigate the issue. llvm-svn: 269395
2016-05-13 06:32:25 +00:00 · 2016-05-13 06:32:25 +00:00 · 9be3b8b9bb
parent e58d6228dc
commit 9be3b8b9bb
7 changed files with 18 additions and 236 deletions
--- a/llvm/include/llvm/Analysis/LoopUnrollAnalyzer.h
+++ b/llvm/include/llvm/Analysis/LoopUnrollAnalyzer.h
@ -89,7 +89,6 @@ private:
  bool visitLoad(LoadInst &I);
  bool visitCastInst(CastInst &I);
  bool visitCmpInst(CmpInst &I);
-  bool visitPHINode(PHINode &PN);
 };
 }
 #endif
--- a/llvm/lib/Analysis/LoopUnrollAnalyzer.cpp
+++ b/llvm/lib/Analysis/LoopUnrollAnalyzer.cpp
@ -189,13 +189,3 @@ bool UnrolledInstAnalyzer::visitCmpInst(CmpInst &I) {

  return Base::visitCmpInst(I);
 }
-
-bool UnrolledInstAnalyzer::visitPHINode(PHINode &PN) {
-  // Run base visitor first. This way we can gather some useful for later
-  // analysis information.
-  if (Base::visitPHINode(PN))
-    return true;
-
-  // The loop induction PHI nodes are definitionally free.
-  return PN.getParent() == L->getHeader();
-}
--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@ -184,40 +184,6 @@ static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
  return UP;
 }

-namespace {
-/// A struct to densely store the state of an instruction after unrolling at
-/// each iteration.
-///
-/// This is designed to work like a tuple of <Instruction *, int> for the
-/// purposes of hashing and lookup, but to be able to associate two boolean
-/// states with each key.
-struct UnrolledInstState {
-  Instruction *I;
-  int Iteration : 30;
-  unsigned IsFree : 1;
-  unsigned IsCounted : 1;
-};
-
-/// Hashing and equality testing for a set of the instruction states.
-struct UnrolledInstStateKeyInfo {
-  typedef DenseMapInfo<Instruction *> PtrInfo;
-  typedef DenseMapInfo<std::pair<Instruction *, int>> PairInfo;
-  static inline UnrolledInstState getEmptyKey() {
-    return {PtrInfo::getEmptyKey(), 0, 0, 0};
-  }
-  static inline UnrolledInstState getTombstoneKey() {
-    return {PtrInfo::getTombstoneKey(), 0, 0, 0};
-  }
-  static inline unsigned getHashValue(const UnrolledInstState &S) {
-    return PairInfo::getHashValue({S.I, S.Iteration});
-  }
-  static inline bool isEqual(const UnrolledInstState &LHS,
-                             const UnrolledInstState &RHS) {
-    return PairInfo::isEqual({LHS.I, LHS.Iteration}, {RHS.I, RHS.Iteration});
-  }
-};
-}
-
 namespace {
 struct EstimatedUnrollCost {
  /// \brief The estimated cost after unrolling.
@ -252,25 +218,18 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
  assert(UnrollMaxIterationsCountToAnalyze < (INT_MAX / 2) &&
         "The unroll iterations max is too large!");

-  // Only analyze inner loops. We can't properly estimate cost of nested loops
-  // and we won't visit inner loops again anyway.
-  if (!L->empty())
-    return None;
-
  // Don't simulate loops with a big or unknown tripcount
  if (!UnrollMaxIterationsCountToAnalyze || !TripCount ||
      TripCount > UnrollMaxIterationsCountToAnalyze)
    return None;

  SmallSetVector<BasicBlock *, 16> BBWorklist;
-  SmallSetVector<std::pair<BasicBlock *, BasicBlock *>, 4> ExitWorklist;
  DenseMap<Value *, Constant *> SimplifiedValues;
  SmallVector<std::pair<Value *, Constant *>, 4> SimplifiedInputValues;

  // The estimated cost of the unrolled form of the loop. We try to estimate
  // this by simplifying as much as we can while computing the estimate.
  int UnrolledCost = 0;
-
  // We also track the estimated dynamic (that is, actually executed) cost in
  // the rolled form. This helps identify cases when the savings from unrolling
  // aren't just exposing dead control flows, but actual reduced dynamic
@ -278,97 +237,6 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
  // unrolling.
  int RolledDynamicCost = 0;

-  // We track the simplification of each instruction in each iteration. We use
-  // this to recursively merge costs into the unrolled cost on-demand so that
-  // we don't count the cost of any dead code. This is essentially a map from
-  // <instruction, int> to <bool, bool>, but stored as a densely packed struct.
-  DenseSet<UnrolledInstState, UnrolledInstStateKeyInfo> InstCostMap;
-
-  // A small worklist used to accumulate cost of instructions from each
-  // observable and reached root in the loop.
-  SmallVector<Instruction *, 16> CostWorklist;
-
-  // PHI-used worklist used between iterations while accumulating cost.
-  SmallVector<Instruction *, 4> PHIUsedList;
-
-  // Helper function to accumulate cost for instructions in the loop.
-  auto AddCostRecursively = [&](Instruction &RootI, int Iteration) {
-    assert(Iteration >= 0 && "Cannot have a negative iteration!");
-    assert(CostWorklist.empty() && "Must start with an empty cost list");
-    assert(PHIUsedList.empty() && "Must start with an empty phi used list");
-    CostWorklist.push_back(&RootI);
-    for (;; --Iteration) {
-      do {
-        Instruction *I = CostWorklist.pop_back_val();
-
-        // InstCostMap only uses I and Iteration as a key, the other two values
-        // don't matter here.
-        auto CostIter = InstCostMap.find({I, Iteration, 0, 0});
-        if (CostIter == InstCostMap.end())
-          // If an input to a PHI node comes from a dead path through the loop
-          // we may have no cost data for it here. What that actually means is
-          // that it is free.
-          continue;
-        auto &Cost = *CostIter;
-        if (Cost.IsCounted)
-          // Already counted this instruction.
-          continue;
-
-        // Mark that we are counting the cost of this instruction now.
-        Cost.IsCounted = true;
-
-        // If this is a PHI node in the loop header, just add it to the PHI set.
-        if (auto *PhiI = dyn_cast<PHINode>(I))
-          if (PhiI->getParent() == L->getHeader()) {
-            assert(Cost.IsFree && "Loop PHIs shouldn't be evaluated as they "
-                                  "inherently simplify during unrolling.");
-            if (Iteration == 0)
-              continue;
-
-            // Push the incoming value from the backedge into the PHI used list
-            // if it is an in-loop instruction. We'll use this to populate the
-            // cost worklist for the next iteration (as we count backwards).
-            if (auto *OpI = dyn_cast<Instruction>(
-                    PhiI->getIncomingValueForBlock(L->getLoopLatch())))
-              if (L->contains(OpI))
-                PHIUsedList.push_back(OpI);
-            continue;
-          }
-
-        // First accumulate the cost of this instruction.
-        if (!Cost.IsFree) {
-          UnrolledCost += TTI.getUserCost(I);
-          DEBUG(dbgs() << "Adding cost of instruction (iteration " << Iteration
-                       << "): ");
-          DEBUG(I->dump());
-        }
-
-        // We must count the cost of every operand which is not free,
-        // recursively. If we reach a loop PHI node, simply add it to the set
-        // to be considered on the next iteration (backwards!).
-        for (Value *Op : I->operands()) {
-          // Check whether this operand is free due to being a constant or
-          // outside the loop.
-          auto *OpI = dyn_cast<Instruction>(Op);
-          if (!OpI || !L->contains(OpI))
-            continue;
-
-          // Otherwise accumulate its cost.
-          CostWorklist.push_back(OpI);
-        }
-      } while (!CostWorklist.empty());
-
-      if (PHIUsedList.empty())
-        // We've exhausted the search.
-        break;
-
-      assert(Iteration > 0 &&
-             "Cannot track PHI-used values past the first iteration!");
-      CostWorklist.append(PHIUsedList.begin(), PHIUsedList.end());
-      PHIUsedList.clear();
-    }
-  };
-
  // Ensure that we don't violate the loop structure invariants relied on by
  // this analysis.
  assert(L->isLoopSimplifyForm() && "Must put loop into normal form first.");
@ -423,32 +291,22 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
      // it.  We don't change the actual IR, just count optimization
      // opportunities.
      for (Instruction &I : *BB) {
-        // Track this instruction's expected baseline cost when executing the
-        // rolled loop form.
-        RolledDynamicCost += TTI.getUserCost(&I);
+        int InstCost = TTI.getUserCost(&I);

        // Visit the instruction to analyze its loop cost after unrolling,
-        // and if the visitor returns true, mark the instruction as free after
-        // unrolling and continue.
-        bool IsFree = Analyzer.visit(I);
-        bool Inserted = InstCostMap.insert({&I, (int)Iteration, IsFree,
-                                            /*IsCounted*/ false})
-                            .second;
-        (void)Inserted;
-        assert(Inserted && "Cannot have a state for an unvisited instruction!");
+        // and if the visitor returns false, include this instruction in the
+        // unrolled cost.
+        if (!Analyzer.visit(I))
+          UnrolledCost += InstCost;
+        else {
+          DEBUG(dbgs() << "  " << I
+                       << " would be simplified if loop is unrolled.\n");
+          (void)0;
+        }

-        if (IsFree)
-          continue;
-
-        // If the instruction might have a side-effect recursively account for
-        // the cost of it and all the instructions leading up to it.
-        if (I.mayHaveSideEffects())
-          AddCostRecursively(I, Iteration);
-
-        // Can't properly model a cost of a call.
-        // FIXME: With a proper cost model we should be able to do it.
-        if(isa<CallInst>(&I))
-          return None;
+        // Also track this instructions expected cost when executing the rolled
+        // loop form.
+        RolledDynamicCost += InstCost;

        // If unrolled body turns out to be too big, bail out.
        if (UnrolledCost > MaxUnrolledLoopSize) {
@ -477,8 +335,6 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
                  cast<ConstantInt>(SimpleCond)->isZero() ? 1 : 0);
            if (L->contains(Succ))
              BBWorklist.insert(Succ);
-            else
-              ExitWorklist.insert({BB, Succ});
            continue;
          }
        }
@ -494,8 +350,6 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
                       .getCaseSuccessor();
          if (L->contains(Succ))
            BBWorklist.insert(Succ);
-          else
-            ExitWorklist.insert({BB, Succ});
          continue;
        }
      }
@ -504,8 +358,6 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
      for (BasicBlock *Succ : successors(BB))
        if (L->contains(Succ))
          BBWorklist.insert(Succ);
-        else
-          ExitWorklist.insert({BB, Succ});
    }

    // If we found no optimization opportunities on the first iteration, we
@ -516,23 +368,6 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
      return None;
    }
  }
-
-  while (!ExitWorklist.empty()) {
-    BasicBlock *ExitingBB, *ExitBB;
-    std::tie(ExitingBB, ExitBB) = ExitWorklist.pop_back_val();
-
-    for (Instruction &I : *ExitBB) {
-      auto *PN = dyn_cast<PHINode>(&I);
-      if (!PN)
-        break;
-
-      Value *Op = PN->getIncomingValueForBlock(ExitingBB);
-      if (auto *OpI = dyn_cast<Instruction>(Op))
-        if (L->contains(OpI))
-          AddCostRecursively(*OpI, TripCount - 1);
-    }
-  }
-
  DEBUG(dbgs() << "Analysis finished:\n"
               << "UnrolledCost: " << UnrolledCost << ", "
               << "RolledDynamicCost: " << RolledDynamicCost << "\n");
--- a/llvm/test/Transforms/LoopUnroll/full-unroll-heuristics-2.ll
+++ b/llvm/test/Transforms/LoopUnroll/full-unroll-heuristics-2.ll
@ -1,4 +1,4 @@
-; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=1000 -unroll-threshold=10  -unroll-percent-dynamic-cost-saved-threshold=70 -unroll-dynamic-cost-savings-discount=90 | FileCheck %s
+; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=1000 -unroll-threshold=10  -unroll-percent-dynamic-cost-saved-threshold=50 -unroll-dynamic-cost-savings-discount=90 | FileCheck %s
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"

@unknown_global = internal unnamed_addr global [9 x i32] [i32 0, i32 -1, i32 0, i32 -1, i32 5, i32 -1, i32 0, i32 -1, i32 0], align 16
--- a/llvm/test/Transforms/LoopUnroll/full-unroll-heuristics-dce.ll
+++ b/llvm/test/Transforms/LoopUnroll/full-unroll-heuristics-dce.ll
@ -1,38 +0,0 @@
-; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=100 -unroll-dynamic-cost-savings-discount=1000 -unroll-threshold=10 -unroll-percent-dynamic-cost-saved-threshold=60 | FileCheck %s
-target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
-
-@known_constant = internal unnamed_addr constant [10 x i32] [i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0], align 16
-
-; If a load becomes a constant after loop unrolling, we sometimes can simplify
-; CFG. This test verifies that we handle such cases.
-; After one operand in an instruction is constant-folded and the
-; instruction is simplified, the other operand might become dead.
-; In this test we have::
-; for i in 1..10:
-;   r += A[i] * B[i]
-; A[i] is 0 almost at every iteration, so there is no need in loading B[i] at
-; all.
-
-
-; CHECK-LABEL: @unroll_dce
-; CHECK-NOT:   br i1 %exitcond, label %for.end, label %for.body
-define i32 @unroll_dce(i32* noalias nocapture readonly %b) {
-entry:
-  br label %for.body
-
-for.body:                                         ; preds = %for.body, %entry
-  %iv.0 = phi i64 [ 0, %entry ], [ %iv.1, %for.body ]
-  %r.0 = phi i32 [ 0, %entry ], [ %r.1, %for.body ]
-  %arrayidx1 = getelementptr inbounds [10 x i32], [10 x i32]* @known_constant, i64 0, i64 %iv.0
-  %x1 = load i32, i32* %arrayidx1, align 4
-  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv.0
-  %x2 = load i32, i32* %arrayidx2, align 4
-  %mul = mul i32 %x1, %x2
-  %r.1 = add i32 %mul, %r.0
-  %iv.1 = add nuw nsw i64 %iv.0, 1
-  %exitcond = icmp eq i64 %iv.1, 10
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:                                          ; preds = %for.body
-  ret i32 %r.1
-}
--- a/llvm/test/Transforms/LoopUnroll/full-unroll-heuristics-geps.ll
+++ b/llvm/test/Transforms/LoopUnroll/full-unroll-heuristics-geps.ll
@ -1,4 +1,4 @@
-; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=100 -unroll-dynamic-cost-savings-discount=1000 -unroll-threshold=10 -unroll-percent-dynamic-cost-saved-threshold=60 | FileCheck %s
+; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=100 -unroll-dynamic-cost-savings-discount=1000 -unroll-threshold=10 -unroll-percent-dynamic-cost-saved-threshold=40 | FileCheck %s
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"

 ; When examining gep-instructions we shouldn't consider them simplified if the
--- a/llvm/unittests/Analysis/UnrollAnalyzer.cpp
+++ b/llvm/unittests/Analysis/UnrollAnalyzer.cpp
@ -134,7 +134,6 @@ TEST(UnrollAnalyzerTest, OuterLoopSimplification) {
      "  br label %outer.loop\n"
      "outer.loop:\n"
      "  %iv.outer = phi i64 [ 0, %entry ], [ %iv.outer.next, %outer.loop.latch ]\n"
-      "  %iv.outer.next = add nuw nsw i64 %iv.outer, 1\n"
      "  br label %inner.loop\n"
      "inner.loop:\n"
      "  %iv.inner = phi i64 [ 0, %outer.loop ], [ %iv.inner.next, %inner.loop ]\n"
@ -142,6 +141,7 @@ TEST(UnrollAnalyzerTest, OuterLoopSimplification) {
      "  %exitcond.inner = icmp eq i64 %iv.inner.next, 1000\n"
      "  br i1 %exitcond.inner, label %outer.loop.latch, label %inner.loop\n"
      "outer.loop.latch:\n"
+      "  %iv.outer.next = add nuw nsw i64 %iv.outer, 1\n"
      "  %exitcond.outer = icmp eq i64 %iv.outer.next, 40\n"
      "  br i1 %exitcond.outer, label %exit, label %outer.loop\n"
      "exit:\n"
@ -163,15 +163,11 @@ TEST(UnrollAnalyzerTest, OuterLoopSimplification) {
  BasicBlock *InnerBody = &*FI++;

  BasicBlock::iterator BBI = Header->begin();
-  BBI++;
-  Instruction *Y1 = &*BBI;
+  Instruction *Y1 = &*BBI++;
  BBI = InnerBody->begin();
-  BBI++;
-  Instruction *Y2 = &*BBI;
+  Instruction *Y2 = &*BBI++;
  // Check that we can simplify IV of the outer loop, but can't simplify the IV
  // of the inner loop if we only know the iteration number of the outer loop.
-  //
-  //  Y1 is %iv.outer.next, Y2 is %iv.inner.next
  auto I1 = SimplifiedValuesVector[0].find(Y1);
  EXPECT_TRUE(I1 != SimplifiedValuesVector[0].end());
  auto I2 = SimplifiedValuesVector[0].find(Y2);