[Unroll] Improve the brute force loop unroll estimate by propagating

through PHI nodes across iterations. This patch teaches the new advanced loop unrolling heuristics to propagate constants into the loop from the preheader and around the backedge after simulating each iteration. This lets us brute force solve simple recurrances that aren't modeled effectively by SCEV. It also makes it more clear why we need to process the loop in-order rather than bottom-up which might otherwise make much more sense (for example, for DCE). This came out of an attempt I'm making to develop a principled way to account for dead code in the unroll estimation. When I implemented a forward-propagating version of that it produced incorrect results due to failing to propagate *cost* between loop iterations through the PHI nodes, and it occured to me we really should at least propagate simplifications across those edges, and it is quite easy thanks to the loop being in canonical and LCSSA form. Differential Revision: http://reviews.llvm.org/D11706 llvm-svn: 243900
2015-08-03 20:32:27 +00:00 · 2015-08-03 20:32:27 +00:00 · 87adb7a2e2
parent 693744127b
commit 87adb7a2e2
2 changed files with 65 additions and 4 deletions
--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@ -137,6 +137,7 @@ namespace {
    ///
    void getAnalysisUsage(AnalysisUsage &AU) const override {
      AU.addRequired<AssumptionCacheTracker>();
+      AU.addRequired<DominatorTreeWrapperPass>();
      AU.addRequired<LoopInfoWrapperPass>();
      AU.addPreserved<LoopInfoWrapperPass>();
      AU.addRequiredID(LoopSimplifyID);
@ -235,6 +236,7 @@ char LoopUnroll::ID = 0;
 INITIALIZE_PASS_BEGIN(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
 INITIALIZE_PASS_DEPENDENCY(LCSSA)
@ -516,8 +518,8 @@ struct EstimatedUnrollCost {
 /// the analysis failed (no benefits expected from the unrolling, or the loop is
 /// too big to analyze), the returned value is None.
 Optional<EstimatedUnrollCost>
-analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, ScalarEvolution &SE,
-                      const TargetTransformInfo &TTI,
+analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
+                      ScalarEvolution &SE, const TargetTransformInfo &TTI,
                      unsigned MaxUnrolledLoopSize) {
  // We want to be able to scale offsets by the trip count and add more offsets
  // to them without checking for overflows, and we already don't want to
@ -532,6 +534,7 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, ScalarEvolution &SE,

  SmallSetVector<BasicBlock *, 16> BBWorklist;
  DenseMap<Value *, Constant *> SimplifiedValues;
+  SmallVector<std::pair<Value *, Constant *>, 4> SimplifiedInputValues;

  // The estimated cost of the unrolled form of the loop. We try to estimate
  // this by simplifying as much as we can while computing the estimate.
@ -543,6 +546,12 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, ScalarEvolution &SE,
  // unrolling.
  unsigned RolledDynamicCost = 0;

+  // Ensure that we don't violate the loop structure invariants relied on by
+  // this analysis.
+  assert(L->isLoopSimplifyForm() && "Must put loop into normal form first.");
+  assert(L->isLCSSAForm(DT) &&
+         "Must have loops in LCSSA form to track live-out values.");
+
  DEBUG(dbgs() << "Starting LoopUnroll profitability analysis...\n");

  // Simulate execution of each iteration of the loop counting instructions,
@ -551,7 +560,34 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, ScalarEvolution &SE,
  // we literally have to go through all loop's iterations.
  for (unsigned Iteration = 0; Iteration < TripCount; ++Iteration) {
    DEBUG(dbgs() << " Analyzing iteration " << Iteration << "\n");
+
+    // Prepare for the iteration by collecting any simplified entry or backedge
+    // inputs.
+    for (Instruction &I : *L->getHeader()) {
+      auto *PHI = dyn_cast<PHINode>(&I);
+      if (!PHI)
+        break;
+
+      // The loop header PHI nodes must have exactly two input: one from the
+      // loop preheader and one from the loop latch.
+      assert(
+          PHI->getNumIncomingValues() == 2 &&
+          "Must have an incoming value only for the preheader and the latch.");
+
+      Value *V = PHI->getIncomingValueForBlock(
+          Iteration == 0 ? L->getLoopPreheader() : L->getLoopLatch());
+      Constant *C = dyn_cast<Constant>(V);
+      if (Iteration != 0 && !C)
+        C = SimplifiedValues.lookup(V);
+      if (C)
+        SimplifiedInputValues.push_back({PHI, C});
+    }
+
+    // Now clear and re-populate the map for the next iteration.
    SimplifiedValues.clear();
+    while (!SimplifiedInputValues.empty())
+      SimplifiedValues.insert(SimplifiedInputValues.pop_back_val());
+
    UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, L, SE);

    BBWorklist.clear();
@ -849,6 +885,7 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {

  Function &F = *L->getHeader()->getParent();

+  auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  ScalarEvolution *SE = &getAnalysis<ScalarEvolution>();
  const TargetTransformInfo &TTI =
@ -930,8 +967,9 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
      // The loop isn't that small, but we still can fully unroll it if that
      // helps to remove a significant number of instructions.
      // To check that, run additional analysis on the loop.
-      if (Optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost(
-              L, TripCount, *SE, TTI, Threshold + DynamicCostSavingsDiscount))
+      if (Optional<EstimatedUnrollCost> Cost =
+              analyzeLoopUnrollCost(L, TripCount, DT, *SE, TTI,
+                                    Threshold + DynamicCostSavingsDiscount))
        if (canUnrollCompletely(L, Threshold, PercentDynamicCostSavedThreshold,
                                DynamicCostSavingsDiscount, Cost->UnrolledCost,
                                Cost->RolledDynamicCost)) {
--- a/llvm/test/Transforms/LoopUnroll/full-unroll-heuristics-phi-prop.ll
+++ b/llvm/test/Transforms/LoopUnroll/full-unroll-heuristics-phi-prop.ll
@ -0,0 +1,23 @@
+; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=100 -unroll-dynamic-cost-savings-discount=1000 -unroll-threshold=10 -unroll-percent-dynamic-cost-saved-threshold=50 | FileCheck %s
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+define i64 @propagate_loop_phis() {
+; CHECK-LABEL: @propagate_loop_phis(
+; CHECK-NOT: br i1
+; CHECK: ret i64 3
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %inc, %loop ]
+  %x0 = phi i64 [ 0, %entry ], [ %x2, %loop ]
+  %x1 = or i64 %x0, 1
+  %x2 = or i64 %x1, 2
+  %inc = add nuw nsw i64 %iv, 1
+  %cond = icmp sge i64 %inc, 10
+  br i1 %cond, label %loop.end, label %loop
+
+loop.end:
+  %x.lcssa = phi i64 [ %x2, %loop ]
+  ret i64 %x.lcssa
+}