[LICM] Ignore exits provably not taken on first iteration when computing must execute

It is common to have conditional exits within a loop which are known not to be taken on some iterations, but not necessarily all. This patches extends our reasoning around guaranteed to execute (used when establishing whether it's safe to dereference a location from the preheader) to handle the case where an exit is known not to be taken on the first iteration and the instruction of interest *is* known to be taken on the first iteration. This case comes up in two major ways: * If we have a range check which we've been unable to eliminate, we frequently know that it doesn't fail on the first iteration. * Pass ordering. We may have a check which will be eliminated through some sequence of other passes, but depending on the exact pass sequence we might never actually do so or we might miss other optimizations from passes run before the check is finally eliminated. The initial version (here) is implemented via InstSimplify. At the moment, it catches a few cases, but misses a lot too. I added test cases for missing cases in InstSimplify which I'll follow up on separately. Longer term, we should probably wire SCEV through to here to get much smarter loop aware simplification of the first iteration predicate. Differential Revision: https://reviews.llvm.org/D44287 llvm-svn: 327664
2018-03-15 21:04:28 +00:00 · 2018-03-15 21:04:28 +00:00 · a21d5f1e18
parent d4254ac1b9
commit a21d5f1e18
3 changed files with 347 additions and 1 deletions
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@ -16,6 +16,7 @@
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/BasicAliasAnalysis.h"
 #include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/ScalarEvolution.h"
@ -1515,6 +1516,46 @@ void llvm::computeLoopSafetyInfo(LoopSafetyInfo *SafetyInfo, Loop *CurLoop) {
        SafetyInfo->BlockColors = colorEHFunclets(*Fn);
 }

+/// Return true if we can prove that the given ExitBlock is not reached on the
+/// first iteration of the given loop.  That is, the backedge of the loop must
+/// be executed before the ExitBlock is executed in any dynamic execution trace.
+static bool CanProveNotTakenFirstIteration(BasicBlock *ExitBlock,
+                                           const DominatorTree *DT,
+                                           const Loop *CurLoop) {
+  auto *CondExitBlock = ExitBlock->getSinglePredecessor();
+  if (!CondExitBlock)
+    // expect unique exits
+    return false;
+  assert(CurLoop->contains(CondExitBlock) && "meaning of exit block");
+  auto *BI = dyn_cast<BranchInst>(CondExitBlock->getTerminator());
+  if (!BI || !BI->isConditional())
+    return false;
+  // todo: handle fcmp someday
+  // todo: this would be a lot more powerful if we used scev, but all the
+  // plumbing is currently missing to pass a pointer in from the pass
+  auto *ICI = dyn_cast<ICmpInst>(BI->getCondition());
+  if (!ICI)
+    return false;
+  // Check for cmp (phi [x, preheader] ...), y where (pred x, y is known
+  auto *LHS = dyn_cast<PHINode>(ICI->getOperand(0));
+  auto *RHS = ICI->getOperand(1);
+  if (!LHS || LHS->getParent() != CurLoop->getHeader())
+    return false;
+  auto DL = ExitBlock->getModule()->getDataLayout();
+  auto *IVStart = LHS->getIncomingValueForBlock(CurLoop->getLoopPreheader());
+  auto *SimpleValOrNull = SimplifyICmpInst(ICI->getPredicate(),
+                                           IVStart, RHS,
+                                           {DL, /*TLI*/ nullptr,
+                                               DT, /*AC*/ nullptr, BI});
+  auto *SimpleCst = dyn_cast_or_null<Constant>(SimpleValOrNull);
+  if (!SimpleCst)
+    return false;
+  if (ExitBlock == BI->getSuccessor(0))
+    return SimpleCst->isZeroValue();
+  assert(ExitBlock == BI->getSuccessor(1) && "implied by above");
+  return SimpleCst->isAllOnesValue();
+}
+
 /// Returns true if the instruction in a loop is guaranteed to execute at least
 /// once.
 bool llvm::isGuaranteedToExecute(const Instruction &Inst,
@ -1537,6 +1578,22 @@ bool llvm::isGuaranteedToExecute(const Instruction &Inst,
  if (SafetyInfo->MayThrow)
    return false;

+  // Note: There are two styles of reasoning intermixed below for
+  // implementation efficiency reasons.  They are:
+  // 1) If we can prove that the instruction dominates all exit blocks, then we
+  // know the instruction must have executed on *some* iteration before we
+  // exit.  We do not prove *which* iteration the instruction must execute on.
+  // 2) If we can prove that the instruction dominates the latch and all exits
+  // which might be taken on the first iteration, we know the instruction must
+  // execute on the first iteration.  This second style allows a conditional
+  // exit before the instruction of interest which is provably not taken on the
+  // first iteration.  This is a quite common case for range check like
+  // patterns.  TODO: support loops with multiple latches.
+
+  const bool InstDominatesLatch =
+    CurLoop->getLoopLatch() != nullptr &&
+    DT->dominates(Inst.getParent(), CurLoop->getLoopLatch());
+
  // Get the exit blocks for the current loop.
  SmallVector<BasicBlock *, 8> ExitBlocks;
  CurLoop->getExitBlocks(ExitBlocks);
@ -1544,7 +1601,9 @@ bool llvm::isGuaranteedToExecute(const Instruction &Inst,
  // Verify that the block dominates each of the exit blocks of the loop.
  for (BasicBlock *ExitBlock : ExitBlocks)
    if (!DT->dominates(Inst.getParent(), ExitBlock))
-      return false;
+      if (!InstDominatesLatch ||
+          !CanProveNotTakenFirstIteration(ExitBlock, DT, CurLoop))
+        return false;

  // As a degenerate case, if the loop is statically infinite then we haven't
  // proven anything since there are no exit blocks.
--- a/llvm/test/Transforms/LICM/hoist-mustexec.ll
+++ b/llvm/test/Transforms/LICM/hoist-mustexec.ll
@ -0,0 +1,249 @@
+; RUN: opt -S -basicaa -licm < %s | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa -passes='require<opt-remark-emit>,loop(licm)' -S %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare void @f() nounwind
+
+; constant fold on first ieration
+define i32 @test1(i32* noalias nocapture readonly %a) nounwind uwtable {
+; CHECK-LABEL: @test1(
+entry:
+; CHECK: %i1 = load i32, i32* %a, align 4
+; CHECK-NEXT: br label %for.body
+  br label %for.body
+
+for.body:
+  %iv = phi i32 [ 0, %entry ], [ %inc, %continue ]
+  %acc = phi i32 [ 0, %entry ], [ %add, %continue ]
+  %r.chk = icmp ult i32 %iv, 2000
+  br i1 %r.chk, label %continue, label %fail
+continue:
+  %i1 = load i32, i32* %a, align 4
+  %add = add nsw i32 %i1, %acc
+  %inc = add nuw nsw i32 %iv, 1
+  %exitcond = icmp eq i32 %inc, 1000
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret i32 %add
+
+fail:
+  call void @f()
+  ret i32 -1
+}
+
+; Count down from a.length w/entry guard
+; TODO: currently unable to prove the following:
+; ule i32 (add nsw i32 %len, -1), %len where len is [0, 512]
+define i32 @test2(i32* noalias nocapture readonly %a) nounwind uwtable {
+; CHECK-LABEL: @test2(
+entry:
+  %len = load i32, i32* %a, align 4, !range !{i32 0, i32 512}
+  %is.non.pos = icmp eq i32 %len, 0
+  br i1 %is.non.pos, label %fail, label %preheader
+preheader:
+  %lenminusone = add nsw i32 %len, -1
+  br label %for.body
+for.body:
+  %iv = phi i32 [ %lenminusone, %preheader ], [ %dec, %continue ]
+  %acc = phi i32 [ 0, %preheader ], [ %add, %continue ]
+  %r.chk = icmp ule i32 %iv, %len
+  br i1 %r.chk, label %continue, label %fail
+continue:
+; CHECK-LABEL: continue
+; CHECK: %i1 = load i32, i32* %a, align 4
+  %i1 = load i32, i32* %a, align 4
+  %add = add nsw i32 %i1, %acc
+  %dec = add nsw i32 %iv, -1
+  %exitcond = icmp eq i32 %dec, 0
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret i32 %add
+
+fail:
+  call void @f()
+  ret i32 -1
+}
+
+; trivially true for zero
+define i32 @test3(i32* noalias nocapture readonly %a) nounwind uwtable {
+; CHECK-LABEL: @test3(
+entry:
+  %len = load i32, i32* %a, align 4, !range !{i32 0, i32 512}
+  %is.zero = icmp eq i32 %len, 0
+  br i1 %is.zero, label %fail, label %preheader
+preheader:
+; CHECK: %i1 = load i32, i32* %a, align 4
+; CHECK-NEXT: br label %for.body
+  br label %for.body
+for.body:
+  %iv = phi i32 [ 0, %preheader ], [ %inc, %continue ]
+  %acc = phi i32 [ 0, %preheader ], [ %add, %continue ]
+  %r.chk = icmp ule i32 %iv, %len
+  br i1 %r.chk, label %continue, label %fail
+continue:
+  %i1 = load i32, i32* %a, align 4
+  %add = add nsw i32 %i1, %acc
+  %inc = add nuw nsw i32 %iv, 1
+  %exitcond = icmp eq i32 %inc, 1000
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret i32 %add
+
+fail:
+  call void @f()
+  ret i32 -1
+}
+
+; requires fact length is non-zero
+; TODO: IsKnownNonNullFromDominatingConditions is currently only be done for
+; pointers; should handle integers too
+define i32 @test4(i32* noalias nocapture readonly %a) nounwind uwtable {
+; CHECK-LABEL: @test4(
+entry:
+  %len = load i32, i32* %a, align 4, !range !{i32 0, i32 512}
+  %is.zero = icmp eq i32 %len, 0
+  br i1 %is.zero, label %fail, label %preheader
+preheader:
+  br label %for.body
+for.body:
+  %iv = phi i32 [ 0, %preheader ], [ %inc, %continue ]
+  %acc = phi i32 [ 0, %preheader ], [ %add, %continue ]
+  %r.chk = icmp ult i32 %iv, %len
+  br i1 %r.chk, label %continue, label %fail
+continue:
+; CHECK-LABEL: continue
+; CHECK: %i1 = load i32, i32* %a, align 4
+  %i1 = load i32, i32* %a, align 4
+  %add = add nsw i32 %i1, %acc
+  %inc = add nuw nsw i32 %iv, 1
+  %exitcond = icmp eq i32 %inc, 1000
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret i32 %add
+
+fail:
+  call void @f()
+  ret i32 -1
+}
+
+; variation on test1 with branch swapped
+define i32 @test-brswap(i32* noalias nocapture readonly %a) nounwind uwtable {
+; CHECK-LABEL: @test-brswap(
+entry:
+; CHECK: %i1 = load i32, i32* %a, align 4
+; CHECK-NEXT: br label %for.body
+  br label %for.body
+
+for.body:
+  %iv = phi i32 [ 0, %entry ], [ %inc, %continue ]
+  %acc = phi i32 [ 0, %entry ], [ %add, %continue ]
+  %r.chk = icmp ugt i32 %iv, 2000
+  br i1 %r.chk, label %fail, label %continue
+continue:
+  %i1 = load i32, i32* %a, align 4
+  %add = add nsw i32 %i1, %acc
+  %inc = add nuw nsw i32 %iv, 1
+  %exitcond = icmp eq i32 %inc, 1000
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret i32 %add
+
+fail:
+  call void @f()
+  ret i32 -1
+}
+
+define i32 @test-nonphi(i32* noalias nocapture readonly %a) nounwind uwtable {
+; CHECK-LABEL: @test-nonphi(
+entry:
+  br label %for.body
+
+for.body:
+; CHECK-LABEL: continue
+; CHECK: %i1 = load i32, i32* %a, align 4
+  %iv = phi i32 [ 0, %entry ], [ %inc, %continue ]
+  %acc = phi i32 [ 0, %entry ], [ %add, %continue ]
+  %xor = xor i32 %iv, 72
+  %r.chk = icmp ugt i32 %xor, 2000
+  br i1 %r.chk, label %fail, label %continue
+continue:
+  %i1 = load i32, i32* %a, align 4
+  %add = add nsw i32 %i1, %acc
+  %inc = add nuw nsw i32 %iv, 1
+  %exitcond = icmp eq i32 %inc, 1000
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret i32 %add
+
+fail:
+  call void @f()
+  ret i32 -1
+}
+
+define i32 @test-wrongphi(i32* noalias nocapture readonly %a) nounwind uwtable {
+; CHECK-LABEL: @test-wrongphi(
+entry:
+  br label %for.body
+  
+for.body:
+  %iv = phi i32 [ 0, %entry ], [ %inc, %continue ]
+  %acc = phi i32 [ 0, %entry ], [ %add, %continue ]
+  br label %dummy_block
+dummy_block:
+  %wrongphi = phi i32 [12, %for.body]
+  %r.chk = icmp ugt i32 %wrongphi, 2000
+  br i1 %r.chk, label %fail, label %continue
+continue:
+; CHECK-LABEL: continue
+; CHECK: %i1 = load i32, i32* %a, align 4
+  %i1 = load i32, i32* %a, align 4
+  %add = add nsw i32 %i1, %acc
+  %inc = add nuw nsw i32 %iv, 1
+  %exitcond = icmp eq i32 %inc, 1000
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret i32 %add
+
+fail:
+  call void @f()
+  ret i32 -1
+}
+
+; This works because loop-simplify is run implicitly, but test for it anyways
+define i32 @test-multiple-latch(i32* noalias nocapture readonly %a) nounwind uwtable {
+; CHECK-LABEL: @test-multiple-latch(
+entry:
+; CHECK: %i1 = load i32, i32* %a, align 4
+; CHECK-NEXT: br label %for.body
+  br label %for.body
+
+for.body:
+  %iv = phi i32 [ 0, %entry ], [ %inc, %continue1 ], [ %inc, %continue2 ]
+  %acc = phi i32 [ 0, %entry ], [ %add, %continue1 ], [ %add, %continue2 ]
+  %r.chk = icmp ult i32 %iv, 2000
+  br i1 %r.chk, label %continue1, label %fail
+continue1:
+  %i1 = load i32, i32* %a, align 4
+  %add = add nsw i32 %i1, %acc
+  %inc = add nuw nsw i32 %iv, 1
+  %cmp = icmp eq i32 %add, 0
+  br i1 %cmp, label %continue2, label %for.body
+continue2:
+  %exitcond = icmp eq i32 %inc, 1000
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret i32 %add
+
+fail:
+  call void @f()
+  ret i32 -1
+}
--- a/llvm/test/Transforms/LICM/scalar-promote.ll
+++ b/llvm/test/Transforms/LICM/scalar-promote.ll
@ -405,6 +405,44 @@ Out:

 }

+; Early exit is known not to be taken on first iteration and thus doesn't
+; effect whether load is known to execute.
+define void @test11(i32 %i) {
+Entry:
+  br label %Loop
+; CHECK-LABEL: @test11(
+; CHECK: Entry:
+; CHECK-NEXT:   load i32, i32* @X
+; CHECK-NEXT:   br label %Loop
+
+
+Loop:   ; preds = %Loop, %0
+  %j = phi i32 [ 0, %Entry ], [ %Next, %body ]    ; <i32> [#uses=1]
+  %early.test = icmp ult i32 %j, 32
+  br i1 %early.test, label %body, label %Early
+body:
+  %x = load i32, i32* @X   ; <i32> [#uses=1]
+  %x2 = add i32 %x, 1   ; <i32> [#uses=1]
+  store i32 %x2, i32* @X
+  %Next = add i32 %j, 1   ; <i32> [#uses=2]
+  %cond = icmp eq i32 %Next, 0    ; <i1> [#uses=1]
+  br i1 %cond, label %Out, label %Loop
+
+Early:
+; CHECK: Early:
+; CHECK-NEXT:   %[[LCSSAPHI:.*]] = phi i32 [ %x2
+; CHECK-NEXT:   store i32 %[[LCSSAPHI]], i32* @X
+; CHECK-NEXT:   ret void
+  ret void
+Out:
+  ret void
+; CHECK: Out:
+; CHECK-NEXT:   %[[LCSSAPHI:.*]] = phi i32 [ %x2
+; CHECK-NEXT:   store i32 %[[LCSSAPHI]], i32* @X
+; CHECK-NEXT:   ret void
+
+}
+
 !0 = !{!4, !4, i64 0}
 !1 = !{!"omnipotent char", !2}
 !2 = !{!"Simple C/C++ TBAA"}