Re-commit : [LICM] Allow sinking when foldable in loop

This recommits r320823 reverted due to the test failure in sink-foldable.ll and an unused variable. Added "REQUIRES: aarch64-registered-target" in the test and removed unused variable. Original commit message: Continue trying to sink an instruction if its users in the loop is foldable. This will allow the instruction to be folded in the loop by decoupling it from the user outside of the loop. Reviewers: hfinkel, majnemer, davidxl, efriedma, danielcdh, bmakam, mcrosier Reviewed By: hfinkel Subscribers: javed.absar, bmakam, mcrosier, llvm-commits Differential Revision: https://reviews.llvm.org/D37076 llvm-svn: 320858
2017-12-15 20:33:24 +00:00 · 2017-12-15 20:33:24 +00:00 · 44c58d35c1
parent 67ca67d1b2
commit 44c58d35c1
3 changed files with 232 additions and 31 deletions
--- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
@ -436,8 +436,9 @@ bool formLCSSARecursively(Loop &L, DominatorTree &DT, LoopInfo *LI,
 /// instructions of the loop and loop safety information as
 /// arguments. Diagnostics is emitted via \p ORE. It returns changed status.
 bool sinkRegion(DomTreeNode *, AliasAnalysis *, LoopInfo *, DominatorTree *,
-                TargetLibraryInfo *, Loop *, AliasSetTracker *,
-                LoopSafetyInfo *, OptimizationRemarkEmitter *ORE);
+                TargetLibraryInfo *, TargetTransformInfo *, Loop *,
+                AliasSetTracker *, LoopSafetyInfo *,
+                OptimizationRemarkEmitter *ORE);

 /// \brief Walk the specified region of the CFG (defined by all blocks
 /// dominated by the specified block, and that are in the current loop) in depth
--- a/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LICM.cpp
@ -90,14 +90,15 @@ static cl::opt<uint32_t> MaxNumUsesTraversed(
             "invariance in loop using invariant start (default = 8)"));

 static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI);
-static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop,
-                            const LoopSafetyInfo *SafetyInfo);
+static bool isNotUsedOrFreeInLoop(const Instruction &I, const Loop *CurLoop,
+                                  const LoopSafetyInfo *SafetyInfo,
+                                  TargetTransformInfo *TTI, bool &FreeInLoop);
 static bool hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop,
                  const LoopSafetyInfo *SafetyInfo,
                  OptimizationRemarkEmitter *ORE);
 static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
                 const Loop *CurLoop, const LoopSafetyInfo *SafetyInfo,
-                 OptimizationRemarkEmitter *ORE);
+                 OptimizationRemarkEmitter *ORE, bool FreeInLoop);
 static bool isSafeToExecuteUnconditionally(Instruction &Inst,
                                           const DominatorTree *DT,
                                           const Loop *CurLoop,
@ -115,7 +116,8 @@ CloneInstructionInExitBlock(Instruction &I, BasicBlock &ExitBlock, PHINode &PN,
 namespace {
 struct LoopInvariantCodeMotion {
  bool runOnLoop(Loop *L, AliasAnalysis *AA, LoopInfo *LI, DominatorTree *DT,
-                 TargetLibraryInfo *TLI, ScalarEvolution *SE, MemorySSA *MSSA,
+                 TargetLibraryInfo *TLI, TargetTransformInfo *TTI,
+                 ScalarEvolution *SE, MemorySSA *MSSA,
                 OptimizationRemarkEmitter *ORE, bool DeleteAST);

  DenseMap<Loop *, AliasSetTracker *> &getLoopToAliasSetMap() {
@ -159,6 +161,8 @@ struct LegacyLICMPass : public LoopPass {
                          &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(),
                          &getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
                          &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(),
+                          &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+                              *L->getHeader()->getParent()),
                          SE ? &SE->getSE() : nullptr, MSSA, &ORE, false);
  }

@ -170,6 +174,7 @@ struct LegacyLICMPass : public LoopPass {
    AU.addRequired<TargetLibraryInfoWrapperPass>();
    if (EnableMSSALoopDependency)
      AU.addRequired<MemorySSAWrapperPass>();
+    AU.addRequired<TargetTransformInfoWrapperPass>();
    getLoopAnalysisUsage(AU);
  }

@ -210,8 +215,8 @@ PreservedAnalyses LICMPass::run(Loop &L, LoopAnalysisManager &AM,
                       "cached at a higher level");

  LoopInvariantCodeMotion LICM;
-  if (!LICM.runOnLoop(&L, &AR.AA, &AR.LI, &AR.DT, &AR.TLI, &AR.SE, AR.MSSA, ORE,
-                      true))
+  if (!LICM.runOnLoop(&L, &AR.AA, &AR.LI, &AR.DT, &AR.TLI, &AR.TTI, &AR.SE,
+                      AR.MSSA, ORE, true))
    return PreservedAnalyses::all();

  auto PA = getLoopPassPreservedAnalyses();
@ -224,6 +229,7 @@ INITIALIZE_PASS_BEGIN(LegacyLICMPass, "licm", "Loop Invariant Code Motion",
                      false, false)
 INITIALIZE_PASS_DEPENDENCY(LoopPass)
 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
 INITIALIZE_PASS_END(LegacyLICMPass, "licm", "Loop Invariant Code Motion", false,
                    false)
@ -236,12 +242,10 @@ Pass *llvm::createLICMPass() { return new LegacyLICMPass(); }
 /// We should delete AST for inner loops in the new pass manager to avoid
 /// memory leak.
 ///
-bool LoopInvariantCodeMotion::runOnLoop(Loop *L, AliasAnalysis *AA,
-                                        LoopInfo *LI, DominatorTree *DT,
-                                        TargetLibraryInfo *TLI,
-                                        ScalarEvolution *SE, MemorySSA *MSSA,
-                                        OptimizationRemarkEmitter *ORE,
-                                        bool DeleteAST) {
+bool LoopInvariantCodeMotion::runOnLoop(
+    Loop *L, AliasAnalysis *AA, LoopInfo *LI, DominatorTree *DT,
+    TargetLibraryInfo *TLI, TargetTransformInfo *TTI, ScalarEvolution *SE,
+    MemorySSA *MSSA, OptimizationRemarkEmitter *ORE, bool DeleteAST) {
  bool Changed = false;

  assert(L->isLCSSAForm(*DT) && "Loop is not in LCSSA form.");
@ -266,7 +270,7 @@ bool LoopInvariantCodeMotion::runOnLoop(Loop *L, AliasAnalysis *AA,
  // instructions, we perform another pass to hoist them out of the loop.
  //
  if (L->hasDedicatedExits())
-    Changed |= sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, L,
+    Changed |= sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, TTI, L,
                          CurAST, &SafetyInfo, ORE);
  if (Preheader)
    Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, L,
@ -359,7 +363,8 @@ bool LoopInvariantCodeMotion::runOnLoop(Loop *L, AliasAnalysis *AA,
 /// definitions, allowing us to sink a loop body in one pass without iteration.
 ///
 bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
-                      DominatorTree *DT, TargetLibraryInfo *TLI, Loop *CurLoop,
+                      DominatorTree *DT, TargetLibraryInfo *TLI,
+                      TargetTransformInfo *TTI, Loop *CurLoop,
                      AliasSetTracker *CurAST, LoopSafetyInfo *SafetyInfo,
                      OptimizationRemarkEmitter *ORE) {

@ -400,12 +405,15 @@ bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
      // outside of the loop.  In this case, it doesn't even matter if the
      // operands of the instruction are loop invariant.
      //
-      if (isNotUsedInLoop(I, CurLoop, SafetyInfo) &&
+      bool FreeInLoop = false;
+      if (isNotUsedOrFreeInLoop(I, CurLoop, SafetyInfo, TTI, FreeInLoop) &&
          canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, SafetyInfo, ORE)) {
-        if (sink(I, LI, DT, CurLoop, SafetyInfo, ORE)) {
-          ++II;
-          CurAST->deleteValue(&I);
-          I.eraseFromParent();
+        if (sink(I, LI, DT, CurLoop, SafetyInfo, ORE, FreeInLoop)) {
+          if (!FreeInLoop) {
+            ++II;
+            CurAST->deleteValue(&I);
+            I.eraseFromParent();
+          }
          Changed = true;
        }
      }
@ -708,13 +716,40 @@ static bool isTriviallyReplacablePHI(const PHINode &PN, const Instruction &I) {
  return true;
 }

+/// Return true if the instruction is free in the loop.
+static bool isFreeInLoop(const Instruction &I, const Loop *CurLoop,
+                         const TargetTransformInfo *TTI) {
+
+  if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I)) {
+    if (TTI->getUserCost(GEP) != TargetTransformInfo::TCC_Free)
+      return false;
+    // For a GEP, we cannot simply use getUserCost because currently it
+    // optimistically assume that a GEP will fold into addressing mode
+    // regardless of its users.
+    const BasicBlock *BB = GEP->getParent();
+    for (const User *U : GEP->users()) {
+      const Instruction *UI = cast<Instruction>(U);
+      if (CurLoop->contains(UI) &&
+          (BB != UI->getParent() ||
+           (!isa<StoreInst>(UI) && !isa<LoadInst>(UI))))
+        return false;
+    }
+    return true;
+  } else
+    return TTI->getUserCost(&I) == TargetTransformInfo::TCC_Free;
+}
+
 /// Return true if the only users of this instruction are outside of
 /// the loop. If this is true, we can sink the instruction to the exit
 /// blocks of the loop.
 ///
-static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop,
-                            const LoopSafetyInfo *SafetyInfo) {
+/// We also return true if the instruction could be folded away in lowering.
+/// (e.g.,  a GEP can be folded into a load as an addressing mode in the loop).
+static bool isNotUsedOrFreeInLoop(const Instruction &I, const Loop *CurLoop,
+                                  const LoopSafetyInfo *SafetyInfo,
+                                  TargetTransformInfo *TTI, bool &FreeInLoop) {
  const auto &BlockColors = SafetyInfo->BlockColors;
+  bool IsFree = isFreeInLoop(I, CurLoop, TTI);
  for (const User *U : I.users()) {
    const Instruction *UI = cast<Instruction>(U);
    if (const PHINode *PN = dyn_cast<PHINode>(UI)) {
@ -731,8 +766,13 @@ static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop,
          return false;
    }

-    if (CurLoop->contains(UI))
+    if (CurLoop->contains(UI)) {
+      if (IsFree) {
+        FreeInLoop = true;
+        continue;
+      }
      return false;
+    }
  }
  return true;
 }
@ -888,7 +928,7 @@ static void splitPredecessorsOfLoopExit(PHINode *PN, DominatorTree *DT,
 ///
 static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
                 const Loop *CurLoop, const LoopSafetyInfo *SafetyInfo,
-                 OptimizationRemarkEmitter *ORE) {
+                 OptimizationRemarkEmitter *ORE, bool FreeInLoop) {
  DEBUG(dbgs() << "LICM sinking instruction: " << I << "\n");
  ORE->emit([&]() {
    return OptimizationRemark(DEBUG_TYPE, "InstSunk", &I)
@ -900,7 +940,6 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
  else if (isa<CallInst>(I))
    ++NumMovedCalls;
  ++NumSunk;
-  Changed = true;

  // Iterate over users to be ready for actual sinking. Replace users via
  // unrechable blocks with undef and make all user PHIs trivially replcable.
@ -910,11 +949,12 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
    Use &U = UI.getUse();
    ++UI;

-    if (VisitedUsers.count(User))
+    if (VisitedUsers.count(User) || CurLoop->contains(User))
      continue;

    if (!DT->isReachableFromEntry(User->getParent())) {
      U = UndefValue::get(I.getType());
+      Changed = true;
      continue;
    }

@ -927,6 +967,7 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
    BasicBlock *BB = PN->getIncomingBlock(U);
    if (!DT->isReachableFromEntry(BB)) {
      U = UndefValue::get(I.getType());
+      Changed = true;
      continue;
    }

@ -935,7 +976,7 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
      continue;

    if (!canSplitPredecessors(PN))
-      return false;
+      return Changed;

    // Split predecessors of the PHI so that we can make users trivially
    // replacable.
@ -947,6 +988,9 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
    UE = I.user_end();
  }

+  if (VisitedUsers.empty())
+    return Changed;
+
 #ifndef NDEBUG
  SmallVector<BasicBlock *, 32> ExitBlocks;
  CurLoop->getUniqueExitBlocks(ExitBlocks);
@ -960,9 +1004,14 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
  // If this instruction is only used outside of the loop, then all users are
  // PHI nodes in exit blocks due to LCSSA form. Just RAUW them with clones of
  // the instruction.
-  while (!I.use_empty()) {
-    Value::user_iterator UI = I.user_begin();
-    PHINode *PN = cast<PHINode>(*UI);
+  SmallSetVector<User*, 8> Users(I.user_begin(), I.user_end());
+  for (auto *UI : Users) {
+    auto *User = cast<Instruction>(UI);
+
+    if (CurLoop->contains(User))
+      continue;
+
+    PHINode *PN = cast<PHINode>(User);
    assert(ExitBlockSet.count(PN->getParent()) &&
           "The LCSSA PHI is not in an exit block!");
    // The PHI must be trivially replacable.
@ -970,6 +1019,7 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
                                                         SafetyInfo, CurLoop);
    PN->replaceAllUsesWith(New);
    PN->eraseFromParent();
+    Changed = true;
  }
  return Changed;
 }
--- a/llvm/test/Transforms/LICM/sink-foldable.ll
+++ b/llvm/test/Transforms/LICM/sink-foldable.ll
@ -0,0 +1,150 @@
+; REQUIRES: aarch64-registered-target
+
+; RUN: opt < %s  -licm -S   | FileCheck %s
+
+target triple = "aarch64--linux-gnueabi"
+
+; CHECK-LABEL:@test1
+; CHECK-LABEL:loopexit1:
+; CHECK: %[[PHI:.+]] = phi i8** [ %arrayidx0, %if.end ]
+; CHECK: getelementptr inbounds i8*, i8** %[[PHI]], i64 1
+define i8** @test1(i32 %j, i8** readonly %P, i8* readnone %Q) {
+entry:
+  %cmp0 = icmp slt i32 0, %j
+  br i1 %cmp0, label %for.body.lr.ph, label %return
+
+for.body.lr.ph:
+  br label %for.body
+
+for.body:
+  %P.addr = phi i8** [ %P, %for.body.lr.ph ], [ %arrayidx0, %if.end  ]
+  %i0 = phi i32 [ 0, %for.body.lr.ph ], [ %i.add, %if.end]
+
+  %i0.ext = sext i32 %i0 to i64
+  %arrayidx0 = getelementptr inbounds i8*, i8** %P.addr, i64 %i0.ext
+  %l0 = load i8*, i8** %arrayidx0, align 8
+  %cmp1 = icmp ugt i8* %l0, %Q
+  br i1 %cmp1, label %loopexit0, label %if.end
+
+if.end:                                           ; preds = %for.body
+  %arrayidx1 = getelementptr inbounds i8*, i8** %arrayidx0, i64 1
+  %l1 = load i8*, i8** %arrayidx1, align 8
+  %cmp4 = icmp ugt i8* %l1, %Q
+  %i.add = add nsw i32 %i0, 2
+  br i1 %cmp4, label %loopexit1, label %for.body
+
+loopexit0:
+  %p1 = phi i8** [%arrayidx0, %for.body]
+  br label %return
+
+loopexit1:
+  %p2 = phi i8** [%arrayidx1, %if.end]
+  br label  %return
+
+return:
+  %retval.0 = phi i8** [ %p1, %loopexit0 ], [%p2, %loopexit1], [ null, %entry ]
+  ret i8** %retval.0
+}
+
+; CHECK-LABEL: @test2
+; CHECK-LABEL: loopexit2:
+; CHECK: %[[PHI:.*]] = phi i8** [ %add.ptr, %if.end ]
+; CHECK: getelementptr inbounds i8*, i8** %[[PHI]]
+define i8** @test2(i32 %j, i8** readonly %P, i8* readnone %Q) {
+
+entry:
+  br label %for.body
+
+for.cond:
+  %i.addr.0 = phi i32 [ %add, %if.end ]
+  %P.addr.0 = phi i8** [ %add.ptr, %if.end ]
+  %cmp = icmp slt i32 %i.addr.0, %j
+  br i1 %cmp, label %for.body, label %loopexit0
+
+for.body:
+  %P.addr = phi i8** [ %P, %entry ], [ %P.addr.0, %for.cond ]
+  %i.addr = phi i32 [ 0, %entry ], [ %i.addr.0, %for.cond ]
+
+  %idx.ext = sext i32 %i.addr to i64
+  %add.ptr = getelementptr inbounds i8*, i8** %P.addr, i64 %idx.ext
+  %l0 = load i8*, i8** %add.ptr, align 8
+
+  %cmp1 = icmp ugt i8* %l0, %Q
+  br i1 %cmp1, label %loopexit1, label %if.end
+
+if.end:
+  %add.i = add i32 %i.addr, 1
+  %idx2.ext = sext i32 %add.i to i64
+  %arrayidx2 = getelementptr inbounds i8*, i8** %add.ptr, i64 %idx2.ext
+  %l1 = load i8*, i8** %arrayidx2, align 8
+  %cmp2 = icmp ugt i8* %l1, %Q
+  %add = add nsw i32 %add.i, 1
+  br i1 %cmp2, label %loopexit2, label %for.cond
+
+loopexit0:
+  %p0 = phi i8** [ null, %for.cond ]
+  br label %return
+
+loopexit1:
+  %p1 = phi i8** [ %add.ptr, %for.body ]
+  br label %return
+
+loopexit2:
+  %p2 = phi i8** [ %arrayidx2, %if.end ]
+  br label %return
+
+return:
+  %retval.0 = phi i8** [ %p1, %loopexit1 ], [ %p2, %loopexit2 ], [ %p0, %loopexit0 ]
+  ret i8** %retval.0
+}
+
+
+; CHECK-LABEL: @test3
+; CHECK-LABEL: loopexit1:
+; CHECK: %[[ADD:.*]]  = phi i64 [ %add, %if.end ]
+; CHECK: %[[ADDR:.*]] = phi i8** [ %P.addr, %if.end ]
+; CHECK: %[[TRUNC:.*]] = trunc i64 %[[ADD]] to i32
+; CHECK: getelementptr inbounds i8*, i8** %[[ADDR]], i32 %[[TRUNC]]
+; CHECK: call void @dummy(i32 %[[TRUNC]])
+define i8** @test3(i64 %j, i8** readonly %P, i8* readnone %Q) {
+entry:
+  %cmp0 = icmp slt i64 0, %j
+  br i1 %cmp0, label %for.body.lr.ph, label %return
+
+for.body.lr.ph:
+  br label %for.body
+
+for.body:
+  %P.addr = phi i8** [ %P, %for.body.lr.ph ], [ %arrayidx0, %if.end  ]
+  %i0 = phi i32 [ 0, %for.body.lr.ph ], [ %i.add, %if.end]
+
+  %i0.ext = sext i32 %i0 to i64
+  %arrayidx0 = getelementptr inbounds i8*, i8** %P.addr, i64 %i0.ext
+  %l0 = load i8*, i8** %arrayidx0, align 8
+  %cmp1 = icmp ugt i8* %l0, %Q
+  br i1 %cmp1, label %loopexit0, label %if.end
+
+if.end:                                           ; preds = %for.body
+  %add = add i64 %i0.ext, 1
+  %trunc = trunc i64 %add to i32
+  %arrayidx1 = getelementptr inbounds i8*, i8** %P.addr, i32 %trunc
+  %l1 = load i8*, i8** %arrayidx1, align 8
+  %cmp4 = icmp ugt i8* %l1, %Q
+  %i.add = add nsw i32 %i0, 2
+  br i1 %cmp4, label %loopexit1, label %for.body
+
+loopexit0:
+  %p1 = phi i8** [%arrayidx0, %for.body]
+  br label %return
+
+loopexit1:
+  %p2 = phi i8** [%arrayidx1, %if.end]
+  call void @dummy(i32 %trunc)
+  br label  %return
+
+return:
+  %retval.0 = phi i8** [ %p1, %loopexit0 ], [%p2, %loopexit1], [ null, %entry ]
+  ret i8** %retval.0
+}
+
+declare void @dummy(i32)