The patch adds CTLZ idiom recognition.

Summary: The following loops should be recognized: i = 0; while (n) { n = n >> 1; i++; body(); } use(i); And replaced with builtin_ctlz(n) if body() is empty or for CPUs that have CTLZ instruction converted to countable: for (j = 0; j < builtin_ctlz(n); j++) { n = n >> 1; i++; body(); } use(builtin_ctlz(n)); Reviewers: rengolin, joerg Differential Revision: http://reviews.llvm.org/D32605 From: Evgeny Stupachenko <evstupac@gmail.com> llvm-svn: 303102
2017-05-15 19:08:56 +00:00 · 2017-05-15 19:08:56 +00:00 · 2fecd38ab8
parent 586849918b
commit 2fecd38ab8
3 changed files with 664 additions and 1 deletions
--- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@ -144,6 +144,10 @@ private:
  bool recognizePopcount();
  void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
                               PHINode *CntPhi, Value *Var);
+  bool recognizeAndInsertCTLZ();
+  void transformLoopToCountable(BasicBlock *PreCondBB, Instruction *CntInst,
+                                PHINode *CntPhi, Value *Var, const DebugLoc DL,
+                                bool ZeroCheck, bool IsCntPhiUsedOutsideLoop);

  /// @}
 };
@ -994,7 +998,7 @@ bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
 }

 bool LoopIdiomRecognize::runOnNoncountableLoop() {
-  return recognizePopcount();
+  return recognizePopcount() || recognizeAndInsertCTLZ();
 }

 /// Check if the given conditional branch is based on the comparison between
@ -1159,6 +1163,167 @@ static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
  return true;
 }

+/// Return true if the idiom is detected in the loop.
+///
+/// Additionally:
+/// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ)
+///       or nullptr if there is no such.
+/// 2) \p CntPhi is set to the corresponding phi node
+///       or nullptr if there is no such.
+/// 3) \p Var is set to the value whose CTLZ could be used.
+/// 4) \p DefX is set to the instruction calculating Loop exit condition.
+///
+/// The core idiom we are trying to detect is:
+/// \code
+///    if (x0 == 0)
+///      goto loop-exit // the precondition of the loop
+///    cnt0 = init-val;
+///    do {
+///       x = phi (x0, x.next);   //PhiX
+///       cnt = phi(cnt0, cnt.next);
+///
+///       cnt.next = cnt + 1;
+///        ...
+///       x.next = x >> 1;   // DefX
+///        ...
+///    } while(x.next != 0);
+///
+/// loop-exit:
+/// \endcode
+static bool detectCTLZIdiom(Loop *CurLoop, PHINode *&PhiX,
+                            Instruction *&CntInst, PHINode *&CntPhi,
+                            Instruction *&DefX) {
+  BasicBlock *LoopEntry;
+  Value *VarX = nullptr;
+
+  DefX = nullptr;
+  PhiX = nullptr;
+  CntInst = nullptr;
+  CntPhi = nullptr;
+  LoopEntry = *(CurLoop->block_begin());
+
+  // step 1: Check if the loop-back branch is in desirable form.
+  if (Value *T = matchCondition(
+          dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
+    DefX = dyn_cast<Instruction>(T);
+  else
+    return false;
+
+  // step 2: detect instructions corresponding to "x.next = x >> 1"
+  if (!DefX || DefX->getOpcode() != Instruction::AShr)
+    return false;
+  if (ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1)))
+    if (!Shft || !Shft->isOne())
+      return false;
+  VarX = DefX->getOperand(0);
+
+  // step 3: Check the recurrence of variable X
+  PhiX = dyn_cast<PHINode>(VarX);
+  if (!PhiX || (PhiX->getOperand(0) != DefX && PhiX->getOperand(1) != DefX))
+    return false;
+
+  // step 4: Find the instruction which count the CTLZ: cnt.next = cnt + 1
+  // TODO: We can skip the step. If loop trip count is known (CTLZ),
+  //       then all uses of "cnt.next" could be optimized to the trip count
+  //       plus "cnt0". Currently it is not optimized.
+  //       This step could be used to detect POPCNT instruction:
+  //       cnt.next = cnt + (x.next & 1)
+  for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
+                            IterE = LoopEntry->end();
+       Iter != IterE; Iter++) {
+    Instruction *Inst = &*Iter;
+    if (Inst->getOpcode() != Instruction::Add)
+      continue;
+
+    ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
+    if (!Inc || !Inc->isOne())
+      continue;
+
+    PHINode *Phi = dyn_cast<PHINode>(Inst->getOperand(0));
+    if (!Phi || Phi->getParent() != LoopEntry)
+      continue;
+
+    CntInst = Inst;
+    CntPhi = Phi;
+    break;
+  }
+  if (!CntInst)
+    return false;
+
+  return true;
+}
+
+/// Recognize CTLZ idiom in a non-countable loop and convert the loop
+/// to countable (with CTLZ trip count).
+/// If CTLZ inserted as a new trip count returns true; otherwise, returns false.
+bool LoopIdiomRecognize::recognizeAndInsertCTLZ() {
+  // Give up if the loop has multiple blocks or multiple backedges.
+  if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
+    return false;
+
+  Instruction *CntInst, *DefX;
+  PHINode *CntPhi, *PhiX;
+  if (!detectCTLZIdiom(CurLoop, PhiX, CntInst, CntPhi, DefX))
+    return false;
+
+  bool IsCntPhiUsedOutsideLoop = false;
+  for (User *U : CntPhi->users())
+    if (!CurLoop->contains(dyn_cast<Instruction>(U))) {
+      IsCntPhiUsedOutsideLoop = true;
+      break;
+    }
+  bool IsCntInstUsedOutsideLoop = false;
+  for (User *U : CntInst->users())
+    if (!CurLoop->contains(dyn_cast<Instruction>(U))) {
+      IsCntInstUsedOutsideLoop = true;
+      break;
+    }
+  // If both CntInst and CntPhi are used outside the loop the profitability
+  // is questionable.
+  if (IsCntInstUsedOutsideLoop && IsCntPhiUsedOutsideLoop)
+    return false;
+
+  // For some CPUs result of CTLZ(X) intrinsic is undefined
+  // when X is 0. If we can not guarantee X != 0, we need to check this
+  // when expand.
+  bool ZeroCheck = false;
+  // It is safe to assume Preheader exist as it was checked in
+  // parent function RunOnLoop.
+  BasicBlock *PH = CurLoop->getLoopPreheader();
+  Value *InitX = PhiX->getIncomingValueForBlock(PH);
+  // If we check X != 0 before entering the loop we don't need a zero
+  // check in CTLZ intrinsic.
+  if (BasicBlock *PreCondBB = PH->getSinglePredecessor())
+    if (BranchInst *PreCondBr =
+        dyn_cast<BranchInst>(PreCondBB->getTerminator())) {
+      if (matchCondition(PreCondBr, PH) == InitX)
+        ZeroCheck = true;
+    }
+
+  // Check if CTLZ intrinsic is profitable. Assume it is always profitable
+  // if we delete the loop (the loop has only 6 instructions):
+  //  %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
+  //  %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
+  //  %shr = ashr %n.addr.0, 1
+  //  %tobool = icmp eq %shr, 0
+  //  %inc = add nsw %i.0, 1
+  //  br i1 %tobool
+
+  IRBuilder<> Builder(PH->getTerminator());
+  SmallVector<const Value *, 2> Ops =
+      {InitX, ZeroCheck ? Builder.getTrue() : Builder.getFalse()};
+  ArrayRef<const Value *> Args(Ops);
+  if (CurLoop->getHeader()->size() != 6 &&
+      TTI->getIntrinsicCost(Intrinsic::ctlz, InitX->getType(), Args) >
+          TargetTransformInfo::TCC_Basic)
+    return false;
+
+  const DebugLoc DL = DefX->getDebugLoc();
+  transformLoopToCountable(PH, CntInst, CntPhi, InitX, DL, ZeroCheck,
+                           IsCntPhiUsedOutsideLoop);
+  return true;
+}
+
 /// Recognizes a population count idiom in a non-countable loop.
 ///
 /// If detected, transforms the relevant code to issue the popcount intrinsic
@ -1222,6 +1387,134 @@ static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
  return CI;
 }

+static CallInst *createCTLZIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
+                                     const DebugLoc &DL, bool ZeroCheck) {
+  Value *Ops[] = {Val, ZeroCheck ? IRBuilder.getTrue() : IRBuilder.getFalse()};
+  Type *Tys[] = {Val->getType()};
+
+  Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
+  Value *Func = Intrinsic::getDeclaration(M, Intrinsic::ctlz, Tys);
+  CallInst *CI = IRBuilder.CreateCall(Func, Ops);
+  CI->setDebugLoc(DL);
+
+  return CI;
+}
+
+/// Transform the following loop:
+/// loop:
+///   CntPhi = PHI [Cnt0, CntInst]
+///   PhiX = PHI [InitX, DefX]
+///   CntInst = CntPhi + 1
+///   DefX = PhiX >> 1
+//    LOOP_BODY
+///   Br: loop if (DefX != 0)
+/// Use(CntPhi) or Use(CntInst)
+///
+/// Into:
+/// If CntPhi used outside the loop:
+///   CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1)
+///   Count = CountPrev + 1
+/// else
+///   Count = BitWidth(InitX) - CTLZ(InitX)
+/// loop:
+///   CntPhi = PHI [Cnt0, CntInst]
+///   PhiX = PHI [InitX, DefX]
+///   PhiCount = PHI [Count, Dec]
+///   CntInst = CntPhi + 1
+///   DefX = PhiX >> 1
+///   Dec = PhiCount - 1
+///   LOOP_BODY
+///   Br: loop if (Dec != 0)
+/// Use(CountPrev + Cnt0) // Use(CntPhi)
+/// or
+/// Use(Count + Cnt0) // Use(CntInst)
+///
+/// If LOOP_BODY is empty the loop will be deleted.
+/// If CntInst and DefX are not used in LOOP_BODY they will be removed.
+void LoopIdiomRecognize::transformLoopToCountable(
+    BasicBlock *Preheader, Instruction *CntInst, PHINode *CntPhi, Value *InitX,
+    const DebugLoc DL, bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) {
+  BranchInst *PreheaderBr = dyn_cast<BranchInst>(Preheader->getTerminator());
+
+  // Step 1: Insert the CTLZ instruction at the end of the preheader block
+  //   Count = BitWidth - CTLZ(InitX);
+  // If there are uses of CntPhi create:
+  //   CountPrev = BitWidth - CTLZ(InitX >> 1);
+  IRBuilder<> Builder(PreheaderBr);
+  Builder.SetCurrentDebugLocation(DL);
+  Value *CTLZ, *Count, *CountPrev, *NewCount, *InitXNext;
+
+  if (IsCntPhiUsedOutsideLoop)
+    InitXNext = Builder.CreateAShr(InitX,
+                                   ConstantInt::get(InitX->getType(), 1));
+  else
+    InitXNext = InitX;
+  CTLZ = createCTLZIntrinsic(Builder, InitXNext, DL, ZeroCheck);
+  Count = Builder.CreateSub(
+      ConstantInt::get(CTLZ->getType(),
+                       CTLZ->getType()->getIntegerBitWidth()),
+      CTLZ);
+  if (IsCntPhiUsedOutsideLoop) {
+    CountPrev = Count;
+    Count = Builder.CreateAdd(
+        CountPrev,
+        ConstantInt::get(CountPrev->getType(), 1));
+  }
+  if (IsCntPhiUsedOutsideLoop)
+    NewCount = Builder.CreateZExtOrTrunc(CountPrev,
+        cast<IntegerType>(CntInst->getType()));
+  else
+    NewCount = Builder.CreateZExtOrTrunc(Count,
+        cast<IntegerType>(CntInst->getType()));
+
+  // If the CTLZ counter's initial value is not zero, insert Add Inst.
+  Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader);
+  ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
+  if (!InitConst || !InitConst->isZero())
+    NewCount = Builder.CreateAdd(NewCount, CntInitVal);
+
+  // Step 2: Insert new IV and loop condition:
+  // loop:
+  //   ...
+  //   PhiCount = PHI [Count, Dec]
+  //   ...
+  //   Dec = PhiCount - 1
+  //   ...
+  //   Br: loop if (Dec != 0)
+  BasicBlock *Body = *(CurLoop->block_begin());
+  auto *LbBr = dyn_cast<BranchInst>(Body->getTerminator());
+  ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
+  Type *Ty = Count->getType();
+
+  PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());
+
+  Builder.SetInsertPoint(LbCond);
+  Instruction *TcDec = cast<Instruction>(
+      Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
+                        "tcdec", false, true));
+
+  TcPhi->addIncoming(Count, Preheader);
+  TcPhi->addIncoming(TcDec, Body);
+
+  CmpInst::Predicate Pred =
+      (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ;
+  LbCond->setPredicate(Pred);
+  LbCond->setOperand(0, TcDec);
+  LbCond->setOperand(1, ConstantInt::get(Ty, 0));
+
+  // Step 3: All the references to the original counter outside
+  //  the loop are replaced with the NewCount -- the value returned from
+  //  __builtin_ctlz(x).
+  if (IsCntPhiUsedOutsideLoop)
+    CntPhi->replaceUsesOutsideBlock(NewCount, Body);
+  else
+    CntInst->replaceUsesOutsideBlock(NewCount, Body);
+
+  // step 4: Forget the "non-computable" trip-count SCEV associated with the
+  //   loop. The loop would otherwise not be deleted even if it becomes empty.
+  SE->forgetLoop(CurLoop);
+}
+
 void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
                                                 Instruction *CntInst,
                                                 PHINode *CntPhi, Value *Var) {
--- a/llvm/test/Transforms/LoopIdiom/ARM/ctlz.ll
+++ b/llvm/test/Transforms/LoopIdiom/ARM/ctlz.ll
@ -0,0 +1,185 @@
+; RUN: opt -loop-idiom -mtriple=armv7a < %s -S | FileCheck -check-prefix=LZCNT --check-prefix=ALL %s
+; RUN: opt -loop-idiom -mtriple=armv4t < %s -S | FileCheck -check-prefix=NOLZCNT --check-prefix=ALL %s
+
+; Recognize CTLZ builtin pattern.
+; Here we'll just convert loop to countable,
+; so do not insert builtin if CPU do not support CTLZ
+;
+; int ctlz_and_other(int n, char *a)
+; {
+;   int i = 0, n0 = n;
+;   while(n >>= 1) {
+;     a[i] = (n0 & (1 << i)) ? 1 : 0;
+;     i++;
+;   }
+;   return i;
+; }
+;
+; LZCNT:  entry
+; LZCNT:  %0 = call i32 @llvm.ctlz.i32(i32 %shr8, i1 true)
+; LZCNT-NEXT:  %1 = sub i32 32, %0
+; LZCNT-NEXT:  %2 = zext i32 %1 to i64
+; LZCNT:  %indvars.iv.next.lcssa = phi i64 [ %2, %while.body ]
+; LZCNT:  %4 = trunc i64 %indvars.iv.next.lcssa to i32
+; LZCNT:  %i.0.lcssa = phi i32 [ 0, %entry ], [ %4, %while.end.loopexit ]
+; LZCNT:  ret i32 %i.0.lcssa
+
+; NOLZCNT:  entry
+; NOLZCNT-NOT:  @llvm.ctlz
+
+; Function Attrs: norecurse nounwind uwtable
+define i32 @ctlz_and_other(i32 %n, i8* nocapture %a) {
+entry:
+  %shr8 = ashr i32 %n, 1
+  %tobool9 = icmp eq i32 %shr8, 0
+  br i1 %tobool9, label %while.end, label %while.body.preheader
+
+while.body.preheader:                             ; preds = %entry
+  br label %while.body
+
+while.body:                                       ; preds = %while.body.preheader, %while.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %while.body ], [ 0, %while.body.preheader ]
+  %shr11 = phi i32 [ %shr, %while.body ], [ %shr8, %while.body.preheader ]
+  %0 = trunc i64 %indvars.iv to i32
+  %shl = shl i32 1, %0
+  %and = and i32 %shl, %n
+  %tobool1 = icmp ne i32 %and, 0
+  %conv = zext i1 %tobool1 to i8
+  %arrayidx = getelementptr inbounds i8, i8* %a, i64 %indvars.iv
+  store i8 %conv, i8* %arrayidx, align 1
+  %indvars.iv.next = add nuw i64 %indvars.iv, 1
+  %shr = ashr i32 %shr11, 1
+  %tobool = icmp eq i32 %shr, 0
+  br i1 %tobool, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:                               ; preds = %while.body
+  %1 = trunc i64 %indvars.iv.next to i32
+  br label %while.end
+
+while.end:                                        ; preds = %while.end.loopexit, %entry
+  %i.0.lcssa = phi i32 [ 0, %entry ], [ %1, %while.end.loopexit ]
+  ret i32 %i.0.lcssa
+}
+
+; Recognize CTLZ builtin pattern.
+; Here it will replace the loop -
+; assume builtin is always profitable.
+;
+; int ctlz_zero_check(int n)
+; {
+;   int i = 0;
+;   while(n) {
+;     n >>= 1;
+;     i++;
+;   }
+;   return i;
+; }
+;
+; ALL:  entry
+; ALL:  %0 = call i32 @llvm.ctlz.i32(i32 %n, i1 true)
+; ALL-NEXT:  %1 = sub i32 32, %0
+; ALL:  %inc.lcssa = phi i32 [ %1, %while.body ]
+; ALL:  %i.0.lcssa = phi i32 [ 0, %entry ], [ %inc.lcssa, %while.end.loopexit ]
+; ALL:  ret i32 %i.0.lcssa
+
+; Function Attrs: norecurse nounwind readnone uwtable
+define i32 @ctlz_zero_check(i32 %n) {
+entry:
+  %tobool4 = icmp eq i32 %n, 0
+  br i1 %tobool4, label %while.end, label %while.body.preheader
+
+while.body.preheader:                             ; preds = %entry
+  br label %while.body
+
+while.body:                                       ; preds = %while.body.preheader, %while.body
+  %i.06 = phi i32 [ %inc, %while.body ], [ 0, %while.body.preheader ]
+  %n.addr.05 = phi i32 [ %shr, %while.body ], [ %n, %while.body.preheader ]
+  %shr = ashr i32 %n.addr.05, 1
+  %inc = add nsw i32 %i.06, 1
+  %tobool = icmp eq i32 %shr, 0
+  br i1 %tobool, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:                               ; preds = %while.body
+  br label %while.end
+
+while.end:                                        ; preds = %while.end.loopexit, %entry
+  %i.0.lcssa = phi i32 [ 0, %entry ], [ %inc, %while.end.loopexit ]
+  ret i32 %i.0.lcssa
+}
+
+; Recognize CTLZ builtin pattern.
+; Here it will replace the loop -
+; assume builtin is always profitable.
+;
+; int ctlz(int n)
+; {
+;   int i = 0;
+;   while(n >>= 1) {
+;     i++;
+;   }
+;   return i;
+; }
+;
+; ALL:  entry
+; ALL:  %0 = ashr i32 %n, 1
+; ALL-NEXT:  %1 = call i32 @llvm.ctlz.i32(i32 %0, i1 false)
+; ALL-NEXT:  %2 = sub i32 32, %1
+; ALL-NEXT:  %3 = add i32 %2, 1
+; ALL:  %i.0.lcssa = phi i32 [ %2, %while.cond ]
+; ALL:  ret i32 %i.0.lcssa
+
+; Function Attrs: norecurse nounwind readnone uwtable
+define i32 @ctlz(i32 %n) {
+entry:
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.cond, %entry
+  %n.addr.0 = phi i32 [ %n, %entry ], [ %shr, %while.cond ]
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %while.cond ]
+  %shr = ashr i32 %n.addr.0, 1
+  %tobool = icmp eq i32 %shr, 0
+  %inc = add nsw i32 %i.0, 1
+  br i1 %tobool, label %while.end, label %while.cond
+
+while.end:                                        ; preds = %while.cond
+  ret i32 %i.0
+}
+
+; Recognize CTLZ builtin pattern.
+; Here it will replace the loop -
+; assume builtin is always profitable.
+;
+; int ctlz_add(int n, int i0)
+; {
+;   int i = i0;
+;   while(n >>= 1) {
+;     i++;
+;   }
+;   return i;
+; }
+;
+; ALL:  entry
+; ALL:  %0 = ashr i32 %n, 1
+; ALL-NEXT:  %1 = call i32 @llvm.ctlz.i32(i32 %0, i1 false)
+; ALL-NEXT:  %2 = sub i32 32, %1
+; ALL-NEXT:  %3 = add i32 %2, 1
+; ALL-NEXT:  %4 = add i32 %2, %i0
+; ALL:  %i.0.lcssa = phi i32 [ %4, %while.cond ]
+; ALL:  ret i32 %i.0.lcssa
+;
+; Function Attrs: norecurse nounwind readnone uwtable
+define i32 @ctlz_add(i32 %n, i32 %i0) {
+entry:
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.cond, %entry
+  %n.addr.0 = phi i32 [ %n, %entry ], [ %shr, %while.cond ]
+  %i.0 = phi i32 [ %i0, %entry ], [ %inc, %while.cond ]
+  %shr = ashr i32 %n.addr.0, 1
+  %tobool = icmp eq i32 %shr, 0
+  %inc = add nsw i32 %i.0, 1
+  br i1 %tobool, label %while.end, label %while.cond
+
+while.end:                                        ; preds = %while.cond
+  ret i32 %i.0
+}
--- a/llvm/test/Transforms/LoopIdiom/X86/ctlz.ll
+++ b/llvm/test/Transforms/LoopIdiom/X86/ctlz.ll
@ -0,0 +1,185 @@
+; RUN: opt -loop-idiom -mtriple=x86_64 -mcpu=core-avx2 < %s -S | FileCheck -check-prefix=LZCNT --check-prefix=ALL %s
+; RUN: opt -loop-idiom -mtriple=x86_64 -mcpu=corei7 < %s -S | FileCheck -check-prefix=NOLZCNT --check-prefix=ALL %s
+
+; Recognize CTLZ builtin pattern.
+; Here we'll just convert loop to countable,
+; so do not insert builtin if CPU do not support CTLZ
+;
+; int ctlz_and_other(int n, char *a)
+; {
+;   int i = 0, n0 = n;
+;   while(n >>= 1) {
+;     a[i] = (n0 & (1 << i)) ? 1 : 0;
+;     i++;
+;   }
+;   return i;
+; }
+;
+; LZCNT:  entry
+; LZCNT:  %0 = call i32 @llvm.ctlz.i32(i32 %shr8, i1 true)
+; LZCNT-NEXT:  %1 = sub i32 32, %0
+; LZCNT-NEXT:  %2 = zext i32 %1 to i64
+; LZCNT:  %indvars.iv.next.lcssa = phi i64 [ %2, %while.body ]
+; LZCNT:  %4 = trunc i64 %indvars.iv.next.lcssa to i32
+; LZCNT:  %i.0.lcssa = phi i32 [ 0, %entry ], [ %4, %while.end.loopexit ]
+; LZCNT:  ret i32 %i.0.lcssa
+
+; NOLZCNT:  entry
+; NOLZCNT-NOT:  @llvm.ctlz
+
+; Function Attrs: norecurse nounwind uwtable
+define i32 @ctlz_and_other(i32 %n, i8* nocapture %a) {
+entry:
+  %shr8 = ashr i32 %n, 1
+  %tobool9 = icmp eq i32 %shr8, 0
+  br i1 %tobool9, label %while.end, label %while.body.preheader
+
+while.body.preheader:                             ; preds = %entry
+  br label %while.body
+
+while.body:                                       ; preds = %while.body.preheader, %while.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %while.body ], [ 0, %while.body.preheader ]
+  %shr11 = phi i32 [ %shr, %while.body ], [ %shr8, %while.body.preheader ]
+  %0 = trunc i64 %indvars.iv to i32
+  %shl = shl i32 1, %0
+  %and = and i32 %shl, %n
+  %tobool1 = icmp ne i32 %and, 0
+  %conv = zext i1 %tobool1 to i8
+  %arrayidx = getelementptr inbounds i8, i8* %a, i64 %indvars.iv
+  store i8 %conv, i8* %arrayidx, align 1
+  %indvars.iv.next = add nuw i64 %indvars.iv, 1
+  %shr = ashr i32 %shr11, 1
+  %tobool = icmp eq i32 %shr, 0
+  br i1 %tobool, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:                               ; preds = %while.body
+  %1 = trunc i64 %indvars.iv.next to i32
+  br label %while.end
+
+while.end:                                        ; preds = %while.end.loopexit, %entry
+  %i.0.lcssa = phi i32 [ 0, %entry ], [ %1, %while.end.loopexit ]
+  ret i32 %i.0.lcssa
+}
+
+; Recognize CTLZ builtin pattern.
+; Here it will replace the loop -
+; assume builtin is always profitable.
+;
+; int ctlz_zero_check(int n)
+; {
+;   int i = 0;
+;   while(n) {
+;     n >>= 1;
+;     i++;
+;   }
+;   return i;
+; }
+;
+; ALL:  entry
+; ALL:  %0 = call i32 @llvm.ctlz.i32(i32 %n, i1 true)
+; ALL-NEXT:  %1 = sub i32 32, %0
+; ALL:  %inc.lcssa = phi i32 [ %1, %while.body ]
+; ALL:  %i.0.lcssa = phi i32 [ 0, %entry ], [ %inc.lcssa, %while.end.loopexit ]
+; ALL:  ret i32 %i.0.lcssa
+
+; Function Attrs: norecurse nounwind readnone uwtable
+define i32 @ctlz_zero_check(i32 %n) {
+entry:
+  %tobool4 = icmp eq i32 %n, 0
+  br i1 %tobool4, label %while.end, label %while.body.preheader
+
+while.body.preheader:                             ; preds = %entry
+  br label %while.body
+
+while.body:                                       ; preds = %while.body.preheader, %while.body
+  %i.06 = phi i32 [ %inc, %while.body ], [ 0, %while.body.preheader ]
+  %n.addr.05 = phi i32 [ %shr, %while.body ], [ %n, %while.body.preheader ]
+  %shr = ashr i32 %n.addr.05, 1
+  %inc = add nsw i32 %i.06, 1
+  %tobool = icmp eq i32 %shr, 0
+  br i1 %tobool, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:                               ; preds = %while.body
+  br label %while.end
+
+while.end:                                        ; preds = %while.end.loopexit, %entry
+  %i.0.lcssa = phi i32 [ 0, %entry ], [ %inc, %while.end.loopexit ]
+  ret i32 %i.0.lcssa
+}
+
+; Recognize CTLZ builtin pattern.
+; Here it will replace the loop -
+; assume builtin is always profitable.
+;
+; int ctlz(int n)
+; {
+;   int i = 0;
+;   while(n >>= 1) {
+;     i++;
+;   }
+;   return i;
+; }
+;
+; ALL:  entry
+; ALL:  %0 = ashr i32 %n, 1
+; ALL-NEXT:  %1 = call i32 @llvm.ctlz.i32(i32 %0, i1 false)
+; ALL-NEXT:  %2 = sub i32 32, %1
+; ALL-NEXT:  %3 = add i32 %2, 1
+; ALL:  %i.0.lcssa = phi i32 [ %2, %while.cond ]
+; ALL:  ret i32 %i.0.lcssa
+
+; Function Attrs: norecurse nounwind readnone uwtable
+define i32 @ctlz(i32 %n) {
+entry:
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.cond, %entry
+  %n.addr.0 = phi i32 [ %n, %entry ], [ %shr, %while.cond ]
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %while.cond ]
+  %shr = ashr i32 %n.addr.0, 1
+  %tobool = icmp eq i32 %shr, 0
+  %inc = add nsw i32 %i.0, 1
+  br i1 %tobool, label %while.end, label %while.cond
+
+while.end:                                        ; preds = %while.cond
+  ret i32 %i.0
+}
+
+; Recognize CTLZ builtin pattern.
+; Here it will replace the loop -
+; assume builtin is always profitable.
+;
+; int ctlz_add(int n, int i0)
+; {
+;   int i = i0;
+;   while(n >>= 1) {
+;     i++;
+;   }
+;   return i;
+; }
+;
+; ALL:  entry
+; ALL:  %0 = ashr i32 %n, 1
+; ALL-NEXT:  %1 = call i32 @llvm.ctlz.i32(i32 %0, i1 false)
+; ALL-NEXT:  %2 = sub i32 32, %1
+; ALL-NEXT:  %3 = add i32 %2, 1
+; ALL-NEXT:  %4 = add i32 %2, %i0
+; ALL:  %i.0.lcssa = phi i32 [ %4, %while.cond ]
+; ALL:  ret i32 %i.0.lcssa
+;
+; Function Attrs: norecurse nounwind readnone uwtable
+define i32 @ctlz_add(i32 %n, i32 %i0) {
+entry:
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.cond, %entry
+  %n.addr.0 = phi i32 [ %n, %entry ], [ %shr, %while.cond ]
+  %i.0 = phi i32 [ %i0, %entry ], [ %inc, %while.cond ]
+  %shr = ashr i32 %n.addr.0, 1
+  %tobool = icmp eq i32 %shr, 0
+  %inc = add nsw i32 %i.0, 1
+  br i1 %tobool, label %while.end, label %while.cond
+
+while.end:                                        ; preds = %while.cond
+  ret i32 %i.0
+}