forked from OSchip/llvm-project
The patch adds CTLZ idiom recognition.
Summary: The following loops should be recognized: i = 0; while (n) { n = n >> 1; i++; body(); } use(i); And replaced with builtin_ctlz(n) if body() is empty or for CPUs that have CTLZ instruction converted to countable: for (j = 0; j < builtin_ctlz(n); j++) { n = n >> 1; i++; body(); } use(builtin_ctlz(n)); Reviewers: rengolin, joerg Differential Revision: http://reviews.llvm.org/D32605 From: Evgeny Stupachenko <evstupac@gmail.com> llvm-svn: 303102
This commit is contained in:
parent
586849918b
commit
2fecd38ab8
|
@ -144,6 +144,10 @@ private:
|
|||
bool recognizePopcount();
|
||||
void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
|
||||
PHINode *CntPhi, Value *Var);
|
||||
bool recognizeAndInsertCTLZ();
|
||||
void transformLoopToCountable(BasicBlock *PreCondBB, Instruction *CntInst,
|
||||
PHINode *CntPhi, Value *Var, const DebugLoc DL,
|
||||
bool ZeroCheck, bool IsCntPhiUsedOutsideLoop);
|
||||
|
||||
/// @}
|
||||
};
|
||||
|
@ -994,7 +998,7 @@ bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
|
|||
}
|
||||
|
||||
bool LoopIdiomRecognize::runOnNoncountableLoop() {
|
||||
return recognizePopcount();
|
||||
return recognizePopcount() || recognizeAndInsertCTLZ();
|
||||
}
|
||||
|
||||
/// Check if the given conditional branch is based on the comparison between
|
||||
|
@ -1159,6 +1163,167 @@ static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
|
|||
return true;
|
||||
}
|
||||
|
||||
/// Return true if the idiom is detected in the loop.
|
||||
///
|
||||
/// Additionally:
|
||||
/// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ)
|
||||
/// or nullptr if there is no such.
|
||||
/// 2) \p CntPhi is set to the corresponding phi node
|
||||
/// or nullptr if there is no such.
|
||||
/// 3) \p Var is set to the value whose CTLZ could be used.
|
||||
/// 4) \p DefX is set to the instruction calculating Loop exit condition.
|
||||
///
|
||||
/// The core idiom we are trying to detect is:
|
||||
/// \code
|
||||
/// if (x0 == 0)
|
||||
/// goto loop-exit // the precondition of the loop
|
||||
/// cnt0 = init-val;
|
||||
/// do {
|
||||
/// x = phi (x0, x.next); //PhiX
|
||||
/// cnt = phi(cnt0, cnt.next);
|
||||
///
|
||||
/// cnt.next = cnt + 1;
|
||||
/// ...
|
||||
/// x.next = x >> 1; // DefX
|
||||
/// ...
|
||||
/// } while(x.next != 0);
|
||||
///
|
||||
/// loop-exit:
|
||||
/// \endcode
|
||||
static bool detectCTLZIdiom(Loop *CurLoop, PHINode *&PhiX,
|
||||
Instruction *&CntInst, PHINode *&CntPhi,
|
||||
Instruction *&DefX) {
|
||||
BasicBlock *LoopEntry;
|
||||
Value *VarX = nullptr;
|
||||
|
||||
DefX = nullptr;
|
||||
PhiX = nullptr;
|
||||
CntInst = nullptr;
|
||||
CntPhi = nullptr;
|
||||
LoopEntry = *(CurLoop->block_begin());
|
||||
|
||||
// step 1: Check if the loop-back branch is in desirable form.
|
||||
if (Value *T = matchCondition(
|
||||
dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
|
||||
DefX = dyn_cast<Instruction>(T);
|
||||
else
|
||||
return false;
|
||||
|
||||
// step 2: detect instructions corresponding to "x.next = x >> 1"
|
||||
if (!DefX || DefX->getOpcode() != Instruction::AShr)
|
||||
return false;
|
||||
if (ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1)))
|
||||
if (!Shft || !Shft->isOne())
|
||||
return false;
|
||||
VarX = DefX->getOperand(0);
|
||||
|
||||
// step 3: Check the recurrence of variable X
|
||||
PhiX = dyn_cast<PHINode>(VarX);
|
||||
if (!PhiX || (PhiX->getOperand(0) != DefX && PhiX->getOperand(1) != DefX))
|
||||
return false;
|
||||
|
||||
// step 4: Find the instruction which count the CTLZ: cnt.next = cnt + 1
|
||||
// TODO: We can skip the step. If loop trip count is known (CTLZ),
|
||||
// then all uses of "cnt.next" could be optimized to the trip count
|
||||
// plus "cnt0". Currently it is not optimized.
|
||||
// This step could be used to detect POPCNT instruction:
|
||||
// cnt.next = cnt + (x.next & 1)
|
||||
for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
|
||||
IterE = LoopEntry->end();
|
||||
Iter != IterE; Iter++) {
|
||||
Instruction *Inst = &*Iter;
|
||||
if (Inst->getOpcode() != Instruction::Add)
|
||||
continue;
|
||||
|
||||
ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
|
||||
if (!Inc || !Inc->isOne())
|
||||
continue;
|
||||
|
||||
PHINode *Phi = dyn_cast<PHINode>(Inst->getOperand(0));
|
||||
if (!Phi || Phi->getParent() != LoopEntry)
|
||||
continue;
|
||||
|
||||
CntInst = Inst;
|
||||
CntPhi = Phi;
|
||||
break;
|
||||
}
|
||||
if (!CntInst)
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/// Recognize CTLZ idiom in a non-countable loop and convert the loop
|
||||
/// to countable (with CTLZ trip count).
|
||||
/// If CTLZ inserted as a new trip count returns true; otherwise, returns false.
|
||||
bool LoopIdiomRecognize::recognizeAndInsertCTLZ() {
|
||||
// Give up if the loop has multiple blocks or multiple backedges.
|
||||
if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
|
||||
return false;
|
||||
|
||||
Instruction *CntInst, *DefX;
|
||||
PHINode *CntPhi, *PhiX;
|
||||
if (!detectCTLZIdiom(CurLoop, PhiX, CntInst, CntPhi, DefX))
|
||||
return false;
|
||||
|
||||
bool IsCntPhiUsedOutsideLoop = false;
|
||||
for (User *U : CntPhi->users())
|
||||
if (!CurLoop->contains(dyn_cast<Instruction>(U))) {
|
||||
IsCntPhiUsedOutsideLoop = true;
|
||||
break;
|
||||
}
|
||||
bool IsCntInstUsedOutsideLoop = false;
|
||||
for (User *U : CntInst->users())
|
||||
if (!CurLoop->contains(dyn_cast<Instruction>(U))) {
|
||||
IsCntInstUsedOutsideLoop = true;
|
||||
break;
|
||||
}
|
||||
// If both CntInst and CntPhi are used outside the loop the profitability
|
||||
// is questionable.
|
||||
if (IsCntInstUsedOutsideLoop && IsCntPhiUsedOutsideLoop)
|
||||
return false;
|
||||
|
||||
// For some CPUs result of CTLZ(X) intrinsic is undefined
|
||||
// when X is 0. If we can not guarantee X != 0, we need to check this
|
||||
// when expand.
|
||||
bool ZeroCheck = false;
|
||||
// It is safe to assume Preheader exist as it was checked in
|
||||
// parent function RunOnLoop.
|
||||
BasicBlock *PH = CurLoop->getLoopPreheader();
|
||||
Value *InitX = PhiX->getIncomingValueForBlock(PH);
|
||||
// If we check X != 0 before entering the loop we don't need a zero
|
||||
// check in CTLZ intrinsic.
|
||||
if (BasicBlock *PreCondBB = PH->getSinglePredecessor())
|
||||
if (BranchInst *PreCondBr =
|
||||
dyn_cast<BranchInst>(PreCondBB->getTerminator())) {
|
||||
if (matchCondition(PreCondBr, PH) == InitX)
|
||||
ZeroCheck = true;
|
||||
}
|
||||
|
||||
// Check if CTLZ intrinsic is profitable. Assume it is always profitable
|
||||
// if we delete the loop (the loop has only 6 instructions):
|
||||
// %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
|
||||
// %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
|
||||
// %shr = ashr %n.addr.0, 1
|
||||
// %tobool = icmp eq %shr, 0
|
||||
// %inc = add nsw %i.0, 1
|
||||
// br i1 %tobool
|
||||
|
||||
IRBuilder<> Builder(PH->getTerminator());
|
||||
SmallVector<const Value *, 2> Ops =
|
||||
{InitX, ZeroCheck ? Builder.getTrue() : Builder.getFalse()};
|
||||
ArrayRef<const Value *> Args(Ops);
|
||||
if (CurLoop->getHeader()->size() != 6 &&
|
||||
TTI->getIntrinsicCost(Intrinsic::ctlz, InitX->getType(), Args) >
|
||||
TargetTransformInfo::TCC_Basic)
|
||||
return false;
|
||||
|
||||
const DebugLoc DL = DefX->getDebugLoc();
|
||||
transformLoopToCountable(PH, CntInst, CntPhi, InitX, DL, ZeroCheck,
|
||||
IsCntPhiUsedOutsideLoop);
|
||||
return true;
|
||||
}
|
||||
|
||||
/// Recognizes a population count idiom in a non-countable loop.
|
||||
///
|
||||
/// If detected, transforms the relevant code to issue the popcount intrinsic
|
||||
|
@ -1222,6 +1387,134 @@ static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
|
|||
return CI;
|
||||
}
|
||||
|
||||
static CallInst *createCTLZIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
|
||||
const DebugLoc &DL, bool ZeroCheck) {
|
||||
Value *Ops[] = {Val, ZeroCheck ? IRBuilder.getTrue() : IRBuilder.getFalse()};
|
||||
Type *Tys[] = {Val->getType()};
|
||||
|
||||
Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
|
||||
Value *Func = Intrinsic::getDeclaration(M, Intrinsic::ctlz, Tys);
|
||||
CallInst *CI = IRBuilder.CreateCall(Func, Ops);
|
||||
CI->setDebugLoc(DL);
|
||||
|
||||
return CI;
|
||||
}
|
||||
|
||||
/// Transform the following loop:
|
||||
/// loop:
|
||||
/// CntPhi = PHI [Cnt0, CntInst]
|
||||
/// PhiX = PHI [InitX, DefX]
|
||||
/// CntInst = CntPhi + 1
|
||||
/// DefX = PhiX >> 1
|
||||
// LOOP_BODY
|
||||
/// Br: loop if (DefX != 0)
|
||||
/// Use(CntPhi) or Use(CntInst)
|
||||
///
|
||||
/// Into:
|
||||
/// If CntPhi used outside the loop:
|
||||
/// CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1)
|
||||
/// Count = CountPrev + 1
|
||||
/// else
|
||||
/// Count = BitWidth(InitX) - CTLZ(InitX)
|
||||
/// loop:
|
||||
/// CntPhi = PHI [Cnt0, CntInst]
|
||||
/// PhiX = PHI [InitX, DefX]
|
||||
/// PhiCount = PHI [Count, Dec]
|
||||
/// CntInst = CntPhi + 1
|
||||
/// DefX = PhiX >> 1
|
||||
/// Dec = PhiCount - 1
|
||||
/// LOOP_BODY
|
||||
/// Br: loop if (Dec != 0)
|
||||
/// Use(CountPrev + Cnt0) // Use(CntPhi)
|
||||
/// or
|
||||
/// Use(Count + Cnt0) // Use(CntInst)
|
||||
///
|
||||
/// If LOOP_BODY is empty the loop will be deleted.
|
||||
/// If CntInst and DefX are not used in LOOP_BODY they will be removed.
|
||||
void LoopIdiomRecognize::transformLoopToCountable(
|
||||
BasicBlock *Preheader, Instruction *CntInst, PHINode *CntPhi, Value *InitX,
|
||||
const DebugLoc DL, bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) {
|
||||
BranchInst *PreheaderBr = dyn_cast<BranchInst>(Preheader->getTerminator());
|
||||
|
||||
// Step 1: Insert the CTLZ instruction at the end of the preheader block
|
||||
// Count = BitWidth - CTLZ(InitX);
|
||||
// If there are uses of CntPhi create:
|
||||
// CountPrev = BitWidth - CTLZ(InitX >> 1);
|
||||
IRBuilder<> Builder(PreheaderBr);
|
||||
Builder.SetCurrentDebugLocation(DL);
|
||||
Value *CTLZ, *Count, *CountPrev, *NewCount, *InitXNext;
|
||||
|
||||
if (IsCntPhiUsedOutsideLoop)
|
||||
InitXNext = Builder.CreateAShr(InitX,
|
||||
ConstantInt::get(InitX->getType(), 1));
|
||||
else
|
||||
InitXNext = InitX;
|
||||
CTLZ = createCTLZIntrinsic(Builder, InitXNext, DL, ZeroCheck);
|
||||
Count = Builder.CreateSub(
|
||||
ConstantInt::get(CTLZ->getType(),
|
||||
CTLZ->getType()->getIntegerBitWidth()),
|
||||
CTLZ);
|
||||
if (IsCntPhiUsedOutsideLoop) {
|
||||
CountPrev = Count;
|
||||
Count = Builder.CreateAdd(
|
||||
CountPrev,
|
||||
ConstantInt::get(CountPrev->getType(), 1));
|
||||
}
|
||||
if (IsCntPhiUsedOutsideLoop)
|
||||
NewCount = Builder.CreateZExtOrTrunc(CountPrev,
|
||||
cast<IntegerType>(CntInst->getType()));
|
||||
else
|
||||
NewCount = Builder.CreateZExtOrTrunc(Count,
|
||||
cast<IntegerType>(CntInst->getType()));
|
||||
|
||||
// If the CTLZ counter's initial value is not zero, insert Add Inst.
|
||||
Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader);
|
||||
ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
|
||||
if (!InitConst || !InitConst->isZero())
|
||||
NewCount = Builder.CreateAdd(NewCount, CntInitVal);
|
||||
|
||||
// Step 2: Insert new IV and loop condition:
|
||||
// loop:
|
||||
// ...
|
||||
// PhiCount = PHI [Count, Dec]
|
||||
// ...
|
||||
// Dec = PhiCount - 1
|
||||
// ...
|
||||
// Br: loop if (Dec != 0)
|
||||
BasicBlock *Body = *(CurLoop->block_begin());
|
||||
auto *LbBr = dyn_cast<BranchInst>(Body->getTerminator());
|
||||
ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
|
||||
Type *Ty = Count->getType();
|
||||
|
||||
PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());
|
||||
|
||||
Builder.SetInsertPoint(LbCond);
|
||||
Instruction *TcDec = cast<Instruction>(
|
||||
Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
|
||||
"tcdec", false, true));
|
||||
|
||||
TcPhi->addIncoming(Count, Preheader);
|
||||
TcPhi->addIncoming(TcDec, Body);
|
||||
|
||||
CmpInst::Predicate Pred =
|
||||
(LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ;
|
||||
LbCond->setPredicate(Pred);
|
||||
LbCond->setOperand(0, TcDec);
|
||||
LbCond->setOperand(1, ConstantInt::get(Ty, 0));
|
||||
|
||||
// Step 3: All the references to the original counter outside
|
||||
// the loop are replaced with the NewCount -- the value returned from
|
||||
// __builtin_ctlz(x).
|
||||
if (IsCntPhiUsedOutsideLoop)
|
||||
CntPhi->replaceUsesOutsideBlock(NewCount, Body);
|
||||
else
|
||||
CntInst->replaceUsesOutsideBlock(NewCount, Body);
|
||||
|
||||
// step 4: Forget the "non-computable" trip-count SCEV associated with the
|
||||
// loop. The loop would otherwise not be deleted even if it becomes empty.
|
||||
SE->forgetLoop(CurLoop);
|
||||
}
|
||||
|
||||
void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
|
||||
Instruction *CntInst,
|
||||
PHINode *CntPhi, Value *Var) {
|
||||
|
|
|
@ -0,0 +1,185 @@
|
|||
; RUN: opt -loop-idiom -mtriple=armv7a < %s -S | FileCheck -check-prefix=LZCNT --check-prefix=ALL %s
|
||||
; RUN: opt -loop-idiom -mtriple=armv4t < %s -S | FileCheck -check-prefix=NOLZCNT --check-prefix=ALL %s
|
||||
|
||||
; Recognize CTLZ builtin pattern.
|
||||
; Here we'll just convert loop to countable,
|
||||
; so do not insert builtin if CPU do not support CTLZ
|
||||
;
|
||||
; int ctlz_and_other(int n, char *a)
|
||||
; {
|
||||
; int i = 0, n0 = n;
|
||||
; while(n >>= 1) {
|
||||
; a[i] = (n0 & (1 << i)) ? 1 : 0;
|
||||
; i++;
|
||||
; }
|
||||
; return i;
|
||||
; }
|
||||
;
|
||||
; LZCNT: entry
|
||||
; LZCNT: %0 = call i32 @llvm.ctlz.i32(i32 %shr8, i1 true)
|
||||
; LZCNT-NEXT: %1 = sub i32 32, %0
|
||||
; LZCNT-NEXT: %2 = zext i32 %1 to i64
|
||||
; LZCNT: %indvars.iv.next.lcssa = phi i64 [ %2, %while.body ]
|
||||
; LZCNT: %4 = trunc i64 %indvars.iv.next.lcssa to i32
|
||||
; LZCNT: %i.0.lcssa = phi i32 [ 0, %entry ], [ %4, %while.end.loopexit ]
|
||||
; LZCNT: ret i32 %i.0.lcssa
|
||||
|
||||
; NOLZCNT: entry
|
||||
; NOLZCNT-NOT: @llvm.ctlz
|
||||
|
||||
; Function Attrs: norecurse nounwind uwtable
|
||||
define i32 @ctlz_and_other(i32 %n, i8* nocapture %a) {
|
||||
entry:
|
||||
%shr8 = ashr i32 %n, 1
|
||||
%tobool9 = icmp eq i32 %shr8, 0
|
||||
br i1 %tobool9, label %while.end, label %while.body.preheader
|
||||
|
||||
while.body.preheader: ; preds = %entry
|
||||
br label %while.body
|
||||
|
||||
while.body: ; preds = %while.body.preheader, %while.body
|
||||
%indvars.iv = phi i64 [ %indvars.iv.next, %while.body ], [ 0, %while.body.preheader ]
|
||||
%shr11 = phi i32 [ %shr, %while.body ], [ %shr8, %while.body.preheader ]
|
||||
%0 = trunc i64 %indvars.iv to i32
|
||||
%shl = shl i32 1, %0
|
||||
%and = and i32 %shl, %n
|
||||
%tobool1 = icmp ne i32 %and, 0
|
||||
%conv = zext i1 %tobool1 to i8
|
||||
%arrayidx = getelementptr inbounds i8, i8* %a, i64 %indvars.iv
|
||||
store i8 %conv, i8* %arrayidx, align 1
|
||||
%indvars.iv.next = add nuw i64 %indvars.iv, 1
|
||||
%shr = ashr i32 %shr11, 1
|
||||
%tobool = icmp eq i32 %shr, 0
|
||||
br i1 %tobool, label %while.end.loopexit, label %while.body
|
||||
|
||||
while.end.loopexit: ; preds = %while.body
|
||||
%1 = trunc i64 %indvars.iv.next to i32
|
||||
br label %while.end
|
||||
|
||||
while.end: ; preds = %while.end.loopexit, %entry
|
||||
%i.0.lcssa = phi i32 [ 0, %entry ], [ %1, %while.end.loopexit ]
|
||||
ret i32 %i.0.lcssa
|
||||
}
|
||||
|
||||
; Recognize CTLZ builtin pattern.
|
||||
; Here it will replace the loop -
|
||||
; assume builtin is always profitable.
|
||||
;
|
||||
; int ctlz_zero_check(int n)
|
||||
; {
|
||||
; int i = 0;
|
||||
; while(n) {
|
||||
; n >>= 1;
|
||||
; i++;
|
||||
; }
|
||||
; return i;
|
||||
; }
|
||||
;
|
||||
; ALL: entry
|
||||
; ALL: %0 = call i32 @llvm.ctlz.i32(i32 %n, i1 true)
|
||||
; ALL-NEXT: %1 = sub i32 32, %0
|
||||
; ALL: %inc.lcssa = phi i32 [ %1, %while.body ]
|
||||
; ALL: %i.0.lcssa = phi i32 [ 0, %entry ], [ %inc.lcssa, %while.end.loopexit ]
|
||||
; ALL: ret i32 %i.0.lcssa
|
||||
|
||||
; Function Attrs: norecurse nounwind readnone uwtable
|
||||
define i32 @ctlz_zero_check(i32 %n) {
|
||||
entry:
|
||||
%tobool4 = icmp eq i32 %n, 0
|
||||
br i1 %tobool4, label %while.end, label %while.body.preheader
|
||||
|
||||
while.body.preheader: ; preds = %entry
|
||||
br label %while.body
|
||||
|
||||
while.body: ; preds = %while.body.preheader, %while.body
|
||||
%i.06 = phi i32 [ %inc, %while.body ], [ 0, %while.body.preheader ]
|
||||
%n.addr.05 = phi i32 [ %shr, %while.body ], [ %n, %while.body.preheader ]
|
||||
%shr = ashr i32 %n.addr.05, 1
|
||||
%inc = add nsw i32 %i.06, 1
|
||||
%tobool = icmp eq i32 %shr, 0
|
||||
br i1 %tobool, label %while.end.loopexit, label %while.body
|
||||
|
||||
while.end.loopexit: ; preds = %while.body
|
||||
br label %while.end
|
||||
|
||||
while.end: ; preds = %while.end.loopexit, %entry
|
||||
%i.0.lcssa = phi i32 [ 0, %entry ], [ %inc, %while.end.loopexit ]
|
||||
ret i32 %i.0.lcssa
|
||||
}
|
||||
|
||||
; Recognize CTLZ builtin pattern.
|
||||
; Here it will replace the loop -
|
||||
; assume builtin is always profitable.
|
||||
;
|
||||
; int ctlz(int n)
|
||||
; {
|
||||
; int i = 0;
|
||||
; while(n >>= 1) {
|
||||
; i++;
|
||||
; }
|
||||
; return i;
|
||||
; }
|
||||
;
|
||||
; ALL: entry
|
||||
; ALL: %0 = ashr i32 %n, 1
|
||||
; ALL-NEXT: %1 = call i32 @llvm.ctlz.i32(i32 %0, i1 false)
|
||||
; ALL-NEXT: %2 = sub i32 32, %1
|
||||
; ALL-NEXT: %3 = add i32 %2, 1
|
||||
; ALL: %i.0.lcssa = phi i32 [ %2, %while.cond ]
|
||||
; ALL: ret i32 %i.0.lcssa
|
||||
|
||||
; Function Attrs: norecurse nounwind readnone uwtable
|
||||
define i32 @ctlz(i32 %n) {
|
||||
entry:
|
||||
br label %while.cond
|
||||
|
||||
while.cond: ; preds = %while.cond, %entry
|
||||
%n.addr.0 = phi i32 [ %n, %entry ], [ %shr, %while.cond ]
|
||||
%i.0 = phi i32 [ 0, %entry ], [ %inc, %while.cond ]
|
||||
%shr = ashr i32 %n.addr.0, 1
|
||||
%tobool = icmp eq i32 %shr, 0
|
||||
%inc = add nsw i32 %i.0, 1
|
||||
br i1 %tobool, label %while.end, label %while.cond
|
||||
|
||||
while.end: ; preds = %while.cond
|
||||
ret i32 %i.0
|
||||
}
|
||||
|
||||
; Recognize CTLZ builtin pattern.
|
||||
; Here it will replace the loop -
|
||||
; assume builtin is always profitable.
|
||||
;
|
||||
; int ctlz_add(int n, int i0)
|
||||
; {
|
||||
; int i = i0;
|
||||
; while(n >>= 1) {
|
||||
; i++;
|
||||
; }
|
||||
; return i;
|
||||
; }
|
||||
;
|
||||
; ALL: entry
|
||||
; ALL: %0 = ashr i32 %n, 1
|
||||
; ALL-NEXT: %1 = call i32 @llvm.ctlz.i32(i32 %0, i1 false)
|
||||
; ALL-NEXT: %2 = sub i32 32, %1
|
||||
; ALL-NEXT: %3 = add i32 %2, 1
|
||||
; ALL-NEXT: %4 = add i32 %2, %i0
|
||||
; ALL: %i.0.lcssa = phi i32 [ %4, %while.cond ]
|
||||
; ALL: ret i32 %i.0.lcssa
|
||||
;
|
||||
; Function Attrs: norecurse nounwind readnone uwtable
|
||||
define i32 @ctlz_add(i32 %n, i32 %i0) {
|
||||
entry:
|
||||
br label %while.cond
|
||||
|
||||
while.cond: ; preds = %while.cond, %entry
|
||||
%n.addr.0 = phi i32 [ %n, %entry ], [ %shr, %while.cond ]
|
||||
%i.0 = phi i32 [ %i0, %entry ], [ %inc, %while.cond ]
|
||||
%shr = ashr i32 %n.addr.0, 1
|
||||
%tobool = icmp eq i32 %shr, 0
|
||||
%inc = add nsw i32 %i.0, 1
|
||||
br i1 %tobool, label %while.end, label %while.cond
|
||||
|
||||
while.end: ; preds = %while.cond
|
||||
ret i32 %i.0
|
||||
}
|
|
@ -0,0 +1,185 @@
|
|||
; RUN: opt -loop-idiom -mtriple=x86_64 -mcpu=core-avx2 < %s -S | FileCheck -check-prefix=LZCNT --check-prefix=ALL %s
|
||||
; RUN: opt -loop-idiom -mtriple=x86_64 -mcpu=corei7 < %s -S | FileCheck -check-prefix=NOLZCNT --check-prefix=ALL %s
|
||||
|
||||
; Recognize CTLZ builtin pattern.
|
||||
; Here we'll just convert loop to countable,
|
||||
; so do not insert builtin if CPU do not support CTLZ
|
||||
;
|
||||
; int ctlz_and_other(int n, char *a)
|
||||
; {
|
||||
; int i = 0, n0 = n;
|
||||
; while(n >>= 1) {
|
||||
; a[i] = (n0 & (1 << i)) ? 1 : 0;
|
||||
; i++;
|
||||
; }
|
||||
; return i;
|
||||
; }
|
||||
;
|
||||
; LZCNT: entry
|
||||
; LZCNT: %0 = call i32 @llvm.ctlz.i32(i32 %shr8, i1 true)
|
||||
; LZCNT-NEXT: %1 = sub i32 32, %0
|
||||
; LZCNT-NEXT: %2 = zext i32 %1 to i64
|
||||
; LZCNT: %indvars.iv.next.lcssa = phi i64 [ %2, %while.body ]
|
||||
; LZCNT: %4 = trunc i64 %indvars.iv.next.lcssa to i32
|
||||
; LZCNT: %i.0.lcssa = phi i32 [ 0, %entry ], [ %4, %while.end.loopexit ]
|
||||
; LZCNT: ret i32 %i.0.lcssa
|
||||
|
||||
; NOLZCNT: entry
|
||||
; NOLZCNT-NOT: @llvm.ctlz
|
||||
|
||||
; Function Attrs: norecurse nounwind uwtable
|
||||
define i32 @ctlz_and_other(i32 %n, i8* nocapture %a) {
|
||||
entry:
|
||||
%shr8 = ashr i32 %n, 1
|
||||
%tobool9 = icmp eq i32 %shr8, 0
|
||||
br i1 %tobool9, label %while.end, label %while.body.preheader
|
||||
|
||||
while.body.preheader: ; preds = %entry
|
||||
br label %while.body
|
||||
|
||||
while.body: ; preds = %while.body.preheader, %while.body
|
||||
%indvars.iv = phi i64 [ %indvars.iv.next, %while.body ], [ 0, %while.body.preheader ]
|
||||
%shr11 = phi i32 [ %shr, %while.body ], [ %shr8, %while.body.preheader ]
|
||||
%0 = trunc i64 %indvars.iv to i32
|
||||
%shl = shl i32 1, %0
|
||||
%and = and i32 %shl, %n
|
||||
%tobool1 = icmp ne i32 %and, 0
|
||||
%conv = zext i1 %tobool1 to i8
|
||||
%arrayidx = getelementptr inbounds i8, i8* %a, i64 %indvars.iv
|
||||
store i8 %conv, i8* %arrayidx, align 1
|
||||
%indvars.iv.next = add nuw i64 %indvars.iv, 1
|
||||
%shr = ashr i32 %shr11, 1
|
||||
%tobool = icmp eq i32 %shr, 0
|
||||
br i1 %tobool, label %while.end.loopexit, label %while.body
|
||||
|
||||
while.end.loopexit: ; preds = %while.body
|
||||
%1 = trunc i64 %indvars.iv.next to i32
|
||||
br label %while.end
|
||||
|
||||
while.end: ; preds = %while.end.loopexit, %entry
|
||||
%i.0.lcssa = phi i32 [ 0, %entry ], [ %1, %while.end.loopexit ]
|
||||
ret i32 %i.0.lcssa
|
||||
}
|
||||
|
||||
; Recognize CTLZ builtin pattern.
|
||||
; Here it will replace the loop -
|
||||
; assume builtin is always profitable.
|
||||
;
|
||||
; int ctlz_zero_check(int n)
|
||||
; {
|
||||
; int i = 0;
|
||||
; while(n) {
|
||||
; n >>= 1;
|
||||
; i++;
|
||||
; }
|
||||
; return i;
|
||||
; }
|
||||
;
|
||||
; ALL: entry
|
||||
; ALL: %0 = call i32 @llvm.ctlz.i32(i32 %n, i1 true)
|
||||
; ALL-NEXT: %1 = sub i32 32, %0
|
||||
; ALL: %inc.lcssa = phi i32 [ %1, %while.body ]
|
||||
; ALL: %i.0.lcssa = phi i32 [ 0, %entry ], [ %inc.lcssa, %while.end.loopexit ]
|
||||
; ALL: ret i32 %i.0.lcssa
|
||||
|
||||
; Function Attrs: norecurse nounwind readnone uwtable
|
||||
define i32 @ctlz_zero_check(i32 %n) {
|
||||
entry:
|
||||
%tobool4 = icmp eq i32 %n, 0
|
||||
br i1 %tobool4, label %while.end, label %while.body.preheader
|
||||
|
||||
while.body.preheader: ; preds = %entry
|
||||
br label %while.body
|
||||
|
||||
while.body: ; preds = %while.body.preheader, %while.body
|
||||
%i.06 = phi i32 [ %inc, %while.body ], [ 0, %while.body.preheader ]
|
||||
%n.addr.05 = phi i32 [ %shr, %while.body ], [ %n, %while.body.preheader ]
|
||||
%shr = ashr i32 %n.addr.05, 1
|
||||
%inc = add nsw i32 %i.06, 1
|
||||
%tobool = icmp eq i32 %shr, 0
|
||||
br i1 %tobool, label %while.end.loopexit, label %while.body
|
||||
|
||||
while.end.loopexit: ; preds = %while.body
|
||||
br label %while.end
|
||||
|
||||
while.end: ; preds = %while.end.loopexit, %entry
|
||||
%i.0.lcssa = phi i32 [ 0, %entry ], [ %inc, %while.end.loopexit ]
|
||||
ret i32 %i.0.lcssa
|
||||
}
|
||||
|
||||
; Recognize CTLZ builtin pattern.
|
||||
; Here it will replace the loop -
|
||||
; assume builtin is always profitable.
|
||||
;
|
||||
; int ctlz(int n)
|
||||
; {
|
||||
; int i = 0;
|
||||
; while(n >>= 1) {
|
||||
; i++;
|
||||
; }
|
||||
; return i;
|
||||
; }
|
||||
;
|
||||
; ALL: entry
|
||||
; ALL: %0 = ashr i32 %n, 1
|
||||
; ALL-NEXT: %1 = call i32 @llvm.ctlz.i32(i32 %0, i1 false)
|
||||
; ALL-NEXT: %2 = sub i32 32, %1
|
||||
; ALL-NEXT: %3 = add i32 %2, 1
|
||||
; ALL: %i.0.lcssa = phi i32 [ %2, %while.cond ]
|
||||
; ALL: ret i32 %i.0.lcssa
|
||||
|
||||
; Function Attrs: norecurse nounwind readnone uwtable
|
||||
define i32 @ctlz(i32 %n) {
|
||||
entry:
|
||||
br label %while.cond
|
||||
|
||||
while.cond: ; preds = %while.cond, %entry
|
||||
%n.addr.0 = phi i32 [ %n, %entry ], [ %shr, %while.cond ]
|
||||
%i.0 = phi i32 [ 0, %entry ], [ %inc, %while.cond ]
|
||||
%shr = ashr i32 %n.addr.0, 1
|
||||
%tobool = icmp eq i32 %shr, 0
|
||||
%inc = add nsw i32 %i.0, 1
|
||||
br i1 %tobool, label %while.end, label %while.cond
|
||||
|
||||
while.end: ; preds = %while.cond
|
||||
ret i32 %i.0
|
||||
}
|
||||
|
||||
; Recognize CTLZ builtin pattern.
|
||||
; Here it will replace the loop -
|
||||
; assume builtin is always profitable.
|
||||
;
|
||||
; int ctlz_add(int n, int i0)
|
||||
; {
|
||||
; int i = i0;
|
||||
; while(n >>= 1) {
|
||||
; i++;
|
||||
; }
|
||||
; return i;
|
||||
; }
|
||||
;
|
||||
; ALL: entry
|
||||
; ALL: %0 = ashr i32 %n, 1
|
||||
; ALL-NEXT: %1 = call i32 @llvm.ctlz.i32(i32 %0, i1 false)
|
||||
; ALL-NEXT: %2 = sub i32 32, %1
|
||||
; ALL-NEXT: %3 = add i32 %2, 1
|
||||
; ALL-NEXT: %4 = add i32 %2, %i0
|
||||
; ALL: %i.0.lcssa = phi i32 [ %4, %while.cond ]
|
||||
; ALL: ret i32 %i.0.lcssa
|
||||
;
|
||||
; Function Attrs: norecurse nounwind readnone uwtable
|
||||
define i32 @ctlz_add(i32 %n, i32 %i0) {
|
||||
entry:
|
||||
br label %while.cond
|
||||
|
||||
while.cond: ; preds = %while.cond, %entry
|
||||
%n.addr.0 = phi i32 [ %n, %entry ], [ %shr, %while.cond ]
|
||||
%i.0 = phi i32 [ %i0, %entry ], [ %inc, %while.cond ]
|
||||
%shr = ashr i32 %n.addr.0, 1
|
||||
%tobool = icmp eq i32 %shr, 0
|
||||
%inc = add nsw i32 %i.0, 1
|
||||
br i1 %tobool, label %while.end, label %while.cond
|
||||
|
||||
while.end: ; preds = %while.cond
|
||||
ret i32 %i.0
|
||||
}
|
Loading…
Reference in New Issue