forked from OSchip/llvm-project
[LoopIdiomRecognize] Add CTTZ support
Summary: Existing LIR recognizes CTLZ where shifting input variable right until it is zero. (Shift-Until-Zero idiom) This commit: 1. Augments Shift-Until-Zero idiom to recognize CTTZ where input variable is shifted left. 2. Prepare for BitScan idiom recognition. Patch by Yuanfang Chen (tabloid.adroit) Reviewers: craig.topper, evstupac Reviewed By: craig.topper Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D55876 llvm-svn: 350074
This commit is contained in:
parent
c168c6f86f
commit
c9a6000755
|
@ -26,7 +26,7 @@
|
|||
// Future floating point idioms to recognize in -ffast-math mode:
|
||||
// fpowi
|
||||
// Future integer operation idioms to recognize:
|
||||
// ctpop, ctlz, cttz
|
||||
// ctpop
|
||||
//
|
||||
// Beware that isel's default lowering for ctpop is highly inefficient for
|
||||
// i64 and larger types when i64 is legal and the value has few bits set. It
|
||||
|
@ -187,9 +187,10 @@ private:
|
|||
bool recognizePopcount();
|
||||
void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
|
||||
PHINode *CntPhi, Value *Var);
|
||||
bool recognizeAndInsertCTLZ();
|
||||
void transformLoopToCountable(BasicBlock *PreCondBB, Instruction *CntInst,
|
||||
PHINode *CntPhi, Value *Var, Instruction *DefX,
|
||||
bool recognizeAndInsertFFS(); /// Find First Set: ctlz or cttz
|
||||
void transformLoopToCountable(Intrinsic::ID IntrinID, BasicBlock *PreCondBB,
|
||||
Instruction *CntInst, PHINode *CntPhi,
|
||||
Value *Var, Instruction *DefX,
|
||||
const DebugLoc &DL, bool ZeroCheck,
|
||||
bool IsCntPhiUsedOutsideLoop);
|
||||
|
||||
|
@ -1108,15 +1109,17 @@ bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
|
|||
}
|
||||
|
||||
bool LoopIdiomRecognize::runOnNoncountableLoop() {
|
||||
return recognizePopcount() || recognizeAndInsertCTLZ();
|
||||
return recognizePopcount() || recognizeAndInsertFFS();
|
||||
}
|
||||
|
||||
/// Check if the given conditional branch is based on the comparison between
|
||||
/// a variable and zero, and if the variable is non-zero, the control yields to
|
||||
/// the loop entry. If the branch matches the behavior, the variable involved
|
||||
/// in the comparison is returned. This function will be called to see if the
|
||||
/// precondition and postcondition of the loop are in desirable form.
|
||||
static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry) {
|
||||
/// a variable and zero, and if the variable is non-zero or zero (JmpOnZero is
|
||||
/// true), the control yields to the loop entry. If the branch matches the
|
||||
/// behavior, the variable involved in the comparison is returned. This function
|
||||
/// will be called to see if the precondition and postcondition of the loop are
|
||||
/// in desirable form.
|
||||
static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry,
|
||||
bool JmpOnZero = false) {
|
||||
if (!BI || !BI->isConditional())
|
||||
return nullptr;
|
||||
|
||||
|
@ -1128,9 +1131,14 @@ static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry) {
|
|||
if (!CmpZero || !CmpZero->isZero())
|
||||
return nullptr;
|
||||
|
||||
BasicBlock *TrueSucc = BI->getSuccessor(0);
|
||||
BasicBlock *FalseSucc = BI->getSuccessor(1);
|
||||
if (JmpOnZero)
|
||||
std::swap(TrueSucc, FalseSucc);
|
||||
|
||||
ICmpInst::Predicate Pred = Cond->getPredicate();
|
||||
if ((Pred == ICmpInst::ICMP_NE && BI->getSuccessor(0) == LoopEntry) ||
|
||||
(Pred == ICmpInst::ICMP_EQ && BI->getSuccessor(1) == LoopEntry))
|
||||
if ((Pred == ICmpInst::ICMP_NE && TrueSucc == LoopEntry) ||
|
||||
(Pred == ICmpInst::ICMP_EQ && FalseSucc == LoopEntry))
|
||||
return Cond->getOperand(0);
|
||||
|
||||
return nullptr;
|
||||
|
@ -1306,14 +1314,14 @@ static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
|
|||
///
|
||||
/// loop-exit:
|
||||
/// \endcode
|
||||
static bool detectCTLZIdiom(Loop *CurLoop, PHINode *&PhiX,
|
||||
Instruction *&CntInst, PHINode *&CntPhi,
|
||||
Instruction *&DefX) {
|
||||
static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL,
|
||||
Intrinsic::ID &IntrinID, Value *&InitX,
|
||||
Instruction *&CntInst, PHINode *&CntPhi,
|
||||
Instruction *&DefX) {
|
||||
BasicBlock *LoopEntry;
|
||||
Value *VarX = nullptr;
|
||||
|
||||
DefX = nullptr;
|
||||
PhiX = nullptr;
|
||||
CntInst = nullptr;
|
||||
CntPhi = nullptr;
|
||||
LoopEntry = *(CurLoop->block_begin());
|
||||
|
@ -1325,20 +1333,28 @@ static bool detectCTLZIdiom(Loop *CurLoop, PHINode *&PhiX,
|
|||
else
|
||||
return false;
|
||||
|
||||
// step 2: detect instructions corresponding to "x.next = x >> 1"
|
||||
if (!DefX || (DefX->getOpcode() != Instruction::AShr &&
|
||||
DefX->getOpcode() != Instruction::LShr))
|
||||
// step 2: detect instructions corresponding to "x.next = x >> 1 or x << 1"
|
||||
if (!DefX || !DefX->isShift())
|
||||
return false;
|
||||
IntrinID = DefX->getOpcode() == Instruction::Shl ? Intrinsic::cttz :
|
||||
Intrinsic::ctlz;
|
||||
ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
|
||||
if (!Shft || !Shft->isOne())
|
||||
return false;
|
||||
VarX = DefX->getOperand(0);
|
||||
|
||||
// step 3: Check the recurrence of variable X
|
||||
PhiX = getRecurrenceVar(VarX, DefX, LoopEntry);
|
||||
PHINode *PhiX = getRecurrenceVar(VarX, DefX, LoopEntry);
|
||||
if (!PhiX)
|
||||
return false;
|
||||
|
||||
InitX = PhiX->getIncomingValueForBlock(CurLoop->getLoopPreheader());
|
||||
|
||||
// Make sure the initial value can't be negative otherwise the ashr in the
|
||||
// loop might never reach zero which would make the loop infinite.
|
||||
if (DefX->getOpcode() == Instruction::AShr && !isKnownNonNegative(InitX, DL))
|
||||
return false;
|
||||
|
||||
// step 4: Find the instruction which count the CTLZ: cnt.next = cnt + 1
|
||||
// TODO: We can skip the step. If loop trip count is known (CTLZ),
|
||||
// then all uses of "cnt.next" could be optimized to the trip count
|
||||
|
@ -1370,17 +1386,25 @@ static bool detectCTLZIdiom(Loop *CurLoop, PHINode *&PhiX,
|
|||
return true;
|
||||
}
|
||||
|
||||
/// Recognize CTLZ idiom in a non-countable loop and convert the loop
|
||||
/// to countable (with CTLZ trip count).
|
||||
/// If CTLZ inserted as a new trip count returns true; otherwise, returns false.
|
||||
bool LoopIdiomRecognize::recognizeAndInsertCTLZ() {
|
||||
/// Recognize CTLZ or CTTZ idiom in a non-countable loop and convert the loop
|
||||
/// to countable (with CTLZ / CTTZ trip count). If CTLZ / CTTZ inserted as a new
|
||||
/// trip count returns true; otherwise, returns false.
|
||||
bool LoopIdiomRecognize::recognizeAndInsertFFS() {
|
||||
// Give up if the loop has multiple blocks or multiple backedges.
|
||||
if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
|
||||
return false;
|
||||
|
||||
Instruction *CntInst, *DefX;
|
||||
PHINode *CntPhi, *PhiX;
|
||||
if (!detectCTLZIdiom(CurLoop, PhiX, CntInst, CntPhi, DefX))
|
||||
Intrinsic::ID IntrinID;
|
||||
Value *InitX;
|
||||
Instruction *DefX = nullptr;
|
||||
PHINode *CntPhi = nullptr;
|
||||
Instruction *CntInst = nullptr;
|
||||
// Help decide if transformation is profitable. For ShiftUntilZero idiom,
|
||||
// this is always 6.
|
||||
size_t IdiomCanonicalSize = 6;
|
||||
|
||||
if (!detectShiftUntilZeroIdiom(CurLoop, *DL, IntrinID, InitX,
|
||||
CntInst, CntPhi, DefX))
|
||||
return false;
|
||||
|
||||
bool IsCntPhiUsedOutsideLoop = false;
|
||||
|
@ -1407,12 +1431,6 @@ bool LoopIdiomRecognize::recognizeAndInsertCTLZ() {
|
|||
// It is safe to assume Preheader exist as it was checked in
|
||||
// parent function RunOnLoop.
|
||||
BasicBlock *PH = CurLoop->getLoopPreheader();
|
||||
Value *InitX = PhiX->getIncomingValueForBlock(PH);
|
||||
|
||||
// Make sure the initial value can't be negative otherwise the ashr in the
|
||||
// loop might never reach zero which would make the loop infinite.
|
||||
if (DefX->getOpcode() == Instruction::AShr && !isKnownNonNegative(InitX, *DL))
|
||||
return false;
|
||||
|
||||
// If we are using the count instruction outside the loop, make sure we
|
||||
// have a zero check as a precondition. Without the check the loop would run
|
||||
|
@ -1430,8 +1448,10 @@ bool LoopIdiomRecognize::recognizeAndInsertCTLZ() {
|
|||
ZeroCheck = true;
|
||||
}
|
||||
|
||||
// Check if CTLZ intrinsic is profitable. Assume it is always profitable
|
||||
// if we delete the loop (the loop has only 6 instructions):
|
||||
// Check if CTLZ / CTTZ intrinsic is profitable. Assume it is always
|
||||
// profitable if we delete the loop.
|
||||
|
||||
// the loop has only 6 instructions:
|
||||
// %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
|
||||
// %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
|
||||
// %shr = ashr %n.addr.0, 1
|
||||
|
@ -1442,12 +1462,12 @@ bool LoopIdiomRecognize::recognizeAndInsertCTLZ() {
|
|||
const Value *Args[] =
|
||||
{InitX, ZeroCheck ? ConstantInt::getTrue(InitX->getContext())
|
||||
: ConstantInt::getFalse(InitX->getContext())};
|
||||
if (CurLoop->getHeader()->size() != 6 &&
|
||||
TTI->getIntrinsicCost(Intrinsic::ctlz, InitX->getType(), Args) >
|
||||
TargetTransformInfo::TCC_Basic)
|
||||
if (CurLoop->getHeader()->size() != IdiomCanonicalSize &&
|
||||
TTI->getIntrinsicCost(IntrinID, InitX->getType(), Args) >
|
||||
TargetTransformInfo::TCC_Basic)
|
||||
return false;
|
||||
|
||||
transformLoopToCountable(PH, CntInst, CntPhi, InitX, DefX,
|
||||
transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX,
|
||||
DefX->getDebugLoc(), ZeroCheck,
|
||||
IsCntPhiUsedOutsideLoop);
|
||||
return true;
|
||||
|
@ -1516,20 +1536,21 @@ static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
|
|||
return CI;
|
||||
}
|
||||
|
||||
static CallInst *createCTLZIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
|
||||
const DebugLoc &DL, bool ZeroCheck) {
|
||||
static CallInst *createFFSIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
|
||||
const DebugLoc &DL, bool ZeroCheck,
|
||||
Intrinsic::ID IID) {
|
||||
Value *Ops[] = {Val, ZeroCheck ? IRBuilder.getTrue() : IRBuilder.getFalse()};
|
||||
Type *Tys[] = {Val->getType()};
|
||||
|
||||
Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
|
||||
Value *Func = Intrinsic::getDeclaration(M, Intrinsic::ctlz, Tys);
|
||||
Value *Func = Intrinsic::getDeclaration(M, IID, Tys);
|
||||
CallInst *CI = IRBuilder.CreateCall(Func, Ops);
|
||||
CI->setDebugLoc(DL);
|
||||
|
||||
return CI;
|
||||
}
|
||||
|
||||
/// Transform the following loop:
|
||||
/// Transform the following loop (Using CTLZ, CTTZ is similar):
|
||||
/// loop:
|
||||
/// CntPhi = PHI [Cnt0, CntInst]
|
||||
/// PhiX = PHI [InitX, DefX]
|
||||
|
@ -1561,19 +1582,19 @@ static CallInst *createCTLZIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
|
|||
/// If LOOP_BODY is empty the loop will be deleted.
|
||||
/// If CntInst and DefX are not used in LOOP_BODY they will be removed.
|
||||
void LoopIdiomRecognize::transformLoopToCountable(
|
||||
BasicBlock *Preheader, Instruction *CntInst, PHINode *CntPhi, Value *InitX,
|
||||
Instruction *DefX, const DebugLoc &DL, bool ZeroCheck,
|
||||
bool IsCntPhiUsedOutsideLoop) {
|
||||
Intrinsic::ID IntrinID, BasicBlock *Preheader, Instruction *CntInst,
|
||||
PHINode *CntPhi, Value *InitX, Instruction *DefX, const DebugLoc &DL,
|
||||
bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) {
|
||||
BranchInst *PreheaderBr = cast<BranchInst>(Preheader->getTerminator());
|
||||
|
||||
// Step 1: Insert the CTLZ instruction at the end of the preheader block
|
||||
// Step 1: Insert the CTLZ/CTTZ instruction at the end of the preheader block
|
||||
IRBuilder<> Builder(PreheaderBr);
|
||||
Builder.SetCurrentDebugLocation(DL);
|
||||
Value *FFS, *Count, *CountPrev, *NewCount, *InitXNext;
|
||||
|
||||
// Count = BitWidth - CTLZ(InitX);
|
||||
// If there are uses of CntPhi create:
|
||||
// CountPrev = BitWidth - CTLZ(InitX >> 1);
|
||||
IRBuilder<> Builder(PreheaderBr);
|
||||
Builder.SetCurrentDebugLocation(DL);
|
||||
Value *CTLZ, *Count, *CountPrev, *NewCount, *InitXNext;
|
||||
|
||||
if (IsCntPhiUsedOutsideLoop) {
|
||||
if (DefX->getOpcode() == Instruction::AShr)
|
||||
InitXNext =
|
||||
|
@ -1581,29 +1602,30 @@ void LoopIdiomRecognize::transformLoopToCountable(
|
|||
else if (DefX->getOpcode() == Instruction::LShr)
|
||||
InitXNext =
|
||||
Builder.CreateLShr(InitX, ConstantInt::get(InitX->getType(), 1));
|
||||
else if (DefX->getOpcode() == Instruction::Shl) // cttz
|
||||
InitXNext =
|
||||
Builder.CreateShl(InitX, ConstantInt::get(InitX->getType(), 1));
|
||||
else
|
||||
llvm_unreachable("Unexpected opcode!");
|
||||
} else
|
||||
InitXNext = InitX;
|
||||
CTLZ = createCTLZIntrinsic(Builder, InitXNext, DL, ZeroCheck);
|
||||
FFS = createFFSIntrinsic(Builder, InitXNext, DL, ZeroCheck, IntrinID);
|
||||
Count = Builder.CreateSub(
|
||||
ConstantInt::get(CTLZ->getType(),
|
||||
CTLZ->getType()->getIntegerBitWidth()),
|
||||
CTLZ);
|
||||
ConstantInt::get(FFS->getType(),
|
||||
FFS->getType()->getIntegerBitWidth()),
|
||||
FFS);
|
||||
if (IsCntPhiUsedOutsideLoop) {
|
||||
CountPrev = Count;
|
||||
Count = Builder.CreateAdd(
|
||||
CountPrev,
|
||||
ConstantInt::get(CountPrev->getType(), 1));
|
||||
}
|
||||
if (IsCntPhiUsedOutsideLoop)
|
||||
NewCount = Builder.CreateZExtOrTrunc(CountPrev,
|
||||
cast<IntegerType>(CntInst->getType()));
|
||||
else
|
||||
NewCount = Builder.CreateZExtOrTrunc(Count,
|
||||
cast<IntegerType>(CntInst->getType()));
|
||||
|
||||
// If the CTLZ counter's initial value is not zero, insert Add Inst.
|
||||
NewCount = Builder.CreateZExtOrTrunc(
|
||||
IsCntPhiUsedOutsideLoop ? CountPrev : Count,
|
||||
cast<IntegerType>(CntInst->getType()));
|
||||
|
||||
// If the counter's initial value is not zero, insert Add Inst.
|
||||
Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader);
|
||||
ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
|
||||
if (!InitConst || !InitConst->isZero())
|
||||
|
@ -1639,8 +1661,7 @@ void LoopIdiomRecognize::transformLoopToCountable(
|
|||
LbCond->setOperand(1, ConstantInt::get(Ty, 0));
|
||||
|
||||
// Step 3: All the references to the original counter outside
|
||||
// the loop are replaced with the NewCount -- the value returned from
|
||||
// __builtin_ctlz(x).
|
||||
// the loop are replaced with the NewCount
|
||||
if (IsCntPhiUsedOutsideLoop)
|
||||
CntPhi->replaceUsesOutsideBlock(NewCount, Body);
|
||||
else
|
||||
|
|
|
@ -0,0 +1,82 @@
|
|||
; RUN: opt -loop-idiom -mtriple=x86_64 -mcpu=core-avx2 < %s -S | FileCheck --check-prefix=ALL %s
|
||||
; RUN: opt -loop-idiom -mtriple=x86_64 -mcpu=corei7 < %s -S | FileCheck --check-prefix=ALL %s
|
||||
|
||||
; Recognize CTTZ builtin pattern.
|
||||
; Here it will replace the loop -
|
||||
; assume builtin is always profitable.
|
||||
;
|
||||
; int cttz_zero_check(int n)
|
||||
; {
|
||||
; int i = 0;
|
||||
; while(n) {
|
||||
; n <<= 1;
|
||||
; i++;
|
||||
; }
|
||||
; return i;
|
||||
; }
|
||||
;
|
||||
; ALL-LABEL: @cttz_zero_check
|
||||
; ALL: %0 = call i32 @llvm.cttz.i32(i32 %n, i1 true)
|
||||
; ALL-NEXT: %1 = sub i32 32, %0
|
||||
;
|
||||
; Function Attrs: norecurse nounwind readnone uwtable
|
||||
define i32 @cttz_zero_check(i32 %n) {
|
||||
entry:
|
||||
%tobool4 = icmp eq i32 %n, 0
|
||||
br i1 %tobool4, label %while.end, label %while.body.preheader
|
||||
|
||||
while.body.preheader: ; preds = %entry
|
||||
br label %while.body
|
||||
|
||||
while.body: ; preds = %while.body.preheader, %while.body
|
||||
%i.06 = phi i32 [ %inc, %while.body ], [ 0, %while.body.preheader ]
|
||||
%n.addr.05 = phi i32 [ %shl, %while.body ], [ %n, %while.body.preheader ]
|
||||
%shl = shl i32 %n.addr.05, 1
|
||||
%inc = add nsw i32 %i.06, 1
|
||||
%tobool = icmp eq i32 %shl, 0
|
||||
br i1 %tobool, label %while.end.loopexit, label %while.body
|
||||
|
||||
while.end.loopexit: ; preds = %while.body
|
||||
br label %while.end
|
||||
|
||||
while.end: ; preds = %while.end.loopexit, %entry
|
||||
%i.0.lcssa = phi i32 [ 0, %entry ], [ %inc, %while.end.loopexit ]
|
||||
ret i32 %i.0.lcssa
|
||||
}
|
||||
|
||||
; Recognize CTTZ builtin pattern.
|
||||
; Here it will replace the loop -
|
||||
; assume builtin is always profitable.
|
||||
;
|
||||
; int cttz(int n)
|
||||
; {
|
||||
; int i = 0;
|
||||
; while(n <<= 1) {
|
||||
; i++;
|
||||
; }
|
||||
; return i;
|
||||
; }
|
||||
;
|
||||
; ALL-LABEL: @cttz
|
||||
; ALL: %0 = shl i32 %n, 1
|
||||
; ALL-NEXT: %1 = call i32 @llvm.cttz.i32(i32 %0, i1 false)
|
||||
; ALL-NEXT: %2 = sub i32 32, %1
|
||||
; ALL-NEXT: %3 = add i32 %2, 1
|
||||
;
|
||||
; Function Attrs: norecurse nounwind readnone uwtable
|
||||
define i32 @cttz(i32 %n) {
|
||||
entry:
|
||||
br label %while.cond
|
||||
|
||||
while.cond: ; preds = %while.cond, %entry
|
||||
%n.addr.0 = phi i32 [ %n, %entry ], [ %shl, %while.cond ]
|
||||
%i.0 = phi i32 [ 0, %entry ], [ %inc, %while.cond ]
|
||||
%shl = shl i32 %n.addr.0, 1
|
||||
%tobool = icmp eq i32 %shl, 0
|
||||
%inc = add nsw i32 %i.0, 1
|
||||
br i1 %tobool, label %while.end, label %while.cond
|
||||
|
||||
while.end: ; preds = %while.cond
|
||||
ret i32 %i.0
|
||||
}
|
||||
|
Loading…
Reference in New Issue