Adding IV chain generation to LSR.

After collecting chains, check if any should be materialized. If so,
hide the chained IV users from the LSR solver. LSR will only solve for
the head of the chain. GenerateIVChains will then materialize the
chained IV users by computing the IV relative to its previous value in
the chain.

In theory, chained IV users could be exposed to LSR's solver. This
would be considerably complicated to implement and I'm not aware of a
case where we need it. In practice it's more important to
intelligently prune the search space of nontrivial loops before
running the solver, otherwise the solver is often forced to prune the
most optimal solutions. Hiding the chained users does this well, so
that LSR is more likely to find the best IV for the chain as a whole.

llvm-svn: 147801
This commit is contained in:
Andrew Trick 2012-01-09 21:18:52 +00:00
parent bfedfc171d
commit 248d410e3e
2 changed files with 324 additions and 5 deletions

View File

@ -91,6 +91,15 @@ static cl::opt<bool> EnablePhiElim(
"enable-lsr-phielim", cl::Hidden, cl::init(true),
cl::desc("Enable LSR phi elimination"));
#ifndef NDEBUG
// Stress test IV chain generation.
static cl::opt<bool> StressIVChain(
"stress-ivchain", cl::Hidden, cl::init(false),
cl::desc("Stress test LSR IV chains"));
#else
static bool StressIVChain = false;
#endif
namespace {
/// RegSortData - This class holds data which is used to order reuse candidates.
@ -1415,6 +1424,9 @@ class LSRInstance {
/// IVChainVec - IV users can form a chain of IV increments.
SmallVector<IVChain, MaxChains> IVChainVec;
/// IVIncSet - IV users that belong to profitable IVChains.
SmallPtrSet<Use*, MaxChains> IVIncSet;
void OptimizeShadowIV();
bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse);
ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse);
@ -1422,7 +1434,10 @@ class LSRInstance {
void ChainInstruction(Instruction *UserInst, Instruction *IVOper,
SmallVectorImpl<ChainUsers> &ChainUsersVec);
void FinalizeChain(IVChain &Chain);
void CollectChains();
void GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
SmallVectorImpl<WeakVH> &DeadInsts);
void CollectInterestingTypesAndFactors();
void CollectFixupsAndInitialFormulae();
@ -2189,6 +2204,48 @@ static bool isCompatibleIVType(Value *LVal, Value *RVal) {
return (LType == RType) || (LType->isPointerTy() && RType->isPointerTy());
}
/// Return true if the chain increment is profitable to expand into a loop
/// invariant value, which may require its own register. A profitable chain
/// increment will be an offset relative to the same base. We allow such offsets
/// to potentially be used as chain increment as long as it's not obviously
/// expensive to expand using real instructions.
static const SCEV *
getProfitableChainIncrement(Value *NextIV, Value *PrevIV,
const IVChain &Chain, Loop *L,
ScalarEvolution &SE, const TargetLowering *TLI) {
const SCEV *IncExpr = SE.getMinusSCEV(SE.getSCEV(NextIV), SE.getSCEV(PrevIV));
if (!SE.isLoopInvariant(IncExpr, L))
return 0;
// We are not able to expand an increment unless it is loop invariant,
// however, the following checks are purely for profitability.
if (StressIVChain)
return IncExpr;
// Unimplemented
return 0;
}
/// Return true if the number of registers needed for the chain is estimated to
/// be less than the number required for the individual IV users. First prohibit
/// any IV users that keep the IV live across increments (the Users set should
/// be empty). Next count the number and type of increments in the chain.
///
/// Chaining IVs can lead to considerable code bloat if ISEL doesn't
/// effectively use postinc addressing modes. Only consider it profitable it the
/// increments can be computed in fewer registers when chained.
///
/// TODO: Consider IVInc free if it's already used in another chains.
static bool
isProfitableChain(IVChain &Chain, SmallPtrSet<Instruction*, 4> &Users,
ScalarEvolution &SE, const TargetLowering *TLI) {
if (StressIVChain)
return true;
// Unimplemented
return false;
}
/// ChainInstruction - Add this IV user to an existing chain or make it the head
/// of a new chain.
void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper,
@ -2211,9 +2268,9 @@ void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper,
&& isa<PHINode>(IVChainVec[ChainIdx].back().UserInst))
continue;
const SCEV *IncExpr = SE.getMinusSCEV(SE.getSCEV(NextIV),
SE.getSCEV(PrevIV));
if (SE.isLoopInvariant(IncExpr, L)) {
if (const SCEV *IncExpr =
getProfitableChainIncrement(NextIV, PrevIV, IVChainVec[ChainIdx],
L, SE, TLI)) {
LastIncExpr = IncExpr;
break;
}
@ -2223,7 +2280,7 @@ void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper,
if (ChainIdx == NChains) {
if (isa<PHINode>(UserInst))
return;
if (NChains >= MaxChains) {
if (NChains >= MaxChains && !StressIVChain) {
DEBUG(dbgs() << "IV Chain Limit\n");
return;
}
@ -2349,13 +2406,173 @@ void LSRInstance::CollectChains() {
if (IncV)
ChainInstruction(PN, IncV, ChainUsersVec);
}
// Remove any unprofitable chains.
unsigned ChainIdx = 0;
for (unsigned UsersIdx = 0, NChains = IVChainVec.size();
UsersIdx < NChains; ++UsersIdx) {
if (!isProfitableChain(IVChainVec[UsersIdx],
ChainUsersVec[UsersIdx].FarUsers, SE, TLI))
continue;
// Preserve the chain at UsesIdx.
if (ChainIdx != UsersIdx)
IVChainVec[ChainIdx] = IVChainVec[UsersIdx];
FinalizeChain(IVChainVec[ChainIdx]);
++ChainIdx;
}
IVChainVec.resize(ChainIdx);
}
void LSRInstance::FinalizeChain(IVChain &Chain) {
assert(!Chain.empty() && "empty IV chains are not allowed");
DEBUG(dbgs() << "Final Chain: " << *Chain[0].UserInst << "\n");
for (IVChain::const_iterator I = llvm::next(Chain.begin()), E = Chain.end();
I != E; ++I) {
DEBUG(dbgs() << " Inc: " << *I->UserInst << "\n");
User::op_iterator UseI =
std::find(I->UserInst->op_begin(), I->UserInst->op_end(), I->IVOperand);
assert(UseI != I->UserInst->op_end() && "cannot find IV operand");
IVIncSet.insert(UseI);
}
}
/// Return true if the IVInc can be folded into an addressing mode.
static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,
Value *Operand, const TargetLowering *TLI) {
const SCEVConstant *IncConst = dyn_cast<SCEVConstant>(IncExpr);
if (!IncConst || !isAddressUse(UserInst, Operand))
return false;
if (IncConst->getValue()->getValue().getMinSignedBits() > 64)
return false;
int64_t IncOffset = IncConst->getValue()->getSExtValue();
if (!isAlwaysFoldable(IncOffset, /*BaseGV=*/0, /*HaseBaseReg=*/false,
LSRUse::Address, getAccessType(UserInst), TLI))
return false;
return true;
}
/// GenerateIVChains - Generate an add or subtract for each IVInc in a chain to
/// materialize the IV user's operand from the previous IV user's operand.
void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
SmallVectorImpl<WeakVH> &DeadInsts) {
// Find the new IVOperand for the head of the chain. It may have been replaced
// by LSR.
const IVInc &Head = Chain[0];
User::op_iterator IVOpEnd = Head.UserInst->op_end();
User::op_iterator IVOpIter = findIVOperand(Head.UserInst->op_begin(),
IVOpEnd, L, SE);
Value *IVSrc = 0;
while (IVOpIter != IVOpEnd) {
IVSrc = getWideOperand(*IVOpIter);
// If this operand computes the expression that the chain needs, we may use
// it. (Check this after setting IVSrc which is used below.)
//
// Note that if Head.IncExpr is wider than IVSrc, then this phi is too
// narrow for the chain, so we can no longer use it. We do allow using a
// wider phi, assuming the LSR checked for free truncation. In that case we
// should already have a truncate on this operand such that
// getSCEV(IVSrc) == IncExpr.
if (SE.getSCEV(*IVOpIter) == Head.IncExpr
|| SE.getSCEV(IVSrc) == Head.IncExpr) {
break;
}
IVOpIter = findIVOperand(llvm::next(IVOpIter), IVOpEnd, L, SE);
}
if (IVOpIter == IVOpEnd) {
// Gracefully give up on this chain.
DEBUG(dbgs() << "Concealed chain head: " << *Head.UserInst << "\n");
return;
}
DEBUG(dbgs() << "Generate chain at: " << *IVSrc << "\n");
Type *IVTy = IVSrc->getType();
Type *IntTy = SE.getEffectiveSCEVType(IVTy);
const SCEV *LeftOverExpr = 0;
for (IVChain::const_iterator IncI = llvm::next(Chain.begin()),
IncE = Chain.end(); IncI != IncE; ++IncI) {
Instruction *InsertPt = IncI->UserInst;
if (isa<PHINode>(InsertPt))
InsertPt = L->getLoopLatch()->getTerminator();
// IVOper will replace the current IV User's operand. IVSrc is the IV
// value currently held in a register.
Value *IVOper = IVSrc;
if (!IncI->IncExpr->isZero()) {
// IncExpr was the result of subtraction of two narrow values, so must
// be signed.
const SCEV *IncExpr = SE.getNoopOrSignExtend(IncI->IncExpr, IntTy);
LeftOverExpr = LeftOverExpr ?
SE.getAddExpr(LeftOverExpr, IncExpr) : IncExpr;
}
if (LeftOverExpr && !LeftOverExpr->isZero()) {
// Expand the IV increment.
Rewriter.clearPostInc();
Value *IncV = Rewriter.expandCodeFor(LeftOverExpr, IntTy, InsertPt);
const SCEV *IVOperExpr = SE.getAddExpr(SE.getUnknown(IVSrc),
SE.getUnknown(IncV));
IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt);
// If an IV increment can't be folded, use it as the next IV value.
if (!canFoldIVIncExpr(LeftOverExpr, IncI->UserInst, IncI->IVOperand,
TLI)) {
assert(IVTy == IVOper->getType() && "inconsistent IV increment type");
IVSrc = IVOper;
LeftOverExpr = 0;
}
}
Type *OperTy = IncI->IVOperand->getType();
if (IVTy != OperTy) {
assert(SE.getTypeSizeInBits(IVTy) >= SE.getTypeSizeInBits(OperTy) &&
"cannot extend a chained IV");
IRBuilder<> Builder(InsertPt);
IVOper = Builder.CreateTruncOrBitCast(IVOper, OperTy, "lsr.chain");
}
IncI->UserInst->replaceUsesOfWith(IncI->IVOperand, IVOper);
DeadInsts.push_back(IncI->IVOperand);
}
// If LSR created a new, wider phi, we may also replace its postinc. We only
// do this if we also found a wide value for the head of the chain.
if (isa<PHINode>(Chain.back().UserInst)) {
for (BasicBlock::iterator I = L->getHeader()->begin();
PHINode *Phi = dyn_cast<PHINode>(I); ++I) {
if (!isCompatibleIVType(Phi, IVSrc))
continue;
Instruction *PostIncV = dyn_cast<Instruction>(
Phi->getIncomingValueForBlock(L->getLoopLatch()));
if (!PostIncV || (SE.getSCEV(PostIncV) != SE.getSCEV(IVSrc)))
continue;
Value *IVOper = IVSrc;
Type *PostIncTy = PostIncV->getType();
if (IVTy != PostIncTy) {
assert(PostIncTy->isPointerTy() && "mixing int/ptr IV types");
IRBuilder<> Builder(L->getLoopLatch()->getTerminator());
Builder.SetCurrentDebugLocation(PostIncV->getDebugLoc());
IVOper = Builder.CreatePointerCast(IVSrc, PostIncTy, "lsr.chain");
}
Phi->replaceUsesOfWith(PostIncV, IVOper);
DeadInsts.push_back(PostIncV);
}
}
}
void LSRInstance::CollectFixupsAndInitialFormulae() {
for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI) {
Instruction *UserInst = UI->getUser();
// Skip IV users that are part of profitable IV Chains.
User::op_iterator UseI = std::find(UserInst->op_begin(), UserInst->op_end(),
UI->getOperandValToReplace());
assert(UseI != UserInst->op_end() && "cannot find IV operand");
if (IVIncSet.count(UseI))
continue;
// Record the uses.
LSRFixup &LF = getNewFixup();
LF.UserInst = UI->getUser();
LF.UserInst = UserInst;
LF.OperandValToReplace = UI->getOperandValToReplace();
LF.PostIncLoops = UI->getPostIncLoops();
@ -4073,6 +4290,11 @@ LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution,
Changed = true;
}
for (SmallVectorImpl<IVChain>::const_iterator ChainI = IVChainVec.begin(),
ChainE = IVChainVec.end(); ChainI != ChainE; ++ChainI) {
GenerateIVChain(*ChainI, Rewriter, DeadInsts);
Changed = true;
}
// Clean up after ourselves. This must be done before deleting any
// instructions.
Rewriter.clear();
@ -4123,6 +4345,7 @@ LSRInstance::LSRInstance(const TargetLowering *tli, Loop *l, Pass *P)
CollectFixupsAndInitialFormulae();
CollectLoopInvariantFixupsAndFormulae();
assert(!Uses.empty() && "IVUsers reported at least one use");
DEBUG(dbgs() << "LSR found " << Uses.size() << " uses:\n";
print_uses(dbgs()));

View File

@ -0,0 +1,96 @@
; REQUIRES: asserts
; RUN: llc < %s -O3 -march=x86-64 -mcpu=core2 -stress-ivchain | FileCheck %s -check-prefix=X64
; RUN: llc < %s -O3 -march=x86 -mcpu=core2 -stress-ivchain | FileCheck %s -check-prefix=X32
; @sharedidx is an unrolled variant of this loop:
; for (unsigned long i = 0; i < len; i += s) {
; c[i] = a[i] + b[i];
; }
; where 's' cannot be folded into the addressing mode.
;
; This is not quite profitable to chain. But with -stress-ivchain, we
; can form three address chains in place of the shared induction
; variable.
; X64: sharedidx:
; X64: %for.body.preheader
; X64-NOT: leal ({{.*}},4)
; X64: %for.body.1
; X32: sharedidx:
; X32: %for.body.2
; X32: add
; X32: add
; X32: add
; X32: add
; X32: add
; X32: %for.body.3
define void @sharedidx(i8* nocapture %a, i8* nocapture %b, i8* nocapture %c, i32 %s, i32 %len) nounwind ssp {
entry:
%cmp8 = icmp eq i32 %len, 0
br i1 %cmp8, label %for.end, label %for.body
for.body: ; preds = %entry, %for.body.3
%i.09 = phi i32 [ %add5.3, %for.body.3 ], [ 0, %entry ]
%arrayidx = getelementptr inbounds i8* %a, i32 %i.09
%0 = load i8* %arrayidx, align 1
%conv6 = zext i8 %0 to i32
%arrayidx1 = getelementptr inbounds i8* %b, i32 %i.09
%1 = load i8* %arrayidx1, align 1
%conv27 = zext i8 %1 to i32
%add = add nsw i32 %conv27, %conv6
%conv3 = trunc i32 %add to i8
%arrayidx4 = getelementptr inbounds i8* %c, i32 %i.09
store i8 %conv3, i8* %arrayidx4, align 1
%add5 = add i32 %i.09, %s
%cmp = icmp ult i32 %add5, %len
br i1 %cmp, label %for.body.1, label %for.end
for.end: ; preds = %for.body, %for.body.1, %for.body.2, %for.body.3, %entry
ret void
for.body.1: ; preds = %for.body
%arrayidx.1 = getelementptr inbounds i8* %a, i32 %add5
%2 = load i8* %arrayidx.1, align 1
%conv6.1 = zext i8 %2 to i32
%arrayidx1.1 = getelementptr inbounds i8* %b, i32 %add5
%3 = load i8* %arrayidx1.1, align 1
%conv27.1 = zext i8 %3 to i32
%add.1 = add nsw i32 %conv27.1, %conv6.1
%conv3.1 = trunc i32 %add.1 to i8
%arrayidx4.1 = getelementptr inbounds i8* %c, i32 %add5
store i8 %conv3.1, i8* %arrayidx4.1, align 1
%add5.1 = add i32 %add5, %s
%cmp.1 = icmp ult i32 %add5.1, %len
br i1 %cmp.1, label %for.body.2, label %for.end
for.body.2: ; preds = %for.body.1
%arrayidx.2 = getelementptr inbounds i8* %a, i32 %add5.1
%4 = load i8* %arrayidx.2, align 1
%conv6.2 = zext i8 %4 to i32
%arrayidx1.2 = getelementptr inbounds i8* %b, i32 %add5.1
%5 = load i8* %arrayidx1.2, align 1
%conv27.2 = zext i8 %5 to i32
%add.2 = add nsw i32 %conv27.2, %conv6.2
%conv3.2 = trunc i32 %add.2 to i8
%arrayidx4.2 = getelementptr inbounds i8* %c, i32 %add5.1
store i8 %conv3.2, i8* %arrayidx4.2, align 1
%add5.2 = add i32 %add5.1, %s
%cmp.2 = icmp ult i32 %add5.2, %len
br i1 %cmp.2, label %for.body.3, label %for.end
for.body.3: ; preds = %for.body.2
%arrayidx.3 = getelementptr inbounds i8* %a, i32 %add5.2
%6 = load i8* %arrayidx.3, align 1
%conv6.3 = zext i8 %6 to i32
%arrayidx1.3 = getelementptr inbounds i8* %b, i32 %add5.2
%7 = load i8* %arrayidx1.3, align 1
%conv27.3 = zext i8 %7 to i32
%add.3 = add nsw i32 %conv27.3, %conv6.3
%conv3.3 = trunc i32 %add.3 to i8
%arrayidx4.3 = getelementptr inbounds i8* %c, i32 %add5.2
store i8 %conv3.3, i8* %arrayidx4.3, align 1
%add5.3 = add i32 %add5.2, %s
%cmp.3 = icmp ult i32 %add5.3, %len
br i1 %cmp.3, label %for.body, label %for.end
}