Reapply the new LoopStrengthReduction code, with compile-time and
bug fixes, and with improved heuristics for analyzing foreign-loop
addrecs.

This change also flattens IVUsers, eliminating the stride-oriented
groupings, which makes it easier to work with.

llvm-svn: 95975
This commit is contained in:
Dan Gohman 2010-02-12 10:34:29 +00:00
parent c7ef4cc9fc
commit 45774ce0ad
35 changed files with 3523 additions and 2689 deletions

View File

@ -16,29 +16,27 @@
#define LLVM_ANALYSIS_IVUSERS_H
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/ADT/SmallVector.h"
#include <map>
#include "llvm/Support/ValueHandle.h"
namespace llvm {
class DominatorTree;
class Instruction;
class Value;
struct IVUsersOfOneStride;
class IVUsers;
class ScalarEvolution;
class SCEV;
/// IVStrideUse - Keep track of one use of a strided induction variable, where
/// the stride is stored externally. The Offset member keeps track of the
/// offset from the IV, User is the actual user of the operand, and
/// 'OperandValToReplace' is the operand of the User that is the use.
/// IVStrideUse - Keep track of one use of a strided induction variable.
/// The Expr member keeps track of the expression, User is the actual user
/// instruction of the operand, and 'OperandValToReplace' is the operand of
/// the User that is the use.
class IVStrideUse : public CallbackVH, public ilist_node<IVStrideUse> {
public:
IVStrideUse(IVUsersOfOneStride *parent,
const SCEV *offset,
IVStrideUse(IVUsers *P, const SCEV *S, const SCEV *Off,
Instruction* U, Value *O)
: CallbackVH(U), Parent(parent), Offset(offset),
OperandValToReplace(O),
IsUseOfPostIncrementedValue(false) {
: CallbackVH(U), Parent(P), Stride(S), Offset(Off),
OperandValToReplace(O), IsUseOfPostIncrementedValue(false) {
}
/// getUser - Return the user instruction for this use.
@ -51,9 +49,17 @@ public:
setValPtr(NewUser);
}
/// getParent - Return a pointer to the IVUsersOfOneStride that owns
/// getParent - Return a pointer to the IVUsers that owns
/// this IVStrideUse.
IVUsersOfOneStride *getParent() const { return Parent; }
IVUsers *getParent() const { return Parent; }
/// getStride - Return the expression for the stride for the use.
const SCEV *getStride() const { return Stride; }
/// setStride - Assign a new stride to this use.
void setStride(const SCEV *Val) {
Stride = Val;
}
/// getOffset - Return the offset to add to a theoretical induction
/// variable that starts at zero and counts up by the stride to compute
@ -92,8 +98,11 @@ public:
}
private:
/// Parent - a pointer to the IVUsersOfOneStride that owns this IVStrideUse.
IVUsersOfOneStride *Parent;
/// Parent - a pointer to the IVUsers that owns this IVStrideUse.
IVUsers *Parent;
/// Stride - The stride for this use.
const SCEV *Stride;
/// Offset - The offset to add to the base induction expression.
const SCEV *Offset;
@ -138,42 +147,8 @@ private:
mutable ilist_node<IVStrideUse> Sentinel;
};
/// IVUsersOfOneStride - This structure keeps track of all instructions that
/// have an operand that is based on the trip count multiplied by some stride.
struct IVUsersOfOneStride : public ilist_node<IVUsersOfOneStride> {
private:
IVUsersOfOneStride(const IVUsersOfOneStride &I); // do not implement
void operator=(const IVUsersOfOneStride &I); // do not implement
public:
IVUsersOfOneStride() : Stride(0) {}
explicit IVUsersOfOneStride(const SCEV *stride) : Stride(stride) {}
/// Stride - The stride for all the contained IVStrideUses. This is
/// a constant for affine strides.
const SCEV *Stride;
/// Users - Keep track of all of the users of this stride as well as the
/// initial value and the operand that uses the IV.
ilist<IVStrideUse> Users;
void addUser(const SCEV *Offset, Instruction *User, Value *Operand) {
Users.push_back(new IVStrideUse(this, Offset, User, Operand));
}
void removeUser(IVStrideUse *User) {
Users.erase(User);
}
void print(raw_ostream &OS) const;
/// dump - This method is used for debugging.
void dump() const;
};
class IVUsers : public LoopPass {
friend class IVStrideUserVH;
friend class IVStrideUse;
Loop *L;
LoopInfo *LI;
DominatorTree *DT;
@ -182,19 +157,8 @@ class IVUsers : public LoopPass {
/// IVUses - A list of all tracked IV uses of induction variable expressions
/// we are interested in.
ilist<IVUsersOfOneStride> IVUses;
ilist<IVStrideUse> IVUses;
public:
/// IVUsesByStride - A mapping from the strides in StrideOrder to the
/// uses in IVUses.
std::map<const SCEV *, IVUsersOfOneStride*> IVUsesByStride;
/// StrideOrder - An ordering of the keys in IVUsesByStride that is stable:
/// We use this to iterate over the IVUsesByStride collection without being
/// dependent on random ordering of pointers in the process.
SmallVector<const SCEV *, 16> StrideOrder;
private:
virtual void getAnalysisUsage(AnalysisUsage &AU) const;
virtual bool runOnLoop(Loop *L, LPPassManager &LPM);
@ -210,8 +174,8 @@ public:
/// return true. Otherwise, return false.
bool AddUsersIfInteresting(Instruction *I);
void AddUser(const SCEV *Stride, const SCEV *Offset,
Instruction *User, Value *Operand);
IVStrideUse &AddUser(const SCEV *Stride, const SCEV *Offset,
Instruction *User, Value *Operand);
/// getReplacementExpr - Return a SCEV expression which computes the
/// value of the OperandValToReplace of the given IVStrideUse.
@ -222,6 +186,14 @@ public:
/// isUseOfPostIncrementedValue flag.
const SCEV *getCanonicalExpr(const IVStrideUse &U) const;
typedef ilist<IVStrideUse>::iterator iterator;
typedef ilist<IVStrideUse>::const_iterator const_iterator;
iterator begin() { return IVUses.begin(); }
iterator end() { return IVUses.end(); }
const_iterator begin() const { return IVUses.begin(); }
const_iterator end() const { return IVUses.end(); }
bool empty() const { return IVUses.empty(); }
void print(raw_ostream &OS, const Module* = 0) const;
/// dump - This method is used for debugging.

View File

@ -27,10 +27,7 @@ namespace llvm {
/// and destroy it when finished to allow the release of the associated
/// memory.
class SCEVExpander : public SCEVVisitor<SCEVExpander, Value*> {
public:
ScalarEvolution &SE;
private:
std::map<std::pair<const SCEV *, Instruction *>, AssertingVH<Value> >
InsertedExpressions;
std::set<Value*> InsertedValues;

View File

@ -36,42 +36,30 @@ Pass *llvm::createIVUsersPass() {
return new IVUsers();
}
/// containsAddRecFromDifferentLoop - Determine whether expression S involves a
/// subexpression that is an AddRec from a loop other than L. An outer loop
/// of L is OK, but not an inner loop nor a disjoint loop.
static bool containsAddRecFromDifferentLoop(const SCEV *S, Loop *L) {
// This is very common, put it first.
if (isa<SCEVConstant>(S))
return false;
if (const SCEVCommutativeExpr *AE = dyn_cast<SCEVCommutativeExpr>(S)) {
for (unsigned int i=0; i< AE->getNumOperands(); i++)
if (containsAddRecFromDifferentLoop(AE->getOperand(i), L))
return true;
return false;
}
if (const SCEVAddRecExpr *AE = dyn_cast<SCEVAddRecExpr>(S)) {
if (const Loop *newLoop = AE->getLoop()) {
if (newLoop == L)
return false;
// if newLoop is an outer loop of L, this is OK.
if (newLoop->contains(L))
return false;
/// CollectSubexprs - Split S into subexpressions which can be pulled out into
/// separate registers.
static void CollectSubexprs(const SCEV *S,
SmallVectorImpl<const SCEV *> &Ops,
ScalarEvolution &SE) {
if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
// Break out add operands.
for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
I != E; ++I)
CollectSubexprs(*I, Ops, SE);
return;
} else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
// Split a non-zero base out of an addrec.
if (!AR->getStart()->isZero()) {
CollectSubexprs(AR->getStart(), Ops, SE);
CollectSubexprs(SE.getAddRecExpr(SE.getIntegerSCEV(0, AR->getType()),
AR->getStepRecurrence(SE),
AR->getLoop()), Ops, SE);
return;
}
return true;
}
if (const SCEVUDivExpr *DE = dyn_cast<SCEVUDivExpr>(S))
return containsAddRecFromDifferentLoop(DE->getLHS(), L) ||
containsAddRecFromDifferentLoop(DE->getRHS(), L);
#if 0
// SCEVSDivExpr has been backed out temporarily, but will be back; we'll
// need this when it is.
if (const SCEVSDivExpr *DE = dyn_cast<SCEVSDivExpr>(S))
return containsAddRecFromDifferentLoop(DE->getLHS(), L) ||
containsAddRecFromDifferentLoop(DE->getRHS(), L);
#endif
if (const SCEVCastExpr *CE = dyn_cast<SCEVCastExpr>(S))
return containsAddRecFromDifferentLoop(CE->getOperand(), L);
return false;
// Otherwise use the value itself.
Ops.push_back(S);
}
/// getSCEVStartAndStride - Compute the start and stride of this expression,
@ -90,35 +78,42 @@ static bool getSCEVStartAndStride(const SCEV *&SH, Loop *L, Loop *UseLoop,
if (const SCEVAddExpr *AE = dyn_cast<SCEVAddExpr>(SH)) {
for (unsigned i = 0, e = AE->getNumOperands(); i != e; ++i)
if (const SCEVAddRecExpr *AddRec =
dyn_cast<SCEVAddRecExpr>(AE->getOperand(i))) {
if (AddRec->getLoop() == L)
TheAddRec = SE->getAddExpr(AddRec, TheAddRec);
else
return false; // Nested IV of some sort?
} else {
dyn_cast<SCEVAddRecExpr>(AE->getOperand(i)))
TheAddRec = SE->getAddExpr(AddRec, TheAddRec);
else
Start = SE->getAddExpr(Start, AE->getOperand(i));
}
} else if (isa<SCEVAddRecExpr>(SH)) {
TheAddRec = SH;
} else {
return false; // not analyzable.
}
const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(TheAddRec);
if (!AddRec || AddRec->getLoop() != L) return false;
// Break down TheAddRec into its component parts.
SmallVector<const SCEV *, 4> Subexprs;
CollectSubexprs(TheAddRec, Subexprs, *SE);
// Look for an addrec on the current loop among the parts.
const SCEV *AddRecStride = 0;
for (SmallVectorImpl<const SCEV *>::iterator I = Subexprs.begin(),
E = Subexprs.end(); I != E; ++I) {
const SCEV *S = *I;
if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S))
if (AR->getLoop() == L) {
*I = AR->getStart();
AddRecStride = AR->getStepRecurrence(*SE);
break;
}
}
if (!AddRecStride)
return false;
// Add up everything else into a start value (which may not be
// loop-invariant).
const SCEV *AddRecStart = SE->getAddExpr(Subexprs);
// Use getSCEVAtScope to attempt to simplify other loops out of
// the picture.
const SCEV *AddRecStart = AddRec->getStart();
AddRecStart = SE->getSCEVAtScope(AddRecStart, UseLoop);
const SCEV *AddRecStride = AddRec->getStepRecurrence(*SE);
// FIXME: If Start contains an SCEVAddRecExpr from a different loop, other
// than an outer loop of the current loop, reject it. LSR has no concept of
// operating on more than one loop at a time so don't confuse it with such
// expressions.
if (containsAddRecFromDifferentLoop(AddRecStart, L))
return false;
Start = SE->getAddExpr(Start, AddRecStart);
@ -131,7 +126,7 @@ static bool getSCEVStartAndStride(const SCEV *&SH, Loop *L, Loop *UseLoop,
DEBUG(dbgs() << "[";
WriteAsOperand(dbgs(), L->getHeader(), /*PrintType=*/false);
dbgs() << "] Variable stride: " << *AddRec << "\n");
dbgs() << "] Variable stride: " << *AddRecStride << "\n");
}
Stride = AddRecStride;
@ -247,14 +242,6 @@ bool IVUsers::AddUsersIfInteresting(Instruction *I) {
}
if (AddUserToIVUsers) {
IVUsersOfOneStride *StrideUses = IVUsesByStride[Stride];
if (!StrideUses) { // First occurrence of this stride?
StrideOrder.push_back(Stride);
StrideUses = new IVUsersOfOneStride(Stride);
IVUses.push_back(StrideUses);
IVUsesByStride[Stride] = StrideUses;
}
// Okay, we found a user that we cannot reduce. Analyze the instruction
// and decide what to do with it. If we are a use inside of the loop, use
// the value before incrementation, otherwise use it after incrementation.
@ -262,27 +249,21 @@ bool IVUsers::AddUsersIfInteresting(Instruction *I) {
// The value used will be incremented by the stride more than we are
// expecting, so subtract this off.
const SCEV *NewStart = SE->getMinusSCEV(Start, Stride);
StrideUses->addUser(NewStart, User, I);
StrideUses->Users.back().setIsUseOfPostIncrementedValue(true);
IVUses.push_back(new IVStrideUse(this, Stride, NewStart, User, I));
IVUses.back().setIsUseOfPostIncrementedValue(true);
DEBUG(dbgs() << " USING POSTINC SCEV, START=" << *NewStart<< "\n");
} else {
StrideUses->addUser(Start, User, I);
IVUses.push_back(new IVStrideUse(this, Stride, Start, User, I));
}
}
}
return true;
}
void IVUsers::AddUser(const SCEV *Stride, const SCEV *Offset,
Instruction *User, Value *Operand) {
IVUsersOfOneStride *StrideUses = IVUsesByStride[Stride];
if (!StrideUses) { // First occurrence of this stride?
StrideOrder.push_back(Stride);
StrideUses = new IVUsersOfOneStride(Stride);
IVUses.push_back(StrideUses);
IVUsesByStride[Stride] = StrideUses;
}
IVUsesByStride[Stride]->addUser(Offset, User, Operand);
IVStrideUse &IVUsers::AddUser(const SCEV *Stride, const SCEV *Offset,
Instruction *User, Value *Operand) {
IVUses.push_back(new IVStrideUse(this, Stride, Offset, User, Operand));
return IVUses.back();
}
IVUsers::IVUsers()
@ -316,15 +297,15 @@ bool IVUsers::runOnLoop(Loop *l, LPPassManager &LPM) {
/// value of the OperandValToReplace of the given IVStrideUse.
const SCEV *IVUsers::getReplacementExpr(const IVStrideUse &U) const {
// Start with zero.
const SCEV *RetVal = SE->getIntegerSCEV(0, U.getParent()->Stride->getType());
const SCEV *RetVal = SE->getIntegerSCEV(0, U.getStride()->getType());
// Create the basic add recurrence.
RetVal = SE->getAddRecExpr(RetVal, U.getParent()->Stride, L);
RetVal = SE->getAddRecExpr(RetVal, U.getStride(), L);
// Add the offset in a separate step, because it may be loop-variant.
RetVal = SE->getAddExpr(RetVal, U.getOffset());
// For uses of post-incremented values, add an extra stride to compute
// the actual replacement value.
if (U.isUseOfPostIncrementedValue())
RetVal = SE->getAddExpr(RetVal, U.getParent()->Stride);
RetVal = SE->getAddExpr(RetVal, U.getStride());
return RetVal;
}
@ -333,9 +314,9 @@ const SCEV *IVUsers::getReplacementExpr(const IVStrideUse &U) const {
/// isUseOfPostIncrementedValue flag.
const SCEV *IVUsers::getCanonicalExpr(const IVStrideUse &U) const {
// Start with zero.
const SCEV *RetVal = SE->getIntegerSCEV(0, U.getParent()->Stride->getType());
const SCEV *RetVal = SE->getIntegerSCEV(0, U.getStride()->getType());
// Create the basic add recurrence.
RetVal = SE->getAddRecExpr(RetVal, U.getParent()->Stride, L);
RetVal = SE->getAddRecExpr(RetVal, U.getStride(), L);
// Add the offset in a separate step, because it may be loop-variant.
RetVal = SE->getAddExpr(RetVal, U.getOffset());
return RetVal;
@ -358,24 +339,17 @@ void IVUsers::print(raw_ostream &OS, const Module *M) const {
OS << ":\n";
IVUsersAsmAnnotator Annotator;
for (unsigned Stride = 0, e = StrideOrder.size(); Stride != e; ++Stride) {
std::map<const SCEV *, IVUsersOfOneStride*>::const_iterator SI =
IVUsesByStride.find(StrideOrder[Stride]);
assert(SI != IVUsesByStride.end() && "Stride doesn't exist!");
OS << " Stride " << *SI->first->getType() << " " << *SI->first << ":\n";
for (ilist<IVStrideUse>::const_iterator UI = SI->second->Users.begin(),
E = SI->second->Users.end(); UI != E; ++UI) {
OS << " ";
WriteAsOperand(OS, UI->getOperandValToReplace(), false);
OS << " = ";
OS << *getReplacementExpr(*UI);
if (UI->isUseOfPostIncrementedValue())
OS << " (post-inc)";
OS << " in ";
UI->getUser()->print(OS, &Annotator);
OS << '\n';
}
for (ilist<IVStrideUse>::const_iterator UI = IVUses.begin(),
E = IVUses.end(); UI != E; ++UI) {
OS << " ";
WriteAsOperand(OS, UI->getOperandValToReplace(), false);
OS << " = "
<< *getReplacementExpr(*UI);
if (UI->isUseOfPostIncrementedValue())
OS << " (post-inc)";
OS << " in ";
UI->getUser()->print(OS, &Annotator);
OS << '\n';
}
}
@ -384,37 +358,12 @@ void IVUsers::dump() const {
}
void IVUsers::releaseMemory() {
IVUsesByStride.clear();
StrideOrder.clear();
Processed.clear();
IVUses.clear();
}
void IVStrideUse::deleted() {
// Remove this user from the list.
Parent->Users.erase(this);
Parent->IVUses.erase(this);
// this now dangles!
}
void IVUsersOfOneStride::print(raw_ostream &OS) const {
OS << "IV Users of one stride:\n";
if (Stride)
OS << " Stride: " << *Stride << '\n';
OS << " Users:\n";
unsigned Count = 1;
for (ilist<IVStrideUse>::const_iterator
I = Users.begin(), E = Users.end(); I != E; ++I) {
const IVStrideUse &SU = *I;
OS << " " << Count++ << '\n';
OS << " Offset: " << *SU.getOffset() << '\n';
OS << " Instr: " << *SU << '\n';
}
}
void IVUsersOfOneStride::dump() const {
print(dbgs());
}

View File

@ -641,8 +641,24 @@ SCEVExpander::getAddRecExprPHILiterally(const SCEVAddRecExpr *Normalized,
// Reuse a previously-inserted PHI, if present.
for (BasicBlock::iterator I = L->getHeader()->begin();
PHINode *PN = dyn_cast<PHINode>(I); ++I)
if (isInsertedInstruction(PN) && SE.getSCEV(PN) == Normalized)
return PN;
if (SE.isSCEVable(PN->getType()) &&
(SE.getEffectiveSCEVType(PN->getType()) ==
SE.getEffectiveSCEVType(Normalized->getType())) &&
SE.getSCEV(PN) == Normalized)
if (BasicBlock *LatchBlock = L->getLoopLatch()) {
// Remember this PHI, even in post-inc mode.
InsertedValues.insert(PN);
// Remember the increment.
Instruction *IncV =
cast<Instruction>(PN->getIncomingValueForBlock(LatchBlock)
->stripPointerCasts());
rememberInstruction(IncV);
// Make sure the increment is where we want it. But don't move it
// down past a potential existing post-inc user.
if (L == IVIncInsertLoop && !SE.DT->dominates(IncV, IVIncInsertPos))
IncV->moveBefore(IVIncInsertPos);
return PN;
}
// Save the original insertion point so we can restore it when we're done.
BasicBlock *SaveInsertBB = Builder.GetInsertBlock();

View File

@ -14,6 +14,7 @@
#include "llvm/Target/TargetMachine.h"
#include "llvm/PassManager.h"
#include "llvm/Pass.h"
#include "llvm/Analysis/Verifier.h"
#include "llvm/Assembly/PrintModulePass.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/Passes.h"
@ -234,6 +235,9 @@ bool LLVMTargetMachine::addCommonCodeGenPasses(PassManagerBase &PM,
PM.add(createLoopStrengthReducePass(getTargetLowering()));
if (PrintLSR)
PM.add(createPrintFunctionPass("\n\n*** Code after LSR ***\n", &dbgs()));
#ifndef NDEBUG
PM.add(createVerifierPass());
#endif
}
// Turn exception handling constructs into something the code generators can

View File

@ -364,20 +364,14 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
if (ExitingBlock)
NeedCannIV = true;
}
for (unsigned i = 0, e = IU->StrideOrder.size(); i != e; ++i) {
const SCEV *Stride = IU->StrideOrder[i];
const Type *Ty = SE->getEffectiveSCEVType(Stride->getType());
for (IVUsers::const_iterator I = IU->begin(), E = IU->end(); I != E; ++I) {
const Type *Ty =
SE->getEffectiveSCEVType(I->getOperandValToReplace()->getType());
if (!LargestType ||
SE->getTypeSizeInBits(Ty) >
SE->getTypeSizeInBits(LargestType))
LargestType = Ty;
std::map<const SCEV *, IVUsersOfOneStride *>::iterator SI =
IU->IVUsesByStride.find(IU->StrideOrder[i]);
assert(SI != IU->IVUsesByStride.end() && "Stride doesn't exist!");
if (!SI->second->Users.empty())
NeedCannIV = true;
NeedCannIV = true;
}
// Now that we know the largest of the induction variable expressions
@ -455,72 +449,64 @@ void IndVarSimplify::RewriteIVExpressions(Loop *L, const Type *LargestType,
// add the offsets to the primary induction variable and cast, avoiding
// the need for the code evaluation methods to insert induction variables
// of different sizes.
for (unsigned i = 0, e = IU->StrideOrder.size(); i != e; ++i) {
const SCEV *Stride = IU->StrideOrder[i];
for (IVUsers::iterator UI = IU->begin(), E = IU->end(); UI != E; ++UI) {
const SCEV *Stride = UI->getStride();
Value *Op = UI->getOperandValToReplace();
const Type *UseTy = Op->getType();
Instruction *User = UI->getUser();
std::map<const SCEV *, IVUsersOfOneStride *>::iterator SI =
IU->IVUsesByStride.find(IU->StrideOrder[i]);
assert(SI != IU->IVUsesByStride.end() && "Stride doesn't exist!");
ilist<IVStrideUse> &List = SI->second->Users;
for (ilist<IVStrideUse>::iterator UI = List.begin(),
E = List.end(); UI != E; ++UI) {
Value *Op = UI->getOperandValToReplace();
const Type *UseTy = Op->getType();
Instruction *User = UI->getUser();
// Compute the final addrec to expand into code.
const SCEV *AR = IU->getReplacementExpr(*UI);
// Compute the final addrec to expand into code.
const SCEV *AR = IU->getReplacementExpr(*UI);
// Evaluate the expression out of the loop, if possible.
if (!L->contains(UI->getUser())) {
const SCEV *ExitVal = SE->getSCEVAtScope(AR, L->getParentLoop());
if (ExitVal->isLoopInvariant(L))
AR = ExitVal;
}
// FIXME: It is an extremely bad idea to indvar substitute anything more
// complex than affine induction variables. Doing so will put expensive
// polynomial evaluations inside of the loop, and the str reduction pass
// currently can only reduce affine polynomials. For now just disable
// indvar subst on anything more complex than an affine addrec, unless
// it can be expanded to a trivial value.
if (!AR->isLoopInvariant(L) && !Stride->isLoopInvariant(L))
continue;
// Determine the insertion point for this user. By default, insert
// immediately before the user. The SCEVExpander class will automatically
// hoist loop invariants out of the loop. For PHI nodes, there may be
// multiple uses, so compute the nearest common dominator for the
// incoming blocks.
Instruction *InsertPt = User;
if (PHINode *PHI = dyn_cast<PHINode>(InsertPt))
for (unsigned i = 0, e = PHI->getNumIncomingValues(); i != e; ++i)
if (PHI->getIncomingValue(i) == Op) {
if (InsertPt == User)
InsertPt = PHI->getIncomingBlock(i)->getTerminator();
else
InsertPt =
DT->findNearestCommonDominator(InsertPt->getParent(),
PHI->getIncomingBlock(i))
->getTerminator();
}
// Now expand it into actual Instructions and patch it into place.
Value *NewVal = Rewriter.expandCodeFor(AR, UseTy, InsertPt);
// Patch the new value into place.
if (Op->hasName())
NewVal->takeName(Op);
User->replaceUsesOfWith(Op, NewVal);
UI->setOperandValToReplace(NewVal);
DEBUG(dbgs() << "INDVARS: Rewrote IV '" << *AR << "' " << *Op << '\n'
<< " into = " << *NewVal << "\n");
++NumRemoved;
Changed = true;
// The old value may be dead now.
DeadInsts.push_back(Op);
// Evaluate the expression out of the loop, if possible.
if (!L->contains(UI->getUser())) {
const SCEV *ExitVal = SE->getSCEVAtScope(AR, L->getParentLoop());
if (ExitVal->isLoopInvariant(L))
AR = ExitVal;
}
// FIXME: It is an extremely bad idea to indvar substitute anything more
// complex than affine induction variables. Doing so will put expensive
// polynomial evaluations inside of the loop, and the str reduction pass
// currently can only reduce affine polynomials. For now just disable
// indvar subst on anything more complex than an affine addrec, unless
// it can be expanded to a trivial value.
if (!AR->isLoopInvariant(L) && !Stride->isLoopInvariant(L))
continue;
// Determine the insertion point for this user. By default, insert
// immediately before the user. The SCEVExpander class will automatically
// hoist loop invariants out of the loop. For PHI nodes, there may be
// multiple uses, so compute the nearest common dominator for the
// incoming blocks.
Instruction *InsertPt = User;
if (PHINode *PHI = dyn_cast<PHINode>(InsertPt))
for (unsigned i = 0, e = PHI->getNumIncomingValues(); i != e; ++i)
if (PHI->getIncomingValue(i) == Op) {
if (InsertPt == User)
InsertPt = PHI->getIncomingBlock(i)->getTerminator();
else
InsertPt =
DT->findNearestCommonDominator(InsertPt->getParent(),
PHI->getIncomingBlock(i))
->getTerminator();
}
// Now expand it into actual Instructions and patch it into place.
Value *NewVal = Rewriter.expandCodeFor(AR, UseTy, InsertPt);
// Patch the new value into place.
if (Op->hasName())
NewVal->takeName(Op);
User->replaceUsesOfWith(Op, NewVal);
UI->setOperandValToReplace(NewVal);
DEBUG(dbgs() << "INDVARS: Rewrote IV '" << *AR << "' " << *Op << '\n'
<< " into = " << *NewVal << "\n");
++NumRemoved;
Changed = true;
// The old value may be dead now.
DeadInsts.push_back(Op);
}
// Clear the rewriter cache, because values that are in the rewriter's cache

File diff suppressed because it is too large Load Diff

View File

@ -1,8 +1,11 @@
; RUN: llc < %s -march=arm | FileCheck %s
; This loop is rewritten with an indvar which counts down, which
; frees up a register from holding the trip count.
define void @test(i32* %P, i32 %A, i32 %i) nounwind {
entry:
; CHECK: str r1, [{{r.*}}, -{{r.*}}, lsl #2]
; CHECK: str r1, [{{r.*}}, +{{r.*}}, lsl #2]
icmp eq i32 %i, 0 ; <i1>:0 [#uses=1]
br i1 %0, label %return, label %bb
@ -19,3 +22,26 @@ return: ; preds = %bb, %entry
ret void
}
; This loop has a non-address use of the count-up indvar, so
; it'll remain. Now the original store uses a negative-stride address.
define void @test_with_forced_iv(i32* %P, i32 %A, i32 %i) nounwind {
entry:
; CHECK: str r1, [{{r.*}}, -{{r.*}}, lsl #2]
icmp eq i32 %i, 0 ; <i1>:0 [#uses=1]
br i1 %0, label %return, label %bb
bb: ; preds = %bb, %entry
%indvar = phi i32 [ 0, %entry ], [ %indvar.next, %bb ] ; <i32> [#uses=2]
%i_addr.09.0 = sub i32 %i, %indvar ; <i32> [#uses=1]
%tmp2 = getelementptr i32* %P, i32 %i_addr.09.0 ; <i32*> [#uses=1]
store i32 %A, i32* %tmp2
store i32 %indvar, i32* null
%indvar.next = add i32 %indvar, 1 ; <i32> [#uses=2]
icmp eq i32 %indvar.next, %i ; <i1>:1 [#uses=1]
br i1 %1, label %return, label %bb
return: ; preds = %bb, %entry
ret void
}

View File

@ -1,5 +1,5 @@
; RUN: llc < %s -stats |& grep {40.*Number of machine instrs printed}
; RUN: llc < %s -stats |& grep {.*Number of re-materialization}
; RUN: llc < %s -stats |& grep {39.*Number of machine instrs printed}
; RUN: llc < %s -stats |& not grep {.*Number of re-materialization}
; This test really wants to check that the resultant "cond_true" block only
; has a single store in it, and that cond_true55 only has code to materialize
; the constant and do a store. We do *not* want something like this:

View File

@ -1,25 +1,29 @@
; RUN: llc < %s -mtriple=thumbv7-apple-darwin10 -relocation-model=pic | FileCheck %s
; rdar://7387640
; FIXME: We still need to rewrite array reference iv of stride -4 with loop
; count iv of stride -1.
; This now reduces to a single induction variable.
; TODO: It still gets a GPR shuffle at the end of the loop.
; This is because something in instruction selection has decided
; that comparing the pre-incremented value with zero is better
; than comparing the post-incremented value with -4.
@G = external global i32 ; <i32*> [#uses=2]
@array = external global i32* ; <i32**> [#uses=1]
define arm_apcscc void @t() nounwind optsize {
; CHECK: t:
; CHECK: mov.w r2, #4000
; CHECK: movw r3, #1001
; CHECK: mov.w r2, #1000
entry:
%.pre = load i32* @G, align 4 ; <i32> [#uses=1]
br label %bb
bb: ; preds = %bb, %entry
; CHECK: LBB1_1:
; CHECK: subs r3, #1
; CHECK: cmp r3, #0
; CHECK: sub.w r2, r2, #4
; CHECK: cmp r2, #0
; CHECK: sub.w r9, r2, #1
; CHECK: mov r2, r9
%0 = phi i32 [ %.pre, %entry ], [ %3, %bb ] ; <i32> [#uses=1]
%indvar = phi i32 [ 0, %entry ], [ %indvar.next, %bb ] ; <i32> [#uses=2]
%tmp5 = sub i32 1000, %indvar ; <i32> [#uses=1]

View File

@ -1,6 +1,6 @@
; RUN: llc < %s -mtriple=thumbv7-apple-darwin | FileCheck %s
define i32 @t1(i32 %a, i32 %b, i32 %c, i32 %d) {
define i32 @t1(i32 %a, i32 %b, i32 %c, i32 %d) nounwind {
; CHECK: t1:
; CHECK: it ne
; CHECK: cmpne
@ -20,12 +20,12 @@ cond_next:
}
; FIXME: Check the number of unconditional branches after adding branch folding post ifcvt.
define i32 @t2(i32 %a, i32 %b) {
define i32 @t2(i32 %a, i32 %b) nounwind {
entry:
; CHECK: t2:
; CHECK: ite le
; CHECK: suble
; CHECK: ite gt
; CHECK: subgt
; CHECK: suble
%tmp1434 = icmp eq i32 %a, %b ; <i1> [#uses=1]
br i1 %tmp1434, label %bb17, label %bb.outer
@ -60,14 +60,14 @@ bb17: ; preds = %cond_false, %cond_true, %entry
@x = external global i32* ; <i32**> [#uses=1]
define void @foo(i32 %a) {
define void @foo(i32 %a) nounwind {
entry:
%tmp = load i32** @x ; <i32*> [#uses=1]
store i32 %a, i32* %tmp
ret void
}
define void @t3(i32 %a, i32 %b) {
define void @t3(i32 %a, i32 %b) nounwind {
entry:
; CHECK: t3:
; CHECK: it lt

View File

@ -1,5 +1,5 @@
; RUN: llc < %s -march=x86 -mattr=+sse2 -stats -realign-stack=0 |&\
; RUN: grep {asm-printer} | grep 31
; RUN: grep {asm-printer} | grep 34
target datalayout = "e-p:32:32"
define void @foo(i32* %mc, i32* %bp, i32* %ms, i32* %xmb, i32* %mpp, i32* %tpmm, i32* %ip, i32* %tpim, i32* %dpp, i32* %tpdm, i32* %bpi, i32 %M) nounwind {
@ -40,7 +40,7 @@ cond_true: ; preds = %cond_true, %entry
%tmp137.upgrd.7 = bitcast i32* %tmp137 to <2 x i64>* ; <<2 x i64>*> [#uses=1]
store <2 x i64> %tmp131, <2 x i64>* %tmp137.upgrd.7
%tmp147 = add nsw i32 %tmp.10, 8 ; <i32> [#uses=1]
%tmp.upgrd.8 = icmp slt i32 %tmp147, %M ; <i1> [#uses=1]
%tmp.upgrd.8 = icmp ne i32 %tmp147, %M ; <i1> [#uses=1]
%indvar.next = add i32 %indvar, 1 ; <i32> [#uses=1]
br i1 %tmp.upgrd.8, label %cond_true, label %return

View File

@ -1,7 +1,7 @@
; RUN: llc < %s -march=x86 -mtriple=i686-darwin | \
; RUN: grep push | count 3
define void @foo(i8** %buf, i32 %size, i32 %col, i8* %p) {
define void @foo(i8** %buf, i32 %size, i32 %col, i8* %p) nounwind {
entry:
icmp sgt i32 %size, 0 ; <i1>:0 [#uses=1]
br i1 %0, label %bb.preheader, label %return

View File

@ -35,7 +35,7 @@ cond_next36.i: ; preds = %cond_next.i
bb.i28.i: ; preds = %bb.i28.i, %cond_next36.i
; CHECK: %bb.i28.i
; CHECK: addl $2
; CHECK: addl $2
; CHECK: addl $-2
%j.0.reg2mem.0.i16.i = phi i32 [ 0, %cond_next36.i ], [ %indvar.next39.i, %bb.i28.i ] ; <i32> [#uses=2]
%din_addr.1.reg2mem.0.i17.i = phi double [ 0.000000e+00, %cond_next36.i ], [ %tmp16.i25.i, %bb.i28.i ] ; <double> [#uses=1]
%tmp1.i18.i = fptosi double %din_addr.1.reg2mem.0.i17.i to i32 ; <i32> [#uses=2]

View File

@ -1,12 +1,7 @@
; RUN: llc < %s -march=x86 >%t
; TODO: Enhance full lsr mode to get this:
; RUNX: grep {addl \\\$4,} %t | count 3
; RUNX: not grep {,%} %t
; For now, it should find this, which is still pretty good:
; RUN: not grep {addl \\\$4,} %t
; RUN: grep {,%} %t | count 6
; RUN: grep {addl \\\$4,} %t | count 3
; RUN: not grep {,%} %t
define void @foo(float* nocapture %A, float* nocapture %B, float* nocapture %C, i32 %N) nounwind {
entry:

View File

@ -1,11 +1,11 @@
; RUN: llc < %s -march=x86-64 -o %t
; RUN: grep inc %t | count 1
; RUN: not grep inc %t
; RUN: grep dec %t | count 2
; RUN: grep addq %t | count 13
; RUN: not grep addb %t
; RUN: grep leaq %t | count 9
; RUN: grep leal %t | count 3
; RUN: grep movq %t | count 5
; RUN: not grep leaq %t
; RUN: not grep leal %t
; RUN: not grep movq %t
; IV users in each of the loops from other loops shouldn't cause LSR
; to insert new induction variables. Previously it would create a

View File

@ -1,11 +1,24 @@
; RUN: llc < %s -march=x86 -relocation-model=pic | \
; RUN: grep {, 4} | count 1
; RUN: llc < %s -march=x86 | not grep lea
; RUN: llc < %s -march=x86 -relocation-model=pic | FileCheck %s -check-prefix=PIC
; RUN: llc < %s -march=x86 -relocation-model=static | FileCheck %s -check-prefix=STATIC
;
; Make sure the common loop invariant A is hoisted up to preheader,
; since too many registers are needed to subsume it into the addressing modes.
; It's safe to sink A in when it's not pic.
; PIC: align
; PIC: movl $4, -4([[REG:%e[a-z]+]])
; PIC: movl $5, ([[REG]])
; PIC: addl $4, [[REG]]
; PIC: decl {{%e[[a-z]+}}
; PIC: jne
; STATIC: align
; STATIC: movl $4, -4(%ecx)
; STATIC: movl $5, (%ecx)
; STATIC: addl $4, %ecx
; STATIC: decl %eax
; STATIC: jne
@A = global [16 x [16 x i32]] zeroinitializer, align 32 ; <[16 x [16 x i32]]*> [#uses=2]
define void @test(i32 %row, i32 %N.in) nounwind {

View File

@ -1,8 +1,11 @@
; RUN: llc < %s -mtriple=i386-apple-darwin -relocation-model=dynamic-no-pic | \
; RUN: grep {A+} | count 2
;
; Make sure the common loop invariant A is not hoisted up to preheader,
; since it can be subsumed into the addressing modes.
; RUN: llc < %s -mtriple=i386-apple-darwin -relocation-model=dynamic-no-pic | FileCheck %s
; CHECK: align
; CHECK: movl $4, -4(%ecx)
; CHECK: movl $5, (%ecx)
; CHECK: addl $4, %ecx
; CHECK: decl %eax
; CHECK: jne
@A = global [16 x [16 x i32]] zeroinitializer, align 32 ; <[16 x [16 x i32]]*> [#uses=2]

View File

@ -1,8 +1,11 @@
; RUN: llc < %s -march=x86 -relocation-model=static | \
; RUN: grep {A+} | count 2
;
; Make sure the common loop invariant A is not hoisted up to preheader,
; since it can be subsumed into the addressing mode in all uses.
; RUN: llc < %s -march=x86 -relocation-model=static | FileCheck %s
; CHECK: align
; CHECK: movl $4, -4(%ecx)
; CHECK: movl $5, (%ecx)
; CHECK: addl $4, %ecx
; CHECK: decl %eax
; CHECK: jne
@A = internal global [16 x [16 x i32]] zeroinitializer, align 32 ; <[16 x [16 x i32]]*> [#uses=2]

View File

@ -1,5 +1,19 @@
; RUN: llc < %s -march=x86 | grep cmp | grep 64
; RUN: llc < %s -march=x86 | not grep inc
; RUN: llc < %s -march=x86 -relocation-model=static -mtriple=i686-apple-darwin | FileCheck %s -check-prefix=STATIC
; RUN: llc < %s -march=x86 -relocation-model=pic | FileCheck %s -check-prefix=PIC
; By starting the IV at -64 instead of 0, a cmp is eliminated,
; as the flags from the add can be used directly.
; STATIC: movl $-64, %ecx
; STATIC: movl %eax, _state+76(%ecx)
; STATIC: addl $16, %ecx
; STATIC: jne
; In PIC mode the symbol can't be folded, so the change-compare-stride
; trick applies.
; PIC: cmpl $64
@state = external global [0 x i32] ; <[0 x i32]*> [#uses=4]
@S = external global [0 x i32] ; <[0 x i32]*> [#uses=4]

View File

@ -1,4 +1,10 @@
; RUN: llc < %s -mtriple=i386-apple-darwin | grep leal | not grep 16
; RUN: llc < %s -mtriple=i386-apple-darwin | FileCheck %s
; CHECK: leal 16(%eax), %edx
; CHECK: align
; CHECK: addl $4, %edx
; CHECK: decl %ecx
; CHECK: jne LBB1_2
%struct.CUMULATIVE_ARGS = type { i32, i32, i32, i32, i32, i32, i32 }
%struct.bitmap_element = type { %struct.bitmap_element*, %struct.bitmap_element*, i32, [2 x i64] }

View File

@ -0,0 +1,386 @@
; RUN: llc < %s -march=x86-64 -O3 | FileCheck %s
; Data layout: little-endian ("e"); pointers are 64 bits wide with 64-bit ABI
; and preferred alignment ("p:64:64:64").
target datalayout = "e-p:64:64:64"
; Generic x86-64 triple with the vendor and OS left unspecified.
target triple = "x86_64-unknown-unknown"
; Full strength reduction reduces register pressure from 5 to 4 here.
; Instruction selection should use the FLAGS value from the dec for
; the branch. Scheduling should push the adds upwards.
; CHECK: full_me_0:
; CHECK: movsd (%rsi), %xmm0
; CHECK: addq $8, %rsi
; CHECK: mulsd (%rdx), %xmm0
; CHECK: addq $8, %rdx
; CHECK: movsd %xmm0, (%rdi)
; CHECK: addq $8, %rdi
; CHECK: decq %rcx
; CHECK: jne
; full_me_0 - For i = 0 .. %n-1, store B[i] * C[i] into A[i].
; The loop is entered only when %n is strictly positive (signed compare).
define void @full_me_0(double* nocapture %A, double* nocapture %B, double* nocapture %C, i64 %n) nounwind {
entry:
; Guard: skip the loop entirely when %n <= 0.
%t0 = icmp sgt i64 %n, 0
br i1 %t0, label %loop, label %return
loop:
; Counting induction variable: starts at 0 and steps by 1.
%i = phi i64 [ %i.next, %loop ], [ 0, %entry ]
; All three element addresses are indexed by the same %i.
%Ai = getelementptr inbounds double* %A, i64 %i
%Bi = getelementptr inbounds double* %B, i64 %i
%Ci = getelementptr inbounds double* %C, i64 %i
%t1 = load double* %Bi
%t2 = load double* %Ci
%m = fmul double %t1, %t2
store double %m, double* %Ai
%i.next = add nsw i64 %i, 1
; Besides the addresses above, the exit test is the only other user of the
; induction variable in this loop.
%exitcond = icmp eq i64 %i.next, %n
br i1 %exitcond, label %return, label %loop
return:
ret void
}
; Mostly-full strength reduction means we do full strength reduction on all
; except for the offsets.
;
; Given a choice between constant offsets -2048 and 2048, choose the negative
; value, because at boundary conditions it has a smaller encoding.
; TODO: That's an over-general heuristic. It would be better for the target
; to indicate what the encoding cost would be. Then using a 2048 offset
; would be better on x86-64, since the start value would be 0 instead of
; 2048.
; CHECK: mostly_full_me_0:
; CHECK: movsd -2048(%rsi), %xmm0
; CHECK: mulsd -2048(%rdx), %xmm0
; CHECK: movsd %xmm0, -2048(%rdi)
; CHECK: movsd (%rsi), %xmm0
; CHECK: addq $8, %rsi
; CHECK: divsd (%rdx), %xmm0
; CHECK: addq $8, %rdx
; CHECK: movsd %xmm0, (%rdi)
; CHECK: addq $8, %rdi
; CHECK: decq %rcx
; CHECK: jne
; mostly_full_me_0 - For i = 0 .. %n-1:
;   A[i]       = B[i]       * C[i]
;   A[i + 256] = B[i + 256] / C[i + 256]
; The two access groups are 256 doubles (2048 bytes) apart.
define void @mostly_full_me_0(double* nocapture %A, double* nocapture %B, double* nocapture %C, i64 %n) nounwind {
entry:
; Guard: skip the loop entirely when %n <= 0.
%t0 = icmp sgt i64 %n, 0
br i1 %t0, label %loop, label %return
loop:
; Counting induction variable: starts at 0 and steps by 1.
%i = phi i64 [ %i.next, %loop ], [ 0, %entry ]
; First group: elements at index %i.
%Ai = getelementptr inbounds double* %A, i64 %i
%Bi = getelementptr inbounds double* %B, i64 %i
%Ci = getelementptr inbounds double* %C, i64 %i
%t1 = load double* %Bi
%t2 = load double* %Ci
%m = fmul double %t1, %t2
store double %m, double* %Ai
; Second group: elements at index %i + 256.
%j = add i64 %i, 256
%Aj = getelementptr inbounds double* %A, i64 %j
%Bj = getelementptr inbounds double* %B, i64 %j
%Cj = getelementptr inbounds double* %C, i64 %j
%t3 = load double* %Bj
%t4 = load double* %Cj
; fdiv (rather than fmul) makes the second group distinguishable from the
; first in the generated code.
%o = fdiv double %t3, %t4
store double %o, double* %Aj
%i.next = add nsw i64 %i, 1
%exitcond = icmp eq i64 %i.next, %n
br i1 %exitcond, label %return, label %loop
return:
ret void
}
; A minor variation on mostly_full_me_0.
; Prefer to start the indvar at 0.
; CHECK: mostly_full_me_1:
; CHECK: movsd (%rsi), %xmm0
; CHECK: mulsd (%rdx), %xmm0
; CHECK: movsd %xmm0, (%rdi)
; CHECK: movsd -2048(%rsi), %xmm0
; CHECK: addq $8, %rsi
; CHECK: divsd -2048(%rdx), %xmm0
; CHECK: addq $8, %rdx
; CHECK: movsd %xmm0, -2048(%rdi)
; CHECK: addq $8, %rdi
; CHECK: decq %rcx
; CHECK: jne
; mostly_full_me_1 - For i = 0 .. %n-1:
;   A[i]       = B[i]       * C[i]
;   A[i - 256] = B[i - 256] / C[i - 256]
; The second access group sits 256 doubles (2048 bytes) below the first.
define void @mostly_full_me_1(double* nocapture %A, double* nocapture %B, double* nocapture %C, i64 %n) nounwind {
entry:
; Guard: skip the loop entirely when %n <= 0.
%t0 = icmp sgt i64 %n, 0
br i1 %t0, label %loop, label %return
loop:
; Counting induction variable: starts at 0 and steps by 1.
%i = phi i64 [ %i.next, %loop ], [ 0, %entry ]
; First group: elements at index %i.
%Ai = getelementptr inbounds double* %A, i64 %i
%Bi = getelementptr inbounds double* %B, i64 %i
%Ci = getelementptr inbounds double* %C, i64 %i
%t1 = load double* %Bi
%t2 = load double* %Ci
%m = fmul double %t1, %t2
store double %m, double* %Ai
; Second group: elements at index %i - 256.
%j = sub i64 %i, 256
%Aj = getelementptr inbounds double* %A, i64 %j
%Bj = getelementptr inbounds double* %B, i64 %j
%Cj = getelementptr inbounds double* %C, i64 %j
%t3 = load double* %Bj
%t4 = load double* %Cj
%o = fdiv double %t3, %t4
store double %o, double* %Aj
%i.next = add nsw i64 %i, 1
%exitcond = icmp eq i64 %i.next, %n
br i1 %exitcond, label %return, label %loop
return:
ret void
}
; A slightly less minor variation on mostly_full_me_0.
; CHECK: mostly_full_me_2:
; CHECK: movsd (%rsi), %xmm0
; CHECK: mulsd (%rdx), %xmm0
; CHECK: movsd %xmm0, (%rdi)
; CHECK: movsd -4096(%rsi), %xmm0
; CHECK: addq $8, %rsi
; CHECK: divsd -4096(%rdx), %xmm0
; CHECK: addq $8, %rdx
; CHECK: movsd %xmm0, -4096(%rdi)
; CHECK: addq $8, %rdi
; CHECK: decq %rcx
; CHECK: jne
; mostly_full_me_2 - For i = 0 .. %n-1:
;   A[i + 256] = B[i + 256] * C[i + 256]
;   A[i - 256] = B[i - 256] / C[i - 256]
; Neither group is addressed by %i itself; the two groups are 512 doubles
; (4096 bytes) apart.
define void @mostly_full_me_2(double* nocapture %A, double* nocapture %B, double* nocapture %C, i64 %n) nounwind {
entry:
; Guard: skip the loop entirely when %n <= 0.
%t0 = icmp sgt i64 %n, 0
br i1 %t0, label %loop, label %return
loop:
; Counting induction variable: starts at 0 and steps by 1.
%i = phi i64 [ %i.next, %loop ], [ 0, %entry ]
; First group: elements at index %i + 256.
%k = add i64 %i, 256
%Ak = getelementptr inbounds double* %A, i64 %k
%Bk = getelementptr inbounds double* %B, i64 %k
%Ck = getelementptr inbounds double* %C, i64 %k
%t1 = load double* %Bk
%t2 = load double* %Ck
%m = fmul double %t1, %t2
store double %m, double* %Ak
; Second group: elements at index %i - 256.
%j = sub i64 %i, 256
%Aj = getelementptr inbounds double* %A, i64 %j
%Bj = getelementptr inbounds double* %B, i64 %j
%Cj = getelementptr inbounds double* %C, i64 %j
%t3 = load double* %Bj
%t4 = load double* %Cj
%o = fdiv double %t3, %t4
store double %o, double* %Aj
%i.next = add nsw i64 %i, 1
%exitcond = icmp eq i64 %i.next, %n
br i1 %exitcond, label %return, label %loop
return:
ret void
}
; In this test, the counting IV exit value is used, so full strength reduction
; would not reduce register pressure. IndVarSimplify ought to simplify such
; cases away, but it's useful here to verify that LSR's register pressure
; heuristics are working as expected.
; CHECK: count_me_0:
; CHECK: movsd (%rsi,%rax,8), %xmm0
; CHECK: mulsd (%rdx,%rax,8), %xmm0
; CHECK: movsd %xmm0, (%rdi,%rax,8)
; CHECK: incq %rax
; CHECK: cmpq %rax, %rcx
; CHECK: jne
; count_me_0 - For i = 0 .. %n-1, store B[i] * C[i] into A[i], then return the
; final value of the incremented counter (or 0 if the loop was never entered).
define i64 @count_me_0(double* nocapture %A, double* nocapture %B, double* nocapture %C, i64 %n) nounwind {
entry:
; Guard: skip the loop entirely when %n <= 0.
%t0 = icmp sgt i64 %n, 0
br i1 %t0, label %loop, label %return
loop:
; Counting induction variable: starts at 0 and steps by 1.
%i = phi i64 [ %i.next, %loop ], [ 0, %entry ]
%Ai = getelementptr inbounds double* %A, i64 %i
%Bi = getelementptr inbounds double* %B, i64 %i
%Ci = getelementptr inbounds double* %C, i64 %i
%t1 = load double* %Bi
%t2 = load double* %Ci
%m = fmul double %t1, %t2
store double %m, double* %Ai
%i.next = add nsw i64 %i, 1
%exitcond = icmp eq i64 %i.next, %n
br i1 %exitcond, label %return, label %loop
return:
; %i.next is used here, outside the loop, so the counter's exit value is live
; after the loop finishes.
%q = phi i64 [ 0, %entry ], [ %i.next, %loop ]
ret i64 %q
}
; In this test, the trip count value is used, so full strength reduction
; would not reduce register pressure.
; (though it would reduce register pressure inside the loop...)
; CHECK: count_me_1:
; CHECK: movsd (%rsi,%rax,8), %xmm0
; CHECK: mulsd (%rdx,%rax,8), %xmm0
; CHECK: movsd %xmm0, (%rdi,%rax,8)
; CHECK: incq %rax
; CHECK: cmpq %rax, %rcx
; CHECK: jne
; count_me_1 - For i = 0 .. %n-1, store B[i] * C[i] into A[i], then return %n
; (or 0 if the loop was never entered).
define i64 @count_me_1(double* nocapture %A, double* nocapture %B, double* nocapture %C, i64 %n) nounwind {
entry:
; Guard: skip the loop entirely when %n <= 0.
%t0 = icmp sgt i64 %n, 0
br i1 %t0, label %loop, label %return
loop:
; Counting induction variable: starts at 0 and steps by 1.
%i = phi i64 [ %i.next, %loop ], [ 0, %entry ]
%Ai = getelementptr inbounds double* %A, i64 %i
%Bi = getelementptr inbounds double* %B, i64 %i
%Ci = getelementptr inbounds double* %C, i64 %i
%t1 = load double* %Bi
%t2 = load double* %Ci
%m = fmul double %t1, %t2
store double %m, double* %Ai
%i.next = add nsw i64 %i, 1
%exitcond = icmp eq i64 %i.next, %n
br i1 %exitcond, label %return, label %loop
return:
; %n itself is used here, outside the loop, so it stays live across the loop.
%q = phi i64 [ 0, %entry ], [ %n, %loop ]
ret i64 %q
}
; Full strength reduction doesn't save any registers here because the
; loop tripcount is a constant.
; CHECK: count_me_2:
; CHECK: movl $10, %eax
; CHECK: align
; CHECK: BB7_1:
; CHECK: movsd -40(%rdi,%rax,8), %xmm0
; CHECK: addsd -40(%rsi,%rax,8), %xmm0
; CHECK: movsd %xmm0, -40(%rdx,%rax,8)
; CHECK: movsd (%rdi,%rax,8), %xmm0
; CHECK: subsd (%rsi,%rax,8), %xmm0
; CHECK: movsd %xmm0, (%rdx,%rax,8)
; CHECK: incq %rax
; CHECK: cmpq $5010, %rax
; CHECK: jne
; count_me_2 - For i = 0 .. 4999:
;   C[i + 5]  = A[i + 5]  + B[i + 5]
;   C[i + 10] = A[i + 10] - B[i + 10]
; The trip count is the constant 5000, so there is no entry guard.
define void @count_me_2(double* nocapture %A, double* nocapture %B, double* nocapture %C) nounwind {
entry:
br label %loop
loop:
; Counting induction variable: starts at 0 and steps by 1.
%i = phi i64 [ 0, %entry ], [ %i.next, %loop ]
; First group: elements at index %i + 5.
%i5 = add i64 %i, 5
%Ai = getelementptr double* %A, i64 %i5
%t2 = load double* %Ai
%Bi = getelementptr double* %B, i64 %i5
%t4 = load double* %Bi
%t5 = fadd double %t2, %t4
%Ci = getelementptr double* %C, i64 %i5
store double %t5, double* %Ci
; Second group: elements at index %i + 10, i.e. 5 doubles (40 bytes) past the
; first group.
%i10 = add i64 %i, 10
%Ai10 = getelementptr double* %A, i64 %i10
%t9 = load double* %Ai10
%Bi10 = getelementptr double* %B, i64 %i10
%t11 = load double* %Bi10
%t12 = fsub double %t9, %t11
%Ci10 = getelementptr double* %C, i64 %i10
store double %t12, double* %Ci10
%i.next = add i64 %i, 1
; Exit after a fixed 5000 iterations.
%exitcond = icmp eq i64 %i.next, 5000
br i1 %exitcond, label %return, label %loop
return:
ret void
}
; This should be fully strength-reduced to reduce register pressure.
; CHECK: full_me_1:
; CHECK: align
; CHECK: BB8_1:
; CHECK: movsd (%rdi), %xmm0
; CHECK: addsd (%rsi), %xmm0
; CHECK: movsd %xmm0, (%rdx)
; CHECK: movsd 40(%rdi), %xmm0
; CHECK: addq $8, %rdi
; CHECK: subsd 40(%rsi), %xmm0
; CHECK: addq $8, %rsi
; CHECK: movsd %xmm0, 40(%rdx)
; CHECK: addq $8, %rdx
; CHECK: decq %rcx
; CHECK: jne
; full_me_1 - Same loop body as a fixed-trip-count version would have, but the
; exit bound is the runtime value %n:
;   C[i + 5]  = A[i + 5]  + B[i + 5]
;   C[i + 10] = A[i + 10] - B[i + 10]
; There is no entry guard, so the body executes at least once.
define void @full_me_1(double* nocapture %A, double* nocapture %B, double* nocapture %C, i64 %n) nounwind {
entry:
br label %loop
loop:
; Counting induction variable: starts at 0 and steps by 1.
%i = phi i64 [ 0, %entry ], [ %i.next, %loop ]
; First group: elements at index %i + 5.
%i5 = add i64 %i, 5
%Ai = getelementptr double* %A, i64 %i5
%t2 = load double* %Ai
%Bi = getelementptr double* %B, i64 %i5
%t4 = load double* %Bi
%t5 = fadd double %t2, %t4
%Ci = getelementptr double* %C, i64 %i5
store double %t5, double* %Ci
; Second group: elements at index %i + 10, i.e. 5 doubles (40 bytes) past the
; first group.
%i10 = add i64 %i, 10
%Ai10 = getelementptr double* %A, i64 %i10
%t9 = load double* %Ai10
%Bi10 = getelementptr double* %B, i64 %i10
%t11 = load double* %Bi10
%t12 = fsub double %t9, %t11
%Ci10 = getelementptr double* %C, i64 %i10
store double %t12, double* %Ci10
%i.next = add i64 %i, 1
; Exit when the incremented counter equals the runtime bound %n.
%exitcond = icmp eq i64 %i.next, %n
br i1 %exitcond, label %return, label %loop
return:
ret void
}
; This is a variation on full_me_0 in which the 0,+,1 induction variable
; has a non-address use, pinning that value in a register.
; CHECK: count_me_3:
; CHECK: call
; CHECK: movsd (%r15,%r13,8), %xmm0
; CHECK: mulsd (%r14,%r13,8), %xmm0
; CHECK: movsd %xmm0, (%r12,%r13,8)
; CHECK: incq %r13
; CHECK: cmpq %r13, %rbx
; CHECK: jne
; External function with no visible body; it receives the raw induction
; variable value on every iteration of the loop in count_me_3.
declare void @use(i64)
; count_me_3 - For i = 0 .. %n-1, call use(i) and then store B[i] * C[i] into
; A[i]. The loop is entered only when %n is strictly positive.
define void @count_me_3(double* nocapture %A, double* nocapture %B, double* nocapture %C, i64 %n) nounwind {
entry:
; Guard: skip the loop entirely when %n <= 0.
%t0 = icmp sgt i64 %n, 0
br i1 %t0, label %loop, label %return
loop:
; Counting induction variable: starts at 0 and steps by 1.
%i = phi i64 [ %i.next, %loop ], [ 0, %entry ]
; Non-address use of %i: its plain integer value is passed to an opaque call.
call void @use(i64 %i)
%Ai = getelementptr inbounds double* %A, i64 %i
%Bi = getelementptr inbounds double* %B, i64 %i
%Ci = getelementptr inbounds double* %C, i64 %i
%t1 = load double* %Bi
%t2 = load double* %Ci
%m = fmul double %t1, %t2
store double %m, double* %Ai
%i.next = add nsw i64 %i, 1
%exitcond = icmp eq i64 %i.next, %n
br i1 %exitcond, label %return, label %loop
return:
ret void
}

View File

@ -169,7 +169,7 @@ loop:
%indvar.i24 = and i64 %indvar, 16777215
%t3 = getelementptr double* %d, i64 %indvar.i24
%t4 = load double* %t3
%t5 = fmul double %t4, 2.3
%t5 = fdiv double %t4, 2.3
store double %t5, double* %t3
%t6 = getelementptr double* %d, i64 %indvar
%t7 = load double* %t6
@ -199,7 +199,7 @@ loop:
%indvar.i24 = ashr i64 %s1, 24
%t3 = getelementptr double* %d, i64 %indvar.i24
%t4 = load double* %t3
%t5 = fmul double %t4, 2.3
%t5 = fdiv double %t4, 2.3
store double %t5, double* %t3
%t6 = getelementptr double* %d, i64 %indvar
%t7 = load double* %t6
@ -229,7 +229,7 @@ loop:
%indvar.i24 = ashr i64 %s1, 24
%t3 = getelementptr double* %d, i64 %indvar.i24
%t4 = load double* %t3
%t5 = fmul double %t4, 2.3
%t5 = fdiv double %t4, 2.3
store double %t5, double* %t3
%t6 = getelementptr double* %d, i64 %indvar
%t7 = load double* %t6

View File

@ -1,8 +1,7 @@
; RUN: llc < %s -march=x86 -stats |& grep {Number of loads added} | grep 2
; RUN: llc < %s -march=x86 -stats |& grep {Number of register spills} | grep 1
; RUN: llc < %s -march=x86 -stats |& grep {Number of machine instrs printed} | grep 37
; RUN: llc < %s -march=x86 -stats |& grep {Number of machine instrs printed} | grep 34
; PR3495
; The loop reversal kicks in once here, resulting in one fewer instruction.
target triple = "i386-pc-linux-gnu"
@x = external global [8 x i32], align 32 ; <[8 x i32]*> [#uses=1]

View File

@ -25,7 +25,7 @@ bb1: ; preds = %bb2, %bb.nph
%j.01 = phi i64 [ %tmp9, %bb2 ], [ 0, %bb.nph ] ; <i64> [#uses=3]
%tmp3 = add i64 %j.01, %tmp1 ; <i64> [#uses=1]
%tmp4 = add i64 %j.01, %tmp2 ; <i64> [#uses=1]
%z0 = add i64 %tmp4, 5203
%z0 = add i64 %tmp3, 5203
%tmp5 = getelementptr double* %p, i64 %z0 ; <double*> [#uses=1]
%tmp6 = load double* %tmp5, align 8 ; <double> [#uses=1]
%tmp7 = fdiv double %tmp6, 2.100000e+00 ; <double> [#uses=1]

View File

@ -1,5 +1,4 @@
; RUN: opt < %s -loop-reduce -S | grep ugt
; PR2535
; RUN: llc -march=x86-64 < %s -o - | grep {cmpl \\$\[1\], %}
@.str = internal constant [4 x i8] c"%d\0A\00"
@ -16,7 +15,7 @@ forbody:
%add166 = or i32 %mul15, 1 ; <i32> [#uses=1] *
call i32 (i8*, ...)* @printf( i8* noalias getelementptr ([4 x i8]* @.str, i32 0, i32 0), i32 %add166 ) nounwind
%inc = add i32 %i.0, 1 ; <i32> [#uses=3]
%cmp = icmp ult i32 %inc, 1027 ; <i1> [#uses=1]
%cmp = icmp ne i32 %inc, 1027 ; <i1> [#uses=1]
br i1 %cmp, label %forbody, label %afterfor
afterfor: ; preds = %forcond

View File

@ -1,10 +1,15 @@
; RUN: llc %s -o - --x86-asm-syntax=att | grep {cmpl \$4}
; RUN: llc < %s -o - | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
target triple = "x86_64-apple-darwin9"
; This is like change-compare-stride-trickiness-1.ll except the comparison
; happens before the relevant use, so the comparison stride can't be
; easily changed.
; The comparison happens before the relevant use, but it can still be rewritten
; to compare with zero.
; CHECK: foo:
; CHECK: align
; CHECK: incl %eax
; CHECK-NEXT: decl %ecx
; CHECK-NEXT: jne
define void @foo() nounwind {
entry:

View File

@ -1,10 +1,12 @@
; RUN: llc %s -o - --x86-asm-syntax=att | grep {cmp. \$8}
; RUN: llc %s -o - --x86-asm-syntax=att | grep {cmp. \$10}
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
target triple = "x86_64-apple-darwin9"
; The comparison happens after the relevant use, so the stride can easily
; be changed. The comparison can be done in a narrower mode than the
; induction variable.
; TODO: By making the first store post-increment as well, the loop setup
; could be made simpler.
define void @foo() nounwind {
entry:

View File

@ -19,7 +19,7 @@ bb3: ; preds = %bb1
%tmp4 = add i32 %c_addr.1, -1 ; <i32> [#uses=1]
%c_addr.1.be = select i1 %tmp2, i32 %tmp3, i32 %tmp4 ; <i32> [#uses=1]
%indvar.next = add i32 %indvar, 1 ; <i32> [#uses=1]
; CHECK: sub i32 %lsr.iv, 1
; CHECK: add i32 %lsr.iv, -1
br label %bb6
bb6: ; preds = %bb3, %entry

View File

@ -1,5 +1,5 @@
; Check that the index of 'P[outer]' is pulled out of the loop.
; RUN: opt < %s -loop-reduce -S | \
; RUN: opt < %s -loop-reduce -S -default-data-layout="e-p:32:32:32" | \
; RUN: not grep {getelementptr.*%outer.*%INDVAR}
declare i1 @pred()

View File

@ -1,5 +1,5 @@
; Check that the index of 'P[outer]' is pulled out of the loop.
; RUN: opt < %s -loop-reduce -S | \
; RUN: opt < %s -loop-reduce -S -default-data-layout="e-p:32:32:32" | \
; RUN: not grep {getelementptr.*%outer.*%INDVAR}
declare i1 @pred()

View File

@ -1,7 +1,7 @@
; Check that this test makes INDVAR and related stuff dead, because P[indvar]
; gets reduced, making INDVAR dead.
; RUN: opt < %s -loop-reduce -S | not grep INDVAR
; RUN: opt < %s -loop-reduce -S -default-data-layout="e-p:32:32:32" | not grep INDVAR
declare i1 @pred()

View File

@ -1,4 +1,4 @@
; RUN: opt < %s -analyze -iv-users | grep {Stride i64 {3,+,2}<%loop>:}
; RUN: opt < %s -analyze -iv-users | grep {\{1,+,3,+,2\}<%loop> (post-inc)}
; The value of %r is dependent on a polynomial iteration expression.

View File

@ -7,10 +7,12 @@ define void @test(i32* %P) {
; <label>:0
br label %Loop
Loop: ; preds = %Loop, %0
%i = phi i32 [ 0, %0 ], [ %i.next, %Loop ]
%INDVAR = phi i32 [ 0, %0 ], [ %INDVAR2, %Loop ] ; <i32> [#uses=2]
%STRRED = getelementptr i32* %P, i32 %INDVAR ; <i32*> [#uses=1]
store i32 0, i32* %STRRED
%INDVAR2 = add i32 %INDVAR, 1 ; <i32> [#uses=1]
%i.next = add i32 %i, 1
%cond = call i1 @pred( ) ; <i1> [#uses=1]
br i1 %cond, label %Loop, label %Out
Out: ; preds = %Loop

View File

@ -1,5 +1,5 @@
; RUN: opt < %s -loop-reduce -S | \
; RUN: grep {add i32 %lsr.iv.next, 1}
; RUN: grep {add i32 %indvar630.ui, 1}
;
; Make sure that the use of the IV outside of the loop (the store) uses the
; post incremented value of the IV, not the preincremented value. This