2007-04-07 09:25:15 +08:00
|
|
|
//===- LoopRotation.cpp - Loop Rotation Pass ------------------------------===//
|
|
|
|
//
|
|
|
|
// The LLVM Compiler Infrastructure
|
|
|
|
//
|
2007-12-30 04:36:04 +08:00
|
|
|
// This file is distributed under the University of Illinois Open Source
|
|
|
|
// License. See LICENSE.TXT for details.
|
2007-04-07 09:25:15 +08:00
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
//
|
|
|
|
// This file implements Loop Rotation Pass.
|
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
|
|
|
#include "llvm/Transforms/Scalar.h"
|
2012-12-04 00:50:05 +08:00
|
|
|
#include "llvm/ADT/Statistic.h"
|
2015-07-23 17:34:01 +08:00
|
|
|
#include "llvm/Analysis/AliasAnalysis.h"
|
2015-01-04 20:03:27 +08:00
|
|
|
#include "llvm/Analysis/AssumptionCache.h"
|
2011-01-02 15:35:53 +08:00
|
|
|
#include "llvm/Analysis/CodeMetrics.h"
|
2011-01-08 16:24:46 +08:00
|
|
|
#include "llvm/Analysis/InstructionSimplify.h"
|
2012-12-04 00:50:05 +08:00
|
|
|
#include "llvm/Analysis/LoopPass.h"
|
2007-07-12 07:47:28 +08:00
|
|
|
#include "llvm/Analysis/ScalarEvolution.h"
|
2013-01-21 21:04:33 +08:00
|
|
|
#include "llvm/Analysis/TargetTransformInfo.h"
|
2012-02-14 08:00:23 +08:00
|
|
|
#include "llvm/Analysis/ValueTracking.h"
|
2014-03-04 19:45:46 +08:00
|
|
|
#include "llvm/IR/CFG.h"
|
2014-01-13 17:26:24 +08:00
|
|
|
#include "llvm/IR/Dominators.h"
|
2013-01-02 19:36:10 +08:00
|
|
|
#include "llvm/IR/Function.h"
|
|
|
|
#include "llvm/IR/IntrinsicInst.h"
|
2015-03-10 10:37:25 +08:00
|
|
|
#include "llvm/IR/Module.h"
|
2014-05-26 16:58:51 +08:00
|
|
|
#include "llvm/Support/CommandLine.h"
|
2012-12-04 00:50:05 +08:00
|
|
|
#include "llvm/Support/Debug.h"
|
2015-03-24 03:32:43 +08:00
|
|
|
#include "llvm/Support/raw_ostream.h"
|
2007-07-12 07:47:28 +08:00
|
|
|
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
|
2012-12-04 00:50:05 +08:00
|
|
|
#include "llvm/Transforms/Utils/Local.h"
|
2009-10-25 07:19:52 +08:00
|
|
|
#include "llvm/Transforms/Utils/SSAUpdater.h"
|
2011-01-08 15:21:31 +08:00
|
|
|
#include "llvm/Transforms/Utils/ValueMapper.h"
|
2007-04-07 09:25:15 +08:00
|
|
|
using namespace llvm;
|
|
|
|
|
2014-04-22 10:55:47 +08:00
|
|
|
#define DEBUG_TYPE "loop-rotate"
|
|
|
|
|
2014-05-26 16:58:51 +08:00
|
|
|
// Hidden command-line option "rotation-max-header-size" (initial value 16).
// The LoopRotate constructor reads it when no explicit header-size limit is
// passed in.
static cl::opt<unsigned>
DefaultRotationThreshold("rotation-max-header-size", cl::init(16), cl::Hidden,
       cl::desc("The default maximum header size for automatic loop rotation"));

// Statistic counter described as "Number of loops rotated".
STATISTIC(NumRotated, "Number of loops rotated");
|
|
|
|
namespace {
|
|
|
|
|
2009-09-02 14:11:42 +08:00
|
|
|
class LoopRotate : public LoopPass {
|
2007-04-07 09:25:15 +08:00
|
|
|
public:
|
2007-05-03 09:11:54 +08:00
|
|
|
static char ID; // Pass ID, replacement for typeid
|
2014-05-26 16:58:51 +08:00
|
|
|
LoopRotate(int SpecifiedMaxHeaderSize = -1) : LoopPass(ID) {
|
2010-10-20 01:21:58 +08:00
|
|
|
initializeLoopRotatePass(*PassRegistry::getPassRegistry());
|
2014-05-26 16:58:51 +08:00
|
|
|
if (SpecifiedMaxHeaderSize == -1)
|
|
|
|
MaxHeaderSize = DefaultRotationThreshold;
|
|
|
|
else
|
|
|
|
MaxHeaderSize = unsigned(SpecifiedMaxHeaderSize);
|
2010-10-20 01:21:58 +08:00
|
|
|
}
|
2007-05-02 05:15:47 +08:00
|
|
|
|
2007-04-10 00:11:48 +08:00
|
|
|
// LCSSA form makes instruction renaming easier.
|
2014-03-05 17:10:37 +08:00
|
|
|
void getAnalysisUsage(AnalysisUsage &AU) const override {
|
2015-07-23 17:34:01 +08:00
|
|
|
AU.addPreserved<AliasAnalysis>();
|
2015-01-04 20:03:27 +08:00
|
|
|
AU.addRequired<AssumptionCacheTracker>();
|
2014-01-13 21:07:17 +08:00
|
|
|
AU.addPreserved<DominatorTreeWrapperPass>();
|
2015-01-17 22:16:18 +08:00
|
|
|
AU.addRequired<LoopInfoWrapperPass>();
|
|
|
|
AU.addPreserved<LoopInfoWrapperPass>();
|
2008-02-15 09:24:49 +08:00
|
|
|
AU.addRequiredID(LoopSimplifyID);
|
|
|
|
AU.addPreservedID(LoopSimplifyID);
|
2007-04-07 09:25:15 +08:00
|
|
|
AU.addRequiredID(LCSSAID);
|
|
|
|
AU.addPreservedID(LCSSAID);
|
[PM] Port ScalarEvolution to the new pass manager.
This change makes ScalarEvolution a stand-alone object and just produces
one from a pass as needed. Making this work well requires making the
object movable, using references instead of overwritten pointers in
a number of places, and other refactorings.
I've also wired it up to the new pass manager and added a RUN line to
a test to exercise it under the new pass manager. This includes basic
printing support much like with other analyses.
But there is a big and somewhat scary change here. Prior to this patch
ScalarEvolution was never *actually* invalidated!!! Re-running the pass
just re-wired up the various other analyses and didn't remove any of the
existing entries in the SCEV caches or clear out anything at all. This
might seem OK as everything in SCEV that can uses ValueHandles to track
updates to the values that serve as SCEV keys. However, this still means
that as we ran SCEV over each function in the module, we kept
accumulating more and more SCEVs into the cache. At the end, we would
have a SCEV cache with every value that we ever needed a SCEV for in the
entire module!!! Yowzers. The releaseMemory routine would dump all of
this, but that isn't realy called during normal runs of the pipeline as
far as I can see.
To make matters worse, there *is* actually a key that we don't update
with value handles -- there is a map keyed off of Loop*s. Because
LoopInfo *does* release its memory from run to run, it is entirely
possible to run SCEV over one function, then over another function, and
then lookup a Loop* from the second function but find an entry inserted
for the first function! Ouch.
To make matters still worse, there are plenty of updates that *don't*
trip a value handle. It seems incredibly unlikely that today GVN or
another pass that invalidates SCEV can update values in *just* such
a way that a subsequent run of SCEV will incorrectly find lookups in
a cache, but it is theoretically possible and would be a nightmare to
debug.
With this refactoring, I've fixed all this by actually destroying and
recreating the ScalarEvolution object from run to run. Technically, this
could increase the amount of malloc traffic we see, but then again it is
also technically correct. ;] I don't actually think we're suffering from
tons of malloc traffic from SCEV because if we were, the fact that we
never clear the memory would seem more likely to have come up as an
actual problem before now. So, I've made the simple fix here. If in fact
there are serious issues with too much allocation and deallocation,
I can work on a clever fix that preserves the allocations (while
clearing the data) between each run, but I'd prefer to do that kind of
optimization with a test case / benchmark that shows why we need such
cleverness (and that can test that we actually make it faster). It's
possible that this will make some things faster by making the SCEV
caches have higher locality (due to being significantly smaller) so
until there is a clear benchmark, I think the simple change is best.
Differential Revision: http://reviews.llvm.org/D12063
llvm-svn: 245193
2015-08-17 10:08:17 +08:00
|
|
|
AU.addPreserved<ScalarEvolutionWrapperPass>();
|
[PM] Change the core design of the TTI analysis to use a polymorphic
type erased interface and a single analysis pass rather than an
extremely complex analysis group.
The end result is that the TTI analysis can contain a type erased
implementation that supports the polymorphic TTI interface. We can build
one from a target-specific implementation or from a dummy one in the IR.
I've also factored all of the code into "mix-in"-able base classes,
including CRTP base classes to facilitate calling back up to the most
specialized form when delegating horizontally across the surface. These
aren't as clean as I would like and I'm planning to work on cleaning
some of this up, but I wanted to start by putting into the right form.
There are a number of reasons for this change, and this particular
design. The first and foremost reason is that an analysis group is
complete overkill, and the chaining delegation strategy was so opaque,
confusing, and high overhead that TTI was suffering greatly for it.
Several of the TTI functions had failed to be implemented in all places
because of the chaining-based delegation making there be no checking of
this. A few other functions were implemented with incorrect delegation.
The message to me was very clear working on this -- the delegation and
analysis group structure was too confusing to be useful here.
The other reason of course is that this is *much* more natural fit for
the new pass manager. This will lay the ground work for a type-erased
per-function info object that can look up the correct subtarget and even
cache it.
Yet another benefit is that this will significantly simplify the
interaction of the pass managers and the TargetMachine. See the future
work below.
The downside of this change is that it is very, very verbose. I'm going
to work to improve that, but it is somewhat an implementation necessity
in C++ to do type erasure. =/ I discussed this design really extensively
with Eric and Hal prior to going down this path, and afterward showed
them the result. No one was really thrilled with it, but there doesn't
seem to be a substantially better alternative. Using a base class and
virtual method dispatch would make the code much shorter, but as
discussed in the update to the programmer's manual and elsewhere,
a polymorphic interface feels like the more principled approach even if
this is perhaps the least compelling example of it. ;]
Ultimately, there is still a lot more to be done here, but this was the
huge chunk that I couldn't really split things out of because this was
the interface change to TTI. I've tried to minimize all the other parts
of this. The follow up work should include at least:
1) Improving the TargetMachine interface by having it directly return
a TTI object. Because we have a non-pass object with value semantics
and an internal type erasure mechanism, we can narrow the interface
of the TargetMachine to *just* do what we need: build and return
a TTI object that we can then insert into the pass pipeline.
2) Make the TTI object be fully specialized for a particular function.
This will include splitting off a minimal form of it which is
sufficient for the inliner and the old pass manager.
3) Add a new pass manager analysis which produces TTI objects from the
target machine for each function. This may actually be done as part
of #2 in order to use the new analysis to implement #2.
4) Work on narrowing the API between TTI and the targets so that it is
easier to understand and less verbose to type erase.
5) Work on narrowing the API between TTI and its clients so that it is
easier to understand and less verbose to forward.
6) Try to improve the CRTP-based delegation. I feel like this code is
just a bit messy and exacerbating the complexity of implementing
the TTI in each target.
Many thanks to Eric and Hal for their help here. I ended up blocked on
this somewhat more abruptly than I expected, and so I appreciate getting
it sorted out very quickly.
Differential Revision: http://reviews.llvm.org/D7293
llvm-svn: 227669
2015-01-31 11:43:40 +08:00
|
|
|
AU.addRequired<TargetTransformInfoWrapperPass>();
|
2007-04-07 09:25:15 +08:00
|
|
|
}
|
|
|
|
|
2014-03-05 17:10:37 +08:00
|
|
|
bool runOnLoop(Loop *L, LPPassManager &LPM) override;
|
2013-05-07 01:58:18 +08:00
|
|
|
bool simplifyLoopLatch(Loop *L);
|
|
|
|
bool rotateLoop(Loop *L, bool SimplifiedLatch);
|
2012-02-14 08:00:19 +08:00
|
|
|
|
2007-04-07 09:25:15 +08:00
|
|
|
private:
|
2014-05-26 16:58:51 +08:00
|
|
|
unsigned MaxHeaderSize;
|
2011-01-09 01:38:45 +08:00
|
|
|
LoopInfo *LI;
|
2013-01-21 21:04:33 +08:00
|
|
|
const TargetTransformInfo *TTI;
|
2015-01-04 20:03:27 +08:00
|
|
|
AssumptionCache *AC;
|
2015-01-18 10:08:05 +08:00
|
|
|
DominatorTree *DT;
|
2007-04-07 09:25:15 +08:00
|
|
|
};
|
2015-06-23 17:49:53 +08:00
|
|
|
}
|
2012-02-14 08:00:19 +08:00
|
|
|
|
2008-05-13 08:00:25 +08:00
|
|
|
// Definition of the pass identifier declared inside LoopRotate.
char LoopRotate::ID = 0;

// Pass registration under the name "loop-rotate", followed by the passes
// named as initialization dependencies.
INITIALIZE_PASS_BEGIN(LoopRotate, "loop-rotate", "Rotate Loops", false, false)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_DEPENDENCY(LCSSA)
INITIALIZE_PASS_END(LoopRotate, "loop-rotate", "Rotate Loops", false, false)
|
2007-04-07 09:25:15 +08:00
|
|
|
|
2014-05-26 16:58:51 +08:00
|
|
|
/// Factory for the loop rotation pass. \p MaxHeaderSize is handed unchanged
/// to the LoopRotate constructor; the caller receives the newly allocated
/// pass object.
Pass *llvm::createLoopRotatePass(int MaxHeaderSize) {
  LoopRotate *RotatePass = new LoopRotate(MaxHeaderSize);
  return RotatePass;
}
|
2007-04-07 09:25:15 +08:00
|
|
|
|
2007-04-10 00:11:48 +08:00
|
|
|
/// Rotate Loop L as many times as possible. Return true if
|
2009-06-25 08:22:44 +08:00
|
|
|
/// the loop is rotated at least once.
|
2011-01-09 01:48:33 +08:00
|
|
|
bool LoopRotate::runOnLoop(Loop *L, LPPassManager &LPM) {
|
2014-02-06 08:07:05 +08:00
|
|
|
if (skipOptnoneFunction(L))
|
|
|
|
return false;
|
|
|
|
|
2014-04-15 17:37:30 +08:00
|
|
|
// Save the loop metadata.
|
|
|
|
MDNode *LoopMD = L->getLoopID();
|
|
|
|
|
2015-02-01 20:01:35 +08:00
|
|
|
Function &F = *L->getHeader()->getParent();
|
|
|
|
|
2015-01-17 22:16:18 +08:00
|
|
|
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
|
2015-02-01 20:01:35 +08:00
|
|
|
TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
|
|
|
|
AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
|
2015-01-18 10:08:05 +08:00
|
|
|
auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
|
|
|
|
DT = DTWP ? &DTWP->getDomTree() : nullptr;
|
2007-07-12 07:47:28 +08:00
|
|
|
|
2012-02-14 08:00:23 +08:00
|
|
|
// Simplify the loop latch before attempting to rotate the header
|
|
|
|
// upward. Rotation may not be needed if the loop tail can be folded into the
|
|
|
|
// loop exit.
|
2013-05-07 01:58:18 +08:00
|
|
|
bool SimplifiedLatch = simplifyLoopLatch(L);
|
2012-02-14 08:00:23 +08:00
|
|
|
|
2007-04-07 09:25:15 +08:00
|
|
|
// One loop can be rotated multiple times.
|
2011-01-09 01:38:45 +08:00
|
|
|
bool MadeChange = false;
|
2013-05-07 01:58:18 +08:00
|
|
|
while (rotateLoop(L, SimplifiedLatch)) {
|
2011-01-09 01:38:45 +08:00
|
|
|
MadeChange = true;
|
2013-05-07 01:58:18 +08:00
|
|
|
SimplifiedLatch = false;
|
|
|
|
}
|
2014-04-15 17:37:30 +08:00
|
|
|
|
|
|
|
// Restore the loop metadata.
|
|
|
|
// NB! We presume LoopRotation DOESN'T ADD its own metadata.
|
|
|
|
if ((MadeChange || SimplifiedLatch) && LoopMD)
|
|
|
|
L->setLoopID(LoopMD);
|
|
|
|
|
2011-01-09 01:38:45 +08:00
|
|
|
return MadeChange;
|
2007-04-07 09:25:15 +08:00
|
|
|
}
|
|
|
|
|
2011-01-09 03:26:33 +08:00
|
|
|
/// RewriteUsesOfClonedInstructions - We just cloned the instructions from the
|
|
|
|
/// old header into the preheader. If there were uses of the values produced by
|
|
|
|
/// these instruction that were outside of the loop, we have to insert PHI nodes
|
|
|
|
/// to merge the two values. Do this now.
|
|
|
|
static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader,
|
|
|
|
BasicBlock *OrigPreheader,
|
|
|
|
ValueToValueMapTy &ValueMap) {
|
|
|
|
// Remove PHI node entries that are no longer live.
|
|
|
|
BasicBlock::iterator I, E = OrigHeader->end();
|
|
|
|
for (I = OrigHeader->begin(); PHINode *PN = dyn_cast<PHINode>(I); ++I)
|
|
|
|
PN->removeIncomingValue(PN->getBasicBlockIndex(OrigPreheader));
|
2012-02-14 08:00:19 +08:00
|
|
|
|
2011-01-09 03:26:33 +08:00
|
|
|
// Now fix up users of the instructions in OrigHeader, inserting PHI nodes
|
|
|
|
// as necessary.
|
|
|
|
SSAUpdater SSA;
|
|
|
|
for (I = OrigHeader->begin(); I != E; ++I) {
|
|
|
|
Value *OrigHeaderVal = I;
|
2012-02-14 08:00:19 +08:00
|
|
|
|
2011-01-09 03:26:33 +08:00
|
|
|
// If there are no uses of the value (e.g. because it returns void), there
|
|
|
|
// is nothing to rewrite.
|
|
|
|
if (OrigHeaderVal->use_empty())
|
|
|
|
continue;
|
2012-02-14 08:00:19 +08:00
|
|
|
|
2011-01-09 03:26:33 +08:00
|
|
|
Value *OrigPreHeaderVal = ValueMap[OrigHeaderVal];
|
|
|
|
|
|
|
|
// The value now exits in two versions: the initial value in the preheader
|
|
|
|
// and the loop "next" value in the original header.
|
|
|
|
SSA.Initialize(OrigHeaderVal->getType(), OrigHeaderVal->getName());
|
|
|
|
SSA.AddAvailableValue(OrigHeader, OrigHeaderVal);
|
|
|
|
SSA.AddAvailableValue(OrigPreheader, OrigPreHeaderVal);
|
2012-02-14 08:00:19 +08:00
|
|
|
|
2011-01-09 03:26:33 +08:00
|
|
|
// Visit each use of the OrigHeader instruction.
|
|
|
|
for (Value::use_iterator UI = OrigHeaderVal->use_begin(),
|
|
|
|
UE = OrigHeaderVal->use_end(); UI != UE; ) {
|
|
|
|
// Grab the use before incrementing the iterator.
|
2014-03-09 11:16:01 +08:00
|
|
|
Use &U = *UI;
|
2012-02-14 08:00:19 +08:00
|
|
|
|
2011-01-09 03:26:33 +08:00
|
|
|
// Increment the iterator before removing the use from the list.
|
|
|
|
++UI;
|
2012-02-14 08:00:19 +08:00
|
|
|
|
2011-01-09 03:26:33 +08:00
|
|
|
// SSAUpdater can't handle a non-PHI use in the same block as an
|
|
|
|
// earlier def. We can easily handle those cases manually.
|
|
|
|
Instruction *UserInst = cast<Instruction>(U.getUser());
|
|
|
|
if (!isa<PHINode>(UserInst)) {
|
|
|
|
BasicBlock *UserBB = UserInst->getParent();
|
2012-02-14 08:00:19 +08:00
|
|
|
|
2011-01-09 03:26:33 +08:00
|
|
|
// The original users in the OrigHeader are already using the
|
|
|
|
// original definitions.
|
|
|
|
if (UserBB == OrigHeader)
|
|
|
|
continue;
|
2012-02-14 08:00:19 +08:00
|
|
|
|
2011-01-09 03:26:33 +08:00
|
|
|
// Users in the OrigPreHeader need to use the value to which the
|
|
|
|
// original definitions are mapped.
|
|
|
|
if (UserBB == OrigPreheader) {
|
|
|
|
U = OrigPreHeaderVal;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
2012-02-14 08:00:19 +08:00
|
|
|
|
2011-01-09 03:26:33 +08:00
|
|
|
// Anything else can be handled by SSAUpdater.
|
|
|
|
SSA.RewriteUse(U);
|
|
|
|
}
|
|
|
|
}
|
2012-02-14 08:00:19 +08:00
|
|
|
}
|
2011-01-09 03:26:33 +08:00
|
|
|
|
2014-08-06 07:27:34 +08:00
|
|
|
/// Determine whether the instructions in this range may be safely and cheaply
|
2012-02-14 08:00:23 +08:00
|
|
|
/// speculated. This is not an important enough situation to develop complex
|
|
|
|
/// heuristics. We handle a single arithmetic instruction along with any type
|
|
|
|
/// conversions.
|
|
|
|
static bool shouldSpeculateInstrs(BasicBlock::iterator Begin,
|
2014-10-30 04:19:47 +08:00
|
|
|
BasicBlock::iterator End, Loop *L) {
|
2012-02-14 08:00:23 +08:00
|
|
|
bool seenIncrement = false;
|
2014-10-30 04:19:47 +08:00
|
|
|
bool MultiExitLoop = false;
|
|
|
|
|
|
|
|
if (!L->getExitingBlock())
|
|
|
|
MultiExitLoop = true;
|
|
|
|
|
2012-02-14 08:00:23 +08:00
|
|
|
for (BasicBlock::iterator I = Begin; I != End; ++I) {
|
|
|
|
|
|
|
|
if (!isSafeToSpeculativelyExecute(I))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
if (isa<DbgInfoIntrinsic>(I))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
switch (I->getOpcode()) {
|
|
|
|
default:
|
|
|
|
return false;
|
|
|
|
case Instruction::GetElementPtr:
|
|
|
|
// GEPs are cheap if all indices are constant.
|
|
|
|
if (!cast<GEPOperator>(I)->hasAllConstantIndices())
|
|
|
|
return false;
|
|
|
|
// fall-thru to increment case
|
|
|
|
case Instruction::Add:
|
|
|
|
case Instruction::Sub:
|
|
|
|
case Instruction::And:
|
|
|
|
case Instruction::Or:
|
|
|
|
case Instruction::Xor:
|
|
|
|
case Instruction::Shl:
|
|
|
|
case Instruction::LShr:
|
2014-10-30 04:19:47 +08:00
|
|
|
case Instruction::AShr: {
|
2015-01-27 14:21:43 +08:00
|
|
|
Value *IVOpnd = !isa<Constant>(I->getOperand(0))
|
|
|
|
? I->getOperand(0)
|
|
|
|
: !isa<Constant>(I->getOperand(1))
|
|
|
|
? I->getOperand(1)
|
|
|
|
: nullptr;
|
|
|
|
if (!IVOpnd)
|
|
|
|
return false;
|
2014-10-30 04:19:47 +08:00
|
|
|
|
|
|
|
// If increment operand is used outside of the loop, this speculation
|
|
|
|
// could cause extra live range interference.
|
2015-01-27 14:21:43 +08:00
|
|
|
if (MultiExitLoop) {
|
2014-10-30 04:19:47 +08:00
|
|
|
for (User *UseI : IVOpnd->users()) {
|
|
|
|
auto *UserInst = cast<Instruction>(UseI);
|
|
|
|
if (!L->contains(UserInst))
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2012-02-14 08:00:23 +08:00
|
|
|
if (seenIncrement)
|
|
|
|
return false;
|
|
|
|
seenIncrement = true;
|
|
|
|
break;
|
2014-10-30 04:19:47 +08:00
|
|
|
}
|
2012-02-14 08:00:23 +08:00
|
|
|
case Instruction::Trunc:
|
|
|
|
case Instruction::ZExt:
|
|
|
|
case Instruction::SExt:
|
|
|
|
// ignore type conversions
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Fold the loop tail into the loop exit by speculating the loop tail
|
|
|
|
/// instructions. Typically, this is a single post-increment. In the case of a
|
|
|
|
/// simple 2-block loop, hoisting the increment can be much better than
|
2014-08-06 07:27:34 +08:00
|
|
|
/// duplicating the entire loop header. In the case of loops with early exits,
|
2012-02-14 08:00:23 +08:00
|
|
|
/// rotation will not work anyway, but simplifyLoopLatch will put the loop in
|
|
|
|
/// canonical form so downstream passes can handle it.
|
|
|
|
///
|
|
|
|
/// I don't believe this invalidates SCEV.
|
2013-05-07 01:58:18 +08:00
|
|
|
bool LoopRotate::simplifyLoopLatch(Loop *L) {
|
2012-02-14 08:00:23 +08:00
|
|
|
BasicBlock *Latch = L->getLoopLatch();
|
|
|
|
if (!Latch || Latch->hasAddressTaken())
|
2013-05-07 01:58:18 +08:00
|
|
|
return false;
|
2012-02-14 08:00:23 +08:00
|
|
|
|
|
|
|
BranchInst *Jmp = dyn_cast<BranchInst>(Latch->getTerminator());
|
|
|
|
if (!Jmp || !Jmp->isUnconditional())
|
2013-05-07 01:58:18 +08:00
|
|
|
return false;
|
2012-02-14 08:00:23 +08:00
|
|
|
|
|
|
|
BasicBlock *LastExit = Latch->getSinglePredecessor();
|
|
|
|
if (!LastExit || !L->isLoopExiting(LastExit))
|
2013-05-07 01:58:18 +08:00
|
|
|
return false;
|
2012-02-14 08:00:23 +08:00
|
|
|
|
|
|
|
BranchInst *BI = dyn_cast<BranchInst>(LastExit->getTerminator());
|
|
|
|
if (!BI)
|
2013-05-07 01:58:18 +08:00
|
|
|
return false;
|
2012-02-14 08:00:23 +08:00
|
|
|
|
2014-10-30 04:19:47 +08:00
|
|
|
if (!shouldSpeculateInstrs(Latch->begin(), Jmp, L))
|
2013-05-07 01:58:18 +08:00
|
|
|
return false;
|
2012-02-14 08:00:23 +08:00
|
|
|
|
|
|
|
DEBUG(dbgs() << "Folding loop latch " << Latch->getName() << " into "
|
|
|
|
<< LastExit->getName() << "\n");
|
|
|
|
|
|
|
|
// Hoist the instructions from Latch into LastExit.
|
|
|
|
LastExit->getInstList().splice(BI, Latch->getInstList(), Latch->begin(), Jmp);
|
|
|
|
|
|
|
|
unsigned FallThruPath = BI->getSuccessor(0) == Latch ? 0 : 1;
|
|
|
|
BasicBlock *Header = Jmp->getSuccessor(0);
|
|
|
|
assert(Header == L->getHeader() && "expected a backward branch");
|
|
|
|
|
|
|
|
// Remove Latch from the CFG so that LastExit becomes the new Latch.
|
|
|
|
BI->setSuccessor(FallThruPath, Header);
|
|
|
|
Latch->replaceSuccessorsPhiUsesWith(LastExit);
|
|
|
|
Jmp->eraseFromParent();
|
|
|
|
|
|
|
|
// Nuke the Latch block.
|
|
|
|
assert(Latch->empty() && "unable to evacuate Latch");
|
|
|
|
LI->removeBlock(Latch);
|
2015-01-18 10:08:05 +08:00
|
|
|
if (DT)
|
|
|
|
DT->eraseNode(Latch);
|
2012-02-14 08:00:23 +08:00
|
|
|
Latch->eraseFromParent();
|
2013-05-07 01:58:18 +08:00
|
|
|
return true;
|
2012-02-14 08:00:23 +08:00
|
|
|
}
|
|
|
|
|
2007-05-12 05:10:54 +08:00
|
|
|
/// Rotate loop L. Return true if the loop is rotated.
|
2013-05-07 01:58:18 +08:00
|
|
|
///
|
|
|
|
/// \param SimplifiedLatch is true if the latch was just folded into the final
|
|
|
|
/// loop exit. In this case we may want to rotate even though the new latch is
|
|
|
|
/// now an exiting branch. This rotation would have happened had the latch not
|
|
|
|
/// been simplified. However, if SimplifiedLatch is false, then we avoid
|
|
|
|
/// rotating loops in which the latch exits to avoid excessive or endless
|
|
|
|
/// rotation. LoopRotate should be repeatable and converge to a canonical
|
|
|
|
/// form. This property is satisfied because simplifying the loop latch can only
|
|
|
|
/// happen once across multiple invocations of the LoopRotate pass.
|
|
|
|
bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
|
2009-06-25 08:22:44 +08:00
|
|
|
// If the loop has only one block then there is not much to rotate.
|
2007-04-10 00:11:48 +08:00
|
|
|
if (L->getBlocks().size() == 1)
|
2007-04-07 09:25:15 +08:00
|
|
|
return false;
|
2012-02-14 08:00:19 +08:00
|
|
|
|
2011-01-09 02:06:22 +08:00
|
|
|
BasicBlock *OrigHeader = L->getHeader();
|
2012-08-30 23:39:42 +08:00
|
|
|
BasicBlock *OrigLatch = L->getLoopLatch();
|
2012-02-14 08:00:19 +08:00
|
|
|
|
2011-01-09 02:06:22 +08:00
|
|
|
BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator());
|
2014-04-25 13:29:35 +08:00
|
|
|
if (!BI || BI->isUnconditional())
|
2011-01-09 02:06:22 +08:00
|
|
|
return false;
|
2012-02-14 08:00:19 +08:00
|
|
|
|
2009-06-25 08:22:44 +08:00
|
|
|
// If the loop header is not one of the loop exiting blocks then
|
|
|
|
// either this loop is already rotated or it is not
|
2007-04-07 09:25:15 +08:00
|
|
|
// suitable for loop rotation transformations.
|
2009-10-25 07:34:26 +08:00
|
|
|
if (!L->isLoopExiting(OrigHeader))
|
2007-04-07 09:25:15 +08:00
|
|
|
return false;
|
|
|
|
|
2012-08-30 23:39:42 +08:00
|
|
|
// If the loop latch already contains a branch that leaves the loop then the
|
|
|
|
// loop is already rotated.
|
2014-04-25 13:29:35 +08:00
|
|
|
if (!OrigLatch)
|
2013-05-07 01:58:18 +08:00
|
|
|
return false;
|
|
|
|
|
|
|
|
// Rotate if either the loop latch does *not* exit the loop, or if the loop
|
|
|
|
// latch was just simplified.
|
|
|
|
if (L->isLoopExiting(OrigLatch) && !SimplifiedLatch)
|
2007-04-07 09:25:15 +08:00
|
|
|
return false;
|
|
|
|
|
2012-12-21 00:04:27 +08:00
|
|
|
// Check size of original header and reject loop if it is very big or we can't
|
|
|
|
// duplicate blocks inside it.
|
2011-01-02 15:35:53 +08:00
|
|
|
{
|
2014-09-07 21:49:57 +08:00
|
|
|
SmallPtrSet<const Value *, 32> EphValues;
|
2015-01-04 20:03:27 +08:00
|
|
|
CodeMetrics::collectEphemeralValues(L, AC, EphValues);
|
2014-09-07 21:49:57 +08:00
|
|
|
|
2011-01-02 15:35:53 +08:00
|
|
|
CodeMetrics Metrics;
|
2014-09-07 21:49:57 +08:00
|
|
|
Metrics.analyzeBasicBlock(OrigHeader, *TTI, EphValues);
|
2012-12-21 00:04:27 +08:00
|
|
|
if (Metrics.notDuplicatable) {
|
2013-12-05 13:44:44 +08:00
|
|
|
DEBUG(dbgs() << "LoopRotation: NOT rotating - contains non-duplicatable"
|
2012-12-21 00:04:27 +08:00
|
|
|
<< " instructions: "; L->dump());
|
|
|
|
return false;
|
|
|
|
}
|
2014-05-26 16:58:51 +08:00
|
|
|
if (Metrics.NumInsts > MaxHeaderSize)
|
2011-01-02 15:35:53 +08:00
|
|
|
return false;
|
2009-03-06 11:51:30 +08:00
|
|
|
}
|
|
|
|
|
2007-07-12 07:47:28 +08:00
|
|
|
// Now, this loop is suitable for rotation.
|
2011-01-09 03:26:33 +08:00
|
|
|
BasicBlock *OrigPreheader = L->getLoopPreheader();
|
2012-02-14 08:00:19 +08:00
|
|
|
|
2011-04-09 15:25:58 +08:00
|
|
|
// If the loop could not be converted to canonical form, it must have an
|
|
|
|
// indirectbr in it, just give up.
|
2014-04-25 13:29:35 +08:00
|
|
|
if (!OrigPreheader)
|
2011-04-09 15:25:58 +08:00
|
|
|
return false;
|
2007-07-12 07:47:28 +08:00
|
|
|
|
2009-09-27 23:37:03 +08:00
|
|
|
// Anything ScalarEvolution may know about this loop or the PHI nodes
|
|
|
|
// in its header will soon be invalidated.
|
[PM] Port ScalarEvolution to the new pass manager.
This change makes ScalarEvolution a stand-alone object and just produces
one from a pass as needed. Making this work well requires making the
object movable, using references instead of overwritten pointers in
a number of places, and other refactorings.
I've also wired it up to the new pass manager and added a RUN line to
a test to exercise it under the new pass manager. This includes basic
printing support much like with other analyses.
But there is a big and somewhat scary change here. Prior to this patch
ScalarEvolution was never *actually* invalidated!!! Re-running the pass
just re-wired up the various other analyses and didn't remove any of the
existing entries in the SCEV caches or clear out anything at all. This
might seem OK as everything in SCEV that can uses ValueHandles to track
updates to the values that serve as SCEV keys. However, this still means
that as we ran SCEV over each function in the module, we kept
accumulating more and more SCEVs into the cache. At the end, we would
have a SCEV cache with every value that we ever needed a SCEV for in the
entire module!!! Yowzers. The releaseMemory routine would dump all of
this, but that isn't really called during normal runs of the pipeline as
far as I can see.
To make matters worse, there *is* actually a key that we don't update
with value handles -- there is a map keyed off of Loop*s. Because
LoopInfo *does* release its memory from run to run, it is entirely
possible to run SCEV over one function, then over another function, and
then lookup a Loop* from the second function but find an entry inserted
for the first function! Ouch.
To make matters still worse, there are plenty of updates that *don't*
trip a value handle. It seems incredibly unlikely that today GVN or
another pass that invalidates SCEV can update values in *just* such
a way that a subsequent run of SCEV will incorrectly find lookups in
a cache, but it is theoretically possible and would be a nightmare to
debug.
With this refactoring, I've fixed all this by actually destroying and
recreating the ScalarEvolution object from run to run. Technically, this
could increase the amount of malloc traffic we see, but then again it is
also technically correct. ;] I don't actually think we're suffering from
tons of malloc traffic from SCEV because if we were, the fact that we
never clear the memory would seem more likely to have come up as an
actual problem before now. So, I've made the simple fix here. If in fact
there are serious issues with too much allocation and deallocation,
I can work on a clever fix that preserves the allocations (while
clearing the data) between each run, but I'd prefer to do that kind of
optimization with a test case / benchmark that shows why we need such
cleverness (and that can test that we actually make it faster). It's
possible that this will make some things faster by making the SCEV
caches have higher locality (due to being significantly smaller) so
until there is a clear benchmark, I think the simple change is best.
Differential Revision: http://reviews.llvm.org/D12063
llvm-svn: 245193
2015-08-17 10:08:17 +08:00
|
|
|
if (auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>())
|
|
|
|
SEWP->getSE().forgetLoop(L);
|
2009-09-27 23:37:03 +08:00
|
|
|
|
2012-08-30 23:39:42 +08:00
|
|
|
DEBUG(dbgs() << "LoopRotation: rotating "; L->dump());
|
|
|
|
|
2007-04-07 09:25:15 +08:00
|
|
|
// Find new Loop header. NewHeader is a Header's one and only successor
|
2009-01-26 09:57:01 +08:00
|
|
|
// that is inside loop. Header's other successor is outside the
|
|
|
|
// loop. Otherwise loop is not suitable for rotation.
|
2011-01-09 01:48:33 +08:00
|
|
|
BasicBlock *Exit = BI->getSuccessor(0);
|
|
|
|
BasicBlock *NewHeader = BI->getSuccessor(1);
|
2007-04-10 00:11:48 +08:00
|
|
|
if (L->contains(Exit))
|
|
|
|
std::swap(Exit, NewHeader);
|
2009-01-26 09:38:24 +08:00
|
|
|
assert(NewHeader && "Unable to determine new loop header");
|
2012-02-14 08:00:19 +08:00
|
|
|
assert(L->contains(NewHeader) && !L->contains(Exit) &&
|
2007-04-10 00:11:48 +08:00
|
|
|
"Unable to determine loop header and exit blocks");
|
2012-02-14 08:00:19 +08:00
|
|
|
|
2009-06-25 08:22:44 +08:00
|
|
|
// This code assumes that the new header has exactly one predecessor.
|
|
|
|
// Remove any single-entry PHI nodes in it.
|
2009-01-26 10:11:30 +08:00
|
|
|
assert(NewHeader->getSinglePredecessor() &&
|
|
|
|
"New header doesn't have one pred!");
|
|
|
|
FoldSingleEntryPHINodes(NewHeader);
|
2007-04-07 09:25:15 +08:00
|
|
|
|
2009-10-25 07:19:52 +08:00
|
|
|
// Begin by walking OrigHeader and populating ValueMap with an entry for
|
|
|
|
// each Instruction.
|
2007-04-10 00:11:48 +08:00
|
|
|
BasicBlock::iterator I = OrigHeader->begin(), E = OrigHeader->end();
|
2011-01-08 15:21:31 +08:00
|
|
|
ValueToValueMapTy ValueMap;
|
2007-04-10 00:11:48 +08:00
|
|
|
|
2009-10-25 07:19:52 +08:00
|
|
|
// For PHI nodes, the value available in OrigPreheader is just the
|
|
|
|
// incoming value from OrigPreheader.
|
|
|
|
for (; PHINode *PN = dyn_cast<PHINode>(I); ++I)
|
2011-06-20 22:18:48 +08:00
|
|
|
ValueMap[PN] = PN->getIncomingValueForBlock(OrigPreheader);
|
2007-04-10 03:04:21 +08:00
|
|
|
|
2015-03-10 10:37:25 +08:00
|
|
|
const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
|
|
|
|
|
2010-09-06 09:10:22 +08:00
|
|
|
// For the rest of the instructions, either hoist to the OrigPreheader if
|
|
|
|
// possible or create a clone in the OrigPreheader if not.
|
2011-01-09 03:26:33 +08:00
|
|
|
TerminatorInst *LoopEntryBranch = OrigPreheader->getTerminator();
|
2010-09-06 09:10:22 +08:00
|
|
|
while (I != E) {
|
|
|
|
Instruction *Inst = I++;
|
2012-02-14 08:00:19 +08:00
|
|
|
|
2010-09-06 09:10:22 +08:00
|
|
|
// If the instruction's operands are invariant and it doesn't read or write
|
|
|
|
// memory, then it is safe to hoist. Doing this doesn't change the order of
|
|
|
|
// execution in the preheader, but does prevent the instruction from
|
|
|
|
// executing in each iteration of the loop. This means it is safe to hoist
|
|
|
|
// something that might trap, but isn't safe to hoist something that reads
|
|
|
|
// memory (without proving that the loop doesn't write).
|
|
|
|
if (L->hasLoopInvariantOperands(Inst) &&
|
|
|
|
!Inst->mayReadFromMemory() && !Inst->mayWriteToMemory() &&
|
2012-02-16 08:41:10 +08:00
|
|
|
!isa<TerminatorInst>(Inst) && !isa<DbgInfoIntrinsic>(Inst) &&
|
|
|
|
!isa<AllocaInst>(Inst)) {
|
2010-09-06 09:10:22 +08:00
|
|
|
Inst->moveBefore(LoopEntryBranch);
|
|
|
|
continue;
|
|
|
|
}
|
2012-02-14 08:00:19 +08:00
|
|
|
|
2010-09-06 09:10:22 +08:00
|
|
|
// Otherwise, create a duplicate of the instruction.
|
|
|
|
Instruction *C = Inst->clone();
|
2012-02-14 08:00:19 +08:00
|
|
|
|
2011-01-08 16:24:46 +08:00
|
|
|
// Eagerly remap the operands of the instruction.
|
|
|
|
RemapInstruction(C, ValueMap,
|
|
|
|
RF_NoModuleLevelChanges|RF_IgnoreMissingEntries);
|
2012-02-14 08:00:19 +08:00
|
|
|
|
2011-01-08 16:24:46 +08:00
|
|
|
// With the operands remapped, see if the instruction constant folds or is
|
|
|
|
// otherwise simplifiable. This commonly occurs because the entry from PHI
|
|
|
|
// nodes allows icmps and other instructions to fold.
|
2015-03-10 10:37:25 +08:00
|
|
|
// FIXME: Provide TLI, DT, AC to SimplifyInstruction.
|
|
|
|
Value *V = SimplifyInstruction(C, DL);
|
2011-01-09 01:38:45 +08:00
|
|
|
if (V && LI->replacementPreservesLCSSAForm(C, V)) {
|
2011-01-08 16:24:46 +08:00
|
|
|
// If so, then delete the temporary instruction and stick the folded value
|
|
|
|
// in the map.
|
|
|
|
delete C;
|
|
|
|
ValueMap[Inst] = V;
|
|
|
|
} else {
|
|
|
|
// Otherwise, stick the new instruction into the new block!
|
|
|
|
C->setName(Inst->getName());
|
|
|
|
C->insertBefore(LoopEntryBranch);
|
|
|
|
ValueMap[Inst] = C;
|
|
|
|
}
|
2007-04-07 09:25:15 +08:00
|
|
|
}
|
|
|
|
|
2009-10-25 07:19:52 +08:00
|
|
|
// Along with all the other instructions, we just cloned OrigHeader's
|
|
|
|
// terminator into OrigPreHeader. Fix up the PHI nodes in each of OrigHeader's
|
|
|
|
// successors by duplicating their incoming values for OrigHeader.
|
|
|
|
TerminatorInst *TI = OrigHeader->getTerminator();
|
2015-08-07 04:22:46 +08:00
|
|
|
for (BasicBlock *SuccBB : TI->successors())
|
|
|
|
for (BasicBlock::iterator BI = SuccBB->begin();
|
2009-10-25 07:19:52 +08:00
|
|
|
PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
|
2011-01-09 03:26:33 +08:00
|
|
|
PN->addIncoming(PN->getIncomingValueForBlock(OrigHeader), OrigPreheader);
|
2009-10-25 07:19:52 +08:00
|
|
|
|
|
|
|
// Now that OrigPreHeader has a clone of OrigHeader's terminator, remove
|
|
|
|
// OrigPreHeader's old terminator (the original branch into the loop), and
|
|
|
|
// remove the corresponding incoming values from the PHI nodes in OrigHeader.
|
|
|
|
LoopEntryBranch->eraseFromParent();
|
|
|
|
|
2011-01-09 03:26:33 +08:00
|
|
|
// If there were any uses of instructions in the duplicated block outside the
|
|
|
|
// loop, update them, inserting PHI nodes as required
|
|
|
|
RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap);
|
2007-04-07 09:25:15 +08:00
|
|
|
|
2009-10-25 07:19:52 +08:00
|
|
|
// NewHeader is now the header of the loop.
|
2007-04-07 09:25:15 +08:00
|
|
|
L->moveToHeader(NewHeader);
|
2011-01-09 03:10:28 +08:00
|
|
|
assert(L->getHeader() == NewHeader && "Latch block is our new header");
|
2007-04-07 09:25:15 +08:00
|
|
|
|
2012-02-14 08:00:19 +08:00
|
|
|
|
When loop rotation happens, it is *very* common for the duplicated condbr
to be foldable into an uncond branch. When this happens, we can make a
much simpler CFG for the loop, which is important for nested loop cases
where we want the outer loop to be aggressively optimized.
Handle this case more aggressively. For example, previously on
phi-duplicate.ll we would get this:
define void @test(i32 %N, double* %G) nounwind ssp {
entry:
%cmp1 = icmp slt i64 1, 1000
br i1 %cmp1, label %bb.nph, label %for.end
bb.nph: ; preds = %entry
br label %for.body
for.body: ; preds = %bb.nph, %for.cond
%j.02 = phi i64 [ 1, %bb.nph ], [ %inc, %for.cond ]
%arrayidx = getelementptr inbounds double* %G, i64 %j.02
%tmp3 = load double* %arrayidx
%sub = sub i64 %j.02, 1
%arrayidx6 = getelementptr inbounds double* %G, i64 %sub
%tmp7 = load double* %arrayidx6
%add = fadd double %tmp3, %tmp7
%arrayidx10 = getelementptr inbounds double* %G, i64 %j.02
store double %add, double* %arrayidx10
%inc = add nsw i64 %j.02, 1
br label %for.cond
for.cond: ; preds = %for.body
%cmp = icmp slt i64 %inc, 1000
br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge
for.cond.for.end_crit_edge: ; preds = %for.cond
br label %for.end
for.end: ; preds = %for.cond.for.end_crit_edge, %entry
ret void
}
Now we get the much nicer:
define void @test(i32 %N, double* %G) nounwind ssp {
entry:
br label %for.body
for.body: ; preds = %entry, %for.body
%j.01 = phi i64 [ 1, %entry ], [ %inc, %for.body ]
%arrayidx = getelementptr inbounds double* %G, i64 %j.01
%tmp3 = load double* %arrayidx
%sub = sub i64 %j.01, 1
%arrayidx6 = getelementptr inbounds double* %G, i64 %sub
%tmp7 = load double* %arrayidx6
%add = fadd double %tmp3, %tmp7
%arrayidx10 = getelementptr inbounds double* %G, i64 %j.01
store double %add, double* %arrayidx10
%inc = add nsw i64 %j.01, 1
%cmp = icmp slt i64 %inc, 1000
br i1 %cmp, label %for.body, label %for.end
for.end: ; preds = %for.body
ret void
}
With all of these recent changes, we are now able to compile:
void foo(char *X) {
for (int i = 0; i != 100; ++i)
for (int j = 0; j != 100; ++j)
X[j+i*100] = 0;
}
into a single memset of 10000 bytes. This series of changes
should also be helpful for other nested loop scenarios as well.
llvm-svn: 123079
2011-01-09 03:59:06 +08:00
|
|
|
// At this point, we've finished our major CFG changes. As part of cloning
|
|
|
|
// the loop into the preheader we've simplified instructions and the
|
|
|
|
// duplicated conditional branch may now be branching on a constant. If it is
|
|
|
|
// branching on a constant and if that constant means that we enter the loop,
|
|
|
|
// then we fold away the cond branch to an uncond branch. This simplifies the
|
|
|
|
// loop in cases important for nested loops, and it also means we don't have
|
|
|
|
// to split as many edges.
|
|
|
|
BranchInst *PHBI = cast<BranchInst>(OrigPreheader->getTerminator());
|
|
|
|
assert(PHBI->isConditional() && "Should be clone of BI condbr!");
|
|
|
|
if (!isa<ConstantInt>(PHBI->getCondition()) ||
|
|
|
|
PHBI->getSuccessor(cast<ConstantInt>(PHBI->getCondition())->isZero())
|
|
|
|
!= NewHeader) {
|
|
|
|
// The conditional branch can't be folded, handle the general case.
|
|
|
|
// Update DominatorTree to reflect the CFG change we just made. Then split
|
|
|
|
// edges as necessary to preserve LoopSimplify form.
|
2015-01-18 10:08:05 +08:00
|
|
|
if (DT) {
|
2012-08-30 23:39:42 +08:00
|
|
|
// Everything that was dominated by the old loop header is now dominated
|
|
|
|
// by the original loop preheader. Conceptually the header was merged
|
|
|
|
// into the preheader, even though we reuse the actual block as a new
|
|
|
|
// loop latch.
|
2015-01-18 10:08:05 +08:00
|
|
|
DomTreeNode *OrigHeaderNode = DT->getNode(OrigHeader);
|
2012-08-30 23:39:42 +08:00
|
|
|
SmallVector<DomTreeNode *, 8> HeaderChildren(OrigHeaderNode->begin(),
|
|
|
|
OrigHeaderNode->end());
|
2015-01-18 10:08:05 +08:00
|
|
|
DomTreeNode *OrigPreheaderNode = DT->getNode(OrigPreheader);
|
2012-08-30 23:39:42 +08:00
|
|
|
for (unsigned I = 0, E = HeaderChildren.size(); I != E; ++I)
|
2015-01-18 10:08:05 +08:00
|
|
|
DT->changeImmediateDominator(HeaderChildren[I], OrigPreheaderNode);
|
2012-02-14 08:00:19 +08:00
|
|
|
|
2015-01-18 10:08:05 +08:00
|
|
|
assert(DT->getNode(Exit)->getIDom() == OrigPreheaderNode);
|
|
|
|
assert(DT->getNode(NewHeader)->getIDom() == OrigPreheaderNode);
|
2012-09-01 20:04:51 +08:00
|
|
|
|
When loop rotation happens, it is *very* common for the duplicated condbr
to be foldable into an uncond branch. When this happens, we can make a
much simpler CFG for the loop, which is important for nested loop cases
where we want the outer loop to be aggressively optimized.
Handle this case more aggressively. For example, previously on
phi-duplicate.ll we would get this:
define void @test(i32 %N, double* %G) nounwind ssp {
entry:
%cmp1 = icmp slt i64 1, 1000
br i1 %cmp1, label %bb.nph, label %for.end
bb.nph: ; preds = %entry
br label %for.body
for.body: ; preds = %bb.nph, %for.cond
%j.02 = phi i64 [ 1, %bb.nph ], [ %inc, %for.cond ]
%arrayidx = getelementptr inbounds double* %G, i64 %j.02
%tmp3 = load double* %arrayidx
%sub = sub i64 %j.02, 1
%arrayidx6 = getelementptr inbounds double* %G, i64 %sub
%tmp7 = load double* %arrayidx6
%add = fadd double %tmp3, %tmp7
%arrayidx10 = getelementptr inbounds double* %G, i64 %j.02
store double %add, double* %arrayidx10
%inc = add nsw i64 %j.02, 1
br label %for.cond
for.cond: ; preds = %for.body
%cmp = icmp slt i64 %inc, 1000
br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge
for.cond.for.end_crit_edge: ; preds = %for.cond
br label %for.end
for.end: ; preds = %for.cond.for.end_crit_edge, %entry
ret void
}
Now we get the much nicer:
define void @test(i32 %N, double* %G) nounwind ssp {
entry:
br label %for.body
for.body: ; preds = %entry, %for.body
%j.01 = phi i64 [ 1, %entry ], [ %inc, %for.body ]
%arrayidx = getelementptr inbounds double* %G, i64 %j.01
%tmp3 = load double* %arrayidx
%sub = sub i64 %j.01, 1
%arrayidx6 = getelementptr inbounds double* %G, i64 %sub
%tmp7 = load double* %arrayidx6
%add = fadd double %tmp3, %tmp7
%arrayidx10 = getelementptr inbounds double* %G, i64 %j.01
store double %add, double* %arrayidx10
%inc = add nsw i64 %j.01, 1
%cmp = icmp slt i64 %inc, 1000
br i1 %cmp, label %for.body, label %for.end
for.end: ; preds = %for.body
ret void
}
With all of these recent changes, we are now able to compile:
void foo(char *X) {
for (int i = 0; i != 100; ++i)
for (int j = 0; j != 100; ++j)
X[j+i*100] = 0;
}
into a single memset of 10000 bytes. This series of changes
should also be helpful for other nested loop scenarios as well.
llvm-svn: 123079
2011-01-09 03:59:06 +08:00
|
|
|
// Update OrigHeader to be dominated by the new header block.
|
2015-01-18 10:08:05 +08:00
|
|
|
DT->changeImmediateDominator(OrigHeader, OrigLatch);
|
When loop rotation happens, it is *very* common for the duplicated condbr
to be foldable into an uncond branch. When this happens, we can make a
much simpler CFG for the loop, which is important for nested loop cases
where we want the outer loop to be aggressively optimized.
Handle this case more aggressively. For example, previously on
phi-duplicate.ll we would get this:
define void @test(i32 %N, double* %G) nounwind ssp {
entry:
%cmp1 = icmp slt i64 1, 1000
br i1 %cmp1, label %bb.nph, label %for.end
bb.nph: ; preds = %entry
br label %for.body
for.body: ; preds = %bb.nph, %for.cond
%j.02 = phi i64 [ 1, %bb.nph ], [ %inc, %for.cond ]
%arrayidx = getelementptr inbounds double* %G, i64 %j.02
%tmp3 = load double* %arrayidx
%sub = sub i64 %j.02, 1
%arrayidx6 = getelementptr inbounds double* %G, i64 %sub
%tmp7 = load double* %arrayidx6
%add = fadd double %tmp3, %tmp7
%arrayidx10 = getelementptr inbounds double* %G, i64 %j.02
store double %add, double* %arrayidx10
%inc = add nsw i64 %j.02, 1
br label %for.cond
for.cond: ; preds = %for.body
%cmp = icmp slt i64 %inc, 1000
br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge
for.cond.for.end_crit_edge: ; preds = %for.cond
br label %for.end
for.end: ; preds = %for.cond.for.end_crit_edge, %entry
ret void
}
Now we get the much nicer:
define void @test(i32 %N, double* %G) nounwind ssp {
entry:
br label %for.body
for.body: ; preds = %entry, %for.body
%j.01 = phi i64 [ 1, %entry ], [ %inc, %for.body ]
%arrayidx = getelementptr inbounds double* %G, i64 %j.01
%tmp3 = load double* %arrayidx
%sub = sub i64 %j.01, 1
%arrayidx6 = getelementptr inbounds double* %G, i64 %sub
%tmp7 = load double* %arrayidx6
%add = fadd double %tmp3, %tmp7
%arrayidx10 = getelementptr inbounds double* %G, i64 %j.01
store double %add, double* %arrayidx10
%inc = add nsw i64 %j.01, 1
%cmp = icmp slt i64 %inc, 1000
br i1 %cmp, label %for.body, label %for.end
for.end: ; preds = %for.body
ret void
}
With all of these recent changes, we are now able to compile:
void foo(char *X) {
for (int i = 0; i != 100; ++i)
for (int j = 0; j != 100; ++j)
X[j+i*100] = 0;
}
into a single memset of 10000 bytes. This series of changes
should also be helpful for other nested loop scenarios as well.
llvm-svn: 123079
2011-01-09 03:59:06 +08:00
|
|
|
}
|
2012-02-14 08:00:19 +08:00
|
|
|
|
When loop rotation happens, it is *very* common for the duplicated condbr
to be foldable into an uncond branch. When this happens, we can make a
much simpler CFG for the loop, which is important for nested loop cases
where we want the outer loop to be aggressively optimized.
Handle this case more aggressively. For example, previously on
phi-duplicate.ll we would get this:
define void @test(i32 %N, double* %G) nounwind ssp {
entry:
%cmp1 = icmp slt i64 1, 1000
br i1 %cmp1, label %bb.nph, label %for.end
bb.nph: ; preds = %entry
br label %for.body
for.body: ; preds = %bb.nph, %for.cond
%j.02 = phi i64 [ 1, %bb.nph ], [ %inc, %for.cond ]
%arrayidx = getelementptr inbounds double* %G, i64 %j.02
%tmp3 = load double* %arrayidx
%sub = sub i64 %j.02, 1
%arrayidx6 = getelementptr inbounds double* %G, i64 %sub
%tmp7 = load double* %arrayidx6
%add = fadd double %tmp3, %tmp7
%arrayidx10 = getelementptr inbounds double* %G, i64 %j.02
store double %add, double* %arrayidx10
%inc = add nsw i64 %j.02, 1
br label %for.cond
for.cond: ; preds = %for.body
%cmp = icmp slt i64 %inc, 1000
br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge
for.cond.for.end_crit_edge: ; preds = %for.cond
br label %for.end
for.end: ; preds = %for.cond.for.end_crit_edge, %entry
ret void
}
Now we get the much nicer:
define void @test(i32 %N, double* %G) nounwind ssp {
entry:
br label %for.body
for.body: ; preds = %entry, %for.body
%j.01 = phi i64 [ 1, %entry ], [ %inc, %for.body ]
%arrayidx = getelementptr inbounds double* %G, i64 %j.01
%tmp3 = load double* %arrayidx
%sub = sub i64 %j.01, 1
%arrayidx6 = getelementptr inbounds double* %G, i64 %sub
%tmp7 = load double* %arrayidx6
%add = fadd double %tmp3, %tmp7
%arrayidx10 = getelementptr inbounds double* %G, i64 %j.01
store double %add, double* %arrayidx10
%inc = add nsw i64 %j.01, 1
%cmp = icmp slt i64 %inc, 1000
br i1 %cmp, label %for.body, label %for.end
for.end: ; preds = %for.body
ret void
}
With all of these recent changes, we are now able to compile:
void foo(char *X) {
for (int i = 0; i != 100; ++i)
for (int j = 0; j != 100; ++j)
X[j+i*100] = 0;
}
into a single memset of 10000 bytes. This series of changes
should also be helpful for other nested loop scenarios as well.
llvm-svn: 123079
2011-01-09 03:59:06 +08:00
|
|
|
// Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and
|
2012-07-24 18:51:42 +08:00
|
|
|
// thus is not a preheader anymore.
|
|
|
|
// Split the edge to form a real preheader.
|
2015-01-19 20:09:11 +08:00
|
|
|
BasicBlock *NewPH = SplitCriticalEdge(
|
|
|
|
OrigPreheader, NewHeader,
|
|
|
|
CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA());
|
When loop rotation happens, it is *very* common for the duplicated condbr
to be foldable into an uncond branch. When this happens, we can make a
much simpler CFG for the loop, which is important for nested loop cases
where we want the outer loop to be aggressively optimized.
Handle this case more aggressively. For example, previously on
phi-duplicate.ll we would get this:
define void @test(i32 %N, double* %G) nounwind ssp {
entry:
%cmp1 = icmp slt i64 1, 1000
br i1 %cmp1, label %bb.nph, label %for.end
bb.nph: ; preds = %entry
br label %for.body
for.body: ; preds = %bb.nph, %for.cond
%j.02 = phi i64 [ 1, %bb.nph ], [ %inc, %for.cond ]
%arrayidx = getelementptr inbounds double* %G, i64 %j.02
%tmp3 = load double* %arrayidx
%sub = sub i64 %j.02, 1
%arrayidx6 = getelementptr inbounds double* %G, i64 %sub
%tmp7 = load double* %arrayidx6
%add = fadd double %tmp3, %tmp7
%arrayidx10 = getelementptr inbounds double* %G, i64 %j.02
store double %add, double* %arrayidx10
%inc = add nsw i64 %j.02, 1
br label %for.cond
for.cond: ; preds = %for.body
%cmp = icmp slt i64 %inc, 1000
br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge
for.cond.for.end_crit_edge: ; preds = %for.cond
br label %for.end
for.end: ; preds = %for.cond.for.end_crit_edge, %entry
ret void
}
Now we get the much nicer:
define void @test(i32 %N, double* %G) nounwind ssp {
entry:
br label %for.body
for.body: ; preds = %entry, %for.body
%j.01 = phi i64 [ 1, %entry ], [ %inc, %for.body ]
%arrayidx = getelementptr inbounds double* %G, i64 %j.01
%tmp3 = load double* %arrayidx
%sub = sub i64 %j.01, 1
%arrayidx6 = getelementptr inbounds double* %G, i64 %sub
%tmp7 = load double* %arrayidx6
%add = fadd double %tmp3, %tmp7
%arrayidx10 = getelementptr inbounds double* %G, i64 %j.01
store double %add, double* %arrayidx10
%inc = add nsw i64 %j.01, 1
%cmp = icmp slt i64 %inc, 1000
br i1 %cmp, label %for.body, label %for.end
for.end: ; preds = %for.body
ret void
}
With all of these recent changes, we are now able to compile:
void foo(char *X) {
for (int i = 0; i != 100; ++i)
for (int j = 0; j != 100; ++j)
X[j+i*100] = 0;
}
into a single memset of 10000 bytes. This series of changes
should also be helpful for other nested loop scenarios as well.
llvm-svn: 123079
2011-01-09 03:59:06 +08:00
|
|
|
NewPH->setName(NewHeader->getName() + ".lr.ph");
|
2012-02-14 08:00:19 +08:00
|
|
|
|
2012-07-24 18:51:42 +08:00
|
|
|
// Preserve canonical loop form, which means that 'Exit' should have only
|
[LPM] Fix PR18643, another scary place where loop transforms failed to
preserve loop simplify of enclosing loops.
The problem here starts with LoopRotation which ends up cloning code out
of the latch into the new preheader it is buidling. This can create
a new edge from the preheader into the exit block of the loop which
breaks LoopSimplify form. The code tries to fix this by splitting the
critical edge between the latch and the exit block to get a new exit
block that only the latch dominates. This sadly isn't sufficient.
The exit block may be an exit block for multiple nested loops. When we
clone an edge from the latch of the inner loop to the new preheader
being built in the outer loop, we create an exiting edge from the outer
loop to this exit block. Despite breaking the LoopSimplify form for the
inner loop, this is fine for the outer loop. However, when we split the
edge from the inner loop to the exit block, we create a new block which
is in neither the inner nor outer loop as the new exit block. This is
a predecessor to the old exit block, and so the split itself takes the
outer loop out of LoopSimplify form. We need to split every edge
entering the exit block from inside a loop nested more deeply than the
exit block in order to preserve all of the loop simplify constraints.
Once we try to do that, a problem with splitting critical edges
surfaces. Previously, we tried a very brute-force approach to updating LoopSimplify
form by re-computing it for all exit blocks. We don't need to do this,
and doing this much will sometimes but not always overlap with the
LoopRotate bug fix. Instead, the code needs to specifically handle the
cases which can start to violate LoopSimplify -- they aren't that
common. We need to see if the destination of the split edge was a loop
exit block in simplified form for the loop of the source of the edge.
For this to be true, all the predecessors need to be in the exact same
loop as the source of the edge being split. If the dest block was
originally in this form, we have to split all of the edges back into
this loop to recover it. The old mechanism of doing this was
conservatively correct because at least *one* of the exiting blocks it
rewrote was the DestBB and so the DestBB's predecessors were fixed. But
this is a much more targeted way of doing it. Making it targeted is
important, because ballooning the set of edges touched prevents
LoopRotate from being able to split edges *it* needs to split to
preserve loop simplify in a coherent way -- the critical edge splitting
would sometimes find the other edges in need of splitting but not
others.
Many, *many* thanks for help from Nick reducing these test cases
mightily. And helping lots with the analysis here as this one was quite
tricky to track down.
llvm-svn: 200393
2014-01-29 21:16:53 +08:00
|
|
|
// one predecessor. Note that Exit could be an exit block for multiple
|
|
|
|
// nested loops, causing both of the edges to now be critical and need to
|
|
|
|
// be split.
|
|
|
|
SmallVector<BasicBlock *, 4> ExitPreds(pred_begin(Exit), pred_end(Exit));
|
|
|
|
bool SplitLatchEdge = false;
|
|
|
|
for (SmallVectorImpl<BasicBlock *>::iterator PI = ExitPreds.begin(),
|
|
|
|
PE = ExitPreds.end();
|
|
|
|
PI != PE; ++PI) {
|
|
|
|
// We only need to split loop exit edges.
|
|
|
|
Loop *PredLoop = LI->getLoopFor(*PI);
|
|
|
|
if (!PredLoop || PredLoop->contains(Exit))
|
|
|
|
continue;
|
2015-02-21 04:49:25 +08:00
|
|
|
if (isa<IndirectBrInst>((*PI)->getTerminator()))
|
|
|
|
continue;
|
[LPM] Fix PR18643, another scary place where loop transforms failed to
preserve loop simplify of enclosing loops.
The problem here starts with LoopRotation which ends up cloning code out
of the latch into the new preheader it is building. This can create
a new edge from the preheader into the exit block of the loop which
breaks LoopSimplify form. The code tries to fix this by splitting the
critical edge between the latch and the exit block to get a new exit
block that only the latch dominates. This sadly isn't sufficient.
The exit block may be an exit block for multiple nested loops. When we
clone an edge from the latch of the inner loop to the new preheader
being built in the outer loop, we create an exiting edge from the outer
loop to this exit block. Despite breaking the LoopSimplify form for the
inner loop, this is fine for the outer loop. However, when we split the
edge from the inner loop to the exit block, we create a new block which
is in neither the inner nor outer loop as the new exit block. This is
a predecessor to the old exit block, and so the split itself takes the
outer loop out of LoopSimplify form. We need to split every edge
entering the exit block from inside a loop nested more deeply than the
exit block in order to preserve all of the loop simplify constraints.
Once we try to do that, a problem with splitting critical edges
surfaces. Previously, we tried a very brute-force approach to updating LoopSimplify
form by re-computing it for all exit blocks. We don't need to do this,
and doing this much will sometimes but not always overlap with the
LoopRotate bug fix. Instead, the code needs to specifically handle the
cases which can start to violate LoopSimplify -- they aren't that
common. We need to see if the destination of the split edge was a loop
exit block in simplified form for the loop of the source of the edge.
For this to be true, all the predecessors need to be in the exact same
loop as the source of the edge being split. If the dest block was
originally in this form, we have to split all of the edges back into
this loop to recover it. The old mechanism of doing this was
conservatively correct because at least *one* of the exiting blocks it
rewrote was the DestBB and so the DestBB's predecessors were fixed. But
this is a much more targeted way of doing it. Making it targeted is
important, because ballooning the set of edges touched prevents
LoopRotate from being able to split edges *it* needs to split to
preserve loop simplify in a coherent way -- the critical edge splitting
would sometimes find the other edges in need of splitting but not
others.
Many, *many* thanks for help from Nick reducing these test cases
mightily. And helping lots with the analysis here as this one was quite
tricky to track down.
llvm-svn: 200393
2014-01-29 21:16:53 +08:00
|
|
|
SplitLatchEdge |= L->getLoopLatch() == *PI;
|
2015-01-19 20:09:11 +08:00
|
|
|
BasicBlock *ExitSplit = SplitCriticalEdge(
|
|
|
|
*PI, Exit, CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA());
|
[LPM] Fix PR18643, another scary place where loop transforms failed to
preserve loop simplify of enclosing loops.
The problem here starts with LoopRotation which ends up cloning code out
of the latch into the new preheader it is building. This can create
a new edge from the preheader into the exit block of the loop which
breaks LoopSimplify form. The code tries to fix this by splitting the
critical edge between the latch and the exit block to get a new exit
block that only the latch dominates. This sadly isn't sufficient.
The exit block may be an exit block for multiple nested loops. When we
clone an edge from the latch of the inner loop to the new preheader
being built in the outer loop, we create an exiting edge from the outer
loop to this exit block. Despite breaking the LoopSimplify form for the
inner loop, this is fine for the outer loop. However, when we split the
edge from the inner loop to the exit block, we create a new block which
is in neither the inner nor outer loop as the new exit block. This is
a predecessor to the old exit block, and so the split itself takes the
outer loop out of LoopSimplify form. We need to split every edge
entering the exit block from inside a loop nested more deeply than the
exit block in order to preserve all of the loop simplify constraints.
Once we try to do that, a problem with splitting critical edges
surfaces. Previously, we tried a very brute-force approach to updating LoopSimplify
form by re-computing it for all exit blocks. We don't need to do this,
and doing this much will sometimes but not always overlap with the
LoopRotate bug fix. Instead, the code needs to specifically handle the
cases which can start to violate LoopSimplify -- they aren't that
common. We need to see if the destination of the split edge was a loop
exit block in simplified form for the loop of the source of the edge.
For this to be true, all the predecessors need to be in the exact same
loop as the source of the edge being split. If the dest block was
originally in this form, we have to split all of the edges back into
this loop to recover it. The old mechanism of doing this was
conservatively correct because at least *one* of the exiting blocks it
rewrote was the DestBB and so the DestBB's predecessors were fixed. But
this is a much more targeted way of doing it. Making it targeted is
important, because ballooning the set of edges touched prevents
LoopRotate from being able to split edges *it* needs to split to
preserve loop simplify in a coherent way -- the critical edge splitting
would sometimes find the other edges in need of splitting but not
others.
Many, *many* thanks for help from Nick reducing these test cases
mightily. And helping lots with the analysis here as this one was quite
tricky to track down.
llvm-svn: 200393
2014-01-29 21:16:53 +08:00
|
|
|
ExitSplit->moveBefore(Exit);
|
|
|
|
}
|
|
|
|
assert(SplitLatchEdge &&
|
|
|
|
"Despite splitting all preds, failed to split latch exit?");
|
When loop rotation happens, it is *very* common for the duplicated condbr
to be foldable into an uncond branch. When this happens, we can make a
much simpler CFG for the loop, which is important for nested loop cases
where we want the outer loop to be aggressively optimized.
Handle this case more aggressively. For example, previously on
phi-duplicate.ll we would get this:
define void @test(i32 %N, double* %G) nounwind ssp {
entry:
%cmp1 = icmp slt i64 1, 1000
br i1 %cmp1, label %bb.nph, label %for.end
bb.nph: ; preds = %entry
br label %for.body
for.body: ; preds = %bb.nph, %for.cond
%j.02 = phi i64 [ 1, %bb.nph ], [ %inc, %for.cond ]
%arrayidx = getelementptr inbounds double* %G, i64 %j.02
%tmp3 = load double* %arrayidx
%sub = sub i64 %j.02, 1
%arrayidx6 = getelementptr inbounds double* %G, i64 %sub
%tmp7 = load double* %arrayidx6
%add = fadd double %tmp3, %tmp7
%arrayidx10 = getelementptr inbounds double* %G, i64 %j.02
store double %add, double* %arrayidx10
%inc = add nsw i64 %j.02, 1
br label %for.cond
for.cond: ; preds = %for.body
%cmp = icmp slt i64 %inc, 1000
br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge
for.cond.for.end_crit_edge: ; preds = %for.cond
br label %for.end
for.end: ; preds = %for.cond.for.end_crit_edge, %entry
ret void
}
Now we get the much nicer:
define void @test(i32 %N, double* %G) nounwind ssp {
entry:
br label %for.body
for.body: ; preds = %entry, %for.body
%j.01 = phi i64 [ 1, %entry ], [ %inc, %for.body ]
%arrayidx = getelementptr inbounds double* %G, i64 %j.01
%tmp3 = load double* %arrayidx
%sub = sub i64 %j.01, 1
%arrayidx6 = getelementptr inbounds double* %G, i64 %sub
%tmp7 = load double* %arrayidx6
%add = fadd double %tmp3, %tmp7
%arrayidx10 = getelementptr inbounds double* %G, i64 %j.01
store double %add, double* %arrayidx10
%inc = add nsw i64 %j.01, 1
%cmp = icmp slt i64 %inc, 1000
br i1 %cmp, label %for.body, label %for.end
for.end: ; preds = %for.body
ret void
}
With all of these recent changes, we are now able to compile:
void foo(char *X) {
for (int i = 0; i != 100; ++i)
for (int j = 0; j != 100; ++j)
X[j+i*100] = 0;
}
into a single memset of 10000 bytes. This series of changes
should also be helpful for other nested loop scenarios.
llvm-svn: 123079
2011-01-09 03:59:06 +08:00
|
|
|
} else {
|
|
|
|
// We can fold the conditional branch in the preheader, this makes things
|
|
|
|
// simpler. The first step is to remove the extra edge to the Exit block.
|
|
|
|
Exit->removePredecessor(OrigPreheader, true /*preserve LCSSA*/);
|
2011-04-30 04:38:55 +08:00
|
|
|
BranchInst *NewBI = BranchInst::Create(NewHeader, PHBI);
|
|
|
|
NewBI->setDebugLoc(PHBI->getDebugLoc());
|
When loop rotation happens, it is *very* common for the duplicated condbr
to be foldable into an uncond branch. When this happens, we can make a
much simpler CFG for the loop, which is important for nested loop cases
where we want the outer loop to be aggressively optimized.
Handle this case more aggressively. For example, previously on
phi-duplicate.ll we would get this:
define void @test(i32 %N, double* %G) nounwind ssp {
entry:
%cmp1 = icmp slt i64 1, 1000
br i1 %cmp1, label %bb.nph, label %for.end
bb.nph: ; preds = %entry
br label %for.body
for.body: ; preds = %bb.nph, %for.cond
%j.02 = phi i64 [ 1, %bb.nph ], [ %inc, %for.cond ]
%arrayidx = getelementptr inbounds double* %G, i64 %j.02
%tmp3 = load double* %arrayidx
%sub = sub i64 %j.02, 1
%arrayidx6 = getelementptr inbounds double* %G, i64 %sub
%tmp7 = load double* %arrayidx6
%add = fadd double %tmp3, %tmp7
%arrayidx10 = getelementptr inbounds double* %G, i64 %j.02
store double %add, double* %arrayidx10
%inc = add nsw i64 %j.02, 1
br label %for.cond
for.cond: ; preds = %for.body
%cmp = icmp slt i64 %inc, 1000
br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge
for.cond.for.end_crit_edge: ; preds = %for.cond
br label %for.end
for.end: ; preds = %for.cond.for.end_crit_edge, %entry
ret void
}
Now we get the much nicer:
define void @test(i32 %N, double* %G) nounwind ssp {
entry:
br label %for.body
for.body: ; preds = %entry, %for.body
%j.01 = phi i64 [ 1, %entry ], [ %inc, %for.body ]
%arrayidx = getelementptr inbounds double* %G, i64 %j.01
%tmp3 = load double* %arrayidx
%sub = sub i64 %j.01, 1
%arrayidx6 = getelementptr inbounds double* %G, i64 %sub
%tmp7 = load double* %arrayidx6
%add = fadd double %tmp3, %tmp7
%arrayidx10 = getelementptr inbounds double* %G, i64 %j.01
store double %add, double* %arrayidx10
%inc = add nsw i64 %j.01, 1
%cmp = icmp slt i64 %inc, 1000
br i1 %cmp, label %for.body, label %for.end
for.end: ; preds = %for.body
ret void
}
With all of these recent changes, we are now able to compile:
void foo(char *X) {
for (int i = 0; i != 100; ++i)
for (int j = 0; j != 100; ++j)
X[j+i*100] = 0;
}
into a single memset of 10000 bytes. This series of changes
should also be helpful for other nested loop scenarios.
llvm-svn: 123079
2011-01-09 03:59:06 +08:00
|
|
|
PHBI->eraseFromParent();
|
2012-02-14 08:00:19 +08:00
|
|
|
|
When loop rotation happens, it is *very* common for the duplicated condbr
to be foldable into an uncond branch. When this happens, we can make a
much simpler CFG for the loop, which is important for nested loop cases
where we want the outer loop to be aggressively optimized.
Handle this case more aggressively. For example, previously on
phi-duplicate.ll we would get this:
define void @test(i32 %N, double* %G) nounwind ssp {
entry:
%cmp1 = icmp slt i64 1, 1000
br i1 %cmp1, label %bb.nph, label %for.end
bb.nph: ; preds = %entry
br label %for.body
for.body: ; preds = %bb.nph, %for.cond
%j.02 = phi i64 [ 1, %bb.nph ], [ %inc, %for.cond ]
%arrayidx = getelementptr inbounds double* %G, i64 %j.02
%tmp3 = load double* %arrayidx
%sub = sub i64 %j.02, 1
%arrayidx6 = getelementptr inbounds double* %G, i64 %sub
%tmp7 = load double* %arrayidx6
%add = fadd double %tmp3, %tmp7
%arrayidx10 = getelementptr inbounds double* %G, i64 %j.02
store double %add, double* %arrayidx10
%inc = add nsw i64 %j.02, 1
br label %for.cond
for.cond: ; preds = %for.body
%cmp = icmp slt i64 %inc, 1000
br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge
for.cond.for.end_crit_edge: ; preds = %for.cond
br label %for.end
for.end: ; preds = %for.cond.for.end_crit_edge, %entry
ret void
}
Now we get the much nicer:
define void @test(i32 %N, double* %G) nounwind ssp {
entry:
br label %for.body
for.body: ; preds = %entry, %for.body
%j.01 = phi i64 [ 1, %entry ], [ %inc, %for.body ]
%arrayidx = getelementptr inbounds double* %G, i64 %j.01
%tmp3 = load double* %arrayidx
%sub = sub i64 %j.01, 1
%arrayidx6 = getelementptr inbounds double* %G, i64 %sub
%tmp7 = load double* %arrayidx6
%add = fadd double %tmp3, %tmp7
%arrayidx10 = getelementptr inbounds double* %G, i64 %j.01
store double %add, double* %arrayidx10
%inc = add nsw i64 %j.01, 1
%cmp = icmp slt i64 %inc, 1000
br i1 %cmp, label %for.body, label %for.end
for.end: ; preds = %for.body
ret void
}
With all of these recent changes, we are now able to compile:
void foo(char *X) {
for (int i = 0; i != 100; ++i)
for (int j = 0; j != 100; ++j)
X[j+i*100] = 0;
}
into a single memset of 10000 bytes. This series of changes
should also be helpful for other nested loop scenarios.
llvm-svn: 123079
2011-01-09 03:59:06 +08:00
|
|
|
// With our CFG finalized, update DomTree if it is available.
|
2015-01-18 10:08:05 +08:00
|
|
|
if (DT) {
|
When loop rotation happens, it is *very* common for the duplicated condbr
to be foldable into an uncond branch. When this happens, we can make a
much simpler CFG for the loop, which is important for nested loop cases
where we want the outer loop to be aggressively optimized.
Handle this case more aggressively. For example, previously on
phi-duplicate.ll we would get this:
define void @test(i32 %N, double* %G) nounwind ssp {
entry:
%cmp1 = icmp slt i64 1, 1000
br i1 %cmp1, label %bb.nph, label %for.end
bb.nph: ; preds = %entry
br label %for.body
for.body: ; preds = %bb.nph, %for.cond
%j.02 = phi i64 [ 1, %bb.nph ], [ %inc, %for.cond ]
%arrayidx = getelementptr inbounds double* %G, i64 %j.02
%tmp3 = load double* %arrayidx
%sub = sub i64 %j.02, 1
%arrayidx6 = getelementptr inbounds double* %G, i64 %sub
%tmp7 = load double* %arrayidx6
%add = fadd double %tmp3, %tmp7
%arrayidx10 = getelementptr inbounds double* %G, i64 %j.02
store double %add, double* %arrayidx10
%inc = add nsw i64 %j.02, 1
br label %for.cond
for.cond: ; preds = %for.body
%cmp = icmp slt i64 %inc, 1000
br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge
for.cond.for.end_crit_edge: ; preds = %for.cond
br label %for.end
for.end: ; preds = %for.cond.for.end_crit_edge, %entry
ret void
}
Now we get the much nicer:
define void @test(i32 %N, double* %G) nounwind ssp {
entry:
br label %for.body
for.body: ; preds = %entry, %for.body
%j.01 = phi i64 [ 1, %entry ], [ %inc, %for.body ]
%arrayidx = getelementptr inbounds double* %G, i64 %j.01
%tmp3 = load double* %arrayidx
%sub = sub i64 %j.01, 1
%arrayidx6 = getelementptr inbounds double* %G, i64 %sub
%tmp7 = load double* %arrayidx6
%add = fadd double %tmp3, %tmp7
%arrayidx10 = getelementptr inbounds double* %G, i64 %j.01
store double %add, double* %arrayidx10
%inc = add nsw i64 %j.01, 1
%cmp = icmp slt i64 %inc, 1000
br i1 %cmp, label %for.body, label %for.end
for.end: ; preds = %for.body
ret void
}
With all of these recent changes, we are now able to compile:
void foo(char *X) {
for (int i = 0; i != 100; ++i)
for (int j = 0; j != 100; ++j)
X[j+i*100] = 0;
}
into a single memset of 10000 bytes. This series of changes
should also be helpful for other nested loop scenarios.
llvm-svn: 123079
2011-01-09 03:59:06 +08:00
|
|
|
// Update OrigHeader to be dominated by the new header block.
|
2015-01-18 10:08:05 +08:00
|
|
|
DT->changeImmediateDominator(NewHeader, OrigPreheader);
|
|
|
|
DT->changeImmediateDominator(OrigHeader, OrigLatch);
|
2012-08-30 23:39:42 +08:00
|
|
|
|
|
|
|
// Brute force incremental dominator tree update. Call
|
|
|
|
// findNearestCommonDominator on all CFG predecessors of each child of the
|
|
|
|
// original header.
|
2015-01-18 10:08:05 +08:00
|
|
|
DomTreeNode *OrigHeaderNode = DT->getNode(OrigHeader);
|
2012-09-02 19:57:22 +08:00
|
|
|
SmallVector<DomTreeNode *, 8> HeaderChildren(OrigHeaderNode->begin(),
|
|
|
|
OrigHeaderNode->end());
|
|
|
|
bool Changed;
|
|
|
|
do {
|
|
|
|
Changed = false;
|
|
|
|
for (unsigned I = 0, E = HeaderChildren.size(); I != E; ++I) {
|
|
|
|
DomTreeNode *Node = HeaderChildren[I];
|
|
|
|
BasicBlock *BB = Node->getBlock();
|
|
|
|
|
|
|
|
pred_iterator PI = pred_begin(BB);
|
|
|
|
BasicBlock *NearestDom = *PI;
|
|
|
|
for (pred_iterator PE = pred_end(BB); PI != PE; ++PI)
|
2015-01-18 10:08:05 +08:00
|
|
|
NearestDom = DT->findNearestCommonDominator(NearestDom, *PI);
|
2012-09-02 19:57:22 +08:00
|
|
|
|
|
|
|
// Remember if this changes the DomTree.
|
|
|
|
if (Node->getIDom()->getBlock() != NearestDom) {
|
2015-01-18 10:08:05 +08:00
|
|
|
DT->changeImmediateDominator(BB, NearestDom);
|
2012-09-02 19:57:22 +08:00
|
|
|
Changed = true;
|
2012-08-30 23:39:42 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2012-09-02 19:57:22 +08:00
|
|
|
// If the dominator changed, this may have an effect on other
|
|
|
|
// predecessors, continue until we reach a fixpoint.
|
|
|
|
} while (Changed);
|
When loop rotation happens, it is *very* common for the duplicated condbr
to be foldable into an uncond branch. When this happens, we can make a
much simpler CFG for the loop, which is important for nested loop cases
where we want the outer loop to be aggressively optimized.
Handle this case more aggressively. For example, previously on
phi-duplicate.ll we would get this:
define void @test(i32 %N, double* %G) nounwind ssp {
entry:
%cmp1 = icmp slt i64 1, 1000
br i1 %cmp1, label %bb.nph, label %for.end
bb.nph: ; preds = %entry
br label %for.body
for.body: ; preds = %bb.nph, %for.cond
%j.02 = phi i64 [ 1, %bb.nph ], [ %inc, %for.cond ]
%arrayidx = getelementptr inbounds double* %G, i64 %j.02
%tmp3 = load double* %arrayidx
%sub = sub i64 %j.02, 1
%arrayidx6 = getelementptr inbounds double* %G, i64 %sub
%tmp7 = load double* %arrayidx6
%add = fadd double %tmp3, %tmp7
%arrayidx10 = getelementptr inbounds double* %G, i64 %j.02
store double %add, double* %arrayidx10
%inc = add nsw i64 %j.02, 1
br label %for.cond
for.cond: ; preds = %for.body
%cmp = icmp slt i64 %inc, 1000
br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge
for.cond.for.end_crit_edge: ; preds = %for.cond
br label %for.end
for.end: ; preds = %for.cond.for.end_crit_edge, %entry
ret void
}
Now we get the much nicer:
define void @test(i32 %N, double* %G) nounwind ssp {
entry:
br label %for.body
for.body: ; preds = %entry, %for.body
%j.01 = phi i64 [ 1, %entry ], [ %inc, %for.body ]
%arrayidx = getelementptr inbounds double* %G, i64 %j.01
%tmp3 = load double* %arrayidx
%sub = sub i64 %j.01, 1
%arrayidx6 = getelementptr inbounds double* %G, i64 %sub
%tmp7 = load double* %arrayidx6
%add = fadd double %tmp3, %tmp7
%arrayidx10 = getelementptr inbounds double* %G, i64 %j.01
store double %add, double* %arrayidx10
%inc = add nsw i64 %j.01, 1
%cmp = icmp slt i64 %inc, 1000
br i1 %cmp, label %for.body, label %for.end
for.end: ; preds = %for.body
ret void
}
With all of these recent changes, we are now able to compile:
void foo(char *X) {
for (int i = 0; i != 100; ++i)
for (int j = 0; j != 100; ++j)
X[j+i*100] = 0;
}
into a single memset of 10000 bytes. This series of changes
should also be helpful for other nested loop scenarios.
llvm-svn: 123079
2011-01-09 03:59:06 +08:00
|
|
|
}
|
2007-07-12 07:47:28 +08:00
|
|
|
}
|
2012-02-14 08:00:19 +08:00
|
|
|
|
When loop rotation happens, it is *very* common for the duplicated condbr
to be foldable into an uncond branch. When this happens, we can make a
much simpler CFG for the loop, which is important for nested loop cases
where we want the outer loop to be aggressively optimized.
Handle this case more aggressively. For example, previously on
phi-duplicate.ll we would get this:
define void @test(i32 %N, double* %G) nounwind ssp {
entry:
%cmp1 = icmp slt i64 1, 1000
br i1 %cmp1, label %bb.nph, label %for.end
bb.nph: ; preds = %entry
br label %for.body
for.body: ; preds = %bb.nph, %for.cond
%j.02 = phi i64 [ 1, %bb.nph ], [ %inc, %for.cond ]
%arrayidx = getelementptr inbounds double* %G, i64 %j.02
%tmp3 = load double* %arrayidx
%sub = sub i64 %j.02, 1
%arrayidx6 = getelementptr inbounds double* %G, i64 %sub
%tmp7 = load double* %arrayidx6
%add = fadd double %tmp3, %tmp7
%arrayidx10 = getelementptr inbounds double* %G, i64 %j.02
store double %add, double* %arrayidx10
%inc = add nsw i64 %j.02, 1
br label %for.cond
for.cond: ; preds = %for.body
%cmp = icmp slt i64 %inc, 1000
br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge
for.cond.for.end_crit_edge: ; preds = %for.cond
br label %for.end
for.end: ; preds = %for.cond.for.end_crit_edge, %entry
ret void
}
Now we get the much nicer:
define void @test(i32 %N, double* %G) nounwind ssp {
entry:
br label %for.body
for.body: ; preds = %entry, %for.body
%j.01 = phi i64 [ 1, %entry ], [ %inc, %for.body ]
%arrayidx = getelementptr inbounds double* %G, i64 %j.01
%tmp3 = load double* %arrayidx
%sub = sub i64 %j.01, 1
%arrayidx6 = getelementptr inbounds double* %G, i64 %sub
%tmp7 = load double* %arrayidx6
%add = fadd double %tmp3, %tmp7
%arrayidx10 = getelementptr inbounds double* %G, i64 %j.01
store double %add, double* %arrayidx10
%inc = add nsw i64 %j.01, 1
%cmp = icmp slt i64 %inc, 1000
br i1 %cmp, label %for.body, label %for.end
for.end: ; preds = %for.body
ret void
}
With all of these recent changes, we are now able to compile:
void foo(char *X) {
for (int i = 0; i != 100; ++i)
for (int j = 0; j != 100; ++j)
X[j+i*100] = 0;
}
into a single memset of 10000 bytes. This series of changes
should also be helpful for other nested loop scenarios.
llvm-svn: 123079
2011-01-09 03:59:06 +08:00
|
|
|
assert(L->getLoopPreheader() && "Invalid loop preheader after loop rotation");
|
2011-01-09 02:52:51 +08:00
|
|
|
assert(L->getLoopLatch() && "Invalid loop latch after loop rotation");
|
2011-01-09 02:55:50 +08:00
|
|
|
|
2011-01-11 15:47:59 +08:00
|
|
|
// Now that the CFG and DomTree are in a consistent state again, try to merge
|
|
|
|
// the OrigHeader block into OrigLatch. This will succeed if they are
|
|
|
|
// connected by an unconditional branch. This is just a cleanup so the
|
|
|
|
// emitted code isn't too gross in this common case.
|
2015-01-18 10:11:23 +08:00
|
|
|
MergeBlockIntoPredecessor(OrigHeader, DT, LI);
|
2012-02-14 08:00:19 +08:00
|
|
|
|
2012-08-30 23:39:42 +08:00
|
|
|
DEBUG(dbgs() << "LoopRotation: into "; L->dump());
|
|
|
|
|
2011-01-09 02:55:50 +08:00
|
|
|
++NumRotated;
|
|
|
|
return true;
|
2007-04-10 04:19:46 +08:00
|
|
|
}
|