[UnrollAndJam] New Unroll and Jam pass

This is a simple implementation of the unroll-and-jam classical loop
optimisation.

The basic idea is that we take an outer loop of the form:

  for i..
    ForeBlocks(i)
    for j..
      SubLoopBlocks(i, j)
    AftBlocks(i)

Instead of doing normal inner or outer unrolling, we unroll as follows:

  for i... i+=2
    ForeBlocks(i)
    ForeBlocks(i+1)
    for j..
      SubLoopBlocks(i, j)
      SubLoopBlocks(i+1, j)
    AftBlocks(i)
    AftBlocks(i+1)
  Remainder Loop

So we have unrolled the outer loop, then jammed the two inner loops into
one. This can lead to a simpler inner loop if memory accesses can be shared
between the now jammed loops.

To do this we have to prove that this is all safe, both for the memory
accesses (using dependence analysis) and that ForeBlocks(i+1) can move before
AftBlocks(i) and SubLoopBlocks(i, j).

Differential Revision: https://reviews.llvm.org/D41953

llvm-svn: 336062
This commit is contained in:
David Green 2018-07-01 12:47:30 +00:00
parent 8dabda70af
commit 963401d2be
23 changed files with 3849 additions and 20 deletions

View File

@ -89,6 +89,9 @@ void LLVMAddLoopRerollPass(LLVMPassManagerRef PM);
/** See llvm::createLoopUnrollPass function. */
void LLVMAddLoopUnrollPass(LLVMPassManagerRef PM);
/** See llvm::createLoopUnrollAndJamPass function. */
void LLVMAddLoopUnrollAndJamPass(LLVMPassManagerRef PM);
/** See llvm::createLoopUnswitchPass function. */
void LLVMAddLoopUnswitchPass(LLVMPassManagerRef PM);

View File

@ -422,6 +422,13 @@ public:
bool AllowPeeling;
/// Allow unrolling of all the iterations of the runtime loop remainder.
bool UnrollRemainder;
/// Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollAndJam;
/// Threshold for unroll and jam, for inner loop size. The 'Threshold'
/// value above is used during unroll and jam for the outer loop size.
/// This value is used in the same manner to limit the size of the inner
/// loop.
unsigned UnrollAndJamInnerLoopThreshold;
};
/// Get target-customized preferences for the generic loop unrolling

View File

@ -226,6 +226,7 @@ void initializeLoopSimplifyCFGLegacyPassPass(PassRegistry&);
void initializeLoopSimplifyPass(PassRegistry&);
void initializeLoopStrengthReducePass(PassRegistry&);
void initializeLoopUnrollPass(PassRegistry&);
void initializeLoopUnrollAndJamPass(PassRegistry&);
void initializeLoopUnswitchPass(PassRegistry&);
void initializeLoopVectorizePass(PassRegistry&);
void initializeLoopVersioningLICMPass(PassRegistry&);

View File

@ -132,6 +132,7 @@ namespace {
(void) llvm::createLoopStrengthReducePass();
(void) llvm::createLoopRerollPass();
(void) llvm::createLoopUnrollPass();
(void) llvm::createLoopUnrollAndJamPass();
(void) llvm::createLoopUnswitchPass();
(void) llvm::createLoopVersioningLICMPass();
(void) llvm::createLoopIdiomPass();

View File

@ -190,6 +190,12 @@ Pass *createLoopUnrollPass(int OptLevel = 2, int Threshold = -1, int Count = -1,
// Create an unrolling pass for full unrolling that uses exact trip count only.
Pass *createSimpleLoopUnrollPass(int OptLevel = 2);
//===----------------------------------------------------------------------===//
//
// LoopUnrollAndJam - This pass is a simple loop unroll and jam pass.
//
Pass *createLoopUnrollAndJamPass(int OptLevel = 2);
//===----------------------------------------------------------------------===//
//
// LoopReroll - This pass is a simple loop rerolling pass.

View File

@ -0,0 +1,35 @@
//===- LoopUnrollAndJamPass.h -----------------------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_TRANSFORMS_SCALAR_LOOPUNROLLANDJAMPASS_H
#define LLVM_TRANSFORMS_SCALAR_LOOPUNROLLANDJAMPASS_H
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/PassManager.h"
namespace llvm {
class Loop;
struct LoopStandardAnalysisResults;
class LPMUpdater;
/// A loop unroll-and-jam transformation: unrolls an outer loop and fuses
/// ("jams") the resulting copies of its inner loop into a single inner loop.
/// (The previous comment said "loop rotation" — copy-paste error, fixed.)
class LoopUnrollAndJamPass : public PassInfoMixin<LoopUnrollAndJamPass> {
  // Optimization level forwarded to the unrolling cost heuristics.
  const int OptLevel;

public:
  explicit LoopUnrollAndJamPass(int OptLevel = 2) : OptLevel(OptLevel) {}

  /// Run unroll and jam on the loop \p L.
  PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM,
                        LoopStandardAnalysisResults &AR, LPMUpdater &U);
};
} // end namespace llvm
#endif // LLVM_TRANSFORMS_SCALAR_LOOPUNROLLANDJAMPASS_H

View File

@ -19,11 +19,13 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
namespace llvm {
class AssumptionCache;
class BasicBlock;
class DependenceInfo;
class DominatorTree;
class Loop;
class LoopInfo;
@ -78,8 +80,47 @@ bool canPeel(Loop *L);
bool peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI, ScalarEvolution *SE,
DominatorTree *DT, AssumptionCache *AC, bool PreserveLCSSA);
LoopUnrollResult UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount,
unsigned TripMultiple, bool UnrollRemainder,
LoopInfo *LI, ScalarEvolution *SE,
DominatorTree *DT, AssumptionCache *AC,
OptimizationRemarkEmitter *ORE);
bool isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
DependenceInfo &DI);
bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI,
DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
const SmallPtrSetImpl<const Value *> &EphValues,
OptimizationRemarkEmitter *ORE, unsigned &TripCount,
unsigned MaxTripCount, unsigned &TripMultiple,
unsigned LoopSize,
TargetTransformInfo::UnrollingPreferences &UP,
bool &UseUpperBound);
BasicBlock *foldBlockIntoPredecessor(BasicBlock *BB, LoopInfo *LI,
ScalarEvolution *SE, DominatorTree *DT);
void remapInstruction(Instruction *I, ValueToValueMapTy &VMap);
void simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI,
ScalarEvolution *SE, DominatorTree *DT,
AssumptionCache *AC);
MDNode *GetUnrollMetadata(MDNode *LoopID, StringRef Name);
TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, int OptLevel,
Optional<unsigned> UserThreshold, Optional<unsigned> UserCount,
Optional<bool> UserAllowPartial, Optional<bool> UserRuntime,
Optional<bool> UserUpperBound, Optional<bool> UserAllowPeeling);
unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls,
bool &NotDuplicatable, bool &Convergent,
const TargetTransformInfo &TTI,
const SmallPtrSetImpl<const Value *> &EphValues,
unsigned BEInsns);
} // end namespace llvm
#endif // LLVM_TRANSFORMS_UTILS_UNROLLLOOP_H

View File

@ -121,6 +121,7 @@
#include "llvm/Transforms/Scalar/LoopSimplifyCFG.h"
#include "llvm/Transforms/Scalar/LoopSink.h"
#include "llvm/Transforms/Scalar/LoopStrengthReduce.h"
#include "llvm/Transforms/Scalar/LoopUnrollAndJamPass.h"
#include "llvm/Transforms/Scalar/LoopUnrollPass.h"
#include "llvm/Transforms/Scalar/LowerAtomic.h"
#include "llvm/Transforms/Scalar/LowerExpectIntrinsic.h"
@ -179,6 +180,10 @@ static cl::opt<bool> EnableGVNSink(
"enable-npm-gvn-sink", cl::init(false), cl::Hidden,
cl::desc("Enable the GVN hoisting pass for the new PM (default = off)"));
static cl::opt<bool> EnableUnrollAndJam(
"enable-npm-unroll-and-jam", cl::init(false), cl::Hidden,
cl::desc("Enable the Unroll and Jam pass for the new PM (default = off)"));
static cl::opt<bool> EnableSyntheticCounts(
"enable-npm-synthetic-counts", cl::init(false), cl::Hidden, cl::ZeroOrMore,
cl::desc("Run synthetic function entry count generation "
@ -798,6 +803,11 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level,
// FIXME: It would be really good to use a loop-integrated instruction
// combiner for cleanup here so that the unrolling and LICM can be pipelined
// across the loop nests.
// We do UnrollAndJam in a separate LPM to ensure it happens before unroll
if (EnableUnrollAndJam) {
OptimizePM.addPass(
createFunctionToLoopPassAdaptor(LoopUnrollAndJamPass(Level)));
}
OptimizePM.addPass(LoopUnrollPass(Level));
OptimizePM.addPass(InstCombinePass());
OptimizePM.addPass(RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>());

View File

@ -241,6 +241,7 @@ LOOP_PASS("simplify-cfg", LoopSimplifyCFGPass())
LOOP_PASS("strength-reduce", LoopStrengthReducePass())
LOOP_PASS("indvars", IndVarSimplifyPass())
LOOP_PASS("irce", IRCEPass())
LOOP_PASS("unroll-and-jam", LoopUnrollAndJamPass())
LOOP_PASS("unroll-full", LoopFullUnrollPass())
LOOP_PASS("unswitch", SimpleLoopUnswitchPass())
LOOP_PASS("print-access-info", LoopAccessInfoPrinterPass(dbgs()))

View File

@ -622,6 +622,8 @@ void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
UP.Runtime = true;
UP.UnrollRemainder = true;
UP.DefaultUnrollRuntimeCount = 4;
UP.UnrollAndJam = true;
UP.UnrollAndJamInnerLoopThreshold = 60;
// Force unrolling small loops can be very useful because of the branch
// taken cost of the backedge.

View File

@ -96,6 +96,10 @@ static cl::opt<bool> EnableLoopInterchange(
"enable-loopinterchange", cl::init(false), cl::Hidden,
cl::desc("Enable the new, experimental LoopInterchange Pass"));
static cl::opt<bool> EnableUnrollAndJam("enable-unroll-and-jam",
cl::init(false), cl::Hidden,
cl::desc("Enable Unroll And Jam Pass"));
static cl::opt<bool>
EnablePrepareForThinLTO("prepare-for-thinlto", cl::init(false), cl::Hidden,
cl::desc("Enable preparation for ThinLTO."));
@ -669,6 +673,13 @@ void PassManagerBuilder::populateModulePassManager(
addInstructionCombiningPass(MPM);
if (!DisableUnrollLoops) {
if (EnableUnrollAndJam) {
// Unroll and Jam. We do this before unroll but need to be in a separate
// loop pass manager in order for the outer loop to be processed by
// unroll and jam before the inner loop is unrolled.
MPM.add(createLoopUnrollAndJamPass(OptLevel));
}
MPM.add(createLoopUnrollPass(OptLevel)); // Unroll small loops
// LoopUnroll may generate some redundency to cleanup.

View File

@ -39,6 +39,7 @@ add_llvm_library(LLVMScalarOpts
LoopSimplifyCFG.cpp
LoopStrengthReduce.cpp
LoopUnrollPass.cpp
LoopUnrollAndJamPass.cpp
LoopUnswitch.cpp
LoopVersioningLICM.cpp
LowerAtomic.cpp

View File

@ -0,0 +1,447 @@
//===- LoopUnrollAndJamPass.cpp - Loop unroll and jam pass ----------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass implements an unroll and jam pass. Most of the work is done by
// Utils/LoopUnrollAndJam.cpp.
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar/LoopUnrollAndJamPass.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DependenceAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <string>
using namespace llvm;
#define DEBUG_TYPE "loop-unroll-and-jam"
// Command-line knobs for unroll and jam. When given on the command line these
// override the target-supplied UnrollingPreferences (see
// tryToUnrollAndJamLoop, which checks getNumOccurrences()).
static cl::opt<bool>
    AllowUnrollAndJam("allow-unroll-and-jam", cl::Hidden,
                      cl::desc("Allows loops to be unroll-and-jammed."));

static cl::opt<unsigned> UnrollAndJamCount(
    "unroll-and-jam-count", cl::Hidden,
    cl::desc("Use this unroll count for all loops including those with "
             "unroll_and_jam_count pragma values, for testing purposes"));

// Default of 60 matches the UnrollAndJamInnerLoopThreshold default set in
// gatherUnrollingPreferences.
static cl::opt<unsigned> UnrollAndJamThreshold(
    "unroll-and-jam-threshold", cl::init(60), cl::Hidden,
    cl::desc("Threshold to use for inner loop when doing unroll and jam."));

static cl::opt<unsigned> PragmaUnrollAndJamThreshold(
    "pragma-unroll-and-jam-threshold", cl::init(1024), cl::Hidden,
    cl::desc("Unrolled size limit for loops with an unroll_and_jam(full) or "
             "unroll_count pragma."));
// Returns the loop hint metadata node with the given name (for example,
// "llvm.loop.unroll.count"). If no such metadata node exists, then nullptr is
// returned.
// Returns the loop hint metadata node with the given name (for example,
// "llvm.loop.unroll.count"). If no such metadata node exists, then nullptr is
// returned.
static MDNode *GetUnrollMetadataForLoop(const Loop *L, StringRef Name) {
  MDNode *LoopID = L->getLoopID();
  return LoopID ? GetUnrollMetadata(LoopID, Name) : nullptr;
}
// Returns true if the loop has any metadata starting with Prefix. For example a
// Prefix of "llvm.loop.unroll." returns true if we have any unroll metadata.
// Returns true if the loop has any metadata starting with Prefix. For example a
// Prefix of "llvm.loop.unroll." returns true if we have any unroll metadata.
static bool HasAnyUnrollPragma(const Loop *L, StringRef Prefix) {
  MDNode *LoopID = L->getLoopID();
  if (!LoopID)
    return false;

  // First operand should refer to the loop id itself.
  assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
  assert(LoopID->getOperand(0) == LoopID && "invalid loop id");

  // Scan the remaining operands for a hint node whose name begins with Prefix.
  for (unsigned Idx = 1, NumOps = LoopID->getNumOperands(); Idx != NumOps;
       ++Idx) {
    auto *Hint = dyn_cast<MDNode>(LoopID->getOperand(Idx));
    if (!Hint)
      continue;
    auto *Name = dyn_cast<MDString>(Hint->getOperand(0));
    if (Name && Name->getString().startswith(Prefix))
      return true;
  }
  return false;
}
// Returns true if the loop has an unroll_and_jam(enable) pragma.
// Returns true if the loop has an unroll_and_jam(enable) pragma.
static bool HasUnrollAndJamEnablePragma(const Loop *L) {
  // The mere presence of the metadata node encodes "enable".
  return GetUnrollMetadataForLoop(L, "llvm.loop.unroll_and_jam.enable") !=
         nullptr;
}
// Returns true if the loop has an unroll_and_jam(disable) pragma.
// Returns true if the loop has an unroll_and_jam(disable) pragma.
static bool HasUnrollAndJamDisablePragma(const Loop *L) {
  // The mere presence of the metadata node encodes "disable".
  return GetUnrollMetadataForLoop(L, "llvm.loop.unroll_and_jam.disable") !=
         nullptr;
}
// If loop has an unroll_and_jam_count pragma return the (necessarily
// positive) value from the pragma. Otherwise return 0.
// If loop has an unroll_and_jam_count pragma return the (necessarily
// positive) value from the pragma. Otherwise return 0.
static unsigned UnrollAndJamCountPragmaValue(const Loop *L) {
  MDNode *MD = GetUnrollMetadataForLoop(L, "llvm.loop.unroll_and_jam.count");
  if (!MD)
    return 0;

  assert(MD->getNumOperands() == 2 &&
         "Unroll count hint metadata should have two operands.");
  unsigned Count =
      mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
  assert(Count >= 1 && "Unroll count must be positive.");
  return Count;
}
// Returns loop size estimation for unrolled loop.
static uint64_t
getUnrollAndJammedLoopSize(unsigned LoopSize,
TargetTransformInfo::UnrollingPreferences &UP) {
assert(LoopSize >= UP.BEInsns && "LoopSize should not be less than BEInsns!");
return static_cast<uint64_t>(LoopSize - UP.BEInsns) * UP.Count + UP.BEInsns;
}
// Calculates unroll and jam count and writes it to UP.Count. Returns true if
// unroll count was set explicitly.
// Calculates unroll and jam count and writes it to UP.Count. Returns true if
// the unroll count was set explicitly (by the -unroll-and-jam-count option, an
// unroll_and_jam pragma, or an explicit request detected by
// computeUnrollCount).
//
// The decision proceeds in stages:
//  1. An explicit -unroll-and-jam-count option wins outright, if the unrolled
//     sizes fit the outer and inner thresholds.
//  2. An unroll_and_jam_count pragma wins next, subject to the same limits.
//  3. Otherwise the regular unroller heuristic (computeUnrollCount) picks a
//     count for the outer loop, which is then reduced until the jammed inner
//     loop fits under UP.UnrollAndJamInnerLoopThreshold, and profitability
//     checks (single-block subloop, shared invariant loads) are applied.
static bool computeUnrollAndJamCount(
    Loop *L, Loop *SubLoop, const TargetTransformInfo &TTI, DominatorTree &DT,
    LoopInfo *LI, ScalarEvolution &SE,
    const SmallPtrSetImpl<const Value *> &EphValues,
    OptimizationRemarkEmitter *ORE, unsigned OuterTripCount,
    unsigned OuterTripMultiple, unsigned OuterLoopSize, unsigned InnerTripCount,
    unsigned InnerLoopSize, TargetTransformInfo::UnrollingPreferences &UP) {
  // Check for explicit Count from the "unroll-and-jam-count" option.
  bool UserUnrollCount = UnrollAndJamCount.getNumOccurrences() > 0;
  if (UserUnrollCount) {
    UP.Count = UnrollAndJamCount;
    UP.Force = true;
    // Accept the user count only if both the unrolled outer size and the
    // jammed inner size stay under their respective thresholds.
    if (UP.AllowRemainder &&
        getUnrollAndJammedLoopSize(OuterLoopSize, UP) < UP.Threshold &&
        getUnrollAndJammedLoopSize(InnerLoopSize, UP) <
            UP.UnrollAndJamInnerLoopThreshold)
      return true;
  }

  // Check for unroll_and_jam pragmas
  unsigned PragmaCount = UnrollAndJamCountPragmaValue(L);
  if (PragmaCount > 0) {
    UP.Count = PragmaCount;
    UP.Runtime = true;
    UP.Force = true;
    // Either a remainder loop must be allowed, or the trip multiple must
    // divide evenly by the pragma count.
    if ((UP.AllowRemainder || (OuterTripMultiple % PragmaCount == 0)) &&
        getUnrollAndJammedLoopSize(OuterLoopSize, UP) < UP.Threshold &&
        getUnrollAndJammedLoopSize(InnerLoopSize, UP) <
            UP.UnrollAndJamInnerLoopThreshold)
      return true;
  }

  // Use computeUnrollCount from the loop unroller to get a sensible count
  // for unrolling the outer loop. This uses UP.Threshold /
  // UP.PartialThreshold / UP.MaxCount to come up with sensible loop values.
  // We have already checked that the loop has no unroll.* pragmas.
  unsigned MaxTripCount = 0;
  bool UseUpperBound = false;
  bool ExplicitUnroll = computeUnrollCount(
      L, TTI, DT, LI, SE, EphValues, ORE, OuterTripCount, MaxTripCount,
      OuterTripMultiple, OuterLoopSize, UP, UseUpperBound);
  if (ExplicitUnroll || UseUpperBound) {
    // If the user explicitly set the loop as unrolled, don't UnJ it. Leave it
    // for the unroller instead.
    UP.Count = 0;
    return false;
  }

  bool PragmaEnableUnroll = HasUnrollAndJamEnablePragma(L);
  ExplicitUnroll = PragmaCount > 0 || PragmaEnableUnroll || UserUnrollCount;

  // If the loop has an unrolling pragma, we want to be more aggressive with
  // unrolling limits.
  if (ExplicitUnroll && OuterTripCount != 0)
    UP.UnrollAndJamInnerLoopThreshold = PragmaUnrollAndJamThreshold;

  // Without a remainder loop we cannot shrink the count below (see the loop
  // further down), so bail out if the jammed inner loop is already too big.
  if (!UP.AllowRemainder && getUnrollAndJammedLoopSize(InnerLoopSize, UP) >=
                                UP.UnrollAndJamInnerLoopThreshold) {
    UP.Count = 0;
    return false;
  }

  // If the inner loop count is known and small, leave the entire loop nest to
  // be the unroller
  if (!ExplicitUnroll && InnerTripCount &&
      InnerLoopSize * InnerTripCount < UP.Threshold) {
    UP.Count = 0;
    return false;
  }

  // We have a sensible limit for the outer loop, now adjust it for the inner
  // loop and UP.UnrollAndJamInnerLoopThreshold.
  while (UP.Count != 0 && UP.AllowRemainder &&
         getUnrollAndJammedLoopSize(InnerLoopSize, UP) >=
             UP.UnrollAndJamInnerLoopThreshold)
    UP.Count--;

  if (!ExplicitUnroll) {
    // Check for situations where UnJ is likely to be unprofitable. Including
    // subloops with more than 1 block.
    if (SubLoop->getBlocks().size() != 1) {
      UP.Count = 0;
      return false;
    }

    // Limit to loops where there is something to gain from unrolling and
    // jamming the loop. In this case, look for loads that are invariant in the
    // outer loop and can become shared.
    unsigned NumInvariant = 0;
    for (BasicBlock *BB : SubLoop->getBlocks()) {
      for (Instruction &I : *BB) {
        if (auto *Ld = dyn_cast<LoadInst>(&I)) {
          Value *V = Ld->getPointerOperand();
          const SCEV *LSCEV = SE.getSCEVAtScope(V, L);
          if (SE.isLoopInvariant(LSCEV, L))
            NumInvariant++;
        }
      }
    }
    if (NumInvariant == 0) {
      UP.Count = 0;
      return false;
    }
  }

  return ExplicitUnroll;
}
// Tries to unroll-and-jam the outer loop L around its single inner loop.
// Returns the result of UnrollAndJamLoop, or Unmodified if the loop shape,
// pragmas, dependence-safety analysis or the cost model reject the transform.
static LoopUnrollResult
tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
                      ScalarEvolution &SE, const TargetTransformInfo &TTI,
                      AssumptionCache &AC, DependenceInfo &DI,
                      OptimizationRemarkEmitter &ORE, int OptLevel) {
  // Quick checks of the correct loop form. We need a simplify-form outer loop
  // containing exactly one subloop, itself in simplify form.
  if (!L->isLoopSimplifyForm() || L->getSubLoops().size() != 1)
    return LoopUnrollResult::Unmodified;
  Loop *SubLoop = L->getSubLoops()[0];
  if (!SubLoop->isLoopSimplifyForm())
    return LoopUnrollResult::Unmodified;

  // Both loops must exit from their latch block.
  BasicBlock *Latch = L->getLoopLatch();
  BasicBlock *Exit = L->getExitingBlock();
  BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
  BasicBlock *SubLoopExit = SubLoop->getExitingBlock();

  if (Latch != Exit || SubLoopLatch != SubLoopExit)
    return LoopUnrollResult::Unmodified;

  TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
      L, SE, TTI, OptLevel, None, None, None, None, None, None);
  // Command-line flags, when present, override the target's preferences.
  if (AllowUnrollAndJam.getNumOccurrences() > 0)
    UP.UnrollAndJam = AllowUnrollAndJam;
  if (UnrollAndJamThreshold.getNumOccurrences() > 0)
    UP.UnrollAndJamInnerLoopThreshold = UnrollAndJamThreshold;

  // Exit early if unrolling is disabled.
  if (!UP.UnrollAndJam || UP.UnrollAndJamInnerLoopThreshold == 0)
    return LoopUnrollResult::Unmodified;

  LLVM_DEBUG(dbgs() << "Loop Unroll and Jam: F["
                    << L->getHeader()->getParent()->getName() << "] Loop %"
                    << L->getHeader()->getName() << "\n");

  // A loop with any unroll pragma (enabling/disabling/count/etc) is left for
  // the unroller, so long as it does not explicitly have unroll_and_jam
  // metadata. This means #pragma nounroll will disable unroll and jam as well
  // as unrolling
  if (HasUnrollAndJamDisablePragma(L) ||
      (HasAnyUnrollPragma(L, "llvm.loop.unroll.") &&
       !HasAnyUnrollPragma(L, "llvm.loop.unroll_and_jam."))) {
    LLVM_DEBUG(dbgs() << " Disabled due to pragma.\n");
    return LoopUnrollResult::Unmodified;
  }

  // Dependence analysis must prove the fore/sub/aft block reordering is legal.
  if (!isSafeToUnrollAndJam(L, SE, DT, DI)) {
    LLVM_DEBUG(dbgs() << " Disabled due to not being safe.\n");
    return LoopUnrollResult::Unmodified;
  }

  // Approximate the loop size and collect useful info
  unsigned NumInlineCandidates;
  bool NotDuplicatable;
  bool Convergent;
  SmallPtrSet<const Value *, 32> EphValues;
  CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
  unsigned InnerLoopSize =
      ApproximateLoopSize(SubLoop, NumInlineCandidates, NotDuplicatable,
                          Convergent, TTI, EphValues, UP.BEInsns);
  unsigned OuterLoopSize =
      ApproximateLoopSize(L, NumInlineCandidates, NotDuplicatable, Convergent,
                          TTI, EphValues, UP.BEInsns);
  LLVM_DEBUG(dbgs() << " Outer Loop Size: " << OuterLoopSize << "\n");
  LLVM_DEBUG(dbgs() << " Inner Loop Size: " << InnerLoopSize << "\n");
  // NOTE(review): the flags below are overwritten by the second
  // ApproximateLoopSize call; presumably the outer-loop results subsume the
  // inner loop's since its blocks include the subloop's — confirm.
  if (NotDuplicatable) {
    LLVM_DEBUG(dbgs() << " Not unrolling loop which contains non-duplicatable "
                         "instructions.\n");
    return LoopUnrollResult::Unmodified;
  }
  if (NumInlineCandidates != 0) {
    LLVM_DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n");
    return LoopUnrollResult::Unmodified;
  }
  if (Convergent) {
    LLVM_DEBUG(
        dbgs() << " Not unrolling loop with convergent instructions.\n");
    return LoopUnrollResult::Unmodified;
  }

  // Find trip count and trip multiple
  unsigned OuterTripCount = SE.getSmallConstantTripCount(L, Latch);
  unsigned OuterTripMultiple = SE.getSmallConstantTripMultiple(L, Latch);
  unsigned InnerTripCount = SE.getSmallConstantTripCount(SubLoop, SubLoopLatch);

  // Decide if, and by how much, to unroll
  bool IsCountSetExplicitly = computeUnrollAndJamCount(
      L, SubLoop, TTI, DT, LI, SE, EphValues, &ORE, OuterTripCount,
      OuterTripMultiple, OuterLoopSize, InnerTripCount, InnerLoopSize, UP);
  if (UP.Count <= 1)
    return LoopUnrollResult::Unmodified;
  // Unroll factor (Count) must be less or equal to TripCount.
  if (OuterTripCount && UP.Count > OuterTripCount)
    UP.Count = OuterTripCount;

  LoopUnrollResult UnrollResult =
      UnrollAndJamLoop(L, UP.Count, OuterTripCount, OuterTripMultiple,
                       UP.UnrollRemainder, LI, &SE, &DT, &AC, &ORE);

  // If loop has an unroll count pragma or unrolled by explicitly set count
  // mark loop as unrolled to prevent unrolling beyond that requested.
  if (UnrollResult != LoopUnrollResult::FullyUnrolled && IsCountSetExplicitly)
    L->setLoopAlreadyUnrolled();

  return UnrollResult;
}
namespace {

/// Legacy (old pass manager) driver for unroll and jam. All the real work is
/// done by tryToUnrollAndJamLoop; this class only gathers the analyses it
/// needs.
class LoopUnrollAndJam : public LoopPass {
public:
  static char ID; // Pass ID, replacement for typeid
  // Optimization level forwarded to the unrolling heuristics.
  unsigned OptLevel;

  LoopUnrollAndJam(int OptLevel = 2) : LoopPass(ID), OptLevel(OptLevel) {
    initializeLoopUnrollAndJamPass(*PassRegistry::getPassRegistry());
  }

  bool runOnLoop(Loop *L, LPPassManager &LPM) override {
    if (skipLoop(L))
      return false;

    Function &F = *L->getHeader()->getParent();

    auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    const TargetTransformInfo &TTI =
        getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
    auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
    auto &DI = getAnalysis<DependenceAnalysisWrapperPass>().getDI();
    // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
    // pass. Function analyses need to be preserved across loop transformations
    // but ORE cannot be preserved (see comment before the pass definition).
    OptimizationRemarkEmitter ORE(&F);

    LoopUnrollResult Result =
        tryToUnrollAndJamLoop(L, DT, LI, SE, TTI, AC, DI, ORE, OptLevel);

    // A fully unrolled loop no longer exists; tell the pass manager so it is
    // not revisited.
    if (Result == LoopUnrollResult::FullyUnrolled)
      LPM.markLoopAsDeleted(*L);

    return Result != LoopUnrollResult::Unmodified;
  }

  /// This transformation requires natural loop information & requires that
  /// loop preheaders be inserted into the CFG...
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.addRequired<DependenceAnalysisWrapperPass>();
    getLoopAnalysisUsage(AU);
  }
};

} // end anonymous namespace
char LoopUnrollAndJam::ID = 0;

// Register the legacy pass and its analysis dependencies with the
// PassRegistry so -loop-unroll-and-jam works from opt.
INITIALIZE_PASS_BEGIN(LoopUnrollAndJam, "loop-unroll-and-jam",
                      "Unroll and Jam loops", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(LoopPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DependenceAnalysisWrapperPass)
INITIALIZE_PASS_END(LoopUnrollAndJam, "loop-unroll-and-jam",
                    "Unroll and Jam loops", false, false)

/// Factory used by the legacy pipelines (e.g. PassManagerBuilder) and the
/// LLVMAddLoopUnrollAndJamPass C API binding.
Pass *llvm::createLoopUnrollAndJamPass(int OptLevel) {
  return new LoopUnrollAndJam(OptLevel);
}
/// New pass manager entry point: gathers the analyses tryToUnrollAndJamLoop
/// needs from the standard loop analysis results and runs the transform.
PreservedAnalyses LoopUnrollAndJamPass::run(Loop &L, LoopAnalysisManager &AM,
                                            LoopStandardAnalysisResults &AR,
                                            LPMUpdater &) {
  Function *F = L.getHeader()->getParent();

  // ORE is a function-level analysis, so it must be fetched through the proxy
  // from its cached function-level result.
  const auto &FAM =
      AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR).getManager();
  auto *ORE = FAM.getCachedResult<OptimizationRemarkEmitterAnalysis>(*F);
  // FIXME: This should probably be optional rather than required.
  if (!ORE)
    report_fatal_error(
        "LoopUnrollAndJamPass: OptimizationRemarkEmitterAnalysis not cached at "
        "a higher level");

  DependenceInfo DI(F, &AR.AA, &AR.SE, &AR.LI);

  if (tryToUnrollAndJamLoop(&L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, DI, *ORE,
                            OptLevel) == LoopUnrollResult::Unmodified)
    return PreservedAnalyses::all();

  return getLoopPassPreservedAnalyses();
}

View File

@ -165,7 +165,7 @@ static const unsigned NoThreshold = std::numeric_limits<unsigned>::max();
/// Gather the various unrolling parameters based on the defaults, compiler
/// flags, TTI overrides and user specified parameters.
static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences(
Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, int OptLevel,
Optional<unsigned> UserThreshold, Optional<unsigned> UserCount,
Optional<bool> UserAllowPartial, Optional<bool> UserRuntime,
@ -192,6 +192,8 @@ static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
UP.Force = false;
UP.UpperBound = false;
UP.AllowPeeling = true;
UP.UnrollAndJam = false;
UP.UnrollAndJamInnerLoopThreshold = 60;
// Override with any target specific settings
TTI.getUnrollingPreferences(L, SE, UP);
@ -615,11 +617,10 @@ static Optional<EstimatedUnrollCost> analyzeLoopUnrollCost(
}
/// ApproximateLoopSize - Approximate the size of the loop.
static unsigned
ApproximateLoopSize(const Loop *L, unsigned &NumCalls, bool &NotDuplicatable,
bool &Convergent, const TargetTransformInfo &TTI,
const SmallPtrSetImpl<const Value *> &EphValues,
unsigned BEInsns) {
unsigned llvm::ApproximateLoopSize(
const Loop *L, unsigned &NumCalls, bool &NotDuplicatable, bool &Convergent,
const TargetTransformInfo &TTI,
const SmallPtrSetImpl<const Value *> &EphValues, unsigned BEInsns) {
CodeMetrics Metrics;
for (BasicBlock *BB : L->blocks())
Metrics.analyzeBasicBlock(BB, TTI, EphValues);
@ -712,7 +713,7 @@ static uint64_t getUnrolledLoopSize(
// Returns true if unroll count was set explicitly.
// Calculates unroll count and writes it to UP.Count.
static bool computeUnrollCount(
bool llvm::computeUnrollCount(
Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI,
ScalarEvolution &SE, const SmallPtrSetImpl<const Value *> &EphValues,
OptimizationRemarkEmitter *ORE, unsigned &TripCount, unsigned MaxTripCount,
@ -753,8 +754,8 @@ static bool computeUnrollCount(
if (ExplicitUnroll && TripCount != 0) {
// If the loop has an unrolling pragma, we want to be more aggressive with
// unrolling limits. Set thresholds to at least the PragmaThreshold value
// which is larger than the default limits.
// unrolling limits. Set thresholds to at least the PragmaUnrollThreshold
// value which is larger than the default limits.
UP.Threshold = std::max<unsigned>(UP.Threshold, PragmaUnrollThreshold);
UP.PartialThreshold =
std::max<unsigned>(UP.PartialThreshold, PragmaUnrollThreshold);

View File

@ -70,6 +70,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
initializeLoopStrengthReducePass(Registry);
initializeLoopRerollPass(Registry);
initializeLoopUnrollPass(Registry);
initializeLoopUnrollAndJamPass(Registry);
initializeLoopUnswitchPass(Registry);
initializeLoopVersioningLICMPass(Registry);
initializeLoopIdiomRecognizeLegacyPassPass(Registry);
@ -185,6 +186,10 @@ void LLVMAddLoopUnrollPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createLoopUnrollPass());
}
void LLVMAddLoopUnrollAndJamPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createLoopUnrollAndJamPass());
}
void LLVMAddLoopUnswitchPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createLoopUnswitchPass());
}

View File

@ -28,6 +28,7 @@ add_llvm_library(LLVMTransformUtils
LoopRotationUtils.cpp
LoopSimplify.cpp
LoopUnroll.cpp
LoopUnrollAndJam.cpp
LoopUnrollPeel.cpp
LoopUnrollRuntime.cpp
LoopUtils.cpp

View File

@ -63,8 +63,7 @@ UnrollVerifyDomtree("unroll-verify-domtree", cl::Hidden,
/// Convert the instruction operands from referencing the current values into
/// those specified by VMap.
static inline void remapInstruction(Instruction *I,
ValueToValueMapTy &VMap) {
void llvm::remapInstruction(Instruction *I, ValueToValueMapTy &VMap) {
for (unsigned op = 0, E = I->getNumOperands(); op != E; ++op) {
Value *Op = I->getOperand(op);
@ -98,9 +97,9 @@ static inline void remapInstruction(Instruction *I,
/// Folds a basic block into its predecessor if it only has one predecessor, and
/// that predecessor only has one successor.
/// The LoopInfo Analysis that is passed will be kept consistent.
static BasicBlock *
foldBlockIntoPredecessor(BasicBlock *BB, LoopInfo *LI, ScalarEvolution *SE,
DominatorTree *DT) {
BasicBlock *llvm::foldBlockIntoPredecessor(BasicBlock *BB, LoopInfo *LI,
ScalarEvolution *SE,
DominatorTree *DT) {
// Merge basic blocks into their predecessor if there is only one distinct
// pred, and if there is only one distinct successor of the predecessor, and
// if there are no PHI nodes.
@ -110,7 +109,8 @@ foldBlockIntoPredecessor(BasicBlock *BB, LoopInfo *LI, ScalarEvolution *SE,
if (OnlyPred->getTerminator()->getNumSuccessors() != 1)
return nullptr;
LLVM_DEBUG(dbgs() << "Merging: " << *BB << "into: " << *OnlyPred);
LLVM_DEBUG(dbgs() << "Merging: " << BB->getName() << " into "
<< OnlyPred->getName() << "\n");
// Resolve any PHI nodes at the start of the block. They are all
// guaranteed to have exactly one entry if they exist, unless there are
@ -255,9 +255,9 @@ static bool isEpilogProfitable(Loop *L) {
/// Perform some cleanup and simplifications on loops after unrolling. It is
/// useful to simplify the IV's in the new loop, as well as do a quick
/// simplify/dce pass of the instructions.
static void simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI,
ScalarEvolution *SE, DominatorTree *DT,
AssumptionCache *AC) {
void llvm::simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI,
ScalarEvolution *SE, DominatorTree *DT,
AssumptionCache *AC) {
// Simplify any new induction variables in the partially unrolled loop.
if (SE && SimplifyIVs) {
SmallVector<WeakTrackingVH, 16> DeadInsts;
@ -473,8 +473,8 @@ LoopUnrollResult llvm::UnrollLoop(
if (Force)
RuntimeTripCount = false;
else {
LLVM_DEBUG(dbgs() << "Wont unroll; remainder loop could not be generated"
"when assuming runtime trip count\n");
LLVM_DEBUG(dbgs() << "Won't unroll; remainder loop could not be "
"generated when assuming runtime trip count\n");
return LoopUnrollResult::Unmodified;
}
}

View File

@ -0,0 +1,774 @@
//===-- LoopUnrollAndJam.cpp - Loop unrolling utilities -------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements loop unroll and jam as a routine, much like
// LoopUnroll.cpp implements loop unroll.
//
//===----------------------------------------------------------------------===//
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/DependenceAnalysis.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/Utils/Local.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/SimplifyIndVar.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"
using namespace llvm;
#define DEBUG_TYPE "loop-unroll-and-jam"
// NumUnrolledAndJammed counts every loop this transform changes; the second
// statistic counts only the subset whose loop control was eliminated
// entirely. The descriptions must differ so -stats output is unambiguous
// (the original second description was a copy-paste of the first).
STATISTIC(NumUnrolledAndJammed, "Number of loops unroll and jammed");
STATISTIC(NumCompletelyUnrolledAndJammed,
          "Number of loops fully unroll and jammed");
/// Returns true if \p BB is present in the block list \p V.
static bool containsBB(std::vector<BasicBlock *> &V, BasicBlock *BB) {
  // Use LLVM's idiomatic helper rather than spelling out std::find.
  return llvm::is_contained(V, BB);
}
// Partition blocks in an outer/inner loop pair into blocks before (Fore) and
// after (Aft) the inner loop. SubLoopBlocks receives the inner loop's blocks.
// Returns false if the Fore blocks can branch outside themselves (other than
// into the subloop preheader), i.e. the required Fore->Sub->Aft structure
// does not hold.
static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop,
                                     std::vector<BasicBlock *> &ForeBlocks,
                                     std::vector<BasicBlock *> &SubLoopBlocks,
                                     std::vector<BasicBlock *> &AftBlocks,
                                     DominatorTree *DT) {
  BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
  SubLoopBlocks = SubLoop->getBlocks();

  // A block of the outer loop comes after the inner loop exactly when it is
  // dominated by the inner loop's latch; otherwise it comes before.
  for (BasicBlock *BB : L->blocks()) {
    if (!SubLoop->contains(BB)) {
      if (DT->dominates(SubLoopLatch, BB))
        AftBlocks.push_back(BB);
      else
        ForeBlocks.push_back(BB);
    }
  }

  // Check that all blocks in ForeBlocks together dominate the subloop
  // TODO: This might ideally be done better with a dominator/postdominators.
  BasicBlock *SubLoopPreHeader = SubLoop->getLoopPreheader();
  // Use a set for O(1) membership tests; the original vector scan made this
  // check quadratic in the number of fore blocks.
  SmallPtrSet<BasicBlock *, 16> ForeSet(ForeBlocks.begin(), ForeBlocks.end());
  for (BasicBlock *BB : ForeBlocks) {
    if (BB == SubLoopPreHeader)
      continue;
    TerminatorInst *TI = BB->getTerminator();
    for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
      if (!ForeSet.count(TI->getSuccessor(i)))
        return false;
  }
  return true;
}
// Move the phi operands of Header from Latch out of AftBlocks to InsertLoc.
// Specifically: for each Header phi, the value incoming from Latch plus the
// transitive closure of its operands that live in AftBlocks are moved to just
// before InsertLoc, so they are available before the (jammed) subloop.
// isSafeToUnrollAndJam has already verified these instructions are movable.
static void
moveHeaderPhiOperandsToForeBlocks(BasicBlock *Header, BasicBlock *Latch,
                                  Instruction *InsertLoc,
                                  std::vector<BasicBlock *> &AftBlocks) {
  // We need to ensure we move the instructions in the correct order,
  // starting with the earliest required instruction and moving forward.
  // Collect the instructions to move with a worklist. A visited set is
  // required: without it an instruction reachable through several operand
  // chains is queued (and previously was recorded and moved) multiple times,
  // which can blow up the worklist and re-move a definition past one of its
  // already-moved users.
  std::vector<Instruction *> Worklist;
  SmallPtrSet<Instruction *, 8> Visited;
  for (auto &Phi : Header->phis()) {
    Value *V = Phi.getIncomingValueForBlock(Latch);
    if (Instruction *I = dyn_cast<Instruction>(V))
      Worklist.push_back(I);
  }

  while (!Worklist.empty()) {
    Instruction *I = Worklist.back();
    Worklist.pop_back();
    // Only instructions inside the aft blocks need to be moved.
    if (!containsBB(AftBlocks, I->getParent()))
      continue;
    if (!Visited.insert(I).second)
      continue;
    for (auto &U : I->operands())
      if (Instruction *II = dyn_cast<Instruction>(U))
        Worklist.push_back(II);
  }

  // Move the collected instructions to before InsertLoc, preserving their
  // original program order within each block so defs still precede uses.
  BasicBlock *InsertLocBB = InsertLoc->getParent();
  for (BasicBlock *BB : AftBlocks) {
    if (BB == InsertLocBB)
      continue;
    // Snapshot first; moving while iterating the block would invalidate the
    // instruction iterator.
    SmallVector<Instruction *, 8> ToMove;
    for (Instruction &I : *BB)
      if (Visited.count(&I))
        ToMove.push_back(&I);
    for (Instruction *I : ToMove)
      I->moveBefore(InsertLoc);
  }
}
/*
This method performs Unroll and Jam. For a simple loop like:
for (i = ..)
Fore(i)
for (j = ..)
SubLoop(i, j)
Aft(i)
Instead of doing normal inner or outer unrolling, we do:
for (i = .., i+=2)
Fore(i)
Fore(i+1)
for (j = ..)
SubLoop(i, j)
SubLoop(i+1, j)
Aft(i)
Aft(i+1)
So the outer loop is essentially unrolled and then the inner loops are fused
("jammed") together into a single loop. This can increase speed when there
are loads in SubLoop that are invariant to i, as they become shared between
the now jammed inner loops.
We do this by splitting the blocks in the loop into Fore, Subloop and Aft.
Fore blocks are those before the inner loop, Aft are those after. Normal
Unroll code is used to copy each of these sets of blocks and the results are
combined together into the final form above.
isSafeToUnrollAndJam should be used prior to calling this to make sure the
unrolling will be valid. Checking profitability is also advisable.
*/
LoopUnrollResult
llvm::UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount,
                       unsigned TripMultiple, bool UnrollRemainder,
                       LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
                       AssumptionCache *AC, OptimizationRemarkEmitter *ORE) {
  // When we enter here we should have already checked that it is safe
  // (isSafeToUnrollAndJam), so L has exactly one subloop and the
  // Fore/SubLoop/Aft structure described in the comment above.
  BasicBlock *Header = L->getHeader();
  assert(L->getSubLoops().size() == 1);
  Loop *SubLoop = *L->begin();

  // Don't enter the unroll code if there is nothing to do.
  if (TripCount == 0 && Count < 2) {
    LLVM_DEBUG(dbgs() << "Won't unroll; almost nothing to do\n");
    return LoopUnrollResult::Unmodified;
  }

  assert(Count > 0);
  assert(TripMultiple > 0);
  assert(TripCount == 0 || TripCount % TripMultiple == 0);

  // Are we eliminating the loop control altogether?
  bool CompletelyUnroll = (Count == TripCount);

  // We use the runtime remainder in cases where we don't know trip multiple
  if (TripMultiple == 1 || TripMultiple % Count != 0) {
    if (!UnrollRuntimeLoopRemainder(L, Count, /*AllowExpensiveTripCount*/ false,
                                    /*UseEpilogRemainder*/ true,
                                    UnrollRemainder, LI, SE, DT, AC, true)) {
      LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; remainder loop could not be "
                           "generated when assuming runtime trip count\n");
      return LoopUnrollResult::Unmodified;
    }
  }

  // Notify ScalarEvolution that the loop will be substantially changed,
  // if not outright eliminated.
  if (SE) {
    SE->forgetLoop(L);
    SE->forgetLoop(SubLoop);
  }

  using namespace ore;
  // Report the unrolling decision.
  if (CompletelyUnroll) {
    LLVM_DEBUG(dbgs() << "COMPLETELY UNROLL AND JAMMING loop %"
                      << Header->getName() << " with trip count " << TripCount
                      << "!\n");
    ORE->emit(OptimizationRemark(DEBUG_TYPE, "FullyUnrolled", L->getStartLoc(),
                                 L->getHeader())
              << "completely unroll and jammed loop with "
              << NV("UnrollCount", TripCount) << " iterations");
  } else {
    auto DiagBuilder = [&]() {
      OptimizationRemark Diag(DEBUG_TYPE, "PartialUnrolled", L->getStartLoc(),
                              L->getHeader());
      return Diag << "unroll and jammed loop by a factor of "
                  << NV("UnrollCount", Count);
    };

    LLVM_DEBUG(dbgs() << "UNROLL AND JAMMING loop %" << Header->getName()
                      << " by " << Count);
    if (TripMultiple != 1) {
      LLVM_DEBUG(dbgs() << " with " << TripMultiple << " trips per branch");
      ORE->emit([&]() {
        return DiagBuilder() << " with " << NV("TripMultiple", TripMultiple)
                             << " trips per branch";
      });
    } else {
      LLVM_DEBUG(dbgs() << " with run-time trip count");
      ORE->emit([&]() { return DiagBuilder() << " with run-time trip count"; });
    }
    LLVM_DEBUG(dbgs() << "!\n");
  }

  BasicBlock *Preheader = L->getLoopPreheader();
  BasicBlock *LatchBlock = L->getLoopLatch();
  BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator());
  assert(Preheader && LatchBlock && Header);
  assert(BI && !BI->isUnconditional());
  // Whether the latch branch re-enters the loop on its 'true' edge; used
  // below when rewriting latch successors.
  bool ContinueOnTrue = L->contains(BI->getSuccessor(0));
  BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue);
  bool SubLoopContinueOnTrue = SubLoop->contains(
      SubLoop->getLoopLatch()->getTerminator()->getSuccessor(0));

  // Partition blocks in an outer/inner loop pair into blocks before and after
  // the loop
  std::vector<BasicBlock *> SubLoopBlocks;
  std::vector<BasicBlock *> ForeBlocks;
  std::vector<BasicBlock *> AftBlocks;
  partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks, AftBlocks,
                           DT);

  // We keep track of the entering/first and exiting/last block of each of
  // Fore/SubLoop/Aft in each iteration. This helps make the stapling up of
  // blocks easier.
  std::vector<BasicBlock *> ForeBlocksFirst;
  std::vector<BasicBlock *> ForeBlocksLast;
  std::vector<BasicBlock *> SubLoopBlocksFirst;
  std::vector<BasicBlock *> SubLoopBlocksLast;
  std::vector<BasicBlock *> AftBlocksFirst;
  std::vector<BasicBlock *> AftBlocksLast;
  ForeBlocksFirst.push_back(Header);
  ForeBlocksLast.push_back(SubLoop->getLoopPreheader());
  SubLoopBlocksFirst.push_back(SubLoop->getHeader());
  SubLoopBlocksLast.push_back(SubLoop->getExitingBlock());
  AftBlocksFirst.push_back(SubLoop->getExitBlock());
  AftBlocksLast.push_back(L->getExitingBlock());
  // Maps Blocks[0] -> Blocks[It]
  ValueToValueMapTy LastValueMap;

  // Move any instructions from fore phi operands from AftBlocks into Fore.
  moveHeaderPhiOperandsToForeBlocks(
      Header, LatchBlock, SubLoop->getLoopPreheader()->getTerminator(),
      AftBlocks);

  // The current on-the-fly SSA update requires blocks to be processed in
  // reverse postorder so that LastValueMap contains the correct value at each
  // exit.
  LoopBlocksDFS DFS(L);
  DFS.perform(LI);
  // Stash the DFS iterators before adding blocks to the loop.
  LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO();
  LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO();

  // Attach a duplication factor to debug locations so that sample-based
  // profiling can account for the unrolled copies (only when the function is
  // built for debug-info profiling).
  if (Header->getParent()->isDebugInfoForProfiling())
    for (BasicBlock *BB : L->getBlocks())
      for (Instruction &I : *BB)
        if (!isa<DbgInfoIntrinsic>(&I))
          if (const DILocation *DIL = I.getDebugLoc())
            I.setDebugLoc(DIL->cloneWithDuplicationFactor(Count));

  // Copy all blocks. Iteration 0 is the original loop; each It >= 1 clones
  // every block and files the clone under Fore/Sub/Aft, recording the
  // first/last block of each section for the rewiring phase below.
  for (unsigned It = 1; It != Count; ++It) {
    std::vector<BasicBlock *> NewBlocks;
    // Maps Blocks[It] -> Blocks[It-1]
    DenseMap<Value *, Value *> PrevItValueMap;

    for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
      ValueToValueMapTy VMap;
      BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It));
      Header->getParent()->getBasicBlockList().push_back(New);

      if (containsBB(ForeBlocks, *BB)) {
        L->addBasicBlockToLoop(New, *LI);

        if (*BB == ForeBlocksFirst[0])
          ForeBlocksFirst.push_back(New);
        if (*BB == ForeBlocksLast[0])
          ForeBlocksLast.push_back(New);
      } else if (containsBB(SubLoopBlocks, *BB)) {
        SubLoop->addBasicBlockToLoop(New, *LI);

        if (*BB == SubLoopBlocksFirst[0])
          SubLoopBlocksFirst.push_back(New);
        if (*BB == SubLoopBlocksLast[0])
          SubLoopBlocksLast.push_back(New);
      } else if (containsBB(AftBlocks, *BB)) {
        L->addBasicBlockToLoop(New, *LI);

        if (*BB == AftBlocksFirst[0])
          AftBlocksFirst.push_back(New);
        if (*BB == AftBlocksLast[0])
          AftBlocksLast.push_back(New);
      } else {
        llvm_unreachable("BB being cloned should be in Fore/Sub/Aft");
      }

      // Update our running maps of newest clones. PrevItValueMap maps this
      // iteration's values to the previous iteration's, which is needed to
      // thread the outer-loop phis from copy to copy.
      PrevItValueMap[New] = (It == 1 ? *BB : LastValueMap[*BB]);
      LastValueMap[*BB] = New;
      for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end();
           VI != VE; ++VI) {
        PrevItValueMap[VI->second] =
            const_cast<Value *>(It == 1 ? VI->first : LastValueMap[VI->first]);
        LastValueMap[VI->first] = VI->second;
      }

      NewBlocks.push_back(New);

      // Update DomTree:
      if (*BB == ForeBlocksFirst[0])
        DT->addNewBlock(New, ForeBlocksLast[It - 1]);
      else if (*BB == SubLoopBlocksFirst[0])
        DT->addNewBlock(New, SubLoopBlocksLast[It - 1]);
      else if (*BB == AftBlocksFirst[0])
        DT->addNewBlock(New, AftBlocksLast[It - 1]);
      else {
        // Each set of blocks (Fore/Sub/Aft) will have the same internal domtree
        // structure.
        auto BBDomNode = DT->getNode(*BB);
        auto BBIDom = BBDomNode->getIDom();
        BasicBlock *OriginalBBIDom = BBIDom->getBlock();
        assert(OriginalBBIDom);
        assert(LastValueMap[cast<Value>(OriginalBBIDom)]);
        DT->addNewBlock(
            New, cast<BasicBlock>(LastValueMap[cast<Value>(OriginalBBIDom)]));
      }
    }

    // Remap all instructions in the most recent iteration
    for (BasicBlock *NewBlock : NewBlocks) {
      for (Instruction &I : *NewBlock) {
        ::remapInstruction(&I, LastValueMap);
        // Cloned llvm.assume calls must be registered with the cache.
        if (auto *II = dyn_cast<IntrinsicInst>(&I))
          if (II->getIntrinsicID() == Intrinsic::assume)
            AC->registerAssumption(II);
      }
    }

    // Alter the ForeBlocks phi's, pointing them at the latest version of the
    // value from the previous iteration's phis
    for (PHINode &Phi : ForeBlocksFirst[It]->phis()) {
      Value *OldValue = Phi.getIncomingValueForBlock(AftBlocksLast[It]);
      assert(OldValue && "should have incoming edge from Aft[It]");
      Value *NewValue = OldValue;
      if (Value *PrevValue = PrevItValueMap[OldValue])
        NewValue = PrevValue;

      // Collapse the phi to a single incoming edge from the previous
      // iteration's last fore block.
      assert(Phi.getNumOperands() == 2);
      Phi.setIncomingBlock(0, ForeBlocksLast[It - 1]);
      Phi.setIncomingValue(0, NewValue);
      Phi.removeIncomingValue(1);
    }
  }

  // Now that all the basic blocks for the unrolled iterations are in place,
  // finish up connecting the blocks and phi nodes. At this point LastValueMap
  // is the last unrolled iterations values.

  // Update Phis in BB from OldBB to point to NewBB
  auto updatePHIBlocks = [](BasicBlock *BB, BasicBlock *OldBB,
                            BasicBlock *NewBB) {
    for (PHINode &Phi : BB->phis()) {
      int I = Phi.getBasicBlockIndex(OldBB);
      Phi.setIncomingBlock(I, NewBB);
    }
  };
  // Update Phis in BB from OldBB to point to NewBB and use the latest value
  // from LastValueMap
  auto updatePHIBlocksAndValues = [](BasicBlock *BB, BasicBlock *OldBB,
                                     BasicBlock *NewBB,
                                     ValueToValueMapTy &LastValueMap) {
    for (PHINode &Phi : BB->phis()) {
      for (unsigned b = 0; b < Phi.getNumIncomingValues(); ++b) {
        if (Phi.getIncomingBlock(b) == OldBB) {
          Value *OldValue = Phi.getIncomingValue(b);
          if (Value *LastValue = LastValueMap[OldValue])
            Phi.setIncomingValue(b, LastValue);
          Phi.setIncomingBlock(b, NewBB);
          break;
        }
      }
    }
  };
  // Move all the phis from Src into Dest
  auto movePHIs = [](BasicBlock *Src, BasicBlock *Dest) {
    Instruction *insertPoint = Dest->getFirstNonPHI();
    while (PHINode *Phi = dyn_cast<PHINode>(Src->begin()))
      Phi->moveBefore(insertPoint);
  };

  // Update the PHI values outside the loop to point to the last block
  updatePHIBlocksAndValues(LoopExit, AftBlocksLast[0], AftBlocksLast.back(),
                           LastValueMap);

  // Update ForeBlocks successors and phi nodes
  BranchInst *ForeTerm =
      cast<BranchInst>(ForeBlocksLast.back()->getTerminator());
  BasicBlock *Dest = SubLoopBlocksFirst[0];
  ForeTerm->setSuccessor(0, Dest);

  if (CompletelyUnroll) {
    // No back edge remains: replace the header phis with their preheader
    // values and delete them.
    while (PHINode *Phi = dyn_cast<PHINode>(ForeBlocksFirst[0]->begin())) {
      Phi->replaceAllUsesWith(Phi->getIncomingValueForBlock(Preheader));
      Phi->getParent()->getInstList().erase(Phi);
    }
  } else {
    // Update the PHI values to point to the last aft block
    updatePHIBlocksAndValues(ForeBlocksFirst[0], AftBlocksLast[0],
                             AftBlocksLast.back(), LastValueMap);
  }

  for (unsigned It = 1; It != Count; It++) {
    // Remap ForeBlock successors from previous iteration to this
    BranchInst *ForeTerm =
        cast<BranchInst>(ForeBlocksLast[It - 1]->getTerminator());
    BasicBlock *Dest = ForeBlocksFirst[It];
    ForeTerm->setSuccessor(0, Dest);
  }

  // Subloop successors and phis
  BranchInst *SubTerm =
      cast<BranchInst>(SubLoopBlocksLast.back()->getTerminator());
  SubTerm->setSuccessor(!SubLoopContinueOnTrue, SubLoopBlocksFirst[0]);
  SubTerm->setSuccessor(SubLoopContinueOnTrue, AftBlocksFirst[0]);
  updatePHIBlocks(SubLoopBlocksFirst[0], ForeBlocksLast[0],
                  ForeBlocksLast.back());
  updatePHIBlocks(SubLoopBlocksFirst[0], SubLoopBlocksLast[0],
                  SubLoopBlocksLast.back());

  for (unsigned It = 1; It != Count; It++) {
    // Replace the conditional branch of the previous iteration subloop with an
    // unconditional one to this one
    BranchInst *SubTerm =
        cast<BranchInst>(SubLoopBlocksLast[It - 1]->getTerminator());
    BranchInst::Create(SubLoopBlocksFirst[It], SubTerm);
    SubTerm->eraseFromParent();

    updatePHIBlocks(SubLoopBlocksFirst[It], ForeBlocksLast[It],
                    ForeBlocksLast.back());
    updatePHIBlocks(SubLoopBlocksFirst[It], SubLoopBlocksLast[It],
                    SubLoopBlocksLast.back());
    // All jammed inner-loop phis live in the first copy's header.
    movePHIs(SubLoopBlocksFirst[It], SubLoopBlocksFirst[0]);
  }

  // Aft blocks successors and phis
  BranchInst *Term = cast<BranchInst>(AftBlocksLast.back()->getTerminator());
  if (CompletelyUnroll) {
    BranchInst::Create(LoopExit, Term);
    Term->eraseFromParent();
  } else {
    Term->setSuccessor(!ContinueOnTrue, ForeBlocksFirst[0]);
  }
  updatePHIBlocks(AftBlocksFirst[0], SubLoopBlocksLast[0],
                  SubLoopBlocksLast.back());

  for (unsigned It = 1; It != Count; It++) {
    // Replace the conditional branch of the previous iteration subloop with an
    // unconditional one to this one
    BranchInst *AftTerm =
        cast<BranchInst>(AftBlocksLast[It - 1]->getTerminator());
    BranchInst::Create(AftBlocksFirst[It], AftTerm);
    AftTerm->eraseFromParent();

    updatePHIBlocks(AftBlocksFirst[It], SubLoopBlocksLast[It],
                    SubLoopBlocksLast.back());
    movePHIs(AftBlocksFirst[It], AftBlocksFirst[0]);
  }

  // Dominator Tree. Remove the old links between Fore, Sub and Aft, adding the
  // new ones required.
  if (Count != 1) {
    SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
    DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete, ForeBlocksLast[0],
                           SubLoopBlocksFirst[0]);
    DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete,
                           SubLoopBlocksLast[0], AftBlocksFirst[0]);
    DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert,
                           ForeBlocksLast.back(), SubLoopBlocksFirst[0]);
    DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert,
                           SubLoopBlocksLast.back(), AftBlocksFirst[0]);
    DT->applyUpdates(DTUpdates);
  }

  // Merge adjacent basic blocks, if possible. Each section's last block is a
  // candidate; folding a successor keeps the merged block's identity (BB), so
  // only the folded-away Dest is removed from the worklist set.
  SmallPtrSet<BasicBlock *, 16> MergeBlocks;
  MergeBlocks.insert(ForeBlocksLast.begin(), ForeBlocksLast.end());
  MergeBlocks.insert(SubLoopBlocksLast.begin(), SubLoopBlocksLast.end());
  MergeBlocks.insert(AftBlocksLast.begin(), AftBlocksLast.end());
  while (!MergeBlocks.empty()) {
    BasicBlock *BB = *MergeBlocks.begin();
    BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator());
    if (Term && Term->isUnconditional() && L->contains(Term->getSuccessor(0))) {
      BasicBlock *Dest = Term->getSuccessor(0);
      if (BasicBlock *Fold = foldBlockIntoPredecessor(Dest, LI, SE, DT)) {
        // Don't remove BB and add Fold as they are the same BB
        assert(Fold == BB);
        (void)Fold;
        MergeBlocks.erase(Dest);
      } else
        MergeBlocks.erase(BB);
    } else
      MergeBlocks.erase(BB);
  }

  // At this point, the code is well formed. We now do a quick sweep over the
  // inserted code, doing constant propagation and dead code elimination as we
  // go.
  simplifyLoopAfterUnroll(SubLoop, true, LI, SE, DT, AC);
  simplifyLoopAfterUnroll(L, !CompletelyUnroll && Count > 1, LI, SE, DT, AC);

  NumCompletelyUnrolledAndJammed += CompletelyUnroll;
  ++NumUnrolledAndJammed;

#ifndef NDEBUG
  // We shouldn't have done anything to break loop simplify form or LCSSA.
  Loop *OuterL = L->getParentLoop();
  Loop *OutestLoop = OuterL ? OuterL : (!CompletelyUnroll ? L : SubLoop);
  assert(OutestLoop->isRecursivelyLCSSAForm(*DT, *LI));
  if (!CompletelyUnroll)
    assert(L->isLoopSimplifyForm());
  assert(SubLoop->isLoopSimplifyForm());
  assert(DT->verify());
#endif

  // Update LoopInfo if the loop is completely removed.
  if (CompletelyUnroll)
    LI->erase(L);

  return CompletelyUnroll ? LoopUnrollResult::FullyUnrolled
                          : LoopUnrollResult::PartiallyUnrolled;
}
// Collect every simple (non-volatile, non-atomic) load and store in Blocks
// into MemInstr. Returns false if any block contains a non-simple memory
// access or any other instruction that may read or write memory, since those
// defeat dependence analysis.
static bool getLoadsAndStores(std::vector<BasicBlock *> &Blocks,
                              SmallVector<Value *, 4> &MemInstr) {
  for (BasicBlock *BB : Blocks) {
    for (Instruction &I : *BB) {
      // Instructions that never touch memory are always acceptable.
      if (!I.mayReadOrWriteMemory())
        continue;
      if (auto *Ld = dyn_cast<LoadInst>(&I)) {
        if (!Ld->isSimple())
          return false;
        MemInstr.push_back(&I);
        continue;
      }
      if (auto *St = dyn_cast<StoreInst>(&I)) {
        if (!St->isSimple())
          return false;
        MemInstr.push_back(&I);
        continue;
      }
      // Anything else that touches memory (calls, fences, atomics, ...) is
      // not handled.
      return false;
    }
  }
  return true;
}
// Check every (Earlier, Later) pair of memory instructions with dependence
// analysis, returning false on any dependence that would make the planned
// block reordering (and inner-loop jamming) change behaviour.
static bool checkDependencies(SmallVector<Value *, 4> &Earlier,
                              SmallVector<Value *, 4> &Later,
                              unsigned LoopDepth, bool InnerLoop,
                              DependenceInfo &DI) {
  for (Value *EarlierVal : Earlier) {
    Instruction *Src = cast<Instruction>(EarlierVal);
    for (Value *LaterVal : Later) {
      Instruction *Dst = cast<Instruction>(LaterVal);
      // Skip identical instructions (the same list may be passed twice) and
      // read-after-read pairs, which can never conflict.
      if (Src == Dst)
        continue;
      if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
        continue;

      auto D = DI.depends(Src, Dst, true);
      if (!D)
        continue;

      // A dependence was found. Be conservative: at the outer loop level
      // allow only the = and < directions (some > would be safe, depending
      // on the unroll width). For the jammed inner loop additionally reject
      // any (> <) direction vector.
      // FIXME: Allow > so long as distance is less than unroll width.
      assert(D->isOrdered() && "Expected an output, flow or anti dep.");
      if (D->isConfused())
        return false;
      if (!InnerLoop) {
        if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT)
          return false;
      } else {
        assert(LoopDepth + 1 <= D->getLevels());
        if ((D->getDirection(LoopDepth) & Dependence::DVEntry::GT) &&
            (D->getDirection(LoopDepth + 1) & Dependence::DVEntry::LT))
          return false;
      }
    }
  }
  return true;
}
// Gather the loads/stores of each section and verify that no memory
// dependence forbids the Fore-Sub, Fore-Aft, Sub-Aft reordering or the
// Sub-Sub jamming performed by unroll and jam.
static bool checkDependencies(Loop *L, std::vector<BasicBlock *> &ForeBlocks,
                              std::vector<BasicBlock *> &SubLoopBlocks,
                              std::vector<BasicBlock *> &AftBlocks,
                              DependenceInfo &DI) {
  // Collect the candidate memory instructions; bail out if any section
  // contains an access we cannot reason about.
  SmallVector<Value *, 4> ForeMemInstr;
  SmallVector<Value *, 4> SubLoopMemInstr;
  SmallVector<Value *, 4> AftMemInstr;
  if (!getLoadsAndStores(ForeBlocks, ForeMemInstr))
    return false;
  if (!getLoadsAndStores(SubLoopBlocks, SubLoopMemInstr))
    return false;
  if (!getLoadsAndStores(AftBlocks, AftMemInstr))
    return false;

  // Check each pair of sections whose relative order changes.
  unsigned LoopDepth = L->getLoopDepth();
  if (!checkDependencies(ForeMemInstr, SubLoopMemInstr, LoopDepth, false, DI))
    return false;
  if (!checkDependencies(ForeMemInstr, AftMemInstr, LoopDepth, false, DI))
    return false;
  if (!checkDependencies(SubLoopMemInstr, AftMemInstr, LoopDepth, false, DI))
    return false;
  return checkDependencies(SubLoopMemInstr, SubLoopMemInstr, LoopDepth, true,
                           DI);
}
// Returns true if this loop nest has the required shape and it is legal
// (w.r.t. SSA form and memory dependencies) to unroll-and-jam it.
bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
                                DependenceInfo &DI) {
  /* We currently handle outer loops like this:
           |
       ForeFirst    <-----\    }
        Blocks            |    } ForeBlocks
       ForeLast           |    }
           |              |
       SubLoopFirst  <\   |    }
        Blocks        |   |    } SubLoopBlocks
       SubLoopLast   -/   |    }
           |              |
       AftFirst           |    }
        Blocks            |    } AftBlocks
       AftLast   ---------/    }
           |

     There are (theoretically) any number of blocks in ForeBlocks,
     SubLoopBlocks and AftBlocks, providing that there is one edge from Fores
     to SubLoops, one edge from SubLoops to Afts and a single outer loop exit
     (from Afts). In practice we currently limit Aft blocks to a single block,
     and limit things further in the profitability checks of the unroll and
     jam pass.

     Because of the way we rearrange basic blocks, we also require that
     the Fore blocks on all unrolled iterations are safe to move before the
     SubLoop blocks of all iterations. So we require that the phi node looping
     operands of ForeHeader can be moved to at least the end of ForeEnd, so
     that we can arrange cloned Fore Blocks before the subloop and match up
     Phi's correctly.

     i.e. The old order of blocks used to be F1 S1_1 S1_2 A1 F2 S2_1 S2_2 A2.
     It needs to be safe to transform this to F1 F2 S1_1 S2_1 S1_2 S2_2 A1 A2.

     There are then a number of checks along the lines of no calls, no
     exceptions, inner loop IV is consistent, etc. Note that for loops
     requiring runtime unrolling, UnrollRuntimeLoopRemainder can also fail in
     UnrollAndJamLoop if the trip count cannot be easily calculated.
  */

  // Both loops must be in loop-simplify form and the outer loop must contain
  // exactly one subloop (the loop being jammed).
  if (!L->isLoopSimplifyForm() || L->getSubLoops().size() != 1)
    return false;
  Loop *SubLoop = L->getSubLoops()[0];
  if (!SubLoop->isLoopSimplifyForm())
    return false;

  BasicBlock *Header = L->getHeader();
  BasicBlock *Latch = L->getLoopLatch();
  BasicBlock *Exit = L->getExitingBlock();
  BasicBlock *SubLoopHeader = SubLoop->getHeader();
  BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
  BasicBlock *SubLoopExit = SubLoop->getExitingBlock();

  // Each loop must exit from its latch (single exiting block == latch).
  if (Latch != Exit)
    return false;
  if (SubLoopLatch != SubLoopExit)
    return false;

  // Blocks whose address is taken cannot safely be rearranged.
  if (Header->hasAddressTaken() || SubLoopHeader->hasAddressTaken())
    return false;

  // Split blocks into Fore/SubLoop/Aft based on dominators
  std::vector<BasicBlock *> SubLoopBlocks;
  std::vector<BasicBlock *> ForeBlocks;
  std::vector<BasicBlock *> AftBlocks;
  if (!partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks,
                                AftBlocks, &DT))
    return false;

  // Aft blocks may need to move instructions to fore blocks, which becomes more
  // difficult if there are multiple (potentially conditionally executed)
  // blocks. For now we just exclude loops with multiple aft blocks.
  if (AftBlocks.size() != 1)
    return false;

  // Check inner loop IV is consistent between all iterations: the inner
  // trip count must be computable as an integer and invariant in the outer
  // loop, so every jammed copy runs the same number of inner iterations.
  const SCEV *SubLoopBECountSC = SE.getExitCount(SubLoop, SubLoopLatch);
  if (isa<SCEVCouldNotCompute>(SubLoopBECountSC) ||
      !SubLoopBECountSC->getType()->isIntegerTy())
    return false;
  ScalarEvolution::LoopDisposition LD =
      SE.getLoopDisposition(SubLoopBECountSC, L);
  if (LD != ScalarEvolution::LoopInvariant)
    return false;

  // Check the loop safety info for exceptions.
  LoopSafetyInfo LSI;
  computeLoopSafetyInfo(&LSI, L);
  if (LSI.MayThrow)
    return false;

  // We've ruled out the easy stuff and now need to check that there are no
  // interdependencies which may prevent us from moving the:
  //   ForeBlocks before Subloop and AftBlocks.
  //   Subloop before AftBlocks.
  //   ForeBlock phi operands before the subloop

  // Make sure we can move all instructions we need to before the subloop:
  // walk the transitive operand chains of the header phis' looping values.
  SmallVector<Instruction *, 8> Worklist;
  SmallPtrSet<Instruction *, 8> Visited;
  for (auto &Phi : Header->phis()) {
    Value *V = Phi.getIncomingValueForBlock(Latch);
    if (Instruction *I = dyn_cast<Instruction>(V))
      Worklist.push_back(I);
  }
  while (!Worklist.empty()) {
    Instruction *I = Worklist.back();
    Worklist.pop_back();
    if (Visited.insert(I).second) {
      if (SubLoop->contains(I->getParent()))
        return false;
      if (containsBB(AftBlocks, I->getParent())) {
        // If we hit a phi node in afts we know we are done (probably LCSSA)
        if (isa<PHINode>(I))
          return false;
        // Instructions with side effects or memory accesses cannot be
        // hoisted out of the aft blocks.
        if (I->mayHaveSideEffects() || I->mayReadOrWriteMemory())
          return false;
        for (auto &U : I->operands())
          if (Instruction *II = dyn_cast<Instruction>(U))
            Worklist.push_back(II);
      }
    }
  }

  // Check for memory dependencies which prohibit the unrolling we are doing.
  // Because of the way we are unrolling Fore/Sub/Aft blocks, we need to check
  // there are no dependencies between Fore-Sub, Fore-Aft, Sub-Aft and Sub-Sub.
  if (!checkDependencies(L, ForeBlocks, SubLoopBlocks, AftBlocks, DI))
    return false;

  return true;
}

View File

@ -0,0 +1,470 @@
; RUN: opt -basicaa -loop-unroll-and-jam -allow-unroll-and-jam -unroll-and-jam-count=4 < %s -S | FileCheck %s
target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
; CHECK-LABEL: fore_aft_less
; CHECK: %j = phi
; CHECK: %j.1 = phi
; CHECK: %j.2 = phi
; CHECK: %j.3 = phi
define void @fore_aft_less(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
; Fore block stores A[i]; aft block stores A[i-1] (a backward distance).
; The CHECK lines above expect the inner loop to be jammed 4 times (%j.1..%j.3).
entry:
%cmp = icmp sgt i32 %N, 0
br i1 %cmp, label %for.outer, label %cleanup
for.outer:
%i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
%arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
store i32 1, i32* %arrayidx, align 4
br label %for.inner
for.inner:
%j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
%sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
%arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
%0 = load i32, i32* %arrayidx5, align 4
%mul = mul nsw i32 %0, %i
%add = add nsw i32 %mul, %sum
%add6 = add nuw nsw i32 %j, 1
%exitcond = icmp eq i32 %add6, %N
br i1 %exitcond, label %for.latch, label %for.inner
for.latch:
%add7 = add nuw nsw i32 %i, 1
; Aft store goes to A[i-1].
%add72 = add nuw nsw i32 %i, -1
%arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %add72
store i32 %add, i32* %arrayidx8, align 4
%exitcond29 = icmp eq i32 %add7, %N
br i1 %exitcond29, label %cleanup, label %for.outer
cleanup:
ret void
}
; CHECK-LABEL: fore_aft_eq
; CHECK: %j = phi
; CHECK: %j.1 = phi
; CHECK: %j.2 = phi
; CHECK: %j.3 = phi
define void @fore_aft_eq(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
; Fore block stores A[i]; aft block stores A[i] as well (zero distance).
; The CHECK lines above expect the inner loop to be jammed 4 times (%j.1..%j.3).
entry:
%cmp = icmp sgt i32 %N, 0
br i1 %cmp, label %for.outer, label %cleanup
for.outer:
%i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
%arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
store i32 1, i32* %arrayidx, align 4
br label %for.inner
for.inner:
%j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
%sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
%arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
%0 = load i32, i32* %arrayidx5, align 4
%mul = mul nsw i32 %0, %i
%add = add nsw i32 %mul, %sum
%add6 = add nuw nsw i32 %j, 1
%exitcond = icmp eq i32 %add6, %N
br i1 %exitcond, label %for.latch, label %for.inner
for.latch:
%add7 = add nuw nsw i32 %i, 1
; %add72 (i+0) is intentionally unused; the aft store uses %i directly.
%add72 = add nuw nsw i32 %i, 0
%arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %i
store i32 %add, i32* %arrayidx8, align 4
%exitcond29 = icmp eq i32 %add7, %N
br i1 %exitcond29, label %cleanup, label %for.outer
cleanup:
ret void
}
; CHECK-LABEL: fore_aft_more
; CHECK: %j = phi
; CHECK-NOT: %j.1 = phi
define void @fore_aft_more(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
; Fore block stores A[i]; aft block stores A[i+1] (a forward distance), so the
; CHECK-NOT above expects unroll-and-jam to be rejected (no %j.1 phi).
entry:
%cmp = icmp sgt i32 %N, 0
br i1 %cmp, label %for.outer, label %cleanup
for.outer:
%i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
%arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
store i32 1, i32* %arrayidx, align 4
br label %for.inner
for.inner:
%j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
%sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
%arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
%0 = load i32, i32* %arrayidx5, align 4
%mul = mul nsw i32 %0, %i
%add = add nsw i32 %mul, %sum
%add6 = add nuw nsw i32 %j, 1
%exitcond = icmp eq i32 %add6, %N
br i1 %exitcond, label %for.latch, label %for.inner
for.latch:
%add7 = add nuw nsw i32 %i, 1
; Aft store goes to A[i+1].
%add72 = add nuw nsw i32 %i, 1
%arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %add72
store i32 %add, i32* %arrayidx8, align 4
%exitcond29 = icmp eq i32 %add7, %N
br i1 %exitcond29, label %cleanup, label %for.outer
cleanup:
ret void
}
; CHECK-LABEL: fore_sub_less
; CHECK: %j = phi
; CHECK: %j.1 = phi
; CHECK: %j.2 = phi
; CHECK: %j.3 = phi
define void @fore_sub_less(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
; Fore block stores A[i]; the inner (sub) loop stores A[i-1].
; The CHECK lines above expect the inner loop to be jammed 4 times (%j.1..%j.3).
entry:
%cmp = icmp sgt i32 %N, 0
br i1 %cmp, label %for.outer, label %cleanup
for.outer:
%i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
%arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
store i32 1, i32* %arrayidx, align 4
br label %for.inner
for.inner:
%j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
%sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
%arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
%0 = load i32, i32* %arrayidx5, align 4
%mul = mul nsw i32 %0, %i
%add = add nsw i32 %mul, %sum
; Inner-loop store to A[i-1].
%add72 = add nuw nsw i32 %i, -1
%arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %add72
store i32 %add, i32* %arrayidx8, align 4
%add6 = add nuw nsw i32 %j, 1
%exitcond = icmp eq i32 %add6, %N
br i1 %exitcond, label %for.latch, label %for.inner
for.latch:
%add7 = add nuw nsw i32 %i, 1
%exitcond29 = icmp eq i32 %add7, %N
br i1 %exitcond29, label %cleanup, label %for.outer
cleanup:
ret void
}
; CHECK-LABEL: fore_sub_eq
; CHECK: %j = phi
; CHECK: %j.1 = phi
; CHECK: %j.2 = phi
; CHECK: %j.3 = phi
define void @fore_sub_eq(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
; Fore block stores A[i]; the inner (sub) loop stores A[i] as well.
; The CHECK lines above expect the inner loop to be jammed 4 times (%j.1..%j.3).
entry:
%cmp = icmp sgt i32 %N, 0
br i1 %cmp, label %for.outer, label %cleanup
for.outer:
%i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
%arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
store i32 1, i32* %arrayidx, align 4
br label %for.inner
for.inner:
%j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
%sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
%arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
%0 = load i32, i32* %arrayidx5, align 4
%mul = mul nsw i32 %0, %i
%add = add nsw i32 %mul, %sum
; Inner-loop store to A[i+0].
%add72 = add nuw nsw i32 %i, 0
%arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %add72
store i32 %add, i32* %arrayidx8, align 4
%add6 = add nuw nsw i32 %j, 1
%exitcond = icmp eq i32 %add6, %N
br i1 %exitcond, label %for.latch, label %for.inner
for.latch:
%add7 = add nuw nsw i32 %i, 1
%exitcond29 = icmp eq i32 %add7, %N
br i1 %exitcond29, label %cleanup, label %for.outer
cleanup:
ret void
}
; CHECK-LABEL: fore_sub_more
; CHECK: %j = phi
; CHECK-NOT: %j.1 = phi
define void @fore_sub_more(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
; Fore block stores A[i]; the inner (sub) loop stores A[i+1], so the CHECK-NOT
; above expects unroll-and-jam to be rejected (no %j.1 phi).
entry:
%cmp = icmp sgt i32 %N, 0
br i1 %cmp, label %for.outer, label %cleanup
for.outer:
%i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
%arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
store i32 1, i32* %arrayidx, align 4
br label %for.inner
for.inner:
%j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
%sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
%arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
%0 = load i32, i32* %arrayidx5, align 4
%mul = mul nsw i32 %0, %i
%add = add nsw i32 %mul, %sum
; Inner-loop store to A[i+1].
%add72 = add nuw nsw i32 %i, 1
%arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %add72
store i32 %add, i32* %arrayidx8, align 4
%add6 = add nuw nsw i32 %j, 1
%exitcond = icmp eq i32 %add6, %N
br i1 %exitcond, label %for.latch, label %for.inner
for.latch:
%add7 = add nuw nsw i32 %i, 1
%exitcond29 = icmp eq i32 %add7, %N
br i1 %exitcond29, label %cleanup, label %for.outer
cleanup:
ret void
}
; CHECK-LABEL: sub_aft_less
; CHECK: %j = phi
; CHECK: %j.1 = phi
; CHECK: %j.2 = phi
; CHECK: %j.3 = phi
define void @sub_aft_less(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
; Inner (sub) loop stores A[i]; aft block stores A[i-1].
; The CHECK lines above expect the inner loop to be jammed 4 times (%j.1..%j.3).
entry:
%cmp = icmp sgt i32 %N, 0
br i1 %cmp, label %for.outer, label %cleanup
for.outer:
%i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
br label %for.inner
for.inner:
%j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
%sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
%arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
%0 = load i32, i32* %arrayidx5, align 4
%mul = mul nsw i32 %0, %i
%add = add nsw i32 %mul, %sum
%add6 = add nuw nsw i32 %j, 1
; Inner-loop store to A[i].
%arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
store i32 1, i32* %arrayidx, align 4
%exitcond = icmp eq i32 %add6, %N
br i1 %exitcond, label %for.latch, label %for.inner
for.latch:
%add7 = add nuw nsw i32 %i, 1
; Aft store goes to A[i-1].
%add72 = add nuw nsw i32 %i, -1
%arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %add72
store i32 %add, i32* %arrayidx8, align 4
%exitcond29 = icmp eq i32 %add7, %N
br i1 %exitcond29, label %cleanup, label %for.outer
cleanup:
ret void
}
; CHECK-LABEL: sub_aft_eq
; CHECK: %j = phi
; CHECK: %j.1 = phi
; CHECK: %j.2 = phi
; CHECK: %j.3 = phi
define void @sub_aft_eq(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
; Inner (sub) loop stores A[i]; aft block stores A[i] as well.
; The CHECK lines above expect the inner loop to be jammed 4 times (%j.1..%j.3).
entry:
%cmp = icmp sgt i32 %N, 0
br i1 %cmp, label %for.outer, label %cleanup
for.outer:
%i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
br label %for.inner
for.inner:
%j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
%sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
%arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
%0 = load i32, i32* %arrayidx5, align 4
%mul = mul nsw i32 %0, %i
%add = add nsw i32 %mul, %sum
%add6 = add nuw nsw i32 %j, 1
; Inner-loop store to A[i].
%arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
store i32 1, i32* %arrayidx, align 4
%exitcond = icmp eq i32 %add6, %N
br i1 %exitcond, label %for.latch, label %for.inner
for.latch:
%add7 = add nuw nsw i32 %i, 1
; %add72 (i+0) is intentionally unused; the aft store uses %i directly.
%add72 = add nuw nsw i32 %i, 0
%arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %i
store i32 %add, i32* %arrayidx8, align 4
%exitcond29 = icmp eq i32 %add7, %N
br i1 %exitcond29, label %cleanup, label %for.outer
cleanup:
ret void
}
; CHECK-LABEL: sub_aft_more
; CHECK: %j = phi
; CHECK-NOT: %j.1 = phi
define void @sub_aft_more(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
; Inner (sub) loop stores A[i]; aft block stores A[i+1], so the CHECK-NOT above
; expects unroll-and-jam to be rejected (no %j.1 phi).
entry:
%cmp = icmp sgt i32 %N, 0
br i1 %cmp, label %for.outer, label %cleanup
for.outer:
%i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
br label %for.inner
for.inner:
%j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
%sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
%arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
%0 = load i32, i32* %arrayidx5, align 4
%mul = mul nsw i32 %0, %i
%add = add nsw i32 %mul, %sum
%add6 = add nuw nsw i32 %j, 1
; Inner-loop store to A[i].
%arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
store i32 1, i32* %arrayidx, align 4
%exitcond = icmp eq i32 %add6, %N
br i1 %exitcond, label %for.latch, label %for.inner
for.latch:
%add7 = add nuw nsw i32 %i, 1
; Aft store goes to A[i+1].
%add72 = add nuw nsw i32 %i, 1
%arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %add72
store i32 %add, i32* %arrayidx8, align 4
%exitcond29 = icmp eq i32 %add7, %N
br i1 %exitcond29, label %cleanup, label %for.outer
cleanup:
ret void
}
; CHECK-LABEL: sub_sub_less
; CHECK: %j = phi
; CHECK-NOT: %j.1 = phi
define void @sub_sub_less(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
; Inner (sub) loop stores both A[i] and A[i-1]; the CHECK-NOT above expects
; unroll-and-jam to be rejected (no %j.1 phi).
entry:
%cmp = icmp sgt i32 %N, 0
br i1 %cmp, label %for.outer, label %cleanup
for.outer:
%i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
br label %for.inner
for.inner:
%j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
%sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
%arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
%0 = load i32, i32* %arrayidx5, align 4
%mul = mul nsw i32 %0, %i
%add = add nsw i32 %mul, %sum
%add6 = add nuw nsw i32 %j, 1
; First inner-loop store: A[i].
%arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
store i32 1, i32* %arrayidx, align 4
; Second inner-loop store: A[i-1].
%add72 = add nuw nsw i32 %i, -1
%arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %add72
store i32 %add, i32* %arrayidx8, align 4
%exitcond = icmp eq i32 %add6, %N
br i1 %exitcond, label %for.latch, label %for.inner
for.latch:
%add7 = add nuw nsw i32 %i, 1
%exitcond29 = icmp eq i32 %add7, %N
br i1 %exitcond29, label %cleanup, label %for.outer
cleanup:
ret void
}
; CHECK-LABEL: sub_sub_eq
; CHECK: %j = phi
; CHECK: %j.1 = phi
define void @sub_sub_eq(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
; Inner (sub) loop stores A[i] twice (zero distance); the CHECK lines above
; expect the inner loop to be jammed (%j.1 present).
entry:
%cmp = icmp sgt i32 %N, 0
br i1 %cmp, label %for.outer, label %cleanup
for.outer:
%i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
br label %for.inner
for.inner:
%j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
%sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
%arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
%0 = load i32, i32* %arrayidx5, align 4
%mul = mul nsw i32 %0, %i
%add = add nsw i32 %mul, %sum
%add6 = add nuw nsw i32 %j, 1
; First inner-loop store: A[i].
%arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
store i32 1, i32* %arrayidx, align 4
; Second inner-loop store: A[i+0].
%add72 = add nuw nsw i32 %i, 0
%arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %add72
store i32 %add, i32* %arrayidx8, align 4
%exitcond = icmp eq i32 %add6, %N
br i1 %exitcond, label %for.latch, label %for.inner
for.latch:
%add7 = add nuw nsw i32 %i, 1
%exitcond29 = icmp eq i32 %add7, %N
br i1 %exitcond29, label %cleanup, label %for.outer
cleanup:
ret void
}
; CHECK-LABEL: sub_sub_more
; CHECK: %j = phi
; CHECK-NOT: %j.1 = phi
define void @sub_sub_more(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
; Inner (sub) loop stores A[i] and A[i+1]; the CHECK-NOT above expects
; unroll-and-jam to be rejected (no %j.1 phi).
entry:
%cmp = icmp sgt i32 %N, 0
br i1 %cmp, label %for.outer, label %cleanup
for.outer:
%i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
br label %for.inner
for.inner:
%j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
%sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
%arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
%0 = load i32, i32* %arrayidx5, align 4
%mul = mul nsw i32 %0, %i
%add = add nsw i32 %mul, %sum
%add6 = add nuw nsw i32 %j, 1
; First inner-loop store: A[i].
%arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
store i32 1, i32* %arrayidx, align 4
; Second inner-loop store: A[i+1].
%add72 = add nuw nsw i32 %i, 1
%arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %add72
store i32 %add, i32* %arrayidx8, align 4
%exitcond = icmp eq i32 %add6, %N
br i1 %exitcond, label %for.latch, label %for.inner
for.latch:
%add7 = add nuw nsw i32 %i, 1
%exitcond29 = icmp eq i32 %add7, %N
br i1 %exitcond29, label %cleanup, label %for.outer
cleanup:
ret void
}

View File

@ -0,0 +1,741 @@
; RUN: opt -loop-unroll-and-jam -allow-unroll-and-jam -unroll-and-jam-count=4 -pass-remarks=loop-unroll-and-jam < %s -S 2>&1 | FileCheck %s
target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
;; Common check for all tests. None should be unroll and jammed
; CHECK-NOT: remark: {{.*}} unroll and jammed
; CHECK-LABEL: disabled1
; Tests for(i) { sum = A[i]; for(j) sum += B[j]; A[i+1] = sum; }
; A[i] to A[i+1] dependency should block unrollandjam
define void @disabled1(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
; CHECK: %i.029 = phi i32 [ %add10, %for.latch ], [ 0, %for.preheader ]
; CHECK: %j.026 = phi i32 [ 0, %for.inner ], [ %inc, %for.inner ]
; Fore block loads A[%i.029]; aft block stores A[%b.028], where %b.028 runs one
; ahead of %i.029 (starts at 1) - the cross-iteration A dependence disables UnJ.
entry:
%cmp = icmp ne i32 %J, 0
%cmp127 = icmp ne i32 %I, 0
%or.cond = and i1 %cmp127, %cmp
br i1 %or.cond, label %for.preheader, label %return
for.preheader:
br label %for.outer
for.outer:
%i.029 = phi i32 [ %add10, %for.latch ], [ 0, %for.preheader ]
%b.028 = phi i32 [ %inc8, %for.latch ], [ 1, %for.preheader ]
%arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.029
%0 = load i32, i32* %arrayidx, align 4
br label %for.inner
for.inner:
%j.026 = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
%sum1.025 = phi i32 [ %0, %for.outer ], [ %add, %for.inner ]
%arrayidx6 = getelementptr inbounds i32, i32* %B, i32 %j.026
%1 = load i32, i32* %arrayidx6, align 4
%add = add i32 %1, %sum1.025
%inc = add nuw i32 %j.026, 1
%exitcond = icmp eq i32 %inc, %J
br i1 %exitcond, label %for.latch, label %for.inner
for.latch:
%arrayidx7 = getelementptr inbounds i32, i32* %A, i32 %b.028
store i32 %add, i32* %arrayidx7, align 4
%inc8 = add nuw nsw i32 %b.028, 1
%add10 = add nuw nsw i32 %i.029, 1
%exitcond30 = icmp eq i32 %add10, %I
br i1 %exitcond30, label %return, label %for.outer
return:
ret void
}
; CHECK-LABEL: disabled2
; Tests an incompatible block layout (for.outer jumps past for.inner)
; FIXME: Make this work
define void @disabled2(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
; CHECK: %i.032 = phi i32 [ %add13, %for.latch ], [ 0, %for.preheader ]
; CHECK: %j.030 = phi i32 [ %inc, %for.inner ], [ 0, %for.inner.preheader ]
; for.outer can branch straight to for.latch (on %tobool), skipping for.inner,
; so the blocks do not split cleanly into fore/sub/aft - UnJ is rejected.
entry:
%cmp = icmp ne i32 %J, 0
%cmp131 = icmp ne i32 %I, 0
%or.cond = and i1 %cmp131, %cmp
br i1 %or.cond, label %for.preheader, label %for.end14
for.preheader:
br label %for.outer
for.outer:
%i.032 = phi i32 [ %add13, %for.latch ], [ 0, %for.preheader ]
%arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.032
%0 = load i32, i32* %arrayidx, align 4
%tobool = icmp eq i32 %0, 0
br i1 %tobool, label %for.latch, label %for.inner
for.inner:
%j.030 = phi i32 [ %inc, %for.inner ], [ 0, %for.outer ]
%sum1.029 = phi i32 [ %sum1.1, %for.inner ], [ 0, %for.outer ]
%arrayidx6 = getelementptr inbounds i32, i32* %B, i32 %j.030
%1 = load i32, i32* %arrayidx6, align 4
%tobool7 = icmp eq i32 %1, 0
%sub = add i32 %sum1.029, 10
%add = sub i32 %sub, %1
%sum1.1 = select i1 %tobool7, i32 %sum1.029, i32 %add
%inc = add nuw i32 %j.030, 1
%exitcond = icmp eq i32 %inc, %J
br i1 %exitcond, label %for.latch, label %for.inner
for.latch:
%sum1.1.lcssa = phi i32 [ 0, %for.outer ], [ %sum1.1, %for.inner ]
%arrayidx11 = getelementptr inbounds i32, i32* %A, i32 %i.032
store i32 %sum1.1.lcssa, i32* %arrayidx11, align 4
%add13 = add nuw i32 %i.032, 1
%exitcond33 = icmp eq i32 %add13, %I
br i1 %exitcond33, label %for.end14, label %for.outer
for.end14:
ret void
}
; CHECK-LABEL: disabled3
; Tests loop carry dependencies in an array S
define void @disabled3(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
; CHECK: %i.029 = phi i32 [ 0, %for.preheader ], [ %add12, %for.latch ]
; CHECK: %j.027 = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
; The inner loop stores into the stack array %S (index %j.027 urem 3) and the
; latch reads %S back - a loop-carried memory dependence through %S.
entry:
%S = alloca [4 x i32], align 4
%cmp = icmp eq i32 %J, 0
br i1 %cmp, label %return, label %if.end
if.end:
%0 = bitcast [4 x i32]* %S to i8*
%cmp128 = icmp eq i32 %I, 0
br i1 %cmp128, label %for.cond.cleanup, label %for.preheader
for.preheader:
%arrayidx9 = getelementptr inbounds [4 x i32], [4 x i32]* %S, i32 0, i32 0
br label %for.outer
for.cond.cleanup:
br label %return
for.outer:
%i.029 = phi i32 [ 0, %for.preheader ], [ %add12, %for.latch ]
br label %for.inner
for.inner:
%j.027 = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
%arrayidx = getelementptr inbounds i32, i32* %B, i32 %j.027
%l2 = load i32, i32* %arrayidx, align 4
%add = add i32 %j.027, %i.029
%rem = urem i32 %add, %J
%arrayidx6 = getelementptr inbounds i32, i32* %B, i32 %rem
%l3 = load i32, i32* %arrayidx6, align 4
%mul = mul i32 %l3, %l2
%rem7 = urem i32 %j.027, 3
%arrayidx8 = getelementptr inbounds [4 x i32], [4 x i32]* %S, i32 0, i32 %rem7
store i32 %mul, i32* %arrayidx8, align 4
%inc = add nuw i32 %j.027, 1
%exitcond = icmp eq i32 %inc, %J
br i1 %exitcond, label %for.latch, label %for.inner
for.latch:
%l1 = load i32, i32* %arrayidx9, align 4
%arrayidx10 = getelementptr inbounds i32, i32* %A, i32 %i.029
store i32 %l1, i32* %arrayidx10, align 4
%add12 = add nuw i32 %i.029, 1
%exitcond31 = icmp eq i32 %add12, %I
br i1 %exitcond31, label %for.cond.cleanup, label %for.outer
return:
ret void
}
; CHECK-LABEL: disabled4
; Inner loop induction variable is not consistent
; ie for(i = 0..n) for (j = 0..i) sum+=B[j]
define void @disabled4(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
; CHECK: %indvars.iv = phi i32 [ %indvars.iv.next, %for.latch ], [ 1, %for.preheader ]
; CHECK: %j.021 = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
; The inner trip count is %indvars.iv, which changes every outer iteration, so
; the inner loops cannot be jammed.
entry:
%cmp = icmp ne i32 %J, 0
%cmp122 = icmp ugt i32 %I, 1
%or.cond = and i1 %cmp122, %cmp
br i1 %or.cond, label %for.preheader, label %for.end9
for.preheader:
br label %for.outer
for.outer:
%indvars.iv = phi i32 [ %indvars.iv.next, %for.latch ], [ 1, %for.preheader ]
br label %for.inner
for.inner:
%j.021 = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
%sum1.020 = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
%arrayidx = getelementptr inbounds i32, i32* %B, i32 %j.021
%0 = load i32, i32* %arrayidx, align 4
%add = add i32 %0, %sum1.020
%inc = add nuw i32 %j.021, 1
%exitcond = icmp eq i32 %inc, %indvars.iv
br i1 %exitcond, label %for.latch, label %for.inner
for.latch:
%arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %indvars.iv
store i32 %add, i32* %arrayidx6, align 4
%indvars.iv.next = add nuw i32 %indvars.iv, 1
%exitcond24 = icmp eq i32 %indvars.iv.next, %I
br i1 %exitcond24, label %for.end9, label %for.outer
for.end9:
ret void
}
; CHECK-LABEL: disabled5
; Test odd uses of phi nodes where the outer IV cannot be moved into Fore as it hits a PHI
@f = hidden global i32 0, align 4
define i32 @disabled5() #0 {
; CHECK: %0 = phi i32 [ %f.promoted10, %entry ], [ 2, %for.latch ]
; CHECK: %1 = phi i32 [ %0, %for.outer ], [ 2, %for.inner ]
; The outer-header phi %0 feeds the inner-header phi %1 directly, so the value
; cannot be hoisted into the fore blocks - UnJ is rejected.
entry:
%f.promoted10 = load i32, i32* @f, align 4
br label %for.outer
for.outer:
%0 = phi i32 [ %f.promoted10, %entry ], [ 2, %for.latch ]
%d.018 = phi i16 [ 0, %entry ], [ %odd.lcssa, %for.latch ]
%inc5.sink9 = phi i32 [ 2, %entry ], [ %inc5, %for.latch ]
br label %for.inner
for.inner:
%1 = phi i32 [ %0, %for.outer ], [ 2, %for.inner ]
%inc.sink8 = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
%inc = add nuw nsw i32 %inc.sink8, 1
%exitcond = icmp ne i32 %inc, 7
br i1 %exitcond, label %for.inner, label %for.latch
for.latch:
%.lcssa = phi i32 [ %1, %for.inner ]
%odd.lcssa = phi i16 [ 1, %for.inner ]
%inc5 = add nuw nsw i32 %inc5.sink9, 1
%exitcond11 = icmp ne i32 %inc5, 7
br i1 %exitcond11, label %for.outer, label %for.end
for.end:
%.lcssa.lcssa = phi i32 [ %.lcssa, %for.latch ]
%inc.lcssa.lcssa = phi i32 [ 7, %for.latch ]
ret i32 0
}
; CHECK-LABEL: disabled6
; There is a dependency in here, between @d and %0 (=@f)
@d6 = hidden global i16 5, align 2
@f6 = hidden global i16* @d6, align 4
define i32 @disabled6() #0 {
; CHECK: %inc8.sink14.i = phi i16 [ 1, %entry ], [ %inc8.i, %for.cond.cleanup.i ]
; CHECK: %c.013.i = phi i32 [ 0, %for.body.i ], [ %inc.i, %for.body6.i ]
; %0 is loaded from @f6 (which holds @d6), so the fore-block load of %0 and the
; aft-block store to @d6 form a cross-iteration dependence.
entry:
store i16 1, i16* @d6, align 2
%0 = load i16*, i16** @f6, align 4
br label %for.body.i
for.body.i:
%inc8.sink14.i = phi i16 [ 1, %entry ], [ %inc8.i, %for.cond.cleanup.i ]
%1 = load i16, i16* %0, align 2
br label %for.body6.i
for.cond.cleanup.i:
%inc8.i = add nuw nsw i16 %inc8.sink14.i, 1
store i16 %inc8.i, i16* @d6, align 2
%cmp.i = icmp ult i16 %inc8.i, 6
br i1 %cmp.i, label %for.body.i, label %test.exit
for.body6.i:
%c.013.i = phi i32 [ 0, %for.body.i ], [ %inc.i, %for.body6.i ]
%inc.i = add nuw nsw i32 %c.013.i, 1
%exitcond.i = icmp eq i32 %inc.i, 7
br i1 %exitcond.i, label %for.cond.cleanup.i, label %for.body6.i
test.exit:
%conv2.i = sext i16 %1 to i32
ret i32 0
}
; CHECK-LABEL: disabled7
; Has negative output dependency
define void @disabled7(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
; CHECK: %i.028 = phi i32 [ %add11, %for.cond3.for.cond.cleanup5_crit_edge ], [ 0, %for.body.preheader ]
; CHECK: %j.026 = phi i32 [ 0, %for.body ], [ %add9, %for.body6 ]
; Fore block stores A[i] and A[i-1]; the latch stores A[i] again - the A[i-1]
; store gives a negative-distance output dependence on A.
entry:
%cmp = icmp ne i32 %J, 0
%cmp127 = icmp ne i32 %I, 0
%or.cond = and i1 %cmp127, %cmp
br i1 %or.cond, label %for.body.preheader, label %for.end12
for.body.preheader:
br label %for.body
for.body:
%i.028 = phi i32 [ %add11, %for.cond3.for.cond.cleanup5_crit_edge ], [ 0, %for.body.preheader ]
%arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.028
store i32 0, i32* %arrayidx, align 4
%sub = add i32 %i.028, -1
%arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %sub
store i32 2, i32* %arrayidx2, align 4
br label %for.body6
for.cond3.for.cond.cleanup5_crit_edge:
store i32 %add, i32* %arrayidx, align 4
%add11 = add nuw i32 %i.028, 1
%exitcond29 = icmp eq i32 %add11, %I
br i1 %exitcond29, label %for.end12, label %for.body
for.body6:
%0 = phi i32 [ 0, %for.body ], [ %add, %for.body6 ]
%j.026 = phi i32 [ 0, %for.body ], [ %add9, %for.body6 ]
%arrayidx7 = getelementptr inbounds i32, i32* %B, i32 %j.026
%1 = load i32, i32* %arrayidx7, align 4
%add = add i32 %1, %0
%add9 = add nuw i32 %j.026, 1
%exitcond = icmp eq i32 %add9, %J
br i1 %exitcond, label %for.cond3.for.cond.cleanup5_crit_edge, label %for.body6
for.end12:
ret void
}
; CHECK-LABEL: disabled8
; Same as above with an extra outer loop nest
define void @disabled8(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
; CHECK: %i.036 = phi i32 [ %add15, %for.latch ], [ 0, %for.body ]
; CHECK: %j.034 = phi i32 [ 0, %for.outer ], [ %add13, %for.inner ]
; Same negative output dependence on A as disabled7 (stores to A[i] and A[i-1]),
; here wrapped in an additional outermost %x.037 loop.
entry:
%cmp = icmp eq i32 %J, 0
%cmp335 = icmp eq i32 %I, 0
%or.cond = or i1 %cmp, %cmp335
br i1 %or.cond, label %for.end18, label %for.body.preheader
for.body.preheader:
br label %for.body
for.body:
%x.037 = phi i32 [ %inc, %for.cond.cleanup4 ], [ 0, %for.body.preheader ]
br label %for.outer
for.cond.cleanup4:
%inc = add nuw nsw i32 %x.037, 1
%exitcond40 = icmp eq i32 %inc, 5
br i1 %exitcond40, label %for.end18, label %for.body
for.outer:
%i.036 = phi i32 [ %add15, %for.latch ], [ 0, %for.body ]
%arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.036
store i32 0, i32* %arrayidx, align 4
%sub = add i32 %i.036, -1
%arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %sub
store i32 2, i32* %arrayidx6, align 4
br label %for.inner
for.latch:
store i32 %add, i32* %arrayidx, align 4
%add15 = add nuw i32 %i.036, 1
%exitcond38 = icmp eq i32 %add15, %I
br i1 %exitcond38, label %for.cond.cleanup4, label %for.outer
for.inner:
%0 = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
%j.034 = phi i32 [ 0, %for.outer ], [ %add13, %for.inner ]
%arrayidx11 = getelementptr inbounds i32, i32* %B, i32 %j.034
%1 = load i32, i32* %arrayidx11, align 4
%add = add i32 %1, %0
%add13 = add nuw i32 %j.034, 1
%exitcond = icmp eq i32 %add13, %J
br i1 %exitcond, label %for.latch, label %for.inner
for.end18:
ret void
}
; CHECK-LABEL: disabled9
; Can't prove that A and B do not alias (the pointer arguments lack noalias)
define void @disabled9(i32 %I, i32 %J, i32* nocapture %A, i32* nocapture readonly %B) #0 {
; CHECK: %i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ]
; CHECK: %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
; Unlike the other tests, %A and %B are NOT marked noalias here, so the pass
; cannot prove the A store and B loads are independent.
entry:
%cmp = icmp ne i32 %J, 0
%cmp122 = icmp ne i32 %I, 0
%or.cond = and i1 %cmp, %cmp122
br i1 %or.cond, label %for.outer.preheader, label %for.end
for.outer.preheader:
br label %for.outer
for.outer:
%i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ]
br label %for.inner
for.inner:
%j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
%sum1 = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
%arrayidx = getelementptr inbounds i32, i32* %B, i32 %j
%0 = load i32, i32* %arrayidx, align 4
%add = add i32 %0, %sum1
%inc = add nuw i32 %j, 1
%exitcond = icmp eq i32 %inc, %J
br i1 %exitcond, label %for.latch, label %for.inner
for.latch:
%add.lcssa = phi i32 [ %add, %for.inner ]
%arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %i
store i32 %add.lcssa, i32* %arrayidx6, align 4
%add8 = add nuw i32 %i, 1
%exitcond25 = icmp eq i32 %add8, %I
br i1 %exitcond25, label %for.end.loopexit, label %for.outer
for.end.loopexit:
br label %for.end
for.end:
ret void
}
; CHECK-LABEL: disable10
; Simple call
declare void @f10(i32, i32) #0
define void @disable10(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
; CHECK: %i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ]
; CHECK: %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
; The call to @f10 inside the inner loop (an opaque external function) blocks
; unroll and jam.
entry:
%cmp = icmp ne i32 %J, 0
%cmp122 = icmp ne i32 %I, 0
%or.cond = and i1 %cmp, %cmp122
br i1 %or.cond, label %for.outer.preheader, label %for.end
for.outer.preheader:
br label %for.outer
for.outer:
%i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ]
br label %for.inner
for.inner:
%j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
%sum1 = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
%arrayidx = getelementptr inbounds i32, i32* %B, i32 %j
%0 = load i32, i32* %arrayidx, align 4
%add = add i32 %0, %sum1
%inc = add nuw i32 %j, 1
%exitcond = icmp eq i32 %inc, %J
tail call void @f10(i32 %i, i32 %j) nounwind
br i1 %exitcond, label %for.latch, label %for.inner
for.latch:
%add.lcssa = phi i32 [ %add, %for.inner ]
%arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %i
store i32 %add.lcssa, i32* %arrayidx6, align 4
%add8 = add nuw i32 %i, 1
%exitcond25 = icmp eq i32 %add8, %I
br i1 %exitcond25, label %for.end.loopexit, label %for.outer
for.end.loopexit:
br label %for.end
for.end:
ret void
}
; CHECK-LABEL: disable11
; volatile
define void @disable11(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
; CHECK: %i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ]
; CHECK: %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
; The volatile load of B[%j] in the inner loop blocks unroll and jam.
entry:
%cmp = icmp ne i32 %J, 0
%cmp122 = icmp ne i32 %I, 0
%or.cond = and i1 %cmp, %cmp122
br i1 %or.cond, label %for.outer.preheader, label %for.end
for.outer.preheader:
br label %for.outer
for.outer:
%i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ]
br label %for.inner
for.inner:
%j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
%sum1 = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
%arrayidx = getelementptr inbounds i32, i32* %B, i32 %j
%0 = load volatile i32, i32* %arrayidx, align 4
%add = add i32 %0, %sum1
%inc = add nuw i32 %j, 1
%exitcond = icmp eq i32 %inc, %J
br i1 %exitcond, label %for.latch, label %for.inner
for.latch:
%add.lcssa = phi i32 [ %add, %for.inner ]
%arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %i
store i32 %add.lcssa, i32* %arrayidx6, align 4
%add8 = add nuw i32 %i, 1
%exitcond25 = icmp eq i32 %add8, %I
br i1 %exitcond25, label %for.end.loopexit, label %for.outer
for.end.loopexit:
br label %for.end
for.end:
ret void
}
; CHECK-LABEL: disable12
; Multiple aft blocks
define void @disable12(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
; CHECK: %i = phi i32 [ %add8, %for.latch3 ], [ 0, %for.outer.preheader ]
; CHECK: %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
; The aft section has multiple blocks (for.latch branches through for.latch2 to
; for.latch3), which the pass does not handle.
entry:
%cmp = icmp ne i32 %J, 0
%cmp122 = icmp ne i32 %I, 0
%or.cond = and i1 %cmp, %cmp122
br i1 %or.cond, label %for.outer.preheader, label %for.end
for.outer.preheader:
br label %for.outer
for.outer:
%i = phi i32 [ %add8, %for.latch3 ], [ 0, %for.outer.preheader ]
br label %for.inner
for.inner:
%j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
%sum1 = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
%arrayidx = getelementptr inbounds i32, i32* %B, i32 %j
%0 = load i32, i32* %arrayidx, align 4
%add = add i32 %0, %sum1
%inc = add nuw i32 %j, 1
%exitcond = icmp eq i32 %inc, %J
br i1 %exitcond, label %for.latch, label %for.inner
for.latch:
%add.lcssa = phi i32 [ %add, %for.inner ]
%arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %i
store i32 %add.lcssa, i32* %arrayidx6, align 4
%cmpl = icmp eq i32 %add.lcssa, 10
br i1 %cmpl, label %for.latch2, label %for.latch3
for.latch2:
br label %for.latch3
for.latch3:
%add8 = add nuw i32 %i, 1
%exitcond25 = icmp eq i32 %add8, %I
br i1 %exitcond25, label %for.end.loopexit, label %for.outer
for.end.loopexit:
br label %for.end
for.end:
ret void
}
; CHECK-LABEL: disable13
; Two subloops
define void @disable13(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
; CHECK: %i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ]
; CHECK: %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
; CHECK: %j2 = phi i32 [ %inc2, %for.inner2 ], [ 0, %for.inner2.preheader ]
; The outer loop contains two sub-loops (for.inner and for.inner2); the pass
; only handles a single inner loop.
entry:
%cmp = icmp ne i32 %J, 0
%cmp122 = icmp ne i32 %I, 0
%or.cond = and i1 %cmp, %cmp122
br i1 %or.cond, label %for.outer.preheader, label %for.end
for.outer.preheader:
br label %for.outer
for.outer:
%i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ]
br label %for.inner
for.inner:
%j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
%sum1 = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
%arrayidx = getelementptr inbounds i32, i32* %B, i32 %j
%0 = load i32, i32* %arrayidx, align 4
%add = add i32 %0, %sum1
%inc = add nuw i32 %j, 1
%exitcond = icmp eq i32 %inc, %J
br i1 %exitcond, label %for.inner2, label %for.inner
for.inner2:
%j2 = phi i32 [ 0, %for.inner ], [ %inc2, %for.inner2 ]
%sum12 = phi i32 [ 0, %for.inner ], [ %add2, %for.inner2 ]
%arrayidx2 = getelementptr inbounds i32, i32* %B, i32 %j2
%l0 = load i32, i32* %arrayidx2, align 4
%add2 = add i32 %l0, %sum12
%inc2 = add nuw i32 %j2, 1
%exitcond2 = icmp eq i32 %inc2, %J
br i1 %exitcond2, label %for.latch, label %for.inner2
for.latch:
%add.lcssa = phi i32 [ %add, %for.inner2 ]
%arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %i
store i32 %add.lcssa, i32* %arrayidx6, align 4
%add8 = add nuw i32 %i, 1
%exitcond25 = icmp eq i32 %add8, %I
br i1 %exitcond25, label %for.end.loopexit, label %for.outer
for.end.loopexit:
br label %for.end
for.end:
ret void
}
; CHECK-LABEL: disable14
; Multiple exits blocks
; Negative unroll-and-jam test: the outer loop has multiple exiting blocks --
; both for.outer (via %exitcond23) and for.latch (via %exitcond25) can branch
; to for.end.loopexit. The CHECK lines assert the original phis remain.
define void @disable14(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
; CHECK: %i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ]
; CHECK: %j = phi i32 [ %inc, %for.inner ], [ 0, %for.inner.preheader ]
entry:
%cmp = icmp ne i32 %J, 0
%cmp122 = icmp ne i32 %I, 0
%or.cond = and i1 %cmp, %cmp122
br i1 %or.cond, label %for.outer.preheader, label %for.end
for.outer.preheader:
br label %for.outer
for.outer:
%i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ]
%add8 = add nuw i32 %i, 1
; First exit test, in the outer-loop header.
%exitcond23 = icmp eq i32 %add8, %I
br i1 %exitcond23, label %for.end.loopexit, label %for.inner
for.inner:
%j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
%sum1 = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
%arrayidx = getelementptr inbounds i32, i32* %B, i32 %j
%0 = load i32, i32* %arrayidx, align 4
%add = add i32 %0, %sum1
%inc = add nuw i32 %j, 1
%exitcond = icmp eq i32 %inc, %J
br i1 %exitcond, label %for.latch, label %for.inner
for.latch:
%add.lcssa = phi i32 [ %add, %for.inner ]
%arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %i
store i32 %add.lcssa, i32* %arrayidx6, align 4
; Second exit test on the same value, deliberately recomputed in the latch
; so the loop has two exiting blocks.
%exitcond25 = icmp eq i32 %add8, %I
br i1 %exitcond25, label %for.end.loopexit, label %for.outer
for.end.loopexit:
br label %for.end
for.end:
ret void
}
; CHECK-LABEL: disable15
; Latch != exit
; Negative unroll-and-jam test: the outer loop's latch (for.latch) is not its
; exiting block -- the exit is taken from the header (for.outer) while the
; latch branches back unconditionally. CHECK lines assert no transform.
define void @disable15(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
; CHECK: %i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ]
; CHECK: %j = phi i32 [ %inc, %for.inner ], [ 0, %for.inner.preheader ]
entry:
%cmp = icmp ne i32 %J, 0
%cmp122 = icmp ne i32 %I, 0
%or.cond = and i1 %cmp, %cmp122
br i1 %or.cond, label %for.outer.preheader, label %for.end
for.outer.preheader:
br label %for.outer
for.outer:
%i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ]
%add8 = add nuw i32 %i, 1
; The only exit test lives here in the header, not in the latch below.
%exitcond25 = icmp eq i32 %add8, %I
br i1 %exitcond25, label %for.end.loopexit, label %for.inner
for.inner:
%j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
%sum1 = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
%arrayidx = getelementptr inbounds i32, i32* %B, i32 %j
%0 = load i32, i32* %arrayidx, align 4
%add = add i32 %0, %sum1
%inc = add nuw i32 %j, 1
%exitcond = icmp eq i32 %inc, %J
br i1 %exitcond, label %for.latch, label %for.inner
for.latch:
%add.lcssa = phi i32 [ %add, %for.inner ]
%arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %i
store i32 %add.lcssa, i32* %arrayidx6, align 4
; Unconditional back-edge: latch != exiting block.
br label %for.outer
for.end.loopexit:
br label %for.end
for.end:
ret void
}
; CHECK-LABEL: disable16
; Cannot move other before inner loop
; Negative unroll-and-jam test: the latch ("aft" block) reloads the A[i] value
; it just stored and accumulates it into %otherphi, so the after-inner-loop
; code cannot legally be moved before the next iteration's inner loop.
define void @disable16(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
; CHECK: %i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ]
; CHECK: %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
entry:
%cmp = icmp ne i32 %J, 0
%cmp122 = icmp ne i32 %I, 0
%or.cond = and i1 %cmp, %cmp122
br i1 %or.cond, label %for.outer.preheader, label %for.end
for.outer.preheader:
br label %for.outer
for.outer:
%i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ]
; Carries the cross-iteration accumulator produced at the bottom of the latch.
%otherphi = phi i32 [ %other, %for.latch ], [ 0, %for.outer.preheader ]
br label %for.inner
for.inner:
%j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
%sum1 = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
%arrayidx = getelementptr inbounds i32, i32* %B, i32 %j
%0 = load i32, i32* %arrayidx, align 4
%add = add i32 %0, %sum1
%inc = add nuw i32 %j, 1
%exitcond = icmp eq i32 %inc, %J
br i1 %exitcond, label %for.latch, label %for.inner
for.latch:
%add.lcssa = phi i32 [ %add, %for.inner ]
%arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %i
store i32 %add.lcssa, i32* %arrayidx6, align 4
%add8 = add nuw i32 %i, 1
%exitcond25 = icmp eq i32 %add8, %I
; NOTE: %loadarr is unused -- the load below reuses %arrayidx6, reading back
; the value stored two instructions earlier (store/load dependency).
%loadarr = getelementptr inbounds i32, i32* %A, i32 %i
%load = load i32, i32* %arrayidx6, align 4
%other = add i32 %otherphi, %load
br i1 %exitcond25, label %for.end.loopexit, label %for.outer
for.end.loopexit:
br label %for.end
for.end:
ret void
}

; ---- View File ----
; (diff-view residue: start of a new 319-line test file added by this commit)
; @ -0,0 +1,319 @@
; RUN: opt -loop-unroll-and-jam -allow-unroll-and-jam -unroll-runtime < %s -S | FileCheck %s
; RUN: opt -loop-unroll-and-jam -allow-unroll-and-jam -unroll-runtime -unroll-and-jam-threshold=15 < %s -S | FileCheck %s --check-prefix=CHECK-LOWTHRES
target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
; CHECK-LABEL: test1
; Basic check that these loops are by default UnJ'd
; Positive test: a plain sum-reduction nest with no !llvm.loop pragma.
; With the default threshold the outer phi should come from the unrolled
; preheader (".new" block); with -unroll-and-jam-threshold=15 (CHECK-LOWTHRES)
; the loop is left untransformed.
define void @test1(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) {
; CHECK: %i.us = phi i32 [ %add8.us.{{[1-9]*}}, %for.latch ], [ 0, %for.outer.preheader.new ]
; CHECK-LOWTHRES: %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
entry:
%cmp = icmp ne i32 %J, 0
%cmp122 = icmp ne i32 %I, 0
%or.cond = and i1 %cmp, %cmp122
br i1 %or.cond, label %for.outer.preheader, label %for.end
for.outer.preheader:
br label %for.outer
for.outer:
%i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
br label %for.inner
for.inner:
%j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ]
%sum1.us = phi i32 [ 0, %for.outer ], [ %add.us, %for.inner ]
%arrayidx.us = getelementptr inbounds i32, i32* %B, i32 %j.us
%0 = load i32, i32* %arrayidx.us, align 4
%add.us = add i32 %0, %sum1.us
%inc.us = add nuw i32 %j.us, 1
%exitcond = icmp eq i32 %inc.us, %J
br i1 %exitcond, label %for.latch, label %for.inner
for.latch:
%add.us.lcssa = phi i32 [ %add.us, %for.inner ]
%arrayidx6.us = getelementptr inbounds i32, i32* %A, i32 %i.us
store i32 %add.us.lcssa, i32* %arrayidx6.us, align 4
%add8.us = add nuw i32 %i.us, 1
%exitcond25 = icmp eq i32 %add8.us, %I
br i1 %exitcond25, label %for.end.loopexit, label %for.outer
for.end.loopexit:
br label %for.end
for.end:
ret void
}
; CHECK-LABEL: nounroll_and_jam
; #pragma nounroll_and_jam
; Pragma test: the latch back-edge carries !llvm.loop !1
; (llvm.loop.unroll_and_jam.disable), so the CHECK asserts the original,
; non-".new" preheader phi survives -- no unroll-and-jam happens.
define void @nounroll_and_jam(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) {
; CHECK: %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
entry:
%cmp = icmp ne i32 %J, 0
%cmp122 = icmp ne i32 %I, 0
%or.cond = and i1 %cmp, %cmp122
br i1 %or.cond, label %for.outer.preheader, label %for.end
for.outer.preheader:
br label %for.outer
for.outer:
%i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
br label %for.inner
for.inner:
%j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ]
%sum1.us = phi i32 [ 0, %for.outer ], [ %add.us, %for.inner ]
%arrayidx.us = getelementptr inbounds i32, i32* %B, i32 %j.us
%0 = load i32, i32* %arrayidx.us, align 4
%add.us = add i32 %0, %sum1.us
%inc.us = add nuw i32 %j.us, 1
%exitcond = icmp eq i32 %inc.us, %J
br i1 %exitcond, label %for.latch, label %for.inner
for.latch:
%add.us.lcssa = phi i32 [ %add.us, %for.inner ]
%arrayidx6.us = getelementptr inbounds i32, i32* %A, i32 %i.us
store i32 %add.us.lcssa, i32* %arrayidx6.us, align 4
%add8.us = add nuw i32 %i.us, 1
%exitcond25 = icmp eq i32 %add8.us, %I
br i1 %exitcond25, label %for.end.loopexit, label %for.outer, !llvm.loop !1
for.end.loopexit:
br label %for.end
for.end:
ret void
}
; CHECK-LABEL: unroll_and_jam_count
; #pragma unroll_and_jam(8)
; Pragma test: !llvm.loop !3 carries llvm.loop.unroll_and_jam.count = 8,
; so the outer phi's incoming value is the 8th unrolled copy (%add8.us.7).
define void @unroll_and_jam_count(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) {
; CHECK: %i.us = phi i32 [ %add8.us.7, %for.latch ], [ 0, %for.outer.preheader.new ]
entry:
%cmp = icmp ne i32 %J, 0
%cmp122 = icmp ne i32 %I, 0
%or.cond = and i1 %cmp, %cmp122
br i1 %or.cond, label %for.outer.preheader, label %for.end
for.outer.preheader:
br label %for.outer
for.outer:
%i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
br label %for.inner
for.inner:
%j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ]
%sum1.us = phi i32 [ 0, %for.outer ], [ %add.us, %for.inner ]
%arrayidx.us = getelementptr inbounds i32, i32* %B, i32 %j.us
%0 = load i32, i32* %arrayidx.us, align 4
%add.us = add i32 %0, %sum1.us
%inc.us = add nuw i32 %j.us, 1
%exitcond = icmp eq i32 %inc.us, %J
br i1 %exitcond, label %for.latch, label %for.inner
for.latch:
%add.us.lcssa = phi i32 [ %add.us, %for.inner ]
%arrayidx6.us = getelementptr inbounds i32, i32* %A, i32 %i.us
store i32 %add.us.lcssa, i32* %arrayidx6.us, align 4
%add8.us = add nuw i32 %i.us, 1
%exitcond25 = icmp eq i32 %add8.us, %I
br i1 %exitcond25, label %for.end.loopexit, label %for.outer, !llvm.loop !3
for.end.loopexit:
br label %for.end
for.end:
ret void
}
; CHECK-LABEL: unroll_and_jam
; #pragma unroll_and_jam
; Pragma test: !llvm.loop !5 carries llvm.loop.unroll_and_jam.enable.
; The explicit pragma forces the transform even under the low threshold RUN
; line -- both CHECK prefixes expect the unrolled ".new" preheader form.
define void @unroll_and_jam(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) {
; CHECK: %i.us = phi i32 [ %add8.us.{{[1-9]*}}, %for.latch ], [ 0, %for.outer.preheader.new ]
; CHECK-LOWTHRES: %i.us = phi i32 [ %add8.us.{{[1-9]*}}, %for.latch ], [ 0, %for.outer.preheader.new ]
entry:
%cmp = icmp ne i32 %J, 0
%cmp122 = icmp ne i32 %I, 0
%or.cond = and i1 %cmp, %cmp122
br i1 %or.cond, label %for.outer.preheader, label %for.end
for.outer.preheader:
br label %for.outer
for.outer:
%i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
br label %for.inner
for.inner:
%j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ]
%sum1.us = phi i32 [ 0, %for.outer ], [ %add.us, %for.inner ]
%arrayidx.us = getelementptr inbounds i32, i32* %B, i32 %j.us
%0 = load i32, i32* %arrayidx.us, align 4
%add.us = add i32 %0, %sum1.us
%inc.us = add nuw i32 %j.us, 1
%exitcond = icmp eq i32 %inc.us, %J
br i1 %exitcond, label %for.latch, label %for.inner
for.latch:
%add.us.lcssa = phi i32 [ %add.us, %for.inner ]
%arrayidx6.us = getelementptr inbounds i32, i32* %A, i32 %i.us
store i32 %add.us.lcssa, i32* %arrayidx6.us, align 4
%add8.us = add nuw i32 %i.us, 1
%exitcond25 = icmp eq i32 %add8.us, %I
br i1 %exitcond25, label %for.end.loopexit, label %for.outer, !llvm.loop !5
for.end.loopexit:
br label %for.end
for.end:
ret void
}
; CHECK-LABEL: nounroll
; #pragma nounroll (which we take to mean disable unroll and jam too)
; Pragma test: !llvm.loop !7 carries llvm.loop.unroll.disable (#pragma
; nounroll), which this pass also honours as "no unroll and jam" -- the
; CHECK asserts the original preheader phi remains.
define void @nounroll(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) {
; CHECK: %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
entry:
%cmp = icmp ne i32 %J, 0
%cmp122 = icmp ne i32 %I, 0
%or.cond = and i1 %cmp, %cmp122
br i1 %or.cond, label %for.outer.preheader, label %for.end
for.outer.preheader:
br label %for.outer
for.outer:
%i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
br label %for.inner
for.inner:
%j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ]
%sum1.us = phi i32 [ 0, %for.outer ], [ %add.us, %for.inner ]
%arrayidx.us = getelementptr inbounds i32, i32* %B, i32 %j.us
%0 = load i32, i32* %arrayidx.us, align 4
%add.us = add i32 %0, %sum1.us
%inc.us = add nuw i32 %j.us, 1
%exitcond = icmp eq i32 %inc.us, %J
br i1 %exitcond, label %for.latch, label %for.inner
for.latch:
%add.us.lcssa = phi i32 [ %add.us, %for.inner ]
%arrayidx6.us = getelementptr inbounds i32, i32* %A, i32 %i.us
store i32 %add.us.lcssa, i32* %arrayidx6.us, align 4
%add8.us = add nuw i32 %i.us, 1
%exitcond25 = icmp eq i32 %add8.us, %I
br i1 %exitcond25, label %for.end.loopexit, label %for.outer, !llvm.loop !7
for.end.loopexit:
br label %for.end
for.end:
ret void
}
; CHECK-LABEL: unroll
; #pragma unroll (which we take to mean disable unroll and jam)
; Pragma test: !llvm.loop !9 carries llvm.loop.unroll.enable (#pragma unroll),
; which is taken to mean "plain unroll, not unroll-and-jam" -- the CHECK
; asserts this pass leaves the loop alone.
define void @unroll(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) {
; CHECK: %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
entry:
%cmp = icmp ne i32 %J, 0
%cmp122 = icmp ne i32 %I, 0
%or.cond = and i1 %cmp, %cmp122
br i1 %or.cond, label %for.outer.preheader, label %for.end
for.outer.preheader:
br label %for.outer
for.outer:
%i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
br label %for.inner
for.inner:
%j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ]
%sum1.us = phi i32 [ 0, %for.outer ], [ %add.us, %for.inner ]
%arrayidx.us = getelementptr inbounds i32, i32* %B, i32 %j.us
%0 = load i32, i32* %arrayidx.us, align 4
%add.us = add i32 %0, %sum1.us
%inc.us = add nuw i32 %j.us, 1
%exitcond = icmp eq i32 %inc.us, %J
br i1 %exitcond, label %for.latch, label %for.inner
for.latch:
%add.us.lcssa = phi i32 [ %add.us, %for.inner ]
%arrayidx6.us = getelementptr inbounds i32, i32* %A, i32 %i.us
store i32 %add.us.lcssa, i32* %arrayidx6.us, align 4
%add8.us = add nuw i32 %i.us, 1
%exitcond25 = icmp eq i32 %add8.us, %I
br i1 %exitcond25, label %for.end.loopexit, label %for.outer, !llvm.loop !9
for.end.loopexit:
br label %for.end
for.end:
ret void
}
; CHECK-LABEL: nounroll_plus_unroll_and_jam
; #pragma clang loop nounroll, unroll_and_jam (which we take to mean do unroll_and_jam)
; Pragma test: !llvm.loop !11 combines llvm.loop.unroll.disable (!8) with
; llvm.loop.unroll_and_jam.enable (!6); the more specific unroll_and_jam
; pragma wins, so the CHECK expects the unrolled ".new" preheader form.
define void @nounroll_plus_unroll_and_jam(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) {
; CHECK: %i.us = phi i32 [ %add8.us.{{[1-9]*}}, %for.latch ], [ 0, %for.outer.preheader.new ]
entry:
%cmp = icmp ne i32 %J, 0
%cmp122 = icmp ne i32 %I, 0
%or.cond = and i1 %cmp, %cmp122
br i1 %or.cond, label %for.outer.preheader, label %for.end
for.outer.preheader:
br label %for.outer
for.outer:
%i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
br label %for.inner
for.inner:
%j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ]
%sum1.us = phi i32 [ 0, %for.outer ], [ %add.us, %for.inner ]
%arrayidx.us = getelementptr inbounds i32, i32* %B, i32 %j.us
%0 = load i32, i32* %arrayidx.us, align 4
%add.us = add i32 %0, %sum1.us
%inc.us = add nuw i32 %j.us, 1
%exitcond = icmp eq i32 %inc.us, %J
br i1 %exitcond, label %for.latch, label %for.inner
for.latch:
%add.us.lcssa = phi i32 [ %add.us, %for.inner ]
%arrayidx6.us = getelementptr inbounds i32, i32* %A, i32 %i.us
store i32 %add.us.lcssa, i32* %arrayidx6.us, align 4
%add8.us = add nuw i32 %i.us, 1
%exitcond25 = icmp eq i32 %add8.us, %I
br i1 %exitcond25, label %for.end.loopexit, label %for.outer, !llvm.loop !11
for.end.loopexit:
br label %for.end
for.end:
ret void
}
; Loop metadata referenced by the !llvm.loop attachments in the tests above.
!1 = distinct !{!1, !2}
; models: #pragma nounroll_and_jam
!2 = distinct !{!"llvm.loop.unroll_and_jam.disable"}
!3 = distinct !{!3, !4}
; models: #pragma unroll_and_jam(8)
!4 = distinct !{!"llvm.loop.unroll_and_jam.count", i32 8}
!5 = distinct !{!5, !6}
; models: #pragma unroll_and_jam
!6 = distinct !{!"llvm.loop.unroll_and_jam.enable"}
!7 = distinct !{!7, !8}
; models: #pragma nounroll
!8 = distinct !{!"llvm.loop.unroll.disable"}
!9 = distinct !{!9, !10}
; models: #pragma unroll
!10 = distinct !{!"llvm.loop.unroll.enable"}
; combines unroll.disable (!8) with unroll_and_jam.enable (!6)
!11 = distinct !{!11, !8, !6}

; ---- View File ----
; (diff-view residue: start of a new 217-line test file added by this commit)
; @ -0,0 +1,217 @@
; RUN: opt -loop-unroll-and-jam -allow-unroll-and-jam -pass-remarks=loop-unroll < %s -S 2>&1 | FileCheck %s
target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
target triple = "thumbv8m.main-arm-none-eabi"
;; Common check for all tests. None should be unroll and jammed due to profitability
; CHECK-NOT: remark: {{.*}} unroll and jammed
; CHECK-LABEL: unprof1
; Multiple inner loop blocks
; Unprofitable case: the inner loop is split across two blocks
; (for.inner and for.inner2). The file-wide CHECK-NOT above asserts no
; "unroll and jammed" remark is emitted for any of these functions.
define void @unprof1(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
; CHECK: %i = phi i32 [ %addinc, %for.latch ], [ 0, %for.outer.preheader ]
; CHECK: %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner2 ]
entry:
%cmp = icmp ne i32 %J, 0
%cmp122 = icmp ne i32 %I, 0
%or.cond = and i1 %cmp, %cmp122
br i1 %or.cond, label %for.outer.preheader, label %for.end
for.outer.preheader:
br label %for.outer
for.outer:
%i = phi i32 [ %addinc, %for.latch ], [ 0, %for.outer.preheader ]
br label %for.inner
for.inner:
%j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner2 ]
%sum1 = phi i32 [ 0, %for.outer ], [ %add, %for.inner2 ]
%arrayidx = getelementptr inbounds i32, i32* %B, i32 %j
%0 = load i32, i32* %arrayidx, align 4
%add = add i32 %0, %sum1
br label %for.inner2
; Second block of the same inner loop: multi-block inner loops are deemed
; unprofitable.
for.inner2:
%inc = add nuw i32 %j, 1
%exitcond = icmp eq i32 %inc, %J
br i1 %exitcond, label %for.latch, label %for.inner
for.latch:
%add.lcssa = phi i32 [ %add, %for.inner2 ]
%arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %i
store i32 %add.lcssa, i32* %arrayidx6, align 4
%addinc = add nuw i32 %i, 1
%exitcond25 = icmp eq i32 %addinc, %I
br i1 %exitcond25, label %for.loopexit, label %for.outer
for.loopexit:
br label %for.end
for.end:
ret void
}
; CHECK-LABEL: unprof2
; Constant inner loop count
; Unprofitable case: the inner trip count is the constant 10 rather than a
; loop-variant bound; the CHECK lines assert the original phis survive.
define void @unprof2(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
; CHECK: %i = phi i32 [ %addinc, %for.latch ], [ 0, %for.outer.preheader ]
; CHECK: %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
entry:
%cmp = icmp ne i32 %J, 0
%cmp122 = icmp ne i32 %I, 0
%or.cond = and i1 %cmp, %cmp122
br i1 %or.cond, label %for.outer.preheader, label %for.end
for.outer.preheader:
br label %for.outer
for.outer:
%i = phi i32 [ %addinc, %for.latch ], [ 0, %for.outer.preheader ]
br label %for.inner
for.inner:
%j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
%sum1 = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
%arrayidx = getelementptr inbounds i32, i32* %B, i32 %j
%0 = load i32, i32* %arrayidx, align 4
%add = add i32 %0, %sum1
%inc = add nuw i32 %j, 1
; Constant bound (10), unlike the %J-bounded loops elsewhere in this file.
%exitcond = icmp eq i32 %inc, 10
br i1 %exitcond, label %for.latch, label %for.inner
for.latch:
%add.lcssa = phi i32 [ %add, %for.inner ]
%arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %i
store i32 %add.lcssa, i32* %arrayidx6, align 4
%addinc = add nuw i32 %i, 1
%exitcond25 = icmp eq i32 %addinc, %I
br i1 %exitcond25, label %for.loopexit, label %for.outer
for.loopexit:
br label %for.end
for.end:
ret void
}
; CHECK-LABEL: unprof3
; Complex inner loop
; Unprofitable case: the inner loop body is padded with 30 extra adds so its
; size exceeds the profitability limit; no unroll-and-jam remark is expected.
define void @unprof3(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
; CHECK: %i = phi i32 [ %addinc, %for.latch ], [ 0, %for.outer.preheader ]
; CHECK: %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
entry:
%cmp = icmp ne i32 %J, 0
%cmp122 = icmp ne i32 %I, 0
%or.cond = and i1 %cmp, %cmp122
br i1 %or.cond, label %for.outer.preheader, label %for.end
for.outer.preheader:
br label %for.outer
for.outer:
%i = phi i32 [ %addinc, %for.latch ], [ 0, %for.outer.preheader ]
br label %for.inner
for.inner:
%j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
%sum1 = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
%arrayidx = getelementptr inbounds i32, i32* %B, i32 %j
%0 = load i32, i32* %arrayidx, align 4
%add = add i32 %0, %sum1
; %add0..%add29 are dead padding that only inflates the inner-loop size.
%add0 = add i32 %0, %sum1
%add1 = add i32 %0, %sum1
%add2 = add i32 %0, %sum1
%add3 = add i32 %0, %sum1
%add4 = add i32 %0, %sum1
%add5 = add i32 %0, %sum1
%add6 = add i32 %0, %sum1
%add7 = add i32 %0, %sum1
%add8 = add i32 %0, %sum1
%add9 = add i32 %0, %sum1
%add10 = add i32 %0, %sum1
%add11 = add i32 %0, %sum1
%add12 = add i32 %0, %sum1
%add13 = add i32 %0, %sum1
%add14 = add i32 %0, %sum1
%add15 = add i32 %0, %sum1
%add16 = add i32 %0, %sum1
%add17 = add i32 %0, %sum1
%add18 = add i32 %0, %sum1
%add19 = add i32 %0, %sum1
%add20 = add i32 %0, %sum1
%add21 = add i32 %0, %sum1
%add22 = add i32 %0, %sum1
%add23 = add i32 %0, %sum1
%add24 = add i32 %0, %sum1
%add25 = add i32 %0, %sum1
%add26 = add i32 %0, %sum1
%add27 = add i32 %0, %sum1
%add28 = add i32 %0, %sum1
%add29 = add i32 %0, %sum1
%inc = add nuw i32 %j, 1
%exitcond = icmp eq i32 %inc, %J
br i1 %exitcond, label %for.latch, label %for.inner
for.latch:
%add.lcssa = phi i32 [ %add, %for.inner ]
%arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %i
store i32 %add.lcssa, i32* %arrayidx6, align 4
%addinc = add nuw i32 %i, 1
%exitcond25 = icmp eq i32 %addinc, %I
br i1 %exitcond25, label %for.loopexit, label %for.outer
for.loopexit:
br label %for.end
for.end:
ret void
}
; CHECK-LABEL: unprof4
; No loop invariant loads
; Unprofitable case: the load address depends on %j + %i (%j2), so there is
; no outer-loop-invariant load whose reuse would make jamming worthwhile.
define void @unprof4(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
; CHECK: %i = phi i32 [ %addinc, %for.latch ], [ 0, %for.outer.preheader ]
; CHECK: %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
entry:
%cmp = icmp ne i32 %J, 0
%cmp122 = icmp ne i32 %I, 0
%or.cond = and i1 %cmp, %cmp122
br i1 %or.cond, label %for.outer.preheader, label %for.end
for.outer.preheader:
br label %for.outer
for.outer:
%i = phi i32 [ %addinc, %for.latch ], [ 0, %for.outer.preheader ]
br label %for.inner
for.inner:
%j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
%sum1 = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
; Index varies with the outer induction %i, making B[%j2] non-invariant
; across outer iterations.
%j2 = add i32 %j, %i
%arrayidx = getelementptr inbounds i32, i32* %B, i32 %j2
%0 = load i32, i32* %arrayidx, align 4
%add = add i32 %0, %sum1
%inc = add nuw i32 %j, 1
%exitcond = icmp eq i32 %inc, %J
br i1 %exitcond, label %for.latch, label %for.inner
for.latch:
%add.lcssa = phi i32 [ %add, %for.inner ]
%arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %i
store i32 %add.lcssa, i32* %arrayidx6, align 4
%addinc = add nuw i32 %i, 1
%exitcond25 = icmp eq i32 %addinc, %I
br i1 %exitcond25, label %for.loopexit, label %for.outer
for.loopexit:
br label %for.end
for.end:
ret void
}

; ---- View File ----
; (diff-view residue: start of a new 735-line test file added by this commit)
; @ -0,0 +1,735 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -basicaa -tbaa -loop-unroll-and-jam -allow-unroll-and-jam -unroll-and-jam-count=4 -unroll-remainder < %s -S | FileCheck %s
target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
; CHECK-LABEL: test1
; Tests for(i) { sum = 0; for(j) sum += B[j]; A[i] = sum; }
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[J:%.*]], 0
; CHECK-NEXT: [[CMPJ:%.*]] = icmp ne i32 [[I:%.*]], 0
; CHECK-NEXT: [[OR_COND:%.*]] = and i1 [[CMP]], [[CMPJ]]
; CHECK-NEXT: br i1 [[OR_COND]], label [[FOR_OUTER_PREHEADER:%.*]], label [[FOR_END:%.*]]
; CHECK: for.outer.preheader:
; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[I]], -1
; CHECK-NEXT: [[XTRAITER:%.*]] = and i32 [[I]], 3
; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i32 [[TMP0]], 3
; CHECK-NEXT: br i1 [[TMP1]], label [[FOR_END_LOOPEXIT_UNR_LCSSA:%.*]], label [[FOR_OUTER_PREHEADER_NEW:%.*]]
; CHECK: for.outer.preheader.new:
; CHECK-NEXT: [[UNROLL_ITER:%.*]] = sub i32 [[I]], [[XTRAITER]]
; CHECK-NEXT: br label [[FOR_OUTER:%.*]]
; CHECK: for.outer:
; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[ADD8_3:%.*]], [[FOR_LATCH:%.*]] ], [ 0, [[FOR_OUTER_PREHEADER_NEW]] ]
; CHECK-NEXT: [[NITER:%.*]] = phi i32 [ [[UNROLL_ITER]], [[FOR_OUTER_PREHEADER_NEW]] ], [ [[NITER_NSUB_3:%.*]], [[FOR_LATCH]] ]
; CHECK-NEXT: [[ADD8:%.*]] = add nuw nsw i32 [[I]], 1
; CHECK-NEXT: [[NITER_NSUB:%.*]] = sub i32 [[NITER]], 1
; CHECK-NEXT: [[ADD8_1:%.*]] = add nuw nsw i32 [[ADD8]], 1
; CHECK-NEXT: [[NITER_NSUB_1:%.*]] = sub i32 [[NITER_NSUB]], 1
; CHECK-NEXT: [[ADD8_2:%.*]] = add nuw nsw i32 [[ADD8_1]], 1
; CHECK-NEXT: [[NITER_NSUB_2:%.*]] = sub i32 [[NITER_NSUB_1]], 1
; CHECK-NEXT: [[ADD8_3]] = add nuw i32 [[ADD8_2]], 1
; CHECK-NEXT: [[NITER_NSUB_3]] = sub i32 [[NITER_NSUB_2]], 1
; CHECK-NEXT: br label [[FOR_INNER:%.*]]
; CHECK: for.inner:
; CHECK-NEXT: [[J_0:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[INC:%.*]], [[FOR_INNER]] ]
; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[ADD:%.*]], [[FOR_INNER]] ]
; CHECK-NEXT: [[J_1:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[INC_1:%.*]], [[FOR_INNER]] ]
; CHECK-NEXT: [[SUM_1:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[ADD_1:%.*]], [[FOR_INNER]] ]
; CHECK-NEXT: [[J_2:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[INC_2:%.*]], [[FOR_INNER]] ]
; CHECK-NEXT: [[SUM_2:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[ADD_2:%.*]], [[FOR_INNER]] ]
; CHECK-NEXT: [[J_3:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[INC_3:%.*]], [[FOR_INNER]] ]
; CHECK-NEXT: [[SUM_3:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[ADD_3:%.*]], [[FOR_INNER]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[J_0]]
; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX]], align 4, !tbaa !0
; CHECK-NEXT: [[ADD]] = add i32 [[TMP2]], [[SUM]]
; CHECK-NEXT: [[INC]] = add nuw i32 [[J_0]], 1
; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[J_1]]
; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4, !tbaa !0
; CHECK-NEXT: [[ADD_1]] = add i32 [[TMP3]], [[SUM_1]]
; CHECK-NEXT: [[INC_1]] = add nuw i32 [[J_1]], 1
; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[J_2]]
; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4, !tbaa !0
; CHECK-NEXT: [[ADD_2]] = add i32 [[TMP4]], [[SUM_2]]
; CHECK-NEXT: [[INC_2]] = add nuw i32 [[J_2]], 1
; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[J_3]]
; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4, !tbaa !0
; CHECK-NEXT: [[ADD_3]] = add i32 [[TMP5]], [[SUM_3]]
; CHECK-NEXT: [[INC_3]] = add nuw i32 [[J_3]], 1
; CHECK-NEXT: [[EXITCOND_3:%.*]] = icmp eq i32 [[INC_3]], [[J]]
; CHECK-NEXT: br i1 [[EXITCOND_3]], label [[FOR_LATCH]], label [[FOR_INNER]]
; CHECK: for.latch:
; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_INNER]] ]
; CHECK-NEXT: [[ADD_LCSSA_1:%.*]] = phi i32 [ [[ADD_1]], [[FOR_INNER]] ]
; CHECK-NEXT: [[ADD_LCSSA_2:%.*]] = phi i32 [ [[ADD_2]], [[FOR_INNER]] ]
; CHECK-NEXT: [[ADD_LCSSA_3:%.*]] = phi i32 [ [[ADD_3]], [[FOR_INNER]] ]
; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I]]
; CHECK-NEXT: store i32 [[ADD_LCSSA]], i32* [[ARRAYIDX6]], align 4, !tbaa !0
; CHECK-NEXT: [[ARRAYIDX6_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[ADD8]]
; CHECK-NEXT: store i32 [[ADD_LCSSA_1]], i32* [[ARRAYIDX6_1]], align 4, !tbaa !0
; CHECK-NEXT: [[ARRAYIDX6_2:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[ADD8_1]]
; CHECK-NEXT: store i32 [[ADD_LCSSA_2]], i32* [[ARRAYIDX6_2]], align 4, !tbaa !0
; CHECK-NEXT: [[ARRAYIDX6_3:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[ADD8_2]]
; CHECK-NEXT: store i32 [[ADD_LCSSA_3]], i32* [[ARRAYIDX6_3]], align 4, !tbaa !0
; CHECK-NEXT: [[NITER_NCMP_3:%.*]] = icmp eq i32 [[NITER_NSUB_3]], 0
; CHECK-NEXT: br i1 [[NITER_NCMP_3]], label [[FOR_END_LOOPEXIT_UNR_LCSSA_LOOPEXIT:%.*]], label [[FOR_OUTER]], !llvm.loop !4
; CHECK: for.end.loopexit.unr-lcssa.loopexit:
; CHECK-NEXT: [[I_UNR_PH:%.*]] = phi i32 [ [[ADD8_3]], [[FOR_LATCH]] ]
; CHECK-NEXT: br label [[FOR_END_LOOPEXIT_UNR_LCSSA]]
; CHECK: for.end.loopexit.unr-lcssa:
; CHECK-NEXT: [[I_UNR:%.*]] = phi i32 [ 0, [[FOR_OUTER_PREHEADER]] ], [ [[I_UNR_PH]], [[FOR_END_LOOPEXIT_UNR_LCSSA_LOOPEXIT]] ]
; CHECK-NEXT: [[LCMP_MOD:%.*]] = icmp ne i32 [[XTRAITER]], 0
; CHECK-NEXT: br i1 [[LCMP_MOD]], label [[FOR_OUTER_EPIL_PREHEADER:%.*]], label [[FOR_END_LOOPEXIT:%.*]]
; CHECK: for.outer.epil.preheader:
; CHECK-NEXT: br label [[FOR_OUTER_EPIL:%.*]]
; CHECK: for.outer.epil:
; CHECK-NEXT: br label [[FOR_INNER_EPIL:%.*]]
; CHECK: for.inner.epil:
; CHECK-NEXT: [[J_EPIL:%.*]] = phi i32 [ 0, [[FOR_OUTER_EPIL]] ], [ [[INC_EPIL:%.*]], [[FOR_INNER_EPIL]] ]
; CHECK-NEXT: [[SUM_EPIL:%.*]] = phi i32 [ 0, [[FOR_OUTER_EPIL]] ], [ [[ADD_EPIL:%.*]], [[FOR_INNER_EPIL]] ]
; CHECK-NEXT: [[ARRAYIDX_EPIL:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[J_EPIL]]
; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_EPIL]], align 4, !tbaa !0
; CHECK-NEXT: [[ADD_EPIL]] = add i32 [[TMP6]], [[SUM_EPIL]]
; CHECK-NEXT: [[INC_EPIL]] = add nuw i32 [[J_EPIL]], 1
; CHECK-NEXT: [[EXITCOND_EPIL:%.*]] = icmp eq i32 [[INC_EPIL]], [[J]]
; CHECK-NEXT: br i1 [[EXITCOND_EPIL]], label [[FOR_LATCH_EPIL:%.*]], label [[FOR_INNER_EPIL]]
; CHECK: for.latch.epil:
; CHECK-NEXT: [[ADD_LCSSA_EPIL:%.*]] = phi i32 [ [[ADD_EPIL]], [[FOR_INNER_EPIL]] ]
; CHECK-NEXT: [[ARRAYIDX6_EPIL:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[I_UNR]]
; CHECK-NEXT: store i32 [[ADD_LCSSA_EPIL]], i32* [[ARRAYIDX6_EPIL]], align 4, !tbaa !0
; CHECK-NEXT: [[ADD8_EPIL:%.*]] = add nuw i32 [[I_UNR]], 1
; CHECK-NEXT: [[EPIL_ITER_SUB:%.*]] = sub i32 [[XTRAITER]], 1
; CHECK-NEXT: [[EPIL_ITER_CMP:%.*]] = icmp ne i32 [[EPIL_ITER_SUB]], 0
; CHECK-NEXT: br i1 [[EPIL_ITER_CMP]], label [[FOR_OUTER_EPIL_1:%.*]], label [[FOR_END_LOOPEXIT_EPILOG_LCSSA:%.*]]
; CHECK: for.end.loopexit.epilog-lcssa:
; CHECK-NEXT: br label [[FOR_END_LOOPEXIT]]
; CHECK: for.end.loopexit:
; CHECK-NEXT: br label [[FOR_END]]
; CHECK: for.end:
; CHECK-NEXT: ret void
; CHECK: for.outer.epil.1:
; CHECK-NEXT: br label [[FOR_INNER_EPIL_1:%.*]]
; CHECK: for.inner.epil.1:
; CHECK-NEXT: [[J_EPIL_1:%.*]] = phi i32 [ 0, [[FOR_OUTER_EPIL_1]] ], [ [[INC_EPIL_1:%.*]], [[FOR_INNER_EPIL_1]] ]
; CHECK-NEXT: [[SUM_EPIL_1:%.*]] = phi i32 [ 0, [[FOR_OUTER_EPIL_1]] ], [ [[ADD_EPIL_1:%.*]], [[FOR_INNER_EPIL_1]] ]
; CHECK-NEXT: [[ARRAYIDX_EPIL_1:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[J_EPIL_1]]
; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX_EPIL_1]], align 4, !tbaa !0
; CHECK-NEXT: [[ADD_EPIL_1]] = add i32 [[TMP7]], [[SUM_EPIL_1]]
; CHECK-NEXT: [[INC_EPIL_1]] = add nuw i32 [[J_EPIL_1]], 1
; CHECK-NEXT: [[EXITCOND_EPIL_1:%.*]] = icmp eq i32 [[INC_EPIL_1]], [[J]]
; CHECK-NEXT: br i1 [[EXITCOND_EPIL_1]], label [[FOR_LATCH_EPIL_1:%.*]], label [[FOR_INNER_EPIL_1]]
; CHECK: for.latch.epil.1:
; CHECK-NEXT: [[ADD_LCSSA_EPIL_1:%.*]] = phi i32 [ [[ADD_EPIL_1]], [[FOR_INNER_EPIL_1]] ]
; CHECK-NEXT: [[ARRAYIDX6_EPIL_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[ADD8_EPIL]]
; CHECK-NEXT: store i32 [[ADD_LCSSA_EPIL_1]], i32* [[ARRAYIDX6_EPIL_1]], align 4, !tbaa !0
; CHECK-NEXT: [[ADD8_EPIL_1:%.*]] = add nuw i32 [[ADD8_EPIL]], 1
; CHECK-NEXT: [[EPIL_ITER_SUB_1:%.*]] = sub i32 [[EPIL_ITER_SUB]], 1
; CHECK-NEXT: [[EPIL_ITER_CMP_1:%.*]] = icmp ne i32 [[EPIL_ITER_SUB_1]], 0
; CHECK-NEXT: br i1 [[EPIL_ITER_CMP_1]], label [[FOR_OUTER_EPIL_2:%.*]], label [[FOR_END_LOOPEXIT_EPILOG_LCSSA]]
; CHECK: for.outer.epil.2:
; CHECK-NEXT: br label [[FOR_INNER_EPIL_2:%.*]]
; CHECK: for.inner.epil.2:
; CHECK-NEXT: [[J_EPIL_2:%.*]] = phi i32 [ 0, [[FOR_OUTER_EPIL_2]] ], [ [[INC_EPIL_2:%.*]], [[FOR_INNER_EPIL_2]] ]
; CHECK-NEXT: [[SUM_EPIL_2:%.*]] = phi i32 [ 0, [[FOR_OUTER_EPIL_2]] ], [ [[ADD_EPIL_2:%.*]], [[FOR_INNER_EPIL_2]] ]
; CHECK-NEXT: [[ARRAYIDX_EPIL_2:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[J_EPIL_2]]
; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX_EPIL_2]], align 4, !tbaa !0
; CHECK-NEXT: [[ADD_EPIL_2]] = add i32 [[TMP8]], [[SUM_EPIL_2]]
; CHECK-NEXT: [[INC_EPIL_2]] = add nuw i32 [[J_EPIL_2]], 1
; CHECK-NEXT: [[EXITCOND_EPIL_2:%.*]] = icmp eq i32 [[INC_EPIL_2]], [[J]]
; CHECK-NEXT: br i1 [[EXITCOND_EPIL_2]], label [[FOR_LATCH_EPIL_2:%.*]], label [[FOR_INNER_EPIL_2]]
; CHECK: for.latch.epil.2:
; CHECK-NEXT: [[ADD_LCSSA_EPIL_2:%.*]] = phi i32 [ [[ADD_EPIL_2]], [[FOR_INNER_EPIL_2]] ]
; CHECK-NEXT: [[ARRAYIDX6_EPIL_2:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[ADD8_EPIL_1]]
; CHECK-NEXT: store i32 [[ADD_LCSSA_EPIL_2]], i32* [[ARRAYIDX6_EPIL_2]], align 4, !tbaa !0
; CHECK-NEXT: [[ADD8_EPIL_2:%.*]] = add nuw i32 [[ADD8_EPIL_1]], 1
; CHECK-NEXT: [[EPIL_ITER_SUB_2:%.*]] = sub i32 [[EPIL_ITER_SUB_1]], 1
; CHECK-NEXT: br label [[FOR_END_LOOPEXIT_EPILOG_LCSSA]]
; Input IR for the autogenerated CHECK block above:
;   for(i) { sum = 0; for(j) sum += B[j]; A[i] = sum; }
; Run with -unroll-and-jam-count=4 -unroll-remainder, so the checks expect a
; 4x jammed body plus a fully unrolled 3-iteration epilogue.
define void @test1(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
entry:
%cmp = icmp ne i32 %J, 0
%cmpJ = icmp ne i32 %I, 0
%or.cond = and i1 %cmp, %cmpJ
br i1 %or.cond, label %for.outer.preheader, label %for.end
for.outer.preheader:
br label %for.outer
for.outer:
%i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ]
br label %for.inner
for.inner:
%j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
%sum = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
%arrayidx = getelementptr inbounds i32, i32* %B, i32 %j
%0 = load i32, i32* %arrayidx, align 4, !tbaa !5
%add = add i32 %0, %sum
%inc = add nuw i32 %j, 1
%exitcond = icmp eq i32 %inc, %J
br i1 %exitcond, label %for.latch, label %for.inner
for.latch:
%add.lcssa = phi i32 [ %add, %for.inner ]
%arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %i
store i32 %add.lcssa, i32* %arrayidx6, align 4, !tbaa !5
%add8 = add nuw i32 %i, 1
%exitcond25 = icmp eq i32 %add8, %I
br i1 %exitcond25, label %for.end.loopexit, label %for.outer
for.end.loopexit:
br label %for.end
for.end:
ret void
}
; CHECK-LABEL: test2
; Tests for(i) { sum = A[i]; for(j) sum += B[j]; A[i] = sum; }
; A[i] load/store dependency should not block unroll-and-jam
; CHECK: for.outer:
; CHECK: %i = phi i32 [ %add9.3, %for.latch ], [ 0, %for.outer.preheader.new ]
; CHECK: %niter = phi i32 [ %unroll_iter, %for.outer.preheader.new ], [ %niter.nsub.3, %for.latch ]
; CHECK: br label %for.inner
; CHECK: for.inner:
; CHECK: %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
; CHECK: %sum = phi i32 [ %2, %for.outer ], [ %add, %for.inner ]
; CHECK: %j.1 = phi i32 [ 0, %for.outer ], [ %inc.1, %for.inner ]
; CHECK: %sum.1 = phi i32 [ %3, %for.outer ], [ %add.1, %for.inner ]
; CHECK: %j.2 = phi i32 [ 0, %for.outer ], [ %inc.2, %for.inner ]
; CHECK: %sum.2 = phi i32 [ %4, %for.outer ], [ %add.2, %for.inner ]
; CHECK: %j.3 = phi i32 [ 0, %for.outer ], [ %inc.3, %for.inner ]
; CHECK: %sum.3 = phi i32 [ %5, %for.outer ], [ %add.3, %for.inner ]
; CHECK: br i1 %exitcond.3, label %for.latch, label %for.inner
; CHECK: for.latch:
; CHECK: %add.lcssa = phi i32 [ %add, %for.inner ]
; CHECK: %add.lcssa.1 = phi i32 [ %add.1, %for.inner ]
; CHECK: %add.lcssa.2 = phi i32 [ %add.2, %for.inner ]
; CHECK: %add.lcssa.3 = phi i32 [ %add.3, %for.inner ]
; CHECK: br i1 %niter.ncmp.3, label %for.end10.loopexit.unr-lcssa.loopexit, label %for.outer
; CHECK: for.end10.loopexit.unr-lcssa.loopexit:
; test2: for (i) { sum = A[i]; for (j) sum += B[j]; A[i] = sum; }
; The load of A[i] is in the ForeBlock and the store in the AftBlock of the
; same iteration, so this dependency must not block unroll-and-jam.
define void @test2(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
entry:
; Guard: run only when both I and J are non-zero.
%cmp = icmp ne i32 %J, 0
%cmp125 = icmp ne i32 %I, 0
%or.cond = and i1 %cmp, %cmp125
br i1 %or.cond, label %for.outer.preheader, label %for.end10
for.outer.preheader:
br label %for.outer
for.outer:
%i = phi i32 [ %add9, %for.latch ], [ 0, %for.outer.preheader ]
; ForeBlock: load A[i] to seed the inner-loop accumulator.
%arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
%0 = load i32, i32* %arrayidx, align 4, !tbaa !5
br label %for.inner
for.inner:
%j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
; %sum starts from the loaded A[i] value rather than zero.
%sum = phi i32 [ %0, %for.outer ], [ %add, %for.inner ]
%arrayidx6 = getelementptr inbounds i32, i32* %B, i32 %j
%1 = load i32, i32* %arrayidx6, align 4, !tbaa !5
%add = add i32 %1, %sum
%inc = add nuw i32 %j, 1
%exitcond = icmp eq i32 %inc, %J
br i1 %exitcond, label %for.latch, label %for.inner
for.latch:
; AftBlock: store the accumulated sum back to A[i] (same address as the load).
%add.lcssa = phi i32 [ %add, %for.inner ]
store i32 %add.lcssa, i32* %arrayidx, align 4, !tbaa !5
%add9 = add nuw i32 %i, 1
%exitcond28 = icmp eq i32 %add9, %I
br i1 %exitcond28, label %for.end10.loopexit, label %for.outer
for.end10.loopexit:
br label %for.end10
for.end10:
ret void
}
; CHECK-LABEL: test3
; Tests Complete unroll-and-jam of the outer loop
; CHECK: for.outer:
; CHECK: br label %for.inner
; CHECK: for.inner:
; CHECK: %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
; CHECK: %sum = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
; CHECK: %j.1 = phi i32 [ 0, %for.outer ], [ %inc.1, %for.inner ]
; CHECK: %sum.1 = phi i32 [ 0, %for.outer ], [ %add.1, %for.inner ]
; CHECK: %j.2 = phi i32 [ 0, %for.outer ], [ %inc.2, %for.inner ]
; CHECK: %sum.2 = phi i32 [ 0, %for.outer ], [ %add.2, %for.inner ]
; CHECK: %j.3 = phi i32 [ 0, %for.outer ], [ %inc.3, %for.inner ]
; CHECK: %sum.3 = phi i32 [ 0, %for.outer ], [ %add.3, %for.inner ]
; CHECK: br i1 %exitcond.3, label %for.latch, label %for.inner
; CHECK: for.latch:
; CHECK: %add.lcssa = phi i32 [ %add, %for.inner ]
; CHECK: %add.lcssa.1 = phi i32 [ %add.1, %for.inner ]
; CHECK: %add.lcssa.2 = phi i32 [ %add.2, %for.inner ]
; CHECK: %add.lcssa.3 = phi i32 [ %add.3, %for.inner ]
; CHECK: br label %for.end
; CHECK: for.end:
; test3: outer trip count is the constant 4, so the outer loop can be
; completely unrolled-and-jammed (no remainder loop, per the CHECKs above).
define void @test3(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
entry:
; Only the inner trip count J is guarded; the outer count is fixed at 4.
%cmp = icmp eq i32 %J, 0
br i1 %cmp, label %for.end, label %for.preheader
for.preheader:
br label %for.outer
for.outer:
%i = phi i32 [ %add8, %for.latch ], [ 0, %for.preheader ]
br label %for.inner
for.inner:
%j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
%sum = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
%arrayidx = getelementptr inbounds i32, i32* %B, i32 %j
%0 = load i32, i32* %arrayidx, align 4, !tbaa !5
; Accumulate sum = (sum + 10) - B[j] each iteration.
%sub = add i32 %sum, 10
%add = sub i32 %sub, %0
%inc = add nuw i32 %j, 1
%exitcond = icmp eq i32 %inc, %J
br i1 %exitcond, label %for.latch, label %for.inner
for.latch:
%arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %i
store i32 %add, i32* %arrayidx6, align 4, !tbaa !5
%add8 = add nuw nsw i32 %i, 1
; Constant outer trip count of 4 enables complete unrolling.
%exitcond23 = icmp eq i32 %add8, 4
br i1 %exitcond23, label %for.end, label %for.outer
for.end:
ret void
}
; CHECK-LABEL: test4
; Tests Complete unroll-and-jam with a trip count of 1
; CHECK: for.outer:
; CHECK: br label %for.inner
; CHECK: for.inner:
; CHECK: %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
; CHECK: %sum = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
; CHECK: br i1 %exitcond, label %for.latch, label %for.inner
; CHECK: for.latch:
; CHECK: %add.lcssa = phi i32 [ %add, %for.inner ]
; CHECK: br label %for.end
; CHECK: for.end:
; test4: identical shape to test3 but with an outer trip count of 1 —
; complete unroll-and-jam degenerates to a single copy of the loop body.
define void @test4(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
entry:
%cmp = icmp eq i32 %J, 0
br i1 %cmp, label %for.end, label %for.preheader
for.preheader:
br label %for.outer
for.outer:
%i = phi i32 [ %add8, %for.latch ], [ 0, %for.preheader ]
br label %for.inner
for.inner:
%j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
%sum = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
%arrayidx = getelementptr inbounds i32, i32* %B, i32 %j
%0 = load i32, i32* %arrayidx, align 4, !tbaa !5
; sum = (sum + 10) - B[j], same reduction as test3.
%sub = add i32 %sum, 10
%add = sub i32 %sub, %0
%inc = add nuw i32 %j, 1
%exitcond = icmp eq i32 %inc, %J
br i1 %exitcond, label %for.latch, label %for.inner
for.latch:
%arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %i
store i32 %add, i32* %arrayidx6, align 4, !tbaa !5
%add8 = add nuw nsw i32 %i, 1
; Outer trip count is exactly 1.
%exitcond23 = icmp eq i32 %add8, 1
br i1 %exitcond23, label %for.end, label %for.outer
for.end:
ret void
}
; CHECK-LABEL: test5
; Multiple SubLoopBlocks
; CHECK: for.outer:
; CHECK: br label %for.inner
; CHECK: for.inner:
; CHECK: %inc8.sink15 = phi i32 [ 0, %for.outer ], [ %inc8, %for.inc.1 ]
; CHECK: %inc8.sink15.1 = phi i32 [ 0, %for.outer ], [ %inc8.1, %for.inc.1 ]
; CHECK: br label %for.inner2
; CHECK: for.inner2:
; CHECK: br i1 %tobool, label %for.cond4, label %for.inc
; CHECK: for.cond4:
; CHECK: br i1 %tobool.1, label %for.cond4a, label %for.inc
; CHECK: for.cond4a:
; CHECK: br label %for.inc
; CHECK: for.inc:
; CHECK: br i1 %tobool.11, label %for.cond4.1, label %for.inc.1
; CHECK: for.latch:
; CHECK: br label %for.end
; CHECK: for.end:
; CHECK: ret i32 0
; CHECK: for.cond4.1:
; CHECK: br i1 %tobool.1.1, label %for.cond4a.1, label %for.inc.1
; CHECK: for.cond4a.1:
; CHECK: br label %for.inc.1
; CHECK: for.inc.1:
; CHECK: br i1 %exitcond.1, label %for.latch, label %for.inner
@a = hidden global [1 x i32] zeroinitializer, align 4
; test5: the inner "loop" (for.inner..for.inc) has multiple basic blocks
; (a diamond through for.cond4/for.cond4a), exercising unroll-and-jam with
; multi-block SubLoopBlocks. Trip counts are constants (inner 3, outer 2 via
; the add/compare in for.latch).
define i32 @test5() #0 {
entry:
br label %for.outer
for.outer:
%.sink16 = phi i32 [ 0, %entry ], [ %add, %for.latch ]
br label %for.inner
for.inner:
%inc8.sink15 = phi i32 [ 0, %for.outer ], [ %inc8, %for.inc ]
br label %for.inner2
for.inner2:
; Load @a[0] and branch on it, giving the inner loop internal control flow.
%l1 = load i32, i32* getelementptr inbounds ([1 x i32], [1 x i32]* @a, i32 0, i32 0), align 4
%tobool = icmp eq i32 %l1, 0
br i1 %tobool, label %for.cond4, label %for.inc
for.cond4:
; NOTE(review): this GEP indexes element [1] of a 1-element array, i.e. a
; one-past-the-end address — presumably deliberate for the test; verify.
%l0 = load i32, i32* getelementptr inbounds ([1 x i32], [1 x i32]* @a, i32 1, i32 0), align 4
%tobool.1 = icmp eq i32 %l0, 0
br i1 %tobool.1, label %for.cond4a, label %for.inc
for.cond4a:
br label %for.inc
for.inc:
; %l2 records which of the three paths reached the latch block.
%l2 = phi i32 [ 0, %for.inner2 ], [ 1, %for.cond4 ], [ 2, %for.cond4a ]
%inc8 = add nuw nsw i32 %inc8.sink15, 1
%exitcond = icmp eq i32 %inc8, 3
br i1 %exitcond, label %for.latch, label %for.inner
for.latch:
%.lcssa = phi i32 [ %l2, %for.inc ]
; Outer accumulator: (sink16 & 255) + 4; exits when it reaches 8.
%conv11 = and i32 %.sink16, 255
%add = add nuw nsw i32 %conv11, 4
%cmp = icmp eq i32 %add, 8
br i1 %cmp, label %for.end, label %for.outer
for.end:
%.lcssa.lcssa = phi i32 [ %.lcssa, %for.latch ]
ret i32 0
}
; CHECK-LABEL: test6
; Test odd uses of phi nodes
; CHECK: for.outer:
; CHECK: br label %for.inner
; CHECK: for.inner:
; CHECK: br i1 %exitcond.3, label %for.inner, label %for.latch
; CHECK: for.latch:
; CHECK: br label %for.end
; CHECK: for.end:
; CHECK: ret i32 0
@f = hidden global i32 0, align 4
; test6: exercises odd phi-node uses — %p1 takes its initial value from the
; outer-loop phi %p0 (which itself is fed by a load of @f on the first outer
; iteration) and then latches to the constant 2.
define i32 @test6() #0 {
entry:
; Promoted load of global @f feeds the first outer iteration only.
%f.promoted10 = load i32, i32* @f, align 4, !tbaa !5
br label %for.outer
for.outer:
%p0 = phi i32 [ %f.promoted10, %entry ], [ 2, %for.latch ]
%inc5.sink9 = phi i32 [ 2, %entry ], [ %inc5, %for.latch ]
br label %for.inner
for.inner:
; %p1 is %p0 on the first inner iteration, 2 on all subsequent ones.
%p1 = phi i32 [ %p0, %for.outer ], [ 2, %for.inner ]
%inc.sink8 = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
%inc = add nuw nsw i32 %inc.sink8, 1
; Inner trip count is the constant 7 (note: "ne" compare, loop on true).
%exitcond = icmp ne i32 %inc, 7
br i1 %exitcond, label %for.inner, label %for.latch
for.latch:
%.lcssa = phi i32 [ %p1, %for.inner ]
%inc5 = add nuw nsw i32 %inc5.sink9, 1
%exitcond11 = icmp ne i32 %inc5, 7
br i1 %exitcond11, label %for.outer, label %for.end
for.end:
%.lcssa.lcssa = phi i32 [ %.lcssa, %for.latch ]
%inc.lcssa.lcssa = phi i32 [ 7, %for.latch ]
ret i32 0
}
; CHECK-LABEL: test7
; Has a positive dependency between two stores. Still valid.
; The negative dependency is in unroll-and-jam-disabled.ll
; CHECK: for.outer:
; CHECK: %i = phi i32 [ %add.3, %for.latch ], [ 0, %for.preheader.new ]
; CHECK: %niter = phi i32 [ %unroll_iter, %for.preheader.new ], [ %niter.nsub.3, %for.latch ]
; CHECK: br label %for.inner
; CHECK: for.latch:
; CHECK: %add9.lcssa = phi i32 [ %add9, %for.inner ]
; CHECK: %add9.lcssa.1 = phi i32 [ %add9.1, %for.inner ]
; CHECK: %add9.lcssa.2 = phi i32 [ %add9.2, %for.inner ]
; CHECK: %add9.lcssa.3 = phi i32 [ %add9.3, %for.inner ]
; CHECK: br i1 %niter.ncmp.3, label %for.end.loopexit.unr-lcssa.loopexit, label %for.outer
; CHECK: for.inner:
; CHECK: %sum = phi i32 [ 0, %for.outer ], [ %add9, %for.inner ]
; CHECK: %j = phi i32 [ 0, %for.outer ], [ %add10, %for.inner ]
; CHECK: %sum.1 = phi i32 [ 0, %for.outer ], [ %add9.1, %for.inner ]
; CHECK: %j.1 = phi i32 [ 0, %for.outer ], [ %add10.1, %for.inner ]
; CHECK: %sum.2 = phi i32 [ 0, %for.outer ], [ %add9.2, %for.inner ]
; CHECK: %j.2 = phi i32 [ 0, %for.outer ], [ %add10.2, %for.inner ]
; CHECK: %sum.3 = phi i32 [ 0, %for.outer ], [ %add9.3, %for.inner ]
; CHECK: %j.3 = phi i32 [ 0, %for.outer ], [ %add10.3, %for.inner ]
; CHECK: br i1 %exitcond.3, label %for.latch, label %for.inner
; CHECK: for.end.loopexit.unr-lcssa.loopexit:
; test7: the ForeBlock stores A[i]=0 and A[i+1]=2, and the AftBlock (for.latch)
; stores the inner-loop sum back to A[i]. The store-to-store dependency is in
; the positive (forward) direction, so unroll-and-jam remains valid; the
; negative-direction variant lives in unroll-and-jam-disabled.ll.
define void @test7(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
entry:
%cmp = icmp ne i32 %J, 0
%cmp128 = icmp ne i32 %I, 0
%or.cond = and i1 %cmp128, %cmp
br i1 %or.cond, label %for.preheader, label %for.end
for.preheader:
br label %for.outer
for.outer:
%i = phi i32 [ %add, %for.latch ], [ 0, %for.preheader ]
; ForeBlock: A[i] = 0; A[i+1] = 2 (the next iteration's A[i]).
%arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
store i32 0, i32* %arrayidx, align 4, !tbaa !5
%add = add nuw i32 %i, 1
%arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %add
store i32 2, i32* %arrayidx2, align 4, !tbaa !5
br label %for.inner
for.latch:
; AftBlock: overwrite A[i] with the inner-loop reduction result.
store i32 %add9, i32* %arrayidx, align 4, !tbaa !5
%exitcond30 = icmp eq i32 %add, %I
br i1 %exitcond30, label %for.end, label %for.outer
for.inner:
%sum = phi i32 [ 0, %for.outer ], [ %add9, %for.inner ]
%j = phi i32 [ 0, %for.outer ], [ %add10, %for.inner ]
%arrayidx7 = getelementptr inbounds i32, i32* %B, i32 %j
%l1 = load i32, i32* %arrayidx7, align 4, !tbaa !5
%add9 = add i32 %l1, %sum
%add10 = add nuw i32 %j, 1
%exitcond = icmp eq i32 %add10, %J
br i1 %exitcond, label %for.latch, label %for.inner
for.end:
ret void
}
; CHECK-LABEL: test8
; Same as test7 with an extra outer loop nest
; CHECK: for.outest:
; CHECK: br label %for.outer
; CHECK: for.outer:
; CHECK: %i = phi i32 [ %add.3, %for.latch ], [ 0, %for.outest.new ]
; CHECK: %niter = phi i32 [ %unroll_iter, %for.outest.new ], [ %niter.nsub.3, %for.latch ]
; CHECK: br label %for.inner
; CHECK: for.inner:
; CHECK: %sum = phi i32 [ 0, %for.outer ], [ %add9, %for.inner ]
; CHECK: %j = phi i32 [ 0, %for.outer ], [ %add10, %for.inner ]
; CHECK: %sum.1 = phi i32 [ 0, %for.outer ], [ %add9.1, %for.inner ]
; CHECK: %j.1 = phi i32 [ 0, %for.outer ], [ %add10.1, %for.inner ]
; CHECK: %sum.2 = phi i32 [ 0, %for.outer ], [ %add9.2, %for.inner ]
; CHECK: %j.2 = phi i32 [ 0, %for.outer ], [ %add10.2, %for.inner ]
; CHECK: %sum.3 = phi i32 [ 0, %for.outer ], [ %add9.3, %for.inner ]
; CHECK: %j.3 = phi i32 [ 0, %for.outer ], [ %add10.3, %for.inner ]
; CHECK: br i1 %exitcond.3, label %for.latch, label %for.inner
; CHECK: for.latch:
; CHECK: %add9.lcssa = phi i32 [ %add9, %for.inner ]
; CHECK: %add9.lcssa.1 = phi i32 [ %add9.1, %for.inner ]
; CHECK: %add9.lcssa.2 = phi i32 [ %add9.2, %for.inner ]
; CHECK: %add9.lcssa.3 = phi i32 [ %add9.3, %for.inner ]
; CHECK: br i1 %niter.ncmp.3, label %for.cleanup.unr-lcssa.loopexit, label %for.outer
; CHECK: for.cleanup.epilog-lcssa:
; CHECK: br label %for.cleanup
; CHECK: for.cleanup:
; CHECK: br i1 %exitcond41, label %for.end.loopexit, label %for.outest
; CHECK: for.end.loopexit:
; CHECK: br label %for.end
; test8: same store/load pattern as test7 but wrapped in an additional
; outermost loop (for.outest, x = 0..4); unroll-and-jam should still apply
; to the middle (for.outer) / inner (for.inner) pair.
define void @test8(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
entry:
; Skip everything if either trip count is zero (note the "or" of eq-0 tests).
%cmp = icmp eq i32 %J, 0
%cmp336 = icmp eq i32 %I, 0
%or.cond = or i1 %cmp, %cmp336
br i1 %or.cond, label %for.end, label %for.preheader
for.preheader:
br label %for.outest
for.outest:
; Outermost counter x runs 5 iterations (exit when %inc == 5).
%x.038 = phi i32 [ %inc, %for.cleanup ], [ 0, %for.preheader ]
br label %for.outer
for.outer:
%i = phi i32 [ %add, %for.latch ], [ 0, %for.outest ]
; ForeBlock: A[i] = 0; A[i+1] = 2, as in test7.
%arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
store i32 0, i32* %arrayidx, align 4, !tbaa !5
%add = add nuw i32 %i, 1
%arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %add
store i32 2, i32* %arrayidx6, align 4, !tbaa !5
br label %for.inner
for.inner:
%sum = phi i32 [ 0, %for.outer ], [ %add9, %for.inner ]
%j = phi i32 [ 0, %for.outer ], [ %add10, %for.inner ]
%arrayidx11 = getelementptr inbounds i32, i32* %B, i32 %j
%l1 = load i32, i32* %arrayidx11, align 4, !tbaa !5
%add9 = add i32 %l1, %sum
%add10 = add nuw i32 %j, 1
%exitcond = icmp eq i32 %add10, %J
br i1 %exitcond, label %for.latch, label %for.inner
for.latch:
; AftBlock: A[i] = sum of B[0..J).
store i32 %add9, i32* %arrayidx, align 4, !tbaa !5
%exitcond39 = icmp eq i32 %add, %I
br i1 %exitcond39, label %for.cleanup, label %for.outer
for.cleanup:
%inc = add nuw nsw i32 %x.038, 1
%exitcond41 = icmp eq i32 %inc, 5
br i1 %exitcond41, label %for.end, label %for.outest
for.end:
ret void
}
; CHECK-LABEL: test9
; Same as test1 with tbaa, not noalias
; CHECK: for.outer:
; CHECK: %i = phi i32 [ %add8.3, %for.latch ], [ 0, %for.outer.preheader.new ]
; CHECK: %niter = phi i32 [ %unroll_iter, %for.outer.preheader.new ], [ %niter.nsub.3, %for.latch ]
; CHECK: br label %for.inner
; CHECK: for.inner:
; CHECK: %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
; CHECK: %sum = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
; CHECK: %j.1 = phi i32 [ 0, %for.outer ], [ %inc.1, %for.inner ]
; CHECK: %sum.1 = phi i32 [ 0, %for.outer ], [ %add.1, %for.inner ]
; CHECK: %j.2 = phi i32 [ 0, %for.outer ], [ %inc.2, %for.inner ]
; CHECK: %sum.2 = phi i32 [ 0, %for.outer ], [ %add.2, %for.inner ]
; CHECK: %j.3 = phi i32 [ 0, %for.outer ], [ %inc.3, %for.inner ]
; CHECK: %sum.3 = phi i32 [ 0, %for.outer ], [ %add.3, %for.inner ]
; CHECK: br i1 %exitcond.3, label %for.latch, label %for.inner
; CHECK: for.latch:
; CHECK: %add.lcssa = phi i32 [ %add, %for.inner ]
; CHECK: %add.lcssa.1 = phi i32 [ %add.1, %for.inner ]
; CHECK: %add.lcssa.2 = phi i32 [ %add.2, %for.inner ]
; CHECK: %add.lcssa.3 = phi i32 [ %add.3, %for.inner ]
; CHECK: br i1 %niter.ncmp.3, label %for.end.loopexit.unr-lcssa.loopexit, label %for.outer
; CHECK: for.end.loopexit.unr-lcssa.loopexit:
; test9: same structure as test1, but the pointers are not marked noalias;
; instead disambiguation relies on TBAA — B is i16 with tag !9 ("short")
; and A is i32 with tag !5 ("int"), which cannot alias.
define void @test9(i32 %I, i32 %J, i32* nocapture %A, i16* nocapture readonly %B) #0 {
entry:
%cmp = icmp ne i32 %J, 0
%cmpJ = icmp ne i32 %I, 0
%or.cond = and i1 %cmp, %cmpJ
br i1 %or.cond, label %for.outer.preheader, label %for.end
for.outer.preheader:
br label %for.outer
for.outer:
%i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ]
br label %for.inner
for.inner:
%j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
%sum = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
; Load i16 B[j] with the "short" TBAA tag, sign-extend, and accumulate.
%arrayidx = getelementptr inbounds i16, i16* %B, i32 %j
%0 = load i16, i16* %arrayidx, align 4, !tbaa !9
%sext = sext i16 %0 to i32
%add = add i32 %sext, %sum
%inc = add nuw i32 %j, 1
%exitcond = icmp eq i32 %inc, %J
br i1 %exitcond, label %for.latch, label %for.inner
for.latch:
%add.lcssa = phi i32 [ %add, %for.inner ]
; Store to A[i] carries the "int" TBAA tag, distinct from B's "short".
%arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %i
store i32 %add.lcssa, i32* %arrayidx6, align 4, !tbaa !5
%add8 = add nuw i32 %i, 1
%exitcond25 = icmp eq i32 %add8, %I
br i1 %exitcond25, label %for.end.loopexit, label %for.outer
for.end.loopexit:
br label %for.end
for.end:
ret void
}
; CHECK-LABEL: test10
; Be careful not to incorrectly update the exit phi nodes
; CHECK: %dec.lcssa.lcssa.ph.ph = phi i64 [ 0, %for.inc24 ]
%struct.a = type { i64 }
@g = common global %struct.a zeroinitializer, align 8
@c = common global [1 x i8] zeroinitializer, align 1
; test10: regression test that the pass does not incorrectly rewrite the exit
; phi nodes of %for.end26 (see the CHECK on %dec.lcssa.lcssa.ph.ph above).
; The inner loop body is a split diamond whose arms feed constant phis.
define signext i16 @test10(i32 %k) #0 {
entry:
; Two invariant conditions, computed once, steer the inner diamond.
%0 = load i8, i8* getelementptr inbounds ([1 x i8], [1 x i8]* @c, i64 0, i64 0), align 1
%tobool9 = icmp eq i8 %0, 0
%tobool13 = icmp ne i32 %k, 0
br label %for.body
for.body:
; Outer loop: 5 iterations (exit when %inc25 == 5).
%storemerge82 = phi i64 [ 0, %entry ], [ %inc25, %for.inc24 ]
br label %for.body2
for.body2:
; Inner loop counts %storemerge down from 4 to 0.
%storemerge = phi i64 [ 4, %for.body ], [ %dec, %for.inc21 ]
br i1 %tobool9, label %for.body2.split, label %for.body2.split2
for.body2.split2:
br i1 %tobool13, label %for.inc21, label %for.inc21.if
for.body2.split:
br i1 %tobool13, label %for.inc21, label %for.inc21.then
for.inc21.if:
%storemerge.1 = phi i64 [ 0, %for.body2.split2 ]
br label %for.inc21
for.inc21.then:
%storemerge.2 = phi i64 [ 0, %for.body2.split ]
%storemerge.3 = phi i32 [ 0, %for.body2.split ]
br label %for.inc21
for.inc21:
; Merge of four paths; values are 0 or 4 depending on the path taken.
%storemerge.4 = phi i64 [ %storemerge.1, %for.inc21.if ], [ %storemerge.2, %for.inc21.then ], [ 4, %for.body2.split2 ], [ 4, %for.body2.split ]
%storemerge.5 = phi i32 [ 0, %for.inc21.if ], [ %storemerge.3, %for.inc21.then ], [ 0, %for.body2.split2 ], [ 0, %for.body2.split ]
%dec = add nsw i64 %storemerge, -1
%tobool = icmp eq i64 %dec, 0
br i1 %tobool, label %for.inc24, label %for.body2
for.inc24:
%storemerge.4.lcssa = phi i64 [ %storemerge.4, %for.inc21 ]
%storemerge.5.lcssa = phi i32 [ %storemerge.5, %for.inc21 ]
%inc25 = add nuw nsw i64 %storemerge82, 1
%exitcond = icmp ne i64 %inc25, 5
br i1 %exitcond, label %for.body, label %for.end26
for.end26:
; Exit phis, including the constant-0 one the pass must preserve correctly.
%dec.lcssa.lcssa = phi i64 [ 0, %for.inc24 ]
%storemerge.4.lcssa.lcssa = phi i64 [ %storemerge.4.lcssa, %for.inc24 ]
%storemerge.5.lcssa.lcssa = phi i32 [ %storemerge.5.lcssa, %for.inc24 ]
store i64 %dec.lcssa.lcssa, i64* getelementptr inbounds (%struct.a, %struct.a* @g, i64 0, i32 0), align 8
ret i16 0
}
!5 = !{!6, !6, i64 0}
!6 = !{!"int", !7, i64 0}
!7 = !{!"omnipotent char", !8, i64 0}
!8 = !{!"Simple C/C++ TBAA"}
!9 = !{!10, !10, i64 0}
!10 = !{!"short", !7, i64 0}