Extend loop unrolling and unroll-jamming to non-matching bound operands and

multi-result upper bounds, complete TODOs, fix/improve test cases.

- complete TODOs for loop unroll/unroll-and-jam. Something as simple as
  "for %i = 0 to %N" wasn't being unrolled earlier (unless it had been written
  as "for %i = ()[s0] -> (0)()[%N] to %N"); addressed now.

- update/replace getTripCountExpr with buildTripCountMapAndOperands; the new
  utility is more powerful since it composes the loop bound maps and their
  operands into the trip count map

- getCleanupLoopLowerBound and getUnrolledLoopUpperBound actually needed the
  same code; refactor and remove the latter.

- reorganize test cases, write previous ones better; most of these changes are
  "label replacements".

- fix wrongly labeled test cases in unroll-jam.mlir

PiperOrigin-RevId: 238014653
This commit is contained in:
Uday Bondhugula 2019-03-12 08:00:52 -07:00 committed by jpienaar
parent 9abea4a466
commit 075090f891
7 changed files with 528 additions and 413 deletions

View File

@ -36,10 +36,17 @@ class Instruction;
class MemRefType;
class Value;
/// Returns the trip count of the loop as an affine expression if the latter is
/// expressible as an affine expression, and nullptr otherwise. The trip count
/// expression is simplified before returning.
AffineExpr getTripCountExpr(ConstOpPointer<AffineForOp> forOp);
/// Returns the trip count of the loop as an affine map with its corresponding
/// operands if the latter is expressible as an affine expression, and a null
/// map otherwise. This method always succeeds as long as the lower bound is not
/// a multi-result map. The trip count expression is simplified before returning.
/// This method only utilizes map composition to construct lower and upper
/// bounds before computing the trip count expressions.
// TODO(mlir-team): this should be moved into 'Transforms/' and be replaced by a
// pure analysis method relying on FlatAffineConstraints
void buildTripCountMapAndOperands(ConstOpPointer<AffineForOp> forOp,
AffineMap *map,
SmallVectorImpl<Value *> *operands);
/// Returns the trip count of the loop if it's a constant, None otherwise. This
/// uses affine expression analysis and is able to determine constant trip count

View File

@ -34,6 +34,7 @@ template <typename T> class ConstOpPointer;
class Function;
class FuncBuilder;
template <typename T> class OpPointer;
class Value;
/// Unrolls this for instruction completely if the trip count is known to be
/// constant. Returns failure otherwise.
@ -66,16 +67,15 @@ LogicalResult promoteIfSingleIteration(OpPointer<AffineForOp> forOp);
/// their body into the containing Block.
void promoteSingleIterationLoops(Function *f);
/// Returns the lower bound of the cleanup loop when unrolling a loop
/// with the specified unroll factor.
AffineMap getCleanupLoopLowerBound(ConstOpPointer<AffineForOp> forOp,
unsigned unrollFactor, FuncBuilder *builder);
/// Returns the upper bound of an unrolled loop when unrolling with
/// the specified trip count, stride, and unroll factor.
AffineMap getUnrolledLoopUpperBound(ConstOpPointer<AffineForOp> forOp,
unsigned unrollFactor,
FuncBuilder *builder);
/// Computes the cleanup loop lower bound of the loop being unrolled with
/// the specified unroll factor; this bound will also be upper bound of the main
/// part of the unrolled loop. Computes the bound as an AffineMap with its
/// operands or a null map when the trip count can't be expressed as an affine
/// expression.
void getCleanupLoopLowerBound(ConstOpPointer<AffineForOp> forOp,
unsigned unrollFactor, AffineMap *map,
SmallVectorImpl<Value *> *operands,
FuncBuilder *builder);
/// Skew the instructions in the body of a 'for' instruction with the specified
/// instruction-wise shifts. The shifts are with respect to the original

View File

@ -26,6 +26,7 @@
#include "mlir/Analysis/AffineStructures.h"
#include "mlir/Analysis/NestedMatcher.h"
#include "mlir/Analysis/VectorAnalysis.h"
#include "mlir/IR/AffineMap.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/Instruction.h"
#include "mlir/StandardOps/Ops.h"
@ -41,88 +42,141 @@ using namespace mlir;
/// Returns the trip count of the loop as an affine expression if the latter is
/// expressible as an affine expression, and nullptr otherwise. The trip count
/// expression is simplified before returning.
AffineExpr mlir::getTripCountExpr(ConstOpPointer<AffineForOp> forOp) {
// upper_bound - lower_bound
/// expression is simplified before returning. This method only utilizes map
/// composition to construct lower and upper bounds before computing the trip
/// count expressions.
// TODO(mlir-team): this should be moved into 'Transforms/' and be replaced by a
// pure analysis method relying on FlatAffineConstraints; the latter will also
// be more powerful (since both inequalities and equalities will be considered).
void mlir::buildTripCountMapAndOperands(
ConstOpPointer<AffineForOp> forOp, AffineMap *map,
SmallVectorImpl<Value *> *tripCountOperands) {
int64_t loopSpan;
int64_t step = forOp->getStep();
auto *context = forOp->getInstruction()->getContext();
// We need to get operands; we aren't changing them here.
auto ncForOp = *reinterpret_cast<OpPointer<AffineForOp> *>(&forOp);
FuncBuilder b(ncForOp->getInstruction());
if (forOp->hasConstantBounds()) {
int64_t lb = forOp->getConstantLowerBound();
int64_t ub = forOp->getConstantUpperBound();
loopSpan = ub - lb;
} else {
auto lbMap = forOp->getLowerBoundMap();
auto ubMap = forOp->getUpperBoundMap();
// TODO(bondhugula): handle max/min of multiple expressions.
if (lbMap.getNumResults() != 1 || ubMap.getNumResults() != 1)
return nullptr;
// TODO(bondhugula): handle bounds with different operands.
// Bounds have different operands, unhandled for now.
if (!forOp->matchingBoundOperandList())
return nullptr;
// ub_expr - lb_expr
AffineExpr lbExpr(lbMap.getResult(0));
AffineExpr ubExpr(ubMap.getResult(0));
auto loopSpanExpr = simplifyAffineExpr(
ubExpr - lbExpr, std::max(lbMap.getNumDims(), ubMap.getNumDims()),
std::max(lbMap.getNumSymbols(), ubMap.getNumSymbols()));
auto cExpr = loopSpanExpr.dyn_cast<AffineConstantExpr>();
if (!cExpr)
return loopSpanExpr.ceilDiv(step);
loopSpan = cExpr.getValue();
if (loopSpan < 0)
loopSpan = 0;
*map = b.getConstantAffineMap(ceilDiv(loopSpan, step));
tripCountOperands->clear();
return;
}
auto lbMap = forOp->getLowerBoundMap();
auto ubMap = forOp->getUpperBoundMap();
if (lbMap.getNumResults() != 1) {
*map = AffineMap();
return;
}
SmallVector<Value *, 4> lbOperands(ncForOp->getLowerBoundOperands());
SmallVector<Value *, 4> ubOperands(ncForOp->getUpperBoundOperands());
auto lb = b.create<AffineApplyOp>(forOp->getLoc(), lbMap, lbOperands);
SmallVector<Value *, 4> ubs;
ubs.reserve(ubMap.getNumResults());
for (auto ubExpr : ubMap.getResults())
ubs.push_back(b.create<AffineApplyOp>(
forOp->getLoc(),
b.getAffineMap(ubMap.getNumDims(), ubMap.getNumSymbols(), {ubExpr}, {}),
ubOperands));
// 0 iteration loops.
if (loopSpan < 0)
return 0;
tripCountOperands->clear();
tripCountOperands->reserve(1 + ubs.size());
tripCountOperands->push_back(lb);
tripCountOperands->append(ubs.begin(), ubs.end());
return getAffineConstantExpr(static_cast<uint64_t>(ceilDiv(loopSpan, step)),
context);
SmallVector<AffineExpr, 4> tripCountExprs(ubs.size());
for (unsigned i = 0, e = ubs.size(); i < e; i++)
tripCountExprs[i] =
(b.getAffineDimExpr(1 + i) - b.getAffineDimExpr(0)).ceilDiv(step);
*map = b.getAffineMap(1 + ubs.size(), 0, tripCountExprs, {});
forOp->getInstruction()->getFunction()->dump();
fullyComposeAffineMapAndOperands(map, tripCountOperands);
*map = simplifyAffineMap(*map);
canonicalizeMapAndOperands(map, tripCountOperands);
// Remove any affine.apply's that became dead as a result of composition,
// simplification, and canonicalization above.
for (auto *v : ubs)
if (v->use_empty())
v->getDefiningInst()->erase();
if (lb->use_empty())
lb->erase();
}
/// Returns the trip count of the loop if it's a constant, None otherwise. This
/// method uses affine expression analysis (in turn using getTripCount) and is
/// able to determine constant trip count in non-trivial cases.
// FIXME(mlir-team): this is really relying on buildTripCountMapAndOperands;
// being an analysis utility, it shouldn't. Replace with a version that just
// works with analysis structures (FlatAffineConstraints) and thus doesn't
// update the IR.
llvm::Optional<uint64_t>
mlir::getConstantTripCount(ConstOpPointer<AffineForOp> forOp) {
auto tripCountExpr = getTripCountExpr(forOp);
SmallVector<Value *, 4> operands;
AffineMap map;
buildTripCountMapAndOperands(forOp, &map, &operands);
if (!tripCountExpr)
if (!map)
return None;
if (auto constExpr = tripCountExpr.dyn_cast<AffineConstantExpr>())
return constExpr.getValue();
return None;
// Take the min if all trip counts are constant.
Optional<uint64_t> tripCount;
for (auto resultExpr : map.getResults()) {
if (auto constExpr = resultExpr.dyn_cast<AffineConstantExpr>()) {
if (tripCount.hasValue())
tripCount = std::min(tripCount.getValue(),
static_cast<uint64_t>(constExpr.getValue()));
else
tripCount = constExpr.getValue();
} else
return None;
}
return tripCount;
}
/// Returns the greatest known integral divisor of the trip count. Affine
/// expression analysis is used (indirectly through getTripCount), and
/// this method is thus able to determine non-trivial divisors.
uint64_t mlir::getLargestDivisorOfTripCount(ConstOpPointer<AffineForOp> forOp) {
auto tripCountExpr = getTripCountExpr(forOp);
SmallVector<Value *, 4> operands;
AffineMap map;
buildTripCountMapAndOperands(forOp, &map, &operands);
if (!tripCountExpr)
if (!map)
return 1;
if (auto constExpr = tripCountExpr.dyn_cast<AffineConstantExpr>()) {
uint64_t tripCount = constExpr.getValue();
// 0 iteration loops (greatest divisor is 2^64 - 1).
if (tripCount == 0)
return ULONG_MAX;
// The greatest divisor is the trip count.
return tripCount;
// The largest divisor of the trip count is the GCD of the individual largest
// divisors.
assert(map.getNumResults() >= 1 && "expected one or more results");
Optional<uint64_t> gcd;
for (auto resultExpr : map.getResults()) {
uint64_t thisGcd;
if (auto constExpr = resultExpr.dyn_cast<AffineConstantExpr>()) {
uint64_t tripCount = constExpr.getValue();
// 0 iteration loops (greatest divisor is 2^64 - 1).
if (tripCount == 0)
thisGcd = std::numeric_limits<uint64_t>::max();
else
// The greatest divisor is the trip count.
thisGcd = tripCount;
} else {
// Trip count is not a known constant; return its largest known divisor.
thisGcd = resultExpr.getLargestKnownDivisor();
}
if (gcd.hasValue())
gcd = llvm::GreatestCommonDivisor64(gcd.getValue(), thisGcd);
else
gcd = thisGcd;
}
// Trip count is not a known constant; return its largest known divisor.
return tripCountExpr.getLargestKnownDivisor();
assert(gcd.hasValue() && "value expected per above logic");
return gcd.getValue();
}
bool mlir::isAccessInvariant(const Value &iv, const Value &index) {

View File

@ -152,32 +152,23 @@ LogicalResult mlir::loopUnrollJamByFactor(OpPointer<AffineForOp> forOp,
assert(unrollJamFactor >= 1 && "unroll jam factor should be >= 1");
if (unrollJamFactor == 1 || forOp->getBody()->empty())
if (unrollJamFactor == 1)
return promoteIfSingleIteration(forOp);
if (forOp->getBody()->empty())
return failure();
// Loops where both lower and upper bounds are multi-result maps won't be
// unrolled (since the trip can't be expressed as an affine function in
// general).
// TODO(mlir-team): this may not be common, but we could support the case
// where the lower bound is a multi-result map and the ub is a single result
// one.
if (forOp->getLowerBoundMap().getNumResults() != 1)
return failure();
Optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
if (!mayBeConstantTripCount.hasValue() &&
getLargestDivisorOfTripCount(forOp) % unrollJamFactor != 0)
return failure();
auto lbMap = forOp->getLowerBoundMap();
auto ubMap = forOp->getUpperBoundMap();
// Loops with max/min expressions won't be unrolled here (the output can't be
// expressed as a Function in the general case). However, the right way to
// do such unrolling for a Function would be to specialize the loop for the
// 'hotspot' case and unroll that hotspot.
if (lbMap.getNumResults() != 1 || ubMap.getNumResults() != 1)
return failure();
// Same operand list for lower and upper bound for now.
// TODO(bondhugula): handle bounds with different sets of operands.
if (!forOp->matchingBoundOperandList())
return failure();
// If the trip count is lower than the unroll jam factor, no unroll jam.
// TODO(bondhugula): option to specify cleanup loop unrolling.
if (mayBeConstantTripCount.hasValue() &&
mayBeConstantTripCount.getValue() < unrollJamFactor)
return failure();
@ -191,21 +182,25 @@ LogicalResult mlir::loopUnrollJamByFactor(OpPointer<AffineForOp> forOp,
// Generate the cleanup loop if trip count isn't a multiple of
// unrollJamFactor.
if (mayBeConstantTripCount.hasValue() &&
mayBeConstantTripCount.getValue() % unrollJamFactor != 0) {
if (getLargestDivisorOfTripCount(forOp) % unrollJamFactor != 0) {
// Insert the cleanup loop right after 'forOp'.
FuncBuilder builder(forInst->getBlock(),
std::next(Block::iterator(forInst)));
auto cleanupAffineForOp = builder.clone(*forInst)->cast<AffineForOp>();
cleanupAffineForOp->setLowerBoundMap(
getCleanupLoopLowerBound(forOp, unrollJamFactor, &builder));
// Adjust the lower bound of the cleanup loop; its upper bound is the same
// as the original loop's upper bound.
AffineMap cleanupMap;
SmallVector<Value *, 4> cleanupOperands;
getCleanupLoopLowerBound(forOp, unrollJamFactor, &cleanupMap,
&cleanupOperands, &builder);
cleanupAffineForOp->setLowerBound(cleanupOperands, cleanupMap);
// The upper bound needs to be adjusted.
forOp->setUpperBoundMap(
getUnrolledLoopUpperBound(forOp, unrollJamFactor, &builder));
// Promote the loop body up if this has turned into a single iteration loop.
// Promote the cleanup loop if it has turned into a single iteration loop.
promoteIfSingleIteration(cleanupAffineForOp);
// Adjust the upper bound of the original loop - it will be the same as the
// cleanup loop's lower bound. Its lower bound remains unchanged.
forOp->setUpperBound(cleanupOperands, cleanupMap);
}
// Scale the step of loop being unroll-jammed by the unroll-jam factor.

View File

@ -38,54 +38,78 @@
using namespace mlir;
/// Returns the upper bound of an unrolled loop with lower bound 'lb' and with
/// the specified trip count, stride, and unroll factor. Returns nullptr when
/// the trip count can't be expressed as an affine expression.
AffineMap mlir::getUnrolledLoopUpperBound(ConstOpPointer<AffineForOp> forOp,
unsigned unrollFactor,
FuncBuilder *builder) {
/// Computes the cleanup loop lower bound of the loop being unrolled with
/// the specified unroll factor; this bound will also be upper bound of the main
/// part of the unrolled loop. Computes the bound as an AffineMap with its
/// operands or a null map when the trip count can't be expressed as an affine
/// expression.
void mlir::getCleanupLoopLowerBound(ConstOpPointer<AffineForOp> forOp,
unsigned unrollFactor, AffineMap *map,
SmallVectorImpl<Value *> *operands,
FuncBuilder *b) {
auto lbMap = forOp->getLowerBoundMap();
// Single result lower bound map only.
if (lbMap.getNumResults() != 1)
return AffineMap();
if (lbMap.getNumResults() != 1) {
*map = AffineMap();
return;
}
// Sometimes, the trip count cannot be expressed as an affine expression.
auto tripCount = getTripCountExpr(forOp);
if (!tripCount)
return AffineMap();
AffineExpr lb(lbMap.getResult(0));
unsigned step = forOp->getStep();
auto newUb = lb + (tripCount - tripCount % unrollFactor - 1) * step;
return builder->getAffineMap(lbMap.getNumDims(), lbMap.getNumSymbols(),
{newUb}, {});
}
/// Returns the lower bound of the cleanup loop when unrolling a loop with lower
/// bound 'lb' and with the specified trip count, stride, and unroll factor.
/// Returns an AffinMap with nullptr storage (that evaluates to false)
/// when the trip count can't be expressed as an affine expression.
AffineMap mlir::getCleanupLoopLowerBound(ConstOpPointer<AffineForOp> forOp,
unsigned unrollFactor,
FuncBuilder *builder) {
auto lbMap = forOp->getLowerBoundMap();
// Single result lower bound map only.
if (lbMap.getNumResults() != 1)
return AffineMap();
AffineMap tripCountMap;
SmallVector<Value *, 4> tripCountOperands;
buildTripCountMapAndOperands(forOp, &tripCountMap, &tripCountOperands);
// Sometimes the trip count cannot be expressed as an affine expression.
AffineExpr tripCount(getTripCountExpr(forOp));
if (!tripCount)
return AffineMap();
if (!tripCountMap) {
*map = AffineMap();
return;
}
AffineExpr lb(lbMap.getResult(0));
unsigned step = forOp->getStep();
auto newLb = lb + (tripCount - tripCount % unrollFactor) * step;
return builder->getAffineMap(lbMap.getNumDims(), lbMap.getNumSymbols(),
{newLb}, {});
// We need to get non-const operands; we aren't changing them here.
auto ncForOp = *reinterpret_cast<OpPointer<AffineForOp> *>(&forOp);
SmallVector<Value *, 4> lbOperands(ncForOp->getLowerBoundOperands());
auto lb = b->create<AffineApplyOp>(ncForOp->getLoc(), lbMap, lbOperands);
// For each upper bound expr, get the range.
// Eg: for %i = lb to min (ub1, ub2),
// where tripCountExprs yield (tr1, tr2), we create affine.apply's:
// lb + tr1 - tr1 % ufactor, lb + tr2 - tr2 % ufactor; the results of all
// these affine.apply's make up the cleanup loop lower bound.
SmallVector<AffineExpr, 4> bumpExprs(tripCountMap.getNumResults());
SmallVector<Value *, 4> bumpValues(tripCountMap.getNumResults());
for (unsigned i = 0, e = tripCountMap.getNumResults(); i < e; i++) {
auto tripCountExpr = tripCountMap.getResult(i);
bumpExprs[i] = (tripCountExpr - tripCountExpr % unrollFactor) * step;
auto bumpMap =
b->getAffineMap(tripCountMap.getNumDims(), tripCountMap.getNumSymbols(),
bumpExprs[i], {});
bumpValues[i] =
b->create<AffineApplyOp>(forOp->getLoc(), bumpMap, tripCountOperands);
}
SmallVector<AffineExpr, 4> newUbExprs(tripCountMap.getNumResults());
for (unsigned i = 0, e = bumpExprs.size(); i < e; i++)
newUbExprs[i] = b->getAffineDimExpr(0) + b->getAffineDimExpr(i + 1);
operands->clear();
operands->push_back(lb);
operands->append(bumpValues.begin(), bumpValues.end());
*map = b->getAffineMap(1 + tripCountMap.getNumResults(), 0, newUbExprs, {});
// Simplify the map + operands.
fullyComposeAffineMapAndOperands(map, operands);
*map = simplifyAffineMap(*map);
canonicalizeMapAndOperands(map, operands);
// Remove any affine.apply's that became dead from the simplification above.
for (auto *v : bumpValues) {
if (v->use_empty()) {
v->getDefiningInst()->erase();
}
}
if (lb->use_empty())
lb->erase();
}
/// Promotes the loop body of a forOp to its containing block if the forOp
@ -369,25 +393,17 @@ LogicalResult mlir::loopUnrollByFactor(OpPointer<AffineForOp> forOp,
if (forOp->getBody()->empty())
return failure();
auto lbMap = forOp->getLowerBoundMap();
auto ubMap = forOp->getUpperBoundMap();
// Loops with max/min expressions won't be unrolled here (the output can't be
// expressed as a Function in the general case). However, the right way to
// do such unrolling for a Function would be to specialize the loop for the
// 'hotspot' case and unroll that hotspot.
if (lbMap.getNumResults() != 1 || ubMap.getNumResults() != 1)
// Loops where the lower bound is a max expression aren't supported for
// unrolling since the trip count can't be expressed as an affine function when
// both the lower bound and the upper bound are multi-result maps. However,
// one meaningful way to do such unrolling would be to specialize the loop for
// the 'hotspot' case and unroll that hotspot.
if (forOp->getLowerBoundMap().getNumResults() != 1)
return failure();
// Same operand list for lower and upper bound for now.
// TODO(bondhugula): handle bounds with different operand lists.
if (!forOp->matchingBoundOperandList())
return failure();
Optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
// If the trip count is lower than the unroll factor, no unrolled body.
// TODO(bondhugula): option to specify cleanup loop unrolling.
Optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
if (mayBeConstantTripCount.hasValue() &&
mayBeConstantTripCount.getValue() < unrollFactor)
return failure();
@ -397,21 +413,20 @@ LogicalResult mlir::loopUnrollByFactor(OpPointer<AffineForOp> forOp,
if (getLargestDivisorOfTripCount(forOp) % unrollFactor != 0) {
FuncBuilder builder(forInst->getBlock(), ++Block::iterator(forInst));
auto cleanupForInst = builder.clone(*forInst)->cast<AffineForOp>();
auto clLbMap = getCleanupLoopLowerBound(forOp, unrollFactor, &builder);
assert(clLbMap &&
"cleanup loop lower bound map for single result bound maps can "
"always be determined");
cleanupForInst->setLowerBoundMap(clLbMap);
AffineMap cleanupMap;
SmallVector<Value *, 4> cleanupOperands;
getCleanupLoopLowerBound(forOp, unrollFactor, &cleanupMap, &cleanupOperands,
&builder);
assert(cleanupMap &&
"cleanup loop lower bound map for single result lower bound maps "
"can always be determined");
cleanupForInst->setLowerBound(cleanupOperands, cleanupMap);
// Promote the loop body up if this has turned into a single iteration loop.
promoteIfSingleIteration(cleanupForInst);
// Adjust upper bound.
auto unrolledUbMap =
getUnrolledLoopUpperBound(forOp, unrollFactor, &builder);
assert(unrolledUbMap &&
"upper bound map can alwayys be determined for an unrolled loop "
"with single result bounds");
forOp->setUpperBoundMap(unrolledUbMap);
// Adjust upper bound of the original loop; this is the same as the lower
// bound of the cleanup loop.
forOp->setUpperBound(cleanupOperands, cleanupMap);
}
// Scale the step of loop being unrolled by unroll factor.

View File

@ -1,20 +1,21 @@
// RUN: mlir-opt %s -loop-unroll-jam -unroll-jam-factor=2 | FileCheck %s
// CHECK: [[MAP_PLUS_1:#map[0-9]+]] = (d0) -> (d0 + 1)
// This should be matched to M1, but M1 is defined later.
// CHECK: {{#map[0-9]+}} = ()[s0] -> (s0 + 8)
// CHECK-DAG: [[MAP_PLUS_1:#map[0-9]+]] = (d0) -> (d0 + 1)
// CHECK-DAG: [[M1:#map[0-9]+]] = ()[s0] -> (s0 + 8)
// CHECK-DAG: [[MAP_DIV_OFFSET:#map[0-9]+]] = ()[s0] -> (((s0 - 1) floordiv 2) * 2 + 1)
// CHECK-DAG: [[MAP_MULTI_RES:#map[0-9]+]] = ()[s0, s1] -> ((s0 floordiv 2) * 2, (s1 floordiv 2) * 2, 1024)
// CHECK-LABEL: func @unroll_jam_imperfect_nest() {
func @unroll_jam_imperfect_nest() {
// CHECK: %c100 = constant 100 : index
// CHECK-NEXT: for %i0 = 0 to 99 step 2 {
// CHECK-NEXT: for %i0 = 0 to 100 step 2 {
for %i = 0 to 101 {
// CHECK: %0 = "addi32"(%i0, %i0) : (index, index) -> i32
// CHECK-NEXT: %1 = affine.apply [[MAP_PLUS_1]](%i0)
// CHECK-NEXT: %2 = "addi32"(%1, %1) : (index, index) -> i32
%x = "addi32"(%i, %i) : (index, index) -> i32
for %j = 0 to 17 {
// CHECK: %3 = "addi32"(%i0, %i0) : (index, index) -> i32
// CHECK: %3 = "addi32"(%i0, %i0) : (index, index) -> i32
// CHECK-NEXT: %4 = "addi32"(%3, %3) : (i32, i32) -> i32
// CHECK-NEXT: %5 = affine.apply [[MAP_PLUS_1]](%i0)
// CHECK-NEXT: %6 = "addi32"(%5, %5) : (index, index) -> i32
@ -30,31 +31,28 @@ func @unroll_jam_imperfect_nest() {
// cleanup loop (single iteration)
// CHECK: %11 = "addi32"(%c100, %c100) : (index, index) -> i32
// CHECK-NEXT: for %i2 = 0 to 17 {
// CHECK-NEXT: %12 = "addi32"(%c100, %c100) : (index, index) -> i32
// CHECK-NEXT: %13 = "addi32"(%12, %12) : (i32, i32) -> i32
// CHECK-NEXT: %12 = "addi32"(%c100, %c100) : (index, index) -> i32
// CHECK-NEXT: %13 = "addi32"(%12, %12) : (i32, i32) -> i32
// CHECK-NEXT: }
// CHECK-NEXT: %14 = "addi32"(%c100, %c100) : (index, index) -> i32
return
}
// UNROLL-BY-4-LABEL: func @loop_nest_unknown_count_1(%arg0: index) {
// CHECK-LABEL: func @loop_nest_unknown_count_1(%arg0: index) {
func @loop_nest_unknown_count_1(%N : index) {
// UNROLL-BY-4-NEXT: for %i0 = 1 to #map{{[0-9]+}}()[%arg0] step 4 {
// UNROLL-BY-4-NEXT: for %i1 = 1 to 100 {
// UNROLL-BY-4-NEXT: %0 = "foo"() : () -> i32
// UNROLL-BY-4-NEXT: %1 = "foo"() : () -> i32
// UNROLL-BY-4-NEXT: %2 = "foo"() : () -> i32
// UNROLL-BY-4-NEXT: %3 = "foo"() : () -> i32
// UNROLL-BY-4-NEXT: }
// UNROLL-BY-4-NEXT: }
// CHECK-NEXT: for %i0 = 1 to [[MAP_DIV_OFFSET]]()[%arg0] step 2 {
// CHECK-NEXT: for %i1 = 1 to 100 {
// CHECK-NEXT: %0 = "foo"() : () -> i32
// CHECK-NEXT: %1 = "foo"() : () -> i32
// CHECK-NEXT: }
// CHECK-NEXT: }
// A cleanup loop should be generated here.
// UNROLL-BY-4-NEXT: for %i2 = #map{{[0-9]+}}()[%arg0] to %arg0 {
// UNROLL-BY-4-NEXT: for %i3 = 1 to 100 {
// UNROLL-BY-4-NEXT: %4 = "foo"() : () -> i32
// UNROLL-BY-4_NEXT: }
// UNROLL-BY-4_NEXT: }
// Specify the lower bound in a form so that both lb and ub operands match.
for %i = ()[s0] -> (1)()[%N] to %N {
// CHECK-NEXT: for %i2 = [[MAP_DIV_OFFSET]]()[%arg0] to %arg0 {
// CHECK-NEXT: for %i3 = 1 to 100 {
// CHECK-NEXT: %2 = "foo"() : () -> i32
// CHECK_NEXT: }
// CHECK_NEXT: }
for %i = 1 to %N {
for %j = 1 to 100 {
%x = "foo"() : () -> i32
}
@ -62,29 +60,47 @@ func @loop_nest_unknown_count_1(%N : index) {
return
}
// UNROLL-BY-4-LABEL: func @loop_nest_unknown_count_2(%arg0: index) {
// CHECK-LABEL: func @loop_nest_unknown_count_2(%arg0: index) {
func @loop_nest_unknown_count_2(%arg : index) {
// UNROLL-BY-4-NEXT: for %i0 = %arg0 to #map{{[0-9]+}}()[%arg0] step 4 {
// UNROLL-BY-4-NEXT: for %i1 = 1 to 100 {
// UNROLL-BY-4-NEXT: %0 = "foo"(%i0) : (index) -> i32
// UNROLL-BY-4-NEXT: %1 = affine.apply #map{{[0-9]+}}(%i0)
// UNROLL-BY-4-NEXT: %2 = "foo"(%1) : (index) -> i32
// UNROLL-BY-4-NEXT: %3 = affine.apply #map{{[0-9]+}}(%i0)
// UNROLL-BY-4-NEXT: %4 = "foo"(%3) : (index) -> i32
// UNROLL-BY-4-NEXT: %5 = affine.apply #map{{[0-9]+}}(%i0)
// UNROLL-BY-4-NEXT: %6 = "foo"(%5) : (index) -> i32
// UNROLL-BY-4-NEXT: }
// UNROLL-BY-4-NEXT: }
// CHECK-NEXT: for %i0 = %arg0 to [[M1]]()[%arg0] step 2 {
// CHECK-NEXT: for %i1 = 1 to 100 {
// CHECK-NEXT: %0 = "foo"(%i0) : (index) -> i32
// CHECK-NEXT: %1 = affine.apply #map{{[0-9]+}}(%i0)
// CHECK-NEXT: %2 = "foo"(%1) : (index) -> i32
// CHECK-NEXT: }
// CHECK-NEXT: }
// The cleanup loop is a single iteration one and is promoted.
// UNROLL-BY-4-NEXT: %7 = affine.apply [[M1:#map{{[0-9]+}}]]()[%arg0]
// UNROLL-BY-4-NEXT: for %i3 = 1 to 100 {
// UNROLL-BY-4-NEXT: %8 = "foo"() : () -> i32
// UNROLL-BY-4_NEXT: }
// Specify the lower bound in a form so that both lb and ub operands match.
for %i = ()[s0] -> (s0) ()[%arg] to ()[s0] -> (s0+8) ()[%arg] {
// CHECK-NEXT: %3 = affine.apply [[M1]]()[%arg0]
// CHECK-NEXT: for %i2 = 1 to 100 {
// CHECK-NEXT: %4 = "foo"(%3) : (index) -> i32
// CHECK_NEXT: }
for %i = %arg to ()[s0] -> (s0+9) ()[%arg] {
for %j = 1 to 100 {
%x = "foo"(%i) : (index) -> i32
}
}
return
}
// CHECK-LABEL: func @loop_nest_symbolic_and_min_upper_bound
func @loop_nest_symbolic_and_min_upper_bound(%M : index, %N : index, %K : index) {
for %i = 0 to min ()[s0, s1] -> (s0, s1, 1024)()[%M, %N] {
for %j = 0 to %K {
"foo"(%i, %j) : (index, index) -> ()
}
}
return
}
// CHECK-NEXT: for %i0 = 0 to min [[MAP_MULTI_RES]]()[%arg0, %arg1] step 2 {
// CHECK-NEXT: for %i1 = 0 to %arg2 {
// CHECK-NEXT: "foo"(%i0, %i1) : (index, index) -> ()
// CHECK-NEXT: %0 = affine.apply #map2(%i0)
// CHECK-NEXT: "foo"(%0, %i1) : (index, index) -> ()
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: for %i2 = max [[MAP_MULTI_RES]]()[%arg0, %arg1] to min #map9()[%arg0, %arg1] {
// CHECK-NEXT: for %i3 = 0 to %arg2 {
// CHECK-NEXT: "foo"(%i2, %i3) : (index, index) -> ()
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: return

View File

@ -1,253 +1,244 @@
// RUN: mlir-opt %s -loop-unroll -unroll-full | FileCheck %s
// RUN: mlir-opt %s -loop-unroll -unroll-full | FileCheck %s --check-prefix UNROLL-FULL
// RUN: mlir-opt %s -loop-unroll -unroll-full -unroll-full-threshold=2 | FileCheck %s --check-prefix SHORT
// RUN: mlir-opt %s -loop-unroll -unroll-factor=4 | FileCheck %s --check-prefix UNROLL-BY-4
// RUN: mlir-opt %s -loop-unroll -unroll-factor=1 | FileCheck %s --check-prefix UNROLL-BY-1
// CHECK: [[MAP0:#map[0-9]+]] = (d0) -> (d0 + 1)
// CHECK: [[MAP1:#map[0-9]+]] = (d0) -> (d0 + 2)
// CHECK: [[MAP2:#map[0-9]+]] = (d0) -> (d0 + 3)
// CHECK: [[MAP3:#map[0-9]+]] = (d0) -> (d0 + 4)
// CHECK: [[MAP4:#map[0-9]+]] = (d0, d1) -> (d0 + 1)
// CHECK: [[MAP5:#map[0-9]+]] = (d0, d1) -> (d0 + 3)
// CHECK: [[MAP6:#map[0-9]+]] = (d0)[s0] -> (d0 + s0 + 1)
// CHECK: [[MAP7:#map[0-9]+]] = (d0) -> (d0 + 5)
// CHECK: [[MAP8:#map[0-9]+]] = (d0) -> (d0 + 6)
// CHECK: [[MAP9:#map[0-9]+]] = (d0) -> (d0 + 7)
// CHECK: [[MAP10:#map[0-9]+]] = (d0, d1) -> (d0 * 16 + d1)
// CHECK: [[MAP11:#map[0-9]+]] = (d0) -> (d0 + 8)
// CHECK: [[MAP12:#map[0-9]+]] = (d0) -> (d0 + 9)
// CHECK: [[MAP13:#map[0-9]+]] = (d0) -> (d0 + 10)
// CHECK: [[MAP14:#map[0-9]+]] = (d0) -> (d0 + 15)
// CHECK: [[MAP15:#map[0-9]+]] = (d0) -> (d0 + 20)
// CHECK: [[MAP16:#map[0-9]+]] = (d0) -> (d0 + 25)
// CHECK: [[MAP17:#map[0-9]+]] = (d0) -> (d0 + 30)
// CHECK: [[MAP18:#map[0-9]+]] = (d0) -> (d0 + 35)
// UNROLL-FULL-DAG: [[MAP0:#map[0-9]+]] = (d0) -> (d0 + 1)
// UNROLL-FULL-DAG: [[MAP1:#map[0-9]+]] = (d0) -> (d0 + 2)
// UNROLL-FULL-DAG: [[MAP2:#map[0-9]+]] = (d0) -> (d0 + 3)
// UNROLL-FULL-DAG: [[MAP3:#map[0-9]+]] = (d0) -> (d0 + 4)
// UNROLL-FULL-DAG: [[MAP4:#map[0-9]+]] = (d0, d1) -> (d0 + 1)
// UNROLL-FULL-DAG: [[MAP5:#map[0-9]+]] = (d0, d1) -> (d0 + 3)
// UNROLL-FULL-DAG: [[MAP6:#map[0-9]+]] = (d0)[s0] -> (d0 + s0 + 1)
// SHORT: [[MAP0:#map[0-9]+]] = (d0) -> (d0 + 1)
// SHORT: [[MAP1:#map[0-9]+]] = (d0) -> (d0 + 2)
// SHORT: [[MAP2:#map[0-9]+]] = (d0, d1) -> (d0 + 1)
// SHORT: [[MAP3:#map[0-9]+]] = (d0, d1) -> (d0 + 3)
// SHORT: [[MAP4:#map[0-9]+]] = (d0)[s0] -> (d0 + s0 + 1)
// SHORT: [[MAP5:#map[0-9]+]] = (d0, d1) -> (d0 * 16 + d1)
// SHORT-DAG: [[MAP0:#map[0-9]+]] = (d0) -> (d0 + 1)
// UNROLL-BY-4: [[MAP0:#map[0-9]+]] = (d0) -> (d0 + 1)
// UNROLL-BY-4: [[MAP1:#map[0-9]+]] = (d0) -> (d0 + 2)
// UNROLL-BY-4: [[MAP2:#map[0-9]+]] = (d0) -> (d0 + 3)
// UNROLL-BY-4: [[MAP3:#map[0-9]+]] = (d0, d1) -> (d0 + 1)
// UNROLL-BY-4: [[MAP4:#map[0-9]+]] = (d0, d1) -> (d0 + 3)
// UNROLL-BY-4: [[MAP5:#map[0-9]+]] = (d0)[s0] -> (d0 + s0 + 1)
// UNROLL-BY-4: [[MAP6:#map[0-9]+]] = (d0, d1) -> (d0 * 16 + d1)
// UNROLL-BY-4: [[MAP7:#map[0-9]+]] = (d0) -> (d0 + 5)
// UNROLL-BY-4: [[MAP8:#map[0-9]+]] = (d0) -> (d0 + 10)
// UNROLL-BY-4: [[MAP9:#map[0-9]+]] = (d0) -> (d0 + 15)
// UNROLL-BY-4: [[MAP10:#map[0-9]+]] = (d0) -> (0)
// UNROLL-BY-4: [[MAP11:#map[0-9]+]] = (d0) -> (d0)
// UNROLL-BY-4: [[MAP12:#map[0-9]+]] = ()[s0] -> (0)
// UNROLL-BY-4-DAG: [[MAP0:#map[0-9]+]] = (d0) -> (d0 + 1)
// UNROLL-BY-4-DAG: [[MAP1:#map[0-9]+]] = (d0) -> (d0 + 2)
// UNROLL-BY-4-DAG: [[MAP2:#map[0-9]+]] = (d0) -> (d0 + 3)
// UNROLL-BY-4-DAG: [[MAP3:#map[0-9]+]] = (d0, d1) -> (d0 + 1)
// UNROLL-BY-4-DAG: [[MAP4:#map[0-9]+]] = (d0, d1) -> (d0 + 3)
// UNROLL-BY-4-DAG: [[MAP5:#map[0-9]+]] = (d0)[s0] -> (d0 + s0 + 1)
// UNROLL-BY-4-DAG: [[MAP6:#map[0-9]+]] = (d0, d1) -> (d0 * 16 + d1)
// UNROLL-BY-4-DAG: [[MAP11:#map[0-9]+]] = (d0) -> (d0)
// UNROLL-BY-4-DAG: [[MAP_TRIP_COUNT_MULTIPLE_FOUR:#map[0-9]+]] = ()[s0, s1, s2] -> (s0 + ((-s0 + s1) floordiv 4) * 4, s0 + ((-s0 + s2) floordiv 4) * 4, s0 + ((-s0 + 1024) floordiv 4) * 4)
// CHECK-LABEL: func @loop_nest_simplest() {
// UNROLL-FULL-LABEL: func @loop_nest_simplest() {
func @loop_nest_simplest() {
// CHECK: for %i0 = 0 to 100 step 2 {
// UNROLL-FULL: for %i0 = 0 to 100 step 2 {
for %i = 0 to 100 step 2 {
// CHECK: %c1_i32 = constant 1 : i32
// CHECK-NEXT: %c1_i32_0 = constant 1 : i32
// CHECK-NEXT: %c1_i32_1 = constant 1 : i32
// CHECK-NEXT: %c1_i32_2 = constant 1 : i32
// UNROLL-FULL: %c1_i32 = constant 1 : i32
// UNROLL-FULL-NEXT: %c1_i32_0 = constant 1 : i32
// UNROLL-FULL-NEXT: %c1_i32_1 = constant 1 : i32
// UNROLL-FULL-NEXT: %c1_i32_2 = constant 1 : i32
for %j = 0 to 4 {
%x = constant 1 : i32
}
} // CHECK: }
return // CHECK: return
} // CHECK }
} // UNROLL-FULL: }
return // UNROLL-FULL: return
} // UNROLL-FULL }
// CHECK-LABEL: func @loop_nest_simple_iv_use() {
// UNROLL-FULL-LABEL: func @loop_nest_simple_iv_use() {
func @loop_nest_simple_iv_use() {
// CHECK: %c0 = constant 0 : index
// CHECK-NEXT: for %i0 = 0 to 100 step 2 {
// UNROLL-FULL: %c0 = constant 0 : index
// UNROLL-FULL-NEXT: for %i0 = 0 to 100 step 2 {
for %i = 0 to 100 step 2 {
// CHECK: %0 = "addi32"(%c0, %c0) : (index, index) -> i32
// CHECK: %1 = affine.apply [[MAP0]](%c0)
// CHECK-NEXT: %2 = "addi32"(%1, %1) : (index, index) -> i32
// CHECK: %3 = affine.apply [[MAP1]](%c0)
// CHECK-NEXT: %4 = "addi32"(%3, %3) : (index, index) -> i32
// CHECK: %5 = affine.apply [[MAP2]](%c0)
// CHECK-NEXT: %6 = "addi32"(%5, %5) : (index, index) -> i32
// UNROLL-FULL: %0 = "addi32"(%c0, %c0) : (index, index) -> i32
// UNROLL-FULL: %1 = affine.apply [[MAP0]](%c0)
// UNROLL-FULL-NEXT: %2 = "addi32"(%1, %1) : (index, index) -> i32
// UNROLL-FULL: %3 = affine.apply [[MAP1]](%c0)
// UNROLL-FULL-NEXT: %4 = "addi32"(%3, %3) : (index, index) -> i32
// UNROLL-FULL: %5 = affine.apply [[MAP2]](%c0)
// UNROLL-FULL-NEXT: %6 = "addi32"(%5, %5) : (index, index) -> i32
for %j = 0 to 4 {
%x = "addi32"(%j, %j) : (index, index) -> i32
}
} // CHECK: }
return // CHECK: return
} // CHECK }
} // UNROLL-FULL: }
return // UNROLL-FULL: return
} // UNROLL-FULL }
// Operations in the loop body have results that are used therein.
// CHECK-LABEL: func @loop_nest_body_def_use() {
// UNROLL-FULL-LABEL: func @loop_nest_body_def_use() {
func @loop_nest_body_def_use() {
// CHECK: %c0 = constant 0 : index
// CHECK-NEXT: for %i0 = 0 to 100 step 2 {
// UNROLL-FULL: %c0 = constant 0 : index
// UNROLL-FULL-NEXT: for %i0 = 0 to 100 step 2 {
for %i = 0 to 100 step 2 {
// CHECK: %c0_0 = constant 0 : index
// UNROLL-FULL: %c0_0 = constant 0 : index
%c0 = constant 0 : index
// CHECK: %0 = affine.apply [[MAP0]](%c0)
// CHECK-NEXT: %1 = "addi32"(%0, %c0_0) : (index, index) -> index
// CHECK-NEXT: %2 = affine.apply [[MAP0]](%c0)
// CHECK-NEXT: %3 = affine.apply [[MAP0]](%2)
// CHECK-NEXT: %4 = "addi32"(%3, %c0_0) : (index, index) -> index
// CHECK-NEXT: %5 = affine.apply [[MAP1]](%c0)
// CHECK-NEXT: %6 = affine.apply [[MAP0]](%5)
// CHECK-NEXT: %7 = "addi32"(%6, %c0_0) : (index, index) -> index
// CHECK-NEXT: %8 = affine.apply [[MAP2]](%c0)
// CHECK-NEXT: %9 = affine.apply [[MAP0]](%8)
// CHECK-NEXT: %10 = "addi32"(%9, %c0_0) : (index, index) -> index
// UNROLL-FULL: %0 = affine.apply [[MAP0]](%c0)
// UNROLL-FULL-NEXT: %1 = "addi32"(%0, %c0_0) : (index, index) -> index
// UNROLL-FULL-NEXT: %2 = affine.apply [[MAP0]](%c0)
// UNROLL-FULL-NEXT: %3 = affine.apply [[MAP0]](%2)
// UNROLL-FULL-NEXT: %4 = "addi32"(%3, %c0_0) : (index, index) -> index
// UNROLL-FULL-NEXT: %5 = affine.apply [[MAP1]](%c0)
// UNROLL-FULL-NEXT: %6 = affine.apply [[MAP0]](%5)
// UNROLL-FULL-NEXT: %7 = "addi32"(%6, %c0_0) : (index, index) -> index
// UNROLL-FULL-NEXT: %8 = affine.apply [[MAP2]](%c0)
// UNROLL-FULL-NEXT: %9 = affine.apply [[MAP0]](%8)
// UNROLL-FULL-NEXT: %10 = "addi32"(%9, %c0_0) : (index, index) -> index
for %j = 0 to 4 {
%x = "affine.apply" (%j) { map: (d0) -> (d0 + 1) } :
(index) -> (index)
%y = "addi32"(%x, %c0) : (index, index) -> index
}
} // CHECK: }
return // CHECK: return
} // CHECK }
} // UNROLL-FULL: }
return // UNROLL-FULL: return
} // UNROLL-FULL }
// CHECK-LABEL: func @loop_nest_strided() {
// UNROLL-FULL-LABEL: func @loop_nest_strided() {
func @loop_nest_strided() {
// CHECK: %c2 = constant 2 : index
// CHECK-NEXT: %c2_0 = constant 2 : index
// CHECK-NEXT: for %i0 = 0 to 100 {
// UNROLL-FULL: %c2 = constant 2 : index
// UNROLL-FULL-NEXT: %c2_0 = constant 2 : index
// UNROLL-FULL-NEXT: for %i0 = 0 to 100 {
for %i = 0 to 100 {
// CHECK: %0 = affine.apply [[MAP0]](%c2_0)
// CHECK-NEXT: %1 = "addi32"(%0, %0) : (index, index) -> index
// CHECK-NEXT: %2 = affine.apply [[MAP1]](%c2_0)
// CHECK-NEXT: %3 = affine.apply [[MAP0]](%2)
// CHECK-NEXT: %4 = "addi32"(%3, %3) : (index, index) -> index
// UNROLL-FULL: %0 = affine.apply [[MAP0]](%c2_0)
// UNROLL-FULL-NEXT: %1 = "addi32"(%0, %0) : (index, index) -> index
// UNROLL-FULL-NEXT: %2 = affine.apply [[MAP1]](%c2_0)
// UNROLL-FULL-NEXT: %3 = affine.apply [[MAP0]](%2)
// UNROLL-FULL-NEXT: %4 = "addi32"(%3, %3) : (index, index) -> index
for %j = 2 to 6 step 2 {
%x = "affine.apply" (%j) { map: (d0) -> (d0 + 1) } :
(index) -> (index)
%y = "addi32"(%x, %x) : (index, index) -> index
}
// CHECK: %5 = affine.apply [[MAP0]](%c2)
// CHECK-NEXT: %6 = "addi32"(%5, %5) : (index, index) -> index
// CHECK-NEXT: %7 = affine.apply [[MAP1]](%c2)
// CHECK-NEXT: %8 = affine.apply [[MAP0]](%7)
// CHECK-NEXT: %9 = "addi32"(%8, %8) : (index, index) -> index
// CHECK-NEXT: %10 = affine.apply [[MAP3]](%c2)
// CHECK-NEXT: %11 = affine.apply [[MAP0]](%10)
// CHECK-NEXT: %12 = "addi32"(%11, %11) : (index, index) -> index
// UNROLL-FULL: %5 = affine.apply [[MAP0]](%c2)
// UNROLL-FULL-NEXT: %6 = "addi32"(%5, %5) : (index, index) -> index
// UNROLL-FULL-NEXT: %7 = affine.apply [[MAP1]](%c2)
// UNROLL-FULL-NEXT: %8 = affine.apply [[MAP0]](%7)
// UNROLL-FULL-NEXT: %9 = "addi32"(%8, %8) : (index, index) -> index
// UNROLL-FULL-NEXT: %10 = affine.apply [[MAP3]](%c2)
// UNROLL-FULL-NEXT: %11 = affine.apply [[MAP0]](%10)
// UNROLL-FULL-NEXT: %12 = "addi32"(%11, %11) : (index, index) -> index
for %k = 2 to 7 step 2 {
%z = "affine.apply" (%k) { map: (d0) -> (d0 + 1) } :
(index) -> (index)
%w = "addi32"(%z, %z) : (index, index) -> index
}
} // CHECK: }
return // CHECK: return
} // CHECK }
} // UNROLL-FULL: }
return // UNROLL-FULL: return
} // UNROLL-FULL }
// CHECK-LABEL: func @loop_nest_multiple_results() {
// UNROLL-FULL-LABEL: func @loop_nest_multiple_results() {
func @loop_nest_multiple_results() {
// CHECK: %c0 = constant 0 : index
// CHECK-NEXT: for %i0 = 0 to 100 {
// UNROLL-FULL: %c0 = constant 0 : index
// UNROLL-FULL-NEXT: for %i0 = 0 to 100 {
for %i = 0 to 100 {
// CHECK: %0 = affine.apply [[MAP4]](%i0, %c0)
// CHECK-NEXT: %1 = "addi32"(%0, %0) : (index, index) -> index
// CHECK-NEXT: %2 = affine.apply #map{{.*}}(%i0, %c0)
// CHECK-NEXT: %3 = "fma"(%2, %0, %0) : (index, index, index) -> (index, index)
// CHECK-NEXT: %4 = affine.apply #map{{.*}}(%c0)
// CHECK-NEXT: %5 = affine.apply #map{{.*}}(%i0, %4)
// CHECK-NEXT: %6 = "addi32"(%5, %5) : (index, index) -> index
// CHECK-NEXT: %7 = affine.apply #map{{.*}}(%i0, %4)
// CHECK-NEXT: %8 = "fma"(%7, %5, %5) : (index, index, index) -> (index, index)
// UNROLL-FULL: %0 = affine.apply [[MAP4]](%i0, %c0)
// UNROLL-FULL-NEXT: %1 = "addi32"(%0, %0) : (index, index) -> index
// UNROLL-FULL-NEXT: %2 = affine.apply #map{{.*}}(%i0, %c0)
// UNROLL-FULL-NEXT: %3 = "fma"(%2, %0, %0) : (index, index, index) -> (index, index)
// UNROLL-FULL-NEXT: %4 = affine.apply #map{{.*}}(%c0)
// UNROLL-FULL-NEXT: %5 = affine.apply #map{{.*}}(%i0, %4)
// UNROLL-FULL-NEXT: %6 = "addi32"(%5, %5) : (index, index) -> index
// UNROLL-FULL-NEXT: %7 = affine.apply #map{{.*}}(%i0, %4)
// UNROLL-FULL-NEXT: %8 = "fma"(%7, %5, %5) : (index, index, index) -> (index, index)
for %j = 0 to 2 step 1 {
%x = affine.apply (d0, d1) -> (d0 + 1) (%i, %j)
%y = "addi32"(%x, %x) : (index, index) -> index
%z = affine.apply (d0, d1) -> (d0 + 3) (%i, %j)
%w = "fma"(%z, %x, %x) : (index, index, index) -> (index, index)
}
} // CHECK: }
return // CHECK: return
} // CHECK }
} // UNROLL-FULL: }
return // UNROLL-FULL: return
} // UNROLL-FULL }
// Imperfect loop nest. Unrolling innermost here yields a perfect nest.
// CHECK-LABEL: func @loop_nest_seq_imperfect(%arg0: memref<128x128xf32>) {
// UNROLL-FULL-LABEL: func @loop_nest_seq_imperfect(%arg0: memref<128x128xf32>) {
func @loop_nest_seq_imperfect(%a : memref<128x128xf32>) {
// CHECK: %c0 = constant 0 : index
// CHECK-NEXT: %c128 = constant 128 : index
// UNROLL-FULL: %c0 = constant 0 : index
// UNROLL-FULL-NEXT: %c128 = constant 128 : index
%c128 = constant 128 : index
// CHECK: for %i0 = 0 to 100 {
// UNROLL-FULL: for %i0 = 0 to 100 {
for %i = 0 to 100 {
// CHECK: %0 = "vld"(%i0) : (index) -> i32
// UNROLL-FULL: %0 = "vld"(%i0) : (index) -> i32
%ld = "vld"(%i) : (index) -> i32
// CHECK: %1 = affine.apply [[MAP0]](%c0)
// CHECK-NEXT: %2 = "vmulf"(%c0, %1) : (index, index) -> index
// CHECK-NEXT: %3 = "vaddf"(%2, %2) : (index, index) -> index
// CHECK-NEXT: %4 = affine.apply [[MAP0]](%c0)
// CHECK-NEXT: %5 = affine.apply [[MAP0]](%4)
// CHECK-NEXT: %6 = "vmulf"(%4, %5) : (index, index) -> index
// CHECK-NEXT: %7 = "vaddf"(%6, %6) : (index, index) -> index
// CHECK-NEXT: %8 = affine.apply [[MAP1]](%c0)
// CHECK-NEXT: %9 = affine.apply [[MAP0]](%8)
// CHECK-NEXT: %10 = "vmulf"(%8, %9) : (index, index) -> index
// CHECK-NEXT: %11 = "vaddf"(%10, %10) : (index, index) -> index
// CHECK-NEXT: %12 = affine.apply [[MAP2]](%c0)
// CHECK-NEXT: %13 = affine.apply [[MAP0]](%12)
// CHECK-NEXT: %14 = "vmulf"(%12, %13) : (index, index) -> index
// CHECK-NEXT: %15 = "vaddf"(%14, %14) : (index, index) -> index
// UNROLL-FULL: %1 = affine.apply [[MAP0]](%c0)
// UNROLL-FULL-NEXT: %2 = "vmulf"(%c0, %1) : (index, index) -> index
// UNROLL-FULL-NEXT: %3 = "vaddf"(%2, %2) : (index, index) -> index
// UNROLL-FULL-NEXT: %4 = affine.apply [[MAP0]](%c0)
// UNROLL-FULL-NEXT: %5 = affine.apply [[MAP0]](%4)
// UNROLL-FULL-NEXT: %6 = "vmulf"(%4, %5) : (index, index) -> index
// UNROLL-FULL-NEXT: %7 = "vaddf"(%6, %6) : (index, index) -> index
// UNROLL-FULL-NEXT: %8 = affine.apply [[MAP1]](%c0)
// UNROLL-FULL-NEXT: %9 = affine.apply [[MAP0]](%8)
// UNROLL-FULL-NEXT: %10 = "vmulf"(%8, %9) : (index, index) -> index
// UNROLL-FULL-NEXT: %11 = "vaddf"(%10, %10) : (index, index) -> index
// UNROLL-FULL-NEXT: %12 = affine.apply [[MAP2]](%c0)
// UNROLL-FULL-NEXT: %13 = affine.apply [[MAP0]](%12)
// UNROLL-FULL-NEXT: %14 = "vmulf"(%12, %13) : (index, index) -> index
// UNROLL-FULL-NEXT: %15 = "vaddf"(%14, %14) : (index, index) -> index
for %j = 0 to 4 {
%x = "affine.apply" (%j) { map: (d0) -> (d0 + 1) } :
(index) -> (index)
%y = "vmulf"(%j, %x) : (index, index) -> index
%z = "vaddf"(%y, %y) : (index, index) -> index
}
// CHECK: %16 = "scale"(%c128, %i0) : (index, index) -> index
// UNROLL-FULL: %16 = "scale"(%c128, %i0) : (index, index) -> index
%addr = "scale"(%c128, %i) : (index, index) -> index
// CHECK: "vst"(%16, %i0) : (index, index) -> ()
// UNROLL-FULL: "vst"(%16, %i0) : (index, index) -> ()
"vst"(%addr, %i) : (index, index) -> ()
} // CHECK }
return // CHECK: return
} // UNROLL-FULL }
return // UNROLL-FULL: return
}
// CHECK-LABEL: func @loop_nest_seq_multiple() {
// UNROLL-FULL-LABEL: func @loop_nest_seq_multiple() {
func @loop_nest_seq_multiple() {
// CHECK: c0 = constant 0 : index
// CHECK-NEXT: %c0_0 = constant 0 : index
// CHECK-NEXT: %0 = affine.apply [[MAP0]](%c0_0)
// CHECK-NEXT: "mul"(%0, %0) : (index, index) -> ()
// CHECK-NEXT: %1 = affine.apply [[MAP0]](%c0_0)
// CHECK-NEXT: %2 = affine.apply [[MAP0]](%1)
// CHECK-NEXT: "mul"(%2, %2) : (index, index) -> ()
// CHECK-NEXT: %3 = affine.apply [[MAP1]](%c0_0)
// CHECK-NEXT: %4 = affine.apply [[MAP0]](%3)
// CHECK-NEXT: "mul"(%4, %4) : (index, index) -> ()
// CHECK-NEXT: %5 = affine.apply [[MAP2]](%c0_0)
// CHECK-NEXT: %6 = affine.apply [[MAP0]](%5)
// CHECK-NEXT: "mul"(%6, %6) : (index, index) -> ()
// UNROLL-FULL: %c0 = constant 0 : index
// UNROLL-FULL-NEXT: %c0_0 = constant 0 : index
// UNROLL-FULL-NEXT: %0 = affine.apply [[MAP0]](%c0_0)
// UNROLL-FULL-NEXT: "mul"(%0, %0) : (index, index) -> ()
// UNROLL-FULL-NEXT: %1 = affine.apply [[MAP0]](%c0_0)
// UNROLL-FULL-NEXT: %2 = affine.apply [[MAP0]](%1)
// UNROLL-FULL-NEXT: "mul"(%2, %2) : (index, index) -> ()
// UNROLL-FULL-NEXT: %3 = affine.apply [[MAP1]](%c0_0)
// UNROLL-FULL-NEXT: %4 = affine.apply [[MAP0]](%3)
// UNROLL-FULL-NEXT: "mul"(%4, %4) : (index, index) -> ()
// UNROLL-FULL-NEXT: %5 = affine.apply [[MAP2]](%c0_0)
// UNROLL-FULL-NEXT: %6 = affine.apply [[MAP0]](%5)
// UNROLL-FULL-NEXT: "mul"(%6, %6) : (index, index) -> ()
for %j = 0 to 4 {
%x = "affine.apply" (%j) { map: (d0) -> (d0 + 1) } :
(index) -> (index)
"mul"(%x, %x) : (index, index) -> ()
}
// CHECK: %c99 = constant 99 : index
// UNROLL-FULL: %c99 = constant 99 : index
%k = constant 99 : index
// CHECK: for %i0 = 0 to 100 step 2 {
// UNROLL-FULL: for %i0 = 0 to 100 step 2 {
for %m = 0 to 100 step 2 {
// CHECK: %7 = affine.apply [[MAP0]](%c0)
// CHECK-NEXT: %8 = affine.apply [[MAP6]](%c0)[%c99]
// CHECK-NEXT: %9 = affine.apply [[MAP0]](%c0)
// CHECK-NEXT: %10 = affine.apply [[MAP0]](%9)
// CHECK-NEXT: %11 = affine.apply [[MAP6]](%9)[%c99]
// CHECK-NEXT: %12 = affine.apply [[MAP1]](%c0)
// CHECK-NEXT: %13 = affine.apply [[MAP0]](%12)
// CHECK-NEXT: %14 = affine.apply [[MAP6]](%12)[%c99]
// CHECK-NEXT: %15 = affine.apply [[MAP2]](%c0)
// CHECK-NEXT: %16 = affine.apply [[MAP0]](%15)
// CHECK-NEXT: %17 = affine.apply [[MAP6]](%15)[%c99]
// UNROLL-FULL: %7 = affine.apply [[MAP0]](%c0)
// UNROLL-FULL-NEXT: %8 = affine.apply [[MAP6]](%c0)[%c99]
// UNROLL-FULL-NEXT: %9 = affine.apply [[MAP0]](%c0)
// UNROLL-FULL-NEXT: %10 = affine.apply [[MAP0]](%9)
// UNROLL-FULL-NEXT: %11 = affine.apply [[MAP6]](%9)[%c99]
// UNROLL-FULL-NEXT: %12 = affine.apply [[MAP1]](%c0)
// UNROLL-FULL-NEXT: %13 = affine.apply [[MAP0]](%12)
// UNROLL-FULL-NEXT: %14 = affine.apply [[MAP6]](%12)[%c99]
// UNROLL-FULL-NEXT: %15 = affine.apply [[MAP2]](%c0)
// UNROLL-FULL-NEXT: %16 = affine.apply [[MAP0]](%15)
// UNROLL-FULL-NEXT: %17 = affine.apply [[MAP6]](%15)[%c99]
for %n = 0 to 4 {
%y = "affine.apply" (%n) { map: (d0) -> (d0 + 1) } :
(index) -> (index)
%z = "affine.apply" (%n, %k) { map: (d0) [s0] -> (d0 + s0 + 1) } :
(index, index) -> (index)
} // CHECK }
} // CHECK }
return // CHECK: return
} // CHECK }
} // UNROLL-FULL }
} // UNROLL-FULL }
return // UNROLL-FULL: return
} // UNROLL-FULL }
// A loop with a single iteration (0 to 1). The checks below expect the loop
// body to appear directly in the function, immediately followed by the return,
// with no loop left around it.
// UNROLL-FULL-LABEL: func @loop_nest_unroll_full() {
func @loop_nest_unroll_full() {
// UNROLL-FULL-NEXT: %0 = "foo"() : () -> i32
// UNROLL-FULL-NEXT: %1 = "bar"() : () -> i32
// UNROLL-FULL-NEXT: return
for %i = 0 to 1 {
%x = "foo"() : () -> i32
%y = "bar"() : () -> i32
}
return
} // UNROLL-FULL }
// SHORT-LABEL: func @loop_nest_outer_unroll() {
func @loop_nest_outer_unroll() {
@ -269,8 +260,8 @@ func @loop_nest_outer_unroll() {
return // SHORT: return
} // SHORT }
// We aren't doing any file check here. We just need this test case to
// successfully run. Both %i0 and i1 will get unrolled here with the min trip
// We are doing a minimal FileCheck here. We just need this test case to
// run successfully. Both %x and %y will get unrolled here as the min trip
// count threshold is set to 2.
// SHORT-LABEL: func @loop_nest_seq_long() -> i32 {
func @loop_nest_seq_long() -> i32 {
@ -284,7 +275,9 @@ func @loop_nest_seq_long() -> i32 {
%zero_idx = constant 0 : index
// CHECK: for %i0 = 0 to 512
for %n0 = 0 to 512 {
// CHECK: for %i1 = 0 to 8
for %n1 = 0 to 8 {
store %one, %A[%n0, %n1] : memref<512 x 512 x i32, (d0, d1) -> (d0, d1), 2>
store %two, %B[%n0, %n1] : memref<512 x 512 x i32, (d0, d1) -> (d0, d1), 2>
@ -292,22 +285,25 @@ func @loop_nest_seq_long() -> i32 {
}
}
for %i0 = 0 to 2 {
for %i1 = 0 to 2 {
for %x = 0 to 2 {
for %y = 0 to 2 {
// CHECK: for %i2
for %i2 = 0 to 8 {
%b2 = "affine.apply" (%i1, %i2) {map: (d0, d1) -> (16*d0 + d1)} : (index, index) -> index
%x = load %B[%i0, %b2] : memref<512 x 512 x i32, (d0, d1) -> (d0, d1), 2>
"op1"(%x) : (i32) -> ()
// CHECK-NOT: for %i3
// CHECK: %{{[0-9]+}} = affine.apply
%b2 = "affine.apply" (%y, %i2) {map: (d0, d1) -> (16*d0 + d1)} : (index, index) -> index
%z = load %B[%x, %b2] : memref<512 x 512 x i32, (d0, d1) -> (d0, d1), 2>
"op1"(%z) : (i32) -> ()
}
for %j1 = 0 to 8 {
for %j2 = 0 to 8 {
%a2 = "affine.apply" (%i1, %j2) {map: (d0, d1) -> (16*d0 + d1)} : (index, index) -> index
%a2 = "affine.apply" (%y, %j2) {map: (d0, d1) -> (16*d0 + d1)} : (index, index) -> index
%v203 = load %A[%j1, %a2] : memref<512 x 512 x i32, (d0, d1) -> (d0, d1), 2>
"op2"(%v203) : (i32) -> ()
}
for %k2 = 0 to 8 {
%s0 = "op3"() : () -> i32
%c2 = "affine.apply" (%i0, %k2) {map: (d0, d1) -> (16*d0 + d1)} : (index, index) -> index
%c2 = "affine.apply" (%x, %k2) {map: (d0, d1) -> (16*d0 + d1)} : (index, index) -> index
%s1 = load %C[%j1, %c2] : memref<512 x 512 x i32, (d0, d1) -> (d0, d1), 2>
%s2 = "addi32"(%s0, %s1) : (i32, i32) -> i32
store %s2, %C[%j1, %c2] : memref<512 x 512 x i32, (d0, d1) -> (d0, d1), 2>
@ -353,22 +349,22 @@ func @unroll_unit_stride_no_cleanup() {
func @unroll_unit_stride_cleanup() {
// UNROLL-BY-4: for %i0 = 0 to 100 {
for %i = 0 to 100 {
// UNROLL-BY-4: for [[L1:%i[0-9]+]] = 0 to 7 step 4 {
// UNROLL-BY-4-NEXT: %0 = "addi32"([[L1]], [[L1]]) : (index, index) -> i32
// UNROLL-BY-4-NEXT: %1 = "addi32"(%0, %0) : (i32, i32) -> i32
// UNROLL-BY-4-NEXT: %2 = affine.apply #map{{[0-9]+}}([[L1]])
// UNROLL-BY-4-NEXT: %3 = "addi32"(%2, %2) : (index, index) -> i32
// UNROLL-BY-4-NEXT: %4 = "addi32"(%3, %3) : (i32, i32) -> i32
// UNROLL-BY-4-NEXT: %5 = affine.apply #map{{[0-9]+}}([[L1]])
// UNROLL-BY-4-NEXT: %6 = "addi32"(%5, %5) : (index, index) -> i32
// UNROLL-BY-4-NEXT: %7 = "addi32"(%6, %6) : (i32, i32) -> i32
// UNROLL-BY-4-NEXT: %8 = affine.apply #map{{[0-9]+}}([[L1]])
// UNROLL-BY-4-NEXT: %9 = "addi32"(%8, %8) : (index, index) -> i32
// UNROLL-BY-4-NEXT: %10 = "addi32"(%9, %9) : (i32, i32) -> i32
// UNROLL-BY-4: for [[L1:%i[0-9]+]] = 0 to 8 step 4 {
// UNROLL-BY-4-NEXT: %0 = "addi32"([[L1]], [[L1]]) : (index, index) -> i32
// UNROLL-BY-4-NEXT: %1 = "addi32"(%0, %0) : (i32, i32) -> i32
// UNROLL-BY-4-NEXT: %2 = affine.apply #map{{[0-9]+}}([[L1]])
// UNROLL-BY-4-NEXT: %3 = "addi32"(%2, %2) : (index, index) -> i32
// UNROLL-BY-4-NEXT: %4 = "addi32"(%3, %3) : (i32, i32) -> i32
// UNROLL-BY-4-NEXT: %5 = affine.apply #map{{[0-9]+}}([[L1]])
// UNROLL-BY-4-NEXT: %6 = "addi32"(%5, %5) : (index, index) -> i32
// UNROLL-BY-4-NEXT: %7 = "addi32"(%6, %6) : (i32, i32) -> i32
// UNROLL-BY-4-NEXT: %8 = affine.apply #map{{[0-9]+}}([[L1]])
// UNROLL-BY-4-NEXT: %9 = "addi32"(%8, %8) : (index, index) -> i32
// UNROLL-BY-4-NEXT: %10 = "addi32"(%9, %9) : (i32, i32) -> i32
// UNROLL-BY-4-NEXT: }
// UNROLL-BY-4-NEXT: for [[L2:%i[0-9]+]] = 8 to 10 {
// UNROLL-BY-4-NEXT: %11 = "addi32"([[L2]], [[L2]]) : (index, index) -> i32
// UNROLL-BY-4-NEXT: %12 = "addi32"(%11, %11) : (i32, i32) -> i32
// UNROLL-BY-4-NEXT: %11 = "addi32"([[L2]], [[L2]]) : (index, index) -> i32
// UNROLL-BY-4-NEXT: %12 = "addi32"(%11, %11) : (i32, i32) -> i32
// UNROLL-BY-4-NEXT: }
for %j = 0 to 10 {
%x = "addi32"(%j, %j) : (index, index) -> i32
@ -382,7 +378,7 @@ func @unroll_unit_stride_cleanup() {
func @unroll_non_unit_stride_cleanup() {
// UNROLL-BY-4: for %i0 = 0 to 100 {
for %i = 0 to 100 {
// UNROLL-BY-4: for [[L1:%i[0-9]+]] = 2 to 37 step 20 {
// UNROLL-BY-4: for [[L1:%i[0-9]+]] = 2 to 42 step 20 {
// UNROLL-BY-4-NEXT: %0 = "addi32"([[L1]], [[L1]]) : (index, index) -> i32
// UNROLL-BY-4-NEXT: %1 = "addi32"(%0, %0) : (i32, i32) -> i32
// UNROLL-BY-4-NEXT: %2 = affine.apply #map{{[0-9]+}}([[L1]])
@ -408,6 +404,7 @@ func @unroll_non_unit_stride_cleanup() {
}
// Both the unrolled loop and the cleanup loop are single iteration loops.
// UNROLL-BY-4-LABEL: func @loop_nest_single_iteration_after_unroll
func @loop_nest_single_iteration_after_unroll(%N: index) {
// UNROLL-BY-4: %c0 = constant 0 : index
// UNROLL-BY-4: %c4 = constant 4 : index
@ -435,7 +432,7 @@ func @loop_nest_single_iteration_after_unroll(%N: index) {
// UNROLL-BY-4-LABEL: func @loop_nest_operand1() {
func @loop_nest_operand1() {
// UNROLL-BY-4: for %i0 = 0 to 100 step 2 {
// UNROLL-BY-4-NEXT: for %i1 = [[MAP10]](%i0) to #map{{[0-9]+}}(%i0) step 4
// UNROLL-BY-4-NEXT: for %i1 = 0 to #map{{[0-9]+}}(%i0) step 4
// UNROLL-BY-4-NEXT: %0 = "foo"() : () -> i32
// UNROLL-BY-4-NEXT: %1 = "foo"() : () -> i32
// UNROLL-BY-4-NEXT: %2 = "foo"() : () -> i32
@ -444,7 +441,7 @@ func @loop_nest_operand1() {
// UNROLL-BY-4-NEXT: }
// UNROLL-BY-4-NEXT: return
for %i = 0 to 100 step 2 {
for %j = (d0) -> (0) (%i) to (d0) -> (d0 - d0 mod 4) (%i) {
for %j = 0 to (d0) -> (d0 - d0 mod 4) (%i) {
%x = "foo"() : () -> i32
}
}
@ -491,11 +488,11 @@ func @loop_nest_operand3() {
return
}
// UNROLL-BY-4-LABEL: func @loop_nest_operand4(%arg0: index) {
func @loop_nest_operand4(%N : index) {
// UNROLL-BY-4-LABEL: func @loop_nest_symbolic_bound(%arg0: index) {
func @loop_nest_symbolic_bound(%N : index) {
// UNROLL-BY-4: for %i0 = 0 to 100 {
for %i = 0 to 100 {
// UNROLL-BY-4: for %i1 = [[MAP12]]()[%arg0] to #map{{[0-9]+}}()[%arg0] step 4 {
// UNROLL-BY-4: for %i1 = 0 to #map{{[0-9]+}}()[%arg0] step 4 {
// UNROLL-BY-4: %0 = "foo"() : () -> i32
// UNROLL-BY-4-NEXT: %1 = "foo"() : () -> i32
// UNROLL-BY-4-NEXT: %2 = "foo"() : () -> i32
@ -505,25 +502,56 @@ func @loop_nest_operand4(%N : index) {
// UNROLL-BY-4-NEXT: for %i2 = #map{{[0-9]+}}()[%arg0] to %arg0 {
// UNROLL-BY-4-NEXT: %4 = "foo"() : () -> i32
// UNROLL-BY-4-NEXT: }
// Specify the lower bound so that both lb and ub operands match.
for %j = ()[s0] -> (0)()[%N] to %N {
for %j = 0 to %N {
%x = "foo"() : () -> i32
}
}
return
}
// CHECK-LABEL: func @loop_nest_unroll_full() {
func @loop_nest_unroll_full() {
// CHECK-NEXT: %0 = "foo"() : () -> i32
// CHECK-NEXT: %1 = "bar"() : () -> i32
// CHECK-NEXT: return
for %i = 0 to 1 {
%x = "foo"() : () -> i32
%y = "bar"() : () -> i32
// UNROLL-BY-4-LABEL: func @loop_nest_symbolic_and_min_upper_bound
func @loop_nest_symbolic_and_min_upper_bound(%M : index, %N : index, %K : index) {
for %i = %M to min ()[s0, s1] -> (s0, s1, 1024)()[%N, %K] {
"foo"() : () -> ()
}
return
} // CHECK }
}
// CHECK-NEXT: for %i0 = %arg0 to min [[MAP_TRIP_COUNT_MULTIPLE_FOUR]]()[%arg0, %arg1, %arg2] step 4 {
// CHECK-NEXT: "foo"() : () -> ()
// CHECK-NEXT: "foo"() : () -> ()
// CHECK-NEXT: "foo"() : () -> ()
// CHECK-NEXT: "foo"() : () -> ()
// CHECK-NEXT: }
// CHECK-NEXT: for %i1 = max [[MAP_TRIP_COUNT_MULTIPLE_FOUR]]()[%arg0, %arg1, %arg2] to min #map{{[0-9]+}}()[%arg1, %arg2] {
// CHECK-NEXT: "foo"() : () -> ()
// CHECK-NEXT: }
// CHECK-NEXT: return
// The trip count here is a multiple of four, but this can be inferred only
// through composition: %K = (4 * %M + 1) - 1 = 4 * %M, so with a lower bound
// of 0 every result of the upper bound map (4 * %N, %K, 1024) is a multiple of
// four. Check for no cleanup loop.
// UNROLL-BY-4-LABEL: func @loop_nest_non_trivial_multiple_unroll_factor
func @loop_nest_non_trivial_multiple_unroll_factor(%M : index, %N : index) {
// %T = 4 * %M + 1
%T = affine.apply (d0) -> (4*d0 + 1)(%M)
// %K = %T - 1, which only becomes visibly 4 * %M once composed with %T.
%K = affine.apply (d0) -> (d0 - 1) (%T)
for %i = 0 to min (d0, d1) -> (4 * d0, d1, 1024)(%N, %K) {
"foo"() : () -> ()
}
return
}
// Exactly one loop (the unrolled one) is expected; no further loop may appear
// between it and the return.
// UNROLL-BY-4: for %i0 = 0 to min
// UNROLL-BY-4-NOT: for
// UNROLL-BY-4: return
// Commented out due to b/128340045
// xUNROLL-BY-4-LABEL: func @loop_nest_non_trivial_multiple_unroll_factor
// func @loop_nest_non_trivial_multiple_unroll_factor(%M : index, %N : index) {
// %K = affine.apply (d0) -> (4*d0) (%M)
// for %i = 0 to min ()[s0, s1] -> (4 * s0, s1, 1024)()[%N, %K] {
// "foo"() : () -> ()
// }
// return
//}
// UNROLL-BY-1-LABEL: func @unroll_by_one_should_promote_single_iteration_loop()
func @unroll_by_one_should_promote_single_iteration_loop() {