[MLIR][Affine] Add parametric tile size support for affine.for tiling

Add support to tile affine.for ops with parametric tile sizes (i.e., SSA
values). Currently, only hyper-rectangular loop nests with constant lower
bounds are supported. Move the methods

  - moveLoopBody(*)
  - getTileableBands(*)
  - checkTilingLegality(*)
  - tilePerfectlyNested(*)
  - constructTiledIndexSetHyperRect(*)

to allow reuse with the constant tile size API. Add a test pass,
-test-affine-parametric-tile, to test parametric tiling.
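
For illustration, here is a minimal sketch (mirroring the new test cases;
the map names #ubo, #lbi, #ubi are illustrative) of tiling a 1-d nest by a
parametric size %t0:

  // Input:
  func @tile(%t0 : index) {
    affine.for %i = 0 to 256 {
      "test.foo"(%i) : (index) -> ()
    }
    return
  }

  // After -test-affine-parametric-tile:
  #ubo = affine_map<()[s0] -> (256 ceildiv s0)>
  #lbi = affine_map<(d0)[s0] -> (d0 * s0)>
  #ubi = affine_map<(d0)[s0] -> (d0 * s0 + s0, 256)>
  func @tile(%t0 : index) {
    affine.for %ii = 0 to #ubo()[%t0] {
      affine.for %i = #lbi(%ii)[%t0] to min #ubi(%ii)[%t0] {
        "test.foo"(%i) : (index) -> ()
      }
    }
    return
  }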

Differential Revision: https://reviews.llvm.org/D87353
Authored by Navdeep Kumar on 2020-09-17 23:37:21 +05:30; committed by Uday Bondhugula
parent 296e97ae8f
commit 0602e8f77f
7 changed files with 954 additions and 283 deletions


@@ -88,16 +88,28 @@ LLVM_NODISCARD
LogicalResult affineForOpBodySkew(AffineForOp forOp, ArrayRef<uint64_t> shifts,
bool unrollPrologueEpilogue = false);
/// Identify valid and profitable bands of loops to tile. This is currently just
/// a temporary placeholder to test the mechanics of tiled code generation.
/// Returns all maximal outermost perfect loop nests to tile.
void getTileableBands(FuncOp f,
std::vector<SmallVector<AffineForOp, 6>> *bands);
/// Tiles the specified band of perfectly nested loops creating tile-space loops
/// and intra-tile loops. A band is a contiguous set of loops. `tiledNest` when
/// non-null is set to the loops of the tiled nest from outermost to innermost.
/// Loops in `input` are erased when the tiling is successful.
/// and intra-tile loops. A band is a contiguous set of loops.
LLVM_NODISCARD
LogicalResult
tilePerfectlyNested(MutableArrayRef<AffineForOp> input,
ArrayRef<unsigned> tileSizes,
SmallVectorImpl<AffineForOp> *tiledNest = nullptr);
/// Tiles the specified band of perfectly nested loops creating tile-space
/// loops and intra-tile loops, using SSA values as tiling parameters. A band
/// is a contiguous set of loops.
LLVM_NODISCARD
LogicalResult tilePerfectlyNestedParametric(
MutableArrayRef<AffineForOp> input, ArrayRef<Value> tileSizes,
SmallVectorImpl<AffineForOp> *tiledNest = nullptr);
/// Performs loop interchange on 'forOpA' and 'forOpB'. Requires that 'forOpA'
/// and 'forOpB' are part of a perfectly nested sequence of loops.
void interchangeLoops(AffineForOp forOpA, AffineForOp forOpB);


@@ -61,278 +61,6 @@ std::unique_ptr<OperationPass<FuncOp>> mlir::createLoopTilingPass() {
return std::make_unique<LoopTiling>();
}
// Move the loop body of AffineForOp 'src' from 'src' into the specified
// location in destination's body, ignoring the terminator.
static inline void moveLoopBody(AffineForOp src, AffineForOp dest,
Block::iterator loc) {
auto &insts = src.getBody()->getOperations();
dest.getBody()->getOperations().splice(loc, insts, insts.begin(),
std::prev(insts.end()));
}
// Move the loop body of AffineForOp 'src' from 'src' to the start of dest's
// body.
static inline void moveLoopBody(AffineForOp src, AffineForOp dest) {
moveLoopBody(src, dest, dest.getBody()->begin());
}
/// Constructs and sets new loop bounds after tiling for the case of
/// hyper-rectangular index sets, where the bounds of one dimension do not
/// depend on other dimensions. Bounds of each dimension can thus be treated
/// independently, and deriving the new bounds is much simpler and faster
/// than for the case of tiling arbitrary polyhedral shapes.
static void
constructTiledIndexSetHyperRect(MutableArrayRef<AffineForOp> origLoops,
MutableArrayRef<AffineForOp> newLoops,
ArrayRef<unsigned> tileSizes) {
assert(!origLoops.empty());
assert(origLoops.size() == tileSizes.size());
OpBuilder b(origLoops[0].getOperation());
unsigned width = origLoops.size();
// Bounds for tile space loops.
for (unsigned i = 0; i < width; i++) {
OperandRange newLbOperands = origLoops[i].getLowerBoundOperands();
OperandRange newUbOperands = origLoops[i].getUpperBoundOperands();
newLoops[i].setLowerBound(newLbOperands, origLoops[i].getLowerBoundMap());
newLoops[i].setUpperBound(newUbOperands, origLoops[i].getUpperBoundMap());
newLoops[i].setStep(tileSizes[i]);
}
// Bounds for intra-tile loops.
for (unsigned i = 0; i < width; i++) {
int64_t largestDiv = getLargestDivisorOfTripCount(origLoops[i]);
auto mayBeConstantCount = getConstantTripCount(origLoops[i]);
// The lower bound is just the tile-space loop.
AffineMap lbMap = b.getDimIdentityMap();
newLoops[width + i].setLowerBound(
/*operands=*/newLoops[i].getInductionVar(), lbMap);
// Set the upper bound.
if (mayBeConstantCount && mayBeConstantCount.getValue() < tileSizes[i]) {
// Trip count is less than the tile size: upper bound is lower bound +
// trip count.
auto ubMap = b.getSingleDimShiftAffineMap(mayBeConstantCount.getValue());
newLoops[width + i].setUpperBound(
/*operands=*/newLoops[i].getInductionVar(), ubMap);
} else if (largestDiv % tileSizes[i] != 0) {
// Intra-tile loop ii goes from i to min(i + tileSize, ub_i).
// Construct the upper bound map; the operands are the original operands
// with 'i' (tile-space loop) appended to it. The new upper bound map is
// the original one with an additional expression i + tileSize appended.
// Add dim operands from original upper bound.
SmallVector<Value, 4> ubOperands;
auto ub = origLoops[i].getUpperBound();
ubOperands.reserve(ub.getNumOperands() + 1);
auto origUbMap = ub.getMap();
for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
ubOperands.push_back(ub.getOperand(j));
// Add dim operand for new loop upper bound.
ubOperands.push_back(newLoops[i].getInductionVar());
// Add symbol operands from original upper bound.
for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));
SmallVector<AffineExpr, 4> boundExprs;
boundExprs.reserve(1 + origUbMap.getNumResults());
auto dim = b.getAffineDimExpr(origUbMap.getNumDims());
// The new upper bound map is the original one with an additional
// expression i + tileSize appended.
boundExprs.push_back(dim + tileSizes[i]);
boundExprs.append(origUbMap.getResults().begin(),
origUbMap.getResults().end());
auto ubMap =
AffineMap::get(origUbMap.getNumDims() + 1, origUbMap.getNumSymbols(),
boundExprs, b.getContext());
newLoops[width + i].setUpperBound(/*operands=*/ubOperands, ubMap);
} else {
// No need of the min expression.
auto dim = b.getAffineDimExpr(0);
auto ubMap = AffineMap::get(1, 0, dim + tileSizes[i]);
newLoops[width + i].setUpperBound(newLoops[i].getInductionVar(), ubMap);
}
}
}
/// This function checks whether hyper-rectangular loop tiling of the nest
/// represented by `origLoops` is valid. The validity condition is from Irigoin
/// and Triolet, which states that two tiles cannot depend on each other. We
/// simplify such condition to just checking whether there is any negative
/// dependence direction, since we have the prior knowledge that the tiling
/// results will be hyper-rectangles, which are scheduled in the
/// lexicographically increasing order on the vector of loop indices. This
/// function will return failure when any dependence component is negative along
/// any of `origLoops`.
static LogicalResult
checkTilingLegality(MutableArrayRef<mlir::AffineForOp> origLoops) {
assert(!origLoops.empty() && "no original loops provided");
// We first find out all dependences we intend to check.
SmallVector<Operation *, 8> loadAndStoreOps;
origLoops[0].getOperation()->walk([&](Operation *op) {
if (isa<AffineReadOpInterface, AffineWriteOpInterface>(op))
loadAndStoreOps.push_back(op);
});
unsigned numOps = loadAndStoreOps.size();
unsigned numLoops = origLoops.size();
FlatAffineConstraints dependenceConstraints;
for (unsigned d = 1; d <= numLoops + 1; ++d) {
for (unsigned i = 0; i < numOps; ++i) {
Operation *srcOp = loadAndStoreOps[i];
MemRefAccess srcAccess(srcOp);
for (unsigned j = 0; j < numOps; ++j) {
Operation *dstOp = loadAndStoreOps[j];
MemRefAccess dstAccess(dstOp);
SmallVector<DependenceComponent, 2> depComps;
dependenceConstraints.reset();
DependenceResult result = checkMemrefAccessDependence(
srcAccess, dstAccess, d, &dependenceConstraints, &depComps);
// Skip if there is no dependence in this case.
if (!hasDependence(result))
continue;
// Check whether there is any negative direction vector in the
// dependence components found above, which means that dependence is
// violated by the default hyper-rect tiling method.
LLVM_DEBUG(llvm::dbgs() << "Checking whether tiling legality violated "
"for dependence at depth: "
<< Twine(d) << " between:\n";);
LLVM_DEBUG(srcAccess.opInst->dump(););
LLVM_DEBUG(dstAccess.opInst->dump(););
for (unsigned k = 0, e = depComps.size(); k < e; k++) {
DependenceComponent depComp = depComps[k];
if (depComp.lb.hasValue() && depComp.ub.hasValue() &&
depComp.lb.getValue() < depComp.ub.getValue() &&
depComp.ub.getValue() < 0) {
LLVM_DEBUG(llvm::dbgs()
<< "Dependence component lb = "
<< Twine(depComp.lb.getValue())
<< " ub = " << Twine(depComp.ub.getValue())
<< " is negative at depth: " << Twine(d)
<< " and thus violates the legality rule.\n");
return failure();
}
}
}
}
}
return success();
}
/// Tiles the specified band of perfectly nested loops creating tile-space loops
/// and intra-tile loops. A band is a contiguous set of loops.
// TODO: handle non hyper-rectangular spaces.
LogicalResult
mlir::tilePerfectlyNested(MutableArrayRef<AffineForOp> input,
ArrayRef<unsigned> tileSizes,
SmallVectorImpl<AffineForOp> *tiledNest) {
// Check if the supplied for op's are all successively nested.
assert(!input.empty() && "no loops in input band");
assert(input.size() == tileSizes.size() && "Too few/many tile sizes");
assert(isPerfectlyNested(input) && "input loops not perfectly nested");
auto origLoops = input;
// Perform tiling legality test.
if (failed(checkTilingLegality(origLoops)))
origLoops[0].emitRemark("tiled code is illegal due to dependences");
AffineForOp rootAffineForOp = origLoops[0];
auto loc = rootAffineForOp.getLoc();
// Note that width is at least one since band isn't empty.
unsigned width = input.size();
SmallVector<AffineForOp, 6> tiledLoops(2 * width);
// The outermost among the loops as we add more..
auto *topLoop = rootAffineForOp.getOperation();
AffineForOp innermostPointLoop;
// Add intra-tile (or point) loops.
for (unsigned i = 0; i < width; i++) {
OpBuilder b(topLoop);
// Loop bounds will be set later.
auto pointLoop = b.create<AffineForOp>(loc, 0, 0);
pointLoop.getBody()->getOperations().splice(
pointLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
topLoop);
tiledLoops[2 * width - 1 - i] = pointLoop;
topLoop = pointLoop.getOperation();
if (i == 0)
innermostPointLoop = pointLoop;
}
// Add tile space loops;
for (unsigned i = width; i < 2 * width; i++) {
OpBuilder b(topLoop);
// Loop bounds will be set later.
auto tileSpaceLoop = b.create<AffineForOp>(loc, 0, 0);
tileSpaceLoop.getBody()->getOperations().splice(
tileSpaceLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
topLoop);
tiledLoops[2 * width - i - 1] = tileSpaceLoop;
topLoop = tileSpaceLoop.getOperation();
}
// Move the loop body of the original nest to the new one.
moveLoopBody(origLoops.back(), innermostPointLoop);
SmallVector<Value, 8> origLoopIVs;
extractForInductionVars(input, &origLoopIVs);
FlatAffineConstraints cst;
SmallVector<Operation *, 8> ops;
ops.reserve(input.size());
for (AffineForOp forOp : input)
ops.push_back(forOp);
getIndexSet(ops, &cst);
if (!cst.isHyperRectangular(0, width)) {
rootAffineForOp.emitError("tiled code generation unimplemented for the "
"non-hyperrectangular case");
return failure();
}
constructTiledIndexSetHyperRect(origLoops, tiledLoops, tileSizes);
// Replace original IVs with intra-tile loop IVs.
for (unsigned i = 0; i < width; i++)
origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar());
// Erase the old loop nest.
rootAffineForOp.erase();
if (tiledNest)
*tiledNest = std::move(tiledLoops);
return success();
}
// Identify valid and profitable bands of loops to tile. This is currently just
// a temporary placeholder to test the mechanics of tiled code generation.
// Returns all maximal outermost perfect loop nests to tile.
static void getTileableBands(FuncOp f,
std::vector<SmallVector<AffineForOp, 6>> *bands) {
// Get maximal perfect nest of 'affine.for' insts starting from root
// (inclusive).
auto getMaximalPerfectLoopNest = [&](AffineForOp root) {
SmallVector<AffineForOp, 6> band;
getPerfectlyNestedLoops(band, root);
bands->push_back(band);
};
for (auto &block : f)
for (auto &op : block)
if (auto forOp = dyn_cast<AffineForOp>(op))
getMaximalPerfectLoopNest(forOp);
}
/// Reduces each tile size to the largest divisor of the corresponding trip
/// count (if the trip count is known).
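/// For instance, a requested tile size of 32 for a loop with a known trip
/// count of 250 would be reduced to 25, the largest divisor of 250 not
/// exceeding 32.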
static void adjustToDivisorsOfTripCounts(ArrayRef<AffineForOp> band,
@@ -340,7 +68,7 @@ static void adjustToDivisorsOfTripCounts(ArrayRef<AffineForOp> band,
assert(band.size() == tileSizes->size() && "invalid tile size count");
for (unsigned i = 0, e = band.size(); i < e; i++) {
unsigned &tSizeAdjusted = (*tileSizes)[i];
auto mayConst = getConstantTripCount(band[i]);
Optional<uint64_t> mayConst = getConstantTripCount(band[i]);
if (!mayConst)
continue;
// Adjust the tile size to largest factor of the trip count less than
@@ -379,14 +107,14 @@ void LoopTiling::getTileSizes(ArrayRef<AffineForOp> band,
tileSizes->resize(band.size());
// The first loop in the band.
auto rootForOp = band[0];
AffineForOp rootForOp = band[0];
(void)rootForOp;
// Obtain memory footprint and set tile sizes so that a tile fits in
// the cache size. This is an approximation with the assumption that the
// footprint increases with the tile size linearly in that dimension (i.e.,
// assumes one-to-one access function).
auto fp = getMemoryFootprintBytes(band[0], 0);
Optional<int64_t> fp = getMemoryFootprintBytes(band[0], 0);
if (!fp) {
// Fill with default tile sizes if footprint is unknown.
std::fill(tileSizes->begin(), tileSizes->end(),
@@ -445,7 +173,7 @@ void LoopTiling::runOnFunction() {
getTileSizes(band, &tileSizes);
if (llvm::DebugFlag) {
auto diag = band[0].emitRemark("using tile sizes [");
for (auto tSize : tileSizes)
for (unsigned tSize : tileSizes)
diag << tSize << ' ';
diag << "]\n";
}


@@ -418,10 +418,559 @@ LogicalResult mlir::affineForOpBodySkew(AffineForOp forOp,
return success();
}
// Collect perfectly nested loops starting from `rootForOps`. Loops are
// perfectly nested if each loop is the first and only non-terminator operation
// in the parent loop. Collect at most `maxLoops` loops and append them to
// `forOps`.
/// Checks the legality of tiling of a hyper-rectangular loop nest by simply
/// checking if there is a 'negative' dependence on the memrefs accessed in
/// the loop nest. If one exists, tiling is invalid.
static bool
checkTilingLegalityImpl(MutableArrayRef<mlir::AffineForOp> origLoops) {
assert(!origLoops.empty() && "no original loops provided");
// We first find out all dependences we intend to check.
SmallVector<Operation *, 8> loadAndStoreOps;
origLoops[0].getOperation()->walk([&](Operation *op) {
if (isa<AffineReadOpInterface, AffineWriteOpInterface>(op))
loadAndStoreOps.push_back(op);
});
unsigned numOps = loadAndStoreOps.size();
unsigned numLoops = origLoops.size();
FlatAffineConstraints dependenceConstraints;
for (unsigned d = 1; d <= numLoops + 1; ++d) {
for (unsigned i = 0; i < numOps; ++i) {
Operation *srcOp = loadAndStoreOps[i];
MemRefAccess srcAccess(srcOp);
for (unsigned j = 0; j < numOps; ++j) {
Operation *dstOp = loadAndStoreOps[j];
MemRefAccess dstAccess(dstOp);
SmallVector<DependenceComponent, 2> depComps;
dependenceConstraints.reset();
DependenceResult result = checkMemrefAccessDependence(
srcAccess, dstAccess, d, &dependenceConstraints, &depComps);
// Skip if there is no dependence in this case.
if (!hasDependence(result))
continue;
// Check whether there is any negative direction vector in the
// dependence components found above, which means that dependence is
// violated by the default hyper-rect tiling method.
LLVM_DEBUG(llvm::dbgs() << "Checking whether tiling legality violated "
"for dependence at depth: "
<< Twine(d) << " between:\n";);
LLVM_DEBUG(srcAccess.opInst->dump(););
LLVM_DEBUG(dstAccess.opInst->dump(););
for (unsigned k = 0, e = depComps.size(); k < e; k++) {
DependenceComponent depComp = depComps[k];
if (depComp.lb.hasValue() && depComp.ub.hasValue() &&
depComp.lb.getValue() < depComp.ub.getValue() &&
depComp.ub.getValue() < 0) {
LLVM_DEBUG(llvm::dbgs()
<< "Dependence component lb = "
<< Twine(depComp.lb.getValue())
<< " ub = " << Twine(depComp.ub.getValue())
<< " is negative at depth: " << Twine(d)
<< " and thus violates the legality rule.\n");
return false;
}
}
}
}
}
return true;
}
/// Checks whether hyper-rectangular loop tiling of the nest
/// represented by `origLoops` is valid. The validity condition is from Irigoin
/// and Triolet, which states that two tiles cannot depend on each other. We
/// simplify such condition to just checking whether there is any negative
/// dependence direction, since we have the prior knowledge that the tiling
/// results will be hyper-rectangles, which are scheduled in the
/// lexicographically increasing order on the vector of loop indices. This
/// function will return failure when any dependence component is negative along
/// any of `origLoops`.
LogicalResult
checkTilingLegality(MutableArrayRef<mlir::AffineForOp> origLoops) {
return success(checkTilingLegalityImpl(origLoops));
}
/// Check if the input band is valid and whether the tiled code will be legal.
template <typename T>
void performPreTilingChecks(MutableArrayRef<AffineForOp> input,
ArrayRef<T> tileSizes) {
// Check if the supplied for ops are all successively nested.
assert(!input.empty() && "no loops in input band");
assert(input.size() == tileSizes.size() && "Too few/many tile sizes");
assert(isPerfectlyNested(input) && "input loops not perfectly nested");
// Perform tiling legality test.
if (failed(checkTilingLegality(input)))
input[0].emitRemark("tiled code is illegal due to dependences");
}
/// Move the loop body of AffineForOp 'src' from 'src' into the specified
/// location in destination's body, ignoring the terminator.
static void moveLoopBodyImpl(AffineForOp src, AffineForOp dest,
Block::iterator loc) {
auto &ops = src.getBody()->getOperations();
dest.getBody()->getOperations().splice(loc, ops, ops.begin(),
std::prev(ops.end()));
}
/// Move the loop body of AffineForOp 'src' from 'src' to the start of dest's
/// body.
void moveLoopBody(AffineForOp src, AffineForOp dest) {
moveLoopBodyImpl(src, dest, dest.getBody()->begin());
}
/// Constructs a tiled loop nest without setting the loop bounds, and moves
/// the body of the original loop nest to the tiled loop nest.
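/// For example, for a 2-d band (%i, %j) this creates a 4-deep nest: two
/// tile-space loops wrapping two intra-tile loops, with the original body
/// moved into the innermost one; `tiledLoops` holds the tile-space loops
/// followed by the intra-tile loops.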
void constructTiledLoopNest(MutableArrayRef<AffineForOp> origLoops,
AffineForOp rootAffineForOp, unsigned width,
MutableArrayRef<AffineForOp> tiledLoops) {
Location loc = rootAffineForOp.getLoc();
// The outermost loop so far, as we add more loops on top.
Operation *topLoop = rootAffineForOp.getOperation();
AffineForOp innermostPointLoop;
// Add intra-tile (or point) loops.
for (unsigned i = 0; i < width; i++) {
OpBuilder b(topLoop);
// Loop bounds will be set later.
AffineForOp pointLoop = b.create<AffineForOp>(loc, 0, 0);
pointLoop.getBody()->getOperations().splice(
pointLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
topLoop);
tiledLoops[2 * width - 1 - i] = pointLoop;
topLoop = pointLoop.getOperation();
if (i == 0)
innermostPointLoop = pointLoop;
}
// Add tile space loops.
for (unsigned i = width; i < 2 * width; i++) {
OpBuilder b(topLoop);
// Loop bounds will be set later.
AffineForOp tileSpaceLoop = b.create<AffineForOp>(loc, 0, 0);
tileSpaceLoop.getBody()->getOperations().splice(
tileSpaceLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
topLoop);
tiledLoops[2 * width - i - 1] = tileSpaceLoop;
topLoop = tileSpaceLoop.getOperation();
}
// Move the loop body of the original nest to the new one.
moveLoopBody(origLoops.back(), innermostPointLoop);
}
/// Checks whether a loop nest is hyper-rectangular.
LogicalResult checkIfHyperRectangular(MutableArrayRef<AffineForOp> input,
AffineForOp rootAffineForOp,
unsigned width) {
FlatAffineConstraints cst;
SmallVector<Operation *, 8> ops(input.begin(), input.end());
getIndexSet(ops, &cst);
if (!cst.isHyperRectangular(0, width)) {
rootAffineForOp.emitError("tiled code generation unimplemented for the "
"non-hyperrectangular case");
return failure();
}
return success();
}
/// Set lower and upper bounds of intra-tile loops for parametric tiling.
// TODO: Handle non-constant lower bounds.
static void setIntraTileBoundsParametric(OpBuilder &b, AffineForOp origLoop,
AffineForOp newInterTileLoop,
AffineForOp newIntraTileLoop,
Value tileSize) {
// The lower bound for the intra-tile loop is represented by an affine map
// as (%i, %t0)->((%i - %origlb) * %t0 + %origlb). Similarly, the upper
// bound for the intra-tile loop is represented by an affine map as
// (%i, %t0)->((%i - %origlb) * %t0 + %t0 * %origLoopStep + %origlb), where
// %i is the loop IV of the corresponding inter-tile loop, %t0 is the
// corresponding tiling parameter, and %origlb and %origLoopStep are the
// lower bound and step of the original loop.
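// For example (illustrative map names): for an original loop
// `affine.for %i = 8 to 256` with tiling parameter %t0, the intra-tile
// loop becomes
//   affine.for %i = #lb(%ii)[%t0] to min #ub(%ii)[%t0]
// where
//   #lb = affine_map<(d0)[s0] -> ((d0 - 8) * s0 + 8)>
//   #ub = affine_map<(d0)[s0] -> ((d0 - 8) * s0 + s0 + 8, 256)>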
assert(origLoop.hasConstantLowerBound() &&
"expected input loops to have constant lower bound.");
// Get lower bound of original loop as an affine expression.
AffineExpr origLowerBoundExpr;
origLowerBoundExpr =
b.getAffineConstantExpr(origLoop.getConstantLowerBound());
// Add dim operands from original lower/upper bound.
SmallVector<Value, 4> lbOperands, ubOperands;
AffineBound lb = origLoop.getLowerBound();
AffineBound ub = origLoop.getUpperBound();
lbOperands.reserve(lb.getNumOperands() + 2);
ubOperands.reserve(ub.getNumOperands() + 2);
AffineMap origLbMap = lb.getMap();
AffineMap origUbMap = ub.getMap();
for (unsigned j = 0, e = origLbMap.getNumDims(); j < e; ++j)
lbOperands.push_back(lb.getOperand(j));
for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
ubOperands.push_back(ub.getOperand(j));
// Add a new dim operand in lb/ubOperands corresponding to the origLoop
// IV.
lbOperands.push_back(newInterTileLoop.getInductionVar());
ubOperands.push_back(newInterTileLoop.getInductionVar());
// Get loop IV as an affine expression for lower/upper bound. Size of
// lb/ubOperands is guaranteed to be at least one.
AffineExpr lbLoopIvExpr = b.getAffineDimExpr(lbOperands.size() - 1);
AffineExpr ubLoopIvExpr = b.getAffineDimExpr(ubOperands.size() - 1);
// Add symbol operands from original lower/upper bound.
for (unsigned j = 0, e = origLbMap.getNumSymbols(); j < e; ++j)
lbOperands.push_back(lb.getOperand(origLbMap.getNumDims() + j));
for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));
// Add a new symbol operand which is the tile size for this loop.
lbOperands.push_back(tileSize);
ubOperands.push_back(tileSize);
SmallVector<AffineExpr, 4> lbBoundExprs;
SmallVector<AffineExpr, 4> ubBoundExprs;
lbBoundExprs.reserve(origLbMap.getNumResults());
ubBoundExprs.reserve(origUbMap.getNumResults());
// Get tiling parameter as an affine expression for lb/ub.
AffineExpr lbTileParameter = b.getAffineSymbolExpr(origLbMap.getNumSymbols());
AffineExpr ubTileParameter = b.getAffineSymbolExpr(origUbMap.getNumSymbols());
// Insert the lower bound: ((inter-tile loop IV - origlb) * tilingParameter)
// + origlb.
lbBoundExprs.push_back(
((lbLoopIvExpr - origLowerBoundExpr) * lbTileParameter) +
origLowerBoundExpr);
// Get the origLoopStep as an affine expression.
AffineExpr origLoopStep = b.getAffineConstantExpr(origLoop.getStep());
// Insert the upper bound: ((inter-tile loop IV - origlb) * tilingParameter)
// + (tilingParameter * origLoopStep) + origlb.
ubBoundExprs.push_back(
((ubLoopIvExpr - origLowerBoundExpr) * ubTileParameter) +
(ubTileParameter * origLoopStep) + origLowerBoundExpr);
ubBoundExprs.append(origUbMap.getResults().begin(),
origUbMap.getResults().end());
AffineMap lbMap =
AffineMap::get(origLbMap.getNumDims() + 1, origLbMap.getNumSymbols() + 1,
lbBoundExprs, b.getContext());
newIntraTileLoop.setLowerBound(lbOperands, lbMap);
AffineMap ubMap =
AffineMap::get(origUbMap.getNumDims() + 1, origUbMap.getNumSymbols() + 1,
ubBoundExprs, b.getContext());
newIntraTileLoop.setUpperBound(ubOperands, ubMap);
// Original loop step must be preserved.
newIntraTileLoop.setStep(origLoop.getStep());
}
/// Set lower and upper bounds of inter-tile loops for parametric tiling.
// TODO: Handle non-constant lower bounds.
static void setInterTileBoundsParametric(OpBuilder &b, AffineForOp origLoop,
AffineForOp newLoop, Value tileSize) {
OperandRange newLbOperands = origLoop.getLowerBoundOperands();
// The lower bounds for the inter-tile loops are the same as the
// corresponding lower bounds of the original loops.
newLoop.setLowerBound(newLbOperands, origLoop.getLowerBoundMap());
// The new upper bound map for inter-tile loops, assuming constant lower
// bounds, is now originalLowerBound + ceildiv((originalUpperBound -
// originalLowerBound), tilingParameter), where the tiling parameter is the
// respective tile size for that loop. E.g., if the original ubMap was
// ()->(1024) with a lower bound lb, the new map will be
// ()[s0]->(lb + (1024 - lb) ceildiv s0), where s0 is the tiling parameter.
// Therefore, a new symbol operand is inserted in the map and the result
// expression is overwritten.
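// For example, for `affine.for %i = 8 to 256` with tiling parameter %t0,
// the inter-tile loop becomes `affine.for %ii = 8 to #ub()[%t0]` with
// #ub = affine_map<()[s0] -> (248 ceildiv s0 + 8)> (an illustrative name).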
assert(origLoop.hasConstantLowerBound() &&
"expected input loops to have constant lower bound.");
// Get lower bound of original loop as an affine expression.
AffineExpr origLowerBoundExpr;
origLowerBoundExpr =
b.getAffineConstantExpr(origLoop.getConstantLowerBound());
// Add dim operands from original upper bound.
SmallVector<Value, 4> ubOperands;
AffineBound ub = origLoop.getUpperBound();
ubOperands.reserve(ub.getNumOperands() + 1);
AffineMap origUbMap = ub.getMap();
for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
ubOperands.push_back(ub.getOperand(j));
// Add symbol operands from original upper bound.
for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));
// Add a new symbol operand which is the tile size for this loop.
ubOperands.push_back(tileSize);
// Get tiling parameter as an affine expression.
AffineExpr tileParameter = b.getAffineSymbolExpr(origUbMap.getNumSymbols());
SmallVector<AffineExpr, 4> boundExprs;
boundExprs.reserve(origUbMap.getNumResults());
int64_t origUpperBound;
AffineExpr origUpperBoundExpr;
// If upper bound for the original loop is constant, then the constant can
// be obtained as an affine expression straight away.
if (origLoop.hasConstantUpperBound()) {
origUpperBound = origLoop.getConstantUpperBound();
// Get original constant upper bound as an affine expression.
origUpperBoundExpr = b.getAffineConstantExpr(origUpperBound);
// Insert the bound as originalLowerBound + ceildiv((originalUpperBound -
// originalLowerBound), tilingParameter).
boundExprs.push_back(
origLowerBoundExpr +
(origUpperBoundExpr - origLowerBoundExpr).ceilDiv(tileParameter));
} else {
// If the upper bound of the original loop is not constant, then two cases
// are possible, although their handling is the same. 1.) The ubMap has
// only one result expression. E.g.
// affine.for %i = 5 to %ub
//
// A symbol operand is added to represent the tiling parameter. The
// new loop bounds here will be like ()[s0, s1] -> ((s0 - 5) ceildiv s1 + 5)
// where 's0' is the original upper bound and 's1' is the tiling
// parameter. 2.) The ubMap has more than one result expression. E.g.
// #map0 = affine_map<()[s0, s1] -> (s0, s1)>
// affine.for %i = 5 to min #map0()[%s0, %s1]
//
// A symbol operand is added to represent the tiling parameter. The
// new loop bounds will be like ()[s0, s1, s2] -> ((s0 - 5) ceildiv s2 + 5,
// (s1 - 5) ceildiv s2 + 5), where s2 is the tiling parameter.
// Insert the bounds as originalLowerBound + ceildiv((originalUpperBound -
// originalLowerBound), tilingParameter).
for (AffineExpr origUpperBoundExpr : origUbMap.getResults())
boundExprs.push_back(
origLowerBoundExpr +
(origUpperBoundExpr - origLowerBoundExpr).ceilDiv(tileParameter));
}
AffineMap ubMap =
AffineMap::get(origUbMap.getNumDims(), origUbMap.getNumSymbols() + 1,
boundExprs, b.getContext());
newLoop.setUpperBound(ubOperands, ubMap);
// Original loop step must be preserved.
newLoop.setStep(origLoop.getStep());
}
/// Constructs and sets new loop bounds after tiling for the case of
/// hyper-rectangular index sets, where the bounds of one dimension do not
/// depend on other dimensions and tiling parameters are captured from SSA
/// values. Bounds of each dimension can thus be treated independently,
/// and deriving the new bounds is much simpler and faster than for the case of
/// tiling arbitrary polyhedral shapes.
static void constructParametricallyTiledIndexSetHyperRect(
MutableArrayRef<AffineForOp> origLoops,
MutableArrayRef<AffineForOp> newLoops, ArrayRef<Value> tileSizes) {
assert(!origLoops.empty() && "expected at least one loop in band");
assert(origLoops.size() == tileSizes.size() &&
"expected tiling parameter for each loop in band.");
OpBuilder b(origLoops[0].getOperation());
unsigned width = origLoops.size();
// Set bounds for tile space loops.
for (unsigned i = 0; i < width; ++i) {
setInterTileBoundsParametric(b, origLoops[i], newLoops[i], tileSizes[i]);
}
// Set bounds for intra-tile loops.
for (unsigned i = 0; i < width; ++i) {
setIntraTileBoundsParametric(b, origLoops[i], newLoops[i],
newLoops[i + width], tileSizes[i]);
}
}
/// Constructs and sets new loop bounds after tiling for the case of
/// hyper-rectangular index sets, where the bounds of one dimension do not
/// depend on other dimensions. Bounds of each dimension can thus be treated
/// independently, and deriving the new bounds is much simpler and faster
/// than for the case of tiling arbitrary polyhedral shapes.
static void
constructTiledIndexSetHyperRect(MutableArrayRef<AffineForOp> origLoops,
MutableArrayRef<AffineForOp> newLoops,
ArrayRef<unsigned> tileSizes) {
assert(!origLoops.empty());
assert(origLoops.size() == tileSizes.size());
OpBuilder b(origLoops[0].getOperation());
unsigned width = origLoops.size();
// Bounds for tile space loops.
for (unsigned i = 0; i < width; i++) {
OperandRange newLbOperands = origLoops[i].getLowerBoundOperands();
OperandRange newUbOperands = origLoops[i].getUpperBoundOperands();
newLoops[i].setLowerBound(newLbOperands, origLoops[i].getLowerBoundMap());
newLoops[i].setUpperBound(newUbOperands, origLoops[i].getUpperBoundMap());
newLoops[i].setStep(tileSizes[i]);
}
// Bounds for intra-tile loops.
for (unsigned i = 0; i < width; i++) {
int64_t largestDiv = getLargestDivisorOfTripCount(origLoops[i]);
Optional<uint64_t> mayBeConstantCount = getConstantTripCount(origLoops[i]);
// The lower bound is just the tile-space loop.
AffineMap lbMap = b.getDimIdentityMap();
newLoops[width + i].setLowerBound(
/*operands=*/newLoops[i].getInductionVar(), lbMap);
// Set the upper bound.
if (mayBeConstantCount && mayBeConstantCount.getValue() < tileSizes[i]) {
// Trip count is less than the tile size: upper bound is lower bound +
// trip count.
AffineMap ubMap =
b.getSingleDimShiftAffineMap(mayBeConstantCount.getValue());
newLoops[width + i].setUpperBound(
/*operands=*/newLoops[i].getInductionVar(), ubMap);
} else if (largestDiv % tileSizes[i] != 0) {
// Intra-tile loop ii goes from i to min(i + tileSize, ub_i).
// Construct the upper bound map; the operands are the original operands
// with 'i' (tile-space loop) appended to it. The new upper bound map is
// the original one with an additional expression i + tileSize appended.
// Add dim operands from original upper bound.
SmallVector<Value, 4> ubOperands;
AffineBound ub = origLoops[i].getUpperBound();
ubOperands.reserve(ub.getNumOperands() + 1);
AffineMap origUbMap = ub.getMap();
for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
ubOperands.push_back(ub.getOperand(j));
// Add dim operand for new loop upper bound.
ubOperands.push_back(newLoops[i].getInductionVar());
// Add symbol operands from original upper bound.
for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));
SmallVector<AffineExpr, 4> boundExprs;
boundExprs.reserve(1 + origUbMap.getNumResults());
AffineExpr dim = b.getAffineDimExpr(origUbMap.getNumDims());
// The new upper bound map is the original one with an additional
// expression i + tileSize appended.
boundExprs.push_back(dim + tileSizes[i]);
boundExprs.append(origUbMap.getResults().begin(),
origUbMap.getResults().end());
AffineMap ubMap =
AffineMap::get(origUbMap.getNumDims() + 1, origUbMap.getNumSymbols(),
boundExprs, b.getContext());
newLoops[width + i].setUpperBound(/*operands=*/ubOperands, ubMap);
} else {
// No need of the min expression.
AffineExpr dim = b.getAffineDimExpr(0);
AffineMap ubMap = AffineMap::get(1, 0, dim + tileSizes[i]);
newLoops[width + i].setUpperBound(newLoops[i].getInductionVar(), ubMap);
}
}
}
/// Tiles the specified band of perfectly nested loops creating tile-space loops
/// and intra-tile loops. A band is a contiguous set of loops.
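/// For example (a sketch; printed map names will differ), with a tile size
/// of 32:
///   affine.for %i = 0 to 256 {...}
/// becomes
///   affine.for %ii = 0 to 256 step 32 {
///     affine.for %i = affine_map<(d0) -> (d0)>(%ii)
///         to affine_map<(d0) -> (d0 + 32)>(%ii) {...}
///   }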
// TODO: handle non hyper-rectangular spaces.
LogicalResult
mlir::tilePerfectlyNested(MutableArrayRef<AffineForOp> input,
ArrayRef<unsigned> tileSizes,
SmallVectorImpl<AffineForOp> *tiledNest) {
performPreTilingChecks(input, tileSizes);
MutableArrayRef<AffineForOp> origLoops = input;
AffineForOp rootAffineForOp = origLoops[0];
// Note that width is at least one since band isn't empty.
unsigned width = input.size();
SmallVector<AffineForOp, 6> tiledLoops(2 * width);
// Construct a tiled loop nest without setting its loop bounds; the bounds
// are set later.
constructTiledLoopNest(origLoops, rootAffineForOp, width, tiledLoops);
SmallVector<Value, 8> origLoopIVs;
extractForInductionVars(input, &origLoopIVs);
if (failed(checkIfHyperRectangular(input, rootAffineForOp, width)))
return failure();
// Set loop bounds for the tiled loop nest.
constructTiledIndexSetHyperRect(origLoops, tiledLoops, tileSizes);
// Replace original IVs with intra-tile loop IVs.
for (unsigned i = 0; i < width; i++)
origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar());
// Erase the old loop nest.
rootAffineForOp.erase();
if (tiledNest)
*tiledNest = std::move(tiledLoops);
return success();
}
/// Tiles the specified band of perfectly nested loops creating tile-space
/// loops and intra-tile loops, using SSA values as tiling parameters. A band
/// is a contiguous set of loops.
// TODO: handle non hyper-rectangular spaces.
LogicalResult
mlir::tilePerfectlyNestedParametric(MutableArrayRef<AffineForOp> input,
ArrayRef<Value> tileSizes,
SmallVectorImpl<AffineForOp> *tiledNest) {
performPreTilingChecks(input, tileSizes);
MutableArrayRef<AffineForOp> origLoops = input;
AffineForOp rootAffineForOp = origLoops[0];
// Note that width is at least one since band isn't empty.
unsigned width = input.size();
SmallVector<AffineForOp, 6> tiledLoops(2 * width);
// Construct a tiled loop nest without setting its loop bounds; the bounds
// are set later.
constructTiledLoopNest(origLoops, rootAffineForOp, width, tiledLoops);
SmallVector<Value, 8> origLoopIVs;
extractForInductionVars(input, &origLoopIVs);
if (failed(checkIfHyperRectangular(input, rootAffineForOp, width)))
return failure();
// Set loop bounds for the tiled loop nest.
constructParametricallyTiledIndexSetHyperRect(origLoops, tiledLoops,
tileSizes);
// Replace original IVs with intra-tile loop IVs.
for (unsigned i = 0; i < width; i++)
origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar());
// Erase the old loop nest.
rootAffineForOp.erase();
if (tiledNest)
*tiledNest = std::move(tiledLoops);
return success();
}
/// Collect perfectly nested loops starting from `rootForOps`. Loops are
/// perfectly nested if each loop is the first and only non-terminator operation
/// in the parent loop. Collect at most `maxLoops` loops and append them to
/// `forOps`.
template <typename T>
static void getPerfectlyNestedLoopsImpl(
SmallVectorImpl<T> &forOps, T rootForOp,
@@ -452,6 +1001,20 @@ void mlir::getPerfectlyNestedLoops(SmallVectorImpl<scf::ForOp> &nestedLoops,
getPerfectlyNestedLoopsImpl(nestedLoops, root);
}
/// Identify valid and profitable bands of loops to tile. This is currently just
/// a temporary placeholder to test the mechanics of tiled code generation.
/// Returns all maximal outermost perfect loop nests to tile.
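/// E.g., a function containing two sibling perfectly nested 2-d loop nests
/// yields two bands of two loops each.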
void mlir::getTileableBands(FuncOp f,
std::vector<SmallVector<AffineForOp, 6>> *bands) {
// Get maximal perfect nest of 'affine.for' insts starting from root
// (inclusive).
for (AffineForOp forOp : f.getOps<AffineForOp>()) {
SmallVector<AffineForOp, 6> band;
getPerfectlyNestedLoops(band, forOp);
bands->push_back(band);
}
}
/// Unrolls this loop completely.
LogicalResult mlir::loopUnrollFull(AffineForOp forOp) {
Optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);


@@ -0,0 +1,275 @@
// RUN: mlir-opt %s -split-input-file -test-affine-parametric-tile | FileCheck %s
// Test cases for the utility introduced to tile affine.for loops using
// SSA values as tiling parameters (tile sizes). The tile sizes are expected
// to be passed as input arguments (before any other argument) to the
// function enclosing the loop nest. Currently, hyper-rectangular loop nests
// with constant lower bounds are supported.
// -----
// CHECK-DAG: [[LBI:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0)>
// CHECK-DAG: [[UBI0:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0 + s0, 256)>
// CHECK-DAG: [[UBI1:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0 + s0, 512)>
// CHECK-DAG: [[UBI2:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0 + s0, 1024)>
// CHECK-DAG: [[UBO0:#map[0-9]+]] = affine_map<()[s0] -> (256 ceildiv s0)>
// CHECK-DAG: [[UBO1:#map[0-9]+]] = affine_map<()[s0] -> (512 ceildiv s0)>
// CHECK-DAG: [[UBO2:#map[0-9]+]] = affine_map<()[s0] -> (1024 ceildiv s0)>
// CHECK: func @loop_tiling_3d([[ARG0:%arg[0-9]+]]: index, [[ARG1:%arg[0-9]+]]: index, [[ARG2:%arg[0-9]+]]: index)
// CHECK-NEXT: affine.for [[ARG3:%arg[0-9]+]] = 0 to [[UBO0]](){{.*}}[[ARG0]]
// CHECK-NEXT: affine.for [[ARG4:%arg[0-9]+]] = 0 to [[UBO1]](){{.*}}[[ARG1]]
// CHECK-NEXT: affine.for [[ARG5:%arg[0-9]+]] = 0 to [[UBO2]](){{.*}}[[ARG2]]
// CHECK-NEXT: affine.for %[[I:.*]] = [[LBI]]{{.*}}[[ARG3]]{{.*}}[[ARG0]]{{.*}} to min [[UBI0]]{{.*}}[[ARG3]]{{.*}}[[ARG0]]
// CHECK-NEXT: affine.for %[[J:.*]] = [[LBI]]{{.*}}[[ARG4]]{{.*}}[[ARG1]]{{.*}} to min [[UBI1]]{{.*}}[[ARG4]]{{.*}}[[ARG1]]
// CHECK-NEXT: affine.for %[[K:.*]] = [[LBI]]{{.*}}[[ARG5]]{{.*}}[[ARG2]]{{.*}} to min [[UBI2]]{{.*}}[[ARG5]]{{.*}}[[ARG2]]
// CHECK-NEXT: "test.foo"(%[[I]], %[[J]], %[[K]])
func @loop_tiling_3d(%t0 : index, %t1 : index, %t2 : index) {
affine.for %i = 0 to 256 {
affine.for %j = 0 to 512 {
affine.for %k = 0 to 1024 {
"test.foo"(%i, %j, %k) : (index, index, index) -> ()
}
}
}
return
}
// -----
// CHECK-DAG: [[LBI:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0)>
// CHECK-DAG: [[UBI0:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0 + s0 * 4, 256)>
// CHECK-DAG: [[UBI1:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0 + s0 * 3, 512)>
// CHECK-DAG: [[UBI2:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0 + s0 * 2, 1024)>
// CHECK-DAG: [[UBO0:#map[0-9]+]] = affine_map<()[s0] -> (256 ceildiv s0)>
// CHECK-DAG: [[UBO1:#map[0-9]+]] = affine_map<()[s0] -> (512 ceildiv s0)>
// CHECK-DAG: [[UBO2:#map[0-9]+]] = affine_map<()[s0] -> (1024 ceildiv s0)>
// CHECK: func @loop_tiling_non_unit_step([[ARG0:%arg[0-9]+]]: index, [[ARG1:%arg[0-9]+]]: index, [[ARG2:%arg[0-9]+]]: index)
// CHECK-NEXT: affine.for [[ARG3:%arg[0-9]+]] = 0 to [[UBO0]](){{.*}}[[ARG0]]{{.*}}step 4
// CHECK-NEXT: affine.for [[ARG4:%arg[0-9]+]] = 0 to [[UBO1]](){{.*}}[[ARG1]]{{.*}} step 3
// CHECK-NEXT: affine.for [[ARG5:%arg[0-9]+]] = 0 to [[UBO2]](){{.*}}[[ARG2]]{{.*}} step 2
// CHECK-NEXT: affine.for %[[I:.*]] = [[LBI]]{{.*}}[[ARG3]]{{.*}}[[ARG0]]{{.*}} to min [[UBI0]]{{.*}}[[ARG3]]{{.*}}[[ARG0]]{{.*}} step 4
// CHECK-NEXT: affine.for %[[J:.*]] = [[LBI]]{{.*}}[[ARG4]]{{.*}}[[ARG1]]{{.*}} to min [[UBI1]]{{.*}}[[ARG4]]{{.*}}[[ARG1]]{{.*}} step 3
// CHECK-NEXT: affine.for %[[K:.*]] = [[LBI]]{{.*}}[[ARG5]]{{.*}}[[ARG2]]{{.*}} to min [[UBI2]]{{.*}}[[ARG5]]{{.*}}[[ARG2]]{{.*}} step 2
// CHECK-NEXT: "test.foo"(%[[I]], %[[J]], %[[K]])
func @loop_tiling_non_unit_step(%t0: index, %t1: index, %t2: index){
affine.for %i = 0 to 256 step 4 {
affine.for %j = 0 to 512 step 3 {
affine.for %k = 0 to 1024 step 2 {
"test.foo"(%i, %j, %k) : (index, index, index) -> ()
}
}
}
return
}
// -----
// CHECK-DAG: [[LBI0:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0)>
// CHECK-DAG: [[UBI0:#map[0-9]+]] = affine_map<(d0)[s0, s1, s2] -> (d0 * s2 + s2, s0, 4096 floordiv s1)>
// CHECK-DAG: [[UBO0:#map[0-9]+]] = affine_map<()[s0, s1, s2] -> (s0 ceildiv s2, (4096 floordiv s1) ceildiv s2)>
// CHECK: func @tile_loop_with_div_in_upper_bound([[ARG0:%arg[0-9]+]]: index, %{{.*}}: memref<?xi32>, %{{.*}}: index, %{{.*}}: index)
#ub = affine_map<()[s0, s1] -> (s0, 4096 floordiv s1)>
func @tile_loop_with_div_in_upper_bound(%t5 : index, %A : memref<? x i32>, %L : index, %U : index) {
%c0 = constant 0 : index
%M = dim %A, %c0 : memref<? x i32>
affine.for %i = 0 to min #ub()[%M, %U] {
addi %i, %i : index
}
// CHECK: affine.for [[ARG1:%arg[0-9]+]] = 0 to min [[UBO0]]()[%{{.*}}, %{{.*}}, [[ARG0]]]
// CHECK-NEXT: affine.for %[[I:.*]] = [[LBI0]]([[ARG1]]){{.*}}[[ARG0]]{{.*}} to min [[UBI0]]({{.*}})[{{.*}}, {{.*}}, [[ARG0]]]
// CHECK-NEXT: addi %[[I]], %[[I]]
return
}
// -----
// CHECK-DAG: [[LBI0:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0)>
// CHECK-DAG: [[UBI0:#map[0-9]+]] = affine_map<(d0)[s0, s1, s2] -> (d0 * s2 + s2 * 4, s0, 4096 floordiv s1)>
// CHECK-DAG: [[UBO0:#map[0-9]+]] = affine_map<()[s0, s1, s2] -> (s0 ceildiv s2, (4096 floordiv s1) ceildiv s2)>
// CHECK: func @tile_loop_with_div_in_upper_bound_non_unit_step([[ARG0:%arg[0-9]+]]: index, %{{.*}}: memref<?xi32>, %{{.*}}: index, %{{.*}}: index)
#ub = affine_map<()[s0, s1] -> (s0, 4096 floordiv s1)>
func @tile_loop_with_div_in_upper_bound_non_unit_step(%t5 : index, %A : memref<? x i32>, %L : index, %U : index) {
%c0 = constant 0 : index
%M = dim %A, %c0 : memref<? x i32>
affine.for %i = 0 to min #ub()[%M, %U] step 4 {
addi %i, %i : index
}
// CHECK: affine.for [[ARG1:%arg[0-9]+]] = 0 to min [[UBO0]]()[%{{.*}}, %{{.*}}, [[ARG0]]]{{.*}} step 4{{.*}}
// CHECK-NEXT: affine.for %[[I:.*]] = [[LBI0]]([[ARG1]]){{.*}}[[ARG0]]{{.*}} to min [[UBI0]]({{.*}})[{{.*}}, {{.*}}, [[ARG0]]]{{.*}} step 4{{.*}}
// CHECK-NEXT: addi %[[I]], %[[I]]
return
}
// -----
// CHECK-DAG: [[LBI0:#map[0-9]+]] = affine_map<(d0)[s0] -> ((d0 - 8) * s0 + 8)>
// CHECK-DAG: [[UBI2:#map[0-9]+]] = affine_map<(d0)[s0, s1] -> ((d0 - 8) * s1 + s1 * 4 + 8, s0 + 16)>
// CHECK-DAG: [[UBI1:#map[0-9]+]] = affine_map<(d0)[s0, s1] -> ((d0 - 8) * s1 + s1 + 8, s0 + 16)>
// CHECK-DAG: [[UBI0:#map[0-9]+]] = affine_map<(d0)[s0] -> ((d0 - 8) * s0 + s0 + 8, 256)>
// CHECK-DAG: [[UBO1:#map[0-9]+]] = affine_map<()[s0, s1] -> ((s0 + 8) ceildiv s1 + 8)>
// CHECK-DAG: [[UBO0:#map[0-9]+]] = affine_map<()[s0] -> (248 ceildiv s0 + 8)>
// CHECK: func @tile_loop_with_non_zero_lb([[ARG0:%arg[0-9]+]]: index, [[ARG1:%arg[0-9]+]]: index, [[ARG2:%arg[0-9]+]]: index, %{{.*}}: index)
// CHECK-NEXT: affine.for [[ARG3:%arg[0-9+]]] = 8 to [[UBO0]]{{.*}}[[ARG0]]{{.*}}
// CHECK-NEXT: affine.for [[ARG4:%arg[0-9+]]] = 8 to [[UBO1]]{{.*}}[[ARG1]]{{.*}}
// CHECK-NEXT: affine.for [[ARG5:%arg[0-9+]]] = 8 to [[UBO1]]{{.*}}[[ARG2]]{{.*}} step 4
// CHECK-NEXT: affine.for %[[I:.*]] = [[LBI0]]([[ARG3]]){{.*}}[[ARG0]]{{.*}} to min [[UBI0]]([[ARG3]]){{.*}}[[ARG0]]{{.*}}
// CHECK-NEXT: affine.for %[[J:.*]] = [[LBI0]]([[ARG4]]){{.*}}[[ARG1]]{{.*}} to min [[UBI1]]([[ARG4]]){{.*}}[[ARG1]]{{.*}}
// CHECK-NEXT: affine.for %[[K:.*]] = [[LBI0]]([[ARG5]]){{.*}}[[ARG2]]{{.*}} to min [[UBI2]]([[ARG5]]){{.*}}[[ARG2]]{{.*}}step 4{{.*}}
// CHECK-NEXT: "test.foo"(%[[I]], %[[J]], %[[K]]) : (index, index, index) -> ()
#ubi = affine_map<()[s0] -> (s0 + 16)>
func @tile_loop_with_non_zero_lb(%t0: index, %t1: index, %t2: index, %U: index){
affine.for %i = 8 to 256 {
affine.for %j = 8 to #ubi()[%U] {
affine.for %k = 8 to #ubi()[%U] step 4 {
"test.foo"(%i, %j, %k) : (index, index, index) -> ()
}
}
}
return
}
// -----
// CHECK-DAG: [[LBI:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0)>
// CHECK-DAG: [[UBI0:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0 + s0, 256)>
// CHECK-DAG: [[UBI1:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0 + s0, 250)>
// CHECK-DAG: [[UBO0:#map[0-9]+]] = affine_map<()[s0] -> (256 ceildiv s0)>
// CHECK-DAG: [[UBO1:#map[0-9]+]] = affine_map<()[s0] -> (250 ceildiv s0)>
// CHECK: func @simple_matmul([[ARG0:%arg[0-9]+]]: index, [[ARG1:%arg[0-9]+]]: index, [[ARG2:%arg[0-9]+]]: index{{.*}})
// CHECK-NEXT: affine.for [[ARG3:%arg[0-9]+]] = 0 to [[UBO0]](){{.*}}[[ARG0]]{{.*}}
// CHECK-NEXT: affine.for [[ARG4:%arg[0-9]+]] = 0 to [[UBO0]](){{.*}}[[ARG1]]{{.*}}
// CHECK-NEXT: affine.for [[ARG5:%arg[0-9]+]] = 0 to [[UBO1]](){{.*}}[[ARG2]]{{.*}}
// CHECK-NEXT: affine.for %[[I:.*]] = [[LBI]]{{.*}}[[ARG3]]{{.*}}[[ARG0]]{{.*}} to min [[UBI0]]{{.*}}[[ARG3]]{{.*}}[[ARG0]]{{.*}}
// CHECK-NEXT: affine.for %[[J:.*]] = [[LBI]]{{.*}}[[ARG4]]{{.*}}[[ARG1]]{{.*}} to min [[UBI0]]{{.*}}[[ARG4]]{{.*}}[[ARG1]]{{.*}}
// CHECK-NEXT: affine.for %[[K:.*]] = [[LBI]]{{.*}}[[ARG5]]{{.*}}[[ARG2]]{{.*}} to min [[UBI1]]{{.*}}[[ARG5]]{{.*}}[[ARG2]]{{.*}}
// CHECK-NEXT: affine.load %{{.*}}[%[[I]], %[[K]]]
// CHECK-NEXT: affine.load %{{.*}}[%[[K]], %[[J]]]
// CHECK-NEXT: affine.load %{{.*}}[%[[I]], %[[J]]]
// CHECK-NEXT: mulf %{{.*}}
// CHECK-NEXT: addf %{{.*}}
// CHECK-NEXT: affine.store %{{.*}}[%[[I]], %[[J]]]
func @simple_matmul(%t6 : index, %t7 : index, %t8 : index, %arg0: memref<256x256xvector<64xf32>>, %arg1: memref<256x256xvector<64xf32>>, %arg2: memref<256x256xvector<64xf32>>) -> memref<256x256xvector<64xf32>> {
affine.for %i = 0 to 256 {
affine.for %j = 0 to 256 {
affine.for %k = 0 to 250 {
%l = affine.load %arg0[%i, %k] : memref<256x256xvector<64xf32>>
%r = affine.load %arg1[%k, %j] : memref<256x256xvector<64xf32>>
%o = affine.load %arg2[%i, %j] : memref<256x256xvector<64xf32>>
%m = mulf %l, %r : vector<64xf32>
%a = addf %o, %m : vector<64xf32>
affine.store %a, %arg2[%i, %j] : memref<256x256xvector<64xf32>>
}
}
}
return %arg2 : memref<256x256xvector<64xf32>>
}
// -----
// CHECK-DAG: [[LBI0:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0)>
// CHECK-DAG: [[UBI0:#map[0-9]+]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s1, s0)>
// CHECK-DAG: [[UBO0:#map[0-9]+]] = affine_map<()[s0, s1] -> (s0 ceildiv s1)>
// CHECK: func @tile_with_symbolic_loop_upper_bounds([[ARG0:%arg[0-9]+]]: index, [[ARG1:%arg[0-9]+]]: index{{.*}}){{.*}}
// CHECK: affine.for [[ARG2:%arg[0-9]+]] = 0 to [[UBO0]](){{.*}}[[ARG0]]{{.*}}
// CHECK-NEXT: affine.for [[ARG3:%arg[0-9]+]] = 0 to [[UBO0]](){{.*}}[[ARG1]]{{.*}}
// CHECK-NEXT: affine.for %[[I0:.*]] = [[LBI0]]{{.*}}[[ARG2]]{{.*}}[[ARG0]]{{.*}} to min [[UBI0]]{{.*}}[[ARG2]]{{.*}}[[ARG0]]{{.*}}
// CHECK-NEXT: affine.for %[[I1:.*]] = [[LBI0]]{{.*}}[[ARG3]]{{.*}}[[ARG1]]{{.*}} to min [[UBI0]]{{.*}}[[ARG3]]{{.*}}[[ARG1]]{{.*}}
// CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%[[I0]], %[[I1]]] : memref<?x?xf32>
// CHECK-NEXT: affine.for %[[I2:.*]] = 0 to %{{.*}} {
// CHECK-NEXT: affine.load %{{.*}}%[[I0]], %[[I2]]
// CHECK-NEXT: affine.load %{{.*}}%[[I2]], %[[I1]]
// CHECK-NEXT: mulf
// CHECK-NEXT: affine.load %{{.*}}%[[I0]], %[[I1]]
// CHECK-NEXT: addf
// CHECK-NEXT: affine.store %{{.*}}%[[I0]], %[[I1]]
func @tile_with_symbolic_loop_upper_bounds(%t9 : index, %t10: index, %arg0: memref<?x?xf32>, %arg1: memref<?x?xf32>, %arg2: memref<?x?xf32>) {
%cst = constant 0.000000e+00 : f32
%c0 = constant 0 : index
%0 = dim %arg0, %c0 : memref<?x?xf32>
affine.for %i0 = 0 to %0 {
affine.for %i1 = 0 to %0 {
affine.store %cst, %arg2[%i0, %i1] : memref<?x?xf32>
affine.for %i2 = 0 to %0 {
%1 = affine.load %arg0[%i0, %i2] : memref<?x?xf32>
%2 = affine.load %arg1[%i2, %i1] : memref<?x?xf32>
%3 = mulf %1, %2 : f32
%4 = affine.load %arg2[%i0, %i1] : memref<?x?xf32>
%5 = addf %4, %3 : f32
affine.store %5, %arg2[%i0, %i1] : memref<?x?xf32>
}
}
}
return
}
// -----
// CHECK-DAG: [[LBI0:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0)>
// CHECK-DAG: [[UBI0:#map[0-9]+]] = affine_map<(d0)[s0, s1, s2] -> (d0 * s2 + s2, s0 + s1)>
// CHECK-DAG: [[UBO0:#map[0-9]+]] = affine_map<()[s0, s1, s2] -> ((s0 + s1) ceildiv s2)>
// CHECK: func @tile_with_loop_upper_bounds_in_two_symbols([[ARG0:%arg[0-9]+]]: index{{.*}}){{.*}}
func @tile_with_loop_upper_bounds_in_two_symbols(%t11 : index, %arg0: memref<?xf32>, %limit: index) {
%c0 = constant 0 : index
%dim0 = dim %arg0, %c0 : memref<?xf32>
affine.for %i0 = 0 to affine_map<()[s0, s1] -> (s0 + s1)> ()[%dim0, %limit] {
%v0 = affine.load %arg0[%i0] : memref<?xf32>
}
// CHECK: affine.for [[ARG1:%arg[0-9]+]] = 0 to [[UBO0]]()[%{{.*}}, %{{.*}}, [[ARG0]]]
// CHECK-NEXT: affine.for %[[I:.*]] = [[LBI0]]([[ARG1]]){{.*}}[[ARG0]]{{.*}} to min [[UBI0]]([[ARG1]])[{{.*}}, {{.*}}, [[ARG0]]]
// CHECK-NEXT: affine.load %{{.*}}[%[[I]]]
return
}
// -----
// CHECK-DAG: [[LBI0:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0)>
// CHECK-DAG: [[UBI1:#map[0-9]+]] = affine_map<(d0, d1)[s0, s1] -> (d1 * s1 + s1, d0 + s0 + 4)>
// CHECK-DAG: [[UBI0:#map[0-9]+]] = affine_map<(d0, d1)[s0, s1] -> (d1 * s1 + s1, d0 + s0 + 2)>
// CHECK-DAG: [[LBO0:#map[0-9]+]] = affine_map<() -> (0)>
// CHECK-DAG: [[UBO1:#map[0-9]+]] = affine_map<(d0)[s0, s1] -> ((d0 + s0 + 4) ceildiv s1)>
// CHECK-DAG: [[UBO0:#map[0-9]+]] = affine_map<(d0)[s0, s1] -> ((d0 + s0 + 2) ceildiv s1)>
// CHECK: func @tile_with_upper_bounds_in_dimensions_and_symbols([[ARG0:%arg[0-9]+]]: index, [[ARG1:%arg[0-9]+]]: index, [[ARG2:%arg[0-9]+]]: index, [[ARG3:%arg[0-9]+]]: index{{.*}}){{.*}}
// CHECK-NEXT: affine.for [[ARG4:%arg[0-9]+]] = 0 to [[UBO0]]({{.*}}){{.*}}[[ARG0]]
// CHECK-NEXT: affine.for [[ARG5:%arg[0-9]+]] = 0 to [[UBO1]]({{.*}}){{.*}}[[ARG1]]
// CHECK-NEXT: affine.for {{.*}} = [[LBI0]]([[ARG4]]){{.*}}[[ARG0]]{{.*}} to min [[UBI0]]({{.*}}, [[ARG4]]){{.*}}[[ARG0]]{{.*}}
// CHECK-NEXT: affine.for {{.*}} = [[LBI0]]([[ARG5]]){{.*}}[[ARG1]]{{.*}} to min [[UBI1]]({{.*}}, [[ARG5]]){{.*}}[[ARG1]]{{.*}}
func @tile_with_upper_bounds_in_dimensions_and_symbols(%t12 : index, %t13 :index, %M: index, %N: index, %K: index) {
affine.for %i = 0 to affine_map<(d0)[s0] -> (d0 + s0 + 2)>(%M)[%K] {
affine.for %j = 0 to affine_map<(d0)[s0] -> (d0 + s0 + 4)>(%N)[%K] {
"test.foo" () : () -> ()
}
}
return
}
// -----
// CHECK-DAG: [[LBI0:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0)>
// CHECK-DAG: [[UBI1:#map[0-9]+]] = affine_map<(d0, d1)[s0, s1] -> (d1 * s1 + s1 * 4, d0 + s0 + 4)>
// CHECK-DAG: [[UBI0:#map[0-9]+]] = affine_map<(d0, d1)[s0, s1] -> (d1 * s1 + s1 * 2, d0 + s0 + 2)>
// CHECK-DAG: [[LBO0:#map[0-9]+]] = affine_map<() -> (0)>
// CHECK-DAG: [[UBO1:#map[0-9]+]] = affine_map<(d0)[s0, s1] -> ((d0 + s0 + 4) ceildiv s1)>
// CHECK-DAG: [[UBO0:#map[0-9]+]] = affine_map<(d0)[s0, s1] -> ((d0 + s0 + 2) ceildiv s1)>
// CHECK: func @tile_with_upper_bounds_in_dimensions_and_symbols_non_unit_steps
// CHECK-SAME: ([[ARG0:%arg[0-9]+]]: index, [[ARG1:%arg[0-9]+]]: index, [[ARG2:%arg[0-9]+]]: index, [[ARG3:%arg[0-9]+]]: index{{.*}}){{.*}}
// CHECK-NEXT: affine.for [[ARG4:%arg[0-9]+]] = 0 to [[UBO0]]({{.*}}){{.*}}[[ARG0]]{{.*}} step 2{{.*}}
// CHECK-NEXT: affine.for [[ARG5:%arg[0-9]+]] = 0 to [[UBO1]]({{.*}}){{.*}}[[ARG1]]{{.*}} step 4{{.*}}
// CHECK-NEXT: affine.for {{.*}} = [[LBI0]]([[ARG4]]){{.*}}[[ARG0]]{{.*}} to min [[UBI0]]({{.*}}, [[ARG4]]){{.*}}[[ARG0]]{{.*}} step 2{{.*}}
// CHECK-NEXT: affine.for {{.*}} = [[LBI0]]([[ARG5]]){{.*}}[[ARG1]]{{.*}} to min [[UBI1]]({{.*}}, [[ARG5]]){{.*}}[[ARG1]]{{.*}} step 4{{.*}}
func @tile_with_upper_bounds_in_dimensions_and_symbols_non_unit_steps(%t12 : index, %t13 :index, %M: index, %N : index, %K: index) {
affine.for %i = 0 to affine_map<(d0)[s0] -> (d0 + s0 + 2)>(%M)[%K] step 2 {
affine.for %j = 0 to affine_map<(d0)[s0] -> (d0 + s0 + 4)>(%N)[%K] step 4 {
"test.foo" () : () -> ()
}
}
return
}


@@ -1,6 +1,7 @@
# Exclude tests from libMLIR.so
add_mlir_library(MLIRTestTransforms
TestAllReduceLowering.cpp
TestAffineLoopParametricTiling.cpp
TestBufferPlacement.cpp
TestExpandTanh.cpp
TestCallGraph.cpp


@@ -0,0 +1,90 @@
//===- TestAffineLoopParametricTiling.cpp -- Parametric Affine loop tiling -===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a test pass to test parametric tiling of perfectly
// nested affine for loops.
//
//===----------------------------------------------------------------------===//
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/Passes.h"
#include "mlir/Transforms/LoopUtils.h"
using namespace mlir;
#define DEBUG_TYPE "test-affine-parametric-tile"
namespace {
struct TestAffineLoopParametricTiling
: public PassWrapper<TestAffineLoopParametricTiling, FunctionPass> {
void runOnFunction() override;
};
} // end anonymous namespace
/// Checks if the function enclosing the loop nest has any arguments passed to
/// it, which can be used as tiling parameters. Assumes that at least 'n'
/// arguments are passed, where 'n' is the number of loops in the loop nest.
static void checkIfTilingParametersExist(ArrayRef<AffineForOp> band) {
assert(!band.empty() && "no loops in input band");
AffineForOp topLoop = band[0];
if (FuncOp funcOp = dyn_cast<FuncOp>(topLoop.getParentOp()))
assert(funcOp.getNumArguments() >= band.size() && "Too few tile sizes");
}
/// Captures tiling parameters, which are expected to be passed as arguments
/// to the function enclosing the loop nest. Also checks if the required
/// parameters are of index type. This approach is temporary for testing
/// purposes.
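/// E.g., for a 3-d band inside `func @foo(%t0 : index, %t1 : index,
/// %t2 : index, ...)`, %t0, %t1 and %t2 are captured as the tile sizes.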
static void getTilingParameters(ArrayRef<AffineForOp> band,
SmallVectorImpl<Value> &tilingParameters) {
AffineForOp topLoop = band[0];
Region *funcOpRegion = topLoop.getParentRegion();
unsigned nestDepth = band.size();
for (BlockArgument blockArgument :
funcOpRegion->getArguments().take_front(nestDepth)) {
if (blockArgument.getArgNumber() < nestDepth) {
assert(blockArgument.getType().isIndex() &&
"expected tiling parameters to be of index type.");
tilingParameters.push_back(blockArgument);
}
}
}
void TestAffineLoopParametricTiling::runOnFunction() {
// Bands of loops to tile.
std::vector<SmallVector<AffineForOp, 6>> bands;
getTileableBands(getFunction(), &bands);
// Tile each band.
for (SmallVectorImpl<AffineForOp> &band : bands) {
// Capture the tiling parameters from the arguments to the function
// enclosing this loop nest.
SmallVector<AffineForOp, 6> tiledNest;
SmallVector<Value, 6> tilingParameters;
// Check if tiling parameters are present.
checkIfTilingParametersExist(band);
// Get function arguments as tiling parameters.
getTilingParameters(band, tilingParameters);
if (failed(
tilePerfectlyNestedParametric(band, tilingParameters, &tiledNest)))
return signalPassFailure();
}
}
namespace mlir {
void registerTestAffineLoopParametricTilingPass() {
PassRegistration<TestAffineLoopParametricTiling>(
"test-affine-parametric-tile",
"Tile affine loops using SSA values as tile sizes");
}
} // namespace mlir


@@ -41,6 +41,7 @@ void registerSimpleParametricTilingPass();
void registerSliceAnalysisTestPass();
void registerSymbolTestPasses();
void registerTestAffineDataCopyPass();
void registerTestAffineLoopParametricTilingPass();
void registerTestAffineLoopUnswitchingPass();
void registerTestAllReduceLoweringPass();
void registerTestBufferPlacementPreparationPass();
@@ -104,6 +105,7 @@ void registerTestPasses() {
#if MLIR_ROCM_CONVERSIONS_ENABLED
registerTestConvertGPUKernelToHsacoPass();
#endif
registerTestAffineLoopParametricTilingPass();
registerTestBufferPlacementPreparationPass();
registerTestDominancePass();
registerTestFunc();