[MLIR][Affine] Add parametric tile size support for affine.for tiling
Add support to tile affine.for ops with parametric sizes (i.e., SSA
values). Currently supports hyper-rectangular loop nests with constant
lower bounds only.

Move the methods
- moveLoopBody(*)
- getTileableBands(*)
- checkTilingLegality(*)
- tilePerfectlyNested(*)
- constructTiledIndexSetHyperRect(*)
to allow reuse with the constant tile size API.

Add a test pass -test-affine-parametric-tile to test parametric tiling.

Differential Revision: https://reviews.llvm.org/D87353
This commit is contained in:
parent 296e97ae8f
commit 0602e8f77f
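For illustration, here is a sketch of the transformation on a single loop
(the map shapes are taken from the tests added below; %t0 is the tile-size
SSA value passed as the leading function argument):

  // Before parametric tiling:
  affine.for %i = 0 to 256 {
    "test.foo"(%i) : (index) -> ()
  }

  // After tiling by %t0 (sketch):
  #lbi = affine_map<(d0)[s0] -> (d0 * s0)>
  #ubi = affine_map<(d0)[s0] -> (d0 * s0 + s0, 256)>
  #ubo = affine_map<()[s0] -> (256 ceildiv s0)>
  affine.for %ii = 0 to #ubo()[%t0] {
    affine.for %i = #lbi(%ii)[%t0] to min #ubi(%ii)[%t0] {
      "test.foo"(%i) : (index) -> ()
    }
  }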
@@ -88,16 +88,28 @@ LLVM_NODISCARD
LogicalResult affineForOpBodySkew(AffineForOp forOp, ArrayRef<uint64_t> shifts,
                                  bool unrollPrologueEpilogue = false);

/// Identify valid and profitable bands of loops to tile. This is currently just
/// a temporary placeholder to test the mechanics of tiled code generation.
/// Returns all maximal outermost perfect loop nests to tile.
void getTileableBands(FuncOp f,
                      std::vector<SmallVector<AffineForOp, 6>> *bands);

/// Tiles the specified band of perfectly nested loops creating tile-space loops
/// and intra-tile loops. A band is a contiguous set of loops. `tiledNest` when
/// non-null is set to the loops of the tiled nest from outermost to innermost.
/// Loops in `input` are erased when the tiling is successful.
LLVM_NODISCARD
LogicalResult
tilePerfectlyNested(MutableArrayRef<AffineForOp> input,
                    ArrayRef<unsigned> tileSizes,
                    SmallVectorImpl<AffineForOp> *tiledNest = nullptr);

/// Tiles the specified band of perfectly nested loops creating tile-space
/// loops and intra-tile loops, using SSA values as tiling parameters. A band
/// is a contiguous set of loops.
LLVM_NODISCARD
LogicalResult tilePerfectlyNestedParametric(
    MutableArrayRef<AffineForOp> input, ArrayRef<Value> tileSizes,
    SmallVectorImpl<AffineForOp> *tiledNest = nullptr);

/// Performs loop interchange on 'forOpA' and 'forOpB'. Requires that 'forOpA'
/// and 'forOpB' are part of a perfectly nested sequence of loops.
void interchangeLoops(AffineForOp forOpA, AffineForOp forOpB);
@@ -61,278 +61,6 @@ std::unique_ptr<OperationPass<FuncOp>> mlir::createLoopTilingPass() {
  return std::make_unique<LoopTiling>();
}

// Move the loop body of AffineForOp 'src' from 'src' into the specified
// location in destination's body, ignoring the terminator.
static inline void moveLoopBody(AffineForOp src, AffineForOp dest,
                                Block::iterator loc) {
  auto &insts = src.getBody()->getOperations();
  dest.getBody()->getOperations().splice(loc, insts, insts.begin(),
                                         std::prev(insts.end()));
}

// Move the loop body of AffineForOp 'src' from 'src' to the start of dest's
// body.
static inline void moveLoopBody(AffineForOp src, AffineForOp dest) {
  moveLoopBody(src, dest, dest.getBody()->begin());
}

/// Constructs and sets new loop bounds after tiling for the case of
/// hyper-rectangular index sets, where the bounds of one dimension do not
/// depend on other dimensions. Bounds of each dimension can thus be treated
/// independently, and deriving the new bounds is much simpler and faster
/// than for the case of tiling arbitrary polyhedral shapes.
static void
constructTiledIndexSetHyperRect(MutableArrayRef<AffineForOp> origLoops,
                                MutableArrayRef<AffineForOp> newLoops,
                                ArrayRef<unsigned> tileSizes) {
  assert(!origLoops.empty());
  assert(origLoops.size() == tileSizes.size());

  OpBuilder b(origLoops[0].getOperation());
  unsigned width = origLoops.size();

  // Bounds for tile space loops.
  for (unsigned i = 0; i < width; i++) {
    OperandRange newLbOperands = origLoops[i].getLowerBoundOperands();
    OperandRange newUbOperands = origLoops[i].getUpperBoundOperands();
    newLoops[i].setLowerBound(newLbOperands, origLoops[i].getLowerBoundMap());
    newLoops[i].setUpperBound(newUbOperands, origLoops[i].getUpperBoundMap());
    newLoops[i].setStep(tileSizes[i]);
  }
  // Bounds for intra-tile loops.
  for (unsigned i = 0; i < width; i++) {
    int64_t largestDiv = getLargestDivisorOfTripCount(origLoops[i]);
    auto mayBeConstantCount = getConstantTripCount(origLoops[i]);
    // The lower bound is just the tile-space loop.
    AffineMap lbMap = b.getDimIdentityMap();
    newLoops[width + i].setLowerBound(
        /*operands=*/newLoops[i].getInductionVar(), lbMap);

    // Set the upper bound.
    if (mayBeConstantCount && mayBeConstantCount.getValue() < tileSizes[i]) {
      // Trip count is less than the tile size: upper bound is lower bound +
      // trip count.
      auto ubMap = b.getSingleDimShiftAffineMap(mayBeConstantCount.getValue());
      newLoops[width + i].setUpperBound(
          /*operands=*/newLoops[i].getInductionVar(), ubMap);
    } else if (largestDiv % tileSizes[i] != 0) {
      // Intra-tile loop ii goes from i to min(i + tileSize, ub_i).
      // Construct the upper bound map; the operands are the original operands
      // with 'i' (tile-space loop) appended to it. The new upper bound map is
      // the original one with an additional expression i + tileSize appended.

      // Add dim operands from original upper bound.
      SmallVector<Value, 4> ubOperands;
      auto ub = origLoops[i].getUpperBound();
      ubOperands.reserve(ub.getNumOperands() + 1);
      auto origUbMap = ub.getMap();
      for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
        ubOperands.push_back(ub.getOperand(j));

      // Add dim operand for new loop upper bound.
      ubOperands.push_back(newLoops[i].getInductionVar());

      // Add symbol operands from original upper bound.
      for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
        ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));

      SmallVector<AffineExpr, 4> boundExprs;
      boundExprs.reserve(1 + origUbMap.getNumResults());
      auto dim = b.getAffineDimExpr(origUbMap.getNumDims());
      // The new upper bound map is the original one with an additional
      // expression i + tileSize appended.
      boundExprs.push_back(dim + tileSizes[i]);
      boundExprs.append(origUbMap.getResults().begin(),
                        origUbMap.getResults().end());
      auto ubMap =
          AffineMap::get(origUbMap.getNumDims() + 1, origUbMap.getNumSymbols(),
                         boundExprs, b.getContext());
      newLoops[width + i].setUpperBound(/*operands=*/ubOperands, ubMap);
    } else {
      // No need of the min expression.
      auto dim = b.getAffineDimExpr(0);
      auto ubMap = AffineMap::get(1, 0, dim + tileSizes[i]);
      newLoops[width + i].setUpperBound(newLoops[i].getInductionVar(), ubMap);
    }
  }
}

/// This function checks whether hyper-rectangular loop tiling of the nest
/// represented by `origLoops` is valid. The validity condition is from Irigoin
/// and Triolet, which states that two tiles cannot depend on each other. We
/// simplify such condition to just checking whether there is any negative
/// dependence direction, since we have the prior knowledge that the tiling
/// results will be hyper-rectangles, which are scheduled in the
/// lexicographically increasing order on the vector of loop indices. This
/// function will return failure when any dependence component is negative along
/// any of `origLoops`.
static LogicalResult
checkTilingLegality(MutableArrayRef<mlir::AffineForOp> origLoops) {
  assert(!origLoops.empty() && "no original loops provided");

  // We first find out all dependences we intend to check.
  SmallVector<Operation *, 8> loadAndStoreOps;
  origLoops[0].getOperation()->walk([&](Operation *op) {
    if (isa<AffineReadOpInterface, AffineWriteOpInterface>(op))
      loadAndStoreOps.push_back(op);
  });

  unsigned numOps = loadAndStoreOps.size();
  unsigned numLoops = origLoops.size();
  FlatAffineConstraints dependenceConstraints;
  for (unsigned d = 1; d <= numLoops + 1; ++d) {
    for (unsigned i = 0; i < numOps; ++i) {
      Operation *srcOp = loadAndStoreOps[i];
      MemRefAccess srcAccess(srcOp);
      for (unsigned j = 0; j < numOps; ++j) {
        Operation *dstOp = loadAndStoreOps[j];
        MemRefAccess dstAccess(dstOp);

        SmallVector<DependenceComponent, 2> depComps;
        dependenceConstraints.reset();
        DependenceResult result = checkMemrefAccessDependence(
            srcAccess, dstAccess, d, &dependenceConstraints, &depComps);

        // Skip if there is no dependence in this case.
        if (!hasDependence(result))
          continue;

        // Check whether there is any negative direction vector in the
        // dependence components found above, which means that dependence is
        // violated by the default hyper-rect tiling method.
        LLVM_DEBUG(llvm::dbgs() << "Checking whether tiling legality violated "
                                   "for dependence at depth: "
                                << Twine(d) << " between:\n";);
        LLVM_DEBUG(srcAccess.opInst->dump(););
        LLVM_DEBUG(dstAccess.opInst->dump(););
        for (unsigned k = 0, e = depComps.size(); k < e; k++) {
          DependenceComponent depComp = depComps[k];
          if (depComp.lb.hasValue() && depComp.ub.hasValue() &&
              depComp.lb.getValue() < depComp.ub.getValue() &&
              depComp.ub.getValue() < 0) {
            LLVM_DEBUG(llvm::dbgs()
                       << "Dependence component lb = "
                       << Twine(depComp.lb.getValue())
                       << " ub = " << Twine(depComp.ub.getValue())
                       << " is negative at depth: " << Twine(d)
                       << " and thus violates the legality rule.\n");
            return failure();
          }
        }
      }
    }
  }

  return success();
}
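
// An illustrative example (hypothetical; not part of the test suite): in
// the nest below, the value stored at (%i, %j) is read at (%i + 1, %j - 1),
// i.e., the dependence has direction (1, -1). The negative trailing
// component is exactly what this check rejects: with hyper-rectangular
// tiles executed in lexicographic order, a tile could depend on a later
// tile.
//   affine.for %i = 1 to 100 {
//     affine.for %j = 0 to 99 {
//       %v = affine.load %A[%i - 1, %j + 1] : memref<100x100xf32>
//       affine.store %v, %A[%i, %j] : memref<100x100xf32>
//     }
//   }
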
/// Tiles the specified band of perfectly nested loops creating tile-space loops
/// and intra-tile loops. A band is a contiguous set of loops.
// TODO: handle non hyper-rectangular spaces.
LogicalResult
mlir::tilePerfectlyNested(MutableArrayRef<AffineForOp> input,
                          ArrayRef<unsigned> tileSizes,
                          SmallVectorImpl<AffineForOp> *tiledNest) {
  // Check if the supplied for op's are all successively nested.
  assert(!input.empty() && "no loops in input band");
  assert(input.size() == tileSizes.size() && "Too few/many tile sizes");

  assert(isPerfectlyNested(input) && "input loops not perfectly nested");

  auto origLoops = input;

  // Perform tiling legality test.
  if (failed(checkTilingLegality(origLoops)))
    origLoops[0].emitRemark("tiled code is illegal due to dependences");

  AffineForOp rootAffineForOp = origLoops[0];
  auto loc = rootAffineForOp.getLoc();
  // Note that width is at least one since the band isn't empty.
  unsigned width = input.size();

  SmallVector<AffineForOp, 6> tiledLoops(2 * width);

  // The outermost among the loops as we add more.
  auto *topLoop = rootAffineForOp.getOperation();
  AffineForOp innermostPointLoop;

  // Add intra-tile (or point) loops.
  for (unsigned i = 0; i < width; i++) {
    OpBuilder b(topLoop);
    // Loop bounds will be set later.
    auto pointLoop = b.create<AffineForOp>(loc, 0, 0);
    pointLoop.getBody()->getOperations().splice(
        pointLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
        topLoop);
    tiledLoops[2 * width - 1 - i] = pointLoop;
    topLoop = pointLoop.getOperation();
    if (i == 0)
      innermostPointLoop = pointLoop;
  }

  // Add tile space loops.
  for (unsigned i = width; i < 2 * width; i++) {
    OpBuilder b(topLoop);
    // Loop bounds will be set later.
    auto tileSpaceLoop = b.create<AffineForOp>(loc, 0, 0);
    tileSpaceLoop.getBody()->getOperations().splice(
        tileSpaceLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
        topLoop);
    tiledLoops[2 * width - i - 1] = tileSpaceLoop;
    topLoop = tileSpaceLoop.getOperation();
  }

  // Move the loop body of the original nest to the new one.
  moveLoopBody(origLoops.back(), innermostPointLoop);

  SmallVector<Value, 8> origLoopIVs;
  extractForInductionVars(input, &origLoopIVs);

  FlatAffineConstraints cst;
  SmallVector<Operation *, 8> ops;
  ops.reserve(input.size());
  for (AffineForOp forOp : input)
    ops.push_back(forOp);
  getIndexSet(ops, &cst);
  if (!cst.isHyperRectangular(0, width)) {
    rootAffineForOp.emitError("tiled code generation unimplemented for the "
                              "non-hyperrectangular case");
    return failure();
  }

  constructTiledIndexSetHyperRect(origLoops, tiledLoops, tileSizes);

  // Replace original IVs with intra-tile loop IVs.
  for (unsigned i = 0; i < width; i++)
    origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar());

  // Erase the old loop nest.
  rootAffineForOp.erase();

  if (tiledNest)
    *tiledNest = std::move(tiledLoops);

  return success();
}

// Identify valid and profitable bands of loops to tile. This is currently just
// a temporary placeholder to test the mechanics of tiled code generation.
// Returns all maximal outermost perfect loop nests to tile.
static void getTileableBands(FuncOp f,
                             std::vector<SmallVector<AffineForOp, 6>> *bands) {
  // Get maximal perfect nest of 'affine.for' insts starting from root
  // (inclusive).
  auto getMaximalPerfectLoopNest = [&](AffineForOp root) {
    SmallVector<AffineForOp, 6> band;
    getPerfectlyNestedLoops(band, root);
    bands->push_back(band);
  };

  for (auto &block : f)
    for (auto &op : block)
      if (auto forOp = dyn_cast<AffineForOp>(op))
        getMaximalPerfectLoopNest(forOp);
}

/// Reduces each tile size to the largest divisor of the corresponding trip
/// count (if the trip count is known).
static void adjustToDivisorsOfTripCounts(ArrayRef<AffineForOp> band,
@@ -340,7 +68,7 @@ static void adjustToDivisorsOfTripCounts(ArrayRef<AffineForOp> band,
  assert(band.size() == tileSizes->size() && "invalid tile size count");
  for (unsigned i = 0, e = band.size(); i < e; i++) {
    unsigned &tSizeAdjusted = (*tileSizes)[i];
    Optional<uint64_t> mayConst = getConstantTripCount(band[i]);
    if (!mayConst)
      continue;
    // Adjust the tile size to largest factor of the trip count less than

@@ -379,14 +107,14 @@ void LoopTiling::getTileSizes(ArrayRef<AffineForOp> band,
  tileSizes->resize(band.size());

  // The first loop in the band.
  AffineForOp rootForOp = band[0];
  (void)rootForOp;

  // Obtain memory footprint and set tile sizes so that a tile fits in
  // the cache size. This is an approximation with the assumption that the
  // footprint increases with the tile size linearly in that dimension (i.e.,
  // assumes one-to-one access function).
  Optional<int64_t> fp = getMemoryFootprintBytes(band[0], 0);
  if (!fp) {
    // Fill with default tile sizes if footprint is unknown.
    std::fill(tileSizes->begin(), tileSizes->end(),

@@ -445,7 +173,7 @@ void LoopTiling::runOnFunction() {
    getTileSizes(band, &tileSizes);
    if (llvm::DebugFlag) {
      auto diag = band[0].emitRemark("using tile sizes [");
      for (unsigned tSize : tileSizes)
        diag << tSize << ' ';
      diag << "]\n";
    }
@@ -418,10 +418,559 @@ LogicalResult mlir::affineForOpBodySkew(AffineForOp forOp,
  return success();
}

/// Checks the legality of tiling of a hyper-rectangular loop nest by simply
/// checking if there is a 'negative' dependence in the memrefs present in
/// the loop nest. If yes then tiling is invalid.
static bool
checkTilingLegalityImpl(MutableArrayRef<mlir::AffineForOp> origLoops) {
  assert(!origLoops.empty() && "no original loops provided");

  // We first find out all dependences we intend to check.
  SmallVector<Operation *, 8> loadAndStoreOps;
  origLoops[0].getOperation()->walk([&](Operation *op) {
    if (isa<AffineReadOpInterface, AffineWriteOpInterface>(op))
      loadAndStoreOps.push_back(op);
  });

  unsigned numOps = loadAndStoreOps.size();
  unsigned numLoops = origLoops.size();
  FlatAffineConstraints dependenceConstraints;
  for (unsigned d = 1; d <= numLoops + 1; ++d) {
    for (unsigned i = 0; i < numOps; ++i) {
      Operation *srcOp = loadAndStoreOps[i];
      MemRefAccess srcAccess(srcOp);
      for (unsigned j = 0; j < numOps; ++j) {
        Operation *dstOp = loadAndStoreOps[j];
        MemRefAccess dstAccess(dstOp);

        SmallVector<DependenceComponent, 2> depComps;
        dependenceConstraints.reset();
        DependenceResult result = checkMemrefAccessDependence(
            srcAccess, dstAccess, d, &dependenceConstraints, &depComps);

        // Skip if there is no dependence in this case.
        if (!hasDependence(result))
          continue;

        // Check whether there is any negative direction vector in the
        // dependence components found above, which means that dependence is
        // violated by the default hyper-rect tiling method.
        LLVM_DEBUG(llvm::dbgs() << "Checking whether tiling legality violated "
                                   "for dependence at depth: "
                                << Twine(d) << " between:\n";);
        LLVM_DEBUG(srcAccess.opInst->dump(););
        LLVM_DEBUG(dstAccess.opInst->dump(););
        for (unsigned k = 0, e = depComps.size(); k < e; k++) {
          DependenceComponent depComp = depComps[k];
          if (depComp.lb.hasValue() && depComp.ub.hasValue() &&
              depComp.lb.getValue() < depComp.ub.getValue() &&
              depComp.ub.getValue() < 0) {
            LLVM_DEBUG(llvm::dbgs()
                       << "Dependence component lb = "
                       << Twine(depComp.lb.getValue())
                       << " ub = " << Twine(depComp.ub.getValue())
                       << " is negative at depth: " << Twine(d)
                       << " and thus violates the legality rule.\n");
            return false;
          }
        }
      }
    }
  }

  return true;
}

/// Checks whether hyper-rectangular loop tiling of the nest
/// represented by `origLoops` is valid. The validity condition is from Irigoin
/// and Triolet, which states that two tiles cannot depend on each other. We
/// simplify such condition to just checking whether there is any negative
/// dependence direction, since we have the prior knowledge that the tiling
/// results will be hyper-rectangles, which are scheduled in the
/// lexicographically increasing order on the vector of loop indices. This
/// function will return failure when any dependence component is negative along
/// any of `origLoops`.
LogicalResult
checkTilingLegality(MutableArrayRef<mlir::AffineForOp> origLoops) {
  return success(checkTilingLegalityImpl(origLoops));
}

/// Checks if the input data is valid and whether the tiled code will be legal.
template <typename t>
void performPreTilingChecks(MutableArrayRef<AffineForOp> input,
                            ArrayRef<t> tileSizes) {
  // Check if the supplied for op's are all successively nested.
  assert(!input.empty() && "no loops in input band");
  assert(input.size() == tileSizes.size() && "Too few/many tile sizes");

  assert(isPerfectlyNested(input) && "input loops not perfectly nested");

  // Perform tiling legality test.
  if (failed(checkTilingLegality(input)))
    input[0].emitRemark("tiled code is illegal due to dependences");
}

/// Move the loop body of AffineForOp 'src' from 'src' into the specified
/// location in destination's body, ignoring the terminator.
static void moveLoopBodyImpl(AffineForOp src, AffineForOp dest,
                             Block::iterator loc) {
  auto &ops = src.getBody()->getOperations();
  dest.getBody()->getOperations().splice(loc, ops, ops.begin(),
                                         std::prev(ops.end()));
}

/// Move the loop body of AffineForOp 'src' from 'src' to the start of dest's
/// body.
void moveLoopBody(AffineForOp src, AffineForOp dest) {
  moveLoopBodyImpl(src, dest, dest.getBody()->begin());
}

/// Constructs a tiled loop nest, without setting the loop bounds, and moves
/// the body of the original loop nest to the tiled loop nest.
void constructTiledLoopNest(MutableArrayRef<AffineForOp> origLoops,
                            AffineForOp rootAffineForOp, unsigned width,
                            MutableArrayRef<AffineForOp> tiledLoops) {
  Location loc = rootAffineForOp.getLoc();

  // The outermost among the loops as we add more.
  Operation *topLoop = rootAffineForOp.getOperation();
  AffineForOp innermostPointLoop;

  // Add intra-tile (or point) loops.
  for (unsigned i = 0; i < width; i++) {
    OpBuilder b(topLoop);
    // Loop bounds will be set later.
    AffineForOp pointLoop = b.create<AffineForOp>(loc, 0, 0);
    pointLoop.getBody()->getOperations().splice(
        pointLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
        topLoop);
    tiledLoops[2 * width - 1 - i] = pointLoop;
    topLoop = pointLoop.getOperation();
    if (i == 0)
      innermostPointLoop = pointLoop;
  }

  // Add tile space loops.
  for (unsigned i = width; i < 2 * width; i++) {
    OpBuilder b(topLoop);
    // Loop bounds will be set later.
    AffineForOp tileSpaceLoop = b.create<AffineForOp>(loc, 0, 0);
    tileSpaceLoop.getBody()->getOperations().splice(
        tileSpaceLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
        topLoop);
    tiledLoops[2 * width - i - 1] = tileSpaceLoop;
    topLoop = tileSpaceLoop.getOperation();
  }

  // Move the loop body of the original nest to the new one.
  moveLoopBody(origLoops.back(), innermostPointLoop);
}
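
// As an illustration, for a band of width 2 the nest produced here, before
// any bounds are set (loops are created with the placeholder bounds
// '0 to 0'), looks like:
//   affine.for %i0 = 0 to 0 {          // tile-space loop for band loop 0
//     affine.for %i1 = 0 to 0 {        // tile-space loop for band loop 1
//       affine.for %i2 = 0 to 0 {      // intra-tile loop for band loop 0
//         affine.for %i3 = 0 to 0 {    // intra-tile loop for band loop 1
//           // original loop body, moved here by moveLoopBody().
//         }
//       }
//     }
//   }
// with tiledLoops = [%i0 loop, %i1 loop, %i2 loop, %i3 loop], outermost to
// innermost.
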
/// Checks whether a loop nest is hyper-rectangular or not.
LogicalResult checkIfHyperRectangular(MutableArrayRef<AffineForOp> input,
                                      AffineForOp rootAffineForOp,
                                      unsigned width) {
  FlatAffineConstraints cst;
  SmallVector<Operation *, 8> ops(input.begin(), input.end());
  getIndexSet(ops, &cst);
  if (!cst.isHyperRectangular(0, width)) {
    rootAffineForOp.emitError("tiled code generation unimplemented for the "
                              "non-hyperrectangular case");
    return failure();
  }
  return success();
}
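
// For example, a triangular nest such as the following (an illustrative
// sketch) is rejected, since the inner loop's bound depends on the outer
// loop's IV:
//   affine.for %i = 0 to 100 {
//     affine.for %j = 0 to affine_map<(d0) -> (d0)>(%i) {
//       ...
//     }
//   }
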
/// Set lower and upper bounds of intra-tile loops for parametric tiling.
// TODO: Handle non-constant lower bounds.
static void setIntraTileBoundsParametric(OpBuilder &b, AffineForOp origLoop,
                                         AffineForOp newInterTileLoop,
                                         AffineForOp newIntraTileLoop,
                                         Value tileSize) {
  // The lower bound of the intra-tile loop is represented by the affine map
  // (%i, %t0) -> ((%i - %origlb) * %t0 + %origlb). Similarly, its upper
  // bound is represented by the affine map
  // (%i, %t0) -> ((%i - %origlb) * %t0 + %t0 * %origLoopStep + %origlb),
  // where %i is the IV of the corresponding inter-tile loop, %t0 is the
  // corresponding tiling parameter, %origlb is the lower bound, and
  // %origLoopStep is the step of the original loop.
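  // As a concrete instance (matching the tile_loop_with_non_zero_lb test
  // added in this commit): for an original loop 'affine.for %i = 8 to 256'
  // with tiling parameter %t0, the intra-tile lower bound becomes
  //   affine_map<(d0)[s0] -> ((d0 - 8) * s0 + 8)>
  // and the upper bound becomes the min map
  //   affine_map<(d0)[s0] -> ((d0 - 8) * s0 + s0 + 8, 256)>
  // both applied to the inter-tile loop IV and %t0.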
  assert(origLoop.hasConstantLowerBound() &&
         "expected input loops to have constant lower bound.");

  // Get lower bound of original loop as an affine expression.
  AffineExpr origLowerBoundExpr;
  origLowerBoundExpr =
      b.getAffineConstantExpr(origLoop.getConstantLowerBound());

  // Add dim operands from original lower/upper bound.
  SmallVector<Value, 4> lbOperands, ubOperands;
  AffineBound lb = origLoop.getLowerBound();
  AffineBound ub = origLoop.getUpperBound();
  lbOperands.reserve(lb.getNumOperands() + 2);
  ubOperands.reserve(ub.getNumOperands() + 2);
  AffineMap origLbMap = lb.getMap();
  AffineMap origUbMap = ub.getMap();
  for (unsigned j = 0, e = origLbMap.getNumDims(); j < e; ++j)
    lbOperands.push_back(lb.getOperand(j));
  for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
    ubOperands.push_back(ub.getOperand(j));

  // Add a new dim operand in lb/ubOperands corresponding to the origLoop
  // IV.
  lbOperands.push_back(newInterTileLoop.getInductionVar());
  ubOperands.push_back(newInterTileLoop.getInductionVar());

  // Get loop IV as an affine expression for lower/upper bound. Size of
  // lb/ubOperands is guaranteed to be at least one.
  AffineExpr lbLoopIvExpr = b.getAffineDimExpr(lbOperands.size() - 1);
  AffineExpr ubLoopIvExpr = b.getAffineDimExpr(ubOperands.size() - 1);

  // Add symbol operands from original lower/upper bound.
  for (unsigned j = 0, e = origLbMap.getNumSymbols(); j < e; ++j)
    lbOperands.push_back(lb.getOperand(origLbMap.getNumDims() + j));
  for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
    ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));

  // Add a new symbol operand which is the tile size for this loop.
  lbOperands.push_back(tileSize);
  ubOperands.push_back(tileSize);

  SmallVector<AffineExpr, 4> lbBoundExprs;
  SmallVector<AffineExpr, 4> ubBoundExprs;
  lbBoundExprs.reserve(origLbMap.getNumResults());
  ubBoundExprs.reserve(origUbMap.getNumResults());

  // Get tiling parameter as an affine expression for lb/ub.
  AffineExpr lbTileParameter = b.getAffineSymbolExpr(origLbMap.getNumSymbols());
  AffineExpr ubTileParameter = b.getAffineSymbolExpr(origUbMap.getNumSymbols());

  // Insert lb as inter-tile ((loop IV - origlb) * tilingParameter) + origlb.
  lbBoundExprs.push_back(
      ((lbLoopIvExpr - origLowerBoundExpr) * lbTileParameter) +
      origLowerBoundExpr);

  // Get the origLoopStep as an affine expression.
  AffineExpr origLoopStep = b.getAffineConstantExpr(origLoop.getStep());

  // Insert ub as inter-tile ((loop IV - origlb) * tilingParameter) +
  // (tilingParameter * origLoopStep) + origlb.
  ubBoundExprs.push_back(
      ((ubLoopIvExpr - origLowerBoundExpr) * ubTileParameter) +
      (ubTileParameter * origLoopStep) + origLowerBoundExpr);

  ubBoundExprs.append(origUbMap.getResults().begin(),
                      origUbMap.getResults().end());

  AffineMap lbMap =
      AffineMap::get(origLbMap.getNumDims() + 1, origLbMap.getNumSymbols() + 1,
                     lbBoundExprs, b.getContext());
  newIntraTileLoop.setLowerBound(lbOperands, lbMap);

  AffineMap ubMap =
      AffineMap::get(origUbMap.getNumDims() + 1, origUbMap.getNumSymbols() + 1,
                     ubBoundExprs, b.getContext());
  newIntraTileLoop.setUpperBound(ubOperands, ubMap);

  // Original loop step must be preserved.
  newIntraTileLoop.setStep(origLoop.getStep());
}

/// Set lower and upper bounds of inter-tile loops for parametric tiling.
// TODO: Handle non-constant lower bounds.
static void setInterTileBoundsParametric(OpBuilder &b, AffineForOp origLoop,
                                         AffineForOp newLoop, Value tileSize) {
  OperandRange newLbOperands = origLoop.getLowerBoundOperands();

  // The lower bounds for inter-tile loops are the same as the corresponding
  // lower bounds of the original loops.
  newLoop.setLowerBound(newLbOperands, origLoop.getLowerBoundMap());

  // The new upper bound map for inter-tile loops, assuming constant lower
  // bounds, is now originalLowerBound + ceildiv((originalUpperBound -
  // originalLowerBound), tiling parameter), where the tiling parameter is
  // the respective tile size for that loop. E.g., if the original ubMap was
  // ()->(1024) and the lower bound was lb, the new map will be
  // ()[s0] -> (lb + (1024 - lb) ceildiv s0), where s0 is the tiling
  // parameter. Therefore a new symbol operand is inserted in the map and
  // the result expression is overwritten.

  assert(origLoop.hasConstantLowerBound() &&
         "expected input loops to have constant lower bound.");

  // Get lower bound of original loop as an affine expression.
  AffineExpr origLowerBoundExpr;
  origLowerBoundExpr =
      b.getAffineConstantExpr(origLoop.getConstantLowerBound());

  // Add dim operands from original upper bound.
  SmallVector<Value, 4> ubOperands;
  AffineBound ub = origLoop.getUpperBound();
  ubOperands.reserve(ub.getNumOperands() + 1);
  AffineMap origUbMap = ub.getMap();
  for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
    ubOperands.push_back(ub.getOperand(j));

  // Add symbol operands from original upper bound.
  for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
    ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));

  // Add a new symbol operand which is the tile size for this loop.
  ubOperands.push_back(tileSize);

  // Get tiling parameter as an affine expression.
  AffineExpr tileParameter = b.getAffineSymbolExpr(origUbMap.getNumSymbols());

  SmallVector<AffineExpr, 4> boundExprs;
  boundExprs.reserve(origUbMap.getNumResults());
  int64_t origUpperBound;
  AffineExpr origUpperBoundExpr;

  // If the upper bound for the original loop is constant, then the constant
  // can be obtained as an affine expression straight away.
  if (origLoop.hasConstantUpperBound()) {
    origUpperBound = origLoop.getConstantUpperBound();

    // Get original constant upper bound as an affine expression.
    origUpperBoundExpr = b.getAffineConstantExpr(origUpperBound);

    // Insert the bound as originalLowerBound + ceildiv((originalUpperBound -
    // originalLowerBound), tilingParameter).
    boundExprs.push_back(
        origLowerBoundExpr +
        (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(tileParameter));
  } else {
    // If the upper bound for the original loop is not constant, then two
    // cases are possible, although their handling is the same. 1.) The ubMap
    // has only one result expression. E.g.,
    //   affine.for %i = 5 to %ub
    //
    // A symbol operand is added which represents the tiling parameter. The
    // new loop bounds here will be like ()[s0, s1] -> ((s0 - 5) ceildiv s1 + 5),
    // where 's0' is the original upper bound and 's1' is the tiling
    // parameter. 2.) The ubMap has more than one result expression. E.g.,
    //   #map0 = affine_map<()[s0, s1] -> (s0, s1)>
    //   affine.for %i = 5 to min #map0()[%s0, %s1]
    //
    // A symbol operand is added which represents the tiling parameter. The
    // new loop bounds will be like ()[s0, s1, s2] -> ((s0 - 5) ceildiv s2 + 5,
    // (s1 - 5) ceildiv s2 + 5), where s2 is the tiling parameter.

    // Insert the bounds as originalLowerBound + ceildiv((originalUpperBound -
    // originalLowerBound), tilingParameter).
    for (AffineExpr origUpperBoundExpr : origUbMap.getResults())
      boundExprs.push_back(
          origLowerBoundExpr +
          (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(tileParameter));
  }

  AffineMap ubMap =
      AffineMap::get(origUbMap.getNumDims(), origUbMap.getNumSymbols() + 1,
                     boundExprs, b.getContext());
  newLoop.setUpperBound(ubOperands, ubMap);

  // Original loop step must be preserved.
  newLoop.setStep(origLoop.getStep());
}

/// Constructs and sets new loop bounds after tiling for the case of
/// hyper-rectangular index sets, where the bounds of one dimension do not
/// depend on other dimensions and tiling parameters are captured from SSA
/// values. Bounds of each dimension can thus be treated independently,
/// and deriving the new bounds is much simpler and faster than for the case of
/// tiling arbitrary polyhedral shapes.
static void constructParametricallyTiledIndexSetHyperRect(
    MutableArrayRef<AffineForOp> origLoops,
    MutableArrayRef<AffineForOp> newLoops, ArrayRef<Value> tileSizes) {
  assert(!origLoops.empty() && "expected at least one loop in band");
  assert(origLoops.size() == tileSizes.size() &&
         "expected tiling parameter for each loop in band.");

  OpBuilder b(origLoops[0].getOperation());
  unsigned width = origLoops.size();

  // Set bounds for tile space loops.
  for (unsigned i = 0; i < width; ++i) {
    setInterTileBoundsParametric(b, origLoops[i], newLoops[i], tileSizes[i]);
  }

  // Set bounds for intra-tile loops.
  for (unsigned i = 0; i < width; ++i) {
    setIntraTileBoundsParametric(b, origLoops[i], newLoops[i],
                                 newLoops[i + width], tileSizes[i]);
  }
}

/// Constructs and sets new loop bounds after tiling for the case of
/// hyper-rectangular index sets, where the bounds of one dimension do not
/// depend on other dimensions. Bounds of each dimension can thus be treated
/// independently, and deriving the new bounds is much simpler and faster
/// than for the case of tiling arbitrary polyhedral shapes.
static void
constructTiledIndexSetHyperRect(MutableArrayRef<AffineForOp> origLoops,
                                MutableArrayRef<AffineForOp> newLoops,
                                ArrayRef<unsigned> tileSizes) {
  assert(!origLoops.empty());
  assert(origLoops.size() == tileSizes.size());

  OpBuilder b(origLoops[0].getOperation());
  unsigned width = origLoops.size();

  // Bounds for tile space loops.
  for (unsigned i = 0; i < width; i++) {
    OperandRange newLbOperands = origLoops[i].getLowerBoundOperands();
    OperandRange newUbOperands = origLoops[i].getUpperBoundOperands();
    newLoops[i].setLowerBound(newLbOperands, origLoops[i].getLowerBoundMap());
    newLoops[i].setUpperBound(newUbOperands, origLoops[i].getUpperBoundMap());
    newLoops[i].setStep(tileSizes[i]);
  }
  // Bounds for intra-tile loops.
  for (unsigned i = 0; i < width; i++) {
    int64_t largestDiv = getLargestDivisorOfTripCount(origLoops[i]);
    Optional<uint64_t> mayBeConstantCount = getConstantTripCount(origLoops[i]);
    // The lower bound is just the tile-space loop.
    AffineMap lbMap = b.getDimIdentityMap();
    newLoops[width + i].setLowerBound(
        /*operands=*/newLoops[i].getInductionVar(), lbMap);

    // Set the upper bound.
    if (mayBeConstantCount && mayBeConstantCount.getValue() < tileSizes[i]) {
      // Trip count is less than the tile size: upper bound is lower bound +
      // trip count.
      AffineMap ubMap =
          b.getSingleDimShiftAffineMap(mayBeConstantCount.getValue());
      newLoops[width + i].setUpperBound(
          /*operands=*/newLoops[i].getInductionVar(), ubMap);
    } else if (largestDiv % tileSizes[i] != 0) {
      // Intra-tile loop ii goes from i to min(i + tileSize, ub_i).
      // Construct the upper bound map; the operands are the original operands
      // with 'i' (tile-space loop) appended to it. The new upper bound map is
      // the original one with an additional expression i + tileSize appended.

      // Add dim operands from original upper bound.
      SmallVector<Value, 4> ubOperands;
      AffineBound ub = origLoops[i].getUpperBound();
      ubOperands.reserve(ub.getNumOperands() + 1);
      AffineMap origUbMap = ub.getMap();
      for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
        ubOperands.push_back(ub.getOperand(j));

      // Add dim operand for new loop upper bound.
      ubOperands.push_back(newLoops[i].getInductionVar());

      // Add symbol operands from original upper bound.
      for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
        ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));

      SmallVector<AffineExpr, 4> boundExprs;
      boundExprs.reserve(1 + origUbMap.getNumResults());
      AffineExpr dim = b.getAffineDimExpr(origUbMap.getNumDims());
      // The new upper bound map is the original one with an additional
      // expression i + tileSize appended.
      boundExprs.push_back(dim + tileSizes[i]);
      boundExprs.append(origUbMap.getResults().begin(),
                        origUbMap.getResults().end());
      AffineMap ubMap =
          AffineMap::get(origUbMap.getNumDims() + 1, origUbMap.getNumSymbols(),
                         boundExprs, b.getContext());
      newLoops[width + i].setUpperBound(/*operands=*/ubOperands, ubMap);
    } else {
      // No need of the min expression.
      AffineExpr dim = b.getAffineDimExpr(0);
      AffineMap ubMap = AffineMap::get(1, 0, dim + tileSizes[i]);
      newLoops[width + i].setUpperBound(newLoops[i].getInductionVar(), ubMap);
    }
  }
}

/// Tiles the specified band of perfectly nested loops creating tile-space loops
/// and intra-tile loops. A band is a contiguous set of loops.
// TODO: handle non hyper-rectangular spaces.
LogicalResult
mlir::tilePerfectlyNested(MutableArrayRef<AffineForOp> input,
                          ArrayRef<unsigned> tileSizes,
                          SmallVectorImpl<AffineForOp> *tiledNest) {
  performPreTilingChecks(input, tileSizes);

  MutableArrayRef<AffineForOp> origLoops = input;
  AffineForOp rootAffineForOp = origLoops[0];
  // Note that width is at least one since the band isn't empty.
  unsigned width = input.size();
  SmallVector<AffineForOp, 6> tiledLoops(2 * width);

  // Construct a tiled loop nest without setting the loop bounds. Bounds are
  // set later.
  constructTiledLoopNest(origLoops, rootAffineForOp, width, tiledLoops);

  SmallVector<Value, 8> origLoopIVs;
  extractForInductionVars(input, &origLoopIVs);

  if (failed(checkIfHyperRectangular(input, rootAffineForOp, width)))
    return failure();

  // Set loop bounds for the tiled loop nest.
  constructTiledIndexSetHyperRect(origLoops, tiledLoops, tileSizes);

  // Replace original IVs with intra-tile loop IVs.
  for (unsigned i = 0; i < width; i++)
    origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar());

  // Erase the old loop nest.
  rootAffineForOp.erase();

  if (tiledNest)
    *tiledNest = std::move(tiledLoops);

  return success();
}

/// Tiles the specified band of perfectly nested loops creating tile-space
/// loops and intra-tile loops, using SSA values as tiling parameters. A band
/// is a contiguous set of loops.
// TODO: handle non hyper-rectangular spaces.
LogicalResult
mlir::tilePerfectlyNestedParametric(MutableArrayRef<AffineForOp> input,
                                    ArrayRef<Value> tileSizes,
                                    SmallVectorImpl<AffineForOp> *tiledNest) {
  performPreTilingChecks(input, tileSizes);

  MutableArrayRef<AffineForOp> origLoops = input;
  AffineForOp rootAffineForOp = origLoops[0];
  // Note that width is at least one since the band isn't empty.
  unsigned width = input.size();
  SmallVector<AffineForOp, 6> tiledLoops(2 * width);

  // Construct a tiled loop nest without setting the loop bounds. Bounds are
  // set later.
  constructTiledLoopNest(origLoops, rootAffineForOp, width, tiledLoops);

  SmallVector<Value, 8> origLoopIVs;
  extractForInductionVars(input, &origLoopIVs);

  if (failed(checkIfHyperRectangular(input, rootAffineForOp, width)))
    return failure();

  // Set loop bounds for the tiled loop nest.
  constructParametricallyTiledIndexSetHyperRect(origLoops, tiledLoops,
                                                tileSizes);

  // Replace original IVs with intra-tile loop IVs.
  for (unsigned i = 0; i < width; i++)
    origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar());

  // Erase the old loop nest.
  rootAffineForOp.erase();

  if (tiledNest)
    *tiledNest = std::move(tiledLoops);

  return success();
}

/// Collect perfectly nested loops starting from `rootForOps`. Loops are
/// perfectly nested if each loop is the first and only non-terminator operation
/// in the parent loop. Collect at most `maxLoops` loops and append them to
/// `forOps`.
template <typename T>
static void getPerfectlyNestedLoopsImpl(
    SmallVectorImpl<T> &forOps, T rootForOp,
@@ -452,6 +1001,20 @@ void mlir::getPerfectlyNestedLoops(SmallVectorImpl<scf::ForOp> &nestedLoops,
  getPerfectlyNestedLoopsImpl(nestedLoops, root);
}

/// Identify valid and profitable bands of loops to tile. This is currently just
/// a temporary placeholder to test the mechanics of tiled code generation.
/// Returns all maximal outermost perfect loop nests to tile.
void mlir::getTileableBands(FuncOp f,
                            std::vector<SmallVector<AffineForOp, 6>> *bands) {
  // Get maximal perfect nest of 'affine.for' insts starting from root
  // (inclusive).
  for (AffineForOp forOp : f.getOps<AffineForOp>()) {
    SmallVector<AffineForOp, 6> band;
    getPerfectlyNestedLoops(band, forOp);
    bands->push_back(band);
  }
}
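
// For instance (an illustrative sketch), a function with two top-level
// nests yields two bands, each listed from outermost to innermost:
//   affine.for %i ... { affine.for %j ... { ... } }   // band 1: {%i, %j}
//   affine.for %k ... { ... }                         // band 2: {%k}
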
/// Unrolls this loop completely.
LogicalResult mlir::loopUnrollFull(AffineForOp forOp) {
  Optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
@ -0,0 +1,275 @@
|
|||
// RUN: mlir-opt %s -split-input-file -test-affine-parametric-tile | FileCheck %s
|
||||
// Test cases to test the utility introduced to tile affine for loops using
|
||||
// SSA values as tiling parameters(tile sizes). The tile sizes are expected
|
||||
// to be passed as input arguments(before any other argument) to the function
|
||||
// enclosing the loop nest. Currently hyper-rectangular loop nests with constant
|
||||
// lower bounds are supported.
|
||||
|
||||
// -----
|
||||
|
||||
// CHECK-DAG: [[LBI:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0)>
|
||||
// CHECK-DAG: [[UBI0:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0 + s0, 256)>
|
||||
// CHECK-DAG: [[UBI1:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0 + s0, 512)>
|
||||
// CHECK-DAG: [[UBI2:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0 + s0, 1024)>
|
||||
// CHECK-DAG: [[UBO0:#map[0-9]+]] = affine_map<()[s0] -> (256 ceildiv s0)>
|
||||
// CHECK-DAG: [[UBO1:#map[0-9]+]] = affine_map<()[s0] -> (512 ceildiv s0)>
|
||||
// CHECK-DAG: [[UBO2:#map[0-9]+]] = affine_map<()[s0] -> (1024 ceildiv s0)>
|
||||
|
||||
// CHECK: func @loop_tiling_3d([[ARG0:%arg[0-9]+]]: index, [[ARG1:%arg[0-9]+]]: index, [[ARG2:%arg[0-9]+]]: index)
|
||||
// CHECK-NEXT: affine.for [[ARG3:%arg[0-9]+]] = 0 to [[UBO0]](){{.*}}[[ARG0]]
|
||||
// CHECK-NEXT: affine.for [[ARG4:%arg[0-9]+]] = 0 to [[UBO1]](){{.*}}[[ARG1]]
|
||||
// CHECK-NEXT: affine.for [[ARG5:%arg[0-9]+]] = 0 to [[UBO2]](){{.*}}[[ARG2]]
|
||||
// CHECK-NEXT: affine.for %[[I:.*]] = [[LBI]]{{.*}}[[ARG3]]{{.*}}[[ARG0]]{{.*}} to min [[UBI0]]{{.*}}[[ARG3]]{{.*}}[[ARG0]]
|
||||
// CHECK-NEXT: affine.for %[[J:.*]] = [[LBI]]{{.*}}[[ARG4]]{{.*}}[[ARG1]]{{.*}} to min [[UBI1]]{{.*}}[[ARG4]]{{.*}}[[ARG1]]
|
||||
// CHECK-NEXT: affine.for %[[K:.*]] = [[LBI]]{{.*}}[[ARG5]]{{.*}}[[ARG2]]{{.*}} to min [[UBI2]]{{.*}}[[ARG5]]{{.*}}[[ARG2]]
|
||||
// CHECK-NEXT: "test.foo"(%[[I]], %[[J]], %[[K]])
|
||||
func @loop_tiling_3d(%t0 : index, %t1 : index, %t2 : index) {
|
||||
affine.for %i = 0 to 256 {
|
||||
affine.for %j = 0 to 512 {
|
||||
affine.for %k = 0 to 1024 {
|
||||
"test.foo"(%i, %j, %k) : (index, index, index) -> ()
|
||||
}
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
// CHECK-DAG: [[LBI:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0)>
|
||||
// CHECK-DAG: [[UBI0:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0 + s0 * 4, 256)>
|
||||
// CHECK-DAG: [[UBI1:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0 + s0 * 3, 512)>
|
||||
// CHECK-DAG: [[UBI2:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0 + s0 * 2, 1024)>
|
||||
// CHECK-DAG: [[UBO0:#map[0-9]+]] = affine_map<()[s0] -> (256 ceildiv s0)>
|
||||
// CHECK-DAG: [[UBO1:#map[0-9]+]] = affine_map<()[s0] -> (512 ceildiv s0)>
|
||||
// CHECK-DAG: [[UBO2:#map[0-9]+]] = affine_map<()[s0] -> (1024 ceildiv s0)>
|
||||
|
||||
// CHECK: func @loop_tiling_non_unit_step([[ARG0:%arg[0-9]+]]: index, [[ARG1:%arg[0-9]+]]: index, [[ARG2:%arg[0-9]+]]: index)
|
||||
// CHECK-NEXT: affine.for [[ARG3:%arg[0-9]+]] = 0 to [[UBO0]](){{.*}}[[ARG0]]{{.*}}step 4
|
||||
// CHECK-NEXT: affine.for [[ARG4:%arg[0-9]+]] = 0 to [[UBO1]](){{.*}}[[ARG1]]{{.*}} step 3
|
||||
// CHECK-NEXT: affine.for [[ARG5:%arg[0-9]+]] = 0 to [[UBO2]](){{.*}}[[ARG2]]{{.*}} step 2
|
||||
// CHECK-NEXT: affine.for %[[I:.*]] = [[LBI]]{{.*}}[[ARG3]]{{.*}}[[ARG0]]{{.*}} to min [[UBI0]]{{.*}}[[ARG3]]{{.*}}[[ARG0]]{{.*}} step 4
|
||||
// CHECK-NEXT: affine.for %[[J:.*]] = [[LBI]]{{.*}}[[ARG4]]{{.*}}[[ARG1]]{{.*}} to min [[UBI1]]{{.*}}[[ARG4]]{{.*}}[[ARG1]]{{.*}} step 3
|
||||
// CHECK-NEXT: affine.for %[[K:.*]] = [[LBI]]{{.*}}[[ARG5]]{{.*}}[[ARG2]]{{.*}} to min [[UBI2]]{{.*}}[[ARG5]]{{.*}}[[ARG2]]{{.*}} step 2
|
||||
// CHECK-NEXT: "test.foo"(%[[I]], %[[J]], %[[K]])
|
||||
func @loop_tiling_non_unit_step(%t0: index, %t1: index, %t2: index){
|
||||
affine.for %i = 0 to 256 step 4 {
|
||||
affine.for %j = 0 to 512 step 3 {
|
||||
affine.for %k = 0 to 1024 step 2 {
|
||||
"test.foo"(%i, %j, %k) : (index, index, index) -> ()
|
||||
}
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
// CHECK-DAG: [[LBI0:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0)>
|
||||
// CHECK-DAG: [[UBI0:#map[0-9]+]] = affine_map<(d0)[s0, s1, s2] -> (d0 * s2 + s2, s0, 4096 floordiv s1)>
|
||||
// CHECK-DAG: [[UBO0:#map[0-9]+]] = affine_map<()[s0, s1, s2] -> (s0 ceildiv s2, (4096 floordiv s1) ceildiv s2)>
|
||||
|
||||
// CHECK: func @tile_loop_with_div_in_upper_bound([[ARG0:%arg[0-9]+]]: index, %{{.*}}: memref<?xi32>, %{{.*}}: index, %{{.*}}: index)
|
||||
#ub = affine_map<()[s0, s1] -> (s0, 4096 floordiv s1)>
|
||||
func @tile_loop_with_div_in_upper_bound(%t5 : index, %A : memref<? x i32>, %L : index, %U : index) {
|
||||
%c0 = constant 0 : index
|
||||
%M = dim %A, %c0 : memref<? x i32>
|
||||
affine.for %i = 0 to min #ub()[%M, %U] {
|
||||
addi %i, %i : index
|
||||
}
|
||||
// CHECK: affine.for [[ARG1:%arg[0-9]+]] = 0 to min [[UBO0]]()[%{{.*}}, %{{.*}}, [[ARG0]]]
|
||||
// CHECK-NEXT: affine.for %[[I:.*]] = [[LBI0]]([[ARG1]]){{.*}}[[ARG0]]{{.*}} to min [[UBI0]]({{.*}})[{{.*}}, {{.*}}, [[ARG0]]]
|
||||
// CHECK-NEXT: addi %[[I]], %[[I]]
|
||||
return
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
// CHECK-DAG: [[LBI0:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0)>
|
||||
// CHECK-DAG: [[UBI0:#map[0-9]+]] = affine_map<(d0)[s0, s1, s2] -> (d0 * s2 + s2 * 4, s0, 4096 floordiv s1)>
|
||||
// CHECK-DAG: [[UBO0:#map[0-9]+]] = affine_map<()[s0, s1, s2] -> (s0 ceildiv s2, (4096 floordiv s1) ceildiv s2)>
|
||||
|
||||
// CHECK: func @tile_loop_with_div_in_upper_bound_non_unit_step([[ARG0:%arg[0-9]+]]: index, %{{.*}}: memref<?xi32>, %{{.*}}: index, %{{.*}}: index)
|
||||
#ub = affine_map<()[s0, s1] -> (s0, 4096 floordiv s1)>
|
||||
func @tile_loop_with_div_in_upper_bound_non_unit_step(%t5 : index, %A : memref<? x i32>, %L : index, %U : index) {
|
||||
%c0 = constant 0 : index
|
||||
%M = dim %A, %c0 : memref<? x i32>
|
||||
affine.for %i = 0 to min #ub()[%M, %U] step 4 {
|
||||
addi %i, %i : index
|
||||
}
|
||||
// CHECK: affine.for [[ARG1:%arg[0-9]+]] = 0 to min [[UBO0]]()[%{{.*}}, %{{.*}}, [[ARG0]]]{{.*}} step 4{{.*}}
|
||||
// CHECK-NEXT: affine.for %[[I:.*]] = [[LBI0]]([[ARG1]]){{.*}}[[ARG0]]{{.*}} to min [[UBI0]]({{.*}})[{{.*}}, {{.*}}, [[ARG0]]]{{.*}} step 4{{.*}}
|
||||
// CHECK-NEXT: addi %[[I]], %[[I]]
|
||||
return
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
// CHECK-DAG: [[LBI0:#map[0-9]+]] = affine_map<(d0)[s0] -> ((d0 - 8) * s0 + 8)>
|
||||
// CHECK-DAG: [[UBI2:#map[0-9]+]] = affine_map<(d0)[s0, s1] -> ((d0 - 8) * s1 + s1 * 4 + 8, s0 + 16)>
|
||||
// CHECK-DAG: [[UBI1:#map[0-9]+]] = affine_map<(d0)[s0, s1] -> ((d0 - 8) * s1 + s1 + 8, s0 + 16)>
|
||||
// CHECK-DAG: [[UBI0:#map[0-9]+]] = affine_map<(d0)[s0] -> ((d0 - 8) * s0 + s0 + 8, 256)>
|
||||
// CHECK-DAG: [[UBO1:#map[0-9]+]] = affine_map<()[s0, s1] -> ((s0 + 8) ceildiv s1 + 8)>
|
||||
// CHECK-DAG: [[UBO0:#map[0-9]+]] = affine_map<()[s0] -> (248 ceildiv s0 + 8)>
|
||||
|
||||
// CHECK: func @tile_loop_with_non_zero_lb([[ARG0:%arg[0-9]+]]: index, [[ARG1:%arg[0-9]+]]: index, [[ARG2:%arg[0-9]+]]: index, %{{.*}}: index)
|
||||
// CHECK-NEXT: affine.for [[ARG3:%arg[0-9+]]] = 8 to [[UBO0]]{{.*}}[[ARG0]]{{.*}}
|
||||
// CHECK-NEXT: affine.for [[ARG4:%arg[0-9+]]] = 8 to [[UBO1]]{{.*}}[[ARG1]]{{.*}}
|
||||
// CHECK-NEXT: affine.for [[ARG5:%arg[0-9+]]] = 8 to [[UBO1]]{{.*}}[[ARG2]]{{.*}} step 4
|
||||
// CHECK-NEXT: affine.for %[[I:.*]] = [[LBI0]]([[ARG3]]){{.*}}[[ARG0]]{{.*}} to min [[UBI0]]([[ARG3]]){{.*}}[[ARG0]]{{.*}}
|
||||
// CHECK-NEXT: affine.for %[[J:.*]] = [[LBI0]]([[ARG4]]){{.*}}[[ARG1]]{{.*}} to min [[UBI1]]([[ARG4]]){{.*}}[[ARG1]]{{.*}}
|
||||
// CHECK-NEXT: affine.for %[[K:.*]] = [[LBI0]]([[ARG5]]){{.*}}[[ARG2]]{{.*}} to min [[UBI2]]([[ARG5]]){{.*}}[[ARG2]]{{.*}}step 4{{.*}}
|
||||
// CHECK-NEXT: "test.foo"(%[[I]], %[[J]], %[[K]]) : (index, index, index) -> ()
|
||||
#ubi = affine_map<()[s0] -> (s0 + 16)>
|
||||
func @tile_loop_with_non_zero_lb(%t0: index, %t1: index, %t2: index, %U: index){
|
||||
affine.for %i = 8 to 256 {
|
||||
affine.for %j = 8 to #ubi()[%U] {
|
||||
affine.for %k = 8 to #ubi()[%U] step 4 {
|
||||
"test.foo"(%i, %j, %k) : (index, index, index) -> ()
|
||||
}
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
// CHECK-DAG: [[LBI:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0)>
|
||||
// CHECK-DAG: [[UBI0:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0 + s0, 256)>
|
||||
// CHECK-DAG: [[UBI1:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0 + s0, 250)>
|
||||
// CHECK-DAG: [[UBO0:#map[0-9]+]] = affine_map<()[s0] -> (256 ceildiv s0)>
|
||||
// CHECK-DAG: [[UBO1:#map[0-9]+]] = affine_map<()[s0] -> (250 ceildiv s0)>
|
||||
|
||||
// CHECK: func @simple_matmul([[ARG0:%arg[0-9]+]]: index, [[ARG1:%arg[0-9]+]]: index, [[ARG2:%arg[0-9]+]]: index{{.*}})
|
||||
// CHECK-NEXT: affine.for [[ARG3:%arg[0-9]+]] = 0 to [[UBO0]](){{.*}}[[ARG0]]{{.*}}
|
||||
// CHECK-NEXT: affine.for [[ARG4:%arg[0-9]+]] = 0 to [[UBO0]](){{.*}}[[ARG1]]{{.*}}
|
||||
// CHECK-NEXT: affine.for [[ARG5:%arg[0-9]+]] = 0 to [[UBO1]](){{.*}}[[ARG2]]{{.*}}
|
||||
// CHECK-NEXT: affine.for %[[I:.*]] = [[LBI]]{{.*}}[[ARG3]]{{.*}}[[ARG0]]{{.*}} to min [[UBI0]]{{.*}}[[ARG3]]{{.*}}[[ARG0]]{{.*}}
|
||||
// CHECK-NEXT: affine.for %[[J:.*]] = [[LBI]]{{.*}}[[ARG4]]{{.*}}[[ARG1]]{{.*}} to min [[UBI0]]{{.*}}[[ARG4]]{{.*}}[[ARG1]]{{.*}}
|
||||
// CHECK-NEXT: affine.for %[[K:.*]] = [[LBI]]{{.*}}[[ARG5]]{{.*}}[[ARG2]]{{.*}} to min [[UBI1]]{{.*}}[[ARG5]]{{.*}}[[ARG2]]{{.*}}
|
||||
// CHECK-NEXT: affine.load %{{.*}}[%[[I]], %[[K]]]
|
||||
// CHECK-NEXT: affine.load %{{.*}}[%[[K]], %[[J]]]
|
||||
// CHECK-NEXT: affine.load %{{.*}}[%[[I]], %[[J]]]
|
||||
// CHECK-NEXT: mulf %{{.*}}
|
||||
// CHECK-NEXT: addf %{{.*}}
|
||||
// CHECK-NEXT: affine.store %{{.*}}[%[[I]], %[[J]]]
|
||||
func @simple_matmul(%t6 : index, %t7 : index, %t8 : index, %arg0: memref<256x256xvector<64xf32>>, %arg1: memref<256x256xvector<64xf32>>, %arg2: memref<256x256xvector<64xf32>>) -> memref<256x256xvector<64xf32>> {
|
||||
affine.for %i = 0 to 256 {
|
||||
affine.for %j = 0 to 256 {
|
||||
affine.for %k = 0 to 250 {
|
||||
%l = affine.load %arg0[%i, %k] : memref<256x256xvector<64xf32>>
|
||||
%r = affine.load %arg1[%k, %j] : memref<256x256xvector<64xf32>>
|
||||
%o = affine.load %arg2[%i, %j] : memref<256x256xvector<64xf32>>
|
||||
%m = mulf %l, %r : vector<64xf32>
|
||||
%a = addf %o, %m : vector<64xf32>
|
||||
affine.store %a, %arg2[%i, %j] : memref<256x256xvector<64xf32>>
|
||||
}
|
||||
}
|
||||
}
|
||||
return %arg2 : memref<256x256xvector<64xf32>>
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
// CHECK-DAG: [[LBI0:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0)>
|
||||
// CHECK-DAG: [[UBI0:#map[0-9]+]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s1, s0)>
|
||||
// CHECK-DAG: [[UBO0:#map[0-9]+]] = affine_map<()[s0, s1] -> (s0 ceildiv s1)>
|
||||
|
||||
// CHECK: func @tile_with_symbolic_loop_upper_bounds([[ARG0:%arg[0-9]+]]: index, [[ARG1:%arg[0-9]+]]: index{{.*}}){{.*}}
|
||||
// CHECK: affine.for [[ARG2:%arg[0-9]+]] = 0 to [[UBO0]](){{.*}}[[ARG0]]{{.*}}
|
||||
// CHECK-NEXT: affine.for [[ARG3:%arg[0-9]+]] = 0 to [[UBO0]](){{.*}}[[ARG1]]{{.*}}
|
||||
// CHECK-NEXT: affine.for %[[I0:.*]] = [[LBI0]]{{.*}}[[ARG2]]{{.*}}[[ARG0]]{{.*}} to min [[UBI0]]{{.*}}[[ARG2]]{{.*}}[[ARG0]]{{.*}}
|
||||
// CHECK-NEXT: affine.for %[[I1:.*]] = [[LBI0]]{{.*}}[[ARG3]]{{.*}}[[ARG1]]{{.*}} to min [[UBI0]]{{.*}}[[ARG3]]{{.*}}[[ARG1]]{{.*}}
|
||||
// CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%[[I0]], %[[I1]]] : memref<?x?xf32>
|
||||
// CHECK-NEXT: affine.for %[[I2:.*]] = 0 to %{{.*}} {
|
||||
// CHECK-NEXT: affine.load %{{.*}}%[[I0]], %[[I2]]
|
||||
// CHECK-NEXT: affine.load %{{.*}}%[[I2]], %[[I1]]
|
||||
// CHECK-NEXT: mulf
|
||||
// CHECK-NEXT: affine.load %{{.*}}%[[I0]], %[[I1]]
|
||||
// CHECK-NEXT: addf
|
||||
// CHECK-NEXT: affine.store %{{.*}}%[[I0]], %[[I1]]
|
||||
func @tile_with_symbolic_loop_upper_bounds(%t9 : index, %t10: index, %arg0: memref<?x?xf32>, %arg1: memref<?x?xf32>, %arg2: memref<?x?xf32>) {
|
||||
%cst = constant 0.000000e+00 : f32
|
||||
%c0 = constant 0 : index
|
||||
%0 = dim %arg0, %c0 : memref<?x?xf32>
|
||||
affine.for %i0 = 0 to %0 {
|
||||
affine.for %i1 = 0 to %0 {
|
||||
affine.store %cst, %arg2[%i0, %i1] : memref<?x?xf32>
|
||||
affine.for %i2 = 0 to %0 {
|
||||
%1 = affine.load %arg0[%i0, %i2] : memref<?x?xf32>
|
||||
%2 = affine.load %arg1[%i2, %i1] : memref<?x?xf32>
|
||||
%3 = mulf %1, %2 : f32
|
||||
%4 = affine.load %arg2[%i0, %i1] : memref<?x?xf32>
|
||||
%5 = addf %4, %3 : f32
|
||||
affine.store %5, %arg2[%i0, %i1] : memref<?x?xf32>
|
||||
}
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
// CHECK-DAG: [[LBI0:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0)>
// CHECK-DAG: [[UBI0:#map[0-9]+]] = affine_map<(d0)[s0, s1, s2] -> (d0 * s2 + s2, s0 + s1)>
// CHECK-DAG: [[UBO0:#map[0-9]+]] = affine_map<()[s0, s1, s2] -> ((s0 + s1) ceildiv s2)>

// CHECK: func @tile_with_loop_upper_bounds_in_two_symbols([[ARG0:%arg[0-9]+]]: index{{.*}}){{.*}}
func @tile_with_loop_upper_bounds_in_two_symbols(%t11 : index, %arg0: memref<?xf32>, %limit: index) {
  %c0 = constant 0 : index
  %dim0 = dim %arg0, %c0 : memref<?xf32>
  affine.for %i0 = 0 to affine_map<()[s0, s1] -> (s0 + s1)>()[%dim0, %limit] {
    %v0 = affine.load %arg0[%i0] : memref<?xf32>
  }
  // CHECK: affine.for [[ARG1:%arg[0-9]+]] = 0 to [[UBO0]]()[%{{.*}}, %{{.*}}, [[ARG0]]]
  // CHECK-NEXT: affine.for %[[I:.*]] = [[LBI0]]([[ARG1]]){{.*}}[[ARG0]]{{.*}} to min [[UBI0]]([[ARG1]])[{{.*}}, {{.*}}, [[ARG0]]]
  // CHECK-NEXT: affine.load %{{.*}}[%[[I]]]
  return
}

// -----

// CHECK-DAG: [[LBI0:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0)>
// CHECK-DAG: [[UBI1:#map[0-9]+]] = affine_map<(d0, d1)[s0, s1] -> (d1 * s1 + s1, d0 + s0 + 4)>
// CHECK-DAG: [[UBI0:#map[0-9]+]] = affine_map<(d0, d1)[s0, s1] -> (d1 * s1 + s1, d0 + s0 + 2)>
// CHECK-DAG: [[LBO0:#map[0-9]+]] = affine_map<() -> (0)>
// CHECK-DAG: [[UBO1:#map[0-9]+]] = affine_map<(d0)[s0, s1] -> ((d0 + s0 + 4) ceildiv s1)>
// CHECK-DAG: [[UBO0:#map[0-9]+]] = affine_map<(d0)[s0, s1] -> ((d0 + s0 + 2) ceildiv s1)>

// CHECK: func @tile_with_upper_bounds_in_dimensions_and_symbols([[ARG0:%arg[0-9]+]]: index, [[ARG1:%arg[0-9]+]]: index, [[ARG2:%arg[0-9]+]]: index, [[ARG3:%arg[0-9]+]]: index{{.*}}){{.*}}
// CHECK-NEXT: affine.for [[ARG4:%arg[0-9]+]] = 0 to [[UBO0]]({{.*}}){{.*}}[[ARG0]]
// CHECK-NEXT: affine.for [[ARG5:%arg[0-9]+]] = 0 to [[UBO1]]({{.*}}){{.*}}[[ARG1]]
// CHECK-NEXT: affine.for {{.*}} = [[LBI0]]([[ARG4]]){{.*}}[[ARG0]]{{.*}} to min [[UBI0]]({{.*}}, [[ARG4]]){{.*}}[[ARG0]]{{.*}}
// CHECK-NEXT: affine.for {{.*}} = [[LBI0]]([[ARG5]]){{.*}}[[ARG1]]{{.*}} to min [[UBI1]]({{.*}}, [[ARG5]]){{.*}}[[ARG1]]{{.*}}
func @tile_with_upper_bounds_in_dimensions_and_symbols(%t12 : index, %t13 : index, %M: index, %N: index, %K: index) {
  affine.for %i = 0 to affine_map<(d0)[s0] -> (d0 + s0 + 2)>(%M)[%K] {
    affine.for %j = 0 to affine_map<(d0)[s0] -> (d0 + s0 + 4)>(%N)[%K] {
      "test.foo" () : () -> ()
    }
  }
  return
}

// -----

// CHECK-DAG: [[LBI0:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0)>
// CHECK-DAG: [[UBI1:#map[0-9]+]] = affine_map<(d0, d1)[s0, s1] -> (d1 * s1 + s1 * 4, d0 + s0 + 4)>
// CHECK-DAG: [[UBI0:#map[0-9]+]] = affine_map<(d0, d1)[s0, s1] -> (d1 * s1 + s1 * 2, d0 + s0 + 2)>
// CHECK-DAG: [[LBO0:#map[0-9]+]] = affine_map<() -> (0)>
// CHECK-DAG: [[UBO1:#map[0-9]+]] = affine_map<(d0)[s0, s1] -> ((d0 + s0 + 4) ceildiv s1)>
// CHECK-DAG: [[UBO0:#map[0-9]+]] = affine_map<(d0)[s0, s1] -> ((d0 + s0 + 2) ceildiv s1)>

// CHECK: func @tile_with_upper_bounds_in_dimensions_and_symbols_non_unit_steps
// CHECK-SAME: ([[ARG0:%arg[0-9]+]]: index, [[ARG1:%arg[0-9]+]]: index, [[ARG2:%arg[0-9]+]]: index, [[ARG3:%arg[0-9]+]]: index{{.*}}){{.*}}
// CHECK-NEXT: affine.for [[ARG4:%arg[0-9]+]] = 0 to [[UBO0]]({{.*}}){{.*}}[[ARG0]]{{.*}} step 2{{.*}}
// CHECK-NEXT: affine.for [[ARG5:%arg[0-9]+]] = 0 to [[UBO1]]({{.*}}){{.*}}[[ARG1]]{{.*}} step 4{{.*}}
// CHECK-NEXT: affine.for {{.*}} = [[LBI0]]([[ARG4]]){{.*}}[[ARG0]]{{.*}} to min [[UBI0]]({{.*}}, [[ARG4]]){{.*}}[[ARG0]]{{.*}} step 2{{.*}}
// CHECK-NEXT: affine.for {{.*}} = [[LBI0]]([[ARG5]]){{.*}}[[ARG1]]{{.*}} to min [[UBI1]]({{.*}}, [[ARG5]]){{.*}}[[ARG1]]{{.*}} step 4{{.*}}
func @tile_with_upper_bounds_in_dimensions_and_symbols_non_unit_steps(%t12 : index, %t13 : index, %M: index, %N : index, %K: index) {
  affine.for %i = 0 to affine_map<(d0)[s0] -> (d0 + s0 + 2)>(%M)[%K] step 2 {
    affine.for %j = 0 to affine_map<(d0)[s0] -> (d0 + s0 + 4)>(%N)[%K] step 4 {
      "test.foo" () : () -> ()
    }
  }
  return
}
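Note how the original steps carry into the maps above: the tile-space loops keep the source steps (step 2 and step 4), and the intra-tile upper bounds grow by tile size times step ([[UBI0]] uses d1 * s1 + s1 * 2, [[UBI1]] uses d1 * s1 + s1 * 4), so each visited tile covers tileSize * step iterations of the original space. As an illustrative check with tile size s1 = 10 and step 2: the tile-space IV takes values 0, 2, 4, ..., and d1 = 2 yields the intra-tile range [2 * 10, 2 * 10 + 10 * 2) = [20, 40), which the intra-tile loop walks as 20, 22, ..., 38; the next tile (d1 = 4) starts at 40, keeping the tiles contiguous.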

@ -1,6 +1,7 @@
# Exclude tests from libMLIR.so
add_mlir_library(MLIRTestTransforms
  TestAllReduceLowering.cpp
  TestAffineLoopParametricTiling.cpp
  TestBufferPlacement.cpp
  TestExpandTanh.cpp
  TestCallGraph.cpp

@ -0,0 +1,90 @@
//===- TestAffineLoopParametricTiling.cpp - Parametric Affine tiling pass ===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a test pass to test parametric tiling of perfectly
// nested affine for loops.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/Passes.h"
#include "mlir/Transforms/LoopUtils.h"

using namespace mlir;

#define DEBUG_TYPE "test-affine-parametric-tile"

namespace {
struct TestAffineLoopParametricTiling
    : public PassWrapper<TestAffineLoopParametricTiling, FunctionPass> {
  void runOnFunction() override;
};
} // end anonymous namespace

/// Checks whether the function enclosing the loop nest has any arguments
/// passed to it that can be used as tiling parameters. Asserts that at least
/// 'n' arguments are passed, where 'n' is the number of loops in the band.
static void checkIfTilingParametersExist(ArrayRef<AffineForOp> band) {
  assert(!band.empty() && "no loops in input band");
  AffineForOp topLoop = band[0];

  if (FuncOp funcOp = dyn_cast<FuncOp>(topLoop.getParentOp()))
    assert(funcOp.getNumArguments() >= band.size() && "Too few tile sizes");
}

/// Captures tiling parameters, which are expected to be passed as arguments
/// to the function enclosing the loop nest. Also asserts that the captured
/// parameters are of index type. This approach is temporary for testing
/// purposes.
static void getTilingParameters(ArrayRef<AffineForOp> band,
                                SmallVectorImpl<Value> &tilingParameters) {
  AffineForOp topLoop = band[0];
  Region *funcOpRegion = topLoop.getParentRegion();
  unsigned nestDepth = band.size();

  // The first 'nestDepth' function arguments serve as the tile sizes, one per
  // loop in the band.
  for (BlockArgument blockArgument :
       funcOpRegion->getArguments().take_front(nestDepth)) {
    if (blockArgument.getArgNumber() < nestDepth) {
      assert(blockArgument.getType().isIndex() &&
             "expected tiling parameters to be of index type.");
      tilingParameters.push_back(blockArgument);
    }
  }
}
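For reference, a hypothetical input this helper is written against (names illustrative): for a 2-d band, the enclosing function carries the tile sizes as its leading index arguments, which the loop above captures:

// func @kernel(%ts0 : index, %ts1 : index, %A : memref<?x?xf32>) {
//   affine.for %i = ... {      // 2-d band, tiled with %ts0 and %ts1
//     affine.for %j = ... { ... }
//   }
// }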

void TestAffineLoopParametricTiling::runOnFunction() {
  // Bands of loops to tile.
  std::vector<SmallVector<AffineForOp, 6>> bands;
  getTileableBands(getFunction(), &bands);

  // Tile each band.
  for (SmallVectorImpl<AffineForOp> &band : bands) {
    SmallVector<AffineForOp, 6> tiledNest;
    SmallVector<Value, 6> tilingParameters;

    // Check that tiling parameters are present.
    checkIfTilingParametersExist(band);

    // Capture the tiling parameters from the arguments to the function
    // enclosing this loop nest.
    getTilingParameters(band, tilingParameters);

    if (failed(
            tilePerfectlyNestedParametric(band, tilingParameters, &tiledNest)))
      return signalPassFailure();
  }
}

namespace mlir {
void registerTestAffineLoopParametricTilingPass() {
  PassRegistration<TestAffineLoopParametricTiling>(
      "test-affine-parametric-tile",
      "Tile affine loops using SSA values as tile sizes");
}
} // namespace mlir
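Once registered, the pass is driven from test files in the usual way; a sketch of the RUN line such a test would carry (the -split-input-file flag is an assumption, inferred from the // ----- separators in the tests above):

// RUN: mlir-opt %s -split-input-file -test-affine-parametric-tile | FileCheck %s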

@ -41,6 +41,7 @@ void registerSimpleParametricTilingPass();
void registerSliceAnalysisTestPass();
void registerSymbolTestPasses();
void registerTestAffineDataCopyPass();
void registerTestAffineLoopParametricTilingPass();
void registerTestAffineLoopUnswitchingPass();
void registerTestAllReduceLoweringPass();
void registerTestBufferPlacementPreparationPass();
@ -104,6 +105,7 @@ void registerTestPasses() {
#if MLIR_ROCM_CONVERSIONS_ENABLED
  registerTestConvertGPUKernelToHsacoPass();
#endif
  registerTestAffineLoopParametricTilingPass();
  registerTestBufferPlacementPreparationPass();
  registerTestDominancePass();
  registerTestFunc();