[mlir][linalg] Use getUpperBoundForIndex in hoisting (NFC).

Replace the custom upper bound computation in hoist padding with the new getUpperBoundForIndex method.

Depends On D113546

Reviewed By: nicolasvasilache

Differential Revision: https://reviews.llvm.org/D113547
Tobias Gysi 2021-11-10 15:55:16 +00:00
parent b326eb64fd
commit 4e2c978f44
1 changed file with 36 additions and 197 deletions
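In short, the change drops the constraint-set-based bound computation in favor of a direct call to the new helper. A minimal sketch of the resulting pattern, assembled from the diff below (the helper's signature is the one introduced in D113546 and is assumed here; `b` is an ImplicitLocOpBuilder and `forOp` one of the packing loops):

// Compute a loop-independent upper bound for the upper bound of `forOp`.
AffineMap boundMap;
SmallVector<Value> boundOperands;
getUpperBoundForIndex(forOp.upperBound(), boundMap, boundOperands);
// Materialize the bound; the affine.min folds away if the map is constant.
Value ubVal = b.createOrFold<AffineMinOp>(boundMap, boundOperands);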


@@ -11,9 +11,7 @@
//===----------------------------------------------------------------------===//
#include "mlir/Dialect/Linalg/Transforms/HoistPadding.h"
#include "mlir/Analysis/AffineStructures.h"
#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
#include "mlir/Dialect/Affine/Utils.h"
#include "mlir/Dialect/Linalg/IR/LinalgOps.h"
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
@@ -26,7 +24,6 @@
#include "mlir/IR/AsmState.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/Dominance.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "mlir/Transforms/LoopUtils.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Debug.h"
@@ -58,9 +55,8 @@ struct HoistingAnalysis {
bool isValid() { return valid; }
/// Footprint of the packedTensor, computed from the packingLoops and
/// `backwardSlice`.
FailureOr<SmallVector<Value>> getPackedTensorSizes(ImplicitLocOpBuilder &b);
/// Footprint of the packedTensor, computed from the packingLoops.
SmallVector<Value> getPackedTensorSizes(ImplicitLocOpBuilder &b);
/// The outermost loop, determined by `nLevels`, above which `padTensorOp` will
/// be hoisted.
@@ -229,21 +225,20 @@ HoistingAnalysis::HoistingAnalysis(PadTensorOp padTensorOp, int numLoops) {
valid = true;
}
/// Add all index operands of `operation` to `indexEdges`. An index operand is
/// an operand of type index.
static void addIndexOperandsToIndexEdges(Operation *operation,
SetVector<Value> &indexEdges) {
for (Value operand : operation->getOperands())
if (operand.getType().isIndex())
indexEdges.insert(operand);
}
SmallVector<scf::ForOp>
HoistingAnalysis::getIndexingLoops(PadTensorOp padTensorOp,
tensor::ExtractSliceOp sliceOp) {
// Set of all values used for index computation.
SetVector<Value> indexEdges;
// Add all index operands of `operation` to `indexEdges`. An index operand is
// an operand of type index.
auto addIndexOperandsToIndexEdges = [&](Operation *operation) {
for (Value operand : operation->getOperands())
if (operand.getType().isIndex())
indexEdges.insert(operand);
};
// Starting from `padTensorOp` and `sliceOp` walk the use-def edges of index
// type in `backwardSlice`. Add the index operands of an operation to
// `indexEdges` if one of its results is an index edge found so far and store
@@ -268,7 +263,7 @@ HoistingAnalysis::getIndexingLoops(PadTensorOp padTensorOp,
// Add the index operands of `padTensorOp` and `sliceOp` to start the
// exploration of the index computation.
if (op == padTensorOp || op == sliceOp) {
addIndexOperandsToIndexEdges(op, indexEdges);
addIndexOperandsToIndexEdges(op);
continue;
}
// Add the index operands of the loop if its induction variable is
@@ -276,7 +271,7 @@ HoistingAnalysis::getIndexingLoops(PadTensorOp padTensorOp,
// `indexingLoops`
if (auto forOp = dyn_cast<scf::ForOp>(op)) {
if (indexEdges.contains(forOp.getInductionVar())) {
addIndexOperandsToIndexEdges(op, indexEdges);
addIndexOperandsToIndexEdges(op);
indexingLoops.push_back(forOp);
continue;
}
@@ -285,199 +280,46 @@ HoistingAnalysis::getIndexingLoops(PadTensorOp padTensorOp,
// used for index computation.
if (llvm::any_of(op->getResults(),
[&](Value result) { return indexEdges.contains(result); }))
addIndexOperandsToIndexEdges(op, indexEdges);
addIndexOperandsToIndexEdges(op);
}
return indexingLoops;
}
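// Illustration (not part of the commit; IR names and maps invented). Given:
// ```
// scf.for %i = %c0 to %c64 step %c4 {
//   scf.for %j = %c0 to %c32 step %c8 {
//     %off = affine.apply #map(%i)
//     %s = tensor.extract_slice %t[%off, 0] [4, 8] [1, 1] ...
//     %p = linalg.pad_tensor %s ...
//   }
// }
// ```
// the walk seeds `indexEdges` with the index operands of %p and the
// extract_slice (here %off), reaches %i through the affine.apply, and thus
// records the loop over %i in `indexingLoops`; the loop over %j, whose
// induction variable never feeds the index computation, is skipped.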
static bool isDefinedOutsideOrConstant(scf::ForOp outer, Value v) {
return outer.isDefinedOutsideOfLoop(v) || v.getDefiningOp<ConstantOp>();
}
/// For each loop in `loops`, determine the ops involved in the construction of
/// its upper bound, up to the `outerLimit` loop, and fold them as new
/// inequalities in the constraint set.
/// This is achieved by computing the backwardSlice of the loop's upper bound
/// and iteratively folding each op in reverse topological order to guarantee
/// use-def ordering.
/// As operations are folded in, their result is projected out of the
/// constraint set.
/// The following operations are supported:
/// - scf::ForOp are simply skipped.
/// - AffineApplyOp are composed to replace the result by an equality.
/// - AffineMinOp are composed by adding each entry as an upper bound.
/// Additionally, the following terminal operations are handled:
/// - DimOp and ConstantOp are skipped.
/// If any other operation is met, return failure.
// TODO: extend on a per-need basis.
static LogicalResult
foldUpperBoundsIntoConstraintsSet(FlatAffineValueConstraints &constraints,
scf::ForOp outerLimit,
ArrayRef<scf::ForOp> loops) {
SetVector<Value> toProjectOut;
for (scf::ForOp loop : loops) {
auto ub = loop.upperBound();
// Set of all values used for index computation.
SetVector<Value> indexEdges;
indexEdges.insert(ub);
// Compute the backward slice `indexSlice` containing the index computation
// performed to obtain the upper bound `ub`. Starting from `ub` add the
// index operands of an operation to `indexEdges` if one of its results is
// an index edge. Otherwise, stop the slice computation. For a loop, check
// if its induction variable is an index edge.
//
// Example:
// ```
// %c0 = arith.constant 0
// scf.for %i = %c0 to ...
// scf.for %j = %c0 to ...
// %ub = affine.min #map(%i)
// scf.for %k = %c0 to %ub
// ```
// After computing the backward slice we obtain:
// indexEdges = [%ub, %i, %c0]
// indexSlice = [arith.constant 0, scf.for %i, affine.min #map(%i)]
SetVector<Operation *> indexSlice;
getBackwardSlice(ub, &indexSlice, [&](Operation *op) {
// Continue only along the index operands of the ForOp.
if (auto forOp = dyn_cast<scf::ForOp>(op)) {
// Consider only loops part of the enclosing loops.
if (!outerLimit->isAncestor(op))
return false;
if (!indexEdges.contains(forOp.getInductionVar()))
return false;
addIndexOperandsToIndexEdges(op, indexEdges);
return true;
}
// All supported index operations have one result.
assert(op->getNumResults() == 1 &&
"expect operations to have one result");
if (!indexEdges.contains(op->getResult(0)))
return false;
addIndexOperandsToIndexEdges(op, indexEdges);
return true;
});
indexSlice.insert(ub.getDefiningOp());
// Iterate over all ops in the slice and compose them in the constraints.
for (Operation *op : llvm::reverse(indexSlice)) {
// All ForOps have previously been added to the constraints and ConstantOp
// and DimOp are terminals of the index computation.
if (isa<scf::ForOp, arith::ConstantOp, tensor::DimOp>(op))
continue;
// Check all index computation operations are supported.
if (!isa<AffineApplyOp, AffineMinOp>(op))
return failure();
// Ensure there is an id.
auto ensureIdFailed = [&](Value v) {
if (constraints.containsId(v)) {
unsigned pos;
constraints.findId(v, &pos);
return pos >= constraints.getNumDimIds();
}
constraints.appendDimId(v);
return false;
};
// Ensure all ids exist and add results for later projection.
if (llvm::any_of(op->getResults(), ensureIdFailed) ||
llvm::any_of(op->getOperands(), ensureIdFailed))
return failure();
// All supported ops have 1 result.
// TODO: extend when needed.
assert(op->getNumResults() == 1 &&
"expect operations to have one result");
toProjectOut.insert(op->getResult(0));
// Compose supported ops.
if (auto affineApplyOp = dyn_cast<AffineApplyOp>(op)) {
AffineValueMap avm(affineApplyOp.getAffineMap(),
affineApplyOp.getOperands(),
affineApplyOp.getResult());
if (failed(constraints.composeMap(&avm)))
return failure();
continue;
}
auto affineMinOp = cast<AffineMinOp>(op);
unsigned pos;
bool foundMinOp = constraints.findId(affineMinOp.getResult(), &pos);
(void)foundMinOp;
assert(foundMinOp);
AffineMap alignedMap = constraints.computeAlignedMap(
affineMinOp.getAffineMap(), affineMinOp.getOperands());
if (failed(
constraints.addBound(FlatAffineConstraints::UB, pos, alignedMap)))
return failure();
}
}
for (Value v : toProjectOut)
constraints.projectOut(v);
return success();
}
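// Sketch of the removed mechanism (not part of the commit; map invented): for
// `%ub = affine.min affine_map<(d0) -> (-d0 + 256, 32)>(%i)`, composing the
// AffineMinOp adds `%ub <= -%i + 256` and `%ub <= 32` as upper bounds on the
// id of %ub; once the whole slice is folded in, %ub is projected out, leaving
// bounds expressed only over the loop ivs and values defined above the loops.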
// Footprint of the packedTensor, computed from the packingLoops and
// `backwardSlice`.
FailureOr<SmallVector<Value>>
SmallVector<Value>
HoistingAnalysis::getPackedTensorSizes(ImplicitLocOpBuilder &b) {
// Create the base affine constraints for the packedLoops.
auto constraints = FlatAffineValueConstraints::getHyperrectangular(
llvm::to_vector<8>(llvm::map_range(
packingLoops, [](scf::ForOp op) { return op.getInductionVar(); })),
llvm::to_vector<8>(llvm::map_range(
packingLoops, [](scf::ForOp op) { return op.lowerBound(); })),
llvm::to_vector<8>(llvm::map_range(
packingLoops, [](scf::ForOp op) { return op.upperBound(); })));
// Iteratively try to fold the upper bounds into the constraints set.
if (failed(foldUpperBoundsIntoConstraintsSet(
constraints, outermostEnclosingForOp, packingLoops)))
return failure();
int nPackedLoops = packingLoops.size();
SmallVector<AffineMap> lbs(nPackedLoops), ubs(nPackedLoops);
// Compute the bounds of the first `nPackedLoops` positions, assuming the
// others are fixed.
constraints.getSliceBounds(/*pos=*/0, /*num=*/nPackedLoops,
outermostEnclosingForOp->getContext(), &lbs, &ubs);
SmallVector<Value> allValues;
constraints.getAllValues(&allValues);
SmallVector<Value> allNonLoopValues(allValues.begin() + nPackedLoops,
allValues.end());
// For each packing loop, compute the extent as (ub - lb).ceilDiv(step).
// Insertion point: just before the outermost loop that we hoist above.
assert(nPackedLoops == static_cast<int64_t>(lbs.size()) &&
"expected matching lb sizes");
assert(nPackedLoops == static_cast<int64_t>(ubs.size()) &&
"expected matching ub sizes");
SmallVector<Value> dynamicTensorSizes;
for (auto it : llvm::zip(packingLoops, lbs, ubs)) {
scf::ForOp loop = std::get<0>(it);
AffineMap lbMap = std::get<1>(it);
AffineMap ubMap = std::get<2>(it);
SmallVector<Value> lbOperands(allNonLoopValues);
canonicalizeMapAndOperands(&lbMap, &lbOperands);
Value lbVal = b.createOrFold<AffineMaxOp>(lbMap, lbOperands);
SmallVector<Value> ubOperands(allNonLoopValues);
canonicalizeMapAndOperands(&ubMap, &ubOperands);
Value ubVal = b.createOrFold<AffineMinOp>(ubMap, ubOperands);
// Upper bound the packing loop lengths to size the packed tensor. Taking
// upper bounds can make the sizes of the packed tensor independent of the
// enclosing loops. This independence is a prerequisite for reusing the same
// buffer for all enclosing loop iterations and hoisting its allocation out of
// the enclosing loops.
for (auto forOp : packingLoops) {
// Compute an upper bound `ubVal` for the upper bound of `forOp`.
AffineMap boundMap;
SmallVector<Value> boundOperands;
getUpperBoundForIndex(forOp.upperBound(), boundMap, boundOperands);
Value ubVal = b.createOrFold<AffineMinOp>(boundMap, boundOperands);
// Compute the maximal packing loop length as (ub - lb).ceilDiv(step) and
// store the result to `dynamicTensorSizes`.
// TODO: instead of using the lower bound of `forOp` directly, implement a
// lower bound computation similar to the upper bound computation.
AffineExpr lb, ub, step;
bindDims(b.getContext(), lb, ub);
bindSymbols(b.getContext(), step);
Value res = b.createOrFold<AffineApplyOp>(
(ub - lb).ceilDiv(step),
ValueRange{lbVal, ubVal, cast<scf::ForOp>(loop).step()});
ValueRange{forOp.lowerBound(), ubVal, cast<scf::ForOp>(forOp).step()});
dynamicTensorSizes.push_back(res);
}
return dynamicTensorSizes;
}
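// Worked example (not part of the commit; values invented): for a packing loop
// ```
// %ub = affine.min affine_map<(d0) -> (15, -d0 + 24)>(%i)
// scf.for %k = %c0 to %ub step %c2
// ```
// getUpperBoundForIndex can bound %ub by the loop-independent constant 15, so
// the packed size evaluates to (15 - 0).ceilDiv(2) = 8 on every iteration of
// %i, which is what makes a single hoisted allocation suffice.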
static bool isDefinedOutsideOrConstant(scf::ForOp outer, Value v) {
return outer.isDefinedOutsideOfLoop(v) || v.getDefiningOp<ConstantOp>();
}
/// Return the current iteration number in the loop (iv - lb).ceilDiv(step).
/// The returned Value is guaranteed not to depend on any loop comprised in
/// [`outer`, `forOp`].
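/// For example (values assumed for illustration): with iv = 8, lb = 2, and
/// step = 3, the loop visits iv in {2, 5, 8}, and (8 - 2).ceilDiv(3) = 2
/// identifies iv = 8 as the third iteration, counting from zero.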
@@ -512,10 +354,7 @@ FailureOr<Value> mlir::linalg::hoistPaddingOnTensors(PadTensorOp opToHoist,
scf::ForOp outer = analysis.outermostEnclosingForOp;
ImplicitLocOpBuilder b(outer->getLoc(), outer);
auto maybeDynamicTensorSizes = analysis.getPackedTensorSizes(b);
if (failed(maybeDynamicTensorSizes))
return failure();
SmallVector<Value> dynamicTensorSizes = *maybeDynamicTensorSizes;
SmallVector<Value> dynamicTensorSizes = analysis.getPackedTensorSizes(b);
// Update actual number of loops, which may be smaller.
int nPackedLoops = analysis.packingLoops.size();