[mlir] more aggressive folding in tiling/fusion transformations

Combine the recently added utilities for folded-by-construction affine
operations with the attribute-based Range to enable more folding. This
decreases the amount of emitted code but has little effect on tests
precisely because the tests are not checking for the spurious constants.
The difference in the shape of affine maps comes from the internals of
affine folding.

Depends on D129633

Reviewed By: nicolasvasilache

Differential Revision: https://reviews.llvm.org/D130167
This commit is contained in:
Alex Zinenko 2022-07-13 14:40:59 +00:00
parent 70e99f387a
commit e99fae8997
21 changed files with 353 additions and 322 deletions

View File

@ -392,6 +392,13 @@ OpFoldResult makeComposedFoldedAffineApply(RewriterBase &b, Location loc,
OpFoldResult makeComposedFoldedAffineApply(RewriterBase &b, Location loc,
AffineExpr expr,
ArrayRef<OpFoldResult> operands);
/// Variant of `makeComposedFoldedAffineApply` suitable for multi-result maps.
/// Note that this may create as many affine.apply operations as the map has
/// results given that affine.apply must be single-result.
SmallVector<OpFoldResult>
makeComposedFoldedMultiResultAffineApply(RewriterBase &b, Location loc,
AffineMap map,
ArrayRef<OpFoldResult> operands);
/// Returns an AffineMinOp obtained by composing `map` and `operands` with
/// AffineApplyOps supplying those operands.
@ -405,16 +412,17 @@ OpFoldResult makeComposedFoldedAffineMin(RewriterBase &b, Location loc,
AffineMap map,
ArrayRef<OpFoldResult> operands);
/// Constructs an AffineMaxOp that computes a maximum across the results of
/// applying `map` to `operands`, then immediately attempts to fold it. If
/// folding results in a constant value, erases all created ops.
OpFoldResult makeComposedFoldedAffineMax(RewriterBase &b, Location loc,
AffineMap map,
ArrayRef<OpFoldResult> operands);
/// Returns the values obtained by applying `map` to the list of values.
SmallVector<Value, 4> applyMapToValues(OpBuilder &b, Location loc,
AffineMap map, ValueRange values);
/// Returns the values obtained by applying `map` to the list of values, which
/// may be known constants.
SmallVector<OpFoldResult> applyMapToValues(RewriterBase &b, Location loc,
AffineMap map,
ArrayRef<OpFoldResult> values);
/// Given an affine map `map` and its input `operands`, this method composes
/// into `map`, maps of AffineApplyOps whose results are the values in
/// `operands`, iteratively until no more of `operands` are the result of an

View File

@ -1133,7 +1133,7 @@ def LinalgStructuredInterface : OpInterface<"LinalgOp"> {
let extraClassDeclaration = [{
/// Return the flat list of all operand dimension sizes in the order they
/// appear in the operands.
SmallVector<Value, 4> createFlatListOfOperandDims(OpBuilder &, Location);
SmallVector<OpFoldResult> createFlatListOfOperandDims(OpBuilder &, Location);
/// Return the flat list of all operands' static dimension sizes in the
/// order they appear in the operands. All operand dimension sizes have to

View File

@ -410,7 +410,8 @@ using TileSizeComputationFunction =
using LoopIndexToRangeIndexMap = DenseMap<int, int>;
std::tuple<SmallVector<Range, 4>, LoopIndexToRangeIndexMap>
makeTiledLoopRanges(RewriterBase &b, Location loc, AffineMap map,
ValueRange allShapeSizes, ValueRange allTileSizes);
ArrayRef<OpFoldResult> allShapeSizes,
ArrayRef<OpFoldResult> allTileSizes);
/// A description of a multi-size tiling comprising tile sizes and numbers of
/// tiles, expressed as Values which may or may not be constant. Multi-size

View File

@ -48,6 +48,8 @@ bool isPermutation(ArrayRef<int64_t> permutation);
/// Helper function that creates a memref::DimOp or tensor::DimOp depending on
/// the type of `source`.
Value createOrFoldDimOp(OpBuilder &b, Location loc, Value source, int64_t dim);
OpFoldResult createFoldedDimOp(OpBuilder &b, Location loc, Value source,
int64_t dim);
/// Given an operation, retrieves the value of each dynamic dimension through
/// constructing the necessary DimOp operators.
@ -179,16 +181,17 @@ bool isFusableInto(const LinalgDependenceGraph &graph, LinalgOp consumer,
/// Computes tile offsets, given a list of loop `ivs` and `tileSizes`. In case a
/// tile size is zero (i.e., no tiling), the corresponding offset is also zero.
SmallVector<Value> computeTileOffsets(OpBuilder &b, Location loc,
ValueRange ivs, ValueRange tileSizes);
SmallVector<OpFoldResult> computeTileOffsets(OpBuilder &b, Location loc,
ArrayRef<OpFoldResult> ivs,
ArrayRef<OpFoldResult> tileSizes);
/// Computes tile sizes, given a list of `tileSizes` and dimension
/// sizes (`sizeBounds`). In case a tile size is zero (i.e., no tiling), the
/// corresponding result size is the corresponding value from `sizeBounds`.
/// Note: The returned tile sizes are closed intervals.
SmallVector<Value> computeTileSizes(OpBuilder &b, Location loc,
ValueRange tileSizes,
ArrayRef<Value> sizeBounds);
SmallVector<OpFoldResult> computeTileSizes(OpBuilder &b, Location loc,
ArrayRef<OpFoldResult> tileSizes,
ArrayRef<OpFoldResult> sizeBounds);
/// Returns the list of tensor output types produced when the given structured
/// operation `op` is applied to the given `operands`. Note that `operands` are
@ -217,8 +220,9 @@ Value materializeOpFoldResult(OpBuilder &b, Location loc,
/// controls whether to omit the partial/boundary tile condition check in cases
/// where we statically know that it is unnecessary.
Value makeTiledShape(OpBuilder &builder, Location loc, Value valueToTile,
ValueRange tileSizes, AffineMap map, ValueRange lbs,
ValueRange ubs, ValueRange subShapeSizes,
ArrayRef<OpFoldResult> tileSizes, AffineMap map,
ArrayRef<OpFoldResult> lbs, ArrayRef<OpFoldResult> ubs,
ArrayRef<OpFoldResult> subShapeSizes,
bool omitPartialTileCheck);
/// Creates extract_slice/subview ops for all `valuesToTile` of the given
@ -232,18 +236,20 @@ Value makeTiledShape(OpBuilder &builder, Location loc, Value valueToTile,
/// Note that a constant zero in `tileSizes` means no tiling at that implicit
/// loop. The number of non-zero values in `tileSizes` should be equal to the
/// number of values in `ivs`.
SmallVector<Value, 4> makeTiledShapes(OpBuilder &builder, Location loc,
LinalgOp linalgOp,
ArrayRef<Value> valuesToTile,
ValueRange ivs, ValueRange tileSizes,
ArrayRef<Value> sizeBounds,
bool omitPartialTileCheck);
SmallVector<Value> makeTiledShapes(OpBuilder &builder, Location loc,
LinalgOp linalgOp, ValueRange valuesToTile,
ArrayRef<OpFoldResult> ivs,
ArrayRef<OpFoldResult> tileSizes,
ArrayRef<OpFoldResult> sizeBounds,
bool omitPartialTileCheck);
/// Add the specified offsets to any `linalg.index` ops contained in the given
/// `linalgOp`. The offsets are provided in the same order as iteration space
/// dimensions. Null offsets are assumed to be zero.
void offsetIndices(OpBuilder &b, LinalgOp linalgOp, ArrayRef<Value> offests);
void offsetIndices(RewriterBase &b, LinalgOp linalgOp, ArrayRef<Value> offests);
void offsetIndices(OpBuilder &b, LinalgOp linalgOp,
ArrayRef<OpFoldResult> offests);
void offsetIndices(RewriterBase &b, LinalgOp linalgOp,
ArrayRef<OpFoldResult> offests);
using FusableOpDependencesTy = llvm::MapVector<
Operation *,

View File

@ -790,33 +790,6 @@ AffineApplyOp mlir::makeComposedAffineApply(OpBuilder &b, Location loc,
values);
}
/// Composes `map` with `operands`, then creates or folds a single-result
/// affine.apply from the outcome. Operations recorded in `constants` are
/// erased only when the result folded to an attribute.
OpFoldResult
mlir::makeComposedFoldedAffineApply(RewriterBase &b, Location loc,
AffineMap map,
ArrayRef<OpFoldResult> operands) {
assert(map.getNumResults() == 1 && "building affine.apply with !=1 result");
// `constants` is filled by materializeConstants (defined elsewhere in this
// file); presumably it holds the constant ops created for attribute operands,
// while `actualValues` receives the operands as values -- TODO confirm.
SmallVector<Operation *> constants;
SmallVector<Value> actualValues;
materializeConstants(b, loc, operands, constants, actualValues);
composeAffineMapAndOperands(&map, &actualValues);
OpFoldResult result = createOrFold<AffineApplyOp>(b, loc, actualValues, map);
// Clean up only if folding produced an attribute; otherwise the recorded
// operations are left in place.
if (result.is<Attribute>()) {
for (Operation *op : constants)
b.eraseOp(op);
}
return result;
}
/// Convenience overload taking a single affine expression: the expression is
/// turned into a map via AffineMap::inferFromExprList and forwarded to the
/// map-based overload.
OpFoldResult
mlir::makeComposedFoldedAffineApply(RewriterBase &b, Location loc,
                                    AffineExpr expr,
                                    ArrayRef<OpFoldResult> operands) {
  // Only one expression list is passed in, so the first inferred map is used.
  AffineMap exprMap =
      AffineMap::inferFromExprList(ArrayRef<AffineExpr>{expr}).front();
  return makeComposedFoldedAffineApply(b, loc, exprMap, operands);
}
/// Composes the given affine map with the given list of operands, pulling in
/// the maps from any affine.apply operations that supply the operands.
static void composeMultiResultAffineMap(AffineMap &map,
@ -847,6 +820,44 @@ static void composeMultiResultAffineMap(AffineMap &map,
canonicalizeMapAndOperands(&map, &operands);
}
/// Composes `map` with `operands`, then creates or folds a single-result
/// affine.apply from the outcome. All operations recorded in `constants` are
/// erased before returning, whether or not the result folded to an attribute.
OpFoldResult
mlir::makeComposedFoldedAffineApply(RewriterBase &b, Location loc,
AffineMap map,
ArrayRef<OpFoldResult> operands) {
assert(map.getNumResults() == 1 && "building affine.apply with !=1 result");
// `constants` is filled by materializeConstants (defined elsewhere in this
// file); presumably it holds the constant ops created for attribute operands,
// while `actualValues` receives the operands as values -- TODO confirm.
SmallVector<Operation *> constants;
SmallVector<Value> actualValues;
materializeConstants(b, loc, operands, constants, actualValues);
composeAffineMapAndOperands(&map, &actualValues);
OpFoldResult result = createOrFold<AffineApplyOp>(b, loc, actualValues, map);
// The materialized constants are erased unconditionally. The assumption (to
// be confirmed against composeAffineMapAndOperands) is that constant operands
// end up as constant expressions inside the affine.apply map, which leaves
// these operations without uses.
for (Operation *op : constants)
b.eraseOp(op);
return result;
}
/// Convenience overload taking a single affine expression: the expression is
/// turned into a map via AffineMap::inferFromExprList and forwarded to the
/// map-based overload.
OpFoldResult
mlir::makeComposedFoldedAffineApply(RewriterBase &b, Location loc,
                                    AffineExpr expr,
                                    ArrayRef<OpFoldResult> operands) {
  // Only one expression list is passed in, so the first inferred map is used.
  AffineMap exprMap =
      AffineMap::inferFromExprList(ArrayRef<AffineExpr>{expr}).front();
  return makeComposedFoldedAffineApply(b, loc, exprMap, operands);
}
/// Applies makeComposedFoldedAffineApply to every single-result sub-map of
/// `map`, in result order, and returns the collected results.
SmallVector<OpFoldResult> mlir::makeComposedFoldedMultiResultAffineApply(
    RewriterBase &b, Location loc, AffineMap map,
    ArrayRef<OpFoldResult> operands) {
  const unsigned numResults = map.getNumResults();
  SmallVector<OpFoldResult> foldedResults;
  foldedResults.reserve(numResults);
  // Each result gets its own single-result sub-map.
  for (unsigned resultIdx = 0; resultIdx < numResults; ++resultIdx) {
    foldedResults.push_back(makeComposedFoldedAffineApply(
        b, loc, map.getSubMap({resultIdx}), operands));
  }
  return foldedResults;
}
Value mlir::makeComposedAffineMin(OpBuilder &b, Location loc, AffineMap map,
ValueRange operands) {
SmallVector<Value> allOperands = llvm::to_vector(operands);
@ -854,22 +865,36 @@ Value mlir::makeComposedAffineMin(OpBuilder &b, Location loc, AffineMap map,
return b.createOrFold<AffineMinOp>(loc, b.getIndexType(), map, allOperands);
}
OpFoldResult
mlir::makeComposedFoldedAffineMin(RewriterBase &b, Location loc, AffineMap map,
ArrayRef<OpFoldResult> operands) {
template <typename OpTy>
static OpFoldResult makeComposedFoldedMinMax(RewriterBase &b, Location loc,
AffineMap map,
ArrayRef<OpFoldResult> operands) {
SmallVector<Operation *> constants;
SmallVector<Value> actualValues;
materializeConstants(b, loc, operands, constants, actualValues);
composeMultiResultAffineMap(map, actualValues);
OpFoldResult result =
createOrFold<AffineMinOp>(b, loc, actualValues, b.getIndexType(), map);
if (result.is<Attribute>()) {
for (Operation *op : constants)
b.eraseOp(op);
}
createOrFold<OpTy>(b, loc, actualValues, b.getIndexType(), map);
// Constants are always folded into affine min/max because they can be
// represented as constant expressions, so delete them.
for (Operation *op : constants)
b.eraseOp(op);
return result;
}
/// Minimum flavor of the shared min/max builder: delegates to
/// makeComposedFoldedMinMax instantiated with AffineMinOp.
OpFoldResult
mlir::makeComposedFoldedAffineMin(RewriterBase &b, Location loc, AffineMap map,
                                  ArrayRef<OpFoldResult> operands) {
  OpFoldResult foldedMin =
      makeComposedFoldedMinMax<AffineMinOp>(b, loc, map, operands);
  return foldedMin;
}
/// Maximum flavor of the shared min/max builder: delegates to
/// makeComposedFoldedMinMax instantiated with AffineMaxOp.
OpFoldResult
mlir::makeComposedFoldedAffineMax(RewriterBase &b, Location loc, AffineMap map,
                                  ArrayRef<OpFoldResult> operands) {
  OpFoldResult foldedMax =
      makeComposedFoldedMinMax<AffineMaxOp>(b, loc, map, operands);
  return foldedMax;
}
/// Fully compose map with operands and canonicalize the result.
/// Return the `createOrFold`'ed AffineApply op.
static Value createFoldedComposedAffineApply(OpBuilder &b, Location loc,
@ -896,40 +921,6 @@ SmallVector<Value, 4> mlir::applyMapToValues(OpBuilder &b, Location loc,
return res;
}
/// Applies `map` to `values` one result at a time and returns the per-result
/// outcome of createOrFold<AffineApplyOp>. Operations recorded while
/// materializing constants are erased only if every result folded to an
/// attribute.
SmallVector<OpFoldResult>
mlir::applyMapToValues(RewriterBase &b, Location loc, AffineMap map,
                       ArrayRef<OpFoldResult> values) {
  // Keep track of the operations produced while materializing constants so
  // they can be removed again once they are known to be unnecessary.
  SmallVector<Operation *> constants;
  SmallVector<Value> actualValues;
  materializeConstants(b, loc, values, constants, actualValues);

  const unsigned numResults = map.getNumResults();
  SmallVector<OpFoldResult> results;
  results.reserve(numResults);

  // Every result is composed, canonicalized and folded on its own sub-map
  // because single results may simplify more effectively.
  bool anyUnfolded = false;
  for (unsigned resultIdx = 0; resultIdx < numResults; ++resultIdx) {
    AffineMap submap = map.getSubMap({resultIdx});
    SmallVector<Value> operands = actualValues;
    fullyComposeAffineMapAndOperands(&submap, &operands);
    canonicalizeMapAndOperands(&submap, &operands);
    results.push_back(createOrFold<AffineApplyOp>(b, loc, operands, submap));
    anyUnfolded |= !results.back().is<Attribute>();
  }

  // At least one result did not fold to an attribute: leave the recorded
  // constant operations in place.
  if (anyUnfolded)
    return results;

  // Everything folded, so the constants created up front can be removed.
  for (Operation *constant : constants)
    b.eraseOp(constant);
  return results;
}
// A symbol may appear as a dim in affine.apply operations. This function
// canonicalizes dims that are valid symbols into actual symbols.
template <class MapOrSet>

View File

@ -16,6 +16,7 @@ add_mlir_dialect_library(MLIRLinalgDialect
LINK_LIBS PUBLIC
MLIRAffineDialect
MLIRArithmeticDialect
MLIRArithmeticUtils
MLIRBufferizationDialect
MLIRDialectUtils
MLIRInferTypeOpInterface

View File

@ -10,6 +10,7 @@
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
#include "mlir/Dialect/Arithmetic/Utils/Utils.h"
#include "mlir/Dialect/Complex/IR/Complex.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
@ -486,13 +487,20 @@ static Value createOrFoldDimOp(OpBuilder &b, Location loc, Value source,
return b.createOrFold<tensor::DimOp>(loc, source, dim);
llvm_unreachable("Expected MemRefType or TensorType");
}
/// Returns the size of dimension `dim` of `source` as an index attribute when
/// the type is ranked and that dimension is not dynamic; otherwise falls back
/// to createOrFoldDimOp and returns its value.
static OpFoldResult createFoldedDimOp(OpBuilder &b, Location loc, Value source,
                                      int64_t dim) {
  auto shapedType = source.getType().cast<ShapedType>();
  // The rank check comes first so that the dimension is only queried on
  // ranked types.
  const bool hasStaticDim =
      shapedType.hasRank() && !shapedType.isDynamicDim(dim);
  if (hasStaticDim)
    return b.getIndexAttr(shapedType.getDimSize(dim));
  return createOrFoldDimOp(b, loc, source, dim);
}
SmallVector<Value, 4> LinalgOp::createFlatListOfOperandDims(OpBuilder &b,
Location loc) {
SmallVector<Value, 4> res;
SmallVector<OpFoldResult> LinalgOp::createFlatListOfOperandDims(OpBuilder &b,
Location loc) {
SmallVector<OpFoldResult> res;
for (OpOperand *opOperand : getInputAndOutputOperands()) {
for (int64_t i = 0, e = getRank(opOperand); i < e; ++i)
res.push_back(createOrFoldDimOp(b, loc, opOperand->get(), i));
res.push_back(createFoldedDimOp(b, loc, opOperand->get(), i));
}
return res;
}
@ -510,14 +518,13 @@ SmallVector<Range, 4> LinalgOp::createLoopRanges(OpBuilder &b, Location loc) {
unsigned numDims = map.getNumDims(), numRes = map.getNumResults();
auto viewSizes = createFlatListOfOperandDims(b, loc);
SmallVector<Range, 4> res(numDims);
Value zeroVal = b.create<arith::ConstantIndexOp>(loc, 0);
Value oneVal = b.create<arith::ConstantIndexOp>(loc, 1);
for (unsigned idx = 0; idx < numRes; ++idx) {
auto result = map.getResult(idx);
if (auto d = result.dyn_cast<AffineDimExpr>()) {
if (res[d.getPosition()].offset)
continue;
res[d.getPosition()] = Range{zeroVal, viewSizes[idx], oneVal};
res[d.getPosition()] =
Range{b.getIndexAttr(0), viewSizes[idx], b.getIndexAttr(1)};
}
}
return res;
@ -591,9 +598,11 @@ LinalgOp::reifyResultShapes(OpBuilder &b,
outputDims.set(resultShapesSubMapPos.first, resultShapesSubMapPos.second);
HasAffineDimExprVisitor checkDimExpr(std::move(outputDims));
Location loc = getOperation()->getLoc();
auto allResultDimValues =
applyMapToValues(b, loc, resultShapesFromInputShapesMap,
createFlatListOfOperandDims(b, loc));
IRRewriter rewriter(b);
SmallVector<OpFoldResult> allResultDimValues =
makeComposedFoldedMultiResultAffineApply(
rewriter, loc, resultShapesFromInputShapesMap,
createFlatListOfOperandDims(b, loc));
int64_t pos = 0;
ArrayRef<AffineExpr> shapeExprs = resultShapesFromInputShapesMap.getResults();
for (OpOperand *opOperand : getOutputOperands()) {
@ -602,7 +611,8 @@ LinalgOp::reifyResultShapes(OpBuilder &b,
if (checkDimExpr.visit(shapeExprs[pos]))
shapes.push_back(createOrFoldDimOp(b, loc, opOperand->get(), dim));
else
shapes.push_back(allResultDimValues[pos]);
shapes.push_back(
getValueOrCreateConstantIndexOp(b, loc, allResultDimValues[pos]));
pos++;
}
reifiedReturnShapes.emplace_back(std::move(shapes));

View File

@ -630,12 +630,8 @@ struct FoldInsertPadIntoFill : public OpRewritePattern<tensor::InsertSliceOp> {
// plus low padding sizes.
SmallVector<OpFoldResult, 4> newOffsets;
for (const auto &p : llvm::zip(lowPads, oldOffsets)) {
Value padValue = getValueOrCreateConstantIndexOp(
rewriter, srcPadOp.getLoc(), std::get<0>(p));
Value offsetValue = getValueOrCreateConstantIndexOp(
rewriter, insertOp.getLoc(), std::get<1>(p));
newOffsets.push_back(
applyMapToValues(rewriter, loc, addMap, {offsetValue, padValue})[0]);
newOffsets.push_back(makeComposedFoldedAffineApply(
rewriter, loc, addMap, {std::get<0>(p), std::get<1>(p)}));
}
SmallVector<OpFoldResult, 4> newSizes;

View File

@ -18,6 +18,7 @@
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Linalg/Passes.h"
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
#include "mlir/Dialect/Linalg/Utils/Utils.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
using namespace mlir;
@ -88,40 +89,35 @@ struct BubbleUpExtractSliceOpPattern
}
auto linalgLoc = linalgOp.getLoc();
auto allShapeSizes =
SmallVector<OpFoldResult> allShapeSizes =
linalgOp.createFlatListOfOperandDims(rewriter, linalgLoc);
AffineMap shapeSizesToLoopsMap = linalgOp.getShapesToLoopsMap();
if (!shapeSizesToLoopsMap) {
return rewriter.notifyMatchFailure(
linalgOp, "failed to get loops map from shape sizes");
}
auto sizeBounds = applyMapToValues(rewriter, linalgLoc,
shapeSizesToLoopsMap, allShapeSizes);
auto sliceLoc = sliceOp.getLoc();
auto offsetVals = getValueOrCreateConstantIndexOp(
rewriter, sliceLoc, sliceOp.getMixedOffsets());
auto sizeVals = getValueOrCreateConstantIndexOp(rewriter, sliceLoc,
sliceOp.getMixedSizes());
SmallVector<OpFoldResult> sizeBounds =
makeComposedFoldedMultiResultAffineApply(
rewriter, linalgLoc, shapeSizesToLoopsMap, allShapeSizes);
// The offsets and sizes from the slice operation only give you the tile
// size of the output. Use that to compute the tile sizes and offsets of the
// loops. For loops not used to access the output, set the tile sizes to
// loop bounds and set the offset to 0.
Value zero = rewriter.create<arith::ConstantIndexOp>(linalgLoc, 0);
SmallVector<Value, 4> tileOffsets(sizeBounds.size(), zero);
SmallVector<Value, 4> tileSizes = sizeBounds;
SmallVector<OpFoldResult> tileOffsets(sizeBounds.size(),
rewriter.getIndexAttr(0));
SmallVector<OpFoldResult> tileSizes = sizeBounds;
for (auto const &result : enumerate(indexingMap.getResults())) {
unsigned position = result.value().cast<AffineDimExpr>().getPosition();
tileOffsets[position] = offsetVals[result.index()];
tileSizes[position] = sizeVals[result.index()];
tileOffsets[position] = sliceOp.getMixedOffsets()[result.index()];
tileSizes[position] = sliceOp.getMixedSizes()[result.index()];
}
SmallVector<Value> valuesToTile = linalgOp.getInputAndOutputOperands();
SmallVector<Value, 4> tiledOperands = makeTiledShapes(
rewriter, linalgLoc, linalgOp, valuesToTile, tileOffsets, tileSizes,
sizeBounds, /*omitPartialTileCheck=*/true);
SmallVector<Value> tiledOperands =
makeTiledShapes(rewriter, linalgLoc, linalgOp, valuesToTile,
tileOffsets, tileSizes, sizeBounds,
/*omitPartialTileCheck=*/true);
SmallVector<Type, 4> resultTensorTypes;
for (OpOperand *opOperand : linalgOp.getOutputTensorOperands())

View File

@ -109,7 +109,9 @@ static SmallVector<OpFoldResult> getGenericOpLoopRange(OpBuilder &b,
auto allShapesSizes =
cast<LinalgOp>(op.getOperation()).createFlatListOfOperandDims(b, loc);
AffineMap map = op.getShapesToLoopsMap();
return getAsOpFoldResult(applyMapToValues(b, loc, map, allShapesSizes));
IRRewriter rewriter(b);
return makeComposedFoldedMultiResultAffineApply(rewriter, loc, map,
allShapesSizes);
}
/// Helper method to permute the list of `values` based on the `map`.

View File

@ -113,24 +113,24 @@ static SmallVector<Value> getTiledOperands(LinalgOp producer) {
/// obtained from the producer itself, since they are not tiled + fused.
static LinalgOp fuse(OpBuilder &b, LinalgOp producer,
const DenseMap<unsigned, Range> &fusedLoopsAndRanges) {
SmallVector<Value, 8> ivs, tileSizes, sizeBounds;
SmallVector<Range, 8> loopRanges;
SmallVector<OpFoldResult> ivs, tileSizes, sizeBounds;
SmallVector<Range> loopRanges;
Location loc = producer.getLoc();
auto zero = b.create<arith::ConstantIndexOp>(loc, 0);
for (unsigned i = 0, e = producer.getNumLoops(); i < e; ++i) {
auto shapeDim = getShapeDefiningLoopRange(producer, i);
Value dim = createOrFoldDimOp(b, loc, shapeDim.shape, shapeDim.dimension);
OpFoldResult dim =
createFoldedDimOp(b, loc, shapeDim.shape, shapeDim.dimension);
sizeBounds.push_back(dim);
auto it = fusedLoopsAndRanges.find(i);
if (it != fusedLoopsAndRanges.end()) {
ivs.push_back(materializeOpFoldResult(b, loc, it->second.offset));
tileSizes.push_back(materializeOpFoldResult(b, loc, it->second.size));
ivs.push_back(it->second.offset);
tileSizes.push_back(it->second.size);
loopRanges.push_back(it->second);
LLVM_DEBUG(llvm::dbgs() << "tiled loop#" << i << " with LoopRange "
<< loopRanges.back() << "\n");
} else {
tileSizes.push_back(zero);
tileSizes.push_back(b.getIndexAttr(0));
loopRanges.push_back(Range{b.getIndexAttr(0), dim, b.getIndexAttr(1)});
LLVM_DEBUG(llvm::dbgs() << "full loop#" << i << " with LoopRange "
<< loopRanges.back() << "\n");
@ -166,10 +166,8 @@ static LinalgOp fuse(OpBuilder &b, LinalgOp producer,
Operation *clonedOp = producer.clone(b, loc, resultTypes, clonedShapes);
// Shift all IndexOp results by the tile offset.
SmallVector<Value> allIvs;
llvm::transform(loopRanges, std::back_inserter(allIvs), [&](Range range) {
return materializeOpFoldResult(b, loc, range.offset);
});
SmallVector<OpFoldResult> allIvs = llvm::to_vector(
llvm::map_range(loopRanges, [&](Range range) { return range.offset; }));
offsetIndices(b, clonedOp, allIvs);
return clonedOp;

View File

@ -141,30 +141,27 @@ static LinalgOp getTiledProducer(OpBuilder &b, OpResult producerResult,
Location loc = producerOp.getLoc();
// Obtain the `producerOp` loop bounds and the `sliceOp` ranges.
SmallVector<Value> producerLoopBounds;
SmallVector<OpFoldResult> producerLoopBounds;
llvm::transform(producerOp.createLoopRanges(b, loc),
std::back_inserter(producerLoopBounds), [&](Range range) {
return materializeOpFoldResult(b, loc, range.size);
});
std::back_inserter(producerLoopBounds),
[&](Range range) { return range.size; });
SmallVector<Range> sliceOpRanges = sliceOp.getOrCreateRanges(b, loc);
// Tile the producer operands given the `sliceOp` ranges. Iterate the
// `tiledSliceDimIndices` and store the tile offset and size for the tiled
// slice dimension.
auto zero = b.create<arith::ConstantIndexOp>(loc, 0);
SmallVector<Value> tileIvs(producerOp.getNumLoops(), nullptr);
SmallVector<Value> tileSizes(producerOp.getNumLoops(), zero);
SmallVector<Value> allIvs(producerOp.getNumLoops(), nullptr);
SmallVector<OpFoldResult> tileIvs(producerOp.getNumLoops(), nullptr);
SmallVector<OpFoldResult> tileSizes(producerOp.getNumLoops(),
b.getIndexAttr(0));
SmallVector<OpFoldResult> allIvs(producerOp.getNumLoops(), nullptr);
for (auto it : zip(tiledSliceDimIndices, tiledProducerLoopIndices)) {
int64_t tiledSliceDim = std::get<0>(it);
int64_t tiledProducerLoop = std::get<1>(it);
tileIvs[tiledProducerLoop] =
materializeOpFoldResult(b, loc, sliceOpRanges[tiledSliceDim].offset);
tileSizes[tiledProducerLoop] =
materializeOpFoldResult(b, loc, sliceOpRanges[tiledSliceDim].size);
tileIvs[tiledProducerLoop] = sliceOpRanges[tiledSliceDim].offset;
tileSizes[tiledProducerLoop] = sliceOpRanges[tiledSliceDim].size;
allIvs[tiledProducerLoop] = tileIvs[tiledProducerLoop];
}
erase_value(tileIvs, nullptr);
erase_value(tileIvs, OpFoldResult());
SmallVector<Value> tiledOperands = producerOp.getInputAndOutputOperands();
tiledOperands = makeTiledShapes(b, loc, producerOp, tiledOperands, tileIvs,
tileSizes, producerLoopBounds,

View File

@ -13,6 +13,7 @@
#include <utility>
#include "PassDetail.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arithmetic/Utils/Utils.h"
#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
@ -36,20 +37,27 @@ using namespace mlir::scf;
#define DEBUG_TYPE "linalg-tiling"
static bool isZero(Value v) {
if (auto cst = v.getDefiningOp<arith::ConstantIndexOp>())
/// Returns true if `v` is an integer attribute whose value is zero, or a
/// value defined by an arith::ConstantIndexOp equal to 0. A null `v` and
/// anything else yield false.
static bool isZero(OpFoldResult v) {
  if (!v)
    return false;

  Attribute attr = v.dyn_cast<Attribute>();
  if (!attr) {
    // Value case: only a defining constant-index op can make it a zero.
    auto constantOp = v.get<Value>().getDefiningOp<arith::ConstantIndexOp>();
    return constantOp && constantOp.value() == 0;
  }

  // Attribute case: non-integer attributes are never treated as zero.
  IntegerAttr intAttr = attr.dyn_cast<IntegerAttr>();
  return intAttr && intAttr.getValue().isZero();
}
std::tuple<SmallVector<Range, 4>, LoopIndexToRangeIndexMap>
mlir::linalg::makeTiledLoopRanges(RewriterBase &b, Location loc, AffineMap map,
ValueRange allShapeSizes,
ValueRange allTileSizes) {
ArrayRef<OpFoldResult> allShapeSizes,
ArrayRef<OpFoldResult> allTileSizes) {
assert(allTileSizes.size() == map.getNumResults());
// Apply `map` to get shape sizes in loop order.
auto shapeSizes = applyMapToValues(b, loc, map, allShapeSizes);
SmallVector<Value, 4> tileSizes(allTileSizes.begin(), allTileSizes.end());
SmallVector<OpFoldResult> shapeSizes =
makeComposedFoldedMultiResultAffineApply(b, loc, map, allShapeSizes);
SmallVector<OpFoldResult> tileSizes(allTileSizes.begin(), allTileSizes.end());
// Traverse the tile sizes, which are in loop order, erase zeros everywhere.
LoopIndexToRangeIndexMap loopIndexToRangeIndex;
@ -80,7 +88,7 @@ void mlir::linalg::transformIndexOps(
continue;
en.value() = ivs[rangeIndex->second];
}
offsetIndices(b, op, allIvs);
offsetIndices(b, op, getAsOpFoldResult(allIvs));
}
/// Asserts that the given index-typed value is strictly positive. If the value
@ -121,14 +129,15 @@ mlir::linalg::computeMultiTileSizes(OpBuilder &builder, LinalgOp op,
// Find the trip count of the iteration space dimension for which the tile
// sizes are computed.
// TODO: update createFlatListOfOperandDims to return OpFoldResults and avoid
// littering by useless constant materialization.
SmallVector<Value, 4> allShapes =
SmallVector<OpFoldResult> allShapes =
op.createFlatListOfOperandDims(b, b.getLoc());
AffineMap shapesToLoops = op.getShapesToLoopsMap();
SmallVector<Value, 4> loopRanges =
applyMapToValues(b, op.getLoc(), shapesToLoops, allShapes);
Value tripCount = loopRanges[dimension];
IRRewriter rewriter(b);
SmallVector<OpFoldResult> loopRanges =
makeComposedFoldedMultiResultAffineApply(rewriter, op.getLoc(),
shapesToLoops, allShapes);
Value tripCount =
materializeOpFoldResult(rewriter, op.getLoc(), loopRanges[dimension]);
// Compute the tile sizes and the respective numbers of tiles.
AffineExpr s0 = b.getAffineSymbolExpr(0);
@ -181,15 +190,6 @@ createMatchingParallelSubsetInsertOp(OpBuilder &b, Location loc,
subsetExtractOp.getMixedSizes(), subsetExtractOp.getMixedStrides());
}
/// Creates (or folds) an AffineMaxOp over `vals` using a multi-dimensional
/// identity map with one dimension per entry. Every entry is first turned
/// into a Value through getValueOrCreateConstantIndexOp.
static OpFoldResult buildMax(OpBuilder &b, Location loc,
                             ArrayRef<OpFoldResult> vals) {
  SmallVector<Value> maxOperands =
      getValueOrCreateConstantIndexOp(b, loc, vals);
  AffineMap identityMap =
      AffineMap::getMultiDimIdentityMap(vals.size(), loc.getContext());
  return b.createOrFold<AffineMaxOp>(loc, identityMap, maxOperands);
}
/// Returns true if the maximum tile offset `tileSize * numThreads-1` is less
/// than `iterationSize`.
static bool canOmitTileOffsetInBoundsCheck(OpFoldResult tileSize,
@ -203,6 +203,24 @@ static bool canOmitTileOffsetInBoundsCheck(OpFoldResult tileSize,
return *tileSizeConst * (*numThreadsConst - 1) < *iterSizeConst;
}
/// Builds the maximum of all `vals` through makeComposedFoldedAffineMax,
/// using a multi-dimensional identity map with one dimension per entry.
static OpFoldResult buildMax(OpBuilder &b, Location loc,
                             ArrayRef<OpFoldResult> vals) {
  // The folded builders take a RewriterBase, so wrap the plain builder.
  IRRewriter rewriter(b);
  AffineMap identityMap =
      AffineMap::getMultiDimIdentityMap(vals.size(), loc.getContext());
  return makeComposedFoldedAffineMax(rewriter, loc, identityMap, vals);
}
/// Builds the minimum of all `vals` through makeComposedFoldedAffineMin,
/// using a multi-dimensional identity map with one dimension per entry.
static OpFoldResult buildMin(OpBuilder &b, Location loc,
                             ArrayRef<OpFoldResult> vals) {
  // The folded builders take a RewriterBase, so wrap the plain builder.
  IRRewriter rewriter(b);
  AffineMap identityMap =
      AffineMap::getMultiDimIdentityMap(vals.size(), loc.getContext());
  return makeComposedFoldedAffineMin(rewriter, loc, identityMap, vals);
}
/// Rewrite a TilingInterface `op` to a tiled `scf.foreach_thread`. The
/// tiling is specified by the number of tiles/threads `numThreads` and the
/// optional nominal tile size `nominalTileSizes`. If `nominalTileSizes` is
@ -242,7 +260,6 @@ static FailureOr<ForeachThreadTilingResult> tileToForeachThreadOpImpl(
return materializeOpFoldResult(ilocb, ofr);
}));
Value zero = b.create<arith::ConstantIndexOp>(loc, 0);
Operation *tiledOp = nullptr;
// Create the ForeachThreadOp. We don't use the lambda body-builder
@ -273,9 +290,9 @@ static FailureOr<ForeachThreadTilingResult> tileToForeachThreadOpImpl(
AffineExpr i, j, M, N, O;
bindDims(b.getContext(), i, j);
bindSymbols(b.getContext(), M, N, O);
Value size = loopRanges[loopIdx].size;
Value offset = loopRanges[loopIdx].offset;
Value threadId = threadIds[threadIdIdx];
OpFoldResult size = loopRanges[loopIdx].size;
OpFoldResult offset = loopRanges[loopIdx].offset;
OpFoldResult threadId = threadIds[threadIdIdx];
// Symbolic fixed max size per thread.
// TODO: floor + 0/1 depending on case for better load-balancing.
OpFoldResult tileSizePerThread =
@ -295,9 +312,8 @@ static FailureOr<ForeachThreadTilingResult> tileToForeachThreadOpImpl(
if (!isConstantIntValue(residualTileSize, 0)) {
OpFoldResult sizeMinusOffsetPerThread = makeComposedFoldedAffineApply(
b, loc, -i + M, {offsetPerThread, size});
tileSizePerThread = makeComposedFoldedAffineMin(
b, loc, AffineMap::getMultiDimIdentityMap(2, b.getContext()),
ArrayRef<OpFoldResult>{sizeMinusOffsetPerThread, tileSizePerThread});
tileSizePerThread =
buildMin(b, loc, {sizeMinusOffsetPerThread, tileSizePerThread});
}
tiledOffsets.push_back(offsetPerThread);
@ -305,7 +321,8 @@ static FailureOr<ForeachThreadTilingResult> tileToForeachThreadOpImpl(
if (!omitTileOffsetBoundsCheck &&
!canOmitTileOffsetInBoundsCheck(tileSizePerThread,
nonZeroNumThreads[threadIdIdx], size))
tileSizePerThread = buildMax(b, loc, {zero, tileSizePerThread});
tileSizePerThread =
buildMax(b, loc, {b.getIndexAttr(0), tileSizePerThread});
tiledSizes.push_back(tileSizePerThread);
++threadIdIdx;
@ -380,7 +397,7 @@ static Value insertSliceIntoTensor(RewriterBase &b, Location loc,
template <typename LoopTy>
static FailureOr<TiledLinalgOp>
tileLinalgOpImpl(RewriterBase &b, LinalgOp op, ValueRange tileSizes,
tileLinalgOpImpl(RewriterBase &b, LinalgOp op, ArrayRef<OpFoldResult> tileSizes,
const LinalgTilingOptions &options) {
auto nLoops = op.getNumLoops();
// Initial tile sizes may be too big, only take the first nLoops.
@ -395,7 +412,8 @@ tileLinalgOpImpl(RewriterBase &b, LinalgOp op, ValueRange tileSizes,
}
// 1. Build the tiled loop ranges.
auto allShapeSizes = op.createFlatListOfOperandDims(b, op.getLoc());
SmallVector<OpFoldResult> allShapeSizes =
op.createFlatListOfOperandDims(b, op.getLoc());
AffineMap shapeSizesToLoopsMap = op.getShapesToLoopsMap();
if (!shapeSizesToLoopsMap)
return failure();
@ -460,11 +478,14 @@ tileLinalgOpImpl(RewriterBase &b, LinalgOp op, ValueRange tileSizes,
static_cast<size_t>(op.getNumInputsAndOutputs()) &&
"expect the number of operands and inputs and outputs to match");
SmallVector<Value> valuesToTile = operandValuesToUse;
auto sizeBounds =
applyMapToValues(b, loc, shapeSizesToLoopsMap, allShapeSizes);
SmallVector<Value, 4> tiledOperands =
makeTiledShapes(b, loc, op, valuesToTile, interchangedIvs, tileSizes,
sizeBounds, /*omitPartialTileCheck=*/false);
IRRewriter rewriter(b);
SmallVector<OpFoldResult> sizeBounds =
makeComposedFoldedMultiResultAffineApply(
rewriter, loc, shapeSizesToLoopsMap, allShapeSizes);
SmallVector<Value> tiledOperands = makeTiledShapes(
b, loc, op, valuesToTile, getAsOpFoldResult(interchangedIvs), tileSizes,
sizeBounds,
/*omitPartialTileCheck=*/false);
SmallVector<Type> resultTensorTypes =
getTensorOutputTypes(op, tiledOperands);
@ -518,11 +539,10 @@ FailureOr<TiledLinalgOp> static tileLinalgOpImpl(
// dimension. This convention is significantly simpler to handle instead of
// adjusting affine maps to account for missing dimensions.
auto nLoops = op.getNumLoops();
SmallVector<Value, 4> tileSizeVector =
options.tileSizeComputationFunction(b, op);
SmallVector<OpFoldResult> tileSizeVector =
getAsOpFoldResult(options.tileSizeComputationFunction(b, op));
if (tileSizeVector.size() < nLoops) {
auto zero = b.create<arith::ConstantIndexOp>(op.getLoc(), 0);
tileSizeVector.append(nLoops - tileSizeVector.size(), zero);
tileSizeVector.append(nLoops - tileSizeVector.size(), b.getIndexAttr(0));
}
return tileLinalgOpImpl<LoopTy>(b, op, tileSizeVector, options);
@ -555,24 +575,22 @@ static LogicalResult tilePadOp(RewriterBase &builder, tensor::PadOp op,
newPadOp = cast<tensor::PadOp>(builder.clone(*op.getOperation()));
// Get rank and tile sizes.
int64_t rank = op.getResultType().getRank();
SmallVector<Value> tileSizes =
options.tileSizeComputationFunction(builder, op);
SmallVector<OpFoldResult> tileSizes =
getAsOpFoldResult(options.tileSizeComputationFunction(builder, op));
// Normalize untiled padding dimensions to 0.
Value zero = builder.create<arith::ConstantIndexOp>(loc, 0);
tileSizes.append(rank - tileSizes.size(), zero);
tileSizes.append(rank - tileSizes.size(), builder.getIndexAttr(0));
// Compute lower and upper bounds of the loop nest.
TilingInterface tilingInterface =
dyn_cast<TilingInterface>(op.getOperation());
SmallVector<Range> ranges = tilingInterface.getIterationDomain(builder);
SmallVector<Value> lbs, dims, allDims, steps;
SmallVector<Value> lbs, dims, steps;
SmallVector<OpFoldResult> allDims;
for (int64_t i = 0; i < rank; ++i) {
Value materializedSize =
materializeOpFoldResult(builder, loc, ranges[i].size);
allDims.push_back(materializedSize);
allDims.push_back(ranges[i].size);
if (!isZero(tileSizes[i])) {
lbs.push_back(materializeOpFoldResult(builder, loc, ranges[i].offset));
dims.push_back(materializedSize);
steps.push_back(tileSizes[i]);
dims.push_back(materializeOpFoldResult(builder, loc, ranges[i].size));
steps.push_back(materializeOpFoldResult(builder, loc, tileSizes[i]));
}
}
// Generate loop nest: One loop per dimension.
@ -583,9 +601,11 @@ static LogicalResult tilePadOp(RewriterBase &builder, tensor::PadOp op,
[&](OpBuilder &b, Location loc, ValueRange localIvs,
ValueRange iterArgs) -> scf::ValueVector {
// Compute offsets and sizes of ExtractSliceOp.
SmallVector<Value> offsets =
computeTileOffsets(b, loc, localIvs, tileSizes);
SmallVector<Value> sizes = computeTileSizes(b, loc, tileSizes, allDims);
SmallVector<Value> localIVVector = llvm::to_vector(localIvs);
SmallVector<OpFoldResult> offsets = computeTileOffsets(
b, loc, getAsOpFoldResult(localIVVector), tileSizes);
SmallVector<OpFoldResult> sizes =
computeTileSizes(b, loc, tileSizes, allDims);
// Create ExtractSliceOp: Extract a tile from the tensor::PadOp.
// Note: The tensor::PadOp is located outside of the loop nest. It is
// later moved inside by ExtractSliceOfPadTensorSwapPattern.

View File

@ -32,7 +32,7 @@ struct LinalgOpTilingInterface
LinalgOpTy> {
/// Return the destination operands.
SmallVector<Value> getDestinationOperands(Operation *op, OpBuilder &b) const {
return llvm::cast<LinalgOp>(op).getOutputOperands();
return cast<LinalgOp>(op).getOutputOperands();
}
/// Return the loop iterator type.
@ -50,13 +50,16 @@ struct LinalgOpTilingInterface
b.setInsertionPoint(op);
Location loc = op->getLoc();
LinalgOp linalgOp = cast<LinalgOp>(op);
auto allShapesSizes = linalgOp.createFlatListOfOperandDims(b, loc);
SmallVector<OpFoldResult> allShapesSizes =
linalgOp.createFlatListOfOperandDims(b, loc);
AffineMap map = linalgOp.getShapesToLoopsMap();
Value zero = b.create<arith::ConstantIndexOp>(loc, 0);
Value one = b.create<arith::ConstantIndexOp>(loc, 1);
return llvm::to_vector(llvm::map_range(
applyMapToValues(b, loc, map, allShapesSizes), [&](Value v) {
return Range{zero, v, one};
IRRewriter rewriter(b);
return llvm::to_vector(
llvm::map_range(map.getResults(), [&](AffineExpr loopExpr) {
OpFoldResult ofr = makeComposedFoldedAffineApply(
rewriter, loc, loopExpr, allShapesSizes);
return Range{b.getIndexAttr(0), ofr, b.getIndexAttr(1)};
}));
}
@ -71,11 +74,8 @@ struct LinalgOpTilingInterface
Location loc = op->getLoc();
LinalgOp linalgOp = cast<LinalgOp>(op);
SmallVector<Value> valuesToTile = linalgOp.getInputAndOutputOperands();
SmallVector<Value> offsetValues =
getValueOrCreateConstantIndexOp(b, loc, offsets);
SmallVector<Value, 4> tiledOperands = makeTiledShapes(
b, loc, linalgOp, valuesToTile, offsetValues,
getValueOrCreateConstantIndexOp(b, loc, sizes), {}, true);
b, loc, linalgOp, valuesToTile, offsets, sizes, {}, true);
SmallVector<Type> resultTensorTypes = llvm::to_vector(llvm::map_range(
linalgOp.getOutputTensorOperands(), [&](OpOperand *opOperand) {
@ -84,7 +84,7 @@ struct LinalgOpTilingInterface
Operation *tiledOp =
linalgOp.clone(b, loc, resultTensorTypes, tiledOperands);
offsetIndices(b, cast<LinalgOp>(tiledOp), offsetValues);
offsetIndices(b, cast<LinalgOp>(tiledOp), offsets);
return {tiledOp};
}
@ -102,28 +102,16 @@ struct LinalgOpTilingInterface
AffineExpr d0;
bindDims(b.getContext(), d0);
auto fullyComposeAffineMapAndOperands = [](OpBuilder &builder, Location loc,
AffineExpr expr,
ValueRange operands) -> Value {
AffineMap map = AffineMap::inferFromExprList({expr}).front();
SmallVector<Value> normalizedOperands(operands.begin(), operands.end());
mlir::fullyComposeAffineMapAndOperands(&map, &normalizedOperands);
canonicalizeMapAndOperands(&map, &normalizedOperands);
return builder.createOrFold<AffineApplyOp>(loc, map, normalizedOperands);
};
SmallVector<Value> sizeVals =
getValueOrCreateConstantIndexOp(b, loc, sizes);
SmallVector<Value> subShapeSizes =
llvm::to_vector(llvm::map_range(sizeVals, [&](Value v) {
return fullyComposeAffineMapAndOperands(b, loc, d0 - 1, v);
IRRewriter rewriter(b);
SmallVector<OpFoldResult> subShapeSizes =
llvm::to_vector(llvm::map_range(sizes, [&](OpFoldResult ofr) {
return makeComposedFoldedAffineApply(rewriter, loc, d0 - 1, ofr);
}));
OpOperand *outOperand = linalgOp.getOutputOperand(resultNumber);
Value sliceOpResult =
makeTiledShape(b, loc, outOperand->get(), sizeVals,
linalgOp.getTiedIndexingMap(outOperand),
getValueOrCreateConstantIndexOp(b, loc, offsets),
makeTiledShape(b, loc, outOperand->get(), sizes,
linalgOp.getTiedIndexingMap(outOperand), offsets,
/*ubs*/ {}, subShapeSizes, true);
auto sliceOp = sliceOpResult.getDefiningOp<tensor::ExtractSliceOp>();
if (!sliceOp)

View File

@ -142,14 +142,18 @@ LinalgTilingOptions &mlir::linalg::LinalgTilingOptions::scalarizeDynamicDims() {
if (!linalgOp)
return tileSizes;
Location loc = linalgOp.getLoc();
auto allShapeSizes = linalgOp.createFlatListOfOperandDims(b, loc);
SmallVector<OpFoldResult> allShapeSizes =
linalgOp.createFlatListOfOperandDims(b, loc);
AffineMap map = linalgOp.getShapesToLoopsMap();
if (!map)
return tileSizes;
auto shapeSizes = applyMapToValues(b, loc, map, allShapeSizes);
IRRewriter rewriter(b);
SmallVector<OpFoldResult> shapeSizes =
makeComposedFoldedMultiResultAffineApply(rewriter, loc, map,
allShapeSizes);
// If the shape size is dynamic, tile by 1. Otherwise, do not tile (tile
// size 0).
for (Value shapeSize : shapeSizes)
for (OpFoldResult shapeSize : shapeSizes)
tileSizes.push_back(getConstantIntValue(shapeSize)
? b.create<arith::ConstantIndexOp>(loc, 0)
: b.create<arith::ConstantIndexOp>(loc, 1));

View File

@ -42,8 +42,14 @@ using namespace presburger;
using namespace mlir::linalg;
using namespace mlir::scf;
/// Returns true only when `v` can be recognized as the constant zero:
///   - an attribute that is an IntegerAttr whose value is zero, or
///   - a value defined by an `arith::ConstantIndexOp` whose value is 0.
/// A null `v`, a non-integer attribute, or any other value yields false.
static bool isZero(Value v) {
if (auto cst = v.getDefiningOp<arith::ConstantIndexOp>())
static bool isZero(OpFoldResult v) {
// A null OpFoldResult is never considered zero.
if (!v)
return false;
// Attribute case: only integer attributes can be zero.
if (auto attr = v.dyn_cast<Attribute>()) {
IntegerAttr intAttr = attr.dyn_cast<IntegerAttr>();
return intAttr && intAttr.getValue().isZero();
}
// Value case: look through to a defining constant index op, if any.
if (auto cst = v.get<Value>().getDefiningOp<arith::ConstantIndexOp>())
return cst.value() == 0;
return false;
}
@ -59,7 +65,7 @@ namespace {
// `d0 + 2 * d1 + d3` is tiled by [0, 0, 0, 2] but not by [0, 0, 2, 0]
//
struct TileCheck : public AffineExprVisitor<TileCheck> {
TileCheck(ValueRange tileSizes) : tileSizes(tileSizes) {}
TileCheck(ArrayRef<OpFoldResult> tileSizes) : tileSizes(tileSizes) {}
void visitDimExpr(AffineDimExpr expr) {
isTiled |= !isZero(tileSizes[expr.getPosition()]);
@ -72,12 +78,12 @@ struct TileCheck : public AffineExprVisitor<TileCheck> {
"nonpositive multiplying coefficient");
}
bool isTiled = false;
ValueRange tileSizes;
ArrayRef<OpFoldResult> tileSizes;
};
} // namespace
static bool isTiled(AffineExpr expr, ValueRange tileSizes) {
static bool isTiled(AffineExpr expr, ArrayRef<OpFoldResult> tileSizes) {
if (!expr)
return false;
TileCheck t(tileSizes);
@ -86,7 +92,7 @@ static bool isTiled(AffineExpr expr, ValueRange tileSizes) {
}
// Checks whether the `map` varies with respect to a non-zero `tileSize`.
static bool isTiled(AffineMap map, ValueRange tileSizes) {
static bool isTiled(AffineMap map, ArrayRef<OpFoldResult> tileSizes) {
if (!map)
return false;
for (unsigned r = 0; r < map.getNumResults(); ++r)
@ -201,6 +207,14 @@ Value createOrFoldDimOp(OpBuilder &b, Location loc, Value source, int64_t dim) {
llvm_unreachable("Expected MemRefType or TensorType");
}
/// Returns the size of dimension `dim` of `source` as an OpFoldResult.
/// When `source` is ranked and `dim` is not dynamic, the result is an index
/// attribute holding the dimension size taken from the type; otherwise the
/// result is whatever `createOrFoldDimOp` produces for that dimension.
OpFoldResult createFoldedDimOp(OpBuilder &b, Location loc, Value source,
                               int64_t dim) {
  auto sourceType = source.getType().cast<ShapedType>();
  // `isDynamicDim` is only queried once the type is known to be ranked.
  const bool hasStaticSize =
      sourceType.hasRank() && !sourceType.isDynamicDim(dim);
  if (hasStaticSize)
    return b.getIndexAttr(sourceType.getDimSize(dim));
  return createOrFoldDimOp(b, loc, source, dim);
}
/// Given an operation, retrieves the value of each dynamic dimension through
/// constructing the necessary DimOp operators.
SmallVector<Value, 4> getDynOperands(Location loc, Value val, OpBuilder &b) {
@ -788,18 +802,10 @@ void GenerateLoopNest<scf::ParallelOp>::doit(
assert(ivs.size() == iteratorTypes.size() && "did not generate enough loops");
}
/// Builds the single-result affine map inferred from `expr`, runs
/// `fullyComposeAffineMapAndOperands` and then `canonicalizeMapAndOperands`
/// on it together with a copy of `operands`, and finally emits an
/// AffineApplyOp through `createOrFold` at `loc`.
static Value fullyComposeAndAffineApply(OpBuilder &b, Location loc,
                                        AffineExpr expr, ValueRange operands) {
  // Work on a local copy: both helpers below update the operand list in place.
  SmallVector<Value> composedOperands(operands.begin(), operands.end());
  AffineMap composedMap = AffineMap::inferFromExprList({expr}).front();
  mlir::fullyComposeAffineMapAndOperands(&composedMap, &composedOperands);
  canonicalizeMapAndOperands(&composedMap, &composedOperands);
  return b.createOrFold<AffineApplyOp>(loc, composedMap, composedOperands);
}
Value makeTiledShape(OpBuilder &builder, Location loc, Value valueToTile,
ValueRange tileSizes, AffineMap map, ValueRange lbs,
ValueRange ubs, ValueRange subShapeSizes,
ArrayRef<OpFoldResult> tileSizes, AffineMap map,
ArrayRef<OpFoldResult> lbs, ArrayRef<OpFoldResult> ubs,
ArrayRef<OpFoldResult> subShapeSizes,
bool omitPartialTileCheck) {
auto shapedType = valueToTile.getType().dyn_cast<ShapedType>();
assert(shapedType && "only shaped types can be tiled");
@ -815,8 +821,8 @@ Value makeTiledShape(OpBuilder &builder, Location loc, Value valueToTile,
LLVM_DEBUG(llvm::dbgs() << "makeTiledShape: for dim#" << r);
if (!isTiled(map.getSubMap({r}), tileSizes)) {
offsets.push_back(builder.getIndexAttr(0));
Value dim = createOrFoldDimOp(builder, loc, valueToTile, r);
sizes.push_back(getAsOpFoldResult(dim));
OpFoldResult dim = createFoldedDimOp(builder, loc, valueToTile, r);
sizes.push_back(dim);
strides.push_back(builder.getIndexAttr(1));
LLVM_DEBUG(llvm::dbgs() << ": not tiled: use size: " << dim << "\n");
continue;
@ -827,14 +833,15 @@ Value makeTiledShape(OpBuilder &builder, Location loc, Value valueToTile,
// (i.e. the op does not subsample, stepping occurs in the loop).
auto m = map.getSubMap({r});
LLVM_DEBUG(llvm::dbgs() << "makeTiledShape: submap: " << m << "\n");
auto offset = applyMapToValues(builder, loc, m, lbs).front();
offsets.push_back(getAsOpFoldResult(offset));
auto closedIntSize =
applyMapToValues(builder, loc, m, subShapeSizes).front();
IRRewriter rewriter(builder);
OpFoldResult offset = makeComposedFoldedAffineApply(rewriter, loc, m, lbs);
offsets.push_back(offset);
OpFoldResult closedIntSize =
makeComposedFoldedAffineApply(rewriter, loc, m, subShapeSizes);
// Resulting size needs to be made half open interval again.
AffineExpr s0 = getAffineSymbolExpr(0, builder.getContext());
Value size =
fullyComposeAndAffineApply(builder, loc, s0 + 1, closedIntSize);
OpFoldResult size =
makeComposedFoldedAffineApply(rewriter, loc, s0 + 1, closedIntSize);
LLVM_DEBUG(llvm::dbgs() << "makeTiledShape: raw size: " << size << "\n");
LLVM_DEBUG(llvm::dbgs()
<< "makeTiledShape: new offset: " << offset << "\n");
@ -844,7 +851,7 @@ Value makeTiledShape(OpBuilder &builder, Location loc, Value valueToTile,
// We statically know that the partial/boundary tile condition is
// unnecessary.
LLVM_DEBUG(llvm::dbgs() << "makeTiledShape: new size: " << size << "\n");
sizes.push_back(getAsOpFoldResult(size));
sizes.push_back(size);
continue;
}
@ -854,10 +861,10 @@ Value makeTiledShape(OpBuilder &builder, Location loc, Value valueToTile,
// b. The subshape size is 1. According to the way the loops are set up,
// tensors with "0" dimensions would never be constructed.
int64_t shapeSize = shape[r];
auto sizeCst = size.getDefiningOp<arith::ConstantIndexOp>();
auto hasTileSizeOne = sizeCst && sizeCst.value() == 1;
Optional<int64_t> sizeCst = getConstantIntValue(size);
auto hasTileSizeOne = sizeCst && *sizeCst == 1;
auto dividesEvenly = sizeCst && !ShapedType::isDynamic(shapeSize) &&
((shapeSize % sizeCst.value()) == 0);
((shapeSize % *sizeCst) == 0);
if (!hasTileSizeOne && !dividesEvenly) {
LLVM_DEBUG(llvm::dbgs() << "makeTiledShape: shapeSize=" << shapeSize
<< ", size: " << size
@ -878,25 +885,25 @@ Value makeTiledShape(OpBuilder &builder, Location loc, Value valueToTile,
AffineMap plusOneMap =
AffineMap::inferFromExprList({ArrayRef<AffineExpr>{dim0 + 1}})
.front();
auto maxIndices = llvm::to_vector<8>(llvm::map_range(ubs, [&](Value ub) {
return makeComposedAffineApply(builder, loc, minusOneMap, {ub})
.getResult();
}));
Value maxIndex = applyMapToValues(builder, loc, m, maxIndices).front();
Value d = makeComposedAffineApply(builder, loc, plusOneMap, {maxIndex});
SmallVector<OpFoldResult> maxIndices =
llvm::to_vector(llvm::map_range(ubs, [&](OpFoldResult ub) {
return makeComposedFoldedAffineApply(rewriter, loc, minusOneMap,
{ub});
}));
OpFoldResult maxIndex =
makeComposedFoldedAffineApply(rewriter, loc, m, maxIndices);
OpFoldResult d =
makeComposedFoldedAffineApply(rewriter, loc, plusOneMap, {maxIndex});
// Compute min(dim - offset, size) to avoid out-of-bounds accesses.
AffineMap minMap = AffineMap::inferFromExprList(
{ArrayRef<AffineExpr>{dim1 - dim2, dim0}})
.front();
SmallVector<Value, 4> operands{size, d, offset};
fullyComposeAffineMapAndOperands(&minMap, &operands);
canonicalizeMapAndOperands(&minMap, &operands);
size = builder.create<AffineMinOp>(loc, builder.getIndexType(), minMap,
operands);
size =
makeComposedFoldedAffineMin(rewriter, loc, minMap, {size, d, offset});
}
LLVM_DEBUG(llvm::dbgs() << "makeTiledShape: new size: " << size << "\n");
sizes.push_back(getAsOpFoldResult(size));
sizes.push_back(size);
}
auto *sliceOp = TypeSwitch<ShapedType, Operation *>(shapedType)
@ -914,31 +921,31 @@ Value makeTiledShape(OpBuilder &builder, Location loc, Value valueToTile,
return sliceOp->getResult(0);
}
/// Computes one offset per entry of `tileSizes`. A loop whose tile size is
/// non-zero (see `isZero`) takes the next unused entry of `ivs` as its
/// offset; a loop with a zero tile size gets a zero offset.
SmallVector<Value> computeTileOffsets(OpBuilder &b, Location loc,
ValueRange ivs, ValueRange tileSizes) {
SmallVector<Value> offsets;
SmallVector<OpFoldResult> computeTileOffsets(OpBuilder &b, Location loc,
ArrayRef<OpFoldResult> ivs,
ArrayRef<OpFoldResult> tileSizes) {
SmallVector<OpFoldResult> offsets;
// `idx` walks all loops while `idxIvs` only advances for tiled loops, so
// `ivs` is indexed densely over the tiled loops.
for (unsigned idx = 0, idxIvs = 0, e = tileSizes.size(); idx < e; ++idx) {
LLVM_DEBUG(llvm::dbgs() << "makeTiledShapes: for loop#" << idx << "\n");
bool isTiled = !isZero(tileSizes[idx]);
offsets.push_back(
isTiled ? ivs[idxIvs++]
: b.create<arith::ConstantIndexOp>(loc, 0).getResult());
offsets.push_back(isTiled ? ivs[idxIvs++] : b.getIndexAttr(0));
LLVM_DEBUG(llvm::dbgs()
<< "computeTileOffsets: " << offsets.back() << "\n");
}
return offsets;
}
/// Computes one size per entry of `tileSizes`. For a tiled loop (non-zero
/// tile size) the tile size itself is used, otherwise the matching entry of
/// `sizeBounds`; in both cases the affine expression `d0 - 1` is applied to
/// the chosen size before it is stored in the result.
SmallVector<Value> computeTileSizes(OpBuilder &b, Location loc,
ValueRange tileSizes,
ArrayRef<Value> sizeBounds) {
SmallVector<Value> sizes;
SmallVector<OpFoldResult> computeTileSizes(OpBuilder &b, Location loc,
ArrayRef<OpFoldResult> tileSizes,
ArrayRef<OpFoldResult> sizeBounds) {
SmallVector<OpFoldResult> sizes;
for (unsigned idx = 0, e = tileSizes.size(); idx < e; ++idx) {
bool isTiled = !isZero(tileSizes[idx]);
// Before composing, we need to make range a closed interval.
Value size = isTiled ? tileSizes[idx] : sizeBounds[idx];
OpFoldResult size = isTiled ? tileSizes[idx] : sizeBounds[idx];
AffineExpr d0 = getAffineDimExpr(0, b.getContext());
// Subtracting one turns the half-open size into the closed-interval bound.
sizes.push_back(fullyComposeAndAffineApply(b, loc, d0 - 1, size));
IRRewriter rewriter(b);
sizes.push_back(makeComposedFoldedAffineApply(rewriter, loc, d0 - 1, size));
LLVM_DEBUG(llvm::dbgs() << "computeTileSizes: " << sizes.back() << "\n");
}
return sizes;
@ -981,6 +988,9 @@ SmallVector<Value> insertSlicesBack(OpBuilder &builder, Location loc,
Value materializeOpFoldResult(ImplicitLocOpBuilder &builder,
OpFoldResult opFoldResult) {
if (!opFoldResult)
return nullptr;
if (auto value = opFoldResult.dyn_cast<Value>())
return value;
auto attr = opFoldResult.get<Attribute>().cast<IntegerAttr>();
@ -993,27 +1003,27 @@ Value materializeOpFoldResult(OpBuilder &builder, Location loc,
return materializeOpFoldResult(b, opFoldResult);
}
SmallVector<Value, 4> makeTiledShapes(OpBuilder &b, Location loc,
LinalgOp linalgOp,
ArrayRef<Value> valuesToTile,
ValueRange ivs, ValueRange tileSizes,
ArrayRef<Value> sizeBounds,
bool omitPartialTileCheck) {
SmallVector<Value> makeTiledShapes(OpBuilder &b, Location loc,
LinalgOp linalgOp, ValueRange valuesToTile,
ArrayRef<OpFoldResult> ivs,
ArrayRef<OpFoldResult> tileSizes,
ArrayRef<OpFoldResult> sizeBounds,
bool omitPartialTileCheck) {
assert(ivs.size() == static_cast<size_t>(llvm::count_if(
llvm::make_range(tileSizes.begin(), tileSizes.end()),
[](Value v) { return !isZero(v); })) &&
[](OpFoldResult v) { return !isZero(v); })) &&
"expected as many ivs as non-zero sizes");
// Construct (potentially temporary) mins and maxes on which to apply maps
// that define tile subshapes.
SmallVector<Value> lbs = computeTileOffsets(b, loc, ivs, tileSizes);
SmallVector<Value> subShapeSizes =
SmallVector<OpFoldResult> lbs = computeTileOffsets(b, loc, ivs, tileSizes);
SmallVector<OpFoldResult> subShapeSizes =
computeTileSizes(b, loc, tileSizes, sizeBounds);
assert(static_cast<int64_t>(valuesToTile.size()) ==
linalgOp.getNumInputsAndOutputs() &&
"expected one value to tile for every operand");
SmallVector<Value, 4> tiledShapes;
SmallVector<Value> tiledShapes;
tiledShapes.reserve(valuesToTile.size());
for (OpOperand *opOperand : linalgOp.getInputAndOutputOperands()) {
Value shapedOp = valuesToTile[opOperand->getOperandNumber()];
@ -1040,28 +1050,30 @@ SmallVector<Value, 4> makeTiledShapes(OpBuilder &b, Location loc,
return tiledShapes;
}
/// Convenience overload taking a plain OpBuilder: wraps `b` in an IRRewriter
/// and forwards to the RewriterBase overload of `offsetIndices`.
void offsetIndices(OpBuilder &b, LinalgOp linalgOp, ArrayRef<Value> offsets) {
void offsetIndices(OpBuilder &b, LinalgOp linalgOp,
ArrayRef<OpFoldResult> offsets) {
IRRewriter rewriter(b);
offsetIndices(rewriter, linalgOp, offsets);
}
/// Adds `offsets[d]` to the result of every IndexOp of dimension `d` found in
/// the block of `linalgOp`. Does nothing when `linalgOp` has no index
/// semantics. IndexOps whose dimension is outside `offsets`, or whose offset
/// entry is null, are left untouched.
void offsetIndices(RewriterBase &b, LinalgOp linalgOp,
ArrayRef<Value> offsets) {
ArrayRef<OpFoldResult> offsets) {
if (!linalgOp.hasIndexSemantics())
return;
for (IndexOp indexOp : linalgOp.getBlock()->getOps<IndexOp>()) {
// Skip dimensions for which no offset was provided.
if (indexOp.dim() >= offsets.size() || offsets[indexOp.dim()] == nullptr)
if (indexOp.dim() >= offsets.size() || !offsets[indexOp.dim()])
continue;
// The guard restores the insertion point at the end of each iteration.
OpBuilder::InsertionGuard guard(b);
b.setInsertionPointAfter(indexOp);
AffineExpr index, offset;
bindDims(b.getContext(), index, offset);
// Build `index + offset` right after the IndexOp.
AffineApplyOp applyOp = makeComposedAffineApply(
OpFoldResult applied = makeComposedFoldedAffineApply(
b, indexOp.getLoc(), index + offset,
ValueRange{indexOp.getResult(), offsets[indexOp.dim()]});
// Redirect the IndexOp's uses to the new result, except the use made by the
// op computing that result (it must keep reading the original index).
b.replaceOpWithIf(indexOp, applyOp.getResult(), [&](OpOperand &use) {
return use.getOwner() != applyOp;
{getAsOpFoldResult(indexOp.getResult()), offsets[indexOp.dim()]});
Value materialized = materializeOpFoldResult(b, indexOp.getLoc(), applied);
b.replaceOpWithIf(indexOp, materialized, [&](OpOperand &use) {
return use.getOwner() != materialized.getDefiningOp();
});
}
}

View File

@ -52,6 +52,8 @@ SmallVector<int64_t, 4> extractFromI64ArrayAttr(Attribute attr) {
/// Given a value, try to extract a constant Attribute. If this fails, return
/// the original value.
OpFoldResult getAsOpFoldResult(Value val) {
if (!val)
return OpFoldResult();
Attribute attr;
if (matchPattern(val, m_Constant(&attr)))
return attr;

View File

@ -65,7 +65,7 @@ func.func @remove_dim_result_uses
return %3, %4 : index, index
}
// CHECK: #[[MAP0:.+]] = affine_map<()[s0, s1] -> (s0 + s1)>
// CHECK: #[[MAP1:.+]] = affine_map<()[s0, s1] -> (s1 - s0)>
// CHECK: #[[MAP1:.+]] = affine_map<()[s0, s1] -> (-s0 + s1)>
// CHECK: func @remove_dim_result_uses
// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]+]]: tensor<?x?xf32>

View File

@ -196,7 +196,7 @@ func.func @conv_tensors_dynamic(%input: tensor<?x?x?x?xf32>, %filter: tensor<?x?
// CHECK: #[[BOUND8_MAP_2:.+]] = affine_map<(d0)[s0, s1] -> (-d0 + s1, -d0 + s0, 8)>
// CHECK: #[[BOUND16_MAP:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 16)>
// CHECK: #[[X2_MAP:.+]] = affine_map<(d0) -> (d0 * 2)>
// CHECK: #[[INPUT_BOUND:.+]] = affine_map<(d0, d1)[s0, s1] -> (d1 * -2 + s0 + s1 * 2 - 2, d0 * 2 + s0 - 2)>
// CHECK: #[[INPUT_BOUND:.+]] = affine_map<(d0, d1)[s0, s1] -> (d0 * -2 + s0 * 2 + s1 - 2, d1 * 2 + s1 - 2)>
// CHECK: #[[BOUND16_MAP_2:.+]] = affine_map<(d0)[s0, s1] -> (-d0 + s1, -d0 + s0, 16)>
// CHECK: #[[BOUND4_MAP:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 4)>
// CHECK: #[[BOUND2_MAP:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 2)>
@ -234,13 +234,13 @@ func.func @conv_tensors_dynamic(%input: tensor<?x?x?x?xf32>, %filter: tensor<?x?
// CHECK-NEXT: scf.for %[[IV1:.+]] = %{{.+}} to %[[ELEM_OH]]
// CHECK-NEXT: %[[SIZE_ELEM_OH:.+]] = affine.min #[[BOUND16_MAP]](%[[IV1]])[%[[ELEM_OH]]]
// CHECK-NEXT: %[[OFFSET_OH:.+]] = affine.apply #[[X2_MAP]](%[[IV1]])
// CHECK-NEXT: %[[SIZE_INPUT_H:.+]] = affine.min #[[INPUT_BOUND]](%[[SIZE_ELEM_OH]], %[[IV1]])[%[[FILTER_H]], %[[FILL_H]]]
// CHECK-NEXT: %[[SIZE_INPUT_H:.+]] = affine.min #[[INPUT_BOUND]](%[[IV1]], %[[SIZE_ELEM_OH]])[%[[FILL_H]], %[[FILTER_H]]]
// CHECK-NEXT: %[[SIZE_ELEM_OH_2:.+]] = affine.min #[[BOUND16_MAP_2]](%[[IV1]])[%[[FILL_H]], %[[ELEM_OH]]]
// CHECK-NEXT: scf.for %[[IV2:.+]] = %{{.+}} to %[[ELEM_OW]]
// CHECK-NEXT: %[[SIZE_ELEM_OW:.+]] = affine.min #[[BOUND4_MAP]](%[[IV2]])[%[[ELEM_OW]]]
// CHECK-NEXT: %[[SIZE_ELEM_OC:.+]] = affine.min #[[BOUND2_MAP]](%[[IV2]])[%[[ELEM_OC]]]
// CHECK-NEXT: %[[OFFSET_OW:.+]] = affine.apply #[[X2_MAP]](%[[IV2]])
// CHECK-NEXT: %[[SIZE_INPUT_W:.+]] = affine.min #[[INPUT_BOUND]](%[[SIZE_ELEM_OW]], %[[IV2]])[%[[FILTER_W]], %[[FILL_W]]]
// CHECK-NEXT: %[[SIZE_INPUT_W:.+]] = affine.min #[[INPUT_BOUND]](%[[IV2]], %[[SIZE_ELEM_OW]])[%[[FILL_W]], %[[FILTER_W]]]
// CHECK-NEXT: %[[ST_INPUT:.+]] = tensor.extract_slice %[[INPUT]][%[[IV0]], %[[OFFSET_OH]], %[[OFFSET_OW]], 0]
// CHECK-SAME: [%[[SIZE_INPUT_N]], %[[SIZE_INPUT_H]], %[[SIZE_INPUT_W]], %[[INPUT_C]]]
// CHECK-NEXT: %[[SIZE_ELEM_OW_2:.+]] = affine.min #[[BOUND4_MAP_2]](%[[IV2]])[%[[FILL_W]], %[[ELEM_OW]]]

View File

@ -1,7 +1,7 @@
// RUN: mlir-opt %s -linalg-tile="tile-sizes=2,3" | FileCheck %s
// CHECK-DAG: #[[MAP0:.*]] = affine_map<(d0)[s0, s1] -> (-d0 + s0 + s1 - 1, s0 + 1)>
// CHECK-DAG: #[[MAP1:.*]] = affine_map<(d0)[s0, s1] -> (-d0 + s0 + s1 - 1, s0 + 2)>
// CHECK-DAG: #[[MAP0:.*]] = affine_map<(d0)[s0, s1] -> (-d0 + s0 + s1 - 1, s1 + 1)>
// CHECK-DAG: #[[MAP1:.*]] = affine_map<(d0)[s0, s1] -> (-d0 + s0 + s1 - 1, s1 + 2)>
// CHECK-DAG: #[[MAP2:.*]] = affine_map<(d0)[s0] -> (-d0 + s0, 2)>
// CHECK-DAG: #[[MAP3:.*]] = affine_map<(d0)[s0] -> (-d0 + s0, 3)>
@ -24,8 +24,8 @@ func.func @conv(%arg0 : memref<?x?xf32>, %arg1 : memref<?x?xf32>, %arg2 : memref
// CHECK-DAG: %[[T3:.*]] = memref.dim %[[ARG2]], %[[C1]]
// CHECK: scf.for %[[ARG3:.*]] = %[[C0]] to %[[T2]] step %[[C2]]
// CHECK: scf.for %[[ARG4:.*]] = %[[C0]] to %[[T3]] step %[[C3]]
// CHECK: %[[T4:.*]] = affine.min #[[MAP0]](%[[ARG3]])[%[[T0]], %[[T2]]]
// CHECK: %[[T5:.*]] = affine.min #[[MAP1]](%[[ARG4]])[%[[T1]], %[[T3]]]
// CHECK: %[[T4:.*]] = affine.min #[[MAP0]](%[[ARG3]])[%[[T2]], %[[T0]]]
// CHECK: %[[T5:.*]] = affine.min #[[MAP1]](%[[ARG4]])[%[[T3]], %[[T1]]]
// CHECK: %[[SV1:.*]] = memref.subview %[[ARG0]][%[[ARG3]], %[[ARG4]]] [%[[T4]], %[[T5]]]
// CHECK: %[[T6:.*]] = affine.min #[[MAP2]](%[[ARG3]])[%[[T2]]
// CHECK: %[[T7:.*]] = affine.min #[[MAP3]](%[[ARG4]])[%[[T3]]]

View File

@ -61,15 +61,14 @@ func.func @interchange_reduction(%input: tensor<12x7x25xf32>) -> tensor<12x25xf3
%five = arith.constant 5.0 : f32
%init = linalg.init_tensor [12, 25] : tensor<12x25xf32>
// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index
// CHECK-DAG: %[[C5:.+]] = arith.constant 5 : index
// CHECK-DAG: %[[C7:.+]] = arith.constant 7 : index
// CHECK: %[[INIT:.+]] = linalg.init_tensor [12, 25]
// CHECK-DAG: %[[C5:.+]] = arith.constant 5 : index
// CHECK-DAG: %[[C7:.+]] = arith.constant 7 : index
// CHECK: scf.for %[[IV0:.+]] = %{{.+}} to %{{.+}} step %[[C5]] iter_args(%[[FOR_ARG0:.+]] = %[[INIT]])
// CHECK: scf.for %[[IV1:.+]] = %{{.+}} to %{{.+}} step %[[C7]] iter_args(%[[FOR_ARG1:.+]] = %[[FOR_ARG0]])
// CHECK: %[[OUT_SLICE0:.+]] = tensor.extract_slice %[[FOR_ARG1]][%[[IV0]], %[[IV1]]]
// CHECK: %[[FILL:.+]] = linalg.fill {{.+}} outs(%[[OUT_SLICE0]] : tensor<?x?xf32>)
// CHECK: %[[C4:.+]] = arith.constant 4 : index
// CHECK: scf.for %[[IV2:.+]] = %{{.+}} to %{{.+}} step %[[C4]] iter_args(%[[FOR_ARG2:.+]] = %[[FILL]])
// CHECK: %[[IN_SLICE:.+]] = tensor.extract_slice %[[INPUT]]
// CHECK: %[[OUT_SLICE2:.+]] = tensor.extract_slice %[[FOR_ARG2]][0, 0]