Implement parametric tiling on standard for loops
Parametric tiling can be used to extract outer loops with a fixed number of iterations. This in turn enables mapping to GPU kernels on a fixed grid independently of the range of the original loops, which may be unknown statically, making the kernel adaptable to different sizes. Provide a utility function that also computes the parametric tile size given the range of the loop. Exercise the utility function through a simple pass that applies it to all top-level loop nests. Permutability or parallelism checks must be performed before calling this utility function in actual passes.

Note that parametric tiling cannot be implemented in a purely affine way, although it can be encoded using semi-affine maps. The choice to implement it on standard loops is guided by them being the common representation between Affine loops, Linalg and GPU kernels.

PiperOrigin-RevId: 257180251
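For intuition, here is a hedged sketch in plain C++ of the loop structure this transformation produces; it is illustrative only, not the MLIR implementation, and the function and parameter names are invented. The parametric tile size is the ceiling division of the original trip count by the requested fixed number of outer iterations; the tile loop keeps the original bounds with a scaled step, and the point loop runs from the tile start to a min-clamped upper bound.

#include <algorithm>
#include <cstdint>

// Illustrative sketch only (hypothetical names): extract an outer loop with a
// fixed number of iterations `outerIters` from a loop over [lb, ub) with a
// positive step `step`.
void runWithFixedOuterLoop(int64_t lb, int64_t ub, int64_t step,
                           int64_t outerIters, void (*body)(int64_t)) {
  // Original trip count: ceildiv(ub - lb, step).
  int64_t numIterations = (ub - lb + step - 1) / step;
  // Parametric tile size: ceildiv(numIterations, outerIters).
  int64_t tileSize = (numIterations + outerIters - 1) / outerIters;
  // Tile loop: original bounds, step scaled by the tile size.
  for (int64_t i = lb; i < ub; i += step * tileSize) {
    // Point loop: original step, upper bound clamped to min(i + scaled step, ub).
    int64_t tileUb = std::min(i + step * tileSize, ub);
    for (int64_t ii = i; ii < tileUb; ii += step)
      body(ii);
  }
}

With lb = 2, ub = 44, step = 1 and outerIters = 7, this yields numIterations = 42 and tileSize = 6, the same values the new test at the end of this commit checks in IR form.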
@@ -798,7 +798,9 @@ public:
   Dialect *getDialect() { return getOperation()->getDialect(); }
 
   /// Return the Region enclosing this Op.
-  Region *getContainingRegion() { return getOperation()->getParentRegion(); }
+  Region *getContainingRegion() {
+    return getOperation()->getContainingRegion();
+  }
 
   /// Return true if this "op class" can match against the specified operation.
   /// This hook can be overridden with a more specific implementation in
@@ -105,16 +105,21 @@ void registerPass(StringRef arg, StringRef description, const PassID *passID,
                   const PassAllocatorFunction &function);
 
 /// PassRegistration provides a global initializer that registers a Pass
-/// allocation routine for a concrete pass instance.
+/// allocation routine for a concrete pass instance. The third argument is
+/// optional and provides a callback to construct a pass that does not have
+/// a default constructor.
 ///
 /// Usage:
 ///
 ///   // At namespace scope.
 ///   static PassRegistration<MyPass> Unused("unused", "Unused pass");
 template <typename ConcretePass> struct PassRegistration {
-  PassRegistration(StringRef arg, StringRef description) {
-    registerPass(arg, description, PassID::getID<ConcretePass>(),
-                 [] { return new ConcretePass(); });
+  PassRegistration(
+      StringRef arg, StringRef description,
+      const PassAllocatorFunction &constructor = [] {
+        return new ConcretePass();
+      }) {
+    registerPass(arg, description, PassID::getID<ConcretePass>(), constructor);
   }
 };
 
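As a sketch of the new optional third argument (the pass name below is hypothetical; the pattern mirrors the registration added later in this commit), a pass without a default constructor can now be registered by supplying an explicit allocator:

// Assumes MyParamPass is a pass whose constructor takes an argument, so the
// default `[] { return new ConcretePass(); }` allocator cannot be used.
static PassRegistration<MyParamPass>
    reg("my-param-pass", "example registration with an explicit allocator",
        [] { return new MyParamPass(/*parameter=*/42); });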
@@ -697,6 +697,8 @@ def ForOp : Std_Op<"for"> {
     OpBuilder getBodyBuilder() {
       return OpBuilder(body(), std::prev(body()->end()));
     }
+    void setLowerBound(Value *bound) { getOperation()->setOperand(0, bound); }
+    void setUpperBound(Value *bound) { getOperation()->setOperand(1, bound); }
   }];
 }
 
@@ -30,6 +30,7 @@
 namespace mlir {
 class AffineMap;
 class AffineForOp;
+class ForOp;
 class FuncOp;
 using Function = FuncOp;
 class OpBuilder;
@@ -140,6 +141,17 @@ SmallVector<SmallVector<AffineForOp, 8>, 8> tile(ArrayRef<AffineForOp> forOps,
 SmallVector<AffineForOp, 8> tile(ArrayRef<AffineForOp> forOps,
                                  ArrayRef<uint64_t> sizes, AffineForOp target);
 
+/// Tile a nest of standard for loops rooted at `rootForOp` with the given
+/// (parametric) sizes. Sizes are expected to be strictly positive values at
+/// runtime. If more sizes than loops are provided, discard the trailing values
+/// in `sizes`. Assumes the loop nest is permutable.
+void tile(ForOp rootForOp, ArrayRef<Value *> sizes);
+
+/// Tile a nest of standard for loops rooted at `rootForOp` by finding
+/// parametric tile sizes such that the outer loops have a fixed number of
+/// iterations as defined in `sizes`.
+void extractFixedOuterLoops(ForOp rootForOp, ArrayRef<int64_t> sizes);
+
 } // end namespace mlir
 
 #endif // MLIR_TRANSFORMS_LOOP_UTILS_H
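A possible caller-side sketch of the new standard-loop entry point (the function and variable names here are illustrative, not part of the commit); note that permutability checks remain the caller's responsibility:

// Request a fixed 128x64 grid of outer iterations for the two outermost loops
// of a perfectly nested, permutable standard loop nest rooted at `rootForOp`.
void mapNestToFixedGrid(mlir::ForOp rootForOp) {
  llvm::SmallVector<int64_t, 2> gridSizes{128, 64};
  mlir::extractFixedOuterLoops(rootForOp, gridSizes);
}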
@@ -99,6 +99,12 @@ FunctionPassBase *createLowerAffinePass();
 /// Creates a pass to perform tiling on loop nests.
 FunctionPassBase *createLoopTilingPass(uint64_t cacheSizeBytes);
 
+/// Creates a pass that performs parametric tiling so that the outermost loops
+/// have the given fixed number of iterations. Assumes outermost loop nests
+/// are permutable.
+FunctionPassBase *
+createSimpleParametricTilingPass(ArrayRef<int64_t> outerLoopSizes);
+
 /// Promotes all accessed memref regions to the specified faster memory space
 /// while generating DMAs to move data.
 FunctionPassBase *createDmaGenerationPass(
@@ -0,0 +1,73 @@
+//===- LoopParametricTiling.cpp --- Parametric loop tiling pass -----------===//
+//
+// Copyright 2019 The MLIR Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+//
+// This file implements a pass to parametrically tile nests of standard loops.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Pass/Pass.h"
+#include "mlir/StandardOps/Ops.h"
+#include "mlir/Transforms/LoopUtils.h"
+#include "mlir/Transforms/Passes.h"
+
+#include "mlir/IR/Builders.h"
+
+using namespace mlir;
+
+static llvm::cl::list<int> clOuterLoopSizes(
+    "outer-loop-sizes", llvm::cl::MiscFlags::CommaSeparated,
+    llvm::cl::desc(
+        "fixed number of iterations that the outer loops should have"));
+
+namespace {
+// Extracts fixed-range loops for top-level loop nests with ranges defined in
+// the pass constructor. Assumes loops are permutable.
+class SimpleParametricLoopTilingPass
+    : public FunctionPass<SimpleParametricLoopTilingPass> {
+public:
+  explicit SimpleParametricLoopTilingPass(ArrayRef<int64_t> outerLoopSizes)
+      : sizes(outerLoopSizes.begin(), outerLoopSizes.end()) {}
+
+  void runOnFunction() override {
+    Function func = getFunction();
+
+    func.walk<ForOp>([this](ForOp op) {
+      // Ignore nested loops.
+      if (op.getContainingRegion()->getParentOfType<ForOp>())
+        return;
+      extractFixedOuterLoops(op, sizes);
+    });
+  }
+
+  SmallVector<int64_t, 4> sizes;
+};
+} // end namespace
+
+FunctionPassBase *
+mlir::createSimpleParametricTilingPass(ArrayRef<int64_t> outerLoopSizes) {
+  return new SimpleParametricLoopTilingPass(outerLoopSizes);
+}
+
+static PassRegistration<SimpleParametricLoopTilingPass>
+    reg("extract-fixed-outer-loops",
+        "apply parametric tiling to the outer loops so that the ranges of "
+        "outer loops become static",
+        [] {
+          auto *pass = new SimpleParametricLoopTilingPass({});
+          pass->sizes.assign(clOuterLoopSizes.begin(), clOuterLoopSizes.end());
+          return pass;
+        });
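With this registration in place, the transformation can be exercised directly from mlir-opt, e.g. `mlir-opt -extract-fixed-outer-loops -outer-loop-sizes=7,4`, which is exactly how the new test at the end of this commit drives it.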
@@ -351,20 +351,36 @@ LogicalResult mlir::instBodySkew(AffineForOp forOp, ArrayRef<uint64_t> shifts,
   return success();
 }
 
+// Collect perfectly nested loops starting from `rootForOp`. Loops are
+// perfectly nested if each loop is the first and only non-terminator operation
+// in the parent loop. Collect at most `maxLoops` loops and append them to
+// `forOps`.
+template <typename T>
+void getPerfectlyNestedLoopsImpl(
+    SmallVectorImpl<T> &forOps, T rootForOp,
+    unsigned maxLoops = std::numeric_limits<unsigned>::max()) {
+  for (unsigned i = 0; i < maxLoops; ++i) {
+    forOps.push_back(rootForOp);
+    // FIXME: ForOp and AffineForOp currently provide different names to access
+    // the region ("region" and "getRegion"). Remove this generic access when
+    // AffineForOp moves to ODS and also gets "region".
+    Block &body = rootForOp.getOperation()->getRegion(0).front();
+    if (body.begin() != std::prev(body.end(), 2))
+      return;
+
+    rootForOp = dyn_cast<T>(&body.front());
+    if (!rootForOp)
+      return;
+  }
+}
+
 /// Get perfectly nested sequence of loops starting at root of loop nest
 /// (the first op being another AffineFor, and the second op - a terminator).
 /// A loop is perfectly nested iff: the first op in the loop's body is another
 /// AffineForOp, and the second op is a terminator).
 void mlir::getPerfectlyNestedLoops(SmallVectorImpl<AffineForOp> &nestedLoops,
                                    AffineForOp root) {
-  AffineForOp curr = root;
-  nestedLoops.push_back(curr);
-  auto *currBody = curr.getBody();
-  while (currBody->begin() == std::prev(currBody->end(), 2) &&
-         (curr = dyn_cast<AffineForOp>(curr.getBody()->front()))) {
-    nestedLoops.push_back(curr);
-    currBody = curr.getBody();
-  }
+  getPerfectlyNestedLoopsImpl(nestedLoops, root);
 }
 
 /// Unrolls this loop completely.
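For example, a nest whose outer loop body contains only the inner loop followed by the terminator yields both loops; if any other operation appears alongside the inner loop, collection stops at the outer loop.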
@@ -762,3 +778,144 @@ SmallVector<AffineForOp, 8> mlir::tile(ArrayRef<AffineForOp> forOps,
                                        AffineForOp target) {
   return tile(forOps, sizes, ArrayRef<AffineForOp>{target})[0];
 }
+
+// Tile the given nest of standard for loops with the given (parametric) sizes.
+// Sizes are expected to be strictly positive values at runtime. If more sizes
+// than loops are provided, discard the trailing values in `sizes`. When
+// applied to a loop nest
+//   for %i_0 = %lb_0 to %ub_0 step %s_0 {
+//     for %i_1 = %lb_1 to %ub_1 step %s_1 {
+//       "op"(%i_0, %i_1) : (index, index) -> () }}
+// this splits the loops into tile loops with step %s_j * sizes[j] and the
+// original bounds, and the point loops iterating from %i_j to
+// min(%i_j + %s_j * sizes[j], %ub_j) with the original step. No verification
+// of `forOps` being suitable for tiling is performed; this function only
+// applies the transformation.
+static void tile(MutableArrayRef<ForOp> forOps, ArrayRef<Value *> sizes) {
+  assert(sizes.size() >= forOps.size() && "insufficient number of tile sizes");
+  if (sizes.empty() || forOps.empty())
+    return;
+
+  ForOp rootForOp = forOps.front();
+  OpBuilder builder(rootForOp);
+
+  // Compute new steps for the outer loops.
+  SmallVector<Value *, 4> newSteps;
+  newSteps.reserve(sizes.size());
+  for (unsigned i = 0, e = sizes.size(); i < e; ++i) {
+    auto op = forOps[i];
+    Value *newStep = builder.create<MulIOp>(op.getLoc(), op.step(), sizes[i]);
+    newSteps.push_back(newStep);
+  }
+
+  // Create new outer loops nested one into another.
+  SmallVector<ForOp, 4> outerForOps;
+  for (unsigned i = 0, e = sizes.size(); i < e; ++i) {
+    auto outerForOp =
+        builder.create<ForOp>(forOps[i].getLoc(), forOps[i].lowerBound(),
+                              forOps[i].upperBound(), newSteps[i]);
+
+    builder.setInsertionPointToStart(outerForOp.body());
+
+    // FIXME: builder should do this for us.
+    ensureStdTerminator(outerForOp.getOperation()->getRegion(0), builder,
+                        forOps[i].getLoc());
+    outerForOp.body()->addArgument(builder.getIndexType());
+    builder.setInsertionPointToStart(outerForOp.body());
+
+    outerForOps.push_back(outerForOp);
+  }
+
+  // Move the outermost original loop into the innermost new outer loop. Thus
+  // the body of the original loops does not need updating.
+  auto lastOuterForOp = outerForOps.back();
+  lastOuterForOp.body()->getOperations().splice(
+      lastOuterForOp.body()->getOperations().begin(),
+      rootForOp.getOperation()->getBlock()->getOperations(),
+      rootForOp.getOperation());
+
+  // Immediately before the (now sunk) outermost original loop, insert the
+  // computation of the upper bounds of the inner loops. Update the bounds of
+  // the original loops to make them point loops.
+  builder.setInsertionPointToStart(lastOuterForOp.body());
+  for (unsigned i = 0, e = sizes.size(); i < e; ++i) {
+    Value *stepped = builder.create<AddIOp>(
+        forOps[i].getLoc(), outerForOps[i].getInductionVar(), newSteps[i]);
+    Value *less = builder.create<CmpIOp>(forOps[i].getLoc(), CmpIPredicate::SLT,
+                                         forOps[i].upperBound(), stepped);
+    Value *upperBound = builder.create<SelectOp>(
+        forOps[i].getLoc(), less, forOps[i].upperBound(), stepped);
+    forOps[i].setLowerBound(outerForOps[i].getInductionVar());
+    forOps[i].setUpperBound(upperBound);
+  }
+}
+
+void mlir::tile(ForOp rootForOp, ArrayRef<Value *> sizes) {
+  // Collect perfectly nested loops. If more size values are provided than
+  // nested loops available, truncate `sizes`.
+  SmallVector<ForOp, 4> forOps;
+  forOps.reserve(sizes.size());
+  getPerfectlyNestedLoopsImpl(forOps, rootForOp, sizes.size());
+  if (forOps.size() < sizes.size())
+    sizes = sizes.take_front(forOps.size());
+
+  return ::tile(forOps, sizes);
+}
+
+// Build the IR that performs ceil division of a positive value by a constant:
+//    ceildiv(a, B) = divis(a + (B - 1), B)
+// where divis is rounding-to-zero division.
+static Value *ceilDivPositive(OpBuilder &builder, Location loc, Value *dividend,
+                              int64_t divisor) {
+  assert(divisor > 0 && "expected positive divisor");
+  assert(dividend->getType().isIndex() && "expected index-typed value");
+
+  Value *divisorMinusOneCst = builder.create<ConstantIndexOp>(loc, divisor - 1);
+  Value *divisorCst = builder.create<ConstantIndexOp>(loc, divisor);
+  Value *sum = builder.create<AddIOp>(loc, dividend, divisorMinusOneCst);
+  return builder.create<DivISOp>(loc, sum, divisorCst);
+}
+
+static Value *ceilDivPositive(OpBuilder &builder, Location loc, Value *dividend,
+                              Value *divisor) {
+  assert(dividend->getType().isIndex() && "expected index-typed value");
+
+  Value *cstOne = builder.create<ConstantIndexOp>(loc, 1);
+  Value *divisorMinusOne = builder.create<SubIOp>(loc, divisor, cstOne);
+  Value *sum = builder.create<AddIOp>(loc, dividend, divisorMinusOne);
+  return builder.create<DivISOp>(loc, sum, divisor);
+}
+
+void mlir::extractFixedOuterLoops(ForOp rootForOp, ArrayRef<int64_t> sizes) {
+  // Collect perfectly nested loops. If more size values are provided than
+  // nested loops available, truncate `sizes`.
+  SmallVector<ForOp, 4> forOps;
+  forOps.reserve(sizes.size());
+  getPerfectlyNestedLoopsImpl(forOps, rootForOp, sizes.size());
+  if (forOps.size() < sizes.size())
+    sizes = sizes.take_front(forOps.size());
+
+  OpBuilder builder(rootForOp);
+  auto loc = rootForOp.getLoc();
+
+  // Compute the tile sizes such that the i-th outer loop executes size[i]
+  // iterations. Given that the loop currently executes
+  //   numIterations = ceildiv((upperBound - lowerBound), step)
+  // iterations, we need to tile with size ceildiv(numIterations, size[i]).
+  SmallVector<Value *, 4> tileSizes;
+  tileSizes.reserve(sizes.size());
+  for (unsigned i = 0, e = sizes.size(); i < e; ++i) {
+    assert(sizes[i] > 0 && "expected strictly positive size for strip-mining");
+
+    auto forOp = forOps[i];
+    Value *diff =
+        builder.create<SubIOp>(loc, forOp.upperBound(), forOp.lowerBound());
+    Value *numIterations = ceilDivPositive(builder, loc, diff, forOp.step());
+    Value *iterationsPerBlock =
+        ceilDivPositive(builder, loc, numIterations, sizes[i]);
+    tileSizes.push_back(iterationsPerBlock);
+  }
+
+  // Call parametric tiling with the given sizes.
+  return ::tile(forOps, tileSizes);
+}
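As a concrete instance of the emitted arithmetic, using the constants from the test below: for the loop from %c2 to %c44 with step %c1 and a requested outer trip count of 7, the range is divis(42 + (1 - 1), 1) = 42, the parametric tile size is divis(42 + 6, 7) = 6, and the new outer step becomes muli(1, 6) = 6.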
@@ -0,0 +1,66 @@
+// RUN: mlir-opt -extract-fixed-outer-loops -outer-loop-sizes=7 %s | FileCheck %s --check-prefixes=COMMON,TILE_7
+// RUN: mlir-opt -extract-fixed-outer-loops -outer-loop-sizes=7,4 %s | FileCheck %s --check-prefixes=COMMON,TILE_74
+
+// COMMON-LABEL: @foo
+func @foo(%arg0: memref<?x?xf32>) {
+  %c2 = constant 2 : index
+  %c44 = constant 44 : index
+  %c1 = constant 1 : index
+  // Range of the original loop:
+  //   (upper - lower + step - 1) / step
+  // where step is known to be %c1.
+  // COMMON:      %[[diff:.*]] = subi %c44, %c2
+  // COMMON:      %[[adjustment:.*]] = subi %c1, %c1_{{.*}}
+  // COMMON-NEXT: %[[diff_adj:.*]] = addi %[[diff]], %[[adjustment]]
+  // COMMON-NEXT: %[[range:.*]] = divis %[[diff_adj]], %c1
+
+  // Ceildiv to get the parametric tile size.
+  // COMMON:      %[[sum:.*]] = addi %[[range]], %c6
+  // COMMON-NEXT: %[[size:.*]] = divis %[[sum]], %c7
+
+  // Range of the second original loop
+  //   (upper - lower + step - 1) / step
+  // where step is known to be %c2.
+  // TILE_74:      %[[diff2:.*]] = subi %c44, %c1
+  // TILE_74:      %[[adjustment2:.*]] = subi %c2, %c1_{{.*}}
+  // TILE_74-NEXT: %[[diff2_adj:.*]] = addi %[[diff2]], %[[adjustment2]]
+  // TILE_74-NEXT: %[[range2:.*]] = divis %[[diff2_adj]], %c2
+
+  // Ceildiv to get the parametric tile size for the second original loop.
+  // TILE_74:      %[[sum2:.*]] = addi %[[range2]], %c3
+  // TILE_74-NEXT: %[[size2:.*]] = divis %[[sum2]], %c4
+
+  // New step(s) (original is %c1 and %c2).
+  // COMMON:  %[[step:.*]] = muli %c1, %[[size]]
+  // TILE_74: %[[step2:.*]] = muli %c2, %[[size2]]
+
+  // Updated outer loop(s) use new steps.
+  // COMMON:  for %[[i:.*]] = %c2 to %c44 step %[[step]]
+  // TILE_74: for %[[j:.*]] = %c1 to %c44 step %[[step2]]
+  for %i = %c2 to %c44 step %c1 {
+    // Upper bound for the inner loop min(%i + %step, %c44).
+    // COMMON:      %[[stepped:.*]] = addi %[[i]], %[[step]]
+    // COMMON-NEXT: cmpi "slt", %c44, %[[stepped]]
+    // COMMON-NEXT: %[[ub:.*]] = select {{.*}}, %c44, %[[stepped]]
+    //
+    // TILE_74:      %[[stepped2:.*]] = addi %[[j]], %[[step2]]
+    // TILE_74-NEXT: cmpi "slt", %c44, %[[stepped2]]
+    // TILE_74-NEXT: %[[ub2:.*]] = select {{.*}}, %c44, %[[stepped2]]
+
+    // Created inner loop.
+    // COMMON: for %[[ii:.*]] = %[[i]] to %[[ub:.*]] step %c1
+
+    // This loop is not modified in the TILE_7 case.
+    // TILE_7: for %[[j:.*]] = %c1 to %c44 step %c2
+    //
+    // But it is modified in the TILE_74 case.
+    // TILE_74: for %[[jj:.*]] = %[[j]] to %[[ub2]] step %c2
+    for %j = %c1 to %c44 step %c2 {
+      // The right iterators are used.
+      // TILE_7:  load %arg0[%[[ii]], %[[j]]]
+      // TILE_74: load %arg0[%[[ii]], %[[jj]]]
+      load %arg0[%i, %j]: memref<?x?xf32>
+    }
+  }
+  return
+}