Implement parametric tiling on standard for loops

Parametric tiling can be used to extract outer loops with a fixed number of
iterations.  This in turn enables mapping to GPU kernels on a fixed grid
independently of the range of the original loops, which may be unknown
statically, making the kernel adaptable to different sizes.  Provide a utility
function that also computes the parametric tile size given the range of the
loop.  Exercise the utility function through a simple pass that applies it to
all top-level loop nests.  Permutability or parallelism checks must be
performed before calling this utility function in actual passes.

Note that parametric tiling cannot be implemented in a purely affine way,
although it can be encoded using semi-affine maps.  The choice to implement it
on standard loops is guided by them being the common representation between
Affine loops, Linalg and GPU kernels.
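
A rough sketch of the intended transformation in C-style pseudo-loops
(illustrative only; `body`, `lb`, `ub`, `step` and `N` stand in for the actual
IR values):

  #include <algorithm>
  #include <cstdint>

  void body(int64_t i); // stand-in for the loop body

  // Original loop; its trip count depends on runtime values.
  void original(int64_t lb, int64_t ub, int64_t step) {
    for (int64_t i = lb; i < ub; i += step)
      body(i);
  }

  // After extracting a fixed outer loop of (at most) N iterations: the
  // parametric tile size is computed from the range, then the loop is
  // strip-mined by it.
  void withFixedOuterLoop(int64_t lb, int64_t ub, int64_t step, int64_t N) {
    int64_t range = (ub - lb + step - 1) / step; // ceildiv(ub - lb, step)
    int64_t tileSize = (range + N - 1) / N;      // ceildiv(range, N)
    for (int64_t io = lb; io < ub; io += step * tileSize)
      for (int64_t i = io; i < std::min(io + step * tileSize, ub); i += step)
        body(i);
  }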

PiperOrigin-RevId: 257180251
Authored by Alex Zinenko on 2019-07-09 06:37:17 -07:00; committed by A. Unique TensorFlower
parent 80e2871087
commit 9d03f5674f
8 changed files with 336 additions and 13 deletions

View File

@@ -798,7 +798,9 @@ public:
Dialect *getDialect() { return getOperation()->getDialect(); }
/// Return the Region enclosing this Op.
Region *getContainingRegion() { return getOperation()->getParentRegion(); }
Region *getContainingRegion() {
return getOperation()->getContainingRegion();
}
/// Return true if this "op class" can match against the specified operation.
/// This hook can be overridden with a more specific implementation in

View File

@@ -105,16 +105,21 @@ void registerPass(StringRef arg, StringRef description, const PassID *passID,
const PassAllocatorFunction &function);
/// PassRegistration provides a global initializer that registers a Pass
/// allocation routine for a concrete pass instance.
/// allocation routine for a concrete pass instance. The third argument is
/// optional and provides a callback to construct a pass that does not have
/// a default constructor.
///
/// Usage:
///
/// // At namespace scope.
/// static PassRegistration<MyPass> Unused("unused", "Unused pass");
template <typename ConcretePass> struct PassRegistration {
PassRegistration(StringRef arg, StringRef description) {
registerPass(arg, description, PassID::getID<ConcretePass>(),
[] { return new ConcretePass(); });
PassRegistration(
StringRef arg, StringRef description,
const PassAllocatorFunction &constructor = [] {
return new ConcretePass();
}) {
registerPass(arg, description, PassID::getID<ConcretePass>(), constructor);
}
};
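
For illustration, a registration that uses the new optional constructor
argument could look as follows (hypothetical `MyTilingPass`; the same pattern
is used by the pass added later in this commit):

  #include "mlir/Pass/Pass.h"
  using namespace mlir;

  // Hypothetical pass without a default constructor.
  struct MyTilingPass : public FunctionPass<MyTilingPass> {
    explicit MyTilingPass(ArrayRef<int64_t> outerSizes)
        : sizes(outerSizes.begin(), outerSizes.end()) {}
    void runOnFunction() override { /* ... */ }
    SmallVector<int64_t, 4> sizes;
  };

  // At namespace scope: the third argument supplies the allocator callback.
  static PassRegistration<MyTilingPass>
      reg("my-tiling", "example registration with a custom constructor",
          [] { return new MyTilingPass({32, 32}); });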

View File

@@ -697,6 +697,8 @@ def ForOp : Std_Op<"for"> {
OpBuilder getBodyBuilder() {
return OpBuilder(body(), std::prev(body()->end()));
}
void setLowerBound(Value *bound) { getOperation()->setOperand(0, bound); }
void setUpperBound(Value *bound) { getOperation()->setOperand(1, bound); }
}];
}

View File

@@ -30,6 +30,7 @@
namespace mlir {
class AffineMap;
class AffineForOp;
class ForOp;
class FuncOp;
using Function = FuncOp;
class OpBuilder;
@@ -140,6 +141,17 @@ SmallVector<SmallVector<AffineForOp, 8>, 8> tile(ArrayRef<AffineForOp> forOps,
SmallVector<AffineForOp, 8> tile(ArrayRef<AffineForOp> forOps,
ArrayRef<uint64_t> sizes, AffineForOp target);
/// Tile a nest of standard for loops rooted at `rootForOp` with the given
/// (parametric) sizes. Sizes are expected to be strictly positive values at
/// runtime. If more sizes than loops are provided, the trailing values in
/// `sizes` are discarded. Assumes the loop nest is permutable.
void tile(ForOp rootForOp, ArrayRef<Value *> sizes);
/// Tile a nest of standard for loops rooted at `rootForOp` by finding
/// parametric tile sizes such that the outer loops have a fixed number of
/// iterations as defined in `sizes`.
void extractFixedOuterLoops(ForOp rootForOp, ArrayRef<int64_t> sizes);
} // end namespace mlir
#endif // MLIR_TRANSFORMS_LOOP_UTILS_H
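
A minimal sketch of driving these utilities from transformation code
(hypothetical helpers; assumes the nest rooted at `root` has already been
checked for permutability and that `builder` is positioned immediately before
it):

  #include "mlir/IR/Builders.h"
  #include "mlir/StandardOps/Ops.h"
  #include "mlir/Transforms/LoopUtils.h"
  using namespace mlir;

  // Tile the two outermost loops with tile sizes that are runtime values.
  void tileWithRuntimeSizes(ForOp root, OpBuilder &builder) {
    Value *ts0 = builder.create<ConstantIndexOp>(root.getLoc(), 32);
    Value *ts1 = builder.create<ConstantIndexOp>(root.getLoc(), 64);
    tile(root, {ts0, ts1});
  }

  // Make the two outermost loops execute (at most) 8 and 4 iterations,
  // letting the utility derive the parametric tile sizes from the loop ranges.
  void tileToFixedOuterRanges(ForOp root) {
    extractFixedOuterLoops(root, {8, 4});
  }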

View File

@@ -99,6 +99,12 @@ FunctionPassBase *createLowerAffinePass();
/// Creates a pass to perform tiling on loop nests.
FunctionPassBase *createLoopTilingPass(uint64_t cacheSizeBytes);
/// Creates a pass that performs parametric tiling so that the outermost loops
/// have the given fixed number of iterations. Assumes outermost loop nests
/// are permutable.
FunctionPassBase *
createSimpleParametricTilingPass(ArrayRef<int64_t> outerLoopSizes);
/// Promotes all accessed memref regions to the specified faster memory space
/// while generating DMAs to move data.
FunctionPassBase *createDmaGenerationPass(

View File

@@ -0,0 +1,73 @@
//===- LoopParametricTiling.cpp --- Parametric loop tiling pass -----------===//
//
// Copyright 2019 The MLIR Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
//
// This file implements a pass to parametrically tile nests of standard loops.
//
//===----------------------------------------------------------------------===//
#include "mlir/Pass/Pass.h"
#include "mlir/StandardOps/Ops.h"
#include "mlir/Transforms/LoopUtils.h"
#include "mlir/Transforms/Passes.h"
#include "mlir/IR/Builders.h"
using namespace mlir;
static llvm::cl::list<int> clOuterLoopSizes(
"outer-loop-sizes", llvm::cl::MiscFlags::CommaSeparated,
llvm::cl::desc(
"fixed number of iterations that the outer loops should have"));
namespace {
// Extracts fixed-range loops for top-level loop nests with ranges defined in
// the pass constructor. Assumes loops are permutable.
class SimpleParametricLoopTilingPass
: public FunctionPass<SimpleParametricLoopTilingPass> {
public:
explicit SimpleParametricLoopTilingPass(ArrayRef<int64_t> outerLoopSizes)
: sizes(outerLoopSizes.begin(), outerLoopSizes.end()) {}
void runOnFunction() override {
Function func = getFunction();
func.walk<ForOp>([this](ForOp op) {
// Ignore nested loops.
if (op.getContainingRegion()->getParentOfType<ForOp>())
return;
extractFixedOuterLoops(op, sizes);
});
}
SmallVector<int64_t, 4> sizes;
};
} // end namespace
FunctionPassBase *
mlir::createSimpleParametricTilingPass(ArrayRef<int64_t> outerLoopSizes) {
return new SimpleParametricLoopTilingPass(outerLoopSizes);
}
static PassRegistration<SimpleParametricLoopTilingPass>
reg("extract-fixed-outer-loops",
"apply parametric tiling to the outer loops so that the ranges of "
"outer loops become static",
[] {
auto *pass = new SimpleParametricLoopTilingPass({});
pass->sizes.assign(clOuterLoopSizes.begin(), clOuterLoopSizes.end());
return pass;
});

View File

@@ -351,20 +351,36 @@ LogicalResult mlir::instBodySkew(AffineForOp forOp, ArrayRef<uint64_t> shifts,
return success();
}
// Collect perfectly nested loops starting from `rootForOps`. Loops are
// perfectly nested if each loop is the first and only non-terminator operation
// in the parent loop. Collect at most `maxLoops` loops and append them to
// `forOps`.
template <typename T>
void getPerfectlyNestedLoopsImpl(
SmallVectorImpl<T> &forOps, T rootForOp,
unsigned maxLoops = std::numeric_limits<unsigned>::max()) {
for (unsigned i = 0; i < maxLoops; ++i) {
forOps.push_back(rootForOp);
// FIXME: ForOp and AffineForOp currently provide different names to access
// the region ("region" and "getRegion"). Remove this generic access when
// AffineForOp moves to ODS and also gets "region".
Block &body = rootForOp.getOperation()->getRegion(0).front();
if (body.begin() != std::prev(body.end(), 2))
return;
rootForOp = dyn_cast<T>(&body.front());
if (!rootForOp)
return;
}
}
/// Get perfectly nested sequence of loops starting at root of loop nest
/// (the first op being another AffineFor, and the second op - a terminator).
/// A loop is perfectly nested iff: the first op in the loop's body is another
/// AffineForOp, and the second op is a terminator.
void mlir::getPerfectlyNestedLoops(SmallVectorImpl<AffineForOp> &nestedLoops,
AffineForOp root) {
AffineForOp curr = root;
nestedLoops.push_back(curr);
auto *currBody = curr.getBody();
while (currBody->begin() == std::prev(currBody->end(), 2) &&
(curr = dyn_cast<AffineForOp>(curr.getBody()->front()))) {
nestedLoops.push_back(curr);
currBody = curr.getBody();
}
getPerfectlyNestedLoopsImpl(nestedLoops, root);
}
/// Unrolls this loop completely.
@@ -762,3 +778,144 @@ SmallVector<AffineForOp, 8> mlir::tile(ArrayRef<AffineForOp> forOps,
AffineForOp target) {
return tile(forOps, sizes, ArrayRef<AffineForOp>{target})[0];
}
// Tile the given nest of standard for loops with the given (parametric) sizes.
// Sizes are expected to be strictly positive values at runtime. If more sizes
// than loops are provided, the trailing values in `sizes` are discarded. When
// applied to a loop nest
//   for %i_0 = %lb_0 to %ub_0 step %s_0 {
//     for %i_1 = %lb_1 to %ub_1 step %s_1 {
//       "op"(%i_0, %i_1) : (index, index) -> () }}
// this splits the loops into tile loops with step %s_j * sizes[j] and the
// original bounds, and point loops iterating from %i_j to
// min(%i_j + %s_j * sizes[j], %ub_j) with the original step. No verification
// that `forOps` is suitable for tiling is performed; this function only
// applies the transformation.
static void tile(MutableArrayRef<ForOp> forOps, ArrayRef<Value *> sizes) {
assert(sizes.size() >= forOps.size() && "insufficient number of tile sizes");
if (sizes.empty() || forOps.empty())
return;
ForOp rootForOp = forOps.front();
OpBuilder builder(rootForOp);
// Compute new steps for the outer loops.
SmallVector<Value *, 4> newSteps;
newSteps.reserve(sizes.size());
for (unsigned i = 0, e = sizes.size(); i < e; ++i) {
auto op = forOps[i];
Value *newStep = builder.create<MulIOp>(op.getLoc(), op.step(), sizes[i]);
newSteps.push_back(newStep);
}
// Create new outer loops nested one into another.
SmallVector<ForOp, 4> outerForOps;
for (unsigned i = 0, e = sizes.size(); i < e; ++i) {
auto outerForOp =
builder.create<ForOp>(forOps[i].getLoc(), forOps[i].lowerBound(),
forOps[i].upperBound(), newSteps[i]);
builder.setInsertionPointToStart(outerForOp.body());
// FIXME: builder should do this for us.
ensureStdTerminator(outerForOp.getOperation()->getRegion(0), builder,
forOps[i].getLoc());
outerForOp.body()->addArgument(builder.getIndexType());
builder.setInsertionPointToStart(outerForOp.body());
outerForOps.push_back(outerForOp);
}
// Move the outermost original loop into the innermost new outer loop. Thus
// the body of the original loops does not need updating.
auto lastOuterForOp = outerForOps.back();
lastOuterForOp.body()->getOperations().splice(
lastOuterForOp.body()->getOperations().begin(),
rootForOp.getOperation()->getBlock()->getOperations(),
rootForOp.getOperation());
// Immediately before the (now sunk) outermost original loop, insert the
// computation of the upper bounds of the inner loops. Update the bounds of
// the original loops to make them point loops.
builder.setInsertionPointToStart(lastOuterForOp.body());
for (unsigned i = 0, e = sizes.size(); i < e; ++i) {
Value *stepped = builder.create<AddIOp>(
forOps[i].getLoc(), outerForOps[i].getInductionVar(), newSteps[i]);
Value *less = builder.create<CmpIOp>(forOps[i].getLoc(), CmpIPredicate::SLT,
forOps[i].upperBound(), stepped);
Value *upperBound = builder.create<SelectOp>(
forOps[i].getLoc(), less, forOps[i].upperBound(), stepped);
forOps[i].setLowerBound(outerForOps[i].getInductionVar());
forOps[i].setUpperBound(upperBound);
}
}
void mlir::tile(ForOp rootForOp, ArrayRef<Value *> sizes) {
// Collect perfectly nested loops. If more size values are provided than
// nested loops are available, truncate `sizes`.
SmallVector<ForOp, 4> forOps;
forOps.reserve(sizes.size());
getPerfectlyNestedLoopsImpl(forOps, rootForOp, sizes.size());
if (forOps.size() < sizes.size())
sizes = sizes.take_front(forOps.size());
return ::tile(forOps, sizes);
}
// Build the IR that performs ceil division of a positive value by a constant:
// ceildiv(a, B) = divis(a + (B-1), B)
// where divis is rounding-to-zero division.
static Value *ceilDivPositive(OpBuilder &builder, Location loc, Value *dividend,
int64_t divisor) {
assert(divisor > 0 && "expected positive divisor");
assert(dividend->getType().isIndex() && "expected index-typed value");
Value *divisorMinusOneCst = builder.create<ConstantIndexOp>(loc, divisor - 1);
Value *divisorCst = builder.create<ConstantIndexOp>(loc, divisor);
Value *sum = builder.create<AddIOp>(loc, dividend, divisorMinusOneCst);
return builder.create<DivISOp>(loc, sum, divisorCst);
}
static Value *ceilDivPositive(OpBuilder &builder, Location loc, Value *dividend,
Value *divisor) {
assert(dividend->getType().isIndex() && "expected index-typed value");
Value *cstOne = builder.create<ConstantIndexOp>(loc, 1);
Value *divisorMinusOne = builder.create<SubIOp>(loc, divisor, cstOne);
Value *sum = builder.create<AddIOp>(loc, dividend, divisorMinusOne);
return builder.create<DivISOp>(loc, sum, divisor);
}
void mlir::extractFixedOuterLoops(ForOp rootForOp, ArrayRef<int64_t> sizes) {
// Collect perfectly nested loops. If more size values are provided than
// nested loops are available, truncate `sizes`.
SmallVector<ForOp, 4> forOps;
forOps.reserve(sizes.size());
getPerfectlyNestedLoopsImpl(forOps, rootForOp, sizes.size());
if (forOps.size() < sizes.size())
sizes = sizes.take_front(forOps.size());
OpBuilder builder(rootForOp);
auto loc = rootForOp.getLoc();
// Compute the tile sizes such that the i-th outer loop executes sizes[i]
// iterations. Given that the loop currently executes
//   numIterations = ceildiv((upperBound - lowerBound), step)
// iterations, we need to tile with size ceildiv(numIterations, sizes[i]).
SmallVector<Value *, 4> tileSizes;
tileSizes.reserve(sizes.size());
for (unsigned i = 0, e = sizes.size(); i < e; ++i) {
assert(sizes[i] > 0 && "expected strictly positive size for strip-mining");
auto forOp = forOps[i];
Value *diff =
builder.create<SubIOp>(loc, forOp.upperBound(), forOp.lowerBound());
Value *numIterations = ceilDivPositive(builder, loc, diff, forOp.step());
Value *iterationsPerBlock =
ceilDivPositive(builder, loc, numIterations, sizes[i]);
tileSizes.push_back(iterationsPerBlock);
}
// Call parametric tiling with the given sizes.
return ::tile(forOps, tileSizes);
}

View File

@@ -0,0 +1,66 @@
// RUN: mlir-opt -extract-fixed-outer-loops -outer-loop-sizes=7 %s | FileCheck %s --check-prefixes=COMMON,TILE_7
// RUN: mlir-opt -extract-fixed-outer-loops -outer-loop-sizes=7,4 %s | FileCheck %s --check-prefixes=COMMON,TILE_74
// COMMON-LABEL: @foo
func @foo(%arg0: memref<?x?xf32>) {
%c2 = constant 2 : index
%c44 = constant 44 : index
%c1 = constant 1 : index
// Range of the original loop:
// (upper - lower + step - 1) / step
// where step is known to be %c1.
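// For instance, %i runs from %c2 to %c44 with step %c1, so the range is
// ceildiv(44 - 2, 1) = 42, the parametric tile size is ceildiv(42, 7) = 6,
// and the new outer step is 1 * 6 = 6 (exactly 7 outer iterations).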
// COMMON: %[[diff:.*]] = subi %c44, %c2
// COMMON: %[[adjustment:.*]] = subi %c1, %c1_{{.*}}
// COMMON-NEXT: %[[diff_adj:.*]] = addi %[[diff]], %[[adjustment]]
// COMMON-NEXT: %[[range:.*]] = divis %[[diff_adj]], %c1
// Ceildiv to get the parametric tile size.
// COMMON: %[[sum:.*]] = addi %[[range]], %c6
// COMMON-NEXT: %[[size:.*]] = divis %[[sum]], %c7
// Range of the second original loop
// (upper - lower + step - 1) / step
// where step is known to be %c2.
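// For instance, %j runs from %c1 to %c44 with step %c2, so the range is
// ceildiv(44 - 1, 2) = 22, the parametric tile size is ceildiv(22, 4) = 6,
// and the new outer step is 2 * 6 = 12 (4 outer iterations).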
// TILE_74: %[[diff2:.*]] = subi %c44, %c1
// TILE_74: %[[adjustment2:.*]] = subi %c2, %c1_{{.*}}
// TILE_74-NEXT: %[[diff2_adj:.*]] = addi %[[diff2]], %[[adjustment2]]
// TILE_74-NEXT: %[[range2:.*]] = divis %[[diff2_adj]], %c2
// Ceildiv to get the parametric tile size for the second original loop.
// TILE_74: %[[sum2:.*]] = addi %[[range2]], %c3
// TILE_74-NEXT: %[[size2:.*]] = divis %[[sum2]], %c4
// New step(s) (original is %c1 and %c2).
// COMMON: %[[step:.*]] = muli %c1, %[[size]]
// TILE_74: %[[step2:.*]] = muli %c2, %[[size2]]
// Updated outer loop(s) use new steps.
// COMMON: for %[[i:.*]] = %c2 to %c44 step %[[step]]
// TILE_74: for %[[j:.*]] = %c1 to %c44 step %[[step2]]
for %i = %c2 to %c44 step %c1 {
// Upper bound for the inner loop min(%i + %step, %c44).
// COMMON: %[[stepped:.*]] = addi %[[i]], %[[step]]
// COMMON-NEXT: cmpi "slt", %c44, %[[stepped]]
// COMMON-NEXT: %[[ub:.*]] = select {{.*}}, %c44, %[[stepped]]
//
// TILE_74: %[[stepped2:.*]] = addi %[[j]], %[[step2]]
// TILE_74-NEXT: cmpi "slt", %c44, %[[stepped2]]
// TILE_74-NEXT: %[[ub2:.*]] = select {{.*}}, %c44, %[[stepped2]]
// Created inner loop.
// COMMON: for %[[ii:.*]] = %[[i]] to %[[ub:.*]] step %c1
// This loop is not modified in the TILE_7 case.
// TILE_7: for %[[j:.*]] = %c1 to %c44 step %c2
//
// But it is modified in the TILE_74 case.
// TILE_74: for %[[jj:.*]] = %[[j]] to %[[ub2]] step %c2
for %j = %c1 to %c44 step %c2 {
// The right iterators are used.
// TILE_7: load %arg0[%[[ii]], %[[j]]]
// TILE_74: load %arg0[%[[ii]], %[[jj]]]
load %arg0[%i, %j]: memref<?x?xf32>
}
}
return
}