From 9d03f5674f4e511d834b3de9d24eb1248a06f864 Mon Sep 17 00:00:00 2001 From: Alex Zinenko Date: Tue, 9 Jul 2019 06:37:17 -0700 Subject: [PATCH] Implement parametric tiling on standard for loops Parametric tiling can be used to extract outer loops with fixed number of iterations. This in turn enables mapping to GPU kernels on a fixed grid independently of the range of the original loops, which may be unknown statically, making the kernel adaptable to different sizes. Provide a utility function that also computes the parametric tile size given the range of the loop. Exercise the utility function through a simple pass that applies it to all top-level loop nests. Permutability or parallelism checks must be performed before calling this utility function in actual passes. Note that parametric tiling cannot be implemented in a purely affine way, although it can be encoded using semi-affine maps. The choice to implement it on standard loops is guided by them being the common representation between Affine loops, Linalg and GPU kernels. 
PiperOrigin-RevId: 257180251 --- mlir/include/mlir/IR/OpDefinition.h | 4 +- mlir/include/mlir/Pass/PassRegistry.h | 13 +- mlir/include/mlir/StandardOps/Ops.td | 2 + mlir/include/mlir/Transforms/LoopUtils.h | 12 ++ mlir/include/mlir/Transforms/Passes.h | 6 + mlir/lib/Transforms/LoopParametricTiling.cpp | 73 ++++++++ mlir/lib/Transforms/Utils/LoopUtils.cpp | 173 ++++++++++++++++++- mlir/test/Transforms/parametric_tiling.mlir | 66 +++++++ 8 files changed, 336 insertions(+), 13 deletions(-) create mode 100644 mlir/lib/Transforms/LoopParametricTiling.cpp create mode 100644 mlir/test/Transforms/parametric_tiling.mlir diff --git a/mlir/include/mlir/IR/OpDefinition.h b/mlir/include/mlir/IR/OpDefinition.h index 6913b7638d74..32f7efafd852 100644 --- a/mlir/include/mlir/IR/OpDefinition.h +++ b/mlir/include/mlir/IR/OpDefinition.h @@ -798,7 +798,9 @@ public: Dialect *getDialect() { return getOperation()->getDialect(); } /// Return the Region enclosing this Op. - Region *getContainingRegion() { return getOperation()->getParentRegion(); } + Region *getContainingRegion() { + return getOperation()->getContainingRegion(); + } /// Return true if this "op class" can match against the specified operation. /// This hook can be overridden with a more specific implementation in diff --git a/mlir/include/mlir/Pass/PassRegistry.h b/mlir/include/mlir/Pass/PassRegistry.h index 170881863c19..27096f354d31 100644 --- a/mlir/include/mlir/Pass/PassRegistry.h +++ b/mlir/include/mlir/Pass/PassRegistry.h @@ -105,16 +105,21 @@ void registerPass(StringRef arg, StringRef description, const PassID *passID, const PassAllocatorFunction &function); /// PassRegistration provides a global initializer that registers a Pass -/// allocation routine for a concrete pass instance. +/// allocation routine for a concrete pass instance. The third argument is +/// optional and provides a callback to construct a pass that does not have +/// a default constructor. /// /// Usage: /// /// // At namespace scope. 
/// static PassRegistration Unused("unused", "Unused pass"); template struct PassRegistration { - PassRegistration(StringRef arg, StringRef description) { - registerPass(arg, description, PassID::getID(), - [] { return new ConcretePass(); }); + PassRegistration( + StringRef arg, StringRef description, + const PassAllocatorFunction &constructor = [] { + return new ConcretePass(); + }) { + registerPass(arg, description, PassID::getID(), constructor); } }; diff --git a/mlir/include/mlir/StandardOps/Ops.td b/mlir/include/mlir/StandardOps/Ops.td index 189d3c8a3415..8e37a58cd14b 100644 --- a/mlir/include/mlir/StandardOps/Ops.td +++ b/mlir/include/mlir/StandardOps/Ops.td @@ -697,6 +697,8 @@ def ForOp : Std_Op<"for"> { OpBuilder getBodyBuilder() { return OpBuilder(body(), std::prev(body()->end())); } + void setLowerBound(Value *bound) { getOperation()->setOperand(0, bound); } + void setUpperBound(Value *bound) { getOperation()->setOperand(1, bound); } }]; } diff --git a/mlir/include/mlir/Transforms/LoopUtils.h b/mlir/include/mlir/Transforms/LoopUtils.h index 1e46f2304ec5..654555830ec4 100644 --- a/mlir/include/mlir/Transforms/LoopUtils.h +++ b/mlir/include/mlir/Transforms/LoopUtils.h @@ -30,6 +30,7 @@ namespace mlir { class AffineMap; class AffineForOp; +class ForOp; class FuncOp; using Function = FuncOp; class OpBuilder; @@ -140,6 +141,17 @@ SmallVector, 8> tile(ArrayRef forOps, SmallVector tile(ArrayRef forOps, ArrayRef sizes, AffineForOp target); +/// Tile a nest of standard for loops rooted at `rootForOp` with the given +/// (parametric) sizes. Sizes are expected to be strictly positive values at +/// runtime. If more sizes than loops provided, discard the trailing values in +/// sizes. Assumes the loop nest is permutable. +void tile(ForOp rootForOp, ArrayRef sizes); + +/// Tile a nest of standard for loops rooted at `rootForOp` by finding such +/// parametric tile sizes that the outer loops have a fixed number of iterations +/// as defined in `sizes`. 
+void extractFixedOuterLoops(ForOp rootFOrOp, ArrayRef sizes); + } // end namespace mlir #endif // MLIR_TRANSFORMS_LOOP_UTILS_H diff --git a/mlir/include/mlir/Transforms/Passes.h b/mlir/include/mlir/Transforms/Passes.h index a253871bc29e..83532f295cf8 100644 --- a/mlir/include/mlir/Transforms/Passes.h +++ b/mlir/include/mlir/Transforms/Passes.h @@ -99,6 +99,12 @@ FunctionPassBase *createLowerAffinePass(); /// Creates a pass to perform tiling on loop nests. FunctionPassBase *createLoopTilingPass(uint64_t cacheSizeBytes); +/// Creates a pass that performs parametric tiling so that the outermost loops +/// have the given fixed number of iterations. Assumes outermost loop nests +/// are permutable. +FunctionPassBase * +createSimpleParametricTilingPass(ArrayRef outerLoopSizes); + /// Promotes all accessed memref regions to the specified faster memory space /// while generating DMAs to move data. FunctionPassBase *createDmaGenerationPass( diff --git a/mlir/lib/Transforms/LoopParametricTiling.cpp b/mlir/lib/Transforms/LoopParametricTiling.cpp new file mode 100644 index 000000000000..c2b239437948 --- /dev/null +++ b/mlir/lib/Transforms/LoopParametricTiling.cpp @@ -0,0 +1,73 @@ +//===- LoopParametricTiling.cpp --- Parametric loop tiling pass -----------===// +// +// Copyright 2019 The MLIR Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// ============================================================================= +// +// This file implements a pass to parametrically tile nests of standard loops. +// +//===----------------------------------------------------------------------===// + +#include "mlir/Pass/Pass.h" +#include "mlir/StandardOps/Ops.h" +#include "mlir/Transforms/LoopUtils.h" +#include "mlir/Transforms/Passes.h" + +#include "mlir/IR/Builders.h" + +using namespace mlir; + +static llvm::cl::list clOuterLoopSizes( + "outer-loop-sizes", llvm::cl::MiscFlags::CommaSeparated, + llvm::cl::desc( + "fixed number of iterations that the outer loops should have")); + +namespace { +// Extracts fixed-range loops for top-level loop nests with ranges defined in +// the pass constructor. Assumes loops are permutable. +class SimpleParametricLoopTilingPass + : public FunctionPass { +public: + explicit SimpleParametricLoopTilingPass(ArrayRef outerLoopSizes) + : sizes(outerLoopSizes.begin(), outerLoopSizes.end()) {} + + void runOnFunction() override { + Function func = getFunction(); + + func.walk([this](ForOp op) { + // Ignore nested loops. 
+ if (op.getContainingRegion()->getParentOfType()) + return; + extractFixedOuterLoops(op, sizes); + }); + } + + SmallVector sizes; +}; +} // end namespace + +FunctionPassBase * +mlir::createSimpleParametricTilingPass(ArrayRef outerLoopSizes) { + return new SimpleParametricLoopTilingPass(outerLoopSizes); +} + +static PassRegistration + reg("extract-fixed-outer-loops", + "apply parametric tiling to the outer loops so that the ranges of " + "outer loops become static", + [] { + auto *pass = new SimpleParametricLoopTilingPass({}); + pass->sizes.assign(clOuterLoopSizes.begin(), clOuterLoopSizes.end()); + return pass; + }); diff --git a/mlir/lib/Transforms/Utils/LoopUtils.cpp b/mlir/lib/Transforms/Utils/LoopUtils.cpp index 65847fc8bee8..8fbd59b6bc83 100644 --- a/mlir/lib/Transforms/Utils/LoopUtils.cpp +++ b/mlir/lib/Transforms/Utils/LoopUtils.cpp @@ -351,20 +351,36 @@ LogicalResult mlir::instBodySkew(AffineForOp forOp, ArrayRef shifts, return success(); } +// Collect perfectly nested loops starting from `rootForOps`. Loops are +// perfectly nested if each loop is the first and only non-terminator operation +// in the parent loop. Collect at most `maxLoops` loops and append them to +// `forOps`. +template +void getPerfectlyNestedLoopsImpl( + SmallVectorImpl &forOps, T rootForOp, + unsigned maxLoops = std::numeric_limits::max()) { + for (unsigned i = 0; i < maxLoops; ++i) { + forOps.push_back(rootForOp); + // FIXME: ForOp and AffineForOp currently provide different names to access + // the region ("region" and "getRegion"). Remove this generic access when + // AffineForOp moves to ODS and also gets "region". + Block &body = rootForOp.getOperation()->getRegion(0).front(); + if (body.begin() != std::prev(body.end(), 2)) + return; + + rootForOp = dyn_cast(&body.front()); + if (!rootForOp) + return; + } +} + /// Get perfectly nested sequence of loops starting at root of loop nest /// (the first op being another AffineFor, and the second op - a terminator). 
/// A loop is perfectly nested iff: the first op in the loop's body is another /// AffineForOp, and the second op is a terminator). void mlir::getPerfectlyNestedLoops(SmallVectorImpl &nestedLoops, AffineForOp root) { - AffineForOp curr = root; - nestedLoops.push_back(curr); - auto *currBody = curr.getBody(); - while (currBody->begin() == std::prev(currBody->end(), 2) && - (curr = dyn_cast(curr.getBody()->front()))) { - nestedLoops.push_back(curr); - currBody = curr.getBody(); - } + getPerfectlyNestedLoopsImpl(nestedLoops, root); } /// Unrolls this loop completely. @@ -762,3 +778,144 @@ SmallVector mlir::tile(ArrayRef forOps, AffineForOp target) { return tile(forOps, sizes, ArrayRef{target})[0]; } + +// Tile the given nest of standard for loops with the given (parametric) sizes. +// Sizes are expected to be strictly positive values at runtime. If more +// sizes than loops provided, discard the trailing values in sizes. When +// applied to a loop nest +// for %i_0 = %lb_0 to %ub_0 step %s_0 { +// for %i_1 = %lb_1 to %ub_1 step %s_1 { +// "op"(%i0, %i1) : (index, index) -> () }} +// this splits the loops into tile loops with step %sj * sizes[j] and the +// original bounds, and the point loops iteration from %i_j to +// min(%i_j + %s_j * sizes[j], %ub_j) with the original step. No verification +// of `forOps` being suitable for tiling is performed, this function only +// applies the transformation. +static void tile(MutableArrayRef forOps, ArrayRef sizes) { + assert(sizes.size() >= forOps.size() && "insufficient number of tile sizes"); + if (sizes.empty() || forOps.empty()) + return; + + ForOp rootForOp = forOps.front(); + OpBuilder builder(rootForOp); + + // Compute new steps for the outer loops. 
+ SmallVector newSteps; + newSteps.reserve(sizes.size()); + for (unsigned i = 0, e = sizes.size(); i < e; ++i) { + auto op = forOps[i]; + Value *newStep = builder.create(op.getLoc(), op.step(), sizes[i]); + newSteps.push_back(newStep); + } + + // Create new outer loops nested one into another. + SmallVector outerForOps; + for (unsigned i = 0, e = sizes.size(); i < e; ++i) { + auto outerForOp = + builder.create(forOps[i].getLoc(), forOps[i].lowerBound(), + forOps[i].upperBound(), newSteps[i]); + + builder.setInsertionPointToStart(outerForOp.body()); + + // FIXME: builder should do this for us. + ensureStdTerminator(outerForOp.getOperation()->getRegion(0), builder, + forOps[i].getLoc()); + outerForOp.body()->addArgument(builder.getIndexType()); + builder.setInsertionPointToStart(outerForOp.body()); + + outerForOps.push_back(outerForOp); + } + + // Move the outermost original loop into the innermost new outer loop. Thus + // the body of the original loops does not need updating. + auto lastOuterForOp = outerForOps.back(); + lastOuterForOp.body()->getOperations().splice( + lastOuterForOp.body()->getOperations().begin(), + rootForOp.getOperation()->getBlock()->getOperations(), + rootForOp.getOperation()); + + // Immediately before the (now sunk) outermost original loop, insert the + // computation of the upper bounds of the inner loops. Update the bounds of + // the original loops to make them point loops. 
+ builder.setInsertionPointToStart(lastOuterForOp.body()); + for (unsigned i = 0, e = sizes.size(); i < e; ++i) { + Value *stepped = builder.create( + forOps[i].getLoc(), outerForOps[i].getInductionVar(), newSteps[i]); + Value *less = builder.create(forOps[i].getLoc(), CmpIPredicate::SLT, + forOps[i].upperBound(), stepped); + Value *upperBound = builder.create( + forOps[i].getLoc(), less, forOps[i].upperBound(), stepped); + forOps[i].setLowerBound(outerForOps[i].getInductionVar()); + forOps[i].setUpperBound(upperBound); + } +} + +void mlir::tile(ForOp rootForOp, ArrayRef sizes) { + // Collect perfectly nested loops. If more size values provided than nested + // loops available, truncate `sizes`. + SmallVector forOps; + forOps.reserve(sizes.size()); + getPerfectlyNestedLoopsImpl(forOps, rootForOp, sizes.size()); + if (forOps.size() < sizes.size()) + sizes = sizes.take_front(forOps.size()); + + return ::tile(forOps, sizes); +} + +// Build the IR that performs ceil division of a positive value by a constant: +// ceildiv(a, B) = divis(a + (B-1), B) +// where divis is rounding-to-zero division. 
+static Value *ceilDivPositive(OpBuilder &builder, Location loc, Value *dividend, + int64_t divisor) { + assert(divisor > 0 && "expected positive divisor"); + assert(dividend->getType().isIndex() && "expected index-typed value"); + + Value *divisorMinusOneCst = builder.create(loc, divisor - 1); + Value *divisorCst = builder.create(loc, divisor); + Value *sum = builder.create(loc, dividend, divisorMinusOneCst); + return builder.create(loc, sum, divisorCst); +} + +static Value *ceilDivPositive(OpBuilder &builder, Location loc, Value *dividend, + Value *divisor) { + assert(dividend->getType().isIndex() && "expected index-typed value"); + + Value *cstOne = builder.create(loc, 1); + Value *divisorMinusOne = builder.create(loc, divisor, cstOne); + Value *sum = builder.create(loc, dividend, divisorMinusOne); + return builder.create(loc, sum, divisor); +} + +void mlir::extractFixedOuterLoops(ForOp rootForOp, ArrayRef sizes) { + // Collect perfectly nested loops. If more size values provided than nested + // loops available, truncate `sizes`. + SmallVector forOps; + forOps.reserve(sizes.size()); + getPerfectlyNestedLoopsImpl(forOps, rootForOp, sizes.size()); + if (forOps.size() < sizes.size()) + sizes = sizes.take_front(forOps.size()); + + OpBuilder builder(rootForOp); + auto loc = rootForOp.getLoc(); + + // Compute the tile sizes such that i-th outer loop executes sizes[i] + // iterations. Given that the loop currently executes + // numIterations = ceildiv((upperBound - lowerBound), step) + // iterations, we need to tile with size ceildiv(numIterations, sizes[i]). 
+ SmallVector tileSizes; + tileSizes.reserve(sizes.size()); + for (unsigned i = 0, e = sizes.size(); i < e; ++i) { + assert(sizes[i] > 0 && "expected strictly positive size for strip-mining"); + + auto forOp = forOps[i]; + Value *diff = + builder.create(loc, forOp.upperBound(), forOp.lowerBound()); + Value *numIterations = ceilDivPositive(builder, loc, diff, forOp.step()); + Value *iterationsPerBlock = + ceilDivPositive(builder, loc, numIterations, sizes[i]); + tileSizes.push_back(iterationsPerBlock); + } + + // Call parametric tiling with the given sizes. + return ::tile(forOps, tileSizes); +} diff --git a/mlir/test/Transforms/parametric_tiling.mlir b/mlir/test/Transforms/parametric_tiling.mlir new file mode 100644 index 000000000000..201c9c5b1585 --- /dev/null +++ b/mlir/test/Transforms/parametric_tiling.mlir @@ -0,0 +1,66 @@ +// RUN: mlir-opt -extract-fixed-outer-loops -outer-loop-sizes=7 %s | FileCheck %s --check-prefixes=COMMON,TILE_7 +// RUN: mlir-opt -extract-fixed-outer-loops -outer-loop-sizes=7,4 %s | FileCheck %s --check-prefixes=COMMON,TILE_74 + +// COMMON-LABEL: @foo +func @foo(%arg0: memref) { + %c2 = constant 2 : index + %c44 = constant 44 : index + %c1 = constant 1 : index + // Range of the original loop: + // (upper - lower + step - 1) / step + // where step is known to be %c1. + // COMMON: %[[diff:.*]] = subi %c44, %c2 + // COMMON: %[[adjustment:.*]] = subi %c1, %c1_{{.*}} + // COMMON-NEXT: %[[diff_adj:.*]] = addi %[[diff]], %[[adjustment]] + // COMMON-NEXT: %[[range:.*]] = divis %[[diff_adj]], %c1 + + // Ceildiv to get the parametric tile size. + // COMMON: %[[sum:.*]] = addi %[[range]], %c6 + // COMMON-NEXT: %[[size:.*]] = divis %[[sum]], %c7 + + // Range of the second original loop + // (upper - lower + step - 1) / step + // where step is known to be %c2. 
+ // TILE_74: %[[diff2:.*]] = subi %c44, %c1 + // TILE_74: %[[adjustment2:.*]] = subi %c2, %c1_{{.*}} + // TILE_74-NEXT: %[[diff2_adj:.*]] = addi %[[diff2]], %[[adjustment2]] + // TILE_74-NEXT: %[[range2:.*]] = divis %[[diff2_adj]], %c2 + + // Ceildiv to get the parametric tile size for the second original loop. + // TILE_74: %[[sum2:.*]] = addi %[[range2]], %c3 + // TILE_74-NEXT: %[[size2:.*]] = divis %[[sum2]], %c4 + + // New step(s) (original is %c1 and %c2). + // COMMON: %[[step:.*]] = muli %c1, %[[size]] + // TILE_74: %[[step2:.*]] = muli %c2, %[[size2]] + + // Updated outer loop(s) use new steps. + // COMMON: for %[[i:.*]] = %c2 to %c44 step %[[step]] + // TILE_74: for %[[j:.*]] = %c1 to %c44 step %[[step2]] + for %i = %c2 to %c44 step %c1 { + // Upper bound for the inner loop min(%i + %step, %c44). + // COMMON: %[[stepped:.*]] = addi %[[i]], %[[step]] + // COMMON-NEXT: cmpi "slt", %c44, %[[stepped]] + // COMMON-NEXT: %[[ub:.*]] = select {{.*}}, %c44, %[[stepped]] + // + // TILE_74: %[[stepped2:.*]] = addi %[[j]], %[[step2]] + // TILE_74-NEXT: cmpi "slt", %c44, %[[stepped2]] + // TILE_74-NEXT: %[[ub2:.*]] = select {{.*}}, %c44, %[[stepped2]] + + // Created inner loop. + // COMMON: for %[[ii:.*]] = %[[i]] to %[[ub:.*]] step %c1 + + // This loop is not modified in TILE_7 case. + // TILE_7: for %[[j:.*]] = %c1 to %c44 step %c2 + // + // But is modified in TILE_74 case. + // TILE_74: for %[[jj:.*]] = %[[j]] to %[[ub2]] step %c2 + for %j = %c1 to %c44 step %c2 { + // The right iterators are used. + // TILE_7: load %arg0[%[[ii]], %[[j]]] + // TILE_74: load %arg0[%[[ii]], %[[jj]]] + load %arg0[%i, %j]: memref + } + } + return +}