Add stripmineSink and imperfectly nested tiling primitives.

This CL adds a stripmineSink primitive that strip-mines a loop by a given
factor and sinks it under multiple target loops.
This primitive is in turn used to implement imperfectly nested loop tiling (with interchange) by calling stripmineSink repeatedly.
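
For intuition, the effect of a single stripmineSink call can be pictured with plain C-style loops. This is only a schematic with placeholder bounds M, O and body S; the primitive itself rewrites affine bounds and steps rather than C loops:

```cpp
// Before: a loop over i with a target loop over k nested under it.
//   for (i = 0; i < M; i++)
//     for (k = 0; k < O; k++)
//       S(i, k);
//
// After stripmineSink(i-loop, /*factor=*/512, /*targets=*/{k-loop}):
// the i-loop now advances in strips of 512 and a new point loop over ii,
// covering one strip, is materialized immediately under the target k-loop.
//   for (i = 0; i < M; i += 512)
//     for (k = 0; k < O; k++)
//       for (ii = i; ii < min(i + 512, M); ii++)
//         S(ii, k);
```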

The API returns the point loops and allows repeated invocations of tiling to achieve declarative, multi-level, imperfectly-nested tiling.

Note that this CL is only concerned with the mechanical aspects and does not worry about analysis and legality.

The API is demonstrated in an example which creates an EDSC block, emits the corresponding MLIR and applies imperfectly-nested tiling:

```cpp
    auto block = edsc::block({
      For(ArrayRef<edsc::Expr>{i, j}, {zero, zero}, {M, N}, {one, one}, {
        For(k1, zero, O, one, {
          C({i, j, k1}) = A({i, j, k1}) + B({i, j, k1})
        }),
        For(k2, zero, O, one, {
          C({i, j, k2}) = A({i, j, k2}) + B({i, j, k2})
        }),
      }),
    });
    // clang-format on
    emitter.emitStmts(block.getBody());

    auto l_i = emitter.getAffineForOp(i), l_j = emitter.getAffineForOp(j),
         l_k1 = emitter.getAffineForOp(k1), l_k2 = emitter.getAffineForOp(k2);
    auto indicesL1 = mlir::tile({l_i, l_j}, {512, 1024}, {l_k1, l_k2});
    auto l_ii1 = indicesL1[0][0], l_jj1 = indicesL1[1][0];
    mlir::tile({l_jj1, l_ii1}, {32, 16}, l_jj1);
```
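
The shape of the value returned by the first tile() call can be read off the mlir::tile implementation in LoopUtils.cpp below: the outer index runs over `forOps` and the inner index over the targets used for that loop (which are the point loops produced for the previous one). Annotating the accesses above:

```cpp
// indicesL1[f][t]: the point loop produced by strip-mining forOps[f] and
// sinking it under the t-th target of that round.
auto l_ii1 = indicesL1[0][0];  // point loop of i, sunk under l_k1
auto l_jj1 = indicesL1[1][0];  // point loop of j, sunk under l_ii1 (inside l_k1)
```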

The edsc::Exprs for the induction variables (i, j, k1, k2) provide the programmatic hooks from which tiling can be applied declaratively.

PiperOrigin-RevId: 235548228
Nicolas Vasilache authored 2019-02-25 09:53:05 -08:00, committed by jpienaar
parent e7193a70f8, commit 62c54a2ec4
6 changed files with 181 additions and 1 deletion


@@ -40,6 +40,7 @@
#include "llvm/ADT/DenseMap.h"
namespace mlir {
class AffineForOp;
class FuncBuilder;
class Value;
@@ -164,6 +165,7 @@ struct MLIREmitter {
}
return res;
}
OpPointer<AffineForOp> getAffineForOp(Expr e);
private:
/// Emits the MLIR for `expr` and inserts at the `builder`'s insertion point.


@@ -104,6 +104,23 @@ void interchangeLoops(OpPointer<AffineForOp> forOpA,
/// 'loopDepth' AffineForOps consecutively nested under it.
void sinkLoop(OpPointer<AffineForOp> forOp, unsigned loopDepth);
/// Performs tiling of imperfectly nested loops (with interchange) by
/// strip-mining the `forOps` by `sizes` and sinking them, in their order of
/// occurrence in `forOps`, under each of the `targets`.
/// Returns the new AffineForOps, one per (`forOps`, `targets`) pair,
/// nested immediately under each of the `targets`.
SmallVector<SmallVector<OpPointer<AffineForOp>, 8>, 8>
tile(ArrayRef<OpPointer<AffineForOp>> forOps, ArrayRef<uint64_t> sizes,
ArrayRef<OpPointer<AffineForOp>> targets);
/// Performs tiling (with interchange) by strip-mining the `forOps` by `sizes`
/// and sinking them, in their order of occurrence in `forOps`, under `target`.
/// Returns the new AffineForOps, one per `forOps`, nested immediately under
/// `target`.
SmallVector<OpPointer<AffineForOp>, 8>
tile(ArrayRef<OpPointer<AffineForOp>> forOps, ArrayRef<uint64_t> sizes,
OpPointer<AffineForOp> target);
} // end namespace mlir
#endif // MLIR_TRANSFORMS_LOOP_UTILS_H


@@ -15,6 +15,7 @@
// limitations under the License.
// =============================================================================
#include "mlir/AffineOps/AffineOps.h"
#include "mlir/EDSC/MLIREmitter.h"
#include "mlir/EDSC/Types.h"
#include "mlir/IR/Builders.h"
@@ -25,6 +26,8 @@
#include "mlir/IR/Types.h"
#include "mlir/Pass/Pass.h"
#include "mlir/StandardOps/StandardOps.h"
#include "mlir/Transforms/LoopUtils.h"
#include "llvm/Support/raw_ostream.h"
using namespace mlir;
@@ -260,10 +263,48 @@ PassResult LowerEDSCTestPass::runOnFunction(Function *f) {
}),
});
// clang-format on
emitter.emitStmts(block.getBody());
}
// Inject an EDSC-constructed computation to exercise imperfectly nested 2-d
// tiling.
if (f->getName().strref().contains("tile_2d")) {
FuncBuilder builder(f);
edsc::ScopedEDSCContext context;
edsc::MLIREmitter emitter(&builder, f->getLoc());
edsc::Expr zero = emitter.zero();
edsc::Expr one = emitter.one();
auto args = emitter.makeBoundFunctionArguments(f);
auto views = emitter.makeBoundMemRefViews(args.begin(), args.end());
Type indexType = builder.getIndexType();
edsc::Expr i(indexType), j(indexType), k1(indexType), k2(indexType);
edsc::Indexed A(args[0]), B(args[1]), C(args[2]);
edsc::Expr M = views[0].dim(0), N = views[0].dim(1), O = views[0].dim(2);
// clang-format off
using namespace edsc::op;
edsc::Stmt scalarA, scalarB, tmp;
auto block = edsc::block({
For(ArrayRef<edsc::Expr>{i, j}, {zero, zero}, {M, N}, {one, one}, {
For(k1, zero, O, one, {
C({i, j, k1}) = A({i, j, k1}) + B({i, j, k1})
}),
For(k2, zero, O, one, {
C({i, j, k2}) = A({i, j, k2}) + B({i, j, k2})
}),
}),
});
// clang-format on
emitter.emitStmts(block.getBody());
auto li = emitter.getAffineForOp(i), lj = emitter.getAffineForOp(j),
lk1 = emitter.getAffineForOp(k1), lk2 = emitter.getAffineForOp(k2);
auto indicesL1 = mlir::tile({li, lj}, {512, 1024}, {lk1, lk2});
auto lii1 = indicesL1[0][0], ljj1 = indicesL1[1][0];
mlir::tile({ljj1, lii1}, {32, 16}, ljj1);
}
f->walk([](Instruction *op) {
if (op->getName().getStringRef() == "print") {
auto opName = op->getAttrOfType<StringAttr>("op");


@@ -388,6 +388,12 @@ mlir::edsc::MLIREmitter::makeBoundMemRefView(Expr boundMemRef) {
return makeBoundMemRefView(v);
}
OpPointer<AffineForOp> mlir::edsc::MLIREmitter::getAffineForOp(Expr e) {
auto *value = ssaBindings.lookup(e);
assert(value && "Expr not bound");
return getForInductionVarOwner(value);
}
edsc_expr_t bindConstantBF16(edsc_mlir_emitter_t emitter, double value) {
auto *e = reinterpret_cast<mlir::edsc::MLIREmitter *>(emitter);
Expr b(e->getBuilder()->getBF16Type());


@@ -34,6 +34,7 @@
#include "mlir/StandardOps/StandardOps.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/Support/Debug.h"
#define DEBUG_TYPE "LoopUtils"
using namespace mlir;
@@ -486,3 +487,97 @@ void mlir::sinkLoop(OpPointer<AffineForOp> forOp, unsigned loopDepth) {
interchangeLoops(forOp, nextForOp);
}
}
// Factors out common behavior to add max(`iv`, ...), min(`iv` + `offset`, ...)
// to loop bounds.
static void augmentMapAndBounds(FuncBuilder *b, Value *iv, AffineMap *map,
SmallVector<Value *, 4> *operands,
int64_t offset = 0) {
auto bounds = llvm::to_vector<4>(map->getResults());
operands->push_back(iv);
auto numOperands = operands->size();
bounds.push_back(b->getAffineDimExpr(numOperands - 1) + offset);
*map = b->getAffineMap(numOperands, map->getNumSymbols(), bounds, {});
canonicalizeMapAndOperands(map, operands);
}
// Stripmines `forOp` by `factor` and sinks it under each of the `targets`.
// Stripmine-sink is a primitive building block for generalized tiling of
// imperfectly nested loops.
// This transformation is purely mechanical and does not check legality,
// profitability or even structural correctness. It is the user's
// responsibility to specify `targets` that are dominated by `forOp`.
// Returns the new AffineForOps, one per `targets`, nested immediately under
// each of the `targets`.
static SmallVector<OpPointer<AffineForOp>, 8>
stripmineSink(OpPointer<AffineForOp> forOp, uint64_t factor,
ArrayRef<OpPointer<AffineForOp>> targets) {
// TODO(ntv): Use cheap structural assertions that targets are nested under
// forOp and that targets are not nested under each other when DominanceInfo
// exposes the capability. It seems overkill to construct a whole function
// dominance tree at this point.
auto originalStep = forOp->getStep();
auto scaledStep = originalStep * factor;
forOp->setStep(scaledStep);
auto *forInst = forOp->getInstruction();
FuncBuilder b(forInst->getBlock(), ++Block::iterator(forInst));
// Lower-bound map creation.
auto lbMap = forOp->getLowerBoundMap();
SmallVector<Value *, 4> lbOperands(forOp->getLowerBoundOperands());
augmentMapAndBounds(&b, forOp->getInductionVar(), &lbMap, &lbOperands);
// Upper-bound map creation.
auto ubMap = forOp->getUpperBoundMap();
SmallVector<Value *, 4> ubOperands(forOp->getUpperBoundOperands());
augmentMapAndBounds(&b, forOp->getInductionVar(), &ubMap, &ubOperands,
/*offset=*/scaledStep);
SmallVector<OpPointer<AffineForOp>, 8> innerLoops;
for (auto t : targets) {
// Insert forOp just before the first instruction in the body.
auto *body = t->getBody();
auto &inst = body->getInstructions().front();
FuncBuilder b(&inst);
auto newLoop = b.create<AffineForOp>(t->getLoc(), lbOperands, lbMap,
ubOperands, ubMap, originalStep);
newLoop->createBody()->getInstructions().splice(
newLoop->getBody()->end(), body->getInstructions(), ++body->begin(),
body->end());
innerLoops.push_back(newLoop);
}
return innerLoops;
}
// Stripmines a `forOp` by `factor` and sinks it under a single `target`.
// Returns the new AffineForOp, nested immediately under `target`.
OpPointer<AffineForOp> stripmineSink(OpPointer<AffineForOp> forOp,
uint64_t factor,
OpPointer<AffineForOp> target) {
auto res =
stripmineSink(forOp, factor, ArrayRef<OpPointer<AffineForOp>>{target});
assert(res.size() == 1 && "Expected 1 inner forOp");
return res[0];
}
SmallVector<SmallVector<OpPointer<AffineForOp>, 8>, 8>
mlir::tile(ArrayRef<OpPointer<AffineForOp>> forOps, ArrayRef<uint64_t> sizes,
ArrayRef<OpPointer<AffineForOp>> targets) {
SmallVector<SmallVector<OpPointer<AffineForOp>, 8>, 8> res;
SmallVector<OpPointer<AffineForOp>, 8> currentTargets(targets.begin(),
targets.end());
for (auto it : llvm::zip(forOps, sizes)) {
auto step = stripmineSink(std::get<0>(it), std::get<1>(it), currentTargets);
res.push_back(step);
currentTargets = step;
}
return res;
}
SmallVector<OpPointer<AffineForOp>, 8>
mlir::tile(ArrayRef<OpPointer<AffineForOp>> forOps, ArrayRef<uint64_t> sizes,
OpPointer<AffineForOp> target) {
return tile(forOps, sizes, ArrayRef<OpPointer<AffineForOp>>{target})[0];
}


@@ -127,3 +127,22 @@ func @second_order_callee(() -> ()) -> (() -> (index))
func @call_indirect() {
return
}
// This function will be detected by the test pass that will insert an
// EDSC-constructed computation that corresponds to an imperfectly nested
// loop nest with 2 common outer loops and 2 inner 1-d loops.
// CHECK-LABEL: func @tile_2d
// CHECK: for %i0 = #[[idmap]]({{.*}}) to #[[idmap]]({{.*}}) step 512 {
// CHECK: for %i1 = #[[idmap]]({{.*}}) to #[[idmap]]({{.*}}) step 1024 {
// CHECK: for %i2 = #[[idmap]]({{.*}}) to #[[idmap]]({{.*}}) {
// CHECK: for %i3 = max #{{.*}}, %i0) to min #{{.*}}, %i0) step 16 {
// CHECK: for %i4 = max #{{.*}}, %i1) to min #{{.*}}, %i1) step 32 {
// CHECK: for %i5 = max #{{.*}}, %i1, %i4) to min #{{.*}}, %i1, %i4) {
// CHECK: for %i6 = max #{{.*}}, %i0, %i3) to min #{{.*}}, %i0, %i3) {
// CHECK: for %i7 = #[[idmap]]({{.*}}) to #[[idmap]]({{.*}}) {
// CHECK: for %i8 = max #{{.*}}, %i0) to min #{{.*}}, %i0) {
// CHECK: for %i9 = max #{{.*}}, %i1) to min #{{.*}}, %i1) {
func @tile_2d(%arg0: memref<?x?x?xf32>, %arg1: memref<?x?x?xf32>, %arg2: memref<?x?x?xf32>) {
return
}