Uniformize the API for the mlir::tile functions on AffineForOp and loop::ForOp
This CL adapts the recently introduced parametric tiling to have an API matching the tiling of AffineForOp. The transformation based on stripmineSink is more general and produces imperfectly nested loops. Perfect nesting invariants of the tiled version are recovered by selectively hoisting ops so as to isolate perfectly nested bands. Such hoisting may fail to produce a perfect loop nest when a ForOp transitively depends on enclosing induction variables. In such cases the API returns a LogicalResult, but SimpleParametricLoopTilingPass does not currently use this result.

A new unit test is added with a triangular loop for which the perfect nesting property does not hold. For this example, the old behavior was to produce IR that did not verify (some use was not dominated by its def).

PiperOrigin-RevId: 258928309
Commit 5bc344743c (parent 28057ff3da)
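For orientation, a minimal usage sketch of the uniformized entry points (hedged: `root`, `ts0`, and `ts1` are hypothetical values; only the function names, signatures, and return types come from this commit):

    // Parametric tiling of a perfectly nested loop::ForOp band rooted at
    // `root`, with SSA tile sizes `ts0` and `ts1`; returns the newly
    // created intra-tile (point) loops.
    Loops intraTile = mlir::tilePerfectlyNested(root, {ts0, ts1});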
@@ -134,9 +134,14 @@ void sinkLoop(AffineForOp forOp, unsigned loopDepth);
 /// occurrence in `forOps`, under each of the `targets`.
 /// Returns the new AffineForOps, one per each of (`forOps`, `targets`) pair,
 /// nested immediately under each of `targets`.
+using Loops = SmallVector<loop::ForOp, 8>;
+using TileLoops = std::pair<Loops, Loops>;
 SmallVector<SmallVector<AffineForOp, 8>, 8> tile(ArrayRef<AffineForOp> forOps,
                                                  ArrayRef<uint64_t> sizes,
                                                  ArrayRef<AffineForOp> targets);
+SmallVector<Loops, 8> tile(ArrayRef<loop::ForOp> forOps,
+                           ArrayRef<Value *> sizes,
+                           ArrayRef<loop::ForOp> targets);
 
 /// Performs tiling (with interchange) by strip-mining the `forOps` by `sizes`
 /// and sinking them, in their order of occurrence in `forOps`, under `target`.
@@ -144,17 +149,21 @@ SmallVector<SmallVector<AffineForOp, 8>, 8> tile(ArrayRef<AffineForOp> forOps,
 /// `target`.
 SmallVector<AffineForOp, 8> tile(ArrayRef<AffineForOp> forOps,
                                  ArrayRef<uint64_t> sizes, AffineForOp target);
+Loops tile(ArrayRef<loop::ForOp> forOps, ArrayRef<Value *> sizes,
+           loop::ForOp target);
 
-/// Tile a nest of standard for loops rooted at `rootForOp` with the given
+/// Tile a nest of loop::ForOp loops rooted at `rootForOp` with the given
 /// (parametric) sizes. Sizes are expected to be strictly positive values at
-/// runtime. If more sizes than loops provided, discard the trailing values in
-/// sizes. Assumes the loop nest is permutable.
-void tile(loop::ForOp rootForOp, ArrayRef<Value *> sizes);
+/// runtime. If more sizes than loops are provided, discard the trailing values
+/// in sizes. Assumes the loop nest is permutable.
+/// Returns the newly created intra-tile loops.
+Loops tilePerfectlyNested(loop::ForOp rootForOp, ArrayRef<Value *> sizes);
 
 /// Tile a nest of standard for loops rooted at `rootForOp` by finding such
 /// parametric tile sizes that the outer loops have a fixed number of iterations
 /// as defined in `sizes`.
-void extractFixedOuterLoops(loop::ForOp rootForOp, ArrayRef<int64_t> sizes);
+TileLoops extractFixedOuterLoops(loop::ForOp rootForOp,
+                                 ArrayRef<int64_t> sizes);
 
 /// Replace a perfect nest of "for" loops with a single linearized loop. Assumes
 /// `loops` contains a list of perfectly nested loops with bounds and steps
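A hedged sketch of consuming the new TileLoops result (variable names are illustrative; `root` is assumed to be the root of a perfectly nested loop::ForOp band):

    // Derive parametric tile sizes so that the two outer loops execute 7 and
    // 4 iterations respectively, mirroring -outer-loop-sizes=7,4 in the test
    // below; returns the (inter-tile, intra-tile) loop bands.
    TileLoops tileLoops = mlir::extractFixedOuterLoops(root, /*sizes=*/{7, 4});
    Loops &interTile = tileLoops.first;   // outer loops with fixed trip counts
    Loops &intraTile = tileLoops.second;  // parametric intra-tile (point) loops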
@@ -16,5 +16,5 @@ add_llvm_library(MLIRAnalysis STATIC
   ADDITIONAL_HEADER_DIRS
   ${MLIR_MAIN_INCLUDE_DIR}/mlir/Analysis
   )
-add_dependencies(MLIRAnalysis MLIRAffineOps)
-target_link_libraries(MLIRAnalysis MLIRAffineOps)
+add_dependencies(MLIRAnalysis MLIRAffineOps MLIRLoopOps)
+target_link_libraries(MLIRAnalysis MLIRAffineOps MLIRLoopOps)
@@ -22,6 +22,7 @@
 #include "mlir/Analysis/SliceAnalysis.h"
 #include "mlir/AffineOps/AffineOps.h"
 #include "mlir/Analysis/VectorAnalysis.h"
+#include "mlir/Dialect/LoopOps/LoopOps.h"
 #include "mlir/IR/Operation.h"
 #include "mlir/Support/Functional.h"
 #include "mlir/Support/STLExtras.h"
@@ -54,8 +55,13 @@ static void getForwardSliceImpl(Operation *op,
     for (auto *ownerInst : forOp.getInductionVar()->getUsers())
       if (forwardSlice->count(ownerInst) == 0)
         getForwardSliceImpl(ownerInst, forwardSlice, filter);
+  } else if (auto forOp = dyn_cast<loop::ForOp>(op)) {
+    for (auto *ownerInst : forOp.getInductionVar()->getUsers())
+      if (forwardSlice->count(ownerInst) == 0)
+        getForwardSliceImpl(ownerInst, forwardSlice, filter);
   } else {
-    assert(op->getNumResults() <= 1 && "NYI: multiple results");
+    assert(op->getNumRegions() == 0 && "unexpected generic op with regions");
+    assert(op->getNumResults() <= 1 && "unexpected multiple results");
     if (op->getNumResults() > 0) {
       for (auto *ownerInst : op->getResult(0)->getUsers())
         if (forwardSlice->count(ownerInst) == 0)
@@ -87,6 +93,10 @@ static void getBackwardSliceImpl(Operation *op,
     return;
   }
 
+  assert((op->getNumRegions() == 0 || isa<AffineForOp>(op) ||
+          isa<loop::ForOp>(op)) &&
+         "unexpected generic op with regions");
+
   // Evaluate whether we should keep this def.
   // This is useful in particular to implement scoping; i.e. return the
   // transitive forwardSlice in the current scope.
@@ -19,20 +19,89 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "mlir/Analysis/SliceAnalysis.h"
 #include "mlir/Dialect/LoopOps/LoopOps.h"
+#include "mlir/IR/Builders.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/LoopUtils.h"
 #include "mlir/Transforms/Passes.h"
 
-#include "mlir/IR/Builders.h"
+#include "llvm/ADT/SetVector.h"
 
 using namespace mlir;
+using llvm::SetVector;
 
 static llvm::cl::list<int> clOuterLoopSizes(
     "outer-loop-sizes", llvm::cl::MiscFlags::CommaSeparated,
     llvm::cl::desc(
         "fixed number of iterations that the outer loops should have"));
 
+// Hoist the ops within `outer` that appear before `inner`.
+// Such ops include the ops that have been introduced by parametric tiling.
+// Ops that come from triangular loops (i.e. that belong to the program slice
+// rooted at `outer`) and ops that have side effects cannot be hoisted.
+// Returns failure when any op fails to hoist.
+static LogicalResult hoistOpsBetween(loop::ForOp outer, loop::ForOp inner) {
+  SetVector<Operation *> forwardSlice;
+  getForwardSlice(outer.getOperation(), &forwardSlice, [&inner](Operation *op) {
+    return op != inner.getOperation();
+  });
+  LogicalResult status = success();
+  SmallVector<Operation *, 8> toHoist;
+  for (auto &op : outer.getBody()->getOperations()) {
+    // Stop when encountering the inner loop.
+    if (&op == inner.getOperation())
+      break;
+    // Skip over non-hoistable ops.
+    if (forwardSlice.count(&op) > 0) {
+      status = failure();
+      continue;
+    }
+    // Skip loop::ForOp, these are not considered a failure.
+    if (isa<loop::ForOp>(op))
+      continue;
+    // Skip other ops with regions.
+    if (op.getNumRegions() > 0) {
+      status = failure();
+      continue;
+    }
+    // Skip if op has side effects.
+    // TODO(ntv): loads to immutable memory regions are ok.
+    if (!op.hasNoSideEffect()) {
+      status = failure();
+      continue;
+    }
+    toHoist.push_back(&op);
+  }
+  auto *outerForOp = outer.getOperation();
+  for (auto *op : toHoist)
+    op->moveBefore(outerForOp);
+  return status;
+}
+
+// Traverse the interTile and intraTile loops and try to hoist ops such that
+// bands of perfectly nested loops are isolated.
+// Returns failure if either perfect interTile or perfect intraTile bands
+// cannot be formed.
+static LogicalResult tryIsolateBands(const SmallVector<TileLoops, 8> &loops) {
+  LogicalResult status = success();
+  for (auto &tl : loops) {
+    auto &interTile = tl.first;
+    auto &intraTile = tl.second;
+    auto size = interTile.size();
+    assert(size == intraTile.size());
+    if (size <= 1)
+      continue;
+    for (unsigned s = 1; s < size; ++s)
+      status = succeeded(status) ? hoistOpsBetween(intraTile[0], intraTile[s])
+                                 : failure();
+    for (unsigned s = 1; s < size; ++s)
+      status = succeeded(status) ? hoistOpsBetween(interTile[0], interTile[s])
+                                 : failure();
+  }
+  return status;
+}
+
 namespace {
 // Extracts fixed-range loops for top-level loop nests with ranges defined in
 // the pass constructor. Assumes loops are permutable.
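The triangular unit test at the bottom of this commit exercises exactly this split: the constant adjustment for the inner range computation (subi %c2, %c1) can be hoisted out of the outer loop, while the bound computation that uses the outer induction variable belongs to the forward slice rooted at `outer` and must stay put, so hoistOpsBetween reports failure and the band remains imperfectly nested.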
@@ -45,12 +114,18 @@ public:
   void runOnFunction() override {
     FuncOp func = getFunction();
 
-    func.walk<loop::ForOp>([this](loop::ForOp op) {
+    SmallVector<TileLoops, 8> loops;
+    func.walk<loop::ForOp>([this, &loops](loop::ForOp op) {
       // Ignore nested loops.
       if (op.getContainingRegion()->getParentOfType<loop::ForOp>())
         return;
-      extractFixedOuterLoops(op, sizes);
+      loops.push_back(extractFixedOuterLoops(op, sizes));
     });
+
+    // TODO(ntv, zinenko) for now we just ignore the result of band isolation.
+    // In the future, mapping decisions may be impacted by the ability to
+    // isolate perfectly nested bands.
+    tryIsolateBands(loops);
   }
 
   SmallVector<int64_t, 4> sizes;
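A client that wanted to act on the band-isolation result rather than drop it could do so along these lines (a hypothetical variant of the walk above, not part of this commit):

    // Hypothetical: escalate imperfect nesting to a hard pass failure.
    if (failed(tryIsolateBands(loops)))
      signalPassFailure();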
@@ -37,6 +37,7 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 
+#include "mlir/IR/Module.h"
 
@@ -687,8 +688,8 @@ static void augmentMapAndBounds(OpBuilder &b, Value *iv, AffineMap *map,
 // substituting `oldIv` in place of
 // `forOp.getInductionVariable()` and ignoring the terminator.
 // Note: `newForOp` may be nested under `forOp`.
-static void cloneLoopBodyInto(AffineForOp forOp, Value *oldIv,
-                              AffineForOp newForOp) {
+template <typename ForOpType>
+void cloneLoopBodyInto(ForOpType forOp, Value *oldIv, ForOpType newForOp) {
   BlockAndValueMapping map;
   map.map(oldIv, newForOp.getInductionVar());
   OpBuilder b = newForOp.getBodyBuilder();
@@ -697,7 +698,7 @@ static void cloneLoopBodyInto(AffineForOp forOp, Value *oldIv,
     if (&op == newForOp.getOperation()) {
       continue;
     }
-    if (isa<AffineTerminatorOp>(op)) {
+    if (op.isKnownTerminator()) {
       continue;
     }
     auto *instClone = b.clone(op, map);
@@ -721,10 +722,6 @@ static void cloneLoopBodyInto(AffineForOp forOp, Value *oldIv,
 static SmallVector<AffineForOp, 8>
 stripmineSink(AffineForOp forOp, uint64_t factor,
               ArrayRef<AffineForOp> targets) {
-  // TODO(ntv): Use cheap structural assertions that targets are nested under
-  // forOp and that targets are not nested under each other when DominanceInfo
-  // exposes the capability. It seems overkill to construct a whole function
-  // dominance tree at this point.
   auto originalStep = forOp.getStep();
   auto scaledStep = originalStep * factor;
   forOp.setStep(scaledStep);
@@ -762,20 +759,61 @@ stripmineSink(AffineForOp forOp, uint64_t factor,
   return innerLoops;
 }
 
+static Loops stripmineSink(loop::ForOp forOp, Value *factor,
+                           ArrayRef<loop::ForOp> targets) {
+  auto *originalStep = forOp.step();
+  auto *iv = forOp.getInductionVar();
+
+  OpBuilder b(forOp);
+  forOp.setStep(b.create<MulIOp>(forOp.getLoc(), originalStep, factor));
+
+  Loops innerLoops;
+  for (auto t : targets) {
+    // Save information for splicing ops out of t when done
+    auto begin = t.getBody()->begin();
+    auto nOps = t.getBody()->getOperations().size();
+
+    // Insert newForOp before the terminator of `t`.
+    OpBuilder b(t.getBodyBuilder());
+    Value *stepped = b.create<AddIOp>(t.getLoc(), iv, forOp.step());
+    Value *less = b.create<CmpIOp>(t.getLoc(), CmpIPredicate::SLT,
+                                   forOp.upperBound(), stepped);
+    Value *ub =
+        b.create<SelectOp>(t.getLoc(), less, forOp.upperBound(), stepped);
+
+    // Splice [begin, begin + nOps - 1) into `newForOp` and replace uses.
+    auto newForOp = b.create<loop::ForOp>(t.getLoc(), iv, ub, originalStep);
+    newForOp.getBody()->getOperations().splice(
+        newForOp.getBody()->getOperations().begin(),
+        t.getBody()->getOperations(), begin, std::next(begin, nOps - 1));
+    replaceAllUsesInRegionWith(iv, newForOp.getInductionVar(),
+                               newForOp.region());
+
+    innerLoops.push_back(newForOp);
+  }
+
+  return innerLoops;
+}
+
 // Stripmines a `forOp` by `factor` and sinks it under a single `target`.
 // Returns the new AffineForOps, nested immediately under `target`.
-AffineForOp stripmineSink(AffineForOp forOp, uint64_t factor,
-                          AffineForOp target) {
-  auto res = stripmineSink(forOp, factor, ArrayRef<AffineForOp>{target});
+template <typename ForType, typename SizeType>
+static ForType stripmineSink(ForType forOp, SizeType factor, ForType target) {
+  // TODO(ntv): Use cheap structural assertions that targets are nested under
+  // forOp and that targets are not nested under each other when DominanceInfo
+  // exposes the capability. It seems overkill to construct a whole function
+  // dominance tree at this point.
+  auto res = stripmineSink(forOp, factor, ArrayRef<ForType>{target});
   assert(res.size() == 1 && "Expected 1 inner forOp");
   return res[0];
 }
 
-SmallVector<SmallVector<AffineForOp, 8>, 8>
-mlir::tile(ArrayRef<AffineForOp> forOps, ArrayRef<uint64_t> sizes,
-           ArrayRef<AffineForOp> targets) {
-  SmallVector<SmallVector<AffineForOp, 8>, 8> res;
-  SmallVector<AffineForOp, 8> currentTargets(targets.begin(), targets.end());
+template <typename ForType, typename SizeType>
+static SmallVector<SmallVector<ForType, 8>, 8>
+tileImpl(ArrayRef<ForType> forOps, ArrayRef<SizeType> sizes,
+         ArrayRef<ForType> targets) {
+  SmallVector<SmallVector<ForType, 8>, 8> res;
+  SmallVector<ForType, 8> currentTargets(targets.begin(), targets.end());
   for (auto it : llvm::zip(forOps, sizes)) {
     auto step = stripmineSink(std::get<0>(it), std::get<1>(it), currentTargets);
     res.push_back(step);
@@ -784,76 +822,42 @@ mlir::tile(ArrayRef<AffineForOp> forOps, ArrayRef<uint64_t> sizes,
   return res;
 }
 
+SmallVector<SmallVector<AffineForOp, 8>, 8>
+mlir::tile(ArrayRef<AffineForOp> forOps, ArrayRef<uint64_t> sizes,
+           ArrayRef<AffineForOp> targets) {
+  return tileImpl(forOps, sizes, targets);
+}
+
+SmallVector<Loops, 8> mlir::tile(ArrayRef<loop::ForOp> forOps,
+                                 ArrayRef<Value *> sizes,
+                                 ArrayRef<loop::ForOp> targets) {
+  return tileImpl(forOps, sizes, targets);
+}
+
+template <typename ForType, typename SizeType>
+static SmallVector<ForType, 8>
+tileImpl(ArrayRef<ForType> forOps, ArrayRef<SizeType> sizes, ForType target) {
+  SmallVector<ForType, 8> res;
+  for (auto loops : tile(forOps, sizes, ArrayRef<ForType>{target})) {
+    assert(loops.size() == 1);
+    res.push_back(loops[0]);
+  }
+  return res;
+}
+
 SmallVector<AffineForOp, 8> mlir::tile(ArrayRef<AffineForOp> forOps,
                                        ArrayRef<uint64_t> sizes,
                                        AffineForOp target) {
-  return tile(forOps, sizes, ArrayRef<AffineForOp>{target})[0];
+  return tileImpl(forOps, sizes, target);
 }
 
-// Tile the given nest of standard for loops with the given (parametric) sizes.
-// Sizes are expected to be strictly positive values at runtime. If more
-// sizes than loops are provided, discard the trailing values in sizes. When
-// applied to a loop nest
-//   for %i_0 = %lb_0 to %ub_0 step %s_0 {
-//     for %i_1 = %lb_1 to %ub_1 step %s_1 {
-//       "op"(%i0, %i1) : (index, index) -> () }}
-// this splits the loops into tile loops with step %s_j * sizes[j] and the
-// original bounds, and the point loops iterating from %i_j to
-// min(%i_j + %s_j * sizes[j], %ub_j) with the original step. No verification
-// of `forOps` being suitable for tiling is performed, this function only
-// applies the transformation.
-static void tile(MutableArrayRef<loop::ForOp> forOps, ArrayRef<Value *> sizes) {
-  assert(sizes.size() >= forOps.size() && "insufficient number of tile sizes");
-  if (sizes.empty() || forOps.empty())
-    return;
-
-  loop::ForOp rootForOp = forOps.front();
-  OpBuilder builder(rootForOp);
-
-  // Compute new steps for the outer loops.
-  SmallVector<Value *, 4> newSteps;
-  newSteps.reserve(sizes.size());
-  for (unsigned i = 0, e = sizes.size(); i < e; ++i) {
-    auto op = forOps[i];
-    Value *newStep = builder.create<MulIOp>(op.getLoc(), op.step(), sizes[i]);
-    newSteps.push_back(newStep);
-  }
-
-  // Create new outer loops nested one into another.
-  SmallVector<loop::ForOp, 4> outerForOps;
-  for (unsigned i = 0, e = sizes.size(); i < e; ++i) {
-    auto outerForOp =
-        builder.create<loop::ForOp>(forOps[i].getLoc(), forOps[i].lowerBound(),
-                                    forOps[i].upperBound(), newSteps[i]);
-    builder.setInsertionPointToStart(outerForOp.getBody());
-    outerForOps.push_back(outerForOp);
-  }
-
-  // Move the outermost original loop into the innermost new outer loop. Thus
-  // the body of the original loops does not need updating.
-  auto lastOuterForOp = outerForOps.back();
-  lastOuterForOp.getBody()->getOperations().splice(
-      lastOuterForOp.getBody()->getOperations().begin(),
-      rootForOp.getOperation()->getBlock()->getOperations(),
-      rootForOp.getOperation());
-
-  // Immediately before the (now sunk) outermost original loop, insert the
-  // computation of the upper bounds of the inner loops. Update the bounds of
-  // the original loops to make them point loops.
-  builder.setInsertionPointToStart(lastOuterForOp.getBody());
-  for (unsigned i = 0, e = sizes.size(); i < e; ++i) {
-    Value *stepped = builder.create<AddIOp>(
-        forOps[i].getLoc(), outerForOps[i].getInductionVar(), newSteps[i]);
-    Value *less = builder.create<CmpIOp>(forOps[i].getLoc(), CmpIPredicate::SLT,
-                                         forOps[i].upperBound(), stepped);
-    Value *upperBound = builder.create<SelectOp>(
-        forOps[i].getLoc(), less, forOps[i].upperBound(), stepped);
-    forOps[i].setLowerBound(outerForOps[i].getInductionVar());
-    forOps[i].setUpperBound(upperBound);
-  }
+Loops mlir::tile(ArrayRef<loop::ForOp> forOps, ArrayRef<Value *> sizes,
+                 loop::ForOp target) {
+  return tileImpl(forOps, sizes, target);
 }
 
-void mlir::tile(loop::ForOp rootForOp, ArrayRef<Value *> sizes) {
+Loops mlir::tilePerfectlyNested(loop::ForOp rootForOp,
+                                ArrayRef<Value *> sizes) {
   // Collect perfectly nested loops. If more size values provided than nested
   // loops available, truncate `sizes`.
   SmallVector<loop::ForOp, 4> forOps;
@@ -862,7 +866,7 @@ void mlir::tile(loop::ForOp rootForOp, ArrayRef<Value *> sizes) {
   if (forOps.size() < sizes.size())
     sizes = sizes.take_front(forOps.size());
 
-  return ::tile(forOps, sizes);
+  return ::tile(forOps, sizes, forOps.back());
 }
 
 // Build the IR that performs ceil division of a positive value by a constant:
@@ -893,8 +897,8 @@ static Value *ceilDivPositive(OpBuilder &builder, Location loc, Value *dividend,
   return builder.create<DivISOp>(loc, sum, divisor);
 }
 
-void mlir::extractFixedOuterLoops(loop::ForOp rootForOp,
-                                  ArrayRef<int64_t> sizes) {
+TileLoops mlir::extractFixedOuterLoops(loop::ForOp rootForOp,
+                                       ArrayRef<int64_t> sizes) {
   // Collect perfectly nested loops. If more size values provided than nested
   // loops available, truncate `sizes`.
   SmallVector<loop::ForOp, 4> forOps;
@@ -903,9 +907,6 @@ void mlir::extractFixedOuterLoops(loop::ForOp rootForOp,
   if (forOps.size() < sizes.size())
     sizes = sizes.take_front(forOps.size());
 
-  OpBuilder builder(rootForOp);
-  auto loc = rootForOp.getLoc();
-
   // Compute the tile sizes such that i-th outer loop executes size[i]
   // iterations. Given that the loop currently executes
   //   numIterations = ceildiv((upperBound - lowerBound), step)
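For concreteness, with the constants of the @rectangular test below (lowerBound = 2, upperBound = 44, step = 1) and a requested outer trip count of 7: numIterations = ceildiv(44 - 2, 1) = 42, the parametric tile size is ceildiv(42, 7) = 6, the new outer step is 1 * 6 = 6, and the outer loop indeed executes ceildiv(42, 6) = 7 iterations.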
@@ -916,6 +917,8 @@ void mlir::extractFixedOuterLoops(loop::ForOp rootForOp,
     assert(sizes[i] > 0 && "expected strictly positive size for strip-mining");
 
     auto forOp = forOps[i];
+    OpBuilder builder(forOp);
+    auto loc = forOp.getLoc();
     Value *diff =
         builder.create<SubIOp>(loc, forOp.upperBound(), forOp.lowerBound());
     Value *numIterations = ceilDivPositive(builder, loc, diff, forOp.step());
@@ -925,7 +928,8 @@ void mlir::extractFixedOuterLoops(loop::ForOp rootForOp,
   }
 
   // Call parametric tiling with the given sizes.
-  return ::tile(forOps, sizes);
+  auto intraTile = tile(forOps, tileSizes, forOps.back());
+  return std::make_pair(forOps, intraTile);
 }
 
 // Replaces all uses of `orig` with `replacement` except if the user is listed
@@ -1,8 +1,8 @@
 // RUN: mlir-opt -extract-fixed-outer-loops -outer-loop-sizes=7 %s | FileCheck %s --check-prefixes=COMMON,TILE_7
 // RUN: mlir-opt -extract-fixed-outer-loops -outer-loop-sizes=7,4 %s | FileCheck %s --check-prefixes=COMMON,TILE_74
 
-// COMMON-LABEL: @foo
-func @foo(%arg0: memref<?x?xf32>) {
+// COMMON-LABEL: @rectangular
+func @rectangular(%arg0: memref<?x?xf32>) {
   %c2 = constant 2 : index
   %c44 = constant 44 : index
   %c1 = constant 1 : index
@@ -17,6 +17,8 @@ func @foo(%arg0: memref<?x?xf32>) {
   // Ceildiv to get the parametric tile size.
   // COMMON: %[[sum:.*]] = addi %[[range]], %c6
   // COMMON-NEXT: %[[size:.*]] = divis %[[sum]], %c7
+  // New outer step (original is %c1).
+  // COMMON-NEXT: %[[step:.*]] = muli %c1, %[[size]]
 
   // Range of the second original loop
   //   (upper - lower + step - 1) / step
@@ -29,10 +31,8 @@ func @foo(%arg0: memref<?x?xf32>) {
   // Ceildiv to get the parametric tile size for the second original loop.
   // TILE_74: %[[sum2:.*]] = addi %[[range2]], %c3
   // TILE_74-NEXT: %[[size2:.*]] = divis %[[sum2]], %c4
 
-  // New step(s) (original is %c1 and %c2).
-  // COMMON: %[[step:.*]] = muli %c1, %[[size]]
-  // TILE_74: %[[step2:.*]] = muli %c2, %[[size2]]
+  // New inner step (original is %c2).
+  // TILE_74-NEXT: %[[step2:.*]] = muli %c2, %[[size2]]
 
   // Updated outer loop(s) use new steps.
   // COMMON: loop.for %[[i:.*]] = %c2 to %c44 step %[[step]]
@@ -64,3 +64,70 @@ func @foo(%arg0: memref<?x?xf32>) {
     }
   }
   return
 }
+
+// COMMON-LABEL: @triangular
+func @triangular(%arg0: memref<?x?xf32>) {
+  %c2 = constant 2 : index
+  %c44 = constant 44 : index
+  %c1 = constant 1 : index
+  // Range of the original outer loop:
+  //   (upper - lower + step - 1) / step
+  // where step is known to be %c1.
+  // COMMON: %[[diff:.*]] = subi %c44, %c2
+  // COMMON: %[[adjustment:.*]] = subi %c1, %c1_{{.*}}
+  // COMMON-NEXT: %[[diff_adj:.*]] = addi %[[diff]], %[[adjustment]]
+  // COMMON-NEXT: %[[range:.*]] = divis %[[diff_adj]], %c1
+
+  // Ceildiv to get the parametric tile size.
+  // COMMON: %[[sum:.*]] = addi %[[range]], %c6
+  // COMMON-NEXT: %[[size:.*]] = divis %[[sum]], %c7
+  // New outer step (original is %c1).
+  // COMMON-NEXT: %[[step:.*]] = muli %c1, %[[size]]
+
+  // Constant adjustment for the inner loop has been hoisted out.
+  // TILE_74: %[[adjustment2:.*]] = subi %c2, %c1_{{.*}}
+
+  // New outer loop.
+  // COMMON: loop.for %[[i:.*]] = %c2 to %c44 step %[[step]]
+
+  // Range of the original inner loop
+  //   (upper - lower + step - 1) / step
+  // where step is known to be %c2.
+  // TILE_74: %[[diff2:.*]] = subi %[[i]], %c1
+  // TILE_74-NEXT: %[[diff2_adj:.*]] = addi %[[diff2]], %[[adjustment2]]
+  // TILE_74-NEXT: %[[range2:.*]] = divis %[[diff2_adj]], %c2
+
+  // Ceildiv to get the parametric tile size for the second original loop.
+  // TILE_74: %[[sum2:.*]] = addi %[[range2]], %c3
+  // TILE_74-NEXT: %[[size2:.*]] = divis %[[sum2]], %c4
+  // New inner step (original is %c2).
+  // TILE_74-NEXT: %[[step2:.*]] = muli %c2, %[[size2]]
+
+  // New inner loop.
+  // TILE_74: loop.for %[[j:.*]] = %c1 to %[[i]] step %[[step2]]
+  loop.for %i = %c2 to %c44 step %c1 {
+    // Upper bound for the inner loop min(%i + %step, %c44).
+    // COMMON: %[[stepped:.*]] = addi %[[i]], %[[step]]
+    // COMMON-NEXT: cmpi "slt", %c44, %[[stepped]]
+    // COMMON-NEXT: %[[ub:.*]] = select {{.*}}, %c44, %[[stepped]]
+    // TILE_74: %[[stepped2:.*]] = addi %[[j]], %[[step2]]
+    // TILE_74-NEXT: cmpi "slt", %[[i]], %[[stepped2]]
+    // TILE_74-NEXT: %[[ub2:.*]] = select {{.*}}, %[[i]], %[[stepped2]]
+    //
+    // Created inner loop.
+    // COMMON: loop.for %[[ii:.*]] = %[[i]] to %[[ub:.*]] step %c1
+
+    // This loop is not modified in the TILE_7 case.
+    // TILE_7: loop.for %[[j:.*]] = %c1 to %[[ii]] step %c2
+    //
+    // But it is modified in the TILE_74 case.
+    // TILE_74: loop.for %[[jj:.*]] = %[[j]] to %[[ub2]] step %c2
+    loop.for %j = %c1 to %i step %c2 {
+      // The right iterators are used.
+      // TILE_7: load %arg0[%[[ii]], %[[j]]]
+      // TILE_74: load %arg0[%[[ii]], %[[jj]]]
+      load %arg0[%i, %j]: memref<?x?xf32>
+    }
+  }
+  return
+}