Introduce loop coalescing utility and a simple pass

Multiple (perfectly) nested loops with independent bounds can be combined into a single loop and then subdivided into blocks of arbitrary size for load balancing or more efficient parallelism exploitation. However, MLIR wants to preserve the multi-dimensional multi-loop structure at higher levels of abstraction. Introduce a transformation that coalesces nested loops with independent bounds so that they can be further subdivided by tiling. PiperOrigin-RevId: 258151016
2019-07-15 06:40:07 -07:00 · 2019-07-15 06:40:07 -07:00 · fc044e8929
parent 4de019901b
commit fc044e8929
7 changed files with 423 additions and 0 deletions
--- a/mlir/include/mlir/Dialect/LoopOps/LoopOps.td
+++ b/mlir/include/mlir/Dialect/LoopOps/LoopOps.td
@ -86,6 +86,7 @@ def ForOp : Loop_Op<"for"> {
    }
    void setLowerBound(Value *bound) { getOperation()->setOperand(0, bound); }
    void setUpperBound(Value *bound) { getOperation()->setOperand(1, bound); }
+    void setStep(Value *step) { getOperation()->setOperand(2, step); }
  }];
 }

--- a/mlir/include/mlir/Transforms/LoopUtils.h
+++ b/mlir/include/mlir/Transforms/LoopUtils.h
@ -57,6 +57,8 @@ LogicalResult loopUnrollUpToFactor(AffineForOp forOp, uint64_t unrollFactor);
 /// AffineForOp, and the second op is a terminator).
 void getPerfectlyNestedLoops(SmallVectorImpl<AffineForOp> &nestedLoops,
                             AffineForOp root);
+void getPerfectlyNestedLoops(SmallVectorImpl<loop::ForOp> &nestedLoops,
+                             loop::ForOp root);

 /// Unrolls and jams this loop by the specified factor. Returns success if the
 /// loop is successfully unroll-jammed.
@ -154,6 +156,11 @@ void tile(loop::ForOp rootForOp, ArrayRef<Value *> sizes);
 /// as defined in `sizes`.
 void extractFixedOuterLoops(loop::ForOp rootFOrOp, ArrayRef<int64_t> sizes);

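+/// Replace a perfect nest of "for" loops with a single linearized loop. Assumes
+/// `loops` contains a list of perfectly nested loops, ordered from the
+/// outermost to the innermost, with bounds and steps independent of any loop
+/// induction variable involved in the nest. Steps are assumed to be strictly
+/// positive. Does nothing if `loops` contains fewer than two loops.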
+void coalesceLoops(MutableArrayRef<loop::ForOp> loops);
+
 } // end namespace mlir

 #endif // MLIR_TRANSFORMS_LOOP_UTILS_H
--- a/mlir/include/mlir/Transforms/Passes.h
+++ b/mlir/include/mlir/Transforms/Passes.h
@ -105,6 +105,10 @@ FunctionPassBase *createLoopTilingPass(uint64_t cacheSizeBytes);
 FunctionPassBase *
 createSimpleParametricTilingPass(ArrayRef<int64_t> outerLoopSizes);

+/// Creates a pass that transforms perfectly nested loops with independent
+/// bounds into a single loop.
+FunctionPassBase *createLoopCoalescingPass();
+
 /// Promotes all accessed memref regions to the specified faster memory space
 /// while generating DMAs to move data.
 FunctionPassBase *createDmaGenerationPass(
--- a/mlir/lib/Transforms/CMakeLists.txt
+++ b/mlir/lib/Transforms/CMakeLists.txt
@ -5,6 +5,7 @@ add_llvm_library(MLIRTransforms
  CSE.cpp
  DialectConversion.cpp
  DmaGeneration.cpp
+  LoopCoalescing.cpp
  LoopFusion.cpp
  LoopInvariantCodeMotion.cpp
  LoopParametricTiling.cpp
--- a/mlir/lib/Transforms/LoopCoalescing.cpp
+++ b/mlir/lib/Transforms/LoopCoalescing.cpp
@ -0,0 +1,105 @@
+//===- LoopCoalescing.cpp - Pass transforming loop nests into single loops-===//
+//
+// Copyright 2019 The MLIR Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+
+#include "mlir/Dialect/LoopOps/LoopOps.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/StandardOps/Ops.h"
+#include "mlir/Transforms/LoopUtils.h"
+#include "mlir/Transforms/Passes.h"
+#include "mlir/Transforms/RegionUtils.h"
+#include "llvm/Support/Debug.h"
+
+#define PASS_NAME "loop-coalescing"
+#define DEBUG_TYPE PASS_NAME
+
+using namespace mlir;
+
+namespace {
+class LoopCoalescingPass : public FunctionPass<LoopCoalescingPass> { // Coalesces bands of perfectly nested loop.for ops found in a function.
+public:
+  void runOnFunction() override {
+    FuncOp func = getFunction();
+
+    func.walk<loop::ForOp>([](loop::ForOp op) {
+      // Ignore nested loops: only loops without a loop.for ancestor are used as
+      // roots, the loops nested in them are collected from the root below.
+      if (op.getParentOfType<loop::ForOp>())
+        return;
+
+      SmallVector<loop::ForOp, 4> loops;
+      getPerfectlyNestedLoops(loops, op);
+      LLVM_DEBUG(llvm::dbgs()
+                 << "found a perfect nest of depth " << loops.size() << '\n');
+
+      // Look for a band of loops that can be coalesced, i.e. perfectly nested
+      // loops with bounds defined above some loop.
+      // 1. For each loop, find the outermost loop of the nest above which all
+      // of its operands (bounds and step) are defined.
+      SmallVector<unsigned, 4> operandsDefinedAbove(loops.size());
+      for (unsigned i = 0, e = loops.size(); i < e; ++i) {
+        operandsDefinedAbove[i] = i; // Default: no enclosing loop of the nest qualifies.
+        for (unsigned j = 0; j < i; ++j) {
+          if (areValuesDefinedAbove(loops[i].getOperands(),
+                                    loops[j].region())) {
+            operandsDefinedAbove[i] = j;
+            break; // Loops are scanned outermost first, keep the first match.
+          }
+        }
+        LLVM_DEBUG(llvm::dbgs()
+                   << "  bounds of loop " << i << " are known above depth "
+                   << operandsDefinedAbove[i] << '\n');
+      }
+
+      // 2. Identify bands of loops such that the operands of all of them are
+      // defined above the first loop in the band.  Traverse the nest bottom-up
+      // so that modifications don't invalidate the inner loops.
+      for (unsigned end = loops.size(); end > 0; --end) {
+        unsigned start = 0;
+        for (; start < end - 1; ++start) {
+          auto maxPos =
+              *std::max_element(std::next(operandsDefinedAbove.begin(), start),
+                                std::next(operandsDefinedAbove.begin(), end));
+          if (maxPos > start) // Some loop in [start, end) needs a value defined inside the band.
+            continue;
+
+          assert(maxPos == start &&
+                 "expected loop bounds to be known at the start of the band");
+          LLVM_DEBUG(llvm::dbgs() << "  found coalesceable band from " << start
+                                  << " to " << end << '\n');
+
+          auto band =
+              llvm::makeMutableArrayRef(loops.data() + start, end - start);
+          coalesceLoops(band);
+          break; // The widest band ending at `end` has been transformed.
+        }
+        // If a band was found and transformed, keep looking at the loops above
+        // the outermost transformed loop: together with the decrement in the
+        // loop header, the next candidate band ends just before `start`.
+        if (start != end - 1)
+          end = start + 1;
+      }
+    });
+  }
+};
+
+} // namespace
+
+// Creates a new instance of the loop coalescing pass and returns it through
+// the generic function pass interface.
+FunctionPassBase *mlir::createLoopCoalescingPass() {
+  FunctionPassBase *pass = new LoopCoalescingPass;
+  return pass;
+}
+
+static PassRegistration<LoopCoalescingPass>
+    reg(PASS_NAME,
+        "coalesce nested loops with independent bounds into a single loop"); // Registers the pass under PASS_NAME ("loop-coalescing") with this description.
--- a/mlir/lib/Transforms/Utils/LoopUtils.cpp
+++ b/mlir/lib/Transforms/Utils/LoopUtils.cpp
@ -33,9 +33,13 @@
 #include "mlir/IR/Function.h"
 #include "mlir/IR/Operation.h"
 #include "mlir/StandardOps/Ops.h"
+#include "mlir/Transforms/RegionUtils.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/Support/Debug.h"

+#include "mlir/IR/Module.h"
+
 #define DEBUG_TYPE "LoopUtils"

 using namespace mlir;
@ -385,6 +389,11 @@ void mlir::getPerfectlyNestedLoops(SmallVectorImpl<AffineForOp> &nestedLoops,
  getPerfectlyNestedLoopsImpl(nestedLoops, root);
 }

+void mlir::getPerfectlyNestedLoops(SmallVectorImpl<loop::ForOp> &nestedLoops,
+                                   loop::ForOp root) {
+  getPerfectlyNestedLoopsImpl(nestedLoops, root); // Same helper as the AffineForOp overload above.
+}
+
 /// Unrolls this loop completely.
 LogicalResult mlir::loopUnrollFull(AffineForOp forOp) {
  Optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
@ -870,6 +879,10 @@ static Value *ceilDivPositive(OpBuilder &builder, Location loc, Value *dividend,
  return builder.create<DivISOp>(loc, sum, divisorCst);
 }

+// Build the IR that performs ceil division of a positive value by another
+// positive value:
+//    ceildiv(a, b) = divis(a + (b - 1), b)
+// where divis is rounding-to-zero division.
 static Value *ceilDivPositive(OpBuilder &builder, Location loc, Value *dividend,
                              Value *divisor) {
  assert(dividend->getType().isIndex() && "expected index-typed value");
@ -914,3 +927,134 @@ void mlir::extractFixedOuterLoops(loop::ForOp rootForOp,
  // Call parametric tiling with the given sizes.
  return ::tile(forOps, tileSizes);
 }
+
+// Replaces all uses of `orig` with `replacement` except if the user is listed
+// in `exceptions`.
+//
+// Setting a use unlinks it from the use list of `orig` and links it into the
+// use list of `replacement`.  The iterator is therefore advanced *before* the
+// use is modified; otherwise the traversal would continue in the use list of
+// `replacement` and the remaining uses of `orig` would be left untouched.
+static void
+replaceAllUsesExcept(Value *orig, Value *replacement,
+                     const SmallPtrSetImpl<Operation *> &exceptions) {
+  auto uses = orig->getUses();
+  for (auto it = uses.begin(), end = uses.end(); it != end;) {
+    auto &use = *it;
+    // Step to the next use of `orig` while `use` is still linked in its list.
+    ++it;
+    if (exceptions.count(use.getOwner()) == 0)
+      use.set(replacement);
+  }
+}
+
+// Rewrite a loop with a strictly positive step
+//   for %i = %lb to %ub step %s
+// as a zero-based loop with unit step
+//   for %ii = 0 to ceildiv(%ub - %lb, %s) step 1 {
+//     %i = %ii * %s + %lb
+// The operations recomputing the original induction variable are inserted at
+// the start of the body of `inner`, which is expected to be either `loop` or
+// another loop perfectly nested under `loop`.  The operations defining the new
+// bounds are inserted immediately before `outer`, which is expected to be
+// either `loop` or its parent in the loop nest.
+static void normalizeLoop(loop::ForOp loop, loop::ForOp outer,
+                          loop::ForOp inner) {
+  // Find out whether the lower bound is a known constant zero and whether the
+  // step is a known constant one; a loop with both needs no rewriting.
+  auto lbCst =
+      dyn_cast_or_null<ConstantIndexOp>(loop.lowerBound()->getDefiningOp());
+  auto stepCst =
+      dyn_cast_or_null<ConstantIndexOp>(loop.step()->getDefiningOp());
+  const bool isZeroBased = lbCst && lbCst.getValue() == 0;
+  const bool isStepOne = stepCst && stepCst.getValue() == 1;
+  if (isZeroBased && isStepOne)
+    return;
+
+  OpBuilder builder(outer);
+  Location loc = loop.getLoc();
+
+  // Remember the original lower bound and step, they are needed to recover
+  // the original induction variable inside the loop body.
+  Value *originalLb = loop.lowerBound();
+  Value *originalStep = loop.step();
+
+  // The loop executes ceildiv(ub - lb, step) iterations, assuming the step is
+  // strictly positive.  Make the loop go from 0 to that number of iterations
+  // with step 1, only replacing the lower bound and the step when needed.
+  // TODO(zinenko): introduce support for negative steps or emit dynamic asserts
+  // on step positivity, whatever gets implemented first.
+  Value *range = builder.create<SubIOp>(loc, loop.upperBound(), originalLb);
+  Value *tripCount = ceilDivPositive(builder, loc, range, originalStep);
+  loop.setUpperBound(tripCount);
+
+  if (!isZeroBased) {
+    Value *zero = builder.create<ConstantIndexOp>(loc, 0);
+    loop.setLowerBound(zero);
+  }
+  if (!isStepOne) {
+    Value *one = builder.create<ConstantIndexOp>(loc, 1);
+    loop.setStep(one);
+  }
+
+  // Recompute the original induction variable from the normalized one at the
+  // very beginning of the body of `inner`: scale by the step first, then shift
+  // by the lower bound, skipping whichever part is a no-op.
+  builder.setInsertionPointToStart(inner.body());
+  Value *iv = loop.getInductionVar();
+  Value *remapped = iv;
+  if (!isStepOne)
+    remapped = builder.create<MulIOp>(loc, iv, originalStep);
+  Operation *scaleOp = remapped->getDefiningOp();
+  if (!isZeroBased)
+    remapped = builder.create<AddIOp>(loc, remapped, originalLb);
+  Operation *shiftOp = remapped->getDefiningOp();
+
+  // The remapping operations themselves must keep using the normalized
+  // induction variable; every other user switches to the recomputed value.
+  SmallPtrSet<Operation *, 2> preserve{scaleOp, shiftOp};
+  replaceAllUsesExcept(iv, remapped, preserve);
+}
+
+void mlir::coalesceLoops(MutableArrayRef<loop::ForOp> loops) {
+  // There is nothing to coalesce in a band of fewer than two loops.
+  if (loops.size() < 2)
+    return;
+
+  loop::ForOp outermost = loops.front();
+  loop::ForOp innermost = loops.back();
+
+  // 1. Normalize every loop so that it iterates from 0 to upperBound with
+  // step 1.  From here on, upperBound of a loop is its number of iterations.
+  for (unsigned i = 0, e = loops.size(); i < e; ++i)
+    normalizeLoop(loops[i], outermost, innermost);
+
+  // 2. Just before the outermost loop, compute the number of iterations of the
+  // coalesced loop as the product of the numbers of iterations of all loops,
+  // and use it as the new upper bound of the outermost loop.
+  OpBuilder builder(outermost);
+  Location loc = outermost.getLoc();
+  Value *totalIterations = outermost.upperBound();
+  for (unsigned i = 1, e = loops.size(); i < e; ++i)
+    totalIterations =
+        builder.create<MulIOp>(loc, totalIterations, loops[i].upperBound());
+  outermost.setUpperBound(totalIterations);
+
+  // 3. Recover the original induction variables from the linearized one at
+  // the start of the outermost body:
+  //   iv_i = floordiv(iv_linear, product-of-loop-ranges-until-i) mod range_i.
+  // Walk the band from the innermost loop outwards while maintaining a running
+  // quotient: going one loop out divides the quotient by the range of the loop
+  // just handled.  The remainder by the range of the current loop removes the
+  // contribution of the loops around it; the outermost loop has no such
+  // contribution, so the quotient is used directly.
+  builder.setInsertionPointToStart(outermost.body());
+  Value *quotient = outermost.getInductionVar();
+  for (unsigned idx = loops.size(); idx-- > 0;) {
+    const bool isInnermost = idx + 1 == loops.size();
+    if (!isInnermost)
+      quotient =
+          builder.create<DivISOp>(loc, quotient, loops[idx + 1].upperBound());
+
+    Value *iv = quotient;
+    if (idx != 0)
+      iv = builder.create<RemISOp>(loc, quotient, loops[idx].upperBound());
+
+    // Only the uses inside the innermost loop are rewritten.
+    replaceAllUsesInRegionWith(loops[idx].getInductionVar(), iv,
+                               innermost.region());
+  }
+
+  // 4. Drop the terminator of the innermost body, move the remaining
+  // operations of that body into the outermost body right before the
+  // second-outermost loop, then delete the second-outermost loop.
+  loop::ForOp secondOutermost = loops[1];
+  Block *innerBody = innermost.body();
+  innerBody->back().erase();
+  auto &hoistedOps = innerBody->getOperations();
+  outermost.body()->getOperations().splice(
+      Block::iterator(secondOutermost.getOperation()), hoistedOps);
+  secondOutermost.erase();
+}
--- a/mlir/test/Transforms/loop-coalescing.mlir
+++ b/mlir/test/Transforms/loop-coalescing.mlir
@ -0,0 +1,161 @@
+// RUN: mlir-opt -loop-coalescing %s | FileCheck %s
+
+// CHECK-LABEL: @one_3d_nest
+func @one_3d_nest() {
+  // Capture original bounds.  Note that for zero-based step-one loops, the
+  // upper bound is also the number of iterations.
+  // CHECK: %[[orig_lb:.*]] = constant 0
+  // CHECK: %[[orig_step:.*]] = constant 1
+  // CHECK: %[[orig_ub_k:.*]] = constant 3
+  // CHECK: %[[orig_ub_i:.*]] = constant 42
+  // CHECK: %[[orig_ub_j:.*]] = constant 56
+  %c0 = constant 0 : index
+  %c1 = constant 1 : index
+  %c2 = constant 2 : index
+  %c3 = constant 3 : index
+  %c42 = constant 42 : index
+  %c56 = constant 56 : index
+  // The range of the new loop.
+  // CHECK:     %[[partial_range:.*]] = muli %[[orig_ub_i]], %[[orig_ub_j]]
+  // CHECK-NEXT:%[[range:.*]] = muli %[[partial_range]], %[[orig_ub_k]]
+
+  // Updated loop bounds.
+  // CHECK: loop.for %[[i:.*]] = %[[orig_lb]] to %[[range]] step %[[orig_step]]
+  loop.for %i = %c0 to %c42 step %c1 {
+    // Inner loops must have been removed.
+    // CHECK-NOT: loop.for
+
+    // Reconstruct original IVs from the linearized one.
+    // CHECK: %[[orig_k:.*]] = remis %[[i]], %[[orig_ub_k]]
+    // CHECK: %[[div:.*]] = divis %[[i]], %[[orig_ub_k]]
+    // CHECK: %[[orig_j:.*]] = remis %[[div]], %[[orig_ub_j]]
+    // CHECK: %[[orig_i:.*]] = divis %[[div]], %[[orig_ub_j]]
+    loop.for %j = %c0 to %c56 step %c1 {
+      loop.for %k = %c0 to %c3 step %c1 {
+        // CHECK: "use"(%[[orig_i]], %[[orig_j]], %[[orig_k]])
+        "use"(%i, %j, %k) : (index, index, index) -> ()
+      }
+    }
+  }
+  return
+}
+
+// CHECK-LABEL: @unnormalized_loops
+func @unnormalized_loops() {
+  // CHECK: %[[orig_step_i:.*]] = constant 2
+  // CHECK: %[[orig_step_j:.*]] = constant 3
+  // CHECK: %[[orig_lb_i:.*]] = constant 5
+  // CHECK: %[[orig_lb_j:.*]] = constant 7
+  // CHECK: %[[orig_ub_i:.*]] = constant 10
+  // CHECK: %[[orig_ub_j:.*]] = constant 17
+  %c2 = constant 2 : index
+  %c3 = constant 3 : index
+  %c5 = constant 5 : index
+  %c7 = constant 7 : index
+  %c10 = constant 10 : index
+  %c17 = constant 17 : index
+
+  // Number of iterations in the outer loop.
+  // CHECK: %[[diff_i:.*]] = subi %[[orig_ub_i]], %[[orig_lb_i]]
+  // CHECK: %[[c1:.*]] = constant 1
+  // CHECK: %[[step_minus_c1:.*]] = subi %[[orig_step_i]], %[[c1]]
+  // CHECK: %[[dividend:.*]] = addi %[[diff_i]], %[[step_minus_c1]]
+  // CHECK: %[[numiter_i:.*]] = divis %[[dividend]], %[[orig_step_i]]
+
+  // Normalized lower bound and step for the outer loop.
+  // CHECK: %[[lb_i:.*]] = constant 0
+  // CHECK: %[[step_i:.*]] = constant 1
+
+  // Number of iterations in the inner loop, the pattern is the same as above,
+  // only capture the final result.
+  // CHECK: %[[numiter_j:.*]] = divis {{.*}}, %[[orig_step_j]]
+
+  // New bounds of the outer loop.
+  // CHECK: %[[range:.*]] = muli %[[numiter_i]], %[[numiter_j]]
+  // CHECK: loop.for %[[i:.*]] = %[[lb_i]] to %[[range]] step %[[step_i]]
+  loop.for %i = %c5 to %c10 step %c2 {
+    // The inner loop has been removed.
+    // CHECK-NOT: loop.for
+    loop.for %j = %c7 to %c17 step %c3 {
+      // The IVs are rewritten.
+      // CHECK: %[[normalized_j:.*]] = remis %[[i]], %[[numiter_j]]
+      // CHECK: %[[normalized_i:.*]] = divis %[[i]], %[[numiter_j]]
+      // CHECK: %[[scaled_j:.*]] = muli %[[normalized_j]], %[[orig_step_j]]
+      // CHECK: %[[orig_j:.*]] = addi %[[scaled_j]], %[[orig_lb_j]]
+      // CHECK: %[[scaled_i:.*]] = muli %[[normalized_i]], %[[orig_step_i]]
+      // CHECK: %[[orig_i:.*]] = addi %[[scaled_i]], %[[orig_lb_i]]
+      // CHECK: "use"(%[[orig_i]], %[[orig_j]])
+      "use"(%i, %j) : (index, index) -> ()
+    }
+  }
+  return
+}
+
+// Check with parametric loop bounds and steps, capture the bounds here.
+// CHECK-LABEL: @parametric
+// CHECK-SAME: %[[orig_lb1:[A-Za-z0-9]+]]:
+// CHECK-SAME: %[[orig_ub1:[A-Za-z0-9]+]]:
+// CHECK-SAME: %[[orig_step1:[A-Za-z0-9]+]]:
+// CHECK-SAME: %[[orig_lb2:[A-Za-z0-9]+]]:
+// CHECK-SAME: %[[orig_ub2:[A-Za-z0-9]+]]:
+// CHECK-SAME: %[[orig_step2:[A-Za-z0-9]+]]:
+func @parametric(%lb1 : index, %ub1 : index, %step1 : index,
+                 %lb2 : index, %ub2 : index, %step2 : index) {
+  // Compute the number of iterations for each of the loops and the total
+  // number of iterations.
+  // CHECK: %[[range1:.*]] = subi %[[orig_ub1]], %[[orig_lb1]]
+  // CHECK: %[[orig_step1_minus_1:.*]] = subi %[[orig_step1]], %c1
+  // CHECK: %[[dividend1:.*]] = addi %[[range1]], %[[orig_step1_minus_1]]
+  // CHECK: %[[numiter1:.*]] = divis %[[dividend1]], %[[orig_step1]]
+  // CHECK: %[[range2:.*]] = subi %[[orig_ub2]], %[[orig_lb2]]
+  // CHECK: %[[orig_step2_minus_1:.*]] = subi %[[orig_step2]], %c1
+  // CHECK: %[[dividend2:.*]] = addi %[[range2]], %[[orig_step2_minus_1]]
+  // CHECK: %[[numiter2:.*]] = divis %[[dividend2]], %[[orig_step2]]
+  // CHECK: %[[range:.*]] = muli %[[numiter1]], %[[numiter2]] : index
+
+  // Check that the outer loop is updated.
+  // CHECK: loop.for %[[i:.*]] = %c0{{.*}} to %[[range]] step %c1
+  loop.for %i = %lb1 to %ub1 step %step1 {
+    // Check that the inner loop is removed.
+    // CHECK-NOT: loop.for
+    loop.for %j = %lb2 to %ub2 step %step2 {
+      // Remapping of the induction variables.
+      // CHECK: %[[normalized_j:.*]] = remis %[[i]], %[[numiter2]] : index
+      // CHECK: %[[normalized_i:.*]] = divis %[[i]], %[[numiter2]] : index
+      // CHECK: %[[scaled_j:.*]] = muli %[[normalized_j]], %[[orig_step2]]
+      // CHECK: %[[orig_j:.*]] = addi %[[scaled_j]], %[[orig_lb2]]
+      // CHECK: %[[scaled_i:.*]] = muli %[[normalized_i]], %[[orig_step1]]
+      // CHECK: %[[orig_i:.*]] = addi %[[scaled_i]], %[[orig_lb1]]
+
+      // CHECK: "foo"(%[[orig_i]], %[[orig_j]])
+      "foo"(%i, %j) : (index, index) -> ()
+    }
+  }
+  return
+}
+
+// CHECK-LABEL: @two_bands
+func @two_bands() {
+  %c0 = constant 0 : index
+  %c1 = constant 1 : index
+  %c10 = constant 10 : index
+  // CHECK: %[[outer_range:.*]] = muli
+  // CHECK: loop.for %{{.*}} = %{{.*}} to %[[outer_range]]
+  loop.for %i = %c0 to %c10 step %c1 {
+    // Check that the "j" loop was removed and that the inner loops were
+    // coalesced as well.  The preparation step for coalescing will inject the
+    // subtraction operation unlike the IV remapping.
+    // CHECK-NOT: loop.for
+    // CHECK: subi
+    loop.for %j = %c0 to %c10 step %c1 {
+      // The inner pair of loops is coalesced separately.
+      // CHECK: loop.for
+      loop.for %k = %i to %j step %c1 {
+        // CHECK-NOT: loop.for
+        loop.for %l = %i to %j step %c1 {
+          "foo"() : () -> ()
+        }
+      }
+    }
+  }
+  return
+}