Introduce loop coalescing utility and a simple pass

Multiple (perfectly) nested loops with independent bounds can be combined into
a single loop and then subdivided into blocks of arbitrary size for load
balancing or more efficient parallelism exploitation.  However, MLIR wants to
preserve the multi-dimensional multi-loop structure at higher levels of
abstraction. Introduce a transformation that coalesces nested loops with
independent bounds so that they can be further subdivided by tiling.

PiperOrigin-RevId: 258151016
This commit is contained in:
Alex Zinenko 2019-07-15 06:40:07 -07:00 committed by Mehdi Amini
parent 4de019901b
commit fc044e8929
7 changed files with 423 additions and 0 deletions

View File

@ -86,6 +86,7 @@ def ForOp : Loop_Op<"for"> {
}
void setLowerBound(Value *bound) { getOperation()->setOperand(0, bound); }
void setUpperBound(Value *bound) { getOperation()->setOperand(1, bound); }
void setStep(Value *step) { getOperation()->setOperand(2, step); }
}];
}

View File

@ -57,6 +57,8 @@ LogicalResult loopUnrollUpToFactor(AffineForOp forOp, uint64_t unrollFactor);
/// AffineForOp, and the second op is a terminator).
void getPerfectlyNestedLoops(SmallVectorImpl<AffineForOp> &nestedLoops,
AffineForOp root);
/// Overload of getPerfectlyNestedLoops operating on `loop::ForOp` nests: the
/// loops found starting at `root` are returned through `nestedLoops`.
/// NOTE(review): presumably ordered from outermost to innermost, as the
/// coalescing pass indexes the result by nesting depth -- confirm.
void getPerfectlyNestedLoops(SmallVectorImpl<loop::ForOp> &nestedLoops,
loop::ForOp root);
/// Unrolls and jams this loop by the specified factor. Returns success if the
/// loop is successfully unroll-jammed.
@ -154,6 +156,11 @@ void tile(loop::ForOp rootForOp, ArrayRef<Value *> sizes);
/// as defined in `sizes`.
void extractFixedOuterLoops(loop::ForOp rootForOp, ArrayRef<int64_t> sizes);
/// Replace a perfect nest of "for" loops with a single linearized loop. Assumes
/// `loops` contains a list of perfectly nested loops with bounds and steps
/// independent of any loop induction variable involved in the nest.
/// `loops` is expected to be ordered from the outermost loop (first element)
/// to the innermost loop (last element). Does nothing if fewer than two loops
/// are given. Loop steps are assumed to be strictly positive; negative steps
/// are not supported.
void coalesceLoops(MutableArrayRef<loop::ForOp> loops);
} // end namespace mlir
#endif // MLIR_TRANSFORMS_LOOP_UTILS_H

View File

@ -105,6 +105,10 @@ FunctionPassBase *createLoopTilingPass(uint64_t cacheSizeBytes);
FunctionPassBase *
createSimpleParametricTilingPass(ArrayRef<int64_t> outerLoopSizes);
/// Creates a pass that transforms perfectly nested loops with independent
/// bounds into a single loop.
/// The pass works on `loop.for` operations and is registered under the name
/// "loop-coalescing".
FunctionPassBase *createLoopCoalescingPass();
/// Promotes all accessed memref regions to the specified faster memory space
/// while generating DMAs to move data.
FunctionPassBase *createDmaGenerationPass(

View File

@ -5,6 +5,7 @@ add_llvm_library(MLIRTransforms
CSE.cpp
DialectConversion.cpp
DmaGeneration.cpp
LoopCoalescing.cpp
LoopFusion.cpp
LoopInvariantCodeMotion.cpp
LoopParametricTiling.cpp

View File

@ -0,0 +1,105 @@
//===- LoopCoalescing.cpp - Pass transforming loop nests into single loops-===//
//
// Copyright 2019 The MLIR Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#include "mlir/Dialect/LoopOps/LoopOps.h"
#include "mlir/Pass/Pass.h"
#include "mlir/StandardOps/Ops.h"
#include "mlir/Transforms/LoopUtils.h"
#include "mlir/Transforms/Passes.h"
#include "mlir/Transforms/RegionUtils.h"
#include "llvm/Support/Debug.h"
#define PASS_NAME "loop-coalescing"
#define DEBUG_TYPE PASS_NAME
using namespace mlir;
namespace {
/// Function pass that looks for bands of perfectly nested `loop.for`
/// operations whose bounds and steps are all defined outside of the band, and
/// replaces every such band with a single loop by calling `coalesceLoops`.
class LoopCoalescingPass : public FunctionPass<LoopCoalescingPass> {
public:
  void runOnFunction() override {
    FuncOp function = getFunction();
    function.walk<loop::ForOp>([](loop::ForOp root) {
      // Only loops that are not nested in another loop start a nest; the
      // nested ones are processed as part of the nest of their ancestor.
      if (root.getParentOfType<loop::ForOp>())
        return;

      SmallVector<loop::ForOp, 4> nest;
      getPerfectlyNestedLoops(nest, root);
      LLVM_DEBUG(llvm::dbgs()
                 << "found a perfect nest of depth " << nest.size() << '\n');

      // Phase 1: for every loop in the nest, record the position of the
      // outermost enclosing loop above which all of its operands (bounds and
      // step) are defined. A loop whose operands depend on every enclosing
      // loop is recorded with its own position.
      SmallVector<unsigned, 4> boundsKnownAbove(nest.size());
      for (unsigned depth = 0, numLoops = nest.size(); depth < numLoops;
           ++depth) {
        unsigned outer = 0;
        while (outer < depth &&
               !areValuesDefinedAbove(nest[depth].getOperands(),
                                      nest[outer].region()))
          ++outer;
        boundsKnownAbove[depth] = outer;
        LLVM_DEBUG(llvm::dbgs()
                   << " bounds of loop " << depth << " are known above depth "
                   << boundsKnownAbove[depth] << '\n');
      }

      // Phase 2: find bands [start, end) in which the operands of all loops
      // are defined above the first loop of the band. The nest is scanned from
      // the innermost loop outwards so that rewriting a band never touches
      // loops that still have to be inspected.
      for (unsigned end = nest.size(); end > 0; --end) {
        for (unsigned start = 0; start + 1 < end; ++start) {
          auto deepest = *std::max_element(boundsKnownAbove.begin() + start,
                                           boundsKnownAbove.begin() + end);
          if (deepest > start)
            continue;

          assert(deepest == start &&
                 "expected loop bounds to be known at the start of the band");
          LLVM_DEBUG(llvm::dbgs() << " found coalesceable band from " << start
                                  << " to " << end << '\n');

          auto band =
              llvm::makeMutableArrayRef(nest.data() + start, end - start);
          coalesceLoops(band);

          // A band was rewritten: resume the search with the loops strictly
          // above its outermost loop (the enclosing `--end` makes the next
          // candidate range end at `start`).
          end = start + 1;
          break;
        }
      }
    });
  }
};
} // namespace
// Factory for the loop coalescing pass: returns a freshly heap-allocated
// LoopCoalescingPass instance.
// NOTE(review): ownership is presumably taken over by the caller (e.g. a pass
// manager) -- confirm.
FunctionPassBase *mlir::createLoopCoalescingPass() {
return new LoopCoalescingPass;
}
// Static registration object associating the pass with PASS_NAME
// ("loop-coalescing") and a short description string.
static PassRegistration<LoopCoalescingPass>
reg(PASS_NAME,
"coalesce nested loops with independent bounds into a single loop");

View File

@ -33,9 +33,13 @@
#include "mlir/IR/Function.h"
#include "mlir/IR/Operation.h"
#include "mlir/StandardOps/Ops.h"
#include "mlir/Transforms/RegionUtils.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Support/Debug.h"
#include "mlir/IR/Module.h"
#define DEBUG_TYPE "LoopUtils"
using namespace mlir;
@ -385,6 +389,11 @@ void mlir::getPerfectlyNestedLoops(SmallVectorImpl<AffineForOp> &nestedLoops,
getPerfectlyNestedLoopsImpl(nestedLoops, root);
}
// Overload for `loop::ForOp` nests; forwards to the getPerfectlyNestedLoopsImpl
// helper that is also used by the AffineForOp overload.
void mlir::getPerfectlyNestedLoops(SmallVectorImpl<loop::ForOp> &nestedLoops,
loop::ForOp root) {
getPerfectlyNestedLoopsImpl(nestedLoops, root);
}
/// Unrolls this loop completely.
LogicalResult mlir::loopUnrollFull(AffineForOp forOp) {
Optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
@ -870,6 +879,10 @@ static Value *ceilDivPositive(OpBuilder &builder, Location loc, Value *dividend,
return builder.create<DivISOp>(loc, sum, divisorCst);
}
// Build the IR that performs ceil division of a positive value by another
// positive value:
// ceildiv(a, b) = divis(a + (b - 1), b)
// where divis is rounding-to-zero division.
static Value *ceilDivPositive(OpBuilder &builder, Location loc, Value *dividend,
Value *divisor) {
assert(dividend->getType().isIndex() && "expected index-typed value");
@ -914,3 +927,134 @@ void mlir::extractFixedOuterLoops(loop::ForOp rootForOp,
// Call parametric tiling with the given sizes.
return ::tile(forOps, tileSizes);
}
// Replaces all uses of `orig` with `replacement` except if the user is listed
// in `exceptions`.
//
// Setting a use to a different value removes it from the use list of `orig`,
// which is the very list being traversed. The iterator is therefore advanced
// past the current use *before* the use is modified, so that the traversal
// keeps following the remaining uses of `orig` and visits all of them.
static void
replaceAllUsesExcept(Value *orig, Value *replacement,
                     const SmallPtrSetImpl<Operation *> &exceptions) {
  auto uses = orig->getUses();
  for (auto it = uses.begin(), end = uses.end(); it != end;) {
    auto &use = *it;
    // Step to the next use of `orig` while the current one is still linked.
    ++it;
    if (exceptions.count(use.getOwner()) == 0)
      use.set(replacement);
  }
}
// Rewrites a loop with a strictly positive step
//   for %i = %lb to %ub step %s { ... }
// into a loop that starts at 0 and advances by 1
//   for %ii = 0 to ceildiv(%ub - %lb, %s) step 1 {
//     %i = %ii * %s + %lb
//     ...
//   }
// The operations recomputing the original induction variable are placed at
// the start of the body of `inner`, which is expected to be either `loop` or
// another loop perfectly nested under `loop`. The operations computing the new
// bounds are placed immediately before `outer`, which is expected to be either
// `loop` or its parent in the loop nest. Nothing is changed if `loop` is
// already known to have a constant zero lower bound and a constant unit step.
static void normalizeLoop(loop::ForOp loop, loop::ForOp outer,
                          loop::ForOp inner) {
  Value *origLowerBound = loop.lowerBound();
  Value *origStep = loop.step();

  // Find out which parts of the loop are already in normalized form, i.e.
  // defined by a constant 0 (lower bound) or a constant 1 (step).
  bool hasZeroLowerBound = false;
  if (auto lbCst =
          dyn_cast_or_null<ConstantIndexOp>(origLowerBound->getDefiningOp()))
    hasZeroLowerBound = lbCst.getValue() == 0;

  bool hasUnitStep = false;
  if (auto stepCst =
          dyn_cast_or_null<ConstantIndexOp>(origStep->getDefiningOp()))
    hasUnitStep = stepCst.getValue() == 1;

  if (hasZeroLowerBound && hasUnitStep)
    return;

  OpBuilder builder(outer);
  Location loc = loop.getLoc();

  // The trip count ceildiv(ub - lb, step) becomes the new upper bound. This
  // is only valid for strictly positive steps.
  // TODO(zinenko): introduce support for negative steps or emit dynamic asserts
  // on step positivity, whatever gets implemented first.
  Value *range = builder.create<SubIOp>(loc, loop.upperBound(), origLowerBound);
  Value *tripCount = ceilDivPositive(builder, loc, range, origStep);
  loop.setUpperBound(tripCount);

  if (!hasZeroLowerBound) {
    Value *zero = builder.create<ConstantIndexOp>(loc, 0);
    loop.setLowerBound(zero);
  }
  if (!hasUnitStep) {
    Value *one = builder.create<ConstantIndexOp>(loc, 1);
    loop.setStep(one);
  }

  // Recompute the original induction variable from the normalized one inside
  // the body of `inner`: multiply by the original step, then add the original
  // lower bound, skipping whichever of the two is an identity.
  builder.setInsertionPointToStart(inner.body());
  Value *normalizedIv = loop.getInductionVar();
  Value *scaled = normalizedIv;
  if (!hasUnitStep)
    scaled = builder.create<MulIOp>(loc, normalizedIv, origStep);
  Value *shifted = scaled;
  if (!hasZeroLowerBound)
    shifted = builder.create<AddIOp>(loc, scaled, origLowerBound);

  // The operations created above must keep reading the normalized induction
  // variable; every other user is redirected to the recomputed value.
  SmallPtrSet<Operation *, 2> preserve{scaled->getDefiningOp(),
                                       shifted->getDefiningOp()};
  replaceAllUsesExcept(normalizedIv, shifted, preserve);
}
// Replaces the perfect nest given in `loops` (outermost loop first, innermost
// loop last) with a single loop that iterates over the product of the
// iteration counts of all loops. Does nothing for fewer than two loops.
void mlir::coalesceLoops(MutableArrayRef<loop::ForOp> loops) {
  const unsigned numLoops = loops.size();
  if (numLoops < 2)
    return;

  loop::ForOp outermost = loops.front();
  loop::ForOp innermost = loops.back();

  // 1. Bring every loop to the "0 to upperBound step 1" form so that the upper
  // bound of each loop is also its number of iterations.
  for (auto current : loops)
    normalizeLoop(current, outermost, innermost);

  // 2. The coalesced loop runs for the product of the iteration counts of all
  // loops; compute it right before the outermost loop and install it as the
  // new upper bound of that loop.
  OpBuilder builder(outermost);
  Location loc = outermost.getLoc();
  Value *totalIterations = outermost.upperBound();
  for (auto nested : loops.drop_front())
    totalIterations =
        builder.create<MulIOp>(loc, totalIterations, nested.upperBound());
  outermost.setUpperBound(totalIterations);

  // 3. Recover the induction variable of every original loop from the
  // linearized one at the start of the body of the outermost loop:
  //   iv_i = floordiv(iv_linear, product-of-loop-ranges-until-i) mod range_i.
  // Walk the nest from the innermost loop outwards while maintaining a
  // "running quotient" of the linearized induction variable divided by the
  // ranges of the loops already processed. The outermost loop needs no
  // remainder, the quotient itself is its induction variable.
  builder.setInsertionPointToStart(outermost.body());
  Value *quotient = outermost.getInductionVar();
  for (unsigned idx = numLoops; idx-- > 0;) {
    if (idx + 1 != numLoops)
      quotient =
          builder.create<DivISOp>(loc, quotient, loops[idx + 1].upperBound());

    Value *iv = quotient;
    if (idx != 0)
      iv = builder.create<RemISOp>(loc, quotient, loops[idx].upperBound());

    replaceAllUsesInRegionWith(loops[idx].getInductionVar(), iv,
                               innermost.region());
  }

  // 4. Drop the terminator of the innermost loop, move the rest of its body
  // into the outermost loop right before the second-outermost loop, and erase
  // the second-outermost loop.
  loop::ForOp second = loops[1];
  innermost.body()->back().erase();
  outermost.body()->getOperations().splice(
      Block::iterator(second.getOperation()),
      innermost.body()->getOperations());
  second.erase();
}

View File

@ -0,0 +1,161 @@
// RUN: mlir-opt -loop-coalescing %s | FileCheck %s
// Three perfectly nested loops that already start at zero with a unit step and
// have constant bounds: the whole nest is expected to become a single loop.
// CHECK-LABEL: @one_3d_nest
func @one_3d_nest() {
// Capture original bounds. Note that for zero-based step-one loops, the
// upper bound is also the number of iterations.
// CHECK: %[[orig_lb:.*]] = constant 0
// CHECK: %[[orig_step:.*]] = constant 1
// CHECK: %[[orig_ub_k:.*]] = constant 3
// CHECK: %[[orig_ub_i:.*]] = constant 42
// CHECK: %[[orig_ub_j:.*]] = constant 56
%c0 = constant 0 : index
%c1 = constant 1 : index
// Note: %c2 is not used by any of the loops below and the checks above skip it.
%c2 = constant 2 : index
%c3 = constant 3 : index
%c42 = constant 42 : index
%c56 = constant 56 : index
// The range of the new loop.
// CHECK: %[[partial_range:.*]] = muli %[[orig_ub_i]], %[[orig_ub_j]]
// CHECK-NEXT:%[[range:.*]] = muli %[[partial_range]], %[[orig_ub_k]]
// Updated loop bounds.
// CHECK: loop.for %[[i:.*]] = %[[orig_lb]] to %[[range]] step %[[orig_step]]
loop.for %i = %c0 to %c42 step %c1 {
// Inner loops must have been removed.
// CHECK-NOT: loop.for
// Reconstruct original IVs from the linearized one.
// CHECK: %[[orig_k:.*]] = remis %[[i]], %[[orig_ub_k]]
// CHECK: %[[div:.*]] = divis %[[i]], %[[orig_ub_k]]
// CHECK: %[[orig_j:.*]] = remis %[[div]], %[[orig_ub_j]]
// CHECK: %[[orig_i:.*]] = divis %[[div]], %[[orig_ub_j]]
loop.for %j = %c0 to %c56 step %c1 {
loop.for %k = %c0 to %c3 step %c1 {
// CHECK: "use"(%[[orig_i]], %[[orig_j]], %[[orig_k]])
"use"(%i, %j, %k) : (index, index, index) -> ()
}
}
}
return
}
// Loops with non-zero lower bounds and non-unit steps: both loops are expected
// to be normalized to zero-based unit-step form before being coalesced, and
// the original induction variables recomputed in the body.
// CHECK-LABEL: @unnormalized_loops
func @unnormalized_loops() {
  // CHECK: %[[orig_step_i:.*]] = constant 2
  // CHECK: %[[orig_step_j:.*]] = constant 3
  // CHECK: %[[orig_lb_i:.*]] = constant 5
  // CHECK: %[[orig_lb_j:.*]] = constant 7
  // CHECK: %[[orig_ub_i:.*]] = constant 10
  // CHECK: %[[orig_ub_j:.*]] = constant 17
  %c2 = constant 2 : index
  %c3 = constant 3 : index
  %c5 = constant 5 : index
  %c7 = constant 7 : index
  %c10 = constant 10 : index
  %c17 = constant 17 : index
  // Number of iterations in the outer loop.
  // CHECK: %[[diff_i:.*]] = subi %[[orig_ub_i]], %[[orig_lb_i]]
  // CHECK: %[[c1:.*]] = constant 1
  // CHECK: %[[step_minus_c1:.*]] = subi %[[orig_step_i]], %[[c1]]
  // CHECK: %[[dividend:.*]] = addi %[[diff_i]], %[[step_minus_c1]]
  // CHECK: %[[numiter_i:.*]] = divis %[[dividend]], %[[orig_step_i]]
  // Normalized lower bound and step for the outer loop.
  // CHECK: %[[lb_i:.*]] = constant 0
  // CHECK: %[[step_i:.*]] = constant 1
  // Number of iterations in the inner loop, the pattern is the same as above,
  // only capture the final result.
  // CHECK: %[[numiter_j:.*]] = divis {{.*}}, %[[orig_step_j]]
  // New bounds of the outer loop.
  // CHECK: %[[range:.*]] = muli %[[numiter_i]], %[[numiter_j]]
  // CHECK: loop.for %[[i:.*]] = %[[lb_i]] to %[[range]] step %[[step_i]]
  loop.for %i = %c5 to %c10 step %c2 {
    // The inner loop has been removed.
    // CHECK-NOT: loop.for
    loop.for %j = %c7 to %c17 step %c3 {
      // The IVs are rewritten.
      // CHECK: %[[normalized_j:.*]] = remis %[[i]], %[[numiter_j]]
      // CHECK: %[[normalized_i:.*]] = divis %[[i]], %[[numiter_j]]
      // CHECK: %[[scaled_j:.*]] = muli %[[normalized_j]], %[[orig_step_j]]
      // CHECK: %[[orig_j:.*]] = addi %[[scaled_j]], %[[orig_lb_j]]
      // CHECK: %[[scaled_i:.*]] = muli %[[normalized_i]], %[[orig_step_i]]
      // CHECK: %[[orig_i:.*]] = addi %[[scaled_i]], %[[orig_lb_i]]
      // CHECK: "use"(%[[orig_i]], %[[orig_j]])
      "use"(%i, %j) : (index, index) -> ()
    }
  }
  return
}
// Check with parametric loop bounds and steps, capture the bounds here.
// All bounds and steps are function arguments, so they are defined above both
// loops and the two loops are expected to be coalesced.
// CHECK-LABEL: @parametric
// CHECK-SAME: %[[orig_lb1:[A-Za-z0-9]+]]:
// CHECK-SAME: %[[orig_ub1:[A-Za-z0-9]+]]:
// CHECK-SAME: %[[orig_step1:[A-Za-z0-9]+]]:
// CHECK-SAME: %[[orig_lb2:[A-Za-z0-9]+]]:
// CHECK-SAME: %[[orig_ub2:[A-Za-z0-9]+]]:
// CHECK-SAME: %[[orig_step2:[A-Za-z0-9]+]]:
func @parametric(%lb1 : index, %ub1 : index, %step1 : index,
                 %lb2 : index, %ub2 : index, %step2 : index) {
  // Compute the number of iterations for each of the loops and the total
  // number of iterations.
  // CHECK: %[[range1:.*]] = subi %[[orig_ub1]], %[[orig_lb1]]
  // CHECK: %[[orig_step1_minus_1:.*]] = subi %[[orig_step1]], %c1
  // CHECK: %[[dividend1:.*]] = addi %[[range1]], %[[orig_step1_minus_1]]
  // CHECK: %[[numiter1:.*]] = divis %[[dividend1]], %[[orig_step1]]
  // CHECK: %[[range2:.*]] = subi %[[orig_ub2]], %[[orig_lb2]]
  // CHECK: %[[orig_step2_minus_1:.*]] = subi %[[orig_step2]], %c1
  // CHECK: %[[dividend2:.*]] = addi %[[range2]], %[[orig_step2_minus_1]]
  // CHECK: %[[numiter2:.*]] = divis %[[dividend2]], %[[orig_step2]]
  // CHECK: %[[range:.*]] = muli %[[numiter1]], %[[numiter2]] : index
  // Check that the outer loop is updated.
  // CHECK: loop.for %[[i:.*]] = %c0{{.*}} to %[[range]] step %c1
  loop.for %i = %lb1 to %ub1 step %step1 {
    // Check that the inner loop is removed.
    // CHECK-NOT: loop.for
    loop.for %j = %lb2 to %ub2 step %step2 {
      // Remapping of the induction variables.
      // CHECK: %[[normalized_j:.*]] = remis %[[i]], %[[numiter2]] : index
      // CHECK: %[[normalized_i:.*]] = divis %[[i]], %[[numiter2]] : index
      // CHECK: %[[scaled_j:.*]] = muli %[[normalized_j]], %[[orig_step2]]
      // CHECK: %[[orig_j:.*]] = addi %[[scaled_j]], %[[orig_lb2]]
      // CHECK: %[[scaled_i:.*]] = muli %[[normalized_i]], %[[orig_step1]]
      // CHECK: %[[orig_i:.*]] = addi %[[scaled_i]], %[[orig_lb1]]
      // CHECK: "foo"(%[[orig_i]], %[[orig_j]])
      "foo"(%i, %j) : (index, index) -> ()
    }
  }
  return
}
// The bounds of the two inner loops depend on the induction variables of the
// two outer loops, so the nest splits into two bands that are coalesced
// independently of each other.
// CHECK-LABEL: @two_bands
func @two_bands() {
  %c0 = constant 0 : index
  %c1 = constant 1 : index
  %c10 = constant 10 : index
  // CHECK: %[[outer_range:.*]] = muli
  // CHECK: loop.for %{{.*}} = %{{.*}} to %[[outer_range]]
  loop.for %i = %c0 to %c10 step %c1 {
    // Check that the "j" loop was removed and that the inner loops were
    // coalesced as well. The preparation step for coalescing will inject the
    // subtraction operation unlike the IV remapping.
    // CHECK-NOT: loop.for
    // CHECK: subi
    loop.for %j = %c0 to %c10 step %c1 {
      // The inner pair of loops is coalesced separately.
      // CHECK: loop.for
      loop.for %k = %i to %j step %c1 {
        // CHECK-NOT: loop.for
        loop.for %l = %i to %j step %c1 {
          "foo"() : () -> ()
        }
      }
    }
  }
  return
}