[MLIR] Add parallel loop collapsing.

This allows conversion of a ParallelLoop from N induction variables to
some number of induction variables fewer than N.

The first intended use of this is for the GPUDialect: converting
ParallelLoops to iterate over 3 dimensions so that they can be launched as
GPU kernels.

To implement this:
- Normalize each iteration space of the ParallelLoop.
- Use a single induction variable of a new ParallelLoop to cover several of
  the original induction variables.
- Split the new induction variable back into the original set of values
  inside the body of the ParallelLoop (a sketch follows this list).
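
As a concrete illustration, here is a condensed sketch of the rewrite on the
2-D test case added in this change (SSA value names are chosen here for
readability and the `constant ... : index` definitions are elided):

  // Before: two induction variables, 3 x 6 = 18 normalized iterations.
  loop.parallel (%i0, %i1) = (%c3, %c7) to (%c11, %c29) step (%c3, %c4) {
    %result = "magic.op"(%i0, %i1) : (index, index) -> index
  }

  // After collapsed-indices-0=0,1: a single normalized induction variable,
  // with the original induction values recomputed inside the body.
  loop.parallel (%iv) = (%c0) to (%c18) step (%c1) {
    %r0 = remi_signed %iv, %c3 : index   // normalized %i0
    %d1 = divi_signed %iv, %c6 : index   // normalized %i1
    %t1 = muli %d1, %c4 : index
    %i1 = addi %t1, %c7 : index          // un-normalized %i1
    %t0 = muli %r0, %c3 : index
    %i0 = addi %t0, %c3 : index          // un-normalized %i0
    %result = "magic.op"(%i0, %i1) : (index, index) -> index
  }

The pass is exercised through mlir-opt as in the new tests, e.g.
-pass-pipeline='func(parallel-loop-collapsing{collapsed-indices-0=0,1}, canonicalize)'.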

Subscribers: mgorny, mehdi_amini, rriddle, jpienaar, burmako, shauheen, antiagainst, nicolasvasilache, arpith-jacob, mgester, lucyrfox, aartbik, liufengdb, Joonsoo, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D76363
Tres Popp 2020-03-11 14:38:10 +01:00
parent a9ab11d408
commit 27c201aa1d
8 changed files with 315 additions and 38 deletions

View File

@@ -113,6 +113,7 @@ inline void registerAllPasses() {
LLVM::createLegalizeForExportPass();
// LoopOps
createParallelLoopCollapsingPass();
createParallelLoopFusionPass();
createParallelLoopSpecializationPass();
createParallelLoopTilingPass();

View File

@@ -28,6 +28,7 @@ struct MemRefRegion;
namespace loop {
class ForOp;
class ParallelOp;
} // end namespace loop
/// Unrolls this for operation completely if the trip count is known to be
@@ -226,6 +227,12 @@ TileLoops extractFixedOuterLoops(loop::ForOp rootFOrOp,
/// independent of any loop induction variable involved in the nest.
void coalesceLoops(MutableArrayRef<loop::ForOp> loops);
/// Take the ParallelLoop `loops` and, for each set of dimension indices in
/// `combinedDimensions`, combine the loop dimensions at those indices into a
/// single dimension. `combinedDimensions` must contain each index into `loops`
/// exactly once. For example, {{0, 1}, {2}} turns a 3-D ParallelLoop into a
/// 2-D one whose first induction variable covers the combined iteration space
/// of the original dimensions 0 and 1.
void collapsePLoops(loop::ParallelOp loops,
                    ArrayRef<std::vector<unsigned>> combinedDimensions);
/// Maps `forOp` for execution on a parallel grid of virtual `processorIds` of
/// size given by `numProcessors`. This is achieved by embedding the SSA values
/// corresponding to `processorIds` and `numProcessors` into the bounds and step

View File

@@ -57,6 +57,18 @@ std::unique_ptr<OpPassBase<FuncOp>> createLowerAffinePass();
/// bounds into a single loop.
std::unique_ptr<OpPassBase<FuncOp>> createLoopCoalescingPass();
/// Creates a pass that transforms a single ParallelLoop over N induction
/// variables into another ParallelLoop over fewer than N induction variables.
std::unique_ptr<Pass> createParallelLoopCollapsingPass();
/// Performs packing (or explicit copying) of accessed memref regions into
/// buffers in the specified faster memory space through either pointwise copies
/// or DMA operations.
std::unique_ptr<OpPassBase<FuncOp>> createAffineDataCopyGenerationPass(
unsigned slowMemorySpace, unsigned fastMemorySpace,
unsigned tagMemorySpace = 0, int minDmaTransferSize = 1024,
uint64_t fastMemCapacityBytes = std::numeric_limits<uint64_t>::max());
/// Creates a pass to perform optimizations relying on memref dataflow such as
/// store to load forwarding, elimination of dead stores, and dead allocs.
std::unique_ptr<OpPassBase<FuncOp>> createMemRefDataFlowOptPass();

View File

@@ -11,6 +11,7 @@ add_mlir_library(MLIRTransforms
LoopInvariantCodeMotion.cpp
MemRefDataFlowOpt.cpp
OpStats.cpp
ParallelLoopCollapsing.cpp
PipelineDataTransfer.cpp
StripDebugInfo.cpp
SymbolDCE.cpp

View File

@@ -0,0 +1,69 @@
//===- ParallelLoopCollapsing.cpp - Pass collapsing parallel loop indices -===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "mlir/Dialect/LoopOps/LoopOps.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/LoopUtils.h"
#include "mlir/Transforms/Passes.h"
#include "mlir/Transforms/RegionUtils.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#define PASS_NAME "parallel-loop-collapsing"
#define DEBUG_TYPE PASS_NAME
using namespace mlir;
namespace {
struct ParallelLoopCollapsing : public OperationPass<ParallelLoopCollapsing> {
  ParallelLoopCollapsing() = default;
  ParallelLoopCollapsing(const ParallelLoopCollapsing &) {}

  void runOnOperation() override {
    Operation *module = getOperation();

    module->walk([&](loop::ParallelOp op) {
      // The common case for the GPU dialect is to collapse the ParallelOp down
      // to at most 3 induction variables, so only three sets of combined
      // indices are exposed as pass options.
      llvm::SmallVector<std::vector<unsigned>, 3> combinedLoops;
      if (clCollapsedIndices0.size())
        combinedLoops.push_back(clCollapsedIndices0);
      if (clCollapsedIndices1.size())
        combinedLoops.push_back(clCollapsedIndices1);
      if (clCollapsedIndices2.size())
        combinedLoops.push_back(clCollapsedIndices2);
      collapsePLoops(op, combinedLoops);
    });
  }

  ListOption<unsigned> clCollapsedIndices0{
      *this, "collapsed-indices-0",
      llvm::cl::desc("Which loop indices to combine into the 0th loop index"),
      llvm::cl::MiscFlags::CommaSeparated};
  ListOption<unsigned> clCollapsedIndices1{
      *this, "collapsed-indices-1",
      llvm::cl::desc(
          "Which loop indices to combine into the position 1 loop index"),
      llvm::cl::MiscFlags::CommaSeparated};
  ListOption<unsigned> clCollapsedIndices2{
      *this, "collapsed-indices-2",
      llvm::cl::desc(
          "Which loop indices to combine into the position 2 loop index"),
      llvm::cl::MiscFlags::CommaSeparated};
};
} // namespace

std::unique_ptr<Pass> mlir::createParallelLoopCollapsingPass() {
  return std::make_unique<ParallelLoopCollapsing>();
}

static PassRegistration<ParallelLoopCollapsing>
    reg(PASS_NAME, "collapse parallel loops to use fewer induction variables.");

View File

@@ -36,6 +36,16 @@ using namespace mlir;
using llvm::SetVector;
using llvm::SmallMapVector;
namespace {
// This structure is used to pass and return sets of loop parameters without
// confusing their order.
struct LoopParams {
Value lowerBound;
Value upperBound;
Value step;
};
} // namespace
/// Computes the cleanup loop lower bound of the loop being unrolled with
/// the specified unroll factor; this bound will also be upper bound of the main
/// part of the unrolled loop. Computes the bound as an AffineMap with its
@@ -1094,69 +1104,78 @@ replaceAllUsesExcept(Value orig, Value replacement,
}
}
// Transform a loop with a strictly positive step
// for %i = %lb to %ub step %s
// into a 0-based loop with step 1
// for %ii = 0 to ceildiv(%ub - %lb, %s) step 1 {
// %i = %ii * %s + %lb
// Insert the induction variable remapping in the body of `inner`, which is
// expected to be either `loop` or another loop perfectly nested under `loop`.
// Insert the definition of new bounds immediate before `outer`, which is
// expected to be either `loop` or its parent in the loop nest.
static void normalizeLoop(loop::ForOp loop, loop::ForOp outer,
loop::ForOp inner) {
OpBuilder builder(outer);
Location loc = loop.getLoc();
/// Return the new lower bound, upper bound, and step in that order. Insert any
/// additional bound computations using `boundsBuilder`, and insert the
/// conversion back to the original loop induction value using
/// `insideLoopBuilder`.
static LoopParams normalizeLoop(OpBuilder &boundsBuilder,
OpBuilder &insideLoopBuilder, Location loc,
Value lowerBound, Value upperBound, Value step,
Value inductionVar) {
// Check if the loop is already known to have a constant zero lower bound or
// a constant one step.
bool isZeroBased = false;
if (auto ubCst =
dyn_cast_or_null<ConstantIndexOp>(loop.lowerBound().getDefiningOp()))
dyn_cast_or_null<ConstantIndexOp>(lowerBound.getDefiningOp()))
isZeroBased = ubCst.getValue() == 0;
bool isStepOne = false;
if (auto stepCst =
dyn_cast_or_null<ConstantIndexOp>(loop.step().getDefiningOp()))
if (auto stepCst = dyn_cast_or_null<ConstantIndexOp>(step.getDefiningOp()))
isStepOne = stepCst.getValue() == 1;
if (isZeroBased && isStepOne)
return;
// Compute the number of iterations the loop executes: ceildiv(ub - lb, step)
// assuming the step is strictly positive. Update the bounds and the step
// of the loop to go from 0 to the number of iterations, if necessary.
// TODO(zinenko): introduce support for negative steps or emit dynamic asserts
// on step positivity, whatever gets implemented first.
Value diff =
builder.create<SubIOp>(loc, loop.upperBound(), loop.lowerBound());
Value numIterations = ceilDivPositive(builder, loc, diff, loop.step());
loop.setUpperBound(numIterations);
if (isZeroBased && isStepOne)
return {/*lowerBound=*/lowerBound, /*upperBound=*/upperBound,
/*step=*/step};
Value lb = loop.lowerBound();
if (!isZeroBased) {
Value cst0 = builder.create<ConstantIndexOp>(loc, 0);
loop.setLowerBound(cst0);
}
Value diff = boundsBuilder.create<SubIOp>(loc, upperBound, lowerBound);
Value newUpperBound = ceilDivPositive(boundsBuilder, loc, diff, step);
Value step = loop.step();
if (!isStepOne) {
Value cst1 = builder.create<ConstantIndexOp>(loc, 1);
loop.setStep(cst1);
}
Value newLowerBound =
isZeroBased ? lowerBound : boundsBuilder.create<ConstantIndexOp>(loc, 0);
Value newStep =
isStepOne ? step : boundsBuilder.create<ConstantIndexOp>(loc, 1);
// Insert code computing the value of the original loop induction variable
// from the "normalized" one.
builder.setInsertionPointToStart(inner.getBody());
Value scaled =
isStepOne ? loop.getInductionVar()
: builder.create<MulIOp>(loc, loop.getInductionVar(), step);
isStepOne ? inductionVar
: insideLoopBuilder.create<MulIOp>(loc, inductionVar, step);
Value shifted =
isZeroBased ? scaled : builder.create<AddIOp>(loc, scaled, lb);
isZeroBased ? scaled
: insideLoopBuilder.create<AddIOp>(loc, scaled, lowerBound);
SmallPtrSet<Operation *, 2> preserve{scaled.getDefiningOp(),
shifted.getDefiningOp()};
replaceAllUsesExcept(loop.getInductionVar(), shifted, preserve);
replaceAllUsesExcept(inductionVar, shifted, preserve);
return {/*lowerBound=*/newLowerBound, /*upperBound=*/newUpperBound,
/*step=*/newStep};
}
/// Transform a loop with a strictly positive step
/// for %i = %lb to %ub step %s
/// into a 0-based loop with step 1
/// for %ii = 0 to ceildiv(%ub - %lb, %s) step 1 {
/// %i = %ii * %s + %lb
/// Insert the induction variable remapping in the body of `inner`, which is
/// expected to be either `loop` or another loop perfectly nested under `loop`.
/// Insert the definition of new bounds immediately before `outer`, which is
/// expected to be either `loop` or its parent in the loop nest.
static void normalizeLoop(loop::ForOp loop, loop::ForOp outer,
loop::ForOp inner) {
OpBuilder builder(outer);
OpBuilder innerBuilder(inner.getBody(), inner.getBody()->begin());
auto loopPieces =
normalizeLoop(builder, innerBuilder, loop.getLoc(), loop.lowerBound(),
loop.upperBound(), loop.step(), loop.getInductionVar());
loop.setLowerBound(loopPieces.lowerBound);
loop.setUpperBound(loopPieces.upperBound);
loop.setStep(loopPieces.step);
}
void mlir::coalesceLoops(MutableArrayRef<loop::ForOp> loops) {
@@ -1214,6 +1233,86 @@ void mlir::coalesceLoops(MutableArrayRef<loop::ForOp> loops) {
second.erase();
}
void mlir::collapsePLoops(loop::ParallelOp loops,
ArrayRef<std::vector<unsigned>> combinedDimensions) {
OpBuilder outsideBuilder(loops);
Location loc = loops.getLoc();
// Normalize ParallelOp's iteration pattern.
SmallVector<Value, 3> normalizedLowerBounds;
SmallVector<Value, 3> normalizedSteps;
SmallVector<Value, 3> normalizedUpperBounds;
for (unsigned i = 0, e = loops.getNumLoops(); i < e; ++i) {
OpBuilder insideLoopBuilder(loops.getBody(), loops.getBody()->begin());
auto resultBounds =
normalizeLoop(outsideBuilder, insideLoopBuilder, loc,
loops.lowerBound()[i], loops.upperBound()[i],
loops.step()[i], loops.getBody()->getArgument(i));
normalizedLowerBounds.push_back(resultBounds.lowerBound);
normalizedUpperBounds.push_back(resultBounds.upperBound);
normalizedSteps.push_back(resultBounds.step);
}
// Combine iteration spaces
SmallVector<Value, 3> lowerBounds;
SmallVector<Value, 3> steps;
SmallVector<Value, 3> upperBounds;
auto cst0 = outsideBuilder.create<ConstantIndexOp>(loc, 0);
auto cst1 = outsideBuilder.create<ConstantIndexOp>(loc, 1);
for (unsigned i = 0, e = combinedDimensions.size(); i < e; ++i) {
Value newUpperBound = outsideBuilder.create<ConstantIndexOp>(loc, 1);
for (auto idx : combinedDimensions[i]) {
newUpperBound = outsideBuilder.create<MulIOp>(loc, newUpperBound,
normalizedUpperBounds[idx]);
}
lowerBounds.push_back(cst0);
steps.push_back(cst1);
upperBounds.push_back(newUpperBound);
}
// Create the new ParallelLoop and, in its body, insert conversions back to
// the original induction values. For each combined dimension, the loop below
// takes a remainder of the new induction value to determine the iteration of
// one original dimension, then divides to prepare the value for the next
// dimension. The recovered values are normalized (zero-based, unit step); the
// remapping inserted by normalizeLoop above converts them back to the
// original induction values.
auto newPloop = outsideBuilder.create<loop::ParallelOp>(loc, lowerBounds,
upperBounds, steps);
OpBuilder insideBuilder(newPloop.getBody(), newPloop.getBody()->begin());
for (unsigned i = 0, e = combinedDimensions.size(); i < e; ++i) {
Value previous = newPloop.getBody()->getArgument(i);
unsigned numberCombinedDimensions = combinedDimensions[i].size();
// Iterate over all except the last induction value.
for (unsigned j = 0, e = numberCombinedDimensions - 1; j < e; ++j) {
unsigned idx = combinedDimensions[i][j];
// Determine the iteration of the current original induction value.
Value iv = insideBuilder.create<SignedRemIOp>(loc, previous,
normalizedUpperBounds[idx]);
replaceAllUsesInRegionWith(loops.getBody()->getArgument(idx), iv,
loops.region());
// Remove the effect of the current induction value to prepare for the
// next value.
previous = insideBuilder.create<SignedDivIOp>(
loc, previous, normalizedUpperBounds[idx + 1]);
}
// The final induction value is just the remaining value.
unsigned idx = combinedDimensions[i][numberCombinedDimensions - 1];
replaceAllUsesInRegionWith(loops.getBody()->getArgument(idx), previous,
loops.region());
}
// Replace the old loop with the new loop.
loops.getBody()->back().erase();
newPloop.getBody()->getOperations().splice(
Block::iterator(newPloop.getBody()->back()),
loops.getBody()->getOperations());
loops.erase();
}
void mlir::mapLoopToProcessorIds(loop::ForOp forOp, ArrayRef<Value> processorId,
ArrayRef<Value> numProcessors) {
assert(processorId.size() == numProcessors.size());

View File

@@ -0,0 +1,52 @@
// RUN: mlir-opt %s -pass-pipeline='func(parallel-loop-collapsing{collapsed-indices-0=0,3 collapsed-indices-1=1,4 collapsed-indices-2=2}, canonicalize)' | FileCheck %s
// CHECK-LABEL: func @parallel_many_dims() {
func @parallel_many_dims() {
// CHECK: [[VAL_0:%.*]] = constant 6 : index
// CHECK: [[VAL_1:%.*]] = constant 7 : index
// CHECK: [[VAL_2:%.*]] = constant 9 : index
// CHECK: [[VAL_3:%.*]] = constant 10 : index
// CHECK: [[VAL_4:%.*]] = constant 12 : index
// CHECK: [[VAL_5:%.*]] = constant 13 : index
// CHECK: [[VAL_6:%.*]] = constant 3 : index
// CHECK: [[VAL_7:%.*]] = constant 0 : index
// CHECK: [[VAL_8:%.*]] = constant 1 : index
// CHECK: [[VAL_9:%.*]] = constant 2 : index
%c0 = constant 0 : index
%c1 = constant 1 : index
%c2 = constant 2 : index
%c3 = constant 3 : index
%c4 = constant 4 : index
%c5 = constant 5 : index
%c6 = constant 6 : index
%c7 = constant 7 : index
%c8 = constant 8 : index
%c9 = constant 9 : index
%c10 = constant 10 : index
%c11 = constant 11 : index
%c12 = constant 12 : index
%c13 = constant 13 : index
%c14 = constant 14 : index
// CHECK: loop.parallel ([[VAL_10:%.*]], [[VAL_11:%.*]], [[VAL_12:%.*]]) = ([[VAL_7]], [[VAL_7]], [[VAL_7]]) to ([[VAL_9]], [[VAL_8]], [[VAL_8]]) step ([[VAL_8]], [[VAL_8]], [[VAL_8]]) {
loop.parallel (%i0, %i1, %i2, %i3, %i4) = (%c0, %c3, %c6, %c9, %c12) to (%c2, %c5, %c8, %c11, %c14)
step (%c1, %c4, %c7, %c10, %c13) {
// CHECK: [[VAL_13:%.*]] = remi_signed [[VAL_10]], [[VAL_9]] : index
// CHECK: [[VAL_14:%.*]] = divi_signed [[VAL_10]], [[VAL_8]] : index
// CHECK: [[VAL_15:%.*]] = divi_signed [[VAL_11]], [[VAL_8]] : index
// CHECK: [[VAL_16:%.*]] = muli [[VAL_15]], [[VAL_5]] : index
// CHECK: [[VAL_17:%.*]] = addi [[VAL_16]], [[VAL_4]] : index
// CHECK: [[VAL_18:%.*]] = muli [[VAL_14]], [[VAL_3]] : index
// CHECK: [[VAL_19:%.*]] = addi [[VAL_18]], [[VAL_2]] : index
// CHECK: [[VAL_20:%.*]] = muli [[VAL_12]], [[VAL_1]] : index
// CHECK: [[VAL_21:%.*]] = addi [[VAL_20]], [[VAL_0]] : index
// CHECK: [[VAL_22:%.*]] = "magic.op"([[VAL_13]], [[VAL_6]], [[VAL_21]], [[VAL_19]], [[VAL_17]]) : (index, index, index, index, index) -> index
%result = "magic.op"(%i0, %i1, %i2, %i3, %i4): (index, index, index, index, index) -> index
}
return
}
// CHECK: loop.yield
// CHECK: }
// CHECK: return
// CHECK: }

View File

@@ -0,0 +1,36 @@
// RUN: mlir-opt %s -pass-pipeline='func(parallel-loop-collapsing{collapsed-indices-0=0,1}, canonicalize)' | FileCheck %s
// CHECK-LABEL: func @collapse_to_single() {
func @collapse_to_single() {
// CHECK: [[VAL_0:%.*]] = constant 7 : index
// CHECK: [[VAL_1:%.*]] = constant 4 : index
// CHECK: [[VAL_2:%.*]] = constant 18 : index
// CHECK: [[VAL_3:%.*]] = constant 3 : index
// CHECK: [[VAL_4:%.*]] = constant 6 : index
// CHECK: [[VAL_5:%.*]] = constant 0 : index
// CHECK: [[VAL_6:%.*]] = constant 1 : index
%c0 = constant 3 : index
%c1 = constant 7 : index
%c2 = constant 11 : index
%c3 = constant 29 : index
%c4 = constant 3 : index
%c5 = constant 4 : index
// CHECK: loop.parallel ([[VAL_7:%.*]]) = ([[VAL_5]]) to ([[VAL_2]]) step ([[VAL_6]]) {
loop.parallel (%i0, %i1) = (%c0, %c1) to (%c2, %c3) step (%c4, %c5) {
// CHECK: [[VAL_8:%.*]] = remi_signed [[VAL_7]], [[VAL_3]] : index
// CHECK: [[VAL_9:%.*]] = divi_signed [[VAL_7]], [[VAL_4]] : index
// CHECK: [[VAL_10:%.*]] = muli [[VAL_9]], [[VAL_1]] : index
// CHECK: [[VAL_11:%.*]] = addi [[VAL_10]], [[VAL_0]] : index
// CHECK: [[VAL_12:%.*]] = muli [[VAL_8]], [[VAL_3]] : index
// CHECK: [[VAL_13:%.*]] = addi [[VAL_12]], [[VAL_3]] : index
// CHECK: [[VAL_14:%.*]] = "magic.op"([[VAL_13]], [[VAL_11]]) : (index, index) -> index
%result = "magic.op"(%i0, %i1): (index, index) -> index
}
return
}
// CHECK: loop.yield
// CHECK: }
// CHECK: return
// CHECK: }