forked from OSchip/llvm-project
236 lines
9.2 KiB
C++
236 lines
9.2 KiB
C++
//===- LoopUnrollAndJam.cpp - Code to perform loop unroll and jam ---------===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
//
|
|
// This file implements loop unroll and jam. Unroll and jam is a transformation
|
|
// that improves locality, in particular, register reuse, while also improving
|
|
// operation level parallelism. The example below shows what it does in nearly
|
|
// the general case. Loop unroll and jam currently works if the bounds of the
|
|
// loops inner to the loop being unroll-jammed do not depend on the latter.
|
|
//
|
|
// Before After unroll and jam of i by factor 2:
|
|
//
|
|
// for i, step = 2
|
|
// for i S1(i);
|
|
// S1; S2(i);
|
|
// S2; S1(i+1);
|
|
// for j S2(i+1);
|
|
// S3; for j
|
|
// S4; S3(i, j);
|
|
// S5; S4(i, j);
|
|
// S6; S3(i+1, j)
|
|
// S4(i+1, j)
|
|
// S5(i);
|
|
// S6(i);
|
|
// S5(i+1);
|
|
// S6(i+1);
|
|
//
|
|
// Note: 'if/else' blocks are not jammed. So, if there are loops inside if
|
|
// op's, bodies of those loops will not be jammed.
|
|
//===----------------------------------------------------------------------===//
|
|
#include "mlir/Transforms/Passes.h"
|
|
|
|
#include "mlir/Analysis/LoopAnalysis.h"
|
|
#include "mlir/Dialect/AffineOps/AffineOps.h"
|
|
#include "mlir/IR/AffineExpr.h"
|
|
#include "mlir/IR/AffineMap.h"
|
|
#include "mlir/IR/BlockAndValueMapping.h"
|
|
#include "mlir/IR/Builders.h"
|
|
#include "mlir/Pass/Pass.h"
|
|
#include "mlir/Transforms/LoopUtils.h"
|
|
#include "llvm/ADT/DenseMap.h"
|
|
#include "llvm/Support/CommandLine.h"
|
|
|
|
using namespace mlir;
|
|
|
|
#define DEBUG_TYPE "affine-loop-unroll-jam"
|
|
|
|
static llvm::cl::OptionCategory clOptionsCategory(DEBUG_TYPE " options");
|
|
|
|
// Loop unroll and jam factor.
|
|
static llvm::cl::opt<unsigned>
|
|
clUnrollJamFactor("unroll-jam-factor", llvm::cl::Hidden,
|
|
llvm::cl::desc("Use this unroll jam factor for all loops"
|
|
" (default 4)"),
|
|
llvm::cl::cat(clOptionsCategory));
|
|
|
|
namespace {
|
|
/// Loop unroll jam pass. Currently, this just unroll jams the first
|
|
/// outer loop in a Function.
|
|
struct LoopUnrollAndJam : public FunctionPass<LoopUnrollAndJam> {
|
|
Optional<unsigned> unrollJamFactor;
|
|
static const unsigned kDefaultUnrollJamFactor = 4;
|
|
|
|
explicit LoopUnrollAndJam(Optional<unsigned> unrollJamFactor = None)
|
|
: unrollJamFactor(unrollJamFactor) {}
|
|
|
|
void runOnFunction() override;
|
|
LogicalResult runOnAffineForOp(AffineForOp forOp);
|
|
};
|
|
} // end anonymous namespace
|
|
|
|
std::unique_ptr<OpPassBase<FuncOp>>
|
|
mlir::createLoopUnrollAndJamPass(int unrollJamFactor) {
|
|
return std::make_unique<LoopUnrollAndJam>(
|
|
unrollJamFactor == -1 ? None : Optional<unsigned>(unrollJamFactor));
|
|
}
|
|
|
|
void LoopUnrollAndJam::runOnFunction() {
|
|
// Currently, just the outermost loop from the first loop nest is
|
|
// unroll-and-jammed by this pass. However, runOnAffineForOp can be called on
|
|
// any for operation.
|
|
auto &entryBlock = getFunction().front();
|
|
if (auto forOp = dyn_cast<AffineForOp>(entryBlock.front()))
|
|
runOnAffineForOp(forOp);
|
|
}
|
|
|
|
/// Unroll and jam a 'affine.for' op. Default unroll jam factor is
|
|
/// kDefaultUnrollJamFactor. Return failure if nothing was done.
|
|
LogicalResult LoopUnrollAndJam::runOnAffineForOp(AffineForOp forOp) {
|
|
// Unroll and jam by the factor that was passed if any.
|
|
if (unrollJamFactor.hasValue())
|
|
return loopUnrollJamByFactor(forOp, unrollJamFactor.getValue());
|
|
// Otherwise, unroll jam by the command-line factor if one was specified.
|
|
if (clUnrollJamFactor.getNumOccurrences() > 0)
|
|
return loopUnrollJamByFactor(forOp, clUnrollJamFactor);
|
|
|
|
// Unroll and jam by four otherwise.
|
|
return loopUnrollJamByFactor(forOp, kDefaultUnrollJamFactor);
|
|
}
|
|
|
|
LogicalResult mlir::loopUnrollJamUpToFactor(AffineForOp forOp,
|
|
uint64_t unrollJamFactor) {
|
|
Optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
|
|
|
|
if (mayBeConstantTripCount.hasValue() &&
|
|
mayBeConstantTripCount.getValue() < unrollJamFactor)
|
|
return loopUnrollJamByFactor(forOp, mayBeConstantTripCount.getValue());
|
|
return loopUnrollJamByFactor(forOp, unrollJamFactor);
|
|
}
|
|
|
|
/// Unrolls and jams this loop by the specified factor.
|
|
LogicalResult mlir::loopUnrollJamByFactor(AffineForOp forOp,
|
|
uint64_t unrollJamFactor) {
|
|
// Gathers all maximal sub-blocks of operations that do not themselves
|
|
// include a for op (a operation could have a descendant for op though
|
|
// in its tree). Ignore the block terminators.
|
|
struct JamBlockGatherer {
|
|
// Store iterators to the first and last op of each sub-block found.
|
|
std::vector<std::pair<Block::iterator, Block::iterator>> subBlocks;
|
|
|
|
// This is a linear time walk.
|
|
void walk(Operation *op) {
|
|
for (auto ®ion : op->getRegions())
|
|
for (auto &block : region)
|
|
walk(block);
|
|
}
|
|
void walk(Block &block) {
|
|
for (auto it = block.begin(), e = std::prev(block.end()); it != e;) {
|
|
auto subBlockStart = it;
|
|
while (it != e && !isa<AffineForOp>(&*it))
|
|
++it;
|
|
if (it != subBlockStart)
|
|
subBlocks.push_back({subBlockStart, std::prev(it)});
|
|
// Process all for insts that appear next.
|
|
while (it != e && isa<AffineForOp>(&*it))
|
|
walk(&*it++);
|
|
}
|
|
}
|
|
};
|
|
|
|
assert(unrollJamFactor >= 1 && "unroll jam factor should be >= 1");
|
|
|
|
if (unrollJamFactor == 1)
|
|
return promoteIfSingleIteration(forOp);
|
|
|
|
if (forOp.getBody()->empty() ||
|
|
forOp.getBody()->begin() == std::prev(forOp.getBody()->end()))
|
|
return failure();
|
|
|
|
// Loops where both lower and upper bounds are multi-result maps won't be
|
|
// unrolled (since the trip can't be expressed as an affine function in
|
|
// general).
|
|
// TODO(mlir-team): this may not be common, but we could support the case
|
|
// where the lower bound is a multi-result map and the ub is a single result
|
|
// one.
|
|
if (forOp.getLowerBoundMap().getNumResults() != 1)
|
|
return failure();
|
|
|
|
Optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
|
|
// If the trip count is lower than the unroll jam factor, no unroll jam.
|
|
if (mayBeConstantTripCount.hasValue() &&
|
|
mayBeConstantTripCount.getValue() < unrollJamFactor)
|
|
return failure();
|
|
|
|
auto *forInst = forOp.getOperation();
|
|
|
|
// Gather all sub-blocks to jam upon the loop being unrolled.
|
|
JamBlockGatherer jbg;
|
|
jbg.walk(forInst);
|
|
auto &subBlocks = jbg.subBlocks;
|
|
|
|
// Generate the cleanup loop if trip count isn't a multiple of
|
|
// unrollJamFactor.
|
|
if (getLargestDivisorOfTripCount(forOp) % unrollJamFactor != 0) {
|
|
// Insert the cleanup loop right after 'forOp'.
|
|
OpBuilder builder(forInst->getBlock(), std::next(Block::iterator(forInst)));
|
|
auto cleanupAffineForOp = cast<AffineForOp>(builder.clone(*forInst));
|
|
// Adjust the lower bound of the cleanup loop; its upper bound is the same
|
|
// as the original loop's upper bound.
|
|
AffineMap cleanupMap;
|
|
SmallVector<Value, 4> cleanupOperands;
|
|
getCleanupLoopLowerBound(forOp, unrollJamFactor, &cleanupMap,
|
|
&cleanupOperands, builder);
|
|
cleanupAffineForOp.setLowerBound(cleanupOperands, cleanupMap);
|
|
|
|
// Promote the cleanup loop if it has turned into a single iteration loop.
|
|
promoteIfSingleIteration(cleanupAffineForOp);
|
|
|
|
// Adjust the upper bound of the original loop - it will be the same as the
|
|
// cleanup loop's lower bound. Its lower bound remains unchanged.
|
|
forOp.setUpperBound(cleanupOperands, cleanupMap);
|
|
}
|
|
|
|
// Scale the step of loop being unroll-jammed by the unroll-jam factor.
|
|
int64_t step = forOp.getStep();
|
|
forOp.setStep(step * unrollJamFactor);
|
|
|
|
auto forOpIV = forOp.getInductionVar();
|
|
// Unroll and jam (appends unrollJamFactor - 1 additional copies).
|
|
for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
|
|
// Operand map persists across all sub-blocks.
|
|
BlockAndValueMapping operandMapping;
|
|
for (auto &subBlock : subBlocks) {
|
|
// Builder to insert unroll-jammed bodies. Insert right at the end of
|
|
// sub-block.
|
|
OpBuilder builder(subBlock.first->getBlock(), std::next(subBlock.second));
|
|
|
|
// If the induction variable is used, create a remapping to the value for
|
|
// this unrolled instance.
|
|
if (!forOpIV.use_empty()) {
|
|
// iv' = iv + i, i = 1 to unrollJamFactor-1.
|
|
auto d0 = builder.getAffineDimExpr(0);
|
|
auto bumpMap = AffineMap::get(1, 0, {d0 + i * step});
|
|
auto ivUnroll =
|
|
builder.create<AffineApplyOp>(forInst->getLoc(), bumpMap, forOpIV);
|
|
operandMapping.map(forOpIV, ivUnroll);
|
|
}
|
|
// Clone the sub-block being unroll-jammed.
|
|
for (auto it = subBlock.first; it != std::next(subBlock.second); ++it) {
|
|
builder.clone(*it, operandMapping);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Promote the loop body up if this has turned into a single iteration loop.
|
|
promoteIfSingleIteration(forOp);
|
|
return success();
|
|
}
|
|
|
|
static PassRegistration<LoopUnrollAndJam> pass("affine-loop-unroll-jam",
|
|
"Unroll and jam loops");
|