[MLIR][Affine][VectorOps] Utility to vectorize loop nest using strategy
This patch adds a utility based on SuperVectorizer to vectorize an affine loop nest using a given vectorization strategy. The strategy allows targeting specific loops for vectorization instead of relying on the SuperVectorizer analysis to choose the right loops to vectorize.

Reviewed By: nicolasvasilache

Differential Revision: https://reviews.llvm.org/D85869
commit 14d0735d34 (parent 1747f77764)
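The diff below first adds the new declarations to the Affine dialect's Utils.h header, then reworks SuperVectorize.cpp to implement them, and finally updates the super-vectorizer tests. As a point of orientation, here is a minimal sketch of how a caller might drive the new utility. The helper name, the include paths, and the way the two loops are obtained are illustrative assumptions; only VectorizationStrategy and vectorizeAffineLoopNest are the entry points added by this patch.

// Hypothetical driver, not part of this patch: vectorize a 2-D nest
// `outer`/`inner` with factors 4 (outer loop) and 8 (inner loop).
#include <vector>

#include "mlir/Dialect/Affine/IR/AffineOps.h" // assumed include path for AffineForOp
#include "mlir/Dialect/Affine/Utils.h"        // declares the utility added here

using namespace mlir;

static LogicalResult vectorizeInnerPair(AffineForOp outer, AffineForOp inner) {
  VectorizationStrategy strategy;
  // One vectorization factor per target vector dimension.
  strategy.vectorSizes.assign({4, 8});
  // Map each candidate loop to the vector dimension it will implement.
  strategy.loopToVectorDim[outer.getOperation()] = 0;
  strategy.loopToVectorDim[inner.getOperation()] = 1;

  // The 2-D container encodes relative nesting: loops[0] holds the single
  // root loop, loops[1] holds the loops nested under it.
  std::vector<SmallVector<AffineForOp, 2>> loops(2);
  loops[0].push_back(outer);
  loops[1].push_back(inner);
  return vectorizeAffineLoopNest(loops, strategy);
}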
@ -14,6 +14,8 @@
#define MLIR_DIALECT_AFFINE_UTILS_H

#include "mlir/Support/LLVM.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"

namespace mlir {

@@ -34,6 +36,47 @@ void affineParallelize(AffineForOp forOp);
/// significant code expansion in some cases.
LogicalResult hoistAffineIfOp(AffineIfOp ifOp, bool *folded = nullptr);

/// Holds parameters to perform n-D vectorization on a single loop nest.
/// For example, for the following loop nest:
///
/// func @vec2d(%in: memref<64x128x512xf32>, %out: memref<64x128x512xf32>) {
///   affine.for %i0 = 0 to 64 {
///     affine.for %i1 = 0 to 128 {
///       affine.for %i2 = 0 to 512 {
///         %ld = affine.load %in[%i0, %i1, %i2] : memref<64x128x512xf32>
///         affine.store %ld, %out[%i0, %i1, %i2] : memref<64x128x512xf32>
///       }
///     }
///   }
///   return
/// }
///
/// and VectorizationStrategy = 'vectorSizes = {8, 4}', 'loopToVectorDim =
/// {{i1->0}, {i2->1}}', SuperVectorizer will generate:
///
/// func @vec2d(%arg0: memref<64x128x512xf32>, %arg1: memref<64x128x512xf32>) {
///   affine.for %arg2 = 0 to 64 {
///     affine.for %arg3 = 0 to 128 step 8 {
///       affine.for %arg4 = 0 to 512 step 4 {
///         %cst = constant 0.000000e+00 : f32
///         %0 = vector.transfer_read %arg0[%arg2, %arg3, %arg4], %cst : ...
///         vector.transfer_write %0, %arg1[%arg2, %arg3, %arg4] : ...
///       }
///     }
///   }
///   return
/// }
// TODO: Hoist to a VectorizationStrategy.cpp when appropriate.
struct VectorizationStrategy {
  // Vectorization factors to apply to each target vector dimension.
  // Each factor will be applied to a different loop.
  SmallVector<int64_t, 8> vectorSizes;
  // Maps each AffineForOp vectorization candidate with its vector dimension.
  // The candidate will be vectorized using the vectorization factor in
  // 'vectorSizes' for that dimension.
  DenseMap<Operation *, unsigned> loopToVectorDim;
};

/// Vectorizes affine loops in 'loops' using the n-D vectorization factors in
/// 'vectorSizes'. By default, each vectorization factor is applied
/// inner-to-outer to the loops of each loop nest. 'fastestVaryingPattern' can
@@ -43,6 +86,45 @@ void vectorizeAffineLoops(
    llvm::DenseSet<Operation *, DenseMapInfo<Operation *>> &loops,
    ArrayRef<int64_t> vectorSizes, ArrayRef<int64_t> fastestVaryingPattern);

/// External utility to vectorize affine loops from a single loop nest using an
/// n-D vectorization strategy (see doc in VectorizationStrategy definition).
/// Loops are provided in a 2D vector container. The first dimension represents
/// the nesting level relative to the loops to be vectorized. The second
/// dimension contains the loops. This means that:
///   a) every loop in 'loops[i]' must have a parent loop in 'loops[i-1]',
///   b) a loop in 'loops[i]' may or may not have a child loop in 'loops[i+1]'.
///
/// For example, for the following loop nest:
///
/// func @vec2d(%in0: memref<64x128x512xf32>, %in1: memref<64x128x128xf32>,
///             %out0: memref<64x128x512xf32>,
///             %out1: memref<64x128x128xf32>) {
///   affine.for %i0 = 0 to 64 {
///     affine.for %i1 = 0 to 128 {
///       affine.for %i2 = 0 to 512 {
///         %ld = affine.load %in0[%i0, %i1, %i2] : memref<64x128x512xf32>
///         affine.store %ld, %out0[%i0, %i1, %i2] : memref<64x128x512xf32>
///       }
///       affine.for %i3 = 0 to 128 {
///         %ld = affine.load %in1[%i0, %i1, %i3] : memref<64x128x128xf32>
///         affine.store %ld, %out1[%i0, %i1, %i3] : memref<64x128x128xf32>
///       }
///     }
///   }
///   return
/// }
///
/// loops = {{%i0}, {%i2, %i3}}, to vectorize the outermost and the two
/// innermost loops;
/// loops = {{%i1}, {%i2, %i3}}, to vectorize the middle and the two innermost
/// loops;
/// loops = {{%i2}}, to vectorize only the first innermost loop;
/// loops = {{%i3}}, to vectorize only the second innermost loop;
/// loops = {{%i1}}, to vectorize only the middle loop.
LogicalResult
vectorizeAffineLoopNest(const std::vector<SmallVector<AffineForOp, 2>> &loops,
                        const VectorizationStrategy &strategy);

/// Normalize an affine.parallel op so that lower bounds are 0 and steps are 1.
/// As currently implemented, this transformation cannot fail and will return
/// early if the op is already in a normalized form.
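To make the container convention above concrete, here is a short sketch (an editorial illustration, not part of the patch) that realizes the documented loops = {{%i1}, {%i2, %i3}} case: the root of the nest to vectorize goes to level 0 and every affine.for nested under it goes to level 1, which is only valid for the three-level example shown above. The helper name and the walk-based collection are assumptions.

// Illustrative helper (hypothetical): vectorize the middle loop of the example
// by 8 and both innermost loops by 4, i.e. loops = {{%i1}, {%i2, %i3}} with
// vectorSizes = {8, 4} and loopToVectorDim = {%i1->0, %i2->1, %i3->1}.
static LogicalResult vectorizeMiddleAndInnermost(AffineForOp middleLoop) {
  std::vector<SmallVector<AffineForOp, 2>> loops(2);
  loops[0].push_back(middleLoop);
  // In the example above, every loop nested under the root sits exactly one
  // relative level below it, so they all land in loops[1].
  middleLoop.getBody()->walk(
      [&](AffineForOp nested) { loops[1].push_back(nested); });

  VectorizationStrategy strategy;
  strategy.vectorSizes.assign({8, 4});
  strategy.loopToVectorDim[middleLoop.getOperation()] = 0;
  for (AffineForOp nested : loops[1])
    strategy.loopToVectorDim[nested.getOperation()] = 1;
  return vectorizeAffineLoopNest(loops, strategy);
}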
@@ -254,8 +254,8 @@ using namespace vector;
///      interference);
///   3. Then, for each pattern in order:
///      a. applying iterative rewriting of the loop and the load operations in
///         DFS postorder. Rewriting is implemented by coarsening the loops and
///         turning load operations into opaque vector.transfer_read ops;
///         inner-to-outer order. Rewriting is implemented by coarsening the loops
///         and turning load operations into opaque vector.transfer_read ops;
///      b. keeping track of the load operations encountered as "roots" and the
///         store operations as "terminals";
///      c. traversing the use-def chains starting from the roots and iteratively

@@ -584,17 +584,6 @@ Vectorize::Vectorize(ArrayRef<int64_t> virtualVectorSize) {
  vectorSizes = virtualVectorSize;
}

/////// TODO: Hoist to a VectorizationStrategy.cpp when appropriate.
/////////
namespace {

struct VectorizationStrategy {
  SmallVector<int64_t, 8> vectorSizes;
  DenseMap<Operation *, unsigned> loopToVectorDim;
};

} // end anonymous namespace

static void vectorizeLoopIfProfitable(Operation *loop, unsigned depthInPattern,
                                      unsigned patternDepth,
                                      VectorizationStrategy *strategy) {
@@ -857,44 +846,44 @@ isVectorizableLoopPtrFactory(const DenseSet<Operation *> &parallelLoops,
  };
}

/// Apply vectorization of `loop` according to `state`. This is only triggered
/// if all vectorizations in `childrenMatches` have already succeeded
/// recursively in DFS post-order.
/// Apply vectorization of `loop` according to `state`. `loops` are processed in
/// inner-to-outer order to ensure that all the children loops have already been
/// vectorized before vectorizing the parent loop.
static LogicalResult
vectorizeLoopsAndLoadsRecursively(NestedMatch oneMatch,
                                  VectorizationState *state) {
  auto *loopInst = oneMatch.getMatchedOperation();
  auto loop = cast<AffineForOp>(loopInst);
  auto childrenMatches = oneMatch.getMatchedChildren();
vectorizeLoopsAndLoads(std::vector<SmallVector<AffineForOp, 2>> &loops,
                       VectorizationState *state) {
  // Vectorize loops in inner-to-outer order. If any child fails, the parent
  // will fail too.
  for (auto &loopsInLevel : llvm::reverse(loops)) {
    for (AffineForOp loop : loopsInLevel) {
      // 1. This loop may have been omitted from vectorization for various
      // reasons (e.g. due to the performance model or pattern depth > vector
      // size).
      auto it = state->strategy->loopToVectorDim.find(loop.getOperation());
      if (it == state->strategy->loopToVectorDim.end())
        continue;

  // 1. DFS postorder recursion, if any of my children fails, I fail too.
  for (auto m : childrenMatches) {
    if (failed(vectorizeLoopsAndLoadsRecursively(m, state))) {
      return failure();
    }
      // 2. Actual inner-to-outer transformation.
      auto vectorDim = it->second;
      assert(vectorDim < state->strategy->vectorSizes.size() &&
             "vector dim overflow");
      //   a. get actual vector size
      auto vectorSize = state->strategy->vectorSizes[vectorDim];
      //   b. loop transformation for early vectorization is still subject to
      //      exploratory tradeoffs (see top of the file). Apply coarsening,
      //      i.e.:
      //   | ub -> ub
      //   | step -> step * vectorSize
      LLVM_DEBUG(dbgs() << "\n[early-vect] vectorizeForOp by " << vectorSize
                        << " : \n"
                        << loop);
      if (failed(
              vectorizeAffineForOp(loop, loop.getStep() * vectorSize, state)))
        return failure();
    } // end for.
  }

  // 2. This loop may have been omitted from vectorization for various reasons
  // (e.g. due to the performance model or pattern depth > vector size).
  auto it = state->strategy->loopToVectorDim.find(loopInst);
  if (it == state->strategy->loopToVectorDim.end()) {
    return success();
  }

  // 3. Actual post-order transformation.
  auto vectorDim = it->second;
  assert(vectorDim < state->strategy->vectorSizes.size() &&
         "vector dim overflow");
  //   a. get actual vector size
  auto vectorSize = state->strategy->vectorSizes[vectorDim];
  //   b. loop transformation for early vectorization is still subject to
  //      exploratory tradeoffs (see top of the file). Apply coarsening, i.e.:
  //   | ub -> ub
  //   | step -> step * vectorSize
  LLVM_DEBUG(dbgs() << "\n[early-vect] vectorizeForOp by " << vectorSize
                    << " : ");
  LLVM_DEBUG(loopInst->print(dbgs()));
  return vectorizeAffineForOp(loop, loop.getStep() * vectorSize, state);
  return success();
}

/// Tries to transform a scalar constant into a vector splat of that constant.
@@ -1145,16 +1134,46 @@ static LogicalResult vectorizeNonTerminals(VectorizationState *state) {
  return success();
}

/// Vectorization is a recursive procedure where anything below can fail.
/// The root match thus needs to maintain a clone for handling failure.
/// Each root may succeed independently but will otherwise clean after itself if
/// anything below it fails.
static LogicalResult vectorizeRootMatch(NestedMatch m,
                                        VectorizationStrategy *strategy) {
  auto loop = cast<AffineForOp>(m.getMatchedOperation());
  OperationFolder folder(loop.getContext());
/// Recursive implementation to convert all the nested loops in 'match' to a 2D
/// vector container that preserves the relative nesting level of each loop with
/// respect to the others in 'match'. 'currentLevel' is the nesting level that
/// will be assigned to the loop in the current 'match'.
static void
getMatchedAffineLoopsRec(NestedMatch match, unsigned currentLevel,
                         std::vector<SmallVector<AffineForOp, 2>> &loops) {
  // Add a new empty level to the output if it doesn't exist already.
  assert(currentLevel <= loops.size() && "Unexpected currentLevel");
  if (currentLevel == loops.size())
    loops.push_back(SmallVector<AffineForOp, 2>());

  // Add current match and recursively visit its children.
  loops[currentLevel].push_back(cast<AffineForOp>(match.getMatchedOperation()));
  for (auto childMatch : match.getMatchedChildren()) {
    getMatchedAffineLoopsRec(childMatch, currentLevel + 1, loops);
  }
}

/// Converts all the nested loops in 'match' to a 2D vector container that
/// preserves the relative nesting level of each loop with respect to the others
/// in 'match'. This means that every loop in 'loops[i]' will have a parent loop
/// in 'loops[i-1]'. A loop in 'loops[i]' may or may not have a child loop in
/// 'loops[i+1]'.
static void
getMatchedAffineLoops(NestedMatch match,
                      std::vector<SmallVector<AffineForOp, 2>> &loops) {
  getMatchedAffineLoopsRec(match, /*currLoopDepth=*/0, loops);
}

/// Internal implementation to vectorize affine loops from a single loop nest
/// using an n-D vectorization strategy.
static LogicalResult
vectorizeLoopNest(std::vector<SmallVector<AffineForOp, 2>> &loops,
                  const VectorizationStrategy &strategy) {
  assert(loops[0].size() == 1 && "Expected single root loop");
  AffineForOp rootLoop = loops[0][0];
  OperationFolder folder(rootLoop.getContext());
  VectorizationState state;
  state.strategy = strategy;
  state.strategy = &strategy;
  state.folder = &folder;

  // Since patterns are recursive, they can very well intersect.
@@ -1164,7 +1183,7 @@ static LogicalResult vectorizeRootMatch(NestedMatch m,
  // vectorizable. If a pattern is not vectorizable anymore, we just skip it.
  // TODO: implement a non-greedy profitability analysis that keeps only
  // non-intersecting patterns.
  if (!isVectorizableLoopBody(loop, vectorTransferPattern())) {
  if (!isVectorizableLoopBody(rootLoop, vectorTransferPattern())) {
    LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ loop is not vectorizable");
    return failure();
  }
@@ -1172,7 +1191,7 @@ static LogicalResult vectorizeRootMatch(NestedMatch m,
  /// Sets up error handling for this root loop. This is how the root match
  /// maintains a clone for handling failure and restores the proper state via
  /// RAII.
  auto *loopInst = loop.getOperation();
  auto *loopInst = rootLoop.getOperation();
  OpBuilder builder(loopInst);
  auto clonedLoop = cast<AffineForOp>(builder.clone(*loopInst));
  struct Guard {
@@ -1187,17 +1206,17 @@ static LogicalResult vectorizeRootMatch(NestedMatch m,
    }
    AffineForOp loop;
    AffineForOp clonedLoop;
  } guard{loop, clonedLoop};
  } guard{rootLoop, clonedLoop};

  //////////////////////////////////////////////////////////////////////////////
  // Start vectorizing.
  // From now on, any error triggers the scope guard above.
  //////////////////////////////////////////////////////////////////////////////
  // 1. Vectorize all the loops matched by the pattern, recursively.
  // 1. Vectorize all the loop candidates, in inner-to-outer order.
  // This also vectorizes the roots (AffineLoadOp) as well as registers the
  // terminals (AffineStoreOp) for post-processing vectorization (we need to
  // wait for all use-def chains into them to be vectorized first).
  if (failed(vectorizeLoopsAndLoadsRecursively(m, &state))) {
  if (failed(vectorizeLoopsAndLoads(loops, &state))) {
    LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ failed root vectorizeLoop");
    return guard.failure();
  }
@@ -1229,38 +1248,25 @@ static LogicalResult vectorizeRootMatch(NestedMatch m,
  return guard.success();
}

/// Applies vectorization to the current Function by searching over a bunch of
/// predetermined patterns.
void Vectorize::runOnFunction() {
  FuncOp f = getFunction();
  if (!fastestVaryingPattern.empty() &&
      fastestVaryingPattern.size() != vectorSizes.size()) {
    f.emitRemark("Fastest varying pattern specified with different size than "
                 "the vector size.");
    return signalPassFailure();
  }

  DenseSet<Operation *> parallelLoops;
  f.walk([&parallelLoops](AffineForOp loop) {
    if (isLoopParallel(loop))
      parallelLoops.insert(loop);
  });

  vectorizeAffineLoops(f, parallelLoops, vectorSizes, fastestVaryingPattern);
/// Vectorization is a recursive procedure where anything below can fail. The
/// root match thus needs to maintain a clone for handling failure. Each root
/// may succeed independently but will otherwise clean after itself if anything
/// below it fails.
static LogicalResult vectorizeRootMatch(NestedMatch m,
                                        const VectorizationStrategy &strategy) {
  std::vector<SmallVector<AffineForOp, 2>> loopsToVectorize;
  getMatchedAffineLoops(m, loopsToVectorize);
  return vectorizeLoopNest(loopsToVectorize, strategy);
}

namespace mlir {

/// Vectorizes affine loops in 'loops' using the n-D vectorization factors in
/// 'vectorSizes'. By default, each vectorization factor is applied
/// inner-to-outer to the loops of each loop nest. 'fastestVaryingPattern' can
/// be optionally used to provide a different loop vectorization order.
void vectorizeAffineLoops(Operation *parentOp, DenseSet<Operation *> &loops,
                          ArrayRef<int64_t> vectorSizes,
                          ArrayRef<int64_t> fastestVaryingPattern) {
  // Thread-safe RAII local context, BumpPtrAllocator freed on exit.
  NestedPatternContext mlContext;

/// Internal implementation to vectorize affine loops in 'loops' using the n-D
/// vectorization factors in 'vectorSizes'. By default, each vectorization
/// factor is applied inner-to-outer to the loops of each loop nest.
/// 'fastestVaryingPattern' can be optionally used to provide a different loop
/// vectorization order.
static void vectorizeLoops(Operation *parentOp, DenseSet<Operation *> &loops,
                           ArrayRef<int64_t> vectorSizes,
                           ArrayRef<int64_t> fastestVaryingPattern) {
  for (auto &pat :
       makePatterns(loops, vectorSizes.size(), fastestVaryingPattern)) {
    LLVM_DEBUG(dbgs() << "\n******************************************");
@@ -1286,7 +1292,7 @@ void vectorizeAffineLoops(Operation *parentOp, DenseSet<Operation *> &loops,
                                &strategy);
      // TODO: if pattern does not apply, report it; alter the
      // cost/benefit.
      vectorizeRootMatch(m, &strategy);
      vectorizeRootMatch(m, strategy);
      // TODO: some diagnostics if failure to vectorize occurs.
    }
  }
@@ -1301,4 +1307,127 @@ std::unique_ptr<OperationPass<FuncOp>> createSuperVectorizePass() {
  return std::make_unique<Vectorize>();
}

/// Applies vectorization to the current function by searching over a bunch of
/// predetermined patterns.
void Vectorize::runOnFunction() {
  FuncOp f = getFunction();
  if (!fastestVaryingPattern.empty() &&
      fastestVaryingPattern.size() != vectorSizes.size()) {
    f.emitRemark("Fastest varying pattern specified with different size than "
                 "the vector size.");
    return signalPassFailure();
  }

  DenseSet<Operation *> parallelLoops;
  f.walk([&parallelLoops](AffineForOp loop) {
    if (isLoopParallel(loop))
      parallelLoops.insert(loop);
  });

  // Thread-safe RAII local context, BumpPtrAllocator freed on exit.
  NestedPatternContext mlContext;
  vectorizeLoops(f, parallelLoops, vectorSizes, fastestVaryingPattern);
}

/// Verify that affine loops in 'loops' meet the nesting criteria expected by
/// SuperVectorizer:
///   * There must be at least one loop.
///   * There must be a single root loop (nesting level 0).
///   * Each loop at a given nesting level must be nested in a loop from a
///     previous nesting level.
static void
verifyLoopNesting(const std::vector<SmallVector<AffineForOp, 2>> &loops) {
  assert(!loops.empty() && "Expected at least one loop");
  assert(loops[0].size() == 1 && "Expected only one root loop");

  // Traverse loops outer-to-inner to check some invariants.
  for (int i = 1, end = loops.size(); i < end; ++i) {
    for (AffineForOp loop : loops[i]) {
      // Check that each loop at this level is nested in one of the loops from
      // the previous level.
      bool parentFound = false;
      for (AffineForOp maybeParent : loops[i - 1]) {
        if (maybeParent.getOperation()->isProperAncestor(loop)) {
          parentFound = true;
          break;
        }
      }
      assert(parentFound && "Child loop not nested in any parent loop");

      // Check that each loop at this level is not nested in another loop from
      // this level.
      for (AffineForOp sibling : loops[i])
        assert(!sibling.getOperation()->isProperAncestor(loop) &&
               "Loops at the same level are nested");
    }
  }
}
namespace mlir {

/// External utility to vectorize affine loops in 'loops' using the n-D
/// vectorization factors in 'vectorSizes'. By default, each vectorization
/// factor is applied inner-to-outer to the loops of each loop nest.
/// 'fastestVaryingPattern' can be optionally used to provide a different loop
/// vectorization order.
void vectorizeAffineLoops(Operation *parentOp, DenseSet<Operation *> &loops,
                          ArrayRef<int64_t> vectorSizes,
                          ArrayRef<int64_t> fastestVaryingPattern) {
  // Thread-safe RAII local context, BumpPtrAllocator freed on exit.
  NestedPatternContext mlContext;
  vectorizeLoops(parentOp, loops, vectorSizes, fastestVaryingPattern);
}

/// External utility to vectorize affine loops from a single loop nest using an
/// n-D vectorization strategy (see doc in VectorizationStrategy definition).
/// Loops are provided in a 2D vector container. The first dimension represents
/// the nesting level relative to the loops to be vectorized. The second
/// dimension contains the loops. This means that:
///   a) every loop in 'loops[i]' must have a parent loop in 'loops[i-1]',
///   b) a loop in 'loops[i]' may or may not have a child loop in 'loops[i+1]'.
///
/// For example, for the following loop nest:
///
/// func @vec2d(%in0: memref<64x128x512xf32>, %in1: memref<64x128x128xf32>,
///             %out0: memref<64x128x512xf32>,
///             %out1: memref<64x128x128xf32>) {
///   affine.for %i0 = 0 to 64 {
///     affine.for %i1 = 0 to 128 {
///       affine.for %i2 = 0 to 512 {
///         %ld = affine.load %in0[%i0, %i1, %i2] : memref<64x128x512xf32>
///         affine.store %ld, %out0[%i0, %i1, %i2] : memref<64x128x512xf32>
///       }
///       affine.for %i3 = 0 to 128 {
///         %ld = affine.load %in1[%i0, %i1, %i3] : memref<64x128x128xf32>
///         affine.store %ld, %out1[%i0, %i1, %i3] : memref<64x128x128xf32>
///       }
///     }
///   }
///   return
/// }
///
/// loops = {{%i0}, {%i2, %i3}}, to vectorize the outermost and the two
/// innermost loops;
/// loops = {{%i1}, {%i2, %i3}}, to vectorize the middle and the two innermost
/// loops;
/// loops = {{%i2}}, to vectorize only the first innermost loop;
/// loops = {{%i3}}, to vectorize only the second innermost loop;
/// loops = {{%i1}}, to vectorize only the middle loop.
LogicalResult
vectorizeAffineLoopNest(std::vector<SmallVector<AffineForOp, 2>> &loops,
                        const VectorizationStrategy &strategy) {
  // Thread-safe RAII local context, BumpPtrAllocator freed on exit.
  NestedPatternContext mlContext;
  verifyLoopNesting(loops);
  return vectorizeLoopNest(loops, strategy);
}

std::unique_ptr<OperationPass<FuncOp>>
createSuperVectorizePass(ArrayRef<int64_t> virtualVectorSize) {
  return std::make_unique<Vectorize>(virtualVectorSize);
}
std::unique_ptr<OperationPass<FuncOp>> createSuperVectorizePass() {
  return std::make_unique<Vectorize>();
}

} // namespace mlir
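The test updates that follow drive the pass through mlir-opt. For completeness, here is a hedged sketch of doing the same programmatically; PassManager and createSuperVectorizePass are existing MLIR APIs rather than additions of this patch, and the include paths and nesting call are assumptions about the revision this commit targets.

// Hypothetical pipeline: run the affine super-vectorizer over every function
// in a module with a 1-D virtual vector size of 128 lanes.
#include "mlir/Dialect/Affine/Passes.h" // assumed location of createSuperVectorizePass
#include "mlir/IR/Function.h"           // assumed location of FuncOp at this revision
#include "mlir/IR/Module.h"
#include "mlir/Pass/PassManager.h"

using namespace mlir;

static LogicalResult runSuperVectorize(ModuleOp module) {
  PassManager pm(module.getContext());
  pm.addNestedPass<FuncOp>(createSuperVectorizePass({128}));
  return pm.run(module);
}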
@@ -1,7 +1,8 @@
// RUN: mlir-opt %s -affine-super-vectorize="virtual-vector-size=128 test-fastest-varying=0" | FileCheck %s

// Permutation maps used in vectorization.
// CHECK: #[[$map_proj_d0d1_0:map[0-9]+]] = affine_map<(d0, d1) -> (0)>
// CHECK-DAG: #[[$map_proj_d0d1_0:map[0-9]+]] = affine_map<(d0, d1) -> (0)>
// CHECK-DAG: #[[$map_id1:map[0-9]+]] = affine_map<(d0) -> (d0)>

#map0 = affine_map<(d0) -> (d0)>
#mapadd1 = affine_map<(d0) -> (d0 + 1)>

@@ -26,8 +27,8 @@ func @vec1d_1(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
%P = dim %B, %c2 : memref<?x?x?xf32>

// CHECK: for {{.*}} step 128
// CHECK-NEXT: %{{.*}} = affine.apply #map0(%[[C0]])
// CHECK-NEXT: %{{.*}} = affine.apply #map0(%[[C0]])
// CHECK-NEXT: %{{.*}} = affine.apply #[[$map_id1]](%[[C0]])
// CHECK-NEXT: %{{.*}} = affine.apply #[[$map_id1]](%[[C0]])
// CHECK-NEXT: %{{.*}} = constant 0.0{{.*}}: f32
// CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[$map_proj_d0d1_0]]} : memref<?x?xf32>, vector<128xf32>
affine.for %i0 = 0 to %M { // vectorized due to scalar -> vector

@@ -331,8 +332,8 @@ func @vec_rejected_8(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {

// CHECK: affine.for %{{.*}}{{[0-9]*}} = 0 to %{{[0-9]*}} {
// CHECK: for [[IV18:%[a-zA-Z0-9]+]] = 0 to [[ARG_M]] step 128
// CHECK: %{{.*}} = affine.apply #map0(%{{.*}})
// CHECK: %{{.*}} = affine.apply #map0(%{{.*}})
// CHECK: %{{.*}} = affine.apply #[[$map_id1]](%{{.*}})
// CHECK: %{{.*}} = affine.apply #[[$map_id1]](%{{.*}})
// CHECK: %{{.*}} = constant 0.0{{.*}}: f32
// CHECK: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[$map_proj_d0d1_0]]} : memref<?x?xf32>, vector<128xf32>
affine.for %i17 = 0 to %M { // not vectorized, the 1-D pattern that matched %{{.*}} in DFS post-order prevents vectorizing %{{.*}}

@@ -360,8 +361,8 @@ func @vec_rejected_9(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {

// CHECK: affine.for %{{.*}}{{[0-9]*}} = 0 to %{{[0-9]*}} {
// CHECK: for [[IV18:%[a-zA-Z0-9]+]] = 0 to [[ARG_M]] step 128
// CHECK: %{{.*}} = affine.apply #map0(%{{.*}})
// CHECK-NEXT: %{{.*}} = affine.apply #map0(%{{.*}})
// CHECK: %{{.*}} = affine.apply #[[$map_id1]](%{{.*}})
// CHECK-NEXT: %{{.*}} = affine.apply #[[$map_id1]](%{{.*}})
// CHECK-NEXT: %{{.*}} = constant 0.0{{.*}}: f32
// CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[$map_proj_d0d1_0]]} : memref<?x?xf32>, vector<128xf32>
affine.for %i17 = 0 to %M { // not vectorized, the 1-D pattern that matched %i18 in DFS post-order prevents vectorizing %{{.*}}
@@ -124,7 +124,7 @@ func @vectorize_matmul(%arg0: memref<?x?xf32>, %arg1: memref<?x?xf32>, %arg2: me
}
// VECT: affine.for %[[I2:.*]] = #[[$map_id1]](%[[C0]]) to #[[$map_id1]](%[[M]]) step 4 {
// VECT-NEXT: affine.for %[[I3:.*]] = #[[$map_id1]](%[[C0]]) to #[[$map_id1]](%[[N]]) step 8 {
// VECT-NEXT: affine.for %[[I4:.*]] = #map5(%[[C0]]) to #[[$map_id1]](%[[K]]) {
// VECT-NEXT: affine.for %[[I4:.*]] = #[[$map_id1]](%[[C0]]) to #[[$map_id1]](%[[K]]) {
// VECT: %[[A:.*]] = vector.transfer_read %{{.*}}[%[[I4]], %[[I3]]], %{{.*}} {permutation_map = #[[$map_proj_d0d1_zerod1]]} : memref<?x?xf32>, vector<4x8xf32>
// VECT: %[[B:.*]] = vector.transfer_read %{{.*}}[%[[I2]], %[[I4]]], %{{.*}} {permutation_map = #[[$map_proj_d0d1_d0zero]]} : memref<?x?xf32>, vector<4x8xf32>
// VECT-NEXT: %[[C:.*]] = mulf %[[B]], %[[A]] : vector<4x8xf32>