[MLIR][Affine][VectorOps] Utility to vectorize loop nest using strategy

This patch adds a utility based on SuperVectorizer to vectorize an
affine loop nest using a given vectorization strategy. This strategy allows
targeting specific loops for vectorization instead of relying on the
SuperVectorizer analysis to choose the right loops to vectorize.

Reviewed By: nicolasvasilache

Differential Revision: https://reviews.llvm.org/D85869
Diego Caballero 2020-09-21 15:30:02 -07:00
parent 1747f77764
commit 14d0735d34
4 changed files with 312 additions and 100 deletions


@@ -14,6 +14,8 @@
#define MLIR_DIALECT_AFFINE_UTILS_H
#include "mlir/Support/LLVM.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
namespace mlir {
@@ -34,6 +36,47 @@ void affineParallelize(AffineForOp forOp);
/// significant code expansion in some cases.
LogicalResult hoistAffineIfOp(AffineIfOp ifOp, bool *folded = nullptr);
/// Holds parameters to perform n-D vectorization on a single loop nest.
/// For example, for the following loop nest:
///
/// func @vec2d(%in: memref<64x128x512xf32>, %out: memref<64x128x512xf32>) {
/// affine.for %i0 = 0 to 64 {
/// affine.for %i1 = 0 to 128 {
/// affine.for %i2 = 0 to 512 {
/// %ld = affine.load %in[%i0, %i1, %i2] : memref<64x128x512xf32>
/// affine.store %ld, %out[%i0, %i1, %i2] : memref<64x128x512xf32>
/// }
/// }
/// }
/// return
/// }
///
/// and VectorizationStrategy = 'vectorSizes = {8, 4}', 'loopToVectorDim =
/// {{i1->0}, {i2->1}}', SuperVectorizer will generate:
///
/// func @vec2d(%arg0: memref<64x128x512xf32>, %arg1: memref<64x128x512xf32>) {
/// affine.for %arg2 = 0 to 64 {
/// affine.for %arg3 = 0 to 128 step 8 {
/// affine.for %arg4 = 0 to 512 step 4 {
/// %cst = constant 0.000000e+00 : f32
/// %0 = vector.transfer_read %arg0[%arg2, %arg3, %arg4], %cst : ...
/// vector.transfer_write %0, %arg1[%arg2, %arg3, %arg4] : ...
/// }
/// }
/// }
/// return
/// }
// TODO: Hoist to a VectorizationStrategy.cpp when appropriate.
struct VectorizationStrategy {
// Vectorization factors to apply to each target vector dimension.
// Each factor will be applied to a different loop.
SmallVector<int64_t, 8> vectorSizes;
// Maps each AffineForOp vectorization candidate with its vector dimension.
// The candidate will be vectorized using the vectorization factor in
// 'vectorSizes' for that dimension.
DenseMap<Operation *, unsigned> loopToVectorDim;
};
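As an illustration only (not part of this patch), a strategy for the 2-D example in the comment above could be populated as in the sketch below; i1Loop and i2Loop are hypothetical AffineForOp handles to the %i1 and %i2 loops, obtained by whatever analysis selected the target loops:

  VectorizationStrategy strategy;
  // 2-D virtual vector shape: factor 8 for vector dimension 0, 4 for dimension 1.
  strategy.vectorSizes = {8, 4};
  // Vectorize %i1 along vector dimension 0 and %i2 along vector dimension 1.
  strategy.loopToVectorDim[i1Loop.getOperation()] = 0;
  strategy.loopToVectorDim[i2Loop.getOperation()] = 1;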
/// Vectorizes affine loops in 'loops' using the n-D vectorization factors in
/// 'vectorSizes'. By default, each vectorization factor is applied
/// inner-to-outer to the loops of each loop nest. 'fastestVaryingPattern' can
@@ -43,6 +86,45 @@ void vectorizeAffineLoops(
llvm::DenseSet<Operation *, DenseMapInfo<Operation *>> &loops,
ArrayRef<int64_t> vectorSizes, ArrayRef<int64_t> fastestVaryingPattern);
/// External utility to vectorize affine loops from a single loop nest using an
/// n-D vectorization strategy (see doc in VectorizationStrategy definition).
/// Loops are provided in a 2D vector container. The first dimension represents
/// the nesting level relative to the loops to be vectorized. The second
/// dimension contains the loops. This means that:
/// a) every loop in 'loops[i]' must have a parent loop in 'loops[i-1]',
/// b) a loop in 'loops[i]' may or may not have a child loop in 'loops[i+1]'.
///
/// For example, for the following loop nest:
///
/// func @vec2d(%in0: memref<64x128x512xf32>, %in1: memref<64x128x128xf32>,
/// %out0: memref<64x128x512xf32>,
/// %out1: memref<64x128x128xf32>) {
/// affine.for %i0 = 0 to 64 {
/// affine.for %i1 = 0 to 128 {
/// affine.for %i2 = 0 to 512 {
/// %ld = affine.load %in0[%i0, %i1, %i2] : memref<64x128x512xf32>
/// affine.store %ld, %out0[%i0, %i1, %i2] : memref<64x128x512xf32>
/// }
/// affine.for %i3 = 0 to 128 {
/// %ld = affine.load %in1[%i0, %i1, %i3] : memref<64x128x128xf32>
/// affine.store %ld, %out1[%i0, %i1, %i3] : memref<64x128x128xf32>
/// }
/// }
/// }
/// return
/// }
///
/// loops = {{%i0}, {%i2, %i3}}, to vectorize the outermost and the two
/// innermost loops;
/// loops = {{%i1}, {%i2, %i3}}, to vectorize the middle and the two innermost
/// loops;
/// loops = {{%i2}}, to vectorize only the first innermost loop;
/// loops = {{%i3}}, to vectorize only the second innermost loop;
/// loops = {{%i1}}, to vectorize only the middle loop.
LogicalResult
vectorizeAffineLoopNest(std::vector<SmallVector<AffineForOp, 2>> &loops,
const VectorizationStrategy &strategy);
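As an illustration only (not part of this patch), the 'loops' container for the 'loops = {{%i1}, {%i2, %i3}}' case above could be built as in the sketch below; i1Loop, i2Loop and i3Loop are hypothetical AffineForOp handles to the corresponding loops, and mapping both innermost siblings to the same vector dimension is an assumption made for the example:

  // Nesting level 0: the single root loop %i1.
  // Nesting level 1: the sibling loops %i2 and %i3, both nested in %i1.
  std::vector<SmallVector<AffineForOp, 2>> loops;
  loops.push_back({i1Loop});
  loops.push_back({i2Loop, i3Loop});

  VectorizationStrategy strategy;
  strategy.vectorSizes = {8, 4};
  strategy.loopToVectorDim[i1Loop.getOperation()] = 0;
  strategy.loopToVectorDim[i2Loop.getOperation()] = 1;
  strategy.loopToVectorDim[i3Loop.getOperation()] = 1;

  if (failed(vectorizeAffineLoopNest(loops, strategy)))
    llvm::errs() << "failed to vectorize the loop nest\n";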
/// Normalize an affine.parallel op so that lower bounds are 0 and steps are 1.
/// As currently implemented, this transformation cannot fail and will return
/// early if the op is already in a normalized form.


@@ -254,8 +254,8 @@ using namespace vector;
/// interference);
/// 3. Then, for each pattern in order:
/// a. applying iterative rewriting of the loop and the load operations in
/// DFS postorder. Rewriting is implemented by coarsening the loops and
/// turning load operations into opaque vector.transfer_read ops;
/// inner-to-outer order. Rewriting is implemented by coarsening the loops
/// and turning load operations into opaque vector.transfer_read ops;
/// b. keeping track of the load operations encountered as "roots" and the
/// store operations as "terminals";
/// c. traversing the use-def chains starting from the roots and iteratively
@@ -584,17 +584,6 @@ Vectorize::Vectorize(ArrayRef<int64_t> virtualVectorSize) {
vectorSizes = virtualVectorSize;
}
/////// TODO: Hoist to a VectorizationStrategy.cpp when appropriate.
/////////
namespace {
struct VectorizationStrategy {
SmallVector<int64_t, 8> vectorSizes;
DenseMap<Operation *, unsigned> loopToVectorDim;
};
} // end anonymous namespace
static void vectorizeLoopIfProfitable(Operation *loop, unsigned depthInPattern,
unsigned patternDepth,
VectorizationStrategy *strategy) {
@@ -857,44 +846,44 @@ isVectorizableLoopPtrFactory(const DenseSet<Operation *> &parallelLoops,
};
}
/// Apply vectorization of `loop` according to `state`. This is only triggered
/// if all vectorizations in `childrenMatches` have already succeeded
/// recursively in DFS post-order.
/// Apply vectorization of `loops` according to `state`. `loops` are processed in
/// inner-to-outer order to ensure that all the child loops have already been
/// vectorized before vectorizing the parent loop.
static LogicalResult
vectorizeLoopsAndLoadsRecursively(NestedMatch oneMatch,
VectorizationState *state) {
auto *loopInst = oneMatch.getMatchedOperation();
auto loop = cast<AffineForOp>(loopInst);
auto childrenMatches = oneMatch.getMatchedChildren();
vectorizeLoopsAndLoads(std::vector<SmallVector<AffineForOp, 2>> &loops,
VectorizationState *state) {
// Vectorize loops in inner-to-outer order. If any child fails, the parent
// will fail too.
for (auto &loopsInLevel : llvm::reverse(loops)) {
for (AffineForOp loop : loopsInLevel) {
// 1. This loop may have been omitted from vectorization for various
// reasons (e.g. due to the performance model or pattern depth > vector
// size).
auto it = state->strategy->loopToVectorDim.find(loop.getOperation());
if (it == state->strategy->loopToVectorDim.end())
continue;
// 1. DFS postorder recursion, if any of my children fails, I fail too.
for (auto m : childrenMatches) {
if (failed(vectorizeLoopsAndLoadsRecursively(m, state))) {
return failure();
}
// 2. Actual inner-to-outer transformation.
auto vectorDim = it->second;
assert(vectorDim < state->strategy->vectorSizes.size() &&
"vector dim overflow");
// a. get actual vector size
auto vectorSize = state->strategy->vectorSizes[vectorDim];
// b. loop transformation for early vectorization is still subject to
// exploratory tradeoffs (see top of the file). Apply coarsening,
// i.e.:
// | ub -> ub
// | step -> step * vectorSize
LLVM_DEBUG(dbgs() << "\n[early-vect] vectorizeForOp by " << vectorSize
<< " : \n"
<< loop);
if (failed(
vectorizeAffineForOp(loop, loop.getStep() * vectorSize, state)))
return failure();
} // end for.
}
// 2. This loop may have been omitted from vectorization for various reasons
// (e.g. due to the performance model or pattern depth > vector size).
auto it = state->strategy->loopToVectorDim.find(loopInst);
if (it == state->strategy->loopToVectorDim.end()) {
return success();
}
// 3. Actual post-order transformation.
auto vectorDim = it->second;
assert(vectorDim < state->strategy->vectorSizes.size() &&
"vector dim overflow");
// a. get actual vector size
auto vectorSize = state->strategy->vectorSizes[vectorDim];
// b. loop transformation for early vectorization is still subject to
// exploratory tradeoffs (see top of the file). Apply coarsening, i.e.:
// | ub -> ub
// | step -> step * vectorSize
LLVM_DEBUG(dbgs() << "\n[early-vect] vectorizeForOp by " << vectorSize
<< " : ");
LLVM_DEBUG(loopInst->print(dbgs()));
return vectorizeAffineForOp(loop, loop.getStep() * vectorSize, state);
return success();
}
/// Tries to transform a scalar constant into a vector splat of that constant.
@@ -1145,16 +1134,46 @@ static LogicalResult vectorizeNonTerminals(VectorizationState *state) {
return success();
}
/// Vectorization is a recursive procedure where anything below can fail.
/// The root match thus needs to maintain a clone for handling failure.
/// Each root may succeed independently but will otherwise clean up after itself
/// anything below it fails.
static LogicalResult vectorizeRootMatch(NestedMatch m,
VectorizationStrategy *strategy) {
auto loop = cast<AffineForOp>(m.getMatchedOperation());
OperationFolder folder(loop.getContext());
/// Recursive implementation to convert all the nested loops in 'match' to a 2D
/// vector container that preserves the relative nesting level of each loop with
/// respect to the others in 'match'. 'currentLevel' is the nesting level that
/// will be assigned to the loop in the current 'match'.
static void
getMatchedAffineLoopsRec(NestedMatch match, unsigned currentLevel,
std::vector<SmallVector<AffineForOp, 2>> &loops) {
// Add a new empty level to the output if it doesn't exist already.
assert(currentLevel <= loops.size() && "Unexpected currentLevel");
if (currentLevel == loops.size())
loops.push_back(SmallVector<AffineForOp, 2>());
// Add current match and recursively visit its children.
loops[currentLevel].push_back(cast<AffineForOp>(match.getMatchedOperation()));
for (auto childMatch : match.getMatchedChildren()) {
getMatchedAffineLoopsRec(childMatch, currentLevel + 1, loops);
}
}
/// Converts all the nested loops in 'match' to a 2D vector container that
/// preserves the relative nesting level of each loop with respect to the others
/// in 'match'. This means that every loop in 'loops[i]' will have a parent loop
/// in 'loops[i-1]'. A loop in 'loops[i]' may or may not have a child loop in
/// 'loops[i+1]'.
static void
getMatchedAffineLoops(NestedMatch match,
std::vector<SmallVector<AffineForOp, 2>> &loops) {
getMatchedAffineLoopsRec(match, /*currentLevel=*/0, loops);
}
/// Internal implementation to vectorize affine loops from a single loop nest
/// using an n-D vectorization strategy.
static LogicalResult
vectorizeLoopNest(std::vector<SmallVector<AffineForOp, 2>> &loops,
const VectorizationStrategy &strategy) {
assert(loops[0].size() == 1 && "Expected single root loop");
AffineForOp rootLoop = loops[0][0];
OperationFolder folder(rootLoop.getContext());
VectorizationState state;
state.strategy = strategy;
state.strategy = &strategy;
state.folder = &folder;
// Since patterns are recursive, they can very well intersect.
@@ -1164,7 +1183,7 @@ static LogicalResult vectorizeRootMatch(NestedMatch m,
// vectorizable. If a pattern is not vectorizable anymore, we just skip it.
// TODO: implement a non-greedy profitability analysis that keeps only
// non-intersecting patterns.
if (!isVectorizableLoopBody(loop, vectorTransferPattern())) {
if (!isVectorizableLoopBody(rootLoop, vectorTransferPattern())) {
LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ loop is not vectorizable");
return failure();
}
@@ -1172,7 +1191,7 @@ static LogicalResult vectorizeRootMatch(NestedMatch m,
/// Sets up error handling for this root loop. This is how the root match
/// maintains a clone for handling failure and restores the proper state via
/// RAII.
auto *loopInst = loop.getOperation();
auto *loopInst = rootLoop.getOperation();
OpBuilder builder(loopInst);
auto clonedLoop = cast<AffineForOp>(builder.clone(*loopInst));
struct Guard {
@@ -1187,17 +1206,17 @@ static LogicalResult vectorizeRootMatch(NestedMatch m,
}
AffineForOp loop;
AffineForOp clonedLoop;
} guard{loop, clonedLoop};
} guard{rootLoop, clonedLoop};
//////////////////////////////////////////////////////////////////////////////
// Start vectorizing.
// From now on, any error triggers the scope guard above.
//////////////////////////////////////////////////////////////////////////////
// 1. Vectorize all the loops matched by the pattern, recursively.
// 1. Vectorize all the loop candidates, in inner-to-outer order.
// This also vectorizes the roots (AffineLoadOp) as well as registers the
// terminals (AffineStoreOp) for post-processing vectorization (we need to
// wait for all use-def chains into them to be vectorized first).
if (failed(vectorizeLoopsAndLoadsRecursively(m, &state))) {
if (failed(vectorizeLoopsAndLoads(loops, &state))) {
LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ failed root vectorizeLoop");
return guard.failure();
}
@@ -1229,38 +1248,25 @@ static LogicalResult vectorizeRootMatch(NestedMatch m,
return guard.success();
}
/// Applies vectorization to the current Function by searching over a bunch of
/// predetermined patterns.
void Vectorize::runOnFunction() {
FuncOp f = getFunction();
if (!fastestVaryingPattern.empty() &&
fastestVaryingPattern.size() != vectorSizes.size()) {
f.emitRemark("Fastest varying pattern specified with different size than "
"the vector size.");
return signalPassFailure();
}
DenseSet<Operation *> parallelLoops;
f.walk([&parallelLoops](AffineForOp loop) {
if (isLoopParallel(loop))
parallelLoops.insert(loop);
});
vectorizeAffineLoops(f, parallelLoops, vectorSizes, fastestVaryingPattern);
/// Vectorization is a recursive procedure where anything below can fail. The
/// root match thus needs to maintain a clone for handling failure. Each root
/// may succeed independently but will otherwise clean up after itself if anything
/// below it fails.
static LogicalResult vectorizeRootMatch(NestedMatch m,
const VectorizationStrategy &strategy) {
std::vector<SmallVector<AffineForOp, 2>> loopsToVectorize;
getMatchedAffineLoops(m, loopsToVectorize);
return vectorizeLoopNest(loopsToVectorize, strategy);
}
namespace mlir {
/// Vectorizes affine loops in 'loops' using the n-D vectorization factors in
/// 'vectorSizes'. By default, each vectorization factor is applied
/// inner-to-outer to the loops of each loop nest. 'fastestVaryingPattern' can
/// be optionally used to provide a different loop vectorization order.
void vectorizeAffineLoops(Operation *parentOp, DenseSet<Operation *> &loops,
ArrayRef<int64_t> vectorSizes,
ArrayRef<int64_t> fastestVaryingPattern) {
// Thread-safe RAII local context, BumpPtrAllocator freed on exit.
NestedPatternContext mlContext;
/// Internal implementation to vectorize affine loops in 'loops' using the n-D
/// vectorization factors in 'vectorSizes'. By default, each vectorization
/// factor is applied inner-to-outer to the loops of each loop nest.
/// 'fastestVaryingPattern' can be optionally used to provide a different loop
/// vectorization order.
static void vectorizeLoops(Operation *parentOp, DenseSet<Operation *> &loops,
ArrayRef<int64_t> vectorSizes,
ArrayRef<int64_t> fastestVaryingPattern) {
for (auto &pat :
makePatterns(loops, vectorSizes.size(), fastestVaryingPattern)) {
LLVM_DEBUG(dbgs() << "\n******************************************");
@@ -1286,7 +1292,7 @@ void vectorizeAffineLoops(Operation *parentOp, DenseSet<Operation *> &loops,
&strategy);
// TODO: if pattern does not apply, report it; alter the
// cost/benefit.
vectorizeRootMatch(m, &strategy);
vectorizeRootMatch(m, strategy);
// TODO: some diagnostics if failure to vectorize occurs.
}
}
@@ -1301,4 +1307,127 @@ std::unique_ptr<OperationPass<FuncOp>> createSuperVectorizePass() {
return std::make_unique<Vectorize>();
}
/// Applies vectorization to the current function by searching over a bunch of
/// predetermined patterns.
void Vectorize::runOnFunction() {
FuncOp f = getFunction();
if (!fastestVaryingPattern.empty() &&
fastestVaryingPattern.size() != vectorSizes.size()) {
f.emitRemark("Fastest varying pattern specified with different size than "
"the vector size.");
return signalPassFailure();
}
DenseSet<Operation *> parallelLoops;
f.walk([&parallelLoops](AffineForOp loop) {
if (isLoopParallel(loop))
parallelLoops.insert(loop);
});
// Thread-safe RAII local context, BumpPtrAllocator freed on exit.
NestedPatternContext mlContext;
vectorizeLoops(f, parallelLoops, vectorSizes, fastestVaryingPattern);
}
/// Verify that affine loops in 'loops' meet the nesting criteria expected by
/// SuperVectorizer:
/// * There must be at least one loop.
/// * There must be a single root loop (nesting level 0).
/// * Each loop at a given nesting level must be nested in a loop from a
/// previous nesting level.
static void
verifyLoopNesting(const std::vector<SmallVector<AffineForOp, 2>> &loops) {
assert(!loops.empty() && "Expected at least one loop");
assert(loops[0].size() == 1 && "Expected only one root loop");
// Traverse loops outer-to-inner to check some invariants.
for (int i = 1, end = loops.size(); i < end; ++i) {
for (AffineForOp loop : loops[i]) {
// Check that each loop at this level is nested in one of the loops from
// the previous level.
bool parentFound = false;
for (AffineForOp maybeParent : loops[i - 1]) {
if (maybeParent.getOperation()->isProperAncestor(loop)) {
parentFound = true;
break;
}
}
assert(parentFound && "Child loop not nested in any parent loop");
// Check that each loop at this level is not nested in another loop from
// this level.
for (AffineForOp sibling : loops[i])
assert(!sibling.getOperation()->isProperAncestor(loop) &&
"Loops at the same level are nested");
}
}
}
namespace mlir {
/// External utility to vectorize affine loops in 'loops' using the n-D
/// vectorization factors in 'vectorSizes'. By default, each vectorization
/// factor is applied inner-to-outer to the loops of each loop nest.
/// 'fastestVaryingPattern' can be optionally used to provide a different loop
/// vectorization order.
void vectorizeAffineLoops(Operation *parentOp, DenseSet<Operation *> &loops,
ArrayRef<int64_t> vectorSizes,
ArrayRef<int64_t> fastestVaryingPattern) {
// Thread-safe RAII local context, BumpPtrAllocator freed on exit.
NestedPatternContext mlContext;
vectorizeLoops(parentOp, loops, vectorSizes, fastestVaryingPattern);
}
/// External utility to vectorize affine loops from a single loop nest using an
/// n-D vectorization strategy (see doc in VectorizationStrategy definition).
/// Loops are provided in a 2D vector container. The first dimension represents
/// the nesting level relative to the loops to be vectorized. The second
/// dimension contains the loops. This means that:
/// a) every loop in 'loops[i]' must have a parent loop in 'loops[i-1]',
/// b) a loop in 'loops[i]' may or may not have a child loop in 'loops[i+1]'.
///
/// For example, for the following loop nest:
///
/// func @vec2d(%in0: memref<64x128x512xf32>, %in1: memref<64x128x128xf32>,
/// %out0: memref<64x128x512xf32>,
/// %out1: memref<64x128x128xf32>) {
/// affine.for %i0 = 0 to 64 {
/// affine.for %i1 = 0 to 128 {
/// affine.for %i2 = 0 to 512 {
/// %ld = affine.load %in0[%i0, %i1, %i2] : memref<64x128x512xf32>
/// affine.store %ld, %out0[%i0, %i1, %i2] : memref<64x128x512xf32>
/// }
/// affine.for %i3 = 0 to 128 {
/// %ld = affine.load %in1[%i0, %i1, %i3] : memref<64x128x128xf32>
/// affine.store %ld, %out1[%i0, %i1, %i3] : memref<64x128x128xf32>
/// }
/// }
/// }
/// return
/// }
///
/// loops = {{%i0}, {%i2, %i3}}, to vectorize the outermost and the two
/// innermost loops;
/// loops = {{%i1}, {%i2, %i3}}, to vectorize the middle and the two innermost
/// loops;
/// loops = {{%i2}}, to vectorize only the first innermost loop;
/// loops = {{%i3}}, to vectorize only the second innermost loop;
/// loops = {{%i1}}, to vectorize only the middle loop.
LogicalResult
vectorizeAffineLoopNest(std::vector<SmallVector<AffineForOp, 2>> &loops,
const VectorizationStrategy &strategy) {
// Thread-safe RAII local context, BumpPtrAllocator freed on exit.
NestedPatternContext mlContext;
verifyLoopNesting(loops);
return vectorizeLoopNest(loops, strategy);
}
std::unique_ptr<OperationPass<FuncOp>>
createSuperVectorizePass(ArrayRef<int64_t> virtualVectorSize) {
return std::make_unique<Vectorize>(virtualVectorSize);
}
std::unique_ptr<OperationPass<FuncOp>> createSuperVectorizePass() {
return std::make_unique<Vectorize>();
}
} // namespace mlir


@@ -1,7 +1,8 @@
// RUN: mlir-opt %s -affine-super-vectorize="virtual-vector-size=128 test-fastest-varying=0" | FileCheck %s
// Permutation maps used in vectorization.
// CHECK: #[[$map_proj_d0d1_0:map[0-9]+]] = affine_map<(d0, d1) -> (0)>
// CHECK-DAG: #[[$map_proj_d0d1_0:map[0-9]+]] = affine_map<(d0, d1) -> (0)>
// CHECK-DAG: #[[$map_id1:map[0-9]+]] = affine_map<(d0) -> (d0)>
#map0 = affine_map<(d0) -> (d0)>
#mapadd1 = affine_map<(d0) -> (d0 + 1)>
@@ -26,8 +27,8 @@ func @vec1d_1(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
%P = dim %B, %c2 : memref<?x?x?xf32>
// CHECK: for {{.*}} step 128
// CHECK-NEXT: %{{.*}} = affine.apply #map0(%[[C0]])
// CHECK-NEXT: %{{.*}} = affine.apply #map0(%[[C0]])
// CHECK-NEXT: %{{.*}} = affine.apply #[[$map_id1]](%[[C0]])
// CHECK-NEXT: %{{.*}} = affine.apply #[[$map_id1]](%[[C0]])
// CHECK-NEXT: %{{.*}} = constant 0.0{{.*}}: f32
// CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[$map_proj_d0d1_0]]} : memref<?x?xf32>, vector<128xf32>
affine.for %i0 = 0 to %M { // vectorized due to scalar -> vector
@@ -331,8 +332,8 @@ func @vec_rejected_8(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
// CHECK: affine.for %{{.*}}{{[0-9]*}} = 0 to %{{[0-9]*}} {
// CHECK: for [[IV18:%[a-zA-Z0-9]+]] = 0 to [[ARG_M]] step 128
// CHECK: %{{.*}} = affine.apply #map0(%{{.*}})
// CHECK: %{{.*}} = affine.apply #map0(%{{.*}})
// CHECK: %{{.*}} = affine.apply #[[$map_id1]](%{{.*}})
// CHECK: %{{.*}} = affine.apply #[[$map_id1]](%{{.*}})
// CHECK: %{{.*}} = constant 0.0{{.*}}: f32
// CHECK: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[$map_proj_d0d1_0]]} : memref<?x?xf32>, vector<128xf32>
affine.for %i17 = 0 to %M { // not vectorized, the 1-D pattern that matched %{{.*}} in DFS post-order prevents vectorizing %{{.*}}
@@ -360,8 +361,8 @@ func @vec_rejected_9(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
// CHECK: affine.for %{{.*}}{{[0-9]*}} = 0 to %{{[0-9]*}} {
// CHECK: for [[IV18:%[a-zA-Z0-9]+]] = 0 to [[ARG_M]] step 128
// CHECK: %{{.*}} = affine.apply #map0(%{{.*}})
// CHECK-NEXT: %{{.*}} = affine.apply #map0(%{{.*}})
// CHECK: %{{.*}} = affine.apply #[[$map_id1]](%{{.*}})
// CHECK-NEXT: %{{.*}} = affine.apply #[[$map_id1]](%{{.*}})
// CHECK-NEXT: %{{.*}} = constant 0.0{{.*}}: f32
// CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[$map_proj_d0d1_0]]} : memref<?x?xf32>, vector<128xf32>
affine.for %i17 = 0 to %M { // not vectorized, the 1-D pattern that matched %i18 in DFS post-order prevents vectorizing %{{.*}}


@@ -124,7 +124,7 @@ func @vectorize_matmul(%arg0: memref<?x?xf32>, %arg1: memref<?x?xf32>, %arg2: me
}
// VECT: affine.for %[[I2:.*]] = #[[$map_id1]](%[[C0]]) to #[[$map_id1]](%[[M]]) step 4 {
// VECT-NEXT: affine.for %[[I3:.*]] = #[[$map_id1]](%[[C0]]) to #[[$map_id1]](%[[N]]) step 8 {
// VECT-NEXT: affine.for %[[I4:.*]] = #map5(%[[C0]]) to #[[$map_id1]](%[[K]]) {
// VECT-NEXT: affine.for %[[I4:.*]] = #[[$map_id1]](%[[C0]]) to #[[$map_id1]](%[[K]]) {
// VECT: %[[A:.*]] = vector.transfer_read %{{.*}}[%[[I4]], %[[I3]]], %{{.*}} {permutation_map = #[[$map_proj_d0d1_zerod1]]} : memref<?x?xf32>, vector<4x8xf32>
// VECT: %[[B:.*]] = vector.transfer_read %{{.*}}[%[[I2]], %[[I4]]], %{{.*}} {permutation_map = #[[$map_proj_d0d1_d0zero]]} : memref<?x?xf32>, vector<4x8xf32>
// VECT-NEXT: %[[C:.*]] = mulf %[[B]], %[[A]] : vector<4x8xf32>