Refactor vectorization patterns
This CL removes the vectorize pass's reliance on a `fastestVaryingDim` parameter. That parameter is a restriction meant to make it easier to target a particular loop/memref combination for vectorization and is mainly used for testing. It also had the side effect of restricting vectorization patterns to only those in which all memrefs were contiguous along the same loop dimension, which prevented matmul from vectorizing in 2-D. This CL removes the restriction and adds a matmul test that vectorizes in 2-D along the parallel loops. Support for reduction loops is left for future work.

PiperOrigin-RevId: 240993827
This commit is contained in:
parent 3ddd0411d0
commit 094ca64ab0
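To make the relaxed matching rule concrete, here is a small standalone sketch (plain C++, not MLIR code; the function name `loopMatchesPattern` is made up for illustration). It mirrors the predicate returned by the updated `isVectorizableLoopPtrFactory` in the diff below: the analysis reports a single varying MemRef dimension per loop (`memRefDim`, or -1 for an invariant access), and the per-loop dimension requested by the testing flags (-1 when unspecified) only constrains the match when both sides are concrete.

```cpp
#include <cassert>

// Toy stand-in for the decision made by the new isVectorizableLoopPtrFactory:
// `memRefDim` is the unique varying MemRef dimension found by the analysis
// (-1 if every access is invariant along the loop); `requestedDim` is the
// dimension pinned by --test-fastest-varying (-1 when nothing is pinned).
static bool loopMatchesPattern(int memRefDim, int requestedDim) {
  return memRefDim == -1 || requestedDim == -1 || memRefDim == requestedDim;
}

int main() {
  assert(loopMatchesPattern(0, -1));  // no pin requested: any varying dim is fine
  assert(loopMatchesPattern(-1, 1));  // invariant access: matches any request
  assert(loopMatchesPattern(1, 1));   // pinned dimension agrees with the analysis
  assert(!loopMatchesPattern(0, 1));  // pinned dimension disagrees: no match
  return 0;
}
```
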
@@ -71,34 +71,34 @@ uint64_t getLargestDivisorOfTripCount(AffineForOp forOp);
 ///
 /// Returns false in cases with more than one AffineApplyOp, this is
 /// conservative.
-bool isAccessInvariant(Value &iv, Value &index);
+bool isAccessInvariant(Value *iv, Value *index);
 
 /// Given an induction variable `iv` of type AffineForOp and `indices` of type
 /// IndexType, returns the set of `indices` that are independent of `iv`.
 ///
 /// Prerequisites (inherited from `isAccessInvariant` above):
 /// 1. `iv` and `indices` of the proper type;
-/// 2. at most one reachable AffineApplyOp from index;
+/// 2. at most one affine.apply is reachable from each index in `indices`;
 ///
-/// Returns false in cases with more than one AffineApplyOp, this is
-/// conservative.
+/// Emits a note if it encounters a chain of affine.apply and conservatively
+/// those cases.
 llvm::DenseSet<Value *, llvm::DenseMapInfo<Value *>>
-getInvariantAccesses(Value &iv, llvm::ArrayRef<Value *> indices);
+getInvariantAccesses(Value *iv, llvm::ArrayRef<Value *> indices);
 
 using VectorizableLoopFun = std::function<bool(AffineForOp)>;
 
 /// Checks whether the loop is structurally vectorizable; i.e.:
 /// 1. no conditionals are nested under the loop;
 /// 2. all nested load/stores are to scalar MemRefs.
 /// TODO(ntv): relax the no-conditionals restriction
 bool isVectorizableLoopBody(AffineForOp loop);
 
 /// Checks whether the loop is structurally vectorizable and that all the LoadOp
 /// and StoreOp matched have access indexing functions that are are either:
 /// 1. invariant along the loop induction variable created by 'loop';
-/// 2. varying along the 'fastestVaryingDim' memory dimension.
-bool isVectorizableLoopBodyAlongFastestVaryingMemRefDim(
-    AffineForOp loop, unsigned fastestVaryingDim);
+/// 2. varying along at most one memory dimension. If such a unique dimension
+/// is found, it is written into `memRefDim`.
+bool isVectorizableLoopBody(AffineForOp loop, int *memRefDim);
 
 /// Checks where SSA dominance would be violated if a for op's body
 /// operations are shifted by the specified shifts. This method checks if a

@@ -174,36 +174,36 @@ uint64_t mlir::getLargestDivisorOfTripCount(AffineForOp forOp) {
   return gcd.getValue();
 }
 
-bool mlir::isAccessInvariant(Value &iv, Value &index) {
-  assert(isForInductionVar(&iv) && "iv must be a AffineForOp");
-  assert(index.getType().isa<IndexType>() && "index must be of IndexType");
+bool mlir::isAccessInvariant(Value *iv, Value *index) {
+  assert(isForInductionVar(iv) && "iv must be a AffineForOp");
+  assert(index->getType().isa<IndexType>() && "index must be of IndexType");
   SmallVector<Operation *, 4> affineApplyOps;
-  getReachableAffineApplyOps({&index}, affineApplyOps);
+  getReachableAffineApplyOps({index}, affineApplyOps);
 
   if (affineApplyOps.empty()) {
     // Pointer equality test because of Value pointer semantics.
-    return &index != &iv;
+    return index != iv;
   }
 
   if (affineApplyOps.size() > 1) {
-    affineApplyOps[0]->emitError(
+    affineApplyOps[0]->emitNote(
         "CompositionAffineMapsPass must have been run: there should be at most "
-        "one AffineApplyOp");
+        "one AffineApplyOp, returning false conservatively.");
     return false;
   }
 
   auto composeOp = affineApplyOps[0]->cast<AffineApplyOp>();
   // We need yet another level of indirection because the `dim` index of the
   // access may not correspond to the `dim` index of composeOp.
-  return !(AffineValueMap(composeOp).isFunctionOf(0, &iv));
+  return !(AffineValueMap(composeOp).isFunctionOf(0, iv));
 }
 
 llvm::DenseSet<Value *>
-mlir::getInvariantAccesses(Value &iv, llvm::ArrayRef<Value *> indices) {
+mlir::getInvariantAccesses(Value *iv, llvm::ArrayRef<Value *> indices) {
   llvm::DenseSet<Value *> res;
   for (unsigned idx = 0, n = indices.size(); idx < n; ++idx) {
     auto *val = indices[idx];
-    if (isAccessInvariant(iv, *val)) {
+    if (isAccessInvariant(iv, val)) {
       res.insert(val);
     }
   }

@@ -213,31 +213,30 @@ mlir::getInvariantAccesses(Value &iv, llvm::ArrayRef<Value *> indices) {
 /// Given:
 /// 1. an induction variable `iv` of type AffineForOp;
 /// 2. a `memoryOp` of type const LoadOp& or const StoreOp&;
-/// 3. the index of the `fastestVaryingDim` along which to check;
-/// determines whether `memoryOp`[`fastestVaryingDim`] is a contiguous access
-/// along `iv`.
-/// Contiguous is defined as either invariant or varying only along
-/// `fastestVaryingDim`.
+/// determines whether `memoryOp` has a contiguous access along `iv`. Contiguous
+/// is defined as either invariant or varying only along a unique MemRef dim.
+/// Upon success, the unique MemRef dim is written in `memRefDim` (or -1 to
+/// convey the memRef access is invariant along `iv`).
 ///
 /// Prerequisites:
-/// 1. `iv` of the proper type;
-/// 2. the MemRef accessed by `memoryOp` has no layout map or at most an
+/// 1. `memRefDim` ~= nullptr;
+/// 2. `iv` of the proper type;
+/// 3. the MemRef accessed by `memoryOp` has no layout map or at most an
 /// identity layout map.
 ///
 /// Currently only supports no layoutMap or identity layoutMap in the MemRef.
-/// Returns false if the MemRef has a non-identity layoutMap or more than
-/// 1 layoutMap. This is conservative.
+/// Returns false if the MemRef has a non-identity layoutMap or more than 1
+/// layoutMap. This is conservative.
 ///
 // TODO(ntv): check strides.
 template <typename LoadOrStoreOp>
-static bool isContiguousAccess(Value &iv, LoadOrStoreOp memoryOp,
-                               unsigned fastestVaryingDim) {
+static bool isContiguousAccess(Value *iv, LoadOrStoreOp memoryOp,
+                               int *memRefDim) {
   static_assert(std::is_same<LoadOrStoreOp, LoadOp>::value ||
                     std::is_same<LoadOrStoreOp, StoreOp>::value,
                 "Must be called on either const LoadOp & or const StoreOp &");
+  assert(memRefDim && "memRefDim == nullptr");
   auto memRefType = memoryOp.getMemRefType();
-  if (fastestVaryingDim >= memRefType.getRank())
-    return false;
-
   auto layoutMap = memRefType.getAffineMaps();
   // TODO(ntv): remove dependence on Builder once we support non-identity

@@ -250,17 +249,26 @@ static bool isContiguousAccess(Value &iv, LoadOrStoreOp memoryOp,
     return memoryOp.emitError("NYI: non-trivial layoutMap"), false;
   }
 
+  int uniqueVaryingIndexAlongIv = -1;
   auto indices = memoryOp.getIndices();
-  auto numIndices = llvm::size(indices);
-  unsigned d = 0;
-  for (auto index : indices) {
-    if (fastestVaryingDim == (numIndices - 1) - d++) {
-      continue;
-    }
-    if (!isAccessInvariant(iv, *index)) {
-      return false;
+  unsigned numIndices = llvm::size(indices);
+  unsigned dim = 0;
+  for (auto *index : indices) {
+    if (!isAccessInvariant(iv, index)) {
+      if (uniqueVaryingIndexAlongIv != -1) {
+        // 2+ varying indices -> do not vectorize along iv.
+        return false;
+      }
+      uniqueVaryingIndexAlongIv = dim;
     }
+    ++dim;
   }
+
+  if (uniqueVaryingIndexAlongIv == -1)
+    *memRefDim = -1;
+  else
+    *memRefDim = numIndices - (uniqueVaryingIndexAlongIv + 1);
+
   return true;
 }

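As a quick sanity check of the dimension numbering used above, the sketch below (plain C++, not MLIR code; `toMemRefDim` is a hypothetical helper) reproduces the arithmetic `numIndices - (uniqueVaryingIndexAlongIv + 1)`: the reported `memRefDim` counts from the fastest-varying (last) index of the access, and -1 means the access is invariant along the loop.

```cpp
#include <cassert>

// Mirrors the *memRefDim computation in the new isContiguousAccess:
// index positions are counted left-to-right, but the reported dimension is
// counted from the fastest-varying (last) index of the access.
static int toMemRefDim(int numIndices, int uniqueVaryingIndexAlongIv) {
  if (uniqueVaryingIndexAlongIv == -1)
    return -1; // access is invariant along the loop induction variable
  return numIndices - (uniqueVaryingIndexAlongIv + 1);
}

int main() {
  assert(toMemRefDim(2, 1) == 0);   // A[i, j] with j varying: fastest-varying dim
  assert(toMemRefDim(2, 0) == 1);   // A[i, j] with i varying: next-outer dim
  assert(toMemRefDim(3, 2) == 0);   // A[i, j, k] with k varying
  assert(toMemRefDim(3, -1) == -1); // invariant access
  return 0;
}
```
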
@@ -328,15 +336,12 @@ isVectorizableLoopBodyWithOpCond(AffineForOp loop,
   return true;
 }
 
-bool mlir::isVectorizableLoopBodyAlongFastestVaryingMemRefDim(
-    AffineForOp loop, unsigned fastestVaryingDim) {
-  VectorizableOpFun fun([fastestVaryingDim](AffineForOp loop, Operation &op) {
+bool mlir::isVectorizableLoopBody(AffineForOp loop, int *memRefDim) {
+  VectorizableOpFun fun([memRefDim](AffineForOp loop, Operation &op) {
     auto load = op.dyn_cast<LoadOp>();
     auto store = op.dyn_cast<StoreOp>();
-    return load ? isContiguousAccess(*loop.getInductionVar(), load,
-                                     fastestVaryingDim)
-                : isContiguousAccess(*loop.getInductionVar(), store,
-                                     fastestVaryingDim);
+    return load ? isContiguousAccess(loop.getInductionVar(), load, memRefDim)
+                : isContiguousAccess(loop.getInductionVar(), store, memRefDim);
   });
   return isVectorizableLoopBodyWithOpCond(loop, fun);
 }

@@ -348,8 +353,8 @@ bool mlir::isVectorizableLoopBody(AffineForOp loop) {
 /// Checks whether SSA dominance would be violated if a for op's body
 /// operations are shifted by the specified shifts. This method checks if a
 /// 'def' and all its uses have the same shift factor.
-// TODO(mlir-team): extend this to check for memory-based dependence
-// violation when we have the support.
+// TODO(mlir-team): extend this to check for memory-based dependence violation
+// when we have the support.
 bool mlir::isInstwiseShiftValid(AffineForOp forOp, ArrayRef<uint64_t> shifts) {
   auto *forBody = forOp.getBody();
   assert(shifts.size() == forBody->getOperations().size());

@@ -101,25 +101,33 @@ Optional<SmallVector<unsigned, 4>> mlir::shapeRatio(VectorType superVectorType,
 /// If no index is found to be invariant, 0 is added to the permutation_map and
 /// corresponds to a vector broadcast along that dimension.
 ///
+/// Returns an empty AffineMap if `enclosingLoopToVectorDim` is empty,
+/// signalling that no permutation map can be constructed given
+/// `enclosingLoopToVectorDim`.
+///
 /// Examples can be found in the documentation of `makePermutationMap`, in the
 /// header file.
 static AffineMap makePermutationMap(
-    MLIRContext *context,
-    llvm::iterator_range<Operation::operand_iterator> indices,
+    llvm::iterator_range<Operation::operand_iterator> operands,
     const DenseMap<Operation *, unsigned> &enclosingLoopToVectorDim) {
-  using functional::makePtrDynCaster;
-  using functional::map;
-  auto unwrappedIndices = map(makePtrDynCaster<Value, Value>(), indices);
+  if (enclosingLoopToVectorDim.empty())
+    return AffineMap();
+  MLIRContext *context =
+      enclosingLoopToVectorDim.begin()->getFirst()->getContext();
+  SmallVector<Value *, 8> indices(operands);
   SmallVector<AffineExpr, 4> perm(enclosingLoopToVectorDim.size(),
                                   getAffineConstantExpr(0, context));
 
   for (auto kvp : enclosingLoopToVectorDim) {
     assert(kvp.second < perm.size());
     auto invariants = getInvariantAccesses(
-        *kvp.first->cast<AffineForOp>().getInductionVar(), unwrappedIndices);
-    unsigned numIndices = unwrappedIndices.size();
+        kvp.first->cast<AffineForOp>().getInductionVar(), indices);
+    unsigned numIndices = indices.size();
     unsigned countInvariantIndices = 0;
     for (unsigned dim = 0; dim < numIndices; ++dim) {
-      if (!invariants.count(unwrappedIndices[dim])) {
+      if (!invariants.count(indices[dim])) {
         assert(perm[kvp.second] == getAffineConstantExpr(0, context) &&
                "permutationMap already has an entry along dim");
         perm[kvp.second] = getAffineDimExpr(dim, context);

@@ -132,7 +140,7 @@ static AffineMap makePermutationMap(
                "Vectorization prerequisite violated: at most 1 index may be "
                "invariant wrt a vectorized loop");
   }
-  return AffineMap::get(unwrappedIndices.size(), 0, perm, {});
+  return AffineMap::get(indices.size(), 0, perm, {});
 }
 
 /// Implementation detail that walks up the parents and records the ones with

@@ -170,13 +178,11 @@ AffineMap mlir::makePermutationMap(
   }
 
   if (auto load = op->dyn_cast<LoadOp>()) {
-    return ::makePermutationMap(op->getContext(), load.getIndices(),
-                                enclosingLoopToVectorDim);
+    return ::makePermutationMap(load.getIndices(), enclosingLoopToVectorDim);
   }
 
   auto store = op->cast<StoreOp>();
-  return ::makePermutationMap(op->getContext(), store.getIndices(),
-                              enclosingLoopToVectorDim);
+  return ::makePermutationMap(store.getIndices(), enclosingLoopToVectorDim);
 }
 
 bool mlir::matcher::operatesOnSuperVectors(Operation &op,

@@ -553,7 +553,7 @@ static llvm::cl::OptionCategory clOptionsCategory("vectorize options");
 
 static llvm::cl::list<int> clVirtualVectorSize(
     "virtual-vector-size",
-    llvm::cl::desc("Specify n-D virtual vector size for vectorization"),
+    llvm::cl::desc("Specify an n-D virtual vector size for vectorization"),
    llvm::cl::ZeroOrMore, llvm::cl::cat(clOptionsCategory));
 
 static llvm::cl::list<int> clFastestVaryingPattern(

@@ -567,124 +567,84 @@ static llvm::cl::list<int> clFastestVaryingPattern(
 /// Forward declaration.
 static FilterFunctionType
 isVectorizableLoopPtrFactory(const llvm::DenseSet<Operation *> &parallelLoops,
-                             unsigned fastestVaryingMemRefDimension);
-
-// Build a bunch of predetermined patterns that will be traversed in order.
-// Due to the recursive nature of NestedPatterns, this captures
-// arbitrarily nested pairs of loops at any position in the tree.
-/// Note that this currently only matches 2 nested loops and will be extended.
-// TODO(ntv): support 3-D loop patterns with a common reduction loop that can
-// be matched to GEMMs.
-static std::vector<NestedPattern>
-defaultPatterns(const llvm::DenseSet<Operation *> &parallelLoops) {
-  using matcher::For;
-  return std::vector<NestedPattern>{
-      // 3-D patterns
-      For(isVectorizableLoopPtrFactory(parallelLoops, 2),
-          For(isVectorizableLoopPtrFactory(parallelLoops, 1),
-              For(isVectorizableLoopPtrFactory(parallelLoops, 0)))),
-      // for i { for j { A[??f(not i, not j), f(i, not j), f(not i, j)];}}
-      // test independently with:
-      // --test-fastest-varying=1 --test-fastest-varying=0
-      For(isVectorizableLoopPtrFactory(parallelLoops, 1),
-          For(isVectorizableLoopPtrFactory(parallelLoops, 0))),
-      // for i { for j { A[??f(not i, not j), f(i, not j), ?, f(not i, j)];}}
-      // test independently with:
-      // --test-fastest-varying=2 --test-fastest-varying=0
-      For(isVectorizableLoopPtrFactory(parallelLoops, 2),
-          For(isVectorizableLoopPtrFactory(parallelLoops, 0))),
-      // for i { for j { A[??f(not i, not j), f(i, not j), ?, ?, f(not i, j)];}}
-      // test independently with:
-      // --test-fastest-varying=3 --test-fastest-varying=0
-      For(isVectorizableLoopPtrFactory(parallelLoops, 3),
-          For(isVectorizableLoopPtrFactory(parallelLoops, 0))),
-      // for i { for j { A[??f(not i, not j), f(not i, j), f(i, not j)];}}
-      // test independently with:
-      // --test-fastest-varying=0 --test-fastest-varying=1
-      For(isVectorizableLoopPtrFactory(parallelLoops, 0),
-          For(isVectorizableLoopPtrFactory(parallelLoops, 1))),
-      // for i { for j { A[??f(not i, not j), f(not i, j), ?, f(i, not j)];}}
-      // test independently with:
-      // --test-fastest-varying=0 --test-fastest-varying=2
-      For(isVectorizableLoopPtrFactory(parallelLoops, 0),
-          For(isVectorizableLoopPtrFactory(parallelLoops, 2))),
-      // for i { for j { A[??f(not i, not j), f(not i, j), ?, ?, f(i, not j)];}}
-      // test independently with:
-      // --test-fastest-varying=0 --test-fastest-varying=3
-      For(isVectorizableLoopPtrFactory(parallelLoops, 0),
-          For(isVectorizableLoopPtrFactory(parallelLoops, 3))),
-      // for i { A[??f(not i) , f(i)];}
-      // test independently with: --test-fastest-varying=0
-      For(isVectorizableLoopPtrFactory(parallelLoops, 0)),
-      // for i { A[??f(not i) , f(i), ?];}
-      // test independently with: --test-fastest-varying=1
-      For(isVectorizableLoopPtrFactory(parallelLoops, 1)),
-      // for i { A[??f(not i) , f(i), ?, ?];}
-      // test independently with: --test-fastest-varying=2
-      For(isVectorizableLoopPtrFactory(parallelLoops, 2)),
-      // for i { A[??f(not i) , f(i), ?, ?, ?];}
-      // test independently with: --test-fastest-varying=3
-      For(isVectorizableLoopPtrFactory(parallelLoops, 3))};
-}
+                             int fastestVaryingMemRefDimension);
 
 /// Creates a vectorization pattern from the command line arguments.
 /// Up to 3-D patterns are supported.
 /// If the command line argument requests a pattern of higher order, returns an
 /// empty pattern list which will conservatively result in no vectorization.
 static std::vector<NestedPattern>
-makePatterns(const llvm::DenseSet<Operation *> &parallelLoops) {
+makePatterns(const llvm::DenseSet<Operation *> &parallelLoops, int vectorRank,
+             ArrayRef<int64_t> fastestVaryingPattern) {
   using matcher::For;
-  if (clFastestVaryingPattern.empty()) {
-    return defaultPatterns(parallelLoops);
-  }
-  switch (clFastestVaryingPattern.size()) {
+  int64_t d0 = fastestVaryingPattern.empty() ? -1 : fastestVaryingPattern[0];
+  int64_t d1 = fastestVaryingPattern.size() < 2 ? -1 : fastestVaryingPattern[1];
+  int64_t d2 = fastestVaryingPattern.size() < 3 ? -1 : fastestVaryingPattern[2];
+  switch (vectorRank) {
   case 1:
-    return {For(isVectorizableLoopPtrFactory(parallelLoops,
-                                             clFastestVaryingPattern[0]))};
+    return {For(isVectorizableLoopPtrFactory(parallelLoops, d0))};
   case 2:
-    return {For(
-        isVectorizableLoopPtrFactory(parallelLoops, clFastestVaryingPattern[0]),
-        For(isVectorizableLoopPtrFactory(parallelLoops,
-                                         clFastestVaryingPattern[1])))};
+    return {For(isVectorizableLoopPtrFactory(parallelLoops, d0),
+                For(isVectorizableLoopPtrFactory(parallelLoops, d1)))};
   case 3:
-    return {For(
-        isVectorizableLoopPtrFactory(parallelLoops, clFastestVaryingPattern[0]),
-        For(isVectorizableLoopPtrFactory(parallelLoops,
-                                         clFastestVaryingPattern[1]),
-            For(isVectorizableLoopPtrFactory(parallelLoops,
-                                             clFastestVaryingPattern[2]))))};
-  default:
+    return {For(isVectorizableLoopPtrFactory(parallelLoops, d0),
+                For(isVectorizableLoopPtrFactory(parallelLoops, d1),
+                    For(isVectorizableLoopPtrFactory(parallelLoops, d2))))};
+  default: {
     return std::vector<NestedPattern>();
+  }
   }
 }
 
 namespace {
 
 /// Base state for the vectorize pass.
+/// Command line arguments are preempted by non-empty pass arguments.
 struct Vectorize : public FunctionPass<Vectorize> {
-  Vectorize() {
-    if (!clVirtualVectorSize.empty()) {
-      vectorSizes.reserve(clVirtualVectorSize.size());
-      this->vectorSizes.assign(clVirtualVectorSize.begin(),
-                               clVirtualVectorSize.end());
-    }
-  }
-  Vectorize(ArrayRef<int64_t> virtualVectorSize) {
-    if (clVirtualVectorSize.empty()) {
-      this->vectorSizes.assign(virtualVectorSize.begin(),
-                               virtualVectorSize.end());
-    } else {
-      vectorSizes.reserve(clVirtualVectorSize.size());
-      this->vectorSizes.assign(clVirtualVectorSize.begin(),
-                               clVirtualVectorSize.end());
-    }
-  }
+  Vectorize();
+  Vectorize(ArrayRef<int64_t> virtualVectorSize);
+  Vectorize(ArrayRef<int64_t> virtualVectorSize,
+            ArrayRef<int64_t> fastestVaryingPattern);
   void runOnFunction() override;
 
   // The virtual vector size that we vectorize to.
   SmallVector<int64_t, 4> vectorSizes;
+  // Optionally, the fixed mapping from loop to fastest varying MemRef dimension
+  // for all the MemRefs within a loop pattern:
+  // the index represents the loop depth, the value represents the k^th
+  // fastest varying memory dimension.
+  // This is voluntarily restrictive and is meant to precisely target a
+  // particular loop/op pair, for testing purposes.
+  SmallVector<int64_t, 4> fastestVaryingPattern;
 };
 
 } // end anonymous namespace
 
-/////// TODO(ntv): Hoist to a VectorizationStrategy.cpp when appropriate. //////
+Vectorize::Vectorize() {
+  this->vectorSizes.assign(clVirtualVectorSize.begin(),
+                           clVirtualVectorSize.end());
+  this->fastestVaryingPattern.assign(clFastestVaryingPattern.begin(),
+                                     clFastestVaryingPattern.end());
+}
+
+Vectorize::Vectorize(ArrayRef<int64_t> virtualVectorSize) : Vectorize() {
+  if (!virtualVectorSize.empty()) {
+    this->vectorSizes.assign(virtualVectorSize.begin(),
+                             virtualVectorSize.end());
+  }
+}
+
+Vectorize::Vectorize(ArrayRef<int64_t> virtualVectorSize,
+                     ArrayRef<int64_t> fastestVaryingPattern)
+    : Vectorize(virtualVectorSize) {
+  if (!fastestVaryingPattern.empty()) {
+    this->fastestVaryingPattern.assign(fastestVaryingPattern.begin(),
+                                       fastestVaryingPattern.end());
+  }
+}
+
+/////// TODO(ntv): Hoist to a VectorizationStrategy.cpp when appropriate.
+/////////
 namespace {
 
 struct VectorizationStrategy {

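The defaulting in the new `makePatterns` is easy to miss, so here is a minimal sketch (plain C++, not MLIR code; `requestedDims` is a hypothetical helper) of how the per-depth requests d0/d1/d2 are derived: any loop depth not covered by the command-line `fastestVaryingPattern` falls back to -1, which `isVectorizableLoopPtrFactory` (declared above, defined in a later hunk) treats as "any MemRef dimension".

```cpp
#include <array>
#include <cstdint>
#include <vector>

// Toy model of the d0/d1/d2 computation in the new makePatterns: entry i of
// the command-line pattern pins the fastest-varying dimension for loop depth
// i, and missing entries default to -1 (unconstrained).
static std::array<int64_t, 3>
requestedDims(const std::vector<int64_t> &fastestVaryingPattern) {
  auto at = [&](size_t i) {
    return i < fastestVaryingPattern.size() ? fastestVaryingPattern[i]
                                            : int64_t(-1);
  };
  return {at(0), at(1), at(2)};
}

int main() {
  // e.g. --test-fastest-varying=1 --test-fastest-varying=0 pins depths 0 and 1.
  auto pinned = requestedDims({1, 0});
  // No flags: every depth is unconstrained and any contiguous loop can match.
  auto unconstrained = requestedDims({});
  return (pinned[2] == -1 && unconstrained[0] == -1) ? 0 : 1;
}
```
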
@@ -833,12 +793,12 @@ void VectorizationState::registerReplacement(Value *key, Value *value) {
 /// vectorized immediately. The resulting vector_transfer_read is immediately
 /// registered to replace all uses of the LoadOp in this pattern's scope.
 ///
-/// StoreOp are the terminals of the vectorizeNonTerminals call. They need
-/// to be vectorized late once all the use-def chains have been traversed.
-/// Additionally, they may have ssa-values operands which come from outside
-/// the scope of the current pattern.
-/// Such special cases force us to delay the vectorization of the stores
-/// until the last step. Here we merely register the store operation.
+/// StoreOp are the terminals of the vectorizeNonTerminals call. They need to be
+/// vectorized late once all the use-def chains have been traversed.
+/// Additionally, they may have ssa-values operands which come from outside the
+/// scope of the current pattern.
+/// Such special cases force us to delay the vectorization of the stores until
+/// the last step. Here we merely register the store operation.
 template <typename LoadOrStoreOpPointer>
 static LogicalResult vectorizeRootOrTerminal(Value *iv,
                                              LoadOrStoreOpPointer memoryOp,

@@ -860,6 +820,8 @@ static LogicalResult vectorizeRootOrTerminal(Value *iv,
   if (opInst->template isa<LoadOp>()) {
     auto permutationMap =
         makePermutationMap(opInst, state->strategy->loopToVectorDim);
+    if (!permutationMap)
+      return LogicalResult::Failure;
     LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ permutationMap: ");
     LLVM_DEBUG(permutationMap.print(dbgs()));
     FuncBuilder b(opInst);

@@ -907,22 +869,23 @@ static LogicalResult vectorizeAffineForOp(AffineForOp loop, int64_t step,
   return success();
 }
 
-/// Returns a FilterFunctionType that can be used in NestedPattern to
-/// match a loop whose underlying load/store accesses are all varying along the
-/// `fastestVaryingMemRefDimension`.
-/// TODO(ntv): In the future, allow more interesting mixed layout permutation
-/// once we understand better the performance implications and we are confident
-/// we can build a cost model and a search procedure.
+/// Returns a FilterFunctionType that can be used in NestedPattern to match a
+/// loop whose underlying load/store accesses are either invariant or all
+// varying along the `fastestVaryingMemRefDimension`.
 static FilterFunctionType
 isVectorizableLoopPtrFactory(const llvm::DenseSet<Operation *> &parallelLoops,
-                             unsigned fastestVaryingMemRefDimension) {
+                             int fastestVaryingMemRefDimension) {
   return [&parallelLoops, fastestVaryingMemRefDimension](Operation &forOp) {
     auto loop = forOp.cast<AffineForOp>();
     auto parallelIt = parallelLoops.find(loop);
     if (parallelIt == parallelLoops.end())
       return false;
-    return isVectorizableLoopBodyAlongFastestVaryingMemRefDim(
-        loop, fastestVaryingMemRefDimension);
+    int memRefDim = -1;
+    auto vectorizableBody = isVectorizableLoopBody(loop, &memRefDim);
+    if (!vectorizableBody)
+      return false;
+    return memRefDim == -1 || fastestVaryingMemRefDimension == -1 ||
+           memRefDim == fastestVaryingMemRefDimension;
   };
 }

@@ -1047,15 +1010,15 @@ static Value *vectorizeOperand(Value *operand, Operation *op,
   return nullptr;
 };
 
-/// Encodes Operation-specific behavior for vectorization. In general we
-/// assume that all operands of an op must be vectorized but this is not always
-/// true. In the future, it would be nice to have a trait that describes how a
+/// Encodes Operation-specific behavior for vectorization. In general we assume
+/// that all operands of an op must be vectorized but this is not always true.
+/// In the future, it would be nice to have a trait that describes how a
 /// particular operation vectorizes. For now we implement the case distinction
 /// here.
 /// Returns a vectorized form of an operation or nullptr if vectorization fails.
-/// TODO(ntv): consider adding a trait to Op to describe how it gets vectorized.
-/// Maybe some Ops are not vectorizable or require some tricky logic, we cannot
-/// do one-off logic here; ideally it would be TableGen'd.
+// TODO(ntv): consider adding a trait to Op to describe how it gets vectorized.
+// Maybe some Ops are not vectorizable or require some tricky logic, we cannot
+// do one-off logic here; ideally it would be TableGen'd.
 static Operation *vectorizeOneOperation(Operation *opInst,
                                         VectorizationState *state) {
   // Sanity checks.

@@ -1074,6 +1037,8 @@ static Operation *vectorizeOneOperation(Operation *opInst,
     FuncBuilder b(opInst);
     auto permutationMap =
         makePermutationMap(opInst, state->strategy->loopToVectorDim);
+    if (!permutationMap)
+      return nullptr;
     LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ permutationMap: ");
     LLVM_DEBUG(permutationMap.print(dbgs()));
     auto transfer = b.create<VectorTransferWriteOp>(

@@ -1249,10 +1214,18 @@ static LogicalResult vectorizeRootMatch(NestedMatch m,
 /// Applies vectorization to the current Function by searching over a bunch of
 /// predetermined patterns.
 void Vectorize::runOnFunction() {
+  Function &f = getFunction();
+  if (!fastestVaryingPattern.empty() &&
+      fastestVaryingPattern.size() != vectorSizes.size()) {
+    f.emitNote("Fastest varying pattern specified with different size than the "
+               "vector size.");
+    this->signalPassFailure();
+    return;
+  }
+
   // Thread-safe RAII local context, BumpPtrAllocator freed on exit.
   NestedPatternContext mlContext;
 
-  Function &f = getFunction();
   llvm::DenseSet<Operation *> parallelLoops;
   f.walkPostOrder([&parallelLoops](Operation *op) {
     if (auto loop = op->dyn_cast<AffineForOp>()) {

@@ -1262,7 +1235,8 @@ void Vectorize::runOnFunction() {
     }
   });
 
-  for (auto &pat : makePatterns(parallelLoops)) {
+  for (auto &pat :
+       makePatterns(parallelLoops, vectorSizes.size(), fastestVaryingPattern)) {
     LLVM_DEBUG(dbgs() << "\n******************************************");
     LLVM_DEBUG(dbgs() << "\n******************************************");
     LLVM_DEBUG(dbgs() << "\n[early-vect] new pattern on Function\n");

@@ -1,7 +1,15 @@
+// RUN: mlir-opt %s -vectorize -virtual-vector-size 4 -virtual-vector-size 8 | FileCheck %s -check-prefix=VECT
 // RUN: mlir-opt %s -vectorize -virtual-vector-size 32 -virtual-vector-size 256 --test-fastest-varying=1 --test-fastest-varying=0 | FileCheck %s
 
 // Permutation maps used in vectorization.
-// CHECK: #[[map_proj_d0d1_d0d1:map[0-9]+]] = (d0, d1) -> (d0, d1)
+// CHECK-DAG: #[[map_id1:map[0-9]+]] = (d0) -> (d0)
+// CHECK-DAG: #[[map_id2:map[0-9]+]] = (d0, d1) -> (d0, d1)
+// CHECK-DAG: #[[map_proj_d0d1_zerod1:map[0-9]+]] = (d0, d1) -> (0, d1)
+// CHECK-DAG: #[[map_proj_d0d1_d0zero:map[0-9]+]] = (d0, d1) -> (d0, 0)
+// VECT-DAG: #[[map_id1:map[0-9]+]] = (d0) -> (d0)
+// VECT-DAG: #[[map_id2:map[0-9]+]] = (d0, d1) -> (d0, d1)
+// VECT-DAG: #[[map_proj_d0d1_zerod1:map[0-9]+]] = (d0, d1) -> (0, d1)
+// VECT-DAG: #[[map_proj_d0d1_d0zero:map[0-9]+]] = (d0, d1) -> (d0, 0)
 
 func @vec2d(%A : memref<?x?x?xf32>) {
   %M = dim %A, 0 : memref<?x?x?xf32>

@@ -46,7 +54,7 @@ func @vector_add_2d(%M : index, %N : index) -> f32 {
   affine.for %i0 = 0 to %M {
     affine.for %i1 = 0 to %N {
       // CHECK: [[C1:%.*]] = constant splat<vector<32x256xf32>, 1.000000e+00> : vector<32x256xf32>
-      // CHECK: vector_transfer_write [[C1]], {{.*}} {permutation_map: #[[map_proj_d0d1_d0d1]]} : vector<32x256xf32>, memref<?x?xf32>, index, index
+      // CHECK: vector_transfer_write [[C1]], {{.*}} {permutation_map: #[[map_id2]]} : vector<32x256xf32>, memref<?x?xf32>, index, index
       // non-scoped %f1
       store %f1, %A[%i0, %i1] : memref<?x?xf32, 0>
     }

@@ -54,22 +62,22 @@ func @vector_add_2d(%M : index, %N : index) -> f32 {
   affine.for %i2 = 0 to %M {
     affine.for %i3 = 0 to %N {
       // CHECK: [[C3:%.*]] = constant splat<vector<32x256xf32>, 2.000000e+00> : vector<32x256xf32>
-      // CHECK: vector_transfer_write [[C3]], {{.*}} {permutation_map: #[[map_proj_d0d1_d0d1]]} : vector<32x256xf32>, memref<?x?xf32>, index, index
+      // CHECK: vector_transfer_write [[C3]], {{.*}} {permutation_map: #[[map_id2]]} : vector<32x256xf32>, memref<?x?xf32>, index, index
       // non-scoped %f2
      store %f2, %B[%i2, %i3] : memref<?x?xf32, 0>
    }
  }
  affine.for %i4 = 0 to %M {
    affine.for %i5 = 0 to %N {
-      // CHECK: [[A5:%.*]] = vector_transfer_read %0, {{.*}} {permutation_map: #[[map_proj_d0d1_d0d1]]} : (memref<?x?xf32>, index, index) -> vector<32x256xf32>
-      // CHECK: [[B5:%.*]] = vector_transfer_read %1, {{.*}} {permutation_map: #[[map_proj_d0d1_d0d1]]} : (memref<?x?xf32>, index, index) -> vector<32x256xf32>
+      // CHECK: [[A5:%.*]] = vector_transfer_read %0, {{.*}} {permutation_map: #[[map_id2]]} : (memref<?x?xf32>, index, index) -> vector<32x256xf32>
+      // CHECK: [[B5:%.*]] = vector_transfer_read %1, {{.*}} {permutation_map: #[[map_id2]]} : (memref<?x?xf32>, index, index) -> vector<32x256xf32>
       // CHECK: [[S5:%.*]] = addf [[A5]], [[B5]] : vector<32x256xf32>
       // CHECK: [[SPLAT1:%.*]] = constant splat<vector<32x256xf32>, 1.000000e+00> : vector<32x256xf32>
       // CHECK: [[S6:%.*]] = addf [[S5]], [[SPLAT1]] : vector<32x256xf32>
       // CHECK: [[SPLAT2:%.*]] = constant splat<vector<32x256xf32>, 2.000000e+00> : vector<32x256xf32>
       // CHECK: [[S7:%.*]] = addf [[S5]], [[SPLAT2]] : vector<32x256xf32>
       // CHECK: [[S8:%.*]] = addf [[S7]], [[S6]] : vector<32x256xf32>
-      // CHECK: vector_transfer_write [[S8]], {{.*}} {permutation_map: #[[map_proj_d0d1_d0d1]]} : vector<32x256xf32>, memref<?x?xf32>, index, index
+      // CHECK: vector_transfer_write [[S8]], {{.*}} {permutation_map: #[[map_id2]]} : vector<32x256xf32>, memref<?x?xf32>, index, index
       //
       %a5 = load %A[%i4, %i5] : memref<?x?xf32, 0>
       %b5 = load %B[%i4, %i5] : memref<?x?xf32, 0>

@@ -89,3 +97,46 @@ func @vector_add_2d(%M : index, %N : index) -> f32 {
   return %res : f32
 }
 
+// VECT-LABEL: func @vectorize_matmul
+func @vectorize_matmul(%arg0: memref<?x?xf32>, %arg1: memref<?x?xf32>, %arg2: memref<?x?xf32>) {
+  %c0 = constant 0 : index
+  %M = dim %arg0, 0 : memref<?x?xf32>
+  %K = dim %arg0, 1 : memref<?x?xf32>
+  %N = dim %arg2, 1 : memref<?x?xf32>
+  // VECT: %[[C0:.*]] = constant 0 : index
+  // VECT-NEXT: %[[M:.*]] = dim %arg0, 0 : memref<?x?xf32>
+  // VECT-NEXT: %[[K:.*]] = dim %arg0, 1 : memref<?x?xf32>
+  // VECT-NEXT: %[[N:.*]] = dim %arg2, 1 : memref<?x?xf32>
+  // VECT: {{.*}} #[[map_id1]](%[[M]]) step 4 {
+  // VECT-NEXT: {{.*}} #[[map_id1]](%[[N]]) step 8 {
+  // VECT: %[[VC0:.*]] = constant splat<vector<4x8xf32>, 0.000000e+00> : vector<4x8xf32>
+  // VECT-NEXT: vector_transfer_write %[[VC0]], %arg2, %{{.*}}, %{{.*}} {permutation_map: #[[map_id2]]}
+  affine.for %i0 = (d0) -> (d0)(%c0) to (d0) -> (d0)(%M) {
+    affine.for %i1 = (d0) -> (d0)(%c0) to (d0) -> (d0)(%N) {
+      %cst = constant 0.000000e+00 : f32
+      store %cst, %arg2[%i0, %i1] : memref<?x?xf32>
+    }
+  }
+  // VECT: affine.for %[[I2:.*]] = #[[map_id1]](%[[C0]]) to #[[map_id1]](%[[M]]) step 4 {
+  // VECT-NEXT: affine.for %[[I3:.*]] = #[[map_id1]](%[[C0]]) to #[[map_id1]](%[[N]]) step 8 {
+  // VECT-NEXT: affine.for %[[I4:.*]] = #map5(%[[C0]]) to #[[map_id1]](%[[K]]) {
+  // VECT-NEXT: %[[A:.*]] = vector_transfer_read %arg1, %[[I4]], %[[I3]] {permutation_map: #[[map_proj_d0d1_zerod1]]}
+  // VECT-NEXT: %[[B:.*]] = vector_transfer_read %arg0, %[[I2]], %[[I4]] {permutation_map: #[[map_proj_d0d1_d0zero]]}
+  // VECT-NEXT: %[[C:.*]] = mulf %[[B]], %[[A]] : vector<4x8xf32>
+  // VECT-NEXT: %[[D:.*]] = vector_transfer_read %arg2, %[[I2]], %[[I3]] {permutation_map: #[[map_id2]]}
+  // VECT-NEXT: %[[E:.*]] = addf %[[D]], %[[C]] : vector<4x8xf32>
+  // VECT-NEXT: vector_transfer_write %[[E]], %arg2, %[[I2]], %[[I3]] {permutation_map: #[[map_id2]]} : vector<4x8xf32>, memref<?x?xf32>, index, index
+  affine.for %i2 = (d0) -> (d0)(%c0) to (d0) -> (d0)(%M) {
+    affine.for %i3 = (d0) -> (d0)(%c0) to (d0) -> (d0)(%N) {
+      affine.for %i4 = (d0) -> (d0)(%c0) to (d0) -> (d0)(%K) {
+        %6 = load %arg1[%i4, %i3] : memref<?x?xf32>
+        %7 = load %arg0[%i2, %i4] : memref<?x?xf32>
+        %8 = mulf %7, %6 : f32
+        %9 = load %arg2[%i2, %i3] : memref<?x?xf32>
+        %10 = addf %9, %8 : f32
+        store %10, %arg2[%i2, %i3] : memref<?x?xf32>
+      }
+    }
+  }
+  return
+}