[MLIR][MaterializeVectors] Add a MaterializeVector pass via unrolling.
This CL adds an MLIR-to-MLIR pass which materializes super-vectors to
hardware-dependent sized vectors. While the physical vector size is
target-dependent, the pass is written in a target-independent way: the target
vector size is specified as a parameter to the pass. This pass is thus a
partial lowering that opens the "greybox" that is the super-vector
abstraction.

This first CL adds a materialization pass that iterates over
vector_transfer_write operations and:
  1. computes the program slice including the current vector_transfer_write;
  2. computes the multi-dimensional ratio of super-vector shape to hardware
     vector shape;
  3. for each possible multi-dimensional value within the bounds of ratio, a
     new slice is instantiated (i.e. cloned and rewritten) so that all
     operations in this instance operate on the hardware vector type.

As a simple example, given:
```mlir
mlfunc @vector_add_2d(%M : index, %N : index) -> memref<?x?xf32> {
  %A = alloc (%M, %N) : memref<?x?xf32>
  %B = alloc (%M, %N) : memref<?x?xf32>
  %C = alloc (%M, %N) : memref<?x?xf32>
  for %i0 = 0 to %M {
    for %i1 = 0 to %N {
      %a1 = load %A[%i0, %i1] : memref<?x?xf32>
      %b1 = load %B[%i0, %i1] : memref<?x?xf32>
      %s1 = addf %a1, %b1 : f32
      store %s1, %C[%i0, %i1] : memref<?x?xf32>
    }
  }
  return %C : memref<?x?xf32>
}
```
and the following options:
```
-vectorize -virtual-vector-size 32 --test-fastest-varying=0 -materialize-vectors -vector-size=8
```
materialization emits:
```mlir
#map0 = (d0, d1) -> (d0, d1)
#map1 = (d0, d1) -> (d0, d1 + 8)
#map2 = (d0, d1) -> (d0, d1 + 16)
#map3 = (d0, d1) -> (d0, d1 + 24)
mlfunc @vector_add_2d(%arg0 : index, %arg1 : index) -> memref<?x?xf32> {
  %0 = alloc(%arg0, %arg1) : memref<?x?xf32>
  %1 = alloc(%arg0, %arg1) : memref<?x?xf32>
  %2 = alloc(%arg0, %arg1) : memref<?x?xf32>
  for %i0 = 0 to %arg0 {
    for %i1 = 0 to %arg1 step 32 {
      %3 = affine_apply #map0(%i0, %i1)
      %4 = "vector_transfer_read"(%0, %3#0, %3#1) : (memref<?x?xf32>, index, index) -> vector<8xf32>
      %5 = affine_apply #map1(%i0, %i1)
      %6 = "vector_transfer_read"(%0, %5#0, %5#1) : (memref<?x?xf32>, index, index) -> vector<8xf32>
      %7 = affine_apply #map2(%i0, %i1)
      %8 = "vector_transfer_read"(%0, %7#0, %7#1) : (memref<?x?xf32>, index, index) -> vector<8xf32>
      %9 = affine_apply #map3(%i0, %i1)
      %10 = "vector_transfer_read"(%0, %9#0, %9#1) : (memref<?x?xf32>, index, index) -> vector<8xf32>
      %11 = affine_apply #map0(%i0, %i1)
      %12 = "vector_transfer_read"(%1, %11#0, %11#1) : (memref<?x?xf32>, index, index) -> vector<8xf32>
      %13 = affine_apply #map1(%i0, %i1)
      %14 = "vector_transfer_read"(%1, %13#0, %13#1) : (memref<?x?xf32>, index, index) -> vector<8xf32>
      %15 = affine_apply #map2(%i0, %i1)
      %16 = "vector_transfer_read"(%1, %15#0, %15#1) : (memref<?x?xf32>, index, index) -> vector<8xf32>
      %17 = affine_apply #map3(%i0, %i1)
      %18 = "vector_transfer_read"(%1, %17#0, %17#1) : (memref<?x?xf32>, index, index) -> vector<8xf32>
      %19 = addf %4, %12 : vector<8xf32>
      %20 = addf %6, %14 : vector<8xf32>
      %21 = addf %8, %16 : vector<8xf32>
      %22 = addf %10, %18 : vector<8xf32>
      %23 = affine_apply #map0(%i0, %i1)
      "vector_transfer_write"(%19, %2, %23#0, %23#1) : (vector<8xf32>, memref<?x?xf32>, index, index) -> ()
      %24 = affine_apply #map1(%i0, %i1)
      "vector_transfer_write"(%20, %2, %24#0, %24#1) : (vector<8xf32>, memref<?x?xf32>, index, index) -> ()
      %25 = affine_apply #map2(%i0, %i1)
      "vector_transfer_write"(%21, %2, %25#0, %25#1) : (vector<8xf32>, memref<?x?xf32>, index, index) -> ()
      %26 = affine_apply #map3(%i0, %i1)
      "vector_transfer_write"(%22, %2, %26#0, %26#1) : (vector<8xf32>, memref<?x?xf32>, index, index) -> ()
    }
  }
  return %2 : memref<?x?xf32>
}
```

PiperOrigin-RevId: 222455351
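For readers who want to see the index arithmetic of steps 2 and 3 in isolation, here is a minimal standalone C++ sketch (illustrative only, not part of this commit; all names are hypothetical) showing how the four offsets 0, 8, 16 and 24 in `#map0`..`#map3` above arise from the ratio of the super-vector `vector<32xf32>` to the hardware vector `vector<8xf32>`:
```cpp
// Standalone sketch: mimics how one offset per hardware-vector instance is
// derived from the super-vector/hw-vector shape ratio.
#include <cassert>
#include <cstdio>
#include <vector>

// strides[i] = number of elements separating two slices along dimension i,
// e.g. shape {3, 4, 5} -> strides {20, 5, 1}.
static std::vector<unsigned> makeStrides(const std::vector<unsigned> &shape) {
  std::vector<unsigned> strides(shape.size());
  unsigned running = 1;
  for (int i = static_cast<int>(shape.size()) - 1; i >= 0; --i) {
    strides[i] = running;
    running *= shape[i];
  }
  return strides;
}

// Delinearizes `linearIndex` with respect to `shape`.
static std::vector<unsigned> delinearize(unsigned linearIndex,
                                         const std::vector<unsigned> &shape) {
  auto strides = makeStrides(shape);
  std::vector<unsigned> multiIndex(shape.size());
  for (size_t i = 0; i < shape.size(); ++i) {
    multiIndex[i] = linearIndex / strides[i];
    linearIndex %= strides[i];
  }
  assert(linearIndex == 0 && "remainder must be zero");
  return multiIndex;
}

int main() {
  // Super-vector vector<32xf32>, hardware vector vector<8xf32>.
  std::vector<unsigned> ratio = {32 / 8}; // {4}: four instances to emit.
  std::vector<unsigned> hwShape = {8};
  unsigned numInstances = 1;
  for (unsigned r : ratio)
    numInstances *= r;
  for (unsigned idx = 0; idx < numInstances; ++idx) {
    auto instance = delinearize(idx, ratio);
    // Per-instance index offset: instance * hardware vector size, i.e. the
    // "+ 0, + 8, + 16, + 24" seen in #map0..#map3 above.
    std::printf("instance %u -> offset %u\n", idx, instance[0] * hwShape[0]);
  }
  return 0;
}
```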
This commit is contained in:
parent 258dae5d73
commit a5782f0d40
@@ -70,8 +70,8 @@ using TransitiveFilter = std::function<bool(Statement *)>;
 /// 9
 ///
 /// Assuming all local orders match the numbering order:
-/// 1. after getting back to the root getForwardSlice,
-/// `forwardSlice` may contain:
+/// 1. after getting back to the root getForwardSlice, `forwardSlice` may
+/// contain:
 ///    {9, 7, 8, 5, 1, 2, 6, 3, 4}
 /// 2. reversing the result of 1. gives:
 ///    {4, 3, 6, 2, 1, 5, 8, 7, 9}
@@ -44,6 +44,9 @@ FunctionPass *createVectorizePass();
 /// FileCheck.
 FunctionPass *createVectorizerTestPass();
 
+/// Creates a pass to lower super-vectors to target-dependent HW vectors.
+FunctionPass *createMaterializeVectors();
+
 /// Creates a loop unrolling pass. Default option or command-line options take
 /// effect if -1 is passed as parameter.
 FunctionPass *createLoopUnrollPass(int unrollFactor = -1, int unrollFull = -1);
@@ -59,7 +59,7 @@ void mlir::getForwardSlice(Statement *stmt,
       auto *ownerStmt = u.getOwner();
       if (forwardSlice->count(ownerStmt) == 0) {
         getForwardSlice(ownerStmt, forwardSlice, filter,
-                        /* topLevel */ false);
+                        /*topLevel=*/false);
       }
     }
   }
@@ -68,7 +68,7 @@ void mlir::getForwardSlice(Statement *stmt,
       auto *ownerStmt = u.getOwner();
       if (forwardSlice->count(ownerStmt) == 0) {
         getForwardSlice(ownerStmt, forwardSlice, filter,
-                        /* topLevel */ false);
+                        /*topLevel=*/false);
       }
     }
   } else {
@@ -105,7 +105,7 @@ void mlir::getBackwardSlice(Statement *stmt,
     auto *stmt = operand->getDefiningStmt();
     if (backwardSlice->count(stmt) == 0) {
       getBackwardSlice(stmt, backwardSlice, filter,
-                       /* topLevel */ false);
+                       /*topLevel=*/false);
     }
   }
 
@@ -0,0 +1,595 @@
//===- MaterializeVectors.cpp - MaterializeVectors Pass Impl ----*- C++ -*-===//
//
// Copyright 2019 The MLIR Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
//
// This file implements target-dependent materialization of super-vectors to
// vectors of the proper size for the hardware.
//
//===----------------------------------------------------------------------===//

#include "mlir/Analysis/LoopAnalysis.h"
#include "mlir/Analysis/MLFunctionMatcher.h"
#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/Analysis/Utils.h"
#include "mlir/Analysis/VectorAnalysis.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/AffineMap.h"
#include "mlir/IR/Attributes.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/Location.h"
#include "mlir/IR/MLValue.h"
#include "mlir/IR/OperationSupport.h"
#include "mlir/IR/SSAValue.h"
#include "mlir/IR/Types.h"
#include "mlir/Pass.h"
#include "mlir/StandardOps/StandardOps.h"
#include "mlir/Support/Functional.h"
#include "mlir/Support/LLVM.h"
#include "mlir/Transforms/Passes.h"

#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

///
/// Implements target-dependent materialization of virtual super-vectors to
/// vectors of the proper size for the hardware.
///
/// While the physical vector size is target-dependent, the pass is written in
/// a target-independent way: the target vector size is specified as a
/// parameter to the pass. This pass is thus a partial lowering that opens the
/// "greybox" that is the super-vector abstraction. In particular, this pass
/// can turn the vector_transfer_read and vector_transfer_write ops into
/// either:
///   1. a loop nest with either scalar or vector load/store instructions; or
///   2. a loop nest with DmaStartOp / DmaWaitOp; or
///   3. a pre-existing blackbox library call that can be written manually or
///      synthesized using search and superoptimization.
/// An important feature that each of these 3 target lowering abstractions must
/// handle is "non-effecting" padding with the proper neutral element, in order
/// to guarantee that all "partial tiles" are actually "full tiles" in
/// practice.
///
/// In particular, this pass is an MLIR-to-MLIR rewriting and does not concern
/// itself with target-specific instruction selection and register allocation.
/// These will happen downstream in LLVM.
///
/// In this sense, despite performing lowering to a target-dependent size, this
/// pass is still target-agnostic.
///
/// Implementation details
/// ======================
/// The current decisions made by the super-vectorization pass guarantee that
/// use-def chains do not escape an enclosing vectorized ForStmt. In other
/// words, this pass operates on a scoped program slice. Furthermore, since we
/// do not vectorize in the presence of conditionals for now, sliced chains are
/// guaranteed not to escape the innermost scope, which has to be either the
/// top MLFunction scope or the innermost loop scope, by construction. As a
/// consequence, the implementation just starts from vector_transfer_write
/// operations and builds the slice scoped to the innermost loop enclosing the
/// current vector_transfer_write. These assumptions and the implementation
/// details are subject to revision in the future.

using llvm::dbgs;
using llvm::DenseSet;
using llvm::SetVector;

using namespace mlir;

using functional::map;

static llvm::cl::list<int>
    clVectorSize("vector-size",
                 llvm::cl::desc("Specify the HW vector size for vectorization"),
                 llvm::cl::ZeroOrMore);

#define DEBUG_TYPE "materialize-vect"

namespace {
struct MaterializationState {
  /// In practice, the determination of the HW-specific vector type to use when
  /// lowering a super-vector type must be based on the elemental type. The
  /// elemental type must be retrieved from the super-vector type. In the
  /// future, information about the hardware vector type for a particular
  /// elemental type will be part of the contract between MLIR and the backend.
  ///
  /// For example, 8xf32 has the same size as 16xf16 but the targeted HW itself
  /// may exhibit the following properties:
  ///   1. have a special unit for a 128xf16 datapath;
  ///   2. no f16 FPU support on the regular 8xf32/16xf16 vector datapath.
  ///
  /// For now, we just assume hwVectorSize has the proper information
  /// regardless of the type and we assert everything is f32.
  /// TODO(ntv): relax the assumptions on admissible element types once a
  /// contract exists.
  MaterializationState() : hwVectorSize(clVectorSize.size(), 0) {
    std::copy(clVectorSize.begin(), clVectorSize.end(), hwVectorSize.begin());
  }
  SmallVector<int, 8> hwVectorSize;
  VectorType superVectorType;
  VectorType hwVectorType;
  SmallVector<unsigned, 8> hwVectorInstance;
  DenseMap<const MLValue *, MLValue *> *substitutionsMap;
};

struct MaterializeVectors : public FunctionPass {
  MaterializeVectors() : FunctionPass(&MaterializeVectors::passID) {}

  PassResult runOnMLFunction(MLFunction *f) override;

  // Thread-safe RAII contexts local to pass, BumpPtrAllocator freed on exit.
  MLFunctionMatcherContext mlContext;

  static char passID;
};

} // end anonymous namespace

char MaterializeVectors::passID = 0;

// Returns the distance, in number of elements, between a slice in a dimension
// and the next slice in the same dimension.
//   e.g. shape[3, 4, 5] -> strides[20, 5, 1]
static SmallVector<unsigned, 8> makeStrides(ArrayRef<unsigned> shape) {
  SmallVector<unsigned, 8> tmp;
  tmp.reserve(shape.size());
  unsigned running = 1;
  for (auto rit = shape.rbegin(), reit = shape.rend(); rit != reit; ++rit) {
    assert(*rit > 0 && "NYI: symbolic or null shape dimension");
    tmp.push_back(running);
    running *= *rit;
  }
  return SmallVector<unsigned, 8>(tmp.rbegin(), tmp.rend());
}

// Given a linear index in [0, product of shape), returns the multi-dimensional
// index obtained by delinearizing with respect to `shape`.
static SmallVector<unsigned, 8> delinearize(unsigned linearIndex,
                                            ArrayRef<unsigned> shape) {
  SmallVector<unsigned, 8> res;
  res.reserve(shape.size());
  auto strides = makeStrides(shape);
  for (unsigned idx = 0; idx < strides.size(); ++idx) {
    assert(strides[idx] > 0);
    auto val = linearIndex / strides[idx];
    res.push_back(val);
    assert((val >= 0 && val < shape[idx]) &&
           "delinearization is out of bounds");
    linearIndex %= strides[idx];
  }
  // Sanity check.
  assert(linearIndex == 0 && "linear index constructed from shape must "
                             "have 0 remainder after delinearization");
  return res;
}

// Since this is used during a traversal of a topologically sorted set, we can
// just return the original SSAValue when no substitution is registered: by
// topological order, any value that requires a substitution has already
// received one.
static MLValue *
substitute(SSAValue *v,
           const DenseMap<const MLValue *, MLValue *> &substitutionsMap) {
  auto it = substitutionsMap.find(cast<MLValue>(v));
  if (it == substitutionsMap.end()) {
    return cast<MLValue>(v);
  }
  return it->second;
}

/// Returns the values obtained by applying an AffineMap that reindexes the
/// memRefIndices by the multi-dimensional hwVectorInstance.
/// This is used by the function that materializes a vector_transfer operation
/// to use hardware vector types instead of super-vector types.
///
/// The general problem this pass solves is as follows:
/// Assume a vector_transfer operation at the super-vector granularity that has
/// `l` enclosing loops (ForStmt). Assume the vector transfer operation operates
/// on a MemRef of rank `r`, a super-vector of rank `s` and a hardware vector of
/// rank `h`.
/// For the purpose of illustration assume l==4, r==3, s==2, h==1 and that the
/// super-vector is vector<3x32xf32> and the hardware vector is vector<8xf32>.
/// Assume the following MLIR snippet after super-vectorization has been
/// applied:
/// ```mlir
/// for %i0 = 0 to %M {
///   for %i1 = 0 to %N step 3 {
///     for %i2 = 0 to %O {
///       for %i3 = 0 to %P step 32 {
///         %r = vector_transfer_read(%A, map(%i..)#0, map(%i..)#1, map(%i..)#2)
///              -> vector<3x32xf32>
///         ...
/// }}}}
/// ```
/// where map denotes an AffineMap operating on enclosing loops with properties
/// compatible for vectorization (i.e. some contiguity left unspecified here).
/// Note that the vectorized loops are %i1 and %i3.
/// This function translates the vector_transfer_read operation to multiple
/// instances of vector_transfer_read that operate on vector<8xf32>.
///
/// Without loss of generality, we assume hwVectorInstance is: {2, 1}.
/// The only constraint on hwVectorInstance is that it belongs to:
///   [0, 2] x [0, 3], which is the span of the ratio of super-vector shape to
/// hardware vector shape in our example.
///
/// This function instantiates the iteration <2, 1> of vector_transfer_read
/// into the set of operations in pseudo-MLIR:
/// ```mlir
/// map2 = (d0, d1, d2, d3) -> (d0, d1 + 2, d2, d3 + 1 * 8)
/// map3 = map o map2 // where o denotes composition
/// %r = vector_transfer_read(%A, map3(%i..)#0, map3(%i..)#1, map3(%i..)#2)
///      -> vector<8xf32>
/// ```
///
/// Practical considerations
/// ========================
/// For now, `map` is assumed to be the identity map and the indices are
/// specified just as vector_transfer_read(%A, %i0, %i1, %i2, %i3). This will
/// be extended in the future once we have a proper Op for vector transfers.
/// Additionally, the example above is specified in pseudo-MLIR form; once we
/// have proper support for generic maps we can generate the code and show
/// actual MLIR.
///
/// TODO(ntv): support a concrete AffineMap and compose with it.
/// TODO(ntv): these implementation details should be captured in a
/// vectorization trait at the op level directly.
static SmallVector<MLValue *, 8>
reindexAffineIndices(MLFuncBuilder *b, Type hwVectorType,
                     ArrayRef<unsigned> hwVectorInstance,
                     ArrayRef<SSAValue *> memrefIndices) {
  auto vectorShape = hwVectorType.cast<VectorType>().getShape();
  assert(hwVectorInstance.size() >= vectorShape.size());

  unsigned numIndices = memrefIndices.size();
  auto numMemRefIndices = numIndices - hwVectorInstance.size();
  auto numSuperVectorIndices = hwVectorInstance.size() - vectorShape.size();

  SmallVector<AffineExpr, 8> affineExprs;
  // TODO(ntv): support a concrete map and composition.
  unsigned i = 0;
  // The first numMemRefIndices correspond to ForStmts that have not been
  // vectorized; the transformation is the identity on those.
  for (i = 0; i < numMemRefIndices; ++i) {
    auto d_i = b->getAffineDimExpr(i);
    affineExprs.push_back(d_i);
  }
  // The next numSuperVectorIndices correspond to super-vector dimensions that
  // do not have a hardware vector dimension counterpart. For those we only
  // need to increment the index by the corresponding hwVectorInstance.
  for (i = numMemRefIndices; i < numMemRefIndices + numSuperVectorIndices;
       ++i) {
    auto d_i = b->getAffineDimExpr(i);
    auto offset = hwVectorInstance[i - numMemRefIndices];
    affineExprs.push_back(d_i + offset);
  }
  // The remaining indices correspond to super-vector dimensions that have a
  // hardware vector dimension counterpart. For those we need to increment the
  // index by "hwVectorInstance" multiples of the corresponding hardware
  // vector size.
  for (; i < numIndices; ++i) {
    auto d_i = b->getAffineDimExpr(i);
    auto offset = hwVectorInstance[i - numMemRefIndices];
    auto stride = vectorShape[i - numMemRefIndices - numSuperVectorIndices];
    affineExprs.push_back(d_i + offset * stride);
  }
  auto affineMap = AffineMap::get(numIndices, 0, affineExprs, {});

  // TODO(ntv): support a concrete map and composition.
  auto app = b->create<AffineApplyOp>(b->getInsertionPoint()->getLoc(),
                                      affineMap, memrefIndices);
  unsigned numResults = app->getNumResults();
  SmallVector<MLValue *, 8> res;
  for (unsigned i = 0; i < numResults; ++i) {
    res.push_back(cast<MLValue>(app->getResult(i)));
  }
  return res;
}

/// Returns the cloned operands of `opStmt` for the instance of
/// `hwVectorInstance` when lowering from a super-vector type to
/// `hwVectorType`. `hwVectorInstance` represents one particular instance of
/// `hwVectorType` in the covering of the super-vector type. For a more
/// detailed description of the problem, see the description of
/// reindexAffineIndices.
static SmallVector<MLValue *, 8>
cloneAndUnrollOperands(OperationStmt *opStmt, Type hwVectorType,
                       ArrayRef<unsigned> hwVectorInstance,
                       DenseMap<const MLValue *, MLValue *> *substitutionsMap) {
  using functional::map;

  // For Ops that are not vector_transfer_read/vector_transfer_write we can
  // just substitute and be done.
  if (!isaVectorTransferRead(*opStmt) && !isaVectorTransferWrite(*opStmt)) {
    return map([substitutionsMap](
                   SSAValue *v) { return substitute(v, *substitutionsMap); },
               opStmt->getOperands());
  }

  // TODO(ntv): this error-prone boilerplate can be removed once we have a
  // proper Op for vector_transfer.
  unsigned offset = 0;
  unsigned numIndices = 0;
  SmallVector<MLValue *, 8> res;
  auto operands = opStmt->getOperands();
  if (isaVectorTransferRead(*opStmt)) {
    offset = 1;
    numIndices = opStmt->getNumOperands() - 1;
  } else if (isaVectorTransferWrite(*opStmt)) {
    offset = 2;
    numIndices = opStmt->getNumOperands() - 2;
  }
  // Copy as-is the [optional valueToStore] and the memref.
  for (unsigned i = 0; i < offset; ++i) {
    res.push_back(substitute(*(operands.begin() + i), *substitutionsMap));
  }

  MLFuncBuilder b(opStmt);
  // TODO(ntv): indices extraction is brittle and unsafe before we have an Op.
  SmallVector<SSAValue *, 8> indices;
  for (auto it = operands.begin() + offset; it != operands.end(); ++it) {
    indices.push_back(*it);
  }
  auto affineValues =
      reindexAffineIndices(&b, hwVectorType, hwVectorInstance, indices);
  res.append(affineValues.begin(), affineValues.end());

  return res;
}

// Returns attributes with the following substitutions applied:
//   - splat of `superVectorType` is replaced by splat of `hwVectorType`.
// TODO(ntv): add more substitutions on a per-need basis.
static SmallVector<NamedAttribute, 2>
materializeAttributes(OperationStmt *opStmt, VectorType superVectorType,
                      VectorType hwVectorType) {
  SmallVector<NamedAttribute, 2> res;
  for (auto a : opStmt->getAttrs()) {
    auto splat = a.second.dyn_cast<SplatElementsAttr>();
    bool splatOfSuperVectorType = splat && (splat.getType() == superVectorType);
    if (splatOfSuperVectorType) {
      auto attr = SplatElementsAttr::get(hwVectorType.cast<VectorType>(),
                                         splat.getValue());
      res.push_back(NamedAttribute(a.first, attr));
    } else {
      res.push_back(a);
    }
  }
  return res;
}

/// Returns `false` if the stmt instance is properly cloned and inserted,
/// `true` on failure.
/// The multi-dimensional `hwVectorInstance` belongs to the shapeRatio of
/// super-vector type to hw vector type.
/// A cloned instance of `stmt` is formed as follows:
///   1. vector_transfer_read: the return `superVectorType` is replaced by
///      `hwVectorType`. Additionally, affine indices are reindexed with
///      `reindexAffineIndices` using `hwVectorInstance` and vector type
///      information;
///   2. vector_transfer_write: the `valueToStore` type is simply substituted.
///      Since we operate on a topologically sorted slice, a substitution must
///      have been registered for non-constant ops. Additionally, affine
///      indices are reindexed in the same way as for vector_transfer_read;
///   3. constant ops are splats of the super-vector type by construction.
///      They are cloned to a splat on the hw vector type with the same value;
///   4. remaining ops are cloned to a version of the op that returns a hw
///      vector type, all operands are substituted according to
///      `substitutions`. Thanks to the topological order of a slice, the
///      substitution is always possible.
static bool cloneAndInsertHardwareVectorInstance(Statement *stmt,
                                                 MaterializationState *state) {
  LLVM_DEBUG(dbgs() << "\nclone" << *stmt);
  if (auto *opStmt = dyn_cast<OperationStmt>(stmt)) {
    // TODO(ntv): Is it worth considering an OperationStmt.clone operation
    // which changes the type so we can promote an OperationStmt with less
    // boilerplate?
    assert(opStmt->getNumResults() <= 1 && "NYI: opStmt has > 1 results");
    auto operands = cloneAndUnrollOperands(opStmt, state->hwVectorType,
                                           state->hwVectorInstance,
                                           state->substitutionsMap);
    MLFuncBuilder b(stmt);
    if (opStmt->getNumResults() == 0) {
      // vector_transfer_write
      b.createOperation(stmt->getLoc(), opStmt->getName(), operands, {},
                        materializeAttributes(opStmt, state->superVectorType,
                                              state->hwVectorType));
    } else {
      // vector_transfer_read
      auto *cloned = b.createOperation(
          stmt->getLoc(), opStmt->getName(), operands, {state->hwVectorType},
          materializeAttributes(opStmt, state->superVectorType,
                                state->hwVectorType));
      state->substitutionsMap->insert(std::make_pair(
          cast<MLValue>(opStmt->getResult(0)),
          cast<MLValue>(cast<OperationStmt>(cloned)->getResult(0))));
    }
    return false;
  }

  if (isa<ForStmt>(stmt)) {
    // Fail hard and wake up when needed.
    stmt->emitError("NYI path ForStmt");
    return true;
  }

  // Fail hard and wake up when needed.
  stmt->emitError("NYI path IfStmt");
  return true;
}

/// Takes a slice and rewrites the operations in it so that occurrences
/// of `superVectorType` are replaced by `hwVectorType`.
///
/// Implementation
/// ==============
///   1. computes the shape ratio of super-vector to HW vector shapes. This
///      gives, for each op in the slice, how many instantiations are required
///      in each dimension;
///   2. performs the concrete materialization. Note that in a first
///      implementation we use full unrolling because it pragmatically removes
///      the need to explicitly materialize an AllocOp. Thanks to the
///      properties of super-vectors, this unrolling is always possible and
///      simple: vectorizing to a super-vector abstraction already achieved the
///      equivalent of loop strip-mining + loop sinking and encoded this in the
///      vector type.
///
/// TODO(ntv): materialized allocs.
/// TODO(ntv): full loops + materialized allocs.
/// TODO(ntv): partial unrolling + materialized allocs.
static void emitSlice(MaterializationState *state,
                      SetVector<Statement *> *slice) {
  auto ratio = shapeRatio(state->superVectorType, state->hwVectorType);
  assert(ratio.hasValue() &&
         "ratio of super-vector to HW-vector shape is not integral");
  // The number of integer points in a hyperrectangular region is:
  //   shape[0] * strides[0].
  auto numValueToUnroll = (*ratio)[0] * makeStrides(*ratio)[0];
  // Full unrolling to hardware vectors in a first approximation.
  for (unsigned idx = 0; idx < numValueToUnroll; ++idx) {
    // Fresh RAII instanceIndices and substitutionsMap.
    MaterializationState scopedState = *state;
    scopedState.hwVectorInstance = delinearize(idx, *ratio);
    DenseMap<const MLValue *, MLValue *> substitutionMap;
    scopedState.substitutionsMap = &substitutionMap;
    // The slice is topologically sorted, so we can just clone the statements
    // in order.
    for (auto *stmt : *slice) {
      auto fail = cloneAndInsertHardwareVectorInstance(stmt, &scopedState);
      assert(!fail && "Unhandled super-vector materialization failure");
    }
  }
  // The slice is topologically sorted, so we can just erase the statements in
  // reverse order. A reverse iterator does not work simply with an operator*
  // dereference here.
  for (int idx = slice->size() - 1; idx >= 0; --idx) {
    (*slice)[idx]->erase();
  }
}

/// Materializes super-vector types into concrete hw vector types as follows:
///   1. start from super-vector terminators (current vector_transfer_write
///      ops);
///   2. collect all the statements that can be reached by transitive use-def
///      chains;
///   3. get the superVectorType for this particular terminator and the
///      corresponding hardware vector type (for now limited to F32);
///      TODO(ntv): be more general than F32.
///   4. emit the transitive useDef set to operate on the finer-grain vector
///      types.
///
/// Notes
/// =====
/// The `slice` is sorted in topological order by construction.
/// Additionally, this set is limited to statements in the same lexical scope
/// because we currently disallow vectorization of defs that come from another
/// scope.
static void materialize(MLFunction *f,
                        const SetVector<OperationStmt *> &terminators,
                        MaterializationState *state) {
  DenseSet<Statement *> seen;
  for (auto terminator : terminators) {
    LLVM_DEBUG(dbgs() << "\nFrom terminator:" << *terminator);

    // Short-circuit test, a given terminator may have been reached by some
    // other previous transitive use-def chains.
    if (seen.count(terminator) > 0) {
      continue;
    }

    // Terminators are vector_transfer_write with 0 results by construction atm.
    assert(isaVectorTransferWrite(*terminator) && "");
    assert(terminator->getNumResults() == 0 &&
           "NYI: terminators must have 0 results");

    // Get the transitive use-defs starting from terminator, limited to the
    // current enclosing scope of the terminator. See the Implementation
    // details note at the top of the file for the justification of this
    // restriction.
    // TODO(ntv): relax scoping constraints.
    auto *enclosingScope = terminator->getParentStmt();
    auto keepIfInSameScope = [enclosingScope](Statement *stmt) {
      assert(stmt && "NULL stmt");
      if (!enclosingScope) {
        // By construction, everyone is always under the top scope (null scope).
        return true;
      }
      return properlyDominates(*enclosingScope, *stmt);
    };
    SetVector<Statement *> slice =
        getSlice(terminator, keepIfInSameScope, keepIfInSameScope);
    assert(!slice.empty());

    // Sanity checks: transitive slice must be completely disjoint from
    // what we have seen so far.
    LLVM_DEBUG(dbgs() << "\nTransitive use-defs:");
    for (auto *ud : slice) {
      LLVM_DEBUG(dbgs() << "\nud:" << *ud);
      assert(seen.count(ud) == 0 &&
             "Transitive use-defs not disjoint from already seen");
      seen.insert(ud);
    }

    // Emit the current slice.
    // Set scoped super-vector and corresponding hw vector types.
    state->superVectorType =
        terminator->getOperand(0)->getType().cast<VectorType>();
    assert((state->superVectorType.getElementType() ==
            Type::getF32(terminator->getContext())) &&
           "Only f32 supported for now");
    state->hwVectorType = VectorType::get(
        state->hwVectorSize, state->superVectorType.getElementType());
    emitSlice(state, &slice);
    LLVM_DEBUG(dbgs() << "\nMLFunction is now\n");
    LLVM_DEBUG(f->print(dbgs()));
  }
}

PassResult MaterializeVectors::runOnMLFunction(MLFunction *f) {
  using matcher::Op;
  LLVM_DEBUG(dbgs() << "\nMaterializeVectors on MLFunction\n");
  LLVM_DEBUG(f->print(dbgs()));

  MaterializationState state;
  // Get the hardware vector type.
  // TODO(ntv): get elemental type from super-vector type rather than force f32.
  auto subVectorType =
      VectorType::get(state.hwVectorSize, Type::getF32(f->getContext()));

  // Capture terminators; i.e. vector_transfer_write ops involving a strict
  // super-vector of subVectorType.
  auto filter = [subVectorType](const Statement &stmt) {
    const auto &opStmt = cast<OperationStmt>(stmt);
    if (!isaVectorTransferWrite(opStmt)) {
      return false;
    }
    return matcher::operatesOnStrictSuperVectors(opStmt, subVectorType);
  };
  auto pat = Op(filter);
  auto matches = pat.match(f);
  SetVector<OperationStmt *> terminators;
  for (auto m : matches) {
    terminators.insert(cast<OperationStmt>(m.first));
  }

  // Call materialization.
  materialize(f, terminators, &state);
  return PassResult::Success;
}

FunctionPass *mlir::createMaterializeVectors() {
  return new MaterializeVectors();
}

static PassRegistration<MaterializeVectors>
    pass("materialize-vectors", "Materializes super-vectors to vectors of the "
                                "proper size for the hardware");

#undef DEBUG_TYPE
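Before the test file below, a quick standalone illustration (not part of this commit; a hedged sketch with illustrative names) of the instance offsets that the VEC2DTO1D checks expect. Lowering the super-vector vector<3x16xf32> to the hardware vector vector<8xf32> gives a shape ratio of {3, 2}, hence six instances whose affine shifts match MAP0..MAP5 in the test:
```cpp
// Sketch: enumerate the six (d0, d1) shifts for vector<3x16xf32> over an
// 8-wide hardware vector: (+0,+0), (+0,+8), (+1,+0), (+1,+8), (+2,+0), (+2,+8).
#include <cstdio>

int main() {
  const unsigned ratio[2] = {3, 2}; // 3x(16/8): instances per dimension.
  const unsigned hwSize = 8;        // hardware vector width.
  for (unsigned i = 0; i < ratio[0]; ++i)
    for (unsigned j = 0; j < ratio[1]; ++j)
      std::printf("(d0 + %u, d1 + %u)\n", i, j * hwSize);
  return 0;
}
```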
@@ -0,0 +1,87 @@
// RUN: mlir-opt %s -vectorize -virtual-vector-size 32 --test-fastest-varying=0 -materialize-vectors -vector-size=8 | FileCheck %s -check-prefix=VEC1DTO1D
// RUN: mlir-opt %s -vectorize -virtual-vector-size 3 -virtual-vector-size 16 --test-fastest-varying=1 --test-fastest-varying=0 -materialize-vectors -vector-size=8 | FileCheck %s -check-prefix=VEC2DTO1D
// RUN_: mlir-opt %s -vectorize -virtual-vector-size 3 -virtual-vector-size 32 --test-fastest-varying=1 --test-fastest-varying=0 -materialize-vectors -vector-size=3 -vector-size=16 | FileCheck %s -check-prefix=VEC2DTO2D

// vector<32xf32> -> vector<8xf32>
// VEC1DTO1D: [[MAP0:#.*]] = (d0, d1) -> (d0, d1)
// VEC1DTO1D: [[MAP1:#.*]] = (d0, d1) -> (d0, d1 + 8)
// VEC1DTO1D: [[MAP2:#.*]] = (d0, d1) -> (d0, d1 + 16)
// VEC1DTO1D: [[MAP3:#.*]] = (d0, d1) -> (d0, d1 + 24)
// vector<3x16xf32> -> vector<8xf32>
// VEC2DTO1D: [[MAP0:#.*]] = (d0, d1) -> (d0, d1)
// VEC2DTO1D: [[MAP1:#.*]] = (d0, d1) -> (d0, d1 + 8)
// VEC2DTO1D: [[MAP2:#.*]] = (d0, d1) -> (d0 + 1, d1)
// VEC2DTO1D: [[MAP3:#.*]] = (d0, d1) -> (d0 + 1, d1 + 8)
// VEC2DTO1D: [[MAP4:#.*]] = (d0, d1) -> (d0 + 2, d1)
// VEC2DTO1D: [[MAP5:#.*]] = (d0, d1) -> (d0 + 2, d1 + 8)
// vector<3x32xf32> -> vector<3x16xf32>
// VEC2DTO2D: [[MAP0:#.*]] = (d0, d1) -> (d0, d1)
// VEC2DTO2D: [[MAP1:#.*]] = (d0, d1) -> (d0, d1 + 16)
mlfunc @vector_add_2d(%M : index, %N : index) -> f32 {
  %A = alloc (%M, %N) : memref<?x?xf32, 0>
  %B = alloc (%M, %N) : memref<?x?xf32, 0>
  %C = alloc (%M, %N) : memref<?x?xf32, 0>
  %f1 = constant 1.0 : f32
  %f2 = constant 2.0 : f32
  for %i0 = 0 to %M {
    for %i1 = 0 to %N {
      // non-scoped %f1
      // VEC1DTO1D does 4x unrolling.
      // VEC1DTO1D: [[CST0:%.*]] = constant splat<vector<8xf32>, 1.000000e+00> : vector<8xf32>
      // VEC1DTO1D: [[CST1:%.*]] = constant splat<vector<8xf32>, 1.000000e+00> : vector<8xf32>
      // VEC1DTO1D: [[CST2:%.*]] = constant splat<vector<8xf32>, 1.000000e+00> : vector<8xf32>
      // VEC1DTO1D: [[CST3:%.*]] = constant splat<vector<8xf32>, 1.000000e+00> : vector<8xf32>
      // VEC1DTO1D: [[VAL0:%.*]] = affine_apply [[MAP0]]{{.*}}
      // VEC1DTO1D: "vector_transfer_write"([[CST0]], {{.*}}, [[VAL0]]#0, [[VAL0]]#1) : (vector<8xf32>
      // VEC1DTO1D: [[VAL1:%.*]] = affine_apply [[MAP1]]{{.*}}
      // VEC1DTO1D: "vector_transfer_write"([[CST1]], {{.*}}, [[VAL1]]#0, [[VAL1]]#1) : (vector<8xf32>
      // VEC1DTO1D: [[VAL2:%.*]] = affine_apply [[MAP2]]{{.*}}
      // VEC1DTO1D: "vector_transfer_write"([[CST2]], {{.*}}, [[VAL2]]#0, [[VAL2]]#1) : (vector<8xf32>
      // VEC1DTO1D: [[VAL3:%.*]] = affine_apply [[MAP3]]{{.*}}
      // VEC1DTO1D: "vector_transfer_write"([[CST3]], {{.*}}, [[VAL3]]#0, [[VAL3]]#1) : (vector<8xf32>
      //
      store %f1, %A[%i0, %i1] : memref<?x?xf32, 0>
    }
  }
  for %i2 = 0 to %M {
    for %i3 = 0 to %N {
      // non-scoped %f2
      // VEC2DTO1D does (3x2)x unrolling.
      // VEC2DTO1D-COUNT-6: {{.*}} = constant splat<vector<8xf32>, 1.000000e+00> : vector<8xf32>
      // VEC2DTO1D: [[VAL0:%.*]] = affine_apply [[MAP0]]{{.*}}
      // VEC2DTO1D: "vector_transfer_write"({{.*}}, [[VAL0]]#0, [[VAL0]]#1) : (vector<8xf32>
      // ... 4 other interleaved affine_apply, vector_transfer_write
      // VEC2DTO1D: [[VAL5:%.*]] = affine_apply [[MAP5]]{{.*}}
      // VEC2DTO1D: "vector_transfer_write"({{.*}}, [[VAL5]]#0, [[VAL5]]#1) : (vector<8xf32>
      //
      store %f2, %B[%i2, %i3] : memref<?x?xf32, 0>
    }
  }
  for %i4 = 0 to %M {
    for %i5 = 0 to %N {
      // VEC2DTO2D: %7 = affine_apply #map0(%i4, %i5)
      // VEC2DTO2D: %8 = "vector_transfer_read"(%0, %7#0, %7#1) : (memref<?x?xf32>, index, index) -> vector<3x16xf32>
      // VEC2DTO2D: %9 = affine_apply #map1(%i4, %i5)
      // VEC2DTO2D: %10 = "vector_transfer_read"(%0, %9#0, %9#1) : (memref<?x?xf32>, index, index) -> vector<3x16xf32>
      // VEC2DTO2D: %11 = affine_apply #map0(%i4, %i5)
      // VEC2DTO2D: %12 = "vector_transfer_read"(%1, %11#0, %11#1) : (memref<?x?xf32>, index, index) -> vector<3x16xf32>
      // VEC2DTO2D: %13 = affine_apply #map1(%i4, %i5)
      // VEC2DTO2D: %14 = "vector_transfer_read"(%1, %13#0, %13#1) : (memref<?x?xf32>, index, index) -> vector<3x16xf32>
      // VEC2DTO2D: %15 = addf %8, %12 : vector<3x16xf32>
      // VEC2DTO2D: %16 = addf %10, %14 : vector<3x16xf32>
      // VEC2DTO2D: %17 = affine_apply #map0(%i4, %i5)
      // VEC2DTO2D: "vector_transfer_write"(%15, %2, %17#0, %17#1) : (vector<3x16xf32>, memref<?x?xf32>, index, index) -> ()
      // VEC2DTO2D: %18 = affine_apply #map1(%i4, %i5)
      // VEC2DTO2D: "vector_transfer_write"(%16, %2, %18#0, %18#1) : (vector<3x16xf32>, memref<?x?xf32>, index, index) -> ()
      //
      %a5 = load %A[%i4, %i5] : memref<?x?xf32, 0>
      %b5 = load %B[%i4, %i5] : memref<?x?xf32, 0>
      %s5 = addf %a5, %b5 : f32
      store %s5, %C[%i4, %i5] : memref<?x?xf32, 0>
    }
  }
  %c7 = constant 7 : index
  %c42 = constant 42 : index
  %res = load %C[%c7, %c42] : memref<?x?xf32, 0>
  return %res : f32
}