[mlir] Support masked N-D vector transfer ops in ProgressiveVectorToSCF.

Mask vectors are handled similarly to data vectors in N-D TransferWriteOp. They are copied into a temporary memory buffer, which can be indexed into with non-constant values. Differential Revision: https://reviews.llvm.org/D101136
2021-04-23 18:11:07 +09:00 · 2021-04-23 18:11:07 +09:00 · 64f7fb5dfc
parent c623945d70
commit 64f7fb5dfc
2 changed files with 132 additions and 46 deletions
--- a/mlir/lib/Conversion/VectorToSCF/ProgressiveVectorToSCF.cpp
+++ b/mlir/lib/Conversion/VectorToSCF/ProgressiveVectorToSCF.cpp
@ -56,16 +56,34 @@ static MemRefType unpackOneDim(MemRefType type) {
                                         vectorType.getElementType()));
 }

-// TODO: Parallelism and threadlocal considerations.
-static Value setAllocAtFunctionEntry(MemRefType type, Operation *op) {
+/// Helper data structure for data and mask buffers.
+struct BufferAllocs {
+  Value dataBuffer; // Buffer for the transferred vector.
+  Value maskBuffer; // Buffer for the mask; only assigned if the op has a mask.
+};
+
+/// Allocate temporary buffers for data (vector) and mask (if present).
+///
+/// Both buffers are allocas placed at the start of the enclosing
+/// AutomaticAllocationScope. If `xferOp` has a mask, the mask is stored into
+/// its buffer right before `xferOp`; otherwise `maskBuffer` is not assigned.
+/// The builder's insertion point is restored before returning.
+/// TODO: Parallelism and threadlocal considerations.
+template <typename OpTy>
+static BufferAllocs allocBuffers(OpTy xferOp) {
  auto &b = ScopedContext::getBuilderRef();
  OpBuilder::InsertionGuard guard(b);
  Operation *scope =
-      op->getParentWithTrait<OpTrait::AutomaticAllocationScope>();
+      xferOp->template getParentWithTrait<OpTrait::AutomaticAllocationScope>();
  assert(scope && "Expected op to be inside automatic allocation scope");
  b.setInsertionPointToStart(&scope->getRegion(0).front());
-  Value res = memref_alloca(type);
-  return res;
+
+  BufferAllocs result;
+  auto bufferType = MemRefType::get({}, xferOp.getVectorType());
+  result.dataBuffer = memref_alloca(bufferType).value;
+
+  if (xferOp.mask()) {
+    auto maskType = MemRefType::get({}, xferOp.mask().getType());
+    result.maskBuffer = memref_alloca(maskType).value;
+    // Only the alloca belongs at the start of the allocation scope. The mask
+    // is an SSA value that is generally defined later, so the store has to be
+    // emitted at the transfer op, where the mask is known to dominate.
+    b.setInsertionPoint(xferOp);
+    memref_store(xferOp.mask(), result.maskBuffer);
+  }
+
+  return result;
 }

 /// Given a vector transfer op, calculate which dimension of the `source`
@ -238,6 +256,16 @@ static ArrayAttr dropFirstElem(OpBuilder &builder, ArrayAttr attr) {
  return ArrayAttr::get(builder.getContext(), attr.getValue().drop_front());
 }

+/// Return the memref that the mask of `xferOp` is loaded from. The mask must
+/// be present and must be the result of a memref::LoadOp; the memref of that
+/// load is returned. Analogous to Strategy<TransferWriteOp>::getBuffer.
+template <typename OpTy>
+static Value getMaskBuffer(OpTy xferOp) {
+  Value mask = xferOp.mask();
+  assert(mask && "Expected that transfer op has mask");
+
+  auto maskLoad = mask.getDefiningOp<memref::LoadOp>();
+  assert(maskLoad && "Expected transfer op mask produced by LoadOp");
+
+  return maskLoad.getMemRef();
+}
+
 /// Codegen strategy, depending on the operation.
 template <typename OpTy>
 struct Strategy;
@ -266,8 +294,8 @@ struct Strategy<TransferReadOp> {
    return getStoreOp(xferOp).getMemRef();
  }

-  /// Retrieve the indices of the current StoreOp.
-  static void getStoreIndices(TransferReadOp xferOp,
+  /// Retrieve the indices of the current StoreOp that stores into the buffer.
+  static void getBufferIndices(TransferReadOp xferOp,
                               SmallVector<Value, 8> &indices) {
    auto storeOp = getStoreOp(xferOp);
    auto prevIndices = memref::StoreOpAdaptor(storeOp).indices();
@ -300,10 +328,11 @@ struct Strategy<TransferReadOp> {
  ///
  /// Note: The loop and type cast are generated in TransferOpConversion.
  ///       The original TransferReadOp and store op are deleted in `cleanup`.
-  static void rewriteOp(OpBuilder &builder, TransferReadOp xferOp,
+  /// Note: The `mask` operand is set in TransferOpConversion.
+  static TransferReadOp rewriteOp(OpBuilder &builder, TransferReadOp xferOp,
                                  Value buffer, Value iv) {
    SmallVector<Value, 8> storeIndices;
-    getStoreIndices(xferOp, storeIndices);
+    getBufferIndices(xferOp, storeIndices);
    storeIndices.push_back(iv);

    SmallVector<Value, 8> xferIndices;
@ -321,6 +350,7 @@ struct Strategy<TransferReadOp> {
        newXfer.getDefiningOp()->setAttr(kPassLabel, builder.getUnitAttr());

    memref_store(newXfer, buffer, storeIndices);
+    return newXfer.getDefiningOp<TransferReadOp>();
  }

  /// Handle out-of-bounds accesses on the to-be-unpacked dimension: Write
@ -329,7 +359,7 @@ struct Strategy<TransferReadOp> {
      OpBuilder &/*builder*/, TransferReadOp xferOp, Value buffer,
      Value iv) {
    SmallVector<Value, 8> storeIndices;
-    getStoreIndices(xferOp, storeIndices);
+    getBufferIndices(xferOp, storeIndices);
    storeIndices.push_back(iv);

    auto bufferType = buffer.getType().dyn_cast<ShapedType>();
@ -361,8 +391,8 @@ struct Strategy<TransferWriteOp> {
    return loadOp.getMemRef();
  }

-  /// Retrieve the indices of the current LoadOp.
-  static void getLoadIndices(TransferWriteOp xferOp,
+  /// Retrieve the indices of the current LoadOp that loads from the buffer.
+  static void getBufferIndices(TransferWriteOp xferOp,
                               SmallVector<Value, 8> &indices) {
    auto loadOp = xferOp.vector().getDefiningOp<memref::LoadOp>();
    auto prevIndices = memref::LoadOpAdaptor(loadOp).indices();
@ -378,10 +408,10 @@ struct Strategy<TransferWriteOp> {
  ///    to memory.
  ///
  /// Note: For more details, see comments on Strategy<TransferReadOp>.
-  static void rewriteOp(OpBuilder &builder, TransferWriteOp xferOp,
+  static TransferWriteOp rewriteOp(OpBuilder &builder, TransferWriteOp xferOp,
                                   Value buffer, Value iv) {
    SmallVector<Value, 8> loadIndices;
-    getLoadIndices(xferOp, loadIndices);
+    getBufferIndices(xferOp, loadIndices);
    loadIndices.push_back(iv);

    SmallVector<Value, 8> xferIndices;
@ -397,6 +427,8 @@ struct Strategy<TransferWriteOp> {

    if (vecType.getRank() > kTargetRank)
        newXfer.op->setAttr(kPassLabel, builder.getUnitAttr());
+
+    return newXfer;
  }

  /// Handle out-of-bounds accesses on the to-be-unpacked dimension.
@ -416,8 +448,6 @@ LogicalResult checkPrepareXferOp(OpTy xferOp) {
      return failure();
  if (xferOp.getVectorType().getRank() <= kTargetRank)
      return failure();
-  if (xferOp.mask())
-      return failure();
  return success();
 }

@ -442,6 +472,8 @@ LogicalResult checkPrepareXferOp(OpTy xferOp) {
 /// memref.store %1, %0[] : memref<vector<5x4xf32>>
 /// %vec = memref.load %0[] : memref<vector<5x4xf32>>
 /// ```
+///
+/// Note: A second temporary buffer may be allocated for the `mask` operand.
 struct PrepareTransferReadConversion
    : public OpRewritePattern<TransferReadOp> {
  using OpRewritePattern<TransferReadOp>::OpRewritePattern;
@ -452,12 +484,16 @@ struct PrepareTransferReadConversion
      return failure();

    ScopedContext scope(rewriter, xferOp.getLoc());
-    auto allocType = MemRefType::get({}, xferOp.getVectorType());
-    auto buffer = setAllocAtFunctionEntry(allocType, xferOp);
+    auto buffers = allocBuffers(xferOp);
    auto *newXfer = rewriter.clone(*xferOp.getOperation());
    newXfer->setAttr(kPassLabel, rewriter.getUnitAttr());
-    memref_store(newXfer->getResult(0), buffer);
-    rewriter.replaceOpWithNewOp<memref::LoadOp>(xferOp, buffer);
+    if (xferOp.mask()) {
+      auto loadedMask = memref_load(buffers.maskBuffer);
+      dyn_cast<TransferReadOp>(newXfer).maskMutable().assign(loadedMask);
+    }
+
+    memref_store(newXfer->getResult(0), buffers.dataBuffer);
+    rewriter.replaceOpWithNewOp<memref::LoadOp>(xferOp, buffers.dataBuffer);

    return success();
  }
@ -484,6 +520,8 @@ struct PrepareTransferReadConversion
 /// vector.transfer_write %1, %A[%a, %b, %c] { __vector_to_scf_lowering__ }
 ///     : vector<5x4xf32>, memref<?x?x?xf32>
 /// ```
+///
+/// Note: A second temporary buffer may be allocated for the `mask` operand.
 struct PrepareTransferWriteConversion
    : public OpRewritePattern<TransferWriteOp> {
  using OpRewritePattern<TransferWriteOp>::OpRewritePattern;
@ -494,16 +532,20 @@ struct PrepareTransferWriteConversion
      return failure();

    ScopedContext scope(rewriter, xferOp.getLoc());
-    auto allocType = MemRefType::get({}, xferOp.getVectorType());
-    auto buffer = setAllocAtFunctionEntry(allocType, xferOp);
-    memref_store(xferOp.vector(), buffer);
-    auto loadedVec = memref_load(buffer);
-
+    auto buffers = allocBuffers(xferOp);
+    memref_store(xferOp.vector(), buffers.dataBuffer);
+    auto loadedVec = memref_load(buffers.dataBuffer);
    rewriter.updateRootInPlace(xferOp, [&]() {
      xferOp.vectorMutable().assign(loadedVec);
      xferOp->setAttr(kPassLabel, rewriter.getUnitAttr());
    });

+    if (xferOp.mask()) {
+      auto loadedMask = memref_load(buffers.maskBuffer);
+      rewriter.updateRootInPlace(
+          xferOp, [&]() { xferOp.maskMutable().assign(loadedMask); });
+    }
+
    return success();
  }
 };
@ -535,16 +577,28 @@ struct TransferOpConversion : public OpRewritePattern<OpTy> {
        return failure();

    ScopedContext scope(rewriter, xferOp.getLoc());
-    // How the buffer can be found depends on OpTy.
-    auto buffer = Strategy<OpTy>::getBuffer(xferOp);
-    auto bufferType = buffer.getType().template dyn_cast<MemRefType>();
-    auto castedType = unpackOneDim(bufferType);
-    auto casted = vector_type_cast(castedType, buffer);
+
+    // Find and cast data buffer. How the buffer can be found depends on OpTy.
+    auto dataBuffer = Strategy<OpTy>::getBuffer(xferOp);
+    auto dataBufferType = dataBuffer.getType().template dyn_cast<MemRefType>();
+    auto castedDataType = unpackOneDim(dataBufferType);
+    auto castedDataBuffer = vector_type_cast(castedDataType, dataBuffer);
+
+    // If the xferOp has a mask: Find and cast mask buffer.
+    Value castedMaskBuffer;
+    if (xferOp.mask()) {
+      auto maskBuffer = getMaskBuffer(xferOp);
+      auto maskBufferType =
+          maskBuffer.getType().template dyn_cast<MemRefType>();
+      auto castedMaskType = unpackOneDim(maskBufferType);
+      castedMaskBuffer = vector_type_cast(castedMaskType, maskBuffer);
+    }

    // Loop bounds and step.
    auto lb = std_constant_index(0).value;
    auto ub = std_constant_index(
-        castedType.getDimSize(castedType.getRank() - 1)).value;
+                  castedDataType.getDimSize(castedDataType.getRank() - 1))
+                  .value;
    auto step = std_constant_index(1).value;

    // Generate for loop.
@ -555,10 +609,30 @@ struct TransferOpConversion : public OpRewritePattern<OpTy> {
      ScopedContext scope(b, loc);
      generateInBoundsCheck(
          xferOp, iv, b, unpackedDim(xferOp),
-          /*inBoundsCase=*/[&](OpBuilder &b, Location /*loc*/) {
-        Strategy<OpTy>::rewriteOp(b, xferOp, casted, iv);
-      }, /*outOfBoundsCase=*/[&](OpBuilder &b, Location /*loc*/) {
-        Strategy<OpTy>::handleOutOfBoundsDim(b, xferOp, casted, iv);
+          /*inBoundsCase=*/
+          [&](OpBuilder &b, Location /*loc*/) {
+            // Create new transfer op.
+            OpTy newXfer =
+                Strategy<OpTy>::rewriteOp(b, xferOp, castedDataBuffer, iv);
+
+            // If old transfer op has a mask: Set mask on new transfer op.
+            if (xferOp.mask()) {
+              OpBuilder::InsertionGuard guard(b);
+              b.setInsertionPoint(newXfer); // Insert load before newXfer.
+
+              SmallVector<Value, 8> loadIndices;
+              Strategy<OpTy>::getBufferIndices(xferOp, loadIndices);
+              loadIndices.push_back(iv);
+
+              auto mask = memref_load(castedMaskBuffer, loadIndices);
+              rewriter.updateRootInPlace(
+                  newXfer, [&]() { newXfer.maskMutable().assign(mask); });
+            }
+          },
+          /*outOfBoundsCase=*/
+          [&](OpBuilder &b, Location /*loc*/) {
+            Strategy<OpTy>::handleOutOfBoundsDim(b, xferOp, castedDataBuffer,
+                                                 iv);
          });
      b.create<scf::YieldOp>(loc);
    });
--- a/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-read-2d.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-read-2d.mlir
@ -1,8 +1,3 @@
-// RUN: mlir-opt %s -convert-vector-to-scf -lower-affine -convert-scf-to-std -convert-vector-to-llvm -convert-std-to-llvm | \
-// RUN: mlir-cpu-runner -e entry -entry-point-result=void  \
-// RUN:   -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \
-// RUN: FileCheck %s
-
 // RUN: mlir-opt %s -test-progressive-convert-vector-to-scf -lower-affine -convert-scf-to-std -convert-vector-to-llvm -convert-std-to-llvm | \
 // RUN: mlir-cpu-runner -e entry -entry-point-result=void  \
 // RUN:   -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \
@ -17,6 +12,19 @@ func @transfer_read_2d(%A : memref<?x?xf32>, %base1: index, %base2: index) {
  return
 }

+// Masked 2-D read with an identity permutation map: reads a 4x9 vector from
+// %A at [%base1, %base2] using a constant mask and -42 as padding, then
+// prints the result.
+func @transfer_read_2d_mask(%A : memref<?x?xf32>, %base1: index, %base2: index) {
+  %pad = constant -42.0: f32
+  %m = constant dense<[[1, 0, 1, 0, 1, 1, 1, 0, 1],
+                       [0, 0, 1, 1, 1, 1, 1, 0, 1],
+                       [1, 1, 1, 1, 1, 1, 1, 0, 1],
+                       [0, 0, 1, 0, 1, 1, 1, 0, 1]]> : vector<4x9xi1>
+  %v = vector.transfer_read %A[%base1, %base2], %pad, %m
+      {permutation_map = affine_map<(d0, d1) -> (d0, d1)>}
+      : memref<?x?xf32>, vector<4x9xf32>
+  vector.print %v : vector<4x9xf32>
+  return
+}
+
 func @transfer_read_2d_transposed(
    %A : memref<?x?xf32>, %base1: index, %base2: index) {
  %fm42 = constant -42.0: f32
@ -80,7 +88,10 @@ func @entry() {
  call @transfer_write_2d(%A, %c3, %c1) : (memref<?x?xf32>, index, index) -> ()
  // Read shifted by 0 and pad with -42:
  call @transfer_read_2d(%A, %c0, %c0) : (memref<?x?xf32>, index, index) -> ()
-  // Same as above, but transposed
+  // Same as above, but apply a mask
+  call @transfer_read_2d_mask(%A, %c0, %c0)
+      : (memref<?x?xf32>, index, index) -> ()
+  // Same as above, but without mask and transposed
  call @transfer_read_2d_transposed(%A, %c0, %c0)
      : (memref<?x?xf32>, index, index) -> ()
  // Second vector dimension is a broadcast
@ -92,5 +103,6 @@ func @entry() {
 // CHECK: ( ( 12, 13, -42, -42, -42, -42, -42, -42, -42 ), ( 22, 23, -42, -42, -42, -42, -42, -42, -42 ), ( -42, -42, -42, -42, -42, -42, -42, -42, -42 ), ( -42, -42, -42, -42, -42, -42, -42, -42, -42 ) )
 // CHECK: ( ( 12, 22, -42, -42, -42, -42, -42, -42, -42 ), ( 13, 23, -42, -42, -42, -42, -42, -42, -42 ), ( -42, -42, -42, -42, -42, -42, -42, -42, -42 ), ( -42, -42, -42, -42, -42, -42, -42, -42, -42 ) )
 // CHECK: ( ( 0, 1, 2, 3, -42, -42, -42, -42, -42 ), ( 10, 11, 12, 13, -42, -42, -42, -42, -42 ), ( 20, 21, 22, 23, -42, -42, -42, -42, -42 ), ( -42, -42, -42, -42, -42, -42, -42, -42, -42 ) )
+// CHECK: ( ( 0, -42, 2, -42, -42, -42, -42, -42, -42 ), ( -42, -42, 12, 13, -42, -42, -42, -42, -42 ), ( 20, 21, 22, 23, -42, -42, -42, -42, -42 ), ( -42, -42, -42, -42, -42, -42, -42, -42, -42 ) )
 // CHECK: ( ( 0, 10, 20, -42, -42, -42, -42, -42, -42 ), ( 1, 11, 21, -42, -42, -42, -42, -42, -42 ), ( 2, 12, 22, -42, -42, -42, -42, -42, -42 ), ( 3, 13, 23, -42, -42, -42, -42, -42, -42 ) )
 // CHECK: ( ( 12, 12, 12, 12, 12, 12, 12, 12, 12 ), ( 13, 13, 13, 13, 13, 13, 13, 13, 13 ), ( -42, -42, -42, -42, -42, -42, -42, -42, -42 ), ( -42, -42, -42, -42, -42, -42, -42, -42, -42 ) )