[mlir][linalg] Lower PadTensorOps with non-constant pad value

The padding of such ops is not generated in a vectorized way. Instead,
emit a tensor::GenerateOp. We may vectorize GenerateOps in the future.

Differential Revision: https://reviews.llvm.org/D103879
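To illustrate the effect, here is a sketch based on the new test case added in this revision (IR fragments; %arg0, %c0, %c3, %c4, and %cst are as in the test, and the region body is illustrative):

  // Before: the pad value is computed from the index arguments, so it is
  // not a constant and the fill cannot use linalg.fill.
  %0 = linalg.pad_tensor %arg0 low[3, 4] high[4, 3] {
  ^bb0(%i: index, %j: index):
    %ii = index_cast %i : index to i32
    %fi = sitofp %ii : i32 to f32
    linalg.yield %fi : f32
  } : tensor<5x6xf32> to tensor<12x13xf32>

  // After: the padding is produced by an (unvectorized) tensor.generate
  // carrying the same region; the copy of the source is still vectorized.
  %fill = tensor.generate {
  ^bb0(%i: index, %j: index):
    %ii = index_cast %i : index to i32
    %fi = sitofp %ii : i32 to f32
    tensor.yield %fi : f32
  } : tensor<12x13xf32>
  %read = vector.transfer_read %arg0[%c0, %c0], %cst {in_bounds = [true, true]}
      : tensor<5x6xf32>, vector<5x6xf32>
  %1 = vector.transfer_write %read, %fill[%c3, %c4] {in_bounds = [true, true]}
      : vector<5x6xf32>, tensor<12x13xf32>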
@@ -689,10 +689,6 @@ struct GenericPadTensorOpVectorizationPattern
           padOp.getLoc(), getIntFromAttr(ofr.get<Attribute>())).getResult();
     };
 
-    // Pad value must be a constant.
-    auto padValue = padOp.getConstantPaddingValue();
-    if (!padValue) return failure();
-
     auto resultType = padOp.getResultType();
     // Compute size of InitTensorOp. Any combination of static/dynamic is
     // supported.
@@ -712,20 +708,20 @@ struct GenericPadTensorOpVectorizationPattern
       staticSizes.push_back(resultType.getDimSize(dim));
     }
 
     // Init tensor and fill it with padding.
     Value init = rewriter.create<InitTensorOp>(
         padOp.getLoc(), dynSizes, staticSizes, resultType.getElementType());
-    Value fill =
-        rewriter.create<FillOp>(padOp.getLoc(), init, padValue).result();
+    Value fill = tryVectorizeFill(rewriter, padOp, init, dynSizes);
 
-    auto sourceType = padOp.getSourceType();
     // Try vectorizing the copy of source.
-    if (tryVectorizeCopy(rewriter, padOp, padValue, fill).succeeded())
+    if (tryVectorizeCopy(rewriter, padOp, fill).succeeded())
       return success();
 
     // Neither source type nor PadTensorOp result type have static shape. Such
-    // PadTensorOps cannot be vectorized. Generate a SubTensorInsertOp instead.
+    // PadTensorOps cannot be vectorized. Generate a SubTensorInsertOp instead
+    // for copying the PadOp source.
 
+    auto sourceType = padOp.getSourceType();
     // Compute size of source of PadTensorOp.
     SmallVector<OpFoldResult> srcSizes;
     for (unsigned dim = 0; dim < sourceType.getRank(); ++dim) {
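When the copy cannot be vectorized, the pattern falls back to a SubTensorInsertOp. A minimal sketch of that fallback output, shown with the 5x6 -> 12x13 test shapes for readability (in the truly dynamic case the offsets and sizes are SSA values computed by the pattern):

  // Insert the unpadded source into the filled/generated tensor at the
  // low-pad offsets; sizes match the source, strides are 1.
  %copied = subtensor_insert %arg0 into %fill[3, 4] [5, 6] [1, 1]
      : tensor<5x6xf32> into tensor<12x13xf32>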
@@ -745,14 +741,54 @@ struct GenericPadTensorOpVectorizationPattern
     return success();
   }
 
+  /// Vectorize the filling of `dest`. This is possible if the padOp is padding
+  /// with a constant value. Otherwise, generate a tensor::GenerateOp.
+  Value tryVectorizeFill(PatternRewriter &rewriter, PadTensorOp padOp,
+                         Value dest, const SmallVector<Value> &dynSizes) const {
+    // Fill can be vectorized if padValue is a constant. (If there is enough
+    // static type information, the FillOp will be vectorized by another
+    // pattern.)
+    auto padValue = padOp.getConstantPaddingValue();
+    if (padValue)
+      return rewriter.create<FillOp>(padOp.getLoc(), dest, padValue).result();
+
+    // Fill could not be vectorized: Lower to tensor::GenerateOp with region.
+    auto generateOp = rewriter.create<tensor::GenerateOp>(
+        padOp.getLoc(), padOp.getResultType(), dynSizes);
+    // Copy region to new op.
+    BlockAndValueMapping bvm;
+    padOp.region().cloneInto(&generateOp.getRegion(), bvm);
+    // Rewrite linalg::YieldOp to tensor::YieldOp.
+    OpBuilder::InsertionGuard guard(rewriter);
+    auto yieldOp = dyn_cast<linalg::YieldOp>(
+        generateOp.getRegion().front().getTerminator());
+    assert(yieldOp && "malformed PadTensorOp: expected YieldOp terminator");
+    assert(yieldOp.values().size() == 1);
+    rewriter.setInsertionPoint(yieldOp);
+    rewriter.replaceOpWithNewOp<tensor::YieldOp>(yieldOp, yieldOp.values()[0]);
+    return generateOp;
+  }
+
   /// Vectorize the copying of a PadTensorOp's source. This is possible if each
   /// dimension size is statically known in the source type or the result type
   /// (or both).
   LogicalResult tryVectorizeCopy(PatternRewriter &rewriter, PadTensorOp padOp,
-                                 Value padValue, Value dest) const {
+                                 Value dest) const {
     auto sourceType = padOp.getSourceType();
     auto resultType = padOp.getResultType();
+
+    // Copy cannot be vectorized if pad value is non-constant and source shape
+    // is dynamic. In case of a dynamic source shape, padding must be appended
+    // by TransferReadOp, but TransferReadOp supports only constant padding.
+    auto padValue = padOp.getConstantPaddingValue();
+    if (!padValue) {
+      if (!sourceType.hasStaticShape()) return failure();
+      // Create dummy padding value.
+      auto elemType = sourceType.getElementType();
+      padValue = rewriter.create<ConstantOp>(padOp.getLoc(), elemType,
+                                             rewriter.getZeroAttr(elemType));
+    }
+
     SmallVector<int64_t> vecShape;
     SmallVector<bool> readInBounds;
     SmallVector<bool> writeInBounds;
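For reference, tensor.generate takes one operand per dynamic dimension of the result and a region that yields one scalar per element. A sketch of what tryVectorizeFill emits for a non-constant pad value with a dynamically sized result (%d0 and %d1 stand for the dynamic extents; the region body is illustrative, standing in for whatever the pad op's region computes, with linalg.yield rewritten to tensor.yield):

  %fill = tensor.generate %d0, %d1 {
  ^bb0(%i: index, %j: index):
    // Cloned from the linalg.pad_tensor region.
    %ii = index_cast %i : index to i32
    %f = sitofp %ii : i32 to f32
    tensor.yield %f : f32
  } : tensor<?x?xf32>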
@@ -674,6 +674,35 @@ func @pad_and_subtensor_insert(
 
 // -----
 
+// CHECK-LABEL: func @pad_tensor_non_const_pad_value
+//  CHECK-SAME:   %[[ARG0:.*]]: tensor<5x6xf32>
+//   CHECK-NOT:   linalg.pad_tensor
+//   CHECK-DAG:   %[[C0:.*]] = constant 0 : index
+//   CHECK-DAG:   %[[C3:.*]] = constant 3 : index
+//   CHECK-DAG:   %[[C4:.*]] = constant 4 : index
+//       CHECK:   %[[FILL:.*]] = tensor.generate
+//       CHECK:     %[[RES:.*]] = mulf
+//       CHECK:     tensor.yield %[[RES]] : f32
+//       CHECK:   %[[READ:.*]] = vector.transfer_read %[[ARG0]][%[[C0]], %[[C0]]], %{{.*}} {in_bounds = [true, true]} : tensor<5x6xf32>, vector<5x6xf32>
+//       CHECK:   %[[WRITE:.*]] = vector.transfer_write %[[READ]], %[[FILL]][%[[C3]], %[[C4]]] {in_bounds = [true, true]} : vector<5x6xf32>, tensor<12x13xf32>
+//       CHECK:   return %[[WRITE]]
+func @pad_tensor_non_const_pad_value(%arg0: tensor<5x6xf32>) -> tensor<12x13xf32> {
+  %c0 = constant 0 : index
+  %c5 = constant 5.0 : f32
+  %0 = linalg.pad_tensor %arg0 low[3, 4] high[4, 3] {
+    ^bb0(%arg1: index, %arg2: index):
+      %i1 = index_cast %arg1 : index to i32
+      %i2 = index_cast %arg2 : index to i32
+      %f1 = sitofp %i1 : i32 to f32
+      %f2 = sitofp %i2 : i32 to f32
+      %m = mulf %f1, %f2 : f32
+      linalg.yield %m : f32
+  } : tensor<5x6xf32> to tensor<12x13xf32>
+  return %0 : tensor<12x13xf32>
+}
+
+// -----
+
 // CHECK-DAG: #[[$M0:.*]] = affine_map<(d0, d1) -> (d0, d1, 0)>
 
 // CHECK-LABEL: func @sum_exp
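One subtlety visible in the CHECK lines above: vector.transfer_read always requires a padding scalar, even though both in_bounds flags are true here and the padding is never actually read. That is what the "dummy padding value" in tryVectorizeCopy is for. A self-contained sketch mirroring the test's transfer ops (the function name is hypothetical):

  func @vectorized_copy(%src: tensor<5x6xf32>, %dest: tensor<12x13xf32>)
      -> tensor<12x13xf32> {
    %c0 = constant 0 : index
    %c3 = constant 3 : index
    %c4 = constant 4 : index
    // Dummy padding value; unused because the read is fully in-bounds.
    %cst = constant 0.0 : f32
    %read = vector.transfer_read %src[%c0, %c0], %cst {in_bounds = [true, true]}
        : tensor<5x6xf32>, vector<5x6xf32>
    %write = vector.transfer_write %read, %dest[%c3, %c4] {in_bounds = [true, true]}
        : vector<5x6xf32>, tensor<12x13xf32>
    return %write : tensor<12x13xf32>
  }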