[mlir] Vectorize linalg.pad_tensor consumed by transfer_write

Vectorize linalg.pad_tensor without generating a linalg.init_tensor when the result is consumed by a vector.transfer_write.

Differential Revision: https://reviews.llvm.org/D103137
Matthias Springer 2021-06-14 10:16:22 +09:00
parent b1fd8a13cc
commit 562f9e995d
2 changed files with 186 additions and 2 deletions


@@ -784,6 +784,141 @@ struct PadTensorOpVectorizationWithTransferReadPattern
}
};
/// Rewrite use of PadTensorOp result in TransferWriteOp.
/// This pattern rewrites TransferWriteOps that write to a padded tensor value,
/// where the same amount of padding is immediately removed again after the
/// write. In such cases, the TransferWriteOp can write to the non-padded tensor
/// value and apply out-of-bounds masking. E.g.:
/// ```
/// %0 = subtensor ...[...] [%s0, %s1] [1, 1] : tensor<...> to tensor<?x?xf32>
/// %1 = linalg.pad_tensor %0 ... : tensor<?x?xf32> to tensor<17x5xf32>
/// %2 = vector.transfer_write %vec, %1[...]
/// : vector<17x5xf32>, tensor<17x5xf32>
/// %r = subtensor %2[0, 0] [%s0, %s1] [1, 1]
/// : tensor<17x5xf32> to tensor<?x?xf32>
/// ```
/// is rewritten to:
/// ```
/// %0 = subtensor ...[...] [%s0, %s1] [1, 1] : tensor<...> to tensor<?x?xf32>
/// %r = vector.transfer_write %vec, %0[...] : vector<17x5xf32>, tensor<?x?xf32>
/// ```
/// Note: It is important that the SubTensorOp %r resizes the result of the
/// TransferWriteOp to the same size as the input of the PadTensorOp (or an even
/// smaller size). Otherwise, %r's new (dynamic) dimensions would differ from
/// %r's old dimensions.
///
/// This rewrite is possible if:
/// - Low padding is static 0.
/// - `xferOp` has exactly one use, which is a SubTensorOp. This SubTensorOp
/// trims the same amount of padding that was added beforehand.
/// - Single, scalar padding value.
struct PadTensorOpVectorizationWithTransferWritePattern
: public VectorizePadTensorOpUserPattern<vector::TransferWriteOp> {
using VectorizePadTensorOpUserPattern<vector::TransferWriteOp>
::VectorizePadTensorOpUserPattern;
LogicalResult rewriteUser(PatternRewriter &rewriter, PadTensorOp padOp,
vector::TransferWriteOp xferOp) const override {
// Low padding must be static 0.
if (!padOp.hasZeroLowPad()) return failure();
// Pad value must be a constant.
auto padValue = padOp.getConstantPaddingValue();
if (!padValue) return failure();
// TransferWriteOp result must be directly consumed by a SubTensorOp.
if (!xferOp->hasOneUse()) return failure();
auto trimPadding = dyn_cast<SubTensorOp>(*xferOp->user_begin());
if (!trimPadding) return failure();
// Only static zero offsets supported when trimming padding.
if (!trimPadding.hasZeroOffset()) return failure();
// trimPadding must remove the amount of padding that was added earlier.
if (!hasSameTensorSize(padOp.source(), trimPadding)) return failure();
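// Write the vector into the unpadded source tensor, with every dimension
// marked as potentially out-of-bounds. Lanes that fall outside the source
// tensor correspond to the padding area and are dropped by the write.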
SmallVector<bool> inBounds(xferOp.getVectorType().getRank(), false);
auto newXferOp = rewriter.replaceOpWithNewOp<vector::TransferWriteOp>(
xferOp, padOp.source().getType(), xferOp.vector(), padOp.source(),
xferOp.indices(), xferOp.permutation_mapAttr(), xferOp.mask(),
rewriter.getBoolArrayAttr(inBounds));
rewriter.replaceOp(trimPadding, newXferOp->getResult(0));
return success();
}
/// Check if `beforePadding` and `afterTrimming` have the same tensor size,
/// i.e., same dimensions.
///
/// Dimensions may be static, dynamic, or a mix of both. In case of dynamic
/// dimensions, this function tries to infer the (static) tensor size by
/// looking at the defining op and utilizing op-specific knowledge.
///
/// This is a conservative analysis. In case equal tensor sizes cannot be
/// proven statically, this analysis returns `false` even though the tensor
/// sizes may turn out to be equal at runtime.
bool hasSameTensorSize(Value beforePadding, SubTensorOp afterTrimming) const {
// If the input to PadTensorOp is a CastOp, try with both the CastOp result
// and the CastOp operand.
if (auto castOp = beforePadding.getDefiningOp<tensor::CastOp>())
if (hasSameTensorSize(castOp.source(), afterTrimming)) return true;
auto t1 = beforePadding.getType().dyn_cast<RankedTensorType>();
auto t2 = afterTrimming.getType().dyn_cast<RankedTensorType>();
// Only RankedTensorType supported.
if (!t1 || !t2) return false;
// Rank of both values must be the same.
if (t1.getRank() != t2.getRank()) return false;
// All static dimensions must be the same. Mixed cases (e.g., dimension
// static in `t1` but dynamic in `t2`) are not supported.
for (unsigned i = 0; i < t1.getRank(); ++i) {
if (t1.isDynamicDim(i) != t2.isDynamicDim(i))
return false;
if (!t1.isDynamicDim(i) && t1.getDimSize(i) != t2.getDimSize(i))
return false;
}
// Nothing more to check if all dimensions are static.
if (t1.getNumDynamicDims() == 0) return true;
// All dynamic sizes must be the same. The only supported case at the
// moment is when `beforePadding` is a SubTensorOp (or a cast thereof).
auto beforeSubtensor = beforePadding.getDefiningOp<SubTensorOp>();
if (!beforeSubtensor) return false;
assert(static_cast<size_t>(t1.getRank())
== beforeSubtensor.getMixedSizes().size());
assert(static_cast<size_t>(t2.getRank())
== afterTrimming.getMixedSizes().size());
for (unsigned i = 0; i < t1.getRank(); ++i) {
// Skip static dimensions.
if (!t1.isDynamicDim(i)) continue;
auto size1 = beforeSubtensor.getMixedSizes()[i];
auto size2 = afterTrimming.getMixedSizes()[i];
// Case 1: Same value or same constant int.
if (isEqualConstantIntOrValue(size1, size2)) continue;
// Other cases: Take a deeper look at defining ops of values.
auto v1 = size1.dyn_cast<Value>();
auto v2 = size2.dyn_cast<Value>();
if (!v1 || !v2) return false;
// Case 2: Both values are identical AffineMinOps. (Should not happen if
// CSE is run.)
auto minOp1 = v1.getDefiningOp<AffineMinOp>();
auto minOp2 = v2.getDefiningOp<AffineMinOp>();
if (minOp1 && minOp2 && minOp1.getAffineMap() == minOp2.getAffineMap()
&& minOp1.operands() == minOp2.operands()) continue;
// Add additional cases as needed.
}
// All tests passed.
return true;
}
};
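The dynamic-size matching in `hasSameTensorSize` is easiest to see on tiled IR before CSE has run. Below is a minimal, hypothetical sketch (not part of this change; all names are illustrative) in which the subtensor feeding the padding and the subtensor trimming it take their sizes from two syntactically identical `affine.min` ops, so Case 2 above proves the sizes equal:
```
func @dynamic_size_via_affine_min(
    %t: tensor<?x?xf32>, %vec: vector<7x9xf32>, %ub: index,
    %h: index) -> tensor<?x6xf32> {
  %c0 = constant 0 : index
  %cst = constant 5.0 : f32
  // Two distinct affine.min ops with the same map and operands, as tiling
  // typically produces before CSE deduplicates them.
  %sz0 = affine.min affine_map<(d0) -> (d0, 7)>(%ub)
  %sz1 = affine.min affine_map<(d0) -> (d0, 7)>(%ub)
  %s = subtensor %t[0, 0] [%sz0, 6] [1, 1]
      : tensor<?x?xf32> to tensor<?x6xf32>
  %0 = linalg.pad_tensor %s low[0, 0] high[%h, 7] {
  ^bb0(%arg2: index, %arg3: index):
    linalg.yield %cst : f32
  } : tensor<?x6xf32> to tensor<?x13xf32>
  %1 = vector.transfer_write %vec, %0[%c0, %c0]
      : vector<7x9xf32>, tensor<?x13xf32>
  // Trimming with %sz1 instead of %sz0: Case 1 (same SSA value) fails, but
  // the identical affine.min ops let hasSameTensorSize succeed (Case 2).
  %2 = subtensor %1[0, 0] [%sz1, 6] [1, 1]
      : tensor<?x13xf32> to tensor<?x6xf32>
  return %2 : tensor<?x6xf32>
}
```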
/// Rewrite use of PadTensorOp result in SubTensorInsertOp. E.g.:
/// ```
/// %0 = linalg.pad_tensor %src ... : tensor<?x?xf32> to tensor<17x5xf32>
@@ -807,8 +942,8 @@ struct PadTensorOpVectorizationWithTransferReadPattern
/// - Single, scalar padding value.
struct PadTensorOpVectorizationWithSubTensorInsertPattern
: public VectorizePadTensorOpUserPattern<SubTensorInsertOp> {
using VectorizePadTensorOpUserPattern<SubTensorInsertOp>
::VectorizePadTensorOpUserPattern;
LogicalResult rewriteUser(PatternRewriter &rewriter, PadTensorOp padOp,
SubTensorInsertOp insertOp) const override {
@@ -864,6 +999,7 @@ void mlir::linalg::populatePadTensorOpVectorizationPatterns(
patterns.getContext(), baseBenefit);
// Try these specialized patterns first before resorting to the generic one.
patterns.add<PadTensorOpVectorizationWithTransferReadPattern,
PadTensorOpVectorizationWithTransferWritePattern,
PadTensorOpVectorizationWithSubTensorInsertPattern>(
patterns.getContext(), baseBenefit.getBenefit() + 1);
}


@@ -580,6 +580,54 @@ func @pad_and_transfer_read(%arg0: tensor<5x6xf32>) -> vector<7x9xf32> {
// -----
// CHECK-LABEL: func @pad_and_transfer_write_static
// CHECK-SAME: %[[ARG0:.*]]: tensor<5x6xf32>, %[[ARG1:.*]]: vector<7x9xf32>
// CHECK-NOT: linalg.pad_tensor
// CHECK: %[[C0:.*]] = constant 0 : index
// CHECK: %[[RESULT:.*]] = vector.transfer_write %[[ARG1]], %[[ARG0]][%[[C0]], %[[C0]]] : vector<7x9xf32>, tensor<5x6xf32>
// CHECK: return %[[RESULT]]
func @pad_and_transfer_write_static(
%arg0: tensor<5x6xf32>, %arg1: vector<7x9xf32>) -> tensor<5x6xf32> {
%c0 = constant 0 : index
%c5 = constant 5.0 : f32
%0 = linalg.pad_tensor %arg0 low[0, 0] high[5, 7] {
^bb0(%arg2: index, %arg3: index):
linalg.yield %c5 : f32
} : tensor<5x6xf32> to tensor<10x13xf32>
%1 = vector.transfer_write %arg1, %0[%c0, %c0]
: vector<7x9xf32>, tensor<10x13xf32>
%2 = subtensor %1[0, 0] [5, 6] [1, 1] : tensor<10x13xf32> to tensor<5x6xf32>
return %2 : tensor<5x6xf32>
}
// -----
// CHECK-LABEL: func @pad_and_transfer_write_dynamic_static
// CHECK-SAME: %[[ARG0:.*]]: tensor<?x?xf32>, %[[ARG1:.*]]: vector<7x9xf32>, %[[SIZE:.*]]: index, %[[PADDING:.*]]: index
// CHECK-NOT: linalg.pad_tensor
// CHECK: %[[C0:.*]] = constant 0 : index
// CHECK: %[[SUB:.*]] = subtensor %[[ARG0]][0, 0] [%[[SIZE]], 6] [1, 1] : tensor<?x?xf32> to tensor<?x6xf32>
// CHECK: %[[RESULT:.*]] = vector.transfer_write %[[ARG1]], %[[SUB]][%[[C0]], %[[C0]]] : vector<7x9xf32>, tensor<?x6xf32>
// CHECK: return %[[RESULT]]
func @pad_and_transfer_write_dynamic_static(
%arg0: tensor<?x?xf32>, %arg1: vector<7x9xf32>, %size: index, %padding: index) -> tensor<?x6xf32> {
%c0 = constant 0 : index
%c5 = constant 5.0 : f32
%s = subtensor %arg0[0, 0] [%size, 6] [1, 1]
: tensor<?x?xf32> to tensor<?x6xf32>
%0 = linalg.pad_tensor %s low[0, 0] high[%padding, 7] {
^bb0(%arg2: index, %arg3: index):
linalg.yield %c5 : f32
} : tensor<?x6xf32> to tensor<?x13xf32>
%1 = vector.transfer_write %arg1, %0[%c0, %c0]
: vector<7x9xf32>, tensor<?x13xf32>
%2 = subtensor %1[0, 0] [%size, 6] [1, 1] : tensor<?x13xf32> to tensor<?x6xf32>
return %2 : tensor<?x6xf32>
}
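For contrast, a hypothetical negative example (not among the commit's test changes; names are illustrative): here the low padding is nonzero and the trimming subtensor has nonzero offsets, so the TransferWrite pattern bails out and only the generic pad_tensor vectorization can apply:
```
func @negative_low_pad_and_transfer_write(
    %arg0: tensor<5x6xf32>, %arg1: vector<7x9xf32>) -> tensor<5x6xf32> {
  %c0 = constant 0 : index
  %c5 = constant 5.0 : f32
  // Nonzero low padding: hasZeroLowPad() fails, so the rewrite above does
  // not fire. The nonzero offsets of the trimming subtensor would likewise
  // be rejected by hasZeroOffset().
  %0 = linalg.pad_tensor %arg0 low[2, 3] high[3, 4] {
  ^bb0(%arg2: index, %arg3: index):
    linalg.yield %c5 : f32
  } : tensor<5x6xf32> to tensor<10x13xf32>
  %1 = vector.transfer_write %arg1, %0[%c0, %c0]
      : vector<7x9xf32>, tensor<10x13xf32>
  %2 = subtensor %1[2, 3] [5, 6] [1, 1] : tensor<10x13xf32> to tensor<5x6xf32>
  return %2 : tensor<5x6xf32>
}
```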
// -----
// CHECK-LABEL: func @pad_and_subtensor_insert
// CHECK-SAME: %[[ARG0:.*]]: tensor<5x6xf32>, %[[ARG1:.*]]: tensor<12x13xf32>
// CHECK-NOT: linalg.pad_tensor