[mlir] Support masked N-D vector transfer ops in ProgressiveVectorToSCF.

Mask vectors are handled similarly to data vectors in N-D TransferWriteOp: they are copied into a temporary memory buffer, which can be indexed into with non-constant values.

Differential Revision: https://reviews.llvm.org/D101136
This commit is contained in:
Matthias Springer 2021-04-23 18:11:07 +09:00
parent c623945d70
commit 64f7fb5dfc
2 changed files with 132 additions and 46 deletions

View File

@ -56,16 +56,34 @@ static MemRefType unpackOneDim(MemRefType type) {
vectorType.getElementType()));
}
// TODO: Parallelism and threadlocal considerations.
static Value setAllocAtFunctionEntry(MemRefType type, Operation *op) {
/// Helper data structure for data and mask buffers.
/// Returned by `allocBuffers`, which fills in one temporary buffer per
/// transfer op operand that needs one.
struct BufferAllocs {
/// Buffer for the transferred vector; always set by `allocBuffers`.
Value dataBuffer;
/// Buffer for the mask vector; only set by `allocBuffers` when the transfer
/// op has a mask, otherwise left default-constructed.
Value maskBuffer;
};
/// Allocate temporary buffers for data (vector) and mask (if present).
///
/// The allocas are created at the start of the first block of region 0 of the
/// closest parent op that has the AutomaticAllocationScope trait. Both buffers
/// are memrefs with an empty shape whose element type is the respective
/// vector type of `xferOp`.
///
/// If `xferOp` has a mask, the mask is also stored into the mask buffer. That
/// store is emitted directly in front of `xferOp` and not next to the alloca:
/// the mask is an SSA value that is in general defined somewhere between the
/// scope entry and `xferOp`, so a store at the scope entry would use the mask
/// before its definition.
///
/// The insertion point of the builder is restored when this function returns.
///
/// TODO: Parallelism and threadlocal considerations.
template <typename OpTy>
static BufferAllocs allocBuffers(OpTy xferOp) {
  auto &b = ScopedContext::getBuilderRef();
  OpBuilder::InsertionGuard guard(b);
  Operation *scope =
      xferOp->template getParentWithTrait<OpTrait::AutomaticAllocationScope>();
  assert(scope && "Expected op to be inside automatic allocation scope");
  b.setInsertionPointToStart(&scope->getRegion(0).front());

  BufferAllocs result;
  auto bufferType = MemRefType::get({}, xferOp.getVectorType());
  result.dataBuffer = memref_alloca(bufferType).value;

  if (xferOp.mask()) {
    auto maskType = MemRefType::get({}, xferOp.mask().getType());
    result.maskBuffer = memref_alloca(maskType).value;
    // The alloca stays at the scope entry, but the store must come after the
    // definition of the mask. Right before the transfer op is always safe.
    b.setInsertionPoint(xferOp);
    memref_store(xferOp.mask(), result.maskBuffer);
  }

  return result;
}
/// Given a vector transfer op, calculate which dimension of the `source`
@ -238,6 +256,16 @@ static ArrayAttr dropFirstElem(OpBuilder &builder, ArrayAttr attr) {
return ArrayAttr::get(builder.getContext(), attr.getValue().drop_front());
}
/// Return the memref that the mask of `xferOp` is loaded from.
///
/// The transfer op must have a mask, and that mask must be the result of a
/// memref::LoadOp; both conditions are asserted. The memref operand of that
/// load is returned. This is the mask counterpart of
/// Strategy<TransferWriteOp>::getBuffer.
template <typename OpTy>
static Value getMaskBuffer(OpTy xferOp) {
  auto mask = xferOp.mask();
  assert(mask && "Expected that transfer op has mask");

  auto maskLoad = mask.template getDefiningOp<memref::LoadOp>();
  assert(maskLoad && "Expected transfer op mask produced by LoadOp");

  return maskLoad.getMemRef();
}
/// Codegen strategy, depending on the operation.
template <typename OpTy>
struct Strategy;
@ -266,9 +294,9 @@ struct Strategy<TransferReadOp> {
return getStoreOp(xferOp).getMemRef();
}
/// Retrieve the indices of the current StoreOp.
static void getStoreIndices(TransferReadOp xferOp,
SmallVector<Value, 8> &indices) {
/// Retrieve the indices of the current StoreOp that stores into the buffer.
static void getBufferIndices(TransferReadOp xferOp,
SmallVector<Value, 8> &indices) {
auto storeOp = getStoreOp(xferOp);
auto prevIndices = memref::StoreOpAdaptor(storeOp).indices();
indices.append(prevIndices.begin(), prevIndices.end());
@ -300,10 +328,11 @@ struct Strategy<TransferReadOp> {
///
/// Note: The loop and type cast are generated in TransferOpConversion.
/// The original TransferReadOp and store op are deleted in `cleanup`.
static void rewriteOp(OpBuilder &builder, TransferReadOp xferOp,
Value buffer, Value iv) {
/// Note: The `mask` operand is set in TransferOpConversion.
static TransferReadOp rewriteOp(OpBuilder &builder, TransferReadOp xferOp,
Value buffer, Value iv) {
SmallVector<Value, 8> storeIndices;
getStoreIndices(xferOp, storeIndices);
getBufferIndices(xferOp, storeIndices);
storeIndices.push_back(iv);
SmallVector<Value, 8> xferIndices;
@ -321,6 +350,7 @@ struct Strategy<TransferReadOp> {
newXfer.getDefiningOp()->setAttr(kPassLabel, builder.getUnitAttr());
memref_store(newXfer, buffer, storeIndices);
return newXfer.getDefiningOp<TransferReadOp>();
}
/// Handle out-of-bounds accesses on the to-be-unpacked dimension: Write
@ -329,7 +359,7 @@ struct Strategy<TransferReadOp> {
OpBuilder &/*builder*/, TransferReadOp xferOp, Value buffer,
Value iv) {
SmallVector<Value, 8> storeIndices;
getStoreIndices(xferOp, storeIndices);
getBufferIndices(xferOp, storeIndices);
storeIndices.push_back(iv);
auto bufferType = buffer.getType().dyn_cast<ShapedType>();
@ -361,9 +391,9 @@ struct Strategy<TransferWriteOp> {
return loadOp.getMemRef();
}
/// Retrieve the indices of the current LoadOp.
static void getLoadIndices(TransferWriteOp xferOp,
SmallVector<Value, 8> &indices) {
/// Retrieve the indices of the current LoadOp that loads from the buffer.
static void getBufferIndices(TransferWriteOp xferOp,
SmallVector<Value, 8> &indices) {
auto loadOp = xferOp.vector().getDefiningOp<memref::LoadOp>();
auto prevIndices = memref::LoadOpAdaptor(loadOp).indices();
indices.append(prevIndices.begin(), prevIndices.end());
@ -378,10 +408,10 @@ struct Strategy<TransferWriteOp> {
/// to memory.
///
/// Note: For more details, see comments on Strategy<TransferReadOp>.
static void rewriteOp(OpBuilder &builder, TransferWriteOp xferOp,
Value buffer, Value iv) {
static TransferWriteOp rewriteOp(OpBuilder &builder, TransferWriteOp xferOp,
Value buffer, Value iv) {
SmallVector<Value, 8> loadIndices;
getLoadIndices(xferOp, loadIndices);
getBufferIndices(xferOp, loadIndices);
loadIndices.push_back(iv);
SmallVector<Value, 8> xferIndices;
@ -397,6 +427,8 @@ struct Strategy<TransferWriteOp> {
if (vecType.getRank() > kTargetRank)
newXfer.op->setAttr(kPassLabel, builder.getUnitAttr());
return newXfer;
}
/// Handle out-of-bounds accesses on the to-be-unpacked dimension.
@ -416,8 +448,6 @@ LogicalResult checkPrepareXferOp(OpTy xferOp) {
return failure();
if (xferOp.getVectorType().getRank() <= kTargetRank)
return failure();
if (xferOp.mask())
return failure();
return success();
}
@ -442,6 +472,8 @@ LogicalResult checkPrepareXferOp(OpTy xferOp) {
/// memref.store %1, %0[] : memref<vector<5x4xf32>>
/// %vec = memref.load %0[] : memref<vector<5x4xf32>>
/// ```
///
/// Note: A second temporary buffer may be allocated for the `mask` operand.
struct PrepareTransferReadConversion
: public OpRewritePattern<TransferReadOp> {
using OpRewritePattern<TransferReadOp>::OpRewritePattern;
@ -452,12 +484,16 @@ struct PrepareTransferReadConversion
return failure();
ScopedContext scope(rewriter, xferOp.getLoc());
auto allocType = MemRefType::get({}, xferOp.getVectorType());
auto buffer = setAllocAtFunctionEntry(allocType, xferOp);
auto buffers = allocBuffers(xferOp);
auto *newXfer = rewriter.clone(*xferOp.getOperation());
newXfer->setAttr(kPassLabel, rewriter.getUnitAttr());
memref_store(newXfer->getResult(0), buffer);
rewriter.replaceOpWithNewOp<memref::LoadOp>(xferOp, buffer);
// If the original op is masked, make the cloned transfer op read its mask
// from the temporary mask buffer instead of using the original mask value.
// NOTE(review): the dyn_cast<> result is dereferenced without a null check.
// `newXfer` is a clone of a TransferReadOp, so cast<> (which asserts on a
// type mismatch) would state that expectation directly.
if (xferOp.mask()) {
auto loadedMask = memref_load(buffers.maskBuffer);
dyn_cast<TransferReadOp>(newXfer).maskMutable().assign(loadedMask);
}
memref_store(newXfer->getResult(0), buffers.dataBuffer);
rewriter.replaceOpWithNewOp<memref::LoadOp>(xferOp, buffers.dataBuffer);
return success();
}
@ -484,6 +520,8 @@ struct PrepareTransferReadConversion
/// vector.transfer_write %1, %A[%a, %b, %c] { __vector_to_scf_lowering__ }
/// : vector<5x4xf32>, memref<?x?x?xf32>
/// ```
///
/// Note: A second temporary buffer may be allocated for the `mask` operand.
struct PrepareTransferWriteConversion
: public OpRewritePattern<TransferWriteOp> {
using OpRewritePattern<TransferWriteOp>::OpRewritePattern;
@ -494,16 +532,20 @@ struct PrepareTransferWriteConversion
return failure();
ScopedContext scope(rewriter, xferOp.getLoc());
auto allocType = MemRefType::get({}, xferOp.getVectorType());
auto buffer = setAllocAtFunctionEntry(allocType, xferOp);
memref_store(xferOp.vector(), buffer);
auto loadedVec = memref_load(buffer);
auto buffers = allocBuffers(xferOp);
memref_store(xferOp.vector(), buffers.dataBuffer);
auto loadedVec = memref_load(buffers.dataBuffer);
rewriter.updateRootInPlace(xferOp, [&]() {
xferOp.vectorMutable().assign(loadedVec);
xferOp->setAttr(kPassLabel, rewriter.getUnitAttr());
});
if (xferOp.mask()) {
auto loadedMask = memref_load(buffers.maskBuffer);
rewriter.updateRootInPlace(
xferOp, [&]() { xferOp.maskMutable().assign(loadedMask); });
}
return success();
}
};
@ -535,16 +577,28 @@ struct TransferOpConversion : public OpRewritePattern<OpTy> {
return failure();
ScopedContext scope(rewriter, xferOp.getLoc());
// How the buffer can be found depends on OpTy.
auto buffer = Strategy<OpTy>::getBuffer(xferOp);
auto bufferType = buffer.getType().template dyn_cast<MemRefType>();
auto castedType = unpackOneDim(bufferType);
auto casted = vector_type_cast(castedType, buffer);
// Find and cast data buffer. How the buffer can be found depends on OpTy.
auto dataBuffer = Strategy<OpTy>::getBuffer(xferOp);
auto dataBufferType = dataBuffer.getType().template dyn_cast<MemRefType>();
auto castedDataType = unpackOneDim(dataBufferType);
auto castedDataBuffer = vector_type_cast(castedDataType, dataBuffer);
// If the xferOp has a mask: Find and cast mask buffer.
Value castedMaskBuffer;
if (xferOp.mask()) {
auto maskBuffer = getMaskBuffer(xferOp);
auto maskBufferType =
maskBuffer.getType().template dyn_cast<MemRefType>();
auto castedMaskType = unpackOneDim(maskBufferType);
castedMaskBuffer = vector_type_cast(castedMaskType, maskBuffer);
}
// Loop bounds and step.
auto lb = std_constant_index(0).value;
auto ub = std_constant_index(
castedType.getDimSize(castedType.getRank() - 1)).value;
castedDataType.getDimSize(castedDataType.getRank() - 1))
.value;
auto step = std_constant_index(1).value;
// Generate for loop.
@ -555,11 +609,31 @@ struct TransferOpConversion : public OpRewritePattern<OpTy> {
ScopedContext scope(b, loc);
generateInBoundsCheck(
xferOp, iv, b, unpackedDim(xferOp),
/*inBoundsCase=*/[&](OpBuilder &b, Location /*loc*/) {
Strategy<OpTy>::rewriteOp(b, xferOp, casted, iv);
}, /*outOfBoundsCase=*/[&](OpBuilder &b, Location /*loc*/) {
Strategy<OpTy>::handleOutOfBoundsDim(b, xferOp, casted, iv);
});
/*inBoundsCase=*/
[&](OpBuilder &b, Location /*loc*/) {
// Create new transfer op.
OpTy newXfer =
Strategy<OpTy>::rewriteOp(b, xferOp, castedDataBuffer, iv);
// If old transfer op has a mask: Set mask on new transfer op.
if (xferOp.mask()) {
OpBuilder::InsertionGuard guard(b);
b.setInsertionPoint(newXfer); // Insert load before newXfer.
SmallVector<Value, 8> loadIndices;
Strategy<OpTy>::getBufferIndices(xferOp, loadIndices);
loadIndices.push_back(iv);
auto mask = memref_load(castedMaskBuffer, loadIndices);
rewriter.updateRootInPlace(
newXfer, [&]() { newXfer.maskMutable().assign(mask); });
}
},
/*outOfBoundsCase=*/
[&](OpBuilder &b, Location /*loc*/) {
Strategy<OpTy>::handleOutOfBoundsDim(b, xferOp, castedDataBuffer,
iv);
});
b.create<scf::YieldOp>(loc);
});

View File

@ -1,8 +1,3 @@
// RUN: mlir-opt %s -convert-vector-to-scf -lower-affine -convert-scf-to-std -convert-vector-to-llvm -convert-std-to-llvm | \
// RUN: mlir-cpu-runner -e entry -entry-point-result=void \
// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \
// RUN: FileCheck %s
// RUN: mlir-opt %s -test-progressive-convert-vector-to-scf -lower-affine -convert-scf-to-std -convert-vector-to-llvm -convert-std-to-llvm | \
// RUN: mlir-cpu-runner -e entry -entry-point-result=void \
// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \
@ -17,6 +12,19 @@ func @transfer_read_2d(%A : memref<?x?xf32>, %base1: index, %base2: index) {
return
}
// Masked variant of @transfer_read_2d: reads a vector<4x9xf32> from %A at
// [%base1, %base2] with an identity permutation map, padding value -42.0 and
// a constant 4x9 i1 mask, then prints the resulting vector.
func @transfer_read_2d_mask(%A : memref<?x?xf32>, %base1: index, %base2: index) {
%fm42 = constant -42.0: f32
%mask = constant dense<[[1, 0, 1, 0, 1, 1, 1, 0, 1],
[0, 0, 1, 1, 1, 1, 1, 0, 1],
[1, 1, 1, 1, 1, 1, 1, 0, 1],
[0, 0, 1, 0, 1, 1, 1, 0, 1]]> : vector<4x9xi1>
%f = vector.transfer_read %A[%base1, %base2], %fm42, %mask
{permutation_map = affine_map<(d0, d1) -> (d0, d1)>} :
memref<?x?xf32>, vector<4x9xf32>
vector.print %f: vector<4x9xf32>
return
}
func @transfer_read_2d_transposed(
%A : memref<?x?xf32>, %base1: index, %base2: index) {
%fm42 = constant -42.0: f32
@ -80,7 +88,10 @@ func @entry() {
call @transfer_write_2d(%A, %c3, %c1) : (memref<?x?xf32>, index, index) -> ()
// Read shifted by 0 and pad with -42:
call @transfer_read_2d(%A, %c0, %c0) : (memref<?x?xf32>, index, index) -> ()
// Same as above, but transposed
// Same as above, but apply a mask
call @transfer_read_2d_mask(%A, %c0, %c0)
: (memref<?x?xf32>, index, index) -> ()
// Same as above, but without mask and transposed
call @transfer_read_2d_transposed(%A, %c0, %c0)
: (memref<?x?xf32>, index, index) -> ()
// Second vector dimension is a broadcast
@ -92,5 +103,6 @@ func @entry() {
// CHECK: ( ( 12, 13, -42, -42, -42, -42, -42, -42, -42 ), ( 22, 23, -42, -42, -42, -42, -42, -42, -42 ), ( -42, -42, -42, -42, -42, -42, -42, -42, -42 ), ( -42, -42, -42, -42, -42, -42, -42, -42, -42 ) )
// CHECK: ( ( 12, 22, -42, -42, -42, -42, -42, -42, -42 ), ( 13, 23, -42, -42, -42, -42, -42, -42, -42 ), ( -42, -42, -42, -42, -42, -42, -42, -42, -42 ), ( -42, -42, -42, -42, -42, -42, -42, -42, -42 ) )
// CHECK: ( ( 0, 1, 2, 3, -42, -42, -42, -42, -42 ), ( 10, 11, 12, 13, -42, -42, -42, -42, -42 ), ( 20, 21, 22, 23, -42, -42, -42, -42, -42 ), ( -42, -42, -42, -42, -42, -42, -42, -42, -42 ) )
// CHECK: ( ( 0, -42, 2, -42, -42, -42, -42, -42, -42 ), ( -42, -42, 12, 13, -42, -42, -42, -42, -42 ), ( 20, 21, 22, 23, -42, -42, -42, -42, -42 ), ( -42, -42, -42, -42, -42, -42, -42, -42, -42 ) )
// CHECK: ( ( 0, 10, 20, -42, -42, -42, -42, -42, -42 ), ( 1, 11, 21, -42, -42, -42, -42, -42, -42 ), ( 2, 12, 22, -42, -42, -42, -42, -42, -42 ), ( 3, 13, 23, -42, -42, -42, -42, -42, -42 ) )
// CHECK: ( ( 12, 12, 12, 12, 12, 12, 12, 12, 12 ), ( 13, 13, 13, 13, 13, 13, 13, 13, 13 ), ( -42, -42, -42, -42, -42, -42, -42, -42, -42 ), ( -42, -42, -42, -42, -42, -42, -42, -42, -42 ) )