[mlir][sparse] replace stack-based access pattern with dyn-alloc

Rationale:
Allocating the temporary buffers for access pattern expansion on the stack
(using alloca) is a bit too aggressive, since it easily runs out of stack space
for large enveloping tensor dimensions. This revision switches to dynamic
allocation of these buffers, using explicit alloc/dealloc pairs.
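
For a sense of scale: with an enveloping dense dimension of a million entries,
the three expansion buffers (f64 values, i1 filled-switch, index positions)
already amount to roughly 17 MB on a 64-bit target, well beyond a typical 8 MB
default stack. Schematically, the generated IR changes along the following
lines (a hand-written sketch with placeholder SSA names, not the literal
output of the pass):

  // Before: stack allocation, implicitly reclaimed on function exit.
  %buf = memref.alloca(%sz) : memref<?xf64>

  // After: heap allocation, explicitly released once the enclosing
  // loop nest is done with the expanded access pattern.
  %buf = memref.alloc(%sz) : memref<?xf64>
  scf.for %i = %lo to %hi step %c1 {
    // expand / insert / compress per iteration
  }
  memref.dealloc %buf : memref<?xf64>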

Reviewed By: bixia, wrengr

Differential Revision: https://reviews.llvm.org/D123253
Aart Bik 2022-04-06 13:22:08 -07:00
parent 303c180199
commit 0b55f94d2b
3 changed files with 109 additions and 15 deletions


@@ -160,6 +160,16 @@ static Value genAlloca(ConversionPatternRewriter &rewriter, Location loc,
   return rewriter.create<memref::AllocaOp>(loc, memTp, ValueRange{sz});
 }
 
+/// Generates an uninitialized buffer of the given size and type,
+/// but returns it as type `memref<? x $tp>` (rather than as type
+/// `memref<$sz x $tp>`). Unlike temporary buffers on the stack,
+/// this buffer must be explicitly deallocated by client.
+static Value genAlloc(ConversionPatternRewriter &rewriter, Location loc,
+                      Value sz, Type tp) {
+  auto memTp = MemRefType::get({ShapedType::kDynamicSize}, tp);
+  return rewriter.create<memref::AllocOp>(loc, memTp, ValueRange{sz});
+}
+
 /// Generates an uninitialized temporary buffer of the given size and
 /// type, but returns it as type `memref<? x $tp>` (rather than as type
 /// `memref<$sz x $tp>`).
@@ -761,15 +771,18 @@ public:
     auto enc = getSparseTensorEncoding(srcType);
     Value src = adaptor.getOperands()[0];
     Value sz = genDimSizeCall(rewriter, op, enc, src, srcType.getRank() - 1);
-    // Allocate temporary stack buffers for values, filled-switch, and indices.
-    Value values = genAlloca(rewriter, loc, sz, eltType);
-    Value filled = genAlloca(rewriter, loc, sz, boolType);
-    Value indices = genAlloca(rewriter, loc, sz, idxType);
+    // Allocate temporary buffers for values, filled-switch, and indices.
+    // We do not use stack buffers for this, since the expanded size may
+    // be rather large (as it envelops a single expanded dense dimension).
+    Value values = genAlloc(rewriter, loc, sz, eltType);
+    Value filled = genAlloc(rewriter, loc, sz, boolType);
+    Value indices = genAlloc(rewriter, loc, sz, idxType);
     Value zero = constantZero(rewriter, loc, idxType);
     // Reset the values/filled-switch to all-zero/false. Note that this
     // introduces an O(N) operation into the computation, but this reset
     // operation is amortized over the innermost loops for the access
-    // pattern expansion.
+    // pattern expansion. As noted in the operation doc, we would like
+    // to amortize this setup cost even between kernels.
     rewriter.create<linalg::FillOp>(
         loc, ValueRange{constantZero(rewriter, loc, eltType)},
         ValueRange{values});
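
The reset in the hunk above is emitted once, before the loops that perform the
insertions, so its O(N) cost is amortized over all iterations of the innermost
loops; the resulting IR has roughly this shape (a simplified sketch with
placeholder names, mirroring the CHECK lines in the tests further below):

  linalg.fill ins(%f0 : f64) outs(%values : memref<?xf64>)
  linalg.fill ins(%false : i1) outs(%filled : memref<?xi1>)
  scf.for %j = %lo to %hi step %c1 {
    // each iteration touches only the few entries it actually inserts,
    // and compress resets exactly those entries back to zero/false
  }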
@@ -789,6 +802,7 @@ public:
   LogicalResult
   matchAndRewrite(CompressOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
+    Location loc = op->getLoc();
     // Note that this method call resets the values/filled-switch back to
     // all-zero/false by only iterating over the set elements, so the
     // complexity remains proportional to the sparsity of the expanded
@@ -798,6 +812,18 @@ public:
     TypeRange noTp;
     replaceOpWithFuncCall(rewriter, op, name, noTp, adaptor.getOperands(),
                           EmitCInterface::On);
+    // Deallocate the buffers on exit of the loop nest.
+    Operation *parent = op;
+    for (; isa<scf::ForOp>(parent->getParentOp()) ||
+           isa<scf::WhileOp>(parent->getParentOp()) ||
+           isa<scf::ParallelOp>(parent->getParentOp()) ||
+           isa<scf::IfOp>(parent->getParentOp());
+         parent = parent->getParentOp())
+      ;
+    rewriter.setInsertionPointAfter(parent);
+    rewriter.create<memref::DeallocOp>(loc, adaptor.getOperands()[2]);
+    rewriter.create<memref::DeallocOp>(loc, adaptor.getOperands()[3]);
+    rewriter.create<memref::DeallocOp>(loc, adaptor.getOperands()[4]);
     return success();
   }
 };
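
The effect of walking up the parent chain in the hunk above is that the
deallocations are placed after the outermost enclosing scf construct rather
than next to the lowered compress call itself; the resulting placement looks
roughly like this (hand-written sketch with placeholder names, not the literal
pass output):

  scf.for %i = %lo to %hi step %c1 {
    scf.for %j = %lo to %hi step %c1 {
      // insert into %values / %filled / %added
    }
    // lowered sparse_tensor.compress: call @expInsertF64(...)
  }
  memref.dealloc %values : memref<?xf64>
  memref.dealloc %filled : memref<?xi1>
  memref.dealloc %added : memref<?xindex>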


@@ -461,24 +461,31 @@ func @sparse_insert(%arg0: tensor<128xf32, #SparseVector>,
 }
 
 // CHECK-LABEL: func @sparse_expansion()
-// %[[S:.*]] = call @sparseDimSize
-// %[[V:.*]] = memref.alloca(%[[S]]) : memref<?xf64>
-// %[[F:.*]] = memref.alloca(%[[S]]) : memref<?xi1>
-// %[[A:.*]] = memref.alloca(%[[S]]) : memref<?xindex>
-// linalg.fill ins(%{{.*}} : f64) outs(%[[V]] : memref<?xf64>)
-// linalg.fill ins(%{{.*}} : i1) outs(%[[F]] : memref<?xi1>)
-// CHECK: return
-func @sparse_expansion() {
+// CHECK: %[[S:.*]] = call @sparseDimSize
+// CHECK: %[[A:.*]] = memref.alloc(%[[S]]) : memref<?xf64>
+// CHECK: %[[B:.*]] = memref.alloc(%[[S]]) : memref<?xi1>
+// CHECK: %[[C:.*]] = memref.alloc(%[[S]]) : memref<?xindex>
+// CHECK-DAG: linalg.fill ins(%{{.*}} : f64) outs(%[[A]] : memref<?xf64>)
+// CHECK-DAG: linalg.fill ins(%{{.*}} : i1) outs(%[[B]] : memref<?xi1>)
+// CHECK: return %[[C]] : memref<?xindex>
+func @sparse_expansion() -> memref<?xindex> {
   %c = arith.constant 8 : index
   %0 = sparse_tensor.init [%c, %c] : tensor<8x8xf64, #SparseMatrix>
   %values, %filled, %added, %count = sparse_tensor.expand %0
     : tensor<8x8xf64, #SparseMatrix> to memref<?xf64>, memref<?xi1>, memref<?xindex>, index
-  return
+  return %added : memref<?xindex>
 }
 
 // CHECK-LABEL: func @sparse_compression(
-// CHECK-SAME: %[[A:.*]]: !llvm.ptr<i8>,
+// CHECK-SAME: %[[A:.*0]]: !llvm.ptr<i8>,
+// CHECK-SAME: %[[B:.*1]]: memref<?xindex>,
+// CHECK-SAME: %[[C:.*2]]: memref<?xf64>,
+// CHECK-SAME: %[[D:.*3]]: memref<?xi1>,
+// CHECK-SAME: %[[E:.*4]]: memref<?xindex>,
 // CHECK: call @expInsertF64(%[[A]],
+// CHECK-DAG: memref.dealloc %[[C]] : memref<?xf64>
+// CHECK-DAG: memref.dealloc %[[D]] : memref<?xi1>
+// CHECK-DAG: memref.dealloc %[[E]] : memref<?xindex>
 // CHECK: return
 func @sparse_compression(%arg0: tensor<8x8xf64, #SparseMatrix>,
                          %arg1: memref<?xindex>, %arg2: memref<?xf64>, %arg3: memref<?xi1>,


@@ -0,0 +1,61 @@
// RUN: mlir-opt %s -sparsification | \
// RUN: FileCheck %s --check-prefix=CHECK-SPARSE
// RUN: mlir-opt %s -sparsification -sparse-tensor-conversion | \
// RUN: FileCheck %s --check-prefix=CHECK-CONVERT

#DCSC = #sparse_tensor.encoding<{
  dimLevelType = [ "compressed", "compressed" ],
  dimOrdering = affine_map<(i,j) -> (j,i)>
}>

#SV = #sparse_tensor.encoding<{
  dimLevelType = [ "compressed" ]
}>

#rowsum = {
  indexing_maps = [
    affine_map<(i,j) -> (i,j)>, // A
    affine_map<(i,j) -> (i)>    // x (out)
  ],
  iterator_types = ["parallel", "reduction"],
  doc = "X(i) = SUM A(i,j)"
}

//
// CHECK-SPARSE-LABEL: func @kernel(
// CHECK-SPARSE: %[[A:.*]], %[[B:.*]], %[[C:.*]], %{{.*}} = sparse_tensor.expand
// CHECK-SPARSE: scf.for
// CHECK-SPARSE: scf.for
// CHECK-SPARSE: sparse_tensor.compress %{{.*}}, %{{.*}}, %[[A]], %[[B]], %[[C]]
// CHECK-SPARSE: %[[RET:.*]] = sparse_tensor.load %{{.*}} hasInserts
// CHECK-SPARSE: return %[[RET]]
//
// CHECK-CONVERT-LABEL: func @kernel(
// CHECK-CONVERT: %{{.*}} = call @sparseDimSize
// CHECK-CONVERT: %[[S:.*]] = call @sparseDimSize
// CHECK-CONVERT: %[[A:.*]] = memref.alloc(%[[S]]) : memref<?xf64>
// CHECK-CONVERT: %[[B:.*]] = memref.alloc(%[[S]]) : memref<?xi1>
// CHECK-CONVERT: %[[C:.*]] = memref.alloc(%[[S]]) : memref<?xindex>
// CHECK-CONVERT: linalg.fill ins(%{{.*}} : f64) outs(%[[A]] : memref<?xf64>)
// CHECK-CONVERT: linalg.fill ins(%{{.*}} : i1) outs(%[[B]] : memref<?xi1>)
// CHECK-CONVERT: scf.for
// CHECK-CONVERT: scf.for
// CHECK-CONVERT: call @expInsertF64
// CHECK-CONVERT: memref.dealloc %[[A]] : memref<?xf64>
// CHECK-CONVERT: memref.dealloc %[[B]] : memref<?xi1>
// CHECK-CONVERT: memref.dealloc %[[C]] : memref<?xindex>
// CHECK-CONVERT: call @endInsert
//
func @kernel(%arga: tensor<?x?xf64, #DCSC>) -> tensor<?xf64, #SV> {
  %c0 = arith.constant 0 : index
  %n = tensor.dim %arga, %c0 : tensor<?x?xf64, #DCSC>
  %v = sparse_tensor.init [%n] : tensor<?xf64, #SV>
  %0 = linalg.generic #rowsum
    ins(%arga: tensor<?x?xf64, #DCSC>)
    outs(%v: tensor<?xf64, #SV>) {
    ^bb(%a: f64, %x: f64):
      %1 = arith.addf %x, %a : f64
      linalg.yield %1 : f64
  } -> tensor<?xf64, #SV>
  return %0 : tensor<?xf64, #SV>
}