[mlir][sparse] replace stack-based access pattern with dyn-alloc
Rationale: Allocating the temporary buffers for access pattern expansion on the stack (using alloca) is a bit too aggressive, since it easily runs out of stack space for large enveloping tensor dimensions. This revision switches these buffers to dynamic allocation, with explicit alloc/dealloc pairs.

Reviewed By: bixia, wrengr

Differential Revision: https://reviews.llvm.org/D123253
parent 303c180199
commit 0b55f94d2b
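In terms of generated IR, the change swaps the stack buffers that the sparse_tensor.expand lowering used to emit for heap buffers that must be freed explicitly. A minimal before/after sketch (value names such as %sz, %values, %filled, %indices are illustrative only, not taken from the actual lowering):

// Before: stack buffers, reclaimed implicitly when the function returns.
%values  = memref.alloca(%sz) : memref<?xf64>
%filled  = memref.alloca(%sz) : memref<?xi1>
%indices = memref.alloca(%sz) : memref<?xindex>

// After: heap buffers, which now require explicit memref.dealloc ops.
%values  = memref.alloc(%sz) : memref<?xf64>
%filled  = memref.alloc(%sz) : memref<?xi1>
%indices = memref.alloc(%sz) : memref<?xindex>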
@@ -160,6 +160,16 @@ static Value genAlloca(ConversionPatternRewriter &rewriter, Location loc,
   return rewriter.create<memref::AllocaOp>(loc, memTp, ValueRange{sz});
 }
 
+/// Generates an uninitialized buffer of the given size and type,
+/// but returns it as type `memref<? x $tp>` (rather than as type
+/// `memref<$sz x $tp>`). Unlike temporary buffers on the stack,
+/// this buffer must be explicitly deallocated by client.
+static Value genAlloc(ConversionPatternRewriter &rewriter, Location loc,
+                      Value sz, Type tp) {
+  auto memTp = MemRefType::get({ShapedType::kDynamicSize}, tp);
+  return rewriter.create<memref::AllocOp>(loc, memTp, ValueRange{sz});
+}
+
 /// Generates an uninitialized temporary buffer of the given size and
 /// type, but returns it as type `memref<? x $tp>` (rather than as type
 /// `memref<$sz x $tp>`).

@@ -761,15 +771,18 @@ public:
     auto enc = getSparseTensorEncoding(srcType);
     Value src = adaptor.getOperands()[0];
     Value sz = genDimSizeCall(rewriter, op, enc, src, srcType.getRank() - 1);
-    // Allocate temporary stack buffers for values, filled-switch, and indices.
-    Value values = genAlloca(rewriter, loc, sz, eltType);
-    Value filled = genAlloca(rewriter, loc, sz, boolType);
-    Value indices = genAlloca(rewriter, loc, sz, idxType);
+    // Allocate temporary buffers for values, filled-switch, and indices.
+    // We do not use stack buffers for this, since the expanded size may
+    // be rather large (as it envelops a single expanded dense dimension).
+    Value values = genAlloc(rewriter, loc, sz, eltType);
+    Value filled = genAlloc(rewriter, loc, sz, boolType);
+    Value indices = genAlloc(rewriter, loc, sz, idxType);
     Value zero = constantZero(rewriter, loc, idxType);
     // Reset the values/filled-switch to all-zero/false. Note that this
     // introduces an O(N) operation into the computation, but this reset
     // operation is amortized over the innermost loops for the access
-    // pattern expansion.
+    // pattern expansion. As noted in the operation doc, we would like
+    // to amortize this setup cost even between kernels.
     rewriter.create<linalg::FillOp>(
         loc, ValueRange{constantZero(rewriter, loc, eltType)},
         ValueRange{values});

@@ -789,6 +802,7 @@ public:
   LogicalResult
   matchAndRewrite(CompressOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
+    Location loc = op->getLoc();
     // Note that this method call resets the values/filled-switch back to
     // all-zero/false by only iterating over the set elements, so the
     // complexity remains proportional to the sparsity of the expanded

@@ -798,6 +812,18 @@ public:
     TypeRange noTp;
     replaceOpWithFuncCall(rewriter, op, name, noTp, adaptor.getOperands(),
                           EmitCInterface::On);
+    // Deallocate the buffers on exit of the loop nest.
+    Operation *parent = op;
+    for (; isa<scf::ForOp>(parent->getParentOp()) ||
+           isa<scf::WhileOp>(parent->getParentOp()) ||
+           isa<scf::ParallelOp>(parent->getParentOp()) ||
+           isa<scf::IfOp>(parent->getParentOp());
+         parent = parent->getParentOp())
+      ;
+    rewriter.setInsertionPointAfter(parent);
+    rewriter.create<memref::DeallocOp>(loc, adaptor.getOperands()[2]);
+    rewriter.create<memref::DeallocOp>(loc, adaptor.getOperands()[3]);
+    rewriter.create<memref::DeallocOp>(loc, adaptor.getOperands()[4]);
     return success();
   }
 };

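The walk above moves outward from the compress op past any enclosing scf.for, scf.while, scf.parallel, or scf.if, so the frees land right after the outermost such construct rather than inside the loop nest. Roughly, assuming an invented two-deep loop nest and illustrative value names:

scf.for %i = %c0 to %n step %c1 {
  scf.for %j = %c0 to %m step %c1 {
    // lowered sparse_tensor.compress, i.e. the call @expInsertF64(...)
  }
}
// deallocations are inserted here, after the outermost enclosing loop:
memref.dealloc %values : memref<?xf64>
memref.dealloc %filled : memref<?xi1>
memref.dealloc %indices : memref<?xindex>
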
@@ -461,24 +461,31 @@ func @sparse_insert(%arg0: tensor<128xf32, #SparseVector>,
 }
 
 // CHECK-LABEL: func @sparse_expansion()
-// %[[S:.*]] = call @sparseDimSize
-// %[[V:.*]] = memref.alloca(%[[S]]) : memref<?xf64>
-// %[[F:.*]] = memref.alloca(%[[S]]) : memref<?xi1>
-// %[[A:.*]] = memref.alloca(%[[S]]) : memref<?xindex>
-// linalg.fill ins(%{{.*}} : f64) outs(%[[V]] : memref<?xf64>)
-// linalg.fill ins(%{{.*}} : i1) outs(%[[F]] : memref<?xi1>)
-// CHECK: return
-func @sparse_expansion() {
+// CHECK: %[[S:.*]] = call @sparseDimSize
+// CHECK: %[[A:.*]] = memref.alloc(%[[S]]) : memref<?xf64>
+// CHECK: %[[B:.*]] = memref.alloc(%[[S]]) : memref<?xi1>
+// CHECK: %[[C:.*]] = memref.alloc(%[[S]]) : memref<?xindex>
+// CHECK-DAG: linalg.fill ins(%{{.*}} : f64) outs(%[[A]] : memref<?xf64>)
+// CHECK-DAG: linalg.fill ins(%{{.*}} : i1) outs(%[[B]] : memref<?xi1>)
+// CHECK: return %[[C]] : memref<?xindex>
+func @sparse_expansion() -> memref<?xindex> {
   %c = arith.constant 8 : index
   %0 = sparse_tensor.init [%c, %c] : tensor<8x8xf64, #SparseMatrix>
   %values, %filled, %added, %count = sparse_tensor.expand %0
     : tensor<8x8xf64, #SparseMatrix> to memref<?xf64>, memref<?xi1>, memref<?xindex>, index
-  return
+  return %added : memref<?xindex>
 }
 
 // CHECK-LABEL: func @sparse_compression(
-// CHECK-SAME: %[[A:.*]]: !llvm.ptr<i8>,
+// CHECK-SAME: %[[A:.*0]]: !llvm.ptr<i8>,
+// CHECK-SAME: %[[B:.*1]]: memref<?xindex>,
+// CHECK-SAME: %[[C:.*2]]: memref<?xf64>,
+// CHECK-SAME: %[[D:.*3]]: memref<?xi1>,
+// CHECK-SAME: %[[E:.*4]]: memref<?xindex>,
 // CHECK: call @expInsertF64(%[[A]],
+// CHECK-DAG: memref.dealloc %[[C]] : memref<?xf64>
+// CHECK-DAG: memref.dealloc %[[D]] : memref<?xi1>
+// CHECK-DAG: memref.dealloc %[[E]] : memref<?xindex>
 // CHECK: return
 func @sparse_compression(%arg0: tensor<8x8xf64, #SparseMatrix>,
     %arg1: memref<?xindex>, %arg2: memref<?xf64>, %arg3: memref<?xi1>,

@@ -0,0 +1,61 @@
+// RUN: mlir-opt %s -sparsification | \
+// RUN: FileCheck %s --check-prefix=CHECK-SPARSE
+// RUN: mlir-opt %s -sparsification -sparse-tensor-conversion | \
+// RUN: FileCheck %s --check-prefix=CHECK-CONVERT
+
+#DCSC = #sparse_tensor.encoding<{
+  dimLevelType = [ "compressed", "compressed" ],
+  dimOrdering = affine_map<(i,j) -> (j,i)>
+}>
+
+#SV = #sparse_tensor.encoding<{
+  dimLevelType = [ "compressed" ]
+}>
+
+#rowsum = {
+  indexing_maps = [
+    affine_map<(i,j) -> (i,j)>, // A
+    affine_map<(i,j) -> (i)>    // x (out)
+  ],
+  iterator_types = ["parallel", "reduction"],
+  doc = "X(i) = SUM A(i,j)"
+}
+
+//
+// CHECK-SPARSE-LABEL: func @kernel(
+// CHECK-SPARSE: %[[A:.*]], %[[B:.*]], %[[C:.*]], %{{.*}} = sparse_tensor.expand
+// CHECK-SPARSE: scf.for
+// CHECK-SPARSE: scf.for
+// CHECK-SPARSE: sparse_tensor.compress %{{.*}}, %{{.*}}, %[[A]], %[[B]], %[[C]]
+// CHECK-SPARSE: %[[RET:.*]] = sparse_tensor.load %{{.*}} hasInserts
+// CHECK-SPARSE: return %[[RET]]
+//
+// CHECK-CONVERT-LABEL: func @kernel(
+// CHECK-CONVERT: %{{.*}} = call @sparseDimSize
+// CHECK-CONVERT: %[[S:.*]] = call @sparseDimSize
+// CHECK-CONVERT: %[[A:.*]] = memref.alloc(%[[S]]) : memref<?xf64>
+// CHECK-CONVERT: %[[B:.*]] = memref.alloc(%[[S]]) : memref<?xi1>
+// CHECK-CONVERT: %[[C:.*]] = memref.alloc(%[[S]]) : memref<?xindex>
+// CHECK-CONVERT: linalg.fill ins(%{{.*}} : f64) outs(%[[A]] : memref<?xf64>)
+// CHECK-CONVERT: linalg.fill ins(%{{.*}} : i1) outs(%[[B]] : memref<?xi1>)
+// CHECK-CONVERT: scf.for
+// CHECK-CONVERT: scf.for
+// CHECK-CONVERT: call @expInsertF64
+// CHECK-CONVERT: memref.dealloc %[[A]] : memref<?xf64>
+// CHECK-CONVERT: memref.dealloc %[[B]] : memref<?xi1>
+// CHECK-CONVERT: memref.dealloc %[[C]] : memref<?xindex>
+// CHECK-CONVERT: call @endInsert
+//
+func @kernel(%arga: tensor<?x?xf64, #DCSC>) -> tensor<?xf64, #SV> {
+  %c0 = arith.constant 0 : index
+  %n = tensor.dim %arga, %c0 : tensor<?x?xf64, #DCSC>
+  %v = sparse_tensor.init [%n] : tensor<?xf64, #SV>
+  %0 = linalg.generic #rowsum
+    ins(%arga: tensor<?x?xf64, #DCSC>)
+    outs(%v: tensor<?xf64, #SV>) {
+    ^bb(%a: f64, %x: f64):
+      %1 = arith.addf %x, %a : f64
+      linalg.yield %1 : f64
+  } -> tensor<?xf64, #SV>
+  return %0 : tensor<?xf64, #SV>
+}