[mlir][sparse] replace stack-based access pattern with dyn-alloc

Rationale:
Allocating the temporary buffers for access pattern expansion on the stack
(using alloca) is a bit too aggressive, since it easily runs out of stack space
for large enveloping tensor dimensions. This revision switches to dynamic
allocation of these buffers, using explicit alloc/dealloc pairs.
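
For a sense of scale: with an enveloping dense dimension of a million entries,
the three expansion buffers (f64 values, i1 filled-switch, index positions)
already amount to roughly 17 MB on a 64-bit target, well beyond a typical 8 MB
default stack. Schematically, the generated IR changes along the following
lines (a hand-written sketch with placeholder SSA names, not the literal
output of the pass):

  // Before: stack allocation, implicitly reclaimed on function exit.
  %buf = memref.alloca(%sz) : memref<?xf64>

  // After: heap allocation, explicitly released once the enclosing
  // loop nest is done with the expanded access pattern.
  %buf = memref.alloc(%sz) : memref<?xf64>
  scf.for %i = %lo to %hi step %c1 {
    // expand / insert / compress per iteration
  }
  memref.dealloc %buf : memref<?xf64>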

Reviewed By: bixia, wrengr

Differential Revision: https://reviews.llvm.org/D123253
Aart Bik 2022-04-06 13:22:08 -07:00
parent 303c180199
commit 0b55f94d2b
3 changed files with 109 additions and 15 deletions


@@ -160,6 +160,16 @@ static Value genAlloca(ConversionPatternRewriter &rewriter, Location loc,
   return rewriter.create<memref::AllocaOp>(loc, memTp, ValueRange{sz});
 }
 
+/// Generates an uninitialized buffer of the given size and type,
+/// but returns it as type `memref<? x $tp>` (rather than as type
+/// `memref<$sz x $tp>`). Unlike temporary buffers on the stack,
+/// this buffer must be explicitly deallocated by client.
+static Value genAlloc(ConversionPatternRewriter &rewriter, Location loc,
+                      Value sz, Type tp) {
+  auto memTp = MemRefType::get({ShapedType::kDynamicSize}, tp);
+  return rewriter.create<memref::AllocOp>(loc, memTp, ValueRange{sz});
+}
+
 /// Generates an uninitialized temporary buffer of the given size and
 /// type, but returns it as type `memref<? x $tp>` (rather than as type
 /// `memref<$sz x $tp>`).
@@ -761,15 +771,18 @@ public:
     auto enc = getSparseTensorEncoding(srcType);
     Value src = adaptor.getOperands()[0];
     Value sz = genDimSizeCall(rewriter, op, enc, src, srcType.getRank() - 1);
-    // Allocate temporary stack buffers for values, filled-switch, and indices.
-    Value values = genAlloca(rewriter, loc, sz, eltType);
-    Value filled = genAlloca(rewriter, loc, sz, boolType);
-    Value indices = genAlloca(rewriter, loc, sz, idxType);
+    // Allocate temporary buffers for values, filled-switch, and indices.
+    // We do not use stack buffers for this, since the expanded size may
+    // be rather large (as it envelops a single expanded dense dimension).
+    Value values = genAlloc(rewriter, loc, sz, eltType);
+    Value filled = genAlloc(rewriter, loc, sz, boolType);
+    Value indices = genAlloc(rewriter, loc, sz, idxType);
     Value zero = constantZero(rewriter, loc, idxType);
     // Reset the values/filled-switch to all-zero/false. Note that this
     // introduces an O(N) operation into the computation, but this reset
     // operation is amortized over the innermost loops for the access
-    // pattern expansion.
+    // pattern expansion. As noted in the operation doc, we would like
+    // to amortize this setup cost even between kernels.
     rewriter.create<linalg::FillOp>(
         loc, ValueRange{constantZero(rewriter, loc, eltType)},
         ValueRange{values});
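
The reset in the hunk above is emitted once, before the loops that perform the
insertions, so its O(N) cost is amortized over all iterations of the innermost
loops; the resulting IR has roughly this shape (a simplified sketch with
placeholder names, mirroring the CHECK lines in the tests further below):

  linalg.fill ins(%f0 : f64) outs(%values : memref<?xf64>)
  linalg.fill ins(%false : i1) outs(%filled : memref<?xi1>)
  scf.for %j = %lo to %hi step %c1 {
    // each iteration touches only the few entries it actually inserts,
    // and compress resets exactly those entries back to zero/false
  }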
@@ -789,6 +802,7 @@ public:
   LogicalResult
   matchAndRewrite(CompressOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
+    Location loc = op->getLoc();
     // Note that this method call resets the values/filled-switch back to
     // all-zero/false by only iterating over the set elements, so the
     // complexity remains proportional to the sparsity of the expanded
@@ -798,6 +812,18 @@ public:
     TypeRange noTp;
     replaceOpWithFuncCall(rewriter, op, name, noTp, adaptor.getOperands(),
                           EmitCInterface::On);
+    // Deallocate the buffers on exit of the loop nest.
+    Operation *parent = op;
+    for (; isa<scf::ForOp>(parent->getParentOp()) ||
+           isa<scf::WhileOp>(parent->getParentOp()) ||
+           isa<scf::ParallelOp>(parent->getParentOp()) ||
+           isa<scf::IfOp>(parent->getParentOp());
+         parent = parent->getParentOp())
+      ;
+    rewriter.setInsertionPointAfter(parent);
+    rewriter.create<memref::DeallocOp>(loc, adaptor.getOperands()[2]);
+    rewriter.create<memref::DeallocOp>(loc, adaptor.getOperands()[3]);
+    rewriter.create<memref::DeallocOp>(loc, adaptor.getOperands()[4]);
     return success();
   }
 };
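
The effect of walking up the parent chain in the hunk above is that the
deallocations are placed after the outermost enclosing scf construct rather
than next to the lowered compress call itself; the resulting placement looks
roughly like this (hand-written sketch with placeholder names, not the literal
pass output):

  scf.for %i = %lo to %hi step %c1 {
    scf.for %j = %lo to %hi step %c1 {
      // insert into %values / %filled / %added
    }
    // lowered sparse_tensor.compress: call @expInsertF64(...)
  }
  memref.dealloc %values : memref<?xf64>
  memref.dealloc %filled : memref<?xi1>
  memref.dealloc %added : memref<?xindex>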


@@ -461,24 +461,31 @@ func @sparse_insert(%arg0: tensor<128xf32, #SparseVector>,
 }
 
 // CHECK-LABEL: func @sparse_expansion()
-// %[[S:.*]] = call @sparseDimSize
-// %[[V:.*]] = memref.alloca(%[[S]]) : memref<?xf64>
-// %[[F:.*]] = memref.alloca(%[[S]]) : memref<?xi1>
-// %[[A:.*]] = memref.alloca(%[[S]]) : memref<?xindex>
-// linalg.fill ins(%{{.*}} : f64) outs(%[[V]] : memref<?xf64>)
-// linalg.fill ins(%{{.*}} : i1) outs(%[[F]] : memref<?xi1>)
-// CHECK: return
-func @sparse_expansion() {
+// CHECK: %[[S:.*]] = call @sparseDimSize
+// CHECK: %[[A:.*]] = memref.alloc(%[[S]]) : memref<?xf64>
+// CHECK: %[[B:.*]] = memref.alloc(%[[S]]) : memref<?xi1>
+// CHECK: %[[C:.*]] = memref.alloc(%[[S]]) : memref<?xindex>
+// CHECK-DAG: linalg.fill ins(%{{.*}} : f64) outs(%[[A]] : memref<?xf64>)
+// CHECK-DAG: linalg.fill ins(%{{.*}} : i1) outs(%[[B]] : memref<?xi1>)
+// CHECK: return %[[C]] : memref<?xindex>
+func @sparse_expansion() -> memref<?xindex> {
   %c = arith.constant 8 : index
   %0 = sparse_tensor.init [%c, %c] : tensor<8x8xf64, #SparseMatrix>
   %values, %filled, %added, %count = sparse_tensor.expand %0
     : tensor<8x8xf64, #SparseMatrix> to memref<?xf64>, memref<?xi1>, memref<?xindex>, index
-  return
+  return %added : memref<?xindex>
 }
 
 // CHECK-LABEL: func @sparse_compression(
-// CHECK-SAME: %[[A:.*]]: !llvm.ptr<i8>,
+// CHECK-SAME: %[[A:.*0]]: !llvm.ptr<i8>,
+// CHECK-SAME: %[[B:.*1]]: memref<?xindex>,
+// CHECK-SAME: %[[C:.*2]]: memref<?xf64>,
+// CHECK-SAME: %[[D:.*3]]: memref<?xi1>,
+// CHECK-SAME: %[[E:.*4]]: memref<?xindex>,
 // CHECK: call @expInsertF64(%[[A]],
+// CHECK-DAG: memref.dealloc %[[C]] : memref<?xf64>
+// CHECK-DAG: memref.dealloc %[[D]] : memref<?xi1>
+// CHECK-DAG: memref.dealloc %[[E]] : memref<?xindex>
 // CHECK: return
 func @sparse_compression(%arg0: tensor<8x8xf64, #SparseMatrix>,
                          %arg1: memref<?xindex>, %arg2: memref<?xf64>, %arg3: memref<?xi1>,


@@ -0,0 +1,61 @@
// RUN: mlir-opt %s -sparsification | \
// RUN: FileCheck %s --check-prefix=CHECK-SPARSE
// RUN: mlir-opt %s -sparsification -sparse-tensor-conversion | \
// RUN: FileCheck %s --check-prefix=CHECK-CONVERT

#DCSC = #sparse_tensor.encoding<{
  dimLevelType = [ "compressed", "compressed" ],
  dimOrdering = affine_map<(i,j) -> (j,i)>
}>

#SV = #sparse_tensor.encoding<{
  dimLevelType = [ "compressed" ]
}>

#rowsum = {
  indexing_maps = [
    affine_map<(i,j) -> (i,j)>, // A
    affine_map<(i,j) -> (i)>    // x (out)
  ],
  iterator_types = ["parallel", "reduction"],
  doc = "X(i) = SUM A(i,j)"
}

//
// CHECK-SPARSE-LABEL: func @kernel(
// CHECK-SPARSE: %[[A:.*]], %[[B:.*]], %[[C:.*]], %{{.*}} = sparse_tensor.expand
// CHECK-SPARSE: scf.for
// CHECK-SPARSE: scf.for
// CHECK-SPARSE: sparse_tensor.compress %{{.*}}, %{{.*}}, %[[A]], %[[B]], %[[C]]
// CHECK-SPARSE: %[[RET:.*]] = sparse_tensor.load %{{.*}} hasInserts
// CHECK-SPARSE: return %[[RET]]
//
// CHECK-CONVERT-LABEL: func @kernel(
// CHECK-CONVERT: %{{.*}} = call @sparseDimSize
// CHECK-CONVERT: %[[S:.*]] = call @sparseDimSize
// CHECK-CONVERT: %[[A:.*]] = memref.alloc(%[[S]]) : memref<?xf64>
// CHECK-CONVERT: %[[B:.*]] = memref.alloc(%[[S]]) : memref<?xi1>
// CHECK-CONVERT: %[[C:.*]] = memref.alloc(%[[S]]) : memref<?xindex>
// CHECK-CONVERT: linalg.fill ins(%{{.*}} : f64) outs(%[[A]] : memref<?xf64>)
// CHECK-CONVERT: linalg.fill ins(%{{.*}} : i1) outs(%[[B]] : memref<?xi1>)
// CHECK-CONVERT: scf.for
// CHECK-CONVERT: scf.for
// CHECK-CONVERT: call @expInsertF64
// CHECK-CONVERT: memref.dealloc %[[A]] : memref<?xf64>
// CHECK-CONVERT: memref.dealloc %[[B]] : memref<?xi1>
// CHECK-CONVERT: memref.dealloc %[[C]] : memref<?xindex>
// CHECK-CONVERT: call @endInsert
//
func @kernel(%arga: tensor<?x?xf64, #DCSC>) -> tensor<?xf64, #SV> {
  %c0 = arith.constant 0 : index
  %n = tensor.dim %arga, %c0 : tensor<?x?xf64, #DCSC>
  %v = sparse_tensor.init [%n] : tensor<?xf64, #SV>
  %0 = linalg.generic #rowsum
    ins(%arga: tensor<?x?xf64, #DCSC>)
    outs(%v: tensor<?xf64, #SV>) {
    ^bb(%a: f64, %x: f64):
      %1 = arith.addf %x, %a : f64
      linalg.yield %1 : f64
  } -> tensor<?xf64, #SV>
  return %0 : tensor<?xf64, #SV>
}