[mlir][sparse] sparse tensor storage implementation

This revision connects the generated sparse code with an actual
sparse storage scheme, which can be initialized from a test file.
Lacking a first-class citizen SparseTensor type (with buffer),
the storage is hidden behind an opaque pointer with some "glue"
to bring the pointer back to tensor land. Rather than generating
sparse setup code for each different annotated tensor (viz. the
"pack" methods in TACO), a single "one-size-fits-all" implementation
has been added to the runtime support library.  Many details and
abstractions need to be refined in the future, but this revision
allows full end-to-end integration testing and performance
benchmarking (with an annotated Linalg op on one end and a
JIT/AOT executable on the other).

Reviewed By: nicolasvasilache, bixia

Differential Revision: https://reviews.llvm.org/D95847
Aart Bik 2021-02-09 16:22:22 -08:00
parent 17db24a7a8
commit 0b1764a3d7
19 changed files with 2603 additions and 1652 deletions
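
To make the intended end-to-end flow concrete, here is a hypothetical hand-written C++ client (not part of this commit) that drives the new runtime support library directly, much as the JIT/AOT-compiled kernel eventually does. It only uses declarations that appear in this revision and assumes linking against the support library; as in the integration test below, both dimensions are annotated sparse and the filename comes from the TENSOR0 environment variable.

```c++
// Hypothetical client sketch; declarations copied from SparseUtils.cpp below.
#include <cstdint>
#include <cstdio>

extern "C" {
struct MemRef1DF64 {
  const double *base;
  const double *data;
  uint64_t off;
  uint64_t sizes[1];
  uint64_t strides[1];
};
char *getTensorFilename(uint64_t id);                      // reads ${TENSOR0}, ...
void *newSparseTensorC(char *filename, bool *annotations); // builds storage scheme
MemRef1DF64 sparseValsF64(void *tensor);                   // exposes values array
void delSparseTensor(void *tensor);                        // releases storage
}

int main() {
  bool annotations[2] = {true, true};  // <sparse, sparse> matrix
  void *a = newSparseTensorC(getTensorFilename(0), annotations);
  MemRef1DF64 vals = sparseValsF64(a);
  double x = 0.0;
  for (uint64_t i = 0; i < vals.sizes[0]; i++)
    x += vals.data[vals.off + i];      // x += A(i,j) over all stored values
  printf("%f\n", x);
  delSparseTensor(a);
  return 0;
}
```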


@@ -45,6 +45,12 @@ add_public_tablegen_target(MLIRLinalgStructuredOpsIncGen)
add_dependencies(MLIRLinalgStructuredOpsIncGen LinalgOdsGen)
add_dependencies(mlir-headers MLIRLinalgStructuredOpsIncGen)
set(LLVM_TARGET_DEFINITIONS LinalgSparseOps.td)
mlir_tablegen(LinalgSparseOps.h.inc -gen-op-decls)
mlir_tablegen(LinalgSparseOps.cpp.inc -gen-op-defs)
add_public_tablegen_target(MLIRLinalgSparseOpsIncGen)
add_dependencies(mlir-headers MLIRLinalgSparseOpsIncGen)
set(LLVM_TARGET_DEFINITIONS LinalgInterfaces.td)
mlir_tablegen(LinalgInterfaces.h.inc -gen-op-interface-decls)
mlir_tablegen(LinalgInterfaces.cpp.inc -gen-op-interface-defs)


@@ -126,4 +126,7 @@ class IndexedGenericOp;
#define GET_OP_CLASSES
#include "mlir/Dialect/Linalg/IR/LinalgStructuredOps.h.inc"
#define GET_OP_CLASSES
#include "mlir/Dialect/Linalg/IR/LinalgSparseOps.h.inc"
#endif // MLIR_DIALECT_LINALG_LINALGOPS_H_


@@ -0,0 +1,138 @@
//===- LinalgSparseOps.td - Linalg dialect sparse ops ------*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// The following operations bootstrap working with sparse tensors solely
// within the Linalg dialect. They provide temporary bridges between a
// future SparseTensorType (now an opaque pointer), the actual TensorType,
// and MemRef arrays underlying an actual sparse storage scheme in memory.
//
// Lacking a proper sparse tensor type, the 'sparse_tensor' operation
// provides a bridge between an opaque pointer and a regular tensor type
// just to simplify feeding the value into a Linalg op. The operation
// simply disappears during lowering.
//
// The other operations form the bridge between the opaque pointer and
// the actual storage of pointers, indices, and values. These operations
// resemble 'tensor_to_memref' in the sense that they map tensors to
// their bufferized memrefs, but they lower into actual calls since
// sparse storage does not bufferize into a single memref, as dense
// tensors do, but into a hierarchical storage scheme where pointers
// access memrefs with indices and eventually into values.
//
// TODO: introduce SparseTensorType as first class citizen in MLIR
//
//===----------------------------------------------------------------------===//
#ifndef LINALG_SPARSE_OPS
#define LINALG_SPARSE_OPS
include "mlir/Dialect/Linalg/IR/LinalgBase.td"
// Base class.
class Linalg_SparseOp<string mnemonic> : Op<Linalg_Dialect, mnemonic, []> {
let printer = [{ return ::print(p, *this); }];
let verifier = ?;
let parser = [{ return ::parse$cppClass(parser, result); }];
}
def Linalg_SparseTensorFromPointerOp :
Linalg_SparseOp<"sparse_tensor">,
Arguments<(ins AnyType:$ptr)>,
Results<(outs AnyTensor:$result)> {
let summary = "Views an opaque sparse tensor pointer as a tensor";
let description = [{
Lacking a first-class citizen type for sparse tensors, this operation
forms the glue between a sparse storage scheme (behind an opaque
pointer) and the (dense) tensors used in the kernel definitions.
This operation merely provides a way to assign a proper tensor
type and shape to the incoming opaque pointer. It disappears
completely during lowering.
Example:
```mlir
!SparseTensor = type !llvm.ptr<i8>
%0 = linalg.sparse_tensor %arg0 : !SparseTensor to tensor<64x64xf64>
```
}];
let assemblyFormat = "$ptr attr-dict `:` type($ptr) `to` type($result)";
}
def Linalg_SparseTensorToPointersMemRefOp :
Linalg_SparseOp<"sparse_pointers">,
Arguments<(ins AnyTensor:$tensor, Index:$dim)>,
Results<(outs AnyStridedMemRefOfRank<1>:$result)> {
let summary = "Extract pointers array at given dimension from a tensor";
let description = [{
Returns the pointers array of the sparse storage scheme at the
given dimension for the given tensor. This is similar to the
`tensor_to_memref` operation in the sense that it provides a bridge
between a tensor world view and a bufferized world view. Unlike the
`tensor_to_memref` operation, however, this sparse operation actually
lowers into a call into a support library to obtain access to the
pointers array.
Example:
```mlir
%1 = linalg.sparse_pointers %0, %c1 : tensor<64x64xf64> to memref<?xindex>
```
}];
let assemblyFormat = "$tensor `,` $dim attr-dict `:` type($tensor)"
" `to` type($result)";
}
def Linalg_SparseTensorToIndicesMemRefOp :
Linalg_SparseOp<"sparse_indices">,
Arguments<(ins AnyTensor:$tensor, Index:$dim)>,
Results<(outs AnyStridedMemRefOfRank<1>:$result)> {
let summary = "Extract indices array at given dimension from a tensor";
let description = [{
Returns the indices array of the sparse storage scheme at the
given dimension for the given tensor. This is similar to the
`tensor_to_memref` operation in the sense that it provides a bridge
between a tensor world view and a bufferized world view. Unlike the
`tensor_to_memref` operation, however, this sparse operation actually
lowers into a call into a support library to obtain access to the
indices array.
Example:
```mlir
%1 = linalg.sparse_indices %0, %c1 : tensor<64x64xf64> to memref<?xindex>
```
}];
let assemblyFormat = "$tensor `,` $dim attr-dict `:` type($tensor)"
" `to` type($result)";
}
def Linalg_SparseTensorToValuesMemRefOp :
Linalg_SparseOp<"sparse_values">,
Arguments<(ins AnyTensor:$tensor)>,
Results<(outs AnyStridedMemRefOfRank<1>:$result)> {
let summary = "Extract numerical values array from a tensor";
let description = [{
Returns the values array of the sparse storage scheme for the given
tensor, independent of the actual dimension. This is similar to the
`tensor_to_memref` operation in the sense that it provides a bridge
between a tensor world view and a bufferized world view. Unlike the
`tensor_to_memref` operation, however, this sparse operation actually
lowers into a call into a support library to obtain access to the
values array.
Example:
```mlir
%1 = linalg.sparse_values %0 : tensor<64x64xf64> to memref<?xf64>
```
}];
let assemblyFormat = "$tensor attr-dict `:` type($tensor) `to` type($result)";
}
#endif // LINALG_SPARSE_OPS


@@ -1016,9 +1016,9 @@ enum class SparseIntType { kNative, kI64, kI32, kI16, kI8 };
struct SparsificationOptions {
SparsificationOptions(SparseParallelizationStrategy p,
SparseVectorizationStrategy v, unsigned vl,
SparseIntType pt, SparseIntType it)
SparseIntType pt, SparseIntType it, bool fo)
: parallelizationStrategy(p), vectorizationStrategy(v), vectorLength(vl),
ptrType(pt), indType(it) {
ptrType(pt), indType(it), fastOutput(fo) {
// TODO: remove restriction when vectors with index elements are supported
assert((v != SparseVectorizationStrategy::kAnyStorageInnerLoop ||
(ptrType != SparseIntType::kNative &&
@@ -1028,19 +1028,25 @@ struct SparsificationOptions {
SparsificationOptions()
: SparsificationOptions(SparseParallelizationStrategy::kNone,
SparseVectorizationStrategy::kNone, 1u,
SparseIntType::kNative, SparseIntType::kNative) {}
SparseIntType::kNative, SparseIntType::kNative,
false) {}
SparseParallelizationStrategy parallelizationStrategy;
SparseVectorizationStrategy vectorizationStrategy;
unsigned vectorLength;
SparseIntType ptrType;
SparseIntType indType;
bool fastOutput; // experimental: fast output buffers
};
/// Set up sparsification rewriting rules with the given options.
/// Sets up sparsification rewriting rules with the given options.
void populateSparsificationPatterns(
MLIRContext *context, OwningRewritePatternList &patterns,
const SparsificationOptions &options = SparsificationOptions());
/// Sets up sparsification conversion rules with the given options.
void populateSparsificationConversionPatterns(
MLIRContext *context, OwningRewritePatternList &patterns);
} // namespace linalg
} // namespace mlir
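
As an aside, a minimal sketch (the wrapper name addSparsePatterns is hypothetical) of how a client pass might configure these entry points with the new fastOutput flag:

```c++
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
#include "mlir/IR/PatternMatch.h"

// Hypothetical helper: enable sparsification with native pointer/index
// types and the experimental fast-output buffers.
void addSparsePatterns(mlir::MLIRContext *ctx,
                       mlir::OwningRewritePatternList &patterns) {
  using namespace mlir::linalg;
  SparsificationOptions options(SparseParallelizationStrategy::kNone,
                                SparseVectorizationStrategy::kNone,
                                /*vectorLength=*/1u, SparseIntType::kNative,
                                SparseIntType::kNative, /*fastOutput=*/true);
  populateSparsificationPatterns(ctx, patterns, options);
}
```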


@@ -0,0 +1,106 @@
// RUN: mlir-opt %s \
// RUN: --test-sparsification="lower" \
// RUN: --convert-linalg-to-loops \
// RUN: --func-bufferize --tensor-constant-bufferize --tensor-bufferize --finalizing-bufferize \
// RUN: --convert-scf-to-std --convert-vector-to-llvm --convert-std-to-llvm | \
// RUN: TENSOR0="%mlir_integration_test_dir/data/test.mtx" \
// RUN: mlir-cpu-runner \
// RUN: -e entry -entry-point-result=void \
// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \
// RUN: FileCheck %s
//
// Use descriptive names for opaque pointers.
//
!Filename = type !llvm.ptr<i8>
!SparseTensor = type !llvm.ptr<i8>
#trait_sum_reduce = {
indexing_maps = [
affine_map<(i,j) -> (i,j)>, // A
affine_map<(i,j) -> ()> // x (out)
],
sparse = [
[ "S", "S" ], // A
[ ] // x
],
iterator_types = ["reduction", "reduction"],
doc = "x += A(i,j)"
}
//
// Integration test that lowers a kernel annotated as sparse to
// actual sparse code, initializes a matching sparse storage scheme
// from file, and runs the resulting code with the JIT compiler.
//
module {
//
// The kernel expressed as an annotated Linalg op. The kernel
// sum reduces a matrix to a single scalar.
//
func @kernel_sum_reduce(%argA: !SparseTensor,
%argx: tensor<f64>) -> tensor<f64> {
%arga = linalg.sparse_tensor %argA : !SparseTensor to tensor<?x?xf64>
%0 = linalg.generic #trait_sum_reduce
ins(%arga: tensor<?x?xf64>)
outs(%argx: tensor<f64>) {
^bb(%a: f64, %x: f64):
%0 = addf %x, %a : f64
linalg.yield %0 : f64
} -> tensor<f64>
return %0 : tensor<f64>
}
//
// Runtime support library that is called directly from here.
//
func private @getTensorFilename(index) -> (!Filename)
func private @newSparseTensor(!Filename, memref<?xi1>) -> (!SparseTensor)
func private @delSparseTensor(!SparseTensor) -> ()
func private @print_memref_f64(%ptr : tensor<*xf64>)
//
// Main driver that reads matrix from file and calls the sparse kernel.
//
func @entry() {
%d0 = constant 0.0 : f64
%c0 = constant 0 : index
%c1 = constant 1 : index
%c2 = constant 2 : index
// Mark both dimensions of the matrix as sparse
// (this must match the annotation in the trait).
%annotations = alloc(%c2) : memref<?xi1>
%sparse = constant true
store %sparse, %annotations[%c0] : memref<?xi1>
store %sparse, %annotations[%c1] : memref<?xi1>
// Set up memory for a single reduction scalar,
// initialized to zero.
%xdata = alloc() : memref<f64>
store %d0, %xdata[] : memref<f64>
%x = tensor_load %xdata : memref<f64>
// Read the sparse matrix from file, construct sparse storage
// according to <sparse,sparse> in memory, and call the kernel.
%fileName = call @getTensorFilename(%c0) : (index) -> (!Filename)
%a = call @newSparseTensor(%fileName, %annotations)
: (!Filename, memref<?xi1>) -> (!SparseTensor)
%0 = call @kernel_sum_reduce(%a, %x)
: (!SparseTensor, tensor<f64>) -> tensor<f64>
// Print the result for verification.
//
// CHECK: 28.2
//
%m = tensor_to_memref %0 : memref<f64>
%v = load %m[] : memref<f64>
vector.print %v : f64
// Release the resources.
call @delSparseTensor(%a) : (!SparseTensor) -> ()
dealloc %xdata : memref<f64>
return
}
}


@@ -1566,6 +1566,9 @@ struct FoldTensorCastOp;
#define GET_OP_CLASSES
#include "mlir/Dialect/Linalg/IR/LinalgStructuredOps.cpp.inc"
#define GET_OP_CLASSES
#include "mlir/Dialect/Linalg/IR/LinalgSparseOps.cpp.inc"
/// Return the dims that are `iteratorTypeName` loops in the LinalgOp `op`.
/// Assumes `op` is a LinalgOp.
void mlir::linalg::getDimsOfType(Operation *op, StringRef iteratorTypeName,


@@ -67,6 +67,10 @@ void mlir::linalg::LinalgDialect::initialize() {
#define GET_OP_LIST
#include "mlir/Dialect/Linalg/IR/LinalgStructuredOps.cpp.inc"
>();
addOperations<
#define GET_OP_LIST
#include "mlir/Dialect/Linalg/IR/LinalgSparseOps.cpp.inc"
>();
addInterfaces<LinalgInlinerInterface>();
}


@@ -10,6 +10,7 @@ add_mlir_dialect_library(MLIRLinalgTransforms
Interchange.cpp
Loops.cpp
Promotion.cpp
SparseLowering.cpp
Sparsification.cpp
Tiling.cpp
Transforms.cpp


@@ -0,0 +1,138 @@
//===- SparseLowering.cpp - Lowers sparse primitives to library calls. ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "mlir/Dialect/LLVMIR/LLVMTypes.h"
#include "mlir/Dialect/Linalg/IR/LinalgOps.h"
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
using namespace mlir;
namespace {
/// Returns function reference (first hit also inserts into module).
static FlatSymbolRefAttr getFunc(Operation *op, StringRef name, Type result,
ValueRange operands) {
MLIRContext *context = op->getContext();
auto module = op->getParentOfType<ModuleOp>();
auto func = module.lookupSymbol<FuncOp>(name);
if (!func) {
OpBuilder moduleBuilder(module.getBodyRegion());
moduleBuilder
.create<FuncOp>(op->getLoc(), name,
FunctionType::get(context, operands.getTypes(), result))
.setPrivate();
}
return SymbolRefAttr::get(context, name);
}
/// Sparse conversion rule to remove opaque pointer cast.
class TensorFromPointerConverter
: public OpConversionPattern<linalg::SparseTensorFromPointerOp> {
using OpConversionPattern::OpConversionPattern;
LogicalResult
matchAndRewrite(linalg::SparseTensorFromPointerOp op,
ArrayRef<Value> operands,
ConversionPatternRewriter &rewriter) const override {
rewriter.replaceOp(op, operands[0]);
return success();
}
};
/// Sparse conversion rule for dimension accesses.
class TensorToDimSizeConverter : public OpConversionPattern<DimOp> {
public:
using OpConversionPattern::OpConversionPattern;
LogicalResult
matchAndRewrite(DimOp op, ArrayRef<Value> operands,
ConversionPatternRewriter &rewriter) const override {
if (!operands[0].getType().isa<LLVM::LLVMPointerType>())
return failure();
Type resType = op.getType();
StringRef name = "sparseDimSize";
rewriter.replaceOpWithNewOp<CallOp>(
op, resType, getFunc(op, name, resType, operands), operands);
return success();
}
};
/// Sparse conversion rule for pointer accesses.
class TensorToPointersConverter
: public OpConversionPattern<linalg::SparseTensorToPointersMemRefOp> {
public:
using OpConversionPattern::OpConversionPattern;
LogicalResult
matchAndRewrite(linalg::SparseTensorToPointersMemRefOp op,
ArrayRef<Value> operands,
ConversionPatternRewriter &rewriter) const override {
Type resType = op.getType();
Type eltType = resType.cast<ShapedType>().getElementType();
StringRef name;
if (eltType.isIndex() || eltType.isInteger(64))
name = "sparsePtrsI64";
else
return failure();
rewriter.replaceOpWithNewOp<CallOp>(
op, resType, getFunc(op, name, resType, operands), operands);
return success();
}
};
/// Sparse conversion rule for index accesses.
class TensorToIndicesConverter
: public OpConversionPattern<linalg::SparseTensorToIndicesMemRefOp> {
public:
using OpConversionPattern::OpConversionPattern;
LogicalResult
matchAndRewrite(linalg::SparseTensorToIndicesMemRefOp op,
ArrayRef<Value> operands,
ConversionPatternRewriter &rewriter) const override {
Type resType = op.getType();
Type eltType = resType.cast<ShapedType>().getElementType();
StringRef name;
if (eltType.isIndex() || eltType.isInteger(64))
name = "sparseIndxsI64";
else
return failure();
rewriter.replaceOpWithNewOp<CallOp>(
op, resType, getFunc(op, name, resType, operands), operands);
return success();
}
};
/// Sparse conversion rule for value accesses.
class TensorToValuesConverter
: public OpConversionPattern<linalg::SparseTensorToValuesMemRefOp> {
public:
using OpConversionPattern::OpConversionPattern;
LogicalResult
matchAndRewrite(linalg::SparseTensorToValuesMemRefOp op,
ArrayRef<Value> operands,
ConversionPatternRewriter &rewriter) const override {
Type resType = op.getType();
Type eltType = resType.cast<ShapedType>().getElementType();
StringRef name;
if (eltType.isF64())
name = "sparseValsF64";
else
return failure();
rewriter.replaceOpWithNewOp<CallOp>(
op, resType, getFunc(op, name, resType, operands), operands);
return success();
}
};
} // namespace
/// Populates the given patterns list with conversion rules required for
/// the sparsification of linear algebra operations.
void linalg::populateSparsificationConversionPatterns(
MLIRContext *context, OwningRewritePatternList &patterns) {
patterns.insert<TensorFromPointerConverter, TensorToDimSizeConverter,
TensorToPointersConverter, TensorToIndicesConverter,
TensorToValuesConverter>(context);
}
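
These rules are meant to be driven by a partial dialect conversion that outlaws the bridging ops and legalizes the generated calls; below is a minimal sketch that closely mirrors the TestSparsification driver later in this commit (the function name lowerSparsePrimitives is hypothetical):

```c++
#include "mlir/Dialect/Linalg/IR/LinalgOps.h"
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/Transforms/DialectConversion.h"

// Sketch: lower the sparse bridging ops nested under 'op' to library calls.
static mlir::LogicalResult lowerSparsePrimitives(mlir::Operation *op) {
  mlir::MLIRContext *ctx = op->getContext();
  mlir::OwningRewritePatternList conversionPatterns;
  mlir::ConversionTarget target(*ctx);
  // The bridging ops must disappear during this conversion ...
  target.addIllegalOp<mlir::linalg::SparseTensorFromPointerOp,
                      mlir::linalg::SparseTensorToPointersMemRefOp,
                      mlir::linalg::SparseTensorToIndicesMemRefOp,
                      mlir::linalg::SparseTensorToValuesMemRefOp>();
  // ... while the calls they lower into remain legal.
  target.addLegalOp<mlir::CallOp>();
  mlir::linalg::populateSparsificationConversionPatterns(ctx,
                                                         conversionPatterns);
  return mlir::applyPartialConversion(op, target,
                                      std::move(conversionPatterns));
}
```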


@@ -520,6 +520,37 @@ static Type genIntType(PatternRewriter &rewriter, linalg::SparseIntType tp) {
llvm_unreachable("unexpected SparseIntType");
}
/// Returns true if tensor was set up with sparse storage scheme.
static bool linkedSparse(linalg::GenericOp op, unsigned tensor) {
if (tensor < op.getNumInputs())
return isa_and_nonnull<linalg::SparseTensorFromPointerOp>(
op.getInput(tensor).getDefiningOp());
return false;
}
/// Generates buffer for the output tensor.
static Value genOutputBuffer(CodeGen &codegen, PatternRewriter &rewriter,
linalg::GenericOp op, MemRefType denseTp,
ArrayRef<Value> args) {
Location loc = op.getLoc();
Value tensor = op.getOutput(0);
// The output tensor could simply materialize from the buffer that will
// be generated for the tensor present in the outs() clause. This has
// the major advantage that the sparse kernel only updates the nonzero
// positions for the output tensor. Currently this results in functional,
// but slightly imprecise IR, so it is put under an experimental option.
if (codegen.options.fastOutput)
return rewriter.create<TensorToMemrefOp>(loc, denseTp, tensor);
// By default, a new buffer is allocated which is initialized to the
// tensor defined in the outs() clause. This is always correct but
// introduces a dense initialization component that may negatively
// impact the running complexity of the sparse kernel.
Value init = rewriter.create<TensorToMemrefOp>(loc, denseTp, tensor);
Value alloc = rewriter.create<AllocOp>(loc, denseTp, args);
rewriter.create<linalg::CopyOp>(loc, init, alloc);
return alloc;
}
/// Local bufferization of all dense and sparse data structures.
/// This code enables testing the first prototype sparse compiler.
// TODO: replace this with a proliferated bufferization strategy
@@ -529,53 +560,63 @@ static void genBuffers(Merger &merger, CodeGen &codegen,
unsigned numTensors = op.getNumShapedOperands();
unsigned numInputs = op.getNumInputs();
assert(numTensors == numInputs + 1);
// For now, set all unknown dimensions to 999.
// TODO: compute these values (using sparsity or by reading tensor)
Value unknown = rewriter.create<ConstantIndexOp>(loc, 999);
// For every tensor, find lower and upper bound on dimensions, set the
// same bounds on loop indices, and allocate dense or sparse buffer(s).
// same bounds on loop indices, and obtain dense or sparse buffer(s).
SmallVector<Value, 4> args;
for (unsigned t = 0; t < numTensors; t++) {
Value tensor = t < numInputs ? op.getInput(t) : op.getOutput(0);
auto tensorType = op.getShapedType(t);
auto shape = tensorType.getShape();
auto map = op.getIndexingMap(t);
// Scan all dimensions of current tensor.
bool allDense = true;
bool dense = !linkedSparse(op, t);
args.clear();
for (unsigned d = 0, rank = shape.size(); d < rank; d++) {
unsigned i = map.getDimPosition(d);
// Handle sparse storage schemes.
if (merger.isDim(t, i, Dim::kSparse)) {
allDense = false;
dense = false;
auto dynShape = {ShapedType::kDynamicSize};
auto ptrTp = MemRefType::get(
dynShape, genIntType(rewriter, codegen.options.ptrType));
auto indTp = MemRefType::get(
dynShape, genIntType(rewriter, codegen.options.indType));
codegen.pointers[t][i] = rewriter.create<AllocaOp>(loc, ptrTp, unknown);
codegen.indices[t][i] = rewriter.create<AllocaOp>(loc, indTp, unknown);
Value dim = rewriter.create<ConstantIndexOp>(loc, d);
// Generate sparse primitives to obtain pointers and indices.
codegen.pointers[t][i] =
rewriter.create<linalg::SparseTensorToPointersMemRefOp>(
loc, ptrTp, tensor, dim);
codegen.indices[t][i] =
rewriter.create<linalg::SparseTensorToIndicesMemRefOp>(loc, indTp,
tensor, dim);
}
// Find lower and upper bound in current dimension.
Value up;
if (shape[d] == TensorType::kDynamicSize) {
Value arg = t < numInputs ? op.getInput(t) : op.getOutput(0);
up = rewriter.create<DimOp>(loc, arg, d);
up = rewriter.create<DimOp>(loc, tensor, d);
args.push_back(up);
} else {
up = rewriter.create<ConstantIndexOp>(loc, shape[d]);
}
codegen.sizes[i] = codegen.highs[t][i] = up;
}
// Allocate dense or sparse buffer for numerical values.
if (allDense) {
// Perform the required bufferization. All dense inputs materialize
// from the input tensor. The dense output tensor needs special
// handling. Sparse inputs use a sparse primitive to obtain the values.
if (dense) {
auto denseTp = MemRefType::get(shape, tensorType.getElementType());
codegen.buffers[t] = rewriter.create<AllocaOp>(loc, denseTp, args);
if (t < numInputs)
codegen.buffers[t] =
rewriter.create<TensorToMemrefOp>(loc, denseTp, tensor);
else
codegen.buffers[t] =
genOutputBuffer(codegen, rewriter, op, denseTp, args);
} else {
auto sparseTp = MemRefType::get({ShapedType::kDynamicSize},
tensorType.getElementType());
codegen.buffers[t] = rewriter.create<AllocaOp>(loc, sparseTp, unknown);
auto dynShape = {ShapedType::kDynamicSize};
auto sparseTp = MemRefType::get(dynShape, tensorType.getElementType());
codegen.buffers[t] =
rewriter.create<linalg::SparseTensorToValuesMemRefOp>(loc, sparseTp,
tensor);
}
}
}
@@ -657,7 +698,7 @@ static Value genTensorLoad(Merger &merger, CodeGen &codegen,
SmallVector<Value, 4> args;
unsigned tensor = merger.exp(exp).e0;
auto map = op.getIndexingMap(tensor);
bool sparse = false;
bool sparse = linkedSparse(op, tensor);
for (unsigned i = 0, m = map.getNumResults(); i < m; ++i) {
unsigned idx = map.getDimPosition(i);
args.push_back(codegen.loops[idx]); // universal dense index


@@ -89,11 +89,96 @@ private:
return false;
}
public:
std::vector<uint64_t> sizes; // per-rank dimension sizes
std::vector<Element> elements;
uint64_t pos;
};
/// A memory-resident sparse tensor using a storage scheme based on per-rank
/// annotations on dense/sparse. This data structure provides a bufferized
/// form of an imaginary SparseTensorType, until such a type becomes a
/// first-class citizen of MLIR. In contrast to generating setup methods for
/// each differently annotated sparse tensor, this method provides a convenient
/// "one-size-fits-all" solution that simply takes an input tensor and
/// annotations to implement all required setup in a general manner.
template <typename P, typename I, typename V>
class SparseTensorStorage {
public:
/// Constructs sparse tensor storage scheme following the given
/// per-rank dimension dense/sparse annotations.
SparseTensorStorage(SparseTensor *tensor, bool *sparsity)
: sizes(tensor->sizes), positions(sizes.size()), indices(sizes.size()) {
// Provide hints on capacity.
// TODO: needs fine-tuning based on sparsity
values.reserve(tensor->elements.size());
for (uint64_t d = 0, s = 1, rank = sizes.size(); d < rank; d++) {
s *= tensor->sizes[d];
if (sparsity[d]) {
positions[d].reserve(s + 1);
indices[d].reserve(s);
s = 1;
}
}
// Then set up the tensor.
traverse(tensor, sparsity, 0, tensor->elements.size(), 0);
}
private:
/// Initializes sparse tensor storage scheme from a memory-resident
/// representation of an external sparse tensor. This method prepares
/// the pointers and indices arrays under the given per-rank dimension
/// dense/sparse annotations.
void traverse(SparseTensor *tensor, bool *sparsity, uint64_t lo, uint64_t hi,
uint64_t d) {
const std::vector<Element> &elements = tensor->elements;
// Once dimensions are exhausted, insert the numerical values.
if (d == sizes.size()) {
values.push_back(lo < hi ? elements[lo].value : 0.0);
return;
}
// Prepare a sparse pointer structure at this dimension.
if (sparsity[d] && positions[d].empty())
positions[d].push_back(0);
// Visit all elements in this interval.
uint64_t full = 0;
while (lo < hi) {
// Find segment in interval with same index elements in this dimension.
uint64_t idx = elements[lo].indices[d];
uint64_t seg = lo + 1;
while (seg < hi && elements[seg].indices[d] == idx)
seg++;
// Handle segment in interval for sparse or dense dimension.
if (sparsity[d]) {
indices[d].push_back(idx);
} else {
for (; full < idx; full++)
traverse(tensor, sparsity, 0, 0, d + 1); // pass empty
full++;
}
traverse(tensor, sparsity, lo, seg, d + 1);
// And move on to next segment in interval.
lo = seg;
}
// Finalize the sparse pointer structure at this dimension.
if (sparsity[d]) {
positions[d].push_back(indices[d].size());
} else {
for (uint64_t sz = tensor->sizes[d]; full < sz; full++)
traverse(tensor, sparsity, 0, 0, d + 1); // pass empty
}
}
public:
std::vector<uint64_t> sizes; // per-rank dimension sizes
std::vector<std::vector<P>> positions;
std::vector<std::vector<I>> indices;
std::vector<V> values;
};
typedef SparseTensorStorage<uint64_t, uint64_t, double>
SparseTensorStorageU64U64F64;
/// Helper to convert string to lower case.
static char *toLower(char *token) {
for (char *c = token; *c; c++)
@@ -200,24 +285,37 @@ static void readExtFROSTTHeader(FILE *file, char *name, uint64_t *idata) {
//
//
// Note that input parameters in the "MLIRized" version of a function mimic
// the data layout of a MemRef<?xT>:
//
// struct MemRef {
// T *base;
// T *data;
// int64_t off;
// int64_t sizes[1];
// int64_t strides[1];
// }
// the data layout of a MemRef<?xT> (but cannot use a direct struct). The
// output parameter uses a direct struct.
//
//===----------------------------------------------------------------------===//
extern "C" {
/// Cannot use templates with C linkage.
struct MemRef1DU64 {
const uint64_t *base;
const uint64_t *data;
uint64_t off;
uint64_t sizes[1];
uint64_t strides[1];
};
struct MemRef1DF64 {
const double *base;
const double *data;
uint64_t off;
uint64_t sizes[1];
uint64_t strides[1];
};
/// Reads in a sparse tensor with the given filename. The call yields a
/// pointer to an opaque memory-resident sparse tensor object that is only
/// understood by other methods in the sparse runtime support library. An
/// array parameter is used to pass the rank, the number of nonzero elements,
/// and the dimension sizes (one per rank).
extern "C" void *openTensorC(char *filename, uint64_t *idata) {
void *openTensorC(char *filename, uint64_t *idata) {
// Open the file.
FILE *file = fopen(filename, "r");
if (!file) {
@@ -264,14 +362,14 @@ extern "C" void *openTensorC(char *filename, uint64_t *idata) {
}
/// "MLIRized" version.
extern "C" void *openTensor(char *filename, uint64_t *ibase, uint64_t *idata,
uint64_t ioff, uint64_t isize, uint64_t istride) {
void *openTensor(char *filename, uint64_t *ibase, uint64_t *idata,
uint64_t ioff, uint64_t isize, uint64_t istride) {
assert(istride == 1);
return openTensorC(filename, idata + ioff);
}
/// Yields the next element from the given opaque sparse tensor object.
extern "C" void readTensorItemC(void *tensor, uint64_t *idata, double *ddata) {
void readTensorItemC(void *tensor, uint64_t *idata, double *ddata) {
const Element &e = static_cast<SparseTensor *>(tensor)->next();
for (uint64_t r = 0, rank = e.indices.size(); r < rank; r++)
idata[r] = e.indices[r];
@@ -279,27 +377,74 @@ extern "C" void readTensorItemC(void *tensor, uint64_t *idata, double *ddata) {
}
/// "MLIRized" version.
extern "C" void readTensorItem(void *tensor, uint64_t *ibase, uint64_t *idata,
uint64_t ioff, uint64_t isize, uint64_t istride,
double *dbase, double *ddata, uint64_t doff,
uint64_t dsize, uint64_t dstride) {
void readTensorItem(void *tensor, uint64_t *ibase, uint64_t *idata,
uint64_t ioff, uint64_t isize, uint64_t istride,
double *dbase, double *ddata, uint64_t doff, uint64_t dsize,
uint64_t dstride) {
assert(istride == 1 && dstride == 1);
readTensorItemC(tensor, idata + ioff, ddata + doff);
}
/// Closes the given opaque sparse tensor object, releasing its memory
/// resources. After this call, the opague object cannot be used anymore.
extern "C" void closeTensor(void *tensor) {
delete static_cast<SparseTensor *>(tensor);
}
/// resources. After this call, the opaque object cannot be used anymore.
void closeTensor(void *tensor) { delete static_cast<SparseTensor *>(tensor); }
/// Helper method to read a sparse tensor filename from the environment,
/// defined with the naming convention ${TENSOR0}, ${TENSOR1}, etc.
extern "C" char *getTensorFilename(uint64_t id) {
char *getTensorFilename(uint64_t id) {
char var[80];
sprintf(var, "TENSOR%" PRIu64, id);
char *env = getenv(var);
return env;
}
///
/// Sparse primitives that support an opaque implementation of a bufferized
/// SparseTensor in MLIR. This could be replaced by actual codegen in MLIR.
///
void *newSparseTensorC(char *filename, bool *annotations) {
uint64_t idata[64];
SparseTensor *t = static_cast<SparseTensor *>(openTensorC(filename, idata));
SparseTensorStorageU64U64F64 *tensor =
new SparseTensorStorageU64U64F64(t, annotations);
delete t;
return tensor;
}
/// "MLIRized" version.
void *newSparseTensor(char *filename, bool *abase, bool *adata, uint64_t aoff,
uint64_t asize, uint64_t astride) {
assert(astride == 1);
return newSparseTensorC(filename, abase + aoff);
}
uint64_t sparseDimSize(void *tensor, uint64_t d) {
return static_cast<SparseTensorStorageU64U64F64 *>(tensor)->sizes[d];
}
MemRef1DU64 sparsePtrsI64(void *tensor, uint64_t d) {
const std::vector<uint64_t> &v =
static_cast<SparseTensorStorageU64U64F64 *>(tensor)->positions[d];
return {v.data(), v.data(), 0, {v.size()}, {1}};
}
MemRef1DU64 sparseIndxsI64(void *tensor, uint64_t d) {
const std::vector<uint64_t> &v =
static_cast<SparseTensorStorageU64U64F64 *>(tensor)->indices[d];
return {v.data(), v.data(), 0, {v.size()}, {1}};
}
MemRef1DF64 sparseValsF64(void *tensor) {
const std::vector<double> &v =
static_cast<SparseTensorStorageU64U64F64 *>(tensor)->values;
return {v.data(), v.data(), 0, {v.size()}, {1}};
}
void delSparseTensor(void *tensor) {
delete static_cast<SparseTensorStorageU64U64F64 *>(tensor);
}
} // extern "C"
#endif // MLIR_CRUNNERUTILS_DEFINE_FUNCTIONS
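
To make the recursion in SparseTensorStorage::traverse above more tangible, here is a minimal standalone sketch (a deliberate simplification, 2-D and <dense, sparse> only, not code from this commit) that turns lexicographically sorted (row, col, value) elements into the familiar CSR-style positions/indices/values arrays; the class above handles any rank and any dense/sparse annotation with the same idea.

```c++
#include <cstdint>
#include <cstdio>
#include <vector>

struct Element {
  uint64_t row, col;
  double value;
};

int main() {
  // 3x4 matrix with nonzeros (0,0)=1, (0,3)=2, (2,1)=3, (2,2)=4,
  // assumed already sorted lexicographically, as traverse() requires.
  std::vector<Element> elements = {
      {0, 0, 1.0}, {0, 3, 2.0}, {2, 1, 3.0}, {2, 2, 4.0}};
  const uint64_t rows = 3;

  std::vector<uint64_t> positions = {0}; // one pointer per row, plus one
  std::vector<uint64_t> indices;         // column index per stored value
  std::vector<double> values;            // the stored values themselves

  uint64_t lo = 0;
  for (uint64_t r = 0; r < rows; r++) {
    // Dense dimension: visit every row, including empty ones.
    while (lo < elements.size() && elements[lo].row == r) {
      indices.push_back(elements[lo].col); // sparse dimension: record index
      values.push_back(elements[lo].value);
      lo++;
    }
    positions.push_back(indices.size());   // finalize pointer for this row
  }

  // Prints: positions = 0 2 2 4, indices = 0 3 1 2, values = 1 2 3 4
  printf("positions =");
  for (uint64_t p : positions)
    printf(" %llu", (unsigned long long)p);
  printf("\nindices   =");
  for (uint64_t i : indices)
    printf(" %llu", (unsigned long long)i);
  printf("\nvalues    =");
  for (double v : values)
    printf(" %g", v);
  printf("\n");
  return 0;
}
```

With both dimensions annotated sparse instead, a second positions/indices pair at dimension 0 would also compress away the empty middle row.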

File diff suppressed because it is too large.

File diff suppressed because it is too large.

File diff suppressed because it is too large.


@@ -0,0 +1,181 @@
// RUN: mlir-opt %s -test-sparsification | \
// RUN: FileCheck %s --check-prefix=CHECK-HIR
//
// RUN: mlir-opt %s -test-sparsification="lower" --convert-linalg-to-loops | \
// RUN: FileCheck %s --check-prefix=CHECK-MIR
//
// RUN: mlir-opt %s -test-sparsification="lower" --convert-linalg-to-loops \
// RUN: --func-bufferize --tensor-constant-bufferize \
// RUN: --tensor-bufferize --finalizing-bufferize | \
// RUN: FileCheck %s --check-prefix=CHECK-LIR
//
// RUN: mlir-opt %s -test-sparsification="lower fast-output" --convert-linalg-to-loops \
// RUN: --func-bufferize --tensor-constant-bufferize \
// RUN: --tensor-bufferize --finalizing-bufferize | \
// RUN: FileCheck %s --check-prefix=CHECK-FAST
#trait_matvec = {
indexing_maps = [
affine_map<(i,j) -> (i,j)>, // A
affine_map<(i,j) -> (j)>, // b
affine_map<(i,j) -> (i)> // x (out)
],
iterator_types = ["parallel","reduction"],
sparse = [
[ "D", "S" ], // A
[ "D" ], // b
[ "D" ] // x (out)
],
sparse_dim_map = [
affine_map<(i,j) -> (j,i)>, // A: column-wise
affine_map<(i) -> (i)>, // x
affine_map<(i) -> (i)> // b
],
doc = "x(i) += A(i,j) * b(j)"
}
// CHECK-HIR-LABEL: func @matvec(
// CHECK-HIR-SAME: %[[VAL_0:.*]]: !llvm.ptr<i8>,
// CHECK-HIR-SAME: %[[VAL_1:.*]]: tensor<64xf64>,
// CHECK-HIR-SAME: %[[VAL_2:.*]]: tensor<64xf64>) -> tensor<64xf64> {
// CHECK-HIR: %[[VAL_3:.*]] = constant 64 : index
// CHECK-HIR: %[[VAL_4:.*]] = constant 0 : index
// CHECK-HIR: %[[VAL_5:.*]] = constant 1 : index
// CHECK-HIR: %[[VAL_6:.*]] = linalg.sparse_tensor %[[VAL_0]] : !llvm.ptr<i8> to tensor<64x64xf64>
// CHECK-HIR: %[[VAL_7:.*]] = linalg.sparse_pointers %[[VAL_6]], %[[VAL_5]] : tensor<64x64xf64> to memref<?xindex>
// CHECK-HIR: %[[VAL_8:.*]] = linalg.sparse_indices %[[VAL_6]], %[[VAL_5]] : tensor<64x64xf64> to memref<?xindex>
// CHECK-HIR: %[[VAL_9:.*]] = linalg.sparse_values %[[VAL_6]] : tensor<64x64xf64> to memref<?xf64>
// CHECK-HIR: %[[VAL_10:.*]] = tensor_to_memref %[[VAL_1]] : memref<64xf64>
// CHECK-HIR: %[[VAL_11:.*]] = tensor_to_memref %[[VAL_2]] : memref<64xf64>
// CHECK-HIR: %[[VAL_12:.*]] = alloc() : memref<64xf64>
// CHECK-HIR: linalg.copy(%[[VAL_11]], %[[VAL_12]]) : memref<64xf64>, memref<64xf64>
// CHECK-HIR: scf.for %[[VAL_13:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] {
// CHECK-HIR: %[[VAL_14:.*]] = load %[[VAL_7]]{{\[}}%[[VAL_13]]] : memref<?xindex>
// CHECK-HIR: %[[VAL_15:.*]] = addi %[[VAL_13]], %[[VAL_5]] : index
// CHECK-HIR: %[[VAL_16:.*]] = load %[[VAL_7]]{{\[}}%[[VAL_15]]] : memref<?xindex>
// CHECK-HIR: %[[VAL_17:.*]] = load %[[VAL_12]]{{\[}}%[[VAL_13]]] : memref<64xf64>
// CHECK-HIR: %[[VAL_18:.*]] = scf.for %[[VAL_19:.*]] = %[[VAL_14]] to %[[VAL_16]] step %[[VAL_5]] iter_args(%[[VAL_20:.*]] = %[[VAL_17]]) -> (f64) {
// CHECK-HIR: %[[VAL_21:.*]] = load %[[VAL_8]]{{\[}}%[[VAL_19]]] : memref<?xindex>
// CHECK-HIR: %[[VAL_22:.*]] = load %[[VAL_9]]{{\[}}%[[VAL_19]]] : memref<?xf64>
// CHECK-HIR: %[[VAL_23:.*]] = load %[[VAL_10]]{{\[}}%[[VAL_21]]] : memref<64xf64>
// CHECK-HIR: %[[VAL_24:.*]] = mulf %[[VAL_22]], %[[VAL_23]] : f64
// CHECK-HIR: %[[VAL_25:.*]] = addf %[[VAL_20]], %[[VAL_24]] : f64
// CHECK-HIR: scf.yield %[[VAL_25]] : f64
// CHECK-HIR: }
// CHECK-HIR: store %[[VAL_26:.*]], %[[VAL_12]]{{\[}}%[[VAL_13]]] : memref<64xf64>
// CHECK-HIR: }
// CHECK-HIR: %[[VAL_27:.*]] = tensor_load %[[VAL_12]] : memref<64xf64>
// CHECK-HIR: return %[[VAL_27]] : tensor<64xf64>
// CHECK-HIR: }
// CHECK-MIR-LABEL: func @matvec(
// CHECK-MIR-SAME: %[[VAL_0:.*]]: !llvm.ptr<i8>,
// CHECK-MIR-SAME: %[[VAL_1:.*]]: tensor<64xf64>,
// CHECK-MIR-SAME: %[[VAL_2:.*]]: tensor<64xf64>) -> tensor<64xf64> {
// CHECK-MIR: %[[VAL_3:.*]] = constant 64 : index
// CHECK-MIR: %[[VAL_4:.*]] = constant 0 : index
// CHECK-MIR: %[[VAL_5:.*]] = constant 1 : index
// CHECK-MIR: %[[VAL_6:.*]] = call @sparsePtrsI64(%[[VAL_0]], %[[VAL_5]]) : (!llvm.ptr<i8>, index) -> memref<?xindex>
// CHECK-MIR: %[[VAL_7:.*]] = call @sparseIndxsI64(%[[VAL_0]], %[[VAL_5]]) : (!llvm.ptr<i8>, index) -> memref<?xindex>
// CHECK-MIR: %[[VAL_8:.*]] = call @sparseValsF64(%[[VAL_0]]) : (!llvm.ptr<i8>) -> memref<?xf64>
// CHECK-MIR: %[[VAL_9:.*]] = tensor_to_memref %[[VAL_1]] : memref<64xf64>
// CHECK-MIR: %[[VAL_10:.*]] = tensor_to_memref %[[VAL_2]] : memref<64xf64>
// CHECK-MIR: %[[VAL_11:.*]] = alloc() : memref<64xf64>
// CHECK-MIR: scf.for %[[VAL_12:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] {
// CHECK-MIR: %[[VAL_13:.*]] = load %[[VAL_10]]{{\[}}%[[VAL_12]]] : memref<64xf64>
// CHECK-MIR: store %[[VAL_13]], %[[VAL_11]]{{\[}}%[[VAL_12]]] : memref<64xf64>
// CHECK-MIR: }
// CHECK-MIR: scf.for %[[VAL_14:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] {
// CHECK-MIR: %[[VAL_15:.*]] = load %[[VAL_6]]{{\[}}%[[VAL_14]]] : memref<?xindex>
// CHECK-MIR: %[[VAL_16:.*]] = addi %[[VAL_14]], %[[VAL_5]] : index
// CHECK-MIR: %[[VAL_17:.*]] = load %[[VAL_6]]{{\[}}%[[VAL_16]]] : memref<?xindex>
// CHECK-MIR: %[[VAL_18:.*]] = load %[[VAL_11]]{{\[}}%[[VAL_14]]] : memref<64xf64>
// CHECK-MIR: %[[VAL_19:.*]] = scf.for %[[VAL_20:.*]] = %[[VAL_15]] to %[[VAL_17]] step %[[VAL_5]] iter_args(%[[VAL_21:.*]] = %[[VAL_18]]) -> (f64) {
// CHECK-MIR: %[[VAL_22:.*]] = load %[[VAL_7]]{{\[}}%[[VAL_20]]] : memref<?xindex>
// CHECK-MIR: %[[VAL_23:.*]] = load %[[VAL_8]]{{\[}}%[[VAL_20]]] : memref<?xf64>
// CHECK-MIR: %[[VAL_24:.*]] = load %[[VAL_9]]{{\[}}%[[VAL_22]]] : memref<64xf64>
// CHECK-MIR: %[[VAL_25:.*]] = mulf %[[VAL_23]], %[[VAL_24]] : f64
// CHECK-MIR: %[[VAL_26:.*]] = addf %[[VAL_21]], %[[VAL_25]] : f64
// CHECK-MIR: scf.yield %[[VAL_26]] : f64
// CHECK-MIR: }
// CHECK-MIR: store %[[VAL_27:.*]], %[[VAL_11]]{{\[}}%[[VAL_14]]] : memref<64xf64>
// CHECK-MIR: }
// CHECK-MIR: %[[VAL_28:.*]] = tensor_load %[[VAL_11]] : memref<64xf64>
// CHECK-MIR: return %[[VAL_28]] : tensor<64xf64>
// CHECK-MIR: }
// CHECK-LIR-LABEL: func @matvec(
// CHECK-LIR-SAME: %[[VAL_0:.*]]: !llvm.ptr<i8>,
// CHECK-LIR-SAME: %[[VAL_1:.*]]: memref<64xf64>,
// CHECK-LIR-SAME: %[[VAL_2:.*]]: memref<64xf64>) -> memref<64xf64> {
// CHECK-LIR: %[[VAL_3:.*]] = constant 64 : index
// CHECK-LIR: %[[VAL_4:.*]] = constant 0 : index
// CHECK-LIR: %[[VAL_5:.*]] = constant 1 : index
// CHECK-LIR: %[[VAL_6:.*]] = call @sparsePtrsI64(%[[VAL_0]], %[[VAL_5]]) : (!llvm.ptr<i8>, index) -> memref<?xindex>
// CHECK-LIR: %[[VAL_7:.*]] = call @sparseIndxsI64(%[[VAL_0]], %[[VAL_5]]) : (!llvm.ptr<i8>, index) -> memref<?xindex>
// CHECK-LIR: %[[VAL_8:.*]] = call @sparseValsF64(%[[VAL_0]]) : (!llvm.ptr<i8>) -> memref<?xf64>
// CHECK-LIR: %[[VAL_9:.*]] = alloc() : memref<64xf64>
// CHECK-LIR: scf.for %[[VAL_10:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] {
// CHECK-LIR: %[[VAL_11:.*]] = load %[[VAL_2]]{{\[}}%[[VAL_10]]] : memref<64xf64>
// CHECK-LIR: store %[[VAL_11]], %[[VAL_9]]{{\[}}%[[VAL_10]]] : memref<64xf64>
// CHECK-LIR: }
// CHECK-LIR: scf.for %[[VAL_12:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] {
// CHECK-LIR: %[[VAL_13:.*]] = load %[[VAL_6]]{{\[}}%[[VAL_12]]] : memref<?xindex>
// CHECK-LIR: %[[VAL_14:.*]] = addi %[[VAL_12]], %[[VAL_5]] : index
// CHECK-LIR: %[[VAL_15:.*]] = load %[[VAL_6]]{{\[}}%[[VAL_14]]] : memref<?xindex>
// CHECK-LIR: %[[VAL_16:.*]] = load %[[VAL_9]]{{\[}}%[[VAL_12]]] : memref<64xf64>
// CHECK-LIR: %[[VAL_17:.*]] = scf.for %[[VAL_18:.*]] = %[[VAL_13]] to %[[VAL_15]] step %[[VAL_5]] iter_args(%[[VAL_19:.*]] = %[[VAL_16]]) -> (f64) {
// CHECK-LIR: %[[VAL_20:.*]] = load %[[VAL_7]]{{\[}}%[[VAL_18]]] : memref<?xindex>
// CHECK-LIR: %[[VAL_21:.*]] = load %[[VAL_8]]{{\[}}%[[VAL_18]]] : memref<?xf64>
// CHECK-LIR: %[[VAL_22:.*]] = load %[[VAL_1]]{{\[}}%[[VAL_20]]] : memref<64xf64>
// CHECK-LIR: %[[VAL_23:.*]] = mulf %[[VAL_21]], %[[VAL_22]] : f64
// CHECK-LIR: %[[VAL_24:.*]] = addf %[[VAL_19]], %[[VAL_23]] : f64
// CHECK-LIR: scf.yield %[[VAL_24]] : f64
// CHECK-LIR: }
// CHECK-LIR: store %[[VAL_25:.*]], %[[VAL_9]]{{\[}}%[[VAL_12]]] : memref<64xf64>
// CHECK-LIR: }
// CHECK-LIR: return %[[VAL_9]] : memref<64xf64>
// CHECK-LIR: }
// CHECK-FAST-LABEL: func @matvec(
// CHECK-FAST-SAME: %[[VAL_0:.*]]: !llvm.ptr<i8>,
// CHECK-FAST-SAME: %[[VAL_1:.*]]: memref<64xf64>,
// CHECK-FAST-SAME: %[[VAL_2:.*]]: memref<64xf64>) -> memref<64xf64> {
// CHECK-FAST: %[[VAL_3:.*]] = constant 64 : index
// CHECK-FAST: %[[VAL_4:.*]] = constant 0 : index
// CHECK-FAST: %[[VAL_5:.*]] = constant 1 : index
// CHECK-FAST: %[[VAL_6:.*]] = call @sparsePtrsI64(%[[VAL_0]], %[[VAL_5]]) : (!llvm.ptr<i8>, index) -> memref<?xindex>
// CHECK-FAST: %[[VAL_7:.*]] = call @sparseIndxsI64(%[[VAL_0]], %[[VAL_5]]) : (!llvm.ptr<i8>, index) -> memref<?xindex>
// CHECK-FAST: %[[VAL_8:.*]] = call @sparseValsF64(%[[VAL_0]]) : (!llvm.ptr<i8>) -> memref<?xf64>
// CHECK-FAST: scf.for %[[VAL_9:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] {
// CHECK-FAST: %[[VAL_10:.*]] = load %[[VAL_6]]{{\[}}%[[VAL_9]]] : memref<?xindex>
// CHECK-FAST: %[[VAL_11:.*]] = addi %[[VAL_9]], %[[VAL_5]] : index
// CHECK-FAST: %[[VAL_12:.*]] = load %[[VAL_6]]{{\[}}%[[VAL_11]]] : memref<?xindex>
// CHECK-FAST: %[[VAL_13:.*]] = load %[[VAL_2]]{{\[}}%[[VAL_9]]] : memref<64xf64>
// CHECK-FAST: %[[VAL_14:.*]] = scf.for %[[VAL_15:.*]] = %[[VAL_10]] to %[[VAL_12]] step %[[VAL_5]] iter_args(%[[VAL_16:.*]] = %[[VAL_13]]) -> (f64) {
// CHECK-FAST: %[[VAL_17:.*]] = load %[[VAL_7]]{{\[}}%[[VAL_15]]] : memref<?xindex>
// CHECK-FAST: %[[VAL_18:.*]] = load %[[VAL_8]]{{\[}}%[[VAL_15]]] : memref<?xf64>
// CHECK-FAST: %[[VAL_19:.*]] = load %[[VAL_1]]{{\[}}%[[VAL_17]]] : memref<64xf64>
// CHECK-FAST: %[[VAL_20:.*]] = mulf %[[VAL_18]], %[[VAL_19]] : f64
// CHECK-FAST: %[[VAL_21:.*]] = addf %[[VAL_16]], %[[VAL_20]] : f64
// CHECK-FAST: scf.yield %[[VAL_21]] : f64
// CHECK-FAST: }
// CHECK-FAST: store %[[VAL_22:.*]], %[[VAL_2]]{{\[}}%[[VAL_9]]] : memref<64xf64>
// CHECK-FAST: }
// CHECK-FAST: return %[[VAL_2]] : memref<64xf64>
// CHECK-FAST: }
!SparseTensor = type !llvm.ptr<i8>
func @matvec(%argA: !SparseTensor, %argb: tensor<64xf64>, %argx: tensor<64xf64>) -> tensor<64xf64> {
%arga = linalg.sparse_tensor %argA : !SparseTensor to tensor<64x64xf64>
%0 = linalg.generic #trait_matvec
ins(%arga, %argb : tensor<64x64xf64>, tensor<64xf64>)
outs(%argx: tensor<64xf64>) {
^bb(%A: f64, %b: f64, %x: f64):
%0 = mulf %A, %b : f64
%1 = addf %x, %0 : f64
linalg.yield %1 : f64
} -> tensor<64xf64>
return %0 : tensor<64xf64>
}


@@ -0,0 +1,37 @@
// RUN: mlir-opt --test-sparsification="lower" %s | FileCheck %s
!SparseTensor = type !llvm.ptr<i8>
// CHECK-LABEL: func @sparse_pointers(
// CHECK-SAME: %[[A:.*]]: !llvm.ptr<i8>)
// CHECK: %[[C:.*]] = constant 1 : index
// CHECK: %[[T:.*]] = call @sparsePtrsI64(%[[A]], %[[C]]) : (!llvm.ptr<i8>, index) -> memref<?xindex>
// CHECK: return %[[T]] : memref<?xindex>
func @sparse_pointers(%arg0: !SparseTensor) -> memref<?xindex> {
%a = linalg.sparse_tensor %arg0 : !SparseTensor to tensor<128xf64>
%c = constant 1 : index
%0 = linalg.sparse_pointers %a, %c : tensor<128xf64> to memref<?xindex>
return %0 : memref<?xindex>
}
// CHECK-LABEL: func @sparse_indices(
// CHECK-SAME: %[[A:.*]]: !llvm.ptr<i8>)
// CHECK: %[[C:.*]] = constant 1 : index
// CHECK: %[[T:.*]] = call @sparseIndxsI64(%[[A]], %[[C]]) : (!llvm.ptr<i8>, index) -> memref<?xindex>
// CHECK: return %[[T]] : memref<?xindex>
func @sparse_indices(%arg0: !SparseTensor) -> memref<?xindex> {
%a = linalg.sparse_tensor %arg0 : !SparseTensor to tensor<128xf64>
%c = constant 1 : index
%0 = linalg.sparse_indices %a, %c : tensor<128xf64> to memref<?xindex>
return %0 : memref<?xindex>
}
// CHECK-LABEL: func @sparse_values(
// CHECK-SAME: %[[A:.*]]: !llvm.ptr<i8>)
// CHECK: %[[T:.*]] = call @sparseValsF64(%[[A]]) : (!llvm.ptr<i8>) -> memref<?xf64>
// CHECK: return %[[T]] : memref<?xf64>
func @sparse_values(%arg0: !SparseTensor) -> memref<?xf64> {
%a = linalg.sparse_tensor %arg0 : !SparseTensor to tensor<128xf64>
%0 = linalg.sparse_values %a : tensor<128xf64> to memref<?xf64>
return %0 : memref<?xf64>
}


@@ -23,52 +23,55 @@
// CHECK-SAME: %[[VAL_0:.*0]]: tensor<100x200x300x400x500x600x700x800xf32>,
// CHECK-SAME: %[[VAL_1:.*1]]: tensor<100x200x300x400x500x600x700x800xf32>,
// CHECK-SAME: %[[VAL_2:.*2]]: tensor<100x200x300x400x500x600x700x800xf32>) -> tensor<100x200x300x400x500x600x700x800xf32> {
// CHECK: %[[VAL_3:.*]] = constant 999 : index
// CHECK: %[[VAL_4:.*]] = constant 100 : index
// CHECK: %[[VAL_5:.*]] = constant 200 : index
// CHECK: %[[VAL_6:.*]] = constant 300 : index
// CHECK: %[[VAL_7:.*]] = constant 600 : index
// CHECK: %[[VAL_8:.*]] = constant 700 : index
// CHECK: %[[VAL_9:.*]] = constant 800 : index
// CHECK: %[[VAL_10:.*]] = constant 0 : index
// CHECK: %[[VAL_11:.*]] = constant 1 : index
// CHECK: %[[VAL_12:.*]] = alloca() : memref<100x200x300x400x500x600x700x800xf32>
// CHECK: %[[VAL_13:.*]] = alloca(%[[VAL_3]]) : memref<?xindex>
// CHECK: %[[VAL_14:.*]] = alloca(%[[VAL_3]]) : memref<?xindex>
// CHECK: %[[VAL_15:.*]] = alloca(%[[VAL_3]]) : memref<?xindex>
// CHECK: %[[VAL_16:.*]] = alloca(%[[VAL_3]]) : memref<?xindex>
// CHECK: %[[VAL_17:.*]] = alloca(%[[VAL_3]]) : memref<?xf32>
// CHECK: %[[VAL_18:.*]] = alloca() : memref<100x200x300x400x500x600x700x800xf32>
// CHECK: scf.for %[[VAL_19:.*]] = %[[VAL_10]] to %[[VAL_9]] step %[[VAL_11]] {
// CHECK: scf.for %[[VAL_20:.*]] = %[[VAL_10]] to %[[VAL_8]] step %[[VAL_11]] {
// CHECK: %[[VAL_21:.*]] = muli %[[VAL_19]], %[[VAL_8]] : index
// CHECK: %[[VAL_22:.*]] = addi %[[VAL_21]], %[[VAL_20]] : index
// CHECK: scf.for %[[VAL_23:.*]] = %[[VAL_10]] to %[[VAL_7]] step %[[VAL_11]] {
// CHECK: %[[VAL_24:.*]] = muli %[[VAL_22]], %[[VAL_7]] : index
// CHECK: %[[VAL_25:.*]] = addi %[[VAL_24]], %[[VAL_23]] : index
// CHECK: %[[VAL_26:.*]] = load %[[VAL_13]]{{\[}}%[[VAL_25]]] : memref<?xindex>
// CHECK: %[[VAL_27:.*]] = addi %[[VAL_25]], %[[VAL_11]] : index
// CHECK: %[[VAL_28:.*]] = load %[[VAL_13]]{{\[}}%[[VAL_27]]] : memref<?xindex>
// CHECK: scf.for %[[VAL_29:.*]] = %[[VAL_26]] to %[[VAL_28]] step %[[VAL_11]] {
// CHECK: %[[VAL_30:.*]] = load %[[VAL_14]]{{\[}}%[[VAL_29]]] : memref<?xindex>
// CHECK: %[[VAL_31:.*]] = load %[[VAL_15]]{{\[}}%[[VAL_29]]] : memref<?xindex>
// CHECK: %[[VAL_32:.*]] = addi %[[VAL_29]], %[[VAL_11]] : index
// CHECK: %[[VAL_33:.*]] = load %[[VAL_15]]{{\[}}%[[VAL_32]]] : memref<?xindex>
// CHECK: scf.for %[[VAL_34:.*]] = %[[VAL_31]] to %[[VAL_33]] step %[[VAL_11]] {
// CHECK: %[[VAL_35:.*]] = load %[[VAL_16]]{{\[}}%[[VAL_34]]] : memref<?xindex>
// CHECK: scf.for %[[VAL_36:.*]] = %[[VAL_10]] to %[[VAL_6]] step %[[VAL_11]] {
// CHECK: %[[VAL_37:.*]] = muli %[[VAL_34]], %[[VAL_6]] : index
// CHECK: %[[VAL_38:.*]] = addi %[[VAL_37]], %[[VAL_36]] : index
// CHECK: scf.for %[[VAL_39:.*]] = %[[VAL_10]] to %[[VAL_5]] step %[[VAL_11]] {
// CHECK: %[[VAL_40:.*]] = muli %[[VAL_38]], %[[VAL_5]] : index
// CHECK: %[[VAL_41:.*]] = addi %[[VAL_40]], %[[VAL_39]] : index
// CHECK: scf.for %[[VAL_42:.*]] = %[[VAL_10]] to %[[VAL_4]] step %[[VAL_11]] {
// CHECK: %[[VAL_43:.*]] = muli %[[VAL_41]], %[[VAL_4]] : index
// CHECK: %[[VAL_44:.*]] = addi %[[VAL_43]], %[[VAL_42]] : index
// CHECK: %[[VAL_45:.*]] = load %[[VAL_12]]{{\[}}%[[VAL_42]], %[[VAL_39]], %[[VAL_36]], %[[VAL_35]], %[[VAL_30]], %[[VAL_23]], %[[VAL_20]], %[[VAL_19]]] : memref<100x200x300x400x500x600x700x800xf32>
// CHECK: %[[VAL_46:.*]] = load %[[VAL_17]]{{\[}}%[[VAL_44]]] : memref<?xf32>
// CHECK: %[[VAL_47:.*]] = mulf %[[VAL_45]], %[[VAL_46]] : f32
// CHECK: store %[[VAL_47]], %[[VAL_18]]{{\[}}%[[VAL_42]], %[[VAL_39]], %[[VAL_36]], %[[VAL_35]], %[[VAL_30]], %[[VAL_23]], %[[VAL_20]], %[[VAL_19]]] : memref<100x200x300x400x500x600x700x800xf32>
// CHECK: %[[VAL_3:.*]] = constant 3 : index
// CHECK: %[[VAL_4:.*]] = constant 4 : index
// CHECK: %[[VAL_5:.*]] = constant 100 : index
// CHECK: %[[VAL_6:.*]] = constant 200 : index
// CHECK: %[[VAL_7:.*]] = constant 300 : index
// CHECK: %[[VAL_8:.*]] = constant 600 : index
// CHECK: %[[VAL_9:.*]] = constant 700 : index
// CHECK: %[[VAL_10:.*]] = constant 800 : index
// CHECK: %[[VAL_11:.*]] = constant 0 : index
// CHECK: %[[VAL_12:.*]] = constant 1 : index
// CHECK: %[[VAL_13:.*]] = tensor_to_memref %[[VAL_0]] : memref<100x200x300x400x500x600x700x800xf32>
// CHECK: %[[VAL_14:.*]] = linalg.sparse_pointers %[[VAL_1]], %[[VAL_3]] : tensor<100x200x300x400x500x600x700x800xf32> to memref<?xindex>
// CHECK: %[[VAL_15:.*]] = linalg.sparse_indices %[[VAL_1]], %[[VAL_3]] : tensor<100x200x300x400x500x600x700x800xf32> to memref<?xindex>
// CHECK: %[[VAL_16:.*]] = linalg.sparse_pointers %[[VAL_1]], %[[VAL_4]] : tensor<100x200x300x400x500x600x700x800xf32> to memref<?xindex>
// CHECK: %[[VAL_17:.*]] = linalg.sparse_indices %[[VAL_1]], %[[VAL_4]] : tensor<100x200x300x400x500x600x700x800xf32> to memref<?xindex>
// CHECK: %[[VAL_18:.*]] = linalg.sparse_values %[[VAL_1]] : tensor<100x200x300x400x500x600x700x800xf32> to memref<?xf32>
// CHECK: %[[VAL_19:.*]] = tensor_to_memref %[[VAL_2]] : memref<100x200x300x400x500x600x700x800xf32>
// CHECK: %[[VAL_20:.*]] = alloc() : memref<100x200x300x400x500x600x700x800xf32>
// CHECK: linalg.copy(%[[VAL_19]], %[[VAL_20]]) : memref<100x200x300x400x500x600x700x800xf32>, memref<100x200x300x400x500x600x700x800xf32>
// CHECK: scf.for %[[VAL_21:.*]] = %[[VAL_11]] to %[[VAL_10]] step %[[VAL_12]] {
// CHECK: scf.for %[[VAL_22:.*]] = %[[VAL_11]] to %[[VAL_9]] step %[[VAL_12]] {
// CHECK: %[[VAL_23:.*]] = muli %[[VAL_21]], %[[VAL_9]] : index
// CHECK: %[[VAL_24:.*]] = addi %[[VAL_23]], %[[VAL_22]] : index
// CHECK: scf.for %[[VAL_25:.*]] = %[[VAL_11]] to %[[VAL_8]] step %[[VAL_12]] {
// CHECK: %[[VAL_26:.*]] = muli %[[VAL_24]], %[[VAL_8]] : index
// CHECK: %[[VAL_27:.*]] = addi %[[VAL_26]], %[[VAL_25]] : index
// CHECK: %[[VAL_28:.*]] = load %[[VAL_14]]{{\[}}%[[VAL_27]]] : memref<?xindex>
// CHECK: %[[VAL_29:.*]] = addi %[[VAL_27]], %[[VAL_12]] : index
// CHECK: %[[VAL_30:.*]] = load %[[VAL_14]]{{\[}}%[[VAL_29]]] : memref<?xindex>
// CHECK: scf.for %[[VAL_31:.*]] = %[[VAL_28]] to %[[VAL_30]] step %[[VAL_12]] {
// CHECK: %[[VAL_32:.*]] = load %[[VAL_15]]{{\[}}%[[VAL_31]]] : memref<?xindex>
// CHECK: %[[VAL_33:.*]] = load %[[VAL_16]]{{\[}}%[[VAL_31]]] : memref<?xindex>
// CHECK: %[[VAL_34:.*]] = addi %[[VAL_31]], %[[VAL_12]] : index
// CHECK: %[[VAL_35:.*]] = load %[[VAL_16]]{{\[}}%[[VAL_34]]] : memref<?xindex>
// CHECK: scf.for %[[VAL_36:.*]] = %[[VAL_33]] to %[[VAL_35]] step %[[VAL_12]] {
// CHECK: %[[VAL_37:.*]] = load %[[VAL_17]]{{\[}}%[[VAL_36]]] : memref<?xindex>
// CHECK: scf.for %[[VAL_38:.*]] = %[[VAL_11]] to %[[VAL_7]] step %[[VAL_12]] {
// CHECK: %[[VAL_39:.*]] = muli %[[VAL_36]], %[[VAL_7]] : index
// CHECK: %[[VAL_40:.*]] = addi %[[VAL_39]], %[[VAL_38]] : index
// CHECK: scf.for %[[VAL_41:.*]] = %[[VAL_11]] to %[[VAL_6]] step %[[VAL_12]] {
// CHECK: %[[VAL_42:.*]] = muli %[[VAL_40]], %[[VAL_6]] : index
// CHECK: %[[VAL_43:.*]] = addi %[[VAL_42]], %[[VAL_41]] : index
// CHECK: scf.for %[[VAL_44:.*]] = %[[VAL_11]] to %[[VAL_5]] step %[[VAL_12]] {
// CHECK: %[[VAL_45:.*]] = muli %[[VAL_43]], %[[VAL_5]] : index
// CHECK: %[[VAL_46:.*]] = addi %[[VAL_45]], %[[VAL_44]] : index
// CHECK: %[[VAL_47:.*]] = load %[[VAL_13]]{{\[}}%[[VAL_44]], %[[VAL_41]], %[[VAL_38]], %[[VAL_37]], %[[VAL_32]], %[[VAL_25]], %[[VAL_22]], %[[VAL_21]]] : memref<100x200x300x400x500x600x700x800xf32>
// CHECK: %[[VAL_48:.*]] = load %[[VAL_18]]{{\[}}%[[VAL_46]]] : memref<?xf32>
// CHECK: %[[VAL_49:.*]] = mulf %[[VAL_47]], %[[VAL_48]] : f32
// CHECK: store %[[VAL_49]], %[[VAL_20]]{{\[}}%[[VAL_44]], %[[VAL_41]], %[[VAL_38]], %[[VAL_37]], %[[VAL_32]], %[[VAL_25]], %[[VAL_22]], %[[VAL_21]]] : memref<100x200x300x400x500x600x700x800xf32>
// CHECK: }
// CHECK: }
// CHECK: }
@@ -77,8 +80,8 @@
// CHECK: }
// CHECK: }
// CHECK: }
// CHECK: %[[VAL_48:.*]] = tensor_load %[[VAL_18]] : memref<100x200x300x400x500x600x700x800xf32>
// CHECK: return %[[VAL_48]] : tensor<100x200x300x400x500x600x700x800xf32>
// CHECK: %[[VAL_50:.*]] = tensor_load %[[VAL_20]] : memref<100x200x300x400x500x600x700x800xf32>
// CHECK: return %[[VAL_50]] : tensor<100x200x300x400x500x600x700x800xf32>
// CHECK: }
func @mul(%arga: tensor<100x200x300x400x500x600x700x800xf32>,
%argb: tensor<100x200x300x400x500x600x700x800xf32>,


@@ -0,0 +1,49 @@
// RUN: mlir-opt -split-input-file %s | FileCheck %s
!SparseTensor = type !llvm.ptr<i8>
// CHECK-LABEL: func @sparse_tensor(
// CHECK-SAME: %[[A:.*]]: !llvm.ptr<i8>)
// CHECK: %[[T:.*]] = linalg.sparse_tensor %[[A]] : !llvm.ptr<i8> to tensor<128xf64>
// CHECK: return %[[T]] : tensor<128xf64>
func @sparse_tensor(%arg0: !SparseTensor) -> tensor<128xf64> {
%0 = linalg.sparse_tensor %arg0 : !SparseTensor to tensor<128xf64>
return %0 : tensor<128xf64>
}
// -----
// CHECK-LABEL: func @sparse_pointers(
// CHECK-SAME: %[[A:.*]]: tensor<128xf64>)
// CHECK: %[[C:.*]] = constant 1 : index
// CHECK: %[[T:.*]] = linalg.sparse_pointers %[[A]], %[[C]] : tensor<128xf64> to memref<?xindex>
// CHECK: return %[[T]] : memref<?xindex>
func @sparse_pointers(%arg0: tensor<128xf64>) -> memref<?xindex> {
%c = constant 1 : index
%0 = linalg.sparse_pointers %arg0, %c : tensor<128xf64> to memref<?xindex>
return %0 : memref<?xindex>
}
// -----
// CHECK-LABEL: func @sparse_indices(
// CHECK-SAME: %[[A:.*]]: tensor<128xf64>)
// CHECK: %[[C:.*]] = constant 1 : index
// CHECK: %[[T:.*]] = linalg.sparse_indices %[[A]], %[[C]] : tensor<128xf64> to memref<?xindex>
// CHECK: return %[[T]] : memref<?xindex>
func @sparse_indices(%arg0: tensor<128xf64>) -> memref<?xindex> {
%c = constant 1 : index
%0 = linalg.sparse_indices %arg0, %c : tensor<128xf64> to memref<?xindex>
return %0 : memref<?xindex>
}
// -----
// CHECK-LABEL: func @sparse_values(
// CHECK-SAME: %[[A:.*]]: tensor<128xf64>)
// CHECK: %[[T:.*]] = linalg.sparse_values %[[A]] : tensor<128xf64> to memref<?xf64>
// CHECK: return %[[T]] : memref<?xf64>
func @sparse_values(%arg0: tensor<128xf64>) -> memref<?xf64> {
%0 = linalg.sparse_values %arg0 : tensor<128xf64> to memref<?xf64>
return %0 : memref<?xf64>
}


@@ -6,6 +6,7 @@
//
//===----------------------------------------------------------------------===//
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
#include "mlir/Dialect/Vector/VectorOps.h"
#include "mlir/Pass/Pass.h"
@@ -40,9 +41,17 @@ struct TestSparsification
llvm::cl::desc("Set the index type"),
llvm::cl::init(0)};
Option<bool> fastOutput{*this, "fast-output",
llvm::cl::desc("Allows fast output buffers"),
llvm::cl::init(false)};
Option<bool> lower{*this, "lower", llvm::cl::desc("Lower sparse primitives"),
llvm::cl::init(false)};
/// Registers all dialects required by testing.
void getDependentDialects(DialectRegistry &registry) const override {
registry.insert<scf::SCFDialect, vector::VectorDialect>();
registry
.insert<scf::SCFDialect, vector::VectorDialect, LLVM::LLVMDialect>();
}
/// Returns parallelization strategy given on command line.
@@ -96,11 +105,25 @@ struct TestSparsification
// Translate strategy flags to strategy options.
linalg::SparsificationOptions options(parallelOption(), vectorOption(),
vectorLength, typeOption(ptrType),
typeOption(indType));
typeOption(indType), fastOutput);
// Apply rewriting.
linalg::populateSparsificationPatterns(ctx, patterns, options);
vector::populateVectorToVectorCanonicalizationPatterns(patterns, ctx);
(void)applyPatternsAndFoldGreedily(getFunction(), std::move(patterns));
// Lower sparse primitives to calls into runtime support library.
if (lower) {
OwningRewritePatternList conversionPatterns;
ConversionTarget target(*ctx);
target.addIllegalOp<linalg::SparseTensorFromPointerOp,
linalg::SparseTensorToPointersMemRefOp,
linalg::SparseTensorToIndicesMemRefOp,
linalg::SparseTensorToValuesMemRefOp>();
target.addLegalOp<CallOp>();
linalg::populateSparsificationConversionPatterns(ctx, conversionPatterns);
if (failed(applyPartialConversion(getOperation(), target,
std::move(conversionPatterns))))
signalPassFailure();
}
}
};