[mlir][sparse] retry sparse-only for cyclic iteration graphs
This is a very minor improvement during iteration graph construction. If the first attempt, which considers the dimension order of all tensors, fails, a second attempt is made using the constraints of sparse tensors only. Dense tensors prefer their natural dimension order for locality, but they also provide random access when needed, so relaxing their constraints enables the compilation of more sparse kernels.

Reviewed By: penpornk

Differential Revision: https://reviews.llvm.org/D94709
Parent: 39665d9aab
Commit: 5508516b06
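As an illustration of the retry strategy above, the following standalone C++ sketch builds an iteration-graph adjacency matrix from per-tensor index orders and falls back to sparse-only constraints when the full graph is cyclic. It is only a sketch under assumed names (Tensor, tryTopSort, and computeOrder are hypothetical), not the code in this commit, and it uses a Kahn-style topological sort rather than the DFS (topSortDFS) used by the pass.

// Illustrative sketch only: two-pass iteration-graph construction with a
// sparse-only fallback. Not the MLIR implementation; names are hypothetical.
#include <cstdio>
#include <vector>

struct Tensor {
  bool sparse;                 // true if the tensor has any sparse dimension
  std::vector<unsigned> order; // loop indices in tensor access order
};

// Kahn-style topological sort; returns false if the graph has a cycle.
static bool tryTopSort(const std::vector<std::vector<bool>> &adj,
                       std::vector<unsigned> &topSort) {
  unsigned n = adj.size();
  std::vector<unsigned> indeg(n, 0);
  for (unsigned i = 0; i < n; i++)
    for (unsigned j = 0; j < n; j++)
      if (adj[i][j])
        indeg[j]++;
  topSort.clear(); // essential when the sort is attempted a second time
  for (unsigned k = 0; k < n; k++) {
    unsigned next = n;
    for (unsigned j = 0; j < n && next == n; j++)
      if (indeg[j] == 0)
        next = j;
    if (next == n)
      return false; // cyclic iteration graph
    indeg[next] = ~0u; // mark as visited
    topSort.push_back(next);
    for (unsigned j = 0; j < n; j++)
      if (adj[next][j] && indeg[j] != ~0u)
        indeg[j]--;
  }
  return true;
}

// Adds constraint edges i -> j for consecutive loop indices of each tensor
// access; with sparseOnly set, dense tensors contribute no constraints.
static bool computeOrder(const std::vector<Tensor> &tensors, unsigned n,
                         std::vector<unsigned> &topSort, bool sparseOnly) {
  std::vector<std::vector<bool>> adj(n, std::vector<bool>(n, false));
  for (const Tensor &t : tensors) {
    if (sparseOnly && !t.sparse)
      continue; // dense storage still allows random access
    for (unsigned d = 1; d < t.order.size(); d++)
      adj[t.order[d - 1]][t.order[d]] = true;
  }
  return tryTopSort(adj, topSort);
}

int main() {
  // Dense tensor accessed as (i,j), sparse tensor accessed as (j,i):
  // cyclic with all constraints, acyclic with sparse constraints only.
  std::vector<Tensor> tensors = {{false, {0, 1}}, {true, {1, 0}}};
  std::vector<unsigned> topSort;
  if (computeOrder(tensors, 2, topSort, /*sparseOnly=*/false) ||
      computeOrder(tensors, 2, topSort, /*sparseOnly=*/true))
    for (unsigned i : topSort)
      std::printf("loop %u\n", i);
  return 0;
}

In this small example the dense tensor accesses (i,j) while the sparse tensor accesses (j,i); the combined constraints form a cycle, but the sparse-only retry succeeds and yields the loop order j, i.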
@@ -274,6 +274,11 @@ public:
     return false;
   }

+  // Returns true if tensor has any sparse dimension.
+  bool isSparseTensor(unsigned t) const {
+    return llvm::any_of(dims[t], [](Dim d) { return d == Dim::kSparse; });
+  }
+
   // Setter
   void setDim(unsigned t, unsigned i, Dim d) { dims[t][i] = d; }

@@ -382,17 +387,22 @@ static bool topSortDFS(unsigned i, std::vector<unsigned> &visit,
 /// for sparse storage formats since these only support access along fixed
 /// dimensions. Even for dense storage formats, however, the natural index
 /// order yields innermost unit-stride access with better spatial locality.
-static bool computeIterationGraph(linalg::GenericOp op,
-                                  std::vector<unsigned> &topSort) {
+static bool computeIterationGraph(Merger &merger, linalg::GenericOp op,
+                                  std::vector<unsigned> &topSort,
+                                  bool sparseOnly) {
   // Set up an n x n from/to adjacency matrix of the iteration graph
   // for the implicit loop indices i_0 .. i_n-1.
   unsigned n = op.getNumLoops();
   std::vector<std::vector<bool>> adjM(n, std::vector<bool>(n, false));

   // Iterate over the indexing maps of every tensor in the tensor expression.
-  for (auto imap : llvm::enumerate(op.indexing_maps())) {
-    auto map = imap.value().template cast<AffineMapAttr>().getValue();
+  unsigned numTensors = op.getNumShapedOperands();
+  for (unsigned t = 0; t < numTensors; t++) {
+    auto map = op.getIndexingMap(t);
     assert(map.getNumDims() == n);
+    // Skip dense tensor constraints when sparse only is requested.
+    if (sparseOnly && !merger.isSparseTensor(t))
+      continue;
     // At the moment, we take the index variables in the tensor access
     // expression in the order in which they appear (conceptually a
     // "row-major" layout of every tensor). So, a tensor access A_ijk
@@ -407,6 +417,7 @@ static bool computeIterationGraph(linalg::GenericOp op,

   // Topologically sort the iteration graph to determine loop order.
   // Report failure for a cyclic iteration graph.
+  topSort.clear();
   topSort.reserve(n);
   std::vector<unsigned> visit(n, 0);
   for (unsigned i = 0; i < n; i++)
@@ -1207,10 +1218,9 @@ public:
     // tensors are visited in natural index order. Fails on cycles.
     // This assumes that higher-level passes have already put the
     // tensors in each tensor expression in a feasible order.
-    // TODO: try again without *dense* constraints on failure or
-    // even try to insert sparse reorderings to resolve cycles
     std::vector<unsigned> topSort;
-    if (!computeIterationGraph(op, topSort))
+    if (!computeIterationGraph(merger, op, topSort, /*sparseOnly=*/false) &&
+        !computeIterationGraph(merger, op, topSort, /*sparseOnly=*/true))
       return failure();

     // Finds the terminating yield statement and builds the tensor
@@ -0,0 +1,94 @@
+// NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
+// RUN: mlir-opt %s -test-sparsification | FileCheck %s
+
+// Example with cyclic iteration graph with sparse and dense constraints,
+// but an acyclic iteration graph using sparse constraints only.
+#trait_mul = {
+  indexing_maps = [
+    affine_map<(i,j,k,l,m,n,o,p) -> (i,j,k,l,m,n,o,p)>, // A
+    affine_map<(i,j,k,l,m,n,o,p) -> (p,o,n,m,l,k,j,i)>, // B
+    affine_map<(i,j,k,l,m,n,o,p) -> (i,j,k,l,m,n,o,p)>  // X
+  ],
+  sparse = [
+    [ "D", "D", "D", "D", "D", "D", "D", "D" ], // a
+    [ "D", "D", "D", "S", "S", "D", "D", "D" ], // b
+    [ "D", "D", "D", "D", "D", "D", "D", "D" ]  // x
+  ],
+  iterator_types = ["parallel", "parallel", "parallel", "parallel",
+                    "parallel", "parallel", "parallel", "parallel"],
+  doc = "X(i,j,k,l,m,n,o,p) = A(i,j,k,l,m,n,o,p) * B(p,o,n,m,l,k,j,i)"
+}
+
+// CHECK-LABEL: func @mul(
+// CHECK-SAME: %[[VAL_0:.*]]: tensor<100x200x300x400x500x600x700x800xf32>,
+// CHECK-SAME: %[[VAL_1:.*]]: tensor<100x200x300x400x500x600x700x800xf32>) -> tensor<100x200x300x400x500x600x700x800xf32> {
+// CHECK: %[[VAL_2:.*]] = constant 999 : index
+// CHECK: %[[VAL_3:.*]] = constant 100 : index
+// CHECK: %[[VAL_4:.*]] = constant 200 : index
+// CHECK: %[[VAL_5:.*]] = constant 300 : index
+// CHECK: %[[VAL_6:.*]] = constant 600 : index
+// CHECK: %[[VAL_7:.*]] = constant 700 : index
+// CHECK: %[[VAL_8:.*]] = constant 800 : index
+// CHECK: %[[VAL_9:.*]] = constant 0 : index
+// CHECK: %[[VAL_10:.*]] = constant 1 : index
+// CHECK: %[[VAL_11:.*]] = alloca() : memref<100x200x300x400x500x600x700x800xf32>
+// CHECK: %[[VAL_12:.*]] = alloca(%[[VAL_2]]) : memref<?xindex>
+// CHECK: %[[VAL_13:.*]] = alloca(%[[VAL_2]]) : memref<?xindex>
+// CHECK: %[[VAL_14:.*]] = alloca(%[[VAL_2]]) : memref<?xindex>
+// CHECK: %[[VAL_15:.*]] = alloca(%[[VAL_2]]) : memref<?xindex>
+// CHECK: %[[VAL_16:.*]] = alloca(%[[VAL_2]]) : memref<?xf32>
+// CHECK: %[[VAL_17:.*]] = alloca() : memref<100x200x300x400x500x600x700x800xf32>
+// CHECK: scf.for %[[VAL_18:.*]] = %[[VAL_9]] to %[[VAL_8]] step %[[VAL_10]] {
+// CHECK: scf.for %[[VAL_19:.*]] = %[[VAL_9]] to %[[VAL_7]] step %[[VAL_10]] {
+// CHECK: %[[VAL_20:.*]] = muli %[[VAL_18]], %[[VAL_7]] : index
+// CHECK: %[[VAL_21:.*]] = addi %[[VAL_20]], %[[VAL_19]] : index
+// CHECK: scf.for %[[VAL_22:.*]] = %[[VAL_9]] to %[[VAL_6]] step %[[VAL_10]] {
+// CHECK: %[[VAL_23:.*]] = muli %[[VAL_21]], %[[VAL_6]] : index
+// CHECK: %[[VAL_24:.*]] = addi %[[VAL_23]], %[[VAL_22]] : index
+// CHECK: %[[VAL_25:.*]] = load %[[VAL_12]]{{\[}}%[[VAL_24]]] : memref<?xindex>
+// CHECK: %[[VAL_26:.*]] = addi %[[VAL_24]], %[[VAL_10]] : index
+// CHECK: %[[VAL_27:.*]] = load %[[VAL_12]]{{\[}}%[[VAL_26]]] : memref<?xindex>
+// CHECK: scf.for %[[VAL_28:.*]] = %[[VAL_25]] to %[[VAL_27]] step %[[VAL_10]] {
+// CHECK: %[[VAL_29:.*]] = load %[[VAL_13]]{{\[}}%[[VAL_28]]] : memref<?xindex>
+// CHECK: %[[VAL_30:.*]] = load %[[VAL_14]]{{\[}}%[[VAL_28]]] : memref<?xindex>
+// CHECK: %[[VAL_31:.*]] = addi %[[VAL_28]], %[[VAL_10]] : index
+// CHECK: %[[VAL_32:.*]] = load %[[VAL_14]]{{\[}}%[[VAL_31]]] : memref<?xindex>
+// CHECK: scf.for %[[VAL_33:.*]] = %[[VAL_30]] to %[[VAL_32]] step %[[VAL_10]] {
+// CHECK: %[[VAL_34:.*]] = load %[[VAL_15]]{{\[}}%[[VAL_33]]] : memref<?xindex>
+// CHECK: scf.for %[[VAL_35:.*]] = %[[VAL_9]] to %[[VAL_5]] step %[[VAL_10]] {
+// CHECK: %[[VAL_36:.*]] = muli %[[VAL_33]], %[[VAL_5]] : index
+// CHECK: %[[VAL_37:.*]] = addi %[[VAL_36]], %[[VAL_35]] : index
+// CHECK: scf.for %[[VAL_38:.*]] = %[[VAL_9]] to %[[VAL_4]] step %[[VAL_10]] {
+// CHECK: %[[VAL_39:.*]] = muli %[[VAL_37]], %[[VAL_4]] : index
+// CHECK: %[[VAL_40:.*]] = addi %[[VAL_39]], %[[VAL_38]] : index
+// CHECK: scf.for %[[VAL_41:.*]] = %[[VAL_9]] to %[[VAL_3]] step %[[VAL_10]] {
+// CHECK: %[[VAL_42:.*]] = muli %[[VAL_40]], %[[VAL_3]] : index
+// CHECK: %[[VAL_43:.*]] = addi %[[VAL_42]], %[[VAL_41]] : index
+// CHECK: %[[VAL_44:.*]] = load %[[VAL_11]]{{\[}}%[[VAL_41]], %[[VAL_38]], %[[VAL_35]], %[[VAL_34]], %[[VAL_29]], %[[VAL_22]], %[[VAL_19]], %[[VAL_18]]] : memref<100x200x300x400x500x600x700x800xf32>
+// CHECK: %[[VAL_45:.*]] = load %[[VAL_16]]{{\[}}%[[VAL_43]]] : memref<?xf32>
+// CHECK: %[[VAL_46:.*]] = mulf %[[VAL_44]], %[[VAL_45]] : f32
+// CHECK: store %[[VAL_46]], %[[VAL_17]]{{\[}}%[[VAL_41]], %[[VAL_38]], %[[VAL_35]], %[[VAL_34]], %[[VAL_29]], %[[VAL_22]], %[[VAL_19]], %[[VAL_18]]] : memref<100x200x300x400x500x600x700x800xf32>
+// CHECK: }
+// CHECK: }
+// CHECK: }
+// CHECK: }
+// CHECK: }
+// CHECK: }
+// CHECK: }
+// CHECK: }
+// CHECK: %[[VAL_47:.*]] = tensor_load %[[VAL_17]] : memref<100x200x300x400x500x600x700x800xf32>
+// CHECK: return %[[VAL_47]] : tensor<100x200x300x400x500x600x700x800xf32>
+// CHECK: }
+func @mul(%arga: tensor<100x200x300x400x500x600x700x800xf32>,
+          %argb: tensor<100x200x300x400x500x600x700x800xf32>)
+    -> tensor<100x200x300x400x500x600x700x800xf32> {
+  %0 = linalg.generic #trait_mul
+     ins(%arga, %argb: tensor<100x200x300x400x500x600x700x800xf32>,
+                       tensor<100x200x300x400x500x600x700x800xf32>)
+    outs(%arga: tensor<100x200x300x400x500x600x700x800xf32>) {
+    ^bb(%a: f32, %b: f32, %s : f32):
+      %0 = mulf %a, %b : f32
+      linalg.yield %0 : f32
+  } -> tensor<100x200x300x400x500x600x700x800xf32>
+  return %0 : tensor<100x200x300x400x500x600x700x800xf32>
+}