[mlir][sparse] support affine expression on sparse dimensions (analysis implementation)

Reviewed By: aartbik Differential Revision: https://reviews.llvm.org/D138171
2022-11-17 01:20:11 +00:00 · 2022-11-17 01:20:11 +00:00 · 372e7939d7
parent d23b63cecc
commit 372e7939d7
5 changed files with 205 additions and 41 deletions
--- a/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h
+++ b/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h
@ -148,14 +148,29 @@ struct LatPoint {
 /// independently from the basic algorithm if bottlenecks are identified.
 class Merger {
 public:
-  /// Constructs a merger for the given number of tensors and loops. The
-  /// user supplies the number of tensors involved in the kernel, with the
-  /// last tensor in this set denoting the output tensor. The merger adds an
-  /// additional synthetic tensor at the end of this set to represent all
-  /// invariant expressions in the kernel.
-  Merger(unsigned t, unsigned l)
-      : outTensor(t - 1), syntheticTensor(t), numTensors(t + 1), numLoops(l),
-        hasSparseOut(false),
+  /// Constructs a merger for the given number of tensors, native loops, and
+  /// filter loops. The user supplies the number of tensors involved in the
+  /// kernel, with the last tensor in this set denoting the output tensor. The
+  /// merger adds an additional synthetic tensor at the end of this set to
+  /// represent all invariant expressions in the kernel.
+  /// In addition to natives
+  /// loops (which are specified by the GenericOp), extra filter loops are
+  /// needed in order to handle affine expressions on sparse dimensions.
+  /// E.g., (d0, d1, d2) => (d0 + d1, d2), a naive implementation of the filter
+  /// loop could be generated as:
+  /// for (coord : sparse_dim[0])
+  ///   if (coord == d0 + d1) {
+  ///      generated_code;
+  ///   }
+  /// }
+  /// to filter out coordinates that are not equal to the affine expression
+  /// result.
+  /// TODO: we want to make the filter loop more efficient in the future, e.g.,
+  /// by avoiding scanning the full stored index sparse (keeping the last
+  /// position in ordered list) or even apply binary search to find the index.
+  Merger(unsigned t, unsigned l, unsigned fl)
+      : outTensor(t - 1), syntheticTensor(t), numTensors(t + 1),
+        numNativeLoops(l), numLoops(l + fl), hasSparseOut(false),
        dimTypes(numTensors,
                 std::vector<DimLevelType>(numLoops, DimLevelType::Undef)),
        loopIdxToDim(numTensors,
@ -231,6 +246,15 @@ public:
  /// Bit translation (get loop index).
  unsigned index(unsigned b) const { return b / numTensors; }

+  /// Get the number of total loops (native loops + filter loops).
+  unsigned getNumLoops() const { return numLoops; }
+  /// Get the number of native loops.
+  unsigned getNumNativeLoops() const { return numNativeLoops; }
+  /// Get the number of filter loops.
+  unsigned getNumFilterLoops() const { return numLoops - numNativeLoops; }
+  /// Get the starting filter loop index.
+  unsigned getFilterLoopStartingIdx() const { return getNumNativeLoops(); }
+
  /// Returns true if bit corresponds to index of output tensor.
  bool isOutTensor(unsigned b, unsigned i) const {
    return tensor(b) == outTensor && index(b) == i;
@ -242,6 +266,11 @@ public:
  /// expressions).
  unsigned getSynTensorID() const { return syntheticTensor; }

+  bool isFilterLoop(unsigned ldx) const {
+    assert(ldx < numLoops);
+    return ldx >= numNativeLoops;
+  }
+
  /// Returns true if given tensor iterates *only* in the given tensor
  /// expression. For the output tensor, this defines a "simply dynamic"
  /// operation [Bik96]. For instance: a(i) *= 2.0 or a(i) += a(i) for
@ -345,6 +374,7 @@ private:
  const unsigned outTensor;
  const unsigned syntheticTensor;
  const unsigned numTensors;
+  const unsigned numNativeLoops;
  const unsigned numLoops;
  bool hasSparseOut;
  // Map that converts pair<tensor id, loop id> to the corresponding dimension
--- a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp
@ -329,6 +329,12 @@ Operation *SparseTensorLoopEmitter::enterLoopOverTensorAtDim(
  return loop;
 }

+Operation *SparseTensorLoopEmitter::enterFilterLoopOverTensorAtDim(
+    OpBuilder &builder, Location loc, size_t tid, size_t dim, AffineExpr affine,
+    MutableArrayRef<Value> reduc) {
+  llvm_unreachable("need to be implemented");
+}
+
 void SparseTensorLoopEmitter::genDenseAffineAddressAtCurLevel(
    OpBuilder &builder, Location loc, size_t tid, size_t dim,
    AffineExpr affine) {
--- a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h
@ -406,6 +406,11 @@ public:
                                      ArrayRef<size_t> extraTids = {},
                                      ArrayRef<size_t> extraDims = {});

+  Operation *enterFilterLoopOverTensorAtDim(OpBuilder &builder, Location loc,
+                                            size_t tid, size_t dim,
+                                            AffineExpr affine,
+                                            MutableArrayRef<Value> reduc = {});
+
  void genDenseAffineAddressAtCurLevel(OpBuilder &builder, Location loc,
                                       size_t tid, size_t dim,
                                       AffineExpr affine);
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
@ -155,8 +155,11 @@ static AffineMap permute(MLIRContext *context, AffineMap m,
 /// Helper method to inspect affine expressions. Rejects cases where the
 /// same index is used more than once. Also rejects compound affine
 /// expressions in sparse dimensions.
+/// filterIdx stores the current filter loop idx should be used for the next
+/// compound affine sparse level, and it will be incremented by one when
+/// used.
 static bool findAffine(Merger &merger, unsigned tensor, unsigned dim,
-                       AffineExpr a, DimLevelType dlt,
+                       AffineExpr a, DimLevelType dlt, unsigned &filterLdx,
                       bool setLvlFormat = true) {
  switch (a.getKind()) {
  case AffineExprKind::DimId: {
@ -169,22 +172,68 @@ static bool findAffine(Merger &merger, unsigned tensor, unsigned dim,
    return true;
  }
  case AffineExprKind::Add:
-  case AffineExprKind::Mul: {
-    if (!isDenseDLT(dlt))
-      return false; // compound only in dense dim
-    auto binOp = a.cast<AffineBinaryOpExpr>();
-    // We do not set dim level format for affine expresssion like d0 + d1 on
-    // both loop index at d0 and d1,
-    return findAffine(merger, tensor, dim, binOp.getLHS(), dlt, false) &&
-           findAffine(merger, tensor, dim, binOp.getRHS(), dlt, false);
+  case AffineExprKind::Mul:
+  case AffineExprKind::Constant: {
+    if (!isDenseDLT(dlt) && setLvlFormat) {
+      assert(isUndefDLT(merger.getDimLevelType(tensor, filterLdx)));
+      // Use a filter loop for sparse affine expression.
+      merger.setDimAndDimLevelType(tensor, filterLdx++, dim, dlt);
+    }
+
+    if (auto binOp = a.dyn_cast<AffineBinaryOpExpr>()) {
+      // We do not set dim level format for affine expresssion like d0 + d1 on
+      // either loop index at d0 or d1.
+      // We continue the recursion merely to check whether current affine is
+      // admissible or not.
+      return findAffine(merger, tensor, dim, binOp.getLHS(), dlt, filterLdx,
+                        false) &&
+             findAffine(merger, tensor, dim, binOp.getRHS(), dlt, filterLdx,
+                        false);
+    }
+    // Falls through when it is a constant Affine
+    return true;
  }
-  case AffineExprKind::Constant:
-    return isDenseDLT(dlt); // const only in dense dim
  default:
    return false;
  }
 }

+/// Get the total number of compound affine expressions in affineMap that are
+/// attached to the given tensor. For the following inputs:
+///
+/// affineMap = (d0, d1, d2) => (d0 + d1, d2)
+/// tensor = ["compressed", "compressed"]
+///
+/// Returns 1 (because the first level is compressed and its corresponding
+/// affineMap is d0 + d1)
+static unsigned getNumCompoundAffineOnSparseDims(AffineMap affineMap,
+                                                 Value tensor) {
+  unsigned num = 0;
+  auto enc = getSparseTensorEncoding(tensor.getType());
+  if (enc) {
+    ArrayRef<AffineExpr> exps = affineMap.getResults();
+    for (unsigned rank = 0; rank < exps.size(); rank++) {
+      auto aidx = toOrigDim(enc, rank);
+      auto affine = exps[aidx];
+      if (!affine.isa<AffineDimExpr>())
+        if (!isDenseDLT(getDimLevelType(enc, rank)))
+          num++;
+    }
+  }
+
+  return num;
+}
+
+/// Get the total number of compound affine expressions attached on a sparse
+/// level in the given GenericOp.
+static unsigned getNumCompoundAffineOnSparseDims(linalg::GenericOp op) {
+  unsigned num = 0;
+  for (OpOperand &t : op->getOpOperands())
+    num += getNumCompoundAffineOnSparseDims(op.getMatchingIndexingMap(&t),
+                                            t.get());
+  return num;
+}
+
 /// Helper method to inspect sparse encodings in the tensor types.
 /// Fills the per-dimension sparsity information for all tensors.
 /// Returns true if the sparse annotations and affine subscript
@ -192,19 +241,22 @@ static bool findAffine(Merger &merger, unsigned tensor, unsigned dim,
 /// no annotations are found or inadmissible constructs occur.
 static bool findSparseAnnotations(Merger &merger, linalg::GenericOp op) {
  bool annotated = false;
+  unsigned filterLdx = merger.getFilterLoopStartingIdx();
  for (OpOperand &t : op->getOpOperands()) {
    auto map = op.getMatchingIndexingMap(&t);
    auto enc = getSparseTensorEncoding(t.get().getType());
    if (enc)
      annotated = true;
    assert(map.getNumResults() == op.getRank(&t));
+
    for (unsigned d = 0, rank = map.getNumResults(); d < rank; d++) {
      unsigned tensor = t.getOperandNumber();
      AffineExpr a = map.getResult(toOrigDim(enc, d));
-      if (!findAffine(merger, tensor, d, a, getDimLevelType(enc, d)))
+      if (!findAffine(merger, tensor, d, a, getDimLevelType(enc, d), filterLdx))
        return false; // inadmissible affine expression
    }
  }
+  assert(filterLdx == merger.getNumLoops());
  return annotated;
 }

@ -214,34 +266,58 @@ static bool findSparseAnnotations(Merger &merger, linalg::GenericOp op) {
 /// latest possible index.
 static bool topSortOptimal(unsigned n,
                           ArrayRef<utils::IteratorType> iteratorTypes,
-                           std::vector<unsigned> &topSort,
+                           const Merger &merger, std::vector<unsigned> &topSort,
                           std::vector<unsigned> &inDegree,
                           std::vector<std::vector<bool>> &adjM) {
-  std::vector<unsigned> redIt; // reduce iterator with 0 degree
-  std::vector<unsigned> parIt; // parallel iterator with 0 degree
+  std::vector<unsigned> redIt;    // reduce iterator with 0 degree
+  std::vector<unsigned> parIt;    // parallel iterator with 0 degree
+  std::vector<unsigned> filterIt; // filter loop with 0 degree
  for (unsigned i = 0; i < n; i++) {
    if (inDegree[i] == 0) {
-      if (linalg::isReductionIterator(iteratorTypes[i]))
+      if (merger.isFilterLoop(i))
+        filterIt.push_back(i);
+      else if (linalg::isReductionIterator(iteratorTypes[i]))
        redIt.push_back(i);
      else
        parIt.push_back(i);
    }
  }

-  while (!redIt.empty() || !parIt.empty()) {
-    // We always choose parallel iterator if there is any.
-    auto &it = !parIt.empty() ? parIt : redIt;
+  while (!redIt.empty() || !parIt.empty() || !filterIt.empty()) {
+    // We always choose in order of filter loop -> parallel loop -> reduction
+    // loop because
+    // 1. Putting reduction loop early might make the loop sequence
+    // inadmissible.
+    // 2. Filter loops should be put as early as possible for better
+    // performance, since only one (if any) iteration will carry the
+    // computation. E.g., for (1 to N)
+    //  for (1 to M)
+    //    for (1 to K)
+    //      if (xxx)
+    //        O(X) computation  => O(NMK+NMX) time complexity
+    //
+    // By putting the filter loop one level up, we got
+    //
+    // for (1 to N)
+    //  for (1 to K)
+    //    if (xxx)
+    //      for (1 to M)
+    //        O(X) computation  => O(NK+NMX) time complexity
+    auto &it = !filterIt.empty() ? filterIt : (!parIt.empty() ? parIt : redIt);
    auto src = it.back();
    topSort.push_back(src);
    it.pop_back();
    // Update in-degree, and push 0-degree node into worklist.
-    for (unsigned dst = 0; dst < n; dst++)
+    for (unsigned dst = 0; dst < n; dst++) {
      if (adjM[src][dst] && --inDegree[dst] == 0) {
-        if (linalg::isReductionIterator(iteratorTypes[dst]))
+        if (merger.isFilterLoop(dst))
+          filterIt.push_back(dst);
+        else if (linalg::isReductionIterator(iteratorTypes[dst]))
          redIt.push_back(dst);
        else
          parIt.push_back(dst);
      }
+    }
  }
  return topSort.size() == n;
 }
@ -340,7 +416,7 @@ static bool computeIterationGraph(Merger &merger, linalg::GenericOp op,
                                  OpOperand *skip = nullptr) {
  // Set up an n x n from/to adjacency matrix of the iteration graph
  // for the implicit loop indices i_0 .. i_n-1.
-  unsigned n = op.getNumLoops();
+  unsigned n = merger.getNumLoops();
  std::vector<std::vector<bool>> adjM(n, std::vector<bool>(n, false));
  std::vector<unsigned> inDegree(n, 0); // in-degree of each node.
  auto iteratorTypes = op.getIteratorTypesArray();
@ -352,7 +428,7 @@ static bool computeIterationGraph(Merger &merger, linalg::GenericOp op,
    // Get map and encoding.
    auto map = op.getMatchingIndexingMap(&t);
    auto enc = getSparseTensorEncoding(t.get().getType());
-    assert(map.getNumDims() == n);
+    assert(map.getNumDims() + getNumCompoundAffineOnSparseDims(op) == n);
    // Skip dense tensor constraints when not requested.
    if (!(mask & SortMask::kIncludeDense) && !enc)
      continue;
@ -364,6 +440,19 @@ static bool computeIterationGraph(Merger &merger, linalg::GenericOp op,
      AffineExpr ta = map.getResult(toOrigDim(enc, d));
      Optional<unsigned> tldx = merger.getLoopIdx(t.getOperandNumber(), d);

+      // Filter loops should be constructed after all the dependent loops,
+      // i.e., d0 + d1 < filter_loop(d0 + d1)
+      if (tldx && merger.isFilterLoop(tldx.value())) {
+        assert(!ta.isa<AffineDimExpr>() &&
+               !isDenseDLT(getDimLevelType(enc, d)));
+        addAffineOrderings(adjM, inDegree, ta, AffineExpr(), llvm::None, tldx);
+        // Now that the ordering of affine expression is captured by filter
+        // loop idx, we only need to ensure the affine ordering against filter
+        // loop. Thus, we reset the affine express to nil here to mark it as
+        // resolved.
+        ta = AffineExpr();
+      }
+
      if (d > 0) {
        AffineExpr fa = map.getResult(toOrigDim(enc, d - 1));
        Optional<unsigned> fldx =
@ -377,6 +466,11 @@ static bool computeIterationGraph(Merger &merger, linalg::GenericOp op,
        if (!(mask & SortMask::kIncludeDense))
          tryLoosenAffineDenseConstraints(op, fldx, fa, tldx, ta);

+        // (d0 + d1) < (d2 + d3), or
+        // filter_loop_d-1 < (d2 + d3), or
+        // (d0 + d1) < filter_loop_d, or
+        // filter_loop_d-1 < filter_loop_d depending on whether fa/ta is reset
+        // above.
        addAffineOrderings(adjM, inDegree, fa, ta, fldx, tldx);
      }
    }
@ -402,7 +496,7 @@ static bool computeIterationGraph(Merger &merger, linalg::GenericOp op,
  // Report failure for a cyclic iteration graph.
  topSort.clear();
  topSort.reserve(n);
-  return topSortOptimal(n, iteratorTypes, topSort, inDegree, adjM);
+  return topSortOptimal(n, iteratorTypes, merger, topSort, inDegree, adjM);
 }

 /// Returns true if tensor materializes uninitialized into the computation.
@ -430,9 +524,8 @@ static bool isAdmissibleTensorExp(Merger &merger, linalg::GenericOp op,
  // An all-dense annotated "sparse" output tensor becomes a linearized random
  // access 1-dim memref. Also admissible since insertions cannot occur.
  bool allDense = true;
-  auto iteratorTypes = op.getIteratorTypesArray();
-  unsigned numLoops = iteratorTypes.size();
-  for (unsigned i = 0; i < numLoops; i++)
+  unsigned numLoops = merger.getNumLoops(); // numNativeLoops + numFilterLoops
+  for (unsigned i = 0; i < merger.getNumLoops(); i++)
    if (isCompressedDLT(merger.getDimLevelType(tensor, i)) ||
        isSingletonDLT(merger.getDimLevelType(tensor, i))) {
      allDense = false;
@ -443,19 +536,31 @@ static bool isAdmissibleTensorExp(Merger &merger, linalg::GenericOp op,
    }
  if (allDense)
    return true;
+
+  // TODO: support compound affine expression on sparse output.
+  if (getNumCompoundAffineOnSparseDims(op.getMatchingIndexingMap(lhs),
+                                       lhs->get()) != 0)
+    return false;
+
  // A tensor expression with a sparse output tensor that changes its values
  // but not its nonzero structure, an operation called "simply dynamic" in
  // [Bik96,Ch9], is also admissible without special codegen.
  if (merger.isSingleCondition(tensor, exp))
    return true;
+
  // Accept "truly dynamic" if the output tensor materializes uninitialized
  // into the computation and insertions occur in lexicographic index order.
  if (isMaterializing(lhs->get())) {
+    auto iteratorTypes = op.getIteratorTypesArray();
    unsigned nest = 0;
    for (unsigned i = 0; i < numLoops; i++) {
-      if (linalg::isReductionIterator(iteratorTypes[topSort[i]]))
-        break; // terminate at first reduction
-      nest++;
+      if (!merger.isFilterLoop(topSort[i])) {
+        // We only count non-filter loops as filter loops should be considered
+        // as a special type of parallel loops.
+        if (linalg::isReductionIterator(iteratorTypes[topSort[i]]))
+          break; // terminate at first reduction
+        nest++;
+      }
    }
    // Determine admissible dynamic insertion situations:
    // (1) fully injective, since there are no reductions,
@ -878,7 +983,14 @@ static void genInvariants(Merger &merger, CodeGen &codegen, OpBuilder &builder,
    auto enc = getSparseTensorEncoding(t.get().getType());
    for (unsigned d = 0, rank = map.getNumResults(); d < rank; d++) {
      AffineExpr a = map.getResult(toOrigDim(enc, d));
-      if (!isInvariantAffine(codegen, a, ldx, atLevel))
+      Optional<unsigned> sldx = merger.getLoopIdx(t.getOperandNumber(), d);
+      if (sldx && merger.isFilterLoop(sldx.value())) {
+        if (!codegen.getLoopIdxValue(sldx.value()))
+          // The filter loops has not been constructed.
+          return;
+        if (sldx.value() == ldx)
+          atLevel = true;
+      } else if (!isInvariantAffine(codegen, a, ldx, atLevel))
        return; // still in play
    }
    // All exhausted at this level (atLevel denotes exactly at this level).
@ -1003,6 +1115,16 @@ static Operation *genFor(Merger &merger, CodeGen &codegen, OpBuilder &builder,

  Operation *loop =
      genLoopBoundary(codegen, merger, [&](MutableArrayRef<Value> reduc) {
+        if (merger.isFilterLoop(idx)) {
+          assert(isSparse);
+          OpOperand *t = &op->getOpOperand(tid);
+          auto enc = getSparseTensorEncoding(t->get().getType());
+          // Retrieves the affine expression for the filter loop.
+          AffineExpr a =
+              op.getMatchingIndexingMap(t).getResult(toOrigDim(enc, dim));
+          return codegen.loopEmitter.enterFilterLoopOverTensorAtDim(
+              builder, loc, tid, dim, a, reduc);
+        }
        return codegen.loopEmitter.enterLoopOverTensorAtDim(
            builder, loc, tid, dim, reduc, isParallel, extraTids, extraDims);
      }).value();
@ -1488,7 +1610,8 @@ public:
      return failure();
    unsigned numTensors = op->getNumOperands();
    unsigned numLoops = op.getNumLoops();
-    Merger merger(numTensors, numLoops);
+    unsigned numFilterLoops = getNumCompoundAffineOnSparseDims(op);
+    Merger merger(numTensors, numLoops, numFilterLoops);
    if (!findSparseAnnotations(merger, op))
      return failure();

--- a/mlir/unittests/Dialect/SparseTensor/MergerTest.cpp
+++ b/mlir/unittests/Dialect/SparseTensor/MergerTest.cpp
@ -127,7 +127,7 @@ class MergerTestBase : public ::testing::Test {
 protected:
  MergerTestBase(unsigned numTensors, unsigned numLoops)
      : numTensors(numTensors), numLoops(numLoops),
-        merger(numTensors, numLoops) {}
+        merger(numTensors, numLoops, /*numFilterLoops=*/0) {}

  ///
  /// Expression construction helpers.