diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp index 373f3d1391a2..bcebf0721c6c 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp @@ -929,22 +929,23 @@ static bool isParallelFor(CodeGen &codegen, bool isOuter, bool isReduction, llvm_unreachable("unexpected parallelization strategy"); } -/// Checks unit strides for dense tensors. The iteration graph may have ignored +/// Checks unit stride for dense tensors. The iteration graph may have ignored /// dense access patterns in order to avoid cycles (sparse access patterns are /// always placed innermost), but that means dense access has become strided. -/// For now, we reject vectorization of such cases. -/// TODO: implement strided load/stores on dense arrays +/// This prevents effective vectorization. static bool denseUnitStrides(Merger &merger, linalg::GenericOp op, - unsigned ldx) { + unsigned idx) { for (OpOperand *t : op.getInputAndOutputOperands()) { if (!getSparseTensorEncoding(t->get().getType())) { auto map = op.getTiedIndexingMap(t); for (unsigned d = 0, rank = map.getNumResults(); d < rank; d++) { AffineExpr a = map.getResult(d); - if (a.getKind() != AffineExprKind::DimId) - return false; // very conservative - unsigned idx = a.cast().getPosition(); - if (idx == ldx && d != rank - 1) + // Report non-unit stride if innermost index appears at an outer + // dimension (true non-unit stride) or if the innermost index appears + // in a compound subscript in the innermost dimension. Even if the + // latter is unit stride, it does not play well with scatter/gather. + if (a.isFunctionOfDim(idx) && + ((d != rank - 1) || (a.getKind() != AffineExprKind::DimId))) return false; } } diff --git a/mlir/test/Dialect/SparseTensor/sparse_vector.mlir b/mlir/test/Dialect/SparseTensor/sparse_vector.mlir index 2b2492fc1db1..46af303b6662 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_vector.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_vector.mlir @@ -1,10 +1,10 @@ -// RUN: mlir-opt %s -sparsification="vectorization-strategy=0 vl=16" -split-input-file | \ +// RUN: mlir-opt %s -sparsification="vectorization-strategy=0 vl=16" -cse -split-input-file | \ // RUN: FileCheck %s --check-prefix=CHECK-VEC0 -// RUN: mlir-opt %s -sparsification="vectorization-strategy=1 vl=16" -split-input-file | \ +// RUN: mlir-opt %s -sparsification="vectorization-strategy=1 vl=16" -cse -split-input-file | \ // RUN: FileCheck %s --check-prefix=CHECK-VEC1 -// RUN: mlir-opt %s -sparsification="vectorization-strategy=2 vl=16" -split-input-file | \ +// RUN: mlir-opt %s -sparsification="vectorization-strategy=2 vl=16" -cse -split-input-file | \ // RUN: FileCheck %s --check-prefix=CHECK-VEC2 -// RUN: mlir-opt %s -sparsification="vectorization-strategy=2 vl=16 enable-simd-index32=true" -split-input-file | \ +// RUN: mlir-opt %s -sparsification="vectorization-strategy=2 vl=16 enable-simd-index32=true" -cse -split-input-file | \ // RUN: FileCheck %s --check-prefix=CHECK-VEC3 #DenseVector = #sparse_tensor.encoding<{ dimLevelType = [ "dense" ] }> @@ -386,3 +386,87 @@ func @mul_ds(%arga: tensor<512x1024xf32, #SparseMatrix>, %argb: tensor<512x1024x } -> tensor<512x1024xf32> return %0 : tensor<512x1024xf32> } + +// ----- + +#SparseMatrix = #sparse_tensor.encoding<{dimLevelType = ["dense","compressed"]}> + +#trait_affine = { + indexing_maps = [ + affine_map<(i,j) -> (i,j)>, + affine_map<(i,j) -> (i+1,j)> + ], + iterator_types = ["parallel","parallel"], + doc = "X(i+1,j) += A(i,j)" +} + +// +// CHECK-VEC0-LABEL: func @add_dense +// CHECK-VEC0-DAG: %[[c0:.*]] = constant 0 : index +// CHECK-VEC0-DAG: %[[c1:.*]] = constant 1 : index +// CHECK-VEC0-DAG: %[[c32:.*]] = constant 32 : index +// CHECK-VEC0: scf.for %[[i:.*]] = %[[c0]] to %[[c32]] step %[[c1]] { +// CHECK-VEC0: %[[lo:.*]] = memref.load %{{.*}}[%[[i]]] : memref +// CHECK-VEC0: %[[i1:.*]] = addi %[[i]], %[[c1]] : index +// CHECK-VEC0: %[[hi:.*]] = memref.load %{{.*}}[%[[i1]]] : memref +// CHECK-VEC0: scf.for %[[jj:.*]] = %[[lo]] to %[[hi]] step %[[c1]] { +// CHECK-VEC0: %[[j:.*]] = memref.load %{{.*}}[%[[jj]]] : memref +// CHECK-VEC0: %[[x:.*]] = memref.load %{{.*}}[%[[i1]], %[[j]]] : memref<33x64xf64> +// CHECK-VEC0: %[[a:.*]] = memref.load %{{.*}}[%[[jj]]] : memref +// CHECK-VEC0: %[[s:.*]] = addf %[[x]], %[[a]] : f64 +// CHECK-VEC0: memref.store %[[s]], %{{.*}}[%[[i1]], %[[j]]] : memref<33x64xf64> +// CHECK-VEC0: } +// CHECK-VEC0: } +// CHECK-VEC0: return +// +// CHECK-VEC1-LABEL: func @add_dense +// CHECK-VEC1-DAG: %[[c0:.*]] = constant 0 : index +// CHECK-VEC1-DAG: %[[c1:.*]] = constant 1 : index +// CHECK-VEC1-DAG: %[[c32:.*]] = constant 32 : index +// CHECK-VEC1: scf.for %[[i:.*]] = %[[c0]] to %[[c32]] step %[[c1]] { +// CHECK-VEC1: %[[lo:.*]] = memref.load %{{.*}}[%[[i]]] : memref +// CHECK-VEC1: %[[i1:.*]] = addi %[[i]], %[[c1]] : index +// CHECK-VEC1: %[[hi:.*]] = memref.load %{{.*}}[%[[i1]]] : memref +// CHECK-VEC1: scf.for %[[jj:.*]] = %[[lo]] to %[[hi]] step %[[c1]] { +// CHECK-VEC1: %[[j:.*]] = memref.load %{{.*}}[%[[jj]]] : memref +// CHECK-VEC1: %[[x:.*]] = memref.load %{{.*}}[%[[i1]], %[[j]]] : memref<33x64xf64> +// CHECK-VEC1: %[[a:.*]] = memref.load %{{.*}}[%[[jj]]] : memref +// CHECK-VEC1: %[[s:.*]] = addf %[[x]], %[[a]] : f64 +// CHECK-VEC1: memref.store %[[s]], %{{.*}}[%[[i1]], %[[j]]] : memref<33x64xf64> +// CHECK-VEC1: } +// CHECK-VEC1: } +// CHECK-VEC1: return +// +// CHECK-VEC2: #[[$map:.*]] = affine_map<(d0, d1)[s0] -> (16, d0 - d1) +// CHECK-VEC2-LABEL: func @add_dense +// CHECK-VEC2-DAG: %[[c0:.*]] = constant 0 : index +// CHECK-VEC2-DAG: %[[c1:.*]] = constant 1 : index +// CHECK-VEC2-DAG: %[[c16:.*]] = constant 16 : index +// CHECK-VEC2-DAG: %[[c32:.*]] = constant 32 : index +// CHECK-VEC2: scf.for %[[i:.*]] = %[[c0]] to %[[c32]] step %[[c1]] { +// CHECK-VEC2: %[[lo:.*]] = memref.load %{{.*}}[%[[i]]] : memref +// CHECK-VEC2: %[[i1:.*]] = addi %[[i]], %[[c1]] : index +// CHECK-VEC2: %[[hi:.*]] = memref.load %{{.*}}[%[[i1]]] : memref +// CHECK-VEC2: scf.for %[[jj:.*]] = %[[lo]] to %[[hi]] step %[[c16]] { +// CHECK-VEC2: %[[sub:.*]] = affine.min #[[$map]](%[[hi]], %[[jj]])[%[[c16]]] +// CHECK-VEC2: %[[mask:.*]] = vector.create_mask %[[sub]] : vector<16xi1> +// CHECK-VEC2: %[[j:.*]] = vector.maskedload %{{.*}}[%[[jj]]], %[[mask]], %{{.*}} : memref +// CHECK-VEC2: %[[x:.*]] = vector.gather %{{.*}}[%[[i1]], %[[c0]]] [%[[j]]], %[[mask]], %{{.*}} : memref<33x64xf64> +// CHECK-VEC2: %[[a:.*]] = vector.maskedload %{{.*}}[%[[jj]]], %[[mask]], %{{.*}} : memref +// CHECK-VEC2: %[[s:.*]] = addf %[[x]], %[[a]] : vector<16xf64> +// CHECK-VEC2: vector.scatter %{{.*}}[%[[i1]], %[[c0]]] [%[[j]]], %[[mask]], %[[s]] : memref<33x64xf64> +// CHECK-VEC2: } +// CHECK-VEC2: } +// CHECK-VEC2: return +// +func @add_dense(%arga: tensor<32x64xf64, #SparseMatrix>, + %argx: tensor<33x64xf64> {linalg.inplaceable = true}) -> tensor<33x64xf64> { + %0 = linalg.generic #trait_affine + ins(%arga: tensor<32x64xf64, #SparseMatrix>) + outs(%argx: tensor<33x64xf64>) { + ^bb(%a: f64, %x: f64): + %0 = addf %x, %a : f64 + linalg.yield %0 : f64 + } -> tensor<33x64xf64> + return %0 : tensor<33x64xf64> +}