[mlir][sparse] accept affine subscripts in outer dimensions of dense memrefs

This relaxes vectorization of dense memrefs a bit so that affine expressions
are allowed in more outer dimensions. Vectorization of non-unit-stride
references is disabled, though, since it seems ineffective anyway.

Reviewed By: bixia

Differential Revision: https://reviews.llvm.org/D111469
This commit is contained in:
Aart Bik 2021-10-08 16:13:29 -07:00
parent 011d8633eb
commit 849f016ce8
2 changed files with 97 additions and 12 deletions

View File

@ -929,22 +929,23 @@ static bool isParallelFor(CodeGen &codegen, bool isOuter, bool isReduction,
llvm_unreachable("unexpected parallelization strategy");
}
/// Checks unit strides for dense tensors. The iteration graph may have ignored
/// Checks unit stride for dense tensors. The iteration graph may have ignored
/// dense access patterns in order to avoid cycles (sparse access patterns are
/// always placed innermost), but that means dense access has become strided.
/// For now, we reject vectorization of such cases.
/// TODO: implement strided load/stores on dense arrays
/// This prevents effective vectorization.
static bool denseUnitStrides(Merger &merger, linalg::GenericOp op,
unsigned ldx) {
unsigned idx) {
for (OpOperand *t : op.getInputAndOutputOperands()) {
if (!getSparseTensorEncoding(t->get().getType())) {
auto map = op.getTiedIndexingMap(t);
for (unsigned d = 0, rank = map.getNumResults(); d < rank; d++) {
AffineExpr a = map.getResult(d);
if (a.getKind() != AffineExprKind::DimId)
return false; // very conservative
unsigned idx = a.cast<AffineDimExpr>().getPosition();
if (idx == ldx && d != rank - 1)
// Report non-unit stride if innermost index appears at an outer
// dimension (true non-unit stride) or if the innermost index appears
// in a compound subscript in the innermost dimension. Even if the
// latter is unit stride, it does not play well with scatter/gather.
if (a.isFunctionOfDim(idx) &&
((d != rank - 1) || (a.getKind() != AffineExprKind::DimId)))
return false;
}
}

View File

@ -1,10 +1,10 @@
// RUN: mlir-opt %s -sparsification="vectorization-strategy=0 vl=16" -split-input-file | \
// RUN: mlir-opt %s -sparsification="vectorization-strategy=0 vl=16" -cse -split-input-file | \
// RUN: FileCheck %s --check-prefix=CHECK-VEC0
// RUN: mlir-opt %s -sparsification="vectorization-strategy=1 vl=16" -split-input-file | \
// RUN: mlir-opt %s -sparsification="vectorization-strategy=1 vl=16" -cse -split-input-file | \
// RUN: FileCheck %s --check-prefix=CHECK-VEC1
// RUN: mlir-opt %s -sparsification="vectorization-strategy=2 vl=16" -split-input-file | \
// RUN: mlir-opt %s -sparsification="vectorization-strategy=2 vl=16" -cse -split-input-file | \
// RUN: FileCheck %s --check-prefix=CHECK-VEC2
// RUN: mlir-opt %s -sparsification="vectorization-strategy=2 vl=16 enable-simd-index32=true" -split-input-file | \
// RUN: mlir-opt %s -sparsification="vectorization-strategy=2 vl=16 enable-simd-index32=true" -cse -split-input-file | \
// RUN: FileCheck %s --check-prefix=CHECK-VEC3
#DenseVector = #sparse_tensor.encoding<{ dimLevelType = [ "dense" ] }>
@ -386,3 +386,87 @@ func @mul_ds(%arga: tensor<512x1024xf32, #SparseMatrix>, %argb: tensor<512x1024x
} -> tensor<512x1024xf32>
return %0 : tensor<512x1024xf32>
}
// -----
// Matrix encoding whose first dimension is "dense" and whose second dimension
// is "compressed".
#SparseMatrix = #sparse_tensor.encoding<{dimLevelType = ["dense","compressed"]}>
// Kernel trait with two indexing maps: the first uses the plain subscripts
// (i,j), the second uses the affine subscript i+1 in its outer dimension.
// Both loops are parallel; per the doc string the kernel computes
// X(i+1,j) += A(i,j).
#trait_affine = {
indexing_maps = [
affine_map<(i,j) -> (i,j)>,
affine_map<(i,j) -> (i+1,j)>
],
iterator_types = ["parallel","parallel"],
doc = "X(i+1,j) += A(i,j)"
}
//
// CHECK-VEC0-LABEL: func @add_dense
// CHECK-VEC0-DAG: %[[c0:.*]] = constant 0 : index
// CHECK-VEC0-DAG: %[[c1:.*]] = constant 1 : index
// CHECK-VEC0-DAG: %[[c32:.*]] = constant 32 : index
// CHECK-VEC0: scf.for %[[i:.*]] = %[[c0]] to %[[c32]] step %[[c1]] {
// CHECK-VEC0: %[[lo:.*]] = memref.load %{{.*}}[%[[i]]] : memref<?xindex>
// CHECK-VEC0: %[[i1:.*]] = addi %[[i]], %[[c1]] : index
// CHECK-VEC0: %[[hi:.*]] = memref.load %{{.*}}[%[[i1]]] : memref<?xindex>
// CHECK-VEC0: scf.for %[[jj:.*]] = %[[lo]] to %[[hi]] step %[[c1]] {
// CHECK-VEC0: %[[j:.*]] = memref.load %{{.*}}[%[[jj]]] : memref<?xindex>
// CHECK-VEC0: %[[x:.*]] = memref.load %{{.*}}[%[[i1]], %[[j]]] : memref<33x64xf64>
// CHECK-VEC0: %[[a:.*]] = memref.load %{{.*}}[%[[jj]]] : memref<?xf64>
// CHECK-VEC0: %[[s:.*]] = addf %[[x]], %[[a]] : f64
// CHECK-VEC0: memref.store %[[s]], %{{.*}}[%[[i1]], %[[j]]] : memref<33x64xf64>
// CHECK-VEC0: }
// CHECK-VEC0: }
// CHECK-VEC0: return
//
// CHECK-VEC1-LABEL: func @add_dense
// CHECK-VEC1-DAG: %[[c0:.*]] = constant 0 : index
// CHECK-VEC1-DAG: %[[c1:.*]] = constant 1 : index
// CHECK-VEC1-DAG: %[[c32:.*]] = constant 32 : index
// CHECK-VEC1: scf.for %[[i:.*]] = %[[c0]] to %[[c32]] step %[[c1]] {
// CHECK-VEC1: %[[lo:.*]] = memref.load %{{.*}}[%[[i]]] : memref<?xindex>
// CHECK-VEC1: %[[i1:.*]] = addi %[[i]], %[[c1]] : index
// CHECK-VEC1: %[[hi:.*]] = memref.load %{{.*}}[%[[i1]]] : memref<?xindex>
// CHECK-VEC1: scf.for %[[jj:.*]] = %[[lo]] to %[[hi]] step %[[c1]] {
// CHECK-VEC1: %[[j:.*]] = memref.load %{{.*}}[%[[jj]]] : memref<?xindex>
// CHECK-VEC1: %[[x:.*]] = memref.load %{{.*}}[%[[i1]], %[[j]]] : memref<33x64xf64>
// CHECK-VEC1: %[[a:.*]] = memref.load %{{.*}}[%[[jj]]] : memref<?xf64>
// CHECK-VEC1: %[[s:.*]] = addf %[[x]], %[[a]] : f64
// CHECK-VEC1: memref.store %[[s]], %{{.*}}[%[[i1]], %[[j]]] : memref<33x64xf64>
// CHECK-VEC1: }
// CHECK-VEC1: }
// CHECK-VEC1: return
//
// CHECK-VEC2: #[[$map:.*]] = affine_map<(d0, d1)[s0] -> (16, d0 - d1)
// CHECK-VEC2-LABEL: func @add_dense
// CHECK-VEC2-DAG: %[[c0:.*]] = constant 0 : index
// CHECK-VEC2-DAG: %[[c1:.*]] = constant 1 : index
// CHECK-VEC2-DAG: %[[c16:.*]] = constant 16 : index
// CHECK-VEC2-DAG: %[[c32:.*]] = constant 32 : index
// CHECK-VEC2: scf.for %[[i:.*]] = %[[c0]] to %[[c32]] step %[[c1]] {
// CHECK-VEC2: %[[lo:.*]] = memref.load %{{.*}}[%[[i]]] : memref<?xindex>
// CHECK-VEC2: %[[i1:.*]] = addi %[[i]], %[[c1]] : index
// CHECK-VEC2: %[[hi:.*]] = memref.load %{{.*}}[%[[i1]]] : memref<?xindex>
// CHECK-VEC2: scf.for %[[jj:.*]] = %[[lo]] to %[[hi]] step %[[c16]] {
// CHECK-VEC2: %[[sub:.*]] = affine.min #[[$map]](%[[hi]], %[[jj]])[%[[c16]]]
// CHECK-VEC2: %[[mask:.*]] = vector.create_mask %[[sub]] : vector<16xi1>
// CHECK-VEC2: %[[j:.*]] = vector.maskedload %{{.*}}[%[[jj]]], %[[mask]], %{{.*}} : memref<?xindex>
// CHECK-VEC2: %[[x:.*]] = vector.gather %{{.*}}[%[[i1]], %[[c0]]] [%[[j]]], %[[mask]], %{{.*}} : memref<33x64xf64>
// CHECK-VEC2: %[[a:.*]] = vector.maskedload %{{.*}}[%[[jj]]], %[[mask]], %{{.*}} : memref<?xf64>
// CHECK-VEC2: %[[s:.*]] = addf %[[x]], %[[a]] : vector<16xf64>
// CHECK-VEC2: vector.scatter %{{.*}}[%[[i1]], %[[c0]]] [%[[j]]], %[[mask]], %[[s]] : memref<33x64xf64>
// CHECK-VEC2: }
// CHECK-VEC2: }
// CHECK-VEC2: return
//
// Test kernel: adds each element of the 32x64 #SparseMatrix input %arga to the
// 33x64 dense tensor %argx (marked linalg.inplaceable) through #trait_affine,
// whose second indexing map subscripts the outer dimension with i+1. The body
// yields %x + %a, and the result of the linalg.generic op is returned.
func @add_dense(%arga: tensor<32x64xf64, #SparseMatrix>,
%argx: tensor<33x64xf64> {linalg.inplaceable = true}) -> tensor<33x64xf64> {
%0 = linalg.generic #trait_affine
ins(%arga: tensor<32x64xf64, #SparseMatrix>)
outs(%argx: tensor<33x64xf64>) {
^bb(%a: f64, %x: f64):
%0 = addf %x, %a : f64
linalg.yield %0 : f64
} -> tensor<33x64xf64>
return %0 : tensor<33x64xf64>
}