forked from OSchip/llvm-project
[mlir][sparse] accept affine subscripts in outer dimensions of dense memrefs
This relaxes vectorization of dense memrefs a bit, so that affine expressions are now allowed in the outer dimensions of dense subscripts. Vectorization of non-unit-stride references remains disabled, though, since it seems ineffective anyway.

Reviewed By: bixia

Differential Revision: https://reviews.llvm.org/D111469
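For illustration only (not part of this commit), the sketch below shows how the relaxed unit-stride check on dense operands is meant to classify a few indexing maps, following the comment added to denseUnitStrides() in the diff; it assumes j is the innermost loop index, and the map names are invented:

// Accepted: the affine expression is confined to an outer dimension, while the
// innermost dimension is just the plain innermost index j (unit stride).
#map_outer_affine = affine_map<(i, j) -> (i + 1, j)>

// Reported as non-unit stride (vectorization rejected): the innermost index j
// appears in an outer dimension, a true non-unit-stride access.
#map_transposed = affine_map<(i, j) -> (j, i)>

// Also rejected: the innermost index appears in a compound subscript in the
// innermost dimension; even if the stride is unit, it does not play well with
// scatter/gather.
#map_compound_inner = affine_map<(i, j) -> (i, j + 1)>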
This commit is contained in:
parent
011d8633eb
commit
849f016ce8
@@ -929,22 +929,23 @@ static bool isParallelFor(CodeGen &codegen, bool isOuter, bool isReduction,
   llvm_unreachable("unexpected parallelization strategy");
 }
 
-/// Checks unit strides for dense tensors. The iteration graph may have ignored
+/// Checks unit stride for dense tensors. The iteration graph may have ignored
 /// dense access patterns in order to avoid cycles (sparse access patterns are
 /// always placed innermost), but that means dense access has become strided.
-/// For now, we reject vectorization of such cases.
-/// TODO: implement strided load/stores on dense arrays
+/// This prevents effective vectorization.
 static bool denseUnitStrides(Merger &merger, linalg::GenericOp op,
-                             unsigned ldx) {
+                             unsigned idx) {
   for (OpOperand *t : op.getInputAndOutputOperands()) {
     if (!getSparseTensorEncoding(t->get().getType())) {
       auto map = op.getTiedIndexingMap(t);
       for (unsigned d = 0, rank = map.getNumResults(); d < rank; d++) {
         AffineExpr a = map.getResult(d);
-        if (a.getKind() != AffineExprKind::DimId)
-          return false; // very conservative
-        unsigned idx = a.cast<AffineDimExpr>().getPosition();
-        if (idx == ldx && d != rank - 1)
-          return false; // not innermost
+        // Report non-unit stride if innermost index appears at an outer
+        // dimension (true non-unit stride) or if the innermost index appears
+        // in a compound subscript in the innermost dimension. Even if the
+        // latter is unit stride, it does not play well with scatter/gather.
+        if (a.isFunctionOfDim(idx) &&
+            ((d != rank - 1) || (a.getKind() != AffineExprKind::DimId)))
+          return false;
       }
     }

@@ -1,10 +1,10 @@
-// RUN: mlir-opt %s -sparsification="vectorization-strategy=0 vl=16" -split-input-file | \
+// RUN: mlir-opt %s -sparsification="vectorization-strategy=0 vl=16" -cse -split-input-file | \
 // RUN: FileCheck %s --check-prefix=CHECK-VEC0
-// RUN: mlir-opt %s -sparsification="vectorization-strategy=1 vl=16" -split-input-file | \
+// RUN: mlir-opt %s -sparsification="vectorization-strategy=1 vl=16" -cse -split-input-file | \
 // RUN: FileCheck %s --check-prefix=CHECK-VEC1
-// RUN: mlir-opt %s -sparsification="vectorization-strategy=2 vl=16" -split-input-file | \
+// RUN: mlir-opt %s -sparsification="vectorization-strategy=2 vl=16" -cse -split-input-file | \
 // RUN: FileCheck %s --check-prefix=CHECK-VEC2
-// RUN: mlir-opt %s -sparsification="vectorization-strategy=2 vl=16 enable-simd-index32=true" -split-input-file | \
+// RUN: mlir-opt %s -sparsification="vectorization-strategy=2 vl=16 enable-simd-index32=true" -cse -split-input-file | \
 // RUN: FileCheck %s --check-prefix=CHECK-VEC3
 
 #DenseVector = #sparse_tensor.encoding<{ dimLevelType = [ "dense" ] }>

@@ -386,3 +386,87 @@ func @mul_ds(%arga: tensor<512x1024xf32, #SparseMatrix>, %argb: tensor<512x1024x
   } -> tensor<512x1024xf32>
   return %0 : tensor<512x1024xf32>
 }
+
+// -----
+
+#SparseMatrix = #sparse_tensor.encoding<{dimLevelType = ["dense","compressed"]}>
+
+#trait_affine = {
+  indexing_maps = [
+    affine_map<(i,j) -> (i,j)>,
+    affine_map<(i,j) -> (i+1,j)>
+  ],
+  iterator_types = ["parallel","parallel"],
+  doc = "X(i+1,j) += A(i,j)"
+}
+
+//
+// CHECK-VEC0-LABEL: func @add_dense
+// CHECK-VEC0-DAG:   %[[c0:.*]] = constant 0 : index
+// CHECK-VEC0-DAG:   %[[c1:.*]] = constant 1 : index
+// CHECK-VEC0-DAG:   %[[c32:.*]] = constant 32 : index
+// CHECK-VEC0:       scf.for %[[i:.*]] = %[[c0]] to %[[c32]] step %[[c1]] {
+// CHECK-VEC0:         %[[lo:.*]] = memref.load %{{.*}}[%[[i]]] : memref<?xindex>
+// CHECK-VEC0:         %[[i1:.*]] = addi %[[i]], %[[c1]] : index
+// CHECK-VEC0:         %[[hi:.*]] = memref.load %{{.*}}[%[[i1]]] : memref<?xindex>
+// CHECK-VEC0:         scf.for %[[jj:.*]] = %[[lo]] to %[[hi]] step %[[c1]] {
+// CHECK-VEC0:           %[[j:.*]] = memref.load %{{.*}}[%[[jj]]] : memref<?xindex>
+// CHECK-VEC0:           %[[x:.*]] = memref.load %{{.*}}[%[[i1]], %[[j]]] : memref<33x64xf64>
+// CHECK-VEC0:           %[[a:.*]] = memref.load %{{.*}}[%[[jj]]] : memref<?xf64>
+// CHECK-VEC0:           %[[s:.*]] = addf %[[x]], %[[a]] : f64
+// CHECK-VEC0:           memref.store %[[s]], %{{.*}}[%[[i1]], %[[j]]] : memref<33x64xf64>
+// CHECK-VEC0:         }
+// CHECK-VEC0:       }
+// CHECK-VEC0:       return
+//
+// CHECK-VEC1-LABEL: func @add_dense
+// CHECK-VEC1-DAG:   %[[c0:.*]] = constant 0 : index
+// CHECK-VEC1-DAG:   %[[c1:.*]] = constant 1 : index
+// CHECK-VEC1-DAG:   %[[c32:.*]] = constant 32 : index
+// CHECK-VEC1:       scf.for %[[i:.*]] = %[[c0]] to %[[c32]] step %[[c1]] {
+// CHECK-VEC1:         %[[lo:.*]] = memref.load %{{.*}}[%[[i]]] : memref<?xindex>
+// CHECK-VEC1:         %[[i1:.*]] = addi %[[i]], %[[c1]] : index
+// CHECK-VEC1:         %[[hi:.*]] = memref.load %{{.*}}[%[[i1]]] : memref<?xindex>
+// CHECK-VEC1:         scf.for %[[jj:.*]] = %[[lo]] to %[[hi]] step %[[c1]] {
+// CHECK-VEC1:           %[[j:.*]] = memref.load %{{.*}}[%[[jj]]] : memref<?xindex>
+// CHECK-VEC1:           %[[x:.*]] = memref.load %{{.*}}[%[[i1]], %[[j]]] : memref<33x64xf64>
+// CHECK-VEC1:           %[[a:.*]] = memref.load %{{.*}}[%[[jj]]] : memref<?xf64>
+// CHECK-VEC1:           %[[s:.*]] = addf %[[x]], %[[a]] : f64
+// CHECK-VEC1:           memref.store %[[s]], %{{.*}}[%[[i1]], %[[j]]] : memref<33x64xf64>
+// CHECK-VEC1:         }
+// CHECK-VEC1:       }
+// CHECK-VEC1:       return
+//
+// CHECK-VEC2: #[[$map:.*]] = affine_map<(d0, d1)[s0] -> (16, d0 - d1)
+// CHECK-VEC2-LABEL: func @add_dense
+// CHECK-VEC2-DAG:   %[[c0:.*]] = constant 0 : index
+// CHECK-VEC2-DAG:   %[[c1:.*]] = constant 1 : index
+// CHECK-VEC2-DAG:   %[[c16:.*]] = constant 16 : index
+// CHECK-VEC2-DAG:   %[[c32:.*]] = constant 32 : index
+// CHECK-VEC2:       scf.for %[[i:.*]] = %[[c0]] to %[[c32]] step %[[c1]] {
+// CHECK-VEC2:         %[[lo:.*]] = memref.load %{{.*}}[%[[i]]] : memref<?xindex>
+// CHECK-VEC2:         %[[i1:.*]] = addi %[[i]], %[[c1]] : index
+// CHECK-VEC2:         %[[hi:.*]] = memref.load %{{.*}}[%[[i1]]] : memref<?xindex>
+// CHECK-VEC2:         scf.for %[[jj:.*]] = %[[lo]] to %[[hi]] step %[[c16]] {
+// CHECK-VEC2:           %[[sub:.*]] = affine.min #[[$map]](%[[hi]], %[[jj]])[%[[c16]]]
+// CHECK-VEC2:           %[[mask:.*]] = vector.create_mask %[[sub]] : vector<16xi1>
+// CHECK-VEC2:           %[[j:.*]] = vector.maskedload %{{.*}}[%[[jj]]], %[[mask]], %{{.*}} : memref<?xindex>
+// CHECK-VEC2:           %[[x:.*]] = vector.gather %{{.*}}[%[[i1]], %[[c0]]] [%[[j]]], %[[mask]], %{{.*}} : memref<33x64xf64>
+// CHECK-VEC2:           %[[a:.*]] = vector.maskedload %{{.*}}[%[[jj]]], %[[mask]], %{{.*}} : memref<?xf64>
+// CHECK-VEC2:           %[[s:.*]] = addf %[[x]], %[[a]] : vector<16xf64>
+// CHECK-VEC2:           vector.scatter %{{.*}}[%[[i1]], %[[c0]]] [%[[j]]], %[[mask]], %[[s]] : memref<33x64xf64>
+// CHECK-VEC2:         }
+// CHECK-VEC2:       }
+// CHECK-VEC2:       return
+//
+func @add_dense(%arga: tensor<32x64xf64, #SparseMatrix>,
+                %argx: tensor<33x64xf64> {linalg.inplaceable = true}) -> tensor<33x64xf64> {
+  %0 = linalg.generic #trait_affine
+     ins(%arga: tensor<32x64xf64, #SparseMatrix>)
+    outs(%argx: tensor<33x64xf64>) {
+    ^bb(%a: f64, %x: f64):
+      %0 = addf %x, %a : f64
+      linalg.yield %0 : f64
+  } -> tensor<33x64xf64>
+  return %0 : tensor<33x64xf64>
+}