[mlir][sparse] generalize reduction support in sparse compiler

Not just SUM, but now also PRODUCT, AND, OR, and XOR reductions are supported.
The MIN and MAX reductions are still to be done (this also depends on
recognizing these operations in cmp-select constructs).
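
For example, a reduction kernel of the following shape (an XOR reduction over a
sparse vector, distilled from the new integration test added below; the trait's
doc field is dropped for brevity) is now sparsified, and vectorized when
SIMDization is enabled:

  #SV = #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>

  #trait_reduction = {
    indexing_maps = [
      affine_map<(i) -> (i)>,  // a
      affine_map<(i) -> ()>    // x (scalar out)
    ],
    iterator_types = ["reduction"]
  }

  func @xor_reduction_i32(%arga: tensor<32xi32, #SV>,
                          %argx: tensor<i32>) -> tensor<i32> {
    %0 = linalg.generic #trait_reduction
      ins(%arga: tensor<32xi32, #SV>)
      outs(%argx: tensor<i32>) {
        ^bb(%a: i32, %x: i32):
          %0 = xor %x, %a : i32
          linalg.yield %0 : i32
    } -> tensor<i32>
    return %0 : tensor<i32>
  }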

Reviewed By: bixia

Differential Revision: https://reviews.llvm.org/D110203
Author: Aart Bik
Date: 2021-09-21 14:48:49 -07:00
Commit: 5da21338bc (parent: a85d7a56c7)
3 changed files with 342 additions and 27 deletions


@@ -28,11 +28,18 @@
using namespace mlir;
using namespace mlir::sparse_tensor;
//===----------------------------------------------------------------------===//
// Declarations of data structures.
//===----------------------------------------------------------------------===//
namespace {
// Iteration graph sorting.
enum SortMask { kSparseOnly = 0x0, kIncludeDense = 0x1, kIncludeUndef = 0x2 };
// Reduction kinds.
enum Reduction { kSum, kProduct, kAnd, kOr, kXor };
// Code generation.
struct CodeGen {
CodeGen(SparsificationOptions o, unsigned numTensors, unsigned numLoops)
@@ -68,6 +75,7 @@ struct CodeGen {
// is most effective; we could generalize to more outer and while-loops.
unsigned redExp;
Value redVal;
Reduction redKind;
// Current vector length and mask.
unsigned curVecLength;
Value curVecMask;
@@ -75,8 +83,12 @@
} // namespace
// Helper method to apply dimension ordering permutation.
static unsigned perm(SparseTensorEncodingAttr &enc, unsigned d) {
//===----------------------------------------------------------------------===//
// Sparse compiler analysis methods.
//===----------------------------------------------------------------------===//
/// Helper method to apply dimension ordering permutation.
static unsigned perm(const SparseTensorEncodingAttr &enc, unsigned d) {
if (enc) {
auto order = enc.getDimOrdering();
if (order) {
@@ -87,8 +99,8 @@ static unsigned perm(SparseTensorEncodingAttr &enc, unsigned d) {
return d;
}
// Helper method to translate dim level type to internal representation.
static Dim toDim(SparseTensorEncodingAttr &enc, unsigned d) {
/// Helper method to translate dim level type to internal representation.
static Dim toDim(const SparseTensorEncodingAttr &enc, unsigned d) {
if (enc) {
SparseTensorEncodingAttr::DimLevelType tp = enc.getDimLevelType()[d];
if (tp == SparseTensorEncodingAttr::DimLevelType::Compressed)
@@ -283,6 +295,83 @@ static bool isAdmissableTensorExp(Merger &merger, linalg::GenericOp op,
return false;
}
//===----------------------------------------------------------------------===//
// Sparse compiler synthesis methods.
//===----------------------------------------------------------------------===//
/// Maps reduction kind to name encoding.
static StringRef getReductionName(Reduction kind) {
switch (kind) {
case kSum:
return "add";
case kProduct:
return "mul";
case kAnd:
return "and";
case kOr:
return "or";
case kXor:
return "xor";
}
llvm_unreachable("unknown reduction kind");
}
/// Maps operation to reduction.
static Reduction getReduction(Kind kind) {
switch (kind) {
case Kind::kAddF:
case Kind::kAddI:
case Kind::kSubF:
case Kind::kSubI:
return kSum;
case Kind::kMulF:
case Kind::kMulI:
return kProduct;
case Kind::kAndI:
return kAnd;
case Kind::kOrI:
return kOr;
case Kind::kXorI:
return kXor;
default:
llvm_unreachable("unexpected reduction operator");
}
}
/// Generates an initial value for a vector reduction, following the scheme
/// given in Chapter 5 of "The Software Vectorization Handbook", where the
/// initial scalar value is correctly embedded in the vector reduction value,
/// and a straightforward horizontal reduction will complete the operation.
static Value genReductionInit(PatternRewriter &rewriter, Location loc,
Reduction kind, VectorType vtp, Value r) {
switch (kind) {
case kSum:
case kXor: {
// Initialize reduction vector to: | 0 | .. | 0 | r |
Attribute zero = rewriter.getZeroAttr(vtp);
Value vec = rewriter.create<ConstantOp>(loc, vtp, zero);
return rewriter.create<vector::InsertElementOp>(loc, r, vec, 0);
}
case kProduct: {
// Initialize reduction vector to: | 1 | .. | 1 | r |
Type etp = vtp.getElementType();
Attribute one;
if (etp.isa<FloatType>())
one = rewriter.getFloatAttr(etp, 1.0);
else
one = rewriter.getIntegerAttr(etp, 1);
Value vec =
rewriter.create<ConstantOp>(loc, vtp, DenseElementsAttr::get(vtp, one));
return rewriter.create<vector::InsertElementOp>(loc, r, vec, 0);
}
case kAnd:
case kOr:
// Initialize reduction vector to: | r | .. | r | r |
return rewriter.create<vector::BroadcastOp>(loc, vtp, r);
}
llvm_unreachable("unknown reduction kind");
}
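
// As a concrete sketch of this scheme (a rough illustration only: %r stands
// for the scalar start value, %acc for the vector accumulated by the loop,
// and vl = 4 is chosen for brevity while the tests below use 16), a PRODUCT
// reduction starts and finishes along these lines:
//
//   %i0   = constant 0 : i32
//   %ones = constant dense<1.000000e+00> : vector<4xf32>
//   // | 1 | 1 | 1 | r | : neutral element everywhere, scalar start in lane 0
//   %init = vector.insertelement %r, %ones[%i0 : i32] : vector<4xf32>
//   // ... the vectorized loop accumulates with mulf into %acc ...
//   %res  = vector.reduction "mul", %acc : vector<4xf32> into f32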
/// Maps sparse integer option to actual integral storage type.
static Type genIntType(PatternRewriter &rewriter, unsigned width) {
if (width == 0)
@@ -644,11 +733,15 @@ static Value genReductionStart(Merger &merger, CodeGen &codegen,
linalg::GenericOp op) {
if (codegen.redVal)
return codegen.redVal; // chained with previous for-loop
if (codegen.curVecLength > 1) {
// TODO: assumes + reductions for now
// Generate vector or scalar start of a reduction.
unsigned vl = codegen.curVecLength;
if (vl > 1) {
VectorType vtp = vectorType(codegen, codegen.buffers[codegen.redExp]);
return rewriter.create<ConstantOp>(op.getLoc(), vtp,
rewriter.getZeroAttr(vtp));
assert(!merger.exp(codegen.redExp).val);
codegen.curVecLength = 1;
Value load = genTensorLoad(merger, codegen, rewriter, op, codegen.redExp);
codegen.curVecLength = vl;
return genReductionInit(rewriter, op.getLoc(), codegen.redKind, vtp, load);
}
return genTensorLoad(merger, codegen, rewriter, op, codegen.redExp);
}
@@ -661,19 +754,12 @@ static void genReductionEnd(Merger &merger, CodeGen &codegen,
return;
assert(codegen.curVecLength == 1);
codegen.redVal = merger.exp(codegen.redExp).val = Value(); // end chain
// Generate vector or scalar end of a reduction.
if (auto vtp = red.getType().dyn_cast<VectorType>()) {
// TODO: assumes + reductions for now
StringAttr kind = rewriter.getStringAttr("add");
Value ld = genTensorLoad(merger, codegen, rewriter, op, codegen.redExp);
// Integer reductions don't accept an accumulator.
if (vtp.getElementType().isa<IntegerType>()) {
red = rewriter.create<vector::ReductionOp>(op.getLoc(), ld.getType(),
kind, red, ValueRange{});
red = rewriter.create<AddIOp>(op.getLoc(), red, ld);
} else {
red = rewriter.create<vector::ReductionOp>(op.getLoc(), ld.getType(),
kind, red, ld);
}
StringRef name = getReductionName(codegen.redKind);
StringAttr kind = rewriter.getStringAttr(name);
red = rewriter.create<vector::ReductionOp>(
op.getLoc(), vtp.getElementType(), kind, red, ValueRange{});
}
genTensorStore(merger, codegen, rewriter, op, red);
}
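
// Because the scalar start value is already folded into the initial vector by
// genReductionInit, the epilogue is now a single named horizontal reduction
// for every kind, without the earlier integer-only scalar fix-up. For an AND
// reduction at vl = 8, for instance, the generated end looks roughly like
// (%acc is a placeholder for the accumulated vector):
//
//   %res = vector.reduction "and", %acc : vector<8xi32> into i32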
@@ -725,7 +811,8 @@ static bool isInvariantAffine(const CodeGen &codegen, AffineExpr a,
/// Hoists loop invariant tensor loads for which indices have been exhausted.
static void genInvariants(Merger &merger, CodeGen &codegen,
PatternRewriter &rewriter, linalg::GenericOp op,
unsigned exp, unsigned ldx, bool hoist) {
unsigned exp, unsigned ldx, bool hoist,
Kind last = Kind::kTensor) {
if (exp == -1u)
return;
if (merger.exp(exp).kind == Kind::kTensor) {
@@ -743,6 +830,7 @@ static void genInvariants(Merger &merger, CodeGen &codegen,
OpOperand *lhs = op.getOutputOperand(0);
if (lhs == t) {
codegen.redExp = hoist ? exp : -1u;
codegen.redKind = getReduction(last);
} else if (atLevel) {
merger.exp(exp).val =
hoist ? genTensorLoad(merger, codegen, rewriter, op, exp) : Value();
@@ -751,10 +839,11 @@ static void genInvariants(Merger &merger, CodeGen &codegen,
// Traverse into the binary operations. Note that we only hoist
// tensor loads, since subsequent MLIR/LLVM passes know how to
// deal with all other kinds of derived loop invariants.
Kind last = merger.exp(exp).kind;
unsigned e0 = merger.exp(exp).children.e0;
unsigned e1 = merger.exp(exp).children.e1;
genInvariants(merger, codegen, rewriter, op, e0, ldx, hoist);
genInvariants(merger, codegen, rewriter, op, e1, ldx, hoist);
genInvariants(merger, codegen, rewriter, op, e0, ldx, hoist, last);
genInvariants(merger, codegen, rewriter, op, e1, ldx, hoist, last);
}
}
@@ -1233,6 +1322,10 @@ static void genResult(Merger &merger, CodeGen &codegen,
rewriter.replaceOp(op, result);
}
//===----------------------------------------------------------------------===//
// Sparse compiler rewriting methods.
//===----------------------------------------------------------------------===//
namespace {
/// Sparse rewriting rule for generic Linalg operation.


@@ -210,32 +210,38 @@ func @mul_s(%arga: tensor<1024xf32, #SparseVector>, %argb: tensor<1024xf32>, %ar
//
// CHECK-VEC1-LABEL: func @reduction_d
// CHECK-VEC1-DAG: %[[c0:.*]] = constant 0 : index
// CHECK-VEC1-DAG: %[[i0:.*]] = constant 0 : i32
// CHECK-VEC1-DAG: %[[c16:.*]] = constant 16 : index
// CHECK-VEC1-DAG: %[[c1024:.*]] = constant 1024 : index
// CHECK-VEC1-DAG: %[[v0:.*]] = constant dense<0.000000e+00> : vector<16xf32>
// CHECK-VEC1: %[[red:.*]] = scf.for %[[i:.*]] = %[[c0]] to %[[c1024]] step %[[c16]] iter_args(%[[red_in:.*]] = %[[v0]]) -> (vector<16xf32>) {
// CHECK-VEC1: %[[l:.*]] = memref.load %{{.*}}[] : memref<f32>
// CHECK-VEC1: %[[r:.*]] = vector.insertelement %[[l]], %[[v0]][%[[i0]] : i32] : vector<16xf32>
// CHECK-VEC1: %[[red:.*]] = scf.for %[[i:.*]] = %[[c0]] to %[[c1024]] step %[[c16]] iter_args(%[[red_in:.*]] = %[[r]]) -> (vector<16xf32>) {
// CHECK-VEC1: %[[la:.*]] = vector.load %{{.*}}[%[[i]]] : memref<?xf32>, vector<16xf32>
// CHECK-VEC1: %[[lb:.*]] = vector.load %{{.*}}[%[[i]]] : memref<1024xf32>, vector<16xf32>
// CHECK-VEC1: %[[m:.*]] = mulf %[[la]], %[[lb]] : vector<16xf32>
// CHECK-VEC1: %[[a:.*]] = addf %[[red_in]], %[[m]] : vector<16xf32>
// CHECK-VEC1: scf.yield %[[a]] : vector<16xf32>
// CHECK-VEC1: }
// CHECK-VEC1: %{{.*}} = vector.reduction "add", %[[red]], %{{.*}} : vector<16xf32> into f32
// CHECK-VEC1: %{{.*}} = vector.reduction "add", %[[red]] : vector<16xf32> into f32
// CHECK-VEC1: return
//
// CHECK-VEC2-LABEL: func @reduction_d
// CHECK-VEC2-DAG: %[[c0:.*]] = constant 0 : index
// CHECK-VEC2-DAG: %[[i0:.*]] = constant 0 : i32
// CHECK-VEC2-DAG: %[[c16:.*]] = constant 16 : index
// CHECK-VEC2-DAG: %[[c1024:.*]] = constant 1024 : index
// CHECK-VEC2-DAG: %[[v0:.*]] = constant dense<0.000000e+00> : vector<16xf32>
// CHECK-VEC2: %[[red:.*]] = scf.for %[[i:.*]] = %[[c0]] to %[[c1024]] step %[[c16]] iter_args(%[[red_in:.*]] = %[[v0]]) -> (vector<16xf32>) {
// CHECK-VEC2: %[[l:.*]] = memref.load %{{.*}}[] : memref<f32>
// CHECK-VEC2: %[[r:.*]] = vector.insertelement %[[l]], %[[v0]][%[[i0]] : i32] : vector<16xf32>
// CHECK-VEC2: %[[red:.*]] = scf.for %[[i:.*]] = %[[c0]] to %[[c1024]] step %[[c16]] iter_args(%[[red_in:.*]] = %[[r]]) -> (vector<16xf32>) {
// CHECK-VEC2: %[[la:.*]] = vector.load %{{.*}}[%[[i]]] : memref<?xf32>, vector<16xf32>
// CHECK-VEC2: %[[lb:.*]] = vector.load %{{.*}}[%[[i]]] : memref<1024xf32>, vector<16xf32>
// CHECK-VEC2: %[[m:.*]] = mulf %[[la]], %[[lb]] : vector<16xf32>
// CHECK-VEC2: %[[a:.*]] = addf %[[red_in]], %[[m]] : vector<16xf32>
// CHECK-VEC2: scf.yield %[[a]] : vector<16xf32>
// CHECK-VEC2: }
// CHECK-VEC2: %{{.*}} = vector.reduction "add", %[[red]], %{{.*}} : vector<16xf32> into f32
// CHECK-VEC2: %{{.*}} = vector.reduction "add", %[[red]] : vector<16xf32> into f32
// CHECK-VEC2: return
//
func @reduction_d(%arga: tensor<1024xf32, #DenseVector>, %argb: tensor<1024xf32>, %argx: tensor<f32>) -> tensor<f32> {


@@ -0,0 +1,216 @@
// RUN: mlir-opt %s \
// RUN: --linalg-generalize-named-ops --linalg-fuse-elementwise-ops \
// RUN: --sparsification --sparse-tensor-conversion \
// RUN: --convert-vector-to-scf --convert-scf-to-std \
// RUN: --func-bufferize --tensor-constant-bufferize --tensor-bufferize \
// RUN: --std-bufferize --finalizing-bufferize --lower-affine \
// RUN: --convert-vector-to-llvm --convert-memref-to-llvm \
// RUN: --convert-std-to-llvm --reconcile-unrealized-casts | \
// RUN: mlir-cpu-runner \
// RUN: -e entry -entry-point-result=void \
// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \
// RUN: FileCheck %s
//
// Do the same run, but now with SIMDization as well. This should not change the outcome.
//
// RUN: mlir-opt %s \
// RUN: --linalg-generalize-named-ops --linalg-fuse-elementwise-ops \
// RUN: --sparsification="vectorization-strategy=2 vl=8" --sparse-tensor-conversion \
// RUN: --convert-vector-to-scf --convert-scf-to-std \
// RUN: --func-bufferize --tensor-constant-bufferize --tensor-bufferize \
// RUN: --std-bufferize --finalizing-bufferize --lower-affine \
// RUN: --convert-vector-to-llvm --convert-memref-to-llvm \
// RUN: --convert-std-to-llvm --reconcile-unrealized-casts | \
// RUN: mlir-cpu-runner \
// RUN: -e entry -entry-point-result=void \
// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \
// RUN: FileCheck %s
#SV = #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>
#DV = #sparse_tensor.encoding<{ dimLevelType = [ "dense" ] }>
#trait_reduction = {
indexing_maps = [
affine_map<(i) -> (i)>, // a
affine_map<(i) -> ()> // x (scalar out)
],
iterator_types = ["reduction"],
doc = "x += OPER_i a(i)"
}
// An example of vector reductions.
module {
func @sum_reduction_i32(%arga: tensor<32xi32, #SV>,
%argx: tensor<i32>) -> tensor<i32> {
%0 = linalg.generic #trait_reduction
ins(%arga: tensor<32xi32, #SV>)
outs(%argx: tensor<i32>) {
^bb(%a: i32, %x: i32):
%0 = addi %x, %a : i32
linalg.yield %0 : i32
} -> tensor<i32>
return %0 : tensor<i32>
}
func @sum_reduction_f32(%arga: tensor<32xf32, #SV>,
%argx: tensor<f32>) -> tensor<f32> {
%0 = linalg.generic #trait_reduction
ins(%arga: tensor<32xf32, #SV>)
outs(%argx: tensor<f32>) {
^bb(%a: f32, %x: f32):
%0 = addf %x, %a : f32
linalg.yield %0 : f32
} -> tensor<f32>
return %0 : tensor<f32>
}
func @prod_reduction_i32(%arga: tensor<32xi32, #DV>,
%argx: tensor<i32>) -> tensor<i32> {
%0 = linalg.generic #trait_reduction
ins(%arga: tensor<32xi32, #DV>)
outs(%argx: tensor<i32>) {
^bb(%a: i32, %x: i32):
%0 = muli %x, %a : i32
linalg.yield %0 : i32
} -> tensor<i32>
return %0 : tensor<i32>
}
func @prod_reduction_f32(%arga: tensor<32xf32, #DV>,
%argx: tensor<f32>) -> tensor<f32> {
%0 = linalg.generic #trait_reduction
ins(%arga: tensor<32xf32, #DV>)
outs(%argx: tensor<f32>) {
^bb(%a: f32, %x: f32):
%0 = mulf %x, %a : f32
linalg.yield %0 : f32
} -> tensor<f32>
return %0 : tensor<f32>
}
func @and_reduction_i32(%arga: tensor<32xi32, #DV>,
%argx: tensor<i32>) -> tensor<i32> {
%0 = linalg.generic #trait_reduction
ins(%arga: tensor<32xi32, #DV>)
outs(%argx: tensor<i32>) {
^bb(%a: i32, %x: i32):
%0 = and %x, %a : i32
linalg.yield %0 : i32
} -> tensor<i32>
return %0 : tensor<i32>
}
func @or_reduction_i32(%arga: tensor<32xi32, #SV>,
%argx: tensor<i32>) -> tensor<i32> {
%0 = linalg.generic #trait_reduction
ins(%arga: tensor<32xi32, #SV>)
outs(%argx: tensor<i32>) {
^bb(%a: i32, %x: i32):
%0 = or %x, %a : i32
linalg.yield %0 : i32
} -> tensor<i32>
return %0 : tensor<i32>
}
func @xor_reduction_i32(%arga: tensor<32xi32, #SV>,
%argx: tensor<i32>) -> tensor<i32> {
%0 = linalg.generic #trait_reduction
ins(%arga: tensor<32xi32, #SV>)
outs(%argx: tensor<i32>) {
^bb(%a: i32, %x: i32):
%0 = xor %x, %a : i32
linalg.yield %0 : i32
} -> tensor<i32>
return %0 : tensor<i32>
}
func @dump_i32(%arg0 : tensor<i32>) {
%m = memref.buffer_cast %arg0 : memref<i32>
%v = memref.load %m[] : memref<i32>
vector.print %v : i32
return
}
func @dump_f32(%arg0 : tensor<f32>) {
%m = memref.buffer_cast %arg0 : memref<f32>
%v = memref.load %m[] : memref<f32>
vector.print %v : f32
return
}
func @entry() {
%ri = constant dense< 7 > : tensor<i32>
%rf = constant dense< 2.0 > : tensor<f32>
%c_0_i32 = constant dense<[
0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 4, 0, 0, 0,
0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0
]> : tensor<32xi32>
%c_0_f32 = constant dense<[
0.0, 1.0, 0.0, 0.0, 4.0, 0.0, 0.0, 0.0,
0.0, 0.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0, 2.5, 0.0, 0.0, 0.0,
2.0, 0.0, 0.0, 0.0, 0.0, 4.0, 0.0, 9.0
]> : tensor<32xf32>
%c_1_i32 = constant dense<[
1, 1, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 7, 3
]> : tensor<32xi32>
%c_1_f32 = constant dense<[
1.0, 1.0, 1.0, 3.5, 1.0, 1.0, 1.0, 1.0,
1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0,
1.0, 1.0, 1.0, 1.0, 3.0, 1.0, 1.0, 1.0,
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 4.0
]> : tensor<32xf32>
// Convert constants to annotated tensors.
%sparse_input_i32 = sparse_tensor.convert %c_0_i32
: tensor<32xi32> to tensor<32xi32, #SV>
%sparse_input_f32 = sparse_tensor.convert %c_0_f32
: tensor<32xf32> to tensor<32xf32, #SV>
%dense_input_i32 = sparse_tensor.convert %c_1_i32
: tensor<32xi32> to tensor<32xi32, #DV>
%dense_input_f32 = sparse_tensor.convert %c_1_f32
: tensor<32xf32> to tensor<32xf32, #DV>
// Call the kernels.
%0 = call @sum_reduction_i32(%sparse_input_i32, %ri)
: (tensor<32xi32, #SV>, tensor<i32>) -> tensor<i32>
%1 = call @sum_reduction_f32(%sparse_input_f32, %rf)
: (tensor<32xf32, #SV>, tensor<f32>) -> tensor<f32>
%2 = call @prod_reduction_i32(%dense_input_i32, %ri)
: (tensor<32xi32, #DV>, tensor<i32>) -> tensor<i32>
%3 = call @prod_reduction_f32(%dense_input_f32, %rf)
: (tensor<32xf32, #DV>, tensor<f32>) -> tensor<f32>
%4 = call @and_reduction_i32(%dense_input_i32, %ri)
: (tensor<32xi32, #DV>, tensor<i32>) -> tensor<i32>
%5 = call @or_reduction_i32(%sparse_input_i32, %ri)
: (tensor<32xi32, #SV>, tensor<i32>) -> tensor<i32>
%6 = call @xor_reduction_i32(%sparse_input_i32, %ri)
: (tensor<32xi32, #SV>, tensor<i32>) -> tensor<i32>
// Verify results.
//
// CHECK: 26
// CHECK: 27.5
// CHECK: 3087
// CHECK: 168
// CHECK: 1
// CHECK: 15
// CHECK: 10
//
call @dump_i32(%0) : (tensor<i32>) -> ()
call @dump_f32(%1) : (tensor<f32>) -> ()
call @dump_i32(%2) : (tensor<i32>) -> ()
call @dump_f32(%3) : (tensor<f32>) -> ()
call @dump_i32(%4) : (tensor<i32>) -> ()
call @dump_i32(%5) : (tensor<i32>) -> ()
call @dump_i32(%6) : (tensor<i32>) -> ()
return
}
}