[mlir][sparse] add parallelization strategies to sparse compiler

This CL adds the ability to request different parallelization strategies
for the generated code. Every "parallel" loop is a candidate, and is
converted to a parallel op if it is an actual for-loop (not a while-loop)
and the strategy allows dense/sparse outer/inner parallelization.

This will connect directly with the work of @ezhulenev on parallel loops.

Still TBD: vectorization strategy
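
For example, using the new test pass below, strategy 2 requests outer-loop
parallelization for any storage type:

  mlir-opt <input.mlir> -test-sparsification="parallelization-strategy=2"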

Reviewed By: penpornk

Differential Revision: https://reviews.llvm.org/D91978
Aart Bik 2020-11-24 15:36:10 -08:00
parent 23dc04981b
commit 5c4e397e6c
4 changed files with 388 additions and 52 deletions


@@ -783,9 +783,62 @@ LogicalResult applyStagedPatterns(
//===----------------------------------------------------------------------===//
// Support for sparse tensor code generation.
//
// The sparse compiler part of MLIR lowers a tensor expression formulated as a
// Linalg operation into a sequence of loops depending on what dimensions of the
// tensors are marked dense or sparse. The generated code distinguishes between:
// (1) for-loops that iterate over a single dense dimension,
// (2) for-loops that iterate over a single sparse dimension,
// (3) while-loops that co-iterate over several sparse dimensions.
// The for-loops may be subsequently optimized for parallel or vector execution.
//
// For more details, see the Dialect/Linalg/Transforms/Sparsification.cpp file.
//===----------------------------------------------------------------------===//
void populateSparsificationPatterns(MLIRContext *context,
OwningRewritePatternList &patterns);
/// Defines a parallelization strategy. Any implicit loop in the Linalg
/// operation that is marked "parallel" (thus not "reduction") is a candidate
/// for parallelization. The loop is made parallel if (1) allowed by the
/// strategy (e.g., AnyStorageOuterLoop considers either a dense or sparse
/// outermost loop only), and (2) the generated code is an actual for-loop
/// (and not a co-iterating while-loop).
enum class SparseParallelizationStrategy {
kNone,
kDenseOuterLoop,
kAnyStorageOuterLoop,
kDenseAnyLoop,
kAnyStorageAnyLoop
// TODO: support reduction parallelization too?
};
/// Defines a vectorization strategy. Any implicit inner loop in the Linalg
/// operation is a candidate (full SIMD for "parallel" loops and horizontal
/// SIMD for "reduction" loops). A loop is actually vectorized if (1) allowed
/// by the strategy, and (2) the emitted code is an actual for-loop (and not
/// a co-iterating while-loop).
enum class SparseVectorizationStrategy {
kNone,
kDenseInnerLoop,
kAnyStorageInnerLoop
};
/// Sparsification options.
struct SparsificationOptions {
SparsificationOptions(SparseParallelizationStrategy p,
SparseVectorizationStrategy v, unsigned vl)
: parallelizationStrategy(p), vectorizationStrategy(v), vectorLength(vl) {
}
SparsificationOptions()
: SparsificationOptions(SparseParallelizationStrategy::kNone,
SparseVectorizationStrategy::kNone, 1u) {}
SparseParallelizationStrategy parallelizationStrategy;
SparseVectorizationStrategy vectorizationStrategy;
unsigned vectorLength;
};
/// Set up sparsification rewriting rules with the given options.
void populateSparsificationPatterns(
MLIRContext *context, OwningRewritePatternList &patterns,
const SparsificationOptions &options = SparsificationOptions());
} // namespace linalg
} // namespace mlir
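
As a usage sketch (not part of this commit, and assuming these declarations
live in the same Linalg Transforms.h header as the applyStagedPatterns
context above), a client could wire up the new options as follows:

  #include "mlir/Dialect/Linalg/Transforms/Transforms.h"

  static void addSparsePatterns(mlir::MLIRContext *context,
                                mlir::OwningRewritePatternList &patterns) {
    // Request parallel outer loops for both dense and sparse storage.
    mlir::linalg::SparsificationOptions options(
        mlir::linalg::SparseParallelizationStrategy::kAnyStorageOuterLoop,
        mlir::linalg::SparseVectorizationStrategy::kNone, /*vectorLength=*/1);
    mlir::linalg::populateSparsificationPatterns(context, patterns, options);
  }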


@@ -235,22 +235,30 @@ private:
// Code generation.
struct CodeGen {
CodeGen(unsigned numTensors, unsigned numLoops)
: loops(numLoops), sizes(numLoops), buffers(numTensors),
CodeGen(linalg::SparsificationOptions o, unsigned numTensors,
unsigned numLoops)
: options(o), loops(numLoops), sizes(numLoops), buffers(numTensors),
pointers(numTensors, std::vector<Value>(numLoops)),
indices(numTensors, std::vector<Value>(numLoops)),
highs(numTensors, std::vector<Value>(numLoops)),
pidxs(numTensors, std::vector<Value>(numLoops)),
idxs(numTensors, std::vector<Value>(numLoops)) {}
// Universal dense indices and upper bounds (by index).
// Sparsification options.
linalg::SparsificationOptions options;
// Universal dense indices and upper bounds (by index). The loops array
// is updated with the value of the universal dense index in the current
// loop. The sizes array is set once with the inferred dimension sizes.
std::vector<Value> loops;
std::vector<Value> sizes;
// Buffers for storing dense and sparse numerical values (by tensor).
// This array is set once during bufferization of all tensors.
std::vector<Value> buffers;
// Sparse storage schemes (1-D): pointers and indices (by tensor and index).
// This array is set once during bufferization of all sparse tensors.
std::vector<std::vector<Value>> pointers;
std::vector<std::vector<Value>> indices;
// Sparse iteration information (by tensor and index).
// Sparse iteration information (by tensor and index). These arrays
// are updated to remain current within the current loop.
std::vector<std::vector<Value>> highs;
std::vector<std::vector<Value>> pidxs;
std::vector<std::vector<Value>> idxs;
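
Aside: the per-tensor "pointers" and "indices" arrays above encode the usual
compressed storage scheme, where each sparse dimension stores segment bounds
and coordinates. A minimal standalone C++ illustration of that scheme (plain
CSR with made-up data, not compiler code):

  #include <cstdio>
  #include <vector>

  int main() {
    // 3x4 matrix with nonzeros A(0,1)=2, A(1,0)=3, A(1,3)=4, A(2,2)=5.
    std::vector<int> ptr = {0, 1, 3, 4};     // row i spans [ptr[i], ptr[i+1])
    std::vector<int> ind = {1, 0, 3, 2};     // column index of each nonzero
    std::vector<double> val = {2, 3, 4, 5};  // numerical values buffer
    for (int i = 0; i < 3; i++)                 // dense loop over rows
      for (int p = ptr[i]; p < ptr[i + 1]; p++) // sparse loop within row i
        std::printf("A(%d,%d) = %g\n", i, ind[p], val[p]);
    return 0;
  }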
@@ -388,7 +396,7 @@ static unsigned buildLattices(Merger &merger, linalg::GenericOp op,
unsigned exp, unsigned idx) {
Kind kind = merger.exp(exp).kind;
if (kind == Kind::kTensor || kind == Kind::kInvariant) {
// Either the index is really used in the tensor expression, or it it
// Either the index is really used in the tensor expression, or it is
// set to the "non-existing dense index" in that dimension. Invariant
// expressions borrow the output tensor indices.
unsigned s = merger.addSet();
@@ -573,38 +581,81 @@ static bool genInit(Merger &merger, CodeGen &codegen, PatternRewriter &rewriter,
return needsUniv;
}
/// Generates a for-loop or a while-loop, depending on whether it implements
/// singleton iteration or co-iteration over the given conjunction.
static void genLoop(Merger &merger, CodeGen &codegen, PatternRewriter &rewriter,
linalg::GenericOp op, unsigned idx, bool needsUniv,
llvm::BitVector &indices, scf::ForOp &forOp,
scf::WhileOp &whileOp) {
Location loc = op.getLoc();
/// Generates a for-loop on a single index.
static Operation *genFor(Merger &merger, CodeGen &codegen,
PatternRewriter &rewriter, linalg::GenericOp op,
bool isOuter, unsigned idx, llvm::BitVector &indices) {
unsigned fb = indices.find_first();
unsigned tensor = merger.tensor(fb);
assert(idx == merger.index(fb));
// Emit a for-loop for a single index.
if (indices.count() == 1) {
unsigned fb = indices.find_first();
unsigned tensor = merger.tensor(fb);
assert(idx == merger.index(fb));
// Emit a sparse for-loop or a dense for-loop.
Value one = rewriter.create<ConstantIndexOp>(loc, 1);
if (merger.isSparseBit(fb)) {
forOp = rewriter.create<scf::ForOp>(loc, codegen.pidxs[tensor][idx],
codegen.highs[tensor][idx], one);
codegen.pidxs[tensor][idx] = forOp.getInductionVar();
} else {
forOp = rewriter.create<scf::ForOp>(loc, codegen.loops[idx],
codegen.sizes[idx], one);
codegen.loops[idx] = forOp.getInductionVar();
}
rewriter.setInsertionPointToStart(forOp.getBody());
return;
// Parallelization strategy. Any implicit loop in the Linalg operation that
// is marked "parallel" is a candidate. Whether it is actually converted to
// a parallel operation depends on the requested strategy.
auto iteratorTypes = op.iterator_types().getValue();
bool isSparse = merger.isSparseBit(fb);
bool isParallel = linalg::isParallelIteratorType(iteratorTypes[idx]);
switch (codegen.options.parallelizationStrategy) {
case linalg::SparseParallelizationStrategy::kNone:
isParallel = false;
break;
case linalg::SparseParallelizationStrategy::kDenseOuterLoop:
isParallel &= isOuter && !isSparse;
break;
case linalg::SparseParallelizationStrategy::kAnyStorageOuterLoop:
isParallel &= isOuter;
break;
case linalg::SparseParallelizationStrategy::kDenseAnyLoop:
isParallel &= !isSparse;
break;
case linalg::SparseParallelizationStrategy::kAnyStorageAnyLoop:
break;
}
// Otherwise, emit a while-loop for co-iteration.
Type indexType = rewriter.getIndexType();
// Loop bounds and increment.
Location loc = op.getLoc();
Value lo;
Value hi;
Value step = rewriter.create<ConstantIndexOp>(loc, 1);
Value index;
if (isSparse) {
lo = codegen.pidxs[tensor][idx];
hi = codegen.highs[tensor][idx];
} else {
lo = codegen.loops[idx];
hi = codegen.sizes[idx];
}
// Emit a parallel loop.
if (isParallel) {
scf::ParallelOp parOp = rewriter.create<scf::ParallelOp>(loc, lo, hi, step);
if (isSparse)
codegen.pidxs[tensor][idx] = parOp.getInductionVars()[0];
else
codegen.loops[idx] = parOp.getInductionVars()[0];
rewriter.setInsertionPointToStart(parOp.getBody());
return parOp;
}
// Emit a sequential loop.
scf::ForOp forOp = rewriter.create<scf::ForOp>(loc, lo, hi, step);
if (isSparse)
codegen.pidxs[tensor][idx] = forOp.getInductionVar();
else
codegen.loops[idx] = forOp.getInductionVar();
rewriter.setInsertionPointToStart(forOp.getBody());
return forOp;
}
/// Emit a while-loop for co-iteration over multiple indices.
static Operation *genWhile(Merger &merger, CodeGen &codegen,
PatternRewriter &rewriter, linalg::GenericOp op,
unsigned idx, bool needsUniv,
llvm::BitVector &indices) {
SmallVector<Type, 4> types;
SmallVector<Value, 4> operands;
// Construct the while-loop with a parameter for each index.
Type indexType = rewriter.getIndexType();
for (unsigned b = 0, be = indices.size(); b < be; b++) {
if (indices[b] && merger.isSparseBit(b)) {
unsigned tensor = merger.tensor(b);
@@ -617,9 +668,11 @@ static void genLoop(Merger &merger, CodeGen &codegen, PatternRewriter &rewriter,
types.push_back(indexType);
operands.push_back(codegen.loops[idx]);
}
whileOp = rewriter.create<scf::WhileOp>(loc, types, operands);
Location loc = op.getLoc();
scf::WhileOp whileOp = rewriter.create<scf::WhileOp>(loc, types, operands);
Block *before = rewriter.createBlock(&whileOp.before(), {}, types);
Block *after = rewriter.createBlock(&whileOp.after(), {}, types);
// Build the "before" region, which effectively consists
// of a conjunction of "i < upper" tests on all induction variables.
rewriter.setInsertionPointToStart(&whileOp.before().front());
@@ -641,6 +694,18 @@ static void genLoop(Merger &merger, CodeGen &codegen, PatternRewriter &rewriter,
assert(o == operands.size());
rewriter.create<scf::ConditionOp>(loc, cond, before->getArguments());
rewriter.setInsertionPointToStart(&whileOp.after().front());
return whileOp;
}
/// Generates a for-loop or a while-loop, depending on whether it implements
/// singleton iteration or co-iteration over the given conjunction.
static Operation *genLoop(Merger &merger, CodeGen &codegen,
PatternRewriter &rewriter, linalg::GenericOp op,
bool isOuter, unsigned idx, bool needsUniv,
llvm::BitVector &indices) {
if (indices.count() == 1)
return genFor(merger, codegen, rewriter, op, isOuter, idx, indices);
return genWhile(merger, codegen, rewriter, op, idx, needsUniv, indices);
}
/// Generates the local variables for this loop, consisting of the sparse
@@ -804,16 +869,16 @@ static void genStmt(Merger &merger, CodeGen &codegen, PatternRewriter &rewriter,
LatPoint lati = merger.lat(li);
// Emit loop.
scf::ForOp forOp;
scf::WhileOp whileOp;
llvm::BitVector indices = lati.bits;
optimizeIndices(merger, lsize, indices);
genLoop(merger, codegen, rewriter, op, idx, needsUniv, indices, forOp,
whileOp);
bool isOuter = at == 0;
Operation *loop = genLoop(merger, codegen, rewriter, op, isOuter, idx,
needsUniv, indices);
genLocals(merger, codegen, rewriter, op, topSort, at, needsUniv, lati.bits);
// Visit all lattices points with Li >= Lj to generate the
// loop-body, possibly with if statements for coiteration.
bool isWhile = dyn_cast<scf::WhileOp>(loop) != nullptr;
scf::IfOp ifOp;
for (unsigned lj : merger.set(lts)) {
if (li == lj || merger.latGT(li, lj)) {
@@ -823,22 +888,22 @@ static void genStmt(Merger &merger, CodeGen &codegen, PatternRewriter &rewriter,
if (merger.hasAnyOf(tmp, false))
continue; // dense exhausted within if/else
// Recurse into body of each branch.
if (whileOp)
if (isWhile)
genIf(merger, codegen, rewriter, op, idx, latj.bits, ifOp);
genStmt(merger, codegen, rewriter, op, topSort, latj.exp, at + 1);
}
}
// Wrap-up induction and restore insertion point.
if (forOp) {
needsUniv = false;
rewriter.setInsertionPointAfter(forOp);
} else {
if (isWhile) {
scf::WhileOp whileOp = cast<scf::WhileOp>(loop);
rewriter.setInsertionPointToEnd(&whileOp.after().front());
genWhileInduction(merger, codegen, rewriter, op, idx, needsUniv,
lati.bits, whileOp.results());
rewriter.setInsertionPointAfter(whileOp);
} else {
needsUniv = false;
}
rewriter.setInsertionPointAfter(loop);
}
}
@@ -846,7 +911,9 @@ namespace {
/// Sparse rewriting rule for generic Linalg operation.
struct GenericOpSparsifier : public OpRewritePattern<linalg::GenericOp> {
using OpRewritePattern<linalg::GenericOp>::OpRewritePattern;
public:
GenericOpSparsifier(MLIRContext *context, linalg::SparsificationOptions o)
: OpRewritePattern<linalg::GenericOp>(context), options(o) {}
LogicalResult matchAndRewrite(linalg::GenericOp op,
PatternRewriter &rewriter) const override {
@@ -878,7 +945,7 @@ struct GenericOpSparsifier : public OpRewritePattern<linalg::GenericOp> {
return failure(); // build failure
// Recursively generates code.
CodeGen codegen(numTensors, numLoops);
CodeGen codegen(options, numTensors, numLoops);
genBuffers(merger, codegen, rewriter, op);
genStmt(merger, codegen, rewriter, op, topSort, exp.getValue(), 0);
Value result =
@@ -886,13 +953,18 @@ struct GenericOpSparsifier : public OpRewritePattern<linalg::GenericOp> {
rewriter.replaceOp(op, result);
return success();
}
private:
/// Options to control sparse code generation.
linalg::SparsificationOptions options;
};
} // namespace
/// Populates the given patterns list with rewriting rules required for
/// the sparsification of linear algebra operations.
void mlir::linalg::populateSparsificationPatterns(
MLIRContext *context, OwningRewritePatternList &patterns) {
patterns.insert<GenericOpSparsifier>(context);
void linalg::populateSparsificationPatterns(
MLIRContext *context, OwningRewritePatternList &patterns,
const SparsificationOptions &options) {
patterns.insert<GenericOpSparsifier>(context, options);
}


@@ -0,0 +1,161 @@
// RUN: mlir-opt %s -test-sparsification="parallelization-strategy=0" | \
// RUN: FileCheck %s --check-prefix=CHECK-PAR0
// RUN: mlir-opt %s -test-sparsification="parallelization-strategy=1" | \
// RUN: FileCheck %s --check-prefix=CHECK-PAR1
// RUN: mlir-opt %s -test-sparsification="parallelization-strategy=2" | \
// RUN: FileCheck %s --check-prefix=CHECK-PAR2
// RUN: mlir-opt %s -test-sparsification="parallelization-strategy=3" | \
// RUN: FileCheck %s --check-prefix=CHECK-PAR3
// RUN: mlir-opt %s -test-sparsification="parallelization-strategy=4" | \
// RUN: FileCheck %s --check-prefix=CHECK-PAR4
#trait_dd = {
indexing_maps = [
affine_map<(i,j) -> (i,j)>, // A
affine_map<(i,j) -> (i,j)> // X (out)
],
sparse = [
[ "D", "D" ], // A
[ "D", "D" ] // X
],
iterator_types = ["parallel", "parallel"],
doc = "X(i,j) = A(i,j) * SCALE"
}
//
// CHECK-PAR0-LABEL: func @scale_dd
// CHECK-PAR0: scf.for
// CHECK-PAR0: scf.for
// CHECK-PAR0: return
//
// CHECK-PAR1-LABEL: func @scale_dd
// CHECK-PAR1: scf.parallel
// CHECK-PAR1: scf.for
// CHECK-PAR1: return
//
// CHECK-PAR2-LABEL: func @scale_dd
// CHECK-PAR2: scf.parallel
// CHECK-PAR2: scf.for
// CHECK-PAR2: return
//
// CHECK-PAR3-LABEL: func @scale_dd
// CHECK-PAR3: scf.parallel
// CHECK-PAR3: scf.parallel
// CHECK-PAR3: return
//
// CHECK-PAR4-LABEL: func @scale_dd
// CHECK-PAR4: scf.parallel
// CHECK-PAR4: scf.parallel
// CHECK-PAR4: return
//
func @scale_dd(%scale: f32, %arga: tensor<?x?xf32>) -> tensor<?x?xf32> {
%0 = linalg.generic #trait_dd
ins(%arga: tensor<?x?xf32>) {
^bb(%a: f32):
%0 = mulf %a, %scale : f32
linalg.yield %0 : f32
} -> tensor<?x?xf32>
return %0 : tensor<?x?xf32>
}
#trait_ss = {
indexing_maps = [
affine_map<(i,j) -> (i,j)>, // A
affine_map<(i,j) -> (i,j)> // X (out)
],
sparse = [
[ "S", "S" ], // A
[ "D", "D" ] // X
],
iterator_types = ["parallel", "parallel"],
doc = "X(i,j) = A(i,j) * SCALE"
}
//
// CHECK-PAR0-LABEL: func @scale_ss
// CHECK-PAR0: scf.for
// CHECK-PAR0: scf.for
// CHECK-PAR0: return
//
// CHECK-PAR1-LABEL: func @scale_ss
// CHECK-PAR1: scf.for
// CHECK-PAR1: scf.for
// CHECK-PAR1: return
//
// CHECK-PAR2-LABEL: func @scale_ss
// CHECK-PAR2: scf.parallel
// CHECK-PAR2: scf.for
// CHECK-PAR2: return
//
// CHECK-PAR3-LABEL: func @scale_ss
// CHECK-PAR3: scf.for
// CHECK-PAR3: scf.for
// CHECK-PAR3: return
//
// CHECK-PAR4-LABEL: func @scale_ss
// CHECK-PAR4: scf.parallel
// CHECK-PAR4: scf.parallel
// CHECK-PAR4: return
//
func @scale_ss(%scale: f32, %arga: tensor<?x?xf32>) -> tensor<?x?xf32> {
%0 = linalg.generic #trait_ss
ins(%arga: tensor<?x?xf32>) {
^bb(%a: f32):
%0 = mulf %a, %scale : f32
linalg.yield %0 : f32
} -> tensor<?x?xf32>
return %0 : tensor<?x?xf32>
}
#trait_matvec = {
indexing_maps = [
affine_map<(i,j) -> (i,j)>, // A
affine_map<(i,j) -> (j)>, // b
affine_map<(i,j) -> (i)> // x (out)
],
sparse = [
[ "D", "S" ], // A
[ "D" ], // b
[ "D" ] // x
],
iterator_types = ["parallel", "reduction"],
doc = "x(i) += A(i,j) * b(j)"
}
//
// CHECK-PAR0-LABEL: func @matvec
// CHECK-PAR0: scf.for
// CHECK-PAR0: scf.for
// CHECK-PAR0: return
//
// CHECK-PAR1-LABEL: func @matvec
// CHECK-PAR1: scf.parallel
// CHECK-PAR1: scf.for
// CHECK-PAR1: return
//
// CHECK-PAR2-LABEL: func @matvec
// CHECK-PAR2: scf.parallel
// CHECK-PAR2: scf.for
// CHECK-PAR2: return
//
// CHECK-PAR3-LABEL: func @matvec
// CHECK-PAR3: scf.parallel
// CHECK-PAR3: scf.for
// CHECK-PAR3: return
//
// CHECK-PAR4-LABEL: func @matvec
// CHECK-PAR4: scf.parallel
// CHECK-PAR4: scf.for
// CHECK-PAR4: return
//
func @matvec(%argA: tensor<16x32xf32>, %argb: tensor<32xf32>, %argx: tensor<16xf32>) -> tensor<16xf32> {
%0 = linalg.generic #trait_matvec
ins(%argA, %argb : tensor<16x32xf32>, tensor<32xf32>)
init(%argx : tensor<16xf32>) {
^bb(%A: f32, %b: f32, %x: f32):
%0 = mulf %A, %b : f32
%1 = addf %0, %x : f32
linalg.yield %1 : f32
} -> tensor<16xf32>
return %0 : tensor<16xf32>
}
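
To summarize the FileCheck expectations above, per strategy and kernel
(outer loop / inner loop); note that the "reduction" loop of matvec is
never a candidate for parallelization:

              scale_dd (D,D)        scale_ss (S,S)        matvec (D,S)
  PAR0        for / for             for / for             for / for
  PAR1        parallel / for        for / for             parallel / for
  PAR2        parallel / for        parallel / for        parallel / for
  PAR3        parallel / parallel   for / for             parallel / for
  PAR4        parallel / parallel   parallel / parallel   parallel / for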


@@ -16,13 +16,63 @@ namespace {
struct TestSparsification
: public PassWrapper<TestSparsification, FunctionPass> {
TestSparsification() = default;
TestSparsification(const TestSparsification &pass) {}
Option<int32_t> parallelization{
*this, "parallelization-strategy",
llvm::cl::desc("Set the parallelization strategy"), llvm::cl::init(0)};
Option<int32_t> vectorization{
*this, "vectorization-strategy",
llvm::cl::desc("Set the vectorization strategy"), llvm::cl::init(0)};
Option<int32_t> vectorLength{
*this, "vl", llvm::cl::desc("Set the vector length"), llvm::cl::init(1)};
/// Registers all dialects required by testing.
void getDependentDialects(DialectRegistry &registry) const override {
registry.insert<scf::SCFDialect>();
registry.insert<scf::SCFDialect, vector::VectorDialect>();
}
/// Returns parallelization strategy given on command line.
linalg::SparseParallelizationStrategy parallelOption() {
switch (parallelization) {
default:
return linalg::SparseParallelizationStrategy::kNone;
case 1:
return linalg::SparseParallelizationStrategy::kDenseOuterLoop;
case 2:
return linalg::SparseParallelizationStrategy::kAnyStorageOuterLoop;
case 3:
return linalg::SparseParallelizationStrategy::kDenseAnyLoop;
case 4:
return linalg::SparseParallelizationStrategy::kAnyStorageAnyLoop;
}
}
/// Returns vectorization strategy given on command line.
linalg::SparseVectorizationStrategy vectorOption() {
switch (vectorization) {
default:
return linalg::SparseVectorizationStrategy::kNone;
case 1:
return linalg::SparseVectorizationStrategy::kDenseInnerLoop;
case 2:
return linalg::SparseVectorizationStrategy::kAnyStorageInnerLoop;
}
}
/// Runs the test on a function.
void runOnFunction() override {
auto *ctx = &getContext();
OwningRewritePatternList patterns;
linalg::populateSparsificationPatterns(ctx, patterns);
// Translate strategy flags to strategy options.
linalg::SparsificationOptions options(parallelOption(), vectorOption(),
vectorLength);
// Apply rewriting.
linalg::populateSparsificationPatterns(ctx, patterns, options);
applyPatternsAndFoldGreedily(getFunction(), std::move(patterns));
}
};