[mlir][Linalg] Add support for tileAndDistribute on tensors.

scf.parallel is currently not a good fit for tiling on tensors.
Instead, provide a path to parallelism directly through scf.for.
For now, this transformation ignores the distribution scheme and always does a
block-cyclic mapping (where the block is the tile size).
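As a rough sketch (not part of this patch), the block-cyclic mapping rewrites
each tiled loop so that every processor starts at its own tile and strides over
the tiles of all processors; procId/nprocs stand for values such as
gpu.block_id/gpu.grid_dim, and all names below are purely illustrative:

  #include <cstdint> // int64_t

  // Conceptual sketch with plain integers; in the generated IR these become
  // SSA values and the arithmetic is emitted as std.muli / std.addi ops.
  struct Range1D { int64_t lb, ub, step; };
  Range1D mapBlockCyclic(Range1D tiledLoop, int64_t procId, int64_t nprocs) {
    // The tiled loop's step equals the tile size, so each processor starts at
    // its own tile and then strides over nprocs tiles at a time.
    return {tiledLoop.lb + procId * tiledLoop.step,
            tiledLoop.ub,
            nprocs * tiledLoop.step};
  }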

Differential revision: https://reviews.llvm.org/D90475
commit 7625742237 (parent e0c92c6c03)
Author: Nicolas Vasilache
Date:   2020-11-16 10:40:24 +00:00

8 changed files with 103 additions and 20 deletions


@@ -24,15 +24,15 @@ namespace edsc {
 /// Adapters for building loop nests using the builder and the location stored
 /// in ScopedContext. Actual builders are in scf::buildLoopNest.
-scf::ValueVector loopNestBuilder(ValueRange lbs, ValueRange ubs,
+scf::LoopNest loopNestBuilder(ValueRange lbs, ValueRange ubs,
                                  ValueRange steps,
                                  function_ref<void(ValueRange)> fun = nullptr);
-scf::ValueVector loopNestBuilder(Value lb, Value ub, Value step,
+scf::LoopNest loopNestBuilder(Value lb, Value ub, Value step,
                                  function_ref<void(Value)> fun = nullptr);
-scf::ValueVector loopNestBuilder(
+scf::LoopNest loopNestBuilder(
     Value lb, Value ub, Value step, ValueRange iterArgInitValues,
     function_ref<scf::ValueVector(Value, ValueRange)> fun = nullptr);
-scf::ValueVector loopNestBuilder(
+scf::LoopNest loopNestBuilder(
     ValueRange lbs, ValueRange ubs, ValueRange steps,
     ValueRange iterArgInitValues,
     function_ref<scf::ValueVector(ValueRange, ValueRange)> fun = nullptr);
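For context, a minimal usage sketch of the adapter after this change (not part
of the patch; it assumes an active edsc::ScopedContext and index Values lb, ub
and step already in scope):

  scf::LoopNest nest = edsc::loopNestBuilder(lb, ub, step, [&](Value iv) {
    // Emit the loop body here; the builder and location are taken from the
    // enclosing ScopedContext.
  });
  // The created scf::ForOp are available through nest.loops.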


@@ -51,6 +51,11 @@ ParallelOp getParallelForInductionVarOwner(Value val);
 /// An owning vector of values, handy to return from functions.
 using ValueVector = std::vector<Value>;
+using LoopVector = std::vector<scf::ForOp>;
+struct LoopNest {
+  ResultRange getResults() { return loops.front().getResults(); }
+  LoopVector loops;
+};

 /// Creates a perfect nest of "for" loops, i.e. all loops but the innermost
 /// contain only another loop and a terminator. The lower, upper bounds and
@@ -65,11 +70,12 @@ using ValueVector = std::vector<Value>;
 /// yielded from the loop body and forwarded back through the loop nest. If the
 /// function is not provided, the loop nest is not expected to have iteration
 /// arguments, the body of the innermost loop will be left empty, containing
-/// only the zero-operand terminator. Returns the values yielded by the
-/// outermost loop. If bound arrays are empty, the body builder will be called
+/// only the zero-operand terminator. Returns the LoopNest containing the list
+/// of perfectly nested scf::ForOp built during the call.
+/// If bound arrays are empty, the body builder will be called
 /// once to construct the IR outside of the loop with an empty list of induction
 /// variables.
-ValueVector buildLoopNest(
+LoopNest buildLoopNest(
     OpBuilder &builder, Location loc, ValueRange lbs, ValueRange ubs,
     ValueRange steps, ValueRange iterArgs,
     function_ref<ValueVector(OpBuilder &, Location, ValueRange, ValueRange)>
@@ -78,7 +84,8 @@ ValueVector buildLoopNest(
 /// A convenience version for building loop nests without iteration arguments
 /// (like for reductions). Does not take the initial value of reductions or
 /// expect the body building functions to return their current value.
-ValueVector buildLoopNest(OpBuilder &builder, Location loc, ValueRange lbs,
+/// The built nested scf::ForOp are returned in the LoopNest structure.
+LoopNest buildLoopNest(OpBuilder &builder, Location loc, ValueRange lbs,
                           ValueRange ubs, ValueRange steps,
                           function_ref<void(OpBuilder &, Location, ValueRange)>
                               bodyBuilder = nullptr);
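A minimal usage sketch of the updated API (not from the patch; builder, loc,
lb, ub, step and init are assumed to be existing Values in scope):

  scf::LoopNest nest = scf::buildLoopNest(
      builder, loc, {lb}, {ub}, {step}, {init},
      [](OpBuilder &b, Location nestedLoc, ValueRange ivs, ValueRange args) {
        // A real body would compute a new value; here the carried value is
        // yielded unchanged.
        return scf::ValueVector(args.begin(), args.end());
      });
  // The values yielded by the outermost loop are now reached via the struct.
  auto results = nest.getResults();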


@@ -24,6 +24,7 @@
 #include "mlir/IR/Matchers.h"
 #include "mlir/IR/OpImplementation.h"
 #include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/LoopUtils.h"

 using namespace mlir;
 using namespace mlir::linalg;
@@ -171,10 +172,27 @@ void GenerateLoopNest<scf::ForOp>::doit(
     ArrayRef<Range> loopRanges, ValueRange iterArgInitValues,
     ArrayRef<Attribute> iteratorTypes,
     function_ref<scf::ValueVector(ValueRange, ValueRange)> bodyBuilderFn,
-    Optional<LinalgLoopDistributionOptions>) {
+    Optional<LinalgLoopDistributionOptions> distributionOptions) {
+  // Create procInfo so it dominates the loops, if appropriate.
+  OpBuilder &builder = edsc::ScopedContext::getBuilderRef();
+  Location loc = edsc::ScopedContext::getLocation();
+  SmallVector<ProcInfo, 2> procInfo;
+  if (distributionOptions.hasValue())
+    procInfo = distributionOptions->procInfo(builder, loc, ArrayRef<Range>{});
+
   SmallVector<Value, 4> lbs, ubs, steps;
   unpackRanges(loopRanges, lbs, ubs, steps);
-  edsc::loopNestBuilder(lbs, ubs, steps, iterArgInitValues, bodyBuilderFn);
+  LoopNest loopNest =
+      edsc::loopNestBuilder(lbs, ubs, steps, iterArgInitValues, bodyBuilderFn);
+
+  if (!distributionOptions.hasValue() || loopNest.loops.empty())
+    return;
+
+  // TODO: support distributionMethod, which is currently ignored.
+  for (auto it : llvm::zip(loopNest.loops, procInfo,
+                           distributionOptions->distributionMethod))
+    mapLoopToProcessorIds(std::get<0>(it), std::get<1>(it).procId,
+                          std::get<1>(it).nprocs);
 }

 /// Specialization to build affine "for" nest.
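For reference, a minimal sketch (not from the patch) of what a procInfo
callback is expected to hand back, mirroring the getGpuProcIds helper used in
the test driver below; blockIdY, gridDimY, blockIdX and gridDimX are assumed to
be index Values created beforehand:

  // One ProcInfo per distributed loop, outermost first: the processor id and
  // the number of processors along that dimension.
  SmallVector<ProcInfo, 2> procInfo = {{blockIdY, gridDimY},
                                       {blockIdX, gridDimX}};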


@@ -14,7 +14,7 @@
 using namespace mlir;
 using namespace mlir::edsc;

-mlir::scf::ValueVector
+mlir::scf::LoopNest
 mlir::edsc::loopNestBuilder(ValueRange lbs, ValueRange ubs, ValueRange steps,
                             function_ref<void(ValueRange)> fun) {
   // Delegates actual construction to scf::buildLoopNest by wrapping `fun` into
@@ -29,7 +29,7 @@ mlir::edsc::loopNestBuilder(ValueRange lbs, ValueRange ubs, ValueRange steps,
   });
 }

-mlir::scf::ValueVector
+mlir::scf::LoopNest
 mlir::edsc::loopNestBuilder(Value lb, Value ub, Value step,
                             function_ref<void(Value)> fun) {
   // Delegates to the ValueRange-based version by wrapping the lambda.
@@ -42,7 +42,7 @@ mlir::edsc::loopNestBuilder(Value lb, Value ub, Value step,
       wrapper);
 }

-mlir::scf::ValueVector mlir::edsc::loopNestBuilder(
+mlir::scf::LoopNest mlir::edsc::loopNestBuilder(
     Value lb, Value ub, Value step, ValueRange iterArgInitValues,
     function_ref<scf::ValueVector(Value, ValueRange)> fun) {
   // Delegates actual construction to scf::buildLoopNest by wrapping `fun` into
@@ -61,7 +61,7 @@ mlir::scf::ValueVector mlir::edsc::loopNestBuilder(
   });
 }

-mlir::scf::ValueVector mlir::edsc::loopNestBuilder(
+mlir::scf::LoopNest mlir::edsc::loopNestBuilder(
     ValueRange lbs, ValueRange ubs, ValueRange steps,
     ValueRange iterArgInitValues,
     function_ref<scf::ValueVector(ValueRange, ValueRange)> fun) {


@@ -305,7 +305,7 @@ void ForOp::getNumRegionInvocations(ArrayRef<Attribute> operands,
                         step.getValue().getSExtValue());
 }

-ValueVector mlir::scf::buildLoopNest(
+LoopNest mlir::scf::buildLoopNest(
     OpBuilder &builder, Location loc, ValueRange lbs, ValueRange ubs,
     ValueRange steps, ValueRange iterArgs,
     function_ref<ValueVector(OpBuilder &, Location, ValueRange, ValueRange)>
@@ -323,7 +323,7 @@ ValueVector mlir::scf::buildLoopNest(
     assert(results.size() == iterArgs.size() &&
            "loop nest body must return as many values as loop has iteration "
            "arguments");
-    return results;
+    return LoopNest();
   }

   // First, create the loop structure iteratively using the body-builder
@@ -372,11 +372,13 @@ ValueVector mlir::scf::buildLoopNest(
   builder.setInsertionPointToEnd(loops.back().getBody());
   builder.create<scf::YieldOp>(loc, results);

-  // Return the results of the outermost loop.
-  return ValueVector(loops.front().result_begin(), loops.front().result_end());
+  // Return the loops.
+  LoopNest res;
+  res.loops.assign(loops.begin(), loops.end());
+  return res;
 }

-ValueVector mlir::scf::buildLoopNest(
+LoopNest mlir::scf::buildLoopNest(
     OpBuilder &builder, Location loc, ValueRange lbs, ValueRange ubs,
     ValueRange steps,
     function_ref<void(OpBuilder &, Location, ValueRange)> bodyBuilder) {
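One caveat worth noting (an observation, not part of the patch): the degenerate
path above, taken when the bound arrays are empty, returns a LoopNest with no
loops, so callers that want the yielded values should guard before calling
getResults(), as the Linalg GenerateLoopNest specialization does:

  // Sketch; builder, loc, lbs, ubs, steps, iterArgs and bodyBuilderFn are
  // assumed to be in scope.
  scf::LoopNest nest =
      scf::buildLoopNest(builder, loc, lbs, ubs, steps, iterArgs, bodyBuilderFn);
  SmallVector<Value, 4> results;
  if (!nest.loops.empty())
    for (Value v : nest.getResults())
      results.push_back(v);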


@@ -172,3 +172,43 @@ func @gemm6(%a : memref<?x?xf32>, %b : memref<?x?xf32>, %c : memref<?x?xf32>)
 // CHECK: %[[OFFSETX_2:.*]] = affine.apply #[[MAP0]]()[%[[BIDX]]]
 // CHECK: %[[SV3:.*]] = subview %[[ARG2]][%[[ARG3]], %[[OFFSETX_2]]]
 // CHECK: linalg.matmul ins(%[[SV1]], %[[SV2]]{{.*}} outs(%[[SV3]]
+
+// -----
+
+// CHECK-LABEL: func @matmul_tensors(
+//  CHECK-SAME: %[[TA:[0-9a-z]+]]: tensor<?x?xf32>
+//  CHECK-SAME: %[[TB:[0-9a-z]+]]: tensor<?x?xf32>
+//  CHECK-SAME: %[[TC:[0-9a-z]+]]: tensor<?x?xf32>) -> tensor<?x?xf32> {
+func @matmul_tensors(
+    %arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2: tensor<?x?xf32>)
+    -> tensor<?x?xf32> {
+// CHECK: %[[C8:.*]] = constant 8 : index
+// CHECK: %[[BIDY:.*]] = "gpu.block_id"() {dimension = "y"}
+// CHECK: %[[NBLOCKSY:.*]] = "gpu.grid_dim"() {dimension = "y"}
+// CHECK: %[[BIDX:.*]] = "gpu.block_id"() {dimension = "x"}
+// CHECK: %[[NBLOCKSX:.*]] = "gpu.grid_dim"() {dimension = "x"}
+// CHECK: %[[LBY:.*]] = muli %[[BIDY]], %[[C8]] : index
+// CHECK: %[[STEPY:.*]] = muli %[[NBLOCKSY]], %[[C8]] : index
+// CHECK: %[[TD0:.*]] = scf.for {{.*}} to {{.*}} step {{.*}} iter_args(%[[TC0:.*]] = %[[TC]]) -> (tensor<?x?xf32>) {
+// CHECK:   %[[LBX:.*]] = muli %[[BIDX]], %[[C8]] : index
+// CHECK:   %[[STEPX:.*]] = muli %[[NBLOCKSX]], %[[C8]] : index
+// CHECK:   %[[TD1:.*]] = scf.for {{.*}} to {{.*}} step {{.*}} iter_args(%[[TC1:.*]] = %[[TC0]]) -> (tensor<?x?xf32>) {
+// CHECK:     %[[TD2:.*]] = scf.for {{.*}} to {{.*}} step {{.*}} iter_args(%[[TC2:.*]] = %[[TC1]]) -> (tensor<?x?xf32>) {
+// CHECK:       %[[sTA:.*]] = subtensor %[[TA]][{{.*}}] : tensor<?x?xf32> to tensor<?x?xf32>
+// CHECK:       %[[sTB:.*]] = subtensor %[[TB]][{{.*}}] : tensor<?x?xf32> to tensor<?x?xf32>
+// CHECK:       %[[sTC:.*]] = subtensor %[[TC2]][{{.*}}] : tensor<?x?xf32> to tensor<?x?xf32>
+// CHECK:       %[[sTD:.*]] = linalg.matmul ins(%[[sTA]], %[[sTB]] : tensor<?x?xf32>, tensor<?x?xf32>)
+// CHECK-SAME:                              init(%[[sTC]] : tensor<?x?xf32>) -> tensor<?x?xf32>
+// CHECK:       %[[TD:.*]] = subtensor_insert %[[sTD]] into %[[TC2]][{{.*}}] : tensor<?x?xf32> into tensor<?x?xf32>
+// CHECK:       scf.yield %[[TD]] : tensor<?x?xf32>
+// CHECK:     scf.yield %[[TD2]] : tensor<?x?xf32>
+// CHECK:   scf.yield %[[TD1]] : tensor<?x?xf32>
+  %0 = linalg.matmul {__internal_linalg_transform__ = "tensors_distribute1"}
+      ins(%arg0, %arg1: tensor<?x?xf32>, tensor<?x?xf32>)
+      init(%arg2: tensor<?x?xf32>)
+    -> tensor<?x?xf32>
+// CHECK: return %[[TD0]] : tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
+}


@@ -1223,7 +1223,7 @@ TEST_FUNC(builder_loop_for_yield) {
                       [&](Value iv, ValueRange args) {
                         Value sum = args[0] + args[1];
                         return scf::ValueVector{args[1], sum};
-                      });
+                      }).getResults();
   results[0] + results[1];

   // clang-format off


@@ -409,6 +409,22 @@ static void fillTileAndDistributePatterns(MLIRContext *context,
         LinalgMarker(Identifier::get("distribute6", context),
                      Identifier::get("after_distribute6", context)));
   }
+
+  {
+    LinalgLoopDistributionOptions cyclicNprocsEqNiters;
+    cyclicNprocsEqNiters.distributionMethod.resize(
+        2, DistributionMethod::CyclicNumProcsEqNumIters);
+    cyclicNprocsEqNiters.procInfo =
+        getGpuProcIds<gpu::BlockIdOp, gpu::GridDimOp>;
+    patterns.insert<LinalgTilingPattern<MatmulOp>>(
+        context,
+        LinalgTilingOptions()
+            .setTileSizes({8, 8, 4})
+            .setLoopType(LinalgTilingLoopType::Loops)
+            .setDistributionOptions(cyclicNprocsEqNiters),
+        LinalgMarker(Identifier::get("tensors_distribute1", context),
+                     Identifier::get("tensors_after_distribute1", context)));
+  }
 }

 static void