From 76257422378e54dc2b59ff034e2955e9518e6c99 Mon Sep 17 00:00:00 2001
From: Nicolas Vasilache
Date: Mon, 16 Nov 2020 10:40:24 +0000
Subject: [PATCH] [mlir][Linalg] Add support for tileAndDistribute on tensors.

scf.parallel is currently not a good fit for tiling on tensors.
Instead provide a path to parallelism directly through scf.for.

For now, this transformation ignores the distribution scheme and always
does a block-cyclic mapping (where block is the tile size).
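For illustration, the loops produced for the matmul test added below look
roughly as follows. This is a simplified sketch of one distributed dimension;
value names are illustrative and the precise output is what the new
tile-and-distribute.mlir test checks:

  %c8   = constant 8 : index
  %bid  = "gpu.block_id"() {dimension = "y"} : () -> index
  %nb   = "gpu.grid_dim"() {dimension = "y"} : () -> index
  %lb   = muli %bid, %c8 : index   // offset = processor id * tile size
  %step = muli %nb, %c8 : index    // stride = number of processors * tile size
  %res = scf.for %iv = %lb to %ub step %step iter_args(%t = %init) -> (tensor<?x?xf32>) {
    // ... tile the op, subtensor_insert the partial result into %t ...
    scf.yield %updated : tensor<?x?xf32>
  }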
Differential Revision: https://reviews.llvm.org/D90475
---
 mlir/include/mlir/Dialect/SCF/EDSC/Builders.h |  8 ++--
 mlir/include/mlir/Dialect/SCF/SCF.h           | 15 +++++--
 mlir/lib/Dialect/Linalg/Utils/Utils.cpp       | 22 +++++++++-
 mlir/lib/Dialect/SCF/EDSC/Builders.cpp        |  8 ++--
 mlir/lib/Dialect/SCF/SCF.cpp                  | 12 +++---
 .../Dialect/Linalg/tile-and-distribute.mlir   | 40 +++++++++++++++++++
 mlir/test/EDSC/builder-api-test.cpp           |  2 +-
 .../lib/Transforms/TestLinalgTransforms.cpp   | 16 ++++++++
 8 files changed, 103 insertions(+), 20 deletions(-)

diff --git a/mlir/include/mlir/Dialect/SCF/EDSC/Builders.h b/mlir/include/mlir/Dialect/SCF/EDSC/Builders.h
index fe8df4c2d0e4..8622d8c98315 100644
--- a/mlir/include/mlir/Dialect/SCF/EDSC/Builders.h
+++ b/mlir/include/mlir/Dialect/SCF/EDSC/Builders.h
@@ -24,15 +24,15 @@ namespace edsc {
 
 /// Adapters for building loop nests using the builder and the location stored
 /// in ScopedContext. Actual builders are in scf::buildLoopNest.
-scf::ValueVector loopNestBuilder(ValueRange lbs, ValueRange ubs,
+scf::LoopNest loopNestBuilder(ValueRange lbs, ValueRange ubs,
                                  ValueRange steps,
                                  function_ref<void(ValueRange)> fun = nullptr);
-scf::ValueVector loopNestBuilder(Value lb, Value ub, Value step,
+scf::LoopNest loopNestBuilder(Value lb, Value ub, Value step,
                                  function_ref<void(Value)> fun = nullptr);
-scf::ValueVector loopNestBuilder(
+scf::LoopNest loopNestBuilder(
     Value lb, Value ub, Value step, ValueRange iterArgInitValues,
     function_ref<scf::ValueVector(Value, ValueRange)> fun = nullptr);
-scf::ValueVector loopNestBuilder(
+scf::LoopNest loopNestBuilder(
     ValueRange lbs, ValueRange ubs, ValueRange steps,
     ValueRange iterArgInitValues,
     function_ref<scf::ValueVector(ValueRange, ValueRange)> fun = nullptr);
diff --git a/mlir/include/mlir/Dialect/SCF/SCF.h b/mlir/include/mlir/Dialect/SCF/SCF.h
index 55c8cbf5fa74..619ebd2639e7 100644
--- a/mlir/include/mlir/Dialect/SCF/SCF.h
+++ b/mlir/include/mlir/Dialect/SCF/SCF.h
@@ -51,6 +51,11 @@ ParallelOp getParallelForInductionVarOwner(Value val);
 
 /// An owning vector of values, handy to return from functions.
 using ValueVector = std::vector<Value>;
+using LoopVector = std::vector<scf::ForOp>;
+struct LoopNest {
+  ResultRange getResults() { return loops.front().getResults(); }
+  LoopVector loops;
+};
 
 /// Creates a perfect nest of "for" loops, i.e. all loops but the innermost
 /// contain only another loop and a terminator. The lower, upper bounds and
@@ -65,11 +70,12 @@ using ValueVector = std::vector<Value>;
 /// yielded from the loop body and forwarded back through the loop nest. If the
 /// function is not provided, the loop nest is not expected to have iteration
 /// arguments, the body of the innermost loop will be left empty, containing
-/// only the zero-operand terminator. Returns the values yielded by the
-/// outermost loop. If bound arrays are empty, the body builder will be called
+/// only the zero-operand terminator. Returns the LoopNest containing the list
+/// of perfectly nested scf::ForOp built during the call.
+/// If bound arrays are empty, the body builder will be called
 /// once to construct the IR outside of the loop with an empty list of induction
 /// variables.
-ValueVector buildLoopNest(
+LoopNest buildLoopNest(
     OpBuilder &builder, Location loc, ValueRange lbs, ValueRange ubs,
     ValueRange steps, ValueRange iterArgs,
     function_ref<ValueVector(OpBuilder &, Location, ValueRange, ValueRange)>
@@ -78,7 +84,8 @@ ValueVector buildLoopNest(
 /// A convenience version for building loop nests without iteration arguments
 /// (like for reductions). Does not take the initial value of reductions or
 /// expect the body building functions to return their current value.
-ValueVector buildLoopNest(OpBuilder &builder, Location loc, ValueRange lbs,
+/// The built loop nest is returned through the LoopNest result.
+LoopNest buildLoopNest(OpBuilder &builder, Location loc, ValueRange lbs,
                           ValueRange ubs, ValueRange steps,
                           function_ref<void(OpBuilder &, Location, ValueRange)>
                               bodyBuilder = nullptr);
diff --git a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
index 210d17516718..e5f0ba013e01 100644
--- a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
@@ -24,6 +24,7 @@
 #include "mlir/IR/Matchers.h"
 #include "mlir/IR/OpImplementation.h"
 #include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/LoopUtils.h"
 
 using namespace mlir;
 using namespace mlir::linalg;
@@ -171,10 +172,27 @@ void GenerateLoopNest<scf::ForOp>::doit(
     ArrayRef<Range> loopRanges, ValueRange iterArgInitValues,
     ArrayRef<Attribute> iteratorTypes,
     function_ref<scf::ValueVector(ValueRange, ValueRange)> bodyBuilderFn,
-    Optional<LinalgLoopDistributionOptions>) {
+    Optional<LinalgLoopDistributionOptions> distributionOptions) {
+  // Create procInfo so it dominates the loops, if appropriate.
+  OpBuilder &builder = edsc::ScopedContext::getBuilderRef();
+  Location loc = edsc::ScopedContext::getLocation();
+  SmallVector<ProcInfo, 4> procInfo;
+  if (distributionOptions.hasValue())
+    procInfo = distributionOptions->procInfo(builder, loc, ArrayRef<Range>{});
+
   SmallVector<Value, 4> lbs, ubs, steps;
   unpackRanges(loopRanges, lbs, ubs, steps);
-  edsc::loopNestBuilder(lbs, ubs, steps, iterArgInitValues, bodyBuilderFn);
+  LoopNest loopNest =
+      edsc::loopNestBuilder(lbs, ubs, steps, iterArgInitValues, bodyBuilderFn);
+
+  if (!distributionOptions.hasValue() || loopNest.loops.empty())
+    return;
+
+  // TODO: support distributionMethod, which is currently ignored.
+  for (auto it : llvm::zip(loopNest.loops, procInfo,
+                           distributionOptions->distributionMethod))
+    mapLoopToProcessorIds(std::get<0>(it), std::get<1>(it).procId,
+                          std::get<1>(it).nprocs);
 }
 
 /// Specialization to build affine "for" nest.
diff --git a/mlir/lib/Dialect/SCF/EDSC/Builders.cpp b/mlir/lib/Dialect/SCF/EDSC/Builders.cpp
index 45097186a248..d0ac5f0c3439 100644
--- a/mlir/lib/Dialect/SCF/EDSC/Builders.cpp
+++ b/mlir/lib/Dialect/SCF/EDSC/Builders.cpp
@@ -14,7 +14,7 @@
 using namespace mlir;
 using namespace mlir::edsc;
 
-mlir::scf::ValueVector
+mlir::scf::LoopNest
 mlir::edsc::loopNestBuilder(ValueRange lbs, ValueRange ubs, ValueRange steps,
                             function_ref<void(ValueRange)> fun) {
   // Delegates actual construction to scf::buildLoopNest by wrapping `fun` into
@@ -29,7 +29,7 @@ mlir::edsc::loopNestBuilder(ValueRange lbs, ValueRange ubs, ValueRange steps,
       });
 }
 
-mlir::scf::ValueVector
+mlir::scf::LoopNest
 mlir::edsc::loopNestBuilder(Value lb, Value ub, Value step,
                             function_ref<void(Value)> fun) {
   // Delegates to the ValueRange-based version by wrapping the lambda.
@@ -42,7 +42,7 @@ mlir::edsc::loopNestBuilder(Value lb, Value ub, Value step,
       wrapper);
 }
 
-mlir::scf::ValueVector mlir::edsc::loopNestBuilder(
+mlir::scf::LoopNest mlir::edsc::loopNestBuilder(
     Value lb, Value ub, Value step, ValueRange iterArgInitValues,
     function_ref<scf::ValueVector(Value, ValueRange)> fun) {
   // Delegates actual construction to scf::buildLoopNest by wrapping `fun` into
@@ -61,7 +61,7 @@ mlir::scf::ValueVector mlir::edsc::loopNestBuilder(
       });
 }
 
-mlir::scf::ValueVector mlir::edsc::loopNestBuilder(
+mlir::scf::LoopNest mlir::edsc::loopNestBuilder(
     ValueRange lbs, ValueRange ubs, ValueRange steps,
     ValueRange iterArgInitValues,
     function_ref<scf::ValueVector(ValueRange, ValueRange)> fun) {
diff --git a/mlir/lib/Dialect/SCF/SCF.cpp b/mlir/lib/Dialect/SCF/SCF.cpp
index bc8671b9ba85..fe2eb9ced469 100644
--- a/mlir/lib/Dialect/SCF/SCF.cpp
+++ b/mlir/lib/Dialect/SCF/SCF.cpp
@@ -305,7 +305,7 @@ void ForOp::getNumRegionInvocations(ArrayRef<Attribute> operands,
                                step.getValue().getSExtValue());
 }
 
-ValueVector mlir::scf::buildLoopNest(
+LoopNest mlir::scf::buildLoopNest(
     OpBuilder &builder, Location loc, ValueRange lbs, ValueRange ubs,
     ValueRange steps, ValueRange iterArgs,
     function_ref<ValueVector(OpBuilder &, Location, ValueRange, ValueRange)>
@@ -323,7 +323,7 @@ ValueVector mlir::scf::buildLoopNest(
     assert(results.size() == iterArgs.size() &&
            "loop nest body must return as many values as loop has iteration "
            "arguments");
-    return results;
+    return LoopNest();
   }
 
   // First, create the loop structure iteratively using the body-builder
@@ -372,11 +372,13 @@ ValueVector mlir::scf::buildLoopNest(
   builder.setInsertionPointToEnd(loops.back().getBody());
   builder.create<YieldOp>(loc, results);
 
-  // Return the results of the outermost loop.
-  return ValueVector(loops.front().result_begin(), loops.front().result_end());
+  // Return the loops.
+  LoopNest res;
+  res.loops.assign(loops.begin(), loops.end());
+  return res;
 }
 
-ValueVector mlir::scf::buildLoopNest(
+LoopNest mlir::scf::buildLoopNest(
     OpBuilder &builder, Location loc, ValueRange lbs, ValueRange ubs,
     ValueRange steps,
     function_ref<void(OpBuilder &, Location, ValueRange)> bodyBuilder) {
diff --git a/mlir/test/Dialect/Linalg/tile-and-distribute.mlir b/mlir/test/Dialect/Linalg/tile-and-distribute.mlir
index 6ff4be0169fb..2a6a7ba7b7e3 100644
--- a/mlir/test/Dialect/Linalg/tile-and-distribute.mlir
+++ b/mlir/test/Dialect/Linalg/tile-and-distribute.mlir
@@ -172,3 +172,43 @@ func @gemm6(%a : memref<?x?xf32>, %b : memref<?x?xf32>, %c : memref<?x?xf32>)
 // CHECK: %[[OFFSETX_2:.*]] = affine.apply #[[MAP0]]()[%[[BIDX]]]
 // CHECK: %[[SV3:.*]] = subview %[[ARG2]][%[[ARG3]], %[[OFFSETX_2]]]
 // CHECK: linalg.matmul ins(%[[SV1]], %[[SV2]]{{.*}} outs(%[[SV3]]
+
+// -----
+
+// CHECK-LABEL: func @matmul_tensors(
+// CHECK-SAME: %[[TA:[0-9a-z]+]]: tensor<?x?xf32>
+// CHECK-SAME: %[[TB:[0-9a-z]+]]: tensor<?x?xf32>
+// CHECK-SAME: %[[TC:[0-9a-z]+]]: tensor<?x?xf32>) -> tensor<?x?xf32> {
+func @matmul_tensors(
+  %arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2: tensor<?x?xf32>)
+    -> tensor<?x?xf32> {
+// CHECK: %[[C8:.*]] = constant 8 : index
+// CHECK: %[[BIDY:.*]] = "gpu.block_id"() {dimension = "y"}
+// CHECK: %[[NBLOCKSY:.*]] = "gpu.grid_dim"() {dimension = "y"}
+// CHECK: %[[BIDX:.*]] = "gpu.block_id"() {dimension = "x"}
+// CHECK: %[[NBLOCKSX:.*]] = "gpu.grid_dim"() {dimension = "x"}
+// CHECK: %[[LBY:.*]] = muli %[[BIDY]], %[[C8]] : index
+// CHECK: %[[STEPY:.*]] = muli %[[NBLOCKSY]], %[[C8]] : index
+// CHECK: %[[TD0:.*]] = scf.for {{.*}} to {{.*}} step {{.*}} iter_args(%[[TC0:.*]] = %[[TC]]) -> (tensor<?x?xf32>) {
+// CHECK: %[[LBX:.*]] = muli %[[BIDX]], %[[C8]] : index
+// CHECK: %[[STEPX:.*]] = muli %[[NBLOCKSX]], %[[C8]] : index
+// CHECK: %[[TD1:.*]] = scf.for {{.*}} to {{.*}} step {{.*}} iter_args(%[[TC1:.*]] = %[[TC0]]) -> (tensor<?x?xf32>) {
+// CHECK: %[[TD2:.*]] = scf.for {{.*}} to {{.*}} step {{.*}} iter_args(%[[TC2:.*]] = %[[TC1]]) -> (tensor<?x?xf32>) {
+// CHECK: %[[sTA:.*]] = subtensor %[[TA]][{{.*}}] : tensor<?x?xf32> to tensor<?x?xf32>
+// CHECK: %[[sTB:.*]] = subtensor %[[TB]][{{.*}}] : tensor<?x?xf32> to tensor<?x?xf32>
+// CHECK: %[[sTC:.*]] = subtensor %[[TC2]][{{.*}}] : tensor<?x?xf32> to tensor<?x?xf32>
+// CHECK: %[[sTD:.*]] = linalg.matmul ins(%[[sTA]], %[[sTB]] : tensor<?x?xf32>, tensor<?x?xf32>)
+// CHECK-SAME: init(%[[sTC]] : tensor<?x?xf32>) -> tensor<?x?xf32>
+// CHECK: %[[TD:.*]] = subtensor_insert %[[sTD]] into %[[TC2]][{{.*}}] : tensor<?x?xf32> into tensor<?x?xf32>
+// CHECK: scf.yield %[[TD]] : tensor<?x?xf32>
+// CHECK: scf.yield %[[TD2]] : tensor<?x?xf32>
+// CHECK: scf.yield %[[TD1]] : tensor<?x?xf32>
+  %0 = linalg.matmul {__internal_linalg_transform__ = "tensors_distribute1"}
+      ins(%arg0, %arg1: tensor<?x?xf32>, tensor<?x?xf32>)
+     init(%arg2: tensor<?x?xf32>)
+  -> tensor<?x?xf32>
+
+// CHECK: return %[[TD0]] : tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
+}
+
diff --git a/mlir/test/EDSC/builder-api-test.cpp b/mlir/test/EDSC/builder-api-test.cpp
index 1a866066523e..7677c175ec94 100644
--- a/mlir/test/EDSC/builder-api-test.cpp
+++ b/mlir/test/EDSC/builder-api-test.cpp
@@ -1223,7 +1223,7 @@ TEST_FUNC(builder_loop_for_yield) {
                    [&](Value iv, ValueRange args) {
                      Value sum = args[0] + args[1];
                      return scf::ValueVector{args[1], sum};
-                   });
+                   }).getResults();
   results[0] + results[1];
 
   // clang-format off
diff --git a/mlir/test/lib/Transforms/TestLinalgTransforms.cpp b/mlir/test/lib/Transforms/TestLinalgTransforms.cpp
index 253d4adf903c..8857bbe09eef 100644
--- a/mlir/test/lib/Transforms/TestLinalgTransforms.cpp
+++ b/mlir/test/lib/Transforms/TestLinalgTransforms.cpp
@@ -409,6 +409,22 @@ static void fillTileAndDistributePatterns(MLIRContext *context,
         LinalgMarker(Identifier::get("distribute6", context),
                      Identifier::get("after_distribute6", context)));
   }
+
+  {
+    LinalgLoopDistributionOptions cyclicNprocsEqNiters;
+    cyclicNprocsEqNiters.distributionMethod.resize(
+        2, DistributionMethod::CyclicNumProcsEqNumIters);
+    cyclicNprocsEqNiters.procInfo =
+        getGpuProcIds<gpu::BlockIdOp, gpu::GridDimOp>;
+    patterns.insert<LinalgTilingPattern<MatmulOp>>(
+        context,
+        LinalgTilingOptions()
+            .setTileSizes({8, 8, 4})
+            .setLoopType(LinalgTilingLoopType::Loops)
+            .setDistributionOptions(cyclicNprocsEqNiters),
+        LinalgMarker(Identifier::get("tensors_distribute1", context),
+                     Identifier::get("tensors_after_distribute1", context)));
+  }
 }
 
 static void