[mlir][Linalg] Add support for tileAndDistribute on tensors.

scf.parallel is currently not a good fit for tiling on tensors.
Instead, provide a path to parallelism directly through scf.for.
For now, this transformation ignores the distribution scheme and always does a
block-cyclic mapping (where the block is the tile size).
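As a rough sketch (not part of this patch), the block-cyclic mapping rewrites
each tiled loop so that every processor starts at its own tile and strides over
the tiles of all processors; procId/nprocs stand for values such as
gpu.block_id/gpu.grid_dim, and all names below are purely illustrative:

  #include <cstdint> // int64_t

  // Conceptual sketch with plain integers; in the generated IR these become
  // SSA values and the arithmetic is emitted as std.muli / std.addi ops.
  struct Range1D { int64_t lb, ub, step; };
  Range1D mapBlockCyclic(Range1D tiledLoop, int64_t procId, int64_t nprocs) {
    // The tiled loop's step equals the tile size, so each processor starts at
    // its own tile and then strides over nprocs tiles at a time.
    return {tiledLoop.lb + procId * tiledLoop.step,
            tiledLoop.ub,
            nprocs * tiledLoop.step};
  }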

Differential revision: https://reviews.llvm.org/D90475
commit 7625742237 (parent e0c92c6c03)
Author: Nicolas Vasilache
Date:   2020-11-16 10:40:24 +00:00

8 changed files with 103 additions and 20 deletions


@@ -24,15 +24,15 @@ namespace edsc {
 /// Adapters for building loop nests using the builder and the location stored
 /// in ScopedContext. Actual builders are in scf::buildLoopNest.
-scf::ValueVector loopNestBuilder(ValueRange lbs, ValueRange ubs,
+scf::LoopNest loopNestBuilder(ValueRange lbs, ValueRange ubs,
                                  ValueRange steps,
                                  function_ref<void(ValueRange)> fun = nullptr);
-scf::ValueVector loopNestBuilder(Value lb, Value ub, Value step,
+scf::LoopNest loopNestBuilder(Value lb, Value ub, Value step,
                                  function_ref<void(Value)> fun = nullptr);
-scf::ValueVector loopNestBuilder(
+scf::LoopNest loopNestBuilder(
     Value lb, Value ub, Value step, ValueRange iterArgInitValues,
     function_ref<scf::ValueVector(Value, ValueRange)> fun = nullptr);
-scf::ValueVector loopNestBuilder(
+scf::LoopNest loopNestBuilder(
     ValueRange lbs, ValueRange ubs, ValueRange steps,
     ValueRange iterArgInitValues,
     function_ref<scf::ValueVector(ValueRange, ValueRange)> fun = nullptr);
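For context, a minimal usage sketch of the adapter after this change (not part
of the patch; it assumes an active edsc::ScopedContext and index Values lb, ub
and step already in scope):

  scf::LoopNest nest = edsc::loopNestBuilder(lb, ub, step, [&](Value iv) {
    // Emit the loop body here; the builder and location are taken from the
    // enclosing ScopedContext.
  });
  // The created scf::ForOp are available through nest.loops.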


@@ -51,6 +51,11 @@ ParallelOp getParallelForInductionVarOwner(Value val);
 /// An owning vector of values, handy to return from functions.
 using ValueVector = std::vector<Value>;
+using LoopVector = std::vector<scf::ForOp>;
+struct LoopNest {
+  ResultRange getResults() { return loops.front().getResults(); }
+  LoopVector loops;
+};

 /// Creates a perfect nest of "for" loops, i.e. all loops but the innermost
 /// contain only another loop and a terminator. The lower, upper bounds and
@@ -65,11 +70,12 @@ using ValueVector = std::vector<Value>;
 /// yielded from the loop body and forwarded back through the loop nest. If the
 /// function is not provided, the loop nest is not expected to have iteration
 /// arguments, the body of the innermost loop will be left empty, containing
-/// only the zero-operand terminator. Returns the values yielded by the
-/// outermost loop. If bound arrays are empty, the body builder will be called
+/// only the zero-operand terminator. Returns the LoopNest containing the list
+/// of perfectly nested scf::ForOp built during the call.
+/// If bound arrays are empty, the body builder will be called
 /// once to construct the IR outside of the loop with an empty list of induction
 /// variables.
-ValueVector buildLoopNest(
+LoopNest buildLoopNest(
     OpBuilder &builder, Location loc, ValueRange lbs, ValueRange ubs,
     ValueRange steps, ValueRange iterArgs,
     function_ref<ValueVector(OpBuilder &, Location, ValueRange, ValueRange)>
@@ -78,7 +84,8 @@ ValueVector buildLoopNest(
 /// A convenience version for building loop nests without iteration arguments
 /// (like for reductions). Does not take the initial value of reductions or
 /// expect the body building functions to return their current value.
-ValueVector buildLoopNest(OpBuilder &builder, Location loc, ValueRange lbs,
+/// The built nested scf::ForOp are returned in the LoopNest structure.
+LoopNest buildLoopNest(OpBuilder &builder, Location loc, ValueRange lbs,
                           ValueRange ubs, ValueRange steps,
                           function_ref<void(OpBuilder &, Location, ValueRange)>
                               bodyBuilder = nullptr);
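A minimal usage sketch of the updated API (not from the patch; builder, loc,
lb, ub, step and init are assumed to be existing Values in scope):

  scf::LoopNest nest = scf::buildLoopNest(
      builder, loc, {lb}, {ub}, {step}, {init},
      [](OpBuilder &b, Location nestedLoc, ValueRange ivs, ValueRange args) {
        // A real body would compute a new value; here the carried value is
        // yielded unchanged.
        return scf::ValueVector(args.begin(), args.end());
      });
  // The values yielded by the outermost loop are now reached via the struct.
  auto results = nest.getResults();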


@@ -24,6 +24,7 @@
 #include "mlir/IR/Matchers.h"
 #include "mlir/IR/OpImplementation.h"
 #include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/LoopUtils.h"

 using namespace mlir;
 using namespace mlir::linalg;
@@ -171,10 +172,27 @@ void GenerateLoopNest<scf::ForOp>::doit(
     ArrayRef<Range> loopRanges, ValueRange iterArgInitValues,
     ArrayRef<Attribute> iteratorTypes,
     function_ref<scf::ValueVector(ValueRange, ValueRange)> bodyBuilderFn,
-    Optional<LinalgLoopDistributionOptions>) {
+    Optional<LinalgLoopDistributionOptions> distributionOptions) {
+  // Create procInfo so it dominates the loops, if appropriate.
+  OpBuilder &builder = edsc::ScopedContext::getBuilderRef();
+  Location loc = edsc::ScopedContext::getLocation();
+  SmallVector<ProcInfo, 2> procInfo;
+  if (distributionOptions.hasValue())
+    procInfo = distributionOptions->procInfo(builder, loc, ArrayRef<Range>{});
+
   SmallVector<Value, 4> lbs, ubs, steps;
   unpackRanges(loopRanges, lbs, ubs, steps);
-  edsc::loopNestBuilder(lbs, ubs, steps, iterArgInitValues, bodyBuilderFn);
+  LoopNest loopNest =
+      edsc::loopNestBuilder(lbs, ubs, steps, iterArgInitValues, bodyBuilderFn);
+
+  if (!distributionOptions.hasValue() || loopNest.loops.empty())
+    return;
+
+  // TODO: support distributionMethod, which is currently ignored.
+  for (auto it : llvm::zip(loopNest.loops, procInfo,
+                           distributionOptions->distributionMethod))
+    mapLoopToProcessorIds(std::get<0>(it), std::get<1>(it).procId,
+                          std::get<1>(it).nprocs);
 }

 /// Specialization to build affine "for" nest.
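For reference, a minimal sketch (not from the patch) of what a procInfo
callback is expected to hand back, mirroring the getGpuProcIds helper used in
the test driver below; blockIdY, gridDimY, blockIdX and gridDimX are assumed to
be index Values created beforehand:

  // One ProcInfo per distributed loop, outermost first: the processor id and
  // the number of processors along that dimension.
  SmallVector<ProcInfo, 2> procInfo = {{blockIdY, gridDimY},
                                       {blockIdX, gridDimX}};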


@@ -14,7 +14,7 @@
 using namespace mlir;
 using namespace mlir::edsc;

-mlir::scf::ValueVector
+mlir::scf::LoopNest
 mlir::edsc::loopNestBuilder(ValueRange lbs, ValueRange ubs, ValueRange steps,
                             function_ref<void(ValueRange)> fun) {
   // Delegates actual construction to scf::buildLoopNest by wrapping `fun` into
@@ -29,7 +29,7 @@ mlir::edsc::loopNestBuilder(ValueRange lbs, ValueRange ubs, ValueRange steps,
   });
 }

-mlir::scf::ValueVector
+mlir::scf::LoopNest
 mlir::edsc::loopNestBuilder(Value lb, Value ub, Value step,
                             function_ref<void(Value)> fun) {
   // Delegates to the ValueRange-based version by wrapping the lambda.
@@ -42,7 +42,7 @@ mlir::edsc::loopNestBuilder(Value lb, Value ub, Value step,
       wrapper);
 }

-mlir::scf::ValueVector mlir::edsc::loopNestBuilder(
+mlir::scf::LoopNest mlir::edsc::loopNestBuilder(
     Value lb, Value ub, Value step, ValueRange iterArgInitValues,
     function_ref<scf::ValueVector(Value, ValueRange)> fun) {
   // Delegates actual construction to scf::buildLoopNest by wrapping `fun` into
@@ -61,7 +61,7 @@ mlir::scf::ValueVector mlir::edsc::loopNestBuilder(
   });
 }

-mlir::scf::ValueVector mlir::edsc::loopNestBuilder(
+mlir::scf::LoopNest mlir::edsc::loopNestBuilder(
     ValueRange lbs, ValueRange ubs, ValueRange steps,
     ValueRange iterArgInitValues,
     function_ref<scf::ValueVector(ValueRange, ValueRange)> fun) {


@@ -305,7 +305,7 @@ void ForOp::getNumRegionInvocations(ArrayRef<Attribute> operands,
                         step.getValue().getSExtValue());
 }

-ValueVector mlir::scf::buildLoopNest(
+LoopNest mlir::scf::buildLoopNest(
     OpBuilder &builder, Location loc, ValueRange lbs, ValueRange ubs,
     ValueRange steps, ValueRange iterArgs,
     function_ref<ValueVector(OpBuilder &, Location, ValueRange, ValueRange)>
@@ -323,7 +323,7 @@ ValueVector mlir::scf::buildLoopNest(
     assert(results.size() == iterArgs.size() &&
            "loop nest body must return as many values as loop has iteration "
            "arguments");
-    return results;
+    return LoopNest();
   }

   // First, create the loop structure iteratively using the body-builder
@@ -372,11 +372,13 @@ ValueVector mlir::scf::buildLoopNest(
   builder.setInsertionPointToEnd(loops.back().getBody());
   builder.create<scf::YieldOp>(loc, results);

-  // Return the results of the outermost loop.
-  return ValueVector(loops.front().result_begin(), loops.front().result_end());
+  // Return the loops.
+  LoopNest res;
+  res.loops.assign(loops.begin(), loops.end());
+  return res;
 }

-ValueVector mlir::scf::buildLoopNest(
+LoopNest mlir::scf::buildLoopNest(
     OpBuilder &builder, Location loc, ValueRange lbs, ValueRange ubs,
     ValueRange steps,
     function_ref<void(OpBuilder &, Location, ValueRange)> bodyBuilder) {
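One caveat worth noting (an observation, not part of the patch): the degenerate
path above, taken when the bound arrays are empty, returns a LoopNest with no
loops, so callers that want the yielded values should guard before calling
getResults(), as the Linalg GenerateLoopNest specialization does:

  // Sketch; builder, loc, lbs, ubs, steps, iterArgs and bodyBuilderFn are
  // assumed to be in scope.
  scf::LoopNest nest =
      scf::buildLoopNest(builder, loc, lbs, ubs, steps, iterArgs, bodyBuilderFn);
  SmallVector<Value, 4> results;
  if (!nest.loops.empty())
    for (Value v : nest.getResults())
      results.push_back(v);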


@@ -172,3 +172,43 @@ func @gemm6(%a : memref<?x?xf32>, %b : memref<?x?xf32>, %c : memref<?x?xf32>)
 // CHECK: %[[OFFSETX_2:.*]] = affine.apply #[[MAP0]]()[%[[BIDX]]]
 // CHECK: %[[SV3:.*]] = subview %[[ARG2]][%[[ARG3]], %[[OFFSETX_2]]]
 // CHECK: linalg.matmul ins(%[[SV1]], %[[SV2]]{{.*}} outs(%[[SV3]]
+
+// -----
+
+// CHECK-LABEL: func @matmul_tensors(
+//  CHECK-SAME: %[[TA:[0-9a-z]+]]: tensor<?x?xf32>
+//  CHECK-SAME: %[[TB:[0-9a-z]+]]: tensor<?x?xf32>
+//  CHECK-SAME: %[[TC:[0-9a-z]+]]: tensor<?x?xf32>) -> tensor<?x?xf32> {
+func @matmul_tensors(
+    %arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2: tensor<?x?xf32>)
+    -> tensor<?x?xf32> {
+// CHECK: %[[C8:.*]] = constant 8 : index
+// CHECK: %[[BIDY:.*]] = "gpu.block_id"() {dimension = "y"}
+// CHECK: %[[NBLOCKSY:.*]] = "gpu.grid_dim"() {dimension = "y"}
+// CHECK: %[[BIDX:.*]] = "gpu.block_id"() {dimension = "x"}
+// CHECK: %[[NBLOCKSX:.*]] = "gpu.grid_dim"() {dimension = "x"}
+// CHECK: %[[LBY:.*]] = muli %[[BIDY]], %[[C8]] : index
+// CHECK: %[[STEPY:.*]] = muli %[[NBLOCKSY]], %[[C8]] : index
+// CHECK: %[[TD0:.*]] = scf.for {{.*}} to {{.*}} step {{.*}} iter_args(%[[TC0:.*]] = %[[TC]]) -> (tensor<?x?xf32>) {
+// CHECK:   %[[LBX:.*]] = muli %[[BIDX]], %[[C8]] : index
+// CHECK:   %[[STEPX:.*]] = muli %[[NBLOCKSX]], %[[C8]] : index
+// CHECK:   %[[TD1:.*]] = scf.for {{.*}} to {{.*}} step {{.*}} iter_args(%[[TC1:.*]] = %[[TC0]]) -> (tensor<?x?xf32>) {
+// CHECK:     %[[TD2:.*]] = scf.for {{.*}} to {{.*}} step {{.*}} iter_args(%[[TC2:.*]] = %[[TC1]]) -> (tensor<?x?xf32>) {
+// CHECK:       %[[sTA:.*]] = subtensor %[[TA]][{{.*}}] : tensor<?x?xf32> to tensor<?x?xf32>
+// CHECK:       %[[sTB:.*]] = subtensor %[[TB]][{{.*}}] : tensor<?x?xf32> to tensor<?x?xf32>
+// CHECK:       %[[sTC:.*]] = subtensor %[[TC2]][{{.*}}] : tensor<?x?xf32> to tensor<?x?xf32>
+// CHECK:       %[[sTD:.*]] = linalg.matmul ins(%[[sTA]], %[[sTB]] : tensor<?x?xf32>, tensor<?x?xf32>)
+// CHECK-SAME:                              init(%[[sTC]] : tensor<?x?xf32>) -> tensor<?x?xf32>
+// CHECK:       %[[TD:.*]] = subtensor_insert %[[sTD]] into %[[TC2]][{{.*}}] : tensor<?x?xf32> into tensor<?x?xf32>
+// CHECK:       scf.yield %[[TD]] : tensor<?x?xf32>
+// CHECK:     scf.yield %[[TD2]] : tensor<?x?xf32>
+// CHECK:   scf.yield %[[TD1]] : tensor<?x?xf32>
+  %0 = linalg.matmul {__internal_linalg_transform__ = "tensors_distribute1"}
+      ins(%arg0, %arg1: tensor<?x?xf32>, tensor<?x?xf32>)
+      init(%arg2: tensor<?x?xf32>)
+    -> tensor<?x?xf32>
+// CHECK: return %[[TD0]] : tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
+}


@@ -1223,7 +1223,7 @@ TEST_FUNC(builder_loop_for_yield) {
                       [&](Value iv, ValueRange args) {
                         Value sum = args[0] + args[1];
                         return scf::ValueVector{args[1], sum};
-                      });
+                      }).getResults();
   results[0] + results[1];

   // clang-format off


@@ -409,6 +409,22 @@ static void fillTileAndDistributePatterns(MLIRContext *context,
         LinalgMarker(Identifier::get("distribute6", context),
                      Identifier::get("after_distribute6", context)));
   }
+
+  {
+    LinalgLoopDistributionOptions cyclicNprocsEqNiters;
+    cyclicNprocsEqNiters.distributionMethod.resize(
+        2, DistributionMethod::CyclicNumProcsEqNumIters);
+    cyclicNprocsEqNiters.procInfo =
+        getGpuProcIds<gpu::BlockIdOp, gpu::GridDimOp>;
+    patterns.insert<LinalgTilingPattern<MatmulOp>>(
+        context,
+        LinalgTilingOptions()
+            .setTileSizes({8, 8, 4})
+            .setLoopType(LinalgTilingLoopType::Loops)
+            .setDistributionOptions(cyclicNprocsEqNiters),
+        LinalgMarker(Identifier::get("tensors_distribute1", context),
+                     Identifier::get("tensors_after_distribute1", context)));
+  }
 }

 static void