forked from OSchip/llvm-project
[mlir][Linalg] Add support for tileAndDistribute on tensors.
scf.parallel is currently not a good fit for tiling on tensors. Instead provide a path to parallelism directly through scf.for. For now, this transformation ignores the distribution scheme and always does a block-cyclic mapping (where block is the tile size). Differential revision: https://reviews.llvm.org/D90475
This commit is contained in:
parent
e0c92c6c03
commit
7625742237
|
@ -24,15 +24,15 @@ namespace edsc {
|
|||
|
||||
/// Adapters for building loop nests using the builder and the location stored
|
||||
/// in ScopedContext. Actual builders are in scf::buildLoopNest.
|
||||
scf::ValueVector loopNestBuilder(ValueRange lbs, ValueRange ubs,
|
||||
scf::LoopNest loopNestBuilder(ValueRange lbs, ValueRange ubs,
|
||||
ValueRange steps,
|
||||
function_ref<void(ValueRange)> fun = nullptr);
|
||||
scf::ValueVector loopNestBuilder(Value lb, Value ub, Value step,
|
||||
scf::LoopNest loopNestBuilder(Value lb, Value ub, Value step,
|
||||
function_ref<void(Value)> fun = nullptr);
|
||||
scf::ValueVector loopNestBuilder(
|
||||
scf::LoopNest loopNestBuilder(
|
||||
Value lb, Value ub, Value step, ValueRange iterArgInitValues,
|
||||
function_ref<scf::ValueVector(Value, ValueRange)> fun = nullptr);
|
||||
scf::ValueVector loopNestBuilder(
|
||||
scf::LoopNest loopNestBuilder(
|
||||
ValueRange lbs, ValueRange ubs, ValueRange steps,
|
||||
ValueRange iterArgInitValues,
|
||||
function_ref<scf::ValueVector(ValueRange, ValueRange)> fun = nullptr);
|
||||
|
|
|
@ -51,6 +51,11 @@ ParallelOp getParallelForInductionVarOwner(Value val);
|
|||
|
||||
/// An owning vector of values, handy to return from functions.
|
||||
using ValueVector = std::vector<Value>;
|
||||
/// An owning vector of scf::ForOp loops, used to return a built loop nest.
using LoopVector = std::vector<scf::ForOp>;
|
||||
/// The list of perfectly nested scf::ForOp produced by buildLoopNest,
/// outermost loop first.
struct LoopNest {
|
||||
  /// Returns the results of the outermost loop of the nest.
  /// NOTE(review): assumes `loops` is non-empty — callers must check
  /// `loops.empty()` first (as GenerateLoopNest::doit does).
  ResultRange getResults() { return loops.front().getResults(); }
|
||||
  LoopVector loops;
|
||||
};
|
||||
|
||||
/// Creates a perfect nest of "for" loops, i.e. all loops but the innermost
|
||||
/// contain only another loop and a terminator. The lower, upper bounds and
|
||||
|
@ -65,11 +70,12 @@ using ValueVector = std::vector<Value>;
|
|||
/// yielded from the loop body and forwarded back through the loop nest. If the
|
||||
/// function is not provided, the loop nest is not expected to have iteration
|
||||
/// arguments, the body of the innermost loop will be left empty, containing
|
||||
/// only the zero-operand terminator. Returns the values yielded by the
|
||||
/// outermost loop. If bound arrays are empty, the body builder will be called
|
||||
/// only the zero-operand terminator. Returns the LoopNest containing the list
|
||||
/// of perfectly nested scf::ForOp built during the call.
|
||||
/// If bound arrays are empty, the body builder will be called
|
||||
/// once to construct the IR outside of the loop with an empty list of induction
|
||||
/// variables.
|
||||
ValueVector buildLoopNest(
|
||||
LoopNest buildLoopNest(
|
||||
OpBuilder &builder, Location loc, ValueRange lbs, ValueRange ubs,
|
||||
ValueRange steps, ValueRange iterArgs,
|
||||
function_ref<ValueVector(OpBuilder &, Location, ValueRange, ValueRange)>
|
||||
|
@ -78,7 +84,8 @@ ValueVector buildLoopNest(
|
|||
/// A convenience version for building loop nests without iteration arguments
|
||||
/// (like for reductions). Does not take the initial value of reductions or
|
||||
/// expect the body building functions to return their current value.
|
||||
ValueVector buildLoopNest(OpBuilder &builder, Location loc, ValueRange lbs,
|
||||
/// The built nested scf::ForOp are returned in the LoopNest result.
|
||||
LoopNest buildLoopNest(OpBuilder &builder, Location loc, ValueRange lbs,
|
||||
ValueRange ubs, ValueRange steps,
|
||||
function_ref<void(OpBuilder &, Location, ValueRange)>
|
||||
bodyBuilder = nullptr);
|
||||
|
|
|
@ -24,6 +24,7 @@
|
|||
#include "mlir/IR/Matchers.h"
|
||||
#include "mlir/IR/OpImplementation.h"
|
||||
#include "mlir/Pass/Pass.h"
|
||||
#include "mlir/Transforms/LoopUtils.h"
|
||||
|
||||
using namespace mlir;
|
||||
using namespace mlir::linalg;
|
||||
|
@ -171,10 +172,27 @@ void GenerateLoopNest<scf::ForOp>::doit(
|
|||
ArrayRef<Range> loopRanges, ValueRange iterArgInitValues,
|
||||
ArrayRef<Attribute> iteratorTypes,
|
||||
function_ref<scf::ValueVector(ValueRange, ValueRange)> bodyBuilderFn,
|
||||
Optional<LinalgLoopDistributionOptions>) {
|
||||
Optional<LinalgLoopDistributionOptions> distributionOptions) {
|
||||
// Create procInfo so that it dominates the loops, if appropriate.
|
||||
OpBuilder &builder = edsc::ScopedContext::getBuilderRef();
|
||||
Location loc = edsc::ScopedContext::getLocation();
|
||||
SmallVector<ProcInfo, 2> procInfo;
|
||||
if (distributionOptions.hasValue())
|
||||
procInfo = distributionOptions->procInfo(builder, loc, ArrayRef<Range>{});
|
||||
|
||||
SmallVector<Value, 4> lbs, ubs, steps;
|
||||
unpackRanges(loopRanges, lbs, ubs, steps);
|
||||
edsc::loopNestBuilder(lbs, ubs, steps, iterArgInitValues, bodyBuilderFn);
|
||||
LoopNest loopNest =
|
||||
edsc::loopNestBuilder(lbs, ubs, steps, iterArgInitValues, bodyBuilderFn);
|
||||
|
||||
if (!distributionOptions.hasValue() || loopNest.loops.empty())
|
||||
return;
|
||||
|
||||
// TODO: support distributionMethod, which is currently ignored.
|
||||
for (auto it : llvm::zip(loopNest.loops, procInfo,
|
||||
distributionOptions->distributionMethod))
|
||||
mapLoopToProcessorIds(std::get<0>(it), std::get<1>(it).procId,
|
||||
std::get<1>(it).nprocs);
|
||||
}
|
||||
|
||||
/// Specialization to build affine "for" nest.
|
||||
|
|
|
@ -14,7 +14,7 @@
|
|||
using namespace mlir;
|
||||
using namespace mlir::edsc;
|
||||
|
||||
mlir::scf::ValueVector
|
||||
mlir::scf::LoopNest
|
||||
mlir::edsc::loopNestBuilder(ValueRange lbs, ValueRange ubs, ValueRange steps,
|
||||
function_ref<void(ValueRange)> fun) {
|
||||
// Delegates actual construction to scf::buildLoopNest by wrapping `fun` into
|
||||
|
@ -29,7 +29,7 @@ mlir::edsc::loopNestBuilder(ValueRange lbs, ValueRange ubs, ValueRange steps,
|
|||
});
|
||||
}
|
||||
|
||||
mlir::scf::ValueVector
|
||||
mlir::scf::LoopNest
|
||||
mlir::edsc::loopNestBuilder(Value lb, Value ub, Value step,
|
||||
function_ref<void(Value)> fun) {
|
||||
// Delegates to the ValueRange-based version by wrapping the lambda.
|
||||
|
@ -42,7 +42,7 @@ mlir::edsc::loopNestBuilder(Value lb, Value ub, Value step,
|
|||
wrapper);
|
||||
}
|
||||
|
||||
mlir::scf::ValueVector mlir::edsc::loopNestBuilder(
|
||||
mlir::scf::LoopNest mlir::edsc::loopNestBuilder(
|
||||
Value lb, Value ub, Value step, ValueRange iterArgInitValues,
|
||||
function_ref<scf::ValueVector(Value, ValueRange)> fun) {
|
||||
// Delegates actual construction to scf::buildLoopNest by wrapping `fun` into
|
||||
|
@ -61,7 +61,7 @@ mlir::scf::ValueVector mlir::edsc::loopNestBuilder(
|
|||
});
|
||||
}
|
||||
|
||||
mlir::scf::ValueVector mlir::edsc::loopNestBuilder(
|
||||
mlir::scf::LoopNest mlir::edsc::loopNestBuilder(
|
||||
ValueRange lbs, ValueRange ubs, ValueRange steps,
|
||||
ValueRange iterArgInitValues,
|
||||
function_ref<scf::ValueVector(ValueRange, ValueRange)> fun) {
|
||||
|
|
|
@ -305,7 +305,7 @@ void ForOp::getNumRegionInvocations(ArrayRef<Attribute> operands,
|
|||
step.getValue().getSExtValue());
|
||||
}
|
||||
|
||||
ValueVector mlir::scf::buildLoopNest(
|
||||
LoopNest mlir::scf::buildLoopNest(
|
||||
OpBuilder &builder, Location loc, ValueRange lbs, ValueRange ubs,
|
||||
ValueRange steps, ValueRange iterArgs,
|
||||
function_ref<ValueVector(OpBuilder &, Location, ValueRange, ValueRange)>
|
||||
|
@ -323,7 +323,7 @@ ValueVector mlir::scf::buildLoopNest(
|
|||
assert(results.size() == iterArgs.size() &&
|
||||
"loop nest body must return as many values as loop has iteration "
|
||||
"arguments");
|
||||
return results;
|
||||
return LoopNest();
|
||||
}
|
||||
|
||||
// First, create the loop structure iteratively using the body-builder
|
||||
|
@ -372,11 +372,13 @@ ValueVector mlir::scf::buildLoopNest(
|
|||
builder.setInsertionPointToEnd(loops.back().getBody());
|
||||
builder.create<scf::YieldOp>(loc, results);
|
||||
|
||||
// Return the results of the outermost loop.
|
||||
return ValueVector(loops.front().result_begin(), loops.front().result_end());
|
||||
// Return the loops.
|
||||
LoopNest res;
|
||||
res.loops.assign(loops.begin(), loops.end());
|
||||
return res;
|
||||
}
|
||||
|
||||
ValueVector mlir::scf::buildLoopNest(
|
||||
LoopNest mlir::scf::buildLoopNest(
|
||||
OpBuilder &builder, Location loc, ValueRange lbs, ValueRange ubs,
|
||||
ValueRange steps,
|
||||
function_ref<void(OpBuilder &, Location, ValueRange)> bodyBuilder) {
|
||||
|
|
|
@ -172,3 +172,43 @@ func @gemm6(%a : memref<?x?xf32>, %b : memref<?x?xf32>, %c : memref<?x?xf32>)
|
|||
// CHECK: %[[OFFSETX_2:.*]] = affine.apply #[[MAP0]]()[%[[BIDX]]]
|
||||
// CHECK: %[[SV3:.*]] = subview %[[ARG2]][%[[ARG3]], %[[OFFSETX_2]]]
|
||||
// CHECK: linalg.matmul ins(%[[SV1]], %[[SV2]]{{.*}} outs(%[[SV3]]
|
||||
|
||||
// -----
|
||||
|
||||
// CHECK-LABEL: func @matmul_tensors(
|
||||
// CHECK-SAME: %[[TA:[0-9a-z]+]]: tensor<?x?xf32>
|
||||
// CHECK-SAME: %[[TB:[0-9a-z]+]]: tensor<?x?xf32>
|
||||
// CHECK-SAME: %[[TC:[0-9a-z]+]]: tensor<?x?xf32>) -> tensor<?x?xf32> {
|
||||
func @matmul_tensors(
|
||||
%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2: tensor<?x?xf32>)
|
||||
-> tensor<?x?xf32> {
|
||||
// CHECK: %[[C8:.*]] = constant 8 : index
|
||||
// CHECK: %[[BIDY:.*]] = "gpu.block_id"() {dimension = "y"}
|
||||
// CHECK: %[[NBLOCKSY:.*]] = "gpu.grid_dim"() {dimension = "y"}
|
||||
// CHECK: %[[BIDX:.*]] = "gpu.block_id"() {dimension = "x"}
|
||||
// CHECK: %[[NBLOCKSX:.*]] = "gpu.grid_dim"() {dimension = "x"}
|
||||
// CHECK: %[[LBY:.*]] = muli %[[BIDY]], %[[C8]] : index
|
||||
// CHECK: %[[STEPY:.*]] = muli %[[NBLOCKSY]], %[[C8]] : index
|
||||
// CHECK: %[[TD0:.*]] = scf.for {{.*}} to {{.*}} step {{.*}} iter_args(%[[TC0:.*]] = %[[TC]]) -> (tensor<?x?xf32>) {
|
||||
// CHECK: %[[LBX:.*]] = muli %[[BIDX]], %[[C8]] : index
|
||||
// CHECK: %[[STEPX:.*]] = muli %[[NBLOCKSX]], %[[C8]] : index
|
||||
// CHECK: %[[TD1:.*]] = scf.for {{.*}} to {{.*}} step {{.*}} iter_args(%[[TC1:.*]] = %[[TC0]]) -> (tensor<?x?xf32>) {
|
||||
// CHECK: %[[TD2:.*]] = scf.for {{.*}} to {{.*}} step {{.*}} iter_args(%[[TC2:.*]] = %[[TC1]]) -> (tensor<?x?xf32>) {
|
||||
// CHECK: %[[sTA:.*]] = subtensor %[[TA]][{{.*}}] : tensor<?x?xf32> to tensor<?x?xf32>
|
||||
// CHECK: %[[sTB:.*]] = subtensor %[[TB]][{{.*}}] : tensor<?x?xf32> to tensor<?x?xf32>
|
||||
// CHECK: %[[sTC:.*]] = subtensor %[[TC2]][{{.*}}] : tensor<?x?xf32> to tensor<?x?xf32>
|
||||
// CHECK: %[[sTD:.*]] = linalg.matmul ins(%[[sTA]], %[[sTB]] : tensor<?x?xf32>, tensor<?x?xf32>)
|
||||
// CHECK-SAME: init(%[[sTC]] : tensor<?x?xf32>) -> tensor<?x?xf32>
|
||||
// CHECK: %[[TD:.*]] = subtensor_insert %[[sTD]] into %[[TC2]][{{.*}}] : tensor<?x?xf32> into tensor<?x?xf32>
|
||||
// CHECK: scf.yield %[[TD]] : tensor<?x?xf32>
|
||||
// CHECK: scf.yield %[[TD2]] : tensor<?x?xf32>
|
||||
// CHECK: scf.yield %[[TD1]] : tensor<?x?xf32>
|
||||
%0 = linalg.matmul {__internal_linalg_transform__ = "tensors_distribute1"}
|
||||
ins(%arg0, %arg1: tensor<?x?xf32>, tensor<?x?xf32>)
|
||||
init(%arg2: tensor<?x?xf32>)
|
||||
-> tensor<?x?xf32>
|
||||
|
||||
// CHECK: return %[[TD0]] : tensor<?x?xf32>
|
||||
return %0 : tensor<?x?xf32>
|
||||
}
|
||||
|
||||
|
|
|
@ -1223,7 +1223,7 @@ TEST_FUNC(builder_loop_for_yield) {
|
|||
[&](Value iv, ValueRange args) {
|
||||
Value sum = args[0] + args[1];
|
||||
return scf::ValueVector{args[1], sum};
|
||||
});
|
||||
}).getResults();
|
||||
results[0] + results[1];
|
||||
|
||||
// clang-format off
|
||||
|
|
|
@ -409,6 +409,22 @@ static void fillTileAndDistributePatterns(MLIRContext *context,
|
|||
LinalgMarker(Identifier::get("distribute6", context),
|
||||
Identifier::get("after_distribute6", context)));
|
||||
}
|
||||
|
||||
{
|
||||
LinalgLoopDistributionOptions cyclicNprocsEqNiters;
|
||||
cyclicNprocsEqNiters.distributionMethod.resize(
|
||||
2, DistributionMethod::CyclicNumProcsEqNumIters);
|
||||
cyclicNprocsEqNiters.procInfo =
|
||||
getGpuProcIds<gpu::BlockIdOp, gpu::GridDimOp>;
|
||||
patterns.insert<LinalgTilingPattern<MatmulOp>>(
|
||||
context,
|
||||
LinalgTilingOptions()
|
||||
.setTileSizes({8, 8, 4})
|
||||
.setLoopType(LinalgTilingLoopType::Loops)
|
||||
.setDistributionOptions(cyclicNprocsEqNiters),
|
||||
LinalgMarker(Identifier::get("tensors_distribute1", context),
|
||||
Identifier::get("tensors_after_distribute1", context)));
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
|
|
Loading…
Reference in New Issue