[mlir][transform] Create GPU transform dialect

This revision adds the GPU transform dialect. It also introduces the `transform.gpu` prefix for all ops in this dialect.

MLIR already had two GPU transform ops among the Linalg transform ops. This revision moves them into GPUTransformOps. The ops are renamed as follows:

`transform.structured.map_nested_foreach_thread_to_gpu_blocks` -> `transform.gpu.map_foreach_to_blocks`
This op selects the outermost (top-level) `scf.foreach_thread` and parallelizes it across GPU blocks. It can also generate the enclosing `gpu.launch` op.

`transform.structured.map_nested_foreach_thread_to_gpu_threads` -> `transform.gpu.map_nested_foreach_to_threads`
This op parallelizes the `scf.foreach_thread` ops nested inside a `gpu.launch` across GPU threads.

This revision does not add new functionality; it only includes some minor refactoring of the existing code.
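As an illustrative usage sketch, the updated test file in this revision drives the renamed ops roughly as follows (the match pattern, handle names, and block sizes are taken from those tests; the surrounding `transform.with_pdl_patterns` wrapper is omitted):

```
transform.sequence %arg0 failures(propagate) {
^bb1(%arg1: !pdl.operation):
  %funcop = transform.structured.match ops{["func.func"]} in %arg0
  // Wrap the top-level scf.foreach_thread in a generated gpu.launch and map
  // it to GPU blocks.
  %gpuLaunch = transform.gpu.map_foreach_to_blocks %funcop { generate_gpu_launch }
  // Map the scf.foreach_thread ops nested in the launch to GPU threads.
  transform.gpu.map_nested_foreach_to_threads %gpuLaunch { blockDim = [32, 4, 1] }
}
```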

Reviewed By: ftynse

Differential Revision: https://reviews.llvm.org/D134800
Guray Ozen 2022-10-03 09:56:42 +02:00
parent 491ac8f3e8
commit 89bb0cae46
13 changed files with 851 additions and 572 deletions


@@ -1,2 +1,3 @@
add_subdirectory(IR)
add_subdirectory(Transforms)
add_subdirectory(TransformOps)


@@ -0,0 +1,6 @@
set(LLVM_TARGET_DEFINITIONS GPUTransformOps.td)
mlir_tablegen(GPUTransformOps.h.inc -gen-op-decls)
mlir_tablegen(GPUTransformOps.cpp.inc -gen-op-defs)
add_public_tablegen_target(MLIRGPUTransformOpsIncGen)
add_mlir_doc(GPUTransformOps GPUTransformOps Dialects/ -gen-op-doc)


@@ -0,0 +1,75 @@
//===- GPUTransformOps.h - GPU transform ops --------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef MLIR_DIALECT_GPU_TRANSFORMOPS_GPUTRANSFORMOPS_H
#define MLIR_DIALECT_GPU_TRANSFORMOPS_GPUTRANSFORMOPS_H
#include "mlir/Dialect/PDL/IR/PDLTypes.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
#include "mlir/IR/OpImplementation.h"
#include "mlir/IR/PatternMatch.h"
namespace mlir {
namespace gpu {
class GpuOp;
} // namespace gpu
} // namespace mlir
//===----------------------------------------------------------------------===//
// GPU Transform Operations
//===----------------------------------------------------------------------===//
#define GET_OP_CLASSES
#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h.inc"
namespace mlir {
class DialectRegistry;
namespace transform {
namespace gpu {
/// Searches `scf.foreach_thread` ops nested under `target` and maps each such
/// op to GPU threads. Mapping is one-to-one and the induction variables of
/// `scf.foreach_thread` are rewritten to gpu.thread_id according to the
/// thread_dim_mapping attribute. Sibling `scf.foreach_thread` are supported in
/// which case, the union of the number of threads is computed and may result in
/// predication. Dynamic `scf.foreach_thread` trip counts are currently not
/// supported. Dynamic block dim sizes are currently not supported.
DiagnosedSilenceableFailure
mapNestedForeachToThreadsImp(RewriterBase &rewriter, Operation *target,
const SmallVectorImpl<int64_t> &blockDim,
bool syncAfterDistribute,
llvm::Optional<TransformOpInterface> transformOp);
/// Maps the top level `scf.foreach_thread` op to GPU Thread Blocks. Mapping is
/// one-to-one and the induction variables of `scf.foreach_thread` are rewritten
/// to gpu.block_id according to the thread_dim_mapping attribute. Dynamic
/// `scf.foreach_thread` trip counts are currently not supported. Dynamic block
/// dim sizes are currently not supported.
DiagnosedSilenceableFailure mapForeachToBlocksImp(
RewriterBase &rewriter, scf::ForeachThreadOp foreachThreadOp,
function_ref<void(RewriterBase &, scf::ForeachThreadOp,
SmallVectorImpl<Value> &)>
blockIdGenerator,
SmallVectorImpl<int64_t> &gridDims, TransformOpInterface transformOp);
/// Finds the top level scf::ForeachThreadOp of given target.
DiagnosedSilenceableFailure
findTopLevelForeachThreadOp(Operation *target,
scf::ForeachThreadOp &topLevelForeachThreadOp,
TransformOpInterface transformOp);
} // namespace gpu
} // namespace transform
namespace gpu {
void registerTransformDialectExtension(DialectRegistry &registry);
} // namespace gpu
} // namespace mlir
#endif // MLIR_DIALECT_GPU_TRANSFORMOPS_GPUTRANSFORMOPS_H


@@ -0,0 +1,175 @@
//===- GPUTransformOps.td - GPU transform ops --------------*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef GPU_TRANSFORM_OPS
#define GPU_TRANSFORM_OPS
include "mlir/Dialect/Transform/IR/TransformDialect.td"
include "mlir/Dialect/Transform/IR/TransformEffects.td"
include "mlir/Dialect/Transform/IR/TransformInterfaces.td"
include "mlir/Dialect/PDL/IR/PDLTypes.td"
include "mlir/Interfaces/SideEffectInterfaces.td"
include "mlir/IR/OpBase.td"
def MapNestedForeachToThreads :
Op<Transform_Dialect, "gpu.map_nested_foreach_to_threads",
[FunctionalStyleTransformOpTrait,
MemoryEffectsOpInterface,
TransformEachOpTrait,
TransformOpInterface]> {
let description = [{
Target the `gpu.launch` op and rewrite all `scf.foreach_thread` ops
nested in it so that they are distributed to `gpu.thread_id`.
The operation searches for `scf.foreach_thread` ops nested under `target`
and maps each such op to GPU threads. Mapping is one-to-one and the
induction variables of `scf.foreach_thread` are rewritten to
`gpu.thread_id` according to the `thread_dim_mapping` attribute.
Sibling `scf.foreach_thread` are supported in which case, the union of
the number of threads is computed and may result in predication.
Multiple scf.foreach_thread are supported per `gpu.launch` in which case,
the max of all the threads is computed and taken for the global
`gpu.thread_id`. If necessary, `scf.foreach_thread` that do not use the
whole thread range result in predicated computations.
Dynamic `scf.foreach_thread` trip counts are currently not supported.
Dynamic block dim sizes are currently not supported.
Only **bufferized** `scf.foreach_thread` are currently supported.
Only `scf.foreach_thread` distributed to **at most 3 dimensions** are
currently supported.
Barriers are inserted after each scf.foreach_thread op for now.
The operation alters the block size of the given `gpu.launch` using the
`blockDim` argument.
#### Return modes:
This operation ignores non-`gpu.launch` ops and drops them in the return.
If any scf.foreach_thread with tensors is found, the transform definitely
fails.
If all the scf.foreach_thread operations contained within the LaunchOp
referred to by the `target` PDLOperation lower to GPU properly, the
transform succeeds. Otherwise the transform definitely fails.
The returned handle points to the same LaunchOp operand, consuming it and
producing a new SSA value to satisfy chaining and linearity of the IR
properties.
#### Example:
```
gpu.launch blocks(%bx, %by, %bz) in (%x = %0, %y = %1, %z = %2)
threads(%tx, %ty, %tz) in (%tx = %3, %ty = %4, %tz = %5) {
scf.foreach_thread (%i, %j) in (7, 9) {
... // body 1
} {thread_dim_mapping = [1, 0, 2]}
scf.foreach_thread (%i) in (12) {
... // body 2
}
gpu.terminator
}
```
is translated to:
```
%bdimX = arith.constant 12 : index
%bdimY = arith.constant 9 : index
gpu.launch blocks(%bx, %by, %bz) in (%x = %0, %y = %1, %z = %2)
threads(%tx, %ty, %tz) in (%tx = %bdimX, %ty = %bdimY, %tz = %5) {
if (threadIdx.x < 9 && threadIdx.y < 7) {
... // body 1
}
gpu.barrier
if (threadIdx.y < 1) {
... // body 2
}
gpu.barrier
gpu.terminator
}
```
}];
let arguments = (ins PDL_Operation:$target,
DefaultValuedAttr<I64ArrayAttr, "{}">:$blockDim,
DefaultValuedAttr<BoolAttr, "true">:$syncAfterDistribute);
let results = (outs PDL_Operation:$result);
let assemblyFormat = "$target attr-dict";
let extraClassDeclaration = [{
::mlir::DiagnosedSilenceableFailure applyToOne(
::mlir::Operation *target,
::llvm::SmallVectorImpl<::mlir::Operation *> &results,
::mlir::transform::TransformState &state);
}];
}
def MapForeachToBlocks :
Op<Transform_Dialect, "gpu.map_foreach_to_blocks",
[FunctionalStyleTransformOpTrait,
MemoryEffectsOpInterface,
TransformOpInterface,
TransformEachOpTrait]> {
let description = [{
Target the `gpu.launch` op and rewrite the top-level `scf.foreach_thread`
so that it is distributed to `gpu.block_id`. If the `generate_gpu_launch`
attribute is set, a `gpu.launch` is generated first and the top-level
`scf.foreach_thread` is moved inside it.
The operation searches for top-level `scf.foreach_thread` ops under
`gpu.launch` and maps each such op to GPU blocks. Mapping is
one-to-one and the induction variables of `scf.foreach_thread` are
rewritten to `gpu.block_id` according to the `thread_dim_mapping` attribute.
Dynamic `scf.foreach_thread` trip counts are currently not supported.
Dynamic block dim sizes are currently not supported.
Only **bufferized** scf.foreach_thread are currently supported.
Only scf.foreach_thread distributed to **at most 3 dimensions** are
currently supported.
The operation alters the grid size of the given `gpu.launch` using the
`gridDim` argument.
#### Return modes:
This operation ignores non-`gpu.launch` ops and drops them in the return.
If any scf.foreach_thread with tensors is found, the transform definitely
fails.
If all the scf.foreach_thread operations contained within the LaunchOp
referred to by the `target` PDLOperation lower to GPU properly, the
transform succeeds. Otherwise the transform definitely fails.
The returned handle points to the same LaunchOp operand, consuming it and
producing a new SSA value to satisfy chaining and linearity of the IR
properties.
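#### Example:
A minimal usage sketch (illustrative only; the match pattern and handle
names are taken from the tests updated in this revision):
```
%funcop = transform.structured.match ops{["func.func"]} in %arg0
%gpuLaunch = transform.gpu.map_foreach_to_blocks %funcop { generate_gpu_launch }
```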
}];
let arguments = (ins PDL_Operation:$target,
DefaultValuedAttr<I64ArrayAttr, "{}">:$gridDim,
UnitAttr:$generate_gpu_launch);
let results = (outs PDL_Operation:$result);
let assemblyFormat = "$target attr-dict";
let extraClassDeclaration = [{
::mlir::DiagnosedSilenceableFailure applyToOne(
::mlir::Operation *target,
::llvm::SmallVectorImpl<::mlir::Operation *> &results,
::mlir::transform::TransformState &state);
}];
}
#endif // GPU_TRANSFORM_OPS


@@ -751,161 +751,6 @@ def TileToForeachThreadOp :
}];
}
def MapNestedForeachThreadToGpuThreads :
Op<Transform_Dialect, "structured.map_nested_foreach_thread_to_gpu_threads",
[FunctionalStyleTransformOpTrait,
MemoryEffectsOpInterface,
TransformEachOpTrait,
TransformOpInterface]> {
let description = [{
Target the gpu_launch op and rewrite all scf.foreach_thread
to distributed gpu.thread_id attribute.
The operation searches `scf.foreach_thread` ops nested under `target`
and maps each such op to GPU threads. Mapping is one-to-one and the
induction variables of `scf.foreach_thread` are rewritten to
gpu.thread_id according to the thread_dim_apping attribute.
Sibling `scf.foreach_thread` are supported in which case, the union of
the number of threads is computed and may result in predication.
Multiple scf.foreach_thread are supported per function in which case, the
max of all the threads is computed and taken for the global gpu.thread_id.
If necessary, scf.foreach_thread that do not use the whole thread range
result in predicated computations.
Dynamic, `scf.foreach_thread` trip counts are currently not supported.
Dynamic block dim sizes are currently not supported.
Only **bufferized** scf.foreach_thread are currently supported.
Only scf.foreach_thread distributed to **at most 3 dimensions** are
currently supported.
Barriers are inserted after each scf.foreach_thread op for now.
The operation alters the block size of the given gpu_launch using
blockDim argument.
#### Return modes:
This operation ignores non-gpu_launch ops and drops them in the return.
If any scf.foreach_thread with tensors is found, the transform definitely
fails.
If all the scf.foreach_thread operations contained within the LaunchOp
referred to by the `target` PDLOperation lower to GPU properly, the
transform succeeds. Otherwise the transform definitely fails.
The returned handle points to the same LaunchOp operand, consuming it and
producing a new SSA value to satisfy chaining and linearity of the IR
properties.
#### Example:
```
gpu.launch blocks(%bx, %by, %bz) in (%x = %0, %y = %1, %z = %2)
threads(%tx, %ty, %tz) in (%tx = %3, %ty = %4, %tz = %5) {
scf.foreach_thread (%i, %j) in (7, 9) {
... // body 1
} {thread_dim_mapping = [1, 0, 2]}
scf.foreach_thread (%i) in (12) {
... // body 2
}
gpu.terminator
}
```
is translated to:
```
%bdimX = arith.constant 12 : index
%bdimY = arith.constant 9 : index
gpu.launch blocks(%bx, %by, %bz) in (%x = %0, %y = %1, %z = %2)
threads(%tx, %ty, %tz) in (%tx = %bdimX, %ty = %bdimY, %tz = %5) {
if (threadIdx.x < 9 && threadIdx.y < 7) {
... // body 1
}
gpu.barrier
if (threadIdx.y < 1) {
... // body 2
}
gpu.barrier
gpu.terminator
}
```
}];
let arguments = (ins PDL_Operation:$target,
DefaultValuedAttr<I64ArrayAttr, "{}">:$blockDim,
DefaultValuedAttr<BoolAttr, "true">:$syncAfterDistribute);
let results = (outs PDL_Operation:$result);
let assemblyFormat = "$target attr-dict";
let extraClassDeclaration = [{
::mlir::DiagnosedSilenceableFailure applyToOne(
::mlir::Operation *target,
::llvm::SmallVectorImpl<::mlir::Operation *> &results,
::mlir::transform::TransformState &state);
}];
}
def MapNestedForeachThreadToGpuBlocks : Op<Transform_Dialect,
"structured.map_nested_foreach_thread_to_gpu_blocks",
[FunctionalStyleTransformOpTrait,
MemoryEffectsOpInterface,
TransformOpInterface,
TransformEachOpTrait]> {
let description = [{
Target the gpu_launch op and rewrite the top level `scf.foreach_thread`
to distributed gpu.block_id attribute. If `generate_gpu_launch` attribute
is set, then first generates `gpu_launch` and moves the top level
`scf.foreach_thread` inside.
The operation searches top level `scf.foreach_thread` ops under
`gpu_launch` and maps each such op to GPU blocks. Mapping is
one-to-one and the induction variables of `scf.foreach_thread` are
rewritten to gpu.block_id according to the `thread_dim_apping` attribute.
Dynamic, `scf.foreach_thread` trip counts are currently not supported.
Dynamic block dim sizes are currently not supported.
Only **bufferized** scf.foreach_thread are currently supported.
Only scf.foreach_thread distributed to **at most 3 dimensions** are
currently supported.
The operation alters the block size of the given gpu_launch using
gridDim argument.
#### Return modes:
This operation ignores non-gpu_launch ops and drops them in the return.
If any scf.foreach_thread with tensors is found, the transform definitely
fails.
If all the scf.foreach_thread operations contained within the LaunchOp
referred to by the `target` PDLOperation lower to GPU properly, the
transform succeeds. Otherwise the transform definitely fails.
The returned handle points to the same LaunchOp operand, consuming it and
producing a new SSA value to satisfy chaining and linearity of the IR
properties.
}];
let arguments = (ins PDL_Operation:$target,
DefaultValuedAttr<I64ArrayAttr, "{}">:$gridDim,
UnitAttr:$generate_gpu_launch);
let results = (outs PDL_Operation:$result);
let assemblyFormat = "$target attr-dict";
let extraClassDeclaration = [{
::mlir::DiagnosedSilenceableFailure applyToOne(
::mlir::Operation *target,
::llvm::SmallVectorImpl<::mlir::Operation *> &results,
::mlir::transform::TransformState &state);
}];
}
def VectorizeOp : Op<Transform_Dialect, "structured.vectorize",
[FunctionalStyleTransformOpTrait, MemoryEffectsOpInterface,
TransformEachOpTrait, TransformOpInterface]> {


@@ -125,32 +125,6 @@ bool areElementwiseOpsFusable(OpOperand *fusedOperand);
FailureOr<Operation *> fuseElementwiseOps(RewriterBase &rewriter,
OpOperand *fusedOperand);
/// Maps the top level `scf.foreach_thread` op to GPU Thread Blocks. Mapping is
/// one-to-one and the induction variables of `scf.foreach_thread` are rewritten
/// to gpu.block_id according to the thread_dim_apping attribute. Dynamic,
/// `scf.foreach_thread` trip counts are currently not supported. Dynamic block
/// dim sizes are currently not supported.
LogicalResult rewriteTopLevelForeachThreadToGpuBlocks(
RewriterBase &rewriter, scf::ForeachThreadOp foreachThreadOp,
function_ref<void(RewriterBase &, scf::ForeachThreadOp,
SmallVector<Value> &)>
blockIdGenerator,
SmallVector<int64_t> &gridDims);
/// Finds the top level scf::ForeachThreadOp of given target.
FailureOr<scf::ForeachThreadOp> findTopLevelForeachThreadOp(Operation *target);
/// Searches `scf.foreach_thread` ops nested under `target` and maps each such
/// op to GPU threads. Mapping is one-to-one and the induction variables of
/// `scf.foreach_thread` are rewritten to gpu.thread_id according to the
/// thread_dim_apping attribute. Sibling `scf.foreach_thread` are supported in
/// which case, the union of the number of threads is computed and may result in
/// predication. Dynamic, `scf.foreach_thread` trip counts are currently not
/// supported. Dynamic block dim sizes are currently not supported.
mlir::WalkResult rewriteMapNestedForeachThreadToGpuThreads(
RewriterBase &rewriter, Operation *target,
const SmallVector<int64_t> &blockDim, bool syncAfterDistribute);
/// Split the given `op` into two parts along the given iteration space
/// `dimension` at the specified `splitPoint`, and return the two parts.
///


@@ -31,6 +31,7 @@
#include "mlir/Dialect/EmitC/IR/EmitC.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
@@ -115,6 +116,7 @@ inline void registerAllDialects(DialectRegistry &registry) {
linalg::registerTransformDialectExtension(registry);
memref::registerTransformDialectExtension(registry);
scf::registerTransformDialectExtension(registry);
gpu::registerTransformDialectExtension(registry);
// Register all external models.
arith::registerBufferizableOpInterfaceExternalModels(registry);


@@ -82,6 +82,8 @@ add_mlir_dialect_library(MLIRGPUTransforms
MLIRTransformUtils
)
add_subdirectory(TransformOps)
if(MLIR_ENABLE_CUDA_RUNNER)
if(NOT MLIR_ENABLE_CUDA_CONVERSIONS)
message(SEND_ERROR


@@ -0,0 +1,18 @@
add_mlir_dialect_library(MLIRGPUTransformOps
GPUTransformOps.cpp
ADDITIONAL_HEADER_DIRS
${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/GPU/TransformOps
DEPENDS
MLIRGPUTransformOpsIncGen
LINK_LIBS PUBLIC
MLIRIR
MLIRGPUTransforms
MLIRParser
MLIRPDLDialect
MLIRSideEffectInterfaces
MLIRTransformDialect
MLIRGPUOps
)


@@ -0,0 +1,507 @@
//===- GPUTransformOps.cpp - Implementation of GPU transform ops ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h"
#include "mlir/Dialect/PDL/IR/PDL.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/Transform/IR/TransformDialect.h"
#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
#include "mlir/IR/Diagnostics.h"
#include "mlir/IR/Value.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
using namespace mlir;
using namespace mlir::gpu;
using namespace mlir::transform;
namespace {
/// A simple pattern rewriter that implements no special logic.
class SimpleRewriter : public PatternRewriter {
public:
SimpleRewriter(MLIRContext *context) : PatternRewriter(context) {}
};
} // namespace
/// Determines if the size of the kernel configuration is supported by the GPU
/// architecture being used. It currently uses CUDA limits; this may be
/// generalized for other GPU architectures.
static DiagnosedSilenceableFailure
checkGpuLimits(TransformOpInterface transformOp, Optional<int64_t> gridDimX,
Optional<int64_t> gridDimY, Optional<int64_t> gridDimZ,
Optional<int64_t> blockDimX, Optional<int64_t> blockDimY,
Optional<int64_t> blockDimZ) {
static constexpr int max_total_blockdim = 1024;
static constexpr int max_blockdimx = 1024;
static constexpr int max_blockdimy = 1024;
static constexpr int max_blockdimz = 64;
static constexpr int max_total_griddim = 2147483647;
static constexpr int max_griddimx = 2147483647;
static constexpr int max_griddimy = 65535;
static constexpr int max_griddimz = 65535;
if ((blockDimX.value_or(1) * blockDimY.value_or(1) * blockDimZ.value_or(1)) >
max_total_blockdim ||
(gridDimX.value_or(1) * gridDimY.value_or(1) * gridDimZ.value_or(1)) >
max_total_griddim ||
blockDimX.value_or(1) > max_blockdimx ||
blockDimY.value_or(1) > max_blockdimy ||
blockDimZ.value_or(1) > max_blockdimz ||
gridDimY.value_or(1) > max_griddimy ||
gridDimZ.value_or(1) > max_griddimz ||
gridDimX.value_or(1) > max_griddimx) {
return transformOp.emitSilenceableError()
<< "Trying to launch a GPU kernel with gridDim = ("
<< gridDimX.value_or(1) << ", " << gridDimY.value_or(1) << ", "
<< gridDimZ.value_or(1) << ") blockDim = (" << blockDimX.value_or(1)
<< ", " << blockDimY.value_or(1) << ", " << blockDimZ.value_or(1)
<< "). It is larger than the limits.";
}
return DiagnosedSilenceableFailure::success();
}
/// Creates an empty-body gpu::LaunchOp using the provided kernel settings and
/// puts a terminator within it.
static DiagnosedSilenceableFailure
createGpuLaunch(RewriterBase &rewriter, Location loc,
TransformOpInterface transformOp, LaunchOp &launchOp,
Optional<int64_t> gridDimX = llvm::None,
Optional<int64_t> gridDimY = llvm::None,
Optional<int64_t> gridDimZ = llvm::None,
Optional<int64_t> blockDimX = llvm::None,
Optional<int64_t> blockDimY = llvm::None,
Optional<int64_t> blockDimZ = llvm::None) {
DiagnosedSilenceableFailure diag =
checkGpuLimits(transformOp, gridDimX, gridDimY, gridDimZ, blockDimX,
blockDimY, blockDimZ);
if (!diag.succeeded())
return diag;
auto createConst = [&](int dim) {
return rewriter.create<arith::ConstantIndexOp>(loc, dim);
};
OpBuilder::InsertionGuard guard(rewriter);
Value one = createConst(1);
Value gridSizeX = gridDimX.has_value() ? createConst(gridDimX.value()) : one;
Value gridSizeY = gridDimY.has_value() ? createConst(gridDimY.value()) : one;
Value gridSizeZ = gridDimZ.has_value() ? createConst(gridDimZ.value()) : one;
Value blkSizeX = blockDimX.has_value() ? createConst(blockDimX.value()) : one;
Value blkSizeY = blockDimY.has_value() ? createConst(blockDimY.value()) : one;
Value blkSizeZ = blockDimZ.has_value() ? createConst(blockDimZ.value()) : one;
launchOp = rewriter.create<LaunchOp>(loc, gridSizeX, gridSizeY, gridSizeZ,
blkSizeX, blkSizeY, blkSizeZ);
rewriter.setInsertionPointToEnd(&launchOp.getBody().front());
rewriter.create<TerminatorOp>(loc);
return DiagnosedSilenceableFailure(success());
}
/// Alters the grid and block dimensions of the given kernel.
static DiagnosedSilenceableFailure
alterGpuLaunch(SimpleRewriter &rewriter, LaunchOp gpuLaunch,
TransformOpInterface transformOp,
Optional<int64_t> gridDimX = llvm::None,
Optional<int64_t> gridDimY = llvm::None,
Optional<int64_t> gridDimZ = llvm::None,
Optional<int64_t> blockDimX = llvm::None,
Optional<int64_t> blockDimY = llvm::None,
Optional<int64_t> blockDimZ = llvm::None) {
DiagnosedSilenceableFailure diag =
checkGpuLimits(transformOp, gridDimX, gridDimY, gridDimZ, blockDimX,
blockDimY, blockDimZ);
if (!diag.succeeded())
return diag;
KernelDim3 currentBlockdim = gpuLaunch.getBlockSizeOperandValues();
OpBuilder::InsertionGuard guard(rewriter);
rewriter.setInsertionPointAfterValue(currentBlockdim.x);
auto createConstValue = [&](int dim) {
return rewriter.create<arith::ConstantIndexOp>(currentBlockdim.x.getLoc(),
dim);
};
if (gridDimX.has_value())
gpuLaunch.getGridSizeXMutable().assign(createConstValue(gridDimX.value()));
if (gridDimY.has_value())
gpuLaunch.getGridSizeYMutable().assign(createConstValue(gridDimY.value()));
if (gridDimZ.has_value())
gpuLaunch.getGridSizeZMutable().assign(createConstValue(gridDimZ.value()));
if (blockDimX.has_value())
gpuLaunch.getBlockSizeXMutable().assign(
createConstValue(blockDimX.value()));
if (blockDimY.has_value())
gpuLaunch.getBlockSizeYMutable().assign(
createConstValue(blockDimY.value()));
if (blockDimZ.has_value())
gpuLaunch.getBlockSizeZMutable().assign(
createConstValue(blockDimZ.value()));
return DiagnosedSilenceableFailure::success();
}
//===----------------------------------------------------------------------===//
// MapForeachToBlocks
//===----------------------------------------------------------------------===//
DiagnosedSilenceableFailure mlir::transform::gpu::mapForeachToBlocksImp(
RewriterBase &rewriter, scf::ForeachThreadOp foreachThreadOp,
function_ref<void(RewriterBase &, scf::ForeachThreadOp,
SmallVectorImpl<Value> &)>
blockIdGenerator,
SmallVectorImpl<int64_t> &gridDims, TransformOpInterface transformOp) {
if (foreachThreadOp.getNumResults() > 0)
return transformOp.emitSilenceableError()
<< "only bufferized scf.foreach_thread lowers to gpu.block_id";
if (foreachThreadOp.getNumThreads().size() > 3)
return transformOp.emitSilenceableError()
<< "scf.foreach_thread with rank > 3 does not lower to gpu.block_id";
// Step 0. Outline the compute workload region and set up the workload
// operands.
FailureOr<SmallVector<OpFoldResult>> potentialGridDim =
foreachThreadOp.getPermutedNumThreads(rewriter);
if (failed(potentialGridDim) ||
llvm::any_of(*potentialGridDim, [](OpFoldResult ofr) {
return !getConstantIntValue(ofr).has_value();
})) {
return transformOp.emitSilenceableError() << "unsupported dynamic gridDim";
}
for (OpFoldResult ofr : *potentialGridDim)
gridDims.push_back(getConstantIntValue(ofr).value());
SmallVector<Value> blockOps;
blockIdGenerator(rewriter, foreachThreadOp, blockOps);
// Step 1. Move the body of foreachThreadOp.
// Erase the terminator first, it will not be used since we are on buffers.
rewriter.eraseOp(foreachThreadOp.getTerminator());
Block *targetBlock = foreachThreadOp->getBlock();
Block::iterator insertionPoint = Block::iterator(foreachThreadOp);
Block &sourceBlock = foreachThreadOp.getRegion().front();
targetBlock->getOperations().splice(insertionPoint,
sourceBlock.getOperations());
// Step 2. RAUW thread indices to thread ops.
SmallVector<Value> threadIndices =
*foreachThreadOp.getPermutedThreadIndices();
assert(blockOps.size() == 3 && "3 block id ops are required");
for (auto [blockIdx, blockOp] : llvm::zip(threadIndices, blockOps)) {
Value val = blockIdx;
Value blkOp = blockOp;
if (!val)
continue;
for (Operation *user : llvm::make_early_inc_range(val.getUsers()))
user->replaceUsesOfWith(val, blkOp);
}
// Step 3. Erase old op.
rewriter.eraseOp(foreachThreadOp);
return DiagnosedSilenceableFailure::success();
}
DiagnosedSilenceableFailure mlir::transform::gpu::findTopLevelForeachThreadOp(
Operation *target, scf::ForeachThreadOp &topLevelForeachThreadOp,
TransformOpInterface transformOp) {
auto walkResult = target->walk([&](scf::ForeachThreadOp foreachThreadOp) {
if (foreachThreadOp->getParentOfType<scf::ForeachThreadOp>())
return WalkResult::advance();
if (topLevelForeachThreadOp)
// TODO: Handle multiple foreach if there is no dependences between them
return WalkResult::interrupt();
topLevelForeachThreadOp = foreachThreadOp;
return WalkResult::advance();
});
if (walkResult.wasInterrupted())
return transformOp.emitSilenceableError()
<< "could not find a unique topLevel scf.foreach_thread";
return DiagnosedSilenceableFailure::success();
}
/// This is a helper that is only used in mapForeachToBlocksImp. It generates
/// the GPU dialect's block_id ops.
static void generateGpuBlockIds(RewriterBase &rewriter,
scf::ForeachThreadOp foreachOp,
SmallVectorImpl<Value> &blockOps) {
Location loc = foreachOp->getLoc();
OpBuilder::InsertionGuard guard(rewriter);
rewriter.setInsertionPoint(foreachOp);
IndexType indexType = rewriter.getIndexType();
SmallVector<Dimension> gpuDims{Dimension::x, Dimension::y, Dimension::z};
for (int64_t idx : llvm::seq<int64_t>(0, gpuDims.size())) {
blockOps.push_back(
rewriter.create<BlockIdOp>(loc, indexType, gpuDims[idx]));
}
}
DiagnosedSilenceableFailure
transform::MapForeachToBlocks::applyToOne(Operation *target,
SmallVectorImpl<Operation *> &results,
transform::TransformState &state) {
LaunchOp gpuLaunch = dyn_cast<LaunchOp>(target);
SimpleRewriter rewriter(getContext());
auto transformOp = cast<TransformOpInterface>(getOperation());
if (!getGenerateGpuLaunch() && !gpuLaunch) {
results.assign({target});
DiagnosedSilenceableFailure diag =
emitSilenceableError()
<< "Given target is not gpu.launch, set `generate_gpu_launch` "
"attribute";
diag.attachNote(target->getLoc()) << "when applied to this payload op";
return diag;
}
scf::ForeachThreadOp topLevelForeachThreadOp;
DiagnosedSilenceableFailure diag =
mlir::transform::gpu::findTopLevelForeachThreadOp(
target, topLevelForeachThreadOp, transformOp);
if (!diag.succeeded()) {
results.assign({target});
diag.attachNote(target->getLoc()) << "when applied to this payload op";
return diag;
}
OpBuilder::InsertionGuard guard(rewriter);
rewriter.setInsertionPoint(topLevelForeachThreadOp);
// Generate gpu launch here and move the foreach_thread inside
if (getGenerateGpuLaunch()) {
DiagnosedSilenceableFailure diag =
createGpuLaunch(rewriter, target->getLoc(), transformOp, gpuLaunch);
if (!diag.succeeded()) {
results.assign({target});
return diag;
}
rewriter.setInsertionPointToStart(&gpuLaunch.getBody().front());
Operation *newForeachThreadOp = rewriter.clone(*topLevelForeachThreadOp);
rewriter.eraseOp(topLevelForeachThreadOp);
topLevelForeachThreadOp = cast<scf::ForeachThreadOp>(newForeachThreadOp);
}
SmallVector<int64_t> gridDim = extractFromI64ArrayAttr(getGridDim());
diag = mlir::transform::gpu::mapForeachToBlocksImp(
rewriter, topLevelForeachThreadOp, generateGpuBlockIds, gridDim,
transformOp);
if (diag.succeeded()) {
diag = alterGpuLaunch(rewriter, gpuLaunch,
cast<TransformOpInterface>(getOperation()),
gridDim[0], gridDim[1], gridDim[2]);
}
results.assign({gpuLaunch});
return diag;
}
//===----------------------------------------------------------------------===//
// MapNestedForeachToThreads
//===----------------------------------------------------------------------===//
/// Searches `scf.foreach_thread` ops nested under `target` and maps each such
/// op to GPU threads. Mapping is one-to-one and the induction variables of
/// `scf.foreach_thread` are rewritten to gpu.thread_id according to the
/// thread_dim_mapping attribute. Sibling `scf.foreach_thread` are supported in
/// which case, the union of the number of threads is computed and may result
/// in predication. Dynamic `scf.foreach_thread` trip counts are currently
/// not supported. Dynamic block dim sizes are currently not supported.
static DiagnosedSilenceableFailure rewriteOneForeachThreadToGpuThreads(
RewriterBase &rewriter, scf::ForeachThreadOp foreachThreadOp,
const SmallVectorImpl<int64_t> &globalBlockDims, bool syncAfterDistribute,
llvm::Optional<TransformOpInterface> transformOp) {
auto failureHelper =
[&](const Twine &message) -> DiagnosedSilenceableFailure {
if (transformOp.has_value()) {
return transformOp->emitSilenceableError() << message;
}
foreachThreadOp->emitError() << message;
return DiagnosedSilenceableFailure::definiteFailure();
};
if (foreachThreadOp.getNumResults() > 0)
return failureHelper(
"only bufferized scf.foreach_thread lowers to gpu.thread_id");
if (foreachThreadOp.getNumThreads().size() > 3)
return failureHelper(
"scf.foreach_thread with rank > 3 does not lower to gpu.thread_id");
auto potentialBlockDim = foreachThreadOp.getPermutedNumThreads(rewriter);
if (failed(potentialBlockDim) ||
llvm::any_of(*potentialBlockDim, [](OpFoldResult ofr) {
return !getConstantIntValue(ofr).has_value();
})) {
return failureHelper("unsupported dynamic blockdim size");
}
SmallVector<int64_t> blockDim =
llvm::to_vector(llvm::map_range(*potentialBlockDim, [](OpFoldResult ofr) {
return getConstantIntValue(ofr).value();
}));
// Step 1. Create the gpu.thread ops
Location loc = foreachThreadOp.getLoc();
IndexType indexType = rewriter.getIndexType();
SmallVector<Dimension> gpuDims{Dimension::x, Dimension::y, Dimension::z};
SmallVector<Value> threadOps;
for (int64_t idx : llvm::seq<int64_t>(0, blockDim.size())) {
threadOps.push_back(
rewriter.create<ThreadIdOp>(loc, indexType, gpuDims[idx]));
}
// Step 2. Maybe create conditionals to predicate the region.
Value predicate;
for (auto [threadId, blockDim, globalBlockDim] :
llvm::zip(threadOps, blockDim, globalBlockDims)) {
if (blockDim > globalBlockDim) {
return failureHelper(
"The GPU threads are fewer than the loop trip counts. "
"Try to tile scf.foreach_thread before mapping.");
}
if (blockDim == globalBlockDim)
continue;
Value blockIdx = rewriter.create<arith::ConstantIndexOp>(loc, blockDim);
Value tmpPredicate = rewriter.create<arith::CmpIOp>(
loc, arith::CmpIPredicate::ult, threadId, blockIdx);
predicate =
predicate ? rewriter.create<arith::AndIOp>(loc, predicate, tmpPredicate)
: tmpPredicate;
}
// Step 3. Move the body of foreachThreadOp.
// Erase the terminator first, it will not be used.
rewriter.eraseOp(foreachThreadOp.getTerminator());
Block *targetBlock;
Block::iterator insertionPoint;
if (predicate) {
// Step 3.a. If predicated, move at the beginning.
auto ifOp =
rewriter.create<scf::IfOp>(loc, predicate, /*withElseRegion=*/false);
targetBlock = ifOp.thenBlock();
insertionPoint = ifOp.thenBlock()->begin();
} else {
// Step 3.a. Otherwise, move inline just before foreachThreadOp.
targetBlock = foreachThreadOp->getBlock();
insertionPoint = Block::iterator(foreachThreadOp);
}
Block &sourceBlock = foreachThreadOp.getRegion().front();
targetBlock->getOperations().splice(insertionPoint,
sourceBlock.getOperations());
// Step 4. RAUW thread indices to thread ops.
SmallVector<Value> threadIndices =
*foreachThreadOp.getPermutedThreadIndices();
for (auto [threadIdx, threadOp] : llvm::zip(threadIndices, threadOps)) {
Value val = threadIdx;
Value op = threadOp;
if (!val)
continue;
for (Operation *user : llvm::make_early_inc_range(val.getUsers())) {
user->replaceUsesOfWith(val, op);
}
}
// Step 5. syncthreads.
// TODO: Need warpsync
if (syncAfterDistribute)
rewriter.create<BarrierOp>(loc);
// Step 6. Erase old op.
rewriter.eraseOp(foreachThreadOp);
return DiagnosedSilenceableFailure::success();
}
DiagnosedSilenceableFailure mlir::transform::gpu::mapNestedForeachToThreadsImp(
RewriterBase &rewriter, Operation *target,
const SmallVectorImpl<int64_t> &blockDim, bool syncAfterDistribute,
llvm::Optional<TransformOpInterface> transformOp) {
DiagnosedSilenceableFailure diag = DiagnosedSilenceableFailure::success();
target->walk([&](scf::ForeachThreadOp foreachThreadOp) {
rewriter.setInsertionPoint(foreachThreadOp);
diag = rewriteOneForeachThreadToGpuThreads(
rewriter, foreachThreadOp, blockDim, syncAfterDistribute, transformOp);
return diag.succeeded() ? WalkResult::advance() : WalkResult::interrupt();
});
return diag;
}
DiagnosedSilenceableFailure transform::MapNestedForeachToThreads::applyToOne(
::mlir::Operation *target,
::llvm::SmallVectorImpl<::mlir::Operation *> &results,
::mlir::transform::TransformState &state) {
LaunchOp gpuLaunch = dyn_cast<LaunchOp>(target);
auto transformOp = cast<TransformOpInterface>(getOperation());
if (!gpuLaunch) {
results.assign({target});
return emitSilenceableError() << "Given target is not gpu.launch";
}
SmallVector<int64_t> blockDim = extractFromI64ArrayAttr(getBlockDim());
blockDim.resize(/*size=*/3, /*value=*/1);
DiagnosedSilenceableFailure diag =
checkGpuLimits(transformOp, llvm::None, llvm::None, llvm::None,
blockDim[0], blockDim[1], blockDim[2]);
if (diag.isSilenceableFailure()) {
results.assign({target});
diag.attachNote(getLoc()) << getBlockDimAttrName() << " is very large";
return diag;
}
SimpleRewriter rewriter(getContext());
rewriter.setInsertionPoint(target);
diag = mlir::transform::gpu::mapNestedForeachToThreadsImp(
rewriter, target, blockDim, getSyncAfterDistribute(), llvm::None);
if (diag.succeeded()) {
diag =
alterGpuLaunch(rewriter, gpuLaunch, transformOp, llvm::None, llvm::None,
llvm::None, blockDim[0], blockDim[1], blockDim[2]);
}
results.assign({gpuLaunch});
return diag;
}
//===----------------------------------------------------------------------===//
// Transform op registration
//===----------------------------------------------------------------------===//
namespace {
/// Registers new ops and declares PDL as dependent dialect since the
/// additional ops are using PDL types for operands and results.
class GPUTransformDialectExtension
: public transform::TransformDialectExtension<
GPUTransformDialectExtension> {
public:
GPUTransformDialectExtension() {
declareDependentDialect<pdl::PDLDialect>();
declareGeneratedDialect<scf::SCFDialect>();
declareGeneratedDialect<arith::ArithDialect>();
declareGeneratedDialect<GPUDialect>();
registerTransformOps<
#define GET_OP_LIST
#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.cpp.inc"
>();
}
};
} // namespace
#define GET_OP_CLASSES
#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.cpp.inc"
void mlir::gpu::registerTransformDialectExtension(DialectRegistry &registry) {
registry.addExtensions<GPUTransformDialectExtension>();
}


@@ -1167,392 +1167,6 @@ void transform::TileOp::getEffects(
modifiesPayload(effects);
}
//===----------------------------------------------------------------------===//
// MapNestedForeachThreadToGpuThreads
//===----------------------------------------------------------------------===//
/// Searches `scf.foreach_thread` ops nested under `target` and maps each such
/// op to GPU threads. Mapping is one-to-one and the induction variables of
/// `scf.foreach_thread` are rewritten to gpu.thread_id according to the
/// thread_dim_apping attribute. Sibling `scf.foreach_thread` are supported in
/// which case, the union of the number of threads is computed and may result in
/// predication. Dynamic, `scf.foreach_thread` trip counts are currently not
/// supported. Dynamic block dim sizes are currently not supported.
static FailureOr<SmallVector<OpFoldResult>> rewriteOneForeachThreadToGpuThreads(
RewriterBase &rewriter, scf::ForeachThreadOp foreachThreadOp,
const SmallVector<int64_t> &globalBlockDims, bool syncAfterDistribute) {
if (foreachThreadOp.getNumResults() > 0)
return foreachThreadOp->emitError(
"only bufferized scf.foreach_thread lowers to gpu.thread");
if (foreachThreadOp.getNumThreads().size() > 3)
return foreachThreadOp->emitError(
"scf.foreach_thread with rank > 3 does not lower to gpu.thread");
auto potentialBlockDim = foreachThreadOp.getPermutedNumThreads(rewriter);
if (failed(potentialBlockDim) ||
llvm::any_of(*potentialBlockDim, [](OpFoldResult ofr) {
return !getConstantIntValue(ofr).has_value();
}))
return foreachThreadOp->emitError("unsupported dynamic blockdim size");
SmallVector<int64_t> blockDim =
llvm::to_vector(llvm::map_range(*potentialBlockDim, [](OpFoldResult ofr) {
return getConstantIntValue(ofr).value();
}));
// Step 1. Create the gpu.thread ops
Location loc = foreachThreadOp.getLoc();
IndexType indexType = rewriter.getIndexType();
SmallVector<gpu::Dimension> gpuDims{gpu::Dimension::x, gpu::Dimension::y,
gpu::Dimension::z};
SmallVector<Value> threadOps;
for (int64_t idx : llvm::seq<int64_t>(0, blockDim.size())) {
threadOps.push_back(
rewriter.create<gpu::ThreadIdOp>(loc, indexType, gpuDims[idx]));
}
// Step 2. Maybe create conditionals to predicate the region.
Value predicate;
for (auto [threadId, blockDim, globalBlockDim] :
llvm::zip(threadOps, blockDim, globalBlockDims)) {
if (blockDim > globalBlockDim) {
return foreachThreadOp.emitOpError("blockDim size overflow: ")
<< blockDim << " > " << globalBlockDim;
}
if (blockDim == globalBlockDim)
continue;
Value tmpPredicate = rewriter.create<arith::CmpIOp>(
loc, arith::CmpIPredicate::ult, threadId,
rewriter.create<arith::ConstantIndexOp>(loc, blockDim));
predicate =
predicate ? rewriter.create<arith::AndIOp>(loc, predicate, tmpPredicate)
: tmpPredicate;
}
// Step 3. Move the body of foreachThreadOp.
// Erase the terminator first, it will not be used.
rewriter.eraseOp(foreachThreadOp.getTerminator());
Block *targetBlock;
Block::iterator insertionPoint;
if (predicate) {
// Step 3.a. If predicated, move at the beginning.
auto ifOp =
rewriter.create<scf::IfOp>(loc, predicate, /*withElseRegion=*/false);
targetBlock = ifOp.thenBlock();
insertionPoint = ifOp.thenBlock()->begin();
} else {
// Step 3.a. Otherwise, move inline just before foreachThreadOp.
targetBlock = foreachThreadOp->getBlock();
insertionPoint = Block::iterator(foreachThreadOp);
}
Block &sourceBlock = foreachThreadOp.getRegion().front();
targetBlock->getOperations().splice(insertionPoint,
sourceBlock.getOperations());
// Step 4. RAUW thread indices to thread ops.
SmallVector<Value> threadIndices =
*foreachThreadOp.getPermutedThreadIndices();
for (auto it : llvm::zip(threadIndices, threadOps)) {
Value val = std::get<0>(it);
if (!val)
continue;
for (Operation *user : llvm::make_early_inc_range(val.getUsers())) {
rewriter.updateRootInPlace(
user, [&]() { user->replaceUsesOfWith(val, std::get<1>(it)); });
}
}
// Step 5. syncthreads.
// TODO: Need warpsync
if (syncAfterDistribute)
rewriter.create<gpu::BarrierOp>(loc);
// Step 6. Erase old op.
rewriter.eraseOp(foreachThreadOp);
return *potentialBlockDim;
}
mlir::WalkResult mlir::linalg::rewriteMapNestedForeachThreadToGpuThreads(
RewriterBase &rewriter, Operation *target,
const SmallVector<int64_t> &blockDim, bool syncAfterDistribute) {
auto walkResult = target->walk([&](scf::ForeachThreadOp foreachThreadOp) {
rewriter.setInsertionPoint(foreachThreadOp);
if (failed(rewriteOneForeachThreadToGpuThreads(
rewriter, foreachThreadOp, blockDim, syncAfterDistribute)))
return WalkResult::interrupt();
return WalkResult::advance();
});
return walkResult;
}
static LogicalResult
checkGpuLimits(Optional<int64_t> gridDimX, Optional<int64_t> gridDimY,
Optional<int64_t> gridDimZ, Optional<int64_t> blockDimX,
Optional<int64_t> blockDimY, Optional<int64_t> blockDimZ) {
// TODO The limits should live in the gpu dialect, but it's not like that
// right now. Read them in the common gpu dialect
if ((blockDimX.value_or(1) * blockDimY.value_or(1) * blockDimZ.value_or(1)) >
1024 ||
gridDimY.value_or(1) > 65535 || gridDimZ.value_or(1) > 65535 ||
gridDimX.value_or(1) > 2147483647)
return failure();
return success();
}
/// Alter grid or block dimensions of the given kernel
static LogicalResult alterGpuLaunch(SimpleRewriter &rewriter,
gpu::LaunchOp gpuLaunch,
Optional<int64_t> gridDimX = llvm::None,
Optional<int64_t> gridDimY = llvm::None,
Optional<int64_t> gridDimZ = llvm::None,
Optional<int64_t> blockDimX = llvm::None,
Optional<int64_t> blockDimY = llvm::None,
Optional<int64_t> blockDimZ = llvm::None) {
if (failed(checkGpuLimits(gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY,
blockDimZ))) {
gpuLaunch->emitError(
"Requested kernel thread configuration is larger than the limits");
return failure();
}
gpu::KernelDim3 currentBlockdim = gpuLaunch.getBlockSizeOperandValues();
OpBuilder::InsertionGuard guard(rewriter);
rewriter.setInsertionPointAfterValue(currentBlockdim.x);
auto createConstValue = [&](int dim) {
return rewriter.create<arith::ConstantIndexOp>(currentBlockdim.x.getLoc(),
dim);
};
if (gridDimX.has_value())
gpuLaunch.getGridSizeXMutable().assign(createConstValue(gridDimX.value()));
if (gridDimY.has_value())
gpuLaunch.getGridSizeYMutable().assign(createConstValue(gridDimY.value()));
if (gridDimZ.has_value())
gpuLaunch.getGridSizeZMutable().assign(createConstValue(gridDimZ.value()));
if (blockDimX.has_value())
gpuLaunch.getBlockSizeXMutable().assign(
createConstValue(blockDimX.value()));
if (blockDimY.has_value())
gpuLaunch.getBlockSizeYMutable().assign(
createConstValue(blockDimY.value()));
if (blockDimZ.has_value())
gpuLaunch.getBlockSizeZMutable().assign(
createConstValue(blockDimZ.value()));
return success();
}
DiagnosedSilenceableFailure
transform::MapNestedForeachThreadToGpuThreads::applyToOne(
Operation *target, SmallVectorImpl<Operation *> &results,
transform::TransformState &state) {
gpu::LaunchOp gpuLaunch = dyn_cast<gpu::LaunchOp>(target);
if (!gpuLaunch) {
target->emitError("Given target is not gpu.launch");
return DiagnosedSilenceableFailure::definiteFailure();
}
SmallVector<int64_t> blockDim = extractFromI64ArrayAttr(getBlockDim());
blockDim.resize(/*size=*/3, /*value=*/1);
SimpleRewriter rewriter(getContext());
rewriter.setInsertionPoint(target);
auto walkResult = mlir::linalg::rewriteMapNestedForeachThreadToGpuThreads(
rewriter, target, blockDim, getSyncAfterDistribute());
if (walkResult.wasInterrupted())
return DiagnosedSilenceableFailure(reportUnknownTransformError(target));
LogicalResult result =
alterGpuLaunch(rewriter, gpuLaunch, llvm::None, llvm::None, llvm::None,
blockDim[0], blockDim[1], blockDim[2]);
if (failed(result))
return DiagnosedSilenceableFailure::definiteFailure();
results.assign({target});
return DiagnosedSilenceableFailure(success());
}
//===----------------------------------------------------------------------===//
// MapNestedForeachThreadToGpuBlocks
//===----------------------------------------------------------------------===//
LogicalResult mlir::linalg::rewriteTopLevelForeachThreadToGpuBlocks(
RewriterBase &rewriter, scf::ForeachThreadOp foreachThreadOp,
function_ref<void(RewriterBase &, scf::ForeachThreadOp,
SmallVector<Value> &)>
blockIdGenerator,
SmallVector<int64_t> &gridDims) {
if (foreachThreadOp.getNumResults() > 0)
return foreachThreadOp->emitError(
"only bufferized scf.foreach_thread lowers to gpu.block_id");
if (foreachThreadOp.getNumThreads().size() > 3)
return foreachThreadOp->emitError(
"scf.foreach_thread with rank > 3 does not lower to gpu.block_id");
// Step 0. Outline the compute workload region and set up the workload
// operands.
auto potentialGridDim = foreachThreadOp.getPermutedNumThreads(rewriter);
if (failed(potentialGridDim) ||
llvm::any_of(*potentialGridDim, [](OpFoldResult ofr) {
return !getConstantIntValue(ofr).has_value();
}))
return foreachThreadOp->emitError("unsupported dynamic gridDim");
for (OpFoldResult ofr : *potentialGridDim)
gridDims.push_back(getConstantIntValue(ofr).value());
SmallVector<Value> blockOps;
blockIdGenerator(rewriter, foreachThreadOp, blockOps);
// Step 1. Move the body of foreachThreadOp.
// Erase the terminator first, it will not be used since we are on buffers.
rewriter.eraseOp(foreachThreadOp.getTerminator());
Block *targetBlock = foreachThreadOp->getBlock();
Block::iterator insertionPoint = Block::iterator(foreachThreadOp);
Block &sourceBlock = foreachThreadOp.getRegion().front();
targetBlock->getOperations().splice(insertionPoint,
sourceBlock.getOperations());
// Step 2. RAUW thread indices to thread ops.
SmallVector<Value> threadIndices =
*foreachThreadOp.getPermutedThreadIndices();
assert(blockOps.size() == 3 && "3 block id ops are required");
for (auto it : llvm::zip(threadIndices, blockOps)) {
Value val = std::get<0>(it);
if (!val)
continue;
for (Operation *user : llvm::make_early_inc_range(val.getUsers())) {
rewriter.updateRootInPlace(
user, [&]() { user->replaceUsesOfWith(val, std::get<1>(it)); });
}
}
// Step 3. Erase old op.
rewriter.eraseOp(foreachThreadOp);
return success();
}
FailureOr<scf::ForeachThreadOp>
mlir::linalg::findTopLevelForeachThreadOp(Operation *target) {
scf::ForeachThreadOp topLevelForeachThreadOp;
auto walkResult = target->walk([&](scf::ForeachThreadOp foreachThreadOp) {
if (foreachThreadOp->getParentOfType<scf::ForeachThreadOp>())
return WalkResult::advance();
if (topLevelForeachThreadOp)
// TODO Handle multiple foreach if there is no dependences between them
return WalkResult::interrupt();
topLevelForeachThreadOp = foreachThreadOp;
return WalkResult::advance();
});
if (walkResult.wasInterrupted())
return target->emitError(
"could not find a unique topLevel scf.foreach_thread");
return topLevelForeachThreadOp;
}
/// Create gpuLauncOp with given kernel configurations
static FailureOr<gpu::LaunchOp>
createGpuLaunch(RewriterBase &rewriter, Location loc,
Optional<int64_t> gridDimX = llvm::None,
Optional<int64_t> gridDimY = llvm::None,
Optional<int64_t> gridDimZ = llvm::None,
Optional<int64_t> blockDimX = llvm::None,
Optional<int64_t> blockDimY = llvm::None,
Optional<int64_t> blockDimZ = llvm::None) {
if (failed(checkGpuLimits(gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY,
blockDimZ)))
return failure();
auto createConstant = [&](int dim) {
return rewriter.create<arith::ConstantIndexOp>(loc, dim);
};
Value one = createConstant(1);
Value gridSizeX =
gridDimX.has_value() ? createConstant(gridDimX.value()) : one;
Value gridSizeY =
gridDimY.has_value() ? createConstant(gridDimY.value()) : one;
Value gridSizeZ =
gridDimZ.has_value() ? createConstant(gridDimZ.value()) : one;
Value blockSizeX =
blockDimX.has_value() ? createConstant(blockDimX.value()) : one;
Value blockSizeY =
blockDimY.has_value() ? createConstant(blockDimY.value()) : one;
Value blockSizeZ =
blockDimZ.has_value() ? createConstant(blockDimZ.value()) : one;
auto launchOp = rewriter.create<gpu::LaunchOp>(
loc, gridSizeX, gridSizeY, gridSizeZ, blockSizeX, blockSizeY, blockSizeZ);
rewriter.setInsertionPointToEnd(&launchOp.getBody().front());
rewriter.create<gpu::TerminatorOp>(loc);
return launchOp;
}
/// This is an helper that is only used in
/// rewriteTopLevelForeachThreadToGpuBlocks. It generates GPU dialects block_id
static void generateGpuBlockIds(RewriterBase &rewriter,
scf::ForeachThreadOp foreachOp,
SmallVector<Value> &blockOps) {
Location loc = foreachOp->getLoc();
OpBuilder::InsertionGuard guard(rewriter);
rewriter.setInsertionPoint(foreachOp);
IndexType indexType = rewriter.getIndexType();
SmallVector<gpu::Dimension> gpuDims{gpu::Dimension::x, gpu::Dimension::y,
gpu::Dimension::z};
for (int64_t idx : llvm::seq<int64_t>(0, gpuDims.size())) {
blockOps.push_back(
rewriter.create<gpu::BlockIdOp>(loc, indexType, gpuDims[idx]));
}
}
DiagnosedSilenceableFailure
transform::MapNestedForeachThreadToGpuBlocks::applyToOne(
Operation *target, SmallVectorImpl<Operation *> &results,
transform::TransformState &state) {
gpu::LaunchOp gpuLaunch = dyn_cast<gpu::LaunchOp>(target);
SimpleRewriter rewriter(getContext());
if (!getGenerateGpuLaunch() && !gpuLaunch) {
target->emitError("Given target is not gpu.launch, set "
"`generate_gpu_launch` attribute");
return DiagnosedSilenceableFailure::definiteFailure();
}
auto res = mlir::linalg::findTopLevelForeachThreadOp(target);
if (failed(res))
return DiagnosedSilenceableFailure(reportUnknownTransformError(target));
scf::ForeachThreadOp topLevelForeachThreadOp = *res;
OpBuilder::InsertionGuard guard(rewriter);
rewriter.setInsertionPoint(topLevelForeachThreadOp);
// Generate gpu launch here and move the foreach_thread inside
if (getGenerateGpuLaunch()) {
FailureOr<gpu::LaunchOp> maybeGpuLaunch =
createGpuLaunch(rewriter, target->getLoc());
if (failed(maybeGpuLaunch))
return DiagnosedSilenceableFailure(reportUnknownTransformError(target));
gpuLaunch = *maybeGpuLaunch;
rewriter.setInsertionPointToStart(&gpuLaunch.getBody().front());
Operation *newForeachThreadOp = rewriter.clone(*topLevelForeachThreadOp);
rewriter.eraseOp(topLevelForeachThreadOp);
topLevelForeachThreadOp =
dyn_cast<scf::ForeachThreadOp>(newForeachThreadOp);
}
SmallVector<int64_t> gridDim = extractFromI64ArrayAttr(getGridDim());
if (failed(mlir::linalg::rewriteTopLevelForeachThreadToGpuBlocks(
rewriter, topLevelForeachThreadOp, generateGpuBlockIds, gridDim)))
return DiagnosedSilenceableFailure(reportUnknownTransformError(target));
if (failed(alterGpuLaunch(rewriter, gpuLaunch, gridDim[0], gridDim[1],
gridDim[2])))
return DiagnosedSilenceableFailure::definiteFailure();
results.assign({gpuLaunch});
return DiagnosedSilenceableFailure(success());
}
//===----------------------------------------------------------------------===//
// TileToForeachThreadOp
//===----------------------------------------------------------------------===//


@@ -35,7 +35,7 @@ transform.with_pdl_patterns {
transform.sequence %arg0 failures(propagate) {
^bb1(%arg1: !pdl.operation):
%funcop = transform.structured.match ops{["gpu.launch"]} in %arg0
transform.structured.map_nested_foreach_thread_to_gpu_blocks %funcop { blockDim = [12, 9, 1]}
transform.gpu.map_foreach_to_blocks %funcop { blockDim = [12, 9, 1]}
}
}
@@ -92,7 +92,7 @@ transform.with_pdl_patterns {
transform.sequence %arg0 failures(propagate) {
^bb1(%arg1: !pdl.operation):
%funcop = transform.structured.match ops{["gpu.launch"]} in %arg0
transform.structured.map_nested_foreach_thread_to_gpu_threads %funcop { blockDim = [12, 9, 1] }
transform.gpu.map_nested_foreach_to_threads %funcop { blockDim = [12, 9, 1] }
}
}
@@ -134,8 +134,8 @@ transform.with_pdl_patterns {
transform.sequence %arg0 failures(propagate) {
^bb1(%arg1: !pdl.operation):
%funcop = transform.structured.match ops{["func.func"]} in %arg0
%gpuLaunch = transform.structured.map_nested_foreach_thread_to_gpu_blocks %funcop { generate_gpu_launch }
transform.structured.map_nested_foreach_thread_to_gpu_threads %gpuLaunch { blockDim = [32, 4, 1] }
%gpuLaunch = transform.gpu.map_foreach_to_blocks %funcop { generate_gpu_launch }
transform.gpu.map_nested_foreach_to_threads %gpuLaunch { blockDim = [32, 4, 1] }
}
}
@@ -171,6 +171,6 @@ transform.with_pdl_patterns {
transform.sequence %arg0 failures(propagate) {
^bb1(%arg1: !pdl.operation):
%funcop = transform.structured.match ops{["gpu.launch"]} in %arg0
transform.structured.map_nested_foreach_thread_to_gpu_threads %funcop { blockDim = [12, 9, 1], syncAfterDistribute = false }
transform.gpu.map_nested_foreach_to_threads %funcop { blockDim = [12, 9, 1], syncAfterDistribute = false }
}
}


@@ -3761,6 +3761,65 @@ cc_library(
]),
)
td_library(
name = "GPUTransformOpsTdFiles",
srcs = [
"include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td",
],
includes = ["include"],
deps = [
":PDLDialectTdFiles",
":TransformDialectTdFiles",
],
)
gentbl_cc_library(
name = "GPUTransformOpsIncGen",
strip_include_prefix = "include",
tbl_outs = [
(
["-gen-op-decls"],
"include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h.inc",
),
(
["-gen-op-defs"],
"include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.cpp.inc",
),
],
tblgen = ":mlir-tblgen",
td_file = "include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td",
deps = [
":GPUTransformOpsTdFiles",
],
)
cc_library(
name = "GPUTransformOps",
srcs = [
"lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp",
],
hdrs = [
"include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h",
],
includes = ["include"],
deps = [
":ArithDialect",
":AsmParser",
":ControlFlowDialect",
":GPUDialect",
":GPUTransformOpsIncGen",
":GPUTransforms",
":IR",
":PDLDialect",
":Parser",
":SCFDialect",
":SideEffectInterfaces",
":TransformDialect",
":TransformUtils",
"//llvm:Support",
],
)
td_library(
name = "LLVMOpsTdFiles",
srcs = [
@@ -6401,6 +6460,7 @@ cc_library(
":GPUToROCDLTransforms",
":GPUToSPIRV",
":GPUToVulkanTransforms",
":GPUTransformOps",
":GPUTransforms",
":IR",
":LLVMDialect",