Add async dependencies support for gpu.launch op

Add async dependencies support for gpu.launch op: this allows specifying
a list of async tokens ("streams") as dependencies for the launch.

Update the GPU kernel outlining pass lowering to propagate async
dependencies from gpu.launch to gpu.launch_func op. Previously, a new
stream was being created and destroyed for a kernel launch. The async
deps support allows the kernel launch to be serialized on an existing
stream.

Differential Revision:
This commit is contained in:
Uday Bondhugula 2022-04-20 22:43:35 +05:30
parent 48e894a536
commit f47a38f517
6 changed files with 186 additions and 66 deletions

View File

@ -420,7 +420,9 @@ def GPU_LaunchFuncOp : GPU_Op<"launch_func",
let builders = [
OpBuilder<(ins "GPUFuncOp":$kernelFunc, "KernelDim3":$gridSize,
"KernelDim3":$blockSize, "Value":$dynamicSharedMemorySize,
CArg<"Type", "nullptr">:$asyncTokenType,
CArg<"ValueRange", "{}">:$asyncDependencies)>
let extraClassDeclaration = [{
@ -466,25 +468,32 @@ def GPU_LaunchFuncOp : GPU_Op<"launch_func",
let hasVerifier = 1;
def GPU_LaunchOp : GPU_Op<"launch", [AutomaticAllocationScope]>,
Arguments<(ins Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ,
def GPU_LaunchOp : GPU_Op<"launch",
[AutomaticAllocationScope, AttrSizedOperandSegments, GPU_AsyncOpInterface]>,
Arguments<(ins Variadic<GPU_AsyncToken>:$asyncDependencies,
Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ,
Index:$blockSizeX, Index:$blockSizeY, Index:$blockSizeZ,
Results<(outs)> {
Results<(outs Optional<GPU_AsyncToken>:$asyncToken)> {
let summary = "GPU kernel launch operation";
let description = [{
Launch a kernel on the specified grid of thread blocks. The body of the
kernel is defined by the single region that this operation contains. The
operation takes six operands followed by an optional operand: the first
three operands are grid sizes along the x,y,z dimensions and the following
three are block sizes along the x,y,z dimensions. The last operand is
optional and corresponds to the amount of dynamic shared memory a kernel's
workgroup should be allocated; when this operand is not present, a zero size
is assumed.
operation takes an optional list of async dependencies followed by six
operands and an optional operand.
When a lower-dimensional kernel is required, unused sizes must
be explicitly set to `1`.
The `async` keyword indicates the kernel should be launched asynchronously;
the operation returns a new !gpu.async.token when the keyword is specified.
The kernel launched does not start executing until the ops producing its
async dependencies (optional operands) have completed.
The first three operands (following any async dependencies) are grid sizes
along the x,y,z dimensions and the following three are block sizes along the
x,y,z dimensions. When a lower-dimensional kernel is required, unused sizes
must be explicitly set to `1`. The last operand is optional and corresponds
to the amount of dynamic shared memory a kernel's workgroup should be
allocated; when this operand is not present, a zero size is assumed.
The body region has _twelve_ arguments, grouped as follows:
@ -496,7 +505,8 @@ def GPU_LaunchOp : GPU_Op<"launch", [AutomaticAllocationScope]>,
operation ::= `gpu.launch` `block` `(` ssa-id-list `)` `in` ssa-reassignment
operation ::= `gpu.launch` (`async` (`[` ssa-id-list `]`)? )?
`block` `(` ssa-id-list `)` `in` ssa-reassignment
`threads` `(` ssa-id-list `)` `in` ssa-reassignment
(dynamic_shared_memory_size ssa-use)?
region attr-dict?
@ -548,7 +558,9 @@ def GPU_LaunchOp : GPU_Op<"launch", [AutomaticAllocationScope]>,
OpBuilder<(ins "Value":$gridSizeX, "Value":$gridSizeY,
"Value":$gridSizeZ, "Value":$blockSizeX, "Value":$blockSizeY,
CArg<"Value", "nullptr">:$dynamic_shared_memory_size)>
CArg<"Value", "nullptr">:$dynamicSharedMemorySize,
CArg<"Type", "nullptr">:$asyncTokenType,
CArg<"ValueRange", "{}">:$asyncDependencies)>
let extraClassDeclaration = [{

View File

@ -275,6 +275,44 @@ LogicalResult GPUDialect::verifyOperationAttribute(Operation *op,
return walkResult.wasInterrupted() ? failure() : success();
/// Parses an optional list of async operands with an optional leading keyword.
/// (`async`)? (`[` ssa-id-list `]`)?
/// This method is used by the tablegen assembly format for async ops as well.
static ParseResult parseAsyncDependencies(
OpAsmParser &parser, Type &asyncTokenType,
SmallVectorImpl<OpAsmParser::UnresolvedOperand> &asyncDependencies) {
auto loc = parser.getCurrentLocation();
if (succeeded(parser.parseOptionalKeyword("async"))) {
if (parser.getNumResults() == 0)
return parser.emitError(loc, "needs to be named when marked 'async'");
asyncTokenType = parser.getBuilder().getType<AsyncTokenType>();
return parser.parseOperandList(asyncDependencies,
/// Prints optional async dependencies with its leading keyword.
/// (`async`)? (`[` ssa-id-list `]`)?
// Used by the tablegen assembly format for several async ops.
static void printAsyncDependencies(OpAsmPrinter &printer, Operation *op,
Type asyncTokenType,
OperandRange asyncDependencies) {
if (asyncTokenType)
printer << "async";
if (asyncDependencies.empty())
if (asyncTokenType)
printer << ' ';
printer << '[';
llvm::interleaveComma(asyncDependencies, printer);
printer << ']';
// AllReduceOp
LogicalResult gpu::AllReduceOp::verifyRegions() {
if (body().empty() != op().hasValue())
return emitError("expected either an op attribute or a non-empty body");
@ -358,7 +396,12 @@ void gpu::addAsyncDependency(Operation *op, Value token) {
void LaunchOp::build(OpBuilder &builder, OperationState &result,
Value gridSizeX, Value gridSizeY, Value gridSizeZ,
Value blockSizeX, Value blockSizeY, Value blockSizeZ,
Value dynamicSharedMemorySize) {
Value dynamicSharedMemorySize, Type asyncTokenType,
ValueRange asyncDependencies) {
if (asyncTokenType)
// Add grid and block sizes as op operands, followed by the data operands.
{gridSizeX, gridSizeY, gridSizeZ, blockSizeX, blockSizeY, blockSizeZ});
@ -373,6 +416,11 @@ void LaunchOp::build(OpBuilder &builder, OperationState &result,
for (unsigned i = 0; i < kNumConfigRegionAttributes; ++i)
body->addArgument(builder.getIndexType(), result.location);
SmallVector<int32_t, 8> segmentSizes(8, 1);
segmentSizes.front() = asyncDependencies.size();
segmentSizes.back() = dynamicSharedMemorySize ? 1 : 0;
KernelDim3 LaunchOp::getBlockIds() {
@ -400,11 +448,13 @@ KernelDim3 LaunchOp::getBlockSize() {
KernelDim3 LaunchOp::getGridSizeOperandValues() {
return KernelDim3{getOperand(0), getOperand(1), getOperand(2)};
auto operands = getOperands().drop_front(asyncDependencies().size());
return KernelDim3{operands[0], operands[1], operands[2]};
KernelDim3 LaunchOp::getBlockSizeOperandValues() {
return KernelDim3{getOperand(3), getOperand(4), getOperand(5)};
auto operands = getOperands().drop_front(asyncDependencies().size());
return KernelDim3{operands[3], operands[4], operands[5]};
LogicalResult LaunchOp::verifyRegions() {
@ -412,9 +462,9 @@ LogicalResult LaunchOp::verifyRegions() {
// sizes and transforms them into kNumConfigRegionAttributes region arguments
// for block/thread identifiers and grid/block sizes.
if (!body().empty()) {
if (body().getNumArguments() != LaunchOp::kNumConfigOperands +
getNumOperands() -
(dynamicSharedMemorySize() ? 1 : 0))
if (body().getNumArguments() !=
LaunchOp::kNumConfigOperands + getNumOperands() -
(dynamicSharedMemorySize() ? 1 : 0) - asyncDependencies().size())
return emitOpError("unexpected number of region arguments");
@ -435,6 +485,9 @@ LogicalResult LaunchOp::verifyRegions() {
if (getNumResults() == 0 && asyncToken())
return emitOpError("needs to be named when async keyword is specified");
return success();
@ -451,6 +504,11 @@ static void printSizeAssignment(OpAsmPrinter &p, KernelDim3 size,
void LaunchOp::print(OpAsmPrinter &p) {
if (asyncToken()) {
p << " async";
if (!asyncDependencies().empty())
p << " [" << asyncDependencies() << ']';
// Print the launch configuration.
p << ' ' << getBlocksKeyword();
printSizeAssignment(p, getGridSize(), getGridSizeOperandValues(),
@ -464,7 +522,8 @@ void LaunchOp::print(OpAsmPrinter &p) {
p << ' ';
p.printRegion(body(), /*printEntryBlockArgs=*/false);
p.printOptionalAttrDict((*this)->getAttrs(), /*elidedAttrs=*/{
// Parse the size assignment blocks for blocks and threads. These have the form
@ -498,11 +557,10 @@ parseSizeAssignment(OpAsmParser &parser,
/// Parses a Launch operation.
/// operation ::= `gpu.launch` `blocks` `(` ssa-id-list `)` `in`
/// ssa-reassignment
/// `threads` `(` ssa-id-list `)` `in`
/// ssa-reassignment
/// region attr-dict?
/// operation ::= `gpu.launch` (`async` (`[` ssa-id-list `]`)?)?
/// `blocks` `(` ssa-id-list `)` `in` ssa-reassignment
/// `threads` `(` ssa-id-list `)` `in` ssa-reassignment
/// region attr-dict?
/// ssa-reassignment ::= `(` ssa-id `=` ssa-use (`,` ssa-id `=` ssa-use)* `)`
ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) {
// Sizes of the grid and block.
@ -518,6 +576,17 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) {
MutableArrayRef<OpAsmParser::UnresolvedOperand> regionArgsRef(regionArgs);
// Parse optional async dependencies.
SmallVector<OpAsmParser::UnresolvedOperand, 4> asyncDependencies;
Type asyncTokenType;
if (failed(
parseAsyncDependencies(parser, asyncTokenType, asyncDependencies)) ||
parser.resolveOperands(asyncDependencies, asyncTokenType,
return failure();
if (parser.getNumResults() > 0)
// Parse the size assignment segments: the first segment assigns grid sizes
// and defines values for block identifiers; the second segment assigns block
// sizes and defines values for thread identifiers. In the region argument
@ -536,13 +605,16 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) {
return failure();
OpAsmParser::UnresolvedOperand dynamicSharedMemorySize;
bool hasDynamicSharedMemorySize = false;
if (!parser.parseOptionalKeyword(
LaunchOp::getDynamicSharedMemorySizeKeyword())) {
hasDynamicSharedMemorySize = true;
if (parser.parseOperand(dynamicSharedMemorySize) ||
return failure();
// Introduce the body region and parse it. The region has
// kNumConfigRegionAttributes arguments that correspond to
@ -551,8 +623,16 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) {
SmallVector<Type, LaunchOp::kNumConfigRegionAttributes> dataTypes(
LaunchOp::kNumConfigRegionAttributes, index);
Region *body = result.addRegion();
return failure(parser.parseRegion(*body, regionArgs, dataTypes) ||
if (parser.parseRegion(*body, regionArgs, dataTypes) ||
return failure();
SmallVector<int32_t, 8> segmentSizes(8, 1);
segmentSizes.front() = asyncDependencies.size();
segmentSizes.back() = hasDynamicSharedMemorySize ? 1 : 0;
return success();
/// Simplify the gpu.launch when the range of a thread or block ID is
@ -602,7 +682,12 @@ void LaunchOp::getCanonicalizationPatterns(RewritePatternSet &rewrites,
void LaunchFuncOp::build(OpBuilder &builder, OperationState &result,
GPUFuncOp kernelFunc, KernelDim3 gridSize,
KernelDim3 blockSize, Value dynamicSharedMemorySize,
ValueRange kernelOperands) {
ValueRange kernelOperands, Type asyncTokenType,
ValueRange asyncDependencies) {
if (asyncTokenType)
// Add grid and block sizes as op operands, followed by the data operands.
result.addOperands({gridSize.x, gridSize.y, gridSize.z, blockSize.x,
blockSize.y, blockSize.z});
@ -615,7 +700,7 @@ void LaunchFuncOp::build(OpBuilder &builder, OperationState &result,
result.addAttribute(getKernelAttrName(), kernelSymbol);
SmallVector<int32_t, 9> segmentSizes(9, 1);
segmentSizes.front() = 0; // Initially no async dependencies.
segmentSizes.front() = asyncDependencies.size();
segmentSizes[segmentSizes.size() - 2] = dynamicSharedMemorySize ? 1 : 0;
segmentSizes.back() = static_cast<int32_t>(kernelOperands.size());
@ -1039,36 +1124,6 @@ LogicalResult MemcpyOp::verify() {
return success();
static ParseResult parseAsyncDependencies(
OpAsmParser &parser, Type &asyncTokenType,
SmallVectorImpl<OpAsmParser::UnresolvedOperand> &asyncDependencies) {
auto loc = parser.getCurrentLocation();
if (succeeded(parser.parseOptionalKeyword("async"))) {
if (parser.getNumResults() == 0)
return parser.emitError(loc, "needs to be named when marked 'async'");
asyncTokenType = parser.getBuilder().getType<AsyncTokenType>();
return parser.parseOperandList(asyncDependencies,
/// Prints optional async dependencies with its leading keyword.
/// (`async`)? (`[` ssa-id-list `]`)?
// Used by the tablegen assembly format for several async ops.
static void printAsyncDependencies(OpAsmPrinter &printer, Operation *op,
Type asyncTokenType,
OperandRange asyncDependencies) {
if (asyncTokenType)
printer << "async";
if (asyncDependencies.empty())
if (asyncTokenType)
printer << ' ';
printer << '[';
llvm::interleaveComma(asyncDependencies, printer);
printer << ']';
namespace {
/// Erases a common case of copy ops where a destination value is used only by

View File

@ -225,10 +225,13 @@ static void convertToLaunchFuncOp(gpu::LaunchOp launchOp,
OpBuilder builder(launchOp);
// The launch op has an optional dynamic shared memory size. If it doesn't
// exist, we use zero.
Value asyncToken = launchOp.asyncToken();
auto launchFunc = builder.create<gpu::LaunchFuncOp>(
launchOp.getLoc(), kernelFunc, launchOp.getGridSizeOperandValues(),
launchOp.getBlockSizeOperandValues(), launchOp.dynamicSharedMemorySize(),
operands, asyncToken ? asyncToken.getType() : nullptr,

View File

@ -4,7 +4,7 @@ func.func @not_enough_sizes(%sz : index) {
// expected-error@+1 {{expected 6 or more operands, but found 5}}
"gpu.launch"(%sz, %sz, %sz, %sz, %sz) ({
}) : (index, index, index, index, index) -> ()
}) {operand_segment_sizes = dense<[0, 1, 1, 1, 1, 1, 1, 0]> : vector<8xi32>} : (index, index, index, index, index) -> ()
@ -12,11 +12,11 @@ func.func @not_enough_sizes(%sz : index) {
func.func @no_region_attrs(%sz : index) {
// expected-error@+1 {{unexpected number of region arguments}}
"gpu.launch"(%sz, %sz, %sz, %sz, %sz, %sz) ({
"gpu.launch"(%sz, %sz, %sz, %sz, %sz, %sz) ({
^bb1(%bx: index, %by: index, %bz: index,
%tx: index, %ty: index, %tz: index):
}) : (index, index, index, index, index, index) -> ()
}) {operand_segment_sizes = dense<[0, 1, 1, 1, 1, 1, 1, 0]> : vector<8xi32>} : (index, index, index, index, index, index) -> ()

View File

@ -1,4 +1,8 @@
// RUN: mlir-opt -allow-unregistered-dialect %s | FileCheck %s
// Verify the printed output can be parsed.
// RUN: mlir-opt -allow-unregistered-dialect %s | mlir-opt -allow-unregistered-dialect | FileCheck %s
// Verify the generic form can be parsed.
// RUN: mlir-opt -allow-unregistered-dialect -mlir-print-op-generic %s | mlir-opt -allow-unregistered-dialect | FileCheck %s
module attributes {gpu.container_module} {
@ -26,6 +30,32 @@ module attributes {gpu.container_module} {
// CHECK-LABEL:func @launch_async(%{{.*}}: index, %{{.*}}: index) {
func @launch_async(%blk : index, %thrd : index) {
// CHECK: gpu.launch async [%{{.+}}] blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}})
%t = gpu.wait async
%name = gpu.launch async [%t] blocks(%arg0, %arg1, %arg2) in (%grid_x = %blk, %grid_y = %blk, %grid_z = %blk)
threads(%arg3, %arg4, %arg5) in (%block_x = %thrd, %block_y = %thrd, %block_z = %thrd) {
// CHECK-LABEL:func @launch_async_no_deps(%{{.*}}: index, %{{.*}}: index) {
func @launch_async_no_deps(%blk : index, %thrd : index) {
// CHECK: %{{.*}} = gpu.launch async blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}})
%t0 = gpu.launch async blocks(%arg0, %arg1, %arg2) in (%grid_x = %blk, %grid_y = %blk, %grid_z = %blk)
threads(%arg3, %arg4, %arg5) in (%block_x = %thrd, %block_y = %thrd, %block_z = %thrd) {
// CHECK: gpu.launch async blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}})
%t1 = gpu.launch async [] blocks(%arg0, %arg1, %arg2) in (%grid_x = %blk, %grid_y = %blk, %grid_z = %blk)
threads(%arg3, %arg4, %arg5) in (%block_x = %thrd, %block_y = %thrd, %block_z = %thrd) {
gpu.module @kernels {
gpu.func @kernel_1(%arg0 : f32, %arg1 : memref<?xf32, 1>) kernel {
%tIdX = gpu.thread_id x

View File

@ -80,6 +80,26 @@ func.func @multiple_launches() {
%block_z2 = %cst) {
// With async and async deps.
// CHECK: %[[TOKEN:.*]] = gpu.wait async
// CHECK: gpu.launch_func async [%[[TOKEN]]] @multiple_launches_kernel_1::@multiple_launches_kernel blocks in (%[[CST]], %[[CST]], %[[CST]]) threads in (%[[CST]], %[[CST]], %[[CST]])
%t = gpu.wait async
%u = gpu.launch async [%t] blocks(%bx2, %by2, %bz2) in (%grid_x2 = %cst, %grid_y2 = %cst,
%grid_z2 = %cst)
threads(%tx2, %ty2, %tz2) in (%block_x2 = %cst, %block_y2 = %cst,
%block_z2 = %cst) {
// CHECK: gpu.launch_func async @multiple_launches_kernel_2::@multiple_launches_kernel blocks in (%[[CST]], %[[CST]], %[[CST]]) threads in (%[[CST]], %[[CST]], %[[CST]])
%v = gpu.launch async blocks(%bx2, %by2, %bz2) in (%grid_x2 = %cst, %grid_y2 = %cst,
%grid_z2 = %cst)
threads(%tx2, %ty2, %tz2) in (%block_x2 = %cst, %block_y2 = %cst,
%block_z2 = %cst) {