From 3230267d0dbdc908d7be5886e20c2ee5f0e942fe Mon Sep 17 00:00:00 2001 From: Alex Zinenko Date: Fri, 6 Dec 2019 14:23:06 -0800 Subject: [PATCH] Move GPU::LaunchOp to ODS. NFC. Move the definition of the GPU launch opreation from hand-rolled C++ code to ODS framework. This only does the moves, a follow-up is necessary to clean up users of custom functions that could be auto-generated by ODS. PiperOrigin-RevId: 284261856 --- mlir/g3doc/Dialects/GPU.md | 71 ----------- mlir/include/mlir/Dialect/GPU/GPUDialect.h | 68 ---------- mlir/include/mlir/Dialect/GPU/GPUOps.td | 140 +++++++++++++++++++++ mlir/lib/Dialect/GPU/IR/GPUDialect.cpp | 64 +++++----- 4 files changed, 175 insertions(+), 168 deletions(-) diff --git a/mlir/g3doc/Dialects/GPU.md b/mlir/g3doc/Dialects/GPU.md index d34ce1891e88..bcb677d7660b 100644 --- a/mlir/g3doc/Dialects/GPU.md +++ b/mlir/g3doc/Dialects/GPU.md @@ -69,77 +69,6 @@ Example: %gDimZ = "gpu.grid_dim"() {dimension = "z"} : () -> (index) ``` -### `gpu.launch` - -Launch a kernel on the specified grid of thread blocks. The body of the kernel -is defined by the single region that this operation contains. The operation -takes at least six operands, with first three operands being grid sizes along -x,y,z dimensions, the following three arguments being block sizes along x,y,z -dimension, and the remaining operands are arguments of the kernel. When a -lower-dimensional kernel is required, unused sizes must be explicitly set to -`1`. - -The body region has at least _twelve_ arguments, grouped as follows: - -- three arguments that contain block identifiers along x,y,z dimensions; -- three arguments that contain thread identifiers along x,y,z dimensions; -- operands of the `gpu.launch` operation as is, including six leading operands - for grid and block sizes. - -Operations inside the body region, and any operations in the nested regions, are -_not_ allowed to use values defined outside the _body_ region, as if this region -was a function. If necessary, values must be passed as kernel arguments into the -body region. Nested regions inside the kernel body are allowed to use values -defined in their ancestor regions as long as they don't cross the kernel body -region boundary. - -Syntax: - -``` {.ebnf} -operation ::= `gpu.launch` `block` `(` ssa-id-list `)` `in` ssa-reassignment - `threads` `(` ssa-id-list `)` `in` ssa-reassignment - (`args` ssa-reassignment `:` type-list)? - region attr-dict? -ssa-reassignment ::= `(` ssa-id `=` ssa-use (`,` ssa-id `=` ssa-use)* `)` -``` - -Example: - -```mlir {.mlir} -gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %0, %sz_by = %1, %sz_bz = %2) - threads(%tx, %ty, %tz) in (%sz_tx = %3, %sz_ty = %4, %sz_tz = %5) - args(%arg0 = %6, %arg1 = 7) : f32, memref { - // Block and thread identifiers, as well as block/grid sizes are - // immediately usable inside body region. - "some_op"(%bx, %tx) : (index, index) -> () - %42 = load %arg1[%bx] : memref -} - -// Generic syntax explains how the pretty syntax maps to the IR structure. -"gpu.launch"(%cst, %cst, %c1, // Grid sizes. - %cst, %c1, %c1, // Block sizes. - %arg0, %arg1) // Actual arguments. - {/*attributes*/} - // All sizes and identifiers have "index" size. - : (index, index, index, index, index, index, f32, memref) -> () { -// The operation passes block and thread identifiers, followed by grid and block -// sizes, followed by actual arguments to the entry block of the region. -^bb0(%bx : index, %by : index, %bz : index, - %tx : index, %ty : index, %tz : index, - %num_bx : index, %num_by : index, %num_bz : index, - %num_tx : index, %num_ty : index, %num_tz : index, - %arg0 : f32, %arg1 : memref): - "some_op"(%bx, %tx) : (index, index) -> () - %3 = "std.load"(%arg1, %bx) : (memref, index) -> f32 -} -``` - -Rationale: using operation/block arguments gives analyses a clear way of -understanding that a value has additional semantics (e.g., we will need to know -what value corresponds to threadIdx.x for coalescing). We can recover these -properties by analyzing the operations producing values, but it is easier just -to have that information by construction. - ### `gpu.launch_func` Launch a kernel function on the specified grid of thread blocks. `gpu.launch` diff --git a/mlir/include/mlir/Dialect/GPU/GPUDialect.h b/mlir/include/mlir/Dialect/GPU/GPUDialect.h index 194dd9c1e1dc..3d63a45b8efb 100644 --- a/mlir/include/mlir/Dialect/GPU/GPUDialect.h +++ b/mlir/include/mlir/Dialect/GPU/GPUDialect.h @@ -77,74 +77,6 @@ struct KernelDim3 { Value *z; }; -/// GPU kernel launch operation. Takes a 3D grid of thread blocks as leading -/// operands, followed by kernel data operands. Has one region representing -/// the kernel to be executed. This region is not allowed to use values defined -/// outside it. -class LaunchOp : public Op::Impl, - OpTrait::ZeroResult, OpTrait::IsIsolatedFromAbove> { -public: - using Op::Op; - - static void build(Builder *builder, OperationState &result, Value *gridSizeX, - Value *gridSizeY, Value *gridSizeZ, Value *blockSizeX, - Value *blockSizeY, Value *blockSizeZ, - ArrayRef operands); - - /// Get the kernel region. - Region &getBody(); - - /// Get the SSA values corresponding to kernel block identifiers. - KernelDim3 getBlockIds(); - /// Get the SSA values corresponding to kernel thread identifiers. - KernelDim3 getThreadIds(); - /// Get the SSA values corresponding to kernel grid size. - KernelDim3 getGridSize(); - /// Get the SSA values corresponding to kernel block size. - KernelDim3 getBlockSize(); - /// Get the operand values passed as kernel arguments. - operand_range getKernelOperandValues(); - /// Get the operand types passed as kernel arguments. - operand_type_range getKernelOperandTypes(); - - /// Get the SSA values passed as operands to specify the grid size. - KernelDim3 getGridSizeOperandValues(); - /// Get the SSA values passed as operands to specify the block size. - KernelDim3 getBlockSizeOperandValues(); - - /// Get the SSA values of the kernel arguments. - llvm::iterator_range getKernelArguments(); - - LogicalResult verify(); - - /// Custom syntax support. - void print(OpAsmPrinter &p); - static ParseResult parse(OpAsmParser &parser, OperationState &result); - - static StringRef getOperationName() { return "gpu.launch"; } - - /// Erase the `index`-th kernel argument. Both the entry block argument and - /// the operand will be dropped. The block argument must not have any uses. - void eraseKernelArgument(unsigned index); - - /// Append canonicalization patterns to `results`. - static void getCanonicalizationPatterns(OwningRewritePatternList &results, - MLIRContext *context); - -private: - static StringRef getBlocksKeyword() { return "blocks"; } - static StringRef getThreadsKeyword() { return "threads"; } - static StringRef getArgsKeyword() { return "args"; } - - /// The number of launch configuration operands, placed at the leading - /// positions of the operand list. - static constexpr unsigned kNumConfigOperands = 6; - - /// The number of region attributes containing the launch configuration, - /// placed in the leading positions of the argument list. - static constexpr unsigned kNumConfigRegionAttributes = 12; -}; - /// Operation to launch a kernel given as outlined function. class LaunchFuncOp : public Op::Impl, OpTrait::ZeroResult> { diff --git a/mlir/include/mlir/Dialect/GPU/GPUOps.td b/mlir/include/mlir/Dialect/GPU/GPUOps.td index fcaa77ce779c..9b4e21800bdb 100644 --- a/mlir/include/mlir/Dialect/GPU/GPUOps.td +++ b/mlir/include/mlir/Dialect/GPU/GPUOps.td @@ -181,6 +181,146 @@ def GPU_GPUFuncOp : GPU_Op<"func", [FunctionLike, IsolatedFromAbove, Symbol]> { let parser = [{ return parseGPUFuncOp(parser, result); }]; } +def GPU_LaunchOp : GPU_Op<"launch", [IsolatedFromAbove]>, + Arguments<(ins Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ, + Index:$blockSizeX, Index:$blockSizeY, Index:$blockSizeZ, + Variadic:$operands)>, + Results<(outs)> { + let summary = "GPU kernel launch operation"; + + let description = [{ + Launch a kernel on the specified grid of thread blocks. The body of the + kernel is defined by the single region that this operation contains. The + operation takes at least six operands, with first three operands being grid + sizes along x,y,z dimensions, the following three arguments being block + sizes along x,y,z dimension, and the remaining operands are arguments of the + kernel. When a lower-dimensional kernel is required, unused sizes must be + explicitly set to `1`. + + The body region has at least _twelve_ arguments, grouped as follows: + + - three arguments that contain block identifiers along x,y,z dimensions; + - three arguments that contain thread identifiers along x,y,z dimensions; + - operands of the `gpu.launch` operation as is, including six leading + operands for grid and block sizes. + + Operations inside the body region, and any operations in the nested regions, + are _not_ allowed to use values defined outside the _body_ region, as if + this region was a function. If necessary, values must be passed as kernel + arguments into the body region. Nested regions inside the kernel body are + allowed to use values defined in their ancestor regions as long as they + don't cross the kernel body region boundary. + + Syntax: + + ``` {.ebnf} + operation ::= `gpu.launch` `block` `(` ssa-id-list `)` `in` ssa-reassignment + `threads` `(` ssa-id-list `)` `in` ssa-reassignment + (`args` ssa-reassignment `:` type-list)? + region attr-dict? + ssa-reassignment ::= `(` ssa-id `=` ssa-use (`,` ssa-id `=` ssa-use)* `)` + ``` + + Example: + + ```mlir {.mlir} + gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %0, %sz_by = %1, %sz_bz = %2) + threads(%tx, %ty, %tz) in (%sz_tx = %3, %sz_ty = %4, %sz_tz = %5) + args(%arg0 = %6, %arg1 = 7) : f32, memref { + // Block and thread identifiers, as well as block/grid sizes are + // immediately usable inside body region. + "some_op"(%bx, %tx) : (index, index) -> () + %42 = load %arg1[%bx] : memref + } + + // Generic syntax explains how the pretty syntax maps to the IR structure. + "gpu.launch"(%cst, %cst, %c1, // Grid sizes. + %cst, %c1, %c1, // Block sizes. + %arg0, %arg1) // Actual arguments. + {/*attributes*/} + // All sizes and identifiers have "index" size. + : (index, index, index, index, index, index, f32, memref) + -> () { + // The operation passes block and thread identifiers, followed by grid and + // block sizes, followed by actual arguments to the entry block of the + // region. + ^bb0(%bx : index, %by : index, %bz : index, + %tx : index, %ty : index, %tz : index, + %num_bx : index, %num_by : index, %num_bz : index, + %num_tx : index, %num_ty : index, %num_tz : index, + %arg0 : f32, %arg1 : memref): + "some_op"(%bx, %tx) : (index, index) -> () + %3 = "std.load"(%arg1, %bx) : (memref, index) -> f32 + } + ``` + + Rationale: using operation/block arguments gives analyses a clear way of + understanding that a value has additional semantics (e.g., we will need to + know what value corresponds to threadIdx.x for coalescing). We can recover + these properties by analyzing the operations producing values, but it is + easier just to have that information by construction. + }]; + + let regions = (region AnyRegion:$body); + + let skipDefaultBuilders = 1; + + let builders = [ + OpBuilder<"Builder *builder, OperationState &result, Value *gridSizeX," + "Value *gridSizeY, Value *gridSizeZ, Value *blockSizeX," + "Value *blockSizeY, Value *blockSizeZ," + "ArrayRef operands"> + ]; + + let hasCanonicalizer = 1; + + let extraClassDeclaration = [{ + /// Get the kernel region. + Region &getBody(); + + /// Get the SSA values corresponding to kernel block identifiers. + KernelDim3 getBlockIds(); + /// Get the SSA values corresponding to kernel thread identifiers. + KernelDim3 getThreadIds(); + /// Get the SSA values corresponding to kernel grid size. + KernelDim3 getGridSize(); + /// Get the SSA values corresponding to kernel block size. + KernelDim3 getBlockSize(); + /// Get the operand values passed as kernel arguments. + operand_range getKernelOperandValues(); + /// Get the operand types passed as kernel arguments. + operand_type_range getKernelOperandTypes(); + + /// Get the SSA values passed as operands to specify the grid size. + KernelDim3 getGridSizeOperandValues(); + /// Get the SSA values passed as operands to specify the block size. + KernelDim3 getBlockSizeOperandValues(); + + /// Get the SSA values of the kernel arguments. + llvm::iterator_range getKernelArguments(); + + /// Erase the `index`-th kernel argument. Both the entry block argument and + /// the operand will be dropped. The block argument must not have any uses. + void eraseKernelArgument(unsigned index); + + static StringRef getBlocksKeyword() { return "blocks"; } + static StringRef getThreadsKeyword() { return "threads"; } + static StringRef getArgsKeyword() { return "args"; } + + /// The number of launch configuration operands, placed at the leading + /// positions of the operand list. + static constexpr unsigned kNumConfigOperands = 6; + + /// The number of region attributes containing the launch configuration, + /// placed in the leading positions of the argument list. + static constexpr unsigned kNumConfigRegionAttributes = 12; + }]; + + let parser = [{ return parseLaunchOp(parser, result); }]; + let printer = [{ printLaunchOp(p, *this); }]; + let verifier = [{ return ::verify(*this); }]; +} + def GPU_ReturnOp : GPU_Op<"return", [Terminator]>, Arguments<(ins)>, Results<(outs)> { let summary = "Terminator for GPU launch regions."; diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp index 38998b968ad5..87b170b6da89 100644 --- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp +++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp @@ -46,7 +46,7 @@ bool GPUDialect::isKernel(Operation *op) { GPUDialect::GPUDialect(MLIRContext *context) : Dialect(getDialectName(), context) { - addOperations(); @@ -244,19 +244,20 @@ llvm::iterator_range LaunchOp::getKernelArguments() { return llvm::drop_begin(args, LaunchOp::kNumConfigRegionAttributes); } -LogicalResult LaunchOp::verify() { +LogicalResult verify(LaunchOp op) { // Kernel launch takes kNumConfigOperands leading operands for grid/block // sizes and transforms them into kNumConfigRegionAttributes region arguments // for block/thread identifiers and grid/block sizes. - if (!getBody().empty()) { - Block &entryBlock = getBody().front(); - if (entryBlock.getNumArguments() != kNumConfigOperands + getNumOperands()) - return emitOpError("unexpected number of region arguments"); + if (!op.getBody().empty()) { + Block &entryBlock = op.getBody().front(); + if (entryBlock.getNumArguments() != + LaunchOp::kNumConfigOperands + op.getNumOperands()) + return op.emitOpError("unexpected number of region arguments"); } // Block terminators without successors are expected to exit the kernel region // and must be `gpu.launch`. - for (Block &block : getBody()) { + for (Block &block : op.getBody()) { if (block.empty()) continue; if (block.back().getNumSuccessors() != 0) @@ -265,8 +266,8 @@ LogicalResult LaunchOp::verify() { return block.back() .emitError("expected 'gpu.terminator' or a terminator with " "successors") - .attachNote(getLoc()) - << "in '" << getOperationName() << "' body region"; + .attachNote(op.getLoc()) + << "in '" << LaunchOp::getOperationName() << "' body region"; } } @@ -285,27 +286,31 @@ static void printSizeAssignment(OpAsmPrinter &p, KernelDim3 size, p << *size.z << " = " << *operands[2] << ')'; } -void LaunchOp::print(OpAsmPrinter &p) { - SmallVector operandContainer(operand_begin(), operand_end()); +void printLaunchOp(OpAsmPrinter &p, LaunchOp op) { + SmallVector operandContainer(op.operand_begin(), + op.operand_end()); ArrayRef operands(operandContainer); // Print the launch configuration. - p << getOperationName() << ' ' << getBlocksKeyword(); - printSizeAssignment(p, getGridSize(), operands.take_front(3), getBlockIds()); - p << ' ' << getThreadsKeyword(); - printSizeAssignment(p, getBlockSize(), operands.slice(3, 3), getThreadIds()); + p << LaunchOp::getOperationName() << ' ' << op.getBlocksKeyword(); + printSizeAssignment(p, op.getGridSize(), operands.take_front(3), + op.getBlockIds()); + p << ' ' << op.getThreadsKeyword(); + printSizeAssignment(p, op.getBlockSize(), operands.slice(3, 3), + op.getThreadIds()); // From now on, the first kNumConfigOperands operands corresponding to grid // and block sizes are irrelevant, so we can drop them. - operands = operands.drop_front(kNumConfigOperands); + operands = operands.drop_front(LaunchOp::kNumConfigOperands); // Print the data argument remapping. - if (!getBody().empty() && !operands.empty()) { - p << ' ' << getArgsKeyword() << '('; + if (!op.getBody().empty() && !operands.empty()) { + p << ' ' << op.getArgsKeyword() << '('; for (unsigned i = 0, e = operands.size(); i < e; ++i) { if (i != 0) p << ", "; - p << *getBody().front().getArgument(kNumConfigRegionAttributes + i) + p << *op.getBody().front().getArgument( + LaunchOp::kNumConfigRegionAttributes + i) << " = " << *operands[i]; } p << ") "; @@ -321,8 +326,8 @@ void LaunchOp::print(OpAsmPrinter &p) { } } - p.printRegion(getBody(), /*printEntryBlockArgs=*/false); - p.printOptionalAttrDict(getAttrs()); + p.printRegion(op.getBody(), /*printEntryBlockArgs=*/false); + p.printOptionalAttrDict(op.getAttrs()); } // Parse the size assignment blocks for blocks and threads. These have the form @@ -361,10 +366,10 @@ parseSizeAssignment(OpAsmParser &parser, // (`args` ssa-reassignment `:` type-list)? // region attr-dict? // ssa-reassignment ::= `(` ssa-id `=` ssa-use (`,` ssa-id `=` ssa-use)* `)` -ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) { +ParseResult parseLaunchOp(OpAsmParser &parser, OperationState &result) { // Sizes of the grid and block. - SmallVector sizes( - kNumConfigOperands); + SmallVector sizes( + LaunchOp::kNumConfigOperands); MutableArrayRef sizesRef(sizes); // Actual (data) operands passed to the kernel. @@ -372,7 +377,7 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) { // Region arguments to be created. SmallVector regionArgs( - kNumConfigRegionAttributes); + LaunchOp::kNumConfigRegionAttributes); MutableArrayRef regionArgsRef(regionArgs); // Parse the size assignment segments: the first segment assigns grid sizes @@ -380,11 +385,11 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) { // sizes and defines values for thread identifiers. In the region argument // list, identifiers precede sizes, and block-related values precede // thread-related values. - if (parser.parseKeyword(getBlocksKeyword().data()) || + if (parser.parseKeyword(LaunchOp::getBlocksKeyword().data()) || parseSizeAssignment(parser, sizesRef.take_front(3), regionArgsRef.slice(6, 3), regionArgsRef.slice(0, 3)) || - parser.parseKeyword(getThreadsKeyword().data()) || + parser.parseKeyword(LaunchOp::getThreadsKeyword().data()) || parseSizeAssignment(parser, sizesRef.drop_front(3), regionArgsRef.slice(9, 3), regionArgsRef.slice(3, 3)) || @@ -397,7 +402,7 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) { // so is the trailing type list. Parse it as well and use the parsed types // to resolve the operands passed to the kernel arguments. SmallVector dataTypes; - if (!parser.parseOptionalKeyword(getArgsKeyword())) { + if (!parser.parseOptionalKeyword(LaunchOp::getArgsKeyword())) { llvm::SMLoc argsLoc = parser.getCurrentLocation(); regionArgs.push_back({}); @@ -425,7 +430,8 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) { // block/thread identifiers and grid/block sizes, all of the `index` type. // Follow the actual kernel arguments. Type index = parser.getBuilder().getIndexType(); - dataTypes.insert(dataTypes.begin(), kNumConfigRegionAttributes, index); + dataTypes.insert(dataTypes.begin(), LaunchOp::kNumConfigRegionAttributes, + index); Region *body = result.addRegion(); return failure(parser.parseRegion(*body, regionArgs, dataTypes) || parser.parseOptionalAttrDict(result.attributes));