Move GPU::LaunchOp to ODS. NFC.

Move the definition of the GPU launch opreation from hand-rolled C++ code to
ODS framework. This only does the moves, a follow-up is necessary to clean up
users of custom functions that could be auto-generated by ODS.

PiperOrigin-RevId: 284261856
This commit is contained in:
Alex Zinenko 2019-12-06 14:23:06 -08:00 committed by A. Unique TensorFlower
parent 6e0a2e4e2f
commit 3230267d0d
4 changed files with 175 additions and 168 deletions

View File

@ -69,77 +69,6 @@ Example:
%gDimZ = "gpu.grid_dim"() {dimension = "z"} : () -> (index)
```
### `gpu.launch`
Launch a kernel on the specified grid of thread blocks. The body of the kernel
is defined by the single region that this operation contains. The operation
takes at least six operands, with first three operands being grid sizes along
x,y,z dimensions, the following three arguments being block sizes along x,y,z
dimension, and the remaining operands are arguments of the kernel. When a
lower-dimensional kernel is required, unused sizes must be explicitly set to
`1`.
The body region has at least _twelve_ arguments, grouped as follows:
- three arguments that contain block identifiers along x,y,z dimensions;
- three arguments that contain thread identifiers along x,y,z dimensions;
- operands of the `gpu.launch` operation as is, including six leading operands
for grid and block sizes.
Operations inside the body region, and any operations in the nested regions, are
_not_ allowed to use values defined outside the _body_ region, as if this region
was a function. If necessary, values must be passed as kernel arguments into the
body region. Nested regions inside the kernel body are allowed to use values
defined in their ancestor regions as long as they don't cross the kernel body
region boundary.
Syntax:
``` {.ebnf}
operation ::= `gpu.launch` `block` `(` ssa-id-list `)` `in` ssa-reassignment
`threads` `(` ssa-id-list `)` `in` ssa-reassignment
(`args` ssa-reassignment `:` type-list)?
region attr-dict?
ssa-reassignment ::= `(` ssa-id `=` ssa-use (`,` ssa-id `=` ssa-use)* `)`
```
Example:
```mlir {.mlir}
gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %0, %sz_by = %1, %sz_bz = %2)
threads(%tx, %ty, %tz) in (%sz_tx = %3, %sz_ty = %4, %sz_tz = %5)
args(%arg0 = %6, %arg1 = 7) : f32, memref<?xf32, 1> {
// Block and thread identifiers, as well as block/grid sizes are
// immediately usable inside body region.
"some_op"(%bx, %tx) : (index, index) -> ()
%42 = load %arg1[%bx] : memref<?xf32, 1>
}
// Generic syntax explains how the pretty syntax maps to the IR structure.
"gpu.launch"(%cst, %cst, %c1, // Grid sizes.
%cst, %c1, %c1, // Block sizes.
%arg0, %arg1) // Actual arguments.
{/*attributes*/}
// All sizes and identifiers have "index" size.
: (index, index, index, index, index, index, f32, memref<?xf32, 1>) -> () {
// The operation passes block and thread identifiers, followed by grid and block
// sizes, followed by actual arguments to the entry block of the region.
^bb0(%bx : index, %by : index, %bz : index,
%tx : index, %ty : index, %tz : index,
%num_bx : index, %num_by : index, %num_bz : index,
%num_tx : index, %num_ty : index, %num_tz : index,
%arg0 : f32, %arg1 : memref<?xf32, 1>):
"some_op"(%bx, %tx) : (index, index) -> ()
%3 = "std.load"(%arg1, %bx) : (memref<?xf32, 1>, index) -> f32
}
```
Rationale: using operation/block arguments gives analyses a clear way of
understanding that a value has additional semantics (e.g., we will need to know
what value corresponds to threadIdx.x for coalescing). We can recover these
properties by analyzing the operations producing values, but it is easier just
to have that information by construction.
### `gpu.launch_func`
Launch a kernel function on the specified grid of thread blocks. `gpu.launch`

View File

@ -77,74 +77,6 @@ struct KernelDim3 {
Value *z;
};
/// GPU kernel launch operation. Takes a 3D grid of thread blocks as leading
/// operands, followed by kernel data operands. Has one region representing
/// the kernel to be executed. This region is not allowed to use values defined
/// outside it.
class LaunchOp : public Op<LaunchOp, OpTrait::AtLeastNOperands<6>::Impl,
OpTrait::ZeroResult, OpTrait::IsIsolatedFromAbove> {
public:
using Op::Op;
static void build(Builder *builder, OperationState &result, Value *gridSizeX,
Value *gridSizeY, Value *gridSizeZ, Value *blockSizeX,
Value *blockSizeY, Value *blockSizeZ,
ArrayRef<Value *> operands);
/// Get the kernel region.
Region &getBody();
/// Get the SSA values corresponding to kernel block identifiers.
KernelDim3 getBlockIds();
/// Get the SSA values corresponding to kernel thread identifiers.
KernelDim3 getThreadIds();
/// Get the SSA values corresponding to kernel grid size.
KernelDim3 getGridSize();
/// Get the SSA values corresponding to kernel block size.
KernelDim3 getBlockSize();
/// Get the operand values passed as kernel arguments.
operand_range getKernelOperandValues();
/// Get the operand types passed as kernel arguments.
operand_type_range getKernelOperandTypes();
/// Get the SSA values passed as operands to specify the grid size.
KernelDim3 getGridSizeOperandValues();
/// Get the SSA values passed as operands to specify the block size.
KernelDim3 getBlockSizeOperandValues();
/// Get the SSA values of the kernel arguments.
llvm::iterator_range<Block::args_iterator> getKernelArguments();
LogicalResult verify();
/// Custom syntax support.
void print(OpAsmPrinter &p);
static ParseResult parse(OpAsmParser &parser, OperationState &result);
static StringRef getOperationName() { return "gpu.launch"; }
/// Erase the `index`-th kernel argument. Both the entry block argument and
/// the operand will be dropped. The block argument must not have any uses.
void eraseKernelArgument(unsigned index);
/// Append canonicalization patterns to `results`.
static void getCanonicalizationPatterns(OwningRewritePatternList &results,
MLIRContext *context);
private:
static StringRef getBlocksKeyword() { return "blocks"; }
static StringRef getThreadsKeyword() { return "threads"; }
static StringRef getArgsKeyword() { return "args"; }
/// The number of launch configuration operands, placed at the leading
/// positions of the operand list.
static constexpr unsigned kNumConfigOperands = 6;
/// The number of region attributes containing the launch configuration,
/// placed in the leading positions of the argument list.
static constexpr unsigned kNumConfigRegionAttributes = 12;
};
/// Operation to launch a kernel given as outlined function.
class LaunchFuncOp : public Op<LaunchFuncOp, OpTrait::AtLeastNOperands<6>::Impl,
OpTrait::ZeroResult> {

View File

@ -181,6 +181,146 @@ def GPU_GPUFuncOp : GPU_Op<"func", [FunctionLike, IsolatedFromAbove, Symbol]> {
let parser = [{ return parseGPUFuncOp(parser, result); }];
}
def GPU_LaunchOp : GPU_Op<"launch", [IsolatedFromAbove]>,
Arguments<(ins Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ,
Index:$blockSizeX, Index:$blockSizeY, Index:$blockSizeZ,
Variadic<AnyType>:$operands)>,
Results<(outs)> {
let summary = "GPU kernel launch operation";
let description = [{
Launch a kernel on the specified grid of thread blocks. The body of the
kernel is defined by the single region that this operation contains. The
operation takes at least six operands, with first three operands being grid
sizes along x,y,z dimensions, the following three arguments being block
sizes along x,y,z dimension, and the remaining operands are arguments of the
kernel. When a lower-dimensional kernel is required, unused sizes must be
explicitly set to `1`.
The body region has at least _twelve_ arguments, grouped as follows:
- three arguments that contain block identifiers along x,y,z dimensions;
- three arguments that contain thread identifiers along x,y,z dimensions;
- operands of the `gpu.launch` operation as is, including six leading
operands for grid and block sizes.
Operations inside the body region, and any operations in the nested regions,
are _not_ allowed to use values defined outside the _body_ region, as if
this region was a function. If necessary, values must be passed as kernel
arguments into the body region. Nested regions inside the kernel body are
allowed to use values defined in their ancestor regions as long as they
don't cross the kernel body region boundary.
Syntax:
``` {.ebnf}
operation ::= `gpu.launch` `block` `(` ssa-id-list `)` `in` ssa-reassignment
`threads` `(` ssa-id-list `)` `in` ssa-reassignment
(`args` ssa-reassignment `:` type-list)?
region attr-dict?
ssa-reassignment ::= `(` ssa-id `=` ssa-use (`,` ssa-id `=` ssa-use)* `)`
```
Example:
```mlir {.mlir}
gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %0, %sz_by = %1, %sz_bz = %2)
threads(%tx, %ty, %tz) in (%sz_tx = %3, %sz_ty = %4, %sz_tz = %5)
args(%arg0 = %6, %arg1 = 7) : f32, memref<?xf32, 1> {
// Block and thread identifiers, as well as block/grid sizes are
// immediately usable inside body region.
"some_op"(%bx, %tx) : (index, index) -> ()
%42 = load %arg1[%bx] : memref<?xf32, 1>
}
// Generic syntax explains how the pretty syntax maps to the IR structure.
"gpu.launch"(%cst, %cst, %c1, // Grid sizes.
%cst, %c1, %c1, // Block sizes.
%arg0, %arg1) // Actual arguments.
{/*attributes*/}
// All sizes and identifiers have "index" size.
: (index, index, index, index, index, index, f32, memref<?xf32, 1>)
-> () {
// The operation passes block and thread identifiers, followed by grid and
// block sizes, followed by actual arguments to the entry block of the
// region.
^bb0(%bx : index, %by : index, %bz : index,
%tx : index, %ty : index, %tz : index,
%num_bx : index, %num_by : index, %num_bz : index,
%num_tx : index, %num_ty : index, %num_tz : index,
%arg0 : f32, %arg1 : memref<?xf32, 1>):
"some_op"(%bx, %tx) : (index, index) -> ()
%3 = "std.load"(%arg1, %bx) : (memref<?xf32, 1>, index) -> f32
}
```
Rationale: using operation/block arguments gives analyses a clear way of
understanding that a value has additional semantics (e.g., we will need to
know what value corresponds to threadIdx.x for coalescing). We can recover
these properties by analyzing the operations producing values, but it is
easier just to have that information by construction.
}];
let regions = (region AnyRegion:$body);
let skipDefaultBuilders = 1;
let builders = [
OpBuilder<"Builder *builder, OperationState &result, Value *gridSizeX,"
"Value *gridSizeY, Value *gridSizeZ, Value *blockSizeX,"
"Value *blockSizeY, Value *blockSizeZ,"
"ArrayRef<Value *> operands">
];
let hasCanonicalizer = 1;
let extraClassDeclaration = [{
/// Get the kernel region.
Region &getBody();
/// Get the SSA values corresponding to kernel block identifiers.
KernelDim3 getBlockIds();
/// Get the SSA values corresponding to kernel thread identifiers.
KernelDim3 getThreadIds();
/// Get the SSA values corresponding to kernel grid size.
KernelDim3 getGridSize();
/// Get the SSA values corresponding to kernel block size.
KernelDim3 getBlockSize();
/// Get the operand values passed as kernel arguments.
operand_range getKernelOperandValues();
/// Get the operand types passed as kernel arguments.
operand_type_range getKernelOperandTypes();
/// Get the SSA values passed as operands to specify the grid size.
KernelDim3 getGridSizeOperandValues();
/// Get the SSA values passed as operands to specify the block size.
KernelDim3 getBlockSizeOperandValues();
/// Get the SSA values of the kernel arguments.
llvm::iterator_range<Block::args_iterator> getKernelArguments();
/// Erase the `index`-th kernel argument. Both the entry block argument and
/// the operand will be dropped. The block argument must not have any uses.
void eraseKernelArgument(unsigned index);
static StringRef getBlocksKeyword() { return "blocks"; }
static StringRef getThreadsKeyword() { return "threads"; }
static StringRef getArgsKeyword() { return "args"; }
/// The number of launch configuration operands, placed at the leading
/// positions of the operand list.
static constexpr unsigned kNumConfigOperands = 6;
/// The number of region attributes containing the launch configuration,
/// placed in the leading positions of the argument list.
static constexpr unsigned kNumConfigRegionAttributes = 12;
}];
let parser = [{ return parseLaunchOp(parser, result); }];
let printer = [{ printLaunchOp(p, *this); }];
let verifier = [{ return ::verify(*this); }];
}
def GPU_ReturnOp : GPU_Op<"return", [Terminator]>, Arguments<(ins)>,
Results<(outs)> {
let summary = "Terminator for GPU launch regions.";

View File

@ -46,7 +46,7 @@ bool GPUDialect::isKernel(Operation *op) {
GPUDialect::GPUDialect(MLIRContext *context)
: Dialect(getDialectName(), context) {
addOperations<LaunchOp, LaunchFuncOp,
addOperations<LaunchFuncOp,
#define GET_OP_LIST
#include "mlir/Dialect/GPU/GPUOps.cpp.inc"
>();
@ -244,19 +244,20 @@ llvm::iterator_range<Block::args_iterator> LaunchOp::getKernelArguments() {
return llvm::drop_begin(args, LaunchOp::kNumConfigRegionAttributes);
}
LogicalResult LaunchOp::verify() {
LogicalResult verify(LaunchOp op) {
// Kernel launch takes kNumConfigOperands leading operands for grid/block
// sizes and transforms them into kNumConfigRegionAttributes region arguments
// for block/thread identifiers and grid/block sizes.
if (!getBody().empty()) {
Block &entryBlock = getBody().front();
if (entryBlock.getNumArguments() != kNumConfigOperands + getNumOperands())
return emitOpError("unexpected number of region arguments");
if (!op.getBody().empty()) {
Block &entryBlock = op.getBody().front();
if (entryBlock.getNumArguments() !=
LaunchOp::kNumConfigOperands + op.getNumOperands())
return op.emitOpError("unexpected number of region arguments");
}
// Block terminators without successors are expected to exit the kernel region
// and must be `gpu.launch`.
for (Block &block : getBody()) {
for (Block &block : op.getBody()) {
if (block.empty())
continue;
if (block.back().getNumSuccessors() != 0)
@ -265,8 +266,8 @@ LogicalResult LaunchOp::verify() {
return block.back()
.emitError("expected 'gpu.terminator' or a terminator with "
"successors")
.attachNote(getLoc())
<< "in '" << getOperationName() << "' body region";
.attachNote(op.getLoc())
<< "in '" << LaunchOp::getOperationName() << "' body region";
}
}
@ -285,27 +286,31 @@ static void printSizeAssignment(OpAsmPrinter &p, KernelDim3 size,
p << *size.z << " = " << *operands[2] << ')';
}
void LaunchOp::print(OpAsmPrinter &p) {
SmallVector<Value *, 12> operandContainer(operand_begin(), operand_end());
void printLaunchOp(OpAsmPrinter &p, LaunchOp op) {
SmallVector<Value *, 12> operandContainer(op.operand_begin(),
op.operand_end());
ArrayRef<Value *> operands(operandContainer);
// Print the launch configuration.
p << getOperationName() << ' ' << getBlocksKeyword();
printSizeAssignment(p, getGridSize(), operands.take_front(3), getBlockIds());
p << ' ' << getThreadsKeyword();
printSizeAssignment(p, getBlockSize(), operands.slice(3, 3), getThreadIds());
p << LaunchOp::getOperationName() << ' ' << op.getBlocksKeyword();
printSizeAssignment(p, op.getGridSize(), operands.take_front(3),
op.getBlockIds());
p << ' ' << op.getThreadsKeyword();
printSizeAssignment(p, op.getBlockSize(), operands.slice(3, 3),
op.getThreadIds());
// From now on, the first kNumConfigOperands operands corresponding to grid
// and block sizes are irrelevant, so we can drop them.
operands = operands.drop_front(kNumConfigOperands);
operands = operands.drop_front(LaunchOp::kNumConfigOperands);
// Print the data argument remapping.
if (!getBody().empty() && !operands.empty()) {
p << ' ' << getArgsKeyword() << '(';
if (!op.getBody().empty() && !operands.empty()) {
p << ' ' << op.getArgsKeyword() << '(';
for (unsigned i = 0, e = operands.size(); i < e; ++i) {
if (i != 0)
p << ", ";
p << *getBody().front().getArgument(kNumConfigRegionAttributes + i)
p << *op.getBody().front().getArgument(
LaunchOp::kNumConfigRegionAttributes + i)
<< " = " << *operands[i];
}
p << ") ";
@ -321,8 +326,8 @@ void LaunchOp::print(OpAsmPrinter &p) {
}
}
p.printRegion(getBody(), /*printEntryBlockArgs=*/false);
p.printOptionalAttrDict(getAttrs());
p.printRegion(op.getBody(), /*printEntryBlockArgs=*/false);
p.printOptionalAttrDict(op.getAttrs());
}
// Parse the size assignment blocks for blocks and threads. These have the form
@ -361,10 +366,10 @@ parseSizeAssignment(OpAsmParser &parser,
// (`args` ssa-reassignment `:` type-list)?
// region attr-dict?
// ssa-reassignment ::= `(` ssa-id `=` ssa-use (`,` ssa-id `=` ssa-use)* `)`
ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) {
ParseResult parseLaunchOp(OpAsmParser &parser, OperationState &result) {
// Sizes of the grid and block.
SmallVector<OpAsmParser::OperandType, kNumConfigOperands> sizes(
kNumConfigOperands);
SmallVector<OpAsmParser::OperandType, LaunchOp::kNumConfigOperands> sizes(
LaunchOp::kNumConfigOperands);
MutableArrayRef<OpAsmParser::OperandType> sizesRef(sizes);
// Actual (data) operands passed to the kernel.
@ -372,7 +377,7 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) {
// Region arguments to be created.
SmallVector<OpAsmParser::OperandType, 16> regionArgs(
kNumConfigRegionAttributes);
LaunchOp::kNumConfigRegionAttributes);
MutableArrayRef<OpAsmParser::OperandType> regionArgsRef(regionArgs);
// Parse the size assignment segments: the first segment assigns grid sizes
@ -380,11 +385,11 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) {
// sizes and defines values for thread identifiers. In the region argument
// list, identifiers precede sizes, and block-related values precede
// thread-related values.
if (parser.parseKeyword(getBlocksKeyword().data()) ||
if (parser.parseKeyword(LaunchOp::getBlocksKeyword().data()) ||
parseSizeAssignment(parser, sizesRef.take_front(3),
regionArgsRef.slice(6, 3),
regionArgsRef.slice(0, 3)) ||
parser.parseKeyword(getThreadsKeyword().data()) ||
parser.parseKeyword(LaunchOp::getThreadsKeyword().data()) ||
parseSizeAssignment(parser, sizesRef.drop_front(3),
regionArgsRef.slice(9, 3),
regionArgsRef.slice(3, 3)) ||
@ -397,7 +402,7 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) {
// so is the trailing type list. Parse it as well and use the parsed types
// to resolve the operands passed to the kernel arguments.
SmallVector<Type, 4> dataTypes;
if (!parser.parseOptionalKeyword(getArgsKeyword())) {
if (!parser.parseOptionalKeyword(LaunchOp::getArgsKeyword())) {
llvm::SMLoc argsLoc = parser.getCurrentLocation();
regionArgs.push_back({});
@ -425,7 +430,8 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) {
// block/thread identifiers and grid/block sizes, all of the `index` type.
// Follow the actual kernel arguments.
Type index = parser.getBuilder().getIndexType();
dataTypes.insert(dataTypes.begin(), kNumConfigRegionAttributes, index);
dataTypes.insert(dataTypes.begin(), LaunchOp::kNumConfigRegionAttributes,
index);
Region *body = result.addRegion();
return failure(parser.parseRegion(*body, regionArgs, dataTypes) ||
parser.parseOptionalAttrDict(result.attributes));