From 3230267d0dbdc908d7be5886e20c2ee5f0e942fe Mon Sep 17 00:00:00 2001
From: Alex Zinenko <zinenko@google.com>
Date: Fri, 6 Dec 2019 14:23:06 -0800
Subject: [PATCH] Move GPU::LaunchOp to ODS. NFC.

Move the definition of the GPU launch opreation from hand-rolled C++ code to
ODS framework. This only does the moves, a follow-up is necessary to clean up
users of custom functions that could be auto-generated by ODS.

PiperOrigin-RevId: 284261856
---
 mlir/g3doc/Dialects/GPU.md                 |  71 -----------
 mlir/include/mlir/Dialect/GPU/GPUDialect.h |  68 ----------
 mlir/include/mlir/Dialect/GPU/GPUOps.td    | 140 +++++++++++++++++++++
 mlir/lib/Dialect/GPU/IR/GPUDialect.cpp     |  64 +++++-----
 4 files changed, 175 insertions(+), 168 deletions(-)

diff --git a/mlir/g3doc/Dialects/GPU.md b/mlir/g3doc/Dialects/GPU.md
index d34ce1891e88..bcb677d7660b 100644
--- a/mlir/g3doc/Dialects/GPU.md
+++ b/mlir/g3doc/Dialects/GPU.md
@@ -69,77 +69,6 @@ Example:
   %gDimZ = "gpu.grid_dim"() {dimension = "z"} : () -> (index)
 ```
 
-### `gpu.launch`
-
-Launch a kernel on the specified grid of thread blocks. The body of the kernel
-is defined by the single region that this operation contains. The operation
-takes at least six operands, with first three operands being grid sizes along
-x,y,z dimensions, the following three arguments being block sizes along x,y,z
-dimension, and the remaining operands are arguments of the kernel. When a
-lower-dimensional kernel is required, unused sizes must be explicitly set to
-`1`.
-
-The body region has at least _twelve_ arguments, grouped as follows:
-
--   three arguments that contain block identifiers along x,y,z dimensions;
--   three arguments that contain thread identifiers along x,y,z dimensions;
--   operands of the `gpu.launch` operation as is, including six leading operands
-    for grid and block sizes.
-
-Operations inside the body region, and any operations in the nested regions, are
-_not_ allowed to use values defined outside the _body_ region, as if this region
-was a function. If necessary, values must be passed as kernel arguments into the
-body region. Nested regions inside the kernel body are allowed to use values
-defined in their ancestor regions as long as they don't cross the kernel body
-region boundary.
-
-Syntax:
-
-``` {.ebnf}
-operation ::= `gpu.launch` `block` `(` ssa-id-list `)` `in` ssa-reassignment
-                         `threads` `(` ssa-id-list `)` `in` ssa-reassignment
-                           (`args` ssa-reassignment `:` type-list)?
-                           region attr-dict?
-ssa-reassignment ::= `(` ssa-id `=` ssa-use (`,` ssa-id `=` ssa-use)* `)`
-```
-
-Example:
-
-```mlir {.mlir}
-gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %0, %sz_by = %1, %sz_bz = %2)
-           threads(%tx, %ty, %tz) in (%sz_tx = %3, %sz_ty = %4, %sz_tz = %5)
-           args(%arg0 = %6, %arg1 = 7) : f32, memref<?xf32, 1> {
-  // Block and thread identifiers, as well as block/grid sizes are
-  // immediately usable inside body region.
-  "some_op"(%bx, %tx) : (index, index) -> ()
-  %42 = load %arg1[%bx] : memref<?xf32, 1>
-}
-
-// Generic syntax explains how the pretty syntax maps to the IR structure.
-"gpu.launch"(%cst, %cst, %c1,  // Grid sizes.
-                    %cst, %c1, %c1,   // Block sizes.
-                    %arg0, %arg1)     // Actual arguments.
-    {/*attributes*/}
-    // All sizes and identifiers have "index" size.
-    : (index, index, index, index, index, index, f32, memref<?xf32, 1>) -> () {
-// The operation passes block and thread identifiers, followed by grid and block
-// sizes, followed by actual arguments to the entry block of the region.
-^bb0(%bx : index, %by : index, %bz : index,
-     %tx : index, %ty : index, %tz : index,
-     %num_bx : index, %num_by : index, %num_bz : index,
-     %num_tx : index, %num_ty : index, %num_tz : index,
-     %arg0 : f32, %arg1 : memref<?xf32, 1>):
-  "some_op"(%bx, %tx) : (index, index) -> ()
-  %3 = "std.load"(%arg1, %bx) : (memref<?xf32, 1>, index) -> f32
-}
-```
-
-Rationale: using operation/block arguments gives analyses a clear way of
-understanding that a value has additional semantics (e.g., we will need to know
-what value corresponds to threadIdx.x for coalescing). We can recover these
-properties by analyzing the operations producing values, but it is easier just
-to have that information by construction.
-
 ### `gpu.launch_func`
 
 Launch a kernel function on the specified grid of thread blocks. `gpu.launch`
diff --git a/mlir/include/mlir/Dialect/GPU/GPUDialect.h b/mlir/include/mlir/Dialect/GPU/GPUDialect.h
index 194dd9c1e1dc..3d63a45b8efb 100644
--- a/mlir/include/mlir/Dialect/GPU/GPUDialect.h
+++ b/mlir/include/mlir/Dialect/GPU/GPUDialect.h
@@ -77,74 +77,6 @@ struct KernelDim3 {
   Value *z;
 };
 
-/// GPU kernel launch operation.  Takes a 3D grid of thread blocks as leading
-/// operands, followed by kernel data operands.  Has one region representing
-/// the kernel to be executed.  This region is not allowed to use values defined
-/// outside it.
-class LaunchOp : public Op<LaunchOp, OpTrait::AtLeastNOperands<6>::Impl,
-                           OpTrait::ZeroResult, OpTrait::IsIsolatedFromAbove> {
-public:
-  using Op::Op;
-
-  static void build(Builder *builder, OperationState &result, Value *gridSizeX,
-                    Value *gridSizeY, Value *gridSizeZ, Value *blockSizeX,
-                    Value *blockSizeY, Value *blockSizeZ,
-                    ArrayRef<Value *> operands);
-
-  /// Get the kernel region.
-  Region &getBody();
-
-  /// Get the SSA values corresponding to kernel block identifiers.
-  KernelDim3 getBlockIds();
-  /// Get the SSA values corresponding to kernel thread identifiers.
-  KernelDim3 getThreadIds();
-  /// Get the SSA values corresponding to kernel grid size.
-  KernelDim3 getGridSize();
-  /// Get the SSA values corresponding to kernel block size.
-  KernelDim3 getBlockSize();
-  /// Get the operand values passed as kernel arguments.
-  operand_range getKernelOperandValues();
-  /// Get the operand types passed as kernel arguments.
-  operand_type_range getKernelOperandTypes();
-
-  /// Get the SSA values passed as operands to specify the grid size.
-  KernelDim3 getGridSizeOperandValues();
-  /// Get the SSA values passed as operands to specify the block size.
-  KernelDim3 getBlockSizeOperandValues();
-
-  /// Get the SSA values of the kernel arguments.
-  llvm::iterator_range<Block::args_iterator> getKernelArguments();
-
-  LogicalResult verify();
-
-  /// Custom syntax support.
-  void print(OpAsmPrinter &p);
-  static ParseResult parse(OpAsmParser &parser, OperationState &result);
-
-  static StringRef getOperationName() { return "gpu.launch"; }
-
-  /// Erase the `index`-th kernel argument.  Both the entry block argument and
-  /// the operand will be dropped.  The block argument must not have any uses.
-  void eraseKernelArgument(unsigned index);
-
-  /// Append canonicalization patterns to `results`.
-  static void getCanonicalizationPatterns(OwningRewritePatternList &results,
-                                          MLIRContext *context);
-
-private:
-  static StringRef getBlocksKeyword() { return "blocks"; }
-  static StringRef getThreadsKeyword() { return "threads"; }
-  static StringRef getArgsKeyword() { return "args"; }
-
-  /// The number of launch configuration operands, placed at the leading
-  /// positions of the operand list.
-  static constexpr unsigned kNumConfigOperands = 6;
-
-  /// The number of region attributes containing the launch configuration,
-  /// placed in the leading positions of the argument list.
-  static constexpr unsigned kNumConfigRegionAttributes = 12;
-};
-
 /// Operation to launch a kernel given as outlined function.
 class LaunchFuncOp : public Op<LaunchFuncOp, OpTrait::AtLeastNOperands<6>::Impl,
                                OpTrait::ZeroResult> {
diff --git a/mlir/include/mlir/Dialect/GPU/GPUOps.td b/mlir/include/mlir/Dialect/GPU/GPUOps.td
index fcaa77ce779c..9b4e21800bdb 100644
--- a/mlir/include/mlir/Dialect/GPU/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/GPUOps.td
@@ -181,6 +181,146 @@ def GPU_GPUFuncOp : GPU_Op<"func", [FunctionLike, IsolatedFromAbove, Symbol]> {
   let parser = [{ return parseGPUFuncOp(parser, result); }];
 }
 
+def GPU_LaunchOp : GPU_Op<"launch", [IsolatedFromAbove]>,
+    Arguments<(ins Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ,
+               Index:$blockSizeX, Index:$blockSizeY, Index:$blockSizeZ,
+               Variadic<AnyType>:$operands)>,
+    Results<(outs)> {
+  let summary = "GPU kernel launch operation";
+
+  let description = [{
+    Launch a kernel on the specified grid of thread blocks. The body of the
+    kernel is defined by the single region that this operation contains. The
+    operation takes at least six operands, with first three operands being grid
+    sizes along x,y,z dimensions, the following three arguments being block
+    sizes along x,y,z dimension, and the remaining operands are arguments of the
+    kernel. When a lower-dimensional kernel is required, unused sizes must be
+    explicitly set to `1`.
+
+    The body region has at least _twelve_ arguments, grouped as follows:
+
+    -   three arguments that contain block identifiers along x,y,z dimensions;
+    -   three arguments that contain thread identifiers along x,y,z dimensions;
+    -   operands of the `gpu.launch` operation as is, including six leading
+        operands for grid and block sizes.
+
+    Operations inside the body region, and any operations in the nested regions,
+    are _not_ allowed to use values defined outside the _body_ region, as if
+    this region was a function. If necessary, values must be passed as kernel
+    arguments into the body region. Nested regions inside the kernel body are
+    allowed to use values defined in their ancestor regions as long as they
+    don't cross the kernel body region boundary.
+
+    Syntax:
+
+    ``` {.ebnf}
+    operation ::= `gpu.launch` `block` `(` ssa-id-list `)` `in` ssa-reassignment
+                             `threads` `(` ssa-id-list `)` `in` ssa-reassignment
+                               (`args` ssa-reassignment `:` type-list)?
+                               region attr-dict?
+    ssa-reassignment ::= `(` ssa-id `=` ssa-use (`,` ssa-id `=` ssa-use)* `)`
+    ```
+
+    Example:
+
+    ```mlir {.mlir}
+    gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %0, %sz_by = %1, %sz_bz = %2)
+               threads(%tx, %ty, %tz) in (%sz_tx = %3, %sz_ty = %4, %sz_tz = %5)
+               args(%arg0 = %6, %arg1 = 7) : f32, memref<?xf32, 1> {
+      // Block and thread identifiers, as well as block/grid sizes are
+      // immediately usable inside body region.
+      "some_op"(%bx, %tx) : (index, index) -> ()
+      %42 = load %arg1[%bx] : memref<?xf32, 1>
+    }
+
+    // Generic syntax explains how the pretty syntax maps to the IR structure.
+    "gpu.launch"(%cst, %cst, %c1,  // Grid sizes.
+                        %cst, %c1, %c1,   // Block sizes.
+                        %arg0, %arg1)     // Actual arguments.
+        {/*attributes*/}
+        // All sizes and identifiers have "index" size.
+        : (index, index, index, index, index, index, f32, memref<?xf32, 1>)
+            -> () {
+    // The operation passes block and thread identifiers, followed by grid and
+    // block sizes, followed by actual arguments to the entry block of the
+    // region.
+    ^bb0(%bx : index, %by : index, %bz : index,
+         %tx : index, %ty : index, %tz : index,
+         %num_bx : index, %num_by : index, %num_bz : index,
+         %num_tx : index, %num_ty : index, %num_tz : index,
+         %arg0 : f32, %arg1 : memref<?xf32, 1>):
+      "some_op"(%bx, %tx) : (index, index) -> ()
+      %3 = "std.load"(%arg1, %bx) : (memref<?xf32, 1>, index) -> f32
+    }
+    ```
+
+    Rationale: using operation/block arguments gives analyses a clear way of
+    understanding that a value has additional semantics (e.g., we will need to
+    know what value corresponds to threadIdx.x for coalescing). We can recover
+    these properties by analyzing the operations producing values, but it is
+    easier just to have that information by construction.
+  }];
+
+  let regions = (region AnyRegion:$body);
+
+  let skipDefaultBuilders = 1;
+
+  let builders = [
+    OpBuilder<"Builder *builder, OperationState &result, Value *gridSizeX,"
+              "Value *gridSizeY, Value *gridSizeZ, Value *blockSizeX,"
+              "Value *blockSizeY, Value *blockSizeZ,"
+              "ArrayRef<Value *> operands">
+  ];
+
+  let hasCanonicalizer = 1;
+
+  let extraClassDeclaration = [{
+    /// Get the kernel region.
+    Region &getBody();
+
+    /// Get the SSA values corresponding to kernel block identifiers.
+    KernelDim3 getBlockIds();
+    /// Get the SSA values corresponding to kernel thread identifiers.
+    KernelDim3 getThreadIds();
+    /// Get the SSA values corresponding to kernel grid size.
+    KernelDim3 getGridSize();
+    /// Get the SSA values corresponding to kernel block size.
+    KernelDim3 getBlockSize();
+    /// Get the operand values passed as kernel arguments.
+    operand_range getKernelOperandValues();
+    /// Get the operand types passed as kernel arguments.
+    operand_type_range getKernelOperandTypes();
+
+    /// Get the SSA values passed as operands to specify the grid size.
+    KernelDim3 getGridSizeOperandValues();
+    /// Get the SSA values passed as operands to specify the block size.
+    KernelDim3 getBlockSizeOperandValues();
+
+    /// Get the SSA values of the kernel arguments.
+    llvm::iterator_range<Block::args_iterator> getKernelArguments();
+
+    /// Erase the `index`-th kernel argument.  Both the entry block argument and
+    /// the operand will be dropped.  The block argument must not have any uses.
+    void eraseKernelArgument(unsigned index);
+
+    static StringRef getBlocksKeyword() { return "blocks"; }
+    static StringRef getThreadsKeyword() { return "threads"; }
+    static StringRef getArgsKeyword() { return "args"; }
+
+    /// The number of launch configuration operands, placed at the leading
+    /// positions of the operand list.
+    static constexpr unsigned kNumConfigOperands = 6;
+
+    /// The number of region attributes containing the launch configuration,
+    /// placed in the leading positions of the argument list.
+    static constexpr unsigned kNumConfigRegionAttributes = 12;
+  }];
+
+  let parser = [{ return parseLaunchOp(parser, result); }];
+  let printer = [{ printLaunchOp(p, *this); }];
+  let verifier = [{ return ::verify(*this); }];
+}
+
 def GPU_ReturnOp : GPU_Op<"return", [Terminator]>, Arguments<(ins)>,
     Results<(outs)> {
   let summary = "Terminator for GPU launch regions.";
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index 38998b968ad5..87b170b6da89 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -46,7 +46,7 @@ bool GPUDialect::isKernel(Operation *op) {
 
 GPUDialect::GPUDialect(MLIRContext *context)
     : Dialect(getDialectName(), context) {
-  addOperations<LaunchOp, LaunchFuncOp,
+  addOperations<LaunchFuncOp,
 #define GET_OP_LIST
 #include "mlir/Dialect/GPU/GPUOps.cpp.inc"
                 >();
@@ -244,19 +244,20 @@ llvm::iterator_range<Block::args_iterator> LaunchOp::getKernelArguments() {
   return llvm::drop_begin(args, LaunchOp::kNumConfigRegionAttributes);
 }
 
-LogicalResult LaunchOp::verify() {
+LogicalResult verify(LaunchOp op) {
   // Kernel launch takes kNumConfigOperands leading operands for grid/block
   // sizes and transforms them into kNumConfigRegionAttributes region arguments
   // for block/thread identifiers and grid/block sizes.
-  if (!getBody().empty()) {
-    Block &entryBlock = getBody().front();
-    if (entryBlock.getNumArguments() != kNumConfigOperands + getNumOperands())
-      return emitOpError("unexpected number of region arguments");
+  if (!op.getBody().empty()) {
+    Block &entryBlock = op.getBody().front();
+    if (entryBlock.getNumArguments() !=
+        LaunchOp::kNumConfigOperands + op.getNumOperands())
+      return op.emitOpError("unexpected number of region arguments");
   }
 
   // Block terminators without successors are expected to exit the kernel region
   // and must be `gpu.launch`.
-  for (Block &block : getBody()) {
+  for (Block &block : op.getBody()) {
     if (block.empty())
       continue;
     if (block.back().getNumSuccessors() != 0)
@@ -265,8 +266,8 @@ LogicalResult LaunchOp::verify() {
       return block.back()
                  .emitError("expected 'gpu.terminator' or a terminator with "
                             "successors")
-                 .attachNote(getLoc())
-             << "in '" << getOperationName() << "' body region";
+                 .attachNote(op.getLoc())
+             << "in '" << LaunchOp::getOperationName() << "' body region";
     }
   }
 
@@ -285,27 +286,31 @@ static void printSizeAssignment(OpAsmPrinter &p, KernelDim3 size,
   p << *size.z << " = " << *operands[2] << ')';
 }
 
-void LaunchOp::print(OpAsmPrinter &p) {
-  SmallVector<Value *, 12> operandContainer(operand_begin(), operand_end());
+void printLaunchOp(OpAsmPrinter &p, LaunchOp op) {
+  SmallVector<Value *, 12> operandContainer(op.operand_begin(),
+                                            op.operand_end());
   ArrayRef<Value *> operands(operandContainer);
 
   // Print the launch configuration.
-  p << getOperationName() << ' ' << getBlocksKeyword();
-  printSizeAssignment(p, getGridSize(), operands.take_front(3), getBlockIds());
-  p << ' ' << getThreadsKeyword();
-  printSizeAssignment(p, getBlockSize(), operands.slice(3, 3), getThreadIds());
+  p << LaunchOp::getOperationName() << ' ' << op.getBlocksKeyword();
+  printSizeAssignment(p, op.getGridSize(), operands.take_front(3),
+                      op.getBlockIds());
+  p << ' ' << op.getThreadsKeyword();
+  printSizeAssignment(p, op.getBlockSize(), operands.slice(3, 3),
+                      op.getThreadIds());
 
   // From now on, the first kNumConfigOperands operands corresponding to grid
   // and block sizes are irrelevant, so we can drop them.
-  operands = operands.drop_front(kNumConfigOperands);
+  operands = operands.drop_front(LaunchOp::kNumConfigOperands);
 
   // Print the data argument remapping.
-  if (!getBody().empty() && !operands.empty()) {
-    p << ' ' << getArgsKeyword() << '(';
+  if (!op.getBody().empty() && !operands.empty()) {
+    p << ' ' << op.getArgsKeyword() << '(';
     for (unsigned i = 0, e = operands.size(); i < e; ++i) {
       if (i != 0)
         p << ", ";
-      p << *getBody().front().getArgument(kNumConfigRegionAttributes + i)
+      p << *op.getBody().front().getArgument(
+               LaunchOp::kNumConfigRegionAttributes + i)
         << " = " << *operands[i];
     }
     p << ") ";
@@ -321,8 +326,8 @@ void LaunchOp::print(OpAsmPrinter &p) {
     }
   }
 
-  p.printRegion(getBody(), /*printEntryBlockArgs=*/false);
-  p.printOptionalAttrDict(getAttrs());
+  p.printRegion(op.getBody(), /*printEntryBlockArgs=*/false);
+  p.printOptionalAttrDict(op.getAttrs());
 }
 
 // Parse the size assignment blocks for blocks and threads.  These have the form
@@ -361,10 +366,10 @@ parseSizeAssignment(OpAsmParser &parser,
 //                             (`args` ssa-reassignment `:` type-list)?
 //                             region attr-dict?
 // ssa-reassignment ::= `(` ssa-id `=` ssa-use (`,` ssa-id `=` ssa-use)* `)`
-ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) {
+ParseResult parseLaunchOp(OpAsmParser &parser, OperationState &result) {
   // Sizes of the grid and block.
-  SmallVector<OpAsmParser::OperandType, kNumConfigOperands> sizes(
-      kNumConfigOperands);
+  SmallVector<OpAsmParser::OperandType, LaunchOp::kNumConfigOperands> sizes(
+      LaunchOp::kNumConfigOperands);
   MutableArrayRef<OpAsmParser::OperandType> sizesRef(sizes);
 
   // Actual (data) operands passed to the kernel.
@@ -372,7 +377,7 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) {
 
   // Region arguments to be created.
   SmallVector<OpAsmParser::OperandType, 16> regionArgs(
-      kNumConfigRegionAttributes);
+      LaunchOp::kNumConfigRegionAttributes);
   MutableArrayRef<OpAsmParser::OperandType> regionArgsRef(regionArgs);
 
   // Parse the size assignment segments: the first segment assigns grid sizes
@@ -380,11 +385,11 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) {
   // sizes and defines values for thread identifiers.  In the region argument
   // list, identifiers precede sizes, and block-related values precede
   // thread-related values.
-  if (parser.parseKeyword(getBlocksKeyword().data()) ||
+  if (parser.parseKeyword(LaunchOp::getBlocksKeyword().data()) ||
       parseSizeAssignment(parser, sizesRef.take_front(3),
                           regionArgsRef.slice(6, 3),
                           regionArgsRef.slice(0, 3)) ||
-      parser.parseKeyword(getThreadsKeyword().data()) ||
+      parser.parseKeyword(LaunchOp::getThreadsKeyword().data()) ||
       parseSizeAssignment(parser, sizesRef.drop_front(3),
                           regionArgsRef.slice(9, 3),
                           regionArgsRef.slice(3, 3)) ||
@@ -397,7 +402,7 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) {
   // so is the trailing type list.  Parse it as well and use the parsed types
   // to resolve the operands passed to the kernel arguments.
   SmallVector<Type, 4> dataTypes;
-  if (!parser.parseOptionalKeyword(getArgsKeyword())) {
+  if (!parser.parseOptionalKeyword(LaunchOp::getArgsKeyword())) {
     llvm::SMLoc argsLoc = parser.getCurrentLocation();
 
     regionArgs.push_back({});
@@ -425,7 +430,8 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) {
   // block/thread identifiers and grid/block sizes, all of the `index` type.
   // Follow the actual kernel arguments.
   Type index = parser.getBuilder().getIndexType();
-  dataTypes.insert(dataTypes.begin(), kNumConfigRegionAttributes, index);
+  dataTypes.insert(dataTypes.begin(), LaunchOp::kNumConfigRegionAttributes,
+                   index);
   Region *body = result.addRegion();
   return failure(parser.parseRegion(*body, regionArgs, dataTypes) ||
                  parser.parseOptionalAttrDict(result.attributes));