forked from OSchip/llvm-project
Add gpu.barrier op to synchronize invocations of a local workgroup.
Adding gen table for rewrite patterns from GPU to NVVM dialect. Copy missing op documentation from GPUOps.td to GPU.md. PiperOrigin-RevId: 275419588
This commit is contained in:
parent
2acc220f17
commit
fe0ee32da5
|
@ -192,3 +192,55 @@ Example:
|
|||
```mlir {.mlir}
|
||||
%tIdX = "gpu.thread_id"() {dimension: "x"} : () -> (index)
|
||||
```
|
||||
|
||||
### `gpu.yield`
|
||||
|
||||
The `gpu.yield` op is a special terminator operation for blocks inside regions in gpu ops. It
|
||||
returns values to the immediately enclosing gpu op.
|
||||
|
||||
Example:
|
||||
|
||||
```mlir {.mlir}
|
||||
gpu.yield %f0, %f1 : f32, f32
|
||||
```
|
||||
|
||||
|
||||
### `gpu.all_reduce`
|
||||
|
||||
The "all_reduce" op reduces the value of every work item across a local
|
||||
workgroup. The result is equal for all work items of a workgroup.
|
||||
|
||||
For example, both
|
||||
|
||||
```mlir {.mlir}
|
||||
%1 = "gpu.all_reduce"(%0) ({}) { op = "add" } : (f32) -> (f32)
|
||||
%2 = "gpu.all_reduce"(%0) ({
|
||||
^bb(%lhs : f32, %rhs : f32):
|
||||
%sum = addf %lhs, %rhs : f32
|
||||
"gpu.yield"(%sum) : (f32) -> ()
|
||||
}) : (f32) -> (f32)
|
||||
```
|
||||
compute the sum of each work item's %0 value. The first version specifies
|
||||
the accumulation as operation, whereas the second version specifies the
|
||||
accumulation as code region. The accumulation operation must either be
|
||||
`add` or `mul`.
|
||||
|
||||
Either none or all work items of a workgroup need to execute this op
|
||||
in convergence.
|
||||
|
||||
### `gpu.barrier`
|
||||
|
||||
The "barrier" op synchronizes all work items of a workgroup. It is used
|
||||
to coordinate communication between the work items of the workgroup.
|
||||
|
||||
```mlir {.mlir}
|
||||
gpu.barrier
|
||||
```
|
||||
waits until all work items in the workgroup have reached this point
|
||||
and all memory accesses made by these work items prior to the op are
|
||||
visible to all work items in the workgroup. Data hazards between work items
|
||||
accessing the same memory can be avoided by synchronizing work items
|
||||
in-between these accesses.
|
||||
|
||||
Either none or all work items of a workgroup need to execute this op
|
||||
in convergence.
|
||||
|
|
|
@ -68,7 +68,8 @@ def gpu_Yield : GPU_Op<"yield", [Terminator]>,
|
|||
|
||||
Example:
|
||||
|
||||
gpu.yield %f0, %f1 : f32, f32
|
||||
```gpu.yield %f0, %f1 : f32, f32
|
||||
```
|
||||
}];
|
||||
}
|
||||
|
||||
|
@ -79,8 +80,8 @@ def gpu_AllReduce : GPU_Op<"all_reduce",
|
|||
Results<(outs AnyType)> {
|
||||
let summary = "Reduce values among workgroup.";
|
||||
let description = [{
|
||||
The "all_reduce" op reduces the value of every invocation across a local
|
||||
workgroup. The result is equal for all invocations of a local workgroup.
|
||||
The "all_reduce" op reduces the value of every work item across a local
|
||||
workgroup. The result is equal for all work items of a workgroup.
|
||||
|
||||
For example, both
|
||||
```
|
||||
|
@ -91,16 +92,38 @@ def gpu_AllReduce : GPU_Op<"all_reduce",
|
|||
"gpu.yield"(%sum) : (f32) -> ()
|
||||
}) : (f32) -> (f32)
|
||||
```
|
||||
compute the sum of each invocation's %0 value. The first version specifies
|
||||
compute the sum of each work item's %0 value. The first version specifies
|
||||
the accumulation as operation, whereas the second version specifies the
|
||||
accumulation as code region. The accumulation operation must either be
|
||||
`add` or `mul`.
|
||||
|
||||
Either none or all invocations of a local workgroup need to execute this op
|
||||
Either none or all work items of a workgroup need to execute this op
|
||||
in convergence.
|
||||
}];
|
||||
let regions = (region AnyRegion:$body);
|
||||
let verifier = [{ return ::verifyAllReduce(*this); }];
|
||||
}
|
||||
|
||||
def gpu_Barrier : GPU_Op<"barrier"> {
|
||||
let summary = "Synchronizes all work items of a workgroup.";
|
||||
let description = [{
|
||||
The "barrier" op synchronizes all work items of a workgroup. It is used
|
||||
to coordinate communication between the work items of the workgroup.
|
||||
|
||||
```
|
||||
gpu.barrier
|
||||
```
|
||||
waits until all work items in the workgroup have reached this point
|
||||
and all memory accesses made by these work items prior to the op are
|
||||
visible to all work items in the workgroup. Data hazards between work items
|
||||
accessing the same memory can be avoided by synchronizing work items
|
||||
in-between these accesses.
|
||||
|
||||
Either none or all work items of a workgroup need to execute this op
|
||||
in convergence.
|
||||
}];
|
||||
let parser = [{ return success(); }];
|
||||
let printer = [{ p << getOperationName(); }];
|
||||
}
|
||||
|
||||
#endif // GPU_OPS
|
||||
|
|
|
@ -1,6 +1,14 @@
|
|||
set(LLVM_TARGET_DEFINITIONS GPUToNVVM.td)
|
||||
mlir_tablegen(GPUToNVVM.cpp.inc -gen-rewriters)
|
||||
add_public_tablegen_target(MLIRGPUToNVVMIncGen)
|
||||
|
||||
add_llvm_library(MLIRGPUtoNVVMTransforms
|
||||
LowerGpuOpsToNVVMOps.cpp
|
||||
)
|
||||
|
||||
add_dependencies(MLIRGPUtoNVVMTransforms
|
||||
MLIRGPUToNVVMIncGen)
|
||||
|
||||
target_link_libraries(MLIRGPUtoNVVMTransforms
|
||||
LLVMSupport
|
||||
MLIRGPU
|
||||
|
|
|
@ -0,0 +1,38 @@
|
|||
//==-- GPUToNVVM.td - GPU Ops to NVVM Patterns ---------------*- tablegen -*==//
|
||||
//
|
||||
// Copyright 2019 The MLIR Authors.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
// =============================================================================
|
||||
//
|
||||
// Defines Patterns to lower GPU ops to NVVM.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifdef MLIR_CONVERSION_GPUTONVVM_TD
|
||||
#else
|
||||
#define MLIR_CONVERSION_GPUTONVVM_TD
|
||||
|
||||
#ifdef GPU_OPS
|
||||
#else
|
||||
include "mlir/Dialect/GPU/GPUOps.td"
|
||||
#endif // GPU_OPS
|
||||
|
||||
#ifdef NVVMIR_OPS
|
||||
#else
|
||||
include "mlir/Dialect/LLVMIR/NVVMOps.td"
|
||||
#endif // NVVMIR_OPS
|
||||
|
||||
def : Pat<(gpu_Barrier), (NVVM_Barrier0Op)>;
|
||||
|
||||
#endif // MLIR_CONVERSION_GPUTONVVM_TD
|
|
@ -447,6 +447,9 @@ private:
|
|||
static constexpr int kWarpSize = 32;
|
||||
};
|
||||
|
||||
/// Import the GPU Ops to NVVM Patterns.
|
||||
#include "GPUToNVVM.cpp.inc"
|
||||
|
||||
/// A pass that replaces all occurrences of GPU device operations with their
|
||||
/// corresponding NVVM equivalent.
|
||||
///
|
||||
|
@ -462,6 +465,7 @@ public:
|
|||
OwningRewritePatternList patterns;
|
||||
LLVMTypeConverter converter(m.getContext());
|
||||
populateStdToLLVMConversionPatterns(converter, patterns);
|
||||
populateWithGenerated(&getContext(), &patterns);
|
||||
patterns.insert<
|
||||
GPUIndexIntrinsicOpLowering<gpu::ThreadId, NVVM::ThreadIdXOp,
|
||||
NVVM::ThreadIdYOp, NVVM::ThreadIdZOp>,
|
||||
|
|
|
@ -68,7 +68,18 @@ module attributes {gpu.kernel_module} {
|
|||
%xor = xor %lhs, %rhs : i32
|
||||
"gpu.yield"(%xor) : (i32) -> ()
|
||||
}) : (i32) -> (i32)
|
||||
|
||||
std.return
|
||||
}
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
module attributes {gpu.kernel_module} {
|
||||
// CHECK-LABEL: func @gpu_sync()
|
||||
func @gpu_sync()
|
||||
attributes { gpu.kernel } {
|
||||
// CHECK: nvvm.barrier0
|
||||
gpu.barrier
|
||||
std.return
|
||||
}
|
||||
}
|
||||
|
|
|
@ -82,6 +82,8 @@ module attributes {gpu.container_module} {
|
|||
%one = constant 1.0 : f32
|
||||
%sum = "gpu.all_reduce"(%one) ({}) {op = "add"} : (f32) -> (f32)
|
||||
|
||||
"gpu.barrier"() : () -> ()
|
||||
|
||||
"some_op"(%bIdX, %tIdX) : (index, index) -> ()
|
||||
%42 = load %arg1[%bIdX] : memref<?xf32, 1>
|
||||
return
|
||||
|
|
Loading…
Reference in New Issue