Add gpu.barrier op to synchronize invocations of a local workgroup.

Add tablegen rewrite patterns from the GPU dialect to the NVVM dialect.

Copy missing op documentation from GPUOps.td to GPU.md.

PiperOrigin-RevId: 275419588
Christian Sigg 2019-10-18 00:30:14 -07:00 committed by A. Unique TensorFlower
parent 2acc220f17
commit fe0ee32da5
7 changed files with 144 additions and 6 deletions

View File

@@ -192,3 +192,55 @@ Example:
```mlir {.mlir}
%tIdX = "gpu.thread_id"() {dimension: "x"} : () -> (index)
```
### `gpu.yield`
`gpu.yield` is a special terminator operation for blocks inside regions in gpu ops. It
returns values to the immediately enclosing gpu op.
Example:
```mlir {.mlir}
gpu.yield %f0, %f1 : f32, f32
```
### `gpu.all_reduce`
The "all_reduce" op reduces the value of every work item across a local
workgroup. The result is equal for all work items of a workgroup.
For example, both
```mlir {.mlir}
%1 = "gpu.all_reduce"(%0) ({}) { op = "add" } : (f32) -> (f32)
%2 = "gpu.all_reduce"(%0) ({
^bb(%lhs : f32, %rhs : f32):
%sum = addf %lhs, %rhs : f32
"gpu.yield"(%sum) : (f32) -> ()
}) : (f32) -> (f32)
```
compute the sum of each work item's %0 value. The first version specifies
the accumulation via the `op` attribute, whereas the second version specifies
the accumulation via a code region. The accumulation operation must be either
`add` or `mul`.
Either none or all work items of a workgroup need to execute this op
in convergence.
### `gpu.barrier`
The "barrier" op synchronizes all work items of a workgroup. It is used
to coordinate communication between the work items of the workgroup.
```mlir {.mlir}
gpu.barrier
```
waits until all work items in the workgroup have reached this point
and all memory accesses made by these work items prior to the op are
visible to all work items in the workgroup. Data hazards between work items
accessing the same memory can be avoided by synchronizing work items
in-between these accesses.
Either none or all work items of a workgroup need to execute this op
in convergence.
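The following is a minimal illustrative sketch, not part of this change: the
buffer `%shared`, the values, and the index names are hypothetical, and memory
space 3 is assumed to denote workgroup-shared memory. It shows a barrier
separating a write and a read of the same buffer so that the read observes the
other work items' writes:
```mlir {.mlir}
// Each work item writes its own slot of a workgroup-shared buffer ...
store %val, %shared[%tIdX] : memref<32xf32, 3>
// ... then waits until every work item's write is visible ...
gpu.barrier
// ... before reading a slot written by another work item.
%other = load %shared[%neighborIdX] : memref<32xf32, 3>
```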

View File

@@ -68,7 +68,8 @@ def gpu_Yield : GPU_Op<"yield", [Terminator]>,
Example:
gpu.yield %f0, %f1 : f32, f32
```gpu.yield %f0, %f1 : f32, f32
```
}];
}
@@ -79,8 +80,8 @@ def gpu_AllReduce : GPU_Op<"all_reduce",
Results<(outs AnyType)> {
let summary = "Reduce values among workgroup.";
let description = [{
The "all_reduce" op reduces the value of every invocation across a local
workgroup. The result is equal for all invocations of a local workgroup.
The "all_reduce" op reduces the value of every work item across a local
workgroup. The result is equal for all work items of a workgroup.
For example, both
```
@@ -91,16 +92,38 @@ def gpu_AllReduce : GPU_Op<"all_reduce",
"gpu.yield"(%sum) : (f32) -> ()
}) : (f32) -> (f32)
```
compute the sum of each invocation's %0 value. The first version specifies
compute the sum of each work item's %0 value. The first version specifies
the accumulation via the `op` attribute, whereas the second version specifies
the accumulation via a code region. The accumulation operation must be either
`add` or `mul`.
Either none or all invocations of a local workgroup need to execute this op
Either none or all work items of a workgroup need to execute this op
in convergence.
}];
let regions = (region AnyRegion:$body);
let verifier = [{ return ::verifyAllReduce(*this); }];
}
def gpu_Barrier : GPU_Op<"barrier"> {
let summary = "Synchronizes all work items of a workgroup.";
let description = [{
The "barrier" op synchronizes all work items of a workgroup. It is used
to coordinate communication between the work items of the workgroup.
```
gpu.barrier
```
waits until all work items in the workgroup have reached this point
and all memory accesses made by these work items prior to the op are
visible to all work items in the workgroup. Data hazards between work items
accessing the same memory can be avoided by synchronizing work items
in-between these accesses.
Either none or all work items of a workgroup need to execute this op
in convergence.
}];
let parser = [{ return success(); }];
let printer = [{ p << getOperationName(); }];
}
#endif // GPU_OPS

View File

@@ -1,6 +1,14 @@
set(LLVM_TARGET_DEFINITIONS GPUToNVVM.td)
mlir_tablegen(GPUToNVVM.cpp.inc -gen-rewriters)
add_public_tablegen_target(MLIRGPUToNVVMIncGen)
add_llvm_library(MLIRGPUtoNVVMTransforms
LowerGpuOpsToNVVMOps.cpp
)
add_dependencies(MLIRGPUtoNVVMTransforms
MLIRGPUToNVVMIncGen)
target_link_libraries(MLIRGPUtoNVVMTransforms
LLVMSupport
MLIRGPU

View File

@@ -0,0 +1,38 @@
//==-- GPUToNVVM.td - GPU Ops to NVVM Patterns ---------------*- tablegen -*==//
//
// Copyright 2019 The MLIR Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
//
// Defines Patterns to lower GPU ops to NVVM.
//
//===----------------------------------------------------------------------===//
#ifdef MLIR_CONVERSION_GPUTONVVM_TD
#else
#define MLIR_CONVERSION_GPUTONVVM_TD
#ifdef GPU_OPS
#else
include "mlir/Dialect/GPU/GPUOps.td"
#endif // GPU_OPS
#ifdef NVVMIR_OPS
#else
include "mlir/Dialect/LLVMIR/NVVMOps.td"
#endif // NVVMIR_OPS
def : Pat<(gpu_Barrier), (NVVM_Barrier0Op)>;
#endif // MLIR_CONVERSION_GPUTONVVM_TD
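As a rough sketch of the effect of the single pattern above (only the barrier
op of a kernel body is shown; the surrounding function is omitted), the
lowering rewrites every `gpu.barrier` into the corresponding NVVM barrier:
```mlir
// Before lowering:
gpu.barrier
// After applying the generated pattern:
nvvm.barrier0
```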

View File

@@ -447,6 +447,9 @@ private:
static constexpr int kWarpSize = 32;
};
/// Import the GPU Ops to NVVM Patterns.
#include "GPUToNVVM.cpp.inc"
/// A pass that replaces all occurrences of GPU device operations with their
/// corresponding NVVM equivalent.
///
@@ -462,6 +465,7 @@ public:
OwningRewritePatternList patterns;
LLVMTypeConverter converter(m.getContext());
populateStdToLLVMConversionPatterns(converter, patterns);
populateWithGenerated(&getContext(), &patterns);
patterns.insert<
GPUIndexIntrinsicOpLowering<gpu::ThreadId, NVVM::ThreadIdXOp,
NVVM::ThreadIdYOp, NVVM::ThreadIdZOp>,

View File

@@ -68,7 +68,18 @@ module attributes {gpu.kernel_module} {
%xor = xor %lhs, %rhs : i32
"gpu.yield"(%xor) : (i32) -> ()
}) : (i32) -> (i32)
std.return
}
}
// -----
module attributes {gpu.kernel_module} {
// CHECK-LABEL: func @gpu_sync()
func @gpu_sync()
attributes { gpu.kernel } {
// CHECK: nvvm.barrier0
gpu.barrier
std.return
}
}

View File

@@ -82,6 +82,8 @@ module attributes {gpu.container_module} {
%one = constant 1.0 : f32
%sum = "gpu.all_reduce"(%one) ({}) {op = "add"} : (f32) -> (f32)
"gpu.barrier"() : () -> ()
"some_op"(%bIdX, %tIdX) : (index, index) -> ()
%42 = load %arg1[%bIdX] : memref<?xf32, 1>
return