forked from OSchip/llvm-project
Add gpu.barrier op to synchronize invocations of a local workgroup.
Adding gen table for rewrite patterns from GPU to NVVM dialect. Copy missing op documentation from GPUOps.td to GPU.md. PiperOrigin-RevId: 275419588
This commit is contained in:
parent
2acc220f17
commit
fe0ee32da5
|
@ -192,3 +192,55 @@ Example:
|
|||
```mlir {.mlir}
|
||||
%tIdX = "gpu.thread_id"() {dimension: "x"} : () -> (index)
|
||||
```
|
||||
|
||||
### `gpu.yield`
|
||||
|
||||
The `gpu.yield` op is a special terminator operation for blocks inside regions in gpu ops. It
|
||||
returns values to the immediately enclosing gpu op.
|
||||
|
||||
Example:
|
||||
|
||||
```mlir {.mlir}
|
||||
gpu.yield %f0, %f1 : f32, f32
|
||||
```
|
||||
|
||||
|
||||
### `gpu.all_reduce`
|
||||
|
||||
The "all_reduce" op reduces the value of every work item across a local
|
||||
workgroup. The result is equal for all work items of a workgroup.
|
||||
|
||||
For example, both
|
||||
|
||||
```mlir {.mlir}
|
||||
%1 = "gpu.all_reduce"(%0) ({}) { op = "add" } : (f32) -> (f32)
|
||||
%2 = "gpu.all_reduce"(%0) ({
|
||||
^bb(%lhs : f32, %rhs : f32):
|
||||
%sum = addf %lhs, %rhs : f32
|
||||
"gpu.yield"(%sum) : (f32) -> ()
|
||||
}) : (f32) -> (f32)
|
||||
```
|
||||
compute the sum of each work item's %0 value. The first version specifies
|
||||
the accumulation as operation, whereas the second version specifies the
|
||||
accumulation as code region. The accumulation operation must either be
|
||||
`add` or `mul`.
|
||||
|
||||
Either none or all work items of a workgroup need to execute this op
|
||||
in convergence.
|
||||
|
||||
### `gpu.barrier`
|
||||
|
||||
The "barrier" op synchronizes all work items of a workgroup. It is used
|
||||
to coordinate communication between the work items of the workgroup.
|
||||
|
||||
```mlir {.mlir}
|
||||
gpu.barrier
|
||||
```
|
||||
waits until all work items in the workgroup have reached this point
|
||||
and all memory accesses made by these work items prior to the op are
|
||||
visible to all work items in the workgroup. Data hazards between work items
|
||||
accessing the same memory can be avoided by synchronizing work items
|
||||
in-between these accesses.
|
||||
|
||||
Either none or all work items of a workgroup need to execute this op
|
||||
in convergence.
|
||||
|
|
|
@ -68,7 +68,8 @@ def gpu_Yield : GPU_Op<"yield", [Terminator]>,
|
|||
|
||||
Example:
|
||||
|
||||
gpu.yield %f0, %f1 : f32, f32
|
||||
```gpu.yield %f0, %f1 : f32, f32
|
||||
```
|
||||
}];
|
||||
}
|
||||
|
||||
|
@ -79,8 +80,8 @@ def gpu_AllReduce : GPU_Op<"all_reduce",
|
|||
Results<(outs AnyType)> {
|
||||
let summary = "Reduce values among workgroup.";
|
||||
let description = [{
|
||||
The "all_reduce" op reduces the value of every invocation across a local
|
||||
workgroup. The result is equal for all invocations of a local workgroup.
|
||||
The "all_reduce" op reduces the value of every work item across a local
|
||||
workgroup. The result is equal for all work items of a workgroup.
|
||||
|
||||
For example, both
|
||||
```
|
||||
|
@ -91,16 +92,38 @@ def gpu_AllReduce : GPU_Op<"all_reduce",
|
|||
"gpu.yield"(%sum) : (f32) -> ()
|
||||
}) : (f32) -> (f32)
|
||||
```
|
||||
compute the sum of each invocation's %0 value. The first version specifies
|
||||
compute the sum of each work item's %0 value. The first version specifies
|
||||
the accumulation as operation, whereas the second version specifies the
|
||||
accumulation as code region. The accumulation operation must either be
|
||||
`add` or `mul`.
|
||||
|
||||
Either none or all invocations of a local workgroup need to execute this op
|
||||
Either none or all work items of a workgroup need to execute this op
|
||||
in convergence.
|
||||
}];
|
||||
let regions = (region AnyRegion:$body);
|
||||
let verifier = [{ return ::verifyAllReduce(*this); }];
|
||||
}
|
||||
|
||||
def gpu_Barrier : GPU_Op<"barrier"> {
|
||||
let summary = "Synchronizes all work items of a workgroup.";
|
||||
let description = [{
|
||||
The "barrier" op synchronizes all work items of a workgroup. It is used
|
||||
to coordinate communication between the work items of the workgroup.
|
||||
|
||||
```
|
||||
gpu.barrier
|
||||
```
|
||||
waits until all work items in the workgroup have reached this point
|
||||
and all memory accesses made by these work items prior to the op are
|
||||
visible to all work items in the workgroup. Data hazards between work items
|
||||
accessing the same memory can be avoided by synchronizing work items
|
||||
in-between these accesses.
|
||||
|
||||
Either none or all work items of a workgroup need to execute this op
|
||||
in convergence.
|
||||
}];
|
||||
let parser = [{ return success(); }];
|
||||
let printer = [{ p << getOperationName(); }];
|
||||
}
|
||||
|
||||
#endif // GPU_OPS
|
||||
|
|
|
@ -1,6 +1,14 @@
|
|||
set(LLVM_TARGET_DEFINITIONS GPUToNVVM.td)
|
||||
mlir_tablegen(GPUToNVVM.cpp.inc -gen-rewriters)
|
||||
add_public_tablegen_target(MLIRGPUToNVVMIncGen)
|
||||
|
||||
add_llvm_library(MLIRGPUtoNVVMTransforms
|
||||
LowerGpuOpsToNVVMOps.cpp
|
||||
)
|
||||
|
||||
add_dependencies(MLIRGPUtoNVVMTransforms
|
||||
MLIRGPUToNVVMIncGen)
|
||||
|
||||
target_link_libraries(MLIRGPUtoNVVMTransforms
|
||||
LLVMSupport
|
||||
MLIRGPU
|
||||
|
|
|
@ -0,0 +1,38 @@
|
|||
//==-- GPUToNVVM.td - GPU Ops to NVVM Patterns ---------------*- tablegen -*==//
|
||||
//
|
||||
// Copyright 2019 The MLIR Authors.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
// =============================================================================
|
||||
//
|
||||
// Defines Patterns to lower GPU ops to NVVM.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifdef MLIR_CONVERSION_GPUTONVVM_TD
|
||||
#else
|
||||
#define MLIR_CONVERSION_GPUTONVVM_TD
|
||||
|
||||
#ifdef GPU_OPS
|
||||
#else
|
||||
include "mlir/Dialect/GPU/GPUOps.td"
|
||||
#endif // GPU_OPS
|
||||
|
||||
#ifdef NVVMIR_OPS
|
||||
#else
|
||||
include "mlir/Dialect/LLVMIR/NVVMOps.td"
|
||||
#endif // NVVMIR_OPS
|
||||
|
||||
def : Pat<(gpu_Barrier), (NVVM_Barrier0Op)>;
|
||||
|
||||
#endif // MLIR_CONVERSION_GPUTONVVM_TD
|
|
@ -447,6 +447,9 @@ private:
|
|||
static constexpr int kWarpSize = 32;
|
||||
};
|
||||
|
||||
/// Import the GPU Ops to NVVM Patterns.
|
||||
#include "GPUToNVVM.cpp.inc"
|
||||
|
||||
/// A pass that replaces all occurrences of GPU device operations with their
|
||||
/// corresponding NVVM equivalent.
|
||||
///
|
||||
|
@ -462,6 +465,7 @@ public:
|
|||
OwningRewritePatternList patterns;
|
||||
LLVMTypeConverter converter(m.getContext());
|
||||
populateStdToLLVMConversionPatterns(converter, patterns);
|
||||
populateWithGenerated(&getContext(), &patterns);
|
||||
patterns.insert<
|
||||
GPUIndexIntrinsicOpLowering<gpu::ThreadId, NVVM::ThreadIdXOp,
|
||||
NVVM::ThreadIdYOp, NVVM::ThreadIdZOp>,
|
||||
|
|
|
@ -68,7 +68,18 @@ module attributes {gpu.kernel_module} {
|
|||
%xor = xor %lhs, %rhs : i32
|
||||
"gpu.yield"(%xor) : (i32) -> ()
|
||||
}) : (i32) -> (i32)
|
||||
|
||||
std.return
|
||||
}
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
module attributes {gpu.kernel_module} {
|
||||
// CHECK-LABEL: func @gpu_sync()
|
||||
func @gpu_sync()
|
||||
attributes { gpu.kernel } {
|
||||
// CHECK: nvvm.barrier0
|
||||
gpu.barrier
|
||||
std.return
|
||||
}
|
||||
}
|
||||
|
|
|
@ -82,6 +82,8 @@ module attributes {gpu.container_module} {
|
|||
%one = constant 1.0 : f32
|
||||
%sum = "gpu.all_reduce"(%one) ({}) {op = "add"} : (f32) -> (f32)
|
||||
|
||||
"gpu.barrier"() : () -> ()
|
||||
|
||||
"some_op"(%bIdX, %tIdX) : (index, index) -> ()
|
||||
%42 = load %arg1[%bIdX] : memref<?xf32, 1>
|
||||
return
|
||||
|
|
Loading…
Reference in New Issue