llvm-project/openmp/libomptarget/DeviceRTL/include/Synchronization.h

//===- Synchronization.h - OpenMP synchronization utilities ------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
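//
// Declarations of the thread synchronization, memory fence, and atomic
// utilities used by the OpenMP device runtime.
//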
//===----------------------------------------------------------------------===//

#ifndef OMPTARGET_DEVICERTL_SYNCHRONIZATION_H
#define OMPTARGET_DEVICERTL_SYNCHRONIZATION_H

#include "Types.h"

/// Namespace for the internal symbols of the device runtime. This header
/// contributes three nested namespaces to it: `synchronize` (barriers),
/// `fence` (memory fences), and `atomic` (atomic memory operations). Only
/// declarations are provided here; the definitions live elsewhere.
namespace _OMP {

/// Thread synchronization primitives.
namespace synchronize {

/// Initialize the synchronization machinery. Must be called by all threads.
///
/// \param IsSPMD presumably tells the runtime whether the kernel executes in
///               SPMD mode -- TODO confirm against the definition.
void init(bool IsSPMD);

/// Synchronize all threads in a warp identified by \p Mask.
///
/// NOTE(review): the encoding of \p Mask (presumably one bit per participating
/// lane) is determined by `LaneMaskTy`, which is not declared in this header --
/// confirm against its declaration.
void warp(LaneMaskTy Mask);

/// Synchronize all threads in a block. Unlike `threadsAligned()`, the threads
/// are allowed to reach different instances of this call.
void threads();

/// Synchronizing threads is allowed even if they all hit different instances of
/// `synchronize::threads()`. However, `synchronize::threadsAligned()` is more
/// restrictive in that it requires all threads to hit the same instance. The
/// noinline is removed by the openmp-opt pass and helps to preserve the
/// information till then.
///
/// The declaration below is wrapped in an `omp begin/end assumes` region so
/// that it carries the `ext_aligned_barrier` assumption.
///{
#pragma omp begin assumes ext_aligned_barrier

/// Synchronize all threads in a block. All threads must reach the same
/// instruction (hence all threads in the block are "aligned").
__attribute__((noinline)) void threadsAligned();

#pragma omp end assumes
///}

} // namespace synchronize

/// Memory fences at different scopes. \p Ordering is passed as a plain `int`;
/// presumably it takes the compiler's `__ATOMIC_*` memory order constants --
/// TODO confirm against the definitions.
namespace fence {

/// Memory fence with \p Ordering semantics for the team.
void team(int Ordering);

/// Memory fence with \p Ordering semantics for the contention group.
void kernel(int Ordering);

/// Memory fence with \p Ordering semantics for the system.
void system(int Ordering);

} // namespace fence

/// Atomic memory operations. As with the fences, \p Ordering is a plain `int`
/// that presumably holds an `__ATOMIC_*` memory order constant -- TODO confirm.
namespace atomic {

/// Atomically load the 32-bit value at \p Addr with \p Ordering semantics.
///
/// \returns the loaded value.
uint32_t load(uint32_t *Addr, int Ordering);

/// Atomically store the 32-bit value \p V to \p Addr with \p Ordering
/// semantics.
void store(uint32_t *Addr, uint32_t V, int Ordering);

/// Atomically increment \p *Addr and wrap at \p V with \p Ordering semantics.
///
/// NOTE(review): the exact wrap condition and the returned value (presumably
/// the value of \p *Addr before the increment) are not visible from this
/// declaration -- confirm against the definition.
uint32_t inc(uint32_t *Addr, uint32_t V, int Ordering);

/// Atomically add \p V to \p *Addr with \p Ordering semantics (32-bit
/// overload).
///
/// \returns presumably the value of \p *Addr before the addition -- TODO
///          confirm against the definition.
uint32_t add(uint32_t *Addr, uint32_t V, int Ordering);

/// Atomically add \p V to \p *Addr with \p Ordering semantics (64-bit
/// overload).
///
/// \returns presumably the value of \p *Addr before the addition -- TODO
///          confirm against the definition.
uint64_t add(uint64_t *Addr, uint64_t V, int Ordering);

} // namespace atomic

} // namespace _OMP

#endif
[OpenMP] Prototype opt-in new GPU device RTL The "old" OpenMP GPU device runtime (D14254) has served us well for many years but modernizing it has caused some pain recently. This patch introduces an alternative which is mostly written from scratch embracing OpenMP 5.X, C++, LLVM coding style (where applicable), and conceptual interfaces. This new runtime is opt-in through a clang flag (D106793). The new runtime is currently only build for nvptx and has "-new" in its name. The design is tailored towards middle-end optimizations rather than front-end code generation choices, a trend we already started in the old runtime a while back. In contrast to the old one, state is organized in a simple manner rather than a "smart" one. While this can induce costs it helps optimizations. Our expectation is that the majority of codes can be optimized and a "simple" design is therefore preferable. The new runtime does also avoid users to pay for things they do not use, especially wrt. memory. The unlikely case of nested parallelism is supported but costly to make the more likely case use less resources. The worksharing and reduction implementation have been taken from the old runtime and will be rewritten in the future if necessary. Documentation and debug features are still mostly missing and will be added over time. All external symbols start with `__kmpc` for legacy reasons but should be renamed once we switch over to a single runtime. All internal symbols are placed in appropriate namespaces (anonymous or `_OMP`) to avoid name clashes with user symbols. Differential Revision: https://reviews.llvm.org/D106803 2021-07-26 02:26:44 +08:00			`//===- Synchronization.h - OpenMP synchronization utilities ------- C++ -*-===//`
			`//`
			`// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.`
			`// See https://llvm.org/LICENSE.txt for license information.`
			`// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception`
			`//`
			`//===----------------------------------------------------------------------===//`
			`//`
			`//`
			`//===----------------------------------------------------------------------===//`

			`#ifndef OMPTARGET_DEVICERTL_SYNCHRONIZATION_H`
			`#define OMPTARGET_DEVICERTL_SYNCHRONIZATION_H`

			`#include "Types.h"`

			`namespace _OMP {`

			`namespace synchronize {`

			`/// Initialize the synchronization machinery. Must be called by all threads.`
			`void init(bool IsSPMD);`

			`/// Synchronize all threads in a warp identified by \p Mask.`
			`void warp(LaneMaskTy Mask);`

			`/// Synchronize all threads in a block.`
			`void threads();`

[OpenMP] Introduce aligned synchronization into the new device RT We will later use the fact that a barrier is aligned to reason about thread divergence. For now we introduce the assumption and some more documentation. Reviewed By: tianshilei1992 Differential Revision: https://reviews.llvm.org/D112153 2021-10-21 00:28:18 +08:00			`/// Synchronizing threads is allowed even if they all hit different instances of`
			/// `synchronize::threads()`. However, `synchronize::threadsAligned()` is more
			`/// restrictive in that it requires all threads to hit the same instance. The`
			`/// noinline is removed by the openmp-opt pass and helps to preserve the`
			`/// information till then.`
			`///{`
			`#pragma omp begin assumes ext_aligned_barrier`

			`/// Synchronize all threads in a block; all threads must reach the same`
			`/// instruction (hence all threads in the block are "aligned").`
			`__attribute__((noinline)) void threadsAligned();`

			`#pragma omp end assumes`
			`///}`

[OpenMP] Prototype opt-in new GPU device RTL The "old" OpenMP GPU device runtime (D14254) has served us well for many years but modernizing it has caused some pain recently. This patch introduces an alternative which is mostly written from scratch embracing OpenMP 5.X, C++, LLVM coding style (where applicable), and conceptual interfaces. This new runtime is opt-in through a clang flag (D106793). The new runtime is currently only build for nvptx and has "-new" in its name. The design is tailored towards middle-end optimizations rather than front-end code generation choices, a trend we already started in the old runtime a while back. In contrast to the old one, state is organized in a simple manner rather than a "smart" one. While this can induce costs it helps optimizations. Our expectation is that the majority of codes can be optimized and a "simple" design is therefore preferable. The new runtime does also avoid users to pay for things they do not use, especially wrt. memory. The unlikely case of nested parallelism is supported but costly to make the more likely case use less resources. The worksharing and reduction implementation have been taken from the old runtime and will be rewritten in the future if necessary. Documentation and debug features are still mostly missing and will be added over time. All external symbols start with `__kmpc` for legacy reasons but should be renamed once we switch over to a single runtime. All internal symbols are placed in appropriate namespaces (anonymous or `_OMP`) to avoid name clashes with user symbols. Differential Revision: https://reviews.llvm.org/D106803 2021-07-26 02:26:44 +08:00			`} // namespace synchronize`

			`namespace fence {`

			`/// Memory fence with \p Ordering semantics for the team.`
			`void team(int Ordering);`

			`/// Memory fence with \p Ordering semantics for the contention group.`
			`void kernel(int Ordering);`

			`/// Memory fence with \p Ordering semantics for the system.`
			`void system(int Ordering);`

			`} // namespace fence`

			`namespace atomic {`

[libomptarget] Refactor DeviceRTL prior to AMDGPU bringup Subset of D111993. Fix typos, rename read to load. Reviewed By: tianshilei1992 Differential Revision: https://reviews.llvm.org/D111999 2021-10-19 15:05:05 +08:00			`/// Atomically load \p Addr with \p Ordering semantics.`
			`uint32_t load(uint32_t *Addr, int Ordering);`
[OpenMP] Prototype opt-in new GPU device RTL The "old" OpenMP GPU device runtime (D14254) has served us well for many years but modernizing it has caused some pain recently. This patch introduces an alternative which is mostly written from scratch embracing OpenMP 5.X, C++, LLVM coding style (where applicable), and conceptual interfaces. This new runtime is opt-in through a clang flag (D106793). The new runtime is currently only build for nvptx and has "-new" in its name. The design is tailored towards middle-end optimizations rather than front-end code generation choices, a trend we already started in the old runtime a while back. In contrast to the old one, state is organized in a simple manner rather than a "smart" one. While this can induce costs it helps optimizations. Our expectation is that the majority of codes can be optimized and a "simple" design is therefore preferable. The new runtime does also avoid users to pay for things they do not use, especially wrt. memory. The unlikely case of nested parallelism is supported but costly to make the more likely case use less resources. The worksharing and reduction implementation have been taken from the old runtime and will be rewritten in the future if necessary. Documentation and debug features are still mostly missing and will be added over time. All external symbols start with `__kmpc` for legacy reasons but should be renamed once we switch over to a single runtime. All internal symbols are placed in appropriate namespaces (anonymous or `_OMP`) to avoid name clashes with user symbols. Differential Revision: https://reviews.llvm.org/D106803 2021-07-26 02:26:44 +08:00
			`/// Atomically store \p V to \p Addr with \p Ordering semantics.`
[libomptarget] Refactor DeviceRTL prior to AMDGPU bringup Subset of D111993. Fix typos, rename read to load. Reviewed By: tianshilei1992 Differential Revision: https://reviews.llvm.org/D111999 2021-10-19 15:05:05 +08:00			`void store(uint32_t *Addr, uint32_t V, int Ordering);`
[OpenMP] Prototype opt-in new GPU device RTL The "old" OpenMP GPU device runtime (D14254) has served us well for many years but modernizing it has caused some pain recently. This patch introduces an alternative which is mostly written from scratch embracing OpenMP 5.X, C++, LLVM coding style (where applicable), and conceptual interfaces. This new runtime is opt-in through a clang flag (D106793). The new runtime is currently only build for nvptx and has "-new" in its name. The design is tailored towards middle-end optimizations rather than front-end code generation choices, a trend we already started in the old runtime a while back. In contrast to the old one, state is organized in a simple manner rather than a "smart" one. While this can induce costs it helps optimizations. Our expectation is that the majority of codes can be optimized and a "simple" design is therefore preferable. The new runtime does also avoid users to pay for things they do not use, especially wrt. memory. The unlikely case of nested parallelism is supported but costly to make the more likely case use less resources. The worksharing and reduction implementation have been taken from the old runtime and will be rewritten in the future if necessary. Documentation and debug features are still mostly missing and will be added over time. All external symbols start with `__kmpc` for legacy reasons but should be renamed once we switch over to a single runtime. All internal symbols are placed in appropriate namespaces (anonymous or `_OMP`) to avoid name clashes with user symbols. Differential Revision: https://reviews.llvm.org/D106803 2021-07-26 02:26:44 +08:00
			`/// Atomically increment \p *Addr and wrap at \p V with \p Ordering semantics.`
			`uint32_t inc(uint32_t *Addr, uint32_t V, int Ordering);`

			`/// Atomically add \p V to \p *Addr with \p Ordering semantics.`
			`uint32_t add(uint32_t *Addr, uint32_t V, int Ordering);`

			`/// Atomically add \p V to \p *Addr with \p Ordering semantics.`
			`uint64_t add(uint64_t *Addr, uint64_t V, int Ordering);`

			`} // namespace atomic`

			`} // namespace _OMP`

			`#endif`