[libomptarget][amdgcn] Implement partial barrier

[libomptarget][amdgcn] Implement partial barrier

named_sync is used to coordinate non-spmd kernels. This uses bar.sync on nvptx.
There is no corresponding ISA support on amdgcn, so this is implemented using
shared memory, one word initialized to zero.

Each wave increments the variable by one. Whichever wave is last is responsible
for resetting the variable to zero, at which point it and the others continue.

The race condition — a wave reaching the barrier again before another wave has
noticed that the barrier was released — is handled with a generation counter,
packed into the same word.

Uses a shared variable that is not needed on nvptx. Introduces a new hook,
kmpc_impl_target_init, to allow different targets to do extra initialization.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D88602
This commit is contained in:
JonChesterfield 2020-10-12 21:21:56 +01:00
parent 81ead8a535
commit 8b6cd15242
4 changed files with 64 additions and 5 deletions

View File

@ -109,11 +109,11 @@ INLINE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t) {
// AMDGCN doesn't need to sync threads in a warp
}
// Stand-in partial barrier: synchronizes the whole workgroup regardless of
// num_threads, which is ignored for now.
// TODO: Implement on top of __SHARED__
INLINE void __kmpc_impl_named_sync(uint32_t num_threads) {
  // Full workgroup barrier — every wave waits, not just num_threads threads.
  __builtin_amdgcn_s_barrier();
  (void)num_threads; // parameter intentionally unused
}
// AMDGCN specific kernel initialization: hook for any per-launch setup the
// target needs before the OpenMP runtime starts (defined out of line).
DEVICE void __kmpc_impl_target_init();
// Equivalent to ptx bar.sync 1. Barrier until num_threads arrive.
// The out-of-line definition asserts that num_threads is a non-zero exact
// multiple of WARPSIZE.
DEVICE void __kmpc_impl_named_sync(uint32_t num_threads);
INLINE void __kmpc_impl_threadfence() {
__builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "agent");

View File

@ -62,6 +62,59 @@ DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t, int32_t var,
return __builtin_amdgcn_ds_bpermute(index << 2, var);
}
// One shared-memory word implementing the partial barrier. Low 16 bits count
// waves that have arrived; high 16 bits count how many times the barrier has
// been passed (the generation counter).
static DEVICE SHARED uint32_t L1_Barrier;
// AMDGCN per-launch setup: zero the barrier word before first use.
DEVICE void __kmpc_impl_target_init() {
// Don't have global ctors, and shared memory is not zero init
__atomic_store_n(&L1_Barrier, 0u, __ATOMIC_RELEASE);
}
// Partial barrier: block until num_threads threads (num_threads / WARPSIZE
// waves) have reached this call. amdgcn has no ISA support for a partial
// barrier (unlike ptx bar.sync), so this is built on the shared L1_Barrier
// word. Precondition (asserted below): num_threads is a non-zero multiple
// of WARPSIZE, and fewer than 0xffff waves participate.
DEVICE void __kmpc_impl_named_sync(uint32_t num_threads) {
__atomic_thread_fence(__ATOMIC_ACQUIRE);
uint32_t num_waves = num_threads / WARPSIZE;
// Partial barrier implementation for amdgcn.
// Uses two 16 bit unsigned counters. One for the number of waves to have
// reached the barrier, and one to count how many times the barrier has been
// passed. These are packed in a single atomically accessed 32 bit integer.
// Low bits for the number of waves, assumed zero before this call.
// High bits to count the number of times the barrier has been passed.
assert(num_waves != 0);
assert(num_waves * WARPSIZE == num_threads);
assert(num_waves < 0xffffu);
// Increment the low 16 bits once, using the lowest active thread.
uint64_t lowestActiveThread = __kmpc_impl_ffs(__kmpc_impl_activemask()) - 1;
bool isLowest = GetLaneId() == lowestActiveThread;
// NOTE(review): only the lowest active lane spins; the other lanes of the
// wave fall straight through — presumably correct because an amdgcn wave
// executes in lockstep, so they cannot pass the barrier before the spinning
// lane does. Confirm against the target's execution model.
if (isLowest) {
uint32_t load =
__atomic_fetch_add(&L1_Barrier, 1, __ATOMIC_RELAXED); // commutative
// Record the number of times the barrier has been passed
uint32_t generation = load & 0xffff0000u;
if ((load & 0x0000ffffu) == (num_waves - 1)) {
// Reached num_waves in low bits so this is the last wave.
// Set low bits to zero and increment high bits
load += 0x00010000u; // wrap is safe
load &= 0xffff0000u; // because bits zeroed second
// Reset the wave counter and release the waiting waves
__atomic_store_n(&L1_Barrier, load, __ATOMIC_RELAXED);
} else {
// more waves still to go, spin until generation counter changes
do {
// s_sleep(0) is a hint to yield while polling the shared word.
__builtin_amdgcn_s_sleep(0);
load = __atomic_load_n(&L1_Barrier, __ATOMIC_RELAXED);
} while ((load & 0xffff0000u) == generation);
}
}
// Release fence pairs with the acquire fence at entry of other callers so
// that writes made before the barrier are visible after it.
__atomic_thread_fence(__ATOMIC_RELEASE);
}
EXTERN uint64_t __ockl_get_local_size(uint32_t);
EXTERN uint64_t __ockl_get_num_groups(uint32_t);
// Number of workgroups in dimension 0 of the launched grid, via the ROCm
// device library query.
DEVICE int GetNumberOfBlocksInKernel() {
  uint64_t groups = __ockl_get_num_groups(0);
  return static_cast<int>(groups);
}

View File

@ -63,6 +63,7 @@ EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime) {
omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
nThreads = GetNumberOfThreadsInBlock();
threadLimit = ThreadLimit;
__kmpc_impl_target_init();
}
EXTERN void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized) {

View File

@ -183,6 +183,11 @@ INLINE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) {
#endif // CUDA_VERSION
}
// NVPTX specific kernel initialization
INLINE void __kmpc_impl_target_init() {
  // No target-specific setup is required on nvptx.
}
// Barrier until num_threads arrive.
INLINE void __kmpc_impl_named_sync(uint32_t num_threads) {
// The named barrier for active parallel threads of a team in an L1 parallel
// region to synchronize with each other.