[libomptarget][amdgcn] Implement partial barrier

[libomptarget][amdgcn] Implement partial barrier

named_sync is used to coordinate non-spmd kernels. This uses bar.sync on nvptx.
There is no corresponding ISA support on amdgcn, so this is implemented using
shared memory, one word initialized to zero.

Each wave increments the variable by one. Whichever wave is last is responsible
for resetting the variable to zero, at which point it and the others continue.

The race condition — a wave reaching the barrier again before another wave has
noticed that the barrier was released — is handled with a generation counter,
packed into the same word.

Uses a shared variable that is not needed on nvptx. Introduces a new hook,
kmpc_impl_target_init, to allow different targets to do extra initialization.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D88602
This commit is contained in:
JonChesterfield 2020-10-12 21:21:56 +01:00
parent 81ead8a535
commit 8b6cd15242
4 changed files with 64 additions and 5 deletions

View File

@ -109,11 +109,11 @@ INLINE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t) {
// AMDGCN doesn't need to sync threads in a warp
}
// Stand-in partial barrier: synchronizes the whole workgroup regardless of
// num_threads, which is ignored for now.
// TODO: Implement on top of __SHARED__
INLINE void __kmpc_impl_named_sync(uint32_t num_threads) {
  // Full workgroup barrier — every wave waits, not just num_threads threads.
  __builtin_amdgcn_s_barrier();
  (void)num_threads; // parameter intentionally unused
}
// AMDGCN specific kernel initialization: hook for any per-launch setup the
// target needs before the OpenMP runtime starts (defined out of line).
DEVICE void __kmpc_impl_target_init();
// Equivalent to ptx bar.sync 1. Barrier until num_threads arrive.
// The out-of-line definition asserts that num_threads is a non-zero exact
// multiple of WARPSIZE.
DEVICE void __kmpc_impl_named_sync(uint32_t num_threads);
INLINE void __kmpc_impl_threadfence() {
__builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "agent");

View File

@ -62,6 +62,59 @@ DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t, int32_t var,
return __builtin_amdgcn_ds_bpermute(index << 2, var);
}
// One shared-memory word implementing the partial barrier. Low 16 bits count
// waves that have arrived; high 16 bits count how many times the barrier has
// been passed (the generation counter).
static DEVICE SHARED uint32_t L1_Barrier;
// AMDGCN per-launch setup: zero the barrier word before first use.
DEVICE void __kmpc_impl_target_init() {
// Don't have global ctors, and shared memory is not zero init
__atomic_store_n(&L1_Barrier, 0u, __ATOMIC_RELEASE);
}
// Partial barrier: block until num_threads threads (num_threads / WARPSIZE
// waves) have reached this call. amdgcn has no ISA support for a partial
// barrier (unlike ptx bar.sync), so this is built on the shared L1_Barrier
// word. Precondition (asserted below): num_threads is a non-zero multiple
// of WARPSIZE, and fewer than 0xffff waves participate.
DEVICE void __kmpc_impl_named_sync(uint32_t num_threads) {
__atomic_thread_fence(__ATOMIC_ACQUIRE);
uint32_t num_waves = num_threads / WARPSIZE;
// Partial barrier implementation for amdgcn.
// Uses two 16 bit unsigned counters. One for the number of waves to have
// reached the barrier, and one to count how many times the barrier has been
// passed. These are packed in a single atomically accessed 32 bit integer.
// Low bits for the number of waves, assumed zero before this call.
// High bits to count the number of times the barrier has been passed.
assert(num_waves != 0);
assert(num_waves * WARPSIZE == num_threads);
assert(num_waves < 0xffffu);
// Increment the low 16 bits once, using the lowest active thread.
uint64_t lowestActiveThread = __kmpc_impl_ffs(__kmpc_impl_activemask()) - 1;
bool isLowest = GetLaneId() == lowestActiveThread;
// NOTE(review): only the lowest active lane spins; the other lanes of the
// wave fall straight through — presumably correct because an amdgcn wave
// executes in lockstep, so they cannot pass the barrier before the spinning
// lane does. Confirm against the target's execution model.
if (isLowest) {
uint32_t load =
__atomic_fetch_add(&L1_Barrier, 1, __ATOMIC_RELAXED); // commutative
// Record the number of times the barrier has been passed
uint32_t generation = load & 0xffff0000u;
if ((load & 0x0000ffffu) == (num_waves - 1)) {
// Reached num_waves in low bits so this is the last wave.
// Set low bits to zero and increment high bits
load += 0x00010000u; // wrap is safe
load &= 0xffff0000u; // because bits zeroed second
// Reset the wave counter and release the waiting waves
__atomic_store_n(&L1_Barrier, load, __ATOMIC_RELAXED);
} else {
// more waves still to go, spin until generation counter changes
do {
// s_sleep(0) is a hint to yield while polling the shared word.
__builtin_amdgcn_s_sleep(0);
load = __atomic_load_n(&L1_Barrier, __ATOMIC_RELAXED);
} while ((load & 0xffff0000u) == generation);
}
}
// Release fence pairs with the acquire fence at entry of other callers so
// that writes made before the barrier are visible after it.
__atomic_thread_fence(__ATOMIC_RELEASE);
}
EXTERN uint64_t __ockl_get_local_size(uint32_t);
EXTERN uint64_t __ockl_get_num_groups(uint32_t);
// Number of workgroups in dimension 0 of the launched grid, via the ROCm
// device library query.
DEVICE int GetNumberOfBlocksInKernel() {
  uint64_t groups = __ockl_get_num_groups(0);
  return static_cast<int>(groups);
}

View File

@ -63,6 +63,7 @@ EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime) {
omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
nThreads = GetNumberOfThreadsInBlock();
threadLimit = ThreadLimit;
__kmpc_impl_target_init();
}
EXTERN void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized) {

View File

@ -183,6 +183,11 @@ INLINE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) {
#endif // CUDA_VERSION
}
// NVPTX specific kernel initialization
INLINE void __kmpc_impl_target_init() {
  // No target-specific setup is required on nvptx.
}
// Barrier until num_threads arrive.
INLINE void __kmpc_impl_named_sync(uint32_t num_threads) {
// The named barrier for active parallel threads of a team in an L1 parallel
// region to synchronize with each other.