[libomptarget][nfc] Introduce atomic wrapper function

Summary: [libomptarget][nfc] Introduce atomic wrapper function. Wraps atomic functions in templates prefixed __kmpc_atomic that dispatch to CUDA or HIP atomic functions. Intended to be easily extended to dispatch to OpenCL or C++ atomics for a third target. Reviewers: ABataev, jdoerfert, grokos. Reviewed By: jdoerfert. Subscribers: Anastasia, jvesely, mgrang, dexonsmith, llvm-commits, mgorny, jfb, openmp-commits. Tags: #openmp, #llvm. Differential Revision: https://reviews.llvm.org/D71404
2019-12-18 20:06:16 +00:00 · 2019-12-18 20:06:16 +00:00 · 2caeaf2f45
parent 3db1cf7a1e
commit 2caeaf2f45
8 changed files with 71 additions and 25 deletions
--- a/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt
+++ b/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt
@ -76,6 +76,7 @@ set(h_files
  ${devicertl_base_directory}/common/omptarget.h
  ${devicertl_base_directory}/common/omptargeti.h
  ${devicertl_base_directory}/common/state-queue.h
+  ${devicertl_base_directory}/common/target_atomic.h
  ${devicertl_base_directory}/common/state-queuei.h
  ${devicertl_base_directory}/common/support.h)

--- a/openmp/libomptarget/deviceRTLs/common/omptargeti.h
+++ b/openmp/libomptarget/deviceRTLs/common/omptargeti.h
@ -11,6 +11,8 @@
 //
 //===----------------------------------------------------------------------===//

+#include "common/target_atomic.h"
+
 ////////////////////////////////////////////////////////////////////////////////
 // Task Descriptor
 ////////////////////////////////////////////////////////////////////////////////
@ -207,7 +209,7 @@ INLINE void omptarget_nvptx_SimpleMemoryManager::Release() {
  ASSERT0(LT_FUSSY, usedMemIdx < OMP_STATE_COUNT,
          "MemIdx is too big or uninitialized.");
  MemDataTy &MD = MemData[usedSlotIdx];
-  atomicExch((unsigned *)&MD.keys[usedMemIdx], 0);
+  __kmpc_atomic_exchange((unsigned *)&MD.keys[usedMemIdx], 0u);
 }

 INLINE const void *omptarget_nvptx_SimpleMemoryManager::Acquire(const void *buf,
@ -217,7 +219,7 @@ INLINE const void *omptarget_nvptx_SimpleMemoryManager::Acquire(const void *buf,
  const unsigned sm = usedSlotIdx;
  MemDataTy &MD = MemData[sm];
  unsigned i = hash(GetBlockIdInKernel());
-  while (atomicCAS((unsigned *)&MD.keys[i], 0, 1) != 0) {
+  while (__kmpc_atomic_cas((unsigned *)&MD.keys[i], 0u, 1u) != 0) {
    i = hash(i + 1);
  }
  usedSlotIdx = sm;
--- a/openmp/libomptarget/deviceRTLs/common/src/libcall.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/libcall.cu
@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//

 #include "common/omptarget.h"
+#include "common/target_atomic.h"
 #include "target_impl.h"

 EXTERN double omp_get_wtick(void) {
--- a/openmp/libomptarget/deviceRTLs/common/src/loop.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/loop.cu
@ -14,6 +14,7 @@

 #include "common/omptarget.h"
 #include "target_impl.h"
+#include "common/target_atomic.h"

 ////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////////////////////////////////////////
@ -397,9 +398,9 @@ public:
    unsigned int rank = __kmpc_impl_popc(active & lane_mask_lt);
    uint64_t warp_res;
    if (rank == 0) {
-      warp_res = atomicAdd(
+      warp_res = __kmpc_atomic_add(
          (unsigned long long *)&omptarget_nvptx_threadPrivateContext->Cnt(),
-          change);
+          (unsigned long long)change);
    }
    warp_res = Shuffle(active, warp_res, leader);
    return warp_res + rank;
@ -792,8 +793,8 @@ EXTERN void __kmpc_reduce_conditional_lastprivate(kmp_Ident *loc, int32_t gtid,
    // Atomic max of iterations.
    uint64_t *varArray = (uint64_t *)array;
    uint64_t elem = varArray[i];
-    (void)atomicMax((unsigned long long int *)Buffer,
-                    (unsigned long long int)elem);
+    (void)__kmpc_atomic_max((unsigned long long int *)Buffer,
+                            (unsigned long long int)elem);

    // Barrier.
    syncWorkersInGenericMode(NumThreads);
--- a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu
@ -11,6 +11,7 @@
 //===----------------------------------------------------------------------===//

 #include "common/omptarget.h"
+#include "common/target_atomic.h"
 #include "target_impl.h"

 EXTERN
@ -242,7 +243,7 @@ static int32_t nvptx_teams_reduce_nowait(int32_t global_tid, int32_t num_vars,
    // atomicInc increments 'timestamp' and has a range [0, NumTeams-1].
    // It resets 'timestamp' back to 0 once the last team increments
    // this counter.
-    unsigned val = atomicInc(timestamp, NumTeams - 1);
+    unsigned val = __kmpc_atomic_inc(timestamp, NumTeams - 1);
    IsLastTeam = val == NumTeams - 1;
  }

@ -377,7 +378,7 @@ EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple(kmp_Ident *loc,
  if (checkSPMDMode(loc) && GetThreadIdInBlock() != 0)
    return 0;
  // The master thread of the team actually does the reduction.
-  while (atomicCAS((uint32_t *)crit, 0, 1))
+  while (__kmpc_atomic_cas((uint32_t *)crit, 0u, 1u))
    ;
  return 1;
 }
@ -386,7 +387,7 @@ EXTERN void
 __kmpc_nvptx_teams_end_reduce_nowait_simple(kmp_Ident *loc, int32_t global_tid,
                                            kmp_CriticalName *crit) {
  __kmpc_impl_threadfence_system();
-  (void)atomicExch((uint32_t *)crit, 0);
+  (void)__kmpc_atomic_exchange((uint32_t *)crit, 0u);
 }

 INLINE static bool isMaster(kmp_Ident *loc, uint32_t ThreadId) {
@ -431,7 +432,7 @@ EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
  bool IsMaster = isMaster(loc, ThreadId);
  while (IsMaster) {
    // Atomic read
-    Bound = atomicAdd((uint32_t *)&IterCnt, 0);
+    Bound = __kmpc_atomic_add((uint32_t *)&IterCnt, 0u);
    if (TeamId < Bound + num_of_records)
      break;
  }
@ -447,7 +448,7 @@ EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
    // Increment team counter.
    // This counter is incremented by all teams in the current
    // BUFFER_SIZE chunk.
-    ChunkTeamCount = atomicInc((uint32_t *)&Cnt, num_of_records - 1);
+    ChunkTeamCount = __kmpc_atomic_inc((uint32_t *)&Cnt, num_of_records - 1u);
  }
  // Synchronize
  if (checkSPMDMode(loc))
@ -522,7 +523,7 @@ EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
  if (IsMaster && ChunkTeamCount == num_of_records - 1) {
    // Allow SIZE number of teams to proceed writing their
    // intermediate results to the global buffer.
-    atomicAdd((uint32_t *)&IterCnt, num_of_records);
+    __kmpc_atomic_add((uint32_t *)&IterCnt, uint32_t(num_of_records));
  }

  return 0;
--- a/openmp/libomptarget/deviceRTLs/common/state-queuei.h
+++ b/openmp/libomptarget/deviceRTLs/common/state-queuei.h
@ -1,4 +1,4 @@
-//===------- state-queue.cu - NVPTX OpenMP GPU State Queue ------- CUDA -*-===//
+//===------- state-queuei.h - OpenMP GPU State Queue ------------- CUDA -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@ -17,15 +17,16 @@
 //===----------------------------------------------------------------------===//

 #include "state-queue.h"
+#include "common/target_atomic.h"

 template <typename ElementType, uint32_t SIZE>
 INLINE uint32_t omptarget_nvptx_Queue<ElementType, SIZE>::ENQUEUE_TICKET() {
+  // Atomically add 1 to tail; the value returned by the atomic add is handed
+  // back to the caller as its enqueue ticket. The literal is 1u so that it
+  // has the same type as the unsigned int the pointer cast refers to.
-  return atomicAdd((unsigned int *)&tail, 1);
+  return __kmpc_atomic_add((unsigned int *)&tail, 1u);
 }

 template <typename ElementType, uint32_t SIZE>
 INLINE uint32_t omptarget_nvptx_Queue<ElementType, SIZE>::DEQUEUE_TICKET() {
+  // Atomically add 1 to head; the value returned by the atomic add is handed
+  // back to the caller as its dequeue ticket. The literal is 1u so that it
+  // has the same type as the unsigned int the pointer cast refers to.
-  return atomicAdd((unsigned int *)&head, 1);
+  return __kmpc_atomic_add((unsigned int *)&head, 1u);
 }

 template <typename ElementType, uint32_t SIZE>
@ -37,28 +38,28 @@ omptarget_nvptx_Queue<ElementType, SIZE>::ID(uint32_t ticket) {
 template <typename ElementType, uint32_t SIZE>
 INLINE bool omptarget_nvptx_Queue<ElementType, SIZE>::IsServing(uint32_t slot,
                                                                 uint32_t id) {
+  // Adding 0 leaves ids[slot] unchanged; the atomic add is used here only to
+  // read the current value, which is then compared against id.
-  return atomicAdd((unsigned int *)&ids[slot], 0) == id;
+  return __kmpc_atomic_add((unsigned int *)&ids[slot], 0u) == id;
 }

 template <typename ElementType, uint32_t SIZE>
 INLINE void
 omptarget_nvptx_Queue<ElementType, SIZE>::PushElement(uint32_t slot,
                                                       ElementType *element) {
+  // Store the element pointer, reinterpreted as unsigned long long, into
+  // elementQueue[slot] with an atomic exchange. The value returned by the
+  // exchange is not used.
-  atomicExch((unsigned long long *)&elementQueue[slot],
-             (unsigned long long)element);
+  __kmpc_atomic_exchange((unsigned long long *)&elementQueue[slot],
+                         (unsigned long long)element);
 }

 template <typename ElementType, uint32_t SIZE>
 INLINE ElementType *
 omptarget_nvptx_Queue<ElementType, SIZE>::PopElement(uint32_t slot) {
+  // Adding 0 leaves elementQueue[slot] unchanged; the atomic add is used only
+  // to read the stored value, which is cast back to an ElementType pointer.
+  // Note that the slot itself is not cleared by this call.
-  return (ElementType *)atomicAdd((unsigned long long *)&elementQueue[slot],
-                                  (unsigned long long)0);
+  return (ElementType *)__kmpc_atomic_add(
+      (unsigned long long *)&elementQueue[slot], (unsigned long long)0);
 }

 template <typename ElementType, uint32_t SIZE>
 INLINE void omptarget_nvptx_Queue<ElementType, SIZE>::DoneServing(uint32_t slot,
                                                                   uint32_t id) {
+  // Atomically replace ids[slot] with the next id, wrapping around at MAX_ID.
+  // The value returned by the exchange is not used.
-  atomicExch((unsigned int *)&ids[slot], (id + 1) % MAX_ID);
+  __kmpc_atomic_exchange((unsigned int *)&ids[slot], (id + 1) % MAX_ID);
 }

 template <typename ElementType, uint32_t SIZE>
--- a/openmp/libomptarget/deviceRTLs/common/target_atomic.h
+++ b/openmp/libomptarget/deviceRTLs/common/target_atomic.h
@ -0,0 +1,38 @@
+//===---- target_atomic.h - OpenMP GPU target atomic functions ---- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Templated __kmpc_atomic_* wrappers around the atomic functions provided by each target
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OMPTARGET_TARGET_ATOMIC_H
+#define OMPTARGET_TARGET_ATOMIC_H
+
+#include "target_impl.h"
+
+// Atomic add wrapper: forwards address and val unchanged to the target's
+// atomicAdd and returns whatever that call returns.
+// T is deduced from both arguments, so the pointee type and the value type
+// must match exactly (callers pass e.g. 1u, not 1, for unsigned targets).
+template <typename T> INLINE T __kmpc_atomic_add(T *address, T val) {
+  return atomicAdd(address, val);
+}
+
+// Atomic increment wrapper: forwards address and val unchanged to the
+// target's atomicInc and returns whatever that call returns.
+// Callers in this runtime pass the wrap-around bound as val (for example
+// NumTeams - 1). T is deduced from both arguments, so their types must match.
+template <typename T> INLINE T __kmpc_atomic_inc(T *address, T val) {
+  return atomicInc(address, val);
+}
+
+// Atomic max wrapper: forwards address and val unchanged to the target's
+// atomicMax and returns whatever that call returns.
+// T is deduced from both arguments, so the pointee type and the value type
+// must match exactly.
+template <typename T> INLINE T __kmpc_atomic_max(T *address, T val) {
+  return atomicMax(address, val);
+}
+
+// Atomic exchange wrapper: forwards address and val unchanged to the target's
+// atomicExch and returns whatever that call returns.
+// T is deduced from both arguments, so the pointee type and the value type
+// must match exactly (callers pass e.g. 0u, not 0, for unsigned targets).
+template <typename T> INLINE T __kmpc_atomic_exchange(T *address, T val) {
+  return atomicExch(address, val);
+}
+
+// Atomic compare-and-swap wrapper: forwards address, compare and val
+// unchanged to the target's atomicCAS and returns whatever that call returns.
+// Callers in this runtime test the result against compare to find out
+// whether the swap took place. T is deduced from all three arguments, so
+// their types must match exactly (e.g. 0u and 1u for unsigned targets).
+template <typename T> INLINE T __kmpc_atomic_cas(T *address, T compare, T val) {
+  return atomicCAS(address, compare, val);
+}
+
+#endif
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
@ -12,10 +12,11 @@

 #include "target_impl.h"
 #include "common/debug.h"
+#include "common/target_atomic.h"

 #define __OMP_SPIN 1000
-#define UNSET 0
-#define SET 1
+#define UNSET 0u
+#define SET 1u

 EXTERN void __kmpc_impl_init_lock(omp_lock_t *lock) {
  omp_unset_lock(lock);
@ -30,7 +31,7 @@ EXTERN void __kmpc_impl_set_lock(omp_lock_t *lock) {
  // (old == compare ? val : old)

  // TODO: not sure spinning is a good idea here..
-  while (atomicCAS(lock, UNSET, SET) != UNSET) {
+  while (__kmpc_atomic_cas(lock, UNSET, SET) != UNSET) {
    clock_t start = clock();
    clock_t now;
    for (;;) {
@ -44,7 +45,7 @@ EXTERN void __kmpc_impl_set_lock(omp_lock_t *lock) {
 }

 EXTERN void __kmpc_impl_unset_lock(omp_lock_t *lock) {
+  // Release the lock by atomically storing UNSET into it. The value returned
+  // by the exchange is explicitly discarded with the (void) cast.
-  (void)atomicExch(lock, UNSET);
+  (void)__kmpc_atomic_exchange(lock, UNSET);
 }

 EXTERN int __kmpc_impl_test_lock(omp_lock_t *lock) {