[OpenMP][NFC] Remove unused and untested code from the device runtime
Summary: We carried a lot of unused and untested code in the device runtime. Among other reasons, we are planning major rewrites, for which a reduced size is going to help a lot. This change reduces the number of code lines by 14%.

Before:
-------------------------------------------------------------------------------
Language                     files          blank        comment           code
-------------------------------------------------------------------------------
CUDA                            13            489            841           2454
C/C++ Header                    14            322            493           1377
C                               12            117            124            559
CMake                            4             64             64            262
C++                              1              6              6             39
-------------------------------------------------------------------------------
SUM:                            44            998           1528           4691
-------------------------------------------------------------------------------

After:
-------------------------------------------------------------------------------
Language                     files          blank        comment           code
-------------------------------------------------------------------------------
CUDA                            13            366            733           1879
C/C++ Header                    14            317            484           1293
C                               12            117            124            559
CMake                            4             64             64            262
C++                             1              6              6             39
-------------------------------------------------------------------------------
SUM:                            44            870           1411           4032
-------------------------------------------------------------------------------

Reviewers: hfinkel, jhuber6, fghanim, JonChesterfield, grokos, AndreyChurbanov, ye-luo, tianshilei1992, ggeorgakoudis, Hahnfeld, ABataev, hbae, ronlieb, gregrodgers

Subscribers: jvesely, yaxunl, bollu, guansong, jfb, sstefan1, aaron.ballman, openmp-commits, cfe-commits

Tags: #clang, #openmp

Differential Revision: https://reviews.llvm.org/D83349
Parent: 0f0c5af3db
Commit: cd0ea03e6f
@@ -78,7 +78,6 @@ int bar(int n){
// CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0, i16 0)
// CHECK-NOT: call void @__kmpc_for_static_init
// CHECK-NOT: call void @__kmpc_for_static_fini
// CHECK-NOT: call i32 @__kmpc_nvptx_simd_reduce_nowait(
// CHECK-NOT: call void @__kmpc_nvptx_end_reduce_nowait(
// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
// CHECK: ret void
@@ -140,8 +140,6 @@ DEVICE int GetNumberOfThreadsInBlock();
DEVICE unsigned GetWarpId();
DEVICE unsigned GetLaneId();

DEVICE bool __kmpc_impl_is_first_active_thread();

// Locks
DEVICE void __kmpc_impl_init_lock(omp_lock_t *lock);
DEVICE void __kmpc_impl_destroy_lock(omp_lock_t *lock);
@@ -200,7 +200,6 @@ public:
INLINE omptarget_nvptx_WorkDescr &WorkDescr() {
return workDescrForActiveParallel;
}
INLINE uint64_t *getLastprivateIterBuffer() { return &lastprivateIterBuffer; }

// init
INLINE void InitTeamDescr();
@@ -251,7 +250,6 @@ private:
levelZeroTaskDescr; // icv for team master initial thread
omptarget_nvptx_WorkDescr
workDescrForActiveParallel; // one, ONLY for the active par
uint64_t lastprivateIterBuffer;

ALIGN(16)
__kmpc_data_sharing_worker_slot_static worker_rootS[WARPSIZE];
@@ -277,10 +275,6 @@ public:
INLINE uint16_t &NumThreadsForNextParallel(int tid) {
return nextRegion.tnum[tid];
}
// simd
INLINE uint16_t &SimdLimitForNextSimd(int tid) {
return nextRegion.slim[tid];
}
// schedule (for dispatch)
INLINE kmp_sched_t &ScheduleType(int tid) { return schedule[tid]; }
INLINE int64_t &Chunk(int tid) { return chunk[tid]; }
@@ -304,8 +298,6 @@ private:
// Only one of the two is live at the same time.
// parallel
uint16_t tnum[MAX_THREADS_PER_TEAM];
// simd limit
uint16_t slim[MAX_THREADS_PER_TEAM];
} nextRegion;
// schedule (for dispatch)
kmp_sched_t schedule[MAX_THREADS_PER_TEAM]; // remember schedule type for #for
@@ -17,297 +17,6 @@ INLINE static bool IsMasterThread(bool isSPMDExecutionMode) {
return !isSPMDExecutionMode && GetMasterThreadID() == GetThreadIdInBlock();
}

/// Return the provided size aligned to the size of a pointer.
INLINE static size_t AlignVal(size_t Val) {
const size_t Align = (size_t)sizeof(void *);
if (Val & (Align - 1)) {
Val += Align;
Val &= ~(Align - 1);
}
return Val;
}

#define DSFLAG 0
#define DSFLAG_INIT 0
#define DSPRINT(_flag, _str, _args...) \
{ \
if (_flag) { \
/*printf("(%d,%d) -> " _str, blockIdx.x, threadIdx.x, _args);*/ \
} \
}
#define DSPRINT0(_flag, _str) \
{ \
if (_flag) { \
/*printf("(%d,%d) -> " _str, blockIdx.x, threadIdx.x);*/ \
} \
}

// Initialize the shared data structures. This is expected to be called for the
// master thread and warp masters. \param RootS: A pointer to the root of the
// data sharing stack. \param InitialDataSize: The initial size of the data in
// the slot.
EXTERN void
__kmpc_initialize_data_sharing_environment(__kmpc_data_sharing_slot *rootS,
size_t InitialDataSize) {
ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized.");
DSPRINT0(DSFLAG_INIT,
"Entering __kmpc_initialize_data_sharing_environment\n");

unsigned WID = GetWarpId();
DSPRINT(DSFLAG_INIT, "Warp ID: %u\n", WID);

omptarget_nvptx_TeamDescr *teamDescr =
&omptarget_nvptx_threadPrivateContext->TeamContext();
__kmpc_data_sharing_slot *RootS =
teamDescr->RootS(WID, IsMasterThread(isSPMDMode()));

DataSharingState.SlotPtr[WID] = RootS;
DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0];

// We don't need to initialize the frame and active threads.

DSPRINT(DSFLAG_INIT, "Initial data size: %08x \n", (unsigned)InitialDataSize);
DSPRINT(DSFLAG_INIT, "Root slot at: %016llx \n", (unsigned long long)RootS);
DSPRINT(DSFLAG_INIT, "Root slot data-end at: %016llx \n",
(unsigned long long)RootS->DataEnd);
DSPRINT(DSFLAG_INIT, "Root slot next at: %016llx \n",
(unsigned long long)RootS->Next);
DSPRINT(DSFLAG_INIT, "Shared slot ptr at: %016llx \n",
(unsigned long long)DataSharingState.SlotPtr[WID]);
DSPRINT(DSFLAG_INIT, "Shared stack ptr at: %016llx \n",
(unsigned long long)DataSharingState.StackPtr[WID]);

DSPRINT0(DSFLAG_INIT, "Exiting __kmpc_initialize_data_sharing_environment\n");
}

EXTERN void *__kmpc_data_sharing_environment_begin(
__kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack,
void **SavedSharedFrame, __kmpc_impl_lanemask_t *SavedActiveThreads,
size_t SharingDataSize, size_t SharingDefaultDataSize,
int16_t IsOMPRuntimeInitialized) {

DSPRINT0(DSFLAG, "Entering __kmpc_data_sharing_environment_begin\n");

// If the runtime has been elided, used shared memory for master-worker
// data sharing.
if (!IsOMPRuntimeInitialized)
return (void *)&DataSharingState;

DSPRINT(DSFLAG, "Data Size %016llx\n", (unsigned long long)SharingDataSize);
DSPRINT(DSFLAG, "Default Data Size %016llx\n",
(unsigned long long)SharingDefaultDataSize);

unsigned WID = GetWarpId();
__kmpc_impl_lanemask_t CurActiveThreads = __kmpc_impl_activemask();

__kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
void *&StackP = DataSharingState.StackPtr[WID];
void * volatile &FrameP = DataSharingState.FramePtr[WID];
__kmpc_impl_lanemask_t &ActiveT = DataSharingState.ActiveThreads[WID];

DSPRINT0(DSFLAG, "Save current slot/stack values.\n");
// Save the current values.
*SavedSharedSlot = SlotP;
*SavedSharedStack = StackP;
*SavedSharedFrame = FrameP;
*SavedActiveThreads = ActiveT;

DSPRINT(DSFLAG, "Warp ID: %u\n", WID);
DSPRINT(DSFLAG, "Saved slot ptr at: %016llx \n", (unsigned long long)SlotP);
DSPRINT(DSFLAG, "Saved stack ptr at: %016llx \n", (unsigned long long)StackP);
DSPRINT(DSFLAG, "Saved frame ptr at: %016llx \n", (long long)FrameP);
DSPRINT(DSFLAG, "Active threads: %08x \n", (unsigned)ActiveT);

// Only the warp active master needs to grow the stack.
if (__kmpc_impl_is_first_active_thread()) {
// Save the current active threads.
ActiveT = CurActiveThreads;

// Make sure we use aligned sizes to avoid rematerialization of data.
SharingDataSize = AlignVal(SharingDataSize);
// FIXME: The default data size can be assumed to be aligned?
SharingDefaultDataSize = AlignVal(SharingDefaultDataSize);

// Check if we have room for the data in the current slot.
const uintptr_t CurrentStartAddress = (uintptr_t)StackP;
const uintptr_t CurrentEndAddress = (uintptr_t)SlotP->DataEnd;
const uintptr_t RequiredEndAddress =
CurrentStartAddress + (uintptr_t)SharingDataSize;

DSPRINT(DSFLAG, "Data Size %016llx\n", (unsigned long long)SharingDataSize);
DSPRINT(DSFLAG, "Default Data Size %016llx\n",
(unsigned long long)SharingDefaultDataSize);
DSPRINT(DSFLAG, "Current Start Address %016llx\n",
(unsigned long long)CurrentStartAddress);
DSPRINT(DSFLAG, "Current End Address %016llx\n",
(unsigned long long)CurrentEndAddress);
DSPRINT(DSFLAG, "Required End Address %016llx\n",
(unsigned long long)RequiredEndAddress);
DSPRINT(DSFLAG, "Active Threads %08x\n", (unsigned)ActiveT);

// If we require a new slot, allocate it and initialize it (or attempt to
// reuse one). Also, set the shared stack and slot pointers to the new
// place. If we do not need to grow the stack, just adapt the stack and
// frame pointers.
if (CurrentEndAddress < RequiredEndAddress) {
size_t NewSize = (SharingDataSize > SharingDefaultDataSize)
? SharingDataSize
: SharingDefaultDataSize;
__kmpc_data_sharing_slot *NewSlot = 0;

// Attempt to reuse an existing slot.
if (__kmpc_data_sharing_slot *ExistingSlot = SlotP->Next) {
uintptr_t ExistingSlotSize = (uintptr_t)ExistingSlot->DataEnd -
(uintptr_t)(&ExistingSlot->Data[0]);
if (ExistingSlotSize >= NewSize) {
DSPRINT(DSFLAG, "Reusing stack slot %016llx\n",
(unsigned long long)ExistingSlot);
NewSlot = ExistingSlot;
} else {
DSPRINT(DSFLAG, "Cleaning up -failed reuse - %016llx\n",
(unsigned long long)SlotP->Next);
SafeFree(ExistingSlot, "Failed reuse");
}
}

if (!NewSlot) {
NewSlot = (__kmpc_data_sharing_slot *)SafeMalloc(
sizeof(__kmpc_data_sharing_slot) + NewSize,
"Warp master slot allocation");
DSPRINT(DSFLAG, "New slot allocated %016llx (data size=%016llx)\n",
(unsigned long long)NewSlot, NewSize);
}

NewSlot->Next = 0;
NewSlot->DataEnd = &NewSlot->Data[NewSize];

SlotP->Next = NewSlot;
SlotP = NewSlot;
StackP = &NewSlot->Data[SharingDataSize];
FrameP = &NewSlot->Data[0];
} else {

// Clean up any old slot that we may still have. The slot producers, do
// not eliminate them because that may be used to return data.
if (SlotP->Next) {
DSPRINT(DSFLAG, "Cleaning up - old not required - %016llx\n",
(unsigned long long)SlotP->Next);
SafeFree(SlotP->Next, "Old slot not required");
SlotP->Next = 0;
}

FrameP = StackP;
StackP = (void *)RequiredEndAddress;
}
}

// FIXME: Need to see the impact of doing it here.
__kmpc_impl_threadfence_block();

DSPRINT0(DSFLAG, "Exiting __kmpc_data_sharing_environment_begin\n");

// All the threads in this warp get the frame they should work with.
return FrameP;
}

EXTERN void __kmpc_data_sharing_environment_end(
__kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack,
void **SavedSharedFrame, __kmpc_impl_lanemask_t *SavedActiveThreads,
int32_t IsEntryPoint) {

DSPRINT0(DSFLAG, "Entering __kmpc_data_sharing_environment_end\n");

unsigned WID = GetWarpId();

if (IsEntryPoint) {
if (__kmpc_impl_is_first_active_thread()) {
DSPRINT0(DSFLAG, "Doing clean up\n");

// The master thread cleans the saved slot, because this is an environment
// only for the master.
__kmpc_data_sharing_slot *S = IsMasterThread(isSPMDMode())
? *SavedSharedSlot
: DataSharingState.SlotPtr[WID];

if (S->Next) {
SafeFree(S->Next, "Sharing environment end");
S->Next = 0;
}
}

DSPRINT0(DSFLAG, "Exiting Exiting __kmpc_data_sharing_environment_end\n");
return;
}

__kmpc_impl_lanemask_t CurActive = __kmpc_impl_activemask();

// Only the warp master can restore the stack and frame information, and only
// if there are no other threads left behind in this environment (i.e. the
// warp diverged and returns in different places). This only works if we
// assume that threads will converge right after the call site that started
// the environment.
if (__kmpc_impl_is_first_active_thread()) {
__kmpc_impl_lanemask_t &ActiveT = DataSharingState.ActiveThreads[WID];

DSPRINT0(DSFLAG, "Before restoring the stack\n");
// Zero the bits in the mask. If it is still different from zero, then we
// have other threads that will return after the current ones.
ActiveT &= ~CurActive;

DSPRINT(DSFLAG, "Active threads: %08x; New mask: %08x\n",
(unsigned)CurActive, (unsigned)ActiveT);

if (!ActiveT) {
// No other active threads? Great, lets restore the stack.

__kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
void *&StackP = DataSharingState.StackPtr[WID];
void * volatile &FrameP = DataSharingState.FramePtr[WID];

SlotP = *SavedSharedSlot;
StackP = *SavedSharedStack;
FrameP = *SavedSharedFrame;
ActiveT = *SavedActiveThreads;

DSPRINT(DSFLAG, "Restored slot ptr at: %016llx \n",
(unsigned long long)SlotP);
DSPRINT(DSFLAG, "Restored stack ptr at: %016llx \n",
(unsigned long long)StackP);
DSPRINT(DSFLAG, "Restored frame ptr at: %016llx \n",
(unsigned long long)FrameP);
DSPRINT(DSFLAG, "Active threads: %08x \n", (unsigned)ActiveT);
}
}

// FIXME: Need to see the impact of doing it here.
__kmpc_impl_threadfence_block();

DSPRINT0(DSFLAG, "Exiting __kmpc_data_sharing_environment_end\n");
return;
}

EXTERN void *
__kmpc_get_data_sharing_environment_frame(int32_t SourceThreadID,
int16_t IsOMPRuntimeInitialized) {
DSPRINT0(DSFLAG, "Entering __kmpc_get_data_sharing_environment_frame\n");

// If the runtime has been elided, use shared memory for master-worker
// data sharing. We're reusing the statically allocated data structure
// that is used for standard data sharing.
if (!IsOMPRuntimeInitialized)
return (void *)&DataSharingState;

// Get the frame used by the requested thread.

unsigned SourceWID = SourceThreadID / WARPSIZE;

DSPRINT(DSFLAG, "Source warp: %u\n", SourceWID);

void * volatile P = DataSharingState.FramePtr[SourceWID];
DSPRINT0(DSFLAG, "Exiting __kmpc_get_data_sharing_environment_frame\n");
return P;
}

////////////////////////////////////////////////////////////////////////////////
// Runtime functions for trunk data sharing scheme.
////////////////////////////////////////////////////////////////////////////////
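The removed AlignVal() helper above rounds a requested size up to the pointer size before the warp master grows the per-warp data-sharing slot. A minimal host-side C++ sketch of that round-up idiom (align_up is an illustrative name, not part of the runtime):

#include <cassert>
#include <cstddef>

// Round Val up to the next multiple of Align, the same bit trick the removed
// AlignVal() uses. Align must be a power of two.
static size_t align_up(size_t Val, size_t Align = sizeof(void *)) {
  if (Val & (Align - 1)) {
    Val += Align;
    Val &= ~(Align - 1);
  }
  return Val;
}

int main() {
  // With an 8-byte alignment: 13 rounds up to 16, 16 stays 16.
  assert(align_up(13, 8) == 16);
  assert(align_up(16, 8) == 16);
  return 0;
}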
@@ -362,53 +362,3 @@ EXTERN int omp_test_lock(omp_lock_t *lock) {
PRINT(LD_IO, "call omp_test_lock() return %d\n", rc);
return rc;
}

// for xlf Fortran
// Fortran, the return is LOGICAL type

#define FLOGICAL long
EXTERN FLOGICAL __xlf_omp_is_initial_device_i8() {
int ret = omp_is_initial_device();
if (ret == 0)
return (FLOGICAL)0;
else
return (FLOGICAL)1;
}

EXTERN int __xlf_omp_is_initial_device_i4() {
int ret = omp_is_initial_device();
if (ret == 0)
return 0;
else
return 1;
}

EXTERN long __xlf_omp_get_team_num_i4() {
int ret = omp_get_team_num();
return (long)ret;
}

EXTERN long __xlf_omp_get_num_teams_i4() {
int ret = omp_get_num_teams();
return (long)ret;
}

EXTERN void xlf_debug_print_int(int *p) {
printf("xlf DEBUG %d): %p %d\n", omp_get_team_num(), p, p == 0 ? 0 : *p);
}

EXTERN void xlf_debug_print_long(long *p) {
printf("xlf DEBUG %d): %p %ld\n", omp_get_team_num(), p, p == 0 ? 0 : *p);
}

EXTERN void xlf_debug_print_float(float *p) {
printf("xlf DEBUG %d): %p %f\n", omp_get_team_num(), p, p == 0 ? 0 : *p);
}

EXTERN void xlf_debug_print_double(double *p) {
printf("xlf DEBUG %d): %p %f\n", omp_get_team_num(), p, p == 0 ? 0 : *p);
}

EXTERN void xlf_debug_print_addr(void *p) {
printf("xlf DEBUG %d): %p \n", omp_get_team_num(), p);
}
@@ -754,55 +754,3 @@ void __kmpc_for_static_init_8u_simple_generic(
EXTERN void __kmpc_for_static_fini(kmp_Ident *loc, int32_t global_tid) {
PRINT0(LD_IO, "call kmpc_for_static_fini\n");
}

namespace {
INLINE void syncWorkersInGenericMode(uint32_t NumThreads) {
int NumWarps = ((NumThreads + WARPSIZE - 1) / WARPSIZE);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
// On Volta and newer architectures we require that all lanes in
// a warp (at least, all present for the kernel launch) participate in the
// barrier. This is enforced when launching the parallel region. An
// exception is when there are < WARPSIZE workers. In this case only 1 worker
// is started, so we don't need a barrier.
if (NumThreads > 1) {
#endif
__kmpc_impl_named_sync(L1_BARRIER, WARPSIZE * NumWarps);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
}
#endif
}
}; // namespace

EXTERN void __kmpc_reduce_conditional_lastprivate(kmp_Ident *loc, int32_t gtid,
int32_t varNum, void *array) {
PRINT0(LD_IO, "call to __kmpc_reduce_conditional_lastprivate(...)\n");
ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc),
"Expected non-SPMD mode + initialized runtime.");

omptarget_nvptx_TeamDescr &teamDescr = getMyTeamDescriptor();
uint32_t NumThreads = GetNumberOfOmpThreads(checkSPMDMode(loc));
uint64_t *Buffer = teamDescr.getLastprivateIterBuffer();
for (unsigned i = 0; i < varNum; i++) {
// Reset buffer.
if (gtid == 0)
*Buffer = 0; // Reset to minimum loop iteration value.

// Barrier.
syncWorkersInGenericMode(NumThreads);

// Atomic max of iterations.
uint64_t *varArray = (uint64_t *)array;
uint64_t elem = varArray[i];
(void)__kmpc_atomic_max((unsigned long long int *)Buffer,
(unsigned long long int)elem);

// Barrier.
syncWorkersInGenericMode(NumThreads);

// Read max value and update thread private array.
varArray[i] = *Buffer;

// Barrier.
syncWorkersInGenericMode(NumThreads);
}
}
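For context, the removed __kmpc_reduce_conditional_lastprivate walks each lastprivate variable through a reset / atomic-max / read sequence separated by team barriers, so the thread that executed the highest loop iteration wins. A hedged C++ sketch of the atomic-max step (the device runtime calls __kmpc_atomic_max; atomic_max below is an illustrative stand-in using a CAS loop):

#include <atomic>
#include <cstdint>

// Raise Buffer to at least Value; the largest published value survives.
static void atomic_max(std::atomic<uint64_t> &Buffer, uint64_t Value) {
  uint64_t Cur = Buffer.load();
  // compare_exchange_weak reloads Cur on failure, so the loop converges.
  while (Cur < Value && !Buffer.compare_exchange_weak(Cur, Value)) {
  }
}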
@@ -25,13 +25,6 @@ extern DEVICE
// init entry points
////////////////////////////////////////////////////////////////////////////////

EXTERN void __kmpc_kernel_init_params(void *Ptr) {
PRINT(LD_IO, "call to __kmpc_kernel_init_params with version %f\n",
OMPTARGET_NVPTX_VERSION);

SetTeamsReductionScratchpadPtr(Ptr);
}

EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime) {
PRINT(LD_IO, "call to __kmpc_kernel_init with version %f\n",
OMPTARGET_NVPTX_VERSION);
@@ -152,10 +145,6 @@ EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime,
}
}

EXTERN __attribute__((deprecated)) void __kmpc_spmd_kernel_deinit() {
__kmpc_spmd_kernel_deinit_v2(isRuntimeInitialized());
}

EXTERN void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime) {
// We're not going to pop the task descr stack of each thread since
// there are no more parallel regions in SPMD mode.
@@ -35,161 +35,6 @@
#include "common/omptarget.h"
#include "target_impl.h"

typedef struct ConvergentSimdJob {
omptarget_nvptx_TaskDescr taskDescr;
omptarget_nvptx_TaskDescr *convHeadTaskDescr;
uint16_t slimForNextSimd;
} ConvergentSimdJob;

////////////////////////////////////////////////////////////////////////////////
// support for convergent simd (team of threads in a warp only)
////////////////////////////////////////////////////////////////////////////////
EXTERN bool __kmpc_kernel_convergent_simd(void *buffer,
__kmpc_impl_lanemask_t Mask,
bool *IsFinal, int32_t *LaneSource,
int32_t *LaneId, int32_t *NumLanes) {
PRINT0(LD_IO, "call to __kmpc_kernel_convergent_simd\n");
__kmpc_impl_lanemask_t ConvergentMask = Mask;
int32_t ConvergentSize = __kmpc_impl_popc(ConvergentMask);
__kmpc_impl_lanemask_t WorkRemaining = ConvergentMask >> (*LaneSource + 1);
*LaneSource += __kmpc_impl_ffs(WorkRemaining);
*IsFinal = __kmpc_impl_popc(WorkRemaining) == 1;
__kmpc_impl_lanemask_t lanemask_lt = __kmpc_impl_lanemask_lt();
*LaneId = __kmpc_impl_popc(ConvergentMask & lanemask_lt);

int threadId = GetLogicalThreadIdInBlock(isSPMDMode());
int sourceThreadId = (threadId & ~(WARPSIZE - 1)) + *LaneSource;

ConvergentSimdJob *job = (ConvergentSimdJob *)buffer;
int32_t SimdLimit =
omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId);
job->slimForNextSimd = SimdLimit;

int32_t SimdLimitSource = __kmpc_impl_shfl_sync(Mask, SimdLimit, *LaneSource);
// reset simdlimit to avoid propagating to successive #simd
if (SimdLimitSource > 0 && threadId == sourceThreadId)
omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId) = 0;

// We cannot have more than the # of convergent threads.
if (SimdLimitSource > 0)
*NumLanes = __kmpc_impl_min(ConvergentSize, SimdLimitSource);
else
*NumLanes = ConvergentSize;
ASSERT(LT_FUSSY, *NumLanes > 0, "bad thread request of %d threads",
(int)*NumLanes);

// Set to true for lanes participating in the simd region.
bool isActive = false;
// Initialize state for active threads.
if (*LaneId < *NumLanes) {
omptarget_nvptx_TaskDescr *currTaskDescr =
omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
omptarget_nvptx_TaskDescr *sourceTaskDescr =
omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(
sourceThreadId);
job->convHeadTaskDescr = currTaskDescr;
// install top descriptor from the thread for which the lanes are working.
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
sourceTaskDescr);
isActive = true;
}

// requires a memory fence between threads of a warp
return isActive;
}

EXTERN void __kmpc_kernel_end_convergent_simd(void *buffer) {
PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_convergent_parallel\n");
// pop stack
int threadId = GetLogicalThreadIdInBlock(isSPMDMode());
ConvergentSimdJob *job = (ConvergentSimdJob *)buffer;
omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId) =
job->slimForNextSimd;
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
threadId, job->convHeadTaskDescr);
}

typedef struct ConvergentParallelJob {
omptarget_nvptx_TaskDescr taskDescr;
omptarget_nvptx_TaskDescr *convHeadTaskDescr;
uint16_t tnumForNextPar;
} ConvergentParallelJob;

////////////////////////////////////////////////////////////////////////////////
// support for convergent parallelism (team of threads in a warp only)
////////////////////////////////////////////////////////////////////////////////
EXTERN bool __kmpc_kernel_convergent_parallel(void *buffer,
__kmpc_impl_lanemask_t Mask,
bool *IsFinal,
int32_t *LaneSource) {
PRINT0(LD_IO, "call to __kmpc_kernel_convergent_parallel\n");
__kmpc_impl_lanemask_t ConvergentMask = Mask;
int32_t ConvergentSize = __kmpc_impl_popc(ConvergentMask);
__kmpc_impl_lanemask_t WorkRemaining = ConvergentMask >> (*LaneSource + 1);
*LaneSource += __kmpc_impl_ffs(WorkRemaining);
*IsFinal = __kmpc_impl_popc(WorkRemaining) == 1;
__kmpc_impl_lanemask_t lanemask_lt = __kmpc_impl_lanemask_lt();
uint32_t OmpId = __kmpc_impl_popc(ConvergentMask & lanemask_lt);

int threadId = GetLogicalThreadIdInBlock(isSPMDMode());
int sourceThreadId = (threadId & ~(WARPSIZE - 1)) + *LaneSource;

ConvergentParallelJob *job = (ConvergentParallelJob *)buffer;
int32_t NumThreadsClause =
omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId);
job->tnumForNextPar = NumThreadsClause;

int32_t NumThreadsSource =
__kmpc_impl_shfl_sync(Mask, NumThreadsClause, *LaneSource);
// reset numthreads to avoid propagating to successive #parallel
if (NumThreadsSource > 0 && threadId == sourceThreadId)
omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId) =
0;

// We cannot have more than the # of convergent threads.
uint16_t NumThreads;
if (NumThreadsSource > 0)
NumThreads = __kmpc_impl_min(ConvergentSize, NumThreadsSource);
else
NumThreads = ConvergentSize;
ASSERT(LT_FUSSY, NumThreads > 0, "bad thread request of %d threads",
(int)NumThreads);

// Set to true for workers participating in the parallel region.
bool isActive = false;
// Initialize state for active threads.
if (OmpId < NumThreads) {
// init L2 task descriptor and storage for the L1 parallel task descriptor.
omptarget_nvptx_TaskDescr *newTaskDescr = &job->taskDescr;
ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr");
omptarget_nvptx_TaskDescr *currTaskDescr =
omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
omptarget_nvptx_TaskDescr *sourceTaskDescr =
omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(
sourceThreadId);
job->convHeadTaskDescr = currTaskDescr;
newTaskDescr->CopyConvergentParent(sourceTaskDescr, OmpId, NumThreads);
// install new top descriptor
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
newTaskDescr);
isActive = true;
}

// requires a memory fence between threads of a warp
return isActive;
}

EXTERN void __kmpc_kernel_end_convergent_parallel(void *buffer) {
PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_convergent_parallel\n");
// pop stack
int threadId = GetLogicalThreadIdInBlock(isSPMDMode());
ConvergentParallelJob *job = (ConvergentParallelJob *)buffer;
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
threadId, job->convHeadTaskDescr);
omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId) =
job->tnumForNextPar;
}

////////////////////////////////////////////////////////////////////////////////
// support for parallel that goes parallel (1 static level only)
////////////////////////////////////////////////////////////////////////////////
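Both removed convergent entry points above select the next "source" lane by scanning the active-lane mask: shift away the lanes already served, take the first remaining set bit, and report whether it is the last one. A hedged host-side C++ sketch of that selection step (plain GCC/Clang builtins stand in for __kmpc_impl_ffs and __kmpc_impl_popc; next_lane_source is an illustrative name):

#include <cstdint>

// Advance LaneSource (initially -1) to the next active lane in Mask and set
// IsFinal when that lane is the last one left to serve.
static int32_t next_lane_source(uint32_t Mask, int32_t LaneSource,
                                bool &IsFinal) {
  uint32_t WorkRemaining = Mask >> (LaneSource + 1);
  LaneSource += __builtin_ffs(WorkRemaining);    // 1-based index of next set bit
  IsFinal = __builtin_popcount(WorkRemaining) == 1;
  return LaneSource;
}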
@@ -446,14 +291,6 @@ EXTERN void __kmpc_push_num_threads(kmp_Ident *loc, int32_t tid,
num_threads;
}

EXTERN void __kmpc_push_simd_limit(kmp_Ident *loc, int32_t tid,
int32_t simd_limit) {
PRINT(LD_IO, "call kmpc_push_simd_limit %d\n", (int)simd_limit);
ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), "Runtime must be initialized.");
tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(tid) = simd_limit;
}

// Do nothing. The host guarantees we started the requested number of
// teams and we only need inspection of gridDim.
@@ -73,22 +73,6 @@ gpu_irregular_simd_reduce(void *reduce_data, kmp_ShuffleReductFctPtr shflFct) {
return (logical_lane_id == 0);
}

EXTERN
int32_t __kmpc_nvptx_simd_reduce_nowait(int32_t global_tid, int32_t num_vars,
size_t reduce_size, void *reduce_data,
kmp_ShuffleReductFctPtr shflFct,
kmp_InterWarpCopyFctPtr cpyFct) {
__kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask();
if (Liveness == __kmpc_impl_all_lanes) {
gpu_regular_warp_reduce(reduce_data, shflFct);
return GetThreadIdInBlock() % WARPSIZE ==
0; // Result on lane 0 of the simd warp.
} else {
return gpu_irregular_simd_reduce(
reduce_data, shflFct); // Result on the first active lane.
}
}

INLINE
static int32_t nvptx_parallel_reduce_nowait(
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
@@ -177,14 +161,6 @@ static int32_t nvptx_parallel_reduce_nowait(
#endif // __CUDA_ARCH__ >= 700
}

EXTERN __attribute__((deprecated)) int32_t __kmpc_nvptx_parallel_reduce_nowait(
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) {
return nvptx_parallel_reduce_nowait(global_tid, num_vars, reduce_size,
reduce_data, shflFct, cpyFct,
isSPMDMode(), isRuntimeUninitialized());
}

EXTERN
int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
kmp_Ident *loc, int32_t global_tid, int32_t num_vars, size_t reduce_size,
@@ -195,201 +171,6 @@ int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
checkSPMDMode(loc), checkRuntimeUninitialized(loc));
}

EXTERN
int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_spmd(
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) {
return nvptx_parallel_reduce_nowait(
global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct,
/*isSPMDExecutionMode=*/true, /*isRuntimeUninitialized=*/true);
}

EXTERN
int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_generic(
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) {
return nvptx_parallel_reduce_nowait(
global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct,
/*isSPMDExecutionMode=*/false, /*isRuntimeUninitialized=*/true);
}

INLINE
static int32_t nvptx_teams_reduce_nowait(int32_t global_tid, int32_t num_vars,
size_t reduce_size, void *reduce_data,
kmp_ShuffleReductFctPtr shflFct,
kmp_InterWarpCopyFctPtr cpyFct,
kmp_CopyToScratchpadFctPtr scratchFct,
kmp_LoadReduceFctPtr ldFct,
bool isSPMDExecutionMode) {
uint32_t ThreadId = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
// In non-generic mode all workers participate in the teams reduction.
// In generic mode only the team master participates in the teams
// reduction because the workers are waiting for parallel work.
uint32_t NumThreads =
isSPMDExecutionMode ? GetNumberOfOmpThreads(/*isSPMDExecutionMode=*/true)
: /*Master thread only*/ 1;
uint32_t TeamId = GetBlockIdInKernel();
uint32_t NumTeams = GetNumberOfBlocksInKernel();
static SHARED volatile bool IsLastTeam;

// Team masters of all teams write to the scratchpad.
if (ThreadId == 0) {
unsigned int *timestamp = GetTeamsReductionTimestamp();
char *scratchpad = GetTeamsReductionScratchpad();

scratchFct(reduce_data, scratchpad, TeamId, NumTeams);
__kmpc_impl_threadfence();

// atomicInc increments 'timestamp' and has a range [0, NumTeams-1].
// It resets 'timestamp' back to 0 once the last team increments
// this counter.
unsigned val = __kmpc_atomic_inc(timestamp, NumTeams - 1);
IsLastTeam = val == NumTeams - 1;
}

// We have to wait on L1 barrier because in GENERIC mode the workers
// are waiting on barrier 0 for work.
//
// If we guard this barrier as follows it leads to deadlock, probably
// because of a compiler bug: if (!IsGenericMode()) __syncthreads();
uint16_t SyncWarps = (NumThreads + WARPSIZE - 1) / WARPSIZE;
__kmpc_impl_named_sync(L1_BARRIER, SyncWarps * WARPSIZE);

// If this team is not the last, quit.
if (/* Volatile read by all threads */ !IsLastTeam)
return 0;

//
// Last team processing.
//

// Threads in excess of #teams do not participate in reduction of the
// scratchpad values.
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
uint32_t ActiveThreads = NumThreads;
if (NumTeams < NumThreads) {
ActiveThreads =
(NumTeams < WARPSIZE) ? 1 : NumTeams & ~((uint16_t)WARPSIZE - 1);
}
if (ThreadId >= ActiveThreads)
return 0;

// Load from scratchpad and reduce.
char *scratchpad = GetTeamsReductionScratchpad();
ldFct(reduce_data, scratchpad, ThreadId, NumTeams, /*Load only*/ 0);
for (uint32_t i = ActiveThreads + ThreadId; i < NumTeams; i += ActiveThreads)
ldFct(reduce_data, scratchpad, i, NumTeams, /*Load and reduce*/ 1);

uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE;
uint32_t WarpId = ThreadId / WARPSIZE;

// Reduce across warps to the warp master.
if ((ActiveThreads % WARPSIZE == 0) ||
(WarpId < WarpsNeeded - 1)) // Full warp
gpu_regular_warp_reduce(reduce_data, shflFct);
else if (ActiveThreads > 1) // Partial warp but contiguous lanes
// Only SPMD execution mode comes thru this case.
gpu_irregular_warp_reduce(reduce_data, shflFct,
/*LaneCount=*/ActiveThreads % WARPSIZE,
/*LaneId=*/ThreadId % WARPSIZE);

// When we have more than [warpsize] number of threads
// a block reduction is performed here.
if (ActiveThreads > WARPSIZE) {
// Gather all the reduced values from each warp
// to the first warp.
cpyFct(reduce_data, WarpsNeeded);

if (WarpId == 0)
gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, ThreadId);
}
#else
if (ThreadId >= NumTeams)
return 0;

// Load from scratchpad and reduce.
char *scratchpad = GetTeamsReductionScratchpad();
ldFct(reduce_data, scratchpad, ThreadId, NumTeams, /*Load only*/ 0);
for (uint32_t i = NumThreads + ThreadId; i < NumTeams; i += NumThreads)
ldFct(reduce_data, scratchpad, i, NumTeams, /*Load and reduce*/ 1);

// Reduce across warps to the warp master.
__kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask();
if (Liveness == __kmpc_impl_all_lanes) // Full warp
gpu_regular_warp_reduce(reduce_data, shflFct);
else // Partial warp but contiguous lanes
gpu_irregular_warp_reduce(reduce_data, shflFct,
/*LaneCount=*/__kmpc_impl_popc(Liveness),
/*LaneId=*/ThreadId % WARPSIZE);

// When we have more than [warpsize] number of threads
// a block reduction is performed here.
uint32_t ActiveThreads = NumTeams < NumThreads ? NumTeams : NumThreads;
if (ActiveThreads > WARPSIZE) {
uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE;
// Gather all the reduced values from each warp
// to the first warp.
cpyFct(reduce_data, WarpsNeeded);

uint32_t WarpId = ThreadId / WARPSIZE;
if (WarpId == 0)
gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, ThreadId);
}
#endif // __CUDA_ARCH__ >= 700

return ThreadId == 0;
}

EXTERN
int32_t __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid, int32_t num_vars,
size_t reduce_size, void *reduce_data,
kmp_ShuffleReductFctPtr shflFct,
kmp_InterWarpCopyFctPtr cpyFct,
kmp_CopyToScratchpadFctPtr scratchFct,
kmp_LoadReduceFctPtr ldFct) {
return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size,
reduce_data, shflFct, cpyFct, scratchFct,
ldFct, isSPMDMode());
}

EXTERN
int32_t __kmpc_nvptx_teams_reduce_nowait_simple_spmd(
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct) {
return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size,
reduce_data, shflFct, cpyFct, scratchFct,
ldFct, /*isSPMDExecutionMode=*/true);
}

EXTERN
int32_t __kmpc_nvptx_teams_reduce_nowait_simple_generic(
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct) {
return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size,
reduce_data, shflFct, cpyFct, scratchFct,
ldFct, /*isSPMDExecutionMode=*/false);
}

EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple(kmp_Ident *loc,
int32_t global_tid,
kmp_CriticalName *crit) {
if (checkSPMDMode(loc) && GetThreadIdInBlock() != 0)
return 0;
// The master thread of the team actually does the reduction.
while (__kmpc_atomic_cas((uint32_t *)crit, 0u, 1u))
;
return 1;
}

EXTERN void
__kmpc_nvptx_teams_end_reduce_nowait_simple(kmp_Ident *loc, int32_t global_tid,
kmp_CriticalName *crit) {
__kmpc_impl_threadfence_system();
(void)__kmpc_atomic_exchange((uint32_t *)crit, 0u);
}

INLINE static bool isMaster(kmp_Ident *loc, uint32_t ThreadId) {
return checkGenericMode(loc) || IsTeamMaster(ThreadId);
}
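The removed nvptx_teams_reduce_nowait decides which team performs the final reduction with a shared counter: every team master publishes its partial result to the scratchpad, issues a fence, and bumps the counter; the master that observes the value NumTeams - 1 knows all other teams have already written and carries out the last step. A hedged C++ sketch of that check (the device runtime uses a wrapping __kmpc_atomic_inc so the counter resets to 0 for reuse; a plain fetch_add models the same idea on the host, and is_last_team is an illustrative name):

#include <atomic>

// Returns true for exactly one caller: the one whose ticket is NumTeams - 1,
// i.e. the last team to arrive after all partial results are published.
static bool is_last_team(std::atomic<unsigned> &Timestamp, unsigned NumTeams) {
  unsigned Ticket = Timestamp.fetch_add(1);  // value before the increment
  return Ticket == NumTeams - 1;
}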
@@ -264,6 +264,3 @@ DEVICE char *GetTeamsReductionScratchpad() {
return static_cast<char *>(ReductionScratchpadPtr) + 256;
}

DEVICE void SetTeamsReductionScratchpadPtr(void *ScratchpadPtr) {
ReductionScratchpadPtr = ScratchpadPtr;
}
@@ -79,23 +79,6 @@ EXTERN void __kmpc_barrier_simple_spmd(kmp_Ident *loc_ref, int32_t tid) {
PRINT0(LD_SYNC, "completed kmpc_barrier_simple_spmd\n");
}

// Emit a simple barrier call in Generic mode. Assumes the caller is in an L0
// parallel region and that all worker threads participate.
EXTERN void __kmpc_barrier_simple_generic(kmp_Ident *loc_ref, int32_t tid) {
int numberOfActiveOMPThreads = GetNumberOfThreadsInBlock() - WARPSIZE;
// The #threads parameter must be rounded up to the WARPSIZE.
int threads =
WARPSIZE * ((numberOfActiveOMPThreads + WARPSIZE - 1) / WARPSIZE);

PRINT(LD_SYNC,
"call kmpc_barrier_simple_generic with %d omp threads, sync parameter "
"%d\n",
(int)numberOfActiveOMPThreads, (int)threads);
// Barrier #1 is for synchronization among active threads.
__kmpc_impl_named_sync(L1_BARRIER, threads);
PRINT0(LD_SYNC, "completed kmpc_barrier_simple_generic\n");
}

////////////////////////////////////////////////////////////////////////////////
// KMP MASTER
////////////////////////////////////////////////////////////////////////////////
@@ -94,6 +94,5 @@ DEVICE unsigned long PadBytes(unsigned long size, unsigned long alignment);
////////////////////////////////////////////////////////////////////////////////
DEVICE unsigned int *GetTeamsReductionTimestamp();
DEVICE char *GetTeamsReductionScratchpad();
DEVICE void SetTeamsReductionScratchpadPtr(void *ScratchpadPtr);

#endif
@@ -193,17 +193,10 @@ typedef struct ident {

// parallel defs
typedef ident_t kmp_Ident;
typedef void (*kmp_ParFctPtr)(int32_t *global_tid, int32_t *bound_tid, ...);
typedef void (*kmp_ReductFctPtr)(void *lhsData, void *rhsData);
typedef void (*kmp_InterWarpCopyFctPtr)(void *src, int32_t warp_num);
typedef void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id,
int16_t lane_offset,
int16_t shortCircuit);
typedef void (*kmp_CopyToScratchpadFctPtr)(void *reduceData, void *scratchpad,
int32_t index, int32_t width);
typedef void (*kmp_LoadReduceFctPtr)(void *reduceData, void *scratchpad,
int32_t index, int32_t width,
int32_t reduce);
typedef void (*kmp_ListGlobalFctPtr)(void *buffer, int idx, void *reduce_data);

// task defs
@@ -227,12 +220,6 @@ typedef int32_t kmp_CriticalName[8];
EXTERN int32_t __kmpc_global_thread_num(kmp_Ident *loc);
EXTERN void __kmpc_push_num_threads(kmp_Ident *loc, int32_t global_tid,
int32_t num_threads);
// simd
EXTERN void __kmpc_push_simd_limit(kmp_Ident *loc, int32_t global_tid,
int32_t simd_limit);
// aee ... not supported
// EXTERN void __kmpc_fork_call(kmp_Ident *loc, int32_t argc, kmp_ParFctPtr
// microtask, ...);
EXTERN void __kmpc_serialized_parallel(kmp_Ident *loc, uint32_t global_tid);
EXTERN void __kmpc_end_serialized_parallel(kmp_Ident *loc,
uint32_t global_tid);
@@ -354,61 +341,25 @@ EXTERN void __kmpc_dispatch_fini_4u(kmp_Ident *loc, int32_t global_tid);
EXTERN void __kmpc_dispatch_fini_8(kmp_Ident *loc, int32_t global_tid);
EXTERN void __kmpc_dispatch_fini_8u(kmp_Ident *loc, int32_t global_tid);

// Support for reducing conditional lastprivate variables
EXTERN void __kmpc_reduce_conditional_lastprivate(kmp_Ident *loc,
int32_t global_tid,
int32_t varNum, void *array);

// reduction
EXTERN void __kmpc_nvptx_end_reduce(int32_t global_tid);
EXTERN void __kmpc_nvptx_end_reduce_nowait(int32_t global_tid);
EXTERN __attribute__((deprecated)) int32_t __kmpc_nvptx_parallel_reduce_nowait(
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct);
EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
kmp_Ident *loc, int32_t global_tid, int32_t num_vars, size_t reduce_size,
void *reduce_data, kmp_ShuffleReductFctPtr shflFct,
kmp_InterWarpCopyFctPtr cpyFct);
EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_spmd(
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct);
EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_generic(
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct);
EXTERN int32_t __kmpc_nvptx_simd_reduce_nowait(
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct);
EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
kmp_Ident *loc, int32_t global_tid, void *global_buffer,
int32_t num_of_records, void *reduce_data, kmp_ShuffleReductFctPtr shflFct,
kmp_InterWarpCopyFctPtr cpyFct, kmp_ListGlobalFctPtr lgcpyFct,
kmp_ListGlobalFctPtr lgredFct, kmp_ListGlobalFctPtr glcpyFct,
kmp_ListGlobalFctPtr glredFct);
EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait(
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
kmp_CopyToScratchpadFctPtr sratchFct, kmp_LoadReduceFctPtr ldFct);
EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple_spmd(
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
kmp_CopyToScratchpadFctPtr sratchFct, kmp_LoadReduceFctPtr ldFct);
EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple_generic(
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
kmp_CopyToScratchpadFctPtr sratchFct, kmp_LoadReduceFctPtr ldFct);
EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple(kmp_Ident *loc,
int32_t global_tid,
kmp_CriticalName *crit);
EXTERN void __kmpc_nvptx_teams_end_reduce_nowait_simple(kmp_Ident *loc,
int32_t global_tid,
kmp_CriticalName *crit);
EXTERN int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size);
EXTERN int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size);

// sync barrier
EXTERN void __kmpc_barrier(kmp_Ident *loc_ref, int32_t tid);
EXTERN void __kmpc_barrier_simple_spmd(kmp_Ident *loc_ref, int32_t tid);
EXTERN void __kmpc_barrier_simple_generic(kmp_Ident *loc_ref, int32_t tid);
EXTERN int32_t __kmpc_cancel_barrier(kmp_Ident *loc, int32_t global_tid);

// single
@@ -468,29 +419,16 @@ EXTERN int32_t __kmpc_cancel(kmp_Ident *loc, int32_t global_tid,
int32_t cancelVal);

// non standard
EXTERN void __kmpc_kernel_init_params(void *ReductionScratchpadPtr);
EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime);
EXTERN void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime,
int16_t RequiresDataSharing);
EXTERN __attribute__((deprecated)) void __kmpc_spmd_kernel_deinit();
EXTERN void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime);
EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn,
int16_t IsOMPRuntimeInitialized);
EXTERN bool __kmpc_kernel_parallel(void **WorkFn,
int16_t IsOMPRuntimeInitialized);
EXTERN void __kmpc_kernel_end_parallel();
EXTERN bool __kmpc_kernel_convergent_parallel(void *buffer,
__kmpc_impl_lanemask_t Mask,
bool *IsFinal,
int32_t *LaneSource);
EXTERN void __kmpc_kernel_end_convergent_parallel(void *buffer);
EXTERN bool __kmpc_kernel_convergent_simd(void *buffer,
__kmpc_impl_lanemask_t Mask,
bool *IsFinal, int32_t *LaneSource,
int32_t *LaneId, int32_t *NumLanes);
EXTERN void __kmpc_kernel_end_convergent_simd(void *buffer);


EXTERN void __kmpc_data_sharing_init_stack();
EXTERN void __kmpc_data_sharing_init_stack_spmd();
@@ -512,22 +450,6 @@ struct __kmpc_data_sharing_slot {
void *DataEnd;
char Data[];
};
EXTERN void
__kmpc_initialize_data_sharing_environment(__kmpc_data_sharing_slot *RootS,
size_t InitialDataSize);
EXTERN void *__kmpc_data_sharing_environment_begin(
__kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack,
void **SavedSharedFrame, __kmpc_impl_lanemask_t *SavedActiveThreads,
size_t SharingDataSize, size_t SharingDefaultDataSize,
int16_t IsOMPRuntimeInitialized);
EXTERN void __kmpc_data_sharing_environment_end(
__kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack,
void **SavedSharedFrame, __kmpc_impl_lanemask_t *SavedActiveThreads,
int32_t IsEntryPoint);

EXTERN void *
__kmpc_get_data_sharing_environment_frame(int32_t SourceThreadID,
int16_t IsOMPRuntimeInitialized);

// SPMD execution mode interrogation function.
EXTERN int8_t __kmpc_is_spmd_exec_mode();
@@ -195,15 +195,6 @@ INLINE int GetNumberOfThreadsInBlock() { return blockDim.x; }
INLINE unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; }
INLINE unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); }

// Return true if this is the first active thread in the warp.
INLINE bool __kmpc_impl_is_first_active_thread() {
unsigned long long Mask = __kmpc_impl_activemask();
unsigned long long ShNum = WARPSIZE - (GetThreadIdInBlock() % WARPSIZE);
unsigned long long Sh = Mask << ShNum;
// Truncate Sh to the 32 lower bits
return (unsigned)Sh == 0;
}

// Locks
EXTERN void __kmpc_impl_init_lock(omp_lock_t *lock);
EXTERN void __kmpc_impl_destroy_lock(omp_lock_t *lock);
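The removed __kmpc_impl_is_first_active_thread() above shifts the active mask left by WARPSIZE minus the lane index and truncates the result to 32 bits, which is non-zero exactly when some lower-numbered lane is active. A small C++ sketch of the equivalent check (is_first_active_lane is an illustrative name, not part of the runtime):

#include <cassert>
#include <cstdint>

// True when no lane below Lane is set in ActiveMask, i.e. this lane is the
// first active one in its warp.
static bool is_first_active_lane(uint32_t ActiveMask, unsigned Lane) {
  uint32_t LowerLanes = (Lane == 0) ? 0u : ActiveMask & ((1u << Lane) - 1);
  return LowerLanes == 0;
}

int main() {
  assert(is_first_active_lane(0b1010, 1));   // lane 1 is the lowest active lane
  assert(!is_first_active_lane(0b1010, 3));  // lane 1 is active below lane 3
  return 0;
}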