[OpenMP][NFC] Remove unused and untested code from the device runtime

Summary:
We carried a lot of unused and untested code in the device runtime. One
reason to remove it, among others, is that we are planning major rewrites
for which the reduced size is going to help a lot.

The number of code lines is reduced by 14%!

Before:
-------------------------------------------------------------------------------
Language                     files          blank        comment           code
-------------------------------------------------------------------------------
CUDA                            13            489            841           2454
C/C++ Header                    14            322            493           1377
C                               12            117            124            559
CMake                            4             64             64            262
C++                              1              6              6             39
-------------------------------------------------------------------------------
SUM:                            44            998           1528           4691
-------------------------------------------------------------------------------

After:
-------------------------------------------------------------------------------
Language                     files          blank        comment           code
-------------------------------------------------------------------------------
CUDA                            13            366            733           1879
C/C++ Header                    14            317            484           1293
C                               12            117            124            559
CMake                            4             64             64            262
C++                              1              6              6             39
-------------------------------------------------------------------------------
SUM:                            44            870           1411           4032
-------------------------------------------------------------------------------
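
A quick check of the SUM rows confirms the headline number: (4691 - 4032) / 4691 ~ 14.0%
fewer lines of code. The tables look like plain cloc summaries; assuming the device
runtime sources are counted from their directory in the repository (the exact path,
e.g. openmp/libomptarget/deviceRTLs, is an assumption and not stated in this commit),
a comparable report can be regenerated with:

    cloc openmp/libomptarget/deviceRTLs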

Reviewers: hfinkel, jhuber6, fghanim, JonChesterfield, grokos, AndreyChurbanov, ye-luo, tianshilei1992, ggeorgakoudis, Hahnfeld, ABataev, hbae, ronlieb, gregrodgers

Subscribers: jvesely, yaxunl, bollu, guansong, jfb, sstefan1, aaron.ballman, openmp-commits, cfe-commits

Tags: #clang, #openmp

Differential Revision: https://reviews.llvm.org/D83349
Johannes Doerfert 2020-07-07 16:44:33 -05:00
parent 0f0c5af3db
commit cd0ea03e6f
14 changed files with 0 additions and 905 deletions

@@ -78,7 +78,6 @@ int bar(int n){
// CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0, i16 0)
// CHECK-NOT: call void @__kmpc_for_static_init
// CHECK-NOT: call void @__kmpc_for_static_fini
// CHECK-NOT: call i32 @__kmpc_nvptx_simd_reduce_nowait(
// CHECK-NOT: call void @__kmpc_nvptx_end_reduce_nowait(
// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
// CHECK: ret void

@@ -140,8 +140,6 @@ DEVICE int GetNumberOfThreadsInBlock();
DEVICE unsigned GetWarpId();
DEVICE unsigned GetLaneId();
DEVICE bool __kmpc_impl_is_first_active_thread();
// Locks
DEVICE void __kmpc_impl_init_lock(omp_lock_t *lock);
DEVICE void __kmpc_impl_destroy_lock(omp_lock_t *lock);

@@ -200,7 +200,6 @@ public:
INLINE omptarget_nvptx_WorkDescr &WorkDescr() {
return workDescrForActiveParallel;
}
INLINE uint64_t *getLastprivateIterBuffer() { return &lastprivateIterBuffer; }
// init
INLINE void InitTeamDescr();
@@ -251,7 +250,6 @@ private:
levelZeroTaskDescr; // icv for team master initial thread
omptarget_nvptx_WorkDescr
workDescrForActiveParallel; // one, ONLY for the active par
uint64_t lastprivateIterBuffer;
ALIGN(16)
__kmpc_data_sharing_worker_slot_static worker_rootS[WARPSIZE];
@@ -277,10 +275,6 @@ public:
INLINE uint16_t &NumThreadsForNextParallel(int tid) {
return nextRegion.tnum[tid];
}
// simd
INLINE uint16_t &SimdLimitForNextSimd(int tid) {
return nextRegion.slim[tid];
}
// schedule (for dispatch)
INLINE kmp_sched_t &ScheduleType(int tid) { return schedule[tid]; }
INLINE int64_t &Chunk(int tid) { return chunk[tid]; }
@@ -304,8 +298,6 @@ private:
// Only one of the two is live at the same time.
// parallel
uint16_t tnum[MAX_THREADS_PER_TEAM];
// simd limit
uint16_t slim[MAX_THREADS_PER_TEAM];
} nextRegion;
// schedule (for dispatch)
kmp_sched_t schedule[MAX_THREADS_PER_TEAM]; // remember schedule type for #for

@@ -17,297 +17,6 @@ INLINE static bool IsMasterThread(bool isSPMDExecutionMode) {
return !isSPMDExecutionMode && GetMasterThreadID() == GetThreadIdInBlock();
}
/// Return the provided size aligned to the size of a pointer.
INLINE static size_t AlignVal(size_t Val) {
const size_t Align = (size_t)sizeof(void *);
if (Val & (Align - 1)) {
Val += Align;
Val &= ~(Align - 1);
}
return Val;
}
#define DSFLAG 0
#define DSFLAG_INIT 0
#define DSPRINT(_flag, _str, _args...) \
{ \
if (_flag) { \
/*printf("(%d,%d) -> " _str, blockIdx.x, threadIdx.x, _args);*/ \
} \
}
#define DSPRINT0(_flag, _str) \
{ \
if (_flag) { \
/*printf("(%d,%d) -> " _str, blockIdx.x, threadIdx.x);*/ \
} \
}
// Initialize the shared data structures. This is expected to be called for the
// master thread and warp masters. \param RootS: A pointer to the root of the
// data sharing stack. \param InitialDataSize: The initial size of the data in
// the slot.
EXTERN void
__kmpc_initialize_data_sharing_environment(__kmpc_data_sharing_slot *rootS,
size_t InitialDataSize) {
ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized.");
DSPRINT0(DSFLAG_INIT,
"Entering __kmpc_initialize_data_sharing_environment\n");
unsigned WID = GetWarpId();
DSPRINT(DSFLAG_INIT, "Warp ID: %u\n", WID);
omptarget_nvptx_TeamDescr *teamDescr =
&omptarget_nvptx_threadPrivateContext->TeamContext();
__kmpc_data_sharing_slot *RootS =
teamDescr->RootS(WID, IsMasterThread(isSPMDMode()));
DataSharingState.SlotPtr[WID] = RootS;
DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0];
// We don't need to initialize the frame and active threads.
DSPRINT(DSFLAG_INIT, "Initial data size: %08x \n", (unsigned)InitialDataSize);
DSPRINT(DSFLAG_INIT, "Root slot at: %016llx \n", (unsigned long long)RootS);
DSPRINT(DSFLAG_INIT, "Root slot data-end at: %016llx \n",
(unsigned long long)RootS->DataEnd);
DSPRINT(DSFLAG_INIT, "Root slot next at: %016llx \n",
(unsigned long long)RootS->Next);
DSPRINT(DSFLAG_INIT, "Shared slot ptr at: %016llx \n",
(unsigned long long)DataSharingState.SlotPtr[WID]);
DSPRINT(DSFLAG_INIT, "Shared stack ptr at: %016llx \n",
(unsigned long long)DataSharingState.StackPtr[WID]);
DSPRINT0(DSFLAG_INIT, "Exiting __kmpc_initialize_data_sharing_environment\n");
}
EXTERN void *__kmpc_data_sharing_environment_begin(
__kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack,
void **SavedSharedFrame, __kmpc_impl_lanemask_t *SavedActiveThreads,
size_t SharingDataSize, size_t SharingDefaultDataSize,
int16_t IsOMPRuntimeInitialized) {
DSPRINT0(DSFLAG, "Entering __kmpc_data_sharing_environment_begin\n");
// If the runtime has been elided, used shared memory for master-worker
// data sharing.
if (!IsOMPRuntimeInitialized)
return (void *)&DataSharingState;
DSPRINT(DSFLAG, "Data Size %016llx\n", (unsigned long long)SharingDataSize);
DSPRINT(DSFLAG, "Default Data Size %016llx\n",
(unsigned long long)SharingDefaultDataSize);
unsigned WID = GetWarpId();
__kmpc_impl_lanemask_t CurActiveThreads = __kmpc_impl_activemask();
__kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
void *&StackP = DataSharingState.StackPtr[WID];
void * volatile &FrameP = DataSharingState.FramePtr[WID];
__kmpc_impl_lanemask_t &ActiveT = DataSharingState.ActiveThreads[WID];
DSPRINT0(DSFLAG, "Save current slot/stack values.\n");
// Save the current values.
*SavedSharedSlot = SlotP;
*SavedSharedStack = StackP;
*SavedSharedFrame = FrameP;
*SavedActiveThreads = ActiveT;
DSPRINT(DSFLAG, "Warp ID: %u\n", WID);
DSPRINT(DSFLAG, "Saved slot ptr at: %016llx \n", (unsigned long long)SlotP);
DSPRINT(DSFLAG, "Saved stack ptr at: %016llx \n", (unsigned long long)StackP);
DSPRINT(DSFLAG, "Saved frame ptr at: %016llx \n", (long long)FrameP);
DSPRINT(DSFLAG, "Active threads: %08x \n", (unsigned)ActiveT);
// Only the warp active master needs to grow the stack.
if (__kmpc_impl_is_first_active_thread()) {
// Save the current active threads.
ActiveT = CurActiveThreads;
// Make sure we use aligned sizes to avoid rematerialization of data.
SharingDataSize = AlignVal(SharingDataSize);
// FIXME: The default data size can be assumed to be aligned?
SharingDefaultDataSize = AlignVal(SharingDefaultDataSize);
// Check if we have room for the data in the current slot.
const uintptr_t CurrentStartAddress = (uintptr_t)StackP;
const uintptr_t CurrentEndAddress = (uintptr_t)SlotP->DataEnd;
const uintptr_t RequiredEndAddress =
CurrentStartAddress + (uintptr_t)SharingDataSize;
DSPRINT(DSFLAG, "Data Size %016llx\n", (unsigned long long)SharingDataSize);
DSPRINT(DSFLAG, "Default Data Size %016llx\n",
(unsigned long long)SharingDefaultDataSize);
DSPRINT(DSFLAG, "Current Start Address %016llx\n",
(unsigned long long)CurrentStartAddress);
DSPRINT(DSFLAG, "Current End Address %016llx\n",
(unsigned long long)CurrentEndAddress);
DSPRINT(DSFLAG, "Required End Address %016llx\n",
(unsigned long long)RequiredEndAddress);
DSPRINT(DSFLAG, "Active Threads %08x\n", (unsigned)ActiveT);
// If we require a new slot, allocate it and initialize it (or attempt to
// reuse one). Also, set the shared stack and slot pointers to the new
// place. If we do not need to grow the stack, just adapt the stack and
// frame pointers.
if (CurrentEndAddress < RequiredEndAddress) {
size_t NewSize = (SharingDataSize > SharingDefaultDataSize)
? SharingDataSize
: SharingDefaultDataSize;
__kmpc_data_sharing_slot *NewSlot = 0;
// Attempt to reuse an existing slot.
if (__kmpc_data_sharing_slot *ExistingSlot = SlotP->Next) {
uintptr_t ExistingSlotSize = (uintptr_t)ExistingSlot->DataEnd -
(uintptr_t)(&ExistingSlot->Data[0]);
if (ExistingSlotSize >= NewSize) {
DSPRINT(DSFLAG, "Reusing stack slot %016llx\n",
(unsigned long long)ExistingSlot);
NewSlot = ExistingSlot;
} else {
DSPRINT(DSFLAG, "Cleaning up -failed reuse - %016llx\n",
(unsigned long long)SlotP->Next);
SafeFree(ExistingSlot, "Failed reuse");
}
}
if (!NewSlot) {
NewSlot = (__kmpc_data_sharing_slot *)SafeMalloc(
sizeof(__kmpc_data_sharing_slot) + NewSize,
"Warp master slot allocation");
DSPRINT(DSFLAG, "New slot allocated %016llx (data size=%016llx)\n",
(unsigned long long)NewSlot, NewSize);
}
NewSlot->Next = 0;
NewSlot->DataEnd = &NewSlot->Data[NewSize];
SlotP->Next = NewSlot;
SlotP = NewSlot;
StackP = &NewSlot->Data[SharingDataSize];
FrameP = &NewSlot->Data[0];
} else {
// Clean up any old slot that we may still have. The slot producers, do
// not eliminate them because that may be used to return data.
if (SlotP->Next) {
DSPRINT(DSFLAG, "Cleaning up - old not required - %016llx\n",
(unsigned long long)SlotP->Next);
SafeFree(SlotP->Next, "Old slot not required");
SlotP->Next = 0;
}
FrameP = StackP;
StackP = (void *)RequiredEndAddress;
}
}
// FIXME: Need to see the impact of doing it here.
__kmpc_impl_threadfence_block();
DSPRINT0(DSFLAG, "Exiting __kmpc_data_sharing_environment_begin\n");
// All the threads in this warp get the frame they should work with.
return FrameP;
}
EXTERN void __kmpc_data_sharing_environment_end(
__kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack,
void **SavedSharedFrame, __kmpc_impl_lanemask_t *SavedActiveThreads,
int32_t IsEntryPoint) {
DSPRINT0(DSFLAG, "Entering __kmpc_data_sharing_environment_end\n");
unsigned WID = GetWarpId();
if (IsEntryPoint) {
if (__kmpc_impl_is_first_active_thread()) {
DSPRINT0(DSFLAG, "Doing clean up\n");
// The master thread cleans the saved slot, because this is an environment
// only for the master.
__kmpc_data_sharing_slot *S = IsMasterThread(isSPMDMode())
? *SavedSharedSlot
: DataSharingState.SlotPtr[WID];
if (S->Next) {
SafeFree(S->Next, "Sharing environment end");
S->Next = 0;
}
}
DSPRINT0(DSFLAG, "Exiting Exiting __kmpc_data_sharing_environment_end\n");
return;
}
__kmpc_impl_lanemask_t CurActive = __kmpc_impl_activemask();
// Only the warp master can restore the stack and frame information, and only
// if there are no other threads left behind in this environment (i.e. the
// warp diverged and returns in different places). This only works if we
// assume that threads will converge right after the call site that started
// the environment.
if (__kmpc_impl_is_first_active_thread()) {
__kmpc_impl_lanemask_t &ActiveT = DataSharingState.ActiveThreads[WID];
DSPRINT0(DSFLAG, "Before restoring the stack\n");
// Zero the bits in the mask. If it is still different from zero, then we
// have other threads that will return after the current ones.
ActiveT &= ~CurActive;
DSPRINT(DSFLAG, "Active threads: %08x; New mask: %08x\n",
(unsigned)CurActive, (unsigned)ActiveT);
if (!ActiveT) {
// No other active threads? Great, lets restore the stack.
__kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
void *&StackP = DataSharingState.StackPtr[WID];
void * volatile &FrameP = DataSharingState.FramePtr[WID];
SlotP = *SavedSharedSlot;
StackP = *SavedSharedStack;
FrameP = *SavedSharedFrame;
ActiveT = *SavedActiveThreads;
DSPRINT(DSFLAG, "Restored slot ptr at: %016llx \n",
(unsigned long long)SlotP);
DSPRINT(DSFLAG, "Restored stack ptr at: %016llx \n",
(unsigned long long)StackP);
DSPRINT(DSFLAG, "Restored frame ptr at: %016llx \n",
(unsigned long long)FrameP);
DSPRINT(DSFLAG, "Active threads: %08x \n", (unsigned)ActiveT);
}
}
// FIXME: Need to see the impact of doing it here.
__kmpc_impl_threadfence_block();
DSPRINT0(DSFLAG, "Exiting __kmpc_data_sharing_environment_end\n");
return;
}
EXTERN void *
__kmpc_get_data_sharing_environment_frame(int32_t SourceThreadID,
int16_t IsOMPRuntimeInitialized) {
DSPRINT0(DSFLAG, "Entering __kmpc_get_data_sharing_environment_frame\n");
// If the runtime has been elided, use shared memory for master-worker
// data sharing. We're reusing the statically allocated data structure
// that is used for standard data sharing.
if (!IsOMPRuntimeInitialized)
return (void *)&DataSharingState;
// Get the frame used by the requested thread.
unsigned SourceWID = SourceThreadID / WARPSIZE;
DSPRINT(DSFLAG, "Source warp: %u\n", SourceWID);
void * volatile P = DataSharingState.FramePtr[SourceWID];
DSPRINT0(DSFLAG, "Exiting __kmpc_get_data_sharing_environment_frame\n");
return P;
}
////////////////////////////////////////////////////////////////////////////////
// Runtime functions for trunk data sharing scheme.
////////////////////////////////////////////////////////////////////////////////

@@ -362,53 +362,3 @@ EXTERN int omp_test_lock(omp_lock_t *lock) {
PRINT(LD_IO, "call omp_test_lock() return %d\n", rc);
return rc;
}
// for xlf Fortran
// Fortran, the return is LOGICAL type
#define FLOGICAL long
EXTERN FLOGICAL __xlf_omp_is_initial_device_i8() {
int ret = omp_is_initial_device();
if (ret == 0)
return (FLOGICAL)0;
else
return (FLOGICAL)1;
}
EXTERN int __xlf_omp_is_initial_device_i4() {
int ret = omp_is_initial_device();
if (ret == 0)
return 0;
else
return 1;
}
EXTERN long __xlf_omp_get_team_num_i4() {
int ret = omp_get_team_num();
return (long)ret;
}
EXTERN long __xlf_omp_get_num_teams_i4() {
int ret = omp_get_num_teams();
return (long)ret;
}
EXTERN void xlf_debug_print_int(int *p) {
printf("xlf DEBUG %d): %p %d\n", omp_get_team_num(), p, p == 0 ? 0 : *p);
}
EXTERN void xlf_debug_print_long(long *p) {
printf("xlf DEBUG %d): %p %ld\n", omp_get_team_num(), p, p == 0 ? 0 : *p);
}
EXTERN void xlf_debug_print_float(float *p) {
printf("xlf DEBUG %d): %p %f\n", omp_get_team_num(), p, p == 0 ? 0 : *p);
}
EXTERN void xlf_debug_print_double(double *p) {
printf("xlf DEBUG %d): %p %f\n", omp_get_team_num(), p, p == 0 ? 0 : *p);
}
EXTERN void xlf_debug_print_addr(void *p) {
printf("xlf DEBUG %d): %p \n", omp_get_team_num(), p);
}

@@ -754,55 +754,3 @@ void __kmpc_for_static_init_8u_simple_generic(
EXTERN void __kmpc_for_static_fini(kmp_Ident *loc, int32_t global_tid) {
PRINT0(LD_IO, "call kmpc_for_static_fini\n");
}
namespace {
INLINE void syncWorkersInGenericMode(uint32_t NumThreads) {
int NumWarps = ((NumThreads + WARPSIZE - 1) / WARPSIZE);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
// On Volta and newer architectures we require that all lanes in
// a warp (at least, all present for the kernel launch) participate in the
// barrier. This is enforced when launching the parallel region. An
// exception is when there are < WARPSIZE workers. In this case only 1 worker
// is started, so we don't need a barrier.
if (NumThreads > 1) {
#endif
__kmpc_impl_named_sync(L1_BARRIER, WARPSIZE * NumWarps);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
}
#endif
}
}; // namespace
EXTERN void __kmpc_reduce_conditional_lastprivate(kmp_Ident *loc, int32_t gtid,
int32_t varNum, void *array) {
PRINT0(LD_IO, "call to __kmpc_reduce_conditional_lastprivate(...)\n");
ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc),
"Expected non-SPMD mode + initialized runtime.");
omptarget_nvptx_TeamDescr &teamDescr = getMyTeamDescriptor();
uint32_t NumThreads = GetNumberOfOmpThreads(checkSPMDMode(loc));
uint64_t *Buffer = teamDescr.getLastprivateIterBuffer();
for (unsigned i = 0; i < varNum; i++) {
// Reset buffer.
if (gtid == 0)
*Buffer = 0; // Reset to minimum loop iteration value.
// Barrier.
syncWorkersInGenericMode(NumThreads);
// Atomic max of iterations.
uint64_t *varArray = (uint64_t *)array;
uint64_t elem = varArray[i];
(void)__kmpc_atomic_max((unsigned long long int *)Buffer,
(unsigned long long int)elem);
// Barrier.
syncWorkersInGenericMode(NumThreads);
// Read max value and update thread private array.
varArray[i] = *Buffer;
// Barrier.
syncWorkersInGenericMode(NumThreads);
}
}

@@ -25,13 +25,6 @@ extern DEVICE
// init entry points
////////////////////////////////////////////////////////////////////////////////
EXTERN void __kmpc_kernel_init_params(void *Ptr) {
PRINT(LD_IO, "call to __kmpc_kernel_init_params with version %f\n",
OMPTARGET_NVPTX_VERSION);
SetTeamsReductionScratchpadPtr(Ptr);
}
EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime) {
PRINT(LD_IO, "call to __kmpc_kernel_init with version %f\n",
OMPTARGET_NVPTX_VERSION);
@@ -152,10 +145,6 @@ EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime,
}
}
EXTERN __attribute__((deprecated)) void __kmpc_spmd_kernel_deinit() {
__kmpc_spmd_kernel_deinit_v2(isRuntimeInitialized());
}
EXTERN void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime) {
// We're not going to pop the task descr stack of each thread since
// there are no more parallel regions in SPMD mode.

@@ -35,161 +35,6 @@
#include "common/omptarget.h"
#include "target_impl.h"
typedef struct ConvergentSimdJob {
omptarget_nvptx_TaskDescr taskDescr;
omptarget_nvptx_TaskDescr *convHeadTaskDescr;
uint16_t slimForNextSimd;
} ConvergentSimdJob;
////////////////////////////////////////////////////////////////////////////////
// support for convergent simd (team of threads in a warp only)
////////////////////////////////////////////////////////////////////////////////
EXTERN bool __kmpc_kernel_convergent_simd(void *buffer,
__kmpc_impl_lanemask_t Mask,
bool *IsFinal, int32_t *LaneSource,
int32_t *LaneId, int32_t *NumLanes) {
PRINT0(LD_IO, "call to __kmpc_kernel_convergent_simd\n");
__kmpc_impl_lanemask_t ConvergentMask = Mask;
int32_t ConvergentSize = __kmpc_impl_popc(ConvergentMask);
__kmpc_impl_lanemask_t WorkRemaining = ConvergentMask >> (*LaneSource + 1);
*LaneSource += __kmpc_impl_ffs(WorkRemaining);
*IsFinal = __kmpc_impl_popc(WorkRemaining) == 1;
__kmpc_impl_lanemask_t lanemask_lt = __kmpc_impl_lanemask_lt();
*LaneId = __kmpc_impl_popc(ConvergentMask & lanemask_lt);
int threadId = GetLogicalThreadIdInBlock(isSPMDMode());
int sourceThreadId = (threadId & ~(WARPSIZE - 1)) + *LaneSource;
ConvergentSimdJob *job = (ConvergentSimdJob *)buffer;
int32_t SimdLimit =
omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId);
job->slimForNextSimd = SimdLimit;
int32_t SimdLimitSource = __kmpc_impl_shfl_sync(Mask, SimdLimit, *LaneSource);
// reset simdlimit to avoid propagating to successive #simd
if (SimdLimitSource > 0 && threadId == sourceThreadId)
omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId) = 0;
// We cannot have more than the # of convergent threads.
if (SimdLimitSource > 0)
*NumLanes = __kmpc_impl_min(ConvergentSize, SimdLimitSource);
else
*NumLanes = ConvergentSize;
ASSERT(LT_FUSSY, *NumLanes > 0, "bad thread request of %d threads",
(int)*NumLanes);
// Set to true for lanes participating in the simd region.
bool isActive = false;
// Initialize state for active threads.
if (*LaneId < *NumLanes) {
omptarget_nvptx_TaskDescr *currTaskDescr =
omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
omptarget_nvptx_TaskDescr *sourceTaskDescr =
omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(
sourceThreadId);
job->convHeadTaskDescr = currTaskDescr;
// install top descriptor from the thread for which the lanes are working.
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
sourceTaskDescr);
isActive = true;
}
// requires a memory fence between threads of a warp
return isActive;
}
EXTERN void __kmpc_kernel_end_convergent_simd(void *buffer) {
PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_convergent_parallel\n");
// pop stack
int threadId = GetLogicalThreadIdInBlock(isSPMDMode());
ConvergentSimdJob *job = (ConvergentSimdJob *)buffer;
omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId) =
job->slimForNextSimd;
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
threadId, job->convHeadTaskDescr);
}
typedef struct ConvergentParallelJob {
omptarget_nvptx_TaskDescr taskDescr;
omptarget_nvptx_TaskDescr *convHeadTaskDescr;
uint16_t tnumForNextPar;
} ConvergentParallelJob;
////////////////////////////////////////////////////////////////////////////////
// support for convergent parallelism (team of threads in a warp only)
////////////////////////////////////////////////////////////////////////////////
EXTERN bool __kmpc_kernel_convergent_parallel(void *buffer,
__kmpc_impl_lanemask_t Mask,
bool *IsFinal,
int32_t *LaneSource) {
PRINT0(LD_IO, "call to __kmpc_kernel_convergent_parallel\n");
__kmpc_impl_lanemask_t ConvergentMask = Mask;
int32_t ConvergentSize = __kmpc_impl_popc(ConvergentMask);
__kmpc_impl_lanemask_t WorkRemaining = ConvergentMask >> (*LaneSource + 1);
*LaneSource += __kmpc_impl_ffs(WorkRemaining);
*IsFinal = __kmpc_impl_popc(WorkRemaining) == 1;
__kmpc_impl_lanemask_t lanemask_lt = __kmpc_impl_lanemask_lt();
uint32_t OmpId = __kmpc_impl_popc(ConvergentMask & lanemask_lt);
int threadId = GetLogicalThreadIdInBlock(isSPMDMode());
int sourceThreadId = (threadId & ~(WARPSIZE - 1)) + *LaneSource;
ConvergentParallelJob *job = (ConvergentParallelJob *)buffer;
int32_t NumThreadsClause =
omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId);
job->tnumForNextPar = NumThreadsClause;
int32_t NumThreadsSource =
__kmpc_impl_shfl_sync(Mask, NumThreadsClause, *LaneSource);
// reset numthreads to avoid propagating to successive #parallel
if (NumThreadsSource > 0 && threadId == sourceThreadId)
omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId) =
0;
// We cannot have more than the # of convergent threads.
uint16_t NumThreads;
if (NumThreadsSource > 0)
NumThreads = __kmpc_impl_min(ConvergentSize, NumThreadsSource);
else
NumThreads = ConvergentSize;
ASSERT(LT_FUSSY, NumThreads > 0, "bad thread request of %d threads",
(int)NumThreads);
// Set to true for workers participating in the parallel region.
bool isActive = false;
// Initialize state for active threads.
if (OmpId < NumThreads) {
// init L2 task descriptor and storage for the L1 parallel task descriptor.
omptarget_nvptx_TaskDescr *newTaskDescr = &job->taskDescr;
ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr");
omptarget_nvptx_TaskDescr *currTaskDescr =
omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
omptarget_nvptx_TaskDescr *sourceTaskDescr =
omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(
sourceThreadId);
job->convHeadTaskDescr = currTaskDescr;
newTaskDescr->CopyConvergentParent(sourceTaskDescr, OmpId, NumThreads);
// install new top descriptor
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
newTaskDescr);
isActive = true;
}
// requires a memory fence between threads of a warp
return isActive;
}
EXTERN void __kmpc_kernel_end_convergent_parallel(void *buffer) {
PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_convergent_parallel\n");
// pop stack
int threadId = GetLogicalThreadIdInBlock(isSPMDMode());
ConvergentParallelJob *job = (ConvergentParallelJob *)buffer;
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
threadId, job->convHeadTaskDescr);
omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId) =
job->tnumForNextPar;
}
////////////////////////////////////////////////////////////////////////////////
// support for parallel that goes parallel (1 static level only)
////////////////////////////////////////////////////////////////////////////////
@@ -446,14 +291,6 @@ EXTERN void __kmpc_push_num_threads(kmp_Ident *loc, int32_t tid,
num_threads;
}
EXTERN void __kmpc_push_simd_limit(kmp_Ident *loc, int32_t tid,
int32_t simd_limit) {
PRINT(LD_IO, "call kmpc_push_simd_limit %d\n", (int)simd_limit);
ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), "Runtime must be initialized.");
tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(tid) = simd_limit;
}
// Do nothing. The host guarantees we started the requested number of
// teams and we only need inspection of gridDim.

@@ -73,22 +73,6 @@ gpu_irregular_simd_reduce(void *reduce_data, kmp_ShuffleReductFctPtr shflFct) {
return (logical_lane_id == 0);
}
EXTERN
int32_t __kmpc_nvptx_simd_reduce_nowait(int32_t global_tid, int32_t num_vars,
size_t reduce_size, void *reduce_data,
kmp_ShuffleReductFctPtr shflFct,
kmp_InterWarpCopyFctPtr cpyFct) {
__kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask();
if (Liveness == __kmpc_impl_all_lanes) {
gpu_regular_warp_reduce(reduce_data, shflFct);
return GetThreadIdInBlock() % WARPSIZE ==
0; // Result on lane 0 of the simd warp.
} else {
return gpu_irregular_simd_reduce(
reduce_data, shflFct); // Result on the first active lane.
}
}
INLINE
static int32_t nvptx_parallel_reduce_nowait(
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
@@ -177,14 +161,6 @@ static int32_t nvptx_parallel_reduce_nowait(
#endif // __CUDA_ARCH__ >= 700
}
EXTERN __attribute__((deprecated)) int32_t __kmpc_nvptx_parallel_reduce_nowait(
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) {
return nvptx_parallel_reduce_nowait(global_tid, num_vars, reduce_size,
reduce_data, shflFct, cpyFct,
isSPMDMode(), isRuntimeUninitialized());
}
EXTERN
int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
kmp_Ident *loc, int32_t global_tid, int32_t num_vars, size_t reduce_size,
@@ -195,201 +171,6 @@ int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
checkSPMDMode(loc), checkRuntimeUninitialized(loc));
}
EXTERN
int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_spmd(
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) {
return nvptx_parallel_reduce_nowait(
global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct,
/*isSPMDExecutionMode=*/true, /*isRuntimeUninitialized=*/true);
}
EXTERN
int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_generic(
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) {
return nvptx_parallel_reduce_nowait(
global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct,
/*isSPMDExecutionMode=*/false, /*isRuntimeUninitialized=*/true);
}
INLINE
static int32_t nvptx_teams_reduce_nowait(int32_t global_tid, int32_t num_vars,
size_t reduce_size, void *reduce_data,
kmp_ShuffleReductFctPtr shflFct,
kmp_InterWarpCopyFctPtr cpyFct,
kmp_CopyToScratchpadFctPtr scratchFct,
kmp_LoadReduceFctPtr ldFct,
bool isSPMDExecutionMode) {
uint32_t ThreadId = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
// In non-generic mode all workers participate in the teams reduction.
// In generic mode only the team master participates in the teams
// reduction because the workers are waiting for parallel work.
uint32_t NumThreads =
isSPMDExecutionMode ? GetNumberOfOmpThreads(/*isSPMDExecutionMode=*/true)
: /*Master thread only*/ 1;
uint32_t TeamId = GetBlockIdInKernel();
uint32_t NumTeams = GetNumberOfBlocksInKernel();
static SHARED volatile bool IsLastTeam;
// Team masters of all teams write to the scratchpad.
if (ThreadId == 0) {
unsigned int *timestamp = GetTeamsReductionTimestamp();
char *scratchpad = GetTeamsReductionScratchpad();
scratchFct(reduce_data, scratchpad, TeamId, NumTeams);
__kmpc_impl_threadfence();
// atomicInc increments 'timestamp' and has a range [0, NumTeams-1].
// It resets 'timestamp' back to 0 once the last team increments
// this counter.
unsigned val = __kmpc_atomic_inc(timestamp, NumTeams - 1);
IsLastTeam = val == NumTeams - 1;
}
// We have to wait on L1 barrier because in GENERIC mode the workers
// are waiting on barrier 0 for work.
//
// If we guard this barrier as follows it leads to deadlock, probably
// because of a compiler bug: if (!IsGenericMode()) __syncthreads();
uint16_t SyncWarps = (NumThreads + WARPSIZE - 1) / WARPSIZE;
__kmpc_impl_named_sync(L1_BARRIER, SyncWarps * WARPSIZE);
// If this team is not the last, quit.
if (/* Volatile read by all threads */ !IsLastTeam)
return 0;
//
// Last team processing.
//
// Threads in excess of #teams do not participate in reduction of the
// scratchpad values.
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
uint32_t ActiveThreads = NumThreads;
if (NumTeams < NumThreads) {
ActiveThreads =
(NumTeams < WARPSIZE) ? 1 : NumTeams & ~((uint16_t)WARPSIZE - 1);
}
if (ThreadId >= ActiveThreads)
return 0;
// Load from scratchpad and reduce.
char *scratchpad = GetTeamsReductionScratchpad();
ldFct(reduce_data, scratchpad, ThreadId, NumTeams, /*Load only*/ 0);
for (uint32_t i = ActiveThreads + ThreadId; i < NumTeams; i += ActiveThreads)
ldFct(reduce_data, scratchpad, i, NumTeams, /*Load and reduce*/ 1);
uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE;
uint32_t WarpId = ThreadId / WARPSIZE;
// Reduce across warps to the warp master.
if ((ActiveThreads % WARPSIZE == 0) ||
(WarpId < WarpsNeeded - 1)) // Full warp
gpu_regular_warp_reduce(reduce_data, shflFct);
else if (ActiveThreads > 1) // Partial warp but contiguous lanes
// Only SPMD execution mode comes thru this case.
gpu_irregular_warp_reduce(reduce_data, shflFct,
/*LaneCount=*/ActiveThreads % WARPSIZE,
/*LaneId=*/ThreadId % WARPSIZE);
// When we have more than [warpsize] number of threads
// a block reduction is performed here.
if (ActiveThreads > WARPSIZE) {
// Gather all the reduced values from each warp
// to the first warp.
cpyFct(reduce_data, WarpsNeeded);
if (WarpId == 0)
gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, ThreadId);
}
#else
if (ThreadId >= NumTeams)
return 0;
// Load from scratchpad and reduce.
char *scratchpad = GetTeamsReductionScratchpad();
ldFct(reduce_data, scratchpad, ThreadId, NumTeams, /*Load only*/ 0);
for (uint32_t i = NumThreads + ThreadId; i < NumTeams; i += NumThreads)
ldFct(reduce_data, scratchpad, i, NumTeams, /*Load and reduce*/ 1);
// Reduce across warps to the warp master.
__kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask();
if (Liveness == __kmpc_impl_all_lanes) // Full warp
gpu_regular_warp_reduce(reduce_data, shflFct);
else // Partial warp but contiguous lanes
gpu_irregular_warp_reduce(reduce_data, shflFct,
/*LaneCount=*/__kmpc_impl_popc(Liveness),
/*LaneId=*/ThreadId % WARPSIZE);
// When we have more than [warpsize] number of threads
// a block reduction is performed here.
uint32_t ActiveThreads = NumTeams < NumThreads ? NumTeams : NumThreads;
if (ActiveThreads > WARPSIZE) {
uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE;
// Gather all the reduced values from each warp
// to the first warp.
cpyFct(reduce_data, WarpsNeeded);
uint32_t WarpId = ThreadId / WARPSIZE;
if (WarpId == 0)
gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, ThreadId);
}
#endif // __CUDA_ARCH__ >= 700
return ThreadId == 0;
}
EXTERN
int32_t __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid, int32_t num_vars,
size_t reduce_size, void *reduce_data,
kmp_ShuffleReductFctPtr shflFct,
kmp_InterWarpCopyFctPtr cpyFct,
kmp_CopyToScratchpadFctPtr scratchFct,
kmp_LoadReduceFctPtr ldFct) {
return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size,
reduce_data, shflFct, cpyFct, scratchFct,
ldFct, isSPMDMode());
}
EXTERN
int32_t __kmpc_nvptx_teams_reduce_nowait_simple_spmd(
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct) {
return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size,
reduce_data, shflFct, cpyFct, scratchFct,
ldFct, /*isSPMDExecutionMode=*/true);
}
EXTERN
int32_t __kmpc_nvptx_teams_reduce_nowait_simple_generic(
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct) {
return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size,
reduce_data, shflFct, cpyFct, scratchFct,
ldFct, /*isSPMDExecutionMode=*/false);
}
EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple(kmp_Ident *loc,
int32_t global_tid,
kmp_CriticalName *crit) {
if (checkSPMDMode(loc) && GetThreadIdInBlock() != 0)
return 0;
// The master thread of the team actually does the reduction.
while (__kmpc_atomic_cas((uint32_t *)crit, 0u, 1u))
;
return 1;
}
EXTERN void
__kmpc_nvptx_teams_end_reduce_nowait_simple(kmp_Ident *loc, int32_t global_tid,
kmp_CriticalName *crit) {
__kmpc_impl_threadfence_system();
(void)__kmpc_atomic_exchange((uint32_t *)crit, 0u);
}
INLINE static bool isMaster(kmp_Ident *loc, uint32_t ThreadId) {
return checkGenericMode(loc) || IsTeamMaster(ThreadId);
}

@@ -264,6 +264,3 @@ DEVICE char *GetTeamsReductionScratchpad() {
return static_cast<char *>(ReductionScratchpadPtr) + 256;
}
DEVICE void SetTeamsReductionScratchpadPtr(void *ScratchpadPtr) {
ReductionScratchpadPtr = ScratchpadPtr;
}

@@ -79,23 +79,6 @@ EXTERN void __kmpc_barrier_simple_spmd(kmp_Ident *loc_ref, int32_t tid) {
PRINT0(LD_SYNC, "completed kmpc_barrier_simple_spmd\n");
}
// Emit a simple barrier call in Generic mode. Assumes the caller is in an L0
// parallel region and that all worker threads participate.
EXTERN void __kmpc_barrier_simple_generic(kmp_Ident *loc_ref, int32_t tid) {
int numberOfActiveOMPThreads = GetNumberOfThreadsInBlock() - WARPSIZE;
// The #threads parameter must be rounded up to the WARPSIZE.
int threads =
WARPSIZE * ((numberOfActiveOMPThreads + WARPSIZE - 1) / WARPSIZE);
PRINT(LD_SYNC,
"call kmpc_barrier_simple_generic with %d omp threads, sync parameter "
"%d\n",
(int)numberOfActiveOMPThreads, (int)threads);
// Barrier #1 is for synchronization among active threads.
__kmpc_impl_named_sync(L1_BARRIER, threads);
PRINT0(LD_SYNC, "completed kmpc_barrier_simple_generic\n");
}
////////////////////////////////////////////////////////////////////////////////
// KMP MASTER
////////////////////////////////////////////////////////////////////////////////

@@ -94,6 +94,5 @@ DEVICE unsigned long PadBytes(unsigned long size, unsigned long alignment);
////////////////////////////////////////////////////////////////////////////////
DEVICE unsigned int *GetTeamsReductionTimestamp();
DEVICE char *GetTeamsReductionScratchpad();
DEVICE void SetTeamsReductionScratchpadPtr(void *ScratchpadPtr);
#endif

@@ -193,17 +193,10 @@ typedef struct ident {
// parallel defs
typedef ident_t kmp_Ident;
typedef void (*kmp_ParFctPtr)(int32_t *global_tid, int32_t *bound_tid, ...);
typedef void (*kmp_ReductFctPtr)(void *lhsData, void *rhsData);
typedef void (*kmp_InterWarpCopyFctPtr)(void *src, int32_t warp_num);
typedef void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id,
int16_t lane_offset,
int16_t shortCircuit);
typedef void (*kmp_CopyToScratchpadFctPtr)(void *reduceData, void *scratchpad,
int32_t index, int32_t width);
typedef void (*kmp_LoadReduceFctPtr)(void *reduceData, void *scratchpad,
int32_t index, int32_t width,
int32_t reduce);
typedef void (*kmp_ListGlobalFctPtr)(void *buffer, int idx, void *reduce_data);
// task defs
@@ -227,12 +220,6 @@ typedef int32_t kmp_CriticalName[8];
EXTERN int32_t __kmpc_global_thread_num(kmp_Ident *loc);
EXTERN void __kmpc_push_num_threads(kmp_Ident *loc, int32_t global_tid,
int32_t num_threads);
// simd
EXTERN void __kmpc_push_simd_limit(kmp_Ident *loc, int32_t global_tid,
int32_t simd_limit);
// aee ... not supported
// EXTERN void __kmpc_fork_call(kmp_Ident *loc, int32_t argc, kmp_ParFctPtr
// microtask, ...);
EXTERN void __kmpc_serialized_parallel(kmp_Ident *loc, uint32_t global_tid);
EXTERN void __kmpc_end_serialized_parallel(kmp_Ident *loc,
uint32_t global_tid);
@@ -354,61 +341,25 @@ EXTERN void __kmpc_dispatch_fini_4u(kmp_Ident *loc, int32_t global_tid);
EXTERN void __kmpc_dispatch_fini_8(kmp_Ident *loc, int32_t global_tid);
EXTERN void __kmpc_dispatch_fini_8u(kmp_Ident *loc, int32_t global_tid);
// Support for reducing conditional lastprivate variables
EXTERN void __kmpc_reduce_conditional_lastprivate(kmp_Ident *loc,
int32_t global_tid,
int32_t varNum, void *array);
// reduction
EXTERN void __kmpc_nvptx_end_reduce(int32_t global_tid);
EXTERN void __kmpc_nvptx_end_reduce_nowait(int32_t global_tid);
EXTERN __attribute__((deprecated)) int32_t __kmpc_nvptx_parallel_reduce_nowait(
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct);
EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
kmp_Ident *loc, int32_t global_tid, int32_t num_vars, size_t reduce_size,
void *reduce_data, kmp_ShuffleReductFctPtr shflFct,
kmp_InterWarpCopyFctPtr cpyFct);
EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_spmd(
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct);
EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_generic(
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct);
EXTERN int32_t __kmpc_nvptx_simd_reduce_nowait(
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct);
EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
kmp_Ident *loc, int32_t global_tid, void *global_buffer,
int32_t num_of_records, void *reduce_data, kmp_ShuffleReductFctPtr shflFct,
kmp_InterWarpCopyFctPtr cpyFct, kmp_ListGlobalFctPtr lgcpyFct,
kmp_ListGlobalFctPtr lgredFct, kmp_ListGlobalFctPtr glcpyFct,
kmp_ListGlobalFctPtr glredFct);
EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait(
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
kmp_CopyToScratchpadFctPtr sratchFct, kmp_LoadReduceFctPtr ldFct);
EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple_spmd(
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
kmp_CopyToScratchpadFctPtr sratchFct, kmp_LoadReduceFctPtr ldFct);
EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple_generic(
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
kmp_CopyToScratchpadFctPtr sratchFct, kmp_LoadReduceFctPtr ldFct);
EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple(kmp_Ident *loc,
int32_t global_tid,
kmp_CriticalName *crit);
EXTERN void __kmpc_nvptx_teams_end_reduce_nowait_simple(kmp_Ident *loc,
int32_t global_tid,
kmp_CriticalName *crit);
EXTERN int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size);
EXTERN int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size);
// sync barrier
EXTERN void __kmpc_barrier(kmp_Ident *loc_ref, int32_t tid);
EXTERN void __kmpc_barrier_simple_spmd(kmp_Ident *loc_ref, int32_t tid);
EXTERN void __kmpc_barrier_simple_generic(kmp_Ident *loc_ref, int32_t tid);
EXTERN int32_t __kmpc_cancel_barrier(kmp_Ident *loc, int32_t global_tid);
// single
@@ -468,29 +419,16 @@ EXTERN int32_t __kmpc_cancel(kmp_Ident *loc, int32_t global_tid,
int32_t cancelVal);
// non standard
EXTERN void __kmpc_kernel_init_params(void *ReductionScratchpadPtr);
EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime);
EXTERN void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime,
int16_t RequiresDataSharing);
EXTERN __attribute__((deprecated)) void __kmpc_spmd_kernel_deinit();
EXTERN void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime);
EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn,
int16_t IsOMPRuntimeInitialized);
EXTERN bool __kmpc_kernel_parallel(void **WorkFn,
int16_t IsOMPRuntimeInitialized);
EXTERN void __kmpc_kernel_end_parallel();
EXTERN bool __kmpc_kernel_convergent_parallel(void *buffer,
__kmpc_impl_lanemask_t Mask,
bool *IsFinal,
int32_t *LaneSource);
EXTERN void __kmpc_kernel_end_convergent_parallel(void *buffer);
EXTERN bool __kmpc_kernel_convergent_simd(void *buffer,
__kmpc_impl_lanemask_t Mask,
bool *IsFinal, int32_t *LaneSource,
int32_t *LaneId, int32_t *NumLanes);
EXTERN void __kmpc_kernel_end_convergent_simd(void *buffer);
EXTERN void __kmpc_data_sharing_init_stack();
EXTERN void __kmpc_data_sharing_init_stack_spmd();
@@ -512,22 +450,6 @@ struct __kmpc_data_sharing_slot {
void *DataEnd;
char Data[];
};
EXTERN void
__kmpc_initialize_data_sharing_environment(__kmpc_data_sharing_slot *RootS,
size_t InitialDataSize);
EXTERN void *__kmpc_data_sharing_environment_begin(
__kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack,
void **SavedSharedFrame, __kmpc_impl_lanemask_t *SavedActiveThreads,
size_t SharingDataSize, size_t SharingDefaultDataSize,
int16_t IsOMPRuntimeInitialized);
EXTERN void __kmpc_data_sharing_environment_end(
__kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack,
void **SavedSharedFrame, __kmpc_impl_lanemask_t *SavedActiveThreads,
int32_t IsEntryPoint);
EXTERN void *
__kmpc_get_data_sharing_environment_frame(int32_t SourceThreadID,
int16_t IsOMPRuntimeInitialized);
// SPMD execution mode interrogation function.
EXTERN int8_t __kmpc_is_spmd_exec_mode();

@@ -195,15 +195,6 @@ INLINE int GetNumberOfThreadsInBlock() { return blockDim.x; }
INLINE unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; }
INLINE unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); }
// Return true if this is the first active thread in the warp.
INLINE bool __kmpc_impl_is_first_active_thread() {
unsigned long long Mask = __kmpc_impl_activemask();
unsigned long long ShNum = WARPSIZE - (GetThreadIdInBlock() % WARPSIZE);
unsigned long long Sh = Mask << ShNum;
// Truncate Sh to the 32 lower bits
return (unsigned)Sh == 0;
}
// Locks
EXTERN void __kmpc_impl_init_lock(omp_lock_t *lock);
EXTERN void __kmpc_impl_destroy_lock(omp_lock_t *lock);