[OpenMP][NFC] Remove unused and untested code from the device runtime
Summary: We carried a lot of unused and untested code in the device runtime. Among other reasons, we are planning major rewrites, for which a reduced size is going to help a lot. This change reduces the number of code lines by 14%.

Before:
-------------------------------------------------------------------------------
Language                     files          blank        comment           code
-------------------------------------------------------------------------------
CUDA                            13            489            841           2454
C/C++ Header                    14            322            493           1377
C                               12            117            124            559
CMake                            4             64             64            262
C++                              1              6              6             39
-------------------------------------------------------------------------------
SUM:                            44            998           1528           4691
-------------------------------------------------------------------------------

After:
-------------------------------------------------------------------------------
Language                     files          blank        comment           code
-------------------------------------------------------------------------------
CUDA                            13            366            733           1879
C/C++ Header                    14            317            484           1293
C                               12            117            124            559
CMake                            4             64             64            262
C++                             1              6              6             39
-------------------------------------------------------------------------------
SUM:                            44            870           1411           4032
-------------------------------------------------------------------------------

Reviewers: hfinkel, jhuber6, fghanim, JonChesterfield, grokos, AndreyChurbanov, ye-luo, tianshilei1992, ggeorgakoudis, Hahnfeld, ABataev, hbae, ronlieb, gregrodgers

Subscribers: jvesely, yaxunl, bollu, guansong, jfb, sstefan1, aaron.ballman, openmp-commits, cfe-commits

Tags: #clang, #openmp

Differential Revision: https://reviews.llvm.org/D83349
Parent: 0f0c5af3db
Commit: cd0ea03e6f
@@ -78,7 +78,6 @@ int bar(int n){
// CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0, i16 0)
// CHECK-NOT: call void @__kmpc_for_static_init
// CHECK-NOT: call void @__kmpc_for_static_fini
// CHECK-NOT: call i32 @__kmpc_nvptx_simd_reduce_nowait(
// CHECK-NOT: call void @__kmpc_nvptx_end_reduce_nowait(
// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
// CHECK: ret void
@@ -140,8 +140,6 @@ DEVICE int GetNumberOfThreadsInBlock();
DEVICE unsigned GetWarpId();
DEVICE unsigned GetLaneId();

DEVICE bool __kmpc_impl_is_first_active_thread();

// Locks
DEVICE void __kmpc_impl_init_lock(omp_lock_t *lock);
DEVICE void __kmpc_impl_destroy_lock(omp_lock_t *lock);
@@ -200,7 +200,6 @@ public:
INLINE omptarget_nvptx_WorkDescr &WorkDescr() {
return workDescrForActiveParallel;
}
INLINE uint64_t *getLastprivateIterBuffer() { return &lastprivateIterBuffer; }

// init
INLINE void InitTeamDescr();
@@ -251,7 +250,6 @@ private:
levelZeroTaskDescr; // icv for team master initial thread
omptarget_nvptx_WorkDescr
workDescrForActiveParallel; // one, ONLY for the active par
uint64_t lastprivateIterBuffer;

ALIGN(16)
__kmpc_data_sharing_worker_slot_static worker_rootS[WARPSIZE];
@@ -277,10 +275,6 @@ public:
INLINE uint16_t &NumThreadsForNextParallel(int tid) {
return nextRegion.tnum[tid];
}
// simd
INLINE uint16_t &SimdLimitForNextSimd(int tid) {
return nextRegion.slim[tid];
}
// schedule (for dispatch)
INLINE kmp_sched_t &ScheduleType(int tid) { return schedule[tid]; }
INLINE int64_t &Chunk(int tid) { return chunk[tid]; }
@@ -304,8 +298,6 @@ private:
// Only one of the two is live at the same time.
// parallel
uint16_t tnum[MAX_THREADS_PER_TEAM];
// simd limit
uint16_t slim[MAX_THREADS_PER_TEAM];
} nextRegion;
// schedule (for dispatch)
kmp_sched_t schedule[MAX_THREADS_PER_TEAM]; // remember schedule type for #for
@@ -17,297 +17,6 @@ INLINE static bool IsMasterThread(bool isSPMDExecutionMode) {
return !isSPMDExecutionMode && GetMasterThreadID() == GetThreadIdInBlock();
}

/// Return the provided size aligned to the size of a pointer.
INLINE static size_t AlignVal(size_t Val) {
const size_t Align = (size_t)sizeof(void *);
if (Val & (Align - 1)) {
Val += Align;
Val &= ~(Align - 1);
}
return Val;
}

#define DSFLAG 0
#define DSFLAG_INIT 0
#define DSPRINT(_flag, _str, _args...) \
{ \
if (_flag) { \
/*printf("(%d,%d) -> " _str, blockIdx.x, threadIdx.x, _args);*/ \
} \
}
#define DSPRINT0(_flag, _str) \
{ \
if (_flag) { \
/*printf("(%d,%d) -> " _str, blockIdx.x, threadIdx.x);*/ \
} \
}

// Initialize the shared data structures. This is expected to be called for the
// master thread and warp masters. \param RootS: A pointer to the root of the
// data sharing stack. \param InitialDataSize: The initial size of the data in
// the slot.
EXTERN void
__kmpc_initialize_data_sharing_environment(__kmpc_data_sharing_slot *rootS,
size_t InitialDataSize) {
ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized.");
DSPRINT0(DSFLAG_INIT,
"Entering __kmpc_initialize_data_sharing_environment\n");

unsigned WID = GetWarpId();
DSPRINT(DSFLAG_INIT, "Warp ID: %u\n", WID);

omptarget_nvptx_TeamDescr *teamDescr =
&omptarget_nvptx_threadPrivateContext->TeamContext();
__kmpc_data_sharing_slot *RootS =
teamDescr->RootS(WID, IsMasterThread(isSPMDMode()));

DataSharingState.SlotPtr[WID] = RootS;
DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0];

// We don't need to initialize the frame and active threads.

DSPRINT(DSFLAG_INIT, "Initial data size: %08x \n", (unsigned)InitialDataSize);
DSPRINT(DSFLAG_INIT, "Root slot at: %016llx \n", (unsigned long long)RootS);
DSPRINT(DSFLAG_INIT, "Root slot data-end at: %016llx \n",
(unsigned long long)RootS->DataEnd);
DSPRINT(DSFLAG_INIT, "Root slot next at: %016llx \n",
(unsigned long long)RootS->Next);
DSPRINT(DSFLAG_INIT, "Shared slot ptr at: %016llx \n",
(unsigned long long)DataSharingState.SlotPtr[WID]);
DSPRINT(DSFLAG_INIT, "Shared stack ptr at: %016llx \n",
(unsigned long long)DataSharingState.StackPtr[WID]);

DSPRINT0(DSFLAG_INIT, "Exiting __kmpc_initialize_data_sharing_environment\n");
}

EXTERN void *__kmpc_data_sharing_environment_begin(
__kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack,
void **SavedSharedFrame, __kmpc_impl_lanemask_t *SavedActiveThreads,
size_t SharingDataSize, size_t SharingDefaultDataSize,
int16_t IsOMPRuntimeInitialized) {

DSPRINT0(DSFLAG, "Entering __kmpc_data_sharing_environment_begin\n");

// If the runtime has been elided, used shared memory for master-worker
// data sharing.
if (!IsOMPRuntimeInitialized)
return (void *)&DataSharingState;

DSPRINT(DSFLAG, "Data Size %016llx\n", (unsigned long long)SharingDataSize);
DSPRINT(DSFLAG, "Default Data Size %016llx\n",
(unsigned long long)SharingDefaultDataSize);

unsigned WID = GetWarpId();
__kmpc_impl_lanemask_t CurActiveThreads = __kmpc_impl_activemask();

__kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
void *&StackP = DataSharingState.StackPtr[WID];
void * volatile &FrameP = DataSharingState.FramePtr[WID];
__kmpc_impl_lanemask_t &ActiveT = DataSharingState.ActiveThreads[WID];

DSPRINT0(DSFLAG, "Save current slot/stack values.\n");
// Save the current values.
*SavedSharedSlot = SlotP;
*SavedSharedStack = StackP;
*SavedSharedFrame = FrameP;
*SavedActiveThreads = ActiveT;

DSPRINT(DSFLAG, "Warp ID: %u\n", WID);
DSPRINT(DSFLAG, "Saved slot ptr at: %016llx \n", (unsigned long long)SlotP);
DSPRINT(DSFLAG, "Saved stack ptr at: %016llx \n", (unsigned long long)StackP);
DSPRINT(DSFLAG, "Saved frame ptr at: %016llx \n", (long long)FrameP);
DSPRINT(DSFLAG, "Active threads: %08x \n", (unsigned)ActiveT);

// Only the warp active master needs to grow the stack.
if (__kmpc_impl_is_first_active_thread()) {
// Save the current active threads.
ActiveT = CurActiveThreads;

// Make sure we use aligned sizes to avoid rematerialization of data.
SharingDataSize = AlignVal(SharingDataSize);
// FIXME: The default data size can be assumed to be aligned?
SharingDefaultDataSize = AlignVal(SharingDefaultDataSize);

// Check if we have room for the data in the current slot.
const uintptr_t CurrentStartAddress = (uintptr_t)StackP;
const uintptr_t CurrentEndAddress = (uintptr_t)SlotP->DataEnd;
const uintptr_t RequiredEndAddress =
CurrentStartAddress + (uintptr_t)SharingDataSize;

DSPRINT(DSFLAG, "Data Size %016llx\n", (unsigned long long)SharingDataSize);
DSPRINT(DSFLAG, "Default Data Size %016llx\n",
(unsigned long long)SharingDefaultDataSize);
DSPRINT(DSFLAG, "Current Start Address %016llx\n",
(unsigned long long)CurrentStartAddress);
DSPRINT(DSFLAG, "Current End Address %016llx\n",
(unsigned long long)CurrentEndAddress);
DSPRINT(DSFLAG, "Required End Address %016llx\n",
(unsigned long long)RequiredEndAddress);
DSPRINT(DSFLAG, "Active Threads %08x\n", (unsigned)ActiveT);

// If we require a new slot, allocate it and initialize it (or attempt to
// reuse one). Also, set the shared stack and slot pointers to the new
// place. If we do not need to grow the stack, just adapt the stack and
// frame pointers.
if (CurrentEndAddress < RequiredEndAddress) {
size_t NewSize = (SharingDataSize > SharingDefaultDataSize)
? SharingDataSize
: SharingDefaultDataSize;
__kmpc_data_sharing_slot *NewSlot = 0;

// Attempt to reuse an existing slot.
if (__kmpc_data_sharing_slot *ExistingSlot = SlotP->Next) {
uintptr_t ExistingSlotSize = (uintptr_t)ExistingSlot->DataEnd -
(uintptr_t)(&ExistingSlot->Data[0]);
if (ExistingSlotSize >= NewSize) {
DSPRINT(DSFLAG, "Reusing stack slot %016llx\n",
(unsigned long long)ExistingSlot);
NewSlot = ExistingSlot;
} else {
DSPRINT(DSFLAG, "Cleaning up -failed reuse - %016llx\n",
(unsigned long long)SlotP->Next);
SafeFree(ExistingSlot, "Failed reuse");
}
}

if (!NewSlot) {
NewSlot = (__kmpc_data_sharing_slot *)SafeMalloc(
sizeof(__kmpc_data_sharing_slot) + NewSize,
"Warp master slot allocation");
DSPRINT(DSFLAG, "New slot allocated %016llx (data size=%016llx)\n",
(unsigned long long)NewSlot, NewSize);
}

NewSlot->Next = 0;
NewSlot->DataEnd = &NewSlot->Data[NewSize];

SlotP->Next = NewSlot;
SlotP = NewSlot;
StackP = &NewSlot->Data[SharingDataSize];
FrameP = &NewSlot->Data[0];
} else {

// Clean up any old slot that we may still have. The slot producers, do
// not eliminate them because that may be used to return data.
if (SlotP->Next) {
DSPRINT(DSFLAG, "Cleaning up - old not required - %016llx\n",
(unsigned long long)SlotP->Next);
SafeFree(SlotP->Next, "Old slot not required");
SlotP->Next = 0;
}

FrameP = StackP;
StackP = (void *)RequiredEndAddress;
}
}

// FIXME: Need to see the impact of doing it here.
__kmpc_impl_threadfence_block();

DSPRINT0(DSFLAG, "Exiting __kmpc_data_sharing_environment_begin\n");

// All the threads in this warp get the frame they should work with.
return FrameP;
}

EXTERN void __kmpc_data_sharing_environment_end(
__kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack,
void **SavedSharedFrame, __kmpc_impl_lanemask_t *SavedActiveThreads,
int32_t IsEntryPoint) {

DSPRINT0(DSFLAG, "Entering __kmpc_data_sharing_environment_end\n");

unsigned WID = GetWarpId();

if (IsEntryPoint) {
if (__kmpc_impl_is_first_active_thread()) {
DSPRINT0(DSFLAG, "Doing clean up\n");

// The master thread cleans the saved slot, because this is an environment
// only for the master.
__kmpc_data_sharing_slot *S = IsMasterThread(isSPMDMode())
? *SavedSharedSlot
: DataSharingState.SlotPtr[WID];

if (S->Next) {
SafeFree(S->Next, "Sharing environment end");
S->Next = 0;
}
}

DSPRINT0(DSFLAG, "Exiting Exiting __kmpc_data_sharing_environment_end\n");
return;
}

__kmpc_impl_lanemask_t CurActive = __kmpc_impl_activemask();

// Only the warp master can restore the stack and frame information, and only
// if there are no other threads left behind in this environment (i.e. the
// warp diverged and returns in different places). This only works if we
// assume that threads will converge right after the call site that started
// the environment.
if (__kmpc_impl_is_first_active_thread()) {
__kmpc_impl_lanemask_t &ActiveT = DataSharingState.ActiveThreads[WID];

DSPRINT0(DSFLAG, "Before restoring the stack\n");
// Zero the bits in the mask. If it is still different from zero, then we
// have other threads that will return after the current ones.
ActiveT &= ~CurActive;

DSPRINT(DSFLAG, "Active threads: %08x; New mask: %08x\n",
(unsigned)CurActive, (unsigned)ActiveT);

if (!ActiveT) {
// No other active threads? Great, lets restore the stack.

__kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
void *&StackP = DataSharingState.StackPtr[WID];
void * volatile &FrameP = DataSharingState.FramePtr[WID];

SlotP = *SavedSharedSlot;
StackP = *SavedSharedStack;
FrameP = *SavedSharedFrame;
ActiveT = *SavedActiveThreads;

DSPRINT(DSFLAG, "Restored slot ptr at: %016llx \n",
(unsigned long long)SlotP);
DSPRINT(DSFLAG, "Restored stack ptr at: %016llx \n",
(unsigned long long)StackP);
DSPRINT(DSFLAG, "Restored frame ptr at: %016llx \n",
(unsigned long long)FrameP);
DSPRINT(DSFLAG, "Active threads: %08x \n", (unsigned)ActiveT);
}
}

// FIXME: Need to see the impact of doing it here.
__kmpc_impl_threadfence_block();

DSPRINT0(DSFLAG, "Exiting __kmpc_data_sharing_environment_end\n");
return;
}

EXTERN void *
__kmpc_get_data_sharing_environment_frame(int32_t SourceThreadID,
int16_t IsOMPRuntimeInitialized) {
DSPRINT0(DSFLAG, "Entering __kmpc_get_data_sharing_environment_frame\n");

// If the runtime has been elided, use shared memory for master-worker
// data sharing. We're reusing the statically allocated data structure
// that is used for standard data sharing.
if (!IsOMPRuntimeInitialized)
return (void *)&DataSharingState;

// Get the frame used by the requested thread.

unsigned SourceWID = SourceThreadID / WARPSIZE;

DSPRINT(DSFLAG, "Source warp: %u\n", SourceWID);

void * volatile P = DataSharingState.FramePtr[SourceWID];
DSPRINT0(DSFLAG, "Exiting __kmpc_get_data_sharing_environment_frame\n");
return P;
}

////////////////////////////////////////////////////////////////////////////////
// Runtime functions for trunk data sharing scheme.
////////////////////////////////////////////////////////////////////////////////
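The removed AlignVal() helper above rounds a requested size up to the pointer size before the warp master grows the per-warp data-sharing slot. A minimal host-side C++ sketch of that round-up idiom (align_up is an illustrative name, not part of the runtime):

#include <cassert>
#include <cstddef>

// Round Val up to the next multiple of Align, the same bit trick the removed
// AlignVal() uses. Align must be a power of two.
static size_t align_up(size_t Val, size_t Align = sizeof(void *)) {
  if (Val & (Align - 1)) {
    Val += Align;
    Val &= ~(Align - 1);
  }
  return Val;
}

int main() {
  // With an 8-byte alignment: 13 rounds up to 16, 16 stays 16.
  assert(align_up(13, 8) == 16);
  assert(align_up(16, 8) == 16);
  return 0;
}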
@@ -362,53 +362,3 @@ EXTERN int omp_test_lock(omp_lock_t *lock) {
PRINT(LD_IO, "call omp_test_lock() return %d\n", rc);
return rc;
}

// for xlf Fortran
// Fortran, the return is LOGICAL type

#define FLOGICAL long
EXTERN FLOGICAL __xlf_omp_is_initial_device_i8() {
int ret = omp_is_initial_device();
if (ret == 0)
return (FLOGICAL)0;
else
return (FLOGICAL)1;
}

EXTERN int __xlf_omp_is_initial_device_i4() {
int ret = omp_is_initial_device();
if (ret == 0)
return 0;
else
return 1;
}

EXTERN long __xlf_omp_get_team_num_i4() {
int ret = omp_get_team_num();
return (long)ret;
}

EXTERN long __xlf_omp_get_num_teams_i4() {
int ret = omp_get_num_teams();
return (long)ret;
}

EXTERN void xlf_debug_print_int(int *p) {
printf("xlf DEBUG %d): %p %d\n", omp_get_team_num(), p, p == 0 ? 0 : *p);
}

EXTERN void xlf_debug_print_long(long *p) {
printf("xlf DEBUG %d): %p %ld\n", omp_get_team_num(), p, p == 0 ? 0 : *p);
}

EXTERN void xlf_debug_print_float(float *p) {
printf("xlf DEBUG %d): %p %f\n", omp_get_team_num(), p, p == 0 ? 0 : *p);
}

EXTERN void xlf_debug_print_double(double *p) {
printf("xlf DEBUG %d): %p %f\n", omp_get_team_num(), p, p == 0 ? 0 : *p);
}

EXTERN void xlf_debug_print_addr(void *p) {
printf("xlf DEBUG %d): %p \n", omp_get_team_num(), p);
}
@@ -754,55 +754,3 @@ void __kmpc_for_static_init_8u_simple_generic(
EXTERN void __kmpc_for_static_fini(kmp_Ident *loc, int32_t global_tid) {
PRINT0(LD_IO, "call kmpc_for_static_fini\n");
}

namespace {
INLINE void syncWorkersInGenericMode(uint32_t NumThreads) {
int NumWarps = ((NumThreads + WARPSIZE - 1) / WARPSIZE);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
// On Volta and newer architectures we require that all lanes in
// a warp (at least, all present for the kernel launch) participate in the
// barrier. This is enforced when launching the parallel region. An
// exception is when there are < WARPSIZE workers. In this case only 1 worker
// is started, so we don't need a barrier.
if (NumThreads > 1) {
#endif
__kmpc_impl_named_sync(L1_BARRIER, WARPSIZE * NumWarps);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
}
#endif
}
}; // namespace

EXTERN void __kmpc_reduce_conditional_lastprivate(kmp_Ident *loc, int32_t gtid,
int32_t varNum, void *array) {
PRINT0(LD_IO, "call to __kmpc_reduce_conditional_lastprivate(...)\n");
ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc),
"Expected non-SPMD mode + initialized runtime.");

omptarget_nvptx_TeamDescr &teamDescr = getMyTeamDescriptor();
uint32_t NumThreads = GetNumberOfOmpThreads(checkSPMDMode(loc));
uint64_t *Buffer = teamDescr.getLastprivateIterBuffer();
for (unsigned i = 0; i < varNum; i++) {
// Reset buffer.
if (gtid == 0)
*Buffer = 0; // Reset to minimum loop iteration value.

// Barrier.
syncWorkersInGenericMode(NumThreads);

// Atomic max of iterations.
uint64_t *varArray = (uint64_t *)array;
uint64_t elem = varArray[i];
(void)__kmpc_atomic_max((unsigned long long int *)Buffer,
(unsigned long long int)elem);

// Barrier.
syncWorkersInGenericMode(NumThreads);

// Read max value and update thread private array.
varArray[i] = *Buffer;

// Barrier.
syncWorkersInGenericMode(NumThreads);
}
}
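For context, the removed __kmpc_reduce_conditional_lastprivate walks each lastprivate variable through a reset / atomic-max / read sequence separated by team barriers, so the thread that executed the highest loop iteration wins. A hedged C++ sketch of the atomic-max step (the device runtime calls __kmpc_atomic_max; atomic_max below is an illustrative stand-in using a CAS loop):

#include <atomic>
#include <cstdint>

// Raise Buffer to at least Value; the largest published value survives.
static void atomic_max(std::atomic<uint64_t> &Buffer, uint64_t Value) {
  uint64_t Cur = Buffer.load();
  // compare_exchange_weak reloads Cur on failure, so the loop converges.
  while (Cur < Value && !Buffer.compare_exchange_weak(Cur, Value)) {
  }
}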
@@ -25,13 +25,6 @@ extern DEVICE
// init entry points
////////////////////////////////////////////////////////////////////////////////

EXTERN void __kmpc_kernel_init_params(void *Ptr) {
PRINT(LD_IO, "call to __kmpc_kernel_init_params with version %f\n",
OMPTARGET_NVPTX_VERSION);

SetTeamsReductionScratchpadPtr(Ptr);
}

EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime) {
PRINT(LD_IO, "call to __kmpc_kernel_init with version %f\n",
OMPTARGET_NVPTX_VERSION);
@@ -152,10 +145,6 @@ EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime,
}
}

EXTERN __attribute__((deprecated)) void __kmpc_spmd_kernel_deinit() {
__kmpc_spmd_kernel_deinit_v2(isRuntimeInitialized());
}

EXTERN void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime) {
// We're not going to pop the task descr stack of each thread since
// there are no more parallel regions in SPMD mode.
@@ -35,161 +35,6 @@
#include "common/omptarget.h"
#include "target_impl.h"

typedef struct ConvergentSimdJob {
omptarget_nvptx_TaskDescr taskDescr;
omptarget_nvptx_TaskDescr *convHeadTaskDescr;
uint16_t slimForNextSimd;
} ConvergentSimdJob;

////////////////////////////////////////////////////////////////////////////////
// support for convergent simd (team of threads in a warp only)
////////////////////////////////////////////////////////////////////////////////
EXTERN bool __kmpc_kernel_convergent_simd(void *buffer,
__kmpc_impl_lanemask_t Mask,
bool *IsFinal, int32_t *LaneSource,
int32_t *LaneId, int32_t *NumLanes) {
PRINT0(LD_IO, "call to __kmpc_kernel_convergent_simd\n");
__kmpc_impl_lanemask_t ConvergentMask = Mask;
int32_t ConvergentSize = __kmpc_impl_popc(ConvergentMask);
__kmpc_impl_lanemask_t WorkRemaining = ConvergentMask >> (*LaneSource + 1);
*LaneSource += __kmpc_impl_ffs(WorkRemaining);
*IsFinal = __kmpc_impl_popc(WorkRemaining) == 1;
__kmpc_impl_lanemask_t lanemask_lt = __kmpc_impl_lanemask_lt();
*LaneId = __kmpc_impl_popc(ConvergentMask & lanemask_lt);

int threadId = GetLogicalThreadIdInBlock(isSPMDMode());
int sourceThreadId = (threadId & ~(WARPSIZE - 1)) + *LaneSource;

ConvergentSimdJob *job = (ConvergentSimdJob *)buffer;
int32_t SimdLimit =
omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId);
job->slimForNextSimd = SimdLimit;

int32_t SimdLimitSource = __kmpc_impl_shfl_sync(Mask, SimdLimit, *LaneSource);
// reset simdlimit to avoid propagating to successive #simd
if (SimdLimitSource > 0 && threadId == sourceThreadId)
omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId) = 0;

// We cannot have more than the # of convergent threads.
if (SimdLimitSource > 0)
*NumLanes = __kmpc_impl_min(ConvergentSize, SimdLimitSource);
else
*NumLanes = ConvergentSize;
ASSERT(LT_FUSSY, *NumLanes > 0, "bad thread request of %d threads",
(int)*NumLanes);

// Set to true for lanes participating in the simd region.
bool isActive = false;
// Initialize state for active threads.
if (*LaneId < *NumLanes) {
omptarget_nvptx_TaskDescr *currTaskDescr =
omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
omptarget_nvptx_TaskDescr *sourceTaskDescr =
omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(
sourceThreadId);
job->convHeadTaskDescr = currTaskDescr;
// install top descriptor from the thread for which the lanes are working.
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
sourceTaskDescr);
isActive = true;
}

// requires a memory fence between threads of a warp
return isActive;
}

EXTERN void __kmpc_kernel_end_convergent_simd(void *buffer) {
PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_convergent_parallel\n");
// pop stack
int threadId = GetLogicalThreadIdInBlock(isSPMDMode());
ConvergentSimdJob *job = (ConvergentSimdJob *)buffer;
omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId) =
job->slimForNextSimd;
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
threadId, job->convHeadTaskDescr);
}

typedef struct ConvergentParallelJob {
omptarget_nvptx_TaskDescr taskDescr;
omptarget_nvptx_TaskDescr *convHeadTaskDescr;
uint16_t tnumForNextPar;
} ConvergentParallelJob;

////////////////////////////////////////////////////////////////////////////////
// support for convergent parallelism (team of threads in a warp only)
////////////////////////////////////////////////////////////////////////////////
EXTERN bool __kmpc_kernel_convergent_parallel(void *buffer,
__kmpc_impl_lanemask_t Mask,
bool *IsFinal,
int32_t *LaneSource) {
PRINT0(LD_IO, "call to __kmpc_kernel_convergent_parallel\n");
__kmpc_impl_lanemask_t ConvergentMask = Mask;
int32_t ConvergentSize = __kmpc_impl_popc(ConvergentMask);
__kmpc_impl_lanemask_t WorkRemaining = ConvergentMask >> (*LaneSource + 1);
*LaneSource += __kmpc_impl_ffs(WorkRemaining);
*IsFinal = __kmpc_impl_popc(WorkRemaining) == 1;
__kmpc_impl_lanemask_t lanemask_lt = __kmpc_impl_lanemask_lt();
uint32_t OmpId = __kmpc_impl_popc(ConvergentMask & lanemask_lt);

int threadId = GetLogicalThreadIdInBlock(isSPMDMode());
int sourceThreadId = (threadId & ~(WARPSIZE - 1)) + *LaneSource;

ConvergentParallelJob *job = (ConvergentParallelJob *)buffer;
int32_t NumThreadsClause =
omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId);
job->tnumForNextPar = NumThreadsClause;

int32_t NumThreadsSource =
__kmpc_impl_shfl_sync(Mask, NumThreadsClause, *LaneSource);
// reset numthreads to avoid propagating to successive #parallel
if (NumThreadsSource > 0 && threadId == sourceThreadId)
omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId) =
0;

// We cannot have more than the # of convergent threads.
uint16_t NumThreads;
if (NumThreadsSource > 0)
NumThreads = __kmpc_impl_min(ConvergentSize, NumThreadsSource);
else
NumThreads = ConvergentSize;
ASSERT(LT_FUSSY, NumThreads > 0, "bad thread request of %d threads",
(int)NumThreads);

// Set to true for workers participating in the parallel region.
bool isActive = false;
// Initialize state for active threads.
if (OmpId < NumThreads) {
// init L2 task descriptor and storage for the L1 parallel task descriptor.
omptarget_nvptx_TaskDescr *newTaskDescr = &job->taskDescr;
ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr");
omptarget_nvptx_TaskDescr *currTaskDescr =
omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
omptarget_nvptx_TaskDescr *sourceTaskDescr =
omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(
sourceThreadId);
job->convHeadTaskDescr = currTaskDescr;
newTaskDescr->CopyConvergentParent(sourceTaskDescr, OmpId, NumThreads);
// install new top descriptor
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
newTaskDescr);
isActive = true;
}

// requires a memory fence between threads of a warp
return isActive;
}

EXTERN void __kmpc_kernel_end_convergent_parallel(void *buffer) {
PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_convergent_parallel\n");
// pop stack
int threadId = GetLogicalThreadIdInBlock(isSPMDMode());
ConvergentParallelJob *job = (ConvergentParallelJob *)buffer;
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
threadId, job->convHeadTaskDescr);
omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId) =
job->tnumForNextPar;
}

////////////////////////////////////////////////////////////////////////////////
// support for parallel that goes parallel (1 static level only)
////////////////////////////////////////////////////////////////////////////////
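Both removed convergent entry points above select the next "source" lane by scanning the active-lane mask: shift away the lanes already served, take the first remaining set bit, and report whether it is the last one. A hedged host-side C++ sketch of that selection step (plain GCC/Clang builtins stand in for __kmpc_impl_ffs and __kmpc_impl_popc; next_lane_source is an illustrative name):

#include <cstdint>

// Advance LaneSource (initially -1) to the next active lane in Mask and set
// IsFinal when that lane is the last one left to serve.
static int32_t next_lane_source(uint32_t Mask, int32_t LaneSource,
                                bool &IsFinal) {
  uint32_t WorkRemaining = Mask >> (LaneSource + 1);
  LaneSource += __builtin_ffs(WorkRemaining);    // 1-based index of next set bit
  IsFinal = __builtin_popcount(WorkRemaining) == 1;
  return LaneSource;
}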
@@ -446,14 +291,6 @@ EXTERN void __kmpc_push_num_threads(kmp_Ident *loc, int32_t tid,
num_threads;
}

EXTERN void __kmpc_push_simd_limit(kmp_Ident *loc, int32_t tid,
int32_t simd_limit) {
PRINT(LD_IO, "call kmpc_push_simd_limit %d\n", (int)simd_limit);
ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), "Runtime must be initialized.");
tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(tid) = simd_limit;
}

// Do nothing. The host guarantees we started the requested number of
// teams and we only need inspection of gridDim.
@@ -73,22 +73,6 @@ gpu_irregular_simd_reduce(void *reduce_data, kmp_ShuffleReductFctPtr shflFct) {
return (logical_lane_id == 0);
}

EXTERN
int32_t __kmpc_nvptx_simd_reduce_nowait(int32_t global_tid, int32_t num_vars,
size_t reduce_size, void *reduce_data,
kmp_ShuffleReductFctPtr shflFct,
kmp_InterWarpCopyFctPtr cpyFct) {
__kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask();
if (Liveness == __kmpc_impl_all_lanes) {
gpu_regular_warp_reduce(reduce_data, shflFct);
return GetThreadIdInBlock() % WARPSIZE ==
0; // Result on lane 0 of the simd warp.
} else {
return gpu_irregular_simd_reduce(
reduce_data, shflFct); // Result on the first active lane.
}
}

INLINE
static int32_t nvptx_parallel_reduce_nowait(
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
@@ -177,14 +161,6 @@ static int32_t nvptx_parallel_reduce_nowait(
#endif // __CUDA_ARCH__ >= 700
}

EXTERN __attribute__((deprecated)) int32_t __kmpc_nvptx_parallel_reduce_nowait(
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) {
return nvptx_parallel_reduce_nowait(global_tid, num_vars, reduce_size,
reduce_data, shflFct, cpyFct,
isSPMDMode(), isRuntimeUninitialized());
}

EXTERN
int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
kmp_Ident *loc, int32_t global_tid, int32_t num_vars, size_t reduce_size,
@@ -195,201 +171,6 @@ int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
checkSPMDMode(loc), checkRuntimeUninitialized(loc));
}

EXTERN
int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_spmd(
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) {
return nvptx_parallel_reduce_nowait(
global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct,
/*isSPMDExecutionMode=*/true, /*isRuntimeUninitialized=*/true);
}

EXTERN
int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_generic(
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) {
return nvptx_parallel_reduce_nowait(
global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct,
/*isSPMDExecutionMode=*/false, /*isRuntimeUninitialized=*/true);
}

INLINE
static int32_t nvptx_teams_reduce_nowait(int32_t global_tid, int32_t num_vars,
size_t reduce_size, void *reduce_data,
kmp_ShuffleReductFctPtr shflFct,
kmp_InterWarpCopyFctPtr cpyFct,
kmp_CopyToScratchpadFctPtr scratchFct,
kmp_LoadReduceFctPtr ldFct,
bool isSPMDExecutionMode) {
uint32_t ThreadId = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
// In non-generic mode all workers participate in the teams reduction.
// In generic mode only the team master participates in the teams
// reduction because the workers are waiting for parallel work.
uint32_t NumThreads =
isSPMDExecutionMode ? GetNumberOfOmpThreads(/*isSPMDExecutionMode=*/true)
: /*Master thread only*/ 1;
uint32_t TeamId = GetBlockIdInKernel();
uint32_t NumTeams = GetNumberOfBlocksInKernel();
static SHARED volatile bool IsLastTeam;

// Team masters of all teams write to the scratchpad.
if (ThreadId == 0) {
unsigned int *timestamp = GetTeamsReductionTimestamp();
char *scratchpad = GetTeamsReductionScratchpad();

scratchFct(reduce_data, scratchpad, TeamId, NumTeams);
__kmpc_impl_threadfence();

// atomicInc increments 'timestamp' and has a range [0, NumTeams-1].
// It resets 'timestamp' back to 0 once the last team increments
// this counter.
unsigned val = __kmpc_atomic_inc(timestamp, NumTeams - 1);
IsLastTeam = val == NumTeams - 1;
}

// We have to wait on L1 barrier because in GENERIC mode the workers
// are waiting on barrier 0 for work.
//
// If we guard this barrier as follows it leads to deadlock, probably
// because of a compiler bug: if (!IsGenericMode()) __syncthreads();
uint16_t SyncWarps = (NumThreads + WARPSIZE - 1) / WARPSIZE;
__kmpc_impl_named_sync(L1_BARRIER, SyncWarps * WARPSIZE);

// If this team is not the last, quit.
if (/* Volatile read by all threads */ !IsLastTeam)
return 0;

//
// Last team processing.
//

// Threads in excess of #teams do not participate in reduction of the
// scratchpad values.
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
uint32_t ActiveThreads = NumThreads;
if (NumTeams < NumThreads) {
ActiveThreads =
(NumTeams < WARPSIZE) ? 1 : NumTeams & ~((uint16_t)WARPSIZE - 1);
}
if (ThreadId >= ActiveThreads)
return 0;

// Load from scratchpad and reduce.
char *scratchpad = GetTeamsReductionScratchpad();
ldFct(reduce_data, scratchpad, ThreadId, NumTeams, /*Load only*/ 0);
for (uint32_t i = ActiveThreads + ThreadId; i < NumTeams; i += ActiveThreads)
ldFct(reduce_data, scratchpad, i, NumTeams, /*Load and reduce*/ 1);

uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE;
uint32_t WarpId = ThreadId / WARPSIZE;

// Reduce across warps to the warp master.
if ((ActiveThreads % WARPSIZE == 0) ||
(WarpId < WarpsNeeded - 1)) // Full warp
gpu_regular_warp_reduce(reduce_data, shflFct);
else if (ActiveThreads > 1) // Partial warp but contiguous lanes
// Only SPMD execution mode comes thru this case.
gpu_irregular_warp_reduce(reduce_data, shflFct,
/*LaneCount=*/ActiveThreads % WARPSIZE,
/*LaneId=*/ThreadId % WARPSIZE);

// When we have more than [warpsize] number of threads
// a block reduction is performed here.
if (ActiveThreads > WARPSIZE) {
// Gather all the reduced values from each warp
// to the first warp.
cpyFct(reduce_data, WarpsNeeded);

if (WarpId == 0)
gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, ThreadId);
}
#else
if (ThreadId >= NumTeams)
return 0;

// Load from scratchpad and reduce.
char *scratchpad = GetTeamsReductionScratchpad();
ldFct(reduce_data, scratchpad, ThreadId, NumTeams, /*Load only*/ 0);
for (uint32_t i = NumThreads + ThreadId; i < NumTeams; i += NumThreads)
ldFct(reduce_data, scratchpad, i, NumTeams, /*Load and reduce*/ 1);

// Reduce across warps to the warp master.
__kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask();
if (Liveness == __kmpc_impl_all_lanes) // Full warp
gpu_regular_warp_reduce(reduce_data, shflFct);
else // Partial warp but contiguous lanes
gpu_irregular_warp_reduce(reduce_data, shflFct,
/*LaneCount=*/__kmpc_impl_popc(Liveness),
/*LaneId=*/ThreadId % WARPSIZE);

// When we have more than [warpsize] number of threads
// a block reduction is performed here.
uint32_t ActiveThreads = NumTeams < NumThreads ? NumTeams : NumThreads;
if (ActiveThreads > WARPSIZE) {
uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE;
// Gather all the reduced values from each warp
// to the first warp.
cpyFct(reduce_data, WarpsNeeded);

uint32_t WarpId = ThreadId / WARPSIZE;
if (WarpId == 0)
gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, ThreadId);
}
#endif // __CUDA_ARCH__ >= 700

return ThreadId == 0;
}

EXTERN
int32_t __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid, int32_t num_vars,
size_t reduce_size, void *reduce_data,
kmp_ShuffleReductFctPtr shflFct,
kmp_InterWarpCopyFctPtr cpyFct,
kmp_CopyToScratchpadFctPtr scratchFct,
kmp_LoadReduceFctPtr ldFct) {
return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size,
reduce_data, shflFct, cpyFct, scratchFct,
ldFct, isSPMDMode());
}

EXTERN
int32_t __kmpc_nvptx_teams_reduce_nowait_simple_spmd(
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct) {
return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size,
reduce_data, shflFct, cpyFct, scratchFct,
ldFct, /*isSPMDExecutionMode=*/true);
}

EXTERN
int32_t __kmpc_nvptx_teams_reduce_nowait_simple_generic(
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct) {
return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size,
reduce_data, shflFct, cpyFct, scratchFct,
ldFct, /*isSPMDExecutionMode=*/false);
}

EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple(kmp_Ident *loc,
int32_t global_tid,
kmp_CriticalName *crit) {
if (checkSPMDMode(loc) && GetThreadIdInBlock() != 0)
return 0;
// The master thread of the team actually does the reduction.
while (__kmpc_atomic_cas((uint32_t *)crit, 0u, 1u))
;
return 1;
}

EXTERN void
__kmpc_nvptx_teams_end_reduce_nowait_simple(kmp_Ident *loc, int32_t global_tid,
kmp_CriticalName *crit) {
__kmpc_impl_threadfence_system();
(void)__kmpc_atomic_exchange((uint32_t *)crit, 0u);
}

INLINE static bool isMaster(kmp_Ident *loc, uint32_t ThreadId) {
return checkGenericMode(loc) || IsTeamMaster(ThreadId);
}
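The removed nvptx_teams_reduce_nowait decides which team performs the final reduction with a shared counter: every team master publishes its partial result to the scratchpad, issues a fence, and bumps the counter; the master that observes the value NumTeams - 1 knows all other teams have already written and carries out the last step. A hedged C++ sketch of that check (the device runtime uses a wrapping __kmpc_atomic_inc so the counter resets to 0 for reuse; a plain fetch_add models the same idea on the host, and is_last_team is an illustrative name):

#include <atomic>

// Returns true for exactly one caller: the one whose ticket is NumTeams - 1,
// i.e. the last team to arrive after all partial results are published.
static bool is_last_team(std::atomic<unsigned> &Timestamp, unsigned NumTeams) {
  unsigned Ticket = Timestamp.fetch_add(1);  // value before the increment
  return Ticket == NumTeams - 1;
}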
@@ -264,6 +264,3 @@ DEVICE char *GetTeamsReductionScratchpad() {
return static_cast<char *>(ReductionScratchpadPtr) + 256;
}

DEVICE void SetTeamsReductionScratchpadPtr(void *ScratchpadPtr) {
ReductionScratchpadPtr = ScratchpadPtr;
}
@@ -79,23 +79,6 @@ EXTERN void __kmpc_barrier_simple_spmd(kmp_Ident *loc_ref, int32_t tid) {
PRINT0(LD_SYNC, "completed kmpc_barrier_simple_spmd\n");
}

// Emit a simple barrier call in Generic mode. Assumes the caller is in an L0
// parallel region and that all worker threads participate.
EXTERN void __kmpc_barrier_simple_generic(kmp_Ident *loc_ref, int32_t tid) {
int numberOfActiveOMPThreads = GetNumberOfThreadsInBlock() - WARPSIZE;
// The #threads parameter must be rounded up to the WARPSIZE.
int threads =
WARPSIZE * ((numberOfActiveOMPThreads + WARPSIZE - 1) / WARPSIZE);

PRINT(LD_SYNC,
"call kmpc_barrier_simple_generic with %d omp threads, sync parameter "
"%d\n",
(int)numberOfActiveOMPThreads, (int)threads);
// Barrier #1 is for synchronization among active threads.
__kmpc_impl_named_sync(L1_BARRIER, threads);
PRINT0(LD_SYNC, "completed kmpc_barrier_simple_generic\n");
}

////////////////////////////////////////////////////////////////////////////////
// KMP MASTER
////////////////////////////////////////////////////////////////////////////////
@@ -94,6 +94,5 @@ DEVICE unsigned long PadBytes(unsigned long size, unsigned long alignment);
////////////////////////////////////////////////////////////////////////////////
DEVICE unsigned int *GetTeamsReductionTimestamp();
DEVICE char *GetTeamsReductionScratchpad();
DEVICE void SetTeamsReductionScratchpadPtr(void *ScratchpadPtr);

#endif
@@ -193,17 +193,10 @@ typedef struct ident {

// parallel defs
typedef ident_t kmp_Ident;
typedef void (*kmp_ParFctPtr)(int32_t *global_tid, int32_t *bound_tid, ...);
typedef void (*kmp_ReductFctPtr)(void *lhsData, void *rhsData);
typedef void (*kmp_InterWarpCopyFctPtr)(void *src, int32_t warp_num);
typedef void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id,
int16_t lane_offset,
int16_t shortCircuit);
typedef void (*kmp_CopyToScratchpadFctPtr)(void *reduceData, void *scratchpad,
int32_t index, int32_t width);
typedef void (*kmp_LoadReduceFctPtr)(void *reduceData, void *scratchpad,
int32_t index, int32_t width,
int32_t reduce);
typedef void (*kmp_ListGlobalFctPtr)(void *buffer, int idx, void *reduce_data);

// task defs
@@ -227,12 +220,6 @@ typedef int32_t kmp_CriticalName[8];
EXTERN int32_t __kmpc_global_thread_num(kmp_Ident *loc);
EXTERN void __kmpc_push_num_threads(kmp_Ident *loc, int32_t global_tid,
int32_t num_threads);
// simd
EXTERN void __kmpc_push_simd_limit(kmp_Ident *loc, int32_t global_tid,
int32_t simd_limit);
// aee ... not supported
// EXTERN void __kmpc_fork_call(kmp_Ident *loc, int32_t argc, kmp_ParFctPtr
// microtask, ...);
EXTERN void __kmpc_serialized_parallel(kmp_Ident *loc, uint32_t global_tid);
EXTERN void __kmpc_end_serialized_parallel(kmp_Ident *loc,
uint32_t global_tid);
@@ -354,61 +341,25 @@ EXTERN void __kmpc_dispatch_fini_4u(kmp_Ident *loc, int32_t global_tid);
EXTERN void __kmpc_dispatch_fini_8(kmp_Ident *loc, int32_t global_tid);
EXTERN void __kmpc_dispatch_fini_8u(kmp_Ident *loc, int32_t global_tid);

// Support for reducing conditional lastprivate variables
EXTERN void __kmpc_reduce_conditional_lastprivate(kmp_Ident *loc,
int32_t global_tid,
int32_t varNum, void *array);

// reduction
EXTERN void __kmpc_nvptx_end_reduce(int32_t global_tid);
EXTERN void __kmpc_nvptx_end_reduce_nowait(int32_t global_tid);
EXTERN __attribute__((deprecated)) int32_t __kmpc_nvptx_parallel_reduce_nowait(
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct);
EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
kmp_Ident *loc, int32_t global_tid, int32_t num_vars, size_t reduce_size,
void *reduce_data, kmp_ShuffleReductFctPtr shflFct,
kmp_InterWarpCopyFctPtr cpyFct);
EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_spmd(
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct);
EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_generic(
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct);
EXTERN int32_t __kmpc_nvptx_simd_reduce_nowait(
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct);
EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
kmp_Ident *loc, int32_t global_tid, void *global_buffer,
int32_t num_of_records, void *reduce_data, kmp_ShuffleReductFctPtr shflFct,
kmp_InterWarpCopyFctPtr cpyFct, kmp_ListGlobalFctPtr lgcpyFct,
kmp_ListGlobalFctPtr lgredFct, kmp_ListGlobalFctPtr glcpyFct,
kmp_ListGlobalFctPtr glredFct);
EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait(
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
kmp_CopyToScratchpadFctPtr sratchFct, kmp_LoadReduceFctPtr ldFct);
EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple_spmd(
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
kmp_CopyToScratchpadFctPtr sratchFct, kmp_LoadReduceFctPtr ldFct);
EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple_generic(
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
kmp_CopyToScratchpadFctPtr sratchFct, kmp_LoadReduceFctPtr ldFct);
EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple(kmp_Ident *loc,
int32_t global_tid,
kmp_CriticalName *crit);
EXTERN void __kmpc_nvptx_teams_end_reduce_nowait_simple(kmp_Ident *loc,
int32_t global_tid,
kmp_CriticalName *crit);
EXTERN int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size);
EXTERN int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size);

// sync barrier
EXTERN void __kmpc_barrier(kmp_Ident *loc_ref, int32_t tid);
EXTERN void __kmpc_barrier_simple_spmd(kmp_Ident *loc_ref, int32_t tid);
EXTERN void __kmpc_barrier_simple_generic(kmp_Ident *loc_ref, int32_t tid);
EXTERN int32_t __kmpc_cancel_barrier(kmp_Ident *loc, int32_t global_tid);

// single
@@ -468,29 +419,16 @@ EXTERN int32_t __kmpc_cancel(kmp_Ident *loc, int32_t global_tid,
int32_t cancelVal);

// non standard
EXTERN void __kmpc_kernel_init_params(void *ReductionScratchpadPtr);
EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime);
EXTERN void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime,
int16_t RequiresDataSharing);
EXTERN __attribute__((deprecated)) void __kmpc_spmd_kernel_deinit();
EXTERN void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime);
EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn,
int16_t IsOMPRuntimeInitialized);
EXTERN bool __kmpc_kernel_parallel(void **WorkFn,
int16_t IsOMPRuntimeInitialized);
EXTERN void __kmpc_kernel_end_parallel();
EXTERN bool __kmpc_kernel_convergent_parallel(void *buffer,
__kmpc_impl_lanemask_t Mask,
bool *IsFinal,
int32_t *LaneSource);
EXTERN void __kmpc_kernel_end_convergent_parallel(void *buffer);
EXTERN bool __kmpc_kernel_convergent_simd(void *buffer,
__kmpc_impl_lanemask_t Mask,
bool *IsFinal, int32_t *LaneSource,
int32_t *LaneId, int32_t *NumLanes);
EXTERN void __kmpc_kernel_end_convergent_simd(void *buffer);


EXTERN void __kmpc_data_sharing_init_stack();
EXTERN void __kmpc_data_sharing_init_stack_spmd();
@@ -512,22 +450,6 @@ struct __kmpc_data_sharing_slot {
void *DataEnd;
char Data[];
};
EXTERN void
__kmpc_initialize_data_sharing_environment(__kmpc_data_sharing_slot *RootS,
size_t InitialDataSize);
EXTERN void *__kmpc_data_sharing_environment_begin(
__kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack,
void **SavedSharedFrame, __kmpc_impl_lanemask_t *SavedActiveThreads,
size_t SharingDataSize, size_t SharingDefaultDataSize,
int16_t IsOMPRuntimeInitialized);
EXTERN void __kmpc_data_sharing_environment_end(
__kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack,
void **SavedSharedFrame, __kmpc_impl_lanemask_t *SavedActiveThreads,
int32_t IsEntryPoint);

EXTERN void *
__kmpc_get_data_sharing_environment_frame(int32_t SourceThreadID,
int16_t IsOMPRuntimeInitialized);

// SPMD execution mode interrogation function.
EXTERN int8_t __kmpc_is_spmd_exec_mode();
@@ -195,15 +195,6 @@ INLINE int GetNumberOfThreadsInBlock() { return blockDim.x; }
INLINE unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; }
INLINE unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); }

// Return true if this is the first active thread in the warp.
INLINE bool __kmpc_impl_is_first_active_thread() {
unsigned long long Mask = __kmpc_impl_activemask();
unsigned long long ShNum = WARPSIZE - (GetThreadIdInBlock() % WARPSIZE);
unsigned long long Sh = Mask << ShNum;
// Truncate Sh to the 32 lower bits
return (unsigned)Sh == 0;
}

// Locks
EXTERN void __kmpc_impl_init_lock(omp_lock_t *lock);
EXTERN void __kmpc_impl_destroy_lock(omp_lock_t *lock);
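The removed __kmpc_impl_is_first_active_thread() above shifts the active mask left by WARPSIZE minus the lane index and truncates the result to 32 bits, which is non-zero exactly when some lower-numbered lane is active. A small C++ sketch of the equivalent check (is_first_active_lane is an illustrative name, not part of the runtime):

#include <cassert>
#include <cstdint>

// True when no lane below Lane is set in ActiveMask, i.e. this lane is the
// first active one in its warp.
static bool is_first_active_lane(uint32_t ActiveMask, unsigned Lane) {
  uint32_t LowerLanes = (Lane == 0) ? 0u : ActiveMask & ((1u << Lane) - 1);
  return LowerLanes == 0;
}

int main() {
  assert(is_first_active_lane(0b1010, 1));   // lane 1 is the lowest active lane
  assert(!is_first_active_lane(0b1010, 3));  // lane 1 is active below lane 3
  return 0;
}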