[OpenMP] Create and use `__kmpc_is_generic_main_thread`

In order to fold calls based on high-level knowledge and control-flow
tracking, it helps to expose the information as a runtime call. The
logic `!SPMD && getTID() == getMasterTID()` was used in various places
and is now encapsulated in `__kmpc_is_generic_main_thread`. As part of
this rewrite, we replaced eager computation of arguments with on-demand
computation, which is especially helpful when the calls can be folded
and, consequently, the arguments do not need to be computed at all.

Differential Revision: https://reviews.llvm.org/D105768
This commit is contained in:
Johannes Doerfert 2021-07-01 00:21:26 -05:00
parent 1ab1f04a2b
commit a7b7b5dfe5
12 changed files with 37 additions and 35 deletions

View File

@ -192,7 +192,7 @@ INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int threadId) {
// Overload that looks up the top task descriptor for the calling thread:
// it obtains the thread's logical id in the block and forwards it to the
// getMyTopTaskDescriptor(int threadId) overload.
// NOTE(review): the two return statements look like the pre- and post-change
// lines of this diff hunk (one line replaced). Only the second one, which no
// longer passes isSPMDExecutionMode along, is expected to survive -- confirm.
INLINE omptarget_nvptx_TaskDescr *
getMyTopTaskDescriptor(bool isSPMDExecutionMode) {
return getMyTopTaskDescriptor(GetLogicalThreadIdInBlock(isSPMDExecutionMode));
return getMyTopTaskDescriptor(GetLogicalThreadIdInBlock());
}
////////////////////////////////////////////////////////////////////////////////

View File

@ -15,11 +15,6 @@
#include "target/shuffle.h"
#include "target_impl.h"
// Tell whether the calling thread is the master thread. This can only be the
// case outside of SPMD mode, and only when the thread's id in the block equals
// the master thread id.
INLINE static bool IsMasterThread(bool isSPMDExecutionMode) {
  // In SPMD mode the answer is always false; no ids are queried.
  if (isSPMDExecutionMode)
    return false;
  return GetMasterThreadID() == GetThreadIdInBlock();
}
////////////////////////////////////////////////////////////////////////////////
// Runtime functions for trunk data sharing scheme.
////////////////////////////////////////////////////////////////////////////////
@ -66,7 +61,8 @@ static void *__kmpc_alloc_for_warp(AllocTy Alloc, unsigned Bytes,
EXTERN void *__kmpc_alloc_shared(size_t Bytes) {
Bytes = Bytes + (Bytes % MinBytes);
if (IsMasterThread(__kmpc_is_spmd_exec_mode())) {
int TID = GetThreadIdInBlock();
if (__kmpc_is_generic_main_thread(TID)) {
// Main thread alone, use shared memory if space is available.
if (MainSharedStack.Usage[0] + Bytes <= MainSharedStack.MaxSize) {
void *Ptr = &MainSharedStack.Data[MainSharedStack.Usage[0]];
@ -75,7 +71,6 @@ EXTERN void *__kmpc_alloc_shared(size_t Bytes) {
return Ptr;
}
} else {
int TID = GetThreadIdInBlock();
int WID = GetWarpId();
unsigned WarpBytes = Bytes * WARPSIZE;
auto AllocSharedStack = [&]() {
@ -92,7 +87,6 @@ EXTERN void *__kmpc_alloc_shared(size_t Bytes) {
return __kmpc_alloc_for_warp(AllocSharedStack, Bytes, WarpBytes);
}
// Fallback to malloc
int TID = GetThreadIdInBlock();
unsigned WarpBytes = Bytes * WARPSIZE;
auto AllocGlobal = [&] {
return SafeMalloc(WarpBytes, "AllocGlobalFallback");

View File

@ -68,9 +68,7 @@ EXTERN int omp_get_thread_limit(void) {
}
// Return the value computed by GetOmpThreadId() for the calling thread and
// report it through PRINT before returning.
EXTERN int omp_get_thread_num() {
// NOTE(review): the next three lines look like the pre-change code of this
// diff hunk (mode and logical thread id computed eagerly and passed along);
// the parameterless GetOmpThreadId() call after them appears to be their
// replacement -- confirm against the applied file.
bool isSPMDExecutionMode = __kmpc_is_spmd_exec_mode();
int tid = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
int rc = GetOmpThreadId(tid, isSPMDExecutionMode);
int rc = GetOmpThreadId();
PRINT(LD_IO, "call omp_get_thread_num() returns %d\n", rc);
return rc;
}

View File

@ -210,7 +210,7 @@ public:
ASSERT0(LT_FUSSY, __kmpc_is_spmd_exec_mode(), "Expected non-SPMD mode.");
return;
}
int tid = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode());
int tid = GetLogicalThreadIdInBlock();
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(tid);
T tnum = GetNumberOfOmpThreads(__kmpc_is_spmd_exec_mode());
T tripCount = ub - lb + 1; // +1 because ub is inclusive
@ -453,7 +453,7 @@ public:
// ID of a thread in its own warp
// automatically selects thread or warp ID based on selected implementation
int tid = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode());
int tid = GetLogicalThreadIdInBlock();
ASSERT0(LT_FUSSY, gtid < GetNumberOfOmpThreads(__kmpc_is_spmd_exec_mode()),
"current thread is not needed here; error");
// retrieve schedule

View File

@ -160,6 +160,10 @@ EXTERN int8_t __kmpc_is_spmd_exec_mode() {
return (execution_param & ModeMask) == Spmd;
}
// Report whether the thread id Tid denotes the main thread of a non-SPMD
// kernel: the result is non-zero only if the kernel is not in SPMD mode and
// Tid equals GetMasterThreadID().
EXTERN int8_t __kmpc_is_generic_main_thread(kmp_int32 Tid) {
  // In SPMD mode the answer is always false; the master id is not queried.
  if (__kmpc_is_spmd_exec_mode())
    return false;
  return GetMasterThreadID() == Tid;
}
EXTERN bool __kmpc_kernel_parallel(void**WorkFn);
static void __kmpc_target_region_state_machine(ident_t *Ident) {

View File

@ -188,7 +188,7 @@ EXTERN void __kmpc_serialized_parallel(kmp_Ident *loc, uint32_t global_tid) {
}
// assume this is only called for nested parallel
int threadId = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode());
int threadId = GetLogicalThreadIdInBlock();
// unlike actual parallel, threads in the same team do not share
// the workTaskDescr in this case and num threads is fixed to 1
@ -227,7 +227,7 @@ EXTERN void __kmpc_end_serialized_parallel(kmp_Ident *loc,
}
// pop stack
int threadId = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode());
int threadId = GetLogicalThreadIdInBlock();
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
// set new top
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
@ -249,8 +249,7 @@ EXTERN uint16_t __kmpc_parallel_level(kmp_Ident *loc, uint32_t global_tid) {
// it's cheap to recalculate this value so we never use the result
// of this call.
EXTERN int32_t __kmpc_global_thread_num(kmp_Ident *loc) {
// Returns the result of GetOmpThreadId() for the calling thread; the loc
// argument is not read here.
// NOTE(review): the first two statements look like the pre-change lines of
// this diff hunk (logical thread id and SPMD mode passed explicitly); the
// final parameterless call appears to be their replacement -- confirm.
int tid = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode());
return GetOmpThreadId(tid, __kmpc_is_spmd_exec_mode());
return GetOmpThreadId();
}
////////////////////////////////////////////////////////////////////////////////
@ -262,7 +261,7 @@ EXTERN void __kmpc_push_num_threads(kmp_Ident *loc, int32_t tid,
PRINT(LD_IO, "call kmpc_push_num_threads %d\n", num_threads);
ASSERT0(LT_FUSSY, isRuntimeInitialized(),
"Runtime must be initialized.");
tid = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode());
tid = GetLogicalThreadIdInBlock();
omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(tid) =
num_threads;
}

View File

@ -69,7 +69,7 @@ static int32_t nvptx_parallel_reduce_nowait(
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
bool isSPMDExecutionMode, bool isRuntimeUninitialized) {
uint32_t BlockThreadId = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
uint32_t BlockThreadId = GetLogicalThreadIdInBlock();
uint32_t NumThreads = GetNumberOfOmpThreads(isSPMDExecutionMode);
if (NumThreads == 1)
return 1;
@ -184,10 +184,11 @@ EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
kmp_ListGlobalFctPtr glredFct) {
// Terminate all threads in non-SPMD mode except for the master thread.
if (!__kmpc_is_spmd_exec_mode() && GetThreadIdInBlock() != GetMasterThreadID())
if (!__kmpc_is_spmd_exec_mode() &&
!__kmpc_is_generic_main_thread(GetThreadIdInBlock()))
return 0;
uint32_t ThreadId = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode());
uint32_t ThreadId = GetLogicalThreadIdInBlock();
// In non-generic mode all workers participate in the teams reduction.
// In generic mode only the team master participates in the teams

View File

@ -67,11 +67,11 @@ int GetNumberOfWorkersInTeam() { return GetMasterThreadID(); }
// or a serial region by the master. If the master (whose CUDA thread
// id is GetMasterThreadID()) calls this routine, we return 0 because
// it is a shadow for the first worker.
int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode) {
int GetLogicalThreadIdInBlock() {
// Implemented using control flow (predication) instead of with a modulo
// operation.
int tid = GetThreadIdInBlock();
if (!isSPMDExecutionMode && tid >= GetMasterThreadID())
if (__kmpc_is_generic_main_thread(tid))
return 0;
else
return tid;
@ -83,16 +83,19 @@ int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode) {
//
////////////////////////////////////////////////////////////////////////////////
int GetOmpThreadId(int threadId, bool isSPMDExecutionMode) {
int GetOmpThreadId() {
int tid = GetThreadIdInBlock();
if (__kmpc_is_generic_main_thread(tid))
return 0;
// omp_thread_num
int rc;
if ((parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1)) > 1) {
rc = 0;
} else if (isSPMDExecutionMode) {
rc = GetThreadIdInBlock();
} else if (__kmpc_is_spmd_exec_mode()) {
rc = tid;
} else {
omptarget_nvptx_TaskDescr *currTaskDescr =
omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(tid);
ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr");
rc = currTaskDescr->ThreadId();
}

View File

@ -47,7 +47,7 @@ EXTERN void __kmpc_barrier(kmp_Ident *loc_ref, int32_t tid) {
"Expected SPMD mode with uninitialized runtime.");
__kmpc_barrier_simple_spmd(loc_ref, tid);
} else {
tid = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode());
tid = GetLogicalThreadIdInBlock();
int numberOfActiveOMPThreads =
GetNumberOfOmpThreads(__kmpc_is_spmd_exec_mode());
if (numberOfActiveOMPThreads > 1) {

View File

@ -96,7 +96,7 @@ EXTERN int32_t __kmpc_omp_task_with_deps(kmp_Ident *loc, uint32_t global_tid,
"bad assumptions");
// 2. push new context: update new task descriptor
int tid = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode());
int tid = GetLogicalThreadIdInBlock();
omptarget_nvptx_TaskDescr *parentTaskDescr = getMyTopTaskDescriptor(tid);
newTaskDescr->CopyForExplicitTask(parentTaskDescr);
// set new task descriptor as top
@ -135,7 +135,7 @@ EXTERN void __kmpc_omp_task_begin_if0(kmp_Ident *loc, uint32_t global_tid,
"bad assumptions");
// 2. push new context: update new task descriptor
int tid = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode());
int tid = GetLogicalThreadIdInBlock();
omptarget_nvptx_TaskDescr *parentTaskDescr = getMyTopTaskDescriptor(tid);
newTaskDescr->CopyForExplicitTask(parentTaskDescr);
// set new task descriptor as top
@ -163,7 +163,7 @@ EXTERN void __kmpc_omp_task_complete_if0(kmp_Ident *loc, uint32_t global_tid,
omptarget_nvptx_TaskDescr *parentTaskDescr = newTaskDescr->GetPrevTaskDescr();
// 3... nothing to call... is inline
// 4. pop context
int tid = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode());
int tid = GetLogicalThreadIdInBlock();
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid,
parentTaskDescr);
// 5. free

View File

@ -41,13 +41,12 @@ bool isRuntimeInitialized();
////////////////////////////////////////////////////////////////////////////////
// get global ids to locate thread/team info (constant regardless of OMP)
int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode);
int GetLogicalThreadIdInBlock();
int GetMasterThreadID();
int GetNumberOfWorkersInTeam();
// get OpenMP thread and team ids
int GetOmpThreadId(int threadId,
bool isSPMDExecutionMode); // omp_thread_num
int GetOmpThreadId(); // omp_thread_num
int GetOmpTeamId(); // omp_team_num
// get OpenMP number of threads and team

View File

@ -449,6 +449,10 @@ EXTERN void __kmpc_parallel_51(ident_t *ident, kmp_int32 global_tid,
// SPMD execution mode interrogation function.
EXTERN int8_t __kmpc_is_spmd_exec_mode();
/// Return true if the hardware thread id \p Tid represents the OpenMP main
/// thread in generic mode outside of a parallel region.
EXTERN int8_t __kmpc_is_generic_main_thread(kmp_int32 Tid);
EXTERN void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode,
const void *buf, size_t size,
int16_t is_shared, const void **res);