[OpenMP] Simplify the ThreadStackTy for globalization fallback

With D106496 we can make the globalization fallback stack much simpler
and this version doesn't seem to experience the spurious failures and
deadlocks we have seen before.

Differential Revision: https://reviews.llvm.org/D106576
This commit is contained in:
Johannes Doerfert 2021-07-22 12:18:46 -05:00
parent 6ca969353c
commit d12ee28e2e
1 changed file with 31 additions and 75 deletions

View File

@@ -21,114 +21,70 @@
static constexpr unsigned MinBytes = 8; static constexpr unsigned MinBytes = 8;
template <unsigned BytesPerThread, unsigned NThreads = MAX_THREADS_PER_TEAM> template <unsigned BPerThread, unsigned NThreads = MAX_THREADS_PER_TEAM>
struct alignas(32) ThreadStackTy { struct alignas(32) ThreadStackTy {
static constexpr unsigned MaxSize = NThreads * BytesPerThread; static constexpr unsigned BytesPerThread = BPerThread;
static constexpr unsigned NumThreads = NThreads; static constexpr unsigned NumThreads = NThreads;
static constexpr unsigned NumWarps = (NThreads + WARPSIZE - 1) / WARPSIZE; static constexpr unsigned NumWarps = (NThreads + WARPSIZE - 1) / WARPSIZE;
static constexpr unsigned MaxSizePerWarp = MaxSize / NumWarps;
unsigned char Data[MaxSize]; unsigned char Data[NumThreads][BytesPerThread];
char Sizes[MaxSize / MinBytes]; unsigned char Usage[NumThreads];
char SizeUsage[NumWarps];
char Usage[NumWarps];
}; };
[[clang::loader_uninitialized]] ThreadStackTy<MinBytes * 8, 1> MainSharedStack; [[clang::loader_uninitialized]] ThreadStackTy<MinBytes * 8, 1> MainSharedStack;
#pragma omp allocate(MainSharedStack) allocator(omp_pteam_mem_alloc) #pragma omp allocate(MainSharedStack) allocator(omp_pteam_mem_alloc)
[[clang::loader_uninitialized]] ThreadStackTy<MinBytes * 2, [[clang::loader_uninitialized]] ThreadStackTy<MinBytes,
MAX_THREADS_PER_TEAM / 8> MAX_THREADS_PER_TEAM / 4>
WorkerSharedStack; WorkerSharedStack;
#pragma omp allocate(WorkerSharedStack) allocator(omp_pteam_mem_alloc) #pragma omp allocate(WorkerSharedStack) allocator(omp_pteam_mem_alloc)
/// Allocate \p Bytes of storage for the calling thread.
///
/// Fast path: bump-allocate from the caller's private region of the
/// team-shared stack (MainSharedStack for the generic main thread,
/// WorkerSharedStack for workers).  Slow path: fall back to global malloc.
/// Allocations are expected to be freed LIFO via __kmpc_free_shared.
EXTERN void *__kmpc_alloc_shared(size_t Bytes) {
  // NOTE(review): this rounding matches __kmpc_free_shared but is not a
  // true round-up-to-multiple-of-MinBytes (e.g. 9 -> 10); both sides must
  // stay in sync — confirm before changing.
  size_t AlignedBytes = Bytes + (Bytes % MinBytes);
  int TID = __kmpc_get_hardware_thread_id_in_block();
  if (__kmpc_is_generic_main_thread(TID)) {
    // Main thread alone, use shared memory if space is available.
    if (MainSharedStack.Usage[0] + AlignedBytes <=
        MainSharedStack.BytesPerThread) {
      void *Ptr = &MainSharedStack.Data[0][MainSharedStack.Usage[0]];
      MainSharedStack.Usage[0] += AlignedBytes;
      return Ptr;
    }
  } else if (TID < WorkerSharedStack.NumThreads) {
    // Worker thread with a region on the shared stack; bump-allocate from
    // its private slot if the request fits.
    if (WorkerSharedStack.Usage[TID] + AlignedBytes <=
        WorkerSharedStack.BytesPerThread) {
      void *Ptr = &WorkerSharedStack.Data[TID][WorkerSharedStack.Usage[TID]];
      WorkerSharedStack.Usage[TID] += AlignedBytes;
      return Ptr;
    }
  }
  // Fallback to malloc
  return SafeMalloc(Bytes, "AllocGlobalFallback");
}
/// Release storage previously obtained from __kmpc_alloc_shared.
///
/// If \p Ptr lies inside the caller's team-shared stack region, pop it by
/// rewinding the per-thread bump pointer (LIFO discipline); \p Bytes must
/// equal the size passed at allocation time so the same AlignedBytes is
/// recomputed here.  Otherwise the pointer came from the malloc fallback
/// and is handed to SafeFree.
EXTERN void __kmpc_free_shared(void *Ptr, size_t Bytes) {
  // Must mirror the rounding in __kmpc_alloc_shared exactly.
  size_t AlignedBytes = Bytes + (Bytes % MinBytes);
  int TID = __kmpc_get_hardware_thread_id_in_block();
  if (__kmpc_is_generic_main_thread(TID)) {
    if (Ptr >= &MainSharedStack.Data[0][0] &&
        Ptr < &MainSharedStack.Data[MainSharedStack.NumThreads][0]) {
      MainSharedStack.Usage[0] -= AlignedBytes;
      return;
    }
  } else if (TID < WorkerSharedStack.NumThreads) {
    if (Ptr >= &WorkerSharedStack.Data[0][0] &&
        Ptr < &WorkerSharedStack.Data[WorkerSharedStack.NumThreads][0]) {
      // (A redundant re-read of the thread id shadowed TID here; the outer
      // TID is identical, so it is reused directly.)
      WorkerSharedStack.Usage[TID] -= AlignedBytes;
      return;
    }
  }
  SafeFree(Ptr, "FreeGlobalFallback");
}
/// Reset both globalization-fallback stacks to empty.
///
/// Zeroes every bump-pointer cursor; the stacks are declared
/// loader_uninitialized, so this must run before any __kmpc_alloc_shared.
EXTERN void __kmpc_data_sharing_init_stack() {
  // Main stack has a single thread slot; NumWarps == 1 covers it.
  for (unsigned i = 0; i < MainSharedStack.NumWarps; ++i)
    MainSharedStack.Usage[i] = 0;
  for (unsigned i = 0; i < WorkerSharedStack.NumThreads; ++i)
    WorkerSharedStack.Usage[i] = 0;
}
/// Allocate storage in shared memory to communicate arguments from the main /// Allocate storage in shared memory to communicate arguments from the main