[OpenMP] Eliminate the ThreadStates array in favor of indirection

If thread states are in use, the program is going to be rather slow anyway.
If they are not, we want to avoid wasting shared memory on a statically
sized per-thread array. This patch introduces a slight penalty (malloc +
indirection) for the slow path and reduces resource usage for the fast path.

Differential Revision: https://reviews.llvm.org/D135037
This commit is contained in:
Johannes Doerfert 2022-10-02 09:45:08 -07:00
parent b113965073
commit f8ee045c6d
3 changed files with 24 additions and 8 deletions

View File

@ -109,7 +109,7 @@ struct ThreadStateTy {
}
};
extern ThreadStateTy *ThreadStates[mapping::MaxThreadsPerTeam];
extern ThreadStateTy **ThreadStates;
#pragma omp allocate(ThreadStates) allocator(omp_pteam_mem_alloc)
/// Initialize the state machinery. Must be called by all threads.

View File

@ -33,6 +33,9 @@ using uint32_t = unsigned int;
using int64_t = long;
using uint64_t = unsigned long;
using size_t = decltype(sizeof(char));
// TODO: Properly implement this
using intptr_t = int64_t;
using uintptr_t = uint64_t;
static_assert(sizeof(int8_t) == 1, "type size mismatch");
static_assert(sizeof(uint8_t) == 1, "type size mismatch");

View File

@ -12,6 +12,7 @@
#include "Configuration.h"
#include "Debug.h"
#include "Interface.h"
#include "Mapping.h"
#include "Synchronization.h"
#include "Types.h"
#include "Utils.h"
@ -221,10 +222,7 @@ void state::TeamStateTy::assertEqual(TeamStateTy &Other) const {
}
state::TeamStateTy SHARED(_OMP::state::TeamState);
__attribute__((loader_uninitialized))
state::ThreadStateTy *_OMP::state::ThreadStates[mapping::MaxThreadsPerTeam];
#pragma omp allocate(_OMP::state::ThreadStates) allocator(omp_pteam_mem_alloc)
state::ThreadStateTy **SHARED(_OMP::state::ThreadStates);
namespace {
@ -248,18 +246,32 @@ void state::init(bool IsSPMD) {
if (mapping::isInitialThreadInLevel0(IsSPMD)) {
TeamState.init(IsSPMD);
DebugEntryRAII::init();
ThreadStates = nullptr;
}
ThreadStates[mapping::getThreadIdInBlock()] = nullptr;
}
void state::enterDataEnvironment(IdentTy *Ident) {
ASSERT(config::mayUseThreadStates() &&
"Thread state modified while explicitly disabled!");
if (!config::mayUseThreadStates())
return;
unsigned TId = mapping::getThreadIdInBlock();
ThreadStateTy *NewThreadState =
static_cast<ThreadStateTy *>(__kmpc_alloc_shared(sizeof(ThreadStateTy)));
uintptr_t *ThreadStatesBitsPtr = reinterpret_cast<uintptr_t *>(&ThreadStates);
if (!atomic::load(ThreadStatesBitsPtr, atomic::seq_cst)) {
uint32_t Bytes = sizeof(ThreadStates[0]) * mapping::getBlockSize();
void *ThreadStatesPtr =
memory::allocShared(Bytes, "Thread state array allocation");
if (!atomic::cas(ThreadStatesBitsPtr, uintptr_t(0),
reinterpret_cast<uintptr_t>(ThreadStatesPtr),
atomic::seq_cst, atomic::seq_cst))
memory::freeShared(ThreadStatesPtr, Bytes,
"Thread state array allocated multiple times");
ASSERT(atomic::load(ThreadStatesBitsPtr, atomic::seq_cst) &&
"Expected valid thread states bit!");
}
NewThreadState->init(ThreadStates[TId]);
TeamState.HasThreadState = true;
ThreadStates[TId] = NewThreadState;
@ -274,6 +286,8 @@ void state::exitDataEnvironment() {
}
void state::resetStateForThread(uint32_t TId) {
if (!config::mayUseThreadStates())
return;
if (OMP_LIKELY(!TeamState.HasThreadState || !ThreadStates[TId]))
return;
@ -295,7 +309,6 @@ void state::assumeInitialState(bool IsSPMD) {
TeamStateTy InitialTeamState;
InitialTeamState.init(IsSPMD);
InitialTeamState.assertEqual(TeamState);
ASSERT(!ThreadStates[mapping::getThreadIdInBlock()]);
ASSERT(mapping::isSPMDMode() == IsSPMD);
}