[OpenMP] Expose the state in the header to allow non-lto optimizations

We used to inline the `lookup` calls so that the runtime had "known"
access offsets when it was shipped. With the new static library build it
no longer does, as the lookup is an indirection the optimizer cannot
look through. Exposing the state in the header should help us optimize
the code better until we can do LTO for the runtime again.

Differential Revision: https://reviews.llvm.org/D130111
Johannes Doerfert 2022-07-19 14:22:23 -05:00
parent e01ce4e88a
commit a42361dc1c
2 changed files with 163 additions and 158 deletions
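
To see what the change buys: once the `lookup` definitions are visible in the header, every caller can inline them, and ICV accesses become loads from fixed offsets into the state objects, no LTO required. A minimal sketch of the effect, with hypothetical names rather than the runtime's actual code:

#include <cstdint>

struct ICVState {
  uint32_t NThreadsVar;
  uint32_t LevelVar;
};

ICVState TeamICVs; // stands in for the runtime's TeamState

// Defined out-of-line in a static library, this is an opaque call the
// optimizer cannot look through. Defined inline in a header, callers
// fold it into a load from a known offset of TeamICVs.
inline uint32_t &lookup32(int Kind) {
  return Kind == 0 ? TeamICVs.NThreadsVar : TeamICVs.LevelVar;
}

uint32_t level() { return lookup32(1); } // one load after inlining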

openmp/libomptarget/DeviceRTL/include/State.h

@@ -13,16 +13,104 @@
#define OMPTARGET_STATE_H
#include "Debug.h"
#include "Mapping.h"
#include "Types.h"
#include "Utils.h"
#pragma omp begin declare target device_type(nohost)
namespace _OMP {
namespace memory {
/// Alloca \p Size bytes in shared memory, if possible, for \p Reason.
///
/// Note: See the restrictions on __kmpc_alloc_shared for proper usage.
void *allocShared(uint64_t Size, const char *Reason);
/// Free \p Ptr, allocated via allocShared, for \p Reason.
///
/// Note: See the restrictions on __kmpc_free_shared for proper usage.
void freeShared(void *Ptr, uint64_t Bytes, const char *Reason);
/// Alloca \p Size bytes in global memory, if possible, for \p Reason.
void *allocGlobal(uint64_t Size, const char *Reason);
/// Return a pointer to the dynamic shared memory buffer.
void *getDynamicBuffer();
/// Free \p Ptr, allocated via allocGlobal, for \p Reason.
void freeGlobal(void *Ptr, const char *Reason);
} // namespace memory
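// An illustrative pairing of the interface above (a sketch, not part of
// the diff; see the __kmpc_alloc_shared restrictions for real constraints):
//
//   void *Scratch = memory::allocShared(64, "team scratchpad");
//   // ... use Scratch within the team ...
//   memory::freeShared(Scratch, 64, "team scratchpad");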
namespace state {
inline constexpr uint32_t SharedScratchpadSize = SHARED_SCRATCHPAD_SIZE;
struct ICVStateTy {
uint32_t NThreadsVar;
uint32_t LevelVar;
uint32_t ActiveLevelVar;
uint32_t MaxActiveLevelsVar;
uint32_t RunSchedVar;
uint32_t RunSchedChunkVar;
bool operator==(const ICVStateTy &Other) const;
void assertEqual(const ICVStateTy &Other) const;
};
struct TeamStateTy {
void init(bool IsSPMD);
bool operator==(const TeamStateTy &) const;
void assertEqual(TeamStateTy &Other) const;
/// ICVs
///
/// Preallocated storage for ICV values that are used if the threads have not
/// set a custom default. The latter is supported but unlikely and slow(er).
///
///{
ICVStateTy ICVState;
///}
uint32_t ParallelTeamSize;
ParallelRegionFnTy ParallelRegionFnVar;
};
extern TeamStateTy TeamState;
#pragma omp allocate(TeamState) allocator(omp_pteam_mem_alloc)
struct ThreadStateTy {
/// ICVs have preallocated storage in the TeamStateTy which is used if a
/// thread has not set a custom value. The latter is supported but unlikely.
/// When it happens we will allocate dynamic memory to hold the values of all
/// ICVs. Thus, the first time an ICV is set by a thread we will allocate an
/// ICV struct to hold them all. This is slower than alternatives but allows
/// users to pay only for what they use.
///
state::ICVStateTy ICVState;
ThreadStateTy *PreviousThreadState;
void init() {
ICVState = TeamState.ICVState;
PreviousThreadState = nullptr;
}
void init(ThreadStateTy *PreviousTS) {
ICVState = PreviousTS ? PreviousTS->ICVState : TeamState.ICVState;
PreviousThreadState = PreviousTS;
}
};
extern ThreadStateTy *ThreadStates[mapping::MaxThreadsPerTeam];
#pragma omp allocate(ThreadStates) allocator(omp_pteam_mem_alloc)
/// Initialize the state machinery. Must be called by all threads.
void init(bool IsSPMD);
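
The ThreadStateTy comment above describes a pay-per-use scheme: reads fall back to the team-wide defaults, and a thread only pays for dynamic storage on its first write. A condensed sketch of that pattern, with hypothetical names and a single ICV:

#include <cstdint>

constexpr int MaxThreads = 1024;

struct State { uint32_t NThreads = 0; };

State TeamDefault;                 // shared defaults, as in TeamState
State *PerThread[MaxThreads] = {}; // lazily filled, as in ThreadStates

uint32_t read(int TId) { // the read path never allocates
  return PerThread[TId] ? PerThread[TId]->NThreads : TeamDefault.NThreads;
}

void write(int TId, uint32_t V) { // the first write allocates and inherits
  if (!PerThread[TId])
    PerThread[TId] = new State(TeamDefault);
  PerThread[TId]->NThreads = V;
}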
@@ -54,8 +142,73 @@ struct DateEnvironmentRAII {
/// TODO
void resetStateForThread(uint32_t TId);
uint32_t &lookup32(ValueKind VK, bool IsReadonly, IdentTy *Ident);
void *&lookupPtr(ValueKind VK, bool IsReadonly);
inline uint32_t &lookupForModify32Impl(uint32_t state::ICVStateTy::*Var,
IdentTy *Ident) {
if (OMP_LIKELY(!config::mayUseThreadStates() ||
TeamState.ICVState.LevelVar == 0))
return TeamState.ICVState.*Var;
uint32_t TId = mapping::getThreadIdInBlock();
if (OMP_UNLIKELY(!ThreadStates[TId])) {
ThreadStates[TId] = reinterpret_cast<ThreadStateTy *>(memory::allocGlobal(
sizeof(ThreadStateTy), "ICV modification outside data environment"));
ASSERT(ThreadStates[TId] != nullptr && "Nullptr returned by malloc!");
ThreadStates[TId]->init();
}
return ThreadStates[TId]->ICVState.*Var;
}
inline uint32_t &lookupImpl(uint32_t state::ICVStateTy::*Var) {
auto TId = mapping::getThreadIdInBlock();
if (OMP_UNLIKELY(config::mayUseThreadStates() && ThreadStates[TId]))
return ThreadStates[TId]->ICVState.*Var;
return TeamState.ICVState.*Var;
}
__attribute__((always_inline, flatten)) inline uint32_t &
lookup32(ValueKind Kind, bool IsReadonly, IdentTy *Ident) {
switch (Kind) {
case state::VK_NThreads:
if (IsReadonly)
return lookupImpl(&ICVStateTy::NThreadsVar);
return lookupForModify32Impl(&ICVStateTy::NThreadsVar, Ident);
case state::VK_Level:
if (IsReadonly)
return lookupImpl(&ICVStateTy::LevelVar);
return lookupForModify32Impl(&ICVStateTy::LevelVar, Ident);
case state::VK_ActiveLevel:
if (IsReadonly)
return lookupImpl(&ICVStateTy::ActiveLevelVar);
return lookupForModify32Impl(&ICVStateTy::ActiveLevelVar, Ident);
case state::VK_MaxActiveLevels:
if (IsReadonly)
return lookupImpl(&ICVStateTy::MaxActiveLevelsVar);
return lookupForModify32Impl(&ICVStateTy::MaxActiveLevelsVar, Ident);
case state::VK_RunSched:
if (IsReadonly)
return lookupImpl(&ICVStateTy::RunSchedVar);
return lookupForModify32Impl(&ICVStateTy::RunSchedVar, Ident);
case state::VK_RunSchedChunk:
if (IsReadonly)
return lookupImpl(&ICVStateTy::RunSchedChunkVar);
return lookupForModify32Impl(&ICVStateTy::RunSchedChunkVar, Ident);
case state::VK_ParallelTeamSize:
return TeamState.ParallelTeamSize;
default:
break;
}
__builtin_unreachable();
}
__attribute__((always_inline, flatten)) inline void *&
lookupPtr(ValueKind Kind, bool IsReadonly) {
switch (Kind) {
case state::VK_ParallelRegionFn:
return TeamState.ParallelRegionFnVar;
default:
break;
}
__builtin_unreachable();
}
/// A class without actual state, used to provide a nice interface to look up
/// and update ICV values we can declare in global scope.
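
The diff collapses the class body at this point; a minimal sketch of such a stateless wrapper, with an assumed simplified lookup signature (no IdentTy parameter) and a toy backing store, could look like:

#include <cstdint>

enum ValueKind { VK_Level, VK_MaxActiveLevels, VK_LAST };

static uint32_t Storage[VK_LAST]; // toy stand-in for team/thread state
inline uint32_t &lookup32(ValueKind Kind, bool /*IsReadonly*/) {
  return Storage[Kind];
}

// No data members, so a global instance costs nothing; reads and writes
// are simply forwarded to lookup32.
template <typename Ty, ValueKind Kind> struct Value {
  operator Ty() { return lookup32(Kind, /*IsReadonly=*/true); }
  Value &operator=(const Ty &Other) {
    lookup32(Kind, /*IsReadonly=*/false) = Other;
    return *this;
  }
};

inline Value<uint32_t, VK_Level> Level; // usage: Level = 1; uint32_t L = Level;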
@@ -181,29 +334,6 @@ inline state::Value<uint32_t, state::VK_RunSched> RunSched;
} // namespace icv
namespace memory {
/// Alloca \p Size bytes in shared memory, if possible, for \p Reason.
///
/// Note: See the restrictions on __kmpc_alloc_shared for proper usage.
void *allocShared(uint64_t Size, const char *Reason);
/// Free \p Ptr, allocated via allocShared, for \p Reason.
///
/// Note: See the restrictions on __kmpc_free_shared for proper usage.
void freeShared(void *Ptr, uint64_t Bytes, const char *Reason);
/// Alloca \p Size bytes in global memory, if possible, for \p Reason.
void *allocGlobal(uint64_t Size, const char *Reason);
/// Return a pointer to the dynamic shared memory buffer.
void *getDynamicBuffer();
/// Free \p Ptr, allocated via allocGlobal, for \p Reason.
void freeGlobal(void *Ptr, const char *Reason);
} // namespace memory
} // namespace _OMP
#pragma omp end declare target
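
Both lookup helpers above take a `uint32_t state::ICVStateTy::*Var` argument, a pointer to data member: one function body serves every ICV field because the caller names the member and the helper applies it to whichever object holds the live value. A self-contained sketch of the idiom, with hypothetical names:

#include <cstdint>
#include <cstdio>

struct ICVs {
  uint32_t NThreads;
  uint32_t Level;
};

ICVs Team{8, 0};

// The member pointer selects the field; the helper selects the object.
uint32_t &select(uint32_t ICVs::*Var) { return Team.*Var; }

int main() {
  select(&ICVs::Level) = 2; // write through the member pointer
  std::printf("%u %u\n", select(&ICVs::NThreads), Team.Level); // prints: 8 2
}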

openmp/libomptarget/DeviceRTL/src/State.cpp

@@ -12,10 +12,8 @@
#include "Configuration.h"
#include "Debug.h"
#include "Interface.h"
#include "Mapping.h"
#include "Synchronization.h"
#include "Types.h"
#include "Utils.h"
using namespace _OMP;
@@ -180,22 +178,7 @@ void memory::freeGlobal(void *Ptr, const char *Reason) { free(Ptr); }
///}
namespace {
struct ICVStateTy {
uint32_t NThreadsVar;
uint32_t LevelVar;
uint32_t ActiveLevelVar;
uint32_t MaxActiveLevelsVar;
uint32_t RunSchedVar;
uint32_t RunSchedChunkVar;
bool operator==(const ICVStateTy &Other) const;
void assertEqual(const ICVStateTy &Other) const;
};
bool ICVStateTy::operator==(const ICVStateTy &Other) const {
bool state::ICVStateTy::operator==(const ICVStateTy &Other) const {
return (NThreadsVar == Other.NThreadsVar) & (LevelVar == Other.LevelVar) &
(ActiveLevelVar == Other.ActiveLevelVar) &
(MaxActiveLevelsVar == Other.MaxActiveLevelsVar) &
@@ -203,7 +186,7 @@ bool ICVStateTy::operator==(const ICVStateTy &Other) const {
(RunSchedChunkVar == Other.RunSchedChunkVar);
}
void ICVStateTy::assertEqual(const ICVStateTy &Other) const {
void state::ICVStateTy::assertEqual(const ICVStateTy &Other) const {
ASSERT(NThreadsVar == Other.NThreadsVar);
ASSERT(LevelVar == Other.LevelVar);
ASSERT(ActiveLevelVar == Other.ActiveLevelVar);
@@ -212,30 +195,7 @@ void ICVStateTy::assertEqual(const ICVStateTy &Other) const {
ASSERT(RunSchedChunkVar == Other.RunSchedChunkVar);
}
struct TeamStateTy {
/// TODO: provide a proper init function.
void init(bool IsSPMD);
bool operator==(const TeamStateTy &) const;
void assertEqual(TeamStateTy &Other) const;
/// ICVs
///
/// Preallocated storage for ICV values that are used if the threads have not
/// set a custom default. The latter is supported but unlikely and slow(er).
///
///{
ICVStateTy ICVState;
///}
uint32_t ParallelTeamSize;
ParallelRegionFnTy ParallelRegionFnVar;
};
TeamStateTy SHARED(TeamState);
void TeamStateTy::init(bool IsSPMD) {
void state::TeamStateTy::init(bool IsSPMD) {
ICVState.NThreadsVar = mapping::getBlockSize(IsSPMD);
ICVState.LevelVar = 0;
ICVState.ActiveLevelVar = 0;
@@ -246,65 +206,24 @@ void TeamStateTy::init(bool IsSPMD) {
ParallelRegionFnVar = nullptr;
}
bool TeamStateTy::operator==(const TeamStateTy &Other) const {
bool state::TeamStateTy::operator==(const TeamStateTy &Other) const {
return (ICVState == Other.ICVState) &
(ParallelTeamSize == Other.ParallelTeamSize);
}
void TeamStateTy::assertEqual(TeamStateTy &Other) const {
void state::TeamStateTy::assertEqual(TeamStateTy &Other) const {
ICVState.assertEqual(Other.ICVState);
ASSERT(ParallelTeamSize == Other.ParallelTeamSize);
}
struct ThreadStateTy {
namespace {
/// ICVs have preallocated storage in the TeamStateTy which is used if a
/// thread has not set a custom value. The latter is supported but unlikely.
/// When it happens we will allocate dynamic memory to hold the values of all
/// ICVs. Thus, the first time an ICV is set by a thread we will allocate an
/// ICV struct to hold them all. This is slower than alternatives but allows
/// users to pay only for what they use.
///
ICVStateTy ICVState;
ThreadStateTy *PreviousThreadState;
void init() {
ICVState = TeamState.ICVState;
PreviousThreadState = nullptr;
}
void init(ThreadStateTy *PreviousTS) {
ICVState = PreviousTS ? PreviousTS->ICVState : TeamState.ICVState;
PreviousThreadState = PreviousTS;
}
};
state::TeamStateTy SHARED(TeamState);
__attribute__((loader_uninitialized))
ThreadStateTy *ThreadStates[mapping::MaxThreadsPerTeam];
state::ThreadStateTy *ThreadStates[mapping::MaxThreadsPerTeam];
#pragma omp allocate(ThreadStates) allocator(omp_pteam_mem_alloc)
uint32_t &lookupForModify32Impl(uint32_t ICVStateTy::*Var, IdentTy *Ident) {
if (OMP_LIKELY(!config::mayUseThreadStates() ||
TeamState.ICVState.LevelVar == 0))
return TeamState.ICVState.*Var;
uint32_t TId = mapping::getThreadIdInBlock();
if (OMP_UNLIKELY(!ThreadStates[TId])) {
ThreadStates[TId] = reinterpret_cast<ThreadStateTy *>(memory::allocGlobal(
sizeof(ThreadStateTy), "ICV modification outside data environment"));
ASSERT(ThreadStates[TId] != nullptr && "Nullptr returned by malloc!");
ThreadStates[TId]->init();
}
return ThreadStates[TId]->ICVState.*Var;
}
template <typename IntTy> IntTy &lookupImpl(IntTy ICVStateTy::*Var) {
IntTy TId = mapping::getThreadIdInBlock();
if (OMP_UNLIKELY(config::mayUseThreadStates() && ThreadStates[TId]))
return ThreadStates[TId]->ICVState.*Var;
return TeamState.ICVState.*Var;
}
int returnValIfLevelIsActive(int Level, int Val, int DefaultVal,
int OutOfBoundsVal = -1) {
if (Level == 0)
@@ -320,50 +239,6 @@ int returnValIfLevelIsActive(int Level, int Val, int DefaultVal,
} // namespace
uint32_t &state::lookup32(ValueKind Kind, bool IsReadonly, IdentTy *Ident) {
switch (Kind) {
case state::VK_NThreads:
if (IsReadonly)
return lookupImpl<uint32_t>(&ICVStateTy::NThreadsVar);
return lookupForModify32Impl(&ICVStateTy::NThreadsVar, Ident);
case state::VK_Level:
if (IsReadonly)
return lookupImpl<uint32_t>(&ICVStateTy::LevelVar);
return lookupForModify32Impl(&ICVStateTy::LevelVar, Ident);
case state::VK_ActiveLevel:
if (IsReadonly)
return lookupImpl<uint32_t>(&ICVStateTy::ActiveLevelVar);
return lookupForModify32Impl(&ICVStateTy::ActiveLevelVar, Ident);
case state::VK_MaxActiveLevels:
if (IsReadonly)
return lookupImpl<uint32_t>(&ICVStateTy::MaxActiveLevelsVar);
return lookupForModify32Impl(&ICVStateTy::MaxActiveLevelsVar, Ident);
case state::VK_RunSched:
if (IsReadonly)
return lookupImpl<uint32_t>(&ICVStateTy::RunSchedVar);
return lookupForModify32Impl(&ICVStateTy::RunSchedVar, Ident);
case state::VK_RunSchedChunk:
if (IsReadonly)
return lookupImpl<uint32_t>(&ICVStateTy::RunSchedChunkVar);
return lookupForModify32Impl(&ICVStateTy::RunSchedChunkVar, Ident);
case state::VK_ParallelTeamSize:
return TeamState.ParallelTeamSize;
default:
break;
}
__builtin_unreachable();
}
void *&state::lookupPtr(ValueKind Kind, bool IsReadonly) {
switch (Kind) {
case state::VK_ParallelRegionFn:
return TeamState.ParallelRegionFnVar;
default:
break;
}
__builtin_unreachable();
}
void state::init(bool IsSPMD) {
SharedMemorySmartStack.init(IsSPMD);
if (mapping::isInitialThreadInLevel0(IsSPMD)) {