From a42361dc1c26acae656243232e81a236ba333a8c Mon Sep 17 00:00:00 2001 From: Johannes Doerfert Date: Tue, 19 Jul 2022 14:22:23 -0500 Subject: [PATCH] [OpenMP] Expose the state in the header to allow non-lto optimizations We used to inline the `lookup` calls such that the runtime had "known" access offsets when it was shipped. With the new static library build it doesn't as the lookup is an indirection we cannot look through. This should help us optimize the code better until we can do LTO for the runtime again. Differential Revision: https://reviews.llvm.org/D130111 --- openmp/libomptarget/DeviceRTL/include/State.h | 180 +++++++++++++++--- openmp/libomptarget/DeviceRTL/src/State.cpp | 141 +------------- 2 files changed, 163 insertions(+), 158 deletions(-) diff --git a/openmp/libomptarget/DeviceRTL/include/State.h b/openmp/libomptarget/DeviceRTL/include/State.h index 183b68416f0a..08f08bd74a90 100644 --- a/openmp/libomptarget/DeviceRTL/include/State.h +++ b/openmp/libomptarget/DeviceRTL/include/State.h @@ -13,16 +13,104 @@ #define OMPTARGET_STATE_H #include "Debug.h" +#include "Mapping.h" #include "Types.h" +#include "Utils.h" #pragma omp begin declare target device_type(nohost) namespace _OMP { +namespace memory { + +/// Alloca \p Size bytes in shared memory, if possible, for \p Reason. +/// +/// Note: See the restrictions on __kmpc_alloc_shared for proper usage. +void *allocShared(uint64_t Size, const char *Reason); + +/// Free \p Ptr, alloated via allocShared, for \p Reason. +/// +/// Note: See the restrictions on __kmpc_free_shared for proper usage. +void freeShared(void *Ptr, uint64_t Bytes, const char *Reason); + +/// Alloca \p Size bytes in global memory, if possible, for \p Reason. +void *allocGlobal(uint64_t Size, const char *Reason); + +/// Return a pointer to the dynamic shared memory buffer. +void *getDynamicBuffer(); + +/// Free \p Ptr, alloated via allocGlobal, for \p Reason. 
+void freeGlobal(void *Ptr, const char *Reason); + +} // namespace memory + namespace state { inline constexpr uint32_t SharedScratchpadSize = SHARED_SCRATCHPAD_SIZE; +struct ICVStateTy { + uint32_t NThreadsVar; + uint32_t LevelVar; + uint32_t ActiveLevelVar; + uint32_t MaxActiveLevelsVar; + uint32_t RunSchedVar; + uint32_t RunSchedChunkVar; + + bool operator==(const ICVStateTy &Other) const; + + void assertEqual(const ICVStateTy &Other) const; +}; + +struct TeamStateTy { + void init(bool IsSPMD); + + bool operator==(const TeamStateTy &) const; + + void assertEqual(TeamStateTy &Other) const; + + /// ICVs + /// + /// Preallocated storage for ICV values that are used if the threads have not + /// set a custom default. The latter is supported but unlikely and slow(er). + /// + ///{ + ICVStateTy ICVState; + ///} + + uint32_t ParallelTeamSize; + ParallelRegionFnTy ParallelRegionFnVar; +}; + +extern TeamStateTy TeamState; +#pragma omp allocate(TeamState) allocator(omp_pteam_mem_alloc) + +struct ThreadStateTy { + + /// ICVs have preallocated storage in the TeamStateTy which is used if a + /// thread has not set a custom value. The latter is supported but unlikely. + /// When it happens we will allocate dynamic memory to hold the values of all + /// ICVs. Thus, the first time an ICV is set by a thread we will allocate an + /// ICV struct to hold them all. This is slower than alternatives but allows + /// users to pay only for what they use. + /// + state::ICVStateTy ICVState; + + ThreadStateTy *PreviousThreadState; + + void init() { + ICVState = TeamState.ICVState; + PreviousThreadState = nullptr; + } + + void init(ThreadStateTy *PreviousTS) { + ICVState = PreviousTS ? PreviousTS->ICVState : TeamState.ICVState; + PreviousThreadState = PreviousTS; + } +}; + +extern ThreadStateTy *ThreadStates[mapping::MaxThreadsPerTeam]; +#pragma omp allocate(ThreadStates) allocator(omp_pteam_mem_alloc) + /// Initialize the state machinery. Must be called by all threads. 
void init(bool IsSPMD); @@ -54,8 +142,73 @@ struct DateEnvironmentRAII { /// TODO void resetStateForThread(uint32_t TId); -uint32_t &lookup32(ValueKind VK, bool IsReadonly, IdentTy *Ident); -void *&lookupPtr(ValueKind VK, bool IsReadonly); +inline uint32_t &lookupForModify32Impl(uint32_t state::ICVStateTy::*Var, + IdentTy *Ident) { + if (OMP_LIKELY(!config::mayUseThreadStates() || + TeamState.ICVState.LevelVar == 0)) + return TeamState.ICVState.*Var; + uint32_t TId = mapping::getThreadIdInBlock(); + if (OMP_UNLIKELY(!ThreadStates[TId])) { + ThreadStates[TId] = reinterpret_cast<ThreadStateTy *>(memory::allocGlobal( + sizeof(ThreadStateTy), "ICV modification outside data environment")); + ASSERT(ThreadStates[TId] != nullptr && "Nullptr returned by malloc!"); + ThreadStates[TId]->init(); + } + return ThreadStates[TId]->ICVState.*Var; +} + +inline uint32_t &lookupImpl(uint32_t state::ICVStateTy::*Var) { + auto TId = mapping::getThreadIdInBlock(); + if (OMP_UNLIKELY(config::mayUseThreadStates() && ThreadStates[TId])) + return ThreadStates[TId]->ICVState.*Var; + return TeamState.ICVState.*Var; +} + +__attribute__((always_inline, flatten)) inline uint32_t & +lookup32(ValueKind Kind, bool IsReadonly, IdentTy *Ident) { + switch (Kind) { + case state::VK_NThreads: + if (IsReadonly) + return lookupImpl(&ICVStateTy::NThreadsVar); + return lookupForModify32Impl(&ICVStateTy::NThreadsVar, Ident); + case state::VK_Level: + if (IsReadonly) + return lookupImpl(&ICVStateTy::LevelVar); + return lookupForModify32Impl(&ICVStateTy::LevelVar, Ident); + case state::VK_ActiveLevel: + if (IsReadonly) + return lookupImpl(&ICVStateTy::ActiveLevelVar); + return lookupForModify32Impl(&ICVStateTy::ActiveLevelVar, Ident); + case state::VK_MaxActiveLevels: + if (IsReadonly) + return lookupImpl(&ICVStateTy::MaxActiveLevelsVar); + return lookupForModify32Impl(&ICVStateTy::MaxActiveLevelsVar, Ident); + case state::VK_RunSched: + if (IsReadonly) + return lookupImpl(&ICVStateTy::RunSchedVar); + return 
lookupForModify32Impl(&ICVStateTy::RunSchedVar, Ident); + case state::VK_RunSchedChunk: + if (IsReadonly) + return lookupImpl(&ICVStateTy::RunSchedChunkVar); + return lookupForModify32Impl(&ICVStateTy::RunSchedChunkVar, Ident); + case state::VK_ParallelTeamSize: + return TeamState.ParallelTeamSize; + default: + break; + } + __builtin_unreachable(); +} + +__attribute__((always_inline, flatten)) inline void *& +lookupPtr(ValueKind Kind, bool IsReadonly) { + switch (Kind) { + case state::VK_ParallelRegionFn: + return TeamState.ParallelRegionFnVar; + default: + break; + } + __builtin_unreachable(); +} /// A class without actual state used to provide a nice interface to lookup and /// update ICV values we can declare in global scope. @@ -181,29 +334,6 @@ inline state::Value<uint32_t, state::VK_RunSched> RunSched; } // namespace icv -namespace memory { - -/// Alloca \p Size bytes in shared memory, if possible, for \p Reason. -/// -/// Note: See the restrictions on __kmpc_alloc_shared for proper usage. -void *allocShared(uint64_t Size, const char *Reason); - -/// Free \p Ptr, alloated via allocShared, for \p Reason. -/// -/// Note: See the restrictions on __kmpc_free_shared for proper usage. -void freeShared(void *Ptr, uint64_t Bytes, const char *Reason); - -/// Alloca \p Size bytes in global memory, if possible, for \p Reason. -void *allocGlobal(uint64_t Size, const char *Reason); - -/// Return a pointer to the dynamic shared memory buffer. -void *getDynamicBuffer(); - -/// Free \p Ptr, alloated via allocGlobal, for \p Reason. 
-void freeGlobal(void *Ptr, const char *Reason); - -} // namespace memory - } // namespace _OMP #pragma omp end declare target diff --git a/openmp/libomptarget/DeviceRTL/src/State.cpp b/openmp/libomptarget/DeviceRTL/src/State.cpp index 312600062813..02de82e435f7 100644 --- a/openmp/libomptarget/DeviceRTL/src/State.cpp +++ b/openmp/libomptarget/DeviceRTL/src/State.cpp @@ -12,10 +12,8 @@ #include "Configuration.h" #include "Debug.h" #include "Interface.h" -#include "Mapping.h" #include "Synchronization.h" #include "Types.h" -#include "Utils.h" using namespace _OMP; @@ -180,22 +178,7 @@ void memory::freeGlobal(void *Ptr, const char *Reason) { free(Ptr); } ///} -namespace { - -struct ICVStateTy { - uint32_t NThreadsVar; - uint32_t LevelVar; - uint32_t ActiveLevelVar; - uint32_t MaxActiveLevelsVar; - uint32_t RunSchedVar; - uint32_t RunSchedChunkVar; - - bool operator==(const ICVStateTy &Other) const; - - void assertEqual(const ICVStateTy &Other) const; -}; - -bool ICVStateTy::operator==(const ICVStateTy &Other) const { +bool state::ICVStateTy::operator==(const ICVStateTy &Other) const { return (NThreadsVar == Other.NThreadsVar) & (LevelVar == Other.LevelVar) & (ActiveLevelVar == Other.ActiveLevelVar) & (MaxActiveLevelsVar == Other.MaxActiveLevelsVar) & @@ -203,7 +186,7 @@ bool ICVStateTy::operator==(const ICVStateTy &Other) const { (RunSchedChunkVar == Other.RunSchedChunkVar); } -void ICVStateTy::assertEqual(const ICVStateTy &Other) const { +void state::ICVStateTy::assertEqual(const ICVStateTy &Other) const { ASSERT(NThreadsVar == Other.NThreadsVar); ASSERT(LevelVar == Other.LevelVar); ASSERT(ActiveLevelVar == Other.ActiveLevelVar); @@ -212,30 +195,7 @@ void ICVStateTy::assertEqual(const ICVStateTy &Other) const { ASSERT(RunSchedChunkVar == Other.RunSchedChunkVar); } -struct TeamStateTy { - /// TODO: provide a proper init function. 
- void init(bool IsSPMD); - - bool operator==(const TeamStateTy &) const; - - void assertEqual(TeamStateTy &Other) const; - - /// ICVs - /// - /// Preallocated storage for ICV values that are used if the threads have not - /// set a custom default. The latter is supported but unlikely and slow(er). - /// - ///{ - ICVStateTy ICVState; - ///} - - uint32_t ParallelTeamSize; - ParallelRegionFnTy ParallelRegionFnVar; -}; - -TeamStateTy SHARED(TeamState); - -void TeamStateTy::init(bool IsSPMD) { +void state::TeamStateTy::init(bool IsSPMD) { ICVState.NThreadsVar = mapping::getBlockSize(IsSPMD); ICVState.LevelVar = 0; ICVState.ActiveLevelVar = 0; @@ -246,65 +206,24 @@ void TeamStateTy::init(bool IsSPMD) { ParallelRegionFnVar = nullptr; } -bool TeamStateTy::operator==(const TeamStateTy &Other) const { +bool state::TeamStateTy::operator==(const TeamStateTy &Other) const { return (ICVState == Other.ICVState) & (ParallelTeamSize == Other.ParallelTeamSize); } -void TeamStateTy::assertEqual(TeamStateTy &Other) const { +void state::TeamStateTy::assertEqual(TeamStateTy &Other) const { ICVState.assertEqual(Other.ICVState); ASSERT(ParallelTeamSize == Other.ParallelTeamSize); } -struct ThreadStateTy { +namespace { - /// ICVs have preallocated storage in the TeamStateTy which is used if a - /// thread has not set a custom value. The latter is supported but unlikely. - /// When it happens we will allocate dynamic memory to hold the values of all - /// ICVs. Thus, the first time an ICV is set by a thread we will allocate an - /// ICV struct to hold them all. This is slower than alternatives but allows - /// users to pay only for what they use. - /// - ICVStateTy ICVState; - - ThreadStateTy *PreviousThreadState; - - void init() { - ICVState = TeamState.ICVState; - PreviousThreadState = nullptr; - } - - void init(ThreadStateTy *PreviousTS) { - ICVState = PreviousTS ? 
PreviousTS->ICVState : TeamState.ICVState; - PreviousThreadState = PreviousTS; - } -}; +state::TeamStateTy SHARED(TeamState); __attribute__((loader_uninitialized)) -ThreadStateTy *ThreadStates[mapping::MaxThreadsPerTeam]; +state::ThreadStateTy *ThreadStates[mapping::MaxThreadsPerTeam]; #pragma omp allocate(ThreadStates) allocator(omp_pteam_mem_alloc) -uint32_t &lookupForModify32Impl(uint32_t ICVStateTy::*Var, IdentTy *Ident) { - if (OMP_LIKELY(!config::mayUseThreadStates() || - TeamState.ICVState.LevelVar == 0)) - return TeamState.ICVState.*Var; - uint32_t TId = mapping::getThreadIdInBlock(); - if (OMP_UNLIKELY(!ThreadStates[TId])) { - ThreadStates[TId] = reinterpret_cast<ThreadStateTy *>(memory::allocGlobal( - sizeof(ThreadStateTy), "ICV modification outside data environment")); - ASSERT(ThreadStates[TId] != nullptr && "Nullptr returned by malloc!"); - ThreadStates[TId]->init(); - } - return ThreadStates[TId]->ICVState.*Var; -} - -template <typename IntTy> IntTy &lookupImpl(IntTy ICVStateTy::*Var) { - IntTy TId = mapping::getThreadIdInBlock(); - if (OMP_UNLIKELY(config::mayUseThreadStates() && ThreadStates[TId])) - return ThreadStates[TId]->ICVState.*Var; - return TeamState.ICVState.*Var; -} - int returnValIfLevelIsActive(int Level, int Val, int DefaultVal, int OutOfBoundsVal = -1) { if (Level == 0) @@ -320,50 +239,6 @@ int returnValIfLevelIsActive(int Level, int Val, int DefaultVal, } // namespace -uint32_t &state::lookup32(ValueKind Kind, bool IsReadonly, IdentTy *Ident) { - switch (Kind) { - case state::VK_NThreads: - if (IsReadonly) - return lookupImpl(&ICVStateTy::NThreadsVar); - return lookupForModify32Impl(&ICVStateTy::NThreadsVar, Ident); - case state::VK_Level: - if (IsReadonly) - return lookupImpl(&ICVStateTy::LevelVar); - return lookupForModify32Impl(&ICVStateTy::LevelVar, Ident); - case state::VK_ActiveLevel: - if (IsReadonly) - return lookupImpl(&ICVStateTy::ActiveLevelVar); - return lookupForModify32Impl(&ICVStateTy::ActiveLevelVar, Ident); - case state::VK_MaxActiveLevels: - if 
(IsReadonly) - return lookupImpl(&ICVStateTy::MaxActiveLevelsVar); - return lookupForModify32Impl(&ICVStateTy::MaxActiveLevelsVar, Ident); - case state::VK_RunSched: - if (IsReadonly) - return lookupImpl(&ICVStateTy::RunSchedVar); - return lookupForModify32Impl(&ICVStateTy::RunSchedVar, Ident); - case state::VK_RunSchedChunk: - if (IsReadonly) - return lookupImpl(&ICVStateTy::RunSchedChunkVar); - return lookupForModify32Impl(&ICVStateTy::RunSchedChunkVar, Ident); - case state::VK_ParallelTeamSize: - return TeamState.ParallelTeamSize; - default: - break; - } - __builtin_unreachable(); -} - -void *&state::lookupPtr(ValueKind Kind, bool IsReadonly) { - switch (Kind) { - case state::VK_ParallelRegionFn: - return TeamState.ParallelRegionFnVar; - default: - break; - } - __builtin_unreachable(); -} - void state::init(bool IsSPMD) { SharedMemorySmartStack.init(IsSPMD); if (mapping::isInitialThreadInLevel0(IsSPMD)) {