[OpenMP] Add flag for disabling thread state in runtime

The runtime uses thread state values to indicate when we use an ICV or
are in nested parallelism. This is done for OpenMP correctness, but it
not needed in the majority of cases. The new flag added is
`-fopenmp-assume-no-thread-state`.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D120106
This commit is contained in:
Joseph Huber 2022-02-17 19:09:46 -05:00
parent 03ec026eac
commit 0870a4f59a
8 changed files with 41 additions and 4 deletions

View File

@ -246,6 +246,7 @@ LANGOPT(OpenMPTargetDebug , 32, 0, "Enable debugging in the OpenMP offloading de
LANGOPT(OpenMPOptimisticCollapse , 1, 0, "Use at most 32 bits to represent the collapsed loop nest counter.")
LANGOPT(OpenMPThreadSubscription , 1, 0, "Assume work-shared loops do not have more iterations than participating threads.")
LANGOPT(OpenMPTeamSubscription , 1, 0, "Assume distributed loops do not have more iterations than participating teams.")
LANGOPT(OpenMPNoThreadState , 1, 0, "Assume that no thread in a parallel region will modify an ICV.")
LANGOPT(RenderScript , 1, 0, "RenderScript")
LANGOPT(CUDAIsDevice , 1, 0, "compiling for CUDA device")

View File

@ -2473,6 +2473,10 @@ def fno_openmp_assume_teams_oversubscription : Flag<["-"], "fno-openmp-assume-te
Group<f_Group>, Flags<[CC1Option, NoArgumentUnused, HelpHidden]>;
def fno_openmp_assume_threads_oversubscription : Flag<["-"], "fno-openmp-assume-threads-oversubscription">,
Group<f_Group>, Flags<[CC1Option, NoArgumentUnused, HelpHidden]>;
def fopenmp_assume_no_thread_state : Flag<["-"], "fopenmp-assume-no-thread-state">, Group<f_Group>,
Flags<[CC1Option, NoArgumentUnused, HelpHidden]>,
HelpText<"Assert no thread in a parallel region modifies an ICV">,
MarshallingInfoFlag<LangOpts<"OpenMPNoThreadState">>;
defm openmp_target_new_runtime: BoolFOption<"openmp-target-new-runtime",
LangOpts<"OpenMPTargetNewRuntime">, DefaultTrue,
PosFlag<SetTrue, [CC1Option], "Use the new bitcode library for OpenMP offloading">,

View File

@ -1210,6 +1210,8 @@ CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM)
"__omp_rtl_assume_teams_oversubscription");
OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPThreadSubscription,
"__omp_rtl_assume_threads_oversubscription");
OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoThreadState,
"__omp_rtl_assume_no_thread_state");
}
}

View File

@ -5995,6 +5995,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
options::OPT_fno_openmp_assume_threads_oversubscription,
/*Default=*/false))
CmdArgs.push_back("-fopenmp-assume-threads-oversubscription");
if (Args.hasArg(options::OPT_fopenmp_assume_no_thread_state))
CmdArgs.push_back("-fopenmp-assume-no-thread-state");
break;
default:
// By default, if Clang doesn't know how to generate useful OpenMP code

View File

@ -6,6 +6,7 @@
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-DEFAULT
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-threads-oversubscription -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-THREADS
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-teams-oversubscription -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-TEAMS
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-no-thread-state -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-STATE
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-teams-oversubscription -fopenmp-is-device -o - | FileCheck %s --check-prefix=CHECK-RUNTIME
// expected-no-diagnostics
@ -16,26 +17,37 @@
// CHECK: @__omp_rtl_debug_kind = weak_odr hidden constant i32 1
// CHECK: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0
// CHECK: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0
// CHECK: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0
//.
// CHECK-EQ: @__omp_rtl_debug_kind = weak_odr hidden constant i32 111
// CHECK-EQ: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0
// CHECK-EQ: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0
// CHECK-EQ: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0
//.
// CHECK-DEFAULT: @__omp_rtl_debug_kind = weak_odr hidden constant i32 0
// CHECK-DEFAULT: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0
// CHECK-DEFAULT: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0
// CHECK-DEFAULT: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0
//.
// CHECK-THREADS: @__omp_rtl_debug_kind = weak_odr hidden constant i32 0
// CHECK-THREADS: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0
// CHECK-THREADS: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 1
// CHECK-THREADS: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0
//.
// CHECK-TEAMS: @__omp_rtl_debug_kind = weak_odr hidden constant i32 0
// CHECK-TEAMS: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 1
// CHECK-TEAMS: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0
// CHECK-TEAMS: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0
//.
// CHECK-STATE: @__omp_rtl_debug_kind = weak_odr hidden constant i32 0
// CHECK-STATE: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0
// CHECK-STATE: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0
// CHECK-STATE: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 1
//.
// CHECK-RUNTIME-NOT: @__omp_rtl_debug_kind = weak_odr hidden constant i32 0
// CHECK-RUNTIME-NOT: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 1
// CHECK-RUNTIME-NOT: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0
// CHECK-RUNTIME-NOT: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0
//.
void foo() {
#pragma omp target

View File

@ -38,8 +38,13 @@ uint32_t getDebugKind();
/// Return the amount of dynamic shared memory that was allocated at launch.
uint64_t getDynamicMemorySize();
/// Return if debugging is enabled for the given debug kind.
bool isDebugMode(DebugKind Level);
/// Indicates if this kernel may require thread-specific states, or if it was
/// explicitly disabled by the user.
bool mayUseThreadStates();
} // namespace config
} // namespace _OMP

View File

@ -20,7 +20,9 @@ using namespace _OMP;
#pragma omp declare target
extern uint32_t __omp_rtl_debug_kind; // defined by CGOpenMPRuntimeGPU
// defined by CGOpenMPRuntimeGPU
extern uint32_t __omp_rtl_debug_kind;
extern uint32_t __omp_rtl_assume_no_thread_state;
// TODO: We want to change the name as soon as the old runtime is gone.
// This variable should be visibile to the plugin so we override the default
@ -48,4 +50,6 @@ bool config::isDebugMode(config::DebugKind Kind) {
return config::getDebugKind() & Kind;
}
bool config::mayUseThreadStates() { return !__omp_rtl_assume_no_thread_state; }
#pragma omp end declare target

View File

@ -285,7 +285,8 @@ ThreadStateTy *ThreadStates[mapping::MaxThreadsPerTeam];
#pragma omp allocate(ThreadStates) allocator(omp_pteam_mem_alloc)
uint32_t &lookupForModify32Impl(uint32_t ICVStateTy::*Var, IdentTy *Ident) {
if (OMP_LIKELY(TeamState.ICVState.LevelVar == 0))
if (OMP_LIKELY(!config::mayUseThreadStates() ||
TeamState.ICVState.LevelVar == 0))
return TeamState.ICVState.*Var;
uint32_t TId = mapping::getThreadIdInBlock();
if (!ThreadStates[TId]) {
@ -299,13 +300,13 @@ uint32_t &lookupForModify32Impl(uint32_t ICVStateTy::*Var, IdentTy *Ident) {
uint32_t &lookup32Impl(uint32_t ICVStateTy::*Var) {
uint32_t TId = mapping::getThreadIdInBlock();
if (OMP_UNLIKELY(ThreadStates[TId]))
if (OMP_UNLIKELY(config::mayUseThreadStates() && ThreadStates[TId]))
return ThreadStates[TId]->ICVState.*Var;
return TeamState.ICVState.*Var;
}
uint64_t &lookup64Impl(uint64_t ICVStateTy::*Var) {
uint64_t TId = mapping::getThreadIdInBlock();
if (OMP_UNLIKELY(ThreadStates[TId]))
if (OMP_UNLIKELY(config::mayUseThreadStates() && ThreadStates[TId]))
return ThreadStates[TId]->ICVState.*Var;
return TeamState.ICVState.*Var;
}
@ -380,6 +381,9 @@ void state::init(bool IsSPMD) {
}
void state::enterDataEnvironment(IdentTy *Ident) {
ASSERT(config::mayUseThreadStates() &&
"Thread state modified while explicitly disabled!");
unsigned TId = mapping::getThreadIdInBlock();
ThreadStateTy *NewThreadState =
static_cast<ThreadStateTy *>(__kmpc_alloc_shared(sizeof(ThreadStateTy)));
@ -388,6 +392,9 @@ void state::enterDataEnvironment(IdentTy *Ident) {
}
void state::exitDataEnvironment() {
ASSERT(config::mayUseThreadStates() &&
"Thread state modified while explicitly disabled!");
unsigned TId = mapping::getThreadIdInBlock();
resetStateForThread(TId);
}