diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu index d8049d9aaf42..5db443c3b331 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu @@ -311,7 +311,16 @@ EXTERN bool __kmpc_kernel_parallel(void **WorkFn, (int)newTaskDescr->ThreadId(), (int)nThreads); isActive = true; - IncParallelLevel(threadsInTeam != 1); + // Reconverge the threads at the end of the parallel region to correctly + // handle parallel levels. + // In Cuda9+ in non-SPMD mode we have either 1 worker thread or the whole + // warp. If only 1 thread is active, not need to reconverge the threads. + // If we have the whole warp, reconverge all the threads in the warp before + // actually trying to change the parallel level. Otherwise, parallel level + // can be changed incorrectly because of threads divergence. + bool IsActiveParallelRegion = threadsInTeam != 1; + IncParallelLevel(IsActiveParallelRegion, + IsActiveParallelRegion ? 0xFFFFFFFF : 1u); } return isActive; @@ -329,7 +338,16 @@ EXTERN void __kmpc_kernel_end_parallel() { omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr( threadId, currTaskDescr->GetPrevTaskDescr()); - DecParallelLevel(threadsInTeam != 1); + // Reconverge the threads at the end of the parallel region to correctly + // handle parallel levels. + // In Cuda9+ in non-SPMD mode we have either 1 worker thread or the whole + // warp. If only 1 thread is active, not need to reconverge the threads. + // If we have the whole warp, reconverge all the threads in the warp before + // actually trying to change the parallel level. Otherwise, parallel level can + // be changed incorrectly because of threads divergence. + bool IsActiveParallelRegion = threadsInTeam != 1; + DecParallelLevel(IsActiveParallelRegion, + IsActiveParallelRegion ? 0xFFFFFFFF : 1u); } //////////////////////////////////////////////////////////////////////////////// @@ -339,7 +357,7 @@ EXTERN void __kmpc_kernel_end_parallel() { EXTERN void __kmpc_serialized_parallel(kmp_Ident *loc, uint32_t global_tid) { PRINT0(LD_IO, "call to __kmpc_serialized_parallel\n"); - IncParallelLevel(/*ActiveParallel=*/false); + IncParallelLevel(/*ActiveParallel=*/false, __kmpc_impl_activemask()); if (checkRuntimeUninitialized(loc)) { ASSERT0(LT_FUSSY, checkSPMDMode(loc), @@ -378,7 +396,7 @@ EXTERN void __kmpc_end_serialized_parallel(kmp_Ident *loc, uint32_t global_tid) { PRINT0(LD_IO, "call to __kmpc_end_serialized_parallel\n"); - DecParallelLevel(/*ActiveParallel=*/false); + DecParallelLevel(/*ActiveParallel=*/false, __kmpc_impl_activemask()); if (checkRuntimeUninitialized(loc)) { ASSERT0(LT_FUSSY, checkSPMDMode(loc), diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/support.h b/openmp/libomptarget/deviceRTLs/nvptx/src/support.h index 4df75edfbaa3..e10f2a19d324 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/support.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/support.h @@ -10,6 +10,7 @@ // //===----------------------------------------------------------------------===// +#include "target_impl.h" //////////////////////////////////////////////////////////////////////////////// // Execution Parameters //////////////////////////////////////////////////////////////////////////////// @@ -65,8 +66,8 @@ INLINE int GetNumberOfProcsInDevice(bool isSPMDExecutionMode); INLINE int IsTeamMaster(int ompThreadId); // Parallel level -INLINE void IncParallelLevel(bool ActiveParallel); -INLINE void DecParallelLevel(bool ActiveParallel); +INLINE void IncParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask); +INLINE void DecParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask); //////////////////////////////////////////////////////////////////////////////// // Memory diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h b/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h index 1a0f1a9ecd17..d4da6ad73fa2 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h @@ -203,30 +203,28 @@ INLINE int IsTeamMaster(int ompThreadId) { return (ompThreadId == 0); } //////////////////////////////////////////////////////////////////////////////// // Parallel level -INLINE void IncParallelLevel(bool ActiveParallel) { - __kmpc_impl_lanemask_t Active = __kmpc_impl_activemask(); - __kmpc_impl_syncwarp(Active); +INLINE void IncParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask) { + __kmpc_impl_syncwarp(Mask); __kmpc_impl_lanemask_t LaneMaskLt = __kmpc_impl_lanemask_lt(); - unsigned Rank = __kmpc_impl_popc(Active & LaneMaskLt); + unsigned Rank = __kmpc_impl_popc(Mask & LaneMaskLt); if (Rank == 0) { parallelLevel[GetWarpId()] += (1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0)); __threadfence(); } - __kmpc_impl_syncwarp(Active); + __kmpc_impl_syncwarp(Mask); } -INLINE void DecParallelLevel(bool ActiveParallel) { - __kmpc_impl_lanemask_t Active = __kmpc_impl_activemask(); - __kmpc_impl_syncwarp(Active); +INLINE void DecParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask) { + __kmpc_impl_syncwarp(Mask); __kmpc_impl_lanemask_t LaneMaskLt = __kmpc_impl_lanemask_lt(); - unsigned Rank = __kmpc_impl_popc(Active & LaneMaskLt); + unsigned Rank = __kmpc_impl_popc(Mask & LaneMaskLt); if (Rank == 0) { parallelLevel[GetWarpId()] -= (1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0)); __threadfence(); } - __kmpc_impl_syncwarp(Active); + __kmpc_impl_syncwarp(Mask); } //////////////////////////////////////////////////////////////////////////////// diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/level.c b/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/level.c index edb00e09e60a..33e232464567 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/level.c +++ b/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/level.c @@ -135,5 +135,17 @@ int main(int argc, char *argv[]) { } } + // Check for paraller level in non-SPMD kernels. + level = 0; + #pragma omp target teams distribute num_teams(1) thread_limit(32) reduction(+:level) + for (int i=0; i<5032; i+=32) { + int ub = (i+32 > 5032) ? 5032 : i+32; + #pragma omp parallel for schedule(dynamic) + for (int j=i ; j < ub; j++) ; + level += omp_get_level(); + } + // CHECK: Integral level = 0. + printf("Integral level = %d.\n", level); + return 0; }