[OPENMP][NVPTX]Correctly handle L2 parallelism in SPMD mode.

Summary:
The parallelLevel counter must be maintained on a per-thread basis to fully
support L2+ parallelism; otherwise we may end up with undefined behavior.
Introduce parallelLevel on a per-warp basis using shared memory. This
avoids the synchronization problems and allows full support of L2+
parallelism in SPMD mode with no runtime.

Reviewers: gtbercea, grokos

Subscribers: guansong, jdoerfert, caomhin, kkwli0, openmp-commits

Tags: #openmp

Differential Revision: https://reviews.llvm.org/D60918

llvm-svn: 359341
This commit is contained in:
Alexey Bataev 2019-04-26 19:30:34 +00:00
parent 5ddc6d180c
commit c03fe73176
8 changed files with 50 additions and 26 deletions

View File

@ -165,7 +165,7 @@ EXTERN int omp_get_level(void) {
ASSERT0(LT_FUSSY, isSPMDMode(),
"Expected SPMD mode only with uninitialized runtime.");
// parallelLevel starts from 0, need to add 1 for correct level.
return parallelLevel + 1;
return parallelLevel[GetWarpId()] + 1;
}
int level = 0;
omptarget_nvptx_TaskDescr *currTaskDescr =

View File

@ -31,7 +31,7 @@ __device__ omptarget_nvptx_SimpleMemoryManager
__device__ __shared__ uint32_t usedMemIdx;
__device__ __shared__ uint32_t usedSlotIdx;
__device__ __shared__ uint8_t parallelLevel;
__device__ __shared__ uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
// Pointer to this team's OpenMP state object
__device__ __shared__

View File

@ -95,8 +95,10 @@ EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime,
// If OMP runtime is not required don't initialize OMP state.
setExecutionParameters(Spmd, RuntimeUninitialized);
if (GetThreadIdInBlock() == 0) {
parallelLevel = 0;
usedSlotIdx = smid() % MAX_SM;
parallelLevel[0] = 0;
} else if (GetLaneId() == 0) {
parallelLevel[GetWarpId()] = 0;
}
__SYNCTHREADS();
return;

View File

@ -406,7 +406,8 @@ extern __device__ omptarget_nvptx_SimpleMemoryManager
omptarget_nvptx_simpleMemoryManager;
extern __device__ __shared__ uint32_t usedMemIdx;
extern __device__ __shared__ uint32_t usedSlotIdx;
extern __device__ __shared__ uint8_t parallelLevel;
extern __device__ __shared__ uint8_t
parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
extern __device__ __shared__
omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;

View File

@ -339,10 +339,12 @@ EXTERN void __kmpc_serialized_parallel(kmp_Ident *loc, uint32_t global_tid) {
if (checkRuntimeUninitialized(loc)) {
ASSERT0(LT_FUSSY, checkSPMDMode(loc),
"Expected SPMD mode with uninitialized runtime.");
__SYNCTHREADS();
if (GetThreadIdInBlock() == 0)
++parallelLevel;
__SYNCTHREADS();
unsigned tnum = __ACTIVEMASK();
int leader = __ffs(tnum) - 1;
__SHFL_SYNC(tnum, leader, leader);
if (GetLaneId() == leader)
++parallelLevel[GetWarpId()];
__SHFL_SYNC(tnum, leader, leader);
return;
}
@ -382,10 +384,12 @@ EXTERN void __kmpc_end_serialized_parallel(kmp_Ident *loc,
if (checkRuntimeUninitialized(loc)) {
ASSERT0(LT_FUSSY, checkSPMDMode(loc),
"Expected SPMD mode with uninitialized runtime.");
__SYNCTHREADS();
if (GetThreadIdInBlock() == 0)
--parallelLevel;
__SYNCTHREADS();
unsigned tnum = __ACTIVEMASK();
int leader = __ffs(tnum) - 1;
__SHFL_SYNC(tnum, leader, leader);
if (GetLaneId() == leader)
--parallelLevel[GetWarpId()];
__SHFL_SYNC(tnum, leader, leader);
return;
}
@ -407,7 +411,7 @@ EXTERN uint16_t __kmpc_parallel_level(kmp_Ident *loc, uint32_t global_tid) {
if (checkRuntimeUninitialized(loc)) {
ASSERT0(LT_FUSSY, checkSPMDMode(loc),
"Expected SPMD mode with uninitialized runtime.");
return parallelLevel + 1;
return parallelLevel[GetWarpId()] + 1;
}
int threadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc));

View File

@ -40,6 +40,8 @@ INLINE int GetThreadIdInBlock();
INLINE int GetBlockIdInKernel();
INLINE int GetNumberOfBlocksInKernel();
INLINE int GetNumberOfThreadsInBlock();
INLINE unsigned GetWarpId();
INLINE unsigned GetLaneId();
// get global ids to locate thread/team info (constant regardless of OMP)
INLINE int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode);

View File

@ -102,6 +102,10 @@ INLINE int GetNumberOfBlocksInKernel() { return gridDim.x; }
// Returns blockDim.x, i.e. the x-dimension size of the calling thread's block.
INLINE int GetNumberOfThreadsInBlock() { return blockDim.x; }
// Returns threadIdx.x / WARPSIZE: the index of the WARPSIZE-wide group of
// threads (within the block) that the calling thread belongs to. Only the
// x component of threadIdx is considered.
INLINE unsigned GetWarpId() { return threadIdx.x / WARPSIZE; }
// Returns threadIdx.x & (WARPSIZE - 1): the calling thread's position inside
// its WARPSIZE-wide group. Only the x component of threadIdx is considered.
// NOTE(review): the mask equals threadIdx.x % WARPSIZE only if WARPSIZE is a
// power of two -- confirm against the definition of WARPSIZE.
INLINE unsigned GetLaneId() { return threadIdx.x & (WARPSIZE - 1); }
////////////////////////////////////////////////////////////////////////////////
//
// Calls to the Generic Scheme Implementation Layer (assuming 1D layout)
@ -154,7 +158,7 @@ INLINE int GetOmpThreadId(int threadId, bool isSPMDExecutionMode,
ASSERT0(LT_FUSSY, isSPMDExecutionMode,
"Uninitialized runtime with non-SPMD mode.");
// For level 2 parallelism all parallel regions are executed sequentially.
if (parallelLevel > 0)
if (parallelLevel[GetWarpId()] > 0)
rc = 0;
else
rc = GetThreadIdInBlock();
@ -175,7 +179,7 @@ INLINE int GetNumberOfOmpThreads(int threadId, bool isSPMDExecutionMode,
ASSERT0(LT_FUSSY, isSPMDExecutionMode,
"Uninitialized runtime with non-SPMD mode.");
// For level 2 parallelism all parallel regions are executed sequentially.
if (parallelLevel > 0)
if (parallelLevel[GetWarpId()] > 0)
rc = 1;
else
rc = GetNumberOfThreadsInBlock();

View File

@ -6,24 +6,31 @@
int main(void) {
int isHost = -1;
int ParallelLevel1 = -1, ParallelLevel2 = -1;
int Count = 0;
#pragma omp target parallel for map(tofrom \
: isHost, ParallelLevel1, ParallelLevel2)
: isHost, ParallelLevel1, ParallelLevel2), reduction(+: Count) schedule(static, 1)
for (int J = 0; J < 10; ++J) {
#pragma omp critical
{
isHost = (isHost < 0 || isHost == omp_is_initial_device())
? omp_is_initial_device()
: 1;
ParallelLevel1 =
(ParallelLevel1 < 0 || ParallelLevel1 == 1) ? omp_get_level() : 2;
isHost = (isHost < 0 || isHost == 0) ? omp_is_initial_device() : isHost;
ParallelLevel1 = (ParallelLevel1 < 0 || ParallelLevel1 == 1)
? omp_get_level()
: ParallelLevel1;
}
int L2;
#pragma omp parallel for schedule(dynamic) lastprivate(L2)
for (int I = 0; I < 10; ++I)
L2 = omp_get_level();
if (omp_get_thread_num() > 5) {
int L2;
#pragma omp parallel for schedule(dynamic) lastprivate(L2) reduction(+: Count)
for (int I = 0; I < 10; ++I) {
L2 = omp_get_level();
Count += omp_get_level(); // (10-6)*10*2 = 80
}
#pragma omp critical
ParallelLevel2 = (ParallelLevel2 < 0 || ParallelLevel2 == 2) ? L2 : 1;
ParallelLevel2 =
(ParallelLevel2 < 0 || ParallelLevel2 == 2) ? L2 : ParallelLevel2;
} else {
Count += omp_get_level(); // 6 * 1 = 6
}
}
if (isHost < 0) {
@ -35,6 +42,10 @@ int main(void) {
// CHECK: Parallel level in SPMD mode: L1 is 1, L2 is 2
printf("Parallel level in SPMD mode: L1 is %d, L2 is %d\n", ParallelLevel1,
ParallelLevel2);
// Final result of Count is (10-6)(num of loops)*10(num of iterations)*2(par
// level) + 6(num of iterations) * 1(par level)
// CHECK: Expected count = 86
printf("Expected count = %d\n", Count);
return isHost;
}