forked from OSchip/llvm-project
[OPENMP][NVPTX]Correctly handle L2 parallelism in SPMD mode.
Summary: The parallelLevel counter must be on per-thread basis to fully support L2+ parallelism, otherwise we may end up with undefined behavior. Introduce the parallelLevel on per-warp basis using shared memory. It allows to avoid the problems with the synchronization and allows fully support L2+ parallelism in SPMD mode with no runtime. Reviewers: gtbercea, grokos Subscribers: guansong, jdoerfert, caomhin, kkwli0, openmp-commits Tags: #openmp Differential Revision: https://reviews.llvm.org/D60918 llvm-svn: 359341
This commit is contained in:
parent
5ddc6d180c
commit
c03fe73176
|
@ -165,7 +165,7 @@ EXTERN int omp_get_level(void) {
|
|||
ASSERT0(LT_FUSSY, isSPMDMode(),
|
||||
"Expected SPMD mode only with uninitialized runtime.");
|
||||
// parallelLevel starts from 0, need to add 1 for correct level.
|
||||
return parallelLevel + 1;
|
||||
return parallelLevel[GetWarpId()] + 1;
|
||||
}
|
||||
int level = 0;
|
||||
omptarget_nvptx_TaskDescr *currTaskDescr =
|
||||
|
|
|
@ -31,7 +31,7 @@ __device__ omptarget_nvptx_SimpleMemoryManager
|
|||
__device__ __shared__ uint32_t usedMemIdx;
|
||||
__device__ __shared__ uint32_t usedSlotIdx;
|
||||
|
||||
__device__ __shared__ uint8_t parallelLevel;
|
||||
__device__ __shared__ uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
|
||||
|
||||
// Pointer to this team's OpenMP state object
|
||||
__device__ __shared__
|
||||
|
|
|
@ -95,8 +95,10 @@ EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime,
|
|||
// If OMP runtime is not required don't initialize OMP state.
|
||||
setExecutionParameters(Spmd, RuntimeUninitialized);
|
||||
if (GetThreadIdInBlock() == 0) {
|
||||
parallelLevel = 0;
|
||||
usedSlotIdx = smid() % MAX_SM;
|
||||
parallelLevel[0] = 0;
|
||||
} else if (GetLaneId() == 0) {
|
||||
parallelLevel[GetWarpId()] = 0;
|
||||
}
|
||||
__SYNCTHREADS();
|
||||
return;
|
||||
|
|
|
@ -406,7 +406,8 @@ extern __device__ omptarget_nvptx_SimpleMemoryManager
|
|||
omptarget_nvptx_simpleMemoryManager;
|
||||
extern __device__ __shared__ uint32_t usedMemIdx;
|
||||
extern __device__ __shared__ uint32_t usedSlotIdx;
|
||||
extern __device__ __shared__ uint8_t parallelLevel;
|
||||
extern __device__ __shared__ uint8_t
|
||||
parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
|
||||
extern __device__ __shared__
|
||||
omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;
|
||||
|
||||
|
|
|
@ -339,10 +339,12 @@ EXTERN void __kmpc_serialized_parallel(kmp_Ident *loc, uint32_t global_tid) {
|
|||
if (checkRuntimeUninitialized(loc)) {
|
||||
ASSERT0(LT_FUSSY, checkSPMDMode(loc),
|
||||
"Expected SPMD mode with uninitialized runtime.");
|
||||
__SYNCTHREADS();
|
||||
if (GetThreadIdInBlock() == 0)
|
||||
++parallelLevel;
|
||||
__SYNCTHREADS();
|
||||
unsigned tnum = __ACTIVEMASK();
|
||||
int leader = __ffs(tnum) - 1;
|
||||
__SHFL_SYNC(tnum, leader, leader);
|
||||
if (GetLaneId() == leader)
|
||||
++parallelLevel[GetWarpId()];
|
||||
__SHFL_SYNC(tnum, leader, leader);
|
||||
|
||||
return;
|
||||
}
|
||||
|
@ -382,10 +384,12 @@ EXTERN void __kmpc_end_serialized_parallel(kmp_Ident *loc,
|
|||
if (checkRuntimeUninitialized(loc)) {
|
||||
ASSERT0(LT_FUSSY, checkSPMDMode(loc),
|
||||
"Expected SPMD mode with uninitialized runtime.");
|
||||
__SYNCTHREADS();
|
||||
if (GetThreadIdInBlock() == 0)
|
||||
--parallelLevel;
|
||||
__SYNCTHREADS();
|
||||
unsigned tnum = __ACTIVEMASK();
|
||||
int leader = __ffs(tnum) - 1;
|
||||
__SHFL_SYNC(tnum, leader, leader);
|
||||
if (GetLaneId() == leader)
|
||||
--parallelLevel[GetWarpId()];
|
||||
__SHFL_SYNC(tnum, leader, leader);
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -407,7 +411,7 @@ EXTERN uint16_t __kmpc_parallel_level(kmp_Ident *loc, uint32_t global_tid) {
|
|||
if (checkRuntimeUninitialized(loc)) {
|
||||
ASSERT0(LT_FUSSY, checkSPMDMode(loc),
|
||||
"Expected SPMD mode with uninitialized runtime.");
|
||||
return parallelLevel + 1;
|
||||
return parallelLevel[GetWarpId()] + 1;
|
||||
}
|
||||
|
||||
int threadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
|
||||
|
|
|
@ -40,6 +40,8 @@ INLINE int GetThreadIdInBlock();
|
|||
INLINE int GetBlockIdInKernel();
|
||||
INLINE int GetNumberOfBlocksInKernel();
|
||||
INLINE int GetNumberOfThreadsInBlock();
|
||||
INLINE unsigned GetWarpId();
|
||||
INLINE unsigned GetLaneId();
|
||||
|
||||
// get global ids to locate tread/team info (constant regardless of OMP)
|
||||
INLINE int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode);
|
||||
|
|
|
@ -102,6 +102,10 @@ INLINE int GetNumberOfBlocksInKernel() { return gridDim.x; }
|
|||
|
||||
INLINE int GetNumberOfThreadsInBlock() { return blockDim.x; }
|
||||
|
||||
INLINE unsigned GetWarpId() { return threadIdx.x / WARPSIZE; }
|
||||
|
||||
INLINE unsigned GetLaneId() { return threadIdx.x & (WARPSIZE - 1); }
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// Calls to the Generic Scheme Implementation Layer (assuming 1D layout)
|
||||
|
@ -154,7 +158,7 @@ INLINE int GetOmpThreadId(int threadId, bool isSPMDExecutionMode,
|
|||
ASSERT0(LT_FUSSY, isSPMDExecutionMode,
|
||||
"Uninitialized runtime with non-SPMD mode.");
|
||||
// For level 2 parallelism all parallel regions are executed sequentially.
|
||||
if (parallelLevel > 0)
|
||||
if (parallelLevel[GetWarpId()] > 0)
|
||||
rc = 0;
|
||||
else
|
||||
rc = GetThreadIdInBlock();
|
||||
|
@ -175,7 +179,7 @@ INLINE int GetNumberOfOmpThreads(int threadId, bool isSPMDExecutionMode,
|
|||
ASSERT0(LT_FUSSY, isSPMDExecutionMode,
|
||||
"Uninitialized runtime with non-SPMD mode.");
|
||||
// For level 2 parallelism all parallel regions are executed sequentially.
|
||||
if (parallelLevel > 0)
|
||||
if (parallelLevel[GetWarpId()] > 0)
|
||||
rc = 1;
|
||||
else
|
||||
rc = GetNumberOfThreadsInBlock();
|
||||
|
|
|
@ -6,24 +6,31 @@
|
|||
int main(void) {
|
||||
int isHost = -1;
|
||||
int ParallelLevel1 = -1, ParallelLevel2 = -1;
|
||||
int Count = 0;
|
||||
|
||||
#pragma omp target parallel for map(tofrom \
|
||||
: isHost, ParallelLevel1, ParallelLevel2)
|
||||
: isHost, ParallelLevel1, ParallelLevel2), reduction(+: Count) schedule(static, 1)
|
||||
for (int J = 0; J < 10; ++J) {
|
||||
#pragma omp critical
|
||||
{
|
||||
isHost = (isHost < 0 || isHost == omp_is_initial_device())
|
||||
? omp_is_initial_device()
|
||||
: 1;
|
||||
ParallelLevel1 =
|
||||
(ParallelLevel1 < 0 || ParallelLevel1 == 1) ? omp_get_level() : 2;
|
||||
isHost = (isHost < 0 || isHost == 0) ? omp_is_initial_device() : isHost;
|
||||
ParallelLevel1 = (ParallelLevel1 < 0 || ParallelLevel1 == 1)
|
||||
? omp_get_level()
|
||||
: ParallelLevel1;
|
||||
}
|
||||
int L2;
|
||||
#pragma omp parallel for schedule(dynamic) lastprivate(L2)
|
||||
for (int I = 0; I < 10; ++I)
|
||||
L2 = omp_get_level();
|
||||
if (omp_get_thread_num() > 5) {
|
||||
int L2;
|
||||
#pragma omp parallel for schedule(dynamic) lastprivate(L2) reduction(+: Count)
|
||||
for (int I = 0; I < 10; ++I) {
|
||||
L2 = omp_get_level();
|
||||
Count += omp_get_level(); // (10-6)*10*2 = 80
|
||||
}
|
||||
#pragma omp critical
|
||||
ParallelLevel2 = (ParallelLevel2 < 0 || ParallelLevel2 == 2) ? L2 : 1;
|
||||
ParallelLevel2 =
|
||||
(ParallelLevel2 < 0 || ParallelLevel2 == 2) ? L2 : ParallelLevel2;
|
||||
} else {
|
||||
Count += omp_get_level(); // 6 * 1 = 6
|
||||
}
|
||||
}
|
||||
|
||||
if (isHost < 0) {
|
||||
|
@ -35,6 +42,10 @@ int main(void) {
|
|||
// CHECK: Parallel level in SPMD mode: L1 is 1, L2 is 2
|
||||
printf("Parallel level in SPMD mode: L1 is %d, L2 is %d\n", ParallelLevel1,
|
||||
ParallelLevel2);
|
||||
// Final result of Count is (10-6)(num of loops)*10(num of iterations)*2(par
|
||||
// level) + 6(num of iterations) * 1(par level)
|
||||
// CHECK: Expected count = 86
|
||||
printf("Expected count = %d\n", Count);
|
||||
|
||||
return isHost;
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue