forked from OSchip/llvm-project
[OPENMP][NVPTX]Make runtime compatible with the original runtime.
Summary: Reworked the runtime to make it compatible with the requirements of the original runtime library. Also simplified some code to reduce the number of function calls. Reviewers: gtbercea, kkwli0 Subscribers: guansong, jfb, caomhin, openmp-commits Differential Revision: https://reviews.llvm.org/D55130 llvm-svn: 348003
This commit is contained in:
parent
1cfb796b58
commit
0f221f53d8
|
@ -93,9 +93,10 @@ public:
|
|||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Support for Static Init
|
||||
|
||||
INLINE static void for_static_init(int32_t schedtype, int32_t *plastiter,
|
||||
T *plower, T *pupper, ST *pstride,
|
||||
ST chunk, bool IsSPMDExecutionMode,
|
||||
INLINE static void for_static_init(int32_t gtid, int32_t schedtype,
|
||||
int32_t *plastiter, T *plower, T *pupper,
|
||||
ST *pstride, ST chunk,
|
||||
bool IsSPMDExecutionMode,
|
||||
bool IsRuntimeUninitialized) {
|
||||
// When IsRuntimeUninitialized is true, we assume that the caller is
|
||||
// in an L0 parallel region and that all worker threads participate.
|
||||
|
@ -112,108 +113,72 @@ public:
|
|||
PRINT(LD_LOOP,
|
||||
"OMP Thread %d: schedule type %d, chunk size = %lld, mytid "
|
||||
"%d, num tids %d\n",
|
||||
GetOmpThreadId(tid, IsSPMDExecutionMode, IsRuntimeUninitialized),
|
||||
schedtype, P64(chunk),
|
||||
GetOmpThreadId(tid, IsSPMDExecutionMode, IsRuntimeUninitialized),
|
||||
GetNumberOfOmpThreads(tid, IsSPMDExecutionMode,
|
||||
IsRuntimeUninitialized));
|
||||
ASSERT0(
|
||||
LT_FUSSY,
|
||||
(GetOmpThreadId(tid, IsSPMDExecutionMode, IsRuntimeUninitialized)) <
|
||||
(GetNumberOfOmpThreads(tid, IsSPMDExecutionMode,
|
||||
IsRuntimeUninitialized)),
|
||||
"current thread is not needed here; error");
|
||||
gtid, schedtype, P64(chunk), gtid, numberOfActiveOMPThreads);
|
||||
ASSERT0(LT_FUSSY, gtid < numberOfActiveOMPThreads,
|
||||
"current thread is not needed here; error");
|
||||
|
||||
// copy
|
||||
int lastiter = 0;
|
||||
T lb = *plower;
|
||||
T ub = *pupper;
|
||||
ST stride = *pstride;
|
||||
T entityId, numberOfEntities;
|
||||
// init
|
||||
switch (SCHEDULE_WITHOUT_MODIFIERS(schedtype)) {
|
||||
case kmp_sched_static_chunk: {
|
||||
if (chunk > 0) {
|
||||
entityId =
|
||||
GetOmpThreadId(tid, IsSPMDExecutionMode, IsRuntimeUninitialized);
|
||||
numberOfEntities = GetNumberOfOmpThreads(tid, IsSPMDExecutionMode,
|
||||
IsRuntimeUninitialized);
|
||||
ForStaticChunk(lastiter, lb, ub, stride, chunk, entityId,
|
||||
numberOfEntities);
|
||||
ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
|
||||
numberOfActiveOMPThreads);
|
||||
break;
|
||||
}
|
||||
} // note: if chunk <=0, use nochunk
|
||||
case kmp_sched_static_balanced_chunk: {
|
||||
if (chunk > 0) {
|
||||
entityId =
|
||||
GetOmpThreadId(tid, IsSPMDExecutionMode, IsRuntimeUninitialized);
|
||||
numberOfEntities = GetNumberOfOmpThreads(tid, IsSPMDExecutionMode,
|
||||
IsRuntimeUninitialized);
|
||||
|
||||
// round up to make sure the chunk is enough to cover all iterations
|
||||
T tripCount = ub - lb + 1; // +1 because ub is inclusive
|
||||
T span = (tripCount + numberOfEntities - 1) / numberOfEntities;
|
||||
T span = (tripCount + numberOfActiveOMPThreads - 1) /
|
||||
numberOfActiveOMPThreads;
|
||||
// perform chunk adjustment
|
||||
chunk = (span + chunk - 1) & ~(chunk - 1);
|
||||
|
||||
ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb.");
|
||||
T oldUb = ub;
|
||||
ForStaticChunk(lastiter, lb, ub, stride, chunk, entityId,
|
||||
numberOfEntities);
|
||||
ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
|
||||
numberOfActiveOMPThreads);
|
||||
if (ub > oldUb)
|
||||
ub = oldUb;
|
||||
break;
|
||||
}
|
||||
} // note: if chunk <=0, use nochunk
|
||||
case kmp_sched_static_nochunk: {
|
||||
entityId =
|
||||
GetOmpThreadId(tid, IsSPMDExecutionMode, IsRuntimeUninitialized);
|
||||
numberOfEntities = GetNumberOfOmpThreads(tid, IsSPMDExecutionMode,
|
||||
IsRuntimeUninitialized);
|
||||
ForStaticNoChunk(lastiter, lb, ub, stride, chunk, entityId,
|
||||
numberOfEntities);
|
||||
ForStaticNoChunk(lastiter, lb, ub, stride, chunk, gtid,
|
||||
numberOfActiveOMPThreads);
|
||||
break;
|
||||
}
|
||||
case kmp_sched_distr_static_chunk: {
|
||||
if (chunk > 0) {
|
||||
entityId = GetOmpTeamId();
|
||||
numberOfEntities = GetNumberOfOmpTeams();
|
||||
ForStaticChunk(lastiter, lb, ub, stride, chunk, entityId,
|
||||
numberOfEntities);
|
||||
ForStaticChunk(lastiter, lb, ub, stride, chunk, GetOmpTeamId(),
|
||||
GetNumberOfOmpTeams());
|
||||
break;
|
||||
} // note: if chunk <=0, use nochunk
|
||||
}
|
||||
case kmp_sched_distr_static_nochunk: {
|
||||
entityId = GetOmpTeamId();
|
||||
numberOfEntities = GetNumberOfOmpTeams();
|
||||
|
||||
ForStaticNoChunk(lastiter, lb, ub, stride, chunk, entityId,
|
||||
numberOfEntities);
|
||||
ForStaticNoChunk(lastiter, lb, ub, stride, chunk, GetOmpTeamId(),
|
||||
GetNumberOfOmpTeams());
|
||||
break;
|
||||
}
|
||||
case kmp_sched_distr_static_chunk_sched_static_chunkone: {
|
||||
entityId =
|
||||
GetNumberOfOmpThreads(tid, IsSPMDExecutionMode,
|
||||
IsRuntimeUninitialized) *
|
||||
GetOmpTeamId() +
|
||||
GetOmpThreadId(tid, IsSPMDExecutionMode, IsRuntimeUninitialized);
|
||||
numberOfEntities = GetNumberOfOmpTeams() *
|
||||
GetNumberOfOmpThreads(tid, IsSPMDExecutionMode,
|
||||
IsRuntimeUninitialized);
|
||||
ForStaticChunk(lastiter, lb, ub, stride, chunk, entityId,
|
||||
numberOfEntities);
|
||||
ForStaticChunk(lastiter, lb, ub, stride, chunk,
|
||||
numberOfActiveOMPThreads * GetOmpTeamId() + gtid,
|
||||
GetNumberOfOmpTeams() * numberOfActiveOMPThreads);
|
||||
break;
|
||||
}
|
||||
default: {
|
||||
ASSERT(LT_FUSSY, FALSE, "unknown schedtype %d", schedtype);
|
||||
PRINT(LD_LOOP, "unknown schedtype %d, revert back to static chunk\n",
|
||||
schedtype);
|
||||
entityId =
|
||||
GetOmpThreadId(tid, IsSPMDExecutionMode, IsRuntimeUninitialized);
|
||||
numberOfEntities = GetNumberOfOmpThreads(tid, IsSPMDExecutionMode,
|
||||
IsRuntimeUninitialized);
|
||||
ForStaticChunk(lastiter, lb, ub, stride, chunk, entityId,
|
||||
numberOfEntities);
|
||||
ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
|
||||
numberOfActiveOMPThreads);
|
||||
break;
|
||||
}
|
||||
}
|
||||
// copy back
|
||||
|
@ -221,13 +186,11 @@ public:
|
|||
*plower = lb;
|
||||
*pupper = ub;
|
||||
*pstride = stride;
|
||||
PRINT(
|
||||
LD_LOOP,
|
||||
"Got sched: Active %d, total %d: lb %lld, ub %lld, stride %lld, last "
|
||||
"%d\n",
|
||||
GetNumberOfOmpThreads(tid, IsSPMDExecutionMode, IsRuntimeUninitialized),
|
||||
GetNumberOfWorkersInTeam(), P64(*plower), P64(*pupper), P64(*pstride),
|
||||
lastiter);
|
||||
PRINT(LD_LOOP,
|
||||
"Got sched: Active %d, total %d: lb %lld, ub %lld, stride %lld, last "
|
||||
"%d\n",
|
||||
numberOfActiveOMPThreads, GetNumberOfWorkersInTeam(), P64(*plower),
|
||||
P64(*pupper), P64(*pstride), lastiter);
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -247,12 +210,8 @@ public:
|
|||
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(tid);
|
||||
T tnum = currTaskDescr->ThreadsInTeam();
|
||||
T tripCount = ub - lb + 1; // +1 because ub is inclusive
|
||||
ASSERT0(
|
||||
LT_FUSSY,
|
||||
GetOmpThreadId(tid, checkSPMDMode(loc), checkRuntimeUninitialized(loc)) <
|
||||
GetNumberOfOmpThreads(tid, checkSPMDMode(loc),
|
||||
checkRuntimeUninitialized(loc)),
|
||||
"current thread is not needed here; error");
|
||||
ASSERT0(LT_FUSSY, threadId < tnum,
|
||||
"current thread is not needed here; error");
|
||||
|
||||
/* Currently just ignore the monotonic and non-monotonic modifiers
|
||||
* (the compiler isn't producing them yet anyway).
|
||||
|
@ -320,10 +279,7 @@ public:
|
|||
// compute static chunk
|
||||
ST stride;
|
||||
int lastiter = 0;
|
||||
ForStaticChunk(
|
||||
lastiter, lb, ub, stride, chunk,
|
||||
GetOmpThreadId(tid, checkSPMDMode(loc),
|
||||
checkRuntimeUninitialized(loc)), tnum);
|
||||
ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
|
||||
// save computed params
|
||||
omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
|
||||
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
|
||||
|
@ -331,9 +287,7 @@ public:
|
|||
PRINT(LD_LOOP,
|
||||
"dispatch init (static chunk) : num threads = %d, ub = %" PRId64
|
||||
", next lower bound = %llu, stride = %llu\n",
|
||||
GetNumberOfOmpThreads(tid, checkSPMDMode(loc),
|
||||
checkRuntimeUninitialized(loc)),
|
||||
omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
|
||||
tnum, omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
|
||||
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
|
||||
omptarget_nvptx_threadPrivateContext->Stride(tid));
|
||||
} else if (schedule == kmp_sched_static_balanced_chunk) {
|
||||
|
@ -351,10 +305,7 @@ public:
|
|||
chunk = (span + chunk - 1) & ~(chunk - 1);
|
||||
|
||||
T oldUb = ub;
|
||||
ForStaticChunk(
|
||||
lastiter, lb, ub, stride, chunk,
|
||||
GetOmpThreadId(tid, checkSPMDMode(loc),
|
||||
checkRuntimeUninitialized(loc)), tnum);
|
||||
ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
|
||||
ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb.");
|
||||
if (ub > oldUb)
|
||||
ub = oldUb;
|
||||
|
@ -365,9 +316,7 @@ public:
|
|||
PRINT(LD_LOOP,
|
||||
"dispatch init (static chunk) : num threads = %d, ub = %" PRId64
|
||||
", next lower bound = %llu, stride = %llu\n",
|
||||
GetNumberOfOmpThreads(tid, checkSPMDMode(loc),
|
||||
checkRuntimeUninitialized(loc)),
|
||||
omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
|
||||
tnum, omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
|
||||
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
|
||||
omptarget_nvptx_threadPrivateContext->Stride(tid));
|
||||
} else if (schedule == kmp_sched_static_nochunk) {
|
||||
|
@ -379,10 +328,7 @@ public:
|
|||
// compute static chunk
|
||||
ST stride;
|
||||
int lastiter = 0;
|
||||
ForStaticNoChunk(
|
||||
lastiter, lb, ub, stride, chunk,
|
||||
GetOmpThreadId(tid, checkSPMDMode(loc),
|
||||
checkRuntimeUninitialized(loc)), tnum);
|
||||
ForStaticNoChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
|
||||
// save computed params
|
||||
omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
|
||||
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
|
||||
|
@ -390,9 +336,7 @@ public:
|
|||
PRINT(LD_LOOP,
|
||||
"dispatch init (static nochunk) : num threads = %d, ub = %" PRId64
|
||||
", next lower bound = %llu, stride = %llu\n",
|
||||
GetNumberOfOmpThreads(tid, checkSPMDMode(loc),
|
||||
checkRuntimeUninitialized(loc)),
|
||||
omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
|
||||
tnum, omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
|
||||
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
|
||||
omptarget_nvptx_threadPrivateContext->Stride(tid));
|
||||
|
||||
|
@ -412,9 +356,7 @@ public:
|
|||
PRINT(LD_LOOP,
|
||||
"dispatch init (dyn) : num threads = %d, lb = %llu, ub = %" PRId64
|
||||
", chunk %" PRIu64 "\n",
|
||||
GetNumberOfOmpThreads(tid, checkSPMDMode(loc),
|
||||
checkRuntimeUninitialized(loc)),
|
||||
omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId),
|
||||
tnum, omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId),
|
||||
omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId),
|
||||
omptarget_nvptx_threadPrivateContext->Chunk(teamId));
|
||||
}
|
||||
|
@ -460,19 +402,18 @@ public:
|
|||
// On Pascal, with inlining of the runtime into the user application,
|
||||
// this code deadlocks. This is probably because different threads
|
||||
// in a warp cannot make independent progress.
|
||||
NOINLINE static int dispatch_next(int32_t *plast, T *plower, T *pupper,
|
||||
ST *pstride) {
|
||||
NOINLINE static int dispatch_next(int32_t gtid, int32_t *plast, T *plower,
|
||||
T *pupper, ST *pstride) {
|
||||
ASSERT0(LT_FUSSY, isRuntimeInitialized(),
|
||||
"Expected non-SPMD mode + initialized runtime.");
|
||||
// ID of a thread in its own warp
|
||||
|
||||
// automatically selects thread or warp ID based on selected implementation
|
||||
int tid = GetLogicalThreadIdInBlock();
|
||||
ASSERT0(
|
||||
LT_FUSSY,
|
||||
GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized()) <
|
||||
GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()),
|
||||
"current thread is not needed here; error");
|
||||
ASSERT0(LT_FUSSY,
|
||||
gtid < GetNumberOfOmpThreads(tid, isSPMDMode(),
|
||||
isRuntimeUninitialized()),
|
||||
"current thread is not needed here; error");
|
||||
// retrieve schedule
|
||||
kmp_sched_t schedule =
|
||||
omptarget_nvptx_threadPrivateContext->ScheduleType(tid);
|
||||
|
@ -583,7 +524,7 @@ EXTERN int __kmpc_dispatch_next_4(kmp_Ident *loc, int32_t tid, int32_t *p_last,
|
|||
int32_t *p_lb, int32_t *p_ub, int32_t *p_st) {
|
||||
PRINT0(LD_IO, "call kmpc_dispatch_next_4\n");
|
||||
return omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_next(
|
||||
p_last, p_lb, p_ub, p_st);
|
||||
tid, p_last, p_lb, p_ub, p_st);
|
||||
}
|
||||
|
||||
EXTERN int __kmpc_dispatch_next_4u(kmp_Ident *loc, int32_t tid,
|
||||
|
@ -591,14 +532,14 @@ EXTERN int __kmpc_dispatch_next_4u(kmp_Ident *loc, int32_t tid,
|
|||
uint32_t *p_ub, int32_t *p_st) {
|
||||
PRINT0(LD_IO, "call kmpc_dispatch_next_4u\n");
|
||||
return omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_next(
|
||||
p_last, p_lb, p_ub, p_st);
|
||||
tid, p_last, p_lb, p_ub, p_st);
|
||||
}
|
||||
|
||||
EXTERN int __kmpc_dispatch_next_8(kmp_Ident *loc, int32_t tid, int32_t *p_last,
|
||||
int64_t *p_lb, int64_t *p_ub, int64_t *p_st) {
|
||||
PRINT0(LD_IO, "call kmpc_dispatch_next_8\n");
|
||||
return omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_next(
|
||||
p_last, p_lb, p_ub, p_st);
|
||||
tid, p_last, p_lb, p_ub, p_st);
|
||||
}
|
||||
|
||||
EXTERN int __kmpc_dispatch_next_8u(kmp_Ident *loc, int32_t tid,
|
||||
|
@ -606,7 +547,7 @@ EXTERN int __kmpc_dispatch_next_8u(kmp_Ident *loc, int32_t tid,
|
|||
uint64_t *p_ub, int64_t *p_st) {
|
||||
PRINT0(LD_IO, "call kmpc_dispatch_next_8u\n");
|
||||
return omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_next(
|
||||
p_last, p_lb, p_ub, p_st);
|
||||
tid, p_last, p_lb, p_ub, p_st);
|
||||
}
|
||||
|
||||
// fini
|
||||
|
@ -641,7 +582,7 @@ EXTERN void __kmpc_for_static_init_4(kmp_Ident *loc, int32_t global_tid,
|
|||
int32_t chunk) {
|
||||
PRINT0(LD_IO, "call kmpc_for_static_init_4\n");
|
||||
omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
|
||||
schedtype, plastiter, plower, pupper, pstride, chunk,
|
||||
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
|
||||
checkSPMDMode(loc), checkRuntimeUninitialized(loc));
|
||||
}
|
||||
|
||||
|
@ -652,7 +593,7 @@ EXTERN void __kmpc_for_static_init_4u(kmp_Ident *loc, int32_t global_tid,
|
|||
int32_t chunk) {
|
||||
PRINT0(LD_IO, "call kmpc_for_static_init_4u\n");
|
||||
omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
|
||||
schedtype, plastiter, plower, pupper, pstride, chunk,
|
||||
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
|
||||
checkSPMDMode(loc), checkRuntimeUninitialized(loc));
|
||||
}
|
||||
|
||||
|
@ -663,7 +604,7 @@ EXTERN void __kmpc_for_static_init_8(kmp_Ident *loc, int32_t global_tid,
|
|||
int64_t chunk) {
|
||||
PRINT0(LD_IO, "call kmpc_for_static_init_8\n");
|
||||
omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
|
||||
schedtype, plastiter, plower, pupper, pstride, chunk,
|
||||
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
|
||||
checkSPMDMode(loc), checkRuntimeUninitialized(loc));
|
||||
}
|
||||
|
||||
|
@ -674,7 +615,7 @@ EXTERN void __kmpc_for_static_init_8u(kmp_Ident *loc, int32_t global_tid,
|
|||
int64_t chunk) {
|
||||
PRINT0(LD_IO, "call kmpc_for_static_init_8u\n");
|
||||
omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
|
||||
schedtype, plastiter, plower, pupper, pstride, chunk,
|
||||
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
|
||||
checkSPMDMode(loc), checkRuntimeUninitialized(loc));
|
||||
}
|
||||
|
||||
|
@ -686,9 +627,8 @@ void __kmpc_for_static_init_4_simple_spmd(kmp_Ident *loc, int32_t global_tid,
|
|||
int32_t chunk) {
|
||||
PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_spmd\n");
|
||||
omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
|
||||
schedtype, plastiter, plower, pupper, pstride, chunk,
|
||||
/*IsSPMDExecutionMode=*/true,
|
||||
/*IsRuntimeUninitialized=*/true);
|
||||
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
|
||||
/*IsSPMDExecutionMode=*/true, /*IsRuntimeUninitialized=*/true);
|
||||
}
|
||||
|
||||
EXTERN
|
||||
|
@ -699,9 +639,8 @@ void __kmpc_for_static_init_4u_simple_spmd(kmp_Ident *loc, int32_t global_tid,
|
|||
int32_t incr, int32_t chunk) {
|
||||
PRINT0(LD_IO, "call kmpc_for_static_init_4u_simple_spmd\n");
|
||||
omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
|
||||
schedtype, plastiter, plower, pupper, pstride, chunk,
|
||||
/*IsSPMDExecutionMode=*/true,
|
||||
/*IsRuntimeUninitialized=*/true);
|
||||
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
|
||||
/*IsSPMDExecutionMode=*/true, /*IsRuntimeUninitialized=*/true);
|
||||
}
|
||||
|
||||
EXTERN
|
||||
|
@ -712,9 +651,8 @@ void __kmpc_for_static_init_8_simple_spmd(kmp_Ident *loc, int32_t global_tid,
|
|||
int64_t chunk) {
|
||||
PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_spmd\n");
|
||||
omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
|
||||
schedtype, plastiter, plower, pupper, pstride, chunk,
|
||||
/*IsSPMDExecutionMode=*/true,
|
||||
/*IsRuntimeUninitialized=*/true);
|
||||
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
|
||||
/*IsSPMDExecutionMode=*/true, /*IsRuntimeUninitialized=*/true);
|
||||
}
|
||||
|
||||
EXTERN
|
||||
|
@ -725,9 +663,8 @@ void __kmpc_for_static_init_8u_simple_spmd(kmp_Ident *loc, int32_t global_tid,
|
|||
int64_t incr, int64_t chunk) {
|
||||
PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_spmd\n");
|
||||
omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
|
||||
schedtype, plastiter, plower, pupper, pstride, chunk,
|
||||
/*IsSPMDExecutionMode=*/true,
|
||||
/*IsRuntimeUninitialized=*/true);
|
||||
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
|
||||
/*IsSPMDExecutionMode=*/true, /*IsRuntimeUninitialized=*/true);
|
||||
}
|
||||
|
||||
EXTERN
|
||||
|
@ -737,9 +674,8 @@ void __kmpc_for_static_init_4_simple_generic(
|
|||
int32_t chunk) {
|
||||
PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_generic\n");
|
||||
omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
|
||||
schedtype, plastiter, plower, pupper, pstride, chunk,
|
||||
/*IsSPMDExecutionMode=*/false,
|
||||
/*IsRuntimeUninitialized=*/true);
|
||||
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
|
||||
/*IsSPMDExecutionMode=*/false, /*IsRuntimeUninitialized=*/true);
|
||||
}
|
||||
|
||||
EXTERN
|
||||
|
@ -749,9 +685,8 @@ void __kmpc_for_static_init_4u_simple_generic(
|
|||
int32_t chunk) {
|
||||
PRINT0(LD_IO, "call kmpc_for_static_init_4u_simple_generic\n");
|
||||
omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
|
||||
schedtype, plastiter, plower, pupper, pstride, chunk,
|
||||
/*IsSPMDExecutionMode=*/false,
|
||||
/*IsRuntimeUninitialized=*/true);
|
||||
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
|
||||
/*IsSPMDExecutionMode=*/false, /*IsRuntimeUninitialized=*/true);
|
||||
}
|
||||
|
||||
EXTERN
|
||||
|
@ -761,9 +696,8 @@ void __kmpc_for_static_init_8_simple_generic(
|
|||
int64_t chunk) {
|
||||
PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_generic\n");
|
||||
omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
|
||||
schedtype, plastiter, plower, pupper, pstride, chunk,
|
||||
/*IsSPMDExecutionMode=*/false,
|
||||
/*IsRuntimeUninitialized=*/true);
|
||||
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
|
||||
/*IsSPMDExecutionMode=*/false, /*IsRuntimeUninitialized=*/true);
|
||||
}
|
||||
|
||||
EXTERN
|
||||
|
@ -773,9 +707,8 @@ void __kmpc_for_static_init_8u_simple_generic(
|
|||
int64_t chunk) {
|
||||
PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_generic\n");
|
||||
omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
|
||||
schedtype, plastiter, plower, pupper, pstride, chunk,
|
||||
/*IsSPMDExecutionMode=*/false,
|
||||
/*IsRuntimeUninitialized=*/true);
|
||||
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
|
||||
/*IsSPMDExecutionMode=*/false, /*IsRuntimeUninitialized=*/true);
|
||||
}
|
||||
|
||||
EXTERN void __kmpc_for_static_fini(kmp_Ident *loc, int32_t global_tid) {
|
||||
|
@ -807,15 +740,13 @@ EXTERN void __kmpc_reduce_conditional_lastprivate(kmp_Ident *loc, int32_t gtid,
|
|||
"Expected non-SPMD mode + initialized runtime.");
|
||||
|
||||
omptarget_nvptx_TeamDescr &teamDescr = getMyTeamDescriptor();
|
||||
int tid = GetOmpThreadId(GetLogicalThreadIdInBlock(), checkSPMDMode(loc),
|
||||
checkRuntimeUninitialized(loc));
|
||||
uint32_t NumThreads = GetNumberOfOmpThreads(
|
||||
GetLogicalThreadIdInBlock(), checkSPMDMode(loc),
|
||||
checkRuntimeUninitialized(loc));
|
||||
int tid = GetLogicalThreadIdInBlock();
|
||||
uint32_t NumThreads = GetNumberOfOmpThreads(tid, checkSPMDMode(loc),
|
||||
checkRuntimeUninitialized(loc));
|
||||
uint64_t *Buffer = teamDescr.getLastprivateIterBuffer();
|
||||
for (unsigned i = 0; i < varNum; i++) {
|
||||
// Reset buffer.
|
||||
if (tid == 0)
|
||||
if (gtid == 0)
|
||||
*Buffer = 0; // Reset to minimum loop iteration value.
|
||||
|
||||
// Barrier.
|
||||
|
|
|
@ -418,7 +418,9 @@ EXTERN uint16_t __kmpc_parallel_level(kmp_Ident *loc, uint32_t global_tid) {
|
|||
// it's cheap to recalculate this value so we never use the result
|
||||
// of this call.
|
||||
EXTERN int32_t __kmpc_global_thread_num(kmp_Ident *loc) {
|
||||
return GetLogicalThreadIdInBlock();
|
||||
int tid = GetLogicalThreadIdInBlock();
|
||||
return GetOmpThreadId(tid, checkSPMDMode(loc),
|
||||
checkRuntimeUninitialized(loc));
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
|
|
@ -232,8 +232,7 @@ int32_t nvptx_parallel_reduce_nowait(int32_t global_tid, int32_t num_vars,
|
|||
|
||||
// Get the OMP thread Id. This is different from BlockThreadId in the case of
|
||||
// an L2 parallel region.
|
||||
return GetOmpThreadId(BlockThreadId, isSPMDExecutionMode,
|
||||
isRuntimeUninitialized) == 0;
|
||||
return global_tid == 0;
|
||||
#endif // __CUDA_ARCH__ >= 700
|
||||
}
|
||||
|
||||
|
|
|
@ -99,21 +99,14 @@ EXTERN void __kmpc_barrier_simple_generic(kmp_Ident *loc_ref, int32_t tid) {
|
|||
// KMP MASTER
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
INLINE int32_t IsMaster() {
|
||||
// only the team master updates the state
|
||||
int tid = GetLogicalThreadIdInBlock();
|
||||
int ompThreadId = GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized());
|
||||
return IsTeamMaster(ompThreadId);
|
||||
}
|
||||
|
||||
EXTERN int32_t __kmpc_master(kmp_Ident *loc, int32_t global_tid) {
|
||||
PRINT0(LD_IO, "call kmpc_master\n");
|
||||
return IsMaster();
|
||||
return IsTeamMaster(global_tid);
|
||||
}
|
||||
|
||||
EXTERN void __kmpc_end_master(kmp_Ident *loc, int32_t global_tid) {
|
||||
PRINT0(LD_IO, "call kmpc_end_master\n");
|
||||
ASSERT0(LT_FUSSY, IsMaster(), "expected only master here");
|
||||
ASSERT0(LT_FUSSY, IsTeamMaster(global_tid), "expected only master here");
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -123,13 +116,13 @@ EXTERN void __kmpc_end_master(kmp_Ident *loc, int32_t global_tid) {
|
|||
EXTERN int32_t __kmpc_single(kmp_Ident *loc, int32_t global_tid) {
|
||||
PRINT0(LD_IO, "call kmpc_single\n");
|
||||
// decide to implement single with master; master get the single
|
||||
return IsMaster();
|
||||
return IsTeamMaster(global_tid);
|
||||
}
|
||||
|
||||
EXTERN void __kmpc_end_single(kmp_Ident *loc, int32_t global_tid) {
|
||||
PRINT0(LD_IO, "call kmpc_end_single\n");
|
||||
// decide to implement single with master: master get the single
|
||||
ASSERT0(LT_FUSSY, IsMaster(), "expected only master here");
|
||||
ASSERT0(LT_FUSSY, IsTeamMaster(global_tid), "expected only master here");
|
||||
// sync barrier is explicitly called... so that is not a problem
|
||||
}
|
||||
|
||||
|
|
|
@ -81,7 +81,8 @@ EXTERN int32_t __kmpc_omp_task_with_deps(kmp_Ident *loc, uint32_t global_tid,
|
|||
void *noAliasDepList) {
|
||||
PRINT(LD_IO, "call to __kmpc_omp_task_with_deps(task 0x%llx)\n",
|
||||
P64(newKmpTaskDescr));
|
||||
ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), "Runtime must be initialized.");
|
||||
ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc),
|
||||
"Runtime must be initialized.");
|
||||
// 1. get explicit task descr from kmp task descr
|
||||
omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr =
|
||||
(omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES(
|
||||
|
@ -118,7 +119,8 @@ EXTERN void __kmpc_omp_task_begin_if0(kmp_Ident *loc, uint32_t global_tid,
|
|||
kmp_TaskDescr *newKmpTaskDescr) {
|
||||
PRINT(LD_IO, "call to __kmpc_omp_task_begin_if0(task 0x%llx)\n",
|
||||
P64(newKmpTaskDescr));
|
||||
ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), "Runtime must be initialized.");
|
||||
ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc),
|
||||
"Runtime must be initialized.");
|
||||
// 1. get explicit task descr from kmp task descr
|
||||
omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr =
|
||||
(omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES(
|
||||
|
@ -143,7 +145,8 @@ EXTERN void __kmpc_omp_task_complete_if0(kmp_Ident *loc, uint32_t global_tid,
|
|||
kmp_TaskDescr *newKmpTaskDescr) {
|
||||
PRINT(LD_IO, "call to __kmpc_omp_task_complete_if0(task 0x%llx)\n",
|
||||
P64(newKmpTaskDescr));
|
||||
ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), "Runtime must be initialized.");
|
||||
ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc),
|
||||
"Runtime must be initialized.");
|
||||
// 1. get explicit task descr from kmp task descr
|
||||
omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr =
|
||||
(omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES(
|
||||
|
|
Loading…
Reference in New Issue