forked from OSchip/llvm-project
[OPENMP][NVPTX]Correct type casting for printf args + simplified shfl64 function.
Summary: Explicitly casted printf's args to the required types + simplified shfl64 function. Reviewers: gtbercea, kkwli0 Subscribers: guansong, jfb, caomhin, openmp-commits Differential Revision: https://reviews.llvm.org/D55379 llvm-svn: 348521
This commit is contained in:
parent
bf3f90c34f
commit
653e8ba79a
|
@ -15,14 +15,14 @@
|
|||
|
||||
EXTERN int32_t __kmpc_cancellationpoint(kmp_Ident *loc, int32_t global_tid,
|
||||
int32_t cancelVal) {
|
||||
PRINT(LD_IO, "call kmpc_cancellationpoint(cancel val %d)\n", cancelVal);
|
||||
PRINT(LD_IO, "call kmpc_cancellationpoint(cancel val %d)\n", (int)cancelVal);
|
||||
// disabled
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
EXTERN int32_t __kmpc_cancel(kmp_Ident *loc, int32_t global_tid,
|
||||
int32_t cancelVal) {
|
||||
PRINT(LD_IO, "call kmpc_cancel(cancel val %d)\n", cancelVal);
|
||||
PRINT(LD_IO, "call kmpc_cancel(cancel val %d)\n", (int)cancelVal);
|
||||
// disabled
|
||||
return FALSE;
|
||||
}
|
||||
|
|
|
@ -84,7 +84,7 @@ __kmpc_initialize_data_sharing_environment(__kmpc_data_sharing_slot *rootS,
|
|||
"Entering __kmpc_initialize_data_sharing_environment\n");
|
||||
|
||||
unsigned WID = getWarpId();
|
||||
DSPRINT(DSFLAG_INIT, "Warp ID: %d\n", WID);
|
||||
DSPRINT(DSFLAG_INIT, "Warp ID: %u\n", WID);
|
||||
|
||||
omptarget_nvptx_TeamDescr *teamDescr =
|
||||
&omptarget_nvptx_threadPrivateContext->TeamContext();
|
||||
|
@ -95,15 +95,16 @@ __kmpc_initialize_data_sharing_environment(__kmpc_data_sharing_slot *rootS,
|
|||
|
||||
// We don't need to initialize the frame and active threads.
|
||||
|
||||
DSPRINT(DSFLAG_INIT, "Initial data size: %08x \n", InitialDataSize);
|
||||
DSPRINT(DSFLAG_INIT, "Root slot at: %016llx \n", (long long)RootS);
|
||||
DSPRINT(DSFLAG_INIT, "Initial data size: %08x \n", (unsigned)InitialDataSize);
|
||||
DSPRINT(DSFLAG_INIT, "Root slot at: %016llx \n", (unsigned long long)RootS);
|
||||
DSPRINT(DSFLAG_INIT, "Root slot data-end at: %016llx \n",
|
||||
(long long)RootS->DataEnd);
|
||||
DSPRINT(DSFLAG_INIT, "Root slot next at: %016llx \n", (long long)RootS->Next);
|
||||
(unsigned long long)RootS->DataEnd);
|
||||
DSPRINT(DSFLAG_INIT, "Root slot next at: %016llx \n",
|
||||
(unsigned long long)RootS->Next);
|
||||
DSPRINT(DSFLAG_INIT, "Shared slot ptr at: %016llx \n",
|
||||
(long long)DataSharingState.SlotPtr[WID]);
|
||||
(unsigned long long)DataSharingState.SlotPtr[WID]);
|
||||
DSPRINT(DSFLAG_INIT, "Shared stack ptr at: %016llx \n",
|
||||
(long long)DataSharingState.StackPtr[WID]);
|
||||
(unsigned long long)DataSharingState.StackPtr[WID]);
|
||||
|
||||
DSPRINT0(DSFLAG_INIT, "Exiting __kmpc_initialize_data_sharing_environment\n");
|
||||
}
|
||||
|
@ -121,8 +122,9 @@ EXTERN void *__kmpc_data_sharing_environment_begin(
|
|||
if (!IsOMPRuntimeInitialized)
|
||||
return (void *)&DataSharingState;
|
||||
|
||||
DSPRINT(DSFLAG, "Data Size %016llx\n", SharingDataSize);
|
||||
DSPRINT(DSFLAG, "Default Data Size %016llx\n", SharingDefaultDataSize);
|
||||
DSPRINT(DSFLAG, "Data Size %016llx\n", (unsigned long long)SharingDataSize);
|
||||
DSPRINT(DSFLAG, "Default Data Size %016llx\n",
|
||||
(unsigned long long)SharingDefaultDataSize);
|
||||
|
||||
unsigned WID = getWarpId();
|
||||
unsigned CurActiveThreads = getActiveThreadsMask();
|
||||
|
@ -139,11 +141,11 @@ EXTERN void *__kmpc_data_sharing_environment_begin(
|
|||
*SavedSharedFrame = FrameP;
|
||||
*SavedActiveThreads = ActiveT;
|
||||
|
||||
DSPRINT(DSFLAG, "Warp ID: %d\n", WID);
|
||||
DSPRINT(DSFLAG, "Saved slot ptr at: %016llx \n", (long long)SlotP);
|
||||
DSPRINT(DSFLAG, "Saved stack ptr at: %016llx \n", (long long)StackP);
|
||||
DSPRINT(DSFLAG, "Warp ID: %u\n", WID);
|
||||
DSPRINT(DSFLAG, "Saved slot ptr at: %016llx \n", (unsigned long long)SlotP);
|
||||
DSPRINT(DSFLAG, "Saved stack ptr at: %016llx \n", (unsigned long long)StackP);
|
||||
DSPRINT(DSFLAG, "Saved frame ptr at: %016llx \n", (long long)FrameP);
|
||||
DSPRINT(DSFLAG, "Active threads: %08x \n", ActiveT);
|
||||
DSPRINT(DSFLAG, "Active threads: %08x \n", (unsigned)ActiveT);
|
||||
|
||||
// Only the warp active master needs to grow the stack.
|
||||
if (IsWarpMasterActiveThread()) {
|
||||
|
@ -161,12 +163,16 @@ EXTERN void *__kmpc_data_sharing_environment_begin(
|
|||
const uintptr_t RequiredEndAddress =
|
||||
CurrentStartAddress + (uintptr_t)SharingDataSize;
|
||||
|
||||
DSPRINT(DSFLAG, "Data Size %016llx\n", SharingDataSize);
|
||||
DSPRINT(DSFLAG, "Default Data Size %016llx\n", SharingDefaultDataSize);
|
||||
DSPRINT(DSFLAG, "Current Start Address %016llx\n", CurrentStartAddress);
|
||||
DSPRINT(DSFLAG, "Current End Address %016llx\n", CurrentEndAddress);
|
||||
DSPRINT(DSFLAG, "Required End Address %016llx\n", RequiredEndAddress);
|
||||
DSPRINT(DSFLAG, "Active Threads %08x\n", ActiveT);
|
||||
DSPRINT(DSFLAG, "Data Size %016llx\n", (unsigned long long)SharingDataSize);
|
||||
DSPRINT(DSFLAG, "Default Data Size %016llx\n",
|
||||
(unsigned long long)SharingDefaultDataSize);
|
||||
DSPRINT(DSFLAG, "Current Start Address %016llx\n",
|
||||
(unsigned long long)CurrentStartAddress);
|
||||
DSPRINT(DSFLAG, "Current End Address %016llx\n",
|
||||
(unsigned long long)CurrentEndAddress);
|
||||
DSPRINT(DSFLAG, "Required End Address %016llx\n",
|
||||
(unsigned long long)RequiredEndAddress);
|
||||
DSPRINT(DSFLAG, "Active Threads %08x\n", (unsigned)ActiveT);
|
||||
|
||||
// If we require a new slot, allocate it and initialize it (or attempt to
|
||||
// reuse one). Also, set the shared stack and slot pointers to the new
|
||||
|
@ -184,11 +190,11 @@ EXTERN void *__kmpc_data_sharing_environment_begin(
|
|||
(uintptr_t)(&ExistingSlot->Data[0]);
|
||||
if (ExistingSlotSize >= NewSize) {
|
||||
DSPRINT(DSFLAG, "Reusing stack slot %016llx\n",
|
||||
(long long)ExistingSlot);
|
||||
(unsigned long long)ExistingSlot);
|
||||
NewSlot = ExistingSlot;
|
||||
} else {
|
||||
DSPRINT(DSFLAG, "Cleaning up -failed reuse - %016llx\n",
|
||||
(long long)SlotP->Next);
|
||||
(unsigned long long)SlotP->Next);
|
||||
free(ExistingSlot);
|
||||
}
|
||||
}
|
||||
|
@ -197,7 +203,7 @@ EXTERN void *__kmpc_data_sharing_environment_begin(
|
|||
NewSlot = (__kmpc_data_sharing_slot *)malloc(
|
||||
sizeof(__kmpc_data_sharing_slot) + NewSize);
|
||||
DSPRINT(DSFLAG, "New slot allocated %016llx (data size=%016llx)\n",
|
||||
(long long)NewSlot, NewSize);
|
||||
(unsigned long long)NewSlot, NewSize);
|
||||
}
|
||||
|
||||
NewSlot->Next = 0;
|
||||
|
@ -213,7 +219,7 @@ EXTERN void *__kmpc_data_sharing_environment_begin(
|
|||
// not eliminate them because that may be used to return data.
|
||||
if (SlotP->Next) {
|
||||
DSPRINT(DSFLAG, "Cleaning up - old not required - %016llx\n",
|
||||
(long long)SlotP->Next);
|
||||
(unsigned long long)SlotP->Next);
|
||||
free(SlotP->Next);
|
||||
SlotP->Next = 0;
|
||||
}
|
||||
|
@ -275,8 +281,8 @@ EXTERN void __kmpc_data_sharing_environment_end(
|
|||
// have other threads that will return after the current ones.
|
||||
ActiveT &= ~CurActive;
|
||||
|
||||
DSPRINT(DSFLAG, "Active threads: %08x; New mask: %08x\n", CurActive,
|
||||
ActiveT);
|
||||
DSPRINT(DSFLAG, "Active threads: %08x; New mask: %08x\n",
|
||||
(unsigned)CurActive, (unsigned)ActiveT);
|
||||
|
||||
if (!ActiveT) {
|
||||
// No other active threads? Great, lets restore the stack.
|
||||
|
@ -290,10 +296,13 @@ EXTERN void __kmpc_data_sharing_environment_end(
|
|||
FrameP = *SavedSharedFrame;
|
||||
ActiveT = *SavedActiveThreads;
|
||||
|
||||
DSPRINT(DSFLAG, "Restored slot ptr at: %016llx \n", (long long)SlotP);
|
||||
DSPRINT(DSFLAG, "Restored stack ptr at: %016llx \n", (long long)StackP);
|
||||
DSPRINT(DSFLAG, "Restored frame ptr at: %016llx \n", (long long)FrameP);
|
||||
DSPRINT(DSFLAG, "Active threads: %08x \n", ActiveT);
|
||||
DSPRINT(DSFLAG, "Restored slot ptr at: %016llx \n",
|
||||
(unsigned long long)SlotP);
|
||||
DSPRINT(DSFLAG, "Restored stack ptr at: %016llx \n",
|
||||
(unsigned long long)StackP);
|
||||
DSPRINT(DSFLAG, "Restored frame ptr at: %016llx \n",
|
||||
(unsigned long long)FrameP);
|
||||
DSPRINT(DSFLAG, "Active threads: %08x \n", (unsigned)ActiveT);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -319,7 +328,7 @@ __kmpc_get_data_sharing_environment_frame(int32_t SourceThreadID,
|
|||
|
||||
unsigned SourceWID = SourceThreadID / WARPSIZE;
|
||||
|
||||
DSPRINT(DSFLAG, "Source warp: %d\n", SourceWID);
|
||||
DSPRINT(DSFLAG, "Source warp: %u\n", SourceWID);
|
||||
|
||||
void * volatile P = DataSharingState.FramePtr[SourceWID];
|
||||
DSPRINT0(DSFLAG, "Exiting __kmpc_get_data_sharing_environment_frame\n");
|
||||
|
|
|
@ -164,16 +164,18 @@
|
|||
#define PRINT0(_flag, _str) \
|
||||
{ \
|
||||
if (omptarget_device_environment.debug_level && DON(_flag)) { \
|
||||
printf("<b %2d, t %4d, w %2d, l %2d>: " _str, blockIdx.x, threadIdx.x, \
|
||||
threadIdx.x / WARPSIZE, threadIdx.x & 0x1F); \
|
||||
printf("<b %2d, t %4d, w %2d, l %2d>: " _str, (int)blockIdx.x, \
|
||||
(int)threadIdx.x, (int)(threadIdx.x / WARPSIZE), \
|
||||
(int)(threadIdx.x & 0x1F)); \
|
||||
} \
|
||||
}
|
||||
|
||||
#define PRINT(_flag, _str, _args...) \
|
||||
{ \
|
||||
if (omptarget_device_environment.debug_level && DON(_flag)) { \
|
||||
printf("<b %2d, t %4d, w %2d, l %2d>: " _str, blockIdx.x, threadIdx.x, \
|
||||
threadIdx.x / WARPSIZE, threadIdx.x & 0x1F, _args); \
|
||||
printf("<b %2d, t %4d, w %2d, l %2d>: " _str, (int)blockIdx.x, \
|
||||
(int)threadIdx.x, (int)(threadIdx.x / WARPSIZE), \
|
||||
(int)(threadIdx.x & 0x1F), _args); \
|
||||
} \
|
||||
}
|
||||
#else
|
||||
|
@ -217,16 +219,18 @@
|
|||
#define ASSERT0(_flag, _cond, _str) \
|
||||
{ \
|
||||
if (TON(_flag) && !(_cond)) { \
|
||||
printf("<b %3d, t %4d, w %2d, l %2d> ASSERT: " _str "\n", blockIdx.x, \
|
||||
threadIdx.x, threadIdx.x / WARPSIZE, threadIdx.x & 0x1F); \
|
||||
printf("<b %3d, t %4d, w %2d, l %2d> ASSERT: " _str "\n", \
|
||||
(int)blockIdx.x, (int)threadIdx.x, (int)(threadIdx.x / WARPSIZE), \
|
||||
(int)(threadIdx.x & 0x1F)); \
|
||||
assert(_cond); \
|
||||
} \
|
||||
}
|
||||
#define ASSERT(_flag, _cond, _str, _args...) \
|
||||
{ \
|
||||
if (TON(_flag) && !(_cond)) { \
|
||||
printf("<b %3d, t %4d, w %2d, l %d2> ASSERT: " _str "\n", blockIdx.x, \
|
||||
threadIdx.x, threadIdx.x / WARPSIZE, threadIdx.x & 0x1F, _args); \
|
||||
printf("<b %3d, t %4d, w %2d, l %d2> ASSERT: " _str "\n", \
|
||||
(int)blockIdx.x, (int)threadIdx.x, (int)(threadIdx.x / WARPSIZE), \
|
||||
(int)(threadIdx.x & 0x1F), _args); \
|
||||
assert(_cond); \
|
||||
} \
|
||||
}
|
||||
|
@ -253,15 +257,17 @@
|
|||
#define WARNING0(_flag, _str) \
|
||||
{ \
|
||||
if (WON(_flag)) { \
|
||||
printf("<b %2d, t %4d, w %2d, l %2d> WARNING: " _str, blockIdx.x, \
|
||||
threadIdx.x, threadIdx.x / WARPSIZE, threadIdx.x & 0x1F); \
|
||||
printf("<b %2d, t %4d, w %2d, l %2d> WARNING: " _str, (int)blockIdx.x, \
|
||||
(int)threadIdx.x, (int)(threadIdx.x / WARPSIZE), \
|
||||
(int)(threadIdx.x & 0x1F)); \
|
||||
} \
|
||||
}
|
||||
#define WARNING(_flag, _str, _args...) \
|
||||
{ \
|
||||
if (WON(_flag)) { \
|
||||
printf("<b %2d, t %4d, w %2d, l %2d> WARNING: " _str, blockIdx.x, \
|
||||
threadIdx.x, threadIdx.x / WARPSIZE, threadIdx.x & 0x1F, _args); \
|
||||
printf("<b %2d, t %4d, w %2d, l %2d> WARNING: " _str, (int)blockIdx.x, \
|
||||
(int)threadIdx.x, (int)(threadIdx.x / WARPSIZE), \
|
||||
(int)(threadIdx.x & 0x1F), _args); \
|
||||
} \
|
||||
}
|
||||
|
||||
|
|
|
@ -222,9 +222,11 @@ EXTERN int omp_get_ancestor_thread_num(int level) {
|
|||
" chunk %" PRIu64 "; tid %d, tnum %d, nthreads %d\n",
|
||||
"ancestor", steps,
|
||||
(currTaskDescr->IsParallelConstruct() ? "par" : "task"),
|
||||
currTaskDescr->InParallelRegion(), sched,
|
||||
currTaskDescr->RuntimeChunkSize(), currTaskDescr->ThreadId(),
|
||||
currTaskDescr->ThreadsInTeam(), currTaskDescr->NThreads());
|
||||
(int)currTaskDescr->InParallelRegion(), (int)sched,
|
||||
currTaskDescr->RuntimeChunkSize(),
|
||||
(int)currTaskDescr->ThreadId(),
|
||||
(int)currTaskDescr->ThreadsInTeam(),
|
||||
(int)currTaskDescr->NThreads());
|
||||
}
|
||||
|
||||
if (currTaskDescr->IsParallelConstruct()) {
|
||||
|
|
|
@ -113,7 +113,8 @@ public:
|
|||
PRINT(LD_LOOP,
|
||||
"OMP Thread %d: schedule type %d, chunk size = %lld, mytid "
|
||||
"%d, num tids %d\n",
|
||||
gtid, schedtype, P64(chunk), gtid, numberOfActiveOMPThreads);
|
||||
(int)gtid, (int)schedtype, (long long)chunk, (int)gtid,
|
||||
(int)numberOfActiveOMPThreads);
|
||||
ASSERT0(LT_FUSSY, gtid < numberOfActiveOMPThreads,
|
||||
"current thread is not needed here; error");
|
||||
|
||||
|
@ -173,9 +174,9 @@ public:
|
|||
break;
|
||||
}
|
||||
default: {
|
||||
ASSERT(LT_FUSSY, FALSE, "unknown schedtype %d", schedtype);
|
||||
ASSERT(LT_FUSSY, FALSE, "unknown schedtype %d", (int)schedtype);
|
||||
PRINT(LD_LOOP, "unknown schedtype %d, revert back to static chunk\n",
|
||||
schedtype);
|
||||
(int)schedtype);
|
||||
ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
|
||||
numberOfActiveOMPThreads);
|
||||
break;
|
||||
|
@ -189,8 +190,9 @@ public:
|
|||
PRINT(LD_LOOP,
|
||||
"Got sched: Active %d, total %d: lb %lld, ub %lld, stride %lld, last "
|
||||
"%d\n",
|
||||
numberOfActiveOMPThreads, GetNumberOfWorkersInTeam(), P64(*plower),
|
||||
P64(*pupper), P64(*pstride), lastiter);
|
||||
(int)numberOfActiveOMPThreads, (int)GetNumberOfWorkersInTeam(),
|
||||
(long long)(*plower), (long long)(*pupper), (long long)(*pstride),
|
||||
(int)lastiter);
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -229,7 +231,7 @@ public:
|
|||
__kmpc_barrier(loc, threadId);
|
||||
PRINT(LD_LOOP,
|
||||
"go sequential as tnum=%ld, trip count %lld, ordered sched=%d\n",
|
||||
(long)tnum, P64(tripCount), schedule);
|
||||
(long)tnum, (long long)tripCount, (int)schedule);
|
||||
schedule = kmp_sched_static_chunk;
|
||||
chunk = tripCount; // one thread gets the whole loop
|
||||
} else if (schedule == kmp_sched_runtime) {
|
||||
|
@ -255,18 +257,20 @@ public:
|
|||
break;
|
||||
}
|
||||
}
|
||||
PRINT(LD_LOOP, "Runtime sched is %d with chunk %lld\n", schedule,
|
||||
P64(chunk));
|
||||
PRINT(LD_LOOP, "Runtime sched is %d with chunk %lld\n", (int)schedule,
|
||||
(long long)chunk);
|
||||
} else if (schedule == kmp_sched_auto) {
|
||||
schedule = kmp_sched_static_chunk;
|
||||
chunk = 1;
|
||||
PRINT(LD_LOOP, "Auto sched is %d with chunk %lld\n", schedule,
|
||||
P64(chunk));
|
||||
PRINT(LD_LOOP, "Auto sched is %d with chunk %lld\n", (int)schedule,
|
||||
(long long)chunk);
|
||||
} else {
|
||||
PRINT(LD_LOOP, "Dyn sched is %d with chunk %lld\n", schedule, P64(chunk));
|
||||
PRINT(LD_LOOP, "Dyn sched is %d with chunk %lld\n", (int)schedule,
|
||||
(long long)chunk);
|
||||
ASSERT(LT_FUSSY,
|
||||
schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
|
||||
"unknown schedule %d & chunk %lld\n", schedule, P64(chunk));
|
||||
"unknown schedule %d & chunk %lld\n", (int)schedule,
|
||||
(long long)chunk);
|
||||
}
|
||||
|
||||
// init schedules
|
||||
|
@ -287,9 +291,12 @@ public:
|
|||
PRINT(LD_LOOP,
|
||||
"dispatch init (static chunk) : num threads = %d, ub = %" PRId64
|
||||
", next lower bound = %llu, stride = %llu\n",
|
||||
tnum, omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
|
||||
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
|
||||
omptarget_nvptx_threadPrivateContext->Stride(tid));
|
||||
(int)tnum,
|
||||
omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
|
||||
(unsigned long long)
|
||||
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
|
||||
(unsigned long long)omptarget_nvptx_threadPrivateContext->Stride(
|
||||
tid));
|
||||
} else if (schedule == kmp_sched_static_balanced_chunk) {
|
||||
ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
|
||||
// save sched state
|
||||
|
@ -316,9 +323,12 @@ public:
|
|||
PRINT(LD_LOOP,
|
||||
"dispatch init (static chunk) : num threads = %d, ub = %" PRId64
|
||||
", next lower bound = %llu, stride = %llu\n",
|
||||
tnum, omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
|
||||
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
|
||||
omptarget_nvptx_threadPrivateContext->Stride(tid));
|
||||
(int)tnum,
|
||||
omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
|
||||
(unsigned long long)
|
||||
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
|
||||
(unsigned long long)omptarget_nvptx_threadPrivateContext->Stride(
|
||||
tid));
|
||||
} else if (schedule == kmp_sched_static_nochunk) {
|
||||
ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value");
|
||||
// save sched state
|
||||
|
@ -336,9 +346,12 @@ public:
|
|||
PRINT(LD_LOOP,
|
||||
"dispatch init (static nochunk) : num threads = %d, ub = %" PRId64
|
||||
", next lower bound = %llu, stride = %llu\n",
|
||||
tnum, omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
|
||||
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
|
||||
omptarget_nvptx_threadPrivateContext->Stride(tid));
|
||||
(int)tnum,
|
||||
omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
|
||||
(unsigned long long)
|
||||
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
|
||||
(unsigned long long)omptarget_nvptx_threadPrivateContext->Stride(
|
||||
tid));
|
||||
|
||||
} else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) {
|
||||
__kmpc_barrier(loc, threadId);
|
||||
|
@ -356,7 +369,9 @@ public:
|
|||
PRINT(LD_LOOP,
|
||||
"dispatch init (dyn) : num threads = %d, lb = %llu, ub = %" PRId64
|
||||
", chunk %" PRIu64 "\n",
|
||||
tnum, omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId),
|
||||
(int)tnum,
|
||||
(unsigned long long)
|
||||
omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId),
|
||||
omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId),
|
||||
omptarget_nvptx_threadPrivateContext->Chunk(teamId));
|
||||
}
|
||||
|
@ -380,22 +395,22 @@ public:
|
|||
// c. lb and ub >= loopUpperBound: empty chunk --> FINISHED
|
||||
// a.
|
||||
if (lb <= loopUpperBound && ub < loopUpperBound) {
|
||||
PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; not finished\n", P64(lb),
|
||||
P64(ub), P64(loopUpperBound));
|
||||
PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; not finished\n",
|
||||
(long long)lb, (long long)ub, (long long)loopUpperBound);
|
||||
return NOT_FINISHED;
|
||||
}
|
||||
// b.
|
||||
if (lb <= loopUpperBound) {
|
||||
PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; clip to loop ub\n",
|
||||
P64(lb), P64(ub), P64(loopUpperBound));
|
||||
(long long)lb, (long long)ub, (long long)loopUpperBound);
|
||||
ub = loopUpperBound;
|
||||
return LAST_CHUNK;
|
||||
}
|
||||
// c. if we are here, we are in case 'c'
|
||||
lb = loopUpperBound + 2;
|
||||
ub = loopUpperBound + 1;
|
||||
PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; finished\n", P64(lb),
|
||||
P64(ub), P64(loopUpperBound));
|
||||
PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; finished\n", (long long)lb,
|
||||
(long long)ub, (long long)loopUpperBound);
|
||||
return FINISHED;
|
||||
}
|
||||
|
||||
|
@ -426,7 +441,7 @@ public:
|
|||
// finished?
|
||||
if (myLb > ub) {
|
||||
PRINT(LD_LOOP, "static loop finished with myLb %lld, ub %lld\n",
|
||||
P64(myLb), P64(ub));
|
||||
(long long)myLb, (long long)ub);
|
||||
return DISPATCH_FINISHED;
|
||||
}
|
||||
// not finished, save current bounds
|
||||
|
@ -442,7 +457,7 @@ public:
|
|||
ST stride = omptarget_nvptx_threadPrivateContext->Stride(tid);
|
||||
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = myLb + stride;
|
||||
PRINT(LD_LOOP, "static loop continues with myLb %lld, myUb %lld\n",
|
||||
P64(*plower), P64(*pupper));
|
||||
(long long)*plower, (long long)*pupper);
|
||||
return DISPATCH_NOTFINISHED;
|
||||
}
|
||||
ASSERT0(LT_FUSSY,
|
||||
|
@ -464,12 +479,13 @@ public:
|
|||
*pupper = myUb;
|
||||
*pstride = 1;
|
||||
|
||||
PRINT(LD_LOOP,
|
||||
"Got sched: active %d, total %d: lb %lld, ub %lld, stride = %lld, "
|
||||
"last %d\n",
|
||||
GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()),
|
||||
GetNumberOfWorkersInTeam(), P64(*plower), P64(*pupper), P64(*pstride),
|
||||
*plast);
|
||||
PRINT(
|
||||
LD_LOOP,
|
||||
"Got sched: active %d, total %d: lb %lld, ub %lld, stride = %lld, "
|
||||
"last %d\n",
|
||||
(int)GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()),
|
||||
(int)GetNumberOfWorkersInTeam(), (long long)*plower, (long long)*pupper,
|
||||
(long long)*pstride, (int)*plast);
|
||||
return DISPATCH_NOTFINISHED;
|
||||
}
|
||||
|
||||
|
|
|
@ -150,7 +150,7 @@ EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime,
|
|||
PRINT(LD_PAR,
|
||||
"thread will execute parallel region with id %d in a team of "
|
||||
"%d threads\n",
|
||||
newTaskDescr->ThreadId(), newTaskDescr->ThreadsInTeam());
|
||||
(int)newTaskDescr->ThreadId(), (int)newTaskDescr->ThreadsInTeam());
|
||||
|
||||
if (RequiresDataSharing && threadId % WARPSIZE == 0) {
|
||||
// Warp master innitializes data sharing environment.
|
||||
|
|
|
@ -76,7 +76,7 @@ EXTERN bool __kmpc_kernel_convergent_simd(void *buffer, uint32_t Mask,
|
|||
else
|
||||
*NumLanes = ConvergentSize;
|
||||
ASSERT(LT_FUSSY, *NumLanes > 0, "bad thread request of %d threads",
|
||||
*NumLanes);
|
||||
(int)*NumLanes);
|
||||
|
||||
// Set to true for lanes participating in the simd region.
|
||||
bool isActive = false;
|
||||
|
@ -152,7 +152,7 @@ EXTERN bool __kmpc_kernel_convergent_parallel(void *buffer, uint32_t Mask,
|
|||
else
|
||||
NumThreads = ConvergentSize;
|
||||
ASSERT(LT_FUSSY, NumThreads > 0, "bad thread request of %d threads",
|
||||
NumThreads);
|
||||
(int)NumThreads);
|
||||
|
||||
// Set to true for workers participating in the parallel region.
|
||||
bool isActive = false;
|
||||
|
@ -260,7 +260,7 @@ EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn,
|
|||
}
|
||||
|
||||
ASSERT(LT_FUSSY, NumThreads > 0, "bad thread request of %d threads",
|
||||
NumThreads);
|
||||
(int)NumThreads);
|
||||
ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(),
|
||||
"only team master can create parallel");
|
||||
|
||||
|
@ -307,7 +307,7 @@ EXTERN bool __kmpc_kernel_parallel(void **WorkFn,
|
|||
PRINT(LD_PAR,
|
||||
"thread will execute parallel region with id %d in a team of "
|
||||
"%d threads\n",
|
||||
newTaskDescr->ThreadId(), newTaskDescr->NThreads());
|
||||
(int)newTaskDescr->ThreadId(), (int)newTaskDescr->NThreads());
|
||||
|
||||
isActive = true;
|
||||
}
|
||||
|
@ -438,7 +438,7 @@ EXTERN void __kmpc_push_num_threads(kmp_Ident *loc, int32_t tid,
|
|||
|
||||
EXTERN void __kmpc_push_simd_limit(kmp_Ident *loc, int32_t tid,
|
||||
int32_t simd_limit) {
|
||||
PRINT(LD_IO, "call kmpc_push_simd_limit %d\n", simd_limit);
|
||||
PRINT(LD_IO, "call kmpc_push_simd_limit %d\n", (int)simd_limit);
|
||||
ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), "Runtime must be initialized.");
|
||||
tid = GetLogicalThreadIdInBlock();
|
||||
omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(tid) = simd_limit;
|
||||
|
@ -449,12 +449,12 @@ EXTERN void __kmpc_push_simd_limit(kmp_Ident *loc, int32_t tid,
|
|||
|
||||
EXTERN void __kmpc_push_num_teams(kmp_Ident *loc, int32_t tid,
|
||||
int32_t num_teams, int32_t thread_limit) {
|
||||
PRINT(LD_IO, "call kmpc_push_num_teams %d\n", num_teams);
|
||||
PRINT(LD_IO, "call kmpc_push_num_teams %d\n", (int)num_teams);
|
||||
ASSERT0(LT_FUSSY, FALSE,
|
||||
"should never have anything with new teams on device");
|
||||
}
|
||||
|
||||
EXTERN void __kmpc_push_proc_bind(kmp_Ident *loc, uint32_t tid,
|
||||
int proc_bind) {
|
||||
PRINT(LD_IO, "call kmpc_push_proc_bind %d\n", proc_bind);
|
||||
PRINT(LD_IO, "call kmpc_push_proc_bind %d\n", (int)proc_bind);
|
||||
}
|
||||
|
|
|
@ -76,12 +76,7 @@ EXTERN int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size) {
|
|||
}
|
||||
|
||||
EXTERN int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size) {
|
||||
int lo, hi;
|
||||
asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val));
|
||||
hi = __SHFL_DOWN_SYNC(0xFFFFFFFF, hi, delta, size);
|
||||
lo = __SHFL_DOWN_SYNC(0xFFFFFFFF, lo, delta, size);
|
||||
asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi));
|
||||
return val;
|
||||
return __SHFL_DOWN_SYNC(0xFFFFFFFFFFFFFFFFL, val, delta, size);
|
||||
}
|
||||
|
||||
static INLINE void gpu_regular_warp_reduce(void *reduce_data,
|
||||
|
|
|
@ -231,19 +231,20 @@ INLINE unsigned long PadBytes(unsigned long size,
|
|||
{
|
||||
// compute the necessary padding to satisfy alignment constraint
|
||||
ASSERT(LT_FUSSY, (alignment & (alignment - 1)) == 0,
|
||||
"alignment %ld is not a power of 2\n", alignment);
|
||||
"alignment %lu is not a power of 2\n", alignment);
|
||||
return (~(unsigned long)size + 1) & (alignment - 1);
|
||||
}
|
||||
|
||||
INLINE void *SafeMalloc(size_t size, const char *msg) // check if success
|
||||
{
|
||||
void *ptr = malloc(size);
|
||||
PRINT(LD_MEM, "malloc data of size %zu for %s: 0x%llx\n", size, msg, P64(ptr));
|
||||
PRINT(LD_MEM, "malloc data of size %zu for %s: 0x%llx\n", size, msg,
|
||||
(unsigned long long)ptr);
|
||||
return ptr;
|
||||
}
|
||||
|
||||
INLINE void *SafeFree(void *ptr, const char *msg) {
|
||||
PRINT(LD_MEM, "free data ptr 0x%llx for %s\n", P64(ptr), msg);
|
||||
PRINT(LD_MEM, "free data ptr 0x%llx for %s\n", (unsigned long long)ptr, msg);
|
||||
free(ptr);
|
||||
return NULL;
|
||||
}
|
||||
|
|
|
@ -61,7 +61,7 @@ EXTERN void __kmpc_barrier(kmp_Ident *loc_ref, int32_t tid) {
|
|||
|
||||
PRINT(LD_SYNC,
|
||||
"call kmpc_barrier with %d omp threads, sync parameter %d\n",
|
||||
numberOfActiveOMPThreads, threads);
|
||||
(int)numberOfActiveOMPThreads, (int)threads);
|
||||
// Barrier #1 is for synchronization among active threads.
|
||||
named_sync(L1_BARRIER, threads);
|
||||
}
|
||||
|
@ -89,7 +89,7 @@ EXTERN void __kmpc_barrier_simple_generic(kmp_Ident *loc_ref, int32_t tid) {
|
|||
PRINT(LD_SYNC,
|
||||
"call kmpc_barrier_simple_generic with %d omp threads, sync parameter "
|
||||
"%d\n",
|
||||
numberOfActiveOMPThreads, threads);
|
||||
(int)numberOfActiveOMPThreads, (int)threads);
|
||||
// Barrier #1 is for synchronization among active threads.
|
||||
named_sync(L1_BARRIER, threads);
|
||||
PRINT0(LD_SYNC, "completed kmpc_barrier_simple_generic\n");
|
||||
|
|
|
@ -39,14 +39,15 @@ EXTERN kmp_TaskDescr *__kmpc_omp_task_alloc(
|
|||
PRINT(LD_IO,
|
||||
"call __kmpc_omp_task_alloc(size priv&struct %lld, shared %lld, "
|
||||
"fct 0x%llx)\n",
|
||||
P64(sizeOfTaskInclPrivate), P64(sizeOfSharedTable), P64(taskSub));
|
||||
(long long)sizeOfTaskInclPrivate, (long long)sizeOfSharedTable,
|
||||
(unsigned long long)taskSub);
|
||||
// want task+priv to be a multiple of 8 bytes
|
||||
size_t padForTaskInclPriv = PadBytes(sizeOfTaskInclPrivate, sizeof(void *));
|
||||
sizeOfTaskInclPrivate += padForTaskInclPriv;
|
||||
size_t kmpSize = sizeOfTaskInclPrivate + sizeOfSharedTable;
|
||||
ASSERT(LT_FUSSY, sizeof(omptarget_nvptx_TaskDescr) % sizeof(void *) == 0,
|
||||
"need task descr of size %d to be a multiple of %d\n",
|
||||
sizeof(omptarget_nvptx_TaskDescr), sizeof(void *));
|
||||
(int)sizeof(omptarget_nvptx_TaskDescr), (int)sizeof(void *));
|
||||
size_t totSize = sizeof(omptarget_nvptx_TaskDescr) + kmpSize;
|
||||
omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr =
|
||||
(omptarget_nvptx_ExplicitTaskDescr *)SafeMalloc(
|
||||
|
@ -63,7 +64,8 @@ EXTERN kmp_TaskDescr *__kmpc_omp_task_alloc(
|
|||
newKmpTaskDescr->sub = taskSub;
|
||||
newKmpTaskDescr->destructors = NULL;
|
||||
PRINT(LD_TASK, "return with task descr kmp: 0x%llx, omptarget-nvptx 0x%llx\n",
|
||||
P64(newKmpTaskDescr), P64(newExplicitTaskDescr));
|
||||
(unsigned long long)newKmpTaskDescr,
|
||||
(unsigned long long)newExplicitTaskDescr);
|
||||
|
||||
return newKmpTaskDescr;
|
||||
}
|
||||
|
@ -102,10 +104,11 @@ EXTERN int32_t __kmpc_omp_task_with_deps(kmp_Ident *loc, uint32_t global_tid,
|
|||
|
||||
// 3. call sub
|
||||
PRINT(LD_TASK, "call task sub 0x%llx(task descr 0x%llx)\n",
|
||||
P64(newKmpTaskDescr->sub), P64(newKmpTaskDescr));
|
||||
(unsigned long long)newKmpTaskDescr->sub,
|
||||
(unsigned long long)newKmpTaskDescr);
|
||||
newKmpTaskDescr->sub(0, newKmpTaskDescr);
|
||||
PRINT(LD_TASK, "return from call task sub 0x%llx()\n",
|
||||
P64(newKmpTaskDescr->sub));
|
||||
(unsigned long long)newKmpTaskDescr->sub);
|
||||
|
||||
// 4. pop context
|
||||
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid,
|
||||
|
@ -118,7 +121,7 @@ EXTERN int32_t __kmpc_omp_task_with_deps(kmp_Ident *loc, uint32_t global_tid,
|
|||
EXTERN void __kmpc_omp_task_begin_if0(kmp_Ident *loc, uint32_t global_tid,
|
||||
kmp_TaskDescr *newKmpTaskDescr) {
|
||||
PRINT(LD_IO, "call to __kmpc_omp_task_begin_if0(task 0x%llx)\n",
|
||||
P64(newKmpTaskDescr));
|
||||
(unsigned long long)newKmpTaskDescr);
|
||||
ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc),
|
||||
"Runtime must be initialized.");
|
||||
// 1. get explict task descr from kmp task descr
|
||||
|
@ -144,7 +147,7 @@ EXTERN void __kmpc_omp_task_begin_if0(kmp_Ident *loc, uint32_t global_tid,
|
|||
EXTERN void __kmpc_omp_task_complete_if0(kmp_Ident *loc, uint32_t global_tid,
|
||||
kmp_TaskDescr *newKmpTaskDescr) {
|
||||
PRINT(LD_IO, "call to __kmpc_omp_task_complete_if0(task 0x%llx)\n",
|
||||
P64(newKmpTaskDescr));
|
||||
(unsigned long long)newKmpTaskDescr);
|
||||
ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc),
|
||||
"Runtime must be initialized.");
|
||||
// 1. get explict task descr from kmp task descr
|
||||
|
|
Loading…
Reference in New Issue