D9306 omp 4.1 async offload support (partial): code changes

llvm-svn: 236753
Andrey Churbanov 2015-05-07 17:41:51 +00:00
parent d9d900c05b
commit 535b6faaf0
7 changed files with 426 additions and 23 deletions

runtime/src/dllexports

@@ -381,6 +381,14 @@ kmpc_set_defaults 224
%endif # OMP_40
%endif
# OpenMP 4.1 entry points
%ifndef stub
%ifdef OMP_41
__kmpc_proxy_task_completed 259
__kmpc_proxy_task_completed_ooo 260
%endif
%endif
# User API entry points that have both lower- and upper- case versions for Fortran.
# Number for lowercase version is indicated. Number for uppercase is obtained by adding 1000.
# User API entry points are entry points that start with 'kmp_' or 'omp_'.
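In concrete terms, the case-pairing rule above means one listed ordinal per symbol, with the uppercase Fortran alias derived from it. A hypothetical entry (name and ordinal invented for illustration, not present in this file):

omp_foo 300   # exports lowercase omp_foo at ordinal 300 and uppercase OMP_FOO at 1300

The two __kmpc_proxy_* entry points added above start with neither kmp_ nor omp_, so no uppercase alias is generated for them.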

runtime/src/kmp.h

@@ -51,6 +51,8 @@
#define TASK_UNTIED 0
#define TASK_EXPLICIT 1
#define TASK_IMPLICIT 0
#define TASK_PROXY 1
#define TASK_FULL 0
#define KMP_CANCEL_THREADS
#define KMP_THREAD_ATTR
@@ -1987,7 +1989,12 @@ typedef struct kmp_tasking_flags { /* Total struct must be exactly 32 b
unsigned merged_if0 : 1; /* no __kmpc_task_{begin/complete}_if0 calls in if0 code path */
#if OMP_40_ENABLED
unsigned destructors_thunk : 1; /* set if the compiler creates a thunk to invoke destructors from the runtime */
#if OMP_41_ENABLED
unsigned proxy : 1; /* task is a proxy task (it will be executed outside the context of the RTL) */
unsigned reserved : 11; /* reserved for compiler use */
#else
unsigned reserved : 12; /* reserved for compiler use */
#endif
#else // OMP_40_ENABLED
unsigned reserved : 13; /* reserved for compiler use */
#endif // OMP_40_ENABLED
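For the compiler's side of this contract, the bits arrive packed in the kmp_int32 flags argument of __kmpc_omp_task_alloc (see kmp_tasking.c below). A sketch of the layout implied by the field order, assuming the tiedness and final fields occupy the two bits ahead of merged_if0 as in the full kmp.h struct; the constant names are invented for illustration:

/* Bit positions follow the field order of kmp_tasking_flags. */
enum {
    tf_tiedness          = 0x01,  /* 1 == TASK_TIED, 0 == TASK_UNTIED */
    tf_final             = 0x02,
    tf_merged_if0        = 0x04,
    tf_destructors_thunk = 0x08,  /* OMP_40_ENABLED */
    tf_proxy             = 0x10   /* OMP_41_ENABLED, new in this patch */
};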
@@ -2077,6 +2084,9 @@ typedef struct kmp_base_task_team {
/* TRUE means tt_threads_data is set up and initialized */
kmp_int32 tt_nproc; /* #threads in team */
kmp_int32 tt_max_threads; /* number of entries allocated for threads_data array */
#if OMP_41_ENABLED
kmp_int32 tt_found_proxy_tasks; /* Have we found proxy tasks since last barrier */
#endif
KMP_ALIGN_CACHE
volatile kmp_uint32 tt_unfinished_threads; /* #threads still active */
@@ -3147,7 +3157,7 @@ int __kmp_execute_tasks_oncore(kmp_info_t *thread, kmp_int32 gtid, kmp_flag_onco
extern void __kmp_reap_task_teams( void );
extern void __kmp_unref_task_team( kmp_task_team_t *task_team, kmp_info_t *thread );
extern void __kmp_wait_to_unref_task_teams( void );
extern void __kmp_task_team_setup ( kmp_info_t *this_thr, kmp_team_t *team, int both );
extern void __kmp_task_team_setup ( kmp_info_t *this_thr, kmp_team_t *team, int both, int always );
extern void __kmp_task_team_sync ( kmp_info_t *this_thr, kmp_team_t *team );
extern void __kmp_task_team_wait ( kmp_info_t *this_thr, kmp_team_t *team
#if USE_ITT_BUILD
@@ -3302,8 +3312,16 @@ KMP_EXPORT kmp_int32 __kmpc_cancellationpoint(ident_t* loc_ref, kmp_int32 gtid,
KMP_EXPORT kmp_int32 __kmpc_cancel_barrier(ident_t* loc_ref, kmp_int32 gtid);
KMP_EXPORT int __kmp_get_cancellation_status(int cancel_kind);
#if OMP_41_ENABLED
KMP_EXPORT void __kmpc_proxy_task_completed( kmp_int32 gtid, kmp_task_t *ptask );
KMP_EXPORT void __kmpc_proxy_task_completed_ooo ( kmp_task_t *ptask );
#endif
#endif
/*
* Lock interface routines (fast versions with gtid passed in)
*/

runtime/src/kmp_barrier.cpp

@@ -1135,7 +1135,7 @@ __kmp_barrier(enum barrier_type bt, int gtid, int is_split, size_t reduce_size,
if (__kmp_tasking_mode != tskm_immediate_exec) {
__kmp_task_team_wait(this_thr, team
USE_ITT_BUILD_ARG(itt_sync_obj) );
__kmp_task_team_setup(this_thr, team, 0); // use 0 to only setup the current team
__kmp_task_team_setup(this_thr, team, 0, 0); // use 0,0 to only setup the current team if nthreads > 1
}
@@ -1227,9 +1227,32 @@ __kmp_barrier(enum barrier_type bt, int gtid, int is_split, size_t reduce_size,
} else { // Team is serialized.
status = 0;
if (__kmp_tasking_mode != tskm_immediate_exec) {
#if OMP_41_ENABLED
if ( this_thr->th.th_task_team != NULL ) {
void *itt_sync_obj = NULL;
#if USE_ITT_NOTIFY
if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
__kmp_itt_barrier_starting(gtid, itt_sync_obj);
}
#endif
kmp_task_team_t * task_team = this_thr->th.th_task_team;
KMP_DEBUG_ASSERT(task_team->tt.tt_found_proxy_tasks == TRUE);
__kmp_task_team_wait(this_thr, team
USE_ITT_BUILD_ARG(itt_sync_obj));
__kmp_task_team_setup(this_thr, team, 0, 0);
#if USE_ITT_BUILD
if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
__kmp_itt_barrier_finished(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
}
#else
// The task team should be NULL for serialized code (tasks will be executed immediately)
KMP_DEBUG_ASSERT(team->t.t_task_team[this_thr->th.th_task_state] == NULL);
KMP_DEBUG_ASSERT(this_thr->th.th_task_team == NULL);
#endif
}
}
KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
@@ -1532,7 +1555,7 @@ __kmp_fork_barrier(int gtid, int tid)
#endif
if (__kmp_tasking_mode != tskm_immediate_exec) {
__kmp_task_team_setup(this_thr, team, 1); // 1 indicates setup both task teams
__kmp_task_team_setup(this_thr, team, 1, 0); // 1,0 indicates setup both task teams if nthreads > 1
}
/* The master thread may have changed its blocktime between the join barrier and the

runtime/src/kmp_csupport.c

@@ -461,6 +461,14 @@ __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid)
this_thr = __kmp_threads[ global_tid ];
serial_team = this_thr->th.th_serial_team;
#if OMP_41_ENABLED
kmp_task_team_t * task_team = this_thr->th.th_task_team;
// we need to wait for the proxy tasks before finishing the thread
if ( task_team != NULL && task_team->tt.tt_found_proxy_tasks )
__kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL) ); // is an ITT object needed here?
#endif
KMP_MB();
KMP_DEBUG_ASSERT( serial_team );
KMP_ASSERT( serial_team -> t.t_serialized );
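The scenario this guards against is a serialized parallel region whose proxy tasks are still in flight when the region ends. A minimal user-level sketch that would reach this path, assuming a compiler that lowers target nowait onto the proxy-task machinery added here (the pragmas are illustrative user code, not part of the patch):

#include <stdio.h>

void f(void)
{
    int x = 0;
    #pragma omp parallel if(0)                     /* serialized team          */
    {
        #pragma omp target nowait map(tofrom: x)   /* lowered to a proxy task  */
        { x = 42; }
    }   /* __kmpc_end_serialized_parallel must now wait for the proxy task     */
    printf("%d\n", x);
}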

runtime/src/kmp_runtime.c

@@ -3940,6 +3940,16 @@ __kmp_unregister_root_current_thread( int gtid )
KMP_MB();
#if OMP_41_ENABLED
kmp_info_t * thread = __kmp_threads[gtid];
kmp_team_t * team = thread->th.th_team;
kmp_task_team_t * task_team = thread->th.th_task_team;
// we need to wait for the proxy tasks before finishing the thread
if ( task_team != NULL && task_team->tt.tt_found_proxy_tasks )
__kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL) );
#endif
__kmp_reset_root(gtid, root);
/* free up this thread slot */

runtime/src/kmp_taskdeps.cpp

@@ -403,6 +403,9 @@ __kmpc_omp_task_with_deps( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t * new_ta
kmp_taskdata_t * current_task = thread->th.th_current_task;
bool serial = current_task->td_flags.team_serial || current_task->td_flags.tasking_ser || current_task->td_flags.final;
#if OMP_41_ENABLED
serial = serial && !(new_taskdata->td_flags.proxy == TASK_PROXY);
#endif
if ( !serial && ( ndeps > 0 || ndeps_noalias > 0 )) {
/* if no dependencies have been tracked yet, create the dependence hash */
@@ -425,11 +428,20 @@ __kmpc_omp_task_with_deps( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t * new_ta
new_taskdata ) );
return TASK_CURRENT_NOT_QUEUED;
}
} else {
#if OMP_41_ENABLED
kmp_task_team_t * task_team = thread->th.th_task_team;
if ( task_team && task_team->tt.tt_found_proxy_tasks )
__kmpc_omp_wait_deps ( loc_ref, gtid, ndeps, dep_list, ndeps_noalias, noalias_dep_list );
else
#endif
KA_TRACE(10, ("__kmpc_omp_task_with_deps(exit): T#%d ignored dependencies for task (serialized)"
"loc=%p task=%p\n", gtid, loc_ref, new_taskdata ) );
}
KA_TRACE(10, ("__kmpc_omp_task_with_deps(exit): T#%d task had no blocking dependencies : "
"loc=%p task=%p, transferring to __kmpc_omp_task\n", gtid, loc_ref,
new_taskdata ) );
new_taskdata ) );
return __kmpc_omp_task(loc_ref,gtid,new_task);
}
@@ -460,9 +472,15 @@ __kmpc_omp_wait_deps ( ident_t *loc_ref, kmp_int32 gtid, kmp_int32 ndeps, kmp_de
kmp_taskdata_t * current_task = thread->th.th_current_task;
// We can return immediately as:
// - dependences are not computed in serial teams
// - dependences are not computed in serial teams (except if we have proxy tasks)
// - if the dephash is not yet created it means we have nothing to wait for
if ( current_task->td_flags.team_serial || current_task->td_flags.tasking_ser || current_task->td_flags.final || current_task->td_dephash == NULL ) {
bool ignore = current_task->td_flags.team_serial || current_task->td_flags.tasking_ser || current_task->td_flags.final;
#if OMP_41_ENABLED
ignore = ignore && thread->th.th_task_team != NULL && thread->th.th_task_team->tt.tt_found_proxy_tasks == FALSE;
#endif
ignore = ignore || current_task->td_dephash == NULL;
if ( ignore ) {
KA_TRACE(10, ("__kmpc_omp_wait_deps(exit): T#%d has no blocking dependencies : loc=%p\n", gtid, loc_ref) );
return;
}
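The reworked test reflects that a serialized team can now have real asynchrony: a proxy task may still be running, so depend clauses can no longer be discarded wholesale. An illustrative sketch, under the same assumption that target nowait lowers to a proxy task:

void g(void)
{
    int x = 0;
    #pragma omp parallel if(0)                                  /* serialized  */
    {
        #pragma omp target nowait map(tofrom: x) depend(out: x) /* proxy task  */
        { x = 42; }

        /* Previously a no-op in a serialized team; with a proxy task pending
           this must now wait, via the __kmpc_omp_wait_deps path above.        */
        #pragma omp task if(0) depend(in: x)
        { /* sees x == 42 */ }
    }
}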

runtime/src/kmp_tasking.c

@@ -32,6 +32,10 @@ static void __kmp_enable_tasking( kmp_task_team_t *task_team, kmp_info_t *this_t
static void __kmp_alloc_task_deque( kmp_info_t *thread, kmp_thread_data_t *thread_data );
static int __kmp_realloc_task_threads_data( kmp_info_t *thread, kmp_task_team_t *task_team );
#ifdef OMP_41_ENABLED
static void __kmp_bottom_half_finish_proxy( kmp_int32 gtid, kmp_task_t * ptask );
#endif
static inline void __kmp_null_resume_wrapper(int gtid, volatile void *flag) {
switch (((kmp_flag_64 *)flag)->get_type()) {
case flag32: __kmp_resume_32(gtid, NULL); break;
@@ -312,8 +316,19 @@ __kmp_push_task(kmp_int32 gtid, kmp_task_t * task )
// Lock the deque for the task push operation
__kmp_acquire_bootstrap_lock( & thread_data -> td.td_deque_lock );
#if OMP_41_ENABLED
// Need to recheck as we can get a proxy task from a thread outside of OpenMP
if ( TCR_4(thread_data -> td.td_deque_ntasks) >= TASK_DEQUE_SIZE )
{
__kmp_release_bootstrap_lock( & thread_data -> td.td_deque_lock );
KA_TRACE(20, ( "__kmp_push_task: T#%d deque is full on 2nd check; returning TASK_NOT_PUSHED for task %p\n",
gtid, taskdata ) );
return TASK_NOT_PUSHED;
}
#else
// Must have room since no thread can add tasks but calling thread
KMP_DEBUG_ASSERT( TCR_4(thread_data -> td.td_deque_ntasks) < TASK_DEQUE_SIZE );
#endif
thread_data -> td.td_deque[ thread_data -> td.td_deque_tail ] = taskdata; // Push taskdata
// Wrap index.
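The recheck above is the standard test/lock/test shape: once threads outside the team can push proxy bottom halves into this deque, the unlocked size check is only a fast-path hint and must be repeated under the lock. A condensed, self-contained sketch of the pattern (generic types, not the runtime's own):

#include <pthread.h>
#include <stdbool.h>

#define DEQUE_SIZE 256

typedef struct {
    void           *slots[DEQUE_SIZE];
    int             ntasks;
    int             tail;
    pthread_mutex_t lock;
} deque_t;

/* Returns false if the deque is full; the caller then runs the task inline. */
static bool try_push(deque_t *dq, void *task)
{
    if (dq->ntasks >= DEQUE_SIZE)        /* unlocked fast-path check           */
        return false;
    pthread_mutex_lock(&dq->lock);
    if (dq->ntasks >= DEQUE_SIZE) {      /* recheck: another thread may have   */
        pthread_mutex_unlock(&dq->lock); /* pushed between the check and lock  */
        return false;
    }
    dq->slots[dq->tail] = task;
    dq->tail = (dq->tail + 1) % DEQUE_SIZE;
    dq->ntasks++;
    pthread_mutex_unlock(&dq->lock);
    return true;
}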
@@ -780,6 +795,10 @@ __kmp_init_implicit_task( ident_t *loc_ref, kmp_info_t *this_thr, kmp_team_t *te
task->td_flags.tiedness = TASK_TIED;
task->td_flags.tasktype = TASK_IMPLICIT;
#if OMP_41_ENABLED
task->td_flags.proxy = TASK_FULL;
#endif
// All implicit tasks are executed immediately, not deferred
task->td_flags.task_serial = 1;
task->td_flags.tasking_ser = ( __kmp_tasking_mode == tskm_immediate_exec );
@@ -864,6 +883,40 @@ __kmp_task_alloc( ident_t *loc_ref, kmp_int32 gtid, kmp_tasking_flags_t *flags,
flags->final = 1;
}
#if OMP_41_ENABLED
if ( flags->proxy == TASK_PROXY ) {
flags->tiedness = TASK_UNTIED;
flags->merged_if0 = 1;
/* are we running in a serialized parallel region or in tskm_immediate_exec mode? either way we need tasking support enabled */
if ( (thread->th.th_task_team) == NULL ) {
/* This should only happen if the team is serialized
setup a task team and propagate it to the thread
*/
KMP_DEBUG_ASSERT(team->t.t_serialized);
KA_TRACE(30,("T#%d creating task team in __kmp_task_alloc for proxy task\n", gtid));
__kmp_task_team_setup(thread,team,0,1); // 0,1 indicates only setup the current team regardless of nthreads
thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state];
}
kmp_task_team_t * task_team = thread->th.th_task_team;
/* tasking must be enabled now as the task might not be pushed */
if ( !KMP_TASKING_ENABLED( task_team ) ) {
KA_TRACE(30,("T#%d enabling tasking in __kmp_task_alloc for proxy task\n", gtid));
__kmp_enable_tasking( task_team, thread );
kmp_int32 tid = thread->th.th_info.ds.ds_tid;
kmp_thread_data_t * thread_data = & task_team -> tt.tt_threads_data[ tid ];
// No lock needed since only owner can allocate
if (thread_data -> td.td_deque == NULL ) {
__kmp_alloc_task_deque( thread, thread_data );
}
}
if ( task_team->tt.tt_found_proxy_tasks == FALSE )
TCW_4(task_team -> tt.tt_found_proxy_tasks, TRUE);
}
#endif
// Calculate shared structure offset including padding after kmp_task_t struct
// to align pointers in shared struct
shareds_offset = sizeof( kmp_taskdata_t ) + sizeof_kmp_task_t;
@@ -913,7 +966,11 @@ __kmp_task_alloc( ident_t *loc_ref, kmp_int32 gtid, kmp_tasking_flags_t *flags,
taskdata->td_taskwait_counter = 0;
taskdata->td_taskwait_thread = 0;
KMP_DEBUG_ASSERT( taskdata->td_parent != NULL );
copy_icvs( &taskdata->td_icvs, &taskdata->td_parent->td_icvs );
#if OMP_41_ENABLED
// avoid copying icvs for proxy tasks
if ( flags->proxy == TASK_FULL )
#endif
copy_icvs( &taskdata->td_icvs, &taskdata->td_parent->td_icvs );
taskdata->td_flags.tiedness = flags->tiedness;
taskdata->td_flags.final = flags->final;
@@ -921,6 +978,9 @@ __kmp_task_alloc( ident_t *loc_ref, kmp_int32 gtid, kmp_tasking_flags_t *flags,
#if OMP_40_ENABLED
taskdata->td_flags.destructors_thunk = flags->destructors_thunk;
#endif // OMP_40_ENABLED
#if OMP_41_ENABLED
taskdata->td_flags.proxy = flags->proxy;
#endif
taskdata->td_flags.tasktype = TASK_EXPLICIT;
// GEH - TODO: fix this to copy parent task's value of tasking_ser flag
@@ -949,8 +1009,14 @@ __kmp_task_alloc( ident_t *loc_ref, kmp_int32 gtid, kmp_tasking_flags_t *flags,
taskdata->td_dephash = NULL;
taskdata->td_depnode = NULL;
#endif
// Only need to keep track of child task counts if team parallel and tasking not serialized
if ( !( taskdata -> td_flags.team_serial || taskdata -> td_flags.tasking_ser ) ) {
// Only need to keep track of child task counts if team parallel and tasking not serialized or if it is a proxy task
#if OMP_41_ENABLED
if ( flags->proxy == TASK_PROXY || !( taskdata -> td_flags.team_serial || taskdata -> td_flags.tasking_ser ) )
#else
if ( !( taskdata -> td_flags.team_serial || taskdata -> td_flags.tasking_ser ) )
#endif
{
KMP_TEST_THEN_INC32( (kmp_int32 *)(& parent_task->td_incomplete_child_tasks) );
#if OMP_40_ENABLED
if ( parent_task->td_taskgroup )
@@ -989,9 +1055,14 @@ __kmpc_omp_task_alloc( ident_t *loc_ref, kmp_int32 gtid, kmp_int32 flags,
input_flags->native = FALSE;
// __kmp_task_alloc() sets up all other runtime flags
KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s) "
KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s %s) "
"sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
gtid, loc_ref, input_flags->tiedness ? "tied " : "untied",
#if OMP_41_ENABLED
input_flags->proxy ? "proxy" : "",
#else
"",
#endif
sizeof_kmp_task_t, sizeof_shareds, task_entry) );
retval = __kmp_task_alloc( loc_ref, gtid, input_flags, sizeof_kmp_task_t,
@@ -1019,6 +1090,27 @@ __kmp_invoke_task( kmp_int32 gtid, kmp_task_t *task, kmp_taskdata_t * current_ta
KA_TRACE(30, ("__kmp_invoke_task(enter): T#%d invoking task %p, current_task=%p\n",
gtid, taskdata, current_task) );
#if OMP_41_ENABLED
if ( taskdata->td_flags.proxy == TASK_PROXY &&
taskdata->td_flags.complete == 1)
{
// This is a proxy task that was already completed but it needs to run
// its bottom-half finish
KA_TRACE(30, ("__kmp_invoke_task: T#%d running bottom finish for proxy task %p\n",
gtid, taskdata) );
__kmp_bottom_half_finish_proxy(gtid,task);
KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed bottom finish for proxy task %p, resuming task %p\n", gtid, taskdata, current_task) );
return;
}
#endif
#if OMP_41_ENABLED
// Proxy tasks are not handled by the runtime
if ( taskdata->td_flags.proxy != TASK_PROXY )
#endif
__kmp_task_start( gtid, task, current_task );
#if OMPT_SUPPORT
@@ -1075,9 +1167,13 @@ __kmp_invoke_task( kmp_int32 gtid, kmp_task_t *task, kmp_taskdata_t * current_ta
}
#endif
__kmp_task_finish( gtid, task, current_task );
#if OMP_41_ENABLED
// Proxy tasks are not handled by the runtime
if ( taskdata->td_flags.proxy != TASK_PROXY )
#endif
__kmp_task_finish( gtid, task, current_task );
KA_TRACE(30, ("__kmp_inovke_task(exit): T#%d completed task %p, resuming task %p\n",
KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed task %p, resuming task %p\n",
gtid, taskdata, current_task) );
return;
}
@@ -1140,8 +1236,11 @@ __kmp_omp_task( kmp_int32 gtid, kmp_task_t * new_task, bool serialize_immediate
/* Should we execute the new task or queue it? For now, let's just always try to
queue it. If the queue fills up, then we'll execute it. */
#if OMP_41_ENABLED
if ( new_taskdata->td_flags.proxy == TASK_PROXY || __kmp_push_task( gtid, new_task ) == TASK_NOT_PUSHED ) // if cannot defer
#else
if ( __kmp_push_task( gtid, new_task ) == TASK_NOT_PUSHED ) // if cannot defer
#endif
{ // Execute this task immediately
kmp_taskdata_t * current_task = __kmp_threads[ gtid ] -> th.th_current_task;
if ( serialize_immediate )
@@ -1216,7 +1315,12 @@ __kmpc_omp_taskwait( ident_t *loc_ref, kmp_int32 gtid )
__kmp_itt_taskwait_starting( gtid, itt_sync_obj );
#endif /* USE_ITT_BUILD */
if ( ! taskdata->td_flags.team_serial ) {
#if OMP_41_ENABLED
if ( ! taskdata->td_flags.team_serial || (thread->th.th_task_team != NULL && thread->th.th_task_team->tt.tt_found_proxy_tasks) )
#else
if ( ! taskdata->td_flags.team_serial )
#endif
{
// GEH: if team serialized, avoid reading the volatile variable below.
kmp_flag_32 flag(&(taskdata->td_incomplete_child_tasks), 0U);
while ( TCR_4(taskdata -> td_incomplete_child_tasks) != 0 ) {
@@ -1338,7 +1442,12 @@ __kmpc_end_taskgroup( ident_t* loc, int gtid )
__kmp_itt_taskwait_starting( gtid, itt_sync_obj );
#endif /* USE_ITT_BUILD */
if ( ! taskdata->td_flags.team_serial ) {
#if OMP_41_ENABLED
if ( ! taskdata->td_flags.team_serial || (thread->th.th_task_team != NULL && thread->th.th_task_team->tt.tt_found_proxy_tasks) )
#else
if ( ! taskdata->td_flags.team_serial )
#endif
{
kmp_flag_32 flag(&(taskgroup->count), 0U);
while ( TCR_4(taskgroup->count) != 0 ) {
flag.execute_tasks(thread, gtid, FALSE, &thread_finished
@@ -1582,7 +1691,11 @@ static inline int __kmp_execute_tasks_template(kmp_info_t *thread, kmp_int32 gti
nthreads = task_team -> tt.tt_nproc;
unfinished_threads = &(task_team -> tt.tt_unfinished_threads);
#if OMP_41_ENABLED
KMP_DEBUG_ASSERT( nthreads > 1 || task_team->tt.tt_found_proxy_tasks);
#else
KMP_DEBUG_ASSERT( nthreads > 1 );
#endif
KMP_DEBUG_ASSERT( TCR_4((int)*unfinished_threads) >= 0 );
// Choose tasks from our own work queue.
@@ -1617,7 +1730,13 @@ static inline int __kmp_execute_tasks_template(kmp_info_t *thread, kmp_int32 gti
// This thread's work queue is empty. If we are in the final spin loop
// of the barrier, check and see if the termination condition is satisfied.
if (final_spin) {
#if OMP_41_ENABLED
// The work queue may be empty but there might be proxy tasks still executing
if (final_spin && TCR_4(current_task -> td_incomplete_child_tasks) == 0)
#else
if (final_spin)
#endif
{
// First, decrement the #unfinished threads, if that has not already
// been done. This decrement might be to the spin location, and
// result in the termination condition being satisfied.
@@ -1639,6 +1758,12 @@ static inline int __kmp_execute_tasks_template(kmp_info_t *thread, kmp_int32 gti
}
}
#if OMP_41_ENABLED
// check if there are other threads to steal from, otherwise go back
if ( nthreads == 1 )
goto start;
#endif
// Try to steal from the last place I stole from successfully.
tid = thread -> th.th_info.ds.ds_tid;//__kmp_tid_from_gtid( gtid );
last_stolen = threads_data[ tid ].td.td_deque_last_stolen;
@@ -1686,7 +1811,13 @@ static inline int __kmp_execute_tasks_template(kmp_info_t *thread, kmp_int32 gti
// The victim's work queue is empty. If we are in the final spin loop
// of the barrier, check and see if the termination condition is satisfied.
if (final_spin) {
#if OMP_41_ENABLED
// The work queue may be empty but there might be proxy tasks still executing
if (final_spin && TCR_4(current_task -> td_incomplete_child_tasks) == 0)
#else
if (final_spin)
#endif
{
// First, decrement the #unfinished threads, if that has not already
// been done. This decrement might be to the spin location, and
// result in the termination condition being satisfied.
@@ -1793,7 +1924,13 @@ static inline int __kmp_execute_tasks_template(kmp_info_t *thread, kmp_int32 gti
// Going on and finding a new victim to steal from is expensive, as it
// involves a lot of cache misses, so we definitely want to re-check the
// termination condition before doing that.
if (final_spin) {
#if OMP_41_ENABLED
// The work queue may be empty but there might be proxy tasks still executing
if (final_spin && TCR_4(current_task -> td_incomplete_child_tasks) == 0)
#else
if (final_spin)
#endif
{
// First, decrement the #unfinished threads, if that has not already
// been done. This decrement might be to the spin location, and
// result in the termination condition being satisfied.
@@ -2073,8 +2210,8 @@ __kmp_realloc_task_threads_data( kmp_info_t *thread, kmp_task_team_t *task_team
__kmp_allocate( nthreads * sizeof(kmp_thread_data_t) );
// copy old data to new data
KMP_MEMCPY_S( (void *) new_data, nthreads * sizeof(kmp_thread_data_t),
(void *) old_data,
maxthreads * sizeof(kmp_taskdata_t *) );
(void *) old_data,
maxthreads * sizeof(kmp_taskdata_t *) );
#ifdef BUILD_TIED_TASK_STACK
// GEH: Figure out if this is the right thing to do
@@ -2194,6 +2331,9 @@ __kmp_allocate_task_team( kmp_info_t *thread, kmp_team_t *team )
}
TCW_4(task_team -> tt.tt_found_tasks, FALSE);
#if OMP_41_ENABLED
TCW_4(task_team -> tt.tt_found_proxy_tasks, FALSE);
#endif
task_team -> tt.tt_nproc = nthreads = team->t.t_nproc;
TCW_4( task_team -> tt.tt_unfinished_threads, nthreads );
@@ -2365,11 +2505,11 @@ __kmp_wait_to_unref_task_teams(void)
// an already created, unused one if it already exists.
// This may be called by any thread, but only for teams with # threads >1.
void
__kmp_task_team_setup( kmp_info_t *this_thr, kmp_team_t *team, int both )
__kmp_task_team_setup( kmp_info_t *this_thr, kmp_team_t *team, int both, int always )
{
KMP_DEBUG_ASSERT( __kmp_tasking_mode != tskm_immediate_exec );
if ( ( team->t.t_task_team[this_thr->th.th_task_state] == NULL ) && ( team->t.t_nproc > 1 ) ) {
if ( ( team->t.t_task_team[this_thr->th.th_task_state] == NULL ) && ( always || team->t.t_nproc > 1 ) ) {
// Allocate a new task team, which will be propagated to
// all of the worker threads after the barrier. As they
spin in the barrier release phase, they will continue
@@ -2431,7 +2571,7 @@ __kmp_task_team_sync( kmp_info_t *this_thr, kmp_team_t *team )
//------------------------------------------------------------------------------
// __kmp_task_team_wait: Master thread waits for outstanding tasks after the
// barrier gather phase. Only called by master thread if #threads in team > 1 !
// barrier gather phase. Only called by master thread if #threads in team > 1 or if proxy tasks were created
void
__kmp_task_team_wait( kmp_info_t *this_thr, kmp_team_t *team
USE_ITT_BUILD_ARG(void * itt_sync_obj)
@@ -2457,7 +2597,12 @@ __kmp_task_team_wait( kmp_info_t *this_thr, kmp_team_t *team
// The master thread is not included in the ref count.
KA_TRACE( 20, ( "__kmp_task_team_wait: Master T#%d deactivating task_team %p\n",
__kmp_gtid_from_thread( this_thr ), task_team ) );
#if OMP_41_ENABLED
KMP_DEBUG_ASSERT( task_team->tt.tt_nproc > 1 || task_team->tt.tt_found_proxy_tasks == TRUE );
TCW_SYNC_4( task_team->tt.tt_found_proxy_tasks, FALSE );
#else
KMP_DEBUG_ASSERT( task_team->tt.tt_nproc > 1 );
#endif
TCW_SYNC_4( task_team->tt.tt_active, FALSE );
KMP_MB();
@@ -2505,3 +2650,176 @@ __kmp_tasking_barrier( kmp_team_t *team, kmp_info_t *thread, int gtid )
#endif /* USE_ITT_BUILD */
}
#if OMP_41_ENABLED
/* __kmp_give_task puts a task into a given thread's queue if:
- the queue for that thread has been created
- there's space in that queue
Because of this, __kmp_push_task needs to check if there's space after getting the lock
*/
static bool __kmp_give_task ( kmp_info_t *thread, kmp_int32 tid, kmp_task_t * task )
{
kmp_task_team_t * task_team = thread->th.th_task_team;
kmp_thread_data_t * thread_data = & task_team -> tt.tt_threads_data[ tid ];
kmp_taskdata_t * taskdata = KMP_TASK_TO_TASKDATA(task);
bool result = false;
KA_TRACE(20, ("__kmp_give_task: trying to give task %p to thread %d.\n", taskdata, tid ) );
// assert tasking is enabled? what if not?
KMP_DEBUG_ASSERT( task_team != NULL );
if (thread_data -> td.td_deque == NULL ) {
// There's no queue in this thread, go find another one
// We're guaranteed that at least one thread has a queue
KA_TRACE(30, ("__kmp_give_task: thread %d has no queue while giving task %p.\n", tid, taskdata ) );
return result;
}
if ( TCR_4(thread_data -> td.td_deque_ntasks) >= TASK_DEQUE_SIZE )
{
KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to thread %d.\n", taskdata, tid ) );
return result;
}
__kmp_acquire_bootstrap_lock( & thread_data-> td.td_deque_lock );
if ( TCR_4(thread_data -> td.td_deque_ntasks) >= TASK_DEQUE_SIZE )
{
KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to thread %d.\n", taskdata, tid ) );
goto release_and_exit;
}
thread_data -> td.td_deque[ thread_data -> td.td_deque_tail ] = taskdata;
// Wrap index.
thread_data -> td.td_deque_tail = ( thread_data -> td.td_deque_tail + 1 ) & TASK_DEQUE_MASK;
TCW_4(thread_data -> td.td_deque_ntasks, TCR_4(thread_data -> td.td_deque_ntasks) + 1);
result = true;
KA_TRACE(30, ("__kmp_give_task: succesfully gave task %p to thread %d.\n", taskdata, tid ) );
release_and_exit:
__kmp_release_bootstrap_lock( & thread_data-> td.td_deque_lock );
return result;
}
/* The finish of a proxy task is divided into two pieces:
- the top half, which can be run from a thread outside the team
- the bottom half, which must be run from a thread within the team
In order to run the bottom half, the task gets queued back into one of the threads of the team.
Once the td_incomplete_child_tasks counter of the parent is decremented, the threads can leave the barriers.
So the bottom half needs to be queued before the counter is decremented. The top half is therefore split in two parts:
- things that can be run before queuing the bottom half
- things that must be run after queuing the bottom half
This creates a second race, as the bottom half can free the task before the second top half has executed. To avoid this
we use the td_incomplete_child_tasks counter of the proxy task itself to synchronize the top and bottom halves.
*/
static void __kmp_first_top_half_finish_proxy( kmp_taskdata_t * taskdata )
{
KMP_DEBUG_ASSERT( taskdata -> td_flags.tasktype == TASK_EXPLICIT );
KMP_DEBUG_ASSERT( taskdata -> td_flags.proxy == TASK_PROXY );
KMP_DEBUG_ASSERT( taskdata -> td_flags.complete == 0 );
KMP_DEBUG_ASSERT( taskdata -> td_flags.freed == 0 );
taskdata -> td_flags.complete = 1; // mark the task as completed
if ( taskdata->td_taskgroup )
KMP_TEST_THEN_DEC32( (kmp_int32 *)(& taskdata->td_taskgroup->count) );
// Create an imaginary child for this task so the bottom half cannot release the task before we have completed the second top half
TCR_4(taskdata->td_incomplete_child_tasks++);
}
static void __kmp_second_top_half_finish_proxy( kmp_taskdata_t * taskdata )
{
kmp_int32 children = 0;
// Predecrement simulated by "- 1" calculation
children = KMP_TEST_THEN_DEC32( (kmp_int32 *)(& taskdata -> td_parent -> td_incomplete_child_tasks) ) - 1;
KMP_DEBUG_ASSERT( children >= 0 );
// Remove the imaginary child
TCR_4(taskdata->td_incomplete_child_tasks--);
}
static void __kmp_bottom_half_finish_proxy( kmp_int32 gtid, kmp_task_t * ptask )
{
kmp_taskdata_t * taskdata = KMP_TASK_TO_TASKDATA(ptask);
kmp_info_t * thread = __kmp_threads[ gtid ];
KMP_DEBUG_ASSERT( taskdata -> td_flags.proxy == TASK_PROXY );
KMP_DEBUG_ASSERT( taskdata -> td_flags.complete == 1 ); // top half must run before bottom half
// We need to wait to make sure the top half is finished
// Spinning here should be ok as this should happen quickly
while ( TCR_4(taskdata->td_incomplete_child_tasks) > 0 ) ;
__kmp_release_deps(gtid,taskdata);
__kmp_free_task_and_ancestors(gtid, taskdata, thread);
}
/*!
@ingroup TASKING
@param gtid Global Thread ID of encountering thread
@param ptask Task whose execution is completed
Execute the completion of a proxy task from a thread that is part of the team. Runs both top halves and the bottom half directly.
*/
void __kmpc_proxy_task_completed( kmp_int32 gtid, kmp_task_t *ptask )
{
KMP_DEBUG_ASSERT( ptask != NULL );
kmp_taskdata_t * taskdata = KMP_TASK_TO_TASKDATA(ptask);
KA_TRACE(10, ("__kmp_proxy_task_completed(enter): T#%d proxy task %p completing\n", gtid, taskdata ) );
KMP_DEBUG_ASSERT( taskdata->td_flags.proxy == TASK_PROXY );
__kmp_first_top_half_finish_proxy(taskdata);
__kmp_second_top_half_finish_proxy(taskdata);
__kmp_bottom_half_finish_proxy(gtid,ptask);
KA_TRACE(10, ("__kmp_proxy_task_completed(exit): T#%d proxy task %p completing\n", gtid, taskdata ) );
}
/*!
@ingroup TASKING
@param ptask Task whose execution is completed
Execute the completion of a proxy task from a thread that does not belong to the team.
*/
void __kmpc_proxy_task_completed_ooo ( kmp_task_t *ptask )
{
KMP_DEBUG_ASSERT( ptask != NULL );
kmp_taskdata_t * taskdata = KMP_TASK_TO_TASKDATA(ptask);
KA_TRACE(10, ("__kmp_proxy_task_completed_ooo(enter): proxy task completing ooo %p\n", taskdata ) );
KMP_DEBUG_ASSERT( taskdata->td_flags.proxy == TASK_PROXY );
__kmp_first_top_half_finish_proxy(taskdata);
// Enqueue the task so that the bottom half can be completed from a thread within the corresponding team
kmp_team_t * team = taskdata->td_team;
kmp_int32 nthreads = team->t.t_nproc;
kmp_info_t *thread;
kmp_int32 k = 0;
do {
// This should be similar to k = __kmp_get_random( thread ) % nthreads, but we cannot use __kmp_get_random here
// For now we're just linearly trying to find a thread
k = (k+1) % nthreads;
thread = team->t.t_threads[k];
} while ( !__kmp_give_task( thread, k, ptask ) );
__kmp_second_top_half_finish_proxy(taskdata);
KA_TRACE(10, ("__kmp_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n", taskdata ) );
}
#endif
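Putting the pieces together, the expected caller of the two new entry points is an offload or device runtime. Below is a sketch of that flow; dev_launch_async and device_work are hypothetical device-runtime names, the 0x10 flag value follows the kmp_tasking_flags bit layout sketched earlier, and only the __kmpc_* declarations correspond to entry points touched by this patch:

#include <stddef.h>
#include <stdint.h>

typedef int32_t kmp_int32;
typedef struct ident ident_t;            /* opaque source-location struct      */
typedef struct kmp_task kmp_task_t;      /* opaque here; real layout in kmp.h  */
typedef kmp_int32 (*kmp_routine_entry_t)(kmp_int32, void *);

/* Entry points involved (declarations abridged from kmp.h). */
extern kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc, kmp_int32 gtid,
                                         kmp_int32 flags, size_t sizeof_task,
                                         size_t sizeof_shareds,
                                         kmp_routine_entry_t entry);
extern kmp_int32 __kmpc_omp_task(ident_t *loc, kmp_int32 gtid, kmp_task_t *t);
extern void __kmpc_proxy_task_completed_ooo(kmp_task_t *ptask);

/* Hypothetical device runtime: starts work() and later invokes done(arg) on
 * an arbitrary, non-OpenMP callback thread. */
extern void dev_launch_async(void (*work)(void), void (*done)(void *), void *arg);
extern void device_work(void);

static void on_device_done(void *ptask)  /* runs on the device callback thread */
{
    /* The _ooo variant, since the caller does not belong to the team. */
    __kmpc_proxy_task_completed_ooo((kmp_task_t *)ptask);
}

/* Proxy task entry: __kmpc_omp_task invokes it inline (proxy tasks are never
 * deferred, see __kmp_omp_task above). It only starts the device operation;
 * the task stays incomplete until on_device_done fires. */
static kmp_int32 proxy_entry(kmp_int32 gtid, void *ptask)
{
    dev_launch_async(device_work, on_device_done, ptask);
    return 0;
}

/* task_size would normally be sizeof(kmp_task_t) plus private data, a value
 * the compiler knows; it is a parameter here because kmp_task_t is opaque. */
void offload_nowait(ident_t *loc, kmp_int32 gtid, size_t task_size)
{
    kmp_task_t *pt = __kmpc_omp_task_alloc(loc, gtid, 0x10 /* proxy bit */,
                                           task_size, 0, proxy_entry);
    __kmpc_omp_task(loc, gtid, pt);      /* returns without waiting            */
}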