Avoid deadlock with COI
When an asynchronous offload task completes, COI calls back into the runtime to queue a "destructor task". When the task deques are full, a deadlock arises: the OpenMP worker threads are inside the runtime but cannot make progress, while the COI thread is stuck inside the runtime trying to find a free slot in a deque. This patch implements a solution in which a task deque is doubled in size when a task is queued into it from a COI thread.

Differential Revision: http://reviews.llvm.org/D20733

llvm-svn: 271319
parent 067325f935
commit f4f969569d
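Before reading the diff, the core idea can be seen in isolation. Below is a minimal, stand-alone C sketch of the pattern the patch applies, not the runtime's actual code: a fixed-capacity ring buffer can deadlock an external producer when it is full, so the producer (standing in for a COI completion callback) doubles the buffer instead of waiting for a consumer to free a slot. All toy_* names and the sizes are invented for illustration, and the locking the real runtime performs around these operations (td_deque_lock) is omitted.

#include <stdlib.h>

/* Toy ring-buffer deque; size is kept a power of two so (size - 1) is a mask. */
typedef struct {
    void   **slots;
    unsigned size;     /* current capacity */
    unsigned head;     /* pop end  (wraps) */
    unsigned tail;     /* push end (wraps) */
    unsigned ntasks;   /* number of queued items */
} toy_deque_t;

static void toy_deque_init(toy_deque_t *d, unsigned initial_size) {
    d->slots = calloc(initial_size, sizeof(void *));
    d->size  = initial_size;
    d->head  = d->tail = d->ntasks = 0;
}

/* Double the capacity, copying the live entries starting at head. */
static void toy_deque_grow(toy_deque_t *d) {
    unsigned new_size = 2 * d->size;
    void   **new_slots = calloc(new_size, sizeof(void *));
    for (unsigned i = d->head, j = 0; j < d->ntasks; i = (i + 1) & (d->size - 1), j++)
        new_slots[j] = d->slots[i];
    free(d->slots);
    d->slots = new_slots;
    d->head  = 0;
    d->tail  = d->ntasks;   /* next free slot after the copied entries */
    d->size  = new_size;
}

/* Push from a thread that must never block: grow instead of waiting. */
static void toy_deque_push_external(toy_deque_t *d, void *task) {
    if (d->ntasks >= d->size)
        toy_deque_grow(d);
    d->slots[d->tail] = task;
    d->tail = (d->tail + 1) & (d->size - 1);   /* wrap index */
    d->ntasks++;
}

int main(void) {
    toy_deque_t d;
    int tasks[10];
    toy_deque_init(&d, 4);
    for (int t = 0; t < 10; t++)               /* forces the deque to double twice */
        toy_deque_push_external(&d, &tasks[t]);
    free(d.slots);
    return 0;
}

In the patch itself the same pattern appears as the new __kmp_realloc_task_deque function plus the TASK_DEQUE_SIZE(td)/TASK_DEQUE_MASK(td) macros, and only __kmp_give_task (the path used for proxy/COI tasks) triggers the growth; the ordinary __kmp_push_task path still refuses the task when the deque is full.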
@@ -35,10 +35,6 @@
 #define TASK_CURRENT_NOT_QUEUED  0
 #define TASK_CURRENT_QUEUED      1
 
-#define TASK_DEQUE_BITS          8  // Used solely to define TASK_DEQUE_SIZE and TASK_DEQUE_MASK.
-#define TASK_DEQUE_SIZE          ( 1 << TASK_DEQUE_BITS )
-#define TASK_DEQUE_MASK          ( TASK_DEQUE_SIZE - 1 )
-
 #ifdef BUILD_TIED_TASK_STACK
 #define TASK_STACK_EMPTY         0  // entries when the stack is empty
 
@@ -2223,6 +2219,7 @@ typedef struct kmp_base_thread_data {
     // Used only in __kmp_execute_tasks_template, maybe not avail until task is queued?
     kmp_bootstrap_lock_t    td_deque_lock;    // Lock for accessing deque
     kmp_taskdata_t **       td_deque;         // Deque of tasks encountered by td_thr, dynamically allocated
+    kmp_int32               td_deque_size;    // Size of deck
     kmp_uint32              td_deque_head;    // Head of deque (will wrap)
     kmp_uint32              td_deque_tail;    // Tail of deque (will wrap)
     kmp_int32               td_deque_ntasks;  // Number of tasks in deque
@@ -2233,6 +2230,12 @@ typedef struct kmp_base_thread_data {
 #endif  // BUILD_TIED_TASK_STACK
 } kmp_base_thread_data_t;
 
+#define TASK_DEQUE_BITS          8  // Used solely to define INITIAL_TASK_DEQUE_SIZE
+#define INITIAL_TASK_DEQUE_SIZE  ( 1 << TASK_DEQUE_BITS )
+
+#define TASK_DEQUE_SIZE(td)      ((td).td_deque_size)
+#define TASK_DEQUE_MASK(td)      ((td).td_deque_size - 1)
+
 typedef union KMP_ALIGN_CACHE kmp_thread_data {
     kmp_base_thread_data_t  td;
     double                  td_align;        /* use worst case alignment */
@@ -93,7 +93,7 @@ __kmp_omp_debug_struct_info = {
     sizeof( void * ),
     OMP_LOCK_T_SIZE < sizeof(void *),
     bs_last_barrier,
-    TASK_DEQUE_SIZE,
+    INITIAL_TASK_DEQUE_SIZE,
 
     // thread structure information
     sizeof( kmp_base_info_t ),
@@ -222,6 +222,7 @@ __kmp_omp_debug_struct_info = {
     // thread_data_t.
     sizeof( kmp_thread_data_t ),
     offset_and_size_of( kmp_base_thread_data_t, td_deque ),
+    offset_and_size_of( kmp_base_thread_data_t, td_deque_size ),
     offset_and_size_of( kmp_base_thread_data_t, td_deque_head ),
     offset_and_size_of( kmp_base_thread_data_t, td_deque_tail ),
     offset_and_size_of( kmp_base_thread_data_t, td_deque_ntasks ),
@@ -218,6 +218,7 @@ typedef struct {
     /* kmp_thread_data_t */
     kmp_int32           hd_sizeof_struct;
    offset_and_size_t    hd_deque;
+   offset_and_size_t    hd_deque_size;
    offset_and_size_t    hd_deque_head;
    offset_and_size_t    hd_deque_tail;
    offset_and_size_t    hd_deque_ntasks;
@@ -305,7 +305,7 @@ __kmp_push_task(kmp_int32 gtid, kmp_task_t * task )
     }
 
     // Check if deque is full
-    if ( TCR_4(thread_data -> td.td_deque_ntasks) >= TASK_DEQUE_SIZE )
+    if ( TCR_4(thread_data -> td.td_deque_ntasks) >= TASK_DEQUE_SIZE(thread_data->td) )
     {
         KA_TRACE(20, ( "__kmp_push_task: T#%d deque is full; returning TASK_NOT_PUSHED for task %p\n",
                        gtid, taskdata ) );
@@ -317,7 +317,7 @@ __kmp_push_task(kmp_int32 gtid, kmp_task_t * task )
 
 #if OMP_41_ENABLED
     // Need to recheck as we can get a proxy task from a thread outside of OpenMP
-    if ( TCR_4(thread_data -> td.td_deque_ntasks) >= TASK_DEQUE_SIZE )
+    if ( TCR_4(thread_data -> td.td_deque_ntasks) >= TASK_DEQUE_SIZE(thread_data->td) )
     {
         __kmp_release_bootstrap_lock( & thread_data -> td.td_deque_lock );
         KA_TRACE(20, ( "__kmp_push_task: T#%d deque is full on 2nd check; returning TASK_NOT_PUSHED for task %p\n",
@@ -326,12 +326,12 @@ __kmp_push_task(kmp_int32 gtid, kmp_task_t * task )
     }
 #else
     // Must have room since no thread can add tasks but calling thread
-    KMP_DEBUG_ASSERT( TCR_4(thread_data -> td.td_deque_ntasks) < TASK_DEQUE_SIZE );
+    KMP_DEBUG_ASSERT( TCR_4(thread_data -> td.td_deque_ntasks) < TASK_DEQUE_SIZE(thread_data->td) );
 #endif
 
     thread_data -> td.td_deque[ thread_data -> td.td_deque_tail ] = taskdata;  // Push taskdata
     // Wrap index.
-    thread_data -> td.td_deque_tail = ( thread_data -> td.td_deque_tail + 1 ) & TASK_DEQUE_MASK;
+    thread_data -> td.td_deque_tail = ( thread_data -> td.td_deque_tail + 1 ) & TASK_DEQUE_MASK(thread_data->td);
     TCW_4(thread_data -> td.td_deque_ntasks, TCR_4(thread_data -> td.td_deque_ntasks) + 1);  // Adjust task count
 
     __kmp_release_bootstrap_lock( & thread_data -> td.td_deque_lock );
@@ -1641,7 +1641,7 @@ __kmp_remove_my_task( kmp_info_t * thread, kmp_int32 gtid, kmp_task_team_t *task
         return NULL;
     }
 
-    tail = ( thread_data -> td.td_deque_tail - 1 ) & TASK_DEQUE_MASK;  // Wrap index.
+    tail = ( thread_data -> td.td_deque_tail - 1 ) & TASK_DEQUE_MASK(thread_data->td);  // Wrap index.
     taskdata = thread_data -> td.td_deque[ tail ];
 
     if (is_constrained) {
@@ -1735,10 +1735,10 @@ __kmp_steal_task( kmp_info_t *victim, kmp_int32 gtid, kmp_task_team_t *task_team
     if ( !is_constrained ) {
         taskdata = victim_td -> td.td_deque[ victim_td -> td.td_deque_head ];
         // Bump head pointer and Wrap.
-        victim_td -> td.td_deque_head = ( victim_td -> td.td_deque_head + 1 ) & TASK_DEQUE_MASK;
+        victim_td -> td.td_deque_head = ( victim_td -> td.td_deque_head + 1 ) & TASK_DEQUE_MASK(victim_td->td);
     } else {
         // While we have postponed tasks let's steal from tail of the deque (smaller tasks)
-        kmp_int32 tail = ( victim_td -> td.td_deque_tail - 1 ) & TASK_DEQUE_MASK;  // Wrap index.
+        kmp_int32 tail = ( victim_td -> td.td_deque_tail - 1 ) & TASK_DEQUE_MASK(victim_td->td);  // Wrap index.
         taskdata = victim_td -> td.td_deque[ tail ];
         // we need to check if the candidate obeys task scheduling constraint:
         // only child of current task can be scheduled
@@ -2267,14 +2267,42 @@ __kmp_alloc_task_deque( kmp_info_t *thread, kmp_thread_data_t *thread_data )
     KMP_DEBUG_ASSERT( thread_data -> td.td_deque_tail == 0 );
 
     KE_TRACE( 10, ( "__kmp_alloc_task_deque: T#%d allocating deque[%d] for thread_data %p\n",
-                    __kmp_gtid_from_thread( thread ), TASK_DEQUE_SIZE, thread_data ) );
+                    __kmp_gtid_from_thread( thread ), INITIAL_TASK_DEQUE_SIZE, thread_data ) );
     // Allocate space for task deque, and zero the deque
     // Cannot use __kmp_thread_calloc() because threads not around for
     // kmp_reap_task_team( ).
     thread_data -> td.td_deque = (kmp_taskdata_t **)
-            __kmp_allocate( TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
+            __kmp_allocate( INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
+    thread_data -> td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
 }
 
+//------------------------------------------------------------------------------
+// __kmp_realloc_task_deque:
+// Re-allocates a task deque for a particular thread, copies the content from the old deque
+// and adjusts the necessary data structures relating to the deque.
+// This operation must be done with the deque_lock being held
+
+static void __kmp_realloc_task_deque ( kmp_info_t *thread, kmp_thread_data_t *thread_data )
+{
+    kmp_int32 size = TASK_DEQUE_SIZE(thread_data->td);
+    kmp_int32 new_size = 2 * size;
+
+    KE_TRACE( 10, ( "__kmp_realloc_task_deque: T#%d reallocating deque[from %d to %d] for thread_data %p\n",
+                    __kmp_gtid_from_thread( thread ), size, new_size, thread_data ) );
+
+    kmp_taskdata_t ** new_deque = (kmp_taskdata_t **) __kmp_allocate( new_size * sizeof(kmp_taskdata_t *));
+
+    int i,j;
+    for ( i = thread_data->td.td_deque_head, j = 0; j < size; i = (i+1) & TASK_DEQUE_MASK(thread_data->td), j++ )
+        new_deque[j] = thread_data->td.td_deque[i];
+
+    __kmp_free(thread_data->td.td_deque);
+
+    thread_data -> td.td_deque_head = 0;
+    thread_data -> td.td_deque_tail = size;
+    thread_data -> td.td_deque = new_deque;
+    thread_data -> td.td_deque_size = new_size;
+}
+
 //------------------------------------------------------------------------------
 // __kmp_free_task_deque:
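As a concrete (hypothetical) illustration of the copy loop above: with td_deque_size == 8 and a full deque whose head and tail are both 5, the loop walks the old slots in the order 5, 6, 7, 0, 1, 2, 3, 4 and stores them into slots 0..7 of the new 16-entry array; head is then reset to 0 and tail to 8 (the old size), so the next push lands in slot 8.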
@@ -2769,7 +2797,7 @@ __kmp_tasking_barrier( kmp_team_t *team, kmp_info_t *thread, int gtid )
 
    Because of this, __kmp_push_task needs to check if there's space after getting the lock
  */
-static bool __kmp_give_task ( kmp_info_t *thread, kmp_int32 tid, kmp_task_t * task )
+static bool __kmp_give_task ( kmp_info_t *thread, kmp_int32 tid, kmp_task_t * task, kmp_int32 pass )
 {
     kmp_taskdata_t * taskdata = KMP_TASK_TO_TASKDATA(task);
     kmp_task_team_t * task_team = taskdata->td_task_team;
@@ -2789,23 +2817,37 @@ static bool __kmp_give_task ( kmp_info_t *thread, kmp_int32 tid, kmp_task_t * ta
         return result;
     }
 
-    if ( TCR_4(thread_data -> td.td_deque_ntasks) >= TASK_DEQUE_SIZE )
+    if ( TCR_4(thread_data -> td.td_deque_ntasks) >= TASK_DEQUE_SIZE(thread_data->td) )
     {
         KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to thread %d.\n", taskdata, tid ) );
-        return result;
-    }
 
-    __kmp_acquire_bootstrap_lock( & thread_data-> td.td_deque_lock );
+        // if this deque is bigger than the pass ratio give a chance to another thread
+        if ( TASK_DEQUE_SIZE(thread_data->td)/INITIAL_TASK_DEQUE_SIZE >= pass ) return result;
 
-    if ( TCR_4(thread_data -> td.td_deque_ntasks) >= TASK_DEQUE_SIZE )
-    {
-        KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to thread %d.\n", taskdata, tid ) );
-        goto release_and_exit;
+        __kmp_acquire_bootstrap_lock( & thread_data-> td.td_deque_lock );
+        __kmp_realloc_task_deque(thread,thread_data);
+
+    } else {
+
+        __kmp_acquire_bootstrap_lock( & thread_data-> td.td_deque_lock );
+
+        if ( TCR_4(thread_data -> td.td_deque_ntasks) >= TASK_DEQUE_SIZE(thread_data->td) )
+        {
+            KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to thread %d.\n", taskdata, tid ) );
+
+            // if this deque is bigger than the pass ratio give a chance to another thread
+            if ( TASK_DEQUE_SIZE(thread_data->td)/INITIAL_TASK_DEQUE_SIZE >= pass )
+                goto release_and_exit;
+
+            __kmp_realloc_task_deque(thread,thread_data);
+        }
     }
 
+    // lock is held here, and there is space in the deque
+
     thread_data -> td.td_deque[ thread_data -> td.td_deque_tail ] = taskdata;
     // Wrap index.
-    thread_data -> td.td_deque_tail = ( thread_data -> td.td_deque_tail + 1 ) & TASK_DEQUE_MASK;
+    thread_data -> td.td_deque_tail = ( thread_data -> td.td_deque_tail + 1 ) & TASK_DEQUE_MASK(thread_data->td);
     TCW_4(thread_data -> td.td_deque_ntasks, TCR_4(thread_data -> td.td_deque_ntasks) + 1);
 
     result = true;
@@ -2919,14 +2961,21 @@ void __kmpc_proxy_task_completed_ooo ( kmp_task_t *ptask )
     kmp_team_t * team = taskdata->td_team;
     kmp_int32 nthreads = team->t.t_nproc;
     kmp_info_t *thread;
-    kmp_int32 k = 0;
+
+    //This should be similar to start_k = __kmp_get_random( thread ) % nthreads but we cannot use __kmp_get_random here
+    kmp_int32 start_k = 0;
+    kmp_int32 pass = 1;
+    kmp_int32 k = start_k;
 
     do {
-        //This should be similar to k = __kmp_get_random( thread ) % nthreads but we cannot use __kmp_get_random here
         //For now we're just linearly trying to find a thread
-        k = (k+1) % nthreads;
         thread = team->t.t_threads[k];
-    } while ( !__kmp_give_task( thread, k, ptask ) );
+        k = (k+1) % nthreads;
+
+        // we did a full pass through all the threads
+        if ( k == start_k ) pass = pass << 1;
+
+    } while ( !__kmp_give_task( thread, k, ptask, pass ) );
 
     __kmp_second_top_half_finish_proxy(taskdata);
 
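The interaction between the pass counter above and the deque growth in __kmp_give_task may be easier to see in a stand-alone sketch. The following illustrative program (not runtime code; the thread count and deque sizes are invented) shows the back-off: pass doubles after every full sweep over the team, and a thread is only asked to grow its deque while its current size divided by the initial size is still below pass, so already-enlarged deques are skipped until the other threads have had a chance.

#include <stdio.h>

int main(void) {
    const int initial_size = 256;
    const int nthreads = 4;
    int deque_size[] = { 256, 512, 1024, 256 };   /* invented current deque sizes */

    int start_k = 0, pass = 1, k = start_k;
    for (int attempt = 0; attempt < 12; attempt++) {
        /* mirrors: if ( TASK_DEQUE_SIZE(td)/INITIAL_TASK_DEQUE_SIZE >= pass ) skip this thread */
        int may_grow = (deque_size[k] / initial_size) < pass;
        printf("try T#%d: size=%4d pass=%d -> %s\n",
               k, deque_size[k], pass, may_grow ? "queue or grow here" : "skip, try next thread");
        k = (k + 1) % nthreads;
        if (k == start_k)        /* completed a full pass over all threads */
            pass <<= 1;
    }
    return 0;
}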