Add new OpenMP 4.5 taskloop construct feature

From the standard: The taskloop construct specifies that the iterations of one
or more associated loops will be executed in parallel using OpenMP tasks. The
iterations are distributed across tasks created by the construct and scheduled
to be executed.

This initial implementation uses a simple linear tasks distribution algorithm.
Later we can add other algorithms to speedup generation of huge number of tasks
(i.e., tree-like tasks generation should be faster).

This needs to be put into the OpenMP runtime library in order for the
compiler team to develop the compiler side of the implementation.

Differential Revision: http://reviews.llvm.org/D17404

llvm-svn: 262535
This commit is contained in:
Jonathan Peyton 2016-03-02 22:47:51 +00:00
parent abd1f57453
commit 283a215c7a
4 changed files with 391 additions and 6 deletions

View File

@ -393,6 +393,7 @@ kmpc_set_defaults 224
__kmpc_doacross_wait 262
__kmpc_doacross_post 263
__kmpc_doacross_fini 264
__kmpc_taskloop 266
%endif
%endif

View File

@ -2205,11 +2205,7 @@ struct kmp_taskdata { /* aligned during dynamic
#endif
#if OMP_41_ENABLED
kmp_task_team_t * td_task_team;
#endif
#if KMP_HAVE_QUAD
_Quad td_dummy; // Align structure 16-byte size since allocated just before kmp_task_t
#else
kmp_uint32 td_dummy[2];
kmp_int32 td_size_alloc; // The size of task structure, including shareds etc.
#endif
}; // struct kmp_taskdata
@ -3478,7 +3474,9 @@ KMP_EXPORT int __kmp_get_cancellation_status(int cancel_kind);
KMP_EXPORT void __kmpc_proxy_task_completed( kmp_int32 gtid, kmp_task_t *ptask );
KMP_EXPORT void __kmpc_proxy_task_completed_ooo ( kmp_task_t *ptask );
KMP_EXPORT void __kmpc_taskloop(ident_t *loc, kmp_int32 gtid, kmp_task_t *task, kmp_int32 if_val,
kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
kmp_int32 nogroup, kmp_int32 sched, kmp_uint64 grainsize, void * task_dup );
#endif
#endif

View File

@ -1000,6 +1000,7 @@ __kmp_task_alloc( ident_t *loc_ref, kmp_int32 gtid, kmp_tasking_flags_t *flags,
#if OMP_41_ENABLED
taskdata->td_flags.proxy = flags->proxy;
taskdata->td_task_team = thread->th.th_task_team;
taskdata->td_size_alloc = shareds_offset + sizeof_shareds;
#endif
taskdata->td_flags.tasktype = TASK_EXPLICIT;
@ -2877,4 +2878,231 @@ void __kmpc_proxy_task_completed_ooo ( kmp_task_t *ptask )
KA_TRACE(10, ("__kmp_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n", taskdata ) );
}
//---------------------------------------------------------------------------------
// __kmp_task_dup_alloc: Allocate the taskdata and make a copy of source task for taskloop
//
// thread: allocating thread
// task_src: pointer to source task to be duplicated
// returns: a pointer to the allocated kmp_task_t structure (task).
kmp_task_t *
__kmp_task_dup_alloc( kmp_info_t *thread, kmp_task_t *task_src )
{
kmp_task_t *task;
kmp_taskdata_t *taskdata;
kmp_taskdata_t *taskdata_src;
kmp_taskdata_t *parent_task = thread->th.th_current_task;
size_t shareds_offset;
size_t task_size;
KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread, task_src) );
taskdata_src = KMP_TASK_TO_TASKDATA( task_src );
KMP_DEBUG_ASSERT( taskdata_src->td_flags.proxy == TASK_FULL ); // it should not be proxy task
KMP_DEBUG_ASSERT( taskdata_src->td_flags.tasktype == TASK_EXPLICIT );
task_size = taskdata_src->td_size_alloc;
// Allocate a kmp_taskdata_t block and a kmp_task_t block.
KA_TRACE(30, ("__kmp_task_dup_alloc: Th %p, malloc size %ld\n", thread, task_size) );
#if USE_FAST_MEMORY
taskdata = (kmp_taskdata_t *)__kmp_fast_allocate( thread, task_size );
#else
taskdata = (kmp_taskdata_t *)__kmp_thread_malloc( thread, task_size );
#endif /* USE_FAST_MEMORY */
KMP_MEMCPY(taskdata, taskdata_src, task_size);
task = KMP_TASKDATA_TO_TASK(taskdata);
// Initialize new task (only specific fields not affected by memcpy)
taskdata->td_task_id = KMP_GEN_TASK_ID();
if( task->shareds != NULL ) { // need setup shareds pointer
shareds_offset = (char*)task_src->shareds - (char*)taskdata_src;
task->shareds = &((char*)taskdata)[shareds_offset];
KMP_DEBUG_ASSERT( (((kmp_uintptr_t)task->shareds) & (sizeof(void*)-1)) == 0 );
}
taskdata->td_alloc_thread = thread;
taskdata->td_taskgroup = parent_task->td_taskgroup; // task inherits the taskgroup from the parent task
// Only need to keep track of child task counts if team parallel and tasking not serialized
if ( !( taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser ) ) {
KMP_TEST_THEN_INC32( (kmp_int32 *)(& parent_task->td_incomplete_child_tasks) );
if ( parent_task->td_taskgroup )
KMP_TEST_THEN_INC32( (kmp_int32 *)(& parent_task->td_taskgroup->count) );
// Only need to keep track of allocated child tasks for explicit tasks since implicit not deallocated
if ( taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT )
KMP_TEST_THEN_INC32( (kmp_int32 *)(& taskdata->td_parent->td_allocated_child_tasks) );
}
KA_TRACE(20, ("__kmp_task_dup_alloc(exit): Th %p, created task %p, parent=%p\n",
thread, taskdata, taskdata->td_parent) );
#if OMPT_SUPPORT
__kmp_task_init_ompt(taskdata, thread->th.th_info.ds.ds_gtid, (void*)task->routine);
#endif
return task;
}
// Routine optionally generated by th ecompiler for setting the lastprivate flag
// and calling needed constructors for private/firstprivate objects
// (used to form taskloop tasks from pattern task)
typedef void(*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);
//---------------------------------------------------------------------------------
// __kmp_taskloop_linear: Start tasks of the taskloop linearly
//
// loc Source location information
// gtid Global thread ID
// task Task with whole loop iteration range
// lb Pointer to loop lower bound
// ub Pointer to loop upper bound
// st Loop stride
// sched Schedule specified 0/1/2 for none/grainsize/num_tasks
// grainsize Schedule value if specified
// task_dup Tasks duplication routine
void
__kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
int sched, kmp_uint64 grainsize, void *task_dup )
{
p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
kmp_uint64 tc;
kmp_uint64 lower = *lb; // compiler provides global bounds here
kmp_uint64 upper = *ub;
kmp_uint64 i, num_tasks, extras;
kmp_info_t *thread = __kmp_threads[gtid];
kmp_taskdata_t *current_task = thread->th.th_current_task;
kmp_task_t *next_task;
kmp_int32 lastpriv = 0;
size_t lower_offset = (char*)lb - (char*)task; // remember offset of lb in the task structure
size_t upper_offset = (char*)ub - (char*)task; // remember offset of ub in the task structure
// compute trip count
if ( st == 1 ) { // most common case
tc = upper - lower + 1;
} else if ( st < 0 ) {
tc = (lower - upper) / (-st) + 1;
} else { // st > 0
tc = (upper - lower) / st + 1;
}
if(tc == 0) {
// free the pattern task and exit
__kmp_task_start( gtid, task, current_task );
// do not execute anything for zero-trip loop
__kmp_task_finish( gtid, task, current_task );
return;
}
// compute num_tasks/grainsize based on the input provided
switch( sched ) {
case 0: // no schedule clause specified, we can choose the default
// let's try to schedule (team_size*10) tasks
grainsize = thread->th.th_team_nproc * 10;
case 2: // num_tasks provided
if( grainsize > tc ) {
num_tasks = tc; // too big num_tasks requested, adjust values
grainsize = 1;
extras = 0;
} else {
num_tasks = grainsize;
grainsize = tc / num_tasks;
extras = tc % num_tasks;
}
break;
case 1: // grainsize provided
if( grainsize > tc ) {
num_tasks = 1; // too big grainsize requested, adjust values
grainsize = tc;
extras = 0;
} else {
num_tasks = tc / grainsize;
grainsize = tc / num_tasks; // adjust grainsize for balanced distribution of iterations
extras = tc % num_tasks;
}
break;
default:
KMP_ASSERT2(0, "unknown scheduling of taskloop");
}
KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
KMP_DEBUG_ASSERT(num_tasks > extras);
KMP_DEBUG_ASSERT(num_tasks > 0);
// Main loop, launch num_tasks tasks, assign grainsize iterations each task
for( i = 0; i < num_tasks; ++i ) {
kmp_uint64 chunk_minus_1;
if( extras == 0 ) {
chunk_minus_1 = grainsize - 1;
} else {
chunk_minus_1 = grainsize;
--extras; // first extras iterations get bigger chunk (grainsize+1)
}
upper = lower + st * chunk_minus_1;
if( i == num_tasks - 1 ) {
// schedule the last task, set lastprivate flag
lastpriv = 1;
#if KMP_DEBUG
if( st == 1 )
KMP_DEBUG_ASSERT(upper == *ub);
else if( st > 0 )
KMP_DEBUG_ASSERT(upper+st > *ub);
else
KMP_DEBUG_ASSERT(upper+st < *ub);
#endif
}
next_task = __kmp_task_dup_alloc(thread, task); // allocate new task
*(kmp_uint64*)((char*)next_task + lower_offset) = lower; // adjust task-specific bounds
*(kmp_uint64*)((char*)next_task + upper_offset) = upper;
if( ptask_dup != NULL )
ptask_dup(next_task, task, lastpriv); // set lastprivate flag, construct fistprivates, etc.
__kmp_omp_task(gtid, next_task, true); // schedule new task
lower = upper + st; // adjust lower bound for the next iteration
}
// free the pattern task and exit
__kmp_task_start( gtid, task, current_task );
// do not execute the pattern task, just do bookkeeping
__kmp_task_finish( gtid, task, current_task );
}
/*!
@ingroup TASKING
@param loc Source location information
@param gtid Global thread ID
@param task Task structure
@param if_val Value of the if clause
@param lb Pointer to loop lower bound
@param ub Pointer to loop upper bound
@param st Loop stride
@param nogroup Flag, 1 if nogroup clause specified, 0 otherwise
@param sched Schedule specified 0/1/2 for none/grainsize/num_tasks
@param grainsize Schedule value if specified
@param task_dup Tasks duplication routine
Execute the taskloop construct.
*/
void
__kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
int nogroup, int sched, kmp_uint64 grainsize, void *task_dup )
{
kmp_taskdata_t * taskdata = KMP_TASK_TO_TASKDATA(task);
KMP_DEBUG_ASSERT( task != NULL );
KA_TRACE(10, ("__kmpc_taskloop(enter): T#%d, pattern task %p, lb %lld ub %lld st %lld, grain %llu(%d)\n",
gtid, taskdata, *lb, *ub, st, grainsize, sched));
// check if clause value first
if( if_val == 0 ) { // if(0) specified, mark task as serial
taskdata->td_flags.task_serial = 1;
taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied
}
if( nogroup == 0 ) {
__kmpc_taskgroup( loc, gtid );
}
if( 1 /* AC: use some heuristic here to choose task scheduling method */ ) {
__kmp_taskloop_linear( loc, gtid, task, lb, ub, st, sched, grainsize, task_dup );
}
if( nogroup == 0 ) {
__kmpc_end_taskgroup( loc, gtid );
}
KA_TRACE(10, ("__kmpc_taskloop(exit): T#%d\n", gtid));
}
#endif

View File

@ -0,0 +1,158 @@
// RUN: %libomp-compile-and-run
#include <stdio.h>
#include <omp.h>
#include "omp_my_sleep.h"
#define N 4
#define GRAIN 10
#define STRIDE 3
// globals
int th_counter[N];
int counter;
// Compiler-generated code (emulation)
typedef struct ident {
void* dummy;
} ident_t;
typedef struct shar {
int(*pth_counter)[N];
int *pcounter;
int *pj;
} *pshareds;
typedef struct task {
pshareds shareds;
int(* routine)(int,struct task*);
int part_id;
// privates:
unsigned long long lb; // library always uses ULONG
unsigned long long ub;
int st;
int last;
int i;
int j;
int th;
} *ptask, kmp_task_t;
typedef int(* task_entry_t)( int, ptask );
void
__task_dup_entry(ptask task_dst, ptask task_src, int lastpriv)
{
// setup lastprivate flag
task_dst->last = lastpriv;
// could be constructor calls here...
}
// OpenMP RTL interfaces
typedef unsigned long long kmp_uint64;
typedef long long kmp_int64;
#ifdef __cplusplus
extern "C" {
#endif
void
__kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
int nogroup, int sched, kmp_int64 grainsize, void *task_dup );
ptask
__kmpc_omp_task_alloc( ident_t *loc, int gtid, int flags,
size_t sizeof_kmp_task_t, size_t sizeof_shareds,
task_entry_t task_entry );
void __kmpc_atomic_fixed4_add(void *id_ref, int gtid, int * lhs, int rhs);
int __kmpc_global_thread_num(void *id_ref);
#ifdef __cplusplus
}
#endif
// User's code
int task_entry(int gtid, ptask task)
{
pshareds pshar = task->shareds;
for( task->i = task->lb; task->i <= (int)task->ub; task->i += task->st ) {
task->th = omp_get_thread_num();
__kmpc_atomic_fixed4_add(NULL,gtid,pshar->pcounter,1);
__kmpc_atomic_fixed4_add(NULL,gtid,&((*pshar->pth_counter)[task->th]),1);
task->j = task->i;
}
my_sleep( 0.1 ); // sleep 100 ms in order to allow other threads to steal tasks
if( task->last ) {
*(pshar->pj) = task->j; // lastprivate
}
return 0;
}
int main()
{
int i, j, gtid = __kmpc_global_thread_num(NULL);
ptask task;
pshareds psh;
omp_set_dynamic(0);
counter = 0;
for( i=0; i<N; ++i )
th_counter[i] = 0;
#pragma omp parallel num_threads(N)
{
#pragma omp master
{
int gtid = __kmpc_global_thread_num(NULL);
/*
* This is what the OpenMP runtime calls correspond to:
#pragma omp taskloop num_tasks(N) lastprivate(j)
for( i=0; i<N*GRAIN*STRIDE-1; i+=STRIDE )
{
int th = omp_get_thread_num();
#pragma omp atomic
counter++;
#pragma omp atomic
th_counter[th]++;
j = i;
}
*/
task = __kmpc_omp_task_alloc(NULL,gtid,1,sizeof(struct task),sizeof(struct shar),&task_entry);
psh = task->shareds;
psh->pth_counter = &th_counter;
psh->pcounter = &counter;
psh->pj = &j;
task->lb = 0;
task->ub = N*GRAIN*STRIDE-2;
task->st = STRIDE;
__kmpc_taskloop(
NULL, // location
gtid, // gtid
task, // task structure
1, // if clause value
&task->lb, // lower bound
&task->ub, // upper bound
STRIDE, // loop increment
0, // 1 if nogroup specified
2, // schedule type: 0-none, 1-grainsize, 2-num_tasks
N, // schedule value (ignored for type 0)
(void*)&__task_dup_entry // tasks duplication routine
);
} // end master
} // end parallel
// check results
if( j != N*GRAIN*STRIDE-STRIDE ) {
printf("Error in lastprivate, %d != %d\n",j,N*GRAIN*STRIDE-STRIDE);
return 1;
}
if( counter != N*GRAIN ) {
printf("Error, counter %d != %d\n",counter,N*GRAIN);
return 1;
}
for( i=0; i<N; ++i ) {
if( th_counter[i] % GRAIN ) {
printf("Error, th_counter[%d] = %d\n",i,th_counter[i]);
return 1;
}
}
printf("passed\n");
return 0;
}