Add new OpenMP 4.5 taskloop construct feature

From the standard: The taskloop construct specifies that the iterations of one
or more associated loops will be executed in parallel using OpenMP tasks. The
iterations are distributed across tasks created by the construct and scheduled
to be executed.
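
For illustration only (not part of this patch), here is a minimal user-level sketch of what the construct looks like at the source level; process() and the grainsize value are placeholders, and the compiler is expected to outline the loop body and lower the pragma into a call to __kmpc_taskloop():

    #include <stdio.h>

    static void process(int i) { printf("%d\n", i); }  // placeholder loop body

    int main(void) {
        #pragma omp parallel
        #pragma omp single
        #pragma omp taskloop grainsize(10)
        for (int i = 0; i < 100; ++i)
            process(i);                                // each task executes a chunk of ~10 iterations
        return 0;
    }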

This initial implementation uses a simple linear task distribution algorithm. Later
we can add other algorithms to speed up the generation of a huge number of tasks
(e.g., tree-like task generation should be faster).
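
As a rough sketch of the linear split (the arithmetic mirrors the grainsize handling in
__kmp_taskloop_linear below; the sample numbers are made up), a trip count tc is divided
into tasks of roughly equal size and the remainder iterations are spread one per task
over the first tasks:

    #include <stdio.h>

    // With tc = 103 and g = 10 this prints num_tasks=10 grainsize=10 extras=3,
    // i.e. three tasks of 11 iterations followed by seven tasks of 10 iterations.
    int main(void) {
        unsigned long long tc = 103, g = 10;           // sample values only
        unsigned long long num_tasks = tc / g;
        unsigned long long grainsize = tc / num_tasks; // rebalanced chunk size
        unsigned long long extras    = tc % num_tasks; // tasks that get one extra iteration
        printf("num_tasks=%llu grainsize=%llu extras=%llu\n", num_tasks, grainsize, extras);
        return 0;   // invariant: tc == num_tasks * grainsize + extras
    }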

This needs to be put into the OpenMP runtime library in order for the
compiler team to develop the compiler side of the implementation.

Differential Revision: http://reviews.llvm.org/D17404

llvm-svn: 262535
Jonathan Peyton 2016-03-02 22:47:51 +00:00
parent abd1f57453
commit 283a215c7a
4 changed files with 391 additions and 6 deletions


@@ -393,6 +393,7 @@ kmpc_set_defaults 224
 __kmpc_doacross_wait 262
 __kmpc_doacross_post 263
 __kmpc_doacross_fini 264
+__kmpc_taskloop 266
 %endif
 %endif


@@ -2205,11 +2205,7 @@ struct kmp_taskdata { /* aligned during dynamic
 #endif
 #if OMP_41_ENABLED
 kmp_task_team_t * td_task_team;
+kmp_int32 td_size_alloc; // The size of task structure, including shareds etc.
 #endif
-#if KMP_HAVE_QUAD
-_Quad td_dummy; // Align structure 16-byte size since allocated just before kmp_task_t
-#else
-kmp_uint32 td_dummy[2];
-#endif
 }; // struct kmp_taskdata
@@ -3478,7 +3474,9 @@ KMP_EXPORT int __kmp_get_cancellation_status(int cancel_kind);
 KMP_EXPORT void __kmpc_proxy_task_completed( kmp_int32 gtid, kmp_task_t *ptask );
 KMP_EXPORT void __kmpc_proxy_task_completed_ooo ( kmp_task_t *ptask );
+KMP_EXPORT void __kmpc_taskloop(ident_t *loc, kmp_int32 gtid, kmp_task_t *task, kmp_int32 if_val,
+                kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
+                kmp_int32 nogroup, kmp_int32 sched, kmp_uint64 grainsize, void * task_dup );
 #endif
 #endif


@@ -1000,6 +1000,7 @@ __kmp_task_alloc( ident_t *loc_ref, kmp_int32 gtid, kmp_tasking_flags_t *flags,
 #if OMP_41_ENABLED
 taskdata->td_flags.proxy = flags->proxy;
 taskdata->td_task_team = thread->th.th_task_team;
+taskdata->td_size_alloc = shareds_offset + sizeof_shareds;
 #endif
 taskdata->td_flags.tasktype = TASK_EXPLICIT;
@@ -2877,4 +2878,231 @@ void __kmpc_proxy_task_completed_ooo ( kmp_task_t *ptask )
KA_TRACE(10, ("__kmp_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n", taskdata ) );
}
//---------------------------------------------------------------------------------
// __kmp_task_dup_alloc: Allocate the taskdata and make a copy of source task for taskloop
//
// thread: allocating thread
// task_src: pointer to source task to be duplicated
// returns: a pointer to the allocated kmp_task_t structure (task).
kmp_task_t *
__kmp_task_dup_alloc( kmp_info_t *thread, kmp_task_t *task_src )
{
kmp_task_t *task;
kmp_taskdata_t *taskdata;
kmp_taskdata_t *taskdata_src;
kmp_taskdata_t *parent_task = thread->th.th_current_task;
size_t shareds_offset;
size_t task_size;
KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread, task_src) );
taskdata_src = KMP_TASK_TO_TASKDATA( task_src );
KMP_DEBUG_ASSERT( taskdata_src->td_flags.proxy == TASK_FULL ); // it should not be proxy task
KMP_DEBUG_ASSERT( taskdata_src->td_flags.tasktype == TASK_EXPLICIT );
task_size = taskdata_src->td_size_alloc;
// Allocate a kmp_taskdata_t block and a kmp_task_t block.
KA_TRACE(30, ("__kmp_task_dup_alloc: Th %p, malloc size %ld\n", thread, task_size) );
#if USE_FAST_MEMORY
taskdata = (kmp_taskdata_t *)__kmp_fast_allocate( thread, task_size );
#else
taskdata = (kmp_taskdata_t *)__kmp_thread_malloc( thread, task_size );
#endif /* USE_FAST_MEMORY */
KMP_MEMCPY(taskdata, taskdata_src, task_size);
task = KMP_TASKDATA_TO_TASK(taskdata);
// Initialize new task (only specific fields not affected by memcpy)
taskdata->td_task_id = KMP_GEN_TASK_ID();
if( task->shareds != NULL ) { // need setup shareds pointer
shareds_offset = (char*)task_src->shareds - (char*)taskdata_src;
task->shareds = &((char*)taskdata)[shareds_offset];
KMP_DEBUG_ASSERT( (((kmp_uintptr_t)task->shareds) & (sizeof(void*)-1)) == 0 );
}
taskdata->td_alloc_thread = thread;
taskdata->td_taskgroup = parent_task->td_taskgroup; // task inherits the taskgroup from the parent task
// Only need to keep track of child task counts if team parallel and tasking not serialized
if ( !( taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser ) ) {
KMP_TEST_THEN_INC32( (kmp_int32 *)(& parent_task->td_incomplete_child_tasks) );
if ( parent_task->td_taskgroup )
KMP_TEST_THEN_INC32( (kmp_int32 *)(& parent_task->td_taskgroup->count) );
// Only need to keep track of allocated child tasks for explicit tasks since implicit not deallocated
if ( taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT )
KMP_TEST_THEN_INC32( (kmp_int32 *)(& taskdata->td_parent->td_allocated_child_tasks) );
}
KA_TRACE(20, ("__kmp_task_dup_alloc(exit): Th %p, created task %p, parent=%p\n",
thread, taskdata, taskdata->td_parent) );
#if OMPT_SUPPORT
__kmp_task_init_ompt(taskdata, thread->th.th_info.ds.ds_gtid, (void*)task->routine);
#endif
return task;
}
// Routine optionally generated by the compiler for setting the lastprivate flag
// and calling needed constructors for private/firstprivate objects
// (used to form taskloop tasks from pattern task)
typedef void(*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);
//---------------------------------------------------------------------------------
// __kmp_taskloop_linear: Start tasks of the taskloop linearly
//
// loc Source location information
// gtid Global thread ID
// task Task with whole loop iteration range
// lb Pointer to loop lower bound
// ub Pointer to loop upper bound
// st Loop stride
// sched Schedule specified 0/1/2 for none/grainsize/num_tasks
// grainsize Schedule value if specified
// task_dup Tasks duplication routine
void
__kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
int sched, kmp_uint64 grainsize, void *task_dup )
{
p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
kmp_uint64 tc;
kmp_uint64 lower = *lb; // compiler provides global bounds here
kmp_uint64 upper = *ub;
kmp_uint64 i, num_tasks, extras;
kmp_info_t *thread = __kmp_threads[gtid];
kmp_taskdata_t *current_task = thread->th.th_current_task;
kmp_task_t *next_task;
kmp_int32 lastpriv = 0;
size_t lower_offset = (char*)lb - (char*)task; // remember offset of lb in the task structure
size_t upper_offset = (char*)ub - (char*)task; // remember offset of ub in the task structure
// compute trip count
if ( st == 1 ) { // most common case
tc = upper - lower + 1;
} else if ( st < 0 ) {
tc = (lower - upper) / (-st) + 1;
} else { // st > 0
tc = (upper - lower) / st + 1;
}
if(tc == 0) {
// free the pattern task and exit
__kmp_task_start( gtid, task, current_task );
// do not execute anything for zero-trip loop
__kmp_task_finish( gtid, task, current_task );
return;
}
// compute num_tasks/grainsize based on the input provided
switch( sched ) {
case 0: // no schedule clause specified, we can choose the default
// let's try to schedule (team_size*10) tasks
grainsize = thread->th.th_team_nproc * 10;
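// Note: no break here; fall through to case 2 so the value above is treated as num_tasks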
case 2: // num_tasks provided
if( grainsize > tc ) {
num_tasks = tc; // too big num_tasks requested, adjust values
grainsize = 1;
extras = 0;
} else {
num_tasks = grainsize;
grainsize = tc / num_tasks;
extras = tc % num_tasks;
}
break;
case 1: // grainsize provided
if( grainsize > tc ) {
num_tasks = 1; // too big grainsize requested, adjust values
grainsize = tc;
extras = 0;
} else {
num_tasks = tc / grainsize;
grainsize = tc / num_tasks; // adjust grainsize for balanced distribution of iterations
extras = tc % num_tasks;
}
break;
default:
KMP_ASSERT2(0, "unknown scheduling of taskloop");
}
KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
KMP_DEBUG_ASSERT(num_tasks > extras);
KMP_DEBUG_ASSERT(num_tasks > 0);
// Main loop, launch num_tasks tasks, assign grainsize iterations each task
for( i = 0; i < num_tasks; ++i ) {
kmp_uint64 chunk_minus_1;
if( extras == 0 ) {
chunk_minus_1 = grainsize - 1;
} else {
chunk_minus_1 = grainsize;
--extras; // first extras iterations get bigger chunk (grainsize+1)
}
upper = lower + st * chunk_minus_1;
if( i == num_tasks - 1 ) {
// schedule the last task, set lastprivate flag
lastpriv = 1;
#if KMP_DEBUG
if( st == 1 )
KMP_DEBUG_ASSERT(upper == *ub);
else if( st > 0 )
KMP_DEBUG_ASSERT(upper+st > *ub);
else
KMP_DEBUG_ASSERT(upper+st < *ub);
#endif
}
next_task = __kmp_task_dup_alloc(thread, task); // allocate new task
*(kmp_uint64*)((char*)next_task + lower_offset) = lower; // adjust task-specific bounds
*(kmp_uint64*)((char*)next_task + upper_offset) = upper;
if( ptask_dup != NULL )
ptask_dup(next_task, task, lastpriv); // set lastprivate flag, construct firstprivates, etc.
__kmp_omp_task(gtid, next_task, true); // schedule new task
lower = upper + st; // adjust lower bound for the next iteration
}
// free the pattern task and exit
__kmp_task_start( gtid, task, current_task );
// do not execute the pattern task, just do bookkeeping
__kmp_task_finish( gtid, task, current_task );
}
/*!
@ingroup TASKING
@param loc Source location information
@param gtid Global thread ID
@param task Task structure
@param if_val Value of the if clause
@param lb Pointer to loop lower bound
@param ub Pointer to loop upper bound
@param st Loop stride
@param nogroup Flag, 1 if nogroup clause specified, 0 otherwise
@param sched Schedule specified 0/1/2 for none/grainsize/num_tasks
@param grainsize Schedule value if specified
@param task_dup Tasks duplication routine
Execute the taskloop construct.
*/
void
__kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
int nogroup, int sched, kmp_uint64 grainsize, void *task_dup )
{
kmp_taskdata_t * taskdata = KMP_TASK_TO_TASKDATA(task);
KMP_DEBUG_ASSERT( task != NULL );
KA_TRACE(10, ("__kmpc_taskloop(enter): T#%d, pattern task %p, lb %lld ub %lld st %lld, grain %llu(%d)\n",
gtid, taskdata, *lb, *ub, st, grainsize, sched));
// check if clause value first
if( if_val == 0 ) { // if(0) specified, mark task as serial
taskdata->td_flags.task_serial = 1;
taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied
}
if( nogroup == 0 ) {
__kmpc_taskgroup( loc, gtid );
}
if( 1 /* AC: use some heuristic here to choose task scheduling method */ ) {
__kmp_taskloop_linear( loc, gtid, task, lb, ub, st, sched, grainsize, task_dup );
}
if( nogroup == 0 ) {
__kmpc_end_taskgroup( loc, gtid );
}
KA_TRACE(10, ("__kmpc_taskloop(exit): T#%d\n", gtid));
}
#endif


@@ -0,0 +1,158 @@
// RUN: %libomp-compile-and-run
#include <stdio.h>
#include <omp.h>
#include "omp_my_sleep.h"
#define N 4
#define GRAIN 10
#define STRIDE 3
// globals
int th_counter[N];
int counter;
// Compiler-generated code (emulation)
typedef struct ident {
void* dummy;
} ident_t;
typedef struct shar {
int(*pth_counter)[N];
int *pcounter;
int *pj;
} *pshareds;
typedef struct task {
pshareds shareds;
int(* routine)(int,struct task*);
int part_id;
// privates:
unsigned long long lb; // library always uses ULONG
unsigned long long ub;
int st;
int last;
int i;
int j;
int th;
} *ptask, kmp_task_t;
typedef int(* task_entry_t)( int, ptask );
void
__task_dup_entry(ptask task_dst, ptask task_src, int lastpriv)
{
// setup lastprivate flag
task_dst->last = lastpriv;
// could be constructor calls here...
}
// OpenMP RTL interfaces
typedef unsigned long long kmp_uint64;
typedef long long kmp_int64;
#ifdef __cplusplus
extern "C" {
#endif
void
__kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
int nogroup, int sched, kmp_int64 grainsize, void *task_dup );
ptask
__kmpc_omp_task_alloc( ident_t *loc, int gtid, int flags,
size_t sizeof_kmp_task_t, size_t sizeof_shareds,
task_entry_t task_entry );
void __kmpc_atomic_fixed4_add(void *id_ref, int gtid, int * lhs, int rhs);
int __kmpc_global_thread_num(void *id_ref);
#ifdef __cplusplus
}
#endif
// User's code
int task_entry(int gtid, ptask task)
{
pshareds pshar = task->shareds;
for( task->i = task->lb; task->i <= (int)task->ub; task->i += task->st ) {
task->th = omp_get_thread_num();
__kmpc_atomic_fixed4_add(NULL,gtid,pshar->pcounter,1);
__kmpc_atomic_fixed4_add(NULL,gtid,&((*pshar->pth_counter)[task->th]),1);
task->j = task->i;
}
my_sleep( 0.1 ); // sleep 100 ms in order to allow other threads to steal tasks
if( task->last ) {
*(pshar->pj) = task->j; // lastprivate
}
return 0;
}
int main()
{
int i, j, gtid = __kmpc_global_thread_num(NULL);
ptask task;
pshareds psh;
omp_set_dynamic(0);
counter = 0;
for( i=0; i<N; ++i )
th_counter[i] = 0;
#pragma omp parallel num_threads(N)
{
#pragma omp master
{
int gtid = __kmpc_global_thread_num(NULL);
/*
* This is what the OpenMP runtime calls correspond to:
#pragma omp taskloop num_tasks(N) lastprivate(j)
for( i=0; i<N*GRAIN*STRIDE-1; i+=STRIDE )
{
int th = omp_get_thread_num();
#pragma omp atomic
counter++;
#pragma omp atomic
th_counter[th]++;
j = i;
}
*/
task = __kmpc_omp_task_alloc(NULL,gtid,1,sizeof(struct task),sizeof(struct shar),&task_entry);
psh = task->shareds;
psh->pth_counter = &th_counter;
psh->pcounter = &counter;
psh->pj = &j;
task->lb = 0;
task->ub = N*GRAIN*STRIDE-2;
task->st = STRIDE;
__kmpc_taskloop(
NULL, // location
gtid, // gtid
task, // task structure
1, // if clause value
&task->lb, // lower bound
&task->ub, // upper bound
STRIDE, // loop increment
0, // 1 if nogroup specified
2, // schedule type: 0-none, 1-grainsize, 2-num_tasks
N, // schedule value (ignored for type 0)
(void*)&__task_dup_entry // tasks duplication routine
);
} // end master
} // end parallel
// check results
if( j != N*GRAIN*STRIDE-STRIDE ) {
printf("Error in lastprivate, %d != %d\n",j,N*GRAIN*STRIDE-STRIDE);
return 1;
}
if( counter != N*GRAIN ) {
printf("Error, counter %d != %d\n",counter,N*GRAIN);
return 1;
}
for( i=0; i<N; ++i ) {
if( th_counter[i] % GRAIN ) {
printf("Error, th_counter[%d] = %d\n",i,th_counter[i]);
return 1;
}
}
printf("passed\n");
return 0;
}