// RUN: %libomp-cxx-compile-and-run
// RUN: %libomp-cxx-compile -DFLG=1 && %libomp-run

// GCC-5 is needed for OpenMP 4.0 support (taskgroup)
// XFAIL: gcc-4
|
|
#include <cstdio>
#include <cmath>
#include <cassert>
#include <cstdlib> // malloc/free used in main()
#include <omp.h>
|
|
|
|
// Total number of loop iterations, should be multiple of T for this test
#define N 10000

// Flag to request lazy (1) or eager (0) allocation of reduction objects
#ifndef FLG
#define FLG 0
#endif
|
|
|
|
/*
// initial user's code that corresponds to pseudo code of the test
#pragma omp taskgroup task_reduction(+:i,j) task_reduction(*:x)
{
  for( int l = 0; l < N; ++l ) {
    #pragma omp task firstprivate(l) in_reduction(+:i) in_reduction(*:x)
    {
      i += l;
      if( l%2 )
        x *= 1.0 / (l + 1);
      else
        x *= (l + 1);
    }
  }

  #pragma omp taskgroup task_reduction(-:i,k) task_reduction(+:y)
  {
    for( int l = 0; l < N; ++l ) {
      #pragma omp task firstprivate(l) in_reduction(+:j,y) \
          in_reduction(*:x) in_reduction(-:k)
      {
        j += l;
        k -= l;
        y += (double)l;
        if( l%2 )
          x *= 1.0 / (l + 1);
        else
          x *= (l + 1);
      }
      #pragma omp task firstprivate(l) in_reduction(+:y) in_reduction(-:i,k)
      {
        i -= l;
        k -= l;
        y += (double)l;
      }
      #pragma omp task firstprivate(l) in_reduction(+:j) in_reduction(*:x)
      {
        j += l;
        if( l%2 )
          x *= 1.0 / (l + 1);
        else
          x *= (l + 1);
      }
    }
  } // inner reduction

  for( int l = 0; l < N; ++l ) {
    #pragma omp task firstprivate(l) in_reduction(+:j)
    j += l;
  }
} // outer reduction
*/
|
|
|
|
//------------------------------------------------
// OpenMP runtime library routines
// Internal libomp (kmp) entry points, declared here so the test can emulate
// compiler-generated task-reduction code directly.
#ifdef __cplusplus
extern "C" {
#endif
// Returns the address of the calling thread's private copy of reduction
// object 'item' registered with taskgroup 'tg'; a NULL 'tg' starts the
// lookup from the current taskgroup (see the address checks in main()).
extern void* __kmpc_task_reduction_get_th_data(int gtid, void* tg, void* item);
// Registers 'num' reduction items described by the array 'data' for the
// current taskgroup; returns an opaque taskgroup handle.
extern void* __kmpc_task_reduction_init(int gtid, int num, void* data);
// Returns the global thread number (gtid) of the calling thread.
extern int __kmpc_global_thread_num(void*);
#ifdef __cplusplus
}
#endif
|
|
|
|
//------------------------------------------------
// Compiler-generated code

// Descriptor for one reduction item, in the layout consumed by
// __kmpc_task_reduction_init (one array element per reduced variable).
typedef struct _task_red_item {
    void *shar; // shared reduction item
    size_t size; // size of data item
    void *f_init; // data initialization routine (NULL: RTL zero-fills)
    void *f_fini; // data finalization routine (NULL: no destructor needed)
    void *f_comb; // data combiner routine
    unsigned flags; // lazy (1) / eager (0) allocation request — see FLG
} _task_red_item_t;
|
|
|
|
// int:+ no need in init/fini callbacks, valid for subtraction
// Combiner: accumulates the rhs private copy into the lhs copy.
void __red_int_add_comb(void *lhs, void *rhs) // combiner
{
  int *dst = (int *)lhs;
  const int *src = (const int *)rhs;
  *dst = *dst + *src;
}
|
|
|
|
// long long:+ no need in init/fini callbacks, valid for subtraction
// Combiner: accumulates the rhs private copy into the lhs copy.
void __red_llong_add_comb(void *lhs, void *rhs) // combiner
{
  long long *dst = (long long *)lhs;
  const long long *src = (const long long *)rhs;
  *dst = *dst + *src;
}
|
|
|
|
// double:* no need in fini callback
// Initializer: sets the private copy to the multiplicative identity.
void __red_dbl_mul_init(void *data) // initializer
{
  double *obj = (double *)data;
  *obj = 1.0;
}
|
|
// Combiner for double:* — folds the rhs private copy into lhs by product.
void __red_dbl_mul_comb(void *lhs, void *rhs) // combiner
{
  double *dst = (double *)lhs;
  const double *src = (const double *)rhs;
  *dst = *dst * *src;
}
|
|
|
|
// double:+ no need in init/fini callbacks
// Combiner: accumulates the rhs private copy into the lhs copy.
void __red_dbl_add_comb(void *lhs, void *rhs) // combiner
{
  double *dst = (double *)lhs;
  const double *src = (const double *)rhs;
  *dst = *dst + *src;
}
|
|
|
|
// ==============================
|
|
|
|
void calc_serial(int *pi, long long *pj, double *px, long long *pk, double *py)
|
|
{
|
|
for( int l = 0; l < N; ++l ) {
|
|
*pi += l;
|
|
if( l%2 )
|
|
*px *= 1.0 / (l + 1);
|
|
else
|
|
*px *= (l + 1);
|
|
}
|
|
for( int l = 0; l < N; ++l ) {
|
|
*pj += l;
|
|
*pk -= l;
|
|
*py += (double)l;
|
|
if( l%2 )
|
|
*px *= 1.0 / (l + 1);
|
|
else
|
|
*px *= (l + 1);
|
|
|
|
*pi -= l;
|
|
*pk -= l;
|
|
*py += (double)l;
|
|
|
|
*pj += l;
|
|
if( l%2 )
|
|
*px *= 1.0 / (l + 1);
|
|
else
|
|
*px *= (l + 1);
|
|
}
|
|
for( int l = 0; l < N; ++l ) {
|
|
*pj += l;
|
|
}
|
|
}
|
|
|
|
//------------------------------------------------
|
|
// Test case
|
|
int main()
|
|
{
|
|
int nthreads = omp_get_max_threads();
|
|
int err = 0;
|
|
void** ptrs = (void**)malloc(nthreads*sizeof(void*));
|
|
|
|
// user's code ======================================
|
|
// variables for serial calculations:
|
|
int is = 3;
|
|
long long js = -9999999;
|
|
double xs = 99999.0;
|
|
long long ks = 99999999;
|
|
double ys = -99999999.0;
|
|
// variables for parallel calculations:
|
|
int ip = 3;
|
|
long long jp = -9999999;
|
|
double xp = 99999.0;
|
|
long long kp = 99999999;
|
|
double yp = -99999999.0;
|
|
|
|
calc_serial(&is, &js, &xs, &ks, &ys);
|
|
// ==================================================
|
|
for (int i = 0; i < nthreads; ++i)
|
|
ptrs[i] = NULL;
|
|
#pragma omp parallel
|
|
{
|
|
#pragma omp single nowait
|
|
{
|
|
// outer taskgroup reduces (i,j,x)
|
|
#pragma omp taskgroup // task_reduction(+:i,j) task_reduction(*:x)
|
|
{
|
|
_task_red_item_t red_data[3];
|
|
red_data[0].shar = &ip;
|
|
red_data[0].size = sizeof(ip);
|
|
red_data[0].f_init = NULL; // RTL will zero thread-specific objects
|
|
red_data[0].f_fini = NULL; // no destructors needed
|
|
red_data[0].f_comb = (void*)&__red_int_add_comb;
|
|
red_data[0].flags = FLG;
|
|
red_data[1].shar = &jp;
|
|
red_data[1].size = sizeof(jp);
|
|
red_data[1].f_init = NULL; // RTL will zero thread-specific objects
|
|
red_data[1].f_fini = NULL; // no destructors needed
|
|
red_data[1].f_comb = (void*)&__red_llong_add_comb;
|
|
red_data[1].flags = FLG;
|
|
red_data[2].shar = &xp;
|
|
red_data[2].size = sizeof(xp);
|
|
red_data[2].f_init = (void*)&__red_dbl_mul_init;
|
|
red_data[2].f_fini = NULL; // no destructors needed
|
|
red_data[2].f_comb = (void*)&__red_dbl_mul_comb;
|
|
red_data[2].flags = FLG;
|
|
int gtid = __kmpc_global_thread_num(NULL);
|
|
void* tg1 = __kmpc_task_reduction_init(gtid, 3, red_data);
|
|
|
|
for( int l = 0; l < N; l += 2 ) {
|
|
// 2 iterations per task to get correct x value; actually any even
|
|
// number of iters per task will work, otherwise x looses precision
|
|
#pragma omp task firstprivate(l) //in_reduction(+:i) in_reduction(*:x)
|
|
{
|
|
int gtid = __kmpc_global_thread_num(NULL);
|
|
int *p_ip = (int*)__kmpc_task_reduction_get_th_data(gtid, tg1, &ip);
|
|
double *p_xp = (double*)__kmpc_task_reduction_get_th_data(
|
|
gtid, tg1, &xp);
|
|
if (!ptrs[gtid]) ptrs[gtid] = p_xp;
|
|
|
|
// user's pseudo-code ==============================
|
|
*p_ip += l;
|
|
*p_xp *= (l + 1);
|
|
|
|
*p_ip += l + 1;
|
|
*p_xp *= 1.0 / (l + 2);
|
|
// ==================================================
|
|
}
|
|
}
|
|
// inner taskgroup reduces (i,k,y), i is same object as in outer one
|
|
#pragma omp taskgroup // task_reduction(-:i,k) task_reduction(+:y)
|
|
{
|
|
_task_red_item_t red_data[3];
|
|
red_data[0].shar = &ip;
|
|
red_data[0].size = sizeof(ip);
|
|
red_data[0].f_init = NULL; // RTL will zero thread-specific objects
|
|
red_data[0].f_fini = NULL; // no destructors needed
|
|
red_data[0].f_comb = (void*)&__red_int_add_comb;
|
|
red_data[0].flags = FLG;
|
|
red_data[1].shar = &kp;
|
|
red_data[1].size = sizeof(kp);
|
|
red_data[1].f_init = NULL; // RTL will zero thread-specific objects
|
|
red_data[1].f_fini = NULL; // no destructors needed
|
|
red_data[1].f_comb = (void*)&__red_llong_add_comb; // same for + and -
|
|
red_data[1].flags = FLG;
|
|
red_data[2].shar = &yp;
|
|
red_data[2].size = sizeof(yp);
|
|
red_data[2].f_init = NULL; // RTL will zero thread-specific objects
|
|
red_data[2].f_fini = NULL; // no destructors needed
|
|
red_data[2].f_comb = (void*)&__red_dbl_add_comb;
|
|
red_data[2].flags = FLG;
|
|
int gtid = __kmpc_global_thread_num(NULL);
|
|
void* tg2 = __kmpc_task_reduction_init(gtid, 3, red_data);
|
|
|
|
for( int l = 0; l < N; l += 2 ) {
|
|
#pragma omp task firstprivate(l)
|
|
// in_reduction(+:j,y) in_reduction(*:x) in_reduction(-:k)
|
|
{
|
|
int gtid = __kmpc_global_thread_num(NULL);
|
|
long long *p_jp = (long long*)__kmpc_task_reduction_get_th_data(
|
|
gtid, tg1, &jp);
|
|
long long *p_kp = (long long*)__kmpc_task_reduction_get_th_data(
|
|
gtid, tg2, &kp);
|
|
double *p_xp = (double*)__kmpc_task_reduction_get_th_data(
|
|
gtid, tg1, &xp);
|
|
double *p_yp = (double*)__kmpc_task_reduction_get_th_data(
|
|
gtid, tg2, &yp);
|
|
// user's pseudo-code ==============================
|
|
*p_jp += l;
|
|
*p_kp -= l;
|
|
*p_yp += (double)l;
|
|
*p_xp *= (l + 1);
|
|
|
|
*p_jp += l + 1;
|
|
*p_kp -= l + 1;
|
|
*p_yp += (double)(l + 1);
|
|
*p_xp *= 1.0 / (l + 2);
|
|
// =================================================
|
|
{
|
|
// the following code is here just to check __kmpc_task_reduction_get_th_data:
|
|
int tid = omp_get_thread_num();
|
|
void *addr1;
|
|
void *addr2;
|
|
addr1 = __kmpc_task_reduction_get_th_data(gtid, tg1, &xp); // from shared
|
|
addr2 = __kmpc_task_reduction_get_th_data(gtid, tg1, addr1); // from private
|
|
if (addr1 != addr2) {
|
|
#pragma omp atomic
|
|
++err;
|
|
printf("Wrong thread-specific addresses %d s:%p p:%p\n", tid, addr1, addr2);
|
|
}
|
|
// from neighbour w/o taskgroup (should start lookup from current tg2)
|
|
if (tid > 0) {
|
|
if (ptrs[tid-1]) {
|
|
addr2 = __kmpc_task_reduction_get_th_data(gtid, NULL, ptrs[tid-1]);
|
|
if (addr1 != addr2) {
|
|
#pragma omp atomic
|
|
++err;
|
|
printf("Wrong thread-specific addresses %d s:%p n:%p\n",
|
|
tid, addr1, addr2);
|
|
}
|
|
}
|
|
} else {
|
|
if (ptrs[nthreads-1]) {
|
|
addr2 = __kmpc_task_reduction_get_th_data(gtid, NULL, ptrs[nthreads-1]);
|
|
if (addr1 != addr2) {
|
|
#pragma omp atomic
|
|
++err;
|
|
printf("Wrong thread-specific addresses %d s:%p n:%p\n",
|
|
tid, addr1, addr2);
|
|
}
|
|
}
|
|
}
|
|
// ----------------------------------------------
|
|
}
|
|
}
|
|
#pragma omp task firstprivate(l)
|
|
// in_reduction(+:y) in_reduction(-:i,k)
|
|
{
|
|
int gtid = __kmpc_global_thread_num(NULL);
|
|
int *p_ip = (int*)__kmpc_task_reduction_get_th_data(
|
|
gtid, tg2, &ip);
|
|
long long *p_kp = (long long*)__kmpc_task_reduction_get_th_data(
|
|
gtid, tg2, &kp);
|
|
double *p_yp = (double*)__kmpc_task_reduction_get_th_data(
|
|
gtid, tg2, &yp);
|
|
|
|
// user's pseudo-code ==============================
|
|
*p_ip -= l;
|
|
*p_kp -= l;
|
|
*p_yp += (double)l;
|
|
|
|
*p_ip -= l + 1;
|
|
*p_kp -= l + 1;
|
|
*p_yp += (double)(l + 1);
|
|
// =================================================
|
|
}
|
|
#pragma omp task firstprivate(l)
|
|
// in_reduction(+:j) in_reduction(*:x)
|
|
{
|
|
int gtid = __kmpc_global_thread_num(NULL);
|
|
long long *p_jp = (long long*)__kmpc_task_reduction_get_th_data(
|
|
gtid, tg1, &jp);
|
|
double *p_xp = (double*)__kmpc_task_reduction_get_th_data(
|
|
gtid, tg1, &xp);
|
|
// user's pseudo-code ==============================
|
|
*p_jp += l;
|
|
*p_xp *= (l + 1);
|
|
|
|
*p_jp += l + 1;
|
|
*p_xp *= 1.0 / (l + 2);
|
|
// =================================================
|
|
}
|
|
}
|
|
} // inner reduction
|
|
|
|
for( int l = 0; l < N; l += 2 ) {
|
|
#pragma omp task firstprivate(l) // in_reduction(+:j)
|
|
{
|
|
int gtid = __kmpc_global_thread_num(NULL);
|
|
long long *p_jp = (long long*)__kmpc_task_reduction_get_th_data(
|
|
gtid, tg1, &jp);
|
|
// user's pseudo-code ==============================
|
|
*p_jp += l;
|
|
*p_jp += l + 1;
|
|
// =================================================
|
|
}
|
|
}
|
|
} // outer reduction
|
|
} // end single
|
|
} // end parallel
|
|
// check results
|
|
#if _DEBUG
|
|
printf("reduction flags = %u\n", FLG);
|
|
#endif
|
|
if (ip == is && jp == js && ks == kp &&
|
|
fabs(xp - xs) < 0.01 && fabs(yp - ys) < 0.01)
|
|
printf("passed\n");
|
|
else
|
|
printf("failed,\n ser:(%d %lld %f %lld %f)\n par:(%d %lld %f %lld %f)\n",
|
|
is, js, xs, ks, ys,
|
|
ip, jp, xp, kp, yp);
|
|
return 0;
|
|
}
|