// RUN: %libomp-cxx-compile-and-run
// RUN: %libomp-cxx-compile -DFLG=1 && %libomp-run

// GCC-5 is needed for OpenMP 4.0 support (taskgroup)
// XFAIL: gcc-4
|
|
#include <cstdio>
#include <cmath>
#include <cassert>
#include <cstdlib> // malloc/free used in main()
#include <omp.h>
|
|
|
|
// Total number of loop iterations, should be multiple of T for this test
#define N 10000

// Flag to request lazy (1) or eager (0) allocation of reduction objects
#ifndef FLG
#define FLG 0
#endif
|
|
|
|
/*
// initial user's code that corresponds to pseudo code of the test
#pragma omp taskgroup task_reduction(+:i,j) task_reduction(*:x)
{
  for( int l = 0; l < N; ++l ) {
    #pragma omp task firstprivate(l) in_reduction(+:i) in_reduction(*:x)
    {
      i += l;
      if( l%2 )
        x *= 1.0 / (l + 1);
      else
        x *= (l + 1);
    }
  }

  #pragma omp taskgroup task_reduction(-:i,k) task_reduction(+:y)
  {
    for( int l = 0; l < N; ++l ) {
      #pragma omp task firstprivate(l) in_reduction(+:j,y) \
          in_reduction(*:x) in_reduction(-:k)
      {
        j += l;
        k -= l;
        y += (double)l;
        if( l%2 )
          x *= 1.0 / (l + 1);
        else
          x *= (l + 1);
      }
      #pragma omp task firstprivate(l) in_reduction(+:y) in_reduction(-:i,k)
      {
        i -= l;
        k -= l;
        y += (double)l;
      }
      #pragma omp task firstprivate(l) in_reduction(+:j) in_reduction(*:x)
      {
        j += l;
        if( l%2 )
          x *= 1.0 / (l + 1);
        else
          x *= (l + 1);
      }
    }
  } // inner reduction

  for( int l = 0; l < N; ++l ) {
    #pragma omp task firstprivate(l) in_reduction(+:j)
    j += l;
  }
} // outer reduction
*/
|
|
|
|
//------------------------------------------------
// OpenMP runtime library routines
// Internal libomp (kmp) entry points, declared here so the test can emulate
// compiler-generated task-reduction code directly.
#ifdef __cplusplus
extern "C" {
#endif
// Returns the address of the calling thread's private copy of reduction
// object 'item' registered with taskgroup 'tg'; a NULL 'tg' starts the
// lookup from the current taskgroup (see the address checks in main()).
extern void* __kmpc_task_reduction_get_th_data(int gtid, void* tg, void* item);
// Registers 'num' reduction items described by the array 'data' for the
// current taskgroup; returns an opaque taskgroup handle.
extern void* __kmpc_task_reduction_init(int gtid, int num, void* data);
// Returns the global thread number (gtid) of the calling thread.
extern int __kmpc_global_thread_num(void*);
#ifdef __cplusplus
}
#endif
|
|
|
|
//------------------------------------------------
// Compiler-generated code

// Descriptor for one reduction item, in the layout consumed by
// __kmpc_task_reduction_init (one array element per reduced variable).
typedef struct _task_red_item {
    void *shar; // shared reduction item
    size_t size; // size of data item
    void *f_init; // data initialization routine (NULL: RTL zero-fills)
    void *f_fini; // data finalization routine (NULL: no destructor needed)
    void *f_comb; // data combiner routine
    unsigned flags; // lazy (1) / eager (0) allocation request — see FLG
} _task_red_item_t;
|
|
|
|
// int:+ no need in init/fini callbacks, valid for subtraction
// Combiner: accumulates the rhs private copy into the lhs copy.
void __red_int_add_comb(void *lhs, void *rhs) // combiner
{
  int *dst = (int *)lhs;
  const int *src = (const int *)rhs;
  *dst = *dst + *src;
}
|
|
|
|
// long long:+ no need in init/fini callbacks, valid for subtraction
// Combiner: accumulates the rhs private copy into the lhs copy.
void __red_llong_add_comb(void *lhs, void *rhs) // combiner
{
  long long *dst = (long long *)lhs;
  const long long *src = (const long long *)rhs;
  *dst = *dst + *src;
}
|
|
|
|
// double:* no need in fini callback
// Initializer: sets the private copy to the multiplicative identity.
void __red_dbl_mul_init(void *data) // initializer
{
  double *obj = (double *)data;
  *obj = 1.0;
}
|
|
// Combiner for double:* — folds the rhs private copy into lhs by product.
void __red_dbl_mul_comb(void *lhs, void *rhs) // combiner
{
  double *dst = (double *)lhs;
  const double *src = (const double *)rhs;
  *dst = *dst * *src;
}
|
|
|
|
// double:+ no need in init/fini callbacks
// Combiner: accumulates the rhs private copy into the lhs copy.
void __red_dbl_add_comb(void *lhs, void *rhs) // combiner
{
  double *dst = (double *)lhs;
  const double *src = (const double *)rhs;
  *dst = *dst + *src;
}
|
|
|
|
// ==============================
|
|
|
|
void calc_serial(int *pi, long long *pj, double *px, long long *pk, double *py)
|
|
{
|
|
for( int l = 0; l < N; ++l ) {
|
|
*pi += l;
|
|
if( l%2 )
|
|
*px *= 1.0 / (l + 1);
|
|
else
|
|
*px *= (l + 1);
|
|
}
|
|
for( int l = 0; l < N; ++l ) {
|
|
*pj += l;
|
|
*pk -= l;
|
|
*py += (double)l;
|
|
if( l%2 )
|
|
*px *= 1.0 / (l + 1);
|
|
else
|
|
*px *= (l + 1);
|
|
|
|
*pi -= l;
|
|
*pk -= l;
|
|
*py += (double)l;
|
|
|
|
*pj += l;
|
|
if( l%2 )
|
|
*px *= 1.0 / (l + 1);
|
|
else
|
|
*px *= (l + 1);
|
|
}
|
|
for( int l = 0; l < N; ++l ) {
|
|
*pj += l;
|
|
}
|
|
}
|
|
|
|
//------------------------------------------------
|
|
// Test case
|
|
int main()
|
|
{
|
|
int nthreads = omp_get_max_threads();
|
|
int err = 0;
|
|
void** ptrs = (void**)malloc(nthreads*sizeof(void*));
|
|
|
|
// user's code ======================================
|
|
// variables for serial calculations:
|
|
int is = 3;
|
|
long long js = -9999999;
|
|
double xs = 99999.0;
|
|
long long ks = 99999999;
|
|
double ys = -99999999.0;
|
|
// variables for parallel calculations:
|
|
int ip = 3;
|
|
long long jp = -9999999;
|
|
double xp = 99999.0;
|
|
long long kp = 99999999;
|
|
double yp = -99999999.0;
|
|
|
|
calc_serial(&is, &js, &xs, &ks, &ys);
|
|
// ==================================================
|
|
for (int i = 0; i < nthreads; ++i)
|
|
ptrs[i] = NULL;
|
|
#pragma omp parallel
|
|
{
|
|
#pragma omp single nowait
|
|
{
|
|
// outer taskgroup reduces (i,j,x)
|
|
#pragma omp taskgroup // task_reduction(+:i,j) task_reduction(*:x)
|
|
{
|
|
_task_red_item_t red_data[3];
|
|
red_data[0].shar = &ip;
|
|
red_data[0].size = sizeof(ip);
|
|
red_data[0].f_init = NULL; // RTL will zero thread-specific objects
|
|
red_data[0].f_fini = NULL; // no destructors needed
|
|
red_data[0].f_comb = (void*)&__red_int_add_comb;
|
|
red_data[0].flags = FLG;
|
|
red_data[1].shar = &jp;
|
|
red_data[1].size = sizeof(jp);
|
|
red_data[1].f_init = NULL; // RTL will zero thread-specific objects
|
|
red_data[1].f_fini = NULL; // no destructors needed
|
|
red_data[1].f_comb = (void*)&__red_llong_add_comb;
|
|
red_data[1].flags = FLG;
|
|
red_data[2].shar = &xp;
|
|
red_data[2].size = sizeof(xp);
|
|
red_data[2].f_init = (void*)&__red_dbl_mul_init;
|
|
red_data[2].f_fini = NULL; // no destructors needed
|
|
red_data[2].f_comb = (void*)&__red_dbl_mul_comb;
|
|
red_data[2].flags = FLG;
|
|
int gtid = __kmpc_global_thread_num(NULL);
|
|
void* tg1 = __kmpc_task_reduction_init(gtid, 3, red_data);
|
|
|
|
for( int l = 0; l < N; l += 2 ) {
|
|
// 2 iterations per task to get correct x value; actually any even
|
|
// number of iters per task will work, otherwise x looses precision
|
|
#pragma omp task firstprivate(l) //in_reduction(+:i) in_reduction(*:x)
|
|
{
|
|
int gtid = __kmpc_global_thread_num(NULL);
|
|
int *p_ip = (int*)__kmpc_task_reduction_get_th_data(gtid, tg1, &ip);
|
|
double *p_xp = (double*)__kmpc_task_reduction_get_th_data(
|
|
gtid, tg1, &xp);
|
|
if (!ptrs[gtid]) ptrs[gtid] = p_xp;
|
|
|
|
// user's pseudo-code ==============================
|
|
*p_ip += l;
|
|
*p_xp *= (l + 1);
|
|
|
|
*p_ip += l + 1;
|
|
*p_xp *= 1.0 / (l + 2);
|
|
// ==================================================
|
|
}
|
|
}
|
|
// inner taskgroup reduces (i,k,y), i is same object as in outer one
|
|
#pragma omp taskgroup // task_reduction(-:i,k) task_reduction(+:y)
|
|
{
|
|
_task_red_item_t red_data[3];
|
|
red_data[0].shar = &ip;
|
|
red_data[0].size = sizeof(ip);
|
|
red_data[0].f_init = NULL; // RTL will zero thread-specific objects
|
|
red_data[0].f_fini = NULL; // no destructors needed
|
|
red_data[0].f_comb = (void*)&__red_int_add_comb;
|
|
red_data[0].flags = FLG;
|
|
red_data[1].shar = &kp;
|
|
red_data[1].size = sizeof(kp);
|
|
red_data[1].f_init = NULL; // RTL will zero thread-specific objects
|
|
red_data[1].f_fini = NULL; // no destructors needed
|
|
red_data[1].f_comb = (void*)&__red_llong_add_comb; // same for + and -
|
|
red_data[1].flags = FLG;
|
|
red_data[2].shar = &yp;
|
|
red_data[2].size = sizeof(yp);
|
|
red_data[2].f_init = NULL; // RTL will zero thread-specific objects
|
|
red_data[2].f_fini = NULL; // no destructors needed
|
|
red_data[2].f_comb = (void*)&__red_dbl_add_comb;
|
|
red_data[2].flags = FLG;
|
|
int gtid = __kmpc_global_thread_num(NULL);
|
|
void* tg2 = __kmpc_task_reduction_init(gtid, 3, red_data);
|
|
|
|
for( int l = 0; l < N; l += 2 ) {
|
|
#pragma omp task firstprivate(l)
|
|
// in_reduction(+:j,y) in_reduction(*:x) in_reduction(-:k)
|
|
{
|
|
int gtid = __kmpc_global_thread_num(NULL);
|
|
long long *p_jp = (long long*)__kmpc_task_reduction_get_th_data(
|
|
gtid, tg1, &jp);
|
|
long long *p_kp = (long long*)__kmpc_task_reduction_get_th_data(
|
|
gtid, tg2, &kp);
|
|
double *p_xp = (double*)__kmpc_task_reduction_get_th_data(
|
|
gtid, tg1, &xp);
|
|
double *p_yp = (double*)__kmpc_task_reduction_get_th_data(
|
|
gtid, tg2, &yp);
|
|
// user's pseudo-code ==============================
|
|
*p_jp += l;
|
|
*p_kp -= l;
|
|
*p_yp += (double)l;
|
|
*p_xp *= (l + 1);
|
|
|
|
*p_jp += l + 1;
|
|
*p_kp -= l + 1;
|
|
*p_yp += (double)(l + 1);
|
|
*p_xp *= 1.0 / (l + 2);
|
|
// =================================================
|
|
{
|
|
// the following code is here just to check __kmpc_task_reduction_get_th_data:
|
|
int tid = omp_get_thread_num();
|
|
void *addr1;
|
|
void *addr2;
|
|
addr1 = __kmpc_task_reduction_get_th_data(gtid, tg1, &xp); // from shared
|
|
addr2 = __kmpc_task_reduction_get_th_data(gtid, tg1, addr1); // from private
|
|
if (addr1 != addr2) {
|
|
#pragma omp atomic
|
|
++err;
|
|
printf("Wrong thread-specific addresses %d s:%p p:%p\n", tid, addr1, addr2);
|
|
}
|
|
// from neighbour w/o taskgroup (should start lookup from current tg2)
|
|
if (tid > 0) {
|
|
if (ptrs[tid-1]) {
|
|
addr2 = __kmpc_task_reduction_get_th_data(gtid, NULL, ptrs[tid-1]);
|
|
if (addr1 != addr2) {
|
|
#pragma omp atomic
|
|
++err;
|
|
printf("Wrong thread-specific addresses %d s:%p n:%p\n",
|
|
tid, addr1, addr2);
|
|
}
|
|
}
|
|
} else {
|
|
if (ptrs[nthreads-1]) {
|
|
addr2 = __kmpc_task_reduction_get_th_data(gtid, NULL, ptrs[nthreads-1]);
|
|
if (addr1 != addr2) {
|
|
#pragma omp atomic
|
|
++err;
|
|
printf("Wrong thread-specific addresses %d s:%p n:%p\n",
|
|
tid, addr1, addr2);
|
|
}
|
|
}
|
|
}
|
|
// ----------------------------------------------
|
|
}
|
|
}
|
|
#pragma omp task firstprivate(l)
|
|
// in_reduction(+:y) in_reduction(-:i,k)
|
|
{
|
|
int gtid = __kmpc_global_thread_num(NULL);
|
|
int *p_ip = (int*)__kmpc_task_reduction_get_th_data(
|
|
gtid, tg2, &ip);
|
|
long long *p_kp = (long long*)__kmpc_task_reduction_get_th_data(
|
|
gtid, tg2, &kp);
|
|
double *p_yp = (double*)__kmpc_task_reduction_get_th_data(
|
|
gtid, tg2, &yp);
|
|
|
|
// user's pseudo-code ==============================
|
|
*p_ip -= l;
|
|
*p_kp -= l;
|
|
*p_yp += (double)l;
|
|
|
|
*p_ip -= l + 1;
|
|
*p_kp -= l + 1;
|
|
*p_yp += (double)(l + 1);
|
|
// =================================================
|
|
}
|
|
#pragma omp task firstprivate(l)
|
|
// in_reduction(+:j) in_reduction(*:x)
|
|
{
|
|
int gtid = __kmpc_global_thread_num(NULL);
|
|
long long *p_jp = (long long*)__kmpc_task_reduction_get_th_data(
|
|
gtid, tg1, &jp);
|
|
double *p_xp = (double*)__kmpc_task_reduction_get_th_data(
|
|
gtid, tg1, &xp);
|
|
// user's pseudo-code ==============================
|
|
*p_jp += l;
|
|
*p_xp *= (l + 1);
|
|
|
|
*p_jp += l + 1;
|
|
*p_xp *= 1.0 / (l + 2);
|
|
// =================================================
|
|
}
|
|
}
|
|
} // inner reduction
|
|
|
|
for( int l = 0; l < N; l += 2 ) {
|
|
#pragma omp task firstprivate(l) // in_reduction(+:j)
|
|
{
|
|
int gtid = __kmpc_global_thread_num(NULL);
|
|
long long *p_jp = (long long*)__kmpc_task_reduction_get_th_data(
|
|
gtid, tg1, &jp);
|
|
// user's pseudo-code ==============================
|
|
*p_jp += l;
|
|
*p_jp += l + 1;
|
|
// =================================================
|
|
}
|
|
}
|
|
} // outer reduction
|
|
} // end single
|
|
} // end parallel
|
|
// check results
|
|
#if _DEBUG
|
|
printf("reduction flags = %u\n", FLG);
|
|
#endif
|
|
if (ip == is && jp == js && ks == kp &&
|
|
fabs(xp - xs) < 0.01 && fabs(yp - ys) < 0.01)
|
|
printf("passed\n");
|
|
else
|
|
printf("failed,\n ser:(%d %lld %f %lld %f)\n par:(%d %lld %f %lld %f)\n",
|
|
is, js, xs, ks, ys,
|
|
ip, jp, xp, kp, yp);
|
|
return 0;
|
|
}
|