Merge branch 'master' into improve-include-consistency

# Conflicts:
#	src/USER-MESO/atom_vec_tdpd.cpp
Axel Kohlmeyer 2019-07-09 14:50:00 -04:00
commit 06dcc9e283
GPG Key ID: D9B44E93BF0C375A (no known key found for this signature in database)
610 changed files with 39096 additions and 8292 deletions

View File

@@ -57,8 +57,7 @@ Boolean expression is FALSE, then no commands are executed.
 The syntax for Boolean expressions is described below.
 Each command (t1, f1, e1, etc) can be any valid LAMMPS input script
-command, except an "include"_include.html command, which is not
-allowed. If the command is more than one word, it must be enclosed in
+command. If the command is more than one word, it must be enclosed in
 quotes, so it will be treated as a single argument, as in the examples
 above.

View File

@@ -147,7 +147,8 @@ asub = "A" parameter for MEAM (see e.g. "(Baskes)"_#Baskes) :pre
 The alpha, b0, b1, b2, b3, t0, t1, t2, t3 parameters correspond to the
 standard MEAM parameters in the literature "(Baskes)"_#Baskes (the b
-parameters are the standard beta parameters). The rozero parameter is
+parameters are the standard beta parameters). Note that only parameters
+normalized to t0 = 1.0 are supported. The rozero parameter is
 an element-dependent density scaling that weights the reference
 background density (see e.g. equation 4.5 in "(Gullet)"_#Gullet) and
 is typically 1.0 for single-element systems. The ibar parameter

View File

@@ -5092,4 +5092,17 @@ span[id*='MathJax-Span'] {
 src: local("Roboto Slab Bold"), local("RobotoSlab-Bold"), url(../fonts/RobotoSlab-Bold.ttf) format("truetype");
 }
+.codeblock, pre.literal-block, .rst-content .literal-block, .rst-content pre.literal-block, div[class^='highlight'] {
+font-size: 12px;
+line-height: 1.5;
+display: block;
+overflow: auto;
+color: #404040;
+padding: 12px 12px;
+}
+.codeblock,div[class^='highlight'] {
+padding: 0;
+}
 /*# sourceMappingURL=theme.css.map */

View File

@@ -174,6 +174,7 @@ __kernel void k_lj96_fast(const __global numtyp4 *restrict x_,
 numtyp r6inv = r2inv*r2inv*r2inv;
 numtyp r3inv = ucl_sqrt(r6inv);
 numtyp force = r2inv*r6inv*(lj1[mtype].x*r3inv-lj1[mtype].y);
+force*=factor_lj;
 f.x+=delx*force;
 f.y+=dely*force;
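The single added line applies the `factor_lj` special-bonds prefactor that this fast kernel previously dropped. A host-side C++ sketch of the same arithmetic, with plain doubles standing in for `numtyp` and for the `lj1[mtype]` coefficient fetch (an illustration of the fix, not the GPU source):

```cpp
#include <cmath>

// Scalar sketch of the lj/96 force evaluation above. lj1_x and lj1_y stand
// in for lj1[mtype].x and lj1[mtype].y; factor_lj is the special_bonds
// scaling that the added line applies before the force is accumulated.
double lj96_force(double rsq, double lj1_x, double lj1_y, double factor_lj) {
  double r2inv = 1.0 / rsq;
  double r6inv = r2inv * r2inv * r2inv;
  double r3inv = std::sqrt(r6inv);   // r^-3 obtained from r^-6
  double force = r2inv * r6inv * (lj1_x * r3inv - lj1_y);
  return force * factor_lj;          // the scaling added by this hunk
}
```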

View File

@@ -308,8 +308,6 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_,
 delr1.z = jx.z-ix.z;
 numtyp rsq1 = delr1.x*delr1.x+delr1.y*delr1.y+delr1.z*delr1.z;
-// if (rsq1 > cutsq[ijparam]) continue;
 // compute zeta_ij
 z = (acctyp)0;
@@ -355,13 +353,9 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_,
 rsq1, rsq2, delr1, delr2);
 }
-//int jj = (nbor_j-offset_j-2*nbor_pitch)/n_stride;
-//int idx = jj*n_stride + i*t_per_atom + offset_j;
-//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
+// idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
 int idx = nbor_j;
 if (dev_packed==dev_nbor) idx -= n_stride;
-// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-//          i, nbor_j, offset_j, idx);
 acc_zeta(z, tid, t_per_atom, offset_k);
 numtyp4 ts1_ijparam = ts1[ijparam]; //fetch4(ts1_ijparam,ijparam,ts1_tex);
@@ -585,14 +579,9 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_,
 numtyp r1inv = ucl_rsqrt(rsq1);
 // look up for zeta_ij
-//int jj = (nbor_j-offset_j-2*nbor_pitch) / n_stride;
-//int idx = jj*n_stride + i*t_per_atom + offset_j;
-//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
+// idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
 int idx = nbor_j;
 if (dev_packed==dev_nbor) idx -= n_stride;
-// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-//          i, nbor_j, offset_j, idx);
 acctyp4 zeta_ij = zetaij[idx]; // fetch(zeta_ij,idx,zeta_tex);
 numtyp force = zeta_ij.x*tpainv;
 numtyp prefactor = zeta_ij.y;
@@ -823,13 +812,9 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
 offset_kf = red_acc[2*m+1];
 }
-//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
-//int idx = iix*n_stride + j*t_per_atom + offset_kf;
-//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
+// idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
 int idx = ijnum;
 if (dev_packed==dev_nbor) idx -= n_stride;
-// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-//          j, ijnum, offset_kf, idx);
 acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
 numtyp force = zeta_ji.x*tpainv;
 numtyp prefactor_ji = zeta_ji.y;
@@ -891,13 +876,10 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
 f.y += fi[1];
 f.z += fi[2];
-//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
-//int idx = kk*n_stride + j*t_per_atom + offset_k;
-//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
+// idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
 int idx = nbor_k;
 if (dev_packed==dev_nbor) idx -= n_stride;
-// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-//          j, nbor_k, offset_k, idx);
 acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
 numtyp prefactor_jk = zeta_jk.y;
 int jkiparam=elem2param[jtype*nelements*nelements+ktype*nelements+itype];
@@ -1068,13 +1050,9 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
 offset_kf = red_acc[2*m+1];
 }
-//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
-//int idx = iix*n_stride + j*t_per_atom + offset_kf;
-//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
+// idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
 int idx = ijnum;
 if (dev_packed==dev_nbor) idx -= n_stride;
-// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-//          j, ijnum, offset_kf, idx);
 acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
 numtyp force = zeta_ji.x*tpainv;
 numtyp prefactor_ji = zeta_ji.y;
@@ -1143,13 +1121,9 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
 virial[4] += TWOTHIRD*(mdelr1[0]*fj[2] + delr2[0]*fk[2]);
 virial[5] += TWOTHIRD*(mdelr1[1]*fj[2] + delr2[1]*fk[2]);
-//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
-//int idx = kk*n_stride + j*t_per_atom + offset_k;
-//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
+// idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
 int idx = nbor_k;
 if (dev_packed==dev_nbor) idx -= n_stride;
-// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-//          j, nbor_k, offset_k, idx);
 acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
 numtyp prefactor_jk = zeta_jk.y;
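Every hunk in this file makes the same substitution: the commented-out index arithmetic and the disabled `zeta_idx(...)` call are deleted, and the surviving two lines compute the lookup index directly. A plain C++ sketch of that mapping, under the assumption stated in the kept comment (the `zetaij` entries lag the `dev_short_nbor` index by one stride when the packed and unpacked neighbor lists alias):

```cpp
// short_nbor_idx plays the role of nbor_j / ijnum / nbor_k above;
// packed_is_nbor mirrors the device-pointer test (dev_packed == dev_nbor).
int zetaij_index(int short_nbor_idx, int n_stride, bool packed_is_nbor) {
  int idx = short_nbor_idx;
  if (packed_is_nbor) idx -= n_stride;  // shift by one stride when lists alias
  return idx;
}
```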

View File

@@ -356,13 +356,9 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_,
 ijkparam_c5, rsq1, rsq2, delr1, delr2);
 }
-//int jj = (nbor_j-offset_j-2*nbor_pitch)/n_stride;
-//int idx = jj*n_stride + i*t_per_atom + offset_j;
-//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
+// idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
 int idx = nbor_j;
 if (dev_packed==dev_nbor) idx -= n_stride;
-// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-//          i, nbor_j, offset_j, idx);
 acc_zeta(z, tid, t_per_atom, offset_k);
 numtyp4 ts1_ijparam = ts1[ijparam]; //fetch4(ts1_ijparam,ijparam,ts1_tex);
@@ -587,14 +583,9 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
 numtyp r1inv = ucl_rsqrt(rsq1);
 // look up for zeta_ij
-//int jj = (nbor_j-offset_j-2*nbor_pitch) / n_stride;
-//int idx = jj*n_stride + i*t_per_atom + offset_j;
-//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
+// idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
 int idx = nbor_j;
 if (dev_packed==dev_nbor) idx -= n_stride;
-// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-//          i, nbor_j, offset_j, idx);
 acctyp4 zeta_ij = zetaij[idx]; // fetch(zeta_ij,idx,zeta_tex);
 numtyp force = zeta_ij.x*tpainv;
 numtyp prefactor = zeta_ij.y;
@@ -831,13 +822,9 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
 offset_kf = red_acc[2*m+1];
 }
-//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
-//int idx = iix*n_stride + j*t_per_atom + offset_kf;
-//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
+// idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
 int idx = ijnum;
 if (dev_packed==dev_nbor) idx -= n_stride;
-// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-//          j, ijnum, offset_kf, idx);
 acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
 numtyp force = zeta_ji.x*tpainv;
 numtyp prefactor_ji = zeta_ji.y;
@@ -902,13 +889,9 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
 f.y += fi[1];
 f.z += fi[2];
-//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
-//int idx = kk*n_stride + j*t_per_atom + offset_k;
-//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
+// idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
 int idx = nbor_k;
 if (dev_packed==dev_nbor) idx -= n_stride;
-// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-//          j, nbor_k, offset_k, idx);
 acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
 numtyp prefactor_jk = zeta_jk.y;
 int jkiparam=elem2param[jtype*nelements*nelements+ktype*nelements+itype];
@@ -1085,13 +1068,9 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
 offset_kf = red_acc[2*m+1];
 }
-//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
-//int idx = iix*n_stride + j*t_per_atom + offset_kf;
-//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
+// idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
 int idx = ijnum;
 if (dev_packed==dev_nbor) idx -= n_stride;
-// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-//          j, ijnum, offset_kf, idx);
 acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
 numtyp force = zeta_ji.x*tpainv;
 numtyp prefactor_ji = zeta_ji.y;
@@ -1163,13 +1142,9 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
 virial[4] += TWOTHIRD*(mdelr1[0]*fj[2] + delr2[0]*fk[2]);
 virial[5] += TWOTHIRD*(mdelr1[1]*fj[2] + delr2[1]*fk[2]);
-//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
-//int idx = kk*n_stride + j*t_per_atom + offset_k;
-//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
+// idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
 int idx = nbor_k;
 if (dev_packed==dev_nbor) idx -= n_stride;
-// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-//          j, nbor_k, offset_k, idx);
 acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
 numtyp prefactor_jk = zeta_jk.y;

View File

@@ -359,13 +359,9 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_,
 rsq1, rsq2, delr1, delr2);
 }
-//int jj = (nbor_j-offset_j-2*nbor_pitch)/n_stride;
-//int idx = jj*n_stride + i*t_per_atom + offset_j;
-//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
+// idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
 int idx = nbor_j;
 if (dev_packed==dev_nbor) idx -= n_stride;
-// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-//          i, nbor_j, offset_j, idx);
 acc_zeta(z, tid, t_per_atom, offset_k);
 numtyp4 ts1_ijparam = ts1[ijparam]; //fetch4(ts1_ijparam,ijparam,ts1_tex);
@@ -603,14 +599,9 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_,
 numtyp r1inv = ucl_rsqrt(rsq1);
 // look up for zeta_ij
-//int jj = (nbor_j-offset_j-2*nbor_pitch) / n_stride;
-//int idx = jj*n_stride + i*t_per_atom + offset_j;
-//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
+// idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
 int idx = nbor_j;
 if (dev_packed==dev_nbor) idx -= n_stride;
-// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-//          i, nbor_j, offset_j, idx);
 acctyp4 zeta_ij = zetaij[idx]; // fetch(zeta_ij,idx,zeta_tex);
 numtyp force = zeta_ij.x*tpainv;
 numtyp prefactor = zeta_ij.y;
@@ -841,13 +832,9 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
 offset_kf = red_acc[2*m+1];
 }
-//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
-//int idx = iix*n_stride + j*t_per_atom + offset_kf;
-//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
+// idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
 int idx = ijnum;
 if (dev_packed==dev_nbor) idx -= n_stride;
-// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-//          j, ijnum, offset_kf, idx);
 acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
 numtyp force = zeta_ji.x*tpainv;
 numtyp prefactor_ji = zeta_ji.y;
@@ -909,13 +896,9 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
 f.y += fi[1];
 f.z += fi[2];
-//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
-//int idx = kk*n_stride + j*t_per_atom + offset_k;
-//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
+// idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
 int idx = nbor_k;
 if (dev_packed==dev_nbor) idx -= n_stride;
-// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-//          j, nbor_k, offset_k, idx);
 acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
 numtyp prefactor_jk = zeta_jk.y;
 int jkiparam=elem2param[jtype*nelements*nelements+ktype*nelements+itype];
@@ -1086,13 +1069,9 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
 offset_kf = red_acc[2*m+1];
 }
-//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
-//int idx = iix*n_stride + j*t_per_atom + offset_kf;
-//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
+// idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
 int idx = ijnum;
 if (dev_packed==dev_nbor) idx -= n_stride;
-// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-//          j, ijnum, offset_kf, idx);
 acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
 numtyp force = zeta_ji.x*tpainv;
 numtyp prefactor_ji = zeta_ji.y;
@@ -1161,13 +1140,9 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
 virial[4] += TWOTHIRD*(mdelr1[0]*fj[2] + delr2[0]*fk[2]);
 virial[5] += TWOTHIRD*(mdelr1[1]*fj[2] + delr2[1]*fk[2]);
-//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
-//int idx = kk*n_stride + j*t_per_atom + offset_k;
-//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
+// idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
 int idx = nbor_k;
 if (dev_packed==dev_nbor) idx -= n_stride;
-// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-//          j, nbor_k, offset_k, idx);
 acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
 numtyp prefactor_jk = zeta_jk.y;

View File

@@ -89,10 +89,10 @@ __kernel void k_yukawa_colloid(const __global numtyp4 *restrict x_,
 if (rsq<coeff[mtype].z) {
 numtyp r = ucl_sqrt(rsq);
 numtyp rinv = ucl_recip(r);
-numtyp screening = ucl_exp(-kappa*(r-(radi+radj)));
-numtyp force = coeff[mtype].x * screening;
+numtyp screening = ucl_exp(-kappa*(r-(radi+radj)));
+numtyp force = coeff[mtype].x * screening;
-force = factor_lj*force * rinv;
+force = factor_lj*force * rinv;
 f.x+=delx*force;
 f.y+=dely*force;
@@ -181,10 +181,10 @@ __kernel void k_yukawa_colloid_fast(const __global numtyp4 *restrict x_,
 if (rsq<coeff[mtype].z) {
 numtyp r = ucl_sqrt(rsq);
 numtyp rinv = ucl_recip(r);
-numtyp screening = ucl_exp(-kappa*(r-(radi+radj)));
-numtyp force = coeff[mtype].x * screening;
+numtyp screening = ucl_exp(-kappa*(r-(radi+radj)));
+numtyp force = coeff[mtype].x * screening;
-force = factor_lj*force * rinv;
+force = factor_lj*force * rinv;
 f.x+=delx*force;
 f.y+=dely*force;
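For reference, the scalar both kernels store in `force` (and then multiply into the components of the displacement vector) corresponds to:

```latex
% A = coeff[mtype].x, a_i and a_j are the particle radii (radi, radj),
% kappa is the screening parameter, factor_lj the special_bonds prefactor.
\frac{F(r)}{r} = \mathrm{factor\_lj}\,\frac{A\,e^{-\kappa\left(r-(a_i+a_j)\right)}}{r}
```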

View File

@@ -129,16 +129,13 @@ __kernel void k_zbl(const __global numtyp4 *restrict x_,
 int mtype=itype*lj_types+jtype;
 if (rsq<cut_globalsq) {
 numtyp r, t, force;
 r = ucl_sqrt(rsq);
 force = dzbldr(r, coeff2[mtype].x, coeff2[mtype].y,
 coeff2[mtype].z, coeff2[mtype].w, coeff1[mtype].z);
-if (rsq>cut_innersq) {
-t = r - cut_inner;
-force = t*t * (coeff1[mtype].x + coeff1[mtype].y*t);
-}
+if (rsq>cut_innersq) {
+t = r - cut_inner;
+force = t*t * (coeff1[mtype].x + coeff1[mtype].y*t);
+}
 force *= (numtyp)-1.0*ucl_recip(r);
 f.x+=delx*force;
@@ -148,11 +145,10 @@ __kernel void k_zbl(const __global numtyp4 *restrict x_,
 if (eflag>0) {
 numtyp e=e_zbl(r, coeff2[mtype].x, coeff2[mtype].y,
 coeff2[mtype].z, coeff2[mtype].w, coeff1[mtype].z);
-e += coeff3[mtype].z;
-if (rsq > cut_innersq) {
-e += t*t*t * (coeff3[mtype].x + coeff3[mtype].y*t);
-}
+e += coeff3[mtype].z;
+if (rsq > cut_innersq) {
+e += t*t*t * (coeff3[mtype].x + coeff3[mtype].y*t);
+}
 energy+=e;
 }
 if (vflag>0) {
@@ -232,15 +228,13 @@ __kernel void k_zbl_fast(const __global numtyp4 *restrict x_,
 if (rsq<cut_globalsq) {
 numtyp r, t, force;
 r = ucl_sqrt(rsq);
 force = dzbldr(r, coeff2[mtype].x, coeff2[mtype].y,
 coeff2[mtype].z, coeff2[mtype].w, coeff1[mtype].z);
-if (rsq>cut_innersq) {
-t = r - cut_inner;
-force += t*t * (coeff1[mtype].x + coeff1[mtype].y*t);
-}
+if (rsq>cut_innersq) {
+t = r - cut_inner;
+force += t*t * (coeff1[mtype].x + coeff1[mtype].y*t);
+}
 force *= (numtyp)-1.0*ucl_recip(r);
@@ -251,11 +245,10 @@ __kernel void k_zbl_fast(const __global numtyp4 *restrict x_,
 if (eflag>0) {
 numtyp e=e_zbl(r, coeff2[mtype].x, coeff2[mtype].y,
 coeff2[mtype].z, coeff2[mtype].w, coeff1[mtype].z);
-e += coeff3[mtype].z;
-if (rsq > cut_innersq) {
-e += t*t*t * (coeff3[mtype].x + coeff3[mtype].y*t);
-}
+e += coeff3[mtype].z;
+if (rsq > cut_innersq) {
+e += t*t*t * (coeff3[mtype].x + coeff3[mtype].y*t);
+}
 energy+=e;
 }
 if (vflag>0) {
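Reading the `_fast` kernel off the diff, pairs between the inner and global cutoffs pick up cubic switching corrections; schematically, with the switching constants stored in `coeff1` and `coeff3`:

```latex
% t = r - r_inner; c_1, c_2 = coeff1[mtype].x, .y; c_3, c_4 = coeff3[mtype].x, .y.
% The shift coeff3[mtype].z is added to E for every pair inside the global
% cutoff; the t-dependent terms apply only when r > r_inner.
t = r - r_{\text{inner}}, \qquad
F \mathrel{+}= t^{2}\,(c_1 + c_2\,t), \qquad
E \mathrel{+}= t^{3}\,(c_3 + c_4\,t)
```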

View File

@@ -1,5 +1,39 @@
 # Change Log
+
+## [2.9.00](https://github.com/kokkos/kokkos/tree/2.9.00) (2019-06-24)
+[Full Changelog](https://github.com/kokkos/kokkos/compare/2.8.00...2.9.00)
+
+**Implemented enhancements:**
+
+- Capability: CUDA Streams [\#1723](https://github.com/kokkos/kokkos/issues/1723)
+- Capability: CUDA Stream support for parallel\_reduce [\#2061](https://github.com/kokkos/kokkos/issues/2061)
+- Capability: Feature Request: TeamVectorRange [\#713](https://github.com/kokkos/kokkos/issues/713)
+- Capability: Adding HPX backend [\#2080](https://github.com/kokkos/kokkos/issues/2080)
+- Capability: TaskScheduler to have multiple queues [\#565](https://github.com/kokkos/kokkos/issues/565)
+- Capability: Support for additional reductions in ScatterView [\#1674](https://github.com/kokkos/kokkos/issues/1674)
+- Capability: Request: deep\_copy within parallel regions [\#689](https://github.com/kokkos/kokkos/issues/689)
+- Capability: Feature Request: `create\_mirror\_view\_without\_initializing` [\#1765](https://github.com/kokkos/kokkos/issues/1765)
+- View: Use SFINAE to restrict possible View type conversions [\#2127](https://github.com/kokkos/kokkos/issues/2127)
+- Deprecation: Deprecate ExecutionSpace::fence\(\) as static function and make it non-static [\#2140](https://github.com/kokkos/kokkos/issues/2140)
+- Deprecation: Deprecate LayoutTileLeft [\#2122](https://github.com/kokkos/kokkos/issues/2122)
+- Macros: KOKKOS\_RESTRICT defined for non-Intel compilers [\#2038](https://github.com/kokkos/kokkos/issues/2038)
+
+**Fixed bugs:**
+
+- Cuda: TeamThreadRange loop count on device is passed by reference to host static constexpr [\#1733](https://github.com/kokkos/kokkos/issues/1733)
+- Cuda: Build error with relocatable device code with CUDA 10.1 GCC 7.3 [\#2134](https://github.com/kokkos/kokkos/issues/2134)
+- Cuda: cudaFuncSetCacheConfig is setting CachePreferShared too often [\#2066](https://github.com/kokkos/kokkos/issues/2066)
+- Cuda: TeamPolicy doesn't throw then created with non-viable vector length and also doesn't backscale to viable one [\#2020](https://github.com/kokkos/kokkos/issues/2020)
+- Cuda: cudaMemcpy error for large league sizes on V100 [\#1991](https://github.com/kokkos/kokkos/issues/1991)
+- Cuda: illegal warp sync in parallel\_reduce by functor on Turing 75 [\#1958](https://github.com/kokkos/kokkos/issues/1958)
+- TeamThreadRange: Inconsistent results from TeamThreadRange reduction [\#1905](https://github.com/kokkos/kokkos/issues/1905)
+- Atomics: atomic\_fetch\_oper & atomic\_oper\_fetch don't build for complex\<float\> [\#1964](https://github.com/kokkos/kokkos/issues/1964)
+- Views: Kokkos randomread Views leak memory [\#2155](https://github.com/kokkos/kokkos/issues/2155)
+- ScatterView: LayoutLeft overload currently non-functional [\#2165](https://github.com/kokkos/kokkos/issues/2165)
+- KNL: With intel 17.2.174 illegal instruction in random number test [\#2078](https://github.com/kokkos/kokkos/issues/2078)
+- Bitset: Enable copy constructor on device [\#2094](https://github.com/kokkos/kokkos/issues/2094)
+- Examples: do not compile due to template deduction error \(multi\_fem\) [\#1928](https://github.com/kokkos/kokkos/issues/1928)
+
 ## [2.8.00](https://github.com/kokkos/kokkos/tree/2.8.00) (2019-02-05)
 [Full Changelog](https://github.com/kokkos/kokkos/compare/2.7.24...2.8.00)

View File

@@ -23,7 +23,7 @@ KOKKOS_DEBUG ?= "no"
 KOKKOS_USE_TPLS ?= ""
 # Options: c++11,c++14,c++1y,c++17,c++1z,c++2a
 KOKKOS_CXX_STANDARD ?= "c++11"
-# Options: aggressive_vectorization,disable_profiling,disable_deprecated_code,enable_large_mem_tests
+# Options: aggressive_vectorization,disable_profiling,enable_deprecated_code,disable_deprecated_code,enable_large_mem_tests
 KOKKOS_OPTIONS ?= ""
 # Option for setting ETI path
 KOKKOS_ETI_PATH ?= ${KOKKOS_PATH}/core/src/eti
@@ -33,11 +33,19 @@ KOKKOS_CMAKE ?= "no"
 # Options: force_uvm,use_ldg,rdc,enable_lambda
 KOKKOS_CUDA_OPTIONS ?= "enable_lambda"
 # Default settings specific options.
+# Options: enable_async_dispatch
+KOKKOS_HPX_OPTIONS ?= ""
 # Return a 1 if a string contains a substring and 0 if not
 # Note the search string should be without '"'
 # Example: $(call kokkos_has_string,"hwloc,librt",hwloc)
 # Will return a 1
 kokkos_has_string=$(if $(findstring $2,$1),1,0)
+# Returns 1 if the path exists, 0 otherwise
+# Example: $(call kokkos_path_exists,/path/to/file)
+# Will return a 1 if /path/to/file exists
+kokkos_path_exists=$(if $(wildcard $1),1,0)
 # Check for general settings.
 KOKKOS_INTERNAL_ENABLE_DEBUG := $(call kokkos_has_string,$(KOKKOS_DEBUG),yes)
@@ -58,6 +66,7 @@ KOKKOS_INTERNAL_ENABLE_COMPILER_WARNINGS := $(call kokkos_has_string,$(KOKKOS_OP
 KOKKOS_INTERNAL_OPT_RANGE_AGGRESSIVE_VECTORIZATION := $(call kokkos_has_string,$(KOKKOS_OPTIONS),aggressive_vectorization)
 KOKKOS_INTERNAL_DISABLE_PROFILING := $(call kokkos_has_string,$(KOKKOS_OPTIONS),disable_profiling)
 KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE := $(call kokkos_has_string,$(KOKKOS_OPTIONS),disable_deprecated_code)
+KOKKOS_INTERNAL_ENABLE_DEPRECATED_CODE := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_deprecated_code)
 KOKKOS_INTERNAL_DISABLE_DUALVIEW_MODIFY_CHECK := $(call kokkos_has_string,$(KOKKOS_OPTIONS),disable_dualview_modify_check)
 KOKKOS_INTERNAL_ENABLE_PROFILING_LOAD_PRINT := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_profile_load_print)
 KOKKOS_INTERNAL_ENABLE_LARGE_MEM_TESTS := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_large_mem_tests)
@@ -65,6 +74,7 @@ KOKKOS_INTERNAL_CUDA_USE_LDG := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),
 KOKKOS_INTERNAL_CUDA_USE_UVM := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),force_uvm)
 KOKKOS_INTERNAL_CUDA_USE_RELOC := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),rdc)
 KOKKOS_INTERNAL_CUDA_USE_LAMBDA := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),enable_lambda)
+KOKKOS_INTERNAL_HPX_ENABLE_ASYNC_DISPATCH := $(call kokkos_has_string,$(KOKKOS_HPX_OPTIONS),enable_async_dispatch)
 KOKKOS_INTERNAL_ENABLE_ETI := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_eti)
@@ -72,12 +82,15 @@ KOKKOS_INTERNAL_ENABLE_ETI := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_
 KOKKOS_INTERNAL_USE_OPENMP := $(call kokkos_has_string,$(subst OpenMPTarget,,$(KOKKOS_DEVICES)),OpenMP)
 KOKKOS_INTERNAL_USE_PTHREADS := $(call kokkos_has_string,$(KOKKOS_DEVICES),Pthread)
 KOKKOS_INTERNAL_USE_QTHREADS := $(call kokkos_has_string,$(KOKKOS_DEVICES),Qthreads)
+KOKKOS_INTERNAL_USE_HPX := $(call kokkos_has_string,$(KOKKOS_DEVICES),HPX)
 KOKKOS_INTERNAL_USE_SERIAL := $(call kokkos_has_string,$(KOKKOS_DEVICES),Serial)
 ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 0)
 ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 0)
 ifeq ($(KOKKOS_INTERNAL_USE_QTHREADS), 0)
-KOKKOS_INTERNAL_USE_SERIAL := 1
+ifeq ($(KOKKOS_INTERNAL_USE_HPX), 0)
+KOKKOS_INTERNAL_USE_SERIAL := 1
+endif
 endif
 endif
 endif
@@ -112,7 +125,7 @@ KOKKOS_INTERNAL_COMPILER_XL := $(strip $(shell $(CXX) -qversion 2
 KOKKOS_INTERNAL_COMPILER_CRAY := $(strip $(shell $(CXX) -craype-verbose 2>&1 | grep "CC-" | wc -l))
 KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell export OMPI_CXX=$(OMPI_CXX); export MPICH_CXX=$(MPICH_CXX); $(CXX) --version 2>&1 | grep nvcc | wc -l))
 KOKKOS_INTERNAL_COMPILER_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),clang)
-KOKKOS_INTERNAL_COMPILER_APPLE_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),apple-darwin)
+KOKKOS_INTERNAL_COMPILER_APPLE_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),Apple LLVM)
 KOKKOS_INTERNAL_COMPILER_HCC := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),HCC)
 # Check Host Compiler if using NVCC through nvcc_wrapper
@@ -283,9 +296,9 @@ KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLE
 + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER37) \
 + $(KOKKOS_INTERNAL_USE_ARCH_PASCAL61) \
 + $(KOKKOS_INTERNAL_USE_ARCH_PASCAL60) \
-+ $(KOKKOS_INTERNAL_USE_ARCH_VOLTA70) \
-+ $(KOKKOS_INTERNAL_USE_ARCH_VOLTA72) \
-+ $(KOKKOS_INTERNAL_USE_ARCH_TURING75) \
++ $(KOKKOS_INTERNAL_USE_ARCH_VOLTA70) \
++ $(KOKKOS_INTERNAL_USE_ARCH_VOLTA72) \
++ $(KOKKOS_INTERNAL_USE_ARCH_TURING75) \
 + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50) \
 + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52) \
 + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53))
@@ -300,19 +313,19 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
 + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER37) \
 + $(KOKKOS_INTERNAL_USE_ARCH_PASCAL61) \
 + $(KOKKOS_INTERNAL_USE_ARCH_PASCAL60) \
-+ $(KOKKOS_INTERNAL_USE_ARCH_VOLTA70) \
-+ $(KOKKOS_INTERNAL_USE_ARCH_VOLTA72) \
-+ $(KOKKOS_INTERNAL_USE_ARCH_TURING75) \
++ $(KOKKOS_INTERNAL_USE_ARCH_VOLTA70) \
++ $(KOKKOS_INTERNAL_USE_ARCH_VOLTA72) \
++ $(KOKKOS_INTERNAL_USE_ARCH_TURING75) \
 + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50) \
 + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52) \
 + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53))
 endif
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 1)
-ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
-ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
-KOKKOS_INTERNAL_NVCC_PATH := $(shell which nvcc)
-CUDA_PATH ?= $(KOKKOS_INTERNAL_NVCC_PATH:/bin/nvcc=)
+ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
+KOKKOS_INTERNAL_NVCC_PATH := $(shell which nvcc)
+CUDA_PATH ?= $(KOKKOS_INTERNAL_NVCC_PATH:/bin/nvcc=)
+ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
 KOKKOS_INTERNAL_OPENMPTARGET_FLAG := $(KOKKOS_INTERNAL_OPENMPTARGET_FLAG) --cuda-path=$(CUDA_PATH)
 endif
 endif
@@ -441,6 +454,10 @@ ifeq ($(KOKKOS_INTERNAL_USE_QTHREADS), 1)
 tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_QTHREADS")
 endif
+ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1)
+tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_HPX")
+endif
 ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
 tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_SERIAL")
 endif
@@ -559,9 +576,15 @@ ifeq ($(KOKKOS_INTERNAL_DISABLE_PROFILING), 0)
 tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_PROFILING")
 endif
-ifeq ($(KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE), 0)
-tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_DEPRECATED_CODE")
+ifeq ($(KOKKOS_INTERNAL_USE_HPX), 0)
+ifeq ($(KOKKOS_INTERNAL_ENABLE_DEPRECATED_CODE), 1)
+tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_DEPRECATED_CODE")
+endif
+ifeq ($(KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE), 0)
+tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_DEPRECATED_CODE")
+endif
 endif
 ifeq ($(KOKKOS_INTERNAL_ENABLE_ETI), 1)
 tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_ETI")
 endif
@@ -593,8 +616,13 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
 ifeq ($(KOKKOS_INTERNAL_CUDA_USE_RELOC), 1)
 tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE")
-KOKKOS_CXXFLAGS += --relocatable-device-code=true
-KOKKOS_LDFLAGS += --relocatable-device-code=true
+ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
+KOKKOS_CXXFLAGS += -fcuda-rdc
+KOKKOS_LDFLAGS += -fcuda-rdc
+else
+KOKKOS_CXXFLAGS += --relocatable-device-code=true
+KOKKOS_LDFLAGS += --relocatable-device-code=true
+endif
 endif
 ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
@@ -625,6 +653,12 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
 endif
 endif
+ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1)
+ifeq ($(KOKKOS_INTERNAL_HPX_ENABLE_ASYNC_DISPATCH), 1)
+tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_HPX_ASYNC_DISPATCH")
+endif
+endif
 # Add Architecture flags.
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV80), 1)
@@ -908,7 +942,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
 KOKKOS_INTERNAL_CUDA_ARCH_FLAG=--cuda-gpu-arch
 KOKKOS_CXXFLAGS += -x cuda
 else
-$(error Makefile.kokkos: CUDA is enabled but the compiler is neither NVCC nor Clang)
+$(error Makefile.kokkos: CUDA is enabled but the compiler is neither NVCC nor Clang (got version string $(KOKKOS_CXX_VERSION)) )
 endif
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER30), 1)
@@ -1058,10 +1092,18 @@ endif
 ifneq ($(KOKKOS_CMAKE), yes)
 KOKKOS_CXXFLAGS += -I$(CUDA_PATH)/include
 endif
-KOKKOS_LDFLAGS += -L$(CUDA_PATH)/lib64
-KOKKOS_CXXLDFLAGS += -L$(CUDA_PATH)/lib64
+ifeq ($(call kokkos_path_exists,$(CUDA_PATH)/lib64), 1)
+KOKKOS_LDFLAGS += -L$(CUDA_PATH)/lib64
+KOKKOS_CXXLDFLAGS += -L$(CUDA_PATH)/lib64
+KOKKOS_TPL_LIBRARY_DIRS += $(CUDA_PATH)/lib64
+else ifeq ($(call kokkos_path_exists,$(CUDA_PATH)/lib), 1)
+KOKKOS_LDFLAGS += -L$(CUDA_PATH)/lib
+KOKKOS_CXXLDFLAGS += -L$(CUDA_PATH)/lib
+KOKKOS_TPL_LIBRARY_DIRS += $(CUDA_PATH)/lib
+else
+$(error Can't find CUDA library directory: no lib64 or lib directory in $(CUDA_PATH))
+endif
 KOKKOS_TPL_INCLUDE_DIRS += $(CUDA_PATH)/include
-KOKKOS_TPL_LIBRARY_DIRS += $(CUDA_PATH)/lib64
 ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
 KOKKOS_CXXFLAGS += --cuda-path=$(CUDA_PATH)
 endif
@@ -1124,6 +1166,33 @@ ifeq ($(KOKKOS_INTERNAL_USE_QTHREADS), 1)
 KOKKOS_TPL_LIBRARY_NAMES += qthread
 endif
+ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1)
+KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/HPX/*.cpp)
+KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/HPX/*.hpp)
+ifneq ($(HPX_PATH),)
+ifeq ($(KOKKOS_INTERNAL_ENABLE_DEBUG), 1)
+KOKKOS_CXXFLAGS += $(shell PKG_CONFIG_PATH=$(HPX_PATH)/lib64/pkgconfig pkg-config --cflags hpx_application_debug)
+KOKKOS_CXXLDFLAGS += $(shell PKG_CONFIG_PATH=$(HPX_PATH)/lib64/pkgconfig pkg-config --libs hpx_application_debug)
+KOKKOS_LDFLAGS += $(shell PKG_CONFIG_PATH=$(HPX_PATH)/lib64/pkgconfig pkg-config --libs hpx_application_debug)
+else
+KOKKOS_CXXFLAGS += $(shell PKG_CONFIG_PATH=$(HPX_PATH)/lib64/pkgconfig pkg-config --cflags hpx_application)
+KOKKOS_CXXLDFLAGS += $(shell PKG_CONFIG_PATH=$(HPX_PATH)/lib64/pkgconfig pkg-config --libs hpx_application)
+KOKKOS_LDFLAGS += $(shell PKG_CONFIG_PATH=$(HPX_PATH)/lib64/pkgconfig pkg-config --libs hpx_application)
+endif
+else
+ifeq ($(KOKKOS_INTERNAL_ENABLE_DEBUG), 1)
+KOKKOS_CXXFLAGS += $(shell pkg-config --cflags hpx_application_debug)
+KOKKOS_CXXLDFLAGS += $(shell pkg-config --libs hpx_application_debug)
+KOKKOS_LDFLAGS += $(shell pkg-config --libs hpx_application_debug)
+else
+KOKKOS_CXXFLAGS += $(shell pkg-config --cflags hpx_application)
+KOKKOS_CXXLDFLAGS += $(shell pkg-config --libs hpx_application)
+KOKKOS_LDFLAGS += $(shell pkg-config --libs hpx_application)
+endif
+endif
+KOKKOS_TPL_LIBRARY_NAMES += hpx
+endif
 # Explicitly set the GCC Toolchain for Clang.
 ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
 KOKKOS_INTERNAL_GCC_PATH = $(shell which g++)

View File

@@ -30,6 +30,8 @@ Kokkos_SharedAlloc.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_
 $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_SharedAlloc.cpp
 Kokkos_MemoryPool.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_MemoryPool.cpp
 $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_MemoryPool.cpp
+Kokkos_HostSpace_deepcopy.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostSpace_deepcopy.cpp
+$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostSpace_deepcopy.cpp
 ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
 ifeq ($(KOKKOS_INTERNAL_ENABLE_ETI), 1)
@@ -38,8 +40,8 @@ endif
 endif
 ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
-Kokkos_Cuda_Impl.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Impl.cpp
-$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Impl.cpp
+Kokkos_Cuda_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Instance.cpp
+$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Instance.cpp
 Kokkos_CudaSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_CudaSpace.cpp
 $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_CudaSpace.cpp
 Kokkos_Cuda_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp
@@ -92,6 +94,13 @@ ifeq ($(KOKKOS_INTERNAL_ENABLE_ETI), 1)
 endif
 endif
+ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1)
+Kokkos_HPX.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HPX/Kokkos_HPX.cpp
+$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HPX/Kokkos_HPX.cpp
+Kokkos_HPX_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HPX/Kokkos_HPX_Task.cpp
+$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HPX/Kokkos_HPX_Task.cpp
+endif
 ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
 Kokkos_OpenMPTarget_Exec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp
 $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp

View File

@@ -1,5 +1,5 @@
 TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
 LIB_REQUIRED_PACKAGES KokkosCore KokkosContainers
-LIB_OPTIONAL_TPLS Pthread CUDA HWLOC
+LIB_OPTIONAL_TPLS Pthread CUDA HWLOC HPX
 TEST_OPTIONAL_TPLS CUSPARSE
 )

View File

@@ -328,6 +328,8 @@ public:
 parallel_for("Kokkos::Sort::Copy", Kokkos::RangePolicy<execution_space>(0,len),functor);
 }
+Kokkos::fence();
 }
 template<class ValuesViewType>
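The added `Kokkos::fence()` makes the sort blocking on asynchronous backends before the sorted data is consumed. A minimal usage sketch (hypothetical view name, default-configured Kokkos assumed):

```cpp
#include <Kokkos_Core.hpp>
#include <Kokkos_Sort.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    Kokkos::View<int*> keys("keys", 1000);
    // ... fill keys on the device ...
    Kokkos::sort(keys);  // with the added fence, all internal kernels and
                         // copies have completed once this call returns
  }
  Kokkos::finalize();
}
```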

View File

@@ -42,6 +42,12 @@ IF(Kokkos_ENABLE_OpenMP)
 )
 ENDIF()
+IF(Kokkos_ENABLE_HPX)
+LIST( APPEND SOURCES
+TestHPX.cpp
+)
+ENDIF()
 IF(Kokkos_ENABLE_Serial)
 LIST( APPEND SOURCES
 TestSerial.cpp

View File

@@ -49,6 +49,12 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
 TEST_TARGETS += test-openmp
 endif
+ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1)
+OBJ_HPX = TestHPX.o UnitTestMain.o gtest-all.o
+TARGETS += KokkosAlgorithms_UnitTest_HPX
+TEST_TARGETS += test-hpx
+endif
 ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
 OBJ_SERIAL = TestSerial.o UnitTestMain.o gtest-all.o
 TARGETS += KokkosAlgorithms_UnitTest_Serial
@@ -67,6 +73,9 @@ KokkosAlgorithms_UnitTest_Threads: $(OBJ_THREADS) $(KOKKOS_LINK_DEPENDS)
 KokkosAlgorithms_UnitTest_OpenMP: $(OBJ_OPENMP) $(KOKKOS_LINK_DEPENDS)
 $(LINK) $(EXTRA_PATH) $(OBJ_OPENMP) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_OpenMP
+KokkosAlgorithms_UnitTest_HPX: $(OBJ_HPX) $(KOKKOS_LINK_DEPENDS)
+$(LINK) $(EXTRA_PATH) $(OBJ_HPX) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_HPX
 KokkosAlgorithms_UnitTest_Serial: $(OBJ_SERIAL) $(KOKKOS_LINK_DEPENDS)
 $(LINK) $(EXTRA_PATH) $(OBJ_SERIAL) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_Serial
@@ -82,6 +91,9 @@ test-threads: KokkosAlgorithms_UnitTest_Threads
 test-openmp: KokkosAlgorithms_UnitTest_OpenMP
 ./KokkosAlgorithms_UnitTest_OpenMP
+test-hpx: KokkosAlgorithms_UnitTest_HPX
+./KokkosAlgorithms_UnitTest_HPX
 test-serial: KokkosAlgorithms_UnitTest_Serial
 ./KokkosAlgorithms_UnitTest_Serial

View File

@@ -0,0 +1,96 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Macros.hpp>
#ifdef KOKKOS_ENABLE_HPX
#include <gtest/gtest.h>
#include <Kokkos_Core.hpp>
//----------------------------------------------------------------------------
#include <TestRandom.hpp>
#include <TestSort.hpp>
#include <iomanip>
namespace Test {
class hpx : public ::testing::Test {
protected:
static void SetUpTestCase()
{
std::cout << std::setprecision(5) << std::scientific;
}
static void TearDownTestCase()
{
}
};
#define HPX_RANDOM_XORSHIFT64( num_draws ) \
TEST_F( hpx, Random_XorShift64 ) { \
Impl::test_random<Kokkos::Random_XorShift64_Pool<Kokkos::Experimental::HPX> >(num_draws); \
}
#define HPX_RANDOM_XORSHIFT1024( num_draws ) \
TEST_F( hpx, Random_XorShift1024 ) { \
Impl::test_random<Kokkos::Random_XorShift1024_Pool<Kokkos::Experimental::HPX> >(num_draws); \
}
#define HPX_SORT_UNSIGNED( size ) \
TEST_F( hpx, SortUnsigned ) { \
Impl::test_sort< Kokkos::Experimental::HPX, unsigned >(size); \
}
HPX_RANDOM_XORSHIFT64( 10240000 )
HPX_RANDOM_XORSHIFT1024( 10130144 )
HPX_SORT_UNSIGNED(171)
#undef HPX_RANDOM_XORSHIFT64
#undef HPX_RANDOM_XORSHIFT1024
#undef HPX_SORT_UNSIGNED
} // namespace test
#else
void KOKKOS_ALGORITHMS_UNITTESTS_TESTHPX_PREVENT_LINK_ERROR() {}
#endif

View File

@@ -225,9 +225,9 @@ void test_dynamic_view_sort(unsigned int n )
 Kokkos::Random_XorShift64_Pool<ExecutionSpace> g(1931);
 Kokkos::fill_random(keys_view,g,Kokkos::Random_XorShift64_Pool<ExecutionSpace>::generator_type::MAX_URAND);
-ExecutionSpace::fence();
+ExecutionSpace().fence();
 Kokkos::deep_copy(keys,keys_view);
-//ExecutionSpace::fence();
+//ExecutionSpace().fence();
 double sum_before = 0.0;
 double sum_after = 0.0;
@@ -237,9 +237,9 @@ void test_dynamic_view_sort(unsigned int n )
 Kokkos::sort(keys, 0 /* begin */ , n /* end */ );
-ExecutionSpace::fence(); // Need this fence to prevent BusError with Cuda
+ExecutionSpace().fence(); // Need this fence to prevent BusError with Cuda
 Kokkos::deep_copy( keys_view , keys );
-//ExecutionSpace::fence();
+//ExecutionSpace().fence();
 Kokkos::parallel_reduce(n,sum<ExecutionSpace, KeyType>(keys_view),sum_after);
 Kokkos::parallel_reduce(n-1,is_sorted_struct<ExecutionSpace, KeyType>(keys_view),sort_fails);
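This is the mechanical change behind the 2.9.00 changelog entry above, "Deprecate ExecutionSpace::fence() as static function and make it non-static": the tests now construct an execution-space instance and fence it. The pattern in miniature (a sketch, not taken from the Kokkos sources):

```cpp
#include <Kokkos_Core.hpp>

template <class ExecutionSpace>
void wait_for_space() {
  // Old, now-deprecated form:  ExecutionSpace::fence();
  // New form: fence a (default-constructed) instance, as in the tests above.
  ExecutionSpace().fence();
}
```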

View File

@@ -76,8 +76,20 @@ IF(KOKKOS_SEPARATE_LIBS)
 )
 foreach(lib IN LISTS KOKKOS_TPL_LIBRARY_NAMES)
-if ("${lib}" STREQUAL "cuda")
+if (("${lib}" STREQUAL "cuda") AND (NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang"))
 set(LIB_cuda "-lcuda")
+elseif ("${lib}" STREQUAL "hpx")
+find_package(HPX REQUIRED)
+if(${HPX_FOUND})
+target_link_libraries(kokkoscore PUBLIC ${HPX_LIBRARIES})
+target_link_libraries(kokkoscontainers PUBLIC ${HPX_LIBRARIES})
+target_link_libraries(kokkosalgorithms PUBLIC ${HPX_LIBRARIES})
+target_include_directories(kokkoscore PUBLIC ${HPX_INCLUDE_DIRS})
+target_include_directories(kokkoscontainers PUBLIC ${HPX_INCLUDE_DIRS})
+target_include_directories(kokkosalgorithms PUBLIC ${HPX_INCLUDE_DIRS})
+else()
+message(ERROR "HPX not found. Check the value of HPX_DIR (= ${HPX_DIR}) or CMAKE_PREFIX_PATH (= ${CMAKE_PREFIX_PATH}).")
+endif()
 else()
 find_library(LIB_${lib} ${lib} PATHS ${KOKKOS_TPL_LIBRARY_DIRS})
 endif()
@@ -158,8 +170,16 @@ ELSE()
 )
 foreach(lib IN LISTS KOKKOS_TPL_LIBRARY_NAMES)
-if ("${lib}" STREQUAL "cuda")
+if (("${lib}" STREQUAL "cuda") AND (NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang"))
 set(LIB_cuda "-lcuda")
+elseif ("${lib}" STREQUAL "hpx")
+find_package(HPX REQUIRED)
+if(${HPX_FOUND})
+target_link_libraries(kokkos PUBLIC ${HPX_LIBRARIES})
+target_include_directories(kokkos PUBLIC ${HPX_INCLUDE_DIRS})
+else()
+message(ERROR "HPX not found. Check the value of HPX_DIR (= ${HPX_DIR}) or CMAKE_PREFIX_PATH (= ${CMAKE_PREFIX_PATH}).")
+endif()
 else()
 find_library(LIB_${lib} ${lib} PATHS ${KOKKOS_TPL_LIBRARY_DIRS})
 endif()

View File

@@ -95,7 +95,7 @@ function(set_kokkos_cxx_compiler)
 message(FATAL_ERROR "Compiling CUDA code directly with Clang requires version 4.0.0 or higher.")
 endif()
 elseif(NOT INTERNAL_CXX_COMPILER_ID STREQUAL NVIDIA)
-message(FATAL_ERROR "Invalid compiler for CUDA. The compiler must be nvcc_wrapper or Clang.")
+message(FATAL_ERROR "Invalid compiler for CUDA. The compiler must be nvcc_wrapper or Clang, but compiler ID was ${INTERNAL_CXX_COMPILER_ID}")
 endif()
 endif()
View File

@@ -14,6 +14,7 @@ list(APPEND KOKKOS_INTERNAL_ENABLE_OPTIONS_LIST
 OpenMP
 Pthread
 Qthread
+HPX
 Cuda
 ROCm
 HWLOC
@@ -23,6 +24,7 @@ list(APPEND KOKKOS_INTERNAL_ENABLE_OPTIONS_LIST
 Cuda_Relocatable_Device_Code
 Cuda_UVM
 Cuda_LDG_Intrinsic
+HPX_ASYNC_DISPATCH
 Debug
 Debug_DualView_Modify_Check
 Debug_Bounds_Check
@@ -116,6 +118,7 @@ list(APPEND KOKKOS_DEVICES_LIST
 OpenMP # OpenMP
 Pthread # pthread
 Qthreads # qthreads
+HPX # HPX
 Serial # serial
 ROCm # Relocatable device code
 )
@@ -173,6 +176,19 @@ set(KOKKOS_INTERNAL_RELOCATABLE_DEVICE_CODE rdc)
 set(KOKKOS_INTERNAL_LAMBDA enable_lambda)
+#-------------------------------------------------------------------------------
+# List of possible Options for HPX
+#-------------------------------------------------------------------------------
+# From Makefile.kokkos: Options: enable_async_dispatch
+set(KOKKOS_HPX_OPTIONS_LIST)
+list(APPEND KOKKOS_HPX_OPTIONS_LIST
+ASYNC_DISPATCH # enable_async_dispatch
+)
+# Map of cmake variables to Makefile variables
+set(KOKKOS_INTERNAL_ENABLE_ASYNC_DISPATCH enable_async_dispatch)
 #-------------------------------------------------------------------------------
 #------------------------------- Create doc strings ----------------------------
 #-------------------------------------------------------------------------------
@@ -202,6 +218,11 @@ set(KOKKOS_SEPARATE_LIBS OFF CACHE BOOL "OFF = kokkos. ON = kokkoscore, kokkosc
 # Qthreads options.
 set(KOKKOS_QTHREADS_DIR "" CACHE PATH "Location of Qthreads library.")
+# HPX options.
+set(KOKKOS_HPX_DIR "" CACHE PATH "Location of HPX library.")
+# Whether to build separate libraries or now
+set(KOKKOS_SEPARATE_TESTS OFF CACHE BOOL "Provide unit test targets with finer granularity.")
 #-------------------------------------------------------------------------------
 #------------------------------- KOKKOS_DEVICES --------------------------------
@@ -215,6 +236,11 @@ IF(Trilinos_ENABLE_Kokkos)
 ELSE()
 set_kokkos_default_default(QTHREADS OFF)
 ENDIF()
+IF(TPL_ENABLE_HPX)
+set_kokkos_default_default(HPX ON)
+ELSE()
+set_kokkos_default_default(HPX OFF)
+ENDIF()
 IF(Trilinos_ENABLE_OpenMP)
 set_kokkos_default_default(OPENMP ${Trilinos_ENABLE_OpenMP})
 ELSE()
@@ -231,6 +257,7 @@ ELSE()
 set_kokkos_default_default(OPENMP OFF)
 set_kokkos_default_default(PTHREAD OFF)
 set_kokkos_default_default(QTHREAD OFF)
+set_kokkos_default_default(HPX OFF)
 set_kokkos_default_default(CUDA OFF)
 set_kokkos_default_default(ROCM OFF)
 ENDIF()
@@ -241,6 +268,7 @@ set(KOKKOS_ENABLE_SERIAL ${KOKKOS_INTERNAL_ENABLE_SERIAL_DEFAULT} CACHE BOOL "Wh
 set(KOKKOS_ENABLE_OPENMP ${KOKKOS_INTERNAL_ENABLE_OPENMP_DEFAULT} CACHE BOOL "Enable OpenMP support in Kokkos." FORCE)
 set(KOKKOS_ENABLE_PTHREAD ${KOKKOS_INTERNAL_ENABLE_PTHREAD_DEFAULT} CACHE BOOL "Enable Pthread support in Kokkos.")
 set(KOKKOS_ENABLE_QTHREADS ${KOKKOS_INTERNAL_ENABLE_QTHREADS_DEFAULT} CACHE BOOL "Enable Qthreads support in Kokkos.")
+set(KOKKOS_ENABLE_HPX ${KOKKOS_INTERNAL_ENABLE_HPX_DEFAULT} CACHE BOOL "Enable HPX support in Kokkos.")
 set(KOKKOS_ENABLE_CUDA ${KOKKOS_INTERNAL_ENABLE_CUDA_DEFAULT} CACHE BOOL "Enable CUDA support in Kokkos.")
 set(KOKKOS_ENABLE_ROCM ${KOKKOS_INTERNAL_ENABLE_ROCM_DEFAULT} CACHE BOOL "Enable ROCm support in Kokkos.")
@@ -343,6 +371,18 @@ set(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE ${KOKKOS_INTERNAL_ENABLE_CUDA_REL
 set(KOKKOS_ENABLE_CUDA_LAMBDA ${KOKKOS_INTERNAL_ENABLE_CUDA_LAMBDA_DEFAULT} CACHE BOOL "Enable lambdas for CUDA. (cuda option)")
+#-------------------------------------------------------------------------------
+#------------------------------- KOKKOS_HPX_OPTIONS ----------------------------
+#-------------------------------------------------------------------------------
+# HPX options.
+# Set Defaults
+set_kokkos_default_default(HPX_ASYNC_DISPATCH OFF)
+# Set actual options
+set(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH ${KOKKOS_INTERNAL_ENABLE_HPX_ASYNC_DISPATCH_DEFAULT} CACHE BOOL "Enable HPX async dispatch.")
 #-------------------------------------------------------------------------------
 #----------------------- HOST ARCH AND LEGACY TRIBITS --------------------------
 #-------------------------------------------------------------------------------
@@ -376,4 +416,3 @@ foreach(opt ${KOKKOS_INTERNAL_ENABLE_OPTIONS_LIST})
 SET(Kokkos_ENABLE_${opt} ${KOKKOS_ENABLE_${OPT}} CACHE BOOL "CamelCase Compatibility setting for KOKKOS_ENABLE_${OPT}")
 ENDIF()
 endforeach()

View File

@@ -198,6 +198,8 @@ if(KOKKOS_CMAKE_VERBOSE)
 message(STATUS " Host Parallel: Pthread")
 elseif(KOKKOS_ENABLE_QTHREADS)
 message(STATUS " Host Parallel: Qthreads")
+elseif(KOKKOS_ENABLE_HPX)
+message(STATUS " Host Parallel: HPX")
 else()
 message(STATUS " Host Parallel: None")
 endif()
@@ -244,6 +246,10 @@ if(KOKKOS_CMAKE_VERBOSE)
 message(STATUS " KOKKOS_MEMKIND_DIR: ${KOKKOS_MEMKIND_DIR}")
 endif()
+if(KOKKOS_HPX_DIR)
+message(STATUS " KOKKOS_HPX_DIR: ${KOKKOS_HPX_DIR}")
+endif()
 message(STATUS "")
 message(STATUS "Final kokkos settings variable:")
 message(STATUS " ${KOKKOS_SETTINGS}")

View File

@@ -9,6 +9,10 @@ IF(NOT DEFINED ${PROJECT_NAME}_ENABLE_OpenMP)
 SET(${PROJECT_NAME}_ENABLE_OpenMP OFF)
 ENDIF()
+IF(NOT DEFINED ${PROJECT_NAME}_ENABLE_HPX)
+SET(${PROJECT_NAME}_ENABLE_HPX OFF)
+ENDIF()
 IF(NOT DEFINED ${PROJECT_NAME}_ENABLE_DEBUG)
 SET(${PROJECT_NAME}_ENABLE_DEBUG OFF)
 ENDIF()
@@ -309,6 +313,10 @@ ENDFUNCTION()
 FUNCTION(TRIBITS_TPL_TENTATIVELY_ENABLE)
 ENDFUNCTION()
+FUNCTION(TRIBITS_ADD_ADVANCED_TEST)
+# TODO Write this
+ENDFUNCTION()
 FUNCTION(TRIBITS_ADD_EXECUTABLE_AND_TEST EXE_NAME)
 SET(options STANDARD_PASS_OUTPUT WILL_FAIL)

View File

@@ -1,5 +1,5 @@
 TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
 LIB_REQUIRED_PACKAGES KokkosCore
-LIB_OPTIONAL_TPLS Pthread CUDA HWLOC
+LIB_OPTIONAL_TPLS Pthread CUDA HWLOC HPX
 TEST_OPTIONAL_TPLS CUSPARSE
 )

View File

@@ -24,6 +24,10 @@ IF(Kokkos_ENABLE_OpenMP)
 LIST( APPEND SOURCES TestOpenMP.cpp)
 ENDIF()
+IF(Kokkos_ENABLE_HPX)
+LIST( APPEND SOURCES TestHPX.cpp)
+ENDIF()
 # Per #374, we always want to build this test, but we only want to run
 # it as a PERFORMANCE test. That's why we separate building the test
 # from running the test.

View File

@@ -49,6 +49,12 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
 TEST_TARGETS += test-openmp
 endif
+ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1)
+OBJ_HPX = TestHPX.o TestMain.o gtest-all.o
+TARGETS += KokkosContainers_PerformanceTest_HPX
+TEST_TARGETS += test-hpx
+endif
 KokkosContainers_PerformanceTest_Cuda: $(OBJ_CUDA) $(KOKKOS_LINK_DEPENDS)
 $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_CUDA) $(KOKKOS_LIBS) $(LIB) -o KokkosContainers_PerformanceTest_Cuda
@@ -61,6 +67,9 @@ KokkosContainers_PerformanceTest_Threads: $(OBJ_THREADS) $(KOKKOS_LINK_DEPENDS)
 KokkosContainers_PerformanceTest_OpenMP: $(OBJ_OPENMP) $(KOKKOS_LINK_DEPENDS)
 $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_OPENMP) $(KOKKOS_LIBS) $(LIB) -o KokkosContainers_PerformanceTest_OpenMP
+KokkosContainers_PerformanceTest_HPX: $(OBJ_HPX) $(KOKKOS_LINK_DEPENDS)
+$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_HPX) $(KOKKOS_LIBS) $(LIB) -o KokkosContainers_PerformanceTest_HPX
 test-cuda: KokkosContainers_PerformanceTest_Cuda
 ./KokkosContainers_PerformanceTest_Cuda
@@ -73,6 +82,9 @@ test-threads: KokkosContainers_PerformanceTest_Threads
 test-openmp: KokkosContainers_PerformanceTest_OpenMP
 ./KokkosContainers_PerformanceTest_OpenMP
+test-hpx: KokkosContainers_PerformanceTest_HPX
+./KokkosContainers_PerformanceTest_HPX
 build_all: $(TARGETS)
 test: $(TEST_TARGETS)

View File

@@ -197,7 +197,7 @@ void test_dynrankview_op_perf( const int par_size )
 timer.reset();
 Kokkos::RangePolicy<DeviceType> policy(0,par_size);
 Kokkos::parallel_for( policy , FunctorType(testview) );
-DeviceType::fence();
+DeviceType().fence();
 elapsed_time_view = timer.seconds();
 std::cout << " View time (init only): " << elapsed_time_view << std::endl;
@@ -205,7 +205,7 @@ void test_dynrankview_op_perf( const int par_size )
 timer.reset();
 Kokkos::View<double*,DeviceType> sumview("sumview",par_size);
 Kokkos::parallel_for( policy , typename FunctorType::SumComputationTest(testview, sumview) );
-DeviceType::fence();
+DeviceType().fence();
 elapsed_time_compview = timer.seconds();
 std::cout << " View sum computation time: " << elapsed_time_view << std::endl;
@@ -215,7 +215,7 @@ void test_dynrankview_op_perf( const int par_size )
 timer.reset();
 Kokkos::parallel_for( policy , FunctorStrideType(teststrideview) );
-DeviceType::fence();
+DeviceType().fence();
 elapsed_time_strideview = timer.seconds();
 std::cout << " Strided View time (init only): " << elapsed_time_strideview << std::endl;
 }
@@ -226,7 +226,7 @@ void test_dynrankview_op_perf( const int par_size )
 timer.reset();
 Kokkos::RangePolicy<DeviceType> policy(0,par_size);
 Kokkos::parallel_for( policy , FunctorType(testview) );
-DeviceType::fence();
+DeviceType().fence();
 elapsed_time_view_rank7 = timer.seconds();
 std::cout << " View Rank7 time (init only): " << elapsed_time_view_rank7 << std::endl;
 }
@@ -237,14 +237,14 @@ void test_dynrankview_op_perf( const int par_size )
 timer.reset();
 Kokkos::RangePolicy<DeviceType> policy(0,par_size);
 Kokkos::parallel_for( policy , FunctorType(testdrview) );
-DeviceType::fence();
+DeviceType().fence();
 elapsed_time_drview = timer.seconds();
 std::cout << " DynRankView time (init only): " << elapsed_time_drview << std::endl;
 timer.reset();
 Kokkos::DynRankView<double,DeviceType> sumview("sumview",par_size);
 Kokkos::parallel_for( policy , typename FunctorType::SumComputationTest(testdrview, sumview) );
-DeviceType::fence();
+DeviceType().fence();
 elapsed_time_compdrview = timer.seconds();
 std::cout << " DynRankView sum computation time: " << elapsed_time_compdrview << std::endl;

View File

@@ -192,7 +192,7 @@ void test_global_to_local_ids(unsigned num_ids)
 {
 generate_ids<Device> gen(local_2_global);
 }
-Device::fence();
+Device().fence();
 // generate
 elasped_time = timer.seconds();
 std::cout << elasped_time << ", ";
@@ -201,7 +201,7 @@ void test_global_to_local_ids(unsigned num_ids)
 {
 fill_map<Device> fill(global_2_local, local_2_global);
 }
-Device::fence();
+Device().fence();
 // fill
 elasped_time = timer.seconds();
@@ -214,7 +214,7 @@ void test_global_to_local_ids(unsigned num_ids)
 {
 find_test<Device> find(global_2_local, local_2_global,num_errors);
 }
-Device::fence();
+Device().fence();
 // find
 elasped_time = timer.seconds();
View File

@@ -0,0 +1,130 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ENABLE_HPX )
#include <gtest/gtest.h>
#include <Kokkos_Core.hpp>
#include <Kokkos_UnorderedMap.hpp>
#include <TestGlobal2LocalIds.hpp>
#include <TestUnorderedMapPerformance.hpp>
#include <TestDynRankView.hpp>
#include <TestScatterView.hpp>
#include <iomanip>
#include <sstream>
#include <string>
#include <fstream>
namespace Performance {
class hpx : public ::testing::Test {
protected:
static void SetUpTestCase()
{
std::cout << std::setprecision(5) << std::scientific;
Kokkos::initialize();
Kokkos::print_configuration( std::cout );
}
static void TearDownTestCase()
{
Kokkos::finalize();
}
};
TEST_F( hpx, dynrankview_perf )
{
std::cout << "HPX" << std::endl;
std::cout << " DynRankView vs View: Initialization Only " << std::endl;
test_dynrankview_op_perf<Kokkos::Experimental::HPX>( 8192 );
}
TEST_F( hpx, global_2_local)
{
std::cout << "HPX" << std::endl;
std::cout << "size, create, generate, fill, find" << std::endl;
for (unsigned i=Performance::begin_id_size; i<=Performance::end_id_size; i *= Performance::id_step)
test_global_to_local_ids<Kokkos::Experimental::HPX>(i);
}
TEST_F( hpx, unordered_map_performance_near)
{
unsigned num_hpx = 4;
std::ostringstream base_file_name;
base_file_name << "hpx-" << num_hpx << "-near";
Perf::run_performance_tests<Kokkos::Experimental::HPX,true>(base_file_name.str());
}
TEST_F( hpx, unordered_map_performance_far)
{
unsigned num_hpx = 4;
std::ostringstream base_file_name;
base_file_name << "hpx-" << num_hpx << "-far";
Perf::run_performance_tests<Kokkos::Experimental::HPX,false>(base_file_name.str());
}
TEST_F( hpx, scatter_view)
{
std::cout << "ScatterView data-duplicated test:\n";
Perf::test_scatter_view<Kokkos::Experimental::HPX, Kokkos::LayoutRight,
Kokkos::Experimental::ScatterDuplicated,
Kokkos::Experimental::ScatterNonAtomic>(10, 1000 * 1000);
//std::cout << "ScatterView atomics test:\n";
//Perf::test_scatter_view<Kokkos::Experimental::HPX, Kokkos::LayoutRight,
// Kokkos::Experimental::ScatterNonDuplicated,
// Kokkos::Experimental::ScatterAtomic>(10, 1000 * 1000);
}
} // namespace test
#else
void KOKKOS_CONTAINERS_PERFORMANCE_TESTS_TESTHPX_PREVENT_EMPTY_LINK_ERROR() {}
#endif

View File

@@ -83,6 +83,7 @@ void test_scatter_view(int m, int n)
 for (int k = 0; k < m; ++k) {
 Kokkos::parallel_for(policy, f2, "hand_coded_duplicate_scatter_view_test");
 }
+Kokkos::fence();
 auto t = timer.seconds();
 std::cout << "hand-coded test took " << t << " seconds\n";
 }
@@ -101,6 +102,7 @@ void test_scatter_view(int m, int n)
 for (int k = 0; k < m; ++k) {
 Kokkos::parallel_for(policy, f, "scatter_view_test");
 }
+Kokkos::fence();
 auto t = timer.seconds();
 std::cout << "test took " << t << " seconds\n";
 }

View File

@ -108,7 +108,7 @@ struct UnorderedMapTest
std::cout << std::setprecision(2) << std::fixed << std::setw(5) << (1e9*(seconds/(inserts))) << "; " << std::flush;
histogram.calculate();
Device::fence();
Device().fence();
}
void print(std::ostream & metrics_out, std::ostream & length_out, std::ostream & distance_out, std::ostream & block_distance_out)
@ -236,7 +236,7 @@ void run_performance_tests(std::string const & base_file_name)
uint32_t inserts = static_cast<uint32_t>(test_ratios[j]*(capacity));
std::cout << capacity << std::flush;
UnorderedMapTest<Device, Near> test(capacity, inserts*collisions[i], collisions[i]);
Device::fence();
Device().fence();
test.print(metrics_out, length_out, distance_out, block_distance_out);
}
std::cout << "\b\b " << std::endl;

View File

@ -107,22 +107,20 @@ public:
}
}
/// assignment
Bitset<Device> & operator = (Bitset<Device> const & rhs)
{
this->m_size = rhs.m_size;
this->m_last_block_mask = rhs.m_last_block_mask;
this->m_blocks = rhs.m_blocks;
KOKKOS_INLINE_FUNCTION
Bitset (const Bitset<Device>&) = default;
return *this;
}
KOKKOS_INLINE_FUNCTION
Bitset& operator= (const Bitset<Device>&) = default;
/// copy constructor
Bitset( Bitset<Device> const & rhs)
: m_size( rhs.m_size )
, m_last_block_mask( rhs.m_last_block_mask )
, m_blocks( rhs.m_blocks )
{}
KOKKOS_INLINE_FUNCTION
Bitset (Bitset<Device>&&) = default;
KOKKOS_INLINE_FUNCTION
Bitset& operator= (Bitset<Device>&&) = default;
KOKKOS_INLINE_FUNCTION
~Bitset () = default;
/// number of bits in the set
/// can be called from the host or the device

View File

@ -484,8 +484,8 @@ public:
}
}
if(std::is_same<typename t_host::memory_space,typename t_dev::memory_space>::value) {
t_dev::execution_space::fence();
t_host::execution_space::fence();
typename t_dev::execution_space().fence();
typename t_host::execution_space().fence();
}
}

View File

@ -75,7 +75,7 @@ struct DynRankDimTraits {
, const size_t N4
, const size_t N5
, const size_t N6
, const size_t N7 )
, const size_t /* N7 */)
{
return
( (N6 == unspecified && N5 == unspecified && N4 == unspecified && N3 == unspecified && N2 == unspecified && N1 == unspecified && N0 == unspecified) ? 0
@ -106,7 +106,7 @@ struct DynRankDimTraits {
// Extra overload to match that for specialize types v2
template <typename Layout, typename ... P>
KOKKOS_INLINE_FUNCTION
static size_t computeRank( const Kokkos::Impl::ViewCtorProp<P...>& prop, const Layout& layout )
static size_t computeRank( const Kokkos::Impl::ViewCtorProp<P...>& /* prop */, const Layout& layout )
{
return computeRank(layout);
}
@ -155,7 +155,7 @@ struct DynRankDimTraits {
// Extra overload to match that for specialize types
template <typename Traits, typename ... P>
KOKKOS_INLINE_FUNCTION
static typename std::enable_if< (std::is_same<typename Traits::array_layout , Kokkos::LayoutRight>::value || std::is_same<typename Traits::array_layout , Kokkos::LayoutLeft>::value || std::is_same<typename Traits::array_layout , Kokkos::LayoutStride>::value) , typename Traits::array_layout >::type createLayout( const Kokkos::Impl::ViewCtorProp<P...>& prop, const typename Traits::array_layout& layout )
static typename std::enable_if< (std::is_same<typename Traits::array_layout , Kokkos::LayoutRight>::value || std::is_same<typename Traits::array_layout , Kokkos::LayoutLeft>::value || std::is_same<typename Traits::array_layout , Kokkos::LayoutStride>::value) , typename Traits::array_layout >::type createLayout( const Kokkos::Impl::ViewCtorProp<P...>& /* prop */, const typename Traits::array_layout& layout )
{
return createLayout( layout );
}
@ -655,7 +655,7 @@ public:
const size_t dim_scalar = m_map.dimension_scalar();
const size_t bytes = this->span() / dim_scalar;
typedef Kokkos::View<DataType*, typename traits::array_layout, typename traits::device_type, Kokkos::MemoryTraits<Kokkos::Unmanaged | traits::memory_traits::RandomAccess | traits::memory_traits::Atomic> > tmp_view_type;
typedef Kokkos::View<DataType*, typename traits::array_layout, typename traits::device_type, Kokkos::MemoryTraits<traits::memory_traits::is_unmanaged | traits::memory_traits::is_random_access | traits::memory_traits::is_atomic> > tmp_view_type;
tmp_view_type rankone_view(this->data(), bytes, dim_scalar);
return rankone_view(i0);
}
@ -1060,7 +1060,7 @@ public:
}
// Copy the input allocation properties with possibly defaulted properties
alloc_prop prop( arg_prop );
alloc_prop prop_copy( arg_prop );
//------------------------------------------------------------
#if defined( KOKKOS_ENABLE_CUDA )
@ -1070,18 +1070,18 @@ public:
// Fence using the trait's execution space (which will be Kokkos::Cuda)
// to avoid incomplete type errors from using Kokkos::Cuda directly.
if ( std::is_same< Kokkos::CudaUVMSpace , typename traits::device_type::memory_space >::value ) {
traits::device_type::memory_space::execution_space::fence();
typename traits::device_type::memory_space::execution_space().fence();
}
#endif
//------------------------------------------------------------
Kokkos::Impl::SharedAllocationRecord<> *
record = m_map.allocate_shared( prop , Impl::DynRankDimTraits<typename traits::specialize>::template createLayout<traits, P...>(arg_prop, arg_layout) );
record = m_map.allocate_shared( prop_copy, Impl::DynRankDimTraits<typename traits::specialize>::template createLayout<traits, P...>(arg_prop, arg_layout) );
//------------------------------------------------------------
#if defined( KOKKOS_ENABLE_CUDA )
if ( std::is_same< Kokkos::CudaUVMSpace , typename traits::device_type::memory_space >::value ) {
traits::device_type::memory_space::execution_space::fence();
typename traits::device_type::memory_space::execution_space().fence();
}
#endif
//------------------------------------------------------------
@ -1609,7 +1609,7 @@ struct DynRankViewFill {
closure.execute();
execution_space::fence();
execution_space().fence();
}
};
@ -1650,6 +1650,7 @@ struct DynRankViewRemap {
typedef Kokkos::RangePolicy< ExecSpace > Policy ;
const Kokkos::Impl::ParallelFor< DynRankViewRemap , Policy > closure( *this , Policy( 0 , n0 ) );
closure.execute();
// Kokkos::fence(); // ??
}
KOKKOS_INLINE_FUNCTION

View File

@ -288,8 +288,8 @@ public:
>::type
resize_serial( IntType const & n )
{
typedef typename traits::value_type value_type ;
typedef value_type * value_pointer_type ;
typedef typename traits::value_type local_value_type ;
typedef local_value_type * value_pointer_type ;
const uintptr_t NC = ( n + m_chunk_mask ) >> m_chunk_shift ; // New total number of chunks needed for resize
@ -304,8 +304,8 @@ public:
if ( *pc < NC ) {
while ( *pc < NC ) {
m_chunks[*pc] = reinterpret_cast<value_pointer_type>
(
typename traits::memory_space().allocate( sizeof(value_type) << m_chunk_shift )
(
typename traits::memory_space().allocate( sizeof(local_value_type) << m_chunk_shift )
);
++*pc ;
}
@ -314,7 +314,7 @@ public:
while ( NC + 1 <= *pc ) {
--*pc ;
typename traits::memory_space().deallocate( m_chunks[*pc]
, sizeof(value_type) << m_chunk_shift );
, sizeof(local_value_type) << m_chunk_shift );
m_chunks[*pc] = 0 ;
}
}
@ -376,8 +376,8 @@ public:
closure.execute();
traits::execution_space::fence();
//Impl::ChunkArraySpace< typename traits::memory_space >::memory_space::execution_space::fence();
typename traits::execution_space().fence();
//Impl::ChunkArraySpace< typename traits::memory_space >::memory_space::execution_space().fence();
}
void construct_shared_allocation()

View File

@ -202,8 +202,8 @@ namespace Kokkos {
template <typename iType, typename std::enable_if< std::is_integral<iType>::value, iType>::type = 0>
KOKKOS_INLINE_FUNCTION
int64_t begin(const iType dimension) const {
return dimension < Rank ? m_begins[dimension] : 0;
int64_t begin(const iType local_dimension) const {
return local_dimension < Rank ? m_begins[local_dimension] : 0;
}
KOKKOS_INLINE_FUNCTION
@ -211,7 +211,9 @@ namespace Kokkos {
template <typename iType, typename std::enable_if< std::is_integral<iType>::value, iType>::type = 0>
KOKKOS_INLINE_FUNCTION
int64_t end(const iType dimension) const {return begin(dimension) + m_map.extent(dimension);}
int64_t end(const iType local_dimension) const {
return begin(local_dimension) + m_map.extent(local_dimension);
}
private:
@ -1068,7 +1070,7 @@ namespace Kokkos {
}
// Copy the input allocation properties with possibly defaulted properties
alloc_prop prop( arg_prop );
alloc_prop prop_copy( arg_prop );
//------------------------------------------------------------
#if defined( KOKKOS_ENABLE_CUDA )
@ -1078,18 +1080,18 @@ namespace Kokkos {
// Fence using the trait's execution space (which will be Kokkos::Cuda)
// to avoid incomplete type errors from using Kokkos::Cuda directly.
if ( std::is_same< Kokkos::CudaUVMSpace , typename traits::device_type::memory_space >::value ) {
traits::device_type::memory_space::execution_space::fence();
typename traits::device_type::memory_space::execution_space().fence();
}
#endif
//------------------------------------------------------------
Kokkos::Impl::SharedAllocationRecord<> *
record = m_map.allocate_shared( prop , arg_layout );
record = m_map.allocate_shared( prop_copy , arg_layout );
//------------------------------------------------------------
#if defined( KOKKOS_ENABLE_CUDA )
if ( std::is_same< Kokkos::CudaUVMSpace , typename traits::device_type::memory_space >::value ) {
traits::device_type::memory_space::execution_space::fence();
typename traits::device_type::memory_space::execution_space().fence();
}
#endif
//------------------------------------------------------------

View File

@ -57,9 +57,16 @@
namespace Kokkos {
namespace Experimental {
//TODO: replace this enum with the Kokkos::Sum, etc. reducers for parallel_reduce
/*
* Reduction Type list
* - These correspond to a subset of the reducers in parallel_reduce
* - See the implementations of ScatterValue for details.
*/
enum : int {
ScatterSum,
ScatterProd,
ScatterMax,
ScatterMin,
};
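// Minimal usage sketch, assuming the parameter order the unit tests below
// use, i.e. ScatterView<DataType, Layout, ExecSpace, Op, duplication, contribution>:
//   Kokkos::Experimental::ScatterView
//     < double*[3], Kokkos::LayoutRight, Kokkos::DefaultExecutionSpace
//     , Kokkos::Experimental::ScatterMax
//     > sv("sv", n);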
enum : int {
@ -114,6 +121,21 @@ struct DefaultContribution<Kokkos::OpenMP, Kokkos::Experimental::ScatterDuplicat
};
#endif
#ifdef KOKKOS_ENABLE_HPX
template <>
struct DefaultDuplication<Kokkos::Experimental::HPX> {
enum : int { value = Kokkos::Experimental::ScatterDuplicated };
};
template <>
struct DefaultContribution<Kokkos::Experimental::HPX, Kokkos::Experimental::ScatterNonDuplicated> {
enum : int { value = Kokkos::Experimental::ScatterAtomic };
};
template <>
struct DefaultContribution<Kokkos::Experimental::HPX, Kokkos::Experimental::ScatterDuplicated> {
enum : int { value = Kokkos::Experimental::ScatterNonAtomic };
};
#endif
#ifdef KOKKOS_ENABLE_THREADS
template <>
struct DefaultDuplication<Kokkos::Threads> {
@ -144,39 +166,277 @@ struct DefaultContribution<Kokkos::Cuda, Kokkos::Experimental::ScatterDuplicated
};
#endif
/* ScatterValue is the object returned by the access operator() of ScatterAccess,
similar to that returned by an Atomic View, it wraps Kokkos::atomic_add with convenient
operator+=, etc. */
/* ScatterValue <Op=ScatterSum, contribution=ScatterNonAtomic> is the object returned by the access operator() of ScatterAccess.
This class inherits from the Sum<> reducer and wraps join(dest, src) with convenient operator+=, etc.
Note the addition of update(ValueType const& rhs) and reset() so that all reducers share common functions
(see ReduceDuplicates and ResetDuplicates). */
template <typename ValueType, int Op, int contribution>
struct ScatterValue;
template <typename ValueType>
struct ScatterValue<ValueType, Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonAtomic> {
struct ScatterValue<ValueType, Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonAtomic> :
Sum<ValueType,Kokkos::DefaultExecutionSpace> {
public:
KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ValueType& value_in) : value( value_in ) {}
KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ScatterValue&& other) : value( other.value ) {}
KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ValueType& value_in) :
Sum<ValueType,Kokkos::DefaultExecutionSpace>(value_in)
{}
KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ScatterValue&& other) :
Sum<ValueType,Kokkos::DefaultExecutionSpace>(other.reference())
{}
KOKKOS_FORCEINLINE_FUNCTION void operator+=(ValueType const& rhs) {
value += rhs;
this->join( this->reference(), rhs );
}
KOKKOS_FORCEINLINE_FUNCTION void operator-=(ValueType const& rhs) {
value -= rhs;
this->join( this->reference(), -rhs );
}
KOKKOS_FORCEINLINE_FUNCTION void update(ValueType const& rhs) {
this->join( this->reference(), rhs );
}
KOKKOS_FORCEINLINE_FUNCTION void reset() {
this->init( this->reference() );
}
private:
ValueType& value;
};
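// In a kernel this is reached through a ScatterAccess handle, e.g. (as in
// the ScatterSum unit test below):
//   auto a = scatter_view.access();
//   a(k, 0) += 4.2;  // forwards to Sum<>::join via operator+=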
/* ScatterValue <Op=ScatterSum, contribution=ScatterAtomic> is the object returned by the access operator()
* of ScatterAccess. Similar to that returned by an Atomic View, it wraps Kokkos::atomic_add with convenient
operator+=, etc. This version also has the update(rhs) and reset() functions. */
template <typename ValueType>
struct ScatterValue<ValueType, Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterAtomic> {
struct ScatterValue<ValueType, Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterAtomic> :
Sum<ValueType,Kokkos::DefaultExecutionSpace> {
public:
KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ValueType& value_in) : value( value_in ) {}
KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ValueType& value_in) :
Sum<ValueType,Kokkos::DefaultExecutionSpace>(value_in)
{}
KOKKOS_FORCEINLINE_FUNCTION void operator+=(ValueType const& rhs) {
Kokkos::atomic_add(&value, rhs);
this->join(this->reference(), rhs);
}
KOKKOS_FORCEINLINE_FUNCTION void operator-=(ValueType const& rhs) {
Kokkos::atomic_add(&value, -rhs);
this->join(this->reference(), -rhs);
}
private:
ValueType& value;
KOKKOS_INLINE_FUNCTION
void join(ValueType& dest, const ValueType& src) const {
Kokkos::atomic_add(&dest, src);
}
KOKKOS_INLINE_FUNCTION
void join(volatile ValueType& dest, const volatile ValueType& src) const {
Kokkos::atomic_add(&dest, src);
}
KOKKOS_FORCEINLINE_FUNCTION void update(ValueType const& rhs) {
this->join( this->reference(), rhs );
}
KOKKOS_FORCEINLINE_FUNCTION void reset() {
this->init( this->reference() );
}
};
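// The atomic path is selected per handle, e.g. (as in the unit tests below):
//   auto aa = scatter_view.template access<Kokkos::Experimental::ScatterAtomic>();
//   aa(k, 1) += 2.0;  // join() here performs Kokkos::atomic_add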
/* ScatterValue <Op=ScatterProd, contribution=ScatterNonAtomic> is the object returned by the access operator() of ScatterAccess.
This class inherits from the Prod<> reducer and wraps join(dest, src) with convenient operator*=, etc.
Note the addition of update(ValueType const& rhs) and reset() so that all reducers share common functions
(see ReduceDuplicates and ResetDuplicates). */
template <typename ValueType>
struct ScatterValue<ValueType, Kokkos::Experimental::ScatterProd, Kokkos::Experimental::ScatterNonAtomic> :
Prod<ValueType,Kokkos::DefaultExecutionSpace> {
public:
KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ValueType& value_in) :
Prod<ValueType,Kokkos::DefaultExecutionSpace>(value_in)
{}
KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ScatterValue&& other) :
Prod<ValueType,Kokkos::DefaultExecutionSpace>(other.reference())
{}
KOKKOS_FORCEINLINE_FUNCTION void operator*=(ValueType const& rhs) {
this->join( this->reference(), rhs );
}
KOKKOS_FORCEINLINE_FUNCTION void operator/=(ValueType const& rhs) {
this->join( this->reference(), static_cast<ValueType>(1)/rhs );
}
KOKKOS_FORCEINLINE_FUNCTION void update(ValueType const& rhs) {
this->join( this->reference(), rhs );
}
KOKKOS_FORCEINLINE_FUNCTION void reset() {
this->init( this->reference() );
}
};
/* ScatterValue <Op=ScatterProd, contribution=ScatterAtomic> is the object returned by the access operator()
* of ScatterAccess. Similar to that returned by an Atomic View, it wraps an atomic_prod with convenient
operator*=, etc. atomic_prod uses atomic_compare_exchange. This version also has the update(rhs) and reset() functions. */
template <typename ValueType>
struct ScatterValue<ValueType, Kokkos::Experimental::ScatterProd, Kokkos::Experimental::ScatterAtomic> :
Prod<ValueType,Kokkos::DefaultExecutionSpace> {
public:
KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ValueType& value_in) :
Prod<ValueType,Kokkos::DefaultExecutionSpace>(value_in)
{}
KOKKOS_FORCEINLINE_FUNCTION void operator*=(ValueType const& rhs) {
this->join(this->reference(), rhs);
}
KOKKOS_FORCEINLINE_FUNCTION void operator/=(ValueType const& rhs) {
this->join(this->reference(), static_cast<ValueType>(1)/rhs);
}
KOKKOS_FORCEINLINE_FUNCTION
void atomic_prod(ValueType & dest, const ValueType& src) const {
bool success = false;
while(!success) {
ValueType dest_old = dest;
ValueType dest_new = dest_old * src;
dest_new = Kokkos::atomic_compare_exchange<ValueType>(&dest,dest_old,dest_new);
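// atomic_compare_exchange returns the value observed at &dest; the loop
// retries until that observed value matches the dest_old this attempt was
// based on (the match is tested here with a relative tolerance rather than
// bitwise equality).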
success = ( (dest_new - dest_old)/dest_old <= 1e-15 );
}
}
KOKKOS_INLINE_FUNCTION
void join(ValueType& dest, const ValueType& src) const {
atomic_prod(dest, src);
}
KOKKOS_INLINE_FUNCTION
void join(volatile ValueType& dest, const volatile ValueType& src) const {
atomic_prod(dest, src);
}
KOKKOS_FORCEINLINE_FUNCTION void update(ValueType const& rhs) {
this->join( this->reference(), rhs );
}
KOKKOS_FORCEINLINE_FUNCTION void reset() {
this->init( this->reference() );
}
};
/* ScatterValue <Op=ScatterMin, contribution=ScatterNonAtomic> is the object returned by the access operator() of ScatterAccess.
This class inherits from the Min<> reducer and wraps join(dest, src) with a convenient update(rhs).
update(ValueType const& rhs) and reset() are added so that all reducers share a common update function
(see ReduceDuplicates and ResetDuplicates). */
template <typename ValueType>
struct ScatterValue<ValueType, Kokkos::Experimental::ScatterMin, Kokkos::Experimental::ScatterNonAtomic> :
Min<ValueType,Kokkos::DefaultExecutionSpace> {
public:
KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ValueType& value_in) :
Min<ValueType,Kokkos::DefaultExecutionSpace>(value_in)
{}
KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ScatterValue&& other) :
Min<ValueType,Kokkos::DefaultExecutionSpace>(other.reference())
{}
KOKKOS_FORCEINLINE_FUNCTION void update(ValueType const& rhs) {
this->join( this->reference(), rhs );
}
KOKKOS_FORCEINLINE_FUNCTION void reset() {
this->init( this->reference() );
}
};
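// Min contributions go through update(), e.g. (as in the unit tests below,
// with 'candidate' a placeholder for the value being offered):
//   scatter_access(k, 0).update(candidate);  // keeps the smaller of the two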
/* ScatterValue <Op=ScatterMin, contribution=ScatterAtomic> is the object returned by the access operator()
* of ScatterAccess. Similar to that returned by an Atomic View, it wraps an atomic_min with the update(rhs)
function. atomic_min uses atomic_compare_exchange. This version also has the reset() function. */
template <typename ValueType>
struct ScatterValue<ValueType, Kokkos::Experimental::ScatterMin, Kokkos::Experimental::ScatterAtomic> :
Min<ValueType,Kokkos::DefaultExecutionSpace> {
public:
KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ValueType& value_in) :
Min<ValueType,Kokkos::DefaultExecutionSpace>(value_in)
{}
KOKKOS_FORCEINLINE_FUNCTION
void atomic_min(ValueType & dest, const ValueType& src) const {
bool success = false;
while(!success) {
ValueType dest_old = dest;
ValueType dest_new = ( dest_old > src ) ? src : dest_old;
dest_new = Kokkos::atomic_compare_exchange<ValueType>(&dest,dest_old,dest_new);
success = ( (dest_new - dest_old)/dest_old <= 1e-15 );
}
}
KOKKOS_INLINE_FUNCTION
void join(ValueType& dest, const ValueType& src) const {
atomic_min(dest, src);
}
KOKKOS_INLINE_FUNCTION
void join(volatile ValueType& dest, const volatile ValueType& src) const {
atomic_min(dest, src);
}
KOKKOS_FORCEINLINE_FUNCTION void update(ValueType const& rhs) {
this->join( this->reference(), rhs );
}
KOKKOS_FORCEINLINE_FUNCTION void reset() {
this->init( this->reference() );
}
};
/* ScatterValue <Op=ScatterMax, contribution=ScatterNonAtomic> is the object returned by the access operator() of ScatterAccess.
This class inherits from the Max<> reducer and wraps join(dest, src) with a convenient update(rhs).
update(ValueType const& rhs) and reset() are added so that all reducers share a common update function
(see ReduceDuplicates and ResetDuplicates). */
template <typename ValueType>
struct ScatterValue<ValueType, Kokkos::Experimental::ScatterMax, Kokkos::Experimental::ScatterNonAtomic> :
Max<ValueType,Kokkos::DefaultExecutionSpace> {
public:
KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ValueType& value_in) :
Max<ValueType,Kokkos::DefaultExecutionSpace>(value_in)
{}
KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ScatterValue&& other) :
Max<ValueType,Kokkos::DefaultExecutionSpace>(other.reference())
{}
KOKKOS_FORCEINLINE_FUNCTION void update(ValueType const& rhs) {
this->join( this->reference(), rhs );
}
KOKKOS_FORCEINLINE_FUNCTION void reset() {
this->init( this->reference() );
}
};
/* ScatterValue <Op=ScatterMax, contribution=ScatterAtomic> is the object returned by the access operator()
* of ScatterAccess. Similar to that returned by an Atomic View, it wraps an atomic_max with the update(rhs)
function. atomic_max uses atomic_compare_exchange. This version also has the reset() function. */
template <typename ValueType>
struct ScatterValue<ValueType, Kokkos::Experimental::ScatterMax, Kokkos::Experimental::ScatterAtomic> :
Max<ValueType,Kokkos::DefaultExecutionSpace> {
public:
KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ValueType& value_in) :
Max<ValueType,Kokkos::DefaultExecutionSpace>(value_in)
{}
KOKKOS_FORCEINLINE_FUNCTION
void atomic_max(ValueType & dest, const ValueType& src) const {
bool success = false;
while(!success) {
ValueType dest_old = dest;
ValueType dest_new = ( dest_old < src ) ? src : dest_old;
dest_new = Kokkos::atomic_compare_exchange<ValueType>(&dest,dest_old,dest_new);
success = ( (dest_new - dest_old)/dest_old <= 1e-15 );
}
}
KOKKOS_INLINE_FUNCTION
void join(ValueType& dest, const ValueType& src) const {
atomic_max(dest, src);
}
KOKKOS_INLINE_FUNCTION
void join(volatile ValueType& dest, const volatile ValueType& src) const {
atomic_max(dest, src);
}
KOKKOS_FORCEINLINE_FUNCTION void update(ValueType const& rhs) {
this->join( this->reference(), rhs );
}
KOKKOS_FORCEINLINE_FUNCTION void reset() {
this->init( this->reference() );
}
};
/* DuplicatedDataType, given a View DataType, will create a new DataType
@ -226,6 +486,18 @@ struct DuplicatedDataType<T*, Kokkos::LayoutLeft> {
typedef typename DuplicatedDataType<T, Kokkos::LayoutLeft>::value_type* value_type;
};
/* Insert integer argument pack into array */
template<class T>
void args_to_array(size_t* array, int pos, T dim0) {
array[pos] = dim0;
}
template<class T, class ... Dims>
void args_to_array(size_t* array, int pos, T dim0, Dims ... dims) {
array[pos] = dim0;
args_to_array(array,pos+1,dims...);
}
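// e.g. args_to_array(arg_N, 0, 10, 20) sets arg_N[0] = 10 and arg_N[1] = 20,
// peeling one dimension off the pack per recursive call.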
/* Slice is just responsible for stuffing the correct number of Kokkos::ALL
arguments on the correct side of the index in a call to subview() to get a
subview where the index specified is the largest-stride one. */
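// e.g. for a rank-3 LayoutLeft duplicated view, slicing on thread id t
// amounts to Kokkos::subview(v, Kokkos::ALL, Kokkos::ALL, t); for
// LayoutRight the index goes first instead.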
@ -304,21 +576,26 @@ struct ReduceDuplicatesBase {
}
};
template <typename ExecSpace, typename ValueType>
struct ReduceDuplicates<ExecSpace, ValueType, Kokkos::Experimental::ScatterSum> :
public ReduceDuplicatesBase<ExecSpace, ValueType, Kokkos::Experimental::ScatterSum>
/* ReduceDuplicates -- Perform reduction on destination array using strided source
* Use ScatterValue<> specific to operation to wrap destination array so that
* the reduction operation can be accessed via the update(rhs) function */
template <typename ExecSpace, typename ValueType, int Op>
struct ReduceDuplicates :
public ReduceDuplicatesBase<ExecSpace, ValueType, Op>
{
typedef ReduceDuplicatesBase<ExecSpace, ValueType, Kokkos::Experimental::ScatterSum> Base;
typedef ReduceDuplicatesBase<ExecSpace, ValueType, Op> Base;
ReduceDuplicates(ValueType const* src_in, ValueType* dst_in, size_t stride_in, size_t start_in, size_t n_in, std::string const& name):
Base(src_in, dst_in, stride_in, start_in, n_in, name)
{}
KOKKOS_FORCEINLINE_FUNCTION void operator()(size_t i) const {
for (size_t j = Base::start; j < Base::n; ++j) {
Base::dst[i] += Base::src[i + Base::stride * j];
ScatterValue<ValueType, Op, Kokkos::Experimental::ScatterNonAtomic> sv(Base::dst[i]);
sv.update( Base::src[i + Base::stride * j] );
}
}
};
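// In effect, for each destination index i this folds every duplicate slot
// into dst[i]: dst[i] = op(dst[i], src[i + stride*j]) for j in [start, n).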
template <typename ExecSpace, typename ValueType, int Op>
struct ResetDuplicates;
@ -347,19 +624,24 @@ struct ResetDuplicatesBase {
}
};
template <typename ExecSpace, typename ValueType>
struct ResetDuplicates<ExecSpace, ValueType, Kokkos::Experimental::ScatterSum> :
public ResetDuplicatesBase<ExecSpace, ValueType, Kokkos::Experimental::ScatterSum>
/* ResetDuplicates -- Perform reset on destination array
* Use ScatterValue<> specific to operation to wrap destination array so that
* the reset operation can be accessed via the reset() function */
template <typename ExecSpace, typename ValueType, int Op>
struct ResetDuplicates :
public ResetDuplicatesBase<ExecSpace, ValueType, Op>
{
typedef ResetDuplicatesBase<ExecSpace, ValueType, Kokkos::Experimental::ScatterSum> Base;
typedef ResetDuplicatesBase<ExecSpace, ValueType, Op> Base;
ResetDuplicates(ValueType* data_in, size_t size_in, std::string const& name):
Base(data_in, size_in, name)
{}
KOKKOS_FORCEINLINE_FUNCTION void operator()(size_t i) const {
Base::data[i] = Kokkos::reduction_identity<ValueType>::sum();
ScatterValue<ValueType, Op, Kokkos::Experimental::ScatterNonAtomic> sv(Base::data[i]);
sv.reset();
}
};
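// reset() writes the identity of the op into every duplicate slot via the
// reducer's init(): 0 for ScatterSum, 1 for ScatterProd, the type's largest
// value for ScatterMin and its lowest for ScatterMax.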
}}} // Kokkos::Impl::Experimental
namespace Kokkos {
@ -519,12 +801,22 @@ public:
typedef Kokkos::Impl::Experimental::ScatterValue<
original_value_type, Op, override_contribution> value_type;
KOKKOS_INLINE_FUNCTION
ScatterAccess() :
view(view_type()) {
}
KOKKOS_INLINE_FUNCTION
ScatterAccess(view_type const& view_in)
: view(view_in)
{
}
KOKKOS_INLINE_FUNCTION
~ScatterAccess()
{
}
template <typename ... Args>
KOKKOS_FORCEINLINE_FUNCTION
value_type operator()(Args ... args) const {
@ -608,7 +900,7 @@ public:
}
template <int override_contribution = contribution>
inline
KOKKOS_FORCEINLINE_FUNCTION
ScatterAccess<DataType, Op, ExecSpace, Kokkos::LayoutRight, ScatterDuplicated, contribution, override_contribution>
access() const {
return ScatterAccess<DataType, Op, ExecSpace, Kokkos::LayoutRight, ScatterDuplicated, contribution, override_contribution>{*this};
@ -729,14 +1021,14 @@ public:
: unique_token()
{
size_t arg_N[8] = {
original_view.extent(0),
original_view.extent(1),
original_view.extent(2),
original_view.extent(3),
original_view.extent(4),
original_view.extent(5),
original_view.extent(6),
0
original_view.rank>0?original_view.extent(0):KOKKOS_IMPL_CTOR_DEFAULT_ARG,
original_view.rank>1?original_view.extent(1):KOKKOS_IMPL_CTOR_DEFAULT_ARG,
original_view.rank>2?original_view.extent(2):KOKKOS_IMPL_CTOR_DEFAULT_ARG,
original_view.rank>3?original_view.extent(3):KOKKOS_IMPL_CTOR_DEFAULT_ARG,
original_view.rank>4?original_view.extent(4):KOKKOS_IMPL_CTOR_DEFAULT_ARG,
original_view.rank>5?original_view.extent(5):KOKKOS_IMPL_CTOR_DEFAULT_ARG,
original_view.rank>6?original_view.extent(6):KOKKOS_IMPL_CTOR_DEFAULT_ARG,
KOKKOS_IMPL_CTOR_DEFAULT_ARG
};
arg_N[internal_view_type::rank - 1] = unique_token.size();
internal_view = internal_view_type(
@ -748,14 +1040,28 @@ public:
}
template <typename ... Dims>
ScatterView(std::string const& name, Dims ... dims)
: internal_view(Kokkos::ViewAllocateWithoutInitializing(name), dims ..., unique_token.size())
{
ScatterView(std::string const& name, Dims ... dims) {
original_view_type original_view;
size_t arg_N[8] = {
original_view.rank>0?original_view.static_extent(0):KOKKOS_IMPL_CTOR_DEFAULT_ARG,
original_view.rank>1?original_view.static_extent(1):KOKKOS_IMPL_CTOR_DEFAULT_ARG,
original_view.rank>2?original_view.static_extent(2):KOKKOS_IMPL_CTOR_DEFAULT_ARG,
original_view.rank>3?original_view.static_extent(3):KOKKOS_IMPL_CTOR_DEFAULT_ARG,
original_view.rank>4?original_view.static_extent(4):KOKKOS_IMPL_CTOR_DEFAULT_ARG,
original_view.rank>5?original_view.static_extent(5):KOKKOS_IMPL_CTOR_DEFAULT_ARG,
original_view.rank>6?original_view.static_extent(6):KOKKOS_IMPL_CTOR_DEFAULT_ARG,
KOKKOS_IMPL_CTOR_DEFAULT_ARG
};
Kokkos::Impl::Experimental::args_to_array(arg_N,0,dims ...);
arg_N[internal_view_type::rank - 1] = unique_token.size();
internal_view = internal_view_type(Kokkos::ViewAllocateWithoutInitializing(name),
arg_N[0], arg_N[1], arg_N[2], arg_N[3],
arg_N[4], arg_N[5], arg_N[6], arg_N[7]);
reset();
}
template <int override_contribution = contribution>
inline
KOKKOS_FORCEINLINE_FUNCTION
ScatterAccess<DataType, Op, ExecSpace, Kokkos::LayoutLeft, ScatterDuplicated, contribution, override_contribution>
access() const {
return ScatterAccess<DataType, Op, ExecSpace, Kokkos::LayoutLeft, ScatterDuplicated, contribution, override_contribution>{*this};
@ -770,9 +1076,13 @@ public:
}
template <typename ... RP>
void contribute_into(View<DataType, RP...> const& dest) const
void contribute_into(View<RP...> const& dest) const
{
typedef View<DataType, RP...> dest_type;
typedef View<RP...> dest_type;
static_assert(std::is_same<
typename dest_type::value_type,
typename original_view_type::non_const_value_type>::value,
"ScatterView deep_copy destination has wrong value_type");
static_assert(std::is_same<
typename dest_type::array_layout,
Kokkos::LayoutLeft>::value,
@ -891,12 +1201,14 @@ public:
typedef Kokkos::Impl::Experimental::ScatterValue<
original_value_type, Op, override_contribution> value_type;
inline ScatterAccess(view_type const& view_in)
KOKKOS_FORCEINLINE_FUNCTION
ScatterAccess(view_type const& view_in)
: view(view_in)
, thread_id(view_in.unique_token.acquire()) {
}
inline ~ScatterAccess() {
KOKKOS_FORCEINLINE_FUNCTION
~ScatterAccess() {
if (thread_id != ~thread_id_type(0)) view.unique_token.release(thread_id);
}
@ -926,8 +1238,9 @@ private:
public:
// do need to allow moves though, for the common
// auto b = a.access();
// that assignment turns into a move constructor call
inline ScatterAccess(ScatterAccess&& other)
// that assignment turns into a move constructor call
KOKKOS_FORCEINLINE_FUNCTION
ScatterAccess(ScatterAccess&& other)
: view(other.view)
, thread_id(other.thread_id)
{

View File

@ -437,9 +437,9 @@ public:
{
bool result = !erasable();
if (is_insertable_map && result) {
execution_space::fence();
execution_space().fence();
set_flag(erasable_idx);
execution_space::fence();
execution_space().fence();
}
return result;
}
@ -448,10 +448,10 @@ public:
{
bool result = erasable();
if (is_insertable_map && result) {
execution_space::fence();
execution_space().fence();
Impl::UnorderedMapErase<declared_map_type> f(*this);
f.apply();
execution_space::fence();
execution_space().fence();
reset_flag(erasable_idx);
}
return result;

View File

@ -121,12 +121,12 @@ public:
if( DV::template need_sync<typename DV::t_dev::device_type>() ) {
set_functor_host f(DV::h_view,val);
parallel_for(n,f);
DV::t_host::execution_space::fence();
typename DV::t_host::execution_space().fence();
DV::template modify<typename DV::t_host::device_type>();
} else {
set_functor f(DV::d_view,val);
parallel_for(n,f);
DV::t_dev::execution_space::fence();
typename DV::t_dev::execution_space().fence();
DV::template modify<typename DV::t_dev::device_type>();
}
}

View File

@ -86,6 +86,31 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
)
ENDIF()
IF(Kokkos_ENABLE_HPX)
TRIBITS_ADD_EXECUTABLE_AND_TEST(
UnitTest_HPX
SOURCES
UnitTestMain.cpp
hpx/TestHPX_BitSet.cpp
hpx/TestHPX_DualView.cpp
hpx/TestHPX_DynamicView.cpp
hpx/TestHPX_DynRankViewAPI_generic.cpp
hpx/TestHPX_DynRankViewAPI_rank12345.cpp
hpx/TestHPX_DynRankViewAPI_rank67.cpp
hpx/TestHPX_ErrorReporter.cpp
hpx/TestHPX_OffsetView.cpp
hpx/TestHPX_ScatterView.cpp
hpx/TestHPX_StaticCrsGraph.cpp
hpx/TestHPX_UnorderedMap.cpp
hpx/TestHPX_Vector.cpp
hpx/TestHPX_ViewCtorPropEmbeddedDim.cpp
COMM serial mpi
NUM_MPI_PROCS 1
FAIL_REGULAR_EXPRESSION " FAILED "
TESTONLYLIBS kokkos_gtest ${TEST_LINK_TARGETS}
)
ENDIF()
IF(Kokkos_ENABLE_Cuda)
TRIBITS_ADD_EXECUTABLE_AND_TEST(
UnitTest_Cuda

View File

@ -4,6 +4,7 @@ GTEST_PATH = ../../TPL/gtest
vpath %.cpp ${KOKKOS_PATH}/containers/unit_tests
vpath %.cpp ${KOKKOS_PATH}/containers/unit_tests/openmp
vpath %.cpp ${KOKKOS_PATH}/containers/unit_tests/hpx
vpath %.cpp ${KOKKOS_PATH}/containers/unit_tests/serial
vpath %.cpp ${KOKKOS_PATH}/containers/unit_tests/threads
vpath %.cpp ${KOKKOS_PATH}/containers/unit_tests/rocm
@ -106,6 +107,25 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
TEST_TARGETS += test-openmp
endif
ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1)
OBJ_HPX = UnitTestMain.o gtest-all.o
OBJ_HPX += TestHPX_BitSet.o
OBJ_HPX += TestHPX_DualView.o
OBJ_HPX += TestHPX_DynamicView.o
OBJ_HPX += TestHPX_DynRankViewAPI_generic.o
OBJ_HPX += TestHPX_DynRankViewAPI_rank12345.o
OBJ_HPX += TestHPX_DynRankViewAPI_rank67.o
OBJ_HPX += TestHPX_ErrorReporter.o
OBJ_HPX += TestHPX_OffsetView.o
OBJ_HPX += TestHPX_ScatterView.o
OBJ_HPX += TestHPX_StaticCrsGraph.o
OBJ_HPX += TestHPX_UnorderedMap.o
OBJ_HPX += TestHPX_Vector.o
OBJ_HPX += TestHPX_ViewCtorPropEmbeddedDim.o
TARGETS += KokkosContainers_UnitTest_HPX
TEST_TARGETS += test-hpx
endif
ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
OBJ_SERIAL = UnitTestMain.o gtest-all.o
OBJ_SERIAL += TestSerial_BitSet.o
@ -137,6 +157,9 @@ KokkosContainers_UnitTest_Threads: $(OBJ_THREADS) $(KOKKOS_LINK_DEPENDS)
KokkosContainers_UnitTest_OpenMP: $(OBJ_OPENMP) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(EXTRA_PATH) $(OBJ_OPENMP) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosContainers_UnitTest_OpenMP
KokkosContainers_UnitTest_HPX: $(OBJ_HPX) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(EXTRA_PATH) $(OBJ_HPX) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosContainers_UnitTest_HPX
KokkosContainers_UnitTest_Serial: $(OBJ_SERIAL) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(EXTRA_PATH) $(OBJ_SERIAL) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosContainers_UnitTest_Serial
@ -152,6 +175,9 @@ test-threads: KokkosContainers_UnitTest_Threads
test-openmp: KokkosContainers_UnitTest_OpenMP
./KokkosContainers_UnitTest_OpenMP
test-hpx: KokkosContainers_UnitTest_HPX
./KokkosContainers_UnitTest_HPX
test-serial: KokkosContainers_UnitTest_Serial
./KokkosContainers_UnitTest_Serial

View File

@ -66,7 +66,7 @@ struct TestBitset
unsigned testit(unsigned collisions)
{
execution_space::fence();
execution_space().fence();
unsigned count = 0;
Kokkos::parallel_reduce( m_bitset.size()*collisions, *this, count);
@ -114,7 +114,7 @@ struct TestBitsetTest
unsigned testit()
{
execution_space::fence();
execution_space().fence();
unsigned count = 0;
Kokkos::parallel_reduce( m_bitset.size(), *this, count);
@ -151,7 +151,7 @@ struct TestBitsetAny
unsigned testit()
{
execution_space::fence();
execution_space().fence();
unsigned count = 0;
Kokkos::parallel_reduce( m_bitset.size(), *this, count);

View File

@ -1276,6 +1276,7 @@ public:
Kokkos::deep_copy( dx , hx );
Kokkos::deep_copy( dy , dx );
Kokkos::deep_copy( hy , dy );
Kokkos::fence();
for ( size_t ip = 0 ; ip < N0 ; ++ip ) {
for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
@ -1286,6 +1287,7 @@ public:
Kokkos::deep_copy( dx , T(0) );
Kokkos::deep_copy( hx , dx );
Kokkos::fence();
for ( size_t ip = 0 ; ip < N0 ; ++ip ) {
for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {

View File

@ -162,6 +162,7 @@ struct ErrorReporterDriver : public ErrorReporterDriverBase<DeviceType>
void execute(int reporter_capacity, int test_size)
{
Kokkos::parallel_for(Kokkos::RangePolicy<execution_space>(0,test_size), *this);
Kokkos::fence();
driver_base::check_expectations(reporter_capacity, test_size);
}
@ -194,6 +195,7 @@ struct ErrorReporterDriverUseLambda : public ErrorReporterDriverBase<DeviceType>
driver_base::m_errorReporter.add_report(work_idx, report);
}
});
Kokkos::fence();
driver_base::check_expectations(reporter_capacity, test_size);
}

View File

@ -48,79 +48,387 @@
namespace Test {
template <typename ExecSpace, typename Layout, int duplication, int contribution, int op>
struct test_scatter_view_impl_cls;
template <typename ExecSpace, typename Layout, int duplication, int contribution>
void test_scatter_view_config(int n)
struct test_scatter_view_impl_cls<ExecSpace, Layout, duplication, contribution, Kokkos::Experimental::ScatterSum>
{
Kokkos::View<double *[3], Layout, ExecSpace> original_view("original_view", n);
{
auto scatter_view = Kokkos::Experimental::create_scatter_view
< Kokkos::Experimental::ScatterSum
, duplication
, contribution
> (original_view);
#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
auto policy = Kokkos::RangePolicy<ExecSpace, int>(0, n);
auto f = KOKKOS_LAMBDA(int i) {
public:
typedef Kokkos::Experimental::ScatterView
< double*[3]
, Layout
, ExecSpace
, Kokkos::Experimental::ScatterSum
, duplication
, contribution
> scatter_view_type;
typedef Kokkos::View<double *[3], Layout, ExecSpace> orig_view_type;
scatter_view_type scatter_view;
int scatterSize;
test_scatter_view_impl_cls(const scatter_view_type& view){
scatter_view = view;
scatterSize = 0;
}
void initialize(orig_view_type orig) {
auto host_view = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), orig);
Kokkos::fence();
for (typename decltype(host_view)::size_type i = 0; i < host_view.extent(0); ++i) {
host_view(i, 0) = 0.0;
host_view(i, 1) = 0.0;
host_view(i, 2) = 0.0;
}
Kokkos::fence();
Kokkos::deep_copy(orig, host_view);
}
void run_parallel(int n) {
scatterSize = n;
auto policy = Kokkos::RangePolicy<ExecSpace, int>(0, n);
Kokkos::parallel_for(policy, *this, "scatter_view_test: Sum");
}
KOKKOS_INLINE_FUNCTION
void operator()(int i) const {
auto scatter_access = scatter_view.access();
auto scatter_access_atomic = scatter_view.template access<Kokkos::Experimental::ScatterAtomic>();
for (int j = 0; j < 10; ++j) {
auto k = (i + j) % n;
auto k = (i + j) % scatterSize;
scatter_access(k, 0) += 4.2;
scatter_access_atomic(k, 1) += 2.0;
scatter_access(k, 2) += 1.0;
}
};
Kokkos::parallel_for(policy, f, "scatter_view_test");
#endif
Kokkos::Experimental::contribute(original_view, scatter_view);
scatter_view.reset_except(original_view);
#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
Kokkos::parallel_for(policy, f, "scatter_view_test");
#endif
Kokkos::Experimental::contribute(original_view, scatter_view);
}
#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
Kokkos::fence();
auto host_view = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), original_view);
Kokkos::fence();
for (typename decltype(host_view)::size_type i = 0; i < host_view.extent(0); ++i) {
auto val0 = host_view(i, 0);
auto val1 = host_view(i, 1);
auto val2 = host_view(i, 2);
EXPECT_TRUE(std::fabs((val0 - 84.0) / 84.0) < 1e-15);
EXPECT_TRUE(std::fabs((val1 - 40.0) / 40.0) < 1e-15);
EXPECT_TRUE(std::fabs((val2 - 20.0) / 20.0) < 1e-15);
}
#endif
{
Kokkos::Experimental::ScatterView
< double*[3]
, Layout
, ExecSpace
, Kokkos::Experimental::ScatterSum
, duplication
, contribution
>
persistent_view("persistent", n);
auto result_view = persistent_view.subview();
contribute(result_view, persistent_view);
}
}
}
template <typename ExecSpace>
void validateResults(orig_view_type orig) {
auto host_view = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), orig);
Kokkos::fence();
for (typename decltype(host_view)::size_type i = 0; i < host_view.extent(0); ++i) {
auto val0 = host_view(i, 0);
auto val1 = host_view(i, 1);
auto val2 = host_view(i, 2);
EXPECT_TRUE(std::fabs((val0 - 84.0) / 84.0) < 1e-14);
EXPECT_TRUE(std::fabs((val1 - 40.0) / 40.0) < 1e-14);
EXPECT_TRUE(std::fabs((val2 - 20.0) / 20.0) < 1e-14);
}
}
};
template <typename ExecSpace, typename Layout, int duplication, int contribution>
struct test_scatter_view_impl_cls<ExecSpace, Layout, duplication, contribution, Kokkos::Experimental::ScatterProd>
{
public:
typedef Kokkos::Experimental::ScatterView
< double*[3]
, Layout
, ExecSpace
, Kokkos::Experimental::ScatterProd
, duplication
, contribution
> scatter_view_type;
typedef Kokkos::View<double *[3], Layout, ExecSpace> orig_view_type;
scatter_view_type scatter_view;
int scatterSize;
test_scatter_view_impl_cls(const scatter_view_type& view){
scatter_view = view;
scatterSize = 0;
}
void initialize(orig_view_type orig) {
auto host_view = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), orig);
Kokkos::fence();
for (typename decltype(host_view)::size_type i = 0; i < host_view.extent(0); ++i) {
host_view(i, 0) = 1.0;
host_view(i, 1) = 1.0;
host_view(i, 2) = 1.0;
}
Kokkos::fence();
Kokkos::deep_copy(orig, host_view);
}
void run_parallel(int n) {
scatterSize = n;
auto policy = Kokkos::RangePolicy<ExecSpace, int>(0, n);
Kokkos::parallel_for(policy, *this, "scatter_view_test: Prod");
}
KOKKOS_INLINE_FUNCTION
void operator()(int i) const {
auto scatter_access = scatter_view.access();
auto scatter_access_atomic = scatter_view.template access<Kokkos::Experimental::ScatterAtomic>();
for (int j = 0; j < 4; ++j) {
auto k = (i + j) % scatterSize;
scatter_access(k, 0) *= 4.0;
scatter_access_atomic(k, 1) *= 2.0;
scatter_access(k, 2) *= 1.0;
}
}
void validateResults(orig_view_type orig) {
auto host_view = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), orig);
Kokkos::fence();
for (typename decltype(host_view)::size_type i = 0; i < host_view.extent(0); ++i) {
auto val0 = host_view(i, 0);
auto val1 = host_view(i, 1);
auto val2 = host_view(i, 2);
EXPECT_TRUE(std::fabs((val0 - 65536.0) / 65536.0) < 1e-14);
EXPECT_TRUE(std::fabs((val1 - 256.0) / 256.0) < 1e-14);
EXPECT_TRUE(std::fabs((val2 - 1.0) / 1.0) < 1e-14);
}
}
};
template <typename ExecSpace, typename Layout, int duplication, int contribution>
struct test_scatter_view_impl_cls<ExecSpace, Layout, duplication, contribution, Kokkos::Experimental::ScatterMin>
{
public:
typedef Kokkos::Experimental::ScatterView
< double*[3]
, Layout
, ExecSpace
, Kokkos::Experimental::ScatterMin
, duplication
, contribution
> scatter_view_type;
typedef Kokkos::View<double *[3], Layout, ExecSpace> orig_view_type;
scatter_view_type scatter_view;
int scatterSize;
test_scatter_view_impl_cls(const scatter_view_type& view){
scatter_view = view;
scatterSize = 0;
}
void initialize(orig_view_type orig) {
auto host_view = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), orig);
Kokkos::fence();
for (typename decltype(host_view)::size_type i = 0; i < host_view.extent(0); ++i) {
host_view(i, 0) = 999999.0;
host_view(i, 1) = 999999.0;
host_view(i, 2) = 999999.0;
}
Kokkos::fence();
Kokkos::deep_copy(orig, host_view);
}
void run_parallel(int n) {
scatterSize = n;
auto policy = Kokkos::RangePolicy<ExecSpace, int>(0, n);
Kokkos::parallel_for(policy, *this, "scatter_view_test: Min");
}
KOKKOS_INLINE_FUNCTION
void operator()(int i) const {
auto scatter_access = scatter_view.access();
auto scatter_access_atomic = scatter_view.template access<Kokkos::Experimental::ScatterAtomic>();
for (int j = 0; j < 4; ++j) {
auto k = (i + j) % scatterSize;
scatter_access(k, 0).update((double)(j+1)*4);
scatter_access_atomic(k, 1).update((double)(j+1)*2.0);
scatter_access(k, 2).update((double)(j+1)*1.0);
}
}
void validateResults(orig_view_type orig) {
auto host_view = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), orig);
Kokkos::fence();
for (typename decltype(host_view)::size_type i = 0; i < host_view.extent(0); ++i) {
auto val0 = host_view(i, 0);
auto val1 = host_view(i, 1);
auto val2 = host_view(i, 2);
EXPECT_TRUE(std::fabs((val0 - 4.0) / 4.0) < 1e-14);
EXPECT_TRUE(std::fabs((val1 - 2.0) / 2.0) < 1e-14);
EXPECT_TRUE(std::fabs((val2 - 1.0) / 1.0) < 1e-14);
}
}
};
template <typename ExecSpace, typename Layout, int duplication, int contribution>
struct test_scatter_view_impl_cls<ExecSpace, Layout, duplication, contribution, Kokkos::Experimental::ScatterMax>
{
public:
typedef Kokkos::Experimental::ScatterView
< double*[3]
, Layout
, ExecSpace
, Kokkos::Experimental::ScatterMax
, duplication
, contribution
> scatter_view_type;
typedef Kokkos::View<double *[3], Layout, ExecSpace> orig_view_type;
scatter_view_type scatter_view;
int scatterSize;
test_scatter_view_impl_cls(const scatter_view_type& view){
scatter_view = view;
scatterSize = 0;
}
void initialize(orig_view_type orig) {
auto host_view = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), orig);
Kokkos::fence();
for (typename decltype(host_view)::size_type i = 0; i < host_view.extent(0); ++i) {
host_view(i, 0) = 0.0;
host_view(i, 1) = 0.0;
host_view(i, 2) = 0.0;
}
Kokkos::fence();
Kokkos::deep_copy(orig, host_view);
}
void run_parallel(int n) {
scatterSize = n;
auto policy = Kokkos::RangePolicy<ExecSpace, int>(0, n);
Kokkos::parallel_for(policy, *this, "scatter_view_test: Max");
}
KOKKOS_INLINE_FUNCTION
void operator()(int i) const {
auto scatter_access = scatter_view.access();
auto scatter_access_atomic = scatter_view.template access<Kokkos::Experimental::ScatterAtomic>();
for (int j = 0; j < 4; ++j) {
auto k = (i + j) % scatterSize;
scatter_access(k, 0).update((double)(j+1)*4);
scatter_access_atomic(k, 1).update((double)(j+1)*2.0);
scatter_access(k, 2).update((double)(j+1)*1.0);
}
}
void validateResults(orig_view_type orig) {
auto host_view = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), orig);
Kokkos::fence();
for (typename decltype(host_view)::size_type i = 0; i < host_view.extent(0); ++i) {
auto val0 = host_view(i, 0);
auto val1 = host_view(i, 1);
auto val2 = host_view(i, 2);
EXPECT_TRUE(std::fabs((val0 - 16.0) / 16.0) < 1e-14);
EXPECT_TRUE(std::fabs((val1 - 8.0) / 8.0) < 1e-14);
EXPECT_TRUE(std::fabs((val2 - 4.0) / 4.0) < 1e-14);
}
}
};
template <typename ExecSpace, typename Layout, int duplication, int contribution, int op>
struct test_scatter_view_config
{
public:
typedef typename test_scatter_view_impl_cls<ExecSpace, Layout,
duplication, contribution, op>::scatter_view_type scatter_view_def;
typedef typename test_scatter_view_impl_cls<ExecSpace, Layout,
duplication, contribution, op>::orig_view_type orig_view_def;
test_scatter_view_config() {
}
void run_test(int n)
{
//Test creation via create_scatter_view
{
orig_view_def original_view("original_view", n);
scatter_view_def scatter_view = Kokkos::Experimental::create_scatter_view
< op
, duplication
, contribution
> (original_view);
test_scatter_view_impl_cls<ExecSpace, Layout, duplication, contribution, op> scatter_view_test_impl(scatter_view);
scatter_view_test_impl.initialize(original_view);
scatter_view_test_impl.run_parallel(n);
Kokkos::Experimental::contribute(original_view, scatter_view);
scatter_view.reset_except(original_view);
scatter_view_test_impl.run_parallel(n);
Kokkos::Experimental::contribute(original_view, scatter_view);
Kokkos::fence();
scatter_view_test_impl.validateResults(original_view);
{
scatter_view_def persistent_view("persistent", n);
auto result_view = persistent_view.subview();
contribute(result_view, persistent_view);
Kokkos::fence();
}
}
//Test creation via constructor
{
orig_view_def original_view("original_view", n);
scatter_view_def scatter_view(original_view);
test_scatter_view_impl_cls<ExecSpace, Layout, duplication, contribution, op> scatter_view_test_impl(scatter_view);
scatter_view_test_impl.initialize(original_view);
scatter_view_test_impl.run_parallel(n);
Kokkos::Experimental::contribute(original_view, scatter_view);
scatter_view.reset_except(original_view);
scatter_view_test_impl.run_parallel(n);
Kokkos::Experimental::contribute(original_view, scatter_view);
Kokkos::fence();
scatter_view_test_impl.validateResults(original_view);
{
scatter_view_def persistent_view("persistent", n);
auto result_view = persistent_view.subview();
contribute(result_view, persistent_view);
Kokkos::fence();
}
}
}
};
template <typename ExecSpace, int ScatterType>
struct TestDuplicatedScatterView {
TestDuplicatedScatterView(int n) {
// run the duplicated, non-atomic configurations for the given ScatterType
test_scatter_view_config<ExecSpace, Kokkos::LayoutRight,
Kokkos::Experimental::ScatterDuplicated,
Kokkos::Experimental::ScatterNonAtomic>(n);
Kokkos::Experimental::ScatterNonAtomic,
ScatterType> test_sv_right_config;
test_sv_right_config.run_test(n);
test_scatter_view_config<ExecSpace, Kokkos::LayoutLeft,
Kokkos::Experimental::ScatterDuplicated,
Kokkos::Experimental::ScatterNonAtomic,
ScatterType> test_sv_left_config;
test_sv_left_config.run_test(n);
}
};
#ifdef KOKKOS_ENABLE_CUDA
// disable duplicated instantiation with CUDA until
// UniqueToken can support it
template <>
struct TestDuplicatedScatterView<Kokkos::Cuda> {
template <int ScatterType>
struct TestDuplicatedScatterView<Kokkos::Cuda, ScatterType> {
TestDuplicatedScatterView(int) {
}
};
@ -129,14 +437,14 @@ struct TestDuplicatedScatterView<Kokkos::Cuda> {
#ifdef KOKKOS_ENABLE_ROCM
// disable duplicated instantiation with ROCm until
// UniqueToken can support it
template <>
struct TestDuplicatedScatterView<Kokkos::Experimental::ROCm> {
template <int ScatterType>
struct TestDuplicatedScatterView<Kokkos::Experimental::ROCm, ScatterType> {
TestDuplicatedScatterView(int) {
}
};
#endif
template <typename ExecSpace>
template <typename ExecSpace, int ScatterType>
void test_scatter_view(int n)
{
// all of these configurations should compile okay, but only some of them are
@ -149,29 +457,47 @@ void test_scatter_view(int n)
if (unique_token.size() == 1) {
test_scatter_view_config<ExecSpace, Kokkos::LayoutRight,
Kokkos::Experimental::ScatterNonDuplicated,
Kokkos::Experimental::ScatterNonAtomic>(n);
Kokkos::Experimental::ScatterNonAtomic,
ScatterType> test_sv_config;
test_sv_config.run_test(n);
}
#ifdef KOKKOS_ENABLE_SERIAL
if (!std::is_same<ExecSpace, Kokkos::Serial>::value) {
#endif
test_scatter_view_config<ExecSpace, Kokkos::LayoutRight,
Kokkos::Experimental::ScatterNonDuplicated,
Kokkos::Experimental::ScatterAtomic>(n);
Kokkos::Experimental::ScatterAtomic,
ScatterType> test_sv_config;
test_sv_config.run_test(n);
#ifdef KOKKOS_ENABLE_SERIAL
}
#endif
TestDuplicatedScatterView<ExecSpace> duptest(n);
// With hundreds of threads we were running out of memory.
// Limit n so that the duplicated copies don't exceed 8GB in total.
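// e.g. with 512 threads: 8GB / 512 = 16MB per duplicated copy, and
// 16MB / 24B per value allows roughly 700k values per copy.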
constexpr std::size_t maximum_allowed_total_bytes = 8ull * 1024ull * 1024ull * 1024ull;
std::size_t const maximum_allowed_copy_bytes = maximum_allowed_total_bytes / std::size_t(unique_token.size());
constexpr std::size_t bytes_per_value = sizeof(double) * 3;
std::size_t const maximum_allowed_copy_values = maximum_allowed_copy_bytes / bytes_per_value;
n = std::min(n, int(maximum_allowed_copy_values));
TestDuplicatedScatterView<ExecSpace, ScatterType> duptest(n);
}
TEST_F( TEST_CATEGORY, scatterview) {
#ifndef KOKKOS_ENABLE_ROCM
test_scatter_view<TEST_EXECSPACE>(10);
test_scatter_view<TEST_EXECSPACE, Kokkos::Experimental::ScatterSum>(10);
test_scatter_view<TEST_EXECSPACE, Kokkos::Experimental::ScatterProd>(10);
test_scatter_view<TEST_EXECSPACE, Kokkos::Experimental::ScatterMin>(10);
test_scatter_view<TEST_EXECSPACE, Kokkos::Experimental::ScatterMax>(10);
// tests were timing out in DEBUG mode, so reduce the amount of work
#ifdef KOKKOS_ENABLE_DEBUG
test_scatter_view<TEST_EXECSPACE>(100000);
int big_n = 100 * 1000;
#else
test_scatter_view<TEST_EXECSPACE>(10000000);
int big_n = 10 * 1000 * 1000;
#endif
test_scatter_view<TEST_EXECSPACE,Kokkos::Experimental::ScatterSum>(big_n);
test_scatter_view<TEST_EXECSPACE,Kokkos::Experimental::ScatterProd>(big_n);
test_scatter_view<TEST_EXECSPACE,Kokkos::Experimental::ScatterMin>(big_n);
test_scatter_view<TEST_EXECSPACE,Kokkos::Experimental::ScatterMax>(big_n);
#endif
}

View File

@ -69,7 +69,7 @@ struct TestInsert
void testit( bool rehash_on_fail = true )
{
execution_space::fence();
execution_space().fence();
uint32_t failed_count = 0;
do {
@ -82,7 +82,7 @@ struct TestInsert
}
} while (rehash_on_fail && failed_count > 0u);
execution_space::fence();
execution_space().fence();
}
@ -122,9 +122,9 @@ struct TestInsert
void testit()
{
execution_space::fence();
execution_space().fence();
Kokkos::parallel_for(m_num_erase, *this);
execution_space::fence();
execution_space().fence();
}
KOKKOS_INLINE_FUNCTION
@ -161,9 +161,9 @@ struct TestInsert
void testit(value_type &errors)
{
execution_space::execution_space::fence();
execution_space().fence();
Kokkos::parallel_reduce(m_map.capacity(), *this, errors);
execution_space::execution_space::fence();
execution_space().fence();
}
KOKKOS_INLINE_FUNCTION
@ -247,7 +247,7 @@ void test_failed_insert( uint32_t num_nodes)
map_type map(num_nodes);
Impl::TestInsert<map_type> test_insert(map, 2u*num_nodes, 1u);
test_insert.testit(false /*don't rehash on fail*/);
Device::execution_space::fence();
typename Device::execution_space().fence();
EXPECT_TRUE( map.failed_insert() );
}

View File

@ -0,0 +1,47 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include<hpx/TestHPX_Category.hpp>
#include<TestBitset.hpp>

View File

@ -0,0 +1,65 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_TEST_HPX_HPP
#define KOKKOS_TEST_HPX_HPP
#include <gtest/gtest.h>
namespace Test {
class hpx : public ::testing::Test {
protected:
static void SetUpTestCase() {
}
static void TearDownTestCase() {
}
};
} // namespace Test
#define TEST_CATEGORY hpx
#define TEST_EXECSPACE Kokkos::Experimental::HPX
#endif

View File

@ -0,0 +1,47 @@
/* (standard Kokkos license header, identical to the one above) */
#include<hpx/TestHPX_Category.hpp>
#include<TestDualView.hpp>


@ -0,0 +1,47 @@
/* (standard Kokkos license header, identical to the one above) */
#include<hpx/TestHPX_Category.hpp>
#include<TestDynViewAPI_generic.hpp>


@ -0,0 +1,47 @@
/* (standard Kokkos license header, identical to the one above) */
#include<hpx/TestHPX_Category.hpp>
#include<TestDynViewAPI_rank12345.hpp>


@ -0,0 +1,47 @@
/* (standard Kokkos license header, identical to the one above) */
#include<hpx/TestHPX_Category.hpp>
#include<TestDynViewAPI_rank67.hpp>


@ -0,0 +1,47 @@
/* (standard Kokkos license header, identical to the one above) */
#include<hpx/TestHPX_Category.hpp>
#include<TestDynamicView.hpp>


@ -0,0 +1,47 @@
/* (standard Kokkos license header, identical to the one above) */
#include<hpx/TestHPX_Category.hpp>
#include<TestErrorReporter.hpp>


@ -0,0 +1,47 @@
/* (standard Kokkos license header, identical to the one above) */
#include<hpx/TestHPX_Category.hpp>
#include<TestOffsetView.hpp>


@ -0,0 +1,47 @@
/* (standard Kokkos license header, identical to the one above) */
#include<hpx/TestHPX_Category.hpp>
#include<TestScatterView.hpp>


@ -0,0 +1,47 @@
/* (standard Kokkos license header, identical to the one above) */
#include<hpx/TestHPX_Category.hpp>
#include<TestStaticCrsGraph.hpp>


@ -0,0 +1,47 @@
/* (standard Kokkos license header, identical to the one above) */
#include<hpx/TestHPX_Category.hpp>
#include<TestUnorderedMap.hpp>


@ -0,0 +1,47 @@
/* (standard Kokkos license header, identical to the one above) */
#include<hpx/TestHPX_Category.hpp>
#include<TestVector.hpp>


@ -0,0 +1,47 @@
/* (standard Kokkos license header, identical to the one above) */
#include<hpx/TestHPX_Category.hpp>
#include<TestViewCtorPropEmbeddedDim.hpp>


@ -1,5 +1,5 @@
TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
LIB_OPTIONAL_TPLS Pthread CUDA HWLOC QTHREADS DLlib
LIB_OPTIONAL_TPLS Pthread CUDA HWLOC QTHREADS DLlib HPX
TEST_OPTIONAL_TPLS CUSPARSE
)


@ -47,6 +47,7 @@ TRIBITS_ADD_EXECUTABLE(
PerformanceTest_TaskDAG
SOURCES test_taskdag.cpp
COMM serial mpi
TESTONLYLIBS kokkos_gtest ${TEST_LINK_TARGETS}
)
TRIBITS_ADD_TEST(


@ -30,6 +30,7 @@ TARGETS =
#
OBJ_PERF = PerfTestMain.o gtest-all.o
OBJ_PERF += PerfTest_ExecSpacePartitioning.o
OBJ_PERF += PerfTestGramSchmidt.o
OBJ_PERF += PerfTestHexGrad.o
OBJ_PERF += PerfTest_CustomReduction.o


@ -44,6 +44,8 @@
#ifndef KOKKOS_BLAS_KERNELS_HPP
#define KOKKOS_BLAS_KERNELS_HPP
#include <type_traits>
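// <type_traits> added alongside the conversion from the old
// Impl::StaticAssertSame typedef idiom to plain static_assert below.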
namespace Kokkos {
template< class ConstVectorType ,
@ -123,15 +125,10 @@ struct Dot
{
typedef typename Device::execution_space execution_space ;
typedef typename
Impl::StaticAssertSame< Impl::unsigned_< 1 > ,
Impl::unsigned_< Type::Rank > >::type ok_rank ;
static_assert( static_cast<unsigned>(Type::Rank) == static_cast<unsigned>(1),
"Dot static_assert Fail: Rank != 1");
/* typedef typename
Impl::StaticAssertSame< execution_space ,
typename Type::execution_space >::type ok_device ;*/
typedef double value_type ;
#if 1
@ -164,13 +161,8 @@ struct DotSingle
{
typedef typename Device::execution_space execution_space ;
typedef typename
Impl::StaticAssertSame< Impl::unsigned_< 1 > ,
Impl::unsigned_< Type::Rank > >::type ok_rank ;
/* typedef typename
Impl::StaticAssertSame< execution_space ,
typename Type::execution_space >::type ok_device ;*/
static_assert( static_cast<unsigned>(Type::Rank) == static_cast<unsigned>(1),
"DotSingle static_assert Fail: Rank != 1");
typedef double value_type ;
@ -204,25 +196,11 @@ struct Scale
{
typedef typename Device::execution_space execution_space ;
/* typedef typename
Impl::StaticAssertSame< execution_space ,
typename ScalarType::execution_space >::type
ok_scalar_device ;
static_assert( static_cast<unsigned>(ScalarType::Rank) == static_cast<unsigned>(0),
"Scale static_assert Fail: ScalarType::Rank != 0");
typedef typename
Impl::StaticAssertSame< execution_space ,
typename VectorType::execution_space >::type
ok_vector_device ;*/
typedef typename
Impl::StaticAssertSame< Impl::unsigned_< 0 > ,
Impl::unsigned_< ScalarType::Rank > >::type
ok_scalar_rank ;
typedef typename
Impl::StaticAssertSame< Impl::unsigned_< 1 > ,
Impl::unsigned_< VectorType::Rank > >::type
ok_vector_rank ;
static_assert( static_cast<unsigned>(VectorType::Rank) == static_cast<unsigned>(1),
"Scale static_assert Fail: VectorType::Rank != 1");
#if 1
typename ScalarType::const_type alpha ;
@ -251,35 +229,14 @@ struct AXPBY
{
typedef typename Device::execution_space execution_space ;
/* typedef typename
Impl::StaticAssertSame< execution_space ,
typename ScalarType::execution_space >::type
ok_scalar_device ;
static_assert( static_cast<unsigned>(ScalarType::Rank) == static_cast<unsigned>(0),
"AXPBY static_assert Fail: ScalarType::Rank != 0");
typedef typename
Impl::StaticAssertSame< execution_space ,
typename ConstVectorType::execution_space >::type
ok_const_vector_device ;
static_assert( static_cast<unsigned>(ConstVectorType::Rank) == static_cast<unsigned>(1),
"AXPBY static_assert Fail: ConstVectorType::Rank != 1");
typedef typename
Impl::StaticAssertSame< execution_space ,
typename VectorType::execution_space >::type
ok_vector_device ;*/
typedef typename
Impl::StaticAssertSame< Impl::unsigned_< 0 > ,
Impl::unsigned_< ScalarType::Rank > >::type
ok_scalar_rank ;
typedef typename
Impl::StaticAssertSame< Impl::unsigned_< 1 > ,
Impl::unsigned_< ConstVectorType::Rank > >::type
ok_const_vector_rank ;
typedef typename
Impl::StaticAssertSame< Impl::unsigned_< 1 > ,
Impl::unsigned_< VectorType::Rank > >::type
ok_vector_rank ;
static_assert( static_cast<unsigned>(VectorType::Rank) == static_cast<unsigned>(1),
"AXPBY static_assert Fail: VectorType::Rank != 1");
#if 1
typename ScalarType::const_type alpha , beta ;


@ -183,7 +183,7 @@ struct ModifiedGramSchmidt
}
}
execution_space::fence();
execution_space().fence();
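// (instance fence: replaces the static execution_space::fence() form above)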
return timer.seconds();
}


@ -253,12 +253,12 @@ struct HexGrad
double dt_min = 0 ;
Kokkos::parallel_for( count , Init( coord ) );
execution_space::fence();
execution_space().fence();
for ( int i = 0 ; i < iter ; ++i ) {
Kokkos::Timer timer ;
Kokkos::parallel_for( count , HexGrad<execution_space>( coord , grad ) );
execution_space::fence();
execution_space().fence();
const double dt = timer.seconds();
if ( 0 == i ) dt_min = dt ;
else dt_min = dt < dt_min ? dt : dt_min ;


@ -125,15 +125,15 @@ struct MultiDimRangePerf3D
Kokkos::MDRangePolicy<Kokkos::Rank<3, iterate_type::Right, iterate_type::Right>, execution_space > policy(point_type{{0,0,0}},point_type{{icount,jcount,kcount}},tile_type{{Ti,Tj,Tk}} );
Kokkos::parallel_for( policy_initA, Init(Atest, icount, jcount, kcount) );
execution_space::fence();
execution_space().fence();
Kokkos::parallel_for( policy_initB, Init(Btest, icount+2, jcount+2, kcount+2) );
execution_space::fence();
execution_space().fence();
for (int i = 0; i < iter; ++i)
{
Kokkos::Timer timer;
Kokkos::parallel_for( policy, FunctorType(Atest, Btest, icount, jcount, kcount) );
execution_space::fence();
execution_space().fence();
const double dt = timer.seconds();
if ( 0 == i ) dt_min = dt ;
else dt_min = dt < dt_min ? dt : dt_min ;
@ -189,15 +189,15 @@ struct MultiDimRangePerf3D
Kokkos::MDRangePolicy<Kokkos::Rank<3, iterate_type::Left, iterate_type::Left>, execution_space > policy({{0,0,0}},{{icount,jcount,kcount}},{{Ti,Tj,Tk}} );
Kokkos::parallel_for( policy_initA, Init(Atest, icount, jcount, kcount) );
execution_space::fence();
execution_space().fence();
Kokkos::parallel_for( policy_initB, Init(Btest, icount+2, jcount+2, kcount+2) );
execution_space::fence();
execution_space().fence();
for (int i = 0; i < iter; ++i)
{
Kokkos::Timer timer;
Kokkos::parallel_for( policy, FunctorType(Atest, Btest, icount, jcount, kcount) );
execution_space::fence();
execution_space().fence();
const double dt = timer.seconds();
if ( 0 == i ) dt_min = dt ;
else dt_min = dt < dt_min ? dt : dt_min ;
@ -368,15 +368,15 @@ struct RangePolicyCollapseTwo
double dt_min = 0;
Kokkos::parallel_for( policy, Init(Atest,icount,jcount,kcount) );
execution_space::fence();
execution_space().fence();
Kokkos::parallel_for( policy_initB, Init(Btest,icount+2,jcount+2,kcount+2) );
execution_space::fence();
execution_space().fence();
for (int i = 0; i < iter; ++i)
{
Kokkos::Timer timer;
Kokkos::parallel_for(policy, FunctorType(Atest, Btest, icount, jcount, kcount));
execution_space::fence();
execution_space().fence();
const double dt = timer.seconds();
if ( 0 == i ) dt_min = dt ;
else dt_min = dt < dt_min ? dt : dt_min ;
@ -513,15 +513,15 @@ struct RangePolicyCollapseAll
double dt_min = 0;
Kokkos::parallel_for( policy, Init(Atest,icount,jcount,kcount) );
execution_space::fence();
execution_space().fence();
Kokkos::parallel_for( policy_initB, Init(Btest,icount+2,jcount+2,kcount+2) );
execution_space::fence();
execution_space().fence();
for (int i = 0; i < iter; ++i)
{
Kokkos::Timer timer;
Kokkos::parallel_for(policy, FunctorType(Atest, Btest, icount, jcount, kcount));
execution_space::fence();
execution_space().fence();
const double dt = timer.seconds();
if ( 0 == i ) dt_min = dt ;
else dt_min = dt < dt_min ? dt : dt_min ;


@ -0,0 +1,564 @@
#include <Kokkos_Core.hpp>
#include <gtest/gtest.h>
#include <PerfTest_Category.hpp>
namespace Test {
namespace {
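// SpaceInstance: small helper to obtain independent execution space
// instances. The generic fallback just default-constructs the space and
// reports that kernels cannot overlap.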
template<class ExecSpace>
struct SpaceInstance {
static ExecSpace create() {
return ExecSpace();
}
static void destroy(ExecSpace&) {
}
static bool overlap() {
return false;
}
};
#ifndef KOKKOS_ENABLE_DEBUG
#ifdef KOKKOS_ENABLE_CUDA
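// CUDA specialization: each instance wraps its own stream, so kernels
// launched on different instances may overlap. CUDA_LAUNCH_BLOCKING=1
// serializes launches, so overlap() reports false in that case.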
template<>
struct SpaceInstance<Kokkos::Cuda> {
static Kokkos::Cuda create() {
cudaStream_t stream;
cudaStreamCreate(&stream);
return Kokkos::Cuda(stream);
}
static void destroy(Kokkos::Cuda& space) {
cudaStream_t stream = space.cuda_stream();
cudaStreamDestroy(stream);
}
static bool overlap() {
bool value = true;
auto local_rank_str = std::getenv("CUDA_LAUNCH_BLOCKING");
if(local_rank_str) {
value = (std::atoi(local_rank_str)==0);
}
return value;
}
};
#endif
#endif
}
struct FunctorRange {
int M,R;
Kokkos::View<double**,TEST_EXECSPACE> a;
FunctorRange(int M_, int R_, Kokkos::View<double**,TEST_EXECSPACE> a_):M(M_),R(R_),a(a_){}
KOKKOS_INLINE_FUNCTION
void operator() (const int i) const {
for(int r=0;r<R;r++)
for(int j=0;j<M;j++) {
a(i,j)+=1.0;
}
}
};
struct FunctorMDRange {
int M,R;
Kokkos::View<double**,TEST_EXECSPACE> a;
FunctorMDRange(int M_, int R_, Kokkos::View<double**,TEST_EXECSPACE> a_):M(M_),R(R_),a(a_){}
KOKKOS_INLINE_FUNCTION
void operator() (const int i, const int) const {
for(int j=0;j<M;j++)
a(i,j)+=1.0;
}
};
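// FunctorTeam uses hierarchical parallelism: each team handles one row i and
// a TeamThreadRange loop spreads the M column updates across the team.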
struct FunctorTeam {
int M,R;
Kokkos::View<double**,Kokkos::LayoutRight,TEST_EXECSPACE> a;
FunctorTeam(int M_, int R_, Kokkos::View<double**,Kokkos::LayoutRight,TEST_EXECSPACE> a_):M(M_),R(R_),a(a_){}
KOKKOS_INLINE_FUNCTION
void operator() (const Kokkos::TeamPolicy<TEST_EXECSPACE>::member_type& team) const {
int i = team.league_rank();
for(int r=0;r<R;r++) {
Kokkos::parallel_for(Kokkos::TeamThreadRange(team,M), [&] (const int j) {
a(i,j)+=1.0;
});
}
}
};
struct FunctorRangeReduce {
int M,R;
Kokkos::View<double**,TEST_EXECSPACE> a;
FunctorRangeReduce(int M_, int R_, Kokkos::View<double**,TEST_EXECSPACE> a_):M(M_),R(R_),a(a_){}
KOKKOS_INLINE_FUNCTION
void operator() (const int i, double& tmp) const {
for(int r=0;r<R;r++)
for(int j=0;j<M;j++) {
tmp += a(i,j);
}
}
};
struct FunctorMDRangeReduce {
int M,R;
Kokkos::View<double**,TEST_EXECSPACE> a;
FunctorMDRangeReduce(int M_, int R_, Kokkos::View<double**,TEST_EXECSPACE> a_):M(M_),R(R_),a(a_){}
KOKKOS_INLINE_FUNCTION
void operator() (const int i, const int, double& tmp) const {
for(int j=0;j<M;j++)
tmp += a(i,j);
}
};
struct FunctorTeamReduce {
int M,R;
Kokkos::View<double**,Kokkos::LayoutRight,TEST_EXECSPACE> a;
FunctorTeamReduce(int M_, int R_, Kokkos::View<double**,Kokkos::LayoutRight,TEST_EXECSPACE> a_):M(M_),R(R_),a(a_){}
KOKKOS_INLINE_FUNCTION
void operator() (const Kokkos::TeamPolicy<TEST_EXECSPACE>::member_type& team, double& tmp) const {
int i = team.league_rank();
for(int r=0;r<R;r++) {
double val;
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,M), [&] (const int j, double& tmp2) {
tmp2 += a(i,j);
},val);
tmp+=val;
}
}
};
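// Each overlap test launches a pair of kernels first on a single instance,
// then on two independent instances, and compares wall-clock times; when the
// backend supports overlap the two-instance pair should be measurably faster.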
TEST_F( default_exec, overlap_range_policy ) {
int N = 2000;
int M = 10000;
int R = 10;
TEST_EXECSPACE space;
TEST_EXECSPACE space1 = SpaceInstance<TEST_EXECSPACE>::create();
TEST_EXECSPACE space2 = SpaceInstance<TEST_EXECSPACE>::create();
Kokkos::View<double**,TEST_EXECSPACE> a("A",N,M);
FunctorRange f(M,R,a);
FunctorRangeReduce fr(M,R,a);
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel0",
Kokkos::RangePolicy<TEST_EXECSPACE>(0,N), FunctorRange(M,R,a));
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel1",
Kokkos::Experimental::require(
Kokkos::RangePolicy<TEST_EXECSPACE>(space1,0,N),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, f);
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel2",
Kokkos::Experimental::require(
Kokkos::RangePolicy<TEST_EXECSPACE>(space2,0,N),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, f);
Kokkos::fence();
Kokkos::Timer timer;
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel3",
Kokkos::Experimental::require(
Kokkos::RangePolicy<TEST_EXECSPACE>(space,0,N),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, f);
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel4",
Kokkos::Experimental::require(
Kokkos::RangePolicy<TEST_EXECSPACE>(space,0,N),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, f);
Kokkos::fence();
timer.reset();
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel5",
Kokkos::Experimental::require(
Kokkos::RangePolicy<TEST_EXECSPACE>(space1,0,N),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, FunctorRange(M,R,a));
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel6",
Kokkos::Experimental::require(
Kokkos::RangePolicy<TEST_EXECSPACE>(space2,0,N),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, FunctorRange(M,R,a));
Kokkos::fence();
double time_overlap = timer.seconds();
timer.reset();
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel7",
Kokkos::Experimental::require(
Kokkos::RangePolicy<TEST_EXECSPACE>(space,0,N),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, f);
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel8",
Kokkos::Experimental::require(
Kokkos::RangePolicy<TEST_EXECSPACE>(space,0,N),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, f);
Kokkos::fence();
double time_end = timer.seconds();
if(SpaceInstance<TEST_EXECSPACE>::overlap()) {
ASSERT_TRUE( (time_end > 1.5*time_overlap) );
}
printf("Time RangePolicy: NonOverlap: %lf Time Overlap: %lf\n",time_end,time_overlap);
Kokkos::View<double,TEST_EXECSPACE> result("result");
Kokkos::View<double,TEST_EXECSPACE> result1("result1");
Kokkos::View<double,TEST_EXECSPACE> result2("result2");
Kokkos::View<double,Kokkos::HostSpace> h_result("h_result");
Kokkos::View<double,Kokkos::HostSpace> h_result1("h_result1");
Kokkos::View<double,Kokkos::HostSpace> h_result2("h_result2");
timer.reset();
Kokkos::parallel_reduce("default_exec::overlap_range_policy::kernel_reduce",
Kokkos::Experimental::require(
Kokkos::RangePolicy<TEST_EXECSPACE>(space,0,N),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, fr, result);
Kokkos::fence();
double time_fenced = timer.seconds();
Kokkos::deep_copy(h_result,result);
timer.reset();
Kokkos::parallel_reduce("default_exec::overlap_range_policy::kernel_reduce",
Kokkos::Experimental::require(
Kokkos::RangePolicy<TEST_EXECSPACE>(space,0,N),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, fr, result);
double time_not_fenced = timer.seconds();
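// reading the timer before the fence measures only the (asynchronous) launch
// cost; the assertion below checks that execution really proceeds async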
Kokkos::fence();
if(SpaceInstance<TEST_EXECSPACE>::overlap()) {
ASSERT_TRUE(time_fenced>2.0*time_not_fenced);
}
timer.reset();
Kokkos::parallel_reduce("default_exec::overlap_range_policy::kernel_reduce",
Kokkos::Experimental::require(
Kokkos::RangePolicy<TEST_EXECSPACE>(space,0,N),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, fr, result);
Kokkos::parallel_reduce("default_exec::overlap_range_policy::kernel_reduce",
Kokkos::Experimental::require(
Kokkos::RangePolicy<TEST_EXECSPACE>(space,0,N),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, fr, result);
Kokkos::fence();
double time_no_overlapped_reduce = timer.seconds();
timer.reset();
Kokkos::parallel_reduce("default_exec::overlap_range_policy::kernel_reduce",
Kokkos::Experimental::require(
Kokkos::RangePolicy<TEST_EXECSPACE>(space1,0,N),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, fr, result1);
Kokkos::parallel_reduce("default_exec::overlap_range_policy::kernel_reduce",
Kokkos::Experimental::require(
Kokkos::RangePolicy<TEST_EXECSPACE>(space2,0,N),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, fr, result2);
Kokkos::fence();
double time_overlapped_reduce = timer.seconds();
Kokkos::deep_copy(h_result2,result2);
Kokkos::deep_copy(h_result1,result1);
ASSERT_EQ(h_result1(),h_result());
ASSERT_EQ(h_result2(),h_result());
if(SpaceInstance<TEST_EXECSPACE>::overlap()) {
ASSERT_TRUE(time_overlapped_reduce < 1.5*time_no_overlapped_reduce);
}
printf("Time RangePolicy Reduce: NonOverlap: %lf Time Overlap: %lf\n",time_no_overlapped_reduce,time_overlapped_reduce);
SpaceInstance<TEST_EXECSPACE>::destroy(space1);
SpaceInstance<TEST_EXECSPACE>::destroy(space2);
}
TEST_F( default_exec, overlap_mdrange_policy ) {
int N = 200;
int M = 10000;
int R = 10;
TEST_EXECSPACE space;
TEST_EXECSPACE space1 = SpaceInstance<TEST_EXECSPACE>::create();
TEST_EXECSPACE space2 = SpaceInstance<TEST_EXECSPACE>::create();
Kokkos::View<double**,TEST_EXECSPACE> a("A",N,M);
FunctorMDRange f(M,R,a);
FunctorMDRangeReduce fr(M,R,a);
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel0",
Kokkos::Experimental::require(
Kokkos::MDRangePolicy<TEST_EXECSPACE,Kokkos::Rank<2>>({0,0},{N,R}),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, FunctorMDRange(M,R,a));
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel1",
Kokkos::Experimental::require(
Kokkos::MDRangePolicy<TEST_EXECSPACE,Kokkos::Rank<2>>(space1,{0,0},{N,R}),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, f);
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel2",
Kokkos::Experimental::require(
Kokkos::MDRangePolicy<TEST_EXECSPACE,Kokkos::Rank<2>>(space2,{0,0},{N,R}),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, f);
Kokkos::fence();
Kokkos::Timer timer;
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel3",
Kokkos::Experimental::require(
Kokkos::MDRangePolicy<TEST_EXECSPACE,Kokkos::Rank<2>>(space,{0,0},{N,R}),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, f);
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel4",
Kokkos::Experimental::require(
Kokkos::MDRangePolicy<TEST_EXECSPACE,Kokkos::Rank<2>>(space,{0,0},{N,R}),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, f);
Kokkos::fence();
timer.reset();
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel5",
Kokkos::Experimental::require(
Kokkos::MDRangePolicy<TEST_EXECSPACE,Kokkos::Rank<2>>(space1,{0,0},{N,R}),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, FunctorMDRange(M,R,a));
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel6",
Kokkos::Experimental::require(
Kokkos::MDRangePolicy<TEST_EXECSPACE,Kokkos::Rank<2>>(space2,{0,0},{N,R}),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, FunctorMDRange(M,R,a));
Kokkos::fence();
double time_overlap = timer.seconds();
timer.reset();
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel7",
Kokkos::Experimental::require(
Kokkos::MDRangePolicy<TEST_EXECSPACE,Kokkos::Rank<2>>(space,{0,0},{N,R}),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, f);
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel8",
Kokkos::Experimental::require(
Kokkos::MDRangePolicy<TEST_EXECSPACE,Kokkos::Rank<2>>(space,{0,0},{N,R}),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, f);
Kokkos::fence();
double time_end = timer.seconds();
if(SpaceInstance<TEST_EXECSPACE>::overlap()) {
ASSERT_TRUE( (time_end > 1.5*time_overlap) );
}
printf("Time MDRangePolicy: NonOverlap: %lf Time Overlap: %lf\n",time_end,time_overlap);
Kokkos::View<double,TEST_EXECSPACE> result("result");
Kokkos::View<double,TEST_EXECSPACE> result1("result1");
Kokkos::View<double,TEST_EXECSPACE> result2("result2");
Kokkos::View<double,Kokkos::HostSpace> h_result("h_result");
Kokkos::View<double,Kokkos::HostSpace> h_result1("h_result1");
Kokkos::View<double,Kokkos::HostSpace> h_result2("h_result2");
timer.reset();
Kokkos::parallel_reduce("default_exec::overlap_mdrange_policy::kernel_reduce",
Kokkos::Experimental::require(
Kokkos::MDRangePolicy<TEST_EXECSPACE,Kokkos::Rank<2>>(space,{0,0},{N,R}),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, fr, result);
Kokkos::fence();
double time_fenced = timer.seconds();
Kokkos::deep_copy(h_result,result);
timer.reset();
Kokkos::parallel_reduce("default_exec::overlap_mdrange_policy::kernel_reduce",
Kokkos::Experimental::require(
Kokkos::MDRangePolicy<TEST_EXECSPACE,Kokkos::Rank<2>>(space,{0,0},{N,R}),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, fr, result);
double time_not_fenced = timer.seconds();
Kokkos::fence();
if(SpaceInstance<TEST_EXECSPACE>::overlap()) {
ASSERT_TRUE(time_fenced>2.0*time_not_fenced);
}
timer.reset();
Kokkos::parallel_reduce("default_exec::overlap_mdrange_policy::kernel_reduce",
Kokkos::Experimental::require(
Kokkos::MDRangePolicy<TEST_EXECSPACE,Kokkos::Rank<2>>(space,{0,0},{N,R}),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, fr, result);
Kokkos::parallel_reduce("default_exec::overlap_mdrange_policy::kernel_reduce",
Kokkos::Experimental::require(
Kokkos::MDRangePolicy<TEST_EXECSPACE,Kokkos::Rank<2>>(space,{0,0},{N,R}),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, fr, result);
Kokkos::fence();
double time_no_overlapped_reduce = timer.seconds();
timer.reset();
Kokkos::parallel_reduce("default_exec::overlap_mdrange_policy::kernel_reduce",
Kokkos::Experimental::require(
Kokkos::MDRangePolicy<TEST_EXECSPACE,Kokkos::Rank<2>>(space1,{0,0},{N,R}),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, fr, result1);
Kokkos::parallel_reduce("default_exec::overlap_mdrange_policy::kernel_reduce",
Kokkos::Experimental::require(
Kokkos::MDRangePolicy<TEST_EXECSPACE,Kokkos::Rank<2>>(space2,{0,0},{N,R}),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, fr, result2);
Kokkos::fence();
double time_overlapped_reduce = timer.seconds();
Kokkos::deep_copy(h_result2,result2);
Kokkos::deep_copy(h_result1,result1);
ASSERT_EQ(h_result1(),h_result());
ASSERT_EQ(h_result2(),h_result());
if(SpaceInstance<TEST_EXECSPACE>::overlap()) {
ASSERT_TRUE(time_overlapped_reduce < 1.5*time_no_overlapped_reduce);
}
printf("Time MDRangePolicy Reduce: NonOverlap: %lf Time Overlap: %lf\n",time_no_overlapped_reduce,time_overlapped_reduce);
SpaceInstance<TEST_EXECSPACE>::destroy(space2);
SpaceInstance<TEST_EXECSPACE>::destroy(space1);
}
TEST_F( default_exec, overlap_team_policy ) {
int N = 20;
int M = 1000000;
int R = 10;
TEST_EXECSPACE space;
TEST_EXECSPACE space1 = SpaceInstance<TEST_EXECSPACE>::create();
TEST_EXECSPACE space2 = SpaceInstance<TEST_EXECSPACE>::create();
Kokkos::View<double**,Kokkos::LayoutRight,TEST_EXECSPACE> a("A",N,M);
FunctorTeam f(M,R,a);
FunctorTeamReduce fr(M,R,a);
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel0",
Kokkos::Experimental::require(
Kokkos::TeamPolicy<TEST_EXECSPACE>(N,Kokkos::AUTO),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, FunctorTeam(M,R,a));
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel1",
Kokkos::Experimental::require(
Kokkos::TeamPolicy<TEST_EXECSPACE>(space1,N,Kokkos::AUTO),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, f);
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel2",
Kokkos::Experimental::require(
Kokkos::TeamPolicy<TEST_EXECSPACE>(space2,N,Kokkos::AUTO),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, f);
Kokkos::fence();
Kokkos::Timer timer;
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel3",
Kokkos::Experimental::require(
Kokkos::TeamPolicy<TEST_EXECSPACE>(space,N,Kokkos::AUTO),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, f);
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel4",
Kokkos::Experimental::require(
Kokkos::TeamPolicy<TEST_EXECSPACE>(space,N,Kokkos::AUTO),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, f);
Kokkos::fence();
timer.reset();
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel5",
Kokkos::Experimental::require(
Kokkos::TeamPolicy<TEST_EXECSPACE>(space1,N,Kokkos::AUTO),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, FunctorTeam(M,R,a));
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel6",
Kokkos::Experimental::require(
Kokkos::TeamPolicy<TEST_EXECSPACE>(space2,N,Kokkos::AUTO),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, FunctorTeam(M,R,a));
Kokkos::fence();
double time_overlap = timer.seconds();
timer.reset();
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel7",
Kokkos::Experimental::require(
Kokkos::TeamPolicy<TEST_EXECSPACE>(space,N,Kokkos::AUTO),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, f);
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel8",
Kokkos::Experimental::require(
Kokkos::TeamPolicy<TEST_EXECSPACE>(space,N,Kokkos::AUTO),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, f);
Kokkos::fence();
double time_end = timer.seconds();
if(SpaceInstance<TEST_EXECSPACE>::overlap()) {
ASSERT_TRUE( (time_end > 1.5*time_overlap) );
}
printf("Time TeamPolicy: NonOverlap: %lf Time Overlap: %lf\n",time_end,time_overlap);
Kokkos::View<double,TEST_EXECSPACE> result("result");
Kokkos::View<double,TEST_EXECSPACE> result1("result1");
Kokkos::View<double,TEST_EXECSPACE> result2("result2");
Kokkos::View<double,Kokkos::HostSpace> h_result("h_result");
Kokkos::View<double,Kokkos::HostSpace> h_result1("h_result1");
Kokkos::View<double,Kokkos::HostSpace> h_result2("h_result2");
timer.reset();
Kokkos::parallel_reduce("default_exec::overlap_team_policy::kernel_reduce",
Kokkos::Experimental::require(
Kokkos::TeamPolicy<TEST_EXECSPACE>(space,N,Kokkos::AUTO),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, fr, result);
Kokkos::fence();
double time_fenced = timer.seconds();
Kokkos::deep_copy(h_result,result);
timer.reset();
Kokkos::parallel_reduce("default_exec::overlap_team_policy::kernel_reduce",
Kokkos::Experimental::require(
Kokkos::TeamPolicy<TEST_EXECSPACE>(space,N,Kokkos::AUTO),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, fr, result);
double time_not_fenced = timer.seconds();
Kokkos::fence();
if(SpaceInstance<TEST_EXECSPACE>::overlap()) {
ASSERT_TRUE(time_fenced>2.0*time_not_fenced);
}
timer.reset();
Kokkos::parallel_reduce("default_exec::overlap_team_policy::kernel_reduce",
Kokkos::Experimental::require(
Kokkos::TeamPolicy<TEST_EXECSPACE>(space,N,Kokkos::AUTO),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, fr, result);
Kokkos::parallel_reduce("default_exec::overlap_team_policy::kernel_reduce",
Kokkos::Experimental::require(
Kokkos::TeamPolicy<TEST_EXECSPACE>(space,N,Kokkos::AUTO),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, fr, result);
Kokkos::fence();
double time_no_overlapped_reduce = timer.seconds();
timer.reset();
Kokkos::parallel_reduce("default_exec::overlap_team_policy::kernel_reduce",
Kokkos::Experimental::require(
Kokkos::TeamPolicy<TEST_EXECSPACE>(space1,N,Kokkos::AUTO),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, fr, result1);
Kokkos::parallel_reduce("default_exec::overlap_team_policy::kernel_reduce",
Kokkos::Experimental::require(
Kokkos::TeamPolicy<TEST_EXECSPACE>(space2,N,Kokkos::AUTO),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, fr, result2);
Kokkos::fence();
double time_overlapped_reduce = timer.seconds();
Kokkos::deep_copy(h_result2,result2);
Kokkos::deep_copy(h_result1,result1);
ASSERT_EQ(h_result1(),h_result());
ASSERT_EQ(h_result2(),h_result());
if(SpaceInstance<TEST_EXECSPACE>::overlap()) {
ASSERT_TRUE(time_overlapped_reduce < 1.5*time_no_overlapped_reduce);
}
printf("Time TeamPolicy Reduce: NonOverlap: %lf Time Overlap: %lf\n",time_no_overlapped_reduce,time_overlapped_reduce);
SpaceInstance<TEST_EXECSPACE>::destroy(space1);
SpaceInstance<TEST_EXECSPACE>::destroy(space2);
}
}


@ -121,6 +121,7 @@ void run_allocateview_tests(int N, int R) {
Kokkos::parallel_for(N8, KOKKOS_LAMBDA (const int& i) {
a_ptr[i] = 0.0;
});
Kokkos::fence();
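// the fence above ensures the asynchronous initialization kernel has
// completed before the buffer is freed and the iteration is timed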
Kokkos::kokkos_free(a_ptr);
}
time_raw = timer.seconds()/R;


@ -95,6 +95,7 @@ void run_deepcopyview_tests123(int N, int R) {
a_ptr[i] = b_ptr[i];
});
}
Kokkos::fence();
time_raw = timer.seconds()/R;
}
#endif
@ -135,6 +136,7 @@ void run_deepcopyview_tests45(int N, int R) {
a_ptr[i] = b_ptr[i];
});
}
Kokkos::fence();
time_raw = timer.seconds()/R;
}
#endif
@ -169,6 +171,7 @@ void run_deepcopyview_tests6(int N, int R) {
a_ptr[i] = b_ptr[i];
});
}
Kokkos::fence();
time_raw = timer.seconds()/R;
}
#endif
@ -202,6 +205,7 @@ void run_deepcopyview_tests7(int N, int R) {
a_ptr[i] = b_ptr[i];
});
}
Kokkos::fence();
time_raw = timer.seconds()/R;
}
#endif
@ -235,6 +239,7 @@ void run_deepcopyview_tests8(int N, int R) {
a_ptr[i] = b_ptr[i];
});
}
Kokkos::fence();
time_raw = timer.seconds()/R;
}
#endif


@ -90,6 +90,7 @@ void run_fillview_tests123(int N, int R) {
a_ptr[i] = 1.1;
});
}
Kokkos::fence();
time_raw = timer.seconds()/R;
}
#endif
@ -126,6 +127,7 @@ void run_fillview_tests45(int N, int R) {
a_ptr[i] = 1.1;
});
}
Kokkos::fence();
time_raw = timer.seconds()/R;
}
#endif
@ -157,6 +159,7 @@ void run_fillview_tests6(int N, int R) {
a_ptr[i] = 1.1;
});
}
Kokkos::fence();
time_raw = timer.seconds()/R;
}
#endif
@ -187,6 +190,7 @@ void run_fillview_tests7(int N, int R) {
a_ptr[i] = 1.1;
});
}
Kokkos::fence();
time_raw = timer.seconds()/R;
}
#endif
@ -217,6 +221,7 @@ void run_fillview_tests8(int N, int R) {
a_ptr[i] = 1.1;
});
}
Kokkos::fence();
time_raw = timer.seconds()/R;
}
#endif


@ -95,7 +95,9 @@ void run_resizeview_tests123(int N, int R) {
Kokkos::parallel_for(N8, KOKKOS_LAMBDA (const int& i) {
a1_ptr[i] = a_ptr[i];
});
Kokkos::fence();
}
Kokkos::fence();
time_raw = timer.seconds()/R;
}
#endif
@ -143,7 +145,9 @@ void run_resizeview_tests45(int N, int R) {
Kokkos::parallel_for(N8, KOKKOS_LAMBDA (const int& i) {
a1_ptr[i] = a_ptr[i];
});
Kokkos::fence();
}
Kokkos::fence();
time_raw = timer.seconds()/R;
}
#endif
@ -181,7 +185,9 @@ void run_resizeview_tests6(int N, int R) {
Kokkos::parallel_for(N8, KOKKOS_LAMBDA (const int& i) {
a1_ptr[i] = a_ptr[i];
});
Kokkos::fence();
}
Kokkos::fence();
time_raw = timer.seconds()/R;
}
#endif
@ -218,7 +224,9 @@ void run_resizeview_tests7(int N, int R) {
Kokkos::parallel_for(N8, KOKKOS_LAMBDA (const int& i) {
a1_ptr[i] = a_ptr[i];
});
Kokkos::fence();
}
Kokkos::fence();
time_raw = timer.seconds()/R;
}
#endif
@ -255,7 +263,9 @@ void run_resizeview_tests8(int N, int R) {
Kokkos::parallel_for(N8, KOKKOS_LAMBDA (const int& i) {
a1_ptr[i] = a_ptr[i];
});
Kokkos::fence();
}
Kokkos::fence();
time_raw = timer.seconds()/R;
}
#endif


@ -69,7 +69,7 @@ typedef Kokkos::DefaultExecutionSpace exec_space;
#define WHITE 8
void textcolor(int attr, int fg, int bg)
{ char command[13];
{ char command[40];
/* Command is the control command to the terminal */
sprintf(command, "%c[%d;%d;%dm", 0x1B, attr, fg + 30, bg + 40);
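// the formatted escape sequence "\x1B[%d;%d;%dm" can exceed the old
// 13-byte buffer once the arguments are substituted, hence the larger array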
@ -85,7 +85,7 @@ struct ZeroFunctor{
typedef typename Kokkos::View<T,execution_space>::HostMirror h_type;
type data;
KOKKOS_INLINE_FUNCTION
void operator()(int i) const {
void operator()(int) const {
data() = 0;
}
};
@ -101,7 +101,7 @@ struct AddFunctor{
type data;
KOKKOS_INLINE_FUNCTION
void operator()(int i) const {
void operator()(int) const {
Kokkos::atomic_fetch_add(&data(),(T)1);
}
};
@ -113,12 +113,12 @@ T AddLoop(int loop) {
typename ZeroFunctor<T,exec_space>::h_type h_data("HData");
f_zero.data = data;
Kokkos::parallel_for(1,f_zero);
exec_space::fence();
exec_space().fence();
struct AddFunctor<T,exec_space> f_add;
f_add.data = data;
Kokkos::parallel_for(loop,f_add);
exec_space::fence();
exec_space().fence();
Kokkos::deep_copy(h_data,data);
T val = h_data();
@ -132,7 +132,7 @@ struct AddNonAtomicFunctor{
type data;
KOKKOS_INLINE_FUNCTION
void operator()(int i) const {
void operator()(int) const {
data()+=(T)1;
}
};
@ -145,12 +145,12 @@ T AddLoopNonAtomic(int loop) {
f_zero.data = data;
Kokkos::parallel_for(1,f_zero);
exec_space::fence();
exec_space().fence();
struct AddNonAtomicFunctor<T,exec_space> f_add;
f_add.data = data;
Kokkos::parallel_for(loop,f_add);
exec_space::fence();
exec_space().fence();
Kokkos::deep_copy(h_data,data);
T val = h_data();
@ -178,7 +178,7 @@ struct CASFunctor{
type data;
KOKKOS_INLINE_FUNCTION
void operator()(int i) const {
void operator()(int) const {
T old = data();
T newval, assumed;
do {
@ -197,12 +197,12 @@ T CASLoop(int loop) {
typename ZeroFunctor<T,exec_space>::h_type h_data("HData");
f_zero.data = data;
Kokkos::parallel_for(1,f_zero);
exec_space::fence();
exec_space().fence();
struct CASFunctor<T,exec_space> f_cas;
f_cas.data = data;
Kokkos::parallel_for(loop,f_cas);
exec_space::fence();
exec_space().fence();
Kokkos::deep_copy(h_data,data);
T val = h_data();
@ -217,7 +217,7 @@ struct CASNonAtomicFunctor{
type data;
KOKKOS_INLINE_FUNCTION
void operator()(int i) const {
void operator()(int) const {
volatile T assumed;
volatile T newval;
bool fail=1;
@ -240,12 +240,12 @@ T CASLoopNonAtomic(int loop) {
typename ZeroFunctor<T,exec_space>::h_type h_data("HData");
f_zero.data = data;
Kokkos::parallel_for(1,f_zero);
exec_space::fence();
exec_space().fence();
struct CASNonAtomicFunctor<T,exec_space> f_cas;
f_cas.data = data;
Kokkos::parallel_for(loop,f_cas);
exec_space::fence();
exec_space().fence();
Kokkos::deep_copy(h_data,data);
T val = h_data();
@ -296,19 +296,19 @@ T ExchLoop(int loop) {
typename ZeroFunctor<T,exec_space>::h_type h_data("HData");
f_zero.data = data;
Kokkos::parallel_for(1,f_zero);
exec_space::fence();
exec_space().fence();
typename ZeroFunctor<T,exec_space>::type data2("Data");
typename ZeroFunctor<T,exec_space>::h_type h_data2("HData");
f_zero.data = data2;
Kokkos::parallel_for(1,f_zero);
exec_space::fence();
exec_space().fence();
struct ExchFunctor<T,exec_space> f_exch;
f_exch.data = data;
f_exch.data2 = data2;
Kokkos::parallel_for(loop,f_exch);
exec_space::fence();
exec_space().fence();
Kokkos::deep_copy(h_data,data);
Kokkos::deep_copy(h_data2,data2);
@ -339,19 +339,19 @@ T ExchLoopNonAtomic(int loop) {
typename ZeroFunctor<T,exec_space>::h_type h_data("HData");
f_zero.data = data;
Kokkos::parallel_for(1,f_zero);
exec_space::fence();
exec_space().fence();
typename ZeroFunctor<T,exec_space>::type data2("Data");
typename ZeroFunctor<T,exec_space>::h_type h_data2("HData");
f_zero.data = data2;
Kokkos::parallel_for(1,f_zero);
exec_space::fence();
exec_space().fence();
struct ExchNonAtomicFunctor<T,exec_space> f_exch;
f_exch.data = data;
f_exch.data2 = data2;
Kokkos::parallel_for(loop,f_exch);
exec_space::fence();
exec_space().fence();
Kokkos::deep_copy(h_data,data);
Kokkos::deep_copy(h_data2,data2);

View File

@ -153,6 +153,7 @@ struct TestFunctor {
typedef Kokkos::RangePolicy< ExecSpace , TagDel > policy ;
Kokkos::parallel_for( policy(0,range_iter), *this );
Kokkos::fence();
}
//----------------------------------------

View File

@ -92,27 +92,26 @@ long fib_alloc_count( long n )
return count[ n & mask ];
}
template< class Space >
template< class Scheduler >
struct TestFib {
using Scheduler = Kokkos::TaskScheduler< Space > ;
using MemorySpace = typename Scheduler::memory_space ;
using MemberType = typename Scheduler::member_type ;
using FutureType = Kokkos::Future< long , Space > ;
using FutureType = Kokkos::BasicFuture< long , Scheduler > ;
typedef long value_type ;
Scheduler sched ;
FutureType dep[2] ;
const value_type n ;
KOKKOS_INLINE_FUNCTION
TestFib( const Scheduler & arg_sched , const value_type arg_n )
: sched( arg_sched ), dep{} , n( arg_n ) {}
TestFib( const value_type arg_n )
: dep{} , n( arg_n ) {}
KOKKOS_INLINE_FUNCTION
void operator()( const MemberType & , value_type & result ) noexcept
void operator()( MemberType & member, value_type & result ) noexcept
{
auto& sched = member.scheduler();
if ( n < 2 ) {
result = n ;
}
@ -126,13 +125,13 @@ struct TestFib {
dep[1] = Kokkos::task_spawn
( Kokkos::TaskSingle( sched, Kokkos::TaskPriority::High )
, TestFib( sched, n - 2 ) );
, TestFib( n - 2 ) );
dep[0] = Kokkos::task_spawn
( Kokkos::TaskSingle( sched )
, TestFib( sched, n - 1 ) );
, TestFib( n - 1 ) );
Kokkos::Future< ExecSpace > fib_all = Kokkos::when_all( dep, 2 );
auto fib_all = sched.when_all( dep, 2 );
if ( ! dep[0].is_null() && ! dep[1].is_null() && ! fib_all.is_null() ) {
// High priority to retire this branch.
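A condensed sketch of the updated spawn pattern (assuming Kokkos::DefaultExecutionSpace and an already-constructed scheduler instance named sched, as in main() below): the functor no longer carries the scheduler; tasks recover it from the team member at run time, and futures are now typed on the scheduler rather than the space.
using Scheduler  = Kokkos::TaskSchedulerMultiple<Kokkos::DefaultExecutionSpace>;
using FutureType = Kokkos::BasicFuture<long, Scheduler>;
FutureType f = Kokkos::host_spawn(Kokkos::TaskSingle(sched),
                                  TestFib<Scheduler>(30 /* n */));
Kokkos::wait(sched);
const long fib30 = f.get();  // 832040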
@ -202,13 +201,15 @@ int main( int argc , char* argv[] )
return -1;
}
typedef TestFib< ExecSpace > Functor ;
using Scheduler = Kokkos::TaskSchedulerMultiple<ExecSpace>;
typedef TestFib< Scheduler > Functor ;
Kokkos::initialize(argc,argv);
{
Functor::Scheduler sched( Functor::MemorySpace()
Scheduler sched( Functor::MemorySpace()
, total_alloc_size
, min_block_size
, max_block_size
@ -217,21 +218,21 @@ int main( int argc , char* argv[] )
Functor::FutureType f =
Kokkos::host_spawn( Kokkos::TaskSingle( sched )
, Functor( sched , fib_input )
, Functor( fib_input )
);
Kokkos::wait( sched );
test_result = f.get();
task_count_max = sched.allocated_task_count_max();
task_count_accum = sched.allocated_task_count_accum();
//task_count_max = sched.allocated_task_count_max();
//task_count_accum = sched.allocated_task_count_accum();
if ( number_alloc != task_count_accum ) {
std::cout << " number_alloc( " << number_alloc << " )"
<< " != task_count_accum( " << task_count_accum << " )"
<< std::endl ;
}
//if ( number_alloc != task_count_accum ) {
// std::cout << " number_alloc( " << number_alloc << " )"
// << " != task_count_accum( " << task_count_accum << " )"
// << std::endl ;
//}
if ( fib_output != test_result ) {
std::cout << " answer( " << fib_output << " )"
@ -239,7 +240,7 @@ int main( int argc , char* argv[] )
<< std::endl ;
}
if ( fib_output != test_result || number_alloc != task_count_accum ) {
if ( fib_output != test_result) { // || number_alloc != task_count_accum ) {
printf(" TEST FAILED\n");
return -1;
}
@ -252,7 +253,7 @@ int main( int argc , char* argv[] )
Functor::FutureType ftmp =
Kokkos::host_spawn( Kokkos::TaskSingle( sched )
, Functor( sched , fib_input )
, Functor( fib_input )
);
Kokkos::wait( sched );

View File

@ -61,6 +61,16 @@ IF(KOKKOS_LEGACY_TRIBITS)
#-----------------------------------------------------------------------------
FILE(GLOB HEADERS_HPX HPX/*.hpp)
FILE(GLOB SOURCES_HPX HPX/*.cpp)
LIST(APPEND HEADERS_PRIVATE ${HEADERS_HPX} )
LIST(APPEND SOURCES ${SOURCES_HPX} )
INSTALL(FILES ${HEADERS_HPX} DESTINATION ${TRILINOS_INCDIR}/HPX/)
#-----------------------------------------------------------------------------
FILE(GLOB HEADERS_CUDA Cuda/*.hpp)
FILE(GLOB SOURCES_CUDA Cuda/*.cpp)

View File

@ -1,419 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CUDAEXEC_HPP
#define KOKKOS_CUDAEXEC_HPP
#include <Kokkos_Macros.hpp>
#ifdef KOKKOS_ENABLE_CUDA
#include <string>
#include <cstdint>
#include <Kokkos_Parallel.hpp>
#include <impl/Kokkos_Error.hpp>
#include <Cuda/Kokkos_Cuda_abort.hpp>
#include <Cuda/Kokkos_Cuda_Error.hpp>
#include <Cuda/Kokkos_Cuda_Locks.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
struct CudaTraits {
enum { WarpSize = 32 /* 0x0020 */ };
enum { WarpIndexMask = 0x001f /* Mask for warpindex */ };
enum { WarpIndexShift = 5 /* WarpSize == 1 << WarpIndexShift */ };
enum { SharedMemoryBanks = 32 /* Compute device 2.0 */ };
enum { SharedMemoryCapacity = 0x0C000 /* 48k shared / 16k L1 Cache */ };
enum { SharedMemoryUsage = 0x04000 /* 16k shared / 48k L1 Cache */ };
enum { UpperBoundGridCount = 65535 /* Hard upper bound */ };
enum { ConstantMemoryCapacity = 0x010000 /* 64k bytes */ };
enum { ConstantMemoryUsage = 0x008000 /* 32k bytes */ };
enum { ConstantMemoryCache = 0x002000 /* 8k bytes */ };
typedef unsigned long
ConstantGlobalBufferType[ ConstantMemoryUsage / sizeof(unsigned long) ];
enum { ConstantMemoryUseThreshold = 0x000200 /* 512 bytes */ };
KOKKOS_INLINE_FUNCTION static
CudaSpace::size_type warp_count( CudaSpace::size_type i )
{ return ( i + WarpIndexMask ) >> WarpIndexShift ; }
KOKKOS_INLINE_FUNCTION static
CudaSpace::size_type warp_align( CudaSpace::size_type i )
{
enum { Mask = ~CudaSpace::size_type( WarpIndexMask ) };
return ( i + WarpIndexMask ) & Mask ;
}
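// Worked examples: warp_count(33) = (33 + 31) >> 5 = 2 warps, and
// warp_align(33) = (33 + 31) & ~31 = 64, the next multiple of WarpSize.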
};
//----------------------------------------------------------------------------
CudaSpace::size_type cuda_internal_multiprocessor_count();
CudaSpace::size_type cuda_internal_maximum_warp_count();
CudaSpace::size_type cuda_internal_maximum_grid_count();
CudaSpace::size_type cuda_internal_maximum_shared_words();
CudaSpace::size_type cuda_internal_maximum_concurrent_block_count();
CudaSpace::size_type * cuda_internal_scratch_flags( const CudaSpace::size_type size );
CudaSpace::size_type * cuda_internal_scratch_space( const CudaSpace::size_type size );
CudaSpace::size_type * cuda_internal_scratch_unified( const CudaSpace::size_type size );
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#if defined( __CUDACC__ )
/** \brief Access to constant memory on the device */
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
__device__ __constant__
extern unsigned long kokkos_impl_cuda_constant_memory_buffer[] ;
#else
__device__ __constant__
unsigned long kokkos_impl_cuda_constant_memory_buffer[ Kokkos::Impl::CudaTraits::ConstantMemoryUsage / sizeof(unsigned long) ] ;
#endif
namespace Kokkos {
namespace Impl {
void* cuda_resize_scratch_space(std::int64_t bytes, bool force_shrink = false);
}
}
template< typename T >
inline
__device__
T * kokkos_impl_cuda_shared_memory()
{ extern __shared__ Kokkos::CudaSpace::size_type sh[]; return (T*) sh ; }
namespace Kokkos {
namespace Impl {
//----------------------------------------------------------------------------
// See section B.17 of Cuda C Programming Guide Version 3.2
// for discussion of
// __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
// function qualifier which could be used to improve performance.
//----------------------------------------------------------------------------
// Maximize L1 cache and minimize shared memory:
// cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1 );
// For 2.0 capability: 48 KB L1 and 16 KB shared
//----------------------------------------------------------------------------
template< class DriverType>
__global__
static void cuda_parallel_launch_constant_memory()
{
const DriverType & driver =
*((const DriverType *) kokkos_impl_cuda_constant_memory_buffer );
driver();
}
template< class DriverType, unsigned int maxTperB, unsigned int minBperSM >
__global__
__launch_bounds__(maxTperB, minBperSM)
static void cuda_parallel_launch_constant_memory()
{
const DriverType & driver =
*((const DriverType *) kokkos_impl_cuda_constant_memory_buffer );
driver();
}
template< class DriverType>
__global__
static void cuda_parallel_launch_local_memory( const DriverType driver )
{
driver();
}
template< class DriverType, unsigned int maxTperB, unsigned int minBperSM >
__global__
__launch_bounds__(maxTperB, minBperSM)
static void cuda_parallel_launch_local_memory( const DriverType driver )
{
driver();
}
template < class DriverType
, class LaunchBounds = Kokkos::LaunchBounds<>
, bool Large = ( CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType) ) >
struct CudaParallelLaunch ;
template < class DriverType
, unsigned int MaxThreadsPerBlock
, unsigned int MinBlocksPerSM >
struct CudaParallelLaunch< DriverType
, Kokkos::LaunchBounds< MaxThreadsPerBlock
, MinBlocksPerSM >
, true >
{
inline
CudaParallelLaunch( const DriverType & driver
, const dim3 & grid
, const dim3 & block
, const int shmem
, const cudaStream_t stream = 0 )
{
if ( (grid.x != 0) && ( ( block.x * block.y * block.z ) != 0 ) ) {
if ( sizeof( Kokkos::Impl::CudaTraits::ConstantGlobalBufferType ) <
sizeof( DriverType ) ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: Functor is too large") );
}
// Fence before changing settings and copying closure
Kokkos::Cuda::fence();
if ( CudaTraits::SharedMemoryCapacity < shmem ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
}
#ifndef KOKKOS_ARCH_KEPLER
// On Kepler the L1 has no benefit since it doesn't cache reads
else {
CUDA_SAFE_CALL(
cudaFuncSetCacheConfig
( cuda_parallel_launch_constant_memory
< DriverType, MaxThreadsPerBlock, MinBlocksPerSM >
, ( shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1 )
) );
}
#endif
// Copy functor to constant memory on the device
cudaMemcpyToSymbol(
kokkos_impl_cuda_constant_memory_buffer, &driver, sizeof(DriverType) );
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
// Invoke the driver function on the device
cuda_parallel_launch_constant_memory
< DriverType, MaxThreadsPerBlock, MinBlocksPerSM >
<<< grid , block , shmem , stream >>>();
#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
CUDA_SAFE_CALL( cudaGetLastError() );
Kokkos::Cuda::fence();
#endif
}
}
};
template < class DriverType >
struct CudaParallelLaunch< DriverType
, Kokkos::LaunchBounds<>
, true >
{
inline
CudaParallelLaunch( const DriverType & driver
, const dim3 & grid
, const dim3 & block
, const int shmem
, const cudaStream_t stream = 0 )
{
if ( (grid.x != 0) && ( ( block.x * block.y * block.z ) != 0 ) ) {
if ( sizeof( Kokkos::Impl::CudaTraits::ConstantGlobalBufferType ) <
sizeof( DriverType ) ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: Functor is too large") );
}
// Fence before changing settings and copying closure
Kokkos::Cuda::fence();
if ( CudaTraits::SharedMemoryCapacity < shmem ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
}
#ifndef KOKKOS_ARCH_KEPLER
// On Kepler the L1 has no benefit since it doesn't cache reads
else {
CUDA_SAFE_CALL(
cudaFuncSetCacheConfig
( cuda_parallel_launch_constant_memory< DriverType >
, ( shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1 )
) );
}
#endif
// Copy functor to constant memory on the device
cudaMemcpyToSymbol(
kokkos_impl_cuda_constant_memory_buffer, &driver, sizeof(DriverType) );
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
// Invoke the driver function on the device
cuda_parallel_launch_constant_memory< DriverType >
<<< grid , block , shmem , stream >>>();
#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
CUDA_SAFE_CALL( cudaGetLastError() );
Kokkos::Cuda::fence();
#endif
}
}
};
template < class DriverType
, unsigned int MaxThreadsPerBlock
, unsigned int MinBlocksPerSM >
struct CudaParallelLaunch< DriverType
, Kokkos::LaunchBounds< MaxThreadsPerBlock
, MinBlocksPerSM >
, false >
{
inline
CudaParallelLaunch( const DriverType & driver
, const dim3 & grid
, const dim3 & block
, const int shmem
, const cudaStream_t stream = 0 )
{
if ( (grid.x != 0) && ( ( block.x * block.y * block.z ) != 0 ) ) {
if ( sizeof( Kokkos::Impl::CudaTraits::ConstantGlobalBufferType ) <
sizeof( DriverType ) ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: Functor is too large") );
}
if ( CudaTraits::SharedMemoryCapacity < shmem ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
}
#ifndef KOKKOS_ARCH_KEPLER
// On Kepler the L1 has no benefit since it doesn't cache reads
else {
CUDA_SAFE_CALL(
cudaFuncSetCacheConfig
( cuda_parallel_launch_local_memory
< DriverType, MaxThreadsPerBlock, MinBlocksPerSM >
, ( shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1 )
) );
}
#endif
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
// Invoke the driver function on the device
cuda_parallel_launch_local_memory
< DriverType, MaxThreadsPerBlock, MinBlocksPerSM >
<<< grid , block , shmem , stream >>>( driver );
#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
CUDA_SAFE_CALL( cudaGetLastError() );
Kokkos::Cuda::fence();
#endif
}
}
};
template < class DriverType >
struct CudaParallelLaunch< DriverType
, Kokkos::LaunchBounds<>
, false >
{
inline
CudaParallelLaunch( const DriverType & driver
, const dim3 & grid
, const dim3 & block
, const int shmem
, const cudaStream_t stream = 0 )
{
if ( (grid.x != 0) && ( ( block.x * block.y * block.z ) != 0 ) ) {
if ( sizeof( Kokkos::Impl::CudaTraits::ConstantGlobalBufferType ) <
sizeof( DriverType ) ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: Functor is too large") );
}
if ( CudaTraits::SharedMemoryCapacity < shmem ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
}
#ifndef KOKKOS_ARCH_KEPLER
// On Kepler the L1 has no benefit since it doesn't cache reads
else {
CUDA_SAFE_CALL(
cudaFuncSetCacheConfig
( cuda_parallel_launch_local_memory< DriverType >
, ( shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1 )
) );
}
#endif
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
// Invoke the driver function on the device
cuda_parallel_launch_local_memory< DriverType >
<<< grid , block , shmem , stream >>>( driver );
#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
CUDA_SAFE_CALL( cudaGetLastError() );
Kokkos::Cuda::fence();
#endif
}
}
};
//----------------------------------------------------------------------------
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* defined( __CUDACC__ ) */
#endif /* defined( KOKKOS_ENABLE_CUDA ) */
#endif /* #ifndef KOKKOS_CUDAEXEC_HPP */
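For orientation, the four CudaParallelLaunch specializations deleted above all encoded one dispatch rule; a condensed sketch using the header's own names (error checks and cache-config calls omitted):
template <class Driver>
void launch_sketch(const Driver& driver, dim3 grid, dim3 block, int shmem) {
  if (sizeof(Driver) > Kokkos::Impl::CudaTraits::ConstantMemoryUseThreshold) {
    // Large functors: copy the closure to constant memory (after a fence),
    // then launch a kernel that reads it back from the buffer.
    Kokkos::Cuda::fence();
    cudaMemcpyToSymbol(kokkos_impl_cuda_constant_memory_buffer,
                       &driver, sizeof(Driver));
    cuda_parallel_launch_constant_memory<Driver><<<grid, block, shmem>>>();
  } else {
    // Small functors: pass the closure by value as a kernel argument.
    cuda_parallel_launch_local_memory<Driver><<<grid, block, shmem>>>(driver);
  }
}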

View File

@ -55,7 +55,7 @@
#include <Kokkos_Cuda.hpp>
#include <Kokkos_CudaSpace.hpp>
#include <Cuda/Kokkos_Cuda_Internal.hpp>
//#include <Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp>
#include <impl/Kokkos_Error.hpp>
#if defined(KOKKOS_ENABLE_PROFILING)
@ -183,7 +183,7 @@ void * CudaUVMSpace::allocate( const size_t arg_alloc_size ) const
enum { max_uvm_allocations = 65536 };
Cuda::fence();
Cuda::impl_static_fence();
if ( arg_alloc_size > 0 )
{
Kokkos::Impl::num_uvm_allocations++;
@ -194,7 +194,7 @@ void * CudaUVMSpace::allocate( const size_t arg_alloc_size ) const
CUDA_SAFE_CALL( cudaMallocManaged( &ptr, arg_alloc_size , cudaMemAttachGlobal ) );
}
Cuda::fence();
Cuda::impl_static_fence();
return ptr ;
}
@ -217,14 +217,14 @@ void CudaSpace::deallocate( void * const arg_alloc_ptr , const size_t /* arg_all
void CudaUVMSpace::deallocate( void * const arg_alloc_ptr , const size_t /* arg_alloc_size */ ) const
{
Cuda::fence();
Cuda::impl_static_fence();
try {
if ( arg_alloc_ptr != nullptr ) {
Kokkos::Impl::num_uvm_allocations--;
CUDA_SAFE_CALL( cudaFree( arg_alloc_ptr ) );
}
} catch(...) {}
Cuda::fence();
Cuda::impl_static_fence();
}
void CudaHostPinnedSpace::deallocate( void * const arg_alloc_ptr , const size_t /* arg_alloc_size */ ) const
@ -390,7 +390,7 @@ SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::
{
#if defined(KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
Cuda::fence(); //Make sure I can access the label ...
Cuda::impl_static_fence(); //Make sure I can access the label ...
Kokkos::Profiling::deallocateData(
Kokkos::Profiling::SpaceHandle(Kokkos::CudaUVMSpace::name()),RecordBase::m_alloc_ptr->m_label,
data(),size());

View File

@ -0,0 +1,657 @@
/*
@HEADER
================================================================================
ORIGINAL LICENSE
----------------
Copyright (c) 2018, NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
================================================================================
LICENSE ASSOCIATED WITH SUBSEQUENT MODIFICATIONS
------------------------------------------------
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2019) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
@HEADER
*/
#include <Kokkos_Macros.hpp>
#if defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA_ASM_ATOMICS)
#include <cassert>
#ifndef _SIMT_DETAILS_CONFIG
#define _SIMT_DETAILS_CONFIG
namespace Kokkos {
namespace Impl {
#ifndef __simt_scope
// Modification: Kokkos GPU atomics should default to `gpu` scope
#define __simt_scope "gpu"
#endif
#define __simt_fence_signal_() asm volatile("":::"memory")
#define __simt_fence_sc_() asm volatile("fence.sc." __simt_scope ";":::"memory")
#define __simt_fence_() asm volatile("fence." __simt_scope ";":::"memory")
#define __simt_load_acquire_8_as_32(ptr,ret) asm volatile("ld.acquire." __simt_scope ".b8 %0, [%1];" : "=r"(ret) : "l"(ptr) : "memory")
#define __simt_load_relaxed_8_as_32(ptr,ret) asm volatile("ld.relaxed." __simt_scope ".b8 %0, [%1];" : "=r"(ret) : "l"(ptr) : "memory")
#define __simt_store_release_8_as_32(ptr,desired) asm volatile("st.release." __simt_scope ".b8 [%0], %1;" :: "l"(ptr), "r"(desired) : "memory")
#define __simt_store_relaxed_8_as_32(ptr,desired) asm volatile("st.relaxed." __simt_scope ".b8 [%0], %1;" :: "l"(ptr), "r"(desired) : "memory")
#define __simt_load_acquire_16(ptr,ret) asm volatile("ld.acquire." __simt_scope ".b16 %0, [%1];" : "=h"(ret) : "l"(ptr) : "memory")
#define __simt_load_relaxed_16(ptr,ret) asm volatile("ld.relaxed." __simt_scope ".b16 %0, [%1];" : "=h"(ret) : "l"(ptr) : "memory")
#define __simt_store_release_16(ptr,desired) asm volatile("st.release." __simt_scope ".b16 [%0], %1;" :: "l"(ptr), "h"(desired) : "memory")
#define __simt_store_relaxed_16(ptr,desired) asm volatile("st.relaxed." __simt_scope ".b16 [%0], %1;" :: "l"(ptr), "h"(desired) : "memory")
#define __simt_load_acquire_32(ptr,ret) asm volatile("ld.acquire." __simt_scope ".b32 %0, [%1];" : "=r"(ret) : "l"(ptr) : "memory")
#define __simt_load_relaxed_32(ptr,ret) asm volatile("ld.relaxed." __simt_scope ".b32 %0, [%1];" : "=r"(ret) : "l"(ptr) : "memory")
#define __simt_store_release_32(ptr,desired) asm volatile("st.release." __simt_scope ".b32 [%0], %1;" :: "l"(ptr), "r"(desired) : "memory")
#define __simt_store_relaxed_32(ptr,desired) asm volatile("st.relaxed." __simt_scope ".b32 [%0], %1;" :: "l"(ptr), "r"(desired) : "memory")
#define __simt_exch_release_32(ptr,old,desired) asm volatile("atom.exch.release." __simt_scope ".b32 %0, [%1], %2;" : "=r"(old) : "l"(ptr), "r"(desired) : "memory")
#define __simt_exch_acquire_32(ptr,old,desired) asm volatile("atom.exch.acquire." __simt_scope ".b32 %0, [%1], %2;" : "=r"(old) : "l"(ptr), "r"(desired) : "memory")
#define __simt_exch_acq_rel_32(ptr,old,desired) asm volatile("atom.exch.acq_rel." __simt_scope ".b32 %0, [%1], %2;" : "=r"(old) : "l"(ptr), "r"(desired) : "memory")
#define __simt_exch_relaxed_32(ptr,old,desired) asm volatile("atom.exch.relaxed." __simt_scope ".b32 %0, [%1], %2;" : "=r"(old) : "l"(ptr), "r"(desired) : "memory")
#define __simt_cas_release_32(ptr,old,expected,desired) asm volatile("atom.cas.release." __simt_scope ".b32 %0, [%1], %2, %3;" : "=r"(old) : "l"(ptr), "r"(expected), "r"(desired) : "memory")
#define __simt_cas_acquire_32(ptr,old,expected,desired) asm volatile("atom.cas.acquire." __simt_scope ".b32 %0, [%1], %2, %3;" : "=r"(old) : "l"(ptr), "r"(expected), "r"(desired) : "memory")
#define __simt_cas_acq_rel_32(ptr,old,expected,desired) asm volatile("atom.cas.acq_rel." __simt_scope ".b32 %0, [%1], %2, %3;" : "=r"(old) : "l"(ptr), "r"(expected), "r"(desired) : "memory")
#define __simt_cas_relaxed_32(ptr,old,expected,desired) asm volatile("atom.cas.relaxed." __simt_scope ".b32 %0, [%1], %2, %3;" : "=r"(old) : "l"(ptr), "r"(expected), "r"(desired) : "memory")
#define __simt_add_release_32(ptr,old,addend) asm volatile("atom.add.release." __simt_scope ".u32 %0, [%1], %2;" : "=r"(old) : "l"(ptr), "r"(addend) : "memory")
#define __simt_add_acquire_32(ptr,old,addend) asm volatile("atom.add.acquire." __simt_scope ".u32 %0, [%1], %2;" : "=r"(old) : "l"(ptr), "r"(addend) : "memory")
#define __simt_add_acq_rel_32(ptr,old,addend) asm volatile("atom.add.acq_rel." __simt_scope ".u32 %0, [%1], %2;" : "=r"(old) : "l"(ptr), "r"(addend) : "memory")
#define __simt_add_relaxed_32(ptr,old,addend) asm volatile("atom.add.relaxed." __simt_scope ".u32 %0, [%1], %2;" : "=r"(old) : "l"(ptr), "r"(addend) : "memory")
#define __simt_and_release_32(ptr,old,andend) asm volatile("atom.and.release." __simt_scope ".b32 %0, [%1], %2;" : "=r"(old) : "l"(ptr), "r"(andend) : "memory")
#define __simt_and_acquire_32(ptr,old,andend) asm volatile("atom.and.acquire." __simt_scope ".b32 %0, [%1], %2;" : "=r"(old) : "l"(ptr), "r"(andend) : "memory")
#define __simt_and_acq_rel_32(ptr,old,andend) asm volatile("atom.and.acq_rel." __simt_scope ".b32 %0, [%1], %2;" : "=r"(old) : "l"(ptr), "r"(andend) : "memory")
#define __simt_and_relaxed_32(ptr,old,andend) asm volatile("atom.and.relaxed." __simt_scope ".b32 %0, [%1], %2;" : "=r"(old) : "l"(ptr), "r"(andend) : "memory")
#define __simt_or_release_32(ptr,old,orend) asm volatile("atom.or.release." __simt_scope ".b32 %0, [%1], %2;" : "=r"(old) : "l"(ptr), "r"(orend) : "memory")
#define __simt_or_acquire_32(ptr,old,orend) asm volatile("atom.or.acquire." __simt_scope ".b32 %0, [%1], %2;" : "=r"(old) : "l"(ptr), "r"(orend) : "memory")
#define __simt_or_acq_rel_32(ptr,old,orend) asm volatile("atom.or.acq_rel." __simt_scope ".b32 %0, [%1], %2;" : "=r"(old) : "l"(ptr), "r"(orend) : "memory")
#define __simt_or_relaxed_32(ptr,old,orend) asm volatile("atom.or.relaxed." __simt_scope ".b32 %0, [%1], %2;" : "=r"(old) : "l"(ptr), "r"(orend) : "memory")
#define __simt_xor_release_32(ptr,old,xorend) asm volatile("atom.xor.release." __simt_scope ".b32 %0, [%1], %2;" : "=r"(old) : "l"(ptr), "r"(xorend) : "memory")
#define __simt_xor_acquire_32(ptr,old,xorend) asm volatile("atom.xor.acquire." __simt_scope ".b32 %0, [%1], %2;" : "=r"(old) : "l"(ptr), "r"(xorend) : "memory")
#define __simt_xor_acq_rel_32(ptr,old,xorend) asm volatile("atom.xor.acq_rel." __simt_scope ".b32 %0, [%1], %2;" : "=r"(old) : "l"(ptr), "r"(xorend) : "memory")
#define __simt_xor_relaxed_32(ptr,old,xorend) asm volatile("atom.xor.relaxed." __simt_scope ".b32 %0, [%1], %2;" : "=r"(old) : "l"(ptr), "r"(xorend) : "memory")
#define __simt_load_acquire_64(ptr,ret) asm volatile("ld.acquire." __simt_scope ".b64 %0, [%1];" : "=l"(ret) : "l"(ptr) : "memory")
#define __simt_load_relaxed_64(ptr,ret) asm volatile("ld.relaxed." __simt_scope ".b64 %0, [%1];" : "=l"(ret) : "l"(ptr) : "memory")
#define __simt_store_release_64(ptr,desired) asm volatile("st.release." __simt_scope ".b64 [%0], %1;" :: "l"(ptr), "l"(desired) : "memory")
#define __simt_store_relaxed_64(ptr,desired) asm volatile("st.relaxed." __simt_scope ".b64 [%0], %1;" :: "l"(ptr), "l"(desired) : "memory")
#define __simt_exch_release_64(ptr,old,desired) asm volatile("atom.exch.release." __simt_scope ".b64 %0, [%1], %2;" : "=l"(old) : "l"(ptr), "l"(desired) : "memory")
#define __simt_exch_acquire_64(ptr,old,desired) asm volatile("atom.exch.acquire." __simt_scope ".b64 %0, [%1], %2;" : "=l"(old) : "l"(ptr), "l"(desired) : "memory")
#define __simt_exch_acq_rel_64(ptr,old,desired) asm volatile("atom.exch.acq_rel." __simt_scope ".b64 %0, [%1], %2;" : "=l"(old) : "l"(ptr), "l"(desired) : "memory")
#define __simt_exch_relaxed_64(ptr,old,desired) asm volatile("atom.exch.relaxed." __simt_scope ".b64 %0, [%1], %2;" : "=l"(old) : "l"(ptr), "l"(desired) : "memory")
#define __simt_cas_release_64(ptr,old,expected,desired) asm volatile("atom.cas.release." __simt_scope ".b64 %0, [%1], %2, %3;" : "=l"(old) : "l"(ptr), "l"(expected), "l"(desired) : "memory")
#define __simt_cas_acquire_64(ptr,old,expected,desired) asm volatile("atom.cas.acquire." __simt_scope ".b64 %0, [%1], %2, %3;" : "=l"(old) : "l"(ptr), "l"(expected), "l"(desired) : "memory")
#define __simt_cas_acq_rel_64(ptr,old,expected,desired) asm volatile("atom.cas.acq_rel." __simt_scope ".b64 %0, [%1], %2, %3;" : "=l"(old) : "l"(ptr), "l"(expected), "l"(desired) : "memory")
#define __simt_cas_relaxed_64(ptr,old,expected,desired) asm volatile("atom.cas.relaxed." __simt_scope ".b64 %0, [%1], %2, %3;" : "=l"(old) : "l"(ptr), "l"(expected), "l"(desired) : "memory")
#define __simt_add_release_64(ptr,old,addend) asm volatile("atom.add.release." __simt_scope ".u64 %0, [%1], %2;" : "=l"(old) : "l"(ptr), "l"(addend) : "memory")
#define __simt_add_acquire_64(ptr,old,addend) asm volatile("atom.add.acquire." __simt_scope ".u64 %0, [%1], %2;" : "=l"(old) : "l"(ptr), "l"(addend) : "memory")
#define __simt_add_acq_rel_64(ptr,old,addend) asm volatile("atom.add.acq_rel." __simt_scope ".u64 %0, [%1], %2;" : "=l"(old) : "l"(ptr), "l"(addend) : "memory")
#define __simt_add_relaxed_64(ptr,old,addend) asm volatile("atom.add.relaxed." __simt_scope ".u64 %0, [%1], %2;" : "=l"(old) : "l"(ptr), "l"(addend) : "memory")
#define __simt_and_release_64(ptr,old,andend) asm volatile("atom.and.release." __simt_scope ".b64 %0, [%1], %2;" : "=l"(old) : "l"(ptr), "l"(andend) : "memory")
#define __simt_and_acquire_64(ptr,old,andend) asm volatile("atom.and.acquire." __simt_scope ".b64 %0, [%1], %2;" : "=l"(old) : "l"(ptr), "l"(andend) : "memory")
#define __simt_and_acq_rel_64(ptr,old,andend) asm volatile("atom.and.acq_rel." __simt_scope ".b64 %0, [%1], %2;" : "=l"(old) : "l"(ptr), "l"(andend) : "memory")
#define __simt_and_relaxed_64(ptr,old,andend) asm volatile("atom.and.relaxed." __simt_scope ".b64 %0, [%1], %2;" : "=l"(old) : "l"(ptr), "l"(andend) : "memory")
#define __simt_or_release_64(ptr,old,orend) asm volatile("atom.or.release." __simt_scope ".b64 %0, [%1], %2;" : "=l"(old) : "l"(ptr), "l"(orend) : "memory")
#define __simt_or_acquire_64(ptr,old,orend) asm volatile("atom.or.acquire." __simt_scope ".b64 %0, [%1], %2;" : "=l"(old) : "l"(ptr), "l"(orend) : "memory")
#define __simt_or_acq_rel_64(ptr,old,orend) asm volatile("atom.or.acq_rel." __simt_scope ".b64 %0, [%1], %2;" : "=l"(old) : "l"(ptr), "l"(orend) : "memory")
#define __simt_or_relaxed_64(ptr,old,orend) asm volatile("atom.or.relaxed." __simt_scope ".b64 %0, [%1], %2;" : "=l"(old) : "l"(ptr), "l"(orend) : "memory")
#define __simt_xor_release_64(ptr,old,xorend) asm volatile("atom.xor.release." __simt_scope ".b64 %0, [%1], %2;" : "=l"(old) : "l"(ptr), "l"(xorend) : "memory")
#define __simt_xor_acquire_64(ptr,old,xorend) asm volatile("atom.xor.acquire." __simt_scope ".b64 %0, [%1], %2;" : "=l"(old) : "l"(ptr), "l"(xorend) : "memory")
#define __simt_xor_acq_rel_64(ptr,old,xorend) asm volatile("atom.xor.acq_rel." __simt_scope ".b64 %0, [%1], %2;" : "=l"(old) : "l"(ptr), "l"(xorend) : "memory")
#define __simt_xor_relaxed_64(ptr,old,xorend) asm volatile("atom.xor.relaxed." __simt_scope ".b64 %0, [%1], %2;" : "=l"(old) : "l"(ptr), "l"(xorend) : "memory")
#define __simt_nanosleep(timeout) asm volatile("nanosleep.u32 %0;" :: "r"(unsigned(timeout)) : )
/*
definitions
*/
#ifndef __GCC_ATOMIC_BOOL_LOCK_FREE
#define __GCC_ATOMIC_BOOL_LOCK_FREE 2
#define __GCC_ATOMIC_CHAR_LOCK_FREE 2
#define __GCC_ATOMIC_CHAR16_T_LOCK_FREE 2
#define __GCC_ATOMIC_CHAR32_T_LOCK_FREE 2
#define __GCC_ATOMIC_WCHAR_T_LOCK_FREE 2
#define __GCC_ATOMIC_SHORT_LOCK_FREE 2
#define __GCC_ATOMIC_INT_LOCK_FREE 2
#define __GCC_ATOMIC_LONG_LOCK_FREE 2
#define __GCC_ATOMIC_LLONG_LOCK_FREE 2
#define __GCC_ATOMIC_POINTER_LOCK_FREE 2
#endif
#ifndef __ATOMIC_RELAXED
#define __ATOMIC_RELAXED 0
#define __ATOMIC_CONSUME 1
#define __ATOMIC_ACQUIRE 2
#define __ATOMIC_RELEASE 3
#define __ATOMIC_ACQ_REL 4
#define __ATOMIC_SEQ_CST 5
#endif
inline __device__ int __stronger_order_simt_(int a, int b) {
if (b == __ATOMIC_SEQ_CST) return __ATOMIC_SEQ_CST;
if (b == __ATOMIC_RELAXED) return a;
switch (a) {
case __ATOMIC_SEQ_CST:
case __ATOMIC_ACQ_REL: return a;
case __ATOMIC_CONSUME:
case __ATOMIC_ACQUIRE: if (b != __ATOMIC_ACQUIRE) return __ATOMIC_ACQ_REL; else return __ATOMIC_ACQUIRE;
case __ATOMIC_RELEASE: if (b != __ATOMIC_RELEASE) return __ATOMIC_ACQ_REL; else return __ATOMIC_RELEASE;
case __ATOMIC_RELAXED: return b;
default: assert(0);
}
return __ATOMIC_SEQ_CST;
}
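// Example: __stronger_order_simt_(__ATOMIC_RELEASE, __ATOMIC_ACQUIRE)
// returns __ATOMIC_ACQ_REL, the weakest order at least as strong as both
// the success and failure orderings of a compare-exchange.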
/*
base
*/
#define DO__atomic_load_simt_(bytes, bits) \
template<class type, typename std::enable_if<sizeof(type)==bytes, int>::type = 0> \
void __device__ __atomic_load_simt_ (const type *ptr, type *ret, int memorder) { \
int##bits##_t tmp = 0; \
switch (memorder) { \
case __ATOMIC_SEQ_CST: __simt_fence_sc_(); \
case __ATOMIC_CONSUME: \
case __ATOMIC_ACQUIRE: __simt_load_acquire_##bits(ptr, tmp); break; \
case __ATOMIC_RELAXED: __simt_load_relaxed_##bits(ptr, tmp); break; \
default: assert(0); \
} \
memcpy(ret, &tmp, bytes); \
}
DO__atomic_load_simt_(1,32)
DO__atomic_load_simt_(2,16)
DO__atomic_load_simt_(4,32)
DO__atomic_load_simt_(8,64)
template<class type>
type __device__ __atomic_load_n_simt_(const type *ptr, int memorder) {
type ret;
__atomic_load_simt_(ptr, &ret, memorder);
return ret;
}
#define DO__atomic_store_simt_(bytes, bits) \
template<class type, typename std::enable_if<sizeof(type)==bytes, int>::type = 0> \
void __device__ __atomic_store_simt_ (type *ptr, type *val, int memorder) { \
int##bits##_t tmp = 0; \
memcpy(&tmp, val, bytes); \
switch (memorder) { \
case __ATOMIC_RELEASE: __simt_store_release_##bits(ptr, tmp); break; \
case __ATOMIC_SEQ_CST: __simt_fence_sc_(); \
case __ATOMIC_RELAXED: __simt_store_relaxed_##bits(ptr, tmp); break; \
default: assert(0); \
} \
}
DO__atomic_store_simt_(1,32)
DO__atomic_store_simt_(2,16)
DO__atomic_store_simt_(4,32)
DO__atomic_store_simt_(8,64)
template<class type>
void __device__ __atomic_store_n_simt_(type *ptr, type val, int memorder) {
__atomic_store_simt_(ptr, &val, memorder);
}
#define DO__atomic_compare_exchange_simt_(bytes, bits) \
template<class type, typename std::enable_if<sizeof(type)==bytes, int>::type = 0> \
bool __device__ __atomic_compare_exchange_simt_ (type *ptr, type *expected, const type *desired, bool, int success_memorder, int failure_memorder) { \
int##bits##_t tmp = 0, old = 0, old_tmp; \
memcpy(&tmp, desired, bytes); \
memcpy(&old, expected, bytes); \
old_tmp = old; \
switch (__stronger_order_simt_(success_memorder, failure_memorder)) { \
case __ATOMIC_SEQ_CST: __simt_fence_sc_(); \
case __ATOMIC_CONSUME: \
case __ATOMIC_ACQUIRE: __simt_cas_acquire_##bits(ptr, old, old_tmp, tmp); break; \
case __ATOMIC_ACQ_REL: __simt_cas_acq_rel_##bits(ptr, old, old_tmp, tmp); break; \
case __ATOMIC_RELEASE: __simt_cas_release_##bits(ptr, old, old_tmp, tmp); break; \
case __ATOMIC_RELAXED: __simt_cas_relaxed_##bits(ptr, old, old_tmp, tmp); break; \
default: assert(0); \
} \
bool const ret = old == old_tmp; \
memcpy(expected, &old, bytes); \
return ret; \
}
DO__atomic_compare_exchange_simt_(4, 32)
DO__atomic_compare_exchange_simt_(8, 64)
template<class type, typename std::enable_if<sizeof(type) <= 2, int>::type = 0>
bool __device__ __atomic_compare_exchange_simt_(type *ptr, type *expected, const type *desired, bool, int success_memorder, int failure_memorder) {
using R = typename std::conditional<std::is_volatile<type>::value, volatile uint32_t, uint32_t>::type;
auto const aligned = (R*)((intptr_t)ptr & ~(sizeof(uint32_t) - 1));
auto const offset = uint32_t((intptr_t)ptr & (sizeof(uint32_t) - 1)) * 8;
auto const mask = ((1 << sizeof(type)*8) - 1) << offset;
uint32_t old = *expected << offset, old_value;
while (1) {
old_value = (old & mask) >> offset;
if (old_value != *expected)
break;
uint32_t const attempt = (old & ~mask) | (*desired << offset);
if (__atomic_compare_exchange_simt_ (aligned, &old, &attempt, true, success_memorder, failure_memorder))
return true;
}
*expected = old_value;
return false;
}
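// Worked example of the sub-word emulation above: for a 1-byte type at
// byte 1 of its 32-bit word, offset = 8 and mask = 0xFF00; the loop CASes
// the full aligned word, changing only the masked byte, and retries when
// a neighboring byte was modified concurrently by another thread.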
template<class type>
bool __device__ __atomic_compare_exchange_n_simt_(type *ptr, type *expected, type desired, bool weak, int success_memorder, int failure_memorder) {
return __atomic_compare_exchange_simt_(ptr, expected, &desired, weak, success_memorder, failure_memorder);
}
#define DO__atomic_exchange_simt_(bytes, bits) \
template<class type, typename std::enable_if<sizeof(type)==bytes, int>::type = 0> \
void __device__ __atomic_exchange_simt_ (type *ptr, type *val, type *ret, int memorder) { \
int##bits##_t tmp = 0; \
memcpy(&tmp, val, bytes); \
switch (memorder) { \
case __ATOMIC_SEQ_CST: __simt_fence_sc_(); \
case __ATOMIC_CONSUME: \
case __ATOMIC_ACQUIRE: __simt_exch_acquire_##bits(ptr, tmp, tmp); break; \
case __ATOMIC_ACQ_REL: __simt_exch_acq_rel_##bits(ptr, tmp, tmp); break; \
case __ATOMIC_RELEASE: __simt_exch_release_##bits(ptr, tmp, tmp); break; \
case __ATOMIC_RELAXED: __simt_exch_relaxed_##bits(ptr, tmp, tmp); break; \
default: assert(0); \
} \
memcpy(ret, &tmp, bytes); \
}
DO__atomic_exchange_simt_(4,32)
DO__atomic_exchange_simt_(8,64)
template<class type, typename std::enable_if<sizeof(type)<=2, int>::type = 0>
void __device__ __atomic_exchange_simt_ (type *ptr, type *val, type *ret, int memorder) {
type expected = __atomic_load_n_simt_(ptr, __ATOMIC_RELAXED);
while(!__atomic_compare_exchange_simt_(ptr, &expected, val, true, memorder, memorder))
;
*ret = expected;
}
template<class type>
type __device__ __atomic_exchange_n_simt_(type *ptr, type val, int memorder) {
type ret;
__atomic_exchange_simt_(ptr, &val, &ret, memorder);
return ret;
}
#define DO__atomic_fetch_add_simt_(bytes, bits) \
template<class type, class delta, typename std::enable_if<sizeof(type)==bytes, int>::type = 0> \
type __device__ __atomic_fetch_add_simt_ (type *ptr, delta val, int memorder) { \
type ret; \
switch (memorder) { \
case __ATOMIC_SEQ_CST: __simt_fence_sc_(); \
case __ATOMIC_CONSUME: \
case __ATOMIC_ACQUIRE: __simt_add_acquire_##bits(ptr, ret, val); break; \
case __ATOMIC_ACQ_REL: __simt_add_acq_rel_##bits(ptr, ret, val); break; \
case __ATOMIC_RELEASE: __simt_add_release_##bits(ptr, ret, val); break; \
case __ATOMIC_RELAXED: __simt_add_relaxed_##bits(ptr, ret, val); break; \
default: assert(0); \
} \
return ret; \
}
DO__atomic_fetch_add_simt_(4, 32)
DO__atomic_fetch_add_simt_(8, 64)
template<class type, class delta, typename std::enable_if<sizeof(type)<=2, int>::type = 0>
type __device__ __atomic_fetch_add_simt_ (type *ptr, delta val, int memorder) {
type expected = __atomic_load_n_simt_(ptr, __ATOMIC_RELAXED);
type const desired = expected + val;
while(!__atomic_compare_exchange_simt_(ptr, &expected, &desired, true, memorder, memorder))
;
return expected;
}
#define DO__atomic_fetch_sub_simt_(bytes, bits) \
template<class type, class delta, typename std::enable_if<sizeof(type)==bytes, int>::type = 0> \
type __device__ __atomic_fetch_sub_simt_ (type *ptr, delta val, int memorder) { \
type ret; \
switch (memorder) { \
case __ATOMIC_SEQ_CST: __simt_fence_sc_(); \
case __ATOMIC_CONSUME: \
case __ATOMIC_ACQUIRE: __simt_add_acquire_##bits(ptr, ret, -val); break; \
case __ATOMIC_ACQ_REL: __simt_add_acq_rel_##bits(ptr, ret, -val); break; \
case __ATOMIC_RELEASE: __simt_add_release_##bits(ptr, ret, -val); break; \
case __ATOMIC_RELAXED: __simt_add_relaxed_##bits(ptr, ret, -val); break; \
default: assert(0); \
} \
return ret; \
}
DO__atomic_fetch_sub_simt_(4,32)
DO__atomic_fetch_sub_simt_(8,64)
template<class type, class delta, typename std::enable_if<sizeof(type)<=2, int>::type = 0>
type __device__ __atomic_fetch_sub_simt_ (type *ptr, delta val, int memorder) {
type expected = __atomic_load_n_simt_(ptr, __ATOMIC_RELAXED);
type const desired = expected - val;
while(!__atomic_compare_exchange_simt_(ptr, &expected, &desired, true, memorder, memorder))
;
return expected;
}
#define DO__atomic_fetch_and_simt_(bytes, bits) \
template<class type, typename std::enable_if<sizeof(type)==bytes, int>::type = 0> \
type __device__ __atomic_fetch_and_simt_ (type *ptr, type val, int memorder) { \
type ret; \
switch (memorder) { \
case __ATOMIC_SEQ_CST: __simt_fence_sc_(); \
case __ATOMIC_CONSUME: \
case __ATOMIC_ACQUIRE: __simt_and_acquire_##bits(ptr, ret, val); break; \
case __ATOMIC_ACQ_REL: __simt_and_acq_rel_##bits(ptr, ret, val); break; \
case __ATOMIC_RELEASE: __simt_and_release_##bits(ptr, ret, val); break; \
case __ATOMIC_RELAXED: __simt_and_relaxed_##bits(ptr, ret, val); break; \
default: assert(0); \
} \
return ret; \
}
DO__atomic_fetch_and_simt_(4,32)
DO__atomic_fetch_and_simt_(8,64)
template<class type, class delta, typename std::enable_if<sizeof(type)<=2, int>::type = 0>
type __device__ __atomic_fetch_and_simt_ (type *ptr, delta val, int memorder) {
type expected = __atomic_load_n_simt_(ptr, __ATOMIC_RELAXED);
type const desired = expected & val;
while(!__atomic_compare_exchange_simt_(ptr, &expected, &desired, true, memorder, memorder))
;
return expected;
}
#define DO__atomic_fetch_xor_simt_(bytes, bits) \
template<class type, typename std::enable_if<sizeof(type)==bytes, int>::type = 0> \
type __device__ __atomic_fetch_xor_simt_ (type *ptr, type val, int memorder) { \
type ret; \
switch (memorder) { \
case __ATOMIC_SEQ_CST: __simt_fence_sc_(); \
case __ATOMIC_CONSUME: \
case __ATOMIC_ACQUIRE: __simt_xor_acquire_##bits(ptr, ret, val); break; \
case __ATOMIC_ACQ_REL: __simt_xor_acq_rel_##bits(ptr, ret, val); break; \
case __ATOMIC_RELEASE: __simt_xor_release_##bits(ptr, ret, val); break; \
case __ATOMIC_RELAXED: __simt_xor_relaxed_##bits(ptr, ret, val); break; \
default: assert(0); \
} \
return ret; \
}
DO__atomic_fetch_xor_simt_(4,32)
DO__atomic_fetch_xor_simt_(8,64)
template<class type, class delta, typename std::enable_if<sizeof(type)<=2, int>::type = 0>
type __device__ __atomic_fetch_xor_simt_ (type *ptr, delta val, int memorder) {
type expected = __atomic_load_n_simt_(ptr, __ATOMIC_RELAXED);
type const desired = expected ^ val;
while(!__atomic_compare_exchange_simt_(ptr, &expected, &desired, true, memorder, memorder))
;
return expected;
}
#define DO__atomic_fetch_or_simt_(bytes, bits) \
template<class type, typename std::enable_if<sizeof(type)==bytes, int>::type = 0> \
type __device__ __atomic_fetch_or_simt_ (type *ptr, type val, int memorder) { \
type ret; \
switch (memorder) { \
case __ATOMIC_SEQ_CST: __simt_fence_sc_(); \
case __ATOMIC_CONSUME: \
case __ATOMIC_ACQUIRE: __simt_or_acquire_##bits(ptr, ret, val); break; \
case __ATOMIC_ACQ_REL: __simt_or_acq_rel_##bits(ptr, ret, val); break; \
case __ATOMIC_RELEASE: __simt_or_release_##bits(ptr, ret, val); break; \
case __ATOMIC_RELAXED: __simt_or_relaxed_##bits(ptr, ret, val); break; \
default: assert(0); \
} \
return ret; \
}
DO__atomic_fetch_or_simt_(4,32)
DO__atomic_fetch_or_simt_(8,64)
template<class type, class delta, typename std::enable_if<sizeof(type)<=2, int>::type = 0>
type __device__ __atomic_fetch_or_simt_ (type *ptr, delta val, int memorder) {
type expected = __atomic_load_n_simt_(ptr, __ATOMIC_RELAXED);
type const desired = expected | val;
while(!__atomic_compare_exchange_simt_(ptr, &expected, &desired, true, memorder, memorder))
;
return expected;
}
template<class type>
inline bool __device__ __atomic_test_and_set_simt_(type *ptr, int memorder) {
return __atomic_exchange_n_simt_((char*)ptr, (char)1, memorder) == 1;
}
template<class type>
inline void __device__ __atomic_clear_simt_(type *ptr, int memorder) {
return __atomic_store_n_simt_((char*)ptr, (char)0, memorder);
}
inline constexpr __device__ bool __atomic_always_lock_free_simt_ (size_t size, void *) {
return size <= 8;
}
inline __device__ bool __atomic_is_lock_free_simt_(size_t size, void * ptr) {
return __atomic_always_lock_free_simt_(size, ptr);
}
/*
fences
*/
inline void __device__ __atomic_thread_fence_simt(int memorder) {
switch (memorder) {
case __ATOMIC_SEQ_CST: __simt_fence_sc_(); break;
case __ATOMIC_CONSUME:
case __ATOMIC_ACQUIRE:
case __ATOMIC_ACQ_REL:
case __ATOMIC_RELEASE: __simt_fence_(); break;
case __ATOMIC_RELAXED: break;
default: assert(0);
}
}
inline void __device__ __atomic_signal_fence_simt(int memorder) {
__atomic_thread_fence_simt(memorder);
}
/*
non-volatile
*/
template<class type> type __device__ __atomic_load_n_simt(const type *ptr, int memorder) {
return __atomic_load_n_simt_(const_cast<const type*>(ptr), memorder);
}
template<class type> void __device__ __atomic_load_simt(const type *ptr, type *ret, int memorder) {
__atomic_load_simt_(const_cast<const type*>(ptr), ret, memorder);
}
template<class type> void __device__ __atomic_store_n_simt(type *ptr, type val, int memorder) {
__atomic_store_n_simt_(const_cast<type*>(ptr), val, memorder);
}
template<class type> void __device__ __atomic_store_simt(type *ptr, type *val, int memorder) {
__atomic_store_simt_(const_cast<type*>(ptr), val, memorder);
}
template<class type> type __device__ __atomic_exchange_n_simt(type *ptr, type val, int memorder) {
return __atomic_exchange_n_simt_(const_cast<type*>(ptr), val, memorder);
}
template<class type> void __device__ __atomic_exchange_simt(type *ptr, type *val, type *ret, int memorder) {
__atomic_exchange_simt_(const_cast<type*>(ptr), val, ret, memorder);
}
template<class type> bool __device__ __atomic_compare_exchange_n_simt(type *ptr, type *expected, type desired, bool weak, int success_memorder, int failure_memorder) {
return __atomic_compare_exchange_n_simt_(const_cast<type*>(ptr), expected, desired, weak, success_memorder, failure_memorder);
}
template<class type> bool __device__ __atomic_compare_exchange_simt(type *ptr, type *expected, type *desired, bool weak, int success_memorder, int failure_memorder) {
return __atomic_compare_exchange_simt_(const_cast<type*>(ptr), expected, desired, weak, success_memorder, failure_memorder);
}
template<class type, class delta> type __device__ __atomic_fetch_add_simt(type *ptr, delta val, int memorder) {
return __atomic_fetch_add_simt_(const_cast<type*>(ptr), val, memorder);
}
template<class type, class delta> type __device__ __atomic_fetch_sub_simt(type *ptr, delta val, int memorder) {
return __atomic_fetch_sub_simt_(const_cast<type*>(ptr), val, memorder);
}
template<class type> type __device__ __atomic_fetch_and_simt(type *ptr, type val, int memorder) {
return __atomic_fetch_and_simt_(const_cast<type*>(ptr), val, memorder);
}
template<class type> type __device__ __atomic_fetch_xor_simt(type *ptr, type val, int memorder) {
return __atomic_fetch_xor_simt_(const_cast<type*>(ptr), val, memorder);
}
template<class type> type __device__ __atomic_fetch_or_simt(type *ptr, type val, int memorder) {
return __atomic_fetch_or_simt_(const_cast<type*>(ptr), val, memorder);
}
template<class type> bool __device__ __atomic_test_and_set_simt(void *ptr, int memorder) {
return __atomic_test_and_set_simt_(const_cast<void*>(ptr), memorder);
}
template<class type> void __device__ __atomic_clear_simt(void *ptr, int memorder) {
return __atomic_clear_simt_(const_cast<void*>(ptr), memorder);
}
inline bool __device__ __atomic_always_lock_free_simt(size_t size, void *ptr) {
return __atomic_always_lock_free_simt_(size, const_cast<void*>(ptr));
}
inline bool __device__ __atomic_is_lock_free_simt(size_t size, void *ptr) {
return __atomic_is_lock_free_simt_(size, const_cast<void*>(ptr));
}
/*
volatile
*/
template<class type> type __device__ __atomic_load_n_simt(const volatile type *ptr, int memorder) {
return __atomic_load_n_simt_(const_cast<const type*>(ptr), memorder);
}
template<class type> void __device__ __atomic_load_simt(const volatile type *ptr, type *ret, int memorder) {
__atomic_load_simt_(const_cast<const type*>(ptr), ret, memorder);
}
template<class type> void __device__ __atomic_store_n_simt(volatile type *ptr, type val, int memorder) {
__atomic_store_n_simt_(const_cast<type*>(ptr), val, memorder);
}
template<class type> void __device__ __atomic_store_simt(volatile type *ptr, type *val, int memorder) {
__atomic_store_simt_(const_cast<type*>(ptr), val, memorder);
}
template<class type> type __device__ __atomic_exchange_n_simt(volatile type *ptr, type val, int memorder) {
return __atomic_exchange_n_simt_(const_cast<type*>(ptr), val, memorder);
}
template<class type> void __device__ __atomic_exchange_simt(volatile type *ptr, type *val, type *ret, int memorder) {
__atomic_exchange_simt_(const_cast<type*>(ptr), val, ret, memorder);
}
template<class type> bool __device__ __atomic_compare_exchange_n_simt(volatile type *ptr, type *expected, type desired, bool weak, int success_memorder, int failure_memorder) {
return __atomic_compare_exchange_n_simt_(const_cast<type*>(ptr), expected, desired, weak, success_memorder, failure_memorder);
}
template<class type> bool __device__ __atomic_compare_exchange_simt(volatile type *ptr, type *expected, type *desired, bool weak, int success_memorder, int failure_memorder) {
return __atomic_compare_exchange_simt_(const_cast<type*>(ptr), expected, desired, weak, success_memorder, failure_memorder);
}
template<class type, class delta> type __device__ __atomic_fetch_add_simt(volatile type *ptr, delta val, int memorder) {
return __atomic_fetch_add_simt_(const_cast<type*>(ptr), val, memorder);
}
template<class type, class delta> type __device__ __atomic_fetch_sub_simt(volatile type *ptr, delta val, int memorder) {
return __atomic_fetch_sub_simt_(const_cast<type*>(ptr), val, memorder);
}
template<class type> type __device__ __atomic_fetch_and_simt(volatile type *ptr, type val, int memorder) {
return __atomic_fetch_and_simt_(const_cast<type*>(ptr), val, memorder);
}
template<class type> type __device__ __atomic_fetch_xor_simt(volatile type *ptr, type val, int memorder) {
return __atomic_fetch_xor_simt_(const_cast<type*>(ptr), val, memorder);
}
template<class type> type __device__ __atomic_fetch_or_simt(volatile type *ptr, type val, int memorder) {
return __atomic_fetch_or_simt_(const_cast<type*>(ptr), val, memorder);
}
template<class type> bool __device__ __atomic_test_and_set_simt(volatile void *ptr, int memorder) {
return __atomic_test_and_set_simt_(const_cast<void*>(ptr), memorder);
}
template<class type> void __device__ __atomic_clear_simt(volatile void *ptr, int memorder) {
return __atomic_clear_simt_(const_cast<void*>(ptr), memorder);
}
} // end namespace Impl
} // end namespace Kokkos
#endif //_SIMT_DETAILS_CONFIG
#ifndef KOKKOS_SIMT_ATOMIC_BUILTIN_REPLACEMENTS_DEFINED
/*
builtins
*/
#define __atomic_load_n __atomic_load_n_simt
#define __atomic_load __atomic_load_simt
#define __atomic_store_n __atomic_store_n_simt
#define __atomic_store __atomic_store_simt
#define __atomic_exchange_n __atomic_exchange_n_simt
#define __atomic_exchange __atomic_exchange_simt
#define __atomic_compare_exchange_n __atomic_compare_exchange_n_simt
#define __atomic_compare_exchange __atomic_compare_exchange_simt
#define __atomic_fetch_add __atomic_fetch_add_simt
#define __atomic_fetch_sub __atomic_fetch_sub_simt
#define __atomic_fetch_and __atomic_fetch_and_simt
#define __atomic_fetch_xor __atomic_fetch_xor_simt
#define __atomic_fetch_or __atomic_fetch_or_simt
#define __atomic_test_and_set __atomic_test_and_set_simt
#define __atomic_clear __atomic_clear_simt
#define __atomic_always_lock_free __atomic_always_lock_free_simt
#define __atomic_is_lock_free __atomic_is_lock_free_simt
#define __atomic_thread_fence __atomic_thread_fence_simt
#define __atomic_signal_fence __atomic_signal_fence_simt
#define KOKKOS_SIMT_ATOMIC_BUILTIN_REPLACEMENTS_DEFINED
#endif // KOKKOS_SIMT_ATOMIC_BUILTIN_REPLACEMENTS_DEFINED
#endif // defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA_ASM_ATOMICS)

View File

@ -0,0 +1,68 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2019) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifdef KOKKOS_SIMT_ATOMIC_BUILTIN_REPLACEMENTS_DEFINED
#undef __atomic_load_n
#undef __atomic_load
#undef __atomic_store_n
#undef __atomic_store
#undef __atomic_exchange_n
#undef __atomic_exchange
#undef __atomic_compare_exchange_n
#undef __atomic_compare_exchange
#undef __atomic_fetch_add
#undef __atomic_fetch_sub
#undef __atomic_fetch_and
#undef __atomic_fetch_xor
#undef __atomic_fetch_or
#undef __atomic_test_and_set
#undef __atomic_clear
#undef __atomic_always_lock_free
#undef __atomic_is_lock_free
#undef __atomic_thread_fence
#undef __atomic_signal_fence
#undef KOKKOS_SIMT_ATOMIC_BUILTIN_REPLACEMENTS_DEFINED
#endif // KOKKOS_SIMT_ATOMIC_BUILTIN_REPLACEMENTS_DEFINED
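Taken together with the previous file, the two headers bracket device code that wants the C/C++ __atomic_* builtins on CUDA; a usage sketch (the include paths are assumptions, not shown in this diff):
#include <Cuda/Kokkos_Cuda_Atomic_Intrinsics.hpp>  // maps builtins to *_simt
__device__ int load_flag(const int* p) {
  // Resolves to __atomic_load_n_simt, i.e. an ld.acquire PTX instruction.
  return __atomic_load_n(p, __ATOMIC_ACQUIRE);
}
#include <Cuda/Kokkos_Cuda_Atomic_Intrinsics_Restore_Builtins.hpp>  // undefs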

View File

@ -58,7 +58,68 @@ struct CudaGetMaxBlockSize;
template<class DriverType, class LaunchBounds>
int cuda_get_max_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
return CudaGetMaxBlockSize<DriverType,LaunchBounds,(CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType))>::get_block_size(f,vector_length, shmem_extra_block,shmem_extra_thread);
return CudaGetMaxBlockSize<DriverType,LaunchBounds
, true
>::get_block_size(f,vector_length, shmem_extra_block,shmem_extra_thread);
}
template<class FunctorType, class LaunchBounds>
int cuda_get_max_block_size(const CudaInternal* cuda_instance, const cudaFuncAttributes& attr, const FunctorType& f, const size_t vector_length,
const size_t shmem_block, const size_t shmem_thread) {
const int min_blocks_per_sm = LaunchBounds::minBperSM == 0 ?
1 : LaunchBounds::minBperSM ;
const int max_threads_per_block = LaunchBounds::maxTperB == 0 ?
cuda_instance->m_maxThreadsPerBlock : LaunchBounds::maxTperB ;
const int regs_per_thread = attr.numRegs;
const int regs_per_sm = cuda_instance->m_regsPerSM;
const int shmem_per_sm = cuda_instance->m_shmemPerSM;
const int max_shmem_per_block = cuda_instance->m_maxShmemPerBlock;
const int max_blocks_per_sm = cuda_instance->m_maxBlocksPerSM;
const int max_threads_per_sm = cuda_instance->m_maxThreadsPerSM;
int block_size = std::min(attr.maxThreadsPerBlock,max_threads_per_block);
int functor_shmem = FunctorTeamShmemSize< FunctorType >::value( f , block_size/vector_length );
int total_shmem = shmem_block + shmem_thread*(block_size/vector_length) + functor_shmem + attr.sharedSizeBytes;
int max_blocks_regs = regs_per_sm/(regs_per_thread*block_size);
int max_blocks_shmem = (total_shmem<max_shmem_per_block)?(total_shmem>0?shmem_per_sm/total_shmem:max_blocks_regs):0;
int blocks_per_sm = std::min(max_blocks_regs,max_blocks_shmem);
int threads_per_sm = blocks_per_sm * block_size;
if(threads_per_sm > max_threads_per_sm) {
blocks_per_sm = max_threads_per_sm/block_size;
threads_per_sm = blocks_per_sm * block_size;
}
int opt_block_size = (blocks_per_sm>=min_blocks_per_sm) ? block_size : 0;
int opt_threads_per_sm = threads_per_sm;
//printf("BlockSizeMax: %i Shmem: %i %i %i %i Regs: %i %i Blocks: %i %i Achieved: %i %i Opt: %i %i\n",block_size,
// shmem_per_sm,max_shmem_per_block,functor_shmem,total_shmem,
// regs_per_sm,regs_per_thread,max_blocks_shmem,max_blocks_regs,blocks_per_sm,threads_per_sm,opt_block_size,opt_threads_per_sm);
block_size-=32;
while ((blocks_per_sm==0) && (block_size>=32)) {
functor_shmem = FunctorTeamShmemSize< FunctorType >::value( f , block_size/vector_length );
total_shmem = shmem_block + shmem_thread*(block_size/vector_length) + functor_shmem + attr.sharedSizeBytes;
max_blocks_regs = regs_per_sm/(regs_per_thread*block_size);
max_blocks_shmem = (total_shmem<max_shmem_per_block)?(total_shmem>0?shmem_per_sm/total_shmem:max_blocks_regs):0;
blocks_per_sm = std::min(max_blocks_regs,max_blocks_shmem);
threads_per_sm = blocks_per_sm * block_size;
if(threads_per_sm > max_threads_per_sm) {
blocks_per_sm = max_threads_per_sm/block_size;
threads_per_sm = blocks_per_sm * block_size;
}
if((blocks_per_sm >= min_blocks_per_sm) && (blocks_per_sm <= max_blocks_per_sm)) {
if(threads_per_sm>=opt_threads_per_sm) {
opt_block_size = block_size;
opt_threads_per_sm = threads_per_sm;
}
}
//printf("BlockSizeMax: %i Shmem: %i %i %i %i Regs: %i %i Blocks: %i %i Achieved: %i %i Opt: %i %i\n",block_size,
// shmem_per_sm,max_shmem_per_block,functor_shmem,total_shmem,
// regs_per_sm,regs_per_thread,max_blocks_shmem,max_blocks_regs,blocks_per_sm,threads_per_sm,opt_block_size,opt_threads_per_sm);
block_size-=32;
}
return opt_block_size;
}
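
The routine above, like the cuda_get_opt_block_size variant further down, walks candidate block sizes downward one warp (32 threads) at a time and keeps the size that maximizes resident threads per SM. A minimal free-standing sketch of the occupancy arithmetic, under assumed device limits (the SMLimits struct and its example values are illustrative, not the Kokkos API):

#include <algorithm>

struct SMLimits {           // hypothetical per-SM limits for illustration
  int regs_per_sm;          // e.g. 65536 registers
  int shmem_per_sm;         // e.g. 98304 bytes
  int max_threads_per_sm;   // e.g. 2048 threads
};

int blocks_per_sm(const SMLimits& sm, int block_size,
                  int regs_per_thread, int shmem_per_block) {
  // Register-limited and shared-memory-limited block counts, as above.
  int by_regs  = sm.regs_per_sm / (regs_per_thread * block_size);
  int by_shmem = shmem_per_block > 0 ? sm.shmem_per_sm / shmem_per_block
                                     : by_regs;
  int blocks = std::min(by_regs, by_shmem);
  // Cap by the SM-wide thread limit, mirroring the loop above.
  if (blocks * block_size > sm.max_threads_per_sm)
    blocks = sm.max_threads_per_sm / block_size;
  return blocks;
}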
@ -241,11 +302,71 @@ struct CudaGetOptBlockSize;
template<class DriverType, class LaunchBounds>
int cuda_get_opt_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
return CudaGetOptBlockSize<DriverType,LaunchBounds,(CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType))>::get_block_size(f,vector_length,shmem_extra_block,shmem_extra_thread);
return CudaGetOptBlockSize<DriverType,LaunchBounds,
//LaunchBounds::launch_mechanism == Kokkos::Experimental::LaunchDefault ?
// (( CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType) )?
// Kokkos::Experimental::CudaLaunchConstantMemory:Kokkos::Experimental::CudaLaunchLocalMemory):
// LaunchBounds::launch_mechanism
(CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType))
>::get_block_size(f,vector_length,shmem_extra_block,shmem_extra_thread);
}
template<class FunctorType, class LaunchBounds>
int cuda_get_opt_block_size(const CudaInternal* cuda_instance, const cudaFuncAttributes& attr, const FunctorType& f, const size_t vector_length,
const size_t shmem_block, const size_t shmem_thread) {
const int min_blocks_per_sm = LaunchBounds::minBperSM == 0 ?
1 : LaunchBounds::minBperSM ;
const int max_threads_per_block = LaunchBounds::maxTperB == 0 ?
cuda_instance->m_maxThreadsPerBlock : LaunchBounds::maxTperB ;
const int regs_per_thread = attr.numRegs;
const int regs_per_sm = cuda_instance->m_regsPerSM;
const int shmem_per_sm = cuda_instance->m_shmemPerSM;
const int max_shmem_per_block = cuda_instance->m_maxShmemPerBlock;
const int max_blocks_per_sm = cuda_instance->m_maxBlocksPerSM;
const int max_threads_per_sm = cuda_instance->m_maxThreadsPerSM;
int block_size = std::min(attr.maxThreadsPerBlock,max_threads_per_block);
int functor_shmem = FunctorTeamShmemSize< FunctorType >::value( f , block_size/vector_length );
int total_shmem = shmem_block + shmem_thread*(block_size/vector_length) + functor_shmem + attr.sharedSizeBytes;
int max_blocks_regs = regs_per_sm/(regs_per_thread*block_size);
int max_blocks_shmem = (total_shmem<max_shmem_per_block)?(total_shmem>0?shmem_per_sm/total_shmem:max_blocks_regs):0;
int blocks_per_sm = std::min(max_blocks_regs,max_blocks_shmem);
int threads_per_sm = blocks_per_sm * block_size;
if(threads_per_sm > max_threads_per_sm) {
blocks_per_sm = max_threads_per_sm/block_size;
threads_per_sm = blocks_per_sm * block_size;
}
int opt_block_size = (blocks_per_sm>=min_blocks_per_sm) ? block_size : 0;
int opt_threads_per_sm = threads_per_sm;
block_size-=32;
while ((block_size>=32)) {
functor_shmem = FunctorTeamShmemSize< FunctorType >::value( f , block_size/vector_length );
total_shmem = shmem_block + shmem_thread*(block_size/vector_length) + functor_shmem + attr.sharedSizeBytes;
max_blocks_regs = regs_per_sm/(regs_per_thread*block_size);
max_blocks_shmem = (total_shmem<max_shmem_per_block)?(total_shmem>0?shmem_per_sm/total_shmem:max_blocks_regs):0;
blocks_per_sm = std::min(max_blocks_regs,max_blocks_shmem);
threads_per_sm = blocks_per_sm * block_size;
if(threads_per_sm > max_threads_per_sm) {
blocks_per_sm = max_threads_per_sm/block_size;
threads_per_sm = blocks_per_sm * block_size;
}
if((blocks_per_sm >= min_blocks_per_sm) && (blocks_per_sm <= max_blocks_per_sm)) {
if(threads_per_sm>=opt_threads_per_sm) {
opt_block_size = block_size;
opt_threads_per_sm = threads_per_sm;
}
}
block_size-=32;
}
return opt_block_size;
}
template<class DriverType>
struct CudaGetOptBlockSize<DriverType,Kokkos::LaunchBounds<>,true> {
struct CudaGetOptBlockSize<DriverType,Kokkos::LaunchBounds<0,0>,true> {
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
int blockSize=16;
@ -275,7 +396,7 @@ struct CudaGetOptBlockSize<DriverType,Kokkos::LaunchBounds<>,true> {
};
template<class DriverType>
struct CudaGetOptBlockSize<DriverType,Kokkos::LaunchBounds<>,false> {
struct CudaGetOptBlockSize<DriverType,Kokkos::LaunchBounds<0,0>,false> {
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
int blockSize=16;
@ -305,7 +426,7 @@ struct CudaGetOptBlockSize<DriverType,Kokkos::LaunchBounds<>,false> {
};
template<class DriverType, unsigned int MaxThreadsPerBlock, unsigned int MinBlocksPerSM>
struct CudaGetOptBlockSize<DriverType,Kokkos::LaunchBounds< MaxThreadsPerBlock, MinBlocksPerSM >,true> {
struct CudaGetOptBlockSize<DriverType,Kokkos::LaunchBounds< MaxThreadsPerBlock, MinBlocksPerSM>,true> {
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
int blockSize=16;

View File

@ -50,7 +50,8 @@
#include <Kokkos_Core.hpp>
#include <Cuda/Kokkos_Cuda_Error.hpp>
#include <Cuda/Kokkos_Cuda_Internal.hpp>
#include <Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp>
#include <Cuda/Kokkos_Cuda_Instance.hpp>
#include <Cuda/Kokkos_Cuda_Locks.hpp>
#include <impl/Kokkos_Error.hpp>
#include <impl/Kokkos_Profiling_Interface.hpp>
@ -217,78 +218,6 @@ const CudaInternalDevices & CudaInternalDevices::singleton()
}
//----------------------------------------------------------------------------
class CudaInternal {
private:
CudaInternal( const CudaInternal & );
CudaInternal & operator = ( const CudaInternal & );
public:
typedef Cuda::size_type size_type ;
int m_cudaDev ;
int m_cudaArch ;
unsigned m_multiProcCount ;
unsigned m_maxWarpCount ;
unsigned m_maxBlock ;
unsigned m_maxSharedWords ;
uint32_t m_maxConcurrency ;
size_type m_scratchSpaceCount ;
size_type m_scratchFlagsCount ;
size_type m_scratchUnifiedCount ;
size_type m_scratchUnifiedSupported ;
size_type m_streamCount ;
size_type * m_scratchSpace ;
size_type * m_scratchFlags ;
size_type * m_scratchUnified ;
uint32_t * m_scratchConcurrentBitset ;
cudaStream_t * m_stream ;
static int was_initialized;
static int was_finalized;
static CudaInternal & singleton();
int verify_is_initialized( const char * const label ) const ;
int is_initialized() const
{ return 0 != m_scratchSpace && 0 != m_scratchFlags ; }
void initialize( int cuda_device_id , int stream_count );
void finalize();
void print_configuration( std::ostream & ) const ;
~CudaInternal();
CudaInternal()
: m_cudaDev( -1 )
, m_cudaArch( -1 )
, m_multiProcCount( 0 )
, m_maxWarpCount( 0 )
, m_maxBlock( 0 )
, m_maxSharedWords( 0 )
, m_maxConcurrency( 0 )
, m_scratchSpaceCount( 0 )
, m_scratchFlagsCount( 0 )
, m_scratchUnifiedCount( 0 )
, m_scratchUnifiedSupported( 0 )
, m_streamCount( 0 )
, m_scratchSpace( 0 )
, m_scratchFlags( 0 )
, m_scratchUnified( 0 )
, m_scratchConcurrentBitset( 0 )
, m_stream( 0 )
{}
size_type * scratch_space( const size_type size );
size_type * scratch_flags( const size_type size );
size_type * scratch_unified( const size_type size );
};
int CudaInternal::was_initialized = 0;
int CudaInternal::was_finalized = 0;
@ -366,8 +295,11 @@ CudaInternal & CudaInternal::singleton()
static CudaInternal self ;
return self ;
}
void CudaInternal::fence() const {
cudaStreamSynchronize(m_stream);
}
void CudaInternal::initialize( int cuda_device_id , int stream_count )
void CudaInternal::initialize( int cuda_device_id , cudaStream_t stream )
{
if ( was_finalized ) Kokkos::abort("Calling Cuda::initialize after Cuda::finalize is illegal\n");
was_initialized = 1;
@ -454,6 +386,15 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
m_maxBlock = cudaProp.maxGridSize[0] ;
m_shmemPerSM = cudaProp.sharedMemPerMultiprocessor ;
m_maxShmemPerBlock = cudaProp.sharedMemPerBlock ;
m_regsPerSM = cudaProp.regsPerMultiprocessor ;
m_maxBlocksPerSM = m_cudaArch < 500 ? 16 : (
m_cudaArch < 750 ? 32 : (
m_cudaArch == 750 ? 16 : 32));
m_maxThreadsPerSM = cudaProp.maxThreadsPerMultiProcessor ;
m_maxThreadsPerBlock = cudaProp.maxThreadsPerBlock ;
//----------------------------------
m_scratchUnifiedSupported = cudaProp.unifiedAddressing ;
@ -482,10 +423,9 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
// Concurrent bitset for obtaining unique tokens from within
// an executing kernel.
{
const unsigned max_threads_per_sm = 2048 ; // up to capability 7.0
m_maxConcurrency =
max_threads_per_sm * cudaProp.multiProcessorCount ;
m_maxThreadsPerSM * cudaProp.multiProcessorCount ;
const int32_t buffer_bound =
Kokkos::Impl::concurrent_bitset::buffer_bound( m_maxConcurrency );
@ -507,11 +447,6 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
}
//----------------------------------
if ( stream_count ) {
m_stream = (cudaStream_t*) ::malloc( stream_count * sizeof(cudaStream_t) );
m_streamCount = stream_count ;
for ( size_type i = 0 ; i < m_streamCount ; ++i ) m_stream[i] = 0 ;
}
}
else {
@ -539,7 +474,7 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
if( Kokkos::show_warnings() && !cuda_launch_blocking() ) {
std::cerr << "Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default" << std::endl;
std::cerr << " without setting CUDA_LAUNCH_BLOCKING=1." << std::endl;
std::cerr << " The code must call Cuda::fence() after each kernel" << std::endl;
std::cerr << " The code must call Cuda().fence() after each kernel" << std::endl;
std::cerr << " or will likely crash when accessing data on the host." << std::endl;
}
@ -568,7 +503,10 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
#endif
// Init the array for used for arbitrarily sized atomics
Impl::initialize_host_cuda_lock_arrays();
if(stream == 0)
Impl::initialize_host_cuda_lock_arrays();
m_stream = stream;
}
//----------------------------------------------------------------------------
@ -578,7 +516,7 @@ enum { sizeScratchGrain = sizeof(ScratchGrain) };
Cuda::size_type *
CudaInternal::scratch_flags( const Cuda::size_type size )
CudaInternal::scratch_flags( const Cuda::size_type size ) const
{
if ( verify_is_initialized("scratch_flags") && m_scratchFlagsCount * sizeScratchGrain < size ) {
@ -587,6 +525,9 @@ CudaInternal::scratch_flags( const Cuda::size_type size )
typedef Kokkos::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void > Record ;
if( m_scratchFlags )
Record::decrement( Record::get_record( m_scratchFlags ) );
Record * const r = Record::allocate( Kokkos::CudaSpace()
, "InternalScratchFlags"
, ( sizeof( ScratchGrain ) * m_scratchFlagsCount ) );
@ -602,7 +543,7 @@ CudaInternal::scratch_flags( const Cuda::size_type size )
}
Cuda::size_type *
CudaInternal::scratch_space( const Cuda::size_type size )
CudaInternal::scratch_space( const Cuda::size_type size ) const
{
if ( verify_is_initialized("scratch_space") && m_scratchSpaceCount * sizeScratchGrain < size ) {
@ -610,6 +551,9 @@ CudaInternal::scratch_space( const Cuda::size_type size )
typedef Kokkos::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void > Record ;
if( m_scratchSpace )
Record::decrement( Record::get_record( m_scratchSpace ) );
Record * const r = Record::allocate( Kokkos::CudaSpace()
, "InternalScratchSpace"
, ( sizeof( ScratchGrain ) * m_scratchSpaceCount ) );
@ -623,7 +567,7 @@ CudaInternal::scratch_space( const Cuda::size_type size )
}
Cuda::size_type *
CudaInternal::scratch_unified( const Cuda::size_type size )
CudaInternal::scratch_unified( const Cuda::size_type size ) const
{
if ( verify_is_initialized("scratch_unified") &&
m_scratchUnifiedSupported && m_scratchUnifiedCount * sizeScratchGrain < size ) {
@ -632,6 +576,9 @@ CudaInternal::scratch_unified( const Cuda::size_type size )
typedef Kokkos::Impl::SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void > Record ;
if( m_scratchUnified )
Record::decrement( Record::get_record( m_scratchUnified ) );
Record * const r = Record::allocate( Kokkos::CudaHostPinnedSpace()
, "InternalScratchUnified"
, ( sizeof( ScratchGrain ) * m_scratchUnifiedCount ) );
@ -644,6 +591,31 @@ CudaInternal::scratch_unified( const Cuda::size_type size )
return m_scratchUnified ;
}
Cuda::size_type *
CudaInternal::scratch_functor( const Cuda::size_type size ) const
{
if ( verify_is_initialized("scratch_functor") &&
m_scratchFunctorSize < size ) {
m_scratchFunctorSize = size ;
typedef Kokkos::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void > Record ;
if( m_scratchFunctor )
Record::decrement( Record::get_record( m_scratchFunctor ) );
Record * const r = Record::allocate( Kokkos::CudaSpace()
, "InternalScratchFunctor"
, m_scratchFunctorSize );
Record::increment( r );
m_scratchFunctor = reinterpret_cast<size_type *>( r->data() );
}
return m_scratchFunctor ;
}
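
scratch_functor above follows the same grow-only discipline as the other scratch allocators: reallocate only when the request exceeds the cached high-water mark, so repeated launches of same-sized functors skip allocation entirely. A sketch of the pattern with plain CUDA calls (assumption: cudaMalloc/cudaFree stand in for the SharedAllocationRecord bookkeeping used above):

#include <cuda_runtime.h>
#include <cstddef>

void* grow_only(void*& buf, std::size_t& capacity, std::size_t requested) {
  if (capacity < requested) {
    if (buf) cudaFree(buf);        // drop the undersized allocation
    cudaMalloc(&buf, requested);   // allocate the larger request
    capacity = requested;          // remember the new high-water mark
  }
  return buf;                      // otherwise reuse without reallocating
}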
//----------------------------------------------------------------------------
void CudaInternal::finalize()
@ -653,13 +625,7 @@ void CudaInternal::finalize()
Impl::finalize_host_cuda_lock_arrays();
if ( m_stream ) {
for ( size_type i = 1 ; i < m_streamCount ; ++i ) {
cudaStreamDestroy( m_stream[i] );
m_stream[i] = 0 ;
}
::free( m_stream );
}
if(m_stream!=0) cudaStreamDestroy(m_stream);
typedef Kokkos::Impl::SharedAllocationRecord< CudaSpace > RecordCuda ;
typedef Kokkos::Impl::SharedAllocationRecord< CudaHostPinnedSpace > RecordHost ;
@ -668,6 +634,8 @@ void CudaInternal::finalize()
RecordCuda::decrement( RecordCuda::get_record( m_scratchSpace ) );
RecordHost::decrement( RecordHost::get_record( m_scratchUnified ) );
RecordCuda::decrement( RecordCuda::get_record( m_scratchConcurrentBitset ) );
if(m_scratchFunctorSize>0)
RecordCuda::decrement( RecordCuda::get_record( m_scratchFunctor ) );
m_cudaDev = -1 ;
m_multiProcCount = 0 ;
@ -713,14 +681,14 @@ Cuda::size_type cuda_internal_maximum_grid_count()
Cuda::size_type cuda_internal_maximum_shared_words()
{ return CudaInternal::singleton().m_maxSharedWords ; }
Cuda::size_type * cuda_internal_scratch_space( const Cuda::size_type size )
{ return CudaInternal::singleton().scratch_space( size ); }
Cuda::size_type * cuda_internal_scratch_space( const Cuda& instance, const Cuda::size_type size )
{ return instance.impl_internal_space_instance()->scratch_space( size ); }
Cuda::size_type * cuda_internal_scratch_flags( const Cuda::size_type size )
{ return CudaInternal::singleton().scratch_flags( size ); }
Cuda::size_type * cuda_internal_scratch_flags( const Cuda& instance, const Cuda::size_type size )
{ return instance.impl_internal_space_instance()->scratch_flags( size ); }
Cuda::size_type * cuda_internal_scratch_unified( const Cuda::size_type size )
{ return CudaInternal::singleton().scratch_unified( size ); }
Cuda::size_type * cuda_internal_scratch_unified( const Cuda& instance, const Cuda::size_type size )
{ return instance.impl_internal_space_instance()->scratch_unified( size ); }
} // namespace Impl
@ -749,7 +717,7 @@ void Cuda::initialize( const Cuda::SelectDevice config , size_t num_instances )
void Cuda::impl_initialize( const Cuda::SelectDevice config , size_t num_instances )
#endif
{
Impl::CudaInternal::singleton().initialize( config.cuda_device_id , num_instances );
Impl::CudaInternal::singleton().initialize( config.cuda_device_id , 0 );
#if defined(KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::initialize();
@ -800,19 +768,17 @@ void Cuda::impl_finalize()
}
Cuda::Cuda()
: m_device( Impl::CudaInternal::singleton().m_cudaDev )
, m_stream( 0 )
: m_space_instance( &Impl::CudaInternal::singleton() )
{
Impl::CudaInternal::singleton().verify_is_initialized( "Cuda instance constructor" );
}
Cuda::Cuda( const int instance_id )
: m_device( Impl::CudaInternal::singleton().m_cudaDev )
, m_stream(
Impl::CudaInternal::singleton().verify_is_initialized( "Cuda instance constructor" )
? Impl::CudaInternal::singleton().m_stream[ instance_id % Impl::CudaInternal::singleton().m_streamCount ]
: 0 )
{}
Cuda::Cuda(cudaStream_t stream)
: m_space_instance(new Impl::CudaInternal)
{
Impl::CudaInternal::singleton().verify_is_initialized( "Cuda instance constructor" );
m_space_instance->initialize(Impl::CudaInternal::singleton().m_cudaDev,stream);
}
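
With this constructor each Kokkos::Cuda instance wraps a caller-provided cudaStream_t, so work dispatched through distinct instances can overlap on the device. A usage sketch (assumption: n and functor are defined by the caller, and the stream outlives the instance):

cudaStream_t s;
cudaStreamCreate(&s);
{
  Kokkos::Cuda instance(s);        // execution space bound to stream s
  Kokkos::parallel_for(
      Kokkos::RangePolicy<Kokkos::Cuda>(instance, 0, n), functor);
  instance.fence();                // synchronizes stream s, not the device
}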
void Cuda::print_configuration( std::ostream & s , const bool )
{ Impl::CudaInternal::singleton().print_configuration( s ); }
@ -823,13 +789,27 @@ bool Cuda::sleep() { return false ; }
bool Cuda::wake() { return true ; }
#endif
void Cuda::fence()
void Cuda::impl_static_fence()
{
Kokkos::Impl::cuda_device_synchronize();
}
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
void Cuda::fence() {
impl_static_fence();
}
#else
void Cuda::fence() const {
m_space_instance->fence();
}
#endif
const char* Cuda::name() { return "Cuda"; }
cudaStream_t Cuda::cuda_stream() const { return m_space_instance->m_stream ; }
int Cuda::cuda_device() const { return m_space_instance->m_cudaDev ; }
} // namespace Kokkos
namespace Kokkos {

View File

@ -0,0 +1,156 @@
#ifndef KOKKOS_CUDA_INSTANCE_HPP_
#define KOKKOS_CUDA_INSTANCE_HPP_
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
struct CudaTraits {
enum { WarpSize = 32 /* 0x0020 */ };
enum { WarpIndexMask = 0x001f /* Mask for warpindex */ };
enum { WarpIndexShift = 5 /* WarpSize == 1 << WarpShift */ };
enum { ConstantMemoryUsage = 0x008000 /* 32k bytes */ };
enum { ConstantMemoryCache = 0x002000 /* 8k bytes */ };
enum { KernelArgumentLimit = 0x001000 /* 4k bytes */ };
typedef unsigned long
ConstantGlobalBufferType[ ConstantMemoryUsage / sizeof(unsigned long) ];
#if defined(KOKKOS_ARCH_VOLTA) || \
defined(KOKKOS_ARCH_PASCAL)
enum { ConstantMemoryUseThreshold = 0x000200 /* 512 bytes */ };
#else
enum { ConstantMemoryUseThreshold = 0x000200 /* 512 bytes */ };
#endif
KOKKOS_INLINE_FUNCTION static
CudaSpace::size_type warp_count( CudaSpace::size_type i )
{ return ( i + WarpIndexMask ) >> WarpIndexShift ; }
KOKKOS_INLINE_FUNCTION static
CudaSpace::size_type warp_align( CudaSpace::size_type i )
{
enum { Mask = ~CudaSpace::size_type( WarpIndexMask ) };
return ( i + WarpIndexMask ) & Mask ;
}
};
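
warp_count rounds a thread count up to whole warps and warp_align pads it to a multiple of the warp size. A quick worked check with illustrative values:

static_assert( ((33 + 31) >> 5) == 2,    "33 threads occupy 2 warps" );
static_assert( ((33 + 31) & ~31u) == 64, "33 threads align up to 64" );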
//----------------------------------------------------------------------------
CudaSpace::size_type cuda_internal_multiprocessor_count();
CudaSpace::size_type cuda_internal_maximum_warp_count();
CudaSpace::size_type cuda_internal_maximum_grid_count();
CudaSpace::size_type cuda_internal_maximum_shared_words();
CudaSpace::size_type cuda_internal_maximum_concurrent_block_count();
CudaSpace::size_type * cuda_internal_scratch_flags( const Cuda&, const CudaSpace::size_type size );
CudaSpace::size_type * cuda_internal_scratch_space( const Cuda&, const CudaSpace::size_type size );
CudaSpace::size_type * cuda_internal_scratch_unified( const Cuda&, const CudaSpace::size_type size );
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
class CudaInternal {
private:
CudaInternal( const CudaInternal & );
CudaInternal & operator = ( const CudaInternal & );
public:
typedef Cuda::size_type size_type ;
int m_cudaDev ;
// Device Properties
int m_cudaArch ;
unsigned m_multiProcCount ;
unsigned m_maxWarpCount ;
unsigned m_maxBlock ;
unsigned m_maxSharedWords ;
uint32_t m_maxConcurrency ;
int m_shmemPerSM ;
int m_maxShmemPerBlock ;
int m_regsPerSM ;
int m_maxBlocksPerSM ;
int m_maxThreadsPerSM ;
int m_maxThreadsPerBlock ;
mutable size_type m_scratchSpaceCount ;
mutable size_type m_scratchFlagsCount ;
mutable size_type m_scratchUnifiedCount ;
mutable size_type m_scratchFunctorSize ;
size_type m_scratchUnifiedSupported ;
size_type m_streamCount ;
mutable size_type * m_scratchSpace ;
mutable size_type * m_scratchFlags ;
mutable size_type * m_scratchUnified ;
mutable size_type * m_scratchFunctor ;
uint32_t * m_scratchConcurrentBitset ;
cudaStream_t m_stream ;
static int was_initialized;
static int was_finalized;
static CudaInternal & singleton();
int verify_is_initialized( const char * const label ) const ;
int is_initialized() const
{ return 0 != m_scratchSpace && 0 != m_scratchFlags ; }
void initialize( int cuda_device_id , cudaStream_t stream = 0 );
void finalize();
void print_configuration( std::ostream & ) const ;
void fence() const ;
~CudaInternal();
CudaInternal()
: m_cudaDev( -1 )
, m_cudaArch( -1 )
, m_multiProcCount( 0 )
, m_maxWarpCount( 0 )
, m_maxBlock( 0 )
, m_maxSharedWords( 0 )
, m_maxConcurrency( 0 )
, m_shmemPerSM( 0 )
, m_maxShmemPerBlock( 0 )
, m_regsPerSM( 0 )
, m_maxBlocksPerSM( 0 )
, m_maxThreadsPerSM( 0 )
, m_maxThreadsPerBlock( 0 )
, m_scratchSpaceCount( 0 )
, m_scratchFlagsCount( 0 )
, m_scratchUnifiedCount( 0 )
, m_scratchFunctorSize( 0 )
, m_scratchUnifiedSupported( 0 )
, m_streamCount( 0 )
, m_scratchSpace( 0 )
, m_scratchFlags( 0 )
, m_scratchUnified( 0 )
, m_scratchFunctor( 0 )
, m_scratchConcurrentBitset( 0 )
, m_stream( 0 )
{}
size_type * scratch_space( const size_type size ) const ;
size_type * scratch_flags( const size_type size ) const ;
size_type * scratch_unified( const size_type size ) const ;
size_type * scratch_functor( const size_type size ) const ;
};
} // Namespace Impl
} // Namespace Kokkos
#endif

View File

@ -0,0 +1,579 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CUDAEXEC_HPP
#define KOKKOS_CUDAEXEC_HPP
#include <Kokkos_Macros.hpp>
#ifdef KOKKOS_ENABLE_CUDA
#include <string>
#include <cstdint>
#include <Kokkos_Parallel.hpp>
#include <impl/Kokkos_Error.hpp>
#include <Cuda/Kokkos_Cuda_abort.hpp>
#include <Cuda/Kokkos_Cuda_Error.hpp>
#include <Cuda/Kokkos_Cuda_Locks.hpp>
#include <Cuda/Kokkos_Cuda_Instance.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#if defined( __CUDACC__ )
/** \brief Access to constant memory on the device */
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
__device__ __constant__
extern unsigned long kokkos_impl_cuda_constant_memory_buffer[] ;
#else
__device__ __constant__
unsigned long kokkos_impl_cuda_constant_memory_buffer[ Kokkos::Impl::CudaTraits::ConstantMemoryUsage / sizeof(unsigned long) ] ;
#endif
namespace Kokkos {
namespace Impl {
void* cuda_resize_scratch_space(std::int64_t bytes, bool force_shrink = false);
}
}
template< typename T >
inline
__device__
T * kokkos_impl_cuda_shared_memory()
{ extern __shared__ Kokkos::CudaSpace::size_type sh[]; return (T*) sh ; }
namespace Kokkos {
namespace Impl {
//----------------------------------------------------------------------------
// See section B.17 of Cuda C Programming Guide Version 3.2
// for discussion of
// __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
// function qualifier which could be used to improve performance.
//----------------------------------------------------------------------------
// Maximize L1 cache and minimize shared memory:
// cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1 );
// For 2.0 capability: 48 KB L1 and 16 KB shared
//----------------------------------------------------------------------------
template< class DriverType>
__global__
static void cuda_parallel_launch_constant_memory()
{
const DriverType & driver =
*((const DriverType *) kokkos_impl_cuda_constant_memory_buffer );
driver();
}
template< class DriverType, unsigned int maxTperB, unsigned int minBperSM >
__global__
__launch_bounds__(maxTperB, minBperSM)
static void cuda_parallel_launch_constant_memory()
{
const DriverType & driver =
*((const DriverType *) kokkos_impl_cuda_constant_memory_buffer );
driver();
}
template< class DriverType>
__global__
static void cuda_parallel_launch_local_memory( const DriverType driver )
{
driver();
}
template< class DriverType, unsigned int maxTperB, unsigned int minBperSM >
__global__
__launch_bounds__(maxTperB, minBperSM)
static void cuda_parallel_launch_local_memory( const DriverType driver )
{
driver();
}
template< class DriverType>
__global__
static void cuda_parallel_launch_global_memory( const DriverType* driver )
{
driver->operator()();
}
template< class DriverType, unsigned int maxTperB, unsigned int minBperSM >
__global__
__launch_bounds__(maxTperB, minBperSM)
static void cuda_parallel_launch_global_memory( const DriverType* driver )
{
driver->operator()();
}
template< class DriverType>
__global__
static void cuda_parallel_launch_constant_or_global_memory( const DriverType* driver_ptr )
{
const DriverType & driver = driver_ptr!=NULL ? *driver_ptr :
*((const DriverType *) kokkos_impl_cuda_constant_memory_buffer );
driver();
}
template< class DriverType, unsigned int maxTperB, unsigned int minBperSM >
__global__
__launch_bounds__(maxTperB, minBperSM)
static void cuda_parallel_launch_constant_or_global_memory( const DriverType* driver_ptr )
{
const DriverType & driver = driver_ptr!=NULL ? *driver_ptr :
*((const DriverType *) kokkos_impl_cuda_constant_memory_buffer );
driver();
}
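
The kernels above cover the three launch paths: local memory passes the closure by value as a kernel argument, constant memory reads it from the __constant__ buffer, and global memory dereferences a device pointer. A sketch of the constant-memory handoff, simplified from the CudaParallelLaunch specializations below (assumption: error checking and cache-config calls omitted):

template <class Driver>
void launch_via_constant_memory(const Driver& d, dim3 grid, dim3 block,
                                int shmem, cudaStream_t stream) {
  // Stage the closure into the __constant__ buffer declared above...
  cudaMemcpyToSymbolAsync(kokkos_impl_cuda_constant_memory_buffer, &d,
                          sizeof(Driver), 0, cudaMemcpyHostToDevice, stream);
  // ...then launch the kernel that reinterprets that buffer as the closure.
  cuda_parallel_launch_constant_memory<Driver>
      <<<grid, block, shmem, stream>>>();
}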
template< class DriverType >
struct DeduceCudaLaunchMechanism {
constexpr static const Kokkos::Experimental::WorkItemProperty::HintLightWeight_t light_weight = Kokkos::Experimental::WorkItemProperty::HintLightWeight;
constexpr static const Kokkos::Experimental::WorkItemProperty::HintHeavyWeight_t heavy_weight = Kokkos::Experimental::WorkItemProperty::HintHeavyWeight ;
constexpr static const typename DriverType::Policy::work_item_property property = typename DriverType::Policy::work_item_property();
static constexpr const Experimental::CudaLaunchMechanism valid_launch_mechanism =
// BuildValidMask
(sizeof(DriverType)<CudaTraits::KernelArgumentLimit?
Experimental::CudaLaunchMechanism::LocalMemory:Experimental::CudaLaunchMechanism::Default)|
(sizeof(DriverType)<CudaTraits::ConstantMemoryUsage?
Experimental::CudaLaunchMechanism::ConstantMemory:Experimental::CudaLaunchMechanism::Default)|
Experimental::CudaLaunchMechanism::GlobalMemory;
static constexpr const Experimental::CudaLaunchMechanism requested_launch_mechanism =
(((property&light_weight)==light_weight)?
Experimental::CudaLaunchMechanism::LocalMemory :
Experimental::CudaLaunchMechanism::ConstantMemory)
| Experimental::CudaLaunchMechanism::GlobalMemory;
static constexpr const Experimental::CudaLaunchMechanism default_launch_mechanism =
// BuildValidMask
(sizeof(DriverType)<CudaTraits::ConstantMemoryUseThreshold)?
Experimental::CudaLaunchMechanism::LocalMemory:(
(sizeof(DriverType)<CudaTraits::ConstantMemoryUsage)?
Experimental::CudaLaunchMechanism::ConstantMemory:
Experimental::CudaLaunchMechanism::GlobalMemory);
//               None             LightWeight      HeavyWeight
// F<UseT        LCG  LCG  L  L   LCG  LG  L  L    LCG  CG  L  C
// UseT<F<KAL    LCG  LCG  C  C   LCG  LG  C  L    LCG  CG  C  C
// KAL<F<CMU     CG   LCG  C  C   CG   LG  C  G    CG   CG  C  C
// CMU<F         G    LCG  G  G   G    LG  G  G    G    CG  G  G
static constexpr const Experimental::CudaLaunchMechanism launch_mechanism =
((property&light_weight)==light_weight)?
(sizeof(DriverType)<CudaTraits::KernelArgumentLimit?
Experimental::CudaLaunchMechanism::LocalMemory:
Experimental::CudaLaunchMechanism::GlobalMemory):(
((property&heavy_weight)==heavy_weight)?
(sizeof(DriverType)<CudaTraits::ConstantMemoryUsage?
Experimental::CudaLaunchMechanism::ConstantMemory:
Experimental::CudaLaunchMechanism::GlobalMemory):
(default_launch_mechanism)
);
};
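
In the table, rows bin the functor size against ConstantMemoryUseThreshold (UseT), KernelArgumentLimit (KAL), and ConstantMemoryUsage (CMU); L, C, and G denote the local-, constant-, and global-memory mechanisms. The default choice reduces to a three-way size test, restated here as an illustrative helper (default_mechanism_for is not a Kokkos function):

constexpr Experimental::CudaLaunchMechanism
default_mechanism_for(std::size_t functor_size) {
  return functor_size < CudaTraits::ConstantMemoryUseThreshold
       ? Experimental::CudaLaunchMechanism::LocalMemory      // < 512 bytes
       : functor_size < CudaTraits::ConstantMemoryUsage
       ? Experimental::CudaLaunchMechanism::ConstantMemory   // < 32 KB
       : Experimental::CudaLaunchMechanism::GlobalMemory;    // otherwise
}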
// Use local memory up to ConstantMemoryUseThreshold
// Use global memory above ConstantMemoryUsage
// In between use ConstantMemory
template < class DriverType
, class LaunchBounds = Kokkos::LaunchBounds<>
, Experimental::CudaLaunchMechanism LaunchMechanism =
DeduceCudaLaunchMechanism<DriverType>::launch_mechanism >
struct CudaParallelLaunch ;
template < class DriverType
, unsigned int MaxThreadsPerBlock
, unsigned int MinBlocksPerSM>
struct CudaParallelLaunch< DriverType
, Kokkos::LaunchBounds< MaxThreadsPerBlock
, MinBlocksPerSM >
, Experimental::CudaLaunchMechanism::ConstantMemory>
{
static_assert(sizeof(DriverType)<CudaTraits::ConstantMemoryUsage,"Kokkos Error: Requested CudaLaunchConstantMemory with a Functor larger than 32kB.");
inline
CudaParallelLaunch( const DriverType & driver
, const dim3 & grid
, const dim3 & block
, const int shmem
, const CudaInternal* cuda_instance
, const bool prefer_shmem )
{
if ( (grid.x != 0) && ( ( block.x * block.y * block.z ) != 0 ) ) {
// Fence before changing settings and copying closure
Kokkos::Cuda().fence();
if ( cuda_instance->m_maxShmemPerBlock < shmem ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
}
#ifndef KOKKOS_ARCH_KEPLER
// On Kepler the L1 has no benefit since it doesn't cache reads
else {
CUDA_SAFE_CALL(
cudaFuncSetCacheConfig
( cuda_parallel_launch_constant_memory
< DriverType, MaxThreadsPerBlock, MinBlocksPerSM >
, ( prefer_shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1 )
) );
}
#endif
// Copy functor to constant memory on the device
cudaMemcpyToSymbolAsync(
kokkos_impl_cuda_constant_memory_buffer, &driver, sizeof(DriverType), 0, cudaMemcpyHostToDevice, cudaStream_t(cuda_instance->m_stream));
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
// Invoke the driver function on the device
cuda_parallel_launch_constant_memory
< DriverType, MaxThreadsPerBlock, MinBlocksPerSM >
<<< grid , block , shmem , cuda_instance->m_stream >>>();
#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
CUDA_SAFE_CALL( cudaGetLastError() );
Kokkos::Cuda().fence();
#endif
}
}
static cudaFuncAttributes get_cuda_func_attributes() {
cudaFuncAttributes attr;
cudaFuncGetAttributes(&attr,cuda_parallel_launch_constant_memory
< DriverType, MaxThreadsPerBlock, MinBlocksPerSM >);
return attr;
}
};
template < class DriverType>
struct CudaParallelLaunch< DriverType
, Kokkos::LaunchBounds<0,0>
, Experimental::CudaLaunchMechanism::ConstantMemory >
{
static_assert(sizeof(DriverType)<CudaTraits::ConstantMemoryUsage,"Kokkos Error: Requested CudaLaunchConstantMemory with a Functor larger than 32kB.");
inline
CudaParallelLaunch( const DriverType & driver
, const dim3 & grid
, const dim3 & block
, const int shmem
, const CudaInternal* cuda_instance
, const bool prefer_shmem )
{
if ( (grid.x != 0) && ( ( block.x * block.y * block.z ) != 0 ) ) {
// Fence before changing settings and copying closure
Kokkos::Cuda().fence();
if ( cuda_instance->m_maxShmemPerBlock < shmem ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
}
#ifndef KOKKOS_ARCH_KEPLER
// On Kepler the L1 has no benefit since it doesn't cache reads
else {
CUDA_SAFE_CALL(
cudaFuncSetCacheConfig
( cuda_parallel_launch_constant_memory< DriverType >
, ( prefer_shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1 )
) );
}
#endif
// Copy functor to constant memory on the device
cudaMemcpyToSymbolAsync(
kokkos_impl_cuda_constant_memory_buffer, &driver, sizeof(DriverType), 0, cudaMemcpyHostToDevice, cudaStream_t(cuda_instance->m_stream));
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
// Invoke the driver function on the device
cuda_parallel_launch_constant_memory< DriverType >
<<< grid , block , shmem , cuda_instance->m_stream >>>();
#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
CUDA_SAFE_CALL( cudaGetLastError() );
Kokkos::Cuda().fence();
#endif
}
}
static cudaFuncAttributes get_cuda_func_attributes() {
cudaFuncAttributes attr;
cudaFuncGetAttributes(&attr,cuda_parallel_launch_constant_memory
< DriverType >);
return attr;
}
};
template < class DriverType
, unsigned int MaxThreadsPerBlock
, unsigned int MinBlocksPerSM >
struct CudaParallelLaunch< DriverType
, Kokkos::LaunchBounds< MaxThreadsPerBlock
, MinBlocksPerSM >
, Experimental::CudaLaunchMechanism::LocalMemory >
{
static_assert(sizeof(DriverType)<CudaTraits::KernelArgumentLimit,"Kokkos Error: Requested CudaLaunchLocalMemory with a Functor larger than 4096 bytes.");
inline
CudaParallelLaunch( const DriverType & driver
, const dim3 & grid
, const dim3 & block
, const int shmem
, const CudaInternal* cuda_instance
, const bool prefer_shmem )
{
if ( (grid.x != 0) && ( ( block.x * block.y * block.z ) != 0 ) ) {
if ( cuda_instance->m_maxShmemPerBlock < shmem ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
}
#ifndef KOKKOS_ARCH_KEPLER
// On Kepler the L1 has no benefit since it doesn't cache reads
else {
CUDA_SAFE_CALL(
cudaFuncSetCacheConfig
( cuda_parallel_launch_local_memory
< DriverType, MaxThreadsPerBlock, MinBlocksPerSM >
, ( prefer_shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1 )
) );
}
#endif
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
// Invoke the driver function on the device
cuda_parallel_launch_local_memory
< DriverType, MaxThreadsPerBlock, MinBlocksPerSM >
<<< grid , block , shmem , cuda_instance->m_stream >>>( driver );
#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
CUDA_SAFE_CALL( cudaGetLastError() );
Kokkos::Cuda().fence();
#endif
}
}
static cudaFuncAttributes get_cuda_func_attributes() {
cudaFuncAttributes attr;
cudaFuncGetAttributes(&attr,cuda_parallel_launch_local_memory
< DriverType, MaxThreadsPerBlock, MinBlocksPerSM >);
return attr;
}
};
template < class DriverType>
struct CudaParallelLaunch< DriverType
, Kokkos::LaunchBounds<0,0>
, Experimental::CudaLaunchMechanism::LocalMemory >
{
static_assert(sizeof(DriverType)<CudaTraits::KernelArgumentLimit,"Kokkos Error: Requested CudaLaunchLocalMemory with a Functor larger than 4096 bytes.");
inline
CudaParallelLaunch( const DriverType & driver
, const dim3 & grid
, const dim3 & block
, const int shmem
, const CudaInternal* cuda_instance
, const bool prefer_shmem)
{
if ( (grid.x != 0) && ( ( block.x * block.y * block.z ) != 0 ) ) {
if ( cuda_instance->m_maxShmemPerBlock < shmem ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
}
#ifndef KOKKOS_ARCH_KEPLER
// On Kepler the L1 has no benefit since it doesn't cache reads
else {
CUDA_SAFE_CALL(
cudaFuncSetCacheConfig
( cuda_parallel_launch_local_memory< DriverType >
, ( prefer_shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1 )
) );
}
#endif
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
// Invoke the driver function on the device
cuda_parallel_launch_local_memory< DriverType >
<<< grid , block , shmem , cuda_instance->m_stream >>>( driver );
#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
CUDA_SAFE_CALL( cudaGetLastError() );
Kokkos::Cuda().fence();
#endif
}
}
static cudaFuncAttributes get_cuda_func_attributes() {
cudaFuncAttributes attr;
cudaFuncGetAttributes(&attr,cuda_parallel_launch_local_memory
< DriverType >);
return attr;
}
};
template < class DriverType
, unsigned int MaxThreadsPerBlock
, unsigned int MinBlocksPerSM>
struct CudaParallelLaunch< DriverType
, Kokkos::LaunchBounds< MaxThreadsPerBlock
, MinBlocksPerSM>
, Experimental::CudaLaunchMechanism::GlobalMemory >
{
inline
CudaParallelLaunch( const DriverType & driver
, const dim3 & grid
, const dim3 & block
, const int shmem
, CudaInternal* cuda_instance
, const bool prefer_shmem )
{
if ( (grid.x != 0) && ( ( block.x * block.y * block.z ) != 0 ) ) {
if ( cuda_instance->m_maxShmemPerBlock < shmem ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
}
#ifndef KOKKOS_ARCH_KEPLER
// On Kepler the L1 has no benefit since it doesn't cache reads
else {
CUDA_SAFE_CALL(
cudaFuncSetCacheConfig
( cuda_parallel_launch_global_memory
< DriverType, MaxThreadsPerBlock, MinBlocksPerSM >
, ( prefer_shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1 )
) );
}
#endif
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
DriverType* driver_ptr = NULL;
driver_ptr = reinterpret_cast<DriverType*>(cuda_instance->scratch_functor(sizeof(DriverType)));
cudaMemcpyAsync(driver_ptr,&driver, sizeof(DriverType), cudaMemcpyDefault, cuda_instance->m_stream);
// Invoke the driver function on the device
cuda_parallel_launch_global_memory
< DriverType, MaxThreadsPerBlock, MinBlocksPerSM >
<<< grid , block , shmem , cuda_instance->m_stream >>>( driver_ptr );
#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
CUDA_SAFE_CALL( cudaGetLastError() );
Kokkos::Cuda().fence();
#endif
}
}
static cudaFuncAttributes get_cuda_func_attributes() {
cudaFuncAttributes attr;
cudaFuncGetAttributes(&attr,cuda_parallel_launch_global_memory
< DriverType, MaxThreadsPerBlock, MinBlocksPerSM >);
return attr;
}
};
template < class DriverType>
struct CudaParallelLaunch< DriverType
, Kokkos::LaunchBounds<0,0>
, Experimental::CudaLaunchMechanism::GlobalMemory >
{
inline
CudaParallelLaunch( const DriverType & driver
, const dim3 & grid
, const dim3 & block
, const int shmem
, CudaInternal* cuda_instance
, const bool prefer_shmem)
{
if ( (grid.x != 0) && ( ( block.x * block.y * block.z ) != 0 ) ) {
if ( cuda_instance->m_maxShmemPerBlock < shmem ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
}
#ifndef KOKKOS_ARCH_KEPLER
// On Kepler the L1 has no benefit since it doesn't cache reads
else {
CUDA_SAFE_CALL(
cudaFuncSetCacheConfig
( cuda_parallel_launch_global_memory< DriverType >
, ( prefer_shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1 )
) );
}
#endif
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
DriverType* driver_ptr = NULL;
driver_ptr = reinterpret_cast<DriverType*>(cuda_instance->scratch_functor(sizeof(DriverType)));
cudaMemcpyAsync(driver_ptr,&driver, sizeof(DriverType), cudaMemcpyDefault, cuda_instance->m_stream);
cuda_parallel_launch_global_memory< DriverType >
<<< grid , block , shmem , cuda_instance->m_stream >>>( driver_ptr );
#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
CUDA_SAFE_CALL( cudaGetLastError() );
Kokkos::Cuda().fence();
#endif
}
}
static cudaFuncAttributes get_cuda_func_attributes() {
cudaFuncAttributes attr;
cudaFuncGetAttributes(&attr,cuda_parallel_launch_global_memory
< DriverType >);
return attr;
}
};
//----------------------------------------------------------------------------
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* defined( __CUDACC__ ) */
#endif /* defined( KOKKOS_ENABLE_CUDA ) */
#endif /* #ifndef KOKKOS_CUDAEXEC_HPP */

File diff suppressed because it is too large

View File

@ -376,13 +376,13 @@ template< class ReducerType >
__device__ inline
typename std::enable_if< Kokkos::is_reducer<ReducerType>::value >::type
cuda_intra_warp_reduction( const ReducerType& reducer,
typename ReducerType::value_type& result,
const uint32_t max_active_thread = blockDim.y) {
typedef typename ReducerType::value_type ValueType;
unsigned int shift = 1;
ValueType result = reducer.reference();
//Reduce over values from threads with different threadIdx.y
while(blockDim.x * shift < 32 ) {
const ValueType tmp = shfl_down(result, blockDim.x*shift,32u);
@ -400,6 +400,7 @@ template< class ReducerType >
__device__ inline
typename std::enable_if< Kokkos::is_reducer<ReducerType>::value >::type
cuda_inter_warp_reduction( const ReducerType& reducer,
typename ReducerType::value_type value,
const int max_active_thread = blockDim.y) {
typedef typename ReducerType::value_type ValueType;
@ -410,7 +411,6 @@ cuda_inter_warp_reduction( const ReducerType& reducer,
// could lead to race conditions
__shared__ double sh_result[(sizeof(ValueType)+7)/8*STEP_WIDTH];
ValueType* result = (ValueType*) & sh_result;
ValueType value = reducer.reference();
const int step = 32 / blockDim.x;
int shift = STEP_WIDTH;
const int id = threadIdx.y%step==0?threadIdx.y/step:65000;
@ -438,9 +438,18 @@ template< class ReducerType >
__device__ inline
typename std::enable_if< Kokkos::is_reducer<ReducerType>::value >::type
cuda_intra_block_reduction( const ReducerType& reducer,
typename ReducerType::value_type value,
const int max_active_thread = blockDim.y) {
cuda_intra_warp_reduction(reducer,max_active_thread);
cuda_inter_warp_reduction(reducer,max_active_thread);
cuda_intra_warp_reduction(reducer,value,max_active_thread);
cuda_inter_warp_reduction(reducer,value,max_active_thread);
}
template< class ReducerType >
__device__ inline
typename std::enable_if< Kokkos::is_reducer<ReducerType>::value >::type
cuda_intra_block_reduction( const ReducerType& reducer,
const int max_active_thread = blockDim.y) {
cuda_intra_block_reduction(reducer,reducer.reference(),max_active_thread);
}
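
The overloads above layer a shuffle-based intra-warp pass under a shared-memory inter-warp pass. A minimal free-standing sketch of the shuffle step (assumption: a full-warp sum using CUDA 9+ __shfl_down_sync, where the Kokkos code uses its own shfl_down wrapper):

__device__ inline float warp_sum(float v) {
  // Halve the stride each round; after 5 rounds lane 0 holds the total.
  for (int offset = 16; offset > 0; offset >>= 1)
    v += __shfl_down_sync(0xffffffffu, v, offset);
  return v;
}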
template< class ReducerType>

View File

@ -54,194 +54,8 @@
namespace Kokkos {
namespace Impl {
template class TaskQueue< Kokkos::Cuda > ;
//----------------------------------------------------------------------------
__device__
void TaskQueueSpecialization< Kokkos::Cuda >::driver
( TaskQueueSpecialization< Kokkos::Cuda >::queue_type * const queue
, int32_t shmem_per_warp )
{
using Member = TaskExec< Kokkos::Cuda > ;
using Queue = TaskQueue< Kokkos::Cuda > ;
using task_root_type = TaskBase< void , void , void > ;
extern __shared__ int32_t shmem_all[];
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
int32_t * const warp_shmem =
shmem_all + ( threadIdx.z * shmem_per_warp ) / sizeof(int32_t);
task_root_type * const task_shmem = (task_root_type *) warp_shmem ;
const int warp_lane = threadIdx.x + threadIdx.y * blockDim.x ;
Member single_exec( warp_shmem , 1 );
Member team_exec( warp_shmem , blockDim.y );
task_root_type * task_ptr ;
// Loop until all queues are empty and no tasks in flight
do {
// Each team lead attempts to acquire either a thread team task
// or collection of single thread tasks for the team.
if ( 0 == warp_lane ) {
task_ptr = 0 < *((volatile int *) & queue->m_ready_count) ? end : 0 ;
// Loop by priority and then type
for ( int i = 0 ; i < Queue::NumQueue && end == task_ptr ; ++i ) {
for ( int j = 0 ; j < 2 && end == task_ptr ; ++j ) {
task_ptr = Queue::pop_ready_task( & queue->m_ready[i][j] );
}
}
#if 0
printf("TaskQueue<Cuda>::driver(%d,%d) task(%lx)\n",threadIdx.z,blockIdx.x
, uintptr_t(task_ptr));
#endif
}
// Synchronize warp with memory fence before broadcasting task pointer:
// KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "A" );
KOKKOS_IMPL_CUDA_SYNCWARP ;
// Broadcast task pointer:
((int*) & task_ptr )[0] = KOKKOS_IMPL_CUDA_SHFL( ((int*) & task_ptr )[0] , 0 , 32 );
((int*) & task_ptr )[1] = KOKKOS_IMPL_CUDA_SHFL( ((int*) & task_ptr )[1] , 0 , 32 );
#if defined( KOKKOS_DEBUG )
KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "TaskQueue CUDA task_ptr" );
#endif
if ( 0 == task_ptr ) break ; // 0 == queue->m_ready_count
if ( end != task_ptr ) {
// Whole warp copy task's closure to/from shared memory.
// Use all threads of warp for coalesced read/write.
int32_t const b = sizeof(task_root_type) / sizeof(int32_t);
int32_t const e = *((int32_t volatile *)( & task_ptr->m_alloc_size )) / sizeof(int32_t);
int32_t volatile * const task_mem = (int32_t volatile *) task_ptr ;
// copy task closure from global to shared memory:
for ( int32_t i = warp_lane ; i < e ; i += CudaTraits::WarpSize ) {
warp_shmem[i] = task_mem[i] ;
}
// Synchronize threads of the warp and ensure memory
// writes are visible to all threads in the warp.
// KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "B" );
KOKKOS_IMPL_CUDA_SYNCWARP ;
if ( task_root_type::TaskTeam == task_shmem->m_task_type ) {
// Thread Team Task
(*task_shmem->m_apply)( task_shmem , & team_exec );
}
else if ( 0 == threadIdx.y ) {
// Single Thread Task
(*task_shmem->m_apply)( task_shmem , & single_exec );
}
// Synchronize threads of the warp and ensure memory
// writes are visible to all threads in the warp.
// KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "C" );
KOKKOS_IMPL_CUDA_SYNCWARP ;
// copy task closure from shared to global memory:
for ( int32_t i = b + warp_lane ; i < e ; i += CudaTraits::WarpSize ) {
task_mem[i] = warp_shmem[i] ;
}
// Synchronize threads of the warp and ensure memory
// writes are visible to root thread of the warp for
// respawn or completion.
// KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "D" );
KOKKOS_IMPL_CUDA_SYNCWARP ;
// If respawn requested copy respawn data back to main memory
if ( 0 == warp_lane ) {
if ( ((task_root_type *) task_root_type::LockTag) != task_shmem->m_next ) {
( (volatile task_root_type *) task_ptr )->m_next = task_shmem->m_next ;
( (volatile task_root_type *) task_ptr )->m_priority = task_shmem->m_priority ;
}
queue->complete( task_ptr );
}
}
} while(1);
}
namespace {
__global__
void cuda_task_queue_execute( TaskQueue< Kokkos::Cuda > * queue
, int32_t shmem_size )
{ TaskQueueSpecialization< Kokkos::Cuda >::driver( queue , shmem_size ); }
}
void TaskQueueSpecialization< Kokkos::Cuda >::execute
( TaskQueue< Kokkos::Cuda > * const queue )
{
const int shared_per_warp = 2048 ;
const int warps_per_block = 4 ;
const dim3 grid( Kokkos::Impl::cuda_internal_multiprocessor_count() , 1 , 1 );
const dim3 block( 1 , Kokkos::Impl::CudaTraits::WarpSize , warps_per_block );
const int shared_total = shared_per_warp * warps_per_block ;
const cudaStream_t stream = 0 ;
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
#if 0
printf("cuda_task_queue_execute before\n");
#endif
// Query the stack size, in bytes:
size_t previous_stack_size = 0 ;
CUDA_SAFE_CALL( cudaDeviceGetLimit( & previous_stack_size , cudaLimitStackSize ) );
// If not large enough then set the stack size, in bytes:
const size_t larger_stack_size = 2048 ;
if ( previous_stack_size < larger_stack_size ) {
CUDA_SAFE_CALL( cudaDeviceSetLimit( cudaLimitStackSize , larger_stack_size ) );
}
cuda_task_queue_execute<<< grid , block , shared_total , stream >>>( queue , shared_per_warp );
CUDA_SAFE_CALL( cudaGetLastError() );
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
if ( previous_stack_size < larger_stack_size ) {
CUDA_SAFE_CALL( cudaDeviceSetLimit( cudaLimitStackSize , previous_stack_size ) );
}
#if 0
printf("cuda_task_queue_execute after\n");
#endif
}
template class TaskQueue< Kokkos::Cuda, Impl::default_tasking_memory_space_for_execution_space_t<Kokkos::Cuda> > ;
template class TaskQueueMultiple< Kokkos::Cuda, Impl::default_tasking_memory_space_for_execution_space_t<Kokkos::Cuda> > ;
}} /* namespace Kokkos::Impl */

View File

@ -50,6 +50,14 @@
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#include <Kokkos_Core_fwd.hpp>
#include <impl/Kokkos_TaskBase.hpp>
#include <Cuda/Kokkos_Cuda_Error.hpp> // CUDA_SAFE_CALL
#include <impl/Kokkos_TaskTeamMember.hpp>
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
namespace {
@ -57,54 +65,498 @@ namespace {
template< typename TaskType >
__global__
void set_cuda_task_base_apply_function_pointer
( TaskBase<void,void,void>::function_type * ptr )
{ *ptr = TaskType::apply ; }
( typename TaskType::function_type * ptr, typename TaskType::destroy_type* dtor )
{
*ptr = TaskType::apply;
*dtor = TaskType::destroy;
}
template< typename Scheduler >
__global__
void cuda_task_queue_execute( Scheduler scheduler, int32_t shmem_size ) {
TaskQueueSpecialization< Scheduler >::driver( std::move(scheduler) , shmem_size );
}
}
template< class > class TaskExec ;
template <class, class> class TaskExec ;
template<>
class TaskQueueSpecialization< Kokkos::Cuda >
template<class QueueType>
class TaskQueueSpecialization<
SimpleTaskScheduler<Kokkos::Cuda, QueueType>
>
{
public:
using execution_space = Kokkos::Cuda ;
using memory_space = Kokkos::CudaUVMSpace ;
using queue_type = TaskQueue< execution_space > ;
using member_type = TaskExec< Kokkos::Cuda > ;
using scheduler_type = SimpleTaskScheduler<Kokkos::Cuda, QueueType>;
using execution_space = Kokkos::Cuda;
using memory_space = Kokkos::CudaUVMSpace;
using member_type = TaskExec<Kokkos::Cuda, scheduler_type> ;
enum : long { max_league_size = 16 };
enum : int { warps_per_block = 4 };
KOKKOS_INLINE_FUNCTION
static
void iff_single_thread_recursive_execute( queue_type * const ) {}
void iff_single_thread_recursive_execute( scheduler_type const& ) {}
static int get_max_team_count(
execution_space const&
) {
return Kokkos::Impl::cuda_internal_multiprocessor_count() * warps_per_block;
}
__device__
static void driver( queue_type * const , int32_t );
static void driver(scheduler_type scheduler, int32_t shmem_per_warp)
{
using queue_type = typename scheduler_type::task_queue_type;
using task_base_type = typename scheduler_type::task_base_type;
using runnable_task_base_type = typename scheduler_type::runnable_task_base_type;
using scheduling_info_storage_type =
SchedulingInfoStorage<
runnable_task_base_type,
typename scheduler_type::task_scheduling_info_type
>;
extern __shared__ int32_t shmem_all[];
int32_t* const warp_shmem = shmem_all + (threadIdx.z * shmem_per_warp) / sizeof(int32_t);
task_base_type* const shared_memory_task_copy = (task_base_type*)warp_shmem;
const int warp_lane = threadIdx.x + threadIdx.y * blockDim.x;
member_type single_exec(scheduler, warp_shmem, 1);
member_type team_exec(scheduler, warp_shmem, blockDim.y);
auto& queue = scheduler.queue();
auto& team_scheduler = team_exec.scheduler();
auto current_task = OptionalRef<task_base_type>();
// Loop until all queues are empty and no tasks in flight
while(not queue.is_done()) {
if(warp_lane == 0) { // should be (?) same as team_exec.team_rank() == 0
// pop off a task
current_task = queue.pop_ready_task(team_scheduler.team_scheduler_info());
}
// Broadcast task pointer:
// Sync before the broadcast
KOKKOS_IMPL_CUDA_SYNCWARP;
// pretend it's an int* for shuffle purposes
((int*) &current_task)[0] = KOKKOS_IMPL_CUDA_SHFL(((int*) &current_task)[0], 0, 32);
((int*) &current_task)[1] = KOKKOS_IMPL_CUDA_SHFL(((int*) &current_task)[1], 0, 32);
if(current_task) {
KOKKOS_ASSERT(!current_task->as_runnable_task().get_respawn_flag());
int32_t b = sizeof(scheduling_info_storage_type) / sizeof(int32_t);
static_assert(
sizeof(scheduling_info_storage_type) % sizeof(int32_t) == 0,
"bad task size"
);
int32_t const e = current_task->get_allocation_size() / sizeof(int32_t);
KOKKOS_ASSERT(current_task->get_allocation_size() % sizeof(int32_t) == 0);
int32_t volatile* const task_mem = (int32_t volatile*)current_task.get();
// do a coordinated copy of the task closure from global to shared memory:
for(int32_t i = warp_lane; i < e; i += CudaTraits::WarpSize) {
warp_shmem[i] = task_mem[i];
}
// Synchronize threads of the warp and ensure memory
// writes are visible to all threads in the warp.
KOKKOS_IMPL_CUDA_SYNCWARP;
if(shared_memory_task_copy->is_team_runnable()) {
// Thread Team Task
shared_memory_task_copy->as_runnable_task().run(team_exec);
}
else if(threadIdx.y == 0) {
// TODO @tasking @optimization DSH Change this to warp_lane == 0 when we allow blockDim.x to be more than 1
// Single Thread Task
shared_memory_task_copy->as_runnable_task().run(single_exec);
}
// Synchronize threads of the warp and ensure memory
// writes are visible to all threads in the warp.
KOKKOS_IMPL_CUDA_SYNCWARP;
//if(warp_lane < b % CudaTraits::WarpSize) b += CudaTraits::WarpSize;
//b -= b % CudaTraits::WarpSize;
// copy task closure from shared to global memory:
for (int32_t i = b + warp_lane; i < e; i += CudaTraits::WarpSize) {
task_mem[i] = warp_shmem[i];
}
// Synchronize threads of the warp and ensure memory
// writes are visible to root thread of the warp for
// respawn or completion.
KOKKOS_IMPL_CUDA_SYNCWARP;
if (warp_lane == 0) {
// If respawn requested copy respawn data back to main memory
if(shared_memory_task_copy->as_runnable_task().get_respawn_flag()) {
if(shared_memory_task_copy->as_runnable_task().has_predecessor()) {
// It's not necessary to make this a volatile write because
// the next read of the predecessor is on this thread in complete,
// and the predecessor is cleared there (using a volatile write)
current_task->as_runnable_task().acquire_predecessor_from(
shared_memory_task_copy->as_runnable_task()
);
}
// It may not be necessary to make this a volatile write, since the
// next read will be done by this thread in complete where the
// rescheduling occurs, but since the task could be stolen later
// before this is written again, we should do the volatile write
// here. (It might not be necessary though because I don't know
// where else the priority would be read after it is scheduled
// by this thread; for now, we leave it volatile, but we should
// benchmark the cost of this.)
current_task.as_volatile()->set_priority(shared_memory_task_copy->get_priority());
// It's not necessary to make this a volatile write, since the
// next read of it (if true) will be by this thread in `complete()`,
// which will unset the flag (using volatile) once it has handled
// the respawn
current_task->as_runnable_task().set_respawn_flag();
}
queue.complete(
(*std::move(current_task)).as_runnable_task(),
team_scheduler.team_scheduler_info()
);
}
}
}
}
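The pointer broadcast above uses a standard warp idiom: the 64-bit task pointer is moved through two 32-bit shuffles because the wrapper macro shuffles one word per call. A minimal standalone sketch of the same idiom, assuming a full 32-lane warp and the raw CUDA 9+ __shfl_sync intrinsic instead of the Kokkos wrappers:

__device__ inline void* warp_broadcast_ptr(void* p, int src_lane) {
  // View the 64-bit pointer as two 32-bit words so each half fits
  // through a 32-bit warp shuffle.
  int2 words = *reinterpret_cast<int2*>(&p);
  words.x = __shfl_sync(0xffffffffu, words.x, src_lane);
  words.y = __shfl_sync(0xffffffffu, words.y, src_lane);
  *reinterpret_cast<int2*>(&p) = words;
  return p;  // every lane now holds src_lane's pointer
}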
static
void execute(scheduler_type const& scheduler)
{
const int shared_per_warp = 2048 ;
const dim3 grid(Kokkos::Impl::cuda_internal_multiprocessor_count(), 1, 1);
const dim3 block(1, Kokkos::Impl::CudaTraits::WarpSize, warps_per_block);
const int shared_total = shared_per_warp * warps_per_block;
const cudaStream_t stream = nullptr;
KOKKOS_ASSERT(
static_cast<long>(grid.x * grid.y * grid.z * block.x * block.y * block.z)
== static_cast<long>(get_max_team_count(scheduler.get_execution_space()) * Kokkos::Impl::CudaTraits::WarpSize)
);
auto& queue = scheduler.queue();
CUDA_SAFE_CALL(cudaDeviceSynchronize());
// Query the stack size, in bytes:
size_t previous_stack_size = 0;
CUDA_SAFE_CALL(cudaDeviceGetLimit(&previous_stack_size, cudaLimitStackSize));
// If not large enough then set the stack size, in bytes:
const size_t larger_stack_size = 1 << 11;
if (previous_stack_size < larger_stack_size) {
CUDA_SAFE_CALL(cudaDeviceSetLimit(cudaLimitStackSize, larger_stack_size));
}
cuda_task_queue_execute<<<grid, block, shared_total, stream>>>(scheduler, shared_per_warp);
CUDA_SAFE_CALL(cudaGetLastError());
CUDA_SAFE_CALL(cudaDeviceSynchronize());
if (previous_stack_size < larger_stack_size) {
CUDA_SAFE_CALL(cudaDeviceSetLimit(cudaLimitStackSize, previous_stack_size));
}
}
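The stack-limit juggling above exists because task bodies may call deeper than the default per-thread device stack allows; the limit is raised before the launch and restored afterwards so surrounding code is unaffected. Reduced to its skeleton (a sketch; launch_task_kernel is a hypothetical stand-in for the launch above):

size_t old_limit = 0;
CUDA_SAFE_CALL(cudaDeviceGetLimit(&old_limit, cudaLimitStackSize));
const size_t needed = 2048;  // bytes of stack per device thread
if (old_limit < needed) {
  CUDA_SAFE_CALL(cudaDeviceSetLimit(cudaLimitStackSize, needed));
}
launch_task_kernel();  // hypothetical kernel launch
CUDA_SAFE_CALL(cudaDeviceSynchronize());
if (old_limit < needed) {
  CUDA_SAFE_CALL(cudaDeviceSetLimit(cudaLimitStackSize, old_limit));  // restore
}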
template <typename TaskType>
static
// TODO @tasking @optimization DSH specialize this for trivially destructible types
void
get_function_pointer(
typename TaskType::function_type& ptr,
typename TaskType::destroy_type& dtor
)
{
using function_type = typename TaskType::function_type;
using destroy_type = typename TaskType::destroy_type;
// TODO @tasking @minor DSH make sure there aren't any alignment concerns?
void* storage = cuda_internal_scratch_unified(
Kokkos::Cuda(),
sizeof(function_type) + sizeof(destroy_type)
);
function_type* ptr_ptr = (function_type*)storage;
destroy_type* dtor_ptr = (destroy_type*)((char*)storage + sizeof(function_type));
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
set_cuda_task_base_apply_function_pointer<TaskType><<<1,1>>>(ptr_ptr, dtor_ptr);
CUDA_SAFE_CALL( cudaGetLastError() );
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
ptr = *ptr_ptr;
dtor = *dtor_ptr;
}
};
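Note the trick in get_function_pointer: the address of a __device__ function can only be formed in device code, so a one-thread kernel writes the apply/destroy pointers into unified (zero-copy) scratch and the host reads them back. The same idea in a self-contained sketch with hypothetical names:

typedef void (*device_fn_t)(int);

__device__ void my_task_apply(int) { /* ... */ }

__global__ void capture_apply(device_fn_t* out) {
  *out = my_task_apply;  // taking the address is legal only on the device
}

device_fn_t fetch_apply_pointer() {
  device_fn_t* slot = nullptr;
  cudaMallocManaged(&slot, sizeof(device_fn_t));
  capture_apply<<<1, 1>>>(slot);
  cudaDeviceSynchronize();
  device_fn_t fn = *slot;  // host-side copy of a device address: only
  cudaFree(slot);          // meaningful when passed back into device code
  return fn;
}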
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
template<class Scheduler>
class TaskQueueSpecializationConstrained<
Scheduler,
typename std::enable_if<
std::is_same<typename Scheduler::execution_space, Kokkos::Cuda>::value
>::type
>
{
public:
using scheduler_type = Scheduler;
using execution_space = Kokkos::Cuda;
using memory_space = Kokkos::CudaUVMSpace;
using member_type = TaskExec<Kokkos::Cuda, Scheduler> ;
enum : long { max_league_size = 16 };
KOKKOS_INLINE_FUNCTION
static
void iff_single_thread_recursive_execute( scheduler_type const& ) {}
__device__
static void driver(scheduler_type scheduler, int32_t shmem_per_warp)
{
using queue_type = typename scheduler_type::queue_type;
using task_root_type = TaskBase;
extern __shared__ int32_t shmem_all[];
task_root_type* const end = (task_root_type *) task_root_type::EndTag ;
task_root_type* const no_more_tasks_sentinel = nullptr;
int32_t * const warp_shmem =
shmem_all + ( threadIdx.z * shmem_per_warp ) / sizeof(int32_t);
task_root_type * const task_shmem = (task_root_type *) warp_shmem ;
const int warp_lane = threadIdx.x + threadIdx.y * blockDim.x ;
member_type single_exec(scheduler, warp_shmem, 1);
member_type team_exec(scheduler, warp_shmem, blockDim.y);
auto& team_queue = team_exec.scheduler().queue();
task_root_type * task_ptr = no_more_tasks_sentinel;
// Loop until all queues are empty and no tasks in flight
do {
// Each team lead attempts to acquire either a thread team task
// or collection of single thread tasks for the team.
if ( 0 == warp_lane ) {
if( *((volatile int *) & team_queue.m_ready_count) > 0 ) {
task_ptr = end;
// Attempt to acquire a task
// Loop by priority and then type
for ( int i = 0 ; i < queue_type::NumQueue && end == task_ptr ; ++i ) {
for ( int j = 0 ; j < 2 && end == task_ptr ; ++j ) {
task_ptr = queue_type::pop_ready_task( & team_queue.m_ready[i][j] );
}
}
}
else {
// returns nullptr if and only if all other queues have a ready
// count of 0 also. Otherwise, returns a task from another queue
// or `end` if one couldn't be popped
task_ptr = team_queue.attempt_to_steal_task();
#if 0
if(task_ptr != no_more_tasks_sentinel && task_ptr != end) {
std::printf("task stolen on rank %d\n", team_exec.league_rank());
}
#endif
}
}
// Synchronize warp with memory fence before broadcasting task pointer:
// KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "A" );
KOKKOS_IMPL_CUDA_SYNCWARP ;
// Broadcast task pointer:
((int*) & task_ptr )[0] = KOKKOS_IMPL_CUDA_SHFL( ((int*) & task_ptr )[0] , 0 , 32 );
((int*) & task_ptr )[1] = KOKKOS_IMPL_CUDA_SHFL( ((int*) & task_ptr )[1] , 0 , 32 );
#if defined( KOKKOS_DEBUG )
KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "TaskQueue CUDA task_ptr" );
#endif
if ( 0 == task_ptr ) break ; // 0 == queue->m_ready_count
if ( end != task_ptr ) {
// Whole warp copy task's closure to/from shared memory.
// Use all threads of warp for coalesced read/write.
int32_t const b = sizeof(task_root_type) / sizeof(int32_t);
int32_t const e = *((int32_t volatile *)( & task_ptr->m_alloc_size )) / sizeof(int32_t);
int32_t volatile * const task_mem = (int32_t volatile *) task_ptr ;
KOKKOS_ASSERT(e * sizeof(int32_t) < shmem_per_warp);
// copy task closure from global to shared memory:
for ( int32_t i = warp_lane ; i < e ; i += CudaTraits::WarpSize ) {
warp_shmem[i] = task_mem[i] ;
}
// Synchronize threads of the warp and ensure memory
// writes are visible to all threads in the warp.
// KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "B" );
KOKKOS_IMPL_CUDA_SYNCWARP ;
if ( task_root_type::TaskTeam == task_shmem->m_task_type ) {
// Thread Team Task
(*task_shmem->m_apply)( task_shmem , & team_exec );
}
else if ( 0 == threadIdx.y ) {
// Single Thread Task
(*task_shmem->m_apply)( task_shmem , & single_exec );
}
// Synchronize threads of the warp and ensure memory
// writes are visible to all threads in the warp.
// KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "C" );
KOKKOS_IMPL_CUDA_SYNCWARP ;
// copy task closure from shared to global memory:
for ( int32_t i = b + warp_lane ; i < e ; i += CudaTraits::WarpSize ) {
task_mem[i] = warp_shmem[i] ;
}
// Synchronize threads of the warp and ensure memory
// writes are visible to root thread of the warp for
// respawn or completion.
// KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "D" );
KOKKOS_IMPL_CUDA_SYNCWARP ;
// If respawn requested copy respawn data back to main memory
if ( 0 == warp_lane ) {
if ( ((task_root_type *) task_root_type::LockTag) != task_shmem->m_next ) {
( (volatile task_root_type *) task_ptr )->m_next = task_shmem->m_next ;
( (volatile task_root_type *) task_ptr )->m_priority = task_shmem->m_priority ;
}
team_queue.complete( task_ptr );
}
}
} while(1);
}
static
void execute(scheduler_type const& scheduler)
{
const int shared_per_warp = 2048 ;
const int warps_per_block = 4 ;
const dim3 grid( Kokkos::Impl::cuda_internal_multiprocessor_count() , 1 , 1 );
//const dim3 grid( 1 , 1 , 1 );
const dim3 block( 1 , Kokkos::Impl::CudaTraits::WarpSize , warps_per_block );
const int shared_total = shared_per_warp * warps_per_block ;
const cudaStream_t stream = 0 ;
auto& queue = scheduler.queue();
queue.initialize_team_queues(warps_per_block * grid.x);
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
// Query the stack size, in bytes:
size_t previous_stack_size = 0 ;
CUDA_SAFE_CALL( cudaDeviceGetLimit( & previous_stack_size , cudaLimitStackSize ) );
// If not large enough then set the stack size, in bytes:
const size_t larger_stack_size = 2048 ;
if ( previous_stack_size < larger_stack_size ) {
CUDA_SAFE_CALL( cudaDeviceSetLimit( cudaLimitStackSize , larger_stack_size ) );
}
cuda_task_queue_execute<<< grid , block , shared_total , stream >>>( scheduler , shared_per_warp );
CUDA_SAFE_CALL( cudaGetLastError() );
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
if ( previous_stack_size < larger_stack_size ) {
CUDA_SAFE_CALL( cudaDeviceSetLimit( cudaLimitStackSize , previous_stack_size ) );
}
}
template< typename TaskType >
static
void
get_function_pointer(
typename TaskType::function_type& ptr,
typename TaskType::destroy_type& dtor
)
{
using function_type = typename TaskType::function_type;
using destroy_type = typename TaskType::destroy_type;
void* storage = cuda_internal_scratch_unified(
Kokkos::Cuda(),
sizeof(function_type) + sizeof(destroy_type)
);
function_type* ptr_ptr = (function_type*)storage;
destroy_type* dtor_ptr = (destroy_type*)((char*)storage + sizeof(function_type));
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
set_cuda_task_base_apply_function_pointer<TaskType><<<1,1>>>(ptr_ptr, dtor_ptr);
CUDA_SAFE_CALL( cudaGetLastError() );
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
ptr = *ptr_ptr;
dtor = *dtor_ptr;
}
};
extern template class TaskQueue< Kokkos::Cuda > ;
extern template class TaskQueue< Kokkos::Cuda, default_tasking_memory_space_for_execution_space_t<Kokkos::Cuda> > ;
}} /* namespace Kokkos::Impl */
@ -136,8 +588,8 @@ namespace Impl {
* When executing a single thread task the syncwarp or other
* warp synchronizing functions must not be called.
*/
template <class Scheduler>
class TaskExec<Kokkos::Cuda, Scheduler>
{
private:
@ -148,24 +600,39 @@ private:
TaskExec & operator = ( TaskExec && ) = delete ;
TaskExec & operator = ( TaskExec const & ) = delete ;
friend class Kokkos::Impl::TaskQueue< Kokkos::Cuda, default_tasking_memory_space_for_execution_space_t<Kokkos::Cuda> > ;
template <class, class>
friend class Kokkos::Impl::TaskQueueSpecializationConstrained;
template <class>
friend class Kokkos::Impl::TaskQueueSpecialization;
int32_t * m_team_shmem ;
const int m_team_size ;
Scheduler m_scheduler;
// If constructed with arg_team_size == 1 the object
// can only be used by 0 == threadIdx.y.
KOKKOS_INLINE_FUNCTION
TaskExec(
Scheduler const& parent_scheduler,
int32_t* arg_team_shmem,
int arg_team_size = blockDim.y
)
: m_team_shmem(arg_team_shmem),
m_team_size(arg_team_size),
m_scheduler(parent_scheduler.get_team_scheduler(league_rank()))
{ }
public:
using thread_team_member = TaskExec;
#if defined( __CUDA_ARCH__ )
__device__ int team_rank() const { return threadIdx.y ; }
__device__ int team_size() const { return m_team_size ; }
//__device__ int league_rank() const { return threadIdx.z; }
__device__ int league_rank() const { return blockIdx.x * blockDim.z + threadIdx.z; }
__device__ int league_size() const { return blockDim.z * gridDim.x; }
__device__ void team_barrier() const
{
@ -186,13 +653,18 @@ public:
}
#else
__host__ int team_rank() const { return 0 ; }
__host__ int team_size() const { return 0 ; }
__host__ int league_rank() const { return 0; }
__host__ int league_size() const { return 0; }
__host__ void team_barrier() const {}
template< class ValueType >
__host__ void team_broadcast( ValueType & , const int ) const {}
#endif
KOKKOS_INLINE_FUNCTION Scheduler const& scheduler() const noexcept { return m_scheduler; }
KOKKOS_INLINE_FUNCTION Scheduler& scheduler() noexcept { return m_scheduler; }
};
}} /* namespace Kokkos::Impl */
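To make the index math above concrete: with the launch shape used by execute() (for example grid(30,1,1) and block(1,32,4)), each threadIdx.z slice of a block is one warp-sized team, so league_size() = 4 * 30 = 120, and the warp at blockIdx.x == 2, threadIdx.z == 3 has league_rank() = 2*4 + 3 = 11.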
@ -203,20 +675,22 @@ public:
namespace Kokkos {
namespace Impl {
template<typename iType, typename Scheduler>
struct TeamThreadRangeBoundariesStruct<iType, TaskExec<Kokkos::Cuda, Scheduler>>
{
using index_type = iType;
using member_type = TaskExec<Kokkos::Cuda, Scheduler>;
const iType start ;
const iType end ;
const iType increment ;
member_type const& thread;
#if defined( __CUDA_ARCH__ )
__device__ inline
TeamThreadRangeBoundariesStruct
( member_type const& arg_thread, const iType& arg_count)
: start( threadIdx.y )
, end(arg_count)
, increment( blockDim.y )
@ -225,7 +699,7 @@ struct TeamThreadRangeBoundariesStruct<iType, TaskExec< Kokkos::Cuda > >
__device__ inline
TeamThreadRangeBoundariesStruct
( member_type const& arg_thread
, const iType & arg_start
, const iType & arg_end
)
@ -238,10 +712,10 @@ struct TeamThreadRangeBoundariesStruct<iType, TaskExec< Kokkos::Cuda > >
#else
TeamThreadRangeBoundariesStruct
( member_type const& arg_thread, const iType& arg_count);
TeamThreadRangeBoundariesStruct
( member_type const& arg_thread
, const iType & arg_start
, const iType & arg_end
);
@ -252,20 +726,22 @@ struct TeamThreadRangeBoundariesStruct<iType, TaskExec< Kokkos::Cuda > >
//----------------------------------------------------------------------------
template<typename iType, typename Scheduler>
struct ThreadVectorRangeBoundariesStruct<iType, TaskExec< Kokkos::Cuda, Scheduler > >
{
using index_type = iType;
using member_type = TaskExec<Kokkos::Cuda, Scheduler>;
const index_type start ;
const index_type end ;
const index_type increment ;
const member_type& thread;
#if defined( __CUDA_ARCH__ )
__device__ inline
ThreadVectorRangeBoundariesStruct
( member_type const& arg_thread, const index_type& arg_count )
: start( threadIdx.x )
, end(arg_count)
, increment( blockDim.x )
@ -274,9 +750,9 @@ struct ThreadVectorRangeBoundariesStruct<iType, TaskExec< Kokkos::Cuda > >
__device__ inline
ThreadVectorRangeBoundariesStruct
( member_type const& arg_thread, const index_type& arg_begin, const index_type& arg_end )
: start( arg_begin + threadIdx.x )
, end(arg_end)
, increment( blockDim.x )
, thread(arg_thread)
{}
@ -284,10 +760,10 @@ struct ThreadVectorRangeBoundariesStruct<iType, TaskExec< Kokkos::Cuda > >
#else
ThreadVectorRangeBoundariesStruct
( member_type const& arg_thread, const index_type& arg_count );
ThreadVectorRangeBoundariesStruct
( member_type const& arg_thread, const index_type& arg_begin, const index_type& arg_end);
#endif
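The struct above encodes a vector-lane-strided loop: start = threadIdx.x, increment = blockDim.x. Written out as the raw loop it stands for (a sketch; n and body are placeholders):

for (int i = threadIdx.x; i < n; i += blockDim.x) {
  body(i);  // each vector lane handles every blockDim.x-th iteration
}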
@ -299,69 +775,69 @@ struct ThreadVectorRangeBoundariesStruct<iType, TaskExec< Kokkos::Cuda > >
namespace Kokkos {
//template<typename iType>
//KOKKOS_INLINE_FUNCTION
//Impl::TeamThreadRangeBoundariesStruct< iType, Impl::TaskExec< Kokkos::Cuda > >
//TeamThreadRange( const Impl::TaskExec< Kokkos::Cuda > & thread, const iType & count )
//{
// return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::TaskExec< Kokkos::Cuda > >( thread, count );
//}
//
//template<typename iType1, typename iType2>
//KOKKOS_INLINE_FUNCTION
//Impl::TeamThreadRangeBoundariesStruct
// < typename std::common_type<iType1,iType2>::type
// , Impl::TaskExec< Kokkos::Cuda > >
//TeamThreadRange( const Impl::TaskExec< Kokkos::Cuda > & thread
// , const iType1 & begin, const iType2 & end )
//{
// typedef typename std::common_type< iType1, iType2 >::type iType;
// return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::TaskExec< Kokkos::Cuda > >(
// thread, iType(begin), iType(end) );
//}
//
//template<typename iType>
//KOKKOS_INLINE_FUNCTION
//Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >
//ThreadVectorRange( const Impl::TaskExec< Kokkos::Cuda > & thread
// , const iType & count )
//{
// return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >(thread,count);
//}
//
//template<typename iType>
//KOKKOS_INLINE_FUNCTION
//Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >
//ThreadVectorRange( const Impl::TaskExec< Kokkos::Cuda > & thread
// , const iType & arg_begin
// , const iType & arg_end )
//{
// return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >(thread,arg_begin,arg_end);
//}
// KOKKOS_INLINE_FUNCTION
// Impl::ThreadSingleStruct<Impl::TaskExec< Kokkos::Cuda > >
// PerTeam(const Impl::TaskExec< Kokkos::Cuda >& thread)
// {
// return Impl::ThreadSingleStruct<Impl::TaskExec< Kokkos::Cuda > >(thread);
// }
// KOKKOS_INLINE_FUNCTION
// Impl::VectorSingleStruct<Impl::TaskExec< Kokkos::Cuda > >
// PerThread(const Impl::TaskExec< Kokkos::Cuda >& thread)
// {
// return Impl::VectorSingleStruct<Impl::TaskExec< Kokkos::Cuda > >(thread);
// }
/** \brief Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all threads of the calling thread team.
* This functionality requires C++11 support.
*/
template<typename iType, class Lambda, class Scheduler>
KOKKOS_INLINE_FUNCTION
void parallel_for
( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::Cuda, Scheduler > >& loop_boundaries
, const Lambda& lambda
)
{
@ -370,10 +846,10 @@ void parallel_for
}
}
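As a usage sketch (hedged: member_type, n, and values are placeholder names, and the TeamThreadRange overload for task members is assumed to come from the generic task-team header), inside a task body the member handle drives the loop just like a regular team policy:

KOKKOS_INLINE_FUNCTION
void operator()(member_type& member) {
  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n),
                       [&](int i) { values(i) += 1; });
}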
template< typename iType, class Lambda, class Scheduler >
KOKKOS_INLINE_FUNCTION
void parallel_for
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda, Scheduler > >& loop_boundaries,
const Lambda & lambda) {
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i);
@ -459,14 +935,14 @@ void parallel_reduce
// blockDim.y == team_size
// threadIdx.x == position in vec
// threadIdx.y == member number
template< typename iType, class Lambda, typename ValueType, class Scheduler >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda, Scheduler > >& loop_boundaries,
const Lambda & lambda,
ValueType& initialized_result) {
//TODO @internal_documentation what is the point of creating this temporary?
ValueType result = initialized_result;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i,result);
@ -487,15 +963,15 @@ void parallel_reduce
}
}
template< typename iType, class Lambda, typename ReducerType, class Scheduler >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda, Scheduler > >& loop_boundaries,
const Lambda & lambda,
const ReducerType& reducer) {
typedef typename ReducerType::value_type ValueType;
//TODO @internal_documentation what is the point of creating this temporary?
ValueType result = ValueType();
reducer.init(result);
@ -549,10 +1025,10 @@ void parallel_reduce
// blockDim.y == team_size
// threadIdx.x == position in vec
// threadIdx.y == member number
template< typename iType, class Lambda, typename ValueType, class Scheduler >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda, Scheduler > >& loop_boundaries,
const Lambda & lambda,
ValueType& initialized_result) {
@ -576,10 +1052,10 @@ void parallel_reduce
}
}
template< typename iType, class Lambda, typename ReducerType, class Scheduler >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda, Scheduler > >& loop_boundaries,
const Lambda & lambda,
const ReducerType& reducer) {
@ -611,10 +1087,10 @@ void parallel_reduce
// blockDim.y == team_size
// threadIdx.x == position in vec
// threadIdx.y == member number
template< typename iType, class Closure, class Scheduler >
KOKKOS_INLINE_FUNCTION
void parallel_scan
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda, Scheduler > >& loop_boundaries,
const Closure & closure )
{
// Extract value_type from closure
@ -676,10 +1152,10 @@ void parallel_scan
// blockDim.y == team_size
// threadIdx.x == position in vec
// threadIdx.y == member number
template< typename iType, class Closure, class Scheduler >
KOKKOS_INLINE_FUNCTION
void parallel_scan
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda, Scheduler > >& loop_boundaries,
const Closure & closure )
{
// Extract value_type from closure
@ -735,25 +1211,25 @@ void parallel_scan
namespace Kokkos {
template<class FunctorType, class Scheduler>
KOKKOS_INLINE_FUNCTION
void single(const Impl::VectorSingleStruct<Impl::TaskExec< Kokkos::Cuda, Scheduler > >& , const FunctorType& lambda) {
#ifdef __CUDA_ARCH__
if(threadIdx.x == 0) lambda();
#endif
}
template<class FunctorType, class Scheduler>
KOKKOS_INLINE_FUNCTION
void single(const Impl::ThreadSingleStruct<Impl::TaskExec< Kokkos::Cuda, Scheduler > >& , const FunctorType& lambda) {
#ifdef __CUDA_ARCH__
if(threadIdx.x == 0 && threadIdx.y == 0) lambda();
#endif
}
template<class FunctorType, class ValueType, class Scheduler>
KOKKOS_INLINE_FUNCTION
void single(const Impl::VectorSingleStruct<Impl::TaskExec< Kokkos::Cuda, Scheduler > >& s , const FunctorType& lambda, ValueType& val) {
#ifdef __CUDA_ARCH__
if(threadIdx.x == 0) lambda(val);
if ( 1 < s.team_member.team_size() ) {
@ -762,9 +1238,9 @@ namespace Kokkos {
#endif
}
template<class FunctorType, class ValueType, class Scheduler>
KOKKOS_INLINE_FUNCTION
void single(const Impl::ThreadSingleStruct<Impl::TaskExec< Kokkos::Cuda, Scheduler > >& single_struct, const FunctorType& lambda, ValueType& val) {
#ifdef __CUDA_ARCH__
if(threadIdx.x == 0 && threadIdx.y == 0) {
lambda(val);

View File

@ -56,9 +56,9 @@
#include <utility>
#include <Kokkos_Parallel.hpp>
#include <Cuda/Kokkos_Cuda_KernelLaunch.hpp>
#include <Cuda/Kokkos_Cuda_ReduceScan.hpp>
#include <Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp>
#include <Kokkos_Vectorization.hpp>
#if defined(KOKKOS_ENABLE_PROFILING)
@ -101,11 +101,13 @@ struct CudaJoinFunctor {
* total available shared memory must be partitioned among teams.
*/
class CudaTeamMember {
public:
typedef Kokkos::Cuda execution_space ;
typedef execution_space::scratch_memory_space scratch_memory_space ;
private:
mutable void * m_team_reduce ;
scratch_memory_space m_team_shared ;
int m_team_reduce_size ;
@ -221,12 +223,21 @@ public:
KOKKOS_INLINE_FUNCTION
typename std::enable_if< is_reducer< ReducerType >::value >::type
team_reduce( ReducerType const & reducer ) const noexcept
{
team_reduce(reducer,reducer.reference());
}
template< typename ReducerType >
KOKKOS_INLINE_FUNCTION
typename std::enable_if< is_reducer< ReducerType >::value >::type
team_reduce( ReducerType const & reducer, typename ReducerType::value_type& value ) const noexcept
{
#ifdef __CUDA_ARCH__
cuda_intra_block_reduction(reducer,value,blockDim.y);
#endif /* #ifdef __CUDA_ARCH__ */
}
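The new two-argument overload lets the caller keep its partial result in a register instead of round-tripping through reducer.reference(). A hedged usage sketch (my_partial is hypothetical):

double total = 0.0;
Kokkos::Sum<double> reducer(total);
double local = my_partial(member.team_rank());  // per-thread contribution
member.team_reduce(reducer, local);
// 'total' now holds the team-wide sum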
//--------------------------------------------------------------------------
/** \brief Intra-team exclusive prefix sum with team_rank() ordering
* with intra-team non-deterministic ordering accumulation.
@ -281,20 +292,28 @@ public:
template< typename ReducerType >
KOKKOS_INLINE_FUNCTION static
typename std::enable_if< is_reducer< ReducerType >::value >::type
vector_reduce( ReducerType const & reducer ) {
vector_reduce(reducer,reducer.reference());
}
template< typename ReducerType >
KOKKOS_INLINE_FUNCTION static
typename std::enable_if< is_reducer< ReducerType >::value >::type
vector_reduce( ReducerType const & reducer, typename ReducerType::value_type& value )
{
#ifdef __CUDA_ARCH__
if(blockDim.x == 1) return;
// Intra vector lane shuffle reduction:
typename ReducerType::value_type tmp ( value );
typename ReducerType::value_type tmp2 = tmp;
unsigned mask = blockDim.x==32?0xffffffff:((1<<blockDim.x)-1)<<((threadIdx.y%(32/blockDim.x))*blockDim.x);
for ( int i = blockDim.x ; ( i >>= 1 ) ; ) {
cuda_shfl_down( tmp2 , tmp , i , blockDim.x , mask );
if ( (int)threadIdx.x < i ) { reducer.join( tmp , tmp2 ); }
}
// Broadcast from root lane to all other lanes.
@ -302,7 +321,9 @@ public:
// because floating point summation is not associative
// and thus different threads could have different results.
cuda_shfl( tmp2 , tmp , 0 , blockDim.x , mask );
value = tmp2;
reducer.reference() = tmp2;
#endif
}
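The loop above is the classic shuffle-down tree reduction, finished with a broadcast from lane 0 so every lane ends up with the bit-identical result (important for non-associative floating-point sums). The raw-CUDA equivalent for a full warp and plain summation, as a sketch:

__device__ inline float warp_sum(float v) {
  for (int offset = 16; offset > 0; offset >>= 1)
    v += __shfl_down_sync(0xffffffffu, v, offset);  // tree reduction
  return __shfl_sync(0xffffffffu, v, 0);            // broadcast lane 0's value
}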
@ -543,19 +564,37 @@ struct TeamThreadRangeBoundariesStruct<iType,CudaTeamMember> {
const iType end;
KOKKOS_INLINE_FUNCTION
TeamThreadRangeBoundariesStruct (const CudaTeamMember& thread_, iType count)
: member(thread_)
, start( 0 )
, end( count ) {}
KOKKOS_INLINE_FUNCTION
TeamThreadRangeBoundariesStruct (const CudaTeamMember& thread_, iType begin_, iType end_)
: member(thread_)
, start( begin_ )
, end( end_ ) {}
};
template<typename iType>
struct TeamVectorRangeBoundariesStruct<iType,CudaTeamMember> {
typedef iType index_type;
const CudaTeamMember& member;
const iType start;
const iType end;
KOKKOS_INLINE_FUNCTION
TeamVectorRangeBoundariesStruct (const CudaTeamMember& thread_, const iType& count)
: member(thread_)
, start( 0 )
, end( count ) {}
KOKKOS_INLINE_FUNCTION
TeamVectorRangeBoundariesStruct (const CudaTeamMember& thread_, const iType& begin_, const iType& end_)
: member(thread_)
, start( begin_ )
, end( end_ ) {}
};
template<typename iType>
struct ThreadVectorRangeBoundariesStruct<iType,CudaTeamMember> {
@ -564,19 +603,19 @@ struct ThreadVectorRangeBoundariesStruct<iType,CudaTeamMember> {
const index_type end;
KOKKOS_INLINE_FUNCTION
ThreadVectorRangeBoundariesStruct (const CudaTeamMember, index_type count)
: start( static_cast<index_type>(0) ), end( count ) {}
KOKKOS_INLINE_FUNCTION
ThreadVectorRangeBoundariesStruct (index_type count)
: start( static_cast<index_type>(0) ), end( count ) {}
KOKKOS_INLINE_FUNCTION
ThreadVectorRangeBoundariesStruct (const CudaTeamMember, index_type arg_begin, index_type arg_end)
: start( arg_begin ), end( arg_end ) {}
KOKKOS_INLINE_FUNCTION
ThreadVectorRangeBoundariesStruct (index_type arg_begin, index_type arg_end)
: start( arg_begin ), end( arg_end ) {}
};
@ -585,7 +624,7 @@ struct ThreadVectorRangeBoundariesStruct<iType,CudaTeamMember> {
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct< iType, Impl::CudaTeamMember >
TeamThreadRange( const Impl::CudaTeamMember & thread, iType count ) {
return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::CudaTeamMember >( thread, count );
}
@ -593,22 +632,38 @@ template< typename iType1, typename iType2 >
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct< typename std::common_type< iType1, iType2 >::type,
Impl::CudaTeamMember >
TeamThreadRange( const Impl::CudaTeamMember & thread, iType1 begin, iType2 end ) {
typedef typename std::common_type< iType1, iType2 >::type iType;
return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::CudaTeamMember >( thread, iType(begin), iType(end) );
}
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::TeamVectorRangeBoundariesStruct< iType, Impl::CudaTeamMember >
TeamVectorRange( const Impl::CudaTeamMember & thread, const iType & count ) {
return Impl::TeamVectorRangeBoundariesStruct< iType, Impl::CudaTeamMember >( thread, count );
}
template< typename iType1, typename iType2 >
KOKKOS_INLINE_FUNCTION
Impl::TeamVectorRangeBoundariesStruct< typename std::common_type< iType1, iType2 >::type,
Impl::CudaTeamMember >
TeamVectorRange( const Impl::CudaTeamMember & thread, const iType1 & begin, const iType2 & end ) {
typedef typename std::common_type< iType1, iType2 >::type iType;
return Impl::TeamVectorRangeBoundariesStruct< iType, Impl::CudaTeamMember >( thread, iType(begin), iType(end) );
}
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >
ThreadVectorRange(const Impl::CudaTeamMember& thread, iType count) {
return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >(thread,count);
}
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >
ThreadVectorRange(const Impl::CudaTeamMember& thread, iType arg_begin, iType arg_end) {
return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >(thread,arg_begin,arg_end);
}
@ -667,16 +722,16 @@ parallel_reduce
)
{
#ifdef __CUDA_ARCH__
typename ReducerType::value_type value;
reducer.init( value );
for( iType i = loop_boundaries.start + threadIdx.y
; i < loop_boundaries.end
; i += blockDim.y ) {
closure(i,value);
}
loop_boundaries.member.team_reduce( reducer, value );
#endif
}
@ -701,19 +756,88 @@ parallel_reduce
)
{
#ifdef __CUDA_ARCH__
ValueType val;
Kokkos::Sum<ValueType> reducer(val);
reducer.init( reducer.reference() );
for( iType i = loop_boundaries.start + threadIdx.y
; i < loop_boundaries.end
; i += blockDim.y ) {
closure(i,val);
}
loop_boundaries.member.team_reduce( reducer , val);
result = reducer.reference();
#endif
}
template<typename iType, class Closure >
KOKKOS_INLINE_FUNCTION
void parallel_for
( const Impl::TeamVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember>&
loop_boundaries
, const Closure & closure
)
{
#ifdef __CUDA_ARCH__
for( iType i = loop_boundaries.start + threadIdx.y * blockDim.x + threadIdx.x
; i < loop_boundaries.end
; i += blockDim.y*blockDim.x )
closure(i);
#endif
}
template< typename iType, class Closure, class ReducerType >
KOKKOS_INLINE_FUNCTION
typename std::enable_if< Kokkos::is_reducer< ReducerType >::value >::type
parallel_reduce
( const Impl::TeamVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember> &
loop_boundaries
, const Closure & closure
, const ReducerType & reducer
)
{
#ifdef __CUDA_ARCH__
typename ReducerType::value_type value;
reducer.init( value );
for( iType i = loop_boundaries.start + threadIdx.y * blockDim.x + threadIdx.x
; i < loop_boundaries.end
; i += blockDim.y * blockDim.x ) {
closure(i,value);
}
loop_boundaries.member.vector_reduce( reducer, value );
loop_boundaries.member.team_reduce( reducer, value );
#endif
}
template< typename iType, class Closure, typename ValueType >
KOKKOS_INLINE_FUNCTION
typename std::enable_if< ! Kokkos::is_reducer< ValueType >::value >::type
parallel_reduce
( const Impl::TeamVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember> &
loop_boundaries
, const Closure & closure
, ValueType & result
)
{
#ifdef __CUDA_ARCH__
ValueType val;
Kokkos::Sum<ValueType> reducer(val);
reducer.init( reducer.reference() );
for( iType i = loop_boundaries.start + threadIdx.y * blockDim.x + threadIdx.x
; i < loop_boundaries.end
; i += blockDim.y * blockDim.x ) {
closure(i,val);
}
loop_boundaries.member.vector_reduce( reducer );
loop_boundaries.member.team_reduce( reducer );
result = reducer.reference();
#endif
}
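TeamVectorRange flattens team and vector parallelism over a single index range, which is why the reductions above chain vector_reduce and then team_reduce. A hedged usage sketch (member, n, and v are placeholders):

double sum = 0.0;
Kokkos::parallel_reduce(Kokkos::TeamVectorRange(member, n),
                        [&](int i, double& lsum) { lsum += v(i); },
                        sum);  // all threads of the team see the final value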

View File

@ -241,7 +241,7 @@ class ViewDataHandle< Traits ,
sizeof(typename Traits::const_value_type) == 16 )
&&
// Random access trait
( Traits::memory_traits::is_random_access != 0 )
)>::type >
{
public:

View File

@ -102,9 +102,8 @@ public:
const dim3 grid( Kokkos::Impl::cuda_internal_multiprocessor_count() , 1 , 1 );
const dim3 block( 1 , Kokkos::Impl::CudaTraits::WarpSize , warps_per_block );
const int shared = 0 ;
Kokkos::Impl::CudaParallelLaunch<Self>(*this, grid, block, shared, Cuda().impl_internal_space_instance() , false );
}
inline

View File

@ -0,0 +1,152 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Macros.hpp>
#ifdef KOKKOS_ENABLE_HPX
#include <Kokkos_HPX.hpp>
#include <hpx/util/yield_while.hpp>
namespace Kokkos {
namespace Experimental {
bool HPX::m_hpx_initialized = false;
Kokkos::Impl::thread_buffer HPX::m_buffer;
#if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH)
hpx::future<void> HPX::m_future = hpx::make_ready_future<void>();
#endif
int HPX::concurrency() {
hpx::runtime *rt = hpx::get_runtime_ptr();
if (rt == nullptr) {
return hpx::threads::hardware_concurrency();
} else {
if (hpx::threads::get_self_ptr() == nullptr) {
return hpx::resource::get_thread_pool(0).get_os_thread_count();
} else {
return hpx::this_thread::get_pool()->get_os_thread_count();
}
}
}
void HPX::impl_initialize(int thread_count) {
hpx::runtime *rt = hpx::get_runtime_ptr();
if (rt == nullptr) {
std::vector<std::string> config = {
"hpx.os_threads=" + std::to_string(thread_count),
#ifdef KOKKOS_DEBUG
"--hpx:attach-debugger=exception",
#endif
};
int argc_hpx = 1;
char name[] = "kokkos_hpx";
char *argv_hpx[] = {name, nullptr};
hpx::start(nullptr, argc_hpx, argv_hpx, config);
// NOTE: Wait for runtime to start. hpx::start returns as soon as
// possible, meaning some operations are not allowed immediately
// after hpx::start. Notably, hpx::stop needs state_running. This
// needs to be fixed in HPX itself.
// Get runtime pointer again after it has been started.
rt = hpx::get_runtime_ptr();
hpx::util::yield_while(
[rt]() { return rt->get_state() < hpx::state_running; });
m_hpx_initialized = true;
}
}
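In normal use these entry points are reached through Kokkos' public initialization path, but the lifecycle they implement can be sketched directly (hedged; calling the impl_ functions by hand is not the supported interface):

Kokkos::Experimental::HPX::impl_initialize(4);  // start HPX with 4 OS threads
// ... dispatch Kokkos work on the HPX execution space ...
Kokkos::Experimental::HPX::impl_finalize();     // stops HPX only if Kokkos started it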
void HPX::impl_initialize() {
hpx::runtime *rt = hpx::get_runtime_ptr();
if (rt == nullptr) {
std::vector<std::string> config = {
#ifdef KOKKOS_DEBUG
"--hpx:attach-debugger=exception",
#endif
};
int argc_hpx = 1;
char name[] = "kokkos_hpx";
char *argv_hpx[] = {name, nullptr};
hpx::start(nullptr, argc_hpx, argv_hpx, config);
// NOTE: Wait for runtime to start. hpx::start returns as soon as
// possible, meaning some operations are not allowed immediately
// after hpx::start. Notably, hpx::stop needs state_running. This
// needs to be fixed in HPX itself.
// Get runtime pointer again after it has been started.
rt = hpx::get_runtime_ptr();
hpx::util::yield_while(
[rt]() { return rt->get_state() < hpx::state_running; });
m_hpx_initialized = true;
}
}
bool HPX::impl_is_initialized() noexcept {
hpx::runtime *rt = hpx::get_runtime_ptr();
return rt != nullptr;
}
void HPX::impl_finalize() {
if (m_hpx_initialized) {
hpx::runtime *rt = hpx::get_runtime_ptr();
if (rt != nullptr) {
hpx::apply([]() { hpx::finalize(); });
hpx::stop();
} else {
Kokkos::abort("Kokkos::Experimental::HPX::impl_finalize: Kokkos started "
"HPX but something else already stopped HPX\n");
}
}
}
} // namespace Experimental
} // namespace Kokkos
#else
void KOKKOS_CORE_SRC_IMPL_HPX_PREVENT_LINK_ERROR() {}
#endif //#ifdef KOKKOS_ENABLE_HPX

View File

@ -41,38 +41,25 @@
//@HEADER
*/
#include <Kokkos_Macros.hpp>
#if defined(KOKKOS_ENABLE_HPX) && defined(KOKKOS_ENABLE_TASKDAG)
#include <Kokkos_Core.hpp>
#include <impl/Kokkos_TaskQueue_impl.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template class TaskQueue<Kokkos::Experimental::HPX,
Kokkos::Experimental::HPX::memory_space>;
} // namespace Impl
} // namespace Kokkos
#else
void KOKKOS_CORE_SRC_IMPL_HPX_TASK_PREVENT_LINK_ERROR() {}
#endif // #if defined( KOKKOS_ENABLE_HPX ) && defined( KOKKOS_ENABLE_TASKDAG )

View File

@ -0,0 +1,298 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_HPX_TASK_HPP
#define KOKKOS_HPX_TASK_HPP
#include <Kokkos_Macros.hpp>
#if defined(KOKKOS_ENABLE_HPX) && defined(KOKKOS_ENABLE_TASKDAG)
#include <Kokkos_TaskScheduler_fwd.hpp>
#include <Kokkos_HPX.hpp>
#include <hpx/apply.hpp>
#include <hpx/lcos/local/counting_semaphore.hpp>
#include <type_traits>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template <class QueueType>
class TaskQueueSpecialization<
SimpleTaskScheduler<Kokkos::Experimental::HPX, QueueType>> {
public:
using execution_space = Kokkos::Experimental::HPX;
using scheduler_type =
SimpleTaskScheduler<Kokkos::Experimental::HPX, QueueType>;
using member_type =
TaskTeamMemberAdapter<Kokkos::Impl::HPXTeamMember, scheduler_type>;
using memory_space = Kokkos::HostSpace;
static void execute(scheduler_type const &scheduler) {
// NOTE: We create an instance so that we can use dispatch_execute_task.
// This is not necessarily the most efficient, but can be improved later.
TaskQueueSpecialization<scheduler_type> task_queue;
task_queue.scheduler = &scheduler;
Kokkos::Impl::dispatch_execute_task(&task_queue);
Kokkos::Experimental::HPX().fence();
}
// Must provide task queue execution function
void execute_task() const {
using hpx::apply;
using hpx::lcos::local::counting_semaphore;
using task_base_type = typename scheduler_type::task_base_type;
const int num_worker_threads = Kokkos::Experimental::HPX::concurrency();
thread_buffer &buffer = Kokkos::Experimental::HPX::impl_get_buffer();
buffer.resize(num_worker_threads, 512);
auto &queue = scheduler->queue();
counting_semaphore sem(0);
for (int thread = 0; thread < num_worker_threads; ++thread) {
apply([this, &sem, &queue, &buffer, num_worker_threads, thread]() {
// NOTE: This implementation has been simplified based on the
// assumption that team_size = 1. The HPX backend currently only
// supports a team size of 1.
std::size_t t = Kokkos::Experimental::HPX::impl_hardware_thread_id();
buffer.get(Kokkos::Experimental::HPX::impl_hardware_thread_id());
HPXTeamMember member(TeamPolicyInternal<Kokkos::Experimental::HPX>(
Kokkos::Experimental::HPX(), num_worker_threads, 1),
0, t, buffer.get(t), 512);
member_type single_exec(*scheduler, member);
member_type &team_exec = single_exec;
auto &team_scheduler = team_exec.scheduler();
auto current_task = OptionalRef<task_base_type>(nullptr);
while (!queue.is_done()) {
current_task =
queue.pop_ready_task(team_scheduler.team_scheduler_info());
if (current_task) {
KOKKOS_ASSERT(current_task->is_single_runnable() ||
current_task->is_team_runnable());
current_task->as_runnable_task().run(single_exec);
queue.complete((*std::move(current_task)).as_runnable_task(),
team_scheduler.team_scheduler_info());
}
}
sem.signal(1);
});
}
sem.wait(num_worker_threads);
}
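The fan-out/join skeleton used here, spawn one lightweight HPX task per worker, have each signal a counting semaphore, then wait for all signals, reduces to:

hpx::lcos::local::counting_semaphore sem(0);
for (int t = 0; t < num_worker_threads; ++t) {
  hpx::apply([&sem]() {
    // ... drain the task queue on this worker ...
    sem.signal(1);
  });
}
sem.wait(num_worker_threads);  // block until every worker has signalled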
static uint32_t get_max_team_count(execution_space const &espace) {
return static_cast<uint32_t>(espace.concurrency());
}
template <typename TaskType>
static void get_function_pointer(typename TaskType::function_type &ptr,
typename TaskType::destroy_type &dtor) {
ptr = TaskType::apply;
dtor = TaskType::destroy;
}
private:
const scheduler_type *scheduler;
};
template <class Scheduler>
class TaskQueueSpecializationConstrained<
Scheduler, typename std::enable_if<
std::is_same<typename Scheduler::execution_space,
Kokkos::Experimental::HPX>::value>::type> {
public:
using execution_space = Kokkos::Experimental::HPX;
using scheduler_type = Scheduler;
using member_type =
TaskTeamMemberAdapter<Kokkos::Impl::HPXTeamMember, scheduler_type>;
using memory_space = Kokkos::HostSpace;
static void
iff_single_thread_recursive_execute(scheduler_type const &scheduler) {
using task_base_type = typename scheduler_type::task_base;
using queue_type = typename scheduler_type::queue_type;
if (1 == Kokkos::Experimental::HPX::concurrency()) {
task_base_type *const end = (task_base_type *)task_base_type::EndTag;
task_base_type *task = end;
HPXTeamMember member(TeamPolicyInternal<Kokkos::Experimental::HPX>(
Kokkos::Experimental::HPX(), 1, 1),
0, 0, nullptr, 0);
member_type single_exec(scheduler, member);
do {
task = end;
// Loop by priority and then type
for (int i = 0; i < queue_type::NumQueue && end == task; ++i) {
for (int j = 0; j < 2 && end == task; ++j) {
task =
queue_type::pop_ready_task(&scheduler.m_queue->m_ready[i][j]);
}
}
if (end == task)
break;
(*task->m_apply)(task, &single_exec);
scheduler.m_queue->complete(task);
} while (true);
}
}
static void execute(scheduler_type const &scheduler) {
// NOTE: We create an instance so that we can use dispatch_execute_task.
// This is not necessarily the most efficient, but can be improved later.
TaskQueueSpecializationConstrained<scheduler_type> task_queue;
task_queue.scheduler = &scheduler;
Kokkos::Impl::dispatch_execute_task(&task_queue);
Kokkos::Experimental::HPX().fence();
}
// Must provide task queue execution function
void execute_task() const {
using hpx::apply;
using hpx::lcos::local::counting_semaphore;
using task_base_type = typename scheduler_type::task_base;
using queue_type = typename scheduler_type::queue_type;
const int num_worker_threads = Kokkos::Experimental::HPX::concurrency();
static task_base_type *const end = (task_base_type *)task_base_type::EndTag;
constexpr task_base_type *no_more_tasks_sentinel = nullptr;
thread_buffer &buffer = Kokkos::Experimental::HPX::impl_get_buffer();
buffer.resize(num_worker_threads, 512);
auto &queue = scheduler->queue();
queue.initialize_team_queues(num_worker_threads);
counting_semaphore sem(0);
for (int thread = 0; thread < num_worker_threads; ++thread) {
apply([this, &sem, &buffer, num_worker_threads, thread]() {
// NOTE: This implementation has been simplified based on the assumption
// that team_size = 1. The HPX backend currently only supports a team
// size of 1.
std::size_t t = Kokkos::Experimental::HPX::impl_hardware_thread_id();
buffer.get(Kokkos::Experimental::HPX::impl_hardware_thread_id());
HPXTeamMember member(
TeamPolicyInternal<Kokkos::Experimental::HPX>(
Kokkos::Experimental::HPX(), num_worker_threads, 1),
0, t, buffer.get(t), 512);
member_type single_exec(*scheduler, member);
member_type &team_exec = single_exec;
auto &team_queue = team_exec.scheduler().queue();
task_base_type *task = no_more_tasks_sentinel;
do {
if (task != no_more_tasks_sentinel && task != end) {
team_queue.complete(task);
}
if (*((volatile int *)&team_queue.m_ready_count) > 0) {
task = end;
for (int i = 0; i < queue_type::NumQueue && end == task; ++i) {
for (int j = 0; j < 2 && end == task; ++j) {
task = queue_type::pop_ready_task(&team_queue.m_ready[i][j]);
}
}
} else {
task = team_queue.attempt_to_steal_task();
}
if (task != no_more_tasks_sentinel && task != end) {
(*task->m_apply)(task, &single_exec);
}
} while (task != no_more_tasks_sentinel);
sem.signal(1);
});
}
sem.wait(num_worker_threads);
}
template <typename TaskType>
static void get_function_pointer(typename TaskType::function_type &ptr,
typename TaskType::destroy_type &dtor) {
ptr = TaskType::apply;
dtor = TaskType::destroy;
}
private:
const scheduler_type *scheduler;
};
extern template class TaskQueue<
Kokkos::Experimental::HPX,
typename Kokkos::Experimental::HPX::memory_space>;
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
#endif /* #ifndef KOKKOS_HPX_TASK_HPP */

View File

@ -0,0 +1,57 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_HPX_VIEWETIAVAIL_HPP
#define KOKKOS_HPX_VIEWETIAVAIL_HPP
namespace Kokkos {
namespace Impl {
#define KOKKOS_IMPL_VIEWCOPY_ETI_AVAIL_EXECSPACE Kokkos::Experimental::HPX
#include<eti/common/Kokkos_ViewFillCopyETIAvail_Macros.hpp>
#undef KOKKOS_IMPL_VIEWCOPY_ETI_AVAIL_EXECSPACE
}
}
#endif

View File

@ -0,0 +1,57 @@
/* ...standard Kokkos BSD 3-clause license header, identical to the one in the first file above... */
#ifndef KOKKOS_HPX_VIEWETIDECL_HPP
#define KOKKOS_HPX_VIEWETIDECL_HPP
namespace Kokkos {
namespace Impl {
#define KOKKOS_IMPL_VIEWCOPY_ETI_AVAIL_EXECSPACE Kokkos::Experimental::HPX
#include <eti/common/Kokkos_ViewFillCopyETIDecl_Macros.hpp>
#undef KOKKOS_IMPL_VIEWCOPY_ETI_AVAIL_EXECSPACE
} // namespace Impl
} // namespace Kokkos
#endif // KOKKOS_HPX_VIEWETIDECL_HPP

View File

@ -0,0 +1,116 @@
/* ...standard Kokkos BSD 3-clause license header, identical to the one in the first file above... */
#ifndef KOKKOS_HPX_WORKGRAPHPOLICY_HPP
#define KOKKOS_HPX_WORKGRAPHPOLICY_HPP
#include <hpx/apply.hpp>
#include <hpx/lcos/local/counting_semaphore.hpp>
namespace Kokkos {
namespace Impl {
template <class FunctorType, class... Traits>
class ParallelFor<FunctorType, Kokkos::WorkGraphPolicy<Traits...>,
Kokkos::Experimental::HPX> {
private:
using Policy = Kokkos::WorkGraphPolicy<Traits...>;
using WorkTag = typename Policy::work_tag;
Policy m_policy;
FunctorType m_functor;
template <class TagType>
typename std::enable_if<std::is_same<TagType, void>::value>::type
execute_functor(const std::int32_t w) const noexcept {
m_functor(w);
}
template <class TagType>
typename std::enable_if<!std::is_same<TagType, void>::value>::type
execute_functor(const std::int32_t w) const noexcept {
const TagType t{};
m_functor(t, w);
}
public:
void execute() const {
dispatch_execute_task(this);
Kokkos::Experimental::HPX().fence();
}
void execute_task() const {
const int num_worker_threads = Kokkos::Experimental::HPX::concurrency();
using hpx::apply;
using hpx::lcos::local::counting_semaphore;
counting_semaphore sem(0);
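// One HPX task per worker thread; each drains the shared work graph
// and signals the semaphore once it sees the COMPLETED_TOKEN.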
for (int thread = 0; thread < num_worker_threads; ++thread) {
apply([this, &sem]() {
std::int32_t w = m_policy.pop_work();
while (w != Policy::COMPLETED_TOKEN) {
if (w != Policy::END_TOKEN) {
execute_functor<WorkTag>(w);
m_policy.completed_work(w);
}
w = m_policy.pop_work();
}
sem.signal(1);
});
}
sem.wait(num_worker_threads);
}
inline ParallelFor(const FunctorType &arg_functor, const Policy &arg_policy)
: m_policy(arg_policy), m_functor(arg_functor) {}
};
} // namespace Impl
} // namespace Kokkos
#endif /* #ifndef KOKKOS_HPX_WORKGRAPHPOLICY_HPP */
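The execute_task() above follows a simple pool-drain pattern: spawn one task per worker, let each pop work until the policy reports completion, then rendezvous. The same shape can be sketched in portable C++ with std::thread standing in for hpx::apply and a join standing in for the counting semaphore; the WorkPool below is a bare atomic counter, not the real WorkGraphPolicy (which additionally tracks inter-item dependencies):

#include <algorithm>
#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

// Toy work pool: items 0..n-1 with no dependencies. pop_work() hands out
// an index, or -1 (our stand-in for the COMPLETED_TOKEN) when drained.
struct WorkPool {
  std::atomic<int> next{0};
  int n;
  explicit WorkPool(int n_) : n(n_) {}
  int pop_work() {
    int w = next.fetch_add(1, std::memory_order_relaxed);
    return w < n ? w : -1;
  }
};

int main() {
  WorkPool pool(100);
  std::atomic<long> sum{0};
  const unsigned num_workers =
      std::max(1u, std::thread::hardware_concurrency());

  std::vector<std::thread> workers;
  for (unsigned t = 0; t < num_workers; ++t) {
    workers.emplace_back([&] {
      // Same drain loop as execute_task: pop until the pool reports done.
      for (int w = pool.pop_work(); w != -1; w = pool.pop_work())
        sum.fetch_add(w, std::memory_order_relaxed);
    });
  }
  for (auto& th : workers) th.join();  // join() plays the semaphore's role

  std::printf("sum = %ld (expected %d)\n", sum.load(), 99 * 100 / 2);
  return 0;
}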

View File

@ -125,6 +125,8 @@ struct MDRangePolicy
using traits = Kokkos::Impl::PolicyTraits<Properties ...>;
using range_policy = RangePolicy<Properties...>;
typename traits::execution_space m_space;
using impl_range_policy = RangePolicy< typename traits::execution_space
, typename traits::schedule_type
, typename traits::index_type
@ -132,6 +134,9 @@ struct MDRangePolicy
typedef MDRangePolicy execution_policy; // needed for is_execution_space interrogation
template<class ... OtherProperties>
friend struct MDRangePolicy;
static_assert( !std::is_same<typename traits::iteration_pattern,void>::value
, "Kokkos Error: MD iteration pattern not defined" );
@ -192,13 +197,54 @@ struct MDRangePolicy
static constexpr int Right = static_cast<int>( Iterate::Right );
static constexpr int Left = static_cast<int>( Iterate::Left );
KOKKOS_INLINE_FUNCTION const typename traits::execution_space & space() const { return m_space ; }
template < typename LT , typename UT , typename TT = array_index_type >
MDRangePolicy(std::initializer_list<LT> const& lower, std::initializer_list<UT> const& upper, std::initializer_list<TT> const& tile = {} )
: m_space() {
init(lower, upper, tile);
}
template < typename LT , typename UT , typename TT = array_index_type >
MDRangePolicy(const typename traits::execution_space & work_space,
std::initializer_list<LT> const& lower, std::initializer_list<UT> const& upper, std::initializer_list<TT> const& tile = {} )
: m_space( work_space ) {
init(lower, upper, tile);
}
MDRangePolicy( point_type const& lower, point_type const& upper, tile_type const& tile = tile_type{} )
: m_lower(lower)
: m_space()
, m_lower(lower)
, m_upper(upper)
, m_tile(tile)
, m_num_tiles(1)
, m_prod_tile_dims(1)
{
, m_prod_tile_dims(1) {
init();
}
MDRangePolicy( const typename traits::execution_space & work_space,
point_type const& lower, point_type const& upper, tile_type const& tile = tile_type{} )
: m_space( work_space )
, m_lower(lower)
, m_upper(upper)
, m_tile(tile)
, m_num_tiles(1)
, m_prod_tile_dims(1) {
init();
}
template<class ... OtherProperties>
MDRangePolicy( const MDRangePolicy<OtherProperties...> p ):
m_space(p.m_space),
m_lower(p.m_lower),
m_upper(p.m_upper),
m_tile(p.m_tile),
m_tile_end(p.m_tile_end),
m_num_tiles(p.m_num_tiles),
m_prod_tile_dims(p.m_prod_tile_dims) {}
private:
void init() {
// Host
if ( true
#if defined(KOKKOS_ENABLE_CUDA)
@ -211,7 +257,7 @@ struct MDRangePolicy
{
index_type span;
for (int i=0; i<rank; ++i) {
span = upper[i] - lower[i];
span = m_upper[i] - m_lower[i];
if ( m_tile[i] <= 0 ) {
if ( ((int)inner_direction == (int)Right && (i < rank-1))
|| ((int)inner_direction == (int)Left && (i > 0)) )
@ -311,11 +357,9 @@ struct MDRangePolicy
#endif
}
template < typename LT , typename UT , typename TT = array_index_type >
MDRangePolicy( std::initializer_list<LT> const& lower, std::initializer_list<UT> const& upper, std::initializer_list<TT> const& tile = {} )
void init( std::initializer_list<LT> const& lower, std::initializer_list<UT> const& upper, std::initializer_list<TT> const& tile = {} )
{
if(static_cast<int>(m_lower.size()) != rank || static_cast<int>(m_upper.size()) != rank)
Kokkos::abort("MDRangePolicy: Constructor initializer lists have wrong size");
@ -589,5 +633,26 @@ void md_parallel_reduce( const std::string& str
} } // namespace Kokkos::Experimental
#endif
namespace Kokkos {
namespace Experimental {
namespace Impl {
template<unsigned long P, class ... Properties>
struct PolicyPropertyAdaptor<WorkItemProperty::ImplWorkItemProperty<P>,MDRangePolicy<Properties...>> {
typedef MDRangePolicy<Properties...> policy_in_t;
typedef MDRangePolicy<typename policy_in_t::traits::execution_space,
typename policy_in_t::traits::schedule_type,
typename policy_in_t::traits::work_tag,
typename policy_in_t::traits::index_type,
typename policy_in_t::traits::iteration_pattern,
typename policy_in_t::traits::launch_bounds,
WorkItemProperty::ImplWorkItemProperty<P>> policy_out_t;
};
} // namespace Impl
} // namespace Experimental
} // namespace Kokkos
#endif // KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP
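For reference, the execution-space-aware constructors added above let a policy be bound to a specific execution space instance instead of a default-constructed one. A minimal usage sketch, assuming an initialized Kokkos runtime (the view name, extents, and kernel body are arbitrary):

#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    Kokkos::DefaultExecutionSpace space;  // a particular instance

    // New overload from this change: the execution space instance is
    // passed first, ahead of the lower/upper bounds (and optional tile).
    Kokkos::MDRangePolicy<Kokkos::Rank<2>> policy(space, {0, 0}, {128, 128});

    Kokkos::View<double**> a("a", 128, 128);
    Kokkos::parallel_for("fill2d", policy,
                         KOKKOS_LAMBDA(const int i, const int j) {
                           a(i, j) = static_cast<double>(i * j);
                         });
    Kokkos::fence();
  }
  Kokkos::finalize();
  return 0;
}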

View File

@ -90,6 +90,7 @@
#if ! defined( KOKKOS_ENABLE_GNU_ATOMICS ) && \
! defined( KOKKOS_ENABLE_INTEL_ATOMICS ) && \
! defined( KOKKOS_ENABLE_OPENMP_ATOMICS ) && \
! defined( KOKKOS_ENABLE_STD_ATOMICS ) && \
! defined( KOKKOS_ENABLE_SERIAL_ATOMICS )
// No atomic implementation for non-CUDA compilation has been pre-selected.
@ -168,6 +169,12 @@ const char * atomic_query_version()
} // namespace Kokkos
//----------------------------------------------------------------------------
// Atomic Memory Orders
//
// Implements Strongly-typed analogs of C++ standard memory orders
#include "impl/Kokkos_Atomic_Memory_Order.hpp"
#if defined( KOKKOS_ENABLE_ROCM )
namespace Kokkos {
namespace Impl {
@ -287,6 +294,14 @@ void unlock_address_rocm_space(void* ptr);
#ifndef _WIN32
#include "impl/Kokkos_Atomic_Generic.hpp"
#endif
//----------------------------------------------------------------------------
// Provide atomic loads and stores with memory order semantics
#include "impl/Kokkos_Atomic_Load.hpp"
#include "impl/Kokkos_Atomic_Store.hpp"
//----------------------------------------------------------------------------
// This atomic-style macro should be an inlined function, not a macro
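The strongly-typed memory orders referenced above map each std::memory_order enumerator onto its own tag type, so atomic load/store overloads can dispatch on the order at compile time rather than on a runtime enum. A minimal sketch of the idea in standard C++ (tag and function names are illustrative, not the actual contents of Kokkos_Atomic_Memory_Order.hpp):

#include <atomic>

// One empty tag type per memory order; the enumerator rides along as a
// constexpr member so generic code can still reach the std value.
struct memory_order_relaxed_t {
  static constexpr std::memory_order value = std::memory_order_relaxed;
};
struct memory_order_seq_cst_t {
  static constexpr std::memory_order value = std::memory_order_seq_cst;
};

// Dispatch on the tag: the order is part of the *type*, so passing a
// runtime-variable or invalid order is a compile error, not a silent bug.
template <class T, class MemoryOrder>
T typed_atomic_load(const std::atomic<T>* ptr, MemoryOrder) {
  return ptr->load(MemoryOrder::value);
}

template <class T, class MemoryOrder>
void typed_atomic_store(std::atomic<T>* ptr, T val, MemoryOrder) {
  ptr->store(val, MemoryOrder::value);
}

int main() {
  std::atomic<int> flag{0};
  typed_atomic_store(&flag, 1, memory_order_relaxed_t{});
  return typed_atomic_load(&flag, memory_order_seq_cst_t{}) - 1;  // 0
}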

View File

@ -631,8 +631,10 @@ RealType real (const complex<RealType>& x) {
template<class RealType>
KOKKOS_INLINE_FUNCTION
RealType abs (const complex<RealType>& x) {
// FIXME (mfh 31 Oct 2014) Scale to avoid unwarranted overflow.
return std::sqrt (real (x) * real (x) + imag (x) * imag (x));
#ifndef __CUDA_ARCH__
using std::hypot;
#endif
return hypot(x.real(),x.imag());
}
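This change resolves the old FIXME: squaring the components overflows to infinity once they exceed roughly 1e154, even though the modulus itself is representable, whereas hypot rescales its arguments internally. A standard-C++ demonstration of the difference:

#include <cmath>
#include <cstdio>

int main() {
  const double big = 1e200;
  // Naive formula: big*big overflows to inf, so the result is inf.
  const double naive = std::sqrt(big * big + big * big);
  // hypot scales first and returns the correct ~1.4142e200.
  const double safe = std::hypot(big, big);
  std::printf("naive = %g\nhypot = %g\n", naive, safe);
  return 0;
}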
//! Power of a complex number

Some files were not shown because too many files have changed in this diff.