forked from lijiext/lammps

commit 06dcc9e283

Merge branch 'master' into improve-include-consistency

# Conflicts:
#	src/USER-MESO/atom_vec_tdpd.cpp

@@ -57,8 +57,7 @@ Boolean expression is FALSE, then no commands are executed.
 The syntax for Boolean expressions is described below.
 
 Each command (t1, f1, e1, etc) can be any valid LAMMPS input script
-command, except an "include"_include.html command, which is not
-allowed. If the command is more than one word, it must be enclosed in
+command. If the command is more than one word, it must be enclosed in
 quotes, so it will be treated as a single argument, as in the examples
 above.
 

@@ -147,7 +147,8 @@ asub = "A" parameter for MEAM (see e.g. "(Baskes)"_#Baskes) :pre
 
 The alpha, b0, b1, b2, b3, t0, t1, t2, t3 parameters correspond to the
 standard MEAM parameters in the literature "(Baskes)"_#Baskes (the b
-parameters are the standard beta parameters). The rozero parameter is
+parameters are the standard beta parameters). Note that only parameters
+normalized to t0 = 1.0 are supported. The rozero parameter is
 an element-dependent density scaling that weights the reference
 background density (see e.g. equation 4.5 in "(Gullet)"_#Gullet) and
 is typically 1.0 for single-element systems. The ibar parameter

@@ -5092,4 +5092,17 @@ span[id*='MathJax-Span'] {
 src: local("Roboto Slab Bold"), local("RobotoSlab-Bold"), url(../fonts/RobotoSlab-Bold.ttf) format("truetype");
 }
+
+.codeblock, pre.literal-block, .rst-content .literal-block, .rst-content pre.literal-block, div[class^='highlight'] {
+  font-size: 12px;
+  line-height: 1.5;
+  display: block;
+  overflow: auto;
+  color: #404040;
+  padding: 12px 12px;
+}
+
+.codeblock,div[class^='highlight'] {
+  padding: 0;
+}
 
 /*# sourceMappingURL=theme.css.map */

@@ -174,6 +174,7 @@ __kernel void k_lj96_fast(const __global numtyp4 *restrict x_,
 numtyp r6inv = r2inv*r2inv*r2inv;
+numtyp r3inv = ucl_sqrt(r6inv);
 numtyp force = r2inv*r6inv*(lj1[mtype].x*r3inv-lj1[mtype].y);
 force*=factor_lj;
 
 f.x+=delx*force;
 f.y+=dely*force;

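The line added above supplies r3inv = sqrt(r6inv) before the force expression consumes it. Read as scalar arithmetic, the update produces the r^-11 and r^-8 terms of a 9/6 Lennard-Jones force. A minimal C++ sketch of the same math, with the hypothetical names lj1_x and lj1_y standing in for the kernel's precomputed lj1[mtype].x and lj1[mtype].y (a sketch, not the GPU kernel):

#include <cmath>

// Scalar sketch of the 9/6 LJ force factor assembled in k_lj96_fast.
double lj96_force_factor(double rsq, double lj1_x, double lj1_y,
                         double factor_lj) {
  const double r2inv = 1.0 / rsq;
  const double r6inv = r2inv * r2inv * r2inv;  // r^-6
  const double r3inv = std::sqrt(r6inv);       // r^-3: the line the hunk adds
  // r^-2 * r^-6 * (a*r^-3 - b) yields the r^-11 and r^-8 force terms.
  return factor_lj * r2inv * r6inv * (lj1_x * r3inv - lj1_y);
}
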
@@ -308,8 +308,6 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_,
 delr1.z = jx.z-ix.z;
 numtyp rsq1 = delr1.x*delr1.x+delr1.y*delr1.y+delr1.z*delr1.z;
 
-// if (rsq1 > cutsq[ijparam]) continue;
-
 // compute zeta_ij
 z = (acctyp)0;
 
@@ -355,13 +353,9 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_,
 rsq1, rsq2, delr1, delr2);
 }
 
-//int jj = (nbor_j-offset_j-2*nbor_pitch)/n_stride;
-//int idx = jj*n_stride + i*t_per_atom + offset_j;
-//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
+// idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
 int idx = nbor_j;
 if (dev_packed==dev_nbor) idx -= n_stride;
-// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-//          i, nbor_j, offset_j, idx);
 acc_zeta(z, tid, t_per_atom, offset_k);
 
 numtyp4 ts1_ijparam = ts1[ijparam]; //fetch4(ts1_ijparam,ijparam,ts1_tex);
@@ -585,14 +579,9 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_,
 numtyp r1inv = ucl_rsqrt(rsq1);
 
 // look up for zeta_ij
-
-//int jj = (nbor_j-offset_j-2*nbor_pitch) / n_stride;
-//int idx = jj*n_stride + i*t_per_atom + offset_j;
-//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
+// idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
 int idx = nbor_j;
 if (dev_packed==dev_nbor) idx -= n_stride;
-// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-//          i, nbor_j, offset_j, idx);
 acctyp4 zeta_ij = zetaij[idx]; // fetch(zeta_ij,idx,zeta_tex);
 numtyp force = zeta_ij.x*tpainv;
 numtyp prefactor = zeta_ij.y;
@@ -823,13 +812,9 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
 offset_kf = red_acc[2*m+1];
 }
 
-//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
-//int idx = iix*n_stride + j*t_per_atom + offset_kf;
-//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
+// idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
 int idx = ijnum;
 if (dev_packed==dev_nbor) idx -= n_stride;
-// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-//          j, ijnum, offset_kf, idx);
 acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
 numtyp force = zeta_ji.x*tpainv;
 numtyp prefactor_ji = zeta_ji.y;
@@ -891,13 +876,10 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
 f.y += fi[1];
 f.z += fi[2];
 
-//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
-//int idx = kk*n_stride + j*t_per_atom + offset_k;
-//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
+// idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
 int idx = nbor_k;
 if (dev_packed==dev_nbor) idx -= n_stride;
-// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-//          j, nbor_k, offset_k, idx);
+
 acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
 numtyp prefactor_jk = zeta_jk.y;
 int jkiparam=elem2param[jtype*nelements*nelements+ktype*nelements+itype];
@@ -1068,13 +1050,9 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
 offset_kf = red_acc[2*m+1];
 }
 
-//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
-//int idx = iix*n_stride + j*t_per_atom + offset_kf;
-//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
+// idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
 int idx = ijnum;
 if (dev_packed==dev_nbor) idx -= n_stride;
-// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-//          j, ijnum, offset_kf, idx);
 acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
 numtyp force = zeta_ji.x*tpainv;
 numtyp prefactor_ji = zeta_ji.y;
@@ -1143,13 +1121,9 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
 virial[4] += TWOTHIRD*(mdelr1[0]*fj[2] + delr2[0]*fk[2]);
 virial[5] += TWOTHIRD*(mdelr1[1]*fj[2] + delr2[1]*fk[2]);
 
-//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
-//int idx = kk*n_stride + j*t_per_atom + offset_k;
-//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
+// idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
 int idx = nbor_k;
 if (dev_packed==dev_nbor) idx -= n_stride;
-// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-//          j, nbor_k, offset_k, idx);
 acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
 numtyp prefactor_jk = zeta_jk.y;
 

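The pattern repeated throughout the Tersoff hunks above, and in the tersoff_mod and tersoff_zbl hunks below, replaces an arithmetic reconstruction of the zeta slot (the commented-out jj/iix/kk lines) with direct reuse of the neighbor cursor: dev_short_nbor stores the zetaij entry at the neighbor's own index, shifted back by one stride whenever the packed neighbor buffer aliases the raw one. A hedged C++ sketch of that lookup convention (an illustrative helper, not kernel code; names follow the diff):

#include <cassert>

// Hypothetical helper: map a neighbor cursor onto the matching zetaij slot.
// Mirrors the kernels' "int idx = nbor_j; if (dev_packed==dev_nbor) idx -= n_stride;".
inline int zetaij_slot(int nbor_cursor, int n_stride, bool packed_is_nbor) {
  int idx = nbor_cursor;
  if (packed_is_nbor) idx -= n_stride;  // zetaij written one stride earlier
  assert(idx >= 0);
  return idx;
}

// Usage in the style of the kernels above:
//   acctyp4 zeta_ij = zetaij[zetaij_slot(nbor_j, n_stride, dev_packed==dev_nbor)];
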
@@ -356,13 +356,9 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_,
 ijkparam_c5, rsq1, rsq2, delr1, delr2);
 }
 
-//int jj = (nbor_j-offset_j-2*nbor_pitch)/n_stride;
-//int idx = jj*n_stride + i*t_per_atom + offset_j;
-//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
+// idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
 int idx = nbor_j;
 if (dev_packed==dev_nbor) idx -= n_stride;
-// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-//          i, nbor_j, offset_j, idx);
 acc_zeta(z, tid, t_per_atom, offset_k);
 
 numtyp4 ts1_ijparam = ts1[ijparam]; //fetch4(ts1_ijparam,ijparam,ts1_tex);
@@ -587,14 +583,9 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
 numtyp r1inv = ucl_rsqrt(rsq1);
 
 // look up for zeta_ij
-
-//int jj = (nbor_j-offset_j-2*nbor_pitch) / n_stride;
-//int idx = jj*n_stride + i*t_per_atom + offset_j;
-//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
+// idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
 int idx = nbor_j;
 if (dev_packed==dev_nbor) idx -= n_stride;
-// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-//          i, nbor_j, offset_j, idx);
 acctyp4 zeta_ij = zetaij[idx]; // fetch(zeta_ij,idx,zeta_tex);
 numtyp force = zeta_ij.x*tpainv;
 numtyp prefactor = zeta_ij.y;
@@ -831,13 +822,9 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
 offset_kf = red_acc[2*m+1];
 }
 
-//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
-//int idx = iix*n_stride + j*t_per_atom + offset_kf;
-//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
+// idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
 int idx = ijnum;
 if (dev_packed==dev_nbor) idx -= n_stride;
-// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-//          j, ijnum, offset_kf, idx);
 acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
 numtyp force = zeta_ji.x*tpainv;
 numtyp prefactor_ji = zeta_ji.y;
@@ -902,13 +889,9 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
 f.y += fi[1];
 f.z += fi[2];
 
-//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
-//int idx = kk*n_stride + j*t_per_atom + offset_k;
-//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
+// idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
 int idx = nbor_k;
 if (dev_packed==dev_nbor) idx -= n_stride;
-// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-//          j, nbor_k, offset_k, idx);
 acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
 numtyp prefactor_jk = zeta_jk.y;
 int jkiparam=elem2param[jtype*nelements*nelements+ktype*nelements+itype];
@@ -1085,13 +1068,9 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
 offset_kf = red_acc[2*m+1];
 }
 
-//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
-//int idx = iix*n_stride + j*t_per_atom + offset_kf;
-//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
+// idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
 int idx = ijnum;
 if (dev_packed==dev_nbor) idx -= n_stride;
-// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-//          j, ijnum, offset_kf, idx);
 acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
 numtyp force = zeta_ji.x*tpainv;
 numtyp prefactor_ji = zeta_ji.y;
@@ -1163,13 +1142,9 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
 virial[4] += TWOTHIRD*(mdelr1[0]*fj[2] + delr2[0]*fk[2]);
 virial[5] += TWOTHIRD*(mdelr1[1]*fj[2] + delr2[1]*fk[2]);
 
-//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
-//int idx = kk*n_stride + j*t_per_atom + offset_k;
-//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
+// idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
 int idx = nbor_k;
 if (dev_packed==dev_nbor) idx -= n_stride;
-// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-//          j, nbor_k, offset_k, idx);
 acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
 numtyp prefactor_jk = zeta_jk.y;
 

@@ -359,13 +359,9 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_,
 rsq1, rsq2, delr1, delr2);
 }
 
-//int jj = (nbor_j-offset_j-2*nbor_pitch)/n_stride;
-//int idx = jj*n_stride + i*t_per_atom + offset_j;
-//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
+// idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
 int idx = nbor_j;
 if (dev_packed==dev_nbor) idx -= n_stride;
-// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-//          i, nbor_j, offset_j, idx);
 acc_zeta(z, tid, t_per_atom, offset_k);
 
 numtyp4 ts1_ijparam = ts1[ijparam]; //fetch4(ts1_ijparam,ijparam,ts1_tex);
@@ -603,14 +599,9 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_,
 numtyp r1inv = ucl_rsqrt(rsq1);
 
 // look up for zeta_ij
-
-//int jj = (nbor_j-offset_j-2*nbor_pitch) / n_stride;
-//int idx = jj*n_stride + i*t_per_atom + offset_j;
-//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
+// idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
 int idx = nbor_j;
 if (dev_packed==dev_nbor) idx -= n_stride;
-// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-//          i, nbor_j, offset_j, idx);
 acctyp4 zeta_ij = zetaij[idx]; // fetch(zeta_ij,idx,zeta_tex);
 numtyp force = zeta_ij.x*tpainv;
 numtyp prefactor = zeta_ij.y;
@@ -841,13 +832,9 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
 offset_kf = red_acc[2*m+1];
 }
 
-//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
-//int idx = iix*n_stride + j*t_per_atom + offset_kf;
-//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
+// idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
 int idx = ijnum;
 if (dev_packed==dev_nbor) idx -= n_stride;
-// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-//          j, ijnum, offset_kf, idx);
 acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
 numtyp force = zeta_ji.x*tpainv;
 numtyp prefactor_ji = zeta_ji.y;
@@ -909,13 +896,9 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
 f.y += fi[1];
 f.z += fi[2];
 
-//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
-//int idx = kk*n_stride + j*t_per_atom + offset_k;
-//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
+// idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
 int idx = nbor_k;
 if (dev_packed==dev_nbor) idx -= n_stride;
-// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-//          j, nbor_k, offset_k, idx);
 acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
 numtyp prefactor_jk = zeta_jk.y;
 int jkiparam=elem2param[jtype*nelements*nelements+ktype*nelements+itype];
@@ -1086,13 +1069,9 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
 offset_kf = red_acc[2*m+1];
 }
 
-//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
-//int idx = iix*n_stride + j*t_per_atom + offset_kf;
-//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
+// idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
 int idx = ijnum;
 if (dev_packed==dev_nbor) idx -= n_stride;
-// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-//          j, ijnum, offset_kf, idx);
 acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
 numtyp force = zeta_ji.x*tpainv;
 numtyp prefactor_ji = zeta_ji.y;
@@ -1161,13 +1140,9 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
 virial[4] += TWOTHIRD*(mdelr1[0]*fj[2] + delr2[0]*fk[2]);
 virial[5] += TWOTHIRD*(mdelr1[1]*fj[2] + delr2[1]*fk[2]);
 
-//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
-//int idx = kk*n_stride + j*t_per_atom + offset_k;
-//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
+// idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
 int idx = nbor_k;
 if (dev_packed==dev_nbor) idx -= n_stride;
-// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-//          j, nbor_k, offset_k, idx);
 acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
 numtyp prefactor_jk = zeta_jk.y;
 

@@ -89,10 +89,10 @@ __kernel void k_yukawa_colloid(const __global numtyp4 *restrict x_,
 if (rsq<coeff[mtype].z) {
 numtyp r = ucl_sqrt(rsq);
 numtyp rinv = ucl_recip(r);
-numtyp screening = ucl_exp(-kappa*(r-(radi+radj)));
-numtyp force = coeff[mtype].x * screening;
+numtyp screening = ucl_exp(-kappa*(r-(radi+radj)));
+numtyp force = coeff[mtype].x * screening;
 
-force = factor_lj*force * rinv;
+force = factor_lj*force * rinv;
 
 f.x+=delx*force;
 f.y+=dely*force;
@@ -181,10 +181,10 @@ __kernel void k_yukawa_colloid_fast(const __global numtyp4 *restrict x_,
 if (rsq<coeff[mtype].z) {
 numtyp r = ucl_sqrt(rsq);
 numtyp rinv = ucl_recip(r);
-numtyp screening = ucl_exp(-kappa*(r-(radi+radj)));
-numtyp force = coeff[mtype].x * screening;
+numtyp screening = ucl_exp(-kappa*(r-(radi+radj)));
+numtyp force = coeff[mtype].x * screening;
 
-force = factor_lj*force * rinv;
+force = factor_lj*force * rinv;
 
 f.x+=delx*force;
 f.y+=dely*force;

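Both yukawa/colloid hunks touch the screened force evaluation, where the exponential decays from the surface separation r - (radi + radj) rather than from the center distance r. A scalar C++ sketch of that force magnitude, assuming the hypothetical name coeff_a stands in for coeff[mtype].x (a sketch of the arithmetic, not the GPU code):

#include <cmath>

// F/r for the colloidal Yukawa pair: A * exp(-kappa*(r - (radi+radj))) / r,
// scaled by the special-neighbor factor factor_lj.
double yukawa_colloid_force(double r, double radi, double radj,
                            double kappa, double coeff_a, double factor_lj) {
  const double rinv = 1.0 / r;
  const double screening = std::exp(-kappa * (r - (radi + radj)));
  const double force = coeff_a * screening;
  return factor_lj * force * rinv;
}
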
@@ -129,16 +129,13 @@ __kernel void k_zbl(const __global numtyp4 *restrict x_,
 int mtype=itype*lj_types+jtype;
 if (rsq<cut_globalsq) {
 numtyp r, t, force;
-
 r = ucl_sqrt(rsq);
 force = dzbldr(r, coeff2[mtype].x, coeff2[mtype].y,
 coeff2[mtype].z, coeff2[mtype].w, coeff1[mtype].z);
-
-if (rsq>cut_innersq) {
-t = r - cut_inner;
-force = t*t * (coeff1[mtype].x + coeff1[mtype].y*t);
-}
-
+if (rsq>cut_innersq) {
+t = r - cut_inner;
+force = t*t * (coeff1[mtype].x + coeff1[mtype].y*t);
+}
 force *= (numtyp)-1.0*ucl_recip(r);
 
 f.x+=delx*force;
@@ -148,11 +145,10 @@ __kernel void k_zbl(const __global numtyp4 *restrict x_,
 if (eflag>0) {
 numtyp e=e_zbl(r, coeff2[mtype].x, coeff2[mtype].y,
 coeff2[mtype].z, coeff2[mtype].w, coeff1[mtype].z);
-e += coeff3[mtype].z;
-if (rsq > cut_innersq) {
-e += t*t*t * (coeff3[mtype].x + coeff3[mtype].y*t);
-}
-
+e += coeff3[mtype].z;
+if (rsq > cut_innersq) {
+e += t*t*t * (coeff3[mtype].x + coeff3[mtype].y*t);
+}
 energy+=e;
 }
 if (vflag>0) {
@@ -232,15 +228,13 @@ __kernel void k_zbl_fast(const __global numtyp4 *restrict x_,
 
 if (rsq<cut_globalsq) {
 numtyp r, t, force;
-
 r = ucl_sqrt(rsq);
 force = dzbldr(r, coeff2[mtype].x, coeff2[mtype].y,
 coeff2[mtype].z, coeff2[mtype].w, coeff1[mtype].z);
-
-if (rsq>cut_innersq) {
-t = r - cut_inner;
-force += t*t * (coeff1[mtype].x + coeff1[mtype].y*t);
-}
+if (rsq>cut_innersq) {
+t = r - cut_inner;
+force += t*t * (coeff1[mtype].x + coeff1[mtype].y*t);
+}
 
 force *= (numtyp)-1.0*ucl_recip(r);
 
@@ -251,11 +245,10 @@ __kernel void k_zbl_fast(const __global numtyp4 *restrict x_,
 if (eflag>0) {
 numtyp e=e_zbl(r, coeff2[mtype].x, coeff2[mtype].y,
 coeff2[mtype].z, coeff2[mtype].w, coeff1[mtype].z);
-e += coeff3[mtype].z;
-if (rsq > cut_innersq) {
-e += t*t*t * (coeff3[mtype].x + coeff3[mtype].y*t);
-}
-
+e += coeff3[mtype].z;
+if (rsq > cut_innersq) {
+e += t*t*t * (coeff3[mtype].x + coeff3[mtype].y*t);
+}
 energy+=e;
 }
 if (vflag>0) {

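All four ZBL hunks regroup the same smoothing step: past cut_inner, a polynomial in t = r - cut_inner is folded into the force so it reaches zero smoothly at the outer cutoff, and the analogous cubic term is folded into the energy. A scalar C++ sketch of the force path under those assumptions, with dzbl_dr standing in for the kernel's dzbldr() result and c1x/c1y for coeff1[mtype].x/.y (hypothetical names; a sketch, not the kernel):

double zbl_force_over_r(double r, double cut_inner, double dzbl_dr,
                        double c1x, double c1y) {
  double force = dzbl_dr;               // raw d(E_zbl)/dr
  if (r > cut_inner) {
    const double t = r - cut_inner;
    force += t * t * (c1x + c1y * t);   // switching correction (k_zbl_fast form)
  }
  return -force / r;                    // mirrors force *= (numtyp)-1.0*ucl_recip(r)
}
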
@@ -1,5 +1,39 @@
 # Change Log
 
+## [2.9.00](https://github.com/kokkos/kokkos/tree/2.9.00) (2019-06-24)
+[Full Changelog](https://github.com/kokkos/kokkos/compare/2.8.00...2.9.00)
+
+**Implemented enhancements:**
+
+- Capability: CUDA Streams [\#1723](https://github.com/kokkos/kokkos/issues/1723)
+- Capability: CUDA Stream support for parallel\_reduce [\#2061](https://github.com/kokkos/kokkos/issues/2061)
+- Capability: Feature Request: TeamVectorRange [\#713](https://github.com/kokkos/kokkos/issues/713)
+- Capability: Adding HPX backend [\#2080](https://github.com/kokkos/kokkos/issues/2080)
+- Capability: TaskScheduler to have multiple queues [\#565](https://github.com/kokkos/kokkos/issues/565)
+- Capability: Support for additional reductions in ScatterView [\#1674](https://github.com/kokkos/kokkos/issues/1674)
+- Capability: Request: deep\_copy within parallel regions [\#689](https://github.com/kokkos/kokkos/issues/689)
+- Capability: Feature Request: `create\_mirror\_view\_without\_initializing` [\#1765](https://github.com/kokkos/kokkos/issues/1765)
+- View: Use SFINAE to restrict possible View type conversions [\#2127](https://github.com/kokkos/kokkos/issues/2127)
+- Deprecation: Deprecate ExecutionSpace::fence\(\) as static function and make it non-static [\#2140](https://github.com/kokkos/kokkos/issues/2140)
+- Deprecation: Deprecate LayoutTileLeft [\#2122](https://github.com/kokkos/kokkos/issues/2122)
+- Macros: KOKKOS\_RESTRICT defined for non-Intel compilers [\#2038](https://github.com/kokkos/kokkos/issues/2038)
+
+**Fixed bugs:**
+
+- Cuda: TeamThreadRange loop count on device is passed by reference to host static constexpr [\#1733](https://github.com/kokkos/kokkos/issues/1733)
+- Cuda: Build error with relocatable device code with CUDA 10.1 GCC 7.3 [\#2134](https://github.com/kokkos/kokkos/issues/2134)
+- Cuda: cudaFuncSetCacheConfig is setting CachePreferShared too often [\#2066](https://github.com/kokkos/kokkos/issues/2066)
+- Cuda: TeamPolicy doesn't throw then created with non-viable vector length and also doesn't backscale to viable one [\#2020](https://github.com/kokkos/kokkos/issues/2020)
+- Cuda: cudaMemcpy error for large league sizes on V100 [\#1991](https://github.com/kokkos/kokkos/issues/1991)
+- Cuda: illegal warp sync in parallel\_reduce by functor on Turing 75 [\#1958](https://github.com/kokkos/kokkos/issues/1958)
+- TeamThreadRange: Inconsistent results from TeamThreadRange reduction [\#1905](https://github.com/kokkos/kokkos/issues/1905)
+- Atomics: atomic\_fetch\_oper & atomic\_oper\_fetch don't build for complex\<float\> [\#1964](https://github.com/kokkos/kokkos/issues/1964)
+- Views: Kokkos randomread Views leak memory [\#2155](https://github.com/kokkos/kokkos/issues/2155)
+- ScatterView: LayoutLeft overload currently non-functional [\#2165](https://github.com/kokkos/kokkos/issues/2165)
+- KNL: With intel 17.2.174 illegal instruction in random number test [\#2078](https://github.com/kokkos/kokkos/issues/2078)
+- Bitset: Enable copy constructor on device [\#2094](https://github.com/kokkos/kokkos/issues/2094)
+- Examples: do not compile due to template deduction error \(multi\_fem\) [\#1928](https://github.com/kokkos/kokkos/issues/1928)
+
 ## [2.8.00](https://github.com/kokkos/kokkos/tree/2.8.00) (2019-02-05)
 [Full Changelog](https://github.com/kokkos/kokkos/compare/2.7.24...2.8.00)
 

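Of the enhancements listed above, TeamVectorRange (#713) is the one most visible in user kernels: it distributes a single iteration range over both the thread and vector levels of a team. A minimal, hedged example against the public Kokkos API of this release (illustrative code, not taken from this commit):

#include <Kokkos_Core.hpp>

// Row sums with the TeamVectorRange added in 2.9.00: one team per row,
// the inner reduction spread across the team's threads and vector lanes.
void row_sums(Kokkos::View<double**> a, Kokkos::View<double*> sums) {
  using policy_t = Kokkos::TeamPolicy<>;
  using member_t = policy_t::member_type;
  const int nrows = static_cast<int>(a.extent(0));
  const int ncols = static_cast<int>(a.extent(1));
  Kokkos::parallel_for("RowSums", policy_t(nrows, Kokkos::AUTO),
    KOKKOS_LAMBDA(const member_t& team) {
      const int row = team.league_rank();
      double sum = 0.0;
      Kokkos::parallel_reduce(Kokkos::TeamVectorRange(team, ncols),
        [=](const int col, double& partial) { partial += a(row, col); }, sum);
      Kokkos::single(Kokkos::PerTeam(team), [=]() { sums(row) = sum; });
    });
}
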
@@ -23,7 +23,7 @@ KOKKOS_DEBUG ?= "no"
 KOKKOS_USE_TPLS ?= ""
 # Options: c++11,c++14,c++1y,c++17,c++1z,c++2a
 KOKKOS_CXX_STANDARD ?= "c++11"
-# Options: aggressive_vectorization,disable_profiling,disable_deprecated_code,enable_large_mem_tests
+# Options: aggressive_vectorization,disable_profiling,enable_deprecated_code,disable_deprecated_code,enable_large_mem_tests
 KOKKOS_OPTIONS ?= ""
 # Option for setting ETI path
 KOKKOS_ETI_PATH ?= ${KOKKOS_PATH}/core/src/eti
@@ -33,11 +33,19 @@ KOKKOS_CMAKE ?= "no"
 # Options: force_uvm,use_ldg,rdc,enable_lambda
 KOKKOS_CUDA_OPTIONS ?= "enable_lambda"
 
+# Default settings specific options.
+# Options: enable_async_dispatch
+KOKKOS_HPX_OPTIONS ?= ""
+
 # Return a 1 if a string contains a substring and 0 if not
 # Note the search string should be without '"'
 # Example: $(call kokkos_has_string,"hwloc,librt",hwloc)
 # Will return a 1
 kokkos_has_string=$(if $(findstring $2,$1),1,0)
+# Returns 1 if the path exists, 0 otherwise
+# Example: $(call kokkos_path_exists,/path/to/file)
+# Will return a 1 if /path/to/file exists
+kokkos_path_exists=$(if $(wildcard $1),1,0)
 
 # Check for general settings.
 KOKKOS_INTERNAL_ENABLE_DEBUG := $(call kokkos_has_string,$(KOKKOS_DEBUG),yes)

@@ -58,6 +66,7 @@ KOKKOS_INTERNAL_ENABLE_COMPILER_WARNINGS := $(call kokkos_has_string,$(KOKKOS_OP
 KOKKOS_INTERNAL_OPT_RANGE_AGGRESSIVE_VECTORIZATION := $(call kokkos_has_string,$(KOKKOS_OPTIONS),aggressive_vectorization)
 KOKKOS_INTERNAL_DISABLE_PROFILING := $(call kokkos_has_string,$(KOKKOS_OPTIONS),disable_profiling)
 KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE := $(call kokkos_has_string,$(KOKKOS_OPTIONS),disable_deprecated_code)
+KOKKOS_INTERNAL_ENABLE_DEPRECATED_CODE := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_deprecated_code)
 KOKKOS_INTERNAL_DISABLE_DUALVIEW_MODIFY_CHECK := $(call kokkos_has_string,$(KOKKOS_OPTIONS),disable_dualview_modify_check)
 KOKKOS_INTERNAL_ENABLE_PROFILING_LOAD_PRINT := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_profile_load_print)
 KOKKOS_INTERNAL_ENABLE_LARGE_MEM_TESTS := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_large_mem_tests)
@@ -65,6 +74,7 @@ KOKKOS_INTERNAL_CUDA_USE_LDG := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),
 KOKKOS_INTERNAL_CUDA_USE_UVM := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),force_uvm)
 KOKKOS_INTERNAL_CUDA_USE_RELOC := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),rdc)
 KOKKOS_INTERNAL_CUDA_USE_LAMBDA := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),enable_lambda)
+KOKKOS_INTERNAL_HPX_ENABLE_ASYNC_DISPATCH := $(call kokkos_has_string,$(KOKKOS_HPX_OPTIONS),enable_async_dispatch)
 KOKKOS_INTERNAL_ENABLE_ETI := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_eti)
 
 
@@ -72,12 +82,15 @@ KOKKOS_INTERNAL_ENABLE_ETI := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_
 KOKKOS_INTERNAL_USE_OPENMP := $(call kokkos_has_string,$(subst OpenMPTarget,,$(KOKKOS_DEVICES)),OpenMP)
 KOKKOS_INTERNAL_USE_PTHREADS := $(call kokkos_has_string,$(KOKKOS_DEVICES),Pthread)
 KOKKOS_INTERNAL_USE_QTHREADS := $(call kokkos_has_string,$(KOKKOS_DEVICES),Qthreads)
+KOKKOS_INTERNAL_USE_HPX := $(call kokkos_has_string,$(KOKKOS_DEVICES),HPX)
 KOKKOS_INTERNAL_USE_SERIAL := $(call kokkos_has_string,$(KOKKOS_DEVICES),Serial)
 
 ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 0)
 ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 0)
 ifeq ($(KOKKOS_INTERNAL_USE_QTHREADS), 0)
-KOKKOS_INTERNAL_USE_SERIAL := 1
+ifeq ($(KOKKOS_INTERNAL_USE_HPX), 0)
+KOKKOS_INTERNAL_USE_SERIAL := 1
+endif
 endif
 endif
 endif

@@ -112,7 +125,7 @@ KOKKOS_INTERNAL_COMPILER_XL := $(strip $(shell $(CXX) -qversion 2
 KOKKOS_INTERNAL_COMPILER_CRAY := $(strip $(shell $(CXX) -craype-verbose 2>&1 | grep "CC-" | wc -l))
 KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell export OMPI_CXX=$(OMPI_CXX); export MPICH_CXX=$(MPICH_CXX); $(CXX) --version 2>&1 | grep nvcc | wc -l))
 KOKKOS_INTERNAL_COMPILER_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),clang)
-KOKKOS_INTERNAL_COMPILER_APPLE_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),apple-darwin)
+KOKKOS_INTERNAL_COMPILER_APPLE_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),Apple LLVM)
 KOKKOS_INTERNAL_COMPILER_HCC := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),HCC)
 
 # Check Host Compiler if using NVCC through nvcc_wrapper
@@ -283,9 +296,9 @@ KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLE
 + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER37) \
 + $(KOKKOS_INTERNAL_USE_ARCH_PASCAL61) \
 + $(KOKKOS_INTERNAL_USE_ARCH_PASCAL60) \
-+ $(KOKKOS_INTERNAL_USE_ARCH_VOLTA70) \
-+ $(KOKKOS_INTERNAL_USE_ARCH_VOLTA72) \
-+ $(KOKKOS_INTERNAL_USE_ARCH_TURING75) \
++ $(KOKKOS_INTERNAL_USE_ARCH_VOLTA70) \
++ $(KOKKOS_INTERNAL_USE_ARCH_VOLTA72) \
++ $(KOKKOS_INTERNAL_USE_ARCH_TURING75) \
 + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50) \
 + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52) \
 + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53))
@@ -300,19 +313,19 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
 + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER37) \
 + $(KOKKOS_INTERNAL_USE_ARCH_PASCAL61) \
 + $(KOKKOS_INTERNAL_USE_ARCH_PASCAL60) \
-+ $(KOKKOS_INTERNAL_USE_ARCH_VOLTA70) \
-+ $(KOKKOS_INTERNAL_USE_ARCH_VOLTA72) \
-+ $(KOKKOS_INTERNAL_USE_ARCH_TURING75) \
++ $(KOKKOS_INTERNAL_USE_ARCH_VOLTA70) \
++ $(KOKKOS_INTERNAL_USE_ARCH_VOLTA72) \
++ $(KOKKOS_INTERNAL_USE_ARCH_TURING75) \
 + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50) \
 + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52) \
 + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53))
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 1)
-ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
-ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
-KOKKOS_INTERNAL_NVCC_PATH := $(shell which nvcc)
-CUDA_PATH ?= $(KOKKOS_INTERNAL_NVCC_PATH:/bin/nvcc=)
+ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
+KOKKOS_INTERNAL_NVCC_PATH := $(shell which nvcc)
+CUDA_PATH ?= $(KOKKOS_INTERNAL_NVCC_PATH:/bin/nvcc=)
+ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
 KOKKOS_INTERNAL_OPENMPTARGET_FLAG := $(KOKKOS_INTERNAL_OPENMPTARGET_FLAG) --cuda-path=$(CUDA_PATH)
+endif
 endif

@@ -441,6 +454,10 @@ ifeq ($(KOKKOS_INTERNAL_USE_QTHREADS), 1)
 tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_QTHREADS")
 endif
 
+ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1)
+tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_HPX")
+endif
+
 ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
 tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_SERIAL")
 endif
@@ -559,9 +576,15 @@ ifeq ($(KOKKOS_INTERNAL_DISABLE_PROFILING), 0)
 tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_PROFILING")
 endif
 
-ifeq ($(KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE), 0)
-tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_DEPRECATED_CODE")
+ifeq ($(KOKKOS_INTERNAL_USE_HPX), 0)
+ifeq ($(KOKKOS_INTERNAL_ENABLE_DEPRECATED_CODE), 1)
+tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_DEPRECATED_CODE")
+endif
+ifeq ($(KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE), 0)
+tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_DEPRECATED_CODE")
+endif
 endif
 
 ifeq ($(KOKKOS_INTERNAL_ENABLE_ETI), 1)
 tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_ETI")
 endif
@@ -593,8 +616,13 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
 
 ifeq ($(KOKKOS_INTERNAL_CUDA_USE_RELOC), 1)
 tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE")
-KOKKOS_CXXFLAGS += --relocatable-device-code=true
-KOKKOS_LDFLAGS += --relocatable-device-code=true
+ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
+KOKKOS_CXXFLAGS += -fcuda-rdc
+KOKKOS_LDFLAGS += -fcuda-rdc
+else
+KOKKOS_CXXFLAGS += --relocatable-device-code=true
+KOKKOS_LDFLAGS += --relocatable-device-code=true
+endif
 endif
 
 ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)

@@ -625,6 +653,12 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
 endif
 endif
 
+ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1)
+ifeq ($(KOKKOS_INTERNAL_HPX_ENABLE_ASYNC_DISPATCH), 1)
+tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_HPX_ASYNC_DISPATCH")
+endif
+endif
+
 # Add Architecture flags.
 
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV80), 1)
@@ -908,7 +942,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
 KOKKOS_INTERNAL_CUDA_ARCH_FLAG=--cuda-gpu-arch
 KOKKOS_CXXFLAGS += -x cuda
 else
-$(error Makefile.kokkos: CUDA is enabled but the compiler is neither NVCC nor Clang)
+$(error Makefile.kokkos: CUDA is enabled but the compiler is neither NVCC nor Clang (got version string $(KOKKOS_CXX_VERSION)) )
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER30), 1)
@@ -1058,10 +1092,18 @@ endif
 ifneq ($(KOKKOS_CMAKE), yes)
 KOKKOS_CXXFLAGS += -I$(CUDA_PATH)/include
 endif
-KOKKOS_LDFLAGS += -L$(CUDA_PATH)/lib64
-KOKKOS_CXXLDFLAGS += -L$(CUDA_PATH)/lib64
+ifeq ($(call kokkos_path_exists,$(CUDA_PATH)/lib64), 1)
+KOKKOS_LDFLAGS += -L$(CUDA_PATH)/lib64
+KOKKOS_CXXLDFLAGS += -L$(CUDA_PATH)/lib64
+KOKKOS_TPL_LIBRARY_DIRS += $(CUDA_PATH)/lib64
+else ifeq ($(call kokkos_path_exists,$(CUDA_PATH)/lib), 1)
+KOKKOS_LDFLAGS += -L$(CUDA_PATH)/lib
+KOKKOS_CXXLDFLAGS += -L$(CUDA_PATH)/lib
+KOKKOS_TPL_LIBRARY_DIRS += $(CUDA_PATH)/lib
+else
+$(error Can't find CUDA library directory: no lib64 or lib directory in $(CUDA_PATH))
+endif
 KOKKOS_TPL_INCLUDE_DIRS += $(CUDA_PATH)/include
-KOKKOS_TPL_LIBRARY_DIRS += $(CUDA_PATH)/lib64
 ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
 KOKKOS_CXXFLAGS += --cuda-path=$(CUDA_PATH)
 endif

@@ -1124,6 +1166,33 @@ ifeq ($(KOKKOS_INTERNAL_USE_QTHREADS), 1)
 KOKKOS_TPL_LIBRARY_NAMES += qthread
 endif
 
+ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1)
+KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/HPX/*.cpp)
+KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/HPX/*.hpp)
+ifneq ($(HPX_PATH),)
+ifeq ($(KOKKOS_INTERNAL_ENABLE_DEBUG), 1)
+KOKKOS_CXXFLAGS += $(shell PKG_CONFIG_PATH=$(HPX_PATH)/lib64/pkgconfig pkg-config --cflags hpx_application_debug)
+KOKKOS_CXXLDFLAGS += $(shell PKG_CONFIG_PATH=$(HPX_PATH)/lib64/pkgconfig pkg-config --libs hpx_application_debug)
+KOKKOS_LDFLAGS += $(shell PKG_CONFIG_PATH=$(HPX_PATH)/lib64/pkgconfig pkg-config --libs hpx_application_debug)
+else
+KOKKOS_CXXFLAGS += $(shell PKG_CONFIG_PATH=$(HPX_PATH)/lib64/pkgconfig pkg-config --cflags hpx_application)
+KOKKOS_CXXLDFLAGS += $(shell PKG_CONFIG_PATH=$(HPX_PATH)/lib64/pkgconfig pkg-config --libs hpx_application)
+KOKKOS_LDFLAGS += $(shell PKG_CONFIG_PATH=$(HPX_PATH)/lib64/pkgconfig pkg-config --libs hpx_application)
+endif
+else
+ifeq ($(KOKKOS_INTERNAL_ENABLE_DEBUG), 1)
+KOKKOS_CXXFLAGS += $(shell pkg-config --cflags hpx_application_debug)
+KOKKOS_CXXLDFLAGS += $(shell pkg-config --libs hpx_application_debug)
+KOKKOS_LDFLAGS += $(shell pkg-config --libs hpx_application_debug)
+else
+KOKKOS_CXXFLAGS += $(shell pkg-config --cflags hpx_application)
+KOKKOS_CXXLDFLAGS += $(shell pkg-config --libs hpx_application)
+KOKKOS_LDFLAGS += $(shell pkg-config --libs hpx_application)
+endif
+endif
+KOKKOS_TPL_LIBRARY_NAMES += hpx
+endif
+
 # Explicitly set the GCC Toolchain for Clang.
 ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
 KOKKOS_INTERNAL_GCC_PATH = $(shell which g++)

@@ -30,6 +30,8 @@ Kokkos_SharedAlloc.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_SharedAlloc.cpp
 Kokkos_MemoryPool.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_MemoryPool.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_MemoryPool.cpp
+Kokkos_HostSpace_deepcopy.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostSpace_deepcopy.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostSpace_deepcopy.cpp
 
 ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
 ifeq ($(KOKKOS_INTERNAL_ENABLE_ETI), 1)
@@ -38,8 +40,8 @@ endif
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
-Kokkos_Cuda_Impl.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Impl.cpp
-	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Impl.cpp
+Kokkos_Cuda_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Instance.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Instance.cpp
 Kokkos_CudaSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_CudaSpace.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_CudaSpace.cpp
 Kokkos_Cuda_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp
@@ -92,6 +94,13 @@ ifeq ($(KOKKOS_INTERNAL_ENABLE_ETI), 1)
 endif
 endif
 
+ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1)
+Kokkos_HPX.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HPX/Kokkos_HPX.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HPX/Kokkos_HPX.cpp
+Kokkos_HPX_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HPX/Kokkos_HPX_Task.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HPX/Kokkos_HPX_Task.cpp
+endif
+
 ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
 Kokkos_OpenMPTarget_Exec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp

@@ -1,5 +1,5 @@
 TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
   LIB_REQUIRED_PACKAGES KokkosCore KokkosContainers
-  LIB_OPTIONAL_TPLS Pthread CUDA HWLOC
+  LIB_OPTIONAL_TPLS Pthread CUDA HWLOC HPX
   TEST_OPTIONAL_TPLS CUSPARSE
 )

@@ -328,6 +328,8 @@ public:
 
 parallel_for("Kokkos::Sort::Copy", Kokkos::RangePolicy<execution_space>(0,len),functor);
 }
+
+Kokkos::fence();
 }
 
 template<class ValuesViewType>

@@ -42,6 +42,12 @@ IF(Kokkos_ENABLE_OpenMP)
 )
 ENDIF()
 
+IF(Kokkos_ENABLE_HPX)
+  LIST( APPEND SOURCES
+    TestHPX.cpp
+  )
+ENDIF()
+
 IF(Kokkos_ENABLE_Serial)
   LIST( APPEND SOURCES
     TestSerial.cpp

|
|
@ -49,6 +49,12 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
|
|||
TEST_TARGETS += test-openmp
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1)
|
||||
OBJ_HPX = TestHPX.o UnitTestMain.o gtest-all.o
|
||||
TARGETS += KokkosAlgorithms_UnitTest_HPX
|
||||
TEST_TARGETS += test-hpx
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
|
||||
OBJ_SERIAL = TestSerial.o UnitTestMain.o gtest-all.o
|
||||
TARGETS += KokkosAlgorithms_UnitTest_Serial
|
||||
|
@ -67,6 +73,9 @@ KokkosAlgorithms_UnitTest_Threads: $(OBJ_THREADS) $(KOKKOS_LINK_DEPENDS)
|
|||
KokkosAlgorithms_UnitTest_OpenMP: $(OBJ_OPENMP) $(KOKKOS_LINK_DEPENDS)
|
||||
$(LINK) $(EXTRA_PATH) $(OBJ_OPENMP) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_OpenMP
|
||||
|
||||
KokkosAlgorithms_UnitTest_HPX: $(OBJ_HPX) $(KOKKOS_LINK_DEPENDS)
|
||||
$(LINK) $(EXTRA_PATH) $(OBJ_HPX) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_HPX
|
||||
|
||||
KokkosAlgorithms_UnitTest_Serial: $(OBJ_SERIAL) $(KOKKOS_LINK_DEPENDS)
|
||||
$(LINK) $(EXTRA_PATH) $(OBJ_SERIAL) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_Serial
|
||||
|
||||
|
@ -82,6 +91,9 @@ test-threads: KokkosAlgorithms_UnitTest_Threads
|
|||
test-openmp: KokkosAlgorithms_UnitTest_OpenMP
|
||||
./KokkosAlgorithms_UnitTest_OpenMP
|
||||
|
||||
test-hpx: KokkosAlgorithms_UnitTest_HPX
|
||||
./KokkosAlgorithms_UnitTest_HPX
|
||||
|
||||
test-serial: KokkosAlgorithms_UnitTest_Serial
|
||||
./KokkosAlgorithms_UnitTest_Serial
|
||||
|
||||
|
|
|
@@ -0,0 +1,96 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+// Kokkos v. 2.0
+// Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+
+#include <Kokkos_Macros.hpp>
+#ifdef KOKKOS_ENABLE_HPX
+
+#include <gtest/gtest.h>
+#include <Kokkos_Core.hpp>
+
+//----------------------------------------------------------------------------
+#include <TestRandom.hpp>
+#include <TestSort.hpp>
+#include <iomanip>
+
+namespace Test {
+
+class hpx : public ::testing::Test {
+protected:
+  static void SetUpTestCase()
+  {
+    std::cout << std::setprecision(5) << std::scientific;
+  }
+
+  static void TearDownTestCase()
+  {
+  }
+};
+
+#define HPX_RANDOM_XORSHIFT64( num_draws ) \
+  TEST_F( hpx, Random_XorShift64 ) { \
+    Impl::test_random<Kokkos::Random_XorShift64_Pool<Kokkos::Experimental::HPX> >(num_draws); \
+  }
+
+#define HPX_RANDOM_XORSHIFT1024( num_draws ) \
+  TEST_F( hpx, Random_XorShift1024 ) { \
+    Impl::test_random<Kokkos::Random_XorShift1024_Pool<Kokkos::Experimental::HPX> >(num_draws); \
+  }
+
+#define HPX_SORT_UNSIGNED( size ) \
+  TEST_F( hpx, SortUnsigned ) { \
+    Impl::test_sort< Kokkos::Experimental::HPX, unsigned >(size); \
+  }
+
+HPX_RANDOM_XORSHIFT64( 10240000 )
+HPX_RANDOM_XORSHIFT1024( 10130144 )
+HPX_SORT_UNSIGNED(171)
+
+#undef HPX_RANDOM_XORSHIFT64
+#undef HPX_RANDOM_XORSHIFT1024
+#undef HPX_SORT_UNSIGNED
+} // namespace test
+#else
+void KOKKOS_ALGORITHMS_UNITTESTS_TESTHPX_PREVENT_LINK_ERROR() {}
+#endif

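The new test file above drives the random-number and sort suites through the HPX backend. From user code, the backend is just another execution space; a hedged minimal example, assuming a build configured with KOKKOS_ENABLE_HPX as wired up by the Makefile and CMake changes in this commit:

#include <Kokkos_Core.hpp>

// Run a reduction explicitly on the HPX execution space added in 2.9.00.
int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    using exec = Kokkos::Experimental::HPX;
    const int n = 1000;
    long sum = 0;
    Kokkos::parallel_reduce("TriangleSum", Kokkos::RangePolicy<exec>(0, n),
      KOKKOS_LAMBDA(const int i, long& s) { s += i; }, sum);
    // sum == n*(n-1)/2 once the reduce completes.
  }
  Kokkos::finalize();
}
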
@@ -225,9 +225,9 @@ void test_dynamic_view_sort(unsigned int n )
 Kokkos::Random_XorShift64_Pool<ExecutionSpace> g(1931);
 Kokkos::fill_random(keys_view,g,Kokkos::Random_XorShift64_Pool<ExecutionSpace>::generator_type::MAX_URAND);
 
-ExecutionSpace::fence();
+ExecutionSpace().fence();
 Kokkos::deep_copy(keys,keys_view);
-//ExecutionSpace::fence();
+//ExecutionSpace().fence();
 
 double sum_before = 0.0;
 double sum_after = 0.0;
@@ -237,9 +237,9 @@ void test_dynamic_view_sort(unsigned int n )
 
 Kokkos::sort(keys, 0 /* begin */ , n /* end */ );
 
-ExecutionSpace::fence(); // Need this fence to prevent BusError with Cuda
+ExecutionSpace().fence(); // Need this fence to prevent BusError with Cuda
 Kokkos::deep_copy( keys_view , keys );
-//ExecutionSpace::fence();
+//ExecutionSpace().fence();
 
 Kokkos::parallel_reduce(n,sum<ExecutionSpace, KeyType>(keys_view),sum_after);
 Kokkos::parallel_reduce(n-1,is_sorted_struct<ExecutionSpace, KeyType>(keys_view),sort_fails);

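The fence edits here, like the DeviceType() and Device() edits in the performance tests further down, all track the same 2.9.00 deprecation (#2140): fence() stops being a static member and is instead called on an execution-space instance. A short sketch of the before and after:

#include <Kokkos_Core.hpp>

template <class ExecutionSpace>
void wait_for_outstanding_kernels() {
  // Deprecated by #2140: the static form ExecutionSpace::fence();
  // Preferred: fence an instance of the execution space.
  ExecutionSpace().fence();
}
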
@@ -76,8 +76,20 @@ IF(KOKKOS_SEPARATE_LIBS)
 )
 
 foreach(lib IN LISTS KOKKOS_TPL_LIBRARY_NAMES)
-  if ("${lib}" STREQUAL "cuda")
+  if (("${lib}" STREQUAL "cuda") AND (NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang"))
     set(LIB_cuda "-lcuda")
+  elseif ("${lib}" STREQUAL "hpx")
+    find_package(HPX REQUIRED)
+    if(${HPX_FOUND})
+      target_link_libraries(kokkoscore PUBLIC ${HPX_LIBRARIES})
+      target_link_libraries(kokkoscontainers PUBLIC ${HPX_LIBRARIES})
+      target_link_libraries(kokkosalgorithms PUBLIC ${HPX_LIBRARIES})
+      target_include_directories(kokkoscore PUBLIC ${HPX_INCLUDE_DIRS})
+      target_include_directories(kokkoscontainers PUBLIC ${HPX_INCLUDE_DIRS})
+      target_include_directories(kokkosalgorithms PUBLIC ${HPX_INCLUDE_DIRS})
+    else()
+      message(ERROR "HPX not found. Check the value of HPX_DIR (= ${HPX_DIR}) or CMAKE_PREFIX_PATH (= ${CMAKE_PREFIX_PATH}).")
+    endif()
   else()
     find_library(LIB_${lib} ${lib} PATHS ${KOKKOS_TPL_LIBRARY_DIRS})
   endif()
@@ -158,8 +170,16 @@ ELSE()
 )
 
 foreach(lib IN LISTS KOKKOS_TPL_LIBRARY_NAMES)
-  if ("${lib}" STREQUAL "cuda")
+  if (("${lib}" STREQUAL "cuda") AND (NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang"))
     set(LIB_cuda "-lcuda")
+  elseif ("${lib}" STREQUAL "hpx")
+    find_package(HPX REQUIRED)
+    if(${HPX_FOUND})
+      target_link_libraries(kokkos PUBLIC ${HPX_LIBRARIES})
+      target_include_directories(kokkos PUBLIC ${HPX_INCLUDE_DIRS})
+    else()
+      message(ERROR "HPX not found. Check the value of HPX_DIR (= ${HPX_DIR}) or CMAKE_PREFIX_PATH (= ${CMAKE_PREFIX_PATH}).")
+    endif()
   else()
     find_library(LIB_${lib} ${lib} PATHS ${KOKKOS_TPL_LIBRARY_DIRS})
   endif()

@@ -95,7 +95,7 @@ function(set_kokkos_cxx_compiler)
 message(FATAL_ERROR "Compiling CUDA code directly with Clang requires version 4.0.0 or higher.")
 endif()
 elseif(NOT INTERNAL_CXX_COMPILER_ID STREQUAL NVIDIA)
-message(FATAL_ERROR "Invalid compiler for CUDA. The compiler must be nvcc_wrapper or Clang.")
+message(FATAL_ERROR "Invalid compiler for CUDA. The compiler must be nvcc_wrapper or Clang, but compiler ID was ${INTERNAL_CXX_COMPILER_ID}")
 endif()
 endif()
 

@@ -14,6 +14,7 @@ list(APPEND KOKKOS_INTERNAL_ENABLE_OPTIONS_LIST
 OpenMP
 Pthread
 Qthread
+HPX
 Cuda
 ROCm
 HWLOC
@@ -23,6 +24,7 @@ list(APPEND KOKKOS_INTERNAL_ENABLE_OPTIONS_LIST
 Cuda_Relocatable_Device_Code
 Cuda_UVM
 Cuda_LDG_Intrinsic
+HPX_ASYNC_DISPATCH
 Debug
 Debug_DualView_Modify_Check
 Debug_Bounds_Check
@@ -116,6 +118,7 @@ list(APPEND KOKKOS_DEVICES_LIST
 OpenMP # OpenMP
 Pthread # pthread
 Qthreads # qthreads
+HPX # HPX
 Serial # serial
 ROCm # Relocatable device code
 )
@@ -173,6 +176,19 @@ set(KOKKOS_INTERNAL_RELOCATABLE_DEVICE_CODE rdc)
 set(KOKKOS_INTERNAL_LAMBDA enable_lambda)
 
 
+#-------------------------------------------------------------------------------
+# List of possible Options for HPX
+#-------------------------------------------------------------------------------
+# From Makefile.kokkos: Options: enable_async_dispatch
+set(KOKKOS_HPX_OPTIONS_LIST)
+list(APPEND KOKKOS_HPX_OPTIONS_LIST
+ASYNC_DISPATCH # enable_async_dispatch
+)
+
+# Map of cmake variables to Makefile variables
+set(KOKKOS_INTERNAL_ENABLE_ASYNC_DISPATCH enable_async_dispatch)
+
+
 #-------------------------------------------------------------------------------
 #------------------------------- Create doc strings ----------------------------
 #-------------------------------------------------------------------------------

@@ -202,6 +218,11 @@ set(KOKKOS_SEPARATE_LIBS OFF CACHE BOOL "OFF = kokkos. ON = kokkoscore, kokkosc
 # Qthreads options.
 set(KOKKOS_QTHREADS_DIR "" CACHE PATH "Location of Qthreads library.")
 
+# HPX options.
+set(KOKKOS_HPX_DIR "" CACHE PATH "Location of HPX library.")
+
+# Whether to build separate libraries or now
+set(KOKKOS_SEPARATE_TESTS OFF CACHE BOOL "Provide unit test targets with finer granularity.")
 
 #-------------------------------------------------------------------------------
 #------------------------------- KOKKOS_DEVICES --------------------------------
@@ -215,6 +236,11 @@ IF(Trilinos_ENABLE_Kokkos)
 ELSE()
 set_kokkos_default_default(QTHREADS OFF)
 ENDIF()
+IF(TPL_ENABLE_HPX)
+set_kokkos_default_default(HPX ON)
+ELSE()
+set_kokkos_default_default(HPX OFF)
+ENDIF()
 IF(Trilinos_ENABLE_OpenMP)
 set_kokkos_default_default(OPENMP ${Trilinos_ENABLE_OpenMP})
 ELSE()
@@ -231,6 +257,7 @@ ELSE()
 set_kokkos_default_default(OPENMP OFF)
 set_kokkos_default_default(PTHREAD OFF)
 set_kokkos_default_default(QTHREAD OFF)
+set_kokkos_default_default(HPX OFF)
 set_kokkos_default_default(CUDA OFF)
 set_kokkos_default_default(ROCM OFF)
 ENDIF()
@@ -241,6 +268,7 @@ set(KOKKOS_ENABLE_SERIAL ${KOKKOS_INTERNAL_ENABLE_SERIAL_DEFAULT} CACHE BOOL "Wh
 set(KOKKOS_ENABLE_OPENMP ${KOKKOS_INTERNAL_ENABLE_OPENMP_DEFAULT} CACHE BOOL "Enable OpenMP support in Kokkos." FORCE)
 set(KOKKOS_ENABLE_PTHREAD ${KOKKOS_INTERNAL_ENABLE_PTHREAD_DEFAULT} CACHE BOOL "Enable Pthread support in Kokkos.")
 set(KOKKOS_ENABLE_QTHREADS ${KOKKOS_INTERNAL_ENABLE_QTHREADS_DEFAULT} CACHE BOOL "Enable Qthreads support in Kokkos.")
+set(KOKKOS_ENABLE_HPX ${KOKKOS_INTERNAL_ENABLE_HPX_DEFAULT} CACHE BOOL "Enable HPX support in Kokkos.")
 set(KOKKOS_ENABLE_CUDA ${KOKKOS_INTERNAL_ENABLE_CUDA_DEFAULT} CACHE BOOL "Enable CUDA support in Kokkos.")
 set(KOKKOS_ENABLE_ROCM ${KOKKOS_INTERNAL_ENABLE_ROCM_DEFAULT} CACHE BOOL "Enable ROCm support in Kokkos.")
 

@@ -343,6 +371,18 @@ set(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE ${KOKKOS_INTERNAL_ENABLE_CUDA_REL
 set(KOKKOS_ENABLE_CUDA_LAMBDA ${KOKKOS_INTERNAL_ENABLE_CUDA_LAMBDA_DEFAULT} CACHE BOOL "Enable lambdas for CUDA. (cuda option)")
 
 
+#-------------------------------------------------------------------------------
+#------------------------------- KOKKOS_HPX_OPTIONS ----------------------------
+#-------------------------------------------------------------------------------
+
+# HPX options.
+# Set Defaults
+set_kokkos_default_default(HPX_ASYNC_DISPATCH OFF)
+
+# Set actual options
+set(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH ${KOKKOS_INTERNAL_ENABLE_HPX_ASYNC_DISPATCH_DEFAULT} CACHE BOOL "Enable HPX async dispatch.")
+
+
 #-------------------------------------------------------------------------------
 #----------------------- HOST ARCH AND LEGACY TRIBITS --------------------------
 #-------------------------------------------------------------------------------
@@ -376,4 +416,3 @@ foreach(opt ${KOKKOS_INTERNAL_ENABLE_OPTIONS_LIST})
 SET(Kokkos_ENABLE_${opt} ${KOKKOS_ENABLE_${OPT}} CACHE BOOL "CamelCase Compatibility setting for KOKKOS_ENABLE_${OPT}")
 ENDIF()
 endforeach()
-

@@ -198,6 +198,8 @@ if(KOKKOS_CMAKE_VERBOSE)
 message(STATUS " Host Parallel: Pthread")
 elseif(KOKKOS_ENABLE_QTHREADS)
 message(STATUS " Host Parallel: Qthreads")
+elseif(KOKKOS_ENABLE_HPX)
+message(STATUS " Host Parallel: HPX")
 else()
 message(STATUS " Host Parallel: None")
 endif()
@@ -244,6 +246,10 @@ if(KOKKOS_CMAKE_VERBOSE)
 message(STATUS " KOKKOS_MEMKIND_DIR: ${KOKKOS_MEMKIND_DIR}")
 endif()
 
+if(KOKKOS_HPX_DIR)
+message(STATUS " KOKKOS_HPX_DIR: ${KOKKOS_HPX_DIR}")
+endif()
+
 message(STATUS "")
 message(STATUS "Final kokkos settings variable:")
 message(STATUS " ${KOKKOS_SETTINGS}")

@@ -9,6 +9,10 @@ IF(NOT DEFINED ${PROJECT_NAME}_ENABLE_OpenMP)
 SET(${PROJECT_NAME}_ENABLE_OpenMP OFF)
 ENDIF()
 
+IF(NOT DEFINED ${PROJECT_NAME}_ENABLE_HPX)
+SET(${PROJECT_NAME}_ENABLE_HPX OFF)
+ENDIF()
+
 IF(NOT DEFINED ${PROJECT_NAME}_ENABLE_DEBUG)
 SET(${PROJECT_NAME}_ENABLE_DEBUG OFF)
 ENDIF()
@@ -309,6 +313,10 @@ ENDFUNCTION()
 FUNCTION(TRIBITS_TPL_TENTATIVELY_ENABLE)
 ENDFUNCTION()
 
+FUNCTION(TRIBITS_ADD_ADVANCED_TEST)
+# TODO Write this
+ENDFUNCTION()
+
 FUNCTION(TRIBITS_ADD_EXECUTABLE_AND_TEST EXE_NAME)
 
 SET(options STANDARD_PASS_OUTPUT WILL_FAIL)

@@ -1,5 +1,5 @@
 TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
   LIB_REQUIRED_PACKAGES KokkosCore
-  LIB_OPTIONAL_TPLS Pthread CUDA HWLOC
+  LIB_OPTIONAL_TPLS Pthread CUDA HWLOC HPX
   TEST_OPTIONAL_TPLS CUSPARSE
 )

@@ -24,6 +24,10 @@ IF(Kokkos_ENABLE_OpenMP)
 LIST( APPEND SOURCES TestOpenMP.cpp)
 ENDIF()
 
+IF(Kokkos_ENABLE_HPX)
+LIST( APPEND SOURCES TestHPX.cpp)
+ENDIF()
+
 # Per #374, we always want to build this test, but we only want to run
 # it as a PERFORMANCE test. That's why we separate building the test
 # from running the test.

@@ -49,6 +49,12 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
 TEST_TARGETS += test-openmp
 endif
 
+ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1)
+OBJ_HPX = TestHPX.o TestMain.o gtest-all.o
+TARGETS += KokkosContainers_PerformanceTest_HPX
+TEST_TARGETS += test-hpx
+endif
+
 KokkosContainers_PerformanceTest_Cuda: $(OBJ_CUDA) $(KOKKOS_LINK_DEPENDS)
 	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_CUDA) $(KOKKOS_LIBS) $(LIB) -o KokkosContainers_PerformanceTest_Cuda
 
@@ -61,6 +67,9 @@ KokkosContainers_PerformanceTest_Threads: $(OBJ_THREADS) $(KOKKOS_LINK_DEPENDS)
 KokkosContainers_PerformanceTest_OpenMP: $(OBJ_OPENMP) $(KOKKOS_LINK_DEPENDS)
 	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_OPENMP) $(KOKKOS_LIBS) $(LIB) -o KokkosContainers_PerformanceTest_OpenMP
 
+KokkosContainers_PerformanceTest_HPX: $(OBJ_HPX) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_HPX) $(KOKKOS_LIBS) $(LIB) -o KokkosContainers_PerformanceTest_HPX
+
 test-cuda: KokkosContainers_PerformanceTest_Cuda
 	./KokkosContainers_PerformanceTest_Cuda
 
@@ -73,6 +82,9 @@ test-threads: KokkosContainers_PerformanceTest_Threads
 test-openmp: KokkosContainers_PerformanceTest_OpenMP
 	./KokkosContainers_PerformanceTest_OpenMP
 
+test-hpx: KokkosContainers_PerformanceTest_HPX
+	./KokkosContainers_PerformanceTest_HPX
+
 build_all: $(TARGETS)
 
 test: $(TEST_TARGETS)

@ -197,7 +197,7 @@ void test_dynrankview_op_perf( const int par_size )
timer.reset();
Kokkos::RangePolicy<DeviceType> policy(0,par_size);
Kokkos::parallel_for( policy , FunctorType(testview) );
DeviceType::fence();
DeviceType().fence();
elapsed_time_view = timer.seconds();
std::cout << " View time (init only): " << elapsed_time_view << std::endl;

@ -205,7 +205,7 @@ void test_dynrankview_op_perf( const int par_size )
timer.reset();
Kokkos::View<double*,DeviceType> sumview("sumview",par_size);
Kokkos::parallel_for( policy , typename FunctorType::SumComputationTest(testview, sumview) );
DeviceType::fence();
DeviceType().fence();
elapsed_time_compview = timer.seconds();
std::cout << " View sum computation time: " << elapsed_time_compview << std::endl;

@ -215,7 +215,7 @@ void test_dynrankview_op_perf( const int par_size )

timer.reset();
Kokkos::parallel_for( policy , FunctorStrideType(teststrideview) );
DeviceType::fence();
DeviceType().fence();
elapsed_time_strideview = timer.seconds();
std::cout << " Strided View time (init only): " << elapsed_time_strideview << std::endl;
}

@ -226,7 +226,7 @@ void test_dynrankview_op_perf( const int par_size )
timer.reset();
Kokkos::RangePolicy<DeviceType> policy(0,par_size);
Kokkos::parallel_for( policy , FunctorType(testview) );
DeviceType::fence();
DeviceType().fence();
elapsed_time_view_rank7 = timer.seconds();
std::cout << " View Rank7 time (init only): " << elapsed_time_view_rank7 << std::endl;
}

@ -237,14 +237,14 @@ void test_dynrankview_op_perf( const int par_size )
timer.reset();
Kokkos::RangePolicy<DeviceType> policy(0,par_size);
Kokkos::parallel_for( policy , FunctorType(testdrview) );
DeviceType::fence();
DeviceType().fence();
elapsed_time_drview = timer.seconds();
std::cout << " DynRankView time (init only): " << elapsed_time_drview << std::endl;

timer.reset();
Kokkos::DynRankView<double,DeviceType> sumview("sumview",par_size);
Kokkos::parallel_for( policy , typename FunctorType::SumComputationTest(testdrview, sumview) );
DeviceType::fence();
DeviceType().fence();
elapsed_time_compdrview = timer.seconds();
std::cout << " DynRankView sum computation time: " << elapsed_time_compdrview << std::endl;

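Editorial aside (not part of the diff): the hunks above follow a single pattern this commit applies throughout the bundled Kokkos sources, replacing the static call ExecSpace::fence() with a call on a default-constructed instance, ExecSpace().fence(). A minimal sketch of the new idiom, assuming any default-constructible Kokkos execution space:

// Sketch only. 'Exec' stands for any Kokkos execution space type,
// e.g. Kokkos::Serial or Kokkos::OpenMP.
#include <Kokkos_Core.hpp>
#include <Kokkos_Timer.hpp>

template <class Exec>
double timed_fill(Kokkos::View<double*, Exec> v) {
  Kokkos::Timer timer;
  Kokkos::parallel_for(
      Kokkos::RangePolicy<Exec>(0, v.extent(0)),
      KOKKOS_LAMBDA(const int i) { v(i) = 1.0; });
  // Old style being removed:  Exec::fence();
  Exec().fence();  // new style: fence an instance of the space
  return timer.seconds();
}
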
@ -192,7 +192,7 @@ void test_global_to_local_ids(unsigned num_ids)
{
generate_ids<Device> gen(local_2_global);
}
Device::fence();
Device().fence();
// generate
elasped_time = timer.seconds();
std::cout << elasped_time << ", ";

@ -201,7 +201,7 @@ void test_global_to_local_ids(unsigned num_ids)
{
fill_map<Device> fill(global_2_local, local_2_global);
}
Device::fence();
Device().fence();

// fill
elasped_time = timer.seconds();

@ -214,7 +214,7 @@ void test_global_to_local_ids(unsigned num_ids)
{
find_test<Device> find(global_2_local, local_2_global,num_errors);
}
Device::fence();
Device().fence();

// find
elasped_time = timer.seconds();

@ -0,0 +1,130 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/

#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ENABLE_HPX )

#include <gtest/gtest.h>

#include <Kokkos_Core.hpp>

#include <Kokkos_UnorderedMap.hpp>

#include <TestGlobal2LocalIds.hpp>
#include <TestUnorderedMapPerformance.hpp>

#include <TestDynRankView.hpp>
#include <TestScatterView.hpp>

#include <iomanip>
#include <sstream>
#include <string>
#include <fstream>


namespace Performance {

class hpx : public ::testing::Test {
protected:
static void SetUpTestCase()
{
std::cout << std::setprecision(5) << std::scientific;

Kokkos::initialize();
Kokkos::print_configuration( std::cout );
}

static void TearDownTestCase()
{
Kokkos::finalize();
}
};

TEST_F( hpx, dynrankview_perf )
{
std::cout << "HPX" << std::endl;
std::cout << " DynRankView vs View: Initialization Only " << std::endl;
test_dynrankview_op_perf<Kokkos::Experimental::HPX>( 8192 );
}

TEST_F( hpx, global_2_local)
{
std::cout << "HPX" << std::endl;
std::cout << "size, create, generate, fill, find" << std::endl;
for (unsigned i=Performance::begin_id_size; i<=Performance::end_id_size; i *= Performance::id_step)
test_global_to_local_ids<Kokkos::Experimental::HPX>(i);
}

TEST_F( hpx, unordered_map_performance_near)
{
unsigned num_hpx = 4;
std::ostringstream base_file_name;
base_file_name << "hpx-" << num_hpx << "-near";
Perf::run_performance_tests<Kokkos::Experimental::HPX,true>(base_file_name.str());
}

TEST_F( hpx, unordered_map_performance_far)
{
unsigned num_hpx = 4;
std::ostringstream base_file_name;
base_file_name << "hpx-" << num_hpx << "-far";
Perf::run_performance_tests<Kokkos::Experimental::HPX,false>(base_file_name.str());
}

TEST_F( hpx, scatter_view)
{
std::cout << "ScatterView data-duplicated test:\n";
Perf::test_scatter_view<Kokkos::Experimental::HPX, Kokkos::LayoutRight,
Kokkos::Experimental::ScatterDuplicated,
Kokkos::Experimental::ScatterNonAtomic>(10, 1000 * 1000);
//std::cout << "ScatterView atomics test:\n";
//Perf::test_scatter_view<Kokkos::Experimental::HPX, Kokkos::LayoutRight,
// Kokkos::Experimental::ScatterNonDuplicated,
// Kokkos::Experimental::ScatterAtomic>(10, 1000 * 1000);
}
} // namespace Performance
#else
void KOKKOS_CONTAINERS_PERFORMANCE_TESTS_TESTHPX_PREVENT_EMPTY_LINK_ERROR() {}
#endif

@ -83,6 +83,7 @@ void test_scatter_view(int m, int n)
for (int k = 0; k < m; ++k) {
Kokkos::parallel_for(policy, f2, "hand_coded_duplicate_scatter_view_test");
}
Kokkos::fence();
auto t = timer.seconds();
std::cout << "hand-coded test took " << t << " seconds\n";
}

@ -101,6 +102,7 @@ void test_scatter_view(int m, int n)
for (int k = 0; k < m; ++k) {
Kokkos::parallel_for(policy, f, "scatter_view_test");
}
Kokkos::fence();
auto t = timer.seconds();
std::cout << "test took " << t << " seconds\n";
}

@ -108,7 +108,7 @@ struct UnorderedMapTest
std::cout << std::setprecision(2) << std::fixed << std::setw(5) << (1e9*(seconds/(inserts))) << "; " << std::flush;

histogram.calculate();
Device::fence();
Device().fence();
}

void print(std::ostream & metrics_out, std::ostream & length_out, std::ostream & distance_out, std::ostream & block_distance_out)

@ -236,7 +236,7 @@ void run_performance_tests(std::string const & base_file_name)
uint32_t inserts = static_cast<uint32_t>(test_ratios[j]*(capacity));
std::cout << capacity << std::flush;
UnorderedMapTest<Device, Near> test(capacity, inserts*collisions[i], collisions[i]);
Device::fence();
Device().fence();
test.print(metrics_out, length_out, distance_out, block_distance_out);
}
std::cout << "\b\b " << std::endl;

@ -107,22 +107,20 @@ public:
}
}

/// assignment
Bitset<Device> & operator = (Bitset<Device> const & rhs)
{
this->m_size = rhs.m_size;
this->m_last_block_mask = rhs.m_last_block_mask;
this->m_blocks = rhs.m_blocks;
KOKKOS_INLINE_FUNCTION
Bitset (const Bitset<Device>&) = default;

return *this;
}
KOKKOS_INLINE_FUNCTION
Bitset& operator= (const Bitset<Device>&) = default;

/// copy constructor
Bitset( Bitset<Device> const & rhs)
: m_size( rhs.m_size )
, m_last_block_mask( rhs.m_last_block_mask )
, m_blocks( rhs.m_blocks )
{}
KOKKOS_INLINE_FUNCTION
Bitset (Bitset<Device>&&) = default;

KOKKOS_INLINE_FUNCTION
Bitset& operator= (Bitset<Device>&&) = default;

KOKKOS_INLINE_FUNCTION
~Bitset () = default;

/// number of bits in the set
/// can be called from the host or the device

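Editorial aside on the Bitset hunk above: the hand-written copy constructor and assignment operator are replaced by defaulted rule-of-five members, which is safe because the members (a size, a mask, and a reference-counted View) already copy correctly memberwise. A minimal sketch of the pattern, using a placeholder Handle type rather than the real Bitset:

// Sketch only (hypothetical Handle type, not from the diff).
template <class Device>
class Handle {
  int m_size = 0;
  // e.g. Kokkos::View<unsigned*, Device> m_blocks;  // reference-counted
public:
  Handle() = default;
  Handle(const Handle&) = default;             // memberwise copy
  Handle& operator=(const Handle&) = default;  // memberwise assign
  Handle(Handle&&) = default;
  Handle& operator=(Handle&&) = default;
  ~Handle() = default;
};
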
@ -484,8 +484,8 @@ public:
}
}
if(std::is_same<typename t_host::memory_space,typename t_dev::memory_space>::value) {
t_dev::execution_space::fence();
t_host::execution_space::fence();
typename t_dev::execution_space().fence();
typename t_host::execution_space().fence();
}
}

@ -75,7 +75,7 @@ struct DynRankDimTraits {
, const size_t N4
, const size_t N5
, const size_t N6
, const size_t N7 )
, const size_t /* N7 */)
{
return
( (N6 == unspecified && N5 == unspecified && N4 == unspecified && N3 == unspecified && N2 == unspecified && N1 == unspecified && N0 == unspecified) ? 0

@ -106,7 +106,7 @@ struct DynRankDimTraits {
// Extra overload to match that for specialize types v2
template <typename Layout, typename ... P>
KOKKOS_INLINE_FUNCTION
static size_t computeRank( const Kokkos::Impl::ViewCtorProp<P...>& prop, const Layout& layout )
static size_t computeRank( const Kokkos::Impl::ViewCtorProp<P...>& /* prop */, const Layout& layout )
{
return computeRank(layout);
}

@ -155,7 +155,7 @@ struct DynRankDimTraits {
// Extra overload to match that for specialize types
template <typename Traits, typename ... P>
KOKKOS_INLINE_FUNCTION
static typename std::enable_if< (std::is_same<typename Traits::array_layout , Kokkos::LayoutRight>::value || std::is_same<typename Traits::array_layout , Kokkos::LayoutLeft>::value || std::is_same<typename Traits::array_layout , Kokkos::LayoutStride>::value) , typename Traits::array_layout >::type createLayout( const Kokkos::Impl::ViewCtorProp<P...>& prop, const typename Traits::array_layout& layout )
static typename std::enable_if< (std::is_same<typename Traits::array_layout , Kokkos::LayoutRight>::value || std::is_same<typename Traits::array_layout , Kokkos::LayoutLeft>::value || std::is_same<typename Traits::array_layout , Kokkos::LayoutStride>::value) , typename Traits::array_layout >::type createLayout( const Kokkos::Impl::ViewCtorProp<P...>& /* prop */, const typename Traits::array_layout& layout )
{
return createLayout( layout );
}

@ -655,7 +655,7 @@ public:
const size_t dim_scalar = m_map.dimension_scalar();
const size_t bytes = this->span() / dim_scalar;

typedef Kokkos::View<DataType*, typename traits::array_layout, typename traits::device_type, Kokkos::MemoryTraits<Kokkos::Unmanaged | traits::memory_traits::RandomAccess | traits::memory_traits::Atomic> > tmp_view_type;
typedef Kokkos::View<DataType*, typename traits::array_layout, typename traits::device_type, Kokkos::MemoryTraits<traits::memory_traits::is_unmanaged | traits::memory_traits::is_random_access | traits::memory_traits::is_atomic> > tmp_view_type;
tmp_view_type rankone_view(this->data(), bytes, dim_scalar);
return rankone_view(i0);
}

@ -1060,7 +1060,7 @@ public:
}

// Copy the input allocation properties with possibly defaulted properties
alloc_prop prop( arg_prop );
alloc_prop prop_copy( arg_prop );

//------------------------------------------------------------
#if defined( KOKKOS_ENABLE_CUDA )

@ -1070,18 +1070,18 @@ public:
// Fence using the trait's execution space (which will be Kokkos::Cuda)
// to avoid incomplete type errors from using Kokkos::Cuda directly.
if ( std::is_same< Kokkos::CudaUVMSpace , typename traits::device_type::memory_space >::value ) {
traits::device_type::memory_space::execution_space::fence();
typename traits::device_type::memory_space::execution_space().fence();
}
#endif
//------------------------------------------------------------

Kokkos::Impl::SharedAllocationRecord<> *
record = m_map.allocate_shared( prop , Impl::DynRankDimTraits<typename traits::specialize>::template createLayout<traits, P...>(arg_prop, arg_layout) );
record = m_map.allocate_shared( prop_copy, Impl::DynRankDimTraits<typename traits::specialize>::template createLayout<traits, P...>(arg_prop, arg_layout) );

//------------------------------------------------------------
#if defined( KOKKOS_ENABLE_CUDA )
if ( std::is_same< Kokkos::CudaUVMSpace , typename traits::device_type::memory_space >::value ) {
traits::device_type::memory_space::execution_space::fence();
typename traits::device_type::memory_space::execution_space().fence();
}
#endif
//------------------------------------------------------------

@ -1609,7 +1609,7 @@ struct DynRankViewFill {

closure.execute();

execution_space::fence();
execution_space().fence();
}
};

@ -1650,6 +1650,7 @@ struct DynRankViewRemap {
typedef Kokkos::RangePolicy< ExecSpace > Policy ;
const Kokkos::Impl::ParallelFor< DynRankViewRemap , Policy > closure( *this , Policy( 0 , n0 ) );
closure.execute();
// Kokkos::fence(); // ??
}

KOKKOS_INLINE_FUNCTION

@ -288,8 +288,8 @@ public:
>::type
resize_serial( IntType const & n )
{
typedef typename traits::value_type value_type ;
typedef value_type * value_pointer_type ;
typedef typename traits::value_type local_value_type ;
typedef local_value_type * value_pointer_type ;

const uintptr_t NC = ( n + m_chunk_mask ) >> m_chunk_shift ; // New total number of chunks needed for resize

@ -304,8 +304,8 @@ public:
if ( *pc < NC ) {
while ( *pc < NC ) {
m_chunks[*pc] = reinterpret_cast<value_pointer_type>
(
typename traits::memory_space().allocate( sizeof(value_type) << m_chunk_shift )
(
typename traits::memory_space().allocate( sizeof(local_value_type) << m_chunk_shift )
);
++*pc ;
}

@ -314,7 +314,7 @@ public:
while ( NC + 1 <= *pc ) {
--*pc ;
typename traits::memory_space().deallocate( m_chunks[*pc]
, sizeof(value_type) << m_chunk_shift );
, sizeof(local_value_type) << m_chunk_shift );
m_chunks[*pc] = 0 ;
}
}

@ -376,8 +376,8 @@ public:

closure.execute();

traits::execution_space::fence();
//Impl::ChunkArraySpace< typename traits::memory_space >::memory_space::execution_space::fence();
typename traits::execution_space().fence();
//Impl::ChunkArraySpace< typename traits::memory_space >::memory_space::execution_space().fence();
}

void construct_shared_allocation()

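Editorial aside: resize_serial above grows or shrinks storage one fixed-size chunk at a time, so existing elements never move on resize. A plain-C++ sketch of that bookkeeping (illustrative only; the real code allocates through typename traits::memory_space()):

// Sketch only: chunked growth/shrink as in resize_serial above.
#include <cstdlib>
#include <cstddef>

struct ChunkedArray {
  static const std::size_t chunk_shift = 10;  // 1024 doubles per chunk
  static const std::size_t chunk_mask  = (std::size_t(1) << chunk_shift) - 1;
  double*     chunks[64] = {};                // chunk pointer table
  std::size_t num_chunks = 0;

  void resize_serial(std::size_t n) {
    const std::size_t NC = (n + chunk_mask) >> chunk_shift;  // chunks needed
    while (num_chunks < NC)                                   // grow: allocate new chunks
      chunks[num_chunks++] =
          static_cast<double*>(std::malloc(sizeof(double) << chunk_shift));
    while (NC < num_chunks) {                                 // shrink: free tail chunks
      std::free(chunks[--num_chunks]);
      chunks[num_chunks] = nullptr;
    }
  }
};
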
@ -202,8 +202,8 @@ namespace Kokkos {

template <typename iType, typename std::enable_if< std::is_integral<iType>::value, iType>::type = 0>
KOKKOS_INLINE_FUNCTION
int64_t begin(const iType dimension) const {
return dimension < Rank ? m_begins[dimension] : 0;
int64_t begin(const iType local_dimension) const {
return local_dimension < Rank ? m_begins[local_dimension] : 0;
}

KOKKOS_INLINE_FUNCTION

@ -211,7 +211,9 @@ namespace Kokkos {

template <typename iType, typename std::enable_if< std::is_integral<iType>::value, iType>::type = 0>
KOKKOS_INLINE_FUNCTION
int64_t end(const iType dimension) const {return begin(dimension) + m_map.extent(dimension);}
int64_t end(const iType local_dimension) const {
return begin(local_dimension) + m_map.extent(local_dimension);
}

private:

@ -1068,7 +1070,7 @@ namespace Kokkos {
}

// Copy the input allocation properties with possibly defaulted properties
alloc_prop prop( arg_prop );
alloc_prop prop_copy( arg_prop );

//------------------------------------------------------------
#if defined( KOKKOS_ENABLE_CUDA )

@ -1078,18 +1080,18 @@ namespace Kokkos {
// Fence using the trait's execution space (which will be Kokkos::Cuda)
// to avoid incomplete type errors from using Kokkos::Cuda directly.
if ( std::is_same< Kokkos::CudaUVMSpace , typename traits::device_type::memory_space >::value ) {
traits::device_type::memory_space::execution_space::fence();
typename traits::device_type::memory_space::execution_space().fence();
}
#endif
//------------------------------------------------------------

Kokkos::Impl::SharedAllocationRecord<> *
record = m_map.allocate_shared( prop , arg_layout );
record = m_map.allocate_shared( prop_copy , arg_layout );

//------------------------------------------------------------
#if defined( KOKKOS_ENABLE_CUDA )
if ( std::is_same< Kokkos::CudaUVMSpace , typename traits::device_type::memory_space >::value ) {
traits::device_type::memory_space::execution_space::fence();
typename traits::device_type::memory_space::execution_space().fence();
}
#endif
//------------------------------------------------------------

@ -57,9 +57,16 @@
namespace Kokkos {
namespace Experimental {

//TODO: replace this enum with the Kokkos::Sum, etc reducers for parallel_reduce
/*
 * Reduction Type list
 *  - These correspond to a subset of the reducers in parallel_reduce
 *  - See implementations of ScatterValue for details.
 */
enum : int {
ScatterSum,
ScatterProd,
ScatterMax,
ScatterMin,
};

enum : int {

@ -114,6 +121,21 @@ struct DefaultContribution<Kokkos::OpenMP, Kokkos::Experimental::ScatterDuplicat
};
#endif

#ifdef KOKKOS_ENABLE_HPX
template <>
struct DefaultDuplication<Kokkos::Experimental::HPX> {
enum : int { value = Kokkos::Experimental::ScatterDuplicated };
};
template <>
struct DefaultContribution<Kokkos::Experimental::HPX, Kokkos::Experimental::ScatterNonDuplicated> {
enum : int { value = Kokkos::Experimental::ScatterAtomic };
};
template <>
struct DefaultContribution<Kokkos::Experimental::HPX, Kokkos::Experimental::ScatterDuplicated> {
enum : int { value = Kokkos::Experimental::ScatterNonAtomic };
};
#endif

#ifdef KOKKOS_ENABLE_THREADS
template <>
struct DefaultDuplication<Kokkos::Threads> {

@ -144,39 +166,277 @@ struct DefaultContribution<Kokkos::Cuda, Kokkos::Experimental::ScatterDuplicated
};
#endif

/* ScatterValue is the object returned by the access operator() of ScatterAccess,
   similar to that returned by an Atomic View, it wraps Kokkos::atomic_add with convenient
   operator+=, etc. */
/* ScatterValue <Op=ScatterSum, contribution=ScatterNonAtomic> is the object returned by the access operator() of ScatterAccess.
   This class inherits from the Sum<> reducer and wraps join(dest, src) with convenient operator+=, etc.
   Note the addition of update(ValueType const& rhs) and reset() so that all reducers share common functions
   (see ReduceDuplicates and ResetDuplicates). */
template <typename ValueType, int Op, int contribution>
struct ScatterValue;

template <typename ValueType>
struct ScatterValue<ValueType, Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonAtomic> {
struct ScatterValue<ValueType, Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonAtomic> :
  Sum<ValueType,Kokkos::DefaultExecutionSpace> {
public:
KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ValueType& value_in) : value( value_in ) {}
KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ScatterValue&& other) : value( other.value ) {}
KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ValueType& value_in) :
  Sum<ValueType,Kokkos::DefaultExecutionSpace>(value_in)
{}
KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ScatterValue&& other) :
  Sum<ValueType,Kokkos::DefaultExecutionSpace>(other.reference())
{}
KOKKOS_FORCEINLINE_FUNCTION void operator+=(ValueType const& rhs) {
value += rhs;
this->join( this->reference(), rhs );
}
KOKKOS_FORCEINLINE_FUNCTION void operator-=(ValueType const& rhs) {
value -= rhs;
this->join( this->reference(), -rhs );
}
KOKKOS_FORCEINLINE_FUNCTION void update(ValueType const& rhs) {
this->join( this->reference(), rhs );
}
KOKKOS_FORCEINLINE_FUNCTION void reset() {
this->init( this->reference() );
}
private:
ValueType& value;
};

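Editorial note on the design above: giving every ScatterValue specialization the same update()/reset() pair is what lets the duplication machinery below (ReduceDuplicates, ResetDuplicates) stay generic over the operation. A minimal standalone sketch of that idea, using a hypothetical SumCombine wrapper rather than the Kokkos types:

// Sketch only (hypothetical names). Any combiner exposing update()/reset()
// can be driven by the same strided reduction loop.
#include <cstddef>

template <class Value, class Combiner>
void combine_duplicates(const Value* src, Value* dst,
                        std::size_t stride, std::size_t copies,
                        std::size_t n) {
  for (std::size_t i = 0; i < n; ++i) {
    Combiner c(dst[i]);                   // wraps a reference to the destination
    for (std::size_t j = 0; j < copies; ++j)
      c.update(src[i + stride * j]);      // op-specific join: +=, *=, min, max
  }
}

struct SumCombine {                       // one possible combiner
  double& ref;
  explicit SumCombine(double& r) : ref(r) {}
  void update(double rhs) { ref += rhs; }
  void reset() { ref = 0.0; }             // identity of the operation
};
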
/* ScatterValue <Op=ScatterSum, contribution=ScatterAtomic> is the object returned by the access operator()
 * of ScatterAccess; similar to the value returned by an Atomic View, it wraps Kokkos::atomic_add with convenient
   operator+=, etc. This version also has the update(rhs) and reset() functions. */
template <typename ValueType>
struct ScatterValue<ValueType, Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterAtomic> {
struct ScatterValue<ValueType, Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterAtomic> :
  Sum<ValueType,Kokkos::DefaultExecutionSpace> {
public:
KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ValueType& value_in) : value( value_in ) {}
KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ValueType& value_in) :
  Sum<ValueType,Kokkos::DefaultExecutionSpace>(value_in)
{}

KOKKOS_FORCEINLINE_FUNCTION void operator+=(ValueType const& rhs) {
Kokkos::atomic_add(&value, rhs);
this->join(this->reference(), rhs);
}
KOKKOS_FORCEINLINE_FUNCTION void operator-=(ValueType const& rhs) {
Kokkos::atomic_add(&value, -rhs);
this->join(this->reference(), -rhs);
}
private:
ValueType& value;

KOKKOS_INLINE_FUNCTION
void join(ValueType& dest, const ValueType& src) const {
Kokkos::atomic_add(&dest, src);
}

KOKKOS_INLINE_FUNCTION
void join(volatile ValueType& dest, const volatile ValueType& src) const {
Kokkos::atomic_add(&dest, src);
}

KOKKOS_FORCEINLINE_FUNCTION void update(ValueType const& rhs) {
this->join( this->reference(), rhs );
}

KOKKOS_FORCEINLINE_FUNCTION void reset() {
this->init( this->reference() );
}
};

/* ScatterValue <Op=ScatterProd, contribution=ScatterNonAtomic> is the object returned by the access operator() of ScatterAccess.
   This class inherits from the Prod<> reducer and wraps join(dest, src) with convenient operator*=, etc.
   Note the addition of update(ValueType const& rhs) and reset() so that all reducers share common functions
   (see ReduceDuplicates and ResetDuplicates). */
template <typename ValueType>
struct ScatterValue<ValueType, Kokkos::Experimental::ScatterProd, Kokkos::Experimental::ScatterNonAtomic> :
  Prod<ValueType,Kokkos::DefaultExecutionSpace> {
public:
KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ValueType& value_in) :
  Prod<ValueType,Kokkos::DefaultExecutionSpace>(value_in)
{}
KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ScatterValue&& other) :
  Prod<ValueType,Kokkos::DefaultExecutionSpace>(other.reference())
{}
KOKKOS_FORCEINLINE_FUNCTION void operator*=(ValueType const& rhs) {
this->join( this->reference(), rhs );
}
KOKKOS_FORCEINLINE_FUNCTION void operator/=(ValueType const& rhs) {
this->join( this->reference(), static_cast<ValueType>(1)/rhs );
}
KOKKOS_FORCEINLINE_FUNCTION void update(ValueType const& rhs) {
this->join( this->reference(), rhs );
}
KOKKOS_FORCEINLINE_FUNCTION void reset() {
this->init( this->reference() );
}
};

/* ScatterValue <Op=ScatterProd, contribution=ScatterAtomic> is the object returned by the access operator()
 * of ScatterAccess; similar to the value returned by an Atomic View, it wraps an atomic_prod with convenient
   operator*=, etc. atomic_prod uses atomic_compare_exchange. This version also has the update(rhs) and reset() functions. */
template <typename ValueType>
struct ScatterValue<ValueType, Kokkos::Experimental::ScatterProd, Kokkos::Experimental::ScatterAtomic> :
  Prod<ValueType,Kokkos::DefaultExecutionSpace> {
public:
KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ValueType& value_in) :
  Prod<ValueType,Kokkos::DefaultExecutionSpace>(value_in)
{}

KOKKOS_FORCEINLINE_FUNCTION void operator*=(ValueType const& rhs) {
this->join(this->reference(), rhs);
}
KOKKOS_FORCEINLINE_FUNCTION void operator/=(ValueType const& rhs) {
this->join(this->reference(), static_cast<ValueType>(1)/rhs);
}

KOKKOS_FORCEINLINE_FUNCTION
void atomic_prod(ValueType & dest, const ValueType& src) const {

bool success = false;
while(!success) {
ValueType dest_old = dest;
ValueType dest_new = dest_old * src;
dest_new = Kokkos::atomic_compare_exchange<ValueType>(&dest,dest_old,dest_new);
success = ( (dest_new - dest_old)/dest_old <= 1e-15 );
}
}

KOKKOS_INLINE_FUNCTION
void join(ValueType& dest, const ValueType& src) const {
atomic_prod(dest, src);
}

KOKKOS_INLINE_FUNCTION
void join(volatile ValueType& dest, const volatile ValueType& src) const {
atomic_prod(dest, src);
}

KOKKOS_FORCEINLINE_FUNCTION void update(ValueType const& rhs) {
this->join( this->reference(), rhs );
}
KOKKOS_FORCEINLINE_FUNCTION void reset() {
this->init( this->reference() );
}

};

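Aside: the atomic_prod above is a compare-exchange retry loop. Note that it declares success with a relative-tolerance test, whereas the textbook form compares the exchanged-out value bit-for-bit against what was read. The standard-C++ shape of the same pattern:

// Sketch only (standard C++, not the Kokkos device code).
#include <atomic>

void atomic_multiply(std::atomic<double>& dest, double src) {
  double old_val = dest.load();
  // compare_exchange_weak reloads old_val with the current value on failure,
  // so the loop naturally retries with fresh data until no other thread
  // changed the destination in between.
  while (!dest.compare_exchange_weak(old_val, old_val * src)) {
  }
}
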
/* ScatterValue <Op=ScatterMin, contribution=ScatterNonAtomic> is the object returned by the access operator() of ScatterAccess.
   This class inherits from the Min<> reducer and wraps join(dest, src) with a convenient update(rhs).
   Note the addition of update(ValueType const& rhs) and reset() so that all reducers share a common update function
   (see ReduceDuplicates and ResetDuplicates). */
template <typename ValueType>
struct ScatterValue<ValueType, Kokkos::Experimental::ScatterMin, Kokkos::Experimental::ScatterNonAtomic> :
  Min<ValueType,Kokkos::DefaultExecutionSpace> {
public:
KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ValueType& value_in) :
  Min<ValueType,Kokkos::DefaultExecutionSpace>(value_in)
{}
KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ScatterValue&& other) :
  Min<ValueType,Kokkos::DefaultExecutionSpace>(other.reference())
{}
KOKKOS_FORCEINLINE_FUNCTION void update(ValueType const& rhs) {
this->join( this->reference(), rhs );
}
KOKKOS_FORCEINLINE_FUNCTION void reset() {
this->init( this->reference() );
}
};

/* ScatterValue <Op=ScatterMin, contribution=ScatterAtomic> is the object returned by the access operator()
 * of ScatterAccess; similar to the value returned by an Atomic View, it wraps an atomic_min with the update(rhs)
   function. atomic_min uses atomic_compare_exchange. This version also has the reset() function. */
template <typename ValueType>
struct ScatterValue<ValueType, Kokkos::Experimental::ScatterMin, Kokkos::Experimental::ScatterAtomic> :
  Min<ValueType,Kokkos::DefaultExecutionSpace> {
public:
KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ValueType& value_in) :
  Min<ValueType,Kokkos::DefaultExecutionSpace>(value_in)
{}

KOKKOS_FORCEINLINE_FUNCTION
void atomic_min(ValueType & dest, const ValueType& src) const {

bool success = false;
while(!success) {
ValueType dest_old = dest;
ValueType dest_new = ( dest_old > src ) ? src : dest_old;
dest_new = Kokkos::atomic_compare_exchange<ValueType>(&dest,dest_old,dest_new);
success = ( (dest_new - dest_old)/dest_old <= 1e-15 );
}
}

KOKKOS_INLINE_FUNCTION
void join(ValueType& dest, const ValueType& src) const {
atomic_min(dest, src);
}

KOKKOS_INLINE_FUNCTION
void join(volatile ValueType& dest, const volatile ValueType& src) const {
atomic_min(dest, src);
}

KOKKOS_FORCEINLINE_FUNCTION void update(ValueType const& rhs) {
this->join( this->reference(), rhs );
}
KOKKOS_FORCEINLINE_FUNCTION void reset() {
this->init( this->reference() );
}

};

/* ScatterValue <Op=ScatterMax, contribution=ScatterNonAtomic> is the object returned by the access operator() of ScatterAccess.
   This class inherits from the Max<> reducer and wraps join(dest, src) with a convenient update(rhs).
   Note the addition of update(ValueType const& rhs) and reset() so that all reducers share a common update function
   (see ReduceDuplicates and ResetDuplicates). */
template <typename ValueType>
struct ScatterValue<ValueType, Kokkos::Experimental::ScatterMax, Kokkos::Experimental::ScatterNonAtomic> :
  Max<ValueType,Kokkos::DefaultExecutionSpace> {
public:
KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ValueType& value_in) :
  Max<ValueType,Kokkos::DefaultExecutionSpace>(value_in)
{}
KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ScatterValue&& other) :
  Max<ValueType,Kokkos::DefaultExecutionSpace>(other.reference())
{}
KOKKOS_FORCEINLINE_FUNCTION void update(ValueType const& rhs) {
this->join( this->reference(), rhs );
}
KOKKOS_FORCEINLINE_FUNCTION void reset() {
this->init( this->reference() );
}
};

/* ScatterValue <Op=ScatterMax, contribution=ScatterAtomic> is the object returned by the access operator()
 * of ScatterAccess; similar to the value returned by an Atomic View, it wraps an atomic_max with the update(rhs)
   function. atomic_max uses atomic_compare_exchange. This version also has the reset() function. */
template <typename ValueType>
struct ScatterValue<ValueType, Kokkos::Experimental::ScatterMax, Kokkos::Experimental::ScatterAtomic> :
  Max<ValueType,Kokkos::DefaultExecutionSpace> {
public:
KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ValueType& value_in) :
  Max<ValueType,Kokkos::DefaultExecutionSpace>(value_in)
{}

KOKKOS_FORCEINLINE_FUNCTION
void atomic_max(ValueType & dest, const ValueType& src) const {

bool success = false;
while(!success) {
ValueType dest_old = dest;
ValueType dest_new = ( dest_old < src ) ? src : dest_old;
dest_new = Kokkos::atomic_compare_exchange<ValueType>(&dest,dest_old,dest_new);
success = ( (dest_new - dest_old)/dest_old <= 1e-15 );
}
}

KOKKOS_INLINE_FUNCTION
void join(ValueType& dest, const ValueType& src) const {
atomic_max(dest, src);
}

KOKKOS_INLINE_FUNCTION
void join(volatile ValueType& dest, const volatile ValueType& src) const {
atomic_max(dest, src);
}

KOKKOS_FORCEINLINE_FUNCTION void update(ValueType const& rhs) {
this->join( this->reference(), rhs );
}
KOKKOS_FORCEINLINE_FUNCTION void reset() {
this->init( this->reference() );
}

};

/* DuplicatedDataType, given a View DataType, will create a new DataType

@ -226,6 +486,18 @@ struct DuplicatedDataType<T*, Kokkos::LayoutLeft> {
typedef typename DuplicatedDataType<T, Kokkos::LayoutLeft>::value_type* value_type;
};

/* Insert integer argument pack into array */

template<class T>
void args_to_array(size_t* array, int pos, T dim0) {
array[pos] = dim0;
}
template<class T, class ... Dims>
void args_to_array(size_t* array, int pos, T dim0, Dims ... dims) {
array[pos] = dim0;
args_to_array(array,pos+1,dims...);
}

/* Slice is just responsible for stuffing the correct number of Kokkos::ALL
   arguments on the correct side of the index in a call to subview() to get a
   subview where the index specified is the largest-stride one. */

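Aside: args_to_array above is plain recursive pack expansion; each call stores one dimension and forwards the rest. A self-contained usage sketch of the same recursion:

// Sketch only: mirrors args_to_array above.
#include <cstddef>
#include <cstdio>

template <class T>
void args_to_array(std::size_t* array, int pos, T dim0) {
  array[pos] = dim0;                       // base case: last argument
}
template <class T, class... Dims>
void args_to_array(std::size_t* array, int pos, T dim0, Dims... dims) {
  array[pos] = dim0;                       // store the head...
  args_to_array(array, pos + 1, dims...);  // ...recurse on the tail
}

int main() {
  std::size_t n[4] = {0, 0, 0, 0};
  args_to_array(n, 0, 10, 20, 30);         // n = {10, 20, 30, 0}
  std::printf("%zu %zu %zu %zu\n", n[0], n[1], n[2], n[3]);
}
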
@ -304,21 +576,26 @@ struct ReduceDuplicatesBase {
}
};

template <typename ExecSpace, typename ValueType>
struct ReduceDuplicates<ExecSpace, ValueType, Kokkos::Experimental::ScatterSum> :
  public ReduceDuplicatesBase<ExecSpace, ValueType, Kokkos::Experimental::ScatterSum>
/* ReduceDuplicates -- Perform reduction on destination array using strided source
 * Use ScatterValue<> specific to operation to wrap destination array so that
 * the reduction operation can be accessed via the update(rhs) function */
template <typename ExecSpace, typename ValueType, int Op>
struct ReduceDuplicates :
  public ReduceDuplicatesBase<ExecSpace, ValueType, Op>
{
typedef ReduceDuplicatesBase<ExecSpace, ValueType, Kokkos::Experimental::ScatterSum> Base;
typedef ReduceDuplicatesBase<ExecSpace, ValueType, Op> Base;
ReduceDuplicates(ValueType const* src_in, ValueType* dst_in, size_t stride_in, size_t start_in, size_t n_in, std::string const& name):
Base(src_in, dst_in, stride_in, start_in, n_in, name)
{}
KOKKOS_FORCEINLINE_FUNCTION void operator()(size_t i) const {
for (size_t j = Base::start; j < Base::n; ++j) {
Base::dst[i] += Base::src[i + Base::stride * j];
ScatterValue<ValueType, Op, Kokkos::Experimental::ScatterNonAtomic> sv(Base::dst[i]);
sv.update( Base::src[i + Base::stride * j] );
}
}
};

template <typename ExecSpace, typename ValueType, int Op>
struct ResetDuplicates;

@ -347,19 +624,24 @@ struct ResetDuplicatesBase {
}
};

template <typename ExecSpace, typename ValueType>
struct ResetDuplicates<ExecSpace, ValueType, Kokkos::Experimental::ScatterSum> :
  public ResetDuplicatesBase<ExecSpace, ValueType, Kokkos::Experimental::ScatterSum>
/* ResetDuplicates -- Perform reset on destination array
 * Use ScatterValue<> specific to operation to wrap destination array so that
 * the reset operation can be accessed via the reset() function */
template <typename ExecSpace, typename ValueType, int Op>
struct ResetDuplicates :
  public ResetDuplicatesBase<ExecSpace, ValueType, Op>
{
typedef ResetDuplicatesBase<ExecSpace, ValueType, Kokkos::Experimental::ScatterSum> Base;
typedef ResetDuplicatesBase<ExecSpace, ValueType, Op> Base;
ResetDuplicates(ValueType* data_in, size_t size_in, std::string const& name):
Base(data_in, size_in, name)
{}
KOKKOS_FORCEINLINE_FUNCTION void operator()(size_t i) const {
Base::data[i] = Kokkos::reduction_identity<ValueType>::sum();
ScatterValue<ValueType, Op, Kokkos::Experimental::ScatterNonAtomic> sv(Base::data[i]);
sv.reset();
}
};

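Aside: ResetDuplicates now delegates to ScatterValue::reset(), which fills each duplicate slot with the identity element of its operation, so untouched slots contribute nothing when duplicates are later combined. Kokkos exposes these identities directly; a tiny demonstration:

// Sketch only: the identity elements reset() relies on.
#include <Kokkos_Core.hpp>
#include <cstdio>

int main() {
  // reduction_identity<T> is a compile-time facility; Kokkos::initialize()
  // is not required just to query it.
  std::printf("sum  identity: %g\n", Kokkos::reduction_identity<double>::sum());   // 0
  std::printf("prod identity: %g\n", Kokkos::reduction_identity<double>::prod());  // 1
  std::printf("min  identity: %g\n", Kokkos::reduction_identity<double>::min());   // +DBL_MAX
  std::printf("max  identity: %g\n", Kokkos::reduction_identity<double>::max());   // -DBL_MAX
}
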
}}} // Kokkos::Impl::Experimental

namespace Kokkos {

@ -519,12 +801,22 @@ public:
typedef Kokkos::Impl::Experimental::ScatterValue<
original_value_type, Op, override_contribution> value_type;

KOKKOS_INLINE_FUNCTION
ScatterAccess() :
view(view_type()) {
}

KOKKOS_INLINE_FUNCTION
ScatterAccess(view_type const& view_in)
: view(view_in)
{
}

KOKKOS_INLINE_FUNCTION
~ScatterAccess()
{
}

template <typename ... Args>
KOKKOS_FORCEINLINE_FUNCTION
value_type operator()(Args ... args) const {

@ -608,7 +900,7 @@ public:
}

template <int override_contribution = contribution>
inline
KOKKOS_FORCEINLINE_FUNCTION
ScatterAccess<DataType, Op, ExecSpace, Kokkos::LayoutRight, ScatterDuplicated, contribution, override_contribution>
access() const {
return ScatterAccess<DataType, Op, ExecSpace, Kokkos::LayoutRight, ScatterDuplicated, contribution, override_contribution>{*this};

@ -729,14 +1021,14 @@ public:
: unique_token()
{
size_t arg_N[8] = {
original_view.extent(0),
original_view.extent(1),
original_view.extent(2),
original_view.extent(3),
original_view.extent(4),
original_view.extent(5),
original_view.extent(6),
0
original_view.rank>0?original_view.extent(0):KOKKOS_IMPL_CTOR_DEFAULT_ARG,
original_view.rank>1?original_view.extent(1):KOKKOS_IMPL_CTOR_DEFAULT_ARG,
original_view.rank>2?original_view.extent(2):KOKKOS_IMPL_CTOR_DEFAULT_ARG,
original_view.rank>3?original_view.extent(3):KOKKOS_IMPL_CTOR_DEFAULT_ARG,
original_view.rank>4?original_view.extent(4):KOKKOS_IMPL_CTOR_DEFAULT_ARG,
original_view.rank>5?original_view.extent(5):KOKKOS_IMPL_CTOR_DEFAULT_ARG,
original_view.rank>6?original_view.extent(6):KOKKOS_IMPL_CTOR_DEFAULT_ARG,
KOKKOS_IMPL_CTOR_DEFAULT_ARG
};
arg_N[internal_view_type::rank - 1] = unique_token.size();
internal_view = internal_view_type(

@ -748,14 +1040,28 @@ public:
}

template <typename ... Dims>
ScatterView(std::string const& name, Dims ... dims)
: internal_view(Kokkos::ViewAllocateWithoutInitializing(name), dims ..., unique_token.size())
{
ScatterView(std::string const& name, Dims ... dims) {
original_view_type original_view;
size_t arg_N[8] = {
original_view.rank>0?original_view.static_extent(0):KOKKOS_IMPL_CTOR_DEFAULT_ARG,
original_view.rank>1?original_view.static_extent(1):KOKKOS_IMPL_CTOR_DEFAULT_ARG,
original_view.rank>2?original_view.static_extent(2):KOKKOS_IMPL_CTOR_DEFAULT_ARG,
original_view.rank>3?original_view.static_extent(3):KOKKOS_IMPL_CTOR_DEFAULT_ARG,
original_view.rank>4?original_view.static_extent(4):KOKKOS_IMPL_CTOR_DEFAULT_ARG,
original_view.rank>5?original_view.static_extent(5):KOKKOS_IMPL_CTOR_DEFAULT_ARG,
original_view.rank>6?original_view.static_extent(6):KOKKOS_IMPL_CTOR_DEFAULT_ARG,
KOKKOS_IMPL_CTOR_DEFAULT_ARG
};
Kokkos::Impl::Experimental::args_to_array(arg_N,0,dims ...);
arg_N[internal_view_type::rank - 1] = unique_token.size();
internal_view = internal_view_type(Kokkos::ViewAllocateWithoutInitializing(name),
arg_N[0], arg_N[1], arg_N[2], arg_N[3],
arg_N[4], arg_N[5], arg_N[6], arg_N[7]);
reset();
}

template <int override_contribution = contribution>
inline
KOKKOS_FORCEINLINE_FUNCTION
ScatterAccess<DataType, Op, ExecSpace, Kokkos::LayoutLeft, ScatterDuplicated, contribution, override_contribution>
access() const {
return ScatterAccess<DataType, Op, ExecSpace, Kokkos::LayoutLeft, ScatterDuplicated, contribution, override_contribution>{*this};

@ -770,9 +1076,13 @@ public:
}

template <typename ... RP>
void contribute_into(View<DataType, RP...> const& dest) const
void contribute_into(View<RP...> const& dest) const
{
typedef View<DataType, RP...> dest_type;
typedef View<RP...> dest_type;
static_assert(std::is_same<
typename dest_type::value_type,
typename original_view_type::non_const_value_type>::value,
"ScatterView deep_copy destination has wrong value_type");
static_assert(std::is_same<
typename dest_type::array_layout,
Kokkos::LayoutLeft>::value,

@ -891,12 +1201,14 @@ public:
typedef Kokkos::Impl::Experimental::ScatterValue<
original_value_type, Op, override_contribution> value_type;

inline ScatterAccess(view_type const& view_in)
KOKKOS_FORCEINLINE_FUNCTION
ScatterAccess(view_type const& view_in)
: view(view_in)
, thread_id(view_in.unique_token.acquire()) {
}

inline ~ScatterAccess() {
KOKKOS_FORCEINLINE_FUNCTION
~ScatterAccess() {
if (thread_id != ~thread_id_type(0)) view.unique_token.release(thread_id);
}

@ -926,8 +1238,9 @@ private:
public:
// do need to allow moves though, for the common
// auto b = a.access();
// that assignment turns into a move constructor call
inline ScatterAccess(ScatterAccess&& other)
// that assignment turns into a move constructor call
KOKKOS_FORCEINLINE_FUNCTION
ScatterAccess(ScatterAccess&& other)
: view(other.view)
, thread_id(other.thread_id)
{

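To tie the pieces together, a hedged host-side usage sketch of the API these classes implement; the template arguments follow the create_scatter_view call used in the tests later in this commit:

// Sketch only: typical ScatterView round trip.
#include <Kokkos_Core.hpp>
#include <Kokkos_ScatterView.hpp>

void histogram(Kokkos::View<int*> bins, Kokkos::View<int*> samples) {
  using namespace Kokkos::Experimental;
  auto scatter = create_scatter_view<ScatterSum,
                                     ScatterDuplicated,
                                     ScatterNonAtomic>(bins);
  Kokkos::parallel_for(samples.extent(0), KOKKOS_LAMBDA(const int i) {
    auto access = scatter.access();   // per-thread ScatterAccess
    access(samples(i)) += 1;          // ScatterValue operator+=
  });
  contribute(bins, scatter);          // ReduceDuplicates under the hood
}
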
@ -437,9 +437,9 @@ public:
{
bool result = !erasable();
if (is_insertable_map && result) {
execution_space::fence();
execution_space().fence();
set_flag(erasable_idx);
execution_space::fence();
execution_space().fence();
}
return result;
}

@ -448,10 +448,10 @@ public:
{
bool result = erasable();
if (is_insertable_map && result) {
execution_space::fence();
execution_space().fence();
Impl::UnorderedMapErase<declared_map_type> f(*this);
f.apply();
execution_space::fence();
execution_space().fence();
reset_flag(erasable_idx);
}
return result;

@ -121,12 +121,12 @@ public:
if( DV::template need_sync<typename DV::t_dev::device_type>() ) {
set_functor_host f(DV::h_view,val);
parallel_for(n,f);
DV::t_host::execution_space::fence();
typename DV::t_host::execution_space().fence();
DV::template modify<typename DV::t_host::device_type>();
} else {
set_functor f(DV::d_view,val);
parallel_for(n,f);
DV::t_dev::execution_space::fence();
typename DV::t_dev::execution_space().fence();
DV::template modify<typename DV::t_dev::device_type>();
}
}

@ -86,6 +86,31 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
)
ENDIF()

IF(Kokkos_ENABLE_HPX)
TRIBITS_ADD_EXECUTABLE_AND_TEST(
UnitTest_HPX
SOURCES
UnitTestMain.cpp
hpx/TestHPX_BitSet.cpp
hpx/TestHPX_DualView.cpp
hpx/TestHPX_DynamicView.cpp
hpx/TestHPX_DynRankViewAPI_generic.cpp
hpx/TestHPX_DynRankViewAPI_rank12345.cpp
hpx/TestHPX_DynRankViewAPI_rank67.cpp
hpx/TestHPX_ErrorReporter.cpp
hpx/TestHPX_OffsetView.cpp
hpx/TestHPX_ScatterView.cpp
hpx/TestHPX_StaticCrsGraph.cpp
hpx/TestHPX_UnorderedMap.cpp
hpx/TestHPX_Vector.cpp
hpx/TestHPX_ViewCtorPropEmbeddedDim.cpp
COMM serial mpi
NUM_MPI_PROCS 1
FAIL_REGULAR_EXPRESSION "  FAILED  "
TESTONLYLIBS kokkos_gtest ${TEST_LINK_TARGETS}
)
ENDIF()

IF(Kokkos_ENABLE_Cuda)
TRIBITS_ADD_EXECUTABLE_AND_TEST(
UnitTest_Cuda

@ -4,6 +4,7 @@ GTEST_PATH = ../../TPL/gtest

vpath %.cpp ${KOKKOS_PATH}/containers/unit_tests
vpath %.cpp ${KOKKOS_PATH}/containers/unit_tests/openmp
vpath %.cpp ${KOKKOS_PATH}/containers/unit_tests/hpx
vpath %.cpp ${KOKKOS_PATH}/containers/unit_tests/serial
vpath %.cpp ${KOKKOS_PATH}/containers/unit_tests/threads
vpath %.cpp ${KOKKOS_PATH}/containers/unit_tests/rocm

@ -106,6 +107,25 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
TEST_TARGETS += test-openmp
endif

ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1)
OBJ_HPX = UnitTestMain.o gtest-all.o
OBJ_HPX += TestHPX_BitSet.o
OBJ_HPX += TestHPX_DualView.o
OBJ_HPX += TestHPX_DynamicView.o
OBJ_HPX += TestHPX_DynRankViewAPI_generic.o
OBJ_HPX += TestHPX_DynRankViewAPI_rank12345.o
OBJ_HPX += TestHPX_DynRankViewAPI_rank67.o
OBJ_HPX += TestHPX_ErrorReporter.o
OBJ_HPX += TestHPX_OffsetView.o
OBJ_HPX += TestHPX_ScatterView.o
OBJ_HPX += TestHPX_StaticCrsGraph.o
OBJ_HPX += TestHPX_UnorderedMap.o
OBJ_HPX += TestHPX_Vector.o
OBJ_HPX += TestHPX_ViewCtorPropEmbeddedDim.o
TARGETS += KokkosContainers_UnitTest_HPX
TEST_TARGETS += test-hpx
endif

ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
OBJ_SERIAL = UnitTestMain.o gtest-all.o
OBJ_SERIAL += TestSerial_BitSet.o

@ -137,6 +157,9 @@ KokkosContainers_UnitTest_Threads: $(OBJ_THREADS) $(KOKKOS_LINK_DEPENDS)
KokkosContainers_UnitTest_OpenMP: $(OBJ_OPENMP) $(KOKKOS_LINK_DEPENDS)
	$(LINK) $(EXTRA_PATH) $(OBJ_OPENMP) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosContainers_UnitTest_OpenMP

KokkosContainers_UnitTest_HPX: $(OBJ_HPX) $(KOKKOS_LINK_DEPENDS)
	$(LINK) $(EXTRA_PATH) $(OBJ_HPX) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosContainers_UnitTest_HPX

KokkosContainers_UnitTest_Serial: $(OBJ_SERIAL) $(KOKKOS_LINK_DEPENDS)
	$(LINK) $(EXTRA_PATH) $(OBJ_SERIAL) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosContainers_UnitTest_Serial

@ -152,6 +175,9 @@ test-threads: KokkosContainers_UnitTest_Threads
test-openmp: KokkosContainers_UnitTest_OpenMP
	./KokkosContainers_UnitTest_OpenMP

test-hpx: KokkosContainers_UnitTest_HPX
	./KokkosContainers_UnitTest_HPX

test-serial: KokkosContainers_UnitTest_Serial
	./KokkosContainers_UnitTest_Serial

@ -66,7 +66,7 @@ struct TestBitset

unsigned testit(unsigned collisions)
{
execution_space::fence();
execution_space().fence();

unsigned count = 0;
Kokkos::parallel_reduce( m_bitset.size()*collisions, *this, count);

@ -114,7 +114,7 @@ struct TestBitsetTest

unsigned testit()
{
execution_space::fence();
execution_space().fence();

unsigned count = 0;
Kokkos::parallel_reduce( m_bitset.size(), *this, count);

@ -151,7 +151,7 @@ struct TestBitsetAny

unsigned testit()
{
execution_space::fence();
execution_space().fence();

unsigned count = 0;
Kokkos::parallel_reduce( m_bitset.size(), *this, count);

@ -1276,6 +1276,7 @@ public:
Kokkos::deep_copy( dx , hx );
Kokkos::deep_copy( dy , dx );
Kokkos::deep_copy( hy , dy );
Kokkos::fence();

for ( size_t ip = 0 ; ip < N0 ; ++ip ) {
for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {

@ -1286,6 +1287,7 @@ public:

Kokkos::deep_copy( dx , T(0) );
Kokkos::deep_copy( hx , dx );
Kokkos::fence();

for ( size_t ip = 0 ; ip < N0 ; ++ip ) {
for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {

@ -162,6 +162,7 @@ struct ErrorReporterDriver : public ErrorReporterDriverBase<DeviceType>
void execute(int reporter_capacity, int test_size)
{
Kokkos::parallel_for(Kokkos::RangePolicy<execution_space>(0,test_size), *this);
Kokkos::fence();
driver_base::check_expectations(reporter_capacity, test_size);
}

@ -194,6 +195,7 @@ struct ErrorReporterDriverUseLambda : public ErrorReporterDriverBase<DeviceType>
driver_base::m_errorReporter.add_report(work_idx, report);
}
});
Kokkos::fence();
driver_base::check_expectations(reporter_capacity, test_size);
}

@ -48,79 +48,387 @@

namespace Test {

template <typename ExecSpace, typename Layout, int duplication, int contribution, int op>
struct test_scatter_view_impl_cls;

template <typename ExecSpace, typename Layout, int duplication, int contribution>
void test_scatter_view_config(int n)
struct test_scatter_view_impl_cls<ExecSpace, Layout, duplication, contribution, Kokkos::Experimental::ScatterSum>
{
Kokkos::View<double *[3], Layout, ExecSpace> original_view("original_view", n);
{
auto scatter_view = Kokkos::Experimental::create_scatter_view
  < Kokkos::Experimental::ScatterSum
  , duplication
  , contribution
  > (original_view);
#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
auto policy = Kokkos::RangePolicy<ExecSpace, int>(0, n);
auto f = KOKKOS_LAMBDA(int i) {
public:

typedef Kokkos::Experimental::ScatterView
  < double*[3]
  , Layout
  , ExecSpace
  , Kokkos::Experimental::ScatterSum
  , duplication
  , contribution
  > scatter_view_type;

typedef Kokkos::View<double *[3], Layout, ExecSpace> orig_view_type;

scatter_view_type scatter_view;
int scatterSize;

test_scatter_view_impl_cls(const scatter_view_type& view){
scatter_view = view;
scatterSize = 0;
}

void initialize(orig_view_type orig) {
auto host_view = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), orig);
Kokkos::fence();
for (typename decltype(host_view)::size_type i = 0; i < host_view.extent(0); ++i) {
host_view(i, 0) = 0.0;
host_view(i, 1) = 0.0;
host_view(i, 2) = 0.0;
}
Kokkos::fence();
Kokkos::deep_copy(orig, host_view);
}

void run_parallel(int n) {
scatterSize = n;
auto policy = Kokkos::RangePolicy<ExecSpace, int>(0, n);
Kokkos::parallel_for(policy, *this, "scatter_view_test: Sum");
}

KOKKOS_INLINE_FUNCTION
void operator()(int i) const {
auto scatter_access = scatter_view.access();
auto scatter_access_atomic = scatter_view.template access<Kokkos::Experimental::ScatterAtomic>();
for (int j = 0; j < 10; ++j) {
auto k = (i + j) % n;
auto k = (i + j) % scatterSize;
scatter_access(k, 0) += 4.2;
scatter_access_atomic(k, 1) += 2.0;
scatter_access(k, 2) += 1.0;
}
};
Kokkos::parallel_for(policy, f, "scatter_view_test");
#endif
Kokkos::Experimental::contribute(original_view, scatter_view);
scatter_view.reset_except(original_view);
#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
Kokkos::parallel_for(policy, f, "scatter_view_test");
#endif
Kokkos::Experimental::contribute(original_view, scatter_view);
}
#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
Kokkos::fence();
auto host_view = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), original_view);
Kokkos::fence();
for (typename decltype(host_view)::size_type i = 0; i < host_view.extent(0); ++i) {
auto val0 = host_view(i, 0);
auto val1 = host_view(i, 1);
auto val2 = host_view(i, 2);
EXPECT_TRUE(std::fabs((val0 - 84.0) / 84.0) < 1e-15);
EXPECT_TRUE(std::fabs((val1 - 40.0) / 40.0) < 1e-15);
EXPECT_TRUE(std::fabs((val2 - 20.0) / 20.0) < 1e-15);
}
#endif
{
Kokkos::Experimental::ScatterView
  < double*[3]
  , Layout
  , ExecSpace
  , Kokkos::Experimental::ScatterSum
  , duplication
  , contribution
  >
persistent_view("persistent", n);
auto result_view = persistent_view.subview();
contribute(result_view, persistent_view);
}
}
}

template <typename ExecSpace>
void validateResults(orig_view_type orig) {
auto host_view = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), orig);
Kokkos::fence();
for (typename decltype(host_view)::size_type i = 0; i < host_view.extent(0); ++i) {
auto val0 = host_view(i, 0);
auto val1 = host_view(i, 1);
auto val2 = host_view(i, 2);
EXPECT_TRUE(std::fabs((val0 - 84.0) / 84.0) < 1e-14);
EXPECT_TRUE(std::fabs((val1 - 40.0) / 40.0) < 1e-14);
EXPECT_TRUE(std::fabs((val2 - 20.0) / 20.0) < 1e-14);
}
}
};

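Aside: a hypothetical driver showing how these per-operation test structs are meant to be exercised. The member names initialize/run_parallel/validateResults come from the structs above; the ScatterView-from-View constructor is an assumption:

// Sketch only (hypothetical driver, not part of the diff).
template <class ExecSpace, class Layout, int duplication, int contribution, int op>
void drive_one_config(int n) {
  typedef Test::test_scatter_view_impl_cls<ExecSpace, Layout,
                                           duplication, contribution, op> impl_t;
  typename impl_t::orig_view_type orig("original_view", n);
  typename impl_t::scatter_view_type scatter(orig);  // assumed ctor: duplicate storage
  impl_t test(scatter);
  test.initialize(orig);        // fill with the op's starting values
  test.run_parallel(n);         // scatter contributions
  Kokkos::Experimental::contribute(orig, scatter);   // combine duplicates
  Kokkos::fence();
  test.validateResults(orig);   // host-side checks
}
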
template <typename ExecSpace, typename Layout, int duplication, int contribution>
struct test_scatter_view_impl_cls<ExecSpace, Layout, duplication, contribution, Kokkos::Experimental::ScatterProd>
{
public:

typedef Kokkos::Experimental::ScatterView
  < double*[3]
  , Layout
  , ExecSpace
  , Kokkos::Experimental::ScatterProd
  , duplication
  , contribution
  > scatter_view_type;

typedef Kokkos::View<double *[3], Layout, ExecSpace> orig_view_type;

scatter_view_type scatter_view;
int scatterSize;

test_scatter_view_impl_cls(const scatter_view_type& view){
scatter_view = view;
scatterSize = 0;
}

void initialize(orig_view_type orig) {
auto host_view = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), orig);
Kokkos::fence();
for (typename decltype(host_view)::size_type i = 0; i < host_view.extent(0); ++i) {
host_view(i, 0) = 1.0;
host_view(i, 1) = 1.0;
host_view(i, 2) = 1.0;
}
Kokkos::fence();
Kokkos::deep_copy(orig, host_view);
}

void run_parallel(int n) {
scatterSize = n;
auto policy = Kokkos::RangePolicy<ExecSpace, int>(0, n);
Kokkos::parallel_for(policy, *this, "scatter_view_test: Prod");
}

KOKKOS_INLINE_FUNCTION
void operator()(int i) const {
auto scatter_access = scatter_view.access();
auto scatter_access_atomic = scatter_view.template access<Kokkos::Experimental::ScatterAtomic>();
for (int j = 0; j < 4; ++j) {
auto k = (i + j) % scatterSize;
scatter_access(k, 0) *= 4.0;
scatter_access_atomic(k, 1) *= 2.0;
scatter_access(k, 2) *= 1.0;
}
}

void validateResults(orig_view_type orig) {
auto host_view = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), orig);
Kokkos::fence();
for (typename decltype(host_view)::size_type i = 0; i < host_view.extent(0); ++i) {
auto val0 = host_view(i, 0);
auto val1 = host_view(i, 1);
auto val2 = host_view(i, 2);
EXPECT_TRUE(std::fabs((val0 - 65536.0) / 65536.0) < 1e-14);
EXPECT_TRUE(std::fabs((val1 - 256.0) / 256.0) < 1e-14);
EXPECT_TRUE(std::fabs((val2 - 1.0) / 1.0) < 1e-14);
}
}
};

template <typename ExecSpace, typename Layout, int duplication, int contribution>
|
||||
struct test_scatter_view_impl_cls<ExecSpace, Layout, duplication, contribution, Kokkos::Experimental::ScatterMin>
|
||||
{
|
||||
public:
|
||||
|
||||
typedef Kokkos::Experimental::ScatterView
|
||||
< double*[3]
|
||||
, Layout
|
||||
, ExecSpace
|
||||
, Kokkos::Experimental::ScatterMin
|
||||
, duplication
|
||||
, contribution
|
||||
> scatter_view_type;
|
||||
|
||||
typedef Kokkos::View<double *[3], Layout, ExecSpace> orig_view_type;
|
||||
|
||||
|
||||
scatter_view_type scatter_view;
|
||||
int scatterSize;
|
||||
|
||||
test_scatter_view_impl_cls(const scatter_view_type& view){
|
||||
scatter_view = view;
|
||||
scatterSize = 0;
|
||||
}
|
||||
|
||||
void initialize(orig_view_type orig) {
|
||||
auto host_view = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), orig);
|
||||
Kokkos::fence();
|
||||
for (typename decltype(host_view)::size_type i = 0; i < host_view.extent(0); ++i) {
|
||||
host_view(i, 0) = 999999.0;
|
||||
host_view(i, 1) = 999999.0;
|
||||
host_view(i, 2) = 999999.0;
|
||||
}
|
||||
Kokkos::fence();
|
||||
Kokkos::deep_copy(orig, host_view);
|
||||
}
|
||||
|
||||
void run_parallel(int n) {
|
||||
scatterSize = n;
|
||||
auto policy = Kokkos::RangePolicy<ExecSpace, int>(0, n);
|
||||
Kokkos::parallel_for(policy, *this, "scatter_view_test: Prod");
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator()(int i) const {
|
||||
auto scatter_access = scatter_view.access();
|
||||
auto scatter_access_atomic = scatter_view.template access<Kokkos::Experimental::ScatterAtomic>();
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
auto k = (i + j) % scatterSize;
|
||||
scatter_access(k, 0).update((double)(j+1)*4);
|
||||
scatter_access_atomic(k, 1).update((double)(j+1)*2.0);
|
||||
scatter_access(k, 2).update((double)(j+1)*1.0);
|
||||
}
|
||||
}
|
||||
|
||||
void validateResults(orig_view_type orig) {
|
||||
auto host_view = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), orig);
|
||||
Kokkos::fence();
|
||||
for (typename decltype(host_view)::size_type i = 0; i < host_view.extent(0); ++i) {
|
||||
auto val0 = host_view(i, 0);
|
||||
auto val1 = host_view(i, 1);
|
||||
auto val2 = host_view(i, 2);
|
||||
EXPECT_TRUE(std::fabs((val0 - 4.0) / 4.0) < 1e-14);
|
||||
EXPECT_TRUE(std::fabs((val1 - 2.0) / 2.0) < 1e-14);
|
||||
EXPECT_TRUE(std::fabs((val2 - 1.0) / 1.0) < 1e-14);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
template <typename ExecSpace, typename Layout, int duplication, int contribution>
|
||||
struct test_scatter_view_impl_cls<ExecSpace, Layout, duplication, contribution, Kokkos::Experimental::ScatterMax>
|
||||
{
|
||||
public:
|
||||
|
||||
typedef Kokkos::Experimental::ScatterView
|
||||
< double*[3]
|
||||
, Layout
|
||||
, ExecSpace
|
||||
, Kokkos::Experimental::ScatterMax
|
||||
, duplication
|
||||
, contribution
|
||||
> scatter_view_type;
|
||||
|
||||
typedef Kokkos::View<double *[3], Layout, ExecSpace> orig_view_type;
|
||||
|
||||
|
||||
scatter_view_type scatter_view;
|
||||
int scatterSize;
|
||||
|
||||
test_scatter_view_impl_cls(const scatter_view_type& view){
|
||||
scatter_view = view;
|
||||
scatterSize = 0;
|
||||
}
|
||||
|
||||
void initialize(orig_view_type orig) {
|
||||
auto host_view = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), orig);
|
||||
Kokkos::fence();
|
||||
for (typename decltype(host_view)::size_type i = 0; i < host_view.extent(0); ++i) {
|
||||
host_view(i, 0) = 0.0;
|
||||
host_view(i, 1) = 0.0;
|
||||
host_view(i, 2) = 0.0;
|
||||
}
|
||||
Kokkos::fence();
|
||||
Kokkos::deep_copy(orig, host_view);
|
||||
}
|
||||
|
||||
void run_parallel(int n) {
|
||||
scatterSize = n;
|
||||
auto policy = Kokkos::RangePolicy<ExecSpace, int>(0, n);
|
||||
Kokkos::parallel_for(policy, *this, "scatter_view_test: Prod");
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator()(int i) const {
|
||||
auto scatter_access = scatter_view.access();
|
||||
auto scatter_access_atomic = scatter_view.template access<Kokkos::Experimental::ScatterAtomic>();
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
auto k = (i + j) % scatterSize;
|
||||
scatter_access(k, 0).update((double)(j+1)*4);
|
||||
scatter_access_atomic(k, 1).update((double)(j+1)*2.0);
|
||||
scatter_access(k, 2).update((double)(j+1)*1.0);
|
||||
}
|
||||
}
|
||||
|
||||
void validateResults(orig_view_type orig) {
|
||||
auto host_view = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), orig);
|
||||
Kokkos::fence();
|
||||
for (typename decltype(host_view)::size_type i = 0; i < host_view.extent(0); ++i) {
|
||||
auto val0 = host_view(i, 0);
|
||||
auto val1 = host_view(i, 1);
|
||||
auto val2 = host_view(i, 2);
|
||||
EXPECT_TRUE(std::fabs((val0 - 16.0) / 16.0) < 1e-14);
|
||||
EXPECT_TRUE(std::fabs((val1 - 8.0) / 8.0) < 1e-14);
|
||||
EXPECT_TRUE(std::fabs((val2 - 4.0) / 4.0) < 1e-14);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
||||
template <typename ExecSpace, typename Layout, int duplication, int contribution, int op>
struct test_scatter_view_config
{
public:
  typedef typename test_scatter_view_impl_cls<ExecSpace, Layout,
    duplication, contribution, op>::scatter_view_type scatter_view_def;
  typedef typename test_scatter_view_impl_cls<ExecSpace, Layout,
    duplication, contribution, op>::orig_view_type orig_view_def;

  test_scatter_view_config() {
  }

  void run_test(int n)
  {
    //Test creation via create_scatter_view
    {
      orig_view_def original_view("original_view", n);
      scatter_view_def scatter_view = Kokkos::Experimental::create_scatter_view
        < op
        , duplication
        , contribution
        > (original_view);

      test_scatter_view_impl_cls<ExecSpace, Layout, duplication, contribution, op> scatter_view_test_impl(scatter_view);
      scatter_view_test_impl.initialize(original_view);
      scatter_view_test_impl.run_parallel(n);

      Kokkos::Experimental::contribute(original_view, scatter_view);
      scatter_view.reset_except(original_view);

      scatter_view_test_impl.run_parallel(n);

      Kokkos::Experimental::contribute(original_view, scatter_view);
      Kokkos::fence();

      scatter_view_test_impl.validateResults(original_view);

      {
        scatter_view_def persistent_view("persistent", n);
        auto result_view = persistent_view.subview();
        contribute(result_view, persistent_view);
        Kokkos::fence();
      }
    }
    //Test creation via constructor
    {
      orig_view_def original_view("original_view", n);
      scatter_view_def scatter_view(original_view);

      test_scatter_view_impl_cls<ExecSpace, Layout, duplication, contribution, op> scatter_view_test_impl(scatter_view);
      scatter_view_test_impl.initialize(original_view);
      scatter_view_test_impl.run_parallel(n);

      Kokkos::Experimental::contribute(original_view, scatter_view);
      scatter_view.reset_except(original_view);

      scatter_view_test_impl.run_parallel(n);

      Kokkos::Experimental::contribute(original_view, scatter_view);
      Kokkos::fence();

      scatter_view_test_impl.validateResults(original_view);

      {
        scatter_view_def persistent_view("persistent", n);
        auto result_view = persistent_view.subview();
        contribute(result_view, persistent_view);
        Kokkos::fence();
      }
    }
  }

};

template <typename ExecSpace, int ScatterType>
struct TestDuplicatedScatterView {
  TestDuplicatedScatterView(int n) {
    // ScatterSum test
    test_scatter_view_config<ExecSpace, Kokkos::LayoutRight,
      Kokkos::Experimental::ScatterDuplicated,
      Kokkos::Experimental::ScatterNonAtomic>(n);
      Kokkos::Experimental::ScatterNonAtomic,
      ScatterType> test_sv_right_config;
    test_sv_right_config.run_test(n);
    test_scatter_view_config<ExecSpace, Kokkos::LayoutLeft,
      Kokkos::Experimental::ScatterDuplicated,
      Kokkos::Experimental::ScatterNonAtomic,
      ScatterType> test_sv_left_config;
    test_sv_left_config.run_test(n);
  }
};

#ifdef KOKKOS_ENABLE_CUDA
// disable duplicated instantiation with CUDA until
// UniqueToken can support it
template <>
struct TestDuplicatedScatterView<Kokkos::Cuda> {
template <int ScatterType>
struct TestDuplicatedScatterView<Kokkos::Cuda, ScatterType> {
  TestDuplicatedScatterView(int) {
  }
};

@ -129,14 +437,14 @@ struct TestDuplicatedScatterView<Kokkos::Cuda> {

#ifdef KOKKOS_ENABLE_ROCM
// disable duplicated instantiation with ROCm until
// UniqueToken can support it
template <>
struct TestDuplicatedScatterView<Kokkos::Experimental::ROCm> {
template <int ScatterType>
struct TestDuplicatedScatterView<Kokkos::Experimental::ROCm, ScatterType> {
  TestDuplicatedScatterView(int) {
  }
};
#endif

template <typename ExecSpace>
template <typename ExecSpace, int ScatterType>
void test_scatter_view(int n)
{
  // all of these configurations should compile okay, but only some of them are

@ -149,29 +457,47 @@ void test_scatter_view(int n)

  if (unique_token.size() == 1) {
    test_scatter_view_config<ExecSpace, Kokkos::LayoutRight,
      Kokkos::Experimental::ScatterNonDuplicated,
      Kokkos::Experimental::ScatterNonAtomic>(n);
      Kokkos::Experimental::ScatterNonAtomic,
      ScatterType> test_sv_config;
    test_sv_config.run_test(n);
  }
#ifdef KOKKOS_ENABLE_SERIAL
  if (!std::is_same<ExecSpace, Kokkos::Serial>::value) {
#endif
    test_scatter_view_config<ExecSpace, Kokkos::LayoutRight,
      Kokkos::Experimental::ScatterNonDuplicated,
      Kokkos::Experimental::ScatterAtomic>(n);
      Kokkos::Experimental::ScatterAtomic,
      ScatterType> test_sv_config;
    test_sv_config.run_test(n);
#ifdef KOKKOS_ENABLE_SERIAL
  }
#endif

  TestDuplicatedScatterView<ExecSpace> duptest(n);
  // with hundreds of threads we were running out of memory.
  // limit (n) so that duplication doesn't exceed 8GB
  constexpr std::size_t maximum_allowed_total_bytes = 8ull * 1024ull * 1024ull * 1024ull;
  std::size_t const maximum_allowed_copy_bytes = maximum_allowed_total_bytes / std::size_t(unique_token.size());
  constexpr std::size_t bytes_per_value = sizeof(double) * 3;
  std::size_t const maximum_allowed_copy_values = maximum_allowed_copy_bytes / bytes_per_value;
  n = std::min(n, int(maximum_allowed_copy_values));
  TestDuplicatedScatterView<ExecSpace, ScatterType> duptest(n);
}

TEST_F( TEST_CATEGORY, scatterview) {
#ifndef KOKKOS_ENABLE_ROCM
  test_scatter_view<TEST_EXECSPACE>(10);
  test_scatter_view<TEST_EXECSPACE, Kokkos::Experimental::ScatterSum>(10);
  test_scatter_view<TEST_EXECSPACE, Kokkos::Experimental::ScatterProd>(10);
  test_scatter_view<TEST_EXECSPACE, Kokkos::Experimental::ScatterMin>(10);
  test_scatter_view<TEST_EXECSPACE, Kokkos::Experimental::ScatterMax>(10);
  // tests were timing out in DEBUG mode, reduce the amount of work
#ifdef KOKKOS_ENABLE_DEBUG
  test_scatter_view<TEST_EXECSPACE>(100000);
  int big_n = 100 * 1000;
#else
  test_scatter_view<TEST_EXECSPACE>(10000000);
  int big_n = 10 * 1000 * 1000;
#endif
  test_scatter_view<TEST_EXECSPACE,Kokkos::Experimental::ScatterSum>(big_n);
  test_scatter_view<TEST_EXECSPACE,Kokkos::Experimental::ScatterProd>(big_n);
  test_scatter_view<TEST_EXECSPACE,Kokkos::Experimental::ScatterMin>(big_n);
  test_scatter_view<TEST_EXECSPACE,Kokkos::Experimental::ScatterMax>(big_n);
#endif
}
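The tests above exercise the ScatterView workflow of scatter, contribute, reset_except, scatter again, contribute. As a minimal sketch of that workflow outside the test harness, assuming the defaulted ScatterSum configuration (the view names and sizes below are illustrative, not part of this change):

#include <Kokkos_Core.hpp>
#include <Kokkos_ScatterView.hpp>

// Minimal scatter-add sketch: threads accumulate into a ScatterView, which
// resolves collisions by duplication or atomics, and the contributions are
// then folded back into the original view.
void scatter_sum_sketch(int n) {
  Kokkos::View<double*> totals("totals", n);
  auto scatter = Kokkos::Experimental::create_scatter_view(totals);
  Kokkos::parallel_for("accumulate", 10 * n, KOKKOS_LAMBDA(const int i) {
    auto access = scatter.access();
    access(i % n) += 1.0;  // indices collide across threads on purpose
  });
  Kokkos::Experimental::contribute(totals, scatter);  // reduce into 'totals'
}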
@ -69,7 +69,7 @@ struct TestInsert

  void testit( bool rehash_on_fail = true )
  {
    execution_space::fence();
    execution_space().fence();

    uint32_t failed_count = 0;
    do {

@ -82,7 +82,7 @@ struct TestInsert
      }
    } while (rehash_on_fail && failed_count > 0u);

    execution_space::fence();
    execution_space().fence();
  }

@ -122,9 +122,9 @@ struct TestInsert

  void testit()
  {
    execution_space::fence();
    execution_space().fence();
    Kokkos::parallel_for(m_num_erase, *this);
    execution_space::fence();
    execution_space().fence();
  }

  KOKKOS_INLINE_FUNCTION

@ -161,9 +161,9 @@ struct TestInsert

  void testit(value_type &errors)
  {
    execution_space::execution_space::fence();
    execution_space().fence();
    Kokkos::parallel_reduce(m_map.capacity(), *this, errors);
    execution_space::execution_space::fence();
    execution_space().fence();
  }

  KOKKOS_INLINE_FUNCTION

@ -247,7 +247,7 @@ void test_failed_insert( uint32_t num_nodes)
  map_type map(num_nodes);
  Impl::TestInsert<map_type> test_insert(map, 2u*num_nodes, 1u);
  test_insert.testit(false /*don't rehash on fail*/);
  Device::execution_space::fence();
  typename Device::execution_space().fence();

  EXPECT_TRUE( map.failed_insert() );
}
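The recurring edit in these hunks, execution_space::fence() becoming execution_space().fence(), replaces the static fence with a fence on a default-constructed execution-space instance. A short sketch of the instance-based idiom, with an illustrative functor that is not taken from this diff:

#include <Kokkos_Core.hpp>

// Fence on an execution-space instance rather than through a static member.
template <class ExecSpace>
void run_and_wait(int n) {
  Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace>(0, n),
                       KOKKOS_LAMBDA(const int /*i*/) { /* device work */ });
  ExecSpace().fence();  // blocks until work on this space has completed
}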
@ -0,0 +1,47 @@

/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/

#include<hpx/TestHPX_Category.hpp>
#include<TestBitset.hpp>
@ -0,0 +1,65 @@

/* ... standard Kokkos BSD license header, identical to the one above ... */

#ifndef KOKKOS_TEST_HPX_HPP
#define KOKKOS_TEST_HPX_HPP

#include <gtest/gtest.h>

namespace Test {

class hpx : public ::testing::Test {
protected:
  static void SetUpTestCase() {
  }

  static void TearDownTestCase() {
  }
};

} // namespace Test

#define TEST_CATEGORY hpx
#define TEST_EXECSPACE Kokkos::Experimental::HPX

#endif
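A sketch of how this category header is meant to be consumed: each shared test source includes it first, so the TEST_CATEGORY and TEST_EXECSPACE macros bind the backend-agnostic tests to HPX (the test body below is illustrative, not part of the diff):

#include <hpx/TestHPX_Category.hpp>

namespace Test {
// TEST_CATEGORY expands to hpx, so this registers as TEST_F(hpx, smoke).
TEST_F(TEST_CATEGORY, smoke) {
  TEST_EXECSPACE().fence();  // a Kokkos::Experimental::HPX instance
  SUCCEED();
}
} // namespace Test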
@ -0,0 +1,47 @@

/* ... standard Kokkos BSD license header, identical to the one above ... */

#include<hpx/TestHPX_Category.hpp>
#include<TestDualView.hpp>

@ -0,0 +1,47 @@

/* ... standard Kokkos BSD license header, identical to the one above ... */

#include<hpx/TestHPX_Category.hpp>
#include<TestDynViewAPI_generic.hpp>

@ -0,0 +1,47 @@

/* ... standard Kokkos BSD license header, identical to the one above ... */

#include<hpx/TestHPX_Category.hpp>
#include<TestDynViewAPI_rank12345.hpp>

@ -0,0 +1,47 @@

/* ... standard Kokkos BSD license header, identical to the one above ... */

#include<hpx/TestHPX_Category.hpp>
#include<TestDynViewAPI_rank67.hpp>

@ -0,0 +1,47 @@

/* ... standard Kokkos BSD license header, identical to the one above ... */

#include<hpx/TestHPX_Category.hpp>
#include<TestDynamicView.hpp>

@ -0,0 +1,47 @@

/* ... standard Kokkos BSD license header, identical to the one above ... */

#include<hpx/TestHPX_Category.hpp>
#include<TestErrorReporter.hpp>

@ -0,0 +1,47 @@

/* ... standard Kokkos BSD license header, identical to the one above ... */

#include<hpx/TestHPX_Category.hpp>
#include<TestOffsetView.hpp>

@ -0,0 +1,47 @@

/* ... standard Kokkos BSD license header, identical to the one above ... */

#include<hpx/TestHPX_Category.hpp>
#include<TestScatterView.hpp>

@ -0,0 +1,47 @@

/* ... standard Kokkos BSD license header, identical to the one above ... */

#include<hpx/TestHPX_Category.hpp>
#include<TestStaticCrsGraph.hpp>

@ -0,0 +1,47 @@

/* ... standard Kokkos BSD license header, identical to the one above ... */

#include<hpx/TestHPX_Category.hpp>
#include<TestUnorderedMap.hpp>

@ -0,0 +1,47 @@

/* ... standard Kokkos BSD license header, identical to the one above ... */

#include<hpx/TestHPX_Category.hpp>
#include<TestVector.hpp>

@ -0,0 +1,47 @@

/* ... standard Kokkos BSD license header, identical to the one above ... */

#include<hpx/TestHPX_Category.hpp>
#include<TestViewCtorPropEmbeddedDim.hpp>
@ -1,5 +1,5 @@
TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
  LIB_OPTIONAL_TPLS Pthread CUDA HWLOC QTHREADS DLlib
  LIB_OPTIONAL_TPLS Pthread CUDA HWLOC QTHREADS DLlib HPX
  TEST_OPTIONAL_TPLS CUSPARSE
)

@ -47,6 +47,7 @@ TRIBITS_ADD_EXECUTABLE(
  PerformanceTest_TaskDAG
  SOURCES test_taskdag.cpp
  COMM serial mpi
  TESTONLYLIBS kokkos_gtest ${TEST_LINK_TARGETS}
)

TRIBITS_ADD_TEST(

@ -30,6 +30,7 @@ TARGETS =
#

OBJ_PERF = PerfTestMain.o gtest-all.o
OBJ_PERF += PerfTest_ExecSpacePartitioning.o
OBJ_PERF += PerfTestGramSchmidt.o
OBJ_PERF += PerfTestHexGrad.o
OBJ_PERF += PerfTest_CustomReduction.o

@ -44,6 +44,8 @@
#ifndef KOKKOS_BLAS_KERNELS_HPP
#define KOKKOS_BLAS_KERNELS_HPP

#include <type_traits>

namespace Kokkos {

template< class ConstVectorType ,

@ -123,15 +125,10 @@ struct Dot
{
  typedef typename Device::execution_space execution_space ;

  typedef typename
    Impl::StaticAssertSame< Impl::unsigned_< 1 > ,
                            Impl::unsigned_< Type::Rank > >::type ok_rank ;
  static_assert( static_cast<unsigned>(Type::Rank) == static_cast<unsigned>(1),
                 "Dot static_assert Fail: Rank != 1");

  /* typedef typename
    Impl::StaticAssertSame< execution_space ,
                            typename Type::execution_space >::type ok_device ;*/

  typedef double value_type ;

#if 1

@ -164,13 +161,8 @@ struct DotSingle
{
  typedef typename Device::execution_space execution_space ;

  typedef typename
    Impl::StaticAssertSame< Impl::unsigned_< 1 > ,
                            Impl::unsigned_< Type::Rank > >::type ok_rank ;

  /* typedef typename
    Impl::StaticAssertSame< execution_space ,
                            typename Type::execution_space >::type ok_device ;*/
  static_assert( static_cast<unsigned>(Type::Rank) == static_cast<unsigned>(1),
                 "DotSingle static_assert Fail: Rank != 1");

  typedef double value_type ;

@ -204,25 +196,11 @@ struct Scale
{
  typedef typename Device::execution_space execution_space ;

  /* typedef typename
    Impl::StaticAssertSame< execution_space ,
                            typename ScalarType::execution_space >::type
      ok_scalar_device ;
  static_assert( static_cast<unsigned>(ScalarType::Rank) == static_cast<unsigned>(0),
                 "Scale static_assert Fail: ScalarType::Rank != 0");

  typedef typename
    Impl::StaticAssertSame< execution_space ,
                            typename VectorType::execution_space >::type
      ok_vector_device ;*/

  typedef typename
    Impl::StaticAssertSame< Impl::unsigned_< 0 > ,
                            Impl::unsigned_< ScalarType::Rank > >::type
      ok_scalar_rank ;

  typedef typename
    Impl::StaticAssertSame< Impl::unsigned_< 1 > ,
                            Impl::unsigned_< VectorType::Rank > >::type
      ok_vector_rank ;
  static_assert( static_cast<unsigned>(VectorType::Rank) == static_cast<unsigned>(1),
                 "Scale static_assert Fail: VectorType::Rank != 1");

#if 1
  typename ScalarType::const_type alpha ;

@ -251,35 +229,14 @@ struct AXPBY
{
  typedef typename Device::execution_space execution_space ;

  /* typedef typename
    Impl::StaticAssertSame< execution_space ,
                            typename ScalarType::execution_space >::type
      ok_scalar_device ;
  static_assert( static_cast<unsigned>(ScalarType::Rank) == static_cast<unsigned>(0),
                 "AXPBY static_assert Fail: ScalarType::Rank != 0");

  typedef typename
    Impl::StaticAssertSame< execution_space ,
                            typename ConstVectorType::execution_space >::type
      ok_const_vector_device ;
  static_assert( static_cast<unsigned>(ConstVectorType::Rank) == static_cast<unsigned>(1),
                 "AXPBY static_assert Fail: ConstVectorType::Rank != 1");

  typedef typename
    Impl::StaticAssertSame< execution_space ,
                            typename VectorType::execution_space >::type
      ok_vector_device ;*/

  typedef typename
    Impl::StaticAssertSame< Impl::unsigned_< 0 > ,
                            Impl::unsigned_< ScalarType::Rank > >::type
      ok_scalar_rank ;

  typedef typename
    Impl::StaticAssertSame< Impl::unsigned_< 1 > ,
                            Impl::unsigned_< ConstVectorType::Rank > >::type
      ok_const_vector_rank ;

  typedef typename
    Impl::StaticAssertSame< Impl::unsigned_< 1 > ,
                            Impl::unsigned_< VectorType::Rank > >::type
      ok_vector_rank ;
  static_assert( static_cast<unsigned>(VectorType::Rank) == static_cast<unsigned>(1),
                 "AXPBY static_assert Fail: VectorType::Rank != 1");

#if 1
  typename ScalarType::const_type alpha , beta ;
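These hunks all make the same substitution: the pre-C++11 Impl::StaticAssertSame rank checks become plain static_assert with a readable message. A self-contained sketch of the equivalent check (the template parameter name is a placeholder):

// C++11 replacement for the StaticAssertSame rank-check idiom retired above.
template <class VectorType>
struct RequireRankOne {
  static_assert(static_cast<unsigned>(VectorType::Rank) == 1u,
                "VectorType must be a rank-1 view");
};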
@ -183,7 +183,7 @@ struct ModifiedGramSchmidt
    }
  }

  execution_space::fence();
  execution_space().fence();

  return timer.seconds();
}

@ -253,12 +253,12 @@ struct HexGrad
  double dt_min = 0 ;

  Kokkos::parallel_for( count , Init( coord ) );
  execution_space::fence();
  execution_space().fence();

  for ( int i = 0 ; i < iter ; ++i ) {
    Kokkos::Timer timer ;
    Kokkos::parallel_for( count , HexGrad<execution_space>( coord , grad ) );
    execution_space::fence();
    execution_space().fence();
    const double dt = timer.seconds();
    if ( 0 == i ) dt_min = dt ;
    else dt_min = dt < dt_min ? dt : dt_min ;

@ -125,15 +125,15 @@ struct MultiDimRangePerf3D
  Kokkos::MDRangePolicy<Kokkos::Rank<3, iterate_type::Right, iterate_type::Right>, execution_space > policy(point_type{{0,0,0}},point_type{{icount,jcount,kcount}},tile_type{{Ti,Tj,Tk}} );

  Kokkos::parallel_for( policy_initA, Init(Atest, icount, jcount, kcount) );
  execution_space::fence();
  execution_space().fence();
  Kokkos::parallel_for( policy_initB, Init(Btest, icount+2, jcount+2, kcount+2) );
  execution_space::fence();
  execution_space().fence();

  for (int i = 0; i < iter; ++i)
  {
    Kokkos::Timer timer;
    Kokkos::parallel_for( policy, FunctorType(Atest, Btest, icount, jcount, kcount) );
    execution_space::fence();
    execution_space().fence();
    const double dt = timer.seconds();
    if ( 0 == i ) dt_min = dt ;
    else dt_min = dt < dt_min ? dt : dt_min ;

@ -189,15 +189,15 @@ struct MultiDimRangePerf3D
  Kokkos::MDRangePolicy<Kokkos::Rank<3, iterate_type::Left, iterate_type::Left>, execution_space > policy({{0,0,0}},{{icount,jcount,kcount}},{{Ti,Tj,Tk}} );

  Kokkos::parallel_for( policy_initA, Init(Atest, icount, jcount, kcount) );
  execution_space::fence();
  execution_space().fence();
  Kokkos::parallel_for( policy_initB, Init(Btest, icount+2, jcount+2, kcount+2) );
  execution_space::fence();
  execution_space().fence();

  for (int i = 0; i < iter; ++i)
  {
    Kokkos::Timer timer;
    Kokkos::parallel_for( policy, FunctorType(Atest, Btest, icount, jcount, kcount) );
    execution_space::fence();
    execution_space().fence();
    const double dt = timer.seconds();
    if ( 0 == i ) dt_min = dt ;
    else dt_min = dt < dt_min ? dt : dt_min ;

@ -368,15 +368,15 @@ struct RangePolicyCollapseTwo
  double dt_min = 0;

  Kokkos::parallel_for( policy, Init(Atest,icount,jcount,kcount) );
  execution_space::fence();
  execution_space().fence();
  Kokkos::parallel_for( policy_initB, Init(Btest,icount+2,jcount+2,kcount+2) );
  execution_space::fence();
  execution_space().fence();

  for (int i = 0; i < iter; ++i)
  {
    Kokkos::Timer timer;
    Kokkos::parallel_for(policy, FunctorType(Atest, Btest, icount, jcount, kcount));
    execution_space::fence();
    execution_space().fence();
    const double dt = timer.seconds();
    if ( 0 == i ) dt_min = dt ;
    else dt_min = dt < dt_min ? dt : dt_min ;

@ -513,15 +513,15 @@ struct RangePolicyCollapseAll
  double dt_min = 0;

  Kokkos::parallel_for( policy, Init(Atest,icount,jcount,kcount) );
  execution_space::fence();
  execution_space().fence();
  Kokkos::parallel_for( policy_initB, Init(Btest,icount+2,jcount+2,kcount+2) );
  execution_space::fence();
  execution_space().fence();

  for (int i = 0; i < iter; ++i)
  {
    Kokkos::Timer timer;
    Kokkos::parallel_for(policy, FunctorType(Atest, Btest, icount, jcount, kcount));
    execution_space::fence();
    execution_space().fence();
    const double dt = timer.seconds();
    if ( 0 == i ) dt_min = dt ;
    else dt_min = dt < dt_min ? dt : dt_min ;
@@ -0,0 +1,564 @@
#include <Kokkos_Core.hpp>
#include <gtest/gtest.h>
#include <PerfTest_Category.hpp>

namespace Test {

namespace {
template<class ExecSpace>
struct SpaceInstance {
static ExecSpace create() {
return ExecSpace();
}
static void destroy(ExecSpace&) {
}
static bool overlap() {
return false;
}
};

#ifndef KOKKOS_ENABLE_DEBUG
#ifdef KOKKOS_ENABLE_CUDA
template<>
struct SpaceInstance<Kokkos::Cuda> {
static Kokkos::Cuda create() {
cudaStream_t stream;
cudaStreamCreate(&stream);
return Kokkos::Cuda(stream);
}
static void destroy(Kokkos::Cuda& space) {
cudaStream_t stream = space.cuda_stream();
cudaStreamDestroy(stream);
}
static bool overlap() {
bool value = true;
auto local_rank_str = std::getenv("CUDA_LAUNCH_BLOCKING");
if(local_rank_str) {
value = (std::atoi(local_rank_str)==0);
}
return value;
}
};
#endif
#endif
}
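The SpaceInstance helper above wraps each CUDA stream in its own Kokkos::Cuda instance, so kernels submitted to different instances may overlap on the device; overlap() reports false when CUDA_LAUNCH_BLOCKING forces serialized launches. A minimal usage sketch (kernel bodies and sizes are illustrative):

    // Two independent kernels submitted to two Kokkos::Cuda instances backed
    // by distinct CUDA streams; they may run concurrently on the device.
    void overlap_sketch() {
      cudaStream_t s1, s2;
      cudaStreamCreate(&s1);
      cudaStreamCreate(&s2);
      {
        Kokkos::Cuda cuda1(s1), cuda2(s2);
        Kokkos::View<double*, Kokkos::Cuda> x("x", 1 << 20), y("y", 1 << 20);
        Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::Cuda>(cuda1, 0, x.extent(0)),
                             KOKKOS_LAMBDA(const int i) { x(i) = 1.0; });
        Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::Cuda>(cuda2, 0, y.extent(0)),
                             KOKKOS_LAMBDA(const int i) { y(i) = 2.0; });
        Kokkos::fence();  // wait for both streams before destroying them
      }
      cudaStreamDestroy(s1);
      cudaStreamDestroy(s2);
    }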
struct FunctorRange {
int M,R;
Kokkos::View<double**,TEST_EXECSPACE> a;
FunctorRange(int M_, int R_, Kokkos::View<double**,TEST_EXECSPACE> a_):M(M_),R(R_),a(a_){}
KOKKOS_INLINE_FUNCTION
void operator() (const int i) const {
for(int r=0;r<R;r++)
for(int j=0;j<M;j++) {
a(i,j)+=1.0;
}
}
};

struct FunctorMDRange {
int M,R;
Kokkos::View<double**,TEST_EXECSPACE> a;
FunctorMDRange(int M_, int R_, Kokkos::View<double**,TEST_EXECSPACE> a_):M(M_),R(R_),a(a_){}
KOKKOS_INLINE_FUNCTION
void operator() (const int i, const int) const {
for(int j=0;j<M;j++)
a(i,j)+=1.0;
}
};

struct FunctorTeam {
int M,R;
Kokkos::View<double**,Kokkos::LayoutRight,TEST_EXECSPACE> a;
FunctorTeam(int M_, int R_, Kokkos::View<double**,Kokkos::LayoutRight,TEST_EXECSPACE> a_):M(M_),R(R_),a(a_){}
KOKKOS_INLINE_FUNCTION
void operator() (const Kokkos::TeamPolicy<TEST_EXECSPACE>::member_type& team) const {
int i = team.league_rank();
for(int r=0;r<R;r++) {
Kokkos::parallel_for(Kokkos::TeamThreadRange(team,M), [&] (const int j) {
a(i,j)+=1.0;
});
}
}
};

struct FunctorRangeReduce {
int M,R;
Kokkos::View<double**,TEST_EXECSPACE> a;
FunctorRangeReduce(int M_, int R_, Kokkos::View<double**,TEST_EXECSPACE> a_):M(M_),R(R_),a(a_){}
KOKKOS_INLINE_FUNCTION
void operator() (const int i, double& tmp) const {
for(int r=0;r<R;r++)
for(int j=0;j<M;j++) {
tmp += a(i,j);
}
}
};

struct FunctorMDRangeReduce {
int M,R;
Kokkos::View<double**,TEST_EXECSPACE> a;
FunctorMDRangeReduce(int M_, int R_, Kokkos::View<double**,TEST_EXECSPACE> a_):M(M_),R(R_),a(a_){}
KOKKOS_INLINE_FUNCTION
void operator() (const int i, const int, double& tmp) const {
for(int j=0;j<M;j++)
tmp += a(i,j);
}
};

struct FunctorTeamReduce {
int M,R;
Kokkos::View<double**,Kokkos::LayoutRight,TEST_EXECSPACE> a;
FunctorTeamReduce(int M_, int R_, Kokkos::View<double**,Kokkos::LayoutRight,TEST_EXECSPACE> a_):M(M_),R(R_),a(a_){}
KOKKOS_INLINE_FUNCTION
void operator() (const Kokkos::TeamPolicy<TEST_EXECSPACE>::member_type& team, double& tmp) const {
int i = team.league_rank();
for(int r=0;r<R;r++) {
double val;
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,M), [&] (const int j, double& tmp2) {
tmp2 += a(i,j);
},val);
tmp+=val;
}
}
};

TEST_F( default_exec, overlap_range_policy ) {
int N = 2000;
int M = 10000;
int R = 10;

TEST_EXECSPACE space;
TEST_EXECSPACE space1 = SpaceInstance<TEST_EXECSPACE>::create();
TEST_EXECSPACE space2 = SpaceInstance<TEST_EXECSPACE>::create();

Kokkos::View<double**,TEST_EXECSPACE> a("A",N,M);
FunctorRange f(M,R,a);
FunctorRangeReduce fr(M,R,a);
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel0",
Kokkos::RangePolicy<TEST_EXECSPACE>(0,N), FunctorRange(M,R,a));

Kokkos::parallel_for("default_exec::overlap_range_policy::kernel1",
Kokkos::Experimental::require(
Kokkos::RangePolicy<TEST_EXECSPACE>(space1,0,N),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, f);
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel2",
Kokkos::Experimental::require(
Kokkos::RangePolicy<TEST_EXECSPACE>(space2,0,N),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, f);
Kokkos::fence();

Kokkos::Timer timer;
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel3",
Kokkos::Experimental::require(
Kokkos::RangePolicy<TEST_EXECSPACE>(space,0,N),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, f);
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel4",
Kokkos::Experimental::require(
Kokkos::RangePolicy<TEST_EXECSPACE>(space,0,N),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, f);
Kokkos::fence();

timer.reset();
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel5",
Kokkos::Experimental::require(
Kokkos::RangePolicy<TEST_EXECSPACE>(space1,0,N),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, FunctorRange(M,R,a));
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel6",
Kokkos::Experimental::require(
Kokkos::RangePolicy<TEST_EXECSPACE>(space2,0,N),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, FunctorRange(M,R,a));
Kokkos::fence();
double time_overlap = timer.seconds();

timer.reset();
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel7",
Kokkos::Experimental::require(
Kokkos::RangePolicy<TEST_EXECSPACE>(space,0,N),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, f);
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel8",
Kokkos::Experimental::require(
Kokkos::RangePolicy<TEST_EXECSPACE>(space,0,N),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, f);
Kokkos::fence();
double time_end = timer.seconds();

if(SpaceInstance<TEST_EXECSPACE>::overlap()) {
ASSERT_TRUE( (time_end > 1.5*time_overlap) );
}
printf("Time RangePolicy: NonOverlap: %lf Time Overlap: %lf\n",time_end,time_overlap);

Kokkos::View<double,TEST_EXECSPACE> result("result");
Kokkos::View<double,TEST_EXECSPACE> result1("result1");
Kokkos::View<double,TEST_EXECSPACE> result2("result2");
Kokkos::View<double,Kokkos::HostSpace> h_result("h_result");
Kokkos::View<double,Kokkos::HostSpace> h_result1("h_result1");
Kokkos::View<double,Kokkos::HostSpace> h_result2("h_result2");

timer.reset();
Kokkos::parallel_reduce("default_exec::overlap_range_policy::kernel_reduce",
Kokkos::Experimental::require(
Kokkos::RangePolicy<TEST_EXECSPACE>(space,0,N),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, fr, result);
Kokkos::fence();
double time_fenced = timer.seconds();
Kokkos::deep_copy(h_result,result);

timer.reset();
Kokkos::parallel_reduce("default_exec::overlap_range_policy::kernel_reduce",
Kokkos::Experimental::require(
Kokkos::RangePolicy<TEST_EXECSPACE>(space,0,N),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, fr, result);
double time_not_fenced = timer.seconds();
Kokkos::fence();
if(SpaceInstance<TEST_EXECSPACE>::overlap()) {
ASSERT_TRUE(time_fenced>2.0*time_not_fenced);
}

timer.reset();
Kokkos::parallel_reduce("default_exec::overlap_range_policy::kernel_reduce",
Kokkos::Experimental::require(
Kokkos::RangePolicy<TEST_EXECSPACE>(space,0,N),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, fr, result);
Kokkos::parallel_reduce("default_exec::overlap_range_policy::kernel_reduce",
Kokkos::Experimental::require(
Kokkos::RangePolicy<TEST_EXECSPACE>(space,0,N),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, fr, result);
Kokkos::fence();
double time_no_overlapped_reduce = timer.seconds();

timer.reset();
Kokkos::parallel_reduce("default_exec::overlap_range_policy::kernel_reduce",
Kokkos::Experimental::require(
Kokkos::RangePolicy<TEST_EXECSPACE>(space1,0,N),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, fr, result1);
Kokkos::parallel_reduce("default_exec::overlap_range_policy::kernel_reduce",
Kokkos::Experimental::require(
Kokkos::RangePolicy<TEST_EXECSPACE>(space2,0,N),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, fr, result2);
Kokkos::fence();
double time_overlapped_reduce = timer.seconds();

Kokkos::deep_copy(h_result2,result2);
Kokkos::deep_copy(h_result1,result1);

ASSERT_EQ(h_result1(),h_result());
ASSERT_EQ(h_result2(),h_result());

if(SpaceInstance<TEST_EXECSPACE>::overlap()) {
ASSERT_TRUE(time_overlapped_reduce < 1.5*time_no_overlapped_reduce);
}
printf("Time RangePolicy Reduce: NonOverlap: %lf Time Overlap: %lf\n",time_no_overlapped_reduce,time_overlapped_reduce);
SpaceInstance<TEST_EXECSPACE>::destroy(space1);
SpaceInstance<TEST_EXECSPACE>::destroy(space2);
}

TEST_F( default_exec, overlap_mdrange_policy ) {
int N = 200;
int M = 10000;
int R = 10;

TEST_EXECSPACE space;
TEST_EXECSPACE space1 = SpaceInstance<TEST_EXECSPACE>::create();
TEST_EXECSPACE space2 = SpaceInstance<TEST_EXECSPACE>::create();

Kokkos::View<double**,TEST_EXECSPACE> a("A",N,M);
FunctorMDRange f(M,R,a);
FunctorMDRangeReduce fr(M,R,a);
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel0",
Kokkos::Experimental::require(
Kokkos::MDRangePolicy<TEST_EXECSPACE,Kokkos::Rank<2>>({0,0},{N,R}),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, FunctorMDRange(M,R,a));

Kokkos::parallel_for("default_exec::overlap_range_policy::kernel1",
Kokkos::Experimental::require(
Kokkos::MDRangePolicy<TEST_EXECSPACE,Kokkos::Rank<2>>(space1,{0,0},{N,R}),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, f);
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel2",
Kokkos::Experimental::require(
Kokkos::MDRangePolicy<TEST_EXECSPACE,Kokkos::Rank<2>>(space2,{0,0},{N,R}),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, f);
Kokkos::fence();

Kokkos::Timer timer;
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel3",
Kokkos::Experimental::require(
Kokkos::MDRangePolicy<TEST_EXECSPACE,Kokkos::Rank<2>>(space,{0,0},{N,R}),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, f);
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel4",
Kokkos::Experimental::require(
Kokkos::MDRangePolicy<TEST_EXECSPACE,Kokkos::Rank<2>>(space,{0,0},{N,R}),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, f);
Kokkos::fence();

timer.reset();
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel5",
Kokkos::Experimental::require(
Kokkos::MDRangePolicy<TEST_EXECSPACE,Kokkos::Rank<2>>(space1,{0,0},{N,R}),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, FunctorMDRange(M,R,a));
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel6",
Kokkos::Experimental::require(
Kokkos::MDRangePolicy<TEST_EXECSPACE,Kokkos::Rank<2>>(space2,{0,0},{N,R}),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, FunctorMDRange(M,R,a));
Kokkos::fence();
double time_overlap = timer.seconds();

timer.reset();
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel7",
Kokkos::Experimental::require(
Kokkos::MDRangePolicy<TEST_EXECSPACE,Kokkos::Rank<2>>(space,{0,0},{N,R}),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, f);
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel8",
Kokkos::Experimental::require(
Kokkos::MDRangePolicy<TEST_EXECSPACE,Kokkos::Rank<2>>(space,{0,0},{N,R}),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, f);
Kokkos::fence();
double time_end = timer.seconds();

if(SpaceInstance<TEST_EXECSPACE>::overlap()) {
ASSERT_TRUE( (time_end > 1.5*time_overlap) );
}
printf("Time MDRangePolicy: NonOverlap: %lf Time Overlap: %lf\n",time_end,time_overlap);

Kokkos::View<double,TEST_EXECSPACE> result("result");
Kokkos::View<double,TEST_EXECSPACE> result1("result1");
Kokkos::View<double,TEST_EXECSPACE> result2("result2");
Kokkos::View<double,Kokkos::HostSpace> h_result("h_result");
Kokkos::View<double,Kokkos::HostSpace> h_result1("h_result1");
Kokkos::View<double,Kokkos::HostSpace> h_result2("h_result2");

timer.reset();
Kokkos::parallel_reduce("default_exec::overlap_mdrange_policy::kernel_reduce",
Kokkos::Experimental::require(
Kokkos::MDRangePolicy<TEST_EXECSPACE,Kokkos::Rank<2>>(space,{0,0},{N,R}),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, fr, result);
Kokkos::fence();
double time_fenced = timer.seconds();
Kokkos::deep_copy(h_result,result);

timer.reset();
Kokkos::parallel_reduce("default_exec::overlap_mdrange_policy::kernel_reduce",
Kokkos::Experimental::require(
Kokkos::MDRangePolicy<TEST_EXECSPACE,Kokkos::Rank<2>>(space,{0,0},{N,R}),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, fr, result);
double time_not_fenced = timer.seconds();
Kokkos::fence();
if(SpaceInstance<TEST_EXECSPACE>::overlap()) {
ASSERT_TRUE(time_fenced>2.0*time_not_fenced);
}

timer.reset();
Kokkos::parallel_reduce("default_exec::overlap_mdrange_policy::kernel_reduce",
Kokkos::Experimental::require(
Kokkos::MDRangePolicy<TEST_EXECSPACE,Kokkos::Rank<2>>(space,{0,0},{N,R}),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, fr, result);
Kokkos::parallel_reduce("default_exec::overlap_mdrange_policy::kernel_reduce",
Kokkos::Experimental::require(
Kokkos::MDRangePolicy<TEST_EXECSPACE,Kokkos::Rank<2>>(space,{0,0},{N,R}),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, fr, result);
Kokkos::fence();
double time_no_overlapped_reduce = timer.seconds();

timer.reset();
Kokkos::parallel_reduce("default_exec::overlap_mdrange_policy::kernel_reduce",
Kokkos::Experimental::require(
Kokkos::MDRangePolicy<TEST_EXECSPACE,Kokkos::Rank<2>>(space1,{0,0},{N,R}),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, fr, result1);
Kokkos::parallel_reduce("default_exec::overlap_mdrange_policy::kernel_reduce",
Kokkos::Experimental::require(
Kokkos::MDRangePolicy<TEST_EXECSPACE,Kokkos::Rank<2>>(space2,{0,0},{N,R}),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, fr, result2);
Kokkos::fence();
double time_overlapped_reduce = timer.seconds();

Kokkos::deep_copy(h_result2,result2);
Kokkos::deep_copy(h_result1,result1);

ASSERT_EQ(h_result1(),h_result());
ASSERT_EQ(h_result2(),h_result());

if(SpaceInstance<TEST_EXECSPACE>::overlap()) {
ASSERT_TRUE(time_overlapped_reduce < 1.5*time_no_overlapped_reduce);
}
printf("Time MDRangePolicy Reduce: NonOverlap: %lf Time Overlap: %lf\n",time_no_overlapped_reduce,time_overlapped_reduce);
SpaceInstance<TEST_EXECSPACE>::destroy(space2);
SpaceInstance<TEST_EXECSPACE>::destroy(space1);

}

TEST_F( default_exec, overlap_team_policy ) {
int N = 20;
int M = 1000000;
int R = 10;

TEST_EXECSPACE space;
TEST_EXECSPACE space1 = SpaceInstance<TEST_EXECSPACE>::create();
TEST_EXECSPACE space2 = SpaceInstance<TEST_EXECSPACE>::create();

Kokkos::View<double**,Kokkos::LayoutRight,TEST_EXECSPACE> a("A",N,M);
FunctorTeam f(M,R,a);
FunctorTeamReduce fr(M,R,a);
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel0",
Kokkos::Experimental::require(
Kokkos::TeamPolicy<TEST_EXECSPACE>(N,Kokkos::AUTO),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, FunctorTeam(M,R,a));

Kokkos::parallel_for("default_exec::overlap_range_policy::kernel1",
Kokkos::Experimental::require(
Kokkos::TeamPolicy<TEST_EXECSPACE>(space1,N,Kokkos::AUTO),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, f);
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel2",
Kokkos::Experimental::require(
Kokkos::TeamPolicy<TEST_EXECSPACE>(space2,N,Kokkos::AUTO),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, f);
Kokkos::fence();

Kokkos::Timer timer;
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel3",
Kokkos::Experimental::require(
Kokkos::TeamPolicy<TEST_EXECSPACE>(space,N,Kokkos::AUTO),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, f);
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel4",
Kokkos::Experimental::require(
Kokkos::TeamPolicy<TEST_EXECSPACE>(space,N,Kokkos::AUTO),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, f);
Kokkos::fence();

timer.reset();
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel5",
Kokkos::Experimental::require(
Kokkos::TeamPolicy<TEST_EXECSPACE>(space1,N,Kokkos::AUTO),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, FunctorTeam(M,R,a));
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel6",
Kokkos::Experimental::require(
Kokkos::TeamPolicy<TEST_EXECSPACE>(space2,N,Kokkos::AUTO),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, FunctorTeam(M,R,a));
Kokkos::fence();
double time_overlap = timer.seconds();

timer.reset();
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel7",
Kokkos::Experimental::require(
Kokkos::TeamPolicy<TEST_EXECSPACE>(space,N,Kokkos::AUTO),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, f);
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel8",
Kokkos::Experimental::require(
Kokkos::TeamPolicy<TEST_EXECSPACE>(space,N,Kokkos::AUTO),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, f);
Kokkos::fence();
double time_end = timer.seconds();

if(SpaceInstance<TEST_EXECSPACE>::overlap()) {
ASSERT_TRUE( (time_end > 1.5*time_overlap) );
}
printf("Time TeamPolicy: NonOverlap: %lf Time Overlap: %lf\n",time_end,time_overlap);

Kokkos::View<double,TEST_EXECSPACE> result("result");
Kokkos::View<double,TEST_EXECSPACE> result1("result1");
Kokkos::View<double,TEST_EXECSPACE> result2("result2");
Kokkos::View<double,Kokkos::HostSpace> h_result("h_result");
Kokkos::View<double,Kokkos::HostSpace> h_result1("h_result1");
Kokkos::View<double,Kokkos::HostSpace> h_result2("h_result2");

timer.reset();
Kokkos::parallel_reduce("default_exec::overlap_team_policy::kernel_reduce",
Kokkos::Experimental::require(
Kokkos::TeamPolicy<TEST_EXECSPACE>(space,N,Kokkos::AUTO),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, fr, result);
Kokkos::fence();
double time_fenced = timer.seconds();
Kokkos::deep_copy(h_result,result);

timer.reset();
Kokkos::parallel_reduce("default_exec::overlap_team_policy::kernel_reduce",
Kokkos::Experimental::require(
Kokkos::TeamPolicy<TEST_EXECSPACE>(space,N,Kokkos::AUTO),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, fr, result);
double time_not_fenced = timer.seconds();
Kokkos::fence();
if(SpaceInstance<TEST_EXECSPACE>::overlap()) {
ASSERT_TRUE(time_fenced>2.0*time_not_fenced);
}
timer.reset();
Kokkos::parallel_reduce("default_exec::overlap_team_policy::kernel_reduce",
Kokkos::Experimental::require(
Kokkos::TeamPolicy<TEST_EXECSPACE>(space,N,Kokkos::AUTO),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, fr, result);
Kokkos::parallel_reduce("default_exec::overlap_team_policy::kernel_reduce",
Kokkos::Experimental::require(
Kokkos::TeamPolicy<TEST_EXECSPACE>(space,N,Kokkos::AUTO),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, fr, result);
Kokkos::fence();
double time_no_overlapped_reduce = timer.seconds();

timer.reset();
Kokkos::parallel_reduce("default_exec::overlap_team_policy::kernel_reduce",
Kokkos::Experimental::require(
Kokkos::TeamPolicy<TEST_EXECSPACE>(space1,N,Kokkos::AUTO),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, fr, result1);
Kokkos::parallel_reduce("default_exec::overlap_team_policy::kernel_reduce",
Kokkos::Experimental::require(
Kokkos::TeamPolicy<TEST_EXECSPACE>(space2,N,Kokkos::AUTO),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, fr, result2);
Kokkos::fence();
double time_overlapped_reduce = timer.seconds();

Kokkos::deep_copy(h_result2,result2);
Kokkos::deep_copy(h_result1,result1);

ASSERT_EQ(h_result1(),h_result());
ASSERT_EQ(h_result2(),h_result());

if(SpaceInstance<TEST_EXECSPACE>::overlap()) {
ASSERT_TRUE(time_overlapped_reduce < 1.5*time_no_overlapped_reduce);
}
printf("Time TeamPolicy Reduce: NonOverlap: %lf Time Overlap: %lf\n",time_no_overlapped_reduce,time_overlapped_reduce);
SpaceInstance<TEST_EXECSPACE>::destroy(space1);
SpaceInstance<TEST_EXECSPACE>::destroy(space2);
}
}
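Each kernel in these tests wraps its policy in Kokkos::Experimental::require with the HintLightWeight work-item property, a hint that the body is cheap so the backend may favor low launch overhead. A minimal sketch of attaching the hint (execution space and extent are illustrative):

    // require() returns a policy carrying the advisory property; the launch
    // is otherwise unchanged.
    void hinted_launch_sketch() {
      auto hinted = Kokkos::Experimental::require(
          Kokkos::RangePolicy<Kokkos::DefaultExecutionSpace>(0, 1000),
          Kokkos::Experimental::WorkItemProperty::HintLightWeight);
      Kokkos::parallel_for("hinted_kernel", hinted,
                           KOKKOS_LAMBDA(const int i) { (void)i; /* cheap body */ });
    }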
@@ -121,6 +121,7 @@ void run_allocateview_tests(int N, int R) {
Kokkos::parallel_for(N8, KOKKOS_LAMBDA (const int& i) {
a_ptr[i] = 0.0;
});
Kokkos::fence();
Kokkos::kokkos_free(a_ptr);
}
time_raw = timer.seconds()/R;

@@ -95,6 +95,7 @@ void run_deepcopyview_tests123(int N, int R) {
a_ptr[i] = b_ptr[i];
});
}
Kokkos::fence();
time_raw = timer.seconds()/R;
}
#endif

@@ -135,6 +136,7 @@ void run_deepcopyview_tests45(int N, int R) {
a_ptr[i] = b_ptr[i];
});
}
Kokkos::fence();
time_raw = timer.seconds()/R;
}
#endif

@@ -169,6 +171,7 @@ void run_deepcopyview_tests6(int N, int R) {
a_ptr[i] = b_ptr[i];
});
}
Kokkos::fence();
time_raw = timer.seconds()/R;
}
#endif

@@ -202,6 +205,7 @@ void run_deepcopyview_tests7(int N, int R) {
a_ptr[i] = b_ptr[i];
});
}
Kokkos::fence();
time_raw = timer.seconds()/R;
}
#endif

@@ -235,6 +239,7 @@ void run_deepcopyview_tests8(int N, int R) {
a_ptr[i] = b_ptr[i];
});
}
Kokkos::fence();
time_raw = timer.seconds()/R;
}
#endif
@@ -90,6 +90,7 @@ void run_fillview_tests123(int N, int R) {
a_ptr[i] = 1.1;
});
}
Kokkos::fence();
time_raw = timer.seconds()/R;
}
#endif

@@ -126,6 +127,7 @@ void run_fillview_tests45(int N, int R) {
a_ptr[i] = 1.1;
});
}
Kokkos::fence();
time_raw = timer.seconds()/R;
}
#endif

@@ -157,6 +159,7 @@ void run_fillview_tests6(int N, int R) {
a_ptr[i] = 1.1;
});
}
Kokkos::fence();
time_raw = timer.seconds()/R;
}
#endif

@@ -187,6 +190,7 @@ void run_fillview_tests7(int N, int R) {
a_ptr[i] = 1.1;
});
}
Kokkos::fence();
time_raw = timer.seconds()/R;
}
#endif

@@ -217,6 +221,7 @@ void run_fillview_tests8(int N, int R) {
a_ptr[i] = 1.1;
});
}
Kokkos::fence();
time_raw = timer.seconds()/R;
}
#endif
@@ -95,7 +95,9 @@ void run_resizeview_tests123(int N, int R) {
Kokkos::parallel_for(N8, KOKKOS_LAMBDA (const int& i) {
a1_ptr[i] = a_ptr[i];
});
Kokkos::fence();
}
Kokkos::fence();
time_raw = timer.seconds()/R;
}
#endif

@@ -143,7 +145,9 @@ void run_resizeview_tests45(int N, int R) {
Kokkos::parallel_for(N8, KOKKOS_LAMBDA (const int& i) {
a1_ptr[i] = a_ptr[i];
});
Kokkos::fence();
}
Kokkos::fence();
time_raw = timer.seconds()/R;
}
#endif

@@ -181,7 +185,9 @@ void run_resizeview_tests6(int N, int R) {
Kokkos::parallel_for(N8, KOKKOS_LAMBDA (const int& i) {
a1_ptr[i] = a_ptr[i];
});
Kokkos::fence();
}
Kokkos::fence();
time_raw = timer.seconds()/R;
}
#endif

@@ -218,7 +224,9 @@ void run_resizeview_tests7(int N, int R) {
Kokkos::parallel_for(N8, KOKKOS_LAMBDA (const int& i) {
a1_ptr[i] = a_ptr[i];
});
Kokkos::fence();
}
Kokkos::fence();
time_raw = timer.seconds()/R;
}
#endif

@@ -255,7 +263,9 @@ void run_resizeview_tests8(int N, int R) {
Kokkos::parallel_for(N8, KOKKOS_LAMBDA (const int& i) {
a1_ptr[i] = a_ptr[i];
});
Kokkos::fence();
}
Kokkos::fence();
time_raw = timer.seconds()/R;
}
#endif
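The common fix in the allocate/deep-copy/fill/resize hunks above is inserting Kokkos::fence() before the timer is read, since parallel_for launches are asynchronous on device backends. A sketch of the corrected measurement pattern (a_ptr, N, and R are illustrative placeholders):

    // Fence before reading the timer so the asynchronously launched kernels
    // are actually included in time_raw.
    double time_fill(double* a_ptr, int N, int R) {
      Kokkos::Timer timer;
      for (int r = 0; r < R; r++) {
        Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) { a_ptr[i] = 1.1; });
      }
      Kokkos::fence();  // the added fence: without it the timer stops too early
      return timer.seconds() / R;
    }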
@@ -69,7 +69,7 @@ typedef Kokkos::DefaultExecutionSpace exec_space;
#define WHITE 8

void textcolor(int attr, int fg, int bg)
{ char command[13];
{ char command[40];

/* Command is the control command to the terminal */
sprintf(command, "%c[%d;%d;%dm", 0x1B, attr, fg + 30, bg + 40);

@@ -85,7 +85,7 @@ struct ZeroFunctor{
typedef typename Kokkos::View<T,execution_space>::HostMirror h_type;
type data;
KOKKOS_INLINE_FUNCTION
void operator()(int i) const {
void operator()(int) const {
data() = 0;
}
};

@@ -101,7 +101,7 @@ struct AddFunctor{
type data;

KOKKOS_INLINE_FUNCTION
void operator()(int i) const {
void operator()(int) const {
Kokkos::atomic_fetch_add(&data(),(T)1);
}
};

@@ -113,12 +113,12 @@ T AddLoop(int loop) {
typename ZeroFunctor<T,exec_space>::h_type h_data("HData");
f_zero.data = data;
Kokkos::parallel_for(1,f_zero);
exec_space::fence();
exec_space().fence();

struct AddFunctor<T,exec_space> f_add;
f_add.data = data;
Kokkos::parallel_for(loop,f_add);
exec_space::fence();
exec_space().fence();

Kokkos::deep_copy(h_data,data);
T val = h_data();
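AddLoop measures Kokkos::atomic_fetch_add by making every iteration increment one shared scalar. A self-contained sketch of the same measurement (counter type and loop count are illustrative, not from the benchmark):

    #include <Kokkos_Core.hpp>

    // Time `loop` contended atomic increments of one device-resident counter.
    double time_atomic_adds(int loop) {
      Kokkos::View<long, Kokkos::DefaultExecutionSpace> counter("counter");
      Kokkos::Timer timer;
      Kokkos::parallel_for(loop, KOKKOS_LAMBDA(const int) {
        Kokkos::atomic_fetch_add(&counter(), (long)1);  // all threads hit one address
      });
      Kokkos::DefaultExecutionSpace().fence();
      return timer.seconds();
    }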
@@ -132,7 +132,7 @@ struct AddNonAtomicFunctor{
type data;

KOKKOS_INLINE_FUNCTION
void operator()(int i) const {
void operator()(int) const {
data()+=(T)1;
}
};

@@ -145,12 +145,12 @@ T AddLoopNonAtomic(int loop) {

f_zero.data = data;
Kokkos::parallel_for(1,f_zero);
exec_space::fence();
exec_space().fence();

struct AddNonAtomicFunctor<T,exec_space> f_add;
f_add.data = data;
Kokkos::parallel_for(loop,f_add);
exec_space::fence();
exec_space().fence();

Kokkos::deep_copy(h_data,data);
T val = h_data();

@@ -178,7 +178,7 @@ struct CASFunctor{
type data;

KOKKOS_INLINE_FUNCTION
void operator()(int i) const {
void operator()(int) const {
T old = data();
T newval, assumed;
do {

@@ -197,12 +197,12 @@ T CASLoop(int loop) {
typename ZeroFunctor<T,exec_space>::h_type h_data("HData");
f_zero.data = data;
Kokkos::parallel_for(1,f_zero);
exec_space::fence();
exec_space().fence();

struct CASFunctor<T,exec_space> f_cas;
f_cas.data = data;
Kokkos::parallel_for(loop,f_cas);
exec_space::fence();
exec_space().fence();

Kokkos::deep_copy(h_data,data);
T val = h_data();

@@ -217,7 +217,7 @@ struct CASNonAtomicFunctor{
type data;

KOKKOS_INLINE_FUNCTION
void operator()(int i) const {
void operator()(int) const {
volatile T assumed;
volatile T newval;
bool fail=1;

@@ -240,12 +240,12 @@ T CASLoopNonAtomic(int loop) {
typename ZeroFunctor<T,exec_space>::h_type h_data("HData");
f_zero.data = data;
Kokkos::parallel_for(1,f_zero);
exec_space::fence();
exec_space().fence();

struct CASNonAtomicFunctor<T,exec_space> f_cas;
f_cas.data = data;
Kokkos::parallel_for(loop,f_cas);
exec_space::fence();
exec_space().fence();

Kokkos::deep_copy(h_data,data);
T val = h_data();

@@ -296,19 +296,19 @@ T ExchLoop(int loop) {
typename ZeroFunctor<T,exec_space>::h_type h_data("HData");
f_zero.data = data;
Kokkos::parallel_for(1,f_zero);
exec_space::fence();
exec_space().fence();

typename ZeroFunctor<T,exec_space>::type data2("Data");
typename ZeroFunctor<T,exec_space>::h_type h_data2("HData");
f_zero.data = data2;
Kokkos::parallel_for(1,f_zero);
exec_space::fence();
exec_space().fence();

struct ExchFunctor<T,exec_space> f_exch;
f_exch.data = data;
f_exch.data2 = data2;
Kokkos::parallel_for(loop,f_exch);
exec_space::fence();
exec_space().fence();

Kokkos::deep_copy(h_data,data);
Kokkos::deep_copy(h_data2,data2);

@@ -339,19 +339,19 @@ T ExchLoopNonAtomic(int loop) {
typename ZeroFunctor<T,exec_space>::h_type h_data("HData");
f_zero.data = data;
Kokkos::parallel_for(1,f_zero);
exec_space::fence();
exec_space().fence();

typename ZeroFunctor<T,exec_space>::type data2("Data");
typename ZeroFunctor<T,exec_space>::h_type h_data2("HData");
f_zero.data = data2;
Kokkos::parallel_for(1,f_zero);
exec_space::fence();
exec_space().fence();

struct ExchNonAtomicFunctor<T,exec_space> f_exch;
f_exch.data = data;
f_exch.data2 = data2;
Kokkos::parallel_for(loop,f_exch);
exec_space::fence();
exec_space().fence();

Kokkos::deep_copy(h_data,data);
Kokkos::deep_copy(h_data2,data2);
@@ -153,6 +153,7 @@ struct TestFunctor {
typedef Kokkos::RangePolicy< ExecSpace , TagDel > policy ;

Kokkos::parallel_for( policy(0,range_iter), *this );
Kokkos::fence();
}

//----------------------------------------
@@ -92,27 +92,26 @@ long fib_alloc_count( long n )
return count[ n & mask ];
}

template< class Space >
template< class Scheduler >
struct TestFib {

using Scheduler = Kokkos::TaskScheduler< Space > ;
using MemorySpace = typename Scheduler::memory_space ;
using MemberType = typename Scheduler::member_type ;
using FutureType = Kokkos::Future< long , Space > ;
using FutureType = Kokkos::BasicFuture< long , Scheduler > ;

typedef long value_type ;

Scheduler sched ;
FutureType dep[2] ;
const value_type n ;

KOKKOS_INLINE_FUNCTION
TestFib( const Scheduler & arg_sched , const value_type arg_n )
: sched( arg_sched ), dep{} , n( arg_n ) {}
TestFib( const value_type arg_n )
: dep{} , n( arg_n ) {}

KOKKOS_INLINE_FUNCTION
void operator()( const MemberType & , value_type & result ) noexcept
void operator()( MemberType & member, value_type & result ) noexcept
{
auto& sched = member.scheduler();
if ( n < 2 ) {
result = n ;
}

@@ -126,13 +125,13 @@ struct TestFib {

dep[1] = Kokkos::task_spawn
( Kokkos::TaskSingle( sched, Kokkos::TaskPriority::High )
, TestFib( sched, n - 2 ) );
, TestFib( n - 2 ) );

dep[0] = Kokkos::task_spawn
( Kokkos::TaskSingle( sched )
, TestFib( sched, n - 1 ) );
, TestFib( n - 1 ) );

Kokkos::Future< ExecSpace > fib_all = Kokkos::when_all( dep, 2 );
auto fib_all = sched.when_all( dep, 2 );

if ( ! dep[0].is_null() && ! dep[1].is_null() && ! fib_all.is_null() ) {
// High priority to retire this branch.

@@ -202,13 +201,15 @@ int main( int argc , char* argv[] )
return -1;
}

typedef TestFib< ExecSpace > Functor ;
using Scheduler = Kokkos::TaskSchedulerMultiple<ExecSpace>;

typedef TestFib< Scheduler > Functor ;

Kokkos::initialize(argc,argv);

{

Functor::Scheduler sched( Functor::MemorySpace()
Scheduler sched( Functor::MemorySpace()
, total_alloc_size
, min_block_size
, max_block_size

@@ -217,21 +218,21 @@ int main( int argc , char* argv[] )

Functor::FutureType f =
Kokkos::host_spawn( Kokkos::TaskSingle( sched )
, Functor( sched , fib_input )
, Functor( fib_input )
);

Kokkos::wait( sched );

test_result = f.get();

task_count_max = sched.allocated_task_count_max();
task_count_accum = sched.allocated_task_count_accum();
//task_count_max = sched.allocated_task_count_max();
//task_count_accum = sched.allocated_task_count_accum();

if ( number_alloc != task_count_accum ) {
std::cout << " number_alloc( " << number_alloc << " )"
<< " != task_count_accum( " << task_count_accum << " )"
<< std::endl ;
}
//if ( number_alloc != task_count_accum ) {
// std::cout << " number_alloc( " << number_alloc << " )"
//           << " != task_count_accum( " << task_count_accum << " )"
//           << std::endl ;
//}

if ( fib_output != test_result ) {
std::cout << " answer( " << fib_output << " )"

@@ -239,7 +240,7 @@ int main( int argc , char* argv[] )
<< std::endl ;
}

if ( fib_output != test_result || number_alloc != task_count_accum ) {
if ( fib_output != test_result) { // || number_alloc != task_count_accum ) {
printf(" TEST FAILED\n");
return -1;
}

@@ -252,7 +253,7 @@ int main( int argc , char* argv[] )

Functor::FutureType ftmp =
Kokkos::host_spawn( Kokkos::TaskSingle( sched )
, Functor( sched , fib_input )
, Functor( fib_input )
);

Kokkos::wait( sched );
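These hunks migrate the Fibonacci task benchmark to the newer Kokkos tasking interface: tasks are parameterized on a Scheduler instead of a Space, futures become Kokkos::BasicFuture<long, Scheduler>, and the scheduler is recovered from the member handle rather than stored in every task. A reduced sketch of a task in that style (task name and result value are illustrative):

    // The scheduler comes from the member handle, so the task carries no
    // scheduler of its own; BasicFuture is parameterized on the scheduler.
    template <class Scheduler>
    struct HelloTask {
      using value_type  = long;
      using future_type = Kokkos::BasicFuture<long, Scheduler>;

      KOKKOS_INLINE_FUNCTION
      void operator()(typename Scheduler::member_type& member, long& result) {
        auto& sched = member.scheduler();  // recovered, not stored
        (void)sched;                       // would be used to spawn children
        result = 42;
      }
    };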
@@ -61,6 +61,16 @@ IF(KOKKOS_LEGACY_TRIBITS)

#-----------------------------------------------------------------------------

FILE(GLOB HEADERS_HPX HPX/*.hpp)
FILE(GLOB SOURCES_HPX HPX/*.cpp)

LIST(APPEND HEADERS_PRIVATE ${HEADERS_HPX} )
LIST(APPEND SOURCES ${SOURCES_HPX} )

INSTALL(FILES ${HEADERS_HPX} DESTINATION ${TRILINOS_INCDIR}/HPX/)

#-----------------------------------------------------------------------------

FILE(GLOB HEADERS_CUDA Cuda/*.hpp)
FILE(GLOB SOURCES_CUDA Cuda/*.cpp)
@@ -1,419 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/

#ifndef KOKKOS_CUDAEXEC_HPP
#define KOKKOS_CUDAEXEC_HPP

#include <Kokkos_Macros.hpp>
#ifdef KOKKOS_ENABLE_CUDA

#include <string>
#include <cstdint>
#include <Kokkos_Parallel.hpp>
#include <impl/Kokkos_Error.hpp>
#include <Cuda/Kokkos_Cuda_abort.hpp>
#include <Cuda/Kokkos_Cuda_Error.hpp>
#include <Cuda/Kokkos_Cuda_Locks.hpp>

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

namespace Kokkos {
namespace Impl {

struct CudaTraits {
enum { WarpSize = 32 /* 0x0020 */ };
enum { WarpIndexMask = 0x001f /* Mask for warpindex */ };
enum { WarpIndexShift = 5 /* WarpSize == 1 << WarpShift */ };

enum { SharedMemoryBanks = 32 /* Compute device 2.0 */ };
enum { SharedMemoryCapacity = 0x0C000 /* 48k shared / 16k L1 Cache */ };
enum { SharedMemoryUsage = 0x04000 /* 16k shared / 48k L1 Cache */ };

enum { UpperBoundGridCount = 65535 /* Hard upper bound */ };
enum { ConstantMemoryCapacity = 0x010000 /* 64k bytes */ };
enum { ConstantMemoryUsage = 0x008000 /* 32k bytes */ };
enum { ConstantMemoryCache = 0x002000 /* 8k bytes */ };

typedef unsigned long
ConstantGlobalBufferType[ ConstantMemoryUsage / sizeof(unsigned long) ];

enum { ConstantMemoryUseThreshold = 0x000200 /* 512 bytes */ };

KOKKOS_INLINE_FUNCTION static
CudaSpace::size_type warp_count( CudaSpace::size_type i )
{ return ( i + WarpIndexMask ) >> WarpIndexShift ; }

KOKKOS_INLINE_FUNCTION static
CudaSpace::size_type warp_align( CudaSpace::size_type i )
{
enum { Mask = ~CudaSpace::size_type( WarpIndexMask ) };
return ( i + WarpIndexMask ) & Mask ;
}
};

//----------------------------------------------------------------------------

CudaSpace::size_type cuda_internal_multiprocessor_count();
CudaSpace::size_type cuda_internal_maximum_warp_count();
CudaSpace::size_type cuda_internal_maximum_grid_count();
CudaSpace::size_type cuda_internal_maximum_shared_words();

CudaSpace::size_type cuda_internal_maximum_concurrent_block_count();

CudaSpace::size_type * cuda_internal_scratch_flags( const CudaSpace::size_type size );
CudaSpace::size_type * cuda_internal_scratch_space( const CudaSpace::size_type size );
CudaSpace::size_type * cuda_internal_scratch_unified( const CudaSpace::size_type size );

} // namespace Impl
} // namespace Kokkos

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

#if defined( __CUDACC__ )

/** \brief Access to constant memory on the device */
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE

__device__ __constant__
extern unsigned long kokkos_impl_cuda_constant_memory_buffer[] ;

#else

__device__ __constant__
unsigned long kokkos_impl_cuda_constant_memory_buffer[ Kokkos::Impl::CudaTraits::ConstantMemoryUsage / sizeof(unsigned long) ] ;

#endif

namespace Kokkos {
namespace Impl {
void* cuda_resize_scratch_space(std::int64_t bytes, bool force_shrink = false);
}
}

template< typename T >
inline
__device__
T * kokkos_impl_cuda_shared_memory()
{ extern __shared__ Kokkos::CudaSpace::size_type sh[]; return (T*) sh ; }

namespace Kokkos {
namespace Impl {

//----------------------------------------------------------------------------
// See section B.17 of Cuda C Programming Guide Version 3.2
// for discussion of
// __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
// function qualifier which could be used to improve performance.
//----------------------------------------------------------------------------
// Maximize L1 cache and minimize shared memory:
//   cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1 );
// For 2.0 capability: 48 KB L1 and 16 KB shared
//----------------------------------------------------------------------------

template< class DriverType>
__global__
static void cuda_parallel_launch_constant_memory()
{
const DriverType & driver =
*((const DriverType *) kokkos_impl_cuda_constant_memory_buffer );

driver();
}

template< class DriverType, unsigned int maxTperB, unsigned int minBperSM >
__global__
__launch_bounds__(maxTperB, minBperSM)
static void cuda_parallel_launch_constant_memory()
{
const DriverType & driver =
*((const DriverType *) kokkos_impl_cuda_constant_memory_buffer );

driver();
}

template< class DriverType>
__global__
static void cuda_parallel_launch_local_memory( const DriverType driver )
{
driver();
}

template< class DriverType, unsigned int maxTperB, unsigned int minBperSM >
__global__
__launch_bounds__(maxTperB, minBperSM)
static void cuda_parallel_launch_local_memory( const DriverType driver )
{
driver();
}

template < class DriverType
, class LaunchBounds = Kokkos::LaunchBounds<>
, bool Large = ( CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType) ) >
struct CudaParallelLaunch ;

template < class DriverType
, unsigned int MaxThreadsPerBlock
, unsigned int MinBlocksPerSM >
struct CudaParallelLaunch< DriverType
, Kokkos::LaunchBounds< MaxThreadsPerBlock
, MinBlocksPerSM >
, true >
{
inline
CudaParallelLaunch( const DriverType & driver
, const dim3 & grid
, const dim3 & block
, const int shmem
, const cudaStream_t stream = 0 )
{
if ( (grid.x != 0) && ( ( block.x * block.y * block.z ) != 0 ) ) {

if ( sizeof( Kokkos::Impl::CudaTraits::ConstantGlobalBufferType ) <
sizeof( DriverType ) ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: Functor is too large") );
}

// Fence before changing settings and copying closure
Kokkos::Cuda::fence();

if ( CudaTraits::SharedMemoryCapacity < shmem ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
}
#ifndef KOKKOS_ARCH_KEPLER
// On Kepler the L1 has no benefit since it doesn't cache reads
else {
CUDA_SAFE_CALL(
cudaFuncSetCacheConfig
( cuda_parallel_launch_constant_memory
< DriverType, MaxThreadsPerBlock, MinBlocksPerSM >
, ( shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1 )
) );
}
#endif

// Copy functor to constant memory on the device
cudaMemcpyToSymbol(
kokkos_impl_cuda_constant_memory_buffer, &driver, sizeof(DriverType) );

KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();

// Invoke the driver function on the device
cuda_parallel_launch_constant_memory
< DriverType, MaxThreadsPerBlock, MinBlocksPerSM >
<<< grid , block , shmem , stream >>>();

#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
CUDA_SAFE_CALL( cudaGetLastError() );
Kokkos::Cuda::fence();
#endif
}
}
};

template < class DriverType >
struct CudaParallelLaunch< DriverType
, Kokkos::LaunchBounds<>
, true >
{
inline
CudaParallelLaunch( const DriverType & driver
, const dim3 & grid
, const dim3 & block
, const int shmem
, const cudaStream_t stream = 0 )
{
if ( (grid.x != 0) && ( ( block.x * block.y * block.z ) != 0 ) ) {

if ( sizeof( Kokkos::Impl::CudaTraits::ConstantGlobalBufferType ) <
sizeof( DriverType ) ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: Functor is too large") );
}

// Fence before changing settings and copying closure
Kokkos::Cuda::fence();

if ( CudaTraits::SharedMemoryCapacity < shmem ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
}
#ifndef KOKKOS_ARCH_KEPLER
// On Kepler the L1 has no benefit since it doesn't cache reads
else {
CUDA_SAFE_CALL(
cudaFuncSetCacheConfig
( cuda_parallel_launch_constant_memory< DriverType >
, ( shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1 )
) );
}
#endif

// Copy functor to constant memory on the device
cudaMemcpyToSymbol(
kokkos_impl_cuda_constant_memory_buffer, &driver, sizeof(DriverType) );

KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();

// Invoke the driver function on the device
cuda_parallel_launch_constant_memory< DriverType >
<<< grid , block , shmem , stream >>>();

#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
CUDA_SAFE_CALL( cudaGetLastError() );
Kokkos::Cuda::fence();
#endif
}
}
};

template < class DriverType
, unsigned int MaxThreadsPerBlock
, unsigned int MinBlocksPerSM >
struct CudaParallelLaunch< DriverType
, Kokkos::LaunchBounds< MaxThreadsPerBlock
, MinBlocksPerSM >
, false >
{
inline
CudaParallelLaunch( const DriverType & driver
, const dim3 & grid
, const dim3 & block
, const int shmem
, const cudaStream_t stream = 0 )
{
if ( (grid.x != 0) && ( ( block.x * block.y * block.z ) != 0 ) ) {

if ( sizeof( Kokkos::Impl::CudaTraits::ConstantGlobalBufferType ) <
sizeof( DriverType ) ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: Functor is too large") );
}

if ( CudaTraits::SharedMemoryCapacity < shmem ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
}
#ifndef KOKKOS_ARCH_KEPLER
// On Kepler the L1 has no benefit since it doesn't cache reads
else {
CUDA_SAFE_CALL(
cudaFuncSetCacheConfig
( cuda_parallel_launch_local_memory
< DriverType, MaxThreadsPerBlock, MinBlocksPerSM >
, ( shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1 )
) );
}
#endif

KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();

// Invoke the driver function on the device
cuda_parallel_launch_local_memory
< DriverType, MaxThreadsPerBlock, MinBlocksPerSM >
<<< grid , block , shmem , stream >>>( driver );

#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
CUDA_SAFE_CALL( cudaGetLastError() );
Kokkos::Cuda::fence();
#endif
}
}
};

template < class DriverType >
struct CudaParallelLaunch< DriverType
, Kokkos::LaunchBounds<>
, false >
{
inline
CudaParallelLaunch( const DriverType & driver
, const dim3 & grid
, const dim3 & block
, const int shmem
, const cudaStream_t stream = 0 )
{
if ( (grid.x != 0) && ( ( block.x * block.y * block.z ) != 0 ) ) {

if ( sizeof( Kokkos::Impl::CudaTraits::ConstantGlobalBufferType ) <
sizeof( DriverType ) ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: Functor is too large") );
}

if ( CudaTraits::SharedMemoryCapacity < shmem ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
}
#ifndef KOKKOS_ARCH_KEPLER
// On Kepler the L1 has no benefit since it doesn't cache reads
else {
CUDA_SAFE_CALL(
cudaFuncSetCacheConfig
( cuda_parallel_launch_local_memory< DriverType >
, ( shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1 )
) );
}
#endif

KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();

// Invoke the driver function on the device
cuda_parallel_launch_local_memory< DriverType >
<<< grid , block , shmem , stream >>>( driver );

#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
CUDA_SAFE_CALL( cudaGetLastError() );
Kokkos::Cuda::fence();
#endif
}
}
};

//----------------------------------------------------------------------------

} // namespace Impl
} // namespace Kokkos

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

#endif /* defined( __CUDACC__ ) */
#endif /* defined( KOKKOS_ENABLE_CUDA ) */
#endif /* #ifndef KOKKOS_CUDAEXEC_HPP */
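The removed header dispatched between two launch mechanisms on functor size: small functors are passed to the kernel by value (local memory), while larger ones are staged through __constant__ memory via cudaMemcpyToSymbol. A reduced sketch of that dispatch idea, with illustrative threshold, buffer, and function names:

    // Illustrative only: functors at or under a threshold launch by value;
    // larger ones are copied to a __constant__ buffer first.
    __constant__ unsigned long functor_buffer[0x8000 / sizeof(unsigned long)];

    template <class Driver>
    __global__ void launch_local(const Driver driver) { driver(); }

    template <class Driver>
    __global__ void launch_constant() {
      const Driver& driver = *reinterpret_cast<const Driver*>(functor_buffer);
      driver();
    }

    template <class Driver>
    void launch(const Driver& driver, dim3 grid, dim3 block, int shmem, cudaStream_t s) {
      if (sizeof(Driver) <= 512) {               // small: pass by value
        launch_local<Driver><<<grid, block, shmem, s>>>(driver);
      } else {                                   // large: stage in constant memory
        cudaMemcpyToSymbol(functor_buffer, &driver, sizeof(Driver));
        launch_constant<Driver><<<grid, block, shmem, s>>>();
      }
    }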
@@ -55,7 +55,7 @@
#include <Kokkos_Cuda.hpp>
#include <Kokkos_CudaSpace.hpp>

#include <Cuda/Kokkos_Cuda_Internal.hpp>
//#include <Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp>
#include <impl/Kokkos_Error.hpp>

#if defined(KOKKOS_ENABLE_PROFILING)

@@ -183,7 +183,7 @@ void * CudaUVMSpace::allocate( const size_t arg_alloc_size ) const

enum { max_uvm_allocations = 65536 };

Cuda::fence();
Cuda::impl_static_fence();
if ( arg_alloc_size > 0 )
{
Kokkos::Impl::num_uvm_allocations++;

@@ -194,7 +194,7 @@ void * CudaUVMSpace::allocate( const size_t arg_alloc_size ) const

CUDA_SAFE_CALL( cudaMallocManaged( &ptr, arg_alloc_size , cudaMemAttachGlobal ) );
}
Cuda::fence();
Cuda::impl_static_fence();

return ptr ;
}

@@ -217,14 +217,14 @@ void CudaSpace::deallocate( void * const arg_alloc_ptr , const size_t /* arg_all

void CudaUVMSpace::deallocate( void * const arg_alloc_ptr , const size_t /* arg_alloc_size */ ) const
{
Cuda::fence();
Cuda::impl_static_fence();
try {
if ( arg_alloc_ptr != nullptr ) {
Kokkos::Impl::num_uvm_allocations--;
CUDA_SAFE_CALL( cudaFree( arg_alloc_ptr ) );
}
} catch(...) {}
Cuda::fence();
Cuda::impl_static_fence();
}

void CudaHostPinnedSpace::deallocate( void * const arg_alloc_ptr , const size_t /* arg_alloc_size */ ) const

@@ -390,7 +390,7 @@ SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::
{
#if defined(KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
Cuda::fence(); //Make sure I can access the label ...
Cuda::impl_static_fence(); //Make sure I can access the label ...
Kokkos::Profiling::deallocateData(
Kokkos::Profiling::SpaceHandle(Kokkos::CudaUVMSpace::name()),RecordBase::m_alloc_ptr->m_label,
data(),size());
@@ -0,0 +1,657 @@
/*
@HEADER
================================================================================

ORIGINAL LICENSE
----------------

Copyright (c) 2018, NVIDIA Corporation

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

================================================================================

LICENSE ASSOCIATED WITH SUBSEQUENT MODIFICATIONS
------------------------------------------------

// ************************************************************************
//
//   Kokkos v. 2.0
//   Copyright (2019) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
@HEADER
*/

#include <Kokkos_Macros.hpp>
#if defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA_ASM_ATOMICS)

#include <cassert>

#ifndef _SIMT_DETAILS_CONFIG
#define _SIMT_DETAILS_CONFIG

namespace Kokkos {
namespace Impl {

#ifndef __simt_scope
// Modification: Kokkos GPU atomics should default to `gpu` scope
#define __simt_scope "gpu"
#endif

#define __simt_fence_signal_() asm volatile("":::"memory")
#define __simt_fence_sc_() asm volatile("fence.sc." __simt_scope ";":::"memory")
#define __simt_fence_() asm volatile("fence." __simt_scope ";":::"memory")

#define __simt_load_acquire_8_as_32(ptr,ret) asm volatile("ld.acquire." __simt_scope ".b8 %0, [%1];" : "=r"(ret) : "l"(ptr) : "memory")
#define __simt_load_relaxed_8_as_32(ptr,ret) asm volatile("ld.relaxed." __simt_scope ".b8 %0, [%1];" : "=r"(ret) : "l"(ptr) : "memory")
#define __simt_store_release_8_as_32(ptr,desired) asm volatile("st.release." __simt_scope ".b8 [%0], %1;" :: "l"(ptr), "r"(desired) : "memory")
#define __simt_store_relaxed_8_as_32(ptr,desired) asm volatile("st.relaxed." __simt_scope ".b8 [%0], %1;" :: "l"(ptr), "r"(desired) : "memory")

#define __simt_load_acquire_16(ptr,ret) asm volatile("ld.acquire." __simt_scope ".b16 %0, [%1];" : "=h"(ret) : "l"(ptr) : "memory")
#define __simt_load_relaxed_16(ptr,ret) asm volatile("ld.relaxed." __simt_scope ".b16 %0, [%1];" : "=h"(ret) : "l"(ptr) : "memory")
#define __simt_store_release_16(ptr,desired) asm volatile("st.release." __simt_scope ".b16 [%0], %1;" :: "l"(ptr), "h"(desired) : "memory")
#define __simt_store_relaxed_16(ptr,desired) asm volatile("st.relaxed." __simt_scope ".b16 [%0], %1;" :: "l"(ptr), "h"(desired) : "memory")

#define __simt_load_acquire_32(ptr,ret) asm volatile("ld.acquire." __simt_scope ".b32 %0, [%1];" : "=r"(ret) : "l"(ptr) : "memory")
#define __simt_load_relaxed_32(ptr,ret) asm volatile("ld.relaxed." __simt_scope ".b32 %0, [%1];" : "=r"(ret) : "l"(ptr) : "memory")
#define __simt_store_release_32(ptr,desired) asm volatile("st.release." __simt_scope ".b32 [%0], %1;" :: "l"(ptr), "r"(desired) : "memory")
#define __simt_store_relaxed_32(ptr,desired) asm volatile("st.relaxed." __simt_scope ".b32 [%0], %1;" :: "l"(ptr), "r"(desired) : "memory")
#define __simt_exch_release_32(ptr,old,desired) asm volatile("atom.exch.release." __simt_scope ".b32 %0, [%1], %2;" : "=r"(old) : "l"(ptr), "r"(desired) : "memory")
#define __simt_exch_acquire_32(ptr,old,desired) asm volatile("atom.exch.acquire." __simt_scope ".b32 %0, [%1], %2;" : "=r"(old) : "l"(ptr), "r"(desired) : "memory")
#define __simt_exch_acq_rel_32(ptr,old,desired) asm volatile("atom.exch.acq_rel." __simt_scope ".b32 %0, [%1], %2;" : "=r"(old) : "l"(ptr), "r"(desired) : "memory")
#define __simt_exch_relaxed_32(ptr,old,desired) asm volatile("atom.exch.relaxed." __simt_scope ".b32 %0, [%1], %2;" : "=r"(old) : "l"(ptr), "r"(desired) : "memory")
#define __simt_cas_release_32(ptr,old,expected,desired) asm volatile("atom.cas.release." __simt_scope ".b32 %0, [%1], %2, %3;" : "=r"(old) : "l"(ptr), "r"(expected), "r"(desired) : "memory")
#define __simt_cas_acquire_32(ptr,old,expected,desired) asm volatile("atom.cas.acquire." __simt_scope ".b32 %0, [%1], %2, %3;" : "=r"(old) : "l"(ptr), "r"(expected), "r"(desired) : "memory")
#define __simt_cas_acq_rel_32(ptr,old,expected,desired) asm volatile("atom.cas.acq_rel." __simt_scope ".b32 %0, [%1], %2, %3;" : "=r"(old) : "l"(ptr), "r"(expected), "r"(desired) : "memory")
#define __simt_cas_relaxed_32(ptr,old,expected,desired) asm volatile("atom.cas.relaxed." __simt_scope ".b32 %0, [%1], %2, %3;" : "=r"(old) : "l"(ptr), "r"(expected), "r"(desired) : "memory")
#define __simt_add_release_32(ptr,old,addend) asm volatile("atom.add.release." __simt_scope ".u32 %0, [%1], %2;" : "=r"(old) : "l"(ptr), "r"(addend) : "memory")
#define __simt_add_acquire_32(ptr,old,addend) asm volatile("atom.add.acquire." __simt_scope ".u32 %0, [%1], %2;" : "=r"(old) : "l"(ptr), "r"(addend) : "memory")
#define __simt_add_acq_rel_32(ptr,old,addend) asm volatile("atom.add.acq_rel." __simt_scope ".u32 %0, [%1], %2;" : "=r"(old) : "l"(ptr), "r"(addend) : "memory")
#define __simt_add_relaxed_32(ptr,old,addend) asm volatile("atom.add.relaxed." __simt_scope ".u32 %0, [%1], %2;" : "=r"(old) : "l"(ptr), "r"(addend) : "memory")
#define __simt_and_release_32(ptr,old,andend) asm volatile("atom.and.release." __simt_scope ".b32 %0, [%1], %2;" : "=r"(old) : "l"(ptr), "r"(andend) : "memory")
#define __simt_and_acquire_32(ptr,old,andend) asm volatile("atom.and.acquire." __simt_scope ".b32 %0, [%1], %2;" : "=r"(old) : "l"(ptr), "r"(andend) : "memory")
#define __simt_and_acq_rel_32(ptr,old,andend) asm volatile("atom.and.acq_rel." __simt_scope ".b32 %0, [%1], %2;" : "=r"(old) : "l"(ptr), "r"(andend) : "memory")
#define __simt_and_relaxed_32(ptr,old,andend) asm volatile("atom.and.relaxed." __simt_scope ".b32 %0, [%1], %2;" : "=r"(old) : "l"(ptr), "r"(andend) : "memory")
#define __simt_or_release_32(ptr,old,orend) asm volatile("atom.or.release." __simt_scope ".b32 %0, [%1], %2;" : "=r"(old) : "l"(ptr), "r"(orend) : "memory")
#define __simt_or_acquire_32(ptr,old,orend) asm volatile("atom.or.acquire." __simt_scope ".b32 %0, [%1], %2;" : "=r"(old) : "l"(ptr), "r"(orend) : "memory")
#define __simt_or_acq_rel_32(ptr,old,orend) asm volatile("atom.or.acq_rel." __simt_scope ".b32 %0, [%1], %2;" : "=r"(old) : "l"(ptr), "r"(orend) : "memory")
#define __simt_or_relaxed_32(ptr,old,orend) asm volatile("atom.or.relaxed." __simt_scope ".b32 %0, [%1], %2;" : "=r"(old) : "l"(ptr), "r"(orend) : "memory")
#define __simt_xor_release_32(ptr,old,xorend) asm volatile("atom.xor.release." __simt_scope ".b32 %0, [%1], %2;" : "=r"(old) : "l"(ptr), "r"(xorend) : "memory")
#define __simt_xor_acquire_32(ptr,old,xorend) asm volatile("atom.xor.acquire." __simt_scope ".b32 %0, [%1], %2;" : "=r"(old) : "l"(ptr), "r"(xorend) : "memory")
#define __simt_xor_acq_rel_32(ptr,old,xorend) asm volatile("atom.xor.acq_rel." __simt_scope ".b32 %0, [%1], %2;" : "=r"(old) : "l"(ptr), "r"(xorend) : "memory")
#define __simt_xor_relaxed_32(ptr,old,xorend) asm volatile("atom.xor.relaxed." __simt_scope ".b32 %0, [%1], %2;" : "=r"(old) : "l"(ptr), "r"(xorend) : "memory")

#define __simt_load_acquire_64(ptr,ret) asm volatile("ld.acquire." __simt_scope ".b64 %0, [%1];" : "=l"(ret) : "l"(ptr) : "memory")
#define __simt_load_relaxed_64(ptr,ret) asm volatile("ld.relaxed." __simt_scope ".b64 %0, [%1];" : "=l"(ret) : "l"(ptr) : "memory")
#define __simt_store_release_64(ptr,desired) asm volatile("st.release." __simt_scope ".b64 [%0], %1;" :: "l"(ptr), "l"(desired) : "memory")
#define __simt_store_relaxed_64(ptr,desired) asm volatile("st.relaxed." __simt_scope ".b64 [%0], %1;" :: "l"(ptr), "l"(desired) : "memory")
#define __simt_exch_release_64(ptr,old,desired) asm volatile("atom.exch.release." __simt_scope ".b64 %0, [%1], %2;" : "=l"(old) : "l"(ptr), "l"(desired) : "memory")
#define __simt_exch_acquire_64(ptr,old,desired) asm volatile("atom.exch.acquire." __simt_scope ".b64 %0, [%1], %2;" : "=l"(old) : "l"(ptr), "l"(desired) : "memory")
#define __simt_exch_acq_rel_64(ptr,old,desired) asm volatile("atom.exch.acq_rel." __simt_scope ".b64 %0, [%1], %2;" : "=l"(old) : "l"(ptr), "l"(desired) : "memory")
#define __simt_exch_relaxed_64(ptr,old,desired) asm volatile("atom.exch.relaxed." __simt_scope ".b64 %0, [%1], %2;" : "=l"(old) : "l"(ptr), "l"(desired) : "memory")
#define __simt_cas_release_64(ptr,old,expected,desired) asm volatile("atom.cas.release." __simt_scope ".b64 %0, [%1], %2, %3;" : "=l"(old) : "l"(ptr), "l"(expected), "l"(desired) : "memory")
#define __simt_cas_acquire_64(ptr,old,expected,desired) asm volatile("atom.cas.acquire." __simt_scope ".b64 %0, [%1], %2, %3;" : "=l"(old) : "l"(ptr), "l"(expected), "l"(desired) : "memory")
#define __simt_cas_acq_rel_64(ptr,old,expected,desired) asm volatile("atom.cas.acq_rel." __simt_scope ".b64 %0, [%1], %2, %3;" : "=l"(old) : "l"(ptr), "l"(expected), "l"(desired) : "memory")
#define __simt_cas_relaxed_64(ptr,old,expected,desired) asm volatile("atom.cas.relaxed." __simt_scope ".b64 %0, [%1], %2, %3;" : "=l"(old) : "l"(ptr), "l"(expected), "l"(desired) : "memory")
#define __simt_add_release_64(ptr,old,addend) asm volatile("atom.add.release." __simt_scope ".u64 %0, [%1], %2;" : "=l"(old) : "l"(ptr), "l"(addend) : "memory")
#define __simt_add_acquire_64(ptr,old,addend) asm volatile("atom.add.acquire." __simt_scope ".u64 %0, [%1], %2;" : "=l"(old) : "l"(ptr), "l"(addend) : "memory")
#define __simt_add_acq_rel_64(ptr,old,addend) asm volatile("atom.add.acq_rel." __simt_scope ".u64 %0, [%1], %2;" : "=l"(old) : "l"(ptr), "l"(addend) : "memory")
#define __simt_add_relaxed_64(ptr,old,addend) asm volatile("atom.add.relaxed." __simt_scope ".u64 %0, [%1], %2;" : "=l"(old) : "l"(ptr), "l"(addend) : "memory")
#define __simt_and_release_64(ptr,old,andend) asm volatile("atom.and.release." __simt_scope ".b64 %0, [%1], %2;" : "=l"(old) : "l"(ptr), "l"(andend) : "memory")
#define __simt_and_acquire_64(ptr,old,andend) asm volatile("atom.and.acquire." __simt_scope ".b64 %0, [%1], %2;" : "=l"(old) : "l"(ptr), "l"(andend) : "memory")
#define __simt_and_acq_rel_64(ptr,old,andend) asm volatile("atom.and.acq_rel." __simt_scope ".b64 %0, [%1], %2;" : "=l"(old) : "l"(ptr), "l"(andend) : "memory")
#define __simt_and_relaxed_64(ptr,old,andend) asm volatile("atom.and.relaxed." __simt_scope ".b64 %0, [%1], %2;" : "=l"(old) : "l"(ptr), "l"(andend) : "memory")
#define __simt_or_release_64(ptr,old,orend) asm volatile("atom.or.release." __simt_scope ".b64 %0, [%1], %2;" : "=l"(old) : "l"(ptr), "l"(orend) : "memory")
#define __simt_or_acquire_64(ptr,old,orend) asm volatile("atom.or.acquire." __simt_scope ".b64 %0, [%1], %2;" : "=l"(old) : "l"(ptr), "l"(orend) : "memory")
#define __simt_or_acq_rel_64(ptr,old,orend) asm volatile("atom.or.acq_rel." __simt_scope ".b64 %0, [%1], %2;" : "=l"(old) : "l"(ptr), "l"(orend) : "memory")
#define __simt_or_relaxed_64(ptr,old,orend) asm volatile("atom.or.relaxed." __simt_scope ".b64 %0, [%1], %2;" : "=l"(old) : "l"(ptr), "l"(orend) : "memory")
#define __simt_xor_release_64(ptr,old,xorend) asm volatile("atom.xor.release." __simt_scope ".b64 %0, [%1], %2;" : "=l"(old) : "l"(ptr), "l"(xorend) : "memory")
#define __simt_xor_acquire_64(ptr,old,xorend) asm volatile("atom.xor.acquire." __simt_scope ".b64 %0, [%1], %2;" : "=l"(old) : "l"(ptr), "l"(xorend) : "memory")
#define __simt_xor_acq_rel_64(ptr,old,xorend) asm volatile("atom.xor.acq_rel." __simt_scope ".b64 %0, [%1], %2;" : "=l"(old) : "l"(ptr), "l"(xorend) : "memory")
#define __simt_xor_relaxed_64(ptr,old,xorend) asm volatile("atom.xor.relaxed." __simt_scope ".b64 %0, [%1], %2;" : "=l"(old) : "l"(ptr), "l"(xorend) : "memory")

#define __simt_nanosleep(timeout) asm volatile("nanosleep.u32 %0;" :: "r"(unsigned(timeout)) : )

/*
definitions
*/

#ifndef __GCC_ATOMIC_BOOL_LOCK_FREE
#define __GCC_ATOMIC_BOOL_LOCK_FREE 2
#define __GCC_ATOMIC_CHAR_LOCK_FREE 2
#define __GCC_ATOMIC_CHAR16_T_LOCK_FREE 2
#define __GCC_ATOMIC_CHAR32_T_LOCK_FREE 2
#define __GCC_ATOMIC_WCHAR_T_LOCK_FREE 2
#define __GCC_ATOMIC_SHORT_LOCK_FREE 2
#define __GCC_ATOMIC_INT_LOCK_FREE 2
#define __GCC_ATOMIC_LONG_LOCK_FREE 2
#define __GCC_ATOMIC_LLONG_LOCK_FREE 2
#define __GCC_ATOMIC_POINTER_LOCK_FREE 2
#endif

#ifndef __ATOMIC_RELAXED
#define __ATOMIC_RELAXED 0
#define __ATOMIC_CONSUME 1
#define __ATOMIC_ACQUIRE 2
#define __ATOMIC_RELEASE 3
#define __ATOMIC_ACQ_REL 4
#define __ATOMIC_SEQ_CST 5
#endif

inline __device__ int __stronger_order_simt_(int a, int b) {
  if (b == __ATOMIC_SEQ_CST) return __ATOMIC_SEQ_CST;
  if (b == __ATOMIC_RELAXED) return a;
  switch (a) {
    case __ATOMIC_SEQ_CST:
    case __ATOMIC_ACQ_REL: return a;
    case __ATOMIC_CONSUME:
    case __ATOMIC_ACQUIRE: if (b != __ATOMIC_ACQUIRE) return __ATOMIC_ACQ_REL; else return __ATOMIC_ACQUIRE;
    case __ATOMIC_RELEASE: if (b != __ATOMIC_RELEASE) return __ATOMIC_ACQ_REL; else return __ATOMIC_RELEASE;
    case __ATOMIC_RELAXED: return b;
    default: assert(0);
  }
  return __ATOMIC_SEQ_CST;
}
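
// __stronger_order_simt_ merges the success and failure orders of a
// compare-exchange into the weakest single order that is at least as strong
// as both. A few spot checks, as a device-side sketch (illustrative only,
// not part of the patch; compiles under the same sm_70+ / ASM-atomics
// assumptions as the rest of this header):
//
//   __global__ void check_stronger_order() {
//     // release success + acquire failure must upgrade to acq_rel
//     assert(__stronger_order_simt_(__ATOMIC_RELEASE, __ATOMIC_ACQUIRE) == __ATOMIC_ACQ_REL);
//     // a seq_cst failure order dominates everything
//     assert(__stronger_order_simt_(__ATOMIC_RELAXED, __ATOMIC_SEQ_CST) == __ATOMIC_SEQ_CST);
//     // a relaxed failure order leaves the success order unchanged
//     assert(__stronger_order_simt_(__ATOMIC_ACQUIRE, __ATOMIC_RELAXED) == __ATOMIC_ACQUIRE);
//   }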

/*
base
*/

#define DO__atomic_load_simt_(bytes, bits) \
template<class type, typename std::enable_if<sizeof(type)==bytes, int>::type = 0> \
void __device__ __atomic_load_simt_ (const type *ptr, type *ret, int memorder) { \
  int##bits##_t tmp = 0; \
  switch (memorder) { \
    case __ATOMIC_SEQ_CST: __simt_fence_sc_(); \
    case __ATOMIC_CONSUME: \
    case __ATOMIC_ACQUIRE: __simt_load_acquire_##bits(ptr, tmp); break; \
    case __ATOMIC_RELAXED: __simt_load_relaxed_##bits(ptr, tmp); break; \
    default: assert(0); \
  } \
  memcpy(ret, &tmp, bytes); \
}
DO__atomic_load_simt_(1,32)
DO__atomic_load_simt_(2,16)
DO__atomic_load_simt_(4,32)
DO__atomic_load_simt_(8,64)

template<class type>
type __device__ __atomic_load_n_simt_(const type *ptr, int memorder) {
  type ret;
  __atomic_load_simt_(ptr, &ret, memorder);
  return ret;
}

#define DO__atomic_store_simt_(bytes, bits) \
template<class type, typename std::enable_if<sizeof(type)==bytes, int>::type = 0> \
void __device__ __atomic_store_simt_ (type *ptr, type *val, int memorder) { \
  int##bits##_t tmp = 0; \
  memcpy(&tmp, val, bytes); \
  switch (memorder) { \
    case __ATOMIC_RELEASE: __simt_store_release_##bits(ptr, tmp); break; \
    case __ATOMIC_SEQ_CST: __simt_fence_sc_(); \
    case __ATOMIC_RELAXED: __simt_store_relaxed_##bits(ptr, tmp); break; \
    default: assert(0); \
  } \
}
DO__atomic_store_simt_(1,32)
DO__atomic_store_simt_(2,16)
DO__atomic_store_simt_(4,32)
DO__atomic_store_simt_(8,64)

template<class type>
void __device__ __atomic_store_n_simt_(type *ptr, type val, int memorder) {
  __atomic_store_simt_(ptr, &val, memorder);
}

#define DO__atomic_compare_exchange_simt_(bytes, bits) \
template<class type, typename std::enable_if<sizeof(type)==bytes, int>::type = 0> \
bool __device__ __atomic_compare_exchange_simt_ (type *ptr, type *expected, const type *desired, bool, int success_memorder, int failure_memorder) { \
  int##bits##_t tmp = 0, old = 0, old_tmp; \
  memcpy(&tmp, desired, bytes); \
  memcpy(&old, expected, bytes); \
  old_tmp = old; \
  switch (__stronger_order_simt_(success_memorder, failure_memorder)) { \
    case __ATOMIC_SEQ_CST: __simt_fence_sc_(); \
    case __ATOMIC_CONSUME: \
    case __ATOMIC_ACQUIRE: __simt_cas_acquire_##bits(ptr, old, old_tmp, tmp); break; \
    case __ATOMIC_ACQ_REL: __simt_cas_acq_rel_##bits(ptr, old, old_tmp, tmp); break; \
    case __ATOMIC_RELEASE: __simt_cas_release_##bits(ptr, old, old_tmp, tmp); break; \
    case __ATOMIC_RELAXED: __simt_cas_relaxed_##bits(ptr, old, old_tmp, tmp); break; \
    default: assert(0); \
  } \
  bool const ret = old == old_tmp; \
  memcpy(expected, &old, bytes); \
  return ret; \
}
DO__atomic_compare_exchange_simt_(4, 32)
DO__atomic_compare_exchange_simt_(8, 64)

template<class type, typename std::enable_if<sizeof(type) <= 2, int>::type = 0>
bool __device__ __atomic_compare_exchange_simt_(type *ptr, type *expected, const type *desired, bool, int success_memorder, int failure_memorder) {

  using R = typename std::conditional<std::is_volatile<type>::value, volatile uint32_t, uint32_t>::type;
  auto const aligned = (R*)((intptr_t)ptr & ~(sizeof(uint32_t) - 1));
  auto const offset = uint32_t((intptr_t)ptr & (sizeof(uint32_t) - 1)) * 8;
  auto const mask = ((1 << sizeof(type)*8) - 1) << offset;

  uint32_t old = *expected << offset, old_value;
  while (1) {
    old_value = (old & mask) >> offset;
    if (old_value != *expected)
      break;
    uint32_t const attempt = (old & ~mask) | (*desired << offset);
    if (__atomic_compare_exchange_simt_ (aligned, &old, &attempt, true, success_memorder, failure_memorder))
      return true;
  }
  *expected = old_value;
  return false;
}
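
// The sub-word overload above widens a 1- or 2-byte compare-exchange to the
// containing aligned 32-bit word: it masks out the narrow lane, splices the
// desired bits in, and retries the full-word CAS until it succeeds or the
// lane no longer matches. The same trick, sketched on the host with
// std::atomic (illustrative only; assumes little-endian byte order):
//
//   #include <atomic>
//   #include <cstdint>
//   bool cas_byte(std::atomic<uint32_t>& word, unsigned byte, uint8_t& expected, uint8_t desired) {
//     const uint32_t shift = byte * 8, mask = 0xFFu << shift;
//     uint32_t old = word.load(std::memory_order_relaxed);
//     for (;;) {
//       const uint8_t old_byte = uint8_t((old & mask) >> shift);
//       if (old_byte != expected) { expected = old_byte; return false; }
//       const uint32_t attempt = (old & ~mask) | (uint32_t(desired) << shift);
//       if (word.compare_exchange_weak(old, attempt)) return true;
//       // on failure `old` was refreshed; re-check the lane and retry
//     }
//   }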

template<class type>
bool __device__ __atomic_compare_exchange_n_simt_(type *ptr, type *expected, type desired, bool weak, int success_memorder, int failure_memorder) {
  return __atomic_compare_exchange_simt_(ptr, expected, &desired, weak, success_memorder, failure_memorder);
}

#define DO__atomic_exchange_simt_(bytes, bits) \
template<class type, typename std::enable_if<sizeof(type)==bytes, int>::type = 0> \
void __device__ __atomic_exchange_simt_ (type *ptr, type *val, type *ret, int memorder) { \
  int##bits##_t tmp = 0; \
  memcpy(&tmp, val, bytes); \
  switch (memorder) { \
    case __ATOMIC_SEQ_CST: __simt_fence_sc_(); \
    case __ATOMIC_CONSUME: \
    case __ATOMIC_ACQUIRE: __simt_exch_acquire_##bits(ptr, tmp, tmp); break; \
    case __ATOMIC_ACQ_REL: __simt_exch_acq_rel_##bits(ptr, tmp, tmp); break; \
    case __ATOMIC_RELEASE: __simt_exch_release_##bits(ptr, tmp, tmp); break; \
    case __ATOMIC_RELAXED: __simt_exch_relaxed_##bits(ptr, tmp, tmp); break; \
    default: assert(0); \
  } \
  memcpy(ret, &tmp, bytes); \
}
DO__atomic_exchange_simt_(4,32)
DO__atomic_exchange_simt_(8,64)

template<class type, typename std::enable_if<sizeof(type)<=2, int>::type = 0>
void __device__ __atomic_exchange_simt_ (type *ptr, type *val, type *ret, int memorder) {

  type expected = __atomic_load_n_simt_(ptr, __ATOMIC_RELAXED);
  while(!__atomic_compare_exchange_simt_(ptr, &expected, val, true, memorder, memorder))
    ;
  *ret = expected;
}

template<class type>
type __device__ __atomic_exchange_n_simt_(type *ptr, type val, int memorder) {
  type ret;
  __atomic_exchange_simt_(ptr, &val, &ret, memorder);
  return ret;
}

#define DO__atomic_fetch_add_simt_(bytes, bits) \
template<class type, class delta, typename std::enable_if<sizeof(type)==bytes, int>::type = 0> \
type __device__ __atomic_fetch_add_simt_ (type *ptr, delta val, int memorder) { \
  type ret; \
  switch (memorder) { \
    case __ATOMIC_SEQ_CST: __simt_fence_sc_(); \
    case __ATOMIC_CONSUME: \
    case __ATOMIC_ACQUIRE: __simt_add_acquire_##bits(ptr, ret, val); break; \
    case __ATOMIC_ACQ_REL: __simt_add_acq_rel_##bits(ptr, ret, val); break; \
    case __ATOMIC_RELEASE: __simt_add_release_##bits(ptr, ret, val); break; \
    case __ATOMIC_RELAXED: __simt_add_relaxed_##bits(ptr, ret, val); break; \
    default: assert(0); \
  } \
  return ret; \
}
DO__atomic_fetch_add_simt_(4, 32)
DO__atomic_fetch_add_simt_(8, 64)

template<class type, class delta, typename std::enable_if<sizeof(type)<=2, int>::type = 0>
type __device__ __atomic_fetch_add_simt_ (type *ptr, delta val, int memorder) {

  type expected = __atomic_load_n_simt_(ptr, __ATOMIC_RELAXED);
  type const desired = expected + val;
  while(!__atomic_compare_exchange_simt_(ptr, &expected, &desired, true, memorder, memorder))
    ;
  return expected;
}

#define DO__atomic_fetch_sub_simt_(bytes, bits) \
template<class type, class delta, typename std::enable_if<sizeof(type)==bytes, int>::type = 0> \
type __device__ __atomic_fetch_sub_simt_ (type *ptr, delta val, int memorder) { \
  type ret; \
  switch (memorder) { \
    case __ATOMIC_SEQ_CST: __simt_fence_sc_(); \
    case __ATOMIC_CONSUME: \
    case __ATOMIC_ACQUIRE: __simt_add_acquire_##bits(ptr, ret, -val); break; \
    case __ATOMIC_ACQ_REL: __simt_add_acq_rel_##bits(ptr, ret, -val); break; \
    case __ATOMIC_RELEASE: __simt_add_release_##bits(ptr, ret, -val); break; \
    case __ATOMIC_RELAXED: __simt_add_relaxed_##bits(ptr, ret, -val); break; \
    default: assert(0); \
  } \
  return ret; \
}
DO__atomic_fetch_sub_simt_(4,32)
DO__atomic_fetch_sub_simt_(8,64)

template<class type, class delta, typename std::enable_if<sizeof(type)<=2, int>::type = 0>
type __device__ __atomic_fetch_sub_simt_ (type *ptr, delta val, int memorder) {

  type expected = __atomic_load_n_simt_(ptr, __ATOMIC_RELAXED);
  type const desired = expected - val;
  while(!__atomic_compare_exchange_simt_(ptr, &expected, &desired, true, memorder, memorder))
    ;
  return expected;
}

#define DO__atomic_fetch_and_simt_(bytes, bits) \
template<class type, typename std::enable_if<sizeof(type)==bytes, int>::type = 0> \
type __device__ __atomic_fetch_and_simt_ (type *ptr, type val, int memorder) { \
  type ret; \
  switch (memorder) { \
    case __ATOMIC_SEQ_CST: __simt_fence_sc_(); \
    case __ATOMIC_CONSUME: \
    case __ATOMIC_ACQUIRE: __simt_and_acquire_##bits(ptr, ret, val); break; \
    case __ATOMIC_ACQ_REL: __simt_and_acq_rel_##bits(ptr, ret, val); break; \
    case __ATOMIC_RELEASE: __simt_and_release_##bits(ptr, ret, val); break; \
    case __ATOMIC_RELAXED: __simt_and_relaxed_##bits(ptr, ret, val); break; \
    default: assert(0); \
  } \
  return ret; \
}
DO__atomic_fetch_and_simt_(4,32)
DO__atomic_fetch_and_simt_(8,64)

template<class type, class delta, typename std::enable_if<sizeof(type)<=2, int>::type = 0>
type __device__ __atomic_fetch_and_simt_ (type *ptr, delta val, int memorder) {

  type expected = __atomic_load_n_simt_(ptr, __ATOMIC_RELAXED);
  type const desired = expected & val;
  while(!__atomic_compare_exchange_simt_(ptr, &expected, &desired, true, memorder, memorder))
    ;
  return expected;
}

#define DO__atomic_fetch_xor_simt_(bytes, bits) \
template<class type, typename std::enable_if<sizeof(type)==bytes, int>::type = 0> \
type __device__ __atomic_fetch_xor_simt_ (type *ptr, type val, int memorder) { \
  type ret; \
  switch (memorder) { \
    case __ATOMIC_SEQ_CST: __simt_fence_sc_(); \
    case __ATOMIC_CONSUME: \
    case __ATOMIC_ACQUIRE: __simt_xor_acquire_##bits(ptr, ret, val); break; \
    case __ATOMIC_ACQ_REL: __simt_xor_acq_rel_##bits(ptr, ret, val); break; \
    case __ATOMIC_RELEASE: __simt_xor_release_##bits(ptr, ret, val); break; \
    case __ATOMIC_RELAXED: __simt_xor_relaxed_##bits(ptr, ret, val); break; \
    default: assert(0); \
  } \
  return ret; \
}
DO__atomic_fetch_xor_simt_(4,32)
DO__atomic_fetch_xor_simt_(8,64)

template<class type, class delta, typename std::enable_if<sizeof(type)<=2, int>::type = 0>
type __device__ __atomic_fetch_xor_simt_ (type *ptr, delta val, int memorder) {

  type expected = __atomic_load_n_simt_(ptr, __ATOMIC_RELAXED);
  type const desired = expected ^ val;
  while(!__atomic_compare_exchange_simt_(ptr, &expected, &desired, true, memorder, memorder))
    ;
  return expected;
}

#define DO__atomic_fetch_or_simt_(bytes, bits) \
template<class type, typename std::enable_if<sizeof(type)==bytes, int>::type = 0> \
type __device__ __atomic_fetch_or_simt_ (type *ptr, type val, int memorder) { \
  type ret; \
  switch (memorder) { \
    case __ATOMIC_SEQ_CST: __simt_fence_sc_(); \
    case __ATOMIC_CONSUME: \
    case __ATOMIC_ACQUIRE: __simt_or_acquire_##bits(ptr, ret, val); break; \
    case __ATOMIC_ACQ_REL: __simt_or_acq_rel_##bits(ptr, ret, val); break; \
    case __ATOMIC_RELEASE: __simt_or_release_##bits(ptr, ret, val); break; \
    case __ATOMIC_RELAXED: __simt_or_relaxed_##bits(ptr, ret, val); break; \
    default: assert(0); \
  } \
  return ret; \
}
DO__atomic_fetch_or_simt_(4,32)
DO__atomic_fetch_or_simt_(8,64)

template<class type, class delta, typename std::enable_if<sizeof(type)<=2, int>::type = 0>
type __device__ __atomic_fetch_or_simt_ (type *ptr, delta val, int memorder) {

  type expected = __atomic_load_n_simt_(ptr, __ATOMIC_RELAXED);
  type const desired = expected | val;
  while(!__atomic_compare_exchange_simt_(ptr, &expected, &desired, true, memorder, memorder))
    ;
  return expected;
}

template<class type>
inline bool __device__ __atomic_test_and_set_simt_(type *ptr, int memorder) {
  return __atomic_exchange_n_simt_((char*)ptr, (char)1, memorder) == 1;
}
template<class type>
inline void __device__ __atomic_clear_simt_(type *ptr, int memorder) {
  return __atomic_store_n_simt_((char*)ptr, (char)0, memorder);
}

inline constexpr __device__ bool __atomic_always_lock_free_simt_ (size_t size, void *) {
  return size <= 8;
}
inline __device__ bool __atomic_is_lock_free_simt_(size_t size, void * ptr) {
  return __atomic_always_lock_free_simt_(size, ptr);
}

/*
fences
*/

inline void __device__ __atomic_thread_fence_simt(int memorder) {
  switch (memorder) {
    case __ATOMIC_SEQ_CST: __simt_fence_sc_(); break;
    case __ATOMIC_CONSUME:
    case __ATOMIC_ACQUIRE:
    case __ATOMIC_ACQ_REL:
    case __ATOMIC_RELEASE: __simt_fence_(); break;
    case __ATOMIC_RELAXED: break;
    default: assert(0);
  }
}
inline void __device__ __atomic_signal_fence_simt(int memorder) {
  __atomic_thread_fence_simt(memorder);
}
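
// With the primitives above, a release/acquire hand-off between device
// threads can avoid a full __threadfence(). A minimal sketch (illustrative
// only; assumes sm_70+ and KOKKOS_ENABLE_CUDA_ASM_ATOMICS, like the rest of
// this header):
//
//   __device__ int payload;
//   __device__ int flag;   // 0 = empty, 1 = full
//
//   __device__ void producer() {
//     payload = 42;                                          // plain store
//     __atomic_store_n_simt_(&flag, 1, __ATOMIC_RELEASE);    // publish
//   }
//
//   __device__ void consumer() {
//     while (__atomic_load_n_simt_(&flag, __ATOMIC_ACQUIRE) != 1)
//       ;                                                    // spin until published
//     // the acquire pairs with the release: payload == 42 is now visible
//   }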

/*
non-volatile
*/

template<class type> type __device__ __atomic_load_n_simt(const type *ptr, int memorder) {
  return __atomic_load_n_simt_(const_cast<const type*>(ptr), memorder);
}
template<class type> void __device__ __atomic_load_simt(const type *ptr, type *ret, int memorder) {
  __atomic_load_simt_(const_cast<const type*>(ptr), ret, memorder);
}
template<class type> void __device__ __atomic_store_n_simt(type *ptr, type val, int memorder) {
  __atomic_store_n_simt_(const_cast<type*>(ptr), val, memorder);
}
template<class type> void __device__ __atomic_store_simt(type *ptr, type *val, int memorder) {
  __atomic_store_simt_(const_cast<type*>(ptr), val, memorder);
}
template<class type> type __device__ __atomic_exchange_n_simt(type *ptr, type val, int memorder) {
  return __atomic_exchange_n_simt_(const_cast<type*>(ptr), val, memorder);
}
template<class type> void __device__ __atomic_exchange_simt(type *ptr, type *val, type *ret, int memorder) {
  __atomic_exchange_simt_(const_cast<type*>(ptr), val, ret, memorder);
}
template<class type> bool __device__ __atomic_compare_exchange_n_simt(type *ptr, type *expected, type desired, bool weak, int success_memorder, int failure_memorder) {
  return __atomic_compare_exchange_n_simt_(const_cast<type*>(ptr), expected, desired, weak, success_memorder, failure_memorder);
}
template<class type> bool __device__ __atomic_compare_exchange_simt(type *ptr, type *expected, type *desired, bool weak, int success_memorder, int failure_memorder) {
  return __atomic_compare_exchange_simt_(const_cast<type*>(ptr), expected, desired, weak, success_memorder, failure_memorder);
}
template<class type, class delta> type __device__ __atomic_fetch_add_simt(type *ptr, delta val, int memorder) {
  return __atomic_fetch_add_simt_(const_cast<type*>(ptr), val, memorder);
}
template<class type, class delta> type __device__ __atomic_fetch_sub_simt(type *ptr, delta val, int memorder) {
  return __atomic_fetch_sub_simt_(const_cast<type*>(ptr), val, memorder);
}
template<class type> type __device__ __atomic_fetch_and_simt(type *ptr, type val, int memorder) {
  return __atomic_fetch_and_simt_(const_cast<type*>(ptr), val, memorder);
}
template<class type> type __device__ __atomic_fetch_xor_simt(type *ptr, type val, int memorder) {
  return __atomic_fetch_xor_simt_(const_cast<type*>(ptr), val, memorder);
}
template<class type> type __device__ __atomic_fetch_or_simt(type *ptr, type val, int memorder) {
  return __atomic_fetch_or_simt_(const_cast<type*>(ptr), val, memorder);
}
template<class type> bool __device__ __atomic_test_and_set_simt(void *ptr, int memorder) {
  return __atomic_test_and_set_simt_(const_cast<void*>(ptr), memorder);
}
template<class type> void __device__ __atomic_clear_simt(void *ptr, int memorder) {
  return __atomic_clear_simt_(const_cast<void*>(ptr), memorder);
}
inline bool __device__ __atomic_always_lock_free_simt(size_t size, void *ptr) {
  return __atomic_always_lock_free_simt_(size, const_cast<void*>(ptr));
}
inline bool __device__ __atomic_is_lock_free_simt(size_t size, void *ptr) {
  return __atomic_is_lock_free_simt_(size, const_cast<void*>(ptr));
}

/*
volatile
*/

template<class type> type __device__ __atomic_load_n_simt(const volatile type *ptr, int memorder) {
  return __atomic_load_n_simt_(const_cast<const type*>(ptr), memorder);
}
template<class type> void __device__ __atomic_load_simt(const volatile type *ptr, type *ret, int memorder) {
  __atomic_load_simt_(const_cast<const type*>(ptr), ret, memorder);
}
template<class type> void __device__ __atomic_store_n_simt(volatile type *ptr, type val, int memorder) {
  __atomic_store_n_simt_(const_cast<type*>(ptr), val, memorder);
}
template<class type> void __device__ __atomic_store_simt(volatile type *ptr, type *val, int memorder) {
  __atomic_store_simt_(const_cast<type*>(ptr), val, memorder);
}
template<class type> type __device__ __atomic_exchange_n_simt(volatile type *ptr, type val, int memorder) {
  return __atomic_exchange_n_simt_(const_cast<type*>(ptr), val, memorder);
}
template<class type> void __device__ __atomic_exchange_simt(volatile type *ptr, type *val, type *ret, int memorder) {
  __atomic_exchange_simt_(const_cast<type*>(ptr), val, ret, memorder);
}
template<class type> bool __device__ __atomic_compare_exchange_n_simt(volatile type *ptr, type *expected, type desired, bool weak, int success_memorder, int failure_memorder) {
  return __atomic_compare_exchange_n_simt_(const_cast<type*>(ptr), expected, desired, weak, success_memorder, failure_memorder);
}
template<class type> bool __device__ __atomic_compare_exchange_simt(volatile type *ptr, type *expected, type *desired, bool weak, int success_memorder, int failure_memorder) {
  return __atomic_compare_exchange_simt_(const_cast<type*>(ptr), expected, desired, weak, success_memorder, failure_memorder);
}
template<class type, class delta> type __device__ __atomic_fetch_add_simt(volatile type *ptr, delta val, int memorder) {
  return __atomic_fetch_add_simt_(const_cast<type*>(ptr), val, memorder);
}
template<class type, class delta> type __device__ __atomic_fetch_sub_simt(volatile type *ptr, delta val, int memorder) {
  return __atomic_fetch_sub_simt_(const_cast<type*>(ptr), val, memorder);
}
template<class type> type __device__ __atomic_fetch_and_simt(volatile type *ptr, type val, int memorder) {
  return __atomic_fetch_and_simt_(const_cast<type*>(ptr), val, memorder);
}
template<class type> type __device__ __atomic_fetch_xor_simt(volatile type *ptr, type val, int memorder) {
  return __atomic_fetch_xor_simt_(const_cast<type*>(ptr), val, memorder);
}
template<class type> type __device__ __atomic_fetch_or_simt(volatile type *ptr, type val, int memorder) {
  return __atomic_fetch_or_simt_(const_cast<type*>(ptr), val, memorder);
}
template<class type> bool __device__ __atomic_test_and_set_simt(volatile void *ptr, int memorder) {
  return __atomic_test_and_set_simt_(const_cast<void*>(ptr), memorder);
}
template<class type> void __device__ __atomic_clear_simt(volatile void *ptr, int memorder) {
  return __atomic_clear_simt_(const_cast<void*>(ptr), memorder);
}

} // end namespace Impl
} // end namespace Kokkos

#endif //_SIMT_DETAILS_CONFIG

#ifndef KOKKOS_SIMT_ATOMIC_BUILTIN_REPLACEMENTS_DEFINED
/*
builtins
*/

#define __atomic_load_n __atomic_load_n_simt
#define __atomic_load __atomic_load_simt
#define __atomic_store_n __atomic_store_n_simt
#define __atomic_store __atomic_store_simt
#define __atomic_exchange_n __atomic_exchange_n_simt
#define __atomic_exchange __atomic_exchange_simt
#define __atomic_compare_exchange_n __atomic_compare_exchange_n_simt
#define __atomic_compare_exchange __atomic_compare_exchange_simt
#define __atomic_fetch_add __atomic_fetch_add_simt
#define __atomic_fetch_sub __atomic_fetch_sub_simt
#define __atomic_fetch_and __atomic_fetch_and_simt
#define __atomic_fetch_xor __atomic_fetch_xor_simt
#define __atomic_fetch_or __atomic_fetch_or_simt
#define __atomic_test_and_set __atomic_test_and_set_simt
#define __atomic_clear __atomic_clear_simt
#define __atomic_always_lock_free __atomic_always_lock_free_simt
#define __atomic_is_lock_free __atomic_is_lock_free_simt
#define __atomic_thread_fence __atomic_thread_fence_simt
#define __atomic_signal_fence __atomic_signal_fence_simt

#define KOKKOS_SIMT_ATOMIC_BUILTIN_REPLACEMENTS_DEFINED

#endif // KOKKOS_SIMT_ATOMIC_BUILTIN_REPLACEMENTS_DEFINED
#endif // __CUDA_ARCH__ && KOKKOS_ENABLE_CUDA_ASM_ATOMICS
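
Taken together, this header and the companion header added below are meant to bracket device code that uses the GCC-style __atomic builtins: the first remaps the builtins to the PTX-backed *_simt implementations, the second restores them. A minimal sketch of the intended pattern (the include paths are assumptions for illustration; use whatever names the tree gives these two files):

  #include <Cuda/Kokkos_Cuda_Atomic_Intrinsics.hpp>   // remaps __atomic_* to *_simt

  __device__ int counter;

  __device__ void bump() {
    // on device this resolves to __atomic_fetch_add_simt -> atom.add.relaxed.gpu.u32
    __atomic_fetch_add(&counter, 1, __ATOMIC_RELAXED);
  }

  #include <Cuda/Kokkos_Cuda_Atomic_Intrinsics_Restore_Builtins.hpp>   // undoes the remap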

@@ -0,0 +1,68 @@
/*
//@HEADER
// ************************************************************************
//
//   Kokkos v. 2.0
//   Copyright (2019) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/

#ifdef KOKKOS_SIMT_ATOMIC_BUILTIN_REPLACEMENTS_DEFINED

#undef __atomic_load_n
#undef __atomic_load
#undef __atomic_store_n
#undef __atomic_store
#undef __atomic_exchange_n
#undef __atomic_exchange
#undef __atomic_compare_exchange_n
#undef __atomic_compare_exchange
#undef __atomic_fetch_add
#undef __atomic_fetch_sub
#undef __atomic_fetch_and
#undef __atomic_fetch_xor
#undef __atomic_fetch_or
#undef __atomic_test_and_set
#undef __atomic_clear
#undef __atomic_always_lock_free
#undef __atomic_is_lock_free
#undef __atomic_thread_fence
#undef __atomic_signal_fence

#undef KOKKOS_SIMT_ATOMIC_BUILTIN_REPLACEMENTS_DEFINED

#endif // KOKKOS_SIMT_ATOMIC_BUILTIN_REPLACEMENTS_DEFINED

@@ -58,7 +58,68 @@ struct CudaGetMaxBlockSize;
template<class DriverType, class LaunchBounds>
int cuda_get_max_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
                            const size_t shmem_extra_block, const size_t shmem_extra_thread) {
  return CudaGetMaxBlockSize<DriverType,LaunchBounds,(CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType))>::get_block_size(f,vector_length, shmem_extra_block,shmem_extra_thread);
  return CudaGetMaxBlockSize<DriverType,LaunchBounds
    , true
    >::get_block_size(f,vector_length, shmem_extra_block,shmem_extra_thread);
}

template<class FunctorType, class LaunchBounds>
int cuda_get_max_block_size(const CudaInternal* cuda_instance, const cudaFuncAttributes& attr, const FunctorType& f, const size_t vector_length,
                            const size_t shmem_block, const size_t shmem_thread) {

  const int min_blocks_per_sm = LaunchBounds::minBperSM == 0 ?
    1 : LaunchBounds::minBperSM ;
  const int max_threads_per_block = LaunchBounds::maxTperB == 0 ?
    cuda_instance->m_maxThreadsPerBlock : LaunchBounds::maxTperB ;

  const int regs_per_thread = attr.numRegs;
  const int regs_per_sm = cuda_instance->m_regsPerSM;
  const int shmem_per_sm = cuda_instance->m_shmemPerSM;
  const int max_shmem_per_block = cuda_instance->m_maxShmemPerBlock;
  const int max_blocks_per_sm = cuda_instance->m_maxBlocksPerSM;
  const int max_threads_per_sm = cuda_instance->m_maxThreadsPerSM;

  int block_size = std::min(attr.maxThreadsPerBlock,max_threads_per_block);

  int functor_shmem = FunctorTeamShmemSize< FunctorType >::value( f , block_size/vector_length );
  int total_shmem = shmem_block + shmem_thread*(block_size/vector_length) + functor_shmem + attr.sharedSizeBytes;
  int max_blocks_regs = regs_per_sm/(regs_per_thread*block_size);
  int max_blocks_shmem = (total_shmem<max_shmem_per_block)?(total_shmem>0?shmem_per_sm/total_shmem:max_blocks_regs):0;
  int blocks_per_sm = std::min(max_blocks_regs,max_blocks_shmem);
  int threads_per_sm = blocks_per_sm * block_size;
  if(threads_per_sm > max_threads_per_sm) {
    blocks_per_sm = max_threads_per_sm/block_size;
    threads_per_sm = blocks_per_sm * block_size;
  }
  int opt_block_size = (blocks_per_sm>=min_blocks_per_sm) ? block_size : 0;
  int opt_threads_per_sm = threads_per_sm;
  //printf("BlockSizeMax: %i Shmem: %i %i %i %i Regs: %i %i Blocks: %i %i Achieved: %i %i Opt: %i %i\n",block_size,
  //  shmem_per_sm,max_shmem_per_block,functor_shmem,total_shmem,
  //  regs_per_sm,regs_per_thread,max_blocks_shmem,max_blocks_regs,blocks_per_sm,threads_per_sm,opt_block_size,opt_threads_per_sm);
  block_size-=32;
  while ((blocks_per_sm==0) && (block_size>=32)) {
    functor_shmem = FunctorTeamShmemSize< FunctorType >::value( f , block_size/vector_length );
    total_shmem = shmem_block + shmem_thread*(block_size/vector_length) + functor_shmem + attr.sharedSizeBytes;
    max_blocks_regs = regs_per_sm/(regs_per_thread*block_size);
    max_blocks_shmem = (total_shmem<max_shmem_per_block)?(total_shmem>0?shmem_per_sm/total_shmem:max_blocks_regs):0;
    blocks_per_sm = std::min(max_blocks_regs,max_blocks_shmem);
    threads_per_sm = blocks_per_sm * block_size;
    if(threads_per_sm > max_threads_per_sm) {
      blocks_per_sm = max_threads_per_sm/block_size;
      threads_per_sm = blocks_per_sm * block_size;
    }
    if((blocks_per_sm >= min_blocks_per_sm) && (blocks_per_sm <= max_blocks_per_sm)) {
      if(threads_per_sm>=opt_threads_per_sm) {
        opt_block_size = block_size;
        opt_threads_per_sm = threads_per_sm;
      }
    }
    //printf("BlockSizeMax: %i Shmem: %i %i %i %i Regs: %i %i Blocks: %i %i Achieved: %i %i Opt: %i %i\n",block_size,
    //  shmem_per_sm,max_shmem_per_block,functor_shmem,total_shmem,
    //  regs_per_sm,regs_per_thread,max_blocks_shmem,max_blocks_regs,blocks_per_sm,threads_per_sm,opt_block_size,opt_threads_per_sm);
    block_size-=32;
  }
  return opt_block_size;
}
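
Both deduction routines share the same occupancy arithmetic: a candidate block size is limited by registers (regs_per_sm / (regs_per_thread * block_size)), by shared memory (shmem_per_sm / total_shmem), and by the SM-wide thread cap, and the scan walks downward one warp (32 threads) at a time. A worked sketch with illustrative Volta-like numbers (assumed for the example, not taken from this patch):

  #include <algorithm>
  #include <cstdio>

  int main() {
    // Assumed SM limits: 64K registers, 96 KiB shared memory, 2048 threads.
    const int regs_per_sm = 65536, shmem_per_sm = 96*1024, max_threads_per_sm = 2048;
    const int regs_per_thread = 32;        // would come from cudaFuncAttributes::numRegs
    const int block_size = 256, total_shmem = 9*1024;

    int max_blocks_regs  = regs_per_sm / (regs_per_thread * block_size);  // = 8
    int max_blocks_shmem = shmem_per_sm / total_shmem;                    // = 10
    int blocks_per_sm    = std::min(max_blocks_regs, max_blocks_shmem);   // = 8, register-bound
    int threads_per_sm   = std::min(blocks_per_sm * block_size, max_threads_per_sm);
    std::printf("%d blocks -> %d resident threads\n", blocks_per_sm, threads_per_sm);
    return 0;   // 8 blocks give 2048 threads, exactly the SM cap, so 256 is kept
  }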

@@ -241,11 +302,71 @@ struct CudaGetOptBlockSize;
template<class DriverType, class LaunchBounds>
int cuda_get_opt_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
                            const size_t shmem_extra_block, const size_t shmem_extra_thread) {
  return CudaGetOptBlockSize<DriverType,LaunchBounds,(CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType))>::get_block_size(f,vector_length,shmem_extra_block,shmem_extra_thread);
  return CudaGetOptBlockSize<DriverType,LaunchBounds,
    //LaunchBounds::launch_mechanism == Kokkos::Experimental::LaunchDefault ?
    //  (( CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType) )?
    //    Kokkos::Experimental::CudaLaunchConstantMemory:Kokkos::Experimental::CudaLaunchLocalMemory):
    //  LaunchBounds::launch_mechanism
    (CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType))
    >::get_block_size(f,vector_length,shmem_extra_block,shmem_extra_thread);
}

template<class FunctorType, class LaunchBounds>
int cuda_get_opt_block_size(const CudaInternal* cuda_instance, const cudaFuncAttributes& attr, const FunctorType& f, const size_t vector_length,
                            const size_t shmem_block, const size_t shmem_thread) {

  const int min_blocks_per_sm = LaunchBounds::minBperSM == 0 ?
    1 : LaunchBounds::minBperSM ;
  const int max_threads_per_block = LaunchBounds::maxTperB == 0 ?
    cuda_instance->m_maxThreadsPerBlock : LaunchBounds::maxTperB ;

  const int regs_per_thread = attr.numRegs;
  const int regs_per_sm = cuda_instance->m_regsPerSM;
  const int shmem_per_sm = cuda_instance->m_shmemPerSM;
  const int max_shmem_per_block = cuda_instance->m_maxShmemPerBlock;
  const int max_blocks_per_sm = cuda_instance->m_maxBlocksPerSM;
  const int max_threads_per_sm = cuda_instance->m_maxThreadsPerSM;

  int block_size = std::min(attr.maxThreadsPerBlock,max_threads_per_block);

  int functor_shmem = FunctorTeamShmemSize< FunctorType >::value( f , block_size/vector_length );
  int total_shmem = shmem_block + shmem_thread*(block_size/vector_length) + functor_shmem + attr.sharedSizeBytes;
  int max_blocks_regs = regs_per_sm/(regs_per_thread*block_size);
  int max_blocks_shmem = (total_shmem<max_shmem_per_block)?(total_shmem>0?shmem_per_sm/total_shmem:max_blocks_regs):0;
  int blocks_per_sm = std::min(max_blocks_regs,max_blocks_shmem);
  int threads_per_sm = blocks_per_sm * block_size;
  if(threads_per_sm > max_threads_per_sm) {
    blocks_per_sm = max_threads_per_sm/block_size;
    threads_per_sm = blocks_per_sm * block_size;
  }
  int opt_block_size = (blocks_per_sm>=min_blocks_per_sm) ? block_size : 0;
  int opt_threads_per_sm = threads_per_sm;

  block_size-=32;
  while ((block_size>=32)) {
    functor_shmem = FunctorTeamShmemSize< FunctorType >::value( f , block_size/vector_length );
    total_shmem = shmem_block + shmem_thread*(block_size/vector_length) + functor_shmem + attr.sharedSizeBytes;
    max_blocks_regs = regs_per_sm/(regs_per_thread*block_size);
    max_blocks_shmem = (total_shmem<max_shmem_per_block)?(total_shmem>0?shmem_per_sm/total_shmem:max_blocks_regs):0;
    blocks_per_sm = std::min(max_blocks_regs,max_blocks_shmem);
    threads_per_sm = blocks_per_sm * block_size;
    if(threads_per_sm > max_threads_per_sm) {
      blocks_per_sm = max_threads_per_sm/block_size;
      threads_per_sm = blocks_per_sm * block_size;
    }
    if((blocks_per_sm >= min_blocks_per_sm) && (blocks_per_sm <= max_blocks_per_sm)) {
      if(threads_per_sm>=opt_threads_per_sm) {
        opt_block_size = block_size;
        opt_threads_per_sm = threads_per_sm;
      }
    }
    block_size-=32;
  }
  return opt_block_size;
}

template<class DriverType>
struct CudaGetOptBlockSize<DriverType,Kokkos::LaunchBounds<>,true> {
struct CudaGetOptBlockSize<DriverType,Kokkos::LaunchBounds<0,0>,true> {
  static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
                            const size_t shmem_extra_block, const size_t shmem_extra_thread) {
    int blockSize=16;

@@ -275,7 +396,7 @@ struct CudaGetOptBlockSize<DriverType,Kokkos::LaunchBounds<>,true> {
};

template<class DriverType>
struct CudaGetOptBlockSize<DriverType,Kokkos::LaunchBounds<>,false> {
struct CudaGetOptBlockSize<DriverType,Kokkos::LaunchBounds<0,0>,false> {
  static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
                            const size_t shmem_extra_block, const size_t shmem_extra_thread) {
    int blockSize=16;

@@ -305,7 +426,7 @@ struct CudaGetOptBlockSize<DriverType,Kokkos::LaunchBounds<>,false> {
};

template<class DriverType, unsigned int MaxThreadsPerBlock, unsigned int MinBlocksPerSM>
struct CudaGetOptBlockSize<DriverType,Kokkos::LaunchBounds< MaxThreadsPerBlock, MinBlocksPerSM >,true> {
struct CudaGetOptBlockSize<DriverType,Kokkos::LaunchBounds< MaxThreadsPerBlock, MinBlocksPerSM>,true> {
  static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
                            const size_t shmem_extra_block, const size_t shmem_extra_thread) {
    int blockSize=16;

@@ -50,7 +50,8 @@
#include <Kokkos_Core.hpp>

#include <Cuda/Kokkos_Cuda_Error.hpp>
#include <Cuda/Kokkos_Cuda_Internal.hpp>
#include <Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp>
#include <Cuda/Kokkos_Cuda_Instance.hpp>
#include <Cuda/Kokkos_Cuda_Locks.hpp>
#include <impl/Kokkos_Error.hpp>
#include <impl/Kokkos_Profiling_Interface.hpp>

@@ -217,78 +218,6 @@ const CudaInternalDevices & CudaInternalDevices::singleton()

}

//----------------------------------------------------------------------------

class CudaInternal {
private:

  CudaInternal( const CudaInternal & );
  CudaInternal & operator = ( const CudaInternal & );

public:

  typedef Cuda::size_type size_type ;

  int  m_cudaDev ;
  int  m_cudaArch ;
  unsigned  m_multiProcCount ;
  unsigned  m_maxWarpCount ;
  unsigned  m_maxBlock ;
  unsigned  m_maxSharedWords ;
  uint32_t  m_maxConcurrency ;
  size_type m_scratchSpaceCount ;
  size_type m_scratchFlagsCount ;
  size_type m_scratchUnifiedCount ;
  size_type m_scratchUnifiedSupported ;
  size_type m_streamCount ;
  size_type * m_scratchSpace ;
  size_type * m_scratchFlags ;
  size_type * m_scratchUnified ;
  uint32_t  * m_scratchConcurrentBitset ;
  cudaStream_t * m_stream ;

  static int was_initialized;
  static int was_finalized;

  static CudaInternal & singleton();

  int verify_is_initialized( const char * const label ) const ;

  int is_initialized() const
    { return 0 != m_scratchSpace && 0 != m_scratchFlags ; }

  void initialize( int cuda_device_id , int stream_count );
  void finalize();

  void print_configuration( std::ostream & ) const ;

  ~CudaInternal();

  CudaInternal()
    : m_cudaDev( -1 )
    , m_cudaArch( -1 )
    , m_multiProcCount( 0 )
    , m_maxWarpCount( 0 )
    , m_maxBlock( 0 )
    , m_maxSharedWords( 0 )
    , m_maxConcurrency( 0 )
    , m_scratchSpaceCount( 0 )
    , m_scratchFlagsCount( 0 )
    , m_scratchUnifiedCount( 0 )
    , m_scratchUnifiedSupported( 0 )
    , m_streamCount( 0 )
    , m_scratchSpace( 0 )
    , m_scratchFlags( 0 )
    , m_scratchUnified( 0 )
    , m_scratchConcurrentBitset( 0 )
    , m_stream( 0 )
    {}

  size_type * scratch_space( const size_type size );
  size_type * scratch_flags( const size_type size );
  size_type * scratch_unified( const size_type size );
};

int CudaInternal::was_initialized = 0;
int CudaInternal::was_finalized = 0;

@@ -366,8 +295,11 @@ CudaInternal & CudaInternal::singleton()
  static CudaInternal self ;
  return self ;
}
void CudaInternal::fence() const {
  cudaStreamSynchronize(m_stream);
}

void CudaInternal::initialize( int cuda_device_id , int stream_count )
void CudaInternal::initialize( int cuda_device_id , cudaStream_t stream )
{
  if ( was_finalized ) Kokkos::abort("Calling Cuda::initialize after Cuda::finalize is illegal\n");
  was_initialized = 1;

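The new fence() member synchronizes only the stream owned by this instance, where the old code paths fenced the whole device. In raw CUDA terms the difference is roughly the following (a self-contained sketch; the kernel and launch shape are placeholders):

  #include <cuda_runtime.h>

  __global__ void my_kernel() {}   // placeholder kernel

  int main() {
    cudaStream_t s;
    cudaStreamCreate(&s);
    my_kernel<<<1, 128, 0, s>>>();   // work submitted on this instance's stream

    cudaStreamSynchronize(s);   // what CudaInternal::fence() now does: wait on one stream
    cudaDeviceSynchronize();    // what the old device-wide fence amounted to
    cudaStreamDestroy(s);
    return 0;
  }
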
@@ -454,6 +386,15 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )

  m_maxBlock = cudaProp.maxGridSize[0] ;

  m_shmemPerSM = cudaProp.sharedMemPerMultiprocessor ;
  m_maxShmemPerBlock = cudaProp.sharedMemPerBlock ;
  m_regsPerSM = cudaProp.regsPerMultiprocessor ;
  m_maxBlocksPerSM = m_cudaArch < 500 ? 16 : (
                     m_cudaArch < 750 ? 32 : (
                     m_cudaArch == 750 ? 16 : 32));
  m_maxThreadsPerSM = cudaProp.maxThreadsPerMultiProcessor ;
  m_maxThreadsPerBlock = cudaProp.maxThreadsPerBlock ;

  //----------------------------------

  m_scratchUnifiedSupported = cudaProp.unifiedAddressing ;

@@ -482,10 +423,9 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
  // Concurrent bitset for obtaining unique tokens from within
  // an executing kernel.
  {
    const unsigned max_threads_per_sm = 2048 ; // up to capability 7.0

    m_maxConcurrency =
      max_threads_per_sm * cudaProp.multiProcessorCount ;
      m_maxThreadsPerSM * cudaProp.multiProcessorCount ;

    const int32_t buffer_bound =
      Kokkos::Impl::concurrent_bitset::buffer_bound( m_maxConcurrency );

@@ -507,11 +447,6 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
  }
  //----------------------------------

  if ( stream_count ) {
    m_stream = (cudaStream_t*) ::malloc( stream_count * sizeof(cudaStream_t) );
    m_streamCount = stream_count ;
    for ( size_type i = 0 ; i < m_streamCount ; ++i ) m_stream[i] = 0 ;
  }
}
else {

@@ -539,7 +474,7 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
  if( Kokkos::show_warnings() && !cuda_launch_blocking() ) {
    std::cerr << "Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default" << std::endl;
    std::cerr << " without setting CUDA_LAUNCH_BLOCKING=1." << std::endl;
    std::cerr << " The code must call Cuda::fence() after each kernel" << std::endl;
    std::cerr << " The code must call Cuda().fence() after each kernel" << std::endl;
    std::cerr << " or will likely crash when accessing data on the host." << std::endl;
  }

@@ -568,7 +503,10 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
  #endif

  // Init the array used for arbitrarily sized atomics
  Impl::initialize_host_cuda_lock_arrays();
  if(stream == 0)
    Impl::initialize_host_cuda_lock_arrays();

  m_stream = stream;
}

//----------------------------------------------------------------------------

@@ -578,7 +516,7 @@ enum { sizeScratchGrain = sizeof(ScratchGrain) };


Cuda::size_type *
CudaInternal::scratch_flags( const Cuda::size_type size )
CudaInternal::scratch_flags( const Cuda::size_type size ) const
{
  if ( verify_is_initialized("scratch_flags") && m_scratchFlagsCount * sizeScratchGrain < size ) {

@@ -587,6 +525,9 @@ CudaInternal::scratch_flags( const Cuda::size_type size )

    typedef Kokkos::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void > Record ;

    if( m_scratchFlags )
      Record::decrement( Record::get_record( m_scratchFlags ) );

    Record * const r = Record::allocate( Kokkos::CudaSpace()
                                       , "InternalScratchFlags"
                                       , ( sizeof( ScratchGrain ) * m_scratchFlagsCount ) );
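scratch_flags, scratch_space, and scratch_unified all follow the same grow-only pattern shown above; a condensed restatement using names from this file (illustrative, not the literal implementation):

// Grow-only scratch: reallocate only when the request exceeds current capacity.
size_type* grow_scratch(size_type*& buf, size_type& grains, size_type bytes) {
  if (grains * sizeScratchGrain < bytes) {
    if (buf) Record::decrement(Record::get_record(buf));  // release old block
    grains = (bytes + sizeScratchGrain - 1) / sizeScratchGrain;
    Record* const r = Record::allocate(Kokkos::CudaSpace(), "InternalScratch",
                                       sizeof(ScratchGrain) * grains);
    Record::increment(r);
    buf = reinterpret_cast<size_type*>(r->data());
  }
  return buf;  // untouched when the existing block is already big enough
}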
@@ -602,7 +543,7 @@ CudaInternal::scratch_flags( const Cuda::size_type size )
}

Cuda::size_type *
CudaInternal::scratch_space( const Cuda::size_type size )
CudaInternal::scratch_space( const Cuda::size_type size ) const
{
  if ( verify_is_initialized("scratch_space") && m_scratchSpaceCount * sizeScratchGrain < size ) {
@@ -610,6 +551,9 @@ CudaInternal::scratch_space( const Cuda::size_type size )

    typedef Kokkos::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void > Record ;

    if( m_scratchSpace )
      Record::decrement( Record::get_record( m_scratchSpace ) );

    Record * const r = Record::allocate( Kokkos::CudaSpace()
                                       , "InternalScratchSpace"
                                       , ( sizeof( ScratchGrain ) * m_scratchSpaceCount ) );
@@ -623,7 +567,7 @@ CudaInternal::scratch_space( const Cuda::size_type size )
}

Cuda::size_type *
CudaInternal::scratch_unified( const Cuda::size_type size )
CudaInternal::scratch_unified( const Cuda::size_type size ) const
{
  if ( verify_is_initialized("scratch_unified") &&
       m_scratchUnifiedSupported && m_scratchUnifiedCount * sizeScratchGrain < size ) {
@@ -632,6 +576,9 @@ CudaInternal::scratch_unified( const Cuda::size_type size )

    typedef Kokkos::Impl::SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void > Record ;

    if( m_scratchUnified )
      Record::decrement( Record::get_record( m_scratchUnified ) );

    Record * const r = Record::allocate( Kokkos::CudaHostPinnedSpace()
                                       , "InternalScratchUnified"
                                       , ( sizeof( ScratchGrain ) * m_scratchUnifiedCount ) );
@@ -644,6 +591,31 @@ CudaInternal::scratch_unified( const Cuda::size_type size )
  return m_scratchUnified ;
}

Cuda::size_type *
CudaInternal::scratch_functor( const Cuda::size_type size ) const
{
  if ( verify_is_initialized("scratch_functor") &&
       m_scratchFunctorSize < size ) {

    m_scratchFunctorSize = size ;

    typedef Kokkos::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void > Record ;

    if( m_scratchFunctor )
      Record::decrement( Record::get_record( m_scratchFunctor ) );

    Record * const r = Record::allocate( Kokkos::CudaSpace()
                                       , "InternalScratchFunctor"
                                       , m_scratchFunctorSize );

    Record::increment( r );

    m_scratchFunctor = reinterpret_cast<size_type *>( r->data() );
  }

  return m_scratchFunctor ;
}

//----------------------------------------------------------------------------

void CudaInternal::finalize()
@@ -653,13 +625,7 @@ void CudaInternal::finalize()

  Impl::finalize_host_cuda_lock_arrays();

  if ( m_stream ) {
    for ( size_type i = 1 ; i < m_streamCount ; ++i ) {
      cudaStreamDestroy( m_stream[i] );
      m_stream[i] = 0 ;
    }
    ::free( m_stream );
  }
  if(m_stream!=0) cudaStreamDestroy(m_stream);

  typedef Kokkos::Impl::SharedAllocationRecord< CudaSpace > RecordCuda ;
  typedef Kokkos::Impl::SharedAllocationRecord< CudaHostPinnedSpace > RecordHost ;
@@ -668,6 +634,8 @@ void CudaInternal::finalize()
    RecordCuda::decrement( RecordCuda::get_record( m_scratchSpace ) );
    RecordHost::decrement( RecordHost::get_record( m_scratchUnified ) );
    RecordCuda::decrement( RecordCuda::get_record( m_scratchConcurrentBitset ) );
    if(m_scratchFunctorSize>0)
      RecordCuda::decrement( RecordCuda::get_record( m_scratchFunctor ) );

    m_cudaDev = -1 ;
    m_multiProcCount = 0 ;
@@ -713,14 +681,14 @@ Cuda::size_type cuda_internal_maximum_grid_count()
Cuda::size_type cuda_internal_maximum_shared_words()
{ return CudaInternal::singleton().m_maxSharedWords ; }

Cuda::size_type * cuda_internal_scratch_space( const Cuda::size_type size )
{ return CudaInternal::singleton().scratch_space( size ); }
Cuda::size_type * cuda_internal_scratch_space( const Cuda& instance, const Cuda::size_type size )
{ return instance.impl_internal_space_instance()->scratch_space( size ); }

Cuda::size_type * cuda_internal_scratch_flags( const Cuda::size_type size )
{ return CudaInternal::singleton().scratch_flags( size ); }
Cuda::size_type * cuda_internal_scratch_flags( const Cuda& instance, const Cuda::size_type size )
{ return instance.impl_internal_space_instance()->scratch_flags( size ); }

Cuda::size_type * cuda_internal_scratch_unified( const Cuda::size_type size )
{ return CudaInternal::singleton().scratch_unified( size ); }
Cuda::size_type * cuda_internal_scratch_unified( const Cuda& instance, const Cuda::size_type size )
{ return instance.impl_internal_space_instance()->scratch_unified( size ); }


} // namespace Impl
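With the new instance argument, callers retrieve scratch from a specific execution-space instance rather than the singleton; a brief, illustrative call:

// Illustrative caller: scratch now belongs to the instance (i.e. its stream),
// so two instances no longer contend for the singleton's buffers.
Kokkos::Cuda exec;  // default instance
Kokkos::Cuda::size_type* flags =
  Kokkos::Impl::cuda_internal_scratch_flags(exec, 16 * sizeof(Kokkos::Cuda::size_type));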
@@ -749,7 +717,7 @@ void Cuda::initialize( const Cuda::SelectDevice config , size_t num_instances )
void Cuda::impl_initialize( const Cuda::SelectDevice config , size_t num_instances )
#endif
{
  Impl::CudaInternal::singleton().initialize( config.cuda_device_id , num_instances );
  Impl::CudaInternal::singleton().initialize( config.cuda_device_id , 0 );

#if defined(KOKKOS_ENABLE_PROFILING)
  Kokkos::Profiling::initialize();
@@ -800,19 +768,17 @@ void Cuda::impl_finalize()
}

Cuda::Cuda()
  : m_device( Impl::CudaInternal::singleton().m_cudaDev )
  , m_stream( 0 )
  : m_space_instance( &Impl::CudaInternal::singleton() )
{
  Impl::CudaInternal::singleton().verify_is_initialized( "Cuda instance constructor" );
}

Cuda::Cuda( const int instance_id )
  : m_device( Impl::CudaInternal::singleton().m_cudaDev )
  , m_stream(
      Impl::CudaInternal::singleton().verify_is_initialized( "Cuda instance constructor" )
      ? Impl::CudaInternal::singleton().m_stream[ instance_id % Impl::CudaInternal::singleton().m_streamCount ]
      : 0 )
{}
Cuda::Cuda(cudaStream_t stream)
  : m_space_instance(new Impl::CudaInternal)
{
  Impl::CudaInternal::singleton().verify_is_initialized( "Cuda instance constructor" );
  m_space_instance->initialize(Impl::CudaInternal::singleton().m_cudaDev,stream);
}

void Cuda::print_configuration( std::ostream & s , const bool )
{ Impl::CudaInternal::singleton().print_configuration( s ); }
@@ -823,13 +789,27 @@ bool Cuda::sleep() { return false ; }
bool Cuda::wake() { return true ; }
#endif

void Cuda::fence()
void Cuda::impl_static_fence()
{
  Kokkos::Impl::cuda_device_synchronize();
}

#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
void Cuda::fence() {
  impl_static_fence();
}
#else
void Cuda::fence() const {
  m_space_instance->fence();
}
#endif

const char* Cuda::name() { return "Cuda"; }

cudaStream_t Cuda::cuda_stream() const { return m_space_instance->m_stream ; }
int Cuda::cuda_device() const { return m_space_instance->m_cudaDev ; }


} // namespace Kokkos

namespace Kokkos {
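The split gives fences two scopes: the static fence still synchronizes the whole device, while an instance fence only waits on that instance's stream. A short usage sketch (stream handling is plain CUDA; `my_stream` is a placeholder):

cudaStream_t my_stream;
cudaStreamCreate(&my_stream);

Kokkos::Cuda exec(my_stream);       // instance bound to one stream
// ... dispatch kernels on `exec` ...
exec.fence();                       // waits only for work on my_stream

Kokkos::Cuda::impl_static_fence();  // waits for everything on the device
cudaStreamDestroy(my_stream);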
@@ -0,0 +1,156 @@
#ifndef KOKKOS_CUDA_INSTANCE_HPP_
#define KOKKOS_CUDA_INSTANCE_HPP_

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

namespace Kokkos {
namespace Impl {

struct CudaTraits {
  enum { WarpSize       = 32      /* 0x0020 */ };
  enum { WarpIndexMask  = 0x001f  /* Mask for warpindex */ };
  enum { WarpIndexShift = 5       /* WarpSize == 1 << WarpShift */ };

  enum { ConstantMemoryUsage = 0x008000 /* 32k bytes */ };
  enum { ConstantMemoryCache = 0x002000 /*  8k bytes */ };
  enum { KernelArgumentLimit = 0x001000 /*  4k bytes */ };

  typedef unsigned long
    ConstantGlobalBufferType[ ConstantMemoryUsage / sizeof(unsigned long) ];

#if defined(KOKKOS_ARCH_VOLTA) || \
    defined(KOKKOS_ARCH_PASCAL)
  enum { ConstantMemoryUseThreshold = 0x000200 /* 0 bytes -> always use constant (or global)*/ };
#else
  enum { ConstantMemoryUseThreshold = 0x000200 /* 512 bytes */ };
#endif

  KOKKOS_INLINE_FUNCTION static
  CudaSpace::size_type warp_count( CudaSpace::size_type i )
  { return ( i + WarpIndexMask ) >> WarpIndexShift ; }

  KOKKOS_INLINE_FUNCTION static
  CudaSpace::size_type warp_align( CudaSpace::size_type i )
  {
    enum { Mask = ~CudaSpace::size_type( WarpIndexMask ) };
    return ( i + WarpIndexMask ) & Mask ;
  }
};
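// For example, with the 32-thread warps defined above:
//   warp_count(33) == (33 + 31) >> 5  == 2   -- 33 threads occupy two warps
//   warp_align(33) == (33 + 31) & ~31 == 64  -- rounded up to a warp multiple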

//----------------------------------------------------------------------------

CudaSpace::size_type cuda_internal_multiprocessor_count();
CudaSpace::size_type cuda_internal_maximum_warp_count();
CudaSpace::size_type cuda_internal_maximum_grid_count();
CudaSpace::size_type cuda_internal_maximum_shared_words();

CudaSpace::size_type cuda_internal_maximum_concurrent_block_count();

CudaSpace::size_type * cuda_internal_scratch_flags( const Cuda&, const CudaSpace::size_type size );
CudaSpace::size_type * cuda_internal_scratch_space( const Cuda&, const CudaSpace::size_type size );
CudaSpace::size_type * cuda_internal_scratch_unified( const Cuda&, const CudaSpace::size_type size );

} // namespace Impl
} // namespace Kokkos

//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {

class CudaInternal {
private:

  CudaInternal( const CudaInternal & );
  CudaInternal & operator = ( const CudaInternal & );

public:

  typedef Cuda::size_type size_type ;

  int m_cudaDev ;

  // Device Properties
  int      m_cudaArch ;
  unsigned m_multiProcCount ;
  unsigned m_maxWarpCount ;
  unsigned m_maxBlock ;
  unsigned m_maxSharedWords ;
  uint32_t m_maxConcurrency ;
  int      m_shmemPerSM ;
  int      m_maxShmemPerBlock ;
  int      m_regsPerSM ;
  int      m_maxBlocksPerSM ;
  int      m_maxThreadsPerSM ;
  int      m_maxThreadsPerBlock ;

  mutable size_type m_scratchSpaceCount ;
  mutable size_type m_scratchFlagsCount ;
  mutable size_type m_scratchUnifiedCount ;
  mutable size_type m_scratchFunctorSize ;
  size_type m_scratchUnifiedSupported ;
  size_type m_streamCount ;
  mutable size_type * m_scratchSpace ;
  mutable size_type * m_scratchFlags ;
  mutable size_type * m_scratchUnified ;
  mutable size_type * m_scratchFunctor ;
  uint32_t * m_scratchConcurrentBitset ;
  cudaStream_t m_stream ;

  static int was_initialized;
  static int was_finalized;

  static CudaInternal & singleton();

  int verify_is_initialized( const char * const label ) const ;

  int is_initialized() const
  { return 0 != m_scratchSpace && 0 != m_scratchFlags ; }

  void initialize( int cuda_device_id , cudaStream_t stream = 0 );
  void finalize();

  void print_configuration( std::ostream & ) const ;

  void fence() const ;

  ~CudaInternal();

  CudaInternal()
    : m_cudaDev( -1 )
    , m_cudaArch( -1 )
    , m_multiProcCount( 0 )
    , m_maxWarpCount( 0 )
    , m_maxBlock( 0 )
    , m_maxSharedWords( 0 )
    , m_maxConcurrency( 0 )
    , m_shmemPerSM( 0 )
    , m_maxShmemPerBlock( 0 )
    , m_regsPerSM( 0 )
    , m_maxBlocksPerSM( 0 )
    , m_maxThreadsPerSM( 0 )
    , m_maxThreadsPerBlock( 0 )
    , m_scratchSpaceCount( 0 )
    , m_scratchFlagsCount( 0 )
    , m_scratchUnifiedCount( 0 )
    , m_scratchFunctorSize( 0 )
    , m_scratchUnifiedSupported( 0 )
    , m_streamCount( 0 )
    , m_scratchSpace( 0 )
    , m_scratchFlags( 0 )
    , m_scratchUnified( 0 )
    , m_scratchFunctor( 0 )
    , m_scratchConcurrentBitset( 0 )
    , m_stream( 0 )
    {}

  size_type * scratch_space( const size_type size ) const ;
  size_type * scratch_flags( const size_type size ) const ;
  size_type * scratch_unified( const size_type size ) const ;
  size_type * scratch_functor( const size_type size ) const ;
};

} // Namespace Impl
} // Namespace Kokkos
#endif
@@ -0,0 +1,579 @@
/*
//@HEADER
// ************************************************************************
//
//                        Kokkos v. 2.0
//              Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/

#ifndef KOKKOS_CUDAEXEC_HPP
#define KOKKOS_CUDAEXEC_HPP

#include <Kokkos_Macros.hpp>
#ifdef KOKKOS_ENABLE_CUDA

#include <string>
#include <cstdint>
#include <Kokkos_Parallel.hpp>
#include <impl/Kokkos_Error.hpp>
#include <Cuda/Kokkos_Cuda_abort.hpp>
#include <Cuda/Kokkos_Cuda_Error.hpp>
#include <Cuda/Kokkos_Cuda_Locks.hpp>
#include <Cuda/Kokkos_Cuda_Instance.hpp>

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

#if defined( __CUDACC__ )

/** \brief  Access to constant memory on the device */
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE

__device__ __constant__
extern unsigned long kokkos_impl_cuda_constant_memory_buffer[] ;

#else

__device__ __constant__
unsigned long kokkos_impl_cuda_constant_memory_buffer[ Kokkos::Impl::CudaTraits::ConstantMemoryUsage / sizeof(unsigned long) ] ;

#endif

namespace Kokkos {
namespace Impl {
  void* cuda_resize_scratch_space(std::int64_t bytes, bool force_shrink = false);
}
}

template< typename T >
inline
__device__
T * kokkos_impl_cuda_shared_memory()
{ extern __shared__ Kokkos::CudaSpace::size_type sh[]; return (T*) sh ; }

namespace Kokkos {
namespace Impl {

//----------------------------------------------------------------------------
// See section B.17 of Cuda C Programming Guide Version 3.2
// for discussion of
//   __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
// function qualifier which could be used to improve performance.
//----------------------------------------------------------------------------
// Maximize L1 cache and minimize shared memory:
//   cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1 );
// For 2.0 capability: 48 KB L1 and 16 KB shared
//----------------------------------------------------------------------------
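// A minimal illustration of the two knobs named above; `my_kernel` is a
// placeholder, not part of this file:
//
//   __global__ __launch_bounds__(256, 4)   // <= 256 threads/block, aim for
//   void my_kernel(float* d)               //    >= 4 resident blocks per SM
//   { d[threadIdx.x] += 1.0f; }
//
//   // Host side: trade shared memory for a larger L1 for this kernel.
//   cudaFuncSetCacheConfig(my_kernel, cudaFuncCachePreferL1);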

template< class DriverType>
__global__
static void cuda_parallel_launch_constant_memory()
{
  const DriverType & driver =
    *((const DriverType *) kokkos_impl_cuda_constant_memory_buffer );

  driver();
}

template< class DriverType, unsigned int maxTperB, unsigned int minBperSM >
__global__
__launch_bounds__(maxTperB, minBperSM)
static void cuda_parallel_launch_constant_memory()
{
  const DriverType & driver =
    *((const DriverType *) kokkos_impl_cuda_constant_memory_buffer );

  driver();
}

template< class DriverType>
__global__
static void cuda_parallel_launch_local_memory( const DriverType driver )
{
  driver();
}

template< class DriverType, unsigned int maxTperB, unsigned int minBperSM >
__global__
__launch_bounds__(maxTperB, minBperSM)
static void cuda_parallel_launch_local_memory( const DriverType driver )
{
  driver();
}

template< class DriverType>
__global__
static void cuda_parallel_launch_global_memory( const DriverType* driver )
{
  driver->operator()();
}

template< class DriverType, unsigned int maxTperB, unsigned int minBperSM >
__global__
__launch_bounds__(maxTperB, minBperSM)
static void cuda_parallel_launch_global_memory( const DriverType* driver )
{
  driver->operator()();
}

template< class DriverType>
__global__
static void cuda_parallel_launch_constant_or_global_memory( const DriverType* driver_ptr )
{
  const DriverType & driver = driver_ptr!=NULL ? *driver_ptr :
    *((const DriverType *) kokkos_impl_cuda_constant_memory_buffer );

  driver();
}

template< class DriverType, unsigned int maxTperB, unsigned int minBperSM >
__global__
__launch_bounds__(maxTperB, minBperSM)
static void cuda_parallel_launch_constant_or_global_memory( const DriverType* driver_ptr )
{
  const DriverType & driver = driver_ptr!=NULL ? *driver_ptr :
    *((const DriverType *) kokkos_impl_cuda_constant_memory_buffer );

  driver();
}

template< class DriverType >
struct DeduceCudaLaunchMechanism {
  constexpr static const Kokkos::Experimental::WorkItemProperty::HintLightWeight_t light_weight = Kokkos::Experimental::WorkItemProperty::HintLightWeight;
  constexpr static const Kokkos::Experimental::WorkItemProperty::HintHeavyWeight_t heavy_weight = Kokkos::Experimental::WorkItemProperty::HintHeavyWeight ;
  constexpr static const typename DriverType::Policy::work_item_property property = typename DriverType::Policy::work_item_property();

  static constexpr const Experimental::CudaLaunchMechanism valid_launch_mechanism =
    // BuildValidMask
    (sizeof(DriverType)<CudaTraits::KernelArgumentLimit?
       Experimental::CudaLaunchMechanism::LocalMemory:Experimental::CudaLaunchMechanism::Default)|
    (sizeof(DriverType)<CudaTraits::ConstantMemoryUsage?
       Experimental::CudaLaunchMechanism::ConstantMemory:Experimental::CudaLaunchMechanism::Default)|
    Experimental::CudaLaunchMechanism::GlobalMemory;

  static constexpr const Experimental::CudaLaunchMechanism requested_launch_mechanism =
    (((property&light_weight)==light_weight)?
       Experimental::CudaLaunchMechanism::LocalMemory :
       Experimental::CudaLaunchMechanism::ConstantMemory)
    | Experimental::CudaLaunchMechanism::GlobalMemory;

  static constexpr const Experimental::CudaLaunchMechanism default_launch_mechanism =
    // BuildValidMask
    (sizeof(DriverType)<CudaTraits::ConstantMemoryUseThreshold)?
       Experimental::CudaLaunchMechanism::LocalMemory:(
    (sizeof(DriverType)<CudaTraits::ConstantMemoryUsage)?
       Experimental::CudaLaunchMechanism::ConstantMemory:
       Experimental::CudaLaunchMechanism::GlobalMemory);

  //              None              LightWeight       HeavyWeight
  // F<UseT       LCG LCG  L  L     LCG LG  L  L      LCG CG  L  C
  // UseT<F<KAL   LCG LCG  C  C     LCG LG  C  L      LCG CG  C  C
  // Kal<F<CMU    CG  LCG  C  C     CG  LG  C  G      CG  CG  C  C
  // CMU<F        G   LCG  G  G     G   LG  G  G      G   CG  G  G
  static constexpr const Experimental::CudaLaunchMechanism launch_mechanism =
    ((property&light_weight)==light_weight)?
      (sizeof(DriverType)<CudaTraits::KernelArgumentLimit?
         Experimental::CudaLaunchMechanism::LocalMemory:
         Experimental::CudaLaunchMechanism::GlobalMemory):(
    ((property&heavy_weight)==heavy_weight)?
      (sizeof(DriverType)<CudaTraits::ConstantMemoryUsage?
         Experimental::CudaLaunchMechanism::ConstantMemory:
         Experimental::CudaLaunchMechanism::GlobalMemory):
      (default_launch_mechanism)
    );
};
// Use local memory up to ConstantMemoryUseThreshold
// Use global memory above ConstantMemoryUsage
// In between use ConstantMemory
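// Plainly stated, the default deduction compares sizeof(DriverType) against
// two thresholds. An equivalent scalar sketch (enum and names illustrative):
//
//   enum class Mech { Local, Constant, Global };
//   constexpr Mech choose(std::size_t functor_bytes) {
//     return functor_bytes < 0x0200 ? Mech::Local      // small: kernel argument
//          : functor_bytes < 0x8000 ? Mech::Constant   // fits 32kB constant buffer
//                                   : Mech::Global;    // large: copy to global
//   }
//   static_assert(choose(0x1000) == Mech::Constant, "mid-size uses constant");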
template < class DriverType
         , class LaunchBounds = Kokkos::LaunchBounds<>
         , Experimental::CudaLaunchMechanism LaunchMechanism =
             DeduceCudaLaunchMechanism<DriverType>::launch_mechanism >
struct CudaParallelLaunch ;

template < class DriverType
         , unsigned int MaxThreadsPerBlock
         , unsigned int MinBlocksPerSM>
struct CudaParallelLaunch< DriverType
                         , Kokkos::LaunchBounds< MaxThreadsPerBlock
                                               , MinBlocksPerSM >
                         , Experimental::CudaLaunchMechanism::ConstantMemory>
{
  static_assert(sizeof(DriverType)<CudaTraits::ConstantMemoryUsage,"Kokkos Error: Requested CudaLaunchConstantMemory with a Functor larger than 32kB.");
  inline
  CudaParallelLaunch( const DriverType & driver
                    , const dim3 & grid
                    , const dim3 & block
                    , const int shmem
                    , const CudaInternal* cuda_instance
                    , const bool prefer_shmem )
  {
    if ( (grid.x != 0) && ( ( block.x * block.y * block.z ) != 0 ) ) {

      // Fence before changing settings and copying closure
      Kokkos::Cuda().fence();

      if ( cuda_instance->m_maxShmemPerBlock < shmem ) {
        Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
      }
#ifndef KOKKOS_ARCH_KEPLER
      // On Kepler the L1 has no benefit since it doesn't cache reads
      else {
        CUDA_SAFE_CALL(
          cudaFuncSetCacheConfig
            ( cuda_parallel_launch_constant_memory
                < DriverType, MaxThreadsPerBlock, MinBlocksPerSM >
            , ( prefer_shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1 )
            ) );
      }
#endif

      // Copy functor to constant memory on the device
      cudaMemcpyToSymbolAsync(
        kokkos_impl_cuda_constant_memory_buffer, &driver, sizeof(DriverType), 0, cudaMemcpyHostToDevice, cudaStream_t(cuda_instance->m_stream));

      KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();

      // Invoke the driver function on the device
      cuda_parallel_launch_constant_memory
        < DriverType, MaxThreadsPerBlock, MinBlocksPerSM >
          <<< grid , block , shmem , cuda_instance->m_stream >>>();

#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
      CUDA_SAFE_CALL( cudaGetLastError() );
      Kokkos::Cuda().fence();
#endif
    }
  }

  static cudaFuncAttributes get_cuda_func_attributes() {
    cudaFuncAttributes attr;
    cudaFuncGetAttributes(&attr,cuda_parallel_launch_constant_memory
      < DriverType, MaxThreadsPerBlock, MinBlocksPerSM >);
    return attr;
  }
};

template < class DriverType>
struct CudaParallelLaunch< DriverType
                         , Kokkos::LaunchBounds<0,0>
                         , Experimental::CudaLaunchMechanism::ConstantMemory >
{
  static_assert(sizeof(DriverType)<CudaTraits::ConstantMemoryUsage,"Kokkos Error: Requested CudaLaunchConstantMemory with a Functor larger than 32kB.");
  inline
  CudaParallelLaunch( const DriverType & driver
                    , const dim3 & grid
                    , const dim3 & block
                    , const int shmem
                    , const CudaInternal* cuda_instance
                    , const bool prefer_shmem )
  {
    if ( (grid.x != 0) && ( ( block.x * block.y * block.z ) != 0 ) ) {

      // Fence before changing settings and copying closure
      Kokkos::Cuda().fence();

      if ( cuda_instance->m_maxShmemPerBlock < shmem ) {
        Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
      }
#ifndef KOKKOS_ARCH_KEPLER
      // On Kepler the L1 has no benefit since it doesn't cache reads
      else {
        CUDA_SAFE_CALL(
          cudaFuncSetCacheConfig
            ( cuda_parallel_launch_constant_memory< DriverType >
            , ( prefer_shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1 )
            ) );
      }
#endif

      // Copy functor to constant memory on the device
      cudaMemcpyToSymbolAsync(
        kokkos_impl_cuda_constant_memory_buffer, &driver, sizeof(DriverType), 0, cudaMemcpyHostToDevice, cudaStream_t(cuda_instance->m_stream));

      KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();

      // Invoke the driver function on the device
      cuda_parallel_launch_constant_memory< DriverType >
        <<< grid , block , shmem , cuda_instance->m_stream >>>();

#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
      CUDA_SAFE_CALL( cudaGetLastError() );
      Kokkos::Cuda().fence();
#endif
    }
  }

  static cudaFuncAttributes get_cuda_func_attributes() {
    cudaFuncAttributes attr;
    cudaFuncGetAttributes(&attr,cuda_parallel_launch_constant_memory
      < DriverType >);
    return attr;
  }
};

template < class DriverType
         , unsigned int MaxThreadsPerBlock
         , unsigned int MinBlocksPerSM >
struct CudaParallelLaunch< DriverType
                         , Kokkos::LaunchBounds< MaxThreadsPerBlock
                                               , MinBlocksPerSM >
                         , Experimental::CudaLaunchMechanism::LocalMemory >
{
  static_assert(sizeof(DriverType)<CudaTraits::KernelArgumentLimit,"Kokkos Error: Requested CudaLaunchLocalMemory with a Functor larger than 4096 bytes.");
  inline
  CudaParallelLaunch( const DriverType & driver
                    , const dim3 & grid
                    , const dim3 & block
                    , const int shmem
                    , const CudaInternal* cuda_instance
                    , const bool prefer_shmem )
  {
    if ( (grid.x != 0) && ( ( block.x * block.y * block.z ) != 0 ) ) {

      if ( cuda_instance->m_maxShmemPerBlock < shmem ) {
        Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
      }
#ifndef KOKKOS_ARCH_KEPLER
      // On Kepler the L1 has no benefit since it doesn't cache reads
      else {
        CUDA_SAFE_CALL(
          cudaFuncSetCacheConfig
            ( cuda_parallel_launch_local_memory
                < DriverType, MaxThreadsPerBlock, MinBlocksPerSM >
            , ( prefer_shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1 )
            ) );
      }
#endif

      KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();

      // Invoke the driver function on the device
      cuda_parallel_launch_local_memory
        < DriverType, MaxThreadsPerBlock, MinBlocksPerSM >
          <<< grid , block , shmem , cuda_instance->m_stream >>>( driver );

#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
      CUDA_SAFE_CALL( cudaGetLastError() );
      Kokkos::Cuda().fence();
#endif
    }
  }

  static cudaFuncAttributes get_cuda_func_attributes() {
    cudaFuncAttributes attr;
    cudaFuncGetAttributes(&attr,cuda_parallel_launch_local_memory
      < DriverType, MaxThreadsPerBlock, MinBlocksPerSM >);
    return attr;
  }
};

template < class DriverType>
struct CudaParallelLaunch< DriverType
                         , Kokkos::LaunchBounds<0,0>
                         , Experimental::CudaLaunchMechanism::LocalMemory >
{
  static_assert(sizeof(DriverType)<CudaTraits::KernelArgumentLimit,"Kokkos Error: Requested CudaLaunchLocalMemory with a Functor larger than 4096 bytes.");
  inline
  CudaParallelLaunch( const DriverType & driver
                    , const dim3 & grid
                    , const dim3 & block
                    , const int shmem
                    , const CudaInternal* cuda_instance
                    , const bool prefer_shmem)
  {
    if ( (grid.x != 0) && ( ( block.x * block.y * block.z ) != 0 ) ) {

      if ( cuda_instance->m_maxShmemPerBlock < shmem ) {
        Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
      }
#ifndef KOKKOS_ARCH_KEPLER
      // On Kepler the L1 has no benefit since it doesn't cache reads
      else {
        CUDA_SAFE_CALL(
          cudaFuncSetCacheConfig
            ( cuda_parallel_launch_local_memory< DriverType >
            , ( prefer_shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1 )
            ) );
      }
#endif

      KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();

      // Invoke the driver function on the device
      cuda_parallel_launch_local_memory< DriverType >
        <<< grid , block , shmem , cuda_instance->m_stream >>>( driver );

#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
      CUDA_SAFE_CALL( cudaGetLastError() );
      Kokkos::Cuda().fence();
#endif
    }
  }

  static cudaFuncAttributes get_cuda_func_attributes() {
    cudaFuncAttributes attr;
    cudaFuncGetAttributes(&attr,cuda_parallel_launch_local_memory
      < DriverType >);
    return attr;
  }
};

template < class DriverType
         , unsigned int MaxThreadsPerBlock
         , unsigned int MinBlocksPerSM>
struct CudaParallelLaunch< DriverType
                         , Kokkos::LaunchBounds< MaxThreadsPerBlock
                                               , MinBlocksPerSM>
                         , Experimental::CudaLaunchMechanism::GlobalMemory >
{
  inline
  CudaParallelLaunch( const DriverType & driver
                    , const dim3 & grid
                    , const dim3 & block
                    , const int shmem
                    , CudaInternal* cuda_instance
                    , const bool prefer_shmem )
  {
    if ( (grid.x != 0) && ( ( block.x * block.y * block.z ) != 0 ) ) {

      if ( cuda_instance->m_maxShmemPerBlock < shmem ) {
        Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
      }
#ifndef KOKKOS_ARCH_KEPLER
      // On Kepler the L1 has no benefit since it doesn't cache reads
      else {
        CUDA_SAFE_CALL(
          cudaFuncSetCacheConfig
            ( cuda_parallel_launch_global_memory
                < DriverType, MaxThreadsPerBlock, MinBlocksPerSM >
            , ( prefer_shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1 )
            ) );
      }
#endif

      KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();

      DriverType* driver_ptr = NULL;
      driver_ptr = reinterpret_cast<DriverType*>(cuda_instance->scratch_functor(sizeof(DriverType)));
      cudaMemcpyAsync(driver_ptr,&driver, sizeof(DriverType), cudaMemcpyDefault, cuda_instance->m_stream);

      // Invoke the driver function on the device
      cuda_parallel_launch_global_memory
        < DriverType, MaxThreadsPerBlock, MinBlocksPerSM >
          <<< grid , block , shmem , cuda_instance->m_stream >>>( driver_ptr );

#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
      CUDA_SAFE_CALL( cudaGetLastError() );
      Kokkos::Cuda().fence();
#endif
    }
  }
  static cudaFuncAttributes get_cuda_func_attributes() {
    cudaFuncAttributes attr;
    cudaFuncGetAttributes(&attr,cuda_parallel_launch_global_memory
      < DriverType, MaxThreadsPerBlock, MinBlocksPerSM >);
    return attr;
  }

};

template < class DriverType>
struct CudaParallelLaunch< DriverType
                         , Kokkos::LaunchBounds<0,0>
                         , Experimental::CudaLaunchMechanism::GlobalMemory >
{
  inline
  CudaParallelLaunch( const DriverType & driver
                    , const dim3 & grid
                    , const dim3 & block
                    , const int shmem
                    , CudaInternal* cuda_instance
                    , const bool prefer_shmem)
  {
    if ( (grid.x != 0) && ( ( block.x * block.y * block.z ) != 0 ) ) {

      if ( cuda_instance->m_maxShmemPerBlock < shmem ) {
        Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
      }
#ifndef KOKKOS_ARCH_KEPLER
      // On Kepler the L1 has no benefit since it doesn't cache reads
      else {
        CUDA_SAFE_CALL(
          cudaFuncSetCacheConfig
            ( cuda_parallel_launch_global_memory< DriverType >
            , ( prefer_shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1 )
            ) );
      }
#endif

      KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();

      DriverType* driver_ptr = NULL;
      driver_ptr = reinterpret_cast<DriverType*>(cuda_instance->scratch_functor(sizeof(DriverType)));
      cudaMemcpyAsync(driver_ptr,&driver, sizeof(DriverType), cudaMemcpyDefault, cuda_instance->m_stream);

      cuda_parallel_launch_global_memory< DriverType >
        <<< grid , block , shmem , cuda_instance->m_stream >>>( driver_ptr );

#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
      CUDA_SAFE_CALL( cudaGetLastError() );
      Kokkos::Cuda().fence();
#endif
    }
  }

  static cudaFuncAttributes get_cuda_func_attributes() {
    cudaFuncAttributes attr;
    cudaFuncGetAttributes(&attr,cuda_parallel_launch_global_memory
      < DriverType >);
    return attr;
  }
};
//----------------------------------------------------------------------------

} // namespace Impl
} // namespace Kokkos

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

#endif /* defined( __CUDACC__ ) */
#endif /* defined( KOKKOS_ENABLE_CUDA ) */
#endif /* #ifndef KOKKOS_CUDAEXEC_HPP */
File diff suppressed because it is too large
@@ -376,13 +376,13 @@ template< class ReducerType >
__device__ inline
typename std::enable_if< Kokkos::is_reducer<ReducerType>::value >::type
cuda_intra_warp_reduction( const ReducerType& reducer,
                           typename ReducerType::value_type& result,
                           const uint32_t max_active_thread = blockDim.y) {

  typedef typename ReducerType::value_type ValueType;

  unsigned int shift = 1;

  ValueType result = reducer.reference();
  //Reduce over values from threads with different threadIdx.y
  while(blockDim.x * shift < 32 ) {
    const ValueType tmp = shfl_down(result, blockDim.x*shift,32u);
@@ -400,6 +400,7 @@ template< class ReducerType >
__device__ inline
typename std::enable_if< Kokkos::is_reducer<ReducerType>::value >::type
cuda_inter_warp_reduction( const ReducerType& reducer,
                           typename ReducerType::value_type value,
                           const int max_active_thread = blockDim.y) {

  typedef typename ReducerType::value_type ValueType;
@@ -410,7 +411,6 @@ cuda_inter_warp_reduction( const ReducerType& reducer,
  // could lead to race conditions
  __shared__ double sh_result[(sizeof(ValueType)+7)/8*STEP_WIDTH];
  ValueType* result = (ValueType*) & sh_result;
  ValueType value = reducer.reference();
  const int step = 32 / blockDim.x;
  int shift = STEP_WIDTH;
  const int id = threadIdx.y%step==0?threadIdx.y/step:65000;
@@ -438,9 +438,18 @@ template< class ReducerType >
__device__ inline
typename std::enable_if< Kokkos::is_reducer<ReducerType>::value >::type
cuda_intra_block_reduction( const ReducerType& reducer,
                            typename ReducerType::value_type value,
                            const int max_active_thread = blockDim.y) {
  cuda_intra_warp_reduction(reducer,max_active_thread);
  cuda_inter_warp_reduction(reducer,max_active_thread);
  cuda_intra_warp_reduction(reducer,value,max_active_thread);
  cuda_inter_warp_reduction(reducer,value,max_active_thread);
}

template< class ReducerType >
__device__ inline
typename std::enable_if< Kokkos::is_reducer<ReducerType>::value >::type
cuda_intra_block_reduction( const ReducerType& reducer,
                            const int max_active_thread = blockDim.y) {
  cuda_intra_block_reduction(reducer,reducer.reference(),max_active_thread);
}

template< class ReducerType>
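The intra-warp step builds on register-to-register shuffles; a freestanding sketch of the same reduction idea for a full warp, using the standard __shfl_down_sync intrinsic (not the Kokkos reducer machinery):

// After the loop, lane 0 holds the sum of `val` over all 32 lanes.
__device__ int warp_sum(int val) {
  for (int offset = 16; offset > 0; offset >>= 1)
    val += __shfl_down_sync(0xffffffffu, val, offset);
  return val;
}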
@@ -54,194 +54,8 @@
namespace Kokkos {
namespace Impl {

template class TaskQueue< Kokkos::Cuda > ;

//----------------------------------------------------------------------------

__device__
void TaskQueueSpecialization< Kokkos::Cuda >::driver
  ( TaskQueueSpecialization< Kokkos::Cuda >::queue_type * const queue
  , int32_t shmem_per_warp )
{
  using Member = TaskExec< Kokkos::Cuda > ;
  using Queue  = TaskQueue< Kokkos::Cuda > ;
  using task_root_type = TaskBase< void , void , void > ;

  extern __shared__ int32_t shmem_all[];

  task_root_type * const end = (task_root_type *) task_root_type::EndTag ;

  int32_t * const warp_shmem =
    shmem_all + ( threadIdx.z * shmem_per_warp ) / sizeof(int32_t);

  task_root_type * const task_shmem = (task_root_type *) warp_shmem ;

  const int warp_lane = threadIdx.x + threadIdx.y * blockDim.x ;

  Member single_exec( warp_shmem , 1 );
  Member team_exec( warp_shmem , blockDim.y );

  task_root_type * task_ptr ;

  // Loop until all queues are empty and no tasks in flight

  do {

    // Each team lead attempts to acquire either a thread team task
    // or collection of single thread tasks for the team.

    if ( 0 == warp_lane ) {

      task_ptr = 0 < *((volatile int *) & queue->m_ready_count) ? end : 0 ;

      // Loop by priority and then type
      for ( int i = 0 ; i < Queue::NumQueue && end == task_ptr ; ++i ) {
        for ( int j = 0 ; j < 2 && end == task_ptr ; ++j ) {
          task_ptr = Queue::pop_ready_task( & queue->m_ready[i][j] );
        }
      }

#if 0
      printf("TaskQueue<Cuda>::driver(%d,%d) task(%lx)\n",threadIdx.z,blockIdx.x
            , uintptr_t(task_ptr));
#endif

    }

    // Synchronize warp with memory fence before broadcasting task pointer:

    // KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "A" );
    KOKKOS_IMPL_CUDA_SYNCWARP ;

    // Broadcast task pointer:

    ((int*) & task_ptr )[0] = KOKKOS_IMPL_CUDA_SHFL( ((int*) & task_ptr )[0] , 0 , 32 );
    ((int*) & task_ptr )[1] = KOKKOS_IMPL_CUDA_SHFL( ((int*) & task_ptr )[1] , 0 , 32 );
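    // (The two shuffles above broadcast one 64-bit pointer: lane 0's task_ptr
    // is viewed as two 32-bit halves, each half is shuffled to every lane, and
    // the halves are reassembled in place on the receiving lanes.)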

#if defined( KOKKOS_DEBUG )
    KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "TaskQueue CUDA task_ptr" );
#endif

    if ( 0 == task_ptr ) break ; // 0 == queue->m_ready_count

    if ( end != task_ptr ) {

      // Whole warp copy task's closure to/from shared memory.
      // Use all threads of warp for coalesced read/write.

      int32_t const b = sizeof(task_root_type) / sizeof(int32_t);
      int32_t const e = *((int32_t volatile *)( & task_ptr->m_alloc_size )) / sizeof(int32_t);

      int32_t volatile * const task_mem = (int32_t volatile *) task_ptr ;

      // copy task closure from global to shared memory:

      for ( int32_t i = warp_lane ; i < e ; i += CudaTraits::WarpSize ) {
        warp_shmem[i] = task_mem[i] ;
      }

      // Synchronize threads of the warp and ensure memory
      // writes are visible to all threads in the warp.

      // KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "B" );
      KOKKOS_IMPL_CUDA_SYNCWARP ;

      if ( task_root_type::TaskTeam == task_shmem->m_task_type ) {
        // Thread Team Task
        (*task_shmem->m_apply)( task_shmem , & team_exec );
      }
      else if ( 0 == threadIdx.y ) {
        // Single Thread Task
        (*task_shmem->m_apply)( task_shmem , & single_exec );
      }

      // Synchronize threads of the warp and ensure memory
      // writes are visible to all threads in the warp.

      // KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "C" );
      KOKKOS_IMPL_CUDA_SYNCWARP ;

      // copy task closure from shared to global memory:

      for ( int32_t i = b + warp_lane ; i < e ; i += CudaTraits::WarpSize ) {
        task_mem[i] = warp_shmem[i] ;
      }

      // Synchronize threads of the warp and ensure memory
      // writes are visible to root thread of the warp for
      // respawn or completion.

      // KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "D" );
      KOKKOS_IMPL_CUDA_SYNCWARP ;

      // If respawn requested copy respawn data back to main memory

      if ( 0 == warp_lane ) {

        if ( ((task_root_type *) task_root_type::LockTag) != task_shmem->m_next ) {
          ( (volatile task_root_type *) task_ptr )->m_next     = task_shmem->m_next ;
          ( (volatile task_root_type *) task_ptr )->m_priority = task_shmem->m_priority ;
        }

        queue->complete( task_ptr );
      }
    }
  } while(1);
}

namespace {

__global__
void cuda_task_queue_execute( TaskQueue< Kokkos::Cuda > * queue
                            , int32_t shmem_size )
{ TaskQueueSpecialization< Kokkos::Cuda >::driver( queue , shmem_size ); }

}

void TaskQueueSpecialization< Kokkos::Cuda >::execute
  ( TaskQueue< Kokkos::Cuda > * const queue )
{
  const int shared_per_warp = 2048 ;
  const int warps_per_block = 4 ;
  const dim3 grid( Kokkos::Impl::cuda_internal_multiprocessor_count() , 1 , 1 );
  const dim3 block( 1 , Kokkos::Impl::CudaTraits::WarpSize , warps_per_block );
  const int shared_total = shared_per_warp * warps_per_block ;
  const cudaStream_t stream = 0 ;

  CUDA_SAFE_CALL( cudaDeviceSynchronize() );

#if 0
  printf("cuda_task_queue_execute before\n");
#endif

  // Query the stack size, in bytes:

  size_t previous_stack_size = 0 ;
  CUDA_SAFE_CALL( cudaDeviceGetLimit( & previous_stack_size , cudaLimitStackSize ) );

  // If not large enough then set the stack size, in bytes:

  const size_t larger_stack_size = 2048 ;

  if ( previous_stack_size < larger_stack_size ) {
    CUDA_SAFE_CALL( cudaDeviceSetLimit( cudaLimitStackSize , larger_stack_size ) );
  }

  cuda_task_queue_execute<<< grid , block , shared_total , stream >>>( queue , shared_per_warp );

  CUDA_SAFE_CALL( cudaGetLastError() );

  CUDA_SAFE_CALL( cudaDeviceSynchronize() );

  if ( previous_stack_size < larger_stack_size ) {
    CUDA_SAFE_CALL( cudaDeviceSetLimit( cudaLimitStackSize , previous_stack_size ) );
  }

#if 0
  printf("cuda_task_queue_execute after\n");
#endif

}
template class TaskQueue< Kokkos::Cuda, Impl::default_tasking_memory_space_for_execution_space_t<Kokkos::Cuda> > ;
template class TaskQueueMultiple< Kokkos::Cuda, Impl::default_tasking_memory_space_for_execution_space_t<Kokkos::Cuda> > ;

}} /* namespace Kokkos::Impl */
@@ -50,6 +50,14 @@
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

#include <Kokkos_Core_fwd.hpp>

#include <impl/Kokkos_TaskBase.hpp>
#include <Cuda/Kokkos_Cuda_Error.hpp>  // CUDA_SAFE_CALL
#include <impl/Kokkos_TaskTeamMember.hpp>

//----------------------------------------------------------------------------

namespace Kokkos {
namespace Impl {
namespace {
@@ -57,54 +65,498 @@ namespace {
template< typename TaskType >
__global__
void set_cuda_task_base_apply_function_pointer
  ( TaskBase<void,void,void>::function_type * ptr )
{ *ptr = TaskType::apply ; }
  ( typename TaskType::function_type * ptr, typename TaskType::destroy_type* dtor )
{
  *ptr = TaskType::apply;
  *dtor = TaskType::destroy;
}

template< typename Scheduler >
__global__
void cuda_task_queue_execute( Scheduler scheduler, int32_t shmem_size ) {
  TaskQueueSpecialization< Scheduler >::driver( std::move(scheduler) , shmem_size );
}

}

template< class > class TaskExec ;
template <class, class> class TaskExec ;

template<>
class TaskQueueSpecialization< Kokkos::Cuda >
template<class QueueType>
class TaskQueueSpecialization<
  SimpleTaskScheduler<Kokkos::Cuda, QueueType>
>
{
public:

  using execution_space = Kokkos::Cuda ;
  using memory_space = Kokkos::CudaUVMSpace ;
  using queue_type = TaskQueue< execution_space > ;
  using member_type = TaskExec< Kokkos::Cuda > ;
  using scheduler_type = SimpleTaskScheduler<Kokkos::Cuda, QueueType>;
  using execution_space = Kokkos::Cuda;
  using memory_space = Kokkos::CudaUVMSpace;
  using member_type = TaskExec<Kokkos::Cuda, scheduler_type> ;

  enum : long { max_league_size = 16 };
  enum : int { warps_per_block = 4 };

  KOKKOS_INLINE_FUNCTION
  static
  void iff_single_thread_recursive_execute( queue_type * const ) {}
  void iff_single_thread_recursive_execute( scheduler_type const& ) {}

  static int get_max_team_count(
    execution_space const&
  ) {
    return Kokkos::Impl::cuda_internal_multiprocessor_count() * warps_per_block;
  }

  __device__
  static void driver( queue_type * const , int32_t );
  static void driver(scheduler_type scheduler, int32_t shmem_per_warp)
  {
    using queue_type = typename scheduler_type::task_queue_type;
    using task_base_type = typename scheduler_type::task_base_type;
    using runnable_task_base_type = typename scheduler_type::runnable_task_base_type;
    using scheduling_info_storage_type =
      SchedulingInfoStorage<
        runnable_task_base_type,
        typename scheduler_type::task_scheduling_info_type
      >;

    extern __shared__ int32_t shmem_all[];

    int32_t* const warp_shmem = shmem_all + (threadIdx.z * shmem_per_warp) / sizeof(int32_t);

    task_base_type* const shared_memory_task_copy = (task_base_type*)warp_shmem;

    const int warp_lane = threadIdx.x + threadIdx.y * blockDim.x;

    member_type single_exec(scheduler, warp_shmem, 1);
    member_type team_exec(scheduler, warp_shmem, blockDim.y);

    auto& queue = scheduler.queue();
    auto& team_scheduler = team_exec.scheduler();

    auto current_task = OptionalRef<task_base_type>();

    // Loop until all queues are empty and no tasks in flight
    while(not queue.is_done()) {

      if(warp_lane == 0) { // should be (?) same as team_exec.team_rank() == 0
        // pop off a task
        current_task = queue.pop_ready_task(team_scheduler.team_scheduler_info());
      }

      // Broadcast task pointer:

      // Sync before the broadcast
      KOKKOS_IMPL_CUDA_SYNCWARP;

      // pretend it's an int* for shuffle purposes
      ((int*) &current_task)[0] = KOKKOS_IMPL_CUDA_SHFL(((int*) &current_task)[0], 0, 32);
      ((int*) &current_task)[1] = KOKKOS_IMPL_CUDA_SHFL(((int*) &current_task)[1], 0, 32);

      if(current_task) {

        KOKKOS_ASSERT(!current_task->as_runnable_task().get_respawn_flag());

        int32_t b = sizeof(scheduling_info_storage_type) / sizeof(int32_t);
        static_assert(
          sizeof(scheduling_info_storage_type) % sizeof(int32_t) == 0,
          "bad task size"
        );
        int32_t const e = current_task->get_allocation_size() / sizeof(int32_t);
        KOKKOS_ASSERT(current_task->get_allocation_size() % sizeof(int32_t) == 0);

        int32_t volatile* const task_mem = (int32_t volatile*)current_task.get();

        // do a coordinated copy of the task closure from global to shared memory:
        for(int32_t i = warp_lane; i < e; i += CudaTraits::WarpSize) {
          warp_shmem[i] = task_mem[i];
        }

        // Synchronize threads of the warp and ensure memory
        // writes are visible to all threads in the warp.
        KOKKOS_IMPL_CUDA_SYNCWARP;

        if(shared_memory_task_copy->is_team_runnable()) {
          // Thread Team Task
          shared_memory_task_copy->as_runnable_task().run(team_exec);
        }
        else if(threadIdx.y == 0) {
          // TODO @tasking @optimization DSH Change this to warp_lane == 0 when we allow blockDim.x to be more than 1
          // Single Thread Task
          shared_memory_task_copy->as_runnable_task().run(single_exec);
        }

        // Synchronize threads of the warp and ensure memory
        // writes are visible to all threads in the warp.

        KOKKOS_IMPL_CUDA_SYNCWARP;

        //if(warp_lane < b % CudaTraits::WarpSize) b += CudaTraits::WarpSize;
        //b -= b % CudaTraits::WarpSize;

        // copy task closure from shared to global memory:
        for (int32_t i = b + warp_lane; i < e; i += CudaTraits::WarpSize) {
          task_mem[i] = warp_shmem[i];
        }

        // Synchronize threads of the warp and ensure memory
        // writes are visible to root thread of the warp for
        // respawn or completion.

        KOKKOS_IMPL_CUDA_SYNCWARP;


        if (warp_lane == 0) {
          // If respawn requested copy respawn data back to main memory
          if(shared_memory_task_copy->as_runnable_task().get_respawn_flag()) {
            if(shared_memory_task_copy->as_runnable_task().has_predecessor()) {
              // It's not necessary to make this a volatile write because
              // the next read of the predecessor is on this thread in complete,
              // and the predecessor is cleared there (using a volatile write)
              current_task->as_runnable_task().acquire_predecessor_from(
                shared_memory_task_copy->as_runnable_task()
              );
            }

            // It may not be necessary to make this a volatile write, since the
            // next read will be done by this thread in complete where the
            // rescheduling occurs, but since the task could be stolen later
            // before this is written again, we should do the volatile write
            // here. (It might not be necessary though because I don't know
            // where else the priority would be read after it is scheduled
            // by this thread; for now, we leave it volatile, but we should
            // benchmark the cost of this.)
            current_task.as_volatile()->set_priority(shared_memory_task_copy->get_priority());

            // It's not necessary to make this a volatile write, since the
            // next read of it (if true) will be by this thread in `complete()`,
            // which will unset the flag (using volatile) once it has handled
            // the respawn
            current_task->as_runnable_task().set_respawn_flag();

          }

          queue.complete(
            (*std::move(current_task)).as_runnable_task(),
            team_scheduler.team_scheduler_info()
          );
        }

      }
    }
  }

  static
  void execute( queue_type * const );
  void execute(scheduler_type const& scheduler)
  {
    const int shared_per_warp = 2048 ;
    const dim3 grid(Kokkos::Impl::cuda_internal_multiprocessor_count(), 1, 1);
    const dim3 block(1, Kokkos::Impl::CudaTraits::WarpSize, warps_per_block);
    const int shared_total = shared_per_warp * warps_per_block;
    const cudaStream_t stream = nullptr;

    KOKKOS_ASSERT(
      static_cast<long>(grid.x * grid.y * grid.z * block.x * block.y * block.z)
        == static_cast<long>(get_max_team_count(scheduler.get_execution_space()) * Kokkos::Impl::CudaTraits::WarpSize)
    );

    auto& queue = scheduler.queue();

    CUDA_SAFE_CALL(cudaDeviceSynchronize());

    // Query the stack size, in bytes:

    size_t previous_stack_size = 0;
    CUDA_SAFE_CALL(cudaDeviceGetLimit(&previous_stack_size, cudaLimitStackSize));

    // If not large enough then set the stack size, in bytes:

    const size_t larger_stack_size = 1 << 11;

    if (previous_stack_size < larger_stack_size) {
      CUDA_SAFE_CALL(cudaDeviceSetLimit(cudaLimitStackSize, larger_stack_size));
    }

    cuda_task_queue_execute<<<grid, block, shared_total, stream>>>(scheduler, shared_per_warp);

    CUDA_SAFE_CALL(cudaGetLastError());

    CUDA_SAFE_CALL(cudaDeviceSynchronize());

    if (previous_stack_size < larger_stack_size) {
      CUDA_SAFE_CALL(cudaDeviceSetLimit(cudaLimitStackSize, previous_stack_size));
    }
  }

  template <typename TaskType>
  static
  // TODO @tasking @optimization DSH specialize this for trivially destructible types
  void
  get_function_pointer(
    typename TaskType::function_type& ptr,
    typename TaskType::destroy_type& dtor
  )
  {
    using function_type = typename TaskType::function_type;
    using destroy_type = typename TaskType::destroy_type;

    // TODO @tasking @minor DSH make sure there aren't any alignment concerns?
    void* storage = cuda_internal_scratch_unified(
      Kokkos::Cuda(),
      sizeof(function_type) + sizeof(destroy_type)
    );
    function_type* ptr_ptr = (function_type*)storage;
    destroy_type* dtor_ptr = (destroy_type*)((char*)storage + sizeof(function_type));

    CUDA_SAFE_CALL( cudaDeviceSynchronize() );

    set_cuda_task_base_apply_function_pointer<TaskType><<<1,1>>>(ptr_ptr, dtor_ptr);

    CUDA_SAFE_CALL( cudaGetLastError() );
    CUDA_SAFE_CALL( cudaDeviceSynchronize() );

    ptr = *ptr_ptr;
    dtor = *dtor_ptr;
  }
};
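// get_function_pointer works around the host's inability to take the address
// of a __device__ function: a one-thread kernel stores the device-side
// addresses of TaskType::apply and TaskType::destroy into host-accessible
// memory, and the host reads them back after a synchronize. The same trick in
// isolation (hypothetical names; managed memory instead of unified scratch):
//
//   typedef void (*device_fn_t)(int*);
//   __device__ void my_device_fn(int* x) { *x += 1; }
//   __global__ void capture_fn_ptr(device_fn_t* out) { *out = my_device_fn; }
//
//   device_fn_t get_device_fn_ptr() {
//     device_fn_t* slot;
//     cudaMallocManaged(&slot, sizeof(device_fn_t)); // host-visible slot
//     capture_fn_ptr<<<1, 1>>>(slot);                // device writes address
//     cudaDeviceSynchronize();                       // make write visible
//     device_fn_t fn = *slot;
//     cudaFree(slot);
//     return fn;                                     // pass to later launches
//   }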
||||
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

template<class Scheduler>
class TaskQueueSpecializationConstrained<
  Scheduler,
  typename std::enable_if<
    std::is_same<typename Scheduler::execution_space, Kokkos::Cuda>::value
  >::type
>
{
public:

  using scheduler_type = Scheduler;
  using execution_space = Kokkos::Cuda;
  using memory_space = Kokkos::CudaUVMSpace;
  using member_type = TaskExec<Kokkos::Cuda, Scheduler> ;

  enum : long { max_league_size = 16 };

  KOKKOS_INLINE_FUNCTION
  static
  void iff_single_thread_recursive_execute( scheduler_type const& ) {}

  __device__
  static void driver(scheduler_type scheduler, int32_t shmem_per_warp)
  {
    using queue_type = typename scheduler_type::queue_type;
    using task_root_type = TaskBase;

    extern __shared__ int32_t shmem_all[];

    task_root_type* const end = (task_root_type *) task_root_type::EndTag ;
    task_root_type* const no_more_tasks_sentinel = nullptr;

    int32_t * const warp_shmem =
      shmem_all + ( threadIdx.z * shmem_per_warp ) / sizeof(int32_t);

    task_root_type * const task_shmem = (task_root_type *) warp_shmem ;

    const int warp_lane = threadIdx.x + threadIdx.y * blockDim.x ;

    member_type single_exec(scheduler, warp_shmem, 1);
    member_type team_exec(scheduler, warp_shmem, blockDim.y);

    auto& team_queue = team_exec.scheduler().queue();

    task_root_type * task_ptr = no_more_tasks_sentinel;

    // Loop until all queues are empty and no tasks in flight

    do {

      // Each team lead attempts to acquire either a thread team task
      // or a collection of single thread tasks for the team.

      if ( 0 == warp_lane ) {

        if( *((volatile int *) & team_queue.m_ready_count) > 0 ) {
          task_ptr = end;
          // Attempt to acquire a task
          // Loop by priority and then type
          for ( int i = 0 ; i < queue_type::NumQueue && end == task_ptr ; ++i ) {
            for ( int j = 0 ; j < 2 && end == task_ptr ; ++j ) {
              task_ptr = queue_type::pop_ready_task( & team_queue.m_ready[i][j] );
            }
          }
        }
        else {
          // returns nullptr if and only if all other queues have a ready
          // count of 0 also. Otherwise, returns a task from another queue
          // or `end` if one couldn't be popped
          task_ptr = team_queue.attempt_to_steal_task();
#if 0
          if(task != no_more_tasks_sentinel && task != end) {
            std::printf("task stolen on rank %d\n", team_exec.league_rank());
          }
#endif
        }

      }

      // Synchronize warp with memory fence before broadcasting task pointer:

      // KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "A" );
      KOKKOS_IMPL_CUDA_SYNCWARP ;

      // Broadcast task pointer:

      ((int*) & task_ptr )[0] = KOKKOS_IMPL_CUDA_SHFL( ((int*) & task_ptr )[0] , 0 , 32 );
      ((int*) & task_ptr )[1] = KOKKOS_IMPL_CUDA_SHFL( ((int*) & task_ptr )[1] , 0 , 32 );

#if defined( KOKKOS_DEBUG )
      KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "TaskQueue CUDA task_ptr" );
#endif

      if ( 0 == task_ptr ) break ; // 0 == queue->m_ready_count

      if ( end != task_ptr ) {

        // Whole warp copies the task's closure to/from shared memory.
        // Use all threads of the warp for coalesced read/write.

        int32_t const b = sizeof(task_root_type) / sizeof(int32_t);
        int32_t const e = *((int32_t volatile *)( & task_ptr->m_alloc_size )) / sizeof(int32_t);

        int32_t volatile * const task_mem = (int32_t volatile *) task_ptr ;

        KOKKOS_ASSERT(e * sizeof(int32_t) < shmem_per_warp);

        // copy task closure from global to shared memory:

        for ( int32_t i = warp_lane ; i < e ; i += CudaTraits::WarpSize ) {
          warp_shmem[i] = task_mem[i] ;
        }

        // Synchronize threads of the warp and ensure memory
        // writes are visible to all threads in the warp.

        // KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "B" );
        KOKKOS_IMPL_CUDA_SYNCWARP ;

        if ( task_root_type::TaskTeam == task_shmem->m_task_type ) {
          // Thread Team Task
          (*task_shmem->m_apply)( task_shmem , & team_exec );
        }
        else if ( 0 == threadIdx.y ) {
          // Single Thread Task
          (*task_shmem->m_apply)( task_shmem , & single_exec );
        }

        // Synchronize threads of the warp and ensure memory
        // writes are visible to all threads in the warp.

        // KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "C" );
        KOKKOS_IMPL_CUDA_SYNCWARP ;

        // copy task closure from shared to global memory:

        for ( int32_t i = b + warp_lane ; i < e ; i += CudaTraits::WarpSize ) {
          task_mem[i] = warp_shmem[i] ;
        }

        // Synchronize threads of the warp and ensure memory
        // writes are visible to the root thread of the warp for
        // respawn or completion.

        // KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "D" );
        KOKKOS_IMPL_CUDA_SYNCWARP ;

        // If respawn requested copy respawn data back to main memory

        if ( 0 == warp_lane ) {

          if ( ((task_root_type *) task_root_type::LockTag) != task_shmem->m_next ) {
            ( (volatile task_root_type *) task_ptr )->m_next = task_shmem->m_next ;
            ( (volatile task_root_type *) task_ptr )->m_priority = task_shmem->m_priority ;
          }

          team_queue.complete( task_ptr );
        }

      }
    } while(1);
  }

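The two `KOKKOS_IMPL_CUDA_SHFL` lines in `driver` move a 64-bit pointer from lane 0 to the whole warp as two 32-bit shuffles. A standalone sketch of the same trick (not from this patch), written against the plain CUDA `__shfl_sync` intrinsic:

#include <cstdint>

// Broadcast a 64-bit pointer held by lane 0 to every lane of the warp,
// one 32-bit half at a time.
__device__ void* broadcast_ptr_from_lane0(void* p) {
  int lo = static_cast<int>(reinterpret_cast<uintptr_t>(p));        // low  32 bits
  int hi = static_cast<int>(reinterpret_cast<uintptr_t>(p) >> 32);  // high 32 bits
  lo = __shfl_sync(0xffffffffu, lo, 0, 32);
  hi = __shfl_sync(0xffffffffu, hi, 0, 32);
  return reinterpret_cast<void*>(
      (static_cast<uintptr_t>(static_cast<uint32_t>(hi)) << 32) |
       static_cast<uintptr_t>(static_cast<uint32_t>(lo)));
}
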
  static
  void execute(scheduler_type const& scheduler)
  {
    const int shared_per_warp = 2048 ;
    const int warps_per_block = 4 ;
    const dim3 grid( Kokkos::Impl::cuda_internal_multiprocessor_count() , 1 , 1 );
    //const dim3 grid( 1 , 1 , 1 );
    const dim3 block( 1 , Kokkos::Impl::CudaTraits::WarpSize , warps_per_block );
    const int shared_total = shared_per_warp * warps_per_block ;
    const cudaStream_t stream = 0 ;

    auto& queue = scheduler.queue();
    queue.initialize_team_queues(warps_per_block * grid.x);

    CUDA_SAFE_CALL( cudaDeviceSynchronize() );

    // Query the stack size, in bytes:

    size_t previous_stack_size = 0 ;
    CUDA_SAFE_CALL( cudaDeviceGetLimit( & previous_stack_size , cudaLimitStackSize ) );

    // If not large enough then set the stack size, in bytes:

    const size_t larger_stack_size = 2048 ;

    if ( previous_stack_size < larger_stack_size ) {
      CUDA_SAFE_CALL( cudaDeviceSetLimit( cudaLimitStackSize , larger_stack_size ) );
    }

    cuda_task_queue_execute<<< grid , block , shared_total , stream >>>( scheduler , shared_per_warp );

    CUDA_SAFE_CALL( cudaGetLastError() );

    CUDA_SAFE_CALL( cudaDeviceSynchronize() );

    if ( previous_stack_size < larger_stack_size ) {
      CUDA_SAFE_CALL( cudaDeviceSetLimit( cudaLimitStackSize , previous_stack_size ) );
    }

  }

  template< typename TaskType >
  static
  typename TaskType::function_type
  get_function_pointer()
  void
  get_function_pointer(
    typename TaskType::function_type& ptr,
    typename TaskType::destroy_type& dtor
  )
  {
    using function_type = typename TaskType::function_type ;
    using function_type = typename TaskType::function_type;
    using destroy_type = typename TaskType::destroy_type;

    function_type * const ptr =
      (function_type*) cuda_internal_scratch_unified( sizeof(function_type) );
    void* storage = cuda_internal_scratch_unified(
      Kokkos::Cuda(),
      sizeof(function_type) + sizeof(destroy_type)
    );
    function_type* ptr_ptr = (function_type*)storage;
    destroy_type* dtor_ptr = (destroy_type*)((char*)storage + sizeof(function_type));

    CUDA_SAFE_CALL( cudaDeviceSynchronize() );

    set_cuda_task_base_apply_function_pointer<TaskType><<<1,1>>>(ptr);
    set_cuda_task_base_apply_function_pointer<TaskType><<<1,1>>>(ptr_ptr, dtor_ptr);

    CUDA_SAFE_CALL( cudaGetLastError() );
    CUDA_SAFE_CALL( cudaDeviceSynchronize() );

    return *ptr ;
    ptr = *ptr_ptr;
    dtor = *dtor_ptr;

  }
};

extern template class TaskQueue< Kokkos::Cuda > ;
extern template class TaskQueue< Kokkos::Cuda, default_tasking_memory_space_for_execution_space_t<Kokkos::Cuda> > ;

}} /* namespace Kokkos::Impl */

@@ -136,8 +588,8 @@ namespace Impl {
 * When executing a single thread task the syncwarp or other
 * warp synchronizing functions must not be called.
 */
template<>
class TaskExec< Kokkos::Cuda >
template <class Scheduler>
class TaskExec<Kokkos::Cuda, Scheduler>
{
private:

@@ -148,24 +600,39 @@ private:
  TaskExec & operator = ( TaskExec && ) = delete ;
  TaskExec & operator = ( TaskExec const & ) = delete ;

  friend class Kokkos::Impl::TaskQueue< Kokkos::Cuda > ;
  friend class Kokkos::Impl::TaskQueueSpecialization< Kokkos::Cuda > ;
  friend class Kokkos::Impl::TaskQueue< Kokkos::Cuda, default_tasking_memory_space_for_execution_space_t<Kokkos::Cuda> > ;
  template <class, class>
  friend class Kokkos::Impl::TaskQueueSpecializationConstrained;
  template <class>
  friend class Kokkos::Impl::TaskQueueSpecialization;

  int32_t * m_team_shmem ;
  const int m_team_size ;
  Scheduler m_scheduler;

  // If constructed with arg_team_size == 1 the object
  // can only be used by 0 == threadIdx.y.
  __device__
  TaskExec( int32_t * arg_team_shmem , int arg_team_size = blockDim.y )
    : m_team_shmem( arg_team_shmem )
    , m_team_size( arg_team_size ) {}
  KOKKOS_INLINE_FUNCTION
  TaskExec(
    Scheduler const& parent_scheduler,
    int32_t* arg_team_shmem,
    int arg_team_size = blockDim.y
  )
    : m_team_shmem(arg_team_shmem),
      m_team_size(arg_team_size),
      m_scheduler(parent_scheduler.get_team_scheduler(league_rank()))
  { }

public:

  using thread_team_member = TaskExec;

#if defined( __CUDA_ARCH__ )
  __device__ int team_rank() const { return threadIdx.y ; }
  __device__ int team_size() const { return m_team_size ; }
  __device__ int team_rank() const { return threadIdx.y ; }
  __device__ int team_size() const { return m_team_size ; }
  //__device__ int league_rank() const { return threadIdx.z; }
  __device__ int league_rank() const { return blockIdx.x * blockDim.z + threadIdx.z; }
  __device__ int league_size() const { return blockDim.z * gridDim.x; }

  __device__ void team_barrier() const
  {

@@ -186,13 +653,18 @@ public:
  }

#else
  __host__ int team_rank() const { return 0 ; }
  __host__ int team_size() const { return 0 ; }
  __host__ int team_rank() const { return 0 ; }
  __host__ int team_size() const { return 0 ; }
  __host__ int league_rank() const { return 0; }
  __host__ int league_size() const { return 0; }
  __host__ void team_barrier() const {}
  template< class ValueType >
  __host__ void team_broadcast( ValueType & , const int ) const {}
#endif

  KOKKOS_INLINE_FUNCTION Scheduler const& scheduler() const noexcept { return m_scheduler; }
  KOKKOS_INLINE_FUNCTION Scheduler& scheduler() noexcept { return m_scheduler; }

};

}} /* namespace Kokkos::Impl */

@@ -203,20 +675,22 @@ public:
namespace Kokkos {
namespace Impl {

template<typename iType>
struct TeamThreadRangeBoundariesStruct<iType, TaskExec< Kokkos::Cuda > >
template<typename iType, typename Scheduler>
struct TeamThreadRangeBoundariesStruct<iType, TaskExec<Kokkos::Cuda, Scheduler>>
{
  typedef iType index_type;
  using index_type = iType;
  using member_type = TaskExec<Kokkos::Cuda, Scheduler>;

  const iType start ;
  const iType end ;
  const iType increment ;
  const TaskExec< Kokkos::Cuda > & thread;
  member_type const& thread;

#if defined( __CUDA_ARCH__ )

  __device__ inline
  TeamThreadRangeBoundariesStruct
    ( const TaskExec< Kokkos::Cuda > & arg_thread, const iType& arg_count)
    ( member_type const& arg_thread, const iType& arg_count)
    : start( threadIdx.y )
    , end(arg_count)
    , increment( blockDim.y )

@@ -225,7 +699,7 @@ struct TeamThreadRangeBoundariesStruct<iType, TaskExec< Kokkos::Cuda > >

  __device__ inline
  TeamThreadRangeBoundariesStruct
    ( const TaskExec< Kokkos::Cuda > & arg_thread
    ( member_type const& arg_thread
    , const iType & arg_start
    , const iType & arg_end
    )

@@ -238,10 +712,10 @@ struct TeamThreadRangeBoundariesStruct<iType, TaskExec< Kokkos::Cuda > >
#else

  TeamThreadRangeBoundariesStruct
    ( const TaskExec< Kokkos::Cuda > & arg_thread, const iType& arg_count);
    ( member_type const& arg_thread, const iType& arg_count);

  TeamThreadRangeBoundariesStruct
    ( const TaskExec< Kokkos::Cuda > & arg_thread
    ( member_type const& arg_thread
    , const iType & arg_start
    , const iType & arg_end
    );

@@ -252,20 +726,22 @@ struct TeamThreadRangeBoundariesStruct<iType, TaskExec< Kokkos::Cuda > >

//----------------------------------------------------------------------------

template<typename iType>
struct ThreadVectorRangeBoundariesStruct<iType, TaskExec< Kokkos::Cuda > >
template<typename iType, typename Scheduler>
struct ThreadVectorRangeBoundariesStruct<iType, TaskExec< Kokkos::Cuda, Scheduler > >
{
  typedef iType index_type;
  using index_type = iType;
  using member_type = TaskExec<Kokkos::Cuda, Scheduler>;

  const index_type start ;
  const index_type end ;
  const index_type increment ;
  const TaskExec< Kokkos::Cuda > & thread;
  const member_type& thread;

#if defined( __CUDA_ARCH__ )

  __device__ inline
  ThreadVectorRangeBoundariesStruct
    ( const TaskExec< Kokkos::Cuda > & arg_thread, const index_type& arg_count )
    ( member_type const& arg_thread, const index_type& arg_count )
    : start( threadIdx.x )
    , end(arg_count)
    , increment( blockDim.x )

@@ -274,9 +750,9 @@ struct ThreadVectorRangeBoundariesStruct<iType, TaskExec< Kokkos::Cuda > >

  __device__ inline
  ThreadVectorRangeBoundariesStruct
    ( const TaskExec< Kokkos::Cuda > & arg_thread, const index_type& arg_begin, const index_type& arg_end )
    ( member_type const& arg_thread, const index_type& arg_begin, const index_type& arg_end )
    : start( arg_begin + threadIdx.x )
    , end(arg_count)
    , end(arg_end)
    , increment( blockDim.x )
    , thread(arg_thread)
    {}

@@ -284,10 +760,10 @@ struct ThreadVectorRangeBoundariesStruct<iType, TaskExec< Kokkos::Cuda > >
#else

  ThreadVectorRangeBoundariesStruct
    ( const TaskExec< Kokkos::Cuda > & arg_thread, const index_type& arg_count );
    ( member_type const& arg_thread, const index_type& arg_count );

  ThreadVectorRangeBoundariesStruct
    ( const TaskExec< Kokkos::Cuda > & arg_thread, const index_type& arg_begin, const index_type& arg_end);
    ( member_type const& arg_thread, const index_type& arg_begin, const index_type& arg_end);

#endif

@@ -299,69 +775,69 @@ struct ThreadVectorRangeBoundariesStruct<iType, TaskExec< Kokkos::Cuda > >

namespace Kokkos {

template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct< iType, Impl::TaskExec< Kokkos::Cuda > >
TeamThreadRange( const Impl::TaskExec< Kokkos::Cuda > & thread, const iType & count )
{
  return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::TaskExec< Kokkos::Cuda > >( thread, count );
}
//template<typename iType>
//KOKKOS_INLINE_FUNCTION
//Impl::TeamThreadRangeBoundariesStruct< iType, Impl::TaskExec< Kokkos::Cuda > >
//TeamThreadRange( const Impl::TaskExec< Kokkos::Cuda > & thread, const iType & count )
//{
//  return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::TaskExec< Kokkos::Cuda > >( thread, count );
//}
//
//template<typename iType1, typename iType2>
//KOKKOS_INLINE_FUNCTION
//Impl::TeamThreadRangeBoundariesStruct
//  < typename std::common_type<iType1,iType2>::type
//  , Impl::TaskExec< Kokkos::Cuda > >
//TeamThreadRange( const Impl::TaskExec< Kokkos::Cuda > & thread
//               , const iType1 & begin, const iType2 & end )
//{
//  typedef typename std::common_type< iType1, iType2 >::type iType;
//  return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::TaskExec< Kokkos::Cuda > >(
//    thread, iType(begin), iType(end) );
//}
//
//template<typename iType>
//KOKKOS_INLINE_FUNCTION
//Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >
//ThreadVectorRange( const Impl::TaskExec< Kokkos::Cuda > & thread
//                 , const iType & count )
//{
//  return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >(thread,count);
//}
//
//template<typename iType>
//KOKKOS_INLINE_FUNCTION
//Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >
//ThreadVectorRange( const Impl::TaskExec< Kokkos::Cuda > & thread
//                 , const iType & arg_begin
//                 , const iType & arg_end )
//{
//  return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >(thread,arg_begin,arg_end);
//}

template<typename iType1, typename iType2>
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct
  < typename std::common_type<iType1,iType2>::type
  , Impl::TaskExec< Kokkos::Cuda > >
TeamThreadRange( const Impl::TaskExec< Kokkos::Cuda > & thread
               , const iType1 & begin, const iType2 & end )
{
  typedef typename std::common_type< iType1, iType2 >::type iType;
  return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::TaskExec< Kokkos::Cuda > >(
    thread, iType(begin), iType(end) );
}
// KOKKOS_INLINE_FUNCTION
// Impl::ThreadSingleStruct<Impl::TaskExec< Kokkos::Cuda > >
// PerTeam(const Impl::TaskExec< Kokkos::Cuda >& thread)
// {
//   return Impl::ThreadSingleStruct<Impl::TaskExec< Kokkos::Cuda > >(thread);
// }

template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >
ThreadVectorRange( const Impl::TaskExec< Kokkos::Cuda > & thread
                 , const iType & count )
{
  return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >(thread,count);
}

template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >
ThreadVectorRange( const Impl::TaskExec< Kokkos::Cuda > & thread
                 , const iType & arg_begin
                 , const iType & arg_end )
{
  return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >(thread,arg_begin,arg_end);
}

KOKKOS_INLINE_FUNCTION
Impl::ThreadSingleStruct<Impl::TaskExec< Kokkos::Cuda > >
PerTeam(const Impl::TaskExec< Kokkos::Cuda >& thread)
{
  return Impl::ThreadSingleStruct<Impl::TaskExec< Kokkos::Cuda > >(thread);
}

KOKKOS_INLINE_FUNCTION
Impl::VectorSingleStruct<Impl::TaskExec< Kokkos::Cuda > >
PerThread(const Impl::TaskExec< Kokkos::Cuda >& thread)
{
  return Impl::VectorSingleStruct<Impl::TaskExec< Kokkos::Cuda > >(thread);
}
// KOKKOS_INLINE_FUNCTION
// Impl::VectorSingleStruct<Impl::TaskExec< Kokkos::Cuda > >
// PerThread(const Impl::TaskExec< Kokkos::Cuda >& thread)
// {
//   return Impl::VectorSingleStruct<Impl::TaskExec< Kokkos::Cuda > >(thread);
// }

/** \brief Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
 *
 * The range i=0..N-1 is mapped to all threads of the calling thread team.
 * This functionality requires C++11 support.
 */
template<typename iType, class Lambda>
template<typename iType, class Lambda, class Scheduler>
KOKKOS_INLINE_FUNCTION
void parallel_for
  ( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries
  ( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda, Scheduler > >& loop_boundaries
  , const Lambda& lambda
  )
{

@@ -370,10 +846,10 @@ void parallel_for
  }
}

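From the caller's side, this overload is reached through the ordinary nested-parallelism idiom. A hedged usage sketch (not from this patch; `scale_entries`, `data`, and `n` are illustrative names), with the team member handle a task body receives:

// Inside a task's operator(): each team thread handles a strided
// subset of [0, n) via the TeamThreadRange overload defined above.
template <class MemberType>
KOKKOS_INLINE_FUNCTION
void scale_entries(MemberType& member, double* data, int n) {
  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), [&](int i) {
    data[i] *= 2.0;
  });
  member.team_barrier();  // make the results visible to the whole team
}
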
template< typename iType, class Lambda >
template< typename iType, class Lambda, class Scheduler >
KOKKOS_INLINE_FUNCTION
void parallel_for
  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda, Scheduler > >& loop_boundaries,
   const Lambda & lambda) {
  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
    lambda(i);

@@ -459,14 +935,14 @@ void parallel_reduce
// blockDim.y == team_size
// threadIdx.x == position in vec
// threadIdx.y == member number
template< typename iType, class Lambda, typename ValueType >
template< typename iType, class Lambda, typename ValueType, class Scheduler >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
  (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
  (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda, Scheduler > >& loop_boundaries,
   const Lambda & lambda,
   ValueType& initialized_result) {

  //TODO what is the point of creating this temporary?
  //TODO @internal_documentation what is the point of creating this temporary?
  ValueType result = initialized_result;
  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
    lambda(i,result);

@@ -487,15 +963,15 @@ void parallel_reduce
  }
}

template< typename iType, class Lambda, typename ReducerType >
template< typename iType, class Lambda, typename ReducerType, class Scheduler >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
  (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
  (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda, Scheduler > >& loop_boundaries,
   const Lambda & lambda,
   const ReducerType& reducer) {

  typedef typename ReducerType::value_type ValueType;
  //TODO what is the point of creating this temporary?
  //TODO @internal_documentation what is the point of creating this temporary?
  ValueType result = ValueType();
  reducer.init(result);

@@ -549,10 +1025,10 @@ void parallel_reduce
// blockDim.y == team_size
// threadIdx.x == position in vec
// threadIdx.y == member number
template< typename iType, class Lambda, typename ValueType >
template< typename iType, class Lambda, typename ValueType, class Scheduler >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda, Scheduler > >& loop_boundaries,
   const Lambda & lambda,
   ValueType& initialized_result) {

@@ -576,10 +1052,10 @@ void parallel_reduce
  }
}

template< typename iType, class Lambda, typename ReducerType >
template< typename iType, class Lambda, typename ReducerType, class Scheduler >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda, Scheduler > >& loop_boundaries,
   const Lambda & lambda,
   const ReducerType& reducer) {

@@ -611,10 +1087,10 @@ void parallel_reduce
// blockDim.y == team_size
// threadIdx.x == position in vec
// threadIdx.y == member number
template< typename iType, class Closure >
template< typename iType, class Closure, class Scheduler >
KOKKOS_INLINE_FUNCTION
void parallel_scan
  (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
  (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda, Scheduler > >& loop_boundaries,
   const Closure & closure )
{
  // Extract value_type from closure

@@ -676,10 +1152,10 @@ void parallel_scan
// blockDim.y == team_size
// threadIdx.x == position in vec
// threadIdx.y == member number
template< typename iType, class Closure >
template< typename iType, class Closure, class Scheduler >
KOKKOS_INLINE_FUNCTION
void parallel_scan
  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda, Scheduler > >& loop_boundaries,
   const Closure & closure )
{
  // Extract value_type from closure

@@ -735,25 +1211,25 @@ void parallel_scan

namespace Kokkos {

template<class FunctorType>
template<class FunctorType, class Scheduler>
KOKKOS_INLINE_FUNCTION
void single(const Impl::VectorSingleStruct<Impl::TaskExec< Kokkos::Cuda > >& , const FunctorType& lambda) {
void single(const Impl::VectorSingleStruct<Impl::TaskExec< Kokkos::Cuda, Scheduler > >& , const FunctorType& lambda) {
#ifdef __CUDA_ARCH__
  if(threadIdx.x == 0) lambda();
#endif
}

template<class FunctorType>
template<class FunctorType, class Scheduler>
KOKKOS_INLINE_FUNCTION
void single(const Impl::ThreadSingleStruct<Impl::TaskExec< Kokkos::Cuda > >& , const FunctorType& lambda) {
void single(const Impl::ThreadSingleStruct<Impl::TaskExec< Kokkos::Cuda, Scheduler > >& , const FunctorType& lambda) {
#ifdef __CUDA_ARCH__
  if(threadIdx.x == 0 && threadIdx.y == 0) lambda();
#endif
}

template<class FunctorType, class ValueType>
template<class FunctorType, class ValueType, class Scheduler>
KOKKOS_INLINE_FUNCTION
void single(const Impl::VectorSingleStruct<Impl::TaskExec< Kokkos::Cuda > >& s , const FunctorType& lambda, ValueType& val) {
void single(const Impl::VectorSingleStruct<Impl::TaskExec< Kokkos::Cuda, Scheduler > >& s , const FunctorType& lambda, ValueType& val) {
#ifdef __CUDA_ARCH__
  if(threadIdx.x == 0) lambda(val);
  if ( 1 < s.team_member.team_size() ) {

@@ -762,9 +1238,9 @@ namespace Kokkos {
#endif
}

template<class FunctorType, class ValueType>
template<class FunctorType, class ValueType, class Scheduler>
KOKKOS_INLINE_FUNCTION
void single(const Impl::ThreadSingleStruct<Impl::TaskExec< Kokkos::Cuda > >& single_struct, const FunctorType& lambda, ValueType& val) {
void single(const Impl::ThreadSingleStruct<Impl::TaskExec< Kokkos::Cuda, Scheduler > >& single_struct, const FunctorType& lambda, ValueType& val) {
#ifdef __CUDA_ARCH__
  if(threadIdx.x == 0 && threadIdx.y == 0) {
    lambda(val);

@@ -56,9 +56,9 @@
#include <utility>
#include <Kokkos_Parallel.hpp>

#include <Cuda/Kokkos_CudaExec.hpp>
#include <Cuda/Kokkos_Cuda_KernelLaunch.hpp>
#include <Cuda/Kokkos_Cuda_ReduceScan.hpp>
#include <Cuda/Kokkos_Cuda_Internal.hpp>
#include <Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp>
#include <Kokkos_Vectorization.hpp>

#if defined(KOKKOS_ENABLE_PROFILING)

@@ -101,11 +101,13 @@ struct CudaJoinFunctor {
 * total available shared memory must be partitioned among teams.
 */
class CudaTeamMember {
private:

public:
  typedef Kokkos::Cuda execution_space ;
  typedef execution_space::scratch_memory_space scratch_memory_space ;

private:

  mutable void * m_team_reduce ;
  scratch_memory_space m_team_shared ;
  int m_team_reduce_size ;

@@ -221,12 +223,21 @@ public:
  KOKKOS_INLINE_FUNCTION
  typename std::enable_if< is_reducer< ReducerType >::value >::type
  team_reduce( ReducerType const & reducer ) const noexcept
  {
    team_reduce(reducer,reducer.reference());
  }

  template< typename ReducerType >
  KOKKOS_INLINE_FUNCTION
  typename std::enable_if< is_reducer< ReducerType >::value >::type
  team_reduce( ReducerType const & reducer, typename ReducerType::value_type& value ) const noexcept
  {
#ifdef __CUDA_ARCH__
    cuda_intra_block_reduction(reducer,blockDim.y);
    cuda_intra_block_reduction(reducer,value,blockDim.y);
#endif /* #ifdef __CUDA_ARCH__ */
  }

  //--------------------------------------------------------------------------
  /** \brief Intra-team exclusive prefix sum with team_rank() ordering
   *  with intra-team non-deterministic ordering accumulation.

@@ -281,20 +292,28 @@ public:
  template< typename ReducerType >
  KOKKOS_INLINE_FUNCTION static
  typename std::enable_if< is_reducer< ReducerType >::value >::type
  vector_reduce( ReducerType const & reducer )
  vector_reduce( ReducerType const & reducer ) {
    vector_reduce(reducer,reducer.reference());
  }

  template< typename ReducerType >
  KOKKOS_INLINE_FUNCTION static
  typename std::enable_if< is_reducer< ReducerType >::value >::type
  vector_reduce( ReducerType const & reducer, typename ReducerType::value_type& value )
  {

#ifdef __CUDA_ARCH__
    if(blockDim.x == 1) return;

    // Intra vector lane shuffle reduction:
    typename ReducerType::value_type tmp ( reducer.reference() );
    typename ReducerType::value_type tmp ( value );
    typename ReducerType::value_type tmp2 = tmp;

    unsigned mask = blockDim.x==32?0xffffffff:((1<<blockDim.x)-1)<<((threadIdx.y%(32/blockDim.x))*blockDim.x);

    for ( int i = blockDim.x ; ( i >>= 1 ) ; ) {
      cuda_shfl_down( reducer.reference() , tmp , i , blockDim.x , mask );
      if ( (int)threadIdx.x < i ) { reducer.join( tmp , reducer.reference() ); }
      cuda_shfl_down( tmp2 , tmp , i , blockDim.x , mask );
      if ( (int)threadIdx.x < i ) { reducer.join( tmp , tmp2 ); }
    }

    // Broadcast from root lane to all other lanes.

@@ -302,7 +321,9 @@ public:
    // because floating point summation is not associative
    // and thus different threads could have different results.

    cuda_shfl( reducer.reference() , tmp , 0 , blockDim.x , mask );
    cuda_shfl( tmp2 , tmp , 0 , blockDim.x , mask );
    value = tmp2;
    reducer.reference() = tmp2;
#endif
  }

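The shuffle loop in `vector_reduce` is the classic warp-level reduction: each halving step folds the upper lanes into the lower ones, and lane 0's total is then broadcast so every lane agrees. A standalone sketch of the same idiom in plain CUDA (not from this patch; summation over a full 32-lane warp):

__device__ float warp_sum(float v) {
  const unsigned mask = 0xffffffffu;        // full-warp participation
  for (int i = 16; i > 0; i >>= 1) {
    v += __shfl_down_sync(mask, v, i, 32);  // fold lane (x + i) into lane x
  }
  return __shfl_sync(mask, v, 0, 32);       // broadcast lane 0's result
}
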
@@ -543,19 +564,37 @@ struct TeamThreadRangeBoundariesStruct<iType,CudaTeamMember> {
  const iType end;

  KOKKOS_INLINE_FUNCTION
  TeamThreadRangeBoundariesStruct (const CudaTeamMember& thread_, const iType& count)
  TeamThreadRangeBoundariesStruct (const CudaTeamMember& thread_, iType count)
    : member(thread_)
    , start( 0 )
    , end( count ) {}

  KOKKOS_INLINE_FUNCTION
  TeamThreadRangeBoundariesStruct (const CudaTeamMember& thread_, const iType& begin_, const iType& end_)
  TeamThreadRangeBoundariesStruct (const CudaTeamMember& thread_, iType begin_, iType end_)
    : member(thread_)
    , start( begin_ )
    , end( end_ ) {}
};

template<typename iType>
struct TeamVectorRangeBoundariesStruct<iType,CudaTeamMember> {
  typedef iType index_type;
  const CudaTeamMember& member;
  const iType start;
  const iType end;

  KOKKOS_INLINE_FUNCTION
  TeamVectorRangeBoundariesStruct (const CudaTeamMember& thread_, const iType& count)
    : member(thread_)
    , start( 0 )
    , end( count ) {}

  KOKKOS_INLINE_FUNCTION
  TeamVectorRangeBoundariesStruct (const CudaTeamMember& thread_, const iType& begin_, const iType& end_)
    : member(thread_)
    , start( begin_ )
    , end( end_ ) {}
};

template<typename iType>
struct ThreadVectorRangeBoundariesStruct<iType,CudaTeamMember> {

@@ -564,19 +603,19 @@ struct ThreadVectorRangeBoundariesStruct<iType,CudaTeamMember> {
  const index_type end;

  KOKKOS_INLINE_FUNCTION
  ThreadVectorRangeBoundariesStruct (const CudaTeamMember, const index_type& count)
  ThreadVectorRangeBoundariesStruct (const CudaTeamMember, index_type count)
    : start( static_cast<index_type>(0) ), end( count ) {}

  KOKKOS_INLINE_FUNCTION
  ThreadVectorRangeBoundariesStruct (const index_type& count)
  ThreadVectorRangeBoundariesStruct (index_type count)
    : start( static_cast<index_type>(0) ), end( count ) {}

  KOKKOS_INLINE_FUNCTION
  ThreadVectorRangeBoundariesStruct (const CudaTeamMember, const index_type& arg_begin, const index_type& arg_end)
  ThreadVectorRangeBoundariesStruct (const CudaTeamMember, index_type arg_begin, index_type arg_end)
    : start( arg_begin ), end( arg_end ) {}

  KOKKOS_INLINE_FUNCTION
  ThreadVectorRangeBoundariesStruct (const index_type& arg_begin, const index_type& arg_end)
  ThreadVectorRangeBoundariesStruct (index_type arg_begin, index_type arg_end)
    : start( arg_begin ), end( arg_end ) {}
};

@@ -585,7 +624,7 @@ struct ThreadVectorRangeBoundariesStruct<iType,CudaTeamMember> {
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct< iType, Impl::CudaTeamMember >
TeamThreadRange( const Impl::CudaTeamMember & thread, const iType & count ) {
TeamThreadRange( const Impl::CudaTeamMember & thread, iType count ) {
  return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::CudaTeamMember >( thread, count );
}

@@ -593,22 +632,38 @@ template< typename iType1, typename iType2 >
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct< typename std::common_type< iType1, iType2 >::type,
                                       Impl::CudaTeamMember >
TeamThreadRange( const Impl::CudaTeamMember & thread, const iType1 & begin, const iType2 & end ) {
TeamThreadRange( const Impl::CudaTeamMember & thread, iType1 begin, iType2 end ) {
  typedef typename std::common_type< iType1, iType2 >::type iType;
  return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::CudaTeamMember >( thread, iType(begin), iType(end) );
}

template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::TeamVectorRangeBoundariesStruct< iType, Impl::CudaTeamMember >
TeamVectorRange( const Impl::CudaTeamMember & thread, const iType & count ) {
  return Impl::TeamVectorRangeBoundariesStruct< iType, Impl::CudaTeamMember >( thread, count );
}

template< typename iType1, typename iType2 >
KOKKOS_INLINE_FUNCTION
Impl::TeamVectorRangeBoundariesStruct< typename std::common_type< iType1, iType2 >::type,
                                       Impl::CudaTeamMember >
TeamVectorRange( const Impl::CudaTeamMember & thread, const iType1 & begin, const iType2 & end ) {
  typedef typename std::common_type< iType1, iType2 >::type iType;
  return Impl::TeamVectorRangeBoundariesStruct< iType, Impl::CudaTeamMember >( thread, iType(begin), iType(end) );
}

template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >
ThreadVectorRange(const Impl::CudaTeamMember& thread, const iType& count) {
ThreadVectorRange(const Impl::CudaTeamMember& thread, iType count) {
  return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >(thread,count);
}

template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >
ThreadVectorRange(const Impl::CudaTeamMember& thread, const iType& arg_begin, const iType& arg_end) {
ThreadVectorRange(const Impl::CudaTeamMember& thread, iType arg_begin, iType arg_end) {
  return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >(thread,arg_begin,arg_end);
}

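`TeamVectorRange`, newly added above, strides a single index range across team threads and vector lanes at once. A hedged usage sketch (not from this patch; `SaxpyTeam`, `a`, `x`, `y`, and `n` are illustrative names, and the member type is left generic as in ordinary team-policy functors):

// Team-policy functor body: one flat loop over [0, n) shared by
// all threads and vector lanes of the team via TeamVectorRange.
template <class TeamMember>
struct SaxpyTeam {
  double a; double* x; double* y; int n;
  KOKKOS_INLINE_FUNCTION
  void operator()(const TeamMember& team) const {
    Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n), [&](int i) {
      y[i] += a * x[i];
    });
  }
};
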
@@ -667,16 +722,16 @@ parallel_reduce
  )
{
#ifdef __CUDA_ARCH__

  reducer.init( reducer.reference() );
  typename ReducerType::value_type value;
  reducer.init( value );

  for( iType i = loop_boundaries.start + threadIdx.y
     ; i < loop_boundaries.end
     ; i += blockDim.y ) {
    closure(i,reducer.reference());
    closure(i,value);
  }

  loop_boundaries.member.team_reduce( reducer );
  loop_boundaries.member.team_reduce( reducer, value );

#endif
}

@@ -701,19 +756,88 @@ parallel_reduce
  )
{
#ifdef __CUDA_ARCH__

  Kokkos::Sum<ValueType> reducer(result);
  ValueType val;
  Kokkos::Sum<ValueType> reducer(val);

  reducer.init( reducer.reference() );

  for( iType i = loop_boundaries.start + threadIdx.y
     ; i < loop_boundaries.end
     ; i += blockDim.y ) {
    closure(i,result);
    closure(i,val);
  }

  loop_boundaries.member.team_reduce( reducer );
  loop_boundaries.member.team_reduce( reducer , val);
  result = reducer.reference();
#endif
}

template<typename iType, class Closure >
KOKKOS_INLINE_FUNCTION
void parallel_for
  ( const Impl::TeamVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember>&
      loop_boundaries
  , const Closure & closure
  )
{
#ifdef __CUDA_ARCH__
  for( iType i = loop_boundaries.start + threadIdx.y * blockDim.x + threadIdx.x
     ; i < loop_boundaries.end
     ; i += blockDim.y*blockDim.x )
    closure(i);
#endif
}

template< typename iType, class Closure, class ReducerType >
KOKKOS_INLINE_FUNCTION
typename std::enable_if< Kokkos::is_reducer< ReducerType >::value >::type
parallel_reduce
  ( const Impl::TeamVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember> &
      loop_boundaries
  , const Closure & closure
  , const ReducerType & reducer
  )
{
#ifdef __CUDA_ARCH__
  typename ReducerType::value_type value;
  reducer.init( value );

  for( iType i = loop_boundaries.start + threadIdx.y * blockDim.x + threadIdx.x
     ; i < loop_boundaries.end
     ; i += blockDim.y * blockDim.x ) {
    closure(i,value);
  }

  loop_boundaries.member.vector_reduce( reducer, value );
  loop_boundaries.member.team_reduce( reducer, value );
#endif
}

template< typename iType, class Closure, typename ValueType >
KOKKOS_INLINE_FUNCTION
typename std::enable_if< ! Kokkos::is_reducer< ValueType >::value >::type
parallel_reduce
  ( const Impl::TeamVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember> &
      loop_boundaries
  , const Closure & closure
  , ValueType & result
  )
{
#ifdef __CUDA_ARCH__
  ValueType val;
  Kokkos::Sum<ValueType> reducer(val);

  reducer.init( reducer.reference() );

  for( iType i = loop_boundaries.start + threadIdx.y * blockDim.x + threadIdx.x
     ; i < loop_boundaries.end
     ; i += blockDim.y * blockDim.x ) {
    closure(i,val);
  }

  loop_boundaries.member.vector_reduce( reducer );
  loop_boundaries.member.team_reduce( reducer );
  result = reducer.reference();
#endif
}

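The scalar overload above is selected when a plain value, rather than a reducer object, is passed to a nested reduction; a `Kokkos::Sum` wraps it internally. A hedged caller-side sketch (not from this patch; `team_vector_sum`, `v`, and `n` are illustrative names):

// Nested reduction over team threads and vector lanes; the scalar
// `total` routes through the non-reducer overload defined above.
template <class TeamMember>
KOKKOS_INLINE_FUNCTION
double team_vector_sum(const TeamMember& team, const double* v, int n) {
  double total = 0.0;
  Kokkos::parallel_reduce(Kokkos::TeamVectorRange(team, n),
    [&](int i, double& partial) { partial += v[i]; },  // per-lane accumulation
    total);                                            // combined as a sum
  return total;  // every thread of the team sees the reduced value
}
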
@@ -241,7 +241,7 @@ class ViewDataHandle< Traits ,
      sizeof(typename Traits::const_value_type) == 16 )
    &&
    // Random access trait
    ( Traits::memory_traits::RandomAccess != 0 )
    ( Traits::memory_traits::is_random_access != 0 )
  )>::type >
{
public:

@@ -102,9 +102,8 @@ public:
  const dim3 grid( Kokkos::Impl::cuda_internal_multiprocessor_count() , 1 , 1 );
  const dim3 block( 1 , Kokkos::Impl::CudaTraits::WarpSize , warps_per_block );
  const int shared = 0 ;
  const cudaStream_t stream = 0 ;

  Kokkos::Impl::CudaParallelLaunch<Self>(*this, grid, block, shared, stream);
  Kokkos::Impl::CudaParallelLaunch<Self>(*this, grid, block, shared, Cuda().impl_internal_space_instance() , false );
}

inline

@@ -0,0 +1,152 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/

#include <Kokkos_Macros.hpp>

#ifdef KOKKOS_ENABLE_HPX
#include <Kokkos_HPX.hpp>

#include <hpx/util/yield_while.hpp>

namespace Kokkos {
namespace Experimental {

bool HPX::m_hpx_initialized = false;
Kokkos::Impl::thread_buffer HPX::m_buffer;
#if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH)
hpx::future<void> HPX::m_future = hpx::make_ready_future<void>();
#endif

int HPX::concurrency() {
  hpx::runtime *rt = hpx::get_runtime_ptr();
  if (rt == nullptr) {
    return hpx::threads::hardware_concurrency();
  } else {
    if (hpx::threads::get_self_ptr() == nullptr) {
      return hpx::resource::get_thread_pool(0).get_os_thread_count();
    } else {
      return hpx::this_thread::get_pool()->get_os_thread_count();
    }
  }
}

void HPX::impl_initialize(int thread_count) {
  hpx::runtime *rt = hpx::get_runtime_ptr();
  if (rt == nullptr) {
    std::vector<std::string> config = {
        "hpx.os_threads=" + std::to_string(thread_count),
#ifdef KOKKOS_DEBUG
        "--hpx:attach-debugger=exception",
#endif
    };
    int argc_hpx = 1;
    char name[] = "kokkos_hpx";
    char *argv_hpx[] = {name, nullptr};
    hpx::start(nullptr, argc_hpx, argv_hpx, config);

    // NOTE: Wait for runtime to start. hpx::start returns as soon as
    // possible, meaning some operations are not allowed immediately
    // after hpx::start. Notably, hpx::stop needs state_running. This
    // needs to be fixed in HPX itself.

    // Get runtime pointer again after it has been started.
    rt = hpx::get_runtime_ptr();
    hpx::util::yield_while(
        [rt]() { return rt->get_state() < hpx::state_running; });

    m_hpx_initialized = true;
  }
}

void HPX::impl_initialize() {
  hpx::runtime *rt = hpx::get_runtime_ptr();
  if (rt == nullptr) {
    std::vector<std::string> config = {
#ifdef KOKKOS_DEBUG
        "--hpx:attach-debugger=exception",
#endif
    };
    int argc_hpx = 1;
    char name[] = "kokkos_hpx";
    char *argv_hpx[] = {name, nullptr};
    hpx::start(nullptr, argc_hpx, argv_hpx, config);

    // NOTE: Wait for runtime to start. hpx::start returns as soon as
    // possible, meaning some operations are not allowed immediately
    // after hpx::start. Notably, hpx::stop needs state_running. This
    // needs to be fixed in HPX itself.

    // Get runtime pointer again after it has been started.
    rt = hpx::get_runtime_ptr();
    hpx::util::yield_while(
        [rt]() { return rt->get_state() < hpx::state_running; });

    m_hpx_initialized = true;
  }
}

bool HPX::impl_is_initialized() noexcept {
  hpx::runtime *rt = hpx::get_runtime_ptr();
  return rt != nullptr;
}

void HPX::impl_finalize() {
  if (m_hpx_initialized) {
    hpx::runtime *rt = hpx::get_runtime_ptr();
    if (rt != nullptr) {
      hpx::apply([]() { hpx::finalize(); });
      hpx::stop();
    } else {
      Kokkos::abort("Kokkos::Experimental::HPX::impl_finalize: Kokkos started "
                    "HPX but something else already stopped HPX\n");
    }
  }
}

} // namespace Experimental
} // namespace Kokkos

#else
void KOKKOS_CORE_SRC_IMPL_HPX_PREVENT_LINK_ERROR() {}
#endif //#ifdef KOKKOS_ENABLE_HPX
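The NOTE in both `impl_initialize` overloads describes the core idiom of this file: `hpx::start` returns before the runtime is usable, so the state is polled cooperatively until it reaches `state_running`. A hedged standalone sketch reusing only the calls shown above (the `<hpx/hpx_start.hpp>` include and the function name are assumptions for illustration):

#include <hpx/hpx_start.hpp>
#include <hpx/util/yield_while.hpp>
#include <string>
#include <vector>

void start_hpx_and_wait(int threads) {
  std::vector<std::string> config = {
      "hpx.os_threads=" + std::to_string(threads)};
  int argc = 1;
  char name[] = "sketch";
  char *argv[] = {name, nullptr};
  hpx::start(nullptr, argc, argv, config);

  // Re-fetch the runtime pointer and yield until it is actually running;
  // operations such as hpx::stop are invalid before state_running.
  hpx::runtime *rt = hpx::get_runtime_ptr();
  hpx::util::yield_while(
      [rt]() { return rt->get_state() < hpx::state_running; });
}
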
@@ -41,38 +41,25 @@
//@HEADER
*/

#ifndef KOKKOS_STATICASSERT_HPP
#define KOKKOS_STATICASSERT_HPP
#include <Kokkos_Macros.hpp>
#if defined(KOKKOS_ENABLE_HPX) && defined(KOKKOS_ENABLE_TASKDAG)

#include <Kokkos_Core.hpp>

#include <impl/Kokkos_TaskQueue_impl.hpp>

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

namespace Kokkos {
namespace Impl {

template < bool , class T = void >
struct StaticAssert ;

template< class T >
struct StaticAssert< true , T > {
  typedef T type ;
  static const bool value = true ;
};

template < class A , class B >
struct StaticAssertSame ;

template < class A >
struct StaticAssertSame<A,A> { typedef A type ; };

template < class A , class B >
struct StaticAssertAssignable ;

template < class A >
struct StaticAssertAssignable<A,A> { typedef A type ; };

template < class A >
struct StaticAssertAssignable< const A , A > { typedef const A type ; };
template class TaskQueue<Kokkos::Experimental::HPX,
                         Kokkos::Experimental::HPX::memory_space>;

} // namespace Impl
} // namespace Kokkos

#endif /* KOKKOS_STATICASSERT_HPP */

#else
void KOKKOS_CORE_SRC_IMPL_HPX_TASK_PREVENT_LINK_ERROR() {}
#endif // #if defined( KOKKOS_ENABLE_HPX ) && defined( KOKKOS_ENABLE_TASKDAG )
@ -0,0 +1,298 @@
|
|||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#ifndef KOKKOS_HPX_TASK_HPP
|
||||
#define KOKKOS_HPX_TASK_HPP
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined(KOKKOS_ENABLE_HPX) && defined(KOKKOS_ENABLE_TASKDAG)
|
||||
|
||||
#include <Kokkos_TaskScheduler_fwd.hpp>
|
||||
|
||||
#include <Kokkos_HPX.hpp>
|
||||
|
||||
#include <hpx/apply.hpp>
|
||||
#include <hpx/lcos/local/counting_semaphore.hpp>
|
||||
|
||||
#include <type_traits>
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
template <class QueueType>
|
||||
class TaskQueueSpecialization<
|
||||
SimpleTaskScheduler<Kokkos::Experimental::HPX, QueueType>> {
|
||||
public:
|
||||
using execution_space = Kokkos::Experimental::HPX;
|
||||
using scheduler_type =
|
||||
SimpleTaskScheduler<Kokkos::Experimental::HPX, QueueType>;
|
||||
using member_type =
|
||||
TaskTeamMemberAdapter<Kokkos::Impl::HPXTeamMember, scheduler_type>;
|
||||
using memory_space = Kokkos::HostSpace;
|
||||
|
||||
static void execute(scheduler_type const &scheduler) {
|
||||
// NOTE: We create an instance so that we can use dispatch_execute_task.
|
||||
// This is not necessarily the most efficient, but can be improved later.
|
||||
TaskQueueSpecialization<scheduler_type> task_queue;
|
||||
task_queue.scheduler = &scheduler;
|
||||
Kokkos::Impl::dispatch_execute_task(&task_queue);
|
||||
Kokkos::Experimental::HPX().fence();
|
||||
}
|
||||
|
||||
// Must provide task queue execution function
|
||||
void execute_task() const {
|
||||
using hpx::apply;
|
||||
using hpx::lcos::local::counting_semaphore;
|
||||
using task_base_type = typename scheduler_type::task_base_type;
|
||||
|
||||
const int num_worker_threads = Kokkos::Experimental::HPX::concurrency();
|
||||
|
||||
thread_buffer &buffer = Kokkos::Experimental::HPX::impl_get_buffer();
|
||||
buffer.resize(num_worker_threads, 512);
|
||||
|
||||
auto &queue = scheduler->queue();
|
||||
|
||||
counting_semaphore sem(0);
|
||||
|
||||
for (int thread = 0; thread < num_worker_threads; ++thread) {
|
||||
apply([this, &sem, &queue, &buffer, num_worker_threads, thread]() {
|
||||
// NOTE: This implementation has been simplified based on the
|
||||
// assumption that team_size = 1. The HPX backend currently only
|
||||
// supports a team size of 1.
|
||||
std::size_t t = Kokkos::Experimental::HPX::impl_hardware_thread_id();
|
||||
|
||||
buffer.get(Kokkos::Experimental::HPX::impl_hardware_thread_id());
|
||||
HPXTeamMember member(TeamPolicyInternal<Kokkos::Experimental::HPX>(
|
||||
Kokkos::Experimental::HPX(), num_worker_threads, 1),
|
||||
0, t, buffer.get(t), 512);
|
||||
|
||||
member_type single_exec(*scheduler, member);
|
||||
member_type &team_exec = single_exec;
|
||||
|
||||
auto &team_scheduler = team_exec.scheduler();
|
||||
auto current_task = OptionalRef<task_base_type>(nullptr);
|
||||
|
||||
while (!queue.is_done()) {
|
||||
current_task =
|
||||
queue.pop_ready_task(team_scheduler.team_scheduler_info());
|
||||
|
||||
if (current_task) {
|
||||
KOKKOS_ASSERT(current_task->is_single_runnable() ||
|
||||
current_task->is_team_runnable());
|
||||
current_task->as_runnable_task().run(single_exec);
|
||||
queue.complete((*std::move(current_task)).as_runnable_task(),
|
||||
team_scheduler.team_scheduler_info());
|
||||
}
|
||||
}
|
||||
|
||||
sem.signal(1);
|
||||
});
|
||||
}
|
||||
|
||||
sem.wait(num_worker_threads);
|
||||
}
|
||||
|
||||
static uint32_t get_max_team_count(execution_space const &espace) {
|
||||
return static_cast<uint32_t>(espace.concurrency());
|
||||
}
|
||||
|
||||
template <typename TaskType>
|
||||
static void get_function_pointer(typename TaskType::function_type &ptr,
|
||||
typename TaskType::destroy_type &dtor) {
|
||||
ptr = TaskType::apply;
|
||||
dtor = TaskType::destroy;
|
||||
}
|
||||
|
||||
private:
|
||||
const scheduler_type *scheduler;
|
||||
};

template <class Scheduler>
class TaskQueueSpecializationConstrained<
    Scheduler, typename std::enable_if<
                   std::is_same<typename Scheduler::execution_space,
                                Kokkos::Experimental::HPX>::value>::type> {
 public:
  using execution_space = Kokkos::Experimental::HPX;
  using scheduler_type = Scheduler;
  using member_type =
      TaskTeamMemberAdapter<Kokkos::Impl::HPXTeamMember, scheduler_type>;
  using memory_space = Kokkos::HostSpace;

  static void
  iff_single_thread_recursive_execute(scheduler_type const &scheduler) {
    using task_base_type = typename scheduler_type::task_base;
    using queue_type = typename scheduler_type::queue_type;

    if (1 == Kokkos::Experimental::HPX::concurrency()) {
      task_base_type *const end = (task_base_type *)task_base_type::EndTag;
      task_base_type *task = end;

      HPXTeamMember member(TeamPolicyInternal<Kokkos::Experimental::HPX>(
                               Kokkos::Experimental::HPX(), 1, 1),
                           0, 0, nullptr, 0);
      member_type single_exec(scheduler, member);

      do {
        task = end;

        // Loop by priority and then type
        for (int i = 0; i < queue_type::NumQueue && end == task; ++i) {
          for (int j = 0; j < 2 && end == task; ++j) {
            task =
                queue_type::pop_ready_task(&scheduler.m_queue->m_ready[i][j]);
          }
        }

        if (end == task)
          break;

        (*task->m_apply)(task, &single_exec);

        scheduler.m_queue->complete(task);

      } while (true);
    }
  }

  static void execute(scheduler_type const &scheduler) {
    // NOTE: We create an instance so that we can use dispatch_execute_task.
    // This is not necessarily the most efficient, but can be improved later.
    TaskQueueSpecializationConstrained<scheduler_type> task_queue;
    task_queue.scheduler = &scheduler;
    Kokkos::Impl::dispatch_execute_task(&task_queue);
    Kokkos::Experimental::HPX().fence();
  }

  // Must provide task queue execution function
  void execute_task() const {
    using hpx::apply;
    using hpx::lcos::local::counting_semaphore;
    using task_base_type = typename scheduler_type::task_base;
    using queue_type = typename scheduler_type::queue_type;

    const int num_worker_threads = Kokkos::Experimental::HPX::concurrency();
    static task_base_type *const end =
        (task_base_type *)task_base_type::EndTag;
    constexpr task_base_type *no_more_tasks_sentinel = nullptr;

    thread_buffer &buffer = Kokkos::Experimental::HPX::impl_get_buffer();
    buffer.resize(num_worker_threads, 512);

    auto &queue = scheduler->queue();
    queue.initialize_team_queues(num_worker_threads);

    counting_semaphore sem(0);

    for (int thread = 0; thread < num_worker_threads; ++thread) {
      apply([this, &sem, &buffer, num_worker_threads, thread]() {
        // NOTE: This implementation has been simplified based on the
        // assumption that team_size = 1. The HPX backend currently only
        // supports a team size of 1.
        std::size_t t = Kokkos::Experimental::HPX::impl_hardware_thread_id();

        buffer.get(Kokkos::Experimental::HPX::impl_hardware_thread_id());
        HPXTeamMember member(
            TeamPolicyInternal<Kokkos::Experimental::HPX>(
                Kokkos::Experimental::HPX(), num_worker_threads, 1),
            0, t, buffer.get(t), 512);

        member_type single_exec(*scheduler, member);
        member_type &team_exec = single_exec;

        auto &team_queue = team_exec.scheduler().queue();
        task_base_type *task = no_more_tasks_sentinel;

        do {
          // Retire the task that was run in the previous iteration first.
          if (task != no_more_tasks_sentinel && task != end) {
            team_queue.complete(task);
          }

          if (*((volatile int *)&team_queue.m_ready_count) > 0) {
            task = end;
            for (int i = 0; i < queue_type::NumQueue && end == task; ++i) {
              for (int j = 0; j < 2 && end == task; ++j) {
                task = queue_type::pop_ready_task(&team_queue.m_ready[i][j]);
              }
            }
          } else {
            // Nothing ready in this queue; try another team's queue.
            task = team_queue.attempt_to_steal_task();
          }

          if (task != no_more_tasks_sentinel && task != end) {
            (*task->m_apply)(task, &single_exec);
          }
        } while (task != no_more_tasks_sentinel);

        sem.signal(1);
      });
    }

    sem.wait(num_worker_threads);
  }

  template <typename TaskType>
  static void get_function_pointer(typename TaskType::function_type &ptr,
                                   typename TaskType::destroy_type &dtor) {
    ptr = TaskType::apply;
    dtor = TaskType::destroy;
  }

 private:
  const scheduler_type *scheduler;
};

extern template class TaskQueue<
    Kokkos::Experimental::HPX,
    typename Kokkos::Experimental::HPX::memory_space>;

} // namespace Impl
} // namespace Kokkos

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
#endif /* #ifndef KOKKOS_HPX_TASK_HPP */
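
For context, the two specializations above are the HPX-side engine behind the
public Kokkos tasking interface: user code builds a TaskScheduler, spawns
tasks, and waits, while execute()/execute_task() drain the queue on the HPX
worker threads. A minimal sketch of that driver side follows; the functor and
the pool size are hypothetical, and the scheduler constructor signature is
assumed from the Kokkos tasking interface of this era.

// Sketch (hypothetical user code, not part of this file):
struct HelloTask {
  using value_type = long;
  template <class TeamMember>
  KOKKOS_INLINE_FUNCTION void operator()(TeamMember & /*member*/,
                                         long &result) {
    result = 42; // trivial leaf task
  }
};

void run_one_task() {
  using sched_t = Kokkos::TaskScheduler<Kokkos::Experimental::HPX>;
  sched_t sched(sched_t::memory_space(), 1 << 20 /* pool bytes, assumed */);
  auto fut = Kokkos::host_spawn(Kokkos::TaskSingle(sched), HelloTask());
  Kokkos::wait(sched); // drives execute()/execute_task() shown above
}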

@ -0,0 +1,57 @@
#ifndef KOKKOS_HPX_VIEWETIAVAIL_HPP
#define KOKKOS_HPX_VIEWETIAVAIL_HPP

namespace Kokkos {
namespace Impl {

#define KOKKOS_IMPL_VIEWCOPY_ETI_AVAIL_EXECSPACE Kokkos::Experimental::HPX

#include <eti/common/Kokkos_ViewFillCopyETIAvail_Macros.hpp>

#undef KOKKOS_IMPL_VIEWCOPY_ETI_AVAIL_EXECSPACE
} // namespace Impl
} // namespace Kokkos
#endif

@ -0,0 +1,57 @@
#ifndef KOKKOS_HPX_VIEWETIDECL_HPP
#define KOKKOS_HPX_VIEWETIDECL_HPP

namespace Kokkos {
namespace Impl {

#define KOKKOS_IMPL_VIEWCOPY_ETI_AVAIL_EXECSPACE Kokkos::Experimental::HPX

#include <eti/common/Kokkos_ViewFillCopyETIDecl_Macros.hpp>

#undef KOKKOS_IMPL_VIEWCOPY_ETI_AVAIL_EXECSPACE
} // namespace Impl
} // namespace Kokkos
#endif

@ -0,0 +1,116 @@
#ifndef KOKKOS_HPX_WORKGRAPHPOLICY_HPP
#define KOKKOS_HPX_WORKGRAPHPOLICY_HPP

#include <hpx/apply.hpp>
#include <hpx/lcos/local/counting_semaphore.hpp>

namespace Kokkos {
namespace Impl {

template <class FunctorType, class... Traits>
class ParallelFor<FunctorType, Kokkos::WorkGraphPolicy<Traits...>,
                  Kokkos::Experimental::HPX> {
 private:
  using Policy = Kokkos::WorkGraphPolicy<Traits...>;
  using WorkTag = typename Policy::work_tag;

  Policy m_policy;
  FunctorType m_functor;

  template <class TagType>
  typename std::enable_if<std::is_same<TagType, void>::value>::type
  execute_functor(const std::int32_t w) const noexcept {
    m_functor(w);
  }

  template <class TagType>
  typename std::enable_if<!std::is_same<TagType, void>::value>::type
  execute_functor(const std::int32_t w) const noexcept {
    const TagType t{};
    m_functor(t, w);
  }

 public:
  void execute() const {
    dispatch_execute_task(this);
    Kokkos::Experimental::HPX().fence();
  }

  void execute_task() const {
    const int num_worker_threads = Kokkos::Experimental::HPX::concurrency();

    using hpx::apply;
    using hpx::lcos::local::counting_semaphore;

    counting_semaphore sem(0);

    for (int thread = 0; thread < num_worker_threads; ++thread) {
      apply([this, &sem]() {
        std::int32_t w = m_policy.pop_work();
        while (w != Policy::COMPLETED_TOKEN) {
          if (w != Policy::END_TOKEN) {
            execute_functor<WorkTag>(w);
            m_policy.completed_work(w);
          }

          w = m_policy.pop_work();
        }

        sem.signal(1);
      });
    }

    sem.wait(num_worker_threads);
  }

  inline ParallelFor(const FunctorType &arg_functor, const Policy &arg_policy)
      : m_policy(arg_policy), m_functor(arg_functor) {}
};

} // namespace Impl
} // namespace Kokkos

#endif /* #define KOKKOS_HPX_WORKGRAPHPOLICY_HPP */
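
The worker loop above follows a small token protocol: pop_work() returns
either a ready work-item index, END_TOKEN when nothing is currently ready, or
COMPLETED_TOKEN once the whole graph has executed, so each worker spins until
it sees COMPLETED_TOKEN. The following self-contained toy (plain C++ threads,
hypothetical ToyWorkPool; not the Kokkos implementation) shows the same loop
shape:

#include <atomic>
#include <cstdint>
#include <thread>
#include <vector>

struct ToyWorkPool {
  static constexpr std::int32_t END_TOKEN = -1;       // nothing ready, retry
  static constexpr std::int32_t COMPLETED_TOKEN = -2; // all work finished
  std::atomic<std::int32_t> next{0};
  std::int32_t count;
  explicit ToyWorkPool(std::int32_t n) : count(n) {}
  std::int32_t pop_work() {
    const std::int32_t w = next.fetch_add(1);
    return w < count ? w : COMPLETED_TOKEN;
  }
};

int main() {
  ToyWorkPool pool(100);
  std::atomic<long> sum{0};
  std::vector<std::thread> workers;
  for (int t = 0; t < 4; ++t) {
    workers.emplace_back([&pool, &sum]() {
      // Same shape as ParallelFor<WorkGraphPolicy>::execute_task() above.
      std::int32_t w = pool.pop_work();
      while (w != ToyWorkPool::COMPLETED_TOKEN) {
        if (w != ToyWorkPool::END_TOKEN) sum += w; // "execute_functor(w)"
        w = pool.pop_work();
      }
    });
  }
  for (auto &th : workers) th.join();
  return sum == 4950 ? 0 : 1; // 0 + 1 + ... + 99
}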

@ -125,6 +125,8 @@ struct MDRangePolicy
  using traits = Kokkos::Impl::PolicyTraits<Properties ...>;
  using range_policy = RangePolicy<Properties...>;

  typename traits::execution_space m_space;

  using impl_range_policy = RangePolicy< typename traits::execution_space
                                       , typename traits::schedule_type
                                       , typename traits::index_type

@ -132,6 +134,9 @@ struct MDRangePolicy

  typedef MDRangePolicy execution_policy; // needed for is_execution_space interrogation

  template<class ... OtherProperties>
  friend struct MDRangePolicy;

  static_assert( !std::is_same<typename traits::iteration_pattern,void>::value
               , "Kokkos Error: MD iteration pattern not defined" );

@ -192,13 +197,54 @@ struct MDRangePolicy
  static constexpr int Right = static_cast<int>( Iterate::Right );
  static constexpr int Left = static_cast<int>( Iterate::Left );

  KOKKOS_INLINE_FUNCTION const typename traits::execution_space & space() const { return m_space ; }

  template < typename LT , typename UT , typename TT = array_index_type >
  MDRangePolicy(std::initializer_list<LT> const& lower, std::initializer_list<UT> const& upper, std::initializer_list<TT> const& tile = {} )
    : m_space() {
    init(lower, upper, tile);
  }

  template < typename LT , typename UT , typename TT = array_index_type >
  MDRangePolicy(const typename traits::execution_space & work_space,
                std::initializer_list<LT> const& lower, std::initializer_list<UT> const& upper, std::initializer_list<TT> const& tile = {} )
    : m_space( work_space ) {
    init(lower, upper, tile);
  }

  MDRangePolicy( point_type const& lower, point_type const& upper, tile_type const& tile = tile_type{} )
    : m_lower(lower)
    : m_space()
    , m_lower(lower)
    , m_upper(upper)
    , m_tile(tile)
    , m_num_tiles(1)
    , m_prod_tile_dims(1)
    {
    , m_prod_tile_dims(1) {
    init();
  }

  MDRangePolicy( const typename traits::execution_space & work_space,
                 point_type const& lower, point_type const& upper, tile_type const& tile = tile_type{} )
    : m_space( work_space )
    , m_lower(lower)
    , m_upper(upper)
    , m_tile(tile)
    , m_num_tiles(1)
    , m_prod_tile_dims(1) {
    init();
  }

  template<class ... OtherProperties>
  MDRangePolicy( const MDRangePolicy<OtherProperties...> p ):
    m_space(p.m_space),
    m_lower(p.m_lower),
    m_upper(p.m_upper),
    m_tile(p.m_tile),
    m_tile_end(p.m_tile_end),
    m_num_tiles(p.m_num_tiles),
    m_prod_tile_dims(p.m_prod_tile_dims) {}

 private:

  void init() {
    // Host
    if ( true
#if defined(KOKKOS_ENABLE_CUDA)
@ -211,7 +257,7 @@ struct MDRangePolicy
    {
      index_type span;
      for (int i=0; i<rank; ++i) {
        span = upper[i] - lower[i];
        span = m_upper[i] - m_lower[i];
        if ( m_tile[i] <= 0 ) {
          if ( ((int)inner_direction == (int)Right && (i < rank-1))
            || ((int)inner_direction == (int)Left && (i > 0)) )
@ -311,11 +357,9 @@ struct MDRangePolicy
#endif
  }

  template < typename LT , typename UT , typename TT = array_index_type >
  MDRangePolicy( std::initializer_list<LT> const& lower, std::initializer_list<UT> const& upper, std::initializer_list<TT> const& tile = {} )
  void init( std::initializer_list<LT> const& lower, std::initializer_list<UT> const& upper, std::initializer_list<TT> const& tile = {} )
  {

    if(static_cast<int>(m_lower.size()) != rank || static_cast<int>(m_upper.size()) != rank)
      Kokkos::abort("MDRangePolicy: Constructor initializer lists have wrong size");

@ -589,5 +633,26 @@ void md_parallel_reduce( const std::string& str
} } // namespace Kokkos::Experimental
#endif

namespace Kokkos {
namespace Experimental {
namespace Impl {

template<unsigned long P, class ... Properties>
struct PolicyPropertyAdaptor<WorkItemProperty::ImplWorkItemProperty<P>,MDRangePolicy<Properties...>> {
  typedef MDRangePolicy<Properties...> policy_in_t;
  typedef MDRangePolicy<typename policy_in_t::traits::execution_space,
                        typename policy_in_t::traits::schedule_type,
                        typename policy_in_t::traits::work_tag,
                        typename policy_in_t::traits::index_type,
                        typename policy_in_t::traits::iteration_pattern,
                        typename policy_in_t::traits::launch_bounds,
                        WorkItemProperty::ImplWorkItemProperty<P>> policy_out_t;
};

} // namespace Impl
} // namespace Experimental
} // namespace Kokkos
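
This adaptor is the hook that lets Kokkos::Experimental::require rebuild an
MDRangePolicy with a work-item property attached. A short usage sketch (N and
M are placeholder extents; the property name is taken from the Kokkos
work-item hints):

void hint_example(int N, int M) {
  auto p = Kokkos::MDRangePolicy<Kokkos::Rank<2>>({0, 0}, {N, M});
  auto hinted = Kokkos::Experimental::require(
      p, Kokkos::Experimental::WorkItemProperty::HintLightWeight);
  Kokkos::parallel_for("hinted_fill", hinted,
                       KOKKOS_LAMBDA(const int /*i*/, const int /*j*/) {
                         // light-weight body
                       });
}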
#endif //KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP
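
Taken together, the new constructors accept bounds (and optionally tiles) as
initializer lists, with or without an explicit execution-space instance. A
brief usage sketch (N, M, and the tile sizes are placeholders):

void mdrange_example(int N, int M) {
  // Bounds and tile sizes deduced from initializer lists.
  Kokkos::MDRangePolicy<Kokkos::Rank<2>> policy({0, 0}, {N, M}, {16, 16});

  // Same bounds, but bound to an explicit execution space instance.
  Kokkos::MDRangePolicy<Kokkos::Rank<2>> policy2(
      Kokkos::DefaultExecutionSpace(), {0, 0}, {N, M});

  Kokkos::parallel_for("mdrange_fill", policy,
                       KOKKOS_LAMBDA(const int /*i*/, const int /*j*/) {
                         // body
                       });
}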
@ -90,6 +90,7 @@
#if ! defined( KOKKOS_ENABLE_GNU_ATOMICS ) && \
    ! defined( KOKKOS_ENABLE_INTEL_ATOMICS ) && \
    ! defined( KOKKOS_ENABLE_OPENMP_ATOMICS ) && \
    ! defined( KOKKOS_ENABLE_STD_ATOMICS ) && \
    ! defined( KOKKOS_ENABLE_SERIAL_ATOMICS )

// Compiling for non-Cuda atomic implementation has not been pre-selected.
@ -168,6 +169,12 @@ const char * atomic_query_version()

} // namespace Kokkos

//----------------------------------------------------------------------------
// Atomic Memory Orders
//
// Implements Strongly-typed analogs of C++ standard memory orders
#include "impl/Kokkos_Atomic_Memory_Order.hpp"

#if defined( KOKKOS_ENABLE_ROCM )
namespace Kokkos {
namespace Impl {
@ -287,6 +294,14 @@ void unlock_address_rocm_space(void* ptr);
#ifndef _WIN32
#include "impl/Kokkos_Atomic_Generic.hpp"
#endif

//----------------------------------------------------------------------------
// Provide atomic loads and stores with memory order semantics

#include "impl/Kokkos_Atomic_Load.hpp"
#include "impl/Kokkos_Atomic_Store.hpp"

//----------------------------------------------------------------------------
// This atomic-style macro should be an inlined function, not a macro
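
The memory orders included above are strongly-typed analogs of the
std::memory_order enumerators. As a reference point, the standard-library
counterpart of an acquire/release pairing looks like this (plain C++ for
illustration, not the Kokkos API):

#include <atomic>

std::atomic<int> payload{0};
std::atomic<bool> ready{false};

void producer() {
  payload.store(42, std::memory_order_relaxed);
  ready.store(true, std::memory_order_release); // publish payload
}

void consumer() {
  while (!ready.load(std::memory_order_acquire)) { /* spin */ }
  // The acquire load synchronizes with the release store, so this reads 42.
  int v = payload.load(std::memory_order_relaxed);
  (void)v;
}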
@ -631,8 +631,10 @@ RealType real (const complex<RealType>& x) {
template<class RealType>
KOKKOS_INLINE_FUNCTION
RealType abs (const complex<RealType>& x) {
  // FIXME (mfh 31 Oct 2014) Scale to avoid unwarranted overflow.
  return std::sqrt (real (x) * real (x) + imag (x) * imag (x));
#ifndef __CUDA_ARCH__
  using std::hypot;
#endif
  return hypot(x.real(),x.imag());
}

//! Power of a complex number
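
The switch from the naive sqrt form to hypot resolves the FIXME above:
real(x)*real(x) can overflow to inf even when |x| itself is representable,
whereas hypot computes the magnitude without intermediate overflow or
underflow. A standalone illustration in plain C++:

#include <cmath>
#include <cstdio>

int main() {
  const double re = 1e200, im = 1e200;
  const double naive = std::sqrt(re * re + im * im); // inf: re*re overflows
  const double safe = std::hypot(re, im);            // ~1.4142e200
  std::printf("naive = %g, hypot = %g\n", naive, safe);
  return 0;
}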