Merge branch 'master' into improve-include-consistency

# Conflicts:
#	src/USER-MESO/atom_vec_tdpd.cpp
Axel Kohlmeyer 2019-07-09 14:50:00 -04:00
commit 06dcc9e283
GPG Key ID: D9B44E93BF0C375A (no known key found for this signature in database)
610 changed files with 39096 additions and 8292 deletions

View File

@@ -57,8 +57,7 @@ Boolean expression is FALSE, then no commands are executed.
 The syntax for Boolean expressions is described below.
 Each command (t1, f1, e1, etc) can be any valid LAMMPS input script
-command, except an "include"_include.html command, which is not
-allowed. If the command is more than one word, it must be enclosed in
+command. If the command is more than one word, it must be enclosed in
 quotes, so it will be treated as a single argument, as in the examples
 above.

View File

@@ -147,7 +147,8 @@ asub = "A" parameter for MEAM (see e.g. "(Baskes)"_#Baskes) :pre
 The alpha, b0, b1, b2, b3, t0, t1, t2, t3 parameters correspond to the
 standard MEAM parameters in the literature "(Baskes)"_#Baskes (the b
-parameters are the standard beta parameters). The rozero parameter is
+parameters are the standard beta parameters). Note that only parameters
+normalized to t0 = 1.0 are supported. The rozero parameter is
 an element-dependent density scaling that weights the reference
 background density (see e.g. equation 4.5 in "(Gullet)"_#Gullet) and
 is typically 1.0 for single-element systems. The ibar parameter

View File

@@ -5092,4 +5092,17 @@ span[id*='MathJax-Span'] {
 src: local("Roboto Slab Bold"), local("RobotoSlab-Bold"), url(../fonts/RobotoSlab-Bold.ttf) format("truetype");
 }
+.codeblock, pre.literal-block, .rst-content .literal-block, .rst-content pre.literal-block, div[class^='highlight'] {
+font-size: 12px;
+line-height: 1.5;
+display: block;
+overflow: auto;
+color: #404040;
+padding: 12px 12px;
+}
+.codeblock,div[class^='highlight'] {
+padding: 0;
+}
 /*# sourceMappingURL=theme.css.map */

View File

@@ -174,6 +174,7 @@ __kernel void k_lj96_fast(const __global numtyp4 *restrict x_,
 numtyp r6inv = r2inv*r2inv*r2inv;
 numtyp r3inv = ucl_sqrt(r6inv);
 numtyp force = r2inv*r6inv*(lj1[mtype].x*r3inv-lj1[mtype].y);
+force*=factor_lj;
 f.x+=delx*force;
 f.y+=dely*force;
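The single added line applies the `factor_lj` special-bonds prefactor that this fast kernel previously dropped. A host-side C++ sketch of the same arithmetic, with plain doubles standing in for `numtyp` and for the `lj1[mtype]` coefficient fetch (an illustration of the fix, not the GPU source):

```cpp
#include <cmath>

// Scalar sketch of the lj/96 force evaluation above. lj1_x and lj1_y stand
// in for lj1[mtype].x and lj1[mtype].y; factor_lj is the special_bonds
// scaling that the added line applies before the force is accumulated.
double lj96_force(double rsq, double lj1_x, double lj1_y, double factor_lj) {
  double r2inv = 1.0 / rsq;
  double r6inv = r2inv * r2inv * r2inv;
  double r3inv = std::sqrt(r6inv);   // r^-3 obtained from r^-6
  double force = r2inv * r6inv * (lj1_x * r3inv - lj1_y);
  return force * factor_lj;          // the scaling added by this hunk
}
```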

View File

@@ -308,8 +308,6 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_,
 delr1.z = jx.z-ix.z;
 numtyp rsq1 = delr1.x*delr1.x+delr1.y*delr1.y+delr1.z*delr1.z;
-// if (rsq1 > cutsq[ijparam]) continue;
 // compute zeta_ij
 z = (acctyp)0;
@@ -355,13 +353,9 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_,
 rsq1, rsq2, delr1, delr2);
 }
-//int jj = (nbor_j-offset_j-2*nbor_pitch)/n_stride;
-//int idx = jj*n_stride + i*t_per_atom + offset_j;
-//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
+// idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
 int idx = nbor_j;
 if (dev_packed==dev_nbor) idx -= n_stride;
-// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-//          i, nbor_j, offset_j, idx);
 acc_zeta(z, tid, t_per_atom, offset_k);
 numtyp4 ts1_ijparam = ts1[ijparam]; //fetch4(ts1_ijparam,ijparam,ts1_tex);
@@ -585,14 +579,9 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_,
 numtyp r1inv = ucl_rsqrt(rsq1);
 // look up for zeta_ij
-//int jj = (nbor_j-offset_j-2*nbor_pitch) / n_stride;
-//int idx = jj*n_stride + i*t_per_atom + offset_j;
-//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
+// idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
 int idx = nbor_j;
 if (dev_packed==dev_nbor) idx -= n_stride;
-// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-//          i, nbor_j, offset_j, idx);
 acctyp4 zeta_ij = zetaij[idx]; // fetch(zeta_ij,idx,zeta_tex);
 numtyp force = zeta_ij.x*tpainv;
 numtyp prefactor = zeta_ij.y;
@@ -823,13 +812,9 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
 offset_kf = red_acc[2*m+1];
 }
-//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
-//int idx = iix*n_stride + j*t_per_atom + offset_kf;
-//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
+// idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
 int idx = ijnum;
 if (dev_packed==dev_nbor) idx -= n_stride;
-// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-//          j, ijnum, offset_kf, idx);
 acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
 numtyp force = zeta_ji.x*tpainv;
 numtyp prefactor_ji = zeta_ji.y;
@@ -891,13 +876,10 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
 f.y += fi[1];
 f.z += fi[2];
-//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
-//int idx = kk*n_stride + j*t_per_atom + offset_k;
-//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
+// idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
 int idx = nbor_k;
 if (dev_packed==dev_nbor) idx -= n_stride;
-// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-//          j, nbor_k, offset_k, idx);
 acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
 numtyp prefactor_jk = zeta_jk.y;
 int jkiparam=elem2param[jtype*nelements*nelements+ktype*nelements+itype];
@@ -1068,13 +1050,9 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
 offset_kf = red_acc[2*m+1];
 }
-//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
-//int idx = iix*n_stride + j*t_per_atom + offset_kf;
-//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
+// idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
 int idx = ijnum;
 if (dev_packed==dev_nbor) idx -= n_stride;
-// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-//          j, ijnum, offset_kf, idx);
 acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
 numtyp force = zeta_ji.x*tpainv;
 numtyp prefactor_ji = zeta_ji.y;
@@ -1143,13 +1121,9 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
 virial[4] += TWOTHIRD*(mdelr1[0]*fj[2] + delr2[0]*fk[2]);
 virial[5] += TWOTHIRD*(mdelr1[1]*fj[2] + delr2[1]*fk[2]);
-//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
-//int idx = kk*n_stride + j*t_per_atom + offset_k;
-//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
+// idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
 int idx = nbor_k;
 if (dev_packed==dev_nbor) idx -= n_stride;
-// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-//          j, nbor_k, offset_k, idx);
 acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
 numtyp prefactor_jk = zeta_jk.y;
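Every hunk in this file makes the same substitution: the commented-out index arithmetic and the disabled `zeta_idx(...)` call are deleted, and the surviving two lines compute the lookup index directly. A plain C++ sketch of that mapping, under the assumption stated in the kept comment (the `zetaij` entries lag the `dev_short_nbor` index by one stride when the packed and unpacked neighbor lists alias):

```cpp
// short_nbor_idx plays the role of nbor_j / ijnum / nbor_k above;
// packed_is_nbor mirrors the device-pointer test (dev_packed == dev_nbor).
int zetaij_index(int short_nbor_idx, int n_stride, bool packed_is_nbor) {
  int idx = short_nbor_idx;
  if (packed_is_nbor) idx -= n_stride;  // shift by one stride when lists alias
  return idx;
}
```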

View File

@@ -356,13 +356,9 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_,
 ijkparam_c5, rsq1, rsq2, delr1, delr2);
 }
-//int jj = (nbor_j-offset_j-2*nbor_pitch)/n_stride;
-//int idx = jj*n_stride + i*t_per_atom + offset_j;
-//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
+// idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
 int idx = nbor_j;
 if (dev_packed==dev_nbor) idx -= n_stride;
-// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-//          i, nbor_j, offset_j, idx);
 acc_zeta(z, tid, t_per_atom, offset_k);
 numtyp4 ts1_ijparam = ts1[ijparam]; //fetch4(ts1_ijparam,ijparam,ts1_tex);
@@ -587,14 +583,9 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
 numtyp r1inv = ucl_rsqrt(rsq1);
 // look up for zeta_ij
-//int jj = (nbor_j-offset_j-2*nbor_pitch) / n_stride;
-//int idx = jj*n_stride + i*t_per_atom + offset_j;
-//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
+// idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
 int idx = nbor_j;
 if (dev_packed==dev_nbor) idx -= n_stride;
-// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-//          i, nbor_j, offset_j, idx);
 acctyp4 zeta_ij = zetaij[idx]; // fetch(zeta_ij,idx,zeta_tex);
 numtyp force = zeta_ij.x*tpainv;
 numtyp prefactor = zeta_ij.y;
@@ -831,13 +822,9 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
 offset_kf = red_acc[2*m+1];
 }
-//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
-//int idx = iix*n_stride + j*t_per_atom + offset_kf;
-//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
+// idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
 int idx = ijnum;
 if (dev_packed==dev_nbor) idx -= n_stride;
-// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-//          j, ijnum, offset_kf, idx);
 acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
 numtyp force = zeta_ji.x*tpainv;
 numtyp prefactor_ji = zeta_ji.y;
@@ -902,13 +889,9 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
 f.y += fi[1];
 f.z += fi[2];
-//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
-//int idx = kk*n_stride + j*t_per_atom + offset_k;
-//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
+// idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
 int idx = nbor_k;
 if (dev_packed==dev_nbor) idx -= n_stride;
-// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-//          j, nbor_k, offset_k, idx);
 acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
 numtyp prefactor_jk = zeta_jk.y;
 int jkiparam=elem2param[jtype*nelements*nelements+ktype*nelements+itype];
@@ -1085,13 +1068,9 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
 offset_kf = red_acc[2*m+1];
 }
-//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
-//int idx = iix*n_stride + j*t_per_atom + offset_kf;
-//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
+// idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
 int idx = ijnum;
 if (dev_packed==dev_nbor) idx -= n_stride;
-// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-//          j, ijnum, offset_kf, idx);
 acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
 numtyp force = zeta_ji.x*tpainv;
 numtyp prefactor_ji = zeta_ji.y;
@@ -1163,13 +1142,9 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
 virial[4] += TWOTHIRD*(mdelr1[0]*fj[2] + delr2[0]*fk[2]);
 virial[5] += TWOTHIRD*(mdelr1[1]*fj[2] + delr2[1]*fk[2]);
-//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
-//int idx = kk*n_stride + j*t_per_atom + offset_k;
-//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
+// idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
 int idx = nbor_k;
 if (dev_packed==dev_nbor) idx -= n_stride;
-// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-//          j, nbor_k, offset_k, idx);
 acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
 numtyp prefactor_jk = zeta_jk.y;

View File

@@ -359,13 +359,9 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_,
 rsq1, rsq2, delr1, delr2);
 }
-//int jj = (nbor_j-offset_j-2*nbor_pitch)/n_stride;
-//int idx = jj*n_stride + i*t_per_atom + offset_j;
-//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
+// idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
 int idx = nbor_j;
 if (dev_packed==dev_nbor) idx -= n_stride;
-// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-//          i, nbor_j, offset_j, idx);
 acc_zeta(z, tid, t_per_atom, offset_k);
 numtyp4 ts1_ijparam = ts1[ijparam]; //fetch4(ts1_ijparam,ijparam,ts1_tex);
@@ -603,14 +599,9 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_,
 numtyp r1inv = ucl_rsqrt(rsq1);
 // look up for zeta_ij
-//int jj = (nbor_j-offset_j-2*nbor_pitch) / n_stride;
-//int idx = jj*n_stride + i*t_per_atom + offset_j;
-//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
+// idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
 int idx = nbor_j;
 if (dev_packed==dev_nbor) idx -= n_stride;
-// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-//          i, nbor_j, offset_j, idx);
 acctyp4 zeta_ij = zetaij[idx]; // fetch(zeta_ij,idx,zeta_tex);
 numtyp force = zeta_ij.x*tpainv;
 numtyp prefactor = zeta_ij.y;
@@ -841,13 +832,9 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
 offset_kf = red_acc[2*m+1];
 }
-//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
-//int idx = iix*n_stride + j*t_per_atom + offset_kf;
-//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
+// idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
 int idx = ijnum;
 if (dev_packed==dev_nbor) idx -= n_stride;
-// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-//          j, ijnum, offset_kf, idx);
 acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
 numtyp force = zeta_ji.x*tpainv;
 numtyp prefactor_ji = zeta_ji.y;
@@ -909,13 +896,9 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
 f.y += fi[1];
 f.z += fi[2];
-//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
-//int idx = kk*n_stride + j*t_per_atom + offset_k;
-//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
+// idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
 int idx = nbor_k;
 if (dev_packed==dev_nbor) idx -= n_stride;
-// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-//          j, nbor_k, offset_k, idx);
 acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
 numtyp prefactor_jk = zeta_jk.y;
 int jkiparam=elem2param[jtype*nelements*nelements+ktype*nelements+itype];
@@ -1086,13 +1069,9 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
 offset_kf = red_acc[2*m+1];
 }
-//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
-//int idx = iix*n_stride + j*t_per_atom + offset_kf;
-//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
+// idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
 int idx = ijnum;
 if (dev_packed==dev_nbor) idx -= n_stride;
-// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-//          j, ijnum, offset_kf, idx);
 acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
 numtyp force = zeta_ji.x*tpainv;
 numtyp prefactor_ji = zeta_ji.y;
@@ -1161,13 +1140,9 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
 virial[4] += TWOTHIRD*(mdelr1[0]*fj[2] + delr2[0]*fk[2]);
 virial[5] += TWOTHIRD*(mdelr1[1]*fj[2] + delr2[1]*fk[2]);
-//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
-//int idx = kk*n_stride + j*t_per_atom + offset_k;
-//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
+// idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
 int idx = nbor_k;
 if (dev_packed==dev_nbor) idx -= n_stride;
-// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-//          j, nbor_k, offset_k, idx);
 acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
 numtyp prefactor_jk = zeta_jk.y;

View File

@@ -89,10 +89,10 @@ __kernel void k_yukawa_colloid(const __global numtyp4 *restrict x_,
 if (rsq<coeff[mtype].z) {
 numtyp r = ucl_sqrt(rsq);
 numtyp rinv = ucl_recip(r);
-numtyp screening = ucl_exp(-kappa*(r-(radi+radj)));
-numtyp force = coeff[mtype].x * screening;
+numtyp screening = ucl_exp(-kappa*(r-(radi+radj)));
+numtyp force = coeff[mtype].x * screening;
-force = factor_lj*force * rinv;
+force = factor_lj*force * rinv;
 f.x+=delx*force;
 f.y+=dely*force;
@@ -181,10 +181,10 @@ __kernel void k_yukawa_colloid_fast(const __global numtyp4 *restrict x_,
 if (rsq<coeff[mtype].z) {
 numtyp r = ucl_sqrt(rsq);
 numtyp rinv = ucl_recip(r);
-numtyp screening = ucl_exp(-kappa*(r-(radi+radj)));
-numtyp force = coeff[mtype].x * screening;
+numtyp screening = ucl_exp(-kappa*(r-(radi+radj)));
+numtyp force = coeff[mtype].x * screening;
-force = factor_lj*force * rinv;
+force = factor_lj*force * rinv;
 f.x+=delx*force;
 f.y+=dely*force;
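For reference, the scalar both kernels store in `force` (and then multiply into the components of the displacement vector) corresponds to:

```latex
% A = coeff[mtype].x, a_i and a_j are the particle radii (radi, radj),
% kappa is the screening parameter, factor_lj the special_bonds prefactor.
\frac{F(r)}{r} = \mathrm{factor\_lj}\,\frac{A\,e^{-\kappa\left(r-(a_i+a_j)\right)}}{r}
```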

View File

@@ -129,16 +129,13 @@ __kernel void k_zbl(const __global numtyp4 *restrict x_,
 int mtype=itype*lj_types+jtype;
 if (rsq<cut_globalsq) {
 numtyp r, t, force;
 r = ucl_sqrt(rsq);
 force = dzbldr(r, coeff2[mtype].x, coeff2[mtype].y,
 coeff2[mtype].z, coeff2[mtype].w, coeff1[mtype].z);
-if (rsq>cut_innersq) {
-t = r - cut_inner;
-force = t*t * (coeff1[mtype].x + coeff1[mtype].y*t);
-}
+if (rsq>cut_innersq) {
+t = r - cut_inner;
+force = t*t * (coeff1[mtype].x + coeff1[mtype].y*t);
+}
 force *= (numtyp)-1.0*ucl_recip(r);
 f.x+=delx*force;
@@ -148,11 +145,10 @@ __kernel void k_zbl(const __global numtyp4 *restrict x_,
 if (eflag>0) {
 numtyp e=e_zbl(r, coeff2[mtype].x, coeff2[mtype].y,
 coeff2[mtype].z, coeff2[mtype].w, coeff1[mtype].z);
-e += coeff3[mtype].z;
-if (rsq > cut_innersq) {
-e += t*t*t * (coeff3[mtype].x + coeff3[mtype].y*t);
-}
+e += coeff3[mtype].z;
+if (rsq > cut_innersq) {
+e += t*t*t * (coeff3[mtype].x + coeff3[mtype].y*t);
+}
 energy+=e;
 }
 if (vflag>0) {
@@ -232,15 +228,13 @@ __kernel void k_zbl_fast(const __global numtyp4 *restrict x_,
 if (rsq<cut_globalsq) {
 numtyp r, t, force;
 r = ucl_sqrt(rsq);
 force = dzbldr(r, coeff2[mtype].x, coeff2[mtype].y,
 coeff2[mtype].z, coeff2[mtype].w, coeff1[mtype].z);
-if (rsq>cut_innersq) {
-t = r - cut_inner;
-force += t*t * (coeff1[mtype].x + coeff1[mtype].y*t);
-}
+if (rsq>cut_innersq) {
+t = r - cut_inner;
+force += t*t * (coeff1[mtype].x + coeff1[mtype].y*t);
+}
 force *= (numtyp)-1.0*ucl_recip(r);
@@ -251,11 +245,10 @@ __kernel void k_zbl_fast(const __global numtyp4 *restrict x_,
 if (eflag>0) {
 numtyp e=e_zbl(r, coeff2[mtype].x, coeff2[mtype].y,
 coeff2[mtype].z, coeff2[mtype].w, coeff1[mtype].z);
-e += coeff3[mtype].z;
-if (rsq > cut_innersq) {
-e += t*t*t * (coeff3[mtype].x + coeff3[mtype].y*t);
-}
+e += coeff3[mtype].z;
+if (rsq > cut_innersq) {
+e += t*t*t * (coeff3[mtype].x + coeff3[mtype].y*t);
+}
 energy+=e;
 }
 if (vflag>0) {
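Reading the `_fast` kernel off the diff, pairs between the inner and global cutoffs pick up cubic switching corrections; schematically, with the switching constants stored in `coeff1` and `coeff3`:

```latex
% t = r - r_inner; c_1, c_2 = coeff1[mtype].x, .y; c_3, c_4 = coeff3[mtype].x, .y.
% The shift coeff3[mtype].z is added to E for every pair inside the global
% cutoff; the t-dependent terms apply only when r > r_inner.
t = r - r_{\text{inner}}, \qquad
F \mathrel{+}= t^{2}\,(c_1 + c_2\,t), \qquad
E \mathrel{+}= t^{3}\,(c_3 + c_4\,t)
```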

View File

@@ -1,5 +1,39 @@
 # Change Log
+
+## [2.9.00](https://github.com/kokkos/kokkos/tree/2.9.00) (2019-06-24)
+[Full Changelog](https://github.com/kokkos/kokkos/compare/2.8.00...2.9.00)
+
+**Implemented enhancements:**
+
+- Capability: CUDA Streams [\#1723](https://github.com/kokkos/kokkos/issues/1723)
+- Capability: CUDA Stream support for parallel\_reduce [\#2061](https://github.com/kokkos/kokkos/issues/2061)
+- Capability: Feature Request: TeamVectorRange [\#713](https://github.com/kokkos/kokkos/issues/713)
+- Capability: Adding HPX backend [\#2080](https://github.com/kokkos/kokkos/issues/2080)
+- Capability: TaskScheduler to have multiple queues [\#565](https://github.com/kokkos/kokkos/issues/565)
+- Capability: Support for additional reductions in ScatterView [\#1674](https://github.com/kokkos/kokkos/issues/1674)
+- Capability: Request: deep\_copy within parallel regions [\#689](https://github.com/kokkos/kokkos/issues/689)
+- Capability: Feature Request: `create\_mirror\_view\_without\_initializing` [\#1765](https://github.com/kokkos/kokkos/issues/1765)
+- View: Use SFINAE to restrict possible View type conversions [\#2127](https://github.com/kokkos/kokkos/issues/2127)
+- Deprecation: Deprecate ExecutionSpace::fence\(\) as static function and make it non-static [\#2140](https://github.com/kokkos/kokkos/issues/2140)
+- Deprecation: Deprecate LayoutTileLeft [\#2122](https://github.com/kokkos/kokkos/issues/2122)
+- Macros: KOKKOS\_RESTRICT defined for non-Intel compilers [\#2038](https://github.com/kokkos/kokkos/issues/2038)
+
+**Fixed bugs:**
+
+- Cuda: TeamThreadRange loop count on device is passed by reference to host static constexpr [\#1733](https://github.com/kokkos/kokkos/issues/1733)
+- Cuda: Build error with relocatable device code with CUDA 10.1 GCC 7.3 [\#2134](https://github.com/kokkos/kokkos/issues/2134)
+- Cuda: cudaFuncSetCacheConfig is setting CachePreferShared too often [\#2066](https://github.com/kokkos/kokkos/issues/2066)
+- Cuda: TeamPolicy doesn't throw then created with non-viable vector length and also doesn't backscale to viable one [\#2020](https://github.com/kokkos/kokkos/issues/2020)
+- Cuda: cudaMemcpy error for large league sizes on V100 [\#1991](https://github.com/kokkos/kokkos/issues/1991)
+- Cuda: illegal warp sync in parallel\_reduce by functor on Turing 75 [\#1958](https://github.com/kokkos/kokkos/issues/1958)
+- TeamThreadRange: Inconsistent results from TeamThreadRange reduction [\#1905](https://github.com/kokkos/kokkos/issues/1905)
+- Atomics: atomic\_fetch\_oper & atomic\_oper\_fetch don't build for complex\<float\> [\#1964](https://github.com/kokkos/kokkos/issues/1964)
+- Views: Kokkos randomread Views leak memory [\#2155](https://github.com/kokkos/kokkos/issues/2155)
+- ScatterView: LayoutLeft overload currently non-functional [\#2165](https://github.com/kokkos/kokkos/issues/2165)
+- KNL: With intel 17.2.174 illegal instruction in random number test [\#2078](https://github.com/kokkos/kokkos/issues/2078)
+- Bitset: Enable copy constructor on device [\#2094](https://github.com/kokkos/kokkos/issues/2094)
+- Examples: do not compile due to template deduction error \(multi\_fem\) [\#1928](https://github.com/kokkos/kokkos/issues/1928)
+
 ## [2.8.00](https://github.com/kokkos/kokkos/tree/2.8.00) (2019-02-05)
 [Full Changelog](https://github.com/kokkos/kokkos/compare/2.7.24...2.8.00)

View File

@@ -23,7 +23,7 @@ KOKKOS_DEBUG ?= "no"
 KOKKOS_USE_TPLS ?= ""
 # Options: c++11,c++14,c++1y,c++17,c++1z,c++2a
 KOKKOS_CXX_STANDARD ?= "c++11"
-# Options: aggressive_vectorization,disable_profiling,disable_deprecated_code,enable_large_mem_tests
+# Options: aggressive_vectorization,disable_profiling,enable_deprecated_code,disable_deprecated_code,enable_large_mem_tests
 KOKKOS_OPTIONS ?= ""
 # Option for setting ETI path
 KOKKOS_ETI_PATH ?= ${KOKKOS_PATH}/core/src/eti
@@ -33,11 +33,19 @@ KOKKOS_CMAKE ?= "no"
 # Options: force_uvm,use_ldg,rdc,enable_lambda
 KOKKOS_CUDA_OPTIONS ?= "enable_lambda"
 # Default settings specific options.
+# Options: enable_async_dispatch
+KOKKOS_HPX_OPTIONS ?= ""
 # Return a 1 if a string contains a substring and 0 if not
 # Note the search string should be without '"'
 # Example: $(call kokkos_has_string,"hwloc,librt",hwloc)
 # Will return a 1
 kokkos_has_string=$(if $(findstring $2,$1),1,0)
+# Returns 1 if the path exists, 0 otherwise
+# Example: $(call kokkos_path_exists,/path/to/file)
+# Will return a 1 if /path/to/file exists
+kokkos_path_exists=$(if $(wildcard $1),1,0)
 # Check for general settings.
 KOKKOS_INTERNAL_ENABLE_DEBUG := $(call kokkos_has_string,$(KOKKOS_DEBUG),yes)
@@ -58,6 +66,7 @@ KOKKOS_INTERNAL_ENABLE_COMPILER_WARNINGS := $(call kokkos_has_string,$(KOKKOS_OP
 KOKKOS_INTERNAL_OPT_RANGE_AGGRESSIVE_VECTORIZATION := $(call kokkos_has_string,$(KOKKOS_OPTIONS),aggressive_vectorization)
 KOKKOS_INTERNAL_DISABLE_PROFILING := $(call kokkos_has_string,$(KOKKOS_OPTIONS),disable_profiling)
 KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE := $(call kokkos_has_string,$(KOKKOS_OPTIONS),disable_deprecated_code)
+KOKKOS_INTERNAL_ENABLE_DEPRECATED_CODE := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_deprecated_code)
 KOKKOS_INTERNAL_DISABLE_DUALVIEW_MODIFY_CHECK := $(call kokkos_has_string,$(KOKKOS_OPTIONS),disable_dualview_modify_check)
 KOKKOS_INTERNAL_ENABLE_PROFILING_LOAD_PRINT := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_profile_load_print)
 KOKKOS_INTERNAL_ENABLE_LARGE_MEM_TESTS := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_large_mem_tests)
@@ -65,6 +74,7 @@ KOKKOS_INTERNAL_CUDA_USE_LDG := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),
 KOKKOS_INTERNAL_CUDA_USE_UVM := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),force_uvm)
 KOKKOS_INTERNAL_CUDA_USE_RELOC := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),rdc)
 KOKKOS_INTERNAL_CUDA_USE_LAMBDA := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),enable_lambda)
+KOKKOS_INTERNAL_HPX_ENABLE_ASYNC_DISPATCH := $(call kokkos_has_string,$(KOKKOS_HPX_OPTIONS),enable_async_dispatch)
 KOKKOS_INTERNAL_ENABLE_ETI := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_eti)
@@ -72,12 +82,15 @@ KOKKOS_INTERNAL_ENABLE_ETI := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_
 KOKKOS_INTERNAL_USE_OPENMP := $(call kokkos_has_string,$(subst OpenMPTarget,,$(KOKKOS_DEVICES)),OpenMP)
 KOKKOS_INTERNAL_USE_PTHREADS := $(call kokkos_has_string,$(KOKKOS_DEVICES),Pthread)
 KOKKOS_INTERNAL_USE_QTHREADS := $(call kokkos_has_string,$(KOKKOS_DEVICES),Qthreads)
+KOKKOS_INTERNAL_USE_HPX := $(call kokkos_has_string,$(KOKKOS_DEVICES),HPX)
 KOKKOS_INTERNAL_USE_SERIAL := $(call kokkos_has_string,$(KOKKOS_DEVICES),Serial)
 ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 0)
 ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 0)
 ifeq ($(KOKKOS_INTERNAL_USE_QTHREADS), 0)
-KOKKOS_INTERNAL_USE_SERIAL := 1
+ifeq ($(KOKKOS_INTERNAL_USE_HPX), 0)
+KOKKOS_INTERNAL_USE_SERIAL := 1
+endif
 endif
 endif
 endif
@@ -112,7 +125,7 @@ KOKKOS_INTERNAL_COMPILER_XL := $(strip $(shell $(CXX) -qversion 2
 KOKKOS_INTERNAL_COMPILER_CRAY := $(strip $(shell $(CXX) -craype-verbose 2>&1 | grep "CC-" | wc -l))
 KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell export OMPI_CXX=$(OMPI_CXX); export MPICH_CXX=$(MPICH_CXX); $(CXX) --version 2>&1 | grep nvcc | wc -l))
 KOKKOS_INTERNAL_COMPILER_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),clang)
-KOKKOS_INTERNAL_COMPILER_APPLE_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),apple-darwin)
+KOKKOS_INTERNAL_COMPILER_APPLE_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),Apple LLVM)
 KOKKOS_INTERNAL_COMPILER_HCC := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),HCC)
 # Check Host Compiler if using NVCC through nvcc_wrapper
@@ -283,9 +296,9 @@ KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLE
 + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER37) \
 + $(KOKKOS_INTERNAL_USE_ARCH_PASCAL61) \
 + $(KOKKOS_INTERNAL_USE_ARCH_PASCAL60) \
-+ $(KOKKOS_INTERNAL_USE_ARCH_VOLTA70) \
-+ $(KOKKOS_INTERNAL_USE_ARCH_VOLTA72) \
-+ $(KOKKOS_INTERNAL_USE_ARCH_TURING75) \
++ $(KOKKOS_INTERNAL_USE_ARCH_VOLTA70) \
++ $(KOKKOS_INTERNAL_USE_ARCH_VOLTA72) \
++ $(KOKKOS_INTERNAL_USE_ARCH_TURING75) \
 + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50) \
 + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52) \
 + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53))
@@ -300,19 +313,19 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
 + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER37) \
 + $(KOKKOS_INTERNAL_USE_ARCH_PASCAL61) \
 + $(KOKKOS_INTERNAL_USE_ARCH_PASCAL60) \
-+ $(KOKKOS_INTERNAL_USE_ARCH_VOLTA70) \
-+ $(KOKKOS_INTERNAL_USE_ARCH_VOLTA72) \
-+ $(KOKKOS_INTERNAL_USE_ARCH_TURING75) \
++ $(KOKKOS_INTERNAL_USE_ARCH_VOLTA70) \
++ $(KOKKOS_INTERNAL_USE_ARCH_VOLTA72) \
++ $(KOKKOS_INTERNAL_USE_ARCH_TURING75) \
 + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50) \
 + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52) \
 + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53))
 endif
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 1)
-ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
-ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
-KOKKOS_INTERNAL_NVCC_PATH := $(shell which nvcc)
-CUDA_PATH ?= $(KOKKOS_INTERNAL_NVCC_PATH:/bin/nvcc=)
+ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
+KOKKOS_INTERNAL_NVCC_PATH := $(shell which nvcc)
+CUDA_PATH ?= $(KOKKOS_INTERNAL_NVCC_PATH:/bin/nvcc=)
+ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
 KOKKOS_INTERNAL_OPENMPTARGET_FLAG := $(KOKKOS_INTERNAL_OPENMPTARGET_FLAG) --cuda-path=$(CUDA_PATH)
 endif
 endif
@@ -441,6 +454,10 @@ ifeq ($(KOKKOS_INTERNAL_USE_QTHREADS), 1)
 tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_QTHREADS")
 endif
+ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1)
+tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_HPX")
+endif
 ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
 tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_SERIAL")
 endif
@@ -559,9 +576,15 @@ ifeq ($(KOKKOS_INTERNAL_DISABLE_PROFILING), 0)
 tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_PROFILING")
 endif
-ifeq ($(KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE), 0)
-tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_DEPRECATED_CODE")
+ifeq ($(KOKKOS_INTERNAL_USE_HPX), 0)
+ifeq ($(KOKKOS_INTERNAL_ENABLE_DEPRECATED_CODE), 1)
+tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_DEPRECATED_CODE")
+endif
+ifeq ($(KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE), 0)
+tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_DEPRECATED_CODE")
+endif
 endif
 ifeq ($(KOKKOS_INTERNAL_ENABLE_ETI), 1)
 tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_ETI")
 endif
@@ -593,8 +616,13 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
 ifeq ($(KOKKOS_INTERNAL_CUDA_USE_RELOC), 1)
 tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE")
-KOKKOS_CXXFLAGS += --relocatable-device-code=true
-KOKKOS_LDFLAGS += --relocatable-device-code=true
+ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
+KOKKOS_CXXFLAGS += -fcuda-rdc
+KOKKOS_LDFLAGS += -fcuda-rdc
+else
+KOKKOS_CXXFLAGS += --relocatable-device-code=true
+KOKKOS_LDFLAGS += --relocatable-device-code=true
+endif
 endif
 ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
@@ -625,6 +653,12 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
 endif
 endif
+ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1)
+ifeq ($(KOKKOS_INTERNAL_HPX_ENABLE_ASYNC_DISPATCH), 1)
+tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_HPX_ASYNC_DISPATCH")
+endif
+endif
 # Add Architecture flags.
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV80), 1)
@@ -908,7 +942,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
 KOKKOS_INTERNAL_CUDA_ARCH_FLAG=--cuda-gpu-arch
 KOKKOS_CXXFLAGS += -x cuda
 else
-$(error Makefile.kokkos: CUDA is enabled but the compiler is neither NVCC nor Clang)
+$(error Makefile.kokkos: CUDA is enabled but the compiler is neither NVCC nor Clang (got version string $(KOKKOS_CXX_VERSION)) )
 endif
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER30), 1)
@@ -1058,10 +1092,18 @@ endif
 ifneq ($(KOKKOS_CMAKE), yes)
 KOKKOS_CXXFLAGS += -I$(CUDA_PATH)/include
 endif
-KOKKOS_LDFLAGS += -L$(CUDA_PATH)/lib64
-KOKKOS_CXXLDFLAGS += -L$(CUDA_PATH)/lib64
+ifeq ($(call kokkos_path_exists,$(CUDA_PATH)/lib64), 1)
+KOKKOS_LDFLAGS += -L$(CUDA_PATH)/lib64
+KOKKOS_CXXLDFLAGS += -L$(CUDA_PATH)/lib64
+KOKKOS_TPL_LIBRARY_DIRS += $(CUDA_PATH)/lib64
+else ifeq ($(call kokkos_path_exists,$(CUDA_PATH)/lib), 1)
+KOKKOS_LDFLAGS += -L$(CUDA_PATH)/lib
+KOKKOS_CXXLDFLAGS += -L$(CUDA_PATH)/lib
+KOKKOS_TPL_LIBRARY_DIRS += $(CUDA_PATH)/lib
+else
+$(error Can't find CUDA library directory: no lib64 or lib directory in $(CUDA_PATH))
+endif
 KOKKOS_TPL_INCLUDE_DIRS += $(CUDA_PATH)/include
-KOKKOS_TPL_LIBRARY_DIRS += $(CUDA_PATH)/lib64
 ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
 KOKKOS_CXXFLAGS += --cuda-path=$(CUDA_PATH)
 endif
@@ -1124,6 +1166,33 @@ ifeq ($(KOKKOS_INTERNAL_USE_QTHREADS), 1)
 KOKKOS_TPL_LIBRARY_NAMES += qthread
 endif
+ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1)
+KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/HPX/*.cpp)
+KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/HPX/*.hpp)
+ifneq ($(HPX_PATH),)
+ifeq ($(KOKKOS_INTERNAL_ENABLE_DEBUG), 1)
+KOKKOS_CXXFLAGS += $(shell PKG_CONFIG_PATH=$(HPX_PATH)/lib64/pkgconfig pkg-config --cflags hpx_application_debug)
+KOKKOS_CXXLDFLAGS += $(shell PKG_CONFIG_PATH=$(HPX_PATH)/lib64/pkgconfig pkg-config --libs hpx_application_debug)
+KOKKOS_LDFLAGS += $(shell PKG_CONFIG_PATH=$(HPX_PATH)/lib64/pkgconfig pkg-config --libs hpx_application_debug)
+else
+KOKKOS_CXXFLAGS += $(shell PKG_CONFIG_PATH=$(HPX_PATH)/lib64/pkgconfig pkg-config --cflags hpx_application)
+KOKKOS_CXXLDFLAGS += $(shell PKG_CONFIG_PATH=$(HPX_PATH)/lib64/pkgconfig pkg-config --libs hpx_application)
+KOKKOS_LDFLAGS += $(shell PKG_CONFIG_PATH=$(HPX_PATH)/lib64/pkgconfig pkg-config --libs hpx_application)
+endif
+else
+ifeq ($(KOKKOS_INTERNAL_ENABLE_DEBUG), 1)
+KOKKOS_CXXFLAGS += $(shell pkg-config --cflags hpx_application_debug)
+KOKKOS_CXXLDFLAGS += $(shell pkg-config --libs hpx_application_debug)
+KOKKOS_LDFLAGS += $(shell pkg-config --libs hpx_application_debug)
+else
+KOKKOS_CXXFLAGS += $(shell pkg-config --cflags hpx_application)
+KOKKOS_CXXLDFLAGS += $(shell pkg-config --libs hpx_application)
+KOKKOS_LDFLAGS += $(shell pkg-config --libs hpx_application)
+endif
+endif
+KOKKOS_TPL_LIBRARY_NAMES += hpx
+endif
 # Explicitly set the GCC Toolchain for Clang.
 ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
 KOKKOS_INTERNAL_GCC_PATH = $(shell which g++)

View File

@@ -30,6 +30,8 @@ Kokkos_SharedAlloc.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_
 $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_SharedAlloc.cpp
 Kokkos_MemoryPool.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_MemoryPool.cpp
 $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_MemoryPool.cpp
+Kokkos_HostSpace_deepcopy.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostSpace_deepcopy.cpp
+$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostSpace_deepcopy.cpp
 ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
 ifeq ($(KOKKOS_INTERNAL_ENABLE_ETI), 1)
@@ -38,8 +40,8 @@ endif
 endif
 ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
-Kokkos_Cuda_Impl.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Impl.cpp
-$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Impl.cpp
+Kokkos_Cuda_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Instance.cpp
+$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Instance.cpp
 Kokkos_CudaSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_CudaSpace.cpp
 $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_CudaSpace.cpp
 Kokkos_Cuda_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp
@@ -92,6 +94,13 @@ ifeq ($(KOKKOS_INTERNAL_ENABLE_ETI), 1)
 endif
 endif
+ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1)
+Kokkos_HPX.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HPX/Kokkos_HPX.cpp
+$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HPX/Kokkos_HPX.cpp
+Kokkos_HPX_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HPX/Kokkos_HPX_Task.cpp
+$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HPX/Kokkos_HPX_Task.cpp
+endif
 ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
 Kokkos_OpenMPTarget_Exec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp
 $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp

View File

@@ -1,5 +1,5 @@
 TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
 LIB_REQUIRED_PACKAGES KokkosCore KokkosContainers
-LIB_OPTIONAL_TPLS Pthread CUDA HWLOC
+LIB_OPTIONAL_TPLS Pthread CUDA HWLOC HPX
 TEST_OPTIONAL_TPLS CUSPARSE
 )

View File

@@ -328,6 +328,8 @@ public:
 parallel_for("Kokkos::Sort::Copy", Kokkos::RangePolicy<execution_space>(0,len),functor);
 }
+Kokkos::fence();
 }
 template<class ValuesViewType>
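The added `Kokkos::fence()` makes the sort blocking on asynchronous backends before the sorted data is consumed. A minimal usage sketch (hypothetical view name, default-configured Kokkos assumed):

```cpp
#include <Kokkos_Core.hpp>
#include <Kokkos_Sort.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    Kokkos::View<int*> keys("keys", 1000);
    // ... fill keys on the device ...
    Kokkos::sort(keys);  // with the added fence, all internal kernels and
                         // copies have completed once this call returns
  }
  Kokkos::finalize();
}
```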

View File

@@ -42,6 +42,12 @@ IF(Kokkos_ENABLE_OpenMP)
 )
 ENDIF()
+IF(Kokkos_ENABLE_HPX)
+LIST( APPEND SOURCES
+TestHPX.cpp
+)
+ENDIF()
 IF(Kokkos_ENABLE_Serial)
 LIST( APPEND SOURCES
 TestSerial.cpp

View File

@@ -49,6 +49,12 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
 TEST_TARGETS += test-openmp
 endif
+ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1)
+OBJ_HPX = TestHPX.o UnitTestMain.o gtest-all.o
+TARGETS += KokkosAlgorithms_UnitTest_HPX
+TEST_TARGETS += test-hpx
+endif
 ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
 OBJ_SERIAL = TestSerial.o UnitTestMain.o gtest-all.o
 TARGETS += KokkosAlgorithms_UnitTest_Serial
@@ -67,6 +73,9 @@ KokkosAlgorithms_UnitTest_Threads: $(OBJ_THREADS) $(KOKKOS_LINK_DEPENDS)
 KokkosAlgorithms_UnitTest_OpenMP: $(OBJ_OPENMP) $(KOKKOS_LINK_DEPENDS)
 $(LINK) $(EXTRA_PATH) $(OBJ_OPENMP) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_OpenMP
+KokkosAlgorithms_UnitTest_HPX: $(OBJ_HPX) $(KOKKOS_LINK_DEPENDS)
+$(LINK) $(EXTRA_PATH) $(OBJ_HPX) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_HPX
 KokkosAlgorithms_UnitTest_Serial: $(OBJ_SERIAL) $(KOKKOS_LINK_DEPENDS)
 $(LINK) $(EXTRA_PATH) $(OBJ_SERIAL) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_Serial
@@ -82,6 +91,9 @@ test-threads: KokkosAlgorithms_UnitTest_Threads
 test-openmp: KokkosAlgorithms_UnitTest_OpenMP
 ./KokkosAlgorithms_UnitTest_OpenMP
+test-hpx: KokkosAlgorithms_UnitTest_HPX
+./KokkosAlgorithms_UnitTest_HPX
 test-serial: KokkosAlgorithms_UnitTest_Serial
 ./KokkosAlgorithms_UnitTest_Serial

View File

@@ -0,0 +1,96 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Macros.hpp>
#ifdef KOKKOS_ENABLE_HPX
#include <gtest/gtest.h>
#include <Kokkos_Core.hpp>
//----------------------------------------------------------------------------
#include <TestRandom.hpp>
#include <TestSort.hpp>
#include <iomanip>
namespace Test {
class hpx : public ::testing::Test {
protected:
static void SetUpTestCase()
{
std::cout << std::setprecision(5) << std::scientific;
}
static void TearDownTestCase()
{
}
};
#define HPX_RANDOM_XORSHIFT64( num_draws ) \
TEST_F( hpx, Random_XorShift64 ) { \
Impl::test_random<Kokkos::Random_XorShift64_Pool<Kokkos::Experimental::HPX> >(num_draws); \
}
#define HPX_RANDOM_XORSHIFT1024( num_draws ) \
TEST_F( hpx, Random_XorShift1024 ) { \
Impl::test_random<Kokkos::Random_XorShift1024_Pool<Kokkos::Experimental::HPX> >(num_draws); \
}
#define HPX_SORT_UNSIGNED( size ) \
TEST_F( hpx, SortUnsigned ) { \
Impl::test_sort< Kokkos::Experimental::HPX, unsigned >(size); \
}
HPX_RANDOM_XORSHIFT64( 10240000 )
HPX_RANDOM_XORSHIFT1024( 10130144 )
HPX_SORT_UNSIGNED(171)
#undef HPX_RANDOM_XORSHIFT64
#undef HPX_RANDOM_XORSHIFT1024
#undef HPX_SORT_UNSIGNED
} // namespace test
#else
void KOKKOS_ALGORITHMS_UNITTESTS_TESTHPX_PREVENT_LINK_ERROR() {}
#endif

View File

@@ -225,9 +225,9 @@ void test_dynamic_view_sort(unsigned int n )
 Kokkos::Random_XorShift64_Pool<ExecutionSpace> g(1931);
 Kokkos::fill_random(keys_view,g,Kokkos::Random_XorShift64_Pool<ExecutionSpace>::generator_type::MAX_URAND);
-ExecutionSpace::fence();
+ExecutionSpace().fence();
 Kokkos::deep_copy(keys,keys_view);
-//ExecutionSpace::fence();
+//ExecutionSpace().fence();
 double sum_before = 0.0;
 double sum_after = 0.0;
@@ -237,9 +237,9 @@ void test_dynamic_view_sort(unsigned int n )
 Kokkos::sort(keys, 0 /* begin */ , n /* end */ );
-ExecutionSpace::fence(); // Need this fence to prevent BusError with Cuda
+ExecutionSpace().fence(); // Need this fence to prevent BusError with Cuda
 Kokkos::deep_copy( keys_view , keys );
-//ExecutionSpace::fence();
+//ExecutionSpace().fence();
 Kokkos::parallel_reduce(n,sum<ExecutionSpace, KeyType>(keys_view),sum_after);
 Kokkos::parallel_reduce(n-1,is_sorted_struct<ExecutionSpace, KeyType>(keys_view),sort_fails);
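This is the mechanical change behind the 2.9.00 changelog entry above, "Deprecate ExecutionSpace::fence() as static function and make it non-static": the tests now construct an execution-space instance and fence it. The pattern in miniature (a sketch, not taken from the Kokkos sources):

```cpp
#include <Kokkos_Core.hpp>

template <class ExecutionSpace>
void wait_for_space() {
  // Old, now-deprecated form:  ExecutionSpace::fence();
  // New form: fence a (default-constructed) instance, as in the tests above.
  ExecutionSpace().fence();
}
```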

View File

@@ -76,8 +76,20 @@ IF(KOKKOS_SEPARATE_LIBS)
 )
 foreach(lib IN LISTS KOKKOS_TPL_LIBRARY_NAMES)
-if ("${lib}" STREQUAL "cuda")
+if (("${lib}" STREQUAL "cuda") AND (NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang"))
 set(LIB_cuda "-lcuda")
+elseif ("${lib}" STREQUAL "hpx")
+find_package(HPX REQUIRED)
+if(${HPX_FOUND})
+target_link_libraries(kokkoscore PUBLIC ${HPX_LIBRARIES})
+target_link_libraries(kokkoscontainers PUBLIC ${HPX_LIBRARIES})
+target_link_libraries(kokkosalgorithms PUBLIC ${HPX_LIBRARIES})
+target_include_directories(kokkoscore PUBLIC ${HPX_INCLUDE_DIRS})
+target_include_directories(kokkoscontainers PUBLIC ${HPX_INCLUDE_DIRS})
+target_include_directories(kokkosalgorithms PUBLIC ${HPX_INCLUDE_DIRS})
+else()
+message(ERROR "HPX not found. Check the value of HPX_DIR (= ${HPX_DIR}) or CMAKE_PREFIX_PATH (= ${CMAKE_PREFIX_PATH}).")
+endif()
 else()
 find_library(LIB_${lib} ${lib} PATHS ${KOKKOS_TPL_LIBRARY_DIRS})
 endif()
@@ -158,8 +170,16 @@ ELSE()
 )
 foreach(lib IN LISTS KOKKOS_TPL_LIBRARY_NAMES)
-if ("${lib}" STREQUAL "cuda")
+if (("${lib}" STREQUAL "cuda") AND (NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang"))
 set(LIB_cuda "-lcuda")
+elseif ("${lib}" STREQUAL "hpx")
+find_package(HPX REQUIRED)
+if(${HPX_FOUND})
+target_link_libraries(kokkos PUBLIC ${HPX_LIBRARIES})
+target_include_directories(kokkos PUBLIC ${HPX_INCLUDE_DIRS})
+else()
+message(ERROR "HPX not found. Check the value of HPX_DIR (= ${HPX_DIR}) or CMAKE_PREFIX_PATH (= ${CMAKE_PREFIX_PATH}).")
+endif()
 else()
 find_library(LIB_${lib} ${lib} PATHS ${KOKKOS_TPL_LIBRARY_DIRS})
 endif()

View File

@@ -95,7 +95,7 @@ function(set_kokkos_cxx_compiler)
 message(FATAL_ERROR "Compiling CUDA code directly with Clang requires version 4.0.0 or higher.")
 endif()
 elseif(NOT INTERNAL_CXX_COMPILER_ID STREQUAL NVIDIA)
-message(FATAL_ERROR "Invalid compiler for CUDA. The compiler must be nvcc_wrapper or Clang.")
+message(FATAL_ERROR "Invalid compiler for CUDA. The compiler must be nvcc_wrapper or Clang, but compiler ID was ${INTERNAL_CXX_COMPILER_ID}")
 endif()
 endif()
View File

@@ -14,6 +14,7 @@ list(APPEND KOKKOS_INTERNAL_ENABLE_OPTIONS_LIST
 OpenMP
 Pthread
 Qthread
+HPX
 Cuda
 ROCm
 HWLOC
@@ -23,6 +24,7 @@ list(APPEND KOKKOS_INTERNAL_ENABLE_OPTIONS_LIST
 Cuda_Relocatable_Device_Code
 Cuda_UVM
 Cuda_LDG_Intrinsic
+HPX_ASYNC_DISPATCH
 Debug
 Debug_DualView_Modify_Check
 Debug_Bounds_Check
@@ -116,6 +118,7 @@ list(APPEND KOKKOS_DEVICES_LIST
 OpenMP # OpenMP
 Pthread # pthread
 Qthreads # qthreads
+HPX # HPX
 Serial # serial
 ROCm # Relocatable device code
 )
@@ -173,6 +176,19 @@ set(KOKKOS_INTERNAL_RELOCATABLE_DEVICE_CODE rdc)
 set(KOKKOS_INTERNAL_LAMBDA enable_lambda)
+#-------------------------------------------------------------------------------
+# List of possible Options for HPX
+#-------------------------------------------------------------------------------
+# From Makefile.kokkos: Options: enable_async_dispatch
+set(KOKKOS_HPX_OPTIONS_LIST)
+list(APPEND KOKKOS_HPX_OPTIONS_LIST
+ASYNC_DISPATCH # enable_async_dispatch
+)
+# Map of cmake variables to Makefile variables
+set(KOKKOS_INTERNAL_ENABLE_ASYNC_DISPATCH enable_async_dispatch)
 #-------------------------------------------------------------------------------
 #------------------------------- Create doc strings ----------------------------
 #-------------------------------------------------------------------------------
@@ -202,6 +218,11 @@ set(KOKKOS_SEPARATE_LIBS OFF CACHE BOOL "OFF = kokkos. ON = kokkoscore, kokkosc
 # Qthreads options.
 set(KOKKOS_QTHREADS_DIR "" CACHE PATH "Location of Qthreads library.")
+# HPX options.
+set(KOKKOS_HPX_DIR "" CACHE PATH "Location of HPX library.")
+# Whether to build separate libraries or now
+set(KOKKOS_SEPARATE_TESTS OFF CACHE BOOL "Provide unit test targets with finer granularity.")
 #-------------------------------------------------------------------------------
 #------------------------------- KOKKOS_DEVICES --------------------------------
@@ -215,6 +236,11 @@ IF(Trilinos_ENABLE_Kokkos)
 ELSE()
 set_kokkos_default_default(QTHREADS OFF)
 ENDIF()
+IF(TPL_ENABLE_HPX)
+set_kokkos_default_default(HPX ON)
+ELSE()
+set_kokkos_default_default(HPX OFF)
+ENDIF()
 IF(Trilinos_ENABLE_OpenMP)
 set_kokkos_default_default(OPENMP ${Trilinos_ENABLE_OpenMP})
 ELSE()
@@ -231,6 +257,7 @@ ELSE()
 set_kokkos_default_default(OPENMP OFF)
 set_kokkos_default_default(PTHREAD OFF)
 set_kokkos_default_default(QTHREAD OFF)
+set_kokkos_default_default(HPX OFF)
 set_kokkos_default_default(CUDA OFF)
 set_kokkos_default_default(ROCM OFF)
 ENDIF()
@@ -241,6 +268,7 @@ set(KOKKOS_ENABLE_SERIAL ${KOKKOS_INTERNAL_ENABLE_SERIAL_DEFAULT} CACHE BOOL "Wh
 set(KOKKOS_ENABLE_OPENMP ${KOKKOS_INTERNAL_ENABLE_OPENMP_DEFAULT} CACHE BOOL "Enable OpenMP support in Kokkos." FORCE)
 set(KOKKOS_ENABLE_PTHREAD ${KOKKOS_INTERNAL_ENABLE_PTHREAD_DEFAULT} CACHE BOOL "Enable Pthread support in Kokkos.")
 set(KOKKOS_ENABLE_QTHREADS ${KOKKOS_INTERNAL_ENABLE_QTHREADS_DEFAULT} CACHE BOOL "Enable Qthreads support in Kokkos.")
+set(KOKKOS_ENABLE_HPX ${KOKKOS_INTERNAL_ENABLE_HPX_DEFAULT} CACHE BOOL "Enable HPX support in Kokkos.")
 set(KOKKOS_ENABLE_CUDA ${KOKKOS_INTERNAL_ENABLE_CUDA_DEFAULT} CACHE BOOL "Enable CUDA support in Kokkos.")
 set(KOKKOS_ENABLE_ROCM ${KOKKOS_INTERNAL_ENABLE_ROCM_DEFAULT} CACHE BOOL "Enable ROCm support in Kokkos.")
@@ -343,6 +371,18 @@ set(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE ${KOKKOS_INTERNAL_ENABLE_CUDA_REL
 set(KOKKOS_ENABLE_CUDA_LAMBDA ${KOKKOS_INTERNAL_ENABLE_CUDA_LAMBDA_DEFAULT} CACHE BOOL "Enable lambdas for CUDA. (cuda option)")
+#-------------------------------------------------------------------------------
+#------------------------------- KOKKOS_HPX_OPTIONS ----------------------------
+#-------------------------------------------------------------------------------
+# HPX options.
+# Set Defaults
+set_kokkos_default_default(HPX_ASYNC_DISPATCH OFF)
+# Set actual options
+set(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH ${KOKKOS_INTERNAL_ENABLE_HPX_ASYNC_DISPATCH_DEFAULT} CACHE BOOL "Enable HPX async dispatch.")
 #-------------------------------------------------------------------------------
 #----------------------- HOST ARCH AND LEGACY TRIBITS --------------------------
 #-------------------------------------------------------------------------------
@@ -376,4 +416,3 @@ foreach(opt ${KOKKOS_INTERNAL_ENABLE_OPTIONS_LIST})
 SET(Kokkos_ENABLE_${opt} ${KOKKOS_ENABLE_${OPT}} CACHE BOOL "CamelCase Compatibility setting for KOKKOS_ENABLE_${OPT}")
 ENDIF()
 endforeach()

View File

@@ -198,6 +198,8 @@ if(KOKKOS_CMAKE_VERBOSE)
 message(STATUS " Host Parallel: Pthread")
 elseif(KOKKOS_ENABLE_QTHREADS)
 message(STATUS " Host Parallel: Qthreads")
+elseif(KOKKOS_ENABLE_HPX)
+message(STATUS " Host Parallel: HPX")
 else()
 message(STATUS " Host Parallel: None")
 endif()
@@ -244,6 +246,10 @@ if(KOKKOS_CMAKE_VERBOSE)
 message(STATUS " KOKKOS_MEMKIND_DIR: ${KOKKOS_MEMKIND_DIR}")
 endif()
+if(KOKKOS_HPX_DIR)
+message(STATUS " KOKKOS_HPX_DIR: ${KOKKOS_HPX_DIR}")
+endif()
 message(STATUS "")
 message(STATUS "Final kokkos settings variable:")
 message(STATUS " ${KOKKOS_SETTINGS}")

View File

@@ -9,6 +9,10 @@ IF(NOT DEFINED ${PROJECT_NAME}_ENABLE_OpenMP)
 SET(${PROJECT_NAME}_ENABLE_OpenMP OFF)
 ENDIF()
+IF(NOT DEFINED ${PROJECT_NAME}_ENABLE_HPX)
+SET(${PROJECT_NAME}_ENABLE_HPX OFF)
+ENDIF()
 IF(NOT DEFINED ${PROJECT_NAME}_ENABLE_DEBUG)
 SET(${PROJECT_NAME}_ENABLE_DEBUG OFF)
 ENDIF()
@@ -309,6 +313,10 @@ ENDFUNCTION()
 FUNCTION(TRIBITS_TPL_TENTATIVELY_ENABLE)
 ENDFUNCTION()
+FUNCTION(TRIBITS_ADD_ADVANCED_TEST)
+# TODO Write this
+ENDFUNCTION()
 FUNCTION(TRIBITS_ADD_EXECUTABLE_AND_TEST EXE_NAME)
 SET(options STANDARD_PASS_OUTPUT WILL_FAIL)

View File

@@ -1,5 +1,5 @@
 TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
 LIB_REQUIRED_PACKAGES KokkosCore
-LIB_OPTIONAL_TPLS Pthread CUDA HWLOC
+LIB_OPTIONAL_TPLS Pthread CUDA HWLOC HPX
 TEST_OPTIONAL_TPLS CUSPARSE
 )

View File

@@ -24,6 +24,10 @@ IF(Kokkos_ENABLE_OpenMP)
 LIST( APPEND SOURCES TestOpenMP.cpp)
 ENDIF()
+IF(Kokkos_ENABLE_HPX)
+LIST( APPEND SOURCES TestHPX.cpp)
+ENDIF()
 # Per #374, we always want to build this test, but we only want to run
 # it as a PERFORMANCE test. That's why we separate building the test
 # from running the test.

View File

@@ -49,6 +49,12 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
 TEST_TARGETS += test-openmp
 endif
+ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1)
+OBJ_HPX = TestHPX.o TestMain.o gtest-all.o
+TARGETS += KokkosContainers_PerformanceTest_HPX
+TEST_TARGETS += test-hpx
+endif
 KokkosContainers_PerformanceTest_Cuda: $(OBJ_CUDA) $(KOKKOS_LINK_DEPENDS)
 $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_CUDA) $(KOKKOS_LIBS) $(LIB) -o KokkosContainers_PerformanceTest_Cuda
@@ -61,6 +67,9 @@ KokkosContainers_PerformanceTest_Threads: $(OBJ_THREADS) $(KOKKOS_LINK_DEPENDS)
 KokkosContainers_PerformanceTest_OpenMP: $(OBJ_OPENMP) $(KOKKOS_LINK_DEPENDS)
 $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_OPENMP) $(KOKKOS_LIBS) $(LIB) -o KokkosContainers_PerformanceTest_OpenMP
+KokkosContainers_PerformanceTest_HPX: $(OBJ_HPX) $(KOKKOS_LINK_DEPENDS)
+$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_HPX) $(KOKKOS_LIBS) $(LIB) -o KokkosContainers_PerformanceTest_HPX
 test-cuda: KokkosContainers_PerformanceTest_Cuda
 ./KokkosContainers_PerformanceTest_Cuda
@@ -73,6 +82,9 @@ test-threads: KokkosContainers_PerformanceTest_Threads
 test-openmp: KokkosContainers_PerformanceTest_OpenMP
 ./KokkosContainers_PerformanceTest_OpenMP
+test-hpx: KokkosContainers_PerformanceTest_HPX
+./KokkosContainers_PerformanceTest_HPX
 build_all: $(TARGETS)
 test: $(TEST_TARGETS)

View File

@@ -197,7 +197,7 @@ void test_dynrankview_op_perf( const int par_size )
 timer.reset();
 Kokkos::RangePolicy<DeviceType> policy(0,par_size);
 Kokkos::parallel_for( policy , FunctorType(testview) );
-DeviceType::fence();
+DeviceType().fence();
 elapsed_time_view = timer.seconds();
 std::cout << " View time (init only): " << elapsed_time_view << std::endl;
@@ -205,7 +205,7 @@ void test_dynrankview_op_perf( const int par_size )
 timer.reset();
 Kokkos::View<double*,DeviceType> sumview("sumview",par_size);
 Kokkos::parallel_for( policy , typename FunctorType::SumComputationTest(testview, sumview) );
-DeviceType::fence();
+DeviceType().fence();
 elapsed_time_compview = timer.seconds();
 std::cout << " View sum computation time: " << elapsed_time_view << std::endl;
@@ -215,7 +215,7 @@ void test_dynrankview_op_perf( const int par_size )
 timer.reset();
 Kokkos::parallel_for( policy , FunctorStrideType(teststrideview) );
-DeviceType::fence();
+DeviceType().fence();
 elapsed_time_strideview = timer.seconds();
 std::cout << " Strided View time (init only): " << elapsed_time_strideview << std::endl;
 }
@@ -226,7 +226,7 @@ void test_dynrankview_op_perf( const int par_size )
 timer.reset();
 Kokkos::RangePolicy<DeviceType> policy(0,par_size);
 Kokkos::parallel_for( policy , FunctorType(testview) );
-DeviceType::fence();
+DeviceType().fence();
 elapsed_time_view_rank7 = timer.seconds();
 std::cout << " View Rank7 time (init only): " << elapsed_time_view_rank7 << std::endl;
 }
@@ -237,14 +237,14 @@ void test_dynrankview_op_perf( const int par_size )
 timer.reset();
 Kokkos::RangePolicy<DeviceType> policy(0,par_size);
 Kokkos::parallel_for( policy , FunctorType(testdrview) );
-DeviceType::fence();
+DeviceType().fence();
 elapsed_time_drview = timer.seconds();
 std::cout << " DynRankView time (init only): " << elapsed_time_drview << std::endl;
 timer.reset();
 Kokkos::DynRankView<double,DeviceType> sumview("sumview",par_size);
 Kokkos::parallel_for( policy , typename FunctorType::SumComputationTest(testdrview, sumview) );
-DeviceType::fence();
+DeviceType().fence();
 elapsed_time_compdrview = timer.seconds();
 std::cout << " DynRankView sum computation time: " << elapsed_time_compdrview << std::endl;

View File

@@ -192,7 +192,7 @@ void test_global_to_local_ids(unsigned num_ids)
 {
 generate_ids<Device> gen(local_2_global);
 }
-Device::fence();
+Device().fence();
 // generate
 elasped_time = timer.seconds();
 std::cout << elasped_time << ", ";
@@ -201,7 +201,7 @@ void test_global_to_local_ids(unsigned num_ids)
 {
 fill_map<Device> fill(global_2_local, local_2_global);
 }
-Device::fence();
+Device().fence();
 // fill
 elasped_time = timer.seconds();
@@ -214,7 +214,7 @@ void test_global_to_local_ids(unsigned num_ids)
 {
 find_test<Device> find(global_2_local, local_2_global,num_errors);
 }
-Device::fence();
+Device().fence();
 // find
 elasped_time = timer.seconds();
View File

@@ -0,0 +1,130 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ENABLE_HPX )
#include <gtest/gtest.h>
#include <Kokkos_Core.hpp>
#include <Kokkos_UnorderedMap.hpp>
#include <TestGlobal2LocalIds.hpp>
#include <TestUnorderedMapPerformance.hpp>
#include <TestDynRankView.hpp>
#include <TestScatterView.hpp>
#include <iomanip>
#include <sstream>
#include <string>
#include <fstream>
namespace Performance {
class hpx : public ::testing::Test {
protected:
static void SetUpTestCase()
{
std::cout << std::setprecision(5) << std::scientific;
Kokkos::initialize();
Kokkos::print_configuration( std::cout );
}
static void TearDownTestCase()
{
Kokkos::finalize();
}
};
TEST_F( hpx, dynrankview_perf )
{
std::cout << "HPX" << std::endl;
std::cout << " DynRankView vs View: Initialization Only " << std::endl;
test_dynrankview_op_perf<Kokkos::Experimental::HPX>( 8192 );
}
TEST_F( hpx, global_2_local)
{
std::cout << "HPX" << std::endl;
std::cout << "size, create, generate, fill, find" << std::endl;
for (unsigned i=Performance::begin_id_size; i<=Performance::end_id_size; i *= Performance::id_step)
test_global_to_local_ids<Kokkos::Experimental::HPX>(i);
}
TEST_F( hpx, unordered_map_performance_near)
{
unsigned num_hpx = 4;
std::ostringstream base_file_name;
base_file_name << "hpx-" << num_hpx << "-near";
Perf::run_performance_tests<Kokkos::Experimental::HPX,true>(base_file_name.str());
}
TEST_F( hpx, unordered_map_performance_far)
{
unsigned num_hpx = 4;
std::ostringstream base_file_name;
base_file_name << "hpx-" << num_hpx << "-far";
Perf::run_performance_tests<Kokkos::Experimental::HPX,false>(base_file_name.str());
}
TEST_F( hpx, scatter_view)
{
std::cout << "ScatterView data-duplicated test:\n";
Perf::test_scatter_view<Kokkos::Experimental::HPX, Kokkos::LayoutRight,
Kokkos::Experimental::ScatterDuplicated,
Kokkos::Experimental::ScatterNonAtomic>(10, 1000 * 1000);
//std::cout << "ScatterView atomics test:\n";
//Perf::test_scatter_view<Kokkos::Experimental::HPX, Kokkos::LayoutRight,
// Kokkos::Experimental::ScatterNonDuplicated,
// Kokkos::Experimental::ScatterAtomic>(10, 1000 * 1000);
}
} // namespace test
#else
void KOKKOS_CONTAINERS_PERFORMANCE_TESTS_TESTHPX_PREVENT_EMPTY_LINK_ERROR() {}
#endif

View File

@@ -83,6 +83,7 @@ void test_scatter_view(int m, int n)
 for (int k = 0; k < m; ++k) {
 Kokkos::parallel_for(policy, f2, "hand_coded_duplicate_scatter_view_test");
 }
+Kokkos::fence();
 auto t = timer.seconds();
 std::cout << "hand-coded test took " << t << " seconds\n";
 }
@@ -101,6 +102,7 @@ void test_scatter_view(int m, int n)
 for (int k = 0; k < m; ++k) {
 Kokkos::parallel_for(policy, f, "scatter_view_test");
 }
+Kokkos::fence();
 auto t = timer.seconds();
 std::cout << "test took " << t << " seconds\n";
 }

View File

@ -108,7 +108,7 @@ struct UnorderedMapTest
std::cout << std::setprecision(2) << std::fixed << std::setw(5) << (1e9*(seconds/(inserts))) << "; " << std::flush;
histogram.calculate();
Device::fence();
Device().fence();
}
void print(std::ostream & metrics_out, std::ostream & length_out, std::ostream & distance_out, std::ostream & block_distance_out)
@ -236,7 +236,7 @@ void run_performance_tests(std::string const & base_file_name)
uint32_t inserts = static_cast<uint32_t>(test_ratios[j]*(capacity));
std::cout << capacity << std::flush;
UnorderedMapTest<Device, Near> test(capacity, inserts*collisions[i], collisions[i]);
Device::fence();
Device().fence();
test.print(metrics_out, length_out, distance_out, block_distance_out);
}
std::cout << "\b\b " << std::endl;

View File

@ -107,22 +107,20 @@ public:
}
}
/// assignment
Bitset<Device> & operator = (Bitset<Device> const & rhs)
{
this->m_size = rhs.m_size;
this->m_last_block_mask = rhs.m_last_block_mask;
this->m_blocks = rhs.m_blocks;
KOKKOS_INLINE_FUNCTION
Bitset (const Bitset<Device>&) = default;
return *this;
}
KOKKOS_INLINE_FUNCTION
Bitset& operator= (const Bitset<Device>&) = default;
/// copy constructor
Bitset( Bitset<Device> const & rhs)
: m_size( rhs.m_size )
, m_last_block_mask( rhs.m_last_block_mask )
, m_blocks( rhs.m_blocks )
{}
KOKKOS_INLINE_FUNCTION
Bitset (Bitset<Device>&&) = default;
KOKKOS_INLINE_FUNCTION
Bitset& operator= (Bitset<Device>&&) = default;
KOKKOS_INLINE_FUNCTION
~Bitset () = default;
/// number of bits in the set
/// can be called from the host or the device

View File

@ -484,8 +484,8 @@ public:
}
}
if(std::is_same<typename t_host::memory_space,typename t_dev::memory_space>::value) {
t_dev::execution_space::fence();
t_host::execution_space::fence();
typename t_dev::execution_space().fence();
typename t_host::execution_space().fence();
}
}

View File

@ -75,7 +75,7 @@ struct DynRankDimTraits {
, const size_t N4
, const size_t N5
, const size_t N6
, const size_t N7 )
, const size_t /* N7 */)
{
return
( (N6 == unspecified && N5 == unspecified && N4 == unspecified && N3 == unspecified && N2 == unspecified && N1 == unspecified && N0 == unspecified) ? 0
@ -106,7 +106,7 @@ struct DynRankDimTraits {
// Extra overload to match that for specialize types v2
template <typename Layout, typename ... P>
KOKKOS_INLINE_FUNCTION
static size_t computeRank( const Kokkos::Impl::ViewCtorProp<P...>& prop, const Layout& layout )
static size_t computeRank( const Kokkos::Impl::ViewCtorProp<P...>& /* prop */, const Layout& layout )
{
return computeRank(layout);
}
@ -155,7 +155,7 @@ struct DynRankDimTraits {
// Extra overload to match that for specialize types
template <typename Traits, typename ... P>
KOKKOS_INLINE_FUNCTION
static typename std::enable_if< (std::is_same<typename Traits::array_layout , Kokkos::LayoutRight>::value || std::is_same<typename Traits::array_layout , Kokkos::LayoutLeft>::value || std::is_same<typename Traits::array_layout , Kokkos::LayoutStride>::value) , typename Traits::array_layout >::type createLayout( const Kokkos::Impl::ViewCtorProp<P...>& prop, const typename Traits::array_layout& layout )
static typename std::enable_if< (std::is_same<typename Traits::array_layout , Kokkos::LayoutRight>::value || std::is_same<typename Traits::array_layout , Kokkos::LayoutLeft>::value || std::is_same<typename Traits::array_layout , Kokkos::LayoutStride>::value) , typename Traits::array_layout >::type createLayout( const Kokkos::Impl::ViewCtorProp<P...>& /* prop */, const typename Traits::array_layout& layout )
{
return createLayout( layout );
}
@ -655,7 +655,7 @@ public:
const size_t dim_scalar = m_map.dimension_scalar();
const size_t bytes = this->span() / dim_scalar;
typedef Kokkos::View<DataType*, typename traits::array_layout, typename traits::device_type, Kokkos::MemoryTraits<Kokkos::Unmanaged | traits::memory_traits::RandomAccess | traits::memory_traits::Atomic> > tmp_view_type;
typedef Kokkos::View<DataType*, typename traits::array_layout, typename traits::device_type, Kokkos::MemoryTraits<traits::memory_traits::is_unmanaged | traits::memory_traits::is_random_access | traits::memory_traits::is_atomic> > tmp_view_type;
tmp_view_type rankone_view(this->data(), bytes, dim_scalar);
return rankone_view(i0);
}
@ -1060,7 +1060,7 @@ public:
}
// Copy the input allocation properties with possibly defaulted properties
alloc_prop prop( arg_prop );
alloc_prop prop_copy( arg_prop );
//------------------------------------------------------------
#if defined( KOKKOS_ENABLE_CUDA )
@ -1070,18 +1070,18 @@ public:
// Fence using the trait's execution space (which will be Kokkos::Cuda)
// to avoid incomplete type errors from using Kokkos::Cuda directly.
if ( std::is_same< Kokkos::CudaUVMSpace , typename traits::device_type::memory_space >::value ) {
traits::device_type::memory_space::execution_space::fence();
typename traits::device_type::memory_space::execution_space().fence();
}
#endif
//------------------------------------------------------------
Kokkos::Impl::SharedAllocationRecord<> *
record = m_map.allocate_shared( prop , Impl::DynRankDimTraits<typename traits::specialize>::template createLayout<traits, P...>(arg_prop, arg_layout) );
record = m_map.allocate_shared( prop_copy, Impl::DynRankDimTraits<typename traits::specialize>::template createLayout<traits, P...>(arg_prop, arg_layout) );
//------------------------------------------------------------
#if defined( KOKKOS_ENABLE_CUDA )
if ( std::is_same< Kokkos::CudaUVMSpace , typename traits::device_type::memory_space >::value ) {
traits::device_type::memory_space::execution_space::fence();
typename traits::device_type::memory_space::execution_space().fence();
}
#endif
//------------------------------------------------------------
@ -1609,7 +1609,7 @@ struct DynRankViewFill {
closure.execute();
execution_space::fence();
execution_space().fence();
}
};
@ -1650,6 +1650,7 @@ struct DynRankViewRemap {
typedef Kokkos::RangePolicy< ExecSpace > Policy ;
const Kokkos::Impl::ParallelFor< DynRankViewRemap , Policy > closure( *this , Policy( 0 , n0 ) );
closure.execute();
// Kokkos::fence(); // ??
}
KOKKOS_INLINE_FUNCTION

View File

@ -288,8 +288,8 @@ public:
>::type
resize_serial( IntType const & n )
{
typedef typename traits::value_type value_type ;
typedef value_type * value_pointer_type ;
typedef typename traits::value_type local_value_type ;
typedef local_value_type * value_pointer_type ;
const uintptr_t NC = ( n + m_chunk_mask ) >> m_chunk_shift ; // New total number of chunks needed for resize
@ -304,8 +304,8 @@ public:
if ( *pc < NC ) {
while ( *pc < NC ) {
m_chunks[*pc] = reinterpret_cast<value_pointer_type>
(
typename traits::memory_space().allocate( sizeof(value_type) << m_chunk_shift )
(
typename traits::memory_space().allocate( sizeof(local_value_type) << m_chunk_shift )
);
++*pc ;
}
@ -314,7 +314,7 @@ public:
while ( NC + 1 <= *pc ) {
--*pc ;
typename traits::memory_space().deallocate( m_chunks[*pc]
, sizeof(value_type) << m_chunk_shift );
, sizeof(local_value_type) << m_chunk_shift );
m_chunks[*pc] = 0 ;
}
}
@ -376,8 +376,8 @@ public:
closure.execute();
traits::execution_space::fence();
//Impl::ChunkArraySpace< typename traits::memory_space >::memory_space::execution_space::fence();
typename traits::execution_space().fence();
//Impl::ChunkArraySpace< typename traits::memory_space >::memory_space::execution_space().fence();
}
void construct_shared_allocation()

View File

@ -202,8 +202,8 @@ namespace Kokkos {
template <typename iType, typename std::enable_if< std::is_integral<iType>::value, iType>::type = 0>
KOKKOS_INLINE_FUNCTION
int64_t begin(const iType dimension) const {
return dimension < Rank ? m_begins[dimension] : 0;
int64_t begin(const iType local_dimension) const {
return local_dimension < Rank ? m_begins[local_dimension] : 0;
}
KOKKOS_INLINE_FUNCTION
@ -211,7 +211,9 @@ namespace Kokkos {
template <typename iType, typename std::enable_if< std::is_integral<iType>::value, iType>::type = 0>
KOKKOS_INLINE_FUNCTION
int64_t end(const iType dimension) const {return begin(dimension) + m_map.extent(dimension);}
int64_t end(const iType local_dimension) const {
return begin(local_dimension) + m_map.extent(local_dimension);
}
private:
@ -1068,7 +1070,7 @@ namespace Kokkos {
}
// Copy the input allocation properties with possibly defaulted properties
alloc_prop prop( arg_prop );
alloc_prop prop_copy( arg_prop );
//------------------------------------------------------------
#if defined( KOKKOS_ENABLE_CUDA )
@ -1078,18 +1080,18 @@ namespace Kokkos {
// Fence using the trait's execution space (which will be Kokkos::Cuda)
// to avoid incomplete type errors from using Kokkos::Cuda directly.
if ( std::is_same< Kokkos::CudaUVMSpace , typename traits::device_type::memory_space >::value ) {
traits::device_type::memory_space::execution_space::fence();
typename traits::device_type::memory_space::execution_space().fence();
}
#endif
//------------------------------------------------------------
Kokkos::Impl::SharedAllocationRecord<> *
record = m_map.allocate_shared( prop , arg_layout );
record = m_map.allocate_shared( prop_copy , arg_layout );
//------------------------------------------------------------
#if defined( KOKKOS_ENABLE_CUDA )
if ( std::is_same< Kokkos::CudaUVMSpace , typename traits::device_type::memory_space >::value ) {
traits::device_type::memory_space::execution_space::fence();
typename traits::device_type::memory_space::execution_space().fence();
}
#endif
//------------------------------------------------------------

View File

@ -57,9 +57,16 @@
namespace Kokkos {
namespace Experimental {
//TODO: replace this enum with the Kokkos::Sum, etc. reducers for parallel_reduce
/*
* Reduction Type list
* - These correspond to a subset of the reducers in parallel_reduce
* - See the implementations of ScatterValue for details.
*/
enum : int {
ScatterSum,
ScatterProd,
ScatterMax,
ScatterMin,
};
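// Minimal usage sketch, assuming the parameter order the unit tests below
// use, i.e. ScatterView<DataType, Layout, ExecSpace, Op, duplication, contribution>:
//   Kokkos::Experimental::ScatterView
//     < double*[3], Kokkos::LayoutRight, Kokkos::DefaultExecutionSpace
//     , Kokkos::Experimental::ScatterMax
//     > sv("sv", n);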
enum : int {
@ -114,6 +121,21 @@ struct DefaultContribution<Kokkos::OpenMP, Kokkos::Experimental::ScatterDuplicat
};
#endif
#ifdef KOKKOS_ENABLE_HPX
template <>
struct DefaultDuplication<Kokkos::Experimental::HPX> {
enum : int { value = Kokkos::Experimental::ScatterDuplicated };
};
template <>
struct DefaultContribution<Kokkos::Experimental::HPX, Kokkos::Experimental::ScatterNonDuplicated> {
enum : int { value = Kokkos::Experimental::ScatterAtomic };
};
template <>
struct DefaultContribution<Kokkos::Experimental::HPX, Kokkos::Experimental::ScatterDuplicated> {
enum : int { value = Kokkos::Experimental::ScatterNonAtomic };
};
#endif
#ifdef KOKKOS_ENABLE_THREADS
template <>
struct DefaultDuplication<Kokkos::Threads> {
@ -144,39 +166,277 @@ struct DefaultContribution<Kokkos::Cuda, Kokkos::Experimental::ScatterDuplicated
};
#endif
/* ScatterValue is the object returned by the access operator() of ScatterAccess,
similar to that returned by an Atomic View, it wraps Kokkos::atomic_add with convenient
operator+=, etc. */
/* ScatterValue <Op=ScatterSum, contribution=ScatterNonAtomic> is the object returned by the access operator() of ScatterAccess.
This class inherits from the Sum<> reducer and wraps join(dest, src) with convenient operator+=, etc.
Note the addition of update(ValueType const& rhs) and reset() so that all reducers share common functions
(see ReduceDuplicates and ResetDuplicates). */
template <typename ValueType, int Op, int contribution>
struct ScatterValue;
template <typename ValueType>
struct ScatterValue<ValueType, Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonAtomic> {
struct ScatterValue<ValueType, Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonAtomic> :
Sum<ValueType,Kokkos::DefaultExecutionSpace> {
public:
KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ValueType& value_in) : value( value_in ) {}
KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ScatterValue&& other) : value( other.value ) {}
KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ValueType& value_in) :
Sum<ValueType,Kokkos::DefaultExecutionSpace>(value_in)
{}
KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ScatterValue&& other) :
Sum<ValueType,Kokkos::DefaultExecutionSpace>(other.reference())
{}
KOKKOS_FORCEINLINE_FUNCTION void operator+=(ValueType const& rhs) {
value += rhs;
this->join( this->reference(), rhs );
}
KOKKOS_FORCEINLINE_FUNCTION void operator-=(ValueType const& rhs) {
value -= rhs;
this->join( this->reference(), -rhs );
}
KOKKOS_FORCEINLINE_FUNCTION void update(ValueType const& rhs) {
this->join( this->reference(), rhs );
}
KOKKOS_FORCEINLINE_FUNCTION void reset() {
this->init( this->reference() );
}
private:
ValueType& value;
};
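// In a kernel this is reached through a ScatterAccess handle, e.g. (as in
// the ScatterSum unit test below):
//   auto a = scatter_view.access();
//   a(k, 0) += 4.2;  // forwards to Sum<>::join via operator+=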
/* ScatterValue <Op=ScatterSum, contribution=ScatterAtomic> is the object returned by the access operator()
* of ScatterAccess. Similar to that returned by an Atomic View, it wraps Kokkos::atomic_add with convenient
operator+=, etc. This version also has the update(rhs) and reset() functions. */
template <typename ValueType>
struct ScatterValue<ValueType, Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterAtomic> {
struct ScatterValue<ValueType, Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterAtomic> :
Sum<ValueType,Kokkos::DefaultExecutionSpace> {
public:
KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ValueType& value_in) : value( value_in ) {}
KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ValueType& value_in) :
Sum<ValueType,Kokkos::DefaultExecutionSpace>(value_in)
{}
KOKKOS_FORCEINLINE_FUNCTION void operator+=(ValueType const& rhs) {
Kokkos::atomic_add(&value, rhs);
this->join(this->reference(), rhs);
}
KOKKOS_FORCEINLINE_FUNCTION void operator-=(ValueType const& rhs) {
Kokkos::atomic_add(&value, -rhs);
this->join(this->reference(), -rhs);
}
private:
ValueType& value;
KOKKOS_INLINE_FUNCTION
void join(ValueType& dest, const ValueType& src) const {
Kokkos::atomic_add(&dest, src);
}
KOKKOS_INLINE_FUNCTION
void join(volatile ValueType& dest, const volatile ValueType& src) const {
Kokkos::atomic_add(&dest, src);
}
KOKKOS_FORCEINLINE_FUNCTION void update(ValueType const& rhs) {
this->join( this->reference(), rhs );
}
KOKKOS_FORCEINLINE_FUNCTION void reset() {
this->init( this->reference() );
}
};
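// The atomic path is selected per handle, e.g. (as in the unit tests below):
//   auto aa = scatter_view.template access<Kokkos::Experimental::ScatterAtomic>();
//   aa(k, 1) += 2.0;  // join() here performs Kokkos::atomic_add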
/* ScatterValue <Op=ScatterProd, contribution=ScatterNonAtomic> is the object returned by the access operator() of ScatterAccess.
This class inherits from the Prod<> reducer and wraps join(dest, src) with convenient operator*=, etc.
Note the addition of update(ValueType const& rhs) and reset() so that all reducers share common functions
(see ReduceDuplicates and ResetDuplicates). */
template <typename ValueType>
struct ScatterValue<ValueType, Kokkos::Experimental::ScatterProd, Kokkos::Experimental::ScatterNonAtomic> :
Prod<ValueType,Kokkos::DefaultExecutionSpace> {
public:
KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ValueType& value_in) :
Prod<ValueType,Kokkos::DefaultExecutionSpace>(value_in)
{}
KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ScatterValue&& other) :
Prod<ValueType,Kokkos::DefaultExecutionSpace>(other.reference())
{}
KOKKOS_FORCEINLINE_FUNCTION void operator*=(ValueType const& rhs) {
this->join( this->reference(), rhs );
}
KOKKOS_FORCEINLINE_FUNCTION void operator/=(ValueType const& rhs) {
this->join( this->reference(), static_cast<ValueType>(1)/rhs );
}
KOKKOS_FORCEINLINE_FUNCTION void update(ValueType const& rhs) {
this->join( this->reference(), rhs );
}
KOKKOS_FORCEINLINE_FUNCTION void reset() {
this->init( this->reference() );
}
};
/* ScatterValue <Op=ScatterProd, contribution=ScatterAtomic> is the object returned by the access operator()
* of ScatterAccess. Similar to that returned by an Atomic View, it wraps an atomic_prod with convenient
operator*=, etc. atomic_prod uses atomic_compare_exchange. This version also has the update(rhs) and reset() functions. */
template <typename ValueType>
struct ScatterValue<ValueType, Kokkos::Experimental::ScatterProd, Kokkos::Experimental::ScatterAtomic> :
Prod<ValueType,Kokkos::DefaultExecutionSpace> {
public:
KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ValueType& value_in) :
Prod<ValueType,Kokkos::DefaultExecutionSpace>(value_in)
{}
KOKKOS_FORCEINLINE_FUNCTION void operator*=(ValueType const& rhs) {
this->join(this->reference(), rhs);
}
KOKKOS_FORCEINLINE_FUNCTION void operator/=(ValueType const& rhs) {
this->join(this->reference(), static_cast<ValueType>(1)/rhs);
}
KOKKOS_FORCEINLINE_FUNCTION
void atomic_prod(ValueType & dest, const ValueType& src) const {
bool success = false;
while(!success) {
ValueType dest_old = dest;
ValueType dest_new = dest_old * src;
dest_new = Kokkos::atomic_compare_exchange<ValueType>(&dest,dest_old,dest_new);
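// atomic_compare_exchange returns the value observed at &dest; the loop
// retries until that observed value matches the dest_old this attempt was
// based on (the match is tested here with a relative tolerance rather than
// bitwise equality).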
success = ( (dest_new - dest_old)/dest_old <= 1e-15 );
}
}
KOKKOS_INLINE_FUNCTION
void join(ValueType& dest, const ValueType& src) const {
atomic_prod(dest, src);
}
KOKKOS_INLINE_FUNCTION
void join(volatile ValueType& dest, const volatile ValueType& src) const {
atomic_prod(dest, src);
}
KOKKOS_FORCEINLINE_FUNCTION void update(ValueType const& rhs) {
this->join( this->reference(), rhs );
}
KOKKOS_FORCEINLINE_FUNCTION void reset() {
this->init( this->reference() );
}
};
/* ScatterValue <Op=ScatterMin, contribution=ScatterNonAtomic> is the object returned by the access operator() of ScatterAccess.
This class inherits from the Min<> reducer and wraps join(dest, src) with a convenient update(rhs).
update(ValueType const& rhs) and reset() are added so that all reducers share a common update function
(see ReduceDuplicates and ResetDuplicates). */
template <typename ValueType>
struct ScatterValue<ValueType, Kokkos::Experimental::ScatterMin, Kokkos::Experimental::ScatterNonAtomic> :
Min<ValueType,Kokkos::DefaultExecutionSpace> {
public:
KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ValueType& value_in) :
Min<ValueType,Kokkos::DefaultExecutionSpace>(value_in)
{}
KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ScatterValue&& other) :
Min<ValueType,Kokkos::DefaultExecutionSpace>(other.reference())
{}
KOKKOS_FORCEINLINE_FUNCTION void update(ValueType const& rhs) {
this->join( this->reference(), rhs );
}
KOKKOS_FORCEINLINE_FUNCTION void reset() {
this->init( this->reference() );
}
};
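// Min contributions go through update(), e.g. (as in the unit tests below,
// with 'candidate' a placeholder for the value being offered):
//   scatter_access(k, 0).update(candidate);  // keeps the smaller of the two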
/* ScatterValue <Op=ScatterMin, contribution=ScatterAtomic> is the object returned by the access operator()
* of ScatterAccess. Similar to that returned by an Atomic View, it wraps an atomic_min with the update(rhs)
function. atomic_min uses atomic_compare_exchange. This version also has the reset() function. */
template <typename ValueType>
struct ScatterValue<ValueType, Kokkos::Experimental::ScatterMin, Kokkos::Experimental::ScatterAtomic> :
Min<ValueType,Kokkos::DefaultExecutionSpace> {
public:
KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ValueType& value_in) :
Min<ValueType,Kokkos::DefaultExecutionSpace>(value_in)
{}
KOKKOS_FORCEINLINE_FUNCTION
void atomic_min(ValueType & dest, const ValueType& src) const {
bool success = false;
while(!success) {
ValueType dest_old = dest;
ValueType dest_new = ( dest_old > src ) ? src : dest_old;
dest_new = Kokkos::atomic_compare_exchange<ValueType>(&dest,dest_old,dest_new);
success = ( (dest_new - dest_old)/dest_old <= 1e-15 );
}
}
KOKKOS_INLINE_FUNCTION
void join(ValueType& dest, const ValueType& src) const {
atomic_min(dest, src);
}
KOKKOS_INLINE_FUNCTION
void join(volatile ValueType& dest, const volatile ValueType& src) const {
atomic_min(dest, src);
}
KOKKOS_FORCEINLINE_FUNCTION void update(ValueType const& rhs) {
this->join( this->reference(), rhs );
}
KOKKOS_FORCEINLINE_FUNCTION void reset() {
this->init( this->reference() );
}
};
/* ScatterValue <Op=ScatterMax, contribution=ScatterNonAtomic> is the object returned by the access operator() of ScatterAccess.
This class inherits from the Max<> reducer and wraps join(dest, src) with a convenient update(rhs).
update(ValueType const& rhs) and reset() are added so that all reducers share a common update function
(see ReduceDuplicates and ResetDuplicates). */
template <typename ValueType>
struct ScatterValue<ValueType, Kokkos::Experimental::ScatterMax, Kokkos::Experimental::ScatterNonAtomic> :
Max<ValueType,Kokkos::DefaultExecutionSpace> {
public:
KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ValueType& value_in) :
Max<ValueType,Kokkos::DefaultExecutionSpace>(value_in)
{}
KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ScatterValue&& other) :
Max<ValueType,Kokkos::DefaultExecutionSpace>(other.reference())
{}
KOKKOS_FORCEINLINE_FUNCTION void update(ValueType const& rhs) {
this->join( this->reference(), rhs );
}
KOKKOS_FORCEINLINE_FUNCTION void reset() {
this->init( this->reference() );
}
};
/* ScatterValue <Op=ScatterMax, contribution=ScatterAtomic> is the object returned by the access operator()
* of ScatterAccess. Similar to that returned by an Atomic View, it wraps an atomic_max with the update(rhs)
function. atomic_max uses atomic_compare_exchange. This version also has the reset() function. */
template <typename ValueType>
struct ScatterValue<ValueType, Kokkos::Experimental::ScatterMax, Kokkos::Experimental::ScatterAtomic> :
Max<ValueType,Kokkos::DefaultExecutionSpace> {
public:
KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ValueType& value_in) :
Max<ValueType,Kokkos::DefaultExecutionSpace>(value_in)
{}
KOKKOS_FORCEINLINE_FUNCTION
void atomic_max(ValueType & dest, const ValueType& src) const {
bool success = false;
while(!success) {
ValueType dest_old = dest;
ValueType dest_new = ( dest_old < src ) ? src : dest_old;
dest_new = Kokkos::atomic_compare_exchange<ValueType>(&dest,dest_old,dest_new);
success = ( (dest_new - dest_old)/dest_old <= 1e-15 );
}
}
KOKKOS_INLINE_FUNCTION
void join(ValueType& dest, const ValueType& src) const {
atomic_max(dest, src);
}
KOKKOS_INLINE_FUNCTION
void join(volatile ValueType& dest, const volatile ValueType& src) const {
atomic_max(dest, src);
}
KOKKOS_FORCEINLINE_FUNCTION void update(ValueType const& rhs) {
this->join( this->reference(), rhs );
}
KOKKOS_FORCEINLINE_FUNCTION void reset() {
this->init( this->reference() );
}
};
/* DuplicatedDataType, given a View DataType, will create a new DataType
@ -226,6 +486,18 @@ struct DuplicatedDataType<T*, Kokkos::LayoutLeft> {
typedef typename DuplicatedDataType<T, Kokkos::LayoutLeft>::value_type* value_type;
};
/* Insert integer argument pack into array */
template<class T>
void args_to_array(size_t* array, int pos, T dim0) {
array[pos] = dim0;
}
template<class T, class ... Dims>
void args_to_array(size_t* array, int pos, T dim0, Dims ... dims) {
array[pos] = dim0;
args_to_array(array,pos+1,dims...);
}
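// e.g. args_to_array(arg_N, 0, 10, 20) sets arg_N[0] = 10 and arg_N[1] = 20,
// peeling one dimension off the pack per recursive call.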
/* Slice is just responsible for stuffing the correct number of Kokkos::ALL
arguments on the correct side of the index in a call to subview() to get a
subview where the index specified is the largest-stride one. */
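// e.g. for a rank-3 LayoutLeft duplicated view, slicing on thread id t
// amounts to Kokkos::subview(v, Kokkos::ALL, Kokkos::ALL, t); for
// LayoutRight the index goes first instead.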
@ -304,21 +576,26 @@ struct ReduceDuplicatesBase {
}
};
template <typename ExecSpace, typename ValueType>
struct ReduceDuplicates<ExecSpace, ValueType, Kokkos::Experimental::ScatterSum> :
public ReduceDuplicatesBase<ExecSpace, ValueType, Kokkos::Experimental::ScatterSum>
/* ReduceDuplicates -- Perform reduction on destination array using strided source
* Use ScatterValue<> specific to operation to wrap destination array so that
* the reduction operation can be accessed via the update(rhs) function */
template <typename ExecSpace, typename ValueType, int Op>
struct ReduceDuplicates :
public ReduceDuplicatesBase<ExecSpace, ValueType, Op>
{
typedef ReduceDuplicatesBase<ExecSpace, ValueType, Kokkos::Experimental::ScatterSum> Base;
typedef ReduceDuplicatesBase<ExecSpace, ValueType, Op> Base;
ReduceDuplicates(ValueType const* src_in, ValueType* dst_in, size_t stride_in, size_t start_in, size_t n_in, std::string const& name):
Base(src_in, dst_in, stride_in, start_in, n_in, name)
{}
KOKKOS_FORCEINLINE_FUNCTION void operator()(size_t i) const {
for (size_t j = Base::start; j < Base::n; ++j) {
Base::dst[i] += Base::src[i + Base::stride * j];
ScatterValue<ValueType, Op, Kokkos::Experimental::ScatterNonAtomic> sv(Base::dst[i]);
sv.update( Base::src[i + Base::stride * j] );
}
}
};
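// In effect, for each destination index i this folds every duplicate slot
// into dst[i]: dst[i] = op(dst[i], src[i + stride*j]) for j in [start, n).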
template <typename ExecSpace, typename ValueType, int Op>
struct ResetDuplicates;
@ -347,19 +624,24 @@ struct ResetDuplicatesBase {
}
};
template <typename ExecSpace, typename ValueType>
struct ResetDuplicates<ExecSpace, ValueType, Kokkos::Experimental::ScatterSum> :
public ResetDuplicatesBase<ExecSpace, ValueType, Kokkos::Experimental::ScatterSum>
/* ResetDuplicates -- Perform reset on destination array
* Use ScatterValue<> specific to operation to wrap destination array so that
* the reset operation can be accessed via the reset() function */
template <typename ExecSpace, typename ValueType, int Op>
struct ResetDuplicates :
public ResetDuplicatesBase<ExecSpace, ValueType, Op>
{
typedef ResetDuplicatesBase<ExecSpace, ValueType, Kokkos::Experimental::ScatterSum> Base;
typedef ResetDuplicatesBase<ExecSpace, ValueType, Op> Base;
ResetDuplicates(ValueType* data_in, size_t size_in, std::string const& name):
Base(data_in, size_in, name)
{}
KOKKOS_FORCEINLINE_FUNCTION void operator()(size_t i) const {
Base::data[i] = Kokkos::reduction_identity<ValueType>::sum();
ScatterValue<ValueType, Op, Kokkos::Experimental::ScatterNonAtomic> sv(Base::data[i]);
sv.reset();
}
};
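// reset() writes the identity of the op into every duplicate slot via the
// reducer's init(): 0 for ScatterSum, 1 for ScatterProd, the type's largest
// value for ScatterMin and its lowest for ScatterMax.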
}}} // Kokkos::Impl::Experimental
namespace Kokkos {
@ -519,12 +801,22 @@ public:
typedef Kokkos::Impl::Experimental::ScatterValue<
original_value_type, Op, override_contribution> value_type;
KOKKOS_INLINE_FUNCTION
ScatterAccess() :
view(view_type()) {
}
KOKKOS_INLINE_FUNCTION
ScatterAccess(view_type const& view_in)
: view(view_in)
{
}
KOKKOS_INLINE_FUNCTION
~ScatterAccess()
{
}
template <typename ... Args>
KOKKOS_FORCEINLINE_FUNCTION
value_type operator()(Args ... args) const {
@ -608,7 +900,7 @@ public:
}
template <int override_contribution = contribution>
inline
KOKKOS_FORCEINLINE_FUNCTION
ScatterAccess<DataType, Op, ExecSpace, Kokkos::LayoutRight, ScatterDuplicated, contribution, override_contribution>
access() const {
return ScatterAccess<DataType, Op, ExecSpace, Kokkos::LayoutRight, ScatterDuplicated, contribution, override_contribution>{*this};
@ -729,14 +1021,14 @@ public:
: unique_token()
{
size_t arg_N[8] = {
original_view.extent(0),
original_view.extent(1),
original_view.extent(2),
original_view.extent(3),
original_view.extent(4),
original_view.extent(5),
original_view.extent(6),
0
original_view.rank>0?original_view.extent(0):KOKKOS_IMPL_CTOR_DEFAULT_ARG,
original_view.rank>1?original_view.extent(1):KOKKOS_IMPL_CTOR_DEFAULT_ARG,
original_view.rank>2?original_view.extent(2):KOKKOS_IMPL_CTOR_DEFAULT_ARG,
original_view.rank>3?original_view.extent(3):KOKKOS_IMPL_CTOR_DEFAULT_ARG,
original_view.rank>4?original_view.extent(4):KOKKOS_IMPL_CTOR_DEFAULT_ARG,
original_view.rank>5?original_view.extent(5):KOKKOS_IMPL_CTOR_DEFAULT_ARG,
original_view.rank>6?original_view.extent(6):KOKKOS_IMPL_CTOR_DEFAULT_ARG,
KOKKOS_IMPL_CTOR_DEFAULT_ARG
};
arg_N[internal_view_type::rank - 1] = unique_token.size();
internal_view = internal_view_type(
@ -748,14 +1040,28 @@ public:
}
template <typename ... Dims>
ScatterView(std::string const& name, Dims ... dims)
: internal_view(Kokkos::ViewAllocateWithoutInitializing(name), dims ..., unique_token.size())
{
ScatterView(std::string const& name, Dims ... dims) {
original_view_type original_view;
size_t arg_N[8] = {
original_view.rank>0?original_view.static_extent(0):KOKKOS_IMPL_CTOR_DEFAULT_ARG,
original_view.rank>1?original_view.static_extent(1):KOKKOS_IMPL_CTOR_DEFAULT_ARG,
original_view.rank>2?original_view.static_extent(2):KOKKOS_IMPL_CTOR_DEFAULT_ARG,
original_view.rank>3?original_view.static_extent(3):KOKKOS_IMPL_CTOR_DEFAULT_ARG,
original_view.rank>4?original_view.static_extent(4):KOKKOS_IMPL_CTOR_DEFAULT_ARG,
original_view.rank>5?original_view.static_extent(5):KOKKOS_IMPL_CTOR_DEFAULT_ARG,
original_view.rank>6?original_view.static_extent(6):KOKKOS_IMPL_CTOR_DEFAULT_ARG,
KOKKOS_IMPL_CTOR_DEFAULT_ARG
};
Kokkos::Impl::Experimental::args_to_array(arg_N,0,dims ...);
arg_N[internal_view_type::rank - 1] = unique_token.size();
internal_view = internal_view_type(Kokkos::ViewAllocateWithoutInitializing(name),
arg_N[0], arg_N[1], arg_N[2], arg_N[3],
arg_N[4], arg_N[5], arg_N[6], arg_N[7]);
reset();
}
template <int override_contribution = contribution>
inline
KOKKOS_FORCEINLINE_FUNCTION
ScatterAccess<DataType, Op, ExecSpace, Kokkos::LayoutLeft, ScatterDuplicated, contribution, override_contribution>
access() const {
return ScatterAccess<DataType, Op, ExecSpace, Kokkos::LayoutLeft, ScatterDuplicated, contribution, override_contribution>{*this};
@ -770,9 +1076,13 @@ public:
}
template <typename ... RP>
void contribute_into(View<DataType, RP...> const& dest) const
void contribute_into(View<RP...> const& dest) const
{
typedef View<DataType, RP...> dest_type;
typedef View<RP...> dest_type;
static_assert(std::is_same<
typename dest_type::value_type,
typename original_view_type::non_const_value_type>::value,
"ScatterView deep_copy destination has wrong value_type");
static_assert(std::is_same<
typename dest_type::array_layout,
Kokkos::LayoutLeft>::value,
@ -891,12 +1201,14 @@ public:
typedef Kokkos::Impl::Experimental::ScatterValue<
original_value_type, Op, override_contribution> value_type;
inline ScatterAccess(view_type const& view_in)
KOKKOS_FORCEINLINE_FUNCTION
ScatterAccess(view_type const& view_in)
: view(view_in)
, thread_id(view_in.unique_token.acquire()) {
}
inline ~ScatterAccess() {
KOKKOS_FORCEINLINE_FUNCTION
~ScatterAccess() {
if (thread_id != ~thread_id_type(0)) view.unique_token.release(thread_id);
}
@ -926,8 +1238,9 @@ private:
public:
// do need to allow moves though, for the common
// auto b = a.access();
// that assignment turns into a move constructor call
inline ScatterAccess(ScatterAccess&& other)
// that assignment turns into a move constructor call
KOKKOS_FORCEINLINE_FUNCTION
ScatterAccess(ScatterAccess&& other)
: view(other.view)
, thread_id(other.thread_id)
{

View File

@ -437,9 +437,9 @@ public:
{
bool result = !erasable();
if (is_insertable_map && result) {
execution_space::fence();
execution_space().fence();
set_flag(erasable_idx);
execution_space::fence();
execution_space().fence();
}
return result;
}
@ -448,10 +448,10 @@ public:
{
bool result = erasable();
if (is_insertable_map && result) {
execution_space::fence();
execution_space().fence();
Impl::UnorderedMapErase<declared_map_type> f(*this);
f.apply();
execution_space::fence();
execution_space().fence();
reset_flag(erasable_idx);
}
return result;

View File

@ -121,12 +121,12 @@ public:
if( DV::template need_sync<typename DV::t_dev::device_type>() ) {
set_functor_host f(DV::h_view,val);
parallel_for(n,f);
DV::t_host::execution_space::fence();
typename DV::t_host::execution_space().fence();
DV::template modify<typename DV::t_host::device_type>();
} else {
set_functor f(DV::d_view,val);
parallel_for(n,f);
DV::t_dev::execution_space::fence();
typename DV::t_dev::execution_space().fence();
DV::template modify<typename DV::t_dev::device_type>();
}
}

View File

@ -86,6 +86,31 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
)
ENDIF()
IF(Kokkos_ENABLE_HPX)
TRIBITS_ADD_EXECUTABLE_AND_TEST(
UnitTest_HPX
SOURCES
UnitTestMain.cpp
hpx/TestHPX_BitSet.cpp
hpx/TestHPX_DualView.cpp
hpx/TestHPX_DynamicView.cpp
hpx/TestHPX_DynRankViewAPI_generic.cpp
hpx/TestHPX_DynRankViewAPI_rank12345.cpp
hpx/TestHPX_DynRankViewAPI_rank67.cpp
hpx/TestHPX_ErrorReporter.cpp
hpx/TestHPX_OffsetView.cpp
hpx/TestHPX_ScatterView.cpp
hpx/TestHPX_StaticCrsGraph.cpp
hpx/TestHPX_UnorderedMap.cpp
hpx/TestHPX_Vector.cpp
hpx/TestHPX_ViewCtorPropEmbeddedDim.cpp
COMM serial mpi
NUM_MPI_PROCS 1
FAIL_REGULAR_EXPRESSION " FAILED "
TESTONLYLIBS kokkos_gtest ${TEST_LINK_TARGETS}
)
ENDIF()
IF(Kokkos_ENABLE_Cuda)
TRIBITS_ADD_EXECUTABLE_AND_TEST(
UnitTest_Cuda

View File

@ -4,6 +4,7 @@ GTEST_PATH = ../../TPL/gtest
vpath %.cpp ${KOKKOS_PATH}/containers/unit_tests
vpath %.cpp ${KOKKOS_PATH}/containers/unit_tests/openmp
vpath %.cpp ${KOKKOS_PATH}/containers/unit_tests/hpx
vpath %.cpp ${KOKKOS_PATH}/containers/unit_tests/serial
vpath %.cpp ${KOKKOS_PATH}/containers/unit_tests/threads
vpath %.cpp ${KOKKOS_PATH}/containers/unit_tests/rocm
@ -106,6 +107,25 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
TEST_TARGETS += test-openmp
endif
ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1)
OBJ_HPX = UnitTestMain.o gtest-all.o
OBJ_HPX += TestHPX_BitSet.o
OBJ_HPX += TestHPX_DualView.o
OBJ_HPX += TestHPX_DynamicView.o
OBJ_HPX += TestHPX_DynRankViewAPI_generic.o
OBJ_HPX += TestHPX_DynRankViewAPI_rank12345.o
OBJ_HPX += TestHPX_DynRankViewAPI_rank67.o
OBJ_HPX += TestHPX_ErrorReporter.o
OBJ_HPX += TestHPX_OffsetView.o
OBJ_HPX += TestHPX_ScatterView.o
OBJ_HPX += TestHPX_StaticCrsGraph.o
OBJ_HPX += TestHPX_UnorderedMap.o
OBJ_HPX += TestHPX_Vector.o
OBJ_HPX += TestHPX_ViewCtorPropEmbeddedDim.o
TARGETS += KokkosContainers_UnitTest_HPX
TEST_TARGETS += test-hpx
endif
ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
OBJ_SERIAL = UnitTestMain.o gtest-all.o
OBJ_SERIAL += TestSerial_BitSet.o
@ -137,6 +157,9 @@ KokkosContainers_UnitTest_Threads: $(OBJ_THREADS) $(KOKKOS_LINK_DEPENDS)
KokkosContainers_UnitTest_OpenMP: $(OBJ_OPENMP) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(EXTRA_PATH) $(OBJ_OPENMP) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosContainers_UnitTest_OpenMP
KokkosContainers_UnitTest_HPX: $(OBJ_HPX) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(EXTRA_PATH) $(OBJ_HPX) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosContainers_UnitTest_HPX
KokkosContainers_UnitTest_Serial: $(OBJ_SERIAL) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(EXTRA_PATH) $(OBJ_SERIAL) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosContainers_UnitTest_Serial
@ -152,6 +175,9 @@ test-threads: KokkosContainers_UnitTest_Threads
test-openmp: KokkosContainers_UnitTest_OpenMP
./KokkosContainers_UnitTest_OpenMP
test-hpx: KokkosContainers_UnitTest_HPX
./KokkosContainers_UnitTest_HPX
test-serial: KokkosContainers_UnitTest_Serial
./KokkosContainers_UnitTest_Serial

View File

@ -66,7 +66,7 @@ struct TestBitset
unsigned testit(unsigned collisions)
{
execution_space::fence();
execution_space().fence();
unsigned count = 0;
Kokkos::parallel_reduce( m_bitset.size()*collisions, *this, count);
@ -114,7 +114,7 @@ struct TestBitsetTest
unsigned testit()
{
execution_space::fence();
execution_space().fence();
unsigned count = 0;
Kokkos::parallel_reduce( m_bitset.size(), *this, count);
@ -151,7 +151,7 @@ struct TestBitsetAny
unsigned testit()
{
execution_space::fence();
execution_space().fence();
unsigned count = 0;
Kokkos::parallel_reduce( m_bitset.size(), *this, count);

View File

@ -1276,6 +1276,7 @@ public:
Kokkos::deep_copy( dx , hx );
Kokkos::deep_copy( dy , dx );
Kokkos::deep_copy( hy , dy );
Kokkos::fence();
for ( size_t ip = 0 ; ip < N0 ; ++ip ) {
for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
@ -1286,6 +1287,7 @@ public:
Kokkos::deep_copy( dx , T(0) );
Kokkos::deep_copy( hx , dx );
Kokkos::fence();
for ( size_t ip = 0 ; ip < N0 ; ++ip ) {
for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {

View File

@ -162,6 +162,7 @@ struct ErrorReporterDriver : public ErrorReporterDriverBase<DeviceType>
void execute(int reporter_capacity, int test_size)
{
Kokkos::parallel_for(Kokkos::RangePolicy<execution_space>(0,test_size), *this);
Kokkos::fence();
driver_base::check_expectations(reporter_capacity, test_size);
}
@ -194,6 +195,7 @@ struct ErrorReporterDriverUseLambda : public ErrorReporterDriverBase<DeviceType>
driver_base::m_errorReporter.add_report(work_idx, report);
}
});
Kokkos::fence();
driver_base::check_expectations(reporter_capacity, test_size);
}

View File

@ -48,79 +48,387 @@
namespace Test {
template <typename ExecSpace, typename Layout, int duplication, int contribution, int op>
struct test_scatter_view_impl_cls;
template <typename ExecSpace, typename Layout, int duplication, int contribution>
void test_scatter_view_config(int n)
struct test_scatter_view_impl_cls<ExecSpace, Layout, duplication, contribution, Kokkos::Experimental::ScatterSum>
{
Kokkos::View<double *[3], Layout, ExecSpace> original_view("original_view", n);
{
auto scatter_view = Kokkos::Experimental::create_scatter_view
< Kokkos::Experimental::ScatterSum
, duplication
, contribution
> (original_view);
#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
auto policy = Kokkos::RangePolicy<ExecSpace, int>(0, n);
auto f = KOKKOS_LAMBDA(int i) {
public:
typedef Kokkos::Experimental::ScatterView
< double*[3]
, Layout
, ExecSpace
, Kokkos::Experimental::ScatterSum
, duplication
, contribution
> scatter_view_type;
typedef Kokkos::View<double *[3], Layout, ExecSpace> orig_view_type;
scatter_view_type scatter_view;
int scatterSize;
test_scatter_view_impl_cls(const scatter_view_type& view){
scatter_view = view;
scatterSize = 0;
}
void initialize(orig_view_type orig) {
auto host_view = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), orig);
Kokkos::fence();
for (typename decltype(host_view)::size_type i = 0; i < host_view.extent(0); ++i) {
host_view(i, 0) = 0.0;
host_view(i, 1) = 0.0;
host_view(i, 2) = 0.0;
}
Kokkos::fence();
Kokkos::deep_copy(orig, host_view);
}
void run_parallel(int n) {
scatterSize = n;
auto policy = Kokkos::RangePolicy<ExecSpace, int>(0, n);
Kokkos::parallel_for(policy, *this, "scatter_view_test: Sum");
}
KOKKOS_INLINE_FUNCTION
void operator()(int i) const {
auto scatter_access = scatter_view.access();
auto scatter_access_atomic = scatter_view.template access<Kokkos::Experimental::ScatterAtomic>();
for (int j = 0; j < 10; ++j) {
auto k = (i + j) % n;
auto k = (i + j) % scatterSize;
scatter_access(k, 0) += 4.2;
scatter_access_atomic(k, 1) += 2.0;
scatter_access(k, 2) += 1.0;
}
};
Kokkos::parallel_for(policy, f, "scatter_view_test");
#endif
Kokkos::Experimental::contribute(original_view, scatter_view);
scatter_view.reset_except(original_view);
#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
Kokkos::parallel_for(policy, f, "scatter_view_test");
#endif
Kokkos::Experimental::contribute(original_view, scatter_view);
}
#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
Kokkos::fence();
auto host_view = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), original_view);
Kokkos::fence();
for (typename decltype(host_view)::size_type i = 0; i < host_view.extent(0); ++i) {
auto val0 = host_view(i, 0);
auto val1 = host_view(i, 1);
auto val2 = host_view(i, 2);
EXPECT_TRUE(std::fabs((val0 - 84.0) / 84.0) < 1e-15);
EXPECT_TRUE(std::fabs((val1 - 40.0) / 40.0) < 1e-15);
EXPECT_TRUE(std::fabs((val2 - 20.0) / 20.0) < 1e-15);
}
#endif
{
Kokkos::Experimental::ScatterView
< double*[3]
, Layout
, ExecSpace
, Kokkos::Experimental::ScatterSum
, duplication
, contribution
>
persistent_view("persistent", n);
auto result_view = persistent_view.subview();
contribute(result_view, persistent_view);
}
}
}
template <typename ExecSpace>
void validateResults(orig_view_type orig) {
auto host_view = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), orig);
Kokkos::fence();
for (typename decltype(host_view)::size_type i = 0; i < host_view.extent(0); ++i) {
auto val0 = host_view(i, 0);
auto val1 = host_view(i, 1);
auto val2 = host_view(i, 2);
EXPECT_TRUE(std::fabs((val0 - 84.0) / 84.0) < 1e-14);
EXPECT_TRUE(std::fabs((val1 - 40.0) / 40.0) < 1e-14);
EXPECT_TRUE(std::fabs((val2 - 20.0) / 20.0) < 1e-14);
}
}
};
template <typename ExecSpace, typename Layout, int duplication, int contribution>
struct test_scatter_view_impl_cls<ExecSpace, Layout, duplication, contribution, Kokkos::Experimental::ScatterProd>
{
public:
typedef Kokkos::Experimental::ScatterView
< double*[3]
, Layout
, ExecSpace
, Kokkos::Experimental::ScatterProd
, duplication
, contribution
> scatter_view_type;
typedef Kokkos::View<double *[3], Layout, ExecSpace> orig_view_type;
scatter_view_type scatter_view;
int scatterSize;
test_scatter_view_impl_cls(const scatter_view_type& view){
scatter_view = view;
scatterSize = 0;
}
void initialize(orig_view_type orig) {
auto host_view = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), orig);
Kokkos::fence();
for (typename decltype(host_view)::size_type i = 0; i < host_view.extent(0); ++i) {
host_view(i, 0) = 1.0;
host_view(i, 1) = 1.0;
host_view(i, 2) = 1.0;
}
Kokkos::fence();
Kokkos::deep_copy(orig, host_view);
}
void run_parallel(int n) {
scatterSize = n;
auto policy = Kokkos::RangePolicy<ExecSpace, int>(0, n);
Kokkos::parallel_for(policy, *this, "scatter_view_test: Prod");
}
KOKKOS_INLINE_FUNCTION
void operator()(int i) const {
auto scatter_access = scatter_view.access();
auto scatter_access_atomic = scatter_view.template access<Kokkos::Experimental::ScatterAtomic>();
for (int j = 0; j < 4; ++j) {
auto k = (i + j) % scatterSize;
scatter_access(k, 0) *= 4.0;
scatter_access_atomic(k, 1) *= 2.0;
scatter_access(k, 2) *= 1.0;
}
}
void validateResults(orig_view_type orig) {
auto host_view = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), orig);
Kokkos::fence();
for (typename decltype(host_view)::size_type i = 0; i < host_view.extent(0); ++i) {
auto val0 = host_view(i, 0);
auto val1 = host_view(i, 1);
auto val2 = host_view(i, 2);
EXPECT_TRUE(std::fabs((val0 - 65536.0) / 65536.0) < 1e-14);
EXPECT_TRUE(std::fabs((val1 - 256.0) / 256.0) < 1e-14);
EXPECT_TRUE(std::fabs((val2 - 1.0) / 1.0) < 1e-14);
}
}
};
template <typename ExecSpace, typename Layout, int duplication, int contribution>
struct test_scatter_view_impl_cls<ExecSpace, Layout, duplication, contribution, Kokkos::Experimental::ScatterMin>
{
public:
typedef Kokkos::Experimental::ScatterView
< double*[3]
, Layout
, ExecSpace
, Kokkos::Experimental::ScatterMin
, duplication
, contribution
> scatter_view_type;
typedef Kokkos::View<double *[3], Layout, ExecSpace> orig_view_type;
scatter_view_type scatter_view;
int scatterSize;
test_scatter_view_impl_cls(const scatter_view_type& view){
scatter_view = view;
scatterSize = 0;
}
void initialize(orig_view_type orig) {
auto host_view = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), orig);
Kokkos::fence();
for (typename decltype(host_view)::size_type i = 0; i < host_view.extent(0); ++i) {
host_view(i, 0) = 999999.0;
host_view(i, 1) = 999999.0;
host_view(i, 2) = 999999.0;
}
Kokkos::fence();
Kokkos::deep_copy(orig, host_view);
}
void run_parallel(int n) {
scatterSize = n;
auto policy = Kokkos::RangePolicy<ExecSpace, int>(0, n);
Kokkos::parallel_for(policy, *this, "scatter_view_test: Min");
}
KOKKOS_INLINE_FUNCTION
void operator()(int i) const {
auto scatter_access = scatter_view.access();
auto scatter_access_atomic = scatter_view.template access<Kokkos::Experimental::ScatterAtomic>();
for (int j = 0; j < 4; ++j) {
auto k = (i + j) % scatterSize;
scatter_access(k, 0).update((double)(j+1)*4);
scatter_access_atomic(k, 1).update((double)(j+1)*2.0);
scatter_access(k, 2).update((double)(j+1)*1.0);
}
}
void validateResults(orig_view_type orig) {
auto host_view = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), orig);
Kokkos::fence();
for (typename decltype(host_view)::size_type i = 0; i < host_view.extent(0); ++i) {
auto val0 = host_view(i, 0);
auto val1 = host_view(i, 1);
auto val2 = host_view(i, 2);
EXPECT_TRUE(std::fabs((val0 - 4.0) / 4.0) < 1e-14);
EXPECT_TRUE(std::fabs((val1 - 2.0) / 2.0) < 1e-14);
EXPECT_TRUE(std::fabs((val2 - 1.0) / 1.0) < 1e-14);
}
}
};
template <typename ExecSpace, typename Layout, int duplication, int contribution>
struct test_scatter_view_impl_cls<ExecSpace, Layout, duplication, contribution, Kokkos::Experimental::ScatterMax>
{
public:
typedef Kokkos::Experimental::ScatterView
< double*[3]
, Layout
, ExecSpace
, Kokkos::Experimental::ScatterMax
, duplication
, contribution
> scatter_view_type;
typedef Kokkos::View<double *[3], Layout, ExecSpace> orig_view_type;
scatter_view_type scatter_view;
int scatterSize;
test_scatter_view_impl_cls(const scatter_view_type& view){
scatter_view = view;
scatterSize = 0;
}
void initialize(orig_view_type orig) {
auto host_view = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), orig);
Kokkos::fence();
for (typename decltype(host_view)::size_type i = 0; i < host_view.extent(0); ++i) {
host_view(i, 0) = 0.0;
host_view(i, 1) = 0.0;
host_view(i, 2) = 0.0;
}
Kokkos::fence();
Kokkos::deep_copy(orig, host_view);
}
void run_parallel(int n) {
scatterSize = n;
auto policy = Kokkos::RangePolicy<ExecSpace, int>(0, n);
Kokkos::parallel_for(policy, *this, "scatter_view_test: Max");
}
KOKKOS_INLINE_FUNCTION
void operator()(int i) const {
auto scatter_access = scatter_view.access();
auto scatter_access_atomic = scatter_view.template access<Kokkos::Experimental::ScatterAtomic>();
for (int j = 0; j < 4; ++j) {
auto k = (i + j) % scatterSize;
scatter_access(k, 0).update((double)(j+1)*4);
scatter_access_atomic(k, 1).update((double)(j+1)*2.0);
scatter_access(k, 2).update((double)(j+1)*1.0);
}
}
void validateResults(orig_view_type orig) {
auto host_view = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), orig);
Kokkos::fence();
for (typename decltype(host_view)::size_type i = 0; i < host_view.extent(0); ++i) {
auto val0 = host_view(i, 0);
auto val1 = host_view(i, 1);
auto val2 = host_view(i, 2);
EXPECT_TRUE(std::fabs((val0 - 16.0) / 16.0) < 1e-14);
EXPECT_TRUE(std::fabs((val1 - 8.0) / 8.0) < 1e-14);
EXPECT_TRUE(std::fabs((val2 - 4.0) / 4.0) < 1e-14);
}
}
};
template <typename ExecSpace, typename Layout, int duplication, int contribution, int op>
struct test_scatter_view_config
{
public:
typedef typename test_scatter_view_impl_cls<ExecSpace, Layout,
duplication, contribution, op>::scatter_view_type scatter_view_def;
typedef typename test_scatter_view_impl_cls<ExecSpace, Layout,
duplication, contribution, op>::orig_view_type orig_view_def;
test_scatter_view_config() {
}
void run_test(int n)
{
//Test creation via create_scatter_view
{
orig_view_def original_view("original_view", n);
scatter_view_def scatter_view = Kokkos::Experimental::create_scatter_view
< op
, duplication
, contribution
> (original_view);
test_scatter_view_impl_cls<ExecSpace, Layout, duplication, contribution, op> scatter_view_test_impl(scatter_view);
scatter_view_test_impl.initialize(original_view);
scatter_view_test_impl.run_parallel(n);
Kokkos::Experimental::contribute(original_view, scatter_view);
scatter_view.reset_except(original_view);
scatter_view_test_impl.run_parallel(n);
Kokkos::Experimental::contribute(original_view, scatter_view);
Kokkos::fence();
scatter_view_test_impl.validateResults(original_view);
{
scatter_view_def persistent_view("persistent", n);
auto result_view = persistent_view.subview();
contribute(result_view, persistent_view);
Kokkos::fence();
}
}
//Test creation via constructor
{
orig_view_def original_view("original_view", n);
scatter_view_def scatter_view(original_view);
test_scatter_view_impl_cls<ExecSpace, Layout, duplication, contribution, op> scatter_view_test_impl(scatter_view);
scatter_view_test_impl.initialize(original_view);
scatter_view_test_impl.run_parallel(n);
Kokkos::Experimental::contribute(original_view, scatter_view);
scatter_view.reset_except(original_view);
scatter_view_test_impl.run_parallel(n);
Kokkos::Experimental::contribute(original_view, scatter_view);
Kokkos::fence();
scatter_view_test_impl.validateResults(original_view);
{
scatter_view_def persistent_view("persistent", n);
auto result_view = persistent_view.subview();
contribute(result_view, persistent_view);
Kokkos::fence();
}
}
}
};
template <typename ExecSpace, int ScatterType>
struct TestDuplicatedScatterView {
TestDuplicatedScatterView(int n) {
// run the duplicated, non-atomic configurations for the given ScatterType
test_scatter_view_config<ExecSpace, Kokkos::LayoutRight,
Kokkos::Experimental::ScatterDuplicated,
Kokkos::Experimental::ScatterNonAtomic>(n);
Kokkos::Experimental::ScatterNonAtomic,
ScatterType> test_sv_right_config;
test_sv_right_config.run_test(n);
test_scatter_view_config<ExecSpace, Kokkos::LayoutLeft,
Kokkos::Experimental::ScatterDuplicated,
Kokkos::Experimental::ScatterNonAtomic,
ScatterType> test_sv_left_config;
test_sv_left_config.run_test(n);
}
};
#ifdef KOKKOS_ENABLE_CUDA
// disable duplicated instantiation with CUDA until
// UniqueToken can support it
template <>
struct TestDuplicatedScatterView<Kokkos::Cuda> {
template <int ScatterType>
struct TestDuplicatedScatterView<Kokkos::Cuda, ScatterType> {
TestDuplicatedScatterView(int) {
}
};
@ -129,14 +437,14 @@ struct TestDuplicatedScatterView<Kokkos::Cuda> {
#ifdef KOKKOS_ENABLE_ROCM
// disable duplicated instantiation with ROCm until
// UniqueToken can support it
template <>
struct TestDuplicatedScatterView<Kokkos::Experimental::ROCm> {
template <int ScatterType>
struct TestDuplicatedScatterView<Kokkos::Experimental::ROCm, ScatterType> {
TestDuplicatedScatterView(int) {
}
};
#endif
template <typename ExecSpace>
template <typename ExecSpace, int ScatterType>
void test_scatter_view(int n)
{
// all of these configurations should compile okay, but only some of them are
@ -149,29 +457,47 @@ void test_scatter_view(int n)
if (unique_token.size() == 1) {
test_scatter_view_config<ExecSpace, Kokkos::LayoutRight,
Kokkos::Experimental::ScatterNonDuplicated,
Kokkos::Experimental::ScatterNonAtomic>(n);
Kokkos::Experimental::ScatterNonAtomic,
ScatterType> test_sv_config;
test_sv_config.run_test(n);
}
#ifdef KOKKOS_ENABLE_SERIAL
if (!std::is_same<ExecSpace, Kokkos::Serial>::value) {
#endif
test_scatter_view_config<ExecSpace, Kokkos::LayoutRight,
Kokkos::Experimental::ScatterNonDuplicated,
Kokkos::Experimental::ScatterAtomic>(n);
Kokkos::Experimental::ScatterAtomic,
ScatterType> test_sv_config;
test_sv_config.run_test(n);
#ifdef KOKKOS_ENABLE_SERIAL
}
#endif
TestDuplicatedScatterView<ExecSpace> duptest(n);
// With hundreds of threads we were running out of memory.
// Limit n so that the duplicated copies don't exceed 8GB in total.
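// e.g. with 512 threads: 8GB / 512 = 16MB per duplicated copy, and
// 16MB / 24B per value allows roughly 700k values per copy.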
constexpr std::size_t maximum_allowed_total_bytes = 8ull * 1024ull * 1024ull * 1024ull;
std::size_t const maximum_allowed_copy_bytes = maximum_allowed_total_bytes / std::size_t(unique_token.size());
constexpr std::size_t bytes_per_value = sizeof(double) * 3;
std::size_t const maximum_allowed_copy_values = maximum_allowed_copy_bytes / bytes_per_value;
n = std::min(n, int(maximum_allowed_copy_values));
TestDuplicatedScatterView<ExecSpace, ScatterType> duptest(n);
}
TEST_F( TEST_CATEGORY, scatterview) {
#ifndef KOKKOS_ENABLE_ROCM
test_scatter_view<TEST_EXECSPACE>(10);
test_scatter_view<TEST_EXECSPACE, Kokkos::Experimental::ScatterSum>(10);
test_scatter_view<TEST_EXECSPACE, Kokkos::Experimental::ScatterProd>(10);
test_scatter_view<TEST_EXECSPACE, Kokkos::Experimental::ScatterMin>(10);
test_scatter_view<TEST_EXECSPACE, Kokkos::Experimental::ScatterMax>(10);
// tests were timing out in DEBUG mode, so reduce the amount of work
#ifdef KOKKOS_ENABLE_DEBUG
test_scatter_view<TEST_EXECSPACE>(100000);
int big_n = 100 * 1000;
#else
test_scatter_view<TEST_EXECSPACE>(10000000);
int big_n = 10 * 1000 * 1000;
#endif
test_scatter_view<TEST_EXECSPACE,Kokkos::Experimental::ScatterSum>(big_n);
test_scatter_view<TEST_EXECSPACE,Kokkos::Experimental::ScatterProd>(big_n);
test_scatter_view<TEST_EXECSPACE,Kokkos::Experimental::ScatterMin>(big_n);
test_scatter_view<TEST_EXECSPACE,Kokkos::Experimental::ScatterMax>(big_n);
#endif
}

View File

@ -69,7 +69,7 @@ struct TestInsert
void testit( bool rehash_on_fail = true )
{
execution_space::fence();
execution_space().fence();
uint32_t failed_count = 0;
do {
@ -82,7 +82,7 @@ struct TestInsert
}
} while (rehash_on_fail && failed_count > 0u);
execution_space::fence();
execution_space().fence();
}
@ -122,9 +122,9 @@ struct TestInsert
void testit()
{
execution_space::fence();
execution_space().fence();
Kokkos::parallel_for(m_num_erase, *this);
execution_space::fence();
execution_space().fence();
}
KOKKOS_INLINE_FUNCTION
@ -161,9 +161,9 @@ struct TestInsert
void testit(value_type &errors)
{
execution_space::execution_space::fence();
execution_space().fence();
Kokkos::parallel_reduce(m_map.capacity(), *this, errors);
execution_space::execution_space::fence();
execution_space().fence();
}
KOKKOS_INLINE_FUNCTION
@ -247,7 +247,7 @@ void test_failed_insert( uint32_t num_nodes)
map_type map(num_nodes);
Impl::TestInsert<map_type> test_insert(map, 2u*num_nodes, 1u);
test_insert.testit(false /*don't rehash on fail*/);
Device::execution_space::fence();
typename Device::execution_space().fence();
EXPECT_TRUE( map.failed_insert() );
}

View File

@ -0,0 +1,47 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include<hpx/TestHPX_Category.hpp>
#include<TestBitset.hpp>

View File

@ -0,0 +1,65 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_TEST_HPX_HPP
#define KOKKOS_TEST_HPX_HPP
#include <gtest/gtest.h>
namespace Test {
class hpx : public ::testing::Test {
protected:
static void SetUpTestCase() {
}
static void TearDownTestCase() {
}
};
} // namespace Test
#define TEST_CATEGORY hpx
#define TEST_EXECSPACE Kokkos::Experimental::HPX
#endif

View File

@ -0,0 +1,47 @@
/* (standard Kokkos license header, identical to the one above) */
#include<hpx/TestHPX_Category.hpp>
#include<TestDualView.hpp>


@ -0,0 +1,47 @@
/* (standard Kokkos license header, identical to the one above) */
#include<hpx/TestHPX_Category.hpp>
#include<TestDynViewAPI_generic.hpp>


@ -0,0 +1,47 @@
/* (standard Kokkos license header, identical to the one above) */
#include<hpx/TestHPX_Category.hpp>
#include<TestDynViewAPI_rank12345.hpp>


@ -0,0 +1,47 @@
/* (standard Kokkos license header, identical to the one above) */
#include<hpx/TestHPX_Category.hpp>
#include<TestDynViewAPI_rank67.hpp>


@ -0,0 +1,47 @@
/* (standard Kokkos license header, identical to the one above) */
#include<hpx/TestHPX_Category.hpp>
#include<TestDynamicView.hpp>


@ -0,0 +1,47 @@
/* (standard Kokkos license header, identical to the one above) */
#include<hpx/TestHPX_Category.hpp>
#include<TestErrorReporter.hpp>


@ -0,0 +1,47 @@
/* (standard Kokkos license header, identical to the one above) */
#include<hpx/TestHPX_Category.hpp>
#include<TestOffsetView.hpp>


@ -0,0 +1,47 @@
/* (standard Kokkos license header, identical to the one above) */
#include<hpx/TestHPX_Category.hpp>
#include<TestScatterView.hpp>


@ -0,0 +1,47 @@
/* (standard Kokkos license header, identical to the one above) */
#include<hpx/TestHPX_Category.hpp>
#include<TestStaticCrsGraph.hpp>


@ -0,0 +1,47 @@
/* (standard Kokkos license header, identical to the one above) */
#include<hpx/TestHPX_Category.hpp>
#include<TestUnorderedMap.hpp>


@ -0,0 +1,47 @@
/* (standard Kokkos license header, identical to the one above) */
#include<hpx/TestHPX_Category.hpp>
#include<TestVector.hpp>


@ -0,0 +1,47 @@
/* (standard Kokkos license header, identical to the one above) */
#include<hpx/TestHPX_Category.hpp>
#include<TestViewCtorPropEmbeddedDim.hpp>


@ -1,5 +1,5 @@
TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
LIB_OPTIONAL_TPLS Pthread CUDA HWLOC QTHREADS DLlib
LIB_OPTIONAL_TPLS Pthread CUDA HWLOC QTHREADS DLlib HPX
TEST_OPTIONAL_TPLS CUSPARSE
)


@ -47,6 +47,7 @@ TRIBITS_ADD_EXECUTABLE(
PerformanceTest_TaskDAG
SOURCES test_taskdag.cpp
COMM serial mpi
TESTONLYLIBS kokkos_gtest ${TEST_LINK_TARGETS}
)
TRIBITS_ADD_TEST(


@ -30,6 +30,7 @@ TARGETS =
#
OBJ_PERF = PerfTestMain.o gtest-all.o
OBJ_PERF += PerfTest_ExecSpacePartitioning.o
OBJ_PERF += PerfTestGramSchmidt.o
OBJ_PERF += PerfTestHexGrad.o
OBJ_PERF += PerfTest_CustomReduction.o


@ -44,6 +44,8 @@
#ifndef KOKKOS_BLAS_KERNELS_HPP
#define KOKKOS_BLAS_KERNELS_HPP
#include <type_traits>
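// <type_traits> added alongside the conversion from the old
// Impl::StaticAssertSame typedef idiom to plain static_assert below.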
namespace Kokkos {
template< class ConstVectorType ,
@ -123,15 +125,10 @@ struct Dot
{
typedef typename Device::execution_space execution_space ;
typedef typename
Impl::StaticAssertSame< Impl::unsigned_< 1 > ,
Impl::unsigned_< Type::Rank > >::type ok_rank ;
static_assert( static_cast<unsigned>(Type::Rank) == static_cast<unsigned>(1),
"Dot static_assert Fail: Rank != 1");
/* typedef typename
Impl::StaticAssertSame< execution_space ,
typename Type::execution_space >::type ok_device ;*/
typedef double value_type ;
#if 1
@ -164,13 +161,8 @@ struct DotSingle
{
typedef typename Device::execution_space execution_space ;
typedef typename
Impl::StaticAssertSame< Impl::unsigned_< 1 > ,
Impl::unsigned_< Type::Rank > >::type ok_rank ;
/* typedef typename
Impl::StaticAssertSame< execution_space ,
typename Type::execution_space >::type ok_device ;*/
static_assert( static_cast<unsigned>(Type::Rank) == static_cast<unsigned>(1),
"DotSingle static_assert Fail: Rank != 1");
typedef double value_type ;
@ -204,25 +196,11 @@ struct Scale
{
typedef typename Device::execution_space execution_space ;
/* typedef typename
Impl::StaticAssertSame< execution_space ,
typename ScalarType::execution_space >::type
ok_scalar_device ;
static_assert( static_cast<unsigned>(ScalarType::Rank) == static_cast<unsigned>(0),
"Scale static_assert Fail: ScalarType::Rank != 0");
typedef typename
Impl::StaticAssertSame< execution_space ,
typename VectorType::execution_space >::type
ok_vector_device ;*/
typedef typename
Impl::StaticAssertSame< Impl::unsigned_< 0 > ,
Impl::unsigned_< ScalarType::Rank > >::type
ok_scalar_rank ;
typedef typename
Impl::StaticAssertSame< Impl::unsigned_< 1 > ,
Impl::unsigned_< VectorType::Rank > >::type
ok_vector_rank ;
static_assert( static_cast<unsigned>(VectorType::Rank) == static_cast<unsigned>(1),
"Scale static_assert Fail: VectorType::Rank != 1");
#if 1
typename ScalarType::const_type alpha ;
@ -251,35 +229,14 @@ struct AXPBY
{
typedef typename Device::execution_space execution_space ;
/* typedef typename
Impl::StaticAssertSame< execution_space ,
typename ScalarType::execution_space >::type
ok_scalar_device ;
static_assert( static_cast<unsigned>(ScalarType::Rank) == static_cast<unsigned>(0),
"AXPBY static_assert Fail: ScalarType::Rank != 0");
typedef typename
Impl::StaticAssertSame< execution_space ,
typename ConstVectorType::execution_space >::type
ok_const_vector_device ;
static_assert( static_cast<unsigned>(ConstVectorType::Rank) == static_cast<unsigned>(1),
"AXPBY static_assert Fail: ConstVectorType::Rank != 1");
typedef typename
Impl::StaticAssertSame< execution_space ,
typename VectorType::execution_space >::type
ok_vector_device ;*/
typedef typename
Impl::StaticAssertSame< Impl::unsigned_< 0 > ,
Impl::unsigned_< ScalarType::Rank > >::type
ok_scalar_rank ;
typedef typename
Impl::StaticAssertSame< Impl::unsigned_< 1 > ,
Impl::unsigned_< ConstVectorType::Rank > >::type
ok_const_vector_rank ;
typedef typename
Impl::StaticAssertSame< Impl::unsigned_< 1 > ,
Impl::unsigned_< VectorType::Rank > >::type
ok_vector_rank ;
static_assert( static_cast<unsigned>(VectorType::Rank) == static_cast<unsigned>(1),
"AXPBY static_assert Fail: VectorType::Rank != 1");
#if 1
typename ScalarType::const_type alpha , beta ;


@ -183,7 +183,7 @@ struct ModifiedGramSchmidt
}
}
execution_space::fence();
execution_space().fence();
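// (instance fence: replaces the static execution_space::fence() form above)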
return timer.seconds();
}


@ -253,12 +253,12 @@ struct HexGrad
double dt_min = 0 ;
Kokkos::parallel_for( count , Init( coord ) );
execution_space::fence();
execution_space().fence();
for ( int i = 0 ; i < iter ; ++i ) {
Kokkos::Timer timer ;
Kokkos::parallel_for( count , HexGrad<execution_space>( coord , grad ) );
execution_space::fence();
execution_space().fence();
const double dt = timer.seconds();
if ( 0 == i ) dt_min = dt ;
else dt_min = dt < dt_min ? dt : dt_min ;


@ -125,15 +125,15 @@ struct MultiDimRangePerf3D
Kokkos::MDRangePolicy<Kokkos::Rank<3, iterate_type::Right, iterate_type::Right>, execution_space > policy(point_type{{0,0,0}},point_type{{icount,jcount,kcount}},tile_type{{Ti,Tj,Tk}} );
Kokkos::parallel_for( policy_initA, Init(Atest, icount, jcount, kcount) );
execution_space::fence();
execution_space().fence();
Kokkos::parallel_for( policy_initB, Init(Btest, icount+2, jcount+2, kcount+2) );
execution_space::fence();
execution_space().fence();
for (int i = 0; i < iter; ++i)
{
Kokkos::Timer timer;
Kokkos::parallel_for( policy, FunctorType(Atest, Btest, icount, jcount, kcount) );
execution_space::fence();
execution_space().fence();
const double dt = timer.seconds();
if ( 0 == i ) dt_min = dt ;
else dt_min = dt < dt_min ? dt : dt_min ;
@ -189,15 +189,15 @@ struct MultiDimRangePerf3D
Kokkos::MDRangePolicy<Kokkos::Rank<3, iterate_type::Left, iterate_type::Left>, execution_space > policy({{0,0,0}},{{icount,jcount,kcount}},{{Ti,Tj,Tk}} );
Kokkos::parallel_for( policy_initA, Init(Atest, icount, jcount, kcount) );
execution_space::fence();
execution_space().fence();
Kokkos::parallel_for( policy_initB, Init(Btest, icount+2, jcount+2, kcount+2) );
execution_space::fence();
execution_space().fence();
for (int i = 0; i < iter; ++i)
{
Kokkos::Timer timer;
Kokkos::parallel_for( policy, FunctorType(Atest, Btest, icount, jcount, kcount) );
execution_space::fence();
execution_space().fence();
const double dt = timer.seconds();
if ( 0 == i ) dt_min = dt ;
else dt_min = dt < dt_min ? dt : dt_min ;
@ -368,15 +368,15 @@ struct RangePolicyCollapseTwo
double dt_min = 0;
Kokkos::parallel_for( policy, Init(Atest,icount,jcount,kcount) );
execution_space::fence();
execution_space().fence();
Kokkos::parallel_for( policy_initB, Init(Btest,icount+2,jcount+2,kcount+2) );
execution_space::fence();
execution_space().fence();
for (int i = 0; i < iter; ++i)
{
Kokkos::Timer timer;
Kokkos::parallel_for(policy, FunctorType(Atest, Btest, icount, jcount, kcount));
execution_space::fence();
execution_space().fence();
const double dt = timer.seconds();
if ( 0 == i ) dt_min = dt ;
else dt_min = dt < dt_min ? dt : dt_min ;
@ -513,15 +513,15 @@ struct RangePolicyCollapseAll
double dt_min = 0;
Kokkos::parallel_for( policy, Init(Atest,icount,jcount,kcount) );
execution_space::fence();
execution_space().fence();
Kokkos::parallel_for( policy_initB, Init(Btest,icount+2,jcount+2,kcount+2) );
execution_space::fence();
execution_space().fence();
for (int i = 0; i < iter; ++i)
{
Kokkos::Timer timer;
Kokkos::parallel_for(policy, FunctorType(Atest, Btest, icount, jcount, kcount));
execution_space::fence();
execution_space().fence();
const double dt = timer.seconds();
if ( 0 == i ) dt_min = dt ;
else dt_min = dt < dt_min ? dt : dt_min ;


@ -0,0 +1,564 @@
#include <Kokkos_Core.hpp>
#include <gtest/gtest.h>
#include <PerfTest_Category.hpp>
namespace Test {
namespace {
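// SpaceInstance: small helper to obtain independent execution space
// instances. The generic fallback just default-constructs the space and
// reports that kernels cannot overlap.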
template<class ExecSpace>
struct SpaceInstance {
static ExecSpace create() {
return ExecSpace();
}
static void destroy(ExecSpace&) {
}
static bool overlap() {
return false;
}
};
#ifndef KOKKOS_ENABLE_DEBUG
#ifdef KOKKOS_ENABLE_CUDA
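// CUDA specialization: each instance wraps its own stream, so kernels
// launched on different instances may overlap. CUDA_LAUNCH_BLOCKING=1
// serializes launches, so overlap() reports false in that case.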
template<>
struct SpaceInstance<Kokkos::Cuda> {
static Kokkos::Cuda create() {
cudaStream_t stream;
cudaStreamCreate(&stream);
return Kokkos::Cuda(stream);
}
static void destroy(Kokkos::Cuda& space) {
cudaStream_t stream = space.cuda_stream();
cudaStreamDestroy(stream);
}
static bool overlap() {
bool value = true;
auto local_rank_str = std::getenv("CUDA_LAUNCH_BLOCKING");
if(local_rank_str) {
value = (std::atoi(local_rank_str)==0);
}
return value;
}
};
#endif
#endif
}
struct FunctorRange {
int M,R;
Kokkos::View<double**,TEST_EXECSPACE> a;
FunctorRange(int M_, int R_, Kokkos::View<double**,TEST_EXECSPACE> a_):M(M_),R(R_),a(a_){}
KOKKOS_INLINE_FUNCTION
void operator() (const int i) const {
for(int r=0;r<R;r++)
for(int j=0;j<M;j++) {
a(i,j)+=1.0;
}
}
};
struct FunctorMDRange {
int M,R;
Kokkos::View<double**,TEST_EXECSPACE> a;
FunctorMDRange(int M_, int R_, Kokkos::View<double**,TEST_EXECSPACE> a_):M(M_),R(R_),a(a_){}
KOKKOS_INLINE_FUNCTION
void operator() (const int i, const int) const {
for(int j=0;j<M;j++)
a(i,j)+=1.0;
}
};
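// FunctorTeam uses hierarchical parallelism: each team handles one row i and
// a TeamThreadRange loop spreads the M column updates across the team.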
struct FunctorTeam {
int M,R;
Kokkos::View<double**,Kokkos::LayoutRight,TEST_EXECSPACE> a;
FunctorTeam(int M_, int R_, Kokkos::View<double**,Kokkos::LayoutRight,TEST_EXECSPACE> a_):M(M_),R(R_),a(a_){}
KOKKOS_INLINE_FUNCTION
void operator() (const Kokkos::TeamPolicy<TEST_EXECSPACE>::member_type& team) const {
int i = team.league_rank();
for(int r=0;r<R;r++) {
Kokkos::parallel_for(Kokkos::TeamThreadRange(team,M), [&] (const int j) {
a(i,j)+=1.0;
});
}
}
};
struct FunctorRangeReduce {
int M,R;
Kokkos::View<double**,TEST_EXECSPACE> a;
FunctorRangeReduce(int M_, int R_, Kokkos::View<double**,TEST_EXECSPACE> a_):M(M_),R(R_),a(a_){}
KOKKOS_INLINE_FUNCTION
void operator() (const int i, double& tmp) const {
for(int r=0;r<R;r++)
for(int j=0;j<M;j++) {
tmp += a(i,j);
}
}
};
struct FunctorMDRangeReduce {
int M,R;
Kokkos::View<double**,TEST_EXECSPACE> a;
FunctorMDRangeReduce(int M_, int R_, Kokkos::View<double**,TEST_EXECSPACE> a_):M(M_),R(R_),a(a_){}
KOKKOS_INLINE_FUNCTION
void operator() (const int i, const int, double& tmp) const {
for(int j=0;j<M;j++)
tmp += a(i,j);
}
};
struct FunctorTeamReduce {
int M,R;
Kokkos::View<double**,Kokkos::LayoutRight,TEST_EXECSPACE> a;
FunctorTeamReduce(int M_, int R_, Kokkos::View<double**,Kokkos::LayoutRight,TEST_EXECSPACE> a_):M(M_),R(R_),a(a_){}
KOKKOS_INLINE_FUNCTION
void operator() (const Kokkos::TeamPolicy<TEST_EXECSPACE>::member_type& team, double& tmp) const {
int i = team.league_rank();
for(int r=0;r<R;r++) {
double val;
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,M), [&] (const int j, double& tmp2) {
tmp2 += a(i,j);
},val);
tmp+=val;
}
}
};
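// Each overlap test launches a pair of kernels first on a single instance,
// then on two independent instances, and compares wall-clock times; when the
// backend supports overlap the two-instance pair should be measurably faster.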
TEST_F( default_exec, overlap_range_policy ) {
int N = 2000;
int M = 10000;
int R = 10;
TEST_EXECSPACE space;
TEST_EXECSPACE space1 = SpaceInstance<TEST_EXECSPACE>::create();
TEST_EXECSPACE space2 = SpaceInstance<TEST_EXECSPACE>::create();
Kokkos::View<double**,TEST_EXECSPACE> a("A",N,M);
FunctorRange f(M,R,a);
FunctorRangeReduce fr(M,R,a);
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel0",
Kokkos::RangePolicy<TEST_EXECSPACE>(0,N), FunctorRange(M,R,a));
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel1",
Kokkos::Experimental::require(
Kokkos::RangePolicy<TEST_EXECSPACE>(space1,0,N),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, f);
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel2",
Kokkos::Experimental::require(
Kokkos::RangePolicy<TEST_EXECSPACE>(space2,0,N),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, f);
Kokkos::fence();
Kokkos::Timer timer;
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel3",
Kokkos::Experimental::require(
Kokkos::RangePolicy<TEST_EXECSPACE>(space,0,N),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, f);
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel4",
Kokkos::Experimental::require(
Kokkos::RangePolicy<TEST_EXECSPACE>(space,0,N),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, f);
Kokkos::fence();
timer.reset();
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel5",
Kokkos::Experimental::require(
Kokkos::RangePolicy<TEST_EXECSPACE>(space1,0,N),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, FunctorRange(M,R,a));
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel6",
Kokkos::Experimental::require(
Kokkos::RangePolicy<TEST_EXECSPACE>(space2,0,N),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, FunctorRange(M,R,a));
Kokkos::fence();
double time_overlap = timer.seconds();
timer.reset();
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel7",
Kokkos::Experimental::require(
Kokkos::RangePolicy<TEST_EXECSPACE>(space,0,N),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, f);
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel8",
Kokkos::Experimental::require(
Kokkos::RangePolicy<TEST_EXECSPACE>(space,0,N),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, f);
Kokkos::fence();
double time_end = timer.seconds();
if(SpaceInstance<TEST_EXECSPACE>::overlap()) {
ASSERT_TRUE( (time_end > 1.5*time_overlap) );
}
printf("Time RangePolicy: NonOverlap: %lf Time Overlap: %lf\n",time_end,time_overlap);
Kokkos::View<double,TEST_EXECSPACE> result("result");
Kokkos::View<double,TEST_EXECSPACE> result1("result1");
Kokkos::View<double,TEST_EXECSPACE> result2("result2");
Kokkos::View<double,Kokkos::HostSpace> h_result("h_result");
Kokkos::View<double,Kokkos::HostSpace> h_result1("h_result1");
Kokkos::View<double,Kokkos::HostSpace> h_result2("h_result2");
timer.reset();
Kokkos::parallel_reduce("default_exec::overlap_range_policy::kernel_reduce",
Kokkos::Experimental::require(
Kokkos::RangePolicy<TEST_EXECSPACE>(space,0,N),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, fr, result);
Kokkos::fence();
double time_fenced = timer.seconds();
Kokkos::deep_copy(h_result,result);
timer.reset();
Kokkos::parallel_reduce("default_exec::overlap_range_policy::kernel_reduce",
Kokkos::Experimental::require(
Kokkos::RangePolicy<TEST_EXECSPACE>(space,0,N),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, fr, result);
double time_not_fenced = timer.seconds();
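// reading the timer before the fence measures only the (asynchronous) launch
// cost; the assertion below checks that execution really proceeds async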
Kokkos::fence();
if(SpaceInstance<TEST_EXECSPACE>::overlap()) {
ASSERT_TRUE(time_fenced>2.0*time_not_fenced);
}
timer.reset();
Kokkos::parallel_reduce("default_exec::overlap_range_policy::kernel_reduce",
Kokkos::Experimental::require(
Kokkos::RangePolicy<TEST_EXECSPACE>(space,0,N),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, fr, result);
Kokkos::parallel_reduce("default_exec::overlap_range_policy::kernel_reduce",
Kokkos::Experimental::require(
Kokkos::RangePolicy<TEST_EXECSPACE>(space,0,N),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, fr, result);
Kokkos::fence();
double time_no_overlapped_reduce = timer.seconds();
timer.reset();
Kokkos::parallel_reduce("default_exec::overlap_range_policy::kernel_reduce",
Kokkos::Experimental::require(
Kokkos::RangePolicy<TEST_EXECSPACE>(space1,0,N),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, fr, result1);
Kokkos::parallel_reduce("default_exec::overlap_range_policy::kernel_reduce",
Kokkos::Experimental::require(
Kokkos::RangePolicy<TEST_EXECSPACE>(space2,0,N),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, fr, result2);
Kokkos::fence();
double time_overlapped_reduce = timer.seconds();
Kokkos::deep_copy(h_result2,result2);
Kokkos::deep_copy(h_result1,result1);
ASSERT_EQ(h_result1(),h_result());
ASSERT_EQ(h_result2(),h_result());
if(SpaceInstance<TEST_EXECSPACE>::overlap()) {
ASSERT_TRUE(time_overlapped_reduce < 1.5*time_no_overlapped_reduce);
}
printf("Time RangePolicy Reduce: NonOverlap: %lf Time Overlap: %lf\n",time_no_overlapped_reduce,time_overlapped_reduce);
SpaceInstance<TEST_EXECSPACE>::destroy(space1);
SpaceInstance<TEST_EXECSPACE>::destroy(space2);
}
TEST_F( default_exec, overlap_mdrange_policy ) {
int N = 200;
int M = 10000;
int R = 10;
TEST_EXECSPACE space;
TEST_EXECSPACE space1 = SpaceInstance<TEST_EXECSPACE>::create();
TEST_EXECSPACE space2 = SpaceInstance<TEST_EXECSPACE>::create();
Kokkos::View<double**,TEST_EXECSPACE> a("A",N,M);
FunctorMDRange f(M,R,a);
FunctorMDRangeReduce fr(M,R,a);
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel0",
Kokkos::Experimental::require(
Kokkos::MDRangePolicy<TEST_EXECSPACE,Kokkos::Rank<2>>({0,0},{N,R}),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, FunctorMDRange(M,R,a));
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel1",
Kokkos::Experimental::require(
Kokkos::MDRangePolicy<TEST_EXECSPACE,Kokkos::Rank<2>>(space1,{0,0},{N,R}),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, f);
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel2",
Kokkos::Experimental::require(
Kokkos::MDRangePolicy<TEST_EXECSPACE,Kokkos::Rank<2>>(space2,{0,0},{N,R}),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, f);
Kokkos::fence();
Kokkos::Timer timer;
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel3",
Kokkos::Experimental::require(
Kokkos::MDRangePolicy<TEST_EXECSPACE,Kokkos::Rank<2>>(space,{0,0},{N,R}),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, f);
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel4",
Kokkos::Experimental::require(
Kokkos::MDRangePolicy<TEST_EXECSPACE,Kokkos::Rank<2>>(space,{0,0},{N,R}),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, f);
Kokkos::fence();
timer.reset();
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel5",
Kokkos::Experimental::require(
Kokkos::MDRangePolicy<TEST_EXECSPACE,Kokkos::Rank<2>>(space1,{0,0},{N,R}),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, FunctorMDRange(M,R,a));
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel6",
Kokkos::Experimental::require(
Kokkos::MDRangePolicy<TEST_EXECSPACE,Kokkos::Rank<2>>(space2,{0,0},{N,R}),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, FunctorMDRange(M,R,a));
Kokkos::fence();
double time_overlap = timer.seconds();
timer.reset();
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel7",
Kokkos::Experimental::require(
Kokkos::MDRangePolicy<TEST_EXECSPACE,Kokkos::Rank<2>>(space,{0,0},{N,R}),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, f);
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel8",
Kokkos::Experimental::require(
Kokkos::MDRangePolicy<TEST_EXECSPACE,Kokkos::Rank<2>>(space,{0,0},{N,R}),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, f);
Kokkos::fence();
double time_end = timer.seconds();
if(SpaceInstance<TEST_EXECSPACE>::overlap()) {
ASSERT_TRUE( (time_end > 1.5*time_overlap) );
}
printf("Time MDRangePolicy: NonOverlap: %lf Time Overlap: %lf\n",time_end,time_overlap);
Kokkos::View<double,TEST_EXECSPACE> result("result");
Kokkos::View<double,TEST_EXECSPACE> result1("result1");
Kokkos::View<double,TEST_EXECSPACE> result2("result2");
Kokkos::View<double,Kokkos::HostSpace> h_result("h_result");
Kokkos::View<double,Kokkos::HostSpace> h_result1("h_result1");
Kokkos::View<double,Kokkos::HostSpace> h_result2("h_result2");
timer.reset();
Kokkos::parallel_reduce("default_exec::overlap_mdrange_policy::kernel_reduce",
Kokkos::Experimental::require(
Kokkos::MDRangePolicy<TEST_EXECSPACE,Kokkos::Rank<2>>(space,{0,0},{N,R}),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, fr, result);
Kokkos::fence();
double time_fenced = timer.seconds();
Kokkos::deep_copy(h_result,result);
timer.reset();
Kokkos::parallel_reduce("default_exec::overlap_mdrange_policy::kernel_reduce",
Kokkos::Experimental::require(
Kokkos::MDRangePolicy<TEST_EXECSPACE,Kokkos::Rank<2>>(space,{0,0},{N,R}),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, fr, result);
double time_not_fenced = timer.seconds();
Kokkos::fence();
if(SpaceInstance<TEST_EXECSPACE>::overlap()) {
ASSERT_TRUE(time_fenced>2.0*time_not_fenced);
}
timer.reset();
Kokkos::parallel_reduce("default_exec::overlap_mdrange_policy::kernel_reduce",
Kokkos::Experimental::require(
Kokkos::MDRangePolicy<TEST_EXECSPACE,Kokkos::Rank<2>>(space,{0,0},{N,R}),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, fr, result);
Kokkos::parallel_reduce("default_exec::overlap_mdrange_policy::kernel_reduce",
Kokkos::Experimental::require(
Kokkos::MDRangePolicy<TEST_EXECSPACE,Kokkos::Rank<2>>(space,{0,0},{N,R}),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, fr, result);
Kokkos::fence();
double time_no_overlapped_reduce = timer.seconds();
timer.reset();
Kokkos::parallel_reduce("default_exec::overlap_mdrange_policy::kernel_reduce",
Kokkos::Experimental::require(
Kokkos::MDRangePolicy<TEST_EXECSPACE,Kokkos::Rank<2>>(space1,{0,0},{N,R}),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, fr, result1);
Kokkos::parallel_reduce("default_exec::overlap_mdrange_policy::kernel_reduce",
Kokkos::Experimental::require(
Kokkos::MDRangePolicy<TEST_EXECSPACE,Kokkos::Rank<2>>(space2,{0,0},{N,R}),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, fr, result2);
Kokkos::fence();
double time_overlapped_reduce = timer.seconds();
Kokkos::deep_copy(h_result2,result2);
Kokkos::deep_copy(h_result1,result1);
ASSERT_EQ(h_result1(),h_result());
ASSERT_EQ(h_result2(),h_result());
if(SpaceInstance<TEST_EXECSPACE>::overlap()) {
ASSERT_TRUE(time_overlapped_reduce < 1.5*time_no_overlapped_reduce);
}
printf("Time MDRangePolicy Reduce: NonOverlap: %lf Time Overlap: %lf\n",time_no_overlapped_reduce,time_overlapped_reduce);
SpaceInstance<TEST_EXECSPACE>::destroy(space2);
SpaceInstance<TEST_EXECSPACE>::destroy(space1);
}
TEST_F( default_exec, overlap_team_policy ) {
int N = 20;
int M = 1000000;
int R = 10;
TEST_EXECSPACE space;
TEST_EXECSPACE space1 = SpaceInstance<TEST_EXECSPACE>::create();
TEST_EXECSPACE space2 = SpaceInstance<TEST_EXECSPACE>::create();
Kokkos::View<double**,Kokkos::LayoutRight,TEST_EXECSPACE> a("A",N,M);
FunctorTeam f(M,R,a);
FunctorTeamReduce fr(M,R,a);
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel0",
Kokkos::Experimental::require(
Kokkos::TeamPolicy<TEST_EXECSPACE>(N,Kokkos::AUTO),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, FunctorTeam(M,R,a));
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel1",
Kokkos::Experimental::require(
Kokkos::TeamPolicy<TEST_EXECSPACE>(space1,N,Kokkos::AUTO),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, f);
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel2",
Kokkos::Experimental::require(
Kokkos::TeamPolicy<TEST_EXECSPACE>(space2,N,Kokkos::AUTO),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, f);
Kokkos::fence();
Kokkos::Timer timer;
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel3",
Kokkos::Experimental::require(
Kokkos::TeamPolicy<TEST_EXECSPACE>(space,N,Kokkos::AUTO),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, f);
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel4",
Kokkos::Experimental::require(
Kokkos::TeamPolicy<TEST_EXECSPACE>(space,N,Kokkos::AUTO),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, f);
Kokkos::fence();
timer.reset();
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel5",
Kokkos::Experimental::require(
Kokkos::TeamPolicy<TEST_EXECSPACE>(space1,N,Kokkos::AUTO),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, FunctorTeam(M,R,a));
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel6",
Kokkos::Experimental::require(
Kokkos::TeamPolicy<TEST_EXECSPACE>(space2,N,Kokkos::AUTO),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, FunctorTeam(M,R,a));
Kokkos::fence();
double time_overlap = timer.seconds();
timer.reset();
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel7",
Kokkos::Experimental::require(
Kokkos::TeamPolicy<TEST_EXECSPACE>(space,N,Kokkos::AUTO),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, f);
Kokkos::parallel_for("default_exec::overlap_range_policy::kernel8",
Kokkos::Experimental::require(
Kokkos::TeamPolicy<TEST_EXECSPACE>(space,N,Kokkos::AUTO),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, f);
Kokkos::fence();
double time_end = timer.seconds();
if(SpaceInstance<TEST_EXECSPACE>::overlap()) {
ASSERT_TRUE( (time_end > 1.5*time_overlap) );
}
printf("Time TeamPolicy: NonOverlap: %lf Time Overlap: %lf\n",time_end,time_overlap);
Kokkos::View<double,TEST_EXECSPACE> result("result");
Kokkos::View<double,TEST_EXECSPACE> result1("result1");
Kokkos::View<double,TEST_EXECSPACE> result2("result2");
Kokkos::View<double,Kokkos::HostSpace> h_result("h_result");
Kokkos::View<double,Kokkos::HostSpace> h_result1("h_result1");
Kokkos::View<double,Kokkos::HostSpace> h_result2("h_result2");
timer.reset();
Kokkos::parallel_reduce("default_exec::overlap_team_policy::kernel_reduce",
Kokkos::Experimental::require(
Kokkos::TeamPolicy<TEST_EXECSPACE>(space,N,Kokkos::AUTO),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, fr, result);
Kokkos::fence();
double time_fenced = timer.seconds();
Kokkos::deep_copy(h_result,result);
timer.reset();
Kokkos::parallel_reduce("default_exec::overlap_team_policy::kernel_reduce",
Kokkos::Experimental::require(
Kokkos::TeamPolicy<TEST_EXECSPACE>(space,N,Kokkos::AUTO),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, fr, result);
double time_not_fenced = timer.seconds();
Kokkos::fence();
if(SpaceInstance<TEST_EXECSPACE>::overlap()) {
ASSERT_TRUE(time_fenced>2.0*time_not_fenced);
}
timer.reset();
Kokkos::parallel_reduce("default_exec::overlap_team_policy::kernel_reduce",
Kokkos::Experimental::require(
Kokkos::TeamPolicy<TEST_EXECSPACE>(space,N,Kokkos::AUTO),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, fr, result);
Kokkos::parallel_reduce("default_exec::overlap_team_policy::kernel_reduce",
Kokkos::Experimental::require(
Kokkos::TeamPolicy<TEST_EXECSPACE>(space,N,Kokkos::AUTO),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, fr, result);
Kokkos::fence();
double time_no_overlapped_reduce = timer.seconds();
timer.reset();
Kokkos::parallel_reduce("default_exec::overlap_team_policy::kernel_reduce",
Kokkos::Experimental::require(
Kokkos::TeamPolicy<TEST_EXECSPACE>(space1,N,Kokkos::AUTO),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, fr, result1);
Kokkos::parallel_reduce("default_exec::overlap_team_policy::kernel_reduce",
Kokkos::Experimental::require(
Kokkos::TeamPolicy<TEST_EXECSPACE>(space2,N,Kokkos::AUTO),
Kokkos::Experimental::WorkItemProperty::HintLightWeight)
, fr, result2);
Kokkos::fence();
double time_overlapped_reduce = timer.seconds();
Kokkos::deep_copy(h_result2,result2);
Kokkos::deep_copy(h_result1,result1);
ASSERT_EQ(h_result1(),h_result());
ASSERT_EQ(h_result2(),h_result());
if(SpaceInstance<TEST_EXECSPACE>::overlap()) {
ASSERT_TRUE(time_overlapped_reduce < 1.5*time_no_overlapped_reduce);
}
printf("Time TeamPolicy Reduce: NonOverlap: %lf Time Overlap: %lf\n",time_no_overlapped_reduce,time_overlapped_reduce);
SpaceInstance<TEST_EXECSPACE>::destroy(space1);
SpaceInstance<TEST_EXECSPACE>::destroy(space2);
}
}


@ -121,6 +121,7 @@ void run_allocateview_tests(int N, int R) {
Kokkos::parallel_for(N8, KOKKOS_LAMBDA (const int& i) {
a_ptr[i] = 0.0;
});
Kokkos::fence();
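// the fence above ensures the asynchronous initialization kernel has
// completed before the buffer is freed and the iteration is timed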
Kokkos::kokkos_free(a_ptr);
}
time_raw = timer.seconds()/R;


@ -95,6 +95,7 @@ void run_deepcopyview_tests123(int N, int R) {
a_ptr[i] = b_ptr[i];
});
}
Kokkos::fence();
time_raw = timer.seconds()/R;
}
#endif
@ -135,6 +136,7 @@ void run_deepcopyview_tests45(int N, int R) {
a_ptr[i] = b_ptr[i];
});
}
Kokkos::fence();
time_raw = timer.seconds()/R;
}
#endif
@ -169,6 +171,7 @@ void run_deepcopyview_tests6(int N, int R) {
a_ptr[i] = b_ptr[i];
});
}
Kokkos::fence();
time_raw = timer.seconds()/R;
}
#endif
@ -202,6 +205,7 @@ void run_deepcopyview_tests7(int N, int R) {
a_ptr[i] = b_ptr[i];
});
}
Kokkos::fence();
time_raw = timer.seconds()/R;
}
#endif
@ -235,6 +239,7 @@ void run_deepcopyview_tests8(int N, int R) {
a_ptr[i] = b_ptr[i];
});
}
Kokkos::fence();
time_raw = timer.seconds()/R;
}
#endif


@ -90,6 +90,7 @@ void run_fillview_tests123(int N, int R) {
a_ptr[i] = 1.1;
});
}
Kokkos::fence();
time_raw = timer.seconds()/R;
}
#endif
@ -126,6 +127,7 @@ void run_fillview_tests45(int N, int R) {
a_ptr[i] = 1.1;
});
}
Kokkos::fence();
time_raw = timer.seconds()/R;
}
#endif
@ -157,6 +159,7 @@ void run_fillview_tests6(int N, int R) {
a_ptr[i] = 1.1;
});
}
Kokkos::fence();
time_raw = timer.seconds()/R;
}
#endif
@ -187,6 +190,7 @@ void run_fillview_tests7(int N, int R) {
a_ptr[i] = 1.1;
});
}
Kokkos::fence();
time_raw = timer.seconds()/R;
}
#endif
@ -217,6 +221,7 @@ void run_fillview_tests8(int N, int R) {
a_ptr[i] = 1.1;
});
}
Kokkos::fence();
time_raw = timer.seconds()/R;
}
#endif


@ -95,7 +95,9 @@ void run_resizeview_tests123(int N, int R) {
Kokkos::parallel_for(N8, KOKKOS_LAMBDA (const int& i) {
a1_ptr[i] = a_ptr[i];
});
Kokkos::fence();
}
Kokkos::fence();
time_raw = timer.seconds()/R;
}
#endif
@ -143,7 +145,9 @@ void run_resizeview_tests45(int N, int R) {
Kokkos::parallel_for(N8, KOKKOS_LAMBDA (const int& i) {
a1_ptr[i] = a_ptr[i];
});
Kokkos::fence();
}
Kokkos::fence();
time_raw = timer.seconds()/R;
}
#endif
@ -181,7 +185,9 @@ void run_resizeview_tests6(int N, int R) {
Kokkos::parallel_for(N8, KOKKOS_LAMBDA (const int& i) {
a1_ptr[i] = a_ptr[i];
});
Kokkos::fence();
}
Kokkos::fence();
time_raw = timer.seconds()/R;
}
#endif
@ -218,7 +224,9 @@ void run_resizeview_tests7(int N, int R) {
Kokkos::parallel_for(N8, KOKKOS_LAMBDA (const int& i) {
a1_ptr[i] = a_ptr[i];
});
Kokkos::fence();
}
Kokkos::fence();
time_raw = timer.seconds()/R;
}
#endif
@ -255,7 +263,9 @@ void run_resizeview_tests8(int N, int R) {
Kokkos::parallel_for(N8, KOKKOS_LAMBDA (const int& i) {
a1_ptr[i] = a_ptr[i];
});
Kokkos::fence();
}
Kokkos::fence();
time_raw = timer.seconds()/R;
}
#endif


@ -69,7 +69,7 @@ typedef Kokkos::DefaultExecutionSpace exec_space;
#define WHITE 8
void textcolor(int attr, int fg, int bg)
{ char command[13];
{ char command[40];
/* Command is the control command to the terminal */
sprintf(command, "%c[%d;%d;%dm", 0x1B, attr, fg + 30, bg + 40);
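// the formatted escape sequence "\x1B[%d;%d;%dm" can exceed the old
// 13-byte buffer once the arguments are substituted, hence the larger array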
@ -85,7 +85,7 @@ struct ZeroFunctor{
typedef typename Kokkos::View<T,execution_space>::HostMirror h_type;
type data;
KOKKOS_INLINE_FUNCTION
void operator()(int i) const {
void operator()(int) const {
data() = 0;
}
};
@ -101,7 +101,7 @@ struct AddFunctor{
type data;
KOKKOS_INLINE_FUNCTION
void operator()(int i) const {
void operator()(int) const {
Kokkos::atomic_fetch_add(&data(),(T)1);
}
};
@ -113,12 +113,12 @@ T AddLoop(int loop) {
typename ZeroFunctor<T,exec_space>::h_type h_data("HData");
f_zero.data = data;
Kokkos::parallel_for(1,f_zero);
exec_space::fence();
exec_space().fence();
struct AddFunctor<T,exec_space> f_add;
f_add.data = data;
Kokkos::parallel_for(loop,f_add);
exec_space::fence();
exec_space().fence();
Kokkos::deep_copy(h_data,data);
T val = h_data();
@ -132,7 +132,7 @@ struct AddNonAtomicFunctor{
type data;
KOKKOS_INLINE_FUNCTION
void operator()(int i) const {
void operator()(int) const {
data()+=(T)1;
}
};
@ -145,12 +145,12 @@ T AddLoopNonAtomic(int loop) {
f_zero.data = data;
Kokkos::parallel_for(1,f_zero);
exec_space::fence();
exec_space().fence();
struct AddNonAtomicFunctor<T,exec_space> f_add;
f_add.data = data;
Kokkos::parallel_for(loop,f_add);
exec_space::fence();
exec_space().fence();
Kokkos::deep_copy(h_data,data);
T val = h_data();
@ -178,7 +178,7 @@ struct CASFunctor{
type data;
KOKKOS_INLINE_FUNCTION
void operator()(int i) const {
void operator()(int) const {
T old = data();
T newval, assumed;
do {
@ -197,12 +197,12 @@ T CASLoop(int loop) {
typename ZeroFunctor<T,exec_space>::h_type h_data("HData");
f_zero.data = data;
Kokkos::parallel_for(1,f_zero);
exec_space::fence();
exec_space().fence();
struct CASFunctor<T,exec_space> f_cas;
f_cas.data = data;
Kokkos::parallel_for(loop,f_cas);
exec_space::fence();
exec_space().fence();
Kokkos::deep_copy(h_data,data);
T val = h_data();
@ -217,7 +217,7 @@ struct CASNonAtomicFunctor{
type data;
KOKKOS_INLINE_FUNCTION
void operator()(int i) const {
void operator()(int) const {
volatile T assumed;
volatile T newval;
bool fail=1;
@ -240,12 +240,12 @@ T CASLoopNonAtomic(int loop) {
typename ZeroFunctor<T,exec_space>::h_type h_data("HData");
f_zero.data = data;
Kokkos::parallel_for(1,f_zero);
exec_space::fence();
exec_space().fence();
struct CASNonAtomicFunctor<T,exec_space> f_cas;
f_cas.data = data;
Kokkos::parallel_for(loop,f_cas);
exec_space::fence();
exec_space().fence();
Kokkos::deep_copy(h_data,data);
T val = h_data();
@ -296,19 +296,19 @@ T ExchLoop(int loop) {
typename ZeroFunctor<T,exec_space>::h_type h_data("HData");
f_zero.data = data;
Kokkos::parallel_for(1,f_zero);
exec_space::fence();
exec_space().fence();
typename ZeroFunctor<T,exec_space>::type data2("Data");
typename ZeroFunctor<T,exec_space>::h_type h_data2("HData");
f_zero.data = data2;
Kokkos::parallel_for(1,f_zero);
exec_space::fence();
exec_space().fence();
struct ExchFunctor<T,exec_space> f_exch;
f_exch.data = data;
f_exch.data2 = data2;
Kokkos::parallel_for(loop,f_exch);
exec_space::fence();
exec_space().fence();
Kokkos::deep_copy(h_data,data);
Kokkos::deep_copy(h_data2,data2);
@ -339,19 +339,19 @@ T ExchLoopNonAtomic(int loop) {
typename ZeroFunctor<T,exec_space>::h_type h_data("HData");
f_zero.data = data;
Kokkos::parallel_for(1,f_zero);
exec_space::fence();
exec_space().fence();
typename ZeroFunctor<T,exec_space>::type data2("Data");
typename ZeroFunctor<T,exec_space>::h_type h_data2("HData");
f_zero.data = data2;
Kokkos::parallel_for(1,f_zero);
exec_space::fence();
exec_space().fence();
struct ExchNonAtomicFunctor<T,exec_space> f_exch;
f_exch.data = data;
f_exch.data2 = data2;
Kokkos::parallel_for(loop,f_exch);
exec_space::fence();
exec_space().fence();
Kokkos::deep_copy(h_data,data);
Kokkos::deep_copy(h_data2,data2);

View File

@ -153,6 +153,7 @@ struct TestFunctor {
typedef Kokkos::RangePolicy< ExecSpace , TagDel > policy ;
Kokkos::parallel_for( policy(0,range_iter), *this );
Kokkos::fence();
}
//----------------------------------------

View File

@ -92,27 +92,26 @@ long fib_alloc_count( long n )
return count[ n & mask ];
}
template< class Space >
template< class Scheduler >
struct TestFib {
using Scheduler = Kokkos::TaskScheduler< Space > ;
using MemorySpace = typename Scheduler::memory_space ;
using MemberType = typename Scheduler::member_type ;
using FutureType = Kokkos::Future< long , Space > ;
using FutureType = Kokkos::BasicFuture< long , Scheduler > ;
typedef long value_type ;
Scheduler sched ;
FutureType dep[2] ;
const value_type n ;
KOKKOS_INLINE_FUNCTION
TestFib( const Scheduler & arg_sched , const value_type arg_n )
: sched( arg_sched ), dep{} , n( arg_n ) {}
TestFib( const value_type arg_n )
: dep{} , n( arg_n ) {}
KOKKOS_INLINE_FUNCTION
void operator()( const MemberType & , value_type & result ) noexcept
void operator()( MemberType & member, value_type & result ) noexcept
{
auto& sched = member.scheduler();
if ( n < 2 ) {
result = n ;
}
@ -126,13 +125,13 @@ struct TestFib {
dep[1] = Kokkos::task_spawn
( Kokkos::TaskSingle( sched, Kokkos::TaskPriority::High )
, TestFib( sched, n - 2 ) );
, TestFib( n - 2 ) );
dep[0] = Kokkos::task_spawn
( Kokkos::TaskSingle( sched )
, TestFib( sched, n - 1 ) );
, TestFib( n - 1 ) );
Kokkos::Future< ExecSpace > fib_all = Kokkos::when_all( dep, 2 );
auto fib_all = sched.when_all( dep, 2 );
if ( ! dep[0].is_null() && ! dep[1].is_null() && ! fib_all.is_null() ) {
// High priority to retire this branch.
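A condensed sketch of the updated spawn pattern (assuming Kokkos::DefaultExecutionSpace and an already-constructed scheduler instance named sched, as in main() below): the functor no longer carries the scheduler; tasks recover it from the team member at run time, and futures are now typed on the scheduler rather than the space.
using Scheduler  = Kokkos::TaskSchedulerMultiple<Kokkos::DefaultExecutionSpace>;
using FutureType = Kokkos::BasicFuture<long, Scheduler>;
FutureType f = Kokkos::host_spawn(Kokkos::TaskSingle(sched),
                                  TestFib<Scheduler>(30 /* n */));
Kokkos::wait(sched);
const long fib30 = f.get();  // 832040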
@ -202,13 +201,15 @@ int main( int argc , char* argv[] )
return -1;
}
typedef TestFib< ExecSpace > Functor ;
using Scheduler = Kokkos::TaskSchedulerMultiple<ExecSpace>;
typedef TestFib< Scheduler > Functor ;
Kokkos::initialize(argc,argv);
{
Functor::Scheduler sched( Functor::MemorySpace()
Scheduler sched( Functor::MemorySpace()
, total_alloc_size
, min_block_size
, max_block_size
@ -217,21 +218,21 @@ int main( int argc , char* argv[] )
Functor::FutureType f =
Kokkos::host_spawn( Kokkos::TaskSingle( sched )
, Functor( sched , fib_input )
, Functor( fib_input )
);
Kokkos::wait( sched );
test_result = f.get();
task_count_max = sched.allocated_task_count_max();
task_count_accum = sched.allocated_task_count_accum();
//task_count_max = sched.allocated_task_count_max();
//task_count_accum = sched.allocated_task_count_accum();
if ( number_alloc != task_count_accum ) {
std::cout << " number_alloc( " << number_alloc << " )"
<< " != task_count_accum( " << task_count_accum << " )"
<< std::endl ;
}
//if ( number_alloc != task_count_accum ) {
// std::cout << " number_alloc( " << number_alloc << " )"
// << " != task_count_accum( " << task_count_accum << " )"
// << std::endl ;
//}
if ( fib_output != test_result ) {
std::cout << " answer( " << fib_output << " )"
@ -239,7 +240,7 @@ int main( int argc , char* argv[] )
<< std::endl ;
}
if ( fib_output != test_result || number_alloc != task_count_accum ) {
if ( fib_output != test_result) { // || number_alloc != task_count_accum ) {
printf(" TEST FAILED\n");
return -1;
}
@ -252,7 +253,7 @@ int main( int argc , char* argv[] )
Functor::FutureType ftmp =
Kokkos::host_spawn( Kokkos::TaskSingle( sched )
, Functor( sched , fib_input )
, Functor( fib_input )
);
Kokkos::wait( sched );

View File

@ -61,6 +61,16 @@ IF(KOKKOS_LEGACY_TRIBITS)
#-----------------------------------------------------------------------------
FILE(GLOB HEADERS_HPX HPX/*.hpp)
FILE(GLOB SOURCES_HPX HPX/*.cpp)
LIST(APPEND HEADERS_PRIVATE ${HEADERS_HPX} )
LIST(APPEND SOURCES ${SOURCES_HPX} )
INSTALL(FILES ${HEADERS_HPX} DESTINATION ${TRILINOS_INCDIR}/HPX/)
#-----------------------------------------------------------------------------
FILE(GLOB HEADERS_CUDA Cuda/*.hpp)
FILE(GLOB SOURCES_CUDA Cuda/*.cpp)

View File

@ -1,419 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CUDAEXEC_HPP
#define KOKKOS_CUDAEXEC_HPP
#include <Kokkos_Macros.hpp>
#ifdef KOKKOS_ENABLE_CUDA
#include <string>
#include <cstdint>
#include <Kokkos_Parallel.hpp>
#include <impl/Kokkos_Error.hpp>
#include <Cuda/Kokkos_Cuda_abort.hpp>
#include <Cuda/Kokkos_Cuda_Error.hpp>
#include <Cuda/Kokkos_Cuda_Locks.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
struct CudaTraits {
enum { WarpSize = 32 /* 0x0020 */ };
enum { WarpIndexMask = 0x001f /* Mask for warpindex */ };
enum { WarpIndexShift = 5 /* WarpSize == 1 << WarpIndexShift */ };
enum { SharedMemoryBanks = 32 /* Compute device 2.0 */ };
enum { SharedMemoryCapacity = 0x0C000 /* 48k shared / 16k L1 Cache */ };
enum { SharedMemoryUsage = 0x04000 /* 16k shared / 48k L1 Cache */ };
enum { UpperBoundGridCount = 65535 /* Hard upper bound */ };
enum { ConstantMemoryCapacity = 0x010000 /* 64k bytes */ };
enum { ConstantMemoryUsage = 0x008000 /* 32k bytes */ };
enum { ConstantMemoryCache = 0x002000 /* 8k bytes */ };
typedef unsigned long
ConstantGlobalBufferType[ ConstantMemoryUsage / sizeof(unsigned long) ];
enum { ConstantMemoryUseThreshold = 0x000200 /* 512 bytes */ };
KOKKOS_INLINE_FUNCTION static
CudaSpace::size_type warp_count( CudaSpace::size_type i )
{ return ( i + WarpIndexMask ) >> WarpIndexShift ; }
KOKKOS_INLINE_FUNCTION static
CudaSpace::size_type warp_align( CudaSpace::size_type i )
{
enum { Mask = ~CudaSpace::size_type( WarpIndexMask ) };
return ( i + WarpIndexMask ) & Mask ;
}
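// Worked examples: warp_count(33) = (33 + 31) >> 5 = 2 warps, and
// warp_align(33) = (33 + 31) & ~31 = 64, the next multiple of WarpSize.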
};
//----------------------------------------------------------------------------
CudaSpace::size_type cuda_internal_multiprocessor_count();
CudaSpace::size_type cuda_internal_maximum_warp_count();
CudaSpace::size_type cuda_internal_maximum_grid_count();
CudaSpace::size_type cuda_internal_maximum_shared_words();
CudaSpace::size_type cuda_internal_maximum_concurrent_block_count();
CudaSpace::size_type * cuda_internal_scratch_flags( const CudaSpace::size_type size );
CudaSpace::size_type * cuda_internal_scratch_space( const CudaSpace::size_type size );
CudaSpace::size_type * cuda_internal_scratch_unified( const CudaSpace::size_type size );
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#if defined( __CUDACC__ )
/** \brief Access to constant memory on the device */
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
__device__ __constant__
extern unsigned long kokkos_impl_cuda_constant_memory_buffer[] ;
#else
__device__ __constant__
unsigned long kokkos_impl_cuda_constant_memory_buffer[ Kokkos::Impl::CudaTraits::ConstantMemoryUsage / sizeof(unsigned long) ] ;
#endif
namespace Kokkos {
namespace Impl {
void* cuda_resize_scratch_space(std::int64_t bytes, bool force_shrink = false);
}
}
template< typename T >
inline
__device__
T * kokkos_impl_cuda_shared_memory()
{ extern __shared__ Kokkos::CudaSpace::size_type sh[]; return (T*) sh ; }
namespace Kokkos {
namespace Impl {
//----------------------------------------------------------------------------
// See section B.17 of Cuda C Programming Guide Version 3.2
// for discussion of
// __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
// function qualifier which could be used to improve performance.
//----------------------------------------------------------------------------
// Maximize L1 cache and minimize shared memory:
// cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1 );
// For 2.0 capability: 48 KB L1 and 16 KB shared
//----------------------------------------------------------------------------
template< class DriverType>
__global__
static void cuda_parallel_launch_constant_memory()
{
const DriverType & driver =
*((const DriverType *) kokkos_impl_cuda_constant_memory_buffer );
driver();
}
template< class DriverType, unsigned int maxTperB, unsigned int minBperSM >
__global__
__launch_bounds__(maxTperB, minBperSM)
static void cuda_parallel_launch_constant_memory()
{
const DriverType & driver =
*((const DriverType *) kokkos_impl_cuda_constant_memory_buffer );
driver();
}
template< class DriverType>
__global__
static void cuda_parallel_launch_local_memory( const DriverType driver )
{
driver();
}
template< class DriverType, unsigned int maxTperB, unsigned int minBperSM >
__global__
__launch_bounds__(maxTperB, minBperSM)
static void cuda_parallel_launch_local_memory( const DriverType driver )
{
driver();
}
template < class DriverType
, class LaunchBounds = Kokkos::LaunchBounds<>
, bool Large = ( CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType) ) >
struct CudaParallelLaunch ;
template < class DriverType
, unsigned int MaxThreadsPerBlock
, unsigned int MinBlocksPerSM >
struct CudaParallelLaunch< DriverType
, Kokkos::LaunchBounds< MaxThreadsPerBlock
, MinBlocksPerSM >
, true >
{
inline
CudaParallelLaunch( const DriverType & driver
, const dim3 & grid
, const dim3 & block
, const int shmem
, const cudaStream_t stream = 0 )
{
if ( (grid.x != 0) && ( ( block.x * block.y * block.z ) != 0 ) ) {
if ( sizeof( Kokkos::Impl::CudaTraits::ConstantGlobalBufferType ) <
sizeof( DriverType ) ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: Functor is too large") );
}
// Fence before changing settings and copying closure
Kokkos::Cuda::fence();
if ( CudaTraits::SharedMemoryCapacity < shmem ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
}
#ifndef KOKKOS_ARCH_KEPLER
// On Kepler the L1 has no benefit since it doesn't cache reads
else {
CUDA_SAFE_CALL(
cudaFuncSetCacheConfig
( cuda_parallel_launch_constant_memory
< DriverType, MaxThreadsPerBlock, MinBlocksPerSM >
, ( shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1 )
) );
}
#endif
// Copy functor to constant memory on the device
cudaMemcpyToSymbol(
kokkos_impl_cuda_constant_memory_buffer, &driver, sizeof(DriverType) );
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
// Invoke the driver function on the device
cuda_parallel_launch_constant_memory
< DriverType, MaxThreadsPerBlock, MinBlocksPerSM >
<<< grid , block , shmem , stream >>>();
#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
CUDA_SAFE_CALL( cudaGetLastError() );
Kokkos::Cuda::fence();
#endif
}
}
};
template < class DriverType >
struct CudaParallelLaunch< DriverType
, Kokkos::LaunchBounds<>
, true >
{
inline
CudaParallelLaunch( const DriverType & driver
, const dim3 & grid
, const dim3 & block
, const int shmem
, const cudaStream_t stream = 0 )
{
if ( (grid.x != 0) && ( ( block.x * block.y * block.z ) != 0 ) ) {
if ( sizeof( Kokkos::Impl::CudaTraits::ConstantGlobalBufferType ) <
sizeof( DriverType ) ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: Functor is too large") );
}
// Fence before changing settings and copying closure
Kokkos::Cuda::fence();
if ( CudaTraits::SharedMemoryCapacity < shmem ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
}
#ifndef KOKKOS_ARCH_KEPLER
// On Kepler the L1 has no benefit since it doesn't cache reads
else {
CUDA_SAFE_CALL(
cudaFuncSetCacheConfig
( cuda_parallel_launch_constant_memory< DriverType >
, ( shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1 )
) );
}
#endif
// Copy functor to constant memory on the device
cudaMemcpyToSymbol(
kokkos_impl_cuda_constant_memory_buffer, &driver, sizeof(DriverType) );
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
// Invoke the driver function on the device
cuda_parallel_launch_constant_memory< DriverType >
<<< grid , block , shmem , stream >>>();
#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
CUDA_SAFE_CALL( cudaGetLastError() );
Kokkos::Cuda::fence();
#endif
}
}
};
template < class DriverType
, unsigned int MaxThreadsPerBlock
, unsigned int MinBlocksPerSM >
struct CudaParallelLaunch< DriverType
, Kokkos::LaunchBounds< MaxThreadsPerBlock
, MinBlocksPerSM >
, false >
{
inline
CudaParallelLaunch( const DriverType & driver
, const dim3 & grid
, const dim3 & block
, const int shmem
, const cudaStream_t stream = 0 )
{
if ( (grid.x != 0) && ( ( block.x * block.y * block.z ) != 0 ) ) {
if ( sizeof( Kokkos::Impl::CudaTraits::ConstantGlobalBufferType ) <
sizeof( DriverType ) ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: Functor is too large") );
}
if ( CudaTraits::SharedMemoryCapacity < shmem ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
}
#ifndef KOKKOS_ARCH_KEPLER
// On Kepler the L1 has no benefit since it doesn't cache reads
else {
CUDA_SAFE_CALL(
cudaFuncSetCacheConfig
( cuda_parallel_launch_local_memory
< DriverType, MaxThreadsPerBlock, MinBlocksPerSM >
, ( shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1 )
) );
}
#endif
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
// Invoke the driver function on the device
cuda_parallel_launch_local_memory
< DriverType, MaxThreadsPerBlock, MinBlocksPerSM >
<<< grid , block , shmem , stream >>>( driver );
#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
CUDA_SAFE_CALL( cudaGetLastError() );
Kokkos::Cuda::fence();
#endif
}
}
};
template < class DriverType >
struct CudaParallelLaunch< DriverType
, Kokkos::LaunchBounds<>
, false >
{
inline
CudaParallelLaunch( const DriverType & driver
, const dim3 & grid
, const dim3 & block
, const int shmem
, const cudaStream_t stream = 0 )
{
if ( (grid.x != 0) && ( ( block.x * block.y * block.z ) != 0 ) ) {
if ( sizeof( Kokkos::Impl::CudaTraits::ConstantGlobalBufferType ) <
sizeof( DriverType ) ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: Functor is too large") );
}
if ( CudaTraits::SharedMemoryCapacity < shmem ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
}
#ifndef KOKKOS_ARCH_KEPLER
// On Kepler the L1 has no benefit since it doesn't cache reads
else {
CUDA_SAFE_CALL(
cudaFuncSetCacheConfig
( cuda_parallel_launch_local_memory< DriverType >
, ( shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1 )
) );
}
#endif
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
// Invoke the driver function on the device
cuda_parallel_launch_local_memory< DriverType >
<<< grid , block , shmem , stream >>>( driver );
#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
CUDA_SAFE_CALL( cudaGetLastError() );
Kokkos::Cuda::fence();
#endif
}
}
};
//----------------------------------------------------------------------------
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* defined( __CUDACC__ ) */
#endif /* defined( KOKKOS_ENABLE_CUDA ) */
#endif /* #ifndef KOKKOS_CUDAEXEC_HPP */
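For orientation, the four CudaParallelLaunch specializations deleted above all encoded one dispatch rule; a condensed sketch using the header's own names (error checks and cache-config calls omitted):
template <class Driver>
void launch_sketch(const Driver& driver, dim3 grid, dim3 block, int shmem) {
  if (sizeof(Driver) > Kokkos::Impl::CudaTraits::ConstantMemoryUseThreshold) {
    // Large functors: copy the closure to constant memory (after a fence),
    // then launch a kernel that reads it back from the buffer.
    Kokkos::Cuda::fence();
    cudaMemcpyToSymbol(kokkos_impl_cuda_constant_memory_buffer,
                       &driver, sizeof(Driver));
    cuda_parallel_launch_constant_memory<Driver><<<grid, block, shmem>>>();
  } else {
    // Small functors: pass the closure by value as a kernel argument.
    cuda_parallel_launch_local_memory<Driver><<<grid, block, shmem>>>(driver);
  }
}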

View File

@ -55,7 +55,7 @@
#include <Kokkos_Cuda.hpp>
#include <Kokkos_CudaSpace.hpp>
#include <Cuda/Kokkos_Cuda_Internal.hpp>
//#include <Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp>
#include <impl/Kokkos_Error.hpp>
#if defined(KOKKOS_ENABLE_PROFILING)
@ -183,7 +183,7 @@ void * CudaUVMSpace::allocate( const size_t arg_alloc_size ) const
enum { max_uvm_allocations = 65536 };
Cuda::fence();
Cuda::impl_static_fence();
if ( arg_alloc_size > 0 )
{
Kokkos::Impl::num_uvm_allocations++;
@ -194,7 +194,7 @@ void * CudaUVMSpace::allocate( const size_t arg_alloc_size ) const
CUDA_SAFE_CALL( cudaMallocManaged( &ptr, arg_alloc_size , cudaMemAttachGlobal ) );
}
Cuda::fence();
Cuda::impl_static_fence();
return ptr ;
}
@ -217,14 +217,14 @@ void CudaSpace::deallocate( void * const arg_alloc_ptr , const size_t /* arg_all
void CudaUVMSpace::deallocate( void * const arg_alloc_ptr , const size_t /* arg_alloc_size */ ) const
{
Cuda::fence();
Cuda::impl_static_fence();
try {
if ( arg_alloc_ptr != nullptr ) {
Kokkos::Impl::num_uvm_allocations--;
CUDA_SAFE_CALL( cudaFree( arg_alloc_ptr ) );
}
} catch(...) {}
Cuda::fence();
Cuda::impl_static_fence();
}
void CudaHostPinnedSpace::deallocate( void * const arg_alloc_ptr , const size_t /* arg_alloc_size */ ) const
@ -390,7 +390,7 @@ SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::
{
#if defined(KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
Cuda::fence(); //Make sure I can access the label ...
Cuda::impl_static_fence(); //Make sure I can access the label ...
Kokkos::Profiling::deallocateData(
Kokkos::Profiling::SpaceHandle(Kokkos::CudaUVMSpace::name()),RecordBase::m_alloc_ptr->m_label,
data(),size());

View File

@ -0,0 +1,657 @@
/*
@HEADER
================================================================================
ORIGINAL LICENSE
----------------
Copyright (c) 2018, NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
================================================================================
LICENSE ASSOCIATED WITH SUBSEQUENT MODIFICATIONS
------------------------------------------------
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2019) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
@HEADER
*/
#include <Kokkos_Macros.hpp>
#if defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA_ASM_ATOMICS)
#include <cassert>
#ifndef _SIMT_DETAILS_CONFIG
#define _SIMT_DETAILS_CONFIG
namespace Kokkos {
namespace Impl {
#ifndef __simt_scope
// Modification: Kokkos GPU atomics should default to `gpu` scope
#define __simt_scope "gpu"
#endif
#define __simt_fence_signal_() asm volatile("":::"memory")
#define __simt_fence_sc_() asm volatile("fence.sc." __simt_scope ";":::"memory")
#define __simt_fence_() asm volatile("fence." __simt_scope ";":::"memory")
#define __simt_load_acquire_8_as_32(ptr,ret) asm volatile("ld.acquire." __simt_scope ".b8 %0, [%1];" : "=r"(ret) : "l"(ptr) : "memory")
#define __simt_load_relaxed_8_as_32(ptr,ret) asm volatile("ld.relaxed." __simt_scope ".b8 %0, [%1];" : "=r"(ret) : "l"(ptr) : "memory")
#define __simt_store_release_8_as_32(ptr,desired) asm volatile("st.release." __simt_scope ".b8 [%0], %1;" :: "l"(ptr), "r"(desired) : "memory")
#define __simt_store_relaxed_8_as_32(ptr,desired) asm volatile("st.relaxed." __simt_scope ".b8 [%0], %1;" :: "l"(ptr), "r"(desired) : "memory")
#define __simt_load_acquire_16(ptr,ret) asm volatile("ld.acquire." __simt_scope ".b16 %0, [%1];" : "=h"(ret) : "l"(ptr) : "memory")
#define __simt_load_relaxed_16(ptr,ret) asm volatile("ld.relaxed." __simt_scope ".b16 %0, [%1];" : "=h"(ret) : "l"(ptr) : "memory")
#define __simt_store_release_16(ptr,desired) asm volatile("st.release." __simt_scope ".b16 [%0], %1;" :: "l"(ptr), "h"(desired) : "memory")
#define __simt_store_relaxed_16(ptr,desired) asm volatile("st.relaxed." __simt_scope ".b16 [%0], %1;" :: "l"(ptr), "h"(desired) : "memory")
#define __simt_load_acquire_32(ptr,ret) asm volatile("ld.acquire." __simt_scope ".b32 %0, [%1];" : "=r"(ret) : "l"(ptr) : "memory")
#define __simt_load_relaxed_32(ptr,ret) asm volatile("ld.relaxed." __simt_scope ".b32 %0, [%1];" : "=r"(ret) : "l"(ptr) : "memory")
#define __simt_store_release_32(ptr,desired) asm volatile("st.release." __simt_scope ".b32 [%0], %1;" :: "l"(ptr), "r"(desired) : "memory")
#define __simt_store_relaxed_32(ptr,desired) asm volatile("st.relaxed." __simt_scope ".b32 [%0], %1;" :: "l"(ptr), "r"(desired) : "memory")
#define __simt_exch_release_32(ptr,old,desired) asm volatile("atom.exch.release." __simt_scope ".b32 %0, [%1], %2;" : "=r"(old) : "l"(ptr), "r"(desired) : "memory")
#define __simt_exch_acquire_32(ptr,old,desired) asm volatile("atom.exch.acquire." __simt_scope ".b32 %0, [%1], %2;" : "=r"(old) : "l"(ptr), "r"(desired) : "memory")
#define __simt_exch_acq_rel_32(ptr,old,desired) asm volatile("atom.exch.acq_rel." __simt_scope ".b32 %0, [%1], %2;" : "=r"(old) : "l"(ptr), "r"(desired) : "memory")
#define __simt_exch_relaxed_32(ptr,old,desired) asm volatile("atom.exch.relaxed." __simt_scope ".b32 %0, [%1], %2;" : "=r"(old) : "l"(ptr), "r"(desired) : "memory")
#define __simt_cas_release_32(ptr,old,expected,desired) asm volatile("atom.cas.release." __simt_scope ".b32 %0, [%1], %2, %3;" : "=r"(old) : "l"(ptr), "r"(expected), "r"(desired) : "memory")
#define __simt_cas_acquire_32(ptr,old,expected,desired) asm volatile("atom.cas.acquire." __simt_scope ".b32 %0, [%1], %2, %3;" : "=r"(old) : "l"(ptr), "r"(expected), "r"(desired) : "memory")
#define __simt_cas_acq_rel_32(ptr,old,expected,desired) asm volatile("atom.cas.acq_rel." __simt_scope ".b32 %0, [%1], %2, %3;" : "=r"(old) : "l"(ptr), "r"(expected), "r"(desired) : "memory")
#define __simt_cas_relaxed_32(ptr,old,expected,desired) asm volatile("atom.cas.relaxed." __simt_scope ".b32 %0, [%1], %2, %3;" : "=r"(old) : "l"(ptr), "r"(expected), "r"(desired) : "memory")
#define __simt_add_release_32(ptr,old,addend) asm volatile("atom.add.release." __simt_scope ".u32 %0, [%1], %2;" : "=r"(old) : "l"(ptr), "r"(addend) : "memory")
#define __simt_add_acquire_32(ptr,old,addend) asm volatile("atom.add.acquire." __simt_scope ".u32 %0, [%1], %2;" : "=r"(old) : "l"(ptr), "r"(addend) : "memory")
#define __simt_add_acq_rel_32(ptr,old,addend) asm volatile("atom.add.acq_rel." __simt_scope ".u32 %0, [%1], %2;" : "=r"(old) : "l"(ptr), "r"(addend) : "memory")
#define __simt_add_relaxed_32(ptr,old,addend) asm volatile("atom.add.relaxed." __simt_scope ".u32 %0, [%1], %2;" : "=r"(old) : "l"(ptr), "r"(addend) : "memory")
#define __simt_and_release_32(ptr,old,andend) asm volatile("atom.and.release." __simt_scope ".b32 %0, [%1], %2;" : "=r"(old) : "l"(ptr), "r"(andend) : "memory")
#define __simt_and_acquire_32(ptr,old,andend) asm volatile("atom.and.acquire." __simt_scope ".b32 %0, [%1], %2;" : "=r"(old) : "l"(ptr), "r"(andend) : "memory")
#define __simt_and_acq_rel_32(ptr,old,andend) asm volatile("atom.and.acq_rel." __simt_scope ".b32 %0, [%1], %2;" : "=r"(old) : "l"(ptr), "r"(andend) : "memory")
#define __simt_and_relaxed_32(ptr,old,andend) asm volatile("atom.and.relaxed." __simt_scope ".b32 %0, [%1], %2;" : "=r"(old) : "l"(ptr), "r"(andend) : "memory")
#define __simt_or_release_32(ptr,old,orend) asm volatile("atom.or.release." __simt_scope ".b32 %0, [%1], %2;" : "=r"(old) : "l"(ptr), "r"(orend) : "memory")
#define __simt_or_acquire_32(ptr,old,orend) asm volatile("atom.or.acquire." __simt_scope ".b32 %0, [%1], %2;" : "=r"(old) : "l"(ptr), "r"(orend) : "memory")
#define __simt_or_acq_rel_32(ptr,old,orend) asm volatile("atom.or.acq_rel." __simt_scope ".b32 %0, [%1], %2;" : "=r"(old) : "l"(ptr), "r"(orend) : "memory")
#define __simt_or_relaxed_32(ptr,old,orend) asm volatile("atom.or.relaxed." __simt_scope ".b32 %0, [%1], %2;" : "=r"(old) : "l"(ptr), "r"(orend) : "memory")
#define __simt_xor_release_32(ptr,old,xorend) asm volatile("atom.xor.release." __simt_scope ".b32 %0, [%1], %2;" : "=r"(old) : "l"(ptr), "r"(xorend) : "memory")
#define __simt_xor_acquire_32(ptr,old,xorend) asm volatile("atom.xor.acquire." __simt_scope ".b32 %0, [%1], %2;" : "=r"(old) : "l"(ptr), "r"(xorend) : "memory")
#define __simt_xor_acq_rel_32(ptr,old,xorend) asm volatile("atom.xor.acq_rel." __simt_scope ".b32 %0, [%1], %2;" : "=r"(old) : "l"(ptr), "r"(xorend) : "memory")
#define __simt_xor_relaxed_32(ptr,old,xorend) asm volatile("atom.xor.relaxed." __simt_scope ".b32 %0, [%1], %2;" : "=r"(old) : "l"(ptr), "r"(xorend) : "memory")
#define __simt_load_acquire_64(ptr,ret) asm volatile("ld.acquire." __simt_scope ".b64 %0, [%1];" : "=l"(ret) : "l"(ptr) : "memory")
#define __simt_load_relaxed_64(ptr,ret) asm volatile("ld.relaxed." __simt_scope ".b64 %0, [%1];" : "=l"(ret) : "l"(ptr) : "memory")
#define __simt_store_release_64(ptr,desired) asm volatile("st.release." __simt_scope ".b64 [%0], %1;" :: "l"(ptr), "l"(desired) : "memory")
#define __simt_store_relaxed_64(ptr,desired) asm volatile("st.relaxed." __simt_scope ".b64 [%0], %1;" :: "l"(ptr), "l"(desired) : "memory")
#define __simt_exch_release_64(ptr,old,desired) asm volatile("atom.exch.release." __simt_scope ".b64 %0, [%1], %2;" : "=l"(old) : "l"(ptr), "l"(desired) : "memory")
#define __simt_exch_acquire_64(ptr,old,desired) asm volatile("atom.exch.acquire." __simt_scope ".b64 %0, [%1], %2;" : "=l"(old) : "l"(ptr), "l"(desired) : "memory")
#define __simt_exch_acq_rel_64(ptr,old,desired) asm volatile("atom.exch.acq_rel." __simt_scope ".b64 %0, [%1], %2;" : "=l"(old) : "l"(ptr), "l"(desired) : "memory")
#define __simt_exch_relaxed_64(ptr,old,desired) asm volatile("atom.exch.relaxed." __simt_scope ".b64 %0, [%1], %2;" : "=l"(old) : "l"(ptr), "l"(desired) : "memory")
#define __simt_cas_release_64(ptr,old,expected,desired) asm volatile("atom.cas.release." __simt_scope ".b64 %0, [%1], %2, %3;" : "=l"(old) : "l"(ptr), "l"(expected), "l"(desired) : "memory")
#define __simt_cas_acquire_64(ptr,old,expected,desired) asm volatile("atom.cas.acquire." __simt_scope ".b64 %0, [%1], %2, %3;" : "=l"(old) : "l"(ptr), "l"(expected), "l"(desired) : "memory")
#define __simt_cas_acq_rel_64(ptr,old,expected,desired) asm volatile("atom.cas.acq_rel." __simt_scope ".b64 %0, [%1], %2, %3;" : "=l"(old) : "l"(ptr), "l"(expected), "l"(desired) : "memory")
#define __simt_cas_relaxed_64(ptr,old,expected,desired) asm volatile("atom.cas.relaxed." __simt_scope ".b64 %0, [%1], %2, %3;" : "=l"(old) : "l"(ptr), "l"(expected), "l"(desired) : "memory")
#define __simt_add_release_64(ptr,old,addend) asm volatile("atom.add.release." __simt_scope ".u64 %0, [%1], %2;" : "=l"(old) : "l"(ptr), "l"(addend) : "memory")
#define __simt_add_acquire_64(ptr,old,addend) asm volatile("atom.add.acquire." __simt_scope ".u64 %0, [%1], %2;" : "=l"(old) : "l"(ptr), "l"(addend) : "memory")
#define __simt_add_acq_rel_64(ptr,old,addend) asm volatile("atom.add.acq_rel." __simt_scope ".u64 %0, [%1], %2;" : "=l"(old) : "l"(ptr), "l"(addend) : "memory")
#define __simt_add_relaxed_64(ptr,old,addend) asm volatile("atom.add.relaxed." __simt_scope ".u64 %0, [%1], %2;" : "=l"(old) : "l"(ptr), "l"(addend) : "memory")
#define __simt_and_release_64(ptr,old,andend) asm volatile("atom.and.release." __simt_scope ".b64 %0, [%1], %2;" : "=l"(old) : "l"(ptr), "l"(andend) : "memory")
#define __simt_and_acquire_64(ptr,old,andend) asm volatile("atom.and.acquire." __simt_scope ".b64 %0, [%1], %2;" : "=l"(old) : "l"(ptr), "l"(andend) : "memory")
#define __simt_and_acq_rel_64(ptr,old,andend) asm volatile("atom.and.acq_rel." __simt_scope ".b64 %0, [%1], %2;" : "=l"(old) : "l"(ptr), "l"(andend) : "memory")
#define __simt_and_relaxed_64(ptr,old,andend) asm volatile("atom.and.relaxed." __simt_scope ".b64 %0, [%1], %2;" : "=l"(old) : "l"(ptr), "l"(andend) : "memory")
#define __simt_or_release_64(ptr,old,orend) asm volatile("atom.or.release." __simt_scope ".b64 %0, [%1], %2;" : "=l"(old) : "l"(ptr), "l"(orend) : "memory")
#define __simt_or_acquire_64(ptr,old,orend) asm volatile("atom.or.acquire." __simt_scope ".b64 %0, [%1], %2;" : "=l"(old) : "l"(ptr), "l"(orend) : "memory")
#define __simt_or_acq_rel_64(ptr,old,orend) asm volatile("atom.or.acq_rel." __simt_scope ".b64 %0, [%1], %2;" : "=l"(old) : "l"(ptr), "l"(orend) : "memory")
#define __simt_or_relaxed_64(ptr,old,orend) asm volatile("atom.or.relaxed." __simt_scope ".b64 %0, [%1], %2;" : "=l"(old) : "l"(ptr), "l"(orend) : "memory")
#define __simt_xor_release_64(ptr,old,xorend) asm volatile("atom.xor.release." __simt_scope ".b64 %0, [%1], %2;" : "=l"(old) : "l"(ptr), "l"(xorend) : "memory")
#define __simt_xor_acquire_64(ptr,old,xorend) asm volatile("atom.xor.acquire." __simt_scope ".b64 %0, [%1], %2;" : "=l"(old) : "l"(ptr), "l"(xorend) : "memory")
#define __simt_xor_acq_rel_64(ptr,old,xorend) asm volatile("atom.xor.acq_rel." __simt_scope ".b64 %0, [%1], %2;" : "=l"(old) : "l"(ptr), "l"(xorend) : "memory")
#define __simt_xor_relaxed_64(ptr,old,xorend) asm volatile("atom.xor.relaxed." __simt_scope ".b64 %0, [%1], %2;" : "=l"(old) : "l"(ptr), "l"(xorend) : "memory")
#define __simt_nanosleep(timeout) asm volatile("nanosleep.u32 %0;" :: "r"(unsigned(timeout)) : )
/*
definitions
*/
#ifndef __GCC_ATOMIC_BOOL_LOCK_FREE
#define __GCC_ATOMIC_BOOL_LOCK_FREE 2
#define __GCC_ATOMIC_CHAR_LOCK_FREE 2
#define __GCC_ATOMIC_CHAR16_T_LOCK_FREE 2
#define __GCC_ATOMIC_CHAR32_T_LOCK_FREE 2
#define __GCC_ATOMIC_WCHAR_T_LOCK_FREE 2
#define __GCC_ATOMIC_SHORT_LOCK_FREE 2
#define __GCC_ATOMIC_INT_LOCK_FREE 2
#define __GCC_ATOMIC_LONG_LOCK_FREE 2
#define __GCC_ATOMIC_LLONG_LOCK_FREE 2
#define __GCC_ATOMIC_POINTER_LOCK_FREE 2
#endif
#ifndef __ATOMIC_RELAXED
#define __ATOMIC_RELAXED 0
#define __ATOMIC_CONSUME 1
#define __ATOMIC_ACQUIRE 2
#define __ATOMIC_RELEASE 3
#define __ATOMIC_ACQ_REL 4
#define __ATOMIC_SEQ_CST 5
#endif
inline __device__ int __stronger_order_simt_(int a, int b) {
if (b == __ATOMIC_SEQ_CST) return __ATOMIC_SEQ_CST;
if (b == __ATOMIC_RELAXED) return a;
switch (a) {
case __ATOMIC_SEQ_CST:
case __ATOMIC_ACQ_REL: return a;
case __ATOMIC_CONSUME:
case __ATOMIC_ACQUIRE: if (b != __ATOMIC_ACQUIRE) return __ATOMIC_ACQ_REL; else return __ATOMIC_ACQUIRE;
case __ATOMIC_RELEASE: if (b != __ATOMIC_RELEASE) return __ATOMIC_ACQ_REL; else return __ATOMIC_RELEASE;
case __ATOMIC_RELAXED: return b;
default: assert(0);
}
return __ATOMIC_SEQ_CST;
}
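// Example: __stronger_order_simt_(__ATOMIC_RELEASE, __ATOMIC_ACQUIRE)
// returns __ATOMIC_ACQ_REL, the weakest order at least as strong as both
// the success and failure orderings of a compare-exchange.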
/*
base
*/
#define DO__atomic_load_simt_(bytes, bits) \
template<class type, typename std::enable_if<sizeof(type)==bytes, int>::type = 0> \
void __device__ __atomic_load_simt_ (const type *ptr, type *ret, int memorder) { \
int##bits##_t tmp = 0; \
switch (memorder) { \
case __ATOMIC_SEQ_CST: __simt_fence_sc_(); \
case __ATOMIC_CONSUME: \
case __ATOMIC_ACQUIRE: __simt_load_acquire_##bits(ptr, tmp); break; \
case __ATOMIC_RELAXED: __simt_load_relaxed_##bits(ptr, tmp); break; \
default: assert(0); \
} \
memcpy(ret, &tmp, bytes); \
}
DO__atomic_load_simt_(1,32)
DO__atomic_load_simt_(2,16)
DO__atomic_load_simt_(4,32)
DO__atomic_load_simt_(8,64)
template<class type>
type __device__ __atomic_load_n_simt_(const type *ptr, int memorder) {
type ret;
__atomic_load_simt_(ptr, &ret, memorder);
return ret;
}
#define DO__atomic_store_simt_(bytes, bits) \
template<class type, typename std::enable_if<sizeof(type)==bytes, int>::type = 0> \
void __device__ __atomic_store_simt_ (type *ptr, type *val, int memorder) { \
int##bits##_t tmp = 0; \
memcpy(&tmp, val, bytes); \
switch (memorder) { \
case __ATOMIC_RELEASE: __simt_store_release_##bits(ptr, tmp); break; \
case __ATOMIC_SEQ_CST: __simt_fence_sc_(); \
case __ATOMIC_RELAXED: __simt_store_relaxed_##bits(ptr, tmp); break; \
default: assert(0); \
} \
}
DO__atomic_store_simt_(1,32)
DO__atomic_store_simt_(2,16)
DO__atomic_store_simt_(4,32)
DO__atomic_store_simt_(8,64)
template<class type>
void __device__ __atomic_store_n_simt_(type *ptr, type val, int memorder) {
__atomic_store_simt_(ptr, &val, memorder);
}
#define DO__atomic_compare_exchange_simt_(bytes, bits) \
template<class type, typename std::enable_if<sizeof(type)==bytes, int>::type = 0> \
bool __device__ __atomic_compare_exchange_simt_ (type *ptr, type *expected, const type *desired, bool, int success_memorder, int failure_memorder) { \
int##bits##_t tmp = 0, old = 0, old_tmp; \
memcpy(&tmp, desired, bytes); \
memcpy(&old, expected, bytes); \
old_tmp = old; \
switch (__stronger_order_simt_(success_memorder, failure_memorder)) { \
case __ATOMIC_SEQ_CST: __simt_fence_sc_(); \
case __ATOMIC_CONSUME: \
case __ATOMIC_ACQUIRE: __simt_cas_acquire_##bits(ptr, old, old_tmp, tmp); break; \
case __ATOMIC_ACQ_REL: __simt_cas_acq_rel_##bits(ptr, old, old_tmp, tmp); break; \
case __ATOMIC_RELEASE: __simt_cas_release_##bits(ptr, old, old_tmp, tmp); break; \
case __ATOMIC_RELAXED: __simt_cas_relaxed_##bits(ptr, old, old_tmp, tmp); break; \
default: assert(0); \
} \
bool const ret = old == old_tmp; \
memcpy(expected, &old, bytes); \
return ret; \
}
DO__atomic_compare_exchange_simt_(4, 32)
DO__atomic_compare_exchange_simt_(8, 64)
template<class type, typename std::enable_if<sizeof(type) <= 2, int>::type = 0>
bool __device__ __atomic_compare_exchange_simt_(type *ptr, type *expected, const type *desired, bool, int success_memorder, int failure_memorder) {
using R = typename std::conditional<std::is_volatile<type>::value, volatile uint32_t, uint32_t>::type;
auto const aligned = (R*)((intptr_t)ptr & ~(sizeof(uint32_t) - 1));
auto const offset = uint32_t((intptr_t)ptr & (sizeof(uint32_t) - 1)) * 8;
auto const mask = ((1 << sizeof(type)*8) - 1) << offset;
uint32_t old = *expected << offset, old_value;
while (1) {
old_value = (old & mask) >> offset;
if (old_value != *expected)
break;
uint32_t const attempt = (old & ~mask) | (*desired << offset);
if (__atomic_compare_exchange_simt_ (aligned, &old, &attempt, true, success_memorder, failure_memorder))
return true;
}
*expected = old_value;
return false;
}
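// Worked example of the sub-word emulation above: for a 1-byte type at
// byte 1 of its 32-bit word, offset = 8 and mask = 0xFF00; the loop CASes
// the full aligned word, changing only the masked byte, and retries when
// a neighboring byte was modified concurrently by another thread.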
template<class type>
bool __device__ __atomic_compare_exchange_n_simt_(type *ptr, type *expected, type desired, bool weak, int success_memorder, int failure_memorder) {
return __atomic_compare_exchange_simt_(ptr, expected, &desired, weak, success_memorder, failure_memorder);
}
#define DO__atomic_exchange_simt_(bytes, bits) \
template<class type, typename std::enable_if<sizeof(type)==bytes, int>::type = 0> \
void __device__ __atomic_exchange_simt_ (type *ptr, type *val, type *ret, int memorder) { \
int##bits##_t tmp = 0; \
memcpy(&tmp, val, bytes); \
switch (memorder) { \
case __ATOMIC_SEQ_CST: __simt_fence_sc_(); \
case __ATOMIC_CONSUME: \
case __ATOMIC_ACQUIRE: __simt_exch_acquire_##bits(ptr, tmp, tmp); break; \
case __ATOMIC_ACQ_REL: __simt_exch_acq_rel_##bits(ptr, tmp, tmp); break; \
case __ATOMIC_RELEASE: __simt_exch_release_##bits(ptr, tmp, tmp); break; \
case __ATOMIC_RELAXED: __simt_exch_relaxed_##bits(ptr, tmp, tmp); break; \
default: assert(0); \
} \
memcpy(ret, &tmp, bytes); \
}
DO__atomic_exchange_simt_(4,32)
DO__atomic_exchange_simt_(8,64)
template<class type, typename std::enable_if<sizeof(type)<=2, int>::type = 0>
void __device__ __atomic_exchange_simt_ (type *ptr, type *val, type *ret, int memorder) {
type expected = __atomic_load_n_simt_(ptr, __ATOMIC_RELAXED);
while(!__atomic_compare_exchange_simt_(ptr, &expected, val, true, memorder, memorder))
;
*ret = expected;
}
template<class type>
type __device__ __atomic_exchange_n_simt_(type *ptr, type val, int memorder) {
type ret;
__atomic_exchange_simt_(ptr, &val, &ret, memorder);
return ret;
}
#define DO__atomic_fetch_add_simt_(bytes, bits) \
template<class type, class delta, typename std::enable_if<sizeof(type)==bytes, int>::type = 0> \
type __device__ __atomic_fetch_add_simt_ (type *ptr, delta val, int memorder) { \
type ret; \
switch (memorder) { \
case __ATOMIC_SEQ_CST: __simt_fence_sc_(); \
case __ATOMIC_CONSUME: \
case __ATOMIC_ACQUIRE: __simt_add_acquire_##bits(ptr, ret, val); break; \
case __ATOMIC_ACQ_REL: __simt_add_acq_rel_##bits(ptr, ret, val); break; \
case __ATOMIC_RELEASE: __simt_add_release_##bits(ptr, ret, val); break; \
case __ATOMIC_RELAXED: __simt_add_relaxed_##bits(ptr, ret, val); break; \
default: assert(0); \
} \
return ret; \
}
DO__atomic_fetch_add_simt_(4, 32)
DO__atomic_fetch_add_simt_(8, 64)
template<class type, class delta, typename std::enable_if<sizeof(type)<=2, int>::type = 0>
type __device__ __atomic_fetch_add_simt_ (type *ptr, delta val, int memorder) {
type expected = __atomic_load_n_simt_(ptr, __ATOMIC_RELAXED);
type const desired = expected + val;
while(!__atomic_compare_exchange_simt_(ptr, &expected, &desired, true, memorder, memorder))
;
return expected;
}
#define DO__atomic_fetch_sub_simt_(bytes, bits) \
template<class type, class delta, typename std::enable_if<sizeof(type)==bytes, int>::type = 0> \
type __device__ __atomic_fetch_sub_simt_ (type *ptr, delta val, int memorder) { \
type ret; \
switch (memorder) { \
case __ATOMIC_SEQ_CST: __simt_fence_sc_(); \
case __ATOMIC_CONSUME: \
case __ATOMIC_ACQUIRE: __simt_add_acquire_##bits(ptr, ret, -val); break; \
case __ATOMIC_ACQ_REL: __simt_add_acq_rel_##bits(ptr, ret, -val); break; \
case __ATOMIC_RELEASE: __simt_add_release_##bits(ptr, ret, -val); break; \
case __ATOMIC_RELAXED: __simt_add_relaxed_##bits(ptr, ret, -val); break; \
default: assert(0); \
} \
return ret; \
}
DO__atomic_fetch_sub_simt_(4,32)
DO__atomic_fetch_sub_simt_(8,64)
template<class type, class delta, typename std::enable_if<sizeof(type)<=2, int>::type = 0>
type __device__ __atomic_fetch_sub_simt_ (type *ptr, delta val, int memorder) {
type expected = __atomic_load_n_simt_(ptr, __ATOMIC_RELAXED);
type const desired = expected - val;
while(!__atomic_compare_exchange_simt_(ptr, &expected, &desired, true, memorder, memorder))
;
return expected;
}
#define DO__atomic_fetch_and_simt_(bytes, bits) \
template<class type, typename std::enable_if<sizeof(type)==bytes, int>::type = 0> \
type __device__ __atomic_fetch_and_simt_ (type *ptr, type val, int memorder) { \
type ret; \
switch (memorder) { \
case __ATOMIC_SEQ_CST: __simt_fence_sc_(); \
case __ATOMIC_CONSUME: \
case __ATOMIC_ACQUIRE: __simt_and_acquire_##bits(ptr, ret, val); break; \
case __ATOMIC_ACQ_REL: __simt_and_acq_rel_##bits(ptr, ret, val); break; \
case __ATOMIC_RELEASE: __simt_and_release_##bits(ptr, ret, val); break; \
case __ATOMIC_RELAXED: __simt_and_relaxed_##bits(ptr, ret, val); break; \
default: assert(0); \
} \
return ret; \
}
DO__atomic_fetch_and_simt_(4,32)
DO__atomic_fetch_and_simt_(8,64)
template<class type, class delta, typename std::enable_if<sizeof(type)<=2, int>::type = 0>
type __device__ __atomic_fetch_and_simt_ (type *ptr, delta val, int memorder) {
type expected = __atomic_load_n_simt_(ptr, __ATOMIC_RELAXED);
type const desired = expected & val;
while(!__atomic_compare_exchange_simt_(ptr, &expected, &desired, true, memorder, memorder))
;
return expected;
}
#define DO__atomic_fetch_xor_simt_(bytes, bits) \
template<class type, typename std::enable_if<sizeof(type)==bytes, int>::type = 0> \
type __device__ __atomic_fetch_xor_simt_ (type *ptr, type val, int memorder) { \
type ret; \
switch (memorder) { \
case __ATOMIC_SEQ_CST: __simt_fence_sc_(); \
case __ATOMIC_CONSUME: \
case __ATOMIC_ACQUIRE: __simt_xor_acquire_##bits(ptr, ret, val); break; \
case __ATOMIC_ACQ_REL: __simt_xor_acq_rel_##bits(ptr, ret, val); break; \
case __ATOMIC_RELEASE: __simt_xor_release_##bits(ptr, ret, val); break; \
case __ATOMIC_RELAXED: __simt_xor_relaxed_##bits(ptr, ret, val); break; \
default: assert(0); \
} \
return ret; \
}
DO__atomic_fetch_xor_simt_(4,32)
DO__atomic_fetch_xor_simt_(8,64)
template<class type, class delta, typename std::enable_if<sizeof(type)<=2, int>::type = 0>
type __device__ __atomic_fetch_xor_simt_ (type *ptr, delta val, int memorder) {
type expected = __atomic_load_n_simt_(ptr, __ATOMIC_RELAXED);
type const desired = expected ^ val;
while(!__atomic_compare_exchange_simt_(ptr, &expected, &desired, true, memorder, memorder))
;
return expected;
}
#define DO__atomic_fetch_or_simt_(bytes, bits) \
template<class type, typename std::enable_if<sizeof(type)==bytes, int>::type = 0> \
type __device__ __atomic_fetch_or_simt_ (type *ptr, type val, int memorder) { \
type ret; \
switch (memorder) { \
case __ATOMIC_SEQ_CST: __simt_fence_sc_(); \
case __ATOMIC_CONSUME: \
case __ATOMIC_ACQUIRE: __simt_or_acquire_##bits(ptr, ret, val); break; \
case __ATOMIC_ACQ_REL: __simt_or_acq_rel_##bits(ptr, ret, val); break; \
case __ATOMIC_RELEASE: __simt_or_release_##bits(ptr, ret, val); break; \
case __ATOMIC_RELAXED: __simt_or_relaxed_##bits(ptr, ret, val); break; \
default: assert(0); \
} \
return ret; \
}
DO__atomic_fetch_or_simt_(4,32)
DO__atomic_fetch_or_simt_(8,64)
template<class type, class delta, typename std::enable_if<sizeof(type)<=2, int>::type = 0>
type __device__ __atomic_fetch_or_simt_ (type *ptr, delta val, int memorder) {
type expected = __atomic_load_n_simt_(ptr, __ATOMIC_RELAXED);
type const desired = expected | val;
while(!__atomic_compare_exchange_simt_(ptr, &expected, &desired, true, memorder, memorder))
;
return expected;
}
template<class type>
inline bool __device__ __atomic_test_and_set_simt_(type *ptr, int memorder) {
return __atomic_exchange_n_simt_((char*)ptr, (char)1, memorder) == 1;
}
template<class type>
inline void __device__ __atomic_clear_simt_(type *ptr, int memorder) {
return __atomic_store_n_simt_((char*)ptr, (char)0, memorder);
}
inline constexpr __device__ bool __atomic_always_lock_free_simt_ (size_t size, void *) {
return size <= 8;
}
inline __device__ bool __atomic_is_lock_free_simt_(size_t size, void * ptr) {
return __atomic_always_lock_free_simt_(size, ptr);
}
/*
fences
*/
inline void __device__ __atomic_thread_fence_simt(int memorder) {
switch (memorder) {
case __ATOMIC_SEQ_CST: __simt_fence_sc_(); break;
case __ATOMIC_CONSUME:
case __ATOMIC_ACQUIRE:
case __ATOMIC_ACQ_REL:
case __ATOMIC_RELEASE: __simt_fence_(); break;
case __ATOMIC_RELAXED: break;
default: assert(0);
}
}
inline void __device__ __atomic_signal_fence_simt(int memorder) {
__atomic_thread_fence_simt(memorder);
}
/*
non-volatile
*/
template<class type> type __device__ __atomic_load_n_simt(const type *ptr, int memorder) {
return __atomic_load_n_simt_(const_cast<const type*>(ptr), memorder);
}
template<class type> void __device__ __atomic_load_simt(const type *ptr, type *ret, int memorder) {
__atomic_load_simt_(const_cast<const type*>(ptr), ret, memorder);
}
template<class type> void __device__ __atomic_store_n_simt(type *ptr, type val, int memorder) {
__atomic_store_n_simt_(const_cast<type*>(ptr), val, memorder);
}
template<class type> void __device__ __atomic_store_simt(type *ptr, type *val, int memorder) {
__atomic_store_simt_(const_cast<type*>(ptr), val, memorder);
}
template<class type> type __device__ __atomic_exchange_n_simt(type *ptr, type val, int memorder) {
return __atomic_exchange_n_simt_(const_cast<type*>(ptr), val, memorder);
}
template<class type> void __device__ __atomic_exchange_simt(type *ptr, type *val, type *ret, int memorder) {
__atomic_exchange_simt_(const_cast<type*>(ptr), val, ret, memorder);
}
template<class type> bool __device__ __atomic_compare_exchange_n_simt(type *ptr, type *expected, type desired, bool weak, int success_memorder, int failure_memorder) {
return __atomic_compare_exchange_n_simt_(const_cast<type*>(ptr), expected, desired, weak, success_memorder, failure_memorder);
}
template<class type> bool __device__ __atomic_compare_exchange_simt(type *ptr, type *expected, type *desired, bool weak, int success_memorder, int failure_memorder) {
return __atomic_compare_exchange_simt_(const_cast<type*>(ptr), expected, desired, weak, success_memorder, failure_memorder);
}
template<class type, class delta> type __device__ __atomic_fetch_add_simt(type *ptr, delta val, int memorder) {
return __atomic_fetch_add_simt_(const_cast<type*>(ptr), val, memorder);
}
template<class type, class delta> type __device__ __atomic_fetch_sub_simt(type *ptr, delta val, int memorder) {
return __atomic_fetch_sub_simt_(const_cast<type*>(ptr), val, memorder);
}
template<class type> type __device__ __atomic_fetch_and_simt(type *ptr, type val, int memorder) {
return __atomic_fetch_and_simt_(const_cast<type*>(ptr), val, memorder);
}
template<class type> type __device__ __atomic_fetch_xor_simt(type *ptr, type val, int memorder) {
return __atomic_fetch_xor_simt_(const_cast<type*>(ptr), val, memorder);
}
template<class type> type __device__ __atomic_fetch_or_simt(type *ptr, type val, int memorder) {
return __atomic_fetch_or_simt_(const_cast<type*>(ptr), val, memorder);
}
template<class type> bool __device__ __atomic_test_and_set_simt(void *ptr, int memorder) {
return __atomic_test_and_set_simt_(const_cast<void*>(ptr), memorder);
}
template<class type> void __device__ __atomic_clear_simt(void *ptr, int memorder) {
return __atomic_clear_simt_(const_cast<void*>(ptr), memorder);
}
inline bool __device__ __atomic_always_lock_free_simt(size_t size, void *ptr) {
return __atomic_always_lock_free_simt_(size, const_cast<void*>(ptr));
}
inline bool __device__ __atomic_is_lock_free_simt(size_t size, void *ptr) {
return __atomic_is_lock_free_simt_(size, const_cast<void*>(ptr));
}
/*
volatile
*/
template<class type> type __device__ __atomic_load_n_simt(const volatile type *ptr, int memorder) {
return __atomic_load_n_simt_(const_cast<const type*>(ptr), memorder);
}
template<class type> void __device__ __atomic_load_simt(const volatile type *ptr, type *ret, int memorder) {
__atomic_load_simt_(const_cast<const type*>(ptr), ret, memorder);
}
template<class type> void __device__ __atomic_store_n_simt(volatile type *ptr, type val, int memorder) {
__atomic_store_n_simt_(const_cast<type*>(ptr), val, memorder);
}
template<class type> void __device__ __atomic_store_simt(volatile type *ptr, type *val, int memorder) {
__atomic_store_simt_(const_cast<type*>(ptr), val, memorder);
}
template<class type> type __device__ __atomic_exchange_n_simt(volatile type *ptr, type val, int memorder) {
return __atomic_exchange_n_simt_(const_cast<type*>(ptr), val, memorder);
}
template<class type> void __device__ __atomic_exchange_simt(volatile type *ptr, type *val, type *ret, int memorder) {
__atomic_exchange_simt_(const_cast<type*>(ptr), val, ret, memorder);
}
template<class type> bool __device__ __atomic_compare_exchange_n_simt(volatile type *ptr, type *expected, type desired, bool weak, int success_memorder, int failure_memorder) {
return __atomic_compare_exchange_n_simt_(const_cast<type*>(ptr), expected, desired, weak, success_memorder, failure_memorder);
}
template<class type> bool __device__ __atomic_compare_exchange_simt(volatile type *ptr, type *expected, type *desired, bool weak, int success_memorder, int failure_memorder) {
return __atomic_compare_exchange_simt_(const_cast<type*>(ptr), expected, desired, weak, success_memorder, failure_memorder);
}
template<class type, class delta> type __device__ __atomic_fetch_add_simt(volatile type *ptr, delta val, int memorder) {
return __atomic_fetch_add_simt_(const_cast<type*>(ptr), val, memorder);
}
template<class type, class delta> type __device__ __atomic_fetch_sub_simt(volatile type *ptr, delta val, int memorder) {
return __atomic_fetch_sub_simt_(const_cast<type*>(ptr), val, memorder);
}
template<class type> type __device__ __atomic_fetch_and_simt(volatile type *ptr, type val, int memorder) {
return __atomic_fetch_and_simt_(const_cast<type*>(ptr), val, memorder);
}
template<class type> type __device__ __atomic_fetch_xor_simt(volatile type *ptr, type val, int memorder) {
return __atomic_fetch_xor_simt_(const_cast<type*>(ptr), val, memorder);
}
template<class type> type __device__ __atomic_fetch_or_simt(volatile type *ptr, type val, int memorder) {
return __atomic_fetch_or_simt_(const_cast<type*>(ptr), val, memorder);
}
template<class type> bool __device__ __atomic_test_and_set_simt(volatile void *ptr, int memorder) {
return __atomic_test_and_set_simt_(const_cast<void*>(ptr), memorder);
}
template<class type> void __device__ __atomic_clear_simt(volatile void *ptr, int memorder) {
return __atomic_clear_simt_(const_cast<void*>(ptr), memorder);
}
} // end namespace Impl
} // end namespace Kokkos
#endif //_SIMT_DETAILS_CONFIG
#ifndef KOKKOS_SIMT_ATOMIC_BUILTIN_REPLACEMENTS_DEFINED
/*
builtins
*/
#define __atomic_load_n __atomic_load_n_simt
#define __atomic_load __atomic_load_simt
#define __atomic_store_n __atomic_store_n_simt
#define __atomic_store __atomic_store_simt
#define __atomic_exchange_n __atomic_exchange_n_simt
#define __atomic_exchange __atomic_exchange_simt
#define __atomic_compare_exchange_n __atomic_compare_exchange_n_simt
#define __atomic_compare_exchange __atomic_compare_exchange_simt
#define __atomic_fetch_add __atomic_fetch_add_simt
#define __atomic_fetch_sub __atomic_fetch_sub_simt
#define __atomic_fetch_and __atomic_fetch_and_simt
#define __atomic_fetch_xor __atomic_fetch_xor_simt
#define __atomic_fetch_or __atomic_fetch_or_simt
#define __atomic_test_and_set __atomic_test_and_set_simt
#define __atomic_clear __atomic_clear_simt
#define __atomic_always_lock_free __atomic_always_lock_free_simt
#define __atomic_is_lock_free __atomic_is_lock_free_simt
#define __atomic_thread_fence __atomic_thread_fence_simt
#define __atomic_signal_fence __atomic_signal_fence_simt
#define KOKKOS_SIMT_ATOMIC_BUILTIN_REPLACEMENTS_DEFINED
#endif // KOKKOS_SIMT_ATOMIC_BUILTIN_REPLACEMENTS_DEFINED
#endif // defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA_ASM_ATOMICS)

View File

@ -0,0 +1,68 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2019) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifdef KOKKOS_SIMT_ATOMIC_BUILTIN_REPLACEMENTS_DEFINED
#undef __atomic_load_n
#undef __atomic_load
#undef __atomic_store_n
#undef __atomic_store
#undef __atomic_exchange_n
#undef __atomic_exchange
#undef __atomic_compare_exchange_n
#undef __atomic_compare_exchange
#undef __atomic_fetch_add
#undef __atomic_fetch_sub
#undef __atomic_fetch_and
#undef __atomic_fetch_xor
#undef __atomic_fetch_or
#undef __atomic_test_and_set
#undef __atomic_clear
#undef __atomic_always_lock_free
#undef __atomic_is_lock_free
#undef __atomic_thread_fence
#undef __atomic_signal_fence
#undef KOKKOS_SIMT_ATOMIC_BUILTIN_REPLACEMENTS_DEFINED
#endif // KOKKOS_SIMT_ATOMIC_BUILTIN_REPLACEMENTS_DEFINED
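Taken together with the previous file, the two headers bracket device code that wants the C/C++ __atomic_* builtins on CUDA; a usage sketch (the include paths are assumptions, not shown in this diff):
#include <Cuda/Kokkos_Cuda_Atomic_Intrinsics.hpp>  // maps builtins to *_simt
__device__ int load_flag(const int* p) {
  // Resolves to __atomic_load_n_simt, i.e. an ld.acquire PTX instruction.
  return __atomic_load_n(p, __ATOMIC_ACQUIRE);
}
#include <Cuda/Kokkos_Cuda_Atomic_Intrinsics_Restore_Builtins.hpp>  // undefs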

View File

@ -58,7 +58,68 @@ struct CudaGetMaxBlockSize;
template<class DriverType, class LaunchBounds>
int cuda_get_max_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
return CudaGetMaxBlockSize<DriverType,LaunchBounds,(CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType))>::get_block_size(f,vector_length, shmem_extra_block,shmem_extra_thread);
return CudaGetMaxBlockSize<DriverType,LaunchBounds
, true
>::get_block_size(f,vector_length, shmem_extra_block,shmem_extra_thread);
}
template<class FunctorType, class LaunchBounds>
int cuda_get_max_block_size(const CudaInternal* cuda_instance, const cudaFuncAttributes& attr, const FunctorType& f, const size_t vector_length,
const size_t shmem_block, const size_t shmem_thread) {
const int min_blocks_per_sm = LaunchBounds::minBperSM == 0 ?
1 : LaunchBounds::minBperSM ;
const int max_threads_per_block = LaunchBounds::maxTperB == 0 ?
cuda_instance->m_maxThreadsPerBlock : LaunchBounds::maxTperB ;
const int regs_per_thread = attr.numRegs;
const int regs_per_sm = cuda_instance->m_regsPerSM;
const int shmem_per_sm = cuda_instance->m_shmemPerSM;
const int max_shmem_per_block = cuda_instance->m_maxShmemPerBlock;
const int max_blocks_per_sm = cuda_instance->m_maxBlocksPerSM;
const int max_threads_per_sm = cuda_instance->m_maxThreadsPerSM;
int block_size = std::min(attr.maxThreadsPerBlock,max_threads_per_block);
int functor_shmem = FunctorTeamShmemSize< FunctorType >::value( f , block_size/vector_length );
int total_shmem = shmem_block + shmem_thread*(block_size/vector_length) + functor_shmem + attr.sharedSizeBytes;
int max_blocks_regs = regs_per_sm/(regs_per_thread*block_size);
int max_blocks_shmem = (total_shmem<max_shmem_per_block)?(total_shmem>0?shmem_per_sm/total_shmem:max_blocks_regs):0;
int blocks_per_sm = std::min(max_blocks_regs,max_blocks_shmem);
int threads_per_sm = blocks_per_sm * block_size;
if(threads_per_sm > max_threads_per_sm) {
blocks_per_sm = max_threads_per_sm/block_size;
threads_per_sm = blocks_per_sm * block_size;
}
int opt_block_size = (blocks_per_sm>=min_blocks_per_sm) ? block_size : 0;
int opt_threads_per_sm = threads_per_sm;
//printf("BlockSizeMax: %i Shmem: %i %i %i %i Regs: %i %i Blocks: %i %i Achieved: %i %i Opt: %i %i\n",block_size,
// shmem_per_sm,max_shmem_per_block,functor_shmem,total_shmem,
// regs_per_sm,regs_per_thread,max_blocks_shmem,max_blocks_regs,blocks_per_sm,threads_per_sm,opt_block_size,opt_threads_per_sm);
block_size-=32;
while ((blocks_per_sm==0) && (block_size>=32)) {
functor_shmem = FunctorTeamShmemSize< FunctorType >::value( f , block_size/vector_length );
total_shmem = shmem_block + shmem_thread*(block_size/vector_length) + functor_shmem + attr.sharedSizeBytes;
max_blocks_regs = regs_per_sm/(regs_per_thread*block_size);
max_blocks_shmem = (total_shmem<max_shmem_per_block)?(total_shmem>0?shmem_per_sm/total_shmem:max_blocks_regs):0;
blocks_per_sm = std::min(max_blocks_regs,max_blocks_shmem);
threads_per_sm = blocks_per_sm * block_size;
if(threads_per_sm > max_threads_per_sm) {
blocks_per_sm = max_threads_per_sm/block_size;
threads_per_sm = blocks_per_sm * block_size;
}
if((blocks_per_sm >= min_blocks_per_sm) && (blocks_per_sm <= max_blocks_per_sm)) {
if(threads_per_sm>=opt_threads_per_sm) {
opt_block_size = block_size;
opt_threads_per_sm = threads_per_sm;
}
}
//printf("BlockSizeMax: %i Shmem: %i %i %i %i Regs: %i %i Blocks: %i %i Achieved: %i %i Opt: %i %i\n",block_size,
// shmem_per_sm,max_shmem_per_block,functor_shmem,total_shmem,
// regs_per_sm,regs_per_thread,max_blocks_shmem,max_blocks_regs,blocks_per_sm,threads_per_sm,opt_block_size,opt_threads_per_sm);
block_size-=32;
}
return opt_block_size;
}
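
The routine above, like the cuda_get_opt_block_size variant further down, walks candidate block sizes downward one warp (32 threads) at a time and keeps the size that maximizes resident threads per SM. A minimal free-standing sketch of the occupancy arithmetic, under assumed device limits (the SMLimits struct and its example values are illustrative, not the Kokkos API):

#include <algorithm>

struct SMLimits {           // hypothetical per-SM limits for illustration
  int regs_per_sm;          // e.g. 65536 registers
  int shmem_per_sm;         // e.g. 98304 bytes
  int max_threads_per_sm;   // e.g. 2048 threads
};

int blocks_per_sm(const SMLimits& sm, int block_size,
                  int regs_per_thread, int shmem_per_block) {
  // Register-limited and shared-memory-limited block counts, as above.
  int by_regs  = sm.regs_per_sm / (regs_per_thread * block_size);
  int by_shmem = shmem_per_block > 0 ? sm.shmem_per_sm / shmem_per_block
                                     : by_regs;
  int blocks = std::min(by_regs, by_shmem);
  // Cap by the SM-wide thread limit, mirroring the loop above.
  if (blocks * block_size > sm.max_threads_per_sm)
    blocks = sm.max_threads_per_sm / block_size;
  return blocks;
}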
@ -241,11 +302,71 @@ struct CudaGetOptBlockSize;
template<class DriverType, class LaunchBounds>
int cuda_get_opt_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
return CudaGetOptBlockSize<DriverType,LaunchBounds,(CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType))>::get_block_size(f,vector_length,shmem_extra_block,shmem_extra_thread);
return CudaGetOptBlockSize<DriverType,LaunchBounds,
//LaunchBounds::launch_mechanism == Kokkos::Experimental::LaunchDefault ?
// (( CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType) )?
// Kokkos::Experimental::CudaLaunchConstantMemory:Kokkos::Experimental::CudaLaunchLocalMemory):
// LaunchBounds::launch_mechanism
(CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType))
>::get_block_size(f,vector_length,shmem_extra_block,shmem_extra_thread);
}
template<class FunctorType, class LaunchBounds>
int cuda_get_opt_block_size(const CudaInternal* cuda_instance, const cudaFuncAttributes& attr, const FunctorType& f, const size_t vector_length,
const size_t shmem_block, const size_t shmem_thread) {
const int min_blocks_per_sm = LaunchBounds::minBperSM == 0 ?
1 : LaunchBounds::minBperSM ;
const int max_threads_per_block = LaunchBounds::maxTperB == 0 ?
cuda_instance->m_maxThreadsPerBlock : LaunchBounds::maxTperB ;
const int regs_per_thread = attr.numRegs;
const int regs_per_sm = cuda_instance->m_regsPerSM;
const int shmem_per_sm = cuda_instance->m_shmemPerSM;
const int max_shmem_per_block = cuda_instance->m_maxShmemPerBlock;
const int max_blocks_per_sm = cuda_instance->m_maxBlocksPerSM;
const int max_threads_per_sm = cuda_instance->m_maxThreadsPerSM;
int block_size = std::min(attr.maxThreadsPerBlock,max_threads_per_block);
int functor_shmem = FunctorTeamShmemSize< FunctorType >::value( f , block_size/vector_length );
int total_shmem = shmem_block + shmem_thread*(block_size/vector_length) + functor_shmem + attr.sharedSizeBytes;
int max_blocks_regs = regs_per_sm/(regs_per_thread*block_size);
int max_blocks_shmem = (total_shmem<max_shmem_per_block)?(total_shmem>0?shmem_per_sm/total_shmem:max_blocks_regs):0;
int blocks_per_sm = std::min(max_blocks_regs,max_blocks_shmem);
int threads_per_sm = blocks_per_sm * block_size;
if(threads_per_sm > max_threads_per_sm) {
blocks_per_sm = max_threads_per_sm/block_size;
threads_per_sm = blocks_per_sm * block_size;
}
int opt_block_size = (blocks_per_sm>=min_blocks_per_sm) ? block_size : 0;
int opt_threads_per_sm = threads_per_sm;
block_size-=32;
while ((block_size>=32)) {
functor_shmem = FunctorTeamShmemSize< FunctorType >::value( f , block_size/vector_length );
total_shmem = shmem_block + shmem_thread*(block_size/vector_length) + functor_shmem + attr.sharedSizeBytes;
max_blocks_regs = regs_per_sm/(regs_per_thread*block_size);
max_blocks_shmem = (total_shmem<max_shmem_per_block)?(total_shmem>0?shmem_per_sm/total_shmem:max_blocks_regs):0;
blocks_per_sm = std::min(max_blocks_regs,max_blocks_shmem);
threads_per_sm = blocks_per_sm * block_size;
if(threads_per_sm > max_threads_per_sm) {
blocks_per_sm = max_threads_per_sm/block_size;
threads_per_sm = blocks_per_sm * block_size;
}
if((blocks_per_sm >= min_blocks_per_sm) && (blocks_per_sm <= max_blocks_per_sm)) {
if(threads_per_sm>=opt_threads_per_sm) {
opt_block_size = block_size;
opt_threads_per_sm = threads_per_sm;
}
}
block_size-=32;
}
return opt_block_size;
}
template<class DriverType>
struct CudaGetOptBlockSize<DriverType,Kokkos::LaunchBounds<>,true> {
struct CudaGetOptBlockSize<DriverType,Kokkos::LaunchBounds<0,0>,true> {
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
int blockSize=16;
@ -275,7 +396,7 @@ struct CudaGetOptBlockSize<DriverType,Kokkos::LaunchBounds<>,true> {
};
template<class DriverType>
struct CudaGetOptBlockSize<DriverType,Kokkos::LaunchBounds<>,false> {
struct CudaGetOptBlockSize<DriverType,Kokkos::LaunchBounds<0,0>,false> {
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
int blockSize=16;
@ -305,7 +426,7 @@ struct CudaGetOptBlockSize<DriverType,Kokkos::LaunchBounds<>,false> {
};
template<class DriverType, unsigned int MaxThreadsPerBlock, unsigned int MinBlocksPerSM>
struct CudaGetOptBlockSize<DriverType,Kokkos::LaunchBounds< MaxThreadsPerBlock, MinBlocksPerSM >,true> {
struct CudaGetOptBlockSize<DriverType,Kokkos::LaunchBounds< MaxThreadsPerBlock, MinBlocksPerSM>,true> {
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
int blockSize=16;

View File

@ -50,7 +50,8 @@
#include <Kokkos_Core.hpp>
#include <Cuda/Kokkos_Cuda_Error.hpp>
#include <Cuda/Kokkos_Cuda_Internal.hpp>
#include <Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp>
#include <Cuda/Kokkos_Cuda_Instance.hpp>
#include <Cuda/Kokkos_Cuda_Locks.hpp>
#include <impl/Kokkos_Error.hpp>
#include <impl/Kokkos_Profiling_Interface.hpp>
@ -217,78 +218,6 @@ const CudaInternalDevices & CudaInternalDevices::singleton()
}
//----------------------------------------------------------------------------
class CudaInternal {
private:
CudaInternal( const CudaInternal & );
CudaInternal & operator = ( const CudaInternal & );
public:
typedef Cuda::size_type size_type ;
int m_cudaDev ;
int m_cudaArch ;
unsigned m_multiProcCount ;
unsigned m_maxWarpCount ;
unsigned m_maxBlock ;
unsigned m_maxSharedWords ;
uint32_t m_maxConcurrency ;
size_type m_scratchSpaceCount ;
size_type m_scratchFlagsCount ;
size_type m_scratchUnifiedCount ;
size_type m_scratchUnifiedSupported ;
size_type m_streamCount ;
size_type * m_scratchSpace ;
size_type * m_scratchFlags ;
size_type * m_scratchUnified ;
uint32_t * m_scratchConcurrentBitset ;
cudaStream_t * m_stream ;
static int was_initialized;
static int was_finalized;
static CudaInternal & singleton();
int verify_is_initialized( const char * const label ) const ;
int is_initialized() const
{ return 0 != m_scratchSpace && 0 != m_scratchFlags ; }
void initialize( int cuda_device_id , int stream_count );
void finalize();
void print_configuration( std::ostream & ) const ;
~CudaInternal();
CudaInternal()
: m_cudaDev( -1 )
, m_cudaArch( -1 )
, m_multiProcCount( 0 )
, m_maxWarpCount( 0 )
, m_maxBlock( 0 )
, m_maxSharedWords( 0 )
, m_maxConcurrency( 0 )
, m_scratchSpaceCount( 0 )
, m_scratchFlagsCount( 0 )
, m_scratchUnifiedCount( 0 )
, m_scratchUnifiedSupported( 0 )
, m_streamCount( 0 )
, m_scratchSpace( 0 )
, m_scratchFlags( 0 )
, m_scratchUnified( 0 )
, m_scratchConcurrentBitset( 0 )
, m_stream( 0 )
{}
size_type * scratch_space( const size_type size );
size_type * scratch_flags( const size_type size );
size_type * scratch_unified( const size_type size );
};
int CudaInternal::was_initialized = 0;
int CudaInternal::was_finalized = 0;
@ -366,8 +295,11 @@ CudaInternal & CudaInternal::singleton()
static CudaInternal self ;
return self ;
}
void CudaInternal::fence() const {
cudaStreamSynchronize(m_stream);
}
void CudaInternal::initialize( int cuda_device_id , int stream_count )
void CudaInternal::initialize( int cuda_device_id , cudaStream_t stream )
{
if ( was_finalized ) Kokkos::abort("Calling Cuda::initialize after Cuda::finalize is illegal\n");
was_initialized = 1;
@ -454,6 +386,15 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
m_maxBlock = cudaProp.maxGridSize[0] ;
m_shmemPerSM = cudaProp.sharedMemPerMultiprocessor ;
m_maxShmemPerBlock = cudaProp.sharedMemPerBlock ;
m_regsPerSM = cudaProp.regsPerMultiprocessor ;
m_maxBlocksPerSM = m_cudaArch < 500 ? 16 : (
m_cudaArch < 750 ? 32 : (
m_cudaArch == 750 ? 16 : 32));
m_maxThreadsPerSM = cudaProp.maxThreadsPerMultiProcessor ;
m_maxThreadsPerBlock = cudaProp.maxThreadsPerBlock ;
//----------------------------------
m_scratchUnifiedSupported = cudaProp.unifiedAddressing ;
@ -482,10 +423,9 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
// Concurrent bitset for obtaining unique tokens from within
// an executing kernel.
{
const unsigned max_threads_per_sm = 2048 ; // up to capability 7.0
m_maxConcurrency =
max_threads_per_sm * cudaProp.multiProcessorCount ;
m_maxThreadsPerSM * cudaProp.multiProcessorCount ;
const int32_t buffer_bound =
Kokkos::Impl::concurrent_bitset::buffer_bound( m_maxConcurrency );
@ -507,11 +447,6 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
}
//----------------------------------
if ( stream_count ) {
m_stream = (cudaStream_t*) ::malloc( stream_count * sizeof(cudaStream_t) );
m_streamCount = stream_count ;
for ( size_type i = 0 ; i < m_streamCount ; ++i ) m_stream[i] = 0 ;
}
}
else {
@ -539,7 +474,7 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
if( Kokkos::show_warnings() && !cuda_launch_blocking() ) {
std::cerr << "Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default" << std::endl;
std::cerr << " without setting CUDA_LAUNCH_BLOCKING=1." << std::endl;
std::cerr << " The code must call Cuda::fence() after each kernel" << std::endl;
std::cerr << " The code must call Cuda().fence() after each kernel" << std::endl;
std::cerr << " or will likely crash when accessing data on the host." << std::endl;
}
@ -568,7 +503,10 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
#endif
// Init the array for used for arbitrarily sized atomics
Impl::initialize_host_cuda_lock_arrays();
if(stream == 0)
Impl::initialize_host_cuda_lock_arrays();
m_stream = stream;
}
//----------------------------------------------------------------------------
@ -578,7 +516,7 @@ enum { sizeScratchGrain = sizeof(ScratchGrain) };
Cuda::size_type *
CudaInternal::scratch_flags( const Cuda::size_type size )
CudaInternal::scratch_flags( const Cuda::size_type size ) const
{
if ( verify_is_initialized("scratch_flags") && m_scratchFlagsCount * sizeScratchGrain < size ) {
@ -587,6 +525,9 @@ CudaInternal::scratch_flags( const Cuda::size_type size )
typedef Kokkos::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void > Record ;
if( m_scratchFlags )
Record::decrement( Record::get_record( m_scratchFlags ) );
Record * const r = Record::allocate( Kokkos::CudaSpace()
, "InternalScratchFlags"
, ( sizeof( ScratchGrain ) * m_scratchFlagsCount ) );
@ -602,7 +543,7 @@ CudaInternal::scratch_flags( const Cuda::size_type size )
}
Cuda::size_type *
CudaInternal::scratch_space( const Cuda::size_type size )
CudaInternal::scratch_space( const Cuda::size_type size ) const
{
if ( verify_is_initialized("scratch_space") && m_scratchSpaceCount * sizeScratchGrain < size ) {
@ -610,6 +551,9 @@ CudaInternal::scratch_space( const Cuda::size_type size )
typedef Kokkos::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void > Record ;
if( m_scratchSpace )
Record::decrement( Record::get_record( m_scratchSpace ) );
Record * const r = Record::allocate( Kokkos::CudaSpace()
, "InternalScratchSpace"
, ( sizeof( ScratchGrain ) * m_scratchSpaceCount ) );
@ -623,7 +567,7 @@ CudaInternal::scratch_space( const Cuda::size_type size )
}
Cuda::size_type *
CudaInternal::scratch_unified( const Cuda::size_type size )
CudaInternal::scratch_unified( const Cuda::size_type size ) const
{
if ( verify_is_initialized("scratch_unified") &&
m_scratchUnifiedSupported && m_scratchUnifiedCount * sizeScratchGrain < size ) {
@ -632,6 +576,9 @@ CudaInternal::scratch_unified( const Cuda::size_type size )
typedef Kokkos::Impl::SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void > Record ;
if( m_scratchUnified )
Record::decrement( Record::get_record( m_scratchUnified ) );
Record * const r = Record::allocate( Kokkos::CudaHostPinnedSpace()
, "InternalScratchUnified"
, ( sizeof( ScratchGrain ) * m_scratchUnifiedCount ) );
@ -644,6 +591,31 @@ CudaInternal::scratch_unified( const Cuda::size_type size )
return m_scratchUnified ;
}
Cuda::size_type *
CudaInternal::scratch_functor( const Cuda::size_type size ) const
{
if ( verify_is_initialized("scratch_functor") &&
m_scratchFunctorSize < size ) {
m_scratchFunctorSize = size ;
typedef Kokkos::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void > Record ;
if( m_scratchFunctor )
Record::decrement( Record::get_record( m_scratchFunctor ) );
Record * const r = Record::allocate( Kokkos::CudaSpace()
, "InternalScratchFunctor"
, m_scratchFunctorSize );
Record::increment( r );
m_scratchFunctor = reinterpret_cast<size_type *>( r->data() );
}
return m_scratchFunctor ;
}
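
scratch_functor above follows the same grow-only discipline as the other scratch allocators: reallocate only when the request exceeds the cached high-water mark, so repeated launches of same-sized functors skip allocation entirely. A sketch of the pattern with plain CUDA calls (assumption: cudaMalloc/cudaFree stand in for the SharedAllocationRecord bookkeeping used above):

#include <cuda_runtime.h>
#include <cstddef>

void* grow_only(void*& buf, std::size_t& capacity, std::size_t requested) {
  if (capacity < requested) {
    if (buf) cudaFree(buf);        // drop the undersized allocation
    cudaMalloc(&buf, requested);   // allocate the larger request
    capacity = requested;          // remember the new high-water mark
  }
  return buf;                      // otherwise reuse without reallocating
}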
//----------------------------------------------------------------------------
void CudaInternal::finalize()
@ -653,13 +625,7 @@ void CudaInternal::finalize()
Impl::finalize_host_cuda_lock_arrays();
if ( m_stream ) {
for ( size_type i = 1 ; i < m_streamCount ; ++i ) {
cudaStreamDestroy( m_stream[i] );
m_stream[i] = 0 ;
}
::free( m_stream );
}
if(m_stream!=0) cudaStreamDestroy(m_stream);
typedef Kokkos::Impl::SharedAllocationRecord< CudaSpace > RecordCuda ;
typedef Kokkos::Impl::SharedAllocationRecord< CudaHostPinnedSpace > RecordHost ;
@ -668,6 +634,8 @@ void CudaInternal::finalize()
RecordCuda::decrement( RecordCuda::get_record( m_scratchSpace ) );
RecordHost::decrement( RecordHost::get_record( m_scratchUnified ) );
RecordCuda::decrement( RecordCuda::get_record( m_scratchConcurrentBitset ) );
if(m_scratchFunctorSize>0)
RecordCuda::decrement( RecordCuda::get_record( m_scratchFunctor ) );
m_cudaDev = -1 ;
m_multiProcCount = 0 ;
@ -713,14 +681,14 @@ Cuda::size_type cuda_internal_maximum_grid_count()
Cuda::size_type cuda_internal_maximum_shared_words()
{ return CudaInternal::singleton().m_maxSharedWords ; }
Cuda::size_type * cuda_internal_scratch_space( const Cuda::size_type size )
{ return CudaInternal::singleton().scratch_space( size ); }
Cuda::size_type * cuda_internal_scratch_space( const Cuda& instance, const Cuda::size_type size )
{ return instance.impl_internal_space_instance()->scratch_space( size ); }
Cuda::size_type * cuda_internal_scratch_flags( const Cuda::size_type size )
{ return CudaInternal::singleton().scratch_flags( size ); }
Cuda::size_type * cuda_internal_scratch_flags( const Cuda& instance, const Cuda::size_type size )
{ return instance.impl_internal_space_instance()->scratch_flags( size ); }
Cuda::size_type * cuda_internal_scratch_unified( const Cuda::size_type size )
{ return CudaInternal::singleton().scratch_unified( size ); }
Cuda::size_type * cuda_internal_scratch_unified( const Cuda& instance, const Cuda::size_type size )
{ return instance.impl_internal_space_instance()->scratch_unified( size ); }
} // namespace Impl
@ -749,7 +717,7 @@ void Cuda::initialize( const Cuda::SelectDevice config , size_t num_instances )
void Cuda::impl_initialize( const Cuda::SelectDevice config , size_t num_instances )
#endif
{
Impl::CudaInternal::singleton().initialize( config.cuda_device_id , num_instances );
Impl::CudaInternal::singleton().initialize( config.cuda_device_id , 0 );
#if defined(KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::initialize();
@ -800,19 +768,17 @@ void Cuda::impl_finalize()
}
Cuda::Cuda()
: m_device( Impl::CudaInternal::singleton().m_cudaDev )
, m_stream( 0 )
: m_space_instance( &Impl::CudaInternal::singleton() )
{
Impl::CudaInternal::singleton().verify_is_initialized( "Cuda instance constructor" );
}
Cuda::Cuda( const int instance_id )
: m_device( Impl::CudaInternal::singleton().m_cudaDev )
, m_stream(
Impl::CudaInternal::singleton().verify_is_initialized( "Cuda instance constructor" )
? Impl::CudaInternal::singleton().m_stream[ instance_id % Impl::CudaInternal::singleton().m_streamCount ]
: 0 )
{}
Cuda::Cuda(cudaStream_t stream)
: m_space_instance(new Impl::CudaInternal)
{
Impl::CudaInternal::singleton().verify_is_initialized( "Cuda instance constructor" );
m_space_instance->initialize(Impl::CudaInternal::singleton().m_cudaDev,stream);
}
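
With this constructor each Kokkos::Cuda instance wraps a caller-provided cudaStream_t, so work dispatched through distinct instances can overlap on the device. A usage sketch (assumption: n and functor are defined by the caller, and the stream outlives the instance):

cudaStream_t s;
cudaStreamCreate(&s);
{
  Kokkos::Cuda instance(s);        // execution space bound to stream s
  Kokkos::parallel_for(
      Kokkos::RangePolicy<Kokkos::Cuda>(instance, 0, n), functor);
  instance.fence();                // synchronizes stream s, not the device
}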
void Cuda::print_configuration( std::ostream & s , const bool )
{ Impl::CudaInternal::singleton().print_configuration( s ); }
@ -823,13 +789,27 @@ bool Cuda::sleep() { return false ; }
bool Cuda::wake() { return true ; }
#endif
void Cuda::fence()
void Cuda::impl_static_fence()
{
Kokkos::Impl::cuda_device_synchronize();
}
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
void Cuda::fence() {
impl_static_fence();
}
#else
void Cuda::fence() const {
m_space_instance->fence();
}
#endif
const char* Cuda::name() { return "Cuda"; }
cudaStream_t Cuda::cuda_stream() const { return m_space_instance->m_stream ; }
int Cuda::cuda_device() const { return m_space_instance->m_cudaDev ; }
} // namespace Kokkos
namespace Kokkos {

View File

@ -0,0 +1,156 @@
#ifndef KOKKOS_CUDA_INSTANCE_HPP_
#define KOKKOS_CUDA_INSTANCE_HPP_
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
struct CudaTraits {
enum { WarpSize = 32 /* 0x0020 */ };
enum { WarpIndexMask = 0x001f /* Mask for warpindex */ };
enum { WarpIndexShift = 5 /* WarpSize == 1 << WarpShift */ };
enum { ConstantMemoryUsage = 0x008000 /* 32k bytes */ };
enum { ConstantMemoryCache = 0x002000 /* 8k bytes */ };
enum { KernelArgumentLimit = 0x001000 /* 4k bytes */ };
typedef unsigned long
ConstantGlobalBufferType[ ConstantMemoryUsage / sizeof(unsigned long) ];
#if defined(KOKKOS_ARCH_VOLTA) || \
defined(KOKKOS_ARCH_PASCAL)
enum { ConstantMemoryUseThreshold = 0x000200 /* 512 bytes */ };
#else
enum { ConstantMemoryUseThreshold = 0x000200 /* 512 bytes */ };
#endif
KOKKOS_INLINE_FUNCTION static
CudaSpace::size_type warp_count( CudaSpace::size_type i )
{ return ( i + WarpIndexMask ) >> WarpIndexShift ; }
KOKKOS_INLINE_FUNCTION static
CudaSpace::size_type warp_align( CudaSpace::size_type i )
{
enum { Mask = ~CudaSpace::size_type( WarpIndexMask ) };
return ( i + WarpIndexMask ) & Mask ;
}
};
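
warp_count rounds a thread count up to whole warps and warp_align pads it to a multiple of the warp size. A quick worked check with illustrative values:

static_assert( ((33 + 31) >> 5) == 2,    "33 threads occupy 2 warps" );
static_assert( ((33 + 31) & ~31u) == 64, "33 threads align up to 64" );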
//----------------------------------------------------------------------------
CudaSpace::size_type cuda_internal_multiprocessor_count();
CudaSpace::size_type cuda_internal_maximum_warp_count();
CudaSpace::size_type cuda_internal_maximum_grid_count();
CudaSpace::size_type cuda_internal_maximum_shared_words();
CudaSpace::size_type cuda_internal_maximum_concurrent_block_count();
CudaSpace::size_type * cuda_internal_scratch_flags( const Cuda&, const CudaSpace::size_type size );
CudaSpace::size_type * cuda_internal_scratch_space( const Cuda&, const CudaSpace::size_type size );
CudaSpace::size_type * cuda_internal_scratch_unified( const Cuda&, const CudaSpace::size_type size );
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
class CudaInternal {
private:
CudaInternal( const CudaInternal & );
CudaInternal & operator = ( const CudaInternal & );
public:
typedef Cuda::size_type size_type ;
int m_cudaDev ;
// Device Properties
int m_cudaArch ;
unsigned m_multiProcCount ;
unsigned m_maxWarpCount ;
unsigned m_maxBlock ;
unsigned m_maxSharedWords ;
uint32_t m_maxConcurrency ;
int m_shmemPerSM ;
int m_maxShmemPerBlock ;
int m_regsPerSM ;
int m_maxBlocksPerSM ;
int m_maxThreadsPerSM ;
int m_maxThreadsPerBlock ;
mutable size_type m_scratchSpaceCount ;
mutable size_type m_scratchFlagsCount ;
mutable size_type m_scratchUnifiedCount ;
mutable size_type m_scratchFunctorSize ;
size_type m_scratchUnifiedSupported ;
size_type m_streamCount ;
mutable size_type * m_scratchSpace ;
mutable size_type * m_scratchFlags ;
mutable size_type * m_scratchUnified ;
mutable size_type * m_scratchFunctor ;
uint32_t * m_scratchConcurrentBitset ;
cudaStream_t m_stream ;
static int was_initialized;
static int was_finalized;
static CudaInternal & singleton();
int verify_is_initialized( const char * const label ) const ;
int is_initialized() const
{ return 0 != m_scratchSpace && 0 != m_scratchFlags ; }
void initialize( int cuda_device_id , cudaStream_t stream = 0 );
void finalize();
void print_configuration( std::ostream & ) const ;
void fence() const ;
~CudaInternal();
CudaInternal()
: m_cudaDev( -1 )
, m_cudaArch( -1 )
, m_multiProcCount( 0 )
, m_maxWarpCount( 0 )
, m_maxBlock( 0 )
, m_maxSharedWords( 0 )
, m_maxConcurrency( 0 )
, m_shmemPerSM( 0 )
, m_maxShmemPerBlock( 0 )
, m_regsPerSM( 0 )
, m_maxBlocksPerSM( 0 )
, m_maxThreadsPerSM( 0 )
, m_maxThreadsPerBlock( 0 )
, m_scratchSpaceCount( 0 )
, m_scratchFlagsCount( 0 )
, m_scratchUnifiedCount( 0 )
, m_scratchFunctorSize( 0 )
, m_scratchUnifiedSupported( 0 )
, m_streamCount( 0 )
, m_scratchSpace( 0 )
, m_scratchFlags( 0 )
, m_scratchUnified( 0 )
, m_scratchFunctor( 0 )
, m_scratchConcurrentBitset( 0 )
, m_stream( 0 )
{}
size_type * scratch_space( const size_type size ) const ;
size_type * scratch_flags( const size_type size ) const ;
size_type * scratch_unified( const size_type size ) const ;
size_type * scratch_functor( const size_type size ) const ;
};
} // Namespace Impl
} // Namespace Kokkos
#endif

View File

@ -0,0 +1,579 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CUDAEXEC_HPP
#define KOKKOS_CUDAEXEC_HPP
#include <Kokkos_Macros.hpp>
#ifdef KOKKOS_ENABLE_CUDA
#include <string>
#include <cstdint>
#include <Kokkos_Parallel.hpp>
#include <impl/Kokkos_Error.hpp>
#include <Cuda/Kokkos_Cuda_abort.hpp>
#include <Cuda/Kokkos_Cuda_Error.hpp>
#include <Cuda/Kokkos_Cuda_Locks.hpp>
#include <Cuda/Kokkos_Cuda_Instance.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#if defined( __CUDACC__ )
/** \brief Access to constant memory on the device */
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
__device__ __constant__
extern unsigned long kokkos_impl_cuda_constant_memory_buffer[] ;
#else
__device__ __constant__
unsigned long kokkos_impl_cuda_constant_memory_buffer[ Kokkos::Impl::CudaTraits::ConstantMemoryUsage / sizeof(unsigned long) ] ;
#endif
namespace Kokkos {
namespace Impl {
void* cuda_resize_scratch_space(std::int64_t bytes, bool force_shrink = false);
}
}
template< typename T >
inline
__device__
T * kokkos_impl_cuda_shared_memory()
{ extern __shared__ Kokkos::CudaSpace::size_type sh[]; return (T*) sh ; }
namespace Kokkos {
namespace Impl {
//----------------------------------------------------------------------------
// See section B.17 of Cuda C Programming Guide Version 3.2
// for discussion of
// __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
// function qualifier which could be used to improve performance.
//----------------------------------------------------------------------------
// Maximize L1 cache and minimize shared memory:
// cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1 );
// For 2.0 capability: 48 KB L1 and 16 KB shared
//----------------------------------------------------------------------------
template< class DriverType>
__global__
static void cuda_parallel_launch_constant_memory()
{
const DriverType & driver =
*((const DriverType *) kokkos_impl_cuda_constant_memory_buffer );
driver();
}
template< class DriverType, unsigned int maxTperB, unsigned int minBperSM >
__global__
__launch_bounds__(maxTperB, minBperSM)
static void cuda_parallel_launch_constant_memory()
{
const DriverType & driver =
*((const DriverType *) kokkos_impl_cuda_constant_memory_buffer );
driver();
}
template< class DriverType>
__global__
static void cuda_parallel_launch_local_memory( const DriverType driver )
{
driver();
}
template< class DriverType, unsigned int maxTperB, unsigned int minBperSM >
__global__
__launch_bounds__(maxTperB, minBperSM)
static void cuda_parallel_launch_local_memory( const DriverType driver )
{
driver();
}
template< class DriverType>
__global__
static void cuda_parallel_launch_global_memory( const DriverType* driver )
{
driver->operator()();
}
template< class DriverType, unsigned int maxTperB, unsigned int minBperSM >
__global__
__launch_bounds__(maxTperB, minBperSM)
static void cuda_parallel_launch_global_memory( const DriverType* driver )
{
driver->operator()();
}
template< class DriverType>
__global__
static void cuda_parallel_launch_constant_or_global_memory( const DriverType* driver_ptr )
{
const DriverType & driver = driver_ptr!=NULL ? *driver_ptr :
*((const DriverType *) kokkos_impl_cuda_constant_memory_buffer );
driver();
}
template< class DriverType, unsigned int maxTperB, unsigned int minBperSM >
__global__
__launch_bounds__(maxTperB, minBperSM)
static void cuda_parallel_launch_constant_or_global_memory( const DriverType* driver_ptr )
{
const DriverType & driver = driver_ptr!=NULL ? *driver_ptr :
*((const DriverType *) kokkos_impl_cuda_constant_memory_buffer );
driver();
}
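
The kernels above cover the three launch paths: local memory passes the closure by value as a kernel argument, constant memory reads it from the __constant__ buffer, and global memory dereferences a device pointer. A sketch of the constant-memory handoff, simplified from the CudaParallelLaunch specializations below (assumption: error checking and cache-config calls omitted):

template <class Driver>
void launch_via_constant_memory(const Driver& d, dim3 grid, dim3 block,
                                int shmem, cudaStream_t stream) {
  // Stage the closure into the __constant__ buffer declared above...
  cudaMemcpyToSymbolAsync(kokkos_impl_cuda_constant_memory_buffer, &d,
                          sizeof(Driver), 0, cudaMemcpyHostToDevice, stream);
  // ...then launch the kernel that reinterprets that buffer as the closure.
  cuda_parallel_launch_constant_memory<Driver>
      <<<grid, block, shmem, stream>>>();
}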
template< class DriverType >
struct DeduceCudaLaunchMechanism {
constexpr static const Kokkos::Experimental::WorkItemProperty::HintLightWeight_t light_weight = Kokkos::Experimental::WorkItemProperty::HintLightWeight;
constexpr static const Kokkos::Experimental::WorkItemProperty::HintHeavyWeight_t heavy_weight = Kokkos::Experimental::WorkItemProperty::HintHeavyWeight ;
constexpr static const typename DriverType::Policy::work_item_property property = typename DriverType::Policy::work_item_property();
static constexpr const Experimental::CudaLaunchMechanism valid_launch_mechanism =
// BuildValidMask
(sizeof(DriverType)<CudaTraits::KernelArgumentLimit?
Experimental::CudaLaunchMechanism::LocalMemory:Experimental::CudaLaunchMechanism::Default)|
(sizeof(DriverType)<CudaTraits::ConstantMemoryUsage?
Experimental::CudaLaunchMechanism::ConstantMemory:Experimental::CudaLaunchMechanism::Default)|
Experimental::CudaLaunchMechanism::GlobalMemory;
static constexpr const Experimental::CudaLaunchMechanism requested_launch_mechanism =
(((property&light_weight)==light_weight)?
Experimental::CudaLaunchMechanism::LocalMemory :
Experimental::CudaLaunchMechanism::ConstantMemory)
| Experimental::CudaLaunchMechanism::GlobalMemory;
static constexpr const Experimental::CudaLaunchMechanism default_launch_mechanism =
// BuildValidMask
(sizeof(DriverType)<CudaTraits::ConstantMemoryUseThreshold)?
Experimental::CudaLaunchMechanism::LocalMemory:(
(sizeof(DriverType)<CudaTraits::ConstantMemoryUsage)?
Experimental::CudaLaunchMechanism::ConstantMemory:
Experimental::CudaLaunchMechanism::GlobalMemory);
//               None             LightWeight      HeavyWeight
// F<UseT        LCG  LCG  L  L   LCG  LG  L  L    LCG  CG  L  C
// UseT<F<KAL    LCG  LCG  C  C   LCG  LG  C  L    LCG  CG  C  C
// KAL<F<CMU     CG   LCG  C  C   CG   LG  C  G    CG   CG  C  C
// CMU<F         G    LCG  G  G   G    LG  G  G    G    CG  G  G
static constexpr const Experimental::CudaLaunchMechanism launch_mechanism =
((property&light_weight)==light_weight)?
(sizeof(DriverType)<CudaTraits::KernelArgumentLimit?
Experimental::CudaLaunchMechanism::LocalMemory:
Experimental::CudaLaunchMechanism::GlobalMemory):(
((property&heavy_weight)==heavy_weight)?
(sizeof(DriverType)<CudaTraits::ConstantMemoryUsage?
Experimental::CudaLaunchMechanism::ConstantMemory:
Experimental::CudaLaunchMechanism::GlobalMemory):
(default_launch_mechanism)
);
};
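
In the table, rows bin the functor size against ConstantMemoryUseThreshold (UseT), KernelArgumentLimit (KAL), and ConstantMemoryUsage (CMU); L, C, and G denote the local-, constant-, and global-memory mechanisms. The default choice reduces to a three-way size test, restated here as an illustrative helper (default_mechanism_for is not a Kokkos function):

constexpr Experimental::CudaLaunchMechanism
default_mechanism_for(std::size_t functor_size) {
  return functor_size < CudaTraits::ConstantMemoryUseThreshold
       ? Experimental::CudaLaunchMechanism::LocalMemory      // < 512 bytes
       : functor_size < CudaTraits::ConstantMemoryUsage
       ? Experimental::CudaLaunchMechanism::ConstantMemory   // < 32 KB
       : Experimental::CudaLaunchMechanism::GlobalMemory;    // otherwise
}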
// Use local memory up to ConstantMemoryUseThreshold
// Use global memory above ConstantMemoryUsage
// In between use ConstantMemory
template < class DriverType
, class LaunchBounds = Kokkos::LaunchBounds<>
, Experimental::CudaLaunchMechanism LaunchMechanism =
DeduceCudaLaunchMechanism<DriverType>::launch_mechanism >
struct CudaParallelLaunch ;
template < class DriverType
, unsigned int MaxThreadsPerBlock
, unsigned int MinBlocksPerSM>
struct CudaParallelLaunch< DriverType
, Kokkos::LaunchBounds< MaxThreadsPerBlock
, MinBlocksPerSM >
, Experimental::CudaLaunchMechanism::ConstantMemory>
{
static_assert(sizeof(DriverType)<CudaTraits::ConstantMemoryUsage,"Kokkos Error: Requested CudaLaunchConstantMemory with a Functor larger than 32kB.");
inline
CudaParallelLaunch( const DriverType & driver
, const dim3 & grid
, const dim3 & block
, const int shmem
, const CudaInternal* cuda_instance
, const bool prefer_shmem )
{
if ( (grid.x != 0) && ( ( block.x * block.y * block.z ) != 0 ) ) {
// Fence before changing settings and copying closure
Kokkos::Cuda().fence();
if ( cuda_instance->m_maxShmemPerBlock < shmem ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
}
#ifndef KOKKOS_ARCH_KEPLER
// On Kepler the L1 has no benefit since it doesn't cache reads
else {
CUDA_SAFE_CALL(
cudaFuncSetCacheConfig
( cuda_parallel_launch_constant_memory
< DriverType, MaxThreadsPerBlock, MinBlocksPerSM >
, ( prefer_shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1 )
) );
}
#endif
// Copy functor to constant memory on the device
cudaMemcpyToSymbolAsync(
kokkos_impl_cuda_constant_memory_buffer, &driver, sizeof(DriverType), 0, cudaMemcpyHostToDevice, cudaStream_t(cuda_instance->m_stream));
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
// Invoke the driver function on the device
cuda_parallel_launch_constant_memory
< DriverType, MaxThreadsPerBlock, MinBlocksPerSM >
<<< grid , block , shmem , cuda_instance->m_stream >>>();
#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
CUDA_SAFE_CALL( cudaGetLastError() );
Kokkos::Cuda().fence();
#endif
}
}
static cudaFuncAttributes get_cuda_func_attributes() {
cudaFuncAttributes attr;
cudaFuncGetAttributes(&attr,cuda_parallel_launch_constant_memory
< DriverType, MaxThreadsPerBlock, MinBlocksPerSM >);
return attr;
}
};
template < class DriverType>
struct CudaParallelLaunch< DriverType
, Kokkos::LaunchBounds<0,0>
, Experimental::CudaLaunchMechanism::ConstantMemory >
{
static_assert(sizeof(DriverType)<CudaTraits::ConstantMemoryUsage,"Kokkos Error: Requested CudaLaunchConstantMemory with a Functor larger than 32kB.");
inline
CudaParallelLaunch( const DriverType & driver
, const dim3 & grid
, const dim3 & block
, const int shmem
, const CudaInternal* cuda_instance
, const bool prefer_shmem )
{
if ( (grid.x != 0) && ( ( block.x * block.y * block.z ) != 0 ) ) {
// Fence before changing settings and copying closure
Kokkos::Cuda().fence();
if ( cuda_instance->m_maxShmemPerBlock < shmem ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
}
#ifndef KOKKOS_ARCH_KEPLER
// On Kepler the L1 has no benefit since it doesn't cache reads
else {
CUDA_SAFE_CALL(
cudaFuncSetCacheConfig
( cuda_parallel_launch_constant_memory< DriverType >
, ( prefer_shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1 )
) );
}
#endif
// Copy functor to constant memory on the device
cudaMemcpyToSymbolAsync(
kokkos_impl_cuda_constant_memory_buffer, &driver, sizeof(DriverType), 0, cudaMemcpyHostToDevice, cudaStream_t(cuda_instance->m_stream));
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
// Invoke the driver function on the device
cuda_parallel_launch_constant_memory< DriverType >
<<< grid , block , shmem , cuda_instance->m_stream >>>();
#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
CUDA_SAFE_CALL( cudaGetLastError() );
Kokkos::Cuda().fence();
#endif
}
}
static cudaFuncAttributes get_cuda_func_attributes() {
cudaFuncAttributes attr;
cudaFuncGetAttributes(&attr,cuda_parallel_launch_constant_memory
< DriverType >);
return attr;
}
};
template < class DriverType
, unsigned int MaxThreadsPerBlock
, unsigned int MinBlocksPerSM >
struct CudaParallelLaunch< DriverType
, Kokkos::LaunchBounds< MaxThreadsPerBlock
, MinBlocksPerSM >
, Experimental::CudaLaunchMechanism::LocalMemory >
{
static_assert(sizeof(DriverType)<CudaTraits::KernelArgumentLimit,"Kokkos Error: Requested CudaLaunchLocalMemory with a Functor larger than 4096 bytes.");
inline
CudaParallelLaunch( const DriverType & driver
, const dim3 & grid
, const dim3 & block
, const int shmem
, const CudaInternal* cuda_instance
, const bool prefer_shmem )
{
if ( (grid.x != 0) && ( ( block.x * block.y * block.z ) != 0 ) ) {
if ( cuda_instance->m_maxShmemPerBlock < shmem ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
}
#ifndef KOKKOS_ARCH_KEPLER
// On Kepler the L1 has no benefit since it doesn't cache reads
else {
CUDA_SAFE_CALL(
cudaFuncSetCacheConfig
( cuda_parallel_launch_local_memory
< DriverType, MaxThreadsPerBlock, MinBlocksPerSM >
, ( prefer_shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1 )
) );
}
#endif
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
// Invoke the driver function on the device
cuda_parallel_launch_local_memory
< DriverType, MaxThreadsPerBlock, MinBlocksPerSM >
<<< grid , block , shmem , cuda_instance->m_stream >>>( driver );
#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
CUDA_SAFE_CALL( cudaGetLastError() );
Kokkos::Cuda().fence();
#endif
}
}
static cudaFuncAttributes get_cuda_func_attributes() {
cudaFuncAttributes attr;
cudaFuncGetAttributes(&attr,cuda_parallel_launch_local_memory
< DriverType, MaxThreadsPerBlock, MinBlocksPerSM >);
return attr;
}
};
template < class DriverType>
struct CudaParallelLaunch< DriverType
, Kokkos::LaunchBounds<0,0>
, Experimental::CudaLaunchMechanism::LocalMemory >
{
static_assert(sizeof(DriverType)<CudaTraits::KernelArgumentLimit,"Kokkos Error: Requested CudaLaunchLocalMemory with a Functor larger than 4096 bytes.");
inline
CudaParallelLaunch( const DriverType & driver
, const dim3 & grid
, const dim3 & block
, const int shmem
, const CudaInternal* cuda_instance
, const bool prefer_shmem)
{
if ( (grid.x != 0) && ( ( block.x * block.y * block.z ) != 0 ) ) {
if ( cuda_instance->m_maxShmemPerBlock < shmem ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
}
#ifndef KOKKOS_ARCH_KEPLER
// On Kepler the L1 has no benefit since it doesn't cache reads
else {
CUDA_SAFE_CALL(
cudaFuncSetCacheConfig
( cuda_parallel_launch_local_memory< DriverType >
, ( prefer_shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1 )
) );
}
#endif
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
// Invoke the driver function on the device
cuda_parallel_launch_local_memory< DriverType >
<<< grid , block , shmem , cuda_instance->m_stream >>>( driver );
#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
CUDA_SAFE_CALL( cudaGetLastError() );
Kokkos::Cuda().fence();
#endif
}
}
static cudaFuncAttributes get_cuda_func_attributes() {
cudaFuncAttributes attr;
cudaFuncGetAttributes(&attr,cuda_parallel_launch_local_memory
< DriverType >);
return attr;
}
};
template < class DriverType
, unsigned int MaxThreadsPerBlock
, unsigned int MinBlocksPerSM>
struct CudaParallelLaunch< DriverType
, Kokkos::LaunchBounds< MaxThreadsPerBlock
, MinBlocksPerSM>
, Experimental::CudaLaunchMechanism::GlobalMemory >
{
inline
CudaParallelLaunch( const DriverType & driver
, const dim3 & grid
, const dim3 & block
, const int shmem
, CudaInternal* cuda_instance
, const bool prefer_shmem )
{
if ( (grid.x != 0) && ( ( block.x * block.y * block.z ) != 0 ) ) {
if ( cuda_instance->m_maxShmemPerBlock < shmem ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
}
#ifndef KOKKOS_ARCH_KEPLER
// On Kepler the L1 has no benefit since it doesn't cache reads
else {
CUDA_SAFE_CALL(
cudaFuncSetCacheConfig
( cuda_parallel_launch_global_memory
< DriverType, MaxThreadsPerBlock, MinBlocksPerSM >
, ( prefer_shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1 )
) );
}
#endif
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
DriverType* driver_ptr = NULL;
driver_ptr = reinterpret_cast<DriverType*>(cuda_instance->scratch_functor(sizeof(DriverType)));
cudaMemcpyAsync(driver_ptr,&driver, sizeof(DriverType), cudaMemcpyDefault, cuda_instance->m_stream);
// Invoke the driver function on the device
cuda_parallel_launch_global_memory
< DriverType, MaxThreadsPerBlock, MinBlocksPerSM >
<<< grid , block , shmem , cuda_instance->m_stream >>>( driver_ptr );
#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
CUDA_SAFE_CALL( cudaGetLastError() );
Kokkos::Cuda().fence();
#endif
}
}
static cudaFuncAttributes get_cuda_func_attributes() {
cudaFuncAttributes attr;
cudaFuncGetAttributes(&attr,cuda_parallel_launch_global_memory
< DriverType, MaxThreadsPerBlock, MinBlocksPerSM >);
return attr;
}
};
template < class DriverType>
struct CudaParallelLaunch< DriverType
, Kokkos::LaunchBounds<0,0>
, Experimental::CudaLaunchMechanism::GlobalMemory >
{
inline
CudaParallelLaunch( const DriverType & driver
, const dim3 & grid
, const dim3 & block
, const int shmem
, CudaInternal* cuda_instance
, const bool prefer_shmem)
{
if ( (grid.x != 0) && ( ( block.x * block.y * block.z ) != 0 ) ) {
if ( cuda_instance->m_maxShmemPerBlock < shmem ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
}
#ifndef KOKKOS_ARCH_KEPLER
// On Kepler the L1 has no benefit since it doesn't cache reads
else {
CUDA_SAFE_CALL(
cudaFuncSetCacheConfig
( cuda_parallel_launch_global_memory< DriverType >
, ( prefer_shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1 )
) );
}
#endif
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
DriverType* driver_ptr = NULL;
driver_ptr = reinterpret_cast<DriverType*>(cuda_instance->scratch_functor(sizeof(DriverType)));
cudaMemcpyAsync(driver_ptr,&driver, sizeof(DriverType), cudaMemcpyDefault, cuda_instance->m_stream);
cuda_parallel_launch_global_memory< DriverType >
<<< grid , block , shmem , cuda_instance->m_stream >>>( driver_ptr );
#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
CUDA_SAFE_CALL( cudaGetLastError() );
Kokkos::Cuda().fence();
#endif
}
}
static cudaFuncAttributes get_cuda_func_attributes() {
cudaFuncAttributes attr;
cudaFuncGetAttributes(&attr,cuda_parallel_launch_global_memory
< DriverType >);
return attr;
}
};
//----------------------------------------------------------------------------
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* defined( __CUDACC__ ) */
#endif /* defined( KOKKOS_ENABLE_CUDA ) */
#endif /* #ifndef KOKKOS_CUDAEXEC_HPP */

File diff suppressed because it is too large

View File

@ -376,13 +376,13 @@ template< class ReducerType >
__device__ inline
typename std::enable_if< Kokkos::is_reducer<ReducerType>::value >::type
cuda_intra_warp_reduction( const ReducerType& reducer,
typename ReducerType::value_type& result,
const uint32_t max_active_thread = blockDim.y) {
typedef typename ReducerType::value_type ValueType;
unsigned int shift = 1;
ValueType result = reducer.reference();
//Reduce over values from threads with different threadIdx.y
while(blockDim.x * shift < 32 ) {
const ValueType tmp = shfl_down(result, blockDim.x*shift,32u);
@ -400,6 +400,7 @@ template< class ReducerType >
__device__ inline
typename std::enable_if< Kokkos::is_reducer<ReducerType>::value >::type
cuda_inter_warp_reduction( const ReducerType& reducer,
typename ReducerType::value_type value,
const int max_active_thread = blockDim.y) {
typedef typename ReducerType::value_type ValueType;
@ -410,7 +411,6 @@ cuda_inter_warp_reduction( const ReducerType& reducer,
// could lead to race conditions
__shared__ double sh_result[(sizeof(ValueType)+7)/8*STEP_WIDTH];
ValueType* result = (ValueType*) & sh_result;
ValueType value = reducer.reference();
const int step = 32 / blockDim.x;
int shift = STEP_WIDTH;
const int id = threadIdx.y%step==0?threadIdx.y/step:65000;
@ -438,9 +438,18 @@ template< class ReducerType >
__device__ inline
typename std::enable_if< Kokkos::is_reducer<ReducerType>::value >::type
cuda_intra_block_reduction( const ReducerType& reducer,
typename ReducerType::value_type value,
const int max_active_thread = blockDim.y) {
cuda_intra_warp_reduction(reducer,max_active_thread);
cuda_inter_warp_reduction(reducer,max_active_thread);
cuda_intra_warp_reduction(reducer,value,max_active_thread);
cuda_inter_warp_reduction(reducer,value,max_active_thread);
}
template< class ReducerType >
__device__ inline
typename std::enable_if< Kokkos::is_reducer<ReducerType>::value >::type
cuda_intra_block_reduction( const ReducerType& reducer,
const int max_active_thread = blockDim.y) {
cuda_intra_block_reduction(reducer,reducer.reference(),max_active_thread);
}
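
The overloads above layer a shuffle-based intra-warp pass under a shared-memory inter-warp pass. A minimal free-standing sketch of the shuffle step (assumption: a full-warp sum using CUDA 9+ __shfl_down_sync, where the Kokkos code uses its own shfl_down wrapper):

__device__ inline float warp_sum(float v) {
  // Halve the stride each round; after 5 rounds lane 0 holds the total.
  for (int offset = 16; offset > 0; offset >>= 1)
    v += __shfl_down_sync(0xffffffffu, v, offset);
  return v;
}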
template< class ReducerType>

View File

@ -54,194 +54,8 @@
namespace Kokkos {
namespace Impl {
template class TaskQueue< Kokkos::Cuda > ;
//----------------------------------------------------------------------------
__device__
void TaskQueueSpecialization< Kokkos::Cuda >::driver
( TaskQueueSpecialization< Kokkos::Cuda >::queue_type * const queue
, int32_t shmem_per_warp )
{
using Member = TaskExec< Kokkos::Cuda > ;
using Queue = TaskQueue< Kokkos::Cuda > ;
using task_root_type = TaskBase< void , void , void > ;
extern __shared__ int32_t shmem_all[];
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
int32_t * const warp_shmem =
shmem_all + ( threadIdx.z * shmem_per_warp ) / sizeof(int32_t);
task_root_type * const task_shmem = (task_root_type *) warp_shmem ;
const int warp_lane = threadIdx.x + threadIdx.y * blockDim.x ;
Member single_exec( warp_shmem , 1 );
Member team_exec( warp_shmem , blockDim.y );
task_root_type * task_ptr ;
// Loop until all queues are empty and no tasks in flight
do {
// Each team lead attempts to acquire either a thread team task
// or collection of single thread tasks for the team.
if ( 0 == warp_lane ) {
task_ptr = 0 < *((volatile int *) & queue->m_ready_count) ? end : 0 ;
// Loop by priority and then type
for ( int i = 0 ; i < Queue::NumQueue && end == task_ptr ; ++i ) {
for ( int j = 0 ; j < 2 && end == task_ptr ; ++j ) {
task_ptr = Queue::pop_ready_task( & queue->m_ready[i][j] );
}
}
#if 0
printf("TaskQueue<Cuda>::driver(%d,%d) task(%lx)\n",threadIdx.z,blockIdx.x
, uintptr_t(task_ptr));
#endif
}
// Synchronize warp with memory fence before broadcasting task pointer:
// KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "A" );
KOKKOS_IMPL_CUDA_SYNCWARP ;
// Broadcast task pointer:
((int*) & task_ptr )[0] = KOKKOS_IMPL_CUDA_SHFL( ((int*) & task_ptr )[0] , 0 , 32 );
((int*) & task_ptr )[1] = KOKKOS_IMPL_CUDA_SHFL( ((int*) & task_ptr )[1] , 0 , 32 );
#if defined( KOKKOS_DEBUG )
KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "TaskQueue CUDA task_ptr" );
#endif
if ( 0 == task_ptr ) break ; // 0 == queue->m_ready_count
if ( end != task_ptr ) {
// Whole warp copy task's closure to/from shared memory.
// Use all threads of warp for coalesced read/write.
int32_t const b = sizeof(task_root_type) / sizeof(int32_t);
int32_t const e = *((int32_t volatile *)( & task_ptr->m_alloc_size )) / sizeof(int32_t);
int32_t volatile * const task_mem = (int32_t volatile *) task_ptr ;
// copy task closure from global to shared memory:
for ( int32_t i = warp_lane ; i < e ; i += CudaTraits::WarpSize ) {
warp_shmem[i] = task_mem[i] ;
}
// Synchronize threads of the warp and ensure memory
// writes are visible to all threads in the warp.
// KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "B" );
KOKKOS_IMPL_CUDA_SYNCWARP ;
if ( task_root_type::TaskTeam == task_shmem->m_task_type ) {
// Thread Team Task
(*task_shmem->m_apply)( task_shmem , & team_exec );
}
else if ( 0 == threadIdx.y ) {
// Single Thread Task
(*task_shmem->m_apply)( task_shmem , & single_exec );
}
// Synchronize threads of the warp and ensure memory
// writes are visible to all threads in the warp.
// KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "C" );
KOKKOS_IMPL_CUDA_SYNCWARP ;
// copy task closure from shared to global memory:
for ( int32_t i = b + warp_lane ; i < e ; i += CudaTraits::WarpSize ) {
task_mem[i] = warp_shmem[i] ;
}
// Synchronize threads of the warp and ensure memory
// writes are visible to root thread of the warp for
// respawn or completion.
// KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "D" );
KOKKOS_IMPL_CUDA_SYNCWARP ;
// If respawn requested copy respawn data back to main memory
if ( 0 == warp_lane ) {
if ( ((task_root_type *) task_root_type::LockTag) != task_shmem->m_next ) {
( (volatile task_root_type *) task_ptr )->m_next = task_shmem->m_next ;
( (volatile task_root_type *) task_ptr )->m_priority = task_shmem->m_priority ;
}
queue->complete( task_ptr );
}
}
} while(1);
}
namespace {
__global__
void cuda_task_queue_execute( TaskQueue< Kokkos::Cuda > * queue
, int32_t shmem_size )
{ TaskQueueSpecialization< Kokkos::Cuda >::driver( queue , shmem_size ); }
}
void TaskQueueSpecialization< Kokkos::Cuda >::execute
( TaskQueue< Kokkos::Cuda > * const queue )
{
const int shared_per_warp = 2048 ;
const int warps_per_block = 4 ;
const dim3 grid( Kokkos::Impl::cuda_internal_multiprocessor_count() , 1 , 1 );
const dim3 block( 1 , Kokkos::Impl::CudaTraits::WarpSize , warps_per_block );
const int shared_total = shared_per_warp * warps_per_block ;
const cudaStream_t stream = 0 ;
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
#if 0
printf("cuda_task_queue_execute before\n");
#endif
// Query the stack size, in bytes:
size_t previous_stack_size = 0 ;
CUDA_SAFE_CALL( cudaDeviceGetLimit( & previous_stack_size , cudaLimitStackSize ) );
// If not large enough then set the stack size, in bytes:
const size_t larger_stack_size = 2048 ;
if ( previous_stack_size < larger_stack_size ) {
CUDA_SAFE_CALL( cudaDeviceSetLimit( cudaLimitStackSize , larger_stack_size ) );
}
cuda_task_queue_execute<<< grid , block , shared_total , stream >>>( queue , shared_per_warp );
CUDA_SAFE_CALL( cudaGetLastError() );
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
if ( previous_stack_size < larger_stack_size ) {
CUDA_SAFE_CALL( cudaDeviceSetLimit( cudaLimitStackSize , previous_stack_size ) );
}
#if 0
printf("cuda_task_queue_execute after\n");
#endif
}
template class TaskQueue< Kokkos::Cuda, Impl::default_tasking_memory_space_for_execution_space_t<Kokkos::Cuda> > ;
template class TaskQueueMultiple< Kokkos::Cuda, Impl::default_tasking_memory_space_for_execution_space_t<Kokkos::Cuda> > ;
}} /* namespace Kokkos::Impl */

View File

@ -50,6 +50,14 @@
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#include <Kokkos_Core_fwd.hpp>
#include <impl/Kokkos_TaskBase.hpp>
#include <Cuda/Kokkos_Cuda_Error.hpp> // CUDA_SAFE_CALL
#include <impl/Kokkos_TaskTeamMember.hpp>
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
namespace {
@ -57,54 +65,498 @@ namespace {
template< typename TaskType >
__global__
void set_cuda_task_base_apply_function_pointer
( TaskBase<void,void,void>::function_type * ptr )
{ *ptr = TaskType::apply ; }
( typename TaskType::function_type * ptr, typename TaskType::destroy_type* dtor )
{
*ptr = TaskType::apply;
*dtor = TaskType::destroy;
}
template< typename Scheduler >
__global__
void cuda_task_queue_execute( Scheduler scheduler, int32_t shmem_size ) {
TaskQueueSpecialization< Scheduler >::driver( std::move(scheduler) , shmem_size );
}
}
template< class > class TaskExec ;
template <class, class> class TaskExec ;
template<>
class TaskQueueSpecialization< Kokkos::Cuda >
template<class QueueType>
class TaskQueueSpecialization<
SimpleTaskScheduler<Kokkos::Cuda, QueueType>
>
{
public:
using execution_space = Kokkos::Cuda ;
using memory_space = Kokkos::CudaUVMSpace ;
using queue_type = TaskQueue< execution_space > ;
using member_type = TaskExec< Kokkos::Cuda > ;
using scheduler_type = SimpleTaskScheduler<Kokkos::Cuda, QueueType>;
using execution_space = Kokkos::Cuda;
using memory_space = Kokkos::CudaUVMSpace;
using member_type = TaskExec<Kokkos::Cuda, scheduler_type> ;
enum : long { max_league_size = 16 };
enum : int { warps_per_block = 4 };
KOKKOS_INLINE_FUNCTION
static
void iff_single_thread_recursive_execute( queue_type * const ) {}
void iff_single_thread_recursive_execute( scheduler_type const& ) {}
static int get_max_team_count(
execution_space const&
) {
return Kokkos::Impl::cuda_internal_multiprocessor_count() * warps_per_block;
}
__device__
static void driver( queue_type * const , int32_t );
static void driver(scheduler_type scheduler, int32_t shmem_per_warp)
{
using queue_type = typename scheduler_type::task_queue_type;
using task_base_type = typename scheduler_type::task_base_type;
using runnable_task_base_type = typename scheduler_type::runnable_task_base_type;
using scheduling_info_storage_type =
SchedulingInfoStorage<
runnable_task_base_type,
typename scheduler_type::task_scheduling_info_type
>;
extern __shared__ int32_t shmem_all[];
int32_t* const warp_shmem = shmem_all + (threadIdx.z * shmem_per_warp) / sizeof(int32_t);
task_base_type* const shared_memory_task_copy = (task_base_type*)warp_shmem;
const int warp_lane = threadIdx.x + threadIdx.y * blockDim.x;
member_type single_exec(scheduler, warp_shmem, 1);
member_type team_exec(scheduler, warp_shmem, blockDim.y);
auto& queue = scheduler.queue();
auto& team_scheduler = team_exec.scheduler();
auto current_task = OptionalRef<task_base_type>();
// Loop until all queues are empty and no tasks in flight
while(not queue.is_done()) {
if(warp_lane == 0) { // should be (?) same as team_exec.team_rank() == 0
// pop off a task
current_task = queue.pop_ready_task(team_scheduler.team_scheduler_info());
}
// Broadcast task pointer:
// Sync before the broadcast
KOKKOS_IMPL_CUDA_SYNCWARP;
// pretend it's an int* for shuffle purposes
((int*) &current_task)[0] = KOKKOS_IMPL_CUDA_SHFL(((int*) &current_task)[0], 0, 32);
((int*) &current_task)[1] = KOKKOS_IMPL_CUDA_SHFL(((int*) &current_task)[1], 0, 32);
if(current_task) {
KOKKOS_ASSERT(!current_task->as_runnable_task().get_respawn_flag());
int32_t b = sizeof(scheduling_info_storage_type) / sizeof(int32_t);
static_assert(
sizeof(scheduling_info_storage_type) % sizeof(int32_t) == 0,
"bad task size"
);
int32_t const e = current_task->get_allocation_size() / sizeof(int32_t);
KOKKOS_ASSERT(current_task->get_allocation_size() % sizeof(int32_t) == 0);
int32_t volatile* const task_mem = (int32_t volatile*)current_task.get();
// do a coordinated copy of the task closure from global to shared memory:
for(int32_t i = warp_lane; i < e; i += CudaTraits::WarpSize) {
warp_shmem[i] = task_mem[i];
}
// Synchronize threads of the warp and ensure memory
// writes are visible to all threads in the warp.
KOKKOS_IMPL_CUDA_SYNCWARP;
if(shared_memory_task_copy->is_team_runnable()) {
// Thread Team Task
shared_memory_task_copy->as_runnable_task().run(team_exec);
}
else if(threadIdx.y == 0) {
// TODO @tasking @optimization DSH Change this to warp_lane == 0 when we allow blockDim.x to be more than 1
// Single Thread Task
shared_memory_task_copy->as_runnable_task().run(single_exec);
}
// Synchronize threads of the warp and ensure memory
// writes are visible to all threads in the warp.
KOKKOS_IMPL_CUDA_SYNCWARP;
//if(warp_lane < b % CudaTraits::WarpSize) b += CudaTraits::WarpSize;
//b -= b % CudaTraits::WarpSize;
// copy task closure from shared to global memory:
for (int32_t i = b + warp_lane; i < e; i += CudaTraits::WarpSize) {
task_mem[i] = warp_shmem[i];
}
// Synchronize threads of the warp and ensure memory
// writes are visible to root thread of the warp for
// respawn or completion.
KOKKOS_IMPL_CUDA_SYNCWARP;
if (warp_lane == 0) {
// If respawn requested copy respawn data back to main memory
if(shared_memory_task_copy->as_runnable_task().get_respawn_flag()) {
if(shared_memory_task_copy->as_runnable_task().has_predecessor()) {
// It's not necessary to make this a volatile write because
// the next read of the predecessor is on this thread in complete,
// and the predecessor is cleared there (using a volatile write)
current_task->as_runnable_task().acquire_predecessor_from(
shared_memory_task_copy->as_runnable_task()
);
}
// It may not be necessary to make this a volatile write, since the
// next read will be done by this thread in complete where the
// rescheduling occurs, but since the task could be stolen later
// before this is written again, we should do the volatile write
// here. (It might not be necessary though because I don't know
// where else the priority would be read after it is scheduled
// by this thread; for now, we leave it volatile, but we should
// benchmark the cost of this.)
current_task.as_volatile()->set_priority(shared_memory_task_copy->get_priority());
// It's not necessary to make this a volatile write, since the
// next read of it (if true) will be by this thread in `complete()`,
// which will unset the flag (using volatile) once it has handled
// the respawn
current_task->as_runnable_task().set_respawn_flag();
}
queue.complete(
(*std::move(current_task)).as_runnable_task(),
team_scheduler.team_scheduler_info()
);
}
}
}
}
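The pointer broadcast above uses a standard warp idiom: the 64-bit task pointer is moved through two 32-bit shuffles because the wrapper macro shuffles one word per call. A minimal standalone sketch of the same idiom, assuming a full 32-lane warp and the raw CUDA 9+ __shfl_sync intrinsic instead of the Kokkos wrappers:

__device__ inline void* warp_broadcast_ptr(void* p, int src_lane) {
  // View the 64-bit pointer as two 32-bit words so each half fits
  // through a 32-bit warp shuffle.
  int2 words = *reinterpret_cast<int2*>(&p);
  words.x = __shfl_sync(0xffffffffu, words.x, src_lane);
  words.y = __shfl_sync(0xffffffffu, words.y, src_lane);
  *reinterpret_cast<int2*>(&p) = words;
  return p;  // every lane now holds src_lane's pointer
}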
static
void execute(scheduler_type const& scheduler)
{
const int shared_per_warp = 2048 ;
const dim3 grid(Kokkos::Impl::cuda_internal_multiprocessor_count(), 1, 1);
const dim3 block(1, Kokkos::Impl::CudaTraits::WarpSize, warps_per_block);
const int shared_total = shared_per_warp * warps_per_block;
const cudaStream_t stream = nullptr;
KOKKOS_ASSERT(
static_cast<long>(grid.x * grid.y * grid.z * block.x * block.y * block.z)
== static_cast<long>(get_max_team_count(scheduler.get_execution_space()) * Kokkos::Impl::CudaTraits::WarpSize)
);
auto& queue = scheduler.queue();
CUDA_SAFE_CALL(cudaDeviceSynchronize());
// Query the stack size, in bytes:
size_t previous_stack_size = 0;
CUDA_SAFE_CALL(cudaDeviceGetLimit(&previous_stack_size, cudaLimitStackSize));
// If not large enough then set the stack size, in bytes:
const size_t larger_stack_size = 1 << 11;
if (previous_stack_size < larger_stack_size) {
CUDA_SAFE_CALL(cudaDeviceSetLimit(cudaLimitStackSize, larger_stack_size));
}
cuda_task_queue_execute<<<grid, block, shared_total, stream>>>(scheduler, shared_per_warp);
CUDA_SAFE_CALL(cudaGetLastError());
CUDA_SAFE_CALL(cudaDeviceSynchronize());
if (previous_stack_size < larger_stack_size) {
CUDA_SAFE_CALL(cudaDeviceSetLimit(cudaLimitStackSize, previous_stack_size));
}
}
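The stack-limit juggling above exists because task bodies may call deeper than the default per-thread device stack allows; the limit is raised before the launch and restored afterwards so surrounding code is unaffected. Reduced to its skeleton (a sketch; launch_task_kernel is a hypothetical stand-in for the launch above):

size_t old_limit = 0;
CUDA_SAFE_CALL(cudaDeviceGetLimit(&old_limit, cudaLimitStackSize));
const size_t needed = 2048;  // bytes of stack per device thread
if (old_limit < needed) {
  CUDA_SAFE_CALL(cudaDeviceSetLimit(cudaLimitStackSize, needed));
}
launch_task_kernel();  // hypothetical kernel launch
CUDA_SAFE_CALL(cudaDeviceSynchronize());
if (old_limit < needed) {
  CUDA_SAFE_CALL(cudaDeviceSetLimit(cudaLimitStackSize, old_limit));  // restore
}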
template <typename TaskType>
static
// TODO @tasking @optimization DSH specialize this for trivially destructible types
void
get_function_pointer(
typename TaskType::function_type& ptr,
typename TaskType::destroy_type& dtor
)
{
using function_type = typename TaskType::function_type;
using destroy_type = typename TaskType::destroy_type;
// TODO @tasking @minor DSH make sure there aren't any alignment concerns?
void* storage = cuda_internal_scratch_unified(
Kokkos::Cuda(),
sizeof(function_type) + sizeof(destroy_type)
);
function_type* ptr_ptr = (function_type*)storage;
destroy_type* dtor_ptr = (destroy_type*)((char*)storage + sizeof(function_type));
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
set_cuda_task_base_apply_function_pointer<TaskType><<<1,1>>>(ptr_ptr, dtor_ptr);
CUDA_SAFE_CALL( cudaGetLastError() );
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
ptr = *ptr_ptr;
dtor = *dtor_ptr;
}
};
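Note the trick in get_function_pointer: the address of a __device__ function can only be formed in device code, so a one-thread kernel writes the apply/destroy pointers into unified (zero-copy) scratch and the host reads them back. The same idea in a self-contained sketch with hypothetical names:

typedef void (*device_fn_t)(int);

__device__ void my_task_apply(int) { /* ... */ }

__global__ void capture_apply(device_fn_t* out) {
  *out = my_task_apply;  // taking the address is legal only on the device
}

device_fn_t fetch_apply_pointer() {
  device_fn_t* slot = nullptr;
  cudaMallocManaged(&slot, sizeof(device_fn_t));
  capture_apply<<<1, 1>>>(slot);
  cudaDeviceSynchronize();
  device_fn_t fn = *slot;  // host-side copy of a device address: only
  cudaFree(slot);          // meaningful when passed back into device code
  return fn;
}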
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
template<class Scheduler>
class TaskQueueSpecializationConstrained<
Scheduler,
typename std::enable_if<
std::is_same<typename Scheduler::execution_space, Kokkos::Cuda>::value
>::type
>
{
public:
using scheduler_type = Scheduler;
using execution_space = Kokkos::Cuda;
using memory_space = Kokkos::CudaUVMSpace;
using member_type = TaskExec<Kokkos::Cuda, Scheduler> ;
enum : long { max_league_size = 16 };
KOKKOS_INLINE_FUNCTION
static
void iff_single_thread_recursive_execute( scheduler_type const& ) {}
__device__
static void driver(scheduler_type scheduler, int32_t shmem_per_warp)
{
using queue_type = typename scheduler_type::queue_type;
using task_root_type = TaskBase;
extern __shared__ int32_t shmem_all[];
task_root_type* const end = (task_root_type *) task_root_type::EndTag ;
task_root_type* const no_more_tasks_sentinel = nullptr;
int32_t * const warp_shmem =
shmem_all + ( threadIdx.z * shmem_per_warp ) / sizeof(int32_t);
task_root_type * const task_shmem = (task_root_type *) warp_shmem ;
const int warp_lane = threadIdx.x + threadIdx.y * blockDim.x ;
member_type single_exec(scheduler, warp_shmem, 1);
member_type team_exec(scheduler, warp_shmem, blockDim.y);
auto& team_queue = team_exec.scheduler().queue();
task_root_type * task_ptr = no_more_tasks_sentinel;
// Loop until all queues are empty and no tasks in flight
do {
// Each team lead attempts to acquire either a thread team task
// or collection of single thread tasks for the team.
if ( 0 == warp_lane ) {
if( *((volatile int *) & team_queue.m_ready_count) > 0 ) {
task_ptr = end;
// Attempt to acquire a task
// Loop by priority and then type
for ( int i = 0 ; i < queue_type::NumQueue && end == task_ptr ; ++i ) {
for ( int j = 0 ; j < 2 && end == task_ptr ; ++j ) {
task_ptr = queue_type::pop_ready_task( & team_queue.m_ready[i][j] );
}
}
}
else {
// returns nullptr if and only if all other queues have a ready
// count of 0 also. Otherwise, returns a task from another queue
// or `end` if one couldn't be popped
task_ptr = team_queue.attempt_to_steal_task();
#if 0
if(task_ptr != no_more_tasks_sentinel && task_ptr != end) {
std::printf("task stolen on rank %d\n", team_exec.league_rank());
}
#endif
}
}
// Synchronize warp with memory fence before broadcasting task pointer:
// KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "A" );
KOKKOS_IMPL_CUDA_SYNCWARP ;
// Broadcast task pointer:
((int*) & task_ptr )[0] = KOKKOS_IMPL_CUDA_SHFL( ((int*) & task_ptr )[0] , 0 , 32 );
((int*) & task_ptr )[1] = KOKKOS_IMPL_CUDA_SHFL( ((int*) & task_ptr )[1] , 0 , 32 );
#if defined( KOKKOS_DEBUG )
KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "TaskQueue CUDA task_ptr" );
#endif
if ( 0 == task_ptr ) break ; // 0 == queue->m_ready_count
if ( end != task_ptr ) {
// Whole warp copy task's closure to/from shared memory.
// Use all threads of warp for coalesced read/write.
int32_t const b = sizeof(task_root_type) / sizeof(int32_t);
int32_t const e = *((int32_t volatile *)( & task_ptr->m_alloc_size )) / sizeof(int32_t);
int32_t volatile * const task_mem = (int32_t volatile *) task_ptr ;
KOKKOS_ASSERT(e * sizeof(int32_t) < shmem_per_warp);
// copy task closure from global to shared memory:
for ( int32_t i = warp_lane ; i < e ; i += CudaTraits::WarpSize ) {
warp_shmem[i] = task_mem[i] ;
}
// Synchronize threads of the warp and ensure memory
// writes are visible to all threads in the warp.
// KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "B" );
KOKKOS_IMPL_CUDA_SYNCWARP ;
if ( task_root_type::TaskTeam == task_shmem->m_task_type ) {
// Thread Team Task
(*task_shmem->m_apply)( task_shmem , & team_exec );
}
else if ( 0 == threadIdx.y ) {
// Single Thread Task
(*task_shmem->m_apply)( task_shmem , & single_exec );
}
// Synchronize threads of the warp and ensure memory
// writes are visible to all threads in the warp.
// KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "C" );
KOKKOS_IMPL_CUDA_SYNCWARP ;
// copy task closure from shared to global memory:
for ( int32_t i = b + warp_lane ; i < e ; i += CudaTraits::WarpSize ) {
task_mem[i] = warp_shmem[i] ;
}
// Synchronize threads of the warp and ensure memory
// writes are visible to root thread of the warp for
// respawn or completion.
// KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "D" );
KOKKOS_IMPL_CUDA_SYNCWARP ;
// If respawn requested copy respawn data back to main memory
if ( 0 == warp_lane ) {
if ( ((task_root_type *) task_root_type::LockTag) != task_shmem->m_next ) {
( (volatile task_root_type *) task_ptr )->m_next = task_shmem->m_next ;
( (volatile task_root_type *) task_ptr )->m_priority = task_shmem->m_priority ;
}
team_queue.complete( task_ptr );
}
}
} while(1);
}
static
void execute(scheduler_type const& scheduler)
{
const int shared_per_warp = 2048 ;
const int warps_per_block = 4 ;
const dim3 grid( Kokkos::Impl::cuda_internal_multiprocessor_count() , 1 , 1 );
//const dim3 grid( 1 , 1 , 1 );
const dim3 block( 1 , Kokkos::Impl::CudaTraits::WarpSize , warps_per_block );
const int shared_total = shared_per_warp * warps_per_block ;
const cudaStream_t stream = 0 ;
auto& queue = scheduler.queue();
queue.initialize_team_queues(warps_per_block * grid.x);
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
// Query the stack size, in bytes:
size_t previous_stack_size = 0 ;
CUDA_SAFE_CALL( cudaDeviceGetLimit( & previous_stack_size , cudaLimitStackSize ) );
// If not large enough then set the stack size, in bytes:
const size_t larger_stack_size = 2048 ;
if ( previous_stack_size < larger_stack_size ) {
CUDA_SAFE_CALL( cudaDeviceSetLimit( cudaLimitStackSize , larger_stack_size ) );
}
cuda_task_queue_execute<<< grid , block , shared_total , stream >>>( scheduler , shared_per_warp );
CUDA_SAFE_CALL( cudaGetLastError() );
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
if ( previous_stack_size < larger_stack_size ) {
CUDA_SAFE_CALL( cudaDeviceSetLimit( cudaLimitStackSize , previous_stack_size ) );
}
}
template< typename TaskType >
static
void
get_function_pointer(
typename TaskType::function_type& ptr,
typename TaskType::destroy_type& dtor
)
{
using function_type = typename TaskType::function_type;
using destroy_type = typename TaskType::destroy_type;
void* storage = cuda_internal_scratch_unified(
Kokkos::Cuda(),
sizeof(function_type) + sizeof(destroy_type)
);
function_type* ptr_ptr = (function_type*)storage;
destroy_type* dtor_ptr = (destroy_type*)((char*)storage + sizeof(function_type));
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
set_cuda_task_base_apply_function_pointer<TaskType><<<1,1>>>(ptr_ptr, dtor_ptr);
CUDA_SAFE_CALL( cudaGetLastError() );
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
ptr = *ptr_ptr;
dtor = *dtor_ptr;
}
};
extern template class TaskQueue< Kokkos::Cuda > ;
extern template class TaskQueue< Kokkos::Cuda, default_tasking_memory_space_for_execution_space_t<Kokkos::Cuda> > ;
}} /* namespace Kokkos::Impl */
@ -136,8 +588,8 @@ namespace Impl {
* When executing a single thread task the syncwarp or other
* warp synchronizing functions must not be called.
*/
template <class Scheduler>
class TaskExec<Kokkos::Cuda, Scheduler>
{
private:
@ -148,24 +600,39 @@ private:
TaskExec & operator = ( TaskExec && ) = delete ;
TaskExec & operator = ( TaskExec const & ) = delete ;
friend class Kokkos::Impl::TaskQueue< Kokkos::Cuda, default_tasking_memory_space_for_execution_space_t<Kokkos::Cuda> > ;
template <class, class>
friend class Kokkos::Impl::TaskQueueSpecializationConstrained;
template <class>
friend class Kokkos::Impl::TaskQueueSpecialization;
int32_t * m_team_shmem ;
const int m_team_size ;
Scheduler m_scheduler;
// If constructed with arg_team_size == 1 the object
// can only be used by 0 == threadIdx.y.
KOKKOS_INLINE_FUNCTION
TaskExec(
Scheduler const& parent_scheduler,
int32_t* arg_team_shmem,
int arg_team_size = blockDim.y
)
: m_team_shmem(arg_team_shmem),
m_team_size(arg_team_size),
m_scheduler(parent_scheduler.get_team_scheduler(league_rank()))
{ }
public:
using thread_team_member = TaskExec;
#if defined( __CUDA_ARCH__ )
__device__ int team_rank() const { return threadIdx.y ; }
__device__ int team_size() const { return m_team_size ; }
//__device__ int league_rank() const { return threadIdx.z; }
__device__ int league_rank() const { return blockIdx.x * blockDim.z + threadIdx.z; }
__device__ int league_size() const { return blockDim.z * gridDim.x; }
__device__ void team_barrier() const
{
@ -186,13 +653,18 @@ public:
}
#else
__host__ int team_rank() const { return 0 ; }
__host__ int team_size() const { return 0 ; }
__host__ int league_rank() const { return 0; }
__host__ int league_size() const { return 0; }
__host__ void team_barrier() const {}
template< class ValueType >
__host__ void team_broadcast( ValueType & , const int ) const {}
#endif
KOKKOS_INLINE_FUNCTION Scheduler const& scheduler() const noexcept { return m_scheduler; }
KOKKOS_INLINE_FUNCTION Scheduler& scheduler() noexcept { return m_scheduler; }
};
}} /* namespace Kokkos::Impl */
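To make the index math above concrete: with the launch shape used by execute() (for example grid(30,1,1) and block(1,32,4)), each threadIdx.z slice of a block is one warp-sized team, so league_size() = 4 * 30 = 120, and the warp at blockIdx.x == 2, threadIdx.z == 3 has league_rank() = 2*4 + 3 = 11.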
@ -203,20 +675,22 @@ public:
namespace Kokkos {
namespace Impl {
template<typename iType, typename Scheduler>
struct TeamThreadRangeBoundariesStruct<iType, TaskExec<Kokkos::Cuda, Scheduler>>
{
using index_type = iType;
using member_type = TaskExec<Kokkos::Cuda, Scheduler>;
const iType start ;
const iType end ;
const iType increment ;
member_type const& thread;
#if defined( __CUDA_ARCH__ )
__device__ inline
TeamThreadRangeBoundariesStruct
( member_type const& arg_thread, const iType& arg_count)
: start( threadIdx.y )
, end(arg_count)
, increment( blockDim.y )
@ -225,7 +699,7 @@ struct TeamThreadRangeBoundariesStruct<iType, TaskExec< Kokkos::Cuda > >
__device__ inline
TeamThreadRangeBoundariesStruct
( member_type const& arg_thread
, const iType & arg_start
, const iType & arg_end
)
@ -238,10 +712,10 @@ struct TeamThreadRangeBoundariesStruct<iType, TaskExec< Kokkos::Cuda > >
#else
TeamThreadRangeBoundariesStruct
( member_type const& arg_thread, const iType& arg_count);
TeamThreadRangeBoundariesStruct
( member_type const& arg_thread
, const iType & arg_start
, const iType & arg_end
);
@ -252,20 +726,22 @@ struct TeamThreadRangeBoundariesStruct<iType, TaskExec< Kokkos::Cuda > >
//----------------------------------------------------------------------------
template<typename iType, typename Scheduler>
struct ThreadVectorRangeBoundariesStruct<iType, TaskExec< Kokkos::Cuda, Scheduler > >
{
using index_type = iType;
using member_type = TaskExec<Kokkos::Cuda, Scheduler>;
const index_type start ;
const index_type end ;
const index_type increment ;
const member_type& thread;
#if defined( __CUDA_ARCH__ )
__device__ inline
ThreadVectorRangeBoundariesStruct
( member_type const& arg_thread, const index_type& arg_count )
: start( threadIdx.x )
, end(arg_count)
, increment( blockDim.x )
@ -274,9 +750,9 @@ struct ThreadVectorRangeBoundariesStruct<iType, TaskExec< Kokkos::Cuda > >
__device__ inline
ThreadVectorRangeBoundariesStruct
( member_type const& arg_thread, const index_type& arg_begin, const index_type& arg_end )
: start( arg_begin + threadIdx.x )
, end(arg_end)
, increment( blockDim.x )
, thread(arg_thread)
{}
@ -284,10 +760,10 @@ struct ThreadVectorRangeBoundariesStruct<iType, TaskExec< Kokkos::Cuda > >
#else
ThreadVectorRangeBoundariesStruct
( member_type const& arg_thread, const index_type& arg_count );
ThreadVectorRangeBoundariesStruct
( member_type const& arg_thread, const index_type& arg_begin, const index_type& arg_end);
#endif
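The struct above encodes a vector-lane-strided loop: start = threadIdx.x, increment = blockDim.x. Written out as the raw loop it stands for (a sketch; n and body are placeholders):

for (int i = threadIdx.x; i < n; i += blockDim.x) {
  body(i);  // each vector lane handles every blockDim.x-th iteration
}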
@ -299,69 +775,69 @@ struct ThreadVectorRangeBoundariesStruct<iType, TaskExec< Kokkos::Cuda > >
namespace Kokkos {
//template<typename iType>
//KOKKOS_INLINE_FUNCTION
//Impl::TeamThreadRangeBoundariesStruct< iType, Impl::TaskExec< Kokkos::Cuda > >
//TeamThreadRange( const Impl::TaskExec< Kokkos::Cuda > & thread, const iType & count )
//{
// return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::TaskExec< Kokkos::Cuda > >( thread, count );
//}
//
//template<typename iType1, typename iType2>
//KOKKOS_INLINE_FUNCTION
//Impl::TeamThreadRangeBoundariesStruct
// < typename std::common_type<iType1,iType2>::type
// , Impl::TaskExec< Kokkos::Cuda > >
//TeamThreadRange( const Impl::TaskExec< Kokkos::Cuda > & thread
// , const iType1 & begin, const iType2 & end )
//{
// typedef typename std::common_type< iType1, iType2 >::type iType;
// return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::TaskExec< Kokkos::Cuda > >(
// thread, iType(begin), iType(end) );
//}
//
//template<typename iType>
//KOKKOS_INLINE_FUNCTION
//Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >
//ThreadVectorRange( const Impl::TaskExec< Kokkos::Cuda > & thread
// , const iType & count )
//{
// return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >(thread,count);
//}
//
//template<typename iType>
//KOKKOS_INLINE_FUNCTION
//Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >
//ThreadVectorRange( const Impl::TaskExec< Kokkos::Cuda > & thread
// , const iType & arg_begin
// , const iType & arg_end )
//{
// return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >(thread,arg_begin,arg_end);
//}
// KOKKOS_INLINE_FUNCTION
// Impl::ThreadSingleStruct<Impl::TaskExec< Kokkos::Cuda > >
// PerTeam(const Impl::TaskExec< Kokkos::Cuda >& thread)
// {
// return Impl::ThreadSingleStruct<Impl::TaskExec< Kokkos::Cuda > >(thread);
// }
// KOKKOS_INLINE_FUNCTION
// Impl::VectorSingleStruct<Impl::TaskExec< Kokkos::Cuda > >
// PerThread(const Impl::TaskExec< Kokkos::Cuda >& thread)
// {
// return Impl::VectorSingleStruct<Impl::TaskExec< Kokkos::Cuda > >(thread);
// }
/** \brief Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all threads of the calling thread team.
* This functionality requires C++11 support.
*/
template<typename iType, class Lambda, class Scheduler>
KOKKOS_INLINE_FUNCTION
void parallel_for
( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::Cuda, Scheduler > >& loop_boundaries
, const Lambda& lambda
)
{
@ -370,10 +846,10 @@ void parallel_for
}
}
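As a usage sketch (hedged: member_type, n, and values are placeholder names, and the TeamThreadRange overload for task members is assumed to come from the generic task-team header), inside a task body the member handle drives the loop just like a regular team policy:

KOKKOS_INLINE_FUNCTION
void operator()(member_type& member) {
  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n),
                       [&](int i) { values(i) += 1; });
}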
template< typename iType, class Lambda, class Scheduler >
KOKKOS_INLINE_FUNCTION
void parallel_for
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda, Scheduler > >& loop_boundaries,
const Lambda & lambda) {
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i);
@ -459,14 +935,14 @@ void parallel_reduce
// blockDim.y == team_size
// threadIdx.x == position in vec
// threadIdx.y == member number
template< typename iType, class Lambda, typename ValueType, class Scheduler >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda, Scheduler > >& loop_boundaries,
const Lambda & lambda,
ValueType& initialized_result) {
//TODO @internal_documentation what is the point of creating this temporary?
ValueType result = initialized_result;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i,result);
@ -487,15 +963,15 @@ void parallel_reduce
}
}
template< typename iType, class Lambda, typename ReducerType, class Scheduler >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda, Scheduler > >& loop_boundaries,
const Lambda & lambda,
const ReducerType& reducer) {
typedef typename ReducerType::value_type ValueType;
//TODO @internal_documentation what is the point of creating this temporary?
ValueType result = ValueType();
reducer.init(result);
@ -549,10 +1025,10 @@ void parallel_reduce
// blockDim.y == team_size
// threadIdx.x == position in vec
// threadIdx.y == member number
template< typename iType, class Lambda, typename ValueType, class Scheduler >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda, Scheduler > >& loop_boundaries,
const Lambda & lambda,
ValueType& initialized_result) {
@ -576,10 +1052,10 @@ void parallel_reduce
}
}
template< typename iType, class Lambda, typename ReducerType, class Scheduler >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda, Scheduler > >& loop_boundaries,
const Lambda & lambda,
const ReducerType& reducer) {
@ -611,10 +1087,10 @@ void parallel_reduce
// blockDim.y == team_size
// threadIdx.x == position in vec
// threadIdx.y == member number
template< typename iType, class Closure, class Scheduler >
KOKKOS_INLINE_FUNCTION
void parallel_scan
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda, Scheduler > >& loop_boundaries,
const Closure & closure )
{
// Extract value_type from closure
@ -676,10 +1152,10 @@ void parallel_scan
// blockDim.y == team_size
// threadIdx.x == position in vec
// threadIdx.y == member number
template< typename iType, class Closure, class Scheduler >
KOKKOS_INLINE_FUNCTION
void parallel_scan
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda, Scheduler > >& loop_boundaries,
const Closure & closure )
{
// Extract value_type from closure
@ -735,25 +1211,25 @@ void parallel_scan
namespace Kokkos {
template<class FunctorType, class Scheduler>
KOKKOS_INLINE_FUNCTION
void single(const Impl::VectorSingleStruct<Impl::TaskExec< Kokkos::Cuda, Scheduler > >& , const FunctorType& lambda) {
#ifdef __CUDA_ARCH__
if(threadIdx.x == 0) lambda();
#endif
}
template<class FunctorType, class Scheduler>
KOKKOS_INLINE_FUNCTION
void single(const Impl::ThreadSingleStruct<Impl::TaskExec< Kokkos::Cuda, Scheduler > >& , const FunctorType& lambda) {
#ifdef __CUDA_ARCH__
if(threadIdx.x == 0 && threadIdx.y == 0) lambda();
#endif
}
template<class FunctorType, class ValueType, class Scheduler>
KOKKOS_INLINE_FUNCTION
void single(const Impl::VectorSingleStruct<Impl::TaskExec< Kokkos::Cuda, Scheduler > >& s , const FunctorType& lambda, ValueType& val) {
#ifdef __CUDA_ARCH__
if(threadIdx.x == 0) lambda(val);
if ( 1 < s.team_member.team_size() ) {
@ -762,9 +1238,9 @@ namespace Kokkos {
#endif
}
template<class FunctorType, class ValueType, class Scheduler>
KOKKOS_INLINE_FUNCTION
void single(const Impl::ThreadSingleStruct<Impl::TaskExec< Kokkos::Cuda, Scheduler > >& single_struct, const FunctorType& lambda, ValueType& val) {
#ifdef __CUDA_ARCH__
if(threadIdx.x == 0 && threadIdx.y == 0) {
lambda(val);

View File

@ -56,9 +56,9 @@
#include <utility>
#include <Kokkos_Parallel.hpp>
#include <Cuda/Kokkos_Cuda_KernelLaunch.hpp>
#include <Cuda/Kokkos_Cuda_ReduceScan.hpp>
#include <Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp>
#include <Kokkos_Vectorization.hpp>
#if defined(KOKKOS_ENABLE_PROFILING)
@ -101,11 +101,13 @@ struct CudaJoinFunctor {
* total available shared memory must be partitioned among teams.
*/
class CudaTeamMember {
public:
typedef Kokkos::Cuda execution_space ;
typedef execution_space::scratch_memory_space scratch_memory_space ;
private:
mutable void * m_team_reduce ;
scratch_memory_space m_team_shared ;
int m_team_reduce_size ;
@ -221,12 +223,21 @@ public:
KOKKOS_INLINE_FUNCTION
typename std::enable_if< is_reducer< ReducerType >::value >::type
team_reduce( ReducerType const & reducer ) const noexcept
{
team_reduce(reducer,reducer.reference());
}
template< typename ReducerType >
KOKKOS_INLINE_FUNCTION
typename std::enable_if< is_reducer< ReducerType >::value >::type
team_reduce( ReducerType const & reducer, typename ReducerType::value_type& value ) const noexcept
{
#ifdef __CUDA_ARCH__
cuda_intra_block_reduction(reducer,value,blockDim.y);
#endif /* #ifdef __CUDA_ARCH__ */
}
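The new two-argument overload lets the caller keep its partial result in a register instead of round-tripping through reducer.reference(). A hedged usage sketch (my_partial is hypothetical):

double total = 0.0;
Kokkos::Sum<double> reducer(total);
double local = my_partial(member.team_rank());  // per-thread contribution
member.team_reduce(reducer, local);
// 'total' now holds the team-wide sum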
//--------------------------------------------------------------------------
/** \brief Intra-team exclusive prefix sum with team_rank() ordering
* with intra-team non-deterministic ordering accumulation.
@ -281,20 +292,28 @@ public:
template< typename ReducerType >
KOKKOS_INLINE_FUNCTION static
typename std::enable_if< is_reducer< ReducerType >::value >::type
vector_reduce( ReducerType const & reducer ) {
vector_reduce(reducer,reducer.reference());
}
template< typename ReducerType >
KOKKOS_INLINE_FUNCTION static
typename std::enable_if< is_reducer< ReducerType >::value >::type
vector_reduce( ReducerType const & reducer, typename ReducerType::value_type& value )
{
#ifdef __CUDA_ARCH__
if(blockDim.x == 1) return;
// Intra vector lane shuffle reduction:
typename ReducerType::value_type tmp ( value );
typename ReducerType::value_type tmp2 = tmp;
unsigned mask = blockDim.x==32?0xffffffff:((1<<blockDim.x)-1)<<((threadIdx.y%(32/blockDim.x))*blockDim.x);
for ( int i = blockDim.x ; ( i >>= 1 ) ; ) {
cuda_shfl_down( tmp2 , tmp , i , blockDim.x , mask );
if ( (int)threadIdx.x < i ) { reducer.join( tmp , tmp2 ); }
}
// Broadcast from root lane to all other lanes.
@ -302,7 +321,9 @@ public:
// because floating point summation is not associative
// and thus different threads could have different results.
cuda_shfl( tmp2 , tmp , 0 , blockDim.x , mask );
value = tmp2;
reducer.reference() = tmp2;
#endif
}
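The loop above is the classic shuffle-down tree reduction, finished with a broadcast from lane 0 so every lane ends up with the bit-identical result (important for non-associative floating-point sums). The raw-CUDA equivalent for a full warp and plain summation, as a sketch:

__device__ inline float warp_sum(float v) {
  for (int offset = 16; offset > 0; offset >>= 1)
    v += __shfl_down_sync(0xffffffffu, v, offset);  // tree reduction
  return __shfl_sync(0xffffffffu, v, 0);            // broadcast lane 0's value
}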
@ -543,19 +564,37 @@ struct TeamThreadRangeBoundariesStruct<iType,CudaTeamMember> {
const iType end;
KOKKOS_INLINE_FUNCTION
TeamThreadRangeBoundariesStruct (const CudaTeamMember& thread_, iType count)
: member(thread_)
, start( 0 )
, end( count ) {}
KOKKOS_INLINE_FUNCTION
TeamThreadRangeBoundariesStruct (const CudaTeamMember& thread_, iType begin_, iType end_)
: member(thread_)
, start( begin_ )
, end( end_ ) {}
};
template<typename iType>
struct TeamVectorRangeBoundariesStruct<iType,CudaTeamMember> {
typedef iType index_type;
const CudaTeamMember& member;
const iType start;
const iType end;
KOKKOS_INLINE_FUNCTION
TeamVectorRangeBoundariesStruct (const CudaTeamMember& thread_, const iType& count)
: member(thread_)
, start( 0 )
, end( count ) {}
KOKKOS_INLINE_FUNCTION
TeamVectorRangeBoundariesStruct (const CudaTeamMember& thread_, const iType& begin_, const iType& end_)
: member(thread_)
, start( begin_ )
, end( end_ ) {}
};
template<typename iType>
struct ThreadVectorRangeBoundariesStruct<iType,CudaTeamMember> {
@ -564,19 +603,19 @@ struct ThreadVectorRangeBoundariesStruct<iType,CudaTeamMember> {
const index_type end;
KOKKOS_INLINE_FUNCTION
ThreadVectorRangeBoundariesStruct (const CudaTeamMember, index_type count)
: start( static_cast<index_type>(0) ), end( count ) {}
KOKKOS_INLINE_FUNCTION
ThreadVectorRangeBoundariesStruct (index_type count)
: start( static_cast<index_type>(0) ), end( count ) {}
KOKKOS_INLINE_FUNCTION
ThreadVectorRangeBoundariesStruct (const CudaTeamMember, index_type arg_begin, index_type arg_end)
: start( arg_begin ), end( arg_end ) {}
KOKKOS_INLINE_FUNCTION
ThreadVectorRangeBoundariesStruct (index_type arg_begin, index_type arg_end)
: start( arg_begin ), end( arg_end ) {}
};
@ -585,7 +624,7 @@ struct ThreadVectorRangeBoundariesStruct<iType,CudaTeamMember> {
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct< iType, Impl::CudaTeamMember >
TeamThreadRange( const Impl::CudaTeamMember & thread, iType count ) {
return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::CudaTeamMember >( thread, count );
}
@ -593,22 +632,38 @@ template< typename iType1, typename iType2 >
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct< typename std::common_type< iType1, iType2 >::type,
Impl::CudaTeamMember >
TeamThreadRange( const Impl::CudaTeamMember & thread, iType1 begin, iType2 end ) {
typedef typename std::common_type< iType1, iType2 >::type iType;
return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::CudaTeamMember >( thread, iType(begin), iType(end) );
}
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::TeamVectorRangeBoundariesStruct< iType, Impl::CudaTeamMember >
TeamVectorRange( const Impl::CudaTeamMember & thread, const iType & count ) {
return Impl::TeamVectorRangeBoundariesStruct< iType, Impl::CudaTeamMember >( thread, count );
}
template< typename iType1, typename iType2 >
KOKKOS_INLINE_FUNCTION
Impl::TeamVectorRangeBoundariesStruct< typename std::common_type< iType1, iType2 >::type,
Impl::CudaTeamMember >
TeamVectorRange( const Impl::CudaTeamMember & thread, const iType1 & begin, const iType2 & end ) {
typedef typename std::common_type< iType1, iType2 >::type iType;
return Impl::TeamVectorRangeBoundariesStruct< iType, Impl::CudaTeamMember >( thread, iType(begin), iType(end) );
}
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >
ThreadVectorRange(const Impl::CudaTeamMember& thread, iType count) {
return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >(thread,count);
}
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >
ThreadVectorRange(const Impl::CudaTeamMember& thread, iType arg_begin, iType arg_end) {
return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >(thread,arg_begin,arg_end);
}
@ -667,16 +722,16 @@ parallel_reduce
)
{
#ifdef __CUDA_ARCH__
typename ReducerType::value_type value;
reducer.init( value );
for( iType i = loop_boundaries.start + threadIdx.y
; i < loop_boundaries.end
; i += blockDim.y ) {
closure(i,value);
}
loop_boundaries.member.team_reduce( reducer, value );
#endif
}
@ -701,19 +756,88 @@ parallel_reduce
)
{
#ifdef __CUDA_ARCH__
ValueType val;
Kokkos::Sum<ValueType> reducer(val);
reducer.init( reducer.reference() );
for( iType i = loop_boundaries.start + threadIdx.y
; i < loop_boundaries.end
; i += blockDim.y ) {
closure(i,val);
}
loop_boundaries.member.team_reduce( reducer , val);
result = reducer.reference();
#endif
}
template<typename iType, class Closure >
KOKKOS_INLINE_FUNCTION
void parallel_for
( const Impl::TeamVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember>&
loop_boundaries
, const Closure & closure
)
{
#ifdef __CUDA_ARCH__
for( iType i = loop_boundaries.start + threadIdx.y * blockDim.x + threadIdx.x
; i < loop_boundaries.end
; i += blockDim.y*blockDim.x )
closure(i);
#endif
}
template< typename iType, class Closure, class ReducerType >
KOKKOS_INLINE_FUNCTION
typename std::enable_if< Kokkos::is_reducer< ReducerType >::value >::type
parallel_reduce
( const Impl::TeamVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember> &
loop_boundaries
, const Closure & closure
, const ReducerType & reducer
)
{
#ifdef __CUDA_ARCH__
typename ReducerType::value_type value;
reducer.init( value );
for( iType i = loop_boundaries.start + threadIdx.y * blockDim.x + threadIdx.x
; i < loop_boundaries.end
; i += blockDim.y * blockDim.x ) {
closure(i,value);
}
loop_boundaries.member.vector_reduce( reducer, value );
loop_boundaries.member.team_reduce( reducer, value );
#endif
}
template< typename iType, class Closure, typename ValueType >
KOKKOS_INLINE_FUNCTION
typename std::enable_if< ! Kokkos::is_reducer< ValueType >::value >::type
parallel_reduce
( const Impl::TeamVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember> &
loop_boundaries
, const Closure & closure
, ValueType & result
)
{
#ifdef __CUDA_ARCH__
ValueType val;
Kokkos::Sum<ValueType> reducer(val);
reducer.init( reducer.reference() );
for( iType i = loop_boundaries.start + threadIdx.y * blockDim.x + threadIdx.x
; i < loop_boundaries.end
; i += blockDim.y * blockDim.x ) {
closure(i,val);
}
loop_boundaries.member.vector_reduce( reducer );
loop_boundaries.member.team_reduce( reducer );
result = reducer.reference();
#endif
}
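TeamVectorRange flattens team and vector parallelism over a single index range, which is why the reductions above chain vector_reduce and then team_reduce. A hedged usage sketch (member, n, and v are placeholders):

double sum = 0.0;
Kokkos::parallel_reduce(Kokkos::TeamVectorRange(member, n),
                        [&](int i, double& lsum) { lsum += v(i); },
                        sum);  // all threads of the team see the final value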

View File

@ -241,7 +241,7 @@ class ViewDataHandle< Traits ,
sizeof(typename Traits::const_value_type) == 16 )
&&
// Random access trait
( Traits::memory_traits::is_random_access != 0 )
)>::type >
{
public:

View File

@ -102,9 +102,8 @@ public:
const dim3 grid( Kokkos::Impl::cuda_internal_multiprocessor_count() , 1 , 1 );
const dim3 block( 1 , Kokkos::Impl::CudaTraits::WarpSize , warps_per_block );
const int shared = 0 ;
Kokkos::Impl::CudaParallelLaunch<Self>(*this, grid, block, shared, Cuda().impl_internal_space_instance() , false );
}
inline

View File

@ -0,0 +1,152 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Macros.hpp>
#ifdef KOKKOS_ENABLE_HPX
#include <Kokkos_HPX.hpp>
#include <hpx/util/yield_while.hpp>
namespace Kokkos {
namespace Experimental {
bool HPX::m_hpx_initialized = false;
Kokkos::Impl::thread_buffer HPX::m_buffer;
#if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH)
hpx::future<void> HPX::m_future = hpx::make_ready_future<void>();
#endif
int HPX::concurrency() {
hpx::runtime *rt = hpx::get_runtime_ptr();
if (rt == nullptr) {
return hpx::threads::hardware_concurrency();
} else {
if (hpx::threads::get_self_ptr() == nullptr) {
return hpx::resource::get_thread_pool(0).get_os_thread_count();
} else {
return hpx::this_thread::get_pool()->get_os_thread_count();
}
}
}
void HPX::impl_initialize(int thread_count) {
hpx::runtime *rt = hpx::get_runtime_ptr();
if (rt == nullptr) {
std::vector<std::string> config = {
"hpx.os_threads=" + std::to_string(thread_count),
#ifdef KOKKOS_DEBUG
"--hpx:attach-debugger=exception",
#endif
};
int argc_hpx = 1;
char name[] = "kokkos_hpx";
char *argv_hpx[] = {name, nullptr};
hpx::start(nullptr, argc_hpx, argv_hpx, config);
// NOTE: Wait for runtime to start. hpx::start returns as soon as
// possible, meaning some operations are not allowed immediately
// after hpx::start. Notably, hpx::stop needs state_running. This
// needs to be fixed in HPX itself.
// Get runtime pointer again after it has been started.
rt = hpx::get_runtime_ptr();
hpx::util::yield_while(
[rt]() { return rt->get_state() < hpx::state_running; });
m_hpx_initialized = true;
}
}
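In normal use these entry points are reached through Kokkos' public initialization path, but the lifecycle they implement can be sketched directly (hedged; calling the impl_ functions by hand is not the supported interface):

Kokkos::Experimental::HPX::impl_initialize(4);  // start HPX with 4 OS threads
// ... dispatch Kokkos work on the HPX execution space ...
Kokkos::Experimental::HPX::impl_finalize();     // stops HPX only if Kokkos started it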
void HPX::impl_initialize() {
hpx::runtime *rt = hpx::get_runtime_ptr();
if (rt == nullptr) {
std::vector<std::string> config = {
#ifdef KOKKOS_DEBUG
"--hpx:attach-debugger=exception",
#endif
};
int argc_hpx = 1;
char name[] = "kokkos_hpx";
char *argv_hpx[] = {name, nullptr};
hpx::start(nullptr, argc_hpx, argv_hpx, config);
// NOTE: Wait for runtime to start. hpx::start returns as soon as
// possible, meaning some operations are not allowed immediately
// after hpx::start. Notably, hpx::stop needs state_running. This
// needs to be fixed in HPX itself.
// Get runtime pointer again after it has been started.
rt = hpx::get_runtime_ptr();
hpx::util::yield_while(
[rt]() { return rt->get_state() < hpx::state_running; });
m_hpx_initialized = true;
}
}
bool HPX::impl_is_initialized() noexcept {
hpx::runtime *rt = hpx::get_runtime_ptr();
return rt != nullptr;
}
void HPX::impl_finalize() {
if (m_hpx_initialized) {
hpx::runtime *rt = hpx::get_runtime_ptr();
if (rt != nullptr) {
hpx::apply([]() { hpx::finalize(); });
hpx::stop();
} else {
Kokkos::abort("Kokkos::Experimental::HPX::impl_finalize: Kokkos started "
"HPX but something else already stopped HPX\n");
}
}
}
} // namespace Experimental
} // namespace Kokkos
#else
void KOKKOS_CORE_SRC_IMPL_HPX_PREVENT_LINK_ERROR() {}
#endif //#ifdef KOKKOS_ENABLE_HPX

View File

@ -41,38 +41,25 @@
//@HEADER
*/
#include <Kokkos_Macros.hpp>
#if defined(KOKKOS_ENABLE_HPX) && defined(KOKKOS_ENABLE_TASKDAG)
#include <Kokkos_Core.hpp>
#include <impl/Kokkos_TaskQueue_impl.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template class TaskQueue<Kokkos::Experimental::HPX,
Kokkos::Experimental::HPX::memory_space>;
} // namespace Impl
} // namespace Kokkos
#else
void KOKKOS_CORE_SRC_IMPL_HPX_TASK_PREVENT_LINK_ERROR() {}
#endif // #if defined( KOKKOS_ENABLE_HPX ) && defined( KOKKOS_ENABLE_TASKDAG )

View File

@ -0,0 +1,298 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_HPX_TASK_HPP
#define KOKKOS_HPX_TASK_HPP
#include <Kokkos_Macros.hpp>
#if defined(KOKKOS_ENABLE_HPX) && defined(KOKKOS_ENABLE_TASKDAG)
#include <Kokkos_TaskScheduler_fwd.hpp>
#include <Kokkos_HPX.hpp>
#include <hpx/apply.hpp>
#include <hpx/lcos/local/counting_semaphore.hpp>
#include <type_traits>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template <class QueueType>
class TaskQueueSpecialization<
SimpleTaskScheduler<Kokkos::Experimental::HPX, QueueType>> {
public:
using execution_space = Kokkos::Experimental::HPX;
using scheduler_type =
SimpleTaskScheduler<Kokkos::Experimental::HPX, QueueType>;
using member_type =
TaskTeamMemberAdapter<Kokkos::Impl::HPXTeamMember, scheduler_type>;
using memory_space = Kokkos::HostSpace;
static void execute(scheduler_type const &scheduler) {
// NOTE: We create an instance so that we can use dispatch_execute_task.
// This is not necessarily the most efficient, but can be improved later.
TaskQueueSpecialization<scheduler_type> task_queue;
task_queue.scheduler = &scheduler;
Kokkos::Impl::dispatch_execute_task(&task_queue);
Kokkos::Experimental::HPX().fence();
}
// Must provide task queue execution function
void execute_task() const {
using hpx::apply;
using hpx::lcos::local::counting_semaphore;
using task_base_type = typename scheduler_type::task_base_type;
const int num_worker_threads = Kokkos::Experimental::HPX::concurrency();
thread_buffer &buffer = Kokkos::Experimental::HPX::impl_get_buffer();
buffer.resize(num_worker_threads, 512);
auto &queue = scheduler->queue();
counting_semaphore sem(0);
for (int thread = 0; thread < num_worker_threads; ++thread) {
apply([this, &sem, &queue, &buffer, num_worker_threads, thread]() {
// NOTE: This implementation has been simplified based on the
// assumption that team_size = 1. The HPX backend currently only
// supports a team size of 1.
std::size_t t = Kokkos::Experimental::HPX::impl_hardware_thread_id();
buffer.get(Kokkos::Experimental::HPX::impl_hardware_thread_id());
HPXTeamMember member(TeamPolicyInternal<Kokkos::Experimental::HPX>(
Kokkos::Experimental::HPX(), num_worker_threads, 1),
0, t, buffer.get(t), 512);
member_type single_exec(*scheduler, member);
member_type &team_exec = single_exec;
auto &team_scheduler = team_exec.scheduler();
auto current_task = OptionalRef<task_base_type>(nullptr);
while (!queue.is_done()) {
current_task =
queue.pop_ready_task(team_scheduler.team_scheduler_info());
if (current_task) {
KOKKOS_ASSERT(current_task->is_single_runnable() ||
current_task->is_team_runnable());
current_task->as_runnable_task().run(single_exec);
queue.complete((*std::move(current_task)).as_runnable_task(),
team_scheduler.team_scheduler_info());
}
}
sem.signal(1);
});
}
sem.wait(num_worker_threads);
}
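The fan-out/join skeleton used here, spawn one lightweight HPX task per worker, have each signal a counting semaphore, then wait for all signals, reduces to:

hpx::lcos::local::counting_semaphore sem(0);
for (int t = 0; t < num_worker_threads; ++t) {
  hpx::apply([&sem]() {
    // ... drain the task queue on this worker ...
    sem.signal(1);
  });
}
sem.wait(num_worker_threads);  // block until every worker has signalled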
static uint32_t get_max_team_count(execution_space const &espace) {
return static_cast<uint32_t>(espace.concurrency());
}
template <typename TaskType>
static void get_function_pointer(typename TaskType::function_type &ptr,
typename TaskType::destroy_type &dtor) {
ptr = TaskType::apply;
dtor = TaskType::destroy;
}
private:
const scheduler_type *scheduler;
};
template <class Scheduler>
class TaskQueueSpecializationConstrained<
Scheduler, typename std::enable_if<
std::is_same<typename Scheduler::execution_space,
Kokkos::Experimental::HPX>::value>::type> {
public:
using execution_space = Kokkos::Experimental::HPX;
using scheduler_type = Scheduler;
using member_type =
TaskTeamMemberAdapter<Kokkos::Impl::HPXTeamMember, scheduler_type>;
using memory_space = Kokkos::HostSpace;
static void
iff_single_thread_recursive_execute(scheduler_type const &scheduler) {
using task_base_type = typename scheduler_type::task_base;
using queue_type = typename scheduler_type::queue_type;
if (1 == Kokkos::Experimental::HPX::concurrency()) {
task_base_type *const end = (task_base_type *)task_base_type::EndTag;
task_base_type *task = end;
HPXTeamMember member(TeamPolicyInternal<Kokkos::Experimental::HPX>(
Kokkos::Experimental::HPX(), 1, 1),
0, 0, nullptr, 0);
member_type single_exec(scheduler, member);
do {
task = end;
// Loop by priority and then type
for (int i = 0; i < queue_type::NumQueue && end == task; ++i) {
for (int j = 0; j < 2 && end == task; ++j) {
task =
queue_type::pop_ready_task(&scheduler.m_queue->m_ready[i][j]);
}
}
if (end == task)
break;
(*task->m_apply)(task, &single_exec);
scheduler.m_queue->complete(task);
} while (true);
}
}
static void execute(scheduler_type const &scheduler) {
// NOTE: We create an instance so that we can use dispatch_execute_task.
// This is not necessarily the most efficient, but can be improved later.
TaskQueueSpecializationConstrained<scheduler_type> task_queue;
task_queue.scheduler = &scheduler;
Kokkos::Impl::dispatch_execute_task(&task_queue);
Kokkos::Experimental::HPX().fence();
}
// Must provide task queue execution function
void execute_task() const {
using hpx::apply;
using hpx::lcos::local::counting_semaphore;
using task_base_type = typename scheduler_type::task_base;
using queue_type = typename scheduler_type::queue_type;
const int num_worker_threads = Kokkos::Experimental::HPX::concurrency();
static task_base_type *const end = (task_base_type *)task_base_type::EndTag;
constexpr task_base_type *no_more_tasks_sentinel = nullptr;
thread_buffer &buffer = Kokkos::Experimental::HPX::impl_get_buffer();
buffer.resize(num_worker_threads, 512);
auto &queue = scheduler->queue();
queue.initialize_team_queues(num_worker_threads);
counting_semaphore sem(0);
for (int thread = 0; thread < num_worker_threads; ++thread) {
apply([this, &sem, &buffer, num_worker_threads, thread]() {
// NOTE: This implementation has been simplified based on the assumption
// that team_size = 1. The HPX backend currently only supports a team
// size of 1.
std::size_t t = Kokkos::Experimental::HPX::impl_hardware_thread_id();
buffer.get(Kokkos::Experimental::HPX::impl_hardware_thread_id());
HPXTeamMember member(
TeamPolicyInternal<Kokkos::Experimental::HPX>(
Kokkos::Experimental::HPX(), num_worker_threads, 1),
0, t, buffer.get(t), 512);
member_type single_exec(*scheduler, member);
member_type &team_exec = single_exec;
auto &team_queue = team_exec.scheduler().queue();
task_base_type *task = no_more_tasks_sentinel;
do {
if (task != no_more_tasks_sentinel && task != end) {
team_queue.complete(task);
}
if (*((volatile int *)&team_queue.m_ready_count) > 0) {
task = end;
for (int i = 0; i < queue_type::NumQueue && end == task; ++i) {
for (int j = 0; j < 2 && end == task; ++j) {
task = queue_type::pop_ready_task(&team_queue.m_ready[i][j]);
}
}
} else {
task = team_queue.attempt_to_steal_task();
}
if (task != no_more_tasks_sentinel && task != end) {
(*task->m_apply)(task, &single_exec);
}
} while (task != no_more_tasks_sentinel);
sem.signal(1);
});
}
sem.wait(num_worker_threads);
}
template <typename TaskType>
static void get_function_pointer(typename TaskType::function_type &ptr,
typename TaskType::destroy_type &dtor) {
ptr = TaskType::apply;
dtor = TaskType::destroy;
}
private:
const scheduler_type *scheduler;
};
extern template class TaskQueue<
Kokkos::Experimental::HPX,
typename Kokkos::Experimental::HPX::memory_space>;
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
#endif /* #ifndef KOKKOS_HPX_TASK_HPP */

View File

@ -0,0 +1,57 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_HPX_VIEWETIAVAIL_HPP
#define KOKKOS_HPX_VIEWETIAVAIL_HPP
namespace Kokkos {
namespace Impl {
#define KOKKOS_IMPL_VIEWCOPY_ETI_AVAIL_EXECSPACE Kokkos::Experimental::HPX
#include<eti/common/Kokkos_ViewFillCopyETIAvail_Macros.hpp>
#undef KOKKOS_IMPL_VIEWCOPY_ETI_AVAIL_EXECSPACE
}
}
#endif

View File

@ -0,0 +1,57 @@
/* ...standard Kokkos BSD 3-clause license header, identical to the one in the first file above... */
#ifndef KOKKOS_HPX_VIEWETIDECL_HPP
#define KOKKOS_HPX_VIEWETIDECL_HPP
namespace Kokkos {
namespace Impl {
#define KOKKOS_IMPL_VIEWCOPY_ETI_AVAIL_EXECSPACE Kokkos::Experimental::HPX
#include <eti/common/Kokkos_ViewFillCopyETIDecl_Macros.hpp>
#undef KOKKOS_IMPL_VIEWCOPY_ETI_AVAIL_EXECSPACE
} // namespace Impl
} // namespace Kokkos
#endif // KOKKOS_HPX_VIEWETIDECL_HPP

View File

@ -0,0 +1,116 @@
/* ...standard Kokkos BSD 3-clause license header, identical to the one in the first file above... */
#ifndef KOKKOS_HPX_WORKGRAPHPOLICY_HPP
#define KOKKOS_HPX_WORKGRAPHPOLICY_HPP
#include <hpx/apply.hpp>
#include <hpx/lcos/local/counting_semaphore.hpp>
namespace Kokkos {
namespace Impl {
template <class FunctorType, class... Traits>
class ParallelFor<FunctorType, Kokkos::WorkGraphPolicy<Traits...>,
Kokkos::Experimental::HPX> {
private:
using Policy = Kokkos::WorkGraphPolicy<Traits...>;
using WorkTag = typename Policy::work_tag;
Policy m_policy;
FunctorType m_functor;
template <class TagType>
typename std::enable_if<std::is_same<TagType, void>::value>::type
execute_functor(const std::int32_t w) const noexcept {
m_functor(w);
}
template <class TagType>
typename std::enable_if<!std::is_same<TagType, void>::value>::type
execute_functor(const std::int32_t w) const noexcept {
const TagType t{};
m_functor(t, w);
}
public:
void execute() const {
dispatch_execute_task(this);
Kokkos::Experimental::HPX().fence();
}
void execute_task() const {
const int num_worker_threads = Kokkos::Experimental::HPX::concurrency();
using hpx::apply;
using hpx::lcos::local::counting_semaphore;
counting_semaphore sem(0);
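// One HPX task per worker thread; each drains the shared work graph
// and signals the semaphore once it sees the COMPLETED_TOKEN.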
for (int thread = 0; thread < num_worker_threads; ++thread) {
apply([this, &sem]() {
std::int32_t w = m_policy.pop_work();
while (w != Policy::COMPLETED_TOKEN) {
if (w != Policy::END_TOKEN) {
execute_functor<WorkTag>(w);
m_policy.completed_work(w);
}
w = m_policy.pop_work();
}
sem.signal(1);
});
}
sem.wait(num_worker_threads);
}
inline ParallelFor(const FunctorType &arg_functor, const Policy &arg_policy)
: m_policy(arg_policy), m_functor(arg_functor) {}
};
} // namespace Impl
} // namespace Kokkos
#endif /* #ifndef KOKKOS_HPX_WORKGRAPHPOLICY_HPP */
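The execute_task() above follows a simple pool-drain pattern: spawn one task per worker, let each pop work until the policy reports completion, then rendezvous. The same shape can be sketched in portable C++ with std::thread standing in for hpx::apply and a join standing in for the counting semaphore; the WorkPool below is a bare atomic counter, not the real WorkGraphPolicy (which additionally tracks inter-item dependencies):

#include <algorithm>
#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

// Toy work pool: items 0..n-1 with no dependencies. pop_work() hands out
// an index, or -1 (our stand-in for the COMPLETED_TOKEN) when drained.
struct WorkPool {
  std::atomic<int> next{0};
  int n;
  explicit WorkPool(int n_) : n(n_) {}
  int pop_work() {
    int w = next.fetch_add(1, std::memory_order_relaxed);
    return w < n ? w : -1;
  }
};

int main() {
  WorkPool pool(100);
  std::atomic<long> sum{0};
  const unsigned num_workers =
      std::max(1u, std::thread::hardware_concurrency());

  std::vector<std::thread> workers;
  for (unsigned t = 0; t < num_workers; ++t) {
    workers.emplace_back([&] {
      // Same drain loop as execute_task: pop until the pool reports done.
      for (int w = pool.pop_work(); w != -1; w = pool.pop_work())
        sum.fetch_add(w, std::memory_order_relaxed);
    });
  }
  for (auto& th : workers) th.join();  // join() plays the semaphore's role

  std::printf("sum = %ld (expected %d)\n", sum.load(), 99 * 100 / 2);
  return 0;
}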

View File

@ -125,6 +125,8 @@ struct MDRangePolicy
using traits = Kokkos::Impl::PolicyTraits<Properties ...>;
using range_policy = RangePolicy<Properties...>;
typename traits::execution_space m_space;
using impl_range_policy = RangePolicy< typename traits::execution_space
, typename traits::schedule_type
, typename traits::index_type
@ -132,6 +134,9 @@ struct MDRangePolicy
typedef MDRangePolicy execution_policy; // needed for is_execution_space interrogation
template<class ... OtherProperties>
friend struct MDRangePolicy;
static_assert( !std::is_same<typename traits::iteration_pattern,void>::value
, "Kokkos Error: MD iteration pattern not defined" );
@ -192,13 +197,54 @@ struct MDRangePolicy
static constexpr int Right = static_cast<int>( Iterate::Right );
static constexpr int Left = static_cast<int>( Iterate::Left );
KOKKOS_INLINE_FUNCTION const typename traits::execution_space & space() const { return m_space ; }
template < typename LT , typename UT , typename TT = array_index_type >
MDRangePolicy(std::initializer_list<LT> const& lower, std::initializer_list<UT> const& upper, std::initializer_list<TT> const& tile = {} )
: m_space() {
init(lower, upper, tile);
}
template < typename LT , typename UT , typename TT = array_index_type >
MDRangePolicy(const typename traits::execution_space & work_space,
std::initializer_list<LT> const& lower, std::initializer_list<UT> const& upper, std::initializer_list<TT> const& tile = {} )
: m_space( work_space ) {
init(lower, upper, tile);
}
MDRangePolicy( point_type const& lower, point_type const& upper, tile_type const& tile = tile_type{} )
: m_lower(lower)
: m_space()
, m_lower(lower)
, m_upper(upper)
, m_tile(tile)
, m_num_tiles(1)
, m_prod_tile_dims(1)
{
, m_prod_tile_dims(1) {
init();
}
MDRangePolicy( const typename traits::execution_space & work_space,
point_type const& lower, point_type const& upper, tile_type const& tile = tile_type{} )
: m_space( work_space )
, m_lower(lower)
, m_upper(upper)
, m_tile(tile)
, m_num_tiles(1)
, m_prod_tile_dims(1) {
init();
}
template<class ... OtherProperties>
MDRangePolicy( const MDRangePolicy<OtherProperties...> p ):
m_space(p.m_space),
m_lower(p.m_lower),
m_upper(p.m_upper),
m_tile(p.m_tile),
m_tile_end(p.m_tile_end),
m_num_tiles(p.m_num_tiles),
m_prod_tile_dims(p.m_prod_tile_dims) {}
private:
void init() {
// Host
if ( true
#if defined(KOKKOS_ENABLE_CUDA)
@ -211,7 +257,7 @@ struct MDRangePolicy
{
index_type span;
for (int i=0; i<rank; ++i) {
span = upper[i] - lower[i];
span = m_upper[i] - m_lower[i];
if ( m_tile[i] <= 0 ) {
if ( ((int)inner_direction == (int)Right && (i < rank-1))
|| ((int)inner_direction == (int)Left && (i > 0)) )
@ -311,11 +357,9 @@ struct MDRangePolicy
#endif
}
template < typename LT , typename UT , typename TT = array_index_type >
MDRangePolicy( std::initializer_list<LT> const& lower, std::initializer_list<UT> const& upper, std::initializer_list<TT> const& tile = {} )
void init( std::initializer_list<LT> const& lower, std::initializer_list<UT> const& upper, std::initializer_list<TT> const& tile = {} )
{
if(static_cast<int>(m_lower.size()) != rank || static_cast<int>(m_upper.size()) != rank)
Kokkos::abort("MDRangePolicy: Constructor initializer lists have wrong size");
@ -589,5 +633,26 @@ void md_parallel_reduce( const std::string& str
} } // namespace Kokkos::Experimental
#endif
namespace Kokkos {
namespace Experimental {
namespace Impl {
template<unsigned long P, class ... Properties>
struct PolicyPropertyAdaptor<WorkItemProperty::ImplWorkItemProperty<P>,MDRangePolicy<Properties...>> {
typedef MDRangePolicy<Properties...> policy_in_t;
typedef MDRangePolicy<typename policy_in_t::traits::execution_space,
typename policy_in_t::traits::schedule_type,
typename policy_in_t::traits::work_tag,
typename policy_in_t::traits::index_type,
typename policy_in_t::traits::iteration_pattern,
typename policy_in_t::traits::launch_bounds,
WorkItemProperty::ImplWorkItemProperty<P>> policy_out_t;
};
} // namespace Impl
} // namespace Experimental
} // namespace Kokkos
#endif // KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP
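For reference, the execution-space-aware constructors added above let a policy be bound to a specific execution space instance instead of a default-constructed one. A minimal usage sketch, assuming an initialized Kokkos runtime (the view name, extents, and kernel body are arbitrary):

#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    Kokkos::DefaultExecutionSpace space;  // a particular instance

    // New overload from this change: the execution space instance is
    // passed first, ahead of the lower/upper bounds (and optional tile).
    Kokkos::MDRangePolicy<Kokkos::Rank<2>> policy(space, {0, 0}, {128, 128});

    Kokkos::View<double**> a("a", 128, 128);
    Kokkos::parallel_for("fill2d", policy,
                         KOKKOS_LAMBDA(const int i, const int j) {
                           a(i, j) = static_cast<double>(i * j);
                         });
    Kokkos::fence();
  }
  Kokkos::finalize();
  return 0;
}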

View File

@ -90,6 +90,7 @@
#if ! defined( KOKKOS_ENABLE_GNU_ATOMICS ) && \
! defined( KOKKOS_ENABLE_INTEL_ATOMICS ) && \
! defined( KOKKOS_ENABLE_OPENMP_ATOMICS ) && \
! defined( KOKKOS_ENABLE_STD_ATOMICS ) && \
! defined( KOKKOS_ENABLE_SERIAL_ATOMICS )
// No atomic implementation for non-CUDA compilation has been pre-selected.
@ -168,6 +169,12 @@ const char * atomic_query_version()
} // namespace Kokkos
//----------------------------------------------------------------------------
// Atomic Memory Orders
//
// Implements Strongly-typed analogs of C++ standard memory orders
#include "impl/Kokkos_Atomic_Memory_Order.hpp"
#if defined( KOKKOS_ENABLE_ROCM )
namespace Kokkos {
namespace Impl {
@ -287,6 +294,14 @@ void unlock_address_rocm_space(void* ptr);
#ifndef _WIN32
#include "impl/Kokkos_Atomic_Generic.hpp"
#endif
//----------------------------------------------------------------------------
// Provide atomic loads and stores with memory order semantics
#include "impl/Kokkos_Atomic_Load.hpp"
#include "impl/Kokkos_Atomic_Store.hpp"
//----------------------------------------------------------------------------
// This atomic-style macro should be an inlined function, not a macro
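The strongly-typed memory orders referenced above map each std::memory_order enumerator onto its own tag type, so atomic load/store overloads can dispatch on the order at compile time rather than on a runtime enum. A minimal sketch of the idea in standard C++ (tag and function names are illustrative, not the actual contents of Kokkos_Atomic_Memory_Order.hpp):

#include <atomic>

// One empty tag type per memory order; the enumerator rides along as a
// constexpr member so generic code can still reach the std value.
struct memory_order_relaxed_t {
  static constexpr std::memory_order value = std::memory_order_relaxed;
};
struct memory_order_seq_cst_t {
  static constexpr std::memory_order value = std::memory_order_seq_cst;
};

// Dispatch on the tag: the order is part of the *type*, so passing a
// runtime-variable or invalid order is a compile error, not a silent bug.
template <class T, class MemoryOrder>
T typed_atomic_load(const std::atomic<T>* ptr, MemoryOrder) {
  return ptr->load(MemoryOrder::value);
}

template <class T, class MemoryOrder>
void typed_atomic_store(std::atomic<T>* ptr, T val, MemoryOrder) {
  ptr->store(val, MemoryOrder::value);
}

int main() {
  std::atomic<int> flag{0};
  typed_atomic_store(&flag, 1, memory_order_relaxed_t{});
  return typed_atomic_load(&flag, memory_order_seq_cst_t{}) - 1;  // 0
}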

View File

@ -631,8 +631,10 @@ RealType real (const complex<RealType>& x) {
template<class RealType>
KOKKOS_INLINE_FUNCTION
RealType abs (const complex<RealType>& x) {
// FIXME (mfh 31 Oct 2014) Scale to avoid unwarranted overflow.
return std::sqrt (real (x) * real (x) + imag (x) * imag (x));
#ifndef __CUDA_ARCH__
using std::hypot;
#endif
return hypot(x.real(),x.imag());
}
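This change resolves the old FIXME: squaring the components overflows to infinity once they exceed roughly 1e154, even though the modulus itself is representable, whereas hypot rescales its arguments internally. A standard-C++ demonstration of the difference:

#include <cmath>
#include <cstdio>

int main() {
  const double big = 1e200;
  // Naive formula: big*big overflows to inf, so the result is inf.
  const double naive = std::sqrt(big * big + big * big);
  // hypot scales first and returns the correct ~1.4142e200.
  const double safe = std::hypot(big, big);
  std::printf("naive = %g\nhypot = %g\n", naive, safe);
  return 0;
}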
//! Power of a complex number

Some files were not shown because too many files have changed in this diff.