Merge pull request #1422 from stanmoore1/team_opt

Optimize KOKKOS package for small systems
2019-06-10 20:03:53 -04:00 · 2019-06-10 20:03:53 -04:00 · e72ac92a7f
parent b9e10d55e2 b88158fc3b
commit e72ac92a7f
28 changed files with 1216 additions and 500 deletions
--- a/doc/src/Speed_kokkos.txt
+++ b/doc/src/Speed_kokkos.txt
@ -46,7 +46,7 @@ software version 7.5 or later must be installed on your system. See
 the discussion for the "GPU package"_Speed_gpu.html for details of how
 to check and do this.

-NOTE: Kokkos with CUDA currently implicitly assumes, that the MPI
+NOTE: Kokkos with CUDA currently implicitly assumes that the MPI
 library is CUDA-aware and has support for GPU-direct. This is not
 always the case, especially when using pre-compiled MPI libraries
 provided by a Linux distribution. This is not a problem when using
@ -207,19 +207,21 @@ supports.

 [Running on GPUs:]

-Use the "-k" "command-line switch"_Run_options.html to
-specify the number of GPUs per node. Typically the -np setting of the
-mpirun command should set the number of MPI tasks/node to be equal to
-the number of physical GPUs on the node.  You can assign multiple MPI
-tasks to the same GPU with the KOKKOS package, but this is usually
-only faster if significant portions of the input script have not
-been ported to use Kokkos. Using CUDA MPS is recommended in this
-scenario. Using a CUDA-aware MPI library with support for GPU-direct
-is highly recommended. GPU-direct use can be avoided by using
-"-pk kokkos gpu/direct no"_package.html.
-As above for multi-core CPUs (and no GPU), if N is the number of
-physical cores/node, then the number of MPI tasks/node should not
-exceed N.
+Use the "-k" "command-line switch"_Run_options.html to specify the 
+number of GPUs per node. Typically the -np setting of the mpirun command 
+should set the number of MPI tasks/node to be equal to the number of 
+physical GPUs on the node. You can assign multiple MPI tasks to the same 
+GPU with the KOKKOS package, but this is usually only faster if some 
+portions of the input script have not been ported to use Kokkos. In this 
+case, also packing/unpacking communication buffers on the host may give 
+speedup (see the KOKKOS "package"_package.html command). Using CUDA MPS 
+is recommended in this scenario.
+
+Using a CUDA-aware MPI library with 
+support for GPU-direct is highly recommended. GPU-direct use can be 
+avoided by using "-pk kokkos gpu/direct no"_package.html. As above for 
+multi-core CPUs (and no GPU), if N is the number of physical cores/node, 
+then the number of MPI tasks/node should not exceed N.

 -k on g Ng :pre

--- a/doc/src/package.txt
+++ b/doc/src/package.txt
@ -64,13 +64,16 @@ args = arguments specific to the style :l
      {no_affinity} values = none
  {kokkos} args = keyword value ...
    zero or more keyword/value pairs may be appended
-    keywords = {neigh} or {neigh/qeq} or {newton} or {binsize} or {comm} or {comm/exchange} or {comm/forward} or {comm/reverse} or {gpu/direct}
+    keywords = {neigh} or {neigh/qeq} or {neigh/thread} or {newton} or {binsize} or {comm} or {comm/exchange} or {comm/forward} or {comm/reverse} or {gpu/direct}
      {neigh} value = {full} or {half}
        full = full neighbor list
        half = half neighbor list built in thread-safe manner
      {neigh/qeq} value = {full} or {half}
        full = full neighbor list
        half = half neighbor list built in thread-safe manner
+      {neigh/thread} value = {off} or {on}
+        off = thread only over atoms
+        on = thread over both atoms and neighbors
      {newton} = {off} or {on}
        off = set Newton pairwise and bonded flags off
        on = set Newton pairwise and bonded flags on
@ -442,7 +445,19 @@ running on CPUs, a {half} neighbor list is the default because it are
 often faster, just as it is for non-accelerated pair styles. Similarly, 
 the {neigh/qeq} keyword determines how neighbor lists are built for "fix 
 qeq/reax/kk"_fix_qeq_reax.html. If not explicitly set, the value of 
-{neigh/qeq} will match {neigh}. 
+{neigh/qeq} will match {neigh}.
+
+If the {neigh/thread} keyword is set to {off}, then the KOKKOS package 
+threads only over atoms. However, for small systems, this may not expose 
+enough parallelism to keep a GPU busy. When this keyword is set to {on}, 
+the KOKKOS package threads over both atoms and neighbors of atoms. When 
+using {neigh/thread} {on}, a full neighbor list must also be used. Using 
+{neigh/thread} {on} may be slower for large systems, so this this option 
+is turned on by default only when there are 16K atoms or less owned by 
+an MPI rank and when using a full neighbor list. Not all KOKKOS-enabled 
+potentials support this keyword yet, and only thread over atoms. Many 
+simple pair-wise potentials such as Lennard-Jones do support threading 
+over both atoms and neighbors.

 The {newton} keyword sets the Newton flags for pairwise and bonded 
 interactions to {off} or {on}, the same as the "newton"_newton.html 
@ -475,10 +490,10 @@ are rebuilt. The data is only for atoms that migrate to new processors.
 "Forward" communication happens every timestep. "Reverse" communication 
 happens every timestep if the {newton} option is on. The data is for 
 atom coordinates and any other atom properties that needs to be updated 
-for ghost atoms owned by each processor. 
+for ghost atoms owned by each processor.

 The {comm} keyword is simply a short-cut to set the same value for both 
-the {comm/exchange} and {comm/forward} and {comm/reverse} keywords. 
+the {comm/exchange} and {comm/forward} and {comm/reverse} keywords.

 The value options for all 3 keywords are {no} or {host} or {device}. A 
 value of {no} means to use the standard non-KOKKOS method of 
@ -486,26 +501,26 @@ packing/unpacking data for the communication. A value of {host} means to
 use the host, typically a multi-core CPU, and perform the 
 packing/unpacking in parallel with threads. A value of {device} means to 
 use the device, typically a GPU, to perform the packing/unpacking 
-operation. 
+operation.

 The optimal choice for these keywords depends on the input script and 
 the hardware used. The {no} value is useful for verifying that the 
 Kokkos-based {host} and {device} values are working correctly. It is the 
-default when running on CPUs since it is usually the fastest. 
+default when running on CPUs since it is usually the fastest.

 When running on CPUs or Xeon Phi, the {host} and {device} values work 
 identically. When using GPUs, the {device} value is the default since it 
 will typically be optimal if all of your styles used in your input 
 script are supported by the KOKKOS package. In this case data can stay 
 on the GPU for many timesteps without being moved between the host and 
-GPU, if you use the {device} value. This requires that your MPI is able 
-to access GPU memory directly. Currently that is true for OpenMPI 1.8 
-(or later versions), Mvapich2 1.9 (or later), and CrayMPI. If your 
-script uses styles (e.g. fixes) which are not yet supported by the 
-KOKKOS package, then data has to be move between the host and device 
-anyway, so it is typically faster to let the host handle communication, 
-by using the {host} value. Using {host} instead of {no} will enable use 
-of multiple threads to pack/unpack communicated data. 
+GPU, if you use the {device} value. If your script uses styles (e.g. 
+fixes) which are not yet supported by the KOKKOS package, then data has 
+to be move between the host and device anyway, so it is typically faster 
+to let the host handle communication, by using the {host} value. Using 
+{host} instead of {no} will enable use of multiple threads to 
+pack/unpack communicated data. When running small systems on a GPU, 
+performing the exchange pack/unpack on the host CPU can give speedup 
+since it reduces the number of CUDA kernel launches.

 The {gpu/direct} keyword chooses whether GPU-direct will be used. When 
 this keyword is set to {on}, buffers in GPU memory are passed directly 
@ -518,7 +533,8 @@ the {gpu/direct} keyword is automatically set to {off} by default. When
 the {gpu/direct} keyword is set to {off} while any of the {comm} 
 keywords are set to {device}, the value for these {comm} keywords will 
 be automatically changed to {host}. This setting has no effect if not 
-running on GPUs.
+running on GPUs. GPU-direct is available for OpenMPI 1.8 (or later 
+versions), Mvapich2 1.9 (or later), and CrayMPI.

 :line

@ -630,11 +646,12 @@ neigh/qeq = full, newton = off, binsize for GPUs = 2x LAMMPS default
 value, comm = device, gpu/direct = on. When LAMMPS can safely detect 
 that GPU-direct is not available, the default value of gpu/direct 
 becomes "off". For CPUs or Xeon Phis, the option defaults are neigh = 
-half, neigh/qeq = half, newton = on, binsize = 0.0, and comm = no. These 
-settings are made automatically by the required "-k on" "command-line 
-switch"_Run_options.html. You can change them by using the package 
-kokkos command in your input script or via the "-pk kokkos command-line 
-switch"_Run_options.html.
+half, neigh/qeq = half, newton = on, binsize = 0.0, and comm = no. The 
+option neigh/thread = on when there are 16K atoms or less on an MPI 
+rank, otherwise it is "off". These settings are made automatically by 
+the required "-k on" "command-line switch"_Run_options.html. You can 
+change them by using the package kokkos command in your input script or 
+via the "-pk kokkos command-line switch"_Run_options.html.

 For the OMP package, the default is Nthreads = 0 and the option
 defaults are neigh = yes.  These settings are made automatically if
--- a/src/KOKKOS/atom_kokkos.cpp
+++ b/src/KOKKOS/atom_kokkos.cpp
@ -22,6 +22,7 @@
 #include "memory_kokkos.h"
 #include "error.h"
 #include "kokkos.h"
+#include "atom_masks.h"

 using namespace LAMMPS_NS;

@ -270,8 +271,10 @@ int AtomKokkos::add_custom(const char *name, int flag)
    int n = strlen(name) + 1;
    dname[index] = new char[n];
    strcpy(dname[index],name);
+    this->sync(Device,DVECTOR_MASK);
    memoryKK->grow_kokkos(k_dvector,dvector,ndvector,nmax,
                        "atom:dvector");
+    this->modified(Device,DVECTOR_MASK);
  }

  return index;
--- a/src/KOKKOS/atom_vec_angle_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_angle_kokkos.cpp
@ -24,7 +24,7 @@

 using namespace LAMMPS_NS;

-#define DELTA 10000
+#define DELTA 10

 /* ---------------------------------------------------------------------- */

@ -59,14 +59,15 @@ AtomVecAngleKokkos::AtomVecAngleKokkos(LAMMPS *lmp) : AtomVecKokkos(lmp)

 void AtomVecAngleKokkos::grow(int n)
 {
-  if (n == 0) nmax += DELTA;
+  int step = MAX(DELTA,nmax*0.01);
+  if (n == 0) nmax += step;
  else nmax = n;
  atomKK->nmax = nmax;
  if (nmax < 0 || nmax > MAXSMALLINT)
    error->one(FLERR,"Per-processor system is too big");

-  sync(Device,ALL_MASK);
-  modified(Device,ALL_MASK);
+  atomKK->sync(Device,ALL_MASK);
+  atomKK->modified(Device,ALL_MASK);

  memoryKK->grow_kokkos(atomKK->k_tag,atomKK->tag,nmax,"atom:tag");
  memoryKK->grow_kokkos(atomKK->k_type,atomKK->type,nmax,"atom:type");
@ -98,7 +99,7 @@ void AtomVecAngleKokkos::grow(int n)
                      "atom:angle_atom3");

  grow_reset();
-  sync(Host,ALL_MASK);
+  atomKK->sync(Host,ALL_MASK);

  if (atom->nextra_grow)
    for (int iextra = 0; iextra < atom->nextra_grow; iextra++)
@ -282,7 +283,7 @@ int AtomVecAngleKokkos::pack_comm_kokkos(const int &n,
  // Choose correct forward PackComm kernel

  if(commKK->forward_comm_on_host) {
-    sync(Host,X_MASK);
+    atomKK->sync(Host,X_MASK);
    if(pbc_flag) {
      if(domain->triclinic) {
        struct AtomVecAngleKokkos_PackComm<LMPHostType,1,1> f(atomKK->k_x,buf,list,iswap,
@ -309,7 +310,7 @@ int AtomVecAngleKokkos::pack_comm_kokkos(const int &n,
      }
    }
  } else {
-    sync(Device,X_MASK);
+    atomKK->sync(Device,X_MASK);
    if(pbc_flag) {
      if(domain->triclinic) {
        struct AtomVecAngleKokkos_PackComm<LMPDeviceType,1,1> f(atomKK->k_x,buf,list,iswap,
@ -397,8 +398,8 @@ int AtomVecAngleKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &li
                                       const int nfirst, const int &pbc_flag,
                                       const int* const pbc) {
  if(commKK->forward_comm_on_host) {
-    sync(Host,X_MASK);
-    modified(Host,X_MASK);
+    atomKK->sync(Host,X_MASK);
+    atomKK->modified(Host,X_MASK);
    if(pbc_flag) {
      if(domain->triclinic) {
      struct AtomVecAngleKokkos_PackCommSelf<LMPHostType,1,1>
@ -429,8 +430,8 @@ int AtomVecAngleKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &li
      }
    }
  } else {
-    sync(Device,X_MASK);
-    modified(Device,X_MASK);
+    atomKK->sync(Device,X_MASK);
+    atomKK->modified(Device,X_MASK);
    if(pbc_flag) {
      if(domain->triclinic) {
      struct AtomVecAngleKokkos_PackCommSelf<LMPDeviceType,1,1>
@ -493,13 +494,13 @@ struct AtomVecAngleKokkos_UnpackComm {
 void AtomVecAngleKokkos::unpack_comm_kokkos(const int &n, const int &first,
    const DAT::tdual_xfloat_2d &buf ) {
  if(commKK->forward_comm_on_host) {
-    sync(Host,X_MASK);
-    modified(Host,X_MASK);
+    atomKK->sync(Host,X_MASK);
+    atomKK->modified(Host,X_MASK);
    struct AtomVecAngleKokkos_UnpackComm<LMPHostType> f(atomKK->k_x,buf,first);
    Kokkos::parallel_for(n,f);
  } else {
-    sync(Device,X_MASK);
-    modified(Device,X_MASK);
+    atomKK->sync(Device,X_MASK);
+    atomKK->modified(Device,X_MASK);
    struct AtomVecAngleKokkos_UnpackComm<LMPDeviceType> f(atomKK->k_x,buf,first);
    Kokkos::parallel_for(n,f);
  }
@ -642,7 +643,7 @@ void AtomVecAngleKokkos::unpack_comm_vel(int n, int first, double *buf)
 int AtomVecAngleKokkos::pack_reverse(int n, int first, double *buf)
 {
  if(n > 0)
-    sync(Host,F_MASK);
+    atomKK->sync(Host,F_MASK);

  int m = 0;
  const int last = first + n;
@ -659,7 +660,7 @@ int AtomVecAngleKokkos::pack_reverse(int n, int first, double *buf)
 void AtomVecAngleKokkos::unpack_reverse(int n, int *list, double *buf)
 {
  if(n > 0)
-    modified(Host,F_MASK);
+    atomKK->modified(Host,F_MASK);

  int m = 0;
  for (int i = 0; i < n; i++) {
@ -960,9 +961,9 @@ struct AtomVecAngleKokkos_UnpackBorder {
 void AtomVecAngleKokkos::unpack_border_kokkos(const int &n, const int &first,
                                             const DAT::tdual_xfloat_2d &buf,
                                             ExecutionSpace space) {
-  modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK);
+  atomKK->modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK);
  while (first+n >= nmax) grow(0);
-  modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK);
+  atomKK->modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK);
  if(space==Host) {
    struct AtomVecAngleKokkos_UnpackBorder<LMPHostType>
      f(buf.view<LMPHostType>(),h_x,h_tag,h_type,h_mask,h_molecule,first);
@ -984,7 +985,7 @@ void AtomVecAngleKokkos::unpack_border(int n, int first, double *buf)
  last = first + n;
  for (i = first; i < last; i++) {
    if (i == nmax) grow(0);
-    modified(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK);
+    atomKK->modified(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK);
    h_x(i,0) = buf[m++];
    h_x(i,1) = buf[m++];
    h_x(i,2) = buf[m++];
@ -1010,7 +1011,7 @@ void AtomVecAngleKokkos::unpack_border_vel(int n, int first, double *buf)
  last = first + n;
  for (i = first; i < last; i++) {
    if (i == nmax) grow(0);
-    modified(Host,X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK);
+    atomKK->modified(Host,X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK);
    h_x(i,0) = buf[m++];
    h_x(i,1) = buf[m++];
    h_x(i,2) = buf[m++];
@ -1412,7 +1413,7 @@ int AtomVecAngleKokkos::unpack_exchange(double *buf)
 {
  int nlocal = atom->nlocal;
  if (nlocal == nmax) grow(0);
-  modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
+  atomKK->modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
           MASK_MASK | IMAGE_MASK | MOLECULE_MASK | BOND_MASK |
           ANGLE_MASK | SPECIAL_MASK);

@ -1487,7 +1488,7 @@ int AtomVecAngleKokkos::size_restart()

 int AtomVecAngleKokkos::pack_restart(int i, double *buf)
 {
-  sync(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
+  atomKK->sync(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
            MASK_MASK | IMAGE_MASK | MOLECULE_MASK | BOND_MASK |
            ANGLE_MASK | SPECIAL_MASK);

@ -1541,7 +1542,7 @@ int AtomVecAngleKokkos::unpack_restart(double *buf)
    if (atom->nextra_store)
      memory->grow(atom->extra,nmax,atom->nextra_store,"atom:extra");
  }
-  modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
+  atomKK->modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
                MASK_MASK | IMAGE_MASK | MOLECULE_MASK | BOND_MASK |
                ANGLE_MASK | SPECIAL_MASK);

--- a/src/KOKKOS/atom_vec_atomic_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_atomic_kokkos.cpp
@ -24,7 +24,7 @@

 using namespace LAMMPS_NS;

-#define DELTA 10000
+#define DELTA 10

 /* ---------------------------------------------------------------------- */

@ -55,14 +55,15 @@ AtomVecAtomicKokkos::AtomVecAtomicKokkos(LAMMPS *lmp) : AtomVecKokkos(lmp)

 void AtomVecAtomicKokkos::grow(int n)
 {
-  if (n == 0) nmax += DELTA;
+  int step = MAX(DELTA,nmax*0.01);
+  if (n == 0) nmax += step;
  else nmax = n;
  atomKK->nmax = nmax;
  if (nmax < 0 || nmax > MAXSMALLINT)
    error->one(FLERR,"Per-processor system is too big");

-  sync(Device,ALL_MASK);
-  modified(Device,ALL_MASK);
+  atomKK->sync(Device,ALL_MASK);
+  atomKK->modified(Device,ALL_MASK);

  memoryKK->grow_kokkos(atomKK->k_tag,atomKK->tag,nmax,"atom:tag");
  memoryKK->grow_kokkos(atomKK->k_type,atomKK->type,nmax,"atom:type");
@ -74,7 +75,7 @@ void AtomVecAtomicKokkos::grow(int n)
  memoryKK->grow_kokkos(atomKK->k_f,atomKK->f,nmax,3,"atom:f");

  grow_reset();
-  sync(Host,ALL_MASK);
+  atomKK->sync(Host,ALL_MASK);

  if (atom->nextra_grow)
    for (int iextra = 0; iextra < atom->nextra_grow; iextra++)
@ -393,9 +394,9 @@ struct AtomVecAtomicKokkos_UnpackBorder {

 void AtomVecAtomicKokkos::unpack_border_kokkos(const int &n, const int &first,
                     const DAT::tdual_xfloat_2d &buf,ExecutionSpace space) {
-  modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK);
+  atomKK->modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK);
  while (first+n >= nmax) grow(0);
-  modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK);
+  atomKK->modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK);
  if(space==Host) {
    struct AtomVecAtomicKokkos_UnpackBorder<LMPHostType> f(buf.view<LMPHostType>(),h_x,h_tag,h_type,h_mask,first);
    Kokkos::parallel_for(n,f);
@ -415,7 +416,7 @@ void AtomVecAtomicKokkos::unpack_border(int n, int first, double *buf)
  last = first + n;
  for (i = first; i < last; i++) {
    if (i == nmax) grow(0);
-    modified(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK);
+    atomKK->modified(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK);
    h_x(i,0) = buf[m++];
    h_x(i,1) = buf[m++];
    h_x(i,2) = buf[m++];
@ -440,7 +441,7 @@ void AtomVecAtomicKokkos::unpack_border_vel(int n, int first, double *buf)
  last = first + n;
  for (i = first; i < last; i++) {
    if (i == nmax) grow(0);
-    modified(Host,X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK);
+    atomKK->modified(Host,X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK);
    h_x(i,0) = buf[m++];
    h_x(i,1) = buf[m++];
    h_x(i,2) = buf[m++];
@ -668,7 +669,7 @@ int AtomVecAtomicKokkos::unpack_exchange(double *buf)
 {
  int nlocal = atom->nlocal;
  if (nlocal == nmax) grow(0);
-  modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
+  atomKK->modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
           MASK_MASK | IMAGE_MASK);

  int m = 1;
@ -720,7 +721,7 @@ int AtomVecAtomicKokkos::size_restart()

 int AtomVecAtomicKokkos::pack_restart(int i, double *buf)
 {
-  sync(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
+  atomKK->sync(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
            MASK_MASK | IMAGE_MASK );

  int m = 1;
@ -755,7 +756,7 @@ int AtomVecAtomicKokkos::unpack_restart(double *buf)
    if (atom->nextra_store)
      memory->grow(atom->extra,nmax,atom->nextra_store,"atom:extra");
  }
-  modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
+  atomKK->modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
                MASK_MASK | IMAGE_MASK );

  int m = 1;
--- a/src/KOKKOS/atom_vec_bond_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_bond_kokkos.cpp
@ -24,7 +24,7 @@

 using namespace LAMMPS_NS;

-#define DELTA 10000
+#define DELTA 10

 /* ---------------------------------------------------------------------- */

@ -58,14 +58,15 @@ AtomVecBondKokkos::AtomVecBondKokkos(LAMMPS *lmp) : AtomVecKokkos(lmp)

 void AtomVecBondKokkos::grow(int n)
 {
-  if (n == 0) nmax += DELTA;
+  int step = MAX(DELTA,nmax*0.01);
+  if (n == 0) nmax += step;
  else nmax = n;
  atomKK->nmax = nmax;
  if (nmax < 0 || nmax > MAXSMALLINT)
    error->one(FLERR,"Per-processor system is too big");

-  sync(Device,ALL_MASK);
-  modified(Device,ALL_MASK);
+  atomKK->sync(Device,ALL_MASK);
+  atomKK->modified(Device,ALL_MASK);

  memoryKK->grow_kokkos(atomKK->k_tag,atomKK->tag,nmax,"atom:tag");
  memoryKK->grow_kokkos(atomKK->k_type,atomKK->type,nmax,"atom:type");
@ -84,7 +85,7 @@ void AtomVecBondKokkos::grow(int n)
  memoryKK->grow_kokkos(atomKK->k_bond_atom,atomKK->bond_atom,nmax,atomKK->bond_per_atom,"atom:bond_atom");

  grow_reset();
-  sync(Host,ALL_MASK);
+  atomKK->sync(Host,ALL_MASK);

  if (atom->nextra_grow)
    for (int iextra = 0; iextra < atomKK->nextra_grow; iextra++)
@ -468,9 +469,9 @@ struct AtomVecBondKokkos_UnpackBorder {
 void AtomVecBondKokkos::unpack_border_kokkos(const int &n, const int &first,
                                             const DAT::tdual_xfloat_2d &buf,
                                             ExecutionSpace space) {
-  modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK);
+  atomKK->modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK);
  while (first+n >= nmax) grow(0);
-  modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK);
+  atomKK->modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK);
  if(space==Host) {
    struct AtomVecBondKokkos_UnpackBorder<LMPHostType>
      f(buf.view<LMPHostType>(),h_x,h_tag,h_type,h_mask,h_molecule,first);
@ -492,7 +493,7 @@ void AtomVecBondKokkos::unpack_border(int n, int first, double *buf)
  last = first + n;
  for (i = first; i < last; i++) {
    if (i == nmax) grow(0);
-    modified(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK);
+    atomKK->modified(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK);
    h_x(i,0) = buf[m++];
    h_x(i,1) = buf[m++];
    h_x(i,2) = buf[m++];
@ -518,7 +519,7 @@ void AtomVecBondKokkos::unpack_border_vel(int n, int first, double *buf)
  last = first + n;
  for (i = first; i < last; i++) {
    if (i == nmax) grow(0);
-    modified(Host,X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK);
+    atomKK->modified(Host,X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK);
    h_x(i,0) = buf[m++];
    h_x(i,1) = buf[m++];
    h_x(i,2) = buf[m++];
@ -866,7 +867,7 @@ int AtomVecBondKokkos::unpack_exchange(double *buf)
 {
  int nlocal = atom->nlocal;
  if (nlocal == nmax) grow(0);
-  modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
+  atomKK->modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
           MASK_MASK | IMAGE_MASK | MOLECULE_MASK | BOND_MASK | SPECIAL_MASK);

  int k;
@ -934,7 +935,7 @@ int AtomVecBondKokkos::size_restart()

 int AtomVecBondKokkos::pack_restart(int i, double *buf)
 {
-  sync(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
+  atomKK->sync(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
            MASK_MASK | IMAGE_MASK | MOLECULE_MASK | BOND_MASK | SPECIAL_MASK);
  int m = 1;
  buf[m++] = h_x(i,0);
@ -978,7 +979,7 @@ int AtomVecBondKokkos::unpack_restart(double *buf)
    if (atom->nextra_store)
      memory->grow(atom->extra,nmax,atom->nextra_store,"atom:extra");
  }
-  modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
+  atomKK->modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
           MASK_MASK | IMAGE_MASK | MOLECULE_MASK | BOND_MASK | SPECIAL_MASK);
  int m = 1;
  h_x(nlocal,0) = buf[m++];
--- a/src/KOKKOS/atom_vec_charge_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_charge_kokkos.cpp
@ -24,7 +24,7 @@

 using namespace LAMMPS_NS;

-#define DELTA 10000
+#define DELTA 10

 /* ---------------------------------------------------------------------- */

@ -58,14 +58,15 @@ AtomVecChargeKokkos::AtomVecChargeKokkos(LAMMPS *lmp) : AtomVecKokkos(lmp)

 void AtomVecChargeKokkos::grow(int n)
 {
-  if (n == 0) nmax += DELTA;
+  int step = MAX(DELTA,nmax*0.01);
+  if (n == 0) nmax += step;
  else nmax = n;
  atomKK->nmax = nmax;
  if (nmax < 0 || nmax > MAXSMALLINT)
    error->one(FLERR,"Per-processor system is too big");

-  sync(Device,ALL_MASK);
-  modified(Device,ALL_MASK);
+  atomKK->sync(Device,ALL_MASK);
+  atomKK->modified(Device,ALL_MASK);

  memoryKK->grow_kokkos(atomKK->k_tag,atomKK->tag,nmax,"atom:tag");
  memoryKK->grow_kokkos(atomKK->k_type,atomKK->type,nmax,"atom:type");
@ -79,7 +80,7 @@ void AtomVecChargeKokkos::grow(int n)
  memoryKK->grow_kokkos(atomKK->k_q,atomKK->q,nmax,"atom:q");

  grow_reset();
-  sync(Host,ALL_MASK);
+  atomKK->sync(Host,ALL_MASK);

  if (atom->nextra_grow)
    for (int iextra = 0; iextra < atom->nextra_grow; iextra++)
@ -494,7 +495,7 @@ void AtomVecChargeKokkos::unpack_border_kokkos(const int &n, const int &first,
      f(buf.view<LMPDeviceType>(),d_x,d_tag,d_type,d_mask,d_q,first);
    Kokkos::parallel_for(n,f);
  }
-  modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK);
+  atomKK->modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK);
 }

 /* ---------------------------------------------------------------------- */
@ -510,7 +511,7 @@ void AtomVecChargeKokkos::unpack_border(int n, int first, double *buf)
    if (i == nmax) {
      grow(0);
    }
-    modified(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK);
+    atomKK->modified(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK);
    h_x(i,0) = buf[m++];
    h_x(i,1) = buf[m++];
    h_x(i,2) = buf[m++];
@ -536,7 +537,7 @@ void AtomVecChargeKokkos::unpack_border_vel(int n, int first, double *buf)
  last = first + n;
  for (i = first; i < last; i++) {
    if (i == nmax) grow(0);
-    modified(Host,X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK);
+    atomKK->modified(Host,X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK);
    h_x(i,0) = buf[m++];
    h_x(i,1) = buf[m++];
    h_x(i,2) = buf[m++];
@ -797,7 +798,7 @@ int AtomVecChargeKokkos::unpack_exchange(double *buf)
 {
  int nlocal = atom->nlocal;
  if (nlocal == nmax) grow(0);
-  modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
+  atomKK->modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
           MASK_MASK | IMAGE_MASK | Q_MASK);

  int m = 1;
@ -850,7 +851,7 @@ int AtomVecChargeKokkos::size_restart()

 int AtomVecChargeKokkos::pack_restart(int i, double *buf)
 {
-  sync(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
+  atomKK->sync(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
            MASK_MASK | IMAGE_MASK | Q_MASK);

  int m = 1;
@ -888,7 +889,7 @@ int AtomVecChargeKokkos::unpack_restart(double *buf)
      memory->grow(atom->extra,nmax,atom->nextra_store,"atom:extra");
  }

-  modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
+  atomKK->modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
           MASK_MASK | IMAGE_MASK | Q_MASK);

  int m = 1;
--- a/src/KOKKOS/atom_vec_dpd_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_dpd_kokkos.cpp
@ -24,7 +24,7 @@

 using namespace LAMMPS_NS;

-#define DELTA 10000
+#define DELTA 10

 /* ---------------------------------------------------------------------- */

@ -60,14 +60,15 @@ AtomVecDPDKokkos::AtomVecDPDKokkos(LAMMPS *lmp) : AtomVecKokkos(lmp)

 void AtomVecDPDKokkos::grow(int n)
 {
-  if (n == 0) nmax += DELTA;
+  int step = MAX(DELTA,nmax*0.01);
+  if (n == 0) nmax += step;
  else nmax = n;
  atomKK->nmax = nmax;
  if (nmax < 0 || nmax > MAXSMALLINT)
    error->one(FLERR,"Per-processor system is too big");

-  sync(Device,ALL_MASK);
-  modified(Device,ALL_MASK);
+  atomKK->sync(Device,ALL_MASK);
+  atomKK->modified(Device,ALL_MASK);

  memoryKK->grow_kokkos(atomKK->k_tag,atomKK->tag,nmax,"atom:tag");
  memoryKK->grow_kokkos(atomKK->k_type,atomKK->type,nmax,"atom:type");
@ -93,7 +94,7 @@ void AtomVecDPDKokkos::grow(int n)
      modify->fix[atom->extra_grow[iextra]]->grow_arrays(nmax);

  grow_reset();
-  sync(Host,ALL_MASK);
+  atomKK->sync(Host,ALL_MASK);
 }

 /* ----------------------------------------------------------------------
@ -158,7 +159,7 @@ void AtomVecDPDKokkos::grow_reset()

 void AtomVecDPDKokkos::copy(int i, int j, int delflag)
 {
-  sync(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
+  atomKK->sync(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
            MASK_MASK | IMAGE_MASK | DPDTHETA_MASK |
            UCG_MASK | UCGNEW_MASK |
            UCOND_MASK | UMECH_MASK | UCHEM_MASK | DVECTOR_MASK);
@ -184,7 +185,7 @@ void AtomVecDPDKokkos::copy(int i, int j, int delflag)
    for (int iextra = 0; iextra < atom->nextra_grow; iextra++)
      modify->fix[atom->extra_grow[iextra]]->copy_arrays(i,j,delflag);

-  modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
+  atomKK->modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
                MASK_MASK | IMAGE_MASK | DPDTHETA_MASK |
                UCG_MASK | UCGNEW_MASK |
                UCOND_MASK | UMECH_MASK | UCHEM_MASK | DVECTOR_MASK);
@ -268,7 +269,7 @@ int AtomVecDPDKokkos::pack_comm_kokkos(const int &n,
  // Choose correct forward PackComm kernel

  if(commKK->forward_comm_on_host) {
-    sync(Host,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
+    atomKK->sync(Host,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
    if(pbc_flag) {
      if(domain->triclinic) {
        struct AtomVecDPDKokkos_PackComm<LMPHostType,1,1> f(atomKK->k_x,
@ -303,7 +304,7 @@ int AtomVecDPDKokkos::pack_comm_kokkos(const int &n,
      }
    }
  } else {
-    sync(Device,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
+    atomKK->sync(Device,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
    if(pbc_flag) {
      if(domain->triclinic) {
        struct AtomVecDPDKokkos_PackComm<LMPDeviceType,1,1> f(atomKK->k_x,
@ -410,8 +411,8 @@ struct AtomVecDPDKokkos_PackCommSelf {
 int AtomVecDPDKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &list, const int & iswap,
                                                                                const int nfirst, const int &pbc_flag, const int* const pbc) {
  if(commKK->forward_comm_on_host) {
-    sync(Host,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
-    modified(Host,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
+    atomKK->sync(Host,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
+    atomKK->modified(Host,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
    if(pbc_flag) {
      if(domain->triclinic) {
      struct AtomVecDPDKokkos_PackCommSelf<LMPHostType,1,1> f(atomKK->k_x,
@ -446,8 +447,8 @@ int AtomVecDPDKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &list
      }
    }
  } else {
-    sync(Device,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
-    modified(Device,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
+    atomKK->sync(Device,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
+    atomKK->modified(Device,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
    if(pbc_flag) {
      if(domain->triclinic) {
      struct AtomVecDPDKokkos_PackCommSelf<LMPDeviceType,1,1> f(atomKK->k_x,
@ -528,15 +529,15 @@ struct AtomVecDPDKokkos_UnpackComm {
 void AtomVecDPDKokkos::unpack_comm_kokkos(const int &n, const int &first,
    const DAT::tdual_xfloat_2d &buf ) {
  if(commKK->forward_comm_on_host) {
-    sync(Host,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
-    modified(Host,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
+    atomKK->sync(Host,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
+    atomKK->modified(Host,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
    struct AtomVecDPDKokkos_UnpackComm<LMPHostType> f(atomKK->k_x,
    atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,
    buf,first);
    Kokkos::parallel_for(n,f);
  } else {
-    sync(Device,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
-    modified(Device,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
+    atomKK->sync(Device,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
+    atomKK->modified(Device,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
    struct AtomVecDPDKokkos_UnpackComm<LMPDeviceType> f(atomKK->k_x,
    atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,
    buf,first);
@ -552,7 +553,7 @@ int AtomVecDPDKokkos::pack_comm(int n, int *list, double *buf,
  int i,j,m;
  double dx,dy,dz;

-  sync(Host,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
+  atomKK->sync(Host,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);

  m = 0;
  if (pbc_flag == 0) {
@ -598,7 +599,7 @@ int AtomVecDPDKokkos::pack_comm_vel(int n, int *list, double *buf,
  int i,j,m;
  double dx,dy,dz,dvx,dvy,dvz;

-  sync(Host,X_MASK|V_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
+  atomKK->sync(Host,X_MASK|V_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);

  m = 0;
  if (pbc_flag == 0) {
@ -685,7 +686,7 @@ void AtomVecDPDKokkos::unpack_comm(int n, int first, double *buf)
    h_uChem[i] = buf[m++];
  }

-  modified(Host,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
+  atomKK->modified(Host,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
 }

 /* ---------------------------------------------------------------------- */
@ -709,7 +710,7 @@ void AtomVecDPDKokkos::unpack_comm_vel(int n, int first, double *buf)
    h_uChem[i] = buf[m++];
  }

-  modified(Host,X_MASK|V_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
+  atomKK->modified(Host,X_MASK|V_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
 }

 /* ---------------------------------------------------------------------- */
@ -717,7 +718,7 @@ void AtomVecDPDKokkos::unpack_comm_vel(int n, int first, double *buf)
 int AtomVecDPDKokkos::pack_reverse(int n, int first, double *buf)
 {
  if(n > 0)
-    sync(Host,F_MASK);
+    atomKK->sync(Host,F_MASK);

  int m = 0;
  const int last = first + n;
@ -734,8 +735,8 @@ int AtomVecDPDKokkos::pack_reverse(int n, int first, double *buf)
 void AtomVecDPDKokkos::unpack_reverse(int n, int *list, double *buf)
 {
  if(n > 0) {
-    sync(Host,F_MASK);
-    modified(Host,F_MASK);
+    atomKK->sync(Host,F_MASK);
+    atomKK->modified(Host,F_MASK);
  }

  int m = 0;
@ -819,7 +820,7 @@ int AtomVecDPDKokkos::pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist, DA
 {
  X_FLOAT dx,dy,dz;

-  sync(space,ALL_MASK);
+  atomKK->sync(space,ALL_MASK);

  if (pbc_flag != 0) {
    if (domain->triclinic == 0) {
@ -876,7 +877,7 @@ int AtomVecDPDKokkos::pack_border(int n, int *list, double *buf,
  int i,j,m;
  double dx,dy,dz;

-  sync(Host,ALL_MASK);
+  atomKK->sync(Host,ALL_MASK);

  m = 0;
  if (pbc_flag == 0) {
@ -937,7 +938,7 @@ int AtomVecDPDKokkos::pack_border_vel(int n, int *list, double *buf,
  int i,j,m;
  double dx,dy,dz,dvx,dvy,dvz;

-  sync(Host,ALL_MASK);
+  atomKK->sync(Host,ALL_MASK);

  m = 0;
  if (pbc_flag == 0) {
@ -1032,7 +1033,7 @@ int AtomVecDPDKokkos::pack_comm_hybrid(int n, int *list, double *buf)
 {
  int i,j,m;

-  sync(Host,DPDTHETA_MASK | UCOND_MASK |
+  atomKK->sync(Host,DPDTHETA_MASK | UCOND_MASK |
            UMECH_MASK | UCHEM_MASK);

  m = 0;
@ -1052,7 +1053,7 @@ int AtomVecDPDKokkos::pack_border_hybrid(int n, int *list, double *buf)
 {
  int i,j,m;

-  sync(Host,DPDTHETA_MASK | UCOND_MASK |
+  atomKK->sync(Host,DPDTHETA_MASK | UCOND_MASK |
            UMECH_MASK | UCHEM_MASK | UCG_MASK | UCGNEW_MASK);

  m = 0;
@ -1127,11 +1128,11 @@ struct AtomVecDPDKokkos_UnpackBorder {

 void AtomVecDPDKokkos::unpack_border_kokkos(const int &n, const int &first,
                     const DAT::tdual_xfloat_2d &buf,ExecutionSpace space) {
-  modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|
+  atomKK->modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|
                 DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK|
                 UCG_MASK|UCGNEW_MASK);
  while (first+n >= nmax) grow(0);
-  modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|
+  atomKK->modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|
                 DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK|
                 UCG_MASK|UCGNEW_MASK|DVECTOR_MASK);
  if(space==Host) {
@ -1179,7 +1180,7 @@ void AtomVecDPDKokkos::unpack_border(int n, int first, double *buf)
      m += modify->fix[atom->extra_border[iextra]]->
        unpack_border(n,first,&buf[m]);

-  modified(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|
+  atomKK->modified(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|
                DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK|
                UCG_MASK|UCGNEW_MASK|DVECTOR_MASK);
 }
@ -1217,7 +1218,7 @@ void AtomVecDPDKokkos::unpack_border_vel(int n, int first, double *buf)
      m += modify->fix[atom->extra_border[iextra]]->
        unpack_border(n,first,&buf[m]);

-  modified(Host,X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|
+  atomKK->modified(Host,X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|
                DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK|
                UCG_MASK|UCGNEW_MASK|DVECTOR_MASK);
 }
@ -1237,7 +1238,7 @@ int AtomVecDPDKokkos::unpack_comm_hybrid(int n, int first, double *buf)
    h_uChem(i) = buf[m++];
  }

-  modified(Host,DPDTHETA_MASK | UCOND_MASK |
+  atomKK->modified(Host,DPDTHETA_MASK | UCOND_MASK |
                UMECH_MASK | UCHEM_MASK );

  return m;
@ -1260,7 +1261,7 @@ int AtomVecDPDKokkos::unpack_border_hybrid(int n, int first, double *buf)
    h_uCGnew(i) = buf[m++];
  }

-  modified(Host,DPDTHETA_MASK | UCOND_MASK |
+  atomKK->modified(Host,DPDTHETA_MASK | UCOND_MASK |
                UMECH_MASK | UCHEM_MASK | UCG_MASK | UCGNEW_MASK);

  return m;
@ -1384,7 +1385,7 @@ int AtomVecDPDKokkos::pack_exchange_kokkos(const int &nsend,DAT::tdual_xfloat_2d
    int newsize = nsend*17/k_buf.view<LMPHostType>().extent(1)+1;
    k_buf.resize(newsize,k_buf.view<LMPHostType>().extent(1));
  }
-  sync(space,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
+  atomKK->sync(space,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
             MASK_MASK | IMAGE_MASK| DPDTHETA_MASK | UCOND_MASK |
             UMECH_MASK | UCHEM_MASK | UCG_MASK | UCGNEW_MASK |
             DVECTOR_MASK);
@ -1402,7 +1403,7 @@ int AtomVecDPDKokkos::pack_exchange_kokkos(const int &nsend,DAT::tdual_xfloat_2d

 int AtomVecDPDKokkos::pack_exchange(int i, double *buf)
 {
-  sync(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
+  atomKK->sync(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
            MASK_MASK | IMAGE_MASK| DPDTHETA_MASK | UCOND_MASK |
            UMECH_MASK | UCHEM_MASK | UCG_MASK | UCGNEW_MASK |
            DVECTOR_MASK);
@ -1518,7 +1519,7 @@ int AtomVecDPDKokkos::unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf,int nre
    k_count.sync<LMPHostType>();
  }

-  modified(space,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
+  atomKK->modified(space,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
                 MASK_MASK | IMAGE_MASK| DPDTHETA_MASK | UCOND_MASK |
                 UMECH_MASK | UCHEM_MASK | UCG_MASK | UCGNEW_MASK |
                 DVECTOR_MASK);
@ -1556,7 +1557,7 @@ int AtomVecDPDKokkos::unpack_exchange(double *buf)
      m += modify->fix[atom->extra_grow[iextra]]->
        unpack_exchange(nlocal,&buf[m]);

-  modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
+  atomKK->modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
           MASK_MASK | IMAGE_MASK| DPDTHETA_MASK | UCOND_MASK |
           UMECH_MASK | UCHEM_MASK | UCG_MASK | UCGNEW_MASK |
           DVECTOR_MASK);
@ -1593,7 +1594,7 @@ int AtomVecDPDKokkos::size_restart()

 int AtomVecDPDKokkos::pack_restart(int i, double *buf)
 {
-  sync(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
+  atomKK->sync(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
            MASK_MASK | IMAGE_MASK | DPDTHETA_MASK |
            UCOND_MASK | UMECH_MASK | UCHEM_MASK | DVECTOR_MASK);

@ -1658,7 +1659,7 @@ int AtomVecDPDKokkos::unpack_restart(double *buf)
    for (int i = 0; i < size; i++) extra[nlocal][i] = buf[m++];
  }

-  modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
+  atomKK->modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
                MASK_MASK | IMAGE_MASK | DPDTHETA_MASK |
                UCG_MASK | UCGNEW_MASK |
                UCOND_MASK | UMECH_MASK | UCHEM_MASK | DVECTOR_MASK);
--- a/src/KOKKOS/atom_vec_full_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_full_kokkos.cpp
@ -24,7 +24,7 @@

 using namespace LAMMPS_NS;

-#define DELTA 10000
+#define DELTA 10

 /* ---------------------------------------------------------------------- */

@ -58,14 +58,15 @@ AtomVecFullKokkos::AtomVecFullKokkos(LAMMPS *lmp) : AtomVecKokkos(lmp)

 void AtomVecFullKokkos::grow(int n)
 {
-  if (n == 0) nmax += DELTA;
+  int step = MAX(DELTA,nmax*0.01);
+  if (n == 0) nmax += step;
  else nmax = n;
  atomKK->nmax = nmax;
  if (nmax < 0 || nmax > MAXSMALLINT)
    error->one(FLERR,"Per-processor system is too big");

-  sync(Device,ALL_MASK);
-  modified(Device,ALL_MASK);
+  atomKK->sync(Device,ALL_MASK);
+  atomKK->modified(Device,ALL_MASK);

  memoryKK->grow_kokkos(atomKK->k_tag,atomKK->tag,nmax,"atom:tag");
  memoryKK->grow_kokkos(atomKK->k_type,atomKK->type,nmax,"atom:type");
@ -123,7 +124,7 @@ void AtomVecFullKokkos::grow(int n)
                      atomKK->improper_per_atom,"atom:improper_atom4");

  grow_reset();
-  sync(Host,ALL_MASK);
+  atomKK->sync(Host,ALL_MASK);

  if (atom->nextra_grow)
    for (int iextra = 0; iextra < atom->nextra_grow; iextra++)
@ -608,9 +609,9 @@ struct AtomVecFullKokkos_UnpackBorder {
 void AtomVecFullKokkos::unpack_border_kokkos(const int &n, const int &first,
                                                  const DAT::tdual_xfloat_2d &buf,
                                                  ExecutionSpace space) {
-  modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK|MOLECULE_MASK);
+  atomKK->modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK|MOLECULE_MASK);
  while (first+n >= nmax) grow(0);
-  modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK|MOLECULE_MASK);
+  atomKK->modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK|MOLECULE_MASK);
  if(space==Host) {
    struct AtomVecFullKokkos_UnpackBorder<LMPHostType>
      f(buf.view<LMPHostType>(),h_x,h_tag,h_type,h_mask,h_q,h_molecule,first);
@ -632,7 +633,7 @@ void AtomVecFullKokkos::unpack_border(int n, int first, double *buf)
  last = first + n;
  for (i = first; i < last; i++) {
    if (i == nmax) grow(0);
-    modified(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK|MOLECULE_MASK);
+    atomKK->modified(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK|MOLECULE_MASK);
    h_x(i,0) = buf[m++];
    h_x(i,1) = buf[m++];
    h_x(i,2) = buf[m++];
@ -659,7 +660,7 @@ void AtomVecFullKokkos::unpack_border_vel(int n, int first, double *buf)
  last = first + n;
  for (i = first; i < last; i++) {
    if (i == nmax) grow(0);
-    modified(Host,X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK|MOLECULE_MASK);
+    atomKK->modified(Host,X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK|MOLECULE_MASK);
    h_x(i,0) = buf[m++];
    h_x(i,1) = buf[m++];
    h_x(i,2) = buf[m++];
@ -1204,7 +1205,7 @@ int AtomVecFullKokkos::unpack_exchange(double *buf)
 {
  int nlocal = atom->nlocal;
  if (nlocal == nmax) grow(0);
-  modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
+  atomKK->modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
           MASK_MASK | IMAGE_MASK | Q_MASK | MOLECULE_MASK | BOND_MASK |
           ANGLE_MASK | DIHEDRAL_MASK | IMPROPER_MASK | SPECIAL_MASK);

@ -1297,7 +1298,7 @@ int AtomVecFullKokkos::size_restart()

 int AtomVecFullKokkos::pack_restart(int i, double *buf)
 {
-  sync(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
+  atomKK->sync(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
       MASK_MASK | IMAGE_MASK | Q_MASK | MOLECULE_MASK | BOND_MASK |
       ANGLE_MASK | DIHEDRAL_MASK | IMPROPER_MASK | SPECIAL_MASK);

@ -1370,10 +1371,10 @@ int AtomVecFullKokkos::unpack_restart(double *buf)
    if (atom->nextra_store)
      memory->grow(atom->extra,nmax,atom->nextra_store,"atom:extra");
  }
-  sync(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
+  atomKK->sync(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
       MASK_MASK | IMAGE_MASK | Q_MASK | MOLECULE_MASK | BOND_MASK |
       ANGLE_MASK | DIHEDRAL_MASK | IMPROPER_MASK | SPECIAL_MASK);
-  modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
+  atomKK->modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
           MASK_MASK | IMAGE_MASK | Q_MASK | MOLECULE_MASK | BOND_MASK |
           ANGLE_MASK | DIHEDRAL_MASK | IMPROPER_MASK | SPECIAL_MASK);

--- a/src/KOKKOS/atom_vec_hybrid_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_hybrid_kokkos.cpp
@ -307,7 +307,7 @@ int AtomVecHybridKokkos::unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf, int
 int AtomVecHybridKokkos::pack_comm(int n, int *list, double *buf,
                             int pbc_flag, int *pbc)
 {
-  sync(Host,X_MASK);
+  atomKK->sync(Host,X_MASK);

  int i,j,k,m;
  double dx,dy,dz;
@ -351,7 +351,7 @@ int AtomVecHybridKokkos::pack_comm(int n, int *list, double *buf,
 int AtomVecHybridKokkos::pack_comm_vel(int n, int *list, double *buf,
                                 int pbc_flag, int *pbc)
 {
-  sync(Host,X_MASK|V_MASK|OMEGA_MASK/*|ANGMOM_MASK*/);
+  atomKK->sync(Host,X_MASK|V_MASK|OMEGA_MASK/*|ANGMOM_MASK*/);

  int i,j,k,m;
  double dx,dy,dz,dvx,dvy,dvz;
@ -463,7 +463,7 @@ void AtomVecHybridKokkos::unpack_comm(int n, int first, double *buf)
    h_x(i,2) = buf[m++];
  }

-  modified(Host,X_MASK);
+  atomKK->modified(Host,X_MASK);

  // unpack sub-style contributions as contiguous chunks

@ -500,7 +500,7 @@ void AtomVecHybridKokkos::unpack_comm_vel(int n, int first, double *buf)
    }
  }

-  modified(Host,X_MASK|V_MASK|OMEGA_MASK/*|ANGMOM_MASK*/);
+  atomKK->modified(Host,X_MASK|V_MASK|OMEGA_MASK/*|ANGMOM_MASK*/);

  // unpack sub-style contributions as contiguous chunks

@ -512,7 +512,7 @@ void AtomVecHybridKokkos::unpack_comm_vel(int n, int first, double *buf)

 int AtomVecHybridKokkos::pack_reverse(int n, int first, double *buf)
 {
-  sync(Host,F_MASK);
+  atomKK->sync(Host,F_MASK);

  int i,k,m,last;

@ -546,7 +546,7 @@ void AtomVecHybridKokkos::unpack_reverse(int n, int *list, double *buf)
    h_f(j,2) += buf[m++];
  }

-  modified(Host,F_MASK);
+  atomKK->modified(Host,F_MASK);

  // unpack sub-style contributions as contiguous chunks

@ -559,7 +559,7 @@ void AtomVecHybridKokkos::unpack_reverse(int n, int *list, double *buf)
 int AtomVecHybridKokkos::pack_border(int n, int *list, double *buf,
                               int pbc_flag, int *pbc)
 {
-  sync(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK);
+  atomKK->sync(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK);

  int i,j,k,m;
  double dx,dy,dz;
@ -613,7 +613,7 @@ int AtomVecHybridKokkos::pack_border(int n, int *list, double *buf,
 int AtomVecHybridKokkos::pack_border_vel(int n, int *list, double *buf,
                                   int pbc_flag, int *pbc)
 {
-  sync(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|V_MASK|OMEGA_MASK/*|ANGMOM_MASK*/);
+  atomKK->sync(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|V_MASK|OMEGA_MASK/*|ANGMOM_MASK*/);
  int i,j,k,m;
  double dx,dy,dz,dvx,dvy,dvz;
  int omega_flag = atom->omega_flag;
@ -741,7 +741,7 @@ void AtomVecHybridKokkos::unpack_border(int n, int first, double *buf)
    h_mask[i] = (int) ubuf(buf[m++]).i;
  }

-  modified(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK);
+  atomKK->modified(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK);

  // unpack sub-style contributions as contiguous chunks

@ -787,7 +787,7 @@ void AtomVecHybridKokkos::unpack_border_vel(int n, int first, double *buf)
    }
  }

-  modified(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|V_MASK|OMEGA_MASK/*|ANGMOM_MASK*/);
+  atomKK->modified(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|V_MASK|OMEGA_MASK/*|ANGMOM_MASK*/);

  // unpack sub-style contributions as contiguous chunks

@ -969,7 +969,7 @@ void AtomVecHybridKokkos::create_atom(int itype, double *coord)

 void AtomVecHybridKokkos::data_atom(double *coord, imageint imagetmp, char **values)
 {
-  sync(Host,X_MASK|TAG_MASK|TYPE_MASK|IMAGE_MASK|MASK_MASK|V_MASK|OMEGA_MASK/*|ANGMOM_MASK*/);
+  atomKK->sync(Host,X_MASK|TAG_MASK|TYPE_MASK|IMAGE_MASK|MASK_MASK|V_MASK|OMEGA_MASK/*|ANGMOM_MASK*/);

  int nlocal = atom->nlocal;
  if (nlocal == nmax) grow(0);
@ -1000,7 +1000,7 @@ void AtomVecHybridKokkos::data_atom(double *coord, imageint imagetmp, char **val
    h_angmom(nlocal,2) = 0.0;
  }

-  modified(Host,X_MASK|TAG_MASK|TYPE_MASK|IMAGE_MASK|MASK_MASK|V_MASK|OMEGA_MASK/*|ANGMOM_MASK*/);
+  atomKK->modified(Host,X_MASK|TAG_MASK|TYPE_MASK|IMAGE_MASK|MASK_MASK|V_MASK|OMEGA_MASK/*|ANGMOM_MASK*/);

  // each sub-style parses sub-style specific values

@ -1017,13 +1017,13 @@ void AtomVecHybridKokkos::data_atom(double *coord, imageint imagetmp, char **val

 void AtomVecHybridKokkos::data_vel(int m, char **values)
 {
-  sync(Host,V_MASK);
+  atomKK->sync(Host,V_MASK);

  h_v(m,0) = atof(values[0]);
  h_v(m,1) = atof(values[1]);
  h_v(m,2) = atof(values[2]);

-  modified(Host,V_MASK);
+  atomKK->modified(Host,V_MASK);

  // each sub-style parses sub-style specific values

@ -1038,7 +1038,7 @@ void AtomVecHybridKokkos::data_vel(int m, char **values)

 void AtomVecHybridKokkos::pack_data(double **buf)
 {
-  sync(Host,TAG_MASK|TYPE_MASK|X_MASK);
+  atomKK->sync(Host,TAG_MASK|TYPE_MASK|X_MASK);

  int k,m;

@ -1089,7 +1089,7 @@ void AtomVecHybridKokkos::write_data(FILE *fp, int n, double **buf)

 void AtomVecHybridKokkos::pack_vel(double **buf)
 {
-  sync(Host,V_MASK);
+  atomKK->sync(Host,V_MASK);

  int k,m;

--- a/src/KOKKOS/atom_vec_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_kokkos.cpp
@ -267,6 +267,114 @@ int AtomVecKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &list, c
        return n*3;
 }

+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType,int TRICLINIC>
+struct AtomVecKokkos_PackCommSelfFused {
+  typedef DeviceType device_type;
+
+  typename ArrayTypes<DeviceType>::t_x_array_randomread _x;
+  typename ArrayTypes<DeviceType>::t_x_array _xw;
+  typename ArrayTypes<DeviceType>::t_int_2d_const _list;
+  typename ArrayTypes<DeviceType>::t_int_2d_const _pbc;
+  typename ArrayTypes<DeviceType>::t_int_1d_const _pbc_flag;
+  typename ArrayTypes<DeviceType>::t_int_1d_const _firstrecv;
+  typename ArrayTypes<DeviceType>::t_int_1d_const _sendnum_scan;
+  typename ArrayTypes<DeviceType>::t_int_1d_const _g2l;
+  X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz;
+
+  AtomVecKokkos_PackCommSelfFused(
+      const typename DAT::tdual_x_array &x,
+      const typename DAT::tdual_int_2d &list,
+      const typename DAT::tdual_int_2d &pbc,
+      const typename DAT::tdual_int_1d &pbc_flag,
+      const typename DAT::tdual_int_1d &firstrecv,
+      const typename DAT::tdual_int_1d &sendnum_scan,
+      const typename DAT::tdual_int_1d &g2l,
+      const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd,
+      const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz):
+      _x(x.view<DeviceType>()),_xw(x.view<DeviceType>()),
+      _list(list.view<DeviceType>()),
+      _pbc(pbc.view<DeviceType>()),
+      _pbc_flag(pbc_flag.view<DeviceType>()),
+      _firstrecv(firstrecv.view<DeviceType>()),
+      _sendnum_scan(sendnum_scan.view<DeviceType>()),
+      _g2l(g2l.view<DeviceType>()),
+      _xprd(xprd),_yprd(yprd),_zprd(zprd),
+      _xy(xy),_xz(xz),_yz(yz) {};
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& ii) const {
+
+    int iswap = 0;
+    while (ii >= _sendnum_scan[iswap]) iswap++;
+    int i = ii;
+    if (iswap > 0)
+      i = ii - _sendnum_scan[iswap-1];
+
+    const int _nfirst = _firstrecv[iswap];
+    const int nlocal = _firstrecv[0];
+
+    int j = _list(iswap,i);
+    if (j >= nlocal)
+      j = _g2l(j-nlocal);
+
+    if (_pbc_flag(ii) == 0) {
+        _xw(i+_nfirst,0) = _x(j,0);
+        _xw(i+_nfirst,1) = _x(j,1);
+        _xw(i+_nfirst,2) = _x(j,2);
+    } else {
+      if (TRICLINIC == 0) {
+        _xw(i+_nfirst,0) = _x(j,0) + _pbc(ii,0)*_xprd;
+        _xw(i+_nfirst,1) = _x(j,1) + _pbc(ii,1)*_yprd;
+        _xw(i+_nfirst,2) = _x(j,2) + _pbc(ii,2)*_zprd;
+      } else {
+        _xw(i+_nfirst,0) = _x(j,0) + _pbc(ii,0)*_xprd + _pbc(ii,5)*_xy + _pbc(ii,4)*_xz;
+        _xw(i+_nfirst,1) = _x(j,1) + _pbc(ii,1)*_yprd + _pbc(ii,3)*_yz;
+        _xw(i+_nfirst,2) = _x(j,2) + _pbc(ii,2)*_zprd;
+      }
+    }
+  }
+};
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecKokkos::pack_comm_self_fused(const int &n, const DAT::tdual_int_2d &list, const DAT::tdual_int_1d &sendnum_scan,
+                                         const DAT::tdual_int_1d &firstrecv, const DAT::tdual_int_1d &pbc_flag, const DAT::tdual_int_2d &pbc,
+                                         const DAT::tdual_int_1d &g2l) {
+  if(commKK->forward_comm_on_host) {
+    sync(Host,X_MASK);
+    modified(Host,X_MASK);
+    if(domain->triclinic) {
+    struct AtomVecKokkos_PackCommSelfFused<LMPHostType,1> f(atomKK->k_x,list,pbc,pbc_flag,firstrecv,sendnum_scan,g2l,
+        domain->xprd,domain->yprd,domain->zprd,
+        domain->xy,domain->xz,domain->yz);
+    Kokkos::parallel_for(n,f);
+    } else {
+    struct AtomVecKokkos_PackCommSelfFused<LMPHostType,0> f(atomKK->k_x,list,pbc,pbc_flag,firstrecv,sendnum_scan,g2l,
+        domain->xprd,domain->yprd,domain->zprd,
+        domain->xy,domain->xz,domain->yz);
+    Kokkos::parallel_for(n,f);
+    }
+  } else {
+    sync(Device,X_MASK);
+    modified(Device,X_MASK);
+    if(domain->triclinic) {
+    struct AtomVecKokkos_PackCommSelfFused<LMPDeviceType,1> f(atomKK->k_x,list,pbc,pbc_flag,firstrecv,sendnum_scan,g2l,
+        domain->xprd,domain->yprd,domain->zprd,
+        domain->xy,domain->xz,domain->yz);
+    Kokkos::parallel_for(n,f);
+    } else {
+    struct AtomVecKokkos_PackCommSelfFused<LMPDeviceType,0> f(atomKK->k_x,list,pbc,pbc_flag,firstrecv,sendnum_scan,g2l,
+        domain->xprd,domain->yprd,domain->zprd,
+        domain->xy,domain->xz,domain->yz);
+    Kokkos::parallel_for(n,f);
+    }
+  }
+  return n*3;
+}
+
 /* ---------------------------------------------------------------------- */

 template<class DeviceType>
--- a/src/KOKKOS/atom_vec_kokkos.h
+++ b/src/KOKKOS/atom_vec_kokkos.h
@ -51,6 +51,14 @@ class AtomVecKokkos : public AtomVec {
                   const int & iswap, const int nfirst,
                   const int &pbc_flag, const int pbc[]);

+  virtual int
+    pack_comm_self_fused(const int &n, const DAT::tdual_int_2d &list,
+                         const DAT::tdual_int_1d &sendnum_scan,
+                         const DAT::tdual_int_1d &firstrecv,
+                         const DAT::tdual_int_1d &pbc_flag,
+                         const DAT::tdual_int_2d &pbc,
+                         const DAT::tdual_int_1d &g2l);
+
  virtual int
    pack_comm_kokkos(const int &n, const DAT::tdual_int_2d &list,
                     const int & iswap, const DAT::tdual_xfloat_2d &buf,
--- a/src/KOKKOS/atom_vec_molecular_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_molecular_kokkos.cpp
@ -24,7 +24,7 @@

 using namespace LAMMPS_NS;

-#define DELTA 10000
+#define DELTA 10

 /* ---------------------------------------------------------------------- */

@ -58,14 +58,15 @@ AtomVecMolecularKokkos::AtomVecMolecularKokkos(LAMMPS *lmp) : AtomVecKokkos(lmp)

 void AtomVecMolecularKokkos::grow(int n)
 {
-  if (n == 0) nmax += DELTA;
+  int step = MAX(DELTA,nmax*0.01);
+  if (n == 0) nmax += step;
  else nmax = n;
  atomKK->nmax = nmax;
  if (nmax < 0 || nmax > MAXSMALLINT)
    error->one(FLERR,"Per-processor system is too big");

-  sync(Device,ALL_MASK);
-  modified(Device,ALL_MASK);
+  atomKK->sync(Device,ALL_MASK);
+  atomKK->modified(Device,ALL_MASK);

  memoryKK->grow_kokkos(atomKK->k_tag,atomKK->tag,nmax,"atom:tag");
  memoryKK->grow_kokkos(atomKK->k_type,atomKK->type,nmax,"atom:type");
@ -121,7 +122,7 @@ void AtomVecMolecularKokkos::grow(int n)
                      atomKK->improper_per_atom,"atom:improper_atom4");

  grow_reset();
-  sync(Host,ALL_MASK);
+  atomKK->sync(Host,ALL_MASK);

  if (atom->nextra_grow)
    for (int iextra = 0; iextra < atom->nextra_grow; iextra++)
@ -361,7 +362,7 @@ int AtomVecMolecularKokkos::pack_comm_kokkos(const int &n,
  // Choose correct forward PackComm kernel

  if(commKK->forward_comm_on_host) {
-    sync(Host,X_MASK);
+    atomKK->sync(Host,X_MASK);
    if(pbc_flag) {
      if(domain->triclinic) {
        struct AtomVecMolecularKokkos_PackComm<LMPHostType,1,1>
@ -388,7 +389,7 @@ int AtomVecMolecularKokkos::pack_comm_kokkos(const int &n,
      }
    }
  } else {
-    sync(Device,X_MASK);
+    atomKK->sync(Device,X_MASK);
    if(pbc_flag) {
      if(domain->triclinic) {
        struct AtomVecMolecularKokkos_PackComm<LMPDeviceType,1,1>
@ -477,8 +478,8 @@ int AtomVecMolecularKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d
                                           const int nfirst, const int &pbc_flag,
                                           const int* const pbc) {
  if(commKK->forward_comm_on_host) {
-    sync(Host,X_MASK);
-    modified(Host,X_MASK);
+    atomKK->sync(Host,X_MASK);
+    atomKK->modified(Host,X_MASK);
    if(pbc_flag) {
      if(domain->triclinic) {
      struct AtomVecMolecularKokkos_PackCommSelf<LMPHostType,1,1>
@ -505,8 +506,8 @@ int AtomVecMolecularKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d
      }
    }
  } else {
-    sync(Device,X_MASK);
-    modified(Device,X_MASK);
+    atomKK->sync(Device,X_MASK);
+    atomKK->modified(Device,X_MASK);
    if(pbc_flag) {
      if(domain->triclinic) {
      struct AtomVecMolecularKokkos_PackCommSelf<LMPDeviceType,1,1>
@ -565,13 +566,13 @@ struct AtomVecMolecularKokkos_UnpackComm {
 void AtomVecMolecularKokkos::unpack_comm_kokkos(const int &n, const int &first,
    const DAT::tdual_xfloat_2d &buf ) {
  if(commKK->forward_comm_on_host) {
-    sync(Host,X_MASK);
-    modified(Host,X_MASK);
+    atomKK->sync(Host,X_MASK);
+    atomKK->modified(Host,X_MASK);
    struct AtomVecMolecularKokkos_UnpackComm<LMPHostType> f(atomKK->k_x,buf,first);
    Kokkos::parallel_for(n,f);
  } else {
-    sync(Device,X_MASK);
-    modified(Device,X_MASK);
+    atomKK->sync(Device,X_MASK);
+    atomKK->modified(Device,X_MASK);
    struct AtomVecMolecularKokkos_UnpackComm<LMPDeviceType> f(atomKK->k_x,buf,first);
    Kokkos::parallel_for(n,f);
  }
@ -714,7 +715,7 @@ void AtomVecMolecularKokkos::unpack_comm_vel(int n, int first, double *buf)
 int AtomVecMolecularKokkos::pack_reverse(int n, int first, double *buf)
 {
  if(n > 0)
-    sync(Host,F_MASK);
+    atomKK->sync(Host,F_MASK);

  int m = 0;
  const int last = first + n;
@ -731,7 +732,7 @@ int AtomVecMolecularKokkos::pack_reverse(int n, int first, double *buf)
 void AtomVecMolecularKokkos::unpack_reverse(int n, int *list, double *buf)
 {
  if(n > 0)
-    modified(Host,F_MASK);
+    atomKK->modified(Host,F_MASK);

  int m = 0;
  for (int i = 0; i < n; i++) {
@ -1032,9 +1033,9 @@ struct AtomVecMolecularKokkos_UnpackBorder {
 void AtomVecMolecularKokkos::unpack_border_kokkos(const int &n, const int &first,
                                                  const DAT::tdual_xfloat_2d &buf,
                                                  ExecutionSpace space) {
-  modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK);
+  atomKK->modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK);
  while (first+n >= nmax) grow(0);
-  modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK);
+  atomKK->modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK);
  if(space==Host) {
    struct AtomVecMolecularKokkos_UnpackBorder<LMPHostType>
      f(buf.view<LMPHostType>(),h_x,h_tag,h_type,h_mask,h_molecule,first);
@ -1056,7 +1057,7 @@ void AtomVecMolecularKokkos::unpack_border(int n, int first, double *buf)
  last = first + n;
  for (i = first; i < last; i++) {
    if (i == nmax) grow(0);
-    modified(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK);
+    atomKK->modified(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK);
    h_x(i,0) = buf[m++];
    h_x(i,1) = buf[m++];
    h_x(i,2) = buf[m++];
@ -1082,7 +1083,7 @@ void AtomVecMolecularKokkos::unpack_border_vel(int n, int first, double *buf)
  last = first + n;
  for (i = first; i < last; i++) {
    if (i == nmax) grow(0);
-    modified(Host,X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK);
+    atomKK->modified(Host,X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK);
    h_x(i,0) = buf[m++];
    h_x(i,1) = buf[m++];
    h_x(i,2) = buf[m++];
@ -1615,7 +1616,7 @@ int AtomVecMolecularKokkos::unpack_exchange(double *buf)
 {
  int nlocal = atom->nlocal;
  if (nlocal == nmax) grow(0);
-  modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
+  atomKK->modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
           MASK_MASK | IMAGE_MASK | MOLECULE_MASK | BOND_MASK |
           ANGLE_MASK | DIHEDRAL_MASK | IMPROPER_MASK | SPECIAL_MASK);

@ -1707,7 +1708,7 @@ int AtomVecMolecularKokkos::size_restart()

 int AtomVecMolecularKokkos::pack_restart(int i, double *buf)
 {
-  sync(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
+  atomKK->sync(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
       MASK_MASK | IMAGE_MASK | MOLECULE_MASK | BOND_MASK |
       ANGLE_MASK | DIHEDRAL_MASK | IMPROPER_MASK | SPECIAL_MASK);

@ -1780,7 +1781,7 @@ int AtomVecMolecularKokkos::unpack_restart(double *buf)
      memory->grow(atom->extra,nmax,atom->nextra_store,"atom:extra");
  }

-  modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
+  atomKK->modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
           MASK_MASK | IMAGE_MASK | MOLECULE_MASK | BOND_MASK |
           ANGLE_MASK | DIHEDRAL_MASK | IMPROPER_MASK | SPECIAL_MASK);

--- a/src/KOKKOS/atom_vec_sphere_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_sphere_kokkos.cpp
@ -30,7 +30,7 @@

 using namespace LAMMPS_NS;

-#define DELTA 10000
+#define DELTA 10

 static const double MY_PI  = 3.14159265358979323846; // pi

@ -93,14 +93,15 @@ void AtomVecSphereKokkos::init()

 void AtomVecSphereKokkos::grow(int n)
 {
-  if (n == 0) nmax += DELTA;
+  int step = MAX(DELTA,nmax*0.01);
+  if (n == 0) nmax += step;
  else nmax = n;
  atom->nmax = nmax;
  if (nmax < 0 || nmax > MAXSMALLINT)
    error->one(FLERR,"Per-processor system is too big");

-  sync(Device,ALL_MASK);
-  modified(Device,ALL_MASK);
+  atomKK->sync(Device,ALL_MASK);
+  atomKK->modified(Device,ALL_MASK);

  memoryKK->grow_kokkos(atomKK->k_tag,atomKK->tag,nmax,"atom:tag");
  memoryKK->grow_kokkos(atomKK->k_type,atomKK->type,nmax,"atom:type");
@ -120,7 +121,7 @@ void AtomVecSphereKokkos::grow(int n)
      modify->fix[atom->extra_grow[iextra]]->grow_arrays(nmax);

  grow_reset();
-  sync(Host,ALL_MASK);
+  atomKK->sync(Host,ALL_MASK);
 }

 /* ----------------------------------------------------------------------
@ -172,7 +173,7 @@ void AtomVecSphereKokkos::grow_reset()

 void AtomVecSphereKokkos::copy(int i, int j, int delflag)
 {
-  sync(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
+  atomKK->sync(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
            MASK_MASK | IMAGE_MASK | RADIUS_MASK |
            RMASS_MASK | OMEGA_MASK);

@ -197,7 +198,7 @@ void AtomVecSphereKokkos::copy(int i, int j, int delflag)
    for (int iextra = 0; iextra < atom->nextra_grow; iextra++)
      modify->fix[atom->extra_grow[iextra]]->copy_arrays(i,j,delflag);

-  modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
+  atomKK->modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
                MASK_MASK | IMAGE_MASK | RADIUS_MASK |
                RMASS_MASK | OMEGA_MASK);
 }
@ -277,7 +278,7 @@ int AtomVecSphereKokkos::pack_comm_kokkos(
  // Check whether to always run forward communication on the host
  // Choose correct forward PackComm kernel
  if(commKK->forward_comm_on_host) {
-    sync(Host,X_MASK|RADIUS_MASK|RMASS_MASK);
+    atomKK->sync(Host,X_MASK|RADIUS_MASK|RMASS_MASK);
    if(pbc_flag) {
      if(domain->triclinic) {
        struct AtomVecSphereKokkos_PackComm<LMPHostType,1,1> f(
@ -316,7 +317,7 @@ int AtomVecSphereKokkos::pack_comm_kokkos(
      }
    }
  } else {
-    sync(Device,X_MASK|RADIUS_MASK|RMASS_MASK);
+    atomKK->sync(Device,X_MASK|RADIUS_MASK|RMASS_MASK);
    if(pbc_flag) {
      if(domain->triclinic) {
        struct AtomVecSphereKokkos_PackComm<LMPDeviceType,1,1> f(
@ -464,7 +465,7 @@ int AtomVecSphereKokkos::pack_comm_vel_kokkos(
  const int* const pbc)
 {
  if(commKK->forward_comm_on_host) {
-    sync(Host,X_MASK|RADIUS_MASK|RMASS_MASK|V_MASK|OMEGA_MASK);
+    atomKK->sync(Host,X_MASK|RADIUS_MASK|RMASS_MASK|V_MASK|OMEGA_MASK);
    if(pbc_flag) {
      if(deform_vremap) {
        if(domain->triclinic) {
@ -595,7 +596,7 @@ int AtomVecSphereKokkos::pack_comm_vel_kokkos(
      }
    }
  } else {
-    sync(Device,X_MASK|RADIUS_MASK|RMASS_MASK|V_MASK|OMEGA_MASK);
+    atomKK->sync(Device,X_MASK|RADIUS_MASK|RMASS_MASK|V_MASK|OMEGA_MASK);
    if(pbc_flag) {
      if(deform_vremap) {
        if(domain->triclinic) {
@ -795,8 +796,8 @@ int AtomVecSphereKokkos::pack_comm_self(
  if (radvary == 0)
    return AtomVecKokkos::pack_comm_self(n,list,iswap,nfirst,pbc_flag,pbc);
  if(commKK->forward_comm_on_host) {
-    sync(Host,X_MASK|RADIUS_MASK|RMASS_MASK);
-    modified(Host,X_MASK|RADIUS_MASK|RMASS_MASK);
+    atomKK->sync(Host,X_MASK|RADIUS_MASK|RMASS_MASK);
+    atomKK->modified(Host,X_MASK|RADIUS_MASK|RMASS_MASK);
    if(pbc_flag) {
      if(domain->triclinic) {
 	struct AtomVecSphereKokkos_PackCommSelf<LMPHostType,1,1> f(
@ -835,8 +836,8 @@ int AtomVecSphereKokkos::pack_comm_self(
      }
    }
  } else {
-    sync(Device,X_MASK|RADIUS_MASK|RMASS_MASK);
-    modified(Device,X_MASK|RADIUS_MASK|RMASS_MASK);
+    atomKK->sync(Device,X_MASK|RADIUS_MASK|RMASS_MASK);
+    atomKK->modified(Device,X_MASK|RADIUS_MASK|RMASS_MASK);
    if(pbc_flag) {
      if(domain->triclinic) {
 	struct AtomVecSphereKokkos_PackCommSelf<LMPDeviceType,1,1> f(
@ -926,14 +927,14 @@ void AtomVecSphereKokkos::unpack_comm_kokkos(
    return;
  }
  if(commKK->forward_comm_on_host) {
-    modified(Host,X_MASK|RADIUS_MASK|RMASS_MASK);
+    atomKK->modified(Host,X_MASK|RADIUS_MASK|RMASS_MASK);
    struct AtomVecSphereKokkos_UnpackComm<LMPHostType> f(
      atomKK->k_x,
      atomKK->k_radius,atomKK->k_rmass,
      buf,first);
    Kokkos::parallel_for(n,f);
  } else {
-    modified(Device,X_MASK|RADIUS_MASK|RMASS_MASK);
+    atomKK->modified(Device,X_MASK|RADIUS_MASK|RMASS_MASK);
    struct AtomVecSphereKokkos_UnpackComm<LMPDeviceType> f(
      atomKK->k_x,
      atomKK->k_radius,atomKK->k_rmass,
@ -998,7 +999,7 @@ void AtomVecSphereKokkos::unpack_comm_vel_kokkos(
  const int &n, const int &first,
  const DAT::tdual_xfloat_2d &buf ) {
  if(commKK->forward_comm_on_host) {
-    modified(Host,X_MASK|RADIUS_MASK|RMASS_MASK|V_MASK|OMEGA_MASK);
+    atomKK->modified(Host,X_MASK|RADIUS_MASK|RMASS_MASK|V_MASK|OMEGA_MASK);
    if (radvary == 0) {
      struct AtomVecSphereKokkos_UnpackCommVel<LMPHostType,0> f(
        atomKK->k_x,
@ -1015,7 +1016,7 @@ void AtomVecSphereKokkos::unpack_comm_vel_kokkos(
      Kokkos::parallel_for(n,f);
    }
  } else {
-    modified(Device,X_MASK|RADIUS_MASK|RMASS_MASK|V_MASK|OMEGA_MASK);
+    atomKK->modified(Device,X_MASK|RADIUS_MASK|RMASS_MASK|V_MASK|OMEGA_MASK);
    if (radvary == 0) {
      struct AtomVecSphereKokkos_UnpackCommVel<LMPDeviceType,0> f(
        atomKK->k_x,
@ -1044,7 +1045,7 @@ int AtomVecSphereKokkos::pack_comm(int n, int *list, double *buf,

  if (radvary == 0) {
    // Not sure if we need to call sync for X here
-    sync(Host,X_MASK);
+    atomKK->sync(Host,X_MASK);
    m = 0;
    if (pbc_flag == 0) {
      for (i = 0; i < n; i++) {
@ -1071,7 +1072,7 @@ int AtomVecSphereKokkos::pack_comm(int n, int *list, double *buf,
      }
    }
  } else {
-    sync(Host,X_MASK|RADIUS_MASK|RMASS_MASK);
+    atomKK->sync(Host,X_MASK|RADIUS_MASK|RMASS_MASK);
    m = 0;
    if (pbc_flag == 0) {
      for (i = 0; i < n; i++) {
@ -1115,7 +1116,7 @@ int AtomVecSphereKokkos::pack_comm_vel(int n, int *list, double *buf,
  double dx,dy,dz,dvx,dvy,dvz;

  if (radvary == 0) {
-    sync(Host,X_MASK|V_MASK|OMEGA_MASK);
+    atomKK->sync(Host,X_MASK|V_MASK|OMEGA_MASK);
    m = 0;
    if (pbc_flag == 0) {
      for (i = 0; i < n; i++) {
@ -1178,7 +1179,7 @@ int AtomVecSphereKokkos::pack_comm_vel(int n, int *list, double *buf,
      }
    }
  } else {
-    sync(Host,X_MASK|RADIUS_MASK|RMASS_MASK|V_MASK|OMEGA_MASK);
+    atomKK->sync(Host,X_MASK|RADIUS_MASK|RMASS_MASK|V_MASK|OMEGA_MASK);
    m = 0;
    if (pbc_flag == 0) {
      for (i = 0; i < n; i++) {
@ -1257,7 +1258,7 @@ int AtomVecSphereKokkos::pack_comm_hybrid(int n, int *list, double *buf)
 {
  if (radvary == 0) return 0;

-  sync(Host,RADIUS_MASK|RMASS_MASK);
+  atomKK->sync(Host,RADIUS_MASK|RMASS_MASK);

  int m = 0;
  for (int i = 0; i < n; i++) {
@ -1280,7 +1281,7 @@ void AtomVecSphereKokkos::unpack_comm(int n, int first, double *buf)
      h_x(i,1) = buf[m++];
      h_x(i,2) = buf[m++];
    }
-    modified(Host,X_MASK);
+    atomKK->modified(Host,X_MASK);
  } else {
    int m = 0;
    const int last = first + n;
@ -1291,7 +1292,7 @@ void AtomVecSphereKokkos::unpack_comm(int n, int first, double *buf)
      h_radius[i] = buf[m++];
      h_rmass[i] = buf[m++];
    }
-    modified(Host,X_MASK|RADIUS_MASK|RMASS_MASK);
+    atomKK->modified(Host,X_MASK|RADIUS_MASK|RMASS_MASK);
  }
 }

@ -1313,7 +1314,7 @@ void AtomVecSphereKokkos::unpack_comm_vel(int n, int first, double *buf)
      h_omega(i,1) = buf[m++];
      h_omega(i,2) = buf[m++];
    }
-    modified(Host,X_MASK|V_MASK|OMEGA_MASK);
+    atomKK->modified(Host,X_MASK|V_MASK|OMEGA_MASK);
  } else {
    int m = 0;
    const int last = first + n;
@ -1330,7 +1331,7 @@ void AtomVecSphereKokkos::unpack_comm_vel(int n, int first, double *buf)
      h_omega(i,1) = buf[m++];
      h_omega(i,2) = buf[m++];
    }
-    modified(Host,X_MASK|RADIUS_MASK|RMASS_MASK|V_MASK|OMEGA_MASK);
+    atomKK->modified(Host,X_MASK|RADIUS_MASK|RMASS_MASK|V_MASK|OMEGA_MASK);
  }
 }

@ -1346,7 +1347,7 @@ int AtomVecSphereKokkos::unpack_comm_hybrid(int n, int first, double *buf)
    h_radius[i] = buf[m++];
    h_rmass[i] = buf[m++];
  }
-  modified(Host,RADIUS_MASK|RMASS_MASK);
+  atomKK->modified(Host,RADIUS_MASK|RMASS_MASK);
  return m;
 }

@ -1355,7 +1356,7 @@ int AtomVecSphereKokkos::unpack_comm_hybrid(int n, int first, double *buf)
 int AtomVecSphereKokkos::pack_reverse(int n, int first, double *buf)
 {
  if(n > 0)
-    sync(Host,F_MASK|TORQUE_MASK);
+    atomKK->sync(Host,F_MASK|TORQUE_MASK);

  int m = 0;
  const int last = first + n;
@ -1375,7 +1376,7 @@ int AtomVecSphereKokkos::pack_reverse(int n, int first, double *buf)
 int AtomVecSphereKokkos::pack_reverse_hybrid(int n, int first, double *buf)
 {
  if(n > 0)
-    sync(Host,TORQUE_MASK);
+    atomKK->sync(Host,TORQUE_MASK);

  int m = 0;
  const int last = first + n;
@ -1392,7 +1393,7 @@ int AtomVecSphereKokkos::pack_reverse_hybrid(int n, int first, double *buf)
 void AtomVecSphereKokkos::unpack_reverse(int n, int *list, double *buf)
 {
  if(n > 0) {
-    modified(Host,F_MASK|TORQUE_MASK);
+    atomKK->modified(Host,F_MASK|TORQUE_MASK);
  }

  int m = 0;
@ -1412,7 +1413,7 @@ void AtomVecSphereKokkos::unpack_reverse(int n, int *list, double *buf)
 int AtomVecSphereKokkos::unpack_reverse_hybrid(int n, int *list, double *buf)
 {
  if(n > 0) {
-    modified(Host,TORQUE_MASK);
+    atomKK->modified(Host,TORQUE_MASK);
  }

  int m = 0;
@ -1492,7 +1493,7 @@ int AtomVecSphereKokkos::pack_border_kokkos(
  X_FLOAT dx,dy,dz;

  // This was in atom_vec_dpd_kokkos but doesn't appear in any other atom_vec
-  sync(space,ALL_MASK);
+  atomKK->sync(space,ALL_MASK);

  if (pbc_flag != 0) {
    if (domain->triclinic == 0) {
@ -1549,7 +1550,7 @@ int AtomVecSphereKokkos::pack_border(
  int i,j,m;
  double dx,dy,dz;

-  sync(Host,ALL_MASK);
+  atomKK->sync(Host,ALL_MASK);

  m = 0;
  if (pbc_flag == 0) {
@ -1686,7 +1687,7 @@ int AtomVecSphereKokkos::pack_border_vel_kokkos(
  X_FLOAT dvx=0,dvy=0,dvz=0;

  // This was in atom_vec_dpd_kokkos but doesn't appear in any other atom_vec
-  sync(space,ALL_MASK);
+  atomKK->sync(space,ALL_MASK);

  if (pbc_flag != 0) {
    if (domain->triclinic == 0) {
@ -1776,7 +1777,7 @@ int AtomVecSphereKokkos::pack_border_vel(int n, int *list, double *buf,
  int i,j,m;
  double dx,dy,dz,dvx,dvy,dvz;

-  sync(Host,ALL_MASK);
+  atomKK->sync(Host,ALL_MASK);

  m = 0;
  if (pbc_flag == 0) {
@ -1866,7 +1867,7 @@ int AtomVecSphereKokkos::pack_border_vel(int n, int *list, double *buf,

 int AtomVecSphereKokkos::pack_border_hybrid(int n, int *list, double *buf)
 {
-  sync(Host,RADIUS_MASK|RMASS_MASK);
+  atomKK->sync(Host,RADIUS_MASK|RMASS_MASK);

  int m = 0;
  for (int i = 0; i < n; i++) {
@ -1942,7 +1943,7 @@ void AtomVecSphereKokkos::unpack_border_kokkos(const int &n, const int &first,
    Kokkos::parallel_for(n,f);
  }

-  modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|
+  atomKK->modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|
 	         RADIUS_MASK|RMASS_MASK);
 }

@ -1969,7 +1970,7 @@ void AtomVecSphereKokkos::unpack_border(int n, int first, double *buf)
      m += modify->fix[atom->extra_border[iextra]]->
        unpack_border(n,first,&buf[m]);

-  modified(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|RADIUS_MASK|RMASS_MASK);
+  atomKK->modified(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|RADIUS_MASK|RMASS_MASK);
 }


@ -2052,7 +2053,7 @@ void AtomVecSphereKokkos::unpack_border_vel_kokkos(
    Kokkos::parallel_for(n,f);
  }

-  modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|
+  atomKK->modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|
 	         RADIUS_MASK|RMASS_MASK|V_MASK|OMEGA_MASK);
 }

@ -2085,7 +2086,7 @@ void AtomVecSphereKokkos::unpack_border_vel(int n, int first, double *buf)
      m += modify->fix[atom->extra_border[iextra]]->
        unpack_border(n,first,&buf[m]);

-  modified(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|RADIUS_MASK|RMASS_MASK|V_MASK|OMEGA_MASK);
+  atomKK->modified(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|RADIUS_MASK|RMASS_MASK|V_MASK|OMEGA_MASK);
 }

 /* ---------------------------------------------------------------------- */
@ -2098,7 +2099,7 @@ int AtomVecSphereKokkos::unpack_border_hybrid(int n, int first, double *buf)
    h_radius[i] = buf[m++];
    h_rmass[i] = buf[m++];
  }
-  modified(Host,RADIUS_MASK|RMASS_MASK);
+  atomKK->modified(Host,RADIUS_MASK|RMASS_MASK);
  return m;
 }

@ -2218,7 +2219,7 @@ int AtomVecSphereKokkos::pack_exchange_kokkos(
    int newsize = nsend*17/k_buf.view<LMPHostType>().extent(1)+1;
    k_buf.resize(newsize,k_buf.view<LMPHostType>().extent(1));
  }
-  sync(space,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
+  atomKK->sync(space,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
             MASK_MASK | IMAGE_MASK| RADIUS_MASK | RMASS_MASK |
             OMEGA_MASK);

@ -2239,7 +2240,7 @@ int AtomVecSphereKokkos::pack_exchange_kokkos(

 int AtomVecSphereKokkos::pack_exchange(int i, double *buf)
 {
-  sync(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
+  atomKK->sync(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
            MASK_MASK | IMAGE_MASK| RADIUS_MASK | RMASS_MASK |
            OMEGA_MASK);

@ -2354,7 +2355,7 @@ int AtomVecSphereKokkos::unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf,int
    k_count.sync<LMPHostType>();
  }

-  modified(space,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
+  atomKK->modified(space,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
                 MASK_MASK | IMAGE_MASK| RADIUS_MASK | RMASS_MASK |
                 OMEGA_MASK);

@ -2391,7 +2392,7 @@ int AtomVecSphereKokkos::unpack_exchange(double *buf)
      m += modify->fix[atom->extra_grow[iextra]]->
        unpack_exchange(nlocal,&buf[m]);

-  modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
+  atomKK->modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
           MASK_MASK | IMAGE_MASK | RADIUS_MASK | RMASS_MASK |
           OMEGA_MASK);

@ -2427,7 +2428,7 @@ int AtomVecSphereKokkos::size_restart()

 int AtomVecSphereKokkos::pack_restart(int i, double *buf)
 {
-  sync(Host,X_MASK | TAG_MASK | TYPE_MASK |
+  atomKK->sync(Host,X_MASK | TAG_MASK | TYPE_MASK |
            MASK_MASK | IMAGE_MASK | V_MASK |
            RADIUS_MASK | RMASS_MASK | OMEGA_MASK);

@ -2494,7 +2495,7 @@ int AtomVecSphereKokkos::unpack_restart(double *buf)
    for (int i = 0; i < size; i++) extra[nlocal][i] = buf[m++];
  }

-  modified(Host,X_MASK | TAG_MASK | TYPE_MASK |
+  atomKK->modified(Host,X_MASK | TAG_MASK | TYPE_MASK |
                MASK_MASK | IMAGE_MASK | V_MASK |
 	        RADIUS_MASK | RMASS_MASK | OMEGA_MASK);

@ -2616,14 +2617,14 @@ int AtomVecSphereKokkos::data_atom_hybrid(int nlocal, char **values)

 void AtomVecSphereKokkos::data_vel(int m, char **values)
 {
-  sync(Host,V_MASK|OMEGA_MASK);
+  atomKK->sync(Host,V_MASK|OMEGA_MASK);
  h_v(m,0) = atof(values[0]);
  h_v(m,1) = atof(values[1]);
  h_v(m,2) = atof(values[2]);
  h_omega(m,0) = atof(values[3]);
  h_omega(m,1) = atof(values[4]);
  h_omega(m,2) = atof(values[5]);
-  modified(Host,V_MASK|OMEGA_MASK);
+  atomKK->modified(Host,V_MASK|OMEGA_MASK);
 }

 /* ----------------------------------------------------------------------
@ -2632,11 +2633,11 @@ void AtomVecSphereKokkos::data_vel(int m, char **values)

 int AtomVecSphereKokkos::data_vel_hybrid(int m, char **values)
 {
-  sync(Host,OMEGA_MASK);
+  atomKK->sync(Host,OMEGA_MASK);
  omega[m][0] = atof(values[0]);
  omega[m][1] = atof(values[1]);
  omega[m][2] = atof(values[2]);
-  modified(Host,OMEGA_MASK);
+  atomKK->modified(Host,OMEGA_MASK);
  return 3;
 }

@ -2711,7 +2712,7 @@ int AtomVecSphereKokkos::write_data_hybrid(FILE *fp, double *buf)

 void AtomVecSphereKokkos::pack_vel(double **buf)
 {
-  sync(Host,TAG_MASK|V_MASK|OMEGA_MASK);
+  atomKK->sync(Host,TAG_MASK|V_MASK|OMEGA_MASK);

  int nlocal = atom->nlocal;
  for (int i = 0; i < nlocal; i++) {
@ -2731,7 +2732,7 @@ void AtomVecSphereKokkos::pack_vel(double **buf)

 int AtomVecSphereKokkos::pack_vel_hybrid(int i, double *buf)
 {
-  sync(Host,OMEGA_MASK);
+  atomKK->sync(Host,OMEGA_MASK);

  buf[0] = h_omega(i,0);
  buf[1] = h_omega(i,1);
--- a/src/KOKKOS/comm_kokkos.cpp
+++ b/src/KOKKOS/comm_kokkos.cpp
@ -57,10 +57,9 @@ CommKokkos::CommKokkos(LAMMPS *lmp) : CommBrick(lmp)
  memory->destroy(buf_recv);
  buf_recv = NULL;

-  k_exchange_sendlist = DAT::
-    tdual_int_1d("comm:k_exchange_sendlist",100);
-  k_exchange_copylist = DAT::
-    tdual_int_1d("comm:k_exchange_copylist",100);
+  k_exchange_lists = DAT::tdual_int_2d("comm:k_exchange_lists",2,100);
+  k_exchange_sendlist = Kokkos::subview(k_exchange_lists,0,Kokkos::ALL);
+  k_exchange_copylist = Kokkos::subview(k_exchange_lists,1,Kokkos::ALL);
  k_count = DAT::tdual_int_scalar("comm:k_count");
  k_sendflag = DAT::tdual_int_1d("comm:k_sendflag",100);

@ -187,71 +186,80 @@ void CommKokkos::forward_comm_device(int dummy)
  k_sendlist.sync<DeviceType>();
  atomKK->sync(ExecutionSpaceFromDevice<DeviceType>::space,X_MASK);

-  for (int iswap = 0; iswap < nswap; iswap++) {
-    if (sendproc[iswap] != me) {
-      if (comm_x_only) {
-        if (size_forward_recv[iswap]) {
-          buf = atomKK->k_x.view<DeviceType>().data() +
-            firstrecv[iswap]*atomKK->k_x.view<DeviceType>().extent(1);
-          MPI_Irecv(buf,size_forward_recv[iswap],MPI_DOUBLE,
-                    recvproc[iswap],0,world,&request);
-        }
-        n = avec->pack_comm_kokkos(sendnum[iswap],k_sendlist,
-                                   iswap,k_buf_send,pbc_flag[iswap],pbc[iswap]);
-        DeviceType::fence();
-        if (n) {
-          MPI_Send(k_buf_send.view<DeviceType>().data(),
-                   n,MPI_DOUBLE,sendproc[iswap],0,world);
-        }
+  if (comm->nprocs == 1 && !ghost_velocity) {
+    k_swap.sync<DeviceType>();
+    k_swap2.sync<DeviceType>();
+    k_pbc.sync<DeviceType>();
+    n = avec->pack_comm_self_fused(totalsend,k_sendlist,k_sendnum_scan,
+                    k_firstrecv,k_pbc_flag,k_pbc,k_g2l);
+  } else {

-        if (size_forward_recv[iswap]) {
-          MPI_Wait(&request,MPI_STATUS_IGNORE);
-          atomKK->modified(ExecutionSpaceFromDevice<DeviceType>::
-                           space,X_MASK);
+    for (int iswap = 0; iswap < nswap; iswap++) {
+      if (sendproc[iswap] != me) {
+        if (comm_x_only) {
+          if (size_forward_recv[iswap]) {
+            buf = atomKK->k_x.view<DeviceType>().data() +
+              firstrecv[iswap]*atomKK->k_x.view<DeviceType>().extent(1);
+            MPI_Irecv(buf,size_forward_recv[iswap],MPI_DOUBLE,
+                      recvproc[iswap],0,world,&request);
+          }
+          n = avec->pack_comm_kokkos(sendnum[iswap],k_sendlist,
+                                     iswap,k_buf_send,pbc_flag[iswap],pbc[iswap]);
+          DeviceType::fence();
+          if (n) {
+            MPI_Send(k_buf_send.view<DeviceType>().data(),
+                     n,MPI_DOUBLE,sendproc[iswap],0,world);
+          }
+    
+          if (size_forward_recv[iswap]) {
+            MPI_Wait(&request,MPI_STATUS_IGNORE);
+            atomKK->modified(ExecutionSpaceFromDevice<DeviceType>::
+                             space,X_MASK);
+          }
+        } else if (ghost_velocity) {
+          if (size_forward_recv[iswap]) {
+            MPI_Irecv(k_buf_recv.view<DeviceType>().data(),
+                      size_forward_recv[iswap],MPI_DOUBLE,
+                      recvproc[iswap],0,world,&request);
+          }
+          n = avec->pack_comm_vel_kokkos(sendnum[iswap],k_sendlist,iswap,
+                                         k_buf_send,pbc_flag[iswap],pbc[iswap]);
+          DeviceType::fence();
+          if (n) {
+            MPI_Send(k_buf_send.view<DeviceType>().data(),n,
+                     MPI_DOUBLE,sendproc[iswap],0,world);
+          }
+          if (size_forward_recv[iswap]) MPI_Wait(&request,MPI_STATUS_IGNORE);
+          avec->unpack_comm_vel_kokkos(recvnum[iswap],firstrecv[iswap],k_buf_recv);
+          DeviceType::fence();
+        } else {
+          if (size_forward_recv[iswap])
+            MPI_Irecv(k_buf_recv.view<DeviceType>().data(),
+                      size_forward_recv[iswap],MPI_DOUBLE,
+                      recvproc[iswap],0,world,&request);
+          n = avec->pack_comm_kokkos(sendnum[iswap],k_sendlist,iswap,
+                                     k_buf_send,pbc_flag[iswap],pbc[iswap]);
+          DeviceType::fence();
+          if (n)
+            MPI_Send(k_buf_send.view<DeviceType>().data(),n,
+                     MPI_DOUBLE,sendproc[iswap],0,world);
+          if (size_forward_recv[iswap]) MPI_Wait(&request,MPI_STATUS_IGNORE);
+          avec->unpack_comm_kokkos(recvnum[iswap],firstrecv[iswap],k_buf_recv);
+          DeviceType::fence();
        }
-      } else if (ghost_velocity) {
-        if (size_forward_recv[iswap]) {
-          MPI_Irecv(k_buf_recv.view<DeviceType>().data(),
-                    size_forward_recv[iswap],MPI_DOUBLE,
-                    recvproc[iswap],0,world,&request);
-        }
-        n = avec->pack_comm_vel_kokkos(sendnum[iswap],k_sendlist,iswap,
-                                       k_buf_send,pbc_flag[iswap],pbc[iswap]);
-        DeviceType::fence();
-        if (n) {
-          MPI_Send(k_buf_send.view<DeviceType>().data(),n,
-                   MPI_DOUBLE,sendproc[iswap],0,world);
-        }
-        if (size_forward_recv[iswap]) MPI_Wait(&request,MPI_STATUS_IGNORE);
-        avec->unpack_comm_vel_kokkos(recvnum[iswap],firstrecv[iswap],k_buf_recv);
-        DeviceType::fence();
      } else {
-        if (size_forward_recv[iswap])
-          MPI_Irecv(k_buf_recv.view<DeviceType>().data(),
-                    size_forward_recv[iswap],MPI_DOUBLE,
-                    recvproc[iswap],0,world,&request);
-        n = avec->pack_comm_kokkos(sendnum[iswap],k_sendlist,iswap,
-                                   k_buf_send,pbc_flag[iswap],pbc[iswap]);
-        DeviceType::fence();
-        if (n)
-          MPI_Send(k_buf_send.view<DeviceType>().data(),n,
-                   MPI_DOUBLE,sendproc[iswap],0,world);
-        if (size_forward_recv[iswap]) MPI_Wait(&request,MPI_STATUS_IGNORE);
-        avec->unpack_comm_kokkos(recvnum[iswap],firstrecv[iswap],k_buf_recv);
-        DeviceType::fence();
-      }
-    } else {
-      if (!ghost_velocity) {
-        if (sendnum[iswap])
-          n = avec->pack_comm_self(sendnum[iswap],k_sendlist,iswap,
-                                   firstrecv[iswap],pbc_flag[iswap],pbc[iswap]);
-        DeviceType::fence();
-      } else {
-        n = avec->pack_comm_vel_kokkos(sendnum[iswap],k_sendlist,iswap,
-                                       k_buf_send,pbc_flag[iswap],pbc[iswap]);
-        DeviceType::fence();
-        avec->unpack_comm_vel_kokkos(recvnum[iswap],firstrecv[iswap],k_buf_send);
-        DeviceType::fence();
+        if (!ghost_velocity) {
+          if (sendnum[iswap])
+            n = avec->pack_comm_self(sendnum[iswap],k_sendlist,iswap,
+                                     firstrecv[iswap],pbc_flag[iswap],pbc[iswap]);
+          DeviceType::fence();
+        } else {
+          n = avec->pack_comm_vel_kokkos(sendnum[iswap],k_sendlist,iswap,
+                                         k_buf_send,pbc_flag[iswap],pbc[iswap]);
+          DeviceType::fence();
+          avec->unpack_comm_vel_kokkos(recvnum[iswap],firstrecv[iswap],k_buf_send);
+          DeviceType::fence();
+        }
      }
    }
  }
@ -496,9 +504,8 @@ void CommKokkos::exchange()
  }

  atomKK->sync(Host,ALL_MASK);
-  atomKK->modified(Host,ALL_MASK);
-
  CommBrick::exchange();
+  atomKK->modified(Host,ALL_MASK);
 }

 /* ---------------------------------------------------------------------- */
@ -565,146 +572,149 @@ void CommKokkos::exchange_device()
  atom->nghost = 0;
  atom->avec->clear_bonus();

-  // subbox bounds for orthogonal or triclinic
+  if (comm->nprocs > 1) { // otherwise no-op

-  if (triclinic == 0) {
-    sublo = domain->sublo;
-    subhi = domain->subhi;
-  } else {
-    sublo = domain->sublo_lamda;
-    subhi = domain->subhi_lamda;
-  }
+    // subbox bounds for orthogonal or triclinic

-  atomKK->sync(ExecutionSpaceFromDevice<DeviceType>::space,ALL_MASK);
+    if (triclinic == 0) {
+      sublo = domain->sublo;
+      subhi = domain->subhi;
+    } else {
+      sublo = domain->sublo_lamda;
+      subhi = domain->subhi_lamda;
+    }

-  // loop over dimensions
-  for (int dim = 0; dim < 3; dim++) {
+    atomKK->sync(ExecutionSpaceFromDevice<DeviceType>::space,ALL_MASK);

-    // fill buffer with atoms leaving my box, using < and >=
-    // when atom is deleted, fill it in with last atom
+    // loop over dimensions
+    for (int dim = 0; dim < 3; dim++) {

-    x = atom->x;
-    lo = sublo[dim];
-    hi = subhi[dim];
-    nlocal = atom->nlocal;
-    i = nsend = 0;
+      // fill buffer with atoms leaving my box, using < and >=
+      // when atom is deleted, fill it in with last atom

-    if (true) {
-      if (k_sendflag.h_view.extent(0)<nlocal) k_sendflag.resize(nlocal);
-      k_sendflag.sync<DeviceType>();
-      k_count.h_view() = k_exchange_sendlist.h_view.extent(0);
-      while (k_count.h_view()>=k_exchange_sendlist.h_view.extent(0)) {
-        k_count.h_view() = 0;
-        k_count.modify<LMPHostType>();
-        k_count.sync<DeviceType>();
+      x = atom->x;
+      lo = sublo[dim];
+      hi = subhi[dim];
+      nlocal = atom->nlocal;
+      i = nsend = 0;

-        BuildExchangeListFunctor<DeviceType>
-          f(atomKK->k_x,k_exchange_sendlist,k_count,k_sendflag,
-            nlocal,dim,lo,hi);
-        Kokkos::parallel_for(nlocal,f);
-        k_exchange_sendlist.modify<DeviceType>();
-        k_sendflag.modify<DeviceType>();
-        k_count.modify<DeviceType>();
+      if (true) {
+        if (k_sendflag.h_view.extent(0)<nlocal) k_sendflag.resize(nlocal);
+        k_sendflag.sync<DeviceType>();
+        k_count.h_view() = k_exchange_sendlist.h_view.extent(0);
+        while (k_count.h_view()>=k_exchange_sendlist.h_view.extent(0)) {
+          k_count.h_view() = 0;
+          k_count.modify<LMPHostType>();
+          k_count.sync<DeviceType>();

-        k_count.sync<LMPHostType>();
-        if (k_count.h_view()>=k_exchange_sendlist.h_view.extent(0)) {
-          k_exchange_sendlist.resize(k_count.h_view()*1.1);
-          k_exchange_copylist.resize(k_count.h_view()*1.1);
-          k_count.h_view()=k_exchange_sendlist.h_view.extent(0);
+          BuildExchangeListFunctor<DeviceType>
+            f(atomKK->k_x,k_exchange_sendlist,k_count,k_sendflag,
+              nlocal,dim,lo,hi);
+          Kokkos::parallel_for(nlocal,f);
+          k_exchange_sendlist.modify<DeviceType>();
+          k_sendflag.modify<DeviceType>();
+          k_count.modify<DeviceType>();
+
+          k_count.sync<LMPHostType>();
+          if (k_count.h_view()>=k_exchange_sendlist.h_view.extent(0)) {
+            k_exchange_lists.resize(2,k_count.h_view()*1.1);
+            k_exchange_sendlist = Kokkos::subview(k_exchange_lists,0,Kokkos::ALL);
+            k_exchange_copylist = Kokkos::subview(k_exchange_lists,1,Kokkos::ALL);
+            k_count.h_view()=k_exchange_sendlist.h_view.extent(0);
+          }
+        }
+
+        k_exchange_lists.sync<LMPHostType>();
+        k_sendflag.sync<LMPHostType>();
+
+        int sendpos = nlocal-1;
+        nlocal -= k_count.h_view();
+        for(int i = 0; i < k_count.h_view(); i++) {
+          if (k_exchange_sendlist.h_view(i)<nlocal) {
+            while (k_sendflag.h_view(sendpos)) sendpos--;
+            k_exchange_copylist.h_view(i) = sendpos;
+            sendpos--;
+          } else
+            k_exchange_copylist.h_view(i) = -1;
+        }
+
+        k_exchange_copylist.modify<LMPHostType>();
+        k_exchange_copylist.sync<DeviceType>();
+        nsend = k_count.h_view();
+        if (nsend > maxsend) grow_send_kokkos(nsend,1);
+        nsend =
+          avec->pack_exchange_kokkos(k_count.h_view(),k_buf_send,
+                                     k_exchange_sendlist,k_exchange_copylist,
+                                     ExecutionSpaceFromDevice<DeviceType>::space,
+                                     dim,lo,hi);
+        DeviceType::fence();
+      } else {
+        while (i < nlocal) {
+          if (x[i][dim] < lo || x[i][dim] >= hi) {
+            if (nsend > maxsend) grow_send_kokkos(nsend,1);
+            nsend += avec->pack_exchange(i,&buf_send[nsend]);
+            avec->copy(nlocal-1,i,1);
+            nlocal--;
+          } else i++;
        }
      }
-      k_exchange_copylist.sync<LMPHostType>();
-      k_exchange_sendlist.sync<LMPHostType>();
-      k_sendflag.sync<LMPHostType>();
+      atom->nlocal = nlocal;

-      int sendpos = nlocal-1;
-      nlocal -= k_count.h_view();
-      for(int i = 0; i < k_count.h_view(); i++) {
-        if (k_exchange_sendlist.h_view(i)<nlocal) {
-          while (k_sendflag.h_view(sendpos)) sendpos--;
-          k_exchange_copylist.h_view(i) = sendpos;
-          sendpos--;
-        } else
-          k_exchange_copylist.h_view(i) = -1;
-      }
+      // send/recv atoms in both directions
+      // if 1 proc in dimension, no send/recv, set recv buf to send buf
+      // if 2 procs in dimension, single send/recv
+      // if more than 2 procs in dimension, send/recv to both neighbors

-      k_exchange_copylist.modify<LMPHostType>();
-      k_exchange_copylist.sync<DeviceType>();
-      nsend = k_count.h_view();
-      if (nsend > maxsend) grow_send_kokkos(nsend,1);
-      nsend =
-        avec->pack_exchange_kokkos(k_count.h_view(),k_buf_send,
-                                   k_exchange_sendlist,k_exchange_copylist,
-                                   ExecutionSpaceFromDevice<DeviceType>::space,
-                                   dim,lo,hi);
-      DeviceType::fence();
-    } else {
-      while (i < nlocal) {
-        if (x[i][dim] < lo || x[i][dim] >= hi) {
-          if (nsend > maxsend) grow_send_kokkos(nsend,1);
-          nsend += avec->pack_exchange(i,&buf_send[nsend]);
-          avec->copy(nlocal-1,i,1);
-          nlocal--;
-        } else i++;
-      }
-    }
-    atom->nlocal = nlocal;
+      if (procgrid[dim] == 1) {
+        nrecv = nsend;
+        if (nrecv) {
+          atom->nlocal=avec->
+            unpack_exchange_kokkos(k_buf_send,nrecv,atom->nlocal,dim,lo,hi,
+                                   ExecutionSpaceFromDevice<DeviceType>::space);
+          DeviceType::fence();
+        }
+      } else {
+        MPI_Sendrecv(&nsend,1,MPI_INT,procneigh[dim][0],0,
+                     &nrecv1,1,MPI_INT,procneigh[dim][1],0,world,MPI_STATUS_IGNORE);
+        nrecv = nrecv1;
+        if (procgrid[dim] > 2) {
+          MPI_Sendrecv(&nsend,1,MPI_INT,procneigh[dim][1],0,
+                       &nrecv2,1,MPI_INT,procneigh[dim][0],0,world,MPI_STATUS_IGNORE);
+          nrecv += nrecv2;
+        }
+        if (nrecv > maxrecv) grow_recv_kokkos(nrecv);

-    // send/recv atoms in both directions
-    // if 1 proc in dimension, no send/recv, set recv buf to send buf
-    // if 2 procs in dimension, single send/recv
-    // if more than 2 procs in dimension, send/recv to both neighbors
-
-    if (procgrid[dim] == 1) {
-      nrecv = nsend;
-      if (nrecv) {
-        atom->nlocal=avec->
-          unpack_exchange_kokkos(k_buf_send,nrecv,atom->nlocal,dim,lo,hi,
-                                 ExecutionSpaceFromDevice<DeviceType>::space);
-        DeviceType::fence();
-      }
-    } else {
-      MPI_Sendrecv(&nsend,1,MPI_INT,procneigh[dim][0],0,
-                   &nrecv1,1,MPI_INT,procneigh[dim][1],0,world,MPI_STATUS_IGNORE);
-      nrecv = nrecv1;
-      if (procgrid[dim] > 2) {
-        MPI_Sendrecv(&nsend,1,MPI_INT,procneigh[dim][1],0,
-                     &nrecv2,1,MPI_INT,procneigh[dim][0],0,world,MPI_STATUS_IGNORE);
-        nrecv += nrecv2;
-      }
-      if (nrecv > maxrecv) grow_recv_kokkos(nrecv);
-
-      MPI_Irecv(k_buf_recv.view<DeviceType>().data(),nrecv1,
-                MPI_DOUBLE,procneigh[dim][1],0,
-                world,&request);
-      MPI_Send(k_buf_send.view<DeviceType>().data(),nsend,
-               MPI_DOUBLE,procneigh[dim][0],0,world);
-      MPI_Wait(&request,MPI_STATUS_IGNORE);
-
-      if (procgrid[dim] > 2) {
-        MPI_Irecv(k_buf_recv.view<DeviceType>().data()+nrecv1,
-                  nrecv2,MPI_DOUBLE,procneigh[dim][0],0,
+        MPI_Irecv(k_buf_recv.view<DeviceType>().data(),nrecv1,
+                  MPI_DOUBLE,procneigh[dim][1],0,
                  world,&request);
        MPI_Send(k_buf_send.view<DeviceType>().data(),nsend,
-                 MPI_DOUBLE,procneigh[dim][1],0,world);
+                 MPI_DOUBLE,procneigh[dim][0],0,world);
        MPI_Wait(&request,MPI_STATUS_IGNORE);
+
+        if (procgrid[dim] > 2) {
+          MPI_Irecv(k_buf_recv.view<DeviceType>().data()+nrecv1,
+                    nrecv2,MPI_DOUBLE,procneigh[dim][0],0,
+                    world,&request);
+          MPI_Send(k_buf_send.view<DeviceType>().data(),nsend,
+                   MPI_DOUBLE,procneigh[dim][1],0,world);
+          MPI_Wait(&request,MPI_STATUS_IGNORE);
+        }
+
+        if (nrecv) {
+          atom->nlocal = avec->
+            unpack_exchange_kokkos(k_buf_recv,nrecv,atom->nlocal,dim,lo,hi,
+                                   ExecutionSpaceFromDevice<DeviceType>::space);
+          DeviceType::fence();
+        }
      }

-      if (nrecv) {
-        atom->nlocal = avec->
-          unpack_exchange_kokkos(k_buf_recv,nrecv,atom->nlocal,dim,lo,hi,
-                                 ExecutionSpaceFromDevice<DeviceType>::space);
-        DeviceType::fence();
-      }
+      // check incoming atoms to see if they are in my box
+      // if so, add to my list
+
    }
-
-    // check incoming atoms to see if they are in my box
-    // if so, add to my list
-
+    atomKK->modified(ExecutionSpaceFromDevice<DeviceType>::space,ALL_MASK);
  }

-  atomKK->modified(ExecutionSpaceFromDevice<DeviceType>::space,ALL_MASK);
-
  if (atom->firstgroupname) {
    /* this is not yet implemented with Kokkos */
    atomKK->sync(Host,ALL_MASK);
@ -742,14 +752,15 @@ void CommKokkos::borders()
  if (!exchange_comm_classic) {
    if (exchange_comm_on_host) borders_device<LMPHostType>();
    else borders_device<LMPDeviceType>();
-    return;
+  } else {
+    atomKK->sync(Host,ALL_MASK);
+    CommBrick::borders();
+    k_sendlist.modify<LMPHostType>();
+    atomKK->modified(Host,ALL_MASK);
  }

-  atomKK->sync(Host,ALL_MASK);
-  k_sendlist.sync<LMPHostType>();
-  CommBrick::borders();
-  k_sendlist.modify<LMPHostType>();
-  atomKK->modified(Host,ALL_MASK);
+  if (comm->nprocs == 1 && !ghost_velocity && !forward_comm_classic)
+    copy_swap_info();
 }

 /* ---------------------------------------------------------------------- */
@ -817,7 +828,6 @@ void CommKokkos::borders_device() {
  AtomVecKokkos *avec = (AtomVecKokkos *) atom->avec;

  ExecutionSpace exec_space = ExecutionSpaceFromDevice<DeviceType>::space;
-  k_sendlist.sync<DeviceType>();
  atomKK->sync(exec_space,ALL_MASK);

  // do swaps over all 3 dimensions
@ -1037,6 +1047,69 @@ void CommKokkos::borders_device() {
    atom->map_set();
  }
 }
+
+/* ----------------------------------------------------------------------
+   copy swap info
+------------------------------------------------------------------------- */
+
+void CommKokkos::copy_swap_info()
+{
+  if (nswap > k_swap.extent(1)) {
+    k_swap = DAT::tdual_int_2d("comm:swap",2,nswap);
+    k_firstrecv    = Kokkos::subview(k_swap,0,Kokkos::ALL);
+    k_sendnum_scan = Kokkos::subview(k_swap,1,Kokkos::ALL);
+  }
+  int scan = 0;
+  for (int iswap = 0; iswap < nswap; iswap++) {
+    scan += sendnum[iswap];
+    k_sendnum_scan.h_view[iswap] = scan;
+    k_firstrecv.h_view[iswap] = firstrecv[iswap];
+  }
+  totalsend = scan;
+
+  // create map of ghost to local atom id
+  // store periodic boundary transform from local to ghost
+
+  k_sendlist.sync<LMPHostType>();
+
+  if (totalsend > k_pbc.extent(0)) {
+    k_pbc = DAT::tdual_int_2d("comm:pbc",totalsend,6);
+    k_swap2 = DAT::tdual_int_2d("comm:swap2",2,totalsend);
+    k_pbc_flag = Kokkos::subview(k_swap2,0,Kokkos::ALL);
+    k_g2l = Kokkos::subview(k_swap2,1,Kokkos::ALL);
+  }
+
+  for (int iswap = 0; iswap < nswap; iswap++) {
+    for (int i = 0; i < sendnum[iswap]; i++) {
+      int source = sendlist[iswap][i] - atom->nlocal;
+      int dest = firstrecv[iswap] + i - atom->nlocal;
+      k_pbc_flag.h_view(dest) = pbc_flag[iswap];
+      k_pbc.h_view(dest,0) = pbc[iswap][0];
+      k_pbc.h_view(dest,1) = pbc[iswap][1];
+      k_pbc.h_view(dest,2) = pbc[iswap][2];
+      k_pbc.h_view(dest,3) = pbc[iswap][3];
+      k_pbc.h_view(dest,4) = pbc[iswap][4];
+      k_pbc.h_view(dest,5) = pbc[iswap][5];
+      k_g2l.h_view(dest) = atom->nlocal + source;
+
+      if (source >= 0) {
+        k_pbc_flag.h_view(dest) = k_pbc_flag.h_view(dest) || k_pbc_flag.h_view(source);
+        k_pbc.h_view(dest,0) += k_pbc.h_view(source,0);
+        k_pbc.h_view(dest,1) += k_pbc.h_view(source,1);
+        k_pbc.h_view(dest,2) += k_pbc.h_view(source,2);
+        k_pbc.h_view(dest,3) += k_pbc.h_view(source,3);
+        k_pbc.h_view(dest,4) += k_pbc.h_view(source,4);
+        k_pbc.h_view(dest,5) += k_pbc.h_view(source,5);
+        k_g2l.h_view(dest) = k_g2l.h_view(source);
+      }
+    }
+  }
+
+  k_swap.modify<LMPHostType>();
+  k_swap2.modify<LMPHostType>();
+  k_pbc.modify<LMPHostType>();
+}
+
 /* ----------------------------------------------------------------------
   realloc the size of the send buffer as needed with BUFFACTOR and bufextra
   if flag = 1, realloc
--- a/src/KOKKOS/comm_kokkos.h
+++ b/src/KOKKOS/comm_kokkos.h
@ -58,11 +58,21 @@ class CommKokkos : public CommBrick {
  DAT::tdual_int_2d k_sendlist;
  DAT::tdual_int_scalar k_total_send;
  DAT::tdual_xfloat_2d k_buf_send,k_buf_recv;
+  DAT::tdual_int_2d k_exchange_lists;
  DAT::tdual_int_1d k_exchange_sendlist,k_exchange_copylist,k_sendflag;
  DAT::tdual_int_scalar k_count;
  //double *buf_send;                 // send buffer for all comm
  //double *buf_recv;                 // recv buffer for all comm

+  DAT::tdual_int_2d k_swap;
+  DAT::tdual_int_2d k_swap2;
+  DAT::tdual_int_2d k_pbc;
+  DAT::tdual_int_1d k_pbc_flag;
+  DAT::tdual_int_1d k_g2l;
+  DAT::tdual_int_1d k_firstrecv;
+  DAT::tdual_int_1d k_sendnum_scan;
+  int totalsend;
+
  int max_buf_pair;
  DAT::tdual_xfloat_1d k_buf_send_pair;
  DAT::tdual_xfloat_1d k_buf_recv_pair;
@ -74,6 +84,7 @@ class CommKokkos : public CommBrick {
  void grow_recv_kokkos(int, ExecutionSpace space = Host);
  void grow_list(int, int);
  void grow_swap(int);
+  void copy_swap_info();
 };

 }
--- a/src/KOKKOS/domain_kokkos.cpp
+++ b/src/KOKKOS/domain_kokkos.cpp
@ -17,6 +17,7 @@
 #include "error.h"
 #include "force.h"
 #include "kspace.h"
+#include "kokkos.h"

 using namespace LAMMPS_NS;

@ -339,6 +340,17 @@ struct DomainPBCFunctor {

 void DomainKokkos::pbc()
 {
+ 
+  if (lmp->kokkos->exchange_comm_classic) {
+ 
+   // reduce GPU data movement
+ 
+    atomKK->sync(Host,X_MASK|V_MASK|MASK_MASK|IMAGE_MASK);
+    Domain::pbc();
+    atomKK->modified(Host,X_MASK|V_MASK|MASK_MASK|IMAGE_MASK);
+    return;
+  }
+
  double *lo,*hi,*period;
  int nlocal = atomKK->nlocal;

--- a/src/KOKKOS/fix_nve_kokkos.cpp
+++ b/src/KOKKOS/fix_nve_kokkos.cpp
@ -113,8 +113,8 @@ void FixNVEKokkos<DeviceType>::initial_integrate_rmass_item(int i) const
 template<class DeviceType>
 void FixNVEKokkos<DeviceType>::final_integrate()
 {
-  atomKK->sync(execution_space,datamask_read);
-  atomKK->modified(execution_space,datamask_modify);
+  atomKK->sync(execution_space,V_MASK | F_MASK | MASK_MASK | RMASS_MASK | TYPE_MASK);
+  atomKK->modified(execution_space,V_MASK);

  v = atomKK->k_v.view<DeviceType>();
  f = atomKK->k_f.view<DeviceType>();
--- a/src/KOKKOS/fix_property_atom_kokkos.cpp
+++ b/src/KOKKOS/fix_property_atom_kokkos.cpp
@ -19,6 +19,7 @@
 #include "memory_kokkos.h"
 #include "error.h"
 #include "update.h"
+#include "atom_masks.h"

 using namespace LAMMPS_NS;
 using namespace FixConst;
@ -61,8 +62,10 @@ void FixPropertyAtomKokkos::grow_arrays(int nmax)
      size_t nbytes = (nmax-nmax_old) * sizeof(int);
      memset(&atom->ivector[index[m]][nmax_old],0,nbytes);
    } else if (style[m] == DOUBLE) {
+      atomKK->sync(Device,DVECTOR_MASK);
      memoryKK->grow_kokkos(atomKK->k_dvector,atomKK->dvector,atomKK->k_dvector.extent(0),nmax,
                          "atom:dvector");
+      atomKK->modified(Device,DVECTOR_MASK);
      //memory->grow(atom->dvector[index[m]],nmax,"atom:dvector");
      //size_t nbytes = (nmax-nmax_old) * sizeof(double);
      //memset(&atom->dvector[index[m]][nmax_old],0,nbytes);
--- a/src/KOKKOS/fix_qeq_reax_kokkos.cpp
+++ b/src/KOKKOS/fix_qeq_reax_kokkos.cpp
@ -58,7 +58,7 @@ FixQEqReaxKokkos(LAMMPS *lmp, int narg, char **arg) :
  atomKK = (AtomKokkos *) atom;
  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;

-  datamask_read = X_MASK | V_MASK | F_MASK | MASK_MASK | Q_MASK | TYPE_MASK;
+  datamask_read = X_MASK | V_MASK | F_MASK | MASK_MASK | Q_MASK | TYPE_MASK | TAG_MASK;
  datamask_modify = Q_MASK | X_MASK;

  nmax = nmax = m_cap = 0;
@ -164,6 +164,9 @@ void FixQEqReaxKokkos<DeviceType>::init_shielding_k()
 template<class DeviceType>
 void FixQEqReaxKokkos<DeviceType>::init_hist()
 {
+  k_s_hist.clear_sync_state();
+  k_t_hist.clear_sync_state();
+
  Kokkos::deep_copy(d_s_hist,0.0);
  Kokkos::deep_copy(d_t_hist,0.0);

@ -189,7 +192,6 @@ void FixQEqReaxKokkos<DeviceType>::pre_force(int vflag)
  if (update->ntimestep % nevery) return;

  atomKK->sync(execution_space,datamask_read);
-  atomKK->modified(execution_space,datamask_modify);

  x = atomKK->k_x.view<DeviceType>();
  v = atomKK->k_v.view<DeviceType>();
@ -273,6 +275,8 @@ void FixQEqReaxKokkos<DeviceType>::pre_force(int vflag)
  // free duplicated memory
  if (need_dup)
    dup_o = decltype(dup_o)();
+
+  atomKK->modified(execution_space,datamask_modify);
 }

 /* ---------------------------------------------------------------------- */
@ -1199,9 +1203,12 @@ double FixQEqReaxKokkos<DeviceType>::memory_usage()
 template<class DeviceType>
 void FixQEqReaxKokkos<DeviceType>::grow_arrays(int nmax)
 {
-  k_s_hist.template sync<LMPHostType>(); // force reallocation on host
+  k_s_hist.template sync<LMPHostType>();
  k_t_hist.template sync<LMPHostType>();

+  k_s_hist.template modify<LMPHostType>(); // force reallocation on host
+  k_t_hist.template modify<LMPHostType>();
+
  memoryKK->grow_kokkos(k_s_hist,s_hist,nmax,nprev,"qeq:s_hist");
  memoryKK->grow_kokkos(k_t_hist,t_hist,nmax,nprev,"qeq:t_hist");

--- a/src/KOKKOS/kokkos.cpp
+++ b/src/KOKKOS/kokkos.cpp
@ -184,10 +184,12 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)

  binsize = 0.0;
  gpu_direct_flag = 1;
+  neigh_thread = 0;
+  neigh_thread_set = 0;
+  neighflag_qeq_set = 0;
  if (ngpu > 0) {
    neighflag = FULL;
    neighflag_qeq = FULL;
-    neighflag_qeq_set = 0;
    newtonflag = 0;
    exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 0;
    exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0;
@ -199,7 +201,6 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
      neighflag = HALF;
      neighflag_qeq = HALF;
    }
-    neighflag_qeq_set = 0;
    newtonflag = 1;
    exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 1;
    exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0;
@ -318,6 +319,13 @@ void KokkosLMP::accelerator(int narg, char **arg)
      else if (strcmp(arg[iarg+1],"on") == 0) gpu_direct_flag = 1;
      else error->all(FLERR,"Illegal package kokkos command");
      iarg += 2;
+    } else if (strcmp(arg[iarg],"neigh/thread") == 0) {
+      if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command");
+      if (strcmp(arg[iarg+1],"off") == 0) neigh_thread = 0;
+      else if (strcmp(arg[iarg+1],"on") == 0) neigh_thread = 1;
+      else error->all(FLERR,"Illegal package kokkos command");
+      neigh_thread_set = 1;
+      iarg += 2;
    } else error->all(FLERR,"Illegal package kokkos command");
  }

@ -337,6 +345,9 @@ void KokkosLMP::accelerator(int narg, char **arg)

  force->newton = force->newton_pair = force->newton_bond = newtonflag;

+  if (neigh_thread && neighflag != FULL)
+    error->all(FLERR,"Must use KOKKOS package option 'neigh full' with 'neigh/thread on'");
+
  neighbor->binsize_user = binsize;
  if (binsize <= 0.0) neighbor->binsizeflag = 0;
  else neighbor->binsizeflag = 1;
--- a/src/KOKKOS/kokkos.h
+++ b/src/KOKKOS/kokkos.h
@ -36,6 +36,8 @@ class KokkosLMP : protected Pointers {
  int numa;
  int auto_sync;
  int gpu_direct_flag;
+  int neigh_thread;
+  int neigh_thread_set;
  int newtonflag;
  double binsize;

@ -87,4 +89,8 @@ U: Must use Kokkos half/thread or full neighbor list with threads or GPUs

 Using Kokkos half-neighbor lists with threading is not allowed.

+E: Must use KOKKOS package option 'neigh full' with 'neigh/thread on'
+
+The 'neigh/thread on' option requires a full neighbor list
+
 */
--- a/src/KOKKOS/kokkos_type.h
+++ b/src/KOKKOS/kokkos_type.h
@ -448,6 +448,52 @@ struct s_EV_FLOAT_REAX {
 };
 typedef struct s_EV_FLOAT_REAX EV_FLOAT_REAX;

+struct s_FEV_FLOAT {
+  F_FLOAT f[3];
+  E_FLOAT evdwl;
+  E_FLOAT ecoul;
+  E_FLOAT v[6];
+  KOKKOS_INLINE_FUNCTION
+  s_FEV_FLOAT() {
+    f[0] = 0; f[1] = 0; f[2] = 0;
+    evdwl = 0;
+    ecoul = 0;
+    v[0] = 0; v[1] = 0; v[2] = 0;
+    v[3] = 0; v[4] = 0; v[5] = 0;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator+=(const s_FEV_FLOAT &rhs) {
+    f[0] += rhs.f[0];
+    f[1] += rhs.f[1];
+    f[2] += rhs.f[2];
+    evdwl += rhs.evdwl;
+    ecoul += rhs.ecoul;
+    v[0] += rhs.v[0];
+    v[1] += rhs.v[1];
+    v[2] += rhs.v[2];
+    v[3] += rhs.v[3];
+    v[4] += rhs.v[4];
+    v[5] += rhs.v[5];
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator+=(const volatile s_FEV_FLOAT &rhs) volatile {
+    f[0] += rhs.f[0];
+    f[1] += rhs.f[1];
+    f[2] += rhs.f[2];
+    evdwl += rhs.evdwl;
+    ecoul += rhs.ecoul;
+    v[0] += rhs.v[0];
+    v[1] += rhs.v[1];
+    v[2] += rhs.v[2];
+    v[3] += rhs.v[3];
+    v[4] += rhs.v[4];
+    v[5] += rhs.v[5];
+  }
+};
+typedef struct s_FEV_FLOAT FEV_FLOAT;
+
 #ifndef PREC_POS
 #define PREC_POS PRECISION
 #endif
--- a/src/KOKKOS/npair_kokkos.cpp
+++ b/src/KOKKOS/npair_kokkos.cpp
@ -15,6 +15,7 @@
 #include "atom_kokkos.h"
 #include "atom_masks.h"
 #include "domain_kokkos.h"
+#include "update.h"
 #include "neighbor_kokkos.h"
 #include "nbin_kokkos.h"
 #include "nstencil.h"
@ -27,6 +28,16 @@ namespace LAMMPS_NS {
 template<class DeviceType, int HALF_NEIGH, int GHOST, int TRI, int SIZE>
 NPairKokkos<DeviceType,HALF_NEIGH,GHOST,TRI,SIZE>::NPairKokkos(LAMMPS *lmp) : NPair(lmp) {

+  // use 1D view for scalars to reduce GPU memory operations
+
+  d_scalars = typename AT::t_int_1d("neighbor:scalars",2);
+  h_scalars = HAT::t_int_1d("neighbor:scalars_mirror",2);
+
+  d_resize = Kokkos::subview(d_scalars,0);
+  d_new_maxneighs = Kokkos::subview(d_scalars,1);
+
+  h_resize = Kokkos::subview(h_scalars,0);
+  h_new_maxneighs = Kokkos::subview(h_scalars,1);
 }

 /* ----------------------------------------------------------------------
@ -84,27 +95,30 @@ template<class DeviceType, int HALF_NEIGH, int GHOST, int TRI, int SIZE>
 void NPairKokkos<DeviceType,HALF_NEIGH,GHOST,TRI,SIZE>::copy_stencil_info()
 {
  NPair::copy_stencil_info();
-
  nstencil = ns->nstencil;

-  int maxstencil = ns->get_maxstencil();
+  if (neighbor->last_setup_bins == update->ntimestep) {
+    // copy stencil to device as it may have changed

-  if (maxstencil > k_stencil.extent(0))
-    k_stencil = DAT::tdual_int_1d("neighlist:stencil",maxstencil);
-  for (int k = 0; k < maxstencil; k++)
-    k_stencil.h_view(k) = ns->stencil[k];
-    k_stencil.modify<LMPHostType>();
-    k_stencil.sync<DeviceType>();
-  if (GHOST) {
-    if (maxstencil > k_stencilxyz.extent(0))
-      k_stencilxyz = DAT::tdual_int_1d_3("neighlist:stencilxyz",maxstencil);
-    for (int k = 0; k < maxstencil; k++) {
-      k_stencilxyz.h_view(k,0) = ns->stencilxyz[k][0];
-      k_stencilxyz.h_view(k,1) = ns->stencilxyz[k][1];
-      k_stencilxyz.h_view(k,2) = ns->stencilxyz[k][2];
+    int maxstencil = ns->get_maxstencil();
+    
+    if (maxstencil > k_stencil.extent(0))
+      k_stencil = DAT::tdual_int_1d("neighlist:stencil",maxstencil);
+    for (int k = 0; k < maxstencil; k++)
+      k_stencil.h_view(k) = ns->stencil[k];
+      k_stencil.modify<LMPHostType>();
+      k_stencil.sync<DeviceType>();
+    if (GHOST) {
+      if (maxstencil > k_stencilxyz.extent(0))
+        k_stencilxyz = DAT::tdual_int_1d_3("neighlist:stencilxyz",maxstencil);
+      for (int k = 0; k < maxstencil; k++) {
+        k_stencilxyz.h_view(k,0) = ns->stencilxyz[k][0];
+        k_stencilxyz.h_view(k,1) = ns->stencilxyz[k][1];
+        k_stencilxyz.h_view(k,2) = ns->stencilxyz[k][2];
+      }
+      k_stencilxyz.modify<LMPHostType>();
+      k_stencilxyz.sync<DeviceType>();
    }
-    k_stencilxyz.modify<LMPHostType>();
-    k_stencilxyz.sync<DeviceType>();
  }
 }

@ -126,7 +140,7 @@ void NPairKokkos<DeviceType,HALF_NEIGH,GHOST,TRI,SIZE>::build(NeighList *list_)
         k_bincount.view<DeviceType>(),
         k_bins.view<DeviceType>(),
         k_atom2bin.view<DeviceType>(),
-         nstencil,
+         mbins,nstencil,
         k_stencil.view<DeviceType>(),
         k_stencilxyz.view<DeviceType>(),
         nlocal,
@ -157,7 +171,7 @@ void NPairKokkos<DeviceType,HALF_NEIGH,GHOST,TRI,SIZE>::build(NeighList *list_)
         bboxhi,bboxlo,
         domain->xperiodic,domain->yperiodic,domain->zperiodic,
         domain->xprd_half,domain->yprd_half,domain->zprd_half,
-	 skin);
+         skin,d_resize,h_resize,d_new_maxneighs,h_new_maxneighs);

  k_cutneighsq.sync<DeviceType>();
  k_ex1_type.sync<DeviceType>();
@ -173,7 +187,18 @@ void NPairKokkos<DeviceType,HALF_NEIGH,GHOST,TRI,SIZE>::build(NeighList *list_)
  k_bincount.sync<DeviceType>();
  k_bins.sync<DeviceType>();
  k_atom2bin.sync<DeviceType>();
-  atomKK->sync(Device,X_MASK|RADIUS_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK|TAG_MASK|SPECIAL_MASK);
+
+  if (atom->molecular) {
+    if (exclude)
+      atomKK->sync(Device,X_MASK|RADIUS_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK|TAG_MASK|SPECIAL_MASK);
+    else
+      atomKK->sync(Device,X_MASK|RADIUS_MASK|TYPE_MASK|TAG_MASK|SPECIAL_MASK);
+  } else {
+    if (exclude)
+      atomKK->sync(Device,X_MASK|RADIUS_MASK|TYPE_MASK|MASK_MASK);
+    else
+      atomKK->sync(Device,X_MASK|RADIUS_MASK|TYPE_MASK);
+  }

  data.special_flag[0] = special_flag[0];
  data.special_flag[1] = special_flag[1];
@ -185,8 +210,7 @@ void NPairKokkos<DeviceType,HALF_NEIGH,GHOST,TRI,SIZE>::build(NeighList *list_)
    data.h_new_maxneighs() = list->maxneighs;
    data.h_resize() = 0;

-    Kokkos::deep_copy(data.resize, data.h_resize);
-    Kokkos::deep_copy(data.new_maxneighs, data.h_new_maxneighs);
+    Kokkos::deep_copy(d_scalars, h_scalars);
 #ifdef KOKKOS_ENABLE_CUDA
    #define BINS_PER_BLOCK 2
    const int factor = atoms_per_bin<64?2:1;
@ -245,10 +269,9 @@ void NPairKokkos<DeviceType,HALF_NEIGH,GHOST,TRI,SIZE>::build(NeighList *list_)
 	}
      }
    }
-    deep_copy(data.h_resize, data.resize);
+    Kokkos::deep_copy(h_scalars, d_scalars);

    if(data.h_resize()) {
-      deep_copy(data.h_new_maxneighs, data.new_maxneighs);
      list->maxneighs = data.h_new_maxneighs() * 1.2;
      list->d_neighbors = typename ArrayTypes<DeviceType>::t_neighbors_2d("neighbors", list->d_neighbors.extent(0), list->maxneighs);
      data.neigh_list.d_neighbors = list->d_neighbors;
@ -488,7 +511,7 @@ void NeighborKokkosExecute<DeviceType>::build_ItemCuda(typename Kokkos::TeamPoli

  const int ibin = dev.league_rank()*BINS_PER_TEAM+MY_BIN;

-  if(ibin >=c_bincount.extent(0)) return;
+  if(ibin >= mbins) return;
  X_FLOAT* other_x = sharedmem;
  other_x = other_x + 5*atoms_per_bin*MY_BIN;

@ -924,7 +947,7 @@ void NeighborKokkosExecute<DeviceType>::build_ItemSizeCuda(typename Kokkos::Team

  const int ibin = dev.league_rank()*BINS_PER_TEAM+MY_BIN;

-  if(ibin >=c_bincount.extent(0)) return;
+  if(ibin >= mbins) return;
  X_FLOAT* other_x = sharedmem;
  other_x = other_x + 6*atoms_per_bin*MY_BIN;

--- a/src/KOKKOS/npair_kokkos.h
+++ b/src/KOKKOS/npair_kokkos.h
@ -95,6 +95,8 @@ namespace LAMMPS_NS {

 template<class DeviceType, int HALF_NEIGH, int GHOST, int TRI, int SIZE>
 class NPairKokkos : public NPair {
+  typedef ArrayTypes<DeviceType> AT;
+
 public:
  NPairKokkos(class LAMMPS *);
  ~NPairKokkos() {}
@ -105,6 +107,12 @@ class NPairKokkos : public NPair {

 private:
  int newton_pair;
+  typename AT::t_int_1d d_scalars;
+  HAT::t_int_1d h_scalars;
+  typename AT::t_int_scalar d_resize;
+  typename AT::t_int_scalar d_new_maxneighs;
+  HAT::t_int_scalar h_resize;
+  HAT::t_int_scalar h_new_maxneighs;

  // data from Neighbor class

@ -165,6 +173,7 @@ class NeighborKokkosExecute

  // data from NBin class

+  const int mbins;
  const typename AT::t_int_1d bincount;
  const typename AT::t_int_1d_const c_bincount;
  typename AT::t_int_2d bins;
@ -218,7 +227,7 @@ class NeighborKokkosExecute
                        const typename AT::t_int_1d &_bincount,
                        const typename AT::t_int_2d &_bins,
                        const typename AT::t_int_1d &_atom2bin,
-                        const int _nstencil,
+                        const int _mbins,const int _nstencil,
                        const typename AT::t_int_1d &_d_stencil,
                        const typename AT::t_int_1d_3 &_d_stencilxyz,
                        const int _nlocal,
@ -251,8 +260,12 @@ class NeighborKokkosExecute
                        const X_FLOAT *_bboxhi, const X_FLOAT* _bboxlo,
                        const int & _xperiodic, const int & _yperiodic, const int & _zperiodic,
                        const int & _xprd_half, const int & _yprd_half, const int & _zprd_half,
-                        const X_FLOAT _skin):
-    neigh_list(_neigh_list), cutneighsq(_cutneighsq),
+                        const X_FLOAT _skin,
+                        const typename AT::t_int_scalar _resize,
+                        const typename ArrayTypes<LMPHostType>::t_int_scalar _h_resize,
+                        const typename AT::t_int_scalar _new_maxneighs,
+                        const typename ArrayTypes<LMPHostType>::t_int_scalar _h_new_maxneighs):
+    neigh_list(_neigh_list), cutneighsq(_cutneighsq),mbins(_mbins),
    bincount(_bincount),c_bincount(_bincount),bins(_bins),c_bins(_bins),
    atom2bin(_atom2bin),c_atom2bin(_atom2bin),
    nstencil(_nstencil),d_stencil(_d_stencil),d_stencilxyz(_d_stencilxyz),
@ -272,7 +285,8 @@ class NeighborKokkosExecute
    ex_mol_intra(_ex_mol_intra),
    xperiodic(_xperiodic),yperiodic(_yperiodic),zperiodic(_zperiodic),
    xprd_half(_xprd_half),yprd_half(_yprd_half),zprd_half(_zprd_half),
-    skin(_skin) {
+    skin(_skin),resize(_resize),h_resize(_h_resize),
+    new_maxneighs(_new_maxneighs),h_new_maxneighs(_h_new_maxneighs) {

    if (molecular == 2) moltemplate = 1;
    else moltemplate = 0;
@ -280,20 +294,7 @@ class NeighborKokkosExecute
    bboxlo[0] = _bboxlo[0]; bboxlo[1] = _bboxlo[1]; bboxlo[2] = _bboxlo[2];
    bboxhi[0] = _bboxhi[0]; bboxhi[1] = _bboxhi[1]; bboxhi[2] = _bboxhi[2];

-    resize = typename AT::t_int_scalar("NeighborKokkosFunctor::resize");
-#ifndef KOKKOS_USE_CUDA_UVM
-    h_resize = Kokkos::create_mirror_view(resize);
-#else
-    h_resize = resize;
-#endif
    h_resize() = 1;
-    new_maxneighs = typename AT::
-      t_int_scalar("NeighborKokkosFunctor::new_maxneighs");
-#ifndef KOKKOS_USE_CUDA_UVM
-    h_new_maxneighs = Kokkos::create_mirror_view(new_maxneighs);
-#else
-    h_new_maxneighs = new_maxneighs;
-#endif
    h_new_maxneighs() = neigh_list.maxneighs;
  };

--- a/src/KOKKOS/pair_kokkos.h
+++ b/src/KOKKOS/pair_kokkos.h
@ -86,6 +86,7 @@ struct PairComputeFunctor  {
                          NeighListKokkos<device_type>* list_ptr):
  c(*c_ptr),list(*list_ptr) {
    // allocate duplicated memory
+    f = c.f;
    dup_f     = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, NeedDup<NEIGHFLAG,device_type>::value >(c.f);
    dup_eatom = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, NeedDup<NEIGHFLAG,device_type>::value >(c.d_eatom);
    dup_vatom = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, NeedDup<NEIGHFLAG,device_type>::value >(c.d_vatom);
@ -255,6 +256,328 @@ struct PairComputeFunctor  {
    return ev;
  }

+  // Use TeamPolicy, assume Newton off, Full Neighborlist, and no energy/virial
+  // Loop over neighbors of one atom without coulomb interaction
+  // This function is called in parallel
+  KOKKOS_FUNCTION
+  void compute_item_team(Kokkos::TeamPolicy<>::member_type team,
+                         const NeighListKokkos<device_type> &list, const NoCoulTag&) const {
+
+    const int inum = team.league_size();
+    const int atoms_per_team = team.team_size();
+    const int firstatom = team.league_rank()*atoms_per_team;
+    const int lastatom = firstatom + atoms_per_team < inum ? firstatom + atoms_per_team : inum;
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(team, firstatom, lastatom), [&] (const int &ii) {
+
+      const int i = list.d_ilist[ii];
+      const X_FLOAT xtmp = c.x(i,0);
+      const X_FLOAT ytmp = c.x(i,1);
+      const X_FLOAT ztmp = c.x(i,2);
+      const int itype = c.type(i);
+
+      const AtomNeighborsConst neighbors_i = list.get_neighbors_const(i);
+      const int jnum = list.d_numneigh[i];
+
+      t_scalar3<double> fsum;
+
+      Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,jnum),
+        [&] (const int jj, t_scalar3<double>& ftmp) {
+
+        int j = neighbors_i(jj);
+        const F_FLOAT factor_lj = c.special_lj[sbmask(j)];
+        j &= NEIGHMASK;
+        const X_FLOAT delx = xtmp - c.x(j,0);
+        const X_FLOAT dely = ytmp - c.x(j,1);
+        const X_FLOAT delz = ztmp - c.x(j,2);
+        const int jtype = c.type(j);
+        const F_FLOAT rsq = delx*delx + dely*dely + delz*delz;
+
+        if(rsq < (STACKPARAMS?c.m_cutsq[itype][jtype]:c.d_cutsq(itype,jtype))) {
+    
+          const F_FLOAT fpair = factor_lj*c.template compute_fpair<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype);
+
+          ftmp.x += delx*fpair;
+          ftmp.y += dely*fpair;
+          ftmp.z += delz*fpair;
+        }
+
+      },fsum);
+
+      Kokkos::single(Kokkos::PerThread(team), [&] (){
+        f(i,0) += fsum.x;
+        f(i,1) += fsum.y;
+        f(i,2) += fsum.z;
+      });
+
+    });
+  }
+
+  // Use TeamPolicy, assume Newton off, Full Neighborlist, and no energy/virial
+  // Loop over neighbors of one atom with coulomb interaction
+  // This function is called in parallel
+  KOKKOS_FUNCTION
+  void compute_item_team(Kokkos::TeamPolicy<>::member_type team,
+                         const NeighListKokkos<device_type> &list, const CoulTag& ) const {
+
+    const int inum = team.league_size();
+    const int atoms_per_team = team.team_size();
+    int firstatom = team.league_rank()*atoms_per_team;
+    int lastatom = firstatom + atoms_per_team < inum ? firstatom + atoms_per_team : inum;
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(team, firstatom, lastatom), [&] (const int &ii) {
+
+      const int i = list.d_ilist[ii];
+      const X_FLOAT xtmp = c.x(i,0);
+      const X_FLOAT ytmp = c.x(i,1);
+      const X_FLOAT ztmp = c.x(i,2);
+      const int itype = c.type(i);
+      const F_FLOAT qtmp = c.q(i);
+
+      const AtomNeighborsConst neighbors_i = list.get_neighbors_const(i);
+      const int jnum = list.d_numneigh[i];
+
+      t_scalar3<double> fsum;
+
+      Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,jnum),
+        [&] (const int jj, t_scalar3<double>& ftmp) {
+        int j = neighbors_i(jj);
+        const F_FLOAT factor_lj = c.special_lj[sbmask(j)];
+        const F_FLOAT factor_coul = c.special_coul[sbmask(j)];
+        j &= NEIGHMASK;
+        const X_FLOAT delx = xtmp - c.x(j,0);
+        const X_FLOAT dely = ytmp - c.x(j,1);
+        const X_FLOAT delz = ztmp - c.x(j,2);
+        const int jtype = c.type(j);
+        const F_FLOAT rsq = delx*delx + dely*dely + delz*delz;
+
+        if(rsq < (STACKPARAMS?c.m_cutsq[itype][jtype]:c.d_cutsq(itype,jtype))) {
+
+          F_FLOAT fpair = F_FLOAT();
+
+          if(rsq < (STACKPARAMS?c.m_cut_ljsq[itype][jtype]:c.d_cut_ljsq(itype,jtype)))
+            fpair+=factor_lj*c.template compute_fpair<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype);
+          if(rsq < (STACKPARAMS?c.m_cut_coulsq[itype][jtype]:c.d_cut_coulsq(itype,jtype)))
+            fpair+=c.template compute_fcoul<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype,factor_coul,qtmp);
+
+          ftmp.x += delx*fpair;
+          ftmp.y += dely*fpair;
+          ftmp.z += delz*fpair;
+        }
+      },fsum);
+
+      Kokkos::single(Kokkos::PerThread(team), [&] (){
+      f(i,0) += fsum.x;
+      f(i,1) += fsum.y;
+      f(i,2) += fsum.z;
+      });
+    });
+  }
+
+
+  // Use TeamPolicy, assume Newton off, Full Neighborlist, and energy/virial
+  // Loop over neighbors of one atom without coulomb interaction
+  // This function is called in parallel
+  KOKKOS_FUNCTION
+  EV_FLOAT compute_item_team_ev(Kokkos::TeamPolicy<>::member_type team,
+                                const NeighListKokkos<device_type> &list, const NoCoulTag&) const {
+
+    EV_FLOAT ev;
+
+    const int inum = team.league_size();
+    const int atoms_per_team = team.team_size();
+    const int firstatom = team.league_rank()*atoms_per_team;
+    const int lastatom = firstatom + atoms_per_team < inum ? firstatom + atoms_per_team : inum;
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(team, firstatom, lastatom), [&] (const int &ii) {
+
+      const int i = list.d_ilist[ii];
+      const X_FLOAT xtmp = c.x(i,0);
+      const X_FLOAT ytmp = c.x(i,1);
+      const X_FLOAT ztmp = c.x(i,2);
+      const int itype = c.type(i);
+
+      const AtomNeighborsConst neighbors_i = list.get_neighbors_const(i);
+      const int jnum = list.d_numneigh[i];
+
+      FEV_FLOAT fev;
+
+      Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,jnum),
+        [&] (const int jj, FEV_FLOAT& fev_tmp) {
+
+        int j = neighbors_i(jj);
+        const F_FLOAT factor_lj = c.special_lj[sbmask(j)];
+        j &= NEIGHMASK;
+        const X_FLOAT delx = xtmp - c.x(j,0);
+        const X_FLOAT dely = ytmp - c.x(j,1);
+        const X_FLOAT delz = ztmp - c.x(j,2);
+        const int jtype = c.type(j);
+        const F_FLOAT rsq = delx*delx + dely*dely + delz*delz;
+
+        if(rsq < (STACKPARAMS?c.m_cutsq[itype][jtype]:c.d_cutsq(itype,jtype))) {
+    
+          const F_FLOAT fpair = factor_lj*c.template compute_fpair<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype);
+
+          fev_tmp.f[0] += delx*fpair;
+          fev_tmp.f[1] += dely*fpair;
+          fev_tmp.f[2] += delz*fpair;
+
+          F_FLOAT evdwl = 0.0;
+          if (c.eflag) {
+            evdwl = factor_lj * c.template compute_evdwl<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype);
+            fev.evdwl += 0.5*evdwl;
+          }
+          if (c.vflag_either) {
+            fev.v[0] += 0.5*delx*delx*fpair;
+            fev.v[1] += 0.5*dely*dely*fpair;
+            fev.v[2] += 0.5*delz*delz*fpair;
+            fev.v[3] += 0.5*delx*dely*fpair;
+            fev.v[4] += 0.5*delx*delz*fpair;
+            fev.v[5] += 0.5*dely*delz*fpair;
+          }
+        }
+      },fev);
+
+      Kokkos::single(Kokkos::PerThread(team), [&] (){
+        f(i,0) += fev.f[0];
+        f(i,1) += fev.f[1];
+        f(i,2) += fev.f[2];
+
+        if (c.eflag_global)
+          ev.evdwl += fev.evdwl;
+
+        if (c.eflag_atom)
+          d_eatom(i,0) += fev.evdwl;
+
+        if (c.vflag_global) {
+          ev.v[0] += fev.v[0];
+          ev.v[1] += fev.v[1];
+          ev.v[2] += fev.v[2];
+          ev.v[3] += fev.v[3];
+          ev.v[4] += fev.v[4];
+          ev.v[5] += fev.v[5];
+        }
+
+        if (c.vflag_atom) {
+          d_vatom(i,0) += fev.v[0];
+          d_vatom(i,1) += fev.v[1];
+          d_vatom(i,2) += fev.v[2];
+          d_vatom(i,3) += fev.v[3];
+          d_vatom(i,4) += fev.v[4];
+          d_vatom(i,5) += fev.v[5];
+        }
+      });
+    });
+    return ev;
+  }
+
+  // Use TeamPolicy, assume Newton off, Full Neighborlist, and energy/virial
+  // Loop over neighbors of one atom with coulomb interaction
+  // This function is called in parallel
+  KOKKOS_FUNCTION
+  EV_FLOAT compute_item_team_ev(Kokkos::TeamPolicy<>::member_type team,
+                                const NeighListKokkos<device_type> &list, const CoulTag& ) const {
+
+    EV_FLOAT ev;
+
+    const int inum = team.league_size();
+    const int atoms_per_team = team.team_size();
+    int firstatom = team.league_rank()*atoms_per_team;
+    int lastatom = firstatom + atoms_per_team < inum ? firstatom + atoms_per_team : inum;
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(team, firstatom, lastatom), [&] (const int &ii) {
+
+      const int i = list.d_ilist[ii];
+      const X_FLOAT xtmp = c.x(i,0);
+      const X_FLOAT ytmp = c.x(i,1);
+      const X_FLOAT ztmp = c.x(i,2);
+      const int itype = c.type(i);
+      const F_FLOAT qtmp = c.q(i);
+
+      const AtomNeighborsConst neighbors_i = list.get_neighbors_const(i);
+      const int jnum = list.d_numneigh[i];
+
+      FEV_FLOAT fev;
+
+      Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,jnum),
+        [&] (const int jj, FEV_FLOAT& fev_tmp) {
+        int j = neighbors_i(jj);
+        const F_FLOAT factor_lj = c.special_lj[sbmask(j)];
+        const F_FLOAT factor_coul = c.special_coul[sbmask(j)];
+        j &= NEIGHMASK;
+        const X_FLOAT delx = xtmp - c.x(j,0);
+        const X_FLOAT dely = ytmp - c.x(j,1);
+        const X_FLOAT delz = ztmp - c.x(j,2);
+        const int jtype = c.type(j);
+        const F_FLOAT rsq = delx*delx + dely*dely + delz*delz;
+
+        if(rsq < (STACKPARAMS?c.m_cutsq[itype][jtype]:c.d_cutsq(itype,jtype))) {
+
+          F_FLOAT fpair = F_FLOAT();
+
+          if(rsq < (STACKPARAMS?c.m_cut_ljsq[itype][jtype]:c.d_cut_ljsq(itype,jtype)))
+            fpair+=factor_lj*c.template compute_fpair<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype);
+          if(rsq < (STACKPARAMS?c.m_cut_coulsq[itype][jtype]:c.d_cut_coulsq(itype,jtype)))
+            fpair+=c.template compute_fcoul<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype,factor_coul,qtmp);
+
+          fev.f[0] += delx*fpair;
+          fev.f[1] += dely*fpair;
+          fev.f[2] += delz*fpair;
+
+          F_FLOAT evdwl = 0.0;
+          F_FLOAT ecoul = 0.0;
+          if (c.eflag) {
+            if(rsq < (STACKPARAMS?c.m_cut_ljsq[itype][jtype]:c.d_cut_ljsq(itype,jtype))) {
+              evdwl = factor_lj * c.template compute_evdwl<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype);
+              ev.evdwl += 0.5*evdwl;
+            }
+            if(rsq < (STACKPARAMS?c.m_cut_coulsq[itype][jtype]:c.d_cut_coulsq(itype,jtype))) {
+              ecoul = c.template compute_ecoul<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype,factor_coul,qtmp);
+              ev.ecoul += 0.5*ecoul;
+            }
+          }
+          if (c.vflag) {
+            fev.v[0] += 0.5*delx*delx*fpair;
+            fev.v[1] += 0.5*dely*dely*fpair;
+            fev.v[2] += 0.5*delz*delz*fpair;
+            fev.v[3] += 0.5*delx*dely*fpair;
+            fev.v[4] += 0.5*delx*delz*fpair;
+            fev.v[5] += 0.5*dely*delz*fpair;
+          }
+        }
+      },fev);
+
+      Kokkos::single(Kokkos::PerThread(team), [&] (){
+        f(i,0) += fev.f[0];
+        f(i,1) += fev.f[1];
+        f(i,2) += fev.f[2];
+
+        if (c.eflag_global) {
+          ev.evdwl += fev.evdwl;
+          ev.ecoul += fev.ecoul;
+        }
+
+        if (c.eflag_atom)
+          d_eatom(i,0) += fev.evdwl + fev.ecoul;
+
+        if (c.vflag_global) {
+          ev.v[0] += fev.v[0];
+          ev.v[1] += fev.v[1];
+          ev.v[2] += fev.v[2];
+          ev.v[3] += fev.v[3];
+          ev.v[4] += fev.v[4];
+          ev.v[5] += fev.v[5];
+        }
+
+        if (c.vflag_atom) {
+          d_vatom(i,0) += fev.v[0];
+          d_vatom(i,1) += fev.v[1];
+          d_vatom(i,2) += fev.v[2];
+          d_vatom(i,3) += fev.v[3];
+          d_vatom(i,4) += fev.v[4];
+          d_vatom(i,5) += fev.v[5];
+        }
+      });
+    });
+    return ev;
+  }
+
  KOKKOS_INLINE_FUNCTION
    void ev_tally(EV_FLOAT &ev, const int &i, const int &j,
      const F_FLOAT &epair, const F_FLOAT &fpair, const F_FLOAT &delx,
@ -355,6 +678,16 @@ struct PairComputeFunctor  {
    else
      energy_virial += compute_item<1,0>(i,list,typename DoCoul<PairStyle::COUL_FLAG>::type());
  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const typename Kokkos::TeamPolicy<>::member_type& team) const {
+    compute_item_team(team,list,typename DoCoul<PairStyle::COUL_FLAG>::type());
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const typename Kokkos::TeamPolicy<>::member_type& team, value_type &energy_virial) const {
+    energy_virial += compute_item_team_ev(team,list,typename DoCoul<PairStyle::COUL_FLAG>::type());
+  }
 };

 template <class PairStyle, bool STACKPARAMS, class Specialisation>
@ -489,6 +822,15 @@ struct PairComputeFunctor<PairStyle,N2,STACKPARAMS,Specialisation>  {
  void operator()(const int i, value_type &energy_virial) const {
    energy_virial += compute_item<1,0>(i,list,typename DoCoul<PairStyle::COUL_FLAG>::type());
  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const typename Kokkos::TeamPolicy<>::member_type& team) const
+  {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const typename Kokkos::TeamPolicy<>::member_type& team, value_type &energy_virial) const
+  {}
+
 };

 // Filter out Neighflags which are not supported for PairStyle
@ -507,20 +849,57 @@ EV_FLOAT pair_compute_neighlist (PairStyle* fpair, typename Kokkos::Impl::enable
  return ev;
 }

+template<class FunctorStyle>
+int GetTeamSize(FunctorStyle& functor, int team_size, int vector_length) {
+    int team_size_max = Kokkos::TeamPolicy<>::team_size_max(functor);
+
+#ifdef KOKKOS_ENABLE_CUDA
+    if(team_size*vector_length > team_size_max)
+      team_size = team_size_max/vector_length;
+#else
+    team_size = 1;
+#endif
+    return team_size;
+}
+
 // Submit ParallelFor for NEIGHFLAG=HALF,HALFTHREAD,FULL,N2
 template<class PairStyle, unsigned NEIGHFLAG, class Specialisation>
 EV_FLOAT pair_compute_neighlist (PairStyle* fpair, typename Kokkos::Impl::enable_if<(NEIGHFLAG&PairStyle::EnabledNeighFlags) != 0, NeighListKokkos<typename PairStyle::device_type>*>::type list) {
  EV_FLOAT ev;
-  if(fpair->atom->ntypes > MAX_TYPES_STACKPARAMS) {
-    PairComputeFunctor<PairStyle,NEIGHFLAG,false,Specialisation > ff(fpair,list);
-    if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(list->inum,ff,ev);
-    else                              Kokkos::parallel_for(list->inum,ff);
-    ff.contribute();
+
+  if (!fpair->lmp->kokkos->neigh_thread_set)
+    if (list->inum <= 16384 && NEIGHFLAG == FULL)
+      fpair->lmp->kokkos->neigh_thread = 1;
+
+  if (fpair->lmp->kokkos->neigh_thread) {
+    int vector_length = 8;
+    int atoms_per_team = 32;
+
+    if(fpair->atom->ntypes > MAX_TYPES_STACKPARAMS) {
+      PairComputeFunctor<PairStyle,NEIGHFLAG,false,Specialisation > ff(fpair,list);
+      atoms_per_team = GetTeamSize(ff, atoms_per_team, vector_length);
+      Kokkos::TeamPolicy<Kokkos::IndexType<int> > policy(list->inum,atoms_per_team,vector_length);
+      if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(policy,ff,ev);
+      else                              Kokkos::parallel_for(policy,ff);
+    } else {
+      PairComputeFunctor<PairStyle,NEIGHFLAG,true,Specialisation > ff(fpair,list);
+      atoms_per_team = GetTeamSize(ff, atoms_per_team, vector_length);
+      Kokkos::TeamPolicy<Kokkos::IndexType<int> > policy(list->inum,atoms_per_team,vector_length);
+      if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(policy,ff,ev);
+      else                              Kokkos::parallel_for(policy,ff);
+    }
  } else {
-    PairComputeFunctor<PairStyle,NEIGHFLAG,true,Specialisation > ff(fpair,list);
-    if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(list->inum,ff,ev);
-    else                              Kokkos::parallel_for(list->inum,ff);
-    ff.contribute();
+    if(fpair->atom->ntypes > MAX_TYPES_STACKPARAMS) {
+      PairComputeFunctor<PairStyle,NEIGHFLAG,false,Specialisation > ff(fpair,list);
+      if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(list->inum,ff,ev);
+      else                              Kokkos::parallel_for(list->inum,ff);
+      ff.contribute();
+    } else {
+      PairComputeFunctor<PairStyle,NEIGHFLAG,true,Specialisation > ff(fpair,list);
+      if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(list->inum,ff,ev);
+      else                              Kokkos::parallel_for(list->inum,ff);
+      ff.contribute();
+    }
  }
  return ev;
 }
--- a/src/KOKKOS/verlet_kokkos.cpp
+++ b/src/KOKKOS/verlet_kokkos.cpp
@ -93,7 +93,6 @@ void VerletKokkos::setup(int flag)
  }

  update->setupflag = 1;
-  lmp->kokkos->auto_sync = 0;

  // setup domain, communication and neighboring
  // acquire ghosts
@ -187,6 +186,7 @@ void VerletKokkos::setup(int flag)
  }
  if (force->newton) comm->reverse_comm();

+  lmp->kokkos->auto_sync = 0;
  modify->setup(vflag);
  output->setup(flag);
  lmp->kokkos->auto_sync = 1;
@ -202,7 +202,6 @@ void VerletKokkos::setup(int flag)
 void VerletKokkos::setup_minimal(int flag)
 {
  update->setupflag = 1;
-  lmp->kokkos->auto_sync = 0;

  // setup domain, communication and neighboring
  // acquire ghosts
@ -294,7 +293,6 @@ void VerletKokkos::setup_minimal(int flag)
  if (force->newton) comm->reverse_comm();

  modify->setup(vflag);
-  lmp->kokkos->auto_sync = 1;
  update->setupflag = 0;
 }

--- a/src/neighbor.h
+++ b/src/neighbor.h
@ -126,6 +126,8 @@ class Neighbor : protected Pointers {

  bigint memory_usage();

+  bigint last_setup_bins;          // step of last neighbor::setup_bins() call
+
 protected:
  int me,nprocs;
  int firsttime;                   // flag for calling init_styles() only once
@ -139,8 +141,6 @@ class Neighbor : protected Pointers {
  int fix_check;                   // # of fixes that induce reneigh
  int *fixchecklist;               // which fixes to check

-  bigint last_setup_bins;          // step of last neighbor::setup_bins() call
-
  double triggersq;                // trigger = build when atom moves this dist

  double **xhold;                      // atom coords at last neighbor build