From 1d939231a49dde46e9e1d18084f23fb189e8e055 Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Mon, 21 Nov 2016 12:21:01 -0500
Subject: [PATCH 001/267] USER-DPD: initial Kokkos port, first steps from Aug
 24th ARL Kokkos hackathon atom_vec_dpd_kokkos pair_dpd_fdt_energy_kokkos
 without the Oct 7th VV support from e27ed6c

---
 src/KOKKOS/atom_kokkos.h                    |    6 +
 src/USER-DPD/atom_vec_dpd_kokkos.cpp        | 1872 +++++++++++++++++++
 src/USER-DPD/atom_vec_dpd_kokkos.h          |  135 ++
 src/USER-DPD/pair_dpd_fdt_energy_kokkos.cpp |  373 ++++
 src/USER-DPD/pair_dpd_fdt_energy_kokkos.h   |  119 ++
 5 files changed, 2505 insertions(+)
 create mode 100644 src/USER-DPD/atom_vec_dpd_kokkos.cpp
 create mode 100644 src/USER-DPD/atom_vec_dpd_kokkos.h
 create mode 100644 src/USER-DPD/pair_dpd_fdt_energy_kokkos.cpp
 create mode 100644 src/USER-DPD/pair_dpd_fdt_energy_kokkos.h

diff --git a/src/KOKKOS/atom_kokkos.h b/src/KOKKOS/atom_kokkos.h
index 05aae712d9..f31c26e01f 100644
--- a/src/KOKKOS/atom_kokkos.h
+++ b/src/KOKKOS/atom_kokkos.h
@@ -51,6 +51,12 @@ class AtomKokkos : public Atom {
   DAT::tdual_int_2d k_improper_type;
   DAT::tdual_tagint_2d k_improper_atom1, k_improper_atom2, k_improper_atom3, k_improper_atom4;
 
+
+// USER-DPD package: per-atom DPD scalars; each has a .d_view and .h_view side
+  DAT::tdual_efloat_1d k_uCond, k_uMech, k_uChem, k_uCG, k_uCGnew,
+                       k_rho,k_dpdTheta,k_duChem;
+
+
   AtomKokkos(class LAMMPS *);
   ~AtomKokkos();
 
diff --git a/src/USER-DPD/atom_vec_dpd_kokkos.cpp b/src/USER-DPD/atom_vec_dpd_kokkos.cpp
new file mode 100644
index 0000000000..c58b592e53
--- /dev/null
+++ b/src/USER-DPD/atom_vec_dpd_kokkos.cpp
@@ -0,0 +1,1872 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include <stdlib.h>
+#include "atom_vec_dpd_kokkos.h"
+#include "atom_kokkos.h"
+#include "comm_kokkos.h"
+#include "domain.h"
+#include "modify.h"
+#include "fix.h"
+#include "atom_masks.h"
+#include "memory.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+
+#define DELTA 10000
+
+/* ---------------------------------------------------------------------- */
+
+AtomVecDPDKokkos::AtomVecDPDKokkos(LAMMPS *lmp) : AtomVecKokkos(lmp)
+{
+  // atom-style flags
+  mass_type = 1;
+  molecular = 0;
+  comm_f_only = 0;
+  comm_x_only = 0;
+
+  // per-atom value counts; they match what the pack/unpack routines in this file move
+  size_forward = 7;    // x(3) + dpdTheta,uCond,uMech,uChem
+  size_reverse = 3;    // f(3)
+  size_border = 12;    // x(3),tag,type,mask + dpdTheta,uCond,uMech,uChem,uCG,uCGnew
+  size_velocity = 3;
+  size_data_atom = 6;
+  size_data_vel = 4;
+  xcol_data = 4;
+  atom->dpd_flag = atom->rho_flag = 1;
+  k_count = DAT::tdual_int_1d("atom::k_count",1);
+  commKK = (CommKokkos *) comm;
+  atomKK = (AtomKokkos *) atom;
+}
+
+/* ----------------------------------------------------------------------
+   grow atom arrays
+   n = 0 grows arrays by DELTA
+   n > 0 allocates arrays to size n
+------------------------------------------------------------------------- */
+
+void AtomVecDPDKokkos::grow(int n)
+{
+  // n == 0 asks for one more DELTA-sized increment, any other n is the new size
+  nmax = (n == 0) ? nmax + DELTA : n;
+  atomKK->nmax = nmax;
+  if (nmax < 0 || nmax > MAXSMALLINT)
+    error->one(FLERR,"Per-processor system is too big");
+
+  sync(Device,ALL_MASK);
+  modified(Device,ALL_MASK);
+
+  // identity, position, velocity and force arrays
+  memory->grow_kokkos(atomKK->k_tag,atomKK->tag,nmax,"atom:tag");
+  memory->grow_kokkos(atomKK->k_type,atomKK->type,nmax,"atom:type");
+  memory->grow_kokkos(atomKK->k_mask,atomKK->mask,nmax,"atom:mask");
+  memory->grow_kokkos(atomKK->k_image,atomKK->image,nmax,"atom:image");
+  memory->grow_kokkos(atomKK->k_x,atomKK->x,nmax,3,"atom:x");
+  memory->grow_kokkos(atomKK->k_v,atomKK->v,nmax,3,"atom:v");
+  memory->grow_kokkos(atomKK->k_f,atomKK->f,nmax,3,"atom:f");
+
+  // USER-DPD per-atom scalars
+  memory->grow_kokkos(atomKK->k_rho,atomKK->rho,nmax,"atom:rho");
+  memory->grow_kokkos(atomKK->k_dpdTheta,atomKK->dpdTheta,nmax,"atom:dpdTheta");
+  memory->grow_kokkos(atomKK->k_uCond,atomKK->uCond,nmax,"atom:uCond");
+  memory->grow_kokkos(atomKK->k_uMech,atomKK->uMech,nmax,"atom:uMech");
+  memory->grow_kokkos(atomKK->k_uChem,atomKK->uChem,nmax,"atom:uChem");
+  memory->grow_kokkos(atomKK->k_uCG,atomKK->uCG,nmax,"atom:uCG");
+  memory->grow_kokkos(atomKK->k_uCGnew,atomKK->uCGnew,nmax,"atom:uCGnew");
+  memory->grow_kokkos(atomKK->k_duChem,atomKK->duChem,nmax,"atom:duChem");
+
+  // re-cache the pointers and views of the resized arrays before the host sync
+  grow_reset();
+  sync(Host,ALL_MASK);
+
+  for (int iextra = 0; iextra < atom->nextra_grow; iextra++)
+    modify->fix[atom->extra_grow[iextra]]->grow_arrays(nmax);
+}
+
+/* ----------------------------------------------------------------------
+   reset local array ptrs
+------------------------------------------------------------------------- */
+
+void AtomVecDPDKokkos::grow_reset()
+{
+  // plain array pointers stored on atomKK
+  tag = atomKK->tag;
+  type = atomKK->type;
+  mask = atomKK->mask;
+  image = atomKK->image;
+  x = atomKK->x;
+  v = atomKK->v;
+  f = atomKK->f;
+  rho = atomKK->rho;
+  dpdTheta = atomKK->dpdTheta;
+  uCond = atomKK->uCond;
+  uMech = atomKK->uMech;
+  uChem = atomKK->uChem;
+  uCG = atomKK->uCG;
+  uCGnew = atomKK->uCGnew;
+  duChem = atomKK->duChem;
+  // d_view side of each dual array
+  d_tag = atomKK->k_tag.d_view;
+  d_type = atomKK->k_type.d_view;
+  d_mask = atomKK->k_mask.d_view;
+  d_image = atomKK->k_image.d_view;
+  d_x = atomKK->k_x.d_view;
+  d_v = atomKK->k_v.d_view;
+  d_f = atomKK->k_f.d_view;
+  d_rho = atomKK->k_rho.d_view;
+  d_dpdTheta = atomKK->k_dpdTheta.d_view;
+  d_uCond = atomKK->k_uCond.d_view;
+  d_uMech = atomKK->k_uMech.d_view;
+  d_uChem = atomKK->k_uChem.d_view;
+  d_uCG = atomKK->k_uCG.d_view;
+  d_uCGnew = atomKK->k_uCGnew.d_view;
+  d_duChem = atomKK->k_duChem.d_view;
+  // h_view side of each dual array
+  h_tag = atomKK->k_tag.h_view;
+  h_type = atomKK->k_type.h_view;
+  h_mask = atomKK->k_mask.h_view;
+  h_image = atomKK->k_image.h_view;
+  h_x = atomKK->k_x.h_view;
+  h_v = atomKK->k_v.h_view;
+  h_f = atomKK->k_f.h_view;
+  h_rho = atomKK->k_rho.h_view;
+  h_dpdTheta = atomKK->k_dpdTheta.h_view;
+  h_uCond = atomKK->k_uCond.h_view;
+  h_uMech = atomKK->k_uMech.h_view;
+  h_uChem = atomKK->k_uChem.h_view;
+  h_uCG = atomKK->k_uCG.h_view;
+  h_uCGnew = atomKK->k_uCGnew.h_view;
+  h_duChem = atomKK->k_duChem.h_view;
+}
+
+/* ----------------------------------------------------------------------
+   copy atom I info to atom J
+------------------------------------------------------------------------- */
+
+void AtomVecDPDKokkos::copy(int i, int j, int delflag)
+{
+  // per-atom bookkeeping fields
+  h_tag[j] = h_tag[i];
+  h_type[j] = h_type[i];
+  mask[j] = mask[i];
+  h_image[j] = h_image[i];
+  // position and velocity, component by component
+  for (int d = 0; d < 3; d++) {
+    h_x(j,d) = h_x(i,d);
+    h_v(j,d) = h_v(i,d);
+  }
+  // DPD scalars handled here; rho and duChem are not copied by this routine
+  h_dpdTheta[j] = h_dpdTheta[i];
+  h_uCond[j] = h_uCond[i];
+  h_uMech[j] = h_uMech[i];
+  h_uChem[j] = h_uChem[i];
+  h_uCG[j] = h_uCG[i];
+  h_uCGnew[j] = h_uCGnew[i];
+  // every fix listed in extra_grow copies its own per-atom data
+  for (int iextra = 0; iextra < atom->nextra_grow; iextra++)
+    modify->fix[atom->extra_grow[iextra]]->copy_arrays(i,j,delflag);
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Forward-comm pack functor: for send-list entry i (atom j) it stores the optionally
+// PBC-shifted position plus dpdTheta, uCond, uMech and uChem in row i of _buf.
+template<class DeviceType,int PBC_FLAG,int TRICLINIC>
+struct AtomVecDPDKokkos_PackComm {
+  typedef DeviceType device_type;
+  typename ArrayTypes<DeviceType>::t_x_array_randomread _x;
+  typename ArrayTypes<DeviceType>::t_efloat_1d _dpdTheta,_uCond,_uMech,_uChem;
+  typename ArrayTypes<DeviceType>::t_xfloat_2d_um _buf;
+  typename ArrayTypes<DeviceType>::t_int_2d_const _list;
+  const int _iswap;
+  X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz;
+  X_FLOAT _pbc[6];
+
+  AtomVecDPDKokkos_PackComm(
+      const typename DAT::tdual_x_array &x,
+      const typename DAT::tdual_efloat_1d &dpdTheta,
+      const typename DAT::tdual_efloat_1d &uCond,
+      const typename DAT::tdual_efloat_1d &uMech,
+      const typename DAT::tdual_efloat_1d &uChem,
+      const typename DAT::tdual_xfloat_2d &buf,
+      const typename DAT::tdual_int_2d &list,
+      const int & iswap,
+      const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd,
+      const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc):
+      _x(x.view<DeviceType>()),
+      _dpdTheta(dpdTheta.view<DeviceType>()),
+      _uCond(uCond.view<DeviceType>()),
+      _uMech(uMech.view<DeviceType>()),
+      _uChem(uChem.view<DeviceType>()),
+      _list(list.view<DeviceType>()),_iswap(iswap),
+      _xprd(xprd),_yprd(yprd),_zprd(zprd),_xy(xy),_xz(xz),_yz(yz) {
+        const size_t elements = 7;  // one row per atom must hold columns 0..6 written by operator()
+        const size_t maxsend = (buf.view<DeviceType>().dimension_0()*buf.view<DeviceType>().dimension_1())/elements;
+        buffer_view<DeviceType>(_buf,buf,maxsend,elements);
+        _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2];
+        _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5];
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i) const {
+      const int j = _list(_iswap,i);
+      if (PBC_FLAG == 0) {
+          _buf(i,0) = _x(j,0);
+          _buf(i,1) = _x(j,1);
+          _buf(i,2) = _x(j,2);
+      } else {
+        if (TRICLINIC == 0) {
+          _buf(i,0) = _x(j,0) + _pbc[0]*_xprd;
+          _buf(i,1) = _x(j,1) + _pbc[1]*_yprd;
+          _buf(i,2) = _x(j,2) + _pbc[2]*_zprd;
+        } else {
+          _buf(i,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz;
+          _buf(i,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz;
+          _buf(i,2) = _x(j,2) + _pbc[2]*_zprd;
+        }
+      }
+      _buf(i,3) = _dpdTheta(j);
+      _buf(i,4) = _uCond(j);
+      _buf(i,5) = _uMech(j);
+      _buf(i,6) = _uChem(j);
+  }
+};
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecDPDKokkos::pack_comm_kokkos(const int &n,
+                                          const DAT::tdual_int_2d &list,
+                                          const int & iswap,
+                                          const DAT::tdual_xfloat_2d &buf,
+                                          const int &pbc_flag,
+                                          const int* const pbc)
+{
+  // Runs the PackComm functor over the n entries of send list iswap and returns
+  // n*size_forward.  The functor variant is selected from
+  // <execution space, PBC shift yes/no, triclinic yes/no>.
+  // NOTE(review): only X_MASK is synced although the functor also reads the DPD
+  // scalars -- confirm sync() with X_MASK covers them.
+
+  const bool shifted = (pbc_flag != 0);
+  const bool tilted = (domain->triclinic != 0);
+
+  if (commKK->forward_comm_on_host) {
+    // forward communication is forced onto the host
+    sync(Host,X_MASK);
+    if (shifted && tilted) {
+      AtomVecDPDKokkos_PackComm<LMPHostType,1,1> packer(atomKK->k_x,
+        atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,
+        buf,list,iswap,
+        domain->xprd,domain->yprd,domain->zprd,
+        domain->xy,domain->xz,domain->yz,pbc);
+      Kokkos::parallel_for(n,packer);
+    } else if (shifted) {
+      AtomVecDPDKokkos_PackComm<LMPHostType,1,0> packer(atomKK->k_x,
+        atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,
+        buf,list,iswap,
+        domain->xprd,domain->yprd,domain->zprd,
+        domain->xy,domain->xz,domain->yz,pbc);
+      Kokkos::parallel_for(n,packer);
+    } else if (tilted) {
+      AtomVecDPDKokkos_PackComm<LMPHostType,0,1> packer(atomKK->k_x,
+        atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,
+        buf,list,iswap,
+        domain->xprd,domain->yprd,domain->zprd,
+        domain->xy,domain->xz,domain->yz,pbc);
+      Kokkos::parallel_for(n,packer);
+    } else {
+      AtomVecDPDKokkos_PackComm<LMPHostType,0,0> packer(atomKK->k_x,
+        atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,
+        buf,list,iswap,
+        domain->xprd,domain->yprd,domain->zprd,
+        domain->xy,domain->xz,domain->yz,pbc);
+      Kokkos::parallel_for(n,packer);
+    }
+    LMPHostType::fence();
+  } else {
+    // otherwise pack on the device
+    sync(Device,X_MASK);
+    if (shifted && tilted) {
+      AtomVecDPDKokkos_PackComm<LMPDeviceType,1,1> packer(atomKK->k_x,
+        atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,
+        buf,list,iswap,
+        domain->xprd,domain->yprd,domain->zprd,
+        domain->xy,domain->xz,domain->yz,pbc);
+      Kokkos::parallel_for(n,packer);
+    } else if (shifted) {
+      AtomVecDPDKokkos_PackComm<LMPDeviceType,1,0> packer(atomKK->k_x,
+        atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,
+        buf,list,iswap,
+        domain->xprd,domain->yprd,domain->zprd,
+        domain->xy,domain->xz,domain->yz,pbc);
+      Kokkos::parallel_for(n,packer);
+    } else if (tilted) {
+      AtomVecDPDKokkos_PackComm<LMPDeviceType,0,1> packer(atomKK->k_x,
+        atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,
+        buf,list,iswap,
+        domain->xprd,domain->yprd,domain->zprd,
+        domain->xy,domain->xz,domain->yz,pbc);
+      Kokkos::parallel_for(n,packer);
+    } else {
+      AtomVecDPDKokkos_PackComm<LMPDeviceType,0,0> packer(atomKK->k_x,
+        atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,
+        buf,list,iswap,
+        domain->xprd,domain->yprd,domain->zprd,
+        domain->xy,domain->xz,domain->yz,pbc);
+      Kokkos::parallel_for(n,packer);
+    }
+    LMPDeviceType::fence();
+  }
+
+  return n*size_forward;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Self-exchange functor for forward comm: copies the data of atom _list(_iswap,i)
+// straight into slot i+_nfirst of the same arrays, with no intermediate buffer.
+template<class DeviceType,int PBC_FLAG,int TRICLINIC>
+struct AtomVecDPDKokkos_PackCommSelf {
+  typedef DeviceType device_type;
+  typename ArrayTypes<DeviceType>::t_x_array_randomread _x;
+  typename ArrayTypes<DeviceType>::t_x_array _xw;
+  typename ArrayTypes<DeviceType>::t_efloat_1d _dpdTheta,_uCond,_uMech,_uChem;
+  int _nfirst;
+  typename ArrayTypes<DeviceType>::t_int_2d_const _list;
+  const int _iswap;
+  X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz;
+  X_FLOAT _pbc[6];
+
+  AtomVecDPDKokkos_PackCommSelf(
+      const typename DAT::tdual_x_array &x,
+      const typename DAT::tdual_efloat_1d &dpdTheta,
+      const typename DAT::tdual_efloat_1d &uCond,
+      const typename DAT::tdual_efloat_1d &uMech,
+      const typename DAT::tdual_efloat_1d &uChem,
+      const int &nfirst,
+      const typename DAT::tdual_int_2d &list,
+      const int & iswap,
+      const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd,
+      const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc):
+      _x(x.view<DeviceType>()),_xw(x.view<DeviceType>()),
+      _dpdTheta(dpdTheta.view<DeviceType>()),
+      _uCond(uCond.view<DeviceType>()),
+      _uMech(uMech.view<DeviceType>()),
+      _uChem(uChem.view<DeviceType>()),
+      _nfirst(nfirst),_list(list.view<DeviceType>()),_iswap(iswap),
+      _xprd(xprd),_yprd(yprd),_zprd(zprd),_xy(xy),_xz(xz),_yz(yz) {
+        // keep a private copy of the six integer image flags
+        for (int k = 0; k < 6; k++) _pbc[k] = pbc[k];
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i) const {
+      const int src = _list(_iswap,i);
+      const int dst = i+_nfirst;
+      // _x and _xw view the same positions: read access vs. write access
+      if (PBC_FLAG == 0) {
+        _xw(dst,0) = _x(src,0);
+        _xw(dst,1) = _x(src,1);
+        _xw(dst,2) = _x(src,2);
+      } else if (TRICLINIC == 0) {
+        _xw(dst,0) = _x(src,0) + _pbc[0]*_xprd;
+        _xw(dst,1) = _x(src,1) + _pbc[1]*_yprd;
+        _xw(dst,2) = _x(src,2) + _pbc[2]*_zprd;
+      } else {
+        _xw(dst,0) = _x(src,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz;
+        _xw(dst,1) = _x(src,1) + _pbc[1]*_yprd + _pbc[3]*_yz;
+        _xw(dst,2) = _x(src,2) + _pbc[2]*_zprd;
+      }
+      _dpdTheta(dst) = _dpdTheta(src);
+      _uCond(dst) = _uCond(src);
+      _uMech(dst) = _uMech(src);
+      _uChem(dst) = _uChem(src);
+  }
+};
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecDPDKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &list, const int & iswap,
+                                     const int nfirst, const int &pbc_flag, const int* const pbc) {
+  // Runs the PackCommSelf functor over the n entries of send list iswap; the
+  // functor variant is selected from <execution space, PBC shift, triclinic>.
+  // NOTE(review): sync/modified use X_MASK only although the functor also writes
+  // dpdTheta, uCond, uMech and uChem -- confirm those are covered.
+  const bool shifted = (pbc_flag != 0);
+  const bool tilted = (domain->triclinic != 0);
+
+  if (commKK->forward_comm_on_host) {
+    sync(Host,X_MASK);
+    modified(Host,X_MASK);
+    if (shifted && tilted) {
+      AtomVecDPDKokkos_PackCommSelf<LMPHostType,1,1> copier(atomKK->k_x,
+        atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,
+        nfirst,list,iswap,
+        domain->xprd,domain->yprd,domain->zprd,
+        domain->xy,domain->xz,domain->yz,pbc);
+      Kokkos::parallel_for(n,copier);
+    } else if (shifted) {
+      AtomVecDPDKokkos_PackCommSelf<LMPHostType,1,0> copier(atomKK->k_x,
+        atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,
+        nfirst,list,iswap,
+        domain->xprd,domain->yprd,domain->zprd,
+        domain->xy,domain->xz,domain->yz,pbc);
+      Kokkos::parallel_for(n,copier);
+    } else if (tilted) {
+      AtomVecDPDKokkos_PackCommSelf<LMPHostType,0,1> copier(atomKK->k_x,
+        atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,
+        nfirst,list,iswap,
+        domain->xprd,domain->yprd,domain->zprd,
+        domain->xy,domain->xz,domain->yz,pbc);
+      Kokkos::parallel_for(n,copier);
+    } else {
+      AtomVecDPDKokkos_PackCommSelf<LMPHostType,0,0> copier(atomKK->k_x,
+        atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,
+        nfirst,list,iswap,
+        domain->xprd,domain->yprd,domain->zprd,
+        domain->xy,domain->xz,domain->yz,pbc);
+      Kokkos::parallel_for(n,copier);
+    }
+    LMPHostType::fence();
+  } else {
+    sync(Device,X_MASK);
+    modified(Device,X_MASK);
+    if (shifted && tilted) {
+      AtomVecDPDKokkos_PackCommSelf<LMPDeviceType,1,1> copier(atomKK->k_x,
+        atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,
+        nfirst,list,iswap,
+        domain->xprd,domain->yprd,domain->zprd,
+        domain->xy,domain->xz,domain->yz,pbc);
+      Kokkos::parallel_for(n,copier);
+    } else if (shifted) {
+      AtomVecDPDKokkos_PackCommSelf<LMPDeviceType,1,0> copier(atomKK->k_x,
+        atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,
+        nfirst,list,iswap,
+        domain->xprd,domain->yprd,domain->zprd,
+        domain->xy,domain->xz,domain->yz,pbc);
+      Kokkos::parallel_for(n,copier);
+    } else if (tilted) {
+      AtomVecDPDKokkos_PackCommSelf<LMPDeviceType,0,1> copier(atomKK->k_x,
+        atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,
+        nfirst,list,iswap,
+        domain->xprd,domain->yprd,domain->zprd,
+        domain->xy,domain->xz,domain->yz,pbc);
+      Kokkos::parallel_for(n,copier);
+    } else {
+      AtomVecDPDKokkos_PackCommSelf<LMPDeviceType,0,0> copier(atomKK->k_x,
+        atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,
+        nfirst,list,iswap,
+        domain->xprd,domain->yprd,domain->zprd,
+        domain->xy,domain->xz,domain->yz,pbc);
+      Kokkos::parallel_for(n,copier);
+    }
+    LMPDeviceType::fence();
+  }
+  // NOTE(review): returns n*3 although 7 values per atom are copied -- confirm with callers
+  return n*3;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Forward-comm unpack functor: row i of the receive buffer supplies the position
+// and dpdTheta, uCond, uMech, uChem of atom i+_first.
+template<class DeviceType>
+struct AtomVecDPDKokkos_UnpackComm {
+  typedef DeviceType device_type;
+  typename ArrayTypes<DeviceType>::t_x_array _x;
+  typename ArrayTypes<DeviceType>::t_efloat_1d _dpdTheta,_uCond,_uMech,_uChem;
+  typename ArrayTypes<DeviceType>::t_xfloat_2d_um _buf;
+  int _first;
+
+  AtomVecDPDKokkos_UnpackComm(
+      const typename DAT::tdual_x_array &x,
+      const typename DAT::tdual_efloat_1d &dpdTheta,
+      const typename DAT::tdual_efloat_1d &uCond,
+      const typename DAT::tdual_efloat_1d &uMech,
+      const typename DAT::tdual_efloat_1d &uChem,
+      const typename DAT::tdual_xfloat_2d &buf,
+      const int& first):_x(x.view<DeviceType>()),
+                        _dpdTheta(dpdTheta.view<DeviceType>()),_uCond(uCond.view<DeviceType>()),
+                        _uMech(uMech.view<DeviceType>()),_uChem(uChem.view<DeviceType>()),
+                        _first(first) {
+        const size_t elements = 7;  // read rows of 7 values, the count operator() consumes per atom
+        const size_t maxrecv = (buf.view<DeviceType>().dimension_0()*buf.view<DeviceType>().dimension_1())/elements;
+        buffer_view<DeviceType>(_buf,buf,maxrecv,elements);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i) const {
+      for (int d = 0; d < 3; d++) _x(i+_first,d) = _buf(i,d);
+      _dpdTheta(i+_first) = _buf(i,3);
+      _uCond(i+_first) = _buf(i,4);
+      _uMech(i+_first) = _buf(i,5);
+      _uChem(i+_first) = _buf(i,6);
+  }
+};
+
+/* ---------------------------------------------------------------------- */
+
+// Unpack n forward-comm atoms from buf into slots first..first+n-1, on host or device.
+// NOTE(review): only X_MASK is synced/modified although DPD scalars are written -- confirm.
+void AtomVecDPDKokkos::unpack_comm_kokkos(const int &n, const int &first,
+    const DAT::tdual_xfloat_2d &buf ) {
+  if(commKK->forward_comm_on_host) {
+    sync(Host,X_MASK);
+    modified(Host,X_MASK);
+    struct AtomVecDPDKokkos_UnpackComm<LMPHostType> f(atomKK->k_x,
+    atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,buf,first);
+    Kokkos::parallel_for(n,f);
+    LMPHostType::fence();  // fence the space the kernel ran in, as the host pack paths do
+  } else {
+    sync(Device,X_MASK);
+    modified(Device,X_MASK);
+    struct AtomVecDPDKokkos_UnpackComm<LMPDeviceType> f(atomKK->k_x,
+    atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,buf,first);
+    Kokkos::parallel_for(n,f);
+    LMPDeviceType::fence();
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecDPDKokkos::pack_comm(int n, int *list, double *buf,
+                             int pbc_flag, int *pbc)
+{
+  // Packs 7 doubles per listed atom (x,y,z,dpdTheta,uCond,uMech,uChem) into buf and
+  // returns the count written.  Both branches read through the host views.
+  int m = 0;
+  if (pbc_flag == 0) {
+    for (int i = 0; i < n; i++) {
+      const int j = list[i];
+      buf[m++] = h_x(j,0);
+      buf[m++] = h_x(j,1);
+      buf[m++] = h_x(j,2);
+      buf[m++] = h_dpdTheta[j];
+      buf[m++] = h_uCond[j];
+      buf[m++] = h_uMech[j];
+      buf[m++] = h_uChem[j];
+    }
+    return m;
+  }
+  double dx,dy,dz;  // periodic shift added to every packed position
+  if (domain->triclinic == 0) {
+    dx = pbc[0]*domain->xprd;
+    dy = pbc[1]*domain->yprd;
+    dz = pbc[2]*domain->zprd;
+  } else {
+    dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
+    dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
+    dz = pbc[2]*domain->zprd;
+  }
+  for (int i = 0; i < n; i++) {
+    const int j = list[i];
+    buf[m++] = h_x(j,0) + dx;
+    buf[m++] = h_x(j,1) + dy;
+    buf[m++] = h_x(j,2) + dz;
+    buf[m++] = h_dpdTheta[j];
+    buf[m++] = h_uCond[j];
+    buf[m++] = h_uMech[j];
+    buf[m++] = h_uChem[j];
+  }
+  return m;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecDPDKokkos::pack_comm_vel(int n, int *list, double *buf,
+                                 int pbc_flag, int *pbc)
+{
+  // 10 doubles per listed atom: x(3), v(3), dpdTheta, uCond, uMech, uChem; returns count written
+  int i,j,m = 0;
+  double dx,dy,dz,dvx,dvy,dvz;
+
+  if (pbc_flag == 0) {
+    for (i = 0; i < n; i++) {
+      j = list[i];
+      buf[m++] = h_x(j,0);
+      buf[m++] = h_x(j,1);
+      buf[m++] = h_x(j,2);
+      buf[m++] = h_v(j,0);
+      buf[m++] = h_v(j,1);
+      buf[m++] = h_v(j,2);
+      buf[m++] = h_dpdTheta[j];
+      buf[m++] = h_uCond[j];
+      buf[m++] = h_uMech[j];
+      buf[m++] = h_uChem[j];
+    }
+  } else {
+    if (domain->triclinic == 0) {
+      dx = pbc[0]*domain->xprd;
+      dy = pbc[1]*domain->yprd;
+      dz = pbc[2]*domain->zprd;
+    } else {
+      dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
+      dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
+      dz = pbc[2]*domain->zprd;
+    }
+    if (!deform_vremap) {
+      for (i = 0; i < n; i++) {
+        j = list[i];
+        buf[m++] = h_x(j,0) + dx;
+        buf[m++] = h_x(j,1) + dy;
+        buf[m++] = h_x(j,2) + dz;
+        buf[m++] = h_v(j,0);
+        buf[m++] = h_v(j,1);
+        buf[m++] = h_v(j,2);
+        buf[m++] = h_dpdTheta[j];
+        buf[m++] = h_uCond[j];
+        buf[m++] = h_uMech[j];
+        buf[m++] = h_uChem[j];
+      }
+    } else {
+      dvx = pbc[0]*h_rate[0] + pbc[5]*h_rate[5] + pbc[4]*h_rate[4];
+      dvy = pbc[1]*h_rate[1] + pbc[3]*h_rate[3];
+      dvz = pbc[2]*h_rate[2];
+      for (i = 0; i < n; i++) {
+        j = list[i];
+        buf[m++] = h_x(j,0) + dx;
+        buf[m++] = h_x(j,1) + dy;
+        buf[m++] = h_x(j,2) + dz;
+        if (mask[j] & deform_groupbit) {  // test the packed atom j, not the send-list slot i
+          buf[m++] = h_v(j,0) + dvx;
+          buf[m++] = h_v(j,1) + dvy;
+          buf[m++] = h_v(j,2) + dvz;
+        } else {
+          buf[m++] = h_v(j,0);
+          buf[m++] = h_v(j,1);
+          buf[m++] = h_v(j,2);
+        }
+        buf[m++] = h_dpdTheta(j);
+        buf[m++] = h_uCond(j);
+        buf[m++] = h_uMech(j);
+        buf[m++] = h_uChem(j);
+      }
+    }
+  }
+  return m;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecDPDKokkos::unpack_comm(int n, int first, double *buf)
+{
+  // Reads 7 consecutive doubles per atom from buf, in the order pack_comm wrote
+  // them, and stores them into atoms first .. first+n-1 through the host views.
+  const int last = first + n;
+  const double *src = buf;
+  for (int i = first; i < last; i++, src += 7) {
+    h_x(i,0) = src[0];
+    h_x(i,1) = src[1];
+    h_x(i,2) = src[2];
+    h_dpdTheta[i] = src[3];
+    h_uCond[i] = src[4];
+    h_uMech[i] = src[5];
+    h_uChem[i] = src[6];
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecDPDKokkos::unpack_comm_vel(int n, int first, double *buf)
+{
+  // Reads 10 consecutive doubles per atom from buf, in the order pack_comm_vel
+  // wrote them, and stores them into atoms first .. first+n-1.
+  const int last = first + n;
+  const double *src = buf;
+  for (int i = first; i < last; i++, src += 10) {
+    h_x(i,0) = src[0];
+    h_x(i,1) = src[1];
+    h_x(i,2) = src[2];
+    h_v(i,0) = src[3];
+    h_v(i,1) = src[4];
+    h_v(i,2) = src[5];
+    h_dpdTheta[i] = src[6];
+    h_uCond[i] = src[7];
+    h_uMech[i] = src[8];
+    h_uChem[i] = src[9];
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecDPDKokkos::pack_reverse(int n, int first, double *buf)
+{
+  // sync the host force view before reading it; skipped when nothing is packed
+  if (n > 0) sync(Host,F_MASK);
+
+  double *out = buf;
+  for (int i = first; i < first + n; i++) {
+    *out++ = h_f(i,0);
+    *out++ = h_f(i,1);
+    *out++ = h_f(i,2);
+  }
+  // number of doubles written: three force components per atom
+  return static_cast<int>(out - buf);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecDPDKokkos::unpack_reverse(int n, int *list, double *buf)
+{
+  // sync the host force view, then mark it modified on the host
+  if (n > 0) {
+    sync(Host,F_MASK);
+    modified(Host,F_MASK);
+  }
+
+  // accumulate (not overwrite): three buffered components are added per listed atom
+  for (int k = 0; k < n; k++) {
+    const int j = list[k];
+    const double *src = buf + 3*k;
+    for (int d = 0; d < 3; d++) h_f(j,d) += src[d];
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Border pack functor: for send-list entry i (atom j) it fills the 12 columns of row i
+// of _buf with x(3, shifted when PBC_FLAG), tag, type, mask and six DPD scalars.
+template<class DeviceType,int PBC_FLAG>
+struct AtomVecDPDKokkos_PackBorder {
+  typedef DeviceType device_type;
+  typename ArrayTypes<DeviceType>::t_xfloat_2d _buf;
+  const typename ArrayTypes<DeviceType>::t_int_2d_const _list;
+  const int _iswap;
+  const typename ArrayTypes<DeviceType>::t_x_array_randomread _x;
+  const typename ArrayTypes<DeviceType>::t_tagint_1d _tag;
+  const typename ArrayTypes<DeviceType>::t_int_1d _type;
+  const typename ArrayTypes<DeviceType>::t_int_1d _mask;
+  typename ArrayTypes<DeviceType>::t_efloat_1d _dpdTheta,_uCond,_uMech,_uChem,_uCG,_uCGnew;
+  X_FLOAT _dx,_dy,_dz;
+
+  AtomVecDPDKokkos_PackBorder(
+      const typename ArrayTypes<DeviceType>::t_xfloat_2d &buf,
+      const typename ArrayTypes<DeviceType>::t_int_2d_const &list,
+      const int & iswap,
+      const typename ArrayTypes<DeviceType>::t_x_array &x,
+      const typename ArrayTypes<DeviceType>::t_tagint_1d &tag,
+      const typename ArrayTypes<DeviceType>::t_int_1d &type,
+      const typename ArrayTypes<DeviceType>::t_int_1d &mask,
+      const typename ArrayTypes<DeviceType>::t_efloat_1d &dpdTheta,
+      const typename ArrayTypes<DeviceType>::t_efloat_1d &uCond,
+      const typename ArrayTypes<DeviceType>::t_efloat_1d &uMech,
+      const typename ArrayTypes<DeviceType>::t_efloat_1d &uChem,
+      const typename ArrayTypes<DeviceType>::t_efloat_1d &uCG,
+      const typename ArrayTypes<DeviceType>::t_efloat_1d &uCGnew,
+      const X_FLOAT &dx, const X_FLOAT &dy, const X_FLOAT &dz):
+      _buf(buf),_list(list),_iswap(iswap),
+      _x(x),_tag(tag),_type(type),_mask(mask),
+      _dpdTheta(dpdTheta),_uCond(uCond),
+      _uMech(uMech),_uChem(uChem),
+      _uCG(uCG),_uCGnew(uCGnew),  // _uCG must view uCG; it was initialized from uCGnew
+      _dx(dx),_dy(dy),_dz(dz) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i) const {
+      const int j = _list(_iswap,i);
+      if (PBC_FLAG == 0) {
+          _buf(i,0) = _x(j,0);
+          _buf(i,1) = _x(j,1);
+          _buf(i,2) = _x(j,2);
+      } else {
+          _buf(i,0) = _x(j,0) + _dx;
+          _buf(i,1) = _x(j,1) + _dy;
+          _buf(i,2) = _x(j,2) + _dz;
+      }
+      // NOTE(review): tag/type/mask are stored by value conversion here, whereas the host
+      // pack_border stores ubuf(...).d -- confirm the matching unpack expects this form.
+      _buf(i,3) = _tag(j);
+      _buf(i,4) = _type(j);
+      _buf(i,5) = _mask(j);
+      _buf(i,6) = _dpdTheta(j);
+      _buf(i,7) = _uCond(j);
+      _buf(i,8) = _uMech(j);
+      _buf(i,9) = _uChem(j);
+      _buf(i,10) = _uCG(j);
+      _buf(i,11) = _uCGnew(j);
+  }
+};
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecDPDKokkos::pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist, DAT::tdual_xfloat_2d buf,int iswap,
+                               int pbc_flag, int *pbc, ExecutionSpace space)
+{
+  // Runs the PackBorder functor over the n entries of send list iswap in the requested
+  // space and returns the number of values packed (size_border = 12 per atom).
+  X_FLOAT dx,dy,dz;
+  if (pbc_flag != 0) {
+    if (domain->triclinic == 0) {
+      dx = pbc[0]*domain->xprd;
+      dy = pbc[1]*domain->yprd;
+      dz = pbc[2]*domain->zprd;
+    } else {
+      dx = pbc[0];  // triclinic: shift is the image flag itself, without box-length scaling
+      dy = pbc[1];
+      dz = pbc[2];
+    }
+    if(space==Host) {
+      AtomVecDPDKokkos_PackBorder<LMPHostType,1> f(
+        buf.view<LMPHostType>(), k_sendlist.view<LMPHostType>(),
+        iswap,h_x,h_tag,h_type,h_mask,
+        h_dpdTheta,h_uCond,h_uMech,h_uChem,h_uCG,h_uCGnew,
+        dx,dy,dz);
+      Kokkos::parallel_for(n,f);
+      LMPHostType::fence();
+    } else {
+      AtomVecDPDKokkos_PackBorder<LMPDeviceType,1> f(
+        buf.view<LMPDeviceType>(), k_sendlist.view<LMPDeviceType>(),
+        iswap,d_x,d_tag,d_type,d_mask,
+        d_dpdTheta,d_uCond,d_uMech,d_uChem,d_uCG,d_uCGnew,
+        dx,dy,dz);
+      Kokkos::parallel_for(n,f);
+      LMPDeviceType::fence();
+    }
+  } else {
+    dx = dy = dz = 0;
+    if(space==Host) {
+      AtomVecDPDKokkos_PackBorder<LMPHostType,0> f(
+        buf.view<LMPHostType>(), k_sendlist.view<LMPHostType>(),
+        iswap,h_x,h_tag,h_type,h_mask,
+        h_dpdTheta,h_uCond,h_uMech,h_uChem,h_uCG,h_uCGnew,
+        dx,dy,dz);
+      Kokkos::parallel_for(n,f);
+      LMPHostType::fence();
+    } else {
+      AtomVecDPDKokkos_PackBorder<LMPDeviceType,0> f(
+        buf.view<LMPDeviceType>(), k_sendlist.view<LMPDeviceType>(),
+        iswap,d_x,d_tag,d_type,d_mask,
+        d_dpdTheta,d_uCond,d_uMech,d_uChem,d_uCG,d_uCGnew,
+        dx,dy,dz);
+      Kokkos::parallel_for(n,f);
+      LMPDeviceType::fence();
+    }
+  }
+  return n*size_border;  // was n*6, half of the 12 columns the functor writes per atom
+}
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecDPDKokkos::pack_border(int n, int *list, double *buf,
+                               int pbc_flag, int *pbc)
+{
+  // Packs 12 doubles per listed atom: x(3), tag/type/mask via ubuf, then dpdTheta,
+  // uCond, uMech, uChem, uCG, uCGnew; fixes append their data; returns the total count.
+  int m = 0;
+
+  if (pbc_flag != 0) {
+    // shift = image flag, scaled by the box length unless the box is triclinic
+    double dx = pbc[0];
+    double dy = pbc[1];
+    double dz = pbc[2];
+    if (domain->triclinic == 0) {
+      dx *= domain->xprd;
+      dy *= domain->yprd;
+      dz *= domain->zprd;
+    }
+    for (int i = 0; i < n; i++) {
+      const int j = list[i];
+      buf[m++] = h_x(j,0) + dx;
+      buf[m++] = h_x(j,1) + dy;
+      buf[m++] = h_x(j,2) + dz;
+      buf[m++] = ubuf(h_tag(j)).d;
+      buf[m++] = ubuf(h_type(j)).d;
+      buf[m++] = ubuf(h_mask(j)).d;
+      buf[m++] = h_dpdTheta(j);
+      buf[m++] = h_uCond(j);
+      buf[m++] = h_uMech(j);
+      buf[m++] = h_uChem(j);
+      buf[m++] = h_uCG(j);
+      buf[m++] = h_uCGnew(j);
+    }
+  } else {
+    for (int i = 0; i < n; i++) {
+      const int j = list[i];
+      buf[m++] = h_x(j,0);
+      buf[m++] = h_x(j,1);
+      buf[m++] = h_x(j,2);
+      buf[m++] = ubuf(h_tag(j)).d;
+      buf[m++] = ubuf(h_type(j)).d;
+      buf[m++] = ubuf(h_mask(j)).d;
+      buf[m++] = h_dpdTheta(j);
+      buf[m++] = h_uCond(j);
+      buf[m++] = h_uMech(j);
+      buf[m++] = h_uChem(j);
+      buf[m++] = h_uCG(j);
+      buf[m++] = h_uCGnew(j);
+    }
+  }
+
+  // each fix listed in extra_border packs behind the atom data and reports its count
+  for (int iextra = 0; iextra < atom->nextra_border; iextra++)
+    m += modify->fix[atom->extra_border[iextra]]->pack_border(n,list,&buf[m]);
+
+  return m;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecDPDKokkos::pack_border_vel(int n, int *list, double *buf,
+                                   int pbc_flag, int *pbc)
+{
+  int i,j,m;
+  double dx,dy,dz,dvx,dvy,dvz;
+  // per atom: x(3), tag/type/mask via ubuf, v(3), then the six DPD scalars
+  m = 0;
+  if (pbc_flag == 0) {
+    for (i = 0; i < n; i++) {
+      j = list[i];
+      buf[m++] = h_x(j,0);
+      buf[m++] = h_x(j,1);
+      buf[m++] = h_x(j,2);
+      buf[m++] = ubuf(h_tag(j)).d;
+      buf[m++] = ubuf(h_type(j)).d;
+      buf[m++] = ubuf(h_mask(j)).d;
+      buf[m++] = h_v(j,0);
+      buf[m++] = h_v(j,1);
+      buf[m++] = h_v(j,2);
+      buf[m++] = h_dpdTheta(j);
+      buf[m++] = h_uCond(j);
+      buf[m++] = h_uMech(j);
+      buf[m++] = h_uChem(j);
+      buf[m++] = h_uCG(j);
+      buf[m++] = h_uCGnew(j);
+    }
+  } else {
+    if (domain->triclinic == 0) {
+      dx = pbc[0]*domain->xprd;
+      dy = pbc[1]*domain->yprd;
+      dz = pbc[2]*domain->zprd;
+    } else {
+      dx = pbc[0];
+      dy = pbc[1];
+      dz = pbc[2];
+    }
+    if (!deform_vremap) {
+      for (i = 0; i < n; i++) {
+        j = list[i];
+        buf[m++] = h_x(j,0) + dx;
+        buf[m++] = h_x(j,1) + dy;
+        buf[m++] = h_x(j,2) + dz;
+        buf[m++] = ubuf(h_tag(j)).d;
+        buf[m++] = ubuf(h_type(j)).d;
+        buf[m++] = ubuf(h_mask(j)).d;
+        buf[m++] = h_v(j,0);
+        buf[m++] = h_v(j,1);
+        buf[m++] = h_v(j,2);
+        buf[m++] = h_dpdTheta(j);
+        buf[m++] = h_uCond(j);
+        buf[m++] = h_uMech(j);
+        buf[m++] = h_uChem(j);
+        buf[m++] = h_uCG(j);
+        buf[m++] = h_uCGnew(j);
+      }
+    } else {
+      dvx = pbc[0]*h_rate[0] + pbc[5]*h_rate[5] + pbc[4]*h_rate[4];
+      dvy = pbc[1]*h_rate[1] + pbc[3]*h_rate[3];
+      dvz = pbc[2]*h_rate[2];
+      for (i = 0; i < n; i++) {
+        j = list[i];
+        buf[m++] = h_x(j,0) + dx;
+        buf[m++] = h_x(j,1) + dy;
+        buf[m++] = h_x(j,2) + dz;
+        buf[m++] = ubuf(h_tag(j)).d;
+        buf[m++] = ubuf(h_type(j)).d;
+        buf[m++] = ubuf(h_mask(j)).d;
+        if (h_mask(j) & deform_groupbit) {  // test the packed atom j, not slot i
+          buf[m++] = h_v(j,0) + dvx;
+          buf[m++] = h_v(j,1) + dvy;
+          buf[m++] = h_v(j,2) + dvz;
+        } else {
+          buf[m++] = h_v(j,0);
+          buf[m++] = h_v(j,1);
+          buf[m++] = h_v(j,2);
+        }
+        buf[m++] = h_dpdTheta(j);
+        buf[m++] = h_uCond(j);
+        buf[m++] = h_uMech(j);
+        buf[m++] = h_uChem(j);
+        buf[m++] = h_uCG(j);
+        buf[m++] = h_uCGnew(j);
+      }
+    }
+  }
+  // fixes listed in atom->extra_border append after the atom records
+  if (atom->nextra_border)
+    for (int iextra = 0; iextra < atom->nextra_border; iextra++)
+      m += modify->fix[atom->extra_border[iextra]]->pack_border(n,list,&buf[m]);
+
+  return m;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecDPDKokkos::pack_comm_hybrid(int n, int *list, double *buf)
+{
+  // DPD part of a hybrid forward-comm record: six host-view scalars per
+  // listed atom, dpdTheta first; returns the number of doubles written
+  int pos = 0;
+  for (int k = 0; k < n; ++k) {
+    const int src = list[k];
+    buf[pos++] = h_dpdTheta[src];
+    buf[pos++] = h_uCond[src];
+    buf[pos++] = h_uMech[src];
+    buf[pos++] = h_uChem[src];
+    buf[pos++] = h_uCG[src];
+    buf[pos++] = h_uCGnew[src];
+  }
+  return pos;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecDPDKokkos::pack_border_hybrid(int n, int *list, double *buf)
+{
+  // DPD part of a hybrid border record: six host-view scalars per listed
+  // atom, dpdTheta first; returns the number of doubles written
+  int pos = 0;
+  for (int k = 0; k < n; ++k) {
+    const int src = list[k];
+    buf[pos++] = h_dpdTheta[src];
+    buf[pos++] = h_uCond[src];
+    buf[pos++] = h_uMech[src];
+    buf[pos++] = h_uChem[src];
+    buf[pos++] = h_uCG[src];
+    buf[pos++] = h_uCGnew[src];
+  }
+  return pos;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+struct AtomVecDPDKokkos_UnpackBorder {
+  typedef DeviceType device_type;
+  // Functor: copies row i of _buf into atom slot i+_first of the views below.
+  const typename ArrayTypes<DeviceType>::t_xfloat_2d_const _buf;
+  typename ArrayTypes<DeviceType>::t_x_array _x;
+  typename ArrayTypes<DeviceType>::t_tagint_1d _tag;
+  typename ArrayTypes<DeviceType>::t_int_1d _type;
+  typename ArrayTypes<DeviceType>::t_int_1d _mask;
+  typename ArrayTypes<DeviceType>::t_efloat_1d _dpdTheta,_uCond,_uMech,_uChem,_uCG,_uCGnew;
+  int _first;
+  // The constructor stores the view handles it is given.  Each DPD view must
+  // be bound to its own argument, otherwise columns 10 and 11 hit one array.
+  AtomVecDPDKokkos_UnpackBorder(
+      const typename ArrayTypes<DeviceType>::t_xfloat_2d_const &buf,
+      typename ArrayTypes<DeviceType>::t_x_array &x,
+      typename ArrayTypes<DeviceType>::t_tagint_1d &tag,
+      typename ArrayTypes<DeviceType>::t_int_1d &type,
+      typename ArrayTypes<DeviceType>::t_int_1d &mask,
+      const typename ArrayTypes<DeviceType>::t_efloat_1d &dpdTheta,
+      const typename ArrayTypes<DeviceType>::t_efloat_1d &uCond,
+      const typename ArrayTypes<DeviceType>::t_efloat_1d &uMech,
+      const typename ArrayTypes<DeviceType>::t_efloat_1d &uChem,
+      const typename ArrayTypes<DeviceType>::t_efloat_1d &uCG,
+      const typename ArrayTypes<DeviceType>::t_efloat_1d &uCGnew,
+      const int& first):
+      _buf(buf),_x(x),_tag(tag),_type(type),_mask(mask),
+      _dpdTheta(dpdTheta),
+      _uCond(uCond),
+      _uMech(uMech),
+      _uChem(uChem),
+      _uCG(uCG),
+      _uCGnew(uCGnew),
+      _first(first) {};
+  // Column layout of _buf: 0-2 x, 3 tag, 4 type, 5 mask, 6-11 dpdTheta..uCGnew.
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i) const {
+      _x(i+_first,0) = _buf(i,0);
+      _x(i+_first,1) = _buf(i,1);
+      _x(i+_first,2) = _buf(i,2);
+      _tag(i+_first) = static_cast<int> (_buf(i,3));
+      _type(i+_first) = static_cast<int>  (_buf(i,4));
+      _mask(i+_first) = static_cast<int>  (_buf(i,5));
+      _dpdTheta(i+_first) = _buf(i,6);
+      _uCond(i+_first) = _buf(i,7);
+      _uMech(i+_first) = _buf(i,8);
+      _uChem(i+_first) = _buf(i,9);
+      _uCG(i+_first) = _buf(i,10);
+      _uCGnew(i+_first) = _buf(i,11);
+//      printf("%i %i %lf %lf %lf %i BORDER\n",_tag(i+_first),i+_first,_x(i+_first,0),_x(i+_first,1),_x(i+_first,2),_type(i+_first));
+  }
+};
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecDPDKokkos::unpack_border_kokkos(const int &n, const int &first,
+                     const DAT::tdual_xfloat_2d &buf,ExecutionSpace space) {
+  modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK);  // flagged before a possible grow
+  while (first+n >= nmax) grow(0);  // make room for slots first..first+n-1
+  modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK);  // ... and again afterwards
+  if (space != Host) {
+    AtomVecDPDKokkos_UnpackBorder<LMPDeviceType> unpack(buf.view<LMPDeviceType>(),
+      d_x,d_tag,d_type,d_mask,
+      d_dpdTheta,d_uCond,d_uMech,d_uChem,d_uCG,d_uCGnew,
+      first);
+    Kokkos::parallel_for(n,unpack);
+    LMPDeviceType::fence();
+  } else {
+    AtomVecDPDKokkos_UnpackBorder<LMPHostType> unpack(buf.view<LMPHostType>(),
+      h_x,h_tag,h_type,h_mask,
+      h_dpdTheta,h_uCond,h_uMech,h_uChem,h_uCG,h_uCGnew,
+      first);
+    Kokkos::parallel_for(n,unpack);
+    LMPHostType::fence();
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecDPDKokkos::unpack_border(int n, int first, double *buf)
+{
+  // Host-side border unpack: slots first..first+n-1 are filled from
+  // consecutive 12-double records in buf (x, tag/type/mask, six DPD
+  // scalars); the arrays are grown whenever the slot index reaches nmax.
+  int pos = 0;
+  for (int slot = first; slot < first + n; ++slot) {
+    if (slot == nmax) grow(0);
+    modified(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK);
+    h_x(slot,0) = buf[pos++];
+    h_x(slot,1) = buf[pos++];
+    h_x(slot,2) = buf[pos++];
+    h_tag(slot) = (tagint) ubuf(buf[pos++]).i;
+    h_type(slot) = (int) ubuf(buf[pos++]).i;
+    h_mask(slot) = (int) ubuf(buf[pos++]).i;
+    h_dpdTheta(slot) = buf[pos++];
+    h_uCond(slot) = buf[pos++];
+    h_uMech(slot) = buf[pos++];
+    h_uChem(slot) = buf[pos++];
+    h_uCG(slot) = buf[pos++];
+    h_uCGnew(slot) = buf[pos++];
+  }
+
+  // what follows the atom records is handed to the fixes in atom->extra_border
+  for (int ifix = 0; ifix < atom->nextra_border; ifix++)
+    pos += modify->fix[atom->extra_border[ifix]]->
+      unpack_border(n,first,&buf[pos]);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecDPDKokkos::unpack_border_vel(int n, int first, double *buf)
+{
+  // Host-side border unpack with velocities: slots first..first+n-1 are
+  // filled from consecutive 15-double records in buf (x, tag/type/mask, v,
+  // six DPD scalars); arrays are grown when the slot index reaches nmax.
+  int pos = 0;
+  for (int slot = first; slot < first + n; ++slot) {
+    if (slot == nmax) grow(0);
+    modified(Host,X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK);
+    h_x(slot,0) = buf[pos++];
+    h_x(slot,1) = buf[pos++];
+    h_x(slot,2) = buf[pos++];
+    h_tag(slot) = (tagint) ubuf(buf[pos++]).i;
+    h_type(slot) = (int) ubuf(buf[pos++]).i;
+    h_mask(slot) = (int) ubuf(buf[pos++]).i;
+    h_v(slot,0) = buf[pos++];
+    h_v(slot,1) = buf[pos++];
+    h_v(slot,2) = buf[pos++];
+    h_dpdTheta(slot) = buf[pos++];
+    h_uCond(slot) = buf[pos++];
+    h_uMech(slot) = buf[pos++];
+    h_uChem(slot) = buf[pos++];
+    h_uCG(slot) = buf[pos++];
+    h_uCGnew(slot) = buf[pos++];
+  }
+
+  // what follows the atom records is handed to the fixes in atom->extra_border
+  for (int ifix = 0; ifix < atom->nextra_border; ifix++)
+    pos += modify->fix[atom->extra_border[ifix]]->
+      unpack_border(n,first,&buf[pos]);
+}
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecDPDKokkos::unpack_comm_hybrid(int n, int first, double *buf)
+{
+  // must consume exactly the six values per atom that pack_comm_hybrid writes
+  int m = 0;
+  for (int i = first; i < first + n; i++) {
+    h_dpdTheta(i) = buf[m++];
+    h_uCond(i) = buf[m++];
+    h_uMech(i) = buf[m++];
+    h_uChem(i) = buf[m++];
+    h_uCG(i) = buf[m++];
+    h_uCGnew(i) = buf[m++];
+  }
+  return m;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecDPDKokkos::unpack_border_hybrid(int n, int first, double *buf)
+{
+  // DPD part of a hybrid border record: six scalars per atom are read from
+  // buf into host-view slots first..first+n-1; returns the doubles consumed
+  int pos = 0;
+  for (int slot = first; slot < first + n; ++slot) {
+    h_dpdTheta(slot) = buf[pos++];
+    h_uCond(slot) = buf[pos++];
+    h_uMech(slot) = buf[pos++];
+    h_uChem(slot) = buf[pos++];
+    h_uCG(slot) = buf[pos++];
+    h_uCGnew(slot) = buf[pos++];
+  }
+  return pos;
+
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+struct AtomVecDPDKokkos_PackExchangeFunctor {  // packs atoms named in _sendlist into _buf
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;
+  typename AT::t_x_array_randomread _x;
+  typename AT::t_v_array_randomread _v;
+  typename AT::t_tagint_1d_randomread _tag;
+  typename AT::t_int_1d_randomread _type;
+  typename AT::t_int_1d_randomread _mask;
+  typename AT::t_imageint_1d_randomread _image;
+  typename AT::t_efloat_1d_randomread _dpdTheta,_uCond,_uMech,_uChem,_uCG,_uCGnew;
+  typename AT::t_x_array _xw;
+  typename AT::t_v_array _vw;
+  typename AT::t_tagint_1d _tagw;
+  typename AT::t_int_1d _typew;
+  typename AT::t_int_1d _maskw;
+  typename AT::t_imageint_1d _imagew;
+  typename AT::t_efloat_1d _dpdThetaw,_uCondw,_uMechw,_uChemw,_uCGw,_uCGneww;
+  // the read views above and the *w write views come from the same atom->k_* DualViews
+  typename AT::t_xfloat_2d_um _buf;
+  typename AT::t_int_1d_const _sendlist;
+  typename AT::t_int_1d_const _copylist;
+  int _nlocal,_dim;
+  X_FLOAT _lo,_hi;
+  // _nlocal, _dim, _lo and _hi are stored but not read by operator() below
+  AtomVecDPDKokkos_PackExchangeFunctor(
+      const AtomKokkos* atom,
+      const typename AT::tdual_xfloat_2d buf,
+      typename AT::tdual_int_1d sendlist,
+      typename AT::tdual_int_1d copylist,int nlocal, int dim,
+                X_FLOAT lo, X_FLOAT hi):
+                _x(atom->k_x.view<DeviceType>()),
+                _v(atom->k_v.view<DeviceType>()),
+                _tag(atom->k_tag.view<DeviceType>()),
+                _type(atom->k_type.view<DeviceType>()),
+                _mask(atom->k_mask.view<DeviceType>()),
+                _image(atom->k_image.view<DeviceType>()),
+                _dpdTheta(atom->k_dpdTheta.view<DeviceType>()),
+                _uCond(atom->k_uCond.view<DeviceType>()),
+                _uMech(atom->k_uMech.view<DeviceType>()),
+                _uChem(atom->k_uChem.view<DeviceType>()),
+                _uCG(atom->k_uCG.view<DeviceType>()),
+                _uCGnew(atom->k_uCGnew.view<DeviceType>()),
+                _xw(atom->k_x.view<DeviceType>()),
+                _vw(atom->k_v.view<DeviceType>()),
+                _tagw(atom->k_tag.view<DeviceType>()),
+                _typew(atom->k_type.view<DeviceType>()),
+                _maskw(atom->k_mask.view<DeviceType>()),
+                _imagew(atom->k_image.view<DeviceType>()),
+                _dpdThetaw(atom->k_dpdTheta.view<DeviceType>()),
+                _uCondw(atom->k_uCond.view<DeviceType>()),
+                _uMechw(atom->k_uMech.view<DeviceType>()),
+                _uChemw(atom->k_uChem.view<DeviceType>()),
+                _uCGw(atom->k_uCG.view<DeviceType>()),
+                _uCGneww(atom->k_uCGnew.view<DeviceType>()),
+                _sendlist(sendlist.template view<DeviceType>()),
+                _copylist(copylist.template view<DeviceType>()),
+                _nlocal(nlocal),_dim(dim),
+                _lo(lo),_hi(hi){
+    const size_t elements = 17;  // doubles per packed atom, see operator()
+    const int maxsendlist = (buf.template view<DeviceType>().dimension_0()*buf.template view<DeviceType>().dimension_1())/elements;
+    // NOTE(review): buffer_view presumably re-views buf as maxsendlist x elements -- confirm
+    buffer_view<DeviceType>(_buf,buf,maxsendlist,elements);
+  }
+  // Row layout: [0]=17, [1-3] x, [4-6] v, [7] tag, [8] type, [9] mask, [10] image, [11-16] DPD scalars
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int &mysend) const {
+    const int i = _sendlist(mysend);
+    _buf(mysend,0) = 17;
+    _buf(mysend,1) = _x(i,0);
+    _buf(mysend,2) = _x(i,1);
+    _buf(mysend,3) = _x(i,2);
+    _buf(mysend,4) = _v(i,0);
+    _buf(mysend,5) = _v(i,1);
+    _buf(mysend,6) = _v(i,2);
+    _buf(mysend,7) = _tag[i];
+    _buf(mysend,8) = _type[i];
+    _buf(mysend,9) = _mask[i];
+    _buf(mysend,10) = _image[i];
+    _buf(mysend,11) = _dpdTheta[i];
+    _buf(mysend,12) = _uCond[i];
+    _buf(mysend,13) = _uMech[i];
+    _buf(mysend,14) = _uChem[i];
+    _buf(mysend,15) = _uCG[i];
+    _buf(mysend,16) = _uCGnew[i];
+    const int j = _copylist(mysend);
+    // when j > -1, atom j's data are copied into slot i, which was just packed
+    if(j>-1) {
+    _xw(i,0) = _x(j,0);
+    _xw(i,1) = _x(j,1);
+    _xw(i,2) = _x(j,2);
+    _vw(i,0) = _v(j,0);
+    _vw(i,1) = _v(j,1);
+    _vw(i,2) = _v(j,2);
+    _tagw[i] = _tag(j);
+    _typew[i] = _type(j);
+    _maskw[i] = _mask(j);
+    _imagew[i] = _image(j);
+    _dpdThetaw[i] = _dpdTheta(j);
+    _uCondw[i] = _uCond(j);
+    _uMechw[i] = _uMech(j);
+    _uChemw[i] = _uChem(j);
+    _uCGw[i] = _uCG(j);
+    _uCGneww[i] = _uCGnew(j);
+    }
+  }
+};
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecDPDKokkos::pack_exchange_kokkos(const int &nsend,DAT::tdual_xfloat_2d &k_buf, DAT::tdual_int_1d k_sendlist,DAT::tdual_int_1d k_copylist,ExecutionSpace space,int dim,X_FLOAT lo,X_FLOAT hi )
+{
+  // enlarge k_buf (same column count) if it cannot hold nsend 17-value records
+  if (nsend > (int) (k_buf.view<LMPHostType>().dimension_0()*k_buf.view<LMPHostType>().dimension_1())/17) {
+    int nrows = nsend*17/k_buf.view<LMPHostType>().dimension_1()+1;
+    k_buf.resize(nrows,k_buf.view<LMPHostType>().dimension_1());
+  }
+  if (space != Host) {
+    AtomVecDPDKokkos_PackExchangeFunctor<LMPDeviceType> pack(atomKK,k_buf,k_sendlist,k_copylist,atom->nlocal,dim,lo,hi);
+    Kokkos::parallel_for(nsend,pack);
+    LMPDeviceType::fence();
+    return nsend*17;
+  }
+  AtomVecDPDKokkos_PackExchangeFunctor<LMPHostType> pack(atomKK,k_buf,k_sendlist,k_copylist,atom->nlocal,dim,lo,hi);
+  Kokkos::parallel_for(nsend,pack);
+  LMPHostType::fence();
+  return nsend*17;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecDPDKokkos::pack_exchange(int i, double *buf)
+{
+  int pos = 1;  // slot 0 is reserved for the record length, filled in last
+  buf[pos++] = h_x(i,0);
+  buf[pos++] = h_x(i,1);
+  buf[pos++] = h_x(i,2);
+  buf[pos++] = h_v(i,0);
+  buf[pos++] = h_v(i,1);
+  buf[pos++] = h_v(i,2);
+  buf[pos++] = ubuf(h_tag(i)).d;
+  buf[pos++] = ubuf(h_type(i)).d;
+  buf[pos++] = ubuf(h_mask(i)).d;
+  buf[pos++] = ubuf(h_image(i)).d;
+  buf[pos++] = h_dpdTheta[i];
+  buf[pos++] = h_uCond[i];
+  buf[pos++] = h_uMech[i];
+  buf[pos++] = h_uChem[i];
+  buf[pos++] = h_uCG[i];
+  buf[pos++] = h_uCGnew[i];
+
+  // every fix listed in atom->extra_grow appends its per-atom data
+  for (int ifix = 0; ifix < atom->nextra_grow; ifix++)
+    pos += modify->fix[atom->extra_grow[ifix]]->pack_exchange(i,&buf[pos]);
+
+  buf[0] = pos;
+  return pos;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+struct AtomVecDPDKokkos_UnpackExchangeFunctor {
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;
+  typename AT::t_x_array _x;
+  typename AT::t_v_array _v;
+  typename AT::t_tagint_1d _tag;
+  typename AT::t_int_1d _type;
+  typename AT::t_int_1d _mask;
+  typename AT::t_imageint_1d _image;
+  typename AT::t_efloat_1d _dpdTheta,_uCond,_uMech,_uChem,_uCG,_uCGnew;
+  // _buf is a 17-column view of the receive buffer; _nlocal(0) is the atom counter
+  typename AT::t_xfloat_2d_um _buf;
+  typename AT::t_int_1d _nlocal;
+  int _dim;
+  X_FLOAT _lo,_hi;
+  // Every view written by operator() must be bound here, DPD scalars included.
+  AtomVecDPDKokkos_UnpackExchangeFunctor(
+      const AtomKokkos* atom,
+      const typename AT::tdual_xfloat_2d buf,
+      typename AT::tdual_int_1d nlocal,
+      int dim, X_FLOAT lo, X_FLOAT hi):
+                _x(atom->k_x.view<DeviceType>()),
+                _v(atom->k_v.view<DeviceType>()),
+                _tag(atom->k_tag.view<DeviceType>()),
+                _type(atom->k_type.view<DeviceType>()),
+                _mask(atom->k_mask.view<DeviceType>()),
+                _image(atom->k_image.view<DeviceType>()),
+                _dpdTheta(atom->k_dpdTheta.view<DeviceType>()),
+                _uCond(atom->k_uCond.view<DeviceType>()),
+                _uMech(atom->k_uMech.view<DeviceType>()),
+                _uChem(atom->k_uChem.view<DeviceType>()),
+                _uCG(atom->k_uCG.view<DeviceType>()),
+                _uCGnew(atom->k_uCGnew.view<DeviceType>()),
+                _nlocal(nlocal.template view<DeviceType>()),_dim(dim),
+                _lo(lo),_hi(hi){
+    const size_t elements = 17;
+    const int maxsendlist = (buf.template view<DeviceType>().dimension_0()*buf.template view<DeviceType>().dimension_1())/elements;
+    buffer_view<DeviceType>(_buf,buf,maxsendlist,elements);
+  }
+  // Accept record myrecv only if its coordinate along _dim lies in [_lo,_hi).
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int &myrecv) const {
+    X_FLOAT x = _buf(myrecv,_dim+1);
+    if (x >= _lo && x < _hi) {
+      int i = Kokkos::atomic_fetch_add(&_nlocal(0),1);
+      _x(i,0) = _buf(myrecv,1);
+      _x(i,1) = _buf(myrecv,2);
+      _x(i,2) = _buf(myrecv,3);
+      _v(i,0) = _buf(myrecv,4);
+      _v(i,1) = _buf(myrecv,5);
+      _v(i,2) = _buf(myrecv,6);
+      _tag[i] = _buf(myrecv,7);
+      _type[i] = _buf(myrecv,8);
+      _mask[i] = _buf(myrecv,9);
+      _image[i] = _buf(myrecv,10);
+      _dpdTheta[i] = _buf(myrecv,11);
+      _uCond[i] = _buf(myrecv,12);
+      _uMech[i] = _buf(myrecv,13);
+      _uChem[i] = _buf(myrecv,14);
+      _uCG[i] = _buf(myrecv,15);
+      _uCGnew[i] = _buf(myrecv,16);
+    }
+  }
+};
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecDPDKokkos::unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf,int nrecv,int nlocal,int dim,X_FLOAT lo,X_FLOAT hi,ExecutionSpace space) {
+  // k_count(0) starts at nlocal; the functor bumps it atomically for every
+  // accepted record, and its final value is what this function returns.
+  k_count.h_view(0) = nlocal;
+  if (space == Host) {
+    AtomVecDPDKokkos_UnpackExchangeFunctor<LMPHostType> unpack(atomKK,k_buf,k_count,dim,lo,hi);
+    Kokkos::parallel_for(nrecv/17,unpack);
+    LMPHostType::fence();
+    return k_count.h_view(0);
+  }
+  // device path: push the seeded counter over, run, then pull the result back
+  k_count.modify<LMPHostType>();
+  k_count.sync<LMPDeviceType>();
+  AtomVecDPDKokkos_UnpackExchangeFunctor<LMPDeviceType> unpack(atomKK,k_buf,k_count,dim,lo,hi);
+  Kokkos::parallel_for(nrecv/17,unpack);
+  LMPDeviceType::fence();
+  k_count.modify<LMPDeviceType>();
+  k_count.sync<LMPHostType>();
+  return k_count.h_view(0);
+}
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecDPDKokkos::unpack_exchange(double *buf)
+{
+  const int slot = atom->nlocal;
+  if (slot == nmax) grow(0);
+  modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
+           MASK_MASK | IMAGE_MASK);
+  // buf[0] is skipped; atom data start at index 1 and fill host slot 'slot'
+  int pos = 1;
+  h_x(slot,0) = buf[pos++];
+  h_x(slot,1) = buf[pos++];
+  h_x(slot,2) = buf[pos++];
+  h_v(slot,0) = buf[pos++];
+  h_v(slot,1) = buf[pos++];
+  h_v(slot,2) = buf[pos++];
+  h_tag(slot) = (tagint) ubuf(buf[pos++]).i;
+  h_type(slot) = (int) ubuf(buf[pos++]).i;
+  h_mask(slot) = (int) ubuf(buf[pos++]).i;
+  h_image(slot) = (imageint) ubuf(buf[pos++]).i;
+  h_dpdTheta[slot] = buf[pos++];
+  h_uCond[slot] = buf[pos++];
+  h_uMech[slot] = buf[pos++];
+  h_uChem[slot] = buf[pos++];
+  h_uCG[slot] = buf[pos++];
+  h_uCGnew[slot] = buf[pos++];
+
+  // every fix listed in atom->extra_grow consumes its share of the record
+  for (int ifix = 0; ifix < atom->nextra_grow; ifix++)
+    pos += modify->fix[atom->extra_grow[ifix]]->
+      unpack_exchange(slot,&buf[pos]);
+
+  atom->nlocal++;
+  return pos;
+}
+
+/* ----------------------------------------------------------------------
+   size of restart data for all atoms owned by this proc
+   include extra data stored by fixes
+------------------------------------------------------------------------- */
+
+int AtomVecDPDKokkos::size_restart()
+{
+  // fixed part: 15 doubles per owned atom, the count pack_restart writes
+  // before fix data (length slot, x, tag/type/mask/image, v, 4 DPD scalars)
+  const int nlocal = atom->nlocal;
+  int total = 15 * nlocal;
+
+  // variable part: whatever each restart-storing fix reports per atom
+  for (int ifix = 0; ifix < atom->nextra_restart; ifix++)
+    for (int i = 0; i < nlocal; i++)
+      total += modify->fix[atom->extra_restart[ifix]]->size_restart(i);
+
+  return total;
+}
+
+/* ----------------------------------------------------------------------
+   pack atom I's data for restart file including extra quantities
+   xyz must be 1st 3 values, so that read_restart can test on them
+   molecular types may be negative, but write as positive
+------------------------------------------------------------------------- */
+
+int AtomVecDPDKokkos::pack_restart(int i, double *buf)
+{
+  sync(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
+            MASK_MASK | IMAGE_MASK );
+  // slot 0 is reserved for the record length, which is filled in at the end
+  int pos = 1;
+  buf[pos++] = h_x(i,0);
+  buf[pos++] = h_x(i,1);
+  buf[pos++] = h_x(i,2);
+  buf[pos++] = ubuf(h_tag(i)).d;
+  buf[pos++] = ubuf(h_type(i)).d;
+  buf[pos++] = ubuf(h_mask(i)).d;
+  buf[pos++] = ubuf(h_image(i)).d;
+  buf[pos++] = h_v(i,0);
+  buf[pos++] = h_v(i,1);
+  buf[pos++] = h_v(i,2);
+  buf[pos++] = h_dpdTheta[i];
+  buf[pos++] = h_uCond[i];
+  buf[pos++] = h_uMech[i];
+  buf[pos++] = h_uChem[i];
+
+  // every fix listed in atom->extra_restart appends its per-atom data
+  for (int ifix = 0; ifix < atom->nextra_restart; ifix++)
+    pos += modify->fix[atom->extra_restart[ifix]]->pack_restart(i,&buf[pos]);
+
+  buf[0] = pos;
+  return pos;
+}
+
+/* ----------------------------------------------------------------------
+   unpack data for one atom from restart file including extra quantities
+------------------------------------------------------------------------- */
+
+int AtomVecDPDKokkos::unpack_restart(double *buf)
+{
+  int nlocal = atom->nlocal;
+  if (nlocal == nmax) {
+    grow(0);
+    if (atom->nextra_store)
+      memory->grow(atom->extra,nmax,atom->nextra_store,"atom:extra");
+  }
+  modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
+                MASK_MASK | IMAGE_MASK );
+  // buf[0] holds the record length written by pack_restart; data start at 1
+  int m = 1;
+  h_x(nlocal,0) = buf[m++];
+  h_x(nlocal,1) = buf[m++];
+  h_x(nlocal,2) = buf[m++];
+  h_tag(nlocal) = (tagint) ubuf(buf[m++]).i;
+  h_type(nlocal) = (int) ubuf(buf[m++]).i;
+  h_mask(nlocal) = (int) ubuf(buf[m++]).i;
+  h_image(nlocal) = (imageint) ubuf(buf[m++]).i;
+  h_v(nlocal,0) = buf[m++];
+  h_v(nlocal,1) = buf[m++];
+  h_v(nlocal,2) = buf[m++];
+  h_dpdTheta[nlocal] = buf[m++];
+  h_uCond[nlocal] = buf[m++];
+  h_uMech[nlocal] = buf[m++];
+  h_uChem[nlocal] = buf[m++];
+  // the rest of the record (buf[0] - m values) is fix data kept in atom->extra
+  double **extra = atom->extra;
+  if (atom->nextra_store) {
+    int size = static_cast<int> (buf[0]) - m;
+    for (int i = 0; i < size; i++) extra[nlocal][i] = buf[m++];
+  }
+
+  atom->nlocal++;
+  return m;
+}
+
+/* ----------------------------------------------------------------------
+   create one atom of itype at coord
+   set other values to defaults
+------------------------------------------------------------------------- */
+
+void AtomVecDPDKokkos::create_atom(int itype, double *coord)
+{
+  const int slot = atom->nlocal;
+  if (slot == nmax) {
+    // host data are flagged as modified before the arrays are enlarged
+    atomKK->modified(Host,ALL_MASK);
+    grow(0);
+    // (the flag is raised once more below, after the grow)
+  }
+  atomKK->modified(Host,ALL_MASK);
+  // new atom: tag 0, type itype, position coord, mask 1, everything else 0
+  tag[slot] = 0;
+  type[slot] = itype;
+  h_x(slot,0) = coord[0];
+  h_x(slot,1) = coord[1];
+  h_x(slot,2) = coord[2];
+  h_mask[slot] = 1;
+  h_image[slot] = ((tagint) IMGMAX << IMG2BITS) |
+    ((tagint) IMGMAX << IMGBITS) | IMGMAX;
+  h_v(slot,0) = 0.0;
+  h_v(slot,1) = 0.0;
+  h_v(slot,2) = 0.0;
+  h_rho[slot] = 0.0;
+  h_dpdTheta[slot] = 0.0;
+  h_uCond[slot] = 0.0;
+  h_uMech[slot] = 0.0;
+  h_uChem[slot] = 0.0;
+  h_uCG[slot] = 0.0;
+  h_uCGnew[slot] = 0.0;
+  h_duChem[slot] = 0.0;
+
+  atom->nlocal++;
+}
+
+/* ----------------------------------------------------------------------
+   unpack one line from Atoms section of data file
+   initialize other atom quantities
+------------------------------------------------------------------------- */
+
+void AtomVecDPDKokkos::data_atom(double *coord, tagint imagetmp,
+                                    char **values)
+{
+  int nlocal = atom->nlocal;
+  if (nlocal == nmax) grow(0);
+  // columns parsed here: values[0] tag, values[1] type, values[2] dpdTheta
+  h_tag[nlocal] = ATOTAGINT(values[0]);
+  h_type[nlocal] = atoi(values[1]);
+  if (h_type[nlocal] <= 0 || h_type[nlocal] > atom->ntypes)
+    error->one(FLERR,"Invalid atom type in Atoms section of data file");
+  // the internal temperature must be strictly positive
+  h_dpdTheta[nlocal] = atof(values[2]);
+  if (h_dpdTheta[nlocal] <= 0)
+    error->one(FLERR,"Internal temperature in Atoms section of data file must be > zero");
+
+  h_x(nlocal,0) = coord[0];
+  h_x(nlocal,1) = coord[1];
+  h_x(nlocal,2) = coord[2];
+
+  h_image[nlocal] = imagetmp;
+
+  h_mask[nlocal] = 1;
+  h_v(nlocal,0) = 0.0;
+  h_v(nlocal,1) = 0.0;
+  h_v(nlocal,2) = 0.0;
+  // quantities not read from the data file start at zero
+  h_rho[nlocal] = 0.0;
+  h_uCond[nlocal] = 0.0;
+  h_uMech[nlocal] = 0.0;
+  h_uChem[nlocal] = 0.0;
+  h_uCG[nlocal] = 0.0;
+  h_uCGnew[nlocal] = 0.0;
+  // all writes above went to the host views
+  atomKK->modified(Host,ALL_MASK);
+
+  atom->nlocal++;
+}
+
+/* ----------------------------------------------------------------------
+   unpack hybrid quantities from one line in Atoms section of data file
+   initialize other atom quantities for this sub-style
+------------------------------------------------------------------------- */
+
+int AtomVecDPDKokkos::data_atom_hybrid(int nlocal, char **values)
+{
+  // this sub-style reads a single column, stored as dpdTheta of atom nlocal
+  h_dpdTheta(nlocal) = atof(values[0]);
+  return 1;
+}
+
+/* ----------------------------------------------------------------------
+   pack atom info for data file including 3 image flags
+------------------------------------------------------------------------- */
+
+void AtomVecDPDKokkos::pack_data(double **buf)
+{
+  // integer columns go through ubuf because write_data decodes them with ubuf
+  for (int i = 0; i < atom->nlocal; i++) {
+    buf[i][0] = ubuf(h_tag(i)).d;
+    buf[i][1] = ubuf(h_type(i)).d;
+    buf[i][2] = h_dpdTheta(i);
+    buf[i][3] = h_x(i,0);
+    buf[i][4] = h_x(i,1);
+    buf[i][5] = h_x(i,2);
+    buf[i][6] = ubuf((h_image[i] & IMGMASK) - IMGMAX).d;
+    buf[i][7] = ubuf((h_image[i] >> IMGBITS & IMGMASK) - IMGMAX).d;
+    buf[i][8] = ubuf((h_image[i] >> IMG2BITS) - IMGMAX).d;
+  }
+}
+
+/* ----------------------------------------------------------------------
+   pack hybrid atom info for data file
+------------------------------------------------------------------------- */
+
+int AtomVecDPDKokkos::pack_data_hybrid(int i, double *buf)
+{
+  *buf = h_dpdTheta(i);  // the only column this sub-style contributes
+  return 1;              // number of values written
+}
+
+/* ----------------------------------------------------------------------
+   write atom info to data file including 3 image flags
+------------------------------------------------------------------------- */
+
+void AtomVecDPDKokkos::write_data(FILE *fp, int n, double **buf)
+{
+  for (int row = 0; row < n; ++row) {
+    const double *rec = buf[row];  // columns 0,1,6,7,8 are decoded via ubuf
+    fprintf(fp,TAGINT_FORMAT " %d %-1.16e %-1.16e %-1.16e %-1.16e %d %d %d\n",
+            (tagint) ubuf(rec[0]).i,(int) ubuf(rec[1]).i,rec[2],rec[3],rec[4],rec[5],
+            (int) ubuf(rec[6]).i,(int) ubuf(rec[7]).i,(int) ubuf(rec[8]).i);
+  }
+}
+
+/* ----------------------------------------------------------------------
+   write hybrid atom info to data file
+------------------------------------------------------------------------- */
+
+int AtomVecDPDKokkos::write_data_hybrid(FILE *fp, double *buf)
+{
+  fprintf(fp," %-1.16e",*buf);  // single hybrid column, no trailing newline
+  return 1;                     // number of values consumed from buf
+}
+
+/* ----------------------------------------------------------------------
+   return # of bytes of allocated memory
+------------------------------------------------------------------------- */
+
+bigint AtomVecDPDKokkos::memory_usage()
+{
+  bigint total = 0;
+  // an array is counted only when atom->memcheck() accepts its name
+  if (atom->memcheck("tag")) total += memory->usage(tag,nmax);
+  if (atom->memcheck("type")) total += memory->usage(type,nmax);
+  if (atom->memcheck("mask")) total += memory->usage(mask,nmax);
+  if (atom->memcheck("image")) total += memory->usage(image,nmax);
+  if (atom->memcheck("x")) total += memory->usage(x,nmax,3);
+  if (atom->memcheck("v")) total += memory->usage(v,nmax,3);
+  if (atom->memcheck("f")) total += memory->usage(f,nmax*commKK->nthreads,3);
+  if (atom->memcheck("rho")) total += memory->usage(rho,nmax);
+  if (atom->memcheck("dpdTheta")) total += memory->usage(dpdTheta,nmax);
+  if (atom->memcheck("uCond")) total += memory->usage(uCond,nmax);
+  if (atom->memcheck("uMech")) total += memory->usage(uMech,nmax);
+  if (atom->memcheck("uChem")) total += memory->usage(uChem,nmax);
+  if (atom->memcheck("uCG")) total += memory->usage(uCG,nmax);
+  if (atom->memcheck("uCGnew")) total += memory->usage(uCGnew,nmax);
+  if (atom->memcheck("duChem")) total += memory->usage(duChem,nmax);
+
+  return total;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecDPDKokkos::sync(ExecutionSpace space, unsigned int mask)
+{  // sync the DualViews selected by mask to 'space' (DPD views not covered)
+  if (space != Device) {  // every non-Device space takes the host path
+    if (mask & X_MASK) atomKK->k_x.sync<LMPHostType>();
+    if (mask & V_MASK) atomKK->k_v.sync<LMPHostType>();
+    if (mask & F_MASK) atomKK->k_f.sync<LMPHostType>();
+    if (mask & TAG_MASK) atomKK->k_tag.sync<LMPHostType>();
+    if (mask & TYPE_MASK) atomKK->k_type.sync<LMPHostType>();
+    if (mask & MASK_MASK) atomKK->k_mask.sync<LMPHostType>();
+    if (mask & IMAGE_MASK) atomKK->k_image.sync<LMPHostType>();
+    return;
+  }
+  if (mask & X_MASK) atomKK->k_x.sync<LMPDeviceType>();
+  if (mask & V_MASK) atomKK->k_v.sync<LMPDeviceType>();
+  if (mask & F_MASK) atomKK->k_f.sync<LMPDeviceType>();
+  if (mask & TAG_MASK) atomKK->k_tag.sync<LMPDeviceType>();
+  if (mask & TYPE_MASK) atomKK->k_type.sync<LMPDeviceType>();
+  if (mask & MASK_MASK) atomKK->k_mask.sync<LMPDeviceType>();
+  if (mask & IMAGE_MASK) atomKK->k_image.sync<LMPDeviceType>();
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecDPDKokkos::sync_overlapping_device(ExecutionSpace space, unsigned int mask)
+{  // like sync(), but copies via perform_async_copy and only where need_sync() says so
+  if (space != Device) {  // every non-Device space takes the host path
+    if ((mask & X_MASK) && atomKK->k_x.need_sync<LMPHostType>())
+      perform_async_copy<DAT::tdual_x_array>(atomKK->k_x,space);
+    if ((mask & V_MASK) && atomKK->k_v.need_sync<LMPHostType>())
+      perform_async_copy<DAT::tdual_v_array>(atomKK->k_v,space);
+    if ((mask & F_MASK) && atomKK->k_f.need_sync<LMPHostType>())
+      perform_async_copy<DAT::tdual_f_array>(atomKK->k_f,space);
+    if ((mask & TAG_MASK) && atomKK->k_tag.need_sync<LMPHostType>())
+      perform_async_copy<DAT::tdual_tagint_1d>(atomKK->k_tag,space);
+    if ((mask & TYPE_MASK) && atomKK->k_type.need_sync<LMPHostType>())
+      perform_async_copy<DAT::tdual_int_1d>(atomKK->k_type,space);
+    if ((mask & MASK_MASK) && atomKK->k_mask.need_sync<LMPHostType>())
+      perform_async_copy<DAT::tdual_int_1d>(atomKK->k_mask,space);
+    if ((mask & IMAGE_MASK) && atomKK->k_image.need_sync<LMPHostType>())
+      perform_async_copy<DAT::tdual_imageint_1d>(atomKK->k_image,space);
+    return;
+  }
+  if ((mask & X_MASK) && atomKK->k_x.need_sync<LMPDeviceType>())
+    perform_async_copy<DAT::tdual_x_array>(atomKK->k_x,space);
+  if ((mask & V_MASK) && atomKK->k_v.need_sync<LMPDeviceType>())
+    perform_async_copy<DAT::tdual_v_array>(atomKK->k_v,space);
+  if ((mask & F_MASK) && atomKK->k_f.need_sync<LMPDeviceType>())
+    perform_async_copy<DAT::tdual_f_array>(atomKK->k_f,space);
+  if ((mask & TAG_MASK) && atomKK->k_tag.need_sync<LMPDeviceType>())
+    perform_async_copy<DAT::tdual_tagint_1d>(atomKK->k_tag,space);
+  if ((mask & TYPE_MASK) && atomKK->k_type.need_sync<LMPDeviceType>())
+    perform_async_copy<DAT::tdual_int_1d>(atomKK->k_type,space);
+  if ((mask & MASK_MASK) && atomKK->k_mask.need_sync<LMPDeviceType>())
+    perform_async_copy<DAT::tdual_int_1d>(atomKK->k_mask,space);
+  if ((mask & IMAGE_MASK) && atomKK->k_image.need_sync<LMPDeviceType>())
+    perform_async_copy<DAT::tdual_imageint_1d>(atomKK->k_image,space);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecDPDKokkos::modified(ExecutionSpace space, unsigned int mask)
+{  // flag the DualViews selected by mask as modified in 'space' (DPD views not covered)
+  if (space != Device) {  // every non-Device space takes the host path
+    if (mask & X_MASK) atomKK->k_x.modify<LMPHostType>();
+    if (mask & V_MASK) atomKK->k_v.modify<LMPHostType>();
+    if (mask & F_MASK) atomKK->k_f.modify<LMPHostType>();
+    if (mask & TAG_MASK) atomKK->k_tag.modify<LMPHostType>();
+    if (mask & TYPE_MASK) atomKK->k_type.modify<LMPHostType>();
+    if (mask & MASK_MASK) atomKK->k_mask.modify<LMPHostType>();
+    if (mask & IMAGE_MASK) atomKK->k_image.modify<LMPHostType>();
+    return;
+  }
+  if (mask & X_MASK) atomKK->k_x.modify<LMPDeviceType>();
+  if (mask & V_MASK) atomKK->k_v.modify<LMPDeviceType>();
+  if (mask & F_MASK) atomKK->k_f.modify<LMPDeviceType>();
+  if (mask & TAG_MASK) atomKK->k_tag.modify<LMPDeviceType>();
+  if (mask & TYPE_MASK) atomKK->k_type.modify<LMPDeviceType>();
+  if (mask & MASK_MASK) atomKK->k_mask.modify<LMPDeviceType>();
+  if (mask & IMAGE_MASK) atomKK->k_image.modify<LMPDeviceType>();
+}
+
diff --git a/src/USER-DPD/atom_vec_dpd_kokkos.h b/src/USER-DPD/atom_vec_dpd_kokkos.h
new file mode 100644
index 0000000000..d108e58ae7
--- /dev/null
+++ b/src/USER-DPD/atom_vec_dpd_kokkos.h
@@ -0,0 +1,135 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef ATOM_CLASS
+
+AtomStyle(dpd/kk,AtomVecDPDKokkos)
+
+#else
+
+#ifndef LMP_ATOM_VEC_DPD_KOKKOS_H
+#define LMP_ATOM_VEC_DPD_KOKKOS_H
+
+#include "atom_vec_kokkos.h"
+#include "kokkos_type.h"
+
+namespace LAMMPS_NS {
+
+class AtomVecDPDKokkos : public AtomVecKokkos {  // atom style registered as "dpd/kk" via the AtomStyle() entry above
+ public:
+  AtomVecDPDKokkos(class LAMMPS *);
+  virtual ~AtomVecDPDKokkos() {}
+  void grow(int);
+  void copy(int, int, int);
+  int pack_comm(int, int *, double *, int, int *);
+  int pack_comm_vel(int, int *, double *, int, int *);
+  int pack_comm_hybrid(int, int *, double *);
+  void unpack_comm(int, int, double *);
+  void unpack_comm_vel(int, int, double *);
+  int unpack_comm_hybrid(int, int, double *);
+  int pack_reverse(int, int, double *);
+  void unpack_reverse(int, int *, double *);
+  int pack_border(int, int *, double *, int, int *);
+  int pack_border_vel(int, int *, double *, int, int *);
+  int pack_border_hybrid(int, int *, double *);
+  void unpack_border(int, int, double *);
+  void unpack_border_vel(int, int, double *);
+  int unpack_border_hybrid(int, int, double *);
+  int pack_exchange(int, double *);
+  int unpack_exchange(double *);
+  int size_restart();
+  int pack_restart(int, double *);
+  int unpack_restart(double *);
+  void create_atom(int, double *);
+  void data_atom(double *, tagint, char **);
+  int data_atom_hybrid(int, char **);
+  void pack_data(double **);
+  int pack_data_hybrid(int, double *);
+  void write_data(FILE *, int, double **);
+  int write_data_hybrid(FILE *, double *);
+  bigint memory_usage();
+  // pointer/view refresh, followed by the communication routines that take Kokkos dual views
+  void grow_reset();
+  int pack_comm_kokkos(const int &n, const DAT::tdual_int_2d &k_sendlist,
+                       const int & iswap,
+                       const DAT::tdual_xfloat_2d &buf,
+                       const int &pbc_flag, const int pbc[]);
+  void unpack_comm_kokkos(const int &n, const int &nfirst,
+                          const DAT::tdual_xfloat_2d &buf);
+  int pack_comm_self(const int &n, const DAT::tdual_int_2d &list,
+                     const int & iswap, const int nfirst,
+                     const int &pbc_flag, const int pbc[]);
+  int pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist,
+                         DAT::tdual_xfloat_2d buf,int iswap,
+                         int pbc_flag, int *pbc, ExecutionSpace space);
+  void unpack_border_kokkos(const int &n, const int &nfirst,
+                            const DAT::tdual_xfloat_2d &buf,
+                            ExecutionSpace space);
+  int pack_exchange_kokkos(const int &nsend,DAT::tdual_xfloat_2d &buf,
+                           DAT::tdual_int_1d k_sendlist,
+                           DAT::tdual_int_1d k_copylist,
+                           ExecutionSpace space, int dim,
+                           X_FLOAT lo, X_FLOAT hi);
+  int unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf, int nrecv,
+                             int nlocal, int dim, X_FLOAT lo, X_FLOAT hi,
+                             ExecutionSpace space);
+  // data-movement hooks, each selected by execution space and a field bit mask
+  void sync(ExecutionSpace space, unsigned int mask);
+  void modified(ExecutionSpace space, unsigned int mask);
+  void sync_overlapping_device(ExecutionSpace space, unsigned int mask);
+  double *uCond,*uMech,*uChem,*uCG,*uCGnew,*rho,*dpdTheta;  // raw per-atom DPD array pointers, refreshed in grow_reset()
+  double *duChem;
+
+ protected:
+  DAT::t_efloat_1d d_uCond, d_uMech, d_uChem, d_uCG, d_uCGnew,d_rho,d_dpdTheta,d_duChem;  // device views of the DPD arrays
+  HAT::t_efloat_1d h_uCond, h_uMech, h_uChem, h_uCG, h_uCGnew,h_rho,h_dpdTheta,h_duChem;  // host views of the DPD arrays
+  // raw pointers to the standard per-atom arrays, refreshed in grow_reset()
+  tagint *tag;
+  imageint *image;
+  int *type,*mask;
+  double **x,**v,**f;
+  // device (d_) and host (h_) views of the same standard arrays
+  DAT::t_tagint_1d d_tag;
+  HAT::t_tagint_1d h_tag;
+  DAT::t_imageint_1d d_image;
+  HAT::t_imageint_1d h_image;
+  DAT::t_int_1d d_type, d_mask;
+  HAT::t_int_1d h_type, h_mask;
+
+  DAT::t_x_array d_x;
+  DAT::t_v_array d_v;
+  DAT::t_f_array d_f;
+  HAT::t_x_array h_x;
+  HAT::t_v_array h_v;
+  HAT::t_f_array h_f;
+
+  DAT::tdual_int_1d k_count;  // dual view of length 1, allocated in the constructor
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Per-processor system is too big
+
+The number of owned atoms plus ghost atoms on a single
+processor must fit in 32-bit integer.
+
+E: Invalid atom type in Atoms section of data file
+
+Atom types must range from 1 to specified # of types.
+
+*/
diff --git a/src/USER-DPD/pair_dpd_fdt_energy_kokkos.cpp b/src/USER-DPD/pair_dpd_fdt_energy_kokkos.cpp
new file mode 100644
index 0000000000..f7e1fecc09
--- /dev/null
+++ b/src/USER-DPD/pair_dpd_fdt_energy_kokkos.cpp
@@ -0,0 +1,373 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: James Larentzos (U.S. Army Research Laboratory)
+------------------------------------------------------------------------- */
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "pair_dpd_fdt_energy_kokkos.h"
+#include "kokkos.h"
+#include "atom_kokkos.h"
+#include "atom_vec.h"
+#include "comm.h"
+#include "update.h"
+#include "fix.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "random_mars.h"
+#include "math_const.h"
+#include "memory.h"
+#include "modify.h"
+#include "error.h"
+#include "atom_masks.h"
+
+using namespace LAMMPS_NS;
+using namespace MathConst;
+
+#define KOKKOS_CUDA_MAX_THREADS 256
+#define KOKKOS_CUDA_MIN_BLOCKS 8
+
+#define EPSILON 1.0e-10
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+PairDPDfdtEnergyKokkos<DeviceType>::PairDPDfdtEnergyKokkos(LAMMPS *lmp) : PairDPDfdtEnergy(lmp)
+{
+  atomKK = (AtomKokkos *) atom;  // atom pointer cast to its Kokkos subclass; compute() uses its k_* views
+  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;  // execution space for the DeviceType template argument
+  cutsq = NULL;  // allocate() later destroys and re-creates cutsq together with k_cutsq
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+PairDPDfdtEnergyKokkos<DeviceType>::~PairDPDfdtEnergyKokkos()
+{
+  if (allocated) {  // false after cleanup_copy(), which clears 'allocated'
+    memory->destroy_kokkos(k_eatom,eatom);
+    memory->destroy_kokkos(k_vatom,vatom);
+    k_cutsq = DAT::tdual_ffloat_2d();  // replace with an empty dual view
+    memory->sfree(cutsq);
+    eatom = NULL;  // presumably nulled so the base-class destructor does not release them again - TODO confirm
+    vatom = NULL;
+    cutsq = NULL;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairDPDfdtEnergyKokkos<DeviceType>::cleanup_copy() {
+  // WHY needed: this prevents parent copy from deallocating any arrays
+  allocated = 0;  // makes the destructor's if (allocated) block a no-op
+  cutsq = NULL;
+  eatom = NULL;
+  vatom = NULL;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairDPDfdtEnergyKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
+{  // run the generic pair_compute() kernel for this style and accumulate energy/virial
+  eflag = eflag_in;
+  vflag = vflag_in;
+
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  else evflag = vflag_fdotr = 0;
+
+  // per-atom energy/virial arrays are destroyed and re-created on every call that requests them
+
+  if (eflag_atom) {
+    memory->destroy_kokkos(k_eatom,eatom);
+    memory->create_kokkos(k_eatom,eatom,maxeatom,"pair:eatom");
+    d_eatom = k_eatom.view<DeviceType>();
+  }
+  if (vflag_atom) {
+    memory->destroy_kokkos(k_vatom,vatom);
+    memory->create_kokkos(k_vatom,vatom,maxvatom,6,"pair:vatom");
+    d_vatom = k_vatom.view<DeviceType>();
+  }
+  // sync inputs to this execution space, then flag what the kernel will write
+  atomKK->sync(execution_space,datamask_read);
+  k_cutsq.template sync<DeviceType>();
+  if (eflag || vflag) atomKK->modified(execution_space,datamask_modify);
+  else atomKK->modified(execution_space,F_MASK);  // no energy/virial requested: only forces are flagged
+  // cache views and scalars read by the kernel
+  x = atomKK->k_x.view<DeviceType>();
+  c_x = atomKK->k_x.view<DeviceType>();
+  f = atomKK->k_f.view<DeviceType>();
+  type = atomKK->k_type.view<DeviceType>();
+  tag = atomKK->k_tag.view<DeviceType>();
+  nlocal = atom->nlocal;
+  nall = atom->nlocal + atom->nghost;
+  newton_pair = force->newton_pair;
+  special_lj[0] = force->special_lj[0];
+  special_lj[1] = force->special_lj[1];
+  special_lj[2] = force->special_lj[2];
+  special_lj[3] = force->special_lj[3];
+
+  // loop over neighbors of my atoms
+
+  EV_FLOAT ev = pair_compute<PairDPDfdtEnergyKokkos<DeviceType>,void >(this,(NeighListKokkos<DeviceType>*)list);
+  // fold the kernel's reduction result into the global accumulators
+  if (eflag_global) eng_vdwl += ev.evdwl;
+  if (vflag_global) {
+    virial[0] += ev.v[0];
+    virial[1] += ev.v[1];
+    virial[2] += ev.v[2];
+    virial[3] += ev.v[3];
+    virial[4] += ev.v[4];
+    virial[5] += ev.v[5];
+  }
+
+  if (vflag_fdotr) pair_virial_fdotr_compute(this);
+  // per-atom results were written in DeviceType's space; copy them to the host
+  if (eflag_atom) {
+    k_eatom.template modify<DeviceType>();
+    k_eatom.template sync<LMPHostType>();
+  }
+
+  if (vflag_atom) {
+    k_vatom.template modify<DeviceType>();
+    k_vatom.template sync<LMPHostType>();
+  }
+}
+
+// Returns the conservative DPD force a0*wr divided by r (wr = 1 - r/cut); 0 when r < EPSILON. i, j are unused.
+// NOTE(review): cut and a0 are not declared in PairDPDfdtEnergyKokkos (presumably base-class arrays) - confirm device access.
+template<class DeviceType>
+template<bool STACKPARAMS, class Specialisation>
+KOKKOS_INLINE_FUNCTION
+F_FLOAT PairDPDfdtEnergyKokkos<DeviceType>::
+compute_fpair(const F_FLOAT& rsq, const int& i, const int&j, const int& itype, const int& jtype) const {
+  (void) i;
+  (void) j;
+  const F_FLOAT r = sqrt(rsq);
+  if (r < EPSILON) return 0;     // r can be 0.0 in DPD systems
+  const F_FLOAT rinv = 1.0/r;
+  const F_FLOAT wr = 1.0 - r/cut[itype][jtype];
+  // conservative force = a0 * wr
+  return a0[itype][jtype]*wr*rinv;
+}
+
+// Returns the conservative DPD pair energy, shifted to 0.0 at the cutoff; 0 when r < EPSILON. i, j are unused.
+template<class DeviceType>
+template<bool STACKPARAMS, class Specialisation>
+KOKKOS_INLINE_FUNCTION
+F_FLOAT PairDPDfdtEnergyKokkos<DeviceType>::
+compute_evdwl(const F_FLOAT& rsq, const int& i, const int&j, const int& itype, const int& jtype) const {
+  (void) i;
+  (void) j;
+  const F_FLOAT r = sqrt(rsq);
+  if (r < EPSILON) return 0;     // r can be 0.0 in DPD systems
+  const F_FLOAT wr = 1.0 - r/cut[itype][jtype];
+  const F_FLOAT wd = wr*wr;
+  // unshifted eng of conservative term:
+  // evdwl = -a0[itype][jtype]*r * (1.0-0.5*r/cut[itype][jtype]);
+  // eng shifted to 0.0 at cutoff
+  return 0.5*a0[itype][jtype]*cut[itype][jtype] * wd;
+}
+
+
+/*
+  int i,j,ii,jj,inum,jnum,itype,jtype;
+  double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
+  double rsq,r,rinv,wd,wr,factor_dpd;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  evdwl = 0.0;
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  else evflag = vflag_fdotr = 0;
+
+  double **x = atom->x;
+  double **f = atom->f;
+  int *type = atom->type;
+  int nlocal = atom->nlocal;
+  double *special_lj = force->special_lj;
+  int newton_pair = force->newton_pair;
+
+  inum = list->inum;
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+
+  for (ii = 0; ii < inum; ii++) {
+    i = ilist[ii];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      factor_dpd = special_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq < cutsq[itype][jtype]) {
+        r = sqrt(rsq);
+        if (r < EPSILON) continue;     // r can be 0.0 in DPD systems
+        rinv = 1.0/r;
+        wr = 1.0 - r/cut[itype][jtype];
+        wd = wr*wr;
+
+        // conservative force = a0 * wr
+        fpair = a0[itype][jtype]*wr;
+        fpair *= factor_dpd*rinv;
+
+        f[i][0] += delx*fpair;
+        f[i][1] += dely*fpair;
+        f[i][2] += delz*fpair;
+        if (newton_pair || j < nlocal) {
+          f[j][0] -= delx*fpair;
+          f[j][1] -= dely*fpair;
+          f[j][2] -= delz*fpair;
+        }
+
+        if (eflag) {
+          // unshifted eng of conservative term:
+          // evdwl = -a0[itype][jtype]*r * (1.0-0.5*r/cut[itype][jtype]);
+          // eng shifted to 0.0 at cutoff
+          evdwl = 0.5*a0[itype][jtype]*cut[itype][jtype] * wd;
+          evdwl *= factor_dpd;
+        }
+
+        if (evflag) ev_tally(i,j,nlocal,newton_pair,
+                             evdwl,0.0,fpair,delx,dely,delz);
+      }
+    }
+  }
+
+  if (vflag_fdotr) virial_fdotr_compute();
+}
+*/
+
+/* ----------------------------------------------------------------------
+   allocate all arrays
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairDPDfdtEnergyKokkos<DeviceType>::allocate()
+{
+  PairDPDfdtEnergy::allocate();  // base-class allocation runs first
+
+  int n = atom->ntypes;
+  memory->destroy(cutsq);  // drop the existing cutsq; it is re-created through k_cutsq on the next line
+  memory->create_kokkos(k_cutsq,cutsq,n+1,n+1,"pair:cutsq");  // (ntypes+1) x (ntypes+1)
+  d_cutsq = k_cutsq.template view<DeviceType>();
+}
+
+/* ----------------------------------------------------------------------
+   global settings
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairDPDfdtEnergyKokkos<DeviceType>::settings(int narg, char **arg)
+{
+  if (narg != 2) error->all(FLERR,"Illegal pair_style command");  // exactly two arguments are accepted
+
+  PairDPDfdtEnergy::settings(2,arg);  // the arguments themselves are parsed by the base class
+}
+
+/* ----------------------------------------------------------------------
+   init specific to this pair style
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairDPDfdtEnergyKokkos<DeviceType>::init_style()
+{
+  PairDPDfdtEnergy::init_style();  // base-class setup; presumably issues the neighbor request adjusted below - TODO confirm
+
+  neighflag = lmp->kokkos->neighflag;
+  int irequest = neighbor->nrequest - 1;  // index of the last neighbor request in the list
+  // tag the request as host-only or device, depending on DeviceType
+  neighbor->requests[irequest]->
+    kokkos_host = Kokkos::Impl::is_same<DeviceType,LMPHostType>::value &&
+    !Kokkos::Impl::is_same<DeviceType,LMPDeviceType>::value;
+  neighbor->requests[irequest]->
+    kokkos_device = Kokkos::Impl::is_same<DeviceType,LMPDeviceType>::value;
+  // only half neighbor lists are supported; any other neighflag is a fatal error
+  if (neighflag == HALF || neighflag == HALFTHREAD) {
+    neighbor->requests[irequest]->full = 0;
+    neighbor->requests[irequest]->half = 1;
+    neighbor->requests[irequest]->full_cluster = 0;
+  } else {
+    error->all(FLERR,"Cannot use chosen neighbor list style with dpd/fdt/energy/kk");
+  }
+  // NOTE(review): the disabled block below looks like the non-Kokkos init_style() body kept for reference - confirm
+/*
+  if (comm->ghost_velocity == 0)
+    error->all(FLERR,"Pair dpd/fdt/energy requires ghost atoms store velocity");
+
+  // if newton off, forces between atoms ij will be double computed
+  // using different random numbers
+
+  if (force->newton_pair == 0 && comm->me == 0) error->warning(FLERR,
+      "Pair dpd/fdt/energy requires newton pair on");
+
+  int irequest = neighbor->request(this,instance_me);
+  neighbor->requests[irequest]->ssa = 0;
+  for (int i = 0; i < modify->nfix; i++)
+    if (strcmp(modify->fix[i]->style,"shardlow") == 0)
+      neighbor->requests[irequest]->ssa = 1;
+
+  bool eos_flag = false;
+  for (int i = 0; i < modify->nfix; i++)
+    if (strncmp(modify->fix[i]->style,"eos",3) == 0) eos_flag = true;
+  if(!eos_flag) error->all(FLERR,"pair_style dpd/fdt/energy requires an EOS to be specified");
+*/
+}
+
+/* ----------------------------------------------------------------------
+   init for one type pair i,j and corresponding j,i
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+double PairDPDfdtEnergyKokkos<DeviceType>::init_one(int i, int j)
+{
+  double cutone = PairDPDfdtEnergy::init_one(i,j);
+  // mirror the squared cutoff into the fixed-size table when both types fit in it
+  if(i<MAX_TYPES_STACKPARAMS+1 && j<MAX_TYPES_STACKPARAMS+1) {
+    m_cutsq[j][i] = m_cutsq[i][j] = cutone*cutone;
+  }
+  k_cutsq.h_view(i,j) = k_cutsq.h_view(j,i) = cutone*cutone;  // store both orderings, as done for m_cutsq
+  k_cutsq.template modify<LMPHostType>();
+  // host copy is flagged modified here; compute() syncs k_cutsq before running the kernel
+  return cutone;
+}
+
+
+namespace LAMMPS_NS {  // explicit instantiations of the pair style template
+template class PairDPDfdtEnergyKokkos<LMPDeviceType>;
+#ifdef KOKKOS_HAVE_CUDA
+template class PairDPDfdtEnergyKokkos<LMPHostType>;  // presumably a distinct type from LMPDeviceType only in CUDA builds - TODO confirm
+#endif
+}
+
diff --git a/src/USER-DPD/pair_dpd_fdt_energy_kokkos.h b/src/USER-DPD/pair_dpd_fdt_energy_kokkos.h
new file mode 100644
index 0000000000..a8a5f25801
--- /dev/null
+++ b/src/USER-DPD/pair_dpd_fdt_energy_kokkos.h
@@ -0,0 +1,119 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(dpd/fdt/energy/kk,PairDPDfdtEnergyKokkos<LMPDeviceType>)
+PairStyle(dpd/fdt/energy/kk/device,PairDPDfdtEnergyKokkos<LMPDeviceType>)
+PairStyle(dpd/fdt/energy/kk/host,PairDPDfdtEnergyKokkos<LMPHostType>)
+
+#else
+
+#ifndef LMP_PAIR_DPD_FDT_ENERGY_KOKKOS_H
+#define LMP_PAIR_DPD_FDT_ENERGY_KOKKOS_H
+
+#include "pair_kokkos.h"
+#include "pair_dpd_fdt_energy.h"
+#include "neigh_list_kokkos.h"
+
+namespace LAMMPS_NS {
+
+template<class DeviceType>
+class PairDPDfdtEnergyKokkos : public PairDPDfdtEnergy {
+ public:
+  enum {EnabledNeighFlags=HALFTHREAD|HALF};  // only the half neighbor list flavours are enabled
+  enum {COUL_FLAG=0};  // presumably tells the generic pair kernel there is no Coulomb term - TODO confirm
+  typedef DeviceType device_type;
+  PairDPDfdtEnergyKokkos(class LAMMPS *);
+  virtual ~PairDPDfdtEnergyKokkos();
+  virtual void compute(int, int);
+  virtual void settings(int, char **);
+  void init_style();
+  double init_one(int, int);
+
+ protected:
+  void cleanup_copy();
+  // per-pair force and energy evaluators, templated on STACKPARAMS and Specialisation
+  template<bool STACKPARAMS, class Specialisation>
+  KOKKOS_INLINE_FUNCTION
+  F_FLOAT compute_fpair(const F_FLOAT& rsq, const int& i, const int&j, const int& itype, const int& jtype) const;
+
+  template<bool STACKPARAMS, class Specialisation>
+  KOKKOS_INLINE_FUNCTION
+  F_FLOAT compute_evdwl(const F_FLOAT& rsq, const int& i, const int&j, const int& itype, const int& jtype) const;
+  // squared cutoffs filled in init_one(); atom data views assigned in compute()
+  F_FLOAT m_cutsq[MAX_TYPES_STACKPARAMS+1][MAX_TYPES_STACKPARAMS+1];
+  typename ArrayTypes<DeviceType>::t_x_array_randomread x;
+  typename ArrayTypes<DeviceType>::t_x_array c_x;
+  typename ArrayTypes<DeviceType>::t_f_array f;
+  typename ArrayTypes<DeviceType>::t_int_1d_randomread type;
+  // per-atom energy/virial dual views and their DeviceType views, (re)created in compute()
+  DAT::tdual_efloat_1d k_eatom;
+  DAT::tdual_virial_array k_vatom;
+  typename ArrayTypes<DeviceType>::t_efloat_1d d_eatom;
+  typename ArrayTypes<DeviceType>::t_virial_array d_vatom;
+  typename ArrayTypes<DeviceType>::t_tagint_1d tag;
+
+  int newton_pair;
+  double special_lj[4];  // copied from force->special_lj in compute()
+
+  typename ArrayTypes<DeviceType>::tdual_ffloat_2d k_cutsq;
+  typename ArrayTypes<DeviceType>::t_ffloat_2d d_cutsq;
+
+
+  int neighflag;
+  int nlocal,nall,eflag,vflag;
+
+  void allocate();
+  // friends: the generic pair-compute templates are granted access to the protected members above
+  friend class PairComputeFunctor<PairDPDfdtEnergyKokkos,HALF,true>;
+  friend class PairComputeFunctor<PairDPDfdtEnergyKokkos,HALFTHREAD,true>;
+  friend class PairComputeFunctor<PairDPDfdtEnergyKokkos,HALF,false>;
+  friend class PairComputeFunctor<PairDPDfdtEnergyKokkos,HALFTHREAD,false>;
+  friend EV_FLOAT pair_compute_neighlist<PairDPDfdtEnergyKokkos,HALF,void>(PairDPDfdtEnergyKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairDPDfdtEnergyKokkos,HALFTHREAD,void>(PairDPDfdtEnergyKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute<PairDPDfdtEnergyKokkos,void>(PairDPDfdtEnergyKokkos*,NeighListKokkos<DeviceType>*);
+  friend void pair_virial_fdotr_compute<PairDPDfdtEnergyKokkos>(PairDPDfdtEnergyKokkos*);
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Illegal ... command
+
+Self-explanatory.  Check the input script syntax and compare to the
+documentation for the command.  You can use -echo screen as a
+command-line option when running LAMMPS to see the offending line.
+
+E: Incorrect args for pair coefficients
+
+Self-explanatory.  Check the input script or data file.
+
+E: Pair dpd/fdt/energy requires ghost atoms store velocity
+
+Use the communicate vel yes command to enable this.
+
+E: Pair dpd/fdt/energy requires newton pair on
+
+Self-explanatory.
+
+E: All pair coeffs are not set
+
+All pair coefficients must be set in the data file or by the
+pair_coeff command before running a simulation.
+
+*/

From 8f78157202299a5bf9d860c90f30c8340d2d0cfc Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Mon, 21 Nov 2016 12:32:48 -0500
Subject: [PATCH 002/267] USER-DPD: apply unpack_comm_hybrid bugfix d31121b to
 atom_vec_dpd_kokkos.cpp

---
 src/USER-DPD/atom_vec_dpd_kokkos.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/USER-DPD/atom_vec_dpd_kokkos.cpp b/src/USER-DPD/atom_vec_dpd_kokkos.cpp
index c58b592e53..c79559172f 100644
--- a/src/USER-DPD/atom_vec_dpd_kokkos.cpp
+++ b/src/USER-DPD/atom_vec_dpd_kokkos.cpp
@@ -1205,6 +1205,8 @@ int AtomVecDPDKokkos::unpack_comm_hybrid(int n, int first, double *buf)
     h_uCond(i) = buf[m++];
     h_uMech(i) = buf[m++];
     h_uChem(i) = buf[m++];
+    h_uCG(i) = buf[m++];
+    h_uCGnew(i) = buf[m++];
   }
   return m;
 }

From 75907916045ff25745389db4b11773c820bc13de Mon Sep 17 00:00:00 2001
From: stamoor <stamoor@sandia.gov>
Date: Mon, 21 Nov 2016 13:54:14 -0700
Subject: [PATCH 003/267] Integrating atom_vec_dpd into the Kokkos package

---
 src/KOKKOS/atom_vec_dpd_kokkos.cpp | 1874 ++++++++++++++++++++++++++++
 src/KOKKOS/atom_vec_dpd_kokkos.h   |  135 ++
 2 files changed, 2009 insertions(+)
 create mode 100644 src/KOKKOS/atom_vec_dpd_kokkos.cpp
 create mode 100644 src/KOKKOS/atom_vec_dpd_kokkos.h

diff --git a/src/KOKKOS/atom_vec_dpd_kokkos.cpp b/src/KOKKOS/atom_vec_dpd_kokkos.cpp
new file mode 100644
index 0000000000..c79559172f
--- /dev/null
+++ b/src/KOKKOS/atom_vec_dpd_kokkos.cpp
@@ -0,0 +1,1874 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include <stdlib.h>
+#include "atom_vec_dpd_kokkos.h"
+#include "atom_kokkos.h"
+#include "comm_kokkos.h"
+#include "domain.h"
+#include "modify.h"
+#include "fix.h"
+#include "atom_masks.h"
+#include "memory.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+
+#define DELTA 10000
+
+/* ---------------------------------------------------------------------- */
+
+AtomVecDPDKokkos::AtomVecDPDKokkos(LAMMPS *lmp) : AtomVecKokkos(lmp)
+{
+  molecular = 0;
+  mass_type = 1;
+  // per-atom value counts used by the communication and data-file routines
+  comm_x_only = comm_f_only = 0;
+  size_forward = 7;  // 3 coordinates + dpdTheta, uCond, uMech, uChem, as packed by AtomVecDPDKokkos_PackComm
+  size_reverse = 3;
+  size_border = 12;
+  size_velocity = 3;
+  size_data_atom = 6;
+  size_data_vel = 4;
+  xcol_data = 4;
+
+  atom->rho_flag = 1;
+  atom->dpd_flag = 1;
+
+  k_count = DAT::tdual_int_1d("atom::k_count",1);  // dual view of length 1
+  atomKK = (AtomKokkos *) atom;  // Kokkos subclasses of the atom and comm objects
+  commKK = (CommKokkos *) comm;
+}
+
+/* ----------------------------------------------------------------------
+   grow atom arrays
+   n = 0 grows arrays by DELTA
+   n > 0 allocates arrays to size n
+------------------------------------------------------------------------- */
+
+void AtomVecDPDKokkos::grow(int n)
+{
+  if (n == 0) nmax += DELTA;
+  else nmax = n;
+  atomKK->nmax = nmax;
+  if (nmax < 0 || nmax > MAXSMALLINT)
+    error->one(FLERR,"Per-processor system is too big");
+  // sync everything to the device and flag it modified there before the arrays are regrown
+  sync(Device,ALL_MASK);
+  modified(Device,ALL_MASK);
+
+  memory->grow_kokkos(atomKK->k_tag,atomKK->tag,nmax,"atom:tag");
+  memory->grow_kokkos(atomKK->k_type,atomKK->type,nmax,"atom:type");
+  memory->grow_kokkos(atomKK->k_mask,atomKK->mask,nmax,"atom:mask");
+  memory->grow_kokkos(atomKK->k_image,atomKK->image,nmax,"atom:image");
+
+  memory->grow_kokkos(atomKK->k_x,atomKK->x,nmax,3,"atom:x");
+  memory->grow_kokkos(atomKK->k_v,atomKK->v,nmax,3,"atom:v");
+  memory->grow_kokkos(atomKK->k_f,atomKK->f,nmax,3,"atom:f");
+
+  // DPD-specific per-atom arrays
+  memory->grow_kokkos(atomKK->k_rho,atomKK->rho,nmax,"atom:rho");
+  memory->grow_kokkos(atomKK->k_dpdTheta,atomKK->dpdTheta,nmax,"atom:dpdTheta");
+  memory->grow_kokkos(atomKK->k_uCond,atomKK->uCond,nmax,"atom:uCond");
+  memory->grow_kokkos(atomKK->k_uMech,atomKK->uMech,nmax,"atom:uMech");
+  memory->grow_kokkos(atomKK->k_uChem,atomKK->uChem,nmax,"atom:uChem");
+  memory->grow_kokkos(atomKK->k_uCG,atomKK->uCG,nmax,"atom:uCG");
+  memory->grow_kokkos(atomKK->k_uCGnew,atomKK->uCGnew,nmax,"atom:uCGnew");
+  memory->grow_kokkos(atomKK->k_duChem,atomKK->duChem,nmax,"atom:duChem");
+  // refresh the cached pointers/views, then sync back to the host
+  grow_reset();
+  sync(Host,ALL_MASK);
+  // let every fix listed in atom->extra_grow grow its own per-atom arrays
+  if (atom->nextra_grow)
+    for (int iextra = 0; iextra < atom->nextra_grow; iextra++)
+      modify->fix[atom->extra_grow[iextra]]->grow_arrays(nmax);
+}
+
+/* ----------------------------------------------------------------------
+   reset local array ptrs
+------------------------------------------------------------------------- */
+
+void AtomVecDPDKokkos::grow_reset()
+{  // re-read each raw pointer and its device (d_) / host (h_) view from atomKK
+  tag = atomKK->tag;
+  d_tag = atomKK->k_tag.d_view;
+  h_tag = atomKK->k_tag.h_view;
+
+  type = atomKK->type;
+  d_type = atomKK->k_type.d_view;
+  h_type = atomKK->k_type.h_view;
+  mask = atomKK->mask;
+  d_mask = atomKK->k_mask.d_view;
+  h_mask = atomKK->k_mask.h_view;
+  image = atomKK->image;
+  d_image = atomKK->k_image.d_view;
+  h_image = atomKK->k_image.h_view;
+
+  x = atomKK->x;
+  d_x = atomKK->k_x.d_view;
+  h_x = atomKK->k_x.h_view;
+  v = atomKK->v;
+  d_v = atomKK->k_v.d_view;
+  h_v = atomKK->k_v.h_view;
+  f = atomKK->f;
+  d_f = atomKK->k_f.d_view;
+  h_f = atomKK->k_f.h_view;
+
+  rho = atomKK->rho;
+  d_rho = atomKK->k_rho.d_view;
+  h_rho = atomKK->k_rho.h_view;
+  dpdTheta = atomKK->dpdTheta;
+  d_dpdTheta = atomKK->k_dpdTheta.d_view;
+  h_dpdTheta = atomKK->k_dpdTheta.h_view;
+  uCond = atomKK->uCond;
+  d_uCond = atomKK->k_uCond.d_view;
+  h_uCond = atomKK->k_uCond.h_view;
+  uMech = atomKK->uMech;
+  d_uMech = atomKK->k_uMech.d_view;
+  h_uMech = atomKK->k_uMech.h_view;
+  uChem = atomKK->uChem;
+  d_uChem = atomKK->k_uChem.d_view;
+  h_uChem = atomKK->k_uChem.h_view;
+  uCG = atomKK->uCG;
+  d_uCG = atomKK->k_uCG.d_view;
+  h_uCG = atomKK->k_uCG.h_view;
+  uCGnew = atomKK->uCGnew;
+  d_uCGnew = atomKK->k_uCGnew.d_view;
+  h_uCGnew = atomKK->k_uCGnew.h_view;
+  duChem = atomKK->duChem;
+  d_duChem = atomKK->k_duChem.d_view;
+  h_duChem = atomKK->k_duChem.h_view;
+}
+
+/* ----------------------------------------------------------------------
+   copy atom I info to atom J
+------------------------------------------------------------------------- */
+
+void AtomVecDPDKokkos::copy(int i, int j, int delflag)
+{  // copies through the host views; rho and duChem are not copied here
+  h_tag[j] = h_tag[i];
+  h_type[j] = h_type[i];
+  h_mask[j] = h_mask[i];
+  h_image[j] = h_image[i];
+  h_x(j,0) = h_x(i,0);
+  h_x(j,1) = h_x(i,1);
+  h_x(j,2) = h_x(i,2);
+  h_v(j,0) = h_v(i,0);
+  h_v(j,1) = h_v(i,1);
+  h_v(j,2) = h_v(i,2);
+  h_dpdTheta[j] = h_dpdTheta[i];
+  h_uCond[j] = h_uCond[i];
+  h_uMech[j] = h_uMech[i];
+  h_uChem[j] = h_uChem[i];
+  h_uCG[j] = h_uCG[i];
+  h_uCGnew[j] = h_uCGnew[i];
+  // forward the copy to every fix listed in atom->extra_grow
+  if (atom->nextra_grow)
+    for (int iextra = 0; iextra < atom->nextra_grow; iextra++)
+      modify->fix[atom->extra_grow[iextra]]->copy_arrays(i,j,delflag);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType,int PBC_FLAG,int TRICLINIC>
+struct AtomVecDPDKokkos_PackComm {  // functor: packs x, dpdTheta, uCond, uMech, uChem of the listed atoms into _buf
+  typedef DeviceType device_type;
+
+  typename ArrayTypes<DeviceType>::t_x_array_randomread _x;
+  typename ArrayTypes<DeviceType>::t_efloat_1d _dpdTheta,_uCond,_uMech,_uChem;
+  typename ArrayTypes<DeviceType>::t_xfloat_2d_um _buf;
+  typename ArrayTypes<DeviceType>::t_int_2d_const _list;
+  const int _iswap;
+  X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz;
+  X_FLOAT _pbc[6];
+  // captures the DeviceType views of the inputs plus box lengths, tilt factors and the six pbc flags
+  AtomVecDPDKokkos_PackComm(
+      const typename DAT::tdual_x_array &x,
+      const typename DAT::tdual_efloat_1d &dpdTheta,
+      const typename DAT::tdual_efloat_1d &uCond,
+      const typename DAT::tdual_efloat_1d &uMech,
+      const typename DAT::tdual_efloat_1d &uChem,
+      const typename DAT::tdual_xfloat_2d &buf,
+      const typename DAT::tdual_int_2d &list,
+      const int & iswap,
+      const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd,
+      const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc):
+      _x(x.view<DeviceType>()),
+      _dpdTheta(dpdTheta.view<DeviceType>()),
+      _uCond(uCond.view<DeviceType>()),
+      _uMech(uMech.view<DeviceType>()),
+      _uChem(uChem.view<DeviceType>()),
+      _list(list.view<DeviceType>()),_iswap(iswap),
+      _xprd(xprd),_yprd(yprd),_zprd(zprd),
+      _xy(xy),_xz(xz),_yz(yz) {
+        const size_t maxsend = (buf.view<DeviceType>().dimension_0()*buf.view<DeviceType>().dimension_1())/3;
+        const size_t elements = 3;  // NOTE(review): operator() writes columns 0..6 (7 values per atom) - confirm 3 is intended
+        buffer_view<DeviceType>(_buf,buf,maxsend,elements);  // presumably shapes _buf as maxsend x elements over buf - TODO confirm
+        _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2];
+        _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5];
+  };
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i) const {
+      const int j = _list(_iswap,i);  // atom index stored at entry i of row _iswap
+      if (PBC_FLAG == 0) {  // coordinates copied unshifted
+          _buf(i,0) = _x(j,0);
+          _buf(i,1) = _x(j,1);
+          _buf(i,2) = _x(j,2);
+      } else {
+        if (TRICLINIC == 0) {  // shift by pbc flag times box length per dimension
+          _buf(i,0) = _x(j,0) + _pbc[0]*_xprd;
+          _buf(i,1) = _x(j,1) + _pbc[1]*_yprd;
+          _buf(i,2) = _x(j,2) + _pbc[2]*_zprd;
+        } else {  // x and y shifts additionally include the xy, xz, yz terms
+          _buf(i,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz;
+          _buf(i,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz;
+          _buf(i,2) = _x(j,2) + _pbc[2]*_zprd;
+        }
+      }
+      _buf(i,3) = _dpdTheta(j);  // the four DPD values follow the coordinates
+      _buf(i,4) = _uCond(j);
+      _buf(i,5) = _uMech(j);
+      _buf(i,6) = _uChem(j);
+  }
+};
+
+/* ---------------------------------------------------------------------- */
+
+// Constructs one PackComm functor specialization and runs it over the first
+// n entries of row iswap of the send list.
+template<class DeviceType,int PBC_FLAG,int TRICLINIC,class AtomPtr,class DomainPtr>
+static void AtomVecDPDKokkos_run_pack_comm(const int &n,
+    AtomPtr atomKK, DomainPtr domain,
+    const DAT::tdual_int_2d &list, const int &iswap,
+    const DAT::tdual_xfloat_2d &buf, const int* const pbc)
+{
+  AtomVecDPDKokkos_PackComm<DeviceType,PBC_FLAG,TRICLINIC> packer(
+    atomKK->k_x,
+    atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,
+    buf,list,iswap,
+    domain->xprd,domain->yprd,domain->zprd,
+    domain->xy,domain->xz,domain->yz,pbc);
+  Kokkos::parallel_for(n,packer);
+}
+
+// Picks the PackComm specialization matching pbc_flag and domain->triclinic,
+// launches it on DeviceType and then fences that execution space.
+template<class DeviceType,class AtomPtr,class DomainPtr>
+static void AtomVecDPDKokkos_select_pack_comm(const int &n,
+    AtomPtr atomKK, DomainPtr domain,
+    const DAT::tdual_int_2d &list, const int &iswap,
+    const DAT::tdual_xfloat_2d &buf,
+    const int &pbc_flag, const int* const pbc)
+{
+  const bool shifted = (pbc_flag != 0);
+  const bool tilted = (domain->triclinic != 0);
+
+  if (shifted && tilted)
+    AtomVecDPDKokkos_run_pack_comm<DeviceType,1,1>(n,atomKK,domain,list,iswap,buf,pbc);
+  else if (shifted)
+    AtomVecDPDKokkos_run_pack_comm<DeviceType,1,0>(n,atomKK,domain,list,iswap,buf,pbc);
+  else if (tilted)
+    AtomVecDPDKokkos_run_pack_comm<DeviceType,0,1>(n,atomKK,domain,list,iswap,buf,pbc);
+  else
+    AtomVecDPDKokkos_run_pack_comm<DeviceType,0,0>(n,atomKK,domain,list,iswap,buf,pbc);
+
+  DeviceType::fence();
+}
+
+// Packs the forward-communication buffer for swap iswap with a Kokkos kernel.
+// The kernel runs on the host when commKK->forward_comm_on_host is set,
+// otherwise on the device.  Returns n*size_forward.
+// NOTE(review): only X_MASK is synced although the kernel also reads
+// dpdTheta/uCond/uMech/uChem -- confirm those arrays are current in the
+// chosen space.
+int AtomVecDPDKokkos::pack_comm_kokkos(const int &n,
+                                          const DAT::tdual_int_2d &list,
+                                          const int & iswap,
+                                          const DAT::tdual_xfloat_2d &buf,
+                                          const int &pbc_flag,
+                                          const int* const pbc)
+{
+  if (commKK->forward_comm_on_host) {
+    sync(Host,X_MASK);
+    AtomVecDPDKokkos_select_pack_comm<LMPHostType>(
+      n,atomKK,domain,list,iswap,buf,pbc_flag,pbc);
+  } else {
+    sync(Device,X_MASK);
+    AtomVecDPDKokkos_select_pack_comm<LMPDeviceType>(
+      n,atomKK,domain,list,iswap,buf,pbc_flag,pbc);
+  }
+
+  return n*size_forward;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Functor for forward communication to self: for send-list entry i it copies
+// the position and DPD state of the listed atom into slot i+_nfirst of the
+// same arrays, bypassing any buffer.
+//   PBC_FLAG  : 0 copies positions unchanged, otherwise a periodic shift is added
+//   TRICLINIC : 0 shifts by box lengths only, otherwise tilt factors are included
+template<class DeviceType,int PBC_FLAG,int TRICLINIC>
+struct AtomVecDPDKokkos_PackCommSelf {
+  typedef DeviceType device_type;
+
+  // _x (read) and _xw (write) are both built from the same position DualView
+  typename ArrayTypes<DeviceType>::t_x_array_randomread _x;
+  typename ArrayTypes<DeviceType>::t_x_array _xw;
+  typename ArrayTypes<DeviceType>::t_efloat_1d _dpdTheta,_uCond,_uMech,_uChem;
+  int _nfirst;
+  typename ArrayTypes<DeviceType>::t_int_2d_const _list;
+  const int _iswap;
+  X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz;
+  X_FLOAT _pbc[6];
+
+  AtomVecDPDKokkos_PackCommSelf(
+      const typename DAT::tdual_x_array &x,
+      const typename DAT::tdual_efloat_1d &dpdTheta,
+      const typename DAT::tdual_efloat_1d &uCond,
+      const typename DAT::tdual_efloat_1d &uMech,
+      const typename DAT::tdual_efloat_1d &uChem,
+      const int &nfirst,
+      const typename DAT::tdual_int_2d &list,
+      const int & iswap,
+      const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd,
+      const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc):
+      _x(x.view<DeviceType>()),_xw(x.view<DeviceType>()),
+      _dpdTheta(dpdTheta.view<DeviceType>()),
+      _uCond(uCond.view<DeviceType>()),
+      _uMech(uMech.view<DeviceType>()),
+      _uChem(uChem.view<DeviceType>()),
+      _nfirst(nfirst),_list(list.view<DeviceType>()),_iswap(iswap),
+      _xprd(xprd),_yprd(yprd),_zprd(zprd),
+      _xy(xy),_xz(xz),_yz(yz) {
+    // keep a private copy of the six periodic image flags
+    for (int k = 0; k < 6; k++) _pbc[k] = pbc[k];
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i) const {
+    const int src = _list(_iswap,i);
+    const int dst = i+_nfirst;
+
+    if (PBC_FLAG != 0 && TRICLINIC != 0) {
+      _xw(dst,0) = _x(src,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz;
+      _xw(dst,1) = _x(src,1) + _pbc[1]*_yprd + _pbc[3]*_yz;
+      _xw(dst,2) = _x(src,2) + _pbc[2]*_zprd;
+    } else if (PBC_FLAG != 0) {
+      _xw(dst,0) = _x(src,0) + _pbc[0]*_xprd;
+      _xw(dst,1) = _x(src,1) + _pbc[1]*_yprd;
+      _xw(dst,2) = _x(src,2) + _pbc[2]*_zprd;
+    } else {
+      _xw(dst,0) = _x(src,0);
+      _xw(dst,1) = _x(src,1);
+      _xw(dst,2) = _x(src,2);
+    }
+
+    // DPD state is copied unchanged
+    _dpdTheta(dst) = _dpdTheta(src);
+    _uCond(dst) = _uCond(src);
+    _uMech(dst) = _uMech(src);
+    _uChem(dst) = _uChem(src);
+  }
+};
+
+/* ---------------------------------------------------------------------- */
+
+// Constructs one PackCommSelf functor specialization and runs it over the
+// first n entries of row iswap of the send list.
+template<class DeviceType,int PBC_FLAG,int TRICLINIC,class AtomPtr,class DomainPtr>
+static void AtomVecDPDKokkos_run_pack_comm_self(const int &n,
+    AtomPtr atomKK, DomainPtr domain,
+    const DAT::tdual_int_2d &list, const int &iswap,
+    const int &nfirst, const int* const pbc)
+{
+  AtomVecDPDKokkos_PackCommSelf<DeviceType,PBC_FLAG,TRICLINIC> copier(
+    atomKK->k_x,
+    atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,
+    nfirst,list,iswap,
+    domain->xprd,domain->yprd,domain->zprd,
+    domain->xy,domain->xz,domain->yz,pbc);
+  Kokkos::parallel_for(n,copier);
+}
+
+// Picks the PackCommSelf specialization matching pbc_flag and
+// domain->triclinic, launches it on DeviceType and fences that space.
+template<class DeviceType,class AtomPtr,class DomainPtr>
+static void AtomVecDPDKokkos_select_pack_comm_self(const int &n,
+    AtomPtr atomKK, DomainPtr domain,
+    const DAT::tdual_int_2d &list, const int &iswap,
+    const int &nfirst, const int &pbc_flag, const int* const pbc)
+{
+  const bool shifted = (pbc_flag != 0);
+  const bool tilted = (domain->triclinic != 0);
+
+  if (shifted && tilted)
+    AtomVecDPDKokkos_run_pack_comm_self<DeviceType,1,1>(n,atomKK,domain,list,iswap,nfirst,pbc);
+  else if (shifted)
+    AtomVecDPDKokkos_run_pack_comm_self<DeviceType,1,0>(n,atomKK,domain,list,iswap,nfirst,pbc);
+  else if (tilted)
+    AtomVecDPDKokkos_run_pack_comm_self<DeviceType,0,1>(n,atomKK,domain,list,iswap,nfirst,pbc);
+  else
+    AtomVecDPDKokkos_run_pack_comm_self<DeviceType,0,0>(n,atomKK,domain,list,iswap,nfirst,pbc);
+
+  DeviceType::fence();
+}
+
+// Forward communication to self: copies position and DPD state of the n
+// send-list atoms of swap iswap directly into the slots starting at nfirst.
+// Runs on the host when commKK->forward_comm_on_host is set, otherwise on
+// the device.  Returns n*3.
+// NOTE(review): only X_MASK is synced/marked modified although the kernel
+// also reads and writes dpdTheta/uCond/uMech/uChem, and the return value
+// counts 3 values per atom although 7 are copied -- confirm both.
+int AtomVecDPDKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &list, const int & iswap,
+                                     const int nfirst, const int &pbc_flag, const int* const pbc) {
+  if (commKK->forward_comm_on_host) {
+    sync(Host,X_MASK);
+    modified(Host,X_MASK);
+    AtomVecDPDKokkos_select_pack_comm_self<LMPHostType>(
+      n,atomKK,domain,list,iswap,nfirst,pbc_flag,pbc);
+  } else {
+    sync(Device,X_MASK);
+    modified(Device,X_MASK);
+    AtomVecDPDKokkos_select_pack_comm_self<LMPDeviceType>(
+      n,atomKK,domain,list,iswap,nfirst,pbc_flag,pbc);
+  }
+  return n*3;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Functor that unpacks a forward-communication buffer: row i of _buf
+// (x,y,z,dpdTheta,uCond,uMech,uChem) is stored into atom slot i+_first.
+template<class DeviceType>
+struct AtomVecDPDKokkos_UnpackComm {
+  typedef DeviceType device_type;
+
+  typename ArrayTypes<DeviceType>::t_x_array _x;
+  typename ArrayTypes<DeviceType>::t_efloat_1d _dpdTheta,_uCond,_uMech,_uChem;
+  typename ArrayTypes<DeviceType>::t_xfloat_2d_const _buf;
+  int _first;
+
+  AtomVecDPDKokkos_UnpackComm(
+      const typename DAT::tdual_x_array &x,
+      const typename DAT::tdual_efloat_1d &dpdTheta,
+      const typename DAT::tdual_efloat_1d &uCond,
+      const typename DAT::tdual_efloat_1d &uMech,
+      const typename DAT::tdual_efloat_1d &uChem,
+      const typename DAT::tdual_xfloat_2d &buf,
+      const int& first):
+      _x(x.view<DeviceType>()),
+      _dpdTheta(dpdTheta.view<DeviceType>()),
+      _uCond(uCond.view<DeviceType>()),
+      _uMech(uMech.view<DeviceType>()),
+      _uChem(uChem.view<DeviceType>()),
+      _buf(buf.view<DeviceType>()),
+      _first(first) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i) const {
+    const int dst = i+_first;
+    // columns 0-2: position
+    _x(dst,0) = _buf(i,0);
+    _x(dst,1) = _buf(i,1);
+    _x(dst,2) = _buf(i,2);
+    // columns 3-6: DPD state
+    _dpdTheta(dst) = _buf(i,3);
+    _uCond(dst) = _buf(i,4);
+    _uMech(dst) = _buf(i,5);
+    _uChem(dst) = _buf(i,6);
+  }
+};
+
+/* ---------------------------------------------------------------------- */
+
+// Unpacks a forward-communication buffer into the n atoms starting at index
+// first.  The kernel runs on the host when commKK->forward_comm_on_host is
+// set, otherwise on the device.  Each buffer row carries
+// x,y,z,dpdTheta,uCond,uMech,uChem.
+// NOTE(review): only X_MASK is synced/marked modified although the kernel
+// also writes dpdTheta/uCond/uMech/uChem -- confirm whether those arrays
+// need their own masks.
+void AtomVecDPDKokkos::unpack_comm_kokkos(const int &n, const int &first,
+    const DAT::tdual_xfloat_2d &buf ) {
+  if(commKK->forward_comm_on_host) {
+    sync(Host,X_MASK);
+    modified(Host,X_MASK);
+    struct AtomVecDPDKokkos_UnpackComm<LMPHostType> f(atomKK->k_x,
+    atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,
+    buf,first);
+    Kokkos::parallel_for(n,f);
+    // fence the space the kernel ran on; this branch previously fenced
+    // LMPDeviceType, unlike the host branches of the pack routines
+    LMPHostType::fence();
+  } else {
+    sync(Device,X_MASK);
+    modified(Device,X_MASK);
+    struct AtomVecDPDKokkos_UnpackComm<LMPDeviceType> f(atomKK->k_x,
+    atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,
+    buf,first);
+    Kokkos::parallel_for(n,f);
+    LMPDeviceType::fence();
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Host-side forward-communication pack.  For each of the n atoms named in
+// list, appends x,y,z,dpdTheta,uCond,uMech,uChem to buf and returns the
+// number of doubles written (7 per atom).  When pbc_flag is nonzero the
+// positions are shifted by the periodic offset built from pbc[] and the box
+// dimensions (tilt factors included for triclinic boxes).
+int AtomVecDPDKokkos::pack_comm(int n, int *list, double *buf,
+                             int pbc_flag, int *pbc)
+{
+  const bool shifted = (pbc_flag != 0);
+  double dx = 0.0, dy = 0.0, dz = 0.0;
+
+  if (shifted) {
+    if (domain->triclinic == 0) {
+      dx = pbc[0]*domain->xprd;
+      dy = pbc[1]*domain->yprd;
+      dz = pbc[2]*domain->zprd;
+    } else {
+      dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
+      dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
+      dz = pbc[2]*domain->zprd;
+    }
+  }
+
+  int m = 0;
+  for (int i = 0; i < n; i++) {
+    const int j = list[i];
+    if (shifted) {
+      buf[m++] = h_x(j,0) + dx;
+      buf[m++] = h_x(j,1) + dy;
+      buf[m++] = h_x(j,2) + dz;
+    } else {
+      buf[m++] = h_x(j,0);
+      buf[m++] = h_x(j,1);
+      buf[m++] = h_x(j,2);
+    }
+    // Both cases now read the DPD state through the host views; the
+    // unshifted case used to go through the raw dpdTheta/uCond/uMech/uChem
+    // pointers while the shifted case used h_*.
+    buf[m++] = h_dpdTheta[j];
+    buf[m++] = h_uCond[j];
+    buf[m++] = h_uMech[j];
+    buf[m++] = h_uChem[j];
+  }
+  return m;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Host-side forward-communication pack including velocities.  For each of
+// the n atoms named in list, appends x,y,z,vx,vy,vz,dpdTheta,uCond,uMech,
+// uChem to buf and returns the number of doubles written (10 per atom).
+// When pbc_flag is nonzero positions are shifted by the periodic offset; if
+// deform_vremap is also set, velocities of atoms whose mask matches
+// deform_groupbit are shifted by the offset built from pbc[] and h_rate[].
+int AtomVecDPDKokkos::pack_comm_vel(int n, int *list, double *buf,
+                                 int pbc_flag, int *pbc)
+{
+  const bool shifted = (pbc_flag != 0);
+  bool remap = false;
+  double dx = 0.0, dy = 0.0, dz = 0.0;
+  double dvx = 0.0, dvy = 0.0, dvz = 0.0;
+
+  if (shifted) {
+    if (domain->triclinic == 0) {
+      dx = pbc[0]*domain->xprd;
+      dy = pbc[1]*domain->yprd;
+      dz = pbc[2]*domain->zprd;
+    } else {
+      dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
+      dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
+      dz = pbc[2]*domain->zprd;
+    }
+    if (deform_vremap) {
+      remap = true;
+      dvx = pbc[0]*h_rate[0] + pbc[5]*h_rate[5] + pbc[4]*h_rate[4];
+      dvy = pbc[1]*h_rate[1] + pbc[3]*h_rate[3];
+      dvz = pbc[2]*h_rate[2];
+    }
+  }
+
+  int m = 0;
+  for (int i = 0; i < n; i++) {
+    const int j = list[i];
+    if (shifted) {
+      buf[m++] = h_x(j,0) + dx;
+      buf[m++] = h_x(j,1) + dy;
+      buf[m++] = h_x(j,2) + dz;
+    } else {
+      buf[m++] = h_x(j,0);
+      buf[m++] = h_x(j,1);
+      buf[m++] = h_x(j,2);
+    }
+    // The group test must look at the atom being packed (j); the previous
+    // code indexed the mask with the loop counter i, i.e. an unrelated atom.
+    if (remap && (h_mask(j) & deform_groupbit)) {
+      buf[m++] = h_v(j,0) + dvx;
+      buf[m++] = h_v(j,1) + dvy;
+      buf[m++] = h_v(j,2) + dvz;
+    } else {
+      buf[m++] = h_v(j,0);
+      buf[m++] = h_v(j,1);
+      buf[m++] = h_v(j,2);
+    }
+    buf[m++] = h_dpdTheta(j);
+    buf[m++] = h_uCond(j);
+    buf[m++] = h_uMech(j);
+    buf[m++] = h_uChem(j);
+  }
+  return m;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Host-side forward-communication unpack: reads 7 doubles per atom
+// (x,y,z,dpdTheta,uCond,uMech,uChem) from buf into the n atoms starting at
+// index first.
+void AtomVecDPDKokkos::unpack_comm(int n, int first, double *buf)
+{
+  const double *row = buf;
+  const int last = first + n;
+
+  for (int iatom = first; iatom < last; iatom++, row += 7) {
+    h_x(iatom,0) = row[0];
+    h_x(iatom,1) = row[1];
+    h_x(iatom,2) = row[2];
+    h_dpdTheta[iatom] = row[3];
+    h_uCond[iatom] = row[4];
+    h_uMech[iatom] = row[5];
+    h_uChem[iatom] = row[6];
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Host-side forward-communication unpack with velocities: reads 10 doubles
+// per atom (x,y,z,vx,vy,vz,dpdTheta,uCond,uMech,uChem) from buf into the n
+// atoms starting at index first.
+void AtomVecDPDKokkos::unpack_comm_vel(int n, int first, double *buf)
+{
+  const double *row = buf;
+  const int last = first + n;
+
+  for (int iatom = first; iatom < last; iatom++, row += 10) {
+    h_x(iatom,0) = row[0];
+    h_x(iatom,1) = row[1];
+    h_x(iatom,2) = row[2];
+    h_v(iatom,0) = row[3];
+    h_v(iatom,1) = row[4];
+    h_v(iatom,2) = row[5];
+    h_dpdTheta[iatom] = row[6];
+    h_uCond[iatom] = row[7];
+    h_uMech[iatom] = row[8];
+    h_uChem[iatom] = row[9];
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Reverse-communication pack: copies the three force components of the n
+// atoms starting at index first into buf and returns the number of doubles
+// written.  Forces are synced to the host first when there is work to do.
+int AtomVecDPDKokkos::pack_reverse(int n, int first, double *buf)
+{
+  if (n > 0) sync(Host,F_MASK);
+
+  int count = 0;
+  for (int k = 0; k < n; k++) {
+    const int iatom = first + k;
+    buf[count]   = h_f(iatom,0);
+    buf[count+1] = h_f(iatom,1);
+    buf[count+2] = h_f(iatom,2);
+    count += 3;
+  }
+  return count;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Reverse-communication unpack: accumulates three force components per
+// entry of buf onto the atoms named in list.  When there is work to do the
+// host forces are synced and then flagged as modified.
+void AtomVecDPDKokkos::unpack_reverse(int n, int *list, double *buf)
+{
+  if (n > 0) {
+    sync(Host,F_MASK);
+    modified(Host,F_MASK);
+  }
+
+  const double *src = buf;
+  for (int k = 0; k < n; k++, src += 3) {
+    const int iatom = list[k];
+    h_f(iatom,0) += src[0];
+    h_f(iatom,1) += src[1];
+    h_f(iatom,2) += src[2];
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Functor that packs border data for the atoms named in row _iswap of the
+// send list.  Row i of _buf receives, in order:
+//   x,y,z (shifted by _dx,_dy,_dz when PBC_FLAG != 0), tag, type, mask,
+//   dpdTheta, uCond, uMech, uChem, uCG, uCGnew
+// tag/type/mask are stored by plain numeric conversion to the buffer type.
+template<class DeviceType,int PBC_FLAG>
+struct AtomVecDPDKokkos_PackBorder {
+  typedef DeviceType device_type;
+
+  typename ArrayTypes<DeviceType>::t_xfloat_2d _buf;
+  const typename ArrayTypes<DeviceType>::t_int_2d_const _list;
+  const int _iswap;
+  const typename ArrayTypes<DeviceType>::t_x_array_randomread _x;
+  const typename ArrayTypes<DeviceType>::t_tagint_1d _tag;
+  const typename ArrayTypes<DeviceType>::t_int_1d _type;
+  const typename ArrayTypes<DeviceType>::t_int_1d _mask;
+  typename ArrayTypes<DeviceType>::t_efloat_1d _dpdTheta,_uCond,_uMech,_uChem,_uCG,_uCGnew;
+  X_FLOAT _dx,_dy,_dz;
+
+  AtomVecDPDKokkos_PackBorder(
+      const typename ArrayTypes<DeviceType>::t_xfloat_2d &buf,
+      const typename ArrayTypes<DeviceType>::t_int_2d_const &list,
+      const int & iswap,
+      const typename ArrayTypes<DeviceType>::t_x_array &x,
+      const typename ArrayTypes<DeviceType>::t_tagint_1d &tag,
+      const typename ArrayTypes<DeviceType>::t_int_1d &type,
+      const typename ArrayTypes<DeviceType>::t_int_1d &mask,
+      const typename ArrayTypes<DeviceType>::t_efloat_1d &dpdTheta,
+      const typename ArrayTypes<DeviceType>::t_efloat_1d &uCond,
+      const typename ArrayTypes<DeviceType>::t_efloat_1d &uMech,
+      const typename ArrayTypes<DeviceType>::t_efloat_1d &uChem,
+      const typename ArrayTypes<DeviceType>::t_efloat_1d &uCG,
+      const typename ArrayTypes<DeviceType>::t_efloat_1d &uCGnew,
+      const X_FLOAT &dx, const X_FLOAT &dy, const X_FLOAT &dz):
+      _buf(buf),_list(list),_iswap(iswap),
+      _x(x),_tag(tag),_type(type),_mask(mask),
+      _dpdTheta(dpdTheta),
+      _uCond(uCond),
+      _uMech(uMech),
+      _uChem(uChem),
+      // was initialised from uCGnew, which sent uCGnew in both columns 10
+      // and 11 and never sent uCG
+      _uCG(uCG),
+      _uCGnew(uCGnew),
+      _dx(dx),_dy(dy),_dz(dz) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i) const {
+      const int j = _list(_iswap,i);
+      if (PBC_FLAG == 0) {
+          _buf(i,0) = _x(j,0);
+          _buf(i,1) = _x(j,1);
+          _buf(i,2) = _x(j,2);
+      } else {
+          _buf(i,0) = _x(j,0) + _dx;
+          _buf(i,1) = _x(j,1) + _dy;
+          _buf(i,2) = _x(j,2) + _dz;
+      }
+      _buf(i,3) = _tag(j);
+      _buf(i,4) = _type(j);
+      _buf(i,5) = _mask(j);
+      _buf(i,6) = _dpdTheta(j);
+      _buf(i,7) = _uCond(j);
+      _buf(i,8) = _uMech(j);
+      _buf(i,9) = _uChem(j);
+      _buf(i,10) = _uCG(j);
+      _buf(i,11) = _uCGnew(j);
+  }
+};
+
+/* ---------------------------------------------------------------------- */
+
+// Packs border data for the n atoms in row iswap of k_sendlist into buf with
+// a Kokkos kernel running in the requested execution space.  When pbc_flag
+// is nonzero positions are shifted: by pbc[] times the box lengths for
+// orthogonal boxes, or by pbc[] directly for triclinic boxes.
+// Returns the number of values packed, n*size_border (the kernel writes 12
+// columns per atom; the previous n*6 counted only x,y,z,tag,type,mask).
+// NOTE(review): no sync() is issued here -- presumably the caller guarantees
+// the atom arrays are current in `space`; confirm.
+int AtomVecDPDKokkos::pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist, DAT::tdual_xfloat_2d buf,int iswap,
+                               int pbc_flag, int *pbc, ExecutionSpace space)
+{
+  X_FLOAT dx,dy,dz;
+
+  if (pbc_flag != 0) {
+    if (domain->triclinic == 0) {
+      dx = pbc[0]*domain->xprd;
+      dy = pbc[1]*domain->yprd;
+      dz = pbc[2]*domain->zprd;
+    } else {
+      dx = pbc[0];
+      dy = pbc[1];
+      dz = pbc[2];
+    }
+    if(space==Host) {
+      AtomVecDPDKokkos_PackBorder<LMPHostType,1> f(
+        buf.view<LMPHostType>(), k_sendlist.view<LMPHostType>(),
+        iswap,h_x,h_tag,h_type,h_mask,
+        h_dpdTheta,h_uCond,h_uMech,h_uChem,h_uCG,h_uCGnew,
+        dx,dy,dz);
+      Kokkos::parallel_for(n,f);
+      LMPHostType::fence();
+    } else {
+      AtomVecDPDKokkos_PackBorder<LMPDeviceType,1> f(
+        buf.view<LMPDeviceType>(), k_sendlist.view<LMPDeviceType>(),
+        iswap,d_x,d_tag,d_type,d_mask,
+        d_dpdTheta,d_uCond,d_uMech,d_uChem,d_uCG,d_uCGnew,
+        dx,dy,dz);
+      Kokkos::parallel_for(n,f);
+      LMPDeviceType::fence();
+    }
+
+  } else {
+    // no periodic shift: the PBC_FLAG=0 kernel ignores dx,dy,dz
+    dx = dy = dz = 0;
+    if(space==Host) {
+      AtomVecDPDKokkos_PackBorder<LMPHostType,0> f(
+        buf.view<LMPHostType>(), k_sendlist.view<LMPHostType>(),
+        iswap,h_x,h_tag,h_type,h_mask,
+        h_dpdTheta,h_uCond,h_uMech,h_uChem,h_uCG,h_uCGnew,
+        dx,dy,dz);
+      Kokkos::parallel_for(n,f);
+      LMPHostType::fence();
+    } else {
+      AtomVecDPDKokkos_PackBorder<LMPDeviceType,0> f(
+        buf.view<LMPDeviceType>(), k_sendlist.view<LMPDeviceType>(),
+        iswap,d_x,d_tag,d_type,d_mask,
+        d_dpdTheta,d_uCond,d_uMech,d_uChem,d_uCG,d_uCGnew,
+        dx,dy,dz);
+      Kokkos::parallel_for(n,f);
+      LMPDeviceType::fence();
+    }
+  }
+  return n*size_border;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Host-side border pack.  For each of the n atoms named in list, appends
+// x,y,z, tag, type, mask (bit-copied through ubuf), dpdTheta, uCond, uMech,
+// uChem, uCG, uCGnew to buf -- 12 doubles per atom.  When pbc_flag is
+// nonzero positions are shifted by pbc[] times the box lengths (orthogonal
+// box) or by pbc[] itself (triclinic box).  Fixes registered in
+// atom->extra_border then append their own data.  Returns the total number
+// of doubles written.
+int AtomVecDPDKokkos::pack_border(int n, int *list, double *buf,
+                               int pbc_flag, int *pbc)
+{
+  const bool shifted = (pbc_flag != 0);
+  double shiftx = 0.0, shifty = 0.0, shiftz = 0.0;
+
+  if (shifted) {
+    if (domain->triclinic != 0) {
+      shiftx = pbc[0];
+      shifty = pbc[1];
+      shiftz = pbc[2];
+    } else {
+      shiftx = pbc[0]*domain->xprd;
+      shifty = pbc[1]*domain->yprd;
+      shiftz = pbc[2]*domain->zprd;
+    }
+  }
+
+  int count = 0;
+  for (int k = 0; k < n; k++) {
+    const int iatom = list[k];
+    if (shifted) {
+      buf[count++] = h_x(iatom,0) + shiftx;
+      buf[count++] = h_x(iatom,1) + shifty;
+      buf[count++] = h_x(iatom,2) + shiftz;
+    } else {
+      buf[count++] = h_x(iatom,0);
+      buf[count++] = h_x(iatom,1);
+      buf[count++] = h_x(iatom,2);
+    }
+    buf[count++] = ubuf(h_tag(iatom)).d;
+    buf[count++] = ubuf(h_type(iatom)).d;
+    buf[count++] = ubuf(h_mask(iatom)).d;
+    buf[count++] = h_dpdTheta(iatom);
+    buf[count++] = h_uCond(iatom);
+    buf[count++] = h_uMech(iatom);
+    buf[count++] = h_uChem(iatom);
+    buf[count++] = h_uCG(iatom);
+    buf[count++] = h_uCGnew(iatom);
+  }
+
+  // let every fix that carries extra border data append after the atom data
+  for (int iextra = 0; iextra < atom->nextra_border; iextra++)
+    count += modify->fix[atom->extra_border[iextra]]->pack_border(n,list,&buf[count]);
+
+  return count;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Host-side border pack including velocities.  For each of the n atoms
+// named in list, appends x,y,z, tag, type, mask (bit-copied through ubuf),
+// vx,vy,vz, dpdTheta, uCond, uMech, uChem, uCG, uCGnew to buf -- 15 doubles
+// per atom.  When pbc_flag is nonzero positions are shifted by pbc[] times
+// the box lengths (orthogonal box) or by pbc[] itself (triclinic box); if
+// deform_vremap is also set, velocities of atoms whose mask matches
+// deform_groupbit are shifted by the offset built from pbc[] and h_rate[].
+// Fixes registered in atom->extra_border then append their own data.
+// Returns the total number of doubles written.
+int AtomVecDPDKokkos::pack_border_vel(int n, int *list, double *buf,
+                                   int pbc_flag, int *pbc)
+{
+  const bool shifted = (pbc_flag != 0);
+  bool remap = false;
+  double dx = 0.0, dy = 0.0, dz = 0.0;
+  double dvx = 0.0, dvy = 0.0, dvz = 0.0;
+
+  if (shifted) {
+    if (domain->triclinic == 0) {
+      dx = pbc[0]*domain->xprd;
+      dy = pbc[1]*domain->yprd;
+      dz = pbc[2]*domain->zprd;
+    } else {
+      dx = pbc[0];
+      dy = pbc[1];
+      dz = pbc[2];
+    }
+    if (deform_vremap) {
+      remap = true;
+      dvx = pbc[0]*h_rate[0] + pbc[5]*h_rate[5] + pbc[4]*h_rate[4];
+      dvy = pbc[1]*h_rate[1] + pbc[3]*h_rate[3];
+      dvz = pbc[2]*h_rate[2];
+    }
+  }
+
+  int m = 0;
+  for (int i = 0; i < n; i++) {
+    const int j = list[i];
+    if (shifted) {
+      buf[m++] = h_x(j,0) + dx;
+      buf[m++] = h_x(j,1) + dy;
+      buf[m++] = h_x(j,2) + dz;
+    } else {
+      buf[m++] = h_x(j,0);
+      buf[m++] = h_x(j,1);
+      buf[m++] = h_x(j,2);
+    }
+    buf[m++] = ubuf(h_tag(j)).d;
+    buf[m++] = ubuf(h_type(j)).d;
+    buf[m++] = ubuf(h_mask(j)).d;
+    // The group test must look at the atom being packed (j); the previous
+    // code indexed the mask with the loop counter i, i.e. an unrelated atom.
+    if (remap && (h_mask(j) & deform_groupbit)) {
+      buf[m++] = h_v(j,0) + dvx;
+      buf[m++] = h_v(j,1) + dvy;
+      buf[m++] = h_v(j,2) + dvz;
+    } else {
+      buf[m++] = h_v(j,0);
+      buf[m++] = h_v(j,1);
+      buf[m++] = h_v(j,2);
+    }
+    buf[m++] = h_dpdTheta(j);
+    buf[m++] = h_uCond(j);
+    buf[m++] = h_uMech(j);
+    buf[m++] = h_uChem(j);
+    buf[m++] = h_uCG(j);
+    buf[m++] = h_uCGnew(j);
+  }
+
+  // let every fix that carries extra border data append after the atom data
+  if (atom->nextra_border)
+    for (int iextra = 0; iextra < atom->nextra_border; iextra++)
+      m += modify->fix[atom->extra_border[iextra]]->pack_border(n,list,&buf[m]);
+
+  return m;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Hybrid forward-communication pack: appends only the DPD fields
+// (dpdTheta,uCond,uMech,uChem,uCG,uCGnew) of the n atoms named in list to
+// buf and returns the number of doubles written (6 per atom).
+int AtomVecDPDKokkos::pack_comm_hybrid(int n, int *list, double *buf)
+{
+  int count = 0;
+  for (int k = 0; k < n; k++) {
+    const int iatom = list[k];
+    double *out = buf + count;
+    out[0] = h_dpdTheta[iatom];
+    out[1] = h_uCond[iatom];
+    out[2] = h_uMech[iatom];
+    out[3] = h_uChem[iatom];
+    out[4] = h_uCG[iatom];
+    out[5] = h_uCGnew[iatom];
+    count += 6;
+  }
+  return count;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Hybrid border pack: appends only the DPD fields
+// (dpdTheta,uCond,uMech,uChem,uCG,uCGnew) of the n atoms named in list to
+// buf and returns the number of doubles written (6 per atom).
+int AtomVecDPDKokkos::pack_border_hybrid(int n, int *list, double *buf)
+{
+  int count = 0;
+  for (int k = 0; k < n; k++) {
+    const int iatom = list[k];
+    double *out = buf + count;
+    out[0] = h_dpdTheta[iatom];
+    out[1] = h_uCond[iatom];
+    out[2] = h_uMech[iatom];
+    out[3] = h_uChem[iatom];
+    out[4] = h_uCG[iatom];
+    out[5] = h_uCGnew[iatom];
+    count += 6;
+  }
+  return count;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Functor that unpacks border data: row i of _buf is stored into atom slot
+// i+_first.  Column layout:
+//   0-2 x,y,z | 3 tag | 4 type | 5 mask |
+//   6 dpdTheta | 7 uCond | 8 uMech | 9 uChem | 10 uCG | 11 uCGnew
+// NOTE(review): tag is converted with static_cast<int>; confirm this is
+// adequate if the tag view's element type is wider than int.
+template<class DeviceType>
+struct AtomVecDPDKokkos_UnpackBorder {
+  typedef DeviceType device_type;
+
+  const typename ArrayTypes<DeviceType>::t_xfloat_2d_const _buf;
+  typename ArrayTypes<DeviceType>::t_x_array _x;
+  typename ArrayTypes<DeviceType>::t_tagint_1d _tag;
+  typename ArrayTypes<DeviceType>::t_int_1d _type;
+  typename ArrayTypes<DeviceType>::t_int_1d _mask;
+  typename ArrayTypes<DeviceType>::t_efloat_1d _dpdTheta,_uCond,_uMech,_uChem,_uCG,_uCGnew;
+  int _first;
+
+
+  AtomVecDPDKokkos_UnpackBorder(
+      const typename ArrayTypes<DeviceType>::t_xfloat_2d_const &buf,
+      typename ArrayTypes<DeviceType>::t_x_array &x,
+      typename ArrayTypes<DeviceType>::t_tagint_1d &tag,
+      typename ArrayTypes<DeviceType>::t_int_1d &type,
+      typename ArrayTypes<DeviceType>::t_int_1d &mask,
+      const typename ArrayTypes<DeviceType>::t_efloat_1d &dpdTheta,
+      const typename ArrayTypes<DeviceType>::t_efloat_1d &uCond,
+      const typename ArrayTypes<DeviceType>::t_efloat_1d &uMech,
+      const typename ArrayTypes<DeviceType>::t_efloat_1d &uChem,
+      const typename ArrayTypes<DeviceType>::t_efloat_1d &uCG,
+      const typename ArrayTypes<DeviceType>::t_efloat_1d &uCGnew,
+      const int& first):
+      _buf(buf),_x(x),_tag(tag),_type(type),_mask(mask),
+      _dpdTheta(dpdTheta),
+      _uCond(uCond),
+      _uMech(uMech),
+      _uChem(uChem),
+      // was initialised from uCGnew, so column 10 was written into uCGnew
+      // (then overwritten by column 11) and uCG was never set
+      _uCG(uCG),
+      _uCGnew(uCGnew),
+      _first(first) {};
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i) const {
+      _x(i+_first,0) = _buf(i,0);
+      _x(i+_first,1) = _buf(i,1);
+      _x(i+_first,2) = _buf(i,2);
+      _tag(i+_first) = static_cast<int> (_buf(i,3));
+      _type(i+_first) = static_cast<int>  (_buf(i,4));
+      _mask(i+_first) = static_cast<int>  (_buf(i,5));
+      _dpdTheta(i+_first) = _buf(i,6);
+      _uCond(i+_first) = _buf(i,7);
+      _uMech(i+_first) = _buf(i,8);
+      _uChem(i+_first) = _buf(i,9);
+      _uCG(i+_first) = _buf(i,10);
+      _uCGnew(i+_first) = _buf(i,11);
+  }
+};
+
+/* ---------------------------------------------------------------------- */
+
+// Unpacks border data for n atoms from buf into the atom slots starting at
+// index first, using a Kokkos kernel in the requested execution space.
+// The per-atom arrays are grown until first+n fits below nmax.
+// NOTE(review): only X/TAG/TYPE/MASK are flagged as modified although the
+// kernel also writes the six DPD arrays -- confirm whether they need masks.
+void AtomVecDPDKokkos::unpack_border_kokkos(const int &n, const int &first,
+                     const DAT::tdual_xfloat_2d &buf,ExecutionSpace space) {
+  // modified() is issued both before and after the grow loop; presumably
+  // grow() interacts with the modified flags, so the order is kept -- verify.
+  modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK);
+  while (first+n >= nmax) grow(0);
+  modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK);
+  if(space==Host) {
+    // host kernel writes through the h_* views
+    struct AtomVecDPDKokkos_UnpackBorder<LMPHostType> f(buf.view<LMPHostType>(),
+      h_x,h_tag,h_type,h_mask,
+      h_dpdTheta,h_uCond,h_uMech,h_uChem,h_uCG,h_uCGnew,
+      first);
+    Kokkos::parallel_for(n,f);
+    LMPHostType::fence();
+  } else {
+    // device kernel writes through the d_* views
+    struct AtomVecDPDKokkos_UnpackBorder<LMPDeviceType> f(buf.view<LMPDeviceType>(),
+      d_x,d_tag,d_type,d_mask,
+      d_dpdTheta,d_uCond,d_uMech,d_uChem,d_uCG,d_uCGnew,
+      first);
+    Kokkos::parallel_for(n,f);
+    LMPDeviceType::fence();
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Host-side border unpack: reads 12 doubles per atom (x,y,z, tag, type,
+// mask bit-copied through ubuf, dpdTheta, uCond, uMech, uChem, uCG, uCGnew)
+// from buf into the n atoms starting at index first, growing the arrays
+// when the write index reaches nmax.  Fixes registered in
+// atom->extra_border then consume their own data from the rest of buf.
+void AtomVecDPDKokkos::unpack_border(int n, int first, double *buf)
+{
+  int offset = 0;
+  const int last = first + n;
+
+  for (int iatom = first; iatom < last; iatom++) {
+    if (iatom == nmax) grow(0);
+    // flag the host copies after any grow, once per atom
+    modified(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK);
+
+    const double *row = buf + offset;
+    h_x(iatom,0) = row[0];
+    h_x(iatom,1) = row[1];
+    h_x(iatom,2) = row[2];
+    h_tag(iatom) = static_cast<tagint>(ubuf(row[3]).i);
+    h_type(iatom) = static_cast<int>(ubuf(row[4]).i);
+    h_mask(iatom) = static_cast<int>(ubuf(row[5]).i);
+    h_dpdTheta(iatom) = row[6];
+    h_uCond(iatom) = row[7];
+    h_uMech(iatom) = row[8];
+    h_uChem(iatom) = row[9];
+    h_uCG(iatom) = row[10];
+    h_uCGnew(iatom) = row[11];
+    offset += 12;
+  }
+
+  for (int iextra = 0; iextra < atom->nextra_border; iextra++)
+    offset += modify->fix[atom->extra_border[iextra]]->
+      unpack_border(n,first,&buf[offset]);
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Host-side border unpack with velocities: reads 15 doubles per atom
+// (x,y,z, tag, type, mask bit-copied through ubuf, vx,vy,vz, dpdTheta,
+// uCond, uMech, uChem, uCG, uCGnew) from buf into the n atoms starting at
+// index first, growing the arrays when the write index reaches nmax.  Fixes
+// registered in atom->extra_border then consume their own data from the
+// rest of buf.
+void AtomVecDPDKokkos::unpack_border_vel(int n, int first, double *buf)
+{
+  int offset = 0;
+  const int last = first + n;
+
+  for (int iatom = first; iatom < last; iatom++) {
+    if (iatom == nmax) grow(0);
+    // flag the host copies after any grow, once per atom
+    modified(Host,X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK);
+
+    const double *row = buf + offset;
+    h_x(iatom,0) = row[0];
+    h_x(iatom,1) = row[1];
+    h_x(iatom,2) = row[2];
+    h_tag(iatom) = static_cast<tagint>(ubuf(row[3]).i);
+    h_type(iatom) = static_cast<int>(ubuf(row[4]).i);
+    h_mask(iatom) = static_cast<int>(ubuf(row[5]).i);
+    h_v(iatom,0) = row[6];
+    h_v(iatom,1) = row[7];
+    h_v(iatom,2) = row[8];
+    h_dpdTheta(iatom) = row[9];
+    h_uCond(iatom) = row[10];
+    h_uMech(iatom) = row[11];
+    h_uChem(iatom) = row[12];
+    h_uCG(iatom) = row[13];
+    h_uCGnew(iatom) = row[14];
+    offset += 15;
+  }
+
+  for (int iextra = 0; iextra < atom->nextra_border; iextra++)
+    offset += modify->fix[atom->extra_border[iextra]]->
+      unpack_border(n,first,&buf[offset]);
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Hybrid forward-communication unpack: reads the DPD fields
+// (dpdTheta,uCond,uMech,uChem,uCG,uCGnew) for the n atoms starting at index
+// first and returns the number of doubles consumed (6 per atom).
+int AtomVecDPDKokkos::unpack_comm_hybrid(int n, int first, double *buf)
+{
+  int used = 0;
+  for (int k = 0; k < n; k++) {
+    const int iatom = first + k;
+    const double *row = buf + used;
+    h_dpdTheta(iatom) = row[0];
+    h_uCond(iatom) = row[1];
+    h_uMech(iatom) = row[2];
+    h_uChem(iatom) = row[3];
+    h_uCG(iatom) = row[4];
+    h_uCGnew(iatom) = row[5];
+    used += 6;
+  }
+  return used;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Hybrid border unpack: reads the DPD fields
+// (dpdTheta,uCond,uMech,uChem,uCG,uCGnew) for the n atoms starting at index
+// first and returns the number of doubles consumed (6 per atom).
+int AtomVecDPDKokkos::unpack_border_hybrid(int n, int first, double *buf)
+{
+  int used = 0;
+  for (int k = 0; k < n; k++) {
+    const int iatom = first + k;
+    const double *row = buf + used;
+    h_dpdTheta(iatom) = row[0];
+    h_uCond(iatom) = row[1];
+    h_uMech(iatom) = row[2];
+    h_uChem(iatom) = row[3];
+    h_uCG(iatom) = row[4];
+    h_uCGnew(iatom) = row[5];
+    used += 6;
+  }
+  return used;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+struct AtomVecDPDKokkos_PackExchangeFunctor {
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;
+  typename AT::t_x_array_randomread _x;  // views below are only read in operator()
+  typename AT::t_v_array_randomread _v;
+  typename AT::t_tagint_1d_randomread _tag;
+  typename AT::t_int_1d_randomread _type;
+  typename AT::t_int_1d_randomread _mask;
+  typename AT::t_imageint_1d_randomread _image;
+  typename AT::t_efloat_1d_randomread _dpdTheta,_uCond,_uMech,_uChem,_uCG,_uCGnew;
+  typename AT::t_x_array _xw;  // "w" views: bound to the same dual views, used for writes
+  typename AT::t_v_array _vw;
+  typename AT::t_tagint_1d _tagw;
+  typename AT::t_int_1d _typew;
+  typename AT::t_int_1d _maskw;
+  typename AT::t_imageint_1d _imagew;
+  typename AT::t_efloat_1d _dpdThetaw,_uCondw,_uMechw,_uChemw,_uCGw,_uCGneww;
+  // send buffer, indexed as (record, value) with 17 values per record
+  typename AT::t_xfloat_2d_um _buf;
+  typename AT::t_int_1d_const _sendlist;  // _sendlist(k): index of the k-th atom to pack
+  typename AT::t_int_1d_const _copylist;  // _copylist(k): atom copied over it, or < 0 for none
+  int _nlocal,_dim;   // stored, but not read by operator()
+  X_FLOAT _lo,_hi;    // stored, but not read by operator()
+
+  AtomVecDPDKokkos_PackExchangeFunctor(
+      const AtomKokkos* atom,
+      const typename AT::tdual_xfloat_2d buf,
+      typename AT::tdual_int_1d sendlist,
+      typename AT::tdual_int_1d copylist,int nlocal, int dim,
+                X_FLOAT lo, X_FLOAT hi):
+                _x(atom->k_x.view<DeviceType>()),
+                _v(atom->k_v.view<DeviceType>()),
+                _tag(atom->k_tag.view<DeviceType>()),
+                _type(atom->k_type.view<DeviceType>()),
+                _mask(atom->k_mask.view<DeviceType>()),
+                _image(atom->k_image.view<DeviceType>()),
+                _dpdTheta(atom->k_dpdTheta.view<DeviceType>()),
+                _uCond(atom->k_uCond.view<DeviceType>()),
+                _uMech(atom->k_uMech.view<DeviceType>()),
+                _uChem(atom->k_uChem.view<DeviceType>()),
+                _uCG(atom->k_uCG.view<DeviceType>()),
+                _uCGnew(atom->k_uCGnew.view<DeviceType>()),
+                _xw(atom->k_x.view<DeviceType>()),
+                _vw(atom->k_v.view<DeviceType>()),
+                _tagw(atom->k_tag.view<DeviceType>()),
+                _typew(atom->k_type.view<DeviceType>()),
+                _maskw(atom->k_mask.view<DeviceType>()),
+                _imagew(atom->k_image.view<DeviceType>()),
+                _dpdThetaw(atom->k_dpdTheta.view<DeviceType>()),
+                _uCondw(atom->k_uCond.view<DeviceType>()),
+                _uMechw(atom->k_uMech.view<DeviceType>()),
+                _uChemw(atom->k_uChem.view<DeviceType>()),
+                _uCGw(atom->k_uCG.view<DeviceType>()),
+                _uCGneww(atom->k_uCGnew.view<DeviceType>()),
+                _sendlist(sendlist.template view<DeviceType>()),
+                _copylist(copylist.template view<DeviceType>()),
+                _nlocal(nlocal),_dim(dim),
+                _lo(lo),_hi(hi){
+    const size_t elements = 17;  // values per packed atom, including the length slot
+    const int maxsendlist = (buf.template view<DeviceType>().dimension_0()*buf.template view<DeviceType>().dimension_1())/elements;
+    // buffer_view() is expected to point _buf at buf as maxsendlist x elements
+    buffer_view<DeviceType>(_buf,buf,maxsendlist,elements);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int &mysend) const {
+    const int i = _sendlist(mysend);
+    _buf(mysend,0) = 17;  // slot 0: record length
+    _buf(mysend,1) = _x(i,0);
+    _buf(mysend,2) = _x(i,1);
+    _buf(mysend,3) = _x(i,2);
+    _buf(mysend,4) = _v(i,0);
+    _buf(mysend,5) = _v(i,1);
+    _buf(mysend,6) = _v(i,2);
+    _buf(mysend,7) = _tag[i];   // integer fields are stored by value conversion here
+    _buf(mysend,8) = _type[i];
+    _buf(mysend,9) = _mask[i];
+    _buf(mysend,10) = _image[i];
+    _buf(mysend,11) = _dpdTheta[i];
+    _buf(mysend,12) = _uCond[i];
+    _buf(mysend,13) = _uMech[i];
+    _buf(mysend,14) = _uChem[i];
+    _buf(mysend,15) = _uCG[i];
+    _buf(mysend,16) = _uCGnew[i];
+    const int j = _copylist(mysend);
+    // j > -1: overwrite atom i with atom j (presumably fills the hole left by i)
+    if(j>-1) {
+    _xw(i,0) = _x(j,0);
+    _xw(i,1) = _x(j,1);
+    _xw(i,2) = _x(j,2);
+    _vw(i,0) = _v(j,0);
+    _vw(i,1) = _v(j,1);
+    _vw(i,2) = _v(j,2);
+    _tagw[i] = _tag(j);
+    _typew[i] = _type(j);
+    _maskw[i] = _mask(j);
+    _imagew[i] = _image(j);
+    _dpdThetaw[i] = _dpdTheta(j);
+    _uCondw[i] = _uCond(j);
+    _uMechw[i] = _uMech(j);
+    _uChemw[i] = _uChem(j);
+    _uCGw[i] = _uCG(j);
+    _uCGneww[i] = _uCGnew(j);
+    }
+  }
+};
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecDPDKokkos::pack_exchange_kokkos(const int &nsend,DAT::tdual_xfloat_2d &k_buf, DAT::tdual_int_1d k_sendlist,DAT::tdual_int_1d k_copylist,ExecutionSpace space,int dim,X_FLOAT lo,X_FLOAT hi )
+{
+  if(nsend > (int) (k_buf.view<LMPHostType>().dimension_0()*k_buf.view<LMPHostType>().dimension_1())/17) {  // buffer holds fewer than nsend records of 17 values
+    int newsize = nsend*17/k_buf.view<LMPHostType>().dimension_1()+1;  // rows needed at the current row width
+    k_buf.resize(newsize,k_buf.view<LMPHostType>().dimension_1());
+  }
+  if(space == Host) {
+    AtomVecDPDKokkos_PackExchangeFunctor<LMPHostType> f(atomKK,k_buf,k_sendlist,k_copylist,atom->nlocal,dim,lo,hi);
+    Kokkos::parallel_for(nsend,f);  // one functor call per atom being sent
+    LMPHostType::fence();
+    return nsend*17;  // number of values packed
+  } else {
+    AtomVecDPDKokkos_PackExchangeFunctor<LMPDeviceType> f(atomKK,k_buf,k_sendlist,k_copylist,atom->nlocal,dim,lo,hi);
+    Kokkos::parallel_for(nsend,f);
+    LMPDeviceType::fence();
+    return nsend*17;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecDPDKokkos::pack_exchange(int i, double *buf)
+{
+  double *out = buf + 1;  // slot 0 is filled in at the end with the record length
+  *out++ = h_x(i,0);
+  *out++ = h_x(i,1);
+  *out++ = h_x(i,2);
+  *out++ = h_v(i,0);
+  *out++ = h_v(i,1);
+  *out++ = h_v(i,2);
+  *out++ = ubuf(h_tag(i)).d;
+  *out++ = ubuf(h_type(i)).d;
+  *out++ = ubuf(h_mask(i)).d;
+  *out++ = ubuf(h_image(i)).d;
+  *out++ = h_dpdTheta[i];
+  *out++ = h_uCond[i];
+  *out++ = h_uMech[i];
+  *out++ = h_uChem[i];
+  *out++ = h_uCG[i];
+  *out++ = h_uCGnew[i];
+
+  // registered fixes pack after the atom fields; each return value advances the position
+  for (int ifix = 0; ifix < atom->nextra_grow; ifix++)
+    out += modify->fix[atom->extra_grow[ifix]]->pack_exchange(i,out);
+  const int nvalues = static_cast<int>(out - buf);
+  buf[0] = nvalues;
+  return nvalues;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+struct AtomVecDPDKokkos_UnpackExchangeFunctor {
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;
+  typename AT::t_x_array _x;
+  typename AT::t_v_array _v;
+  typename AT::t_tagint_1d _tag;
+  typename AT::t_int_1d _type;
+  typename AT::t_int_1d _mask;
+  typename AT::t_imageint_1d _image;
+  typename AT::t_efloat_1d _dpdTheta,_uCond,_uMech,_uChem,_uCG,_uCGnew;
+  // _buf: received records, 17 values each; _nlocal(0): running count of owned atoms
+  typename AT::t_xfloat_2d_um _buf;
+  typename AT::t_int_1d _nlocal;
+  int _dim;
+  X_FLOAT _lo,_hi;
+
+  AtomVecDPDKokkos_UnpackExchangeFunctor(
+      const AtomKokkos* atom,
+      const typename AT::tdual_xfloat_2d buf,
+      typename AT::tdual_int_1d nlocal,
+      int dim, X_FLOAT lo, X_FLOAT hi):
+                _x(atom->k_x.view<DeviceType>()),
+                _v(atom->k_v.view<DeviceType>()),
+                _tag(atom->k_tag.view<DeviceType>()),
+                _type(atom->k_type.view<DeviceType>()),
+                _mask(atom->k_mask.view<DeviceType>()),
+                _image(atom->k_image.view<DeviceType>()),
+                _dpdTheta(atom->k_dpdTheta.view<DeviceType>()),  // DPD views must be bound too:
+                _uCond(atom->k_uCond.view<DeviceType>()),        // operator() writes through them
+                _uMech(atom->k_uMech.view<DeviceType>()),
+                _uChem(atom->k_uChem.view<DeviceType>()),
+                _uCG(atom->k_uCG.view<DeviceType>()),
+                _uCGnew(atom->k_uCGnew.view<DeviceType>()),
+                _nlocal(nlocal.template view<DeviceType>()),_dim(dim),
+                _lo(lo),_hi(hi){
+    const size_t elements = 17;  // values per received record, including the length slot
+    const int maxsendlist = (buf.template view<DeviceType>().dimension_0()*buf.template view<DeviceType>().dimension_1())/elements;
+    buffer_view<DeviceType>(_buf,buf,maxsendlist,elements);
+  }
+  // keep record myrecv only if its coordinate along _dim lies in [_lo,_hi)
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int &myrecv) const {
+    X_FLOAT x = _buf(myrecv,_dim+1);  // coordinates occupy slots 1..3
+    if (x >= _lo && x < _hi) {
+      int i = Kokkos::atomic_fetch_add(&_nlocal(0),1);  // claim the next owned-atom index
+      _x(i,0) = _buf(myrecv,1);
+      _x(i,1) = _buf(myrecv,2);
+      _x(i,2) = _buf(myrecv,3);
+      _v(i,0) = _buf(myrecv,4);
+      _v(i,1) = _buf(myrecv,5);
+      _v(i,2) = _buf(myrecv,6);
+      _tag[i] = _buf(myrecv,7);
+      _type[i] = _buf(myrecv,8);
+      _mask[i] = _buf(myrecv,9);
+      _image[i] = _buf(myrecv,10);
+      _dpdTheta[i] = _buf(myrecv,11);
+      _uCond[i] = _buf(myrecv,12);
+      _uMech[i] = _buf(myrecv,13);
+      _uChem[i] = _buf(myrecv,14);
+      _uCG[i] = _buf(myrecv,15);
+      _uCGnew[i] = _buf(myrecv,16);
+    }
+  }
+};
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecDPDKokkos::unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf,int nrecv,int nlocal,int dim,X_FLOAT lo,X_FLOAT hi,ExecutionSpace space) {
+  if(space == Host) {
+    k_count.h_view(0) = nlocal;  // counter starts at the current owned-atom count
+    AtomVecDPDKokkos_UnpackExchangeFunctor<LMPHostType> f(atomKK,k_buf,k_count,dim,lo,hi);
+    Kokkos::parallel_for(nrecv/17,f);  // presumably nrecv counts values; 17 per record
+    LMPHostType::fence();
+    return k_count.h_view(0);  // count after the functor added the accepted atoms
+  } else {
+    k_count.h_view(0) = nlocal;
+    k_count.modify<LMPHostType>();
+    k_count.sync<LMPDeviceType>();  // make the starting count visible on the device
+    AtomVecDPDKokkos_UnpackExchangeFunctor<LMPDeviceType> f(atomKK,k_buf,k_count,dim,lo,hi);
+    Kokkos::parallel_for(nrecv/17,f);
+    LMPDeviceType::fence();
+    k_count.modify<LMPDeviceType>();
+    k_count.sync<LMPHostType>();  // bring the final count back to the host view
+
+    return k_count.h_view(0);
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecDPDKokkos::unpack_exchange(double *buf)
+{
+  const int inew = atom->nlocal;  // the incoming atom is stored at index nlocal
+  if (inew == nmax) grow(0);
+  modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
+           MASK_MASK | IMAGE_MASK);
+
+  double *in = buf + 1;           // slot 0 (record length) is not read here
+  h_x(inew,0) = *in++;
+  h_x(inew,1) = *in++;
+  h_x(inew,2) = *in++;
+  h_v(inew,0) = *in++;
+  h_v(inew,1) = *in++;
+  h_v(inew,2) = *in++;
+  h_tag(inew) = (tagint) ubuf(*in++).i;
+  h_type(inew) = (int) ubuf(*in++).i;
+  h_mask(inew) = (int) ubuf(*in++).i;
+  h_image(inew) = (imageint) ubuf(*in++).i;
+  h_dpdTheta[inew] = *in++;
+  h_uCond[inew] = *in++;
+  h_uMech[inew] = *in++;
+  h_uChem[inew] = *in++;
+  h_uCG[inew] = *in++;
+  h_uCGnew[inew] = *in++;
+
+  // registered fixes unpack next; each return value advances the read position
+  for (int ifix = 0; ifix < atom->nextra_grow; ifix++)
+    in += modify->fix[atom->extra_grow[ifix]]->
+      unpack_exchange(inew,in);
+
+  atom->nlocal++;
+  return static_cast<int>(in - buf);
+}
+
+/* ----------------------------------------------------------------------
+   size of restart data for all atoms owned by this proc
+   include extra data stored by fixes
+------------------------------------------------------------------------- */
+
+int AtomVecDPDKokkos::size_restart()
+{
+  const int natoms = atom->nlocal;
+
+  // fixed part: 15 values per atom, the count pack_restart() writes
+  // (length, x[3], tag, type, mask, image, v[3], dpdTheta, uCond, uMech, uChem)
+  int total = 15 * natoms;
+
+  for (int ifix = 0; ifix < atom->nextra_restart; ifix++)
+    for (int iatom = 0; iatom < natoms; iatom++)
+      total += modify->fix[atom->extra_restart[ifix]]->size_restart(iatom);
+
+  return total;
+}
+
+/* ----------------------------------------------------------------------
+   pack atom I's data for restart file including extra quantities
+   xyz must be 1st 3 values, so that read_restart can test on them
+   molecular types may be negative, but write as positive
+------------------------------------------------------------------------- */
+
+int AtomVecDPDKokkos::pack_restart(int i, double *buf)
+{
+  sync(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
+            MASK_MASK | IMAGE_MASK );
+
+  double *out = buf + 1;  // slot 0 receives the record length at the end
+  *out++ = h_x(i,0);
+  *out++ = h_x(i,1);
+  *out++ = h_x(i,2);
+  *out++ = ubuf(h_tag(i)).d;
+  *out++ = ubuf(h_type(i)).d;
+  *out++ = ubuf(h_mask(i)).d;
+  *out++ = ubuf(h_image(i)).d;
+  *out++ = h_v(i,0);
+  *out++ = h_v(i,1);
+  *out++ = h_v(i,2);
+  *out++ = h_dpdTheta[i];
+  *out++ = h_uCond[i];
+  *out++ = h_uMech[i];
+  *out++ = h_uChem[i];
+
+  for (int ifix = 0; ifix < atom->nextra_restart; ifix++)  // fix data follows
+    out += modify->fix[atom->extra_restart[ifix]]->pack_restart(i,out);
+
+  const int nvalues = static_cast<int>(out - buf);
+  buf[0] = nvalues;
+  return nvalues;
+}
+
+/* ----------------------------------------------------------------------
+   unpack data for one atom from restart file including extra quantities
+------------------------------------------------------------------------- */
+
+int AtomVecDPDKokkos::unpack_restart(double *buf)
+{
+  int nlocal = atom->nlocal;
+  if (nlocal == nmax) {
+    grow(0);
+    if (atom->nextra_store)
+      memory->grow(atom->extra,nmax,atom->nextra_store,"atom:extra");
+  }
+  modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
+                MASK_MASK | IMAGE_MASK );
+
+  int m = 1;  // buf[0] is the record length written by pack_restart()
+  h_x(nlocal,0) = buf[m++];
+  h_x(nlocal,1) = buf[m++];
+  h_x(nlocal,2) = buf[m++];
+  h_tag(nlocal) = (tagint) ubuf(buf[m++]).i;
+  h_type(nlocal) = (int) ubuf(buf[m++]).i;
+  h_mask(nlocal) = (int) ubuf(buf[m++]).i;
+  h_image(nlocal) = (imageint) ubuf(buf[m++]).i;
+  h_v(nlocal,0) = buf[m++];
+  h_v(nlocal,1) = buf[m++];
+  h_v(nlocal,2) = buf[m++];
+  h_dpdTheta[nlocal] = buf[m++];
+  h_uCond[nlocal] = buf[m++];
+  h_uMech[nlocal] = buf[m++];
+  h_uChem[nlocal] = buf[m++];
+  // whatever remains of the record was packed by fixes; stash it in atom->extra
+  double **extra = atom->extra;
+  if (atom->nextra_store) {
+    int size = static_cast<int> (buf[0]) - m;  // buf[0] holds the length as a plain number
+    for (int i = 0; i < size; i++) extra[nlocal][i] = buf[m++];
+  }
+
+  atom->nlocal++;
+  return m;
+}
+
+/* ----------------------------------------------------------------------
+   create one atom of itype at coord
+   set other values to defaults
+------------------------------------------------------------------------- */
+
+void AtomVecDPDKokkos::create_atom(int itype, double *coord)
+{
+  int nlocal = atom->nlocal;
+  if (nlocal == nmax) {
+    // arrays are full: flag host data as modified, then grow (debug print disabled)
+    atomKK->modified(Host,ALL_MASK);
+    grow(0);
+    // (a second debug print after grow() used to sit here)
+  }
+  atomKK->modified(Host,ALL_MASK);
+
+  tag[nlocal] = 0;       // written through the raw pointers, not h_tag/h_type
+  type[nlocal] = itype;
+  h_x(nlocal,0) = coord[0];
+  h_x(nlocal,1) = coord[1];
+  h_x(nlocal,2) = coord[2];
+  h_mask[nlocal] = 1;
+  h_image[nlocal] = ((tagint) IMGMAX << IMG2BITS) |
+    ((tagint) IMGMAX << IMGBITS) | IMGMAX;  // IMGMAX in each of the three packed fields
+  h_v(nlocal,0) = 0.0;
+  h_v(nlocal,1) = 0.0;
+  h_v(nlocal,2) = 0.0;
+  h_rho[nlocal] = 0.0;   // all DPD quantities start at zero
+  h_dpdTheta[nlocal] = 0.0;
+  h_uCond[nlocal] = 0.0;
+  h_uMech[nlocal] = 0.0;
+  h_uChem[nlocal] = 0.0;
+  h_uCG[nlocal] = 0.0;
+  h_uCGnew[nlocal] = 0.0;
+  h_duChem[nlocal] = 0.0;
+
+  atom->nlocal++;
+}
+
+/* ----------------------------------------------------------------------
+   unpack one line from Atoms section of data file
+   initialize other atom quantities
+------------------------------------------------------------------------- */
+
+void AtomVecDPDKokkos::data_atom(double *coord, tagint imagetmp,
+                                    char **values)
+{
+  int nlocal = atom->nlocal;
+  if (nlocal == nmax) grow(0);
+
+  h_tag[nlocal] = ATOTAGINT(values[0]);
+  h_type[nlocal] = atoi(values[1]);
+  if (h_type[nlocal] <= 0 || h_type[nlocal] > atom->ntypes)  // validate the value just stored
+    error->one(FLERR,"Invalid atom type in Atoms section of data file");
+
+  h_dpdTheta[nlocal] = atof(values[2]);
+  if (h_dpdTheta[nlocal] <= 0)
+    error->one(FLERR,"Internal temperature in Atoms section of date file must be > zero");
+
+  h_x(nlocal,0) = coord[0];
+  h_x(nlocal,1) = coord[1];
+  h_x(nlocal,2) = coord[2];
+
+  h_image[nlocal] = imagetmp;
+  h_mask[nlocal] = 1;
+  h_v(nlocal,0) = 0.0;
+  h_v(nlocal,1) = 0.0;
+  h_v(nlocal,2) = 0.0;
+
+  h_rho[nlocal] = 0.0;  // quantities not read from the data file start at zero
+  h_uCond[nlocal] = 0.0;
+  h_uMech[nlocal] = 0.0;
+  h_uChem[nlocal] = 0.0;
+  h_uCG[nlocal] = 0.0;
+  h_uCGnew[nlocal] = 0.0;
+  h_duChem[nlocal] = 0.0;  // zeroed as in create_atom(), so it is never left uninitialized
+
+  atomKK->modified(Host,ALL_MASK);
+
+  atom->nlocal++;
+}
+
+/* ----------------------------------------------------------------------
+   unpack hybrid quantities from one line in Atoms section of data file
+   initialize other atom quantities for this sub-style
+------------------------------------------------------------------------- */
+
+int AtomVecDPDKokkos::data_atom_hybrid(int nlocal, char **values)
+{
+  const double theta = atof(values[0]);  // the single column this sub-style reads
+  h_dpdTheta(nlocal) = theta;
+  return 1;
+}
+
+/* ----------------------------------------------------------------------
+   pack atom info for data file including 3 image flags
+------------------------------------------------------------------------- */
+
+void AtomVecDPDKokkos::pack_data(double **buf)
+{
+  int nlocal = atom->nlocal;  // one row per owned atom: tag type dpdTheta x y z ix iy iz
+  for (int i = 0; i < nlocal; i++) {
+    buf[i][0] = ubuf(h_tag(i)).d;
+    buf[i][1] = ubuf(h_type(i)).d;
+    buf[i][2] = h_dpdTheta(i);
+    buf[i][3] = h_x(i,0);
+    buf[i][4] = h_x(i,1);
+    buf[i][5] = h_x(i,2);
+    buf[i][6] = ubuf((h_image[i] & IMGMASK) - IMGMAX).d;  // integer bits: write_data() reads .i
+    buf[i][7] = ubuf((h_image[i] >> IMGBITS & IMGMASK) - IMGMAX).d;
+    buf[i][8] = ubuf((h_image[i] >> IMG2BITS) - IMGMAX).d;
+  }
+}
+
+/* ----------------------------------------------------------------------
+   pack hybrid atom info for data file
+------------------------------------------------------------------------- */
+
+int AtomVecDPDKokkos::pack_data_hybrid(int i, double *buf)
+{
+  *buf = h_dpdTheta(i);  // this sub-style contributes one value per atom
+  return 1;
+}
+
+/* ----------------------------------------------------------------------
+   write atom info to data file including 3 image flags
+------------------------------------------------------------------------- */
+
+void AtomVecDPDKokkos::write_data(FILE *fp, int n, double **buf)
+{
+  for (int irow = 0; irow < n; ++irow) {
+    const double *row = buf[irow];  // tag type dpdTheta x y z ix iy iz
+    fprintf(fp,TAGINT_FORMAT " %d %-1.16e %-1.16e %-1.16e %-1.16e %d %d %d\n",
+            (tagint) ubuf(row[0]).i,(int) ubuf(row[1]).i,row[2],row[3],row[4],row[5],
+            (int) ubuf(row[6]).i,(int) ubuf(row[7]).i,(int) ubuf(row[8]).i);
+  }
+}
+
+/* ----------------------------------------------------------------------
+   write hybrid atom info to data file
+------------------------------------------------------------------------- */
+
+int AtomVecDPDKokkos::write_data_hybrid(FILE *fp, double *buf)
+{
+  fprintf(fp," %-1.16e",*buf);  // leading space, no newline
+  return 1;
+}
+
+/* ----------------------------------------------------------------------
+   return # of bytes of allocated memory
+------------------------------------------------------------------------- */
+
+bigint AtomVecDPDKokkos::memory_usage()
+{
+  bigint nbytes = 0;
+  // an array is counted only when atom->memcheck() accepts its name
+  if (atom->memcheck("tag")) nbytes += memory->usage(tag,nmax);
+  if (atom->memcheck("type")) nbytes += memory->usage(type,nmax);
+  if (atom->memcheck("mask")) nbytes += memory->usage(mask,nmax);
+  if (atom->memcheck("image")) nbytes += memory->usage(image,nmax);
+  if (atom->memcheck("x")) nbytes += memory->usage(x,nmax,3);
+  if (atom->memcheck("v")) nbytes += memory->usage(v,nmax,3);
+  if (atom->memcheck("f")) nbytes += memory->usage(f,nmax*commKK->nthreads,3);
+  if (atom->memcheck("rho")) nbytes += memory->usage(rho,nmax);
+  if (atom->memcheck("dpdTheta")) nbytes += memory->usage(dpdTheta,nmax);
+  if (atom->memcheck("uCond")) nbytes += memory->usage(uCond,nmax);
+  if (atom->memcheck("uMech")) nbytes += memory->usage(uMech,nmax);
+  if (atom->memcheck("uChem")) nbytes += memory->usage(uChem,nmax);
+  if (atom->memcheck("uCG")) nbytes += memory->usage(uCG,nmax);
+  if (atom->memcheck("uCGnew")) nbytes += memory->usage(uCGnew,nmax);
+  if (atom->memcheck("duChem")) nbytes += memory->usage(duChem,nmax);
+
+  return nbytes;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecDPDKokkos::sync(ExecutionSpace space, unsigned int mask)
+{
+  if (space == Device) {  // sync each field selected in mask to the device
+    if (mask & X_MASK) atomKK->k_x.sync<LMPDeviceType>();
+    if (mask & V_MASK) atomKK->k_v.sync<LMPDeviceType>();
+    if (mask & F_MASK) atomKK->k_f.sync<LMPDeviceType>();
+    if (mask & TAG_MASK) atomKK->k_tag.sync<LMPDeviceType>();
+    if (mask & TYPE_MASK) atomKK->k_type.sync<LMPDeviceType>();
+    if (mask & MASK_MASK) atomKK->k_mask.sync<LMPDeviceType>();
+    if (mask & IMAGE_MASK) atomKK->k_image.sync<LMPDeviceType>();
+    return;
+  }
+  if (mask & X_MASK) atomKK->k_x.sync<LMPHostType>();  // any other space: host
+  if (mask & V_MASK) atomKK->k_v.sync<LMPHostType>();
+  if (mask & F_MASK) atomKK->k_f.sync<LMPHostType>();
+  if (mask & TAG_MASK) atomKK->k_tag.sync<LMPHostType>();
+  if (mask & TYPE_MASK) atomKK->k_type.sync<LMPHostType>();
+  if (mask & MASK_MASK) atomKK->k_mask.sync<LMPHostType>();
+  if (mask & IMAGE_MASK) atomKK->k_image.sync<LMPHostType>();
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecDPDKokkos::sync_overlapping_device(ExecutionSpace space, unsigned int mask)
+{
+  if (space == Device) {  // copy each selected field only if need_sync() says so
+    if ((mask & X_MASK) && atomKK->k_x.need_sync<LMPDeviceType>())
+      perform_async_copy<DAT::tdual_x_array>(atomKK->k_x,space);
+    if ((mask & V_MASK) && atomKK->k_v.need_sync<LMPDeviceType>())
+      perform_async_copy<DAT::tdual_v_array>(atomKK->k_v,space);
+    if ((mask & F_MASK) && atomKK->k_f.need_sync<LMPDeviceType>())
+      perform_async_copy<DAT::tdual_f_array>(atomKK->k_f,space);
+    if ((mask & TAG_MASK) && atomKK->k_tag.need_sync<LMPDeviceType>())
+      perform_async_copy<DAT::tdual_tagint_1d>(atomKK->k_tag,space);
+    if ((mask & TYPE_MASK) && atomKK->k_type.need_sync<LMPDeviceType>())
+      perform_async_copy<DAT::tdual_int_1d>(atomKK->k_type,space);
+    if ((mask & MASK_MASK) && atomKK->k_mask.need_sync<LMPDeviceType>())
+      perform_async_copy<DAT::tdual_int_1d>(atomKK->k_mask,space);
+    if ((mask & IMAGE_MASK) && atomKK->k_image.need_sync<LMPDeviceType>())
+      perform_async_copy<DAT::tdual_imageint_1d>(atomKK->k_image,space);
+  } else {  // same checks against the host copies
+    if ((mask & X_MASK) && atomKK->k_x.need_sync<LMPHostType>())
+      perform_async_copy<DAT::tdual_x_array>(atomKK->k_x,space);
+    if ((mask & V_MASK) && atomKK->k_v.need_sync<LMPHostType>())
+      perform_async_copy<DAT::tdual_v_array>(atomKK->k_v,space);
+    if ((mask & F_MASK) && atomKK->k_f.need_sync<LMPHostType>())
+      perform_async_copy<DAT::tdual_f_array>(atomKK->k_f,space);
+    if ((mask & TAG_MASK) && atomKK->k_tag.need_sync<LMPHostType>())
+      perform_async_copy<DAT::tdual_tagint_1d>(atomKK->k_tag,space);
+    if ((mask & TYPE_MASK) && atomKK->k_type.need_sync<LMPHostType>())
+      perform_async_copy<DAT::tdual_int_1d>(atomKK->k_type,space);
+    if ((mask & MASK_MASK) && atomKK->k_mask.need_sync<LMPHostType>())
+      perform_async_copy<DAT::tdual_int_1d>(atomKK->k_mask,space);
+    if ((mask & IMAGE_MASK) && atomKK->k_image.need_sync<LMPHostType>())
+      perform_async_copy<DAT::tdual_imageint_1d>(atomKK->k_image,space);
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecDPDKokkos::modified(ExecutionSpace space, unsigned int mask)
+{
+  if (space != Device) {  // host side: flag each field selected in mask
+    if (mask & X_MASK) atomKK->k_x.modify<LMPHostType>();
+    if (mask & V_MASK) atomKK->k_v.modify<LMPHostType>();
+    if (mask & F_MASK) atomKK->k_f.modify<LMPHostType>();
+    if (mask & TAG_MASK) atomKK->k_tag.modify<LMPHostType>();
+    if (mask & TYPE_MASK) atomKK->k_type.modify<LMPHostType>();
+    if (mask & MASK_MASK) atomKK->k_mask.modify<LMPHostType>();
+    if (mask & IMAGE_MASK) atomKK->k_image.modify<LMPHostType>();
+    return;
+  }
+  if (mask & X_MASK) atomKK->k_x.modify<LMPDeviceType>();  // device side
+  if (mask & V_MASK) atomKK->k_v.modify<LMPDeviceType>();
+  if (mask & F_MASK) atomKK->k_f.modify<LMPDeviceType>();
+  if (mask & TAG_MASK) atomKK->k_tag.modify<LMPDeviceType>();
+  if (mask & TYPE_MASK) atomKK->k_type.modify<LMPDeviceType>();
+  if (mask & MASK_MASK) atomKK->k_mask.modify<LMPDeviceType>();
+  if (mask & IMAGE_MASK) atomKK->k_image.modify<LMPDeviceType>();
+}
+
diff --git a/src/KOKKOS/atom_vec_dpd_kokkos.h b/src/KOKKOS/atom_vec_dpd_kokkos.h
new file mode 100644
index 0000000000..d108e58ae7
--- /dev/null
+++ b/src/KOKKOS/atom_vec_dpd_kokkos.h
@@ -0,0 +1,135 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale AtomicKokkos/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef ATOM_CLASS
+
+AtomStyle(dpd/kk,AtomVecDPDKokkos)
+
+#else
+
+#ifndef LMP_ATOM_VEC_DPD_KOKKOS_H
+#define LMP_ATOM_VEC_DPD_KOKKOS_H
+
+#include "atom_vec_kokkos.h"
+#include "kokkos_type.h"
+
+namespace LAMMPS_NS {
+
+class AtomVecDPDKokkos : public AtomVecKokkos {
+ public:
+  AtomVecDPDKokkos(class LAMMPS *);
+  virtual ~AtomVecDPDKokkos() {}
+  void grow(int);
+  void copy(int, int, int);
+  int pack_comm(int, int *, double *, int, int *);  // pack/unpack on plain double buffers
+  int pack_comm_vel(int, int *, double *, int, int *);
+  int pack_comm_hybrid(int, int *, double *);
+  void unpack_comm(int, int, double *);
+  void unpack_comm_vel(int, int, double *);
+  int unpack_comm_hybrid(int, int, double *);
+  int pack_reverse(int, int, double *);
+  void unpack_reverse(int, int *, double *);
+  int pack_border(int, int *, double *, int, int *);
+  int pack_border_vel(int, int *, double *, int, int *);
+  int pack_border_hybrid(int, int *, double *);
+  void unpack_border(int, int, double *);
+  void unpack_border_vel(int, int, double *);
+  int unpack_border_hybrid(int, int, double *);
+  int pack_exchange(int, double *);
+  int unpack_exchange(double *);
+  int size_restart();
+  int pack_restart(int, double *);
+  int unpack_restart(double *);
+  void create_atom(int, double *);
+  void data_atom(double *, tagint, char **);
+  int data_atom_hybrid(int, char **);
+  void pack_data(double **);
+  int pack_data_hybrid(int, double *);
+  void write_data(FILE *, int, double **);
+  int write_data_hybrid(FILE *, double *);
+  bigint memory_usage();
+  // variants below take Kokkos dual-view buffers and lists instead of raw pointers
+  void grow_reset();
+  int pack_comm_kokkos(const int &n, const DAT::tdual_int_2d &k_sendlist,
+                       const int & iswap,
+                       const DAT::tdual_xfloat_2d &buf,
+                       const int &pbc_flag, const int pbc[]);
+  void unpack_comm_kokkos(const int &n, const int &nfirst,
+                          const DAT::tdual_xfloat_2d &buf);
+  int pack_comm_self(const int &n, const DAT::tdual_int_2d &list,
+                     const int & iswap, const int nfirst,
+                     const int &pbc_flag, const int pbc[]);
+  int pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist,
+                         DAT::tdual_xfloat_2d buf,int iswap,
+                         int pbc_flag, int *pbc, ExecutionSpace space);
+  void unpack_border_kokkos(const int &n, const int &nfirst,
+                            const DAT::tdual_xfloat_2d &buf,
+                            ExecutionSpace space);
+  int pack_exchange_kokkos(const int &nsend,DAT::tdual_xfloat_2d &buf,
+                           DAT::tdual_int_1d k_sendlist,
+                           DAT::tdual_int_1d k_copylist,
+                           ExecutionSpace space, int dim,
+                           X_FLOAT lo, X_FLOAT hi);
+  int unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf, int nrecv,
+                             int nlocal, int dim, X_FLOAT lo, X_FLOAT hi,
+                             ExecutionSpace space);
+  // sync/modified act on the fields selected by the mask bits, for the given space
+  void sync(ExecutionSpace space, unsigned int mask);
+  void modified(ExecutionSpace space, unsigned int mask);
+  void sync_overlapping_device(ExecutionSpace space, unsigned int mask);
+  double *uCond,*uMech,*uChem,*uCG,*uCGnew,*rho,*dpdTheta;  // raw pointers to the DPD arrays
+  double *duChem;
+
+ protected:
+  DAT::t_efloat_1d d_uCond, d_uMech, d_uChem, d_uCG, d_uCGnew,d_rho,d_dpdTheta,d_duChem;  // d_: device views
+  HAT::t_efloat_1d h_uCond, h_uMech, h_uChem, h_uCG, h_uCGnew,h_rho,h_dpdTheta,h_duChem;  // h_: host views
+
+  tagint *tag;
+  imageint *image;
+  int *type,*mask;
+  double **x,**v,**f;
+
+  DAT::t_tagint_1d d_tag;
+  HAT::t_tagint_1d h_tag;
+  DAT::t_imageint_1d d_image;
+  HAT::t_imageint_1d h_image;
+  DAT::t_int_1d d_type, d_mask;
+  HAT::t_int_1d h_type, h_mask;
+
+  DAT::t_x_array d_x;
+  DAT::t_v_array d_v;
+  DAT::t_f_array d_f;
+  HAT::t_x_array h_x;
+  HAT::t_v_array h_v;
+  HAT::t_f_array h_f;
+
+  DAT::tdual_int_1d k_count;  // one-element counter used by unpack_exchange_kokkos()
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Per-processor system is too big
+
+The number of owned atoms plus ghost atoms on a single
+processor must fit in 32-bit integer.
+
+E: Invalid atom type in Atoms section of data file
+
+Atom types must range from 1 to specified # of types.
+
+*/

From 91e38720d5d69052cc92cd2344126b81d97c4aca Mon Sep 17 00:00:00 2001
From: Stan Moore <stanmoore1@gmail.com>
Date: Mon, 28 Nov 2016 14:25:02 -0700
Subject: [PATCH 004/267] Adding pair_exp6_rx_kokkos files

---
 src/KOKKOS/Install.sh              |    2 +
 src/KOKKOS/atom_kokkos.cpp         |   57 ++
 src/KOKKOS/atom_kokkos.h           |    4 +
 src/KOKKOS/pair_exp6_rx_kokkos.cpp | 1060 ++++++++++++++++++++++++++++
 src/KOKKOS/pair_exp6_rx_kokkos.h   |  204 ++++++
 src/USER-DPD/pair_exp6_rx.cpp      |    2 +
 src/atom.h                         |    4 +-
 7 files changed, 1331 insertions(+), 2 deletions(-)
 create mode 100644 src/KOKKOS/pair_exp6_rx_kokkos.cpp
 create mode 100644 src/KOKKOS/pair_exp6_rx_kokkos.h

diff --git a/src/KOKKOS/Install.sh b/src/KOKKOS/Install.sh
index 93adf58ef5..14a8a951ee 100644
--- a/src/KOKKOS/Install.sh
+++ b/src/KOKKOS/Install.sh
@@ -134,6 +134,8 @@ action pair_eam_alloy_kokkos.cpp pair_eam_alloy.cpp
 action pair_eam_alloy_kokkos.h pair_eam_alloy.h
 action pair_eam_fs_kokkos.cpp pair_eam_fs.cpp
 action pair_eam_fs_kokkos.h pair_eam_fs.h
+action pair_exp6_rx_kokkos.cpp pair_exp6_rx.cpp
+action pair_exp6_rx_kokkos.h pair_exp6_rx.h
 action pair_kokkos.h
 action pair_lj_charmm_coul_charmm_implicit_kokkos.cpp pair_lj_charmm_coul_charmm_implicit.cpp
 action pair_lj_charmm_coul_charmm_implicit_kokkos.h pair_lj_charmm_coul_charmm_implicit.h
diff --git a/src/KOKKOS/atom_kokkos.cpp b/src/KOKKOS/atom_kokkos.cpp
index 577eff2364..4a7250e6ab 100644
--- a/src/KOKKOS/atom_kokkos.cpp
+++ b/src/KOKKOS/atom_kokkos.cpp
@@ -227,6 +227,63 @@ void AtomKokkos::grow(unsigned int mask){
   }
 }
 
+/* ----------------------------------------------------------------------
+   add a custom variable with name of type flag = 0/1 for int/double
+   assumes name does not already exist
+   return index in ivector or dvector of its location
+------------------------------------------------------------------------- */
+
+int AtomKokkos::add_custom(const char *name, int flag)
+{
+  int index;
+
+  if (flag == 0) {  // integer vector: append one name slot and one int array
+    index = nivector;
+    nivector++;
+    iname = (char **) memory->srealloc(iname,nivector*sizeof(char *),
+                                       "atom:iname");
+    int n = strlen(name) + 1;  // +1 for the terminating NUL
+    iname[index] = new char[n];
+    strcpy(iname[index],name);
+    ivector = (int **) memory->srealloc(ivector,nivector*sizeof(int *),
+                                        "atom:ivector");
+    memory->create(ivector[index],nmax,"atom:ivector");
+  } else {  // double vector: storage goes through the k_dvector dual view
+    index = ndvector;
+    ndvector++;
+    dname = (char **) memory->srealloc(dname,ndvector*sizeof(char *),
+                                       "atom:dname");
+    int n = strlen(name) + 1;
+    dname[index] = new char[n];
+    strcpy(dname[index],name);
+    memory->grow_kokkos(k_dvector,dvector,ndvector,nmax,  // presumably resizes to ndvector x nmax
+                        "atom:dvector");
+  }
+
+  return index;
+}
+
+/* ----------------------------------------------------------------------
+   remove a custom variable of type flag = 0/1 for int/double at index
+   free memory for vector and name and set ptrs to NULL
+   ivector/dvector and iname/dname lists never shrink
+------------------------------------------------------------------------- */
+
+void AtomKokkos::remove_custom(int flag, int index)
+{
+  if (flag == 0) {  // integer vector: free the array and its name
+    memory->destroy(ivector[index]);
+    ivector[index] = NULL;
+    delete [] iname[index];
+    iname[index] = NULL;
+  } else {
+    //memory->destroy_kokkos(dvector);  NOTE(review): disabled, so no storage is released here
+    dvector[index] = NULL;
+    delete [] dname[index];
+    dname[index] = NULL;
+  }
+}
+
 /* ---------------------------------------------------------------------- */
 
 void AtomKokkos::deallocate_topology()
diff --git a/src/KOKKOS/atom_kokkos.h b/src/KOKKOS/atom_kokkos.h
index f31c26e01f..cf454bcd0c 100644
--- a/src/KOKKOS/atom_kokkos.h
+++ b/src/KOKKOS/atom_kokkos.h
@@ -51,6 +51,8 @@ class AtomKokkos : public Atom {
   DAT::tdual_int_2d k_improper_type;
   DAT::tdual_tagint_2d k_improper_atom1, k_improper_atom2, k_improper_atom3, k_improper_atom4;
 
+  DAT::tdual_float_2d k_dvector;
+
 
 // USER-DPD package
   DAT::tdual_efloat_1d k_uCond, k_uMech, k_uChem, k_uCG, k_uCGnew,
@@ -66,6 +68,8 @@ class AtomKokkos : public Atom {
   void sync_overlapping_device(const ExecutionSpace space, unsigned int mask);
   virtual void sort();
   virtual void grow(unsigned int mask);
+  int add_custom(const char *, int);
+  void remove_custom(int, int);
   virtual void deallocate_topology();
   void sync_modify(ExecutionSpace, unsigned int, unsigned int);
  private:
diff --git a/src/KOKKOS/pair_exp6_rx_kokkos.cpp b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
new file mode 100644
index 0000000000..aa37c8375d
--- /dev/null
+++ b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
@@ -0,0 +1,1060 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "pair_exp6_rx_kokkos.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neigh_list.h"
+#include "math_const.h"
+#include "math_special.h"
+#include "memory.h"
+#include "error.h"
+#include "modify.h"
+#include "fix.h"
+#include <float.h>
+
+using namespace LAMMPS_NS;
+using namespace MathConst;
+using namespace MathSpecial;
+
+#define MAXLINE 1024  // size of the line buffer used by read_file()
+#define DELTA 4  // growth increment for the params array in read_file()
+
+#define oneFluidApproxParameter (-1)  // site value tested by isOneFluidApprox()
+#define isOneFluidApprox(_site) ( (_site) == oneFluidApproxParameter )
+
+#define exp6PotentialType (1)  // value compared against Param::potentialType
+#define isExp6PotentialType(_type) ( (_type) == exp6PotentialType )
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+PairExp6rxKokkos<DeviceType>::PairExp6rxKokkos(LAMMPS *lmp) : PairExp6rx(lmp)
+{  // forwards to the PairExp6rx constructor; the body itself does nothing
+
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+PairExp6rxKokkos<DeviceType>::~PairExp6rxKokkos()
+{  // empty: nothing is released explicitly in this destructor body
+
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairExp6rxKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
+{  // per-step driver: set up tallies, fill per-atom exp-6 parameters, run the pair kernel, collect energy/virial
+  eflag = eflag_in;
+  vflag = vflag_in;
+
+  if (neighflag == FULL) no_virial_fdotr_compute = 1;
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  else evflag = vflag_fdotr = 0;
+
+  // reallocate per-atom arrays if necessary
+
+  if (eflag_atom) {
+    memory->destroy_kokkos(k_eatom,eatom);
+    memory->create_kokkos(k_eatom,eatom,maxeatom,"pair:eatom");
+    d_eatom = k_eatom.d_view;
+  }
+  if (vflag_atom) {
+    memory->destroy_kokkos(k_vatom,vatom);
+    memory->create_kokkos(k_vatom,vatom,maxvatom,6,"pair:vatom");
+    d_vatom = k_vatom.d_view;
+  }
+
+  x = atomKK->k_x.view<DeviceType>();  // cache DeviceType views of the per-atom data the kernels read/write
+  f = atomKK->k_f.view<DeviceType>();
+  type = atomKK->k_type.view<DeviceType>();
+  uCG = atomKK->k_uCG.view<DeviceType>();
+  uCGnew = atomKK->k_uCGnew.view<DeviceType>();
+  dvector = atomKK->k_dvector.view<DeviceType>();
+  nlocal = atom->nlocal;
+  special_lj[0] = force->special_lj[0];  // copy the special-bond factors into members used by the kernel
+  special_lj[1] = force->special_lj[1];
+  special_lj[2] = force->special_lj[2];
+  special_lj[3] = force->special_lj[3];
+  special_coul[0] = force->special_coul[0];
+  special_coul[1] = force->special_coul[1];
+  special_coul[2] = force->special_coul[2];
+  special_coul[3] = force->special_coul[3];
+  newton_pair = force->newton_pair;
+
+  copymode = 1;  // reset to 0 at the end of this function; presumably marks *this as being copied into kernels - TODO confirm
+
+  // Initialize the Exp6 parameter data for both the local
+  // and ghost atoms. Make the parameter data persistent
+  // and exchange like any other atom property later.
+
+  {
+     const int np_total = nlocal + atom->nghost;
+
+     PairExp6ParamData.epsilon1     = typename AT::t_float_1d("PairExp6ParamData.epsilon1"    ,np_total);  // NOTE(review): all 16 views are re-created on every compute() call
+     PairExp6ParamData.alpha1       = typename AT::t_float_1d("PairExp6ParamData.alpha1"      ,np_total);
+     PairExp6ParamData.rm1          = typename AT::t_float_1d("PairExp6ParamData.rm1"         ,np_total);
+     PairExp6ParamData.fraction1    = typename AT::t_float_1d("PairExp6ParamData.fraction1"   ,np_total);
+     PairExp6ParamData.epsilon2     = typename AT::t_float_1d("PairExp6ParamData.epsilon2"    ,np_total);
+     PairExp6ParamData.alpha2       = typename AT::t_float_1d("PairExp6ParamData.alpha2"      ,np_total);
+     PairExp6ParamData.rm2          = typename AT::t_float_1d("PairExp6ParamData.rm2"         ,np_total);
+     PairExp6ParamData.fraction2    = typename AT::t_float_1d("PairExp6ParamData.fraction2"   ,np_total);
+     PairExp6ParamData.epsilonOld1  = typename AT::t_float_1d("PairExp6ParamData.epsilonOld1" ,np_total);
+     PairExp6ParamData.alphaOld1    = typename AT::t_float_1d("PairExp6ParamData.alphaOld1"   ,np_total);
+     PairExp6ParamData.rmOld1       = typename AT::t_float_1d("PairExp6ParamData.rmOld1"      ,np_total);
+     PairExp6ParamData.fractionOld1 = typename AT::t_float_1d("PairExp6ParamData.fractionOld1",np_total);
+     PairExp6ParamData.epsilonOld2  = typename AT::t_float_1d("PairExp6ParamData.epsilonOld2" ,np_total);
+     PairExp6ParamData.alphaOld2    = typename AT::t_float_1d("PairExp6ParamData.alphaOld2"   ,np_total);
+     PairExp6ParamData.rmOld2       = typename AT::t_float_1d("PairExp6ParamData.rmOld2"      ,np_total);
+     PairExp6ParamData.fractionOld2 = typename AT::t_float_1d("PairExp6ParamData.fractionOld2",np_total);
+
+     Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairExp6rxgetParamsEXP6>(0,np_total),*this);  // one getParamsEXP6 call per local+ghost atom
+  }
+
+  int inum = list->inum;
+  NeighListKokkos<DeviceType>* k_list = static_cast<NeighListKokkos<DeviceType>*>(list);
+  d_numneigh = k_list->d_numneigh;
+  d_neighbors = k_list->d_neighbors;
+  d_ilist = k_list->d_ilist;
+
+  // loop over neighbors of my atoms
+
+  EV_FLOAT ev;  // receives the reduced energy/virial when evflag is set
+
+  if (evflag) {  // NOTE(review): both dispatches are hard-wired to HALF / NEWTON_PAIR=1, independent of neighflag and newton_pair
+    Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairExp6rxCompute<HALF,1,1> >(0,inum),*this,ev);
+  } else {
+    Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairExp6rxCompute<HALF,1,0> >(0,inum),*this);
+  }
+
+  if (eflag_global) eng_vdwl += ev.evdwl;
+  if (vflag_global) {
+    virial[0] += ev.v[0];
+    virial[1] += ev.v[1];
+    virial[2] += ev.v[2];
+    virial[3] += ev.v[3];
+    virial[4] += ev.v[4];
+    virial[5] += ev.v[5];
+  }
+
+  if (vflag_fdotr) pair_virial_fdotr_compute(this);
+
+  if (eflag_atom) {  // mark the per-atom energy as modified on DeviceType, then sync it to the host
+    k_eatom.template modify<DeviceType>();
+    k_eatom.template sync<LMPHostType>();
+  }
+
+  if (vflag_atom) {  // same for the per-atom virial
+    k_vatom.template modify<DeviceType>();
+    k_vatom.template sync<LMPHostType>();
+  }
+
+  copymode = 0;
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxgetParamsEXP6, const int &i) const {  // kernel body: fill entry i of every PairExp6ParamData view
+  getParamsEXP6 (i, PairExp6ParamData.epsilon1[i],  // argument order must match the getParamsEXP6 signature: new site1, new site2, old site1, old site2
+                    PairExp6ParamData.alpha1[i],
+                    PairExp6ParamData.rm1[i],
+                    PairExp6ParamData.fraction1[i],
+                    PairExp6ParamData.epsilon2[i],
+                    PairExp6ParamData.alpha2[i],
+                    PairExp6ParamData.rm2[i],
+                    PairExp6ParamData.fraction2[i],
+                    PairExp6ParamData.epsilonOld1[i],
+                    PairExp6ParamData.alphaOld1[i],
+                    PairExp6ParamData.rmOld1[i],
+                    PairExp6ParamData.fractionOld1[i],
+                    PairExp6ParamData.epsilonOld2[i],
+                    PairExp6ParamData.alphaOld2[i],
+                    PairExp6ParamData.rmOld2[i],
+                    PairExp6ParamData.fractionOld2[i]);
+}
+
+template<class DeviceType>
+template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
+KOKKOS_INLINE_FUNCTION
+void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxCompute<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int &ii, EV_FLOAT& ev) const {  // pair kernel for the ii-th entry of d_ilist; accumulates f, uCG, uCGnew and (if EVFLAG) ev
+  int i,j,jj,jnum,itype,jtype;  // NOTE(review): this j is shadowed by the loop-local "int j" below
+  double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,evdwlOld,fpair;
+  double rsq,r2inv,r6inv,forceExp6,factor_lj;
+  double rCut,rCutInv,rCut2inv,rCut6inv,rCutExp,urc,durc;
+  double rm2ij,rm6ij;
+  double r,rexp;
+
+  double alphaOld12_ij, rmOld12_ij, epsilonOld12_ij;
+  double alphaOld21_ij, rmOld21_ij, epsilonOld21_ij;
+  double alpha12_ij, rm12_ij, epsilon12_ij;
+  double alpha21_ij, rm21_ij, epsilon21_ij;
+  double rminv, buck1, buck2;
+  double epsilonOld1_i,alphaOld1_i,rmOld1_i;
+  double epsilonOld1_j,alphaOld1_j,rmOld1_j;
+  double epsilonOld2_i,alphaOld2_i,rmOld2_i;
+  double epsilonOld2_j,alphaOld2_j,rmOld2_j;
+  double epsilon1_i,alpha1_i,rm1_i;
+  double epsilon1_j,alpha1_j,rm1_j;
+  double epsilon2_i,alpha2_i,rm2_i;
+  double epsilon2_j,alpha2_j,rm2_j;
+  double evdwlOldEXP6_12, evdwlOldEXP6_21, fpairOldEXP6_12, fpairOldEXP6_21;
+  double evdwlEXP6_12, evdwlEXP6_21;
+  double fractionOld1_i, fractionOld1_j;
+  double fractionOld2_i, fractionOld2_j;
+  double fraction1_i, fraction1_j;
+  double fraction2_i, fraction2_j;
+
+  const int nRep = 12;  // exponent of the repulsive r^-nRep core used when r < rin1
+  const double shift = 1.05;  // scale factor applied when computing the inner cutoff rin1
+  double rin1, aRep, uin1, win1, uin1rep, rin1exp, rin6, rin6inv;
+
+  evdwlOld = 0.0;
+  evdwl = 0.0;
+
+  i = d_ilist[ii];
+  xtmp = x(i,0);
+  ytmp = x(i,1);
+  ztmp = x(i,2);
+  itype = type[i];
+  jnum = d_numneigh[i];
+
+  {  // load the per-atom parameters of atom i that were filled by the getParamsEXP6 kernel
+     epsilon1_i     = PairExp6ParamData.epsilon1[i];
+     alpha1_i       = PairExp6ParamData.alpha1[i];
+     rm1_i          = PairExp6ParamData.rm1[i];
+     fraction1_i    = PairExp6ParamData.fraction1[i];
+     epsilon2_i     = PairExp6ParamData.epsilon2[i];
+     alpha2_i       = PairExp6ParamData.alpha2[i];
+     rm2_i          = PairExp6ParamData.rm2[i];
+     fraction2_i    = PairExp6ParamData.fraction2[i];
+     epsilonOld1_i  = PairExp6ParamData.epsilonOld1[i];
+     alphaOld1_i    = PairExp6ParamData.alphaOld1[i];
+     rmOld1_i       = PairExp6ParamData.rmOld1[i];
+     fractionOld1_i = PairExp6ParamData.fractionOld1[i];
+     epsilonOld2_i  = PairExp6ParamData.epsilonOld2[i];
+     alphaOld2_i    = PairExp6ParamData.alphaOld2[i];
+     rmOld2_i       = PairExp6ParamData.rmOld2[i];
+     fractionOld2_i = PairExp6ParamData.fractionOld2[i];
+  }
+
+  for (jj = 0; jj < jnum; jj++) {
+    int j = d_neighbors(i,jj);
+    factor_lj = special_lj[sbmask(j)];  // special-bond scaling selected from the raw neighbor entry
+    j &= NEIGHMASK;  // mask the neighbor entry before using it as an atom index
+
+    delx = xtmp - x(j,0);
+    dely = ytmp - x(j,1);
+    delz = ztmp - x(j,2);
+
+    rsq = delx*delx + dely*dely + delz*delz;
+    jtype = type[j];
+
+    if (rsq < cutsq[itype][jtype]) {  // NOTE(review): reads the host array cutsq although allocate() also sets up d_cutsq - confirm for device builds
+      r2inv = 1.0/rsq;
+      r6inv = r2inv*r2inv*r2inv;
+
+      r = sqrt(rsq);
+      rCut2inv = 1.0/cutsq[itype][jtype];
+      rCut6inv = rCut2inv*rCut2inv*rCut2inv;
+      rCut = sqrt(cutsq[itype][jtype]);
+      rCutInv = 1.0/rCut;
+
+      //
+      // A. Compute the exp-6 potential
+      //
+
+      // A1.  Get alpha, epsilon and rm for particle j
+
+      {
+         epsilon1_j     = PairExp6ParamData.epsilon1[j];
+         alpha1_j       = PairExp6ParamData.alpha1[j];
+         rm1_j          = PairExp6ParamData.rm1[j];
+         fraction1_j    = PairExp6ParamData.fraction1[j];
+         epsilon2_j     = PairExp6ParamData.epsilon2[j];
+         alpha2_j       = PairExp6ParamData.alpha2[j];
+         rm2_j          = PairExp6ParamData.rm2[j];
+         fraction2_j    = PairExp6ParamData.fraction2[j];
+         epsilonOld1_j  = PairExp6ParamData.epsilonOld1[j];
+         alphaOld1_j    = PairExp6ParamData.alphaOld1[j];
+         rmOld1_j       = PairExp6ParamData.rmOld1[j];
+         fractionOld1_j = PairExp6ParamData.fractionOld1[j];
+         epsilonOld2_j  = PairExp6ParamData.epsilonOld2[j];
+         alphaOld2_j    = PairExp6ParamData.alphaOld2[j];
+         rmOld2_j       = PairExp6ParamData.rmOld2[j];
+         fractionOld2_j = PairExp6ParamData.fractionOld2[j];
+      }
+
+      // A2.  Apply Lorentz-Berthelot mixing rules for the i-j pair
+      alphaOld12_ij = sqrt(alphaOld1_i*alphaOld2_j);  // "12": site1 of i with site2 of j
+      rmOld12_ij = 0.5*(rmOld1_i + rmOld2_j);
+      epsilonOld12_ij = sqrt(epsilonOld1_i*epsilonOld2_j);
+      alphaOld21_ij = sqrt(alphaOld2_i*alphaOld1_j);  // "21": site2 of i with site1 of j
+      rmOld21_ij = 0.5*(rmOld2_i + rmOld1_j);
+      epsilonOld21_ij = sqrt(epsilonOld2_i*epsilonOld1_j);
+
+      alpha12_ij = sqrt(alpha1_i*alpha2_j);
+      rm12_ij = 0.5*(rm1_i + rm2_j);
+      epsilon12_ij = sqrt(epsilon1_i*epsilon2_j);
+      alpha21_ij = sqrt(alpha2_i*alpha1_j);
+      rm21_ij = 0.5*(rm2_i + rm1_j);
+      epsilon21_ij = sqrt(epsilon2_i*epsilon1_j);
+
+      if(rmOld12_ij!=0.0 && rmOld21_ij!=0.0){  // old-composition parameters: yields the force terms and the uCG energy
+        if(alphaOld21_ij == 6.0 || alphaOld12_ij == 6.0)  // alpha == 6 would divide by zero in buck1 below
+          error->all(FLERR,"alpha_ij is 6.0 in pair exp6");  // NOTE(review): error->all is called from inside a KOKKOS_INLINE_FUNCTION kernel
+
+        // A3.  Compute some convenient quantities for evaluating the force
+        rminv = 1.0/rmOld12_ij;
+        buck1 = epsilonOld12_ij / (alphaOld12_ij - 6.0);
+        rexp = expValue(alphaOld12_ij*(1.0-r*rminv));
+        rm2ij = rmOld12_ij*rmOld12_ij;
+        rm6ij = rm2ij*rm2ij*rm2ij;
+
+        // Compute the shifted potential
+        rCutExp = expValue(alphaOld12_ij*(1.0-rCut*rminv));
+        buck2 = 6.0*alphaOld12_ij;
+        urc = buck1*(6.0*rCutExp - alphaOld12_ij*rm6ij*rCut6inv);  // potential value at the cutoff
+        durc = -buck1*buck2*(rCutExp* rminv - rCutInv*rm6ij*rCut6inv);  // potential derivative at the cutoff
+        rin1 = shift*rmOld12_ij*func_rin(alphaOld12_ij);  // inner cutoff below which the r^-nRep core is used
+        if(r < rin1){
+          rin6 = rin1*rin1*rin1*rin1*rin1*rin1;
+          rin6inv = 1.0/rin6;
+
+          rin1exp = expValue(alphaOld12_ij*(1.0-rin1*rminv));
+
+          uin1 = buck1*(6.0*rin1exp - alphaOld12_ij*rm6ij*rin6inv) - urc - durc*(rin1-rCut);
+
+          win1 = -buck1*buck2*(rin1*rin1exp*rminv - rm6ij*rin6inv) - rin1*durc;
+
+          aRep = -1.0*win1*powint(rin1,nRep)/nRep;
+
+          uin1rep = aRep/powint(rin1,nRep);
+
+          forceExp6 = -double(nRep)*aRep/powint(r,nRep);
+          fpairOldEXP6_12 = factor_lj*forceExp6*r2inv;
+
+          evdwlOldEXP6_12 = uin1 - uin1rep + aRep/powint(r,nRep);
+        } else {
+          forceExp6 = buck1*buck2*(r*rexp*rminv - rm6ij*r6inv) + r*durc;
+          fpairOldEXP6_12 = factor_lj*forceExp6*r2inv;
+
+          evdwlOldEXP6_12 = buck1*(6.0*rexp - alphaOld12_ij*rm6ij*r6inv) - urc - durc*(r-rCut);
+        }
+
+        // A3.  Compute some convenient quantities for evaluating the force
+        rminv = 1.0/rmOld21_ij;
+        buck1 = epsilonOld21_ij / (alphaOld21_ij - 6.0);
+        buck2 = 6.0*alphaOld21_ij;
+        rexp = expValue(alphaOld21_ij*(1.0-r*rminv));
+        rm2ij = rmOld21_ij*rmOld21_ij;
+        rm6ij = rm2ij*rm2ij*rm2ij;
+
+        // Compute the shifted potential
+        rCutExp = expValue(alphaOld21_ij*(1.0-rCut*rminv));
+        buck2 = 6.0*alphaOld21_ij;  // same value as assigned a few lines above (redundant)
+        urc = buck1*(6.0*rCutExp - alphaOld21_ij*rm6ij*rCut6inv);
+        durc = -buck1*buck2*(rCutExp* rminv - rCutInv*rm6ij*rCut6inv);
+        rin1 = shift*rmOld21_ij*func_rin(alphaOld21_ij);
+
+        if(r < rin1){
+          rin6 = rin1*rin1*rin1*rin1*rin1*rin1;
+          rin6inv = 1.0/rin6;
+
+          rin1exp = expValue(alphaOld21_ij*(1.0-rin1*rminv));
+
+          uin1 = buck1*(6.0*rin1exp - alphaOld21_ij*rm6ij*rin6inv) - urc - durc*(rin1-rCut);
+
+          win1 = -buck1*buck2*(rin1*rin1exp*rminv - rm6ij*rin6inv) - rin1*durc;
+
+          aRep = -1.0*win1*powint(rin1,nRep)/nRep;
+
+          uin1rep = aRep/powint(rin1,nRep);
+
+          forceExp6 = -double(nRep)*aRep/powint(r,nRep);
+          fpairOldEXP6_21 = factor_lj*forceExp6*r2inv;
+
+          evdwlOldEXP6_21 = uin1 - uin1rep + aRep/powint(r,nRep);
+        } else {
+          forceExp6 = buck1*buck2*(r*rexp*rminv - rm6ij*r6inv) + r*durc;
+          fpairOldEXP6_21 = factor_lj*forceExp6*r2inv;
+
+          evdwlOldEXP6_21 = buck1*(6.0*rexp - alphaOld21_ij*rm6ij*r6inv) - urc - durc*(r-rCut);
+        }
+
+        if (isite1 == isite2)
+          evdwlOld = sqrt(fractionOld1_i*fractionOld2_j)*evdwlOldEXP6_12;
+        else
+          evdwlOld = sqrt(fractionOld1_i*fractionOld2_j)*evdwlOldEXP6_12 + sqrt(fractionOld2_i*fractionOld1_j)*evdwlOldEXP6_21;
+
+        evdwlOld *= factor_lj;
+
+        uCG[i] += 0.5*evdwlOld;  // half of the pair energy goes to each atom
+        if (newton_pair || j < nlocal)
+          uCG[j] += 0.5*evdwlOld;  // NOTE(review): plain += on another atom's entry; confirm no two threads can update the same j concurrently
+      }
+
+      if(rm12_ij!=0.0 && rm21_ij!=0.0){  // new-composition parameters: yields the uCGnew energy; the force is applied in this branch
+        if(alpha21_ij == 6.0 || alpha12_ij == 6.0)
+          error->all(FLERR,"alpha_ij is 6.0 in pair exp6");  // NOTE(review): error->all is called from inside a KOKKOS_INLINE_FUNCTION kernel
+
+        // A3.  Compute some convenient quantities for evaluating the force
+        rminv = 1.0/rm12_ij;
+        buck1 = epsilon12_ij / (alpha12_ij - 6.0);
+        buck2 = 6.0*alpha12_ij;
+        rexp = expValue(alpha12_ij*(1.0-r*rminv));
+        rm2ij = rm12_ij*rm12_ij;
+        rm6ij = rm2ij*rm2ij*rm2ij;
+
+        // Compute the shifted potential
+        rCutExp = expValue(alpha12_ij*(1.0-rCut*rminv));
+        urc = buck1*(6.0*rCutExp - alpha12_ij*rm6ij*rCut6inv);
+        durc = -buck1*buck2*(rCutExp*rminv - rCutInv*rm6ij*rCut6inv);
+        rin1 = shift*rm12_ij*func_rin(alpha12_ij);
+
+        if(r < rin1){
+          rin6 = rin1*rin1*rin1*rin1*rin1*rin1;
+          rin6inv = 1.0/rin6;
+
+          rin1exp = expValue(alpha12_ij*(1.0-rin1*rminv));
+
+          uin1 = buck1*(6.0*rin1exp - alpha12_ij*rm6ij*rin6inv) - urc - durc*(rin1-rCut);
+
+          win1 = -buck1*buck2*(rin1*rin1exp*rminv - rm6ij*rin6inv) - rin1*durc;
+
+          aRep = -1.0*win1*powint(rin1,nRep)/nRep;
+
+          uin1rep = aRep/powint(rin1,nRep);
+
+          evdwlEXP6_12 = uin1 - uin1rep + aRep/powint(r,nRep);
+        } else {
+          evdwlEXP6_12 = buck1*(6.0*rexp - alpha12_ij*rm6ij*r6inv) - urc - durc*(r-rCut);
+        }
+
+        rminv = 1.0/rm21_ij;
+        buck1 = epsilon21_ij / (alpha21_ij - 6.0);
+        buck2 = 6.0*alpha21_ij;
+        rexp = expValue(alpha21_ij*(1.0-r*rminv));
+        rm2ij = rm21_ij*rm21_ij;
+        rm6ij = rm2ij*rm2ij*rm2ij;
+
+        // Compute the shifted potential
+        rCutExp = expValue(alpha21_ij*(1.0-rCut*rminv));
+        urc = buck1*(6.0*rCutExp - alpha21_ij*rm6ij*rCut6inv);
+        durc = -buck1*buck2*(rCutExp*rminv - rCutInv*rm6ij*rCut6inv);
+        rin1 = shift*rm21_ij*func_rin(alpha21_ij);
+
+        if(r < rin1){
+          rin6 = rin1*rin1*rin1*rin1*rin1*rin1;
+          rin6inv = 1.0/rin6;
+
+          rin1exp = expValue(alpha21_ij*(1.0-rin1*rminv));
+
+          uin1 = buck1*(6.0*rin1exp - alpha21_ij*rm6ij*rin6inv) - urc - durc*(rin1-rCut);
+
+          win1 = -buck1*buck2*(rin1*rin1exp*rminv - rm6ij*rin6inv) - rin1*durc;
+
+          aRep = -1.0*win1*powint(rin1,nRep)/nRep;
+
+          uin1rep = aRep/powint(rin1,nRep);
+
+          evdwlEXP6_21 = uin1 - uin1rep + aRep/powint(r,nRep);
+        } else {
+          evdwlEXP6_21 = buck1*(6.0*rexp - alpha21_ij*rm6ij*r6inv) - urc - durc*(r-rCut);
+        }
+
+        //
+        // Apply Mixing Rule to get the overall force for the CG pair
+        //
+        if (isite1 == isite2) fpair = sqrt(fractionOld1_i*fractionOld2_j)*fpairOldEXP6_12;  // NOTE(review): fpairOldEXP6_* are assigned only inside the rmOld12/rmOld21 != 0 branch above
+        else fpair = sqrt(fractionOld1_i*fractionOld2_j)*fpairOldEXP6_12 + sqrt(fractionOld2_i*fractionOld1_j)*fpairOldEXP6_21;
+
+        f(i,0) += delx*fpair;
+        f(i,1) += dely*fpair;
+        f(i,2) += delz*fpair;
+        if (newton_pair || j < nlocal) {  // NOTE(review): plain -= on another atom's force; confirm no two threads can update the same j concurrently
+          f(j,0) -= delx*fpair;
+          f(j,1) -= dely*fpair;
+          f(j,2) -= delz*fpair;
+        }
+
+        if (isite1 == isite2) evdwl = sqrt(fraction1_i*fraction2_j)*evdwlEXP6_12;
+        else evdwl = sqrt(fraction1_i*fraction2_j)*evdwlEXP6_12 + sqrt(fraction2_i*fraction1_j)*evdwlEXP6_21;
+        evdwl *= factor_lj;
+
+        uCGnew[i]   += 0.5*evdwl;
+        if (newton_pair || j < nlocal)
+          uCGnew[j] += 0.5*evdwl;
+        evdwl = evdwlOld;  // the energy passed to ev_tally is the old-composition value
+        //if (vflag_either || eflag_atom) 
+        if (EVFLAG) this->template ev_tally<NEIGHFLAG,NEWTON_PAIR>(ev,i,j,evdwl,fpair,delx,dely,delz);
+      }
+    }
+  }
+}
+
+template<class DeviceType>
+template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
+KOKKOS_INLINE_FUNCTION
+void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxCompute<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int &ii) const {  // overload without a reduction argument (used by the parallel_for dispatch in compute())
+  EV_FLOAT ev;  // local accumulator; its contents are discarded when this call returns
+  this->template operator()<NEIGHFLAG,NEWTON_PAIR,EVFLAG>(TagPairExp6rxCompute<NEIGHFLAG,NEWTON_PAIR,EVFLAG>(), ii, ev);
+}
+
+/* ----------------------------------------------------------------------
+   allocate all arrays
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairExp6rxKokkos<DeviceType>::allocate()
+{  // allocates setflag, cutsq (as a Kokkos dual view) and cut, each sized (ntypes+1) x (ntypes+1)
+  allocated = 1;
+  int n = atom->ntypes;
+
+  memory->create(setflag,n+1,n+1,"pair:setflag");
+  for (int i = 1; i <= n; i++)  // only entries with j >= i are cleared
+    for (int j = i; j <= n; j++)
+      setflag[i][j] = 0;
+
+  memory->create_kokkos(k_cutsq,cutsq,n+1,n+1,"pair:cutsq");
+  d_cutsq = k_cutsq.template view<DeviceType>();  // DeviceType view of the same dual view
+
+  memory->create(cut,n+1,n+1,"pair:cut_lj");
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairExp6rxKokkos<DeviceType>::read_file(char *file)
+{  // parse the exp6/rx potential file on rank 0, broadcast each line, fill params[] and sync it to DeviceType
+  int params_per_line = 5;
+  char **words = new char*[params_per_line+1];  // +1 slot for the NULL that terminates the strtok loop
+
+  memory->destroy_kokkos(k_params,params);  // params is allocated via grow_kokkos below, so it must not be released with sfree
+  params = NULL;
+  nparams = maxparam = 0;
+
+  // open file on proc 0
+
+  FILE *fp = NULL;
+  if (comm->me == 0) {
+    fp = force->open_potential(file);
+    if (fp == NULL) {
+      char str[128];
+      snprintf(str,sizeof(str),"Cannot open exp6/rx potential file %s",file);  // bounded: file may be longer than the buffer
+      error->one(FLERR,str);
+    }
+  }
+
+  // read each set of params from potential file
+  // one set of params can span multiple lines
+
+  int n,nwords,ispecies;
+  char line[MAXLINE],*ptr;
+  int eof = 0;
+
+  while (1) {
+    if (comm->me == 0) {
+      ptr = fgets(line,MAXLINE,fp);
+      if (ptr == NULL) {
+        eof = 1;
+        fclose(fp);
+      } else n = strlen(line) + 1;  // n is only set on rank 0; other ranks receive it via the broadcast below
+    }
+    MPI_Bcast(&eof,1,MPI_INT,0,world);
+    if (eof) break;
+    MPI_Bcast(&n,1,MPI_INT,0,world);
+    MPI_Bcast(line,n,MPI_CHAR,0,world);
+
+    // strip comment, skip line if blank
+
+    if ((ptr = strchr(line,'#'))) *ptr = '\0';
+    nwords = atom->count_words(line);
+    if (nwords == 0) continue;
+
+    // concatenate additional lines until have params_per_line words
+
+    while (nwords < params_per_line) {
+      n = strlen(line);
+      if (comm->me == 0) {
+        ptr = fgets(&line[n],MAXLINE-n,fp);  // append the next line after the current contents
+        if (ptr == NULL) {
+          eof = 1;
+          fclose(fp);
+        } else n = strlen(line) + 1;
+      }
+      MPI_Bcast(&eof,1,MPI_INT,0,world);
+      if (eof) break;
+      MPI_Bcast(&n,1,MPI_INT,0,world);
+      MPI_Bcast(line,n,MPI_CHAR,0,world);
+      if ((ptr = strchr(line,'#'))) *ptr = '\0';
+      nwords = atom->count_words(line);
+    }
+
+    if (nwords != params_per_line)
+      error->all(FLERR,"Incorrect format in exp6/rx potential file");
+
+    // words = ptrs to all words in line
+
+    nwords = 0;
+    words[nwords++] = strtok(line," \t\n\r\f");
+    while ((words[nwords++] = strtok(NULL," \t\n\r\f"))) continue;
+
+    for (ispecies = 0; ispecies < nspecies; ispecies++)
+      if (strcmp(words[0],&atom->dname[ispecies][0]) == 0) break;
+    if (ispecies == nspecies) continue;  // first word matches no species name: ignore this entry
+
+    // load up parameter settings and error check their values
+
+    if (nparams == maxparam) {
+      maxparam += DELTA;
+      memory->grow_kokkos(k_params,params,maxparam,
+                          "pair:params");
+    }
+
+    params[nparams].ispecies = ispecies;
+
+    n = strlen(&atom->dname[ispecies][0]) + 1;
+    params[nparams].name = new char[n];
+    strcpy(params[nparams].name,&atom->dname[ispecies][0]);
+
+    n = strlen(words[1]) + 1;
+    params[nparams].potential = new char[n];
+    strcpy(params[nparams].potential,words[1]);
+    if (strcmp(params[nparams].potential,"exp6") == 0){
+      params[nparams].potentialType = exp6PotentialType;  // set before the sync below; getParamsEXP6 tests this field on d_params
+      params[nparams].alpha = atof(words[2]);
+      params[nparams].epsilon = atof(words[3]);
+      params[nparams].rm = atof(words[4]);
+      if (params[nparams].epsilon <= 0.0 || params[nparams].rm <= 0.0 ||
+          params[nparams].alpha < 0.0)
+        error->all(FLERR,"Illegal exp6/rx parameters.  Rm and Epsilon must be greater than zero.  Alpha cannot be negative.");
+    } else {
+      error->all(FLERR,"Illegal exp6/rx parameters.  Interaction potential does not exist.");
+    }
+    nparams++;
+  }
+
+  delete [] words;
+
+  k_params.template modify<LMPHostType>();  // params[] was written on the host; push it to DeviceType
+  k_params.template sync<DeviceType>();
+  d_params = k_params.template view<DeviceType>();
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairExp6rxKokkos<DeviceType>::setup()
+{  // build the species -> params index map (mol2param), sync it to DeviceType, and cache neighflag
+  int i,j,n;
+
+  // set mol2param for all combinations
+  // must be a single exact match to lines read from file
+
+  memory->destroy_kokkos(k_mol2param,mol2param);
+  memory->create_kokkos(k_mol2param,mol2param,nspecies,"pair:mol2param");
+
+  for (i = 0; i < nspecies; i++) {
+    n = -1;  // -1 means no params entry was found for species i
+    for (j = 0; j < nparams; j++) {
+      if (i == params[j].ispecies) {
+        if (n >= 0) error->all(FLERR,"Potential file has duplicate entry");
+        n = j;
+      }
+    }
+    mol2param[i] = n;
+  }
+
+  k_mol2param.template modify<LMPHostType>();  // filled on the host above; push to DeviceType
+  k_mol2param.template sync<DeviceType>();
+  d_mol2param = k_mol2param.template view<DeviceType>();
+
+  neighflag = lmp->kokkos->neighflag;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairExp6rxKokkos<DeviceType>::getParamsEXP6(int id,double &epsilon1,double &alpha1,double &rm1, double &fraction1,double &epsilon2,double &alpha2,double &rm2,double &fraction2,double &epsilon1_old,double &alpha1_old,double &rm1_old, double &fraction1_old,double &epsilon2_old,double &alpha2_old,double &rm2_old,double &fraction2_old) const
+{  // for atom id: derive exp-6 parameters and mole fractions of site1/site2 from its new (dvector rows 0..nspecies-1) and old (rows nspecies..2*nspecies-1) species counts
+  int iparam, jparam;
+  double rmi, rmj, rmij, rm3ij;
+  double epsiloni, epsilonj, epsilonij;
+  double alphai, alphaj, alphaij;
+  double epsilon_old, rm3_old, alpha_old;
+  double epsilon, rm3, alpha;
+  double fractionOFA, fractionOFA_old;
+  double nTotalOFA, nTotalOFA_old;
+  double nTotal, nTotal_old;
+  double xMolei, xMolej, xMolei_old, xMolej_old;
+
+  rm3 = 0.0;
+  epsilon = 0.0;
+  alpha = 0.0;
+  epsilon_old = 0.0;
+  rm3_old = 0.0;
+  alpha_old = 0.0;
+  fractionOFA = 0.0;
+  fractionOFA_old = 0.0;
+  nTotalOFA = 0.0;
+  nTotalOFA_old = 0.0;
+  nTotal = 0.0;
+  nTotal_old = 0.0;
+
+  // Compute the total number of molecules in the old and new CG particle as well as the total number of molecules in the fluid portion of the old and new CG particle
+  for (int ispecies = 0; ispecies < nspecies; ispecies++){
+    nTotal += dvector(ispecies,id);
+    nTotal_old += dvector(ispecies+nspecies,id);
+
+    iparam = d_mol2param[ispecies];  // DeviceType view synced in setup(), matching the d_params accesses below
+
+    if (iparam < 0 || d_params[iparam].potentialType != exp6PotentialType ) continue;
+    if (isOneFluidApprox(isite1) || isOneFluidApprox(isite2)) {
+      if (isite1 == d_params[iparam].ispecies || isite2 == d_params[iparam].ispecies) continue;  // explicit site species are excluded from the fluid totals
+      nTotalOFA_old += dvector(ispecies+nspecies,id);
+      nTotalOFA += dvector(ispecies,id);
+    }
+  }
+  if(nTotal < 1e-8 || nTotal_old < 1e-8)
+    error->all(FLERR,"The number of molecules in CG particle is less than 1e-8.");  // NOTE(review): error->all is called from inside a KOKKOS_INLINE_FUNCTION
+
+  // Compute the mole fraction of molecules within the fluid portion of the particle (One Fluid Approximation)
+  fractionOFA_old = nTotalOFA_old / nTotal_old;
+  fractionOFA = nTotalOFA / nTotal;
+
+  for (int ispecies = 0; ispecies < nspecies; ispecies++) {
+    iparam = d_mol2param[ispecies];
+    if (iparam < 0 || d_params[iparam].potentialType != exp6PotentialType ) continue;
+
+    // If Site1 matches a pure species, then grab the parameters
+    if (isite1 == d_params[iparam].ispecies){
+      rm1_old = d_params[iparam].rm;
+      rm1 = d_params[iparam].rm;
+      epsilon1_old = d_params[iparam].epsilon;
+      epsilon1 = d_params[iparam].epsilon;
+      alpha1_old = d_params[iparam].alpha;
+      alpha1 = d_params[iparam].alpha;
+
+      // Compute the mole fraction of Site1
+      fraction1_old = dvector(ispecies+nspecies,id)/nTotal_old;
+      fraction1 = dvector(ispecies,id)/nTotal;
+    }
+
+    // If Site2 matches a pure species, then grab the parameters
+    if (isite2 == d_params[iparam].ispecies){
+      rm2_old = d_params[iparam].rm;
+      rm2 = d_params[iparam].rm;
+      epsilon2_old = d_params[iparam].epsilon;
+      epsilon2 = d_params[iparam].epsilon;
+      alpha2_old = d_params[iparam].alpha;
+      alpha2 = d_params[iparam].alpha;
+
+      // Compute the mole fraction of Site2
+      fraction2_old = dvector(ispecies+nspecies,id)/nTotal_old;
+      fraction2 = dvector(ispecies,id)/nTotal;
+    }
+
+    // If Site1 or Site2 is a fluid (one-fluid approximation), then compute the parameters
+    if (isOneFluidApprox(isite1) || isOneFluidApprox(isite2)) {
+      if (isite1 == d_params[iparam].ispecies || isite2 == d_params[iparam].ispecies) continue;
+      rmi = d_params[iparam].rm;
+      epsiloni = d_params[iparam].epsilon;
+      alphai = d_params[iparam].alpha;
+      xMolei = dvector(ispecies,id)/nTotalOFA;  // only consumed below when fractionOFA > 0
+      xMolei_old = dvector(ispecies+nspecies,id)/nTotalOFA_old;  // only consumed below when fractionOFA_old > 0
+
+      for (int jspecies = 0; jspecies < nspecies; jspecies++) {
+        jparam = d_mol2param[jspecies];
+        if (jparam < 0 || d_params[jparam].potentialType != exp6PotentialType ) continue;
+        if (isite1 == d_params[jparam].ispecies || isite2 == d_params[jparam].ispecies) continue;
+        rmj = d_params[jparam].rm;
+        epsilonj = d_params[jparam].epsilon;
+        alphaj = d_params[jparam].alpha;
+        xMolej = dvector(jspecies,id)/nTotalOFA;
+        xMolej_old = dvector(jspecies+nspecies,id)/nTotalOFA_old;
+
+        rmij = (rmi+rmj)/2.0;
+        rm3ij = rmij*rmij*rmij;
+        epsilonij = sqrt(epsiloni*epsilonj);
+        alphaij = sqrt(alphai*alphaj);
+
+        if(fractionOFA_old > 0.0){  // accumulate mole-fraction-weighted sums for the old composition
+          rm3_old += xMolei_old*xMolej_old*rm3ij;
+          epsilon_old += xMolei_old*xMolej_old*rm3ij*epsilonij;
+          alpha_old += xMolei_old*xMolej_old*rm3ij*epsilonij*alphaij;
+        }
+        if(fractionOFA > 0.0){  // and for the new composition
+          rm3 += xMolei*xMolej*rm3ij;
+          epsilon += xMolei*xMolej*rm3ij*epsilonij;
+          alpha += xMolei*xMolej*rm3ij*epsilonij*alphaij;
+        }
+      }
+    }
+  }
+
+  if (isOneFluidApprox(isite1)){  // site1 is the fluid: convert the accumulated sums into its effective parameters
+    rm1 = cbrt(rm3);
+    if(rm1 < 1e-16) {  // effectively no fluid: zero the parameters instead of dividing by rm3
+      rm1 = 0.0;
+      epsilon1 = 0.0;
+      alpha1 = 0.0;
+    } else {
+      epsilon1 = epsilon / rm3;
+      alpha1 = alpha / epsilon1 / rm3;
+    }
+
+    fraction1 = fractionOFA;
+
+    rm1_old = cbrt(rm3_old);
+    if(rm1_old < 1e-16) {
+      rm1_old = 0.0;
+      epsilon1_old = 0.0;
+      alpha1_old = 0.0;
+    } else {
+      epsilon1_old = epsilon_old / rm3_old;
+      alpha1_old = alpha_old / epsilon1_old / rm3_old;
+    }
+    fraction1_old = fractionOFA_old;
+
+    // Fuchslin-Like Exp-6 Scaling
+    double powfuch = 0.0;
+    if(fuchslinEpsilon < 0.0){  // negative exponent: divide by pow(n,-exp), with the result forced to 0 when that power is below 1e-15
+      powfuch = pow(nTotalOFA,-fuchslinEpsilon);
+      if(powfuch<1e-15) epsilon1 = 0.0;
+      else epsilon1 *= 1.0/powfuch;
+
+      powfuch = pow(nTotalOFA_old,-fuchslinEpsilon);
+      if(powfuch<1e-15) epsilon1_old = 0.0;
+      else epsilon1_old *= 1.0/powfuch;
+
+    } else {
+      epsilon1 *= pow(nTotalOFA,fuchslinEpsilon);
+      epsilon1_old *= pow(nTotalOFA_old,fuchslinEpsilon);
+    }
+
+    if(fuchslinR < 0.0){
+      powfuch = pow(nTotalOFA,-fuchslinR);
+      if(powfuch<1e-15) rm1 = 0.0;
+      else rm1 *= 1.0/powfuch;
+
+      powfuch = pow(nTotalOFA_old,-fuchslinR);
+      if(powfuch<1e-15) rm1_old = 0.0;
+      else rm1_old *= 1.0/powfuch;
+
+    } else {
+      rm1 *= pow(nTotalOFA,fuchslinR);
+      rm1_old *= pow(nTotalOFA_old,fuchslinR);
+    }
+  }
+
+  if (isOneFluidApprox(isite2)){  // same treatment when site2 is the fluid
+    rm2 = cbrt(rm3);
+    if(rm2 < 1e-16) {
+      rm2 = 0.0;
+      epsilon2 = 0.0;
+      alpha2 = 0.0;
+    } else {
+      epsilon2 = epsilon / rm3;
+      alpha2 = alpha / epsilon2 / rm3;
+    }
+    fraction2 = fractionOFA;
+
+    rm2_old = cbrt(rm3_old);
+    if(rm2_old < 1e-16) {
+      rm2_old = 0.0;
+      epsilon2_old = 0.0;
+      alpha2_old = 0.0;
+    } else {
+      epsilon2_old = epsilon_old / rm3_old;
+      alpha2_old = alpha_old / epsilon2_old / rm3_old;
+    }
+    fraction2_old = fractionOFA_old;
+
+    // Fuchslin-Like Exp-6 Scaling
+    double powfuch = 0.0;
+    if(fuchslinEpsilon < 0.0){
+      powfuch = pow(nTotalOFA,-fuchslinEpsilon);
+      if(powfuch<1e-15) epsilon2 = 0.0;
+      else epsilon2 *= 1.0/powfuch;
+
+      powfuch = pow(nTotalOFA_old,-fuchslinEpsilon);
+      if(powfuch<1e-15) epsilon2_old = 0.0;
+      else epsilon2_old *= 1.0/powfuch;
+
+    } else {
+      epsilon2 *= pow(nTotalOFA,fuchslinEpsilon);
+      epsilon2_old *= pow(nTotalOFA_old,fuchslinEpsilon);
+    }
+
+    if(fuchslinR < 0.0){
+      powfuch = pow(nTotalOFA,-fuchslinR);
+      if(powfuch<1e-15) rm2 = 0.0;
+      else rm2 *= 1.0/powfuch;
+
+      powfuch = pow(nTotalOFA_old,-fuchslinR);
+      if(powfuch<1e-15) rm2_old = 0.0;
+      else rm2_old *= 1.0/powfuch;
+
+    } else {
+      rm2 *= pow(nTotalOFA,fuchslinR);
+      rm2_old *= pow(nTotalOFA_old,fuchslinR);
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+double PairExp6rxKokkos<DeviceType>::func_rin(const double &alpha) const
+{
+  // Evaluates expValue(c0 + c1*sqrt(alpha)) with constant coefficients c0, c1.
+  const double c0 = 3.7682065;
+  const double c1 = -1.4308614;
+
+  // expValue() yields 0.0 for arguments below DBL_MIN_EXP and exp() of the
+  // argument otherwise.
+  const double exponent = c0+c1*sqrt(alpha);
+
+  return expValue(exponent);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+double PairExp6rxKokkos<DeviceType>::expValue(double value) const
+{
+  // Arguments below DBL_MIN_EXP produce an exact 0.0 and are never
+  // passed to exp(); every other argument is forwarded to exp().
+  if (value < DBL_MIN_EXP) return 0.0;
+
+  return exp(value);
+}
+
+/* ---------------------------------------------------------------------- */
+// Tally one i-j pair: half of epair per atom into k_eatom, virial terms into ev.v (global) and k_vatom (per-atom).
+template<class DeviceType>
+template<int NEIGHFLAG, int NEWTON_PAIR>
+KOKKOS_INLINE_FUNCTION
+void PairExp6rxKokkos<DeviceType>::ev_tally(EV_FLOAT &ev, const int &i, const int &j,
+      const F_FLOAT &epair, const F_FLOAT &fpair, const F_FLOAT &delx,
+                const F_FLOAT &dely, const F_FLOAT &delz) const
+{
+  const int EFLAG = eflag;
+  const int VFLAG = vflag_either;
+
+  // The eatom and vatom arrays are atomic for Half/Thread neighbor style
+  Kokkos::View<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > v_eatom = k_eatom.view<DeviceType>();
+  Kokkos::View<F_FLOAT*[6], typename DAT::t_virial_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > v_vatom = k_vatom.view<DeviceType>();
+
+  if (EFLAG) {  // only per-atom energy is handled here; epair is never added to ev in this function
+    if (eflag_atom) {
+      const E_FLOAT epairhalf = 0.5 * epair;  // each atom of the pair is credited half the pair energy
+      if (NEIGHFLAG!=FULL) {  // non-FULL list: one call may credit both i and j
+        if (NEWTON_PAIR || i < nlocal) v_eatom[i] += epairhalf;
+        if (NEWTON_PAIR || j < nlocal) v_eatom[j] += epairhalf;
+      } else {  // FULL list: only atom i is credited by this call
+        v_eatom[i] += epairhalf;
+      }
+    }
+  }
+
+  if (VFLAG) {
+    const E_FLOAT v0 = delx*delx*fpair;  // xx
+    const E_FLOAT v1 = dely*dely*fpair;  // yy
+    const E_FLOAT v2 = delz*delz*fpair;  // zz
+    const E_FLOAT v3 = delx*dely*fpair;  // xy
+    const E_FLOAT v4 = delx*delz*fpair;  // xz
+    const E_FLOAT v5 = dely*delz*fpair;  // yz
+
+    if (vflag_global) {  // accumulate into the caller-supplied ev.v[0..5]
+      if (NEIGHFLAG!=FULL) {
+        if (NEWTON_PAIR || i < nlocal) {  // i's half; skipped only when NEWTON_PAIR is 0 and i >= nlocal
+          ev.v[0] += 0.5*v0;
+          ev.v[1] += 0.5*v1;
+          ev.v[2] += 0.5*v2;
+          ev.v[3] += 0.5*v3;
+          ev.v[4] += 0.5*v4;
+          ev.v[5] += 0.5*v5;
+        }
+        if (NEWTON_PAIR || j < nlocal) {  // j's half; skipped only when NEWTON_PAIR is 0 and j >= nlocal
+        ev.v[0] += 0.5*v0;
+        ev.v[1] += 0.5*v1;
+        ev.v[2] += 0.5*v2;
+        ev.v[3] += 0.5*v3;
+        ev.v[4] += 0.5*v4;
+        ev.v[5] += 0.5*v5;
+        }
+      } else {  // FULL list: a single half contribution per call
+        ev.v[0] += 0.5*v0;
+        ev.v[1] += 0.5*v1;
+        ev.v[2] += 0.5*v2;
+        ev.v[3] += 0.5*v3;
+        ev.v[4] += 0.5*v4;
+        ev.v[5] += 0.5*v5;
+      }
+    }
+
+    if (vflag_atom) {  // same splitting rules as above, written to the per-atom view v_vatom
+      if (NEIGHFLAG!=FULL) {
+        if (NEWTON_PAIR || i < nlocal) {
+          v_vatom(i,0) += 0.5*v0;
+          v_vatom(i,1) += 0.5*v1;
+          v_vatom(i,2) += 0.5*v2;
+          v_vatom(i,3) += 0.5*v3;
+          v_vatom(i,4) += 0.5*v4;
+          v_vatom(i,5) += 0.5*v5;
+        }
+        if (NEWTON_PAIR || j < nlocal) {
+        v_vatom(j,0) += 0.5*v0;
+        v_vatom(j,1) += 0.5*v1;
+        v_vatom(j,2) += 0.5*v2;
+        v_vatom(j,3) += 0.5*v3;
+        v_vatom(j,4) += 0.5*v4;
+        v_vatom(j,5) += 0.5*v5;
+        }
+      } else {  // FULL list: only row i is updated
+        v_vatom(i,0) += 0.5*v0;
+        v_vatom(i,1) += 0.5*v1;
+        v_vatom(i,2) += 0.5*v2;
+        v_vatom(i,3) += 0.5*v3;
+        v_vatom(i,4) += 0.5*v4;
+        v_vatom(i,5) += 0.5*v5;
+      }
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+int PairExp6rxKokkos<DeviceType>::sbmask(const int& j) const {
+  return (j >> SBBITS) & 3;  // shift right by SBBITS, then keep the two lowest bits
+}
+
+namespace LAMMPS_NS {
+template class PairExp6rxKokkos<LMPDeviceType>;  // explicit instantiation for LMPDeviceType
+#ifdef KOKKOS_HAVE_CUDA
+template class PairExp6rxKokkos<LMPHostType>;  // extra host instantiation; presumably LMPHostType equals LMPDeviceType without CUDA -- TODO confirm
+#endif
+}
\ No newline at end of file
diff --git a/src/KOKKOS/pair_exp6_rx_kokkos.h b/src/KOKKOS/pair_exp6_rx_kokkos.h
new file mode 100644
index 0000000000..4ff055123c
--- /dev/null
+++ b/src/KOKKOS/pair_exp6_rx_kokkos.h
@@ -0,0 +1,204 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(exp6/rx/kk,PairExp6rxKokkos<LMPDeviceType>)
+PairStyle(exp6/rx/kk/device,PairExp6rxKokkos<LMPDeviceType>)
+PairStyle(exp6/rx/kk/host,PairExp6rxKokkos<LMPHostType>)
+
+#else
+
+#ifndef LMP_PAIR_EXP6_RX_KOKKOS_H
+#define LMP_PAIR_EXP6_RX_KOKKOS_H
+
+#include "pair_exp6_rx.h"
+#include "kokkos_type.h"
+#include "pair_kokkos.h"
+
+namespace LAMMPS_NS {
+
+// Structure holding the exp-6 parameter data for all local and
+// neighbor particles. The arrays are packed inside this struct
+// to avoid any name clashes.
+
+template<class DeviceType>
+struct PairExp6ParamDataTypeKokkos
+{
+  typedef ArrayTypes<DeviceType> AT;
+
+   int n;  // set to 0 by the default constructor
+   typename AT::t_float_1d epsilon1, alpha1, rm1, fraction1,
+          epsilon2, alpha2, rm2, fraction2,
+          epsilonOld1, alphaOld1, rmOld1, fractionOld1,
+          epsilonOld2, alphaOld2, rmOld2, fractionOld2;
+
+   // Default constructor: n = 0, all views default-constructed. Declared without
+   // a template-id, which C++20 no longer permits on a constructor (CWG 2237).
+   PairExp6ParamDataTypeKokkos()
+      : n(0) {}
+};
+
+struct TagPairExp6rxgetParamsEXP6{};  // empty tag type; first parameter of the PairExp6rxKokkos::operator() overload for getParamsEXP6
+
+template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
+struct TagPairExp6rxCompute{};  // empty tag type; first parameter of the two templated PairExp6rxKokkos::operator() overloads
+
+template<class DeviceType>
+class PairExp6rxKokkos : public PairExp6rx {  // DeviceType-templated Kokkos variant derived from PairExp6rx
+ public:
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;
+  typedef EV_FLOAT value_type;  // same type as the accumulator argument of the three-argument operator()
+
+  PairExp6rxKokkos(class LAMMPS *);
+  virtual ~PairExp6rxKokkos();
+  virtual void compute(int, int);
+
+  template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPairExp6rxCompute<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int&, EV_FLOAT&) const;  // overload with an EV_FLOAT accumulator
+
+  template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPairExp6rxCompute<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int&) const;  // overload without an accumulator
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPairExp6rxgetParamsEXP6, const int&) const;
+
+  // Tallies per-atom energy and global/per-atom virial for one i-j pair.
+  template<int NEIGHFLAG, int NEWTON_PAIR>
+  KOKKOS_INLINE_FUNCTION
+  void ev_tally(EV_FLOAT &ev, const int &i, const int &j,
+      const F_FLOAT &epair, const F_FLOAT &fpair, const F_FLOAT &delx,
+                  const F_FLOAT &dely, const F_FLOAT &delz) const;
+
+  KOKKOS_INLINE_FUNCTION
+  int sbmask(const int& j) const;  // returns (j >> SBBITS) & 3
+
+ protected:
+  int eflag,vflag;
+  int nlocal,newton_pair,neighflag;
+  double special_coul[4];
+  double special_lj[4];
+
+  typename AT::t_x_array_randomread x;
+  typename AT::t_f_array f;
+  typename AT::t_int_1d_randomread type;
+  typename AT::t_efloat_1d uCG, uCGnew;
+  typename AT::t_float_2d dvector;
+
+  DAT::tdual_efloat_1d k_eatom;  // DualView; ev_tally() accesses it through view<DeviceType>()
+  DAT::tdual_virial_array k_vatom;  // DualView; ev_tally() accesses it through view<DeviceType>()
+  DAT::t_efloat_1d d_eatom;
+  DAT::t_virial_array d_vatom;
+
+  typename AT::t_neighbors_2d d_neighbors;
+  typename AT::t_int_1d_randomread d_ilist;
+  typename AT::t_int_1d_randomread d_numneigh;
+
+  PairExp6ParamDataTypeKokkos<DeviceType> PairExp6ParamData;
+
+  void allocate();
+  DAT::tdual_int_1d k_mol2param;               // mapping from molecule to parameters
+  typename AT::t_int_1d_randomread d_mol2param;
+
+  typedef Kokkos::DualView<Param*,Kokkos::LayoutRight,DeviceType> tdual_param_1d;
+  typedef typename tdual_param_1d::t_dev_const_randomread t_param_1d_randomread;
+
+  tdual_param_1d k_params;                // parameter set for an I-J-K interaction
+  t_param_1d_randomread d_params;                // declared with the t_dev_const_randomread view type of tdual_param_1d
+
+  typename ArrayTypes<DeviceType>::tdual_ffloat_2d k_cutsq;
+  typename ArrayTypes<DeviceType>::t_ffloat_2d d_cutsq;
+
+  void read_file(char *);
+  void setup();
+
+  KOKKOS_INLINE_FUNCTION
+  void getParamsEXP6(int, double &, double &, double &, double &, double &, double &, double &, double &, double &, double &, double &, double &, double &, double &, double &, double &) const;
+
+  KOKKOS_INLINE_FUNCTION
+  double func_rin(const double &) const;  // expValue(a + b*sqrt(alpha)) with fixed constants a, b
+
+  KOKKOS_INLINE_FUNCTION
+  double expValue(const double) const;  // 0.0 if the argument is below DBL_MIN_EXP, otherwise exp(argument)
+
+  friend void pair_virial_fdotr_compute<PairExp6rxKokkos>(PairExp6rxKokkos*);
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E:  alpha_ij is 6.0 in pair exp6
+
+Self-explanatory
+
+E: Illegal ... command
+
+Self-explanatory.  Check the input script syntax and compare to the
+documentation for the command.  You can use -echo screen as a
+command-line option when running LAMMPS to see the offending line.
+
+E: Incorrect args for pair coefficients
+
+Self-explanatory.  Check the input script or data file.
+
+E: PairExp6rxKokkos requires a fix rx command
+
+The fix rx command must come before the pair style command in the input file
+
+E:  There are no rx species specified
+
+There must be at least one species specified through the fix rx command
+
+E:  Site1 name not recognized in pair coefficients
+
+The site1 keyword does not match the species keywords specified through the fix rx command.
+
+E: All pair coeffs are not set
+
+All pair coefficients must be set in the data file or by the
+pair_coeff command before running a simulation.
+
+E:  Cannot open exp6/rx potential file %s
+
+Self-explanatory
+
+E:  Incorrect format in exp6/rx potential file
+
+Self-explanatory
+
+E:  Illegal exp6/rx parameters.  Rm and Epsilon must be greater than zero.  Alpha cannot be negative.
+
+Self-explanatory
+
+E:  Illegal exp6/rx parameters.  Interaction potential does not exist.
+
+Self-explanatory
+
+E:  Potential file has duplicate entry.
+
+Self-explanatory
+
+E:  The number of molecules in CG particle is less than 1e-8.
+
+Self-explanatory.  Check that the species concentrations have been properly
+set and check that the reaction kinetic solver parameters in fix rx provide
+sufficient accuracy.
+
+
+*/
diff --git a/src/USER-DPD/pair_exp6_rx.cpp b/src/USER-DPD/pair_exp6_rx.cpp
index 9af28026ae..2643c9ec04 100644
--- a/src/USER-DPD/pair_exp6_rx.cpp
+++ b/src/USER-DPD/pair_exp6_rx.cpp
@@ -77,6 +77,8 @@ PairExp6rx::PairExp6rx(LAMMPS *lmp) : Pair(lmp)
 
 PairExp6rx::~PairExp6rx()
 {
+  if (copymode) return;
+
   for (int i=0; i < nparams; ++i) {
     delete[] params[i].name;
     delete[] params[i].potential;
diff --git a/src/atom.h b/src/atom.h
index 9abbb49569..de7cda06ac 100644
--- a/src/atom.h
+++ b/src/atom.h
@@ -255,8 +255,8 @@ class Atom : protected Pointers {
   void update_callback(int);
 
   int find_custom(const char *, int &);
-  int add_custom(const char *, int);
-  void remove_custom(int, int);
+  virtual int add_custom(const char *, int);
+  virtual void remove_custom(int, int);
 
   virtual void sync_modify(ExecutionSpace, unsigned int, unsigned int) {}
 

From 6d94439cfe6f9f4cfd00a04b7e45d89693ffac46 Mon Sep 17 00:00:00 2001
From: Stan Moore <stanmoore1@gmail.com>
Date: Mon, 28 Nov 2016 14:42:47 -0700
Subject: [PATCH 005/267] Integrating pair_dpd_fdt_energy_kokkos files

---
 src/KOKKOS/Install.sh                                   | 2 ++
 src/{USER-DPD => KOKKOS}/pair_dpd_fdt_energy_kokkos.cpp | 0
 src/{USER-DPD => KOKKOS}/pair_dpd_fdt_energy_kokkos.h   | 0
 src/KOKKOS/pair_exp6_rx_kokkos.cpp                      | 4 ++++
 4 files changed, 6 insertions(+)
 rename src/{USER-DPD => KOKKOS}/pair_dpd_fdt_energy_kokkos.cpp (100%)
 rename src/{USER-DPD => KOKKOS}/pair_dpd_fdt_energy_kokkos.h (100%)

diff --git a/src/KOKKOS/Install.sh b/src/KOKKOS/Install.sh
index 14a8a951ee..7e46b52c2b 100644
--- a/src/KOKKOS/Install.sh
+++ b/src/KOKKOS/Install.sh
@@ -128,6 +128,8 @@ action pair_coul_long_kokkos.cpp pair_coul_long.cpp
 action pair_coul_long_kokkos.h pair_coul_long.h
 action pair_coul_wolf_kokkos.cpp
 action pair_coul_wolf_kokkos.h
+action pair_dpd_fdt_energy_kokkos.cpp pair_dpd_fdt_energy.cpp
+action pair_dpd_fdt_energy_kokkos.h pair_dpd_fdt_energy.h
 action pair_eam_kokkos.cpp pair_eam.cpp
 action pair_eam_kokkos.h pair_eam.h
 action pair_eam_alloy_kokkos.cpp pair_eam_alloy.cpp
diff --git a/src/USER-DPD/pair_dpd_fdt_energy_kokkos.cpp b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp
similarity index 100%
rename from src/USER-DPD/pair_dpd_fdt_energy_kokkos.cpp
rename to src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp
diff --git a/src/USER-DPD/pair_dpd_fdt_energy_kokkos.h b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.h
similarity index 100%
rename from src/USER-DPD/pair_dpd_fdt_energy_kokkos.h
rename to src/KOKKOS/pair_dpd_fdt_energy_kokkos.h
diff --git a/src/KOKKOS/pair_exp6_rx_kokkos.cpp b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
index aa37c8375d..754fa4667d 100644
--- a/src/KOKKOS/pair_exp6_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
@@ -11,6 +11,10 @@
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
+/* ----------------------------------------------------------------------
+   Contributing author: Stan Moore (Sandia)
+------------------------------------------------------------------------- */
+
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>

From 6e6776f39635b1b69dab532bced2b0d95f150d62 Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Wed, 30 Nov 2016 16:25:07 -0500
Subject: [PATCH 006/267] Finish moving/integrating atom_vec_dpd_kokkos into
 the Kokkos package

---
 src/KOKKOS/Install.sh                |    2 +
 src/USER-DPD/atom_vec_dpd_kokkos.cpp | 1874 --------------------------
 src/USER-DPD/atom_vec_dpd_kokkos.h   |  135 --
 3 files changed, 2 insertions(+), 2009 deletions(-)
 delete mode 100644 src/USER-DPD/atom_vec_dpd_kokkos.cpp
 delete mode 100644 src/USER-DPD/atom_vec_dpd_kokkos.h

diff --git a/src/KOKKOS/Install.sh b/src/KOKKOS/Install.sh
index 7e46b52c2b..1381a1978c 100644
--- a/src/KOKKOS/Install.sh
+++ b/src/KOKKOS/Install.sh
@@ -47,6 +47,8 @@ action atom_vec_bond_kokkos.cpp atom_vec_bond.cpp
 action atom_vec_bond_kokkos.h atom_vec_bond.h
 action atom_vec_charge_kokkos.cpp
 action atom_vec_charge_kokkos.h
+action atom_vec_dpd_kokkos.cpp atom_vec_dpd.cpp
+action atom_vec_dpd_kokkos.h atom_vec_dpd.h
 action atom_vec_full_kokkos.cpp atom_vec_full.cpp
 action atom_vec_full_kokkos.h atom_vec_full.h
 action atom_vec_kokkos.cpp
diff --git a/src/USER-DPD/atom_vec_dpd_kokkos.cpp b/src/USER-DPD/atom_vec_dpd_kokkos.cpp
deleted file mode 100644
index c79559172f..0000000000
--- a/src/USER-DPD/atom_vec_dpd_kokkos.cpp
+++ /dev/null
@@ -1,1874 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale AtomicKokkos/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-#include <stdlib.h>
-#include "atom_vec_dpd_kokkos.h"
-#include "atom_kokkos.h"
-#include "comm_kokkos.h"
-#include "domain.h"
-#include "modify.h"
-#include "fix.h"
-#include "atom_masks.h"
-#include "memory.h"
-#include "error.h"
-
-using namespace LAMMPS_NS;
-
-#define DELTA 10000
-
-/* ---------------------------------------------------------------------- */
-
-AtomVecDPDKokkos::AtomVecDPDKokkos(LAMMPS *lmp) : AtomVecKokkos(lmp)
-{
-  molecular = 0;
-  mass_type = 1;
-
-  comm_x_only = comm_f_only = 0;
-  size_forward = 7;
-  size_reverse = 3;
-  size_border = 12;
-  size_velocity = 3;
-  size_data_atom = 6;
-  size_data_vel = 4;
-  xcol_data = 4;
-
-  atom->rho_flag = 1;
-  atom->dpd_flag = 1;
-
-  k_count = DAT::tdual_int_1d("atom::k_count",1);
-  atomKK = (AtomKokkos *) atom;
-  commKK = (CommKokkos *) comm;
-}
-
-/* ----------------------------------------------------------------------
-   grow atom arrays
-   n = 0 grows arrays by DELTA
-   n > 0 allocates arrays to size n
-------------------------------------------------------------------------- */
-
-void AtomVecDPDKokkos::grow(int n)
-{
-  if (n == 0) nmax += DELTA;
-  else nmax = n;
-  atomKK->nmax = nmax;
-  if (nmax < 0 || nmax > MAXSMALLINT)
-    error->one(FLERR,"Per-processor system is too big");
-
-  sync(Device,ALL_MASK);
-  modified(Device,ALL_MASK);
-
-  memory->grow_kokkos(atomKK->k_tag,atomKK->tag,nmax,"atom:tag");
-  memory->grow_kokkos(atomKK->k_type,atomKK->type,nmax,"atom:type");
-  memory->grow_kokkos(atomKK->k_mask,atomKK->mask,nmax,"atom:mask");
-  memory->grow_kokkos(atomKK->k_image,atomKK->image,nmax,"atom:image");
-
-  memory->grow_kokkos(atomKK->k_x,atomKK->x,nmax,3,"atom:x");
-  memory->grow_kokkos(atomKK->k_v,atomKK->v,nmax,3,"atom:v");
-  memory->grow_kokkos(atomKK->k_f,atomKK->f,nmax,3,"atom:f");
-
-
-  memory->grow_kokkos(atomKK->k_rho,atomKK->rho,nmax,"atom:rho");
-  memory->grow_kokkos(atomKK->k_dpdTheta,atomKK->dpdTheta,nmax,"atom:dpdTheta");
-  memory->grow_kokkos(atomKK->k_uCond,atomKK->uCond,nmax,"atom:uCond");
-  memory->grow_kokkos(atomKK->k_uMech,atomKK->uMech,nmax,"atom:uMech");
-  memory->grow_kokkos(atomKK->k_uChem,atomKK->uChem,nmax,"atom:uChem");
-  memory->grow_kokkos(atomKK->k_uCG,atomKK->uCG,nmax,"atom:uCG");
-  memory->grow_kokkos(atomKK->k_uCGnew,atomKK->uCGnew,nmax,"atom:uCGnew");
-  memory->grow_kokkos(atomKK->k_duChem,atomKK->duChem,nmax,"atom:duChem");
-
-  grow_reset();
-  sync(Host,ALL_MASK);
-
-  if (atom->nextra_grow)
-    for (int iextra = 0; iextra < atom->nextra_grow; iextra++)
-      modify->fix[atom->extra_grow[iextra]]->grow_arrays(nmax);
-}
-
-/* ----------------------------------------------------------------------
-   reset local array ptrs
-------------------------------------------------------------------------- */
-
-void AtomVecDPDKokkos::grow_reset()
-{
-  tag = atomKK->tag;
-  d_tag = atomKK->k_tag.d_view;
-  h_tag = atomKK->k_tag.h_view;
-
-  type = atomKK->type;
-  d_type = atomKK->k_type.d_view;
-  h_type = atomKK->k_type.h_view;
-  mask = atomKK->mask;
-  d_mask = atomKK->k_mask.d_view;
-  h_mask = atomKK->k_mask.h_view;
-  image = atomKK->image;
-  d_image = atomKK->k_image.d_view;
-  h_image = atomKK->k_image.h_view;
-
-  x = atomKK->x;
-  d_x = atomKK->k_x.d_view;
-  h_x = atomKK->k_x.h_view;
-  v = atomKK->v;
-  d_v = atomKK->k_v.d_view;
-  h_v = atomKK->k_v.h_view;
-  f = atomKK->f;
-  d_f = atomKK->k_f.d_view;
-  h_f = atomKK->k_f.h_view;
-
-  rho = atomKK->rho;
-  d_rho = atomKK->k_rho.d_view;
-  h_rho = atomKK->k_rho.h_view;
-  dpdTheta = atomKK->dpdTheta;
-  d_dpdTheta = atomKK->k_dpdTheta.d_view;
-  h_dpdTheta = atomKK->k_dpdTheta.h_view;
-  uCond = atomKK->uCond;
-  d_uCond = atomKK->k_uCond.d_view;;
-  h_uCond = atomKK->k_uCond.h_view;
-  uMech = atomKK->uMech;
-  d_uMech = atomKK->k_uMech.d_view;;
-  h_uMech = atomKK->k_uMech.h_view;
-  uChem = atomKK->uChem;
-  d_uChem = atomKK->k_uChem.d_view;;
-  h_uChem = atomKK->k_uChem.h_view;
-  uCG = atomKK->uCG;
-  d_uCG = atomKK->k_uCG.d_view;;
-  h_uCG = atomKK->k_uCG.h_view;
-  uCGnew = atomKK->uCGnew;
-  d_uCGnew = atomKK->k_uCGnew.d_view;;
-  h_uCGnew = atomKK->k_uCGnew.h_view;
-  duChem = atomKK->duChem;
-  d_duChem = atomKK->k_duChem.d_view;;
-  h_duChem = atomKK->k_duChem.h_view;
-}
-
-/* ----------------------------------------------------------------------
-   copy atom I info to atom J
-------------------------------------------------------------------------- */
-
-void AtomVecDPDKokkos::copy(int i, int j, int delflag)
-{
-  h_tag[j] = h_tag[i];
-  h_type[j] = h_type[i];
-  mask[j] = mask[i];
-  h_image[j] = h_image[i];
-  h_x(j,0) = h_x(i,0);
-  h_x(j,1) = h_x(i,1);
-  h_x(j,2) = h_x(i,2);
-  h_v(j,0) = h_v(i,0);
-  h_v(j,1) = h_v(i,1);
-  h_v(j,2) = h_v(i,2);
-  h_dpdTheta[j] = h_dpdTheta[i];
-  h_uCond[j] = h_uCond[i];
-  h_uMech[j] = h_uMech[i];
-  h_uChem[j] = h_uChem[i];
-  h_uCG[j] = h_uCG[i];
-  h_uCGnew[j] = h_uCGnew[i];
-
-  if (atom->nextra_grow)
-    for (int iextra = 0; iextra < atom->nextra_grow; iextra++)
-      modify->fix[atom->extra_grow[iextra]]->copy_arrays(i,j,delflag);
-}
-
-/* ---------------------------------------------------------------------- */
-
-template<class DeviceType,int PBC_FLAG,int TRICLINIC>
-struct AtomVecDPDKokkos_PackComm {
-  typedef DeviceType device_type;
-
-  typename ArrayTypes<DeviceType>::t_x_array_randomread _x;
-  typename ArrayTypes<DeviceType>::t_efloat_1d _dpdTheta,_uCond,_uMech,_uChem;
-  typename ArrayTypes<DeviceType>::t_xfloat_2d_um _buf;
-  typename ArrayTypes<DeviceType>::t_int_2d_const _list;
-  const int _iswap;
-  X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz;
-  X_FLOAT _pbc[6];
-
-  AtomVecDPDKokkos_PackComm(
-      const typename DAT::tdual_x_array &x,
-      const typename DAT::tdual_efloat_1d &dpdTheta,
-      const typename DAT::tdual_efloat_1d &uCond,
-      const typename DAT::tdual_efloat_1d &uMech,
-      const typename DAT::tdual_efloat_1d &uChem,
-      const typename DAT::tdual_xfloat_2d &buf,
-      const typename DAT::tdual_int_2d &list,
-      const int & iswap,
-      const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd,
-      const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc):
-      _x(x.view<DeviceType>()),
-      _dpdTheta(dpdTheta.view<DeviceType>()),
-      _uCond(uCond.view<DeviceType>()),
-      _uMech(uMech.view<DeviceType>()),
-      _uChem(uChem.view<DeviceType>()),
-      _list(list.view<DeviceType>()),_iswap(iswap),
-      _xprd(xprd),_yprd(yprd),_zprd(zprd),
-      _xy(xy),_xz(xz),_yz(yz) {
-        const size_t maxsend = (buf.view<DeviceType>().dimension_0()*buf.view<DeviceType>().dimension_1())/3;
-        const size_t elements = 3;
-        buffer_view<DeviceType>(_buf,buf,maxsend,elements);
-        _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2];
-        _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5];
-  };
-
-  KOKKOS_INLINE_FUNCTION
-  void operator() (const int& i) const {
-      const int j = _list(_iswap,i);
-      if (PBC_FLAG == 0) {
-          _buf(i,0) = _x(j,0);
-          _buf(i,1) = _x(j,1);
-          _buf(i,2) = _x(j,2);
-      } else {
-        if (TRICLINIC == 0) {
-          _buf(i,0) = _x(j,0) + _pbc[0]*_xprd;
-          _buf(i,1) = _x(j,1) + _pbc[1]*_yprd;
-          _buf(i,2) = _x(j,2) + _pbc[2]*_zprd;
-        } else {
-          _buf(i,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz;
-          _buf(i,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz;
-          _buf(i,2) = _x(j,2) + _pbc[2]*_zprd;
-        }
-      }
-      _buf(i,3) = _dpdTheta(j);
-      _buf(i,4) = _uCond(j);
-      _buf(i,5) = _uMech(j);
-      _buf(i,6) = _uChem(j);
-  }
-};
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecDPDKokkos::pack_comm_kokkos(const int &n,
-                                          const DAT::tdual_int_2d &list,
-                                          const int & iswap,
-                                          const DAT::tdual_xfloat_2d &buf,
-                                          const int &pbc_flag,
-                                          const int* const pbc)
-{
-  // Check whether to always run forward communication on the host
-  // Choose correct forward PackComm kernel
-
-  if(commKK->forward_comm_on_host) {
-    sync(Host,X_MASK);
-    if(pbc_flag) {
-      if(domain->triclinic) {
-        struct AtomVecDPDKokkos_PackComm<LMPHostType,1,1> f(atomKK->k_x,
-          atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,
-          buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      } else {
-        struct AtomVecDPDKokkos_PackComm<LMPHostType,1,0> f(atomKK->k_x,
-          atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,
-          buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      }
-    } else {
-      if(domain->triclinic) {
-        struct AtomVecDPDKokkos_PackComm<LMPHostType,0,1> f(atomKK->k_x,
-          atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,
-          buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      } else {
-        struct AtomVecDPDKokkos_PackComm<LMPHostType,0,0> f(atomKK->k_x,
-          atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,
-          buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      }
-    }
-    LMPHostType::fence();
-  } else {
-    sync(Device,X_MASK);
-    if(pbc_flag) {
-      if(domain->triclinic) {
-        struct AtomVecDPDKokkos_PackComm<LMPDeviceType,1,1> f(atomKK->k_x,
-          atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,
-          buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      } else {
-        struct AtomVecDPDKokkos_PackComm<LMPDeviceType,1,0> f(atomKK->k_x,
-          atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,
-          buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      }
-    } else {
-      if(domain->triclinic) {
-        struct AtomVecDPDKokkos_PackComm<LMPDeviceType,0,1> f(atomKK->k_x,
-          atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,
-          buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      } else {
-        struct AtomVecDPDKokkos_PackComm<LMPDeviceType,0,0> f(atomKK->k_x,
-          atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,
-          buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      }
-    }
-    LMPDeviceType::fence();
-  }
-
-	return n*size_forward;
-}
-
-/* ---------------------------------------------------------------------- */
-
-template<class DeviceType,int PBC_FLAG,int TRICLINIC>
-struct AtomVecDPDKokkos_PackCommSelf {
-  typedef DeviceType device_type;
-
-  typename ArrayTypes<DeviceType>::t_x_array_randomread _x;
-  typename ArrayTypes<DeviceType>::t_x_array _xw;
-  typename ArrayTypes<DeviceType>::t_efloat_1d _dpdTheta,_uCond,_uMech,_uChem;
-  int _nfirst;
-  typename ArrayTypes<DeviceType>::t_int_2d_const _list;
-  const int _iswap;
-  X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz;
-  X_FLOAT _pbc[6];
-
-  AtomVecDPDKokkos_PackCommSelf(
-      const typename DAT::tdual_x_array &x,
-      const typename DAT::tdual_efloat_1d &dpdTheta,
-      const typename DAT::tdual_efloat_1d &uCond,
-      const typename DAT::tdual_efloat_1d &uMech,
-      const typename DAT::tdual_efloat_1d &uChem,
-      const int &nfirst,
-      const typename DAT::tdual_int_2d &list,
-      const int & iswap,
-      const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd,
-      const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc):
-      _x(x.view<DeviceType>()),_xw(x.view<DeviceType>()),
-      _dpdTheta(dpdTheta.view<DeviceType>()),
-      _uCond(uCond.view<DeviceType>()),
-      _uMech(uMech.view<DeviceType>()),
-      _uChem(uChem.view<DeviceType>()),      
-      _nfirst(nfirst),_list(list.view<DeviceType>()),_iswap(iswap),
-      _xprd(xprd),_yprd(yprd),_zprd(zprd),
-      _xy(xy),_xz(xz),_yz(yz) {
-        _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2];
-        _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5];
-  };
-
-  KOKKOS_INLINE_FUNCTION
-  void operator() (const int& i) const {
-        const int j = _list(_iswap,i);
-      if (PBC_FLAG == 0) {
-          _xw(i+_nfirst,0) = _x(j,0);
-          _xw(i+_nfirst,1) = _x(j,1);
-          _xw(i+_nfirst,2) = _x(j,2);
-      } else {
-        if (TRICLINIC == 0) {
-          _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd;
-          _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd;
-          _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd;
-        } else {
-          _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz;
-          _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz;
-          _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd;
-        }
-      }
-      _dpdTheta(i+_nfirst) = _dpdTheta(j);
-      _uCond(i+_nfirst) = _uCond(j);
-      _uMech(i+_nfirst) = _uMech(j);
-      _uChem(i+_nfirst) = _uChem(j); 
-  }
-};
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecDPDKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &list, const int & iswap, // self-exchange: launches AtomVecDPDKokkos_PackCommSelf over n list entries on host or device
-										const int nfirst, const int &pbc_flag, const int* const pbc) { // nfirst, list, iswap, pbc and the domain box data are forwarded to the functor
-  if(commKK->forward_comm_on_host) { // host execution path
-    sync(Host,X_MASK); // NOTE(review): only X_MASK is synced/flagged although the functor is also handed the DPD views - confirm this is sufficient
-    modified(Host,X_MASK);
-    if(pbc_flag) { // the runtime flags select the <space,PBC_FLAG,TRICLINIC> instantiation
-      if(domain->triclinic) {
-      struct AtomVecDPDKokkos_PackCommSelf<LMPHostType,1,1> f(atomKK->k_x,
-          atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,
-          nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      } else {
-      struct AtomVecDPDKokkos_PackCommSelf<LMPHostType,1,0> f(atomKK->k_x,
-          atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,
-          nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      }
-    } else {
-      if(domain->triclinic) {
-      struct AtomVecDPDKokkos_PackCommSelf<LMPHostType,0,1> f(atomKK->k_x,
-          atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,
-          nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      } else {
-      struct AtomVecDPDKokkos_PackCommSelf<LMPHostType,0,0> f(atomKK->k_x,
-          atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,
-          nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      }
-    }
-    LMPHostType::fence(); // fence the host execution space after the launch
-  } else { // device execution path: same four-way dispatch with LMPDeviceType
-    sync(Device,X_MASK);
-    modified(Device,X_MASK);
-    if(pbc_flag) {
-      if(domain->triclinic) {
-      struct AtomVecDPDKokkos_PackCommSelf<LMPDeviceType,1,1> f(atomKK->k_x,
-          atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,
-          nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      } else {
-      struct AtomVecDPDKokkos_PackCommSelf<LMPDeviceType,1,0> f(atomKK->k_x,
-          atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,
-          nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      }
-    } else {
-      if(domain->triclinic) {
-      struct AtomVecDPDKokkos_PackCommSelf<LMPDeviceType,0,1> f(atomKK->k_x,
-          atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,
-          nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      } else {
-      struct AtomVecDPDKokkos_PackCommSelf<LMPDeviceType,0,0> f(atomKK->k_x,
-          atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,
-          nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      }
-    }
-    LMPDeviceType::fence(); // fence the device execution space after the launch
-  }
-	return n*3; // NOTE(review): 3 per atom even though the functor is also given four DPD fields - confirm what callers expect
-}
-
-/* ---------------------------------------------------------------------- */
-
-// Kokkos functor for forward-communication unpack. Row i of the receive
-// buffer holds 7 values (x, y, z, dpdTheta, uCond, uMech, uChem) which are
-// stored into the per-atom views at index i+_first.
-template<class DeviceType>
-struct AtomVecDPDKokkos_UnpackComm {
-  typedef DeviceType device_type;
-  typedef ArrayTypes<DeviceType> AT;
-
-  typename AT::t_x_array _x;
-  typename AT::t_efloat_1d _dpdTheta;
-  typename AT::t_efloat_1d _uCond;
-  typename AT::t_efloat_1d _uMech;
-  typename AT::t_efloat_1d _uChem;
-  typename AT::t_xfloat_2d_const _buf;
-  int _first;
-
-  // Takes the DualViews and keeps the DeviceType side of each one.
-  AtomVecDPDKokkos_UnpackComm(
-      const typename DAT::tdual_x_array &x,
-      const typename DAT::tdual_efloat_1d &dpdTheta,
-      const typename DAT::tdual_efloat_1d &uCond,
-      const typename DAT::tdual_efloat_1d &uMech,
-      const typename DAT::tdual_efloat_1d &uChem,
-      const typename DAT::tdual_xfloat_2d &buf,
-      const int& first)
-    : _x(x.view<DeviceType>()),
-      _dpdTheta(dpdTheta.view<DeviceType>()),
-      _uCond(uCond.view<DeviceType>()),
-      _uMech(uMech.view<DeviceType>()),
-      _uChem(uChem.view<DeviceType>()),
-      _buf(buf.view<DeviceType>()),
-      _first(first) {}
-
-  KOKKOS_INLINE_FUNCTION
-  void operator() (const int& i) const {
-    const int dest = i + _first;   // destination atom slot for buffer row i
-    _x(dest,0) = _buf(i,0);
-    _x(dest,1) = _buf(i,1);
-    _x(dest,2) = _buf(i,2);
-    _dpdTheta(dest) = _buf(i,3);
-    _uCond(dest) = _buf(i,4);
-    _uMech(dest) = _buf(i,5);
-    _uChem(dest) = _buf(i,6);
-  }
-};
-
-/* ---------------------------------------------------------------------- */
-
-// Unpacks n forward-communication rows from buf into the atom arrays,
-// starting at atom index first, by launching AtomVecDPDKokkos_UnpackComm on
-// the host or on the device depending on commKK->forward_comm_on_host.
-// Each branch fences the execution space it actually launched on.
-// NOTE(review): only X_MASK is synced/flagged although the functor also
-// writes dpdTheta/uCond/uMech/uChem - confirm whether DPD masks are needed.
-void AtomVecDPDKokkos::unpack_comm_kokkos(const int &n, const int &first,
-    const DAT::tdual_xfloat_2d &buf ) {
-  if(commKK->forward_comm_on_host) {
-    sync(Host,X_MASK);
-    modified(Host,X_MASK);
-    AtomVecDPDKokkos_UnpackComm<LMPHostType> f(atomKK->k_x,
-      atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,
-      buf,first);
-    Kokkos::parallel_for(n,f);
-    // the kernel ran on the host space, so that is the space to fence
-    // (this branch used to fence LMPDeviceType)
-    LMPHostType::fence();
-  } else {
-    sync(Device,X_MASK);
-    modified(Device,X_MASK);
-    AtomVecDPDKokkos_UnpackComm<LMPDeviceType> f(atomKK->k_x,
-      atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,
-      buf,first);
-    Kokkos::parallel_for(n,f);
-    LMPDeviceType::fence();
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-// Host-side forward-communication pack. For each of the n atoms named in
-// list, appends 7 doubles to buf: position (shifted by the periodic image
-// offset when pbc_flag is set) followed by dpdTheta, uCond, uMech, uChem.
-// Returns the number of doubles written.
-//
-// Both branches read the DPD fields through the h_* host views; the
-// non-periodic branch previously went through the raw dpdTheta/uCond/
-// uMech/uChem pointers while the periodic branch used the views.
-int AtomVecDPDKokkos::pack_comm(int n, int *list, double *buf,
-                             int pbc_flag, int *pbc)
-{
-  int m = 0;
-
-  if (pbc_flag == 0) {
-    for (int i = 0; i < n; i++) {
-      const int j = list[i];
-      buf[m++] = h_x(j,0);
-      buf[m++] = h_x(j,1);
-      buf[m++] = h_x(j,2);
-      buf[m++] = h_dpdTheta(j);
-      buf[m++] = h_uCond(j);
-      buf[m++] = h_uMech(j);
-      buf[m++] = h_uChem(j);
-    }
-    return m;
-  }
-
-  // periodic image offset; triclinic boxes add the tilt-factor terms
-  double dx,dy,dz;
-  if (domain->triclinic == 0) {
-    dx = pbc[0]*domain->xprd;
-    dy = pbc[1]*domain->yprd;
-    dz = pbc[2]*domain->zprd;
-  } else {
-    dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
-    dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
-    dz = pbc[2]*domain->zprd;
-  }
-
-  for (int i = 0; i < n; i++) {
-    const int j = list[i];
-    buf[m++] = h_x(j,0) + dx;
-    buf[m++] = h_x(j,1) + dy;
-    buf[m++] = h_x(j,2) + dz;
-    buf[m++] = h_dpdTheta(j);
-    buf[m++] = h_uCond(j);
-    buf[m++] = h_uMech(j);
-    buf[m++] = h_uChem(j);
-  }
-  return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-// Host-side forward-communication pack including velocities. For each of
-// the n atoms named in list, appends 10 doubles to buf: position, velocity,
-// dpdTheta, uCond, uMech, uChem. With pbc_flag set the position is shifted
-// by the periodic image offset; with deform_vremap also set, atoms in the
-// deform group get the box-rate velocity offset added.
-// Returns the number of doubles written.
-//
-// The deform-group test uses the mask of atom j, the atom being packed.
-// It previously indexed the mask with the send-list slot i, i.e. tested an
-// unrelated atom.
-int AtomVecDPDKokkos::pack_comm_vel(int n, int *list, double *buf,
-                                 int pbc_flag, int *pbc)
-{
-  int m = 0;
-
-  if (pbc_flag == 0) {
-    for (int i = 0; i < n; i++) {
-      const int j = list[i];
-      buf[m++] = h_x(j,0);
-      buf[m++] = h_x(j,1);
-      buf[m++] = h_x(j,2);
-      buf[m++] = h_v(j,0);
-      buf[m++] = h_v(j,1);
-      buf[m++] = h_v(j,2);
-      buf[m++] = h_dpdTheta(j);
-      buf[m++] = h_uCond(j);
-      buf[m++] = h_uMech(j);
-      buf[m++] = h_uChem(j);
-    }
-    return m;
-  }
-
-  // periodic image offset; triclinic boxes add the tilt-factor terms
-  double dx,dy,dz;
-  if (domain->triclinic == 0) {
-    dx = pbc[0]*domain->xprd;
-    dy = pbc[1]*domain->yprd;
-    dz = pbc[2]*domain->zprd;
-  } else {
-    dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
-    dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
-    dz = pbc[2]*domain->zprd;
-  }
-
-  if (!deform_vremap) {
-    for (int i = 0; i < n; i++) {
-      const int j = list[i];
-      buf[m++] = h_x(j,0) + dx;
-      buf[m++] = h_x(j,1) + dy;
-      buf[m++] = h_x(j,2) + dz;
-      buf[m++] = h_v(j,0);
-      buf[m++] = h_v(j,1);
-      buf[m++] = h_v(j,2);
-      buf[m++] = h_dpdTheta(j);
-      buf[m++] = h_uCond(j);
-      buf[m++] = h_uMech(j);
-      buf[m++] = h_uChem(j);
-    }
-    return m;
-  }
-
-  // velocity offset applied to atoms in the deform group
-  const double dvx = pbc[0]*h_rate[0] + pbc[5]*h_rate[5] + pbc[4]*h_rate[4];
-  const double dvy = pbc[1]*h_rate[1] + pbc[3]*h_rate[3];
-  const double dvz = pbc[2]*h_rate[2];
-
-  for (int i = 0; i < n; i++) {
-    const int j = list[i];
-    buf[m++] = h_x(j,0) + dx;
-    buf[m++] = h_x(j,1) + dy;
-    buf[m++] = h_x(j,2) + dz;
-    if (h_mask(j) & deform_groupbit) {
-      buf[m++] = h_v(j,0) + dvx;
-      buf[m++] = h_v(j,1) + dvy;
-      buf[m++] = h_v(j,2) + dvz;
-    } else {
-      buf[m++] = h_v(j,0);
-      buf[m++] = h_v(j,1);
-      buf[m++] = h_v(j,2);
-    }
-    buf[m++] = h_dpdTheta(j);
-    buf[m++] = h_uCond(j);
-    buf[m++] = h_uMech(j);
-    buf[m++] = h_uChem(j);
-  }
-  return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-// Host-side forward-communication unpack: reads 7 doubles per atom from buf
-// (x, y, z, dpdTheta, uCond, uMech, uChem) into atoms first..first+n-1.
-void AtomVecDPDKokkos::unpack_comm(int n, int first, double *buf)
-{
-  const int stop = first + n;
-  const double *rec = buf;   // start of the current atom's 7-value record
-
-  for (int iatom = first; iatom < stop; ++iatom, rec += 7) {
-    h_x(iatom,0) = rec[0];
-    h_x(iatom,1) = rec[1];
-    h_x(iatom,2) = rec[2];
-    h_dpdTheta[iatom] = rec[3];
-    h_uCond[iatom] = rec[4];
-    h_uMech[iatom] = rec[5];
-    h_uChem[iatom] = rec[6];
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-// Host-side forward-communication unpack with velocities: reads 10 doubles
-// per atom from buf (position, velocity, dpdTheta, uCond, uMech, uChem)
-// into atoms first..first+n-1.
-void AtomVecDPDKokkos::unpack_comm_vel(int n, int first, double *buf)
-{
-  const int stop = first + n;
-  const double *rec = buf;   // start of the current atom's 10-value record
-
-  for (int iatom = first; iatom < stop; ++iatom, rec += 10) {
-    h_x(iatom,0) = rec[0];
-    h_x(iatom,1) = rec[1];
-    h_x(iatom,2) = rec[2];
-    h_v(iatom,0) = rec[3];
-    h_v(iatom,1) = rec[4];
-    h_v(iatom,2) = rec[5];
-    h_dpdTheta[iatom] = rec[6];
-    h_uCond[iatom] = rec[7];
-    h_uMech[iatom] = rec[8];
-    h_uChem[iatom] = rec[9];
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-// Reverse-communication pack: copies the force components of atoms
-// first..first+n-1 into buf (3 doubles per atom) and returns the number of
-// doubles written. The host force view is synced first when n > 0.
-int AtomVecDPDKokkos::pack_reverse(int n, int first, double *buf)
-{
-  if (n > 0) sync(Host,F_MASK);
-
-  int count = 0;
-  for (int k = 0; k < n; ++k) {
-    const int src = first + k;
-    buf[count]   = h_f(src,0);
-    buf[count+1] = h_f(src,1);
-    buf[count+2] = h_f(src,2);
-    count += 3;
-  }
-  return count;
-}
-
-/* ---------------------------------------------------------------------- */
-
-// Reverse-communication unpack: adds 3 force components per entry from buf
-// onto the atoms named in list. When n > 0 the host force view is synced
-// and flagged as modified before accumulation.
-void AtomVecDPDKokkos::unpack_reverse(int n, int *list, double *buf)
-{
-  if (n > 0) {
-    sync(Host,F_MASK);
-    modified(Host,F_MASK);
-  }
-
-  const double *rec = buf;   // current 3-value force record
-  for (int k = 0; k < n; ++k, rec += 3) {
-    const int dest = list[k];
-    h_f(dest,0) += rec[0];
-    h_f(dest,1) += rec[1];
-    h_f(dest,2) += rec[2];
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-// Kokkos functor that packs border atoms. For send-list entry i (atom
-// j = _list(_iswap,i)) it fills row i of _buf with 12 values:
-//   0-2  position, shifted by (_dx,_dy,_dz) when PBC_FLAG != 0
-//   3-5  tag, type, mask (converted numerically to the buffer type)
-//   6-11 dpdTheta, uCond, uMech, uChem, uCG, uCGnew
-// _uCG is bound to the uCG argument; it was previously bound to uCGnew, so
-// column 10 carried a second copy of uCGnew.
-template<class DeviceType,int PBC_FLAG>
-struct AtomVecDPDKokkos_PackBorder {
-  typedef DeviceType device_type;
-
-  typename ArrayTypes<DeviceType>::t_xfloat_2d _buf;
-  const typename ArrayTypes<DeviceType>::t_int_2d_const _list;
-  const int _iswap;
-  const typename ArrayTypes<DeviceType>::t_x_array_randomread _x;
-  const typename ArrayTypes<DeviceType>::t_tagint_1d _tag;
-  const typename ArrayTypes<DeviceType>::t_int_1d _type;
-  const typename ArrayTypes<DeviceType>::t_int_1d _mask;
-  typename ArrayTypes<DeviceType>::t_efloat_1d _dpdTheta,_uCond,_uMech,_uChem,_uCG,_uCGnew;
-  X_FLOAT _dx,_dy,_dz;
-
-  AtomVecDPDKokkos_PackBorder(
-      const typename ArrayTypes<DeviceType>::t_xfloat_2d &buf,
-      const typename ArrayTypes<DeviceType>::t_int_2d_const &list,
-      const int & iswap,
-      const typename ArrayTypes<DeviceType>::t_x_array &x,
-      const typename ArrayTypes<DeviceType>::t_tagint_1d &tag,
-      const typename ArrayTypes<DeviceType>::t_int_1d &type,
-      const typename ArrayTypes<DeviceType>::t_int_1d &mask,
-      const typename ArrayTypes<DeviceType>::t_efloat_1d &dpdTheta,
-      const typename ArrayTypes<DeviceType>::t_efloat_1d &uCond,
-      const typename ArrayTypes<DeviceType>::t_efloat_1d &uMech,
-      const typename ArrayTypes<DeviceType>::t_efloat_1d &uChem,
-      const typename ArrayTypes<DeviceType>::t_efloat_1d &uCG,
-      const typename ArrayTypes<DeviceType>::t_efloat_1d &uCGnew,
-      const X_FLOAT &dx, const X_FLOAT &dy, const X_FLOAT &dz):
-      _buf(buf),_list(list),_iswap(iswap),
-      _x(x),_tag(tag),_type(type),_mask(mask),
-      _dpdTheta(dpdTheta),
-      _uCond(uCond),
-      _uMech(uMech),
-      _uChem(uChem),
-      _uCG(uCG),
-      _uCGnew(uCGnew),
-      _dx(dx),_dy(dy),_dz(dz) {}
-
-  KOKKOS_INLINE_FUNCTION
-  void operator() (const int& i) const {
-      const int j = _list(_iswap,i);
-      if (PBC_FLAG == 0) {
-          _buf(i,0) = _x(j,0);
-          _buf(i,1) = _x(j,1);
-          _buf(i,2) = _x(j,2);
-      } else {
-          _buf(i,0) = _x(j,0) + _dx;
-          _buf(i,1) = _x(j,1) + _dy;
-          _buf(i,2) = _x(j,2) + _dz;
-      }
-      _buf(i,3) = _tag(j);
-      _buf(i,4) = _type(j);
-      _buf(i,5) = _mask(j);
-      _buf(i,6) = _dpdTheta(j);
-      _buf(i,7) = _uCond(j);
-      _buf(i,8) = _uMech(j);
-      _buf(i,9) = _uChem(j);
-      _buf(i,10) = _uCG(j);
-      _buf(i,11) = _uCGnew(j);
-  }
-};
-
-/* ---------------------------------------------------------------------- */
-
-// Packs n border atoms (row iswap of k_sendlist) into buf by launching
-// AtomVecDPDKokkos_PackBorder on the requested execution space.
-// With pbc_flag set, positions are shifted by pbc[] times the box lengths
-// for orthogonal boxes, or by the raw pbc[] values for triclinic boxes.
-//
-// Returns n*12, the number of values the functor writes (12 columns per
-// atom). It previously returned n*6, which only covers half of each
-// packed record.
-// NOTE(review): confirm every caller treats the return value as the
-// packed-value count.
-int AtomVecDPDKokkos::pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist, DAT::tdual_xfloat_2d buf,int iswap,
-                               int pbc_flag, int *pbc, ExecutionSpace space)
-{
-  X_FLOAT dx,dy,dz;
-
-  if (pbc_flag != 0) {
-    if (domain->triclinic == 0) {
-      dx = pbc[0]*domain->xprd;
-      dy = pbc[1]*domain->yprd;
-      dz = pbc[2]*domain->zprd;
-    } else {
-      dx = pbc[0];
-      dy = pbc[1];
-      dz = pbc[2];
-    }
-    if(space==Host) {
-      AtomVecDPDKokkos_PackBorder<LMPHostType,1> f(
-        buf.view<LMPHostType>(), k_sendlist.view<LMPHostType>(),
-        iswap,h_x,h_tag,h_type,h_mask,
-        h_dpdTheta,h_uCond,h_uMech,h_uChem,h_uCG,h_uCGnew,
-        dx,dy,dz);
-      Kokkos::parallel_for(n,f);
-      LMPHostType::fence();
-    } else {
-      AtomVecDPDKokkos_PackBorder<LMPDeviceType,1> f(
-        buf.view<LMPDeviceType>(), k_sendlist.view<LMPDeviceType>(),
-        iswap,d_x,d_tag,d_type,d_mask,
-        d_dpdTheta,d_uCond,d_uMech,d_uChem,d_uCG,d_uCGnew,
-        dx,dy,dz);
-      Kokkos::parallel_for(n,f);
-      LMPDeviceType::fence();
-    }
-
-  } else {
-    // no periodic shift: the PBC_FLAG=0 functor ignores dx,dy,dz
-    dx = dy = dz = 0;
-    if(space==Host) {
-      AtomVecDPDKokkos_PackBorder<LMPHostType,0> f(
-        buf.view<LMPHostType>(), k_sendlist.view<LMPHostType>(),
-        iswap,h_x,h_tag,h_type,h_mask,
-        h_dpdTheta,h_uCond,h_uMech,h_uChem,h_uCG,h_uCGnew,
-        dx,dy,dz);
-      Kokkos::parallel_for(n,f);
-      LMPHostType::fence();
-    } else {
-      AtomVecDPDKokkos_PackBorder<LMPDeviceType,0> f(
-        buf.view<LMPDeviceType>(), k_sendlist.view<LMPDeviceType>(),
-        iswap,d_x,d_tag,d_type,d_mask,
-        d_dpdTheta,d_uCond,d_uMech,d_uChem,d_uCG,d_uCGnew,
-        dx,dy,dz);
-      Kokkos::parallel_for(n,f);
-      LMPDeviceType::fence();
-    }
-  }
-  return n*12;
-}
-
-/* ---------------------------------------------------------------------- */
-
-// Host-side border pack. For each of the n atoms named in list, appends 12
-// doubles to buf: position (shifted when pbc_flag is set), tag/type/mask
-// stored through ubuf, then dpdTheta, uCond, uMech, uChem, uCG, uCGnew.
-// The fixes listed in atom->extra_border then append their own border data.
-// Returns the total number of doubles written.
-int AtomVecDPDKokkos::pack_border(int n, int *list, double *buf,
-                               int pbc_flag, int *pbc)
-{
-  int m = 0;
-
-  if (pbc_flag != 0) {
-    // image shift: box lengths for orthogonal boxes, raw pbc values otherwise
-    double shiftx,shifty,shiftz;
-    if (domain->triclinic != 0) {
-      shiftx = pbc[0];
-      shifty = pbc[1];
-      shiftz = pbc[2];
-    } else {
-      shiftx = pbc[0]*domain->xprd;
-      shifty = pbc[1]*domain->yprd;
-      shiftz = pbc[2]*domain->zprd;
-    }
-    for (int k = 0; k < n; ++k) {
-      const int j = list[k];
-      buf[m++] = h_x(j,0) + shiftx;
-      buf[m++] = h_x(j,1) + shifty;
-      buf[m++] = h_x(j,2) + shiftz;
-      buf[m++] = ubuf(h_tag(j)).d;
-      buf[m++] = ubuf(h_type(j)).d;
-      buf[m++] = ubuf(h_mask(j)).d;
-      buf[m++] = h_dpdTheta(j);
-      buf[m++] = h_uCond(j);
-      buf[m++] = h_uMech(j);
-      buf[m++] = h_uChem(j);
-      buf[m++] = h_uCG(j);
-      buf[m++] = h_uCGnew(j);
-    }
-  } else {
-    for (int k = 0; k < n; ++k) {
-      const int j = list[k];
-      buf[m++] = h_x(j,0);
-      buf[m++] = h_x(j,1);
-      buf[m++] = h_x(j,2);
-      buf[m++] = ubuf(h_tag(j)).d;
-      buf[m++] = ubuf(h_type(j)).d;
-      buf[m++] = ubuf(h_mask(j)).d;
-      buf[m++] = h_dpdTheta(j);
-      buf[m++] = h_uCond(j);
-      buf[m++] = h_uMech(j);
-      buf[m++] = h_uChem(j);
-      buf[m++] = h_uCG(j);
-      buf[m++] = h_uCGnew(j);
-    }
-  }
-
-  // the loop is empty when atom->nextra_border is zero
-  for (int iextra = 0; iextra < atom->nextra_border; iextra++)
-    m += modify->fix[atom->extra_border[iextra]]->pack_border(n,list,&buf[m]);
-
-  return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-// Host-side border pack including velocities. For each of the n atoms
-// named in list, appends 15 doubles to buf: position, tag/type/mask stored
-// through ubuf, velocity, then dpdTheta, uCond, uMech, uChem, uCG, uCGnew.
-// With pbc_flag set the position is shifted; with deform_vremap also set,
-// atoms in the deform group get the box-rate velocity offset added.
-// The fixes listed in atom->extra_border then append their own border data.
-// Returns the total number of doubles written.
-//
-// The deform-group test uses the mask of atom j, the atom being packed.
-// It previously indexed the mask with the send-list slot i, i.e. tested an
-// unrelated atom.
-int AtomVecDPDKokkos::pack_border_vel(int n, int *list, double *buf,
-                                   int pbc_flag, int *pbc)
-{
-  int i,j,m;
-  double dx,dy,dz,dvx,dvy,dvz;
-
-  m = 0;
-  if (pbc_flag == 0) {
-    for (i = 0; i < n; i++) {
-      j = list[i];
-      buf[m++] = h_x(j,0);
-      buf[m++] = h_x(j,1);
-      buf[m++] = h_x(j,2);
-      buf[m++] = ubuf(h_tag(j)).d;
-      buf[m++] = ubuf(h_type(j)).d;
-      buf[m++] = ubuf(h_mask(j)).d;
-      buf[m++] = h_v(j,0);
-      buf[m++] = h_v(j,1);
-      buf[m++] = h_v(j,2);
-      buf[m++] = h_dpdTheta(j);
-      buf[m++] = h_uCond(j);
-      buf[m++] = h_uMech(j);
-      buf[m++] = h_uChem(j);
-      buf[m++] = h_uCG(j);
-      buf[m++] = h_uCGnew(j);
-    }
-  } else {
-    // image shift: box lengths for orthogonal boxes, raw pbc values otherwise
-    if (domain->triclinic == 0) {
-      dx = pbc[0]*domain->xprd;
-      dy = pbc[1]*domain->yprd;
-      dz = pbc[2]*domain->zprd;
-    } else {
-      dx = pbc[0];
-      dy = pbc[1];
-      dz = pbc[2];
-    }
-    if (!deform_vremap) {
-      for (i = 0; i < n; i++) {
-        j = list[i];
-        buf[m++] = h_x(j,0) + dx;
-        buf[m++] = h_x(j,1) + dy;
-        buf[m++] = h_x(j,2) + dz;
-        buf[m++] = ubuf(h_tag(j)).d;
-        buf[m++] = ubuf(h_type(j)).d;
-        buf[m++] = ubuf(h_mask(j)).d;
-        buf[m++] = h_v(j,0);
-        buf[m++] = h_v(j,1);
-        buf[m++] = h_v(j,2);
-        buf[m++] = h_dpdTheta(j);
-        buf[m++] = h_uCond(j);
-        buf[m++] = h_uMech(j);
-        buf[m++] = h_uChem(j);
-        buf[m++] = h_uCG(j);
-        buf[m++] = h_uCGnew(j);
-      }
-    } else {
-      // velocity offset applied to atoms in the deform group
-      dvx = pbc[0]*h_rate[0] + pbc[5]*h_rate[5] + pbc[4]*h_rate[4];
-      dvy = pbc[1]*h_rate[1] + pbc[3]*h_rate[3];
-      dvz = pbc[2]*h_rate[2];
-      for (i = 0; i < n; i++) {
-        j = list[i];
-        buf[m++] = h_x(j,0) + dx;
-        buf[m++] = h_x(j,1) + dy;
-        buf[m++] = h_x(j,2) + dz;
-        buf[m++] = ubuf(h_tag(j)).d;
-        buf[m++] = ubuf(h_type(j)).d;
-        buf[m++] = ubuf(h_mask(j)).d;
-        if (h_mask(j) & deform_groupbit) {
-          buf[m++] = h_v(j,0) + dvx;
-          buf[m++] = h_v(j,1) + dvy;
-          buf[m++] = h_v(j,2) + dvz;
-        } else {
-          buf[m++] = h_v(j,0);
-          buf[m++] = h_v(j,1);
-          buf[m++] = h_v(j,2);
-        }
-        buf[m++] = h_dpdTheta(j);
-        buf[m++] = h_uCond(j);
-        buf[m++] = h_uMech(j);
-        buf[m++] = h_uChem(j);
-        buf[m++] = h_uCG(j);
-        buf[m++] = h_uCGnew(j);
-      }
-    }
-  }
-
-  if (atom->nextra_border)
-    for (int iextra = 0; iextra < atom->nextra_border; iextra++)
-      m += modify->fix[atom->extra_border[iextra]]->pack_border(n,list,&buf[m]);
-
-  return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-// Hybrid-style forward pack: appends the six DPD fields (dpdTheta, uCond,
-// uMech, uChem, uCG, uCGnew) of each listed atom to buf and returns the
-// number of doubles written.
-int AtomVecDPDKokkos::pack_comm_hybrid(int n, int *list, double *buf)
-{
-  int m = 0;
-  for (int k = 0; k < n; ++k) {
-    const int j = list[k];
-    double *rec = buf + m;   // this atom's 6-value record
-    rec[0] = h_dpdTheta[j];
-    rec[1] = h_uCond[j];
-    rec[2] = h_uMech[j];
-    rec[3] = h_uChem[j];
-    rec[4] = h_uCG[j];
-    rec[5] = h_uCGnew[j];
-    m += 6;
-  }
-  return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-// Hybrid-style border pack: appends the six DPD fields (dpdTheta, uCond,
-// uMech, uChem, uCG, uCGnew) of each listed atom to buf and returns the
-// number of doubles written.
-int AtomVecDPDKokkos::pack_border_hybrid(int n, int *list, double *buf)
-{
-  int m = 0;
-  for (int k = 0; k < n; ++k) {
-    const int j = list[k];
-    double *rec = buf + m;   // this atom's 6-value record
-    rec[0] = h_dpdTheta[j];
-    rec[1] = h_uCond[j];
-    rec[2] = h_uMech[j];
-    rec[3] = h_uChem[j];
-    rec[4] = h_uCG[j];
-    rec[5] = h_uCGnew[j];
-    m += 6;
-  }
-  return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-// Kokkos functor that unpacks border atoms. Row i of _buf (12 columns) is
-// stored into the per-atom views at index i+_first:
-//   0-2  position
-//   3-5  tag, type, mask (numeric conversion from the buffer type)
-//   6-11 dpdTheta, uCond, uMech, uChem, uCG, uCGnew
-// _uCG is bound to the uCG argument; it was previously bound to uCGnew, so
-// uCG was never written and uCGnew was written twice.
-// The tag is converted with static_cast<tagint> rather than through int so
-// a tagint wider than int is not truncated.
-template<class DeviceType>
-struct AtomVecDPDKokkos_UnpackBorder {
-  typedef DeviceType device_type;
-
-  const typename ArrayTypes<DeviceType>::t_xfloat_2d_const _buf;
-  typename ArrayTypes<DeviceType>::t_x_array _x;
-  typename ArrayTypes<DeviceType>::t_tagint_1d _tag;
-  typename ArrayTypes<DeviceType>::t_int_1d _type;
-  typename ArrayTypes<DeviceType>::t_int_1d _mask;
-  typename ArrayTypes<DeviceType>::t_efloat_1d _dpdTheta,_uCond,_uMech,_uChem,_uCG,_uCGnew;
-  int _first;
-
-
-  AtomVecDPDKokkos_UnpackBorder(
-      const typename ArrayTypes<DeviceType>::t_xfloat_2d_const &buf,
-      typename ArrayTypes<DeviceType>::t_x_array &x,
-      typename ArrayTypes<DeviceType>::t_tagint_1d &tag,
-      typename ArrayTypes<DeviceType>::t_int_1d &type,
-      typename ArrayTypes<DeviceType>::t_int_1d &mask,
-      const typename ArrayTypes<DeviceType>::t_efloat_1d &dpdTheta,
-      const typename ArrayTypes<DeviceType>::t_efloat_1d &uCond,
-      const typename ArrayTypes<DeviceType>::t_efloat_1d &uMech,
-      const typename ArrayTypes<DeviceType>::t_efloat_1d &uChem,
-      const typename ArrayTypes<DeviceType>::t_efloat_1d &uCG,
-      const typename ArrayTypes<DeviceType>::t_efloat_1d &uCGnew,
-      const int& first):
-      _buf(buf),_x(x),_tag(tag),_type(type),_mask(mask),
-      _dpdTheta(dpdTheta),
-      _uCond(uCond),
-      _uMech(uMech),
-      _uChem(uChem),
-      _uCG(uCG),
-      _uCGnew(uCGnew),
-      _first(first) {}
-
-  KOKKOS_INLINE_FUNCTION
-  void operator() (const int& i) const {
-      _x(i+_first,0) = _buf(i,0);
-      _x(i+_first,1) = _buf(i,1);
-      _x(i+_first,2) = _buf(i,2);
-      _tag(i+_first) = static_cast<tagint> (_buf(i,3));
-      _type(i+_first) = static_cast<int>  (_buf(i,4));
-      _mask(i+_first) = static_cast<int>  (_buf(i,5));
-      _dpdTheta(i+_first) = _buf(i,6);
-      _uCond(i+_first) = _buf(i,7);
-      _uMech(i+_first) = _buf(i,8);
-      _uChem(i+_first) = _buf(i,9);
-      _uCG(i+_first) = _buf(i,10);
-      _uCGnew(i+_first) = _buf(i,11);
-  }
-};
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecDPDKokkos::unpack_border_kokkos(const int &n, const int &first, // unpack n border rows of buf into atom slots starting at first, on the requested space
-                     const DAT::tdual_xfloat_2d &buf,ExecutionSpace space) {
-  modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK); // flag the arrays as modified on this space before a possible grow()
-  while (first+n >= nmax) grow(0); // keep calling grow(0) while first+n >= nmax
-  modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK); // flagged again after grow(); NOTE(review): the functor also writes the DPD fields but no DPD mask is set here - confirm
-  if(space==Host) { // host launch on the h_* views
-    struct AtomVecDPDKokkos_UnpackBorder<LMPHostType> f(buf.view<LMPHostType>(),
-      h_x,h_tag,h_type,h_mask,
-      h_dpdTheta,h_uCond,h_uMech,h_uChem,h_uCG,h_uCGnew,
-      first);
-    Kokkos::parallel_for(n,f);
-    LMPHostType::fence();
-  } else { // device launch on the d_* views
-    struct AtomVecDPDKokkos_UnpackBorder<LMPDeviceType> f(buf.view<LMPDeviceType>(),
-      d_x,d_tag,d_type,d_mask,
-      d_dpdTheta,d_uCond,d_uMech,d_uChem,d_uCG,d_uCGnew,
-      first);
-    Kokkos::parallel_for(n,f);
-    LMPDeviceType::fence();
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-// Host-side border unpack: reads 12 doubles per atom from buf (position,
-// tag/type/mask decoded through ubuf, dpdTheta, uCond, uMech, uChem, uCG,
-// uCGnew) into atoms first..first+n-1, calling grow(0) when the index
-// reaches nmax. The fixes listed in atom->extra_border then unpack their
-// own data from the remainder of buf.
-void AtomVecDPDKokkos::unpack_border(int n, int first, double *buf)
-{
-  int m = 0;
-  const int stop = first + n;
-
-  for (int iatom = first; iatom < stop; ++iatom) {
-    if (iatom == nmax) grow(0);
-    modified(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK);
-
-    const double *rec = buf + m;   // this atom's 12-value record
-    h_x(iatom,0) = rec[0];
-    h_x(iatom,1) = rec[1];
-    h_x(iatom,2) = rec[2];
-    h_tag(iatom) =  (tagint)  ubuf(rec[3]).i;
-    h_type(iatom) = (int) ubuf(rec[4]).i;
-    h_mask(iatom) = (int) ubuf(rec[5]).i;
-    h_dpdTheta(iatom) = rec[6];
-    h_uCond(iatom) = rec[7];
-    h_uMech(iatom) = rec[8];
-    h_uChem(iatom) = rec[9];
-    h_uCG(iatom) = rec[10];
-    h_uCGnew(iatom) = rec[11];
-    m += 12;
-  }
-
-  // the loop is empty when atom->nextra_border is zero
-  for (int iextra = 0; iextra < atom->nextra_border; iextra++)
-    m += modify->fix[atom->extra_border[iextra]]->
-      unpack_border(n,first,&buf[m]);
-}
-
-/* ---------------------------------------------------------------------- */
-
-// Host-side border unpack with velocities: reads 15 doubles per atom from
-// buf (position, tag/type/mask decoded through ubuf, velocity, dpdTheta,
-// uCond, uMech, uChem, uCG, uCGnew) into atoms first..first+n-1, calling
-// grow(0) when the index reaches nmax. The fixes listed in
-// atom->extra_border then unpack their own data from the remainder of buf.
-void AtomVecDPDKokkos::unpack_border_vel(int n, int first, double *buf)
-{
-  int m = 0;
-  const int stop = first + n;
-
-  for (int iatom = first; iatom < stop; ++iatom) {
-    if (iatom == nmax) grow(0);
-    modified(Host,X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK);
-
-    const double *rec = buf + m;   // this atom's 15-value record
-    h_x(iatom,0) = rec[0];
-    h_x(iatom,1) = rec[1];
-    h_x(iatom,2) = rec[2];
-    h_tag(iatom) =  (tagint)  ubuf(rec[3]).i;
-    h_type(iatom) = (int) ubuf(rec[4]).i;
-    h_mask(iatom) = (int) ubuf(rec[5]).i;
-    h_v(iatom,0) = rec[6];
-    h_v(iatom,1) = rec[7];
-    h_v(iatom,2) = rec[8];
-    h_dpdTheta(iatom) = rec[9];
-    h_uCond(iatom) = rec[10];
-    h_uMech(iatom) = rec[11];
-    h_uChem(iatom) = rec[12];
-    h_uCG(iatom) = rec[13];
-    h_uCGnew(iatom) = rec[14];
-    m += 15;
-  }
-
-  // the loop is empty when atom->nextra_border is zero
-  for (int iextra = 0; iextra < atom->nextra_border; iextra++)
-    m += modify->fix[atom->extra_border[iextra]]->
-      unpack_border(n,first,&buf[m]);
-}
-
-/* ---------------------------------------------------------------------- */
-
-// Hybrid-style forward unpack: reads the six DPD fields (dpdTheta, uCond,
-// uMech, uChem, uCG, uCGnew) for atoms first..first+n-1 from buf and
-// returns the number of doubles consumed.
-int AtomVecDPDKokkos::unpack_comm_hybrid(int n, int first, double *buf)
-{
-  int m = 0;
-  const int stop = first + n;
-  for (int iatom = first; iatom < stop; ++iatom) {
-    const double *rec = buf + m;   // this atom's 6-value record
-    h_dpdTheta(iatom) = rec[0];
-    h_uCond(iatom) = rec[1];
-    h_uMech(iatom) = rec[2];
-    h_uChem(iatom) = rec[3];
-    h_uCG(iatom) = rec[4];
-    h_uCGnew(iatom) = rec[5];
-    m += 6;
-  }
-  return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-// Hybrid-style border unpack: reads the six DPD fields (dpdTheta, uCond,
-// uMech, uChem, uCG, uCGnew) for atoms first..first+n-1 from buf and
-// returns the number of doubles consumed.
-int AtomVecDPDKokkos::unpack_border_hybrid(int n, int first, double *buf)
-{
-  int m = 0;
-  const int stop = first + n;
-  for (int iatom = first; iatom < stop; ++iatom) {
-    const double *rec = buf + m;   // this atom's 6-value record
-    h_dpdTheta(iatom) = rec[0];
-    h_uCond(iatom) = rec[1];
-    h_uMech(iatom) = rec[2];
-    h_uChem(iatom) = rec[3];
-    h_uCG(iatom) = rec[4];
-    h_uCGnew(iatom) = rec[5];
-    m += 6;
-  }
-  return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-template<class DeviceType> // Kokkos functor: packs each departing atom into a 17-value buffer row, then fills its slot from the copy list
-struct AtomVecDPDKokkos_PackExchangeFunctor {
-  typedef DeviceType device_type;
-  typedef ArrayTypes<DeviceType> AT;
-  typename AT::t_x_array_randomread _x; // read-side views of the per-atom arrays
-  typename AT::t_v_array_randomread _v;
-  typename AT::t_tagint_1d_randomread _tag;
-  typename AT::t_int_1d_randomread _type;
-  typename AT::t_int_1d_randomread _mask;
-  typename AT::t_imageint_1d_randomread _image;
-  typename AT::t_efloat_1d_randomread _dpdTheta,_uCond,_uMech,_uChem,_uCG,_uCGnew;
-  typename AT::t_x_array _xw; // writable views built from the same DualViews as the read-side ones above
-  typename AT::t_v_array _vw;
-  typename AT::t_tagint_1d _tagw;
-  typename AT::t_int_1d _typew;
-  typename AT::t_int_1d _maskw;
-  typename AT::t_imageint_1d _imagew;
-  typename AT::t_efloat_1d _dpdThetaw,_uCondw,_uMechw,_uChemw,_uCGw,_uCGneww;
-
-  typename AT::t_xfloat_2d_um _buf; // send buffer, set up by buffer_view() in the constructor
-  typename AT::t_int_1d_const _sendlist;
-  typename AT::t_int_1d_const _copylist;
-  int _nlocal,_dim; // stored but not read by operator()
-  X_FLOAT _lo,_hi; // stored but not read by operator()
-
-  AtomVecDPDKokkos_PackExchangeFunctor(
-      const AtomKokkos* atom,
-      const typename AT::tdual_xfloat_2d buf,
-      typename AT::tdual_int_1d sendlist,
-      typename AT::tdual_int_1d copylist,int nlocal, int dim,
-                X_FLOAT lo, X_FLOAT hi):
-                _x(atom->k_x.view<DeviceType>()),
-                _v(atom->k_v.view<DeviceType>()),
-                _tag(atom->k_tag.view<DeviceType>()),
-                _type(atom->k_type.view<DeviceType>()),
-                _mask(atom->k_mask.view<DeviceType>()),
-                _image(atom->k_image.view<DeviceType>()),
-                _dpdTheta(atom->k_dpdTheta.view<DeviceType>()),
-                _uCond(atom->k_uCond.view<DeviceType>()),
-                _uMech(atom->k_uMech.view<DeviceType>()),
-                _uChem(atom->k_uChem.view<DeviceType>()),
-                _uCG(atom->k_uCG.view<DeviceType>()),
-                _uCGnew(atom->k_uCGnew.view<DeviceType>()),
-                _xw(atom->k_x.view<DeviceType>()),
-                _vw(atom->k_v.view<DeviceType>()),
-                _tagw(atom->k_tag.view<DeviceType>()),
-                _typew(atom->k_type.view<DeviceType>()),
-                _maskw(atom->k_mask.view<DeviceType>()),
-                _imagew(atom->k_image.view<DeviceType>()),
-                _dpdThetaw(atom->k_dpdTheta.view<DeviceType>()),
-                _uCondw(atom->k_uCond.view<DeviceType>()),
-                _uMechw(atom->k_uMech.view<DeviceType>()),
-                _uChemw(atom->k_uChem.view<DeviceType>()),
-                _uCGw(atom->k_uCG.view<DeviceType>()),
-                _uCGneww(atom->k_uCGnew.view<DeviceType>()),
-                _sendlist(sendlist.template view<DeviceType>()),
-                _copylist(copylist.template view<DeviceType>()),
-                _nlocal(nlocal),_dim(dim),
-                _lo(lo),_hi(hi){
-    const size_t elements = 17; // values per atom; must match the row width written in operator()
-    const int maxsendlist = (buf.template view<DeviceType>().dimension_0()*buf.template view<DeviceType>().dimension_1())/elements; // total buffer entries divided by the row width
-
-    buffer_view<DeviceType>(_buf,buf,maxsendlist,elements); // presumably rebinds _buf as a maxsendlist x 17 view of buf - confirm against buffer_view
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  void operator() (const int &mysend) const {
-    const int i = _sendlist(mysend); // index of the atom to send
-    _buf(mysend,0) = 17; // column 0 holds the record length
-    _buf(mysend,1) = _x(i,0);
-    _buf(mysend,2) = _x(i,1);
-    _buf(mysend,3) = _x(i,2);
-    _buf(mysend,4) = _v(i,0);
-    _buf(mysend,5) = _v(i,1);
-    _buf(mysend,6) = _v(i,2);
-    _buf(mysend,7) = _tag[i]; // integer fields are converted numerically to the buffer type
-    _buf(mysend,8) = _type[i];
-    _buf(mysend,9) = _mask[i];
-    _buf(mysend,10) = _image[i];
-    _buf(mysend,11) = _dpdTheta[i];
-    _buf(mysend,12) = _uCond[i];
-    _buf(mysend,13) = _uMech[i];
-    _buf(mysend,14) = _uChem[i];
-    _buf(mysend,15) = _uCG[i];
-    _buf(mysend,16) = _uCGnew[i];
-    const int j = _copylist(mysend); // atom whose data replaces slot i; negative means no copy
-
-    if(j>-1) {
-    _xw(i,0) = _x(j,0);
-    _xw(i,1) = _x(j,1);
-    _xw(i,2) = _x(j,2);
-    _vw(i,0) = _v(j,0);
-    _vw(i,1) = _v(j,1);
-    _vw(i,2) = _v(j,2);
-    _tagw[i] = _tag(j);
-    _typew[i] = _type(j);
-    _maskw[i] = _mask(j);
-    _imagew[i] = _image(j);
-    _dpdThetaw[i] = _dpdTheta(j);
-    _uCondw[i] = _uCond(j);
-    _uMechw[i] = _uMech(j);
-    _uChemw[i] = _uChem(j);
-    _uCGw[i] = _uCG(j);
-    _uCGneww[i] = _uCGnew(j);
-    }
-  }
-};
-
-/* ---------------------------------------------------------------------- */
-
-// Packs nsend departing atoms (17 values each) into k_buf by launching
-// AtomVecDPDKokkos_PackExchangeFunctor on the requested space. k_buf is
-// resized first when it cannot hold nsend records. Returns nsend*17.
-int AtomVecDPDKokkos::pack_exchange_kokkos(const int &nsend,DAT::tdual_xfloat_2d &k_buf, DAT::tdual_int_1d k_sendlist,DAT::tdual_int_1d k_copylist,ExecutionSpace space,int dim,X_FLOAT lo,X_FLOAT hi )
-{
-  // number of whole 17-value records the current buffer can hold
-  const int capacity = (int) (k_buf.view<LMPHostType>().dimension_0()*k_buf.view<LMPHostType>().dimension_1())/17;
-  if (nsend > capacity) {
-    // keep the row width, add enough rows for nsend records
-    int newsize = nsend*17/k_buf.view<LMPHostType>().dimension_1()+1;
-    k_buf.resize(newsize,k_buf.view<LMPHostType>().dimension_1());
-  }
-
-  if (space != Host) {
-    AtomVecDPDKokkos_PackExchangeFunctor<LMPDeviceType> f(atomKK,k_buf,k_sendlist,k_copylist,atom->nlocal,dim,lo,hi);
-    Kokkos::parallel_for(nsend,f);
-    LMPDeviceType::fence();
-  } else {
-    AtomVecDPDKokkos_PackExchangeFunctor<LMPHostType> f(atomKK,k_buf,k_sendlist,k_copylist,atom->nlocal,dim,lo,hi);
-    Kokkos::parallel_for(nsend,f);
-    LMPHostType::fence();
-  }
-  return nsend*17;
-}
-
-/* ---------------------------------------------------------------------- */
-
-// Host-side exchange pack for atom i. Slots 1..16 of buf receive position,
-// velocity, tag/type/mask/image (stored through ubuf) and the six DPD
-// fields; the fixes listed in atom->extra_grow append their data after
-// that. buf[0] is then set to the total length, which is also returned.
-int AtomVecDPDKokkos::pack_exchange(int i, double *buf)
-{
-  buf[1] = h_x(i,0);
-  buf[2] = h_x(i,1);
-  buf[3] = h_x(i,2);
-  buf[4] = h_v(i,0);
-  buf[5] = h_v(i,1);
-  buf[6] = h_v(i,2);
-  buf[7] = ubuf(h_tag(i)).d;
-  buf[8] = ubuf(h_type(i)).d;
-  buf[9] = ubuf(h_mask(i)).d;
-  buf[10] = ubuf(h_image(i)).d;
-  buf[11] = h_dpdTheta[i];
-  buf[12] = h_uCond[i];
-  buf[13] = h_uMech[i];
-  buf[14] = h_uChem[i];
-  buf[15] = h_uCG[i];
-  buf[16] = h_uCGnew[i];
-
-  int m = 17;   // next free slot after the fixed-size part
-
-  // the loop is empty when atom->nextra_grow is zero
-  for (int iextra = 0; iextra < atom->nextra_grow; iextra++)
-    m += modify->fix[atom->extra_grow[iextra]]->pack_exchange(i,&buf[m]);
-
-  buf[0] = m;
-  return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-// Kokkos functor that unpacks exchanged atoms. Each buffer row holds 17
-// values (length, position, velocity, tag, type, mask, image and the six
-// DPD fields). A row is accepted when its coordinate along _dim lies in
-// [_lo,_hi); accepted atoms are appended at the index obtained by
-// atomically incrementing _nlocal(0).
-//
-// The constructor binds the six DPD views to the atom's DualViews. They
-// were previously left default-constructed, so operator() wrote the DPD
-// fields through views that referred to no atom data.
-template<class DeviceType>
-struct AtomVecDPDKokkos_UnpackExchangeFunctor {
-  typedef DeviceType device_type;
-  typedef ArrayTypes<DeviceType> AT;
-  typename AT::t_x_array _x;
-  typename AT::t_v_array _v;
-  typename AT::t_tagint_1d _tag;
-  typename AT::t_int_1d _type;
-  typename AT::t_int_1d _mask;
-  typename AT::t_imageint_1d _image;
-  typename AT::t_efloat_1d _dpdTheta;
-  typename AT::t_efloat_1d _uCond;
-  typename AT::t_efloat_1d _uMech;
-  typename AT::t_efloat_1d _uChem;
-  typename AT::t_efloat_1d _uCG;
-  typename AT::t_efloat_1d _uCGnew;
-
-  typename AT::t_xfloat_2d_um _buf;
-  typename AT::t_int_1d _nlocal;
-  int _dim;
-  X_FLOAT _lo,_hi;
-
-  AtomVecDPDKokkos_UnpackExchangeFunctor(
-      const AtomKokkos* atom,
-      const typename AT::tdual_xfloat_2d buf,
-      typename AT::tdual_int_1d nlocal,
-      int dim, X_FLOAT lo, X_FLOAT hi):
-                _x(atom->k_x.view<DeviceType>()),
-                _v(atom->k_v.view<DeviceType>()),
-                _tag(atom->k_tag.view<DeviceType>()),
-                _type(atom->k_type.view<DeviceType>()),
-                _mask(atom->k_mask.view<DeviceType>()),
-                _image(atom->k_image.view<DeviceType>()),
-                _dpdTheta(atom->k_dpdTheta.view<DeviceType>()),
-                _uCond(atom->k_uCond.view<DeviceType>()),
-                _uMech(atom->k_uMech.view<DeviceType>()),
-                _uChem(atom->k_uChem.view<DeviceType>()),
-                _uCG(atom->k_uCG.view<DeviceType>()),
-                _uCGnew(atom->k_uCGnew.view<DeviceType>()),
-                _nlocal(nlocal.template view<DeviceType>()),_dim(dim),
-                _lo(lo),_hi(hi){
-    // values per atom; must match the row layout read in operator()
-    const size_t elements = 17;
-    const int maxsendlist = (buf.template view<DeviceType>().dimension_0()*buf.template view<DeviceType>().dimension_1())/elements;
-
-    buffer_view<DeviceType>(_buf,buf,maxsendlist,elements);
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  void operator() (const int &myrecv) const {
-    // columns 1..3 hold x,y,z, so _dim+1 selects the coordinate to test
-    X_FLOAT x = _buf(myrecv,_dim+1);
-    if (x >= _lo && x < _hi) {
-      // reserve the next free local slot
-      int i = Kokkos::atomic_fetch_add(&_nlocal(0),1);
-      _x(i,0) = _buf(myrecv,1);
-      _x(i,1) = _buf(myrecv,2);
-      _x(i,2) = _buf(myrecv,3);
-      _v(i,0) = _buf(myrecv,4);
-      _v(i,1) = _buf(myrecv,5);
-      _v(i,2) = _buf(myrecv,6);
-      _tag[i] = _buf(myrecv,7);
-      _type[i] = _buf(myrecv,8);
-      _mask[i] = _buf(myrecv,9);
-      _image[i] = _buf(myrecv,10);
-      _dpdTheta[i] = _buf(myrecv,11);
-      _uCond[i] = _buf(myrecv,12);
-      _uMech[i] = _buf(myrecv,13);
-      _uChem[i] = _buf(myrecv,14);
-      _uCG[i] = _buf(myrecv,15);
-      _uCGnew[i] = _buf(myrecv,16);
-    }
-  }
-};
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecDPDKokkos::unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf,int nrecv,int nlocal,int dim,X_FLOAT lo,X_FLOAT hi,ExecutionSpace space) {
-  if(space == Host) {
-    k_count.h_view(0) = nlocal;
-    AtomVecDPDKokkos_UnpackExchangeFunctor<LMPHostType> f(atomKK,k_buf,k_count,dim,lo,hi);
-    Kokkos::parallel_for(nrecv/17,f);
-    LMPHostType::fence();
-    return k_count.h_view(0);
-  } else {
-    k_count.h_view(0) = nlocal;
-    k_count.modify<LMPHostType>();
-    k_count.sync<LMPDeviceType>();
-    AtomVecDPDKokkos_UnpackExchangeFunctor<LMPDeviceType> f(atomKK,k_buf,k_count,dim,lo,hi);
-    Kokkos::parallel_for(nrecv/17,f);
-    LMPDeviceType::fence();
-    k_count.modify<LMPDeviceType>();
-    k_count.sync<LMPHostType>();
-
-    return k_count.h_view(0);
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecDPDKokkos::unpack_exchange(double *buf)
-{
-  int nlocal = atom->nlocal;
-  if (nlocal == nmax) grow(0);
-  modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
-           MASK_MASK | IMAGE_MASK);
-
-  int m = 1;
-  h_x(nlocal,0) = buf[m++];
-  h_x(nlocal,1) = buf[m++];
-  h_x(nlocal,2) = buf[m++];
-  h_v(nlocal,0) = buf[m++];
-  h_v(nlocal,1) = buf[m++];
-  h_v(nlocal,2) = buf[m++];
-  h_tag(nlocal) = (tagint) ubuf(buf[m++]).i;
-  h_type(nlocal) = (int) ubuf(buf[m++]).i;
-  h_mask(nlocal) = (int) ubuf(buf[m++]).i;
-  h_image(nlocal) = (imageint) ubuf(buf[m++]).i;
-  h_dpdTheta[nlocal] = buf[m++];
-  h_uCond[nlocal] = buf[m++];
-  h_uMech[nlocal] = buf[m++];
-  h_uChem[nlocal] = buf[m++];
-  h_uCG[nlocal] = buf[m++];
-  h_uCGnew[nlocal] = buf[m++];
-
-  if (atom->nextra_grow)
-    for (int iextra = 0; iextra < atom->nextra_grow; iextra++)
-      m += modify->fix[atom->extra_grow[iextra]]->
-        unpack_exchange(nlocal,&buf[m]);
-
-  atom->nlocal++;
-  return m;
-}
-
-/* ----------------------------------------------------------------------
-   size of restart data for all atoms owned by this proc
-   include extra data stored by fixes
-------------------------------------------------------------------------- */
-
-int AtomVecDPDKokkos::size_restart()
-{
-  int i;
-
-  int nlocal = atom->nlocal;
-  int n = 15 * nlocal; // 11 + dpdTheta + uCond + uMech + uChem
-
-  if (atom->nextra_restart)
-    for (int iextra = 0; iextra < atom->nextra_restart; iextra++)
-      for (i = 0; i < nlocal; i++)
-        n += modify->fix[atom->extra_restart[iextra]]->size_restart(i);
-
-  return n;
-}
-
-/* ----------------------------------------------------------------------
-   pack atom I's data for restart file including extra quantities
-   xyz must be 1st 3 values, so that read_restart can test on them
-   molecular types may be negative, but write as positive
-------------------------------------------------------------------------- */
-
-int AtomVecDPDKokkos::pack_restart(int i, double *buf)
-{
-  sync(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
-            MASK_MASK | IMAGE_MASK );
-
-  int m = 1;
-  buf[m++] = h_x(i,0);
-  buf[m++] = h_x(i,1);
-  buf[m++] = h_x(i,2);
-  buf[m++] = ubuf(h_tag(i)).d;
-  buf[m++] = ubuf(h_type(i)).d;
-  buf[m++] = ubuf(h_mask(i)).d;
-  buf[m++] = ubuf(h_image(i)).d;
-  buf[m++] = h_v(i,0);
-  buf[m++] = h_v(i,1);
-  buf[m++] = h_v(i,2);
-  buf[m++] = h_dpdTheta[i];
-  buf[m++] = h_uCond[i];
-  buf[m++] = h_uMech[i];
-  buf[m++] = h_uChem[i];
-
-  if (atom->nextra_restart)
-    for (int iextra = 0; iextra < atom->nextra_restart; iextra++)
-      m += modify->fix[atom->extra_restart[iextra]]->pack_restart(i,&buf[m]);
-
-  buf[0] = m;
-  return m;
-}
-
-/* ----------------------------------------------------------------------
-   unpack data for one atom from restart file including extra quantities
-------------------------------------------------------------------------- */
-
-int AtomVecDPDKokkos::unpack_restart(double *buf)
-{
-  int nlocal = atom->nlocal;
-  if (nlocal == nmax) {
-    grow(0);
-    if (atom->nextra_store)
-      memory->grow(atom->extra,nmax,atom->nextra_store,"atom:extra");
-  }
-  modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
-                MASK_MASK | IMAGE_MASK );
-
-  int m = 1;
-  h_x(nlocal,0) = buf[m++];
-  h_x(nlocal,1) = buf[m++];
-  h_x(nlocal,2) = buf[m++];
-  h_tag(nlocal) = (tagint) ubuf(buf[m++]).i;
-  h_type(nlocal) = (int) ubuf(buf[m++]).i;
-  h_mask(nlocal) = (int) ubuf(buf[m++]).i;
-  h_image(nlocal) = (imageint) ubuf(buf[m++]).i;
-  h_v(nlocal,0) = buf[m++];
-  h_v(nlocal,1) = buf[m++];
-  h_v(nlocal,2) = buf[m++];
-  h_dpdTheta[nlocal] = buf[m++];
-  h_uCond[nlocal] = buf[m++];
-  h_uMech[nlocal] = buf[m++];
-  h_uChem[nlocal] = buf[m++];
-
-  double **extra = atom->extra;
-  if (atom->nextra_store) {
-    int size = static_cast<int> (ubuf(buf[m++]).i) - m;
-    for (int i = 0; i < size; i++) extra[nlocal][i] = buf[m++];
-  }
-
-  atom->nlocal++;
-  return m;
-}
-
-/* ----------------------------------------------------------------------
-   create one atom of itype at coord
-   set other values to defaults
-------------------------------------------------------------------------- */
-
-void AtomVecDPDKokkos::create_atom(int itype, double *coord)
-{
-  int nlocal = atom->nlocal;
-  if (nlocal == nmax) {
-    //if(nlocal>2) printf("typeA: %i %i\n",type[0],type[1]);
-    atomKK->modified(Host,ALL_MASK);
-    grow(0);
-    //if(nlocal>2) printf("typeB: %i %i\n",type[0],type[1]);
-  }
-  atomKK->modified(Host,ALL_MASK);
-
-  tag[nlocal] = 0;
-  type[nlocal] = itype;
-  h_x(nlocal,0) = coord[0];
-  h_x(nlocal,1) = coord[1];
-  h_x(nlocal,2) = coord[2];
-  h_mask[nlocal] = 1;
-  h_image[nlocal] = ((tagint) IMGMAX << IMG2BITS) |
-    ((tagint) IMGMAX << IMGBITS) | IMGMAX;
-  h_v(nlocal,0) = 0.0;
-  h_v(nlocal,1) = 0.0;
-  h_v(nlocal,2) = 0.0;
-  h_rho[nlocal] = 0.0;
-  h_dpdTheta[nlocal] = 0.0;
-  h_uCond[nlocal] = 0.0;
-  h_uMech[nlocal] = 0.0;
-  h_uChem[nlocal] = 0.0;
-  h_uCG[nlocal] = 0.0;
-  h_uCGnew[nlocal] = 0.0;
-  h_duChem[nlocal] = 0.0;
-
-  atom->nlocal++;
-}
-
-/* ----------------------------------------------------------------------
-   unpack one line from Atoms section of data file
-   initialize other atom quantities
-------------------------------------------------------------------------- */
-
-void AtomVecDPDKokkos::data_atom(double *coord, tagint imagetmp,
-                                    char **values)
-{
-  int nlocal = atom->nlocal;
-  if (nlocal == nmax) grow(0);
-
-  h_tag[nlocal] = ATOTAGINT(values[0]);
-  h_type[nlocal] = atoi(values[1]);
-  if (type[nlocal] <= 0 || type[nlocal] > atom->ntypes)
-    error->one(FLERR,"Invalid atom type in Atoms section of data file");
-
-  h_dpdTheta[nlocal] = atof(values[2]);
-  if (h_dpdTheta[nlocal] <= 0)
-    error->one(FLERR,"Internal temperature in Atoms section of date file must be > zero");
-
-  h_x(nlocal,0) = coord[0];
-  h_x(nlocal,1) = coord[1];
-  h_x(nlocal,2) = coord[2];
-
-  h_image[nlocal] = imagetmp;
-
-  h_mask[nlocal] = 1;
-  h_v(nlocal,0) = 0.0;
-  h_v(nlocal,1) = 0.0;
-  h_v(nlocal,2) = 0.0;
-
-  h_rho[nlocal] = 0.0;
-  h_uCond[nlocal] = 0.0;
-  h_uMech[nlocal] = 0.0;
-  h_uChem[nlocal] = 0.0;
-  h_uCG[nlocal] = 0.0;
-  h_uCGnew[nlocal] = 0.0;
-
-  atomKK->modified(Host,ALL_MASK);
-
-  atom->nlocal++;
-}
-
-/* ----------------------------------------------------------------------
-   unpack hybrid quantities from one line in Atoms section of data file
-   initialize other atom quantities for this sub-style
-------------------------------------------------------------------------- */
-
-int AtomVecDPDKokkos::data_atom_hybrid(int nlocal, char **values)
-{
-  h_dpdTheta(nlocal) = atof(values[0]);
-
-  return 1;
-}
-
-/* ----------------------------------------------------------------------
-   pack atom info for data file including 3 image flags
-------------------------------------------------------------------------- */
-
-void AtomVecDPDKokkos::pack_data(double **buf)
-{
-  int nlocal = atom->nlocal;
-  for (int i = 0; i < nlocal; i++) {
-    buf[i][0] = ubuf(h_tag(i)).d;
-    buf[i][1] = ubuf(h_type(i)).d;
-    buf[i][2] = h_dpdTheta(i);
-    buf[i][3] = h_x(i,0);
-    buf[i][4] = h_x(i,1);
-    buf[i][5] = h_x(i,2);
-    buf[i][6] = (h_image[i] & IMGMASK) - IMGMAX;
-    buf[i][7] = (h_image[i] >> IMGBITS & IMGMASK) - IMGMAX;
-    buf[i][8] = (h_image[i] >> IMG2BITS) - IMGMAX;
-  }
-}
-
-/* ----------------------------------------------------------------------
-   pack hybrid atom info for data file
-------------------------------------------------------------------------- */
-
-int AtomVecDPDKokkos::pack_data_hybrid(int i, double *buf)
-{
-  buf[0] = h_dpdTheta(i);
-  return 1;
-}
-
-/* ----------------------------------------------------------------------
-   write atom info to data file including 3 image flags
-------------------------------------------------------------------------- */
-
-void AtomVecDPDKokkos::write_data(FILE *fp, int n, double **buf)
-{
-  for (int i = 0; i < n; i++)
-    fprintf(fp,TAGINT_FORMAT " %d %-1.16e %-1.16e %-1.16e %-1.16e %d %d %d\n",
-            (tagint) ubuf(buf[i][0]).i,(int) ubuf(buf[i][1]).i,
-            buf[i][2],buf[i][3],buf[i][4],buf[i][5],
-            (int) ubuf(buf[i][6]).i,(int) ubuf(buf[i][7]).i,
-            (int) ubuf(buf[i][8]).i);
-}
-
-/* ----------------------------------------------------------------------
-   write hybrid atom info to data file
-------------------------------------------------------------------------- */
-
-int AtomVecDPDKokkos::write_data_hybrid(FILE *fp, double *buf)
-{
-  fprintf(fp," %-1.16e",buf[0]);
-  return 1;
-}
-
-/* ----------------------------------------------------------------------
-   return # of bytes of allocated memory
-------------------------------------------------------------------------- */
-
-bigint AtomVecDPDKokkos::memory_usage()
-{
-  bigint bytes = 0;
-
-  if (atom->memcheck("tag")) bytes += memory->usage(tag,nmax);
-  if (atom->memcheck("type")) bytes += memory->usage(type,nmax);
-  if (atom->memcheck("mask")) bytes += memory->usage(mask,nmax);
-  if (atom->memcheck("image")) bytes += memory->usage(image,nmax);
-  if (atom->memcheck("x")) bytes += memory->usage(x,nmax,3);
-  if (atom->memcheck("v")) bytes += memory->usage(v,nmax,3);
-  if (atom->memcheck("f")) bytes += memory->usage(f,nmax*commKK->nthreads,3);
-  if (atom->memcheck("rho")) bytes += memory->usage(rho,nmax);
-  if (atom->memcheck("dpdTheta")) bytes += memory->usage(dpdTheta,nmax);
-  if (atom->memcheck("uCond")) bytes += memory->usage(uCond,nmax);
-  if (atom->memcheck("uMech")) bytes += memory->usage(uMech,nmax);
-  if (atom->memcheck("uChem")) bytes += memory->usage(uChem,nmax);
-  if (atom->memcheck("uCG")) bytes += memory->usage(uCG,nmax);
-  if (atom->memcheck("uCGnew")) bytes += memory->usage(uCGnew,nmax);
-  if (atom->memcheck("duChem")) bytes += memory->usage(duChem,nmax);
-
-  return bytes;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecDPDKokkos::sync(ExecutionSpace space, unsigned int mask)
-{
-  if (space == Device) {
-    if (mask & X_MASK) atomKK->k_x.sync<LMPDeviceType>();
-    if (mask & V_MASK) atomKK->k_v.sync<LMPDeviceType>();
-    if (mask & F_MASK) atomKK->k_f.sync<LMPDeviceType>();
-    if (mask & TAG_MASK) atomKK->k_tag.sync<LMPDeviceType>();
-    if (mask & TYPE_MASK) atomKK->k_type.sync<LMPDeviceType>();
-    if (mask & MASK_MASK) atomKK->k_mask.sync<LMPDeviceType>();
-    if (mask & IMAGE_MASK) atomKK->k_image.sync<LMPDeviceType>();
-  } else {
-    if (mask & X_MASK) atomKK->k_x.sync<LMPHostType>();
-    if (mask & V_MASK) atomKK->k_v.sync<LMPHostType>();
-    if (mask & F_MASK) atomKK->k_f.sync<LMPHostType>();
-    if (mask & TAG_MASK) atomKK->k_tag.sync<LMPHostType>();
-    if (mask & TYPE_MASK) atomKK->k_type.sync<LMPHostType>();
-    if (mask & MASK_MASK) atomKK->k_mask.sync<LMPHostType>();
-    if (mask & IMAGE_MASK) atomKK->k_image.sync<LMPHostType>();
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecDPDKokkos::sync_overlapping_device(ExecutionSpace space, unsigned int mask)
-{
-  if (space == Device) {
-    if ((mask & X_MASK) && atomKK->k_x.need_sync<LMPDeviceType>())
-      perform_async_copy<DAT::tdual_x_array>(atomKK->k_x,space);
-    if ((mask & V_MASK) && atomKK->k_v.need_sync<LMPDeviceType>())
-      perform_async_copy<DAT::tdual_v_array>(atomKK->k_v,space);
-    if ((mask & F_MASK) && atomKK->k_f.need_sync<LMPDeviceType>())
-      perform_async_copy<DAT::tdual_f_array>(atomKK->k_f,space);
-    if ((mask & TAG_MASK) && atomKK->k_tag.need_sync<LMPDeviceType>())
-      perform_async_copy<DAT::tdual_tagint_1d>(atomKK->k_tag,space);
-    if ((mask & TYPE_MASK) && atomKK->k_type.need_sync<LMPDeviceType>())
-      perform_async_copy<DAT::tdual_int_1d>(atomKK->k_type,space);
-    if ((mask & MASK_MASK) && atomKK->k_mask.need_sync<LMPDeviceType>())
-      perform_async_copy<DAT::tdual_int_1d>(atomKK->k_mask,space);
-    if ((mask & IMAGE_MASK) && atomKK->k_image.need_sync<LMPDeviceType>())
-      perform_async_copy<DAT::tdual_imageint_1d>(atomKK->k_image,space);
-  } else {
-    if ((mask & X_MASK) && atomKK->k_x.need_sync<LMPHostType>())
-      perform_async_copy<DAT::tdual_x_array>(atomKK->k_x,space);
-    if ((mask & V_MASK) && atomKK->k_v.need_sync<LMPHostType>())
-      perform_async_copy<DAT::tdual_v_array>(atomKK->k_v,space);
-    if ((mask & F_MASK) && atomKK->k_f.need_sync<LMPHostType>())
-      perform_async_copy<DAT::tdual_f_array>(atomKK->k_f,space);
-    if ((mask & TAG_MASK) && atomKK->k_tag.need_sync<LMPHostType>())
-      perform_async_copy<DAT::tdual_tagint_1d>(atomKK->k_tag,space);
-    if ((mask & TYPE_MASK) && atomKK->k_type.need_sync<LMPHostType>())
-      perform_async_copy<DAT::tdual_int_1d>(atomKK->k_type,space);
-    if ((mask & MASK_MASK) && atomKK->k_mask.need_sync<LMPHostType>())
-      perform_async_copy<DAT::tdual_int_1d>(atomKK->k_mask,space);
-    if ((mask & IMAGE_MASK) && atomKK->k_image.need_sync<LMPHostType>())
-      perform_async_copy<DAT::tdual_imageint_1d>(atomKK->k_image,space);
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecDPDKokkos::modified(ExecutionSpace space, unsigned int mask)
-{
-  if (space == Device) {
-    if (mask & X_MASK) atomKK->k_x.modify<LMPDeviceType>();
-    if (mask & V_MASK) atomKK->k_v.modify<LMPDeviceType>();
-    if (mask & F_MASK) atomKK->k_f.modify<LMPDeviceType>();
-    if (mask & TAG_MASK) atomKK->k_tag.modify<LMPDeviceType>();
-    if (mask & TYPE_MASK) atomKK->k_type.modify<LMPDeviceType>();
-    if (mask & MASK_MASK) atomKK->k_mask.modify<LMPDeviceType>();
-    if (mask & IMAGE_MASK) atomKK->k_image.modify<LMPDeviceType>();
-  } else {
-    if (mask & X_MASK) atomKK->k_x.modify<LMPHostType>();
-    if (mask & V_MASK) atomKK->k_v.modify<LMPHostType>();
-    if (mask & F_MASK) atomKK->k_f.modify<LMPHostType>();
-    if (mask & TAG_MASK) atomKK->k_tag.modify<LMPHostType>();
-    if (mask & TYPE_MASK) atomKK->k_type.modify<LMPHostType>();
-    if (mask & MASK_MASK) atomKK->k_mask.modify<LMPHostType>();
-    if (mask & IMAGE_MASK) atomKK->k_image.modify<LMPHostType>();
-  }
-}
-
diff --git a/src/USER-DPD/atom_vec_dpd_kokkos.h b/src/USER-DPD/atom_vec_dpd_kokkos.h
deleted file mode 100644
index d108e58ae7..0000000000
--- a/src/USER-DPD/atom_vec_dpd_kokkos.h
+++ /dev/null
@@ -1,135 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale AtomicKokkos/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-#ifdef ATOM_CLASS
-
-AtomStyle(dpd/kk,AtomVecDPDKokkos)
-
-#else
-
-#ifndef LMP_ATOM_VEC_DPD_KOKKOS_H
-#define LMP_ATOM_VEC_DPD_KOKKOS_H
-
-#include "atom_vec_kokkos.h"
-#include "kokkos_type.h"
-
-namespace LAMMPS_NS {
-
-class AtomVecDPDKokkos : public AtomVecKokkos {
- public:
-  AtomVecDPDKokkos(class LAMMPS *);
-  virtual ~AtomVecDPDKokkos() {}
-  void grow(int);
-  void copy(int, int, int);
-  int pack_comm(int, int *, double *, int, int *);
-  int pack_comm_vel(int, int *, double *, int, int *);
-  int pack_comm_hybrid(int, int *, double *);
-  void unpack_comm(int, int, double *);
-  void unpack_comm_vel(int, int, double *);
-  int unpack_comm_hybrid(int, int, double *);
-  int pack_reverse(int, int, double *);
-  void unpack_reverse(int, int *, double *);
-  int pack_border(int, int *, double *, int, int *);
-  int pack_border_vel(int, int *, double *, int, int *);
-  int pack_border_hybrid(int, int *, double *);
-  void unpack_border(int, int, double *);
-  void unpack_border_vel(int, int, double *);
-  int unpack_border_hybrid(int, int, double *);
-  int pack_exchange(int, double *);
-  int unpack_exchange(double *);
-  int size_restart();
-  int pack_restart(int, double *);
-  int unpack_restart(double *);
-  void create_atom(int, double *);
-  void data_atom(double *, tagint, char **);
-  int data_atom_hybrid(int, char **);
-  void pack_data(double **);
-  int pack_data_hybrid(int, double *);
-  void write_data(FILE *, int, double **);
-  int write_data_hybrid(FILE *, double *);
-  bigint memory_usage();
-
-  void grow_reset();
-  int pack_comm_kokkos(const int &n, const DAT::tdual_int_2d &k_sendlist,
-                       const int & iswap,
-                       const DAT::tdual_xfloat_2d &buf,
-                       const int &pbc_flag, const int pbc[]);
-  void unpack_comm_kokkos(const int &n, const int &nfirst,
-                          const DAT::tdual_xfloat_2d &buf);
-  int pack_comm_self(const int &n, const DAT::tdual_int_2d &list,
-                     const int & iswap, const int nfirst,
-                     const int &pbc_flag, const int pbc[]);
-  int pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist,
-                         DAT::tdual_xfloat_2d buf,int iswap,
-                         int pbc_flag, int *pbc, ExecutionSpace space);
-  void unpack_border_kokkos(const int &n, const int &nfirst,
-                            const DAT::tdual_xfloat_2d &buf,
-                            ExecutionSpace space);
-  int pack_exchange_kokkos(const int &nsend,DAT::tdual_xfloat_2d &buf,
-                           DAT::tdual_int_1d k_sendlist,
-                           DAT::tdual_int_1d k_copylist,
-                           ExecutionSpace space, int dim,
-                           X_FLOAT lo, X_FLOAT hi);
-  int unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf, int nrecv,
-                             int nlocal, int dim, X_FLOAT lo, X_FLOAT hi,
-                             ExecutionSpace space);
-
-  void sync(ExecutionSpace space, unsigned int mask);
-  void modified(ExecutionSpace space, unsigned int mask);
-  void sync_overlapping_device(ExecutionSpace space, unsigned int mask);
-  double *uCond,*uMech,*uChem,*uCG,*uCGnew,*rho,*dpdTheta;
-  double *duChem;
-
- protected:
-  DAT::t_efloat_1d d_uCond, d_uMech, d_uChem, d_uCG, d_uCGnew,d_rho,d_dpdTheta,d_duChem;
-  HAT::t_efloat_1d h_uCond, h_uMech, h_uChem, h_uCG, h_uCGnew,h_rho,h_dpdTheta,h_duChem;
-
-  tagint *tag;
-  imageint *image;
-  int *type,*mask;
-  double **x,**v,**f;
-
-  DAT::t_tagint_1d d_tag;
-  HAT::t_tagint_1d h_tag;
-  DAT::t_imageint_1d d_image;
-  HAT::t_imageint_1d h_image;
-  DAT::t_int_1d d_type, d_mask;
-  HAT::t_int_1d h_type, h_mask;
-
-  DAT::t_x_array d_x;
-  DAT::t_v_array d_v;
-  DAT::t_f_array d_f;
-  HAT::t_x_array h_x;
-  HAT::t_v_array h_v;
-  HAT::t_f_array h_f;
-
-  DAT::tdual_int_1d k_count;
-};
-
-}
-
-#endif
-#endif
-
-/* ERROR/WARNING messages:
-
-E: Per-processor system is too big
-
-The number of owned atoms plus ghost atoms on a single
-processor must fit in 32-bit integer.
-
-E: Invalid atom type in Atoms section of data file
-
-Atom types must range from 1 to specified # of types.
-
-*/

From 1dbf6d443f45a96887c02999bec3832d3c534b61 Mon Sep 17 00:00:00 2001
From: Stan Moore <stanmoore1@gmail.com>
Date: Tue, 13 Dec 2016 16:43:40 -0700
Subject: [PATCH 007/267] Adding Kokkos files

---
 src/Depend.sh                            |   4 +
 src/KOKKOS/Install.sh                    |   6 +
 src/KOKKOS/fix_eos_table_rx_kokkos.cpp   | 354 ++++++++++
 src/KOKKOS/fix_eos_table_rx_kokkos.h     | 152 +++++
 src/KOKKOS/pair_exp6_rx_kokkos.cpp       |  46 +-
 src/KOKKOS/pair_exp6_rx_kokkos.h         |   9 +-
 src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp | 791 +++++++++++++++++++++++
 src/KOKKOS/pair_multi_lucy_rx_kokkos.h   | 215 ++++++
 src/KOKKOS/pair_table_kokkos.cpp         | 758 +---------------------
 src/KOKKOS/pair_table_kokkos.h           |  44 +-
 src/KOKKOS/pair_table_rx_kokkos.cpp      | 634 ++++++++++++++++++
 src/KOKKOS/pair_table_rx_kokkos.h        | 269 ++++++++
 src/USER-DPD/pair_multi_lucy.h           |   2 +-
 src/USER-DPD/pair_multi_lucy_rx.cpp      |   6 +-
 src/USER-DPD/pair_multi_lucy_rx.h        |   2 +-
 src/pair_table.h                         |   6 +-
 16 files changed, 2493 insertions(+), 805 deletions(-)
 create mode 100644 src/KOKKOS/fix_eos_table_rx_kokkos.cpp
 create mode 100644 src/KOKKOS/fix_eos_table_rx_kokkos.h
 create mode 100644 src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp
 create mode 100644 src/KOKKOS/pair_multi_lucy_rx_kokkos.h
 create mode 100644 src/KOKKOS/pair_table_rx_kokkos.cpp
 create mode 100644 src/KOKKOS/pair_table_rx_kokkos.h

diff --git a/src/Depend.sh b/src/Depend.sh
index 44964d5182..51f83b2ea5 100644
--- a/src/Depend.sh
+++ b/src/Depend.sh
@@ -113,6 +113,10 @@ if (test $1 = "USER-CG-CMM") then
   depend USER-OMP
 fi
 
+if (test $1 = "USER-DPD") then
+  depend KOKKOS
+fi
+
 if (test $1 = "USER-FEP") then
   depend USER-OMP
 fi
diff --git a/src/KOKKOS/Install.sh b/src/KOKKOS/Install.sh
index 1381a1978c..567e825642 100644
--- a/src/KOKKOS/Install.sh
+++ b/src/KOKKOS/Install.sh
@@ -73,6 +73,8 @@ action domain_kokkos.cpp
 action domain_kokkos.h
 action fix_deform_kokkos.cpp
 action fix_deform_kokkos.h
+action fix_eos_table_rx_kokkos.cpp fix_eos_table_rx.cpp
+action fix_eos_table_rx_kokkos.h fix_eos_table_rx.h  
 action fix_langevin_kokkos.cpp
 action fix_langevin_kokkos.h
 action fix_nh_kokkos.cpp
@@ -171,6 +173,8 @@ action pair_lj_gromacs_kokkos.cpp
 action pair_lj_gromacs_kokkos.h
 action pair_lj_sdk_kokkos.cpp pair_lj_sdk.cpp
 action pair_lj_sdk_kokkos.h pair_lj_sdk.h
+action pair_multi_lucy_rx_kokkos.cpp pair_multi_lucy_rx.cpp
+action pair_multi_lucy_rx_kokkos.h pair_multi_lucy_rx.h
 action pair_reax_c_kokkos.cpp pair_reax_c.cpp
 action pair_reax_c_kokkos.h pair_reax_c.h
 action pair_sw_kokkos.cpp pair_sw.cpp
@@ -179,6 +183,8 @@ action pair_vashishta_kokkos.cpp pair_vashishta.cpp
 action pair_vashishta_kokkos.h pair_vashishta.h
 action pair_table_kokkos.cpp
 action pair_table_kokkos.h
+action pair_table_rx_kokkos.cpp pair_table_rx.cpp
+action pair_table_rx_kokkos.h pair_table_rx.h  
 action pair_tersoff_kokkos.cpp pair_tersoff.cpp
 action pair_tersoff_kokkos.h pair_tersoff.h
 action pair_tersoff_mod_kokkos.cpp pair_tersoff_mod.cpp
diff --git a/src/KOKKOS/fix_eos_table_rx_kokkos.cpp b/src/KOKKOS/fix_eos_table_rx_kokkos.cpp
new file mode 100644
index 0000000000..a1e0b1a07d
--- /dev/null
+++ b/src/KOKKOS/fix_eos_table_rx_kokkos.cpp
@@ -0,0 +1,354 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Stan Moore (Sandia)
+------------------------------------------------------------------------- */
+
+#include <stdlib.h>
+#include <string.h>
+#include "fix_eos_table_rx_kokkos.h"
+#include "atom_kokkos.h"
+#include "error.h"
+#include "force.h"
+#include "memory.h"
+#include "comm.h"
+#include <math.h>
+#include "modify.h"
+#include "atom_masks.h"
+
+#define MAXLINE 1024
+
+using namespace LAMMPS_NS;
+using namespace FixConst;
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+FixEOStableRXKokkos<DeviceType>::FixEOStableRXKokkos(LAMMPS *lmp, int narg, char **arg) :
+  FixEOStableRX(lmp, narg, arg)
+{
+  atomKK = (AtomKokkos *) atom;
+  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
+  datamask_read = X_MASK | F_MASK | TYPE_MASK | ENERGY_MASK | VIRIAL_MASK;
+  datamask_modify = F_MASK | ENERGY_MASK | VIRIAL_MASK;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+FixEOStableRXKokkos<DeviceType>::~FixEOStableRXKokkos()
+{
+
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void FixEOStableRXKokkos<DeviceType>::setup(int vflag)
+{
+  int nlocal = atom->nlocal;
+  mask = atomKK->k_mask.view<DeviceType>();
+  uCond = atomKK->k_uCond.view<DeviceType>();
+  uMech = atomKK->k_uMech.view<DeviceType>();
+  uChem = atomKK->k_uChem.view<DeviceType>();
+  dpdTheta= atomKK->k_dpdTheta.view<DeviceType>();
+  uCG = atomKK->k_uCG.view<DeviceType>();
+  uCGnew = atomKK->k_uCGnew.view<DeviceType>();
+  double duChem;
+
+  for (int i = 0; i < nlocal; i++) // parallel_for
+    if (mask[i] & groupbit){
+      duChem = uCG[i] - uCGnew[i];
+      uChem[i] += duChem;
+      uCG[i] = 0.0;
+      uCGnew[i] = 0.0;
+    }
+
+  // Communicate the updated momenta and velocities to all nodes
+  comm->forward_comm_fix(this);
+
+  for (int i = 0; i < nlocal; i++) // parallel_for
+    if (mask[i] & groupbit)
+      temperature_lookup(i,uCond[i]+uMech[i]+uChem[i],dpdTheta[i]);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void FixEOStableRXKokkos<DeviceType>::init()
+{
+  int nlocal = atom->nlocal;
+  mask = atomKK->k_mask.view<DeviceType>();
+  uCond = atomKK->k_uCond.view<DeviceType>();
+  uMech = atomKK->k_uMech.view<DeviceType>();
+  uChem = atomKK->k_uChem.view<DeviceType>();
+  dpdTheta= atomKK->k_dpdTheta.view<DeviceType>();
+  double tmp;
+
+  if(this->restart_reset){
+    for (int i = 0; i < nlocal; i++)
+      if (mask[i] & groupbit)
+        temperature_lookup(i,uCond[i]+uMech[i]+uChem[i],dpdTheta[i]);
+  } else {
+    for (int i = 0; i < nlocal; i++)
+      if (mask[i] & groupbit) {
+        if(dpdTheta[i] <= 0.0)
+          error->one(FLERR,"Internal temperature <= zero");
+        energy_lookup(i,dpdTheta[i],tmp);
+        uCond[i] = tmp / 2.0;
+        uMech[i] = tmp / 2.0;
+        uChem[i] = 0.0;
+      }
+  }
+}
+
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void FixEOStableRXKokkos<DeviceType>::post_integrate()
+{
+  int nlocal = atom->nlocal;
+  mask = atomKK->k_mask.view<DeviceType>();
+  uCond = atomKK->k_uCond.view<DeviceType>();
+  uMech = atomKK->k_uMech.view<DeviceType>();
+  uChem = atomKK->k_uChem.view<DeviceType>();
+  dpdTheta= atomKK->k_dpdTheta.view<DeviceType>();
+
+  for (int i = 0; i < nlocal; i++)
+    if (mask[i] & groupbit){
+      temperature_lookup(i,uCond[i]+uMech[i]+uChem[i],dpdTheta[i]);
+      if(dpdTheta[i] <= 0.0)
+        error->one(FLERR,"Internal temperature <= zero");
+    }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void FixEOStableRXKokkos<DeviceType>::end_of_step()
+{
+  int nlocal = atom->nlocal;
+  mask = atomKK->k_mask.view<DeviceType>();
+  uCond = atomKK->k_uCond.view<DeviceType>();
+  uMech = atomKK->k_uMech.view<DeviceType>();
+  uChem = atomKK->k_uChem.view<DeviceType>();
+  dpdTheta= atomKK->k_dpdTheta.view<DeviceType>();
+  uCG = atomKK->k_uCG.view<DeviceType>();
+  uCGnew = atomKK->k_uCGnew.view<DeviceType>();
+  double duChem;
+
+  // Communicate the ghost uCGnew
+  comm->reverse_comm_fix(this);
+
+  for (int i = 0; i < nlocal; i++)
+    if (mask[i] & groupbit){
+      duChem = uCG[i] - uCGnew[i];
+      uChem[i] += duChem;
+      uCG[i] = 0.0;
+      uCGnew[i] = 0.0;
+    }
+
+  // Communicate the updated momenta and velocities to all nodes
+  comm->forward_comm_fix(this);
+
+  for (int i = 0; i < nlocal; i++)
+    if (mask[i] & groupbit){
+      temperature_lookup(i,uCond[i]+uMech[i]+uChem[i],dpdTheta[i]);
+      if(dpdTheta[i] <= 0.0)
+        error->one(FLERR,"Internal temperature <= zero");
+    }
+}
+
+/* ----------------------------------------------------------------------
+   calculate potential ui of particle id at temperature thetai (only the LINEAR table style contributes)
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void FixEOStableRXKokkos<DeviceType>::energy_lookup(int id, double thetai, double &ui) const
+{
+  int itable;
+  double fraction, uTmp, nTotal;
+  // NOTE: thetai is clamped to every species' [lo,hi] in turn; the final value feeds the last term
+  ui = 0.0;
+  nTotal = 0.0;
+  for(int ispecies=0;ispecies<nspecies;ispecies++){
+    Table *tb = &tables[ispecies];
+    thetai = MAX(thetai,tb->lo);
+    thetai = MIN(thetai,tb->hi);
+
+    if (tabstyle == LINEAR) {
+      itable = static_cast<int> ((thetai - tb->lo) * tb->invdelta);
+      fraction = (thetai - tb->r[itable]) * tb->invdelta;
+      uTmp = tb->e[itable] + fraction*tb->de[itable];
+
+      uTmp += dHf[ispecies];
+      // mol fraction form: weight by this particle's dvector entry for the species
+      ui += atom->dvector[ispecies][id]*uTmp;
+      nTotal += atom->dvector[ispecies][id];
+    }
+  }
+  ui = ui - double(nTotal+1.5)*force->boltz*thetai;
+}
+
+/* ----------------------------------------------------------------------
+   calculate temperature thetai at energy ui (secant iteration on energy_lookup; thetai in = first guess)
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void FixEOStableRXKokkos<DeviceType>::temperature_lookup(int id, double ui, double &thetai) const
+{
+  Table *tb = &tables[0];
+
+  int it;
+  double t1,t2,u1,u2,f1,f2;
+  const int maxit = 100;
+  double temp;
+  double delta = 0.001;
+
+  // Start from the incoming thetai, clamped to the first table's range
+  t1 = MAX(thetai,tb->lo);
+  t1 = MIN(t1,tb->hi);
+  if(t1==tb->hi) delta = -delta;   // at the upper bound, take the second point below it
+
+  // Compute u1 at thetai
+  energy_lookup(id,t1,u1);
+
+  // Compute f1
+  f1 = u1 - ui;
+
+  // Compute guess of t2
+  t2 = (1.0 + delta)*t1;
+
+  // Compute u2 at t2
+  energy_lookup(id,t2,u2);
+
+  // Compute f2
+  f2 = u2 - ui;
+
+  // Apply the Secant Method
+  for(it=0; it<maxit; it++){
+    if(fabs(f2-f1)<1e-15){
+      if(isnan(f1) || isnan(f2)) error->one(FLERR,"NaN detected in secant solver.");
+      temp = t1;
+      temp = MAX(temp,tb->lo);
+      temp = MIN(temp,tb->hi);
+      char str[256];   // snprintf truncates long %lf output instead of overrunning this buffer
+      snprintf(str,sizeof(str),"Secant solver did not converge because table bounds were exceeded:  it=%d id=%d ui=%lf thetai=%lf t1=%lf t2=%lf f1=%lf f2=%lf dpdTheta=%lf\n",it,id,ui,thetai,t1,t2,f1,f2,temp);
+      error->warning(FLERR,str);
+      break;
+    }
+    temp = t2 - f2*(t2-t1)/(f2-f1);
+    if(fabs(temp-t2) < 1e-6) break;
+    f1 = f2;
+    t1 = t2;
+    t2 = temp;
+    energy_lookup(id,t2,u2);
+    f2 = u2 - ui;
+  }
+  if(it==maxit){
+    char str[256];   // bounded formatting, see above
+    snprintf(str,sizeof(str),"Maxit exceeded in secant solver:  id=%d ui=%lf thetai=%lf t1=%lf t2=%lf f1=%lf f2=%lf\n",id,ui,thetai,t1,t2,f1,f2);
+    if(isnan(f1) || isnan(f2) || isnan(ui) || isnan(thetai) || isnan(t1) || isnan(t2))
+      error->one(FLERR,"NaN detected in secant solver.");
+    error->one(FLERR,str);
+  }
+  thetai = temp;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+int FixEOStableRXKokkos<DeviceType>::pack_forward_comm(int n, int *list, double *buf, int pbc_flag, int *pbc)
+{
+  // writes three values per listed atom: uChem, uCG, uCGnew; returns the number written
+  uChem = atomKK->k_uChem.view<DeviceType>();
+  uCG = atomKK->k_uCG.view<DeviceType>();
+  uCGnew = atomKK->k_uCGnew.view<DeviceType>();
+
+  double *out = buf;
+  for (int k = 0; k < n; k++) {
+    const int atomIndex = list[k];
+    *out++ = uChem[atomIndex];
+    *out++ = uCG[atomIndex];
+    *out++ = uCGnew[atomIndex];
+  }
+  return static_cast<int>(out - buf);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void FixEOStableRXKokkos<DeviceType>::unpack_forward_comm(int n, int first, double *buf)
+{
+  // fills atoms [first, first+n) with consecutive (uChem, uCG, uCGnew) triples from buf
+  uChem = atomKK->k_uChem.view<DeviceType>();
+  uCG = atomKK->k_uCG.view<DeviceType>();
+  uCGnew = atomKK->k_uCGnew.view<DeviceType>();
+
+  const double *in = buf;
+  const int end = first + n;
+  for (int atomIndex = first; atomIndex < end; atomIndex++) {
+    uChem[atomIndex]  = *in++;
+    uCG[atomIndex]    = *in++;
+    uCGnew[atomIndex] = *in++;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+int FixEOStableRXKokkos<DeviceType>::pack_reverse_comm(int n, int first, double *buf)
+{
+  // writes (uCG, uCGnew) for each atom in [first, first+n); returns the number of values written
+  uCG = atomKK->k_uCG.view<DeviceType>();
+  uCGnew = atomKK->k_uCGnew.view<DeviceType>();
+
+  double *out = buf;
+  const int end = first + n;
+  for (int atomIndex = first; atomIndex < end; atomIndex++) {
+    *out++ = uCG[atomIndex];
+    *out++ = uCGnew[atomIndex];
+  }
+  return static_cast<int>(out - buf);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void FixEOStableRXKokkos<DeviceType>::unpack_reverse_comm(int n, int *list, double *buf)
+{
+  // accumulates (adds, does not overwrite) one (uCG, uCGnew) pair per listed atom
+  uCG = atomKK->k_uCG.view<DeviceType>();
+  uCGnew = atomKK->k_uCGnew.view<DeviceType>();
+
+  const double *in = buf;
+  for (int k = 0; k < n; k++) {
+    const int atomIndex = list[k];
+
+    uCG[atomIndex] += *in++;
+    uCGnew[atomIndex] += *in++;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+namespace LAMMPS_NS {
+template class FixEOStableRXKokkos<LMPDeviceType>;
+#ifdef KOKKOS_HAVE_CUDA
+template class FixEOStableRXKokkos<LMPHostType>;
+#endif
+}
diff --git a/src/KOKKOS/fix_eos_table_rx_kokkos.h b/src/KOKKOS/fix_eos_table_rx_kokkos.h
new file mode 100644
index 0000000000..9eccd67c54
--- /dev/null
+++ b/src/KOKKOS/fix_eos_table_rx_kokkos.h
@@ -0,0 +1,152 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+
+FixStyle(eos/table/rx/kk,FixEOStableRXKokkos<LMPDeviceType>)
+FixStyle(eos/table/rx/kk/device,FixEOStableRXKokkos<LMPDeviceType>)
+FixStyle(eos/table/rx/kk/host,FixEOStableRXKokkos<LMPHostType>)
+
+#else
+
+#ifndef LMP_FIX_EOS_TABLE_RX_KOKKOS_H
+#define LMP_FIX_EOS_TABLE_RX_KOKKOS_H
+
+#include "fix_eos_table_rx.h"
+#include "kokkos_type.h"
+
+namespace LAMMPS_NS {
+
+template<class DeviceType>
+class FixEOStableRXKokkos : public FixEOStableRX {
+ public:
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;
+  typedef EV_FLOAT value_type;
+
+  FixEOStableRXKokkos(class LAMMPS *, int, char **);
+  virtual ~FixEOStableRXKokkos();
+  void setup(int);
+  void init();
+  void post_integrate();
+  void end_of_step();
+  // energy_lookup(id, thetai, ui): sets ui for particle id at temperature thetai
+  KOKKOS_INLINE_FUNCTION
+  void energy_lookup(int, double, double &) const;
+  // temperature_lookup(id, ui, thetai): thetai is the starting guess on entry, the result on exit
+  KOKKOS_INLINE_FUNCTION
+  void temperature_lookup(int, double, double &) const;
+
+ protected:
+  //struct Table {
+  //  int ninput;
+  //  double lo,hi;
+  //  double *rfile,*efile;
+  //  double *e2file;
+  //  double delta,invdelta,deltasq6;
+  //  double *r,*e,*de,*e2;
+  //};
+  //Table *tables, *tables2;
+
+  void allocate();
+
+  //double *dHf;
+  // per-atom views, re-fetched from atomKK by each method that uses them
+  typename AT::t_int_1d mask;
+  typename AT::t_efloat_1d uCond,uMech,uChem,uCG,uCGnew,rho,dpdTheta,duChem;
+  // buffer pack/unpack callbacks for comm->forward_comm_fix / comm->reverse_comm_fix
+  int pack_reverse_comm(int, int, double *);
+  void unpack_reverse_comm(int, int *, double *);
+  int pack_forward_comm(int , int *, double *, int, int *);
+  void unpack_forward_comm(int , int , double *);
+
+  //int *eosSpecies;
+  };
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Illegal ... command
+
+Self-explanatory.  Check the input script syntax and compare to the
+documentation for the command.  You can use -echo screen as a
+command-line option when running LAMMPS to see the offending line.
+
+E: FixEOStableRXKokkos requires a fix rx command.
+
+The fix rx command must come before the pair style command in the input file
+
+E:  There are no rx species specified
+
+There must be at least one species specified through the fix rx command
+
+E:  Invalid eos/table/rx length
+
+The eos/table/rx table must have more than one entry.
+
+E:  eos/table/rx values are not increasing
+
+The equation-of-state must be an increasing function.
+
+E:  Internal temperature <= zero.
+
+Self-explanatory.
+
+E:  Cannot open eos table/rx potential file %s
+
+Self-explanatory.
+
+E:  Incorrect format in eos table/rx file
+
+Self-explanatory.
+
+E:  Cannot open file %s
+
+Self-explanatory.
+
+E:  Did not find keyword in table file
+
+Self-explanatory.
+
+E:  Illegal fix eos/table/rx command
+
+Incorrect number of arguments specified for the fix eos/table/rx command.
+
+E:  Invalid keyword in fix eos/table/rx parameters
+
+Self-explanatory.
+
+E:  The number of columns in fix eos/table/rx does not match the number of species.
+
+Self-explanatory.  Check format for fix eos/table/rx file.
+
+E:  fix eos/table/rx parameters did not set N
+
+The number of table entries was not set in the eos/table/rx file
+
+W:  Secant solver did not converge because table bounds were exceeded
+
+The secant solver failed to converge, so the lower or upper table bound temperature is returned.
+
+E: NaN detected in secant solver.
+
+Self-explanatory.
+
+E: Maxit exceeded in secant solver
+
+The maximum number of iterations was exceeded in the secant solver.
+
+*/
diff --git a/src/KOKKOS/pair_exp6_rx_kokkos.cpp b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
index 754fa4667d..a7d5569537 100644
--- a/src/KOKKOS/pair_exp6_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
@@ -31,6 +31,8 @@
 #include "modify.h"
 #include "fix.h"
 #include <float.h>
+#include "atom_masks.h"
+#include "neigh_request.h"
 
 using namespace LAMMPS_NS;
 using namespace MathConst;
@@ -50,7 +52,10 @@ using namespace MathSpecial;
 template<class DeviceType>
 PairExp6rxKokkos<DeviceType>::PairExp6rxKokkos(LAMMPS *lmp) : PairExp6rx(lmp)
 {
-
+  atomKK = (AtomKokkos *) atom;
+  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
+  datamask_read = X_MASK | F_MASK | TYPE_MASK | ENERGY_MASK | VIRIAL_MASK;
+  datamask_modify = F_MASK | ENERGY_MASK | VIRIAL_MASK;
 }
 
 /* ---------------------------------------------------------------------- */
@@ -63,6 +68,39 @@ PairExp6rxKokkos<DeviceType>::~PairExp6rxKokkos()
 
 /* ---------------------------------------------------------------------- */
 
+template<class DeviceType>
+void PairExp6rxKokkos<DeviceType>::init_style()
+{
+  PairExp6rx::init_style();   // parent setup; naming this class here would recurse forever
+
+  // irequest = neigh request made by parent class (taken to be the most recent request)
+
+  neighflag = lmp->kokkos->neighflag;
+  int irequest = neighbor->nrequest - 1;
+
+  neighbor->requests[irequest]->
+    kokkos_host = Kokkos::Impl::is_same<DeviceType,LMPHostType>::value &&
+    !Kokkos::Impl::is_same<DeviceType,LMPDeviceType>::value;
+  neighbor->requests[irequest]->
+    kokkos_device = Kokkos::Impl::is_same<DeviceType,LMPDeviceType>::value;
+
+  if (neighflag == FULL) {
+    neighbor->requests[irequest]->full = 1;
+    neighbor->requests[irequest]->half = 0;
+    neighbor->requests[irequest]->full_cluster = 0;
+    neighbor->requests[irequest]->ghost = 1;
+  } else if (neighflag == HALF || neighflag == HALFTHREAD) {
+    neighbor->requests[irequest]->full = 0;
+    neighbor->requests[irequest]->half = 1;
+    neighbor->requests[irequest]->full_cluster = 0;
+    neighbor->requests[irequest]->ghost = 1;
+  } else {
+    error->all(FLERR,"Cannot use chosen neighbor list style with exp6/rx/kk");
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
 template<class DeviceType>
 void PairExp6rxKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
 {
@@ -270,14 +308,14 @@ void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxCompute<NEIGHFLAG,NEW
     rsq = delx*delx + dely*dely + delz*delz;
     jtype = type[j];
 
-    if (rsq < cutsq[itype][jtype]) {
+    if (rsq < d_cutsq(itype,jtype)) { // optimize
       r2inv = 1.0/rsq;
       r6inv = r2inv*r2inv*r2inv;
 
       r = sqrt(rsq);
-      rCut2inv = 1.0/cutsq[itype][jtype];
+      rCut2inv = 1.0/d_cutsq(itype,jtype);
       rCut6inv = rCut2inv*rCut2inv*rCut2inv;
-      rCut = sqrt(cutsq[itype][jtype]);
+      rCut = sqrt(d_cutsq(itype,jtype));
       rCutInv = 1.0/rCut;
 
       //
diff --git a/src/KOKKOS/pair_exp6_rx_kokkos.h b/src/KOKKOS/pair_exp6_rx_kokkos.h
index 4ff055123c..b0fbd3d9e5 100644
--- a/src/KOKKOS/pair_exp6_rx_kokkos.h
+++ b/src/KOKKOS/pair_exp6_rx_kokkos.h
@@ -63,7 +63,11 @@ class PairExp6rxKokkos : public PairExp6rx {
 
   PairExp6rxKokkos(class LAMMPS *);
   virtual ~PairExp6rxKokkos();
-  virtual void compute(int, int);
+  void compute(int, int);
+  void init_style();
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPairExp6rxgetParamsEXP6, const int&) const;
 
   template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
   KOKKOS_INLINE_FUNCTION
@@ -73,9 +77,6 @@ class PairExp6rxKokkos : public PairExp6rx {
   KOKKOS_INLINE_FUNCTION
   void operator()(TagPairExp6rxCompute<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int&) const;
 
-  KOKKOS_INLINE_FUNCTION
-  void operator()(TagPairExp6rxgetParamsEXP6, const int&) const;
-
   template<int NEIGHFLAG, int NEWTON_PAIR>
   KOKKOS_INLINE_FUNCTION
   void ev_tally(EV_FLOAT &ev, const int &i, const int &j,
diff --git a/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp b/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp
new file mode 100644
index 0000000000..de70ae86f5
--- /dev/null
+++ b/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp
@@ -0,0 +1,791 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------------------------
+   Contributing authors:
+   Stan Moore (Sandia)
+
+   Please cite the related publications:
+   J.D. Moore, B.C. Barnes, S. Izvekov, M. Lisal, M.S. Sellers, D.E. Taylor & J.K. Brennan
+   "A coarse-grain force field for RDX: Density dependent and energy conserving"
+   The Journal of Chemical Physics, 2016, 144, 104501.
+------------------------------------------------------------------------------------------- */
+
+#include <mpi.h>
+#include <math.h>
+#include "math_const.h"
+#include <stdlib.h>
+#include <string.h>
+#include "pair_multi_lucy_rx_kokkos.h"
+#include "atom_kokkos.h"
+#include "force.h"
+#include "comm.h"
+#include "neigh_list.h"
+#include "memory.h"
+#include "error.h"
+#include "citeme.h"
+#include "modify.h"
+#include "fix.h"
+#include "atom_masks.h"
+#include "neigh_request.h"
+
+using namespace LAMMPS_NS;
+
+enum{NONE,RLINEAR,RSQ};
+
+#define MAXLINE 1024
+
+#define oneFluidParameter (-1)
+#define isOneFluid(_site) ( (_site) == oneFluidParameter )
+
+static const char cite_pair_multi_lucy_rx[] =
+  "pair_style multi/lucy/rx command:\n\n"
+  "@Article{Moore16,\n"
+  " author = {J.D. Moore, B.C. Barnes, S. Izvekov, M. Lisal, M.S. Sellers, D.E. Taylor and J. K. Brennan},\n"
+  " title = {A coarse-grain force field for RDX:  Density dependent and energy conserving},\n"
+  " journal = {J. Chem. Phys.},\n"
+  " year =    2016,\n"
+  " volume =  144,\n"
+  " pages =   {104501}\n"
+  "}\n\n";
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+PairMultiLucyRXKokkos<DeviceType>::PairMultiLucyRXKokkos(LAMMPS *lmp) : PairMultiLucyRX(lmp)
+{
+  respa_enable = 0;
+
+  atomKK = (AtomKokkos *) atom;
+  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
+  datamask_read = X_MASK | F_MASK | TYPE_MASK | ENERGY_MASK | VIRIAL_MASK;
+  datamask_modify = F_MASK | ENERGY_MASK | VIRIAL_MASK;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+PairMultiLucyRXKokkos<DeviceType>::~PairMultiLucyRXKokkos()
+{
+
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairMultiLucyRXKokkos<DeviceType>::init_style()
+{
+  PairMultiLucyRX::init_style();
+
+  // irequest = neigh request made by parent class (taken to be the most recent request)
+
+  neighflag = lmp->kokkos->neighflag;
+  int irequest = neighbor->nrequest - 1;
+
+  neighbor->requests[irequest]->
+    kokkos_host = Kokkos::Impl::is_same<DeviceType,LMPHostType>::value &&
+    !Kokkos::Impl::is_same<DeviceType,LMPDeviceType>::value;
+  neighbor->requests[irequest]->
+    kokkos_device = Kokkos::Impl::is_same<DeviceType,LMPDeviceType>::value;
+
+  if (neighflag == FULL) {
+    neighbor->requests[irequest]->full = 1;
+    neighbor->requests[irequest]->half = 0;
+    neighbor->requests[irequest]->full_cluster = 0;
+    neighbor->requests[irequest]->ghost = 1;
+  } else if (neighflag == HALF || neighflag == HALFTHREAD) {
+    neighbor->requests[irequest]->full = 0;
+    neighbor->requests[irequest]->half = 1;
+    neighbor->requests[irequest]->full_cluster = 0;
+    neighbor->requests[irequest]->ghost = 1;
+  } else {
+    error->all(FLERR,"Cannot use chosen neighbor list style with multi/lucy/rx/kk");
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairMultiLucyRXKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
+{
+  eflag = eflag_in;
+  vflag = vflag_in;
+
+  // *this is copied by value into the Kokkos dispatches below; mark copies as
+  // in flight until the matching copymode = 0 at the end of this function
+  // NOTE(review): assumes the destructors honor copymode as in other KOKKOS styles
+  copymode = 1;
+  if (neighflag == FULL) no_virial_fdotr_compute = 1;
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  else evflag = vflag_fdotr = 0;
+
+  // reallocate per-atom arrays if necessary
+
+  if (eflag_atom) {
+    memory->destroy_kokkos(k_eatom,eatom);
+    memory->create_kokkos(k_eatom,eatom,maxeatom,"pair:eatom");
+    d_eatom = k_eatom.d_view;
+  }
+  if (vflag_atom) {
+    memory->destroy_kokkos(k_vatom,vatom);
+    memory->create_kokkos(k_vatom,vatom,maxvatom,6,"pair:vatom");
+    d_vatom = k_vatom.d_view;
+  }
+
+  x = atomKK->k_x.view<DeviceType>();
+  f = atomKK->k_f.view<DeviceType>();
+  type = atomKK->k_type.view<DeviceType>();
+  uCG = atomKK->k_uCG.view<DeviceType>();
+  uCGnew = atomKK->k_uCGnew.view<DeviceType>();
+  dvector = atomKK->k_dvector.view<DeviceType>();
+  rho = atomKK->k_rho.view<DeviceType>();
+
+  nlocal = atom->nlocal;
+  int nghost = atom->nghost;
+  int newton_pair = force->newton_pair;
+
+  {
+    const int ntotal = nlocal + nghost;
+    d_fractionOld1 = typename AT::t_float_1d("PairMultiLucyRX::fractionOld1",ntotal);
+    d_fractionOld2 = typename AT::t_float_1d("PairMultiLucyRX::fractionOld2",ntotal);
+    d_fraction1 = typename AT::t_float_1d("PairMultiLucyRX::fraction1",ntotal);
+    d_fraction2 = typename AT::t_float_1d("PairMultiLucyRX::fraction2",ntotal);
+
+    Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXgetParams>(0,ntotal),*this);
+  }
+
+  const int inum = list->inum;
+  NeighListKokkos<DeviceType>* k_list = static_cast<NeighListKokkos<DeviceType>*>(list);
+  d_numneigh = k_list->d_numneigh;
+  d_neighbors = k_list->d_neighbors;
+  d_ilist = k_list->d_ilist;
+
+  computeLocalDensity();
+
+  // loop over neighbors of my atoms
+
+  EV_FLOAT ev;
+
+  if (evflag) {
+    Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXCompute<HALF,1,1> >(0,inum),*this,ev);
+  } else {
+    Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXCompute<HALF,1,0> >(0,inum),*this);
+  }
+
+  if (eflag_global) eng_vdwl += ev.evdwl;
+  if (vflag_global) {
+    virial[0] += ev.v[0];
+    virial[1] += ev.v[1];
+    virial[2] += ev.v[2];
+    virial[3] += ev.v[3];
+    virial[4] += ev.v[4];
+    virial[5] += ev.v[5];
+  }
+
+  if (vflag_fdotr) pair_virial_fdotr_compute(this);
+
+  if (eflag_atom) {
+    k_eatom.template modify<DeviceType>();
+    k_eatom.template sync<LMPHostType>();
+  }
+
+  if (vflag_atom) {
+    k_vatom.template modify<DeviceType>();
+    k_vatom.template sync<LMPHostType>();
+  }
+
+  copymode = 0;
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXgetParams, const int &i) const {
+  getParams(i, d_fractionOld1[i], d_fractionOld2[i], d_fraction1[i], d_fraction2[i]);
+}
+
+template<class DeviceType>
+template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
+KOKKOS_INLINE_FUNCTION
+void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXCompute<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int &ii, EV_FLOAT& ev) const {
+  int i,j,jj,inum,jnum,itype,jtype,itable;
+  double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,evdwlOld,fpair;
+  double rsq;
+  // per-atom fractions of atoms i and j, read from the d_fraction* views filled via getParams()
+  double fractionOld1_i,fractionOld1_j;
+  double fractionOld2_i,fractionOld2_j;
+  double fraction1_i;
+
+  double pi = MathConst::MY_PI;
+  double A_i, A_j;
+  double fraction_i,fraction_j;
+  int jtable;
+
+  Table *tb;
+
+  int tlm1 = tablength - 1;
+
+  i = d_ilist[ii];
+  xtmp = x(i,0);
+  ytmp = x(i,1);
+  ztmp = x(i,2);
+  itype = type[i];
+  jnum = d_numneigh[i];
+  // the force on atom i is accumulated in locals and added to f(i,:) once after the loop
+  double fx_i = 0.0;
+  double fy_i = 0.0;
+  double fz_i = 0.0;
+
+  fractionOld1_i = d_fractionOld1[i];
+  fractionOld2_i = d_fractionOld2[i];
+  fraction1_i = d_fraction1[i];
+
+  for (jj = 0; jj < jnum; jj++) {
+    int j = d_neighbors(i,jj);
+    j &= NEIGHMASK;
+
+    delx = xtmp - x(j,0);
+    dely = ytmp - x(j,1);
+    delz = ztmp - x(j,2);
+    rsq = delx*delx + dely*dely + delz*delz;
+    jtype = type[j];
+
+    if (rsq < d_cutsq(itype,jtype)) { // optimize
+      fpair = 0.0;
+
+      fractionOld1_j = d_fractionOld1[j];
+      fractionOld2_j = d_fractionOld2[j];
+
+      tb = &tables[tabindex[itype][jtype]];
+      if (rho[i]*rho[i] < tb->innersq || rho[j]*rho[j] < tb->innersq){
+        //printf("Table inner cutoff = %lf\n",sqrt(tb->innersq));
+        //printf("rho[%d]=%lf\n",i,rho[i]);
+        //printf("rho[%d]=%lf\n",j,rho[j]);
+        error->one(FLERR,"Density < table inner cutoff");
+      }
+      if (tabstyle == LOOKUP) {
+        itable = static_cast<int> (((rho[i]*rho[i]) - tb->innersq) * tb->invdelta);
+        jtable = static_cast<int> (((rho[j]*rho[j]) - tb->innersq) * tb->invdelta);
+        if (itable >= tlm1 || jtable >= tlm1){
+          //printf("Table outer index = %d\n",tlm1);
+          //printf("itableIndex=%d rho[%d]=%lf\n",itable,i,rho[i]);
+          //printf("jtableIndex=%d rho[%d]=%lf\n",jtable,j,rho[j]);
+          error->one(FLERR,"Density > table outer cutoff");
+        }
+        A_i = tb->f[itable];
+        A_j = tb->f[jtable];
+
+        const double rfactor = 1.0-sqrt(rsq/d_cutsq(itype,jtype));
+        fpair = 0.5*(A_i + A_j)*(4.0-3.0*rfactor)*rfactor*rfactor*rfactor;
+        fpair /= sqrt(rsq);
+
+      } else if (tabstyle == LINEAR) {
+        itable = static_cast<int> ((rho[i]*rho[i] - tb->innersq) * tb->invdelta);
+        jtable = static_cast<int> (((rho[j]*rho[j]) - tb->innersq) * tb->invdelta);
+        if (itable >= tlm1 || jtable >= tlm1){
+          //printf("Table outer index = %d\n",tlm1);
+          //printf("itableIndex=%d rho[%d]=%lf\n",itable,i,rho[i]);
+          //printf("jtableIndex=%d rho[%d]=%lf\n",jtable,j,rho[j]);
+          error->one(FLERR,"Density > table outer cutoff");
+        }
+        if(itable<0) itable=0;
+        if(itable>=tlm1) itable=tlm1;
+        if(jtable<0) jtable=0;
+        if(jtable>=tlm1)jtable=tlm1;
+
+        fraction_i = (((rho[i]*rho[i]) - tb->rsq[itable]) * tb->invdelta);
+        fraction_j = (((rho[j]*rho[j]) - tb->rsq[jtable]) * tb->invdelta);
+        if(itable==0) fraction_i=0.0;
+        if(itable==tlm1) fraction_i=0.0;
+        if(jtable==0) fraction_j=0.0;
+        if(jtable==tlm1) fraction_j=0.0;
+
+        A_i = tb->f[itable] + fraction_i*tb->df[itable];
+        A_j = tb->f[jtable] + fraction_j*tb->df[jtable];
+
+        const double rfactor = 1.0-sqrt(rsq/d_cutsq(itype,jtype));
+        fpair = 0.5*(A_i + A_j)*(4.0-3.0*rfactor)*rfactor*rfactor*rfactor;
+        fpair /= sqrt(rsq);
+
+      } else error->one(FLERR,"Only LOOKUP and LINEAR table styles have been implemented for pair multi/lucy/rx");
+
+      if (isite1 == isite2) fpair = sqrt(fractionOld1_i*fractionOld2_j)*fpair;
+      else fpair = (sqrt(fractionOld1_i*fractionOld2_j) + sqrt(fractionOld2_i*fractionOld1_j))*fpair;
+
+      fx_i += delx*fpair;
+      fy_i += dely*fpair;
+      fz_i += delz*fpair;
+      if (NEWTON_PAIR || j < nlocal) {
+        f(j,0) -= delx*fpair;
+        f(j,1) -= dely*fpair;
+        f(j,2) -= delz*fpair;
+      }
+      //if (evflag) ev_tally(i,j,nlocal,newton_pair,0.0,0.0,fpair,delx,dely,delz);
+      if (EVFLAG) this->template ev_tally<NEIGHFLAG,NEWTON_PAIR>(ev,i,j,0.0,fpair,delx,dely,delz);
+    }
+  }
+
+  f(i,0) += fx_i;
+  f(i,1) += fy_i;
+  f(i,2) += fz_i;
+  // self term: tabulated energy at atom i's own density, scaled below by pi*cutsq^2/84
+  tb = &tables[tabindex[itype][itype]];
+  itable = static_cast<int> (((rho[i]*rho[i]) - tb->innersq) * tb->invdelta);
+  if (tabstyle == LOOKUP) evdwl = tb->e[itable];
+  else if (tabstyle == LINEAR){
+    if (itable >= tlm1){
+      //printf("itableIndex=%d rho[%d]=%lf\n",itable,i,rho[i]);
+      error->one(FLERR,"Density > table outer cutoff");
+    }
+    if(itable==0) fraction_i=0.0;
+    else fraction_i = (((rho[i]*rho[i]) - tb->rsq[itable]) * tb->invdelta);
+    evdwl = tb->e[itable] + fraction_i*tb->de[itable];
+  } else error->one(FLERR,"Only LOOKUP and LINEAR table styles have been implemented for pair multi/lucy/rx");
+
+  evdwl *=(pi*d_cutsq(itype,itype)*d_cutsq(itype,itype))/84.0;
+  evdwlOld = fractionOld1_i*evdwl;
+  evdwl = fraction1_i*evdwl;
+  // uCG receives the fractionOld1-weighted energy, uCGnew the fraction1-weighted energy
+  uCG[i] += evdwlOld;
+  uCGnew[i] += evdwl;
+
+  evdwl = evdwlOld;
+
+  //if (evflag) ev_tally(0,0,nlocal,newton_pair,evdwl,0.0,0.0,0.0,0.0,0.0);
+  if (EVFLAG) ev.evdwl += evdwl;
+}
+
+template<class DeviceType>
+template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
+KOKKOS_INLINE_FUNCTION
+void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXCompute<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int &ii) const {
+  EV_FLOAT ev;
+  this->template operator()<NEIGHFLAG,NEWTON_PAIR,EVFLAG>(TagPairMultiLucyRXCompute<NEIGHFLAG,NEWTON_PAIR,EVFLAG>(), ii, ev);
+}
+
+/* ----------------------------------------------------------------------
+   set coeffs for one or more type pairs: arg = itypes jtypes file keyword site1 site2 [cutoff]
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairMultiLucyRXKokkos<DeviceType>::coeff(int narg, char **arg)
+{
+  if (narg != 6 && narg != 7) error->all(FLERR,"Illegal pair_coeff command");
+
+  bool rx_flag = false;
+  for (int i = 0; i < modify->nfix; i++)
+    if (strncmp(modify->fix[i]->style,"rx",2) == 0) rx_flag = true;
+  if (!rx_flag) error->all(FLERR,"PairMultiLucyRXKokkos<DeviceType> requires a fix rx command.");
+
+  if (!allocated) allocate();
+
+  int ilo,ihi,jlo,jhi;
+  force->bounds(FLERR,arg[0],atom->ntypes,ilo,ihi);
+  force->bounds(FLERR,arg[1],atom->ntypes,jlo,jhi);
+
+  int me;
+  MPI_Comm_rank(world,&me);
+  tables = (Table *)
+    memory->srealloc(tables,(ntables+1)*sizeof(Table),"pair:tables");
+  Table *tb = &tables[ntables];
+  null_table(tb);
+  if (me == 0) read_table(tb,arg[2],arg[3]);
+  bcast_table(tb);
+
+  nspecies = atom->nspecies_dpd;
+
+  // size each name buffer from the string that is actually copied into it
+  site1 = new char[strlen(arg[4]) + 1];
+  strcpy(site1,arg[4]);
+
+  // NOTE(review): a repeated pair_coeff call reallocates site1/site2 without freeing the old buffers
+  site2 = new char[strlen(arg[5]) + 1];
+  strcpy(site2,arg[5]);
+
+  // set table cutoff
+
+  if (narg == 7) tb->cut = force->numeric(FLERR,arg[6]);
+  else if (tb->rflag) tb->cut = tb->rhi;
+  else tb->cut = tb->rfile[tb->ninput-1];
+
+  // error check on table parameters
+  // insure cutoff is within table
+
+  if (tb->ninput <= 1) error->one(FLERR,"Invalid pair table length");
+  if (tb->rflag == 0) {
+    rho_0 = tb->rfile[0];
+  } else {
+    rho_0 = tb->rlo;
+  }
+
+  tb->match = 0;
+  if (tabstyle == LINEAR && tb->ninput == tablength &&
+      tb->rflag == RSQ) tb->match = 1;
+
+  // spline read-in values and compute r,e,f vectors within table
+
+  if (tb->match == 0) spline_table(tb);
+  compute_table(tb);
+
+  // store ptr to table in tabindex
+
+  int count = 0;
+  for (int i = ilo; i <= ihi; i++) {
+    for (int j = MAX(jlo,i); j <= jhi; j++) {
+      tabindex[i][j] = ntables;
+      setflag[i][j] = 1;
+      count++;
+    }
+  }
+
+  if (count == 0) error->all(FLERR,"Illegal pair_coeff command");
+  ntables++;
+
+  // Match site* to isite values.
+
+  if (strcmp(site1, "1fluid") == 0)
+     isite1 = oneFluidParameter;
+  else {
+     isite1 = nspecies;
+     for (int ispecies = 0; ispecies < nspecies; ++ispecies)
+        if (strcmp(site1, atom->dname[ispecies]) == 0){
+           isite1 = ispecies;
+           break;
+        }
+
+     if (isite1 == nspecies)
+        error->all(FLERR,"Pair_multi_lucy_rx site1 is invalid.");
+  }
+
+  if (strcmp(site2, "1fluid") == 0)
+     isite2 = oneFluidParameter;
+  else {
+     isite2 = nspecies;
+     for (int ispecies = 0; ispecies < nspecies; ++ispecies)
+        if (strcmp(site2, atom->dname[ispecies]) == 0){
+           isite2 = ispecies;
+           break;
+        }
+
+     if (isite2 == nspecies)
+        error->all(FLERR,"Pair_multi_lucy_rx site2 is invalid.");
+  }
+
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairMultiLucyRXKokkos<DeviceType>::computeLocalDensity()
+{
+  x = atomKK->k_x.view<DeviceType>();
+  type = atomKK->k_type.view<DeviceType>();
+  rho = atomKK->k_rho.view<DeviceType>();
+  nlocal = atom->nlocal;
+
+  // TODO(review): "sync" placeholder - no atomKK sync call is made here yet
+
+  const int inum = list->inum;
+  NeighListKokkos<DeviceType>* k_list = static_cast<NeighListKokkos<DeviceType>*>(list);
+  d_numneigh = k_list->d_numneigh;
+  d_neighbors = k_list->d_neighbors;
+  d_ilist = k_list->d_ilist;
+
+  const double pi = MathConst::MY_PI;
+
+  const bool newton_pair = force->newton_pair;
+  one_type = (atom->ntypes == 1);
+
+  // Special cut-off values for when there's only one type.
+  cutsq_type11 = cutsq[1][1];
+  rcut_type11 = sqrt(cutsq_type11);
+  factor_type11 = 84.0/(5.0*pi*rcut_type11*rcut_type11*rcut_type11);
+
+  // zero out density (local atoms, plus ghost atoms when newton_pair is on)
+  int m = nlocal;
+  if (newton_pair) m += atom->nghost;
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXZero>(0,m),*this);
+
+  // rho = density at each atom
+  // loop over neighbors of my atoms
+  if (newton_pair)
+    Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXComputeLocalDensity<HALF,1> >(0,inum),*this);
+  else
+    Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXComputeLocalDensity<HALF,0> >(0,inum),*this);
+  // reverse comm only when newton_pair is on, then forward comm of rho (see pack_forward_comm)
+  if (newton_pair) comm->reverse_comm_pair(this);
+
+  comm->forward_comm_pair(this);
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXZero, const int &i) const {
+  rho[i] = 0.0;
+}
+
+template<class DeviceType>
+template<int NEIGHFLAG, int NEWTON_PAIR>
+KOKKOS_INLINE_FUNCTION
+void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXComputeLocalDensity<NEIGHFLAG,NEWTON_PAIR>, const int &ii) const {
+  const int i = d_ilist[ii];
+  // accumulates the weighted density contribution of every neighbor pair of atom i into rho
+  const double xtmp = x(i,0);
+  const double ytmp = x(i,1);
+  const double ztmp = x(i,2);
+
+  double rho_i = rho[i];
+
+  const int itype = type[i];
+  const int jnum = d_numneigh[i];
+
+  const double pi = MathConst::MY_PI;
+  // NOTE(review): rho[j] below is updated without atomics - confirm this is safe for the execution space
+  for (int jj = 0; jj < jnum; jj++){
+    const int j = (d_neighbors(i,jj) & NEIGHMASK);
+    const int jtype = type[j];
+
+    const double delx = xtmp - x(j,0);
+    const double dely = ytmp - x(j,1);
+    const double delz = ztmp - x(j,2);
+    const double rsq = delx*delx + dely*dely + delz*delz;
+    // single-type systems use the cached type-1 constants; otherwise use the per-pair cutoff
+    if (one_type) {
+      if (rsq < cutsq_type11) {
+        const double rcut = rcut_type11;
+        const double r_over_rcut = sqrt(rsq) / rcut;
+        const double tmpFactor = 1.0 - r_over_rcut;
+        const double tmpFactor4 = tmpFactor*tmpFactor*tmpFactor*tmpFactor;
+        const double factor = factor_type11*(1.0 + 1.5*r_over_rcut)*tmpFactor4;
+        rho_i += factor;
+        if (NEWTON_PAIR || j < nlocal)
+          rho[j] += factor;
+      }
+    } else if (rsq < d_cutsq(itype,jtype)) {
+      const double rcut = sqrt(d_cutsq(itype,jtype));
+      const double tmpFactor = 1.0-sqrt(rsq)/rcut;
+      const double tmpFactor4 = tmpFactor*tmpFactor*tmpFactor*tmpFactor;
+      const double factor = (84.0/(5.0*pi*rcut*rcut*rcut))*(1.0+3.0*sqrt(rsq)/(2.0*rcut))*tmpFactor4;
+      rho_i += factor;
+      if (NEWTON_PAIR || j < nlocal)
+        rho[j] += factor;
+    }
+  }
+
+  rho[i] = rho_i;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairMultiLucyRXKokkos<DeviceType>::getParams(int id, double &fractionOld1, double &fractionOld2, double &fraction1, double &fraction2) const
+{
+  // Species totals for atom `id`: rows [0,nspecies) of dvector go into the
+  // current total, rows [nspecies,2*nspecies) into the "old" total.
+  double totalNow = 0.0;
+  double totalOld = 0.0;
+  for (int s = 0; s < nspecies; ++s) {
+    totalNow += dvector(s,id);
+    totalOld += dvector(s+nspecies,id);
+  }
+
+  const bool site1IsOneFluid = isOneFluid(isite1);
+  const bool site2IsOneFluid = isOneFluid(isite2);
+
+  // A site for which isOneFluid() is false takes its own species fraction.
+  if (!site1IsOneFluid) {
+    fractionOld1 = dvector(isite1+nspecies,id)/totalOld;
+    fraction1 = dvector(isite1,id)/totalNow;
+  }
+  if (!site2IsOneFluid) {
+    fractionOld2 = dvector(isite2+nspecies,id)/totalOld;
+    fraction2 = dvector(isite2,id)/totalNow;
+  }
+
+  // Done unless at least one site is flagged by isOneFluid().
+  if (!site1IsOneFluid && !site2IsOneFluid) return;
+
+  // A flagged site receives the summed fractions of every species other than isite1 and isite2.
+  double restOld = 0.0;
+  double restNow = 0.0;
+  for (int s = 0; s < nspecies; ++s) {
+    if (s != isite1 && s != isite2) {
+      restOld += dvector(s+nspecies,id) / totalOld;
+      restNow += dvector(s,id) / totalNow;
+    }
+  }
+
+  if (site1IsOneFluid) { fractionOld1 = restOld; fraction1 = restNow; }
+  if (site2IsOneFluid) { fractionOld2 = restOld; fraction2 = restNow; }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+int PairMultiLucyRXKokkos<DeviceType>::pack_forward_comm(int n, int *list, double *buf, int pbc_flag, int *pbc)
+{
+  // Copies rho[list[k]] into buf[k] for k in [0,n) and returns the number of values written; pbc_flag and pbc are not used.
+  // NOTE(review): rho is the DeviceType view yet is indexed directly here -- confirm it is host-accessible at this point.
+  rho = atomKK->k_rho.view<DeviceType>();
+  int count = 0;
+  while (count < n) {
+    buf[count] = rho[list[count]];
+    ++count;
+  }
+  return count;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairMultiLucyRXKokkos<DeviceType>::unpack_forward_comm(int n, int first, double *buf)
+{
+  // Refreshes the rho member from atomKK->k_rho, then stores buf[0..n) into rho[first..first+n).
+  rho = atomKK->k_rho.view<DeviceType>();
+
+  for (int k = 0; k < n; ++k) {
+    rho[first + k] = buf[k];
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+int PairMultiLucyRXKokkos<DeviceType>::pack_reverse_comm(int n, int first, double *buf)
+{
+  // Copies rho[first..first+n) into buf[0..n) and returns the number of values written.
+  rho = atomKK->k_rho.view<DeviceType>();
+  int count = 0;
+  for (; count < n; ++count) {
+    buf[count] = rho[first + count];
+  }
+  return count;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairMultiLucyRXKokkos<DeviceType>::unpack_reverse_comm(int n, int *list, double *buf)
+{
+  // Refreshes the rho member from atomKK->k_rho, then adds buf[k] onto
+  // rho[list[k]] for k in [0,n); an index that repeats in list accumulates.
+  rho = atomKK->k_rho.view<DeviceType>();
+
+  for (int k = 0; k < n; ++k) {
+    const int target = list[k];
+    rho[target] += buf[k];
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+template<int NEIGHFLAG, int NEWTON_PAIR>
+KOKKOS_INLINE_FUNCTION
+void PairMultiLucyRXKokkos<DeviceType>::ev_tally(EV_FLOAT &ev, const int &i, const int &j,
+      const F_FLOAT &epair, const F_FLOAT &fpair, const F_FLOAT &delx,
+                const F_FLOAT &dely, const F_FLOAT &delz) const
+{
+  // Tally one i-j pair into the per-atom energy and the global/per-atom virial.
+  //   ev        : accumulator passed in by the caller; only ev.v[0..5] is updated here
+  //   i, j      : indices of the two atoms of the pair
+  //   epair     : pair energy; half of it is credited to each tallied atom
+  //   fpair     : scalar multiplying the separation products in the virial terms
+  //   delx/y/z  : components of the separation passed in by the caller
+  //   NEIGHFLAG, NEWTON_PAIR : compile-time switches used in the tally conditions
+  // The eflag, eflag_atom, vflag_either, vflag_global and vflag_atom members
+  // select which of the tallies below are performed.
+
+  // Views of the per-atom arrays; atomicity is selected by AtomicF<NEIGHFLAG>::value.
+  Kokkos::View<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > v_eatom = k_eatom.view<DeviceType>();
+  Kokkos::View<F_FLOAT*[6], typename DAT::t_virial_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > v_vatom = k_vatom.view<DeviceType>();
+
+  // With a FULL list only atom i is tallied.  Otherwise each atom is tallied
+  // when NEWTON_PAIR is set or its index is below nlocal.
+  const bool full_list = (NEIGHFLAG == FULL);
+  const bool tally_i = full_list || NEWTON_PAIR || i < nlocal;
+  const bool tally_j = !full_list && (NEWTON_PAIR || j < nlocal);
+
+  // Per-atom energy.
+  if (eflag && eflag_atom) {
+    const E_FLOAT half_epair = 0.5 * epair;
+    if (tally_i) {
+      v_eatom[i] += half_epair;
+    }
+    if (tally_j) {
+      v_eatom[j] += half_epair;
+    }
+  }
+
+  // Everything below is virial bookkeeping.
+  if (!vflag_either) return;
+
+  // Six products in the order xx, yy, zz, xy, xz, yz.
+  E_FLOAT vir[6];
+  vir[0] = delx*delx*fpair;
+  vir[1] = dely*dely*fpair;
+  vir[2] = delz*delz*fpair;
+  vir[3] = delx*dely*fpair;
+  vir[4] = delx*delz*fpair;
+  vir[5] = dely*delz*fpair;
+
+  // Each tally below adds half of a component (0.5*vir[k]).
+  // Global virial, accumulated into ev.
+  if (vflag_global) {
+    if (tally_i) {
+      ev.v[0] += 0.5*vir[0];
+      ev.v[1] += 0.5*vir[1];
+      ev.v[2] += 0.5*vir[2];
+      ev.v[3] += 0.5*vir[3];
+      ev.v[4] += 0.5*vir[4];
+      ev.v[5] += 0.5*vir[5];
+    }
+    if (tally_j) {
+      ev.v[0] += 0.5*vir[0];
+      ev.v[1] += 0.5*vir[1];
+      ev.v[2] += 0.5*vir[2];
+      ev.v[3] += 0.5*vir[3];
+      ev.v[4] += 0.5*vir[4];
+      ev.v[5] += 0.5*vir[5];
+    }
+  }
+
+  // Per-atom virial.
+  if (vflag_atom) {
+    if (tally_i) {
+      v_vatom(i,0) += 0.5*vir[0];
+      v_vatom(i,1) += 0.5*vir[1];
+      v_vatom(i,2) += 0.5*vir[2];
+      v_vatom(i,3) += 0.5*vir[3];
+      v_vatom(i,4) += 0.5*vir[4];
+      v_vatom(i,5) += 0.5*vir[5];
+    }
+    if (tally_j) {
+      v_vatom(j,0) += 0.5*vir[0];
+      v_vatom(j,1) += 0.5*vir[1];
+      v_vatom(j,2) += 0.5*vir[2];
+      v_vatom(j,3) += 0.5*vir[3];
+      v_vatom(j,4) += 0.5*vir[4];
+      v_vatom(j,5) += 0.5*vir[5];
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+namespace LAMMPS_NS {
+template class PairMultiLucyRXKokkos<LMPDeviceType>;  // explicit instantiation for LMPDeviceType
+#ifdef KOKKOS_HAVE_CUDA
+template class PairMultiLucyRXKokkos<LMPHostType>;  // only when KOKKOS_HAVE_CUDA is defined; presumably the two types coincide otherwise -- confirm
+#endif
+}
diff --git a/src/KOKKOS/pair_multi_lucy_rx_kokkos.h b/src/KOKKOS/pair_multi_lucy_rx_kokkos.h
new file mode 100644
index 0000000000..74a10ddee1
--- /dev/null
+++ b/src/KOKKOS/pair_multi_lucy_rx_kokkos.h
@@ -0,0 +1,215 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+// Style registration: kk and kk/device map to the LMPDeviceType instantiation, kk/host to LMPHostType.
+PairStyle(multi/lucy/rx/kk,PairMultiLucyRXKokkos<LMPDeviceType>)
+PairStyle(multi/lucy/rx/kk/device,PairMultiLucyRXKokkos<LMPDeviceType>)
+PairStyle(multi/lucy/rx/kk/host,PairMultiLucyRXKokkos<LMPHostType>)
+
+#else
+
+#ifndef LMP_PAIR_MULTI_LUCY_RX_KOKKOS_H
+#define LMP_PAIR_MULTI_LUCY_RX_KOKKOS_H
+
+
+#include "pair_multi_lucy_rx.h"
+#include "pair_kokkos.h"
+#include "kokkos_type.h"
+
+namespace LAMMPS_NS {
+
+struct TagPairMultiLucyRXgetParams{};  // empty tag type selecting the getParams operator() overload
+
+template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
+struct TagPairMultiLucyRXCompute{};  // empty tag type selecting the compute operator() overloads
+
+struct TagPairMultiLucyRXZero{};  // empty tag type selecting the operator() that sets rho[i] = 0.0
+
+template<int NEIGHFLAG, int NEWTON_PAIR>
+struct TagPairMultiLucyRXComputeLocalDensity{};  // empty tag type selecting the local-density operator()
+// Kokkos variant of PairMultiLucyRX, templated on the device type; registered above as multi/lucy/rx/kk[/device|/host].
+template<class DeviceType>
+class PairMultiLucyRXKokkos : public PairMultiLucyRX {
+ public:
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;  // view typedefs for this DeviceType (used as AT::t_* below)
+  typedef EV_FLOAT value_type;  // same type as the EV_FLOAT& argument of the compute operator()
+
+  PairMultiLucyRXKokkos(class LAMMPS *);
+  virtual ~PairMultiLucyRXKokkos();
+  // Pair-style entry points and the communication pack/unpack routines.
+  void compute(int, int);
+  void init_style();
+  void coeff(int, char **);
+  int pack_forward_comm(int, int *, double *, int, int *);
+  void unpack_forward_comm(int, int, double *);
+  int pack_reverse_comm(int, int, double *);
+  void unpack_reverse_comm(int, int *, double *);
+  void computeLocalDensity();
+  // Tag-dispatched operator() overloads; the type of the first argument selects the overload.
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPairMultiLucyRXgetParams, const int&) const;
+
+  template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPairMultiLucyRXCompute<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int&, EV_FLOAT&) const;
+
+  template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPairMultiLucyRXCompute<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPairMultiLucyRXZero, const int&) const;
+
+  template<int NEIGHFLAG, int NEWTON_PAIR>
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPairMultiLucyRXComputeLocalDensity<NEIGHFLAG,NEWTON_PAIR>, const int&) const;
+
+  template<int NEIGHFLAG, int NEWTON_PAIR>
+  KOKKOS_INLINE_FUNCTION
+  void ev_tally(EV_FLOAT &ev, const int &i, const int &j,
+      const F_FLOAT &epair, const F_FLOAT &fpair, const F_FLOAT &delx,
+                  const F_FLOAT &dely, const F_FLOAT &delz) const;
+
+ private:
+  int nlocal;  // compared against atom indices in the "j < nlocal" / "i < nlocal" tally conditions
+  int neighflag;
+  int eflag,vflag;
+  // Read by the local-density operator(): single-type switch plus the cutoff^2, cutoff and prefactor it uses.
+  bool one_type;
+  double cutsq_type11;
+  double rcut_type11;
+  double factor_type11;
+
+  //struct Table {
+  //  int ninput,rflag,fpflag,match;
+  //  double rlo,rhi,fplo,fphi,cut;
+  //  double *rfile,*efile,*ffile;
+  //  double *e2file,*f2file;
+  //  double innersq,delta,invdelta,deltasq6;
+  //  double *rsq,*drsq,*e,*de,*f,*df,*e2,*f2;
+  //};
+  //Table *tables;
+  // NOTE(review): if PairMultiLucyRX already declares tabindex, this declaration hides it -- confirm intended.
+  int **tabindex;
+
+  //void read_table(Table *, char *, char *);
+  //void param_extract(Table *, char *);
+  // NOTE(review): same question for site1/site2 -- confirm they are not meant to be the base-class members.
+  char *site1, *site2;
+
+  KOKKOS_INLINE_FUNCTION
+  void getParams(int, double &, double &, double &, double &) const;  // outputs: fractionOld1, fractionOld2, fraction1, fraction2
+
+  typename AT::t_float_1d d_fractionOld1,d_fractionOld2,d_fraction1,d_fraction2;
+  // Atom-data views; x, type, rho and dvector are read in the operator()/getParams bodies.
+  typename AT::t_x_array_randomread x;
+  typename AT::t_f_array f;
+  typename AT::t_int_1d_randomread type;
+  typename AT::t_efloat_1d rho;  // re-assigned from atomKK->k_rho in the comm pack/unpack routines
+  typename AT::t_efloat_1d uCG, uCGnew;
+  typename AT::t_float_2d dvector;  // indexed as dvector(species,atom); rows >= nspecies hold the "old" values
+  // Per-atom energy/virial storage; ev_tally takes its views from k_eatom and k_vatom.
+  DAT::tdual_efloat_1d k_eatom;
+  DAT::tdual_virial_array k_vatom;
+  DAT::t_efloat_1d d_eatom;
+  DAT::t_virial_array d_vatom;
+  // Neighbor-list views indexed in the local-density operator().
+  typename AT::t_neighbors_2d d_neighbors;
+  typename AT::t_int_1d_randomread d_ilist;
+  typename AT::t_int_1d_randomread d_numneigh;
+  // Squared cutoff per type pair; d_cutsq(itype,jtype) is compared against rsq.
+  typename AT::tdual_ffloat_2d k_cutsq;
+  typename AT::t_ffloat_2d d_cutsq;
+
+  friend void pair_virial_fdotr_compute<PairMultiLucyRXKokkos>(PairMultiLucyRXKokkos*);
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Pair multi/lucy/rx command requires atom_style with density (e.g. dpd, meso)
+
+Self-explanatory
+
+E: Density < table inner cutoff
+
+The local density is smaller than the inner cutoff of the table.
+
+E: Density > table inner cutoff
+
+The local density is greater than the cutoff of the table.
+
+E: Only LOOKUP and LINEAR table styles have been implemented for pair multi/lucy/rx
+
+Self-explanatory
+
+E: Illegal ... command
+
+Self-explanatory.  Check the input script syntax and compare to the
+documentation for the command.  You can use -echo screen as a
+command-line option when running LAMMPS to see the offending line.
+
+E:  Unknown table style in pair_style command
+
+Self-explanatory
+
+E: Illegal number of pair table entries
+
+There must be at least 2 table entries.
+
+E: Illegal pair_coeff command
+
+All pair coefficients must be set in the data file or by the
+pair_coeff command before running a simulation.
+
+E: PairMultiLucyRXKokkos requires a fix rx command
+
+The fix rx command must come before the pair style command in the input file
+
+E:  There are no rx species specified
+
+There must be at least one species specified through the fix rx command
+
+E: Invalid pair table length
+
+Length of read-in pair table is invalid
+
+E: All pair coeffs are not set
+
+All pair coefficients must be set in the data file or by the
+pair_coeff command before running a simulation.
+
+E: Cannot open file %s
+
+The specified file cannot be opened.  Check that the path and name are
+correct.
+
+E: Did not find keyword in table file
+
+Keyword used in pair_coeff command was not found in table file.
+
+E: Invalid keyword in pair table parameters
+
+Keyword used in list of table parameters is not recognized.
+
+E: Pair table parameters did not set N
+
+List of pair table parameters must include N setting.
+
+*/
diff --git a/src/KOKKOS/pair_table_kokkos.cpp b/src/KOKKOS/pair_table_kokkos.cpp
index 278c5b0a2f..271490bbdd 100644
--- a/src/KOKKOS/pair_table_kokkos.cpp
+++ b/src/KOKKOS/pair_table_kokkos.cpp
@@ -12,7 +12,7 @@
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
-   Contributing author: Paul Crozier (SNL)
+   Contributing author: Christian Trott (SNL)
 ------------------------------------------------------------------------- */
 
 #include <mpi.h>
@@ -41,7 +41,7 @@ enum{FULL,HALFTHREAD,HALF};
 /* ---------------------------------------------------------------------- */
 
 template<class DeviceType>
-PairTableKokkos<DeviceType>::PairTableKokkos(LAMMPS *lmp) : Pair(lmp)
+PairTableKokkos<DeviceType>::PairTableKokkos(LAMMPS *lmp) : PairTable(lmp)
 {
   update_table = 0;
   atomKK = (AtomKokkos *) atom;
@@ -98,6 +98,7 @@ void PairTableKokkos<DeviceType>::compute_style(int eflag_in, int vflag_in)
 
   if (neighflag == FULL || neighflag == FULLCLUSTER) no_virial_fdotr_compute = 1;
 
+
   if (eflag || vflag) ev_setup(eflag,vflag);
   else evflag = vflag_fdotr = 0;
 
@@ -221,6 +222,7 @@ compute_fpair(const F_FLOAT& rsq, const int& i, const int&j, const int& itype, c
   //if (rsq < d_table_const.innersq(tidx))
   //  error->one(FLERR,"Pair distance < table inner cutoff");
 
+
   if (Specialisation::TabStyle == LOOKUP) {
     const int itable = static_cast<int> ((rsq - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx));
     //if (itable >= tlm1)
@@ -338,8 +340,6 @@ void PairTableKokkos<DeviceType>::create_kokkos_tables()
     memory->create_kokkos(d_table->drsq,h_table->drsq,ntables,ntable,"Table::drsq");
   }
 
-
-
   for(int i=0; i < ntables; i++) {
     Table* tb = &tables[i];
 
@@ -477,85 +477,6 @@ void PairTableKokkos<DeviceType>::settings(int narg, char **arg)
   tables = NULL;
 }
 
-/* ----------------------------------------------------------------------
-   set coeffs for one or more type pairs
-------------------------------------------------------------------------- */
-
-template<class DeviceType>
-void PairTableKokkos<DeviceType>::coeff(int narg, char **arg)
-{
-  if (narg != 4 && narg != 5) error->all(FLERR,"Illegal pair_coeff command");
-  if (!allocated) allocate();
-
-  int ilo,ihi,jlo,jhi;
-  force->bounds(FLERR,arg[0],atom->ntypes,ilo,ihi);
-  force->bounds(FLERR,arg[1],atom->ntypes,jlo,jhi);
-
-  int me;
-  MPI_Comm_rank(world,&me);
-  tables = (Table *)
-    memory->srealloc(tables,(ntables+1)*sizeof(Table),"pair:tables");
-  Table *tb = &tables[ntables];
-  null_table(tb);
-  if (me == 0) read_table(tb,arg[2],arg[3]);
-  bcast_table(tb);
-
-  // set table cutoff
-
-  if (narg == 5) tb->cut = force->numeric(FLERR,arg[4]);
-  else if (tb->rflag) tb->cut = tb->rhi;
-  else tb->cut = tb->rfile[tb->ninput-1];
-
-  // error check on table parameters
-  // insure cutoff is within table
-  // for BITMAP tables, file values can be in non-ascending order
-
-  if (tb->ninput <= 1) error->one(FLERR,"Invalid pair table length");
-  double rlo,rhi;
-  if (tb->rflag == 0) {
-    rlo = tb->rfile[0];
-    rhi = tb->rfile[tb->ninput-1];
-  } else {
-    rlo = tb->rlo;
-    rhi = tb->rhi;
-  }
-  if (tb->cut <= rlo || tb->cut > rhi)
-    error->all(FLERR,"Invalid pair table cutoff");
-  if (rlo <= 0.0) error->all(FLERR,"Invalid pair table cutoff");
-
-  // match = 1 if don't need to spline read-in tables
-  // this is only the case if r values needed by final tables
-  //   exactly match r values read from file
-  // for tabstyle SPLINE, always need to build spline tables
-
-  tb->match = 0;
-  if (tabstyle == LINEAR && tb->ninput == tablength &&
-      tb->rflag == RSQ && tb->rhi == tb->cut) tb->match = 1;
-  if (tabstyle == BITMAP && tb->ninput == 1 << tablength &&
-      tb->rflag == BMP && tb->rhi == tb->cut) tb->match = 1;
-  if (tb->rflag == BMP && tb->match == 0)
-    error->all(FLERR,"Bitmapped table in file does not match requested table");
-
-  // spline read-in values and compute r,e,f vectors within table
-
-  if (tb->match == 0) spline_table(tb);
-  compute_table(tb);
-
-  // store ptr to table in tabindex
-
-  int count = 0;
-  for (int i = ilo; i <= ihi; i++) {
-    for (int j = MAX(jlo,i); j <= jhi; j++) {
-      tabindex[i][j] = ntables;
-      setflag[i][j] = 1;
-      count++;
-    }
-  }
-
-  if (count == 0) error->all(FLERR,"Illegal pair_coeff command");
-  ntables++;
-}
-
 /* ----------------------------------------------------------------------
    init for one type pair i,j and corresponding j,i
 ------------------------------------------------------------------------- */
@@ -574,677 +495,6 @@ double PairTableKokkos<DeviceType>::init_one(int i, int j)
   return tables[tabindex[i][j]].cut;
 }
 
-/* ----------------------------------------------------------------------
-   read a table section from a tabulated potential file
-   only called by proc 0
-   this function sets these values in Table:
-     ninput,rfile,efile,ffile,rflag,rlo,rhi,fpflag,fplo,fphi,ntablebits
-------------------------------------------------------------------------- */
-
-template<class DeviceType>
-void PairTableKokkos<DeviceType>::read_table(Table *tb, char *file, char *keyword)
-{
-  char line[MAXLINE];
-
-  // open file
-
-  FILE *fp = force->open_potential(file);
-  if (fp == NULL) {
-    char str[128];
-    sprintf(str,"Cannot open file %s",file);
-    error->one(FLERR,str);
-  }
-
-  // loop until section found with matching keyword
-
-  while (1) {
-    if (fgets(line,MAXLINE,fp) == NULL)
-      error->one(FLERR,"Did not find keyword in table file");
-    if (strspn(line," \t\n\r") == strlen(line)) continue;  // blank line
-    if (line[0] == '#') continue;                          // comment
-    char *word = strtok(line," \t\n\r");
-    if (strcmp(word,keyword) == 0) break;           // matching keyword
-    fgets(line,MAXLINE,fp);                         // no match, skip section
-    param_extract(tb,line);
-    fgets(line,MAXLINE,fp);
-    for (int i = 0; i < tb->ninput; i++) fgets(line,MAXLINE,fp);
-  }
-
-  // read args on 2nd line of section
-  // allocate table arrays for file values
-
-  fgets(line,MAXLINE,fp);
-  param_extract(tb,line);
-  memory->create(tb->rfile,tb->ninput,"pair:rfile");
-  memory->create(tb->efile,tb->ninput,"pair:efile");
-  memory->create(tb->ffile,tb->ninput,"pair:ffile");
-
-  // setup bitmap parameters for table to read in
-
-  tb->ntablebits = 0;
-  int masklo,maskhi,nmask,nshiftbits;
-  if (tb->rflag == BMP) {
-    while (1 << tb->ntablebits < tb->ninput) tb->ntablebits++;
-    if (1 << tb->ntablebits != tb->ninput)
-      error->one(FLERR,"Bitmapped table is incorrect length in table file");
-    init_bitmap(tb->rlo,tb->rhi,tb->ntablebits,masklo,maskhi,nmask,nshiftbits);
-  }
-
-  // read r,e,f table values from file
-  // if rflag set, compute r
-  // if rflag not set, use r from file
-
-  int itmp;
-  double rtmp;
-  union_int_float_t rsq_lookup;
-
-  fgets(line,MAXLINE,fp);
-  for (int i = 0; i < tb->ninput; i++) {
-    fgets(line,MAXLINE,fp);
-    sscanf(line,"%d %lg %lg %lg",&itmp,&rtmp,&tb->efile[i],&tb->ffile[i]);
-
-    if (tb->rflag == RLINEAR)
-      rtmp = tb->rlo + (tb->rhi - tb->rlo)*i/(tb->ninput-1);
-    else if (tb->rflag == RSQ) {
-      rtmp = tb->rlo*tb->rlo +
-        (tb->rhi*tb->rhi - tb->rlo*tb->rlo)*i/(tb->ninput-1);
-      rtmp = sqrt(rtmp);
-    } else if (tb->rflag == BMP) {
-      rsq_lookup.i = i << nshiftbits;
-      rsq_lookup.i |= masklo;
-      if (rsq_lookup.f < tb->rlo*tb->rlo) {
-        rsq_lookup.i = i << nshiftbits;
-        rsq_lookup.i |= maskhi;
-      }
-      rtmp = sqrtf(rsq_lookup.f);
-    }
-
-    tb->rfile[i] = rtmp;
-  }
-
-  // close file
-
-  fclose(fp);
-}
-
-/* ----------------------------------------------------------------------
-   broadcast read-in table info from proc 0 to other procs
-   this function communicates these values in Table:
-     ninput,rfile,efile,ffile,rflag,rlo,rhi,fpflag,fplo,fphi
-------------------------------------------------------------------------- */
-
-template<class DeviceType>
-void PairTableKokkos<DeviceType>::bcast_table(Table *tb)
-{
-  MPI_Bcast(&tb->ninput,1,MPI_INT,0,world);
-
-  int me;
-  MPI_Comm_rank(world,&me);
-  if (me > 0) {
-    memory->create(tb->rfile,tb->ninput,"pair:rfile");
-    memory->create(tb->efile,tb->ninput,"pair:efile");
-    memory->create(tb->ffile,tb->ninput,"pair:ffile");
-  }
-
-  MPI_Bcast(tb->rfile,tb->ninput,MPI_DOUBLE,0,world);
-  MPI_Bcast(tb->efile,tb->ninput,MPI_DOUBLE,0,world);
-  MPI_Bcast(tb->ffile,tb->ninput,MPI_DOUBLE,0,world);
-
-  MPI_Bcast(&tb->rflag,1,MPI_INT,0,world);
-  if (tb->rflag) {
-    MPI_Bcast(&tb->rlo,1,MPI_DOUBLE,0,world);
-    MPI_Bcast(&tb->rhi,1,MPI_DOUBLE,0,world);
-  }
-  MPI_Bcast(&tb->fpflag,1,MPI_INT,0,world);
-  if (tb->fpflag) {
-    MPI_Bcast(&tb->fplo,1,MPI_DOUBLE,0,world);
-    MPI_Bcast(&tb->fphi,1,MPI_DOUBLE,0,world);
-  }
-}
-
-/* ----------------------------------------------------------------------
-   build spline representation of e,f over entire range of read-in table
-   this function sets these values in Table: e2file,f2file
-------------------------------------------------------------------------- */
-
-template<class DeviceType>
-void PairTableKokkos<DeviceType>::spline_table(Table *tb)
-{
-  memory->create(tb->e2file,tb->ninput,"pair:e2file");
-  memory->create(tb->f2file,tb->ninput,"pair:f2file");
-
-  double ep0 = - tb->ffile[0];
-  double epn = - tb->ffile[tb->ninput-1];
-  spline(tb->rfile,tb->efile,tb->ninput,ep0,epn,tb->e2file);
-
-  if (tb->fpflag == 0) {
-    tb->fplo = (tb->ffile[1] - tb->ffile[0]) / (tb->rfile[1] - tb->rfile[0]);
-    tb->fphi = (tb->ffile[tb->ninput-1] - tb->ffile[tb->ninput-2]) /
-      (tb->rfile[tb->ninput-1] - tb->rfile[tb->ninput-2]);
-  }
-
-  double fp0 = tb->fplo;
-  double fpn = tb->fphi;
-  spline(tb->rfile,tb->ffile,tb->ninput,fp0,fpn,tb->f2file);
-}
-
-/* ----------------------------------------------------------------------
-   extract attributes from parameter line in table section
-   format of line: N value R/RSQ/BITMAP lo hi FP fplo fphi
-   N is required, other params are optional
-------------------------------------------------------------------------- */
-
-template<class DeviceType>
-void PairTableKokkos<DeviceType>::param_extract(Table *tb, char *line)
-{
-  tb->ninput = 0;
-  tb->rflag = NONE;
-  tb->fpflag = 0;
-
-  char *word = strtok(line," \t\n\r\f");
-  while (word) {
-    if (strcmp(word,"N") == 0) {
-      word = strtok(NULL," \t\n\r\f");
-      tb->ninput = atoi(word);
-    } else if (strcmp(word,"R") == 0 || strcmp(word,"RSQ") == 0 ||
-               strcmp(word,"BITMAP") == 0) {
-      if (strcmp(word,"R") == 0) tb->rflag = RLINEAR;
-      else if (strcmp(word,"RSQ") == 0) tb->rflag = RSQ;
-      else if (strcmp(word,"BITMAP") == 0) tb->rflag = BMP;
-      word = strtok(NULL," \t\n\r\f");
-      tb->rlo = atof(word);
-      word = strtok(NULL," \t\n\r\f");
-      tb->rhi = atof(word);
-    } else if (strcmp(word,"FP") == 0) {
-      tb->fpflag = 1;
-      word = strtok(NULL," \t\n\r\f");
-      tb->fplo = atof(word);
-      word = strtok(NULL," \t\n\r\f");
-      tb->fphi = atof(word);
-    } else {
-      error->one(FLERR,"Invalid keyword in pair table parameters");
-    }
-    word = strtok(NULL," \t\n\r\f");
-  }
-
-  if (tb->ninput == 0) error->one(FLERR,"Pair table parameters did not set N");
-}
-
-/* ----------------------------------------------------------------------
-   compute r,e,f vectors from splined values
-------------------------------------------------------------------------- */
-
-template<class DeviceType>
-void PairTableKokkos<DeviceType>::compute_table(Table *tb)
-{
-  update_table = 1;
-  int tlm1 = tablength-1;
-
-  // inner = inner table bound
-  // cut = outer table bound
-  // delta = table spacing in rsq for N-1 bins
-
-  double inner;
-  if (tb->rflag) inner = tb->rlo;
-  else inner = tb->rfile[0];
-  tb->innersq = inner*inner;
-  tb->delta = (tb->cut*tb->cut - tb->innersq) / tlm1;
-  tb->invdelta = 1.0/tb->delta;
-
-  // direct lookup tables
-  // N-1 evenly spaced bins in rsq from inner to cut
-  // e,f = value at midpt of bin
-  // e,f are N-1 in length since store 1 value at bin midpt
-  // f is converted to f/r when stored in f[i]
-  // e,f are never a match to read-in values, always computed via spline interp
-
-  if (tabstyle == LOOKUP) {
-    memory->create(tb->e,tlm1,"pair:e");
-    memory->create(tb->f,tlm1,"pair:f");
-
-    double r,rsq;
-    for (int i = 0; i < tlm1; i++) {
-      rsq = tb->innersq + (i+0.5)*tb->delta;
-      r = sqrt(rsq);
-      tb->e[i] = splint(tb->rfile,tb->efile,tb->e2file,tb->ninput,r);
-      tb->f[i] = splint(tb->rfile,tb->ffile,tb->f2file,tb->ninput,r)/r;
-    }
-  }
-
-  // linear tables
-  // N-1 evenly spaced bins in rsq from inner to cut
-  // rsq,e,f = value at lower edge of bin
-  // de,df values = delta from lower edge to upper edge of bin
-  // rsq,e,f are N in length so de,df arrays can compute difference
-  // f is converted to f/r when stored in f[i]
-  // e,f can match read-in values, else compute via spline interp
-
-  if (tabstyle == LINEAR) {
-    memory->create(tb->rsq,tablength,"pair:rsq");
-    memory->create(tb->e,tablength,"pair:e");
-    memory->create(tb->f,tablength,"pair:f");
-    memory->create(tb->de,tlm1,"pair:de");
-    memory->create(tb->df,tlm1,"pair:df");
-
-    double r,rsq;
-    for (int i = 0; i < tablength; i++) {
-      rsq = tb->innersq + i*tb->delta;
-      r = sqrt(rsq);
-      tb->rsq[i] = rsq;
-      if (tb->match) {
-        tb->e[i] = tb->efile[i];
-        tb->f[i] = tb->ffile[i]/r;
-      } else {
-        tb->e[i] = splint(tb->rfile,tb->efile,tb->e2file,tb->ninput,r);
-        tb->f[i] = splint(tb->rfile,tb->ffile,tb->f2file,tb->ninput,r)/r;
-      }
-    }
-
-    for (int i = 0; i < tlm1; i++) {
-      tb->de[i] = tb->e[i+1] - tb->e[i];
-      tb->df[i] = tb->f[i+1] - tb->f[i];
-    }
-  }
-
-  // cubic spline tables
-  // N-1 evenly spaced bins in rsq from inner to cut
-  // rsq,e,f = value at lower edge of bin
-  // e2,f2 = spline coefficient for each bin
-  // rsq,e,f,e2,f2 are N in length so have N-1 spline bins
-  // f is converted to f/r after e is splined
-  // e,f can match read-in values, else compute via spline interp
-
-  if (tabstyle == SPLINE) {
-    memory->create(tb->rsq,tablength,"pair:rsq");
-    memory->create(tb->e,tablength,"pair:e");
-    memory->create(tb->f,tablength,"pair:f");
-    memory->create(tb->e2,tablength,"pair:e2");
-    memory->create(tb->f2,tablength,"pair:f2");
-
-    tb->deltasq6 = tb->delta*tb->delta / 6.0;
-
-    double r,rsq;
-    for (int i = 0; i < tablength; i++) {
-      rsq = tb->innersq + i*tb->delta;
-      r = sqrt(rsq);
-      tb->rsq[i] = rsq;
-      if (tb->match) {
-        tb->e[i] = tb->efile[i];
-        tb->f[i] = tb->ffile[i]/r;
-      } else {
-        tb->e[i] = splint(tb->rfile,tb->efile,tb->e2file,tb->ninput,r);
-        tb->f[i] = splint(tb->rfile,tb->ffile,tb->f2file,tb->ninput,r);
-      }
-    }
-
-    // ep0,epn = dh/dg at inner and at cut
-    // h(r) = e(r) and g(r) = r^2
-    // dh/dg = (de/dr) / 2r = -f/2r
-
-    double ep0 = - tb->f[0] / (2.0 * sqrt(tb->innersq));
-    double epn = - tb->f[tlm1] / (2.0 * tb->cut);
-    spline(tb->rsq,tb->e,tablength,ep0,epn,tb->e2);
-
-    // fp0,fpn = dh/dg at inner and at cut
-    // h(r) = f(r)/r and g(r) = r^2
-    // dh/dg = (1/r df/dr - f/r^2) / 2r
-    // dh/dg in secant approx = (f(r2)/r2 - f(r1)/r1) / (g(r2) - g(r1))
-
-    double fp0,fpn;
-    double secant_factor = 0.1;
-    if (tb->fpflag) fp0 = (tb->fplo/sqrt(tb->innersq) - tb->f[0]/tb->innersq) /
-      (2.0 * sqrt(tb->innersq));
-    else {
-      double rsq1 = tb->innersq;
-      double rsq2 = rsq1 + secant_factor*tb->delta;
-      fp0 = (splint(tb->rfile,tb->ffile,tb->f2file,tb->ninput,sqrt(rsq2)) /
-             sqrt(rsq2) - tb->f[0] / sqrt(rsq1)) / (secant_factor*tb->delta);
-    }
-
-    if (tb->fpflag && tb->cut == tb->rfile[tb->ninput-1]) fpn =
-      (tb->fphi/tb->cut - tb->f[tlm1]/(tb->cut*tb->cut)) / (2.0 * tb->cut);
-    else {
-      double rsq2 = tb->cut * tb->cut;
-      double rsq1 = rsq2 - secant_factor*tb->delta;
-      fpn = (tb->f[tlm1] / sqrt(rsq2) -
-             splint(tb->rfile,tb->ffile,tb->f2file,tb->ninput,sqrt(rsq1)) /
-             sqrt(rsq1)) / (secant_factor*tb->delta);
-    }
-
-    for (int i = 0; i < tablength; i++) tb->f[i] /= sqrt(tb->rsq[i]);
-    spline(tb->rsq,tb->f,tablength,fp0,fpn,tb->f2);
-  }
-
-  // bitmapped linear tables
-  // 2^N bins from inner to cut, spaced in bitmapped manner
-  // f is converted to f/r when stored in f[i]
-  // e,f can match read-in values, else compute via spline interp
-
-  if (tabstyle == BITMAP) {
-    double r;
-    union_int_float_t rsq_lookup;
-    int masklo,maskhi;
-
-    // linear lookup tables of length ntable = 2^n
-    // stored value = value at lower edge of bin
-
-    init_bitmap(inner,tb->cut,tablength,masklo,maskhi,tb->nmask,tb->nshiftbits);
-    int ntable = 1 << tablength;
-    int ntablem1 = ntable - 1;
-
-    memory->create(tb->rsq,ntable,"pair:rsq");
-    memory->create(tb->e,ntable,"pair:e");
-    memory->create(tb->f,ntable,"pair:f");
-    memory->create(tb->de,ntable,"pair:de");
-    memory->create(tb->df,ntable,"pair:df");
-    memory->create(tb->drsq,ntable,"pair:drsq");
-
-    union_int_float_t minrsq_lookup;
-    minrsq_lookup.i = 0 << tb->nshiftbits;
-    minrsq_lookup.i |= maskhi;
-
-    for (int i = 0; i < ntable; i++) {
-      rsq_lookup.i = i << tb->nshiftbits;
-      rsq_lookup.i |= masklo;
-      if (rsq_lookup.f < tb->innersq) {
-        rsq_lookup.i = i << tb->nshiftbits;
-        rsq_lookup.i |= maskhi;
-      }
-      r = sqrtf(rsq_lookup.f);
-      tb->rsq[i] = rsq_lookup.f;
-      if (tb->match) {
-        tb->e[i] = tb->efile[i];
-        tb->f[i] = tb->ffile[i]/r;
-      } else {
-        tb->e[i] = splint(tb->rfile,tb->efile,tb->e2file,tb->ninput,r);
-        tb->f[i] = splint(tb->rfile,tb->ffile,tb->f2file,tb->ninput,r)/r;
-      }
-      minrsq_lookup.f = MIN(minrsq_lookup.f,rsq_lookup.f);
-    }
-
-    tb->innersq = minrsq_lookup.f;
-
-    for (int i = 0; i < ntablem1; i++) {
-      tb->de[i] = tb->e[i+1] - tb->e[i];
-      tb->df[i] = tb->f[i+1] - tb->f[i];
-      tb->drsq[i] = 1.0/(tb->rsq[i+1] - tb->rsq[i]);
-    }
-
-    // get the delta values for the last table entries
-    // tables are connected periodically between 0 and ntablem1
-
-    tb->de[ntablem1] = tb->e[0] - tb->e[ntablem1];
-    tb->df[ntablem1] = tb->f[0] - tb->f[ntablem1];
-    tb->drsq[ntablem1] = 1.0/(tb->rsq[0] - tb->rsq[ntablem1]);
-
-    // get the correct delta values at itablemax
-    // smallest r is in bin itablemin
-    // largest r is in bin itablemax, which is itablemin-1,
-    //   or ntablem1 if itablemin=0
-
-    // deltas at itablemax only needed if corresponding rsq < cut*cut
-    // if so, compute deltas between rsq and cut*cut
-    //   if tb->match, data at cut*cut is unavailable, so we'll take
-    //   deltas at itablemax-1 as a good approximation
-
-    double e_tmp,f_tmp;
-    int itablemin = minrsq_lookup.i & tb->nmask;
-    itablemin >>= tb->nshiftbits;
-    int itablemax = itablemin - 1;
-    if (itablemin == 0) itablemax = ntablem1;
-    int itablemaxm1 = itablemax - 1;
-    if (itablemax == 0) itablemaxm1 = ntablem1;
-    rsq_lookup.i = itablemax << tb->nshiftbits;
-    rsq_lookup.i |= maskhi;
-    if (rsq_lookup.f < tb->cut*tb->cut) {
-      if (tb->match) {
-        tb->de[itablemax] = tb->de[itablemaxm1];
-        tb->df[itablemax] = tb->df[itablemaxm1];
-        tb->drsq[itablemax] = tb->drsq[itablemaxm1];
-      } else {
-            rsq_lookup.f = tb->cut*tb->cut;
-        r = sqrtf(rsq_lookup.f);
-        e_tmp = splint(tb->rfile,tb->efile,tb->e2file,tb->ninput,r);
-        f_tmp = splint(tb->rfile,tb->ffile,tb->f2file,tb->ninput,r)/r;
-        tb->de[itablemax] = e_tmp - tb->e[itablemax];
-        tb->df[itablemax] = f_tmp - tb->f[itablemax];
-        tb->drsq[itablemax] = 1.0/(rsq_lookup.f - tb->rsq[itablemax]);
-      }
-    }
-  }
-}
-
-/* ----------------------------------------------------------------------
-   set all ptrs in a table to NULL, so can be freed safely
-------------------------------------------------------------------------- */
-
-template<class DeviceType>
-void PairTableKokkos<DeviceType>::null_table(Table *tb)
-{
-  tb->rfile = tb->efile = tb->ffile = NULL;
-  tb->e2file = tb->f2file = NULL;
-  tb->rsq = tb->drsq = tb->e = tb->de = NULL;
-  tb->f = tb->df = tb->e2 = tb->f2 = NULL;
-}
-
-/* ----------------------------------------------------------------------
-   free all arrays in a table
-------------------------------------------------------------------------- */
-
-template<class DeviceType>
-void PairTableKokkos<DeviceType>::free_table(Table *tb)
-{
-  memory->destroy(tb->rfile);
-  memory->destroy(tb->efile);
-  memory->destroy(tb->ffile);
-  memory->destroy(tb->e2file);
-  memory->destroy(tb->f2file);
-
-  memory->destroy(tb->rsq);
-  memory->destroy(tb->drsq);
-  memory->destroy(tb->e);
-  memory->destroy(tb->de);
-  memory->destroy(tb->f);
-  memory->destroy(tb->df);
-  memory->destroy(tb->e2);
-  memory->destroy(tb->f2);
-}
-
-/* ----------------------------------------------------------------------
-   spline and splint routines modified from Numerical Recipes
-------------------------------------------------------------------------- */
-
-template<class DeviceType>
-void PairTableKokkos<DeviceType>::spline(double *x, double *y, int n,
-                       double yp1, double ypn, double *y2)
-{
-  int i,k;
-  double p,qn,sig,un;
-  double *u = new double[n];
-
-  if (yp1 > 0.99e30) y2[0] = u[0] = 0.0;
-  else {
-    y2[0] = -0.5;
-    u[0] = (3.0/(x[1]-x[0])) * ((y[1]-y[0]) / (x[1]-x[0]) - yp1);
-  }
-  for (i = 1; i < n-1; i++) {
-    sig = (x[i]-x[i-1]) / (x[i+1]-x[i-1]);
-    p = sig*y2[i-1] + 2.0;
-    y2[i] = (sig-1.0) / p;
-    u[i] = (y[i+1]-y[i]) / (x[i+1]-x[i]) - (y[i]-y[i-1]) / (x[i]-x[i-1]);
-    u[i] = (6.0*u[i] / (x[i+1]-x[i-1]) - sig*u[i-1]) / p;
-  }
-  if (ypn > 0.99e30) qn = un = 0.0;
-  else {
-    qn = 0.5;
-    un = (3.0/(x[n-1]-x[n-2])) * (ypn - (y[n-1]-y[n-2]) / (x[n-1]-x[n-2]));
-  }
-  y2[n-1] = (un-qn*u[n-2]) / (qn*y2[n-2] + 1.0);
-  for (k = n-2; k >= 0; k--) y2[k] = y2[k]*y2[k+1] + u[k];
-
-  delete [] u;
-}
-
-/* ---------------------------------------------------------------------- */
-
-template<class DeviceType>
-double PairTableKokkos<DeviceType>::splint(double *xa, double *ya, double *y2a, int n, double x)
-{
-  int klo,khi,k;
-  double h,b,a,y;
-
-  klo = 0;
-  khi = n-1;
-  while (khi-klo > 1) {
-    k = (khi+klo) >> 1;
-    if (xa[k] > x) khi = k;
-    else klo = k;
-  }
-  h = xa[khi]-xa[klo];
-  a = (xa[khi]-x) / h;
-  b = (x-xa[klo]) / h;
-  y = a*ya[klo] + b*ya[khi] +
-    ((a*a*a-a)*y2a[klo] + (b*b*b-b)*y2a[khi]) * (h*h)/6.0;
-  return y;
-}
-
-/* ----------------------------------------------------------------------
-   proc 0 writes to restart file
-------------------------------------------------------------------------- */
-
-template<class DeviceType>
-void PairTableKokkos<DeviceType>::write_restart(FILE *fp)
-{
-  write_restart_settings(fp);
-}
-
-/* ----------------------------------------------------------------------
-   proc 0 reads from restart file, bcasts
-------------------------------------------------------------------------- */
-
-template<class DeviceType>
-void PairTableKokkos<DeviceType>::read_restart(FILE *fp)
-{
-  read_restart_settings(fp);
-  allocate();
-}
-
-/* ----------------------------------------------------------------------
-   proc 0 writes to restart file
-------------------------------------------------------------------------- */
-
-template<class DeviceType>
-void PairTableKokkos<DeviceType>::write_restart_settings(FILE *fp)
-{
-  fwrite(&tabstyle,sizeof(int),1,fp);
-  fwrite(&tablength,sizeof(int),1,fp);
-  fwrite(&ewaldflag,sizeof(int),1,fp);
-  fwrite(&pppmflag,sizeof(int),1,fp);
-  fwrite(&msmflag,sizeof(int),1,fp);
-  fwrite(&dispersionflag,sizeof(int),1,fp);
-  fwrite(&tip4pflag,sizeof(int),1,fp);
-}
-
-/* ----------------------------------------------------------------------
-   proc 0 reads from restart file, bcasts
-------------------------------------------------------------------------- */
-
-template<class DeviceType>
-void PairTableKokkos<DeviceType>::read_restart_settings(FILE *fp)
-{
-  if (comm->me == 0) {
-    fread(&tabstyle,sizeof(int),1,fp);
-    fread(&tablength,sizeof(int),1,fp);
-    fread(&ewaldflag,sizeof(int),1,fp);
-    fread(&pppmflag,sizeof(int),1,fp);
-    fread(&msmflag,sizeof(int),1,fp);
-    fread(&dispersionflag,sizeof(int),1,fp);
-    fread(&tip4pflag,sizeof(int),1,fp);
-  }
-  MPI_Bcast(&tabstyle,1,MPI_INT,0,world);
-  MPI_Bcast(&tablength,1,MPI_INT,0,world);
-  MPI_Bcast(&ewaldflag,1,MPI_INT,0,world);
-  MPI_Bcast(&pppmflag,1,MPI_INT,0,world);
-  MPI_Bcast(&msmflag,1,MPI_INT,0,world);
-  MPI_Bcast(&dispersionflag,1,MPI_INT,0,world);
-  MPI_Bcast(&tip4pflag,1,MPI_INT,0,world);
-}
-
-/* ---------------------------------------------------------------------- */
-
-template<class DeviceType>
-double PairTableKokkos<DeviceType>::single(int i, int j, int itype, int jtype, double rsq,
-                         double factor_coul, double factor_lj,
-                         double &fforce)
-{
-  int itable;
-  double fraction,value,a,b,phi;
-  int tlm1 = tablength - 1;
-
-  Table *tb = &tables[tabindex[itype][jtype]];
-  if (rsq < tb->innersq) error->one(FLERR,"Pair distance < table inner cutoff");
-
-  if (tabstyle == LOOKUP) {
-    itable = static_cast<int> ((rsq-tb->innersq) * tb->invdelta);
-    if (itable >= tlm1) error->one(FLERR,"Pair distance > table outer cutoff");
-    fforce = factor_lj * tb->f[itable];
-  } else if (tabstyle == LINEAR) {
-    itable = static_cast<int> ((rsq-tb->innersq) * tb->invdelta);
-    if (itable >= tlm1) error->one(FLERR,"Pair distance > table outer cutoff");
-    fraction = (rsq - tb->rsq[itable]) * tb->invdelta;
-    value = tb->f[itable] + fraction*tb->df[itable];
-    fforce = factor_lj * value;
-  } else if (tabstyle == SPLINE) {
-    itable = static_cast<int> ((rsq-tb->innersq) * tb->invdelta);
-    if (itable >= tlm1) error->one(FLERR,"Pair distance > table outer cutoff");
-    b = (rsq - tb->rsq[itable]) * tb->invdelta;
-    a = 1.0 - b;
-    value = a * tb->f[itable] + b * tb->f[itable+1] +
-      ((a*a*a-a)*tb->f2[itable] + (b*b*b-b)*tb->f2[itable+1]) *
-      tb->deltasq6;
-    fforce = factor_lj * value;
-  } else {
-    union_int_float_t rsq_lookup;
-    rsq_lookup.f = rsq;
-    itable = rsq_lookup.i & tb->nmask;
-    itable >>= tb->nshiftbits;
-    fraction = (rsq_lookup.f - tb->rsq[itable]) * tb->drsq[itable];
-    value = tb->f[itable] + fraction*tb->df[itable];
-    fforce = factor_lj * value;
-  }
-
-  if (tabstyle == LOOKUP)
-    phi = tb->e[itable];
-  else if (tabstyle == LINEAR || tabstyle == BITMAP)
-    phi = tb->e[itable] + fraction*tb->de[itable];
-  else
-    phi = a * tb->e[itable] + b * tb->e[itable+1] +
-      ((a*a*a-a)*tb->e2[itable] + (b*b*b-b)*tb->e2[itable+1]) * tb->deltasq6;
-  return factor_lj*phi;
-}
-
-/* ----------------------------------------------------------------------
-   return the Coulomb cutoff for tabled potentials
-   called by KSpace solvers which require that all pairwise cutoffs be the same
-   loop over all tables not just those indexed by tabindex[i][j] since
-     no way to know which tables are active since pair::init() not yet called
-------------------------------------------------------------------------- */
-
-template<class DeviceType>
-void *PairTableKokkos<DeviceType>::extract(const char *str, int &dim)
-{
-  if (strcmp(str,"cut_coul") != 0) return NULL;
-  if (ntables == 0) error->all(FLERR,"All pair coeffs are not set");
-
-  double cut_coul = tables[0].cut;
-  for (int m = 1; m < ntables; m++)
-    if (tables[m].cut != cut_coul)
-      error->all(FLERR,
-                 "Pair table cutoffs must all be equal to use with KSpace");
-  dim = 0;
-  return &tables[0].cut;
-}
-
 template<class DeviceType>
 void PairTableKokkos<DeviceType>::init_style()
 {
diff --git a/src/KOKKOS/pair_table_kokkos.h b/src/KOKKOS/pair_table_kokkos.h
index 09e64804b4..7c021df61e 100644
--- a/src/KOKKOS/pair_table_kokkos.h
+++ b/src/KOKKOS/pair_table_kokkos.h
@@ -22,7 +22,7 @@ PairStyle(table/kk/host,PairTableKokkos<LMPHostType>)
 #ifndef LMP_PAIR_TABLE_KOKKOS_H
 #define LMP_PAIR_TABLE_KOKKOS_H
 
-#include "pair.h"
+#include "pair_table.h"
 #include "pair_kokkos.h"
 #include "neigh_list_kokkos.h"
 #include "atom_kokkos.h"
@@ -38,7 +38,7 @@ template <class DeviceType, int NEIGHFLAG, int TABSTYLE>
 class PairTableComputeFunctor;
 
 template<class DeviceType>
-class PairTableKokkos : public Pair {
+class PairTableKokkos : public PairTable {
  public:
 
   enum {EnabledNeighFlags=FULL|HALFTHREAD|HALF|N2|FULLCLUSTER};
@@ -59,18 +59,9 @@ class PairTableKokkos : public Pair {
                         const NeighListKokkos<DeviceType> &list) const;
 */
   void settings(int, char **);
-  void coeff(int, char **);
   double init_one(int, int);
-  void write_restart(FILE *);
-  void read_restart(FILE *);
-  void write_restart_settings(FILE *);
-  void read_restart_settings(FILE *);
-  double single(int, int, int, int, double, double, double, double &);
-  void *extract(const char *, int &);
-
   void init_style();
 
-
  protected:
   enum{LOOKUP,LINEAR,SPLINE,BITMAP};
 
@@ -107,17 +98,6 @@ class PairTableKokkos : public Pair {
     typename ArrayTypes<LMPHostType>::t_ffloat_2d rsq,drsq,e,de,f,df,e2,f2;
   };
 
-  struct Table {
-    int ninput,rflag,fpflag,match,ntablebits;
-    int nshiftbits,nmask;
-    double rlo,rhi,fplo,fphi,cut;
-    double *rfile,*efile,*ffile;
-    double *e2file,*f2file;
-    double innersq,delta,invdelta,deltasq6;
-    double *rsq,*drsq,*e,*de,*f,*df,*e2,*f2;
-  };
-  int ntables;
-  Table *tables;
   TableDeviceConst d_table_const;
   TableDevice* d_table;
   TableHost* h_table;
@@ -128,15 +108,6 @@ class PairTableKokkos : public Pair {
   typename ArrayTypes<DeviceType>::t_ffloat_2d d_cutsq;
 
   void allocate();
-  void read_table(Table *, char *, char *);
-  void param_extract(Table *, char *);
-  void bcast_table(Table *);
-  void spline_table(Table *);
-  void compute_table(Table *);
-  void null_table(Table *);
-  void free_table(Table *);
-  void spline(double *, double *, int, double, double, double *);
-  double splint(double *, double *, double *, int, double);
 
   typename ArrayTypes<DeviceType>::t_x_array_randomread x;
   typename ArrayTypes<DeviceType>::t_x_array_const c_x;
@@ -213,11 +184,6 @@ class PairTableKokkos : public Pair {
   friend void pair_virial_fdotr_compute<PairTableKokkos>(PairTableKokkos*);
 };
 
-
-
-
-
-
 }
 
 #endif
@@ -297,4 +263,10 @@ E: Cannot use chosen neighbor list style with lj/cut/kk
 
 That style is not supported by Kokkos.
 
+
+
+
 */
+
+
+
diff --git a/src/KOKKOS/pair_table_rx_kokkos.cpp b/src/KOKKOS/pair_table_rx_kokkos.cpp
new file mode 100644
index 0000000000..4c809d98bd
--- /dev/null
+++ b/src/KOKKOS/pair_table_rx_kokkos.cpp
@@ -0,0 +1,634 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Stan Moore (SNL)
+------------------------------------------------------------------------- */
+
+#include <mpi.h>
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include "pair_table_rx_kokkos.h"
+#include "kokkos.h"
+#include "atom.h"
+#include "force.h"
+#include "comm.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "memory.h"
+#include "error.h"
+#include "atom_masks.h"
+
+using namespace LAMMPS_NS;
+
+enum{NONE,RLINEAR,RSQ,BMP};
+enum{FULL,HALFTHREAD,HALF};
+
+#define MAXLINE 1024
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+PairTableRXKokkos<DeviceType>::PairTableRXKokkos(LAMMPS *lmp) : PairTableRX(lmp)
+{
+  update_table = 0;  // compute() rebuilds the Kokkos tables only when this is nonzero
+  atomKK = (AtomKokkos *) atom;
+  ntables = 0;  // start with no tables
+  tables = NULL;
+  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
+  datamask_read = X_MASK | F_MASK | TYPE_MASK | ENERGY_MASK | VIRIAL_MASK;  // passed to atomKK->sync() in compute_style()
+  datamask_modify = F_MASK | ENERGY_MASK | VIRIAL_MASK;  // passed to atomKK->modified() when eflag or vflag is set
+  h_table = new TableHost();  // deleted in the destructor
+  d_table = new TableDevice();  // deleted in the destructor
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+PairTableRXKokkos<DeviceType>::~PairTableRXKokkos()
+{  // only h_table / d_table are released here; the table cleanup below is commented out
+/*  for (int m = 0; m < ntables; m++) free_table(&tables[m]);
+  memory->sfree(tables);
+
+  if (allocated) {
+    memory->destroy(setflag);
+    memory->destroy(cutsq);
+    memory->destroy(tabindex);
+  }*/
+  delete h_table;  // allocated with new in the constructor
+  delete d_table;  // allocated with new in the constructor
+
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairTableRXKokkos<DeviceType>::compute(int eflag_in, int vflag_in)  // maps runtime tabstyle to a compute_style<TABSTYLE> instantiation
+{
+  if(update_table)  // rebuild the Kokkos-side tables first when flagged
+    create_kokkos_tables();
+  if(tabstyle == LOOKUP)
+    compute_style<LOOKUP>(eflag_in,vflag_in);
+  if(tabstyle == LINEAR)
+    compute_style<LINEAR>(eflag_in,vflag_in);
+  if(tabstyle == SPLINE)
+    compute_style<SPLINE>(eflag_in,vflag_in);
+  if(tabstyle == BITMAP)
+    compute_style<BITMAP>(eflag_in,vflag_in);
+}
+
+template<class DeviceType>
+template<int TABSTYLE>  // TABSTYLE: table style forwarded to S_TableRXCompute for the pair functors
+void PairTableRXKokkos<DeviceType>::compute_style(int eflag_in, int vflag_in)
+{
+  eflag = eflag_in;
+  vflag = vflag_in;
+
+  if (neighflag == FULL || neighflag == FULLCLUSTER) no_virial_fdotr_compute = 1;
+
+  // set up energy/virial tallying only when requested; otherwise clear the flags
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  else evflag = vflag_fdotr = 0;
+
+  atomKK->sync(execution_space,datamask_read);
+  //k_cutsq.template sync<DeviceType>();
+  //k_params.template sync<DeviceType>();
+  if (eflag || vflag) atomKK->modified(execution_space,datamask_modify);
+  else atomKK->modified(execution_space,F_MASK);
+
+  x = c_x = atomKK->k_x.view<DeviceType>();
+  f = atomKK->k_f.view<DeviceType>();
+  type = atomKK->k_type.view<DeviceType>();
+  nlocal = atom->nlocal;
+  nall = atom->nlocal + atom->nghost;
+  special_lj[0] = force->special_lj[0];
+  special_lj[1] = force->special_lj[1];
+  special_lj[2] = force->special_lj[2];
+  special_lj[3] = force->special_lj[3];
+  newton_pair = force->newton_pair;
+  d_cutsq = d_table->cutsq;
+  // loop over neighbors of my atoms
+
+  EV_FLOAT ev;
+  if(atom->ntypes > MAX_TYPES_STACKPARAMS) {  // functors instantiated with STACKPARAMS=false
+    if (neighflag == FULL) {
+      PairComputeFunctor<PairTableRXKokkos<DeviceType>,FULL,false,S_TableRXCompute<DeviceType,TABSTYLE> >
+        ff(this,(NeighListKokkos<DeviceType>*) list);
+      if (eflag || vflag) Kokkos::parallel_reduce(list->inum,ff,ev);
+      else Kokkos::parallel_for(list->inum,ff);
+    } else if (neighflag == HALFTHREAD) {
+      PairComputeFunctor<PairTableRXKokkos<DeviceType>,HALFTHREAD,false,S_TableRXCompute<DeviceType,TABSTYLE> >
+        ff(this,(NeighListKokkos<DeviceType>*) list);
+      if (eflag || vflag) Kokkos::parallel_reduce(list->inum,ff,ev);
+      else Kokkos::parallel_for(list->inum,ff);
+    } else if (neighflag == HALF) {
+      PairComputeFunctor<PairTableRXKokkos<DeviceType>,HALF,false,S_TableRXCompute<DeviceType,TABSTYLE> >
+        f(this,(NeighListKokkos<DeviceType>*) list);
+      if (eflag || vflag) Kokkos::parallel_reduce(list->inum,f,ev);
+      else Kokkos::parallel_for(list->inum,f);
+    } else if (neighflag == N2) {
+      PairComputeFunctor<PairTableRXKokkos<DeviceType>,N2,false,S_TableRXCompute<DeviceType,TABSTYLE> >
+        f(this,(NeighListKokkos<DeviceType>*) list);
+      if (eflag || vflag) Kokkos::parallel_reduce(nlocal,f,ev);
+      else Kokkos::parallel_for(nlocal,f);
+    } else if (neighflag == FULLCLUSTER) {
+      typedef PairComputeFunctor<PairTableRXKokkos<DeviceType>,FULLCLUSTER,false,S_TableRXCompute<DeviceType,TABSTYLE> >
+        f_type;
+      f_type f(this,(NeighListKokkos<DeviceType>*) list);
+      #ifdef KOKKOS_HAVE_CUDA
+        const int teamsize = Kokkos::Impl::is_same<DeviceType, Kokkos::Cuda>::value ? 32 : 1;
+      #else
+        const int teamsize = 1;
+      #endif
+      const int nteams = (list->inum+teamsize-1)/teamsize;  // ceil(inum/teamsize); was "inum*+teamsize-1", which yields inum-1
+      Kokkos::TeamPolicy<DeviceType> config(nteams,teamsize,NeighClusterSize);
+      if (eflag || vflag) Kokkos::parallel_reduce(config,f,ev);
+      else Kokkos::parallel_for(config,f);
+    }
+  } else {  // functors instantiated with STACKPARAMS=true
+    if (neighflag == FULL) {
+      PairComputeFunctor<PairTableRXKokkos<DeviceType>,FULL,true,S_TableRXCompute<DeviceType,TABSTYLE> >
+        f(this,(NeighListKokkos<DeviceType>*) list);
+      if (eflag || vflag) Kokkos::parallel_reduce(list->inum,f,ev);
+      else Kokkos::parallel_for(list->inum,f);
+    } else if (neighflag == HALFTHREAD) {
+      PairComputeFunctor<PairTableRXKokkos<DeviceType>,HALFTHREAD,true,S_TableRXCompute<DeviceType,TABSTYLE> >
+        f(this,(NeighListKokkos<DeviceType>*) list);
+      if (eflag || vflag) Kokkos::parallel_reduce(list->inum,f,ev);
+      else Kokkos::parallel_for(list->inum,f);
+    } else if (neighflag == HALF) {
+      PairComputeFunctor<PairTableRXKokkos<DeviceType>,HALF,true,S_TableRXCompute<DeviceType,TABSTYLE> >
+        f(this,(NeighListKokkos<DeviceType>*) list);
+      if (eflag || vflag) Kokkos::parallel_reduce(list->inum,f,ev);
+      else Kokkos::parallel_for(list->inum,f);
+    } else if (neighflag == N2) {
+      PairComputeFunctor<PairTableRXKokkos<DeviceType>,N2,true,S_TableRXCompute<DeviceType,TABSTYLE> >
+        f(this,(NeighListKokkos<DeviceType>*) list);
+      if (eflag || vflag) Kokkos::parallel_reduce(nlocal,f,ev);
+      else Kokkos::parallel_for(nlocal,f);
+    } else if (neighflag == FULLCLUSTER) {
+      typedef PairComputeFunctor<PairTableRXKokkos<DeviceType>,FULLCLUSTER,true,S_TableRXCompute<DeviceType,TABSTYLE> >
+        f_type;
+      f_type f(this,(NeighListKokkos<DeviceType>*) list);
+      #ifdef KOKKOS_HAVE_CUDA
+        const int teamsize = Kokkos::Impl::is_same<DeviceType, Kokkos::Cuda>::value ? 32 : 1;
+      #else
+        const int teamsize = 1;
+      #endif
+      const int nteams = (list->inum+teamsize-1)/teamsize;  // ceil(inum/teamsize); was "inum*+teamsize-1", which yields inum-1
+      Kokkos::TeamPolicy<DeviceType> config(nteams,teamsize,NeighClusterSize);
+      if (eflag || vflag) Kokkos::parallel_reduce(config,f,ev);
+      else Kokkos::parallel_for(config,f);
+    }
+  }
+
+  if (eflag) eng_vdwl += ev.evdwl;  // fold the reduced energy into the pair accumulator
+  if (vflag_global) {
+    virial[0] += ev.v[0];
+    virial[1] += ev.v[1];
+    virial[2] += ev.v[2];
+    virial[3] += ev.v[3];
+    virial[4] += ev.v[4];
+    virial[5] += ev.v[5];
+  }
+
+  if (vflag_fdotr) pair_virial_fdotr_compute(this);
+}
+
+template<class DeviceType>
+template<bool STACKPARAMS, class Specialisation>
+KOKKOS_INLINE_FUNCTION
+F_FLOAT PairTableRXKokkos<DeviceType>::
+compute_fpair(const F_FLOAT& rsq, const int& i, const int&j, const int& itype, const int& jtype) const {
+  (void) i;  // atom indices are unused; the casts silence unused-parameter warnings
+  (void) j;
+  union_int_float_t rsq_lookup;  // only used by the bitmap branch
+  double fpair;
+  const int tidx = d_table_const.tabindex(itype,jtype);  // table selected by the type pair
+  //const Table* const tb = &tables[tabindex[itype][jtype]];
+
+  //if (rsq < d_table_const.innersq(tidx))
+  //  error->one(FLERR,"Pair distance < table inner cutoff");
+
+  // NOTE: the range checks are commented out, so itable is used unchecked in every branch
+  if (Specialisation::TabStyle == LOOKUP) {
+    const int itable = static_cast<int> ((rsq - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx));
+    //if (itable >= tlm1)
+    //  error->one(FLERR,"Pair distance > table outer cutoff");
+    fpair = d_table_const.f(tidx,itable);  // nearest-bin value, no interpolation
+  } else if (Specialisation::TabStyle == LINEAR) {
+    const int itable = static_cast<int> ((rsq - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx));
+    //if (itable >= tlm1)
+    //  error->one(FLERR,"Pair distance > table outer cutoff");
+    const double fraction = (rsq - d_table_const.rsq(tidx,itable)) * d_table_const.invdelta(tidx);
+    fpair = d_table_const.f(tidx,itable) + fraction*d_table_const.df(tidx,itable);  // linear interpolation within the bin
+  } else if (Specialisation::TabStyle == SPLINE) {
+    const int itable = static_cast<int> ((rsq - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx));
+    //if (itable >= tlm1)
+    //  error->one(FLERR,"Pair distance > table outer cutoff");
+    const double b = (rsq - d_table_const.rsq(tidx,itable)) * d_table_const.invdelta(tidx);
+    const double a = 1.0 - b;
+    fpair = a * d_table_const.f(tidx,itable) + b * d_table_const.f(tidx,itable+1) +
+      ((a*a*a-a)*d_table_const.f2(tidx,itable) + (b*b*b-b)*d_table_const.f2(tidx,itable+1)) *
+      d_table_const.deltasq6(tidx);  // cubic term built from the f2 entries at itable and itable+1
+  } else {  // remaining style: index taken from the bit pattern of rsq
+    rsq_lookup.f = rsq;
+    int itable = rsq_lookup.i & d_table_const.nmask(tidx);
+    itable >>= d_table_const.nshiftbits(tidx);
+    const double fraction = (rsq_lookup.f - d_table_const.rsq(tidx,itable)) * d_table_const.drsq(tidx,itable);
+    fpair = d_table_const.f(tidx,itable) + fraction*d_table_const.df(tidx,itable);
+  }
+  return fpair;  // raw table value; no special_lj factor is applied in this function
+}
+
+template<class DeviceType>
+template<bool STACKPARAMS, class Specialisation>
+KOKKOS_INLINE_FUNCTION
+F_FLOAT PairTableRXKokkos<DeviceType>::
+compute_evdwl(const F_FLOAT& rsq, const int& i, const int&j, const int& itype, const int& jtype) const {
+  (void) i;  // atom indices are unused; the casts silence unused-parameter warnings
+  (void) j;
+  double evdwl;
+  union_int_float_t rsq_lookup;  // only used by the bitmap branch
+  const int tidx = d_table_const.tabindex(itype,jtype);  // table selected by the type pair
+  //const Table* const tb = &tables[tabindex[itype][jtype]];
+
+  //if (rsq < d_table_const.innersq(tidx))
+  //  error->one(FLERR,"Pair distance < table inner cutoff");
+  // NOTE: the range checks are commented out, so itable is used unchecked in every branch
+  if (Specialisation::TabStyle == LOOKUP) {
+    const int itable = static_cast<int> ((rsq - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx));
+    //if (itable >= tlm1)
+    //  error->one(FLERR,"Pair distance > table outer cutoff");
+    evdwl = d_table_const.e(tidx,itable);  // nearest-bin value, no interpolation
+  } else if (Specialisation::TabStyle == LINEAR) {
+    const int itable = static_cast<int> ((rsq - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx));
+    //if (itable >= tlm1)
+    //  error->one(FLERR,"Pair distance > table outer cutoff");
+    const double fraction = (rsq - d_table_const.rsq(tidx,itable)) * d_table_const.invdelta(tidx);
+    evdwl = d_table_const.e(tidx,itable) + fraction*d_table_const.de(tidx,itable);  // linear interpolation within the bin
+  } else if (Specialisation::TabStyle == SPLINE) {
+    const int itable = static_cast<int> ((rsq - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx));
+    //if (itable >= tlm1)
+    //  error->one(FLERR,"Pair distance > table outer cutoff");
+    const double b = (rsq - d_table_const.rsq(tidx,itable)) * d_table_const.invdelta(tidx);
+    const double a = 1.0 - b;
+    evdwl = a * d_table_const.e(tidx,itable) + b * d_table_const.e(tidx,itable+1) +
+        ((a*a*a-a)*d_table_const.e2(tidx,itable) + (b*b*b-b)*d_table_const.e2(tidx,itable+1)) *
+        d_table_const.deltasq6(tidx);  // cubic term built from the e2 entries at itable and itable+1
+  } else {  // remaining style: index taken from the bit pattern of rsq
+    rsq_lookup.f = rsq;
+    int itable = rsq_lookup.i & d_table_const.nmask(tidx);
+    itable >>= d_table_const.nshiftbits(tidx);
+    const double fraction = (rsq_lookup.f - d_table_const.rsq(tidx,itable)) * d_table_const.drsq(tidx,itable);
+    evdwl = d_table_const.e(tidx,itable) + fraction*d_table_const.de(tidx,itable);
+  }
+  return evdwl;  // raw table value; no special_lj factor is applied in this function
+}
+
+template<class DeviceType>
+void PairTableRXKokkos<DeviceType>::create_kokkos_tables()  // copies tables[] into h_table, then into d_table / d_table_const
+{
+  const int tlm1 = tablength-1;
+
+  memory->create_kokkos(d_table->nshiftbits,h_table->nshiftbits,ntables,"Table::nshiftbits");  // one scalar per table
+  memory->create_kokkos(d_table->nmask,h_table->nmask,ntables,"Table::nmask");
+  memory->create_kokkos(d_table->innersq,h_table->innersq,ntables,"Table::innersq");
+  memory->create_kokkos(d_table->invdelta,h_table->invdelta,ntables,"Table::invdelta");
+  memory->create_kokkos(d_table->deltasq6,h_table->deltasq6,ntables,"Table::deltasq6");
+
+  if(tabstyle == LOOKUP) {  // e,f only, tablength-1 entries per table
+    memory->create_kokkos(d_table->e,h_table->e,ntables,tlm1,"Table::e");
+    memory->create_kokkos(d_table->f,h_table->f,ntables,tlm1,"Table::f");
+  }
+
+  if(tabstyle == LINEAR) {  // de,df have one entry fewer than rsq,e,f
+    memory->create_kokkos(d_table->rsq,h_table->rsq,ntables,tablength,"Table::rsq");
+    memory->create_kokkos(d_table->e,h_table->e,ntables,tablength,"Table::e");
+    memory->create_kokkos(d_table->f,h_table->f,ntables,tablength,"Table::f");
+    memory->create_kokkos(d_table->de,h_table->de,ntables,tlm1,"Table::de");
+    memory->create_kokkos(d_table->df,h_table->df,ntables,tlm1,"Table::df");
+  }
+
+  if(tabstyle == SPLINE) {  // values plus e2,f2, all tablength entries
+    memory->create_kokkos(d_table->rsq,h_table->rsq,ntables,tablength,"Table::rsq");
+    memory->create_kokkos(d_table->e,h_table->e,ntables,tablength,"Table::e");
+    memory->create_kokkos(d_table->f,h_table->f,ntables,tablength,"Table::f");
+    memory->create_kokkos(d_table->e2,h_table->e2,ntables,tablength,"Table::e2");
+    memory->create_kokkos(d_table->f2,h_table->f2,ntables,tablength,"Table::f2");
+  }
+
+  if(tabstyle == BITMAP) {
+    int ntable = 1 << tablength;  // 2^tablength entries per table
+    memory->create_kokkos(d_table->rsq,h_table->rsq,ntables,ntable,"Table::rsq");
+    memory->create_kokkos(d_table->e,h_table->e,ntables,ntable,"Table::e");
+    memory->create_kokkos(d_table->f,h_table->f,ntables,ntable,"Table::f");
+    memory->create_kokkos(d_table->de,h_table->de,ntables,ntable,"Table::de");
+    memory->create_kokkos(d_table->df,h_table->df,ntables,ntable,"Table::df");
+    memory->create_kokkos(d_table->drsq,h_table->drsq,ntables,ntable,"Table::drsq");
+  }
+
+  for(int i=0; i < ntables; i++) {  // fill row i of every host view from tables[i]
+    Table* tb = &tables[i];
+
+    h_table->nshiftbits[i] = tb->nshiftbits;
+    h_table->nmask[i] = tb->nmask;
+    h_table->innersq[i] = tb->innersq;
+    h_table->invdelta[i] = tb->invdelta;
+    h_table->deltasq6[i] = tb->deltasq6;
+
+    for(int j = 0; j<h_table->rsq.dimension_1(); j++)  // bound is the view extent; presumably 0 for views not created above - TODO confirm
+      h_table->rsq(i,j) = tb->rsq[j];
+    for(int j = 0; j<h_table->drsq.dimension_1(); j++)
+      h_table->drsq(i,j) = tb->drsq[j];
+    for(int j = 0; j<h_table->e.dimension_1(); j++)
+      h_table->e(i,j) = tb->e[j];
+    for(int j = 0; j<h_table->de.dimension_1(); j++)
+      h_table->de(i,j) = tb->de[j];
+    for(int j = 0; j<h_table->f.dimension_1(); j++)
+      h_table->f(i,j) = tb->f[j];
+    for(int j = 0; j<h_table->df.dimension_1(); j++)
+      h_table->df(i,j) = tb->df[j];
+    for(int j = 0; j<h_table->e2.dimension_1(); j++)
+      h_table->e2(i,j) = tb->e2[j];
+    for(int j = 0; j<h_table->f2.dimension_1(); j++)
+      h_table->f2(i,j) = tb->f2[j];
+  }
+
+  // copy every host view into its d_table counterpart
+  Kokkos::deep_copy(d_table->nshiftbits,h_table->nshiftbits);
+  Kokkos::deep_copy(d_table->nmask,h_table->nmask);
+  Kokkos::deep_copy(d_table->innersq,h_table->innersq);
+  Kokkos::deep_copy(d_table->invdelta,h_table->invdelta);
+  Kokkos::deep_copy(d_table->deltasq6,h_table->deltasq6);
+  Kokkos::deep_copy(d_table->rsq,h_table->rsq);
+  Kokkos::deep_copy(d_table->drsq,h_table->drsq);
+  Kokkos::deep_copy(d_table->e,h_table->e);
+  Kokkos::deep_copy(d_table->de,h_table->de);
+  Kokkos::deep_copy(d_table->f,h_table->f);
+  Kokkos::deep_copy(d_table->df,h_table->df);
+  Kokkos::deep_copy(d_table->e2,h_table->e2);
+  Kokkos::deep_copy(d_table->f2,h_table->f2);
+  Kokkos::deep_copy(d_table->tabindex,h_table->tabindex);
+
+  d_table_const.nshiftbits = d_table->nshiftbits;  // d_table_const is what compute_fpair/compute_evdwl read
+  d_table_const.nmask = d_table->nmask;
+  d_table_const.innersq = d_table->innersq;
+  d_table_const.invdelta = d_table->invdelta;
+  d_table_const.deltasq6 = d_table->deltasq6;
+  d_table_const.rsq = d_table->rsq;
+  d_table_const.drsq = d_table->drsq;
+  d_table_const.e = d_table->e;
+  d_table_const.de = d_table->de;
+  d_table_const.f = d_table->f;
+  d_table_const.df = d_table->df;
+  d_table_const.e2 = d_table->e2;
+  d_table_const.f2 = d_table->f2;
+
+
+  Kokkos::deep_copy(d_table->cutsq,h_table->cutsq);
+  update_table = 0;  // tables are current; compute() will not rebuild until this is set again
+}
+
+/* ----------------------------------------------------------------------
+   allocate all arrays
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairTableRXKokkos<DeviceType>::allocate()
+{
+  allocated = 1;
+  const int nt = atom->ntypes + 1;  // arrays are (ntypes+1) x (ntypes+1)
+
+  memory->create(setflag,nt,nt,"pair:setflag");
+  memory->create_kokkos(d_table->cutsq,h_table->cutsq,cutsq,nt,nt,"pair:cutsq");  // also sets the cutsq pointer
+  memory->create_kokkos(d_table->tabindex,h_table->tabindex,tabindex,nt,nt,"pair:tabindex");  // also sets the tabindex pointer
+
+  d_table_const.cutsq = d_table->cutsq;
+  d_table_const.tabindex = d_table->tabindex;
+  memset(&setflag[0][0],0,nt*nt*sizeof(int));  // zero all three arrays; assumes each is one contiguous nt*nt block - TODO confirm
+  memset(&cutsq[0][0],0,nt*nt*sizeof(double));
+  memset(&tabindex[0][0],0,nt*nt*sizeof(int));
+}
+
+/* ----------------------------------------------------------------------
+   global settings
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairTableRXKokkos<DeviceType>::settings(int narg, char **arg)
+{
+  if (narg < 2) error->all(FLERR,"Illegal pair_style command");  // arg[0] = style, arg[1] = table length
+
+  // new settings
+
+  if (strcmp(arg[0],"lookup") == 0) tabstyle = LOOKUP;
+  else if (strcmp(arg[0],"linear") == 0) tabstyle = LINEAR;
+  else if (strcmp(arg[0],"spline") == 0) tabstyle = SPLINE;
+  else if (strcmp(arg[0],"bitmap") == 0) tabstyle = BITMAP;
+  else error->all(FLERR,"Unknown table style in pair_style command");
+
+  tablength = force->inumeric(FLERR,arg[1]);
+  if (tablength < 2) error->all(FLERR,"Illegal number of pair table entries");
+
+  // optional keywords
+  // assert the tabulation is compatible with a specific long-range solver
+
+  int iarg = 2;
+  while (iarg < narg) {  // each remaining argument must be one of the flag keywords below
+    if (strcmp(arg[iarg],"ewald") == 0) ewaldflag = 1;
+    else if (strcmp(arg[iarg],"pppm") == 0) pppmflag = 1;
+    else if (strcmp(arg[iarg],"msm") == 0) msmflag = 1;
+    else if (strcmp(arg[iarg],"dispersion") == 0) dispersionflag = 1;
+    else if (strcmp(arg[iarg],"tip4p") == 0) tip4pflag = 1;
+    else error->all(FLERR,"Illegal pair_style command");
+    iarg++;
+  }
+
+  // delete old tables, since cannot just change settings
+
+  for (int m = 0; m < ntables; m++) free_table(&tables[m]);
+  memory->sfree(tables);
+
+  if (allocated) {
+    memory->destroy(setflag);
+    // tabindex and cutsq are reset by assigning default-constructed views rather than memory->destroy()
+    d_table_const.tabindex = d_table->tabindex = typename ArrayTypes<DeviceType>::t_int_2d();
+    h_table->tabindex = typename ArrayTypes<LMPHostType>::t_int_2d();
+
+    d_table_const.cutsq = d_table->cutsq = typename ArrayTypes<DeviceType>::t_ffloat_2d();
+    h_table->cutsq = typename ArrayTypes<LMPHostType>::t_ffloat_2d();
+  }
+  allocated = 0;
+
+  ntables = 0;
+  tables = NULL;
+}
+
+/* ----------------------------------------------------------------------
+   init for one type pair i,j and corresponding j,i
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+double PairTableRXKokkos<DeviceType>::init_one(int i, int j)  // returns the cutoff of the table assigned to (i,j)
+{
+  if (setflag[i][j] == 0) error->all(FLERR,"All pair coeffs are not set");
+
+  tabindex[j][i] = tabindex[i][j];  // mirror the table assignment
+
+  if(i<MAX_TYPES_STACKPARAMS+1 && j<MAX_TYPES_STACKPARAMS+1) {  // m_cutsq is filled only for type indices within the stack-parameter limit
+    m_cutsq[j][i] = m_cutsq[i][j] = tables[tabindex[i][j]].cut*tables[tabindex[i][j]].cut;
+  }
+
+  return tables[tabindex[i][j]].cut;
+}
+
+template<class DeviceType>
+void PairTableRXKokkos<DeviceType>::init_style()
+{
+  neighbor->request(this,instance_me);
+  neighflag = lmp->kokkos->neighflag;
+  int irequest = neighbor->nrequest - 1;  // last entry of neighbor->requests, presumably the one requested above
+  // mark the request as host-side or device-side depending on DeviceType
+  neighbor->requests[irequest]->
+    kokkos_host = Kokkos::Impl::is_same<DeviceType,LMPHostType>::value &&
+    !Kokkos::Impl::is_same<DeviceType,LMPDeviceType>::value;
+  neighbor->requests[irequest]->
+    kokkos_device = Kokkos::Impl::is_same<DeviceType,LMPDeviceType>::value;
+  // map neighflag to request flags; NOTE(review): the error text names lj/cut/kk, not table/rx/kk
+  if (neighflag == FULL) {
+    neighbor->requests[irequest]->full = 1;
+    neighbor->requests[irequest]->half = 0;
+    neighbor->requests[irequest]->full_cluster = 0;
+  } else if (neighflag == HALF || neighflag == HALFTHREAD) {
+    neighbor->requests[irequest]->full = 0;
+    neighbor->requests[irequest]->half = 1;
+    neighbor->requests[irequest]->full_cluster = 0;
+  } else if (neighflag == N2) {
+    neighbor->requests[irequest]->full = 0;
+    neighbor->requests[irequest]->half = 0;
+    neighbor->requests[irequest]->full_cluster = 0;
+  } else if (neighflag == FULLCLUSTER) {
+    neighbor->requests[irequest]->full_cluster = 1;
+    neighbor->requests[irequest]->full = 1;
+    neighbor->requests[irequest]->half = 0;
+  } else {
+    error->all(FLERR,"Cannot use chosen neighbor list style with lj/cut/kk");
+  }
+}
+
+/*
+template <class DeviceType> template<int NEIGHFLAG>
+KOKKOS_INLINE_FUNCTION
+void PairTableRXKokkos<DeviceType>::
+ev_tally(EV_FLOAT &ev, const int &i, const int &j, const F_FLOAT &fpair,
+         const F_FLOAT &delx, const F_FLOAT &dely, const F_FLOAT &delz) const
+{
+  const int EFLAG = eflag;
+  const int NEWTON_PAIR = newton_pair;
+  const int VFLAG = vflag_either;
+
+  if (EFLAG) {
+    if (eflag_atom) {
+      E_FLOAT epairhalf = 0.5 * (ev.evdwl + ev.ecoul);
+      if (NEWTON_PAIR || i < nlocal) eatom[i] += epairhalf;
+      if (NEWTON_PAIR || j < nlocal) eatom[j] += epairhalf;
+    }
+  }
+
+  if (VFLAG) {
+    const E_FLOAT v0 = delx*delx*fpair;
+    const E_FLOAT v1 = dely*dely*fpair;
+    const E_FLOAT v2 = delz*delz*fpair;
+    const E_FLOAT v3 = delx*dely*fpair;
+    const E_FLOAT v4 = delx*delz*fpair;
+    const E_FLOAT v5 = dely*delz*fpair;
+
+    if (vflag_global) {
+      if (NEIGHFLAG) {
+        if (NEWTON_PAIR) {
+          ev.v[0] += v0;
+          ev.v[1] += v1;
+          ev.v[2] += v2;
+          ev.v[3] += v3;
+          ev.v[4] += v4;
+          ev.v[5] += v5;
+        } else {
+          if (i < nlocal) {
+            ev.v[0] += 0.5*v0;
+            ev.v[1] += 0.5*v1;
+            ev.v[2] += 0.5*v2;
+            ev.v[3] += 0.5*v3;
+            ev.v[4] += 0.5*v4;
+            ev.v[5] += 0.5*v5;
+          }
+          if (j < nlocal) {
+            ev.v[0] += 0.5*v0;
+            ev.v[1] += 0.5*v1;
+            ev.v[2] += 0.5*v2;
+            ev.v[3] += 0.5*v3;
+            ev.v[4] += 0.5*v4;
+            ev.v[5] += 0.5*v5;
+          }
+        }
+      } else {
+        ev.v[0] += 0.5*v0;
+        ev.v[1] += 0.5*v1;
+        ev.v[2] += 0.5*v2;
+        ev.v[3] += 0.5*v3;
+        ev.v[4] += 0.5*v4;
+        ev.v[5] += 0.5*v5;
+      }
+    }
+
+    if (vflag_atom) {
+      if (NEWTON_PAIR || i < nlocal) {
+        d_vatom(i,0) += 0.5*v0;
+        d_vatom(i,1) += 0.5*v1;
+        d_vatom(i,2) += 0.5*v2;
+        d_vatom(i,3) += 0.5*v3;
+        d_vatom(i,4) += 0.5*v4;
+        d_vatom(i,5) += 0.5*v5;
+      }
+      if (NEWTON_PAIR || (NEIGHFLAG && j < nlocal)) {
+        d_vatom(j,0) += 0.5*v0;
+        d_vatom(j,1) += 0.5*v1;
+        d_vatom(j,2) += 0.5*v2;
+        d_vatom(j,3) += 0.5*v3;
+        d_vatom(j,4) += 0.5*v4;
+        d_vatom(j,5) += 0.5*v5;
+      }
+    }
+  }
+}
+*/
+template<class DeviceType>
+void PairTableRXKokkos<DeviceType>::cleanup_copy() {
+  // Why this is needed: it prevents the parent copy from deallocating any arrays.
+  allocated = 0;  // the flag tested elsewhere in this file before setflag is destroyed
+  cutsq = NULL;
+  eatom = NULL;
+  vatom = NULL;
+  h_table=NULL; d_table=NULL;  // raw TableHost*/TableDevice* members: only nulled here, not deleted
+}
+
+namespace LAMMPS_NS {
+template class PairTableRXKokkos<LMPDeviceType>;  // explicit instantiation for LMPDeviceType, always compiled
+#ifdef KOKKOS_HAVE_CUDA
+template class PairTableRXKokkos<LMPHostType>;  // extra host-type instantiation, only when KOKKOS_HAVE_CUDA is defined
+#endif
+
+}
+
diff --git a/src/KOKKOS/pair_table_rx_kokkos.h b/src/KOKKOS/pair_table_rx_kokkos.h
new file mode 100644
index 0000000000..6f0616cc28
--- /dev/null
+++ b/src/KOKKOS/pair_table_rx_kokkos.h
@@ -0,0 +1,269 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(table/rx/kk,PairTableRXKokkos<LMPDeviceType>)
+PairStyle(table/rx/kk/device,PairTableRXKokkos<LMPDeviceType>)
+PairStyle(table/rx/kk/host,PairTableRXKokkos<LMPHostType>)
+
+#else
+
+#ifndef LMP_PAIR_TABLE_RX_KOKKOS_H
+#define LMP_PAIR_TABLE_RX_KOKKOS_H
+
+#include "pair_table_rx.h"
+#include "pair_kokkos.h"
+#include "neigh_list_kokkos.h"
+#include "atom_kokkos.h"
+
+namespace LAMMPS_NS {
+
+template<class Device,int TABSTYLE>
+struct S_TableRXCompute {  // empty tag type; passed as the last PairComputeFunctor argument in the friend list below
+  enum {TabStyle = TABSTYLE};  // re-exports the TABSTYLE template argument as a nested constant
+};
+
+template <class DeviceType, int NEIGHFLAG, int TABSTYLE>
+class PairTableRXComputeFunctor;
+
+template<class DeviceType>  // Kokkos-enabled subclass of PairTableRX, parameterized by DeviceType
+class PairTableRXKokkos : public PairTableRX {
+ public:
+
+  enum {EnabledNeighFlags=FULL|HALFTHREAD|HALF|N2|FULLCLUSTER};
+  enum {COUL_FLAG=0};
+  typedef DeviceType device_type;
+
+  PairTableRXKokkos(class LAMMPS *);
+  virtual ~PairTableRXKokkos();
+
+  virtual void compute(int, int);
+  // TABSTYLE: compile-time table style, presumably one of LOOKUP/LINEAR/SPLINE/BITMAP below
+  template<int TABSTYLE>
+  void compute_style(int, int);
+
+  /*template<int EVFLAG, int NEIGHFLAG, int NEWTON_PAIR, int TABSTYLE>
+  KOKKOS_FUNCTION
+  EV_FLOAT compute_item(const int& i,
+                        const NeighListKokkos<DeviceType> &list) const;
+*/
+  void settings(int, char **);
+  double init_one(int, int);
+  void init_style();
+
+ protected:
+  enum{LOOKUP,LINEAR,SPLINE,BITMAP};
+
+  int tabstyle,tablength;
+  /*struct TableDeviceConst {
+    typename ArrayTypes<DeviceType>::t_ffloat_2d_randomread cutsq;
+    typename ArrayTypes<DeviceType>::t_int_2d_randomread tabindex;
+    typename ArrayTypes<DeviceType>::t_int_1d_randomread nshiftbits,nmask;
+    typename ArrayTypes<DeviceType>::t_ffloat_1d_randomread innersq,invdelta,deltasq6;
+    typename ArrayTypes<DeviceType>::t_ffloat_2d_randomread rsq,drsq,e,de,f,df,e2,f2;
+  };*/
+ // It is faster not to use texture fetch if the number of tables is less than 32!
+  struct TableDeviceConst {
+    typename ArrayTypes<DeviceType>::t_ffloat_2d cutsq;
+    typename ArrayTypes<DeviceType>::t_int_2d tabindex;
+    typename ArrayTypes<DeviceType>::t_int_1d nshiftbits,nmask;
+    typename ArrayTypes<DeviceType>::t_ffloat_1d innersq,invdelta,deltasq6;
+    typename ArrayTypes<DeviceType>::t_ffloat_2d_randomread rsq,drsq,e,de,f,df,e2,f2;
+  };
+  // TableDevice: same members as TableDeviceConst, but the last row uses plain t_ffloat_2d views
+  struct TableDevice {
+    typename ArrayTypes<DeviceType>::t_ffloat_2d cutsq;
+    typename ArrayTypes<DeviceType>::t_int_2d tabindex;
+    typename ArrayTypes<DeviceType>::t_int_1d nshiftbits,nmask;
+    typename ArrayTypes<DeviceType>::t_ffloat_1d innersq,invdelta,deltasq6;
+    typename ArrayTypes<DeviceType>::t_ffloat_2d rsq,drsq,e,de,f,df,e2,f2;
+  };
+  // TableHost: the same members declared with LMPHostType view types
+  struct TableHost {
+    typename ArrayTypes<LMPHostType>::t_ffloat_2d cutsq;
+    typename ArrayTypes<LMPHostType>::t_int_2d tabindex;
+    typename ArrayTypes<LMPHostType>::t_int_1d nshiftbits,nmask;
+    typename ArrayTypes<LMPHostType>::t_ffloat_1d innersq,invdelta,deltasq6;
+    typename ArrayTypes<LMPHostType>::t_ffloat_2d rsq,drsq,e,de,f,df,e2,f2;
+  };
+  // d_table_const is stored by value; d_table and h_table are raw pointers
+  TableDeviceConst d_table_const;
+  TableDevice* d_table;
+  TableHost* h_table;
+  // m_cutsq is a fixed-size array; init_one() fills it only for type indices <= MAX_TYPES_STACKPARAMS
+  int **tabindex;
+  F_FLOAT m_cutsq[MAX_TYPES_STACKPARAMS+1][MAX_TYPES_STACKPARAMS+1];
+
+  typename ArrayTypes<DeviceType>::t_ffloat_2d d_cutsq;
+
+  void allocate();
+
+  typename ArrayTypes<DeviceType>::t_x_array_randomread x;
+  typename ArrayTypes<DeviceType>::t_x_array_const c_x;
+  typename ArrayTypes<DeviceType>::t_f_array f;
+  typename ArrayTypes<DeviceType>::t_int_1d_randomread type;
+  typename ArrayTypes<DeviceType>::t_efloat_1d d_eatom;
+  typename ArrayTypes<DeviceType>::t_virial_array d_vatom;
+  // NOTE(review): the protected: label below is redundant, access is already protected at this point
+ protected:
+  int nlocal,nall,eflag,vflag,neighflag,newton_pair;
+
+  int update_table;
+  void create_kokkos_tables();
+  void cleanup_copy();
+
+  template<bool STACKPARAMS, class Specialisation>
+  KOKKOS_INLINE_FUNCTION
+  F_FLOAT compute_fpair(const F_FLOAT& rsq, const int& i, const int&j, const int& itype, const int& jtype) const;
+
+  template<bool STACKPARAMS, class Specialisation>
+  KOKKOS_INLINE_FUNCTION
+  F_FLOAT compute_evdwl(const F_FLOAT& rsq, const int& i, const int&j, const int& itype, const int& jtype) const;
+  // defined inline and always returns 0 (COUL_FLAG above is also 0)
+  template<bool STACKPARAMS, class Specialisation>
+  KOKKOS_INLINE_FUNCTION
+  F_FLOAT compute_ecoul(const F_FLOAT& rsq, const int& i, const int&j, const int& itype, const int& jtype) const {
+    return 0;
+  }
+  // friend declarations: one per neighbor flag x bool x table style, plus pair_virial_fdotr_compute
+  friend class PairComputeFunctor<PairTableRXKokkos,FULL,true,S_TableRXCompute<DeviceType,LOOKUP> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,HALF,true,S_TableRXCompute<DeviceType,LOOKUP> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,HALFTHREAD,true,S_TableRXCompute<DeviceType,LOOKUP> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,N2,true,S_TableRXCompute<DeviceType,LOOKUP> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,FULLCLUSTER,true,S_TableRXCompute<DeviceType,LOOKUP> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,FULL,false,S_TableRXCompute<DeviceType,LOOKUP> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,HALF,false,S_TableRXCompute<DeviceType,LOOKUP> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,HALFTHREAD,false,S_TableRXCompute<DeviceType,LOOKUP> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,N2,false,S_TableRXCompute<DeviceType,LOOKUP> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,FULLCLUSTER,false,S_TableRXCompute<DeviceType,LOOKUP> >;
+
+  friend class PairComputeFunctor<PairTableRXKokkos,FULL,true,S_TableRXCompute<DeviceType,LINEAR> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,HALF,true,S_TableRXCompute<DeviceType,LINEAR> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,HALFTHREAD,true,S_TableRXCompute<DeviceType,LINEAR> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,N2,true,S_TableRXCompute<DeviceType,LINEAR> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,FULLCLUSTER,true,S_TableRXCompute<DeviceType,LINEAR> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,FULL,false,S_TableRXCompute<DeviceType,LINEAR> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,HALF,false,S_TableRXCompute<DeviceType,LINEAR> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,HALFTHREAD,false,S_TableRXCompute<DeviceType,LINEAR> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,N2,false,S_TableRXCompute<DeviceType,LINEAR> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,FULLCLUSTER,false,S_TableRXCompute<DeviceType,LINEAR> >;
+
+  friend class PairComputeFunctor<PairTableRXKokkos,FULL,true,S_TableRXCompute<DeviceType,SPLINE> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,HALF,true,S_TableRXCompute<DeviceType,SPLINE> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,HALFTHREAD,true,S_TableRXCompute<DeviceType,SPLINE> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,N2,true,S_TableRXCompute<DeviceType,SPLINE> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,FULLCLUSTER,true,S_TableRXCompute<DeviceType,SPLINE> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,FULL,false,S_TableRXCompute<DeviceType,SPLINE> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,HALF,false,S_TableRXCompute<DeviceType,SPLINE> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,HALFTHREAD,false,S_TableRXCompute<DeviceType,SPLINE> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,N2,false,S_TableRXCompute<DeviceType,SPLINE> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,FULLCLUSTER,false,S_TableRXCompute<DeviceType,SPLINE> >;
+
+  friend class PairComputeFunctor<PairTableRXKokkos,FULL,true,S_TableRXCompute<DeviceType,BITMAP> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,HALF,true,S_TableRXCompute<DeviceType,BITMAP> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,HALFTHREAD,true,S_TableRXCompute<DeviceType,BITMAP> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,N2,true,S_TableRXCompute<DeviceType,BITMAP> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,FULLCLUSTER,true,S_TableRXCompute<DeviceType,BITMAP> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,FULL,false,S_TableRXCompute<DeviceType,BITMAP> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,HALF,false,S_TableRXCompute<DeviceType,BITMAP> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,HALFTHREAD,false,S_TableRXCompute<DeviceType,BITMAP> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,N2,false,S_TableRXCompute<DeviceType,BITMAP> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,FULLCLUSTER,false,S_TableRXCompute<DeviceType,BITMAP> >;
+
+  friend void pair_virial_fdotr_compute<PairTableRXKokkos>(PairTableRXKokkos*);
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Pair distance < table inner cutoff
+
+Two atoms are closer together than the pairwise table allows.
+
+E: Pair distance > table outer cutoff
+
+Two atoms are further apart than the pairwise table allows.
+
+E: Illegal ... command
+
+Self-explanatory.  Check the input script syntax and compare to the
+documentation for the command.  You can use -echo screen as a
+command-line option when running LAMMPS to see the offending line.
+
+E: Unknown table style in pair_style command
+
+Style of table is invalid for use with pair_style table command.
+
+E: Illegal number of pair table entries
+
+There must be at least 2 table entries.
+
+E: Invalid pair table length
+
+Length of read-in pair table is invalid.
+
+E: Invalid pair table cutoff
+
+Cutoffs in pair_coeff command are not valid with read-in pair table.
+
+E: Bitmapped table in file does not match requested table
+
+Setting for bitmapped table in pair_coeff command must match table
+in file exactly.
+
+E: All pair coeffs are not set
+
+All pair coefficients must be set in the data file or by the
+pair_coeff command before running a simulation.
+
+E: Cannot open file %s
+
+The specified file cannot be opened.  Check that the path and name are
+correct. If the file is a compressed file, also check that the gzip
+executable can be found and run.
+
+E: Did not find keyword in table file
+
+Keyword used in pair_coeff command was not found in table file.
+
+E: Bitmapped table is incorrect length in table file
+
+Number of table entries is not a correct power of 2.
+
+E: Invalid keyword in pair table parameters
+
+Keyword used in list of table parameters is not recognized.
+
+E: Pair table parameters did not set N
+
+List of pair table parameters must include N setting.
+
+E: Pair table cutoffs must all be equal to use with KSpace
+
+When using pair style table with a long-range KSpace solver, the
+cutoffs for all atom type pairs must all be the same, since the
+long-range solver starts at that cutoff.
+
+E: Cannot use chosen neighbor list style with lj/cut/kk
+
+That style is not supported by Kokkos.
+
+
+
+
+*/
\ No newline at end of file
diff --git a/src/USER-DPD/pair_multi_lucy.h b/src/USER-DPD/pair_multi_lucy.h
index f3c67e4fa4..0a2d2f9885 100644
--- a/src/USER-DPD/pair_multi_lucy.h
+++ b/src/USER-DPD/pair_multi_lucy.h
@@ -18,7 +18,7 @@ PairStyle(multi/lucy,PairMultiLucy)
 #else
 
 #ifndef LMP_PAIR_MULTI_LUCY_H
-#define LMP_PAIR_MUTLI_LUCY_H
+#define LMP_PAIR_MULTI_LUCY_H
 
 #include "pair.h"
 
diff --git a/src/USER-DPD/pair_multi_lucy_rx.cpp b/src/USER-DPD/pair_multi_lucy_rx.cpp
index cd107f1519..431293e823 100644
--- a/src/USER-DPD/pair_multi_lucy_rx.cpp
+++ b/src/USER-DPD/pair_multi_lucy_rx.cpp
@@ -59,8 +59,7 @@ static const char cite_pair_multi_lucy_rx[] =
 
 /* ---------------------------------------------------------------------- */
 
-PairMultiLucyRX::PairMultiLucyRX(LAMMPS *lmp) : Pair(lmp),
-  ntables(0), tables(NULL), tabindex(NULL), site1(NULL), site2(NULL)
+PairMultiLucyRX::PairMultiLucyRX(LAMMPS *lmp) : Pair(lmp)
 {
   if (lmp->citeme) lmp->citeme->add(cite_pair_multi_lucy_rx);
 
@@ -69,6 +68,9 @@ PairMultiLucyRX::PairMultiLucyRX(LAMMPS *lmp) : Pair(lmp),
   ntables = 0;
   tables = NULL;
 
+  tabindex = NULL;
+  site1 = site2 = NULL;
+
   comm_forward = 1;
   comm_reverse = 1;
 
diff --git a/src/USER-DPD/pair_multi_lucy_rx.h b/src/USER-DPD/pair_multi_lucy_rx.h
index 596a6c684d..2913716c5a 100644
--- a/src/USER-DPD/pair_multi_lucy_rx.h
+++ b/src/USER-DPD/pair_multi_lucy_rx.h
@@ -18,7 +18,7 @@ PairStyle(multi/lucy/rx,PairMultiLucyRX)
 #else
 
 #ifndef LMP_PAIR_MULTI_LUCY_RX_H
-#define LMP_PAIR_MUTLI_LUCY_RX_H
+#define LMP_PAIR_MULTI_LUCY_RX_H
 
 #include "pair.h"
 
diff --git a/src/pair_table.h b/src/pair_table.h
index 6cfd9df832..358491f7cf 100644
--- a/src/pair_table.h
+++ b/src/pair_table.h
@@ -30,9 +30,9 @@ class PairTable : public Pair {
   virtual ~PairTable();
 
   virtual void compute(int, int);
-  void settings(int, char **);
+  virtual void settings(int, char **);
   void coeff(int, char **);
-  double init_one(int, int);
+  virtual double init_one(int, int);
   void write_restart(FILE *);
   void read_restart(FILE *);
   void write_restart_settings(FILE *);
@@ -58,7 +58,7 @@ class PairTable : public Pair {
 
   int **tabindex;
 
-  void allocate();
+  virtual void allocate();
   void read_table(Table *, char *, char *);
   void param_extract(Table *, char *);
   void bcast_table(Table *);

From f93c62d3e20301ea53e41a5a6f50b8aa9957d942 Mon Sep 17 00:00:00 2001
From: Stan Moore <stanmoore1@gmail.com>
Date: Tue, 13 Dec 2016 16:54:00 -0700
Subject: [PATCH 008/267] Reverting accidental change

---
 src/USER-DPD/pair_multi_lucy_rx.cpp | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/USER-DPD/pair_multi_lucy_rx.cpp b/src/USER-DPD/pair_multi_lucy_rx.cpp
index 431293e823..cd107f1519 100644
--- a/src/USER-DPD/pair_multi_lucy_rx.cpp
+++ b/src/USER-DPD/pair_multi_lucy_rx.cpp
@@ -59,7 +59,8 @@ static const char cite_pair_multi_lucy_rx[] =
 
 /* ---------------------------------------------------------------------- */
 
-PairMultiLucyRX::PairMultiLucyRX(LAMMPS *lmp) : Pair(lmp)
+PairMultiLucyRX::PairMultiLucyRX(LAMMPS *lmp) : Pair(lmp),
+  ntables(0), tables(NULL), tabindex(NULL), site1(NULL), site2(NULL)
 {
   if (lmp->citeme) lmp->citeme->add(cite_pair_multi_lucy_rx);
 
@@ -68,9 +69,6 @@ PairMultiLucyRX::PairMultiLucyRX(LAMMPS *lmp) : Pair(lmp)
   ntables = 0;
   tables = NULL;
 
-  tabindex = NULL;
-  site1 = site2 = NULL;
-
   comm_forward = 1;
   comm_reverse = 1;
 

From 3e2cd6d265db7bbbe97cc2dc00977cead964e67c Mon Sep 17 00:00:00 2001
From: Stan Moore <stanmoore1@gmail.com>
Date: Wed, 14 Dec 2016 11:46:04 -0700
Subject: [PATCH 009/267] Merging from master to 13Dec16 version

---
 src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp |  1 -
 src/KOKKOS/pair_exp6_rx_kokkos.cpp        |  2 --
 src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp  |  2 --
 src/KOKKOS/pair_table_rx_kokkos.cpp       | 35 +----------------------
 src/KOKKOS/pair_table_rx_kokkos.h         | 10 +------
 src/KOKKOS/pair_vashishta_kokkos.cpp      |  1 -
 src/neigh_request.cpp                     |  1 -
 src/neigh_request.h                       |  1 -
 8 files changed, 2 insertions(+), 51 deletions(-)

diff --git a/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp
index f7e1fecc09..45da5bf165 100644
--- a/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp
+++ b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp
@@ -317,7 +317,6 @@ void PairDPDfdtEnergyKokkos<DeviceType>::init_style()
   if (neighflag == HALF || neighflag == HALFTHREAD) {
     neighbor->requests[irequest]->full = 0;
     neighbor->requests[irequest]->half = 1;
-    neighbor->requests[irequest]->full_cluster = 0;
   } else {
     error->all(FLERR,"Cannot use chosen neighbor list style with dpd/fdt/energy/kk");
   }
diff --git a/src/KOKKOS/pair_exp6_rx_kokkos.cpp b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
index a7d5569537..569d131af7 100644
--- a/src/KOKKOS/pair_exp6_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
@@ -87,12 +87,10 @@ void PairExp6rxKokkos<DeviceType>::init_style()
   if (neighflag == FULL) {
     neighbor->requests[irequest]->full = 1;
     neighbor->requests[irequest]->half = 0;
-    neighbor->requests[irequest]->full_cluster = 0;
     neighbor->requests[irequest]->ghost = 1;
   } else if (neighflag == HALF || neighflag == HALFTHREAD) {
     neighbor->requests[irequest]->full = 0;
     neighbor->requests[irequest]->half = 1;
-    neighbor->requests[irequest]->full_cluster = 0;
     neighbor->requests[irequest]->ghost = 1;
   } else {
     error->all(FLERR,"Cannot use chosen neighbor list style with reax/c/kk");
diff --git a/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp b/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp
index de70ae86f5..d1a13b12fd 100644
--- a/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp
@@ -101,12 +101,10 @@ void PairMultiLucyRXKokkos<DeviceType>::init_style()
   if (neighflag == FULL) {
     neighbor->requests[irequest]->full = 1;
     neighbor->requests[irequest]->half = 0;
-    neighbor->requests[irequest]->full_cluster = 0;
     neighbor->requests[irequest]->ghost = 1;
   } else if (neighflag == HALF || neighflag == HALFTHREAD) {
     neighbor->requests[irequest]->full = 0;
     neighbor->requests[irequest]->half = 1;
-    neighbor->requests[irequest]->full_cluster = 0;
     neighbor->requests[irequest]->ghost = 1;
   } else {
     error->all(FLERR,"Cannot use chosen neighbor list style with reax/c/kk");
diff --git a/src/KOKKOS/pair_table_rx_kokkos.cpp b/src/KOKKOS/pair_table_rx_kokkos.cpp
index 4c809d98bd..bf32d1c14f 100644
--- a/src/KOKKOS/pair_table_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_table_rx_kokkos.cpp
@@ -96,7 +96,7 @@ void PairTableRXKokkos<DeviceType>::compute_style(int eflag_in, int vflag_in)
   eflag = eflag_in;
   vflag = vflag_in;
 
-  if (neighflag == FULL || neighflag == FULLCLUSTER) no_virial_fdotr_compute = 1;
+  if (neighflag == FULL) no_virial_fdotr_compute = 1;
 
 
   if (eflag || vflag) ev_setup(eflag,vflag);
@@ -143,19 +143,6 @@ void PairTableRXKokkos<DeviceType>::compute_style(int eflag_in, int vflag_in)
         f(this,(NeighListKokkos<DeviceType>*) list);
       if (eflag || vflag) Kokkos::parallel_reduce(nlocal,f,ev);
       else Kokkos::parallel_for(nlocal,f);
-    } else if (neighflag == FULLCLUSTER) {
-      typedef PairComputeFunctor<PairTableRXKokkos<DeviceType>,FULLCLUSTER,false,S_TableRXCompute<DeviceType,TABSTYLE> >
-        f_type;
-      f_type f(this,(NeighListKokkos<DeviceType>*) list);
-      #ifdef KOKKOS_HAVE_CUDA
-        const int teamsize = Kokkos::Impl::is_same<DeviceType, Kokkos::Cuda>::value ? 32 : 1;
-      #else
-        const int teamsize = 1;
-      #endif
-      const int nteams = (list->inum*+teamsize-1)/teamsize;
-      Kokkos::TeamPolicy<DeviceType> config(nteams,teamsize,NeighClusterSize);
-      if (eflag || vflag) Kokkos::parallel_reduce(config,f,ev);
-      else Kokkos::parallel_for(config,f);
     }
   } else {
     if (neighflag == FULL) {
@@ -178,19 +165,6 @@ void PairTableRXKokkos<DeviceType>::compute_style(int eflag_in, int vflag_in)
         f(this,(NeighListKokkos<DeviceType>*) list);
       if (eflag || vflag) Kokkos::parallel_reduce(nlocal,f,ev);
       else Kokkos::parallel_for(nlocal,f);
-    } else if (neighflag == FULLCLUSTER) {
-      typedef PairComputeFunctor<PairTableRXKokkos<DeviceType>,FULLCLUSTER,true,S_TableRXCompute<DeviceType,TABSTYLE> >
-        f_type;
-      f_type f(this,(NeighListKokkos<DeviceType>*) list);
-      #ifdef KOKKOS_HAVE_CUDA
-        const int teamsize = Kokkos::Impl::is_same<DeviceType, Kokkos::Cuda>::value ? 32 : 1;
-      #else
-        const int teamsize = 1;
-      #endif
-      const int nteams = (list->inum*+teamsize-1)/teamsize;
-      Kokkos::TeamPolicy<DeviceType> config(nteams,teamsize,NeighClusterSize);
-      if (eflag || vflag) Kokkos::parallel_reduce(config,f,ev);
-      else Kokkos::parallel_for(config,f);
     }
   }
 
@@ -511,19 +485,12 @@ void PairTableRXKokkos<DeviceType>::init_style()
   if (neighflag == FULL) {
     neighbor->requests[irequest]->full = 1;
     neighbor->requests[irequest]->half = 0;
-    neighbor->requests[irequest]->full_cluster = 0;
   } else if (neighflag == HALF || neighflag == HALFTHREAD) {
     neighbor->requests[irequest]->full = 0;
     neighbor->requests[irequest]->half = 1;
-    neighbor->requests[irequest]->full_cluster = 0;
   } else if (neighflag == N2) {
     neighbor->requests[irequest]->full = 0;
     neighbor->requests[irequest]->half = 0;
-    neighbor->requests[irequest]->full_cluster = 0;
-  } else if (neighflag == FULLCLUSTER) {
-    neighbor->requests[irequest]->full_cluster = 1;
-    neighbor->requests[irequest]->full = 1;
-    neighbor->requests[irequest]->half = 0;
   } else {
     error->all(FLERR,"Cannot use chosen neighbor list style with lj/cut/kk");
   }
diff --git a/src/KOKKOS/pair_table_rx_kokkos.h b/src/KOKKOS/pair_table_rx_kokkos.h
index 6f0616cc28..b379901201 100644
--- a/src/KOKKOS/pair_table_rx_kokkos.h
+++ b/src/KOKKOS/pair_table_rx_kokkos.h
@@ -41,7 +41,7 @@ template<class DeviceType>
 class PairTableRXKokkos : public PairTableRX {
  public:
 
-  enum {EnabledNeighFlags=FULL|HALFTHREAD|HALF|N2|FULLCLUSTER};
+  enum {EnabledNeighFlags=FULL|HALFTHREAD|HALF|N2};
   enum {COUL_FLAG=0};
   typedef DeviceType device_type;
 
@@ -141,45 +141,37 @@ class PairTableRXKokkos : public PairTableRX {
   friend class PairComputeFunctor<PairTableRXKokkos,HALF,true,S_TableRXCompute<DeviceType,LOOKUP> >;
   friend class PairComputeFunctor<PairTableRXKokkos,HALFTHREAD,true,S_TableRXCompute<DeviceType,LOOKUP> >;
   friend class PairComputeFunctor<PairTableRXKokkos,N2,true,S_TableRXCompute<DeviceType,LOOKUP> >;
-  friend class PairComputeFunctor<PairTableRXKokkos,FULLCLUSTER,true,S_TableRXCompute<DeviceType,LOOKUP> >;
   friend class PairComputeFunctor<PairTableRXKokkos,FULL,false,S_TableRXCompute<DeviceType,LOOKUP> >;
   friend class PairComputeFunctor<PairTableRXKokkos,HALF,false,S_TableRXCompute<DeviceType,LOOKUP> >;
   friend class PairComputeFunctor<PairTableRXKokkos,HALFTHREAD,false,S_TableRXCompute<DeviceType,LOOKUP> >;
   friend class PairComputeFunctor<PairTableRXKokkos,N2,false,S_TableRXCompute<DeviceType,LOOKUP> >;
-  friend class PairComputeFunctor<PairTableRXKokkos,FULLCLUSTER,false,S_TableRXCompute<DeviceType,LOOKUP> >;
 
   friend class PairComputeFunctor<PairTableRXKokkos,FULL,true,S_TableRXCompute<DeviceType,LINEAR> >;
   friend class PairComputeFunctor<PairTableRXKokkos,HALF,true,S_TableRXCompute<DeviceType,LINEAR> >;
   friend class PairComputeFunctor<PairTableRXKokkos,HALFTHREAD,true,S_TableRXCompute<DeviceType,LINEAR> >;
   friend class PairComputeFunctor<PairTableRXKokkos,N2,true,S_TableRXCompute<DeviceType,LINEAR> >;
-  friend class PairComputeFunctor<PairTableRXKokkos,FULLCLUSTER,true,S_TableRXCompute<DeviceType,LINEAR> >;
   friend class PairComputeFunctor<PairTableRXKokkos,FULL,false,S_TableRXCompute<DeviceType,LINEAR> >;
   friend class PairComputeFunctor<PairTableRXKokkos,HALF,false,S_TableRXCompute<DeviceType,LINEAR> >;
   friend class PairComputeFunctor<PairTableRXKokkos,HALFTHREAD,false,S_TableRXCompute<DeviceType,LINEAR> >;
   friend class PairComputeFunctor<PairTableRXKokkos,N2,false,S_TableRXCompute<DeviceType,LINEAR> >;
-  friend class PairComputeFunctor<PairTableRXKokkos,FULLCLUSTER,false,S_TableRXCompute<DeviceType,LINEAR> >;
 
   friend class PairComputeFunctor<PairTableRXKokkos,FULL,true,S_TableRXCompute<DeviceType,SPLINE> >;
   friend class PairComputeFunctor<PairTableRXKokkos,HALF,true,S_TableRXCompute<DeviceType,SPLINE> >;
   friend class PairComputeFunctor<PairTableRXKokkos,HALFTHREAD,true,S_TableRXCompute<DeviceType,SPLINE> >;
   friend class PairComputeFunctor<PairTableRXKokkos,N2,true,S_TableRXCompute<DeviceType,SPLINE> >;
-  friend class PairComputeFunctor<PairTableRXKokkos,FULLCLUSTER,true,S_TableRXCompute<DeviceType,SPLINE> >;
   friend class PairComputeFunctor<PairTableRXKokkos,FULL,false,S_TableRXCompute<DeviceType,SPLINE> >;
   friend class PairComputeFunctor<PairTableRXKokkos,HALF,false,S_TableRXCompute<DeviceType,SPLINE> >;
   friend class PairComputeFunctor<PairTableRXKokkos,HALFTHREAD,false,S_TableRXCompute<DeviceType,SPLINE> >;
   friend class PairComputeFunctor<PairTableRXKokkos,N2,false,S_TableRXCompute<DeviceType,SPLINE> >;
-  friend class PairComputeFunctor<PairTableRXKokkos,FULLCLUSTER,false,S_TableRXCompute<DeviceType,SPLINE> >;
 
   friend class PairComputeFunctor<PairTableRXKokkos,FULL,true,S_TableRXCompute<DeviceType,BITMAP> >;
   friend class PairComputeFunctor<PairTableRXKokkos,HALF,true,S_TableRXCompute<DeviceType,BITMAP> >;
   friend class PairComputeFunctor<PairTableRXKokkos,HALFTHREAD,true,S_TableRXCompute<DeviceType,BITMAP> >;
   friend class PairComputeFunctor<PairTableRXKokkos,N2,true,S_TableRXCompute<DeviceType,BITMAP> >;
-  friend class PairComputeFunctor<PairTableRXKokkos,FULLCLUSTER,true,S_TableRXCompute<DeviceType,BITMAP> >;
   friend class PairComputeFunctor<PairTableRXKokkos,FULL,false,S_TableRXCompute<DeviceType,BITMAP> >;
   friend class PairComputeFunctor<PairTableRXKokkos,HALF,false,S_TableRXCompute<DeviceType,BITMAP> >;
   friend class PairComputeFunctor<PairTableRXKokkos,HALFTHREAD,false,S_TableRXCompute<DeviceType,BITMAP> >;
   friend class PairComputeFunctor<PairTableRXKokkos,N2,false,S_TableRXCompute<DeviceType,BITMAP> >;
-  friend class PairComputeFunctor<PairTableRXKokkos,FULLCLUSTER,false,S_TableRXCompute<DeviceType,BITMAP> >;
 
   friend void pair_virial_fdotr_compute<PairTableRXKokkos>(PairTableRXKokkos*);
 };
diff --git a/src/KOKKOS/pair_vashishta_kokkos.cpp b/src/KOKKOS/pair_vashishta_kokkos.cpp
index 73e4e04f98..bf3b5bae85 100644
--- a/src/KOKKOS/pair_vashishta_kokkos.cpp
+++ b/src/KOKKOS/pair_vashishta_kokkos.cpp
@@ -603,7 +603,6 @@ void PairVashishtaKokkos<DeviceType>::init_style()
   if (neighflag == FULL || neighflag == HALF || neighflag == HALFTHREAD) {
     neighbor->requests[irequest]->full = 1;
     neighbor->requests[irequest]->half = 0;
-    neighbor->requests[irequest]->full_cluster = 0;
     if (neighflag == FULL)
       neighbor->requests[irequest]->ghost = 1;
     else
diff --git a/src/neigh_request.cpp b/src/neigh_request.cpp
index 4a3eb14933..a8ba8496cd 100644
--- a/src/neigh_request.cpp
+++ b/src/neigh_request.cpp
@@ -39,7 +39,6 @@ NeighRequest::NeighRequest(LAMMPS *lmp) : Pointers(lmp)
   gran = granhistory = 0;
   respainner = respamiddle = respaouter = 0;
   half_from_full = 0;
-  full_cluster = 0;
 
   // only set when command = 1;
 
diff --git a/src/neigh_request.h b/src/neigh_request.h
index 0b561710e7..62cb11f830 100644
--- a/src/neigh_request.h
+++ b/src/neigh_request.h
@@ -47,7 +47,6 @@ class NeighRequest : protected Pointers {
   int respainner;        // 1 if a rRESPA inner list
   int respamiddle;       // 1 if a rRESPA middle list
   int respaouter;        // 1 if a rRESPA outer list
-  int full_cluster;      // only used by Kokkos pair styles
 
   // command_style only set if command = 1
   // allows print_pair_info() to access command name

From a9d26b3f4aadbf8e7a7aa91190d513b6a02217d2 Mon Sep 17 00:00:00 2001
From: Stan Moore <stanmoore1@gmail.com>
Date: Wed, 14 Dec 2016 12:58:02 -0700
Subject: [PATCH 010/267] Updates to Kokkos files

---
 src/KOKKOS/fix_eos_table_rx_kokkos.cpp   | 143 ++++++++++++++---------
 src/KOKKOS/fix_eos_table_rx_kokkos.h     |  21 ++++
 src/KOKKOS/pair_exp6_rx_kokkos.cpp       |  16 ++-
 src/KOKKOS/pair_exp6_rx_kokkos.h         |   2 +
 src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp |  33 +++---
 src/KOKKOS/pair_multi_lucy_rx_kokkos.h   |   2 +
 6 files changed, 145 insertions(+), 72 deletions(-)

diff --git a/src/KOKKOS/fix_eos_table_rx_kokkos.cpp b/src/KOKKOS/fix_eos_table_rx_kokkos.cpp
index a1e0b1a07d..faf490fcc0 100644
--- a/src/KOKKOS/fix_eos_table_rx_kokkos.cpp
+++ b/src/KOKKOS/fix_eos_table_rx_kokkos.cpp
@@ -42,6 +42,9 @@ FixEOStableRXKokkos<DeviceType>::FixEOStableRXKokkos(LAMMPS *lmp, int narg, char
   execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
   datamask_read = X_MASK | F_MASK | TYPE_MASK | ENERGY_MASK | VIRIAL_MASK;
   datamask_modify = F_MASK | ENERGY_MASK | VIRIAL_MASK;
+
+  k_error_flag = DAT::tdual_int_scalar("fix:error_flag");
+  k_warning_flag = DAT::tdual_int_scalar("fix:warning_flag");
 }
 
 /* ---------------------------------------------------------------------- */
@@ -65,22 +68,33 @@ void FixEOStableRXKokkos<DeviceType>::setup(int vflag)
   dpdTheta= atomKK->k_dpdTheta.view<DeviceType>();
   uCG = atomKK->k_uCG.view<DeviceType>();
   uCGnew = atomKK->k_uCGnew.view<DeviceType>();
-  double duChem;
 
-  for (int i = 0; i < nlocal; i++) // parallel_for
-    if (mask[i] & groupbit){
-      duChem = uCG[i] - uCGnew[i];
-      uChem[i] += duChem;
-      uCG[i] = 0.0;
-      uCGnew[i] = 0.0;
-    }
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagFixEOStableRXSetup>(0,nlocal),*this);
 
   // Communicate the updated momenta and velocities to all nodes
   comm->forward_comm_fix(this);
 
-  for (int i = 0; i < nlocal; i++) // parallel_for
-    if (mask[i] & groupbit)
-      temperature_lookup(i,uCond[i]+uMech[i]+uChem[i],dpdTheta[i]);
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagFixEOStableRXTemperatureLookup>(0,nlocal),*this);
+
+  error_check();
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void FixEOStableRXKokkos<DeviceType>::operator()(TagFixEOStableRXSetup, const int &i) const {
+  if (mask[i] & groupbit) {
+    const double duChem = uCG[i] - uCGnew[i];
+    uChem[i] += duChem;
+    uCG[i] = 0.0;
+    uCGnew[i] = 0.0;
+  }
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void FixEOStableRXKokkos<DeviceType>::operator()(TagFixEOStableRXTemperatureLookup, const int &i) const {
+  if (mask[i] & groupbit)
+    temperature_lookup(i,uCond[i]+uMech[i]+uChem[i],dpdTheta[i]);
 }
 
 /* ---------------------------------------------------------------------- */
@@ -94,25 +108,28 @@ void FixEOStableRXKokkos<DeviceType>::init()
   uMech = atomKK->k_uMech.view<DeviceType>();
   uChem = atomKK->k_uChem.view<DeviceType>();
   dpdTheta= atomKK->k_dpdTheta.view<DeviceType>();
-  double tmp;
 
-  if(this->restart_reset){
-    for (int i = 0; i < nlocal; i++)
-      if (mask[i] & groupbit)
-        temperature_lookup(i,uCond[i]+uMech[i]+uChem[i],dpdTheta[i]);
-  } else {
-    for (int i = 0; i < nlocal; i++)
-      if (mask[i] & groupbit) {
-        if(dpdTheta[i] <= 0.0)
-          error->one(FLERR,"Internal temperature <= zero");
-        energy_lookup(i,dpdTheta[i],tmp);
-        uCond[i] = tmp / 2.0;
-        uMech[i] = tmp / 2.0;
-        uChem[i] = 0.0;
-      }
-  }
+  if (this->restart_reset)
+    Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagFixEOStableRXTemperatureLookup>(0,nlocal),*this);
+  else
+    Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagFixEOStableRXInit>(0,nlocal),*this);
+
+  error_check();
 }
 
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void FixEOStableRXKokkos<DeviceType>::operator()(TagFixEOStableRXInit, const int &i) const {
+  double tmp;
+  if (mask[i] & groupbit) {
+    if(dpdTheta[i] <= 0.0)
+      k_error_flag.d_view() = 1;
+    energy_lookup(i,dpdTheta[i],tmp);
+    uCond[i] = tmp / 2.0;
+    uMech[i] = tmp / 2.0;
+    uChem[i] = 0.0;
+  }
+}
 
 /* ---------------------------------------------------------------------- */
 
@@ -126,12 +143,19 @@ void FixEOStableRXKokkos<DeviceType>::post_integrate()
   uChem = atomKK->k_uChem.view<DeviceType>();
   dpdTheta= atomKK->k_dpdTheta.view<DeviceType>();
 
-  for (int i = 0; i < nlocal; i++)
-    if (mask[i] & groupbit){
-      temperature_lookup(i,uCond[i]+uMech[i]+uChem[i],dpdTheta[i]);
-      if(dpdTheta[i] <= 0.0)
-        error->one(FLERR,"Internal temperature <= zero");
-    }
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagFixEOStableRXTemperatureLookup2>(0,nlocal),*this);
+
+  error_check();
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void FixEOStableRXKokkos<DeviceType>::operator()(TagFixEOStableRXTemperatureLookup2, const int &i) const {
+  if (mask[i] & groupbit){
+    temperature_lookup(i,uCond[i]+uMech[i]+uChem[i],dpdTheta[i]);
+    if (dpdTheta[i] <= 0.0)
+      k_error_flag.d_view() = 1;
+  }
 }
 
 /* ---------------------------------------------------------------------- */
@@ -152,23 +176,14 @@ void FixEOStableRXKokkos<DeviceType>::end_of_step()
   // Communicate the ghost uCGnew
   comm->reverse_comm_fix(this);
 
-  for (int i = 0; i < nlocal; i++)
-    if (mask[i] & groupbit){
-      duChem = uCG[i] - uCGnew[i];
-      uChem[i] += duChem;
-      uCG[i] = 0.0;
-      uCGnew[i] = 0.0;
-    }
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagFixEOStableRXSetup>(0,nlocal),*this);
 
   // Communicate the updated momenta and velocities to all nodes
   comm->forward_comm_fix(this);
 
-  for (int i = 0; i < nlocal; i++)
-    if (mask[i] & groupbit){
-      temperature_lookup(i,uCond[i]+uMech[i]+uChem[i],dpdTheta[i]);
-      if(dpdTheta[i] <= 0.0)
-        error->one(FLERR,"Internal temperature <= zero");
-    }
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagFixEOStableRXTemperatureLookup2>(0,nlocal),*this);
+
+  error_check();
 }
 
 /* ----------------------------------------------------------------------
@@ -242,13 +257,11 @@ void FixEOStableRXKokkos<DeviceType>::temperature_lookup(int id, double ui, doub
   // Apply the Secant Method
   for(it=0; it<maxit; it++){
     if(fabs(f2-f1)<1e-15){
-      if(isnan(f1) || isnan(f2)) error->one(FLERR,"NaN detected in secant solver.");
+      if(isnan(f1) || isnan(f2)) k_error_flag.d_view() = 2;
       temp = t1;
       temp = MAX(temp,tb->lo);
       temp = MIN(temp,tb->hi);
-      char str[256];
-      sprintf(str,"Secant solver did not converge because table bounds were exceeded:  it=%d id=%d ui=%lf thetai=%lf t1=%lf t2=%lf f1=%lf f2=%lf dpdTheta=%lf\n",it,id,ui,thetai,t1,t2,f1,f2,temp);
-      error->warning(FLERR,str);
+      k_warning_flag.d_view() = 1;
       break;
     }
     temp = t2 - f2*(t2-t1)/(f2-f1);
@@ -260,11 +273,9 @@ void FixEOStableRXKokkos<DeviceType>::temperature_lookup(int id, double ui, doub
     f2 = u2 - ui;
   }
   if(it==maxit){
-    char str[256];
-    sprintf(str,"Maxit exceeded in secant solver:  id=%d ui=%lf thetai=%lf t1=%lf t2=%lf f1=%lf f2=%lf\n",id,ui,thetai,t1,t2,f1,f2);
     if(isnan(f1) || isnan(f2) || isnan(ui) || isnan(thetai) || isnan(t1) || isnan(t2))
-      error->one(FLERR,"NaN detected in secant solver.");
-    error->one(FLERR,str);
+      k_error_flag.d_view() = 2;
+    k_error_flag.d_view() = 3;
   }
   thetai = temp;
 }
@@ -346,6 +357,30 @@ void FixEOStableRXKokkos<DeviceType>::unpack_reverse_comm(int n, int *list, doub
 
 /* ---------------------------------------------------------------------- */
 
+template<class DeviceType>
+void FixEOStableRXKokkos<DeviceType>::error_check()
+{
+  k_error_flag.template modify<DeviceType>();
+  k_error_flag.template sync<LMPHostType>();
+  if (k_error_flag.h_view() == 1)
+    error->one(FLERR,"Internal temperature <= zero");
+  else if (k_error_flag.h_view() == 2)
+    error->one(FLERR,"NaN detected in secant solver.");
+  else if (k_error_flag.h_view() == 3)
+    error->one(FLERR,"Maxit exceeded in secant solver.");
+
+  k_warning_flag.template modify<DeviceType>();
+  k_warning_flag.template sync<LMPHostType>();
+  if (k_warning_flag.h_view()) {
+    error->warning(FLERR,"Secant solver did not converge because table bounds were exceeded.");
+    k_warning_flag.h_view() = 0;
+    k_warning_flag.template modify<LMPHostType>();
+    k_warning_flag.template sync<DeviceType>();
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
 namespace LAMMPS_NS {
 template class FixEOStableRXKokkos<LMPDeviceType>;
 #ifdef KOKKOS_HAVE_CUDA
diff --git a/src/KOKKOS/fix_eos_table_rx_kokkos.h b/src/KOKKOS/fix_eos_table_rx_kokkos.h
index 9eccd67c54..9b0ca366a0 100644
--- a/src/KOKKOS/fix_eos_table_rx_kokkos.h
+++ b/src/KOKKOS/fix_eos_table_rx_kokkos.h
@@ -27,6 +27,11 @@ FixStyle(eos/table/rx/kk/host,FixEOStableRXKokkos<LMPHostType>)
 
 namespace LAMMPS_NS {
 
+struct TagFixEOStableRXInit{};
+struct TagFixEOStableRXSetup{};
+struct TagFixEOStableRXTemperatureLookup{};
+struct TagFixEOStableRXTemperatureLookup2{};
+
 template<class DeviceType>
 class FixEOStableRXKokkos : public FixEOStableRX {
  public:
@@ -41,6 +46,18 @@ class FixEOStableRXKokkos : public FixEOStableRX {
   void post_integrate();
   void end_of_step();
 
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagFixEOStableRXInit, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagFixEOStableRXSetup, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagFixEOStableRXTemperatureLookup, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagFixEOStableRXTemperatureLookup2, const int&) const;
+
   KOKKOS_INLINE_FUNCTION
   void energy_lookup(int, double, double &) const;
 
@@ -59,12 +76,16 @@ class FixEOStableRXKokkos : public FixEOStableRX {
   //Table *tables, *tables2;
 
   void allocate();
+  void error_check();
 
   //double *dHf;
 
   typename AT::t_int_1d mask;
   typename AT::t_efloat_1d uCond,uMech,uChem,uCG,uCGnew,rho,dpdTheta,duChem;
 
+  DAT::tdual_int_scalar k_error_flag;
+  DAT::tdual_int_scalar k_warning_flag;
+
   int pack_reverse_comm(int, int, double *);
   void unpack_reverse_comm(int, int *, double *);
   int pack_forward_comm(int , int *, double *, int, int *);
diff --git a/src/KOKKOS/pair_exp6_rx_kokkos.cpp b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
index 569d131af7..c46f3d037d 100644
--- a/src/KOKKOS/pair_exp6_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
@@ -56,6 +56,8 @@ PairExp6rxKokkos<DeviceType>::PairExp6rxKokkos(LAMMPS *lmp) : PairExp6rx(lmp)
   execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
   datamask_read = X_MASK | F_MASK | TYPE_MASK | ENERGY_MASK | VIRIAL_MASK;
   datamask_modify = F_MASK | ENERGY_MASK | VIRIAL_MASK;
+
+  k_error_flag = DAT::tdual_int_scalar("pair:error_flag");
 }
 
 /* ---------------------------------------------------------------------- */
@@ -168,6 +170,11 @@ void PairExp6rxKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
      Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairExp6rxgetParamsEXP6>(0,np_total),*this);
   }
 
+  k_error_flag.template modify<DeviceType>();
+  k_error_flag.template sync<LMPHostType>();
+  if (k_error_flag.h_view())
+    error->all(FLERR,"The number of molecules in CG particle is less than 1e-8.");
+
   int inum = list->inum;
   NeighListKokkos<DeviceType>* k_list = static_cast<NeighListKokkos<DeviceType>*>(list);
   d_numneigh = k_list->d_numneigh;
@@ -184,6 +191,11 @@ void PairExp6rxKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
     Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairExp6rxCompute<HALF,1,0> >(0,inum),*this);
   }
 
+  k_error_flag.template modify<DeviceType>();
+  k_error_flag.template sync<LMPHostType>();
+  if (k_error_flag.h_view())
+    error->all(FLERR,"alpha_ij is 6.0 in pair exp6");
+
   if (eflag_global) eng_vdwl += ev.evdwl;
   if (vflag_global) {
     virial[0] += ev.v[0];
@@ -358,7 +370,7 @@ void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxCompute<NEIGHFLAG,NEW
 
       if(rmOld12_ij!=0.0 && rmOld21_ij!=0.0){
         if(alphaOld21_ij == 6.0 || alphaOld12_ij == 6.0)
-          error->all(FLERR,"alpha_ij is 6.0 in pair exp6");
+          k_error_flag.d_view() = 1;
 
         // A3.  Compute some convenient quantities for evaluating the force
         rminv = 1.0/rmOld12_ij;
@@ -774,7 +786,7 @@ void PairExp6rxKokkos<DeviceType>::getParamsEXP6(int id,double &epsilon1,double
     }
   }
   if(nTotal < 1e-8 || nTotal_old < 1e-8)
-    error->all(FLERR,"The number of molecules in CG particle is less than 1e-8.");
+    k_error_flag.d_view() = 1;
 
   // Compute the mole fraction of molecules within the fluid portion of the particle (One Fluid Approximation)
   fractionOFA_old = nTotalOFA_old / nTotal_old;
diff --git a/src/KOKKOS/pair_exp6_rx_kokkos.h b/src/KOKKOS/pair_exp6_rx_kokkos.h
index b0fbd3d9e5..366cf99d75 100644
--- a/src/KOKKOS/pair_exp6_rx_kokkos.h
+++ b/src/KOKKOS/pair_exp6_rx_kokkos.h
@@ -103,6 +103,8 @@ class PairExp6rxKokkos : public PairExp6rx {
   DAT::t_efloat_1d d_eatom;
   DAT::t_virial_array d_vatom;
 
+  DAT::tdual_int_scalar k_error_flag;
+
   typename AT::t_neighbors_2d d_neighbors;
   typename AT::t_int_1d_randomread d_ilist;
   typename AT::t_int_1d_randomread d_numneigh;
diff --git a/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp b/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp
index d1a13b12fd..bea7cb6b0b 100644
--- a/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp
@@ -70,6 +70,8 @@ PairMultiLucyRXKokkos<DeviceType>::PairMultiLucyRXKokkos(LAMMPS *lmp) : PairMult
   execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
   datamask_read = X_MASK | F_MASK | TYPE_MASK | ENERGY_MASK | VIRIAL_MASK;
   datamask_modify = F_MASK | ENERGY_MASK | VIRIAL_MASK;
+
+  k_error_flag = DAT::tdual_int_scalar("pair:error_flag");
 }
 
 /* ---------------------------------------------------------------------- */
@@ -180,6 +182,15 @@ void PairMultiLucyRXKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
     Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXCompute<HALF,1,0> >(0,inum),*this);
   }
 
+  k_error_flag.template modify<DeviceType>();
+  k_error_flag.template sync<LMPHostType>();
+  if (k_error_flag.h_view() == 1)
+    error->one(FLERR,"Density < table inner cutoff");
+  else if (k_error_flag.h_view() == 2)
+    error->one(FLERR,"Density > table outer cutoff");
+  else if (k_error_flag.h_view() == 3)
+    error->one(FLERR,"Only LOOKUP and LINEAR table styles have been implemented for pair multi/lucy/rx");
+
   if (eflag_global) eng_vdwl += ev.evdwl;
   if (vflag_global) {
     virial[0] += ev.v[0];
@@ -265,19 +276,13 @@ void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXCompute<NEI
 
       tb = &tables[tabindex[itype][jtype]];
       if (rho[i]*rho[i] < tb->innersq || rho[j]*rho[j] < tb->innersq){
-        //printf("Table inner cutoff = %lf\n",sqrt(tb->innersq));
-        //printf("rho[%d]=%lf\n",i,rho[i]);
-        //printf("rho[%d]=%lf\n",j,rho[j]);
-        error->one(FLERR,"Density < table inner cutoff");
+        k_error_flag.d_view() = 1;
       }
       if (tabstyle == LOOKUP) {
         itable = static_cast<int> (((rho[i]*rho[i]) - tb->innersq) * tb->invdelta);
         jtable = static_cast<int> (((rho[j]*rho[j]) - tb->innersq) * tb->invdelta);
         if (itable >= tlm1 || jtable >= tlm1){
-          //printf("Table outer index = %d\n",tlm1);
-          //printf("itableIndex=%d rho[%d]=%lf\n",itable,i,rho[i]);
-          //printf("jtableIndex=%d rho[%d]=%lf\n",jtable,j,rho[j]);
-          error->one(FLERR,"Density > table outer cutoff");
+          k_error_flag.d_view() = 2;
         }
         A_i = tb->f[itable];
         A_j = tb->f[jtable];
@@ -290,10 +295,7 @@ void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXCompute<NEI
         itable = static_cast<int> ((rho[i]*rho[i] - tb->innersq) * tb->invdelta);
         jtable = static_cast<int> (((rho[j]*rho[j]) - tb->innersq) * tb->invdelta);
         if (itable >= tlm1 || jtable >= tlm1){
-          //printf("Table outer index = %d\n",tlm1);
-          //printf("itableIndex=%d rho[%d]=%lf\n",itable,i,rho[i]);
-          //printf("jtableIndex=%d rho[%d]=%lf\n",jtable,j,rho[j]);
-          error->one(FLERR,"Density > table outer cutoff");
+          k_error_flag.d_view() = 2;
         }
         if(itable<0) itable=0;
         if(itable>=tlm1) itable=tlm1;
@@ -314,7 +316,7 @@ void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXCompute<NEI
         fpair = 0.5*(A_i + A_j)*(4.0-3.0*rfactor)*rfactor*rfactor*rfactor;
         fpair /= sqrt(rsq);
 
-      } else error->one(FLERR,"Only LOOKUP and LINEAR table styles have been implemented for pair multi/lucy/rx");
+      } else k_error_flag.d_view() = 3;
 
       if (isite1 == isite2) fpair = sqrt(fractionOld1_i*fractionOld2_j)*fpair;
       else fpair = (sqrt(fractionOld1_i*fractionOld2_j) + sqrt(fractionOld2_i*fractionOld1_j))*fpair;
@@ -341,13 +343,12 @@ void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXCompute<NEI
   if (tabstyle == LOOKUP) evdwl = tb->e[itable];
   else if (tabstyle == LINEAR){
     if (itable >= tlm1){
-      //printf("itableIndex=%d rho[%d]=%lf\n",itable,i,rho[i]);
-      error->one(FLERR,"Density > table outer cutoff");
+      k_error_flag.d_view() = 2;
     }
     if(itable==0) fraction_i=0.0;
     else fraction_i = (((rho[i]*rho[i]) - tb->rsq[itable]) * tb->invdelta);
     evdwl = tb->e[itable] + fraction_i*tb->de[itable];
-  } else error->one(FLERR,"Only LOOKUP and LINEAR table styles have been implemented for pair multi/lucy/rx");
+  } else k_error_flag.d_view() = 3;
 
   evdwl *=(pi*d_cutsq(itype,itype)*d_cutsq(itype,itype))/84.0;
   evdwlOld = fractionOld1_i*evdwl;
diff --git a/src/KOKKOS/pair_multi_lucy_rx_kokkos.h b/src/KOKKOS/pair_multi_lucy_rx_kokkos.h
index 74a10ddee1..ff22516eb1 100644
--- a/src/KOKKOS/pair_multi_lucy_rx_kokkos.h
+++ b/src/KOKKOS/pair_multi_lucy_rx_kokkos.h
@@ -130,6 +130,8 @@ class PairMultiLucyRXKokkos : public PairMultiLucyRX {
   typename AT::t_int_1d_randomread d_ilist;
   typename AT::t_int_1d_randomread d_numneigh;
 
+  DAT::tdual_int_scalar k_error_flag;
+
   typename AT::tdual_ffloat_2d k_cutsq;
   typename AT::t_ffloat_2d d_cutsq;
 

From c0d6cbbdd3f135578b584525c92e447c663e2e1b Mon Sep 17 00:00:00 2001
From: Stan Moore <stanmoore1@gmail.com>
Date: Thu, 15 Dec 2016 11:18:50 -0700
Subject: [PATCH 011/267] Updates to Kokkos files

---
 src/KOKKOS/atom_vec_dpd_kokkos.cpp        |  72 ++++
 src/KOKKOS/fix_eos_table_rx_kokkos.cpp    |   2 +-
 src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp |   3 +
 src/KOKKOS/pair_exp6_rx_kokkos.cpp        |  68 ++-
 src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp  | 490 ++++++++++++++--------
 src/KOKKOS/pair_multi_lucy_rx_kokkos.h    |  79 +++-
 src/USER-DPD/fix_eos_table_rx.cpp         |   2 +
 src/USER-DPD/pair_multi_lucy_rx.cpp       |   2 +
 src/USER-DPD/pair_table_rx.cpp            |   2 +
 src/atom_masks.h                          |  12 +
 10 files changed, 533 insertions(+), 199 deletions(-)

diff --git a/src/KOKKOS/atom_vec_dpd_kokkos.cpp b/src/KOKKOS/atom_vec_dpd_kokkos.cpp
index c79559172f..58fc9c46c3 100644
--- a/src/KOKKOS/atom_vec_dpd_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_dpd_kokkos.cpp
@@ -1801,6 +1801,15 @@ void AtomVecDPDKokkos::sync(ExecutionSpace space, unsigned int mask)
     if (mask & TYPE_MASK) atomKK->k_type.sync<LMPDeviceType>();
     if (mask & MASK_MASK) atomKK->k_mask.sync<LMPDeviceType>();
     if (mask & IMAGE_MASK) atomKK->k_image.sync<LMPDeviceType>();
+    if (mask & DPDRHO_MASK) atomKK->k_rho.sync<LMPDeviceType>();
+    if (mask & DPDTHETA_MASK) atomKK->k_dpdTheta.sync<LMPDeviceType>();
+    if (mask & UCOND_MASK) atomKK->k_uCond.sync<LMPDeviceType>();
+    if (mask & UMECH_MASK) atomKK->k_uMech.sync<LMPDeviceType>();
+    if (mask & UCHEM_MASK) atomKK->k_uChem.sync<LMPDeviceType>();
+    if (mask & UCG_MASK) atomKK->k_uCG.sync<LMPDeviceType>();
+    if (mask & UCGNEW_MASK) atomKK->k_uCGnew.sync<LMPDeviceType>();
+    if (mask & DUCHEM_MASK) atomKK->k_duChem.sync<LMPDeviceType>();
+    if (mask & DVECTOR_MASK) atomKK->k_dvector.sync<LMPDeviceType>();
   } else {
     if (mask & X_MASK) atomKK->k_x.sync<LMPHostType>();
     if (mask & V_MASK) atomKK->k_v.sync<LMPHostType>();
@@ -1809,6 +1818,15 @@ void AtomVecDPDKokkos::sync(ExecutionSpace space, unsigned int mask)
     if (mask & TYPE_MASK) atomKK->k_type.sync<LMPHostType>();
     if (mask & MASK_MASK) atomKK->k_mask.sync<LMPHostType>();
     if (mask & IMAGE_MASK) atomKK->k_image.sync<LMPHostType>();
+    if (mask & DPDRHO_MASK) atomKK->k_rho.sync<LMPHostType>();
+    if (mask & DPDTHETA_MASK) atomKK->k_dpdTheta.sync<LMPHostType>();
+    if (mask & UCOND_MASK) atomKK->k_uCond.sync<LMPHostType>();
+    if (mask & UMECH_MASK) atomKK->k_uMech.sync<LMPHostType>();
+    if (mask & UCHEM_MASK) atomKK->k_uChem.sync<LMPHostType>();
+    if (mask & UCG_MASK) atomKK->k_uCG.sync<LMPHostType>();
+    if (mask & UCGNEW_MASK) atomKK->k_uCGnew.sync<LMPHostType>();
+    if (mask & DUCHEM_MASK) atomKK->k_duChem.sync<LMPHostType>();
+    if (mask & DVECTOR_MASK) atomKK->k_dvector.sync<LMPHostType>();
   }
 }
 
@@ -1831,6 +1849,24 @@ void AtomVecDPDKokkos::sync_overlapping_device(ExecutionSpace space, unsigned in
       perform_async_copy<DAT::tdual_int_1d>(atomKK->k_mask,space);
     if ((mask & IMAGE_MASK) && atomKK->k_image.need_sync<LMPDeviceType>())
       perform_async_copy<DAT::tdual_imageint_1d>(atomKK->k_image,space);
+    if ((mask & DPDRHO_MASK) && atomKK->k_rho.need_sync<LMPDeviceType>())
+      perform_async_copy<DAT::tdual_efloat_1d>(atomKK->k_rho,space);
+    if ((mask & DPDTHETA_MASK) && atomKK->k_dpdTheta.need_sync<LMPDeviceType>())
+      perform_async_copy<DAT::tdual_efloat_1d>(atomKK->k_dpdTheta,space);
+    if ((mask & UCOND_MASK) && atomKK->k_uCond.need_sync<LMPDeviceType>())
+      perform_async_copy<DAT::tdual_efloat_1d>(atomKK->k_uCond,space);
+    if ((mask & UMECH_MASK) && atomKK->k_uMech.need_sync<LMPDeviceType>())
+      perform_async_copy<DAT::tdual_efloat_1d>(atomKK->k_uMech,space);
+    if ((mask & UCHEM_MASK) && atomKK->k_uChem.need_sync<LMPDeviceType>())
+      perform_async_copy<DAT::tdual_efloat_1d>(atomKK->k_uChem,space);
+    if ((mask & UCG_MASK) && atomKK->k_uCG.need_sync<LMPDeviceType>())
+      perform_async_copy<DAT::tdual_efloat_1d>(atomKK->k_uCG,space);
+    if ((mask & UCGNEW_MASK) && atomKK->k_uCGnew.need_sync<LMPDeviceType>())
+      perform_async_copy<DAT::tdual_efloat_1d>(atomKK->k_uCGnew,space);
+    if ((mask & DUCHEM_MASK) && atomKK->k_duChem.need_sync<LMPDeviceType>())
+      perform_async_copy<DAT::tdual_efloat_1d>(atomKK->k_duChem,space);
+    if ((mask & DVECTOR_MASK) && atomKK->k_dvector.need_sync<LMPDeviceType>())
+      perform_async_copy<DAT::tdual_float_2d>(atomKK->k_dvector,space);
   } else {
     if ((mask & X_MASK) && atomKK->k_x.need_sync<LMPHostType>())
       perform_async_copy<DAT::tdual_x_array>(atomKK->k_x,space);
@@ -1846,6 +1882,24 @@ void AtomVecDPDKokkos::sync_overlapping_device(ExecutionSpace space, unsigned in
       perform_async_copy<DAT::tdual_int_1d>(atomKK->k_mask,space);
     if ((mask & IMAGE_MASK) && atomKK->k_image.need_sync<LMPHostType>())
       perform_async_copy<DAT::tdual_imageint_1d>(atomKK->k_image,space);
+    if ((mask & DPDRHO_MASK) && atomKK->k_rho.need_sync<LMPHostType>())
+      perform_async_copy<DAT::tdual_efloat_1d>(atomKK->k_rho,space);
+    if ((mask & DPDTHETA_MASK) && atomKK->k_dpdTheta.need_sync<LMPHostType>())
+      perform_async_copy<DAT::tdual_efloat_1d>(atomKK->k_dpdTheta,space);
+    if ((mask & UCOND_MASK) && atomKK->k_uCond.need_sync<LMPHostType>())
+      perform_async_copy<DAT::tdual_efloat_1d>(atomKK->k_uCond,space);
+    if ((mask & UMECH_MASK) && atomKK->k_uMech.need_sync<LMPHostType>())
+      perform_async_copy<DAT::tdual_efloat_1d>(atomKK->k_uMech,space);
+    if ((mask & UCHEM_MASK) && atomKK->k_uChem.need_sync<LMPHostType>())
+      perform_async_copy<DAT::tdual_efloat_1d>(atomKK->k_uChem,space);
+    if ((mask & UCG_MASK) && atomKK->k_uCG.need_sync<LMPHostType>())
+      perform_async_copy<DAT::tdual_efloat_1d>(atomKK->k_uCG,space);
+    if ((mask & UCGNEW_MASK) && atomKK->k_uCGnew.need_sync<LMPHostType>())
+      perform_async_copy<DAT::tdual_efloat_1d>(atomKK->k_uCGnew,space);
+    if ((mask & DUCHEM_MASK) && atomKK->k_duChem.need_sync<LMPHostType>())
+      perform_async_copy<DAT::tdual_efloat_1d>(atomKK->k_duChem,space);
+    if ((mask & DVECTOR_MASK) && atomKK->k_dvector.need_sync<LMPHostType>())
+      perform_async_copy<DAT::tdual_float_2d>(atomKK->k_dvector,space);
   }
 }
 
@@ -1861,6 +1915,15 @@ void AtomVecDPDKokkos::modified(ExecutionSpace space, unsigned int mask)
     if (mask & TYPE_MASK) atomKK->k_type.modify<LMPDeviceType>();
     if (mask & MASK_MASK) atomKK->k_mask.modify<LMPDeviceType>();
     if (mask & IMAGE_MASK) atomKK->k_image.modify<LMPDeviceType>();
+    if (mask & DPDRHO_MASK) atomKK->k_rho.modify<LMPDeviceType>();
+    if (mask & DPDTHETA_MASK) atomKK->k_dpdTheta.modify<LMPDeviceType>();
+    if (mask & UCOND_MASK) atomKK->k_uCond.modify<LMPDeviceType>();
+    if (mask & UMECH_MASK) atomKK->k_uMech.modify<LMPDeviceType>();
+    if (mask & UCHEM_MASK) atomKK->k_uChem.modify<LMPDeviceType>();
+    if (mask & UCG_MASK) atomKK->k_uCG.modify<LMPDeviceType>();
+    if (mask & UCGNEW_MASK) atomKK->k_uCGnew.modify<LMPDeviceType>();
+    if (mask & DUCHEM_MASK) atomKK->k_duChem.modify<LMPDeviceType>();
+    if (mask & DVECTOR_MASK) atomKK->k_dvector.modify<LMPDeviceType>();
   } else {
     if (mask & X_MASK) atomKK->k_x.modify<LMPHostType>();
     if (mask & V_MASK) atomKK->k_v.modify<LMPHostType>();
@@ -1869,6 +1932,15 @@ void AtomVecDPDKokkos::modified(ExecutionSpace space, unsigned int mask)
     if (mask & TYPE_MASK) atomKK->k_type.modify<LMPHostType>();
     if (mask & MASK_MASK) atomKK->k_mask.modify<LMPHostType>();
     if (mask & IMAGE_MASK) atomKK->k_image.modify<LMPHostType>();
+    if (mask & DPDRHO_MASK) atomKK->k_rho.modify<LMPHostType>();
+    if (mask & DPDTHETA_MASK) atomKK->k_dpdTheta.modify<LMPHostType>();
+    if (mask & UCOND_MASK) atomKK->k_uCond.modify<LMPHostType>();
+    if (mask & UMECH_MASK) atomKK->k_uMech.modify<LMPHostType>();
+    if (mask & UCHEM_MASK) atomKK->k_uChem.modify<LMPHostType>();
+    if (mask & UCG_MASK) atomKK->k_uCG.modify<LMPHostType>();
+    if (mask & UCGNEW_MASK) atomKK->k_uCGnew.modify<LMPHostType>();
+    if (mask & DUCHEM_MASK) atomKK->k_duChem.modify<LMPHostType>();
+    if (mask & DVECTOR_MASK) atomKK->k_dvector.modify<LMPHostType>();
   }
 }
 
diff --git a/src/KOKKOS/fix_eos_table_rx_kokkos.cpp b/src/KOKKOS/fix_eos_table_rx_kokkos.cpp
index faf490fcc0..75e9b292f9 100644
--- a/src/KOKKOS/fix_eos_table_rx_kokkos.cpp
+++ b/src/KOKKOS/fix_eos_table_rx_kokkos.cpp
@@ -52,7 +52,7 @@ FixEOStableRXKokkos<DeviceType>::FixEOStableRXKokkos(LAMMPS *lmp, int narg, char
 template<class DeviceType>
 FixEOStableRXKokkos<DeviceType>::~FixEOStableRXKokkos()
 {
-
+  if (copymode) return;
 }
 
 /* ---------------------------------------------------------------------- */
diff --git a/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp
index 45da5bf165..0bfbb9491e 100644
--- a/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp
+++ b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp
@@ -52,6 +52,8 @@ PairDPDfdtEnergyKokkos<DeviceType>::PairDPDfdtEnergyKokkos(LAMMPS *lmp) : PairDP
 {
   atomKK = (AtomKokkos *) atom;
   execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
+  datamask_read = X_MASK | F_MASK | TYPE_MASK | TAG_MASK | ENERGY_MASK | VIRIAL_MASK;
+  datamask_modify = F_MASK | ENERGY_MASK | VIRIAL_MASK;
   cutsq = NULL;
 }
 
@@ -357,6 +359,7 @@ double PairDPDfdtEnergyKokkos<DeviceType>::init_one(int i, int j)
     m_cutsq[j][i] = m_cutsq[i][j] = cutone*cutone;
   }
   k_cutsq.h_view(i,j) = cutone*cutone;
+  k_cutsq.h_view(j,i) = k_cutsq.h_view(i,j);
   k_cutsq.template modify<LMPHostType>();
 
   return cutone;
diff --git a/src/KOKKOS/pair_exp6_rx_kokkos.cpp b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
index c46f3d037d..7e74f39ef0 100644
--- a/src/KOKKOS/pair_exp6_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
@@ -54,8 +54,8 @@ PairExp6rxKokkos<DeviceType>::PairExp6rxKokkos(LAMMPS *lmp) : PairExp6rx(lmp)
 {
   atomKK = (AtomKokkos *) atom;
   execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
-  datamask_read = X_MASK | F_MASK | TYPE_MASK | ENERGY_MASK | VIRIAL_MASK;
-  datamask_modify = F_MASK | ENERGY_MASK | VIRIAL_MASK;
+  datamask_read = EMPTY_MASK;
+  datamask_modify = EMPTY_MASK;
 
   k_error_flag = DAT::tdual_int_scalar("pair:error_flag");
 }
@@ -104,6 +104,8 @@ void PairExp6rxKokkos<DeviceType>::init_style()
 template<class DeviceType>
 void PairExp6rxKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
 {
+  copymode = 1;
+
   eflag = eflag_in;
   vflag = vflag_in;
 
@@ -141,7 +143,9 @@ void PairExp6rxKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
   special_coul[3] = force->special_coul[3];
   newton_pair = force->newton_pair;
 
-  copymode = 1;
+  atomKK->sync(execution_space,X_MASK | F_MASK | TYPE_MASK | ENERGY_MASK | VIRIAL_MASK | UCG_MASK | UCGNEW_MASK | DVECTOR_MASK);
+  if (evflag) atomKK->modified(execution_space,F_MASK | ENERGY_MASK | VIRIAL_MASK | UCG_MASK | UCGNEW_MASK);
+  else atomKK->modified(execution_space,F_MASK | UCG_MASK | UCGNEW_MASK);
 
   // Initialize the Exp6 parameter data for both the local
   // and ghost atoms. Make the parameter data persistent
@@ -185,10 +189,22 @@ void PairExp6rxKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
 
   EV_FLOAT ev;
 
-  if (evflag) {
-    Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairExp6rxCompute<HALF,1,1> >(0,inum),*this,ev);
-  } else {
-    Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairExp6rxCompute<HALF,1,0> >(0,inum),*this);
+  if (neighflag == HALF) {
+    if (newton_pair) {
+      if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairExp6rxCompute<HALF,1,1> >(0,inum),*this,ev);
+      else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairExp6rxCompute<HALF,1,0> >(0,inum),*this);
+    } else {
+      if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairExp6rxCompute<HALF,0,1> >(0,inum),*this,ev);
+      else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairExp6rxCompute<HALF,0,0> >(0,inum),*this);
+    }
+  } else if (neighflag == HALFTHREAD) {
+    if (newton_pair) {
+      if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairExp6rxCompute<HALFTHREAD,1,1> >(0,inum),*this,ev);
+      else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairExp6rxCompute<HALFTHREAD,1,0> >(0,inum),*this);
+    } else {
+      if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairExp6rxCompute<HALFTHREAD,0,1> >(0,inum),*this,ev);
+      else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairExp6rxCompute<HALFTHREAD,0,0> >(0,inum),*this);
+    }
   }
 
   k_error_flag.template modify<DeviceType>();
@@ -246,6 +262,12 @@ template<class DeviceType>
 template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
 KOKKOS_INLINE_FUNCTION
 void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxCompute<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int &ii, EV_FLOAT& ev) const {
+
+  // These arrays are atomic for Half/Thread neighbor style
+  Kokkos::View<F_FLOAT*[3], typename DAT::t_f_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_f = f;
+  Kokkos::View<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_uCG = uCG;
+  Kokkos::View<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_uCGnew = uCGnew;
+
   int i,j,jj,jnum,itype,jtype;
   double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,evdwlOld,fpair;
   double rsq,r2inv,r6inv,forceExp6,factor_lj;
@@ -287,6 +309,12 @@ void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxCompute<NEIGHFLAG,NEW
   itype = type[i];
   jnum = d_numneigh[i];
 
+  double fx_i = 0.0;
+  double fy_i = 0.0;
+  double fz_i = 0.0;
+  double uCG_i = 0.0;
+  double uCGnew_i = 0.0;
+
   {
      epsilon1_i     = PairExp6ParamData.epsilon1[i];
      alpha1_i       = PairExp6ParamData.alpha1[i];
@@ -457,9 +485,9 @@ void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxCompute<NEIGHFLAG,NEW
 
         evdwlOld *= factor_lj;
 
-        uCG[i] += 0.5*evdwlOld;
+        uCG_i += 0.5*evdwlOld;
         if (newton_pair || j < nlocal)
-          uCG[j] += 0.5*evdwlOld;
+          a_uCG[j] += 0.5*evdwlOld;
       }
 
       if(rm12_ij!=0.0 && rm21_ij!=0.0){
@@ -537,28 +565,34 @@ void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxCompute<NEIGHFLAG,NEW
         if (isite1 == isite2) fpair = sqrt(fractionOld1_i*fractionOld2_j)*fpairOldEXP6_12;
         else fpair = sqrt(fractionOld1_i*fractionOld2_j)*fpairOldEXP6_12 + sqrt(fractionOld2_i*fractionOld1_j)*fpairOldEXP6_21;
 
-        f(i,0) += delx*fpair;
-        f(i,1) += dely*fpair;
-        f(i,2) += delz*fpair;
+        fx_i += delx*fpair;
+        fy_i += dely*fpair;
+        fz_i += delz*fpair;
         if (newton_pair || j < nlocal) {
-          f(j,0) -= delx*fpair;
-          f(j,1) -= dely*fpair;
-          f(j,2) -= delz*fpair;
+          a_f(j,0) -= delx*fpair;
+          a_f(j,1) -= dely*fpair;
+          a_f(j,2) -= delz*fpair;
         }
 
         if (isite1 == isite2) evdwl = sqrt(fraction1_i*fraction2_j)*evdwlEXP6_12;
         else evdwl = sqrt(fraction1_i*fraction2_j)*evdwlEXP6_12 + sqrt(fraction2_i*fraction1_j)*evdwlEXP6_21;
         evdwl *= factor_lj;
 
-        uCGnew[i]   += 0.5*evdwl;
+        uCGnew_i   += 0.5*evdwl;
         if (newton_pair || j < nlocal)
-          uCGnew[j] += 0.5*evdwl;
+          a_uCGnew[j] += 0.5*evdwl;
         evdwl = evdwlOld;
         //if (vflag_either || eflag_atom) 
         if (EVFLAG) this->template ev_tally<NEIGHFLAG,NEWTON_PAIR>(ev,i,j,evdwl,fpair,delx,dely,delz);
       }
     }
   }
+
+  a_f(i,0) += fx_i;
+  a_f(i,1) += fy_i;
+  a_f(i,2) += fz_i;
+  a_uCG[i] += uCG_i;
+  a_uCGnew[i] += uCGnew_i;
 }
 
 template<class DeviceType>
diff --git a/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp b/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp
index bea7cb6b0b..03bbaf9907 100644
--- a/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp
@@ -68,8 +68,14 @@ PairMultiLucyRXKokkos<DeviceType>::PairMultiLucyRXKokkos(LAMMPS *lmp) : PairMult
 
   atomKK = (AtomKokkos *) atom;
   execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
-  datamask_read = X_MASK | F_MASK | TYPE_MASK | ENERGY_MASK | VIRIAL_MASK;
-  datamask_modify = F_MASK | ENERGY_MASK | VIRIAL_MASK;
+  datamask_read = EMPTY_MASK;
+  datamask_modify = EMPTY_MASK;
+
+  update_table = 0;
+  ntables = 0;
+  tables = NULL;
+  h_table = new TableHost();
+  d_table = new TableDevice();
 
   k_error_flag = DAT::tdual_int_scalar("pair:error_flag");
 }
@@ -79,7 +85,10 @@ PairMultiLucyRXKokkos<DeviceType>::PairMultiLucyRXKokkos(LAMMPS *lmp) : PairMult
 template<class DeviceType>
 PairMultiLucyRXKokkos<DeviceType>::~PairMultiLucyRXKokkos()
 {
+  if (copymode) return;  // skip cleanup while copymode is set (it is set to 1 around the kernel launches in this file)
 
+  delete h_table;  // host-side table container, allocated with new in the constructor
+  delete d_table;  // device-side table container, allocated with new in the constructor
 }
 
 /* ---------------------------------------------------------------------- */
@@ -109,7 +118,7 @@ void PairMultiLucyRXKokkos<DeviceType>::init_style()
     neighbor->requests[irequest]->half = 1;
     neighbor->requests[irequest]->ghost = 1;
   } else {
-    error->all(FLERR,"Cannot use chosen neighbor list style with reax/c/kk");
+    error->all(FLERR,"Cannot use chosen neighbor list style with multi/lucy/rx/kk");
   }
 }
 
@@ -118,6 +127,23 @@ void PairMultiLucyRXKokkos<DeviceType>::init_style()
 template<class DeviceType>
 void PairMultiLucyRXKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
 {
+  if (update_table) create_kokkos_tables();  // rebuild the Kokkos table views when flagged
+
+  switch (tabstyle) {  // forward to the driver specialized on the table style
+    case LOOKUP: compute_style<LOOKUP>(eflag_in,vflag_in); break;
+    case LINEAR: compute_style<LINEAR>(eflag_in,vflag_in); break;
+    default: break;  // any other style: nothing is computed
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+template<int TABSTYLE>
+void PairMultiLucyRXKokkos<DeviceType>::compute_style(int eflag_in, int vflag_in)
+{
+  copymode = 1;
+
   eflag = eflag_in;
   vflag = vflag_in;
 
@@ -145,10 +171,14 @@ void PairMultiLucyRXKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
   x = atomKK->k_x.view<DeviceType>();
   f = atomKK->k_f.view<DeviceType>();
   type = atomKK->k_type.view<DeviceType>();
+  rho = atomKK->k_rho.view<DeviceType>();
   uCG = atomKK->k_uCG.view<DeviceType>();
   uCGnew = atomKK->k_uCGnew.view<DeviceType>();
   dvector = atomKK->k_dvector.view<DeviceType>();
-  rho = atomKK->k_rho.view<DeviceType>();
+
+  atomKK->sync(execution_space,X_MASK | F_MASK | TYPE_MASK | ENERGY_MASK | VIRIAL_MASK | DPDRHO_MASK | UCG_MASK | UCGNEW_MASK | DVECTOR_MASK);
+  if (evflag) atomKK->modified(execution_space,F_MASK | ENERGY_MASK | VIRIAL_MASK | UCG_MASK | UCGNEW_MASK);
+  else atomKK->modified(execution_space,F_MASK | UCG_MASK | UCGNEW_MASK);
 
   nlocal = atom->nlocal;
   int nghost = atom->nghost;
@@ -176,10 +206,22 @@ void PairMultiLucyRXKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
 
   EV_FLOAT ev;
 
-  if (evflag) {
-    Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXCompute<HALF,1,1> >(0,inum),*this,ev);
-  } else {
-    Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXCompute<HALF,1,0> >(0,inum),*this);
+  if (neighflag == HALF) {
+    if (newton_pair) {
+      if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXCompute<HALF,1,1,TABSTYLE> >(0,inum),*this,ev);
+      else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXCompute<HALF,1,0,TABSTYLE> >(0,inum),*this);
+    } else {
+      if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXCompute<HALF,0,1,TABSTYLE> >(0,inum),*this,ev);
+      else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXCompute<HALF,0,0,TABSTYLE> >(0,inum),*this);
+    }
+  } else if (neighflag == HALFTHREAD) {
+    if (newton_pair) {
+      if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXCompute<HALFTHREAD,1,1,TABSTYLE> >(0,inum),*this,ev);
+      else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXCompute<HALFTHREAD,1,0,TABSTYLE> >(0,inum),*this);
+    } else {
+      if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXCompute<HALFTHREAD,0,1,TABSTYLE> >(0,inum),*this,ev);
+      else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXCompute<HALFTHREAD,0,0,TABSTYLE> >(0,inum),*this);
+    }
   }
 
   k_error_flag.template modify<DeviceType>();
@@ -223,9 +265,13 @@ void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXgetParams,
 }
 
 template<class DeviceType>
-template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
+template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG, int TABSTYLE>
 KOKKOS_INLINE_FUNCTION
-void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXCompute<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int &ii, EV_FLOAT& ev) const {
+void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXCompute<NEIGHFLAG,NEWTON_PAIR,EVFLAG,TABSTYLE>, const int &ii, EV_FLOAT& ev) const {
+
+  // The f array is atomic for Half/Thread neighbor style
+  Kokkos::View<F_FLOAT*[3], typename DAT::t_f_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_f = f;
+
   int i,j,jj,inum,jnum,itype,jtype,itable;
   double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,evdwlOld,fpair;
   double rsq;
@@ -239,8 +285,6 @@ void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXCompute<NEI
   double fraction_i,fraction_j;
   int jtable;
 
-  Table *tb;
-
   int tlm1 = tablength - 1;
 
   i = d_ilist[ii];
@@ -274,26 +318,34 @@ void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXCompute<NEI
       fractionOld1_j = d_fractionOld1[j];
       fractionOld2_j = d_fractionOld2[j];
 
-      tb = &tables[tabindex[itype][jtype]];
-      if (rho[i]*rho[i] < tb->innersq || rho[j]*rho[j] < tb->innersq){
+      //tb = &tables[tabindex[itype][jtype]];
+      const int tidx = d_table_const.tabindex(itype,jtype);
+      //if (rho[i]*rho[i] < tb->innersq || rho[j]*rho[j] < tb->innersq){
+      if (rho[i]*rho[i] < d_table_const.innersq(tidx) || rho[j]*rho[j] < d_table_const.innersq(tidx)){
         k_error_flag.d_view() = 1;
       }
-      if (tabstyle == LOOKUP) {
-        itable = static_cast<int> (((rho[i]*rho[i]) - tb->innersq) * tb->invdelta);
-        jtable = static_cast<int> (((rho[j]*rho[j]) - tb->innersq) * tb->invdelta);
+      if (TABSTYLE == LOOKUP) {
+        //itable = static_cast<int> (((rho[i]*rho[i]) - tb->innersq) * tb->invdelta);
+        itable = static_cast<int> (((rho[i]*rho[i]) - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx));
+        //jtable = static_cast<int> (((rho[j]*rho[j]) - tb->innersq) * tb->invdelta);
+        jtable = static_cast<int> (((rho[j]*rho[j]) - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx));
         if (itable >= tlm1 || jtable >= tlm1){
           k_error_flag.d_view() = 2;
         }
-        A_i = tb->f[itable];
-        A_j = tb->f[jtable];
+        //A_i = tb->f[itable];
+        A_i = d_table_const.f(tidx,itable);
+        //A_j = tb->f[jtable];
+        A_j = d_table_const.f(tidx,jtable);
 
         const double rfactor = 1.0-sqrt(rsq/d_cutsq(itype,jtype));
         fpair = 0.5*(A_i + A_j)*(4.0-3.0*rfactor)*rfactor*rfactor*rfactor;
         fpair /= sqrt(rsq);
 
-      } else if (tabstyle == LINEAR) {
-        itable = static_cast<int> ((rho[i]*rho[i] - tb->innersq) * tb->invdelta);
-        jtable = static_cast<int> (((rho[j]*rho[j]) - tb->innersq) * tb->invdelta);
+      } else if (TABSTYLE == LINEAR) {
+        //itable = static_cast<int> ((rho[i]*rho[i] - tb->innersq) * tb->invdelta);
+        itable = static_cast<int> ((rho[i]*rho[i] - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx));
+        //jtable = static_cast<int> (((rho[j]*rho[j]) - tb->innersq) * tb->invdelta);
+        jtable = static_cast<int> ((rho[j]*rho[j] - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx));
         if (itable >= tlm1 || jtable >= tlm1){
           k_error_flag.d_view() = 2;
         }
@@ -302,15 +354,19 @@ void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXCompute<NEI
         if(jtable<0) jtable=0;
         if(jtable>=tlm1)jtable=tlm1;
 
-        fraction_i = (((rho[i]*rho[i]) - tb->rsq[itable]) * tb->invdelta);
-        fraction_j = (((rho[j]*rho[j]) - tb->rsq[jtable]) * tb->invdelta);
+        //fraction_i = (((rho[i]*rho[i]) - tb->rsq[itable]) * tb->invdelta);
+        fraction_i = (((rho[i]*rho[i]) - d_table_const.rsq(tidx,itable)) * d_table_const.invdelta(tidx));
+        //fraction_j = (((rho[j]*rho[j]) - tb->rsq[jtable]) * tb->invdelta);
+        fraction_j = (((rho[j]*rho[j]) - d_table_const.rsq(tidx,jtable)) * d_table_const.invdelta(tidx));
         if(itable==0) fraction_i=0.0;
         if(itable==tlm1) fraction_i=0.0;
         if(jtable==0) fraction_j=0.0;
         if(jtable==tlm1) fraction_j=0.0;
 
-        A_i = tb->f[itable] + fraction_i*tb->df[itable];
-        A_j = tb->f[jtable] + fraction_j*tb->df[jtable];
+        //A_i = tb->f[itable] + fraction_i*tb->df[itable];
+        A_i = d_table_const.f(tidx,itable) + fraction_i*d_table_const.df(tidx,itable);
+        //A_j = tb->f[jtable] + fraction_j*tb->df[jtable];
+        A_j = d_table_const.f(tidx,jtable) + fraction_j*d_table_const.df(tidx,jtable);
 
         const double rfactor = 1.0-sqrt(rsq/d_cutsq(itype,jtype));
         fpair = 0.5*(A_i + A_j)*(4.0-3.0*rfactor)*rfactor*rfactor*rfactor;
@@ -325,29 +381,34 @@ void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXCompute<NEI
       fy_i += dely*fpair;
       fz_i += delz*fpair;
       if (NEWTON_PAIR || j < nlocal) {
-        f(j,0) -= delx*fpair;
-        f(j,1) -= dely*fpair;
-        f(j,2) -= delz*fpair;
+        a_f(j,0) -= delx*fpair;
+        a_f(j,1) -= dely*fpair;
+        a_f(j,2) -= delz*fpair;
       }
       //if (evflag) ev_tally(i,j,nlocal,newton_pair,0.0,0.0,fpair,delx,dely,delz);
       if (EVFLAG) this->template ev_tally<NEIGHFLAG,NEWTON_PAIR>(ev,i,j,0.0,fpair,delx,dely,delz);
     }
   }
 
-  f(i,0) += fx_i;
-  f(i,1) += fy_i;
-  f(i,2) += fz_i;
+  a_f(i,0) += fx_i;
+  a_f(i,1) += fy_i;
+  a_f(i,2) += fz_i;
 
-  tb = &tables[tabindex[itype][itype]];
-  itable = static_cast<int> (((rho[i]*rho[i]) - tb->innersq) * tb->invdelta);
-  if (tabstyle == LOOKUP) evdwl = tb->e[itable];
-  else if (tabstyle == LINEAR){
+  //tb = &tables[tabindex[itype][itype]];
+  const int tidx = d_table_const.tabindex(itype,itype);
+  //itable = static_cast<int> (((rho[i]*rho[i]) - tb->innersq) * tb->invdelta);
+  itable = static_cast<int> (((rho[i]*rho[i]) - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx));
+  //if (TABSTYLE == LOOKUP) evdwl = tb->e[itable];
+  if (TABSTYLE == LOOKUP) evdwl = d_table_const.e(tidx,itable);
+  else if (TABSTYLE == LINEAR){
     if (itable >= tlm1){
       k_error_flag.d_view() = 2;
     }
     if(itable==0) fraction_i=0.0;
-    else fraction_i = (((rho[i]*rho[i]) - tb->rsq[itable]) * tb->invdelta);
-    evdwl = tb->e[itable] + fraction_i*tb->de[itable];
+    //else fraction_i = (((rho[i]*rho[i]) - tb->rsq[itable]) * tb->invdelta);
+    else fraction_i = (((rho[i]*rho[i]) - d_table_const.rsq(tidx,itable)) * d_table_const.invdelta(tidx));
+    // linear interpolation of the tabulated energy (was: tb->e[itable] + fraction_i*tb->de[itable])
+    evdwl = d_table_const.e(tidx,itable) + fraction_i*d_table_const.de(tidx,itable);
   } else k_error_flag.d_view() = 3;
 
   evdwl *=(pi*d_cutsq(itype,itype)*d_cutsq(itype,itype))/84.0;
@@ -364,121 +425,11 @@ void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXCompute<NEI
 }
 
 template<class DeviceType>
-template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
+template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG, int TABSTYLE>
 KOKKOS_INLINE_FUNCTION
-void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXCompute<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int &ii) const {
+void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXCompute<NEIGHFLAG,NEWTON_PAIR,EVFLAG,TABSTYLE>, const int &ii) const {
   EV_FLOAT ev;
-  this->template operator()<NEIGHFLAG,NEWTON_PAIR,EVFLAG>(TagPairMultiLucyRXCompute<NEIGHFLAG,NEWTON_PAIR,EVFLAG>(), ii, ev);
-}
-
-/* ----------------------------------------------------------------------
-   set coeffs for one or more type pairs
-------------------------------------------------------------------------- */
-
-template<class DeviceType>
-void PairMultiLucyRXKokkos<DeviceType>::coeff(int narg, char **arg)
-{
-  if (narg != 6 && narg != 7) error->all(FLERR,"Illegal pair_coeff command");
-
-  bool rx_flag = false;
-  for (int i = 0; i < modify->nfix; i++)
-    if (strncmp(modify->fix[i]->style,"rx",2) == 0) rx_flag = true;
-  if (!rx_flag) error->all(FLERR,"PairMultiLucyRXKokkos<DeviceType> requires a fix rx command.");
-
-  if (!allocated) allocate();
-
-  int ilo,ihi,jlo,jhi;
-  force->bounds(FLERR,arg[0],atom->ntypes,ilo,ihi);
-  force->bounds(FLERR,arg[1],atom->ntypes,jlo,jhi);
-
-  int me;
-  MPI_Comm_rank(world,&me);
-  tables = (Table *)
-    memory->srealloc(tables,(ntables+1)*sizeof(Table),"pair:tables");
-  Table *tb = &tables[ntables];
-  null_table(tb);
-  if (me == 0) read_table(tb,arg[2],arg[3]);
-  bcast_table(tb);
-
-  nspecies = atom->nspecies_dpd;
-  int n;
-  n = strlen(arg[3]) + 1;
-  site1 = new char[n];
-  strcpy(site1,arg[4]);
-
-  n = strlen(arg[4]) + 1;
-  site2 = new char[n];
-  strcpy(site2,arg[5]);
-
-  // set table cutoff
-
-  if (narg == 7) tb->cut = force->numeric(FLERR,arg[6]);
-  else if (tb->rflag) tb->cut = tb->rhi;
-  else tb->cut = tb->rfile[tb->ninput-1];
-
-  // error check on table parameters
-  // insure cutoff is within table
-
-  if (tb->ninput <= 1) error->one(FLERR,"Invalid pair table length");
-  if (tb->rflag == 0) {
-    rho_0 = tb->rfile[0];
-  } else {
-    rho_0 = tb->rlo;
-  }
-
-  tb->match = 0;
-  if (tabstyle == LINEAR && tb->ninput == tablength &&
-      tb->rflag == RSQ) tb->match = 1;
-
-  // spline read-in values and compute r,e,f vectors within table
-
-  if (tb->match == 0) spline_table(tb);
-  compute_table(tb);
-
-  // store ptr to table in tabindex
-
-  int count = 0;
-  for (int i = ilo; i <= ihi; i++) {
-    for (int j = MAX(jlo,i); j <= jhi; j++) {
-      tabindex[i][j] = ntables;
-      setflag[i][j] = 1;
-      count++;
-    }
-  }
-
-  if (count == 0) error->all(FLERR,"Illegal pair_coeff command");
-  ntables++;
-
-  // Match site* to isite values.
-
-  if (strcmp(site1, "1fluid") == 0)
-     isite1 = oneFluidParameter;
-  else {
-     isite1 = nspecies;
-     for (int ispecies = 0; ispecies < nspecies; ++ispecies)
-        if (strcmp(site1, atom->dname[ispecies]) == 0){
-           isite1 = ispecies;
-           break;
-        }
-
-     if (isite1 == nspecies)
-        error->all(FLERR,"Pair_multi_lucy_rx site1 is invalid.");
-  }
-
-  if (strcmp(site2, "1fluid") == 0)
-     isite2 = oneFluidParameter;
-  else {
-     isite2 = nspecies;
-     for (int ispecies = 0; ispecies < nspecies; ++ispecies)
-        if (strcmp(site2, atom->dname[ispecies]) == 0){
-           isite2 = ispecies;
-           break;
-        }
-
-     if (isite2 == nspecies)
-        error->all(FLERR,"Pair_multi_lucy_rx site2 is invalid.");
-  }
-
+  this->template operator()<NEIGHFLAG,NEWTON_PAIR,EVFLAG,TABSTYLE>(TagPairMultiLucyRXCompute<NEIGHFLAG,NEWTON_PAIR,EVFLAG,TABSTYLE>(), ii, ev);
 }
 
 /* ---------------------------------------------------------------------- */
@@ -486,12 +437,16 @@ void PairMultiLucyRXKokkos<DeviceType>::coeff(int narg, char **arg)
 template<class DeviceType>
 void PairMultiLucyRXKokkos<DeviceType>::computeLocalDensity()
 {
+  copymode = 1;
+
   x = atomKK->k_x.view<DeviceType>();
   type = atomKK->k_type.view<DeviceType>();
   rho = atomKK->k_rho.view<DeviceType>();
+  h_rho = atomKK->k_rho.h_view;
   nlocal = atom->nlocal;
 
-  //sync
+  atomKK->sync(execution_space,X_MASK | TYPE_MASK | DPDRHO_MASK);
+  atomKK->modified(execution_space,DPDRHO_MASK);
 
   const int inum = list->inum;
   NeighListKokkos<DeviceType>* k_list = static_cast<NeighListKokkos<DeviceType>*>(list);
@@ -514,16 +469,34 @@ void PairMultiLucyRXKokkos<DeviceType>::computeLocalDensity()
   if (newton_pair) m += atom->nghost;
   Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXZero>(0,m),*this);
 
-// rho = density at each atom
-// loop over neighbors of my atoms
-  if (newton_pair)
-    Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXComputeLocalDensity<HALF,1> >(0,inum),*this);
-  else
-    Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXComputeLocalDensity<HALF,0> >(0,inum),*this);
+  // rho = density at each atom
+  // loop over neighbors of my atoms
 
-  if (newton_pair) comm->reverse_comm_pair(this);
+  if (neighflag == HALF) {
+    if (newton_pair)
+      Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXComputeLocalDensity<HALF,1> >(0,inum),*this);
+    else
+      Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXComputeLocalDensity<HALF,0> >(0,inum),*this);
+  } else if (neighflag == HALFTHREAD) {
+    if (newton_pair)
+      Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXComputeLocalDensity<HALFTHREAD,1> >(0,inum),*this);
+    else
+      Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXComputeLocalDensity<HALFTHREAD,0> >(0,inum),*this);
+  }
+
+  // communicate and sum densities (on the host)
+
+  if (newton_pair) {
+    atomKK->modified(execution_space,DPDRHO_MASK);
+    atomKK->sync(Host,DPDRHO_MASK);
+    comm->reverse_comm_pair(this);
+    atomKK->modified(Host,DPDRHO_MASK);
+    atomKK->sync(execution_space,DPDRHO_MASK);
+  }
 
   comm->forward_comm_pair(this);
+
+  copymode = 0;
 }
 
 template<class DeviceType>
@@ -536,6 +509,10 @@ template<class DeviceType>
 template<int NEIGHFLAG, int NEWTON_PAIR>
 KOKKOS_INLINE_FUNCTION
 void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXComputeLocalDensity<NEIGHFLAG,NEWTON_PAIR>, const int &ii) const {
+
+  // The rho array is atomic for Half/Thread neighbor style
+  Kokkos::View<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_rho = rho;
+
   const int i = d_ilist[ii];
 
   const double xtmp = x(i,0);
@@ -567,7 +544,7 @@ void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXComputeLoca
         const double factor = factor_type11*(1.0 + 1.5*r_over_rcut)*tmpFactor4;
         rho_i += factor;
         if (NEWTON_PAIR || j < nlocal)
-          rho[j] += factor;
+          a_rho[j] += factor;
       } else if (rsq < d_cutsq(itype,jtype)) {
         const double rcut = sqrt(d_cutsq(itype,jtype));
         const double tmpFactor = 1.0-sqrt(rsq)/rcut;
@@ -575,12 +552,12 @@ void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXComputeLoca
         const double factor = (84.0/(5.0*pi*rcut*rcut*rcut))*(1.0+3.0*sqrt(rsq)/(2.0*rcut))*tmpFactor4;
         rho_i += factor;
         if (NEWTON_PAIR || j < nlocal)
-          rho[j] += factor;
+          a_rho[j] += factor;
       }
     }
   }
 
-  rho[i] = rho_i;
+  a_rho[i] = rho_i;
 }
 
 /* ---------------------------------------------------------------------- */
@@ -630,16 +607,53 @@ void PairMultiLucyRXKokkos<DeviceType>::getParams(int id, double &fractionOld1,
 
 /* ---------------------------------------------------------------------- */
 
+template<class DeviceType>
+int PairMultiLucyRXKokkos<DeviceType>::pack_forward_comm_kokkos(int n, DAT::tdual_int_2d k_sendlist, int iswap_in, DAT::tdual_xfloat_1d &buf,
+                               int pbc_flag, int *pbc)
+{
+  d_sendlist = k_sendlist.view<DeviceType>();  // arguments are stashed in members for the tagged operator(); pbc_flag and pbc are not used
+  iswap = iswap_in;
+  v_buf = buf.view<DeviceType>();
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXPackForwardComm>(0,n),*this);  // launch on DeviceType, matching the views taken above
+  DeviceType::fence();  // wait for the packing kernel before returning
+  return n;  // one rho value is packed per listed atom
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXPackForwardComm, const int &i) const {
+  // gather: buffer slot i receives rho of the atom stored at (iswap, i) in the send list
+  v_buf[i] = rho[d_sendlist(iswap, i)];
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairMultiLucyRXKokkos<DeviceType>::unpack_forward_comm_kokkos(int n, int first_in, DAT::tdual_xfloat_1d &buf)
+{
+  first = first_in;  // arguments are stashed in members for the tagged operator()
+  v_buf = buf.view<DeviceType>();
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXUnpackForwardComm>(0,n),*this);  // launch on DeviceType, matching the view taken above
+  DeviceType::fence();  // wait for the unpacking kernel before returning
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXUnpackForwardComm, const int &i) const {
+  rho[first + i] = v_buf[i];  // scatter: buffer slot i fills the i-th rho entry counted from 'first'
+}
+
+/* ---------------------------------------------------------------------- */
+
 template<class DeviceType>
 int PairMultiLucyRXKokkos<DeviceType>::pack_forward_comm(int n, int *list, double *buf, int pbc_flag, int *pbc)
 {
   int i,j,m;
-  rho = atomKK->k_rho.view<DeviceType>();
 
   m = 0;
   for (i = 0; i < n; i++) {
     j = list[i];
-    buf[m++] = rho[j];
+    buf[m++] = h_rho[j];  // read through the host view h_rho (taken from k_rho.h_view in computeLocalDensity)
   }
   return m;
 }
@@ -650,11 +664,10 @@ template<class DeviceType>
 void PairMultiLucyRXKokkos<DeviceType>::unpack_forward_comm(int n, int first, double *buf)
 {
   int i,m,last;
-  rho = atomKK->k_rho.view<DeviceType>();
 
   m = 0;
   last = first + n;
-  for (i = first; i < last; i++) rho[i] = buf[m++];
+  for (i = first; i < last; i++) h_rho[i] = buf[m++];
 }
 
 /* ---------------------------------------------------------------------- */
@@ -663,11 +676,10 @@ template<class DeviceType>
 int PairMultiLucyRXKokkos<DeviceType>::pack_reverse_comm(int n, int first, double *buf)
 {
   int i,m,last;
-  rho = atomKK->k_rho.view<DeviceType>();
 
   m = 0;
   last = first + n;
-  for (i = first; i < last; i++) buf[m++] = rho[i];
+  for (i = first; i < last; i++) buf[m++] = h_rho[i];
   return m;
 }
 
@@ -677,12 +689,11 @@ template<class DeviceType>
 void PairMultiLucyRXKokkos<DeviceType>::unpack_reverse_comm(int n, int *list, double *buf)
 {
   int i,j,m;
-  rho = atomKK->k_rho.view<DeviceType>();
 
   m = 0;
   for (i = 0; i < n; i++) {
     j = list[i];
-    rho[j] += buf[m++];
+    h_rho[j] += buf[m++];
   }
 }
 
@@ -782,6 +793,145 @@ void PairMultiLucyRXKokkos<DeviceType>::ev_tally(EV_FLOAT &ev, const int &i, con
 
 /* ---------------------------------------------------------------------- */
 
+template<class DeviceType>
+void PairMultiLucyRXKokkos<DeviceType>::create_kokkos_tables()
+{
+  const int tlm1 = tablength-1;
+  // Builds the host/device table views from tables[0..ntables-1], then clears update_table.
+  memory->create_kokkos(d_table->innersq,h_table->innersq,ntables,"Table::innersq");  // per-table scalars: ntables entries each
+  memory->create_kokkos(d_table->invdelta,h_table->invdelta,ntables,"Table::invdelta");
+  memory->create_kokkos(d_table->deltasq6,h_table->deltasq6,ntables,"Table::deltasq6");
+
+  if(tabstyle == LOOKUP) {  // LOOKUP: tlm1 entries of e and f per table
+    memory->create_kokkos(d_table->e,h_table->e,ntables,tlm1,"Table::e");
+    memory->create_kokkos(d_table->f,h_table->f,ntables,tlm1,"Table::f");
+  }
+
+  if(tabstyle == LINEAR) {  // LINEAR: tablength entries of rsq/e/f and tlm1 entries of de/df per table
+    memory->create_kokkos(d_table->rsq,h_table->rsq,ntables,tablength,"Table::rsq");
+    memory->create_kokkos(d_table->e,h_table->e,ntables,tablength,"Table::e");
+    memory->create_kokkos(d_table->f,h_table->f,ntables,tablength,"Table::f");
+    memory->create_kokkos(d_table->de,h_table->de,ntables,tlm1,"Table::de");
+    memory->create_kokkos(d_table->df,h_table->df,ntables,tlm1,"Table::df");
+  }
+  // fill the host views, row i taken from tables[i]
+  for(int i=0; i < ntables; i++) {
+    Table* tb = &tables[i];
+
+    h_table->innersq[i] = tb->innersq;
+    h_table->invdelta[i] = tb->invdelta;
+    h_table->deltasq6[i] = tb->deltasq6;
+    // each loop is bounded by the view's second extent; NOTE(review): drsq, e2 and f2 are not created in this function - presumably their loops run zero times, confirm
+    for(int j = 0; j<h_table->rsq.dimension_1(); j++)
+      h_table->rsq(i,j) = tb->rsq[j];
+    for(int j = 0; j<h_table->drsq.dimension_1(); j++)
+      h_table->drsq(i,j) = tb->drsq[j];
+    for(int j = 0; j<h_table->e.dimension_1(); j++)
+      h_table->e(i,j) = tb->e[j];
+    for(int j = 0; j<h_table->de.dimension_1(); j++)
+      h_table->de(i,j) = tb->de[j];
+    for(int j = 0; j<h_table->f.dimension_1(); j++)
+      h_table->f(i,j) = tb->f[j];
+    for(int j = 0; j<h_table->df.dimension_1(); j++)
+      h_table->df(i,j) = tb->df[j];
+    for(int j = 0; j<h_table->e2.dimension_1(); j++)
+      h_table->e2(i,j) = tb->e2[j];
+    for(int j = 0; j<h_table->f2.dimension_1(); j++)
+      h_table->f2(i,j) = tb->f2[j];
+  }
+
+  // copy every host view into its device counterpart
+  Kokkos::deep_copy(d_table->innersq,h_table->innersq);
+  Kokkos::deep_copy(d_table->invdelta,h_table->invdelta);
+  Kokkos::deep_copy(d_table->deltasq6,h_table->deltasq6);
+  Kokkos::deep_copy(d_table->rsq,h_table->rsq);
+  Kokkos::deep_copy(d_table->drsq,h_table->drsq);
+  Kokkos::deep_copy(d_table->e,h_table->e);
+  Kokkos::deep_copy(d_table->de,h_table->de);
+  Kokkos::deep_copy(d_table->f,h_table->f);
+  Kokkos::deep_copy(d_table->df,h_table->df);
+  Kokkos::deep_copy(d_table->e2,h_table->e2);
+  Kokkos::deep_copy(d_table->f2,h_table->f2);
+  Kokkos::deep_copy(d_table->tabindex,h_table->tabindex);
+  // publish the device views through d_table_const, which the compute kernel reads
+  d_table_const.innersq = d_table->innersq;
+  d_table_const.invdelta = d_table->invdelta;
+  d_table_const.deltasq6 = d_table->deltasq6;
+  d_table_const.rsq = d_table->rsq;
+  d_table_const.drsq = d_table->drsq;
+  d_table_const.e = d_table->e;
+  d_table_const.de = d_table->de;
+  d_table_const.f = d_table->f;
+  d_table_const.df = d_table->df;
+  d_table_const.e2 = d_table->e2;
+  d_table_const.f2 = d_table->f2;
+
+  // the cutsq and tabindex views are created in allocate(), not here; only their contents are copied
+  Kokkos::deep_copy(d_table->cutsq,h_table->cutsq);
+  update_table = 0;
+}
+
+/* ----------------------------------------------------------------------
+   allocate all arrays
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairMultiLucyRXKokkos<DeviceType>::allocate()
+{
+  // Creates setflag and the cutsq/tabindex views, each (ntypes+1) x (ntypes+1), and zero-fills the raw arrays.
+  const int np1 = atom->ntypes + 1;
+  allocated = 1;
+
+  memory->create(setflag,np1,np1,"pair:setflag");
+  memset(&setflag[0][0],0,np1*np1*sizeof(int));
+  memory->create_kokkos(d_table->cutsq,h_table->cutsq,cutsq,np1,np1,"pair:cutsq");
+  d_table_const.cutsq = d_table->cutsq;
+  memset(&cutsq[0][0],0,np1*np1*sizeof(double));
+  memory->create_kokkos(d_table->tabindex,h_table->tabindex,tabindex,np1,np1,"pair:tabindex");
+  d_table_const.tabindex = d_table->tabindex;
+  memset(&tabindex[0][0],0,np1*np1*sizeof(int));
+}
+
+/* ----------------------------------------------------------------------
+   global settings
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairMultiLucyRXKokkos<DeviceType>::settings(int narg, char **arg)
+{
+  // pair_style arguments read here: arg[0] = table style name, arg[1] = table length
+  if (narg < 2) error->all(FLERR,"Illegal pair_style command");
+
+  const char *style_name = arg[0];
+  if (!strcmp(style_name,"lookup")) tabstyle = LOOKUP;
+  else if (!strcmp(style_name,"linear")) tabstyle = LINEAR;
+  else error->all(FLERR,"Unknown table style in pair_style command");
+
+  tablength = force->inumeric(FLERR,arg[1]);
+  if (tablength < 2) error->all(FLERR,"Illegal number of pair table entries");
+
+  // tables built under the previous settings cannot be reused: free them all
+
+  for (int itab = 0; itab < ntables; ++itab) free_table(tables + itab);
+  memory->sfree(tables);
+  tables = NULL;
+  ntables = 0;
+
+  if (allocated) {
+    memory->destroy(setflag);
+
+    // reset the per-type-pair views by assigning default-constructed ones
+    h_table->tabindex = typename ArrayTypes<LMPHostType>::t_int_2d();
+    d_table_const.tabindex = d_table->tabindex = typename ArrayTypes<DeviceType>::t_int_2d();
+    h_table->cutsq = typename ArrayTypes<LMPHostType>::t_ffloat_2d();
+    d_table_const.cutsq = d_table->cutsq = typename ArrayTypes<DeviceType>::t_ffloat_2d();
+  }
+
+  allocated = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
 namespace LAMMPS_NS {
 template class PairMultiLucyRXKokkos<LMPDeviceType>;
 #ifdef KOKKOS_HAVE_CUDA
diff --git a/src/KOKKOS/pair_multi_lucy_rx_kokkos.h b/src/KOKKOS/pair_multi_lucy_rx_kokkos.h
index ff22516eb1..a259588f78 100644
--- a/src/KOKKOS/pair_multi_lucy_rx_kokkos.h
+++ b/src/KOKKOS/pair_multi_lucy_rx_kokkos.h
@@ -29,9 +29,12 @@ PairStyle(multi/lucy/rx/kk/host,PairMultiLucyRXKokkos<LMPHostType>)
 
 namespace LAMMPS_NS {
 
+struct TagPairMultiLucyRXPackForwardComm{};
+struct TagPairMultiLucyRXUnpackForwardComm{};
+
 struct TagPairMultiLucyRXgetParams{};
 
-template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
+template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG, int TABSTYLE>
 struct TagPairMultiLucyRXCompute{};
 
 struct TagPairMultiLucyRXZero{};
@@ -50,24 +53,37 @@ class PairMultiLucyRXKokkos : public PairMultiLucyRX {
   virtual ~PairMultiLucyRXKokkos();
 
   void compute(int, int);
+  void settings(int, char **);
+
+  template<int TABSTYLE>
+  void compute_style(int, int);
+
   void init_style();
-  void coeff(int, char **);
+  int pack_forward_comm_kokkos(int, DAT::tdual_int_2d, int, DAT::tdual_xfloat_1d&,
+                               int, int *);
+  void unpack_forward_comm_kokkos(int, int, DAT::tdual_xfloat_1d&);
   int pack_forward_comm(int, int *, double *, int, int *);
   void unpack_forward_comm(int, int, double *);
   int pack_reverse_comm(int, int, double *);
   void unpack_reverse_comm(int, int *, double *);
   void computeLocalDensity();
 
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPairMultiLucyRXPackForwardComm, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPairMultiLucyRXUnpackForwardComm, const int&) const;
+
   KOKKOS_INLINE_FUNCTION
   void operator()(TagPairMultiLucyRXgetParams, const int&) const;
 
-  template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
+  template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG, int TABSTYLE>
   KOKKOS_INLINE_FUNCTION
-  void operator()(TagPairMultiLucyRXCompute<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int&, EV_FLOAT&) const;
+  void operator()(TagPairMultiLucyRXCompute<NEIGHFLAG,NEWTON_PAIR,EVFLAG,TABSTYLE>, const int&, EV_FLOAT&) const;
 
-  template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
+  template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG, int TABSTYLE>
   KOKKOS_INLINE_FUNCTION
-  void operator()(TagPairMultiLucyRXCompute<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int&) const;
+  void operator()(TagPairMultiLucyRXCompute<NEIGHFLAG,NEWTON_PAIR,EVFLAG,TABSTYLE>, const int&) const;
 
   KOKKOS_INLINE_FUNCTION
   void operator()(TagPairMultiLucyRXZero, const int&) const;
@@ -92,6 +108,8 @@ class PairMultiLucyRXKokkos : public PairMultiLucyRX {
   double rcut_type11;
   double factor_type11;
 
+  enum{LOOKUP,LINEAR,SPLINE,BITMAP};
+
   //struct Table {
   //  int ninput,rflag,fpflag,match;
   //  double rlo,rhi,fplo,fphi,cut;
@@ -100,14 +118,47 @@ class PairMultiLucyRXKokkos : public PairMultiLucyRX {
   //  double innersq,delta,invdelta,deltasq6;
   //  double *rsq,*drsq,*e,*de,*f,*df,*e2,*f2;
   //};
-  //Table *tables;
+
+  int tabstyle,tablength;
+  /*struct TableDeviceConst {
+    typename ArrayTypes<DeviceType>::t_ffloat_2d_randomread cutsq;
+    typename ArrayTypes<DeviceType>::t_int_2d_randomread tabindex;
+    typename ArrayTypes<DeviceType>::t_ffloat_1d_randomread innersq,invdelta,deltasq6;
+    typename ArrayTypes<DeviceType>::t_ffloat_2d_randomread rsq,drsq,e,de,f,df,e2,f2;
+  };*/
+ //It's faster not to use texture fetch if the number of tables is less than 32!
+  struct TableDeviceConst {  // view set held by d_table_const
+    typename ArrayTypes<DeviceType>::t_ffloat_2d cutsq;
+    typename ArrayTypes<DeviceType>::t_int_2d tabindex;
+    typename ArrayTypes<DeviceType>::t_ffloat_1d innersq,invdelta,deltasq6;
+    typename ArrayTypes<DeviceType>::t_ffloat_2d_randomread rsq,drsq,e,de,f,df,e2,f2;
+  };
+
+  struct TableDevice {  // same fields as TableDeviceConst, all plain DeviceType views; see d_table
+    typename ArrayTypes<DeviceType>::t_ffloat_2d cutsq;
+    typename ArrayTypes<DeviceType>::t_int_2d tabindex;
+    typename ArrayTypes<DeviceType>::t_ffloat_1d innersq,invdelta,deltasq6;
+    typename ArrayTypes<DeviceType>::t_ffloat_2d rsq,drsq,e,de,f,df,e2,f2;
+  };
+
+  struct TableHost {  // LMPHostType counterparts of the TableDevice views; see h_table
+    typename ArrayTypes<LMPHostType>::t_ffloat_2d cutsq;
+    typename ArrayTypes<LMPHostType>::t_int_2d tabindex;
+    typename ArrayTypes<LMPHostType>::t_ffloat_1d innersq,invdelta,deltasq6;
+    typename ArrayTypes<LMPHostType>::t_ffloat_2d rsq,drsq,e,de,f,df,e2,f2;
+  };
+
+  TableDeviceConst d_table_const;
+  TableDevice* d_table;
+  TableHost* h_table;
 
   int **tabindex;
+  F_FLOAT m_cutsq[MAX_TYPES_STACKPARAMS+1][MAX_TYPES_STACKPARAMS+1];
 
-  //void read_table(Table *, char *, char *);
-  //void param_extract(Table *, char *);
-
-  char *site1, *site2;
+  void allocate();
+  int update_table;
+  void create_kokkos_tables();
+  void cleanup_copy();
 
   KOKKOS_INLINE_FUNCTION
   void getParams(int, double &, double &, double &, double &) const;
@@ -118,6 +169,7 @@ class PairMultiLucyRXKokkos : public PairMultiLucyRX {
   typename AT::t_f_array f;
   typename AT::t_int_1d_randomread type;
   typename AT::t_efloat_1d rho;
+  typename HAT::t_efloat_1d h_rho;
   typename AT::t_efloat_1d uCG, uCGnew;
   typename AT::t_float_2d dvector;
 
@@ -135,6 +187,11 @@ class PairMultiLucyRXKokkos : public PairMultiLucyRX {
   typename AT::tdual_ffloat_2d k_cutsq;
   typename AT::t_ffloat_2d d_cutsq;
 
+  int iswap;
+  int first;
+  typename AT::t_int_2d d_sendlist;
+  typename AT::t_xfloat_1d_um v_buf;
+
   friend void pair_virial_fdotr_compute<PairMultiLucyRXKokkos>(PairMultiLucyRXKokkos*);
 };
 
diff --git a/src/USER-DPD/fix_eos_table_rx.cpp b/src/USER-DPD/fix_eos_table_rx.cpp
index e10ce96089..8871bdd176 100644
--- a/src/USER-DPD/fix_eos_table_rx.cpp
+++ b/src/USER-DPD/fix_eos_table_rx.cpp
@@ -127,6 +127,8 @@ FixEOStableRX::FixEOStableRX(LAMMPS *lmp, int narg, char **arg) :
 
 FixEOStableRX::~FixEOStableRX()
 {
+  if (copymode) return;
+
   for (int m = 0; m < ntables; m++) {
     free_table(&tables[m]);
     free_table(&tables2[m]);
diff --git a/src/USER-DPD/pair_multi_lucy_rx.cpp b/src/USER-DPD/pair_multi_lucy_rx.cpp
index cd107f1519..6b5c7cf40a 100644
--- a/src/USER-DPD/pair_multi_lucy_rx.cpp
+++ b/src/USER-DPD/pair_multi_lucy_rx.cpp
@@ -78,6 +78,8 @@ PairMultiLucyRX::PairMultiLucyRX(LAMMPS *lmp) : Pair(lmp),
 
 PairMultiLucyRX::~PairMultiLucyRX()
 {
+  if (copymode) return;
+
   for (int m = 0; m < ntables; m++) free_table(&tables[m]);
   memory->sfree(tables);
 
diff --git a/src/USER-DPD/pair_table_rx.cpp b/src/USER-DPD/pair_table_rx.cpp
index 902d0e5bb4..463e1838c6 100644
--- a/src/USER-DPD/pair_table_rx.cpp
+++ b/src/USER-DPD/pair_table_rx.cpp
@@ -50,6 +50,8 @@ PairTableRX::PairTableRX(LAMMPS *lmp) : Pair(lmp)
 
 PairTableRX::~PairTableRX()
 {
+  if (copymode) return;
+
   for (int m = 0; m < ntables; m++) free_table(&tables[m]);
   memory->sfree(tables);
 
diff --git a/src/atom_masks.h b/src/atom_masks.h
index 119f09f273..8e29448488 100644
--- a/src/atom_masks.h
+++ b/src/atom_masks.h
@@ -42,6 +42,18 @@
 #define ENERGY_MASK    0x00010000
 #define VIRIAL_MASK    0x00020000
 
+// DPD
+
+#define DPDRHO_MASK       0x00040000
+#define DPDTHETA_MASK     0x00080000
+#define UCOND_MASK        0x00100000
+#define UMECH_MASK        0x00200000
+#define UCHEM_MASK        0x00400000
+#define UCG_MASK          0x00800000
+#define UCGNEW_MASK       0x01000000
+#define DUCHEM_MASK       0x02000000
+#define DVECTOR_MASK      0x04000000
+
 // granular
 
 #define RADIUS_MASK    0x00100000

From d5f8f36442bfc14ba49c4090f465f87afc65a24e Mon Sep 17 00:00:00 2001
From: Stan Moore <stanmoore1@gmail.com>
Date: Thu, 15 Dec 2016 15:48:09 -0700
Subject: [PATCH 012/267] Change to fix_property_atom to allow virtual override
 of grow_arrays() function

---
 src/fix_property_atom.cpp | 3 ++-
 src/fix_property_atom.h   | 6 +++---
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/fix_property_atom.cpp b/src/fix_property_atom.cpp
index b83aadc95d..002260d8f0 100644
--- a/src/fix_property_atom.cpp
+++ b/src/fix_property_atom.cpp
@@ -134,7 +134,6 @@ FixPropertyAtom::FixPropertyAtom(LAMMPS *lmp, int narg, char **arg) :
   // register with Atom class
 
   nmax_old = 0;
-  grow_arrays(atom->nmax);
   atom->add_callback(0);
   atom->add_callback(1);
   if (border) atom->add_callback(2);
@@ -190,6 +189,8 @@ int FixPropertyAtom::setmask()
 
 void FixPropertyAtom::init()
 {
+  grow_arrays(atom->nmax);
+
   // error if atom style has changed since fix was defined
   // don't allow this b/c user could change to style that defines molecule,q
 
diff --git a/src/fix_property_atom.h b/src/fix_property_atom.h
index 77a41f393a..d923d76cac 100644
--- a/src/fix_property_atom.h
+++ b/src/fix_property_atom.h
@@ -27,7 +27,7 @@ namespace LAMMPS_NS {
 class FixPropertyAtom : public Fix {
  public:
   FixPropertyAtom(class LAMMPS *, int, char **);
-  ~FixPropertyAtom();
+  virtual ~FixPropertyAtom();
   int setmask();
   void init();
 
@@ -38,7 +38,7 @@ class FixPropertyAtom : public Fix {
   void write_data_section_keyword(int, FILE *);
   void write_data_section(int, FILE *, int, double **, int);
 
-  void grow_arrays(int);
+  virtual void grow_arrays(int);
   void copy_arrays(int, int, int);
   int pack_border(int, int *, double *);
   int unpack_border(int, int, double *);
@@ -50,7 +50,7 @@ class FixPropertyAtom : public Fix {
   int maxsize_restart();
   double memory_usage();
 
- private:
+ protected:
   int nvalue,border;
   int molecule_flag,q_flag,rmass_flag;
   int *style,*index;

From a3c1d385e84a68721433eaaf318513962c489657 Mon Sep 17 00:00:00 2001
From: Stan Moore <stanmoore1@gmail.com>
Date: Thu, 15 Dec 2016 15:50:30 -0700
Subject: [PATCH 013/267] Adding Kokkos version of fix_property_atom

---
 src/KOKKOS/Install.sh                   |  2 +
 src/KOKKOS/fix_property_atom_kokkos.cpp | 72 ++++++++++++++++++++
 src/KOKKOS/fix_property_atom_kokkos.h   | 90 +++++++++++++++++++++++++
 3 files changed, 164 insertions(+)
 create mode 100644 src/KOKKOS/fix_property_atom_kokkos.cpp
 create mode 100644 src/KOKKOS/fix_property_atom_kokkos.h

diff --git a/src/KOKKOS/Install.sh b/src/KOKKOS/Install.sh
index 707ea1e986..a1830163bd 100644
--- a/src/KOKKOS/Install.sh
+++ b/src/KOKKOS/Install.sh
@@ -87,6 +87,8 @@ action fix_nve_kokkos.cpp
 action fix_nve_kokkos.h
 action fix_nvt_kokkos.cpp
 action fix_nvt_kokkos.h
+action fix_property_atom_kokkos.cpp
+action fix_property_atom_kokkos.h
 action fix_qeq_reax_kokkos.cpp fix_qeq_reax.cpp
 action fix_qeq_reax_kokkos.h fix_qeq_reax.h
 action fix_reaxc_bonds_kokkos.cpp fix_reaxc_bonds.cpp
diff --git a/src/KOKKOS/fix_property_atom_kokkos.cpp b/src/KOKKOS/fix_property_atom_kokkos.cpp
new file mode 100644
index 0000000000..327563efbd
--- /dev/null
+++ b/src/KOKKOS/fix_property_atom_kokkos.cpp
@@ -0,0 +1,72 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include <stdlib.h>
+#include <string.h>
+#include "fix_property_atom_kokkos.h"
+#include "atom_kokkos.h"
+#include "comm.h"
+#include "memory.h"
+#include "error.h"
+#include "update.h"
+
+using namespace LAMMPS_NS;
+using namespace FixConst;
+
+enum{MOLECULE,CHARGE,RMASS,INTEGER,DOUBLE};
+
+/* ---------------------------------------------------------------------- */
+
+FixPropertyAtomKokkos::FixPropertyAtomKokkos(LAMMPS *lmp, int narg, char **arg) :
+  FixPropertyAtom(lmp, narg, arg)
+{
+  atomKK = (AtomKokkos *) atom;  // same atom pointer, cast so grow_arrays() can reach k_dvector
+}
+
+/* ----------------------------------------------------------------------
+   grow atom-based arrays to nmax; zero the new entries [nmax_old,nmax),
+   since AtomVec class won't do it as atoms are added,
+   e.g. in create_atom() or data_atom()
+   DOUBLE style only calls grow_kokkos() on k_dvector; its memset is disabled
+------------------------------------------------------------------------- */
+
+void FixPropertyAtomKokkos::grow_arrays(int nmax)
+{
+  for (int m = 0; m < nvalue; m++) {
+    if (style[m] == MOLECULE) {
+      memory->grow(atom->molecule,nmax,"atom:molecule");
+      size_t nbytes = (nmax-nmax_old) * sizeof(tagint);
+      memset(&atom->molecule[nmax_old],0,nbytes);
+    } else if (style[m] == CHARGE) {
+      memory->grow(atom->q,nmax,"atom:q");
+      size_t nbytes = (nmax-nmax_old) * sizeof(double);
+      memset(&atom->q[nmax_old],0,nbytes);
+    } else if (style[m] == RMASS) {
+      memory->grow(atom->rmass,nmax,"atom:rmass");
+      size_t nbytes = (nmax-nmax_old) * sizeof(double);
+      memset(&atom->rmass[nmax_old],0,nbytes);
+    } else if (style[m] == INTEGER) {
+      memory->grow(atom->ivector[index[m]],nmax,"atom:ivector");
+      size_t nbytes = (nmax-nmax_old) * sizeof(int);
+      memset(&atom->ivector[index[m]][nmax_old],0,nbytes);
+    } else if (style[m] == DOUBLE) {
+      memory->grow_kokkos(atomKK->k_dvector,atomKK->dvector,nvalue,nmax,
+                          "atom:dvector");
+      //memory->grow(atom->dvector[index[m]],nmax,"atom:dvector");
+      //size_t nbytes = (nmax-nmax_old) * sizeof(double);
+      //memset(&atom->dvector[index[m]][nmax_old],0,nbytes);
+    }
+  }
+  // remember this length: the next call zeroes from nmax_old onward
+  nmax_old = nmax;
+}
diff --git a/src/KOKKOS/fix_property_atom_kokkos.h b/src/KOKKOS/fix_property_atom_kokkos.h
new file mode 100644
index 0000000000..ed1e4d7cfb
--- /dev/null
+++ b/src/KOKKOS/fix_property_atom_kokkos.h
@@ -0,0 +1,90 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+
+FixStyle(property/atom/kk,FixPropertyAtomKokkos)
+
+#else
+
+#ifndef LMP_FIX_PROPERTY_ATOM_KOKKOS_H
+#define LMP_FIX_PROPERTY_ATOM_KOKKOS_H
+
+#include "fix_property_atom.h"
+
+namespace LAMMPS_NS {
+
+class FixPropertyAtomKokkos : public FixPropertyAtom {
+ public:
+  FixPropertyAtomKokkos(class LAMMPS *, int, char **);
+  virtual ~FixPropertyAtomKokkos() {}
+
+  void grow_arrays(int);  // Kokkos version: DOUBLE vectors are grown via grow_kokkos()
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Illegal ... command
+
+Self-explanatory.  Check the input script syntax and compare to the
+documentation for the command.  You can use -echo screen as a
+command-line option when running LAMMPS to see the offending line.
+
+E: Fix property/atom mol when atom_style already has molecule attribute
+
+Self-explanatory.
+
+E: Fix property/atom cannot specify mol twice
+
+Self-explanatory.
+
+E: Fix property/atom q when atom_style already has charge attribute
+
+Self-explanatory.
+
+E: Fix property/atom cannot specify q twice
+
+Self-explanatory.
+
+E: Fix property/atom vector name already exists
+
+The name for an integer or floating-point vector must be unique.
+
+W: Fix property/atom mol or charge w/out ghost communication
+
+A model typically needs these properties defined for ghost atoms.
+
+E: Atom style was redefined after using fix property/atom
+
+This is not allowed.
+
+E: Incorrect %s format in data file
+
+A section of the data file being read by fix property/atom does
+not have the correct number of values per line.
+
+E: Too few lines in %s section of data file
+
+Self-explanatory.
+
+E: Invalid atom ID in %s section of data file
+
+An atom in a section of the data file being read by fix property/atom
+has an invalid atom ID that is <= 0 or > the maximum existing atom ID.
+
+*/

From f47a40b2e4ead7248c601129c9e8a5d82deac91b Mon Sep 17 00:00:00 2001
From: Stan Moore <stanmoore1@gmail.com>
Date: Fri, 16 Dec 2016 10:02:01 -0700
Subject: [PATCH 014/267] Fixing Kokkos memory deallocation issue

---
 src/KOKKOS/atom_kokkos.cpp | 13 +++++++++++++
 src/atom.cpp               |  8 +++++---
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/src/KOKKOS/atom_kokkos.cpp b/src/KOKKOS/atom_kokkos.cpp
index 4a7250e6ab..97b76ba67c 100644
--- a/src/KOKKOS/atom_kokkos.cpp
+++ b/src/KOKKOS/atom_kokkos.cpp
@@ -73,6 +73,19 @@ AtomKokkos::~AtomKokkos()
   memory->destroy_kokkos(k_improper_atom2, improper_atom2);
   memory->destroy_kokkos(k_improper_atom3, improper_atom3);
   memory->destroy_kokkos(k_improper_atom4, improper_atom4);
+
+  // USER-DPD package
+  memory->destroy_kokkos(k_uCond,uCond);
+  memory->destroy_kokkos(k_uMech,uMech);
+  memory->destroy_kokkos(k_uChem,uChem);
+  memory->destroy_kokkos(k_uCG,uCG);
+  memory->destroy_kokkos(k_uCGnew,uCGnew);
+  memory->destroy_kokkos(k_rho,rho);
+  memory->destroy_kokkos(k_dpdTheta,dpdTheta);
+  memory->destroy_kokkos(k_duChem,duChem);
+
+  memory->destroy_kokkos(k_dvector,dvector);
+  dvector = NULL;
 }
 
 /* ---------------------------------------------------------------------- */
diff --git a/src/atom.cpp b/src/atom.cpp
index 053a18430b..c7f8345898 100644
--- a/src/atom.cpp
+++ b/src/atom.cpp
@@ -331,9 +331,11 @@ Atom::~Atom()
     delete [] iname[i];
     memory->destroy(ivector[i]);
   }
-  for (int i = 0; i < ndvector; i++) {
-    delete [] dname[i];
-    memory->destroy(dvector[i]);
+  if (dvector != NULL) {
+    for (int i = 0; i < ndvector; i++) {
+      delete [] dname[i];
+      memory->destroy(dvector[i]);
+    }
   }
 
   memory->sfree(iname);

From d93e3d1cee93983df5e1c0707b4957d2bc138e9a Mon Sep 17 00:00:00 2001
From: Stan Moore <stanmoore1@gmail.com>
Date: Fri, 16 Dec 2016 10:06:12 -0700
Subject: [PATCH 015/267] Fixing runtime issues with pair_exp6_rx_kokkos

---
 src/KOKKOS/fix_property_atom_kokkos.cpp |  2 +-
 src/KOKKOS/pair_exp6_rx_kokkos.cpp      | 17 ++++++++++++++---
 src/USER-DPD/pair_exp6_rx.cpp           | 10 ++++++----
 src/USER-DPD/pair_exp6_rx.h             |  4 ++--
 4 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/src/KOKKOS/fix_property_atom_kokkos.cpp b/src/KOKKOS/fix_property_atom_kokkos.cpp
index 327563efbd..cb52988c31 100644
--- a/src/KOKKOS/fix_property_atom_kokkos.cpp
+++ b/src/KOKKOS/fix_property_atom_kokkos.cpp
@@ -60,7 +60,7 @@ void FixPropertyAtomKokkos::grow_arrays(int nmax)
       size_t nbytes = (nmax-nmax_old) * sizeof(int);
       memset(&atom->ivector[index[m]][nmax_old],0,nbytes);
     } else if (style[m] == DOUBLE) {
-      memory->grow_kokkos(atomKK->k_dvector,atomKK->dvector,nvalue,nmax,
+      memory->grow_kokkos(atomKK->k_dvector,atomKK->dvector,atomKK->k_dvector.dimension_0(),nmax,
                           "atom:dvector");
       //memory->grow(atom->dvector[index[m]],nmax,"atom:dvector");
       //size_t nbytes = (nmax-nmax_old) * sizeof(double);
diff --git a/src/KOKKOS/pair_exp6_rx_kokkos.cpp b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
index 7e74f39ef0..e7934cfa0b 100644
--- a/src/KOKKOS/pair_exp6_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
@@ -65,7 +65,20 @@ PairExp6rxKokkos<DeviceType>::PairExp6rxKokkos(LAMMPS *lmp) : PairExp6rx(lmp)
 template<class DeviceType>
 PairExp6rxKokkos<DeviceType>::~PairExp6rxKokkos()
 {
+  if (copymode) return;
 
+  memory->destroy_kokkos(k_eatom,eatom);
+  memory->destroy_kokkos(k_vatom,vatom);
+
+  memory->destroy_kokkos(k_cutsq,cutsq);
+  // free each entry's name/potential strings before params itself is destroyed
+  for (int i=0; i < nparams; ++i) {
+    delete[] params[i].name;
+    delete[] params[i].potential;
+  }
+  memory->destroy_kokkos(k_params,params);
+
+  memory->destroy_kokkos(k_mol2param,mol2param);
 }
 
 /* ---------------------------------------------------------------------- */
@@ -73,7 +86,7 @@ PairExp6rxKokkos<DeviceType>::~PairExp6rxKokkos()
 template<class DeviceType>
 void PairExp6rxKokkos<DeviceType>::init_style()
 {
-  PairExp6rxKokkos::init_style();
+  PairExp6rx::init_style();
 
   // irequest = neigh request made by parent class
 
@@ -89,11 +102,9 @@ void PairExp6rxKokkos<DeviceType>::init_style()
   if (neighflag == FULL) {
     neighbor->requests[irequest]->full = 1;
     neighbor->requests[irequest]->half = 0;
-    neighbor->requests[irequest]->ghost = 1;
   } else if (neighflag == HALF || neighflag == HALFTHREAD) {
     neighbor->requests[irequest]->full = 0;
     neighbor->requests[irequest]->half = 1;
-    neighbor->requests[irequest]->ghost = 1;
   } else {
     error->all(FLERR,"Cannot use chosen neighbor list style with reax/c/kk");
   }
diff --git a/src/USER-DPD/pair_exp6_rx.cpp b/src/USER-DPD/pair_exp6_rx.cpp
index 2643c9ec04..dd8ac4bbe7 100644
--- a/src/USER-DPD/pair_exp6_rx.cpp
+++ b/src/USER-DPD/pair_exp6_rx.cpp
@@ -79,11 +79,13 @@ PairExp6rx::~PairExp6rx()
 {
   if (copymode) return;
 
-  for (int i=0; i < nparams; ++i) {
-    delete[] params[i].name;
-    delete[] params[i].potential;
+  if (params != NULL) {
+    for (int i=0; i < nparams; ++i) {
+      delete[] params[i].name;
+      delete[] params[i].potential;
+    }
+    memory->destroy(params);
   }
-  memory->destroy(params);
   memory->destroy(mol2param);
 
   if (allocated) {
diff --git a/src/USER-DPD/pair_exp6_rx.h b/src/USER-DPD/pair_exp6_rx.h
index dd9fa22a48..f9654e4086 100644
--- a/src/USER-DPD/pair_exp6_rx.h
+++ b/src/USER-DPD/pair_exp6_rx.h
@@ -44,7 +44,7 @@ class PairExp6rx : public Pair {
   double **epsilon,**rm,**alpha;
   double **rminv,**buck1,**buck2,**offset;
 
-  void allocate();
+  virtual void allocate();
   int *mol2param;               // mapping from molecule to parameters
   int nparams;                  // # of stored parameter sets
   int maxparam;                 // max # of parameter sets
@@ -58,7 +58,7 @@ class PairExp6rx : public Pair {
   Param *params;                // parameter set for an I-J-K interaction
 
   int nspecies;
-  void read_file(char *);
+  virtual void read_file(char *);
   void setup();
 
   int isite1, isite2;

From cfa61b98aec5951824affe4057f81e022868d470 Mon Sep 17 00:00:00 2001
From: Stan Moore <stanmoore1@gmail.com>
Date: Fri, 16 Dec 2016 12:37:41 -0700
Subject: [PATCH 016/267] Fixing runtime issues in fix_eos_table_rx_kokkos

---
 src/KOKKOS/fix_eos_table_rx_kokkos.cpp | 152 ++++++++++++++++++++-----
 src/KOKKOS/fix_eos_table_rx_kokkos.h   |  33 ++++++
 2 files changed, 157 insertions(+), 28 deletions(-)

diff --git a/src/KOKKOS/fix_eos_table_rx_kokkos.cpp b/src/KOKKOS/fix_eos_table_rx_kokkos.cpp
index 75e9b292f9..6cb5c0611a 100644
--- a/src/KOKKOS/fix_eos_table_rx_kokkos.cpp
+++ b/src/KOKKOS/fix_eos_table_rx_kokkos.cpp
@@ -40,8 +40,12 @@ FixEOStableRXKokkos<DeviceType>::FixEOStableRXKokkos(LAMMPS *lmp, int narg, char
 {
   atomKK = (AtomKokkos *) atom;
   execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
-  datamask_read = X_MASK | F_MASK | TYPE_MASK | ENERGY_MASK | VIRIAL_MASK;
-  datamask_modify = F_MASK | ENERGY_MASK | VIRIAL_MASK;
+  datamask_read = EMPTY_MASK;
+  datamask_modify = EMPTY_MASK;
+
+  update_table = 1;
+  h_table = new TableHost();
+  d_table = new TableDevice();
 
   k_error_flag = DAT::tdual_int_scalar("fix:error_flag");
   k_warning_flag = DAT::tdual_int_scalar("fix:warning_flag");
@@ -53,6 +57,9 @@ template<class DeviceType>
 FixEOStableRXKokkos<DeviceType>::~FixEOStableRXKokkos()
 {
   if (copymode) return;
+
+  delete h_table;
+  delete d_table;
 }
 
 /* ---------------------------------------------------------------------- */
@@ -60,6 +67,11 @@ FixEOStableRXKokkos<DeviceType>::~FixEOStableRXKokkos()
 template<class DeviceType>
 void FixEOStableRXKokkos<DeviceType>::setup(int vflag)
 {
+  if (update_table)
+    create_kokkos_tables();
+
+  copymode = 1;
+
   int nlocal = atom->nlocal;
   mask = atomKK->k_mask.view<DeviceType>();
   uCond = atomKK->k_uCond.view<DeviceType>();
@@ -68,6 +80,10 @@ void FixEOStableRXKokkos<DeviceType>::setup(int vflag)
   dpdTheta= atomKK->k_dpdTheta.view<DeviceType>();
   uCG = atomKK->k_uCG.view<DeviceType>();
   uCGnew = atomKK->k_uCGnew.view<DeviceType>();
+  dvector = atomKK->k_dvector.view<DeviceType>();
+
+  atomKK->sync(execution_space,MASK_MASK | UCOND_MASK | UMECH_MASK | UCHEM_MASK | DPDTHETA_MASK | UCG_MASK | UCGNEW_MASK | DVECTOR_MASK);
+  atomKK->modified(execution_space,UCHEM_MASK | DPDTHETA_MASK | UCG_MASK | UCGNEW_MASK);
 
   Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagFixEOStableRXSetup>(0,nlocal),*this);
 
@@ -77,6 +93,8 @@ void FixEOStableRXKokkos<DeviceType>::setup(int vflag)
   Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagFixEOStableRXTemperatureLookup>(0,nlocal),*this);
 
   error_check();
+
+  copymode = 0;
 }
 
 template<class DeviceType>
@@ -102,12 +120,21 @@ void FixEOStableRXKokkos<DeviceType>::operator()(TagFixEOStableRXTemperatureLook
 template<class DeviceType>
 void FixEOStableRXKokkos<DeviceType>::init()
 {
+  if (update_table)
+    create_kokkos_tables();
+
+  copymode = 1;
+
   int nlocal = atom->nlocal;
   mask = atomKK->k_mask.view<DeviceType>();
   uCond = atomKK->k_uCond.view<DeviceType>();
   uMech = atomKK->k_uMech.view<DeviceType>();
   uChem = atomKK->k_uChem.view<DeviceType>();
   dpdTheta= atomKK->k_dpdTheta.view<DeviceType>();
+  dvector = atomKK->k_dvector.view<DeviceType>();
+
+  atomKK->sync(execution_space,MASK_MASK | UCOND_MASK | UMECH_MASK | UCHEM_MASK | DPDTHETA_MASK | UCG_MASK | UCGNEW_MASK | DVECTOR_MASK);
+  atomKK->modified(execution_space,UCOND_MASK | UMECH_MASK | UCHEM_MASK | DPDTHETA_MASK);
 
   if (this->restart_reset)
     Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagFixEOStableRXTemperatureLookup>(0,nlocal),*this);
@@ -115,6 +142,8 @@ void FixEOStableRXKokkos<DeviceType>::init()
     Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagFixEOStableRXInit>(0,nlocal),*this);
 
   error_check();
+
+  copymode = 0;
 }
 
 template<class DeviceType>
@@ -136,16 +165,27 @@ void FixEOStableRXKokkos<DeviceType>::operator()(TagFixEOStableRXInit, const int
 template<class DeviceType>
 void FixEOStableRXKokkos<DeviceType>::post_integrate()
 {
+  if (update_table)
+    create_kokkos_tables();
+  // *this is the functor handed to parallel_for below; the destructor returns early while copymode is set
+  copymode = 1;
+
   int nlocal = atom->nlocal;
   mask = atomKK->k_mask.view<DeviceType>();
   uCond = atomKK->k_uCond.view<DeviceType>();
   uMech = atomKK->k_uMech.view<DeviceType>();
   uChem = atomKK->k_uChem.view<DeviceType>();
   dpdTheta= atomKK->k_dpdTheta.view<DeviceType>();
+  dvector = atomKK->k_dvector.view<DeviceType>();
+  // sync the listed per-atom fields to execution_space; only dpdTheta is flagged as modified there
+  atomKK->sync(execution_space,MASK_MASK | UCOND_MASK | UMECH_MASK | UCHEM_MASK | DPDTHETA_MASK | DVECTOR_MASK);
+  atomKK->modified(execution_space,DPDTHETA_MASK);
 
   Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagFixEOStableRXTemperatureLookup2>(0,nlocal),*this);
 
   error_check();
+
+  copymode = 0;
 }
 
 template<class DeviceType>
@@ -163,6 +203,11 @@ void FixEOStableRXKokkos<DeviceType>::operator()(TagFixEOStableRXTemperatureLook
 template<class DeviceType>
 void FixEOStableRXKokkos<DeviceType>::end_of_step()
 {
+  if (update_table)
+    create_kokkos_tables();
+
+  copymode = 1;
+
   int nlocal = atom->nlocal;
   mask = atomKK->k_mask.view<DeviceType>();
   uCond = atomKK->k_uCond.view<DeviceType>();
@@ -171,7 +216,10 @@ void FixEOStableRXKokkos<DeviceType>::end_of_step()
   dpdTheta= atomKK->k_dpdTheta.view<DeviceType>();
   uCG = atomKK->k_uCG.view<DeviceType>();
   uCGnew = atomKK->k_uCGnew.view<DeviceType>();
-  double duChem;
+  dvector = atomKK->k_dvector.view<DeviceType>();
+
+  atomKK->sync(execution_space,MASK_MASK | UCOND_MASK | UMECH_MASK | UCHEM_MASK | DPDTHETA_MASK | UCG_MASK | UCGNEW_MASK | DVECTOR_MASK);
+  atomKK->modified(execution_space,UCHEM_MASK | DPDTHETA_MASK | UCG_MASK | UCGNEW_MASK);
 
   // Communicate the ghost uCGnew
   comm->reverse_comm_fix(this);
@@ -184,6 +232,8 @@ void FixEOStableRXKokkos<DeviceType>::end_of_step()
   Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagFixEOStableRXTemperatureLookup2>(0,nlocal),*this);
 
   error_check();
+
+  copymode = 0;
 }
 
 /* ----------------------------------------------------------------------
@@ -200,22 +250,27 @@ void FixEOStableRXKokkos<DeviceType>::energy_lookup(int id, double thetai, doubl
   ui = 0.0;
   nTotal = 0.0;
   for(int ispecies=0;ispecies<nspecies;ispecies++){
-    Table *tb = &tables[ispecies];
-    thetai = MAX(thetai,tb->lo);
-    thetai = MIN(thetai,tb->hi);
+    //Table *tb = &tables[ispecies];
+    //thetai = MAX(thetai,tb->lo);
+    thetai = MAX(thetai,d_table_const.lo(ispecies));
+    //thetai = MIN(thetai,tb->hi);
+    thetai = MIN(thetai,d_table_const.hi(ispecies));
 
     if (tabstyle == LINEAR) {
-      itable = static_cast<int> ((thetai - tb->lo) * tb->invdelta);
-      fraction = (thetai - tb->r[itable]) * tb->invdelta;
-      uTmp = tb->e[itable] + fraction*tb->de[itable];
+      //itable = static_cast<int> ((thetai - tb->lo) * tb->invdelta);
+      itable = static_cast<int> ((thetai - d_table_const.lo(ispecies)) * d_table_const.invdelta(ispecies));
+      //fraction = (thetai - tb->r[itable]) * tb->invdelta;
+      fraction = (thetai - d_table_const.r(ispecies,itable)) * d_table_const.invdelta(ispecies);
+      //uTmp = tb->e[itable] + fraction*tb->de[itable];
+      uTmp = d_table_const.e(ispecies,itable) + fraction*d_table_const.de(ispecies,itable);
 
       uTmp += dHf[ispecies];
       // mol fraction form:
-      ui += atom->dvector[ispecies][id]*uTmp;
-      nTotal += atom->dvector[ispecies][id];
+      ui += dvector(ispecies,id)*uTmp;
+      nTotal += dvector(ispecies,id);
     }
   }
-  ui = ui - double(nTotal+1.5)*force->boltz*thetai;
+  ui = ui - double(nTotal+1.5)*force->boltz*thetai; // need class variable
 }
 
 /* ----------------------------------------------------------------------
@@ -226,18 +281,20 @@ template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
 void FixEOStableRXKokkos<DeviceType>::temperature_lookup(int id, double ui, double &thetai) const
 {
-  Table *tb = &tables[0];
+  //Table *tb = &tables[0];
 
   int it;
   double t1,t2,u1,u2,f1,f2;
   double maxit = 100;
   double temp;
   double delta = 0.001;
+  int lo = d_table_const.lo(0);
+  int hi = d_table_const.hi(0);
 
   // Store the current thetai in t1
-  t1 = MAX(thetai,tb->lo);
-  t1 = MIN(t1,tb->hi);
-  if(t1==tb->hi) delta = -delta;
+  t1 = MAX(thetai,lo);
+  t1 = MIN(t1,hi);
+  if(t1==hi) delta = -delta;
 
   // Compute u1 at thetai
   energy_lookup(id,t1,u1);
@@ -259,8 +316,8 @@ void FixEOStableRXKokkos<DeviceType>::temperature_lookup(int id, double ui, doub
     if(fabs(f2-f1)<1e-15){
       if(isnan(f1) || isnan(f2)) k_error_flag.d_view() = 2;
       temp = t1;
-      temp = MAX(temp,tb->lo);
-      temp = MIN(temp,tb->hi);
+      temp = MAX(temp,lo);
+      temp = MIN(temp,hi);
       k_warning_flag.d_view() = 1;
       break;
     }
@@ -286,9 +343,6 @@ template<class DeviceType>
 int FixEOStableRXKokkos<DeviceType>::pack_forward_comm(int n, int *list, double *buf, int pbc_flag, int *pbc)
 {
   int ii,jj,m;
-  uChem = atomKK->k_uChem.view<DeviceType>();
-  uCG = atomKK->k_uCG.view<DeviceType>();
-  uCGnew = atomKK->k_uCGnew.view<DeviceType>();
 
   m = 0;
   for (ii = 0; ii < n; ii++) {
@@ -306,9 +360,6 @@ template<class DeviceType>
 void FixEOStableRXKokkos<DeviceType>::unpack_forward_comm(int n, int first, double *buf)
 {
   int ii,m,last;
-  uChem = atomKK->k_uChem.view<DeviceType>();
-  uCG = atomKK->k_uCG.view<DeviceType>();
-  uCGnew = atomKK->k_uCGnew.view<DeviceType>();
 
   m = 0;
   last = first + n ;
@@ -325,8 +376,6 @@ template<class DeviceType>
 int FixEOStableRXKokkos<DeviceType>::pack_reverse_comm(int n, int first, double *buf)
 {
   int i,m,last;
-  uCG = atomKK->k_uCG.view<DeviceType>();
-  uCGnew = atomKK->k_uCGnew.view<DeviceType>();
 
   m = 0;
   last = first + n;
@@ -343,8 +392,6 @@ template<class DeviceType>
 void FixEOStableRXKokkos<DeviceType>::unpack_reverse_comm(int n, int *list, double *buf)
 {
   int i,j,m;
-  uCG = atomKK->k_uCG.view<DeviceType>();
-  uCGnew = atomKK->k_uCGnew.view<DeviceType>();
 
   m = 0;
   for (i = 0; i < n; i++) {
@@ -381,6 +428,55 @@ void FixEOStableRXKokkos<DeviceType>::error_check()
 
 /* ---------------------------------------------------------------------- */
 
+template<class DeviceType>
+void FixEOStableRXKokkos<DeviceType>::create_kokkos_tables()
+{ // Copy every entry of tables[] into Kokkos views and publish them through d_table_const.
+  const int tlm1 = tablength-1; // second extent used for the "de" view below
+  // Per-table scalars: one slot per table (ntables).
+  memory->create_kokkos(d_table->lo,h_table->lo,ntables,"Table::lo");
+  memory->create_kokkos(d_table->hi,h_table->hi,ntables,"Table::hi");
+  memory->create_kokkos(d_table->invdelta,h_table->invdelta,ntables,"Table::invdelta");
+  // The 2-D r/e/de views are only created for the LINEAR table style.
+  if(tabstyle == LINEAR) {
+    memory->create_kokkos(d_table->r,h_table->r,ntables,tablength,"Table::r");
+    memory->create_kokkos(d_table->e,h_table->e,ntables,tablength,"Table::e");
+    memory->create_kokkos(d_table->de,h_table->de,ntables,tlm1,"Table::de");
+  }
+  // Fill the h_table views from the Table structs, one row per table.
+  for(int i=0; i < ntables; i++) {
+    Table* tb = &tables[i];
+    // NOTE(review): h_table->lo/hi are declared t_int_1d; confirm Table::lo/hi are integral, otherwise these stores truncate.
+    h_table->lo[i] = tb->lo;
+    h_table->hi[i] = tb->hi;
+    h_table->invdelta[i] = tb->invdelta;
+    // Column loops are bounded by each view's own second extent (dimension_1).
+    for(int j = 0; j<h_table->r.dimension_1(); j++)
+      h_table->r(i,j) = tb->r[j];
+    for(int j = 0; j<h_table->e.dimension_1(); j++)
+      h_table->e(i,j) = tb->e[j];
+    for(int j = 0; j<h_table->de.dimension_1(); j++)
+      h_table->de(i,j) = tb->de[j];
+  }
+  // Copy the filled h_table views into the matching d_table views.
+  Kokkos::deep_copy(d_table->lo,h_table->lo);
+  Kokkos::deep_copy(d_table->hi,h_table->hi);
+  Kokkos::deep_copy(d_table->invdelta,h_table->invdelta);
+  Kokkos::deep_copy(d_table->r,h_table->r);
+  Kokkos::deep_copy(d_table->e,h_table->e);
+  Kokkos::deep_copy(d_table->de,h_table->de);
+  // Point d_table_const (the struct the lookup code reads) at the d_table views.
+  d_table_const.lo = d_table->lo;
+  d_table_const.hi = d_table->hi;
+  d_table_const.invdelta = d_table->invdelta;
+  d_table_const.r = d_table->r;
+  d_table_const.e = d_table->e;
+  d_table_const.de = d_table->de;
+  // presumably marks the Kokkos tables as current -- confirm against where update_table is set
+  update_table = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
 namespace LAMMPS_NS {
 template class FixEOStableRXKokkos<LMPDeviceType>;
 #ifdef KOKKOS_HAVE_CUDA
diff --git a/src/KOKKOS/fix_eos_table_rx_kokkos.h b/src/KOKKOS/fix_eos_table_rx_kokkos.h
index 9b0ca366a0..7de8f4dbc4 100644
--- a/src/KOKKOS/fix_eos_table_rx_kokkos.h
+++ b/src/KOKKOS/fix_eos_table_rx_kokkos.h
@@ -75,13 +75,46 @@ class FixEOStableRXKokkos : public FixEOStableRX {
   //};
   //Table *tables, *tables2;
 
+  /*struct TableDeviceConst {
+    typename ArrayTypes<DeviceType>::t_int_1d_randomread lo,hi;
+    typename ArrayTypes<DeviceType>::t_ffloat_1d_randomread invdelta;
+    typename ArrayTypes<DeviceType>::t_ffloat_2d_randomread r,e,de;
+  };*/
+ //Its faster not to use texture fetch if the number of tables is less than 32!
+  struct TableDeviceConst { // view bundle assigned from d_table's views in create_kokkos_tables(); only r/e/de use the _randomread view types
+    typename ArrayTypes<DeviceType>::t_int_1d lo,hi;
+    typename ArrayTypes<DeviceType>::t_ffloat_1d invdelta;
+    typename ArrayTypes<DeviceType>::t_ffloat_2d_randomread r,e,de;
+  };
+  // DeviceType table views: created by create_kokkos_tables() and filled there by deep_copy from TableHost.
+  struct TableDevice {
+    typename ArrayTypes<DeviceType>::t_int_1d lo,hi;
+    typename ArrayTypes<DeviceType>::t_ffloat_1d invdelta;
+    typename ArrayTypes<DeviceType>::t_ffloat_2d r,e,de;
+  };
+  // LMPHostType table views: written element-by-element from tables[] before the deep_copy to TableDevice.
+  struct TableHost {
+    typename ArrayTypes<LMPHostType>::t_int_1d lo,hi;
+    typename ArrayTypes<LMPHostType>::t_ffloat_1d invdelta;
+    typename ArrayTypes<LMPHostType>::t_ffloat_2d r,e,de;
+  };
+
+  TableDeviceConst d_table_const;
+  TableDevice* d_table;
+  TableHost* h_table;
+
+  int **tabindex;
+
   void allocate();
   void error_check();
+  int update_table;
+  void create_kokkos_tables();
 
   //double *dHf;
 
   typename AT::t_int_1d mask;
   typename AT::t_efloat_1d uCond,uMech,uChem,uCG,uCGnew,rho,dpdTheta,duChem;
+  typename AT::t_float_2d dvector;
 
   DAT::tdual_int_scalar k_error_flag;
   DAT::tdual_int_scalar k_warning_flag;

From 5cae3eca8c43fc9712a28776e1d196b638d8216c Mon Sep 17 00:00:00 2001
From: Stan Moore <stanmoore1@gmail.com>
Date: Fri, 16 Dec 2016 17:09:19 -0700
Subject: [PATCH 017/267] Whitespace cleanup to pair_dpd_fdt_energy, should be
 cherry-picked to Master

---
 src/USER-DPD/pair_dpd_fdt_energy.cpp | 80 ++++++++++++++--------------
 1 file changed, 40 insertions(+), 40 deletions(-)

diff --git a/src/USER-DPD/pair_dpd_fdt_energy.cpp b/src/USER-DPD/pair_dpd_fdt_energy.cpp
index 2041405467..0f6141d015 100644
--- a/src/USER-DPD/pair_dpd_fdt_energy.cpp
+++ b/src/USER-DPD/pair_dpd_fdt_energy.cpp
@@ -206,7 +206,7 @@ void PairDPDfdtEnergy::compute(int eflag, int vflag)
           if (r < EPSILON) continue;     // r can be 0.0 in DPD systems
           rinv = 1.0/r;
           wr = 1.0 - r/cut[itype][jtype];
-	  wd = wr*wr;
+          wd = wr*wr;
 
           delvx = vxtmp - v[j][0];
           delvy = vytmp - v[j][1];
@@ -214,11 +214,11 @@ void PairDPDfdtEnergy::compute(int eflag, int vflag)
           dot = delx*delvx + dely*delvy + delz*delvz;
           randnum = random->gaussian();
 
-	  // Compute the current temperature
-	  theta_ij = 0.5*(1.0/dpdTheta[i] + 1.0/dpdTheta[j]);
-	  theta_ij = 1.0/theta_ij;
-	
-	  gamma_ij = sigma[itype][jtype]*sigma[itype][jtype]
+          // Compute the current temperature
+          theta_ij = 0.5*(1.0/dpdTheta[i] + 1.0/dpdTheta[j]);
+          theta_ij = 1.0/theta_ij;
+
+          gamma_ij = sigma[itype][jtype]*sigma[itype][jtype]
                      / (2.0*force->boltz*theta_ij);
 
           // conservative force = a0 * wr
@@ -239,44 +239,44 @@ void PairDPDfdtEnergy::compute(int eflag, int vflag)
             f[j][2] -= delz*fpair;
           }
 
-	  if (rmass) {
-	    mass_i = rmass[i];
-	    mass_j = rmass[j];
-	  } else {
-	    mass_i = mass[itype];
-	    mass_j = mass[jtype];
-	  }
-	  massinv_i = 1.0 / mass_i;
-	  massinv_j = 1.0 / mass_j;
+          if (rmass) {
+            mass_i = rmass[i];
+            mass_j = rmass[j];
+          } else {
+            mass_i = mass[itype];
+            mass_j = mass[jtype];
+          }
+          massinv_i = 1.0 / mass_i;
+          massinv_j = 1.0 / mass_j;
 
-	  // Compute the mechanical and conductive energy, uMech and uCond
-	  mu_ij = massinv_i + massinv_j;
-	  mu_ij *= force->ftm2v;
+          // Compute the mechanical and conductive energy, uMech and uCond
+          mu_ij = massinv_i + massinv_j;
+          mu_ij *= force->ftm2v;
 
-	  uTmp = gamma_ij*wd*rinv*rinv*dot*dot
-                 - 0.5*sigma[itype][jtype]*sigma[itype][jtype]*mu_ij*wd;
-	  uTmp -= sigma[itype][jtype]*wr*rinv*dot*randnum*dtinvsqrt;
-	  uTmp *= 0.5;
+          uTmp = gamma_ij*wd*rinv*rinv*dot*dot
+                        - 0.5*sigma[itype][jtype]*sigma[itype][jtype]*mu_ij*wd;
+          uTmp -= sigma[itype][jtype]*wr*rinv*dot*randnum*dtinvsqrt;
+          uTmp *= 0.5;
 
-	  duMech[i] += uTmp;
-	  if (newton_pair || j < nlocal) {
-	    duMech[j] += uTmp;
-	  }
-	
-	  // Compute uCond
-	  randnum = random->gaussian();
-	  kappa_ij = kappa[itype][jtype];
-	  alpha_ij = sqrt(2.0*force->boltz*kappa_ij);
-	  randPair = alpha_ij*wr*randnum*dtinvsqrt;
+          duMech[i] += uTmp;
+          if (newton_pair || j < nlocal) {
+            duMech[j] += uTmp;
+          }
+
+          // Compute uCond
+          randnum = random->gaussian();
+          kappa_ij = kappa[itype][jtype];
+          alpha_ij = sqrt(2.0*force->boltz*kappa_ij);
+          randPair = alpha_ij*wr*randnum*dtinvsqrt;
+
+          uTmp = kappa_ij*(1.0/dpdTheta[i] - 1.0/dpdTheta[j])*wd;
+          uTmp += randPair;
+
+          duCond[i] += uTmp;
+          if (newton_pair || j < nlocal) {
+            duCond[j] -= uTmp;
+          }
 
-	  uTmp = kappa_ij*(1.0/dpdTheta[i] - 1.0/dpdTheta[j])*wd;
-	  uTmp += randPair;
-	
-	  duCond[i] += uTmp;
-	  if (newton_pair || j < nlocal) {
-	    duCond[j] -= uTmp;
-	  }
-	
           if (eflag) {
             // unshifted eng of conservative term:
             // evdwl = -a0[itype][jtype]*r * (1.0-0.5*r/cut[itype][jtype]);

From ac57f4721cea7db41b6b964d6fb4a772fa5c5202 Mon Sep 17 00:00:00 2001
From: Stan Moore <stanmoore1@gmail.com>
Date: Fri, 16 Dec 2016 17:14:27 -0700
Subject: [PATCH 018/267] Small whitespace tweak to pair_dpd_fdt_energy

---
 src/USER-DPD/pair_dpd_fdt_energy.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/USER-DPD/pair_dpd_fdt_energy.cpp b/src/USER-DPD/pair_dpd_fdt_energy.cpp
index 0f6141d015..558994d35e 100644
--- a/src/USER-DPD/pair_dpd_fdt_energy.cpp
+++ b/src/USER-DPD/pair_dpd_fdt_energy.cpp
@@ -254,7 +254,7 @@ void PairDPDfdtEnergy::compute(int eflag, int vflag)
           mu_ij *= force->ftm2v;
 
           uTmp = gamma_ij*wd*rinv*rinv*dot*dot
-                        - 0.5*sigma[itype][jtype]*sigma[itype][jtype]*mu_ij*wd;
+                 - 0.5*sigma[itype][jtype]*sigma[itype][jtype]*mu_ij*wd;
           uTmp -= sigma[itype][jtype]*wr*rinv*dot*randnum*dtinvsqrt;
           uTmp *= 0.5;
 

From 21bb603b93181e4551ebc69fc0e225bf055cd21d Mon Sep 17 00:00:00 2001
From: Stan Moore <stanmoore1@gmail.com>
Date: Mon, 19 Dec 2016 13:30:00 -0700
Subject: [PATCH 019/267] Porting recent changes from USER-DPD package to
 KOKKOS package

---
 src/KOKKOS/fix_eos_table_rx_kokkos.cpp |  7 ++--
 src/KOKKOS/fix_eos_table_rx_kokkos.h   |  4 +++
 src/KOKKOS/pair_exp6_rx_kokkos.cpp     | 50 ++++++++++++++++++++------
 3 files changed, 47 insertions(+), 14 deletions(-)

diff --git a/src/KOKKOS/fix_eos_table_rx_kokkos.cpp b/src/KOKKOS/fix_eos_table_rx_kokkos.cpp
index 6cb5c0611a..3b22f61e66 100644
--- a/src/KOKKOS/fix_eos_table_rx_kokkos.cpp
+++ b/src/KOKKOS/fix_eos_table_rx_kokkos.cpp
@@ -85,7 +85,8 @@ void FixEOStableRXKokkos<DeviceType>::setup(int vflag)
   atomKK->sync(execution_space,MASK_MASK | UCOND_MASK | UMECH_MASK | UCHEM_MASK | DPDTHETA_MASK | UCG_MASK | UCGNEW_MASK | DVECTOR_MASK);
   atomKK->modified(execution_space,UCHEM_MASK | DPDTHETA_MASK | UCG_MASK | UCGNEW_MASK);
 
-  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagFixEOStableRXSetup>(0,nlocal),*this);
+  if (!this->restart_reset)
+    Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagFixEOStableRXSetup>(0,nlocal),*this);
 
   // Communicate the updated momenta and velocities to all nodes
   comm->forward_comm_fix(this);
@@ -154,8 +155,8 @@ void FixEOStableRXKokkos<DeviceType>::operator()(TagFixEOStableRXInit, const int
     if(dpdTheta[i] <= 0.0)
       k_error_flag.d_view() = 1;
     energy_lookup(i,dpdTheta[i],tmp);
-    uCond[i] = tmp / 2.0;
-    uMech[i] = tmp / 2.0;
+    uCond[i] = 0.0;
+    uMech[i] = tmp;
     uChem[i] = 0.0;
   }
 }
diff --git a/src/KOKKOS/fix_eos_table_rx_kokkos.h b/src/KOKKOS/fix_eos_table_rx_kokkos.h
index 7de8f4dbc4..3b9a00afe2 100644
--- a/src/KOKKOS/fix_eos_table_rx_kokkos.h
+++ b/src/KOKKOS/fix_eos_table_rx_kokkos.h
@@ -155,6 +155,10 @@ E:  eos/table/rx values are not increasing
 
 The equation-of-state must an increasing function
 
+E:  FixEOStableRX requires atom_style with internal temperature and energies (e.g. dpd)
+
+Self-explanatory.
+
 E:  Internal temperature <= zero.
 
 Self-explanatory.
diff --git a/src/KOKKOS/pair_exp6_rx_kokkos.cpp b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
index e7934cfa0b..e6b8a80f44 100644
--- a/src/KOKKOS/pair_exp6_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
@@ -187,8 +187,10 @@ void PairExp6rxKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
 
   k_error_flag.template modify<DeviceType>();
   k_error_flag.template sync<LMPHostType>();
-  if (k_error_flag.h_view())
+  if (k_error_flag.h_view() == 1)
     error->all(FLERR,"The number of molecules in CG particle is less than 1e-8.");
+  else if (k_error_flag.h_view() == 2)
+    error->all(FLERR,"Computed fraction less than -1.0e-10");
 
   int inum = list->inum;
   NeighListKokkos<DeviceType>* k_list = static_cast<NeighListKokkos<DeviceType>*>(list);
@@ -432,13 +434,13 @@ void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxCompute<NEIGHFLAG,NEW
 
           uin1 = buck1*(6.0*rin1exp - alphaOld12_ij*rm6ij*rin6inv) - urc - durc*(rin1-rCut);
 
-          win1 = -buck1*buck2*(rin1*rin1exp*rminv - rm6ij*rin6inv) - rin1*durc;
+          win1 = buck1*buck2*(rin1*rin1exp*rminv - rm6ij*rin6inv) + rin1*durc;
 
-          aRep = -1.0*win1*powint(rin1,nRep)/nRep;
+          aRep = win1*powint(rin1,nRep)/nRep;
 
           uin1rep = aRep/powint(rin1,nRep);
 
-          forceExp6 = -double(nRep)*aRep/powint(r,nRep);
+          forceExp6 = double(nRep)*aRep/powint(r,nRep);
           fpairOldEXP6_12 = factor_lj*forceExp6*r2inv;
 
           evdwlOldEXP6_12 = uin1 - uin1rep + aRep/powint(r,nRep);
@@ -472,13 +474,13 @@ void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxCompute<NEIGHFLAG,NEW
 
           uin1 = buck1*(6.0*rin1exp - alphaOld21_ij*rm6ij*rin6inv) - urc - durc*(rin1-rCut);
 
-          win1 = -buck1*buck2*(rin1*rin1exp*rminv - rm6ij*rin6inv) - rin1*durc;
+          win1 = buck1*buck2*(rin1*rin1exp*rminv - rm6ij*rin6inv) + rin1*durc;
 
-          aRep = -1.0*win1*powint(rin1,nRep)/nRep;
+          aRep = win1*powint(rin1,nRep)/nRep;
 
           uin1rep = aRep/powint(rin1,nRep);
 
-          forceExp6 = -double(nRep)*aRep/powint(r,nRep);
+          forceExp6 = double(nRep)*aRep/powint(r,nRep);
           fpairOldEXP6_21 = factor_lj*forceExp6*r2inv;
 
           evdwlOldEXP6_21 = uin1 - uin1rep + aRep/powint(r,nRep);
@@ -527,9 +529,9 @@ void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxCompute<NEIGHFLAG,NEW
 
           uin1 = buck1*(6.0*rin1exp - alpha12_ij*rm6ij*rin6inv) - urc - durc*(rin1-rCut);
 
-          win1 = -buck1*buck2*(rin1*rin1exp*rminv - rm6ij*rin6inv) - rin1*durc;
+          win1 = buck1*buck2*(rin1*rin1exp*rminv - rm6ij*rin6inv) + rin1*durc;
 
-          aRep = -1.0*win1*powint(rin1,nRep)/nRep;
+          aRep = win1*powint(rin1,nRep)/nRep;
 
           uin1rep = aRep/powint(rin1,nRep);
 
@@ -559,9 +561,9 @@ void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxCompute<NEIGHFLAG,NEW
 
           uin1 = buck1*(6.0*rin1exp - alpha21_ij*rm6ij*rin6inv) - urc - durc*(rin1-rCut);
 
-          win1 = -buck1*buck2*(rin1*rin1exp*rminv - rm6ij*rin6inv) - rin1*durc;
+          win1 = buck1*buck2*(rin1*rin1exp*rminv - rm6ij*rin6inv) + rin1*durc;
 
-          aRep = -1.0*win1*powint(rin1,nRep)/nRep;
+          aRep = win1*powint(rin1,nRep)/nRep;
 
           uin1rep = aRep/powint(rin1,nRep);
 
@@ -1015,6 +1017,32 @@ void PairExp6rxKokkos<DeviceType>::getParamsEXP6(int id,double &epsilon1,double
       rm2_old *= pow(nTotalOFA_old,fuchslinR);
     }
   }
+
+  // Check that no fractions are less than zero
+  if(fraction1 < 0.0){
+    if(fraction1 < -1.0e-10){
+      k_error_flag.d_view() = 2;
+    }
+    fraction1 = 0.0;
+  }
+  if(fraction2 < 0.0){
+    if(fraction2 < -1.0e-10){
+      k_error_flag.d_view() = 2;
+    }
+    fraction2 = 0.0;
+  }
+  if(fraction1_old < 0.0){
+    if(fraction1_old < -1.0e-10){
+      k_error_flag.d_view() = 2;
+    }
+    fraction1_old = 0.0;
+  }
+  if(fraction2_old < 0.0){
+    if(fraction2_old < -1.0e-10){
+      k_error_flag.d_view() = 2;
+    }
+    fraction2_old = 0.0;
+  }
 }
 
 /* ---------------------------------------------------------------------- */

From 3f1f51c1c7551165d023f8269663e45a0ccb08c3 Mon Sep 17 00:00:00 2001
From: Stan Moore <stanmoore1@gmail.com>
Date: Mon, 19 Dec 2016 13:31:09 -0700
Subject: [PATCH 020/267] Changes necessary for runtime testing of Kokkos
 styles

---
 src/USER-DPD/fix_rx.cpp | 4 ++--
 src/pair_hybrid.cpp     | 3 ---
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/src/USER-DPD/fix_rx.cpp b/src/USER-DPD/fix_rx.cpp
index 47194cd7bc..1c2313c694 100644
--- a/src/USER-DPD/fix_rx.cpp
+++ b/src/USER-DPD/fix_rx.cpp
@@ -363,11 +363,11 @@ void FixRX::post_constructor()
   newarg2[nspecies+3] = (char *) "ghost";
   newarg2[nspecies+4] = (char *) "yes";
 
-  modify->add_fix(nspecies+5,newarg);
+  modify->add_fix(nspecies+5,newarg,1);
   fix_species = (FixPropertyAtom *) modify->fix[modify->nfix-1];
   restartFlag = modify->fix[modify->nfix-1]->restart_reset;
 
-  modify->add_fix(nspecies+5,newarg2);
+  modify->add_fix(nspecies+5,newarg2,1);
   fix_species_old = (FixPropertyAtom *) modify->fix[modify->nfix-1];
 
   if(nspecies==0) error->all(FLERR,"There are no rx species specified.");
diff --git a/src/pair_hybrid.cpp b/src/pair_hybrid.cpp
index 620ceadfd9..d756b9be98 100644
--- a/src/pair_hybrid.cpp
+++ b/src/pair_hybrid.cpp
@@ -39,9 +39,6 @@ PairHybrid::PairHybrid(LAMMPS *lmp) : Pair(lmp),
   
   outerflag = 0;
   respaflag = 0;
-
-  if (lmp->kokkos)
-    error->all(FLERR,"Cannot yet use pair hybrid with Kokkos");
 }
 
 /* ---------------------------------------------------------------------- */

From 000df6e1cf3ebf6cfccc69268aafd60ed1fba1b0 Mon Sep 17 00:00:00 2001
From: Stan Moore <stanmoore1@gmail.com>
Date: Mon, 19 Dec 2016 15:20:10 -0700
Subject: [PATCH 021/267] Fixing what seems to be a Kokkos bug, I will submit
 to Kokkos lib developers too

---
 lib/kokkos/algorithms/src/Kokkos_Random.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/kokkos/algorithms/src/Kokkos_Random.hpp b/lib/kokkos/algorithms/src/Kokkos_Random.hpp
index d7c06dc14b..d54abeceb0 100644
--- a/lib/kokkos/algorithms/src/Kokkos_Random.hpp
+++ b/lib/kokkos/algorithms/src/Kokkos_Random.hpp
@@ -670,8 +670,8 @@ namespace Kokkos {
       double S = 2.0;
       double U;
       while(S>=1.0) {
-        U = drand();
-        const double V = drand();
+        U = 2.0*drand() - 1.0;
+        const double V = 2.0*drand() - 1.0;
         S = U*U+V*V;
       }
       return U*sqrt(-2.0*log(S)/S);

From 99910fc43241df916648ad24253f8c757a1b711c Mon Sep 17 00:00:00 2001
From: Stan Moore <stanmoore1@gmail.com>
Date: Mon, 19 Dec 2016 15:27:16 -0700
Subject: [PATCH 022/267] Adding CPU runtime tested version of
 pair_dpd_fdt_energy_kokkos

---
 src/KOKKOS/Install.sh                     |   2 +
 src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp | 694 ++++++++++++++++------
 src/KOKKOS/pair_dpd_fdt_energy_kokkos.h   | 104 +++-
 src/KOKKOS/rand_pool_wrap.cpp             |  72 +++
 src/KOKKOS/rand_pool_wrap.h               |  84 +++
 src/USER-DPD/pair_dpd_fdt_energy.cpp      |   2 +
 src/USER-DPD/pair_dpd_fdt_energy.h        |   6 +-
 7 files changed, 737 insertions(+), 227 deletions(-)
 create mode 100644 src/KOKKOS/rand_pool_wrap.cpp
 create mode 100644 src/KOKKOS/rand_pool_wrap.h

diff --git a/src/KOKKOS/Install.sh b/src/KOKKOS/Install.sh
index a1830163bd..94be32cc32 100644
--- a/src/KOKKOS/Install.sh
+++ b/src/KOKKOS/Install.sh
@@ -200,6 +200,8 @@ action pair_tersoff_zbl_kokkos.cpp pair_tersoff_zbl.cpp
 action pair_tersoff_zbl_kokkos.h pair_tersoff_zbl.h
 action pppm_kokkos.cpp pppm.cpp
 action pppm_kokkos.h pppm.h
+action rand_pool_wrap_kokkos.cpp
+action rand_pool_wrap_kokkos.h
 action region_block_kokkos.cpp
 action region_block_kokkos.h
 action verlet_kokkos.cpp
diff --git a/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp
index 0bfbb9491e..3b49f43246 100644
--- a/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp
+++ b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp
@@ -12,15 +12,13 @@
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
-   Contributing author: James Larentzos (U.S. Army Research Laboratory)
+   Contributing author: Stan Moore (Sandia)
 ------------------------------------------------------------------------- */
 
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include "pair_dpd_fdt_energy_kokkos.h"
-#include "kokkos.h"
 #include "atom_kokkos.h"
 #include "atom_vec.h"
 #include "comm.h"
@@ -31,30 +29,26 @@
 #include "neigh_list.h"
 #include "neigh_request.h"
 #include "random_mars.h"
-#include "math_const.h"
 #include "memory.h"
 #include "modify.h"
+#include "pair_dpd_fdt_energy_kokkos.h"
 #include "error.h"
 #include "atom_masks.h"
 
 using namespace LAMMPS_NS;
-using namespace MathConst;
-
-#define KOKKOS_CUDA_MAX_THREADS 256
-#define KOKKOS_CUDA_MIN_BLOCKS 8
 
 #define EPSILON 1.0e-10
 
 /* ---------------------------------------------------------------------- */
 
 template<class DeviceType>
-PairDPDfdtEnergyKokkos<DeviceType>::PairDPDfdtEnergyKokkos(LAMMPS *lmp) : PairDPDfdtEnergy(lmp)
+PairDPDfdtEnergyKokkos<DeviceType>::PairDPDfdtEnergyKokkos(LAMMPS *lmp) : // base pair style first, then rand_pool seeded with seed + comm->me
+  PairDPDfdtEnergy(lmp),rand_pool(seed + comm->me /** , lmp/**/) // NOTE(review): seed is read during construction -- confirm the base class has set it by now
 {
   atomKK = (AtomKokkos *) atom;
   execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
-  datamask_read = X_MASK | F_MASK | TYPE_MASK | TAG_MASK | ENERGY_MASK | VIRIAL_MASK;
-  datamask_modify = F_MASK | ENERGY_MASK | VIRIAL_MASK;
-  cutsq = NULL;
+  datamask_read = EMPTY_MASK; // compute() in this patch lists its masks explicitly in atomKK->sync()/modified()
+  datamask_modify = EMPTY_MASK;
 }
 
 /* ---------------------------------------------------------------------- */
@@ -62,26 +56,49 @@ PairDPDfdtEnergyKokkos<DeviceType>::PairDPDfdtEnergyKokkos(LAMMPS *lmp) : PairDP
 template<class DeviceType>
 PairDPDfdtEnergyKokkos<DeviceType>::~PairDPDfdtEnergyKokkos()
 {
+  if (copymode) return; // compute() sets copymode while it hands *this to Kokkos::parallel_*; presumably so functor copies skip this cleanup
+
   if (allocated) {
-    memory->destroy_kokkos(k_eatom,eatom);
-    memory->destroy_kokkos(k_vatom,vatom);
-    k_cutsq = DAT::tdual_ffloat_2d();
-    memory->sfree(cutsq);
-    eatom = NULL;
-    vatom = NULL;
-    cutsq = NULL;
+    memory->destroy_kokkos(k_duCond,duCond);
+    memory->destroy_kokkos(k_duMech,duMech);
   }
+
+  memory->destroy_kokkos(k_cutsq,cutsq); // released regardless of the allocated flag
+
+  /** rand_pool.destroy();/**/
 }
 
-/* ---------------------------------------------------------------------- */
+/* ----------------------------------------------------------------------
+   init specific to this pair style
+------------------------------------------------------------------------- */
 
 template<class DeviceType>
-void PairDPDfdtEnergyKokkos<DeviceType>::cleanup_copy() {
-  // WHY needed: this prevents parent copy from deallocating any arrays
-  allocated = 0;
-  cutsq = NULL;
-  eatom = NULL;
-  vatom = NULL;
+void PairDPDfdtEnergyKokkos<DeviceType>::init_style()
+{
+  PairDPDfdtEnergy::init_style();
+
+  // irequest = neigh request made by parent class (taken to be the last one registered)
+
+  neighflag = lmp->kokkos->neighflag;
+  int irequest = neighbor->nrequest - 1;
+
+  neighbor->requests[irequest]->
+    kokkos_host = Kokkos::Impl::is_same<DeviceType,LMPHostType>::value &&
+    !Kokkos::Impl::is_same<DeviceType,LMPDeviceType>::value;
+  neighbor->requests[irequest]->
+    kokkos_device = Kokkos::Impl::is_same<DeviceType,LMPDeviceType>::value;
+  // NOTE(review): FULL is accepted below, but the compute() code visible in this patch only dispatches HALF/HALFTHREAD kernels -- confirm.
+  if (neighflag == FULL) {
+    neighbor->requests[irequest]->full = 1;
+    neighbor->requests[irequest]->half = 0;
+  } else if (neighflag == HALF || neighflag == HALFTHREAD) {
+    neighbor->requests[irequest]->full = 0;
+    neighbor->requests[irequest]->half = 1;
+  } else {
+    error->all(FLERR,"Cannot use chosen neighbor list style with dpd/fdt/energy/kk");
+  }
+
+  /** rand_pool.init(random,seed);/**/
 }
 
 /* ---------------------------------------------------------------------- */
@@ -89,9 +106,12 @@ void PairDPDfdtEnergyKokkos<DeviceType>::cleanup_copy() {
 template<class DeviceType>
 void PairDPDfdtEnergyKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
 {
+  copymode = 1;
+
   eflag = eflag_in;
   vflag = vflag_in;
 
+  if (neighflag == FULL) no_virial_fdotr_compute = 1;
   if (eflag || vflag) ev_setup(eflag,vflag);
   else evflag = vflag_fdotr = 0;
 
@@ -100,35 +120,115 @@ void PairDPDfdtEnergyKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
   if (eflag_atom) {
     memory->destroy_kokkos(k_eatom,eatom);
     memory->create_kokkos(k_eatom,eatom,maxeatom,"pair:eatom");
-    d_eatom = k_eatom.view<DeviceType>();
+    d_eatom = k_eatom.d_view;
   }
   if (vflag_atom) {
     memory->destroy_kokkos(k_vatom,vatom);
     memory->create_kokkos(k_vatom,vatom,maxvatom,6,"pair:vatom");
-    d_vatom = k_vatom.view<DeviceType>();
+    d_vatom = k_vatom.d_view;
   }
 
-  atomKK->sync(execution_space,datamask_read);
-  k_cutsq.template sync<DeviceType>();
-  if (eflag || vflag) atomKK->modified(execution_space,datamask_modify);
-  else atomKK->modified(execution_space,F_MASK);
-
   x = atomKK->k_x.view<DeviceType>();
-  c_x = atomKK->k_x.view<DeviceType>();
+  v = atomKK->k_v.view<DeviceType>();
   f = atomKK->k_f.view<DeviceType>();
   type = atomKK->k_type.view<DeviceType>();
-  tag = atomKK->k_tag.view<DeviceType>();
+  mass = atomKK->k_mass.view<DeviceType>();
+  rmass = atomKK->rmass;
+  dpdTheta = atomKK->k_dpdTheta.view<DeviceType>();
+
+  k_cutsq.template sync<DeviceType>();
+  k_params.template sync<DeviceType>();
+  atomKK->sync(execution_space,X_MASK | F_MASK | TYPE_MASK | ENERGY_MASK | VIRIAL_MASK | DPDTHETA_MASK | RMASS_MASK);
+  if (evflag) atomKK->modified(execution_space,F_MASK | ENERGY_MASK | VIRIAL_MASK);
+  else atomKK->modified(execution_space,F_MASK | UCG_MASK | UCGNEW_MASK);
+  atomKK->k_mass.sync<DeviceType>();
+
   nlocal = atom->nlocal;
-  nall = atom->nlocal + atom->nghost;
-  newton_pair = force->newton_pair;
-  special_lj[0] = force->special_lj[0];
-  special_lj[1] = force->special_lj[1];
-  special_lj[2] = force->special_lj[2];
-  special_lj[3] = force->special_lj[3];
+  int nghost = atom->nghost;
+  int newton_pair = force->newton_pair;
+  dtinvsqrt = 1.0/sqrt(update->dt);
+
+  int inum = list->inum;
+  NeighListKokkos<DeviceType>* k_list = static_cast<NeighListKokkos<DeviceType>*>(list);
+  d_numneigh = k_list->d_numneigh;
+  d_neighbors = k_list->d_neighbors;
+  d_ilist = k_list->d_ilist;
+
+  boltz = force->boltz;
+
+  int STACKPARAMS = 0; // optimize
 
   // loop over neighbors of my atoms
 
-  EV_FLOAT ev = pair_compute<PairDPDfdtEnergyKokkos<DeviceType>,void >(this,(NeighListKokkos<DeviceType>*)list);
+  EV_FLOAT ev;
+
+  if (splitFDT_flag) {
+    if (neighflag == HALF) {
+      if (newton_pair) {
+        if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALF,1,1> >(0,inum),*this,ev);
+        else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALF,1,0> >(0,inum),*this);
+      } else {
+        if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALF,0,1> >(0,inum),*this,ev);
+        else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALF,0,0> >(0,inum),*this);
+      }
+    } else if (neighflag == HALFTHREAD) {
+      if (newton_pair) {
+        if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALFTHREAD,1,1> >(0,inum),*this,ev);
+        else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALFTHREAD,1,0> >(0,inum),*this);
+      } else {
+        if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALFTHREAD,0,1> >(0,inum),*this,ev);
+        else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALFTHREAD,0,0> >(0,inum),*this);
+      }
+    }
+  } else {
+
+    // Allocate memory for duCond and duMech
+    if (allocated) {
+      memory->destroy_kokkos(k_duCond,duCond);
+      memory->destroy_kokkos(k_duMech,duMech);
+    }
+    memory->create_kokkos(k_duCond,duCond,nlocal+nghost,"pair:duCond");
+    memory->create_kokkos(k_duMech,duMech,nlocal+nghost,"pair:duMech");
+    d_duCond = k_duCond.view<DeviceType>();
+    d_duMech = k_duMech.view<DeviceType>();
+    h_duCond = k_duCond.h_view;
+    h_duMech = k_duMech.h_view;
+    Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyZero>(0,nlocal+nghost),*this);
+
+    atomKK->sync(execution_space,V_MASK);
+
+    // loop over neighbors of my atoms
+
+    if (neighflag == HALF) {
+      if (newton_pair) {
+        if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<HALF,1,1> >(0,inum),*this,ev);
+        else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<HALF,1,0> >(0,inum),*this);
+      } else {
+        if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<HALF,0,1> >(0,inum),*this,ev);
+        else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<HALF,0,0> >(0,inum),*this);
+      }
+    } else if (neighflag == HALFTHREAD) {
+      if (newton_pair) {
+        if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<HALFTHREAD,1,1> >(0,inum),*this,ev);
+        else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<HALFTHREAD,1,0> >(0,inum),*this);
+      } else {
+        if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<HALFTHREAD,0,1> >(0,inum),*this,ev);
+        else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<HALFTHREAD,0,0> >(0,inum),*this);
+      }
+    }
+
+    // Communicate the ghost delta energies to the locally owned atoms
+
+    k_duCond.template modify<DeviceType>();
+    k_duCond.template sync<LMPHostType>();
+    k_duMech.template modify<DeviceType>();
+    k_duMech.template sync<LMPHostType>();
+    comm->reverse_comm_pair(this);
+    //k_duCond.template modify<LMPHostType>();
+    //k_duCond.template sync<DeviceType>();
+    //k_duMech.template modify<LMPHostType>();
+    //k_duMech.template sync<DeviceType>();
+  }
 
   if (eflag_global) eng_vdwl += ev.evdwl;
   if (vflag_global) {
@@ -151,125 +251,262 @@ void PairDPDfdtEnergyKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
     k_vatom.template modify<DeviceType>();
     k_vatom.template sync<LMPHostType>();
   }
+
+  copymode = 0;
 }
 
 template<class DeviceType>
-template<bool STACKPARAMS, class Specialisation>
 KOKKOS_INLINE_FUNCTION
-F_FLOAT PairDPDfdtEnergyKokkos<DeviceType>::
-compute_fpair(const F_FLOAT& rsq, const int& i, const int&j, const int& itype, const int& jtype) const {
-  (void) i;
-  (void) j;
-  const F_FLOAT r = sqrt(rsq);
- if (r < EPSILON) return 0;     // r can be 0.0 in DPD systems
-  const F_FLOAT rinv = 1.0/r;
-  const F_FLOAT wr = 1.0 - r/cut[itype][jtype];
-  const F_FLOAT wd = wr*wr;
-
- // conservative force = a0 * wr
-  return  a0[itype][jtype]*wr*rinv;
+void PairDPDfdtEnergyKokkos<DeviceType>::operator()(TagPairDPDfdtEnergyZero, const int &ii) const {
+  d_duCond[ii] = 0.0;
+  d_duMech[ii] = 0.0;
 }
 
 template<class DeviceType>
-template<bool STACKPARAMS, class Specialisation>
+template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
 KOKKOS_INLINE_FUNCTION
-F_FLOAT PairDPDfdtEnergyKokkos<DeviceType>::
-compute_evdwl(const F_FLOAT& rsq, const int& i, const int&j, const int& itype, const int& jtype) const {
-  (void) i;
-  (void) j;
-  const F_FLOAT r = sqrt(rsq);
-  if (r < EPSILON) return 0;     // r can be 0.0 in DPD systems
-  const F_FLOAT rinv = 1.0/r;
-  const F_FLOAT wr = 1.0 - r/cut[itype][jtype];
-  const F_FLOAT wd = wr*wr;
-     // unshifted eng of conservative term:
-     // evdwl = -a0[itype][jtype]*r * (1.0-0.5*r/cut[itype][jtype]);
-     // eng shifted to 0.0 at cutoff
-  return 0.5*a0[itype][jtype]*cut[itype][jtype] * wd;
-}
+void PairDPDfdtEnergyKokkos<DeviceType>::operator()(TagPairDPDfdtEnergyComputeSplit<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int &ii, EV_FLOAT& ev) const {
 
+  // The f array is atomic for Half/Thread neighbor style
+  Kokkos::View<F_FLOAT*[3], typename DAT::t_f_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_f = f;
 
-/*
-  int i,j,ii,jj,inum,jnum,itype,jtype;
+  int i,j,jj,inum,jnum,itype,jtype;
   double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
-  double rsq,r,rinv,wd,wr,factor_dpd;
-  int *ilist,*jlist,*numneigh,**firstneigh;
+  double vxtmp,vytmp,vztmp,delvx,delvy,delvz;
+  double rsq,r,rinv,wd,wr,factor_dpd,uTmp;
+  double dot,randnum;
 
-  evdwl = 0.0;
-  if (eflag || vflag) ev_setup(eflag,vflag);
-  else evflag = vflag_fdotr = 0;
+  double kappa_ij, alpha_ij, theta_ij, gamma_ij;
+  double mass_i, mass_j;
+  double massinv_i, massinv_j;
+  double randPair, mu_ij;
 
-  double **x = atom->x;
-  double **f = atom->f;
-  int *type = atom->type;
-  int nlocal = atom->nlocal;
-  double *special_lj = force->special_lj;
-  int newton_pair = force->newton_pair;
+  i = d_ilist[ii];
+  xtmp = x(i,0);
+  ytmp = x(i,1);
+  ztmp = x(i,2);
+  itype = type[i];
+  jnum = d_numneigh[i];
 
-  inum = list->inum;
-  ilist = list->ilist;
-  numneigh = list->numneigh;
-  firstneigh = list->firstneigh;
+  double fx_i = 0.0;
+  double fy_i = 0.0;
+  double fz_i = 0.0;
 
-  // loop over neighbors of my atoms
+  for (jj = 0; jj < jnum; jj++) {
+    j = d_neighbors(i,jj);
+    factor_dpd = special_lj[sbmask(j)];
+    j &= NEIGHMASK;
 
-  for (ii = 0; ii < inum; ii++) {
-    i = ilist[ii];
-    xtmp = x[i][0];
-    ytmp = x[i][1];
-    ztmp = x[i][2];
-    itype = type[i];
-    jlist = firstneigh[i];
-    jnum = numneigh[i];
+    delx = xtmp - x(j,0);
+    dely = ytmp - x(j,1);
+    delz = ztmp - x(j,2);
+    rsq = delx*delx + dely*dely + delz*delz;
+    jtype = type[j];
 
-    for (jj = 0; jj < jnum; jj++) {
-      j = jlist[jj];
-      factor_dpd = special_lj[sbmask(j)];
-      j &= NEIGHMASK;
+    double cutsq_ij = STACKPARAMS?m_cutsq[itype][jtype]:d_cutsq(itype,jtype);
+    if (rsq < cutsq_ij) {
+      r = sqrt(rsq);
+      if (r < EPSILON) continue;     // r can be 0.0 in DPD systems
+      rinv = 1.0/r;
+      double cut_ij = STACKPARAMS?m_params[itype][jtype].cut:params(itype,jtype).cut;
+      wr = 1.0 - r/cut_ij;
+      wd = wr*wr;
 
-      delx = xtmp - x[j][0];
-      dely = ytmp - x[j][1];
-      delz = ztmp - x[j][2];
-      rsq = delx*delx + dely*dely + delz*delz;
-      jtype = type[j];
+      // conservative force = a0 * wr
+      double a0_ij = STACKPARAMS?m_params[itype][jtype].a0:params(itype,jtype).a0;
+      fpair = a0_ij*wr;
+      fpair *= factor_dpd*rinv;
 
-      if (rsq < cutsq[itype][jtype]) {
-        r = sqrt(rsq);
-        if (r < EPSILON) continue;     // r can be 0.0 in DPD systems
-        rinv = 1.0/r;
-        wr = 1.0 - r/cut[itype][jtype];
-        wd = wr*wr;
-
-        // conservative force = a0 * wr
-        fpair = a0[itype][jtype]*wr;
-        fpair *= factor_dpd*rinv;
-
-        f[i][0] += delx*fpair;
-        f[i][1] += dely*fpair;
-        f[i][2] += delz*fpair;
-        if (newton_pair || j < nlocal) {
-          f[j][0] -= delx*fpair;
-          f[j][1] -= dely*fpair;
-          f[j][2] -= delz*fpair;
-        }
-
-        if (eflag) {
-          // unshifted eng of conservative term:
-          // evdwl = -a0[itype][jtype]*r * (1.0-0.5*r/cut[itype][jtype]);
-          // eng shifted to 0.0 at cutoff
-          evdwl = 0.5*a0[itype][jtype]*cut[itype][jtype] * wd;
-          evdwl *= factor_dpd;
-        }
-
-        if (evflag) ev_tally(i,j,nlocal,newton_pair,
-                             evdwl,0.0,fpair,delx,dely,delz);
+      fx_i += delx*fpair;
+      fy_i += dely*fpair;
+      fz_i += delz*fpair;
+      if (NEWTON_PAIR || j < nlocal) {
+        a_f(j,0) -= delx*fpair;
+        a_f(j,1) -= dely*fpair;
+        a_f(j,2) -= delz*fpair;
       }
+
+      if (eflag) {
+        // unshifted eng of conservative term:
+        // evdwl = -a0[itype][jtype]*r * (1.0-0.5*r/d_cut(itype,jtype));
+        // eng shifted to 0.0 at cutoff
+        evdwl = 0.5*a0_ij*cut_ij * wd;
+        evdwl *= factor_dpd;
+        ev.evdwl += evdwl;
+      }
+
+        if (EVFLAG) this->template ev_tally<NEIGHFLAG,NEWTON_PAIR>(ev,i,j,evdwl,fpair,delx,dely,delz);
     }
   }
 
-  if (vflag_fdotr) virial_fdotr_compute();
+  a_f(i,0) += fx_i;
+  a_f(i,1) += fy_i;
+  a_f(i,2) += fz_i;
+}
+
+template<class DeviceType>
+template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
+KOKKOS_INLINE_FUNCTION
+void PairDPDfdtEnergyKokkos<DeviceType>::operator()(TagPairDPDfdtEnergyComputeSplit<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int &ii) const {  // overload without a reduction argument
+  EV_FLOAT ev;  // local accumulator; anything the kernel tallies into it is dropped on return
+  this->template operator()<NEIGHFLAG,NEWTON_PAIR,EVFLAG>(TagPairDPDfdtEnergyComputeSplit<NEIGHFLAG,NEWTON_PAIR,EVFLAG>(), ii, ev);  // forward to the three-argument overload
+}
+
+template<class DeviceType>
+template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
+KOKKOS_INLINE_FUNCTION
+void PairDPDfdtEnergyKokkos<DeviceType>::operator()(TagPairDPDfdtEnergyComputeNoSplit<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int &ii, EV_FLOAT& ev) const {
+  // Kernel for list entry ii: conservative + drag + random pair forces, plus duMech/duCond increments; energy/virial go into ev.
+  // These arrays are atomic for Half/Thread neighbor style
+  Kokkos::View<F_FLOAT*[3], typename DAT::t_f_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_f = f;
+  Kokkos::View<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_duCond = d_duCond;
+  Kokkos::View<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_duMech = d_duMech;
+
+  int i,j,jj,inum,jnum,itype,jtype;
+  double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
+  double vxtmp,vytmp,vztmp,delvx,delvy,delvz;
+  double rsq,r,rinv,wd,wr,factor_dpd,uTmp;
+  double dot,randnum;
+
+  double kappa_ij, alpha_ij, theta_ij, gamma_ij;
+  double mass_i, mass_j;
+  double massinv_i, massinv_j;
+  double randPair, mu_ij;
+
+  rand_type rand_gen = rand_pool.get_state();  // returned to the pool by free_state() at the end of this function
+
+  i = d_ilist[ii];
+  xtmp = x(i,0);
+  ytmp = x(i,1);
+  ztmp = x(i,2);
+  vxtmp = v(i,0);
+  vytmp = v(i,1);
+  vztmp = v(i,2);
+  itype = type[i];
+  jnum = d_numneigh[i];
+
+  double fx_i = 0.0;  // force on i is summed locally and written once after the neighbor loop
+  double fy_i = 0.0;
+  double fz_i = 0.0;
+
+  for (jj = 0; jj < jnum; jj++) {
+    j = d_neighbors(i,jj);
+    factor_dpd = special_lj[sbmask(j)];  // scaling factor selected by the bits sbmask() extracts from the raw neighbor entry
+    j &= NEIGHMASK;
+
+    delx = xtmp - x(j,0);
+    dely = ytmp - x(j,1);
+    delz = ztmp - x(j,2);
+    rsq = delx*delx + dely*dely + delz*delz;
+    jtype = type[j];
+
+    double cutsq_ij = STACKPARAMS?m_cutsq[itype][jtype]:d_cutsq(itype,jtype);
+    if (rsq < cutsq_ij) {
+      r = sqrt(rsq);
+      if (r < EPSILON) continue;     // r can be 0.0 in DPD systems
+      rinv = 1.0/r;
+      double cut_ij = STACKPARAMS?m_params[itype][jtype].cut:params(itype,jtype).cut;
+      wr = 1.0 - r/cut_ij;
+      wd = wr*wr;
+
+      delvx = vxtmp - v(j,0);
+      delvy = vytmp - v(j,1);
+      delvz = vztmp - v(j,2);
+      dot = delx*delvx + dely*delvy + delz*delvz;
+      randnum = rand_gen.normal();
+
+      // Compute the current temperature
+      theta_ij = 0.5*(1.0/dpdTheta[i] + 1.0/dpdTheta[j]);
+      theta_ij = 1.0/theta_ij;
+
+      double sigma_ij = STACKPARAMS?m_params[itype][jtype].sigma:params(itype,jtype).sigma;
+      gamma_ij = sigma_ij*sigma_ij
+                 / (2.0*boltz*theta_ij);
+
+      // conservative force = a0 * wr
+      // drag force = -gamma * wr^2 * (delx dot delv) / r
+      // random force = sigma * wr * rnd * dtinvsqrt;
+
+      double a0_ij = STACKPARAMS?m_params[itype][jtype].a0:params(itype,jtype).a0;
+      fpair = a0_ij*wr;
+      fpair -= gamma_ij*wd*dot*rinv;
+      fpair += sigma_ij*wr*randnum*dtinvsqrt;
+      fpair *= factor_dpd*rinv;
+
+      fx_i += delx*fpair;
+      fy_i += dely*fpair;
+      fz_i += delz*fpair;
+      if (NEWTON_PAIR || j < nlocal) {
+        a_f(j,0) -= delx*fpair;  // write through a_f (not f) so the update of j honours the atomic trait chosen by NEIGHFLAG
+        a_f(j,1) -= dely*fpair;
+        a_f(j,2) -= delz*fpair;
+      }
+
+      if (rmass) {  // NOTE(review): rmass is a raw double* member, not a Kokkos view - confirm it is valid in device builds
+        mass_i = rmass[i];
+        mass_j = rmass[j];
+      } else {
+        mass_i = mass[itype];
+        mass_j = mass[jtype];
+      }
+      massinv_i = 1.0 / mass_i;
+      massinv_j = 1.0 / mass_j;
+
+      // Compute the mechanical and conductive energy, uMech and uCond
+      mu_ij = massinv_i + massinv_j;
+      mu_ij *= force->ftm2v;  // NOTE(review): dereferences the 'force' pointer inside the kernel - confirm this is valid in device builds
+
+      uTmp = gamma_ij*wd*rinv*rinv*dot*dot
+             - 0.5*sigma_ij*sigma_ij*mu_ij*wd;
+      uTmp -= sigma_ij*wr*rinv*dot*randnum*dtinvsqrt;
+      uTmp *= 0.5;
+
+      a_duMech[i] += uTmp;  // same increment is added to both atoms of the pair
+      if (NEWTON_PAIR || j < nlocal) {
+        a_duMech[j] += uTmp;
+      }
+
+      // Compute uCond
+      randnum = rand_gen.normal();  // second, independent draw for the conductive term
+      kappa_ij = STACKPARAMS?m_params[itype][jtype].kappa:params(itype,jtype).kappa;
+      alpha_ij = sqrt(2.0*boltz*kappa_ij);
+      randPair = alpha_ij*wr*randnum*dtinvsqrt;
+
+      uTmp = kappa_ij*(1.0/dpdTheta[i] - 1.0/dpdTheta[j])*wd;
+      uTmp += randPair;
+
+      a_duCond[i] += uTmp;  // opposite signs: what i gains, j loses
+      if (NEWTON_PAIR || j < nlocal) {
+        a_duCond[j] -= uTmp;
+      }
+
+      if (eflag) {
+        // unshifted eng of conservative term:
+        // evdwl = -a0[itype][jtype]*r * (1.0-0.5*r/d_cut(itype,jtype));
+        // eng shifted to 0.0 at cutoff
+        evdwl = 0.5*a0_ij*cut_ij * wd;
+        evdwl *= factor_dpd;
+        ev.evdwl += evdwl;
+      }
+
+      if (EVFLAG) this->template ev_tally<NEIGHFLAG,NEWTON_PAIR>(ev,i,j,evdwl,fpair,delx,dely,delz);
+    }
+  }
+
+  a_f(i,0) += fx_i;
+  a_f(i,1) += fy_i;
+  a_f(i,2) += fz_i;
+
+  rand_pool.free_state(rand_gen);
+}
+
+template<class DeviceType>
+template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
+KOKKOS_INLINE_FUNCTION
+void PairDPDfdtEnergyKokkos<DeviceType>::operator()(TagPairDPDfdtEnergyComputeNoSplit<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int &ii) const {  // overload without a reduction argument (the form compute() launches via parallel_for)
+  EV_FLOAT ev;  // local accumulator; anything the kernel tallies into it is dropped on return
+  this->template operator()<NEIGHFLAG,NEWTON_PAIR,EVFLAG>(TagPairDPDfdtEnergyComputeNoSplit<NEIGHFLAG,NEWTON_PAIR,EVFLAG>(), ii, ev);  // forward to the three-argument overload
 }
-*/
 
 /* ----------------------------------------------------------------------
    allocate all arrays
@@ -281,69 +518,26 @@ void PairDPDfdtEnergyKokkos<DeviceType>::allocate()
   PairDPDfdtEnergy::allocate();
 
   int n = atom->ntypes;
+  int nlocal = atom->nlocal;
+  int nghost = atom->nghost;
+
   memory->destroy(cutsq);
   memory->create_kokkos(k_cutsq,cutsq,n+1,n+1,"pair:cutsq");
   d_cutsq = k_cutsq.template view<DeviceType>();
-}
 
-/* ----------------------------------------------------------------------
-   global settings
-------------------------------------------------------------------------- */
+  k_params = Kokkos::DualView<params_dpd**,Kokkos::LayoutRight,DeviceType>("PairDPDfdtEnergy::params",n+1,n+1);
+  params = k_params.d_view;
 
-template<class DeviceType>
-void PairDPDfdtEnergyKokkos<DeviceType>::settings(int narg, char **arg)
-{
-  if (narg != 2) error->all(FLERR,"Illegal pair_style command");
-
-  PairDPDfdtEnergy::settings(2,arg);
-}
-
-/* ----------------------------------------------------------------------
-   init specific to this pair style
-------------------------------------------------------------------------- */
-
-template<class DeviceType>
-void PairDPDfdtEnergyKokkos<DeviceType>::init_style()
-{
-  PairDPDfdtEnergy::init_style();
-
-  neighflag = lmp->kokkos->neighflag;
-  int irequest = neighbor->nrequest - 1;
-
-  neighbor->requests[irequest]->
-    kokkos_host = Kokkos::Impl::is_same<DeviceType,LMPHostType>::value &&
-    !Kokkos::Impl::is_same<DeviceType,LMPDeviceType>::value;
-  neighbor->requests[irequest]->
-    kokkos_device = Kokkos::Impl::is_same<DeviceType,LMPDeviceType>::value;
-
-  if (neighflag == HALF || neighflag == HALFTHREAD) {
-    neighbor->requests[irequest]->full = 0;
-    neighbor->requests[irequest]->half = 1;
-  } else {
-    error->all(FLERR,"Cannot use chosen neighbor list style with dpd/fdt/energy/kk");
+  if (!splitFDT_flag) {
+    memory->destroy(duCond);
+    memory->destroy(duMech);
+    memory->create_kokkos(k_duCond,duCond,nlocal+nghost+1,"pair:duCond");
+    memory->create_kokkos(k_duMech,duMech,nlocal+nghost+1,"pair:duMech");
+    d_duCond = k_duCond.view<DeviceType>();
+    d_duMech = k_duMech.view<DeviceType>();
+    h_duCond = k_duCond.h_view;
+    h_duMech = k_duMech.h_view;
   }
-
-/*
-  if (comm->ghost_velocity == 0)
-    error->all(FLERR,"Pair dpd/fdt/energy requires ghost atoms store velocity");
-
-  // if newton off, forces between atoms ij will be double computed
-  // using different random numbers
-
-  if (force->newton_pair == 0 && comm->me == 0) error->warning(FLERR,
-      "Pair dpd/fdt/energy requires newton pair on");
-
-  int irequest = neighbor->request(this,instance_me);
-  neighbor->requests[irequest]->ssa = 0;
-  for (int i = 0; i < modify->nfix; i++)
-    if (strcmp(modify->fix[i]->style,"shardlow") == 0)
-      neighbor->requests[irequest]->ssa = 1;
-
-  bool eos_flag = false;
-  for (int i = 0; i < modify->nfix; i++)
-    if (strncmp(modify->fix[i]->style,"eos",3) == 0) eos_flag = true;
-  if(!eos_flag) error->all(FLERR,"pair_style dpd/fdt/energy requires an EOS to be specified");
-*/
 }
 
 /* ----------------------------------------------------------------------
@@ -355,21 +549,129 @@ double PairDPDfdtEnergyKokkos<DeviceType>::init_one(int i, int j)
 {
   double cutone = PairDPDfdtEnergy::init_one(i,j);
 
+  k_params.h_view(i,j).cut = cut[i][j];
+  k_params.h_view(i,j).a0 = a0[i][j];
+  k_params.h_view(i,j).sigma = sigma[i][j];
+  k_params.h_view(i,j).kappa = kappa[i][j];
+  k_params.h_view(j,i) = k_params.h_view(i,j);
   if(i<MAX_TYPES_STACKPARAMS+1 && j<MAX_TYPES_STACKPARAMS+1) {
+    m_params[i][j] = m_params[j][i] = k_params.h_view(i,j);
     m_cutsq[j][i] = m_cutsq[i][j] = cutone*cutone;
   }
+
   k_cutsq.h_view(i,j) = cutone*cutone;
   k_cutsq.h_view(j,i) = k_cutsq.h_view(i,j);
   k_cutsq.template modify<LMPHostType>();
+  k_params.template modify<LMPHostType>();
 
   return cutone;
 }
 
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+template<int NEIGHFLAG, int NEWTON_PAIR>
+KOKKOS_INLINE_FUNCTION
+void PairDPDfdtEnergyKokkos<DeviceType>::ev_tally(EV_FLOAT &ev, const int &i, const int &j,
+      const F_FLOAT &epair, const F_FLOAT &fpair, const F_FLOAT &delx,
+                const F_FLOAT &dely, const F_FLOAT &delz) const
+{  // Tally one pair (i,j): per-atom energy into eatom, virial into ev.v[] and/or vatom, as selected by the e/v flags
+  const int EFLAG = eflag;
+  const int VFLAG = vflag_either;
+
+  // The eatom and vatom arrays are atomic for Half/Thread neighbor style
+  Kokkos::View<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > v_eatom = k_eatom.view<DeviceType>();
+  Kokkos::View<F_FLOAT*[6], typename DAT::t_virial_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > v_vatom = k_vatom.view<DeviceType>();
+
+  if (EFLAG) {  // only the per-atom energy is handled here; this function never writes ev.evdwl
+    if (eflag_atom) {
+      const E_FLOAT epairhalf = 0.5 * epair;  // each atom of the pair is credited half of the pair energy
+      if (NEIGHFLAG!=FULL) {
+        if (NEWTON_PAIR || i < nlocal) v_eatom[i] += epairhalf;
+        if (NEWTON_PAIR || j < nlocal) v_eatom[j] += epairhalf;
+      } else {
+        v_eatom[i] += epairhalf;
+      }
+    }
+  }
+
+  if (VFLAG) {
+    const E_FLOAT v0 = delx*delx*fpair;  // xx
+    const E_FLOAT v1 = dely*dely*fpair;  // yy
+    const E_FLOAT v2 = delz*delz*fpair;  // zz
+    const E_FLOAT v3 = delx*dely*fpair;  // xy
+    const E_FLOAT v4 = delx*delz*fpair;  // xz
+    const E_FLOAT v5 = dely*delz*fpair;  // yz
+
+    if (vflag_global) {
+      if (NEIGHFLAG!=FULL) {  // non-FULL lists: one half share for i and one half share for j, each subject to the newton/nlocal test
+        if (NEWTON_PAIR || i < nlocal) {
+          ev.v[0] += 0.5*v0;
+          ev.v[1] += 0.5*v1;
+          ev.v[2] += 0.5*v2;
+          ev.v[3] += 0.5*v3;
+          ev.v[4] += 0.5*v4;
+          ev.v[5] += 0.5*v5;
+        }
+        if (NEWTON_PAIR || j < nlocal) {
+        ev.v[0] += 0.5*v0;
+        ev.v[1] += 0.5*v1;
+        ev.v[2] += 0.5*v2;
+        ev.v[3] += 0.5*v3;
+        ev.v[4] += 0.5*v4;
+        ev.v[5] += 0.5*v5;
+        }
+      } else {  // FULL list: only the half share for i is added
+        ev.v[0] += 0.5*v0;
+        ev.v[1] += 0.5*v1;
+        ev.v[2] += 0.5*v2;
+        ev.v[3] += 0.5*v3;
+        ev.v[4] += 0.5*v4;
+        ev.v[5] += 0.5*v5;
+      }
+    }
+
+    if (vflag_atom) {  // same sharing rule as the global virial, applied to the per-atom array
+      if (NEIGHFLAG!=FULL) {
+        if (NEWTON_PAIR || i < nlocal) {
+          v_vatom(i,0) += 0.5*v0;
+          v_vatom(i,1) += 0.5*v1;
+          v_vatom(i,2) += 0.5*v2;
+          v_vatom(i,3) += 0.5*v3;
+          v_vatom(i,4) += 0.5*v4;
+          v_vatom(i,5) += 0.5*v5;
+        }
+        if (NEWTON_PAIR || j < nlocal) {
+        v_vatom(j,0) += 0.5*v0;
+        v_vatom(j,1) += 0.5*v1;
+        v_vatom(j,2) += 0.5*v2;
+        v_vatom(j,3) += 0.5*v3;
+        v_vatom(j,4) += 0.5*v4;
+        v_vatom(j,5) += 0.5*v5;
+        }
+      } else {
+        v_vatom(i,0) += 0.5*v0;
+        v_vatom(i,1) += 0.5*v1;
+        v_vatom(i,2) += 0.5*v2;
+        v_vatom(i,3) += 0.5*v3;
+        v_vatom(i,4) += 0.5*v4;
+        v_vatom(i,5) += 0.5*v5;
+      }
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+int PairDPDfdtEnergyKokkos<DeviceType>::sbmask(const int& j) const {  // returns the 2-bit field of j that starts at bit SBBITS (value 0..3)
+  return j >> SBBITS & 3;  // '>>' binds tighter than '&', so this evaluates as (j >> SBBITS) & 3
+}
 
 namespace LAMMPS_NS {
 template class PairDPDfdtEnergyKokkos<LMPDeviceType>;
 #ifdef KOKKOS_HAVE_CUDA
 template class PairDPDfdtEnergyKokkos<LMPHostType>;
 #endif
-}
-
+}
\ No newline at end of file
diff --git a/src/KOKKOS/pair_dpd_fdt_energy_kokkos.h b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.h
index a8a5f25801..b8d22eff34 100644
--- a/src/KOKKOS/pair_dpd_fdt_energy_kokkos.h
+++ b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.h
@@ -22,67 +22,115 @@ PairStyle(dpd/fdt/energy/kk/host,PairDPDfdtEnergyKokkos<LMPHostType>)
 #ifndef LMP_PAIR_DPD_FDT_ENERGY_KOKKOS_H
 #define LMP_PAIR_DPD_FDT_ENERGY_KOKKOS_H
 
-#include "pair_kokkos.h"
 #include "pair_dpd_fdt_energy.h"
-#include "neigh_list_kokkos.h"
+#include "pair_kokkos.h"
+#include "kokkos_type.h"
+#include "Kokkos_Random.hpp"
+#include "rand_pool_wrap.h"
 
 namespace LAMMPS_NS {
 
+struct TagPairDPDfdtEnergyZero{};  // dispatch tag: selects the operator() overload that zeroes d_duCond/d_duMech
+
+template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
+struct TagPairDPDfdtEnergyComputeSplit{};  // dispatch tag: kernel that applies only the conservative a0*wr pair force
+
+template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
+struct TagPairDPDfdtEnergyComputeNoSplit{};  // dispatch tag: kernel with conservative, drag and random forces plus duMech/duCond accumulation
+
 template<class DeviceType>
 class PairDPDfdtEnergyKokkos : public PairDPDfdtEnergy {
  public:
-  enum {EnabledNeighFlags=HALFTHREAD|HALF};
-  enum {COUL_FLAG=0};
   typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;
+  typedef EV_FLOAT value_type;
+
   PairDPDfdtEnergyKokkos(class LAMMPS *);
   virtual ~PairDPDfdtEnergyKokkos();
   virtual void compute(int, int);
-  virtual void settings(int, char **);
   void init_style();
   double init_one(int, int);
 
+  KOKKOS_INLINE_FUNCTION void operator()(TagPairDPDfdtEnergyZero, const int&) const;  // zeroing kernel; annotated like its out-of-class definition and the sibling overloads
+
+  template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPairDPDfdtEnergyComputeSplit<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int&, EV_FLOAT&) const;
+
+  template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPairDPDfdtEnergyComputeSplit<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int&) const;
+
+  template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPairDPDfdtEnergyComputeNoSplit<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int&, EV_FLOAT&) const;
+
+  template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPairDPDfdtEnergyComputeNoSplit<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int&) const;
+
+  template<int NEIGHFLAG, int NEWTON_PAIR>
+  KOKKOS_INLINE_FUNCTION
+  void ev_tally(EV_FLOAT &ev, const int &i, const int &j,
+      const F_FLOAT &epair, const F_FLOAT &fpair, const F_FLOAT &delx,
+                  const F_FLOAT &dely, const F_FLOAT &delz) const;
+
+  KOKKOS_INLINE_FUNCTION
+  int sbmask(const int& j) const;
+
+  struct params_dpd {  // per type-pair coefficients read by the compute kernels
+    params_dpd(){cut=0;a0=0;sigma=0;kappa=0;};
+    params_dpd(int i){cut=0;a0=0;sigma=0;kappa=0;};  // argument is ignored; same zero-initialisation as the default constructor
+    F_FLOAT cut,a0,sigma,kappa;  // filled per (i,j) in init_one() from the cut/a0/sigma/kappa tables
+  };
+
  protected:
-  void cleanup_copy();
+  int eflag,vflag;
+  int nlocal,neighflag;
+  int STACKPARAMS;
+  double dtinvsqrt;
+  double boltz;
 
-  template<bool STACKPARAMS, class Specialisation>
-  KOKKOS_INLINE_FUNCTION
-  F_FLOAT compute_fpair(const F_FLOAT& rsq, const int& i, const int&j, const int& itype, const int& jtype) const;
+  virtual void allocate();
 
-  template<bool STACKPARAMS, class Specialisation>
-  KOKKOS_INLINE_FUNCTION
-  F_FLOAT compute_evdwl(const F_FLOAT& rsq, const int& i, const int&j, const int& itype, const int& jtype) const;
+  Kokkos::DualView<params_dpd**,Kokkos::LayoutRight,DeviceType> k_params;
+  typename Kokkos::DualView<params_dpd**,
+    Kokkos::LayoutRight,DeviceType>::t_dev_const_um params;
+  // hardwired to space for 15 atom types
+  params_dpd m_params[MAX_TYPES_STACKPARAMS+1][MAX_TYPES_STACKPARAMS+1];
 
   F_FLOAT m_cutsq[MAX_TYPES_STACKPARAMS+1][MAX_TYPES_STACKPARAMS+1];
+  F_FLOAT m_cut[MAX_TYPES_STACKPARAMS+1][MAX_TYPES_STACKPARAMS+1];
   typename ArrayTypes<DeviceType>::t_x_array_randomread x;
   typename ArrayTypes<DeviceType>::t_x_array c_x;
+  typename ArrayTypes<DeviceType>::t_v_array_randomread v;
   typename ArrayTypes<DeviceType>::t_f_array f;
   typename ArrayTypes<DeviceType>::t_int_1d_randomread type;
+  typename ArrayTypes<DeviceType>::t_float_1d_randomread mass;
+  double *rmass;
+  typename AT::t_efloat_1d dpdTheta;
+  DAT::tdual_efloat_1d k_duCond,k_duMech;
+  typename AT::t_efloat_1d d_duCond,d_duMech;
+  HAT::t_efloat_1d h_duCond,h_duMech;
 
   DAT::tdual_efloat_1d k_eatom;
   DAT::tdual_virial_array k_vatom;
-  typename ArrayTypes<DeviceType>::t_efloat_1d d_eatom;
-  typename ArrayTypes<DeviceType>::t_virial_array d_vatom;
-  typename ArrayTypes<DeviceType>::t_tagint_1d tag;
+  DAT::t_efloat_1d d_eatom;
+  DAT::t_virial_array d_vatom;
 
-  int newton_pair;
-  double special_lj[4];
+  typename AT::t_neighbors_2d d_neighbors;
+  typename AT::t_int_1d_randomread d_ilist;
+  typename AT::t_int_1d_randomread d_numneigh;
 
   typename ArrayTypes<DeviceType>::tdual_ffloat_2d k_cutsq;
   typename ArrayTypes<DeviceType>::t_ffloat_2d d_cutsq;
 
+  /**/Kokkos::Random_XorShift64_Pool<DeviceType> rand_pool;
+  typedef typename Kokkos::Random_XorShift64_Pool<DeviceType>::generator_type rand_type;/**/
 
-  int neighflag;
-  int nlocal,nall,eflag,vflag;
+  /**RandPoolWrap rand_pool;
+  typedef RandWrap rand_type;/**/
 
-  void allocate();
-
-  friend class PairComputeFunctor<PairDPDfdtEnergyKokkos,HALF,true>;
-  friend class PairComputeFunctor<PairDPDfdtEnergyKokkos,HALFTHREAD,true>;
-  friend class PairComputeFunctor<PairDPDfdtEnergyKokkos,HALF,false>;
-  friend class PairComputeFunctor<PairDPDfdtEnergyKokkos,HALFTHREAD,false>;
-  friend EV_FLOAT pair_compute_neighlist<PairDPDfdtEnergyKokkos,HALF,void>(PairDPDfdtEnergyKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairDPDfdtEnergyKokkos,HALFTHREAD,void>(PairDPDfdtEnergyKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute<PairDPDfdtEnergyKokkos,void>(PairDPDfdtEnergyKokkos*,NeighListKokkos<DeviceType>*);
   friend void pair_virial_fdotr_compute<PairDPDfdtEnergyKokkos>(PairDPDfdtEnergyKokkos*);
 };
 
diff --git a/src/KOKKOS/rand_pool_wrap.cpp b/src/KOKKOS/rand_pool_wrap.cpp
new file mode 100644
index 0000000000..b6fd0dbc55
--- /dev/null
+++ b/src/KOKKOS/rand_pool_wrap.cpp
@@ -0,0 +1,72 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "comm.h"
+#include "rand_pool_wrap.h"
+#include "lammps.h"
+#include "kokkos.h"
+#include "random_mars.h"
+#include "update.h"
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+RandPoolWrap::RandPoolWrap(int, LAMMPS *lmp) : Pointers(lmp)  // the int argument is unnamed and unused
+{
+  random_thr =  NULL;  // pool stays empty until init() is called
+  nthreads = lmp->kokkos->num_threads;
+}
+
+/* ---------------------------------------------------------------------- */
+
+RandPoolWrap::~RandPoolWrap()
+{
+  // NOTE(review): random_thr is not freed here; releasing the pool appears to be left to destroy() - confirm
+}
+
+void RandPoolWrap::destroy()  // free the generators in slots 1..nthreads-1 and the pointer table; does nothing if the pool is empty
+{
+  if (random_thr) {
+    for (int i=1; i < nthreads; ++i)  // start at 1: slot 0 holds the RanMars passed to init(), which this class did not allocate
+      delete random_thr[i];
+  
+    delete[] random_thr;
+    random_thr = NULL;
+  }
+}
+
+void RandPoolWrap::init(RanMars* random, int seed)  // (re)build the pool: slot 0 = 'random', slots 1..nthreads-1 = newly allocated RanMars
+{
+  // deallocate pool of RNGs
+  if (random_thr) {
+    for (int i=1; i < this->nthreads; ++i)  // still the nthreads value the existing pool was sized with
+      delete random_thr[i];
+  
+    delete[] random_thr;
+  }
+  
+  // allocate pool of RNGs
+  // generate a random number generator instance for
+  // all threads != 0. make sure we use unique seeds.
+  nthreads = lmp->kokkos->num_threads;
+  random_thr = new RanMars*[nthreads];
+  for (int tid = 1; tid < nthreads; ++tid) {
+    random_thr[tid] = new RanMars(lmp, seed + comm->me
+                                  + comm->nprocs*tid);  // seed offset depends on both the MPI rank and the thread index
+  }
+
+  // to ensure full compatibility with the serial style
+  // we use the serial random number generator instance for thread 0
+  random_thr[0] = random;
+}
\ No newline at end of file
diff --git a/src/KOKKOS/rand_pool_wrap.h b/src/KOKKOS/rand_pool_wrap.h
new file mode 100644
index 0000000000..349896ee9a
--- /dev/null
+++ b/src/KOKKOS/rand_pool_wrap.h
@@ -0,0 +1,84 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifndef RAND_POOL_WRAP_H
+#define RAND_POOL_WRAP_H
+
+#include "pointers.h"
+#include "kokkos_type.h"
+#include "random_mars.h"
+#include "error.h"
+
+namespace LAMMPS_NS {
+
+struct RandWrap {  // thin handle around a RanMars pointer; it never deletes the generator
+  class RanMars* rng;
+
+  RandWrap() {
+    rng = NULL;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  double drand() {
+    return rng->uniform();  // forwards to RanMars::uniform(); rng must have been set first (RandPoolWrap::get_state does this)
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  double normal() {
+    return rng->gaussian();  // forwards to RanMars::gaussian()
+  }
+};
+
+class RandPoolWrap : protected Pointers {  // table of RanMars pointers, looked up by hardware thread id in get_state()
+ public:
+  RandPoolWrap(int, class LAMMPS *);
+  ~RandPoolWrap();
+  void destroy();
+  void init(RanMars*, int);
+
+  KOKKOS_INLINE_FUNCTION
+  RandWrap get_state() const
+  {
+#ifdef KOKKOS_HAVE_CUDA
+    error->all(FLERR,"Cannot use Marsaglia RNG with GPUs");  // NOTE(review): host-side error call inside a KOKKOS_INLINE_FUNCTION - confirm this builds for CUDA
+#endif
+
+    RandWrap rand_wrap;
+    int tid = 0;
+#ifndef KOKKOS_HAVE_CUDA
+    tid = LMPDeviceType::hardware_thread_id();
+#endif
+    rand_wrap.rng = random_thr[tid];  // hands out a pointer into the pool; the pool keeps the generator
+    return rand_wrap;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void free_state(RandWrap) const
+  {
+    // nothing to release: get_state() only copies a pointer out of random_thr
+  }
+
+  void clean_copy() { random_thr = NULL; }  // drops the table pointer without freeing anything
+
+ private:
+  class RanMars **random_thr;
+  int nthreads;
+};
+
+}
+
+#endif
+
+/* ERROR/WARNING messages:
+
+*/
diff --git a/src/USER-DPD/pair_dpd_fdt_energy.cpp b/src/USER-DPD/pair_dpd_fdt_energy.cpp
index 558994d35e..19994acfa1 100644
--- a/src/USER-DPD/pair_dpd_fdt_energy.cpp
+++ b/src/USER-DPD/pair_dpd_fdt_energy.cpp
@@ -54,6 +54,8 @@ PairDPDfdtEnergy::PairDPDfdtEnergy(LAMMPS *lmp) : Pair(lmp)
 
 PairDPDfdtEnergy::~PairDPDfdtEnergy()
 {
+  if (copymode) return;
+
   if (allocated) {
     memory->destroy(setflag);
     memory->destroy(cutsq);
diff --git a/src/USER-DPD/pair_dpd_fdt_energy.h b/src/USER-DPD/pair_dpd_fdt_energy.h
index 335beea7e3..ff29667682 100644
--- a/src/USER-DPD/pair_dpd_fdt_energy.h
+++ b/src/USER-DPD/pair_dpd_fdt_energy.h
@@ -31,8 +31,8 @@ class PairDPDfdtEnergy : public Pair {
   virtual void compute(int, int);
   virtual void settings(int, char **);
   virtual void coeff(int, char **);
-  void init_style();
-  double init_one(int, int);
+  virtual void init_style();
+  virtual double init_one(int, int);
   void write_restart(FILE *);
   void read_restart(FILE *);
   virtual void write_restart_settings(FILE *);
@@ -53,7 +53,7 @@ class PairDPDfdtEnergy : public Pair {
   int seed;
   bool splitFDT_flag;
 
-  void allocate();
+  virtual void allocate();
 
 };
 

From 6f51c3b75c49e4777304e6f926ad53f4792e2ce1 Mon Sep 17 00:00:00 2001
From: Stan Moore <stanmoore1@gmail.com>
Date: Mon, 19 Dec 2016 16:25:31 -0700
Subject: [PATCH 023/267] Fixing issues in pair_multi_lucy_rx_kokkos

---
 src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp | 4 ----
 src/KOKKOS/pair_multi_lucy_rx_kokkos.h   | 1 -
 2 files changed, 5 deletions(-)

diff --git a/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp b/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp
index 03bbaf9907..76337b5219 100644
--- a/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp
@@ -72,8 +72,6 @@ PairMultiLucyRXKokkos<DeviceType>::PairMultiLucyRXKokkos(LAMMPS *lmp) : PairMult
   datamask_modify = EMPTY_MASK;
 
   update_table = 0;
-  ntables = 0;
-  tables = NULL;
   h_table = new TableHost();
   d_table = new TableDevice();
 
@@ -112,11 +110,9 @@ void PairMultiLucyRXKokkos<DeviceType>::init_style()
   if (neighflag == FULL) {
     neighbor->requests[irequest]->full = 1;
     neighbor->requests[irequest]->half = 0;
-    neighbor->requests[irequest]->ghost = 1;
   } else if (neighflag == HALF || neighflag == HALFTHREAD) {
     neighbor->requests[irequest]->full = 0;
     neighbor->requests[irequest]->half = 1;
-    neighbor->requests[irequest]->ghost = 1;
   } else {
     error->all(FLERR,"Cannot use chosen neighbor list style with multi/lucy/rx/kk");
   }
diff --git a/src/KOKKOS/pair_multi_lucy_rx_kokkos.h b/src/KOKKOS/pair_multi_lucy_rx_kokkos.h
index a259588f78..b205f00796 100644
--- a/src/KOKKOS/pair_multi_lucy_rx_kokkos.h
+++ b/src/KOKKOS/pair_multi_lucy_rx_kokkos.h
@@ -119,7 +119,6 @@ class PairMultiLucyRXKokkos : public PairMultiLucyRX {
   //  double *rsq,*drsq,*e,*de,*f,*df,*e2,*f2;
   //};
 
-  int tabstyle,tablength;
   /*struct TableDeviceConst {
     typename ArrayTypes<DeviceType>::t_ffloat_2d_randomread cutsq;
     typename ArrayTypes<DeviceType>::t_int_2d_randomread tabindex;

From f62a6fe5a55d17cb95d0a3088d2c5d7f7f10b0ee Mon Sep 17 00:00:00 2001
From: Stan Moore <stanmoore1@gmail.com>
Date: Mon, 19 Dec 2016 16:50:22 -0700
Subject: [PATCH 024/267] Renaming rand_pool_wrap to rand_pool_wrap_kokkos

---
 src/KOKKOS/pair_dpd_fdt_energy_kokkos.h                      | 2 +-
 src/KOKKOS/{rand_pool_wrap.cpp => rand_pool_wrap_kokkos.cpp} | 4 ++--
 src/KOKKOS/{rand_pool_wrap.h => rand_pool_wrap_kokkos.h}     | 0
 3 files changed, 3 insertions(+), 3 deletions(-)
 rename src/KOKKOS/{rand_pool_wrap.cpp => rand_pool_wrap_kokkos.cpp} (98%)
 rename src/KOKKOS/{rand_pool_wrap.h => rand_pool_wrap_kokkos.h} (100%)

diff --git a/src/KOKKOS/pair_dpd_fdt_energy_kokkos.h b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.h
index b8d22eff34..67fa315721 100644
--- a/src/KOKKOS/pair_dpd_fdt_energy_kokkos.h
+++ b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.h
@@ -26,7 +26,7 @@ PairStyle(dpd/fdt/energy/kk/host,PairDPDfdtEnergyKokkos<LMPHostType>)
 #include "pair_kokkos.h"
 #include "kokkos_type.h"
 #include "Kokkos_Random.hpp"
-#include "rand_pool_wrap.h"
+#include "rand_pool_wrap_kokkos.h"
 
 namespace LAMMPS_NS {
 
diff --git a/src/KOKKOS/rand_pool_wrap.cpp b/src/KOKKOS/rand_pool_wrap_kokkos.cpp
similarity index 98%
rename from src/KOKKOS/rand_pool_wrap.cpp
rename to src/KOKKOS/rand_pool_wrap_kokkos.cpp
index b6fd0dbc55..c11764640b 100644
--- a/src/KOKKOS/rand_pool_wrap.cpp
+++ b/src/KOKKOS/rand_pool_wrap_kokkos.cpp
@@ -12,7 +12,7 @@
 ------------------------------------------------------------------------- */
 
 #include "comm.h"
-#include "rand_pool_wrap.h"
+#include "rand_pool_wrap_kokkos.h"
 #include "lammps.h"
 #include "kokkos.h"
 #include "random_mars.h"
@@ -69,4 +69,4 @@ void RandPoolWrap::init(RanMars* random, int seed)
   // to ensure full compatibility with the serial style
   // we use the serial random number generator instance for thread 0
   random_thr[0] = random;
-}
\ No newline at end of file
+}
diff --git a/src/KOKKOS/rand_pool_wrap.h b/src/KOKKOS/rand_pool_wrap_kokkos.h
similarity index 100%
rename from src/KOKKOS/rand_pool_wrap.h
rename to src/KOKKOS/rand_pool_wrap_kokkos.h

From 889ee78f8ba0587020cffe61d5a6cb6515f2e4c6 Mon Sep 17 00:00:00 2001
From: Stan Moore <stanmoore1@gmail.com>
Date: Mon, 19 Dec 2016 17:15:02 -0700
Subject: [PATCH 025/267] Change necessary for pair_exp6_rx_kokkos to compile
 on GPU

---
 src/USER-DPD/pair_exp6_rx.h | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/src/USER-DPD/pair_exp6_rx.h b/src/USER-DPD/pair_exp6_rx.h
index f9654e4086..2dfc1c1a2e 100644
--- a/src/USER-DPD/pair_exp6_rx.h
+++ b/src/USER-DPD/pair_exp6_rx.h
@@ -37,6 +37,14 @@ class PairExp6rx : public Pair {
   void write_restart_settings(FILE *);
   void read_restart_settings(FILE *);
 
+  struct Param {
+    double epsilon,rm,alpha;
+    int ispecies;
+    char *name, *potential;      // names of unique molecules and interaction type
+    char *tablename;             // name of interaction table
+   int potentialType;              // enumerated interaction potential type.
+  };
+
  protected:
   enum{LINEAR};
   double cut_global;
@@ -48,13 +56,6 @@ class PairExp6rx : public Pair {
   int *mol2param;               // mapping from molecule to parameters
   int nparams;                  // # of stored parameter sets
   int maxparam;                 // max # of parameter sets
-  struct Param {
-    double epsilon,rm,alpha;
-    int ispecies;
-    char *name, *potential;      // names of unique molecules and interaction type
-    char *tablename;             // name of interaction table
-   int potentialType;              // enumerated interaction potential type.
-  };
   Param *params;                // parameter set for an I-J-K interaction
 
   int nspecies;

From 35803c75c970c4f978a8623a1364fbce8e337126 Mon Sep 17 00:00:00 2001
From: Stan Moore <stanmoore1@gmail.com>
Date: Tue, 20 Dec 2016 17:03:46 -0700
Subject: [PATCH 026/267] Fixing issues found during GPU runtime testing

---
 src/KOKKOS/fix_eos_table_rx_kokkos.cpp | 79 ++++++++++++++++++--------
 src/KOKKOS/fix_eos_table_rx_kokkos.h   |  6 +-
 src/KOKKOS/pair_exp6_rx_kokkos.cpp     |  9 +--
 3 files changed, 65 insertions(+), 29 deletions(-)

diff --git a/src/KOKKOS/fix_eos_table_rx_kokkos.cpp b/src/KOKKOS/fix_eos_table_rx_kokkos.cpp
index 3b22f61e66..cf77e25ff4 100644
--- a/src/KOKKOS/fix_eos_table_rx_kokkos.cpp
+++ b/src/KOKKOS/fix_eos_table_rx_kokkos.cpp
@@ -49,6 +49,13 @@ FixEOStableRXKokkos<DeviceType>::FixEOStableRXKokkos(LAMMPS *lmp, int narg, char
 
   k_error_flag = DAT::tdual_int_scalar("fix:error_flag");
   k_warning_flag = DAT::tdual_int_scalar("fix:warning_flag");
+
+  k_dHf = DAT::tdual_float_1d("fix:dHf",nspecies);
+  for (int n = 0; n < nspecies; n++)
+    k_dHf.h_view(n) = dHf[n];
+  k_dHf.modify<LMPHostType>();
+  k_dHf.sync<DeviceType>();
+  d_dHf = k_dHf.view<DeviceType>();
 }
 
 /* ---------------------------------------------------------------------- */
@@ -73,6 +80,7 @@ void FixEOStableRXKokkos<DeviceType>::setup(int vflag)
   copymode = 1;
 
   int nlocal = atom->nlocal;
+  double boltz = force->boltz;
   mask = atomKK->k_mask.view<DeviceType>();
   uCond = atomKK->k_uCond.view<DeviceType>();
   uMech = atomKK->k_uMech.view<DeviceType>();
@@ -82,16 +90,20 @@ void FixEOStableRXKokkos<DeviceType>::setup(int vflag)
   uCGnew = atomKK->k_uCGnew.view<DeviceType>();
   dvector = atomKK->k_dvector.view<DeviceType>();
 
-  atomKK->sync(execution_space,MASK_MASK | UCOND_MASK | UMECH_MASK | UCHEM_MASK | DPDTHETA_MASK | UCG_MASK | UCGNEW_MASK | DVECTOR_MASK);
-  atomKK->modified(execution_space,UCHEM_MASK | DPDTHETA_MASK | UCG_MASK | UCGNEW_MASK);
-
-  if (!this->restart_reset)
+  if (!this->restart_reset) {
+    atomKK->sync(execution_space,MASK_MASK | UCHEM_MASK | UCG_MASK | UCGNEW_MASK);
     Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagFixEOStableRXSetup>(0,nlocal),*this);
+    atomKK->modified(execution_space,UCHEM_MASK | UCG_MASK | UCGNEW_MASK);
+  }
 
   // Communicate the updated momenta and velocities to all nodes
+  atomKK->sync(Host,UCHEM_MASK | UCG_MASK | UCGNEW_MASK);
   comm->forward_comm_fix(this);
+  atomKK->modified(Host,UCHEM_MASK | UCG_MASK | UCGNEW_MASK);
 
+  atomKK->sync(execution_space,MASK_MASK | UCOND_MASK | UMECH_MASK | UCHEM_MASK | DPDTHETA_MASK | DVECTOR_MASK);
   Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagFixEOStableRXTemperatureLookup>(0,nlocal),*this);
+  atomKK->modified(execution_space,DPDTHETA_MASK);
 
   error_check();
 
@@ -127,6 +139,7 @@ void FixEOStableRXKokkos<DeviceType>::init()
   copymode = 1;
 
   int nlocal = atom->nlocal;
+  double boltz = force->boltz;
   mask = atomKK->k_mask.view<DeviceType>();
   uCond = atomKK->k_uCond.view<DeviceType>();
   uMech = atomKK->k_uMech.view<DeviceType>();
@@ -134,13 +147,15 @@ void FixEOStableRXKokkos<DeviceType>::init()
   dpdTheta= atomKK->k_dpdTheta.view<DeviceType>();
   dvector = atomKK->k_dvector.view<DeviceType>();
 
-  atomKK->sync(execution_space,MASK_MASK | UCOND_MASK | UMECH_MASK | UCHEM_MASK | DPDTHETA_MASK | UCG_MASK | UCGNEW_MASK | DVECTOR_MASK);
-  atomKK->modified(execution_space,UCOND_MASK | UMECH_MASK | UCHEM_MASK | DPDTHETA_MASK);
-
-  if (this->restart_reset)
+  if (this->restart_reset) {
+    atomKK->sync(execution_space,MASK_MASK | UCOND_MASK | UMECH_MASK | UCHEM_MASK | DPDTHETA_MASK | DVECTOR_MASK);
     Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagFixEOStableRXTemperatureLookup>(0,nlocal),*this);
-  else
+    atomKK->modified(execution_space,DPDTHETA_MASK);
+  } else {
+    atomKK->sync(execution_space,MASK_MASK | UCOND_MASK | UMECH_MASK | UCHEM_MASK | DPDTHETA_MASK | DVECTOR_MASK);
     Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagFixEOStableRXInit>(0,nlocal),*this);
+    atomKK->modified(execution_space,UCOND_MASK | UMECH_MASK | UCHEM_MASK | DPDTHETA_MASK);
+  }
 
   error_check();
 
@@ -172,6 +187,7 @@ void FixEOStableRXKokkos<DeviceType>::post_integrate()
   copymode = 1;
 
   int nlocal = atom->nlocal;
+  double boltz = force->boltz;
   mask = atomKK->k_mask.view<DeviceType>();
   uCond = atomKK->k_uCond.view<DeviceType>();
   uMech = atomKK->k_uMech.view<DeviceType>();
@@ -210,6 +226,7 @@ void FixEOStableRXKokkos<DeviceType>::end_of_step()
   copymode = 1;
 
   int nlocal = atom->nlocal;
+  double boltz = force->boltz;
   mask = atomKK->k_mask.view<DeviceType>();
   uCond = atomKK->k_uCond.view<DeviceType>();
   uMech = atomKK->k_uMech.view<DeviceType>();
@@ -219,18 +236,24 @@ void FixEOStableRXKokkos<DeviceType>::end_of_step()
   uCGnew = atomKK->k_uCGnew.view<DeviceType>();
   dvector = atomKK->k_dvector.view<DeviceType>();
 
-  atomKK->sync(execution_space,MASK_MASK | UCOND_MASK | UMECH_MASK | UCHEM_MASK | DPDTHETA_MASK | UCG_MASK | UCGNEW_MASK | DVECTOR_MASK);
-  atomKK->modified(execution_space,UCHEM_MASK | DPDTHETA_MASK | UCG_MASK | UCGNEW_MASK);
 
   // Communicate the ghost uCGnew
+  atomKK->sync(Host,UCG_MASK | UCGNEW_MASK);
   comm->reverse_comm_fix(this);
+  atomKK->modified(Host,UCG_MASK | UCGNEW_MASK);
 
+  atomKK->sync(execution_space,MASK_MASK | UCHEM_MASK | UCG_MASK | UCGNEW_MASK);
   Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagFixEOStableRXSetup>(0,nlocal),*this);
+  atomKK->modified(execution_space,UCHEM_MASK | UCG_MASK | UCGNEW_MASK);
 
   // Communicate the updated momenta and velocities to all nodes
+  atomKK->sync(Host,UCHEM_MASK | UCG_MASK | UCGNEW_MASK);
   comm->forward_comm_fix(this);
+  atomKK->modified(Host,UCHEM_MASK | UCG_MASK | UCGNEW_MASK);
 
+  atomKK->sync(execution_space,MASK_MASK | UCOND_MASK | UMECH_MASK | UCHEM_MASK | DPDTHETA_MASK | DVECTOR_MASK);
   Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagFixEOStableRXTemperatureLookup2>(0,nlocal),*this);
+  atomKK->modified(execution_space,DPDTHETA_MASK);
 
   error_check();
 
@@ -265,13 +288,13 @@ void FixEOStableRXKokkos<DeviceType>::energy_lookup(int id, double thetai, doubl
       //uTmp = tb->e[itable] + fraction*tb->de[itable];
       uTmp = d_table_const.e(ispecies,itable) + fraction*d_table_const.de(ispecies,itable);
 
-      uTmp += dHf[ispecies];
+      uTmp += d_dHf[ispecies];
       // mol fraction form:
       ui += dvector(ispecies,id)*uTmp;
       nTotal += dvector(ispecies,id);
     }
   }
-  ui = ui - double(nTotal+1.5)*force->boltz*thetai; // need class variable
+  ui = ui - double(nTotal+1.5)*boltz*thetai;
 }
 
 /* ----------------------------------------------------------------------
@@ -344,13 +367,16 @@ template<class DeviceType>
 int FixEOStableRXKokkos<DeviceType>::pack_forward_comm(int n, int *list, double *buf, int pbc_flag, int *pbc)
 {
   int ii,jj,m;
+  HAT::t_efloat_1d h_uChem = atomKK->k_uChem.h_view;
+  HAT::t_efloat_1d h_uCG = atomKK->k_uCG.h_view;
+  HAT::t_efloat_1d h_uCGnew = atomKK->k_uCGnew.h_view;
 
   m = 0;
   for (ii = 0; ii < n; ii++) {
     jj = list[ii];
-    buf[m++] = uChem[jj];
-    buf[m++] = uCG[jj];
-    buf[m++] = uCGnew[jj];
+    buf[m++] = h_uChem[jj];
+    buf[m++] = h_uCG[jj];
+    buf[m++] = h_uCGnew[jj];
   }
   return m;
 }
@@ -361,13 +387,16 @@ template<class DeviceType>
 void FixEOStableRXKokkos<DeviceType>::unpack_forward_comm(int n, int first, double *buf)
 {
   int ii,m,last;
+  HAT::t_efloat_1d h_uChem = atomKK->k_uChem.h_view;
+  HAT::t_efloat_1d h_uCG = atomKK->k_uCG.h_view;
+  HAT::t_efloat_1d h_uCGnew = atomKK->k_uCGnew.h_view;
 
   m = 0;
   last = first + n ;
   for (ii = first; ii < last; ii++){
-    uChem[ii]  = buf[m++];
-    uCG[ii]    = buf[m++];
-    uCGnew[ii] = buf[m++];
+    h_uChem[ii]  = buf[m++];
+    h_uCG[ii]    = buf[m++];
+    h_uCGnew[ii] = buf[m++];
   }
 }
 
@@ -377,12 +406,14 @@ template<class DeviceType>
 int FixEOStableRXKokkos<DeviceType>::pack_reverse_comm(int n, int first, double *buf)
 {
   int i,m,last;
+  HAT::t_efloat_1d h_uCG = atomKK->k_uCG.h_view;
+  HAT::t_efloat_1d h_uCGnew = atomKK->k_uCGnew.h_view;
 
   m = 0;
   last = first + n;
   for (i = first; i < last; i++) {
-    buf[m++] = uCG[i];
-    buf[m++] = uCGnew[i];
+    buf[m++] = h_uCG[i];
+    buf[m++] = h_uCGnew[i];
   }
   return m;
 }
@@ -393,13 +424,15 @@ template<class DeviceType>
 void FixEOStableRXKokkos<DeviceType>::unpack_reverse_comm(int n, int *list, double *buf)
 {
   int i,j,m;
+  HAT::t_efloat_1d h_uCG = atomKK->k_uCG.h_view;
+  HAT::t_efloat_1d h_uCGnew = atomKK->k_uCGnew.h_view;
 
   m = 0;
   for (i = 0; i < n; i++) {
     j = list[i];
 
-    uCG[j] += buf[m++];
-    uCGnew[j] += buf[m++];
+    h_uCG[j] += buf[m++];
+    h_uCGnew[j] += buf[m++];
   }
 }
 
diff --git a/src/KOKKOS/fix_eos_table_rx_kokkos.h b/src/KOKKOS/fix_eos_table_rx_kokkos.h
index 3b9a00afe2..d4a5094ae0 100644
--- a/src/KOKKOS/fix_eos_table_rx_kokkos.h
+++ b/src/KOKKOS/fix_eos_table_rx_kokkos.h
@@ -105,12 +105,15 @@ class FixEOStableRXKokkos : public FixEOStableRX {
 
   int **tabindex;
 
+  double boltz;
+
   void allocate();
   void error_check();
   int update_table;
   void create_kokkos_tables();
 
-  //double *dHf;
+  DAT::tdual_float_1d k_dHf;
+  typename AT::t_float_1d d_dHf;
 
   typename AT::t_int_1d mask;
   typename AT::t_efloat_1d uCond,uMech,uChem,uCG,uCGnew,rho,dpdTheta,duChem;
@@ -124,7 +127,6 @@ class FixEOStableRXKokkos : public FixEOStableRX {
   int pack_forward_comm(int , int *, double *, int, int *);
   void unpack_forward_comm(int , int , double *);
 
-  //int *eosSpecies;
   };
 }
 
diff --git a/src/KOKKOS/pair_exp6_rx_kokkos.cpp b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
index e6b8a80f44..a2c70ca115 100644
--- a/src/KOKKOS/pair_exp6_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
@@ -645,7 +645,7 @@ void PairExp6rxKokkos<DeviceType>::read_file(char *file)
   int params_per_line = 5;
   char **words = new char*[params_per_line+1];
 
-  memory->sfree(params);
+  memory->destroy_kokkos(k_params,params);
   params = NULL;
   nparams = maxparam = 0;
 
@@ -723,6 +723,7 @@ void PairExp6rxKokkos<DeviceType>::read_file(char *file)
     // load up parameter settings and error check their values
 
     if (nparams == maxparam) {
+      k_params.template modify<LMPHostType>();
       maxparam += DELTA;
       memory->grow_kokkos(k_params,params,maxparam,
                           "pair:params");
@@ -823,7 +824,7 @@ void PairExp6rxKokkos<DeviceType>::getParamsEXP6(int id,double &epsilon1,double
     nTotal += dvector(ispecies,id);
     nTotal_old += dvector(ispecies+nspecies,id);
 
-    iparam = mol2param[ispecies];
+    iparam = d_mol2param[ispecies];
 
     if (iparam < 0 || d_params[iparam].potentialType != exp6PotentialType ) continue;
     if (isOneFluidApprox(isite1) || isOneFluidApprox(isite2)) {
@@ -840,7 +841,7 @@ void PairExp6rxKokkos<DeviceType>::getParamsEXP6(int id,double &epsilon1,double
   fractionOFA = nTotalOFA / nTotal;
 
   for (int ispecies = 0; ispecies < nspecies; ispecies++) {
-    iparam = mol2param[ispecies];
+    iparam = d_mol2param[ispecies];
     if (iparam < 0 || d_params[iparam].potentialType != exp6PotentialType ) continue;
 
     // If Site1 matches a pure species, then grab the parameters
@@ -881,7 +882,7 @@ void PairExp6rxKokkos<DeviceType>::getParamsEXP6(int id,double &epsilon1,double
       xMolei_old = dvector(ispecies+nspecies,id)/nTotalOFA_old;
 
       for (int jspecies = 0; jspecies < nspecies; jspecies++) {
-        jparam = mol2param[jspecies];
+        jparam = d_mol2param[jspecies];
         if (jparam < 0 || d_params[jparam].potentialType != exp6PotentialType ) continue;
         if (isite1 == d_params[jparam].ispecies || isite2 == d_params[jparam].ispecies) continue;
         rmj = d_params[jparam].rm;

From 73326922d67c756653f4607222b63ac1b5378139 Mon Sep 17 00:00:00 2001
From: Stan Moore <stanmoore1@gmail.com>
Date: Wed, 21 Dec 2016 08:56:48 -0700
Subject: [PATCH 027/267] Fixing Kokkos issue in fix_eos_table_rx_kokkos

---
 src/KOKKOS/fix_eos_table_rx_kokkos.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/KOKKOS/fix_eos_table_rx_kokkos.cpp b/src/KOKKOS/fix_eos_table_rx_kokkos.cpp
index cf77e25ff4..c47923680c 100644
--- a/src/KOKKOS/fix_eos_table_rx_kokkos.cpp
+++ b/src/KOKKOS/fix_eos_table_rx_kokkos.cpp
@@ -80,7 +80,7 @@ void FixEOStableRXKokkos<DeviceType>::setup(int vflag)
   copymode = 1;
 
   int nlocal = atom->nlocal;
-  double boltz = force->boltz;
+  boltz = force->boltz;
   mask = atomKK->k_mask.view<DeviceType>();
   uCond = atomKK->k_uCond.view<DeviceType>();
   uMech = atomKK->k_uMech.view<DeviceType>();
@@ -139,7 +139,7 @@ void FixEOStableRXKokkos<DeviceType>::init()
   copymode = 1;
 
   int nlocal = atom->nlocal;
-  double boltz = force->boltz;
+  boltz = force->boltz;
   mask = atomKK->k_mask.view<DeviceType>();
   uCond = atomKK->k_uCond.view<DeviceType>();
   uMech = atomKK->k_uMech.view<DeviceType>();
@@ -187,7 +187,7 @@ void FixEOStableRXKokkos<DeviceType>::post_integrate()
   copymode = 1;
 
   int nlocal = atom->nlocal;
-  double boltz = force->boltz;
+  boltz = force->boltz;
   mask = atomKK->k_mask.view<DeviceType>();
   uCond = atomKK->k_uCond.view<DeviceType>();
   uMech = atomKK->k_uMech.view<DeviceType>();
@@ -226,7 +226,7 @@ void FixEOStableRXKokkos<DeviceType>::end_of_step()
   copymode = 1;
 
   int nlocal = atom->nlocal;
-  double boltz = force->boltz;
+  boltz = force->boltz;
   mask = atomKK->k_mask.view<DeviceType>();
   uCond = atomKK->k_uCond.view<DeviceType>();
   uMech = atomKK->k_uMech.view<DeviceType>();

From 807d9529da32778eb6dbc2216f6a72b1e1325d3c Mon Sep 17 00:00:00 2001
From: Stan Moore <stanmoore1@gmail.com>
Date: Wed, 21 Dec 2016 10:41:29 -0700
Subject: [PATCH 028/267] Fixing issues found during GPU runtime testing

---
 lib/kokkos/algorithms/src/Kokkos_Random.hpp |  8 +++---
 src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp   | 29 ++++++++++++---------
 src/KOKKOS/pair_dpd_fdt_energy_kokkos.h     |  3 ++-
 src/KOKKOS/pair_exp6_rx_kokkos.cpp          |  5 +---
 src/KOKKOS/pair_exp6_rx_kokkos.h            |  1 -
 5 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/lib/kokkos/algorithms/src/Kokkos_Random.hpp b/lib/kokkos/algorithms/src/Kokkos_Random.hpp
index d54abeceb0..afe6b54e90 100644
--- a/lib/kokkos/algorithms/src/Kokkos_Random.hpp
+++ b/lib/kokkos/algorithms/src/Kokkos_Random.hpp
@@ -910,8 +910,8 @@ namespace Kokkos {
       double S = 2.0;
       double U;
       while(S>=1.0) {
-        U = drand();
-        const double V = drand();
+        U = 2.0*drand() - 1.0;
+        const double V = 2.0*drand() - 1.0;
         S = U*U+V*V;
       }
       return U*sqrt(-2.0*log(S)/S);
@@ -1163,8 +1163,8 @@ namespace Kokkos {
       double S = 2.0;
       double U;
       while(S>=1.0) {
-        U = drand();
-        const double V = drand();
+        U = 2.0*drand() - 1.0;
+        const double V = 2.0*drand() - 1.0;
         S = U*U+V*V;
       }
       return U*sqrt(-2.0*log(S)/S);
diff --git a/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp
index 3b49f43246..310f4689cb 100644
--- a/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp
+++ b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp
@@ -138,10 +138,14 @@ void PairDPDfdtEnergyKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
 
   k_cutsq.template sync<DeviceType>();
   k_params.template sync<DeviceType>();
-  atomKK->sync(execution_space,X_MASK | F_MASK | TYPE_MASK | ENERGY_MASK | VIRIAL_MASK | DPDTHETA_MASK | RMASS_MASK);
+  atomKK->sync(execution_space,X_MASK | F_MASK | TYPE_MASK | ENERGY_MASK | VIRIAL_MASK);
   if (evflag) atomKK->modified(execution_space,F_MASK | ENERGY_MASK | VIRIAL_MASK);
-  else atomKK->modified(execution_space,F_MASK | UCG_MASK | UCGNEW_MASK);
-  atomKK->k_mass.sync<DeviceType>();
+  else atomKK->modified(execution_space,F_MASK);
+
+  special_lj[0] = force->special_lj[0];
+  special_lj[1] = force->special_lj[1];
+  special_lj[2] = force->special_lj[2];
+  special_lj[3] = force->special_lj[3];
 
   nlocal = atom->nlocal;
   int nghost = atom->nghost;
@@ -155,6 +159,7 @@ void PairDPDfdtEnergyKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
   d_ilist = k_list->d_ilist;
 
   boltz = force->boltz;
+  ftm2v = force->ftm2v;
 
   int STACKPARAMS = 0; // optimize
 
@@ -195,7 +200,8 @@ void PairDPDfdtEnergyKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
     h_duMech = k_duMech.h_view;
     Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyZero>(0,nlocal+nghost),*this);
 
-    atomKK->sync(execution_space,V_MASK);
+    atomKK->sync(execution_space,V_MASK | DPDTHETA_MASK | RMASS_MASK);
+    atomKK->k_mass.sync<DeviceType>();
 
     // loop over neighbors of my atoms
 
@@ -219,15 +225,12 @@ void PairDPDfdtEnergyKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
 
     // Communicate the ghost delta energies to the locally owned atoms
 
+    // this memory transfer can be removed when fix_dpd_fdt_energy_kokkos is added
     k_duCond.template modify<DeviceType>();
     k_duCond.template sync<LMPHostType>();
     k_duMech.template modify<DeviceType>();
     k_duMech.template sync<LMPHostType>();
     comm->reverse_comm_pair(this);
-    //k_duCond.template modify<LMPHostType>();
-    //k_duCond.template sync<DeviceType>();
-    //k_duMech.template modify<LMPHostType>();
-    //k_duMech.template sync<DeviceType>();
   }
 
   if (eflag_global) eng_vdwl += ev.evdwl;
@@ -335,7 +338,7 @@ void PairDPDfdtEnergyKokkos<DeviceType>::operator()(TagPairDPDfdtEnergyComputeSp
         ev.evdwl += evdwl;
       }
 
-        if (EVFLAG) this->template ev_tally<NEIGHFLAG,NEWTON_PAIR>(ev,i,j,evdwl,fpair,delx,dely,delz);
+      if (EVFLAG) this->template ev_tally<NEIGHFLAG,NEWTON_PAIR>(ev,i,j,evdwl,fpair,delx,dely,delz);
     }
   }
 
@@ -437,9 +440,9 @@ void PairDPDfdtEnergyKokkos<DeviceType>::operator()(TagPairDPDfdtEnergyComputeNo
       fy_i += dely*fpair;
       fz_i += delz*fpair;
       if (NEWTON_PAIR || j < nlocal) {
-        f(j,0) -= delx*fpair;
-        f(j,1) -= dely*fpair;
-        f(j,2) -= delz*fpair;
+        a_f(j,0) -= delx*fpair;
+        a_f(j,1) -= dely*fpair;
+        a_f(j,2) -= delz*fpair;
       }
 
       if (rmass) {
@@ -454,7 +457,7 @@ void PairDPDfdtEnergyKokkos<DeviceType>::operator()(TagPairDPDfdtEnergyComputeNo
 
       // Compute the mechanical and conductive energy, uMech and uCond
       mu_ij = massinv_i + massinv_j;
-      mu_ij *= force->ftm2v;
+      mu_ij *= ftm2v;
 
       uTmp = gamma_ij*wd*rinv*rinv*dot*dot
              - 0.5*sigma_ij*sigma_ij*mu_ij*wd;
diff --git a/src/KOKKOS/pair_dpd_fdt_energy_kokkos.h b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.h
index 67fa315721..8e7d01de2a 100644
--- a/src/KOKKOS/pair_dpd_fdt_energy_kokkos.h
+++ b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.h
@@ -89,7 +89,8 @@ class PairDPDfdtEnergyKokkos : public PairDPDfdtEnergy {
   int nlocal,neighflag;
   int STACKPARAMS;
   double dtinvsqrt;
-  double boltz;
+  double boltz,ftm2v;
+  double special_lj[4];
 
   virtual void allocate();
 
diff --git a/src/KOKKOS/pair_exp6_rx_kokkos.cpp b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
index a2c70ca115..8ab7d62324 100644
--- a/src/KOKKOS/pair_exp6_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
@@ -148,10 +148,6 @@ void PairExp6rxKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
   special_lj[1] = force->special_lj[1];
   special_lj[2] = force->special_lj[2];
   special_lj[3] = force->special_lj[3];
-  special_coul[0] = force->special_coul[0];
-  special_coul[1] = force->special_coul[1];
-  special_coul[2] = force->special_coul[2];
-  special_coul[3] = force->special_coul[3];
   newton_pair = force->newton_pair;
 
   atomKK->sync(execution_space,X_MASK | F_MASK | TYPE_MASK | ENERGY_MASK | VIRIAL_MASK | UCG_MASK | UCGNEW_MASK | DVECTOR_MASK);
@@ -595,6 +591,7 @@ void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxCompute<NEIGHFLAG,NEW
         if (newton_pair || j < nlocal)
           a_uCGnew[j] += 0.5*evdwl;
         evdwl = evdwlOld;
+        ev.evdwl += evdwl;
         //if (vflag_either || eflag_atom) 
         if (EVFLAG) this->template ev_tally<NEIGHFLAG,NEWTON_PAIR>(ev,i,j,evdwl,fpair,delx,dely,delz);
       }
diff --git a/src/KOKKOS/pair_exp6_rx_kokkos.h b/src/KOKKOS/pair_exp6_rx_kokkos.h
index 366cf99d75..7dfe20fc22 100644
--- a/src/KOKKOS/pair_exp6_rx_kokkos.h
+++ b/src/KOKKOS/pair_exp6_rx_kokkos.h
@@ -89,7 +89,6 @@ class PairExp6rxKokkos : public PairExp6rx {
  protected:
   int eflag,vflag;
   int nlocal,newton_pair,neighflag;
-  double special_coul[4];
   double special_lj[4];
 
   typename AT::t_x_array_randomread x;

From 163b61a32eff89364b99ca0e9dbde28364a65ded Mon Sep 17 00:00:00 2001
From: Stan Moore <stanmoore1@gmail.com>
Date: Wed, 21 Dec 2016 15:37:00 -0700
Subject: [PATCH 029/267] Removing pair_table_rx_kokkos from Kokkos Install.sh
 since it isn't ready for runtime testing

---
 src/KOKKOS/Install.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/KOKKOS/Install.sh b/src/KOKKOS/Install.sh
index 94be32cc32..17e9f93c9d 100644
--- a/src/KOKKOS/Install.sh
+++ b/src/KOKKOS/Install.sh
@@ -190,8 +190,8 @@ action pair_vashishta_kokkos.cpp pair_vashishta.cpp
 action pair_vashishta_kokkos.h pair_vashishta.h
 action pair_table_kokkos.cpp
 action pair_table_kokkos.h
-action pair_table_rx_kokkos.cpp pair_table_rx.cpp
-action pair_table_rx_kokkos.h pair_table_rx.h  
+#action pair_table_rx_kokkos.cpp pair_table_rx.cpp
+#action pair_table_rx_kokkos.h pair_table_rx.h  
 action pair_tersoff_kokkos.cpp pair_tersoff.cpp
 action pair_tersoff_kokkos.h pair_tersoff.h
 action pair_tersoff_mod_kokkos.cpp pair_tersoff_mod.cpp

From f6fe61196da6acb067c321433dcfa31dba8fc39e Mon Sep 17 00:00:00 2001
From: Stan Moore <stanmoore1@gmail.com>
Date: Thu, 22 Dec 2016 11:34:17 -0700
Subject: [PATCH 030/267] CPU runtime tested version of
 pair_multi_lucy_rx_kokkos

---
 src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp |  9 ++-
 src/KOKKOS/pair_exp6_rx_kokkos.cpp        | 11 ++--
 src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp  | 70 ++++++++++-------------
 src/KOKKOS/pair_multi_lucy_rx_kokkos.h    | 30 ++++------
 src/USER-DPD/pair_multi_lucy_rx.h         | 12 ++--
 5 files changed, 61 insertions(+), 71 deletions(-)

diff --git a/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp
index 310f4689cb..133d366fbc 100644
--- a/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp
+++ b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp
@@ -58,6 +58,9 @@ PairDPDfdtEnergyKokkos<DeviceType>::~PairDPDfdtEnergyKokkos()
 {
   if (copymode) return;
 
+  memory->destroy_kokkos(k_eatom,eatom);
+  memory->destroy_kokkos(k_vatom,vatom);
+
   if (allocated) {
     memory->destroy_kokkos(k_duCond,duCond);
     memory->destroy_kokkos(k_duMech,duMech);
@@ -335,7 +338,8 @@ void PairDPDfdtEnergyKokkos<DeviceType>::operator()(TagPairDPDfdtEnergyComputeSp
         // eng shifted to 0.0 at cutoff
         evdwl = 0.5*a0_ij*cut_ij * wd;
         evdwl *= factor_dpd;
-        ev.evdwl += evdwl;
+        if (EVFLAG)
+          ev.evdwl += ((NEWTON_PAIR||(j<nlocal))?1.0:0.5)*evdwl;
       }
 
       if (EVFLAG) this->template ev_tally<NEIGHFLAG,NEWTON_PAIR>(ev,i,j,evdwl,fpair,delx,dely,delz);
@@ -489,7 +493,8 @@ void PairDPDfdtEnergyKokkos<DeviceType>::operator()(TagPairDPDfdtEnergyComputeNo
         // eng shifted to 0.0 at cutoff
         evdwl = 0.5*a0_ij*cut_ij * wd;
         evdwl *= factor_dpd;
-        ev.evdwl += evdwl;
+        if (EVFLAG)
+          ev.evdwl += ((NEWTON_PAIR||(j<nlocal))?1.0:0.5)*evdwl;
       }
 
       if (EVFLAG) this->template ev_tally<NEIGHFLAG,NEWTON_PAIR>(ev,i,j,evdwl,fpair,delx,dely,delz);
diff --git a/src/KOKKOS/pair_exp6_rx_kokkos.cpp b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
index 8ab7d62324..559948067d 100644
--- a/src/KOKKOS/pair_exp6_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
@@ -153,6 +153,7 @@ void PairExp6rxKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
   atomKK->sync(execution_space,X_MASK | F_MASK | TYPE_MASK | ENERGY_MASK | VIRIAL_MASK | UCG_MASK | UCGNEW_MASK | DVECTOR_MASK);
   if (evflag) atomKK->modified(execution_space,F_MASK | ENERGY_MASK | VIRIAL_MASK | UCG_MASK | UCGNEW_MASK);
   else atomKK->modified(execution_space,F_MASK | UCG_MASK | UCGNEW_MASK);
+  k_cutsq.template sync<DeviceType>();
 
   // Initialize the Exp6 parameter data for both the local
   // and ghost atoms. Make the parameter data persistent
@@ -495,7 +496,7 @@ void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxCompute<NEIGHFLAG,NEW
         evdwlOld *= factor_lj;
 
         uCG_i += 0.5*evdwlOld;
-        if (newton_pair || j < nlocal)
+        if (NEWTON_PAIR || j < nlocal)
           a_uCG[j] += 0.5*evdwlOld;
       }
 
@@ -577,7 +578,7 @@ void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxCompute<NEIGHFLAG,NEW
         fx_i += delx*fpair;
         fy_i += dely*fpair;
         fz_i += delz*fpair;
-        if (newton_pair || j < nlocal) {
+        if (NEWTON_PAIR || j < nlocal) {
           a_f(j,0) -= delx*fpair;
           a_f(j,1) -= dely*fpair;
           a_f(j,2) -= delz*fpair;
@@ -588,10 +589,11 @@ void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxCompute<NEIGHFLAG,NEW
         evdwl *= factor_lj;
 
         uCGnew_i   += 0.5*evdwl;
-        if (newton_pair || j < nlocal)
+        if (NEWTON_PAIR || j < nlocal)
           a_uCGnew[j] += 0.5*evdwl;
         evdwl = evdwlOld;
-        ev.evdwl += evdwl;
+        if (EVFLAG)
+          ev.evdwl += ((NEWTON_PAIR||(j<nlocal))?1.0:0.5)*evdwl;
         //if (vflag_either || eflag_atom) 
         if (EVFLAG) this->template ev_tally<NEIGHFLAG,NEWTON_PAIR>(ev,i,j,evdwl,fpair,delx,dely,delz);
       }
@@ -630,6 +632,7 @@ void PairExp6rxKokkos<DeviceType>::allocate()
 
   memory->create_kokkos(k_cutsq,cutsq,n+1,n+1,"pair:cutsq");
   d_cutsq = k_cutsq.template view<DeviceType>();
+  k_cutsq.template modify<LMPHostType>();
 
   memory->create(cut,n+1,n+1,"pair:cut_lj");
 }
diff --git a/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp b/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp
index 76337b5219..1dc8ccbae9 100644
--- a/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp
@@ -71,7 +71,7 @@ PairMultiLucyRXKokkos<DeviceType>::PairMultiLucyRXKokkos(LAMMPS *lmp) : PairMult
   datamask_read = EMPTY_MASK;
   datamask_modify = EMPTY_MASK;
 
-  update_table = 0;
+  update_table = 1;
   h_table = new TableHost();
   d_table = new TableDevice();
 
@@ -85,8 +85,14 @@ PairMultiLucyRXKokkos<DeviceType>::~PairMultiLucyRXKokkos()
 {
   if (copymode) return;
 
+  memory->destroy_kokkos(k_eatom,eatom);
+  memory->destroy_kokkos(k_vatom,vatom);
+
+  memory->destroy_kokkos(k_cutsq,cutsq);
+
   delete h_table;
   delete d_table;
+  tabindex = NULL;
 }
 
 /* ---------------------------------------------------------------------- */
@@ -123,6 +129,8 @@ void PairMultiLucyRXKokkos<DeviceType>::init_style()
 template<class DeviceType>
 void PairMultiLucyRXKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
 {
+  copymode = 1;
+
   if (update_table)
     create_kokkos_tables();
 
@@ -130,6 +138,8 @@ void PairMultiLucyRXKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
     compute_style<LOOKUP>(eflag_in,vflag_in);
   else if(tabstyle == LINEAR)
     compute_style<LINEAR>(eflag_in,vflag_in);
+
+  copymode = 0;
 }
 
 /* ---------------------------------------------------------------------- */
@@ -138,15 +148,9 @@ template<class DeviceType>
 template<int TABSTYLE>
 void PairMultiLucyRXKokkos<DeviceType>::compute_style(int eflag_in, int vflag_in)
 {
-  copymode = 1;
-
   eflag = eflag_in;
   vflag = vflag_in;
 
-  double evdwl,evdwlOld;
-
-  evdwlOld = 0.0;
-  evdwl = 0.0;
   if (neighflag == FULL) no_virial_fdotr_compute = 1;
   if (eflag || vflag) ev_setup(eflag,vflag);
   else evflag = vflag_fdotr = 0;
@@ -175,6 +179,7 @@ void PairMultiLucyRXKokkos<DeviceType>::compute_style(int eflag_in, int vflag_in
   atomKK->sync(execution_space,X_MASK | F_MASK | TYPE_MASK | ENERGY_MASK | VIRIAL_MASK | DPDRHO_MASK | UCG_MASK | UCGNEW_MASK | DVECTOR_MASK);
   if (evflag) atomKK->modified(execution_space,F_MASK | ENERGY_MASK | VIRIAL_MASK | UCG_MASK | UCGNEW_MASK);
   else atomKK->modified(execution_space,F_MASK | UCG_MASK | UCGNEW_MASK);
+  k_cutsq.template sync<DeviceType>();
 
   nlocal = atom->nlocal;
   int nghost = atom->nghost;
@@ -250,8 +255,6 @@ void PairMultiLucyRXKokkos<DeviceType>::compute_style(int eflag_in, int vflag_in
     k_vatom.template modify<DeviceType>();
     k_vatom.template sync<LMPHostType>();
   }
-
-  copymode = 0;
 }
 
 template<class DeviceType>
@@ -316,10 +319,12 @@ void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXCompute<NEI
 
       //tb = &tables[tabindex[itype][jtype]];
       const int tidx = d_table_const.tabindex(itype,jtype);
+
       //if (rho[i]*rho[i] < tb->innersq || rho[j]*rho[j] < tb->innersq){
       if (rho[i]*rho[i] < d_table_const.innersq(tidx) || rho[j]*rho[j] < d_table_const.innersq(tidx)){
         k_error_flag.d_view() = 1;
       }
+
       if (TABSTYLE == LOOKUP) {
         //itable = static_cast<int> (((rho[i]*rho[i]) - tb->innersq) * tb->invdelta);
         itable = static_cast<int> (((rho[i]*rho[i]) - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx));
@@ -338,6 +343,7 @@ void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXCompute<NEI
         fpair /= sqrt(rsq);
 
       } else if (TABSTYLE == LINEAR) {
+
         //itable = static_cast<int> ((rho[i]*rho[i] - tb->innersq) * tb->invdelta);
         itable = static_cast<int> ((rho[i]*rho[i] - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx));
         //jtable = static_cast<int> (((rho[j]*rho[j]) - tb->innersq) * tb->invdelta);
@@ -395,8 +401,9 @@ void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXCompute<NEI
   //itable = static_cast<int> (((rho[i]*rho[i]) - tb->innersq) * tb->invdelta);
   itable = static_cast<int> (((rho[i]*rho[i]) - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx));
   //if (TABSTYLE == LOOKUP) evdwl = tb->e[itable];
-  if (TABSTYLE == LOOKUP) evdwl = d_table_const.e(tidx,itable);
-  else if (TABSTYLE == LINEAR){
+  if (TABSTYLE == LOOKUP) {
+    evdwl = d_table_const.e(tidx,itable);
+  } else if (TABSTYLE == LINEAR) {
     if (itable >= tlm1){
       k_error_flag.d_view() = 2;
     }
@@ -404,7 +411,7 @@ void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXCompute<NEI
     //else fraction_i = (((rho[i]*rho[i]) - tb->rsq[itable]) * tb->invdelta);
     else fraction_i = (((rho[i]*rho[i]) - d_table_const.rsq(tidx,itable)) * d_table_const.invdelta(tidx));
     //evdwl = tb->e[itable] + fraction_i*tb->de[itable];
-    evdwl = d_table_const.e(tidx,itable); + fraction_i*d_table_const.de(tidx,itable);
+    evdwl = d_table_const.e(tidx,itable) + fraction_i*d_table_const.de(tidx,itable);
   } else k_error_flag.d_view() = 3;
 
   evdwl *=(pi*d_cutsq(itype,itype)*d_cutsq(itype,itype))/84.0;
@@ -417,7 +424,8 @@ void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXCompute<NEI
   evdwl = evdwlOld;
 
   //if (evflag) ev_tally(0,0,nlocal,newton_pair,evdwl,0.0,0.0,0.0,0.0,0.0);
-  if (EVFLAG) ev.evdwl += evdwl;
+  if (EVFLAG)
+    ev.evdwl += ((NEWTON_PAIR||(j<nlocal))?1.0:0.5)*evdwl;
 }
 
 template<class DeviceType>
@@ -433,8 +441,6 @@ void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXCompute<NEI
 template<class DeviceType>
 void PairMultiLucyRXKokkos<DeviceType>::computeLocalDensity()
 {
-  copymode = 1;
-
   x = atomKK->k_x.view<DeviceType>();
   type = atomKK->k_type.view<DeviceType>();
   rho = atomKK->k_rho.view<DeviceType>();
@@ -491,8 +497,6 @@ void PairMultiLucyRXKokkos<DeviceType>::computeLocalDensity()
   }
 
   comm->forward_comm_pair(this);
-
-  copymode = 0;
 }
 
 template<class DeviceType>
@@ -506,6 +510,7 @@ template<int NEIGHFLAG, int NEWTON_PAIR>
 KOKKOS_INLINE_FUNCTION
 void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXComputeLocalDensity<NEIGHFLAG,NEWTON_PAIR>, const int &ii) const {
 
+
   // The rho array is atomic for Half/Thread neighbor style
   Kokkos::View<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_rho = rho;
 
@@ -565,6 +570,7 @@ void PairMultiLucyRXKokkos<DeviceType>::getParams(int id, double &fractionOld1,
   double fractionOld, fraction;
   double nTotal, nTotalOld;
 
+
   nTotal = 0.0;
   nTotalOld = 0.0;
   for (int ispecies = 0; ispecies < nspecies; ispecies++){
@@ -796,7 +802,6 @@ void PairMultiLucyRXKokkos<DeviceType>::create_kokkos_tables()
 
   memory->create_kokkos(d_table->innersq,h_table->innersq,ntables,"Table::innersq");
   memory->create_kokkos(d_table->invdelta,h_table->invdelta,ntables,"Table::invdelta");
-  memory->create_kokkos(d_table->deltasq6,h_table->deltasq6,ntables,"Table::deltasq6");
 
   if(tabstyle == LOOKUP) {
     memory->create_kokkos(d_table->e,h_table->e,ntables,tlm1,"Table::e");
@@ -816,12 +821,9 @@ void PairMultiLucyRXKokkos<DeviceType>::create_kokkos_tables()
 
     h_table->innersq[i] = tb->innersq;
     h_table->invdelta[i] = tb->invdelta;
-    h_table->deltasq6[i] = tb->deltasq6;
 
     for(int j = 0; j<h_table->rsq.dimension_1(); j++)
       h_table->rsq(i,j) = tb->rsq[j];
-    for(int j = 0; j<h_table->drsq.dimension_1(); j++)
-      h_table->drsq(i,j) = tb->drsq[j];
     for(int j = 0; j<h_table->e.dimension_1(); j++)
       h_table->e(i,j) = tb->e[j];
     for(int j = 0; j<h_table->de.dimension_1(); j++)
@@ -830,40 +832,26 @@ void PairMultiLucyRXKokkos<DeviceType>::create_kokkos_tables()
       h_table->f(i,j) = tb->f[j];
     for(int j = 0; j<h_table->df.dimension_1(); j++)
       h_table->df(i,j) = tb->df[j];
-    for(int j = 0; j<h_table->e2.dimension_1(); j++)
-      h_table->e2(i,j) = tb->e2[j];
-    for(int j = 0; j<h_table->f2.dimension_1(); j++)
-      h_table->f2(i,j) = tb->f2[j];
   }
 
 
   Kokkos::deep_copy(d_table->innersq,h_table->innersq);
   Kokkos::deep_copy(d_table->invdelta,h_table->invdelta);
-  Kokkos::deep_copy(d_table->deltasq6,h_table->deltasq6);
   Kokkos::deep_copy(d_table->rsq,h_table->rsq);
-  Kokkos::deep_copy(d_table->drsq,h_table->drsq);
   Kokkos::deep_copy(d_table->e,h_table->e);
   Kokkos::deep_copy(d_table->de,h_table->de);
   Kokkos::deep_copy(d_table->f,h_table->f);
   Kokkos::deep_copy(d_table->df,h_table->df);
-  Kokkos::deep_copy(d_table->e2,h_table->e2);
-  Kokkos::deep_copy(d_table->f2,h_table->f2);
   Kokkos::deep_copy(d_table->tabindex,h_table->tabindex);
 
   d_table_const.innersq = d_table->innersq;
   d_table_const.invdelta = d_table->invdelta;
-  d_table_const.deltasq6 = d_table->deltasq6;
   d_table_const.rsq = d_table->rsq;
-  d_table_const.drsq = d_table->drsq;
   d_table_const.e = d_table->e;
   d_table_const.de = d_table->de;
   d_table_const.f = d_table->f;
   d_table_const.df = d_table->df;
-  d_table_const.e2 = d_table->e2;
-  d_table_const.f2 = d_table->f2;
 
-
-  Kokkos::deep_copy(d_table->cutsq,h_table->cutsq);
   update_table = 0;
 }
 
@@ -878,11 +866,14 @@ void PairMultiLucyRXKokkos<DeviceType>::allocate()
   const int nt = atom->ntypes + 1;
 
   memory->create(setflag,nt,nt,"pair:setflag");
-  memory->create_kokkos(d_table->cutsq,h_table->cutsq,cutsq,nt,nt,"pair:cutsq");
-  memory->create_kokkos(d_table->tabindex,h_table->tabindex,tabindex,nt,nt,"pair:tabindex");
 
-  d_table_const.cutsq = d_table->cutsq;
+  memory->create_kokkos(k_cutsq,cutsq,nt,nt,"pair:cutsq");
+  d_cutsq = k_cutsq.template view<DeviceType>();
+  k_cutsq.template modify<LMPHostType>();
+
+  memory->create_kokkos(d_table->tabindex,h_table->tabindex,tabindex,nt,nt,"pair:tabindex");
   d_table_const.tabindex = d_table->tabindex;
+
   memset(&setflag[0][0],0,nt*nt*sizeof(int));
   memset(&cutsq[0][0],0,nt*nt*sizeof(double));
   memset(&tabindex[0][0],0,nt*nt*sizeof(int));
@@ -916,9 +907,6 @@ void PairMultiLucyRXKokkos<DeviceType>::settings(int narg, char **arg)
 
     d_table_const.tabindex = d_table->tabindex = typename ArrayTypes<DeviceType>::t_int_2d();
     h_table->tabindex = typename ArrayTypes<LMPHostType>::t_int_2d();
-
-    d_table_const.cutsq = d_table->cutsq = typename ArrayTypes<DeviceType>::t_ffloat_2d();
-    h_table->cutsq = typename ArrayTypes<LMPHostType>::t_ffloat_2d();
   }
   allocated = 0;
 
diff --git a/src/KOKKOS/pair_multi_lucy_rx_kokkos.h b/src/KOKKOS/pair_multi_lucy_rx_kokkos.h
index b205f00796..a6622ac4ec 100644
--- a/src/KOKKOS/pair_multi_lucy_rx_kokkos.h
+++ b/src/KOKKOS/pair_multi_lucy_rx_kokkos.h
@@ -120,44 +120,38 @@ class PairMultiLucyRXKokkos : public PairMultiLucyRX {
   //};
 
   /*struct TableDeviceConst {
-    typename ArrayTypes<DeviceType>::t_ffloat_2d_randomread cutsq;
-    typename ArrayTypes<DeviceType>::t_int_2d_randomread tabindex;
-    typename ArrayTypes<DeviceType>::t_ffloat_1d_randomread innersq,invdelta,deltasq6;
-    typename ArrayTypes<DeviceType>::t_ffloat_2d_randomread rsq,drsq,e,de,f,df,e2,f2;
+    typename AT::t_int_2d_randomread tabindex;
+    typename AT::t_ffloat_1d_randomread innersq,invdelta;
+    typename AT::t_ffloat_2d_randomread rsq,e,de,f,df;
   };*/
  //Its faster not to use texture fetch if the number of tables is less than 32!
   struct TableDeviceConst {
-    typename ArrayTypes<DeviceType>::t_ffloat_2d cutsq;
-    typename ArrayTypes<DeviceType>::t_int_2d tabindex;
-    typename ArrayTypes<DeviceType>::t_ffloat_1d innersq,invdelta,deltasq6;
-    typename ArrayTypes<DeviceType>::t_ffloat_2d_randomread rsq,drsq,e,de,f,df,e2,f2;
+    typename AT::t_int_2d tabindex;
+    typename AT::t_ffloat_1d innersq,invdelta;
+    typename AT::t_ffloat_2d_randomread rsq,e,de,f,df;
   };
 
   struct TableDevice {
-    typename ArrayTypes<DeviceType>::t_ffloat_2d cutsq;
-    typename ArrayTypes<DeviceType>::t_int_2d tabindex;
-    typename ArrayTypes<DeviceType>::t_ffloat_1d innersq,invdelta,deltasq6;
-    typename ArrayTypes<DeviceType>::t_ffloat_2d rsq,drsq,e,de,f,df,e2,f2;
+    typename AT::t_int_2d tabindex;
+    typename AT::t_ffloat_1d innersq,invdelta;
+    typename AT::t_ffloat_2d rsq,e,de,f,df;
   };
 
   struct TableHost {
-    typename ArrayTypes<LMPHostType>::t_ffloat_2d cutsq;
-    typename ArrayTypes<LMPHostType>::t_int_2d tabindex;
-    typename ArrayTypes<LMPHostType>::t_ffloat_1d innersq,invdelta,deltasq6;
-    typename ArrayTypes<LMPHostType>::t_ffloat_2d rsq,drsq,e,de,f,df,e2,f2;
+    HAT::t_int_2d tabindex;
+    HAT::t_ffloat_1d innersq,invdelta;
+    HAT::t_ffloat_2d rsq,e,de,f,df;
   };
 
   TableDeviceConst d_table_const;
   TableDevice* d_table;
   TableHost* h_table;
 
-  int **tabindex;
   F_FLOAT m_cutsq[MAX_TYPES_STACKPARAMS+1][MAX_TYPES_STACKPARAMS+1];
 
   void allocate();
   int update_table;
   void create_kokkos_tables();
-  void cleanup_copy();
 
   KOKKOS_INLINE_FUNCTION
   void getParams(int, double &, double &, double &, double &) const;
diff --git a/src/USER-DPD/pair_multi_lucy_rx.h b/src/USER-DPD/pair_multi_lucy_rx.h
index 2913716c5a..0562739c50 100644
--- a/src/USER-DPD/pair_multi_lucy_rx.h
+++ b/src/USER-DPD/pair_multi_lucy_rx.h
@@ -30,17 +30,17 @@ class PairMultiLucyRX : public Pair {
   virtual ~PairMultiLucyRX();
 
   virtual void compute(int, int);
-  void settings(int, char **);
+  virtual void settings(int, char **);
   void coeff(int, char **);
   double init_one(int, int);
   void write_restart(FILE *);
   void read_restart(FILE *);
   void write_restart_settings(FILE *);
   void read_restart_settings(FILE *);
-  int pack_forward_comm(int, int *, double *, int, int *);
-  void unpack_forward_comm(int, int, double *);
-  int pack_reverse_comm(int, int, double *);
-  void unpack_reverse_comm(int, int *, double *);
+  virtual int pack_forward_comm(int, int *, double *, int, int *);
+  virtual void unpack_forward_comm(int, int, double *);
+  virtual int pack_reverse_comm(int, int, double *);
+  virtual void unpack_reverse_comm(int, int *, double *);
   void computeLocalDensity();
   double rho_0;
 
@@ -64,7 +64,7 @@ class PairMultiLucyRX : public Pair {
 
   int **tabindex;
 
-  void allocate();
+  virtual void allocate();
   void read_table(Table *, char *, char *);
   void param_extract(Table *, char *);
   void bcast_table(Table *);

From a4ab877c4672b919ed5807864f5db726ef522926 Mon Sep 17 00:00:00 2001
From: Stan Moore <stanmoore1@gmail.com>
Date: Thu, 22 Dec 2016 13:16:57 -0700
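Subject: [PATCH 031/267] Change to allow pair_dpd_fdt_energy_kokkos

FixDPDenergy, FixRX and FixShardlow now retry force->pair_match() with
"dpd/fdt/energy/kk" when the lookup of "dpd/fdt/energy" returns NULL,
so that these fixes also accept the Kokkos variant of the pair style.
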
---
 src/USER-DPD/fix_dpd_energy.cpp | 2 ++
 src/USER-DPD/fix_rx.cpp         | 3 +++
 src/USER-DPD/fix_shardlow.cpp   | 2 ++
 3 files changed, 7 insertions(+)

diff --git a/src/USER-DPD/fix_dpd_energy.cpp b/src/USER-DPD/fix_dpd_energy.cpp
index 05907a5fcf..475e12f02f 100644
--- a/src/USER-DPD/fix_dpd_energy.cpp
+++ b/src/USER-DPD/fix_dpd_energy.cpp
@@ -34,6 +34,8 @@ FixDPDenergy::FixDPDenergy(LAMMPS *lmp, int narg, char **arg) :
 
   pairDPDE = NULL;
   pairDPDE = (PairDPDfdtEnergy *) force->pair_match("dpd/fdt/energy",1);
+  if (pairDPDE == NULL)
+    pairDPDE = (PairDPDfdtEnergy *) force->pair_match("dpd/fdt/energy/kk",1);
 
   if (pairDPDE == NULL)
     error->all(FLERR,"Must use pair_style dpd/fdt/energy with fix dpd/energy");
diff --git a/src/USER-DPD/fix_rx.cpp b/src/USER-DPD/fix_rx.cpp
index df67cf4035..0bd560b241 100644
--- a/src/USER-DPD/fix_rx.cpp
+++ b/src/USER-DPD/fix_rx.cpp
@@ -627,6 +627,9 @@ int FixRX::setmask()
 void FixRX::init()
 {
   pairDPDE = (PairDPDfdtEnergy *) force->pair_match("dpd/fdt/energy",1);
+  if (pairDPDE == NULL)
+    pairDPDE = (PairDPDfdtEnergy *) force->pair_match("dpd/fdt/energy/kk",1);
+
   if (pairDPDE == NULL)
     error->all(FLERR,"Must use pair_style dpd/fdt/energy with fix rx");
 
diff --git a/src/USER-DPD/fix_shardlow.cpp b/src/USER-DPD/fix_shardlow.cpp
index 28c5382237..541f4ba3c3 100644
--- a/src/USER-DPD/fix_shardlow.cpp
+++ b/src/USER-DPD/fix_shardlow.cpp
@@ -94,6 +94,8 @@ FixShardlow::FixShardlow(LAMMPS *lmp, int narg, char **arg) :
   pairDPDE = NULL;
   pairDPD = (PairDPDfdt *) force->pair_match("dpd/fdt",1);
   pairDPDE = (PairDPDfdtEnergy *) force->pair_match("dpd/fdt/energy",1);
+  if (pairDPDE == NULL)
+    pairDPDE = (PairDPDfdtEnergy *) force->pair_match("dpd/fdt/energy/kk",1);
 
   if(pairDPDE){
     comm_forward = 3;

From a36e563aa56e8e3a31f19f8b010301c3b170b941 Mon Sep 17 00:00:00 2001
From: Stan Moore <stanmoore1@gmail.com>
Date: Thu, 22 Dec 2016 14:37:42 -0700
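Subject: [PATCH 032/267] Temporarily reverting change to pair_table_kokkos to
 allow runtime testing

The PairTableKokkos constructor initializes Pair instead of PairTable
again, and pair_table_kokkos.cpp regains its own table-handling code,
including coeff(), read_table(), bcast_table(), spline_table(),
param_extract() and compute_table().
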
---
 src/KOKKOS/pair_table_kokkos.cpp | 758 ++++++++++++++++++++++++++++++-
 src/KOKKOS/pair_table_kokkos.h   |  44 +-
 2 files changed, 790 insertions(+), 12 deletions(-)

diff --git a/src/KOKKOS/pair_table_kokkos.cpp b/src/KOKKOS/pair_table_kokkos.cpp
index b8b647964c..5230d1a91f 100644
--- a/src/KOKKOS/pair_table_kokkos.cpp
+++ b/src/KOKKOS/pair_table_kokkos.cpp
@@ -12,7 +12,7 @@
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
-   Contributing author: Christian Trott (SNL)
+   Contributing author: Paul Crozier (SNL)
 ------------------------------------------------------------------------- */
 
 #include <mpi.h>
@@ -41,7 +41,7 @@ enum{FULL,HALFTHREAD,HALF};
 /* ---------------------------------------------------------------------- */
 
 template<class DeviceType>
-PairTableKokkos<DeviceType>::PairTableKokkos(LAMMPS *lmp) : PairTable(lmp)
+PairTableKokkos<DeviceType>::PairTableKokkos(LAMMPS *lmp) : Pair(lmp)
 {
   update_table = 0;
   atomKK = (AtomKokkos *) atom;
@@ -98,7 +98,6 @@ void PairTableKokkos<DeviceType>::compute_style(int eflag_in, int vflag_in)
 
   if (neighflag == FULL) no_virial_fdotr_compute = 1;
 
-
   if (eflag || vflag) ev_setup(eflag,vflag);
   else evflag = vflag_fdotr = 0;
 
@@ -196,7 +195,6 @@ compute_fpair(const F_FLOAT& rsq, const int& i, const int&j, const int& itype, c
   //if (rsq < d_table_const.innersq(tidx))
   //  error->one(FLERR,"Pair distance < table inner cutoff");
 
-
   if (Specialisation::TabStyle == LOOKUP) {
     const int itable = static_cast<int> ((rsq - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx));
     //if (itable >= tlm1)
@@ -314,6 +312,8 @@ void PairTableKokkos<DeviceType>::create_kokkos_tables()
     memory->create_kokkos(d_table->drsq,h_table->drsq,ntables,ntable,"Table::drsq");
   }
 
+
+
   for(int i=0; i < ntables; i++) {
     Table* tb = &tables[i];
 
@@ -451,6 +451,85 @@ void PairTableKokkos<DeviceType>::settings(int narg, char **arg)
   tables = NULL;
 }
 
+/* ----------------------------------------------------------------------
+   set coeffs for one or more type pairs
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairTableKokkos<DeviceType>::coeff(int narg, char **arg)
+{
+  if (narg != 4 && narg != 5) error->all(FLERR,"Illegal pair_coeff command");
+  if (!allocated) allocate();
+
+  int ilo,ihi,jlo,jhi;
+  force->bounds(FLERR,arg[0],atom->ntypes,ilo,ihi);
+  force->bounds(FLERR,arg[1],atom->ntypes,jlo,jhi);
+
+  int me;
+  MPI_Comm_rank(world,&me);
+  tables = (Table *)
+    memory->srealloc(tables,(ntables+1)*sizeof(Table),"pair:tables");
+  Table *tb = &tables[ntables];
+  null_table(tb);
+  if (me == 0) read_table(tb,arg[2],arg[3]);
+  bcast_table(tb);
+
+  // set table cutoff
+
+  if (narg == 5) tb->cut = force->numeric(FLERR,arg[4]);
+  else if (tb->rflag) tb->cut = tb->rhi;
+  else tb->cut = tb->rfile[tb->ninput-1];
+
+  // error check on table parameters
+  // insure cutoff is within table
+  // for BITMAP tables, file values can be in non-ascending order
+
+  if (tb->ninput <= 1) error->one(FLERR,"Invalid pair table length");
+  double rlo,rhi;
+  if (tb->rflag == 0) {
+    rlo = tb->rfile[0];
+    rhi = tb->rfile[tb->ninput-1];
+  } else {
+    rlo = tb->rlo;
+    rhi = tb->rhi;
+  }
+  if (tb->cut <= rlo || tb->cut > rhi)
+    error->all(FLERR,"Invalid pair table cutoff");
+  if (rlo <= 0.0) error->all(FLERR,"Invalid pair table cutoff");
+
+  // match = 1 if don't need to spline read-in tables
+  // this is only the case if r values needed by final tables
+  //   exactly match r values read from file
+  // for tabstyle SPLINE, always need to build spline tables
+
+  tb->match = 0;
+  if (tabstyle == LINEAR && tb->ninput == tablength &&
+      tb->rflag == RSQ && tb->rhi == tb->cut) tb->match = 1;
+  if (tabstyle == BITMAP && tb->ninput == 1 << tablength &&
+      tb->rflag == BMP && tb->rhi == tb->cut) tb->match = 1;
+  if (tb->rflag == BMP && tb->match == 0)
+    error->all(FLERR,"Bitmapped table in file does not match requested table");
+
+  // spline read-in values and compute r,e,f vectors within table
+
+  if (tb->match == 0) spline_table(tb);
+  compute_table(tb);
+
+  // store ptr to table in tabindex
+
+  int count = 0;
+  for (int i = ilo; i <= ihi; i++) {
+    for (int j = MAX(jlo,i); j <= jhi; j++) {
+      tabindex[i][j] = ntables;
+      setflag[i][j] = 1;
+      count++;
+    }
+  }
+
+  if (count == 0) error->all(FLERR,"Illegal pair_coeff command");
+  ntables++;
+}
+
 /* ----------------------------------------------------------------------
    init for one type pair i,j and corresponding j,i
 ------------------------------------------------------------------------- */
@@ -469,6 +548,677 @@ double PairTableKokkos<DeviceType>::init_one(int i, int j)
   return tables[tabindex[i][j]].cut;
 }
 
+/* ----------------------------------------------------------------------
+   read a table section from a tabulated potential file
+   only called by proc 0
+   this function sets these values in Table:
+     ninput,rfile,efile,ffile,rflag,rlo,rhi,fpflag,fplo,fphi,ntablebits
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairTableKokkos<DeviceType>::read_table(Table *tb, char *file, char *keyword)
+{
+  char line[MAXLINE];
+
+  // open file
+
+  FILE *fp = force->open_potential(file);
+  if (fp == NULL) {
+    char str[128];
+    sprintf(str,"Cannot open file %s",file);
+    error->one(FLERR,str);
+  }
+
+  // loop until section found with matching keyword
+
+  while (1) {
+    if (fgets(line,MAXLINE,fp) == NULL)
+      error->one(FLERR,"Did not find keyword in table file");
+    if (strspn(line," \t\n\r") == strlen(line)) continue;  // blank line
+    if (line[0] == '#') continue;                          // comment
+    char *word = strtok(line," \t\n\r");
+    if (strcmp(word,keyword) == 0) break;           // matching keyword
+    fgets(line,MAXLINE,fp);                         // no match, skip section
+    param_extract(tb,line);
+    fgets(line,MAXLINE,fp);
+    for (int i = 0; i < tb->ninput; i++) fgets(line,MAXLINE,fp);
+  }
+
+  // read args on 2nd line of section
+  // allocate table arrays for file values
+
+  fgets(line,MAXLINE,fp);
+  param_extract(tb,line);
+  memory->create(tb->rfile,tb->ninput,"pair:rfile");
+  memory->create(tb->efile,tb->ninput,"pair:efile");
+  memory->create(tb->ffile,tb->ninput,"pair:ffile");
+
+  // setup bitmap parameters for table to read in
+
+  tb->ntablebits = 0;
+  int masklo,maskhi,nmask,nshiftbits;
+  if (tb->rflag == BMP) {
+    while (1 << tb->ntablebits < tb->ninput) tb->ntablebits++;
+    if (1 << tb->ntablebits != tb->ninput)
+      error->one(FLERR,"Bitmapped table is incorrect length in table file");
+    init_bitmap(tb->rlo,tb->rhi,tb->ntablebits,masklo,maskhi,nmask,nshiftbits);
+  }
+
+  // read r,e,f table values from file
+  // if rflag set, compute r
+  // if rflag not set, use r from file
+
+  int itmp;
+  double rtmp;
+  union_int_float_t rsq_lookup;
+
+  fgets(line,MAXLINE,fp);
+  for (int i = 0; i < tb->ninput; i++) {
+    fgets(line,MAXLINE,fp);
+    sscanf(line,"%d %lg %lg %lg",&itmp,&rtmp,&tb->efile[i],&tb->ffile[i]);
+
+    if (tb->rflag == RLINEAR)
+      rtmp = tb->rlo + (tb->rhi - tb->rlo)*i/(tb->ninput-1);
+    else if (tb->rflag == RSQ) {
+      rtmp = tb->rlo*tb->rlo +
+        (tb->rhi*tb->rhi - tb->rlo*tb->rlo)*i/(tb->ninput-1);
+      rtmp = sqrt(rtmp);
+    } else if (tb->rflag == BMP) {
+      rsq_lookup.i = i << nshiftbits;
+      rsq_lookup.i |= masklo;
+      if (rsq_lookup.f < tb->rlo*tb->rlo) {
+        rsq_lookup.i = i << nshiftbits;
+        rsq_lookup.i |= maskhi;
+      }
+      rtmp = sqrtf(rsq_lookup.f);
+    }
+
+    tb->rfile[i] = rtmp;
+  }
+
+  // close file
+
+  fclose(fp);
+}
+
+/* ----------------------------------------------------------------------
+   broadcast read-in table info from proc 0 to other procs
+   this function communicates these values in Table:
+     ninput,rfile,efile,ffile,rflag,rlo,rhi,fpflag,fplo,fphi
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairTableKokkos<DeviceType>::bcast_table(Table *tb)
+{
+  MPI_Bcast(&tb->ninput,1,MPI_INT,0,world);
+
+  int me;
+  MPI_Comm_rank(world,&me);
+  if (me > 0) {
+    memory->create(tb->rfile,tb->ninput,"pair:rfile");
+    memory->create(tb->efile,tb->ninput,"pair:efile");
+    memory->create(tb->ffile,tb->ninput,"pair:ffile");
+  }
+
+  MPI_Bcast(tb->rfile,tb->ninput,MPI_DOUBLE,0,world);
+  MPI_Bcast(tb->efile,tb->ninput,MPI_DOUBLE,0,world);
+  MPI_Bcast(tb->ffile,tb->ninput,MPI_DOUBLE,0,world);
+
+  MPI_Bcast(&tb->rflag,1,MPI_INT,0,world);
+  if (tb->rflag) {
+    MPI_Bcast(&tb->rlo,1,MPI_DOUBLE,0,world);
+    MPI_Bcast(&tb->rhi,1,MPI_DOUBLE,0,world);
+  }
+  MPI_Bcast(&tb->fpflag,1,MPI_INT,0,world);
+  if (tb->fpflag) {
+    MPI_Bcast(&tb->fplo,1,MPI_DOUBLE,0,world);
+    MPI_Bcast(&tb->fphi,1,MPI_DOUBLE,0,world);
+  }
+}
+
+/* ----------------------------------------------------------------------
+   build spline representation of e,f over entire range of read-in table
+   this function sets these values in Table: e2file,f2file
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairTableKokkos<DeviceType>::spline_table(Table *tb)
+{
+  memory->create(tb->e2file,tb->ninput,"pair:e2file");
+  memory->create(tb->f2file,tb->ninput,"pair:f2file");
+
+  double ep0 = - tb->ffile[0];
+  double epn = - tb->ffile[tb->ninput-1];
+  spline(tb->rfile,tb->efile,tb->ninput,ep0,epn,tb->e2file);
+
+  if (tb->fpflag == 0) {
+    tb->fplo = (tb->ffile[1] - tb->ffile[0]) / (tb->rfile[1] - tb->rfile[0]);
+    tb->fphi = (tb->ffile[tb->ninput-1] - tb->ffile[tb->ninput-2]) /
+      (tb->rfile[tb->ninput-1] - tb->rfile[tb->ninput-2]);
+  }
+
+  double fp0 = tb->fplo;
+  double fpn = tb->fphi;
+  spline(tb->rfile,tb->ffile,tb->ninput,fp0,fpn,tb->f2file);
+}
+
+/* ----------------------------------------------------------------------
+   extract attributes from parameter line in table section
+   format of line: N value R/RSQ/BITMAP lo hi FP fplo fphi
+   N is required, other params are optional
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairTableKokkos<DeviceType>::param_extract(Table *tb, char *line)
+{
+  tb->ninput = 0;
+  tb->rflag = NONE;
+  tb->fpflag = 0;
+
+  char *word = strtok(line," \t\n\r\f");
+  while (word) {
+    if (strcmp(word,"N") == 0) {
+      word = strtok(NULL," \t\n\r\f");
+      tb->ninput = atoi(word);
+    } else if (strcmp(word,"R") == 0 || strcmp(word,"RSQ") == 0 ||
+               strcmp(word,"BITMAP") == 0) {
+      if (strcmp(word,"R") == 0) tb->rflag = RLINEAR;
+      else if (strcmp(word,"RSQ") == 0) tb->rflag = RSQ;
+      else if (strcmp(word,"BITMAP") == 0) tb->rflag = BMP;
+      word = strtok(NULL," \t\n\r\f");
+      tb->rlo = atof(word);
+      word = strtok(NULL," \t\n\r\f");
+      tb->rhi = atof(word);
+    } else if (strcmp(word,"FP") == 0) {
+      tb->fpflag = 1;
+      word = strtok(NULL," \t\n\r\f");
+      tb->fplo = atof(word);
+      word = strtok(NULL," \t\n\r\f");
+      tb->fphi = atof(word);
+    } else {
+      error->one(FLERR,"Invalid keyword in pair table parameters");
+    }
+    word = strtok(NULL," \t\n\r\f");
+  }
+
+  if (tb->ninput == 0) error->one(FLERR,"Pair table parameters did not set N");
+}
+
+/* ----------------------------------------------------------------------
+   compute r,e,f vectors from splined values
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairTableKokkos<DeviceType>::compute_table(Table *tb)
+{
+  update_table = 1;
+  int tlm1 = tablength-1;
+
+  // inner = inner table bound
+  // cut = outer table bound
+  // delta = table spacing in rsq for N-1 bins
+
+  double inner;
+  if (tb->rflag) inner = tb->rlo;
+  else inner = tb->rfile[0];
+  tb->innersq = inner*inner;
+  tb->delta = (tb->cut*tb->cut - tb->innersq) / tlm1;
+  tb->invdelta = 1.0/tb->delta;
+
+  // direct lookup tables
+  // N-1 evenly spaced bins in rsq from inner to cut
+  // e,f = value at midpt of bin
+  // e,f are N-1 in length since store 1 value at bin midpt
+  // f is converted to f/r when stored in f[i]
+  // e,f are never a match to read-in values, always computed via spline interp
+
+  if (tabstyle == LOOKUP) {
+    memory->create(tb->e,tlm1,"pair:e");
+    memory->create(tb->f,tlm1,"pair:f");
+
+    double r,rsq;
+    for (int i = 0; i < tlm1; i++) {
+      rsq = tb->innersq + (i+0.5)*tb->delta;
+      r = sqrt(rsq);
+      tb->e[i] = splint(tb->rfile,tb->efile,tb->e2file,tb->ninput,r);
+      tb->f[i] = splint(tb->rfile,tb->ffile,tb->f2file,tb->ninput,r)/r;
+    }
+  }
+
+  // linear tables
+  // N-1 evenly spaced bins in rsq from inner to cut
+  // rsq,e,f = value at lower edge of bin
+  // de,df values = delta from lower edge to upper edge of bin
+  // rsq,e,f are N in length so de,df arrays can compute difference
+  // f is converted to f/r when stored in f[i]
+  // e,f can match read-in values, else compute via spline interp
+
+  if (tabstyle == LINEAR) {
+    memory->create(tb->rsq,tablength,"pair:rsq");
+    memory->create(tb->e,tablength,"pair:e");
+    memory->create(tb->f,tablength,"pair:f");
+    memory->create(tb->de,tlm1,"pair:de");
+    memory->create(tb->df,tlm1,"pair:df");
+
+    double r,rsq;
+    for (int i = 0; i < tablength; i++) {
+      rsq = tb->innersq + i*tb->delta;
+      r = sqrt(rsq);
+      tb->rsq[i] = rsq;
+      if (tb->match) {
+        tb->e[i] = tb->efile[i];
+        tb->f[i] = tb->ffile[i]/r;
+      } else {
+        tb->e[i] = splint(tb->rfile,tb->efile,tb->e2file,tb->ninput,r);
+        tb->f[i] = splint(tb->rfile,tb->ffile,tb->f2file,tb->ninput,r)/r;
+      }
+    }
+
+    for (int i = 0; i < tlm1; i++) {
+      tb->de[i] = tb->e[i+1] - tb->e[i];
+      tb->df[i] = tb->f[i+1] - tb->f[i];
+    }
+  }
+
+  // cubic spline tables
+  // N-1 evenly spaced bins in rsq from inner to cut
+  // rsq,e,f = value at lower edge of bin
+  // e2,f2 = spline coefficient for each bin
+  // rsq,e,f,e2,f2 are N in length so have N-1 spline bins
+  // f is converted to f/r after e is splined
+  // e,f can match read-in values, else compute via spline interp
+
+  if (tabstyle == SPLINE) {
+    memory->create(tb->rsq,tablength,"pair:rsq");
+    memory->create(tb->e,tablength,"pair:e");
+    memory->create(tb->f,tablength,"pair:f");
+    memory->create(tb->e2,tablength,"pair:e2");
+    memory->create(tb->f2,tablength,"pair:f2");
+
+    tb->deltasq6 = tb->delta*tb->delta / 6.0;
+
+    double r,rsq;
+    for (int i = 0; i < tablength; i++) {
+      rsq = tb->innersq + i*tb->delta;
+      r = sqrt(rsq);
+      tb->rsq[i] = rsq;
+      if (tb->match) {
+        tb->e[i] = tb->efile[i];
+        tb->f[i] = tb->ffile[i]/r;
+      } else {
+        tb->e[i] = splint(tb->rfile,tb->efile,tb->e2file,tb->ninput,r);
+        tb->f[i] = splint(tb->rfile,tb->ffile,tb->f2file,tb->ninput,r);
+      }
+    }
+
+    // ep0,epn = dh/dg at inner and at cut
+    // h(r) = e(r) and g(r) = r^2
+    // dh/dg = (de/dr) / 2r = -f/2r
+
+    double ep0 = - tb->f[0] / (2.0 * sqrt(tb->innersq));
+    double epn = - tb->f[tlm1] / (2.0 * tb->cut);
+    spline(tb->rsq,tb->e,tablength,ep0,epn,tb->e2);
+
+    // fp0,fpn = dh/dg at inner and at cut
+    // h(r) = f(r)/r and g(r) = r^2
+    // dh/dg = (1/r df/dr - f/r^2) / 2r
+    // dh/dg in secant approx = (f(r2)/r2 - f(r1)/r1) / (g(r2) - g(r1))
+
+    double fp0,fpn;
+    double secant_factor = 0.1;
+    if (tb->fpflag) fp0 = (tb->fplo/sqrt(tb->innersq) - tb->f[0]/tb->innersq) /
+      (2.0 * sqrt(tb->innersq));
+    else {
+      double rsq1 = tb->innersq;
+      double rsq2 = rsq1 + secant_factor*tb->delta;
+      fp0 = (splint(tb->rfile,tb->ffile,tb->f2file,tb->ninput,sqrt(rsq2)) /
+             sqrt(rsq2) - tb->f[0] / sqrt(rsq1)) / (secant_factor*tb->delta);
+    }
+
+    if (tb->fpflag && tb->cut == tb->rfile[tb->ninput-1]) fpn =
+      (tb->fphi/tb->cut - tb->f[tlm1]/(tb->cut*tb->cut)) / (2.0 * tb->cut);
+    else {
+      double rsq2 = tb->cut * tb->cut;
+      double rsq1 = rsq2 - secant_factor*tb->delta;
+      fpn = (tb->f[tlm1] / sqrt(rsq2) -
+             splint(tb->rfile,tb->ffile,tb->f2file,tb->ninput,sqrt(rsq1)) /
+             sqrt(rsq1)) / (secant_factor*tb->delta);
+    }
+
+    for (int i = 0; i < tablength; i++) tb->f[i] /= sqrt(tb->rsq[i]);
+    spline(tb->rsq,tb->f,tablength,fp0,fpn,tb->f2);
+  }
+
+  // bitmapped linear tables
+  // 2^N bins from inner to cut, spaced in bitmapped manner
+  // f is converted to f/r when stored in f[i]
+  // e,f can match read-in values, else compute via spline interp
+
+  if (tabstyle == BITMAP) {
+    double r;
+    union_int_float_t rsq_lookup;
+    int masklo,maskhi;
+
+    // linear lookup tables of length ntable = 2^n
+    // stored value = value at lower edge of bin
+
+    init_bitmap(inner,tb->cut,tablength,masklo,maskhi,tb->nmask,tb->nshiftbits);
+    int ntable = 1 << tablength;
+    int ntablem1 = ntable - 1;
+
+    memory->create(tb->rsq,ntable,"pair:rsq");
+    memory->create(tb->e,ntable,"pair:e");
+    memory->create(tb->f,ntable,"pair:f");
+    memory->create(tb->de,ntable,"pair:de");
+    memory->create(tb->df,ntable,"pair:df");
+    memory->create(tb->drsq,ntable,"pair:drsq");
+
+    union_int_float_t minrsq_lookup;
+    minrsq_lookup.i = 0 << tb->nshiftbits;
+    minrsq_lookup.i |= maskhi;
+
+    for (int i = 0; i < ntable; i++) {
+      rsq_lookup.i = i << tb->nshiftbits;
+      rsq_lookup.i |= masklo;
+      if (rsq_lookup.f < tb->innersq) {
+        rsq_lookup.i = i << tb->nshiftbits;
+        rsq_lookup.i |= maskhi;
+      }
+      r = sqrtf(rsq_lookup.f);
+      tb->rsq[i] = rsq_lookup.f;
+      if (tb->match) {
+        tb->e[i] = tb->efile[i];
+        tb->f[i] = tb->ffile[i]/r;
+      } else {
+        tb->e[i] = splint(tb->rfile,tb->efile,tb->e2file,tb->ninput,r);
+        tb->f[i] = splint(tb->rfile,tb->ffile,tb->f2file,tb->ninput,r)/r;
+      }
+      minrsq_lookup.f = MIN(minrsq_lookup.f,rsq_lookup.f);
+    }
+
+    tb->innersq = minrsq_lookup.f;
+
+    for (int i = 0; i < ntablem1; i++) {
+      tb->de[i] = tb->e[i+1] - tb->e[i];
+      tb->df[i] = tb->f[i+1] - tb->f[i];
+      tb->drsq[i] = 1.0/(tb->rsq[i+1] - tb->rsq[i]);
+    }
+
+    // get the delta values for the last table entries
+    // tables are connected periodically between 0 and ntablem1
+
+    tb->de[ntablem1] = tb->e[0] - tb->e[ntablem1];
+    tb->df[ntablem1] = tb->f[0] - tb->f[ntablem1];
+    tb->drsq[ntablem1] = 1.0/(tb->rsq[0] - tb->rsq[ntablem1]);
+
+    // get the correct delta values at itablemax
+    // smallest r is in bin itablemin
+    // largest r is in bin itablemax, which is itablemin-1,
+    //   or ntablem1 if itablemin=0
+
+    // deltas at itablemax only needed if corresponding rsq < cut*cut
+    // if so, compute deltas between rsq and cut*cut
+    //   if tb->match, data at cut*cut is unavailable, so we'll take
+    //   deltas at itablemax-1 as a good approximation
+
+    double e_tmp,f_tmp;
+    int itablemin = minrsq_lookup.i & tb->nmask;
+    itablemin >>= tb->nshiftbits;
+    int itablemax = itablemin - 1;
+    if (itablemin == 0) itablemax = ntablem1;
+    int itablemaxm1 = itablemax - 1;
+    if (itablemax == 0) itablemaxm1 = ntablem1;
+    rsq_lookup.i = itablemax << tb->nshiftbits;
+    rsq_lookup.i |= maskhi;
+    if (rsq_lookup.f < tb->cut*tb->cut) {
+      if (tb->match) {
+        tb->de[itablemax] = tb->de[itablemaxm1];
+        tb->df[itablemax] = tb->df[itablemaxm1];
+        tb->drsq[itablemax] = tb->drsq[itablemaxm1];
+      } else {
+            rsq_lookup.f = tb->cut*tb->cut;
+        r = sqrtf(rsq_lookup.f);
+        e_tmp = splint(tb->rfile,tb->efile,tb->e2file,tb->ninput,r);
+        f_tmp = splint(tb->rfile,tb->ffile,tb->f2file,tb->ninput,r)/r;
+        tb->de[itablemax] = e_tmp - tb->e[itablemax];
+        tb->df[itablemax] = f_tmp - tb->f[itablemax];
+        tb->drsq[itablemax] = 1.0/(rsq_lookup.f - tb->rsq[itablemax]);
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   set all array ptrs in a table to NULL, so the table can be freed safely; scalar members are left untouched
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairTableKokkos<DeviceType>::null_table(Table *tb)
+{
+  tb->rfile = tb->efile = tb->ffile = NULL;    // read-in r, e, f values
+  tb->e2file = tb->f2file = NULL;              // arrays passed to splint() together with efile/ffile
+  tb->rsq = tb->drsq = tb->e = tb->de = NULL;  // per-bin table arrays
+  tb->f = tb->df = tb->e2 = tb->f2 = NULL;     // e2,f2 are the arrays filled in by spline() for e,f
+}
+
+/* ----------------------------------------------------------------------
+   free all arrays in a table: the same 13 pointers that null_table() resets; the Table struct itself is not freed
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairTableKokkos<DeviceType>::free_table(Table *tb)
+{
+  memory->destroy(tb->rfile);   // read-in values and their spline arrays
+  memory->destroy(tb->efile);
+  memory->destroy(tb->ffile);
+  memory->destroy(tb->e2file);
+  memory->destroy(tb->f2file);
+
+  memory->destroy(tb->rsq);     // per-bin table arrays
+  memory->destroy(tb->drsq);
+  memory->destroy(tb->e);
+  memory->destroy(tb->de);
+  memory->destroy(tb->f);
+  memory->destroy(tb->df);
+  memory->destroy(tb->e2);
+  memory->destroy(tb->f2);
+}
+
+/* ----------------------------------------------------------------------
+   spline and splint routines modified from Numerical Recipes; spline() fills y2[0..n-1] for later use by splint()
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairTableKokkos<DeviceType>::spline(double *x, double *y, int n,  // in: n tabulated points (x[i],y[i])
+                       double yp1, double ypn, double *y2)             // in: end slopes yp1,ypn; out: y2[0..n-1]
+{
+  int i,k;
+  double p,qn,sig,un;
+  double *u = new double[n];  // scratch array, deleted before returning
+
+  if (yp1 > 0.99e30) y2[0] = u[0] = 0.0;  // yp1 above 0.99e30 means no slope is imposed at x[0]; y2[0] then ends up 0
+  else {
+    y2[0] = -0.5;
+    u[0] = (3.0/(x[1]-x[0])) * ((y[1]-y[0]) / (x[1]-x[0]) - yp1);
+  }
+  for (i = 1; i < n-1; i++) {  // forward sweep over interior points; y2[] and u[] hold intermediate factors here
+    sig = (x[i]-x[i-1]) / (x[i+1]-x[i-1]);
+    p = sig*y2[i-1] + 2.0;
+    y2[i] = (sig-1.0) / p;
+    u[i] = (y[i+1]-y[i]) / (x[i+1]-x[i]) - (y[i]-y[i-1]) / (x[i]-x[i-1]);
+    u[i] = (6.0*u[i] / (x[i+1]-x[i-1]) - sig*u[i-1]) / p;
+  }
+  if (ypn > 0.99e30) qn = un = 0.0;  // same sentinel for the slope at x[n-1]; y2[n-1] then ends up 0
+  else {
+    qn = 0.5;
+    un = (3.0/(x[n-1]-x[n-2])) * (ypn - (y[n-1]-y[n-2]) / (x[n-1]-x[n-2]));
+  }
+  y2[n-1] = (un-qn*u[n-2]) / (qn*y2[n-2] + 1.0);  // NOTE(review): indexes n-2, so n must be at least 2; not checked here
+  for (k = n-2; k >= 0; k--) y2[k] = y2[k]*y2[k+1] + u[k];  // back-substitution produces the final y2 values
+
+  delete [] u;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+double PairTableKokkos<DeviceType>::splint(double *xa, double *ya, double *y2a, int n, double x)  // returns the interpolated value at x
+{
+  int klo,khi,k;
+  double h,b,a,y;
+
+  klo = 0;
+  khi = n-1;
+  while (khi-klo > 1) {  // bisect until klo and khi are adjacent indices; presumably xa is ascending - verify against caller
+    k = (khi+klo) >> 1;
+    if (xa[k] > x) khi = k;
+    else klo = k;
+  }
+  h = xa[khi]-xa[klo];  // interval width; not checked for zero before the divisions below
+  a = (xa[khi]-x) / h;  // weight of the lower point
+  b = (x-xa[klo]) / h;  // weight of the upper point
+  y = a*ya[klo] + b*ya[khi] +
+    ((a*a*a-a)*y2a[klo] + (b*b*b-b)*y2a[khi]) * (h*h)/6.0;  // linear blend plus cubic correction from y2a
+  return y;
+}
+
+/* ----------------------------------------------------------------------
+   proc 0 writes to restart file; only the settings written by write_restart_settings() are stored
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairTableKokkos<DeviceType>::write_restart(FILE *fp)
+{
+  write_restart_settings(fp);  // nothing else is written by this function
+}
+
+/* ----------------------------------------------------------------------
+   proc 0 reads from restart file, bcasts; then allocate() is called
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairTableKokkos<DeviceType>::read_restart(FILE *fp)
+{
+  read_restart_settings(fp);  // settings are read and broadcast first
+  allocate();                 // called after the settings are in place
+}
+
+/* ----------------------------------------------------------------------
+   proc 0 writes to restart file: seven ints, in the same order read_restart_settings() reads them
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairTableKokkos<DeviceType>::write_restart_settings(FILE *fp)
+{
+  fwrite(&tabstyle,sizeof(int),1,fp);  // fwrite return values are not checked
+  fwrite(&tablength,sizeof(int),1,fp);
+  fwrite(&ewaldflag,sizeof(int),1,fp);
+  fwrite(&pppmflag,sizeof(int),1,fp);
+  fwrite(&msmflag,sizeof(int),1,fp);
+  fwrite(&dispersionflag,sizeof(int),1,fp);
+  fwrite(&tip4pflag,sizeof(int),1,fp);
+}
+
+/* ----------------------------------------------------------------------
+   proc 0 reads from restart file, bcasts: seven ints, in the order write_restart_settings() writes them
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairTableKokkos<DeviceType>::read_restart_settings(FILE *fp)
+{
+  if (comm->me == 0) {  // only rank 0 touches the file
+    fread(&tabstyle,sizeof(int),1,fp);  // NOTE(review): fread return values are not checked, so a short read goes unnoticed
+    fread(&tablength,sizeof(int),1,fp);
+    fread(&ewaldflag,sizeof(int),1,fp);
+    fread(&pppmflag,sizeof(int),1,fp);
+    fread(&msmflag,sizeof(int),1,fp);
+    fread(&dispersionflag,sizeof(int),1,fp);
+    fread(&tip4pflag,sizeof(int),1,fp);
+  }
+  MPI_Bcast(&tabstyle,1,MPI_INT,0,world);  // every value is broadcast from rank 0 to all ranks in world
+  MPI_Bcast(&tablength,1,MPI_INT,0,world);
+  MPI_Bcast(&ewaldflag,1,MPI_INT,0,world);
+  MPI_Bcast(&pppmflag,1,MPI_INT,0,world);
+  MPI_Bcast(&msmflag,1,MPI_INT,0,world);
+  MPI_Bcast(&dispersionflag,1,MPI_INT,0,world);
+  MPI_Bcast(&tip4pflag,1,MPI_INT,0,world);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+double PairTableKokkos<DeviceType>::single(int i, int j, int itype, int jtype, double rsq,  // i and j are not used in the body
+                         double factor_coul, double factor_lj,                              // factor_coul is not used in the body
+                         double &fforce)                                                    // out: factor_lj times the interpolated tb->f value
+{
+  int itable;
+  double fraction,value,a,b,phi;  // fraction is set only for LINEAR/BITMAP, a and b only for SPLINE
+  int tlm1 = tablength - 1;
+
+  Table *tb = &tables[tabindex[itype][jtype]];  // table assigned to this type pair
+  if (rsq < tb->innersq) error->one(FLERR,"Pair distance < table inner cutoff");
+
+  if (tabstyle == LOOKUP) {
+    itable = static_cast<int> ((rsq-tb->innersq) * tb->invdelta);  // bin index from the offset in rsq
+    if (itable >= tlm1) error->one(FLERR,"Pair distance > table outer cutoff");
+    fforce = factor_lj * tb->f[itable];  // stored bin value, no interpolation
+  } else if (tabstyle == LINEAR) {
+    itable = static_cast<int> ((rsq-tb->innersq) * tb->invdelta);
+    if (itable >= tlm1) error->one(FLERR,"Pair distance > table outer cutoff");
+    fraction = (rsq - tb->rsq[itable]) * tb->invdelta;  // fractional position inside the bin
+    value = tb->f[itable] + fraction*tb->df[itable];
+    fforce = factor_lj * value;
+  } else if (tabstyle == SPLINE) {
+    itable = static_cast<int> ((rsq-tb->innersq) * tb->invdelta);
+    if (itable >= tlm1) error->one(FLERR,"Pair distance > table outer cutoff");
+    b = (rsq - tb->rsq[itable]) * tb->invdelta;  // weight of the upper bin edge
+    a = 1.0 - b;                                 // weight of the lower bin edge
+    value = a * tb->f[itable] + b * tb->f[itable+1] +
+      ((a*a*a-a)*tb->f2[itable] + (b*b*b-b)*tb->f2[itable+1]) *
+      tb->deltasq6;
+    fforce = factor_lj * value;
+  } else {  // remaining style: bitmapped table, bin index taken from the bits of the float rsq
+    union_int_float_t rsq_lookup;
+    rsq_lookup.f = rsq;
+    itable = rsq_lookup.i & tb->nmask;
+    itable >>= tb->nshiftbits;
+    fraction = (rsq_lookup.f - tb->rsq[itable]) * tb->drsq[itable];  // no range check against the outer cutoff in this branch
+    value = tb->f[itable] + fraction*tb->df[itable];
+    fforce = factor_lj * value;
+  }
+
+  if (tabstyle == LOOKUP)  // energy uses the same itable/fraction/a/b computed for the force above
+    phi = tb->e[itable];
+  else if (tabstyle == LINEAR || tabstyle == BITMAP)
+    phi = tb->e[itable] + fraction*tb->de[itable];
+  else
+    phi = a * tb->e[itable] + b * tb->e[itable+1] +
+      ((a*a*a-a)*tb->e2[itable] + (b*b*b-b)*tb->e2[itable+1]) * tb->deltasq6;
+  return factor_lj*phi;  // returned energy is scaled by factor_lj
+}
+
+/* ----------------------------------------------------------------------
+   return the Coulomb cutoff for tabled potentials (NULL for any str other than "cut_coul")
+   called by KSpace solvers which require that all pairwise cutoffs be the same
+   loop over all tables not just those indexed by tabindex[i][j] since
+     no way to know which tables are active since pair::init() not yet called
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void *PairTableKokkos<DeviceType>::extract(const char *str, int &dim)
+{
+  if (strcmp(str,"cut_coul") != 0) return NULL;  // "cut_coul" is the only key handled here
+  if (ntables == 0) error->all(FLERR,"All pair coeffs are not set");
+
+  double cut_coul = tables[0].cut;
+  for (int m = 1; m < ntables; m++)
+    if (tables[m].cut != cut_coul)  // exact comparison against the first table's cutoff
+      error->all(FLERR,
+                 "Pair table cutoffs must all be equal to use with KSpace");
+  dim = 0;
+  return &tables[0].cut;  // pointer to the first table's cutoff member, not a copy
+}
+
 template<class DeviceType>
 void PairTableKokkos<DeviceType>::init_style()
 {
diff --git a/src/KOKKOS/pair_table_kokkos.h b/src/KOKKOS/pair_table_kokkos.h
index 5b3f3852c3..4d3a9ec106 100644
--- a/src/KOKKOS/pair_table_kokkos.h
+++ b/src/KOKKOS/pair_table_kokkos.h
@@ -22,7 +22,7 @@ PairStyle(table/kk/host,PairTableKokkos<LMPHostType>)
 #ifndef LMP_PAIR_TABLE_KOKKOS_H
 #define LMP_PAIR_TABLE_KOKKOS_H
 
-#include "pair_table.h"
+#include "pair.h"
 #include "pair_kokkos.h"
 #include "neigh_list_kokkos.h"
 #include "atom_kokkos.h"
@@ -38,7 +38,7 @@ template <class DeviceType, int NEIGHFLAG, int TABSTYLE>
 class PairTableComputeFunctor;
 
 template<class DeviceType>
-class PairTableKokkos : public PairTable {
+class PairTableKokkos : public Pair {
  public:
 
   enum {EnabledNeighFlags=FULL|HALFTHREAD|HALF|N2};
@@ -59,9 +59,18 @@ class PairTableKokkos : public PairTable {
                         const NeighListKokkos<DeviceType> &list) const;
 */
   void settings(int, char **);
+  void coeff(int, char **);
   double init_one(int, int);
+  void write_restart(FILE *);
+  void read_restart(FILE *);
+  void write_restart_settings(FILE *);
+  void read_restart_settings(FILE *);
+  double single(int, int, int, int, double, double, double, double &);
+  void *extract(const char *, int &);
+
   void init_style();
 
+
  protected:
   enum{LOOKUP,LINEAR,SPLINE,BITMAP};
 
@@ -98,6 +107,17 @@ class PairTableKokkos : public PairTable {
     typename ArrayTypes<LMPHostType>::t_ffloat_2d rsq,drsq,e,de,f,df,e2,f2;
   };
 
+  struct Table {
+    int ninput,rflag,fpflag,match,ntablebits;
+    int nshiftbits,nmask;
+    double rlo,rhi,fplo,fphi,cut;
+    double *rfile,*efile,*ffile;
+    double *e2file,*f2file;
+    double innersq,delta,invdelta,deltasq6;
+    double *rsq,*drsq,*e,*de,*f,*df,*e2,*f2;
+  };
+  int ntables;
+  Table *tables;
   TableDeviceConst d_table_const;
   TableDevice* d_table;
   TableHost* h_table;
@@ -108,6 +128,15 @@ class PairTableKokkos : public PairTable {
   typename ArrayTypes<DeviceType>::t_ffloat_2d d_cutsq;
 
   void allocate();
+  void read_table(Table *, char *, char *);
+  void param_extract(Table *, char *);
+  void bcast_table(Table *);
+  void spline_table(Table *);
+  void compute_table(Table *);
+  void null_table(Table *);
+  void free_table(Table *);
+  void spline(double *, double *, int, double, double, double *);
+  double splint(double *, double *, double *, int, double);
 
   typename ArrayTypes<DeviceType>::t_x_array_randomread x;
   typename ArrayTypes<DeviceType>::t_x_array_const c_x;
@@ -176,6 +205,11 @@ class PairTableKokkos : public PairTable {
   friend void pair_virial_fdotr_compute<PairTableKokkos>(PairTableKokkos*);
 };
 
+
+
+
+
+
 }
 
 #endif
@@ -255,10 +289,4 @@ E: Cannot use chosen neighbor list style with lj/cut/kk
 
 That style is not supported by Kokkos.
 
-
-
-
 */
-
-
-

From cc1b55e0310a5a0953d500c1b92b274ac8acf009 Mon Sep 17 00:00:00 2001
From: Stan Moore <stanmoore1@gmail.com>
Date: Fri, 23 Dec 2016 12:36:05 -0700
Subject: [PATCH 033/267] Merging USER-DPD updates

---
 doc/src/fix_eos_table_rx.txt                  |  42 +-
 doc/src/pair_exp6_rx.txt                      |  60 ++-
 doc/src/pair_multi_lucy_rx.txt                |  13 +-
 doc/src/pair_table_rx.txt                     |  15 +-
 .../USER/dpd/dpde-vv/log.dpde-vv.reference    | 232 ++++-----
 .../USER/dpd/dpdrx-shardlow/in.dpdrx-shardlow |   2 +-
 .../log.dpdrx-shardlow.reference              |  58 ++-
 src/KOKKOS/pair_exp6_rx_kokkos.cpp            |  40 +-
 src/USER-DPD/fix_eos_table_rx.cpp             | 139 ++++--
 src/USER-DPD/fix_eos_table_rx.h               |   3 +-
 src/USER-DPD/fix_rx.cpp                       |  15 +-
 src/USER-DPD/pair_exp6_rx.cpp                 | 445 ++++++++++++------
 src/USER-DPD/pair_exp6_rx.h                   |  13 +-
 src/USER-DPD/pair_multi_lucy_rx.cpp           | 126 +++--
 src/USER-DPD/pair_multi_lucy_rx.h             |   3 +-
 src/USER-DPD/pair_table_rx.cpp                | 174 ++++---
 src/USER-DPD/pair_table_rx.h                  |   5 +-
 17 files changed, 896 insertions(+), 489 deletions(-)

diff --git a/doc/src/fix_eos_table_rx.txt b/doc/src/fix_eos_table_rx.txt
index f92b405f49..749642f57c 100644
--- a/doc/src/fix_eos_table_rx.txt
+++ b/doc/src/fix_eos_table_rx.txt
@@ -10,7 +10,7 @@ fix eos/table/rx command :h3
 
 [Syntax:]
 
-fix ID group-ID eos/table/rx style file1 N keyword file2 :pre
+fix ID group-ID eos/table/rx style file1 N keyword ... :pre
 
 ID, group-ID are documented in "fix"_fix.html command
 eos/table/rx = style name of this fix command
@@ -18,11 +18,16 @@ style = {linear} = method of interpolation
 file1 = filename containing the tabulated equation of state
 N = use N values in {linear} tables
 keyword = name of table keyword correponding to table file
-file2 = filename containing the heats of formation of each species :ul
+file2 = filename containing the heats of formation of each species (optional)
+deltaHf = heat of formation for a single species in energy units (optional)
+energyCorr = energy correction in energy units (optional)
+tempCorrCoeff = temperature correction coefficient (optional) :ul
 
 [Examples:]
 
-fix 1 all eos/table/rx linear eos.table 10000 KEYWORD thermo.table :pre
+fix 1 all eos/table/rx linear eos.table 10000 KEYWORD thermo.table
+fix 1 all eos/table/rx linear eos.table 10000 KEYWORD 1.5
+fix 1 all eos/table/rx linear eos.table 10000 KEYWORD 1.5 0.025 0.0 :pre
 
 [Description:]
 
@@ -39,7 +44,15 @@ where {m} is the number of species, {c_i,j} is the concentration of
 species {j} in particle {i}, {u_j} is the internal energy of species j,
 {DeltaH_f,j} is the heat of formation of species {j}, N is the number of
 molecules represented by the coarse-grained particle, kb is the
-Boltzmann constant, and T is the temperature of the system.
+Boltzmann constant, and T is the temperature of the system.  Additionally,
+it is possible to modify the concentration-dependent particle internal 
+energy relation by adding an energy correction, temperature-dependent 
+correction, and/or a molecule-dependent correction.  An energy correction can
+be specified as a constant (in energy units).  A temperature correction can be 
+specified by multiplying a temperature correction coefficient by the 
+internal temperature.  A molecular correction can be specified by 
+multiplying a molecule correction coefficient by the average number of 
+product gas particles in the coarse-grain particle. 
 
 Fix {eos/table/rx} creates interpolation tables of length {N} from {m}
 internal energy values of each species {u_j} listed in a file as a
@@ -58,6 +71,14 @@ file is described below.
 The second filename specifies a file containing heat of formation
 {DeltaH_f,j} for each species.
 
+In cases where the coarse-grain particle represents a single molecular
+species (i.e., no reactions occur and fix {rx} is not present in the input file), 
+fix {eos/table/rx} can be applied in a similar manner to fix {eos/table} 
+within a non-reactive DPD simulation.  In this case, the heat of formation 
+filename is replaced with the heat of formation value for the single species.
+Additionally, the energy correction and temperature correction coefficients may 
+also be specified as fix arguments.  
+
 :line
 
 The format of a tabulated file is as follows (without the
@@ -116,6 +137,19 @@ Note that the species can be listed in any order.  The tag that is
 used as the species name must correspond with the tags used to define
 the reactions with the "fix rx"_fix_rx.html command.
 
+Alternatively, corrections to the EOS can be included by specifying
+three additional columns that correspond to the energy correction, 
+the temperature correction coefficient and molecule correction 
+coefficient.  In this case, the format of the file is as follows:
+
+# HEAT OF FORMATION TABLE     (one or more comment or blank lines) :pre
+                              (blank)
+h2      0.00 1.23 0.025  0.0  (species name, heat of formation, energy correction, temperature correction coefficient, molecule correction coefficient)
+no2     0.34 0.00 0.000 -1.76
+n2      0.00 0.00 0.000 -1.76
+...
+no      0.93 0.00 0.000 -1.76 :pre
+
 :line
 
 [Restrictions:]
diff --git a/doc/src/pair_exp6_rx.txt b/doc/src/pair_exp6_rx.txt
index 7b22dccc4f..dafba2c44c 100644
--- a/doc/src/pair_exp6_rx.txt
+++ b/doc/src/pair_exp6_rx.txt
@@ -10,16 +10,21 @@ pair_style exp6/rx command :h3
 
 [Syntax:]
 
-pair_style exp6/rx cutoff :pre
+pair_style exp6/rx cutoff ... :pre
 
-cutoff = global cutoff for DPD interactions (distance units) :ul
+cutoff = global cutoff for DPD interactions (distance units)
+weighting = fractional or molecular (optional) :ul
 
 [Examples:]
 
 pair_style exp6/rx 10.0
-pair_coeff * * exp6.params h2o h2o 1.0 1.0 10.0
-pair_coeff * * exp6.params h2o 1fluid 1.0 1.0 10.0
-pair_coeff * * exp6.params 1fluid 1fluid 1.0 1.0 10.0 :pre
+pair_style exp6/rx 10.0 fractional
+pair_style exp6/rx 10.0 molecular
+pair_coeff * * exp6.params h2o h2o exponent 1.0 1.0 10.0
+pair_coeff * * exp6.params h2o 1fluid exponent 1.0 1.0 10.0
+pair_coeff * * exp6.params 1fluid 1fluid exponent 1.0 1.0 10.0
+pair_coeff * * exp6.params 1fluid 1fluid none 10.0
+pair_coeff * * exp6.params 1fluid 1fluid polynomial filename 10.0 :pre
 
 [Description:]
 
@@ -50,14 +55,36 @@ defined in the reaction kinetics files specified with the "fix
 rx"_fix_rx.html command or they must correspond to the tag "1fluid",
 signifying interaction with a product species mixture determined
 through a one-fluid approximation.  The interaction potential is
-weighted by the geometric average of the concentrations of the two
-species.  The coarse-grained potential is stored before and after the
+weighted by the geometric average of either the mole fraction concentrations 
+or the number of molecules associated with the interacting coarse-grained 
+particles (see the {fractional} or {molecular} weighting pair style options). 
+The coarse-grained potential is stored before and after the
 reaction kinetics solver is applied, where the difference is defined
 to be the internal chemical energy (uChem).
 
-The fourth and fifth arguments specify the {Rm} and {epsilon} scaling exponents.
+The fourth argument specifies the type of scaling that will be used 
+to scale the EXP-6 parameters as reactions occur.  Currently, there
+are three scaling options:  {exponent}, {polynomial} and {none}.
 
-The final argument specifies the interaction cutoff.
+Exponent scaling requires two additional arguments for scaling 
+the {Rm} and {epsilon} parameters, respectively.  The scaling factor
+is computed by phi^exponent, where phi is the number of molecules  
+represented by the coarse-grain particle and exponent is specified 
+as a pair coefficient argument for {Rm} and {epsilon}, respectively.
+The {Rm} and {epsilon} parameters are multiplied by the scaling 
+factor to give the scaled interaction parameters for the CG particle.
+
+Polynomial scaling requires a filename to be specified as a pair 
+coeff argument.  The file contains the coefficients to a fifth order
+polynomial for the {alpha}, {epsilon} and {Rm} parameters that depend 
+upon phi (the number of molecules represented by the CG particle). 
+The format of a polynomial file is provided below.
+
+The {none} option to the scaling does not have any additional pair coeff
+arguments.  This is equivalent to specifying the {exponent} option with 
+{Rm} and {epsilon} exponents of 0.0 and 0.0, respectively.
+
+The final argument specifies the interaction cutoff (optional).
 
 :line
 
@@ -70,6 +97,19 @@ no2  exp6  13.60 0.01 3.70
 ...
 co2  exp6  13.00 0.03 3.20 :pre
 
+The format of the polynomial scaling file is as follows (without the
+parenthesized comments):
+
+# POLYNOMIAL FILE          (one or more comment or blank lines) :pre
+#  General Functional Form:
+#  A*phi^5 + B*phi^4 + C*phi^3 + D*phi^2 + E*phi + F 
+#
+#  Parameter  A        B         C        D         E        F
+                           (blank)
+alpha        0.0000   0.00000   0.00008  0.04955  -0.73804  13.63201
+epsilon      0.0000   0.00478  -0.06283  0.24486  -0.33737   2.60097
+rm           0.0001  -0.00118  -0.00253  0.05812  -0.00509   1.50106 :pre
+
 A section begins with a non-blank line whose 1st character is not a
 "#"; blank lines or lines starting with "#" can be used as comments
 between sections.
@@ -117,4 +157,4 @@ LAMMPS"_Section_start.html#start_3 section for more info.
 
 "pair_coeff"_pair_coeff.html
 
-[Default:] none
+[Default:] fractional weighting
diff --git a/doc/src/pair_multi_lucy_rx.txt b/doc/src/pair_multi_lucy_rx.txt
index 14b5b32181..75547a71ce 100644
--- a/doc/src/pair_multi_lucy_rx.txt
+++ b/doc/src/pair_multi_lucy_rx.txt
@@ -13,11 +13,14 @@ pair_style multi/lucy/rx command :h3
 pair_style multi/lucy/rx style N keyword ... :pre
 
 style = {lookup} or {linear} = method of interpolation
-N = use N values in {lookup}, {linear} tables :ul
+N = use N values in {lookup}, {linear} tables
+weighting = fractional or molecular (optional) :ul
 
 [Examples:]
 
 pair_style multi/lucy/rx linear 1000
+pair_style multi/lucy/rx linear 1000 fractional
+pair_style multi/lucy/rx linear 1000 molecular
 pair_coeff * * multibody.table ENTRY1 h2o h2o 7.0
 pair_coeff * * multibody.table ENTRY1 h2o 1fluid 7.0 :pre
 
@@ -94,8 +97,10 @@ tags must either correspond to the species defined in the reaction
 kinetics files specified with the "fix rx"_fix_rx.html command or they
 must correspond to the tag "1fluid", signifying interaction with a
 product species mixture determined through a one-fluid approximation.
-The interaction potential is weighted by the geometric average of the
-concentrations of the two species.  The coarse-grained potential is
+The interaction potential is weighted by the geometric average of 
+either the mole fraction concentrations or the number of molecules 
+associated with the interacting coarse-grained particles (see the 
+{fractional} or {molecular} weighting pair style options). The coarse-grained potential is
 stored before and after the reaction kinetics solver is applied, where
 the difference is defined to be the internal chemical energy (uChem).
 
@@ -205,7 +210,7 @@ LAMMPS"_Section_start.html#start_3 section for more info.
 
 "pair_coeff"_pair_coeff.html
 
-[Default:] none
+[Default:] fractional weighting
 
 :line
 
diff --git a/doc/src/pair_table_rx.txt b/doc/src/pair_table_rx.txt
index e6006f62e2..d089a4f9da 100644
--- a/doc/src/pair_table_rx.txt
+++ b/doc/src/pair_table_rx.txt
@@ -10,16 +10,17 @@ pair_style table/rx command :h3
 
 [Syntax:]
 
+pair_style table/rx style N ... :pre
+pair_style table style N ... :pre
 
 style = {lookup} or {linear} or {spline} or {bitmap} = method of interpolation
 N = use N values in {lookup}, {linear}, {spline} tables
-N = use 2^N values in {bitmap} tables
+weighting = fractional or molecular (optional) :ul
 
 [Examples:]
 
 pair_style table/rx linear 1000
-pair_style table/rx bitmap 12
+pair_style table/rx linear 1000 fractional
+pair_style table/rx linear 1000 molecular
 pair_coeff * * rxn.table ENTRY1 h2o h2o 10.0
 pair_coeff * * rxn.table ENTRY1 1fluid 1fluid 10.0
 pair_coeff * 3 rxn.table ENTRY1 h2o no2 10.0 :pre
@@ -84,8 +85,10 @@ tags must either correspond to the species defined in the reaction
 kinetics files specified with the "fix rx"_fix_rx.html command or they
 must correspond to the tag "1fluid", signifying interaction with a
 product species mixture determined through a one-fluid approximation.
-The interaction potential is weighted by the geometric average of the
-concentrations of the two species.  The coarse-grained potential is
+The interaction potential is weighted by the geometric average of 
+either the mole fraction concentrations or the number of molecules 
+associated with the interacting coarse-grained particles (see the 
+{fractional} or {molecular} weighting pair style options). The coarse-grained potential is
 stored before and after the reaction kinetics solver is applied, where
 the difference is defined to be the internal chemical energy (uChem).
 
@@ -230,7 +233,7 @@ LAMMPS"_Section_start.html#start_3 section for more info.
 
 "pair_coeff"_pair_coeff.html
 
-[Default:] none
+[Default:] fractional weighting
 
 :line
 
diff --git a/examples/USER/dpd/dpde-vv/log.dpde-vv.reference b/examples/USER/dpd/dpde-vv/log.dpde-vv.reference
index 7bc7bda365..800a39f7a5 100644
--- a/examples/USER/dpd/dpde-vv/log.dpde-vv.reference
+++ b/examples/USER/dpd/dpde-vv/log.dpde-vv.reference
@@ -35,129 +35,133 @@ thermo_modify   format float %24.16f
 
 run             1000
 Neighbor list info ...
-  1 neighbor list requests
   update every 1 steps, delay 0 steps, check no
   max neighbors/atom: 2000, page size: 100000
   master list distance cutoff = 10.6
   ghost atom cutoff = 10.6
-  binsize = 5.3 -> bins = 25 25 25
-Memory usage per processor = 3.36353 Mbytes
+  binsize = 5.3, bins = 25 25 25
+  1 neighbor lists, perpetual/occasional/extra = 1 0 0
+  (1) pair dpd/fdt/energy, perpetual
+      pair build: half/bin/newton
+      stencil: half/bin/3d/newton
+      bin: standard
+Memory usage per processor = 4.28221 Mbytes
 Step Temp Press PotEng KinEng c_dpdU[1] c_dpdU[2] v_totEnergy c_dpdU[4] 
-       0     301.4391322267262012    1636.1776395935085020    1188.6488072196075336     394.4722035796053206    7852.5601874986105031    7852.5601874986105031   17288.2413857964347699     299.9999999999841407 
-      10     301.4791572483523510    1486.4422375141198245    1188.7147620806101713     394.5245815119678241    7852.5601874999802021    7852.3731942333779443   17288.1727253259377903     299.9960221120699089 
-      20     301.4275643919337426    1677.9356110821624952    1188.7839634625399867     394.4570655673388728    7852.5601874999938445    7852.3711851933012440   17288.1724017231754260     299.9955485734552099 
-      30     301.2240988054542186    1452.7304951528931269    1188.8550809767796181     394.1908044563202225    7852.5601875000002110    7852.5679666239848302   17288.1740395570850524     299.9988968405210130 
-      40     301.1023506886409677    1527.9758363521380033    1188.9264527568634549     394.0314812537677653    7852.5601874999947540    7852.6574764573806533   17288.1755979680056043     300.0001694462812338 
-      50     301.0409654880461972    1597.1737251233498682    1188.9944523606982330     393.9511507566391515    7852.5601875000029395    7852.6700547249911324   17288.1758453423317405     299.9999653064982681 
-      60     301.2904978886139133    1610.8630327676828529    1189.0651026961211301     394.2776962691256131    7852.5601874999829306    7852.2734988976435488   17288.1764853628737910     299.9919857290491905 
-      70     300.8575037843163500    1489.3259312130880971    1189.1295686642290548     393.7110673208616731    7852.5601874999856591    7852.7707182199101226   17288.1715417049854295     300.0010992278233175 
-      80     300.5955830326474825    1449.3896097889587509    1189.1880764967559116     393.3683100440913449    7852.5601875000411383    7853.0484238882281716   17288.1649979291178170     300.0059513551503301 
-      90     301.0092332775843147    1553.9266324350364812    1189.2470037925052111     393.9096250433288446    7852.5601875000420478    7852.4452067113825251   17288.1620230472581170     299.9940347326859182 
-     100     301.0478004479094238    1539.2270336322194453    1189.3010269201699884     393.9600951881690207    7852.5601875000074870    7852.3416236045995902   17288.1629332129450631     299.9916385566916119 
-     110     300.9609384905550087    1500.0429484565006533    1189.3524514939088021     393.8464250502817663    7852.5601874999983920    7852.4114980357189779   17288.1705620799075405     299.9925626482005327 
-     120     300.9625536631411933    1630.5065919443034090    1189.4006029528841282     393.8485387131115658    7852.5601875000575092    7852.3600810123671181   17288.1694101784196391     299.9911580775880680 
-     130     301.0373750247310340    1539.2267307640183844    1189.4426173625224692     393.9464521696795032    7852.5601874999993015    7852.2178388309775983   17288.1670958631802932     299.9879581026651749 
-     140     300.7465104415114752    1550.8353679735087098    1189.4887352231000932     393.5658181350791551    7852.5601874999920256    7852.5559582333216895   17288.1706990914935886     299.9939749909034958 
-     150     300.6667173911141617    1634.8987162883277051    1189.5368575067818711     393.4613985788388959    7852.5601874999920256    7852.6079668015609059   17288.1664103871735279     299.9946423938895350 
-     160     300.4684731724562425    1462.9400882126803936    1189.5825022927965620     393.2019703048678707    7852.5601874999847496    7852.8265187980177870   17288.1711788956672535     299.9983600613423960 
-     170     300.1439323338466920    1510.2352578813552100    1189.6305700279478970     392.7772665220106774    7852.5601874999802021    7853.2009671047335360   17288.1689911546709482     300.0051118582463232 
-     180     300.1074244553407198    1529.6307083879951279    1189.6764977580119194     392.7294912276224181    7852.5601874999729262    7853.2047509722533505   17288.1709274578606710     300.0047089238623812 
-     190     300.4193298066089142    1546.3205495807171701    1189.7172820166240399     393.1376598363699486    7852.5601874999847496    7852.7461854379371289   17288.1613147909156396     299.9954451643528728 
-     200     300.3353919251508728    1532.5496449337254035    1189.7600175880224924     393.0278162310690391    7852.5601874999683787    7852.8107089913455638   17288.1587303104060993     299.9962707550171785 
-     210     300.3276568499739483    1504.8178651700843602    1189.7998299597820733     393.0176938818990493    7852.5601875000156724    7852.7810130200659842   17288.1587243617614149     299.9953436245502871 
-     220     300.5768315696971626    1592.5896084568344122    1189.8391466344742184     393.3437713226064716    7852.5601875000329528    7852.4205574703573802   17288.1636629274726147     299.9880321846658831 
-     230     300.6587445618569063    1672.3049358942289473    1189.8766340798690635     393.4509650976162334    7852.5601874999847496    7852.2733199687863817   17288.1611066462573945     299.9848228571166828 
-     240     300.7517707836825025    1527.1722267937811921    1189.9126240081129708     393.5727019751183207    7852.5601875000065775    7852.1160682173085661   17288.1615817005440476     299.9814952182625802 
-     250     300.8473715548367409    1589.1847713095248764    1189.9441342461948352     393.6978079843565865    7852.5601875000047585    7851.9625847797888127   17288.1647145103452203     299.9782210858571148 
-     260     300.8450266408960942    1623.1896863377055524    1189.9636161513917614     393.6947393603111891    7852.5601874999820211    7851.9471828473988353   17288.1657258590821584     299.9775302202895659 
-     270     300.6663619570709898    1564.5160171187899323    1189.9764081239700317     393.4609334472908131    7852.5601875000193104    7852.1708276117251444   17288.1683566830033669     299.9812899253168439 
-     280     300.7668534205726019    1618.5400526904263643    1189.9872008155405183     393.5924395618274048    7852.5601875000184009    7852.0271568534708422   17288.1669847308585304     299.9781169783826158 
-     290     300.8462727198648849    1562.6765776748122789    1189.9918265985252219     393.6963700162682471    7852.5601875000211294    7851.9189772084127981   17288.1673613232269417     299.9756806168044250 
-     300     300.8095414073812890    1525.1785808192844343    1189.9873922767767453     393.6483023295390922    7852.5601875000020300    7851.9657301693578120   17288.1616122756749974     299.9761279889730758 
-     310     300.9496330741350221    1566.5597234051326723    1189.9752299662607129     393.8316304464934774    7852.5601875000056680    7851.7898117189633922   17288.1568596317229094     299.9723726900590464 
-     320     301.2370566356515837    1513.6869483705047514    1189.9626455872523820     394.2077614578674343    7852.5601874999929350    7851.4248466706330873   17288.1554412157456682     299.9650543775110236 
-     330     301.3279721508968692    1549.0667862452519330    1189.9513389477854162     394.3267362020337146    7852.5601874999929350    7851.3129955581916875   17288.1512582080031279     299.9625537201162615 
-     340     301.1145736537583844    1414.7930515101759283    1189.9408691169965095     394.0474765890400590    7852.5601874999993015    7851.6028846074832472   17288.1514178135184920     299.9677356565828745 
-     350     301.1651600907370039    1529.8016115175887535    1189.9314470205476937     394.1136755032911196    7852.5601874999929350    7851.5441417268757505   17288.1494517507089768     299.9662576716461331 
-     360     301.0550563185083206    1536.7721716375504002    1189.9200519814730796     393.9695904359920178    7852.5601875000074870    7851.7101209691463737   17288.1599508866202086     299.9690811750865009 
-     370     301.1008976932964742    1522.3385843459479929    1189.9109162496640693     394.0295798208944120    7852.5601875000211294    7851.6603423306560217   17288.1610259012340975     299.9677565060027860 
-     380     301.1656898730700505    1505.0548721701993600    1189.9005648244351505     394.1143687921909304    7852.5601875000056680    7851.5816827598300733   17288.1568038764598896     299.9659906785156522 
-     390     300.8379322662876802    1740.9151205755624687    1189.8851457594087151     393.6854554509390596    7852.5601875000238579    7852.0268864110385039   17288.1576751214088290     299.9741278188615752 
-     400     300.8663790447546376    1564.9461156870302148    1189.8690133470408909     393.7226817503372445    7852.5601875000411383    7852.0043792319993372   17288.1562618294192362     299.9732593416579789 
-     410     300.6263441860635908    1564.2840871092373618    1189.8566574093877080     393.4085650033033517    7852.5601874999892971    7852.3284491703725507   17288.1538590830532485     299.9792095875052951 
-     420     300.5302259436974168    1438.1569922368764765    1189.8406936554465574     393.2827818158641549    7852.5601875000302243    7852.4696075433648730   17288.1532705147074012     299.9815165752025337 
-     430     300.5877786105220935    1503.3641639033023694    1189.8251514530138593     393.3580969454444016    7852.5601874999802021    7852.4023373559457468   17288.1457732543858583     299.9798346272511935 
-     440     300.7289160804472772    1689.2527029957295781    1189.8035410609209066     393.5427936314976591    7852.5601875000029395    7852.2436462415198548   17288.1501684339418716     299.9764596782897570 
-     450     300.9487198282456575    1497.3668092174791582    1189.7808137689632986     393.8304353457919547    7852.5601874999938445    7851.9788323927432430   17288.1502690074921702     299.9710227473042323 
-     460     300.9359942496024587    1625.1573864018491804    1189.7615359247627111     393.8137822755282400    7852.5601875000147629    7852.0165192783370003   17288.1520249786408385     299.9713565393226986 
-     470     301.0000133856357252    1486.1561922844011860    1189.7439269526955741     393.8975596188205941    7852.5601874999656502    7851.9561324572268859   17288.1578065287103527     299.9697143418395626 
-     480     300.8568627175957886    1535.6080526199095857    1189.7237810071801505     393.7102284019063063    7852.5601874999601932    7852.1697010727630186   17288.1638979818089865     299.9732503057674080 
-     490     301.0608040775520067    1497.3221544489886128    1189.7062242497636362     393.9771121242308709    7852.5601874999974825    7851.9258988739011329   17288.1694227478947141     299.9682362511933320 
-     500     301.0232592587148019    1517.5854528541199215    1189.6911287485861521     393.9279798589197981    7852.5601875000247674    7851.9823225510326665   17288.1616186585633841     299.9690333355835037 
-     510     300.7038579923685120    1420.2615974401142012    1189.6747661513456933     393.5100018730125839    7852.5601874999674692    7852.4114869568047652   17288.1564424811294884     299.9768186576545759 
-     520     300.5917863355052759    1537.4862082427132464    1189.6604754398756540     393.3633415734188361    7852.5601875000029395    7852.5789017095057716   17288.1629062228021212     299.9795694302102333 
-     530     300.4751352158502868    1481.1071694751799441    1189.6453243069925065     393.2106884527691477    7852.5601874999811116    7852.7451655714066874   17288.1613658311471227     299.9823181268525900 
-     540     300.5380123640739498    1547.3461372766389559    1189.6261485232855648     393.2929713568877332    7852.5601875000375003    7852.6850583598352387   17288.1643657400454686     299.9808112190538623 
-     550     300.4253885005187499    1544.3485889749692888    1189.6033595464525661     393.1455884232119047    7852.5601874999756546    7852.8598718466746504   17288.1690073163154011     299.9835860164698147 
-     560     300.3263552442093101    1556.5150300058251105    1189.5759163336824713     393.0159905619273673    7852.5601875000111249    7853.0148613782675966   17288.1669557738860021     299.9861837797674866 
-     570     300.1977324643196425    1511.2320626303917379    1189.5441090918316149     392.8476709710407704    7852.5601875000102154    7853.2098259401755058   17288.1617935030590161     299.9896761688499964 
-     580     300.3543631005173893    1588.9566243200433746    1189.5094471319721379     393.0526424747489500    7852.5601875000156724    7853.0374555421631158   17288.1597326488990802     299.9859298211933378 
-     590     300.5019108864805730    1504.4406939723214691    1189.4809412920112663     393.2457278908070748    7852.5601874999874781    7852.8704277855340479   17288.1572844683396397     299.9823573257917815 
-     600     300.4791158523048011    1540.4690749004150803    1189.4551948503105905     393.2158976318902432    7852.5601875000220389    7852.9312239063838206   17288.1625038886049879     299.9832002920041987 
-     610     300.5939139841889869    1368.0565839211087678    1189.4252547652590692     393.3661258776944578    7852.5601874999574648    7852.8130977336286378   17288.1646658765384927     299.9807742697515778 
-     620     300.7674247480806002    1483.2566452708945235    1189.3941250938435132     393.5931872179773450    7852.5601875000193104    7852.6187967208716145   17288.1662965327122947     299.9766963671718258 
-     630     300.7920034341021278    1543.0699124130637756    1189.3598279316649950     393.6253516166882491    7852.5601875000302243    7852.6219971866230480   17288.1673642350069713     299.9762538437230432 
-     640     300.8032734267029014    1423.2549819291616586    1189.3293074476885067     393.6400998638143278    7852.5601874999847496    7852.6384826097782934   17288.1680774212654796     299.9762118202994543 
-     650     300.7516995878241346    1542.6559695158523482    1189.3021161045705867     393.5726088061030055    7852.5601874999720167    7852.7361949473242930   17288.1711073579681397     299.9775656396505497 
-     660     300.8699697098109596    1675.5121937767839881    1189.2687179804190691     393.7273806013013768    7852.5601874999802021    7852.6179739687149777   17288.1742600504148868     299.9750492262036801 
-     670     301.0255004186900578    1520.7397686587873977    1189.2284265783687260     393.9309127074437242    7852.5601874999847496    7852.4592279727157802   17288.1787547585117863     299.9715123049731460 
-     680     301.1071983488760679    1651.9751417063259851    1189.1858967311386550     394.0378250459656329    7852.5601875000002110    7852.3982826328638112   17288.1821919099675142     299.9699481289110850 
-     690     301.0027086454253435    1496.1607274163641250    1189.1436949551202815     393.9010867158519886    7852.5601875000293148    7852.5788938360938118   17288.1838630070960789     299.9731939774295597 
-     700     300.9009090279179759    1551.8182127127668082    1189.0993919251338866     393.7678687121208441    7852.5601875000102154    7852.7513665452252098   17288.1788146824910655     299.9761043445071209 
-     710     301.2325536720837817    1678.1546953970853338    1189.0528341066981284     394.2018687459686817    7852.5601874999956635    7852.3633298995819132   17288.1782202522445004     299.9683013583347133 
-     720     301.2122298224125529    1524.1415452491430642    1189.0046957644285612     394.1752723525083866    7852.5601875000093059    7852.4351629896145823   17288.1753186065616319     299.9693315350040734 
-     730     301.0763282392692304    1547.1987029633166912    1188.9602551214045434     393.9974275034455218    7852.5601874999883876    7852.6518053705112834   17288.1696754953518393     299.9732715774841267 
-     740     301.3262401480515109    1544.7045314021493141    1188.9131307177485724     394.3244696516559884    7852.5601874999965730    7852.3694201272974169   17288.1672079966992897     299.9674666811455950 
-     750     301.5740779122830304    1591.1785078054851965    1188.8637580645938669     394.6487975126887022    7852.5601875000029395    7852.0919529470393172   17288.1646960243233480     299.9616008527094095 
-     760     301.4385361878654521    1547.3218422039201414    1188.8113669183098864     394.4714235854450521    7852.5601874999838401    7852.3161911124070684   17288.1591691161447670     299.9656339783694534 
-     770     301.6110125684814420    1494.5039561806622714    1188.7581685915934031     394.6971313010439530    7852.5601875000083965    7852.1351720579104949   17288.1506594505553949     299.9619855799395509 
-     780     301.8360352039435384    1588.1458619705292676    1188.7039178696472845     394.9916026067776329    7852.5601874999956635    7851.9015195838428554   17288.1572275602629816     299.9572350302977952 
-     790     302.1008324754310479    1545.4409171812178556    1188.6491103416560691     395.3381241828382144    7852.5601875000138534    7851.6150048936624444   17288.1624269181702402     299.9513959104631340 
-     800     301.9660372380565718    1563.9565804790736365    1188.5964649891604950     395.1617271307158035    7852.5601874999874781    7851.8461249560614306   17288.1645045759250934     299.9555810527747326 
-     810     302.0507207347627627    1511.4560763489957935    1188.5468477146612258     395.2725464702810996    7852.5601875000120344    7851.7904104899025697   17288.1699921748586348     299.9541551776504775 
-     820     302.4700213214911741    1458.5135514273570152    1188.4981381693974072     395.8212556746473751    7852.5601875000202199    7851.2935886962204677   17288.1731700402851857     299.9441803241180651 
-     830     302.2853997979337350    1496.2544527963129894    1188.4496917372191547     395.5796544641875698    7852.5601875000447762    7851.5862641793482908   17288.1757978808018379     299.9494768794835977 
-     840     302.0840465730901201    1518.8301331998704882    1188.3994383226176978     395.3161576523596636    7852.5601875000038490    7851.8962146812327774   17288.1719981562127941     299.9550476592922337 
-     850     301.8910942560261788    1469.8827850510901953    1188.3489956121345585     395.0636545180261692    7852.5601874999829306    7852.2025804631493884   17288.1754180932912277     299.9606927700139067 
-     860     301.7284384160519153    1657.6802015862324424    1188.3052233777652873     394.8507982536594341    7852.5601875000093059    7852.4644669022691232   17288.1806760337058222     299.9652835238809985 
-     870     301.6331619894115192    1501.5829953208524330    1188.2628815714097072     394.7261166912876433    7852.5601875000202199    7852.6378180648598573   17288.1870038275774277     299.9682811831179379 
-     880     301.3703918424367316    1499.1595903074553462    1188.2195190931643083     394.3822478705861272    7852.5601874999956635    7853.0266423250832304   17288.1885967888301820     299.9755099056966401 
-     890     301.4157954313303662    1598.8758859042511631    1188.1845892608291706     394.4416643558612918    7852.5601875000065775    7853.0036606192506952   17288.1901017359487014     299.9745322513492738 
-     900     301.4752150615485675    1621.2148728756822038    1188.1517520946135846     394.5194226492019993    7852.5601874999711072    7852.9579580608560718   17288.1893203046420240     299.9733125337182287 
-     910     301.4308816315938770    1538.4823217911632582    1188.1159856659232901     394.4614066057066566    7852.5601875000002110    7853.0558695713261841   17288.1934493429580471     299.9748317405193916 
-     920     301.4323110133492492    1594.7193046491217956    1188.0835779842032025     394.4632771371357762    7852.5601875000202199    7853.0942701464364291   17288.2013127677964803     299.9751127806911200 
-     930     301.4801256941950101    1387.6885377097617038    1188.0464206196895702     394.5258488489681099    7852.5601875000229484    7853.0656502842994087   17288.1981072529815719     299.9740698440909910 
-     940     301.8075611840245074    1534.2487040663793323    1188.0124217312886685     394.9543406584059539    7852.5601874999701977    7852.6729444202819650   17288.1998943099461030     299.9660570413493588 
-     950     301.6915970126173647    1567.7725992489238251    1187.9790455470049437     394.8025864986412898    7852.5601875000274958    7852.8619557087595240   17288.2037752544347313     299.9694678653150959 
-     960     301.6392594677008105    1504.8502165144939227    1187.9439133338105421     394.7340960325207675    7852.5601874999711072    7852.9728807988849439   17288.2110776651898050     299.9711546356286362 
-     970     301.6049535791644303    1514.0198965433548892    1187.9094123369413865     394.6892023276233772    7852.5601874999765641    7853.0497909819878259   17288.2085931465298927     299.9722547114341751 
-     980     301.2982841679705643    1634.1208149125807267    1187.8768454876480973     394.2878856256063500    7852.5601874999856591    7853.4862008383515786   17288.2111194515891839     299.9802110109069986 
-     990     301.2573007350166563    1489.7316698898257528    1187.8432331161868660     394.2342534877078606    7852.5601875000047585    7853.5840096862748396   17288.2216837901723920     299.9819468620868292 
-    1000     301.3195135766228532    1562.6587211933920116    1187.8034267774903583     394.3156670604516307    7852.5601874999356369    7853.5372636956635688   17288.2165450335414789     299.9807651637231629 
-Loop time of 21.3308 on 1 procs for 1000 steps with 10125 atoms
+       0     301.4391322267262012    1636.1776395935080473    1188.6488072196075336     394.4722035796053206       0.0000000000000000   15705.1203749972210062   17288.2413857964347699     299.9999999999841407 
+      10     301.4791572483523510    1486.4422375141214161    1188.7147620806101713     394.5245815119678241       0.0000000000000000   15704.9333817333845218   17288.1727253259632562     299.9960221120699089 
+      20     301.4275643919337995    1677.9356110821622678    1188.7839634625399867     394.4570655673389865      -0.0000000000000000   15704.9313726932996360   17288.1724017231790640     299.9955485734552667 
+      30     301.2240988054542186    1452.7304951528922174    1188.8550809767796181     394.1908044563202225      -0.0000000000000000   15705.1281541239713988   17288.1740395570705005     299.9988968405209562 
+      40     301.1023506886409109    1527.9758363521384581    1188.9264527568634549     394.0314812537677085      -0.0000000000000000   15705.2176639573335706   17288.1755979679655866     300.0001694462812907 
+      50     301.0409654880461972    1597.1737251233505503    1188.9944523606984603     393.9511507566391515      -0.0000000000000000   15705.2302422249904339   17288.1758453423281026     299.9999653064982112 
+      60     301.2904978886138565    1610.8630327676828529    1189.0651026961211301     394.2776962691255562      -0.0000000000000000   15704.8336863976528548   17288.1764853628992569     299.9919857290491905 
+      70     300.8575037843164068    1489.3259312130892340    1189.1295686642290548     393.7110673208617300       0.0000000000000000   15705.3309057198275696   17288.1715417049199459     300.0010992278232607 
+      80     300.5955830326474825    1449.3896097889576140    1189.1880764967559116     393.3683100440913449      -0.0000000000000000   15705.6086113882302016   17288.1649979290777992     300.0059513551502164 
+      90     301.0092332775843147    1553.9266324350371633    1189.2470037925056658     393.9096250433288446      -0.0000000000000000   15705.0053942113881931   17288.1620230472217372     299.9940347326859182 
+     100     301.0478004479094238    1539.2270336322201274    1189.3010269201699884     393.9600951881690207      -0.0000000000000000   15704.9018111045588739   17288.1629332128977694     299.9916385566916119 
+     110     300.9609384905550655    1500.0429484565015628    1189.3524514939088021     393.8464250502818231      -0.0000000000000000   15704.9716855356964516   17288.1705620798857126     299.9925626482006464 
+     120     300.9625536631413070    1630.5065919443020448    1189.4006029528841282     393.8485387131116795       0.0000000000000000   15704.9202685123345873   17288.1694101783286897     299.9911580775880680 
+     130     301.0373750247309772    1539.2267307640188392    1189.4426173625224692     393.9464521696794463      -0.0000000000000000   15704.7780263310032751   17288.1670958632057591     299.9879581026650044 
+     140     300.7465104415114183    1550.8353679735089372    1189.4887352231000932     393.5658181350790983       0.0000000000000000   15705.1161457332873397   17288.1706990914681228     299.9939749909034958 
+     150     300.6667173911142186    1634.8987162883267956    1189.5368575067818711     393.4613985788390096       0.0000000000000000   15705.1681543015274656   17288.1664103871480620     299.9946423938894213 
+     160     300.4684731724561857    1462.9400882126797114    1189.5825022927965620     393.2019703048678139       0.0000000000000000   15705.3867062980680203   17288.1711788957327371     299.9983600613422254 
+     170     300.1439323338466920    1510.2352578813547552    1189.6305700279476696     392.7772665220106774      -0.0000000000000000   15705.7611546046609874   17288.1689911546200165     300.0051118582463232 
+     180     300.1074244553407766    1529.6307083879964921    1189.6764977580119194     392.7294912276225318      -0.0000000000000000   15705.7649384723172261   17288.1709274579516205     300.0047089238623812 
+     190     300.4193298066088573    1546.3205495807169427    1189.7172820166242673     393.1376598363698349       0.0000000000000000   15705.3063729379555298   17288.1613147909483814     299.9954451643527022 
+     200     300.3353919251508728    1532.5496449337249487    1189.7600175880224924     393.0278162310690391      -0.0000000000000000   15705.3708964914076205   17288.1587303105006868     299.9962707550172922 
+     210     300.3276568499739483    1504.8178651700850423    1189.7998299597820733     393.0176938818990493       0.0000000000000000   15705.3412005200552812   17288.1587243617359491     299.9953436245502871 
+     220     300.5768315696972195    1592.5896084568353217    1189.8391466344739911     393.3437713226065284      -0.0000000000000000   15704.9807449702821032   17288.1636629273634753     299.9880321846658262 
+     230     300.6587445618569063    1672.3049358942282652    1189.8766340798690635     393.4509650976162334       0.0000000000000000   15704.8335074687693123   17288.1611066462537565     299.9848228571169102 
+     240     300.7517707836825025    1527.1722267937814195    1189.9126240081131982     393.5727019751183207      -0.0000000000000000   15704.6762557172896777   17288.1615817005222198     299.9814952182625802 
+     250     300.8473715548367409    1589.1847713095232848    1189.9441342461948352     393.6978079843565865       0.0000000000000000   15704.5227722798481409   17288.1647145103997900     299.9782210858571148 
+     260     300.8450266408959806    1623.1896863377055524    1189.9636161513917614     393.6947393603110186       0.0000000000000000   15704.5073703474117792   17288.1657258591149002     299.9775302202894522 
+     270     300.6663619570710466    1564.5160171187892502    1189.9764081239700317     393.4609334472908699       0.0000000000000000   15704.7310151116998895   17288.1683566829597112     299.9812899253167302 
+     280     300.7668534205727155    1618.5400526904256822    1189.9872008155405183     393.5924395618275184       0.0000000000000000   15704.5873443533891987   17288.1669847307566670     299.9781169783825590 
+     290     300.8462727198648281    1562.6765776748138705    1189.9918265985252219     393.6963700162681334       0.0000000000000000   15704.4791647084566648   17288.1673613232487696     299.9756806168042544 
+     300     300.8095414073812890    1525.1785808192844343    1189.9873922767767453     393.6483023295390922       0.0000000000000000   15704.5259176693853078   17288.1616122757004632     299.9761279889731327 
+     310     300.9496330741349652    1566.5597234051326723    1189.9752299662607129     393.8316304464933637       0.0000000000000000   15704.3499992189717887   17288.1568596317265474     299.9723726900589327 
+     320     301.2370566356514132    1513.6869483705036146    1189.9626455872523820     394.2077614578672069       0.0000000000000000   15703.9850341706151085   17288.1554412157347542     299.9650543775107394 
+     330     301.3279721508969260    1549.0667862452526151    1189.9513389477854162     394.3267362020338282       0.0000000000000000   15703.8731830581982649   17288.1512582080176799     299.9625537201162615 
+     340     301.1145736537582707    1414.7930515101757010    1189.9408691169962822     394.0474765890398885       0.0000000000000000   15704.1630721074998291   17288.1514178135366819     299.9677356565827040 
+     350     301.1651600907369470    1529.8016115175894356    1189.9314470205474663     394.1136755032910628       0.0000000000000000   15704.1043292268568621   17288.1494517506944248     299.9662576716459625 
+     360     301.0550563185083206    1536.7721716375513097    1189.9200519814730796     393.9695904359920178       0.0000000000000000   15704.2703084691693221   17288.1599508866347605     299.9690811750866146 
+     370     301.1008976932965311    1522.3385843459491298    1189.9109162496640693     394.0295798208944689       0.0000000000000000   15704.2205298306434997   17288.1610259012013557     299.9677565060027860 
+     380     301.1656898730701073    1505.0548721701995873    1189.9005648244356053     394.1143687921909873      -0.0000000000000000   15704.1418702597857191   17288.1568038764125959     299.9659906785157091 
+     390     300.8379322662877371    1740.9151205755633782    1189.8851457594089425     393.6854554509391164      -0.0000000000000000   15704.5870739109432179   17288.1576751212924137     299.9741278188614046 
+     400     300.8663790447545239    1564.9461156870302148    1189.8690133470406636     393.7226817503371308       0.0000000000000000   15704.5645667319495260   17288.1562618293282867     299.9732593416576947 
+     410     300.6263441860637045    1564.2840871092375892    1189.8566574093874806     393.4085650033035222      -0.0000000000000000   15704.8886366703736712   17288.1538590830641624     299.9792095875053519 
+     420     300.5302259436973031    1438.1569922368769312    1189.8406936554461026     393.2827818158640412       0.0000000000000000   15705.0297950433650840   17288.1532705146746594     299.9815165752024768 
+     430     300.5877786105221503    1503.3641639033021420    1189.8251514530136319     393.3580969454445153      -0.0000000000000000   15704.9625248558968451   17288.1457732543567545     299.9798346272512504 
+     440     300.7289160804472772    1689.2527029957295781    1189.8035410609209066     393.5427936314976591      -0.0000000000000000   15704.8038337415237038   17288.1501684339418716     299.9764596782894728 
+     450     300.9487198282456006    1497.3668092174784761    1189.7808137689632986     393.8304353457918978      -0.0000000000000000   15704.5390198927143501   17288.1502690074703423     299.9710227473042323 
+     460     300.9359942496024019    1625.1573864018473614    1189.7615359247631659     393.8137822755281263       0.0000000000000000   15704.5767067783035600   17288.1520249785935448     299.9713565393225849 
+     470     301.0000133856357252    1486.1561922844020955    1189.7439269526958014     393.8975596188205941       0.0000000000000000   15704.5163199572089070   17288.1578065287249046     299.9697143418395058 
+     480     300.8568627175958454    1535.6080526199100404    1189.7237810071803779     393.7102284019064200      -0.0000000000000000   15704.7298885727686866   17288.1638979818562802     299.9732503057675785 
+     490     301.0608040775520067    1497.3221544489890675    1189.7062242497640909     393.9771121242308709      -0.0000000000000000   15704.4860863739140768   17288.1694227479092660     299.9682362511933889 
+     500     301.0232592587148019    1517.5854528541185573    1189.6911287485863795     393.9279798589197981      -0.0000000000000000   15704.5425100510510674   17288.1616186585561081     299.9690333355832195 
+     510     300.7038579923685120    1420.2615974401142012    1189.6747661513456933     393.5100018730125839      -0.0000000000000000   15704.9716744568013382   17288.1564424811585923     299.9768186576548032 
+     520     300.5917863355052759    1537.4862082427125642    1189.6604754398761088     393.3633415734188361      -0.0000000000000000   15705.1390892093895673   17288.1629062226857059     299.9795694302102902 
+     530     300.4751352158504574    1481.1071694751785799    1189.6453243069920518     393.2106884527693751      -0.0000000000000000   15705.3053530714041699   17288.1613658311653126     299.9823181268525900 
+     540     300.5380123640739498    1547.3461372766387285    1189.6261485232855648     393.2929713568877332       0.0000000000000000   15705.2452458598490921   17288.1643657400236407     299.9808112190538623 
+     550     300.4253885005187499    1544.3485889749688340    1189.6033595464525661     393.1455884232119047       0.0000000000000000   15705.4200593467012368   17288.1690073163663328     299.9835860164698147 
+     560     300.3263552442091395    1556.5150300058239736    1189.5759163336820166     393.0159905619271399       0.0000000000000000   15705.5750488783432957   17288.1669557739514858     299.9861837797674298 
+     570     300.1977324643196994    1511.2320626303924200    1189.5441090918316149     392.8476709710408272       0.0000000000000000   15705.7700134401693504   17288.1617935030408262     299.9896761688500533 
+     580     300.3543631005173893    1588.9566243200420104    1189.5094471319723652     393.0526424747489500      -0.0000000000000000   15705.5976430422142585   17288.1597326489354600     299.9859298211932810 
+     590     300.5019108864805730    1504.4406939723210144    1189.4809412920112663     393.2457278908070748      -0.0000000000000000   15705.4306152855297114   17288.1572844683469157     299.9823573257918952 
+     600     300.4791158523048011    1540.4690749004137160    1189.4551948503108179     393.2158976318902432       0.0000000000000000   15705.4914114063831221   17288.1625038885831600     299.9832002920041418 
+     610     300.5939139841890437    1368.0565839211083130    1189.4252547652597514     393.3661258776945715       0.0000000000000000   15705.3732852337052464   17288.1646658766585460     299.9807742697515209 
+     620     300.7674247480806002    1483.2566452708929319    1189.3941250938437406     393.5931872179773450       0.0000000000000000   15705.1789842209145718   17288.1662965327341226     299.9766963671719395 
+     630     300.7920034341022415    1543.0699124130630935    1189.3598279316649950     393.6253516166883628      -0.0000000000000000   15705.1821846865786938   17288.1673642349305737     299.9762538437231001 
+     640     300.8032734267029014    1423.2549819291609765    1189.3293074476887341     393.6400998638143278      -0.0000000000000000   15705.1986701098048798   17288.1680774213091354     299.9762118202993975 
+     650     300.7516995878240209    1542.6559695158514387    1189.3021161045703593     393.5726088061028349       0.0000000000000000   15705.2963824473390559   17288.1711073580117954     299.9775656396504360 
+     660     300.8699697098108459    1675.5121937767842155    1189.2687179804192965     393.7273806013012063       0.0000000000000000   15705.1781614686860848   17288.1742600504076108     299.9750492262035095 
+     670     301.0255004186899441    1520.7397686587889893    1189.2284265783694082     393.9309127074436105       0.0000000000000000   15705.0194154727287241   17288.1787547585408902     299.9715123049731460 
+     680     301.1071983488761248    1651.9751417063253029    1189.1858967311388824     394.0378250459656897       0.0000000000000000   15704.9584701329349627   17288.1821919100402738     299.9699481289110281 
+     690     301.0027086454255141    1496.1607274163641250    1189.1436949551202815     393.9010867158522160       0.0000000000000000   15705.1390813360922039   17288.1838630070633371     299.9731939774292755 
+     700     300.9009090279178622    1551.8182127127668082    1189.0993919251338866     393.7678687121206735      -0.0000000000000000   15705.3115540452217829   17288.1788146824765136     299.9761043445070641 
+     710     301.2325536720837817    1678.1546953970841969    1189.0528341066981284     394.2018687459686817       0.0000000000000000   15704.9235173995584773   17288.1782202522263105     299.9683013583346565 
+     720     301.2122298224125529    1524.1415452491437463    1189.0046957644283339     394.1752723525083866       0.0000000000000000   15704.9953504895402148   17288.1753186064779584     299.9693315350040734 
+     730     301.0763282392692304    1547.1987029633176007    1188.9602551214045434     393.9974275034455218       0.0000000000000000   15705.2119928705469647   17288.1696754953954951     299.9732715774840699 
+     740     301.3262401480515109    1544.7045314021493141    1188.9131307177485724     394.3244696516559884       0.0000000000000000   15704.9296076272603386   17288.1672079966665478     299.9674666811455950 
+     750     301.5740779122830872    1591.1785078054849691    1188.8637580645940943     394.6487975126887591       0.0000000000000000   15704.6521404470349808   17288.1646960243160720     299.9616008527092959 
+     760     301.4385361878655658    1547.3218422039212783    1188.8113669183098864     394.4714235854451658       0.0000000000000000   15704.8763786124927719   17288.1591691162466304     299.9656339783693966 
+     770     301.6110125684815557    1494.5039561806624988    1188.7581685915934031     394.6971313010441236       0.0000000000000000   15704.6953595579507237   17288.1506594505881367     299.9619855799396646 
+     780     301.8360352039435384    1588.1458619705304045    1188.7039178696477393     394.9916026067776329       0.0000000000000000   15704.4617070838321524   17288.1572275602593436     299.9572350302976247 
+     790     302.1008324754310479    1545.4409171812180830    1188.6491103416560691     395.3381241828382144       0.0000000000000000   15704.1751923936917592   17288.1624269181847922     299.9513959104630771 
+     800     301.9660372380565718    1563.9565804790738639    1188.5964649891604950     395.1617271307158035       0.0000000000000000   15704.4063124560707365   17288.1645045759469212     299.9555810527747326 
+     810     302.0507207347627059    1511.4560763489960209    1188.5468477146607711     395.2725464702810427       0.0000000000000000   15704.3505979898400255   17288.1699921747822373     299.9541551776507617 
+     820     302.4700213214913447    1458.5135514273563331    1188.4981381693974072     395.8212556746476025       0.0000000000000000   15703.8537761962070363   17288.1731700402524439     299.9441803241177809 
+     830     302.2853997979336214    1496.2544527963145811    1188.4496917372191547     395.5796544641873993       0.0000000000000000   15704.1464516793694202   17288.1757978807763720     299.9494768794834840 
+     840     302.0840465730901201    1518.8301331998702608    1188.3994383226179252     395.3161576523596636       0.0000000000000000   15704.4564021812439023   17288.1719981562200701     299.9550476592922337 
+     850     301.8910942560260082    1469.8827850510904227    1188.3489956121347859     395.0636545180259986       0.0000000000000000   15704.7627679631386854   17288.1754180932985037     299.9606927700136794 
+     860     301.7284384160518016    1657.6802015862315329    1188.3052233777652873     394.8507982536592635       0.0000000000000000   15705.0246544022065791   17288.1806760336330626     299.9652835238807711 
+     870     301.6331619894114624    1501.5829953208508414    1188.2628815714099346     394.7261166912875865       0.0000000000000000   15705.1980055648327834   17288.1870038275301340     299.9682811831179947 
+     880     301.3703918424367316    1499.1595903074555736    1188.2195190931643083     394.3822478705861272       0.0000000000000000   15705.5868298250898079   17288.1885967888410960     299.9755099056964127 
+     890     301.4157954313303662    1598.8758859042509357    1188.1845892608291706     394.4416643558612918       0.0000000000000000   15705.5638481192290783   17288.1901017359195976     299.9745322513492738 
+     900     301.4752150615486812    1621.2148728756842502    1188.1517520946144941     394.5194226492021699       0.0000000000000000   15705.5181455608308170   17288.1893203046492999     299.9733125337182287 
+     910     301.4308816315937634    1538.4823217911621214    1188.1159856659228353     394.4614066057064861       0.0000000000000000   15705.6160570713091147   17288.1934493429398572     299.9748317405192779 
+     920     301.4323110133492492    1594.7193046491240693    1188.0835779842032025     394.4632771371357762       0.0000000000000000   15705.6544576464475540   17288.2013127677855664     299.9751127806913473 
+     930     301.4801256941949532    1387.6885377097596574    1188.0464206196900250     394.5258488489680531       0.0000000000000000   15705.6258377843460039   17288.1981072530033998     299.9740698440912183 
+     940     301.8075611840245074    1534.2487040663797870    1188.0124217312888959     394.9543406584059539       0.0000000000000000   15705.2331319202457962   17288.1998943099388271     299.9660570413491882 
+     950     301.6915970126175353    1567.7725992489226883    1187.9790455470049437     394.8025864986415172       0.0000000000000000   15705.4221432087451831   17288.2037752543910756     299.9694678653152096 
+     960     301.6392594677008105    1504.8502165144939227    1187.9439133338107695     394.7340960325207675       0.0000000000000000   15705.5330682989206252   17288.2110776652516506     299.9711546356285226 
+     970     301.6049535791644871    1514.0198965433535250    1187.9094123369409317     394.6892023276234909       0.0000000000000000   15705.6099784820144123   17288.2085931465771864     299.9722547114341751 
+     980     301.2982841679706780    1634.1208149125800446    1187.8768454876478700     394.2878856256065205       0.0000000000000000   15706.0463883383199573   17288.2111194515746320     299.9802110109068849 
+     990     301.2573007350166563    1489.7316698898262075    1187.8432331161866387     394.2342534877078606       0.0000000000000000   15706.1441971863041545   17288.2216837901978579     299.9819468620868292 
+    1000     301.3195135766228532    1562.6587211933931485    1187.8034267774903583     394.3156670604516307       0.0000000000000000   15706.0974511956701463   17288.2165450336106005     299.9807651637235040 
+Loop time of 17.0881 on 1 procs for 1000 steps with 10125 atoms
 
-Performance: 4.050 ns/day, 5.925 hours/ns, 46.880 timesteps/s
-99.8% CPU use with 1 MPI tasks x no OpenMP threads
+Performance: 5.056 ns/day, 4.747 hours/ns, 58.520 timesteps/s
+100.0% CPU use with 1 MPI tasks x no OpenMP threads
 
 MPI task timing breakdown:
 Section |  min time  |  avg time  |  max time  |%varavg| %total
 ---------------------------------------------------------------
-Pair    | 10.099     | 10.099     | 10.099     |   0.0 | 47.34
-Neigh   | 10.145     | 10.145     | 10.145     |   0.0 | 47.56
-Comm    | 0.49807    | 0.49807    | 0.49807    |   0.0 |  2.33
-Output  | 0.011203   | 0.011203   | 0.011203   |   0.0 |  0.05
-Modify  | 0.28296    | 0.28296    | 0.28296    |   0.0 |  1.33
-Other   |            | 0.295      |            |       |  1.38
+Pair    | 8.0541     | 8.0541     | 8.0541     |   0.0 | 47.13
+Neigh   | 8.1306     | 8.1306     | 8.1306     |   0.0 | 47.58
+Comm    | 0.39415    | 0.39415    | 0.39415    |   0.0 |  2.31
+Output  | 0.01103    | 0.01103    | 0.01103    |   0.0 |  0.06
+Modify  | 0.24061    | 0.24061    | 0.24061    |   0.0 |  1.41
+Other   |            | 0.2576     |            |       |  1.51
 
 Nlocal:    10125 ave 10125 max 10125 min
 Histogram: 1 0 0 0 0 0 0 0 0 0
@@ -170,4 +174,4 @@ Total # of neighbors = 114682
 Ave neighs/atom = 11.3266
 Neighbor list builds = 1000
 Dangerous builds not checked
-Total wall time: 0:00:21
+Total wall time: 0:00:17
diff --git a/examples/USER/dpd/dpdrx-shardlow/in.dpdrx-shardlow b/examples/USER/dpd/dpdrx-shardlow/in.dpdrx-shardlow
index e65b5a14db..815c974741 100755
--- a/examples/USER/dpd/dpdrx-shardlow/in.dpdrx-shardlow
+++ b/examples/USER/dpd/dpdrx-shardlow/in.dpdrx-shardlow
@@ -37,7 +37,7 @@ timestep        0.001
 
 pair_style      hybrid/overlay dpd/fdt/energy 16.00 234324 exp6/rx 16.00
 pair_coeff      * * dpd/fdt/energy 0.0 0.05 10.0 16.00
-pair_coeff      * * exp6/rx params.exp6 1fluid 1fluid 1.0 1.0 16.00
+pair_coeff      * * exp6/rx params.exp6 1fluid 1fluid exponent 1.0 1.0 16.00
 
 fix             1 all shardlow
 fix             2 all nve
diff --git a/examples/USER/dpd/dpdrx-shardlow/log.dpdrx-shardlow.reference b/examples/USER/dpd/dpdrx-shardlow/log.dpdrx-shardlow.reference
index 067708154a..b80e033eb9 100644
--- a/examples/USER/dpd/dpdrx-shardlow/log.dpdrx-shardlow.reference
+++ b/examples/USER/dpd/dpdrx-shardlow/log.dpdrx-shardlow.reference
@@ -48,7 +48,7 @@ timestep        0.001
 
 pair_style      hybrid/overlay dpd/fdt/energy 16.00 234324 exp6/rx 16.00
 pair_coeff      * * dpd/fdt/energy 0.0 0.05 10.0 16.00
-pair_coeff      * * exp6/rx params.exp6 1fluid 1fluid 1.0 1.0 16.00
+pair_coeff      * * exp6/rx params.exp6 1fluid 1fluid exponent 1.0 1.0 16.00
 
 fix             1 all shardlow
 fix             2 all nve
@@ -69,39 +69,51 @@ dump_modify     2 sort id
 
 run             10
 Neighbor list info ...
-  2 neighbor list requests
   update every 1 steps, delay 10 steps, check yes
   max neighbors/atom: 2000, page size: 100000
   master list distance cutoff = 18
   ghost atom cutoff = 18
-  binsize = 9 -> bins = 8 8 5
-Memory usage per processor = 6.52436 Mbytes
+  binsize = 9, bins = 8 8 5
+  3 neighbor lists, perpetual/occasional/extra = 3 0 0
+  (1) pair dpd/fdt/energy, perpetual
+      pair build: half/bin/newton
+      stencil: half/bin/3d/newton
+      bin: standard
+  (2) pair exp6/rx, perpetual, copy from (1)
+      pair build: copy
+      stencil: none
+      bin: none
+  (3) fix shardlow, perpetual, ssa
+      pair build: half/bin/newton/ssa
+      stencil: half/bin/3d/newton/ssa
+      bin: ssa
+Memory usage per processor = 8.39564 Mbytes
 Step Temp Press Volume PotEng KinEng c_dpdU[1] c_dpdU[2] c_dpdU[3] v_totEnergy c_dpdU[4] 
-       0   2065.00000000   1368.17463335 179834.51777865      0.00000000    230.35385810   3841.42393279   3841.42393279      0.00000000   7682.84786557   2065.00000000 
-       1   2064.93210437   1368.12964881 179834.51777865      0.00000000    230.34628424   3841.42393279   3841.43150665      0.00000000   7682.85543943   2065.20275230 
-       2   2067.82089565   1370.04362990 179834.51777865     -0.00000000    230.66853326   3841.42393279   3841.10925763      0.00000000   7682.53319042   2065.32453473 
-       3   2070.45225169   1371.78704616 179834.51777865     -0.00000000    230.96206499   3841.42393279   3840.81572590      0.00000000   7682.23965869   2065.45336917 
-       4   2075.00241157   1374.80177416 179834.51777865     -0.00000000    231.46964217   3841.42393279   3840.30814872      0.00000000   7681.73208151   2065.52973333 
-       5   2073.96509212   1374.11449370 179834.51777865     -0.00000000    231.35392762   3841.42393279   3840.42386327      0.00000000   7681.84779605   2065.76011517 
-       6   2074.26516936   1374.31331117 179834.51777865     -0.00000000    231.38740169   3841.42393279   3840.39038920      0.00000000   7681.81432198   2065.95399323 
-       7   2071.41069700   1372.42206822 179834.51777865     -0.00000000    231.06898100   3841.42393279   3840.70880989      0.00000000   7682.13274267   2066.23407076 
-       8   2071.35844957   1372.38745146 179834.51777865     -0.00000000    231.06315272   3841.42393279   3840.71463817      0.00000000   7682.13857095   2066.43766287 
-       9   2071.35676496   1372.38633532 179834.51777865     -0.00000000    231.06296480   3841.42393279   3840.71482609      0.00000000   7682.13875887   2066.64001166 
-      10   2066.53172340   1369.18948328 179834.51777865     -0.00000000    230.52472415   3841.42393279   3841.25306673      0.00000000   7682.67699952   2066.97516855 
-Loop time of 0.289778 on 1 procs for 10 steps with 864 atoms
+       0   2065.00000000   1368.17463335 179834.51777865      0.00000000    230.35385810      0.00000000   7682.84786557      0.00000000   7682.84786557   2065.00000000 
+       1   2064.93210437   1368.12964881 179834.51777865      0.00000000    230.34628424      0.00000000   7682.85543943      0.00000000   7682.85543943   2065.20275230 
+       2   2067.82089565   1370.04362990 179834.51777865     -0.00000000    230.66853326      0.00000000   7682.53319042      0.00000000   7682.53319042   2065.32453473 
+       3   2070.45225169   1371.78704616 179834.51777865     -0.00000000    230.96206499      0.00000000   7682.23965869      0.00000000   7682.23965869   2065.45336917 
+       4   2075.00241157   1374.80177416 179834.51777865     -0.00000000    231.46964217      0.00000000   7681.73208151      0.00000000   7681.73208151   2065.52973333 
+       5   2073.96509212   1374.11449370 179834.51777865     -0.00000000    231.35392762     -0.00000000   7681.84779605      0.00000000   7681.84779605   2065.76011517 
+       6   2074.26516936   1374.31331117 179834.51777865     -0.00000000    231.38740169     -0.00000000   7681.81432198      0.00000000   7681.81432198   2065.95399323 
+       7   2071.41069700   1372.42206822 179834.51777865     -0.00000000    231.06898100     -0.00000000   7682.13274267      0.00000000   7682.13274267   2066.23407076 
+       8   2071.35844957   1372.38745146 179834.51777865     -0.00000000    231.06315272      0.00000000   7682.13857095      0.00000000   7682.13857095   2066.43766287 
+       9   2071.35676496   1372.38633532 179834.51777865     -0.00000000    231.06296480      0.00000000   7682.13875887      0.00000000   7682.13875887   2066.64001166 
+      10   2066.53172340   1369.18948328 179834.51777865     -0.00000000    230.52472415      0.00000000   7682.67699952      0.00000000   7682.67699952   2066.97516855 
+Loop time of 0.611304 on 1 procs for 10 steps with 864 atoms
 
-Performance: 2.982 ns/day, 8.049 hours/ns, 34.509 timesteps/s
-99.4% CPU use with 1 MPI tasks x no OpenMP threads
+Performance: 1.413 ns/day, 16.981 hours/ns, 16.358 timesteps/s
+98.2% CPU use with 1 MPI tasks x no OpenMP threads
 
 MPI task timing breakdown:
 Section |  min time  |  avg time  |  max time  |%varavg| %total
 ---------------------------------------------------------------
-Pair    | 0.16405    | 0.16405    | 0.16405    |   0.0 | 56.61
+Pair    | 0.34177    | 0.34177    | 0.34177    |   0.0 | 55.91
 Neigh   | 0          | 0          | 0          |   0.0 |  0.00
-Comm    | 0.00066328 | 0.00066328 | 0.00066328 |   0.0 |  0.23
-Output  | 0.037718   | 0.037718   | 0.037718   |   0.0 | 13.02
-Modify  | 0.087281   | 0.087281   | 0.087281   |   0.0 | 30.12
-Other   |            | 7.057e-05  |            |       |  0.02
+Comm    | 0.0013342  | 0.0013342  | 0.0013342  |   0.0 |  0.22
+Output  | 0.083583   | 0.083583   | 0.083583   |   0.0 | 13.67
+Modify  | 0.18451    | 0.18451    | 0.18451    |   0.0 | 30.18
+Other   |            | 0.0001087  |            |       |  0.02
 
 Nlocal:    864 ave 864 max 864 min
 Histogram: 1 0 0 0 0 0 0 0 0 0
diff --git a/src/KOKKOS/pair_exp6_rx_kokkos.cpp b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
index 559948067d..ce3b547435 100644
--- a/src/KOKKOS/pair_exp6_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
@@ -936,32 +936,32 @@ void PairExp6rxKokkos<DeviceType>::getParamsEXP6(int id,double &epsilon1,double
 
     // Fuchslin-Like Exp-6 Scaling
     double powfuch = 0.0;
-    if(fuchslinEpsilon < 0.0){
-      powfuch = pow(nTotalOFA,-fuchslinEpsilon);
+    if(exponentEpsilon < 0.0){
+      powfuch = pow(nTotalOFA,-exponentEpsilon);
       if(powfuch<1e-15) epsilon1 = 0.0;
       else epsilon1 *= 1.0/powfuch;
 
-      powfuch = pow(nTotalOFA_old,-fuchslinEpsilon);
+      powfuch = pow(nTotalOFA_old,-exponentEpsilon);
       if(powfuch<1e-15) epsilon1_old = 0.0;
       else epsilon1_old *= 1.0/powfuch;
 
     } else {
-      epsilon1 *= pow(nTotalOFA,fuchslinEpsilon);
-      epsilon1_old *= pow(nTotalOFA_old,fuchslinEpsilon);
+      epsilon1 *= pow(nTotalOFA,exponentEpsilon);
+      epsilon1_old *= pow(nTotalOFA_old,exponentEpsilon);
     }
 
-    if(fuchslinR < 0.0){
-      powfuch = pow(nTotalOFA,-fuchslinR);
+    if(exponentR < 0.0){
+      powfuch = pow(nTotalOFA,-exponentR);
       if(powfuch<1e-15) rm1 = 0.0;
       else rm1 *= 1.0/powfuch;
 
-      powfuch = pow(nTotalOFA_old,-fuchslinR);
+      powfuch = pow(nTotalOFA_old,-exponentR);
       if(powfuch<1e-15) rm1_old = 0.0;
       else rm1_old *= 1.0/powfuch;
 
     } else {
-      rm1 *= pow(nTotalOFA,fuchslinR);
-      rm1_old *= pow(nTotalOFA_old,fuchslinR);
+      rm1 *= pow(nTotalOFA,exponentR);
+      rm1_old *= pow(nTotalOFA_old,exponentR);
     }
   }
 
@@ -990,32 +990,32 @@ void PairExp6rxKokkos<DeviceType>::getParamsEXP6(int id,double &epsilon1,double
 
     // Fuchslin-Like Exp-6 Scaling
     double powfuch = 0.0;
-    if(fuchslinEpsilon < 0.0){
-      powfuch = pow(nTotalOFA,-fuchslinEpsilon);
+    if(exponentEpsilon < 0.0){
+      powfuch = pow(nTotalOFA,-exponentEpsilon);
       if(powfuch<1e-15) epsilon2 = 0.0;
       else epsilon2 *= 1.0/powfuch;
 
-      powfuch = pow(nTotalOFA_old,-fuchslinEpsilon);
+      powfuch = pow(nTotalOFA_old,-exponentEpsilon);
       if(powfuch<1e-15) epsilon2_old = 0.0;
       else epsilon2_old *= 1.0/powfuch;
 
     } else {
-      epsilon2 *= pow(nTotalOFA,fuchslinEpsilon);
-      epsilon2_old *= pow(nTotalOFA_old,fuchslinEpsilon);
+      epsilon2 *= pow(nTotalOFA,exponentEpsilon);
+      epsilon2_old *= pow(nTotalOFA_old,exponentEpsilon);
     }
 
-    if(fuchslinR < 0.0){
-      powfuch = pow(nTotalOFA,-fuchslinR);
+    if(exponentR < 0.0){
+      powfuch = pow(nTotalOFA,-exponentR);
       if(powfuch<1e-15) rm2 = 0.0;
       else rm2 *= 1.0/powfuch;
 
-      powfuch = pow(nTotalOFA_old,-fuchslinR);
+      powfuch = pow(nTotalOFA_old,-exponentR);
       if(powfuch<1e-15) rm2_old = 0.0;
       else rm2_old *= 1.0/powfuch;
 
     } else {
-      rm2 *= pow(nTotalOFA,fuchslinR);
-      rm2_old *= pow(nTotalOFA_old,fuchslinR);
+      rm2 *= pow(nTotalOFA,exponentR);
+      rm2_old *= pow(nTotalOFA_old,exponentR);
     }
   }
 
diff --git a/src/USER-DPD/fix_eos_table_rx.cpp b/src/USER-DPD/fix_eos_table_rx.cpp
index 91ccc8475e..52b1930c1c 100644
--- a/src/USER-DPD/fix_eos_table_rx.cpp
+++ b/src/USER-DPD/fix_eos_table_rx.cpp
@@ -28,6 +28,12 @@
 
 #define MAXLINE 1024
 
+#ifdef DBL_EPSILON  // MY_EPSILON: numerical tolerance equal to ten times the double epsilon
+  #define MY_EPSILON (10.0*DBL_EPSILON)  // preferred: scale the DBL_EPSILON macro when it is visible here
+#else  // NOTE(review): reached when DBL_EPSILON is undefined, presumably because <cfloat>/<float.h> is not included -- confirm
+  #define MY_EPSILON (10.0*2.220446049250313e-16)  // fallback: same scaling applied to a hard-coded epsilon literal
+#endif
+
 using namespace LAMMPS_NS;
 using namespace FixConst;
 
@@ -37,17 +43,18 @@ FixEOStableRX::FixEOStableRX(LAMMPS *lmp, int narg, char **arg) :
   Fix(lmp, narg, arg), ntables(0), tables(NULL),
   tables2(NULL), dHf(NULL), eosSpecies(NULL)
 {
-  if (narg != 8) error->all(FLERR,"Illegal fix eos/table/rx command");
+  if (narg != 8 && narg != 10) error->all(FLERR,"Illegal fix eos/table/rx command");
   restart_peratom = 1;
   nevery = 1;
 
-  bool rx_flag = false;
+  rx_flag = false;
+  nspecies = 1;
   for (int i = 0; i < modify->nfix; i++)
-    if (strncmp(modify->fix[i]->style,"rx",2) == 0) rx_flag = true;
-  if (!rx_flag) error->all(FLERR,"FixEOStableRX requires a fix rx command.");
-
-  nspecies = atom->nspecies_dpd;
-  if(nspecies==0) error->all(FLERR,"There are no rx species specified.");
+    if (strncmp(modify->fix[i]->style,"rx",2) == 0){
+      rx_flag = true;
+      nspecies = atom->nspecies_dpd;
+      if(nspecies==0) error->all(FLERR,"There are no rx species specified.");
+    }
 
   if (strcmp(arg[3],"linear") == 0) tabstyle = LINEAR;
   else error->all(FLERR,"Unknown table style in fix eos/table/rx");
@@ -113,8 +120,25 @@ FixEOStableRX::FixEOStableRX(LAMMPS *lmp, int narg, char **arg) :
     ntables++;
   }
 
-  // Read the Formation Enthalpies
-  read_file(arg[7]);
+  // Read the Formation Enthalpies and Correction Coefficients
+  dHf = new double[nspecies];
+  energyCorr = new double[nspecies];
+  tempCorrCoeff = new double[nspecies];
+  moleculeCorrCoeff= new double[nspecies];
+  for (int ii=0; ii<nspecies; ii++){
+    dHf[ii] = 0.0;
+    energyCorr[ii] = 0.0;
+    tempCorrCoeff[ii] = 0.0;
+    moleculeCorrCoeff[ii] = 0.0;
+  }
+
+  if(rx_flag) read_file(arg[7]);
+  else dHf[0] = atof(arg[7]);
+
+  if(narg==10){
+    energyCorr[0] = atof(arg[8]);
+    tempCorrCoeff[0] = atof(arg[9]);
+  }
 
   comm_forward = 3;
   comm_reverse = 2;
@@ -138,6 +162,9 @@ FixEOStableRX::~FixEOStableRX()
 
   delete [] dHf;
   delete [] eosSpecies;
+  delete [] energyCorr;
+  delete [] tempCorrCoeff;
+  delete [] moleculeCorrCoeff;
 }
 
 /* ---------------------------------------------------------------------- */
@@ -271,10 +298,9 @@ void FixEOStableRX::end_of_step()
 
 void FixEOStableRX::read_file(char *file)
 {
-  int params_per_line = 2;
-  char **words = new char*[params_per_line+1];
-
-  dHf = new double[nspecies];
+  int min_params_per_line = 2;
+  int max_params_per_line = 5;
+  char **words = new char*[max_params_per_line+1];
 
   // open file on proc 0
 
@@ -315,7 +341,7 @@ void FixEOStableRX::read_file(char *file)
 
     // concatenate additional lines until have params_per_line words
 
-    while (nwords < params_per_line) {
+    while (nwords < min_params_per_line) {
       n = strlen(line);
       if (comm->me == 0) {
         ptr = fgets(&line[n],MAXLINE-n,fp);
@@ -332,7 +358,7 @@ void FixEOStableRX::read_file(char *file)
       nwords = atom->count_words(line);
     }
 
-    if (nwords != params_per_line)
+    if (nwords != min_params_per_line && nwords != max_params_per_line)
       error->all(FLERR,"Incorrect format in eos table/rx potential file");
 
     // words = ptrs to all words in line
@@ -344,8 +370,14 @@ void FixEOStableRX::read_file(char *file)
     for (ispecies = 0; ispecies < nspecies; ispecies++)
       if (strcmp(words[0],&atom->dname[ispecies][0]) == 0) break;
 
-    if (ispecies < nspecies)
+    if (ispecies < nspecies){
       dHf[ispecies] = atof(words[1]);
+      if(nwords > min_params_per_line+1){
+        energyCorr[ispecies] = atof(words[2]);
+        tempCorrCoeff[ispecies] = atof(words[3]);
+        moleculeCorrCoeff[ispecies] = atof(words[4]);
+      }
+    }
   }
 
   delete [] words;
@@ -547,27 +579,33 @@ void FixEOStableRX::param_extract(Table *tb, char *line)
     error->one(FLERR,"Invalid keyword in fix eos/table/rx parameters");
   word = strtok(NULL," \t\n\r\f");
 
-  while (word) {
-    for (ispecies = 0; ispecies < nspecies; ispecies++)
-      if (strcmp(word,&atom->dname[ispecies][0]) == 0){
-        eosSpecies[ncolumn] =  ispecies;
-        ncolumn++;
-        break;
+  if(rx_flag){  // a fix rx is present: each remaining word must name a known species
+    while (word) {
+      for (ispecies = 0; ispecies < nspecies; ispecies++)
+        if (strcmp(word,&atom->dname[ispecies][0]) == 0){
+          eosSpecies[ncolumn] =  ispecies;  // record which species this column refers to
+          ncolumn++;
+          break;
+        }
+      if (ispecies == nspecies){  // search loop finished without a match
+        printf("name=%s not found in species list\n",word);
+        error->one(FLERR,"Invalid keyword in fix eos/table/rx parameters");
       }
-    if (ispecies == nspecies){
-      printf("name=%s not found in species list\n",word);
-      error->one(FLERR,"Invalid keyword in fix eos/table/rx parameters");
+      word = strtok(NULL," \t\n\r\f");  // advance to the next whitespace-delimited word
     }
-    word = strtok(NULL," \t\n\r\f");
+
+    for (int icolumn = 0; icolumn < ncolumn; icolumn++)  // every recorded column must hold a species index
+      if(eosSpecies[icolumn]==-1)
+        error->one(FLERR,"EOS data is missing from fix eos/table/rx table");
+    if(ncolumn != nspecies){  // exactly one column is required per species
+      printf("ncolumns=%d nspecies=%d\n",ncolumn,nspecies);
+      error->one(FLERR,"The number of columns in fix eos/table/rx does not match the number of species");
+    }
+  } else {  // no fix rx: a single column mapped to species index 0
+    eosSpecies[0] = 0;
+    ncolumn++;
   }
 
-  for (int icolumn = 0; icolumn < ncolumn; icolumn++)
-    if(eosSpecies[icolumn]==-1)
-      error->one(FLERR,"EOS data is missing from fix eos/table/rx tabe");
-  if(ncolumn != nspecies){
-    printf("ncolumns=%d nspecies=%d\n",ncolumn,nspecies);
-    error->one(FLERR,"The number of columns in fix eos/table/rx does not match the number of species");
-  }
   if (tb->ninput == 0) error->one(FLERR,"fix eos/table/rx parameters did not set N");
 
 }
@@ -655,11 +693,27 @@ double FixEOStableRX::splint(double *xa, double *ya, double *y2a, int n, double
 
 void FixEOStableRX::energy_lookup(int id, double thetai, double &ui)
 {
-  int itable;
-  double fraction, uTmp, nTotal;
+  int itable, nPG;
+  double fraction, uTmp, nMolecules, nTotal, nTotalPG;
+  double tolerance = 1.0e-10;
 
   ui = 0.0;
   nTotal = 0.0;
+  nTotalPG = 0.0;
+  nPG = 0;
+
+  if(rx_flag){  // a fix rx is present: accumulate per-species totals for particle id
+    for(int ispecies=0;ispecies<nspecies;ispecies++){
+      nTotal += atom->dvector[ispecies][id];  // running sum over all species
+      if(fabs(moleculeCorrCoeff[ispecies]) > tolerance){  // coefficient magnitude above tolerance, i.e. treated as nonzero
+        nPG++;  // number of species that carry a molecule correction
+        nTotalPG += atom->dvector[ispecies][id];  // sum restricted to those species
+      }
+    }
+  } else {  // no fix rx: the total is fixed at one
+    nTotal = 1.0;
+  }
+
   for(int ispecies=0;ispecies<nspecies;ispecies++){
     Table *tb = &tables[ispecies];
     thetai = MAX(thetai,tb->lo);
@@ -671,9 +725,13 @@ void FixEOStableRX::energy_lookup(int id, double thetai, double &ui)
       uTmp = tb->e[itable] + fraction*tb->de[itable];
 
       uTmp += dHf[ispecies];
-      // mol fraction form:
-      ui += atom->dvector[ispecies][id]*uTmp;
-      nTotal += atom->dvector[ispecies][id];
+      uTmp += tempCorrCoeff[ispecies]*thetai; // temperature correction
+      uTmp += energyCorr[ispecies]; // energy correction
+      if(nPG > 0) ui += moleculeCorrCoeff[ispecies]*nTotalPG/double(nPG); // molecule correction
+
+      if(rx_flag) nMolecules = atom->dvector[ispecies][id];
+      else nMolecules = 1.0;
+      ui += nMolecules*uTmp;
     }
   }
   ui = ui - double(nTotal+1.5)*force->boltz*thetai;
@@ -692,6 +750,7 @@ void FixEOStableRX::temperature_lookup(int id, double ui, double &thetai)
   double maxit = 100;
   double temp;
   double delta = 0.001;
+  double tolerance = 1.0e-10;
 
   // Store the current thetai in t1
   t1 = MAX(thetai,tb->lo);
@@ -715,7 +774,7 @@ void FixEOStableRX::temperature_lookup(int id, double ui, double &thetai)
 
   // Apply the Secant Method
   for(it=0; it<maxit; it++){
-    if(fabs(f2-f1)<1e-15){
+    if(fabs(f2-f1) < MY_EPSILON){
       if(isnan(f1) || isnan(f2)) error->one(FLERR,"NaN detected in secant solver.");
       temp = t1;
       temp = MAX(temp,tb->lo);
@@ -726,7 +785,7 @@ void FixEOStableRX::temperature_lookup(int id, double ui, double &thetai)
       break;
     }
     temp = t2 - f2*(t2-t1)/(f2-f1);
-    if(fabs(temp-t2) < 1e-6) break;
+    if(fabs(temp-t2) < tolerance) break;
     f1 = f2;
     t1 = t2;
     t2 = temp;
diff --git a/src/USER-DPD/fix_eos_table_rx.h b/src/USER-DPD/fix_eos_table_rx.h
index 078cf1e2e1..8c26d133a5 100644
--- a/src/USER-DPD/fix_eos_table_rx.h
+++ b/src/USER-DPD/fix_eos_table_rx.h
@@ -67,7 +67,7 @@ class FixEOStableRX : public Fix {
 
   void read_file(char *);
 
-  double *dHf;
+  double *dHf,*energyCorr,*tempCorrCoeff,*moleculeCorrCoeff;
 
   int pack_reverse_comm(int, int, double *);
   void unpack_reverse_comm(int, int *, double *);
@@ -76,6 +76,7 @@ class FixEOStableRX : public Fix {
 
   int *eosSpecies;
   int ncolumn;
+  bool rx_flag;
   };
 }
 
diff --git a/src/USER-DPD/fix_rx.cpp b/src/USER-DPD/fix_rx.cpp
index 0bd560b241..b7330ba1ef 100644
--- a/src/USER-DPD/fix_rx.cpp
+++ b/src/USER-DPD/fix_rx.cpp
@@ -45,6 +45,12 @@ enum{LUCY};
 #define MAXLINE 1024
 #define DELTA 4
 
+#ifdef DBL_EPSILON  // only visible here if <cfloat>/<float.h> was included above
+  #define MY_EPSILON (10.0*DBL_EPSILON)  // tolerance used for near-zero tests in this file
+#else
+  #define MY_EPSILON (10.0*2.220446049250313e-16)  // same factor applied to a literal fallback
+#endif
+
 #define SparseKinetics_enableIntegralReactions (true)
 #define SparseKinetics_invalidIndex (-1)
 
@@ -693,7 +699,6 @@ void FixRX::pre_force(int vflag)
   int *mask = atom->mask;
   double *dpdTheta = atom->dpdTheta;
   int newton_pair = force->newton_pair;
-  int ii;
   double theta;
 
   if(localTempFlag){
@@ -996,9 +1001,9 @@ void FixRX::rk4(int id, double *rwork)
 
   // Store the solution back in atom->dvector.
   for (int ispecies = 0; ispecies < nspecies; ispecies++){
-    if(y[ispecies] < -1.0e-10)
-      error->one(FLERR,"Computed concentration in RK4 solver is < -1.0e-10");
-    else if(y[ispecies] < 1e-15)
+    if(y[ispecies] < -MY_EPSILON)
+      error->one(FLERR,"Computed concentration in RK4 solver is < -10*DBL_EPSILON");
+    else if(y[ispecies] < MY_EPSILON)
       y[ispecies] = 0.0;
     atom->dvector[ispecies][id] = y[ispecies];
   }
@@ -1515,7 +1520,7 @@ void FixRX::rkf45(int id, double *rwork)
   for (int ispecies = 0; ispecies < nspecies; ispecies++){
     if(y[ispecies] < -1.0e-10)
       error->one(FLERR,"Computed concentration in RKF45 solver is < -1.0e-10");
-    else if(y[ispecies] < 1e-20)
+    else if(y[ispecies] < MY_EPSILON)
       y[ispecies] = 0.0;
     atom->dvector[ispecies][id] = y[ispecies];
   }
diff --git a/src/USER-DPD/pair_exp6_rx.cpp b/src/USER-DPD/pair_exp6_rx.cpp
index 202e0bf654..87a283179c 100644
--- a/src/USER-DPD/pair_exp6_rx.cpp
+++ b/src/USER-DPD/pair_exp6_rx.cpp
@@ -35,6 +35,12 @@ using namespace MathSpecial;
 #define MAXLINE 1024
 #define DELTA 4
 
+#ifdef DBL_EPSILON  // only visible here if <cfloat>/<float.h> was included above
+  #define MY_EPSILON (10.0*DBL_EPSILON)  // tolerance used for near-zero tests in this file
+#else
+  #define MY_EPSILON (10.0*2.220446049250313e-16)  // same factor applied to a literal fallback
+#endif
+
 #define oneFluidApproxParameter (-1)
 #define isOneFluidApprox(_site) ( (_site) == oneFluidApproxParameter )
 
@@ -47,17 +53,17 @@ using namespace MathSpecial;
 struct PairExp6ParamDataType
 {
    int n;
-   double *epsilon1, *alpha1, *rm1, *fraction1,
-          *epsilon2, *alpha2, *rm2, *fraction2,
-          *epsilonOld1, *alphaOld1, *rmOld1, *fractionOld1,
-          *epsilonOld2, *alphaOld2, *rmOld2, *fractionOld2;
+   double *epsilon1, *alpha1, *rm1, *mixWtSite1,              // site 1; compute() fills entry i via getMixingWeights(i,...)
+          *epsilon2, *alpha2, *rm2, *mixWtSite2,              // site 2, same layout
+          *epsilonOld1, *alphaOld1, *rmOld1, *mixWtSite1old,  // site 1, "old" counterparts
+          *epsilonOld2, *alphaOld2, *rmOld2, *mixWtSite2old;  // site 2, "old" counterparts
 
    // Default constructor -- nullify everything.
    PairExp6ParamDataType(void)
-      : n(0), epsilon1(NULL), alpha1(NULL), rm1(NULL), fraction1(NULL),
-              epsilon2(NULL), alpha2(NULL), rm2(NULL), fraction2(NULL),
-              epsilonOld1(NULL), alphaOld1(NULL), rmOld1(NULL), fractionOld1(NULL),
-              epsilonOld2(NULL), alphaOld2(NULL), rmOld2(NULL), fractionOld2(NULL)
+      : n(0), epsilon1(NULL), alpha1(NULL), rm1(NULL), mixWtSite1(NULL),
+              epsilon2(NULL), alpha2(NULL), rm2(NULL), mixWtSite2(NULL),
+              epsilonOld1(NULL), alphaOld1(NULL), rmOld1(NULL), mixWtSite1old(NULL),
+              epsilonOld2(NULL), alphaOld2(NULL), rmOld2(NULL), mixWtSite2old(NULL)
    {}
 };
 
@@ -71,6 +77,7 @@ PairExp6rx::PairExp6rx(LAMMPS *lmp) : Pair(lmp)
   nparams = maxparam = 0;
   params = NULL;
   mol2param = NULL;
+  fractionalWeighting = true; scalingFlag = NONE; coeffAlpha = coeffEps = coeffRm = NULL;  // the destructor tests scalingFlag even if coeff() never ran
 }
 
 /* ---------------------------------------------------------------------- */
@@ -93,6 +100,11 @@ PairExp6rx::~PairExp6rx()
     memory->destroy(cutsq);
     memory->destroy(cut);
   }
+  if(scalingFlag == POLYNOMIAL){
+    memory->destroy(coeffAlpha);
+    memory->destroy(coeffEps);
+    memory->destroy(coeffRm);
+  }
 }
 
 /* ---------------------------------------------------------------------- */
@@ -134,10 +146,10 @@ void PairExp6rx::compute(int eflag, int vflag)
   double epsilon2_j,alpha2_j,rm2_j;
   double evdwlOldEXP6_12, evdwlOldEXP6_21, fpairOldEXP6_12, fpairOldEXP6_21;
   double evdwlEXP6_12, evdwlEXP6_21;
-  double fractionOld1_i, fractionOld1_j;
-  double fractionOld2_i, fractionOld2_j;
-  double fraction1_i, fraction1_j;
-  double fraction2_i, fraction2_j;
+  double mixWtSite1old_i, mixWtSite1old_j;
+  double mixWtSite2old_i, mixWtSite2old_j;
+  double mixWtSite1_i, mixWtSite1_j;
+  double mixWtSite2_i, mixWtSite2_j;
   double *uCG = atom->uCG;
   double *uCGnew = atom->uCGnew;
 
@@ -157,38 +169,38 @@ void PairExp6rx::compute(int eflag, int vflag)
      memory->create( PairExp6ParamData.epsilon1     , np_total, "PairExp6ParamData.epsilon1");
      memory->create( PairExp6ParamData.alpha1       , np_total, "PairExp6ParamData.alpha1");
      memory->create( PairExp6ParamData.rm1          , np_total, "PairExp6ParamData.rm1");
-     memory->create( PairExp6ParamData.fraction1    , np_total, "PairExp6ParamData.fraction1");
+     memory->create( PairExp6ParamData.mixWtSite1    , np_total, "PairExp6ParamData.mixWtSite1");
      memory->create( PairExp6ParamData.epsilon2     , np_total, "PairExp6ParamData.epsilon2");
      memory->create( PairExp6ParamData.alpha2       , np_total, "PairExp6ParamData.alpha2");
      memory->create( PairExp6ParamData.rm2          , np_total, "PairExp6ParamData.rm2");
-     memory->create( PairExp6ParamData.fraction2    , np_total, "PairExp6ParamData.fraction2");
+     memory->create( PairExp6ParamData.mixWtSite2    , np_total, "PairExp6ParamData.mixWtSite2");
      memory->create( PairExp6ParamData.epsilonOld1  , np_total, "PairExp6ParamData.epsilonOld1");
      memory->create( PairExp6ParamData.alphaOld1    , np_total, "PairExp6ParamData.alphaOld1");
      memory->create( PairExp6ParamData.rmOld1       , np_total, "PairExp6ParamData.rmOld1");
-     memory->create( PairExp6ParamData.fractionOld1 , np_total, "PairExp6ParamData.fractionOld1");
+     memory->create( PairExp6ParamData.mixWtSite1old , np_total, "PairExp6ParamData.mixWtSite1old");
      memory->create( PairExp6ParamData.epsilonOld2  , np_total, "PairExp6ParamData.epsilonOld2");
      memory->create( PairExp6ParamData.alphaOld2    , np_total, "PairExp6ParamData.alphaOld2");
      memory->create( PairExp6ParamData.rmOld2       , np_total, "PairExp6ParamData.rmOld2");
-     memory->create( PairExp6ParamData.fractionOld2 , np_total, "PairExp6ParamData.fractionOld2");
+     memory->create( PairExp6ParamData.mixWtSite2old , np_total, "PairExp6ParamData.mixWtSite2old");
 
      for (i = 0; i < np_total; ++i)
      {
-        getParamsEXP6 (i, PairExp6ParamData.epsilon1[i],
+        getMixingWeights (i, PairExp6ParamData.epsilon1[i],
                           PairExp6ParamData.alpha1[i],
                           PairExp6ParamData.rm1[i],
-                          PairExp6ParamData.fraction1[i],
+                          PairExp6ParamData.mixWtSite1[i],
                           PairExp6ParamData.epsilon2[i],
                           PairExp6ParamData.alpha2[i],
                           PairExp6ParamData.rm2[i],
-                          PairExp6ParamData.fraction2[i],
+                          PairExp6ParamData.mixWtSite2[i],
                           PairExp6ParamData.epsilonOld1[i],
                           PairExp6ParamData.alphaOld1[i],
                           PairExp6ParamData.rmOld1[i],
-                          PairExp6ParamData.fractionOld1[i],
+                          PairExp6ParamData.mixWtSite1old[i],
                           PairExp6ParamData.epsilonOld2[i],
                           PairExp6ParamData.alphaOld2[i],
                           PairExp6ParamData.rmOld2[i],
-                          PairExp6ParamData.fractionOld2[i]);
+                          PairExp6ParamData.mixWtSite2old[i]);
      }
   }
 
@@ -212,19 +224,19 @@ void PairExp6rx::compute(int eflag, int vflag)
        epsilon1_i     = PairExp6ParamData.epsilon1[i];
        alpha1_i       = PairExp6ParamData.alpha1[i];
        rm1_i          = PairExp6ParamData.rm1[i];
-       fraction1_i    = PairExp6ParamData.fraction1[i];
+       mixWtSite1_i    = PairExp6ParamData.mixWtSite1[i];
        epsilon2_i     = PairExp6ParamData.epsilon2[i];
        alpha2_i       = PairExp6ParamData.alpha2[i];
        rm2_i          = PairExp6ParamData.rm2[i];
-       fraction2_i    = PairExp6ParamData.fraction2[i];
+       mixWtSite2_i    = PairExp6ParamData.mixWtSite2[i];
        epsilonOld1_i  = PairExp6ParamData.epsilonOld1[i];
        alphaOld1_i    = PairExp6ParamData.alphaOld1[i];
        rmOld1_i       = PairExp6ParamData.rmOld1[i];
-       fractionOld1_i = PairExp6ParamData.fractionOld1[i];
+       mixWtSite1old_i = PairExp6ParamData.mixWtSite1old[i];
        epsilonOld2_i  = PairExp6ParamData.epsilonOld2[i];
        alphaOld2_i    = PairExp6ParamData.alphaOld2[i];
        rmOld2_i       = PairExp6ParamData.rmOld2[i];
-       fractionOld2_i = PairExp6ParamData.fractionOld2[i];
+       mixWtSite2old_i = PairExp6ParamData.mixWtSite2old[i];
     }
 
     for (jj = 0; jj < jnum; jj++) {
@@ -259,19 +271,19 @@ void PairExp6rx::compute(int eflag, int vflag)
            epsilon1_j     = PairExp6ParamData.epsilon1[j];
            alpha1_j       = PairExp6ParamData.alpha1[j];
            rm1_j          = PairExp6ParamData.rm1[j];
-           fraction1_j    = PairExp6ParamData.fraction1[j];
+           mixWtSite1_j    = PairExp6ParamData.mixWtSite1[j];
            epsilon2_j     = PairExp6ParamData.epsilon2[j];
            alpha2_j       = PairExp6ParamData.alpha2[j];
            rm2_j          = PairExp6ParamData.rm2[j];
-           fraction2_j    = PairExp6ParamData.fraction2[j];
+           mixWtSite2_j    = PairExp6ParamData.mixWtSite2[j];
            epsilonOld1_j  = PairExp6ParamData.epsilonOld1[j];
            alphaOld1_j    = PairExp6ParamData.alphaOld1[j];
            rmOld1_j       = PairExp6ParamData.rmOld1[j];
-           fractionOld1_j = PairExp6ParamData.fractionOld1[j];
+           mixWtSite1old_j = PairExp6ParamData.mixWtSite1old[j];
            epsilonOld2_j  = PairExp6ParamData.epsilonOld2[j];
            alphaOld2_j    = PairExp6ParamData.alphaOld2[j];
            rmOld2_j       = PairExp6ParamData.rmOld2[j];
-           fractionOld2_j = PairExp6ParamData.fractionOld2[j];
+           mixWtSite2old_j = PairExp6ParamData.mixWtSite2old[j];
         }
 
         // A2.  Apply Lorentz-Berthelot mixing rules for the i-j pair
@@ -372,9 +384,9 @@ void PairExp6rx::compute(int eflag, int vflag)
           }
 
           if (isite1 == isite2)
-            evdwlOld = sqrt(fractionOld1_i*fractionOld2_j)*evdwlOldEXP6_12;
+            evdwlOld = sqrt(mixWtSite1old_i*mixWtSite2old_j)*evdwlOldEXP6_12;
           else
-            evdwlOld = sqrt(fractionOld1_i*fractionOld2_j)*evdwlOldEXP6_12 + sqrt(fractionOld2_i*fractionOld1_j)*evdwlOldEXP6_21;
+            evdwlOld = sqrt(mixWtSite1old_i*mixWtSite2old_j)*evdwlOldEXP6_12 + sqrt(mixWtSite2old_i*mixWtSite1old_j)*evdwlOldEXP6_21;
 
           evdwlOld *= factor_lj;
 
@@ -455,8 +467,8 @@ void PairExp6rx::compute(int eflag, int vflag)
           //
           // Apply Mixing Rule to get the overall force for the CG pair
           //
-          if (isite1 == isite2) fpair = sqrt(fractionOld1_i*fractionOld2_j)*fpairOldEXP6_12;
-          else fpair = sqrt(fractionOld1_i*fractionOld2_j)*fpairOldEXP6_12 + sqrt(fractionOld2_i*fractionOld1_j)*fpairOldEXP6_21;
+          if (isite1 == isite2) fpair = sqrt(mixWtSite1old_i*mixWtSite2old_j)*fpairOldEXP6_12;
+          else fpair = sqrt(mixWtSite1old_i*mixWtSite2old_j)*fpairOldEXP6_12 + sqrt(mixWtSite2old_i*mixWtSite1old_j)*fpairOldEXP6_21;
 
           f[i][0] += delx*fpair;
           f[i][1] += dely*fpair;
@@ -467,8 +479,8 @@ void PairExp6rx::compute(int eflag, int vflag)
             f[j][2] -= delz*fpair;
           }
 
-          if (isite1 == isite2) evdwl = sqrt(fraction1_i*fraction2_j)*evdwlEXP6_12;
-          else evdwl = sqrt(fraction1_i*fraction2_j)*evdwlEXP6_12 + sqrt(fraction2_i*fraction1_j)*evdwlEXP6_21;
+          if (isite1 == isite2) evdwl = sqrt(mixWtSite1_i*mixWtSite2_j)*evdwlEXP6_12;
+          else evdwl = sqrt(mixWtSite1_i*mixWtSite2_j)*evdwlEXP6_12 + sqrt(mixWtSite2_i*mixWtSite1_j)*evdwlEXP6_21;
           evdwl *= factor_lj;
 
           uCGnew[i]   += 0.5*evdwl;
@@ -488,19 +500,19 @@ void PairExp6rx::compute(int eflag, int vflag)
      if (PairExp6ParamData.epsilon1    ) memory->destroy(PairExp6ParamData.epsilon1);
      if (PairExp6ParamData.alpha1      ) memory->destroy(PairExp6ParamData.alpha1);
      if (PairExp6ParamData.rm1         ) memory->destroy(PairExp6ParamData.rm1);
-     if (PairExp6ParamData.fraction1   ) memory->destroy(PairExp6ParamData.fraction1);
+     if (PairExp6ParamData.mixWtSite1   ) memory->destroy(PairExp6ParamData.mixWtSite1);
      if (PairExp6ParamData.epsilon2    ) memory->destroy(PairExp6ParamData.epsilon2);
      if (PairExp6ParamData.alpha2      ) memory->destroy(PairExp6ParamData.alpha2);
      if (PairExp6ParamData.rm2         ) memory->destroy(PairExp6ParamData.rm2);
-     if (PairExp6ParamData.fraction2   ) memory->destroy(PairExp6ParamData.fraction2);
+     if (PairExp6ParamData.mixWtSite2   ) memory->destroy(PairExp6ParamData.mixWtSite2);
      if (PairExp6ParamData.epsilonOld1 ) memory->destroy(PairExp6ParamData.epsilonOld1);
      if (PairExp6ParamData.alphaOld1   ) memory->destroy(PairExp6ParamData.alphaOld1);
      if (PairExp6ParamData.rmOld1      ) memory->destroy(PairExp6ParamData.rmOld1);
-     if (PairExp6ParamData.fractionOld1) memory->destroy(PairExp6ParamData.fractionOld1);
+     if (PairExp6ParamData.mixWtSite1old) memory->destroy(PairExp6ParamData.mixWtSite1old);
      if (PairExp6ParamData.epsilonOld2 ) memory->destroy(PairExp6ParamData.epsilonOld2);
      if (PairExp6ParamData.alphaOld2   ) memory->destroy(PairExp6ParamData.alphaOld2);
      if (PairExp6ParamData.rmOld2      ) memory->destroy(PairExp6ParamData.rmOld2);
-     if (PairExp6ParamData.fractionOld2) memory->destroy(PairExp6ParamData.fractionOld2);
+     if (PairExp6ParamData.mixWtSite2old) memory->destroy(PairExp6ParamData.mixWtSite2old);
   }
 
 }
@@ -530,10 +542,20 @@ void PairExp6rx::allocate()
 
 void PairExp6rx::settings(int narg, char **arg)
 {
-  if (narg != 1) error->all(FLERR,"Illegal pair_style command");
+  if (narg < 1) error->all(FLERR,"Illegal pair_style command");
 
   cut_global = force->numeric(FLERR,arg[0]);
 
+  // optional keywords
+
+  int iarg = 1;
+  while (iarg < narg) {
+    if (strcmp(arg[iarg],"fractional") == 0) fractionalWeighting = true;
+    else if (strcmp(arg[iarg],"molecular") == 0) fractionalWeighting = false;
+    else error->all(FLERR,"Illegal pair_style command");
+    iarg++;
+  }
+
   if (allocated) {
     int i,j;
     for (i = 1; i <= atom->ntypes; i++)
@@ -551,7 +573,7 @@ void PairExp6rx::settings(int narg, char **arg)
 
 void PairExp6rx::coeff(int narg, char **arg)
 {
-  if (narg < 7 || narg > 8) error->all(FLERR,"Incorrect args for pair coefficients");
+  if (narg < 6 || narg > 9) error->all(FLERR,"Incorrect args for pair coefficients");
 
   bool rx_flag = false;
   for (int i = 0; i < modify->nfix; i++)
@@ -628,21 +650,36 @@ void PairExp6rx::coeff(int narg, char **arg)
           params[iparam].potentialType = exp6PotentialType;
         else
           error->all(FLERR,"params[].potential type unknown");
-
-        //printf("params[%d].name= %s ispecies= %d potential= %s potentialType= %d\n", iparam, params[iparam].name, params[iparam].ispecies, params[iparam].potential, params[iparam].potentialType);
       }
   }
   delete[] site1;
   delete[] site2;
   site1 = site2 = NULL;
 
-  fuchslinR = force->numeric(FLERR,arg[5]);
-  fuchslinEpsilon = force->numeric(FLERR,arg[6]);
-
   setup();
 
   double cut_one = cut_global;
-  if (narg == 8) cut_one = force->numeric(FLERR,arg[7]);
+  if (strcmp(arg[5],"exponent") == 0){
+    if (narg < 8 || narg > 9) error->all(FLERR,"Incorrect args for pair coefficients");  // arg[6],arg[7] must exist before they are read
+    scalingFlag = EXPONENT;
+    exponentR = force->numeric(FLERR,arg[6]);
+    exponentEpsilon = force->numeric(FLERR,arg[7]);
+    if (narg == 9) cut_one = force->numeric(FLERR,arg[8]);  // optional per-pair cutoff
+  } else if (strcmp(arg[5],"polynomial") == 0){
+    if (narg < 7 || narg > 8) error->all(FLERR,"Incorrect args for pair coefficients");  // arg[6] (file name) must exist; checked before allocating
+    scalingFlag = POLYNOMIAL;
+    memory->create(coeffAlpha,6,"pair:coeffAlpha");
+    memory->create(coeffEps,6,"pair:coeffEps");
+    memory->create(coeffRm,6,"pair:coeffRm");
+    read_file2(arg[6]);
+    if (narg == 8) cut_one = force->numeric(FLERR,arg[7]);  // optional per-pair cutoff
+  } else if (strcmp(arg[5],"none") == 0){
+    scalingFlag = NONE;
+    if (narg > 7) error->all(FLERR,"Incorrect args for pair coefficients");
+    if (narg == 7) cut_one = force->numeric(FLERR,arg[6]);  // optional per-pair cutoff
+  } else {
+    error->all(FLERR,"Incorrect args for pair coefficients");
+  }
 
   int count = 0;
   for (int i = ilo; i <= ihi; i++) {
@@ -784,6 +821,95 @@ void PairExp6rx::read_file(char *file)
 
 /* ---------------------------------------------------------------------- */
 
+/* read polynomial-scaling coefficients: one row each for "alpha", "epsilon"
+   and "rm" (keyword + six numbers); proc 0 reads, each line is broadcast */
+void PairExp6rx::read_file2(char *file)
+{
+  const int params_per_line = 7;
+  char *words[params_per_line+1];   // extra slot takes strtok's terminating NULL
+  bool haveAlpha = false, haveEps = false, haveRm = false;
+  // open file on proc 0
+  FILE *fp = NULL;
+  if (comm->me == 0) {
+    fp = fopen(file,"r");
+    if (fp == NULL) {
+      char str[128];
+      snprintf(str,sizeof(str),"Cannot open polynomial file %s",file);
+      error->one(FLERR,str);
+    }
+  }
+
+  // one set of params can span multiple lines
+  int n,nwords,eof = 0;
+  char line[MAXLINE],*ptr;
+
+  while (1) {
+    if (comm->me == 0) {
+      ptr = fgets(line,MAXLINE,fp);
+      if (ptr == NULL) {
+        eof = 1;
+        fclose(fp);
+      } else n = strlen(line) + 1;
+    }
+    MPI_Bcast(&eof,1,MPI_INT,0,world);
+    if (eof) break;
+    MPI_Bcast(&n,1,MPI_INT,0,world);
+    MPI_Bcast(line,n,MPI_CHAR,0,world);
+
+    // strip comment, skip line if blank
+    if ((ptr = strchr(line,'#'))) *ptr = '\0';
+    nwords = atom->count_words(line);
+    if (nwords == 0) continue;
+
+    // concatenate additional lines until have params_per_line words
+    while (nwords < params_per_line) {
+      n = strlen(line);
+      if (comm->me == 0) {
+        ptr = fgets(&line[n],MAXLINE-n,fp);
+        if (ptr == NULL) {
+          eof = 1;
+          fclose(fp);
+        } else n = strlen(line) + 1;
+      }
+      MPI_Bcast(&eof,1,MPI_INT,0,world);
+      if (eof) break;
+      MPI_Bcast(&n,1,MPI_INT,0,world);
+      MPI_Bcast(line,n,MPI_CHAR,0,world);
+      if ((ptr = strchr(line,'#'))) *ptr = '\0';
+      nwords = atom->count_words(line);
+    }
+
+    if (nwords != params_per_line)
+      error->all(FLERR,"Incorrect format in polynomial file");
+
+    // words = ptrs to all words in line
+    nwords = 0;
+    words[nwords++] = strtok(line," \t\n\r\f");
+    while ((words[nwords++] = strtok(NULL," \t\n\r\f"))) continue;
+
+    if (strcmp(words[0],"alpha") == 0){
+      for (int ii=1; ii<params_per_line; ii++)
+        coeffAlpha[ii-1] = atof(words[ii]);
+      haveAlpha = true;
+    }
+    if (strcmp(words[0],"epsilon") == 0){
+      for (int ii=1; ii<params_per_line; ii++)
+        coeffEps[ii-1] = atof(words[ii]);
+      haveEps = true;
+    }
+    if (strcmp(words[0],"rm") == 0){
+      for (int ii=1; ii<params_per_line; ii++)
+        coeffRm[ii-1] = atof(words[ii]);
+      haveRm = true;
+    }
+  }
+  // a row that never appeared would leave its coefficient array unset
+  if (!haveAlpha || !haveEps || !haveRm)
+    error->all(FLERR,"Polynomial file is missing alpha, epsilon, or rm coefficients");
+}
+
+/* ---------------------------------------------------------------------- */
+
 void PairExp6rx::setup()
 {
   int i,j,n;
@@ -881,7 +1007,7 @@ void PairExp6rx::read_restart_settings(FILE *fp)
 
 /* ---------------------------------------------------------------------- */
 
-void PairExp6rx::getParamsEXP6(int id,double &epsilon1,double &alpha1,double &rm1, double &fraction1,double &epsilon2,double &alpha2,double &rm2,double &fraction2,double &epsilon1_old,double &alpha1_old,double &rm1_old, double &fraction1_old,double &epsilon2_old,double &alpha2_old,double &rm2_old,double &fraction2_old) const
+void PairExp6rx::getMixingWeights(int id,double &epsilon1,double &alpha1,double &rm1, double &mixWtSite1,double &epsilon2,double &alpha2,double &rm2,double &mixWtSite2,double &epsilon1_old,double &alpha1_old,double &rm1_old, double &mixWtSite1old,double &epsilon2_old,double &alpha2_old,double &rm2_old,double &mixWtSite2old) const
 {
   int iparam, jparam;
   double rmi, rmj, rmij, rm3ij;
@@ -889,11 +1015,16 @@ void PairExp6rx::getParamsEXP6(int id,double &epsilon1,double &alpha1,double &rm
   double alphai, alphaj, alphaij;
   double epsilon_old, rm3_old, alpha_old;
   double epsilon, rm3, alpha;
-  double fractionOFA, fractionOFA_old;
-  double nTotalOFA, nTotalOFA_old;
-  double nTotal, nTotal_old;
   double xMolei, xMolej, xMolei_old, xMolej_old;
 
+  double fractionOFAold, fractionOFA;
+  double fractionOld1, fraction1;
+  double fractionOld2, fraction2;
+  double nMoleculesOFAold, nMoleculesOFA;
+  double nMoleculesOld1, nMolecules1;
+  double nMoleculesOld2, nMolecules2;
+  double nTotal, nTotalOld;
+
   rm3 = 0.0;
   epsilon = 0.0;
   alpha = 0.0;
@@ -901,32 +1032,32 @@ void PairExp6rx::getParamsEXP6(int id,double &epsilon1,double &alpha1,double &rm
   rm3_old = 0.0;
   alpha_old = 0.0;
   fractionOFA = 0.0;
-  fractionOFA_old = 0.0;
-  nTotalOFA = 0.0;
-  nTotalOFA_old = 0.0;
+  fractionOFAold = 0.0;
+  nMoleculesOFA = 0.0;
+  nMoleculesOFAold = 0.0;
   nTotal = 0.0;
-  nTotal_old = 0.0;
+  nTotalOld = 0.0;
 
   // Compute the total number of molecules in the old and new CG particle as well as the total number of molecules in the fluid portion of the old and new CG particle
   for (int ispecies = 0; ispecies < nspecies; ispecies++){
     nTotal += atom->dvector[ispecies][id];
-    nTotal_old += atom->dvector[ispecies+nspecies][id];
+    nTotalOld += atom->dvector[ispecies+nspecies][id];
 
     iparam = mol2param[ispecies];
 
     if (iparam < 0 || params[iparam].potentialType != exp6PotentialType ) continue;
     if (isOneFluidApprox(isite1) || isOneFluidApprox(isite2)) {
       if (isite1 == params[iparam].ispecies || isite2 == params[iparam].ispecies) continue;
-      nTotalOFA_old += atom->dvector[ispecies+nspecies][id];
-      nTotalOFA += atom->dvector[ispecies][id];
+      nMoleculesOFAold += atom->dvector[ispecies+nspecies][id];
+      nMoleculesOFA += atom->dvector[ispecies][id];
     }
   }
-  if(nTotal < 1e-8 || nTotal_old < 1e-8)
-    error->all(FLERR,"The number of molecules in CG particle is less than 1e-8.");
+  if(nTotal < MY_EPSILON || nTotalOld < MY_EPSILON)
+    error->all(FLERR,"The number of molecules in CG particle is less than 10*DBL_EPSILON.");
 
   // Compute the mole fraction of molecules within the fluid portion of the particle (One Fluid Approximation)
-  fractionOFA_old = nTotalOFA_old / nTotal_old;
-  fractionOFA = nTotalOFA / nTotal;
+  fractionOFAold = nMoleculesOFAold / nTotalOld;
+  fractionOFA = nMoleculesOFA / nTotal;
 
   for (int ispecies = 0; ispecies < nspecies; ispecies++) {
     iparam = mol2param[ispecies];
@@ -942,8 +1073,10 @@ void PairExp6rx::getParamsEXP6(int id,double &epsilon1,double &alpha1,double &rm
       alpha1 = params[iparam].alpha;
 
       // Compute the mole fraction of Site1
-      fraction1_old = atom->dvector[ispecies+nspecies][id]/nTotal_old;
-      fraction1 = atom->dvector[ispecies][id]/nTotal;
+      nMoleculesOld1 = atom->dvector[ispecies+nspecies][id];
+      nMolecules1 = atom->dvector[ispecies][id];
+      fractionOld1 = nMoleculesOld1/nTotalOld;
+      fraction1 = nMolecules1/nTotal;
     }
 
     // If Site2 matches a pure species, then grab the parameters
@@ -956,7 +1089,9 @@ void PairExp6rx::getParamsEXP6(int id,double &epsilon1,double &alpha1,double &rm
       alpha2 = params[iparam].alpha;
 
       // Compute the mole fraction of Site2
-      fraction2_old = atom->dvector[ispecies+nspecies][id]/nTotal_old;
+      nMoleculesOld2 = atom->dvector[ispecies+nspecies][id];
+      nMolecules2 = atom->dvector[ispecies][id];
+      fractionOld2 = atom->dvector[ispecies+nspecies][id]/nTotalOld;
       fraction2 = atom->dvector[ispecies][id]/nTotal;
     }
 
@@ -966,8 +1101,10 @@ void PairExp6rx::getParamsEXP6(int id,double &epsilon1,double &alpha1,double &rm
       rmi = params[iparam].rm;
       epsiloni = params[iparam].epsilon;
       alphai = params[iparam].alpha;
-      xMolei = atom->dvector[ispecies][id]/nTotalOFA;
-      xMolei_old = atom->dvector[ispecies+nspecies][id]/nTotalOFA_old;
+      if(nMoleculesOFA<MY_EPSILON) xMolei = 0.0;
+      else xMolei = atom->dvector[ispecies][id]/nMoleculesOFA;
+      if(nMoleculesOFAold<MY_EPSILON) xMolei_old = 0.0;
+      else xMolei_old = atom->dvector[ispecies+nspecies][id]/nMoleculesOFAold;
 
       for (int jspecies = 0; jspecies < nspecies; jspecies++) {
         jparam = mol2param[jspecies];
@@ -976,15 +1113,17 @@ void PairExp6rx::getParamsEXP6(int id,double &epsilon1,double &alpha1,double &rm
         rmj = params[jparam].rm;
         epsilonj = params[jparam].epsilon;
         alphaj = params[jparam].alpha;
-        xMolej = atom->dvector[jspecies][id]/nTotalOFA;
-        xMolej_old = atom->dvector[jspecies+nspecies][id]/nTotalOFA_old;
+        if(nMoleculesOFA<MY_EPSILON) xMolej = 0.0;
+        else xMolej = atom->dvector[jspecies][id]/nMoleculesOFA;
+        if(nMoleculesOFAold<MY_EPSILON) xMolej_old = 0.0;
+        else xMolej_old = atom->dvector[jspecies+nspecies][id]/nMoleculesOFAold;
 
         rmij = (rmi+rmj)/2.0;
         rm3ij = rmij*rmij*rmij;
         epsilonij = sqrt(epsiloni*epsilonj);
         alphaij = sqrt(alphai*alphaj);
 
-        if(fractionOFA_old > 0.0){
+        if(fractionOFAold > 0.0){
           rm3_old += xMolei_old*xMolej_old*rm3ij;
           epsilon_old += xMolei_old*xMolej_old*rm3ij*epsilonij;
           alpha_old += xMolei_old*xMolej_old*rm3ij*epsilonij*alphaij;
@@ -1000,7 +1139,7 @@ void PairExp6rx::getParamsEXP6(int id,double &epsilon1,double &alpha1,double &rm
 
   if (isOneFluidApprox(isite1)){
     rm1 = cbrt(rm3);
-    if(rm1 < 1e-16) {
+    if(rm1 < MY_EPSILON) {
       rm1 = 0.0;
       epsilon1 = 0.0;
       alpha1 = 0.0;
@@ -1008,11 +1147,11 @@ void PairExp6rx::getParamsEXP6(int id,double &epsilon1,double &alpha1,double &rm
       epsilon1 = epsilon / rm3;
       alpha1 = alpha / epsilon1 / rm3;
     }
-
+    nMolecules1 = 1.0-(nTotal-nMoleculesOFA);
     fraction1 = fractionOFA;
 
     rm1_old = cbrt(rm3_old);
-    if(rm1_old < 1e-16) {
+    if(rm1_old < MY_EPSILON) {
       rm1_old = 0.0;
       epsilon1_old = 0.0;
       alpha1_old = 0.0;
@@ -1020,42 +1159,21 @@ void PairExp6rx::getParamsEXP6(int id,double &epsilon1,double &alpha1,double &rm
       epsilon1_old = epsilon_old / rm3_old;
       alpha1_old = alpha_old / epsilon1_old / rm3_old;
     }
-    fraction1_old = fractionOFA_old;
+    nMoleculesOld1 = 1.0-(nTotalOld-nMoleculesOFAold);
+    fractionOld1 = fractionOFAold;
 
-    // Fuchslin-Like Exp-6 Scaling
-    double powfuch = 0.0;
-    if(fuchslinEpsilon < 0.0){
-      powfuch = pow(nTotalOFA,-fuchslinEpsilon);
-      if(powfuch<1e-15) epsilon1 = 0.0;
-      else epsilon1 *= 1.0/powfuch;
-
-      powfuch = pow(nTotalOFA_old,-fuchslinEpsilon);
-      if(powfuch<1e-15) epsilon1_old = 0.0;
-      else epsilon1_old *= 1.0/powfuch;
-
-    } else {
-      epsilon1 *= pow(nTotalOFA,fuchslinEpsilon);
-      epsilon1_old *= pow(nTotalOFA_old,fuchslinEpsilon);
-    }
-
-    if(fuchslinR < 0.0){
-      powfuch = pow(nTotalOFA,-fuchslinR);
-      if(powfuch<1e-15) rm1 = 0.0;
-      else rm1 *= 1.0/powfuch;
-
-      powfuch = pow(nTotalOFA_old,-fuchslinR);
-      if(powfuch<1e-15) rm1_old = 0.0;
-      else rm1_old *= 1.0/powfuch;
-
-    } else {
-      rm1 *= pow(nTotalOFA,fuchslinR);
-      rm1_old *= pow(nTotalOFA_old,fuchslinR);
+    if(scalingFlag == EXPONENT){
+      exponentScaling(nMoleculesOFA,epsilon1,rm1);
+      exponentScaling(nMoleculesOFAold,epsilon1_old,rm1_old);
+    } else if(scalingFlag == POLYNOMIAL){
+      polynomialScaling(nMoleculesOFA,alpha1,epsilon1,rm1);
+      polynomialScaling(nMoleculesOFAold,alpha1_old,epsilon1_old,rm1_old);
     }
   }
 
   if (isOneFluidApprox(isite2)){
     rm2 = cbrt(rm3);
-    if(rm2 < 1e-16) {
+    if(rm2 < MY_EPSILON) {
       rm2 = 0.0;
       epsilon2 = 0.0;
       alpha2 = 0.0;
@@ -1063,10 +1181,11 @@ void PairExp6rx::getParamsEXP6(int id,double &epsilon1,double &alpha1,double &rm
       epsilon2 = epsilon / rm3;
       alpha2 = alpha / epsilon2 / rm3;
     }
+    nMolecules2 = 1.0-(nTotal-nMoleculesOFA);
     fraction2 = fractionOFA;
 
     rm2_old = cbrt(rm3_old);
-    if(rm2_old < 1e-16) {
+    if(rm2_old < MY_EPSILON) {
       rm2_old = 0.0;
       epsilon2_old = 0.0;
       alpha2_old = 0.0;
@@ -1074,64 +1193,96 @@ void PairExp6rx::getParamsEXP6(int id,double &epsilon1,double &alpha1,double &rm
       epsilon2_old = epsilon_old / rm3_old;
       alpha2_old = alpha_old / epsilon2_old / rm3_old;
     }
-    fraction2_old = fractionOFA_old;
+    nMoleculesOld2 = 1.0-(nTotalOld-nMoleculesOFAold);
+    fractionOld2 = fractionOFAold;
 
-    // Fuchslin-Like Exp-6 Scaling
-    double powfuch = 0.0;
-    if(fuchslinEpsilon < 0.0){
-      powfuch = pow(nTotalOFA,-fuchslinEpsilon);
-      if(powfuch<1e-15) epsilon2 = 0.0;
-      else epsilon2 *= 1.0/powfuch;
-
-      powfuch = pow(nTotalOFA_old,-fuchslinEpsilon);
-      if(powfuch<1e-15) epsilon2_old = 0.0;
-      else epsilon2_old *= 1.0/powfuch;
-
-    } else {
-      epsilon2 *= pow(nTotalOFA,fuchslinEpsilon);
-      epsilon2_old *= pow(nTotalOFA_old,fuchslinEpsilon);
-    }
-
-    if(fuchslinR < 0.0){
-      powfuch = pow(nTotalOFA,-fuchslinR);
-      if(powfuch<1e-15) rm2 = 0.0;
-      else rm2 *= 1.0/powfuch;
-
-      powfuch = pow(nTotalOFA_old,-fuchslinR);
-      if(powfuch<1e-15) rm2_old = 0.0;
-      else rm2_old *= 1.0/powfuch;
-
-    } else {
-      rm2 *= pow(nTotalOFA,fuchslinR);
-      rm2_old *= pow(nTotalOFA_old,fuchslinR);
+    if(scalingFlag == EXPONENT){
+      exponentScaling(nMoleculesOFA,epsilon2,rm2);
+      exponentScaling(nMoleculesOFAold,epsilon2_old,rm2_old);
+    } else if(scalingFlag == POLYNOMIAL){
+      polynomialScaling(nMoleculesOFA,alpha2,epsilon2,rm2);
+      polynomialScaling(nMoleculesOFAold,alpha2_old,epsilon2_old,rm2_old);
     }
   }
 
   // Check that no fractions are less than zero
-  if(fraction1 < 0.0){
-    if(fraction1 < -1.0e-10){
-      error->all(FLERR,"Computed fraction less than -1.0e-10");
+  if(fraction1 < 0.0 || nMolecules1 < 0.0){
+    if(fraction1 < -MY_EPSILON || nMolecules1 < -MY_EPSILON){
+      error->all(FLERR,"Computed fraction less than -10*DBL_EPSILON");
     }
+    nMolecules1 = 0.0;
     fraction1 = 0.0;
   }
-  if(fraction2 < 0.0){
-    if(fraction2 < -1.0e-10){
-      error->all(FLERR,"Computed fraction less than -1.0e-10");
+  if(fraction2 < 0.0 || nMolecules2 < 0.0){
+    if(fraction2 < -MY_EPSILON || nMolecules2 < -MY_EPSILON){
+      error->all(FLERR,"Computed fraction less than -10*DBL_EPSILON");
     }
+    nMolecules2 = 0.0;
     fraction2 = 0.0;
   }
-  if(fraction1_old < 0.0){
-    if(fraction1_old < -1.0e-10){
-      error->all(FLERR,"Computed fraction less than -1.0e-10");
+  if(fractionOld1 < 0.0 || nMoleculesOld1 < 0.0){
+    if(fractionOld1 < -MY_EPSILON || nMoleculesOld1 < -MY_EPSILON){
+      error->all(FLERR,"Computed fraction less than -10*DBL_EPSILON");
     }
-    fraction1_old = 0.0;
+    nMoleculesOld1 = 0.0;
+    fractionOld1 = 0.0;
   }
-  if(fraction2_old < 0.0){
-    if(fraction2_old < -1.0e-10){
-      error->all(FLERR,"Computed fraction less than -1.0e-10");
+  if(fractionOld2 < 0.0 || nMoleculesOld2 < 0.0){
+    if(fractionOld2 < -MY_EPSILON || nMoleculesOld2 < -MY_EPSILON){
+      error->all(FLERR,"Computed fraction less than -10*DBL_EPSILON");
     }
-    fraction2_old = 0.0;
+    nMoleculesOld2 = 0.0;
+    fractionOld2 = 0.0;
   }
+
+  if(fractionalWeighting){
+    mixWtSite1old = fractionOld1;
+    mixWtSite1 = fraction1;
+    mixWtSite2old = fractionOld2;
+    mixWtSite2 = fraction2;
+  } else {
+    mixWtSite1old = nMoleculesOld1;
+    mixWtSite1 = nMolecules1;
+    mixWtSite2old = nMoleculesOld2;
+    mixWtSite2 = nMolecules2;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+// scale epsilon by phi^exponentEpsilon and rm by phi^exponentR; with a negative
+// exponent, a positive power below MY_EPSILON zeroes the parameter instead
+void PairExp6rx::exponentScaling(double phi, double &epsilon, double &rm) const
+{
+  if (exponentEpsilon < 0.0) {
+    const double denomEps = pow(phi,-exponentEpsilon);
+    epsilon = (denomEps < MY_EPSILON) ? 0.0 : epsilon*(1.0/denomEps);
+  } else {
+    const double scaleEps = pow(phi,exponentEpsilon);
+    epsilon *= scaleEps;
+  }
+
+  if (exponentR < 0.0) {
+    const double denomRm = pow(phi,-exponentR);
+    rm = (denomRm < MY_EPSILON) ? 0.0 : rm*(1.0/denomRm);
+  } else {
+    const double scaleRm = pow(phi,exponentR);
+    rm *= scaleRm;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+// alpha is replaced by, and epsilon and rm are multiplied by, a 5th-degree polynomial in phi
+void PairExp6rx::polynomialScaling(double phi, double &alpha, double &epsilon, double &rm) const
+{
+  // pw[k] pairs with coefficient k: highest power first; coefficient 5 is the constant term
+  const double sq = phi*phi, cu = sq*phi;
+  const double pw[5] = {sq*cu, sq*sq, cu, sq, phi};
+  const double *cA = coeffAlpha, *cE = coeffEps, *cR = coeffRm;
+  alpha = (cA[0]*pw[0] + cA[1]*pw[1] + cA[2]*pw[2] + cA[3]*pw[3] + cA[4]*pw[4] + cA[5]);
+  epsilon *= (cE[0]*pw[0] + cE[1]*pw[1] + cE[2]*pw[2] + cE[3]*pw[3] + cE[4]*pw[4] + cE[5]);
+  rm *= (cR[0]*pw[0] + cR[1]*pw[1] + cR[2]*pw[2] + cR[3]*pw[3] + cR[4]*pw[4] + cR[5]);
+}
 
 /* ---------------------------------------------------------------------- */
diff --git a/src/USER-DPD/pair_exp6_rx.h b/src/USER-DPD/pair_exp6_rx.h
index 2dfc1c1a2e..a7531da318 100644
--- a/src/USER-DPD/pair_exp6_rx.h
+++ b/src/USER-DPD/pair_exp6_rx.h
@@ -47,6 +47,7 @@ class PairExp6rx : public Pair {
 
  protected:
   enum{LINEAR};
+  enum{NONE,EXPONENT,POLYNOMIAL};
   double cut_global;
   double **cut;
   double **epsilon,**rm,**alpha;
@@ -60,12 +61,18 @@ class PairExp6rx : public Pair {
 
   int nspecies;
   virtual void read_file(char *);
+  void read_file2(char *);
   void setup();
 
   int isite1, isite2;
   char *site1, *site2;
-  double fuchslinR, fuchslinEpsilon;
-  void getParamsEXP6(int, double &, double &, double &, double &, double &, double &, double &, double &, double &, double &, double &, double &, double &, double &, double &, double &) const;
+  void getMixingWeights(int, double &, double &, double &, double &, double &, double &, double &, double &, double &, double &, double &, double &, double &, double &, double &, double &) const;
+  double exponentR, exponentEpsilon;
+  int scalingFlag;
+  void exponentScaling(double, double &, double &) const;
+  void polynomialScaling(double, double &, double &, double &) const;
+  double *coeffAlpha, *coeffEps, *coeffRm;
+  bool fractionalWeighting;
 
   inline double func_rin(const double &) const;
   inline double expValue(const double) const;
@@ -129,7 +136,7 @@ E:  Potential file has duplicate entry.
 
 Self-explanatory
 
-E:  The number of molecules in CG particle is less than 1e-8.
+E:  The number of molecules in CG particle is less than 10*DBL_EPSILON.
 
 Self-explanatory.  Check the species concentrations have been properly set
 and check the reaction kinetic solver parameters in fix rx to more for
diff --git a/src/USER-DPD/pair_multi_lucy_rx.cpp b/src/USER-DPD/pair_multi_lucy_rx.cpp
index 6b5c7cf40a..f681795b0f 100644
--- a/src/USER-DPD/pair_multi_lucy_rx.cpp
+++ b/src/USER-DPD/pair_multi_lucy_rx.cpp
@@ -43,6 +43,12 @@ enum{NONE,RLINEAR,RSQ};
 
 #define MAXLINE 1024
 
+#ifdef DBL_EPSILON  // DBL_EPSILON is the <cfloat>/<float.h> macro; it is only visible here if that header was already included
+  #define MY_EPSILON (10.0*DBL_EPSILON)  // tolerance: ten double-precision machine epsilons
+#else
+  #define MY_EPSILON (10.0*2.220446049250313e-16)  // fallback with the IEEE-754 double epsilon (2^-52) written out
+#endif
+
 #define oneFluidParameter (-1)
 #define isOneFluid(_site) ( (_site) == oneFluidParameter )
 
@@ -72,6 +78,7 @@ PairMultiLucyRX::PairMultiLucyRX(LAMMPS *lmp) : Pair(lmp),
   comm_forward = 1;
   comm_reverse = 1;
 
+  fractionalWeighting = true;
 }
 
 /* ---------------------------------------------------------------------- */
@@ -114,9 +121,9 @@ void PairMultiLucyRX::compute(int eflag, int vflag)
   int nghost = atom->nghost;
   int newton_pair = force->newton_pair;
 
-  double fractionOld1_i,fractionOld1_j;
-  double fractionOld2_i,fractionOld2_j;
-  double fraction1_i;
+  double mixWtSite1old_i,mixWtSite1old_j;
+  double mixWtSite2old_i,mixWtSite2old_j;
+  double mixWtSite1_i;
   double *uCG = atom->uCG;
   double *uCGnew = atom->uCGnew;
 
@@ -126,20 +133,20 @@ void PairMultiLucyRX::compute(int eflag, int vflag)
   int jtable;
   double *rho = atom->rho;
 
-  double *fractionOld1 = NULL;
-  double *fractionOld2 = NULL;
-  double *fraction1 = NULL;
-  double *fraction2 = NULL;
+  double *mixWtSite1old = NULL;
+  double *mixWtSite2old = NULL;
+  double *mixWtSite1 = NULL;
+  double *mixWtSite2 = NULL;
 
   {
     const int ntotal = nlocal + nghost;
-    memory->create(fractionOld1, ntotal, "PairMultiLucyRX::fractionOld1");
-    memory->create(fractionOld2, ntotal, "PairMultiLucyRX::fractionOld2");
-    memory->create(fraction1, ntotal, "PairMultiLucyRX::fraction1");
-    memory->create(fraction2, ntotal, "PairMultiLucyRX::fraction2");
+    memory->create(mixWtSite1old, ntotal, "PairMultiLucyRX::mixWtSite1old");
+    memory->create(mixWtSite2old, ntotal, "PairMultiLucyRX::mixWtSite2old");
+    memory->create(mixWtSite1, ntotal, "PairMultiLucyRX::mixWtSite1");
+    memory->create(mixWtSite2, ntotal, "PairMultiLucyRX::mixWtSite2");
 
     for (int i = 0; i < ntotal; ++i)
-       getParams(i, fractionOld1[i], fractionOld2[i], fraction1[i], fraction2[i]);
+       getMixingWeights(i, mixWtSite1old[i], mixWtSite2old[i], mixWtSite1[i], mixWtSite2[i]);
   }
 
   inum = list->inum;
@@ -164,9 +171,9 @@ void PairMultiLucyRX::compute(int eflag, int vflag)
     double fy_i = 0.0;
     double fz_i = 0.0;
 
-    fractionOld1_i = fractionOld1[i];
-    fractionOld2_i = fractionOld2[i];
-    fraction1_i = fraction1[i];
+    mixWtSite1old_i = mixWtSite1old[i];
+    mixWtSite2old_i = mixWtSite2old[i];
+    mixWtSite1_i = mixWtSite1[i];
 
     for (jj = 0; jj < jnum; jj++) {
       j = jlist[jj];
@@ -181,8 +188,8 @@ void PairMultiLucyRX::compute(int eflag, int vflag)
       if (rsq < cutsq[itype][jtype]) {
         fpair = 0.0;
 
-        fractionOld1_j = fractionOld1[j];
-        fractionOld2_j = fractionOld2[j];
+        mixWtSite1old_j = mixWtSite1old[j];
+        mixWtSite2old_j = mixWtSite2old[j];
 
         tb = &tables[tabindex[itype][jtype]];
         if (rho[i]*rho[i] < tb->innersq || rho[j]*rho[j] < tb->innersq){
@@ -237,8 +244,8 @@ void PairMultiLucyRX::compute(int eflag, int vflag)
 
         } else error->one(FLERR,"Only LOOKUP and LINEAR table styles have been implemented for pair multi/lucy/rx");
 
-        if (isite1 == isite2) fpair = sqrt(fractionOld1_i*fractionOld2_j)*fpair;
-        else fpair = (sqrt(fractionOld1_i*fractionOld2_j) + sqrt(fractionOld2_i*fractionOld1_j))*fpair;
+        if (isite1 == isite2) fpair = sqrt(mixWtSite1old_i*mixWtSite2old_j)*fpair;
+        else fpair = (sqrt(mixWtSite1old_i*mixWtSite2old_j) + sqrt(mixWtSite2old_i*mixWtSite1old_j))*fpair;
 
         fx_i += delx*fpair;
         fy_i += dely*fpair;
@@ -270,8 +277,8 @@ void PairMultiLucyRX::compute(int eflag, int vflag)
     } else error->one(FLERR,"Only LOOKUP and LINEAR table styles have been implemented for pair multi/lucy/rx");
 
     evdwl *=(pi*cutsq[itype][itype]*cutsq[itype][itype])/84.0;
-    evdwlOld = fractionOld1_i*evdwl;
-    evdwl = fraction1_i*evdwl;
+    evdwlOld = mixWtSite1old_i*evdwl;
+    evdwl = mixWtSite1_i*evdwl;
 
     uCG[i] += evdwlOld;
     uCGnew[i] += evdwl;
@@ -283,10 +290,10 @@ void PairMultiLucyRX::compute(int eflag, int vflag)
 
   if (vflag_fdotr) virial_fdotr_compute();
 
-  memory->destroy(fractionOld1);
-  memory->destroy(fractionOld2);
-  memory->destroy(fraction1);
-  memory->destroy(fraction2);
+  memory->destroy(mixWtSite1old);
+  memory->destroy(mixWtSite2old);
+  memory->destroy(mixWtSite1);
+  memory->destroy(mixWtSite2);
 }
 
 /* ----------------------------------------------------------------------
@@ -313,7 +320,7 @@ void PairMultiLucyRX::allocate()
 
 void PairMultiLucyRX::settings(int narg, char **arg)
 {
-  if (narg != 2) error->all(FLERR,"Illegal pair_style command");
+  if (narg < 2) error->all(FLERR,"Illegal pair_style command");
 
   // new settings
 
@@ -324,6 +331,16 @@ void PairMultiLucyRX::settings(int narg, char **arg)
   tablength = force->inumeric(FLERR,arg[1]);
   if (tablength < 2) error->all(FLERR,"Illegal number of pair table entries");
 
+  // optional keywords
+
+  int iarg = 2;
+  while (iarg < narg) {
+    if (strcmp(arg[iarg],"fractional") == 0)   fractionalWeighting = true;
+    else if (strcmp(arg[iarg],"molecular") == 0)   fractionalWeighting = false;
+    else error->all(FLERR,"Illegal pair_style command");
+    iarg++;
+  }
+
   // delete old tables, since cannot just change settings
 
   for (int m = 0; m < ntables; m++) free_table(&tables[m]);
@@ -930,9 +947,14 @@ void PairMultiLucyRX::computeLocalDensity()
 
 /* ---------------------------------------------------------------------- */
 
-void PairMultiLucyRX::getParams(int id, double &fractionOld1, double &fractionOld2, double &fraction1, double &fraction2)
+void PairMultiLucyRX::getMixingWeights(int id, double &mixWtSite1old, double &mixWtSite2old, double &mixWtSite1, double &mixWtSite2)
 {
-  double fractionOld, fraction;
+  double fractionOFAold, fractionOFA;
+  double fractionOld1, fraction1;
+  double fractionOld2, fraction2;
+  double nMoleculesOFAold, nMoleculesOFA;
+  double nMoleculesOld1, nMolecules1;
+  double nMoleculesOld2, nMolecules2;
   double nTotal, nTotalOld;
 
   nTotal = 0.0;
@@ -943,32 +965,56 @@ void PairMultiLucyRX::getParams(int id, double &fractionOld1, double &fractionOl
   }
 
   if (isOneFluid(isite1) == false){
-    fractionOld1 = atom->dvector[isite1+nspecies][id]/nTotalOld;
-    fraction1 = atom->dvector[isite1][id]/nTotal;
+    nMoleculesOld1 = atom->dvector[isite1+nspecies][id];
+    nMolecules1 = atom->dvector[isite1][id];
+    fractionOld1 = nMoleculesOld1/nTotalOld;
+    fraction1 = nMolecules1/nTotal;
   }
   if (isOneFluid(isite2) == false){
-    fractionOld2 = atom->dvector[isite2+nspecies][id]/nTotalOld;
-    fraction2 = atom->dvector[isite2][id]/nTotal;
+    nMoleculesOld2 = atom->dvector[isite2+nspecies][id];
+    nMolecules2 = atom->dvector[isite2][id];
+    fractionOld2 = nMoleculesOld2/nTotalOld;
+    fraction2 = nMolecules2/nTotal;
   }
 
   if (isOneFluid(isite1) || isOneFluid(isite2)){
-    fractionOld  = 0.0;
-    fraction  = 0.0;
+    nMoleculesOFAold  = 0.0;
+    nMoleculesOFA  = 0.0;
+    fractionOFAold  = 0.0;
+    fractionOFA  = 0.0;
 
     for (int ispecies = 0; ispecies < nspecies; ispecies++){
       if (isite1 == ispecies || isite2 == ispecies) continue;
-      fractionOld += atom->dvector[ispecies+nspecies][id] / nTotalOld;
-      fraction += atom->dvector[ispecies][id] / nTotal;
+      nMoleculesOFAold += atom->dvector[ispecies+nspecies][id];
+      nMoleculesOFA += atom->dvector[ispecies][id];
+      fractionOFAold += atom->dvector[ispecies+nspecies][id] / nTotalOld;
+      fractionOFA += atom->dvector[ispecies][id] / nTotal;
     }
     if (isOneFluid(isite1)){
-      fractionOld1 = fractionOld;
-      fraction1 = fraction;
+      nMoleculesOld1 = 1.0-(nTotalOld-nMoleculesOFAold);
+      nMolecules1 = 1.0-(nTotal-nMoleculesOFA);
+      fractionOld1 = fractionOFAold;
+      fraction1 = fractionOFA;
     }
     if (isOneFluid(isite2)){
-      fractionOld2 = fractionOld;
-      fraction2 = fraction;
+      nMoleculesOld2 = 1.0-(nTotalOld-nMoleculesOFAold);
+      nMolecules2 = 1.0-(nTotal-nMoleculesOFA);
+      fractionOld2 = fractionOFAold;
+      fraction2 = fractionOFA;
     }
   }
+
+  if(fractionalWeighting){
+    mixWtSite1old = fractionOld1;
+    mixWtSite1 = fraction1;
+    mixWtSite2old = fractionOld2;
+    mixWtSite2 = fraction2;
+  } else {
+    mixWtSite1old = nMoleculesOld1;
+    mixWtSite1 = nMolecules1;
+    mixWtSite2old = nMoleculesOld2;
+    mixWtSite2 = nMolecules2;
+  }
 }
 
 /* ---------------------------------------------------------------------- */
diff --git a/src/USER-DPD/pair_multi_lucy_rx.h b/src/USER-DPD/pair_multi_lucy_rx.h
index 0562739c50..5975bd6ccd 100644
--- a/src/USER-DPD/pair_multi_lucy_rx.h
+++ b/src/USER-DPD/pair_multi_lucy_rx.h
@@ -78,7 +78,8 @@ class PairMultiLucyRX : public Pair {
   int nspecies;
   char *site1, *site2;
   int isite1, isite2;
-  void getParams(int, double &, double &, double &, double &);
+  void getMixingWeights(int, double &, double &, double &, double &);
+  bool fractionalWeighting;
 
 };
 
diff --git a/src/USER-DPD/pair_table_rx.cpp b/src/USER-DPD/pair_table_rx.cpp
index 463e1838c6..e3cacc6155 100644
--- a/src/USER-DPD/pair_table_rx.cpp
+++ b/src/USER-DPD/pair_table_rx.cpp
@@ -35,6 +35,12 @@ enum{NONE,RLINEAR,RSQ,BMP};
 
 #define MAXLINE 1024
 
+#ifdef DBL_EPSILON  // DBL_EPSILON is the <cfloat>/<float.h> macro; it is only visible here if that header was already included
+  #define MY_EPSILON (10.0*DBL_EPSILON)  // tolerance: ten double-precision machine epsilons
+#else
+  #define MY_EPSILON (10.0*2.220446049250313e-16)  // fallback with the IEEE-754 double epsilon (2^-52) written out
+#endif
+
 #define OneFluidValue (-1)
 #define isOneFluid(_site_) ( (_site_) == OneFluidValue )
 
@@ -44,6 +50,7 @@ PairTableRX::PairTableRX(LAMMPS *lmp) : Pair(lmp)
 {
   ntables = 0;
   tables = NULL;
+  fractionalWeighting = true;
 }
 
 /* ---------------------------------------------------------------------- */
@@ -84,21 +91,6 @@ void PairTableRX::compute(int eflag, int vflag)
   if (eflag || vflag) ev_setup(eflag,vflag);
   else evflag = vflag_fdotr = 0;
 
-  double *fractionOld1, *fractionOld2;
-  double *fraction1, *fraction2;
-
-  {
-    const int ntotal = atom->nlocal + atom->nghost;
-
-    memory->create(fractionOld1, ntotal, "PairTableRx::compute::fractionOld1");
-    memory->create(fractionOld2, ntotal, "PairTableRx::compute::fractionOld2");
-    memory->create(fraction1, ntotal, "PairTableRx::compute::fraction1");
-    memory->create(fraction2, ntotal, "PairTableRx::compute::fraction2");
-
-    for (int i = 0; i < ntotal; ++i)
-      getParams(i, fractionOld1[i], fractionOld2[i], fraction1[i], fraction2[i]);
-  }
-
   double **x = atom->x;
   double **f = atom->f;
   int *type = atom->type;
@@ -106,13 +98,29 @@ void PairTableRX::compute(int eflag, int vflag)
   double *special_lj = force->special_lj;
   int newton_pair = force->newton_pair;
 
-  double fractionOld1_i, fractionOld1_j;
-  double fractionOld2_i, fractionOld2_j;
-  double fraction1_i, fraction1_j;
-  double fraction2_i, fraction2_j;
+  double mixWtSite1old_i, mixWtSite1old_j;
+  double mixWtSite2old_i, mixWtSite2old_j;
+  double mixWtSite1_i, mixWtSite1_j;
+  double mixWtSite2_i, mixWtSite2_j;
   double *uCG = atom->uCG;
   double *uCGnew = atom->uCGnew;
 
+  double *mixWtSite1old = NULL;
+  double *mixWtSite2old = NULL;
+  double *mixWtSite1 = NULL;
+  double *mixWtSite2 = NULL;
+
+  {
+    const int ntotal = atom->nlocal + atom->nghost;
+    memory->create(mixWtSite1old, ntotal, "PairTableRx::compute::mixWtSite1old");
+    memory->create(mixWtSite2old, ntotal, "PairTableRx::compute::mixWtSite2old");
+    memory->create(mixWtSite1, ntotal, "PairTableRx::compute::mixWtSite1");
+    memory->create(mixWtSite2, ntotal, "PairTableRx::compute::mixWtSite2");
+
+    for (int i = 0; i < ntotal; ++i)
+      getMixingWeights(i, mixWtSite1old[i], mixWtSite2old[i], mixWtSite1[i], mixWtSite2[i]);
+  }
+
   inum = list->inum;
   ilist = list->ilist;
   numneigh = list->numneigh;
@@ -132,10 +140,10 @@ void PairTableRX::compute(int eflag, int vflag)
     double uCGnew_i = 0.0;
     double fx_i = 0.0, fy_i = 0.0, fz_i = 0.0;
 
-    fractionOld1_i = fractionOld1[i];
-    fractionOld2_i = fractionOld2[i];
-    fraction1_i = fraction1[i];
-    fraction2_i = fraction2[i];
+    mixWtSite1old_i = mixWtSite1old[i];
+    mixWtSite2old_i = mixWtSite2old[i];
+    mixWtSite1_i = mixWtSite1[i];
+    mixWtSite2_i = mixWtSite2[i];
 
     for (jj = 0; jj < jnum; jj++) {
       j = jlist[jj];
@@ -149,10 +157,10 @@ void PairTableRX::compute(int eflag, int vflag)
       jtype = type[j];
 
       if (rsq < cutsq[itype][jtype]) {
-        fractionOld1_j = fractionOld1[j];
-        fractionOld2_j = fractionOld2[j];
-        fraction1_j = fraction1[j];
-        fraction2_j = fraction2[j];
+        mixWtSite1old_j = mixWtSite1old[j];
+        mixWtSite2old_j = mixWtSite2old[j];
+        mixWtSite1_j = mixWtSite1[j];
+        mixWtSite2_j = mixWtSite2[j];
 
         tb = &tables[tabindex[itype][jtype]];
         if (rsq < tb->innersq)
@@ -188,8 +196,8 @@ void PairTableRX::compute(int eflag, int vflag)
           value = tb->f[itable] + fraction*tb->df[itable];
           fpair = factor_lj * value;
         }
-        if (isite1 == isite2) fpair = sqrt(fractionOld1_i*fractionOld2_j)*fpair;
-        else fpair = (sqrt(fractionOld1_i*fractionOld2_j) + sqrt(fractionOld2_i*fractionOld1_j))*fpair;
+        if (isite1 == isite2) fpair = sqrt(mixWtSite1old_i*mixWtSite2old_j)*fpair;
+        else fpair = (sqrt(mixWtSite1old_i*mixWtSite2old_j) + sqrt(mixWtSite2old_i*mixWtSite1old_j))*fpair;
 
         fx_i += delx*fpair;
         fy_i += dely*fpair;
@@ -210,11 +218,11 @@ void PairTableRX::compute(int eflag, int vflag)
             ((a*a*a-a)*tb->e2[itable] + (b*b*b-b)*tb->e2[itable+1]) *
             tb->deltasq6;
         if (isite1 == isite2){
-          evdwlOld = sqrt(fractionOld1_i*fractionOld2_j)*evdwl;
-          evdwl = sqrt(fraction1_i*fraction2_j)*evdwl;
+          evdwlOld = sqrt(mixWtSite1old_i*mixWtSite2old_j)*evdwl;
+          evdwl = sqrt(mixWtSite1_i*mixWtSite2_j)*evdwl;
         } else {
-          evdwlOld = (sqrt(fractionOld1_i*fractionOld2_j) + sqrt(fractionOld2_i*fractionOld1_j))*evdwl;
-          evdwl = (sqrt(fraction1_i*fraction2_j) + sqrt(fraction2_i*fraction1_j))*evdwl;
+          evdwlOld = (sqrt(mixWtSite1old_i*mixWtSite2old_j) + sqrt(mixWtSite2old_i*mixWtSite1old_j))*evdwl;
+          evdwl = (sqrt(mixWtSite1_i*mixWtSite2_j) + sqrt(mixWtSite2_i*mixWtSite1_j))*evdwl;
         }
         evdwlOld *= factor_lj;
         evdwl *= factor_lj;
@@ -240,10 +248,10 @@ void PairTableRX::compute(int eflag, int vflag)
   }
   if (vflag_fdotr) virial_fdotr_compute();
 
-  memory->destroy(fractionOld1);
-  memory->destroy(fractionOld2);
-  memory->destroy(fraction1);
-  memory->destroy(fraction2);
+  memory->destroy(mixWtSite1old);
+  memory->destroy(mixWtSite2old);
+  memory->destroy(mixWtSite1);
+  memory->destroy(mixWtSite2);
 }
 
 /* ----------------------------------------------------------------------
@@ -293,6 +301,8 @@ void PairTableRX::settings(int narg, char **arg)
     else if (strcmp(arg[iarg],"msm") == 0) msmflag = 1;
     else if (strcmp(arg[iarg],"dispersion") == 0) dispersionflag = 1;
     else if (strcmp(arg[iarg],"tip4p") == 0) tip4pflag = 1;
+    else if (strcmp(arg[iarg],"fractional") == 0)   fractionalWeighting = true;
+    else if (strcmp(arg[iarg],"molecular") == 0)   fractionalWeighting = false;
     else error->all(FLERR,"Illegal pair_style command");
     iarg++;
   }
@@ -1061,17 +1071,17 @@ double PairTableRX::single(int i, int j, int itype, int jtype, double rsq,
   int tlm1 = tablength - 1;
 
   Table *tb = &tables[tabindex[itype][jtype]];
-  double fraction1_i, fraction1_j;
-  double fraction2_i, fraction2_j;
-  double fractionOld1_i, fractionOld1_j;
-  double fractionOld2_i, fractionOld2_j;
+  double mixWtSite1_i, mixWtSite1_j;
+  double mixWtSite2_i, mixWtSite2_j;
+  double mixWtSite1old_i, mixWtSite1old_j;
+  double mixWtSite2old_i, mixWtSite2old_j;
 
   fraction = 0.0;
   a = 0.0;
   b = 0.0;
 
-  getParams(i,fractionOld1_i,fractionOld2_i,fraction1_i,fraction2_i);
-  getParams(j,fractionOld1_j,fractionOld2_j,fraction1_j,fraction2_j);
+  getMixingWeights(i,mixWtSite1old_i,mixWtSite2old_i,mixWtSite1_i,mixWtSite2_i);
+  getMixingWeights(j,mixWtSite1old_j,mixWtSite2old_j,mixWtSite1_j,mixWtSite2_j);
 
   if (rsq < tb->innersq) error->one(FLERR,"Pair distance < table inner cutoff");
 
@@ -1104,8 +1114,8 @@ double PairTableRX::single(int i, int j, int itype, int jtype, double rsq,
     fforce = factor_lj * value;
   }
 
-  if (isite1 == isite2) fforce = sqrt(fraction1_i*fraction2_j)*fforce;
-  else fforce = (sqrt(fraction1_i*fraction2_j) + sqrt(fraction2_i*fraction1_j))*fforce;
+  if (isite1 == isite2) fforce = sqrt(mixWtSite1_i*mixWtSite2_j)*fforce;
+  else fforce = (sqrt(mixWtSite1_i*mixWtSite2_j) + sqrt(mixWtSite2_i*mixWtSite1_j))*fforce;
 
   if (tabstyle == LOOKUP)
     phi = tb->e[itable];
@@ -1115,8 +1125,8 @@ double PairTableRX::single(int i, int j, int itype, int jtype, double rsq,
     phi = a * tb->e[itable] + b * tb->e[itable+1] +
       ((a*a*a-a)*tb->e2[itable] + (b*b*b-b)*tb->e2[itable+1]) * tb->deltasq6;
 
-  if (isite1 == isite2) phi = sqrt(fraction1_i*fraction2_j)*phi;
-  else phi = (sqrt(fraction1_i*fraction2_j) + sqrt(fraction2_i*fraction1_j))*phi;
+  if (isite1 == isite2) phi = sqrt(mixWtSite1_i*mixWtSite2_j)*phi;
+  else phi = (sqrt(mixWtSite1_i*mixWtSite2_j) + sqrt(mixWtSite2_i*mixWtSite1_j))*phi;
 
   return factor_lj*phi;
 }
@@ -1143,46 +1153,74 @@ void *PairTableRX::extract(const char *str, int &dim)
 
 /* ---------------------------------------------------------------------- */
 
-void PairTableRX::getParams(int id, double &fractionOld1, double &fractionOld2, double &fraction1, double &fraction2)
+void PairTableRX::getMixingWeights(int id, double &mixWtSite1old, double &mixWtSite2old, double &mixWtSite1, double &mixWtSite2)  // fills the four site1/site2 weights for particle id
 {
-  double nTotal = 0.0;
-  double nTotalOld = 0.0;
+  double fractionOFAold, fractionOFA;  // OFA*: sums over the species that are neither site1 nor site2
+  double fractionOld1, fraction1;
+  double fractionOld2, fraction2;
+  double nMoleculesOFAold, nMoleculesOFA;
+  double nMoleculesOld1, nMolecules1;
+  double nMoleculesOld2, nMolecules2;
+  double nTotal, nTotalOld;
+  // totals over all species: dvector rows [0,nspecies) feed nTotal, rows [nspecies,2*nspecies) feed nTotalOld
+  nTotal = 0.0;
+  nTotalOld = 0.0;
   for (int ispecies = 0; ispecies < nspecies; ++ispecies){
     nTotal += atom->dvector[ispecies][id];
     nTotalOld += atom->dvector[ispecies+nspecies][id];
   }
-  if(nTotal < 1e-8 || nTotalOld < 1e-8)
-    error->all(FLERR,"The number of molecules in CG particle is less than 1e-8.");
+  if(nTotal < MY_EPSILON || nTotalOld < MY_EPSILON)  // both totals are used as divisors below
+    error->one(FLERR,"The number of molecules in CG particle is less than 10*DBL_EPSILON.");  // per-particle, rank-local check: the collective error->all could hang if other ranks never reach it
 
   if (isOneFluid(isite1) == false){
-    fractionOld1 = atom->dvector[isite1+nspecies][id]/nTotalOld;
-    fraction1 = atom->dvector[isite1][id]/nTotal;
+    nMoleculesOld1 = atom->dvector[isite1+nspecies][id];
+    nMolecules1 = atom->dvector[isite1][id];
+    fractionOld1 = nMoleculesOld1/nTotalOld;
+    fraction1 = nMolecules1/nTotal;
   }
   if (isOneFluid(isite2) == false){
-    fractionOld2 = atom->dvector[isite2+nspecies][id]/nTotalOld;
-    fraction2 = atom->dvector[isite2][id]/nTotal;
+    nMoleculesOld2 = atom->dvector[isite2+nspecies][id];
+    nMolecules2 = atom->dvector[isite2][id];
+    fractionOld2 = nMoleculesOld2/nTotalOld;
+    fraction2 = nMolecules2/nTotal;
   }
 
   if (isOneFluid(isite1) || isOneFluid(isite2)){
-    double fractionOld  = 0.0;
-    double fraction  = 0.0;
+    nMoleculesOFAold  = 0.0;
+    nMoleculesOFA  = 0.0;
+    fractionOFAold  = 0.0;
+    fractionOFA  = 0.0;
 
     for (int ispecies = 0; ispecies < nspecies; ispecies++){
       if (isite1 == ispecies || isite2 == ispecies) continue;
-
-      fractionOld += atom->dvector[ispecies+nspecies][id]/nTotalOld;
-      fraction += atom->dvector[ispecies][id]/nTotal;
+      nMoleculesOFAold += atom->dvector[ispecies+nspecies][id];
+      nMoleculesOFA += atom->dvector[ispecies][id];
+      fractionOFAold += atom->dvector[ispecies+nspecies][id]/nTotalOld;
+      fractionOFA += atom->dvector[ispecies][id]/nTotal;
     }
-
     if(isOneFluid(isite1)){
-      fractionOld1 = fractionOld;
-      fraction1 = fraction;
+      nMoleculesOld1 = 1.0-(nTotalOld-nMoleculesOFAold);  // NOTE(review): this is 1 minus the amount held by the species skipped in the loop above, not the OFA sum itself - confirm intended
+      nMolecules1 = 1.0-(nTotal-nMoleculesOFA);
+      fractionOld1 = fractionOFAold;
+      fraction1 = fractionOFA;
     }
-
     if(isOneFluid(isite2)){
-      fractionOld2 = fractionOld;
-      fraction2 = fraction;
+      nMoleculesOld2 = 1.0-(nTotalOld-nMoleculesOFAold);
+      nMolecules2 = 1.0-(nTotal-nMoleculesOFA);
+      fractionOld2 = fractionOFAold;
+      fraction2 = fractionOFA;
     }
   }
-}
 
+  if(fractionalWeighting){  // true: weights are the fractions; false: weights are the molecule amounts
+    mixWtSite1old = fractionOld1;
+    mixWtSite1 = fraction1;
+    mixWtSite2old = fractionOld2;
+    mixWtSite2 = fraction2;
+  } else {
+    mixWtSite1old = nMoleculesOld1;
+    mixWtSite1 = nMolecules1;
+    mixWtSite2old = nMoleculesOld2;
+    mixWtSite2 = nMolecules2;
+  }
+}
diff --git a/src/USER-DPD/pair_table_rx.h b/src/USER-DPD/pair_table_rx.h
index f04ebced20..c6afe6a8d5 100644
--- a/src/USER-DPD/pair_table_rx.h
+++ b/src/USER-DPD/pair_table_rx.h
@@ -72,7 +72,8 @@ class PairTableRX : public Pair {
   int nspecies;
   char *site1, *site2;
   int isite1, isite2;
-  void getParams(int, double &, double &, double &, double &);
+  void getMixingWeights(int, double &, double &, double &, double &);
+  bool fractionalWeighting;
 
 };
 
@@ -163,7 +164,7 @@ When using pair style table with a long-range KSpace solver, the
 cutoffs for all atom type pairs must all be the same, since the
 long-range solver starts at that cutoff.
 
-E:  The number of molecules in CG particle is less than 1e-8
+E:  The number of molecules in CG particle is less than 10*DBL_EPSILON.
 
 Self-explanatory.  Check the species concentrations have been properly set
 and check the reaction kinetic solver parameters in fix rx to more for

From 2af10cb8da9b8bbac64cead53eb7cae57088ed7c Mon Sep 17 00:00:00 2001
From: Stan Moore <stanmoore1@gmail.com>
Date: Tue, 3 Jan 2017 10:09:44 -0700
Subject: [PATCH 034/267] Updating fix_eos_table_rx_kokkos to USER-DPD changes

---
 src/KOKKOS/fix_eos_table_rx_kokkos.cpp | 64 ++++++++++++++++++++++----
 src/KOKKOS/fix_eos_table_rx_kokkos.h   |  4 +-
 2 files changed, 58 insertions(+), 10 deletions(-)

diff --git a/src/KOKKOS/fix_eos_table_rx_kokkos.cpp b/src/KOKKOS/fix_eos_table_rx_kokkos.cpp
index c47923680c..aff2cdfa2d 100644
--- a/src/KOKKOS/fix_eos_table_rx_kokkos.cpp
+++ b/src/KOKKOS/fix_eos_table_rx_kokkos.cpp
@@ -29,6 +29,13 @@
 
 #define MAXLINE 1024
 
+#ifdef DBL_EPSILON  // DBL_EPSILON is the <cfloat>/<float.h> macro; it is only visible here if that header was already included
+  #define MY_EPSILON (10.0*DBL_EPSILON)  // tolerance: ten double-precision machine epsilons
+#else
+  #define MY_EPSILON (10.0*2.220446049250313e-16)  // fallback with the IEEE-754 double epsilon (2^-52) written out
+#endif
+
+
 using namespace LAMMPS_NS;
 using namespace FixConst;
 
@@ -51,11 +58,31 @@ FixEOStableRXKokkos<DeviceType>::FixEOStableRXKokkos(LAMMPS *lmp, int narg, char
   k_warning_flag = DAT::tdual_int_scalar("fix:warning_flag");
 
   k_dHf = DAT::tdual_float_1d("fix:dHf",nspecies);
-  for (int n = 0; n < nspecies; n++)
+  k_energyCorr = DAT::tdual_float_1d("fix:energyCorr",nspecies);
+  k_tempCorrCoeff = DAT::tdual_float_1d("fix:tempCorrCoeff",nspecies);
+  k_moleculeCorrCoeff = DAT::tdual_float_1d("fix:moleculeCorrCoeff",nspecies);
+  for (int n = 0; n < nspecies; n++) {
     k_dHf.h_view(n) = dHf[n];
+    k_energyCorr.h_view(n) = energyCorr[n];
+    k_tempCorrCoeff.h_view(n) = tempCorrCoeff[n];
+    k_moleculeCorrCoeff.h_view(n) = moleculeCorrCoeff[n];
+  }
+
   k_dHf.modify<LMPHostType>();
   k_dHf.sync<DeviceType>();
   d_dHf = k_dHf.view<DeviceType>();
+
+  k_energyCorr.modify<LMPHostType>();
+  k_energyCorr.sync<DeviceType>();
+  d_energyCorr = k_energyCorr.view<DeviceType>();
+
+  k_tempCorrCoeff.modify<LMPHostType>();
+  k_tempCorrCoeff.sync<DeviceType>();
+  d_tempCorrCoeff = k_tempCorrCoeff.view<DeviceType>();
+
+  k_moleculeCorrCoeff.modify<LMPHostType>();
+  k_moleculeCorrCoeff.sync<DeviceType>();
+  d_moleculeCorrCoeff = k_moleculeCorrCoeff.view<DeviceType>();
 }
 
 /* ---------------------------------------------------------------------- */
@@ -268,11 +295,27 @@ template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
 void FixEOStableRXKokkos<DeviceType>::energy_lookup(int id, double thetai, double &ui) const
 {
-  int itable;
-  double fraction, uTmp, nTotal;
+  int itable, nPG;
+  double fraction, uTmp, nMolecules, nTotal, nTotalPG;
+  double tolerance = 1.0e-10;
 
   ui = 0.0;
   nTotal = 0.0;
+  nTotalPG = 0.0;
+  nPG = 0;
+
+  if (rx_flag) {
+    for (int ispecies = 0; ispecies < nspecies; ispecies++ ) {
+      nTotal += dvector(ispecies,id);
+      if (fabs(d_moleculeCorrCoeff[ispecies]) > tolerance) {
+        nPG++;
+        nTotalPG += dvector(ispecies,id);
+      }
+    }
+  } else {
+    nTotal = 1.0;
+  }
+
   for(int ispecies=0;ispecies<nspecies;ispecies++){
     //Table *tb = &tables[ispecies];
     //thetai = MAX(thetai,tb->lo);
@@ -289,9 +332,13 @@ void FixEOStableRXKokkos<DeviceType>::energy_lookup(int id, double thetai, doubl
       uTmp = d_table_const.e(ispecies,itable) + fraction*d_table_const.de(ispecies,itable);
 
       uTmp += d_dHf[ispecies];
-      // mol fraction form:
-      ui += dvector(ispecies,id)*uTmp;
-      nTotal += dvector(ispecies,id);
+      uTmp += d_tempCorrCoeff[ispecies]*thetai; // temperature correction
+      uTmp += d_energyCorr[ispecies]; // energy correction
+      if (nPG > 0) ui += d_moleculeCorrCoeff[ispecies]*nTotalPG/double(nPG); // molecule correction
+
+      if (rx_flag) nMolecules = dvector(ispecies,id);
+      else nMolecules = 1.0;
+      ui += nMolecules*uTmp;
     }
   }
   ui = ui - double(nTotal+1.5)*boltz*thetai;
@@ -312,6 +359,7 @@ void FixEOStableRXKokkos<DeviceType>::temperature_lookup(int id, double ui, doub
   double maxit = 100;
   double temp;
   double delta = 0.001;
+  double tolerance = 1.0e-10;
   int lo = d_table_const.lo(0);
   int hi = d_table_const.hi(0);
 
@@ -337,7 +385,7 @@ void FixEOStableRXKokkos<DeviceType>::temperature_lookup(int id, double ui, doub
 
   // Apply the Secant Method
   for(it=0; it<maxit; it++){
-    if(fabs(f2-f1)<1e-15){
+    if(fabs(f2-f1) < MY_EPSILON){
       if(isnan(f1) || isnan(f2)) k_error_flag.d_view() = 2;
       temp = t1;
       temp = MAX(temp,lo);
@@ -346,7 +394,7 @@ void FixEOStableRXKokkos<DeviceType>::temperature_lookup(int id, double ui, doub
       break;
     }
     temp = t2 - f2*(t2-t1)/(f2-f1);
-    if(fabs(temp-t2) < 1e-6) break;
+    if(fabs(temp-t2) < tolerance) break;
     f1 = f2;
     t1 = t2;
     t2 = temp;
diff --git a/src/KOKKOS/fix_eos_table_rx_kokkos.h b/src/KOKKOS/fix_eos_table_rx_kokkos.h
index d4a5094ae0..91d73f1036 100644
--- a/src/KOKKOS/fix_eos_table_rx_kokkos.h
+++ b/src/KOKKOS/fix_eos_table_rx_kokkos.h
@@ -112,8 +112,8 @@ class FixEOStableRXKokkos : public FixEOStableRX {
   int update_table;
   void create_kokkos_tables();
 
-  DAT::tdual_float_1d k_dHf;
-  typename AT::t_float_1d d_dHf;
+  DAT::tdual_float_1d k_dHf,k_energyCorr,k_tempCorrCoeff,k_moleculeCorrCoeff;
+  typename AT::t_float_1d d_dHf,d_energyCorr,d_tempCorrCoeff,d_moleculeCorrCoeff;
 
   typename AT::t_int_1d mask;
   typename AT::t_efloat_1d uCond,uMech,uChem,uCG,uCGnew,rho,dpdTheta,duChem;

From f220b07625b7089b981fcaeff999bae3712b6a3a Mon Sep 17 00:00:00 2001
From: Stan Moore <stanmoore1@gmail.com>
Date: Tue, 3 Jan 2017 10:36:55 -0700
Subject: [PATCH 035/267] Updating pair_exp6_rx_kokkos to USER-DPD changes

---
 src/KOKKOS/pair_exp6_rx_kokkos.cpp | 306 +++++++++++++++++------------
 src/KOKKOS/pair_exp6_rx_kokkos.h   |  30 ++-
 src/USER-DPD/pair_exp6_rx.h        |   2 +-
 3 files changed, 201 insertions(+), 137 deletions(-)

diff --git a/src/KOKKOS/pair_exp6_rx_kokkos.cpp b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
index ce3b547435..3ce6b78e57 100644
--- a/src/KOKKOS/pair_exp6_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
@@ -41,6 +41,12 @@ using namespace MathSpecial;
 #define MAXLINE 1024
 #define DELTA 4
 
+#ifdef DBL_EPSILON  // DBL_EPSILON is the <cfloat>/<float.h> macro; it is only visible here if that header was already included
+  #define MY_EPSILON (10.0*DBL_EPSILON)  // tolerance: ten double-precision machine epsilons
+#else
+  #define MY_EPSILON (10.0*2.220446049250313e-16)  // fallback with the IEEE-754 double epsilon (2^-52) written out
+#endif
+
 #define oneFluidApproxParameter (-1)
 #define isOneFluidApprox(_site) ( (_site) == oneFluidApproxParameter )
 
@@ -165,29 +171,29 @@ void PairExp6rxKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
      PairExp6ParamData.epsilon1     = typename AT::t_float_1d("PairExp6ParamData.epsilon1"    ,np_total);
      PairExp6ParamData.alpha1       = typename AT::t_float_1d("PairExp6ParamData.alpha1"      ,np_total);
      PairExp6ParamData.rm1          = typename AT::t_float_1d("PairExp6ParamData.rm1"         ,np_total);
-     PairExp6ParamData.fraction1    = typename AT::t_float_1d("PairExp6ParamData.fraction1"   ,np_total);
+     PairExp6ParamData.mixWtSite1    = typename AT::t_float_1d("PairExp6ParamData.mixWtSite1"   ,np_total);
      PairExp6ParamData.epsilon2     = typename AT::t_float_1d("PairExp6ParamData.epsilon2"    ,np_total);
      PairExp6ParamData.alpha2       = typename AT::t_float_1d("PairExp6ParamData.alpha2"      ,np_total);
      PairExp6ParamData.rm2          = typename AT::t_float_1d("PairExp6ParamData.rm2"         ,np_total);
-     PairExp6ParamData.fraction2    = typename AT::t_float_1d("PairExp6ParamData.fraction2"   ,np_total);
+     PairExp6ParamData.mixWtSite2    = typename AT::t_float_1d("PairExp6ParamData.mixWtSite2"   ,np_total);
      PairExp6ParamData.epsilonOld1  = typename AT::t_float_1d("PairExp6ParamData.epsilonOld1" ,np_total);
      PairExp6ParamData.alphaOld1    = typename AT::t_float_1d("PairExp6ParamData.alphaOld1"   ,np_total);
      PairExp6ParamData.rmOld1       = typename AT::t_float_1d("PairExp6ParamData.rmOld1"      ,np_total);
-     PairExp6ParamData.fractionOld1 = typename AT::t_float_1d("PairExp6ParamData.fractionOld1",np_total);
+     PairExp6ParamData.mixWtSite1old = typename AT::t_float_1d("PairExp6ParamData.mixWtSite1old",np_total);
      PairExp6ParamData.epsilonOld2  = typename AT::t_float_1d("PairExp6ParamData.epsilonOld2" ,np_total);
      PairExp6ParamData.alphaOld2    = typename AT::t_float_1d("PairExp6ParamData.alphaOld2"   ,np_total);
      PairExp6ParamData.rmOld2       = typename AT::t_float_1d("PairExp6ParamData.rmOld2"      ,np_total);
-     PairExp6ParamData.fractionOld2 = typename AT::t_float_1d("PairExp6ParamData.fractionOld2",np_total);
+     PairExp6ParamData.mixWtSite2old = typename AT::t_float_1d("PairExp6ParamData.mixWtSite2old",np_total);
 
-     Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairExp6rxgetParamsEXP6>(0,np_total),*this);
+     Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairExp6rxgetMixingWeights>(0,np_total),*this);
   }
 
   k_error_flag.template modify<DeviceType>();
   k_error_flag.template sync<LMPHostType>();
   if (k_error_flag.h_view() == 1)
-    error->all(FLERR,"The number of molecules in CG particle is less than 1e-8.");
+    error->all(FLERR,"The number of molecules in CG particle is less than 10*DBL_EPSILON.");
   else if (k_error_flag.h_view() == 2)
-    error->all(FLERR,"Computed fraction less than -1.0e-10");
+    error->all(FLERR,"Computed fraction less than -10*DBL_EPSILON");
 
   int inum = list->inum;
   NeighListKokkos<DeviceType>* k_list = static_cast<NeighListKokkos<DeviceType>*>(list);
@@ -249,23 +255,23 @@ void PairExp6rxKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
 
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
-void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxgetParamsEXP6, const int &i) const {
-  getParamsEXP6 (i, PairExp6ParamData.epsilon1[i],
+void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxgetMixingWeights, const int &i) const {
+  getMixingWeights (i, PairExp6ParamData.epsilon1[i],
                     PairExp6ParamData.alpha1[i],
                     PairExp6ParamData.rm1[i],
-                    PairExp6ParamData.fraction1[i],
+                    PairExp6ParamData.mixWtSite1[i],
                     PairExp6ParamData.epsilon2[i],
                     PairExp6ParamData.alpha2[i],
                     PairExp6ParamData.rm2[i],
-                    PairExp6ParamData.fraction2[i],
+                    PairExp6ParamData.mixWtSite2[i],
                     PairExp6ParamData.epsilonOld1[i],
                     PairExp6ParamData.alphaOld1[i],
                     PairExp6ParamData.rmOld1[i],
-                    PairExp6ParamData.fractionOld1[i],
+                    PairExp6ParamData.mixWtSite1old[i],
                     PairExp6ParamData.epsilonOld2[i],
                     PairExp6ParamData.alphaOld2[i],
                     PairExp6ParamData.rmOld2[i],
-                    PairExp6ParamData.fractionOld2[i]);
+                    PairExp6ParamData.mixWtSite2old[i]);
 }
 
 template<class DeviceType>
@@ -300,10 +306,10 @@ void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxCompute<NEIGHFLAG,NEW
   double epsilon2_j,alpha2_j,rm2_j;
   double evdwlOldEXP6_12, evdwlOldEXP6_21, fpairOldEXP6_12, fpairOldEXP6_21;
   double evdwlEXP6_12, evdwlEXP6_21;
-  double fractionOld1_i, fractionOld1_j;
-  double fractionOld2_i, fractionOld2_j;
-  double fraction1_i, fraction1_j;
-  double fraction2_i, fraction2_j;
+  double mixWtSite1old_i, mixWtSite1old_j;
+  double mixWtSite2old_i, mixWtSite2old_j;
+  double mixWtSite1_i, mixWtSite1_j;
+  double mixWtSite2_i, mixWtSite2_j;
 
   const int nRep = 12;
   const double shift = 1.05;
@@ -329,19 +335,19 @@ void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxCompute<NEIGHFLAG,NEW
      epsilon1_i     = PairExp6ParamData.epsilon1[i];
      alpha1_i       = PairExp6ParamData.alpha1[i];
      rm1_i          = PairExp6ParamData.rm1[i];
-     fraction1_i    = PairExp6ParamData.fraction1[i];
+     mixWtSite1_i    = PairExp6ParamData.mixWtSite1[i];
      epsilon2_i     = PairExp6ParamData.epsilon2[i];
      alpha2_i       = PairExp6ParamData.alpha2[i];
      rm2_i          = PairExp6ParamData.rm2[i];
-     fraction2_i    = PairExp6ParamData.fraction2[i];
+     mixWtSite2_i    = PairExp6ParamData.mixWtSite2[i];
      epsilonOld1_i  = PairExp6ParamData.epsilonOld1[i];
      alphaOld1_i    = PairExp6ParamData.alphaOld1[i];
      rmOld1_i       = PairExp6ParamData.rmOld1[i];
-     fractionOld1_i = PairExp6ParamData.fractionOld1[i];
+     mixWtSite1old_i = PairExp6ParamData.mixWtSite1old[i];
      epsilonOld2_i  = PairExp6ParamData.epsilonOld2[i];
      alphaOld2_i    = PairExp6ParamData.alphaOld2[i];
      rmOld2_i       = PairExp6ParamData.rmOld2[i];
-     fractionOld2_i = PairExp6ParamData.fractionOld2[i];
+     mixWtSite2old_i = PairExp6ParamData.mixWtSite2old[i];
   }
 
   for (jj = 0; jj < jnum; jj++) {
@@ -376,19 +382,19 @@ void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxCompute<NEIGHFLAG,NEW
          epsilon1_j     = PairExp6ParamData.epsilon1[j];
          alpha1_j       = PairExp6ParamData.alpha1[j];
          rm1_j          = PairExp6ParamData.rm1[j];
-         fraction1_j    = PairExp6ParamData.fraction1[j];
+         mixWtSite1_j    = PairExp6ParamData.mixWtSite1[j];
          epsilon2_j     = PairExp6ParamData.epsilon2[j];
          alpha2_j       = PairExp6ParamData.alpha2[j];
          rm2_j          = PairExp6ParamData.rm2[j];
-         fraction2_j    = PairExp6ParamData.fraction2[j];
+         mixWtSite2_j    = PairExp6ParamData.mixWtSite2[j];
          epsilonOld1_j  = PairExp6ParamData.epsilonOld1[j];
          alphaOld1_j    = PairExp6ParamData.alphaOld1[j];
          rmOld1_j       = PairExp6ParamData.rmOld1[j];
-         fractionOld1_j = PairExp6ParamData.fractionOld1[j];
+         mixWtSite1old_j = PairExp6ParamData.mixWtSite1old[j];
          epsilonOld2_j  = PairExp6ParamData.epsilonOld2[j];
          alphaOld2_j    = PairExp6ParamData.alphaOld2[j];
          rmOld2_j       = PairExp6ParamData.rmOld2[j];
-         fractionOld2_j = PairExp6ParamData.fractionOld2[j];
+         mixWtSite2old_j = PairExp6ParamData.mixWtSite2old[j];
       }
 
       // A2.  Apply Lorentz-Berthelot mixing rules for the i-j pair
@@ -489,9 +495,9 @@ void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxCompute<NEIGHFLAG,NEW
         }
 
         if (isite1 == isite2)
-          evdwlOld = sqrt(fractionOld1_i*fractionOld2_j)*evdwlOldEXP6_12;
+          evdwlOld = sqrt(mixWtSite1old_i*mixWtSite2old_j)*evdwlOldEXP6_12;
         else
-          evdwlOld = sqrt(fractionOld1_i*fractionOld2_j)*evdwlOldEXP6_12 + sqrt(fractionOld2_i*fractionOld1_j)*evdwlOldEXP6_21;
+          evdwlOld = sqrt(mixWtSite1old_i*mixWtSite2old_j)*evdwlOldEXP6_12 + sqrt(mixWtSite2old_i*mixWtSite1old_j)*evdwlOldEXP6_21;
 
         evdwlOld *= factor_lj;
 
@@ -572,8 +578,8 @@ void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxCompute<NEIGHFLAG,NEW
         //
         // Apply Mixing Rule to get the overall force for the CG pair
         //
-        if (isite1 == isite2) fpair = sqrt(fractionOld1_i*fractionOld2_j)*fpairOldEXP6_12;
-        else fpair = sqrt(fractionOld1_i*fractionOld2_j)*fpairOldEXP6_12 + sqrt(fractionOld2_i*fractionOld1_j)*fpairOldEXP6_21;
+        if (isite1 == isite2) fpair = sqrt(mixWtSite1old_i*mixWtSite2old_j)*fpairOldEXP6_12;
+        else fpair = sqrt(mixWtSite1old_i*mixWtSite2old_j)*fpairOldEXP6_12 + sqrt(mixWtSite2old_i*mixWtSite1old_j)*fpairOldEXP6_21;
 
         fx_i += delx*fpair;
         fy_i += dely*fpair;
@@ -584,8 +590,8 @@ void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxCompute<NEIGHFLAG,NEW
           a_f(j,2) -= delz*fpair;
         }
 
-        if (isite1 == isite2) evdwl = sqrt(fraction1_i*fraction2_j)*evdwlEXP6_12;
-        else evdwl = sqrt(fraction1_i*fraction2_j)*evdwlEXP6_12 + sqrt(fraction2_i*fraction1_j)*evdwlEXP6_21;
+        if (isite1 == isite2) evdwl = sqrt(mixWtSite1_i*mixWtSite2_j)*evdwlEXP6_12;
+        else evdwl = sqrt(mixWtSite1_i*mixWtSite2_j)*evdwlEXP6_12 + sqrt(mixWtSite2_i*mixWtSite1_j)*evdwlEXP6_21;
         evdwl *= factor_lj;
 
         uCGnew_i   += 0.5*evdwl;
@@ -637,6 +643,24 @@ void PairExp6rxKokkos<DeviceType>::allocate()
   memory->create(cut,n+1,n+1,"pair:cut_lj");
 }
 
+
+/* ----------------------------------------------------------------------
+   set coeffs for one or more type pairs
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairExp6rxKokkos<DeviceType>::coeff(int narg, char **arg)
+{
+  PairExp6rx::coeff(narg,arg);
+  
+  if (scalingFlag == POLYNOMIAL)
+    for (int i = 0; i < 6; i++) {
+      s_coeffAlpha[i] = coeffAlpha[i];
+      s_coeffEps[i] = coeffEps[i];
+      s_coeffRm[i] = coeffRm[i];
+    }
+}
+
 /* ---------------------------------------------------------------------- */
 
 template<class DeviceType>
@@ -793,7 +817,7 @@ void PairExp6rxKokkos<DeviceType>::setup()
 
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
-void PairExp6rxKokkos<DeviceType>::getParamsEXP6(int id,double &epsilon1,double &alpha1,double &rm1, double &fraction1,double &epsilon2,double &alpha2,double &rm2,double &fraction2,double &epsilon1_old,double &alpha1_old,double &rm1_old, double &fraction1_old,double &epsilon2_old,double &alpha2_old,double &rm2_old,double &fraction2_old) const
+void PairExp6rxKokkos<DeviceType>::getMixingWeights(int id,double &epsilon1,double &alpha1,double &rm1, double &mixWtSite1,double &epsilon2,double &alpha2,double &rm2,double &mixWtSite2,double &epsilon1_old,double &alpha1_old,double &rm1_old, double &mixWtSite1old,double &epsilon2_old,double &alpha2_old,double &rm2_old,double &mixWtSite2old) const
 {
   int iparam, jparam;
   double rmi, rmj, rmij, rm3ij;
@@ -801,11 +825,16 @@ void PairExp6rxKokkos<DeviceType>::getParamsEXP6(int id,double &epsilon1,double
   double alphai, alphaj, alphaij;
   double epsilon_old, rm3_old, alpha_old;
   double epsilon, rm3, alpha;
-  double fractionOFA, fractionOFA_old;
-  double nTotalOFA, nTotalOFA_old;
-  double nTotal, nTotal_old;
   double xMolei, xMolej, xMolei_old, xMolej_old;
 
+  double fractionOFAold, fractionOFA;
+  double fractionOld1, fraction1;
+  double fractionOld2, fraction2;
+  double nMoleculesOFAold, nMoleculesOFA;
+  double nMoleculesOld1, nMolecules1;
+  double nMoleculesOld2, nMolecules2;
+  double nTotal, nTotalold;
+
   rm3 = 0.0;
   epsilon = 0.0;
   alpha = 0.0;
@@ -813,32 +842,32 @@ void PairExp6rxKokkos<DeviceType>::getParamsEXP6(int id,double &epsilon1,double
   rm3_old = 0.0;
   alpha_old = 0.0;
   fractionOFA = 0.0;
-  fractionOFA_old = 0.0;
-  nTotalOFA = 0.0;
-  nTotalOFA_old = 0.0;
+  fractionOFAold = 0.0;
+  nMoleculesOFA = 0.0;
+  nMoleculesOFAold = 0.0;
   nTotal = 0.0;
-  nTotal_old = 0.0;
+  nTotalold = 0.0;
 
   // Compute the total number of molecules in the old and new CG particle as well as the total number of molecules in the fluid portion of the old and new CG particle
   for (int ispecies = 0; ispecies < nspecies; ispecies++){
     nTotal += dvector(ispecies,id);
-    nTotal_old += dvector(ispecies+nspecies,id);
+    nTotalold += dvector(ispecies+nspecies,id);
 
     iparam = d_mol2param[ispecies];
 
     if (iparam < 0 || d_params[iparam].potentialType != exp6PotentialType ) continue;
     if (isOneFluidApprox(isite1) || isOneFluidApprox(isite2)) {
       if (isite1 == d_params[iparam].ispecies || isite2 == d_params[iparam].ispecies) continue;
-      nTotalOFA_old += dvector(ispecies+nspecies,id);
-      nTotalOFA += dvector(ispecies,id);
+      nMoleculesOFAold += dvector(ispecies+nspecies,id);
+      nMoleculesOFA += dvector(ispecies,id);
     }
   }
-  if(nTotal < 1e-8 || nTotal_old < 1e-8)
+  if(nTotal < MY_EPSILON || nTotalold < MY_EPSILON)
     k_error_flag.d_view() = 1;
 
   // Compute the mole fraction of molecules within the fluid portion of the particle (One Fluid Approximation)
-  fractionOFA_old = nTotalOFA_old / nTotal_old;
-  fractionOFA = nTotalOFA / nTotal;
+  fractionOFAold = nMoleculesOFAold / nTotalold;
+  fractionOFA = nMoleculesOFA / nTotal;
 
   for (int ispecies = 0; ispecies < nspecies; ispecies++) {
     iparam = d_mol2param[ispecies];
@@ -854,8 +883,10 @@ void PairExp6rxKokkos<DeviceType>::getParamsEXP6(int id,double &epsilon1,double
       alpha1 = d_params[iparam].alpha;
 
       // Compute the mole fraction of Site1
-      fraction1_old = dvector(ispecies+nspecies,id)/nTotal_old;
-      fraction1 = dvector(ispecies,id)/nTotal;
+      nMoleculesOld1 = dvector(ispecies+nspecies,id);
+      nMolecules1 = dvector(ispecies,id);
+      fractionOld1 = nMoleculesOld1/nTotalold;
+      fraction1 = nMolecules1/nTotal;
     }
 
     // If Site2 matches a pure species, then grab the parameters
@@ -868,8 +899,10 @@ void PairExp6rxKokkos<DeviceType>::getParamsEXP6(int id,double &epsilon1,double
       alpha2 = d_params[iparam].alpha;
 
       // Compute the mole fraction of Site2
-      fraction2_old = dvector(ispecies+nspecies,id)/nTotal_old;
-      fraction2 = dvector(ispecies,id)/nTotal;
+      nMoleculesOld2 = dvector(ispecies+nspecies,id);
+      nMolecules2 = dvector(ispecies,id);
+      fractionOld2 = nMoleculesOld2/nTotalold;
+      fraction2 = nMolecules2/nTotal;  // must be set here: fraction2 is a plain local read by the sign check and mixWtSite2 below
     }
 
     // If Site1 or Site2 matches is a fluid, then compute the paramters
 
     // If Site1 or Site2 matches is a fluid, then compute the paramters
@@ -878,8 +910,10 @@ void PairExp6rxKokkos<DeviceType>::getParamsEXP6(int id,double &epsilon1,double
       rmi = d_params[iparam].rm;
       epsiloni = d_params[iparam].epsilon;
       alphai = d_params[iparam].alpha;
-      xMolei = dvector(ispecies,id)/nTotalOFA;
-      xMolei_old = dvector(ispecies+nspecies,id)/nTotalOFA_old;
+      if(nMoleculesOFA<MY_EPSILON) xMolei = 0.0;
+      else xMolei = dvector(ispecies,id)/nMoleculesOFA;
+      if(nMoleculesOFAold<MY_EPSILON) xMolei_old = 0.0;
+      else xMolei_old = dvector(ispecies+nspecies,id)/nMoleculesOFAold;
 
       for (int jspecies = 0; jspecies < nspecies; jspecies++) {
         jparam = d_mol2param[jspecies];
@@ -888,15 +922,17 @@ void PairExp6rxKokkos<DeviceType>::getParamsEXP6(int id,double &epsilon1,double
         rmj = d_params[jparam].rm;
         epsilonj = d_params[jparam].epsilon;
         alphaj = d_params[jparam].alpha;
-        xMolej = dvector(jspecies,id)/nTotalOFA;
-        xMolej_old = dvector(jspecies+nspecies,id)/nTotalOFA_old;
+        if(nMoleculesOFA<MY_EPSILON) xMolej = 0.0;
+        else xMolej = dvector(jspecies,id)/nMoleculesOFA;
+        if(nMoleculesOFAold<MY_EPSILON) xMolej_old = 0.0;
+        else xMolej_old = dvector(jspecies+nspecies,id)/nMoleculesOFAold;
 
         rmij = (rmi+rmj)/2.0;
         rm3ij = rmij*rmij*rmij;
         epsilonij = sqrt(epsiloni*epsilonj);
         alphaij = sqrt(alphai*alphaj);
 
-        if(fractionOFA_old > 0.0){
+        if(fractionOFAold > 0.0){
           rm3_old += xMolei_old*xMolej_old*rm3ij;
           epsilon_old += xMolei_old*xMolej_old*rm3ij*epsilonij;
           alpha_old += xMolei_old*xMolej_old*rm3ij*epsilonij*alphaij;
@@ -912,7 +948,7 @@ void PairExp6rxKokkos<DeviceType>::getParamsEXP6(int id,double &epsilon1,double
 
   if (isOneFluidApprox(isite1)){
     rm1 = cbrt(rm3);
-    if(rm1 < 1e-16) {
+    if(rm1 < MY_EPSILON) {
       rm1 = 0.0;
       epsilon1 = 0.0;
       alpha1 = 0.0;
@@ -920,11 +956,11 @@ void PairExp6rxKokkos<DeviceType>::getParamsEXP6(int id,double &epsilon1,double
       epsilon1 = epsilon / rm3;
       alpha1 = alpha / epsilon1 / rm3;
     }
-
+    nMolecules1 = 1.0-(nTotal-nMoleculesOFA);
     fraction1 = fractionOFA;
 
     rm1_old = cbrt(rm3_old);
-    if(rm1_old < 1e-16) {
+    if(rm1_old < MY_EPSILON) {
       rm1_old = 0.0;
       epsilon1_old = 0.0;
       alpha1_old = 0.0;
@@ -932,42 +968,21 @@ void PairExp6rxKokkos<DeviceType>::getParamsEXP6(int id,double &epsilon1,double
       epsilon1_old = epsilon_old / rm3_old;
       alpha1_old = alpha_old / epsilon1_old / rm3_old;
     }
-    fraction1_old = fractionOFA_old;
+    nMoleculesOld1 = 1.0-(nTotalold-nMoleculesOFAold);
+    fractionOld1 = fractionOFAold;
 
-    // Fuchslin-Like Exp-6 Scaling
-    double powfuch = 0.0;
-    if(exponentEpsilon < 0.0){
-      powfuch = pow(nTotalOFA,-exponentEpsilon);
-      if(powfuch<1e-15) epsilon1 = 0.0;
-      else epsilon1 *= 1.0/powfuch;
-
-      powfuch = pow(nTotalOFA_old,-exponentEpsilon);
-      if(powfuch<1e-15) epsilon1_old = 0.0;
-      else epsilon1_old *= 1.0/powfuch;
-
-    } else {
-      epsilon1 *= pow(nTotalOFA,exponentEpsilon);
-      epsilon1_old *= pow(nTotalOFA_old,exponentEpsilon);
-    }
-
-    if(exponentR < 0.0){
-      powfuch = pow(nTotalOFA,-exponentR);
-      if(powfuch<1e-15) rm1 = 0.0;
-      else rm1 *= 1.0/powfuch;
-
-      powfuch = pow(nTotalOFA_old,-exponentR);
-      if(powfuch<1e-15) rm1_old = 0.0;
-      else rm1_old *= 1.0/powfuch;
-
-    } else {
-      rm1 *= pow(nTotalOFA,exponentR);
-      rm1_old *= pow(nTotalOFA_old,exponentR);
+    if(scalingFlag == EXPONENT){
+      exponentScaling(nMoleculesOFA,epsilon1,rm1);
+      exponentScaling(nMoleculesOFAold,epsilon1_old,rm1_old);
+    } else if(scalingFlag == POLYNOMIAL){
+      polynomialScaling(nMoleculesOFA,alpha1,epsilon1,rm1);
+      polynomialScaling(nMoleculesOFAold,alpha1_old,epsilon1_old,rm1_old);
     }
   }
 
   if (isOneFluidApprox(isite2)){
     rm2 = cbrt(rm3);
-    if(rm2 < 1e-16) {
+    if(rm2 < MY_EPSILON) {
       rm2 = 0.0;
       epsilon2 = 0.0;
       alpha2 = 0.0;
@@ -975,10 +990,11 @@ void PairExp6rxKokkos<DeviceType>::getParamsEXP6(int id,double &epsilon1,double
       epsilon2 = epsilon / rm3;
       alpha2 = alpha / epsilon2 / rm3;
     }
+    nMolecules2 = 1.0-(nTotal-nMoleculesOFA);
     fraction2 = fractionOFA;
 
     rm2_old = cbrt(rm3_old);
-    if(rm2_old < 1e-16) {
+    if(rm2_old < MY_EPSILON) {
       rm2_old = 0.0;
       epsilon2_old = 0.0;
       alpha2_old = 0.0;
@@ -986,64 +1002,100 @@ void PairExp6rxKokkos<DeviceType>::getParamsEXP6(int id,double &epsilon1,double
       epsilon2_old = epsilon_old / rm3_old;
       alpha2_old = alpha_old / epsilon2_old / rm3_old;
     }
-    fraction2_old = fractionOFA_old;
+    nMoleculesOld2 = 1.0-(nTotalold-nMoleculesOFAold);
+    fractionOld2 = fractionOFAold;
 
-    // Fuchslin-Like Exp-6 Scaling
-    double powfuch = 0.0;
-    if(exponentEpsilon < 0.0){
-      powfuch = pow(nTotalOFA,-exponentEpsilon);
-      if(powfuch<1e-15) epsilon2 = 0.0;
-      else epsilon2 *= 1.0/powfuch;
-
-      powfuch = pow(nTotalOFA_old,-exponentEpsilon);
-      if(powfuch<1e-15) epsilon2_old = 0.0;
-      else epsilon2_old *= 1.0/powfuch;
-
-    } else {
-      epsilon2 *= pow(nTotalOFA,exponentEpsilon);
-      epsilon2_old *= pow(nTotalOFA_old,exponentEpsilon);
-    }
-
-    if(exponentR < 0.0){
-      powfuch = pow(nTotalOFA,-exponentR);
-      if(powfuch<1e-15) rm2 = 0.0;
-      else rm2 *= 1.0/powfuch;
-
-      powfuch = pow(nTotalOFA_old,-exponentR);
-      if(powfuch<1e-15) rm2_old = 0.0;
-      else rm2_old *= 1.0/powfuch;
-
-    } else {
-      rm2 *= pow(nTotalOFA,exponentR);
-      rm2_old *= pow(nTotalOFA_old,exponentR);
+    if(scalingFlag == EXPONENT){
+      exponentScaling(nMoleculesOFA,epsilon2,rm2);
+      exponentScaling(nMoleculesOFAold,epsilon2_old,rm2_old);
+    } else if(scalingFlag == POLYNOMIAL){
+      polynomialScaling(nMoleculesOFA,alpha2,epsilon2,rm2);
+      polynomialScaling(nMoleculesOFAold,alpha2_old,epsilon2_old,rm2_old);
     }
   }
 
   // Check that no fractions are less than zero
-  if(fraction1 < 0.0){
-    if(fraction1 < -1.0e-10){
+  if(fraction1 < 0.0 || nMolecules1 < 0.0){
+    if(fraction1 < -MY_EPSILON || nMolecules1 < -MY_EPSILON){
       k_error_flag.d_view() = 2;
     }
+    nMolecules1 = 0.0;
     fraction1 = 0.0;
   }
-  if(fraction2 < 0.0){
-    if(fraction2 < -1.0e-10){
+  if(fraction2 < 0.0 || nMolecules2 < 0.0){
+    if(fraction2 < -MY_EPSILON || nMolecules2 < -MY_EPSILON){
       k_error_flag.d_view() = 2;
     }
+    nMolecules2 = 0.0;
     fraction2 = 0.0;
   }
-  if(fraction1_old < 0.0){
-    if(fraction1_old < -1.0e-10){
+  if(fractionOld1 < 0.0 || nMoleculesOld1 < 0.0){
+    if(fractionOld1 < -MY_EPSILON || nMoleculesOld1 < -MY_EPSILON){
       k_error_flag.d_view() = 2;
     }
-    fraction1_old = 0.0;
+    nMoleculesOld1 = 0.0;
+    fractionOld1 = 0.0;
   }
-  if(fraction2_old < 0.0){
-    if(fraction2_old < -1.0e-10){
+  if(fractionOld2 < 0.0 || nMoleculesOld2 < 0.0){
+    if(fractionOld2 < -MY_EPSILON || nMoleculesOld2 < -MY_EPSILON){
       k_error_flag.d_view() = 2;
     }
-    fraction2_old = 0.0;
+    nMoleculesOld2 = 0.0;
+    fractionOld2 = 0.0;
   }
+
+  if(fractionalWeighting){
+    mixWtSite1old = fractionOld1;
+    mixWtSite1 = fraction1;
+    mixWtSite2old = fractionOld2;
+    mixWtSite2 = fraction2;
+  } else {
+    mixWtSite1old = nMoleculesOld1;
+    mixWtSite1 = nMolecules1;
+    mixWtSite2old = nMoleculesOld2;
+    mixWtSite2 = nMolecules2;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairExp6rxKokkos<DeviceType>::exponentScaling(double phi, double &epsilon, double &rm) const
+{
+  double powfuch;
+
+  if(exponentEpsilon < 0.0){
+    powfuch = pow(phi,-exponentEpsilon);
+    if(powfuch<MY_EPSILON) epsilon = 0.0;
+    else epsilon *= 1.0/powfuch;
+  } else {
+    epsilon *= pow(phi,exponentEpsilon);
+  }
+
+  if(exponentR < 0.0){
+    powfuch = pow(phi,-exponentR);
+    if(powfuch<MY_EPSILON) rm = 0.0;
+    else rm *= 1.0/powfuch;
+  } else {
+    rm *= pow(phi,exponentR);
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairExp6rxKokkos<DeviceType>::polynomialScaling(double phi, double &alpha, double &epsilon, double &rm) const
+{
+    double phi2 = phi*phi;
+    double phi3 = phi2*phi;
+    double phi4 = phi2*phi2;
+    double phi5 = phi2*phi3;
+
+    alpha = (s_coeffAlpha[0]*phi5 + s_coeffAlpha[1]*phi4 + s_coeffAlpha[2]*phi3 + s_coeffAlpha[3]*phi2 + s_coeffAlpha[4]*phi + s_coeffAlpha[5]); // alpha is overwritten, not scaled
+    epsilon *= (s_coeffEps[0]*phi5 + s_coeffEps[1]*phi4 + s_coeffEps[2]*phi3 + s_coeffEps[3]*phi2 + s_coeffEps[4]*phi + s_coeffEps[5]); // epsilon is scaled in place
+    rm *= (s_coeffRm[0]*phi5 + s_coeffRm[1]*phi4 + s_coeffRm[2]*phi3 + s_coeffRm[3]*phi2 + s_coeffRm[4]*phi + s_coeffRm[5]); // rm is scaled with its own coefficient set (copied in coeff())
 }
 
 /* ---------------------------------------------------------------------- */
diff --git a/src/KOKKOS/pair_exp6_rx_kokkos.h b/src/KOKKOS/pair_exp6_rx_kokkos.h
index 7dfe20fc22..488c9d0039 100644
--- a/src/KOKKOS/pair_exp6_rx_kokkos.h
+++ b/src/KOKKOS/pair_exp6_rx_kokkos.h
@@ -38,18 +38,21 @@ struct PairExp6ParamDataTypeKokkos
   typedef ArrayTypes<DeviceType> AT;
 
    int n;
-   typename AT::t_float_1d epsilon1, alpha1, rm1, fraction1,
-          epsilon2, alpha2, rm2, fraction2,
-          epsilonOld1, alphaOld1, rmOld1, fractionOld1,
-          epsilonOld2, alphaOld2, rmOld2, fractionOld2;
+   typename AT::t_float_1d epsilon1, alpha1, rm1, mixWtSite1,
+          epsilon2, alpha2, rm2, mixWtSite2,
+          epsilonOld1, alphaOld1, rmOld1, mixWtSite1old,
+          epsilonOld2, alphaOld2, rmOld2, mixWtSite2old;
 
    // Default constructor -- nullify everything.
    PairExp6ParamDataTypeKokkos<DeviceType>(void)
-      : n(0)
+      : n(0), epsilon1(NULL), alpha1(NULL), rm1(NULL), mixWtSite1(NULL),
+              epsilon2(NULL), alpha2(NULL), rm2(NULL), mixWtSite2(NULL),
+              epsilonOld1(NULL), alphaOld1(NULL), rmOld1(NULL), mixWtSite1old(NULL),
+              epsilonOld2(NULL), alphaOld2(NULL), rmOld2(NULL), mixWtSite2old(NULL)
    {}
 };
 
-struct TagPairExp6rxgetParamsEXP6{};
+struct TagPairExp6rxgetMixingWeights{};
 
 template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
 struct TagPairExp6rxCompute{};
@@ -64,10 +67,11 @@ class PairExp6rxKokkos : public PairExp6rx {
   PairExp6rxKokkos(class LAMMPS *);
   virtual ~PairExp6rxKokkos();
   void compute(int, int);
+  void coeff(int, char **);
   void init_style();
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(TagPairExp6rxgetParamsEXP6, const int&) const;
+  void operator()(TagPairExp6rxgetMixingWeights, const int&) const;
 
   template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
   KOKKOS_INLINE_FUNCTION
@@ -127,7 +131,15 @@ class PairExp6rxKokkos : public PairExp6rx {
   void setup();
 
   KOKKOS_INLINE_FUNCTION
-  void getParamsEXP6(int, double &, double &, double &, double &, double &, double &, double &, double &, double &, double &, double &, double &, double &, double &, double &, double &) const;
+  void getMixingWeights(int, double &, double &, double &, double &, double &, double &, double &, double &, double &, double &, double &, double &, double &, double &, double &, double &) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void exponentScaling(double, double &, double &) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void polynomialScaling(double, double &, double &, double &) const;
+
+  double s_coeffAlpha[6],s_coeffEps[6],s_coeffRm[6];
 
   KOKKOS_INLINE_FUNCTION
   double func_rin(const double &) const;
@@ -196,7 +208,7 @@ E:  Potential file has duplicate entry.
 
 Self-explanatory
 
-E:  The number of molecules in CG particle is less than 1e-8.
+E:  The number of molecules in CG particle is less than 10*DBL_EPSILON.
 
 Self-explanatory.  Check the species concentrations have been properly set
 and check the reaction kinetic solver parameters in fix rx to more for
diff --git a/src/USER-DPD/pair_exp6_rx.h b/src/USER-DPD/pair_exp6_rx.h
index a7531da318..31d4ffb20b 100644
--- a/src/USER-DPD/pair_exp6_rx.h
+++ b/src/USER-DPD/pair_exp6_rx.h
@@ -30,7 +30,7 @@ class PairExp6rx : public Pair {
   virtual ~PairExp6rx();
   virtual void compute(int, int);
   void settings(int, char **);
-  void coeff(int, char **);
+  virtual void coeff(int, char **);
   double init_one(int, int);
   void write_restart(FILE *);
   void read_restart(FILE *);

From ccaa0506cb93e9f884ebc22b32575feda7e99199 Mon Sep 17 00:00:00 2001
From: Dan Ibanez <dan.a.ibanez@gmail.com>
Date: Thu, 22 Dec 2016 07:55:15 -0700
Subject: [PATCH 036/267] LAMMPS_LAMBDA from ibaned/lammps@7559bc9

---
 src/KOKKOS/kokkos_type.h | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/KOKKOS/kokkos_type.h b/src/KOKKOS/kokkos_type.h
index c1176122a7..cc096058ec 100644
--- a/src/KOKKOS/kokkos_type.h
+++ b/src/KOKKOS/kokkos_type.h
@@ -920,4 +920,10 @@ void memset_kokkos (ViewType &view) {
 #define ISFINITE(x) std::isfinite(x)
 #endif
 
+#ifdef KOKKOS_HAVE_CUDA
+#define LAMMPS_LAMBDA [=] __device__
+#else
+#define LAMMPS_LAMBDA [=]
+#endif
+
 #endif

From 66cdd3a708b911663cc45e7d6117f09d39784123 Mon Sep 17 00:00:00 2001
From: Dan Ibanez <dan.a.ibanez@gmail.com>
Date: Thu, 22 Dec 2016 08:01:46 -0700
Subject: [PATCH 037/267] draft fix_dpd_energy_kokkos.h

---
 src/KOKKOS/fix_dpd_energy_kokkos.h | 44 ++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)
 create mode 100644 src/KOKKOS/fix_dpd_energy_kokkos.h

diff --git a/src/KOKKOS/fix_dpd_energy_kokkos.h b/src/KOKKOS/fix_dpd_energy_kokkos.h
new file mode 100644
index 0000000000..399cf91334
--- /dev/null
+++ b/src/KOKKOS/fix_dpd_energy_kokkos.h
@@ -0,0 +1,44 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+
+FixStyle(dpd/energy/kk,FixDPDenergyKokkos<LMPDeviceType>)
+FixStyle(dpd/energy/kk/device,FixDPDenergyKokkos<LMPDeviceType>)
+FixStyle(dpd/energy/kk/host,FixDPDenergyKokkos<LMPHostType>)
+
+#else
+
+#ifndef LMP_FIX_DPDE_H
+#define LMP_FIX_DPDE_H
+
+#include "fix_dpd_energy.h"
+
+namespace LAMMPS_NS {
+
+class FixDPDenergyKokkos : public FixDPDEnergy {
+ public:
+  FixDPDenergyKokkos(class LAMMPS *, int, char **);
+  virtual ~FixDPDenergyKokkos() {}
+  virtual void initial_integrate(int);
+  virtual void final_integrate();
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+*/

From 53e07996c6929f422568b5473a77cbcaea799c1e Mon Sep 17 00:00:00 2001
From: Dan Ibanez <dan.a.ibanez@gmail.com>
Date: Thu, 22 Dec 2016 08:07:48 -0700
Subject: [PATCH 038/267] save draft of fix_dpd_energy_kokkos.cpp

---
 src/KOKKOS/fix_dpd_energy_kokkos.cpp | 77 ++++++++++++++++++++++++++++
 1 file changed, 77 insertions(+)
 create mode 100644 src/KOKKOS/fix_dpd_energy_kokkos.cpp

diff --git a/src/KOKKOS/fix_dpd_energy_kokkos.cpp b/src/KOKKOS/fix_dpd_energy_kokkos.cpp
new file mode 100644
index 0000000000..ea93c28b01
--- /dev/null
+++ b/src/KOKKOS/fix_dpd_energy_kokkos.cpp
@@ -0,0 +1,77 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#include <string.h>
+#include "fix_dpd_energy_kokkos.h"
+#include "atom_kokkos.h"
+#include "force.h"
+#include "update.h"
+#include "respa.h"
+#include "modify.h"
+#include "error.h"
+#include "pair_dpd_fdt_energy.h"
+
+using namespace LAMMPS_NS;
+using namespace FixConst;
+
+/* ---------------------------------------------------------------------- */
+
+FixDPDenergyKokkos::FixDPDenergyKokkos(LAMMPS *lmp, int narg, char **arg) :
+  FixDPDenergy(lmp, narg, arg)
+{
+  kokkosable = 1;
+  atomKK = (AtomKokkos *) atom;
+  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
+  datamask_read = EMPTY_MASK;
+  datamask_modify = EMPTY_MASK;
+}
+
+/* ----------------------------------------------------------------------
+   allow for both per-type and per-atom mass
+------------------------------------------------------------------------- */
+
+void FixDPDenergyKokkos::initial_integrate(int vflag)
+{
+  int nlocal = atom->nlocal;
+  if (igroup == atom->firstgroup) nlocal = atom->nfirst;
+
+  t_efloat_1d uCond = atomKK
+  double *uCond = atom->uCond;
+  double *uMech = atom->uMech;
+  double *duCond = pairDPDE->duCond;
+  double *duMech = pairDPDE->duMech;
+
+  for (int i = 0; i < nlocal; i++){
+    uCond[i] += 0.5*update->dt*duCond[i];
+    uMech[i] += 0.5*update->dt*duMech[i];
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixDPDenergyKokkos::final_integrate()
+{
+  int nlocal = atom->nlocal;
+  if (igroup == atom->firstgroup) nlocal = atom->nfirst;
+
+  double *uCond = atom->uCond;
+  double *uMech = atom->uMech;
+  double *duCond = pairDPDE->duCond;
+  double *duMech = pairDPDE->duMech;
+
+  for (int i = 0; i < nlocal; i++){
+    uCond[i] += 0.5*update->dt*duCond[i];
+    uMech[i] += 0.5*update->dt*duMech[i];
+  }
+}

From 04e2f170a33c2162f841b8a21403d95048b86371 Mon Sep 17 00:00:00 2001
From: Dan Ibanez <dan.a.ibanez@gmail.com>
Date: Thu, 22 Dec 2016 09:28:25 -0700
Subject: [PATCH 039/267] first draft fix_dpd_energy_kokkos

had to make k_duCond and k_duMech
in pair_dpd_fdt_energy_kokkos
public so they could be accessed
and sync'ed
---
 src/KOKKOS/fix_dpd_energy_kokkos.cpp    | 76 ++++++++++++++-----------
 src/KOKKOS/fix_dpd_energy_kokkos.h      | 10 ++++
 src/KOKKOS/pair_dpd_fdt_energy_kokkos.h |  3 +-
 3 files changed, 56 insertions(+), 33 deletions(-)

diff --git a/src/KOKKOS/fix_dpd_energy_kokkos.cpp b/src/KOKKOS/fix_dpd_energy_kokkos.cpp
index ea93c28b01..38671d66ab 100644
--- a/src/KOKKOS/fix_dpd_energy_kokkos.cpp
+++ b/src/KOKKOS/fix_dpd_energy_kokkos.cpp
@@ -20,14 +20,14 @@
 #include "respa.h"
 #include "modify.h"
 #include "error.h"
-#include "pair_dpd_fdt_energy.h"
 
 using namespace LAMMPS_NS;
 using namespace FixConst;
 
 /* ---------------------------------------------------------------------- */
 
-FixDPDenergyKokkos::FixDPDenergyKokkos(LAMMPS *lmp, int narg, char **arg) :
+template <typename DeviceType>
+FixDPDenergyKokkos<DeviceType>::FixDPDenergyKokkos(LAMMPS *lmp, int narg, char **arg) :
   FixDPDenergy(lmp, narg, arg)
 {
   kokkosable = 1;
@@ -35,43 +35,55 @@ FixDPDenergyKokkos::FixDPDenergyKokkos(LAMMPS *lmp, int narg, char **arg) :
   execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
   datamask_read = EMPTY_MASK;
   datamask_modify = EMPTY_MASK;
-}
-
-/* ----------------------------------------------------------------------
-   allow for both per-type and per-atom mass
-------------------------------------------------------------------------- */
-
-void FixDPDenergyKokkos::initial_integrate(int vflag)
-{
-  int nlocal = atom->nlocal;
-  if (igroup == atom->firstgroup) nlocal = atom->nfirst;
-
-  t_efloat_1d uCond = atomKK
-  double *uCond = atom->uCond;
-  double *uMech = atom->uMech;
-  double *duCond = pairDPDE->duCond;
-  double *duMech = pairDPDE->duMech;
-
-  for (int i = 0; i < nlocal; i++){
-    uCond[i] += 0.5*update->dt*duCond[i];
-    uMech[i] += 0.5*update->dt*duMech[i];
-  }
+  pairDPDEKK = dynamic_cast<decltype(pairDPDEKK)>(pairDPDE);
+  if (!pairDPDEKK)
+    error->all(FLERR,"Must use pair_style dpd/fdt/energy/kk with fix dpd/energy/kk");
 }
 
 /* ---------------------------------------------------------------------- */
 
-void FixDPDenergyKokkos::final_integrate()
+template <typename DeviceType>
+void FixDPDenergyKokkos<DeviceType>::take_half_step()
 {
   int nlocal = atom->nlocal;
   if (igroup == atom->firstgroup) nlocal = atom->nfirst;
 
-  double *uCond = atom->uCond;
-  double *uMech = atom->uMech;
-  double *duCond = pairDPDE->duCond;
-  double *duMech = pairDPDE->duMech;
+  atomKK->sync(execution_space, UCOND_MASK);
+  t_efloat_1d uCond = atomKK->k_uCond.view<DeviceType>();
+  atomKK->sync(execution_space, UMECH_MASK);
+  t_efloat_1d uMech = atomKK->k_uMech.view<DeviceType>();
 
-  for (int i = 0; i < nlocal; i++){
-    uCond[i] += 0.5*update->dt*duCond[i];
-    uMech[i] += 0.5*update->dt*duMech[i];
-  }
+  pairDPDEKK->k_duCond.sync<DeviceType>();
+  t_efloat_1d_const duCond = pairDPDEKK->k_duCond.view<DeviceType>();
+  pairDPDEKK->k_duMech.sync<DeviceType>();
+  t_efloat_1d_const duMech = pairDPDEKK->k_duMech.view<DeviceType>();
+
+  auto dt = update->dt;
+
+  Kokkos::parallel_for(nlocal, LAMMPS_LAMBDA(int i) {
+    uCond(i) += 0.5*dt*duCond(i);
+    uMech(i) += 0.5*dt*duMech(i);
+  });
+
+  atomKK->modified(execution_space, UCOND_MASK);
+  atomKK->modified(execution_space, UMECH_MASK);
+  //should not be needed once everything is Kokkos
+  atomKK->sync(ExecutionSpaceFromDevice<LMPHostType>, UCOND_MASK);
+  atomKK->sync(ExecutionSpaceFromDevice<LMPHostType>, UMECH_MASK);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <typename DeviceType>
+void FixDPDenergyKokkos<DeviceType>::initial_integrate(int)
+{
+  take_half_step();
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <typename DeviceType>
+void FixDPDenergyKokkos<DeviceType>::final_integrate()
+{
+  take_half_step();
 }
diff --git a/src/KOKKOS/fix_dpd_energy_kokkos.h b/src/KOKKOS/fix_dpd_energy_kokkos.h
index 399cf91334..e5ae2b0127 100644
--- a/src/KOKKOS/fix_dpd_energy_kokkos.h
+++ b/src/KOKKOS/fix_dpd_energy_kokkos.h
@@ -23,15 +23,21 @@ FixStyle(dpd/energy/kk/host,FixDPDenergyKokkos<LMPHostType>)
 #define LMP_FIX_DPDE_H
 
 #include "fix_dpd_energy.h"
+#include "pair_dpd_dft_energy_kokkos.h"
 
 namespace LAMMPS_NS {
 
+template <typename DeviceType>
 class FixDPDenergyKokkos : public FixDPDEnergy {
  public:
   FixDPDenergyKokkos(class LAMMPS *, int, char **);
   virtual ~FixDPDenergyKokkos() {}
   virtual void initial_integrate(int);
   virtual void final_integrate();
+
+ protected:
+  void take_half_step();
+  PairDPDfdtEnergyKokkos<DeviceType>* pairDPDEKK;
 };
 
 }
@@ -41,4 +47,8 @@ class FixDPDenergyKokkos : public FixDPDEnergy {
 
 /* ERROR/WARNING messages:
 
+E: Must use pair_style dpd/fdt/energy/kk with fix dpd/energy/kk
+
+Self-explanatory.
+
 */
diff --git a/src/KOKKOS/pair_dpd_fdt_energy_kokkos.h b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.h
index 8e7d01de2a..41360091bc 100644
--- a/src/KOKKOS/pair_dpd_fdt_energy_kokkos.h
+++ b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.h
@@ -84,6 +84,8 @@ class PairDPDfdtEnergyKokkos : public PairDPDfdtEnergy {
     F_FLOAT cut,a0,sigma,kappa;
   };
 
+  DAT::tdual_efloat_1d k_duCond,k_duMech;
+
  protected:
   int eflag,vflag;
   int nlocal,neighflag;
@@ -110,7 +112,6 @@ class PairDPDfdtEnergyKokkos : public PairDPDfdtEnergy {
   typename ArrayTypes<DeviceType>::t_float_1d_randomread mass;
   double *rmass;
   typename AT::t_efloat_1d dpdTheta;
-  DAT::tdual_efloat_1d k_duCond,k_duMech;
   typename AT::t_efloat_1d d_duCond,d_duMech;
   HAT::t_efloat_1d h_duCond,h_duMech;
 

From 89795b3653ea6270b90911cf30b3777312af5828 Mon Sep 17 00:00:00 2001
From: Dan Ibanez <dan.a.ibanez@gmail.com>
Date: Thu, 22 Dec 2016 10:18:30 -0700
Subject: [PATCH 040/267] got fix_dpd_energy_kokkos to compile

---
 src/KOKKOS/Install.sh                |  2 ++
 src/KOKKOS/fix_dpd_energy_kokkos.cpp | 30 ++++++++++++++++++----------
 src/KOKKOS/fix_dpd_energy_kokkos.h   |  8 ++++----
 3 files changed, 26 insertions(+), 14 deletions(-)

diff --git a/src/KOKKOS/Install.sh b/src/KOKKOS/Install.sh
index 17e9f93c9d..96ec348b30 100644
--- a/src/KOKKOS/Install.sh
+++ b/src/KOKKOS/Install.sh
@@ -99,6 +99,8 @@ action fix_setforce_kokkos.cpp
 action fix_setforce_kokkos.h
 action fix_wall_reflect_kokkos.cpp
 action fix_wall_reflect_kokkos.h
+action fix_dpd_energy_kokkos.cpp fix_dpd_energy.cpp
+action fix_dpd_energy_kokkos.h fix_dpd_energy.h
 action gridcomm_kokkos.cpp gridcomm.cpp
 action gridcomm_kokkos.h gridcomm.h
 action improper_harmonic_kokkos.cpp improper_harmonic.cpp
diff --git a/src/KOKKOS/fix_dpd_energy_kokkos.cpp b/src/KOKKOS/fix_dpd_energy_kokkos.cpp
index 38671d66ab..6ab0b215b4 100644
--- a/src/KOKKOS/fix_dpd_energy_kokkos.cpp
+++ b/src/KOKKOS/fix_dpd_energy_kokkos.cpp
@@ -14,6 +14,7 @@
 #include <stdio.h>
 #include <string.h>
 #include "fix_dpd_energy_kokkos.h"
+#include "atom_masks.h"
 #include "atom_kokkos.h"
 #include "force.h"
 #include "update.h"
@@ -48,15 +49,17 @@ void FixDPDenergyKokkos<DeviceType>::take_half_step()
   int nlocal = atom->nlocal;
   if (igroup == atom->firstgroup) nlocal = atom->nfirst;
 
-  atomKK->sync(execution_space, UCOND_MASK);
-  t_efloat_1d uCond = atomKK->k_uCond.view<DeviceType>();
-  atomKK->sync(execution_space, UMECH_MASK);
-  t_efloat_1d uMech = atomKK->k_uMech.view<DeviceType>();
+  using AT = ArrayTypes<DeviceType>;
 
-  pairDPDEKK->k_duCond.sync<DeviceType>();
-  t_efloat_1d_const duCond = pairDPDEKK->k_duCond.view<DeviceType>();
-  pairDPDEKK->k_duMech.sync<DeviceType>();
-  t_efloat_1d_const duMech = pairDPDEKK->k_duMech.view<DeviceType>();
+  atomKK->sync(execution_space, UCOND_MASK);
+  typename AT::t_efloat_1d uCond = atomKK->k_uCond.view<DeviceType>();
+  atomKK->sync(execution_space, UMECH_MASK);
+  typename AT::t_efloat_1d uMech = atomKK->k_uMech.view<DeviceType>();
+
+  pairDPDEKK->k_duCond.template sync<DeviceType>();
+  typename AT::t_efloat_1d_const duCond = pairDPDEKK->k_duCond.template view<DeviceType>();
+  pairDPDEKK->k_duMech.template sync<DeviceType>();
+  typename AT::t_efloat_1d_const duMech = pairDPDEKK->k_duMech.template view<DeviceType>();
 
   auto dt = update->dt;
 
@@ -68,8 +71,8 @@ void FixDPDenergyKokkos<DeviceType>::take_half_step()
   atomKK->modified(execution_space, UCOND_MASK);
   atomKK->modified(execution_space, UMECH_MASK);
   //should not be needed once everything is Kokkos
-  atomKK->sync(ExecutionSpaceFromDevice<LMPHostType>, UCOND_MASK);
-  atomKK->sync(ExecutionSpaceFromDevice<LMPHostType>, UMECH_MASK);
+  atomKK->sync(ExecutionSpaceFromDevice<LMPHostType>::space, UCOND_MASK);
+  atomKK->sync(ExecutionSpaceFromDevice<LMPHostType>::space, UMECH_MASK);
 }
 
 /* ---------------------------------------------------------------------- */
@@ -87,3 +90,10 @@ void FixDPDenergyKokkos<DeviceType>::final_integrate()
 {
   take_half_step();
 }
+
+namespace LAMMPS_NS {
+template class FixDPDenergyKokkos<LMPDeviceType>;
+#ifdef KOKKOS_HAVE_CUDA
+template class FixDPDenergyKokkos<LMPHostType>;
+#endif
+}
diff --git a/src/KOKKOS/fix_dpd_energy_kokkos.h b/src/KOKKOS/fix_dpd_energy_kokkos.h
index e5ae2b0127..0c43ecf422 100644
--- a/src/KOKKOS/fix_dpd_energy_kokkos.h
+++ b/src/KOKKOS/fix_dpd_energy_kokkos.h
@@ -19,16 +19,16 @@ FixStyle(dpd/energy/kk/host,FixDPDenergyKokkos<LMPHostType>)
 
 #else
 
-#ifndef LMP_FIX_DPDE_H
-#define LMP_FIX_DPDE_H
+#ifndef LMP_FIX_DPDE_KOKKOS_H
+#define LMP_FIX_DPDE_KOKKOS_H
 
 #include "fix_dpd_energy.h"
-#include "pair_dpd_dft_energy_kokkos.h"
+#include "pair_dpd_fdt_energy_kokkos.h"
 
 namespace LAMMPS_NS {
 
 template <typename DeviceType>
-class FixDPDenergyKokkos : public FixDPDEnergy {
+class FixDPDenergyKokkos : public FixDPDenergy {
  public:
   FixDPDenergyKokkos(class LAMMPS *, int, char **);
   virtual ~FixDPDenergyKokkos() {}

From e632f8597ac7a5229d51e8f12bfbd776f9369aee Mon Sep 17 00:00:00 2001
From: Dan Ibanez <dan.a.ibanez@gmail.com>
Date: Tue, 3 Jan 2017 09:04:10 -0700
Subject: [PATCH 041/267] fix warning about enum comparisons

---
 src/KOKKOS/pair_table_kokkos.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/KOKKOS/pair_table_kokkos.h b/src/KOKKOS/pair_table_kokkos.h
index 4d3a9ec106..e768c97164 100644
--- a/src/KOKKOS/pair_table_kokkos.h
+++ b/src/KOKKOS/pair_table_kokkos.h
@@ -31,7 +31,7 @@ namespace LAMMPS_NS {
 
 template<class Device,int TABSTYLE>
 struct S_TableCompute {
-  enum {TabStyle = TABSTYLE};
+  static constexpr int TabStyle = TABSTYLE;
 };
 
 template <class DeviceType, int NEIGHFLAG, int TABSTYLE>

From dae132c77099b2b5a895b3c08ee7df7bd11a74d4 Mon Sep 17 00:00:00 2001
From: Dan Ibanez <dan.a.ibanez@gmail.com>
Date: Tue, 3 Jan 2017 10:53:10 -0700
Subject: [PATCH 042/267] place newline at end of file

---
 src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp
index 133d366fbc..4f04da2f3b 100644
--- a/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp
+++ b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp
@@ -682,4 +682,4 @@ template class PairDPDfdtEnergyKokkos<LMPDeviceType>;
 #ifdef KOKKOS_HAVE_CUDA
 template class PairDPDfdtEnergyKokkos<LMPHostType>;
 #endif
-}
\ No newline at end of file
+}

From e3ebd8e7f1793caa1b7f686f6d7ae9a671be25ac Mon Sep 17 00:00:00 2001
From: Dan Ibanez <dan.a.ibanez@gmail.com>
Date: Tue, 3 Jan 2017 09:15:30 -0700
Subject: [PATCH 043/267] remove syncs that shouldn't be needed

---
 src/KOKKOS/fix_dpd_energy_kokkos.cpp | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/KOKKOS/fix_dpd_energy_kokkos.cpp b/src/KOKKOS/fix_dpd_energy_kokkos.cpp
index 6ab0b215b4..e6878afed4 100644
--- a/src/KOKKOS/fix_dpd_energy_kokkos.cpp
+++ b/src/KOKKOS/fix_dpd_energy_kokkos.cpp
@@ -70,9 +70,6 @@ void FixDPDenergyKokkos<DeviceType>::take_half_step()
 
   atomKK->modified(execution_space, UCOND_MASK);
   atomKK->modified(execution_space, UMECH_MASK);
-  //should not be needed once everything is Kokkos
-  atomKK->sync(ExecutionSpaceFromDevice<LMPHostType>::space, UCOND_MASK);
-  atomKK->sync(ExecutionSpaceFromDevice<LMPHostType>::space, UMECH_MASK);
 }
 
 /* ---------------------------------------------------------------------- */

From 6d7607a6ade5299b021190373add4a1d50765aaa Mon Sep 17 00:00:00 2001
From: Dan Ibanez <daibane@sandia.gov>
Date: Tue, 3 Jan 2017 11:13:46 -0700
Subject: [PATCH 044/267] member function containing lambdas must be public

---
 src/KOKKOS/fix_dpd_energy_kokkos.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/KOKKOS/fix_dpd_energy_kokkos.h b/src/KOKKOS/fix_dpd_energy_kokkos.h
index 0c43ecf422..ebf3a796fe 100644
--- a/src/KOKKOS/fix_dpd_energy_kokkos.h
+++ b/src/KOKKOS/fix_dpd_energy_kokkos.h
@@ -35,8 +35,8 @@ class FixDPDenergyKokkos : public FixDPDenergy {
   virtual void initial_integrate(int);
   virtual void final_integrate();
 
- protected:
   void take_half_step();
+ protected:
   PairDPDfdtEnergyKokkos<DeviceType>* pairDPDEKK;
 };
 

From ae0e882cde4c74c5cae0af09fa5fd46c783fad24 Mon Sep 17 00:00:00 2001
From: Stan Moore <stanmoore1@gmail.com>
Date: Tue, 3 Jan 2017 11:51:28 -0700
Subject: [PATCH 045/267] Updating pair_multi_lucy_rx_kokkos to USER-DPD
 changes

---
 src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp | 111 ++++++++++++++++-------
 src/KOKKOS/pair_multi_lucy_rx_kokkos.h   |   8 +-
 2 files changed, 82 insertions(+), 37 deletions(-)

diff --git a/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp b/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp
index 1dc8ccbae9..7cff630cb0 100644
--- a/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp
@@ -45,6 +45,12 @@ enum{NONE,RLINEAR,RSQ};
 
 #define MAXLINE 1024
 
+#ifdef DBL_EPSILON
+  #define MY_EPSILON (10.0*DBL_EPSILON)
+#else
+  #define MY_EPSILON (10.0*2.220446049250313e-16)
+#endif
+
 #define oneFluidParameter (-1)
 #define isOneFluid(_site) ( (_site) == oneFluidParameter )
 
@@ -187,12 +193,12 @@ void PairMultiLucyRXKokkos<DeviceType>::compute_style(int eflag_in, int vflag_in
 
   {
     const int ntotal = nlocal + nghost;
-    d_fractionOld1 = typename AT::t_float_1d("PairMultiLucyRX::fractionOld1",ntotal);
-    d_fractionOld2 = typename AT::t_float_1d("PairMultiLucyRX::fractionOld2",ntotal);
-    d_fraction1 = typename AT::t_float_1d("PairMultiLucyRX::fraction1",ntotal);
-    d_fraction2 = typename AT::t_float_1d("PairMultiLucyRX::fraction2",ntotal);
+    d_mixWtSite1old = typename AT::t_float_1d("PairMultiLucyRX::mixWtSite1old",ntotal);
+    d_mixWtSite2old = typename AT::t_float_1d("PairMultiLucyRX::mixWtSite2old",ntotal);
+    d_mixWtSite1 = typename AT::t_float_1d("PairMultiLucyRX::mixWtSite1",ntotal);
+    d_mixWtSite2 = typename AT::t_float_1d("PairMultiLucyRX::mixWtSite2",ntotal);
 
-    Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXgetParams>(0,ntotal),*this);
+    Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXgetMixingWeights>(0,ntotal),*this);
   }
 
   const int inum = list->inum;
@@ -259,8 +265,8 @@ void PairMultiLucyRXKokkos<DeviceType>::compute_style(int eflag_in, int vflag_in
 
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
-void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXgetParams, const int &i) const {
-  getParams(i, d_fractionOld1[i], d_fractionOld2[i], d_fraction1[i], d_fraction2[i]);
+void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXgetMixingWeights, const int &i) const {
+  getMixingWeights(i, d_mixWtSite1old[i], d_mixWtSite2old[i], d_mixWtSite1[i], d_mixWtSite2[i]);
 }
 
 template<class DeviceType>
@@ -275,9 +281,9 @@ void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXCompute<NEI
   double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,evdwlOld,fpair;
   double rsq;
 
-  double fractionOld1_i,fractionOld1_j;
-  double fractionOld2_i,fractionOld2_j;
-  double fraction1_i;
+  double mixWtSite1old_i,mixWtSite1old_j;
+  double mixWtSite2old_i,mixWtSite2old_j;
+  double mixWtSite1_i;
 
   double pi = MathConst::MY_PI;
   double A_i, A_j;
@@ -297,9 +303,9 @@ void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXCompute<NEI
   double fy_i = 0.0;
   double fz_i = 0.0;
 
-  fractionOld1_i = d_fractionOld1[i];
-  fractionOld2_i = d_fractionOld2[i];
-  fraction1_i = d_fraction1[i];
+  mixWtSite1old_i = d_mixWtSite1old[i];
+  mixWtSite2old_i = d_mixWtSite2old[i];
+  mixWtSite1_i = d_mixWtSite1[i];
 
   for (jj = 0; jj < jnum; jj++) {
     int j = d_neighbors(i,jj);
@@ -314,8 +320,8 @@ void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXCompute<NEI
     if (rsq < d_cutsq(itype,jtype)) { // optimize
       fpair = 0.0;
 
-      fractionOld1_j = d_fractionOld1[j];
-      fractionOld2_j = d_fractionOld2[j];
+      mixWtSite1old_j = d_mixWtSite1old[j];
+      mixWtSite2old_j = d_mixWtSite2old[j];
 
       //tb = &tables[tabindex[itype][jtype]];
       const int tidx = d_table_const.tabindex(itype,jtype);
@@ -376,8 +382,8 @@ void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXCompute<NEI
 
       } else k_error_flag.d_view() = 3;
 
-      if (isite1 == isite2) fpair = sqrt(fractionOld1_i*fractionOld2_j)*fpair;
-      else fpair = (sqrt(fractionOld1_i*fractionOld2_j) + sqrt(fractionOld2_i*fractionOld1_j))*fpair;
+      if (isite1 == isite2) fpair = sqrt(mixWtSite1old_i*mixWtSite2old_j)*fpair;
+      else fpair = (sqrt(mixWtSite1old_i*mixWtSite2old_j) + sqrt(mixWtSite2old_i*mixWtSite1old_j))*fpair;
 
       fx_i += delx*fpair;
       fy_i += dely*fpair;
@@ -415,8 +421,8 @@ void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXCompute<NEI
   } else k_error_flag.d_view() = 3;
 
   evdwl *=(pi*d_cutsq(itype,itype)*d_cutsq(itype,itype))/84.0;
-  evdwlOld = fractionOld1_i*evdwl;
-  evdwl = fraction1_i*evdwl;
+  evdwlOld = mixWtSite1old_i*evdwl;
+  evdwl = mixWtSite1_i*evdwl;
 
   uCG[i] += evdwlOld;
   uCGnew[i] += evdwl;
@@ -565,9 +571,14 @@ void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXComputeLoca
 
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
-void PairMultiLucyRXKokkos<DeviceType>::getParams(int id, double &fractionOld1, double &fractionOld2, double &fraction1, double &fraction2) const
+void PairMultiLucyRXKokkos<DeviceType>::getMixingWeights(int id, double &mixWtSite1old, double &mixWtSite2old, double &mixWtSite1, double &mixWtSite2) const
 {
-  double fractionOld, fraction;
+  double fractionOFAold, fractionOFA;
+  double fractionOld1, fraction1;
+  double fractionOld2, fraction2;
+  double nMoleculesOFAold, nMoleculesOFA;
+  double nMoleculesOld1, nMolecules1;
+  double nMoleculesOld2, nMolecules2;
   double nTotal, nTotalOld;
 
 
@@ -579,32 +590,56 @@ void PairMultiLucyRXKokkos<DeviceType>::getParams(int id, double &fractionOld1,
   }
 
   if (isOneFluid(isite1) == false){
-    fractionOld1 = dvector(isite1+nspecies,id)/nTotalOld;
-    fraction1 = dvector(isite1,id)/nTotal;
+    nMoleculesOld1 = dvector(isite1+nspecies,id);
+    nMolecules1 = dvector(isite1,id);
+    fractionOld1 = nMoleculesOld1/nTotalOld;
+    fraction1 = nMolecules1/nTotal;
   }
   if (isOneFluid(isite2) == false){
-    fractionOld2 = dvector(isite2+nspecies,id)/nTotalOld;
-    fraction2 = dvector(isite2,id)/nTotal;
+    nMoleculesOld2 = dvector(isite2+nspecies,id);
+    nMolecules2 = dvector(isite2,id);
+    fractionOld2 = nMoleculesOld2/nTotalOld;
+    fraction2 = nMolecules2/nTotal;
   }
 
   if (isOneFluid(isite1) || isOneFluid(isite2)){
-    fractionOld  = 0.0;
-    fraction  = 0.0;
+    nMoleculesOFAold  = 0.0;
+    nMoleculesOFA  = 0.0;
+    fractionOFAold  = 0.0;
+    fractionOFA  = 0.0;
 
     for (int ispecies = 0; ispecies < nspecies; ispecies++){
       if (isite1 == ispecies || isite2 == ispecies) continue;
-      fractionOld += dvector(ispecies+nspecies,id) / nTotalOld;
-      fraction += dvector(ispecies,id) / nTotal;
+      nMoleculesOFAold += dvector(ispecies+nspecies,id);
+      nMoleculesOFA += dvector(ispecies,id);
+      fractionOFAold += dvector(ispecies+nspecies,id) / nTotalOld;
+      fractionOFA += dvector(ispecies,id) / nTotal;
     }
     if (isOneFluid(isite1)){
-      fractionOld1 = fractionOld;
-      fraction1 = fraction;
+      nMoleculesOld1 = 1.0-(nTotalOld-nMoleculesOFAold);
+      nMolecules1 = 1.0-(nTotal-nMoleculesOFA);
+      fractionOld1 = fractionOFAold;
+      fraction1 = fractionOFA;
     }
     if (isOneFluid(isite2)){
-      fractionOld2 = fractionOld;
-      fraction2 = fraction;
+      nMoleculesOld2 = 1.0-(nTotalOld-nMoleculesOFAold);
+      nMolecules2 = 1.0-(nTotal-nMoleculesOFA);
+      fractionOld2 = fractionOFAold;
+      fraction2 = fractionOFA;
     }
   }
+
+  if(fractionalWeighting){
+    mixWtSite1old = fractionOld1;
+    mixWtSite1 = fraction1;
+    mixWtSite2old = fractionOld2;
+    mixWtSite2 = fraction2;
+  } else {
+    mixWtSite1old = nMoleculesOld1;
+    mixWtSite1 = nMolecules1;
+    mixWtSite2old = nMoleculesOld2;
+    mixWtSite2 = nMolecules2;
+  }
 }
 
 /* ---------------------------------------------------------------------- */
@@ -897,6 +932,16 @@ void PairMultiLucyRXKokkos<DeviceType>::settings(int narg, char **arg)
   tablength = force->inumeric(FLERR,arg[1]);
   if (tablength < 2) error->all(FLERR,"Illegal number of pair table entries");
 
+  // optional keywords
+
+  int iarg = 2;
+  while (iarg < narg) {
+    if (strcmp(arg[iarg],"fractional") == 0)   fractionalWeighting = true;
+    else if (strcmp(arg[iarg],"molecular") == 0)   fractionalWeighting = false;
+    else error->all(FLERR,"Illegal pair_style command");
+    iarg++;
+  }
+
   // delete old tables, since cannot just change settings
 
   for (int m = 0; m < ntables; m++) free_table(&tables[m]);
diff --git a/src/KOKKOS/pair_multi_lucy_rx_kokkos.h b/src/KOKKOS/pair_multi_lucy_rx_kokkos.h
index a6622ac4ec..1e84e3efd8 100644
--- a/src/KOKKOS/pair_multi_lucy_rx_kokkos.h
+++ b/src/KOKKOS/pair_multi_lucy_rx_kokkos.h
@@ -32,7 +32,7 @@ namespace LAMMPS_NS {
 struct TagPairMultiLucyRXPackForwardComm{};
 struct TagPairMultiLucyRXUnpackForwardComm{};
 
-struct TagPairMultiLucyRXgetParams{};
+struct TagPairMultiLucyRXgetMixingWeights{};
 
 template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG, int TABSTYLE>
 struct TagPairMultiLucyRXCompute{};
@@ -75,7 +75,7 @@ class PairMultiLucyRXKokkos : public PairMultiLucyRX {
   void operator()(TagPairMultiLucyRXUnpackForwardComm, const int&) const;
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(TagPairMultiLucyRXgetParams, const int&) const;
+  void operator()(TagPairMultiLucyRXgetMixingWeights, const int&) const;
 
   template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG, int TABSTYLE>
   KOKKOS_INLINE_FUNCTION
@@ -154,9 +154,9 @@ class PairMultiLucyRXKokkos : public PairMultiLucyRX {
   void create_kokkos_tables();
 
   KOKKOS_INLINE_FUNCTION
-  void getParams(int, double &, double &, double &, double &) const;
+  void getMixingWeights(int, double &, double &, double &, double &) const;
 
-  typename AT::t_float_1d d_fractionOld1,d_fractionOld2,d_fraction1,d_fraction2;
+  typename AT::t_float_1d d_mixWtSite1old,d_mixWtSite2old,d_mixWtSite1,d_mixWtSite2;
 
   typename AT::t_x_array_randomread x;
   typename AT::t_f_array f;

From a4271ae8c5c2367cfc33f08b4994849a0cccd40b Mon Sep 17 00:00:00 2001
From: Christopher Stone <chris.stone@computational-science.com>
Date: Wed, 4 Jan 2017 15:13:46 -0500
Subject: [PATCH 046/267] Added a Makefile for AFRL Thunder.

---
 src/MAKE/MACHINES/Makefile.afrl_thunder | 116 ++++++++++++++++++++++++
 1 file changed, 116 insertions(+)
 create mode 100644 src/MAKE/MACHINES/Makefile.afrl_thunder

diff --git a/src/MAKE/MACHINES/Makefile.afrl_thunder b/src/MAKE/MACHINES/Makefile.afrl_thunder
new file mode 100644
index 0000000000..ceeec48870
--- /dev/null
+++ b/src/MAKE/MACHINES/Makefile.afrl_thunder
@@ -0,0 +1,116 @@
+# mpi = MPI with its default compiler
+
+SHELL = /bin/sh
+
+# ---------------------------------------------------------------------
+# compiler/linker settings
+# specify flags and libraries needed for your compiler
+
+CC =		mpicxx
+CCFLAGS =	-g -O3 -Wall -Wextra -frounding-math -fsignaling-nans -march=native
+SHFLAGS =	-shared -MD -mcmodel=medium -fpic -fPIC
+DEPFLAGS =	-M
+
+LINK =		mpicxx
+LINKFLAGS =	-g -O
+LIB = 
+SIZE =		size
+
+ARCHIVE =	ar
+ARFLAGS =	-rc
+SHLIBFLAGS =	-shared
+
+# ---------------------------------------------------------------------
+# LAMMPS-specific settings, all OPTIONAL
+# specify settings for LAMMPS features you will use
+# if you change any -D setting, do full re-compile after "make clean"
+
+# LAMMPS ifdef settings
+# see possible settings in Section 2.2 (step 4) of manual
+
+LMP_INC =	-DLAMMPS_GZIP
+#LMP_INC +=	-DLAMMPS_JPEG
+LMP_INC +=	-DLAMMPS_MEMALIGN=64
+
+# MPI library
+# see discussion in Section 2.2 (step 5) of manual
+# MPI wrapper compiler/linker can provide this info
+# can point to dummy MPI library in src/STUBS as in Makefile.serial
+# use -D MPICH and OMPI settings in INC to avoid C++ lib conflicts
+# INC = path for mpi.h, MPI compiler settings
+# PATH = path for MPI library
+# LIB = name of MPI library
+
+MPI_INC =       -DMPICH_SKIP_MPICXX -DOMPI_SKIP_MPICXX=1
+MPI_PATH = 
+MPI_LIB =	
+
+# FFT library
+# see discussion in Section 2.2 (step 6) of manual
+# can be left blank to use provided KISS FFT library
+# INC = -DFFT setting, e.g. -DFFT_FFTW, FFT compiler settings
+# PATH = path for FFT library
+# LIB = name of FFT library
+
+FFT_INC =    	
+FFT_PATH = 
+FFT_LIB =	
+
+# JPEG and/or PNG library
+# see discussion in Section 2.2 (step 7) of manual
+# only needed if -DLAMMPS_JPEG or -DLAMMPS_PNG listed with LMP_INC
+# INC = path(s) for jpeglib.h and/or png.h
+# PATH = path(s) for JPEG library and/or PNG library
+# LIB = name(s) of JPEG library and/or PNG library
+
+JPG_INC =       
+JPG_PATH = 	
+JPG_LIB =	
+
+# ---------------------------------------------------------------------
+# build rules and dependencies
+# do not edit this section
+
+include	Makefile.package.settings
+include	Makefile.package
+
+EXTRA_INC = $(LMP_INC) $(PKG_INC) $(MPI_INC) $(FFT_INC) $(JPG_INC) $(PKG_SYSINC)
+EXTRA_PATH = $(PKG_PATH) $(MPI_PATH) $(FFT_PATH) $(JPG_PATH) $(PKG_SYSPATH)
+EXTRA_LIB = $(PKG_LIB) $(MPI_LIB) $(FFT_LIB) $(JPG_LIB) $(PKG_SYSLIB)
+EXTRA_CPP_DEPENDS = $(PKG_CPP_DEPENDS)
+EXTRA_LINK_DEPENDS = $(PKG_LINK_DEPENDS)
+
+# Path to src files
+
+vpath %.cpp ..
+vpath %.h ..
+
+# Link target
+
+$(EXE):	$(OBJ) $(EXTRA_LINK_DEPENDS)
+	$(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE)
+	$(SIZE) $(EXE)
+
+# Library targets
+
+lib:	$(OBJ) $(EXTRA_LINK_DEPENDS)
+	$(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ)
+
+shlib:	$(OBJ) $(EXTRA_LINK_DEPENDS)
+	$(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \
+        $(OBJ) $(EXTRA_LIB) $(LIB)
+
+# Compilation rules
+
+%.o:%.cpp
+	$(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $<
+
+# Individual dependencies
+
+depend : fastdep.exe $(SRC)
+	@./fastdep.exe $(EXTRA_INC) -- $^ > .depend || exit 1
+
+fastdep.exe: ../DEPEND/fastdep.c
+	cc -O -o $@ $<
+
+sinclude .depend

From 8503ac22a859abb2617b5fe9bab2e5db6ec0803f Mon Sep 17 00:00:00 2001
From: Christopher Stone <chris.stone@computational-science.com>
Date: Thu, 5 Jan 2017 13:58:11 -0500
Subject: [PATCH 047/267] Fixed error->all instead of error->one bug in
 USER-DPD/fix_shardlow.cpp.

During dynamic load balancing, the subdomains will not be uniform so the
bbox size test in USER-DPD/fix_shardlow.cpp may only be called by one rank.
Using error->one allows any rank to stop the simulation in this scenario.
Added rcut and bbox information to help in diagnostics.
---
 src/USER-DPD/fix_shardlow.cpp | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/USER-DPD/fix_shardlow.cpp b/src/USER-DPD/fix_shardlow.cpp
index 541f4ba3c3..108b82a5b6 100644
--- a/src/USER-DPD/fix_shardlow.cpp
+++ b/src/USER-DPD/fix_shardlow.cpp
@@ -445,7 +445,12 @@ void FixShardlow::initial_integrate(int vflag)
     error->all(FLERR,"Fix shardlow does not yet support triclinic geometries");
 
   if(rcut >= bbx || rcut >= bby || rcut>= bbz )
-    error->all(FLERR,"Shardlow algorithm requires sub-domain length > 2*(rcut+skin). Either reduce the number of processors requested, or change the cutoff/skin\n");
+  {
+    const char fmt[] = "Shardlow algorithm requires sub-domain length > 2*(rcut+skin). Either reduce the number of processors requested, or change the cutoff/skin: rcut= %e bbx= %e bby= %e bbz= %e\n";
+    char msg[sizeof(fmt) + 4*15];  // stack buffer (no unchecked malloc, nothing to leak); 4*15 leaves room for the four expanded %e fields
+    snprintf(msg, sizeof(msg), fmt, rcut, bbx, bby, bbz);  // bounded write: truncates rather than overruns if a value prints wider than expected
+    error->one(FLERR, msg);  // error->one, not error->all: this size check may fail on only some ranks
+  }
 
   // Allocate memory for v_t0 to hold the initial velocities for the ghosts
   v_t0 = (double (*)[3]) memory->smalloc(sizeof(double)*3*nghost, "FixShardlow:v_t0");

From 332372dec2caab9d6f8fdcf3c87d3d6b37466999 Mon Sep 17 00:00:00 2001
From: Christopher Stone <chris.stone@computational-science.com>
Date: Thu, 5 Jan 2017 14:03:16 -0500
Subject: [PATCH 048/267] Renamed Makefile.afrl_thunder to Makefile.icex to be
 more general.

---
 src/MAKE/MACHINES/{Makefile.afrl_thunder => Makefile.icex} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename src/MAKE/MACHINES/{Makefile.afrl_thunder => Makefile.icex} (100%)

diff --git a/src/MAKE/MACHINES/Makefile.afrl_thunder b/src/MAKE/MACHINES/Makefile.icex
similarity index 100%
rename from src/MAKE/MACHINES/Makefile.afrl_thunder
rename to src/MAKE/MACHINES/Makefile.icex

From 19f2d2d1ecb13dae5ef26f16c8ba75b935c5fcf5 Mon Sep 17 00:00:00 2001
From: Dan Ibanez <daibane@sandia.gov>
Date: Thu, 5 Jan 2017 15:22:59 -0700
Subject: [PATCH 049/267] fix many warnings in pair_dpd_fdt_energy_kokkos

one Kokkos kernel was not annotated consistently,
STACKPARAMS was essentially uninitialized and
confused with a local variable,
plus lots of variables were unused in some
of the Kokkos kernels.
---
 src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp | 14 +++-----------
 src/KOKKOS/pair_dpd_fdt_energy_kokkos.h   |  1 +
 2 files changed, 4 insertions(+), 11 deletions(-)

diff --git a/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp
index 4f04da2f3b..5de2b38ed0 100644
--- a/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp
+++ b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp
@@ -49,6 +49,7 @@ PairDPDfdtEnergyKokkos<DeviceType>::PairDPDfdtEnergyKokkos(LAMMPS *lmp) :
   execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
   datamask_read = EMPTY_MASK;
   datamask_modify = EMPTY_MASK;
+  STACKPARAMS = 0;
 }
 
 /* ---------------------------------------------------------------------- */
@@ -164,8 +165,6 @@ void PairDPDfdtEnergyKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
   boltz = force->boltz;
   ftm2v = force->ftm2v;
 
-  int STACKPARAMS = 0; // optimize
-
   // loop over neighbors of my atoms
 
   EV_FLOAT ev;
@@ -278,14 +277,7 @@ void PairDPDfdtEnergyKokkos<DeviceType>::operator()(TagPairDPDfdtEnergyComputeSp
 
   int i,j,jj,inum,jnum,itype,jtype;
   double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
-  double vxtmp,vytmp,vztmp,delvx,delvy,delvz;
-  double rsq,r,rinv,wd,wr,factor_dpd,uTmp;
-  double dot,randnum;
-
-  double kappa_ij, alpha_ij, theta_ij, gamma_ij;
-  double mass_i, mass_j;
-  double massinv_i, massinv_j;
-  double randPair, mu_ij;
+  double rsq,r,rinv,wd,wr,factor_dpd;
 
   i = d_ilist[ii];
   xtmp = x(i,0);
@@ -369,7 +361,7 @@ void PairDPDfdtEnergyKokkos<DeviceType>::operator()(TagPairDPDfdtEnergyComputeNo
   Kokkos::View<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_duCond = d_duCond;
   Kokkos::View<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_duMech = d_duMech;
 
-  int i,j,jj,inum,jnum,itype,jtype;
+  int i,j,jj,jnum,itype,jtype;
   double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
   double vxtmp,vytmp,vztmp,delvx,delvy,delvz;
   double rsq,r,rinv,wd,wr,factor_dpd,uTmp;
diff --git a/src/KOKKOS/pair_dpd_fdt_energy_kokkos.h b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.h
index 41360091bc..2c2b78ac57 100644
--- a/src/KOKKOS/pair_dpd_fdt_energy_kokkos.h
+++ b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.h
@@ -51,6 +51,7 @@ class PairDPDfdtEnergyKokkos : public PairDPDfdtEnergy {
   void init_style();
   double init_one(int, int);
 
+  KOKKOS_INLINE_FUNCTION
   void operator()(TagPairDPDfdtEnergyZero, const int&) const;
 
   template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>

From 318ab9a18506143905a0692f33dd9ac6e22ac4d0 Mon Sep 17 00:00:00 2001
From: Dan Ibanez <dan.a.ibanez@gmail.com>
Date: Tue, 3 Jan 2017 16:33:15 -0700
Subject: [PATCH 050/267] trying PairTableRX : public PairTable

saves a lot of duplicate code
---
 src/USER-DPD/pair_table_rx.cpp | 662 +--------------------------------
 src/USER-DPD/pair_table_rx.h   |  40 +-
 src/pair_table.cpp             |   4 +-
 src/pair_table.h               |   7 +-
 4 files changed, 13 insertions(+), 700 deletions(-)

diff --git a/src/USER-DPD/pair_table_rx.cpp b/src/USER-DPD/pair_table_rx.cpp
index e3cacc6155..c8d59c052d 100644
--- a/src/USER-DPD/pair_table_rx.cpp
+++ b/src/USER-DPD/pair_table_rx.cpp
@@ -31,10 +31,6 @@
 
 using namespace LAMMPS_NS;
 
-enum{NONE,RLINEAR,RSQ,BMP};
-
-#define MAXLINE 1024
-
 #ifdef DBL_EPSILON
   #define MY_EPSILON (10.0*DBL_EPSILON)
 #else
@@ -46,31 +42,13 @@ enum{NONE,RLINEAR,RSQ,BMP};
 
 /* ---------------------------------------------------------------------- */
 
-PairTableRX::PairTableRX(LAMMPS *lmp) : Pair(lmp)
+PairTableRX::PairTableRX(LAMMPS *lmp) : PairTable(lmp)
 {
-  ntables = 0;
-  tables = NULL;
   fractionalWeighting = true;
 }
 
 /* ---------------------------------------------------------------------- */
 
-PairTableRX::~PairTableRX()
-{
-  if (copymode) return;
-
-  for (int m = 0; m < ntables; m++) free_table(&tables[m]);
-  memory->sfree(tables);
-
-  if (allocated) {
-    memory->destroy(setflag);
-    memory->destroy(cutsq);
-    memory->destroy(tabindex);
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
 void PairTableRX::compute(int eflag, int vflag)
 {
   int i,j,ii,jj,inum,jnum,itype,jtype,itable;
@@ -254,24 +232,6 @@ void PairTableRX::compute(int eflag, int vflag)
   memory->destroy(mixWtSite2);
 }
 
-/* ----------------------------------------------------------------------
-   allocate all arrays
-------------------------------------------------------------------------- */
-
-void PairTableRX::allocate()
-{
-  allocated = 1;
-  const int nt = atom->ntypes + 1;
-
-  memory->create(setflag,nt,nt,"pair:setflag");
-  memory->create(cutsq,nt,nt,"pair:cutsq");
-  memory->create(tabindex,nt,nt,"pair:tabindex");
-
-  memset(&setflag[0][0],0,nt*nt*sizeof(int));
-  memset(&cutsq[0][0],0,nt*nt*sizeof(double));
-  memset(&tabindex[0][0],0,nt*nt*sizeof(int));
-}
-
 /* ----------------------------------------------------------------------
    global settings
 ------------------------------------------------------------------------- */
@@ -301,8 +261,8 @@ void PairTableRX::settings(int narg, char **arg)
     else if (strcmp(arg[iarg],"msm") == 0) msmflag = 1;
     else if (strcmp(arg[iarg],"dispersion") == 0) dispersionflag = 1;
     else if (strcmp(arg[iarg],"tip4p") == 0) tip4pflag = 1;
-    else if (strcmp(arg[iarg],"fractional") == 0)   fractionalWeighting = true;
-    else if (strcmp(arg[iarg],"molecular") == 0)   fractionalWeighting = false;
+    else if (strcmp(arg[iarg],"fractional") == 0) fractionalWeighting = true;
+    else if (strcmp(arg[iarg],"molecular") == 0) fractionalWeighting = false;
     else error->all(FLERR,"Illegal pair_style command");
     iarg++;
   }
@@ -464,602 +424,6 @@ void PairTableRX::coeff(int narg, char **arg)
 
 }
 
-/* ----------------------------------------------------------------------
-   init for one type pair i,j and corresponding j,i
-------------------------------------------------------------------------- */
-
-double PairTableRX::init_one(int i, int j)
-{
-  if (setflag[i][j] == 0) error->all(FLERR,"All pair coeffs are not set");
-
-  tabindex[j][i] = tabindex[i][j];
-
-  return tables[tabindex[i][j]].cut;
-}
-
-/* ----------------------------------------------------------------------
-   read a table section from a tabulated potential file
-   only called by proc 0
-   this function sets these values in Table:
-     ninput,rfile,efile,ffile,rflag,rlo,rhi,fpflag,fplo,fphi,ntablebits
-------------------------------------------------------------------------- */
-
-void PairTableRX::read_table(Table *tb, char *file, char *keyword)
-{
-  char line[MAXLINE];
-
-  // open file
-
-  FILE *fp = force->open_potential(file);
-  if (fp == NULL) {
-    char str[128];
-    sprintf(str,"Cannot open file %s",file);
-    error->one(FLERR,str);
-  }
-
-  // loop until section found with matching keyword
-
-  while (1) {
-    if (fgets(line,MAXLINE,fp) == NULL)
-      error->one(FLERR,"Did not find keyword in table file");
-    if (strspn(line," \t\n\r") == strlen(line)) continue;  // blank line
-    if (line[0] == '#') continue;                          // comment
-    char *word = strtok(line," \t\n\r");
-    if (strcmp(word,keyword) == 0) break;           // matching keyword
-    fgets(line,MAXLINE,fp);                         // no match, skip section
-    param_extract(tb,line);
-    fgets(line,MAXLINE,fp);
-    for (int i = 0; i < tb->ninput; i++) fgets(line,MAXLINE,fp);
-  }
-
-  // read args on 2nd line of section
-  // allocate table arrays for file values
-
-  fgets(line,MAXLINE,fp);
-  param_extract(tb,line);
-  memory->create(tb->rfile,tb->ninput,"pair:rfile");
-  memory->create(tb->efile,tb->ninput,"pair:efile");
-  memory->create(tb->ffile,tb->ninput,"pair:ffile");
-
-  // setup bitmap parameters for table to read in
-
-  tb->ntablebits = 0;
-  int masklo,maskhi,nmask,nshiftbits;
-  if (tb->rflag == BMP) {
-    while (1 << tb->ntablebits < tb->ninput) tb->ntablebits++;
-    if (1 << tb->ntablebits != tb->ninput)
-      error->one(FLERR,"Bitmapped table is incorrect length in table file");
-    init_bitmap(tb->rlo,tb->rhi,tb->ntablebits,masklo,maskhi,nmask,nshiftbits);
-  }
-
-  // read r,e,f table values from file
-  // if rflag set, compute r
-  // if rflag not set, use r from file
-
-  int itmp;
-  double rtmp;
-  union_int_float_t rsq_lookup;
-
-  fgets(line,MAXLINE,fp);
-  for (int i = 0; i < tb->ninput; i++) {
-    fgets(line,MAXLINE,fp);
-    sscanf(line,"%d %lg %lg %lg",&itmp,&rtmp,&tb->efile[i],&tb->ffile[i]);
-
-    if (tb->rflag == RLINEAR)
-      rtmp = tb->rlo + (tb->rhi - tb->rlo)*i/(tb->ninput-1);
-    else if (tb->rflag == RSQ) {
-      rtmp = tb->rlo*tb->rlo +
-        (tb->rhi*tb->rhi - tb->rlo*tb->rlo)*i/(tb->ninput-1);
-      rtmp = sqrt(rtmp);
-    } else if (tb->rflag == BMP) {
-      rsq_lookup.i = i << nshiftbits;
-      rsq_lookup.i |= masklo;
-      if (rsq_lookup.f < tb->rlo*tb->rlo) {
-        rsq_lookup.i = i << nshiftbits;
-        rsq_lookup.i |= maskhi;
-      }
-      rtmp = sqrtf(rsq_lookup.f);
-    }
-
-    tb->rfile[i] = rtmp;
-  }
-
-  // close file
-
-  fclose(fp);
-}
-
-/* ----------------------------------------------------------------------
-   broadcast read-in table info from proc 0 to other procs
-   this function communicates these values in Table:
-     ninput,rfile,efile,ffile,rflag,rlo,rhi,fpflag,fplo,fphi
-------------------------------------------------------------------------- */
-
-void PairTableRX::bcast_table(Table *tb)
-{
-  MPI_Bcast(&tb->ninput,1,MPI_INT,0,world);
-
-  int me;
-  MPI_Comm_rank(world,&me);
-  if (me > 0) {
-    memory->create(tb->rfile,tb->ninput,"pair:rfile");
-    memory->create(tb->efile,tb->ninput,"pair:efile");
-    memory->create(tb->ffile,tb->ninput,"pair:ffile");
-  }
-
-  MPI_Bcast(tb->rfile,tb->ninput,MPI_DOUBLE,0,world);
-  MPI_Bcast(tb->efile,tb->ninput,MPI_DOUBLE,0,world);
-  MPI_Bcast(tb->ffile,tb->ninput,MPI_DOUBLE,0,world);
-
-  MPI_Bcast(&tb->rflag,1,MPI_INT,0,world);
-  if (tb->rflag) {
-    MPI_Bcast(&tb->rlo,1,MPI_DOUBLE,0,world);
-    MPI_Bcast(&tb->rhi,1,MPI_DOUBLE,0,world);
-  }
-  MPI_Bcast(&tb->fpflag,1,MPI_INT,0,world);
-  if (tb->fpflag) {
-    MPI_Bcast(&tb->fplo,1,MPI_DOUBLE,0,world);
-    MPI_Bcast(&tb->fphi,1,MPI_DOUBLE,0,world);
-  }
-}
-
-/* ----------------------------------------------------------------------
-   build spline representation of e,f over entire range of read-in table
-   this function sets these values in Table: e2file,f2file
-------------------------------------------------------------------------- */
-
-void PairTableRX::spline_table(Table *tb)
-{
-  memory->create(tb->e2file,tb->ninput,"pair:e2file");
-  memory->create(tb->f2file,tb->ninput,"pair:f2file");
-
-  double ep0 = - tb->ffile[0];
-  double epn = - tb->ffile[tb->ninput-1];
-  spline(tb->rfile,tb->efile,tb->ninput,ep0,epn,tb->e2file);
-
-  if (tb->fpflag == 0) {
-    tb->fplo = (tb->ffile[1] - tb->ffile[0]) / (tb->rfile[1] - tb->rfile[0]);
-    tb->fphi = (tb->ffile[tb->ninput-1] - tb->ffile[tb->ninput-2]) /
-      (tb->rfile[tb->ninput-1] - tb->rfile[tb->ninput-2]);
-  }
-
-  double fp0 = tb->fplo;
-  double fpn = tb->fphi;
-  spline(tb->rfile,tb->ffile,tb->ninput,fp0,fpn,tb->f2file);
-}
-
-/* ----------------------------------------------------------------------
-   extract attributes from parameter line in table section
-   format of line: N value R/RSQ/BITMAP lo hi FP fplo fphi
-   N is required, other params are optional
-------------------------------------------------------------------------- */
-
-void PairTableRX::param_extract(Table *tb, char *line)
-{
-  tb->ninput = 0;
-  tb->rflag = NONE;
-  tb->fpflag = 0;
-
-  char *word = strtok(line," \t\n\r\f");
-  while (word) {
-    if (strcmp(word,"N") == 0) {
-      word = strtok(NULL," \t\n\r\f");
-      tb->ninput = atoi(word);
-    } else if (strcmp(word,"R") == 0 || strcmp(word,"RSQ") == 0 ||
-               strcmp(word,"BITMAP") == 0) {
-      if (strcmp(word,"R") == 0) tb->rflag = RLINEAR;
-      else if (strcmp(word,"RSQ") == 0) tb->rflag = RSQ;
-      else if (strcmp(word,"BITMAP") == 0) tb->rflag = BMP;
-      word = strtok(NULL," \t\n\r\f");
-      tb->rlo = atof(word);
-      word = strtok(NULL," \t\n\r\f");
-      tb->rhi = atof(word);
-    } else if (strcmp(word,"FP") == 0) {
-      tb->fpflag = 1;
-      word = strtok(NULL," \t\n\r\f");
-      tb->fplo = atof(word);
-      word = strtok(NULL," \t\n\r\f");
-      tb->fphi = atof(word);
-    } else {
-      printf("WORD: %s\n",word);
-      error->one(FLERR,"Invalid keyword in pair table parameters");
-    }
-    word = strtok(NULL," \t\n\r\f");
-  }
-
-  if (tb->ninput == 0) error->one(FLERR,"Pair table parameters did not set N");
-}
-
-/* ----------------------------------------------------------------------
-   compute r,e,f vectors from splined values
-------------------------------------------------------------------------- */
-
-void PairTableRX::compute_table(Table *tb)
-{
-  int tlm1 = tablength-1;
-
-  // inner = inner table bound
-  // cut = outer table bound
-  // delta = table spacing in rsq for N-1 bins
-
-  double inner;
-  if (tb->rflag) inner = tb->rlo;
-  else inner = tb->rfile[0];
-  tb->innersq = double(inner)*double(inner);
-  tb->delta = double(tb->cut*tb->cut - double(tb->innersq)) / double(tlm1);
-  tb->invdelta = 1.0/double(tb->delta);
-
-  // direct lookup tables
-  // N-1 evenly spaced bins in rsq from inner to cut
-  // e,f = value at midpt of bin
-  // e,f are N-1 in length since store 1 value at bin midpt
-  // f is converted to f/r when stored in f[i]
-  // e,f are never a match to read-in values, always computed via spline interp
-
-  if (tabstyle == LOOKUP) {
-    memory->create(tb->e,tlm1,"pair:e");
-    memory->create(tb->f,tlm1,"pair:f");
-
-    double r,rsq;
-    for (int i = 0; i < tlm1; i++) {
-      rsq = tb->innersq + (i+0.5)*tb->delta;
-      r = sqrt(rsq);
-      tb->e[i] = splint(tb->rfile,tb->efile,tb->e2file,tb->ninput,r);
-      tb->f[i] = splint(tb->rfile,tb->ffile,tb->f2file,tb->ninput,r)/r;
-    }
-  }
-
-  // linear tables
-  // N-1 evenly spaced bins in rsq from inner to cut
-  // rsq,e,f = value at lower edge of bin
-  // de,df values = delta from lower edge to upper edge of bin
-  // rsq,e,f are N in length so de,df arrays can compute difference
-  // f is converted to f/r when stored in f[i]
-  // e,f can match read-in values, else compute via spline interp
-
-  if (tabstyle == LINEAR) {
-    memory->create(tb->rsq,tablength,"pair:rsq");
-    memory->create(tb->e,tablength,"pair:e");
-    memory->create(tb->f,tablength,"pair:f");
-    memory->create(tb->de,tlm1,"pair:de");
-    memory->create(tb->df,tlm1,"pair:df");
-
-    double r,rsq;
-    for (int i = 0; i < tablength; i++) {
-      rsq = tb->innersq + i*tb->delta;
-      r = sqrt(rsq);
-      tb->rsq[i] = rsq;
-      if (tb->match) {
-        tb->e[i] = tb->efile[i];
-        tb->f[i] = tb->ffile[i]/r;
-      } else {
-        tb->e[i] = splint(tb->rfile,tb->efile,tb->e2file,tb->ninput,r);
-        tb->f[i] = splint(tb->rfile,tb->ffile,tb->f2file,tb->ninput,r)/r;
-      }
-    }
-
-    for (int i = 0; i < tlm1; i++) {
-      tb->de[i] = tb->e[i+1] - tb->e[i];
-      tb->df[i] = tb->f[i+1] - tb->f[i];
-    }
-  }
-
-  // cubic spline tables
-  // N-1 evenly spaced bins in rsq from inner to cut
-  // rsq,e,f = value at lower edge of bin
-  // e2,f2 = spline coefficient for each bin
-  // rsq,e,f,e2,f2 are N in length so have N-1 spline bins
-  // f is converted to f/r after e is splined
-  // e,f can match read-in values, else compute via spline interp
-
-  if (tabstyle == SPLINE) {
-    memory->create(tb->rsq,tablength,"pair:rsq");
-    memory->create(tb->e,tablength,"pair:e");
-    memory->create(tb->f,tablength,"pair:f");
-    memory->create(tb->e2,tablength,"pair:e2");
-    memory->create(tb->f2,tablength,"pair:f2");
-
-    tb->deltasq6 = tb->delta*tb->delta / 6.0;
-
-    double r,rsq;
-    for (int i = 0; i < tablength; i++) {
-      rsq = tb->innersq + i*tb->delta;
-      r = sqrt(rsq);
-      tb->rsq[i] = rsq;
-      if (tb->match) {
-        tb->e[i] = tb->efile[i];
-        tb->f[i] = tb->ffile[i]/r;
-      } else {
-        tb->e[i] = splint(tb->rfile,tb->efile,tb->e2file,tb->ninput,r);
-        tb->f[i] = splint(tb->rfile,tb->ffile,tb->f2file,tb->ninput,r);
-      }
-    }
-
-    // ep0,epn = dh/dg at inner and at cut
-    // h(r) = e(r) and g(r) = r^2
-    // dh/dg = (de/dr) / 2r = -f/2r
-
-    double ep0 = - tb->f[0] / (2.0 * sqrt(tb->innersq));
-    double epn = - tb->f[tlm1] / (2.0 * tb->cut);
-    spline(tb->rsq,tb->e,tablength,ep0,epn,tb->e2);
-
-    // fp0,fpn = dh/dg at inner and at cut
-    // h(r) = f(r)/r and g(r) = r^2
-    // dh/dg = (1/r df/dr - f/r^2) / 2r
-    // dh/dg in secant approx = (f(r2)/r2 - f(r1)/r1) / (g(r2) - g(r1))
-
-    double fp0,fpn;
-    double secant_factor = 0.1;
-    if (tb->fpflag) fp0 = (tb->fplo/sqrt(tb->innersq) - tb->f[0]/tb->innersq) /
-      (2.0 * sqrt(tb->innersq));
-    else {
-      double rsq1 = tb->innersq;
-      double rsq2 = rsq1 + secant_factor*tb->delta;
-      fp0 = (splint(tb->rfile,tb->ffile,tb->f2file,tb->ninput,sqrt(rsq2)) /
-             sqrt(rsq2) - tb->f[0] / sqrt(rsq1)) / (secant_factor*tb->delta);
-    }
-
-    if (tb->fpflag && tb->cut == tb->rfile[tb->ninput-1]) fpn =
-      (tb->fphi/tb->cut - tb->f[tlm1]/(tb->cut*tb->cut)) / (2.0 * tb->cut);
-    else {
-      double rsq2 = tb->cut * tb->cut;
-      double rsq1 = rsq2 - secant_factor*tb->delta;
-      fpn = (tb->f[tlm1] / sqrt(rsq2) -
-             splint(tb->rfile,tb->ffile,tb->f2file,tb->ninput,sqrt(rsq1)) /
-             sqrt(rsq1)) / (secant_factor*tb->delta);
-    }
-
-    for (int i = 0; i < tablength; i++) tb->f[i] /= sqrt(tb->rsq[i]);
-    spline(tb->rsq,tb->f,tablength,fp0,fpn,tb->f2);
-  }
-
-  // bitmapped linear tables
-  // 2^N bins from inner to cut, spaced in bitmapped manner
-  // f is converted to f/r when stored in f[i]
-  // e,f can match read-in values, else compute via spline interp
-
-  if (tabstyle == BITMAP) {
-    double r;
-    union_int_float_t rsq_lookup;
-    int masklo,maskhi;
-
-    // linear lookup tables of length ntable = 2^n
-    // stored value = value at lower edge of bin
-
-    init_bitmap(inner,tb->cut,tablength,masklo,maskhi,tb->nmask,tb->nshiftbits);
-    int ntable = 1 << tablength;
-    int ntablem1 = ntable - 1;
-
-    memory->create(tb->rsq,ntable,"pair:rsq");
-    memory->create(tb->e,ntable,"pair:e");
-    memory->create(tb->f,ntable,"pair:f");
-    memory->create(tb->de,ntable,"pair:de");
-    memory->create(tb->df,ntable,"pair:df");
-    memory->create(tb->drsq,ntable,"pair:drsq");
-
-    union_int_float_t minrsq_lookup;
-    minrsq_lookup.i = 0 << tb->nshiftbits;
-    minrsq_lookup.i |= maskhi;
-
-    for (int i = 0; i < ntable; i++) {
-      rsq_lookup.i = i << tb->nshiftbits;
-      rsq_lookup.i |= masklo;
-      if (rsq_lookup.f < tb->innersq) {
-        rsq_lookup.i = i << tb->nshiftbits;
-        rsq_lookup.i |= maskhi;
-      }
-      r = sqrtf(rsq_lookup.f);
-      tb->rsq[i] = rsq_lookup.f;
-      if (tb->match) {
-        tb->e[i] = tb->efile[i];
-        tb->f[i] = tb->ffile[i]/r;
-      } else {
-        tb->e[i] = splint(tb->rfile,tb->efile,tb->e2file,tb->ninput,r);
-        tb->f[i] = splint(tb->rfile,tb->ffile,tb->f2file,tb->ninput,r)/r;
-      }
-      minrsq_lookup.f = MIN(minrsq_lookup.f,rsq_lookup.f);
-    }
-
-    tb->innersq = minrsq_lookup.f;
-
-    for (int i = 0; i < ntablem1; i++) {
-      tb->de[i] = tb->e[i+1] - tb->e[i];
-      tb->df[i] = tb->f[i+1] - tb->f[i];
-      tb->drsq[i] = 1.0/(tb->rsq[i+1] - tb->rsq[i]);
-    }
-
-    // get the delta values for the last table entries
-    // tables are connected periodically between 0 and ntablem1
-
-    tb->de[ntablem1] = tb->e[0] - tb->e[ntablem1];
-    tb->df[ntablem1] = tb->f[0] - tb->f[ntablem1];
-    tb->drsq[ntablem1] = 1.0/(tb->rsq[0] - tb->rsq[ntablem1]);
-
-    // get the correct delta values at itablemax
-    // smallest r is in bin itablemin
-    // largest r is in bin itablemax, which is itablemin-1,
-    //   or ntablem1 if itablemin=0
-
-    // deltas at itablemax only needed if corresponding rsq < cut*cut
-    // if so, compute deltas between rsq and cut*cut
-    //   if tb->match, data at cut*cut is unavailable, so we'll take
-    //   deltas at itablemax-1 as a good approximation
-
-    double e_tmp,f_tmp;
-    int itablemin = minrsq_lookup.i & tb->nmask;
-    itablemin >>= tb->nshiftbits;
-    int itablemax = itablemin - 1;
-    if (itablemin == 0) itablemax = ntablem1;
-    int itablemaxm1 = itablemax - 1;
-    if (itablemax == 0) itablemaxm1 = ntablem1;
-    rsq_lookup.i = itablemax << tb->nshiftbits;
-    rsq_lookup.i |= maskhi;
-    if (rsq_lookup.f < tb->cut*tb->cut) {
-      if (tb->match) {
-        tb->de[itablemax] = tb->de[itablemaxm1];
-        tb->df[itablemax] = tb->df[itablemaxm1];
-        tb->drsq[itablemax] = tb->drsq[itablemaxm1];
-      } else {
-            rsq_lookup.f = tb->cut*tb->cut;
-        r = sqrtf(rsq_lookup.f);
-        e_tmp = splint(tb->rfile,tb->efile,tb->e2file,tb->ninput,r);
-        f_tmp = splint(tb->rfile,tb->ffile,tb->f2file,tb->ninput,r)/r;
-        tb->de[itablemax] = e_tmp - tb->e[itablemax];
-        tb->df[itablemax] = f_tmp - tb->f[itablemax];
-        tb->drsq[itablemax] = 1.0/(rsq_lookup.f - tb->rsq[itablemax]);
-      }
-    }
-  }
-}
-
-/* ----------------------------------------------------------------------
-   set all ptrs in a table to NULL, so can be freed safely
-------------------------------------------------------------------------- */
-
-void PairTableRX::null_table(Table *tb)
-{
-  tb->rfile = tb->efile = tb->ffile = NULL;
-  tb->e2file = tb->f2file = NULL;
-  tb->rsq = tb->drsq = tb->e = tb->de = NULL;
-  tb->f = tb->df = tb->e2 = tb->f2 = NULL;
-}
-
-/* ----------------------------------------------------------------------
-   free all arrays in a table
-------------------------------------------------------------------------- */
-
-void PairTableRX::free_table(Table *tb)
-{
-  memory->destroy(tb->rfile);
-  memory->destroy(tb->efile);
-  memory->destroy(tb->ffile);
-  memory->destroy(tb->e2file);
-  memory->destroy(tb->f2file);
-
-  memory->destroy(tb->rsq);
-  memory->destroy(tb->drsq);
-  memory->destroy(tb->e);
-  memory->destroy(tb->de);
-  memory->destroy(tb->f);
-  memory->destroy(tb->df);
-  memory->destroy(tb->e2);
-  memory->destroy(tb->f2);
-}
-
-/* ----------------------------------------------------------------------
-   spline and splint routines modified from Numerical Recipes
-------------------------------------------------------------------------- */
-
-void PairTableRX::spline(double *x, double *y, int n,
-                       double yp1, double ypn, double *y2)
-{
-  int i,k;
-  double p,qn,sig,un;
-  double *u = new double[n];
-
-  if (yp1 > 0.99e30) y2[0] = u[0] = 0.0;
-  else {
-    y2[0] = -0.5;
-    u[0] = (3.0/(x[1]-x[0])) * ((y[1]-y[0]) / (x[1]-x[0]) - yp1);
-  }
-  for (i = 1; i < n-1; i++) {
-    sig = (x[i]-x[i-1]) / (x[i+1]-x[i-1]);
-    p = sig*y2[i-1] + 2.0;
-    y2[i] = (sig-1.0) / p;
-    u[i] = (y[i+1]-y[i]) / (x[i+1]-x[i]) - (y[i]-y[i-1]) / (x[i]-x[i-1]);
-    u[i] = (6.0*u[i] / (x[i+1]-x[i-1]) - sig*u[i-1]) / p;
-  }
-  if (ypn > 0.99e30) qn = un = 0.0;
-  else {
-    qn = 0.5;
-    un = (3.0/(x[n-1]-x[n-2])) * (ypn - (y[n-1]-y[n-2]) / (x[n-1]-x[n-2]));
-  }
-  y2[n-1] = (un-qn*u[n-2]) / (qn*y2[n-2] + 1.0);
-  for (k = n-2; k >= 0; k--) y2[k] = y2[k]*y2[k+1] + u[k];
-
-  delete [] u;
-}
-
-/* ---------------------------------------------------------------------- */
-
-double PairTableRX::splint(double *xa, double *ya, double *y2a, int n, double x)
-{
-  int klo,khi,k;
-  double h,b,a,y;
-
-  klo = 0;
-  khi = n-1;
-  while (khi-klo > 1) {
-    k = (khi+klo) >> 1;
-    if (xa[k] > x) khi = k;
-    else klo = k;
-  }
-  h = xa[khi]-xa[klo];
-  a = (xa[khi]-x) / h;
-  b = (x-xa[klo]) / h;
-  y = a*ya[klo] + b*ya[khi] +
-    ((a*a*a-a)*y2a[klo] + (b*b*b-b)*y2a[khi]) * (h*h)/6.0;
-  return y;
-}
-
-/* ----------------------------------------------------------------------
-   proc 0 writes to restart file
-------------------------------------------------------------------------- */
-
-void PairTableRX::write_restart(FILE *fp)
-{
-  write_restart_settings(fp);
-}
-
-/* ----------------------------------------------------------------------
-   proc 0 reads from restart file, bcasts
-------------------------------------------------------------------------- */
-
-void PairTableRX::read_restart(FILE *fp)
-{
-  read_restart_settings(fp);
-  allocate();
-}
-
-/* ----------------------------------------------------------------------
-   proc 0 writes to restart file
-------------------------------------------------------------------------- */
-
-void PairTableRX::write_restart_settings(FILE *fp)
-{
-  fwrite(&tabstyle,sizeof(int),1,fp);
-  fwrite(&tablength,sizeof(int),1,fp);
-  fwrite(&ewaldflag,sizeof(int),1,fp);
-  fwrite(&pppmflag,sizeof(int),1,fp);
-  fwrite(&msmflag,sizeof(int),1,fp);
-  fwrite(&dispersionflag,sizeof(int),1,fp);
-  fwrite(&tip4pflag,sizeof(int),1,fp);
-}
-
-/* ----------------------------------------------------------------------
-   proc 0 reads from restart file, bcasts
-------------------------------------------------------------------------- */
-
-void PairTableRX::read_restart_settings(FILE *fp)
-{
-  if (comm->me == 0) {
-    fread(&tabstyle,sizeof(int),1,fp);
-    fread(&tablength,sizeof(int),1,fp);
-    fread(&ewaldflag,sizeof(int),1,fp);
-    fread(&pppmflag,sizeof(int),1,fp);
-    fread(&msmflag,sizeof(int),1,fp);
-    fread(&dispersionflag,sizeof(int),1,fp);
-    fread(&tip4pflag,sizeof(int),1,fp);
-  }
-  MPI_Bcast(&tabstyle,1,MPI_INT,0,world);
-  MPI_Bcast(&tablength,1,MPI_INT,0,world);
-  MPI_Bcast(&ewaldflag,1,MPI_INT,0,world);
-  MPI_Bcast(&pppmflag,1,MPI_INT,0,world);
-  MPI_Bcast(&msmflag,1,MPI_INT,0,world);
-  MPI_Bcast(&dispersionflag,1,MPI_INT,0,world);
-  MPI_Bcast(&tip4pflag,1,MPI_INT,0,world);
-}
-
 /* ---------------------------------------------------------------------- */
 
 double PairTableRX::single(int i, int j, int itype, int jtype, double rsq,
@@ -1131,26 +495,6 @@ double PairTableRX::single(int i, int j, int itype, int jtype, double rsq,
   return factor_lj*phi;
 }
 
-/* ----------------------------------------------------------------------
-   return the Coulomb cutoff for tabled potentials
-   called by KSpace solvers which require that all pairwise cutoffs be the same
-   loop over all tables not just those indexed by tabindex[i][j] since
-     no way to know which tables are active since pair::init() not yet called
-------------------------------------------------------------------------- */
-
-void *PairTableRX::extract(const char *str, int &dim)
-{
-  if (strcmp(str,"cut_coul") != 0) return NULL;
-  if (ntables == 0) error->all(FLERR,"All pair coeffs are not set");
-
-  double cut_coul = tables[0].cut;
-  for (int m = 1; m < ntables; m++)
-    if (tables[m].cut != cut_coul)
-      error->all(FLERR,"Pair table cutoffs must all be equal to use with KSpace");
-  dim = 0;
-  return &tables[0].cut;
-}
-
 /* ---------------------------------------------------------------------- */
 
 void PairTableRX::getMixingWeights(int id, double &mixWtSite1old, double &mixWtSite2old, double &mixWtSite1, double &mixWtSite2)
diff --git a/src/USER-DPD/pair_table_rx.h b/src/USER-DPD/pair_table_rx.h
index c6afe6a8d5..4f80872029 100644
--- a/src/USER-DPD/pair_table_rx.h
+++ b/src/USER-DPD/pair_table_rx.h
@@ -1,4 +1,4 @@
-/* ----------------------------------------------------------------------
+/* -*- c++ -*- ----------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
@@ -20,11 +20,11 @@ PairStyle(table/rx,PairTableRX)
 #ifndef LMP_PAIR_TABLE_RX_H
 #define LMP_PAIR_TABLE_RX_H
 
-#include "pair.h"
+#include "pair_table.h"
 
 namespace LAMMPS_NS {
 
-class PairTableRX : public Pair {
+class PairTableRX : public PairTable {
  public:
   PairTableRX(class LAMMPS *);
   virtual ~PairTableRX();
@@ -32,43 +32,11 @@ class PairTableRX : public Pair {
   virtual void compute(int, int);
   void settings(int, char **);
   void coeff(int, char **);
-  double init_one(int, int);
-  void write_restart(FILE *);
-  void read_restart(FILE *);
-  void write_restart_settings(FILE *);
-  void read_restart_settings(FILE *);
-  double single(int, int, int, int, double, double, double, double &);
-  void *extract(const char *, int &);
+  virtual double single(int, int, int, int, double, double, double, double &);
 
  protected:
   enum{LOOKUP,LINEAR,SPLINE,BITMAP};
 
-  int tabstyle,tablength;
-  struct Table {
-    int ninput,rflag,fpflag,match,ntablebits;
-    int nshiftbits,nmask;
-    double rlo,rhi,fplo,fphi,cut;
-    double *rfile,*efile,*ffile;
-    double *e2file,*f2file;
-    double innersq,delta,invdelta,deltasq6;
-    double *rsq,*drsq,*e,*de,*f,*df,*e2,*f2;
-  };
-  int ntables;
-  Table *tables;
-
-  int **tabindex;
-
-  void allocate();
-  void read_table(Table *, char *, char *);
-  void param_extract(Table *, char *);
-  void bcast_table(Table *);
-  void spline_table(Table *);
-  void compute_table(Table *);
-  void null_table(Table *);
-  void free_table(Table *);
-  void spline(double *, double *, int, double, double, double *);
-  double splint(double *, double *, double *, int, double);
-
   int nspecies;
   char *site1, *site2;
   int isite1, isite2;
diff --git a/src/pair_table.cpp b/src/pair_table.cpp
index c4bc3e7dd2..1c6bfe128e 100644
--- a/src/pair_table.cpp
+++ b/src/pair_table.cpp
@@ -29,8 +29,6 @@
 
 using namespace LAMMPS_NS;
 
-enum{NONE,RLINEAR,RSQ,BMP};
-
 #define MAXLINE 1024
 #define EPSILONR 1.0e-6
 
@@ -46,6 +44,8 @@ PairTable::PairTable(LAMMPS *lmp) : Pair(lmp)
 
 PairTable::~PairTable()
 {
+  if (copymode) return;
+
   for (int m = 0; m < ntables; m++) free_table(&tables[m]);
   memory->sfree(tables);
 
diff --git a/src/pair_table.h b/src/pair_table.h
index 358491f7cf..370efcec2f 100644
--- a/src/pair_table.h
+++ b/src/pair_table.h
@@ -37,11 +37,12 @@ class PairTable : public Pair {
   void read_restart(FILE *);
   void write_restart_settings(FILE *);
   void read_restart_settings(FILE *);
-  double single(int, int, int, int, double, double, double, double &);
+  virtual double single(int, int, int, int, double, double, double, double &);
   void *extract(const char *, int &);
 
  protected:
   enum{LOOKUP,LINEAR,SPLINE,BITMAP};
+  enum{NONE,RLINEAR,RSQ,BMP};
 
   int tabstyle,tablength;
   struct Table {
@@ -66,8 +67,8 @@ class PairTable : public Pair {
   void compute_table(Table *);
   void null_table(Table *);
   void free_table(Table *);
-  void spline(double *, double *, int, double, double, double *);
-  double splint(double *, double *, double *, int, double);
+  static void spline(double *, double *, int, double, double, double *);
+  static double splint(double *, double *, double *, int, double);
 };
 
 }

From 3941fe9ab7488bf91fca7b6c529ad2c2cd7ba35b Mon Sep 17 00:00:00 2001
From: Dan Ibanez <dan.a.ibanez@gmail.com>
Date: Tue, 3 Jan 2017 16:42:24 -0700
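Subject: [PATCH 051/267] fix compilation

The previous patch removed the out-of-line PairTableRX destructor
definition but left its declaration in pair_table_rx.h; give the
destructor an inline empty body so the class is complete again.
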
---
 src/USER-DPD/pair_table_rx.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/USER-DPD/pair_table_rx.h b/src/USER-DPD/pair_table_rx.h
index 4f80872029..00314ac424 100644
--- a/src/USER-DPD/pair_table_rx.h
+++ b/src/USER-DPD/pair_table_rx.h
@@ -27,7 +27,7 @@ namespace LAMMPS_NS {
 class PairTableRX : public PairTable {
  public:
   PairTableRX(class LAMMPS *);
-  virtual ~PairTableRX();
+  virtual ~PairTableRX() {}
 
   virtual void compute(int, int);
   void settings(int, char **);

From a1ac2ae9b7570d27148064d90ea8051c7e30c75e Mon Sep 17 00:00:00 2001
From: Dan Ibanez <dan.a.ibanez@gmail.com>
Date: Wed, 4 Jan 2017 10:51:31 -0700
Subject: [PATCH 052/267] move enum to pair.h

to avoid having it be replicated
in several different locations
---
 src/pair.cpp     | 2 --
 src/pair.h       | 2 ++
 src/pair_table.h | 1 -
 3 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/pair.cpp b/src/pair.cpp
index 5d73a592e8..f8ae641d2f 100644
--- a/src/pair.cpp
+++ b/src/pair.cpp
@@ -43,8 +43,6 @@
 using namespace LAMMPS_NS;
 using namespace MathConst;
 
-enum{NONE,RLINEAR,RSQ,BMP};
-
 // allocate space for static class instance variable and initialize it
 
 int Pair::instance_total = 0;
diff --git a/src/pair.h b/src/pair.h
index 3378115e49..fbb6d8408b 100644
--- a/src/pair.h
+++ b/src/pair.h
@@ -32,6 +32,8 @@ class Pair : protected Pointers {
   friend class Info;
 
  public:
+  enum{NONE,RLINEAR,RSQ,BMP};
+
   static int instance_total;     // # of Pair classes ever instantiated
 
   double eng_vdwl,eng_coul;      // accumulated energies
diff --git a/src/pair_table.h b/src/pair_table.h
index 370efcec2f..8d5dbdb28a 100644
--- a/src/pair_table.h
+++ b/src/pair_table.h
@@ -42,7 +42,6 @@ class PairTable : public Pair {
 
  protected:
   enum{LOOKUP,LINEAR,SPLINE,BITMAP};
-  enum{NONE,RLINEAR,RSQ,BMP};
 
   int tabstyle,tablength;
   struct Table {

From 70927d08e734d418ed61f9492160ca23b65c6e6f Mon Sep 17 00:00:00 2001
From: Dan Ibanez <dan.a.ibanez@gmail.com>
Date: Thu, 5 Jan 2017 13:25:30 -0700
Subject: [PATCH 053/267] remove duplicate enum

---
 src/USER-DPD/pair_table_rx.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/USER-DPD/pair_table_rx.h b/src/USER-DPD/pair_table_rx.h
index 00314ac424..9dee5df266 100644
--- a/src/USER-DPD/pair_table_rx.h
+++ b/src/USER-DPD/pair_table_rx.h
@@ -35,7 +35,6 @@ class PairTableRX : public PairTable {
   virtual double single(int, int, int, int, double, double, double, double &);
 
  protected:
-  enum{LOOKUP,LINEAR,SPLINE,BITMAP};
 
   int nspecies;
   char *site1, *site2;

From ad1402562d70ffa4a03e150ed9246e8ae710c684 Mon Sep 17 00:00:00 2001
From: Dan Ibanez <daibane@sandia.gov>
Date: Thu, 5 Jan 2017 20:54:24 -0700
Subject: [PATCH 054/267] Revert "move enum to pair.h"

This reverts commit a1ac2ae9b7570d27148064d90ea8051c7e30c75e.
---
 src/pair.cpp     | 2 ++
 src/pair.h       | 2 --
 src/pair_table.h | 1 +
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/pair.cpp b/src/pair.cpp
index f8ae641d2f..5d73a592e8 100644
--- a/src/pair.cpp
+++ b/src/pair.cpp
@@ -43,6 +43,8 @@
 using namespace LAMMPS_NS;
 using namespace MathConst;
 
+enum{NONE,RLINEAR,RSQ,BMP};
+
 // allocate space for static class instance variable and initialize it
 
 int Pair::instance_total = 0;
diff --git a/src/pair.h b/src/pair.h
index fbb6d8408b..3378115e49 100644
--- a/src/pair.h
+++ b/src/pair.h
@@ -32,8 +32,6 @@ class Pair : protected Pointers {
   friend class Info;
 
  public:
-  enum{NONE,RLINEAR,RSQ,BMP};
-
   static int instance_total;     // # of Pair classes ever instantiated
 
   double eng_vdwl,eng_coul;      // accumulated energies
diff --git a/src/pair_table.h b/src/pair_table.h
index 8d5dbdb28a..370efcec2f 100644
--- a/src/pair_table.h
+++ b/src/pair_table.h
@@ -42,6 +42,7 @@ class PairTable : public Pair {
 
  protected:
   enum{LOOKUP,LINEAR,SPLINE,BITMAP};
+  enum{NONE,RLINEAR,RSQ,BMP};
 
   int tabstyle,tablength;
   struct Table {

From d8ddef37ed5407a3723b854ffc8ae077fb4c9fc5 Mon Sep 17 00:00:00 2001
From: Dan Ibanez <daibane@sandia.gov>
Date: Thu, 5 Jan 2017 20:56:37 -0700
Subject: [PATCH 055/267] put enum back in .cpp file

see lammps/lammps#325
---
 src/USER-DPD/pair_table_rx.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/USER-DPD/pair_table_rx.cpp b/src/USER-DPD/pair_table_rx.cpp
index c8d59c052d..e8f0e81057 100644
--- a/src/USER-DPD/pair_table_rx.cpp
+++ b/src/USER-DPD/pair_table_rx.cpp
@@ -31,6 +31,8 @@
 
 using namespace LAMMPS_NS;
 
+enum{NONE,RLINEAR,RSQ,BMP};
+
 #ifdef DBL_EPSILON
   #define MY_EPSILON (10.0*DBL_EPSILON)
 #else

From 7201f003e57716ac7a14378127dc22fbc63954f1 Mon Sep 17 00:00:00 2001
From: Dan Ibanez <daibane@sandia.gov>
Date: Thu, 5 Jan 2017 21:00:39 -0700
Subject: [PATCH 056/267] move another enum back

see lammps/lammps#325
---
 src/pair_table.cpp | 2 ++
 src/pair_table.h   | 1 -
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/pair_table.cpp b/src/pair_table.cpp
index 1c6bfe128e..b36843ff44 100644
--- a/src/pair_table.cpp
+++ b/src/pair_table.cpp
@@ -29,6 +29,8 @@
 
 using namespace LAMMPS_NS;
 
+enum{NONE,RLINEAR,RSQ,BMP};
+
 #define MAXLINE 1024
 #define EPSILONR 1.0e-6
 
diff --git a/src/pair_table.h b/src/pair_table.h
index 370efcec2f..8d5dbdb28a 100644
--- a/src/pair_table.h
+++ b/src/pair_table.h
@@ -42,7 +42,6 @@ class PairTable : public Pair {
 
  protected:
   enum{LOOKUP,LINEAR,SPLINE,BITMAP};
-  enum{NONE,RLINEAR,RSQ,BMP};
 
   int tabstyle,tablength;
   struct Table {

From d26f1403cdb70e88abd9f9d8dced12a3ef16bd51 Mon Sep 17 00:00:00 2001
From: Dan Ibanez <dan.a.ibanez@gmail.com>
Date: Tue, 10 Jan 2017 15:22:52 -0700
Subject: [PATCH 057/267] fix race condition on rho

The main bug here is the use of a local
rho_i accumulator which is later assigned
back to rho[i].
In parallel, atomic additions can happen to
rho[i] while the local accumulator is held;
those atomic additions are lost when
the accumulator is atomically assigned.
We instead initialize the accumulator to zero
and atomically add it back to rho[i].
---
 src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp b/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp
index 7cff630cb0..24502f875c 100644
--- a/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp
@@ -526,7 +526,7 @@ void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXComputeLoca
   const double ytmp = x(i,1);
   const double ztmp = x(i,2);
 
-  double rho_i = rho[i];
+  double rho_i_contrib = 0.0;
 
   const int itype = type[i];
   const int jnum = d_numneigh[i];
@@ -549,7 +549,7 @@ void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXComputeLoca
         const double tmpFactor = 1.0 - r_over_rcut;
         const double tmpFactor4 = tmpFactor*tmpFactor*tmpFactor*tmpFactor;
         const double factor = factor_type11*(1.0 + 1.5*r_over_rcut)*tmpFactor4;
-        rho_i += factor;
+        rho_i_contrib += factor;
         if (NEWTON_PAIR || j < nlocal)
           a_rho[j] += factor;
       } else if (rsq < d_cutsq(itype,jtype)) {
@@ -557,14 +557,14 @@ void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXComputeLoca
         const double tmpFactor = 1.0-sqrt(rsq)/rcut;
         const double tmpFactor4 = tmpFactor*tmpFactor*tmpFactor*tmpFactor;
         const double factor = (84.0/(5.0*pi*rcut*rcut*rcut))*(1.0+3.0*sqrt(rsq)/(2.0*rcut))*tmpFactor4;
-        rho_i += factor;
+        rho_i_contrib += factor;
         if (NEWTON_PAIR || j < nlocal)
           a_rho[j] += factor;
       }
     }
   }
 
-  a_rho[i] = rho_i;
+  a_rho[i] += rho_i_contrib;
 }
 
 /* ---------------------------------------------------------------------- */

From 6abefe7ef956621d52941fb2f1778665fd6a5e3d Mon Sep 17 00:00:00 2001
From: Dan Ibanez <dan.a.ibanez@gmail.com>
Date: Fri, 6 Jan 2017 15:41:41 -0700
Subject: [PATCH 058/267] restarting PairTableRXKokkos

as an exact copy of PairTableKokkos,
now that it derives from PairTable
---
 src/KOKKOS/Install.sh               |   4 +-
 src/KOKKOS/pair_table_rx_kokkos.cpp | 241 ++++++++++------------------
 src/KOKKOS/pair_table_rx_kokkos.h   | 185 +++++----------------
 3 files changed, 128 insertions(+), 302 deletions(-)

diff --git a/src/KOKKOS/Install.sh b/src/KOKKOS/Install.sh
index d796de5e2f..cfda7dbf94 100644
--- a/src/KOKKOS/Install.sh
+++ b/src/KOKKOS/Install.sh
@@ -195,8 +195,8 @@ action pair_vashishta_kokkos.cpp pair_vashishta.cpp
 action pair_vashishta_kokkos.h pair_vashishta.h
 action pair_table_kokkos.cpp
 action pair_table_kokkos.h
-#action pair_table_rx_kokkos.cpp pair_table_rx.cpp
-#action pair_table_rx_kokkos.h pair_table_rx.h  
+action pair_table_rx_kokkos.cpp pair_table_rx.cpp
+action pair_table_rx_kokkos.h pair_table_rx.h  
 action pair_tersoff_kokkos.cpp pair_tersoff.cpp
 action pair_tersoff_kokkos.h pair_tersoff.h
 action pair_tersoff_mod_kokkos.cpp pair_tersoff_mod.cpp
diff --git a/src/KOKKOS/pair_table_rx_kokkos.cpp b/src/KOKKOS/pair_table_rx_kokkos.cpp
index bf32d1c14f..2ccdefd05d 100644
--- a/src/KOKKOS/pair_table_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_table_rx_kokkos.cpp
@@ -12,7 +12,7 @@
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
-   Contributing author: Stan Moore (SNL)
+   Contributing author: Dan Ibanez (SNL)
 ------------------------------------------------------------------------- */
 
 #include <mpi.h>
@@ -33,20 +33,13 @@
 
 using namespace LAMMPS_NS;
 
-enum{NONE,RLINEAR,RSQ,BMP};
-enum{FULL,HALFTHREAD,HALF};
-
-#define MAXLINE 1024
-
 /* ---------------------------------------------------------------------- */
 
 template<class DeviceType>
-PairTableRXKokkos<DeviceType>::PairTableRXKokkos(LAMMPS *lmp) : PairTableRX(lmp)
+PairTableRXKokkos<DeviceType>::PairTableRXKokkos(LAMMPS *lmp) : PairTable(lmp)
 {
   update_table = 0;
   atomKK = (AtomKokkos *) atom;
-  ntables = 0;
-  tables = NULL;
   execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
   datamask_read = X_MASK | F_MASK | TYPE_MASK | ENERGY_MASK | VIRIAL_MASK;
   datamask_modify = F_MASK | ENERGY_MASK | VIRIAL_MASK;
@@ -59,17 +52,12 @@ PairTableRXKokkos<DeviceType>::PairTableRXKokkos(LAMMPS *lmp) : PairTableRX(lmp)
 template<class DeviceType>
 PairTableRXKokkos<DeviceType>::~PairTableRXKokkos()
 {
-/*  for (int m = 0; m < ntables; m++) free_table(&tables[m]);
-  memory->sfree(tables);
-
-  if (allocated) {
-    memory->destroy(setflag);
-    memory->destroy(cutsq);
-    memory->destroy(tabindex);
-  }*/
+  if (copymode) return;
   delete h_table;
+  h_table = nullptr;
   delete d_table;
-
+  d_table = nullptr;
+  copymode = true; //prevents base class destructor from running
 }
 
 /* ---------------------------------------------------------------------- */
@@ -98,7 +86,6 @@ void PairTableRXKokkos<DeviceType>::compute_style(int eflag_in, int vflag_in)
 
   if (neighflag == FULL) no_virial_fdotr_compute = 1;
 
-
   if (eflag || vflag) ev_setup(eflag,vflag);
   else evflag = vflag_fdotr = 0;
 
@@ -124,44 +111,44 @@ void PairTableRXKokkos<DeviceType>::compute_style(int eflag_in, int vflag_in)
   EV_FLOAT ev;
   if(atom->ntypes > MAX_TYPES_STACKPARAMS) {
     if (neighflag == FULL) {
-      PairComputeFunctor<PairTableRXKokkos<DeviceType>,FULL,false,S_TableRXCompute<DeviceType,TABSTYLE> >
+      PairComputeFunctor<PairTableRXKokkos<DeviceType>,FULL,false,S_TableCompute<DeviceType,TABSTYLE> >
         ff(this,(NeighListKokkos<DeviceType>*) list);
       if (eflag || vflag) Kokkos::parallel_reduce(list->inum,ff,ev);
       else Kokkos::parallel_for(list->inum,ff);
     } else if (neighflag == HALFTHREAD) {
-      PairComputeFunctor<PairTableRXKokkos<DeviceType>,HALFTHREAD,false,S_TableRXCompute<DeviceType,TABSTYLE> >
+      PairComputeFunctor<PairTableRXKokkos<DeviceType>,HALFTHREAD,false,S_TableCompute<DeviceType,TABSTYLE> >
         ff(this,(NeighListKokkos<DeviceType>*) list);
       if (eflag || vflag) Kokkos::parallel_reduce(list->inum,ff,ev);
       else Kokkos::parallel_for(list->inum,ff);
     } else if (neighflag == HALF) {
-      PairComputeFunctor<PairTableRXKokkos<DeviceType>,HALF,false,S_TableRXCompute<DeviceType,TABSTYLE> >
+      PairComputeFunctor<PairTableRXKokkos<DeviceType>,HALF,false,S_TableCompute<DeviceType,TABSTYLE> >
         f(this,(NeighListKokkos<DeviceType>*) list);
       if (eflag || vflag) Kokkos::parallel_reduce(list->inum,f,ev);
       else Kokkos::parallel_for(list->inum,f);
     } else if (neighflag == N2) {
-      PairComputeFunctor<PairTableRXKokkos<DeviceType>,N2,false,S_TableRXCompute<DeviceType,TABSTYLE> >
+      PairComputeFunctor<PairTableRXKokkos<DeviceType>,N2,false,S_TableCompute<DeviceType,TABSTYLE> >
         f(this,(NeighListKokkos<DeviceType>*) list);
       if (eflag || vflag) Kokkos::parallel_reduce(nlocal,f,ev);
       else Kokkos::parallel_for(nlocal,f);
     }
   } else {
     if (neighflag == FULL) {
-      PairComputeFunctor<PairTableRXKokkos<DeviceType>,FULL,true,S_TableRXCompute<DeviceType,TABSTYLE> >
+      PairComputeFunctor<PairTableRXKokkos<DeviceType>,FULL,true,S_TableCompute<DeviceType,TABSTYLE> >
         f(this,(NeighListKokkos<DeviceType>*) list);
       if (eflag || vflag) Kokkos::parallel_reduce(list->inum,f,ev);
       else Kokkos::parallel_for(list->inum,f);
     } else if (neighflag == HALFTHREAD) {
-      PairComputeFunctor<PairTableRXKokkos<DeviceType>,HALFTHREAD,true,S_TableRXCompute<DeviceType,TABSTYLE> >
+      PairComputeFunctor<PairTableRXKokkos<DeviceType>,HALFTHREAD,true,S_TableCompute<DeviceType,TABSTYLE> >
         f(this,(NeighListKokkos<DeviceType>*) list);
       if (eflag || vflag) Kokkos::parallel_reduce(list->inum,f,ev);
       else Kokkos::parallel_for(list->inum,f);
     } else if (neighflag == HALF) {
-      PairComputeFunctor<PairTableRXKokkos<DeviceType>,HALF,true,S_TableRXCompute<DeviceType,TABSTYLE> >
+      PairComputeFunctor<PairTableRXKokkos<DeviceType>,HALF,true,S_TableCompute<DeviceType,TABSTYLE> >
         f(this,(NeighListKokkos<DeviceType>*) list);
       if (eflag || vflag) Kokkos::parallel_reduce(list->inum,f,ev);
       else Kokkos::parallel_for(list->inum,f);
     } else if (neighflag == N2) {
-      PairComputeFunctor<PairTableRXKokkos<DeviceType>,N2,true,S_TableRXCompute<DeviceType,TABSTYLE> >
+      PairComputeFunctor<PairTableRXKokkos<DeviceType>,N2,true,S_TableCompute<DeviceType,TABSTYLE> >
         f(this,(NeighListKokkos<DeviceType>*) list);
       if (eflag || vflag) Kokkos::parallel_reduce(nlocal,f,ev);
       else Kokkos::parallel_for(nlocal,f);
@@ -191,27 +178,15 @@ compute_fpair(const F_FLOAT& rsq, const int& i, const int&j, const int& itype, c
   union_int_float_t rsq_lookup;
   double fpair;
   const int tidx = d_table_const.tabindex(itype,jtype);
-  //const Table* const tb = &tables[tabindex[itype][jtype]];
-
-  //if (rsq < d_table_const.innersq(tidx))
-  //  error->one(FLERR,"Pair distance < table inner cutoff");
-
-
   if (Specialisation::TabStyle == LOOKUP) {
     const int itable = static_cast<int> ((rsq - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx));
-    //if (itable >= tlm1)
-    //  error->one(FLERR,"Pair distance > table outer cutoff");
     fpair = d_table_const.f(tidx,itable);
   } else if (Specialisation::TabStyle == LINEAR) {
     const int itable = static_cast<int> ((rsq - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx));
-    //if (itable >= tlm1)
-    //  error->one(FLERR,"Pair distance > table outer cutoff");
     const double fraction = (rsq - d_table_const.rsq(tidx,itable)) * d_table_const.invdelta(tidx);
     fpair = d_table_const.f(tidx,itable) + fraction*d_table_const.df(tidx,itable);
   } else if (Specialisation::TabStyle == SPLINE) {
     const int itable = static_cast<int> ((rsq - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx));
-    //if (itable >= tlm1)
-    //  error->one(FLERR,"Pair distance > table outer cutoff");
     const double b = (rsq - d_table_const.rsq(tidx,itable)) * d_table_const.invdelta(tidx);
     const double a = 1.0 - b;
     fpair = a * d_table_const.f(tidx,itable) + b * d_table_const.f(tidx,itable+1) +
@@ -237,26 +212,15 @@ compute_evdwl(const F_FLOAT& rsq, const int& i, const int&j, const int& itype, c
   double evdwl;
   union_int_float_t rsq_lookup;
   const int tidx = d_table_const.tabindex(itype,jtype);
-  //const Table* const tb = &tables[tabindex[itype][jtype]];
-
-  //if (rsq < d_table_const.innersq(tidx))
-  //  error->one(FLERR,"Pair distance < table inner cutoff");
-
   if (Specialisation::TabStyle == LOOKUP) {
     const int itable = static_cast<int> ((rsq - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx));
-    //if (itable >= tlm1)
-    //  error->one(FLERR,"Pair distance > table outer cutoff");
     evdwl = d_table_const.e(tidx,itable);
   } else if (Specialisation::TabStyle == LINEAR) {
     const int itable = static_cast<int> ((rsq - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx));
-    //if (itable >= tlm1)
-    //  error->one(FLERR,"Pair distance > table outer cutoff");
     const double fraction = (rsq - d_table_const.rsq(tidx,itable)) * d_table_const.invdelta(tidx);
     evdwl = d_table_const.e(tidx,itable) + fraction*d_table_const.de(tidx,itable);
   } else if (Specialisation::TabStyle == SPLINE) {
     const int itable = static_cast<int> ((rsq - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx));
-    //if (itable >= tlm1)
-    //  error->one(FLERR,"Pair distance > table outer cutoff");
     const double b = (rsq - d_table_const.rsq(tidx,itable)) * d_table_const.invdelta(tidx);
     const double a = 1.0 - b;
     evdwl = a * d_table_const.e(tidx,itable) + b * d_table_const.e(tidx,itable+1) +
@@ -314,6 +278,8 @@ void PairTableRXKokkos<DeviceType>::create_kokkos_tables()
     memory->create_kokkos(d_table->drsq,h_table->drsq,ntables,ntable,"Table::drsq");
   }
 
+
+
   for(int i=0; i < ntables; i++) {
     Table* tb = &tables[i];
 
@@ -343,36 +309,69 @@ void PairTableRXKokkos<DeviceType>::create_kokkos_tables()
 
 
   Kokkos::deep_copy(d_table->nshiftbits,h_table->nshiftbits);
-  Kokkos::deep_copy(d_table->nmask,h_table->nmask);
-  Kokkos::deep_copy(d_table->innersq,h_table->innersq);
-  Kokkos::deep_copy(d_table->invdelta,h_table->invdelta);
-  Kokkos::deep_copy(d_table->deltasq6,h_table->deltasq6);
-  Kokkos::deep_copy(d_table->rsq,h_table->rsq);
-  Kokkos::deep_copy(d_table->drsq,h_table->drsq);
-  Kokkos::deep_copy(d_table->e,h_table->e);
-  Kokkos::deep_copy(d_table->de,h_table->de);
-  Kokkos::deep_copy(d_table->f,h_table->f);
-  Kokkos::deep_copy(d_table->df,h_table->df);
-  Kokkos::deep_copy(d_table->e2,h_table->e2);
-  Kokkos::deep_copy(d_table->f2,h_table->f2);
-  Kokkos::deep_copy(d_table->tabindex,h_table->tabindex);
-
   d_table_const.nshiftbits = d_table->nshiftbits;
+  Kokkos::deep_copy(d_table->nmask,h_table->nmask);
   d_table_const.nmask = d_table->nmask;
+  Kokkos::deep_copy(d_table->innersq,h_table->innersq);
   d_table_const.innersq = d_table->innersq;
+  Kokkos::deep_copy(d_table->invdelta,h_table->invdelta);
   d_table_const.invdelta = d_table->invdelta;
+  Kokkos::deep_copy(d_table->deltasq6,h_table->deltasq6);
   d_table_const.deltasq6 = d_table->deltasq6;
-  d_table_const.rsq = d_table->rsq;
-  d_table_const.drsq = d_table->drsq;
-  d_table_const.e = d_table->e;
-  d_table_const.de = d_table->de;
-  d_table_const.f = d_table->f;
-  d_table_const.df = d_table->df;
-  d_table_const.e2 = d_table->e2;
-  d_table_const.f2 = d_table->f2;
 
+  if(tabstyle == LOOKUP) {
+    Kokkos::deep_copy(d_table->e,h_table->e);
+    d_table_const.e = d_table->e;
+    Kokkos::deep_copy(d_table->f,h_table->f);
+    d_table_const.f = d_table->f;
+  }
+
+  if(tabstyle == LINEAR) {
+    Kokkos::deep_copy(d_table->rsq,h_table->rsq);
+    d_table_const.rsq = d_table->rsq;
+    Kokkos::deep_copy(d_table->e,h_table->e);
+    d_table_const.e = d_table->e;
+    Kokkos::deep_copy(d_table->f,h_table->f);
+    d_table_const.f = d_table->f;
+    Kokkos::deep_copy(d_table->de,h_table->de);
+    d_table_const.de = d_table->de;
+    Kokkos::deep_copy(d_table->df,h_table->df);
+    d_table_const.df = d_table->df;
+  }
+
+  if(tabstyle == SPLINE) {
+    Kokkos::deep_copy(d_table->rsq,h_table->rsq);
+    d_table_const.rsq = d_table->rsq;
+    Kokkos::deep_copy(d_table->e,h_table->e);
+    d_table_const.e = d_table->e;
+    Kokkos::deep_copy(d_table->f,h_table->f);
+    d_table_const.f = d_table->f;
+    Kokkos::deep_copy(d_table->e2,h_table->e2);
+    d_table_const.e2 = d_table->e2;
+    Kokkos::deep_copy(d_table->f2,h_table->f2);
+    d_table_const.f2 = d_table->f2;
+  }
+
+  if(tabstyle == BITMAP) {
+    Kokkos::deep_copy(d_table->rsq,h_table->rsq);
+    d_table_const.rsq = d_table->rsq;
+    Kokkos::deep_copy(d_table->e,h_table->e);
+    d_table_const.e = d_table->e;
+    Kokkos::deep_copy(d_table->f,h_table->f);
+    d_table_const.f = d_table->f;
+    Kokkos::deep_copy(d_table->de,h_table->de);
+    d_table_const.de = d_table->de;
+    Kokkos::deep_copy(d_table->df,h_table->df);
+    d_table_const.df = d_table->df;
+    Kokkos::deep_copy(d_table->drsq,h_table->drsq);
+    d_table_const.drsq = d_table->drsq;
+  }
 
   Kokkos::deep_copy(d_table->cutsq,h_table->cutsq);
+  d_table_const.cutsq = d_table->cutsq;
+  Kokkos::deep_copy(d_table->tabindex,h_table->tabindex);
+  d_table_const.tabindex = d_table->tabindex;
+
   update_table = 0;
 }
 
@@ -389,9 +388,9 @@ void PairTableRXKokkos<DeviceType>::allocate()
   memory->create(setflag,nt,nt,"pair:setflag");
   memory->create_kokkos(d_table->cutsq,h_table->cutsq,cutsq,nt,nt,"pair:cutsq");
   memory->create_kokkos(d_table->tabindex,h_table->tabindex,tabindex,nt,nt,"pair:tabindex");
-
   d_table_const.cutsq = d_table->cutsq;
   d_table_const.tabindex = d_table->tabindex;
+
   memset(&setflag[0][0],0,nt*nt*sizeof(int));
   memset(&cutsq[0][0],0,nt*nt*sizeof(double));
   memset(&tabindex[0][0],0,nt*nt*sizeof(int));
@@ -469,6 +468,17 @@ double PairTableRXKokkos<DeviceType>::init_one(int i, int j)
   return tables[tabindex[i][j]].cut;
 }
 
+/* ----------------------------------------------------------------------
+   set update_table, then compute r,e,f vectors from splined values via PairTable
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairTableRXKokkos<DeviceType>::compute_table(Table *tb)
+{
+  update_table = 1;
+  PairTable::compute_table(tb);
+}
+
 template<class DeviceType>
 void PairTableRXKokkos<DeviceType>::init_style()
 {
@@ -496,91 +506,6 @@ void PairTableRXKokkos<DeviceType>::init_style()
   }
 }
 
-/*
-template <class DeviceType> template<int NEIGHFLAG>
-KOKKOS_INLINE_FUNCTION
-void PairTableRXKokkos<DeviceType>::
-ev_tally(EV_FLOAT &ev, const int &i, const int &j, const F_FLOAT &fpair,
-         const F_FLOAT &delx, const F_FLOAT &dely, const F_FLOAT &delz) const
-{
-  const int EFLAG = eflag;
-  const int NEWTON_PAIR = newton_pair;
-  const int VFLAG = vflag_either;
-
-  if (EFLAG) {
-    if (eflag_atom) {
-      E_FLOAT epairhalf = 0.5 * (ev.evdwl + ev.ecoul);
-      if (NEWTON_PAIR || i < nlocal) eatom[i] += epairhalf;
-      if (NEWTON_PAIR || j < nlocal) eatom[j] += epairhalf;
-    }
-  }
-
-  if (VFLAG) {
-    const E_FLOAT v0 = delx*delx*fpair;
-    const E_FLOAT v1 = dely*dely*fpair;
-    const E_FLOAT v2 = delz*delz*fpair;
-    const E_FLOAT v3 = delx*dely*fpair;
-    const E_FLOAT v4 = delx*delz*fpair;
-    const E_FLOAT v5 = dely*delz*fpair;
-
-    if (vflag_global) {
-      if (NEIGHFLAG) {
-        if (NEWTON_PAIR) {
-          ev.v[0] += v0;
-          ev.v[1] += v1;
-          ev.v[2] += v2;
-          ev.v[3] += v3;
-          ev.v[4] += v4;
-          ev.v[5] += v5;
-        } else {
-          if (i < nlocal) {
-            ev.v[0] += 0.5*v0;
-            ev.v[1] += 0.5*v1;
-            ev.v[2] += 0.5*v2;
-            ev.v[3] += 0.5*v3;
-            ev.v[4] += 0.5*v4;
-            ev.v[5] += 0.5*v5;
-          }
-          if (j < nlocal) {
-            ev.v[0] += 0.5*v0;
-            ev.v[1] += 0.5*v1;
-            ev.v[2] += 0.5*v2;
-            ev.v[3] += 0.5*v3;
-            ev.v[4] += 0.5*v4;
-            ev.v[5] += 0.5*v5;
-          }
-        }
-      } else {
-        ev.v[0] += 0.5*v0;
-        ev.v[1] += 0.5*v1;
-        ev.v[2] += 0.5*v2;
-        ev.v[3] += 0.5*v3;
-        ev.v[4] += 0.5*v4;
-        ev.v[5] += 0.5*v5;
-      }
-    }
-
-    if (vflag_atom) {
-      if (NEWTON_PAIR || i < nlocal) {
-        d_vatom(i,0) += 0.5*v0;
-        d_vatom(i,1) += 0.5*v1;
-        d_vatom(i,2) += 0.5*v2;
-        d_vatom(i,3) += 0.5*v3;
-        d_vatom(i,4) += 0.5*v4;
-        d_vatom(i,5) += 0.5*v5;
-      }
-      if (NEWTON_PAIR || (NEIGHFLAG && j < nlocal)) {
-        d_vatom(j,0) += 0.5*v0;
-        d_vatom(j,1) += 0.5*v1;
-        d_vatom(j,2) += 0.5*v2;
-        d_vatom(j,3) += 0.5*v3;
-        d_vatom(j,4) += 0.5*v4;
-        d_vatom(j,5) += 0.5*v5;
-      }
-    }
-  }
-}
-*/
 template<class DeviceType>
 void PairTableRXKokkos<DeviceType>::cleanup_copy() {
   // WHY needed: this prevents parent copy from deallocating any arrays
diff --git a/src/KOKKOS/pair_table_rx_kokkos.h b/src/KOKKOS/pair_table_rx_kokkos.h
index b379901201..c4e07d41d6 100644
--- a/src/KOKKOS/pair_table_rx_kokkos.h
+++ b/src/KOKKOS/pair_table_rx_kokkos.h
@@ -13,32 +13,21 @@
 
 #ifdef PAIR_CLASS
 
-PairStyle(table/rx/kk,PairTableRXKokkos<LMPDeviceType>)
-PairStyle(table/rx/kk/device,PairTableRXKokkos<LMPDeviceType>)
-PairStyle(table/rx/kk/host,PairTableRXKokkos<LMPHostType>)
+PairStyle(table/rx/kk,PairTableKokkos<LMPDeviceType>)
+PairStyle(table/rx/kk/device,PairTableKokkos<LMPDeviceType>)
+PairStyle(table/rx/kk/host,PairTableKokkos<LMPHostType>)
 
 #else
 
 #ifndef LMP_PAIR_TABLE_RX_KOKKOS_H
 #define LMP_PAIR_TABLE_RX_KOKKOS_H
 
-#include "pair_table_rx.h"
-#include "pair_kokkos.h"
-#include "neigh_list_kokkos.h"
-#include "atom_kokkos.h"
+#include "pair_table_kokkos.h"
 
 namespace LAMMPS_NS {
 
-template<class Device,int TABSTYLE>
-struct S_TableRXCompute {
-  enum {TabStyle = TABSTYLE};
-};
-
-template <class DeviceType, int NEIGHFLAG, int TABSTYLE>
-class PairTableRXComputeFunctor;
-
 template<class DeviceType>
-class PairTableRXKokkos : public PairTableRX {
+class PairTableRXKokkos : public PairTable {
  public:
 
   enum {EnabledNeighFlags=FULL|HALFTHREAD|HALF|N2};
@@ -53,27 +42,14 @@ class PairTableRXKokkos : public PairTableRX {
   template<int TABSTYLE>
   void compute_style(int, int);
 
-  /*template<int EVFLAG, int NEIGHFLAG, int NEWTON_PAIR, int TABSTYLE>
-  KOKKOS_FUNCTION
-  EV_FLOAT compute_item(const int& i,
-                        const NeighListKokkos<DeviceType> &list) const;
-*/
   void settings(int, char **);
   double init_one(int, int);
+
   void init_style();
 
- protected:
-  enum{LOOKUP,LINEAR,SPLINE,BITMAP};
 
-  int tabstyle,tablength;
-  /*struct TableDeviceConst {
-    typename ArrayTypes<DeviceType>::t_ffloat_2d_randomread cutsq;
-    typename ArrayTypes<DeviceType>::t_int_2d_randomread tabindex;
-    typename ArrayTypes<DeviceType>::t_int_1d_randomread nshiftbits,nmask;
-    typename ArrayTypes<DeviceType>::t_ffloat_1d_randomread innersq,invdelta,deltasq6;
-    typename ArrayTypes<DeviceType>::t_ffloat_2d_randomread rsq,drsq,e,de,f,df,e2,f2;
-  };*/
- //Its faster not to use texture fetch if the number of tables is less than 32!
+ protected:
+
   struct TableDeviceConst {
     typename ArrayTypes<DeviceType>::t_ffloat_2d cutsq;
     typename ArrayTypes<DeviceType>::t_int_2d tabindex;
@@ -102,12 +78,12 @@ class PairTableRXKokkos : public PairTableRX {
   TableDevice* d_table;
   TableHost* h_table;
 
-  int **tabindex;
   F_FLOAT m_cutsq[MAX_TYPES_STACKPARAMS+1][MAX_TYPES_STACKPARAMS+1];
 
   typename ArrayTypes<DeviceType>::t_ffloat_2d d_cutsq;
 
-  void allocate();
+  virtual void allocate();
+  void compute_table(Table *);
 
   typename ArrayTypes<DeviceType>::t_x_array_randomread x;
   typename ArrayTypes<DeviceType>::t_x_array_const c_x;
@@ -137,41 +113,41 @@ class PairTableRXKokkos : public PairTableRX {
     return 0;
   }
 
-  friend class PairComputeFunctor<PairTableRXKokkos,FULL,true,S_TableRXCompute<DeviceType,LOOKUP> >;
-  friend class PairComputeFunctor<PairTableRXKokkos,HALF,true,S_TableRXCompute<DeviceType,LOOKUP> >;
-  friend class PairComputeFunctor<PairTableRXKokkos,HALFTHREAD,true,S_TableRXCompute<DeviceType,LOOKUP> >;
-  friend class PairComputeFunctor<PairTableRXKokkos,N2,true,S_TableRXCompute<DeviceType,LOOKUP> >;
-  friend class PairComputeFunctor<PairTableRXKokkos,FULL,false,S_TableRXCompute<DeviceType,LOOKUP> >;
-  friend class PairComputeFunctor<PairTableRXKokkos,HALF,false,S_TableRXCompute<DeviceType,LOOKUP> >;
-  friend class PairComputeFunctor<PairTableRXKokkos,HALFTHREAD,false,S_TableRXCompute<DeviceType,LOOKUP> >;
-  friend class PairComputeFunctor<PairTableRXKokkos,N2,false,S_TableRXCompute<DeviceType,LOOKUP> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,FULL,true,S_TableCompute<DeviceType,LOOKUP> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,HALF,true,S_TableCompute<DeviceType,LOOKUP> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,HALFTHREAD,true,S_TableCompute<DeviceType,LOOKUP> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,N2,true,S_TableCompute<DeviceType,LOOKUP> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,FULL,false,S_TableCompute<DeviceType,LOOKUP> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,HALF,false,S_TableCompute<DeviceType,LOOKUP> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,HALFTHREAD,false,S_TableCompute<DeviceType,LOOKUP> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,N2,false,S_TableCompute<DeviceType,LOOKUP> >;
 
-  friend class PairComputeFunctor<PairTableRXKokkos,FULL,true,S_TableRXCompute<DeviceType,LINEAR> >;
-  friend class PairComputeFunctor<PairTableRXKokkos,HALF,true,S_TableRXCompute<DeviceType,LINEAR> >;
-  friend class PairComputeFunctor<PairTableRXKokkos,HALFTHREAD,true,S_TableRXCompute<DeviceType,LINEAR> >;
-  friend class PairComputeFunctor<PairTableRXKokkos,N2,true,S_TableRXCompute<DeviceType,LINEAR> >;
-  friend class PairComputeFunctor<PairTableRXKokkos,FULL,false,S_TableRXCompute<DeviceType,LINEAR> >;
-  friend class PairComputeFunctor<PairTableRXKokkos,HALF,false,S_TableRXCompute<DeviceType,LINEAR> >;
-  friend class PairComputeFunctor<PairTableRXKokkos,HALFTHREAD,false,S_TableRXCompute<DeviceType,LINEAR> >;
-  friend class PairComputeFunctor<PairTableRXKokkos,N2,false,S_TableRXCompute<DeviceType,LINEAR> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,FULL,true,S_TableCompute<DeviceType,LINEAR> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,HALF,true,S_TableCompute<DeviceType,LINEAR> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,HALFTHREAD,true,S_TableCompute<DeviceType,LINEAR> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,N2,true,S_TableCompute<DeviceType,LINEAR> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,FULL,false,S_TableCompute<DeviceType,LINEAR> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,HALF,false,S_TableCompute<DeviceType,LINEAR> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,HALFTHREAD,false,S_TableCompute<DeviceType,LINEAR> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,N2,false,S_TableCompute<DeviceType,LINEAR> >;
 
-  friend class PairComputeFunctor<PairTableRXKokkos,FULL,true,S_TableRXCompute<DeviceType,SPLINE> >;
-  friend class PairComputeFunctor<PairTableRXKokkos,HALF,true,S_TableRXCompute<DeviceType,SPLINE> >;
-  friend class PairComputeFunctor<PairTableRXKokkos,HALFTHREAD,true,S_TableRXCompute<DeviceType,SPLINE> >;
-  friend class PairComputeFunctor<PairTableRXKokkos,N2,true,S_TableRXCompute<DeviceType,SPLINE> >;
-  friend class PairComputeFunctor<PairTableRXKokkos,FULL,false,S_TableRXCompute<DeviceType,SPLINE> >;
-  friend class PairComputeFunctor<PairTableRXKokkos,HALF,false,S_TableRXCompute<DeviceType,SPLINE> >;
-  friend class PairComputeFunctor<PairTableRXKokkos,HALFTHREAD,false,S_TableRXCompute<DeviceType,SPLINE> >;
-  friend class PairComputeFunctor<PairTableRXKokkos,N2,false,S_TableRXCompute<DeviceType,SPLINE> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,FULL,true,S_TableCompute<DeviceType,SPLINE> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,HALF,true,S_TableCompute<DeviceType,SPLINE> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,HALFTHREAD,true,S_TableCompute<DeviceType,SPLINE> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,N2,true,S_TableCompute<DeviceType,SPLINE> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,FULL,false,S_TableCompute<DeviceType,SPLINE> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,HALF,false,S_TableCompute<DeviceType,SPLINE> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,HALFTHREAD,false,S_TableCompute<DeviceType,SPLINE> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,N2,false,S_TableCompute<DeviceType,SPLINE> >;
 
-  friend class PairComputeFunctor<PairTableRXKokkos,FULL,true,S_TableRXCompute<DeviceType,BITMAP> >;
-  friend class PairComputeFunctor<PairTableRXKokkos,HALF,true,S_TableRXCompute<DeviceType,BITMAP> >;
-  friend class PairComputeFunctor<PairTableRXKokkos,HALFTHREAD,true,S_TableRXCompute<DeviceType,BITMAP> >;
-  friend class PairComputeFunctor<PairTableRXKokkos,N2,true,S_TableRXCompute<DeviceType,BITMAP> >;
-  friend class PairComputeFunctor<PairTableRXKokkos,FULL,false,S_TableRXCompute<DeviceType,BITMAP> >;
-  friend class PairComputeFunctor<PairTableRXKokkos,HALF,false,S_TableRXCompute<DeviceType,BITMAP> >;
-  friend class PairComputeFunctor<PairTableRXKokkos,HALFTHREAD,false,S_TableRXCompute<DeviceType,BITMAP> >;
-  friend class PairComputeFunctor<PairTableRXKokkos,N2,false,S_TableRXCompute<DeviceType,BITMAP> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,FULL,true,S_TableCompute<DeviceType,BITMAP> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,HALF,true,S_TableCompute<DeviceType,BITMAP> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,HALFTHREAD,true,S_TableCompute<DeviceType,BITMAP> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,N2,true,S_TableCompute<DeviceType,BITMAP> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,FULL,false,S_TableCompute<DeviceType,BITMAP> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,HALF,false,S_TableCompute<DeviceType,BITMAP> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,HALFTHREAD,false,S_TableCompute<DeviceType,BITMAP> >;
+  friend class PairComputeFunctor<PairTableRXKokkos,N2,false,S_TableCompute<DeviceType,BITMAP> >;
 
   friend void pair_virial_fdotr_compute<PairTableRXKokkos>(PairTableRXKokkos*);
 };
@@ -183,79 +159,4 @@ class PairTableRXKokkos : public PairTableRX {
 
 /* ERROR/WARNING messages:
 
-E: Pair distance < table inner cutoff
-
-Two atoms are closer together than the pairwise table allows.
-
-E: Pair distance > table outer cutoff
-
-Two atoms are further apart than the pairwise table allows.
-
-E: Illegal ... command
-
-Self-explanatory.  Check the input script syntax and compare to the
-documentation for the command.  You can use -echo screen as a
-command-line option when running LAMMPS to see the offending line.
-
-E: Unknown table style in pair_style command
-
-Style of table is invalid for use with pair_style table command.
-
-E: Illegal number of pair table entries
-
-There must be at least 2 table entries.
-
-E: Invalid pair table length
-
-Length of read-in pair table is invalid
-
-E: Invalid pair table cutoff
-
-Cutoffs in pair_coeff command are not valid with read-in pair table.
-
-E: Bitmapped table in file does not match requested table
-
-Setting for bitmapped table in pair_coeff command must match table
-in file exactly.
-
-E: All pair coeffs are not set
-
-All pair coefficients must be set in the data file or by the
-pair_coeff command before running a simulation.
-
-E: Cannot open file %s
-
-The specified file cannot be opened.  Check that the path and name are
-correct. If the file is a compressed file, also check that the gzip
-executable can be found and run.
-
-E: Did not find keyword in table file
-
-Keyword used in pair_coeff command was not found in table file.
-
-E: Bitmapped table is incorrect length in table file
-
-Number of table entries is not a correct power of 2.
-
-E: Invalid keyword in pair table parameters
-
-Keyword used in list of table parameters is not recognized.
-
-E: Pair table parameters did not set N
-
-List of pair table parameters must include N setting.
-
-E: Pair table cutoffs must all be equal to use with KSpace
-
-When using pair style table with a long-range KSpace solver, the
-cutoffs for all atom type pairs must all be the same, since the
-long-range solver starts at that cutoff.
-
-E: Cannot use chosen neighbor list style with lj/cut/kk
-
-That style is not supported by Kokkos.
-
-
-
-
-*/
\ No newline at end of file
+ */

From f995bb43355f412709cf3420d4b215f39d8bbf61 Mon Sep 17 00:00:00 2001
From: Dan Ibanez <dan.a.ibanez@gmail.com>
Date: Fri, 6 Jan 2017 16:00:35 -0700
Subject: [PATCH 059/267] starting to add getMixingWeights

some compile errors to work out
---
 src/KOKKOS/pair_table_rx_kokkos.cpp | 82 +++++++++++++++++++++++++++++
 src/KOKKOS/pair_table_rx_kokkos.h   |  5 ++
 2 files changed, 87 insertions(+)

diff --git a/src/KOKKOS/pair_table_rx_kokkos.cpp b/src/KOKKOS/pair_table_rx_kokkos.cpp
index 2ccdefd05d..54882ec3ce 100644
--- a/src/KOKKOS/pair_table_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_table_rx_kokkos.cpp
@@ -33,6 +33,15 @@
 
 using namespace LAMMPS_NS;
 
+#ifdef DBL_EPSILON
+  #define MY_EPSILON (10.0*DBL_EPSILON)
+#else
+  #define MY_EPSILON (10.0*2.220446049250313e-16)
+#endif
+
+#define OneFluidValue (-1)
+#define isOneFluid(_site_) ( (_site_) == OneFluidValue )
+
 /* ---------------------------------------------------------------------- */
 
 template<class DeviceType>
@@ -516,6 +525,79 @@ void PairTableRXKokkos<DeviceType>::cleanup_copy() {
   h_table=NULL; d_table=NULL;
 }
 
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairTableRXKokkos<DeviceType>::getMixingWeights(typename DAT::t_float_2d_randomread dvector, int, double &, double &, double &, double &) {
+  double fractionOFAold, fractionOFA;
+  double fractionOld1, fraction1;
+  double fractionOld2, fraction2;
+  double nMoleculesOFAold, nMoleculesOFA;  // totals over species that are neither isite1 nor isite2
+  double nMoleculesOld1, nMolecules1;
+  double nMoleculesOld2, nMolecules2;
+  double nTotal, nTotalOld;
+
+  nTotal = 0.0;  // sum of the current per-species values in column id
+  nTotalOld = 0.0;  // same sum over the "old" rows of dvector
+  for (int ispecies = 0; ispecies < nspecies; ++ispecies){
+    nTotal += dvector(ispecies,id);
+    nTotalOld += dvector(ispecies+nspecies,id);  // old values are stored nspecies rows after the current ones
+  }
+  if(nTotal < MY_EPSILON || nTotalOld < MY_EPSILON)
+    error->all(FLERR,"The number of molecules in CG particle is less than 10*DBL_EPSILON.");  // NOTE(review): called from a KOKKOS_INLINE_FUNCTION - confirm this is usable in device code
+
+  if (isOneFluid(isite1) == false){  // site 1 is an explicit species index: use its own value and fraction
+    nMoleculesOld1 = dvector(isite1+nspecies,id);
+    nMolecules1 = dvector(isite1,id);
+    fractionOld1 = nMoleculesOld1/nTotalOld;
+    fraction1 = nMolecules1/nTotal;
+  }
+  if (isOneFluid(isite2) == false){  // same for site 2
+    nMoleculesOld2 = dvector(isite2+nspecies,id);
+    nMolecules2 = dvector(isite2,id);
+    fractionOld2 = nMoleculesOld2/nTotalOld;
+    fraction2 = nMolecules2/nTotal;
+  }
+
+  if (isOneFluid(isite1) || isOneFluid(isite2)){  // at least one site equals OneFluidValue
+    nMoleculesOFAold  = 0.0;
+    nMoleculesOFA  = 0.0;
+    fractionOFAold  = 0.0;
+    fractionOFA  = 0.0;
+
+    for (int ispecies = 0; ispecies < nspecies; ispecies++){
+      if (isite1 == ispecies || isite2 == ispecies) continue;  // skip a species that is itself one of the two sites
+      nMoleculesOFAold += dvector(ispecies+nspecies,id);
+      nMoleculesOFA += dvector(ispecies,id);
+      fractionOFAold += dvector(ispecies+nspecies,id)/nTotalOld;
+      fractionOFA += dvector(ispecies,id)/nTotal;
+    }
+    if(isOneFluid(isite1)){  // one-fluid site 1 takes the accumulated values
+      nMoleculesOld1 = 1.0-(nTotalOld-nMoleculesOFAold);
+      nMolecules1 = 1.0-(nTotal-nMoleculesOFA);
+      fractionOld1 = fractionOFAold;
+      fraction1 = fractionOFA;
+    }
+    if(isOneFluid(isite2)){  // one-fluid site 2 takes the accumulated values
+      nMoleculesOld2 = 1.0-(nTotalOld-nMoleculesOFAold);
+      nMolecules2 = 1.0-(nTotal-nMoleculesOFA);
+      fractionOld2 = fractionOFAold;
+      fraction2 = fractionOFA;
+    }
+  }
+
+  if(fractionalWeighting){  // outputs are the fractions when set, the nMolecules values otherwise
+    mixWtSite1old = fractionOld1;
+    mixWtSite1 = fraction1;
+    mixWtSite2old = fractionOld2;
+    mixWtSite2 = fraction2;
+  } else {
+    mixWtSite1old = nMoleculesOld1;
+    mixWtSite1 = nMolecules1;
+    mixWtSite2old = nMoleculesOld2;
+    mixWtSite2 = nMolecules2;
+  }
+}
+
 namespace LAMMPS_NS {
 template class PairTableRXKokkos<LMPDeviceType>;
 #ifdef KOKKOS_HAVE_CUDA
diff --git a/src/KOKKOS/pair_table_rx_kokkos.h b/src/KOKKOS/pair_table_rx_kokkos.h
index c4e07d41d6..1878faf16c 100644
--- a/src/KOKKOS/pair_table_rx_kokkos.h
+++ b/src/KOKKOS/pair_table_rx_kokkos.h
@@ -30,6 +30,8 @@ template<class DeviceType>
 class PairTableRXKokkos : public PairTable {
  public:
 
+  using DAT = ArrayTypes<DeviceType>;
+
   enum {EnabledNeighFlags=FULL|HALFTHREAD|HALF|N2};
   enum {COUL_FLAG=0};
   typedef DeviceType device_type;
@@ -150,6 +152,9 @@ class PairTableRXKokkos : public PairTable {
   friend class PairComputeFunctor<PairTableRXKokkos,N2,false,S_TableCompute<DeviceType,BITMAP> >;
 
   friend void pair_virial_fdotr_compute<PairTableRXKokkos>(PairTableRXKokkos*);
+
+  KOKKOS_INLINE_FUNCTION
+  void getMixingWeights(typename DAT::t_float_2d_randomread dvector, int, double &, double &, double &, double &);
 };
 
 }

From 21cde6261aa476d59e32788b036195c7ebb98498 Mon Sep 17 00:00:00 2001
From: Dan Ibanez <dan.a.ibanez@gmail.com>
Date: Mon, 9 Jan 2017 12:29:15 -0700
Subject: [PATCH 060/267] add member variables from PairTableRX

---
 src/KOKKOS/pair_table_rx_kokkos.cpp | 5 ++++-
 src/KOKKOS/pair_table_rx_kokkos.h   | 9 ++++++++-
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/src/KOKKOS/pair_table_rx_kokkos.cpp b/src/KOKKOS/pair_table_rx_kokkos.cpp
index 54882ec3ce..83fcb2ce1d 100644
--- a/src/KOKKOS/pair_table_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_table_rx_kokkos.cpp
@@ -527,7 +527,10 @@ void PairTableRXKokkos<DeviceType>::cleanup_copy() {
 
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
-void PairTableRXKokkos<DeviceType>::getMixingWeights(typename DAT::t_float_2d_randomread dvector, int, double &, double &, double &, double &) {
+void PairTableRXKokkos<DeviceType>::getMixingWeights(
+    typename DAT::t_float_2d_randomread dvector, int id,
+    double &mixWtSite1old, double &mixWtSite2old,
+    double &mixWtSite1, double &mixWtSite2) {
   double fractionOFAold, fractionOFA;
   double fractionOld1, fraction1;
   double fractionOld2, fraction2;
diff --git a/src/KOKKOS/pair_table_rx_kokkos.h b/src/KOKKOS/pair_table_rx_kokkos.h
index 1878faf16c..0d8a8f151e 100644
--- a/src/KOKKOS/pair_table_rx_kokkos.h
+++ b/src/KOKKOS/pair_table_rx_kokkos.h
@@ -153,8 +153,15 @@ class PairTableRXKokkos : public PairTable {
 
   friend void pair_virial_fdotr_compute<PairTableRXKokkos>(PairTableRXKokkos*);
 
+  /* PairTableRX members */
+
+  int nspecies;
+  char *site1, *site2;
+  int isite1, isite2;
+  bool fractionalWeighting;
+
   KOKKOS_INLINE_FUNCTION
-  void getMixingWeights(typename DAT::t_float_2d_randomread dvector, int, double &, double &, double &, double &);
+  void getMixingWeights(typename DAT::t_float_2d_randomread, int, double &, double &, double &, double &);
 };
 
 }

From afbc6fc628b68baae2286c1c953a29dd78f1779e Mon Sep 17 00:00:00 2001
From: Dan Ibanez <dan.a.ibanez@gmail.com>
Date: Mon, 9 Jan 2017 13:17:23 -0700
Subject: [PATCH 061/267] added coeff, settings, single, fix compile

---
 src/KOKKOS/pair_table_rx_kokkos.cpp | 228 +++++++++++++++++++++++++++-
 src/KOKKOS/pair_table_rx_kokkos.h   |   6 +-
 2 files changed, 232 insertions(+), 2 deletions(-)

diff --git a/src/KOKKOS/pair_table_rx_kokkos.cpp b/src/KOKKOS/pair_table_rx_kokkos.cpp
index 83fcb2ce1d..5a71739b6d 100644
--- a/src/KOKKOS/pair_table_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_table_rx_kokkos.cpp
@@ -30,9 +30,12 @@
 #include "memory.h"
 #include "error.h"
 #include "atom_masks.h"
+#include "fix.h"
 
 using namespace LAMMPS_NS;
 
+enum{NONE,RLINEAR,RSQ,BMP};
+
 #ifdef DBL_EPSILON
   #define MY_EPSILON (10.0*DBL_EPSILON)
 #else
@@ -54,6 +57,7 @@ PairTableRXKokkos<DeviceType>::PairTableRXKokkos(LAMMPS *lmp) : PairTable(lmp)
   datamask_modify = F_MASK | ENERGY_MASK | VIRIAL_MASK;
   h_table = new TableHost();
   d_table = new TableDevice();
+  fractionalWeighting = true;
 }
 
 /* ---------------------------------------------------------------------- */
@@ -435,6 +439,8 @@ void PairTableRXKokkos<DeviceType>::settings(int narg, char **arg)
     else if (strcmp(arg[iarg],"msm") == 0) msmflag = 1;
     else if (strcmp(arg[iarg],"dispersion") == 0) dispersionflag = 1;
     else if (strcmp(arg[iarg],"tip4p") == 0) tip4pflag = 1;
+    else if (strcmp(arg[iarg],"fractional") == 0) fractionalWeighting = true;
+    else if (strcmp(arg[iarg],"molecular") == 0) fractionalWeighting = false;
     else error->all(FLERR,"Illegal pair_style command");
     iarg++;
   }
@@ -459,6 +465,148 @@ void PairTableRXKokkos<DeviceType>::settings(int narg, char **arg)
   tables = NULL;
 }
 
+/* ----------------------------------------------------------------------
+   set coeffs for one or more type pairs and record the two rx mixing sites
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairTableRXKokkos<DeviceType>::coeff(int narg, char **arg)
+{
+  if (narg != 6 && narg != 7) error->all(FLERR,"Illegal pair_coeff command");  // arg[0..1] types, arg[2..3] go to read_table, arg[4..5] sites, optional arg[6] cutoff
+  if (!allocated) allocate();
+
+  bool rx_flag = false;
+  for (int i = 0; i < modify->nfix; i++)
+    if (strncmp(modify->fix[i]->style,"rx",2) == 0) rx_flag = true;  // any fix whose style starts with "rx"
+  if (!rx_flag) error->all(FLERR,"PairTableRX requires a fix rx command.");
+
+  int ilo,ihi,jlo,jhi;
+  force->bounds(FLERR,arg[0],atom->ntypes,ilo,ihi);
+  force->bounds(FLERR,arg[1],atom->ntypes,jlo,jhi);
+
+  int me;
+  MPI_Comm_rank(world,&me);
+  tables = (Table *)
+    memory->srealloc(tables,(ntables+1)*sizeof(Table),"pair:tables");  // grow the table array by one slot
+  Table *tb = &tables[ntables];
+  null_table(tb);
+  if (me == 0) read_table(tb,arg[2],arg[3]);  // only rank 0 reads; the result is broadcast next
+  bcast_table(tb);
+
+  nspecies = atom->nspecies_dpd;
+  if(nspecies==0) error->all(FLERR,"There are no rx species specified.");
+  int n;
+  n = strlen(arg[4]) + 1;  // size the buffer from the string that is actually copied into it
+  site1 = new char[n];  // NOTE(review): a site1 left from an earlier coeff() call is not freed here - confirm ownership
+  strcpy(site1,arg[4]);
+
+  int ispecies;
+  for (ispecies = 0; ispecies < nspecies; ispecies++){
+    if (strcmp(site1,&atom->dname[ispecies][0]) == 0) break;
+  }
+  if (ispecies == nspecies && strcmp(site1,"1fluid") != 0)  // neither a known species name nor "1fluid"
+    error->all(FLERR,"Site1 name not recognized in pair coefficients");
+
+  n = strlen(arg[5]) + 1;  // size the buffer from the string that is actually copied into it
+  site2 = new char[n];  // NOTE(review): same ownership question as site1
+  strcpy(site2,arg[5]);
+
+  for (ispecies = 0; ispecies < nspecies; ispecies++){
+    if (strcmp(site2,&atom->dname[ispecies][0]) == 0) break;
+  }
+  if (ispecies == nspecies && strcmp(site2,"1fluid") != 0)
+    error->all(FLERR,"Site2 name not recognized in pair coefficients");
+
+  // set table cutoff
+
+  if (narg == 7) tb->cut = force->numeric(FLERR,arg[6]);
+  else if (tb->rflag) tb->cut = tb->rhi;
+  else tb->cut = tb->rfile[tb->ninput-1];
+
+  // error check on table parameters
+  // insure cutoff is within table
+  // for BITMAP tables, file values can be in non-ascending order
+
+  if (tb->ninput <= 1) error->one(FLERR,"Invalid pair table length");
+  double rlo,rhi;
+  if (tb->rflag == 0) {
+    rlo = tb->rfile[0];
+    rhi = tb->rfile[tb->ninput-1];
+  } else {
+    rlo = tb->rlo;
+    rhi = tb->rhi;
+  }
+  if (tb->cut <= rlo || tb->cut > rhi)
+    error->all(FLERR,"Invalid pair table cutoff");
+  if (rlo <= 0.0) error->all(FLERR,"Invalid pair table cutoff");
+
+  // match = 1 if don't need to spline read-in tables
+  // this is only the case if r values needed by final tables
+  //   exactly match r values read from file
+  // for tabstyle SPLINE, always need to build spline tables
+
+  tb->match = 0;
+  if (tabstyle == LINEAR && tb->ninput == tablength &&
+      tb->rflag == RSQ && tb->rhi == tb->cut) tb->match = 1;
+  if (tabstyle == BITMAP && tb->ninput == 1 << tablength &&
+      tb->rflag == BMP && tb->rhi == tb->cut) tb->match = 1;
+  if (tb->rflag == BMP && tb->match == 0)
+    error->all(FLERR,"Bitmapped table in file does not match requested table");
+
+  // spline read-in values and compute r,e,f vectors within table
+
+  if (tb->match == 0) spline_table(tb);
+  compute_table(tb);
+
+  // store ptr to table in tabindex
+
+  int count = 0;
+  for (int i = ilo; i <= ihi; i++) {
+    for (int j = MAX(jlo,i); j <= jhi; j++) {  // only pairs with j >= i are recorded
+      tabindex[i][j] = ntables;
+      setflag[i][j] = 1;
+      count++;
+    }
+  }
+
+  if (count == 0) error->all(FLERR,"Illegal pair_coeff command");
+  ntables++;
+
+  {
+     if ( strcmp(site1,"1fluid") == 0 )
+       isite1 = OneFluidValue;
+     else {
+       isite1 = nspecies;  // sentinel: stays nspecies if no name matches
+
+       for (int k = 0; k < nspecies; k++){
+         if (strcmp(site1, atom->dname[k]) == 0){
+           isite1 = k;
+           break;
+         }
+       }
+
+       if (isite1 == nspecies) error->all(FLERR,"isite1 == nspecies");
+     }
+
+     if ( strcmp(site2,"1fluid") == 0 )
+       isite2 = OneFluidValue;
+     else {
+       isite2 = nspecies;  // sentinel: stays nspecies if no name matches
+
+       for (int k = 0; k < nspecies; k++){
+         if (strcmp(site2, atom->dname[k]) == 0){
+           isite2 = k;  // use this loop's index, mirroring the isite1 search above
+           break;
+         }
+       }
+
+       if (isite2 == nspecies)
+         error->all(FLERR,"isite2 == nspecies");
+     }
+  }
+
+}
+
 /* ----------------------------------------------------------------------
    init for one type pair i,j and corresponding j,i
 ------------------------------------------------------------------------- */
@@ -477,6 +625,82 @@ double PairTableRXKokkos<DeviceType>::init_one(int i, int j)
   return tables[tabindex[i][j]].cut;
 }
 
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+double PairTableRXKokkos<DeviceType>::single(int i, int j, int itype, int jtype, double rsq,
+                         double factor_coul, double factor_lj,  // factor_coul is not used in this function
+                         double &fforce)  // fforce receives the weighted, factor_lj-scaled table force value
+{
+  int itable;
+  double fraction,value,a,b,phi;
+  int tlm1 = tablength - 1;
+
+  Table *tb = &tables[tabindex[itype][jtype]];  // table assigned to this type pair
+  double mixWtSite1_i, mixWtSite1_j;
+  double mixWtSite2_i, mixWtSite2_j;
+  double mixWtSite1old_i, mixWtSite1old_j;  // the "old" weights are filled in below but not used afterwards
+  double mixWtSite2old_i, mixWtSite2old_j;
+
+  fraction = 0.0;
+  a = 0.0;
+  b = 0.0;
+
+  typename ArrayTypes<LMPHostType>::t_float_2d_randomread h_dvector =
+    atomKK->k_dvector.view<LMPHostType>();
+  getMixingWeights<LMPHostType>(h_dvector,i,mixWtSite1old_i,mixWtSite2old_i,
+      mixWtSite1_i,mixWtSite2_i);
+  getMixingWeights<LMPHostType>(h_dvector,j,mixWtSite1old_j,mixWtSite2old_j,
+      mixWtSite1_j,mixWtSite2_j);
+
+  if (rsq < tb->innersq) error->one(FLERR,"Pair distance < table inner cutoff");
+
+  if (tabstyle == LOOKUP) {  // nearest lower table entry, no interpolation
+    itable = static_cast<int> ((rsq-tb->innersq) * tb->invdelta);
+    if (itable >= tlm1) error->one(FLERR,"Pair distance > table outer cutoff");
+    fforce = factor_lj * tb->f[itable];
+  } else if (tabstyle == LINEAR) {  // linear interpolation using the stored df increments
+    itable = static_cast<int> ((rsq-tb->innersq) * tb->invdelta);
+    if (itable >= tlm1) error->one(FLERR,"Pair distance > table outer cutoff");
+    fraction = (rsq - tb->rsq[itable]) * tb->invdelta;
+    value = tb->f[itable] + fraction*tb->df[itable];
+    fforce = factor_lj * value;
+  } else if (tabstyle == SPLINE) {  // cubic spline using the stored f2 coefficients
+    itable = static_cast<int> ((rsq-tb->innersq) * tb->invdelta);
+    if (itable >= tlm1) error->one(FLERR,"Pair distance > table outer cutoff");
+    b = (rsq - tb->rsq[itable]) * tb->invdelta;
+    a = 1.0 - b;
+    value = a * tb->f[itable] + b * tb->f[itable+1] +
+      ((a*a*a-a)*tb->f2[itable] + (b*b*b-b)*tb->f2[itable+1]) *
+      tb->deltasq6;
+    fforce = factor_lj * value;
+  } else {  // remaining style: index derived from the bit pattern of rsq stored as a float
+    union_int_float_t rsq_lookup;
+    rsq_lookup.f = rsq;
+    itable = rsq_lookup.i & tb->nmask;
+    itable >>= tb->nshiftbits;
+    fraction = (rsq_lookup.f - tb->rsq[itable]) * tb->drsq[itable];
+    value = tb->f[itable] + fraction*tb->df[itable];
+    fforce = factor_lj * value;
+  }
+
+  if (isite1 == isite2) fforce = sqrt(mixWtSite1_i*mixWtSite2_j)*fforce;  // identical sites: one weight term
+  else fforce = (sqrt(mixWtSite1_i*mixWtSite2_j) + sqrt(mixWtSite2_i*mixWtSite1_j))*fforce;  // distinct sites: both cross terms
+
+  if (tabstyle == LOOKUP)  // energy uses the same itable / fraction / a,b computed for the force
+    phi = tb->e[itable];
+  else if (tabstyle == LINEAR || tabstyle == BITMAP)
+    phi = tb->e[itable] + fraction*tb->de[itable];
+  else
+    phi = a * tb->e[itable] + b * tb->e[itable+1] +
+      ((a*a*a-a)*tb->e2[itable] + (b*b*b-b)*tb->e2[itable+1]) * tb->deltasq6;
+
+  if (isite1 == isite2) phi = sqrt(mixWtSite1_i*mixWtSite2_j)*phi;  // same weighting as applied to fforce
+  else phi = (sqrt(mixWtSite1_i*mixWtSite2_j) + sqrt(mixWtSite2_i*mixWtSite1_j))*phi;
+
+  return factor_lj*phi;
+}
+
 /* ----------------------------------------------------------------------
    compute r,e,f vectors from splined values
 ------------------------------------------------------------------------- */
@@ -526,9 +750,11 @@ void PairTableRXKokkos<DeviceType>::cleanup_copy() {
 }
 
 template<class DeviceType>
+template<class ExecDevice>
 KOKKOS_INLINE_FUNCTION
 void PairTableRXKokkos<DeviceType>::getMixingWeights(
-    typename DAT::t_float_2d_randomread dvector, int id,
+    typename ArrayTypes<ExecDevice>::t_float_2d_randomread dvector,
+    int id,
     double &mixWtSite1old, double &mixWtSite2old,
     double &mixWtSite1, double &mixWtSite2) {
   double fractionOFAold, fractionOFA;
diff --git a/src/KOKKOS/pair_table_rx_kokkos.h b/src/KOKKOS/pair_table_rx_kokkos.h
index 0d8a8f151e..de6de61429 100644
--- a/src/KOKKOS/pair_table_rx_kokkos.h
+++ b/src/KOKKOS/pair_table_rx_kokkos.h
@@ -45,7 +45,9 @@ class PairTableRXKokkos : public PairTable {
   void compute_style(int, int);
 
   void settings(int, char **);
+  void coeff(int, char **);
   double init_one(int, int);
+  virtual double single(int, int, int, int, double, double, double, double &);
 
   void init_style();
 
@@ -160,8 +162,10 @@ class PairTableRXKokkos : public PairTable {
   int isite1, isite2;
   bool fractionalWeighting;
 
+  template <class ExecDevice>
   KOKKOS_INLINE_FUNCTION
-  void getMixingWeights(typename DAT::t_float_2d_randomread, int, double &, double &, double &, double &);
+  void getMixingWeights(typename ArrayTypes<ExecDevice>::t_float_2d_randomread,
+      int, double &, double &, double &, double &);
 };
 
 }

From 4d5abe64d5cef0e1299bc7d43ac4482f25333d4f Mon Sep 17 00:00:00 2001
From: Dan Ibanez <dan.a.ibanez@gmail.com>
Date: Mon, 9 Jan 2017 14:04:03 -0700
Subject: [PATCH 062/267] draft compute_fpair for PairTableRXKokkos

---
 src/KOKKOS/pair_table_rx_kokkos.cpp | 21 ++++++++++++++++++---
 src/KOKKOS/pair_table_rx_kokkos.h   |  5 +++++
 2 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/src/KOKKOS/pair_table_rx_kokkos.cpp b/src/KOKKOS/pair_table_rx_kokkos.cpp
index 5a71739b6d..bb6c034dc0 100644
--- a/src/KOKKOS/pair_table_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_table_rx_kokkos.cpp
@@ -53,7 +53,7 @@ PairTableRXKokkos<DeviceType>::PairTableRXKokkos(LAMMPS *lmp) : PairTable(lmp)
   update_table = 0;
   atomKK = (AtomKokkos *) atom;
   execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
-  datamask_read = X_MASK | F_MASK | TYPE_MASK | ENERGY_MASK | VIRIAL_MASK;
+  datamask_read = X_MASK | F_MASK | TYPE_MASK | ENERGY_MASK | VIRIAL_MASK | DVECTOR_MASK;
   datamask_modify = F_MASK | ENERGY_MASK | VIRIAL_MASK;
   h_table = new TableHost();
   d_table = new TableDevice();
@@ -121,6 +121,19 @@ void PairTableRXKokkos<DeviceType>::compute_style(int eflag_in, int vflag_in)
   d_cutsq = d_table->cutsq;
   // loop over neighbors of my atoms
 
+  const int ntotal = atom->nlocal + atom->nghost;
+  mixWtSite1old_ = Kokkos::View<double*, DeviceType>("PairTableRxKokkos::mixWtSite1old", ntotal);
+  mixWtSite2old_ = Kokkos::View<double*, DeviceType>("PairTableRxKokkos::mixWtSite2old", ntotal);
+  mixWtSite1_ = Kokkos::View<double*, DeviceType>("PairTableRxKokkos::mixWtSite1", ntotal);
+  mixWtSite2_ = Kokkos::View<double*, DeviceType>("PairTableRxKokkos::mixWtSite2", ntotal);
+
+  typename DAT::t_float_2d_randomread d_dvector = atomKK->k_dvector.view<DeviceType>();
+
+  Kokkos::parallel_for(ntotal, LAMMPS_LAMBDA(int i) {
+    getMixingWeights<DeviceType>(d_dvector, i, mixWtSite1old_(i), mixWtSite2old_(i),
+        mixWtSite1_(i), mixWtSite2_(i));
+  });
+
   EV_FLOAT ev;
   if(atom->ntypes > MAX_TYPES_STACKPARAMS) {
     if (neighflag == FULL) {
@@ -186,8 +199,6 @@ template<bool STACKPARAMS, class Specialisation>
 KOKKOS_INLINE_FUNCTION
 F_FLOAT PairTableRXKokkos<DeviceType>::
 compute_fpair(const F_FLOAT& rsq, const int& i, const int&j, const int& itype, const int& jtype) const {
-  (void) i;
-  (void) j;
   union_int_float_t rsq_lookup;
   double fpair;
   const int tidx = d_table_const.tabindex(itype,jtype);
@@ -212,6 +223,9 @@ compute_fpair(const F_FLOAT& rsq, const int& i, const int&j, const int& itype, c
     const double fraction = (rsq_lookup.f - d_table_const.rsq(tidx,itable)) * d_table_const.drsq(tidx,itable);
     fpair = d_table_const.f(tidx,itable) + fraction*d_table_const.df(tidx,itable);
   }
+  if (isite1 == isite2) fpair *= sqrt(mixWtSite1old_(i) * mixWtSite2old_(j));
+  else fpair *= (sqrt(mixWtSite1old_(i) * mixWtSite2old_(j)) +
+                 sqrt(mixWtSite2old_(i) * mixWtSite1old_(j)));
   return fpair;
 }
 
@@ -646,6 +660,7 @@ double PairTableRXKokkos<DeviceType>::single(int i, int j, int itype, int jtype,
   a = 0.0;
   b = 0.0;
 
+  atomKK->k_dvector.template sync<LMPHostType>();
   typename ArrayTypes<LMPHostType>::t_float_2d_randomread h_dvector =
     atomKK->k_dvector.view<LMPHostType>();
   getMixingWeights<LMPHostType>(h_dvector,i,mixWtSite1old_i,mixWtSite2old_i,
diff --git a/src/KOKKOS/pair_table_rx_kokkos.h b/src/KOKKOS/pair_table_rx_kokkos.h
index de6de61429..a0d937549f 100644
--- a/src/KOKKOS/pair_table_rx_kokkos.h
+++ b/src/KOKKOS/pair_table_rx_kokkos.h
@@ -166,6 +166,11 @@ class PairTableRXKokkos : public PairTable {
   KOKKOS_INLINE_FUNCTION
   void getMixingWeights(typename ArrayTypes<ExecDevice>::t_float_2d_randomread,
       int, double &, double &, double &, double &);
+
+  Kokkos::View<double*, DeviceType> mixWtSite1old_;
+  Kokkos::View<double*, DeviceType> mixWtSite2old_;
+  Kokkos::View<double*, DeviceType> mixWtSite1_;
+  Kokkos::View<double*, DeviceType> mixWtSite2_;
 };
 
 }

From c877c07491e32160f42dbde04a7e34e5f6637b57 Mon Sep 17 00:00:00 2001
From: Dan Ibanez <dan.a.ibanez@gmail.com>
Date: Mon, 9 Jan 2017 16:21:32 -0700
Subject: [PATCH 063/267] progress towards custom compute functor

which is needed to handle uCG contributions.
---
 src/KOKKOS/pair_table_rx_kokkos.cpp | 220 ++++++++++++++++++++++++++--
 src/KOKKOS/pair_table_rx_kokkos.h   |  76 +++++-----
 2 files changed, 237 insertions(+), 59 deletions(-)

diff --git a/src/KOKKOS/pair_table_rx_kokkos.cpp b/src/KOKKOS/pair_table_rx_kokkos.cpp
index bb6c034dc0..cc0a416ad9 100644
--- a/src/KOKKOS/pair_table_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_table_rx_kokkos.cpp
@@ -90,6 +90,195 @@ void PairTableRXKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
     compute_style<BITMAP>(eflag_in,vflag_in);
 }
 
+template<class DeviceType>
+template <int NEIGHFLAG, bool STACKPARAMS, int TABSTYLE>
+PairTableRXKokkos<DeviceType>::Functor<NEIGHFLAG,STACKPARAMS,TABSTYLE>::Functor(  // qualified with Functor<...>, like the other member definitions
+    PairTableRXKokkos* c_ptr, NeighListKokkos<device_type>* list_ptr):
+  c(*c_ptr),f(c.f),list(*list_ptr)  // c and list are initialised from the pointees; f is taken from c.f
+{}
+
+template<class DeviceType>
+template <int NEIGHFLAG, bool STACKPARAMS, int TABSTYLE>
+PairTableRXKokkos<DeviceType>::Functor<NEIGHFLAG,STACKPARAMS,TABSTYLE>::~Functor() {  // qualified with Functor<...>, like the other member definitions
+  c.cleanup_copy();  // presumably c and list are copies whose resources must not be released here - TODO confirm
+  list.clean_copy();
+}
+
+template<class DeviceType>
+template <int NEIGHFLAG, bool STACKPARAMS, int TABSTYLE>
+template<int EVFLAG, int NEWTON_PAIR>
+KOKKOS_INLINE_FUNCTION
+EV_FLOAT
+PairTableRXKokkos<DeviceType>::Functor<NEIGHFLAG,STACKPARAMS,TABSTYLE>::
+compute_item(const int& ii) {  // NOTE(review): not const, yet called from the const operator() overloads - confirm against the declaration
+  EV_FLOAT ev;
+  const int i = list.d_ilist[ii];  // ii indexes the neighbor list's ilist; i is the atom index
+  const X_FLOAT xtmp = c.x(i,0);
+  const X_FLOAT ytmp = c.x(i,1);
+  const X_FLOAT ztmp = c.x(i,2);
+  const int itype = c.type(i);
+
+  const AtomNeighborsConst jlist = list.get_neighbors_const(i);
+  const int jnum = list.d_numneigh[i];
+
+  double uCG_i = 0.0;  // per-atom accumulators, written back once after the neighbor loop
+  double uCGnew_i = 0.0;
+  double fx_i = 0.0, fy_i = 0.0, fz_i = 0.0;
+
+  double mixWtSite1old_i = mixWtSite1old(i);  // NOTE(review): indexed with () here and [] for j below - confirm both name the same views
+  double mixWtSite2old_i = mixWtSite2old(i);
+  double mixWtSite1_i = mixWtSite1(i);
+  double mixWtSite2_i = mixWtSite2(i);
+
+  for (int jj = 0; jj < jnum; jj++) {
+    int j = jlist(jj);
+    const F_FLOAT factor_lj = c.special_lj[sbmask(j)];  // must be read before j is masked on the next line
+    j &= NEIGHMASK;
+
+    const X_FLOAT delx = xtmp - c.x(j,0);
+    const X_FLOAT dely = ytmp - c.x(j,1);
+    const X_FLOAT delz = ztmp - c.x(j,2);
+    const F_FLOAT rsq = delx*delx + dely*dely + delz*delz;
+    const int jtype = c.type(j);
+
+    if(rsq < (STACKPARAMS?c.m_cutsq[itype][jtype]:c.d_cutsq(itype,jtype))) {
+      double mixWtSite1old_j = mixWtSite1old[j];
+      double mixWtSite2old_j = mixWtSite2old[j];
+      double mixWtSite1_j = mixWtSite1[j];
+      double mixWtSite2_j = mixWtSite2[j];
+
+      const F_FLOAT fpair = factor_lj*c.template compute_fpair<STACKPARAMS,TABSTYLE>(rsq,i,j,itype,jtype);  // second template argument is the int table style
+
+      fx_i += delx*fpair;
+      fy_i += dely*fpair;
+      fz_i += delz*fpair;
+
+      bool do_half = (NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) &&  // half lists: also update j when newton is on or j < nlocal
+                     (NEWTON_PAIR || j < c.nlocal);
+      if (do_half) {
+        f(j,0) -= delx*fpair;  // NOTE(review): plain update of another atom's entry - confirm this is safe for HALFTHREAD
+        f(j,1) -= dely*fpair;
+        f(j,2) -= delz*fpair;
+      }
+
+      auto evdwl = c.template compute_evdwl<STACKPARAMS,TABSTYLE>(rsq,i,j,itype,jtype);
+
+      double evdwlOld;
+      if (isite1 == isite2) {  // identical sites: a single weight term
+        evdwlOld = sqrt(mixWtSite1old_i*mixWtSite2old_j)*evdwl;
+        evdwl = sqrt(mixWtSite1_i*mixWtSite2_j)*evdwl;
+      } else {  // distinct sites: both cross terms
+        evdwlOld = (sqrt(mixWtSite1old_i*mixWtSite2old_j) +
+                    sqrt(mixWtSite2old_i*mixWtSite1old_j))*evdwl;
+        evdwl = (sqrt(mixWtSite1_i*mixWtSite2_j) +
+                 sqrt(mixWtSite2_i*mixWtSite1_j))*evdwl;
+      }
+      evdwlOld *= factor_lj;
+      evdwl *= factor_lj;
+
+      uCG_i += 0.5*evdwlOld;  // half of the old-weight energy goes to i, the other half to j when do_half
+      if (do_half) uCG(j) += 0.5*evdwlOld;
+
+      uCGnew_i += 0.5*evdwl;  // same split for the energy from the current weights
+      if (do_half) uCGnew(j) += 0.5*evdwl;
+      evdwl = evdwlOld;  // from here on the tallied energy is the old-weight value
+
+      ev.evdwl += (do_half ? 1.0 : 0.5)*evdwl;  // accumulated regardless of EVFLAG
+
+      if (EVFLAG) ev_tally(ev,i,j,evdwl,fpair,delx,dely,delz);
+    }
+  }
+
+  uCG(i) += uCG_i;
+  uCGnew(i) += uCGnew_i;
+
+  f(i,0) += fx_i;
+  f(i,1) += fy_i;
+  f(i,2) += fz_i;
+
+  return ev;
+}
+
+template<class DeviceType>
+template <int NEIGHFLAG, bool STACKPARAMS, int TABSTYLE>
+KOKKOS_INLINE_FUNCTION
+void
+PairTableRXKokkos<DeviceType>::Functor<NEIGHFLAG,STACKPARAMS,TABSTYLE>::
+ev_tally(EV_FLOAT &ev, const int &i, const int &j,
+         const F_FLOAT &epair, const F_FLOAT &fpair, const F_FLOAT &delx,  // epair is not referenced in the body
+         const F_FLOAT &dely, const F_FLOAT &delz) const
+{
+  const int EFLAG = c.eflag;  // NOTE(review): read but never used below
+  const int NEWTON_PAIR = c.newton_pair;
+  const int VFLAG = c.vflag_either;
+
+  if (VFLAG) {
+    const E_FLOAT v0 = delx*delx*fpair;  // six pair-virial terms: xx, yy, zz, xy, xz, yz
+    const E_FLOAT v1 = dely*dely*fpair;
+    const E_FLOAT v2 = delz*delz*fpair;
+    const E_FLOAT v3 = delx*dely*fpair;
+    const E_FLOAT v4 = delx*delz*fpair;
+    const E_FLOAT v5 = dely*delz*fpair;
+
+    if (c.vflag_global) {  // only the global virial in ev.v is accumulated here
+      if (NEIGHFLAG!=FULL) {
+        if (NEWTON_PAIR) {  // non-FULL list with newton: add the whole term
+          ev.v[0] += v0;
+          ev.v[1] += v1;
+          ev.v[2] += v2;
+          ev.v[3] += v3;
+          ev.v[4] += v4;
+          ev.v[5] += v5;
+        } else {  // without newton: half of the term for each of i, j that is below nlocal
+          if (i < c.nlocal) {
+            ev.v[0] += 0.5*v0;
+            ev.v[1] += 0.5*v1;
+            ev.v[2] += 0.5*v2;
+            ev.v[3] += 0.5*v3;
+            ev.v[4] += 0.5*v4;
+            ev.v[5] += 0.5*v5;
+          }
+          if (j < c.nlocal) {
+            ev.v[0] += 0.5*v0;
+            ev.v[1] += 0.5*v1;
+            ev.v[2] += 0.5*v2;
+            ev.v[3] += 0.5*v3;
+            ev.v[4] += 0.5*v4;
+            ev.v[5] += 0.5*v5;
+          }
+        }
+      } else {  // FULL list: half of the term is added per call
+        ev.v[0] += 0.5*v0;
+        ev.v[1] += 0.5*v1;
+        ev.v[2] += 0.5*v2;
+        ev.v[3] += 0.5*v3;
+        ev.v[4] += 0.5*v4;
+        ev.v[5] += 0.5*v5;
+      }
+    }
+  }
+}
+
+template<class DeviceType>
+template <int NEIGHFLAG, bool STACKPARAMS, int TABSTYLE>
+KOKKOS_INLINE_FUNCTION
+void
+PairTableRXKokkos<DeviceType>::Functor<NEIGHFLAG,STACKPARAMS,TABSTYLE>::
+operator()(const int i) const {  // overload without an accumulator: EVFLAG=0 and the returned EV_FLOAT is discarded
+  if (c.newton_pair) compute_item<0,1>(i);  // runtime newton_pair selects the NEWTON_PAIR template argument
+  else compute_item<0,0>(i);
+}
+
+template<class DeviceType>
+template <int NEIGHFLAG, bool STACKPARAMS, int TABSTYLE>
+KOKKOS_INLINE_FUNCTION
+void
+PairTableRXKokkos<DeviceType>::Functor<NEIGHFLAG,STACKPARAMS,TABSTYLE>::
+operator()(const int i, value_type &energy_virial) const {  // overload with an accumulator: EVFLAG=1, result added to energy_virial
+  if (c.newton_pair) energy_virial += compute_item<1,1>(i);  // runtime newton_pair selects the NEWTON_PAIR template argument
+  else energy_virial += compute_item<1,0>(i);
+}
+
 template<class DeviceType>
 template<int TABSTYLE>
 void PairTableRXKokkos<DeviceType>::compute_style(int eflag_in, int vflag_in)
@@ -102,9 +291,10 @@ void PairTableRXKokkos<DeviceType>::compute_style(int eflag_in, int vflag_in)
   if (eflag || vflag) ev_setup(eflag,vflag);
   else evflag = vflag_fdotr = 0;
 
+  if (eflag_atom) error->all(FLERR, "pair table/rx/kk does not handle eflag_atom\n");
+  if (vflag_atom) error->all(FLERR, "pair table/rx/kk does not handle vflag_atom\n");
+
   atomKK->sync(execution_space,datamask_read);
-  //k_cutsq.template sync<DeviceType>();
-  //k_params.template sync<DeviceType>();
   if (eflag || vflag) atomKK->modified(execution_space,datamask_modify);
   else atomKK->modified(execution_space,F_MASK);
 
@@ -122,10 +312,10 @@ void PairTableRXKokkos<DeviceType>::compute_style(int eflag_in, int vflag_in)
   // loop over neighbors of my atoms
 
   const int ntotal = atom->nlocal + atom->nghost;
-  mixWtSite1old_ = Kokkos::View<double*, DeviceType>("PairTableRxKokkos::mixWtSite1old", ntotal);
-  mixWtSite2old_ = Kokkos::View<double*, DeviceType>("PairTableRxKokkos::mixWtSite2old", ntotal);
-  mixWtSite1_ = Kokkos::View<double*, DeviceType>("PairTableRxKokkos::mixWtSite1", ntotal);
-  mixWtSite2_ = Kokkos::View<double*, DeviceType>("PairTableRxKokkos::mixWtSite2", ntotal);
+  mixWtSite1old_ = Kokkos::View<double*, DeviceType>("PairTableRXKokkos::mixWtSite1old", ntotal);
+  mixWtSite2old_ = Kokkos::View<double*, DeviceType>("PairTableRXKokkos::mixWtSite2old", ntotal);
+  mixWtSite1_ = Kokkos::View<double*, DeviceType>("PairTableRXKokkos::mixWtSite1", ntotal);
+  mixWtSite2_ = Kokkos::View<double*, DeviceType>("PairTableRXKokkos::mixWtSite2", ntotal);
 
   typename DAT::t_float_2d_randomread d_dvector = atomKK->k_dvector.view<DeviceType>();
 
@@ -195,21 +385,21 @@ void PairTableRXKokkos<DeviceType>::compute_style(int eflag_in, int vflag_in)
 }
 
 template<class DeviceType>
-template<bool STACKPARAMS, class Specialisation>
+template<bool STACKPARAMS, int TABSTYLE>
 KOKKOS_INLINE_FUNCTION
 F_FLOAT PairTableRXKokkos<DeviceType>::
 compute_fpair(const F_FLOAT& rsq, const int& i, const int&j, const int& itype, const int& jtype) const {
   union_int_float_t rsq_lookup;
   double fpair;
   const int tidx = d_table_const.tabindex(itype,jtype);
-  if (Specialisation::TabStyle == LOOKUP) {
+  if (TABSTYLE == LOOKUP) {
     const int itable = static_cast<int> ((rsq - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx));
     fpair = d_table_const.f(tidx,itable);
-  } else if (Specialisation::TabStyle == LINEAR) {
+  } else if (TABSTYLE == LINEAR) {
     const int itable = static_cast<int> ((rsq - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx));
     const double fraction = (rsq - d_table_const.rsq(tidx,itable)) * d_table_const.invdelta(tidx);
     fpair = d_table_const.f(tidx,itable) + fraction*d_table_const.df(tidx,itable);
-  } else if (Specialisation::TabStyle == SPLINE) {
+  } else if (TABSTYLE == SPLINE) {
     const int itable = static_cast<int> ((rsq - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx));
     const double b = (rsq - d_table_const.rsq(tidx,itable)) * d_table_const.invdelta(tidx);
     const double a = 1.0 - b;
@@ -230,23 +420,21 @@ compute_fpair(const F_FLOAT& rsq, const int& i, const int&j, const int& itype, c
 }
 
 template<class DeviceType>
-template<bool STACKPARAMS, class Specialisation>
+template<bool STACKPARAMS, int TABSTYLE>
 KOKKOS_INLINE_FUNCTION
 F_FLOAT PairTableRXKokkos<DeviceType>::
 compute_evdwl(const F_FLOAT& rsq, const int& i, const int&j, const int& itype, const int& jtype) const {
-  (void) i;
-  (void) j;
   double evdwl;
   union_int_float_t rsq_lookup;
   const int tidx = d_table_const.tabindex(itype,jtype);
-  if (Specialisation::TabStyle == LOOKUP) {
+  if (TABSTYLE == LOOKUP) {
     const int itable = static_cast<int> ((rsq - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx));
     evdwl = d_table_const.e(tidx,itable);
-  } else if (Specialisation::TabStyle == LINEAR) {
+  } else if (TABSTYLE == LINEAR) {
     const int itable = static_cast<int> ((rsq - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx));
     const double fraction = (rsq - d_table_const.rsq(tidx,itable)) * d_table_const.invdelta(tidx);
     evdwl = d_table_const.e(tidx,itable) + fraction*d_table_const.de(tidx,itable);
-  } else if (Specialisation::TabStyle == SPLINE) {
+  } else if (TABSTYLE == SPLINE) {
     const int itable = static_cast<int> ((rsq - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx));
     const double b = (rsq - d_table_const.rsq(tidx,itable)) * d_table_const.invdelta(tidx);
     const double a = 1.0 - b;
diff --git a/src/KOKKOS/pair_table_rx_kokkos.h b/src/KOKKOS/pair_table_rx_kokkos.h
index a0d937549f..f717dc3f8a 100644
--- a/src/KOKKOS/pair_table_rx_kokkos.h
+++ b/src/KOKKOS/pair_table_rx_kokkos.h
@@ -33,7 +33,6 @@ class PairTableRXKokkos : public PairTable {
   using DAT = ArrayTypes<DeviceType>;
 
   enum {EnabledNeighFlags=FULL|HALFTHREAD|HALF|N2};
-  enum {COUL_FLAG=0};
   typedef DeviceType device_type;
 
   PairTableRXKokkos(class LAMMPS *);
@@ -111,48 +110,6 @@ class PairTableRXKokkos : public PairTable {
   KOKKOS_INLINE_FUNCTION
   F_FLOAT compute_evdwl(const F_FLOAT& rsq, const int& i, const int&j, const int& itype, const int& jtype) const;
 
-  template<bool STACKPARAMS, class Specialisation>
-  KOKKOS_INLINE_FUNCTION
-  F_FLOAT compute_ecoul(const F_FLOAT& rsq, const int& i, const int&j, const int& itype, const int& jtype) const {
-    return 0;
-  }
-
-  friend class PairComputeFunctor<PairTableRXKokkos,FULL,true,S_TableCompute<DeviceType,LOOKUP> >;
-  friend class PairComputeFunctor<PairTableRXKokkos,HALF,true,S_TableCompute<DeviceType,LOOKUP> >;
-  friend class PairComputeFunctor<PairTableRXKokkos,HALFTHREAD,true,S_TableCompute<DeviceType,LOOKUP> >;
-  friend class PairComputeFunctor<PairTableRXKokkos,N2,true,S_TableCompute<DeviceType,LOOKUP> >;
-  friend class PairComputeFunctor<PairTableRXKokkos,FULL,false,S_TableCompute<DeviceType,LOOKUP> >;
-  friend class PairComputeFunctor<PairTableRXKokkos,HALF,false,S_TableCompute<DeviceType,LOOKUP> >;
-  friend class PairComputeFunctor<PairTableRXKokkos,HALFTHREAD,false,S_TableCompute<DeviceType,LOOKUP> >;
-  friend class PairComputeFunctor<PairTableRXKokkos,N2,false,S_TableCompute<DeviceType,LOOKUP> >;
-
-  friend class PairComputeFunctor<PairTableRXKokkos,FULL,true,S_TableCompute<DeviceType,LINEAR> >;
-  friend class PairComputeFunctor<PairTableRXKokkos,HALF,true,S_TableCompute<DeviceType,LINEAR> >;
-  friend class PairComputeFunctor<PairTableRXKokkos,HALFTHREAD,true,S_TableCompute<DeviceType,LINEAR> >;
-  friend class PairComputeFunctor<PairTableRXKokkos,N2,true,S_TableCompute<DeviceType,LINEAR> >;
-  friend class PairComputeFunctor<PairTableRXKokkos,FULL,false,S_TableCompute<DeviceType,LINEAR> >;
-  friend class PairComputeFunctor<PairTableRXKokkos,HALF,false,S_TableCompute<DeviceType,LINEAR> >;
-  friend class PairComputeFunctor<PairTableRXKokkos,HALFTHREAD,false,S_TableCompute<DeviceType,LINEAR> >;
-  friend class PairComputeFunctor<PairTableRXKokkos,N2,false,S_TableCompute<DeviceType,LINEAR> >;
-
-  friend class PairComputeFunctor<PairTableRXKokkos,FULL,true,S_TableCompute<DeviceType,SPLINE> >;
-  friend class PairComputeFunctor<PairTableRXKokkos,HALF,true,S_TableCompute<DeviceType,SPLINE> >;
-  friend class PairComputeFunctor<PairTableRXKokkos,HALFTHREAD,true,S_TableCompute<DeviceType,SPLINE> >;
-  friend class PairComputeFunctor<PairTableRXKokkos,N2,true,S_TableCompute<DeviceType,SPLINE> >;
-  friend class PairComputeFunctor<PairTableRXKokkos,FULL,false,S_TableCompute<DeviceType,SPLINE> >;
-  friend class PairComputeFunctor<PairTableRXKokkos,HALF,false,S_TableCompute<DeviceType,SPLINE> >;
-  friend class PairComputeFunctor<PairTableRXKokkos,HALFTHREAD,false,S_TableCompute<DeviceType,SPLINE> >;
-  friend class PairComputeFunctor<PairTableRXKokkos,N2,false,S_TableCompute<DeviceType,SPLINE> >;
-
-  friend class PairComputeFunctor<PairTableRXKokkos,FULL,true,S_TableCompute<DeviceType,BITMAP> >;
-  friend class PairComputeFunctor<PairTableRXKokkos,HALF,true,S_TableCompute<DeviceType,BITMAP> >;
-  friend class PairComputeFunctor<PairTableRXKokkos,HALFTHREAD,true,S_TableCompute<DeviceType,BITMAP> >;
-  friend class PairComputeFunctor<PairTableRXKokkos,N2,true,S_TableCompute<DeviceType,BITMAP> >;
-  friend class PairComputeFunctor<PairTableRXKokkos,FULL,false,S_TableCompute<DeviceType,BITMAP> >;
-  friend class PairComputeFunctor<PairTableRXKokkos,HALF,false,S_TableCompute<DeviceType,BITMAP> >;
-  friend class PairComputeFunctor<PairTableRXKokkos,HALFTHREAD,false,S_TableCompute<DeviceType,BITMAP> >;
-  friend class PairComputeFunctor<PairTableRXKokkos,N2,false,S_TableCompute<DeviceType,BITMAP> >;
-
   friend void pair_virial_fdotr_compute<PairTableRXKokkos>(PairTableRXKokkos*);
 
   /* PairTableRX members */
@@ -171,6 +128,39 @@ class PairTableRXKokkos : public PairTable {
   Kokkos::View<double*, DeviceType> mixWtSite2old_;
   Kokkos::View<double*, DeviceType> mixWtSite1_;
   Kokkos::View<double*, DeviceType> mixWtSite2_;
+
+  /* a duplicate of PairComputeFunctor to deal with uCG */
+  template <int NEIGHFLAG, bool STACKPARAMS, int TABSTYLE>
+  struct Functor {
+    using device_type = DeviceType;
+    typedef EV_FLOAT value_type;
+    PairTableRXKokkos c;
+    // arrays are atomic for Half(Thread) neighbor style
+    Kokkos::View<F_FLOAT*[3], typename DAT::t_f_array::array_layout,
+                 device_type,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > f;
+    Kokkos::View<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,
+                 device_type,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > uCG;
+    Kokkos::View<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,
+                 device_type,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > uCGnew;
+    NeighListKokkos<device_type> list;
+    Functor(PairTableRXKokkos* c_ptr, NeighListKokkos<device_type>* list_ptr);
+    ~Functor();
+    KOKKOS_INLINE_FUNCTION int sbmask(const int& j) const {
+      return j >> SBBITS & 3;
+    }
+    template<int EVFLAG, int NEWTON_PAIR>
+    KOKKOS_INLINE_FUNCTION
+    EV_FLOAT compute_item(const int&,
+                          const NeighListKokkos<device_type>&, const NoCoulTag&) const;
+    KOKKOS_INLINE_FUNCTION
+    ev_tally(EV_FLOAT &ev, const int &i, const int &j,
+             const F_FLOAT &epair, const F_FLOAT &fpair, const F_FLOAT &delx,
+             const F_FLOAT &dely, const F_FLOAT &delz) const
+    KOKKOS_INLINE_FUNCTION
+    void operator()(const int) const;
+    KOKKOS_INLINE_FUNCTION
+    void operator()(const int, value_type&) const;
+  };
 };
 
 }

From e4673d7fa80b40dca606c4cf85f92e2f6d2b098b Mon Sep 17 00:00:00 2001
From: Dan Ibanez <dan.a.ibanez@gmail.com>
Date: Mon, 9 Jan 2017 16:36:25 -0700
Subject: [PATCH 064/267] fix compilation

---
 src/KOKKOS/pair_table_rx_kokkos.cpp | 52 +++++++++++++----------------
 src/KOKKOS/pair_table_rx_kokkos.h   | 10 +++---
 2 files changed, 28 insertions(+), 34 deletions(-)

diff --git a/src/KOKKOS/pair_table_rx_kokkos.cpp b/src/KOKKOS/pair_table_rx_kokkos.cpp
index cc0a416ad9..26e335fcff 100644
--- a/src/KOKKOS/pair_table_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_table_rx_kokkos.cpp
@@ -92,14 +92,14 @@ void PairTableRXKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
 
 template<class DeviceType>
 template <int NEIGHFLAG, bool STACKPARAMS, int TABSTYLE>
-PairTableRXKokkos<DeviceType>::Full<NEIGHFLAG,STACKPARAMS,TABSTYLE>::Functor(
+PairTableRXKokkos<DeviceType>::Functor<NEIGHFLAG,STACKPARAMS,TABSTYLE>::Functor(
     PairTableRXKokkos* c_ptr, NeighListKokkos<device_type>* list_ptr):
   c(*c_ptr),f(c.f),list(*list_ptr)
 {}
 
 template<class DeviceType>
 template <int NEIGHFLAG, bool STACKPARAMS, int TABSTYLE>
-PairTableRXKokkos<DeviceType>::Full<NEIGHFLAG,STACKPARAMS,TABSTYLE>::~Functor() {
+PairTableRXKokkos<DeviceType>::Functor<NEIGHFLAG,STACKPARAMS,TABSTYLE>::~Functor() {
   c.cleanup_copy();
   list.clean_copy();
 }
@@ -110,7 +110,7 @@ template<int EVFLAG, int NEWTON_PAIR>
 KOKKOS_INLINE_FUNCTION
 EV_FLOAT
 PairTableRXKokkos<DeviceType>::Functor<NEIGHFLAG,STACKPARAMS,TABSTYLE>::
-compute_item(const int& ii) {
+compute_item(const int& ii) const {
   EV_FLOAT ev;
   const int i = list.d_ilist[ii];
   const X_FLOAT xtmp = c.x(i,0);
@@ -125,10 +125,10 @@ compute_item(const int& ii) {
   double uCGnew_i = 0.0;
   double fx_i = 0.0, fy_i = 0.0, fz_i = 0.0;
 
-  double mixWtSite1old_i = mixWtSite1old(i);
-  double mixWtSite2old_i = mixWtSite2old(i);
-  double mixWtSite1_i = mixWtSite1(i);
-  double mixWtSite2_i = mixWtSite2(i);
+  double mixWtSite1old_i = c.mixWtSite1old_(i);
+  double mixWtSite2old_i = c.mixWtSite2old_(i);
+  double mixWtSite1_i = c.mixWtSite1_(i);
+  double mixWtSite2_i = c.mixWtSite2_(i);
 
   for (int jj = 0; jj < jnum; jj++) {
     int j = jlist(jj);
@@ -142,12 +142,12 @@ compute_item(const int& ii) {
     const int jtype = c.type(j);
 
     if(rsq < (STACKPARAMS?c.m_cutsq[itype][jtype]:c.d_cutsq(itype,jtype))) {
-      double mixWtSite1old_j = mixWtSite1old[j];
-      double mixWtSite2old_j = mixWtSite2old[j];
-      double mixWtSite1_j = mixWtSite1[j];
-      double mixWtSite2_j = mixWtSite2[j];
+      double mixWtSite1old_j = c.mixWtSite1old_(j);
+      double mixWtSite2old_j = c.mixWtSite2old_(j);
+      double mixWtSite1_j = c.mixWtSite1_(j);
+      double mixWtSite2_j = c.mixWtSite2_(j);
 
-      const F_FLOAT fpair = factor_lj*c.template compute_fpair<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype);
+      const F_FLOAT fpair = factor_lj*c.template compute_fpair<STACKPARAMS,TABSTYLE>(rsq,i,j,itype,jtype);
 
       fx_i += delx*fpair;
       fy_i += dely*fpair;
@@ -164,7 +164,7 @@ compute_item(const int& ii) {
       auto evdwl = c.template compute_evdwl<STACKPARAMS,TABSTYLE>(rsq,i,j,itype,jtype);
 
       double evdwlOld;
-      if (isite1 == isite2) {
+      if (c.isite1 == c.isite2) {
         evdwlOld = sqrt(mixWtSite1old_i*mixWtSite2old_j)*evdwl;
         evdwl = sqrt(mixWtSite1_i*mixWtSite2_j)*evdwl;
       } else {
@@ -324,48 +324,42 @@ void PairTableRXKokkos<DeviceType>::compute_style(int eflag_in, int vflag_in)
         mixWtSite1_(i), mixWtSite2_(i));
   });
 
+  if (neighflag == N2) error->all(FLERR,"pair table/rx/kk can't handle N2 yet\n");
+
   EV_FLOAT ev;
   if(atom->ntypes > MAX_TYPES_STACKPARAMS) {
     if (neighflag == FULL) {
-      PairComputeFunctor<PairTableRXKokkos<DeviceType>,FULL,false,S_TableCompute<DeviceType,TABSTYLE> >
-        ff(this,(NeighListKokkos<DeviceType>*) list);
+      Functor<FULL,false,TABSTYLE> ff(this,(NeighListKokkos<DeviceType>*) list);
       if (eflag || vflag) Kokkos::parallel_reduce(list->inum,ff,ev);
       else Kokkos::parallel_for(list->inum,ff);
     } else if (neighflag == HALFTHREAD) {
-      PairComputeFunctor<PairTableRXKokkos<DeviceType>,HALFTHREAD,false,S_TableCompute<DeviceType,TABSTYLE> >
-        ff(this,(NeighListKokkos<DeviceType>*) list);
+      Functor<HALFTHREAD,false,TABSTYLE> ff(this,(NeighListKokkos<DeviceType>*) list);
       if (eflag || vflag) Kokkos::parallel_reduce(list->inum,ff,ev);
       else Kokkos::parallel_for(list->inum,ff);
     } else if (neighflag == HALF) {
-      PairComputeFunctor<PairTableRXKokkos<DeviceType>,HALF,false,S_TableCompute<DeviceType,TABSTYLE> >
-        f(this,(NeighListKokkos<DeviceType>*) list);
+      Functor<HALF,false,TABSTYLE> f(this,(NeighListKokkos<DeviceType>*) list);
       if (eflag || vflag) Kokkos::parallel_reduce(list->inum,f,ev);
       else Kokkos::parallel_for(list->inum,f);
     } else if (neighflag == N2) {
-      PairComputeFunctor<PairTableRXKokkos<DeviceType>,N2,false,S_TableCompute<DeviceType,TABSTYLE> >
-        f(this,(NeighListKokkos<DeviceType>*) list);
+      Functor<N2,false,TABSTYLE> f(this,(NeighListKokkos<DeviceType>*) list);
       if (eflag || vflag) Kokkos::parallel_reduce(nlocal,f,ev);
       else Kokkos::parallel_for(nlocal,f);
     }
   } else {
     if (neighflag == FULL) {
-      PairComputeFunctor<PairTableRXKokkos<DeviceType>,FULL,true,S_TableCompute<DeviceType,TABSTYLE> >
-        f(this,(NeighListKokkos<DeviceType>*) list);
+      Functor<FULL,true,TABSTYLE> f(this,(NeighListKokkos<DeviceType>*) list);
       if (eflag || vflag) Kokkos::parallel_reduce(list->inum,f,ev);
       else Kokkos::parallel_for(list->inum,f);
     } else if (neighflag == HALFTHREAD) {
-      PairComputeFunctor<PairTableRXKokkos<DeviceType>,HALFTHREAD,true,S_TableCompute<DeviceType,TABSTYLE> >
-        f(this,(NeighListKokkos<DeviceType>*) list);
+      Functor<HALFTHREAD,true,TABSTYLE> f(this,(NeighListKokkos<DeviceType>*) list);
       if (eflag || vflag) Kokkos::parallel_reduce(list->inum,f,ev);
       else Kokkos::parallel_for(list->inum,f);
     } else if (neighflag == HALF) {
-      PairComputeFunctor<PairTableRXKokkos<DeviceType>,HALF,true,S_TableCompute<DeviceType,TABSTYLE> >
-        f(this,(NeighListKokkos<DeviceType>*) list);
+      Functor<HALF,true,TABSTYLE> f(this,(NeighListKokkos<DeviceType>*) list);
       if (eflag || vflag) Kokkos::parallel_reduce(list->inum,f,ev);
       else Kokkos::parallel_for(list->inum,f);
     } else if (neighflag == N2) {
-      PairComputeFunctor<PairTableRXKokkos<DeviceType>,N2,true,S_TableCompute<DeviceType,TABSTYLE> >
-        f(this,(NeighListKokkos<DeviceType>*) list);
+      Functor<N2,true,TABSTYLE> f(this,(NeighListKokkos<DeviceType>*) list);
       if (eflag || vflag) Kokkos::parallel_reduce(nlocal,f,ev);
       else Kokkos::parallel_for(nlocal,f);
     }
diff --git a/src/KOKKOS/pair_table_rx_kokkos.h b/src/KOKKOS/pair_table_rx_kokkos.h
index f717dc3f8a..c468461263 100644
--- a/src/KOKKOS/pair_table_rx_kokkos.h
+++ b/src/KOKKOS/pair_table_rx_kokkos.h
@@ -102,11 +102,11 @@ class PairTableRXKokkos : public PairTable {
   void create_kokkos_tables();
   void cleanup_copy();
 
-  template<bool STACKPARAMS, class Specialisation>
+  template<bool STACKPARAMS, int TABSTYLE>
   KOKKOS_INLINE_FUNCTION
   F_FLOAT compute_fpair(const F_FLOAT& rsq, const int& i, const int&j, const int& itype, const int& jtype) const;
 
-  template<bool STACKPARAMS, class Specialisation>
+  template<bool STACKPARAMS, int TABSTYLE>
   KOKKOS_INLINE_FUNCTION
   F_FLOAT compute_evdwl(const F_FLOAT& rsq, const int& i, const int&j, const int& itype, const int& jtype) const;
 
@@ -150,12 +150,12 @@ class PairTableRXKokkos : public PairTable {
     }
     template<int EVFLAG, int NEWTON_PAIR>
     KOKKOS_INLINE_FUNCTION
-    EV_FLOAT compute_item(const int&,
-                          const NeighListKokkos<device_type>&, const NoCoulTag&) const;
+    EV_FLOAT compute_item(const int&) const;
     KOKKOS_INLINE_FUNCTION
+    void
     ev_tally(EV_FLOAT &ev, const int &i, const int &j,
              const F_FLOAT &epair, const F_FLOAT &fpair, const F_FLOAT &delx,
-             const F_FLOAT &dely, const F_FLOAT &delz) const
+             const F_FLOAT &dely, const F_FLOAT &delz) const;
     KOKKOS_INLINE_FUNCTION
     void operator()(const int) const;
     KOKKOS_INLINE_FUNCTION

From 5d5751be190b198c8e8c48526d40217e0c443255 Mon Sep 17 00:00:00 2001
From: Dan Ibanez <dan.a.ibanez@gmail.com>
Date: Tue, 10 Jan 2017 12:38:48 -0700
Subject: [PATCH 065/267] fix class name in PAIR_CLASS setup

---
 src/KOKKOS/pair_table_rx_kokkos.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/KOKKOS/pair_table_rx_kokkos.h b/src/KOKKOS/pair_table_rx_kokkos.h
index c468461263..de9ae20e35 100644
--- a/src/KOKKOS/pair_table_rx_kokkos.h
+++ b/src/KOKKOS/pair_table_rx_kokkos.h
@@ -13,9 +13,9 @@
 
 #ifdef PAIR_CLASS
 
-PairStyle(table/rx/kk,PairTableKokkos<LMPDeviceType>)
-PairStyle(table/rx/kk/device,PairTableKokkos<LMPDeviceType>)
-PairStyle(table/rx/kk/host,PairTableKokkos<LMPHostType>)
+PairStyle(table/rx/kk,PairTableRXKokkos<LMPDeviceType>)
+PairStyle(table/rx/kk/device,PairTableRXKokkos<LMPDeviceType>)
+PairStyle(table/rx/kk/host,PairTableRXKokkos<LMPHostType>)
 
 #else
 

From 55aa91be6b88670dc16658883e30bfe0710e695e Mon Sep 17 00:00:00 2001
From: Dan Ibanez <dan.a.ibanez@gmail.com>
Date: Tue, 10 Jan 2017 13:07:22 -0700
Subject: [PATCH 066/267] copy uCG and uCGnew correctly

---
 src/KOKKOS/pair_table_rx_kokkos.cpp | 9 ++++++---
 src/KOKKOS/pair_table_rx_kokkos.h   | 2 ++
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/src/KOKKOS/pair_table_rx_kokkos.cpp b/src/KOKKOS/pair_table_rx_kokkos.cpp
index 26e335fcff..0cb2f11efc 100644
--- a/src/KOKKOS/pair_table_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_table_rx_kokkos.cpp
@@ -53,8 +53,9 @@ PairTableRXKokkos<DeviceType>::PairTableRXKokkos(LAMMPS *lmp) : PairTable(lmp)
   update_table = 0;
   atomKK = (AtomKokkos *) atom;
   execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
-  datamask_read = X_MASK | F_MASK | TYPE_MASK | ENERGY_MASK | VIRIAL_MASK | DVECTOR_MASK;
-  datamask_modify = F_MASK | ENERGY_MASK | VIRIAL_MASK;
+  datamask_read = X_MASK | F_MASK | TYPE_MASK | ENERGY_MASK | VIRIAL_MASK |
+                  DVECTOR_MASK | UCG_MASK | UCGNEW_MASK;
+  datamask_modify = F_MASK | ENERGY_MASK | VIRIAL_MASK | UCG_MASK | UCGNEW_MASK;
   h_table = new TableHost();
   d_table = new TableDevice();
   fractionalWeighting = true;
@@ -94,7 +95,7 @@ template<class DeviceType>
 template <int NEIGHFLAG, bool STACKPARAMS, int TABSTYLE>
 PairTableRXKokkos<DeviceType>::Functor<NEIGHFLAG,STACKPARAMS,TABSTYLE>::Functor(
     PairTableRXKokkos* c_ptr, NeighListKokkos<device_type>* list_ptr):
-  c(*c_ptr),f(c.f),list(*list_ptr)
+  c(*c_ptr),f(c.f),uCG(c.uCG),uCGnew(c.uCGnew),list(*list_ptr)
 {}
 
 template<class DeviceType>
@@ -301,6 +302,8 @@ void PairTableRXKokkos<DeviceType>::compute_style(int eflag_in, int vflag_in)
   x = c_x = atomKK->k_x.view<DeviceType>();
   f = atomKK->k_f.view<DeviceType>();
   type = atomKK->k_type.view<DeviceType>();
+  uCG = atomKK->k_uCG.view<DeviceType>();
+  uCGnew = atomKK->k_uCGnew.view<DeviceType>();
   nlocal = atom->nlocal;
   nall = atom->nlocal + atom->nghost;
   special_lj[0] = force->special_lj[0];
diff --git a/src/KOKKOS/pair_table_rx_kokkos.h b/src/KOKKOS/pair_table_rx_kokkos.h
index de9ae20e35..ad8071800f 100644
--- a/src/KOKKOS/pair_table_rx_kokkos.h
+++ b/src/KOKKOS/pair_table_rx_kokkos.h
@@ -92,6 +92,8 @@ class PairTableRXKokkos : public PairTable {
   typename ArrayTypes<DeviceType>::t_x_array_const c_x;
   typename ArrayTypes<DeviceType>::t_f_array f;
   typename ArrayTypes<DeviceType>::t_int_1d_randomread type;
+  typename ArrayTypes<DeviceType>::t_efloat_1d uCG;
+  typename ArrayTypes<DeviceType>::t_efloat_1d uCGnew;
   typename ArrayTypes<DeviceType>::t_efloat_1d d_eatom;
   typename ArrayTypes<DeviceType>::t_virial_array d_vatom;
 

From d65676e981bdb5d8a9cc2c4dad9c8dfb09ea1d74 Mon Sep 17 00:00:00 2001
From: Dan Ibanez <daibane@sandia.gov>
Date: Tue, 10 Jan 2017 16:08:55 -0700
Subject: [PATCH 067/267] make everything public to appease NVCC

---
 src/KOKKOS/pair_table_rx_kokkos.h | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/KOKKOS/pair_table_rx_kokkos.h b/src/KOKKOS/pair_table_rx_kokkos.h
index ad8071800f..ed0f0c2eb2 100644
--- a/src/KOKKOS/pair_table_rx_kokkos.h
+++ b/src/KOKKOS/pair_table_rx_kokkos.h
@@ -50,9 +50,6 @@ class PairTableRXKokkos : public PairTable {
 
   void init_style();
 
-
- protected:
-
   struct TableDeviceConst {
     typename ArrayTypes<DeviceType>::t_ffloat_2d cutsq;
     typename ArrayTypes<DeviceType>::t_int_2d tabindex;
@@ -97,7 +94,6 @@ class PairTableRXKokkos : public PairTable {
   typename ArrayTypes<DeviceType>::t_efloat_1d d_eatom;
   typename ArrayTypes<DeviceType>::t_virial_array d_vatom;
 
- protected:
   int nlocal,nall,eflag,vflag,neighflag,newton_pair;
 
   int update_table;
@@ -163,6 +159,7 @@ class PairTableRXKokkos : public PairTable {
     KOKKOS_INLINE_FUNCTION
     void operator()(const int, value_type&) const;
   };
+
 };
 
 }

From 6a9a0e8c334f60ccc8a6e4a8ff19308cac09a156 Mon Sep 17 00:00:00 2001
From: Dan Ibanez <daibane@sandia.gov>
Date: Wed, 11 Jan 2017 09:25:13 -0700
Subject: [PATCH 068/267] tracking down some invalid reads...

---
 src/KOKKOS/Install.sh               |   4 +-
 src/KOKKOS/pair_table_rx_kokkos.cpp | 252 ++++++++++++++--------------
 src/KOKKOS/pair_table_rx_kokkos.h   |   4 +-
 3 files changed, 129 insertions(+), 131 deletions(-)

diff --git a/src/KOKKOS/Install.sh b/src/KOKKOS/Install.sh
index cfda7dbf94..e76f62d65d 100644
--- a/src/KOKKOS/Install.sh
+++ b/src/KOKKOS/Install.sh
@@ -28,8 +28,8 @@ action () {
 
 # force rebuild of files with LMP_KOKKOS switch
 
-touch ../accelerator_kokkos.h
-touch ../memory.h
+#touch ../accelerator_kokkos.h
+#touch ../memory.h
 
 # list of files with optional dependcies
 
diff --git a/src/KOKKOS/pair_table_rx_kokkos.cpp b/src/KOKKOS/pair_table_rx_kokkos.cpp
index 0cb2f11efc..6c7c7b0efe 100644
--- a/src/KOKKOS/pair_table_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_table_rx_kokkos.cpp
@@ -94,15 +94,15 @@ void PairTableRXKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
 template<class DeviceType>
 template <int NEIGHFLAG, bool STACKPARAMS, int TABSTYLE>
 PairTableRXKokkos<DeviceType>::Functor<NEIGHFLAG,STACKPARAMS,TABSTYLE>::Functor(
-    PairTableRXKokkos* c_ptr, NeighListKokkos<device_type>* list_ptr):
-  c(*c_ptr),f(c.f),uCG(c.uCG),uCGnew(c.uCGnew),list(*list_ptr)
+    PairTableRXKokkos* c_ptr, NeighListKokkos<device_type>* list_ptr)//:
+//c(*c_ptr),f(c.f),uCG(c.uCG),uCGnew(c.uCGnew),list(*list_ptr)
 {}
 
 template<class DeviceType>
 template <int NEIGHFLAG, bool STACKPARAMS, int TABSTYLE>
 PairTableRXKokkos<DeviceType>::Functor<NEIGHFLAG,STACKPARAMS,TABSTYLE>::~Functor() {
-  c.cleanup_copy();
-  list.clean_copy();
+//c.cleanup_copy();
+//list.clean_copy();
 }
 
 template<class DeviceType>
@@ -113,89 +113,89 @@ EV_FLOAT
 PairTableRXKokkos<DeviceType>::Functor<NEIGHFLAG,STACKPARAMS,TABSTYLE>::
 compute_item(const int& ii) const {
   EV_FLOAT ev;
-  const int i = list.d_ilist[ii];
-  const X_FLOAT xtmp = c.x(i,0);
-  const X_FLOAT ytmp = c.x(i,1);
-  const X_FLOAT ztmp = c.x(i,2);
-  const int itype = c.type(i);
+//const int i = list.d_ilist[ii];
+//const X_FLOAT xtmp = c.x(i,0);
+//const X_FLOAT ytmp = c.x(i,1);
+//const X_FLOAT ztmp = c.x(i,2);
+//const int itype = c.type(i);
 
-  const AtomNeighborsConst jlist = list.get_neighbors_const(i);
-  const int jnum = list.d_numneigh[i];
+//const AtomNeighborsConst jlist = list.get_neighbors_const(i);
+//const int jnum = list.d_numneigh[i];
 
-  double uCG_i = 0.0;
-  double uCGnew_i = 0.0;
-  double fx_i = 0.0, fy_i = 0.0, fz_i = 0.0;
+//double uCG_i = 0.0;
+//double uCGnew_i = 0.0;
+//double fx_i = 0.0, fy_i = 0.0, fz_i = 0.0;
 
-  double mixWtSite1old_i = c.mixWtSite1old_(i);
-  double mixWtSite2old_i = c.mixWtSite2old_(i);
-  double mixWtSite1_i = c.mixWtSite1_(i);
-  double mixWtSite2_i = c.mixWtSite2_(i);
+//double mixWtSite1old_i = c.mixWtSite1old_(i);
+//double mixWtSite2old_i = c.mixWtSite2old_(i);
+//double mixWtSite1_i = c.mixWtSite1_(i);
+//double mixWtSite2_i = c.mixWtSite2_(i);
 
-  for (int jj = 0; jj < jnum; jj++) {
-    int j = jlist(jj);
-    const F_FLOAT factor_lj = c.special_lj[sbmask(j)];
-    j &= NEIGHMASK;
+//for (int jj = 0; jj < jnum; jj++) {
+//  int j = jlist(jj);
+//  const F_FLOAT factor_lj = c.special_lj[sbmask(j)];
+//  j &= NEIGHMASK;
 
-    const X_FLOAT delx = xtmp - c.x(j,0);
-    const X_FLOAT dely = ytmp - c.x(j,1);
-    const X_FLOAT delz = ztmp - c.x(j,2);
-    const F_FLOAT rsq = delx*delx + dely*dely + delz*delz;
-    const int jtype = c.type(j);
+//  const X_FLOAT delx = xtmp - c.x(j,0);
+//  const X_FLOAT dely = ytmp - c.x(j,1);
+//  const X_FLOAT delz = ztmp - c.x(j,2);
+//  const F_FLOAT rsq = delx*delx + dely*dely + delz*delz;
+//  const int jtype = c.type(j);
 
-    if(rsq < (STACKPARAMS?c.m_cutsq[itype][jtype]:c.d_cutsq(itype,jtype))) {
-      double mixWtSite1old_j = c.mixWtSite1old_(j);
-      double mixWtSite2old_j = c.mixWtSite2old_(j);
-      double mixWtSite1_j = c.mixWtSite1_(j);
-      double mixWtSite2_j = c.mixWtSite2_(j);
+//  if(rsq < (STACKPARAMS?c.m_cutsq[itype][jtype]:c.d_cutsq(itype,jtype))) {
+//    double mixWtSite1old_j = c.mixWtSite1old_(j);
+//    double mixWtSite2old_j = c.mixWtSite2old_(j);
+//    double mixWtSite1_j = c.mixWtSite1_(j);
+//    double mixWtSite2_j = c.mixWtSite2_(j);
 
-      const F_FLOAT fpair = factor_lj*c.template compute_fpair<STACKPARAMS,TABSTYLE>(rsq,i,j,itype,jtype);
+//    const F_FLOAT fpair = factor_lj*c.template compute_fpair<STACKPARAMS,TABSTYLE>(rsq,i,j,itype,jtype);
 
-      fx_i += delx*fpair;
-      fy_i += dely*fpair;
-      fz_i += delz*fpair;
+//    fx_i += delx*fpair;
+//    fy_i += dely*fpair;
+//    fz_i += delz*fpair;
 
-      bool do_half = (NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) &&
-                     (NEWTON_PAIR || j < c.nlocal);
-      if (do_half) {
-        f(j,0) -= delx*fpair;
-        f(j,1) -= dely*fpair;
-        f(j,2) -= delz*fpair;
-      }
+//    bool do_half = (NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) &&
+//                   (NEWTON_PAIR || j < c.nlocal);
+//    if (do_half) {
+//      f(j,0) -= delx*fpair;
+//      f(j,1) -= dely*fpair;
+//      f(j,2) -= delz*fpair;
+//    }
 
-      auto evdwl = c.template compute_evdwl<STACKPARAMS,TABSTYLE>(rsq,i,j,itype,jtype);
+//    auto evdwl = c.template compute_evdwl<STACKPARAMS,TABSTYLE>(rsq,i,j,itype,jtype);
 
-      double evdwlOld;
-      if (c.isite1 == c.isite2) {
-        evdwlOld = sqrt(mixWtSite1old_i*mixWtSite2old_j)*evdwl;
-        evdwl = sqrt(mixWtSite1_i*mixWtSite2_j)*evdwl;
-      } else {
-        evdwlOld = (sqrt(mixWtSite1old_i*mixWtSite2old_j) +
-                    sqrt(mixWtSite2old_i*mixWtSite1old_j))*evdwl;
-        evdwl = (sqrt(mixWtSite1_i*mixWtSite2_j) +
-                 sqrt(mixWtSite2_i*mixWtSite1_j))*evdwl;
-      }
-      evdwlOld *= factor_lj;
-      evdwl *= factor_lj;
+//    double evdwlOld;
+//    if (c.isite1 == c.isite2) {
+//      evdwlOld = sqrt(mixWtSite1old_i*mixWtSite2old_j)*evdwl;
+//      evdwl = sqrt(mixWtSite1_i*mixWtSite2_j)*evdwl;
+//    } else {
+//      evdwlOld = (sqrt(mixWtSite1old_i*mixWtSite2old_j) +
+//                  sqrt(mixWtSite2old_i*mixWtSite1old_j))*evdwl;
+//      evdwl = (sqrt(mixWtSite1_i*mixWtSite2_j) +
+//               sqrt(mixWtSite2_i*mixWtSite1_j))*evdwl;
+//    }
+//    evdwlOld *= factor_lj;
+//    evdwl *= factor_lj;
 
-      uCG_i += 0.5*evdwlOld;
-      if (do_half) uCG(j) += 0.5*evdwlOld;
+//    uCG_i += 0.5*evdwlOld;
+//    if (do_half) uCG(j) += 0.5*evdwlOld;
 
-      uCGnew_i += 0.5*evdwl;
-      if (do_half) uCGnew(j) += 0.5*evdwl;
-      evdwl = evdwlOld;
+//    uCGnew_i += 0.5*evdwl;
+//    if (do_half) uCGnew(j) += 0.5*evdwl;
+//    evdwl = evdwlOld;
 
-      ev.evdwl += (do_half ? 1.0 : 0.5)*evdwl;
+//    ev.evdwl += (do_half ? 1.0 : 0.5)*evdwl;
 
-      if (EVFLAG) ev_tally(ev,i,j,evdwl,fpair,delx,dely,delz);
-    }
-  }
+//    if (EVFLAG) ev_tally(ev,i,j,evdwl,fpair,delx,dely,delz);
+//  }
+//}
 
-  uCG(i) += uCG_i;
-  uCGnew(i) += uCGnew_i;
+//uCG(i) += uCG_i;
+//uCGnew(i) += uCGnew_i;
 
-  f(i,0) += fx_i;
-  f(i,1) += fy_i;
-  f(i,2) += fz_i;
+//f(i,0) += fx_i;
+//f(i,1) += fy_i;
+//f(i,2) += fz_i;
 
   return ev;
 }
@@ -209,55 +209,55 @@ ev_tally(EV_FLOAT &ev, const int &i, const int &j,
          const F_FLOAT &epair, const F_FLOAT &fpair, const F_FLOAT &delx,
          const F_FLOAT &dely, const F_FLOAT &delz) const
 {
-  const int EFLAG = c.eflag;
-  const int NEWTON_PAIR = c.newton_pair;
-  const int VFLAG = c.vflag_either;
+//const int EFLAG = c.eflag;
+//const int NEWTON_PAIR = c.newton_pair;
+//const int VFLAG = c.vflag_either;
 
-  if (VFLAG) {
-    const E_FLOAT v0 = delx*delx*fpair;
-    const E_FLOAT v1 = dely*dely*fpair;
-    const E_FLOAT v2 = delz*delz*fpair;
-    const E_FLOAT v3 = delx*dely*fpair;
-    const E_FLOAT v4 = delx*delz*fpair;
-    const E_FLOAT v5 = dely*delz*fpair;
+//if (VFLAG) {
+//  const E_FLOAT v0 = delx*delx*fpair;
+//  const E_FLOAT v1 = dely*dely*fpair;
+//  const E_FLOAT v2 = delz*delz*fpair;
+//  const E_FLOAT v3 = delx*dely*fpair;
+//  const E_FLOAT v4 = delx*delz*fpair;
+//  const E_FLOAT v5 = dely*delz*fpair;
 
-    if (c.vflag_global) {
-      if (NEIGHFLAG!=FULL) {
-        if (NEWTON_PAIR) {
-          ev.v[0] += v0;
-          ev.v[1] += v1;
-          ev.v[2] += v2;
-          ev.v[3] += v3;
-          ev.v[4] += v4;
-          ev.v[5] += v5;
-        } else {
-          if (i < c.nlocal) {
-            ev.v[0] += 0.5*v0;
-            ev.v[1] += 0.5*v1;
-            ev.v[2] += 0.5*v2;
-            ev.v[3] += 0.5*v3;
-            ev.v[4] += 0.5*v4;
-            ev.v[5] += 0.5*v5;
-          }
-          if (j < c.nlocal) {
-            ev.v[0] += 0.5*v0;
-            ev.v[1] += 0.5*v1;
-            ev.v[2] += 0.5*v2;
-            ev.v[3] += 0.5*v3;
-            ev.v[4] += 0.5*v4;
-            ev.v[5] += 0.5*v5;
-          }
-        }
-      } else {
-        ev.v[0] += 0.5*v0;
-        ev.v[1] += 0.5*v1;
-        ev.v[2] += 0.5*v2;
-        ev.v[3] += 0.5*v3;
-        ev.v[4] += 0.5*v4;
-        ev.v[5] += 0.5*v5;
-      }
-    }
-  }
+//  if (c.vflag_global) {
+//    if (NEIGHFLAG!=FULL) {
+//      if (NEWTON_PAIR) {
+//        ev.v[0] += v0;
+//        ev.v[1] += v1;
+//        ev.v[2] += v2;
+//        ev.v[3] += v3;
+//        ev.v[4] += v4;
+//        ev.v[5] += v5;
+//      } else {
+//        if (i < c.nlocal) {
+//          ev.v[0] += 0.5*v0;
+//          ev.v[1] += 0.5*v1;
+//          ev.v[2] += 0.5*v2;
+//          ev.v[3] += 0.5*v3;
+//          ev.v[4] += 0.5*v4;
+//          ev.v[5] += 0.5*v5;
+//        }
+//        if (j < c.nlocal) {
+//          ev.v[0] += 0.5*v0;
+//          ev.v[1] += 0.5*v1;
+//          ev.v[2] += 0.5*v2;
+//          ev.v[3] += 0.5*v3;
+//          ev.v[4] += 0.5*v4;
+//          ev.v[5] += 0.5*v5;
+//        }
+//      }
+//    } else {
+//      ev.v[0] += 0.5*v0;
+//      ev.v[1] += 0.5*v1;
+//      ev.v[2] += 0.5*v2;
+//      ev.v[3] += 0.5*v3;
+//      ev.v[4] += 0.5*v4;
+//      ev.v[5] += 0.5*v5;
+//    }
+//  }
+//}
 }
 
 template<class DeviceType>
@@ -266,8 +266,8 @@ KOKKOS_INLINE_FUNCTION
 void
 PairTableRXKokkos<DeviceType>::Functor<NEIGHFLAG,STACKPARAMS,TABSTYLE>::
 operator()(const int i) const {
-  if (c.newton_pair) compute_item<0,1>(i);
-  else compute_item<0,0>(i);
+//if (c.newton_pair) compute_item<0,1>(i);
+//else compute_item<0,0>(i);
 }
 
 template<class DeviceType>
@@ -276,8 +276,8 @@ KOKKOS_INLINE_FUNCTION
 void
 PairTableRXKokkos<DeviceType>::Functor<NEIGHFLAG,STACKPARAMS,TABSTYLE>::
 operator()(const int i, value_type &energy_virial) const {
-  if (c.newton_pair) energy_virial += compute_item<1,1>(i);
-  else energy_virial += compute_item<1,0>(i);
+//if (c.newton_pair) energy_virial += compute_item<1,1>(i);
+//else energy_virial += compute_item<1,0>(i);
 }
 
 template<class DeviceType>
@@ -322,10 +322,10 @@ void PairTableRXKokkos<DeviceType>::compute_style(int eflag_in, int vflag_in)
 
   typename DAT::t_float_2d_randomread d_dvector = atomKK->k_dvector.view<DeviceType>();
 
-  Kokkos::parallel_for(ntotal, LAMMPS_LAMBDA(int i) {
-    getMixingWeights<DeviceType>(d_dvector, i, mixWtSite1old_(i), mixWtSite2old_(i),
-        mixWtSite1_(i), mixWtSite2_(i));
-  });
+//Kokkos::parallel_for(ntotal, LAMMPS_LAMBDA(int i) {
+//  getMixingWeights<DeviceType>(d_dvector, i, mixWtSite1old_(i), mixWtSite2old_(i),
+//      mixWtSite1_(i), mixWtSite2_(i));
+//});
 
   if (neighflag == N2) error->all(FLERR,"pair table/rx/kk can't handle N2 yet\n");
 
@@ -971,8 +971,6 @@ void PairTableRXKokkos<DeviceType>::getMixingWeights(
     nTotal += dvector(ispecies,id);
     nTotalOld += dvector(ispecies+nspecies,id);
   }
-  if(nTotal < MY_EPSILON || nTotalOld < MY_EPSILON)
-    error->all(FLERR,"The number of molecules in CG particle is less than 10*DBL_EPSILON.");
 
   if (isOneFluid(isite1) == false){
     nMoleculesOld1 = dvector(isite1+nspecies,id);
diff --git a/src/KOKKOS/pair_table_rx_kokkos.h b/src/KOKKOS/pair_table_rx_kokkos.h
index ed0f0c2eb2..b71f57076d 100644
--- a/src/KOKKOS/pair_table_rx_kokkos.h
+++ b/src/KOKKOS/pair_table_rx_kokkos.h
@@ -132,7 +132,7 @@ class PairTableRXKokkos : public PairTable {
   struct Functor {
     using device_type = DeviceType;
     typedef EV_FLOAT value_type;
-    PairTableRXKokkos c;
+  //PairTableRXKokkos<device_type> c;
     // arrays are atomic for Half(Thread) neighbor style
     Kokkos::View<F_FLOAT*[3], typename DAT::t_f_array::array_layout,
                  device_type,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > f;
@@ -140,7 +140,7 @@ class PairTableRXKokkos : public PairTable {
                  device_type,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > uCG;
     Kokkos::View<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,
                  device_type,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > uCGnew;
-    NeighListKokkos<device_type> list;
+  //NeighListKokkos<device_type> list;
     Functor(PairTableRXKokkos* c_ptr, NeighListKokkos<device_type>* list_ptr);
     ~Functor();
     KOKKOS_INLINE_FUNCTION int sbmask(const int& j) const {

From b5ff41f5efedd0aea4674263c7f4d620308e3100 Mon Sep 17 00:00:00 2001
From: Dan Ibanez <daibane@sandia.gov>
Date: Wed, 11 Jan 2017 11:10:33 -0700
Subject: [PATCH 069/267] made MixingWeights code non-member

CUDA was simply giving too many
errors dealing with captures of
member variables.
---
 src/KOKKOS/pair_table_rx_kokkos.cpp | 199 ++++++++++++++++------------
 1 file changed, 114 insertions(+), 85 deletions(-)

diff --git a/src/KOKKOS/pair_table_rx_kokkos.cpp b/src/KOKKOS/pair_table_rx_kokkos.cpp
index 6c7c7b0efe..63db613538 100644
--- a/src/KOKKOS/pair_table_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_table_rx_kokkos.cpp
@@ -31,6 +31,7 @@
 #include "error.h"
 #include "atom_masks.h"
 #include "fix.h"
+#include <cassert>
 
 using namespace LAMMPS_NS;
 
@@ -45,6 +46,92 @@ enum{NONE,RLINEAR,RSQ,BMP};
 #define OneFluidValue (-1)
 #define isOneFluid(_site_) ( (_site_) == OneFluidValue )
 
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void getMixingWeights(
+    typename ArrayTypes<DeviceType>::t_float_2d_randomread dvector,
+    int nspecies,
+    int isite1, int isite2,
+    bool fractionalWeighting,
+    int id,
+    double &mixWtSite1old, double &mixWtSite2old,
+    double &mixWtSite1, double &mixWtSite2) {
+  double fractionOFAold, fractionOFA;
+  double fractionOld1, fraction1;
+  double fractionOld2, fraction2;
+  double nMoleculesOFAold, nMoleculesOFA;
+  double nMoleculesOld1, nMolecules1;
+  double nMoleculesOld2, nMolecules2;
+  double nTotal, nTotalOld;
+
+  nTotal = 0.0;
+  nTotalOld = 0.0;
+  assert(id >= 0);
+  assert(id < dvector.dimension_1());
+  for (int ispecies = 0; ispecies < nspecies; ++ispecies){
+    assert(ispecies < dvector.dimension_0());
+    nTotal += dvector(ispecies,id);
+    assert(ispecies+nspecies < dvector.dimension_0());
+    nTotalOld += dvector(ispecies+nspecies,id);
+  }
+
+  assert(isite1 >= 0);
+  assert(isite1 < nspecies);
+  assert(isite2 >= 0);
+  assert(isite2 < nspecies);
+  if (isOneFluid(isite1) == false){
+    nMoleculesOld1 = dvector(isite1+nspecies,id);
+    nMolecules1 = dvector(isite1,id);
+    fractionOld1 = nMoleculesOld1/nTotalOld;
+    fraction1 = nMolecules1/nTotal;
+  }
+  if (isOneFluid(isite2) == false){
+    nMoleculesOld2 = dvector(isite2+nspecies,id);
+    nMolecules2 = dvector(isite2,id);
+    fractionOld2 = nMoleculesOld2/nTotalOld;
+    fraction2 = nMolecules2/nTotal;
+  }
+
+  if (isOneFluid(isite1) || isOneFluid(isite2)){
+    nMoleculesOFAold  = 0.0;
+    nMoleculesOFA  = 0.0;
+    fractionOFAold  = 0.0;
+    fractionOFA  = 0.0;
+
+    for (int ispecies = 0; ispecies < nspecies; ispecies++){
+      if (isite1 == ispecies || isite2 == ispecies) continue;
+      nMoleculesOFAold += dvector(ispecies+nspecies,id);
+      nMoleculesOFA += dvector(ispecies,id);
+      fractionOFAold += dvector(ispecies+nspecies,id)/nTotalOld;
+      fractionOFA += dvector(ispecies,id)/nTotal;
+    }
+    if(isOneFluid(isite1)){
+      nMoleculesOld1 = 1.0-(nTotalOld-nMoleculesOFAold);
+      nMolecules1 = 1.0-(nTotal-nMoleculesOFA);
+      fractionOld1 = fractionOFAold;
+      fraction1 = fractionOFA;
+    }
+    if(isOneFluid(isite2)){
+      nMoleculesOld2 = 1.0-(nTotalOld-nMoleculesOFAold);
+      nMolecules2 = 1.0-(nTotal-nMoleculesOFA);
+      fractionOld2 = fractionOFAold;
+      fraction2 = fractionOFA;
+    }
+  }
+
+  if(fractionalWeighting){
+    mixWtSite1old = fractionOld1;
+    mixWtSite1 = fraction1;
+    mixWtSite2old = fractionOld2;
+    mixWtSite2 = fraction2;
+  } else {
+    mixWtSite1old = nMoleculesOld1;
+    mixWtSite1 = nMolecules1;
+    mixWtSite2old = nMoleculesOld2;
+    mixWtSite2 = nMolecules2;
+  }
+}
+
 /* ---------------------------------------------------------------------- */
 
 template<class DeviceType>
@@ -280,6 +367,24 @@ operator()(const int i, value_type &energy_virial) const {
 //else energy_virial += compute_item<1,0>(i);
 }
 
+template<class DeviceType>
+static void getAllMixingWeights(
+    int ntotal,
+    typename ArrayTypes<DeviceType>::t_float_2d_randomread dvector,
+    int nspecies,
+    int isite1, int isite2,
+    bool fractionalWeighting,
+    Kokkos::View<double*, DeviceType> mixWtSite1old,
+    Kokkos::View<double*, DeviceType> mixWtSite2old,
+    Kokkos::View<double*, DeviceType> mixWtSite1,
+    Kokkos::View<double*, DeviceType> mixWtSite2) {
+  Kokkos::parallel_for(ntotal,
+  LAMMPS_LAMBDA(int i) {
+      getMixingWeights<DeviceType>(dvector,nspecies,isite1,isite2,fractionalWeighting,
+        i, mixWtSite1old(i), mixWtSite2old(i), mixWtSite1(i), mixWtSite2(i));
+  });
+}
+
 template<class DeviceType>
 template<int TABSTYLE>
 void PairTableRXKokkos<DeviceType>::compute_style(int eflag_in, int vflag_in)
@@ -320,12 +425,9 @@ void PairTableRXKokkos<DeviceType>::compute_style(int eflag_in, int vflag_in)
   mixWtSite1_ = Kokkos::View<double*, DeviceType>("PairTableRXKokkos::mixWtSite1", ntotal);
   mixWtSite2_ = Kokkos::View<double*, DeviceType>("PairTableRXKokkos::mixWtSite2", ntotal);
 
-  typename DAT::t_float_2d_randomread d_dvector = atomKK->k_dvector.view<DeviceType>();
-
-//Kokkos::parallel_for(ntotal, LAMMPS_LAMBDA(int i) {
-//  getMixingWeights<DeviceType>(d_dvector, i, mixWtSite1old_(i), mixWtSite2old_(i),
-//      mixWtSite1_(i), mixWtSite2_(i));
-//});
+  getAllMixingWeights(ntotal, atomKK->k_dvector.template view<DeviceType>(),
+      nspecies, isite1, isite2, fractionalWeighting,
+      mixWtSite1old_, mixWtSite2old_, mixWtSite1_, mixWtSite2_);
 
   if (neighflag == N2) error->all(FLERR,"pair table/rx/kk can't handle N2 yet\n");
 
@@ -848,9 +950,13 @@ double PairTableRXKokkos<DeviceType>::single(int i, int j, int itype, int jtype,
   atomKK->k_dvector.template sync<LMPHostType>();
   typename ArrayTypes<LMPHostType>::t_float_2d_randomread h_dvector =
     atomKK->k_dvector.view<LMPHostType>();
-  getMixingWeights<LMPHostType>(h_dvector,i,mixWtSite1old_i,mixWtSite2old_i,
+  getMixingWeights<LMPHostType>(h_dvector,
+      nspecies, isite1, isite2, fractionalWeighting,
+      i,mixWtSite1old_i,mixWtSite2old_i,
       mixWtSite1_i,mixWtSite2_i);
-  getMixingWeights<LMPHostType>(h_dvector,j,mixWtSite1old_j,mixWtSite2old_j,
+  getMixingWeights<LMPHostType>(h_dvector,
+      nspecies, isite1, isite2, fractionalWeighting,
+      j,mixWtSite1old_j,mixWtSite2old_j,
       mixWtSite1_j,mixWtSite2_j);
 
   if (rsq < tb->innersq) error->one(FLERR,"Pair distance < table inner cutoff");
@@ -948,83 +1054,6 @@ void PairTableRXKokkos<DeviceType>::cleanup_copy() {
   vatom = NULL;
   h_table=NULL; d_table=NULL;
 }
-
-template<class DeviceType>
-template<class ExecDevice>
-KOKKOS_INLINE_FUNCTION
-void PairTableRXKokkos<DeviceType>::getMixingWeights(
-    typename ArrayTypes<ExecDevice>::t_float_2d_randomread dvector,
-    int id,
-    double &mixWtSite1old, double &mixWtSite2old,
-    double &mixWtSite1, double &mixWtSite2) {
-  double fractionOFAold, fractionOFA;
-  double fractionOld1, fraction1;
-  double fractionOld2, fraction2;
-  double nMoleculesOFAold, nMoleculesOFA;
-  double nMoleculesOld1, nMolecules1;
-  double nMoleculesOld2, nMolecules2;
-  double nTotal, nTotalOld;
-
-  nTotal = 0.0;
-  nTotalOld = 0.0;
-  for (int ispecies = 0; ispecies < nspecies; ++ispecies){
-    nTotal += dvector(ispecies,id);
-    nTotalOld += dvector(ispecies+nspecies,id);
-  }
-
-  if (isOneFluid(isite1) == false){
-    nMoleculesOld1 = dvector(isite1+nspecies,id);
-    nMolecules1 = dvector(isite1,id);
-    fractionOld1 = nMoleculesOld1/nTotalOld;
-    fraction1 = nMolecules1/nTotal;
-  }
-  if (isOneFluid(isite2) == false){
-    nMoleculesOld2 = dvector(isite2+nspecies,id);
-    nMolecules2 = dvector(isite2,id);
-    fractionOld2 = nMoleculesOld2/nTotalOld;
-    fraction2 = nMolecules2/nTotal;
-  }
-
-  if (isOneFluid(isite1) || isOneFluid(isite2)){
-    nMoleculesOFAold  = 0.0;
-    nMoleculesOFA  = 0.0;
-    fractionOFAold  = 0.0;
-    fractionOFA  = 0.0;
-
-    for (int ispecies = 0; ispecies < nspecies; ispecies++){
-      if (isite1 == ispecies || isite2 == ispecies) continue;
-      nMoleculesOFAold += dvector(ispecies+nspecies,id);
-      nMoleculesOFA += dvector(ispecies,id);
-      fractionOFAold += dvector(ispecies+nspecies,id)/nTotalOld;
-      fractionOFA += dvector(ispecies,id)/nTotal;
-    }
-    if(isOneFluid(isite1)){
-      nMoleculesOld1 = 1.0-(nTotalOld-nMoleculesOFAold);
-      nMolecules1 = 1.0-(nTotal-nMoleculesOFA);
-      fractionOld1 = fractionOFAold;
-      fraction1 = fractionOFA;
-    }
-    if(isOneFluid(isite2)){
-      nMoleculesOld2 = 1.0-(nTotalOld-nMoleculesOFAold);
-      nMolecules2 = 1.0-(nTotal-nMoleculesOFA);
-      fractionOld2 = fractionOFAold;
-      fraction2 = fractionOFA;
-    }
-  }
-
-  if(fractionalWeighting){
-    mixWtSite1old = fractionOld1;
-    mixWtSite1 = fraction1;
-    mixWtSite2old = fractionOld2;
-    mixWtSite2 = fraction2;
-  } else {
-    mixWtSite1old = nMoleculesOld1;
-    mixWtSite1 = nMolecules1;
-    mixWtSite2old = nMoleculesOld2;
-    mixWtSite2 = nMolecules2;
-  }
-}
-
 namespace LAMMPS_NS {
 template class PairTableRXKokkos<LMPDeviceType>;
 #ifdef KOKKOS_HAVE_CUDA

From cb9fdf7801bed0d31e08286fee04655f13ea0435 Mon Sep 17 00:00:00 2001
From: Dan Ibanez <daibane@sandia.gov>
Date: Wed, 11 Jan 2017 11:44:54 -0700
Subject: [PATCH 070/267] starting to separate compute_item from the class

---
 src/KOKKOS/neigh_list_kokkos.h      |  10 +-
 src/KOKKOS/pair_table_rx_kokkos.cpp | 211 ++++++++++++++++++----------
 src/KOKKOS/pair_table_rx_kokkos.h   |   7 -
 3 files changed, 148 insertions(+), 80 deletions(-)

diff --git a/src/KOKKOS/neigh_list_kokkos.h b/src/KOKKOS/neigh_list_kokkos.h
index 45e768927c..32e6e704ae 100644
--- a/src/KOKKOS/neigh_list_kokkos.h
+++ b/src/KOKKOS/neigh_list_kokkos.h
@@ -48,7 +48,7 @@ class AtomNeighborsConst
   const int num_neighs;
 
   KOKKOS_INLINE_FUNCTION
-  AtomNeighborsConst(int* const & firstneigh, const int & _num_neighs,
+  AtomNeighborsConst(const int* const & firstneigh, const int & _num_neighs,
                      const int & stride):
   _firstneigh(firstneigh), num_neighs(_num_neighs), _stride(stride) {};
   KOKKOS_INLINE_FUNCTION
@@ -87,6 +87,14 @@ public:
                          &d_neighbors(i,1)-&d_neighbors(i,0));
   }
 
+  KOKKOS_INLINE_FUNCTION
+  static AtomNeighborsConst static_neighbors_const(int i,
+           typename ArrayTypes<Device>::t_neighbors_2d_const d_neighbors,
+           typename ArrayTypes<Device>::t_int_1d d_numneigh) {
+    return AtomNeighborsConst(&d_neighbors(i,0),d_numneigh(i),
+                              &d_neighbors(i,1)-&d_neighbors(i,0));
+  }
+
   KOKKOS_INLINE_FUNCTION
   AtomNeighborsConst get_neighbors_const(const int &i) const {
     return AtomNeighborsConst(&d_neighbors(i,0),d_numneigh(i),
diff --git a/src/KOKKOS/pair_table_rx_kokkos.cpp b/src/KOKKOS/pair_table_rx_kokkos.cpp
index 63db613538..c96da87d2f 100644
--- a/src/KOKKOS/pair_table_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_table_rx_kokkos.cpp
@@ -31,6 +31,7 @@
 #include "error.h"
 #include "atom_masks.h"
 #include "fix.h"
+#include "kokkos_few.h"
 #include <cassert>
 
 using namespace LAMMPS_NS;
@@ -192,101 +193,167 @@ PairTableRXKokkos<DeviceType>::Functor<NEIGHFLAG,STACKPARAMS,TABSTYLE>::~Functor
 //list.clean_copy();
 }
 
-template<class DeviceType>
-template <int NEIGHFLAG, bool STACKPARAMS, int TABSTYLE>
-template<int EVFLAG, int NEWTON_PAIR>
+KOKKOS_INLINE_FUNCTION static int sbmask(const int& j) const
+{
+  return j >> SBBITS & 3;
+}
+
+template <class DeviceType, int NEIGHFLAG, bool STACKPARAMS, int TABSTYLE,
+          int EVFLAG, int NEWTON_PAIR>
 KOKKOS_INLINE_FUNCTION
-EV_FLOAT
-PairTableRXKokkos<DeviceType>::Functor<NEIGHFLAG,STACKPARAMS,TABSTYLE>::
-compute_item(const int& ii) const {
+static EV_FLOAT compute_item(int ii,
+    typename ArrayTypes<DeviceType>::t_in_1d_const d_ilist,
+    typename ArrayTypes<Device>::t_neighbors_2d_const d_neighbors,
+    typename ArrayTypes<DeviceType>::t_in_1d_const d_numneigh,
+    typename ArrayTypes<DeviceType>::t_x_array_randomread x,
+    typename ArrayTypes<DeviceType>::t_int_1d_randomread type,
+    Kokkos::View<double*, DeviceType> mixWtSite1old,
+    Kokkos::View<double*, DeviceType> mixWtSite2old,
+    Kokkos::View<double*, DeviceType> mixWtSite1,
+    Kokkos::View<double*, DeviceType> mixWtSite2,
+    Few<int, 4> special_lj,
+    Few<Few<F_FLOAT, MAX_TYPES_STACKPARAMS+1>, MAX_TYPES_STACKPARAMS+1> m_cutsq,
+    typename ArrayTypes<DeviceType>::t_ffloat_2d d_cutsq,
+    Kokkos::View<F_FLOAT*[3],
+      typename ArrayTypes<DeviceType>::t_f_array::array_layout,
+      DeviceType,
+      Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > f;
+    Kokkos::View<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,
+                 device_type,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > uCG;
+    Kokkos::View<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,
+                 device_type,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > uCGnew;
+    ) {
   EV_FLOAT ev;
-//const int i = list.d_ilist[ii];
-//const X_FLOAT xtmp = c.x(i,0);
-//const X_FLOAT ytmp = c.x(i,1);
-//const X_FLOAT ztmp = c.x(i,2);
-//const int itype = c.type(i);
+  auto i = d_ilist(ii);
+  auto xtmp = x(i,0);
+  auto ytmp = x(i,1);
+  auto ztmp = x(i,2);
+  auto itype = type(i);
 
-//const AtomNeighborsConst jlist = list.get_neighbors_const(i);
-//const int jnum = list.d_numneigh[i];
+  auto jlist = NeighListKokkos<DeviceType>::static_neighbors_const(i,
+      d_neighbors, d_numneigh);
+  auto jnum = d_numneigh(i);
 
-//double uCG_i = 0.0;
-//double uCGnew_i = 0.0;
-//double fx_i = 0.0, fy_i = 0.0, fz_i = 0.0;
+  double uCG_i = 0.0;
+  double uCGnew_i = 0.0;
+  double fx_i = 0.0, fy_i = 0.0, fz_i = 0.0;
 
-//double mixWtSite1old_i = c.mixWtSite1old_(i);
-//double mixWtSite2old_i = c.mixWtSite2old_(i);
-//double mixWtSite1_i = c.mixWtSite1_(i);
-//double mixWtSite2_i = c.mixWtSite2_(i);
+  auto mixWtSite1old_i = mixWtSite1old(i);
+  auto mixWtSite2old_i = mixWtSite2old(i);
+  auto mixWtSite1_i = mixWtSite1(i);
+  auto mixWtSite2_i = mixWtSite2(i);
 
-//for (int jj = 0; jj < jnum; jj++) {
-//  int j = jlist(jj);
-//  const F_FLOAT factor_lj = c.special_lj[sbmask(j)];
-//  j &= NEIGHMASK;
+  for (int jj = 0; jj < jnum; jj++) {
+    auto j = jlist(jj);
+    const F_FLOAT factor_lj = special_lj[sbmask(j)];
+    j &= NEIGHMASK;
 
-//  const X_FLOAT delx = xtmp - c.x(j,0);
-//  const X_FLOAT dely = ytmp - c.x(j,1);
-//  const X_FLOAT delz = ztmp - c.x(j,2);
-//  const F_FLOAT rsq = delx*delx + dely*dely + delz*delz;
-//  const int jtype = c.type(j);
+    auto delx = xtmp - x(j,0);
+    auto dely = ytmp - x(j,1);
+    auto delz = ztmp - x(j,2);
+    auto rsq = delx*delx + dely*dely + delz*delz;
+    auto jtype = type(j);
 
-//  if(rsq < (STACKPARAMS?c.m_cutsq[itype][jtype]:c.d_cutsq(itype,jtype))) {
-//    double mixWtSite1old_j = c.mixWtSite1old_(j);
-//    double mixWtSite2old_j = c.mixWtSite2old_(j);
-//    double mixWtSite1_j = c.mixWtSite1_(j);
-//    double mixWtSite2_j = c.mixWtSite2_(j);
+    if(rsq < (STACKPARAMS ? m_cutsq[itype][jtype] : d_cutsq(itype,jtype))) {
+      auto mixWtSite1old_j = mixWtSite1old_(j);
+      auto mixWtSite2old_j = mixWtSite2old_(j);
+      auto mixWtSite1_j = mixWtSite1_(j);
+      auto mixWtSite2_j = mixWtSite2_(j);
 
-//    const F_FLOAT fpair = factor_lj*c.template compute_fpair<STACKPARAMS,TABSTYLE>(rsq,i,j,itype,jtype);
+      auto fpair = factor_lj * compute_fpair<STACKPARAMS,TABSTYLE>(
+          rsq,i,j,itype,jtype);
 
-//    fx_i += delx*fpair;
-//    fy_i += dely*fpair;
-//    fz_i += delz*fpair;
+      fx_i += delx*fpair;
+      fy_i += dely*fpair;
+      fz_i += delz*fpair;
 
-//    bool do_half = (NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) &&
-//                   (NEWTON_PAIR || j < c.nlocal);
-//    if (do_half) {
-//      f(j,0) -= delx*fpair;
-//      f(j,1) -= dely*fpair;
-//      f(j,2) -= delz*fpair;
-//    }
+      auto do_half = (NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) &&
+                     (NEWTON_PAIR || j < c.nlocal);
+      if (do_half) {
+        f(j,0) -= delx*fpair;
+        f(j,1) -= dely*fpair;
+        f(j,2) -= delz*fpair;
+      }
 
-//    auto evdwl = c.template compute_evdwl<STACKPARAMS,TABSTYLE>(rsq,i,j,itype,jtype);
+      auto evdwl = compute_evdwl<STACKPARAMS,TABSTYLE>(
+          rsq,i,j,itype,jtype);
 
-//    double evdwlOld;
-//    if (c.isite1 == c.isite2) {
-//      evdwlOld = sqrt(mixWtSite1old_i*mixWtSite2old_j)*evdwl;
-//      evdwl = sqrt(mixWtSite1_i*mixWtSite2_j)*evdwl;
-//    } else {
-//      evdwlOld = (sqrt(mixWtSite1old_i*mixWtSite2old_j) +
-//                  sqrt(mixWtSite2old_i*mixWtSite1old_j))*evdwl;
-//      evdwl = (sqrt(mixWtSite1_i*mixWtSite2_j) +
-//               sqrt(mixWtSite2_i*mixWtSite1_j))*evdwl;
-//    }
-//    evdwlOld *= factor_lj;
-//    evdwl *= factor_lj;
+      double evdwlOld;
+      if (c.isite1 == c.isite2) {
+        evdwlOld = sqrt(mixWtSite1old_i*mixWtSite2old_j)*evdwl;
+        evdwl = sqrt(mixWtSite1_i*mixWtSite2_j)*evdwl;
+      } else {
+        evdwlOld = (sqrt(mixWtSite1old_i*mixWtSite2old_j) +
+                    sqrt(mixWtSite2old_i*mixWtSite1old_j))*evdwl;
+        evdwl = (sqrt(mixWtSite1_i*mixWtSite2_j) +
+                 sqrt(mixWtSite2_i*mixWtSite1_j))*evdwl;
+      }
+      evdwlOld *= factor_lj;
+      evdwl *= factor_lj;
 
-//    uCG_i += 0.5*evdwlOld;
-//    if (do_half) uCG(j) += 0.5*evdwlOld;
+      uCG_i += 0.5*evdwlOld;
+      if (do_half) uCG(j) += 0.5*evdwlOld;
 
-//    uCGnew_i += 0.5*evdwl;
-//    if (do_half) uCGnew(j) += 0.5*evdwl;
-//    evdwl = evdwlOld;
+      uCGnew_i += 0.5*evdwl;
+      if (do_half) uCGnew(j) += 0.5*evdwl;
+      evdwl = evdwlOld;
 
-//    ev.evdwl += (do_half ? 1.0 : 0.5)*evdwl;
+      ev.evdwl += (do_half ? 1.0 : 0.5)*evdwl;
 
-//    if (EVFLAG) ev_tally(ev,i,j,evdwl,fpair,delx,dely,delz);
-//  }
-//}
+      if (EVFLAG) {
+        ev_tally(ev,i,j,evdwl,fpair,delx,dely,delz);
+      }
+    }
+  }
 
-//uCG(i) += uCG_i;
-//uCGnew(i) += uCGnew_i;
+  uCG(i) += uCG_i;
+  uCGnew(i) += uCGnew_i;
 
-//f(i,0) += fx_i;
-//f(i,1) += fy_i;
-//f(i,2) += fz_i;
+  f(i,0) += fx_i;
+  f(i,1) += fy_i;
+  f(i,2) += fz_i;
 
   return ev;
 }
 
+template<class DeviceType, int NEIGHFLAG, bool STACKPARAMS, int TABSTYLE>
+static void compute_all_items(
+    int eflag, int vflag,
+    int newton_pair,
+    EV_FLOAT& ev,
+    Kokkos::View<double*, DeviceType> mixWtSite1old,
+    Kokkos::View<double*, DeviceType> mixWtSite2old,
+    Kokkos::View<double*, DeviceType> mixWtSite1,
+    Kokkos::View<double*, DeviceType> mixWtSite2,
+    int inum,
+  if (eflag || vflag) {
+    Kokkos::parallel_reduce(inum,
+    LAMMPS_LAMBDA(int i, EV_FLOAT& energy_virial) {
+      if (newton_pair) {
+        energy_virial +=
+          compute_item<DeviceType,NEIGHFLAG,STACKPARAMS,TABSTYLE,1,1>(
+            );
+      } else {
+        energy_virial +=
+          compute_item<DeviceType,NEIGHFLAG,STACKPARAMS,TABSTYLE,1,0>(
+            );
+        energy_virial += compute_item<1,0>(i);
+      }
+    }, ev);
+  } else {
+    Kokkos::parallel_for(inum,
+    LAMMPS_LAMBDA(int i) {
+      if (newton_pair) {
+        compute_item<DeviceType,NEIGHFLAG,STACKPARAMS,TABSTYLE,1,1>(
+          );
+      } else {
+        compute_item<DeviceType,NEIGHFLAG,STACKPARAMS,TABSTYLE,1,0>(
+          );
+      }
+    }, ev);
+  }
+}
+
 template<class DeviceType>
 template <int NEIGHFLAG, bool STACKPARAMS, int TABSTYLE>
 KOKKOS_INLINE_FUNCTION
diff --git a/src/KOKKOS/pair_table_rx_kokkos.h b/src/KOKKOS/pair_table_rx_kokkos.h
index b71f57076d..33f96d4c32 100644
--- a/src/KOKKOS/pair_table_rx_kokkos.h
+++ b/src/KOKKOS/pair_table_rx_kokkos.h
@@ -85,10 +85,8 @@ class PairTableRXKokkos : public PairTable {
   virtual void allocate();
   void compute_table(Table *);
 
-  typename ArrayTypes<DeviceType>::t_x_array_randomread x;
   typename ArrayTypes<DeviceType>::t_x_array_const c_x;
   typename ArrayTypes<DeviceType>::t_f_array f;
-  typename ArrayTypes<DeviceType>::t_int_1d_randomread type;
   typename ArrayTypes<DeviceType>::t_efloat_1d uCG;
   typename ArrayTypes<DeviceType>::t_efloat_1d uCGnew;
   typename ArrayTypes<DeviceType>::t_efloat_1d d_eatom;
@@ -117,11 +115,6 @@ class PairTableRXKokkos : public PairTable {
   int isite1, isite2;
   bool fractionalWeighting;
 
-  template <class ExecDevice>
-  KOKKOS_INLINE_FUNCTION
-  void getMixingWeights(typename ArrayTypes<ExecDevice>::t_float_2d_randomread,
-      int, double &, double &, double &, double &);
-
   Kokkos::View<double*, DeviceType> mixWtSite1old_;
   Kokkos::View<double*, DeviceType> mixWtSite2old_;
   Kokkos::View<double*, DeviceType> mixWtSite1_;

From c2bb20e60f8396158f6f712386a54a75b7d5ac43 Mon Sep 17 00:00:00 2001
From: Dan Ibanez <daibane@sandia.gov>
Date: Wed, 11 Jan 2017 11:54:01 -0700
Subject: [PATCH 071/267] made compute_fpair a free function as well

---
 src/KOKKOS/pair_table_rx_kokkos.cpp | 81 +++++++++++++++--------------
 src/KOKKOS/pair_table_rx_kokkos.h   |  4 --
 2 files changed, 43 insertions(+), 42 deletions(-)

diff --git a/src/KOKKOS/pair_table_rx_kokkos.cpp b/src/KOKKOS/pair_table_rx_kokkos.cpp
index c96da87d2f..26c5de87e4 100644
--- a/src/KOKKOS/pair_table_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_table_rx_kokkos.cpp
@@ -198,6 +198,40 @@ KOKKOS_INLINE_FUNCTION static int sbmask(const int& j) const
   return j >> SBBITS & 3;
 }
 
+template <class DeviceType, int TABSTYLE>
+KOKKOS_INLINE_FUNCTION
+static F_FLOAT
+compute_fpair(F_FLOAT rsq,
+              int itype, int jtype,
+              PairTableRXKokkos<DeviceType>::TableDeviceConst d_table_const,
+              ) {
+  union_int_float_t rsq_lookup;
+  double fpair;
+  const int tidx = d_table_const.tabindex(itype,jtype);
+  if (TABSTYLE == LOOKUP) {
+    const int itable = static_cast<int> ((rsq - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx));
+    fpair = d_table_const.f(tidx,itable);
+  } else if (TABSTYLE == LINEAR) {
+    const int itable = static_cast<int> ((rsq - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx));
+    const double fraction = (rsq - d_table_const.rsq(tidx,itable)) * d_table_const.invdelta(tidx);
+    fpair = d_table_const.f(tidx,itable) + fraction*d_table_const.df(tidx,itable);
+  } else if (TABSTYLE == SPLINE) {
+    const int itable = static_cast<int> ((rsq - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx));
+    const double b = (rsq - d_table_const.rsq(tidx,itable)) * d_table_const.invdelta(tidx);
+    const double a = 1.0 - b;
+    fpair = a * d_table_const.f(tidx,itable) + b * d_table_const.f(tidx,itable+1) +
+      ((a*a*a-a)*d_table_const.f2(tidx,itable) + (b*b*b-b)*d_table_const.f2(tidx,itable+1)) *
+      d_table_const.deltasq6(tidx);
+  } else {
+    rsq_lookup.f = rsq;
+    int itable = rsq_lookup.i & d_table_const.nmask(tidx);
+    itable >>= d_table_const.nshiftbits(tidx);
+    const double fraction = (rsq_lookup.f - d_table_const.rsq(tidx,itable)) * d_table_const.drsq(tidx,itable);
+    fpair = d_table_const.f(tidx,itable) + fraction*d_table_const.df(tidx,itable);
+  }
+  return fpair;
+}
+
 template <class DeviceType, int NEIGHFLAG, bool STACKPARAMS, int TABSTYLE,
           int EVFLAG, int NEWTON_PAIR>
 KOKKOS_INLINE_FUNCTION
@@ -222,6 +256,8 @@ static EV_FLOAT compute_item(int ii,
                  device_type,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > uCG;
     Kokkos::View<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,
                  device_type,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > uCGnew;
+    int isite1, int isite2,
+    PairTableRXKokkos<DeviceType>::TableDeviceConst d_table_const,
     ) {
   EV_FLOAT ev;
   auto i = d_ilist(ii);
@@ -260,8 +296,12 @@ static EV_FLOAT compute_item(int ii,
       auto mixWtSite1_j = mixWtSite1_(j);
       auto mixWtSite2_j = mixWtSite2_(j);
 
-      auto fpair = factor_lj * compute_fpair<STACKPARAMS,TABSTYLE>(
-          rsq,i,j,itype,jtype);
+      auto fpair = factor_lj * compute_fpair<DeviceType,TABSTYLE>(
+          rsq,itype,jtype,d_table_const);
+
+      if (isite1 == isite2) fpair *= sqrt(mixWtSite1old_i * mixWtSite2old_j);
+      else fpair *= (sqrt(mixWtSite1old_i * mixWtSite2old_j) +
+                     sqrt(mixWtSite2old_i * mixWtSite1old_j));
 
       fx_i += delx*fpair;
       fy_i += dely*fpair;
@@ -279,7 +319,7 @@ static EV_FLOAT compute_item(int ii,
           rsq,i,j,itype,jtype);
 
       double evdwlOld;
-      if (c.isite1 == c.isite2) {
+      if (isite1 == isite2) {
         evdwlOld = sqrt(mixWtSite1old_i*mixWtSite2old_j)*evdwl;
         evdwl = sqrt(mixWtSite1_i*mixWtSite2_j)*evdwl;
       } else {
@@ -550,41 +590,6 @@ void PairTableRXKokkos<DeviceType>::compute_style(int eflag_in, int vflag_in)
   if (vflag_fdotr) pair_virial_fdotr_compute(this);
 }
 
-template<class DeviceType>
-template<bool STACKPARAMS, int TABSTYLE>
-KOKKOS_INLINE_FUNCTION
-F_FLOAT PairTableRXKokkos<DeviceType>::
-compute_fpair(const F_FLOAT& rsq, const int& i, const int&j, const int& itype, const int& jtype) const {
-  union_int_float_t rsq_lookup;
-  double fpair;
-  const int tidx = d_table_const.tabindex(itype,jtype);
-  if (TABSTYLE == LOOKUP) {
-    const int itable = static_cast<int> ((rsq - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx));
-    fpair = d_table_const.f(tidx,itable);
-  } else if (TABSTYLE == LINEAR) {
-    const int itable = static_cast<int> ((rsq - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx));
-    const double fraction = (rsq - d_table_const.rsq(tidx,itable)) * d_table_const.invdelta(tidx);
-    fpair = d_table_const.f(tidx,itable) + fraction*d_table_const.df(tidx,itable);
-  } else if (TABSTYLE == SPLINE) {
-    const int itable = static_cast<int> ((rsq - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx));
-    const double b = (rsq - d_table_const.rsq(tidx,itable)) * d_table_const.invdelta(tidx);
-    const double a = 1.0 - b;
-    fpair = a * d_table_const.f(tidx,itable) + b * d_table_const.f(tidx,itable+1) +
-      ((a*a*a-a)*d_table_const.f2(tidx,itable) + (b*b*b-b)*d_table_const.f2(tidx,itable+1)) *
-      d_table_const.deltasq6(tidx);
-  } else {
-    rsq_lookup.f = rsq;
-    int itable = rsq_lookup.i & d_table_const.nmask(tidx);
-    itable >>= d_table_const.nshiftbits(tidx);
-    const double fraction = (rsq_lookup.f - d_table_const.rsq(tidx,itable)) * d_table_const.drsq(tidx,itable);
-    fpair = d_table_const.f(tidx,itable) + fraction*d_table_const.df(tidx,itable);
-  }
-  if (isite1 == isite2) fpair *= sqrt(mixWtSite1old_(i) * mixWtSite2old_(j));
-  else fpair *= (sqrt(mixWtSite1old_(i) * mixWtSite2old_(j)) +
-                 sqrt(mixWtSite2old_(i) * mixWtSite1old_(j)));
-  return fpair;
-}
-
 template<class DeviceType>
 template<bool STACKPARAMS, int TABSTYLE>
 KOKKOS_INLINE_FUNCTION
diff --git a/src/KOKKOS/pair_table_rx_kokkos.h b/src/KOKKOS/pair_table_rx_kokkos.h
index 33f96d4c32..b2814adcec 100644
--- a/src/KOKKOS/pair_table_rx_kokkos.h
+++ b/src/KOKKOS/pair_table_rx_kokkos.h
@@ -98,10 +98,6 @@ class PairTableRXKokkos : public PairTable {
   void create_kokkos_tables();
   void cleanup_copy();
 
-  template<bool STACKPARAMS, int TABSTYLE>
-  KOKKOS_INLINE_FUNCTION
-  F_FLOAT compute_fpair(const F_FLOAT& rsq, const int& i, const int&j, const int& itype, const int& jtype) const;
-
   template<bool STACKPARAMS, int TABSTYLE>
   KOKKOS_INLINE_FUNCTION
   F_FLOAT compute_evdwl(const F_FLOAT& rsq, const int& i, const int&j, const int& itype, const int& jtype) const;

From 41804ff52464f6c60ae3a0b32ead2f0a5386d733 Mon Sep 17 00:00:00 2001
From: Dan Ibanez <dan.a.ibanez@gmail.com>
Date: Wed, 11 Jan 2017 12:42:05 -0700
Subject: [PATCH 072/267] progress converting compute_style

---
 src/KOKKOS/pair_table_rx_kokkos.cpp | 234 +++++++++++++++-------------
 src/KOKKOS/pair_table_rx_kokkos.h   |  11 +-
 2 files changed, 126 insertions(+), 119 deletions(-)

diff --git a/src/KOKKOS/pair_table_rx_kokkos.cpp b/src/KOKKOS/pair_table_rx_kokkos.cpp
index 26c5de87e4..a9703dd927 100644
--- a/src/KOKKOS/pair_table_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_table_rx_kokkos.cpp
@@ -232,10 +232,48 @@ compute_fpair(F_FLOAT rsq,
   return fpair;
 }
 
+template<class DeviceType, int TABSTYLE>
+KOKKOS_INLINE_FUNCTION
+static F_FLOAT
+compute_evdwl(
+    F_FLOAT rsq,
+    int itype, int jtype,
+    PairTableRXKokkos<DeviceType>::TableDeviceConst d_table_const,
+    ) const {
+  double evdwl;
+  union_int_float_t rsq_lookup;
+  const int tidx = d_table_const.tabindex(itype,jtype);
+  if (TABSTYLE == LOOKUP) {
+    const int itable = static_cast<int> ((rsq - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx));
+    evdwl = d_table_const.e(tidx,itable);
+  } else if (TABSTYLE == LINEAR) {
+    const int itable = static_cast<int> ((rsq - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx));
+    const double fraction = (rsq - d_table_const.rsq(tidx,itable)) * d_table_const.invdelta(tidx);
+    evdwl = d_table_const.e(tidx,itable) + fraction*d_table_const.de(tidx,itable);
+  } else if (TABSTYLE == SPLINE) {
+    const int itable = static_cast<int> ((rsq - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx));
+    const double b = (rsq - d_table_const.rsq(tidx,itable)) * d_table_const.invdelta(tidx);
+    const double a = 1.0 - b;
+    evdwl = a * d_table_const.e(tidx,itable) + b * d_table_const.e(tidx,itable+1) +
+        ((a*a*a-a)*d_table_const.e2(tidx,itable) + (b*b*b-b)*d_table_const.e2(tidx,itable+1)) *
+        d_table_const.deltasq6(tidx);
+  } else {
+    rsq_lookup.f = rsq;
+    int itable = rsq_lookup.i & d_table_const.nmask(tidx);
+    itable >>= d_table_const.nshiftbits(tidx);
+    const double fraction = (rsq_lookup.f - d_table_const.rsq(tidx,itable)) * d_table_const.drsq(tidx,itable);
+    evdwl = d_table_const.e(tidx,itable) + fraction*d_table_const.de(tidx,itable);
+  }
+  return evdwl;
+}
+
 template <class DeviceType, int NEIGHFLAG, bool STACKPARAMS, int TABSTYLE,
           int EVFLAG, int NEWTON_PAIR>
 KOKKOS_INLINE_FUNCTION
-static EV_FLOAT compute_item(int ii,
+static EV_FLOAT
+compute_item(
+    int ii,
+    int nlocal,
     typename ArrayTypes<DeviceType>::t_in_1d_const d_ilist,
     typename ArrayTypes<Device>::t_neighbors_2d_const d_neighbors,
     typename ArrayTypes<DeviceType>::t_in_1d_const d_numneigh,
@@ -308,7 +346,7 @@ static EV_FLOAT compute_item(int ii,
       fz_i += delz*fpair;
 
       auto do_half = (NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) &&
-                     (NEWTON_PAIR || j < c.nlocal);
+                     (NEWTON_PAIR || j < nlocal);
       if (do_half) {
         f(j,0) -= delx*fpair;
         f(j,1) -= dely*fpair;
@@ -316,7 +354,7 @@ static EV_FLOAT compute_item(int ii,
       }
 
       auto evdwl = compute_evdwl<STACKPARAMS,TABSTYLE>(
-          rsq,i,j,itype,jtype);
+          rsq,itype,jtype,d_table_const);
 
       double evdwlOld;
       if (isite1 == isite2) {
@@ -361,22 +399,47 @@ static void compute_all_items(
     int eflag, int vflag,
     int newton_pair,
     EV_FLOAT& ev,
+    int nlocal,
+    int inum,
+    typename ArrayTypes<DeviceType>::t_in_1d_const d_ilist,
+    typename ArrayTypes<Device>::t_neighbors_2d_const d_neighbors,
+    typename ArrayTypes<DeviceType>::t_in_1d_const d_numneigh,
+    typename ArrayTypes<DeviceType>::t_x_array_randomread x,
+    typename ArrayTypes<DeviceType>::t_int_1d_randomread type,
     Kokkos::View<double*, DeviceType> mixWtSite1old,
     Kokkos::View<double*, DeviceType> mixWtSite2old,
     Kokkos::View<double*, DeviceType> mixWtSite1,
     Kokkos::View<double*, DeviceType> mixWtSite2,
-    int inum,
+    Few<int, 4> special_lj,
+    Few<Few<F_FLOAT, MAX_TYPES_STACKPARAMS+1>, MAX_TYPES_STACKPARAMS+1> m_cutsq,
+    typename ArrayTypes<DeviceType>::t_ffloat_2d d_cutsq,
+    Kokkos::View<F_FLOAT*[3],
+      typename ArrayTypes<DeviceType>::t_f_array::array_layout,
+      DeviceType,
+      Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > f;
+    Kokkos::View<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,
+                 device_type,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > uCG;
+    Kokkos::View<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,
+                 device_type,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > uCGnew;
+    int isite1, int isite2,
+    PairTableRXKokkos<DeviceType>::TableDeviceConst d_table_const) {
   if (eflag || vflag) {
     Kokkos::parallel_reduce(inum,
     LAMMPS_LAMBDA(int i, EV_FLOAT& energy_virial) {
       if (newton_pair) {
         energy_virial +=
           compute_item<DeviceType,NEIGHFLAG,STACKPARAMS,TABSTYLE,1,1>(
-            );
+            i, nlocal, d_ilist, d_neighbors, d_numneigh, x, type,
+            mixWtSite1old, mixWtSite2old, mixWtSite1, mixWtSite2,
+            special_lj, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
+            d_table_const);
       } else {
         energy_virial +=
           compute_item<DeviceType,NEIGHFLAG,STACKPARAMS,TABSTYLE,1,0>(
-            );
+            i, nlocal, d_ilist, d_neighbors, d_numneigh, x, type,
+            mixWtSite1old, mixWtSite2old, mixWtSite1, mixWtSite2,
+            special_lj, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
+            d_table_const);
         energy_virial += compute_item<1,0>(i);
       }
     }, ev);
@@ -385,10 +448,16 @@ static void compute_all_items(
     LAMMPS_LAMBDA(int i) {
       if (newton_pair) {
         compute_item<DeviceType,NEIGHFLAG,STACKPARAMS,TABSTYLE,1,1>(
-          );
+            i, nlocal, d_ilist, d_neighbors, d_numneigh, x, type,
+            mixWtSite1old, mixWtSite2old, mixWtSite1, mixWtSite2,
+            special_lj, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
+            d_table_const);
       } else {
         compute_item<DeviceType,NEIGHFLAG,STACKPARAMS,TABSTYLE,1,0>(
-          );
+            i, nlocal, d_ilist, d_neighbors, d_numneigh, x, type,
+            mixWtSite1old, mixWtSite2old, mixWtSite1, mixWtSite2,
+            special_lj, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
+            d_table_const);
       }
     }, ev);
   }
@@ -454,26 +523,6 @@ ev_tally(EV_FLOAT &ev, const int &i, const int &j,
 //}
 }
 
-template<class DeviceType>
-template <int NEIGHFLAG, bool STACKPARAMS, int TABSTYLE>
-KOKKOS_INLINE_FUNCTION
-void
-PairTableRXKokkos<DeviceType>::Functor<NEIGHFLAG,STACKPARAMS,TABSTYLE>::
-operator()(const int i) const {
-//if (c.newton_pair) compute_item<0,1>(i);
-//else compute_item<0,0>(i);
-}
-
-template<class DeviceType>
-template <int NEIGHFLAG, bool STACKPARAMS, int TABSTYLE>
-KOKKOS_INLINE_FUNCTION
-void
-PairTableRXKokkos<DeviceType>::Functor<NEIGHFLAG,STACKPARAMS,TABSTYLE>::
-operator()(const int i, value_type &energy_virial) const {
-//if (c.newton_pair) energy_virial += compute_item<1,1>(i);
-//else energy_virial += compute_item<1,0>(i);
-}
-
 template<class DeviceType>
 static void getAllMixingWeights(
     int ntotal,
@@ -496,8 +545,8 @@ template<class DeviceType>
 template<int TABSTYLE>
 void PairTableRXKokkos<DeviceType>::compute_style(int eflag_in, int vflag_in)
 {
-  eflag = eflag_in;
-  vflag = vflag_in;
+  auto eflag = eflag_in;
+  auto vflag = vflag_in;
 
   if (neighflag == FULL) no_virial_fdotr_compute = 1;
 
@@ -511,69 +560,68 @@ void PairTableRXKokkos<DeviceType>::compute_style(int eflag_in, int vflag_in)
   if (eflag || vflag) atomKK->modified(execution_space,datamask_modify);
   else atomKK->modified(execution_space,F_MASK);
 
-  x = c_x = atomKK->k_x.view<DeviceType>();
-  f = atomKK->k_f.view<DeviceType>();
-  type = atomKK->k_type.view<DeviceType>();
-  uCG = atomKK->k_uCG.view<DeviceType>();
-  uCGnew = atomKK->k_uCGnew.view<DeviceType>();
-  nlocal = atom->nlocal;
-  nall = atom->nlocal + atom->nghost;
-  special_lj[0] = force->special_lj[0];
-  special_lj[1] = force->special_lj[1];
-  special_lj[2] = force->special_lj[2];
-  special_lj[3] = force->special_lj[3];
-  newton_pair = force->newton_pair;
+  auto x = atomKK->k_x.view<DeviceType>();
+  auto f = atomKK->k_f.view<DeviceType>();
+  auto type = atomKK->k_type.view<DeviceType>();
+  auto uCG = atomKK->k_uCG.view<DeviceType>();
+  auto uCGnew = atomKK->k_uCGnew.view<DeviceType>();
+  auto nlocal = atom->nlocal;
+  Few<int, 4> special_lj_local;
+  special_lj_local[0] = force->special_lj[0];
+  special_lj_local[1] = force->special_lj[1];
+  special_lj_local[2] = force->special_lj[2];
+  special_lj_local[3] = force->special_lj[3];
+  auto newton_pair = force->newton_pair;
   d_cutsq = d_table->cutsq;
   // loop over neighbors of my atoms
 
   const int ntotal = atom->nlocal + atom->nghost;
-  mixWtSite1old_ = Kokkos::View<double*, DeviceType>("PairTableRXKokkos::mixWtSite1old", ntotal);
-  mixWtSite2old_ = Kokkos::View<double*, DeviceType>("PairTableRXKokkos::mixWtSite2old", ntotal);
-  mixWtSite1_ = Kokkos::View<double*, DeviceType>("PairTableRXKokkos::mixWtSite1", ntotal);
-  mixWtSite2_ = Kokkos::View<double*, DeviceType>("PairTableRXKokkos::mixWtSite2", ntotal);
+  auto mixWtSite1old = Kokkos::View<double*, DeviceType>("PairTableRXKokkos::mixWtSite1old", ntotal);
+  auto mixWtSite2old = Kokkos::View<double*, DeviceType>("PairTableRXKokkos::mixWtSite2old", ntotal);
+  auto mixWtSite1 = Kokkos::View<double*, DeviceType>("PairTableRXKokkos::mixWtSite1", ntotal);
+  auto mixWtSite2 = Kokkos::View<double*, DeviceType>("PairTableRXKokkos::mixWtSite2", ntotal);
 
   getAllMixingWeights(ntotal, atomKK->k_dvector.template view<DeviceType>(),
       nspecies, isite1, isite2, fractionalWeighting,
-      mixWtSite1old_, mixWtSite2old_, mixWtSite1_, mixWtSite2_);
+      mixWtSite1old, mixWtSite2old, mixWtSite1, mixWtSite2);
 
   if (neighflag == N2) error->all(FLERR,"pair table/rx/kk can't handle N2 yet\n");
 
+  NeighListKokkos<DeviceType>* l =
+    dynamic_cast<NeighListKokkos<DeviceType>*>(list);
+
   EV_FLOAT ev;
   if(atom->ntypes > MAX_TYPES_STACKPARAMS) {
-    if (neighflag == FULL) {
-      Functor<FULL,false,TABSTYLE> ff(this,(NeighListKokkos<DeviceType>*) list);
-      if (eflag || vflag) Kokkos::parallel_reduce(list->inum,ff,ev);
-      else Kokkos::parallel_for(list->inum,ff);
-    } else if (neighflag == HALFTHREAD) {
-      Functor<HALFTHREAD,false,TABSTYLE> ff(this,(NeighListKokkos<DeviceType>*) list);
-      if (eflag || vflag) Kokkos::parallel_reduce(list->inum,ff,ev);
-      else Kokkos::parallel_for(list->inum,ff);
+    if (neighflag == HALFTHREAD) {
+      compute_all_items<DeviceType,HALFTHREAD,false,TABSTYLE>(
+          eflag, vflag, newton_pair, ev, nlocal,
+          l->inum, l->d_ilist, l->d_neighbors, l->d_numneigh,
+          x, type, mixWtSite1old, mixWtSite2old, mixWtSite1, mixWtSite2,
+          special_lj, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
+          d_table_const);
     } else if (neighflag == HALF) {
-      Functor<HALF,false,TABSTYLE> f(this,(NeighListKokkos<DeviceType>*) list);
-      if (eflag || vflag) Kokkos::parallel_reduce(list->inum,f,ev);
-      else Kokkos::parallel_for(list->inum,f);
-    } else if (neighflag == N2) {
-      Functor<N2,false,TABSTYLE> f(this,(NeighListKokkos<DeviceType>*) list);
-      if (eflag || vflag) Kokkos::parallel_reduce(nlocal,f,ev);
-      else Kokkos::parallel_for(nlocal,f);
+      compute_all_items<DeviceType,HALF,false,TABSTYLE>(
+          eflag, vflag, newton_pair, ev, nlocal,
+          l->inum, l->d_ilist, l->d_neighbors, l->d_numneigh,
+          x, type, mixWtSite1old, mixWtSite2old, mixWtSite1, mixWtSite2,
+          special_lj, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
+          d_table_const);
     }
   } else {
-    if (neighflag == FULL) {
-      Functor<FULL,true,TABSTYLE> f(this,(NeighListKokkos<DeviceType>*) list);
-      if (eflag || vflag) Kokkos::parallel_reduce(list->inum,f,ev);
-      else Kokkos::parallel_for(list->inum,f);
-    } else if (neighflag == HALFTHREAD) {
-      Functor<HALFTHREAD,true,TABSTYLE> f(this,(NeighListKokkos<DeviceType>*) list);
-      if (eflag || vflag) Kokkos::parallel_reduce(list->inum,f,ev);
-      else Kokkos::parallel_for(list->inum,f);
+    if (neighflag == HALFTHREAD) {
+      compute_all_items<DeviceType,HALFTHREAD,true,TABSTYLE>(
+          eflag, vflag, newton_pair, ev, nlocal,
+          l->inum, l->d_ilist, l->d_neighbors, l->d_numneigh,
+          x, type, mixWtSite1old, mixWtSite2old, mixWtSite1, mixWtSite2,
+          special_lj, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
+          d_table_const);
     } else if (neighflag == HALF) {
-      Functor<HALF,true,TABSTYLE> f(this,(NeighListKokkos<DeviceType>*) list);
-      if (eflag || vflag) Kokkos::parallel_reduce(list->inum,f,ev);
-      else Kokkos::parallel_for(list->inum,f);
-    } else if (neighflag == N2) {
-      Functor<N2,true,TABSTYLE> f(this,(NeighListKokkos<DeviceType>*) list);
-      if (eflag || vflag) Kokkos::parallel_reduce(nlocal,f,ev);
-      else Kokkos::parallel_for(nlocal,f);
+      compute_all_items<DeviceType,HALFT,true,TABSTYLE>(
+          eflag, vflag, newton_pair, ev, nlocal,
+          l->inum, l->d_ilist, l->d_neighbors, l->d_numneigh,
+          x, type, mixWtSite1old, mixWtSite2old, mixWtSite1, mixWtSite2,
+          special_lj, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
+          d_table_const);
     }
   }
 
@@ -590,38 +638,6 @@ void PairTableRXKokkos<DeviceType>::compute_style(int eflag_in, int vflag_in)
   if (vflag_fdotr) pair_virial_fdotr_compute(this);
 }
 
-template<class DeviceType>
-template<bool STACKPARAMS, int TABSTYLE>
-KOKKOS_INLINE_FUNCTION
-F_FLOAT PairTableRXKokkos<DeviceType>::
-compute_evdwl(const F_FLOAT& rsq, const int& i, const int&j, const int& itype, const int& jtype) const {
-  double evdwl;
-  union_int_float_t rsq_lookup;
-  const int tidx = d_table_const.tabindex(itype,jtype);
-  if (TABSTYLE == LOOKUP) {
-    const int itable = static_cast<int> ((rsq - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx));
-    evdwl = d_table_const.e(tidx,itable);
-  } else if (TABSTYLE == LINEAR) {
-    const int itable = static_cast<int> ((rsq - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx));
-    const double fraction = (rsq - d_table_const.rsq(tidx,itable)) * d_table_const.invdelta(tidx);
-    evdwl = d_table_const.e(tidx,itable) + fraction*d_table_const.de(tidx,itable);
-  } else if (TABSTYLE == SPLINE) {
-    const int itable = static_cast<int> ((rsq - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx));
-    const double b = (rsq - d_table_const.rsq(tidx,itable)) * d_table_const.invdelta(tidx);
-    const double a = 1.0 - b;
-    evdwl = a * d_table_const.e(tidx,itable) + b * d_table_const.e(tidx,itable+1) +
-        ((a*a*a-a)*d_table_const.e2(tidx,itable) + (b*b*b-b)*d_table_const.e2(tidx,itable+1)) *
-        d_table_const.deltasq6(tidx);
-  } else {
-    rsq_lookup.f = rsq;
-    int itable = rsq_lookup.i & d_table_const.nmask(tidx);
-    itable >>= d_table_const.nshiftbits(tidx);
-    const double fraction = (rsq_lookup.f - d_table_const.rsq(tidx,itable)) * d_table_const.drsq(tidx,itable);
-    evdwl = d_table_const.e(tidx,itable) + fraction*d_table_const.de(tidx,itable);
-  }
-  return evdwl;
-}
-
 template<class DeviceType>
 void PairTableRXKokkos<DeviceType>::create_kokkos_tables()
 {
diff --git a/src/KOKKOS/pair_table_rx_kokkos.h b/src/KOKKOS/pair_table_rx_kokkos.h
index b2814adcec..36441f78b5 100644
--- a/src/KOKKOS/pair_table_rx_kokkos.h
+++ b/src/KOKKOS/pair_table_rx_kokkos.h
@@ -92,16 +92,12 @@ class PairTableRXKokkos : public PairTable {
   typename ArrayTypes<DeviceType>::t_efloat_1d d_eatom;
   typename ArrayTypes<DeviceType>::t_virial_array d_vatom;
 
-  int nlocal,nall,eflag,vflag,neighflag,newton_pair;
+  int neighflag;
 
   int update_table;
   void create_kokkos_tables();
   void cleanup_copy();
 
-  template<bool STACKPARAMS, int TABSTYLE>
-  KOKKOS_INLINE_FUNCTION
-  F_FLOAT compute_evdwl(const F_FLOAT& rsq, const int& i, const int&j, const int& itype, const int& jtype) const;
-
   friend void pair_virial_fdotr_compute<PairTableRXKokkos>(PairTableRXKokkos*);
 
   /* PairTableRX members */
@@ -111,11 +107,6 @@ class PairTableRXKokkos : public PairTable {
   int isite1, isite2;
   bool fractionalWeighting;
 
-  Kokkos::View<double*, DeviceType> mixWtSite1old_;
-  Kokkos::View<double*, DeviceType> mixWtSite2old_;
-  Kokkos::View<double*, DeviceType> mixWtSite1_;
-  Kokkos::View<double*, DeviceType> mixWtSite2_;
-
   /* a duplicate of PairComputeFunctor to deal with uCG */
   template <int NEIGHFLAG, bool STACKPARAMS, int TABSTYLE>
   struct Functor {

From fdb6b91e29166f0882169946718b1c2d69c114ac Mon Sep 17 00:00:00 2001
From: Dan Ibanez <dan.a.ibanez@gmail.com>
Date: Wed, 11 Jan 2017 12:50:32 -0700
Subject: [PATCH 073/267] near trying to compile

---
 src/KOKKOS/pair_table_rx_kokkos.cpp | 144 ++++++++++++++--------------
 1 file changed, 71 insertions(+), 73 deletions(-)

diff --git a/src/KOKKOS/pair_table_rx_kokkos.cpp b/src/KOKKOS/pair_table_rx_kokkos.cpp
index a9703dd927..bed69fa0a0 100644
--- a/src/KOKKOS/pair_table_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_table_rx_kokkos.cpp
@@ -267,6 +267,62 @@ compute_evdwl(
   return evdwl;
 }
 
+template<class DeviceType, int NEIGHFLAG, int TABSTYLE, int NEWTON_PAIR>
+KOKKOS_INLINE_FUNCTION
+void
+ev_tally(
+    int vflag_global,
+    int nlocal,
+    EV_FLOAT& ev,
+    F_FLOAT epair, F_FLOAT fpair,
+    F_FLOAT delx, F_FLOAT dely, F_FLOAT delz)
+{
+  if (vflag_global) {
+    auto v0 = delx*delx*fpair;
+    auto v1 = dely*dely*fpair;
+    auto v2 = delz*delz*fpair;
+    auto v3 = delx*dely*fpair;
+    auto v4 = delx*delz*fpair;
+    auto v5 = dely*delz*fpair;
+
+    if (NEIGHFLAG!=FULL) {
+      if (NEWTON_PAIR) {
+        ev.v[0] += v0;
+        ev.v[1] += v1;
+        ev.v[2] += v2;
+        ev.v[3] += v3;
+        ev.v[4] += v4;
+        ev.v[5] += v5;
+      } else {
+        if (i < c.nlocal) {
+          ev.v[0] += 0.5*v0;
+          ev.v[1] += 0.5*v1;
+          ev.v[2] += 0.5*v2;
+          ev.v[3] += 0.5*v3;
+          ev.v[4] += 0.5*v4;
+          ev.v[5] += 0.5*v5;
+        }
+        if (j < c.nlocal) {
+          ev.v[0] += 0.5*v0;
+          ev.v[1] += 0.5*v1;
+          ev.v[2] += 0.5*v2;
+          ev.v[3] += 0.5*v3;
+          ev.v[4] += 0.5*v4;
+          ev.v[5] += 0.5*v5;
+        }
+      }
+    } else {
+      ev.v[0] += 0.5*v0;
+      ev.v[1] += 0.5*v1;
+      ev.v[2] += 0.5*v2;
+      ev.v[3] += 0.5*v3;
+      ev.v[4] += 0.5*v4;
+      ev.v[5] += 0.5*v5;
+    }
+  }
+}
+}
+
 template <class DeviceType, int NEIGHFLAG, bool STACKPARAMS, int TABSTYLE,
           int EVFLAG, int NEWTON_PAIR>
 KOKKOS_INLINE_FUNCTION
@@ -296,6 +352,7 @@ compute_item(
                  device_type,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > uCGnew;
     int isite1, int isite2,
     PairTableRXKokkos<DeviceType>::TableDeviceConst d_table_const,
+    int vflag_global
     ) {
   EV_FLOAT ev;
   auto i = d_ilist(ii);
@@ -379,7 +436,8 @@ compute_item(
       ev.evdwl += (do_half ? 1.0 : 0.5)*evdwl;
 
       if (EVFLAG) {
-        ev_tally(ev,i,j,evdwl,fpair,delx,dely,delz);
+        ev_tally<DeviceType,NEIGHFLAG,TABSTYLE,NEWTON_PAIR>(
+            vflag_global,nlocal,ev,evdwl,fpair,delx,dely,delz);
       }
     }
   }
@@ -422,7 +480,8 @@ static void compute_all_items(
     Kokkos::View<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,
                  device_type,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > uCGnew;
     int isite1, int isite2,
-    PairTableRXKokkos<DeviceType>::TableDeviceConst d_table_const) {
+    PairTableRXKokkos<DeviceType>::TableDeviceConst d_table_const,
+    int vflag_global) {
   if (eflag || vflag) {
     Kokkos::parallel_reduce(inum,
     LAMMPS_LAMBDA(int i, EV_FLOAT& energy_virial) {
@@ -432,97 +491,36 @@ static void compute_all_items(
             i, nlocal, d_ilist, d_neighbors, d_numneigh, x, type,
             mixWtSite1old, mixWtSite2old, mixWtSite1, mixWtSite2,
             special_lj, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
-            d_table_const);
+            d_table_const, vflag_global);
       } else {
         energy_virial +=
           compute_item<DeviceType,NEIGHFLAG,STACKPARAMS,TABSTYLE,1,0>(
             i, nlocal, d_ilist, d_neighbors, d_numneigh, x, type,
             mixWtSite1old, mixWtSite2old, mixWtSite1, mixWtSite2,
             special_lj, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
-            d_table_const);
-        energy_virial += compute_item<1,0>(i);
+            d_table_const, vflag_global);
       }
     }, ev);
   } else {
     Kokkos::parallel_for(inum,
     LAMMPS_LAMBDA(int i) {
       if (newton_pair) {
-        compute_item<DeviceType,NEIGHFLAG,STACKPARAMS,TABSTYLE,1,1>(
+        compute_item<DeviceType,NEIGHFLAG,STACKPARAMS,TABSTYLE,0,1>(
             i, nlocal, d_ilist, d_neighbors, d_numneigh, x, type,
             mixWtSite1old, mixWtSite2old, mixWtSite1, mixWtSite2,
             special_lj, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
-            d_table_const);
+            d_table_const, vflag_global);
       } else {
-        compute_item<DeviceType,NEIGHFLAG,STACKPARAMS,TABSTYLE,1,0>(
+        compute_item<DeviceType,NEIGHFLAG,STACKPARAMS,TABSTYLE,0,0>(
             i, nlocal, d_ilist, d_neighbors, d_numneigh, x, type,
             mixWtSite1old, mixWtSite2old, mixWtSite1, mixWtSite2,
             special_lj, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
-            d_table_const);
+            d_table_const, vflag_global);
       }
     }, ev);
   }
 }
 
-template<class DeviceType>
-template <int NEIGHFLAG, bool STACKPARAMS, int TABSTYLE>
-KOKKOS_INLINE_FUNCTION
-void
-PairTableRXKokkos<DeviceType>::Functor<NEIGHFLAG,STACKPARAMS,TABSTYLE>::
-ev_tally(EV_FLOAT &ev, const int &i, const int &j,
-         const F_FLOAT &epair, const F_FLOAT &fpair, const F_FLOAT &delx,
-         const F_FLOAT &dely, const F_FLOAT &delz) const
-{
-//const int EFLAG = c.eflag;
-//const int NEWTON_PAIR = c.newton_pair;
-//const int VFLAG = c.vflag_either;
-
-//if (VFLAG) {
-//  const E_FLOAT v0 = delx*delx*fpair;
-//  const E_FLOAT v1 = dely*dely*fpair;
-//  const E_FLOAT v2 = delz*delz*fpair;
-//  const E_FLOAT v3 = delx*dely*fpair;
-//  const E_FLOAT v4 = delx*delz*fpair;
-//  const E_FLOAT v5 = dely*delz*fpair;
-
-//  if (c.vflag_global) {
-//    if (NEIGHFLAG!=FULL) {
-//      if (NEWTON_PAIR) {
-//        ev.v[0] += v0;
-//        ev.v[1] += v1;
-//        ev.v[2] += v2;
-//        ev.v[3] += v3;
-//        ev.v[4] += v4;
-//        ev.v[5] += v5;
-//      } else {
-//        if (i < c.nlocal) {
-//          ev.v[0] += 0.5*v0;
-//          ev.v[1] += 0.5*v1;
-//          ev.v[2] += 0.5*v2;
-//          ev.v[3] += 0.5*v3;
-//          ev.v[4] += 0.5*v4;
-//          ev.v[5] += 0.5*v5;
-//        }
-//        if (j < c.nlocal) {
-//          ev.v[0] += 0.5*v0;
-//          ev.v[1] += 0.5*v1;
-//          ev.v[2] += 0.5*v2;
-//          ev.v[3] += 0.5*v3;
-//          ev.v[4] += 0.5*v4;
-//          ev.v[5] += 0.5*v5;
-//        }
-//      }
-//    } else {
-//      ev.v[0] += 0.5*v0;
-//      ev.v[1] += 0.5*v1;
-//      ev.v[2] += 0.5*v2;
-//      ev.v[3] += 0.5*v3;
-//      ev.v[4] += 0.5*v4;
-//      ev.v[5] += 0.5*v5;
-//    }
-//  }
-//}
-}
-
 template<class DeviceType>
 static void getAllMixingWeights(
     int ntotal,
@@ -598,14 +596,14 @@ void PairTableRXKokkos<DeviceType>::compute_style(int eflag_in, int vflag_in)
           l->inum, l->d_ilist, l->d_neighbors, l->d_numneigh,
           x, type, mixWtSite1old, mixWtSite2old, mixWtSite1, mixWtSite2,
           special_lj, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
-          d_table_const);
+          d_table_const, vflag_global);
     } else if (neighflag == HALF) {
       compute_all_items<DeviceType,HALF,false,TABSTYLE>(
           eflag, vflag, newton_pair, ev, nlocal,
           l->inum, l->d_ilist, l->d_neighbors, l->d_numneigh,
           x, type, mixWtSite1old, mixWtSite2old, mixWtSite1, mixWtSite2,
           special_lj, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
-          d_table_const);
+          d_table_const, vflag_global);
     }
   } else {
     if (neighflag == HALFTHREAD) {
@@ -614,14 +612,14 @@ void PairTableRXKokkos<DeviceType>::compute_style(int eflag_in, int vflag_in)
           l->inum, l->d_ilist, l->d_neighbors, l->d_numneigh,
           x, type, mixWtSite1old, mixWtSite2old, mixWtSite1, mixWtSite2,
           special_lj, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
-          d_table_const);
+          d_table_const, vflag_global);
     } else if (neighflag == HALF) {
       compute_all_items<DeviceType,HALFT,true,TABSTYLE>(
           eflag, vflag, newton_pair, ev, nlocal,
           l->inum, l->d_ilist, l->d_neighbors, l->d_numneigh,
           x, type, mixWtSite1old, mixWtSite2old, mixWtSite1, mixWtSite2,
           special_lj, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
-          d_table_const);
+          d_table_const, vflag_global);
     }
   }
 

From 5dcbbba4ce53654bc40dc363872878358bfc73b5 Mon Sep 17 00:00:00 2001
From: Dan Ibanez <dan.a.ibanez@gmail.com>
Date: Wed, 11 Jan 2017 13:15:01 -0700
Subject: [PATCH 074/267] lots of work towards compiling

---
 src/KOKKOS/pair_table_rx_kokkos.cpp | 88 ++++++++++++++---------------
 src/KOKKOS/pair_table_rx_kokkos.h   |  7 +--
 src/pair.h                          |  2 +
 src/pair_table.h                    |  2 +-
 4 files changed, 50 insertions(+), 49 deletions(-)

diff --git a/src/KOKKOS/pair_table_rx_kokkos.cpp b/src/KOKKOS/pair_table_rx_kokkos.cpp
index bed69fa0a0..c6206b828b 100644
--- a/src/KOKKOS/pair_table_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_table_rx_kokkos.cpp
@@ -182,7 +182,7 @@ void PairTableRXKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
 template<class DeviceType>
 template <int NEIGHFLAG, bool STACKPARAMS, int TABSTYLE>
 PairTableRXKokkos<DeviceType>::Functor<NEIGHFLAG,STACKPARAMS,TABSTYLE>::Functor(
-    PairTableRXKokkos* c_ptr, NeighListKokkos<device_type>* list_ptr)//:
+    PairTableRXKokkos* c_ptr, NeighListKokkos<DeviceType>* list_ptr)//:
 //c(*c_ptr),f(c.f),uCG(c.uCG),uCGnew(c.uCGnew),list(*list_ptr)
 {}
 
@@ -193,7 +193,7 @@ PairTableRXKokkos<DeviceType>::Functor<NEIGHFLAG,STACKPARAMS,TABSTYLE>::~Functor
 //list.clean_copy();
 }
 
-KOKKOS_INLINE_FUNCTION static int sbmask(const int& j) const
+KOKKOS_INLINE_FUNCTION static int sbmask(const int& j)
 {
   return j >> SBBITS & 3;
 }
@@ -203,19 +203,19 @@ KOKKOS_INLINE_FUNCTION
 static F_FLOAT
 compute_fpair(F_FLOAT rsq,
               int itype, int jtype,
-              PairTableRXKokkos<DeviceType>::TableDeviceConst d_table_const,
+              typename PairTableRXKokkos<DeviceType>::TableDeviceConst d_table_const
               ) {
-  union_int_float_t rsq_lookup;
+  Pair::union_int_float_t rsq_lookup;
   double fpair;
   const int tidx = d_table_const.tabindex(itype,jtype);
-  if (TABSTYLE == LOOKUP) {
+  if (TABSTYLE == PairTable::LOOKUP) {
     const int itable = static_cast<int> ((rsq - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx));
     fpair = d_table_const.f(tidx,itable);
-  } else if (TABSTYLE == LINEAR) {
+  } else if (TABSTYLE == PairTable::LINEAR) {
     const int itable = static_cast<int> ((rsq - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx));
     const double fraction = (rsq - d_table_const.rsq(tidx,itable)) * d_table_const.invdelta(tidx);
     fpair = d_table_const.f(tidx,itable) + fraction*d_table_const.df(tidx,itable);
-  } else if (TABSTYLE == SPLINE) {
+  } else if (TABSTYLE == PairTable::SPLINE) {
     const int itable = static_cast<int> ((rsq - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx));
     const double b = (rsq - d_table_const.rsq(tidx,itable)) * d_table_const.invdelta(tidx);
     const double a = 1.0 - b;
@@ -238,19 +238,19 @@ static F_FLOAT
 compute_evdwl(
     F_FLOAT rsq,
     int itype, int jtype,
-    PairTableRXKokkos<DeviceType>::TableDeviceConst d_table_const,
-    ) const {
+    typename PairTableRXKokkos<DeviceType>::TableDeviceConst d_table_const
+    ) {
   double evdwl;
-  union_int_float_t rsq_lookup;
+  Pair::union_int_float_t rsq_lookup;
   const int tidx = d_table_const.tabindex(itype,jtype);
-  if (TABSTYLE == LOOKUP) {
+  if (TABSTYLE == PairTable::LOOKUP) {
     const int itable = static_cast<int> ((rsq - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx));
     evdwl = d_table_const.e(tidx,itable);
-  } else if (TABSTYLE == LINEAR) {
+  } else if (TABSTYLE == PairTable::LINEAR) {
     const int itable = static_cast<int> ((rsq - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx));
     const double fraction = (rsq - d_table_const.rsq(tidx,itable)) * d_table_const.invdelta(tidx);
     evdwl = d_table_const.e(tidx,itable) + fraction*d_table_const.de(tidx,itable);
-  } else if (TABSTYLE == SPLINE) {
+  } else if (TABSTYLE == PairTable::SPLINE) {
     const int itable = static_cast<int> ((rsq - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx));
     const double b = (rsq - d_table_const.rsq(tidx,itable)) * d_table_const.invdelta(tidx);
     const double a = 1.0 - b;
@@ -273,6 +273,7 @@ void
 ev_tally(
     int vflag_global,
     int nlocal,
+    int i, int j,
     EV_FLOAT& ev,
     F_FLOAT epair, F_FLOAT fpair,
     F_FLOAT delx, F_FLOAT dely, F_FLOAT delz)
@@ -294,7 +295,7 @@ ev_tally(
         ev.v[4] += v4;
         ev.v[5] += v5;
       } else {
-        if (i < c.nlocal) {
+        if (i < nlocal) {
           ev.v[0] += 0.5*v0;
           ev.v[1] += 0.5*v1;
           ev.v[2] += 0.5*v2;
@@ -302,7 +303,7 @@ ev_tally(
           ev.v[4] += 0.5*v4;
           ev.v[5] += 0.5*v5;
         }
-        if (j < c.nlocal) {
+        if (j < nlocal) {
           ev.v[0] += 0.5*v0;
           ev.v[1] += 0.5*v1;
           ev.v[2] += 0.5*v2;
@@ -321,7 +322,6 @@ ev_tally(
     }
   }
 }
-}
 
 template <class DeviceType, int NEIGHFLAG, bool STACKPARAMS, int TABSTYLE,
           int EVFLAG, int NEWTON_PAIR>
@@ -330,9 +330,9 @@ static EV_FLOAT
 compute_item(
     int ii,
     int nlocal,
-    typename ArrayTypes<DeviceType>::t_in_1d_const d_ilist,
-    typename ArrayTypes<Device>::t_neighbors_2d_const d_neighbors,
-    typename ArrayTypes<DeviceType>::t_in_1d_const d_numneigh,
+    typename ArrayTypes<DeviceType>::t_int_1d_const d_ilist,
+    typename ArrayTypes<DeviceType>::t_neighbors_2d_const d_neighbors,
+    typename ArrayTypes<DeviceType>::t_int_1d_const d_numneigh,
     typename ArrayTypes<DeviceType>::t_x_array_randomread x,
     typename ArrayTypes<DeviceType>::t_int_1d_randomread type,
     Kokkos::View<double*, DeviceType> mixWtSite1old,
@@ -345,13 +345,13 @@ compute_item(
     Kokkos::View<F_FLOAT*[3],
       typename ArrayTypes<DeviceType>::t_f_array::array_layout,
       DeviceType,
-      Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > f;
+      Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > f,
     Kokkos::View<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,
-                 device_type,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > uCG;
+                 DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > uCG,
     Kokkos::View<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,
-                 device_type,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > uCGnew;
+                 DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > uCGnew,
     int isite1, int isite2,
-    PairTableRXKokkos<DeviceType>::TableDeviceConst d_table_const,
+    typename PairTableRXKokkos<DeviceType>::TableDeviceConst d_table_const,
     int vflag_global
     ) {
   EV_FLOAT ev;
@@ -386,10 +386,10 @@ compute_item(
     auto jtype = type(j);
 
     if(rsq < (STACKPARAMS ? m_cutsq[itype][jtype] : d_cutsq(itype,jtype))) {
-      auto mixWtSite1old_j = mixWtSite1old_(j);
-      auto mixWtSite2old_j = mixWtSite2old_(j);
-      auto mixWtSite1_j = mixWtSite1_(j);
-      auto mixWtSite2_j = mixWtSite2_(j);
+      auto mixWtSite1old_j = mixWtSite1old(j);
+      auto mixWtSite2old_j = mixWtSite2old(j);
+      auto mixWtSite1_j = mixWtSite1(j);
+      auto mixWtSite2_j = mixWtSite2(j);
 
       auto fpair = factor_lj * compute_fpair<DeviceType,TABSTYLE>(
           rsq,itype,jtype,d_table_const);
@@ -437,7 +437,7 @@ compute_item(
 
       if (EVFLAG) {
         ev_tally<DeviceType,NEIGHFLAG,TABSTYLE,NEWTON_PAIR>(
-            vflag_global,nlocal,ev,evdwl,fpair,delx,dely,delz);
+            vflag_global,nlocal,i,j,ev,evdwl,fpair,delx,dely,delz);
       }
     }
   }
@@ -459,9 +459,9 @@ static void compute_all_items(
     EV_FLOAT& ev,
     int nlocal,
     int inum,
-    typename ArrayTypes<DeviceType>::t_in_1d_const d_ilist,
-    typename ArrayTypes<Device>::t_neighbors_2d_const d_neighbors,
-    typename ArrayTypes<DeviceType>::t_in_1d_const d_numneigh,
+    typename ArrayTypes<DeviceType>::t_int_1d_const d_ilist,
+    typename ArrayTypes<DeviceType>::t_neighbors_2d_const d_neighbors,
+    typename ArrayTypes<DeviceType>::t_int_1d_const d_numneigh,
     typename ArrayTypes<DeviceType>::t_x_array_randomread x,
     typename ArrayTypes<DeviceType>::t_int_1d_randomread type,
     Kokkos::View<double*, DeviceType> mixWtSite1old,
@@ -474,13 +474,13 @@ static void compute_all_items(
     Kokkos::View<F_FLOAT*[3],
       typename ArrayTypes<DeviceType>::t_f_array::array_layout,
       DeviceType,
-      Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > f;
+      Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > f,
     Kokkos::View<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,
-                 device_type,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > uCG;
+                 DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > uCG,
     Kokkos::View<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,
-                 device_type,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > uCGnew;
+                 DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > uCGnew,
     int isite1, int isite2,
-    PairTableRXKokkos<DeviceType>::TableDeviceConst d_table_const,
+    typename PairTableRXKokkos<DeviceType>::TableDeviceConst d_table_const,
     int vflag_global) {
   if (eflag || vflag) {
     Kokkos::parallel_reduce(inum,
@@ -517,7 +517,7 @@ static void compute_all_items(
             special_lj, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
             d_table_const, vflag_global);
       }
-    }, ev);
+    });
   }
 }
 
@@ -558,8 +558,8 @@ void PairTableRXKokkos<DeviceType>::compute_style(int eflag_in, int vflag_in)
   if (eflag || vflag) atomKK->modified(execution_space,datamask_modify);
   else atomKK->modified(execution_space,F_MASK);
 
-  auto x = atomKK->k_x.view<DeviceType>();
-  auto f = atomKK->k_f.view<DeviceType>();
+  x = atomKK->k_x.view<DeviceType>();
+  f = atomKK->k_f.view<DeviceType>();
   auto type = atomKK->k_type.view<DeviceType>();
   auto uCG = atomKK->k_uCG.view<DeviceType>();
   auto uCGnew = atomKK->k_uCGnew.view<DeviceType>();
@@ -595,14 +595,14 @@ void PairTableRXKokkos<DeviceType>::compute_style(int eflag_in, int vflag_in)
           eflag, vflag, newton_pair, ev, nlocal,
           l->inum, l->d_ilist, l->d_neighbors, l->d_numneigh,
           x, type, mixWtSite1old, mixWtSite2old, mixWtSite1, mixWtSite2,
-          special_lj, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
+          special_lj_local, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
           d_table_const, vflag_global);
     } else if (neighflag == HALF) {
       compute_all_items<DeviceType,HALF,false,TABSTYLE>(
           eflag, vflag, newton_pair, ev, nlocal,
           l->inum, l->d_ilist, l->d_neighbors, l->d_numneigh,
           x, type, mixWtSite1old, mixWtSite2old, mixWtSite1, mixWtSite2,
-          special_lj, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
+          special_lj_local, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
           d_table_const, vflag_global);
     }
   } else {
@@ -611,14 +611,14 @@ void PairTableRXKokkos<DeviceType>::compute_style(int eflag_in, int vflag_in)
           eflag, vflag, newton_pair, ev, nlocal,
           l->inum, l->d_ilist, l->d_neighbors, l->d_numneigh,
           x, type, mixWtSite1old, mixWtSite2old, mixWtSite1, mixWtSite2,
-          special_lj, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
+          special_lj_local, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
           d_table_const, vflag_global);
     } else if (neighflag == HALF) {
-      compute_all_items<DeviceType,HALFT,true,TABSTYLE>(
+      compute_all_items<DeviceType,HALF,true,TABSTYLE>(
           eflag, vflag, newton_pair, ev, nlocal,
           l->inum, l->d_ilist, l->d_neighbors, l->d_numneigh,
           x, type, mixWtSite1old, mixWtSite2old, mixWtSite1, mixWtSite2,
-          special_lj, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
+          special_lj_local, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
           d_table_const, vflag_global);
     }
   }
@@ -1067,7 +1067,7 @@ double PairTableRXKokkos<DeviceType>::single(int i, int j, int itype, int jtype,
       tb->deltasq6;
     fforce = factor_lj * value;
   } else {
-    union_int_float_t rsq_lookup;
+    Pair::union_int_float_t rsq_lookup;
     rsq_lookup.f = rsq;
     itable = rsq_lookup.i & tb->nmask;
     itable >>= tb->nshiftbits;
diff --git a/src/KOKKOS/pair_table_rx_kokkos.h b/src/KOKKOS/pair_table_rx_kokkos.h
index 36441f78b5..fdd863e4bc 100644
--- a/src/KOKKOS/pair_table_rx_kokkos.h
+++ b/src/KOKKOS/pair_table_rx_kokkos.h
@@ -23,6 +23,7 @@ PairStyle(table/rx/kk/host,PairTableRXKokkos<LMPHostType>)
 #define LMP_PAIR_TABLE_RX_KOKKOS_H
 
 #include "pair_table_kokkos.h"
+#include "kokkos_few.h"
 
 namespace LAMMPS_NS {
 
@@ -78,17 +79,15 @@ class PairTableRXKokkos : public PairTable {
   TableDevice* d_table;
   TableHost* h_table;
 
-  F_FLOAT m_cutsq[MAX_TYPES_STACKPARAMS+1][MAX_TYPES_STACKPARAMS+1];
+  Few<Few<F_FLOAT, MAX_TYPES_STACKPARAMS+1>, MAX_TYPES_STACKPARAMS+1> m_cutsq;
 
   typename ArrayTypes<DeviceType>::t_ffloat_2d d_cutsq;
 
   virtual void allocate();
   void compute_table(Table *);
 
-  typename ArrayTypes<DeviceType>::t_x_array_const c_x;
+  typename ArrayTypes<DeviceType>::t_x_array_randomread x;
   typename ArrayTypes<DeviceType>::t_f_array f;
-  typename ArrayTypes<DeviceType>::t_efloat_1d uCG;
-  typename ArrayTypes<DeviceType>::t_efloat_1d uCGnew;
   typename ArrayTypes<DeviceType>::t_efloat_1d d_eatom;
   typename ArrayTypes<DeviceType>::t_virial_array d_vatom;
 
diff --git a/src/pair.h b/src/pair.h
index 3378115e49..ecb54bcf4d 100644
--- a/src/pair.h
+++ b/src/pair.h
@@ -211,10 +211,12 @@ class Pair : protected Pointers {
   double tabinner;                     // inner cutoff for Coulomb table
   double tabinner_disp;                 // inner cutoff for dispersion table
 
+ public:
   // custom data type for accessing Coulomb tables
 
   typedef union {int i; float f;} union_int_float_t;
 
+ protected:
   int vflag_fdotr;
   int maxeatom,maxvatom;
 
diff --git a/src/pair_table.h b/src/pair_table.h
index caffebdf31..b723fd2d98 100644
--- a/src/pair_table.h
+++ b/src/pair_table.h
@@ -40,9 +40,9 @@ class PairTable : public Pair {
   virtual double single(int, int, int, int, double, double, double, double &);
   void *extract(const char *, int &);
 
- protected:
   enum{LOOKUP,LINEAR,SPLINE,BITMAP};
 
+ protected:
   int tabstyle,tablength;
   struct Table {
     int ninput,rflag,fpflag,match,ntablebits;

From 52761aee0dc0ab1a6319c9d8ab7baa1f6940351b Mon Sep 17 00:00:00 2001
From: Dan Ibanez <dan.a.ibanez@gmail.com>
Date: Wed, 11 Jan 2017 13:18:13 -0700
Subject: [PATCH 075/267] it compiles.

---
 src/KOKKOS/neigh_list_kokkos.h      | 2 +-
 src/KOKKOS/pair_table_rx_kokkos.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/KOKKOS/neigh_list_kokkos.h b/src/KOKKOS/neigh_list_kokkos.h
index 32e6e704ae..b43e1106f2 100644
--- a/src/KOKKOS/neigh_list_kokkos.h
+++ b/src/KOKKOS/neigh_list_kokkos.h
@@ -90,7 +90,7 @@ public:
   KOKKOS_INLINE_FUNCTION
   static AtomNeighborsConst static_neighbors_const(int i,
            typename ArrayTypes<Device>::t_neighbors_2d_const d_neighbors,
-           typename ArrayTypes<Device>::t_int_1d d_numneigh) {
+           typename ArrayTypes<Device>::t_int_1d_const d_numneigh) {
     return AtomNeighborsConst(&d_neighbors(i,0),d_numneigh(i),
                               &d_neighbors(i,1)-&d_neighbors(i,0));
   }
diff --git a/src/KOKKOS/pair_table_rx_kokkos.cpp b/src/KOKKOS/pair_table_rx_kokkos.cpp
index c6206b828b..2a9e1bb13b 100644
--- a/src/KOKKOS/pair_table_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_table_rx_kokkos.cpp
@@ -410,7 +410,7 @@ compute_item(
         f(j,2) -= delz*fpair;
       }
 
-      auto evdwl = compute_evdwl<STACKPARAMS,TABSTYLE>(
+      auto evdwl = compute_evdwl<DeviceType,TABSTYLE>(
           rsq,itype,jtype,d_table_const);
 
       double evdwlOld;

From 3580e5409de58417370feae7eb7727ef9480fbde Mon Sep 17 00:00:00 2001
From: Stan Moore <stanmoore1@gmail.com>
Date: Thu, 12 Jan 2017 09:00:07 -0700
Subject: [PATCH 076/267] Fixing Kokkos CUDA compile error

---
 lib/kokkos/Makefile.kokkos | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/kokkos/Makefile.kokkos b/lib/kokkos/Makefile.kokkos
index 73a332ee11..94d0452428 100644
--- a/lib/kokkos/Makefile.kokkos
+++ b/lib/kokkos/Makefile.kokkos
@@ -20,7 +20,7 @@ KOKKOS_OPTIONS ?= ""
 
 #Default settings specific options
 #Options: force_uvm,use_ldg,rdc,enable_lambda
-KOKKOS_CUDA_OPTIONS ?= ""
+KOKKOS_CUDA_OPTIONS ?= "enable_lambda"
 
 # Check for general settings
 

From 0c3b9426862c4da7730834a194a6e936c7593a7a Mon Sep 17 00:00:00 2001
From: Dan Ibanez <dan.a.ibanez@gmail.com>
Date: Thu, 12 Jan 2017 13:50:30 -0700
Subject: [PATCH 077/267] cleanup changes to Install.sh

---
 src/KOKKOS/Install.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/KOKKOS/Install.sh b/src/KOKKOS/Install.sh
index e76f62d65d..cf753ecee8 100644
--- a/src/KOKKOS/Install.sh
+++ b/src/KOKKOS/Install.sh
@@ -28,8 +28,8 @@ action () {
 
 # force rebuild of files with LMP_KOKKOS switch
 
-#touch ../accelerator_kokkos.h
-#touch ../memory.h
+touch ../accelerator_kokkos.h
+touch ../memory.h
 
 # list of files with optional dependcies
 
@@ -196,7 +196,7 @@ action pair_vashishta_kokkos.h pair_vashishta.h
 action pair_table_kokkos.cpp
 action pair_table_kokkos.h
 action pair_table_rx_kokkos.cpp pair_table_rx.cpp
-action pair_table_rx_kokkos.h pair_table_rx.h  
+action pair_table_rx_kokkos.h pair_table_rx.h
 action pair_tersoff_kokkos.cpp pair_tersoff.cpp
 action pair_tersoff_kokkos.h pair_tersoff.h
 action pair_tersoff_mod_kokkos.cpp pair_tersoff_mod.cpp

From 4dab6737ba59e402bf9a7609e66e24a75c313699 Mon Sep 17 00:00:00 2001
From: Dan Ibanez <dan.a.ibanez@gmail.com>
Date: Thu, 12 Jan 2017 14:15:42 -0700
Subject: [PATCH 078/267] remove leftover code

---
 src/KOKKOS/pair_table_rx_kokkos.cpp | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/src/KOKKOS/pair_table_rx_kokkos.cpp b/src/KOKKOS/pair_table_rx_kokkos.cpp
index 2a9e1bb13b..66089009a2 100644
--- a/src/KOKKOS/pair_table_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_table_rx_kokkos.cpp
@@ -179,20 +179,6 @@ void PairTableRXKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
     compute_style<BITMAP>(eflag_in,vflag_in);
 }
 
-template<class DeviceType>
-template <int NEIGHFLAG, bool STACKPARAMS, int TABSTYLE>
-PairTableRXKokkos<DeviceType>::Functor<NEIGHFLAG,STACKPARAMS,TABSTYLE>::Functor(
-    PairTableRXKokkos* c_ptr, NeighListKokkos<DeviceType>* list_ptr)//:
-//c(*c_ptr),f(c.f),uCG(c.uCG),uCGnew(c.uCGnew),list(*list_ptr)
-{}
-
-template<class DeviceType>
-template <int NEIGHFLAG, bool STACKPARAMS, int TABSTYLE>
-PairTableRXKokkos<DeviceType>::Functor<NEIGHFLAG,STACKPARAMS,TABSTYLE>::~Functor() {
-//c.cleanup_copy();
-//list.clean_copy();
-}
-
 KOKKOS_INLINE_FUNCTION static int sbmask(const int& j)
 {
   return j >> SBBITS & 3;

From cce10f6dff0dc0c28729e5787e9bc998751692b5 Mon Sep 17 00:00:00 2001
From: Dan Ibanez <dan.a.ibanez@gmail.com>
Date: Thu, 12 Jan 2017 14:19:10 -0700
Subject: [PATCH 079/267] remove more leftover code

---
 src/KOKKOS/pair_table_rx_kokkos.h | 33 -------------------------------
 1 file changed, 33 deletions(-)

diff --git a/src/KOKKOS/pair_table_rx_kokkos.h b/src/KOKKOS/pair_table_rx_kokkos.h
index fdd863e4bc..4e94802d72 100644
--- a/src/KOKKOS/pair_table_rx_kokkos.h
+++ b/src/KOKKOS/pair_table_rx_kokkos.h
@@ -106,39 +106,6 @@ class PairTableRXKokkos : public PairTable {
   int isite1, isite2;
   bool fractionalWeighting;
 
-  /* a duplicate of PairComputeFunctor to deal with uCG */
-  template <int NEIGHFLAG, bool STACKPARAMS, int TABSTYLE>
-  struct Functor {
-    using device_type = DeviceType;
-    typedef EV_FLOAT value_type;
-  //PairTableRXKokkos<device_type> c;
-    // arrays are atomic for Half(Thread) neighbor style
-    Kokkos::View<F_FLOAT*[3], typename DAT::t_f_array::array_layout,
-                 device_type,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > f;
-    Kokkos::View<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,
-                 device_type,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > uCG;
-    Kokkos::View<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,
-                 device_type,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > uCGnew;
-  //NeighListKokkos<device_type> list;
-    Functor(PairTableRXKokkos* c_ptr, NeighListKokkos<device_type>* list_ptr);
-    ~Functor();
-    KOKKOS_INLINE_FUNCTION int sbmask(const int& j) const {
-      return j >> SBBITS & 3;
-    }
-    template<int EVFLAG, int NEWTON_PAIR>
-    KOKKOS_INLINE_FUNCTION
-    EV_FLOAT compute_item(const int&) const;
-    KOKKOS_INLINE_FUNCTION
-    void
-    ev_tally(EV_FLOAT &ev, const int &i, const int &j,
-             const F_FLOAT &epair, const F_FLOAT &fpair, const F_FLOAT &delx,
-             const F_FLOAT &dely, const F_FLOAT &delz) const;
-    KOKKOS_INLINE_FUNCTION
-    void operator()(const int) const;
-    KOKKOS_INLINE_FUNCTION
-    void operator()(const int, value_type&) const;
-  };
-
 };
 
 }

From 0635151e2db0e53b3680f5bc8613c078e99cf901 Mon Sep 17 00:00:00 2001
From: Stan Moore <stanmoore1@gmail.com>
Date: Thu, 12 Jan 2017 16:22:24 -0700
Subject: [PATCH 080/267] Fixing neighbor bug

---
 src/neighbor.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/neighbor.cpp b/src/neighbor.cpp
index 59abc29f19..af59391209 100644
--- a/src/neighbor.cpp
+++ b/src/neighbor.cpp
@@ -909,9 +909,10 @@ void Neighbor::init_pair()
     done = 1;
     for (i = 0; i < npair_perpetual; i++) {
       ptr = NULL;
-      if (lists[plist[i]]->listcopy) ptr = lists[plist[i]]->listcopy;
-      if (lists[plist[i]]->listskip) ptr = lists[plist[i]]->listskip;
       if (lists[plist[i]]->listfull) ptr = lists[plist[i]]->listfull;
+      if (lists[plist[i]]->listcopy) ptr = lists[plist[i]]->listcopy;
+      // listskip check must be after listfull check
+      if (lists[plist[i]]->listskip) ptr = lists[plist[i]]->listskip;
       if (ptr == NULL) continue;
       for (m = 0; m < nrequest; m++)
         if (ptr == lists[m]) break;

From 5b7ab135dd849b2dbc74118036e872a1e20d2c43 Mon Sep 17 00:00:00 2001
From: Stan Moore <stanmoore1@gmail.com>
Date: Thu, 12 Jan 2017 16:22:38 -0700
Subject: [PATCH 081/267] Fixing Kokkos neighbor bug

---
 src/neigh_request.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/neigh_request.cpp b/src/neigh_request.cpp
index a8ba8496cd..7f5d9a6195 100644
--- a/src/neigh_request.cpp
+++ b/src/neigh_request.cpp
@@ -138,6 +138,8 @@ int NeighRequest::identical(NeighRequest *other)
   if (ghost != other->ghost) same = 0;
   if (omp != other->omp) same = 0;
   if (intel != other->intel) same = 0;
+  if (kokkos_host != other->kokkos_host) same = 0;
+  if (kokkos_device != other->kokkos_device) same = 0;
   if (ssa != other->ssa) same = 0;
 
   if (copy != other->copy_original) same = 0;

From c15d6580da4174c247b9dc6af108b02b6b3aa47c Mon Sep 17 00:00:00 2001
From: Stan Moore <stanmoore1@gmail.com>
Date: Fri, 13 Jan 2017 10:01:22 -0700
Subject: [PATCH 082/267] Fixing issue in pair_multi_lucy_rx_kokkos found by
 ibaned

---
 src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp b/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp
index 24502f875c..fac1478e32 100644
--- a/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp
@@ -277,7 +277,7 @@ void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXCompute<NEI
   // The f array is atomic for Half/Thread neighbor style
   Kokkos::View<F_FLOAT*[3], typename DAT::t_f_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_f = f;
 
-  int i,j,jj,inum,jnum,itype,jtype,itable;
+  int i,jj,inum,jnum,itype,jtype,itable;
   double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,evdwlOld,fpair;
   double rsq;
 
@@ -431,7 +431,7 @@ void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXCompute<NEI
 
   //if (evflag) ev_tally(0,0,nlocal,newton_pair,evdwl,0.0,0.0,0.0,0.0,0.0);
   if (EVFLAG)
-    ev.evdwl += ((NEWTON_PAIR||(j<nlocal))?1.0:0.5)*evdwl;
+    ev.evdwl += (NEWTON_PAIR?1.0:0.5)*evdwl;
 }
 
 template<class DeviceType>

From 2a35fa7a4e253facc698e834d47a0ccb2cd2cace Mon Sep 17 00:00:00 2001
From: Stan Moore <stanmoore1@gmail.com>
Date: Fri, 13 Jan 2017 10:37:31 -0700
Subject: [PATCH 083/267] Adding initial versions of pair_hybrid_kokkos and
 pair_hybrid_overlay_kokkos

---
 src/KOKKOS/Install.sh                     |   4 +
 src/KOKKOS/pair_hybrid_kokkos.cpp         | 147 ++++++++++++++++++++++
 src/KOKKOS/pair_hybrid_kokkos.h           | 109 ++++++++++++++++
 src/KOKKOS/pair_hybrid_overlay_kokkos.cpp |  28 +++++
 src/KOKKOS/pair_hybrid_overlay_kokkos.h   |  48 +++++++
 src/pair_hybrid.h                         |   6 +-
 src/pair_hybrid_overlay.h                 |   2 +-
 7 files changed, 338 insertions(+), 6 deletions(-)
 create mode 100644 src/KOKKOS/pair_hybrid_kokkos.cpp
 create mode 100644 src/KOKKOS/pair_hybrid_kokkos.h
 create mode 100644 src/KOKKOS/pair_hybrid_overlay_kokkos.cpp
 create mode 100644 src/KOKKOS/pair_hybrid_overlay_kokkos.h

diff --git a/src/KOKKOS/Install.sh b/src/KOKKOS/Install.sh
index cf753ecee8..198946d9f0 100644
--- a/src/KOKKOS/Install.sh
+++ b/src/KOKKOS/Install.sh
@@ -154,6 +154,10 @@ action pair_eam_fs_kokkos.cpp pair_eam_fs.cpp
 action pair_eam_fs_kokkos.h pair_eam_fs.h
 action pair_exp6_rx_kokkos.cpp pair_exp6_rx.cpp
 action pair_exp6_rx_kokkos.h pair_exp6_rx.h
+action pair_hybrid_kokkos.cpp
+action pair_hybrid_kokkos.h
+action pair_hybrid_overlay_kokkos.cpp
+action pair_hybrid_overlay_kokkos.h
 action pair_kokkos.h
 action pair_lj_charmm_coul_charmm_implicit_kokkos.cpp pair_lj_charmm_coul_charmm_implicit.cpp
 action pair_lj_charmm_coul_charmm_implicit_kokkos.h pair_lj_charmm_coul_charmm_implicit.h
diff --git a/src/KOKKOS/pair_hybrid_kokkos.cpp b/src/KOKKOS/pair_hybrid_kokkos.cpp
new file mode 100644
index 0000000000..973d60348f
--- /dev/null
+++ b/src/KOKKOS/pair_hybrid_kokkos.cpp
@@ -0,0 +1,147 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include "pair_hybrid_kokkos.h"
+#include "atom_kokkos.h"
+#include "force.h"
+#include "pair.h"
+#include "neighbor.h"
+#include "neigh_request.h"
+#include "update.h"
+#include "comm.h"
+#include "memory.h"
+#include "error.h"
+#include "respa.h"
+#include "atom_masks.h"
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+PairHybridKokkos::PairHybridKokkos(LAMMPS *lmp) : PairHybrid(lmp)
+{
+  atomKK = (AtomKokkos *) atom;   // atom pointer downcast to AtomKokkos; compute() calls sync()/modified() on it
+  datamask_read = EMPTY_MASK;     // the hybrid wrapper itself declares no read mask ...
+  datamask_modify = EMPTY_MASK;   // ... and no modify mask; compute() uses each sub-style's masks instead
+}
+
+/* ---------------------------------------------------------------------- */
+
+PairHybridKokkos::~PairHybridKokkos()
+{
+  // intentionally empty: the class declaration adds no data members of its own
+}
+
+/* ----------------------------------------------------------------------
+  call each sub-style's compute() or compute_outer() function, wrapped in atomKK->sync()/modified()
+  accumulate sub-style global/peratom energy/virial in hybrid
+  for global vflag = 1:
+    each sub-style computes own virial[6]
+    sum sub-style virial[6] to hybrid's virial[6]
+  for global vflag = 2:
+    call sub-style with adjusted vflag to prevent it calling
+      virial_fdotr_compute()
+    hybrid calls virial_fdotr_compute() on final accumulated f
+------------------------------------------------------------------------- */
+
+void PairHybridKokkos::compute(int eflag, int vflag)
+{
+  int i,j,m,n;
+
+  // if no_virial_fdotr_compute is set and global component of
+  //   incoming vflag = 2, then
+  // reset vflag as if global component were 1
+  // necessary since one or more sub-styles cannot compute virial as F dot r
+
+  int neighflag = lmp->kokkos->neighflag;  // neighbor list flavor taken from lmp->kokkos
+  if (neighflag == FULL) no_virial_fdotr_compute = 1;  // FULL lists: force the no-F-dot-r path (vflag%4==2 becomes 1 on the next statement)
+
+  if (no_virial_fdotr_compute && vflag % 4 == 2) vflag = 1 + vflag/4 * 4;
+
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  else evflag = vflag_fdotr = eflag_global = vflag_global =
+         eflag_atom = vflag_atom = 0;
+
+  // check if global component of incoming vflag = 2
+  // if so, reset vflag passed to substyle as if it were 0
+  // necessary so substyle will not invoke virial_fdotr_compute()
+
+  int vflag_substyle;
+  if (vflag % 4 == 2) vflag_substyle = vflag/4 * 4;
+  else vflag_substyle = vflag;
+
+  double *saved_special = save_special();  // released with delete [] at the end of this function
+
+  // check if we are running with r-RESPA using the hybrid keyword
+
+  Respa *respa = NULL;
+  respaflag = 0;
+  if (strstr(update->integrate_style,"respa")) {
+    respa = (Respa *) update->integrate;
+    if (respa->nhybrid_styles > 0) respaflag = 1;
+  }
+
+  for (m = 0; m < nstyles; m++) {
+
+    set_special(m);
+
+    if (!respaflag || (respaflag && respa->hybrid_compute[m])) {
+
+      // invoke compute() unless compute flag is turned off or
+      // outerflag is set and sub-style has a compute_outer() method
+
+      if (styles[m]->compute_flag == 0) continue;  // NOTE(review): continue also skips restore_special() and the tallies below for this m -- confirm intended
+      atomKK->sync(styles[m]->execution_space,styles[m]->datamask_read);  // sync the sub-style's declared read mask in its execution space before it runs
+      if (outerflag && styles[m]->respa_enable)
+        styles[m]->compute_outer(eflag,vflag_substyle);
+      else styles[m]->compute(eflag,vflag_substyle);
+      atomKK->modified(styles[m]->execution_space,styles[m]->datamask_modify);  // flag the sub-style's declared modify mask as changed in its execution space
+    }
+
+    restore_special(saved_special);
+
+    // jump to next sub-style if r-RESPA does not want global accumulated data
+
+    if (respaflag && !respa->tally_global) continue;
+
+    if (eflag_global) {
+      eng_vdwl += styles[m]->eng_vdwl;
+      eng_coul += styles[m]->eng_coul;
+    }
+    if (vflag_global) {
+      for (n = 0; n < 6; n++) virial[n] += styles[m]->virial[n];
+    }
+    if (eflag_atom) {
+      n = atom->nlocal;
+      if (force->newton_pair) n += atom->nghost;
+      double *eatom_substyle = styles[m]->eatom;
+      for (i = 0; i < n; i++) eatom[i] += eatom_substyle[i];
+    }
+    if (vflag_atom) {
+      n = atom->nlocal;
+      if (force->newton_pair) n += atom->nghost;
+      double **vatom_substyle = styles[m]->vatom;
+      for (i = 0; i < n; i++)
+        for (j = 0; j < 6; j++)
+          vatom[i][j] += vatom_substyle[i][j];
+    }
+  }
+
+  delete [] saved_special;
+
+  if (vflag_fdotr) virial_fdotr_compute();
+}
diff --git a/src/KOKKOS/pair_hybrid_kokkos.h b/src/KOKKOS/pair_hybrid_kokkos.h
new file mode 100644
index 0000000000..cfcef7fb31
--- /dev/null
+++ b/src/KOKKOS/pair_hybrid_kokkos.h
@@ -0,0 +1,109 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(hybrid/kk,PairHybridKokkos)
+
+#else
+
+#ifndef LMP_PAIR_HYBRID_KOKKOS_H
+#define LMP_PAIR_HYBRID_KOKKOS_H
+
+#include <stdio.h>
+#include "pair_hybrid.h"
+
+namespace LAMMPS_NS {
+
+class PairHybridKokkos : public PairHybrid {  // registered above as pair style hybrid/kk
+  friend class FixGPU;  // NOTE(review): friend list presumably mirrors PairHybrid's -- confirm each entry is needed here
+  friend class FixIntel;
+  friend class FixOMP;
+  friend class Force;
+  friend class Respa;
+  friend class Info;
+ public:
+  PairHybridKokkos(class LAMMPS *);
+  virtual ~PairHybridKokkos();
+  void compute(int, int);  // (eflag, vflag); replaces PairHybrid::compute, adding atomKK sync()/modified() around each sub-style
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Illegal ... command
+
+Self-explanatory.  Check the input script syntax and compare to the
+documentation for the command.  You can use -echo screen as a
+command-line option when running LAMMPS to see the offending line.
+
+E: Pair style hybrid cannot have hybrid as an argument
+
+Self-explanatory.
+
+E: Pair style hybrid cannot have none as an argument
+
+Self-explanatory.
+
+E: Incorrect args for pair coefficients
+
+Self-explanatory.  Check the input script or data file.
+
+E: Pair coeff for hybrid has invalid style
+
+Style in pair coeff must have been listed in pair_style command.
+
+E: Pair hybrid sub-style is not used
+
+No pair_coeff command used a sub-style specified in the pair_style
+command.
+
+E: Pair_modify special setting for pair hybrid incompatible with global special_bonds setting
+
+Cannot override a setting of 0.0 or 1.0 or change a setting between
+0.0 and 1.0.
+
+E: All pair coeffs are not set
+
+All pair coefficients must be set in the data file or by the
+pair_coeff command before running a simulation.
+
+E: Invoked pair single on pair style none
+
+A command (e.g. a dump) attempted to invoke the single() function on a
+pair style none, which is illegal.  You are probably attempting to
+compute per-atom quantities with an undefined pair style.
+
+E: Pair hybrid sub-style does not support single call
+
+You are attempting to invoke a single() call on a pair style
+that doesn't support it.
+
+E: Pair hybrid single calls do not support per sub-style special bond values
+
+Self-explanatory.
+
+E: Unknown pair_modify hybrid sub-style
+
+The choice of sub-style is unknown.
+
+E: Coulomb cutoffs of pair hybrid sub-styles do not match
+
+If using a Kspace solver, all Coulomb cutoffs of long pair styles must
+be the same.
+
+*/
diff --git a/src/KOKKOS/pair_hybrid_overlay_kokkos.cpp b/src/KOKKOS/pair_hybrid_overlay_kokkos.cpp
new file mode 100644
index 0000000000..55fed33f96
--- /dev/null
+++ b/src/KOKKOS/pair_hybrid_overlay_kokkos.cpp
@@ -0,0 +1,28 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include "pair_hybrid_overlay_kokkos.h"
+#include "atom.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_request.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+PairHybridOverlayKokkos::PairHybridOverlayKokkos(LAMMPS *lmp) : PairHybridOverlay(lmp) {}  // forwards lmp to the base class; no extra setup
diff --git a/src/KOKKOS/pair_hybrid_overlay_kokkos.h b/src/KOKKOS/pair_hybrid_overlay_kokkos.h
new file mode 100644
index 0000000000..c9a50e3bb1
--- /dev/null
+++ b/src/KOKKOS/pair_hybrid_overlay_kokkos.h
@@ -0,0 +1,48 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(hybrid/overlay/kk,PairHybridOverlayKokkos)
+
+#else
+
+#ifndef LMP_PAIR_HYBRID_OVERLAY_KOKKOS_H
+#define LMP_PAIR_HYBRID_OVERLAY_KOKKOS_H
+
+#include "pair_hybrid_overlay.h"
+
+namespace LAMMPS_NS {
+
+class PairHybridOverlayKokkos : public PairHybridOverlay {  // registered above as pair style hybrid/overlay/kk
+ public:  // NOTE(review): derives from PairHybridOverlay, not PairHybridKokkos, so PairHybridKokkos::compute() is not inherited here -- confirm
+  PairHybridOverlayKokkos(class LAMMPS *);
+  virtual ~PairHybridOverlayKokkos() {}  // nothing to release; no members are declared in this class
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Incorrect args for pair coefficients
+
+Self-explanatory.  Check the input script or data file.
+
+E: Pair coeff for hybrid has invalid style
+
+Style in pair coeff must have been listed in pair_style command.
+
+*/
diff --git a/src/pair_hybrid.h b/src/pair_hybrid.h
index 4d224dafc3..a7a236d269 100644
--- a/src/pair_hybrid.h
+++ b/src/pair_hybrid.h
@@ -35,7 +35,7 @@ class PairHybrid : public Pair {
  public:
   PairHybrid(class LAMMPS *);
   virtual ~PairHybrid();
-  void compute(int, int);
+  virtual void compute(int, int);
   void settings(int, char **);
   virtual void coeff(int, char **);
   void init_style();
@@ -88,10 +88,6 @@ class PairHybrid : public Pair {
 
 /* ERROR/WARNING messages:
 
-E: Cannot yet use pair hybrid with Kokkos
-
-This feature is not yet supported.
-
 E: Illegal ... command
 
 Self-explanatory.  Check the input script syntax and compare to the
diff --git a/src/pair_hybrid_overlay.h b/src/pair_hybrid_overlay.h
index 60cff45508..169583a48b 100644
--- a/src/pair_hybrid_overlay.h
+++ b/src/pair_hybrid_overlay.h
@@ -27,7 +27,7 @@ namespace LAMMPS_NS {
 class PairHybridOverlay : public PairHybrid {
  public:
   PairHybridOverlay(class LAMMPS *);
-  ~PairHybridOverlay() {}
+  virtual ~PairHybridOverlay() {}
   void coeff(int, char **);
 
  private:

From a42a666142cdab572ee6bac9ef81559f9c02ceb8 Mon Sep 17 00:00:00 2001
From: Dan Ibanez <dan.a.ibanez@gmail.com>
Date: Fri, 13 Jan 2017 13:23:26 -0700
Subject: [PATCH 084/267] support for eatom and vatom in pair_table_rx_kokkos

---
 src/KOKKOS/pair_table_rx_kokkos.cpp | 214 +++++++++++++++++++++-------
 src/KOKKOS/pair_table_rx_kokkos.h   |   6 +-
 2 files changed, 166 insertions(+), 54 deletions(-)

diff --git a/src/KOKKOS/pair_table_rx_kokkos.cpp b/src/KOKKOS/pair_table_rx_kokkos.cpp
index 66089009a2..7402a00900 100644
--- a/src/KOKKOS/pair_table_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_table_rx_kokkos.cpp
@@ -155,6 +155,10 @@ template<class DeviceType>
 PairTableRXKokkos<DeviceType>::~PairTableRXKokkos()
 {
   if (copymode) return;
+
+  memory->destroy_kokkos(k_eatom,eatom);
+  memory->destroy_kokkos(k_vatom,vatom);
+
   delete h_table;
   h_table = nullptr;
   delete d_table;
@@ -257,14 +261,38 @@ template<class DeviceType, int NEIGHFLAG, int TABSTYLE, int NEWTON_PAIR>
 KOKKOS_INLINE_FUNCTION
 void
 ev_tally(
+    int eflag,
+    int eflag_atom,
+    int vflag,
     int vflag_global,
+    int vflag_atom,
     int nlocal,
     int i, int j,
     EV_FLOAT& ev,
     F_FLOAT epair, F_FLOAT fpair,
-    F_FLOAT delx, F_FLOAT dely, F_FLOAT delz)
+    F_FLOAT delx, F_FLOAT dely, F_FLOAT delz,
+    Kokkos::View<F_FLOAT*[6],
+                 typename DAT::t_virial_array::array_layout,
+                 DeviceType,
+                 Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > v_vatom,
+    Kokkos::View<E_FLOAT*,
+                 typename DAT::t_efloat_1d::array_layout,
+                 DeviceType,
+                 Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > v_eatom)
 {
-  if (vflag_global) {
+  if (eflag) {
+    if (eflag_atom) {
+      auto epairhalf = 0.5 * epair;
+      if (NEIGHFLAG!=FULL) {
+        if (NEWTON_PAIR || i < nlocal) v_eatom[i] += epairhalf;
+        if (NEWTON_PAIR || j < nlocal) v_eatom[j] += epairhalf;
+      } else {
+        v_eatom[i] += epairhalf;
+      }
+    }
+  }
+
+  if (vflag) {
     auto v0 = delx*delx*fpair;
     auto v1 = dely*dely*fpair;
     auto v2 = delz*delz*fpair;
@@ -272,39 +300,69 @@ ev_tally(
     auto v4 = delx*delz*fpair;
     auto v5 = dely*delz*fpair;
 
-    if (NEIGHFLAG!=FULL) {
-      if (NEWTON_PAIR) {
-        ev.v[0] += v0;
-        ev.v[1] += v1;
-        ev.v[2] += v2;
-        ev.v[3] += v3;
-        ev.v[4] += v4;
-        ev.v[5] += v5;
+    if (vflag_global) {
+      if (NEIGHFLAG!=FULL) {
+        if (NEWTON_PAIR) {
+          ev.v[0] += v0;
+          ev.v[1] += v1;
+          ev.v[2] += v2;
+          ev.v[3] += v3;
+          ev.v[4] += v4;
+          ev.v[5] += v5;
+        } else {
+          if (i < nlocal) {
+            ev.v[0] += 0.5*v0;
+            ev.v[1] += 0.5*v1;
+            ev.v[2] += 0.5*v2;
+            ev.v[3] += 0.5*v3;
+            ev.v[4] += 0.5*v4;
+            ev.v[5] += 0.5*v5;
+          }
+          if (j < nlocal) {
+            ev.v[0] += 0.5*v0;
+            ev.v[1] += 0.5*v1;
+            ev.v[2] += 0.5*v2;
+            ev.v[3] += 0.5*v3;
+            ev.v[4] += 0.5*v4;
+            ev.v[5] += 0.5*v5;
+          }
+        }
       } else {
-        if (i < nlocal) {
-          ev.v[0] += 0.5*v0;
-          ev.v[1] += 0.5*v1;
-          ev.v[2] += 0.5*v2;
-          ev.v[3] += 0.5*v3;
-          ev.v[4] += 0.5*v4;
-          ev.v[5] += 0.5*v5;
-        }
-        if (j < nlocal) {
-          ev.v[0] += 0.5*v0;
-          ev.v[1] += 0.5*v1;
-          ev.v[2] += 0.5*v2;
-          ev.v[3] += 0.5*v3;
-          ev.v[4] += 0.5*v4;
-          ev.v[5] += 0.5*v5;
-        }
+        ev.v[0] += 0.5*v0;
+        ev.v[1] += 0.5*v1;
+        ev.v[2] += 0.5*v2;
+        ev.v[3] += 0.5*v3;
+        ev.v[4] += 0.5*v4;
+        ev.v[5] += 0.5*v5;
+      }
+    }
+
+    if (vflag_atom) {
+      if (NEIGHFLAG!=FULL) {
+        if (NEWTON_PAIR || i < nlocal) {
+          v_vatom(i,0) += 0.5*v0;
+          v_vatom(i,1) += 0.5*v1;
+          v_vatom(i,2) += 0.5*v2;
+          v_vatom(i,3) += 0.5*v3;
+          v_vatom(i,4) += 0.5*v4;
+          v_vatom(i,5) += 0.5*v5;
+        }
+        if (NEWTON_PAIR || j < nlocal) {
+          v_vatom(j,0) += 0.5*v0;
+          v_vatom(j,1) += 0.5*v1;
+          v_vatom(j,2) += 0.5*v2;
+          v_vatom(j,3) += 0.5*v3;
+          v_vatom(j,4) += 0.5*v4;
+          v_vatom(j,5) += 0.5*v5;
+        }
+      } else {
+        v_vatom(i,0) += 0.5*v0;
+        v_vatom(i,1) += 0.5*v1;
+        v_vatom(i,2) += 0.5*v2;
+        v_vatom(i,3) += 0.5*v3;
+        v_vatom(i,4) += 0.5*v4;
+        v_vatom(i,5) += 0.5*v5;
       }
-    } else {
-      ev.v[0] += 0.5*v0;
-      ev.v[1] += 0.5*v1;
-      ev.v[2] += 0.5*v2;
-      ev.v[3] += 0.5*v3;
-      ev.v[4] += 0.5*v4;
-      ev.v[5] += 0.5*v5;
     }
   }
 }
@@ -338,8 +396,19 @@ compute_item(
                  DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > uCGnew,
     int isite1, int isite2,
     typename PairTableRXKokkos<DeviceType>::TableDeviceConst d_table_const,
-    int vflag_global
-    ) {
+    int eflag,
+    int eflag_atom,
+    int vflag,
+    int vflag_global,
+    int vflag_atom,
+    Kokkos::View<F_FLOAT*[6],
+                 typename DAT::t_virial_array::array_layout,
+                 DeviceType,
+                 Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > v_vatom,
+    Kokkos::View<E_FLOAT*,
+                 typename DAT::t_efloat_1d::array_layout,
+                 DeviceType,
+                 Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > v_eatom) {
   EV_FLOAT ev;
   auto i = d_ilist(ii);
   auto xtmp = x(i,0);
@@ -423,7 +492,10 @@ compute_item(
 
       if (EVFLAG) {
         ev_tally<DeviceType,NEIGHFLAG,TABSTYLE,NEWTON_PAIR>(
-            vflag_global,nlocal,i,j,ev,evdwl,fpair,delx,dely,delz);
+            eflag,eflag_atom,
+            vflag,vflag_global,vflag_atom,
+            nlocal,i,j,ev,evdwl,fpair,delx,dely,delz,
+            v_vatom, v_eatom);
       }
     }
   }
@@ -440,7 +512,6 @@ compute_item(
 
 template<class DeviceType, int NEIGHFLAG, bool STACKPARAMS, int TABSTYLE>
 static void compute_all_items(
-    int eflag, int vflag,
     int newton_pair,
     EV_FLOAT& ev,
     int nlocal,
@@ -467,7 +538,19 @@ static void compute_all_items(
                  DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > uCGnew,
     int isite1, int isite2,
     typename PairTableRXKokkos<DeviceType>::TableDeviceConst d_table_const,
-    int vflag_global) {
+    int eflag,
+    int eflag_atom,
+    int vflag,
+    int vflag_global,
+    int vflag_atom,
+    Kokkos::View<F_FLOAT*[6],
+                 typename DAT::t_virial_array::array_layout,
+                 DeviceType,
+                 Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > v_vatom,
+    Kokkos::View<E_FLOAT*,
+                 typename DAT::t_efloat_1d::array_layout,
+                 DeviceType,
+                 Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > v_eatom) {
   if (eflag || vflag) {
     Kokkos::parallel_reduce(inum,
     LAMMPS_LAMBDA(int i, EV_FLOAT& energy_virial) {
@@ -477,14 +560,16 @@ static void compute_all_items(
             i, nlocal, d_ilist, d_neighbors, d_numneigh, x, type,
             mixWtSite1old, mixWtSite2old, mixWtSite1, mixWtSite2,
             special_lj, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
-            d_table_const, vflag_global);
+            d_table_const, eflag, eflag_atom,
+            vflag, vflag_global, vflag_atom, v_vatom, v_eatom);
       } else {
         energy_virial +=
           compute_item<DeviceType,NEIGHFLAG,STACKPARAMS,TABSTYLE,1,0>(
             i, nlocal, d_ilist, d_neighbors, d_numneigh, x, type,
             mixWtSite1old, mixWtSite2old, mixWtSite1, mixWtSite2,
             special_lj, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
-            d_table_const, vflag_global);
+            d_table_const, eflag, eflag_atom,
+            vflag, vflag_global, vflag_atom, v_vatom, v_eatom);
       }
     }, ev);
   } else {
@@ -495,13 +580,15 @@ static void compute_all_items(
             i, nlocal, d_ilist, d_neighbors, d_numneigh, x, type,
             mixWtSite1old, mixWtSite2old, mixWtSite1, mixWtSite2,
             special_lj, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
-            d_table_const, vflag_global);
+            d_table_const, eflag, eflag_atom,
+            vflag, vflag_global, vflag_atom, v_vatom, v_eatom);
       } else {
         compute_item<DeviceType,NEIGHFLAG,STACKPARAMS,TABSTYLE,0,0>(
             i, nlocal, d_ilist, d_neighbors, d_numneigh, x, type,
             mixWtSite1old, mixWtSite2old, mixWtSite1, mixWtSite2,
             special_lj, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
-            d_table_const, vflag_global);
+            d_table_const, eflag, eflag_atom,
+            vflag, vflag_global, vflag_atom, v_vatom, v_eatom);
       }
     });
   }
@@ -537,8 +624,16 @@ void PairTableRXKokkos<DeviceType>::compute_style(int eflag_in, int vflag_in)
   if (eflag || vflag) ev_setup(eflag,vflag);
   else evflag = vflag_fdotr = 0;
 
-  if (eflag_atom) error->all(FLERR, "pair table/rx/kk does not handle eflag_atom\n");
-  if (vflag_atom) error->all(FLERR, "pair table/rx/kk does not handle vflag_atom\n");
+  if (eflag_atom) {
+    memory->destroy_kokkos(k_eatom,eatom);
+    memory->create_kokkos(k_eatom,eatom,maxeatom,"pair:eatom");
+    d_eatom = k_eatom.d_view;
+  }
+  if (vflag_atom) {
+    memory->destroy_kokkos(k_vatom,vatom);
+    memory->create_kokkos(k_vatom,vatom,maxvatom,6,"pair:vatom");
+    d_vatom = k_vatom.d_view;
+  }
 
   atomKK->sync(execution_space,datamask_read);
   if (eflag || vflag) atomKK->modified(execution_space,datamask_modify);
@@ -578,34 +673,38 @@ void PairTableRXKokkos<DeviceType>::compute_style(int eflag_in, int vflag_in)
   if(atom->ntypes > MAX_TYPES_STACKPARAMS) {
     if (neighflag == HALFTHREAD) {
       compute_all_items<DeviceType,HALFTHREAD,false,TABSTYLE>(
-          eflag, vflag, newton_pair, ev, nlocal,
+          newton_pair, ev, nlocal,
           l->inum, l->d_ilist, l->d_neighbors, l->d_numneigh,
           x, type, mixWtSite1old, mixWtSite2old, mixWtSite1, mixWtSite2,
           special_lj_local, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
-          d_table_const, vflag_global);
+          d_table_const, eflag, eflag_atom,
+          vflag, vflag_global, vflag_atom, d_vatom, d_eatom);
     } else if (neighflag == HALF) {
       compute_all_items<DeviceType,HALF,false,TABSTYLE>(
-          eflag, vflag, newton_pair, ev, nlocal,
+          newton_pair, ev, nlocal,
           l->inum, l->d_ilist, l->d_neighbors, l->d_numneigh,
           x, type, mixWtSite1old, mixWtSite2old, mixWtSite1, mixWtSite2,
           special_lj_local, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
-          d_table_const, vflag_global);
+          d_table_const, eflag, eflag_atom,
+          vflag, vflag_global, vflag_atom, d_vatom, d_eatom);
     }
   } else {
     if (neighflag == HALFTHREAD) {
       compute_all_items<DeviceType,HALFTHREAD,true,TABSTYLE>(
-          eflag, vflag, newton_pair, ev, nlocal,
+          newton_pair, ev, nlocal,
           l->inum, l->d_ilist, l->d_neighbors, l->d_numneigh,
           x, type, mixWtSite1old, mixWtSite2old, mixWtSite1, mixWtSite2,
           special_lj_local, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
-          d_table_const, vflag_global);
+          d_table_const, eflag, eflag_atom,
+          vflag, vflag_global, vflag_atom, d_vatom, d_eatom);
     } else if (neighflag == HALF) {
       compute_all_items<DeviceType,HALF,true,TABSTYLE>(
-          eflag, vflag, newton_pair, ev, nlocal,
+          newton_pair, ev, nlocal,
           l->inum, l->d_ilist, l->d_neighbors, l->d_numneigh,
           x, type, mixWtSite1old, mixWtSite2old, mixWtSite1, mixWtSite2,
           special_lj_local, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
-          d_table_const, vflag_global);
+          d_table_const, eflag, eflag_atom,
+          vflag, vflag_global, vflag_atom, d_vatom, d_eatom);
     }
   }
 
@@ -620,6 +719,16 @@ void PairTableRXKokkos<DeviceType>::compute_style(int eflag_in, int vflag_in)
   }
 
   if (vflag_fdotr) pair_virial_fdotr_compute(this);
+
+  if (eflag_atom) {
+    k_eatom.template modify<DeviceType>();
+    k_eatom.template sync<LMPHostType>();
+  }
+
+  if (vflag_atom) {
+    k_vatom.template modify<DeviceType>();
+    k_vatom.template sync<LMPHostType>();
+  }
 }
 
 template<class DeviceType>
@@ -1126,6 +1235,7 @@ void PairTableRXKokkos<DeviceType>::cleanup_copy() {
   vatom = NULL;
   h_table=NULL; d_table=NULL;
 }
+
 namespace LAMMPS_NS {
 template class PairTableRXKokkos<LMPDeviceType>;
 #ifdef KOKKOS_HAVE_CUDA
diff --git a/src/KOKKOS/pair_table_rx_kokkos.h b/src/KOKKOS/pair_table_rx_kokkos.h
index 4e94802d72..c7ecd370a4 100644
--- a/src/KOKKOS/pair_table_rx_kokkos.h
+++ b/src/KOKKOS/pair_table_rx_kokkos.h
@@ -88,8 +88,6 @@ class PairTableRXKokkos : public PairTable {
 
   typename ArrayTypes<DeviceType>::t_x_array_randomread x;
   typename ArrayTypes<DeviceType>::t_f_array f;
-  typename ArrayTypes<DeviceType>::t_efloat_1d d_eatom;
-  typename ArrayTypes<DeviceType>::t_virial_array d_vatom;
 
   int neighflag;
 
@@ -106,6 +104,10 @@ class PairTableRXKokkos : public PairTable {
   int isite1, isite2;
   bool fractionalWeighting;
 
+  typename ArrayTypes<DeviceType>::tdual_efloat_1d k_eatom;
+  typename ArrayTypes<DeviceType>::tdual_virial_array k_vatom;
+  typename ArrayTypes<DeviceType>::t_efloat_1d d_eatom;
+  typename ArrayTypes<DeviceType>::t_virial_array d_vatom;
 };
 
 }

From 2b2998052c567719d6076435b640186105dda2ce Mon Sep 17 00:00:00 2001
From: Stan Moore <stanmoore1@gmail.com>
Date: Fri, 13 Jan 2017 13:50:21 -0700
Subject: [PATCH 085/267] Fixing inheritance issue in
 pair_hybrid_overlay_kokkos

---
 src/KOKKOS/pair_hybrid_overlay_kokkos.cpp | 116 +++++++++++++++++++++-
 src/KOKKOS/pair_hybrid_overlay_kokkos.h   |   8 +-
 2 files changed, 121 insertions(+), 3 deletions(-)

diff --git a/src/KOKKOS/pair_hybrid_overlay_kokkos.cpp b/src/KOKKOS/pair_hybrid_overlay_kokkos.cpp
index 55fed33f96..79d9c63221 100644
--- a/src/KOKKOS/pair_hybrid_overlay_kokkos.cpp
+++ b/src/KOKKOS/pair_hybrid_overlay_kokkos.cpp
@@ -25,4 +25,118 @@ using namespace LAMMPS_NS;
 
 /* ---------------------------------------------------------------------- */
 
-PairHybridOverlayKokkos::PairHybridOverlayKokkos(LAMMPS *lmp) : PairHybridOverlay(lmp) {}
+PairHybridOverlayKokkos::PairHybridOverlayKokkos(LAMMPS *lmp) : PairHybridKokkos(lmp) {}
+
+/* ----------------------------------------------------------------------
+   set coeffs for one or more type pairs: arg = I J sub-style [index] args
+------------------------------------------------------------------------- */
+
+void PairHybridOverlayKokkos::coeff(int narg, char **arg)
+{
+  if (narg < 3) error->all(FLERR,"Incorrect args for pair coefficients");
+  if (!allocated) allocate();
+
+  int ilo,ihi,jlo,jhi;
+  force->bounds(FLERR,arg[0],atom->ntypes,ilo,ihi);
+  force->bounds(FLERR,arg[1],atom->ntypes,jlo,jhi);
+
+  // 3rd arg = pair sub-style name
+  // 4th arg = pair sub-style index if name used multiple times
+  // allow for "none" as valid sub-style name
+
+  int multflag = 0;  // stays defined even if the loop body never executes
+  int m;
+
+  for (m = 0; m < nstyles; m++) {
+    multflag = 0;
+    if (strcmp(arg[2],keywords[m]) == 0) {
+      if (multiple[m]) {
+        multflag = 1;
+        if (narg < 4) error->all(FLERR,"Incorrect args for pair coefficients");
+        if (!isdigit((unsigned char) arg[3][0]))  // negative char is UB
+          error->all(FLERR,"Incorrect args for pair coefficients");
+        int index = force->inumeric(FLERR,arg[3]);
+        if (index == multiple[m]) break;
+        else continue;
+      } else break;
+    }
+  }
+
+  int none = 0;
+  if (m == nstyles) {  // no sub-style matched: only "none" is acceptable
+    if (strcmp(arg[2],"none") == 0) none = 1;
+    else error->all(FLERR,"Pair coeff for hybrid has invalid style");
+  }
+
+  // move 1st/2nd args to 2nd/3rd args
+  // if multflag: move 1st/2nd args to 3rd/4th args
+  // just copy ptrs, since arg[] points into original input line
+
+  arg[2+multflag] = arg[1];
+  arg[1+multflag] = arg[0];
+
+  // invoke sub-style coeff() starting with 1st remaining arg
+
+  if (!none) styles[m]->coeff(narg-1-multflag,&arg[1+multflag]);
+
+  // set setflag and which type pairs map to which sub-style
+  // if sub-style is none: set hybrid subflag, wipe out map
+  // else: set hybrid setflag & map only if substyle setflag is set
+  //       if sub-style is new for type pair, add as multiple mapping
+  //       if sub-style exists for type pair, don't add, just update coeffs
+
+  int count = 0;  // number of (i,j) type pairs actually set by this call
+  for (int i = ilo; i <= ihi; i++) {
+    for (int j = MAX(jlo,i); j <= jhi; j++) {  // only pairs with j >= i
+      if (none) {
+        setflag[i][j] = 1;
+        nmap[i][j] = 0;
+        count++;
+      } else if (styles[m]->setflag[i][j]) {
+        int k;
+        for (k = 0; k < nmap[i][j]; k++)
+          if (map[i][j][k] == m) break;
+        if (k == nmap[i][j]) map[i][j][nmap[i][j]++] = m;  // m not yet mapped
+        setflag[i][j] = 1;
+        count++;
+      }
+    }
+  }
+
+  if (count == 0) error->all(FLERR,"Incorrect args for pair coefficients");
+}
+
+/* ----------------------------------------------------------------------
+   combine sub-style neigh list requests and create new ones if needed
+------------------------------------------------------------------------- */
+
+void PairHybridOverlayKokkos::modify_requests()
+{
+  int i,j;
+  NeighRequest *irq,*jrq;
+
+  // loop over pair requests only
+  // if a previous list is same kind with same skip attributes
+  // then make this one a copy list of that one
+  // works whether both lists are no-skip or yes-skip
+  // will not point a list at a copy list, but at copy list's parent
+
+  for (i = 0; i < neighbor->nrequest; i++) {
+    if (!neighbor->requests[i]->pair) continue;  // skip non-pair requests
+
+    irq = neighbor->requests[i];
+    for (j = 0; j < i; j++) {  // scan earlier requests, lowest index first
+      if (!neighbor->requests[j]->pair) continue;
+      jrq = neighbor->requests[j];
+      if (irq->same_kind(jrq) && irq->same_skip(jrq)) {
+        irq->copy = 1;
+        irq->otherlist = j;  // index of the request this one copies
+        break;  // stop at the first (lowest-index) matching request
+      }
+    }
+  }
+
+  // perform same operations on skip lists as pair style = hybrid
+
+  PairHybrid::modify_requests();
+}
diff --git a/src/KOKKOS/pair_hybrid_overlay_kokkos.h b/src/KOKKOS/pair_hybrid_overlay_kokkos.h
index c9a50e3bb1..2e4899a1f3 100644
--- a/src/KOKKOS/pair_hybrid_overlay_kokkos.h
+++ b/src/KOKKOS/pair_hybrid_overlay_kokkos.h
@@ -20,14 +20,18 @@ PairStyle(hybrid/overlay/kk,PairHybridOverlayKokkos)
 #ifndef LMP_PAIR_HYBRID_OVERLAY_KOKKOS_H
 #define LMP_PAIR_HYBRID_OVERLAY_KOKKOS_H
 
-#include "pair_hybrid_overlay.h"
+#include "pair_hybrid_kokkos.h"
 
 namespace LAMMPS_NS {
 
-class PairHybridOverlayKokkos : public PairHybridOverlay {
+class PairHybridOverlayKokkos : public PairHybridKokkos {
  public:
   PairHybridOverlayKokkos(class LAMMPS *);
   virtual ~PairHybridOverlayKokkos() {}
+  void coeff(int, char **);
+
+ private:
+  void modify_requests();
 };
 
 }

From 688df1c2542cef4461b99e99632cce54dd0eb51d Mon Sep 17 00:00:00 2001
From: Dan Ibanez <daibane@sandia.gov>
Date: Fri, 13 Jan 2017 14:40:36 -0700
Subject: [PATCH 086/267] fix CUDA type issues in pair_table_rx_kokkos

stop using the global DAT, use the pair's
DeviceType for all the relevant types.
---
 src/KOKKOS/pair_table_rx_kokkos.cpp | 34 +++++++++++++++++------------
 src/KOKKOS/pair_table_rx_kokkos.h   |  3 ---
 2 files changed, 20 insertions(+), 17 deletions(-)

diff --git a/src/KOKKOS/pair_table_rx_kokkos.cpp b/src/KOKKOS/pair_table_rx_kokkos.cpp
index 7402a00900..58108c9308 100644
--- a/src/KOKKOS/pair_table_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_table_rx_kokkos.cpp
@@ -272,11 +272,11 @@ ev_tally(
     F_FLOAT epair, F_FLOAT fpair,
     F_FLOAT delx, F_FLOAT dely, F_FLOAT delz,
     Kokkos::View<F_FLOAT*[6],
-                 typename DAT::t_virial_array::array_layout,
+                 typename ArrayTypes<DeviceType>::t_virial_array::array_layout,
                  DeviceType,
                  Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > v_vatom,
     Kokkos::View<E_FLOAT*,
-                 typename DAT::t_efloat_1d::array_layout,
+                 typename ArrayTypes<DeviceType>::t_efloat_1d::array_layout,
                  DeviceType,
                  Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > v_eatom)
 {
@@ -390,10 +390,14 @@ compute_item(
       typename ArrayTypes<DeviceType>::t_f_array::array_layout,
       DeviceType,
       Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > f,
-    Kokkos::View<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,
-                 DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > uCG,
-    Kokkos::View<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,
-                 DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > uCGnew,
+    Kokkos::View<E_FLOAT*,
+                 typename ArrayTypes<DeviceType>::t_efloat_1d::array_layout,
+                 DeviceType,
+                 Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > uCG,
+    Kokkos::View<E_FLOAT*,
+                 typename ArrayTypes<DeviceType>::t_efloat_1d::array_layout,
+                 DeviceType,
+                 Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > uCGnew,
     int isite1, int isite2,
     typename PairTableRXKokkos<DeviceType>::TableDeviceConst d_table_const,
     int eflag,
@@ -402,11 +406,11 @@ compute_item(
     int vflag_global,
     int vflag_atom,
     Kokkos::View<F_FLOAT*[6],
-                 typename DAT::t_virial_array::array_layout,
+                 typename ArrayTypes<DeviceType>::t_virial_array::array_layout,
                  DeviceType,
                  Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > v_vatom,
     Kokkos::View<E_FLOAT*,
-                 typename DAT::t_efloat_1d::array_layout,
+                 typename ArrayTypes<DeviceType>::t_efloat_1d::array_layout,
                  DeviceType,
                  Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > v_eatom) {
   EV_FLOAT ev;
@@ -532,9 +536,11 @@ static void compute_all_items(
       typename ArrayTypes<DeviceType>::t_f_array::array_layout,
       DeviceType,
       Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > f,
-    Kokkos::View<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,
+    Kokkos::View<E_FLOAT*,
+                 typename ArrayTypes<DeviceType>::t_efloat_1d::array_layout,
                  DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > uCG,
-    Kokkos::View<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,
+    Kokkos::View<E_FLOAT*,
+                 typename ArrayTypes<DeviceType>::t_efloat_1d::array_layout,
                  DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > uCGnew,
     int isite1, int isite2,
     typename PairTableRXKokkos<DeviceType>::TableDeviceConst d_table_const,
@@ -544,11 +550,11 @@ static void compute_all_items(
     int vflag_global,
     int vflag_atom,
     Kokkos::View<F_FLOAT*[6],
-                 typename DAT::t_virial_array::array_layout,
+                 typename ArrayTypes<DeviceType>::t_virial_array::array_layout,
                  DeviceType,
                  Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > v_vatom,
     Kokkos::View<E_FLOAT*,
-                 typename DAT::t_efloat_1d::array_layout,
+                 typename ArrayTypes<DeviceType>::t_efloat_1d::array_layout,
                  DeviceType,
                  Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > v_eatom) {
   if (eflag || vflag) {
@@ -627,12 +633,12 @@ void PairTableRXKokkos<DeviceType>::compute_style(int eflag_in, int vflag_in)
   if (eflag_atom) {
     memory->destroy_kokkos(k_eatom,eatom);
     memory->create_kokkos(k_eatom,eatom,maxeatom,"pair:eatom");
-    d_eatom = k_eatom.d_view;
+    d_eatom = k_eatom.template view<DeviceType>();
   }
   if (vflag_atom) {
     memory->destroy_kokkos(k_vatom,vatom);
     memory->create_kokkos(k_vatom,vatom,maxvatom,6,"pair:vatom");
-    d_vatom = k_vatom.d_view;
+    d_vatom = k_vatom.template view<DeviceType>();
   }
 
   atomKK->sync(execution_space,datamask_read);
diff --git a/src/KOKKOS/pair_table_rx_kokkos.h b/src/KOKKOS/pair_table_rx_kokkos.h
index c7ecd370a4..54c114a433 100644
--- a/src/KOKKOS/pair_table_rx_kokkos.h
+++ b/src/KOKKOS/pair_table_rx_kokkos.h
@@ -30,9 +30,6 @@ namespace LAMMPS_NS {
 template<class DeviceType>
 class PairTableRXKokkos : public PairTable {
  public:
-
-  using DAT = ArrayTypes<DeviceType>;
-
   enum {EnabledNeighFlags=FULL|HALFTHREAD|HALF|N2};
   typedef DeviceType device_type;
 

From 91d68e26eff86b7e1fe50bb3786b13b7f6a07b30 Mon Sep 17 00:00:00 2001
From: Stan Moore <stanmoore1@gmail.com>
Date: Tue, 17 Jan 2017 12:26:00 -0700
Subject: [PATCH 087/267] Prevent overlapping host/device computation in
 pair_hybrid_kokkos

---
 src/KOKKOS/pair_hybrid_kokkos.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/KOKKOS/pair_hybrid_kokkos.cpp b/src/KOKKOS/pair_hybrid_kokkos.cpp
index 973d60348f..9c0948b7d4 100644
--- a/src/KOKKOS/pair_hybrid_kokkos.cpp
+++ b/src/KOKKOS/pair_hybrid_kokkos.cpp
@@ -35,6 +35,11 @@ using namespace LAMMPS_NS;
 PairHybridKokkos::PairHybridKokkos(LAMMPS *lmp) : PairHybrid(lmp)
 {
   atomKK = (AtomKokkos *) atom;
+
+ // prevent overlapping host/device computation, which isn't
+ //  yet supported by pair_hybrid_kokkos
+ execution_space = Device;
+
   datamask_read = EMPTY_MASK;
   datamask_modify = EMPTY_MASK;
 }

From 8b4130c0cbbbf6bfb69e01d51f5ba47c94ecd3ed Mon Sep 17 00:00:00 2001
From: Stan Moore <stanmoore1@gmail.com>
Date: Tue, 17 Jan 2017 13:28:55 -0700
Subject: [PATCH 088/267] Fixing issue with pressure in pair_hybrid_kokkos

---
 src/KOKKOS/pair_hybrid_kokkos.cpp | 9 ++++++++-
 src/KOKKOS/pair_hybrid_kokkos.h   | 9 +++++++++
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/src/KOKKOS/pair_hybrid_kokkos.cpp b/src/KOKKOS/pair_hybrid_kokkos.cpp
index 9c0948b7d4..337b56c6ce 100644
--- a/src/KOKKOS/pair_hybrid_kokkos.cpp
+++ b/src/KOKKOS/pair_hybrid_kokkos.cpp
@@ -148,5 +148,12 @@ void PairHybridKokkos::compute(int eflag, int vflag)
 
   delete [] saved_special;
 
-  if (vflag_fdotr) virial_fdotr_compute();
+  // perform virial_fdotr on device
+
+  atomKK->sync(Device,X_MASK|F_MASK);
+  x = atomKK->k_x.view<LMPDeviceType>();
+  f = atomKK->k_f.view<LMPDeviceType>();
+
+  if (vflag_fdotr)
+    pair_virial_fdotr_compute(this);
 }
diff --git a/src/KOKKOS/pair_hybrid_kokkos.h b/src/KOKKOS/pair_hybrid_kokkos.h
index cfcef7fb31..62d325925b 100644
--- a/src/KOKKOS/pair_hybrid_kokkos.h
+++ b/src/KOKKOS/pair_hybrid_kokkos.h
@@ -22,6 +22,8 @@ PairStyle(hybrid/kk,PairHybridKokkos)
 
 #include <stdio.h>
 #include "pair_hybrid.h"
+#include "pair_kokkos.h"
+#include "kokkos_type.h"
 
 namespace LAMMPS_NS {
 
@@ -33,9 +35,16 @@ class PairHybridKokkos : public PairHybrid {
   friend class Respa;
   friend class Info;
  public:
+  typedef LMPDeviceType device_type;
+
   PairHybridKokkos(class LAMMPS *);
   virtual ~PairHybridKokkos();
   void compute(int, int);
+
+ private:
+  DAT::t_x_array_randomread x;
+  DAT::t_f_array f;
+  friend void pair_virial_fdotr_compute<PairHybridKokkos>(PairHybridKokkos*);
 };
 
 }

From 5569c4c130c08656ddf4313127effcd039185bf6 Mon Sep 17 00:00:00 2001
From: Stan Moore <stanmoore1@gmail.com>
Date: Tue, 17 Jan 2017 16:19:25 -0700
Subject: [PATCH 089/267] Fixing GPU memory issue with fix_property_atom_kokkos

---
 src/KOKKOS/atom_vec_dpd_kokkos.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/KOKKOS/atom_vec_dpd_kokkos.cpp b/src/KOKKOS/atom_vec_dpd_kokkos.cpp
index 58fc9c46c3..82d45dfcd4 100644
--- a/src/KOKKOS/atom_vec_dpd_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_dpd_kokkos.cpp
@@ -86,12 +86,12 @@ void AtomVecDPDKokkos::grow(int n)
   memory->grow_kokkos(atomKK->k_uCGnew,atomKK->uCGnew,nmax,"atom:uCGnew");
   memory->grow_kokkos(atomKK->k_duChem,atomKK->duChem,nmax,"atom:duChem");
 
-  grow_reset();
-  sync(Host,ALL_MASK);
-
   if (atom->nextra_grow)
     for (int iextra = 0; iextra < atom->nextra_grow; iextra++)
       modify->fix[atom->extra_grow[iextra]]->grow_arrays(nmax);
+
+  grow_reset();
+  sync(Host,ALL_MASK);
 }
 
 /* ----------------------------------------------------------------------

From 96636c7514a8fa9f978e3bbb42972a986e458285 Mon Sep 17 00:00:00 2001
From: Stan Moore <stanmoore1@gmail.com>
Date: Tue, 17 Jan 2017 16:43:55 -0700
Subject: [PATCH 090/267] Fixing warnings in pair_exp6_rx_kokkos

---
 src/KOKKOS/pair_exp6_rx_kokkos.cpp |  4 ++--
 src/KOKKOS/pair_exp6_rx_kokkos.h   | 18 ++++++++++++++++++
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/src/KOKKOS/pair_exp6_rx_kokkos.cpp b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
index 3ce6b78e57..9be44666aa 100644
--- a/src/KOKKOS/pair_exp6_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
@@ -284,7 +284,7 @@ void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxCompute<NEIGHFLAG,NEW
   Kokkos::View<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_uCG = uCG;
   Kokkos::View<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_uCGnew = uCGnew;
 
-  int i,j,jj,jnum,itype,jtype;
+  int i,jj,jnum,itype,jtype;
   double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,evdwlOld,fpair;
   double rsq,r2inv,r6inv,forceExp6,factor_lj;
   double rCut,rCutInv,rCut2inv,rCut6inv,rCutExp,urc,durc;
@@ -508,7 +508,7 @@ void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxCompute<NEIGHFLAG,NEW
 
       if(rm12_ij!=0.0 && rm21_ij!=0.0){
         if(alpha21_ij == 6.0 || alpha12_ij == 6.0)
-          error->all(FLERR,"alpha_ij is 6.0 in pair exp6");
+          k_error_flag.d_view() = 1;
 
         // A3.  Compute some convenient quantities for evaluating the force
         rminv = 1.0/rm12_ij;
diff --git a/src/KOKKOS/pair_exp6_rx_kokkos.h b/src/KOKKOS/pair_exp6_rx_kokkos.h
index 488c9d0039..1f2172471b 100644
--- a/src/KOKKOS/pair_exp6_rx_kokkos.h
+++ b/src/KOKKOS/pair_exp6_rx_kokkos.h
@@ -150,6 +150,24 @@ class PairExp6rxKokkos : public PairExp6rx {
   friend void pair_virial_fdotr_compute<PairExp6rxKokkos>(PairExp6rxKokkos*);
 };
 
+
+// optimized version of pow(x,n) with n being integer (square-and-multiply)
+// up to 10x faster than pow(x,y); note: returns 0.0 for x == 0.0, for any n
+
+KOKKOS_INLINE_FUNCTION
+static double powint(const double &x, const int n) {
+  double yy,ww;
+
+  if (x == 0.0) return 0.0;
+  int nn = (n > 0) ? n : -n;  // |n|; its bits are consumed lowest first
+  ww = x;
+
+  for (yy = 1.0; nn != 0; nn >>= 1, ww *=ww)  // ww is squared every pass
+    if (nn & 1) yy *= ww;
+
+  return (n > 0) ? yy : 1.0/yy;  // n <= 0: reciprocal (n == 0 gives 1.0)
+}
+
 }
 
 #endif

From b38733e5a2b73e6f1a3d6ec37958bc68251f2bca Mon Sep 17 00:00:00 2001
From: Stan Moore <stanmoore1@gmail.com>
Date: Wed, 18 Jan 2017 10:15:06 -0700
Subject: [PATCH 091/267] Fixing GPU memory issue in pair_exp6_rx_kokkos

---
 src/KOKKOS/pair_exp6_rx_kokkos.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/KOKKOS/pair_exp6_rx_kokkos.cpp b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
index 9be44666aa..bde3a32b4b 100644
--- a/src/KOKKOS/pair_exp6_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
@@ -659,6 +659,10 @@ void PairExp6rxKokkos<DeviceType>::coeff(int narg, char **arg)
       s_coeffEps[i] = coeffEps[i];
       s_coeffRm[i] = coeffRm[i];
     }
+
+  k_params.template modify<LMPHostType>();
+  k_params.template sync<DeviceType>();
+  d_params = k_params.template view<DeviceType>();
 }
 
 /* ---------------------------------------------------------------------- */
@@ -776,10 +780,6 @@ void PairExp6rxKokkos<DeviceType>::read_file(char *file)
   }
 
   delete [] words;
-
-  k_params.template modify<LMPHostType>();
-  k_params.template sync<DeviceType>();
-  d_params = k_params.template view<DeviceType>();
 }
 
 /* ---------------------------------------------------------------------- */

From 2d32fa8ccb046159155212e36b09b663608525d5 Mon Sep 17 00:00:00 2001
From: Stan Moore <stanmoore1@gmail.com>
Date: Wed, 18 Jan 2017 12:53:40 -0700
Subject: [PATCH 092/267] Fixing GPU memory issues in atom_vec_dpd_kokkos

---
 src/KOKKOS/atom_vec_dpd_kokkos.cpp | 45 +++++++++++++++++++-----------
 1 file changed, 28 insertions(+), 17 deletions(-)

diff --git a/src/KOKKOS/atom_vec_dpd_kokkos.cpp b/src/KOKKOS/atom_vec_dpd_kokkos.cpp
index 82d45dfcd4..699ea61c9d 100644
--- a/src/KOKKOS/atom_vec_dpd_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_dpd_kokkos.cpp
@@ -256,7 +256,7 @@ int AtomVecDPDKokkos::pack_comm_kokkos(const int &n,
   // Choose correct forward PackComm kernel
 
   if(commKK->forward_comm_on_host) {
-    sync(Host,X_MASK);
+    sync(Host,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
     if(pbc_flag) {
       if(domain->triclinic) {
         struct AtomVecDPDKokkos_PackComm<LMPHostType,1,1> f(atomKK->k_x,
@@ -292,7 +292,7 @@ int AtomVecDPDKokkos::pack_comm_kokkos(const int &n,
     }
     LMPHostType::fence();
   } else {
-    sync(Device,X_MASK);
+    sync(Device,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
     if(pbc_flag) {
       if(domain->triclinic) {
         struct AtomVecDPDKokkos_PackComm<LMPDeviceType,1,1> f(atomKK->k_x,
@@ -400,8 +400,8 @@ struct AtomVecDPDKokkos_PackCommSelf {
 int AtomVecDPDKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &list, const int & iswap,
 										const int nfirst, const int &pbc_flag, const int* const pbc) {
   if(commKK->forward_comm_on_host) {
-    sync(Host,X_MASK);
-    modified(Host,X_MASK);
+    sync(Host,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
+    modified(Host,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
     if(pbc_flag) {
       if(domain->triclinic) {
       struct AtomVecDPDKokkos_PackCommSelf<LMPHostType,1,1> f(atomKK->k_x,
@@ -437,8 +437,8 @@ int AtomVecDPDKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &list
     }
     LMPHostType::fence();
   } else {
-    sync(Device,X_MASK);
-    modified(Device,X_MASK);
+    sync(Device,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
+    modified(Device,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
     if(pbc_flag) {
       if(domain->triclinic) {
       struct AtomVecDPDKokkos_PackCommSelf<LMPDeviceType,1,1> f(atomKK->k_x,
@@ -520,16 +520,16 @@ struct AtomVecDPDKokkos_UnpackComm {
 void AtomVecDPDKokkos::unpack_comm_kokkos(const int &n, const int &first,
     const DAT::tdual_xfloat_2d &buf ) {
   if(commKK->forward_comm_on_host) {
-    sync(Host,X_MASK);
-    modified(Host,X_MASK);
+    sync(Host,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
+    modified(Host,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
     struct AtomVecDPDKokkos_UnpackComm<LMPHostType> f(atomKK->k_x,
     atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,
     buf,first);
     Kokkos::parallel_for(n,f);
     LMPDeviceType::fence();
   } else {
-    sync(Device,X_MASK);
-    modified(Device,X_MASK);
+    sync(Device,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
+    modified(Device,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
     struct AtomVecDPDKokkos_UnpackComm<LMPDeviceType> f(atomKK->k_x,
     atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,
     buf,first);
@@ -1107,9 +1107,13 @@ struct AtomVecDPDKokkos_UnpackBorder {
 
 void AtomVecDPDKokkos::unpack_border_kokkos(const int &n, const int &first,
                      const DAT::tdual_xfloat_2d &buf,ExecutionSpace space) {
-  modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK);
+  modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|
+                 DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK|
+                 UCG_MASK|UCGNEW_MASK);
   while (first+n >= nmax) grow(0);
-  modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK);
+  modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|
+                 DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK|
+                 UCG_MASK|UCGNEW_MASK);
   if(space==Host) {
     struct AtomVecDPDKokkos_UnpackBorder<LMPHostType> f(buf.view<LMPHostType>(),
       h_x,h_tag,h_type,h_mask,
@@ -1137,7 +1141,9 @@ void AtomVecDPDKokkos::unpack_border(int n, int first, double *buf)
   last = first + n;
   for (i = first; i < last; i++) {
     if (i == nmax) grow(0);
-    modified(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK);
+    modified(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|
+                 DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK|
+                 UCG_MASK|UCGNEW_MASK);
     h_x(i,0) = buf[m++];
     h_x(i,1) = buf[m++];
     h_x(i,2) = buf[m++];
@@ -1168,7 +1174,9 @@ void AtomVecDPDKokkos::unpack_border_vel(int n, int first, double *buf)
   last = first + n;
   for (i = first; i < last; i++) {
     if (i == nmax) grow(0);
-    modified(Host,X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK);
+    modified(Host,X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|
+                 DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK|
+                 UCG_MASK|UCGNEW_MASK);
     h_x(i,0) = buf[m++];
     h_x(i,1) = buf[m++];
     h_x(i,2) = buf[m++];
@@ -1489,7 +1497,8 @@ int AtomVecDPDKokkos::unpack_exchange(double *buf)
   int nlocal = atom->nlocal;
   if (nlocal == nmax) grow(0);
   modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
-           MASK_MASK | IMAGE_MASK);
+           MASK_MASK | IMAGE_MASK| DPDTHETA_MASK | UCOND_MASK |
+           UMECH_MASK | UCHEM_MASK | UCG_MASK | UCGNEW_MASK);
 
   int m = 1;
   h_x(nlocal,0) = buf[m++];
@@ -1547,7 +1556,8 @@ int AtomVecDPDKokkos::size_restart()
 int AtomVecDPDKokkos::pack_restart(int i, double *buf)
 {
   sync(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
-            MASK_MASK | IMAGE_MASK );
+            MASK_MASK | IMAGE_MASK | DPDTHETA_MASK |
+            UCOND_MASK | UMECH_MASK | UCHEM_MASK | DVECTOR_MASK);
 
   int m = 1;
   buf[m++] = h_x(i,0);
@@ -1586,7 +1596,8 @@ int AtomVecDPDKokkos::unpack_restart(double *buf)
       memory->grow(atom->extra,nmax,atom->nextra_store,"atom:extra");
   }
   modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
-                MASK_MASK | IMAGE_MASK );
+                MASK_MASK | IMAGE_MASK | DPDTHETA_MASK |
+                UCOND_MASK | UMECH_MASK | UCHEM_MASK | DVECTOR_MASK);
 
   int m = 1;
   h_x(nlocal,0) = buf[m++];

From e05b1322895337ed3653b74adebfc54208db3649 Mon Sep 17 00:00:00 2001
From: Stan Moore <stanmoore1@gmail.com>
Date: Wed, 18 Jan 2017 14:18:35 -0700
Subject: [PATCH 093/267] Fixing error check in fix_eos_table_rx_kokkos

---
 src/KOKKOS/fix_eos_table_rx_kokkos.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/KOKKOS/fix_eos_table_rx_kokkos.cpp b/src/KOKKOS/fix_eos_table_rx_kokkos.cpp
index aff2cdfa2d..40b44d6744 100644
--- a/src/KOKKOS/fix_eos_table_rx_kokkos.cpp
+++ b/src/KOKKOS/fix_eos_table_rx_kokkos.cpp
@@ -404,7 +404,8 @@ void FixEOStableRXKokkos<DeviceType>::temperature_lookup(int id, double ui, doub
   if(it==maxit){
     if(isnan(f1) || isnan(f2) || isnan(ui) || isnan(thetai) || isnan(t1) || isnan(t2))
       k_error_flag.d_view() = 2;
-    k_error_flag.d_view() = 3;
+    else
+      k_error_flag.d_view() = 3;
   }
   thetai = temp;
 }

From 116ae9d0c42aa949f9478a95ba20654804442381 Mon Sep 17 00:00:00 2001
From: Stan Moore <stanmoore1@gmail.com>
Date: Wed, 18 Jan 2017 14:51:35 -0700
Subject: [PATCH 094/267] Fixing copy bug in pair_exp6_rx_kokkos

---
 src/KOKKOS/pair_exp6_rx_kokkos.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/KOKKOS/pair_exp6_rx_kokkos.cpp b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
index bde3a32b4b..acba9e473b 100644
--- a/src/KOKKOS/pair_exp6_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
@@ -1095,7 +1095,7 @@ void PairExp6rxKokkos<DeviceType>::polynomialScaling(double phi, double &alpha,
 
     alpha = (s_coeffAlpha[0]*phi5 + s_coeffAlpha[1]*phi4 + s_coeffAlpha[2]*phi3 + s_coeffAlpha[3]*phi2 + s_coeffAlpha[4]*phi + s_coeffAlpha[5]);
     epsilon *= (s_coeffEps[0]*phi5 + s_coeffEps[1]*phi4 + s_coeffEps[2]*phi3 + s_coeffEps[3]*phi2 + s_coeffEps[4]*phi + s_coeffEps[5]);
-    rm *= (s_coeffEps[0]*phi5 + s_coeffEps[1]*phi4 + s_coeffEps[2]*phi3 + s_coeffEps[3]*phi2 + s_coeffEps[4]*phi + s_coeffEps[5]);
+    rm *= (s_coeffRm[0]*phi5 + s_coeffRm[1]*phi4 + s_coeffRm[2]*phi3 + s_coeffRm[3]*phi2 + s_coeffRm[4]*phi + s_coeffRm[5]);
 }
 
 /* ---------------------------------------------------------------------- */

From cf83ce454369b365efff7210382b7f2a3a246cf1 Mon Sep 17 00:00:00 2001
From: Stan Moore <stanmoore1@gmail.com>
Date: Thu, 19 Jan 2017 08:44:30 -0700
Subject: [PATCH 095/267] Adding zero compute to pair_dpd_fdt_energy_kokkos

---
 src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp | 32 ++++++++++++-----------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp
index 5de2b38ed0..ec807a0e08 100644
--- a/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp
+++ b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp
@@ -170,21 +170,23 @@ void PairDPDfdtEnergyKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
   EV_FLOAT ev;
 
   if (splitFDT_flag) {
-    if (neighflag == HALF) {
-      if (newton_pair) {
-        if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALF,1,1> >(0,inum),*this,ev);
-        else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALF,1,0> >(0,inum),*this);
-      } else {
-        if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALF,0,1> >(0,inum),*this,ev);
-        else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALF,0,0> >(0,inum),*this);
-      }
-    } else if (neighflag == HALFTHREAD) {
-      if (newton_pair) {
-        if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALFTHREAD,1,1> >(0,inum),*this,ev);
-        else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALFTHREAD,1,0> >(0,inum),*this);
-      } else {
-        if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALFTHREAD,0,1> >(0,inum),*this,ev);
-        else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALFTHREAD,0,0> >(0,inum),*this);
+    if (!a0_is_zero) {
+      if (neighflag == HALF) {
+        if (newton_pair) {
+          if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALF,1,1> >(0,inum),*this,ev);
+          else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALF,1,0> >(0,inum),*this);
+        } else {
+          if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALF,0,1> >(0,inum),*this,ev);
+          else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALF,0,0> >(0,inum),*this);
+        }
+      } else if (neighflag == HALFTHREAD) {
+        if (newton_pair) {
+          if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALFTHREAD,1,1> >(0,inum),*this,ev);
+          else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALFTHREAD,1,0> >(0,inum),*this);
+        } else {
+          if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALFTHREAD,0,1> >(0,inum),*this,ev);
+          else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALFTHREAD,0,0> >(0,inum),*this);
+        }
       }
     }
   } else {

From 917ca19b340dec624890201ebb7280c2a64fef0a Mon Sep 17 00:00:00 2001
From: Stan Moore <stanmoore1@gmail.com>
Date: Thu, 19 Jan 2017 09:54:15 -0700
Subject: [PATCH 096/267] Fixing GPU memory issue in modify_kokkos, need to
 cherry pick back to Master

---
 src/KOKKOS/modify_kokkos.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/KOKKOS/modify_kokkos.cpp b/src/KOKKOS/modify_kokkos.cpp
index ec3831dff8..b4a89c8e39 100644
--- a/src/KOKKOS/modify_kokkos.cpp
+++ b/src/KOKKOS/modify_kokkos.cpp
@@ -360,9 +360,7 @@ void ModifyKokkos::post_run()
   for (int i = 0; i < nfix; i++) {
     atomKK->sync(fix[i]->execution_space,
                  fix[i]->datamask_read);
-    if (!fix[i]->kokkosable) lmp->kokkos->auto_sync = 1;
     fix[i]->post_run();
-    lmp->kokkos->auto_sync = 0;
     atomKK->modified(fix[i]->execution_space,
                      fix[i]->datamask_modify);
   }

From de6442d8450cbebc62267dcc6872c58e68947766 Mon Sep 17 00:00:00 2001
From: Stan Moore <stanmoore1@gmail.com>
Date: Thu, 19 Jan 2017 11:55:22 -0700
Subject: [PATCH 097/267] Fixing GPU memory issues in Kokkos

---
 src/KOKKOS/domain_kokkos.cpp | 4 ++--
 src/KOKKOS/verlet_kokkos.cpp | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/KOKKOS/domain_kokkos.cpp b/src/KOKKOS/domain_kokkos.cpp
index cf65316ec9..5c1f1a60b9 100644
--- a/src/KOKKOS/domain_kokkos.cpp
+++ b/src/KOKKOS/domain_kokkos.cpp
@@ -354,7 +354,6 @@ void DomainKokkos::pbc()
   }
 
   atomKK->sync(Device,X_MASK|V_MASK|MASK_MASK|IMAGE_MASK);
-  atomKK->modified(Device,X_MASK|V_MASK|IMAGE_MASK);
 
   if (xperiodic || yperiodic || zperiodic) {
     if (deform_vremap) {
@@ -385,8 +384,9 @@ void DomainKokkos::pbc()
       Kokkos::parallel_for(nlocal,f);
     }
   }
-
   LMPDeviceType::fence();
+
+  atomKK->modified(Device,X_MASK|V_MASK|IMAGE_MASK);
 }
 
 /* ----------------------------------------------------------------------
diff --git a/src/KOKKOS/verlet_kokkos.cpp b/src/KOKKOS/verlet_kokkos.cpp
index 20c4035276..53b4042376 100644
--- a/src/KOKKOS/verlet_kokkos.cpp
+++ b/src/KOKKOS/verlet_kokkos.cpp
@@ -170,7 +170,7 @@ void VerletKokkos::setup()
 
   modify->setup(vflag);
   output->setup();
-  lmp->kokkos->auto_sync = 0;
+  lmp->kokkos->auto_sync = 1;
   update->setupflag = 1;
 }
 

From 521f3df3d5939fe2d61b3fb9f2e756200822ba6e Mon Sep 17 00:00:00 2001
From: Stan Moore <stanmoore1@gmail.com>
Date: Thu, 19 Jan 2017 16:54:50 -0700
Subject: [PATCH 098/267] Initialize variables in pair_exp6_rx_kokkos

---
 src/KOKKOS/pair_exp6_rx_kokkos.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/KOKKOS/pair_exp6_rx_kokkos.cpp b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
index acba9e473b..dd3228efc4 100644
--- a/src/KOKKOS/pair_exp6_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
@@ -311,6 +311,9 @@ void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxCompute<NEIGHFLAG,NEW
   double mixWtSite1_i, mixWtSite1_j;
   double mixWtSite2_i, mixWtSite2_j;
 
+  fpairOldEXP6_12 = 0.0;
+  fpairOldEXP6_21 = 0.0;
+
   const int nRep = 12;
   const double shift = 1.05;
   double rin1, aRep, uin1, win1, uin1rep, rin1exp, rin6, rin6inv;

From 5897955e2ee481733f61aa85767b4d2e7b228626 Mon Sep 17 00:00:00 2001
From: Stan Moore <stanmoore1@gmail.com>
Date: Tue, 24 Jan 2017 11:24:47 -0700
Subject: [PATCH 099/267] Fixing GPU memory issue in fix_eos_table_rx_kokkos

---
 src/KOKKOS/fix_eos_table_rx_kokkos.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/KOKKOS/fix_eos_table_rx_kokkos.cpp b/src/KOKKOS/fix_eos_table_rx_kokkos.cpp
index 40b44d6744..38222e6dd7 100644
--- a/src/KOKKOS/fix_eos_table_rx_kokkos.cpp
+++ b/src/KOKKOS/fix_eos_table_rx_kokkos.cpp
@@ -45,6 +45,8 @@ template<class DeviceType>
 FixEOStableRXKokkos<DeviceType>::FixEOStableRXKokkos(LAMMPS *lmp, int narg, char **arg) :
   FixEOStableRX(lmp, narg, arg)
 {
+  int kokkosable = 1;
+
   atomKK = (AtomKokkos *) atom;
   execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
   datamask_read = EMPTY_MASK;
@@ -181,7 +183,7 @@ void FixEOStableRXKokkos<DeviceType>::init()
   } else {
     atomKK->sync(execution_space,MASK_MASK | UCOND_MASK | UMECH_MASK | UCHEM_MASK | DPDTHETA_MASK | DVECTOR_MASK);
     Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagFixEOStableRXInit>(0,nlocal),*this);
-    atomKK->modified(execution_space,UCOND_MASK | UMECH_MASK | UCHEM_MASK | DPDTHETA_MASK);
+    atomKK->modified(execution_space,UCOND_MASK | UMECH_MASK | UCHEM_MASK);
   }
 
   error_check();
@@ -223,9 +225,8 @@ void FixEOStableRXKokkos<DeviceType>::post_integrate()
   dvector = atomKK->k_dvector.view<DeviceType>();
 
   atomKK->sync(execution_space,MASK_MASK | UCOND_MASK | UMECH_MASK | UCHEM_MASK | DPDTHETA_MASK | DVECTOR_MASK);
-  atomKK->modified(execution_space,DPDTHETA_MASK);
-
   Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagFixEOStableRXTemperatureLookup2>(0,nlocal),*this);
+  atomKK->modified(execution_space,DPDTHETA_MASK);
 
   error_check();
 

From 8e808f6c6b861cd46329b3c4d58e97631661896d Mon Sep 17 00:00:00 2001
From: Stan Moore <stanmoore1@gmail.com>
Date: Tue, 24 Jan 2017 11:45:27 -0700
Subject: [PATCH 100/267] Zeroing variables in pair_exp6_rx_kokkos to match
 pull request

---
 src/KOKKOS/pair_exp6_rx_kokkos.cpp | 66 ++++++++++++++++--------------
 1 file changed, 35 insertions(+), 31 deletions(-)

diff --git a/src/KOKKOS/pair_exp6_rx_kokkos.cpp b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
index dd3228efc4..23c217ef6e 100644
--- a/src/KOKKOS/pair_exp6_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
@@ -311,9 +311,6 @@ void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxCompute<NEIGHFLAG,NEW
   double mixWtSite1_i, mixWtSite1_j;
   double mixWtSite2_i, mixWtSite2_j;
 
-  fpairOldEXP6_12 = 0.0;
-  fpairOldEXP6_21 = 0.0;
-
   const int nRep = 12;
   const double shift = 1.05;
   double rin1, aRep, uin1, win1, uin1rep, rin1exp, rin6, rin6inv;
@@ -415,6 +412,13 @@ void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxCompute<NEIGHFLAG,NEW
       rm21_ij = 0.5*(rm2_i + rm1_j);
       epsilon21_ij = sqrt(epsilon2_i*epsilon1_j);
 
+      evdwlOldEXP6_12 = 0.0;
+      evdwlOldEXP6_21 = 0.0;
+      evdwlEXP6_12 = 0.0;
+      evdwlEXP6_21 = 0.0;
+      fpairOldEXP6_12 = 0.0;
+      fpairOldEXP6_21 = 0.0;
+
       if(rmOld12_ij!=0.0 && rmOld21_ij!=0.0){
         if(alphaOld21_ij == 6.0 || alphaOld12_ij == 6.0)
           k_error_flag.d_view() = 1;
@@ -577,35 +581,35 @@ void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxCompute<NEIGHFLAG,NEW
         } else {
           evdwlEXP6_21 = buck1*(6.0*rexp - alpha21_ij*rm6ij*r6inv) - urc - durc*(r-rCut);
         }
-
-        //
-        // Apply Mixing Rule to get the overall force for the CG pair
-        //
-        if (isite1 == isite2) fpair = sqrt(mixWtSite1old_i*mixWtSite2old_j)*fpairOldEXP6_12;
-        else fpair = sqrt(mixWtSite1old_i*mixWtSite2old_j)*fpairOldEXP6_12 + sqrt(mixWtSite2old_i*mixWtSite1old_j)*fpairOldEXP6_21;
-
-        fx_i += delx*fpair;
-        fy_i += dely*fpair;
-        fz_i += delz*fpair;
-        if (NEWTON_PAIR || j < nlocal) {
-          a_f(j,0) -= delx*fpair;
-          a_f(j,1) -= dely*fpair;
-          a_f(j,2) -= delz*fpair;
-        }
-
-        if (isite1 == isite2) evdwl = sqrt(mixWtSite1_i*mixWtSite2_j)*evdwlEXP6_12;
-        else evdwl = sqrt(mixWtSite1_i*mixWtSite2_j)*evdwlEXP6_12 + sqrt(mixWtSite2_i*mixWtSite1_j)*evdwlEXP6_21;
-        evdwl *= factor_lj;
-
-        uCGnew_i   += 0.5*evdwl;
-        if (NEWTON_PAIR || j < nlocal)
-          a_uCGnew[j] += 0.5*evdwl;
-        evdwl = evdwlOld;
-        if (EVFLAG)
-          ev.evdwl += ((NEWTON_PAIR||(j<nlocal))?1.0:0.5)*evdwl;
-        //if (vflag_either || eflag_atom) 
-        if (EVFLAG) this->template ev_tally<NEIGHFLAG,NEWTON_PAIR>(ev,i,j,evdwl,fpair,delx,dely,delz);
       }
+
+      //
+      // Apply Mixing Rule to get the overall force for the CG pair
+      //
+      if (isite1 == isite2) fpair = sqrt(mixWtSite1old_i*mixWtSite2old_j)*fpairOldEXP6_12;
+      else fpair = sqrt(mixWtSite1old_i*mixWtSite2old_j)*fpairOldEXP6_12 + sqrt(mixWtSite2old_i*mixWtSite1old_j)*fpairOldEXP6_21;
+
+      fx_i += delx*fpair;
+      fy_i += dely*fpair;
+      fz_i += delz*fpair;
+      if (NEWTON_PAIR || j < nlocal) {
+        a_f(j,0) -= delx*fpair;
+        a_f(j,1) -= dely*fpair;
+        a_f(j,2) -= delz*fpair;
+      }
+
+      if (isite1 == isite2) evdwl = sqrt(mixWtSite1_i*mixWtSite2_j)*evdwlEXP6_12;
+      else evdwl = sqrt(mixWtSite1_i*mixWtSite2_j)*evdwlEXP6_12 + sqrt(mixWtSite2_i*mixWtSite1_j)*evdwlEXP6_21;
+      evdwl *= factor_lj;
+
+      uCGnew_i   += 0.5*evdwl;
+      if (NEWTON_PAIR || j < nlocal)
+        a_uCGnew[j] += 0.5*evdwl;
+      evdwl = evdwlOld;
+      if (EVFLAG)
+        ev.evdwl += ((NEWTON_PAIR||(j<nlocal))?1.0:0.5)*evdwl;
+      //if (vflag_either || eflag_atom) 
+      if (EVFLAG) this->template ev_tally<NEIGHFLAG,NEWTON_PAIR>(ev,i,j,evdwl,fpair,delx,dely,delz);
     }
   }
 

From c617bc180afd1295fd49ffa71fdf779e8bf67603 Mon Sep 17 00:00:00 2001
From: Stan Moore <stanmoore1@gmail.com>
Date: Thu, 26 Jan 2017 08:52:17 -0700
Subject: [PATCH 101/267] Adding sync/modify to pair_multi_lucy_rx_kokkos

---
 src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp b/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp
index fac1478e32..8399fccc64 100644
--- a/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp
@@ -183,8 +183,6 @@ void PairMultiLucyRXKokkos<DeviceType>::compute_style(int eflag_in, int vflag_in
   dvector = atomKK->k_dvector.view<DeviceType>();
 
   atomKK->sync(execution_space,X_MASK | F_MASK | TYPE_MASK | ENERGY_MASK | VIRIAL_MASK | DPDRHO_MASK | UCG_MASK | UCGNEW_MASK | DVECTOR_MASK);
-  if (evflag) atomKK->modified(execution_space,F_MASK | ENERGY_MASK | VIRIAL_MASK | UCG_MASK | UCGNEW_MASK);
-  else atomKK->modified(execution_space,F_MASK | UCG_MASK | UCGNEW_MASK);
   k_cutsq.template sync<DeviceType>();
 
   nlocal = atom->nlocal;
@@ -231,6 +229,9 @@ void PairMultiLucyRXKokkos<DeviceType>::compute_style(int eflag_in, int vflag_in
     }
   }
 
+  if (evflag) atomKK->modified(execution_space,F_MASK | ENERGY_MASK | VIRIAL_MASK | UCG_MASK | UCGNEW_MASK);
+  else atomKK->modified(execution_space,F_MASK | UCG_MASK | UCGNEW_MASK);
+
   k_error_flag.template modify<DeviceType>();
   k_error_flag.template sync<LMPHostType>();
   if (k_error_flag.h_view() == 1)
@@ -454,7 +455,6 @@ void PairMultiLucyRXKokkos<DeviceType>::computeLocalDensity()
   nlocal = atom->nlocal;
 
   atomKK->sync(execution_space,X_MASK | TYPE_MASK | DPDRHO_MASK);
-  atomKK->modified(execution_space,DPDRHO_MASK);
 
   const int inum = list->inum;
   NeighListKokkos<DeviceType>* k_list = static_cast<NeighListKokkos<DeviceType>*>(list);
@@ -492,14 +492,14 @@ void PairMultiLucyRXKokkos<DeviceType>::computeLocalDensity()
       Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXComputeLocalDensity<HALFTHREAD,0> >(0,inum),*this);
   }
 
+  atomKK->modified(execution_space,DPDRHO_MASK);
+
   // communicate and sum densities (on the host)
 
   if (newton_pair) {
-    atomKK->modified(execution_space,DPDRHO_MASK);
     atomKK->sync(Host,DPDRHO_MASK);
     comm->reverse_comm_pair(this);
     atomKK->modified(Host,DPDRHO_MASK);
-    atomKK->sync(execution_space,DPDRHO_MASK);
   }
 
   comm->forward_comm_pair(this);
@@ -687,6 +687,8 @@ int PairMultiLucyRXKokkos<DeviceType>::pack_forward_comm(int n, int *list, doubl
 {
   int i,j,m;
 
+  atomKK->sync(Host,DPDRHO_MASK);
+
   m = 0;
   for (i = 0; i < n; i++) {
     j = list[i];
@@ -705,6 +707,8 @@ void PairMultiLucyRXKokkos<DeviceType>::unpack_forward_comm(int n, int first, do
   m = 0;
   last = first + n;
   for (i = first; i < last; i++) h_rho[i] = buf[m++];
+
+  atomKK->modified(Host,DPDRHO_MASK);
 }
 
 /* ---------------------------------------------------------------------- */

From 8050eb3aa85c95fd55433208f185ff7f9bc74e02 Mon Sep 17 00:00:00 2001
From: Stan Moore <stanmoore1@gmail.com>
Date: Thu, 26 Jan 2017 09:17:59 -0700
Subject: [PATCH 102/267] Another tweak to sync/modify in
 pair_multi_lucy_rx_kokkos

---
 src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp b/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp
index 8399fccc64..2e6d48227f 100644
--- a/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp
@@ -496,11 +496,8 @@ void PairMultiLucyRXKokkos<DeviceType>::computeLocalDensity()
 
   // communicate and sum densities (on the host)
 
-  if (newton_pair) {
-    atomKK->sync(Host,DPDRHO_MASK);
+  if (newton_pair)
     comm->reverse_comm_pair(this);
-    atomKK->modified(Host,DPDRHO_MASK);
-  }
 
   comm->forward_comm_pair(this);
 }
@@ -648,6 +645,8 @@ template<class DeviceType>
 int PairMultiLucyRXKokkos<DeviceType>::pack_forward_comm_kokkos(int n, DAT::tdual_int_2d k_sendlist, int iswap_in, DAT::tdual_xfloat_1d &buf,
                                int pbc_flag, int *pbc)
 {
+  atomKK->sync(execution_space,DPDRHO_MASK);
+
   d_sendlist = k_sendlist.view<DeviceType>();
   iswap = iswap_in;
   v_buf = buf.view<DeviceType>();
@@ -672,6 +671,8 @@ void PairMultiLucyRXKokkos<DeviceType>::unpack_forward_comm_kokkos(int n, int fi
   v_buf = buf.view<DeviceType>();
   Kokkos::parallel_for(Kokkos::RangePolicy<LMPDeviceType, TagPairMultiLucyRXUnpackForwardComm>(0,n),*this);
   DeviceType::fence();
+
+  atomKK->modified(execution_space,DPDRHO_MASK);
 }
 
 template<class DeviceType>
@@ -718,6 +719,8 @@ int PairMultiLucyRXKokkos<DeviceType>::pack_reverse_comm(int n, int first, doubl
 {
   int i,m,last;
 
+  atomKK->sync(Host,DPDRHO_MASK);
+
   m = 0;
   last = first + n;
   for (i = first; i < last; i++) buf[m++] = h_rho[i];
@@ -736,6 +739,8 @@ void PairMultiLucyRXKokkos<DeviceType>::unpack_reverse_comm(int n, int *list, do
     j = list[i];
     h_rho[j] += buf[m++];
   }
+
+  atomKK->modified(Host,DPDRHO_MASK);
 }
 
 /* ---------------------------------------------------------------------- */

From 6cc969db9282e5712ce659b951615b14366a7e78 Mon Sep 17 00:00:00 2001
From: Stan Moore <stanmoore1@gmail.com>
Date: Thu, 26 Jan 2017 09:24:13 -0700
Subject: [PATCH 103/267] Fixing warnings in Kokkos

---
 src/KOKKOS/fix_eos_table_rx_kokkos.cpp | 2 +-
 src/KOKKOS/rand_pool_wrap_kokkos.h     | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/KOKKOS/fix_eos_table_rx_kokkos.cpp b/src/KOKKOS/fix_eos_table_rx_kokkos.cpp
index 38222e6dd7..8487fd4c4f 100644
--- a/src/KOKKOS/fix_eos_table_rx_kokkos.cpp
+++ b/src/KOKKOS/fix_eos_table_rx_kokkos.cpp
@@ -45,7 +45,7 @@ template<class DeviceType>
 FixEOStableRXKokkos<DeviceType>::FixEOStableRXKokkos(LAMMPS *lmp, int narg, char **arg) :
   FixEOStableRX(lmp, narg, arg)
 {
-  int kokkosable = 1;
+  kokkosable = 1;
 
   atomKK = (AtomKokkos *) atom;
   execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
diff --git a/src/KOKKOS/rand_pool_wrap_kokkos.h b/src/KOKKOS/rand_pool_wrap_kokkos.h
index 349896ee9a..ce134e5215 100644
--- a/src/KOKKOS/rand_pool_wrap_kokkos.h
+++ b/src/KOKKOS/rand_pool_wrap_kokkos.h
@@ -24,6 +24,7 @@ namespace LAMMPS_NS {
 struct RandWrap {
   class RanMars* rng;
 
+  KOKKOS_INLINE_FUNCTION
   RandWrap() {
     rng = NULL;
   }

From be13ecfa17cf837b0b1b4a69f8b0e733b9c5dae3 Mon Sep 17 00:00:00 2001
From: Stan Moore <stanmoore1@gmail.com>
Date: Thu, 26 Jan 2017 10:03:43 -0700
Subject: [PATCH 104/267] Fixing Kokkos warnings

---
 src/KOKKOS/pair_dpd_fdt_energy_kokkos.h  |  2 ++
 src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp | 13 +------------
 2 files changed, 3 insertions(+), 12 deletions(-)

diff --git a/src/KOKKOS/pair_dpd_fdt_energy_kokkos.h b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.h
index 2c2b78ac57..7d1749eb94 100644
--- a/src/KOKKOS/pair_dpd_fdt_energy_kokkos.h
+++ b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.h
@@ -80,7 +80,9 @@ class PairDPDfdtEnergyKokkos : public PairDPDfdtEnergy {
   int sbmask(const int& j) const;
 
   struct params_dpd {
+    KOKKOS_INLINE_FUNCTION
     params_dpd(){cut=0;a0=0;sigma=0;kappa=0;};
+    KOKKOS_INLINE_FUNCTION
     params_dpd(int i){cut=0;a0=0;sigma=0;kappa=0;};
     F_FLOAT cut,a0,sigma,kappa;
   };
diff --git a/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp b/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp
index 2e6d48227f..30b49a8e8d 100644
--- a/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp
@@ -54,17 +54,6 @@ enum{NONE,RLINEAR,RSQ};
 #define oneFluidParameter (-1)
 #define isOneFluid(_site) ( (_site) == oneFluidParameter )
 
-static const char cite_pair_multi_lucy_rx[] =
-  "pair_style multi/lucy/rx command:\n\n"
-  "@Article{Moore16,\n"
-  " author = {J.D. Moore, B.C. Barnes, S. Izvekov, M. Lisal, M.S. Sellers, D.E. Taylor and J. K. Brennan},\n"
-  " title = {A coarse-grain force field for RDX:  Density dependent and energy conserving},\n"
-  " journal = {J. Chem. Phys.},\n"
-  " year =    2016,\n"
-  " volume =  144\n"
-  " pages =   {104501}\n"
-  "}\n\n";
-
 /* ---------------------------------------------------------------------- */
 
 template<class DeviceType>
@@ -278,7 +267,7 @@ void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXCompute<NEI
   // The f array is atomic for Half/Thread neighbor style
   Kokkos::View<F_FLOAT*[3], typename DAT::t_f_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_f = f;
 
-  int i,jj,inum,jnum,itype,jtype,itable;
+  int i,jj,jnum,itype,jtype,itable;
   double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,evdwlOld,fpair;
   double rsq;
 

From 85c8db5f86c7becfdb6c2d6831368abebabae0d4 Mon Sep 17 00:00:00 2001
From: Stan Moore <stanmoore1@gmail.com>
Date: Thu, 26 Jan 2017 10:09:45 -0700
Subject: [PATCH 105/267] Fixing warning in pair_dpd_fdt_energy_kokkos

---
 src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp
index ec807a0e08..84a489bcc3 100644
--- a/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp
+++ b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp
@@ -277,7 +277,7 @@ void PairDPDfdtEnergyKokkos<DeviceType>::operator()(TagPairDPDfdtEnergyComputeSp
   // The f array is atomic for Half/Thread neighbor style
   Kokkos::View<F_FLOAT*[3], typename DAT::t_f_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_f = f;
 
-  int i,j,jj,inum,jnum,itype,jtype;
+  int i,j,jj,jnum,itype,jtype;
   double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
   double rsq,r,rinv,wd,wr,factor_dpd;
 

From ebe27c65e18645f6aded43a039c8c3af2337afac Mon Sep 17 00:00:00 2001
From: Stan Moore <stanmoore1@gmail.com>
Date: Thu, 26 Jan 2017 10:33:03 -0700
Subject: [PATCH 106/267] Removing duplicate code in pair_exp6_rx_kokkos

---
 src/KOKKOS/pair_exp6_rx_kokkos.cpp |  4 ++--
 src/KOKKOS/pair_exp6_rx_kokkos.h   | 18 ------------------
 2 files changed, 2 insertions(+), 20 deletions(-)

diff --git a/src/KOKKOS/pair_exp6_rx_kokkos.cpp b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
index 23c217ef6e..962dcfd031 100644
--- a/src/KOKKOS/pair_exp6_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
@@ -25,7 +25,7 @@
 #include "force.h"
 #include "neigh_list.h"
 #include "math_const.h"
-#include "math_special.h"
+#include "math_special_kokkos.h"
 #include "memory.h"
 #include "error.h"
 #include "modify.h"
@@ -36,7 +36,7 @@
 
 using namespace LAMMPS_NS;
 using namespace MathConst;
-using namespace MathSpecial;
+using namespace MathSpecialKokkos;
 
 #define MAXLINE 1024
 #define DELTA 4
diff --git a/src/KOKKOS/pair_exp6_rx_kokkos.h b/src/KOKKOS/pair_exp6_rx_kokkos.h
index 1f2172471b..488c9d0039 100644
--- a/src/KOKKOS/pair_exp6_rx_kokkos.h
+++ b/src/KOKKOS/pair_exp6_rx_kokkos.h
@@ -150,24 +150,6 @@ class PairExp6rxKokkos : public PairExp6rx {
   friend void pair_virial_fdotr_compute<PairExp6rxKokkos>(PairExp6rxKokkos*);
 };
 
-
-// optimized version of pow(x,n) with n being integer
-// up to 10x faster than pow(x,y)
-
-KOKKOS_INLINE_FUNCTION
-static double powint(const double &x, const int n) {
-  double yy,ww;
-
-  if (x == 0.0) return 0.0;
-  int nn = (n > 0) ? n : -n;
-  ww = x;
-
-  for (yy = 1.0; nn != 0; nn >>= 1, ww *=ww)
-    if (nn & 1) yy *= ww;
-
-  return (n > 0) ? yy : 1.0/yy;
-};
-
 }
 
 #endif

From a1f4551ac20e6660d7903f779ad67b4e56d7069d Mon Sep 17 00:00:00 2001
From: Stan Moore <stanmoore1@gmail.com>
Date: Fri, 27 Jan 2017 10:18:41 -0700
Subject: [PATCH 107/267] Adding missing sync/modified in atom_vec_dpd_kokkos

---
 src/KOKKOS/atom_vec_dpd_kokkos.cpp | 101 ++++++++++++++++++++++++-----
 1 file changed, 83 insertions(+), 18 deletions(-)

diff --git a/src/KOKKOS/atom_vec_dpd_kokkos.cpp b/src/KOKKOS/atom_vec_dpd_kokkos.cpp
index 699ea61c9d..820f11c215 100644
--- a/src/KOKKOS/atom_vec_dpd_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_dpd_kokkos.cpp
@@ -156,6 +156,10 @@ void AtomVecDPDKokkos::grow_reset()
 
 void AtomVecDPDKokkos::copy(int i, int j, int delflag)
 {
+  sync(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
+            MASK_MASK | IMAGE_MASK | DPDTHETA_MASK |
+            UCOND_MASK | UMECH_MASK | UCHEM_MASK | DVECTOR_MASK);
+
   h_tag[j] = h_tag[i];
   h_type[j] = h_type[i];
   mask[j] = mask[i];
@@ -176,6 +180,10 @@ void AtomVecDPDKokkos::copy(int i, int j, int delflag)
   if (atom->nextra_grow)
     for (int iextra = 0; iextra < atom->nextra_grow; iextra++)
       modify->fix[atom->extra_grow[iextra]]->copy_arrays(i,j,delflag);
+
+  modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
+                MASK_MASK | IMAGE_MASK | DPDTHETA_MASK |
+                UCOND_MASK | UMECH_MASK | UCHEM_MASK | DVECTOR_MASK);
 }
 
 /* ---------------------------------------------------------------------- */
@@ -546,6 +554,8 @@ int AtomVecDPDKokkos::pack_comm(int n, int *list, double *buf,
   int i,j,m;
   double dx,dy,dz;
 
+  sync(Host,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
+
   m = 0;
   if (pbc_flag == 0) {
     for (i = 0; i < n; i++) {
@@ -590,6 +600,8 @@ int AtomVecDPDKokkos::pack_comm_vel(int n, int *list, double *buf,
   int i,j,m;
   double dx,dy,dz,dvx,dvy,dvz;
 
+  sync(Host,X_MASK|V_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
+
   m = 0;
   if (pbc_flag == 0) {
     for (i = 0; i < n; i++) {
@@ -674,6 +686,8 @@ void AtomVecDPDKokkos::unpack_comm(int n, int first, double *buf)
     h_uMech[i] = buf[m++];
     h_uChem[i] = buf[m++];
   }
+
+  modified(Host,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
 }
 
 /* ---------------------------------------------------------------------- */
@@ -696,6 +710,8 @@ void AtomVecDPDKokkos::unpack_comm_vel(int n, int first, double *buf)
     h_uMech[i] = buf[m++];
     h_uChem[i] = buf[m++];
   }
+
+  modified(Host,X_MASK|V_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
 }
 
 /* ---------------------------------------------------------------------- */
@@ -805,6 +821,8 @@ int AtomVecDPDKokkos::pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist, DA
 {
   X_FLOAT dx,dy,dz;
 
+  sync(space,ALL_MASK);
+
   if (pbc_flag != 0) {
     if (domain->triclinic == 0) {
       dx = pbc[0]*domain->xprd;
@@ -864,6 +882,8 @@ int AtomVecDPDKokkos::pack_border(int n, int *list, double *buf,
   int i,j,m;
   double dx,dy,dz;
 
+  sync(Host,ALL_MASK);
+
   m = 0;
   if (pbc_flag == 0) {
     for (i = 0; i < n; i++) {
@@ -923,6 +943,8 @@ int AtomVecDPDKokkos::pack_border_vel(int n, int *list, double *buf,
   int i,j,m;
   double dx,dy,dz,dvx,dvy,dvz;
 
+  sync(Host,ALL_MASK);
+
   m = 0;
   if (pbc_flag == 0) {
     for (i = 0; i < n; i++) {
@@ -1016,6 +1038,9 @@ int AtomVecDPDKokkos::pack_comm_hybrid(int n, int *list, double *buf)
 {
   int i,j,m;
 
+  sync(Host,DPDTHETA_MASK | UCOND_MASK |
+            UMECH_MASK | UCHEM_MASK | UCG_MASK | UCGNEW_MASK);
+
   m = 0;
   for (i = 0; i < n; i++) {
     j = list[i];
@@ -1035,6 +1060,9 @@ int AtomVecDPDKokkos::pack_border_hybrid(int n, int *list, double *buf)
 {
   int i,j,m;
 
+  sync(Host,DPDTHETA_MASK | UCOND_MASK |
+            UMECH_MASK | UCHEM_MASK | UCG_MASK | UCGNEW_MASK);
+
   m = 0;
   for (i = 0; i < n; i++) {
     j = list[i];
@@ -1113,7 +1141,7 @@ void AtomVecDPDKokkos::unpack_border_kokkos(const int &n, const int &first,
   while (first+n >= nmax) grow(0);
   modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|
                  DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK|
-                 UCG_MASK|UCGNEW_MASK);
+                 UCG_MASK|UCGNEW_MASK|DVECTOR_MASK);
   if(space==Host) {
     struct AtomVecDPDKokkos_UnpackBorder<LMPHostType> f(buf.view<LMPHostType>(),
       h_x,h_tag,h_type,h_mask,
@@ -1141,9 +1169,7 @@ void AtomVecDPDKokkos::unpack_border(int n, int first, double *buf)
   last = first + n;
   for (i = first; i < last; i++) {
     if (i == nmax) grow(0);
-    modified(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|
-                 DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK|
-                 UCG_MASK|UCGNEW_MASK);
+
     h_x(i,0) = buf[m++];
     h_x(i,1) = buf[m++];
     h_x(i,2) = buf[m++];
@@ -1162,6 +1188,10 @@ void AtomVecDPDKokkos::unpack_border(int n, int first, double *buf)
     for (int iextra = 0; iextra < atom->nextra_border; iextra++)
       m += modify->fix[atom->extra_border[iextra]]->
         unpack_border(n,first,&buf[m]);
+
+  modified(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|
+                DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK|
+                UCG_MASK|UCGNEW_MASK|DVECTOR_MASK);
 }
 
 /* ---------------------------------------------------------------------- */
@@ -1174,9 +1204,7 @@ void AtomVecDPDKokkos::unpack_border_vel(int n, int first, double *buf)
   last = first + n;
   for (i = first; i < last; i++) {
     if (i == nmax) grow(0);
-    modified(Host,X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|
-                 DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK|
-                 UCG_MASK|UCGNEW_MASK);
+
     h_x(i,0) = buf[m++];
     h_x(i,1) = buf[m++];
     h_x(i,2) = buf[m++];
@@ -1198,6 +1226,10 @@ void AtomVecDPDKokkos::unpack_border_vel(int n, int first, double *buf)
     for (int iextra = 0; iextra < atom->nextra_border; iextra++)
       m += modify->fix[atom->extra_border[iextra]]->
         unpack_border(n,first,&buf[m]);
+
+  modified(Host,X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|
+                DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK|
+                UCG_MASK|UCGNEW_MASK|DVECTOR_MASK);
 }
 
 /* ---------------------------------------------------------------------- */
@@ -1216,6 +1248,10 @@ int AtomVecDPDKokkos::unpack_comm_hybrid(int n, int first, double *buf)
     h_uCG(i) = buf[m++];
     h_uCGnew(i) = buf[m++];
   }
+
+  modified(Host,DPDTHETA_MASK | UCOND_MASK |
+                UMECH_MASK | UCHEM_MASK | UCG_MASK | UCGNEW_MASK);
+
   return m;
 }
 
@@ -1235,6 +1271,10 @@ int AtomVecDPDKokkos::unpack_border_hybrid(int n, int first, double *buf)
     h_uCG(i) = buf[m++];
     h_uCGnew(i) = buf[m++];
   }
+
+  modified(Host,DPDTHETA_MASK | UCOND_MASK |
+                UMECH_MASK | UCHEM_MASK | UCG_MASK | UCGNEW_MASK);
+
   return m;
 }
 
@@ -1356,23 +1396,31 @@ int AtomVecDPDKokkos::pack_exchange_kokkos(const int &nsend,DAT::tdual_xfloat_2d
     int newsize = nsend*17/k_buf.view<LMPHostType>().dimension_1()+1;
     k_buf.resize(newsize,k_buf.view<LMPHostType>().dimension_1());
   }
+  sync(space,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
+             MASK_MASK | IMAGE_MASK| DPDTHETA_MASK | UCOND_MASK |
+             UMECH_MASK | UCHEM_MASK | UCG_MASK | UCGNEW_MASK |
+             DVECTOR_MASK);
   if(space == Host) {
     AtomVecDPDKokkos_PackExchangeFunctor<LMPHostType> f(atomKK,k_buf,k_sendlist,k_copylist,atom->nlocal,dim,lo,hi);
     Kokkos::parallel_for(nsend,f);
     LMPHostType::fence();
-    return nsend*17;
   } else {
     AtomVecDPDKokkos_PackExchangeFunctor<LMPDeviceType> f(atomKK,k_buf,k_sendlist,k_copylist,atom->nlocal,dim,lo,hi);
     Kokkos::parallel_for(nsend,f);
     LMPDeviceType::fence();
-    return nsend*17;
   }
+  return nsend*17;
 }
 
 /* ---------------------------------------------------------------------- */
 
 int AtomVecDPDKokkos::pack_exchange(int i, double *buf)
 {
+  sync(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
+            MASK_MASK | IMAGE_MASK| DPDTHETA_MASK | UCOND_MASK |
+            UMECH_MASK | UCHEM_MASK | UCG_MASK | UCGNEW_MASK |
+            DVECTOR_MASK);
+
   int m = 1;
   buf[m++] = h_x(i,0);
   buf[m++] = h_x(i,1);
@@ -1475,7 +1523,6 @@ int AtomVecDPDKokkos::unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf,int nre
     AtomVecDPDKokkos_UnpackExchangeFunctor<LMPHostType> f(atomKK,k_buf,k_count,dim,lo,hi);
     Kokkos::parallel_for(nrecv/17,f);
     LMPHostType::fence();
-    return k_count.h_view(0);
   } else {
     k_count.h_view(0) = nlocal;
     k_count.modify<LMPHostType>();
@@ -1485,9 +1532,14 @@ int AtomVecDPDKokkos::unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf,int nre
     LMPDeviceType::fence();
     k_count.modify<LMPDeviceType>();
     k_count.sync<LMPHostType>();
-
-    return k_count.h_view(0);
   }
+
+  modified(space,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
+                 MASK_MASK | IMAGE_MASK| DPDTHETA_MASK | UCOND_MASK |
+                 UMECH_MASK | UCHEM_MASK | UCG_MASK | UCGNEW_MASK |
+                 DVECTOR_MASK);
+
+  return k_count.h_view(0);
 }
 
 /* ---------------------------------------------------------------------- */
@@ -1496,9 +1548,6 @@ int AtomVecDPDKokkos::unpack_exchange(double *buf)
 {
   int nlocal = atom->nlocal;
   if (nlocal == nmax) grow(0);
-  modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
-           MASK_MASK | IMAGE_MASK| DPDTHETA_MASK | UCOND_MASK |
-           UMECH_MASK | UCHEM_MASK | UCG_MASK | UCGNEW_MASK);
 
   int m = 1;
   h_x(nlocal,0) = buf[m++];
@@ -1523,6 +1572,11 @@ int AtomVecDPDKokkos::unpack_exchange(double *buf)
       m += modify->fix[atom->extra_grow[iextra]]->
         unpack_exchange(nlocal,&buf[m]);
 
+  modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
+           MASK_MASK | IMAGE_MASK| DPDTHETA_MASK | UCOND_MASK |
+           UMECH_MASK | UCHEM_MASK | UCG_MASK | UCGNEW_MASK |
+           DVECTOR_MASK);
+
   atom->nlocal++;
   return m;
 }
@@ -1595,9 +1649,6 @@ int AtomVecDPDKokkos::unpack_restart(double *buf)
     if (atom->nextra_store)
       memory->grow(atom->extra,nmax,atom->nextra_store,"atom:extra");
   }
-  modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
-                MASK_MASK | IMAGE_MASK | DPDTHETA_MASK |
-                UCOND_MASK | UMECH_MASK | UCHEM_MASK | DVECTOR_MASK);
 
   int m = 1;
   h_x(nlocal,0) = buf[m++];
@@ -1621,6 +1672,10 @@ int AtomVecDPDKokkos::unpack_restart(double *buf)
     for (int i = 0; i < size; i++) extra[nlocal][i] = buf[m++];
   }
 
+  modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
+                MASK_MASK | IMAGE_MASK | DPDTHETA_MASK |
+                UCOND_MASK | UMECH_MASK | UCHEM_MASK | DVECTOR_MASK);
+
   atom->nlocal++;
   return m;
 }
@@ -1661,6 +1716,10 @@ void AtomVecDPDKokkos::create_atom(int itype, double *coord)
   h_uCGnew[nlocal] = 0.0;
   h_duChem[nlocal] = 0.0;
 
+  //atomKK->modified(Host,TAG_MASK|TYPE_MASK|DPDTHETA_MASK|X_MASK|IMAGE_MASK|
+  //                      MASK_MASK|V_MASK|DPDRHO_MASK|UCOND_MASK|UMECH_MASK|
+  //                      UCHEM_MASK|UCG_MASK|UCGNEW_MASK);
+
   atom->nlocal++;
 }
 
@@ -1716,6 +1775,8 @@ int AtomVecDPDKokkos::data_atom_hybrid(int nlocal, char **values)
 {
   h_dpdTheta(nlocal) = atof(values[0]);
 
+  atomKK->modified(Host,DPDTHETA_MASK);
+
   return 1;
 }
 
@@ -1725,6 +1786,8 @@ int AtomVecDPDKokkos::data_atom_hybrid(int nlocal, char **values)
 
 void AtomVecDPDKokkos::pack_data(double **buf)
 {
+  atomKK->sync(Host,TAG_MASK|TYPE_MASK|DPDTHETA_MASK|X_MASK|IMAGE_MASK);
+
   int nlocal = atom->nlocal;
   for (int i = 0; i < nlocal; i++) {
     buf[i][0] = ubuf(h_tag(i)).d;
@@ -1745,6 +1808,8 @@ void AtomVecDPDKokkos::pack_data(double **buf)
 
 int AtomVecDPDKokkos::pack_data_hybrid(int i, double *buf)
 {
+  atomKK->sync(Host,DPDTHETA_MASK);
+
   buf[0] = h_dpdTheta(i);
   return 1;
 }

From 43d61f313f566b53fd00112b594395a3c40b2145 Mon Sep 17 00:00:00 2001
From: Christopher Stone <chris.stone@computational-science.com>
Date: Sun, 22 Jan 2017 15:03:45 -0500
Subject: [PATCH 108/267] Initial bare-bones port of FixRX to Kokkos.

Initial port of USER-DPD/fix_rx.cpp to KOKKOS/fix_rx_kokkos.cpp.
Using parallel_reduce(...) but still using host-only data.
TODO:
  1. Switch to KOKKOS datatypes for sparse-kinetics data; dense
     is finished.
  2. Switch to using KOKKOS data for dvector.
  3. Remove dependencies in rhs(...) on atom. Store those consts
     in UserData{} or as member constants.
  4. Port ComputeLocalTemp(...) to Kokkos (needs pairing algorithm).
---
 src/KOKKOS/fix_rx_kokkos.cpp | 887 +++++++++++++++++++++++++++++++++++
 src/KOKKOS/fix_rx_kokkos.h   | 124 +++++
 2 files changed, 1011 insertions(+)
 create mode 100644 src/KOKKOS/fix_rx_kokkos.cpp
 create mode 100644 src/KOKKOS/fix_rx_kokkos.h

diff --git a/src/KOKKOS/fix_rx_kokkos.cpp b/src/KOKKOS/fix_rx_kokkos.cpp
new file mode 100644
index 0000000000..f8a10dff93
--- /dev/null
+++ b/src/KOKKOS/fix_rx_kokkos.cpp
@@ -0,0 +1,887 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#include <string.h>
+#include "fix_rx_kokkos.h"
+#include "atom_masks.h"
+#include "atom_kokkos.h"
+#include "force.h"
+#include "memory.h"
+#include "update.h"
+#include "respa.h"
+#include "modify.h"
+#include "error.h"
+#include "math_special.h"
+
+#include <float.h> // DBL_EPSILON
+
+using namespace LAMMPS_NS;
+using namespace FixConst;
+using namespace MathSpecial;
+
+#ifdef DBL_EPSILON
+  #define MY_EPSILON (10.0*DBL_EPSILON)
+#else
+  #define MY_EPSILON (10.0*2.220446049250313e-16)
+#endif
+
+#define SparseKinetics_enableIntegralReactions (true)
+#define SparseKinetics_invalidIndex (-1)
+
+namespace /* anonymous */
+{
+// File-local wall-clock helpers; time stamps are the doubles returned by MPI_Wtime().
+typedef double TimerType;
+TimerType getTimeStamp(void) { return MPI_Wtime(); }
+double getElapsedTime( const TimerType &t0, const TimerType &t1) { return t1-t0; }
+
+} // end namespace
+
+/* ---------------------------------------------------------------------- */
+
+template <typename DeviceType>
+FixRxKokkos<DeviceType>::FixRxKokkos(LAMMPS *lmp, int narg, char **arg) :
+  FixRX(lmp, narg, arg),       // all arguments are forwarded to the FixRX base class
+  pairDPDEKK(NULL),            // resolved later, in init()
+  update_kinetics_data(true)   // makes init() call create_kinetics_data()
+{
+  kokkosable = 1;
+  atomKK = (AtomKokkos *) atom;
+  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
+  datamask_read = EMPTY_MASK;
+  datamask_modify = EMPTY_MASK;
+  // Debug trace: written to stdout every time the fix is constructed.
+  printf("Inside FixRxKokkos::FixRxKokkos\n");
+}
+
+template <typename DeviceType>
+FixRxKokkos<DeviceType>::~FixRxKokkos()
+{ // Nothing is released explicitly here; only a debug trace is written to stdout.
+  printf("Inside FixRxKokkos::~FixRxKokkos\n");
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <typename DeviceType>
+void FixRxKokkos<DeviceType>::init()
+{ // Runs FixRX::init(), checks the pair style and, when flagged, builds the kinetics tables.
+  printf("Inside FixRxKokkos::init\n");
+
+  // Call the parent's version.
+  FixRX::init();
+  // pairDPDE must be convertible to the type of pairDPDEKK; the dynamic_cast yields NULL otherwise.
+  pairDPDEKK = dynamic_cast<decltype(pairDPDEKK)>(pairDPDE);
+  if (pairDPDEKK == NULL)
+    error->all(FLERR,"Must use pair_style dpd/fdt/energy/kk with fix rx/kk");
+  // update_kinetics_data is set to true by the constructor.
+  if (update_kinetics_data)
+    create_kinetics_data();
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <typename DeviceType>
+void FixRxKokkos<DeviceType>::rk4(const double t_stop, double *y, double *rwork, void* v_params) const
+{ // Fixed-step RK4: advances y[0..nspecies) in place over an interval of length t_stop using minSteps steps.
+  double *k1 = rwork;          // rwork is carved into five arrays of nspecies doubles each
+  double *k2 = k1 + nspecies;
+  double *k3 = k2 + nspecies;
+  double *k4 = k3 + nspecies;
+  double *yp = k4 + nspecies;  // scratch for the intermediate state passed to rhs()
+  // NOTE(review): with minSteps == 0 the loop below never runs and y is left unchanged.
+  const int numSteps = minSteps;
+
+  const double h = t_stop / double(numSteps);
+  // rhs() is always called with t = 0.0; rhs_dense/rhs_sparse do not read their t argument.
+  // Run the requested steps with h.
+  for (int step = 0; step < numSteps; step++)
+  {
+    // k1
+    rhs(0.0,y,k1,v_params);
+
+    // k2
+    for (int ispecies = 0; ispecies < nspecies; ispecies++)
+      yp[ispecies] = y[ispecies] + 0.5*h*k1[ispecies];
+
+    rhs(0.0,yp,k2,v_params);
+
+    // k3
+    for (int ispecies = 0; ispecies < nspecies; ispecies++)
+      yp[ispecies] = y[ispecies] + 0.5*h*k2[ispecies];
+
+    rhs(0.0,yp,k3,v_params);
+
+    // k4
+    for (int ispecies = 0; ispecies < nspecies; ispecies++)
+      yp[ispecies] = y[ispecies] + h*k3[ispecies];
+
+    rhs(0.0,yp,k4,v_params);
+    // Combine the four slopes with weights 1/6, 1/3, 1/3, 1/6.
+    for (int ispecies = 0; ispecies < nspecies; ispecies++)
+      y[ispecies] += h*(k1[ispecies]/6.0 + k2[ispecies]/3.0 + k3[ispecies]/3.0 + k4[ispecies]/6.0);
+
+  } // end for (int step...
+
+}
+
+/* ---------------------------------------------------------------------- */
+
+//     f1 = dt*f(t,x)
+//     f2 = dt*f(t+ c20*dt,x + c21*f1)
+//     f3 = dt*f(t+ c30*dt,x + c31*f1 + c32*f2)
+//     f4 = dt*f(t+ c40*dt,x + c41*f1 + c42*f2 + c43*f3)
+//     f5 = dt*f(t+dt,x + c51*f1 + c52*f2 + c53*f3 + c54*f4)
+//     f6 = dt*f(t+ c60*dt,x + c61*f1 + c62*f2 + c63*f3 + c64*f4 + c65*f5)
+//
+//     fifth-order runge-kutta integration
+//        x5 = x + b1*f1 + b3*f3 + b4*f4 + b5*f5 + b6*f6
+//     fourth-order runge-kutta integration
+//        x  = x + a1*f1 + a3*f3 + a4*f4 + a5*f5
+
+template <typename DeviceType>
+void FixRxKokkos<DeviceType>::rkf45_step (const int neq, const double h, double y[], double y_out[], double rwk[], void* v_param) const
+{ // One RKF45 trial step of size h from y[]: y_out[] receives the 4th-order result, rwk[0..neq) the |5th - 4th| error.
+   const double c21=0.25;
+   const double c31=0.09375;
+   const double c32=0.28125;
+   const double c41=0.87938097405553;
+   const double c42=-3.2771961766045;
+   const double c43=3.3208921256258;
+   const double c51=2.0324074074074;
+   const double c52=-8.0;
+   const double c53=7.1734892787524;
+   const double c54=-0.20589668615984;
+   const double c61=-0.2962962962963;
+   const double c62=2.0;
+   const double c63=-1.3816764132554;
+   const double c64=0.45297270955166;
+   const double c65=-0.275;
+   const double a1=0.11574074074074;
+   const double a3=0.54892787524366;
+   const double a4=0.5353313840156;
+   const double a5=-0.2;
+   const double b1=0.11851851851852;
+   const double b3=0.51898635477583;
+   const double b4=0.50613149034201;
+   const double b5=-0.18;
+   const double b6=0.036363636363636;
+
+   // stage increments f1..f6 (6 total, neq entries each), carved out of rwk[]
+   double* f1 = &rwk[    0];
+   double* f2 = &rwk[  neq];
+   double* f3 = &rwk[2*neq];
+   double* f4 = &rwk[3*neq];
+   double* f5 = &rwk[4*neq];
+   double* f6 = &rwk[5*neq];
+
+   // scratch for the intermediate solution.
+   //double* ytmp = &rwk[6*neq];
+   double* ytmp = y_out;
+   // rhs() is always called with t = 0.0; rhs_dense/rhs_sparse do not read their t argument.
+   // 1)
+   rhs (0.0, y, f1, v_param);
+
+   for (int k = 0; k < neq; k++){
+      f1[k] *= h;
+      ytmp[k] = y[k] + c21 * f1[k];
+   }
+
+   // 2)
+   rhs(0.0, ytmp, f2, v_param);
+
+   for (int k = 0; k < neq; k++){
+      f2[k] *= h;
+      ytmp[k] = y[k] + c31 * f1[k] + c32 * f2[k];
+   }
+
+   // 3)
+   rhs(0.0, ytmp, f3, v_param);
+
+   for (int k = 0; k < neq; k++) {
+      f3[k] *= h;
+      ytmp[k] = y[k] + c41 * f1[k] + c42 * f2[k] + c43 * f3[k];
+   }
+
+   // 4)
+   rhs(0.0, ytmp, f4, v_param);
+
+   for (int k = 0; k < neq; k++) {
+      f4[k] *= h;
+      ytmp[k] = y[k] + c51 * f1[k] + c52 * f2[k] + c53 * f3[k] + c54 * f4[k];
+   }
+
+   // 5)
+   rhs(0.0, ytmp, f5, v_param);
+
+   for (int k = 0; k < neq; k++) {
+      f5[k] *= h;
+      ytmp[k] = y[k] + c61*f1[k] + c62*f2[k] + c63*f3[k] + c64*f4[k] + c65*f5[k];
+   }
+
+   // 6)
+   rhs(0.0, ytmp, f6, v_param);
+
+   for (int k = 0; k < neq; k++)
+   {
+      //const double f6 = h * ydot[k];
+      f6[k] *= h;
+
+      // 5th-order solution.
+      const double r5 = b1*f1[k] + b3*f3[k] + b4*f4[k] + b5*f5[k] + b6*f6[k];
+
+      // 4th-order solution.
+      const double r4 = a1*f1[k] + a3*f3[k] + a4*f4[k] + a5*f5[k];
+
+      // Truncation error: difference between 4th and 5th-order solutions.
+      rwk[k] = fabs(r5 - r4); // overwrites f1[k] (f1 aliases rwk); f1[k] was already consumed above
+
+      // Update solution.
+    //y_out[k] = y[k] + r5; // Local extrapolation
+      y_out[k] = y[k] + r4;
+   }
+
+   return;
+}
+
+template <typename DeviceType>
+int FixRxKokkos<DeviceType>::rkf45_h0
+                    (const int neq, const double t, const double t_stop,
+                     const double hmin, const double hmax,
+                     double& h0, double y[], double rwk[], void* v_params) const
+{ // Estimates an initial step size; the result is written to h0 and (iter + 1) is returned.
+   // Set lower and upper bounds on h0, and take geometric mean as first trial value.
+   // Exit with this value if the bounds cross each other.
+
+   // Adjust upper bound based on ydot ...
+   double hg = sqrt(hmin*hmax);
+
+   //if (hmax < hmin)
+   //{
+   //   h0 = hg;
+   //   return;
+   //}
+
+   // Start iteration to find solution to ... {WRMS norm of (h0^2 y'' / 2)} = 1
+   // rwk[] is carved into three arrays of neq doubles: ydot, y1 and ydot1.
+   double *ydot  = rwk;
+   double *y1    = ydot + neq;
+   double *ydot1 = y1 + neq;
+
+   const int max_iters = 10;
+   bool hnew_is_ok = false;
+   double hnew = hg;
+   int iter = 0;
+
+   // compute ydot at t=t0
+   rhs (t, y, ydot, v_params);
+
+   while(1)
+   {
+      // Estimate y'' with finite-difference ...
+
+      for (int k = 0; k < neq; k++)
+         y1[k] = y[k] + hg * ydot[k];
+
+      // compute y' at t1
+      rhs (t + hg, y1, ydot1, v_params);
+
+      // Compute WRMS norm of y''
+      double yddnrm = 0.0;
+      for (int k = 0; k < neq; k++){
+         double ydd = (ydot1[k] - ydot[k]) / hg;
+         double wterr = ydd / (relTol * fabs( y[k] ) + absTol);
+         yddnrm += wterr * wterr;
+      }
+
+      yddnrm = sqrt( yddnrm / double(neq) );
+
+      //std::cout << "iter " << _iter << " hg " << hg << " y'' " << yddnrm << std::endl;
+      //std::cout << "ydot " << ydot[neq-1] << std::endl;
+
+      // should we accept this?
+      if (hnew_is_ok || iter == max_iters){
+         hnew = hg;
+         if (iter == max_iters)
+            fprintf(stderr, "ERROR_HIN_MAX_ITERS\n");
+         break;
+      }
+
+      // Get the new value of h ...
+      hnew = (yddnrm*hmax*hmax > 2.0) ? sqrt(2.0 / yddnrm) : sqrt(hg * hmax);
+
+      // test the stopping conditions.
+      double hrat = hnew / hg;
+
+      // Accept this value ... the bias factor should bring it within range.
+      if ( (hrat > 0.5) && (hrat < 2.0) )
+         hnew_is_ok = true;
+
+      // If y'' is still bad after a few iterations, just accept h and give up.
+      if ( (iter > 1) && hrat > 2.0 ) {
+         hnew = hg;
+         hnew_is_ok = true;
+      }
+
+      //printf("iter=%d, yddnrw=%e, hnew=%e, hmin=%e, hmax=%e\n", iter, yddnrm, hnew, hmin, hmax);
+
+      hg = hnew;
+      iter ++;
+   }
+
+   // bound and bias estimate
+   h0 = hnew * 0.5;
+   h0 = fmax(h0, hmin);
+   h0 = fmin(h0, hmax);
+   //printf("h0=%e, hmin=%e, hmax=%e\n", h0, hmin, hmax);
+   // NOTE(review): rhs() runs once before the loop and once per pass, i.e. iter + 2 calls -- confirm the intended count.
+   return (iter + 1);
+}
+
+template <typename DeviceType>
+void FixRxKokkos<DeviceType>::rkf45(const int neq, const double t_stop, double *y, double *rwork, void *v_param, CounterType& counter) const
+{ // Adaptive RKF45 integration of y[0..neq) in place from t = 0 to t_stop; diagnostics are added to counter.
+  // Rounding coefficient.
+  const double uround = DBL_EPSILON;
+
+  // Adaption limit (shrink or grow)
+  const double adaption_limit = 4.0;
+
+  // Safety factor on the adaption. very specific but not necessary .. 0.9 is common.
+  const double hsafe = 0.840896415;
+
+  // Time rounding factor.
+  const double tround = t_stop * uround;
+
+  // Counters for diagnostics.
+  int nst = 0; // # of steps (accepted)
+  int nit = 0; // # of iterations total
+  int nfe = 0; // # of RHS evaluations
+
+  // Min/Max step-size limits.
+  const double h_min = 100.0 * tround;
+  const double h_max = (minSteps > 0) ? t_stop / double(minSteps) : t_stop;
+
+  // Set the initial step-size. 0 forces an internal estimate ... stable Euler step size.
+  double h = (minSteps > 0) ? t_stop / double(minSteps) : 0.0;
+
+  double t = 0.0;
+
+  if (h < h_min){
+    // No usable initial step was supplied: let rkf45_h0 estimate one.
+    // Its return value seeds the RHS-evaluation counter.
+    nfe = rkf45_h0 (neq, t, t_stop, h_min, h_max, h, y, rwork, v_param);
+  }
+
+  //printf("t= %e t_stop= %e h= %e\n", t, t_stop, h);
+
+  // Integrate until we reach the end time.
+  while (fabs(t - t_stop) > tround){
+    double *yout = rwork;      // trial solution (neq entries)
+    double *eout = yout + neq; // workspace for rkf45_step; its first neq entries return the error estimate
+
+    // Take a trial step.
+    rkf45_step (neq, h, y, yout, eout, v_param);
+
+    // Estimate the solution error.
+    // ... weighted 2-norm of the error.
+    double err2 = 0.0;
+    for (int k = 0; k < neq; k++){
+      const double wterr = eout[k] / (relTol * fabs( y[k] ) + absTol);
+      err2 += wterr * wterr;
+    }
+    // RMS over the neq terms summed above (same normalisation as rkf45_h0); floored at uround.
+    double err = fmax( uround, sqrt( err2 / double(neq) ));
+
+    // Accept the solution?
+    if (err <= 1.0 || h <= h_min){
+      t += h;
+      nst++;
+
+      for (int k = 0; k < neq; k++)
+        y[k] = yout[k];
+    }
+
+    // Adjust h for the next step.
+    double hfac = hsafe * sqrt( sqrt( 1.0 / err ) );
+
+    // Limit the adaption.
+    hfac = fmax( hfac, 1.0 / adaption_limit );
+    hfac = fmin( hfac,       adaption_limit );
+
+    // Apply the adaption factor...
+    h *= hfac;
+
+    // Limit h.
+    h = fmin( h, h_max );
+    h = fmax( h, h_min );
+
+    // Stretch h if we're within 5% ... and we didn't just fail.
+    if (err <= 1.0 && (t + 1.05*h) > t_stop)
+      h = t_stop - t;
+
+    // And don't overshoot the end.
+    if (t + h > t_stop)
+      h = t_stop - t;
+
+    nit++;
+    nfe += 6; // rkf45_step evaluates the RHS six times per trial step
+
+    if (maxIters && nit > maxIters){
+      //fprintf(stderr,"atom[%d] took too many iterations in rkf45 %d %e %e\n", id, nit, t, t_stop);
+      counter.nFails ++;
+      break;
+      // We should set an error here so that the solution is not used!
+    }
+
+  } // end while
+
+  counter.nSteps += nst;
+  counter.nIters += nit;
+  counter.nFuncs += nfe;
+
+  //printf("id= %d nst= %d nit= %d\n", id, nst, nit);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <typename DeviceType>
+int FixRxKokkos<DeviceType>::rhs(double t, const double *y, double *dydt, void *params) const
+{
+  // Dispatch on the kinetics storage format: the sparse evaluator when
+  // useSparseKinetics is set, the dense-matrix evaluator otherwise.
+  return useSparseKinetics
+           ? this->rhs_sparse(t, y, dydt, params)
+           : this->rhs_dense(t, y, dydt, params);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <typename DeviceType>
+int FixRxKokkos<DeviceType>::rhs_dense(double t, const double *y, double *dydt, void *params) const
+{ // Dense-matrix ODE right-hand side: fills dydt[] from y[]; t is not read; always returns 0.
+  UserRHSData *userData = (UserRHSData *) params;
+
+  double *rxnRateLaw = userData->rxnRateLaw; // written here: one rate per reaction
+  double *kFor       = userData->kFor;       // read here: one rate constant per reaction
+  // Box volume (xprd*yprd*zprd) divided by the total atom count.
+  const double VDPD = domain->xprd * domain->yprd * domain->zprd / atom->natoms;
+  const int nspecies = atom->nspecies_dpd; // local; shadows the nspecies name used by the other methods
+
+  for(int ispecies=0; ispecies<nspecies; ispecies++)
+    dydt[ispecies] = 0.0;
+
+  // Construct the reaction rate laws
+  for(int jrxn=0; jrxn<nreactions; jrxn++){
+    double rxnRateLawForward = kFor[jrxn];
+    // Multiply in concentration^stoichReactants for every species.
+    for(int ispecies=0; ispecies<nspecies; ispecies++){
+      const double concentration = y[ispecies]/VDPD;
+      rxnRateLawForward *= pow( concentration, d_kinetics_data.stoichReactants(jrxn,ispecies) );
+      //rxnRateLawForward *= pow(concentration,stoichReactants[jrxn][ispecies]);
+    }
+    rxnRateLaw[jrxn] = rxnRateLawForward;
+  }
+
+  // Construct the reaction rates for each species
+  for(int ispecies=0; ispecies<nspecies; ispecies++)
+    for(int jrxn=0; jrxn<nreactions; jrxn++)
+    {
+      dydt[ispecies] += d_kinetics_data.stoich(jrxn,ispecies) *VDPD*rxnRateLaw[jrxn];
+      //dydt[ispecies] += stoich[jrxn][ispecies]*VDPD*rxnRateLaw[jrxn];
+    }
+
+  return 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <typename DeviceType>
+int FixRxKokkos<DeviceType>::rhs_sparse(double t, const double *y, double *dydt, void *v_params) const
+{ // Sparse-format ODE right-hand side: fills dydt[] from y[]; t is not read; always returns 0.
+   UserRHSData *userData = (UserRHSData *) v_params;
+
+   const double VDPD = domain->xprd * domain->yprd * domain->zprd / atom->natoms;
+   // Function-local shorthands, all removed again by the #undef list at the end. Note that conc aliases dydt.
+   #define kFor         (userData->kFor)
+   #define kRev         (NULL)
+   #define rxnRateLaw   (userData->rxnRateLaw)
+   #define conc         (dydt)
+   #define maxReactants (this->sparseKinetics_maxReactants)
+   #define maxSpecies   (this->sparseKinetics_maxSpecies)
+   #define nuk          (this->sparseKinetics_nuk)
+   #define nu           (this->sparseKinetics_nu)
+   #define inu          (this->sparseKinetics_inu)
+   #define isIntegral(idx) (SparseKinetics_enableIntegralReactions \
+                             && this->sparseKinetics_isIntegralReaction[idx])
+   // dydt[] is used as scratch for the concentrations until the rate laws are built.
+   for (int k = 0; k < nspecies; ++k)
+      conc[k] = y[k] / VDPD;
+
+   // Construct the reaction rate laws
+   for (int i = 0; i < nreactions; ++i)
+   {
+      double rxnRateLawForward;
+      if (isIntegral(i)){
+         rxnRateLawForward = kFor[i] * powint( conc[ nuk[i][0] ], inu[i][0]);
+         for (int kk = 1; kk < maxReactants; ++kk){
+            const int k = nuk[i][kk];
+            if (k == SparseKinetics_invalidIndex) break;
+            //if (k != SparseKinetics_invalidIndex)
+               rxnRateLawForward *= powint( conc[k], inu[i][kk] );
+         }
+      } else {
+         rxnRateLawForward = kFor[i] * pow( conc[ nuk[i][0] ], nu[i][0]);
+         for (int kk = 1; kk < maxReactants; ++kk){
+            const int k = nuk[i][kk];
+            if (k == SparseKinetics_invalidIndex) break;
+            //if (k != SparseKinetics_invalidIndex)
+               rxnRateLawForward *= pow( conc[k], nu[i][kk] );
+         }
+      }
+
+      rxnRateLaw[i] = rxnRateLawForward;
+   }
+
+   // Construct the reaction rates for each species from the
+   // Stoichiometric matrix and ROP vector.
+   for (int k = 0; k < nspecies; ++k)
+      dydt[k] = 0.0; // the concentrations stored in dydt (via conc) are no longer needed
+
+   for (int i = 0; i < nreactions; ++i){
+      // Reactants ...
+      dydt[ nuk[i][0] ] -= nu[i][0] * rxnRateLaw[i];
+      for (int kk = 1; kk < maxReactants; ++kk){
+         const int k = nuk[i][kk];
+         if (k == SparseKinetics_invalidIndex) break;
+         //if (k != SparseKinetics_invalidIndex)
+            dydt[k] -= nu[i][kk] * rxnRateLaw[i];
+      }
+
+      // Products ...
+      dydt[ nuk[i][maxReactants] ] += nu[i][maxReactants] * rxnRateLaw[i];
+      for (int kk = maxReactants+1; kk < maxSpecies; ++kk){
+         const int k = nuk[i][kk];
+         if (k == SparseKinetics_invalidIndex) break;
+         //if (k != SparseKinetics_invalidIndex)
+            dydt[k] += nu[i][kk] * rxnRateLaw[i];
+      }
+   }
+
+   // Add in the volume factor to convert to the proper units.
+   for (int k = 0; k < nspecies; ++k)
+      dydt[k] *= VDPD;
+
+   #undef kFor
+   #undef kRev
+   #undef rxnRateLaw
+   #undef conc
+   #undef maxReactants
+   #undef maxSpecies
+   #undef nuk
+   #undef nu
+   #undef inu
+   #undef isIntegral
+   //#undef invalidIndex
+
+   return 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/*template <typename DeviceType>
+  template <typename SolverType>
+    KOKKOS_INLINE_FUNCTION
+void FixRxKokkos<DeviceType>::operator()(SolverType, const int &i) const
+{
+  if (atom->mask[i] & groupbit)
+  {
+    double *rwork = new double[8*nspecies];
+
+    UserRHSData userData;
+    userData.kFor = new double[nreactions];
+    userData.rxnRateLaw = new double[nreactions];
+
+    int ode_counter[4] = { 0 };
+
+    const double theta = (localTempFlag) ? dpdThetaLocal[i] : atom->dpdTheta[i];
+
+    //Compute the reaction rate constants
+    for (int irxn = 0; irxn < nreactions; irxn++)
+    {
+      if (SolverType::setToZero)
+        userData.kFor[irxn] = 0.0;
+      else
+        userData.kFor[irxn] = Arr[irxn]*pow(theta,nArr[irxn])*exp(-Ea[irxn]/force->boltz/theta);
+    }
+
+    if (odeIntegrationFlag == ODE_LAMMPS_RK4)
+      rk4(i, rwork, &userData);
+    else if (odeIntegrationFlag == ODE_LAMMPS_RKF45)
+      rkf45(i, rwork, &userData, ode_counter);
+
+    delete [] rwork;
+    delete [] userData.kFor;
+    delete [] userData.rxnRateLaw;
+  }
+} */
+
+/* ---------------------------------------------------------------------- */
+
+template <typename DeviceType>
+void FixRxKokkos<DeviceType>::solve_reactions(void)
+{ // Currently a no-op: the entire body below is commented out.
+/*  int nlocal = atom->nlocal;
+  if (igroup == atom->firstgroup) nlocal = atom->nfirst;
+
+  using AT = ArrayTypes<DeviceType>;
+
+  atomKK->sync(execution_space, UCOND_MASK);
+  typename AT::t_efloat_1d uCond = atomKK->k_uCond.view<DeviceType>();
+  atomKK->sync(execution_space, UMECH_MASK);
+  typename AT::t_efloat_1d uMech = atomKK->k_uMech.view<DeviceType>();
+
+  pairDPDEKK->k_duCond.template sync<DeviceType>();
+  typename AT::t_efloat_1d_const duCond = pairDPDEKK->k_duCond.template view<DeviceType>();
+  pairDPDEKK->k_duMech.template sync<DeviceType>();
+  typename AT::t_efloat_1d_const duMech = pairDPDEKK->k_duMech.template view<DeviceType>();
+
+  auto dt = update->dt;
+
+  Kokkos::parallel_for(nlocal, LAMMPS_LAMBDA(int i) {
+    uCond(i) += 0.5*dt*duCond(i);
+    uMech(i) += 0.5*dt*duMech(i);
+  });
+
+  atomKK->modified(execution_space, UCOND_MASK);
+  atomKK->modified(execution_space, UMECH_MASK); */
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <typename DeviceType>
+void FixRxKokkos<DeviceType>::create_kinetics_data(void)
+{
+  printf("Inside FixRxKokkos::create_kinetics_data\n");
+
+  memory->create_kokkos( d_kinetics_data.Arr, h_kinetics_data.Arr, nreactions, "KineticsType::Arr");
+  memory->create_kokkos( d_kinetics_data.nArr, h_kinetics_data.nArr, nreactions, "KineticsType::nArr");
+  memory->create_kokkos( d_kinetics_data.Ea, h_kinetics_data.Ea, nreactions, "KineticsType::Ea");
+
+  memory->create_kokkos( d_kinetics_data.stoich, h_kinetics_data.stoich, nreactions, nspecies, "KineticsType::stoich");
+  memory->create_kokkos( d_kinetics_data.stoichReactants, h_kinetics_data.stoichReactants, nreactions, nspecies, "KineticsType::stoichReactants");
+  memory->create_kokkos( d_kinetics_data.stoichProducts, h_kinetics_data.stoichProducts, nreactions, nspecies, "KineticsType::stoichProducts");
+
+  for (int i = 0; i < nreactions; ++i)
+  {
+    h_kinetics_data.Arr[i]  = Arr[i];
+    h_kinetics_data.nArr[i] = nArr[i];
+    h_kinetics_data.Ea[i]   = Ea[i];
+
+    for (int k = 0; k < nspecies; ++k)
+    {
+      h_kinetics_data.stoich(i,k) = stoich[i][k];
+      h_kinetics_data.stoichReactants(i,k) = stoichReactants[i][k];
+      h_kinetics_data.stoichProducts(i,k) = stoichProducts[i][k];
+    }
+  }
+
+  Kokkos::deep_copy( d_kinetics_data.Arr, h_kinetics_data.Arr );
+  Kokkos::deep_copy( d_kinetics_data.nArr, h_kinetics_data.nArr );
+  Kokkos::deep_copy( d_kinetics_data.Ea, h_kinetics_data.Ea );
+  Kokkos::deep_copy( d_kinetics_data.stoich, h_kinetics_data.stoich );
+  Kokkos::deep_copy( d_kinetics_data.stoichReactants, h_kinetics_data.stoichReactants );
+  Kokkos::deep_copy( d_kinetics_data.stoichProducts, h_kinetics_data.stoichProducts );
+
+  update_kinetics_data = false;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <typename DeviceType>
+void FixRxKokkos<DeviceType>::pre_force(int vflag)
+{
+  printf("Inside FixRxKokkos<DeviceType>::pre_force localTempFlag= %d\n", localTempFlag);
+
+  if (update_kinetics_data)
+    create_kinetics_data();
+
+  TimerType timer_start = getTimeStamp();
+
+  int nlocal = atom->nlocal;
+  int nghost = atom->nghost;
+  int newton_pair = force->newton_pair;
+
+  const bool setToZero = false; // don't set the forward rates to zero.
+
+  if(localTempFlag){
+    int count = nlocal + (newton_pair ? nghost : 0);
+    dpdThetaLocal = new double[count];
+    memset(dpdThetaLocal, 0, sizeof(double)*count);
+    computeLocalTemperature();
+  }
+
+  TimerType timer_localTemperature = getTimeStamp();
+
+  // Total counters from the ODE solvers.
+  CounterType Counters;
+
+  // Set data needed in the operators.
+  int *mask = atom->mask;
+  double *dpdTheta = atom->dpdTheta;
+
+  const double boltz = force->boltz;
+  const double t_stop = update->dt; // DPD time-step and integration length.
+
+  /*if (odeIntegrationFlag == ODE_LAMMPS_RKF45 && diagnosticFrequency == 1)
+  {
+    memory->create( diagnosticCounterPerODE[StepSum], nlocal, "FixRX::diagnosticCounterPerODE");
+    memory->create( diagnosticCounterPerODE[FuncSum], nlocal, "FixRX::diagnosticCounterPerODE");
+  }*/
+
+  Kokkos::parallel_reduce( nlocal, LAMMPS_LAMBDA(int i, CounterType &counter)
+    {
+      if (mask[i] & groupbit)
+      {
+        double *y = new double[8*nspecies];
+        double *rwork = y + nspecies;
+
+        UserRHSData userData;
+        userData.kFor = new double[nreactions];
+        userData.rxnRateLaw = new double[nreactions];
+
+        CounterType counter_i;
+
+        const double theta = (localTempFlag) ? dpdThetaLocal[i] : dpdTheta[i];
+
+        //Compute the reaction rate constants
+        for (int irxn = 0; irxn < nreactions; irxn++)
+        {
+          if (setToZero)
+            userData.kFor[irxn] = 0.0;
+          else
+          {
+            userData.kFor[irxn] = d_kinetics_data.Arr(irxn) *
+                                   pow(theta, d_kinetics_data.nArr(irxn)) *
+                                   exp(-d_kinetics_data.Ea(irxn) / boltz / theta);
+            //userData.kFor[irxn] = Arr[irxn]*pow(theta,nArr[irxn])*exp(-Ea[irxn]/boltz/theta);
+          }
+        }
+
+        // Update ConcOld and initialize the ODE solution vector y[].
+        for (int ispecies = 0; ispecies < nspecies; ispecies++){
+          const double tmp = atom->dvector[ispecies][i];
+          atom->dvector[ispecies+nspecies][i] = tmp;
+          y[ispecies] = tmp;
+        }
+
+        // Solver the ODE system.
+        if (odeIntegrationFlag == ODE_LAMMPS_RK4)
+        {
+          rk4(t_stop, y, rwork, &userData);
+
+          /* This should be a duplicate of the copy-out in the 
+             rkf45 block but for the MY_EPSILON v. -1e-10 (literal)
+             difference. Can these be merged? */
+
+          // Store the solution back in atom->dvector.
+          for (int ispecies = 0; ispecies < nspecies; ispecies++){
+            if(y[ispecies] < -MY_EPSILON)
+              error->one(FLERR,"Computed concentration in RK4 solver is < -10*DBL_EPSILON");
+            else if(y[ispecies] < MY_EPSILON)
+              y[ispecies] = 0.0;
+            atom->dvector[ispecies][i] = y[ispecies];
+          }
+        }
+        else if (odeIntegrationFlag == ODE_LAMMPS_RKF45)
+        {
+          rkf45(nspecies, t_stop, y, rwork, &userData, counter_i);
+
+          // Store the solution back in atom->dvector.
+          for (int ispecies = 0; ispecies < nspecies; ispecies++){
+            if(y[ispecies] < -1.0e-10)
+              error->one(FLERR,"Computed concentration in RKF45 solver is < -1.0e-10");
+            else if(y[ispecies] < MY_EPSILON)
+              y[ispecies] = 0.0;
+            atom->dvector[ispecies][i] = y[ispecies];
+          }
+
+          //if (diagnosticFrequency == 1 && diagnosticCounterPerODE[StepSum] != NULL)
+          if (diagnosticCounterPerODE[StepSum] != NULL)
+          {
+            diagnosticCounterPerODE[StepSum][i] = counter_i.nSteps;
+            diagnosticCounterPerODE[FuncSum][i] = counter_i.nFuncs;
+          }
+        }
+
+        delete [] y;
+        delete [] userData.kFor;
+        delete [] userData.rxnRateLaw;
+
+        counter += counter_i;
+      } // if
+    } // parallel_for lambda-body
+
+    , Counters // reduction value
+  );
+
+  TimerType timer_ODE = getTimeStamp();
+
+  // Communicate the updated momenta and velocities to all nodes
+  comm->forward_comm_fix(this);
+  if(localTempFlag) delete [] dpdThetaLocal;
+
+  TimerType timer_stop = getTimeStamp();
+
+  double time_ODE = getElapsedTime(timer_localTemperature, timer_ODE);
+
+  printf("me= %d kokkos total= %g temp= %g ode= %g comm= %g nlocal= %d nfc= %d %d\n", comm->me,
+                         getElapsedTime(timer_start, timer_stop),
+                         getElapsedTime(timer_start, timer_localTemperature),
+                         getElapsedTime(timer_localTemperature, timer_ODE),
+                         getElapsedTime(timer_ODE, timer_stop), nlocal, Counters.nFuncs, Counters.nSteps);
+
+  // Warn the user if a failure was detected in the ODE solver.
+  if (Counters.nFails > 0){
+    char sbuf[128];
+    sprintf(sbuf,"in FixRX::pre_force, ODE solver failed for %d atoms.", Counters.nFails);
+    error->warning(FLERR, sbuf);
+  }
+
+/*
+  // Compute and report ODE diagnostics, if requested.
+  if (odeIntegrationFlag == ODE_LAMMPS_RKF45 && diagnosticFrequency != 0){
+    // Update the counters.
+    diagnosticCounter[StepSum] += nSteps;
+    diagnosticCounter[FuncSum] += nFuncs;
+    diagnosticCounter[TimeSum] += time_ODE;
+    diagnosticCounter[AtomSum] += nlocal;
+    diagnosticCounter[numDiagnosticCounters-1] ++;
+
+    if ( (diagnosticFrequency > 0 &&
+               ((update->ntimestep - update->firststep) % diagnosticFrequency) == 0) ||
+         (diagnosticFrequency < 0 && update->ntimestep == update->laststep) )
+      this->odeDiagnostics();
+
+    for (int i = 0; i < numDiagnosticCounters; ++i)
+      if (diagnosticCounterPerODE[i])
+        memory->destroy( diagnosticCounterPerODE[i] );
+  } */
+}
+
+namespace LAMMPS_NS {
+template class FixRxKokkos<LMPDeviceType>;
+#ifdef KOKKOS_HAVE_CUDA
+template class FixRxKokkos<LMPHostType>;
+#endif
+}
diff --git a/src/KOKKOS/fix_rx_kokkos.h b/src/KOKKOS/fix_rx_kokkos.h
new file mode 100644
index 0000000000..4a41644257
--- /dev/null
+++ b/src/KOKKOS/fix_rx_kokkos.h
@@ -0,0 +1,124 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+
+FixStyle(rx/kk,FixRxKokkos<LMPDeviceType>)
+FixStyle(rx/kk/device,FixRxKokkos<LMPDeviceType>)
+FixStyle(rx/kk/host,FixRxKokkos<LMPHostType>)
+
+#else
+
+#ifndef LMP_FIX_RX_KOKKOS_H
+#define LMP_FIX_RX_KOKKOS_H
+
+#include "fix_rx.h"
+#include "pair_dpd_fdt_energy_kokkos.h"
+#include "kokkos_type.h"
+
+namespace LAMMPS_NS {
+
+template <bool _setToZero>
+struct TagFixRxKokkosSolver
+{
+  enum { setToZero = (_setToZero == true) ? 1 : 0 };
+};
+
+template <typename DeviceType>
+class FixRxKokkos : public FixRX {
+ public:
+  FixRxKokkos(class LAMMPS *, int, char **);
+  virtual ~FixRxKokkos();
+  virtual void init();
+  virtual void pre_force(int);
+
+  //template <typename SolverTag>
+  //  KOKKOS_INLINE_FUNCTION
+  //void operator()(SolverTag, const int&) const;
+
+  struct CounterType
+  {
+    int nSteps, nIters, nFuncs, nFails;
+
+    CounterType() : nSteps(0), nIters(0), nFuncs(0), nFails(0) {};
+
+    KOKKOS_INLINE_FUNCTION
+    CounterType& operator+=(const CounterType &rhs)
+    {
+      nSteps += rhs.nSteps;
+      nIters += rhs.nIters;
+      nFuncs += rhs.nFuncs;
+      nFails += rhs.nFails;
+      return *this;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    volatile CounterType& operator+=(const volatile CounterType &rhs) volatile
+    {
+      nSteps += rhs.nSteps;
+      nIters += rhs.nIters;
+      nFuncs += rhs.nFuncs;
+      nFails += rhs.nFails;
+      return *this;
+    }
+  };
+
+ protected:
+  PairDPDfdtEnergyKokkos<DeviceType>* pairDPDEKK;
+
+  void solve_reactions(void);
+
+  int rhs(double, const double *, double *, void *) const;
+  int rhs_dense (double, const double *, double *, void *) const;
+  int rhs_sparse(double, const double *, double *, void *) const;
+
+  //!< Classic Runge-Kutta 4th-order stepper.
+  void rk4(const double t_stop, double *y, double *rwork, void *v_params) const;
+
+  //!< Runge-Kutta-Fehlberg ODE Solver.
+  void rkf45(const int neq, const double t_stop, double *y, double *rwork, void *v_params, CounterType& counter) const;
+
+  //!< Runge-Kutta-Fehlberg ODE stepper function.
+  void rkf45_step (const int neq, const double h, double y[], double y_out[],
+                   double rwk[], void *) const;
+
+  //!< Initial step size estimation for the Runge-Kutta-Fehlberg ODE solver.
+  int rkf45_h0 (const int neq, const double t, const double t_stop,
+                     const double hmin, const double hmax,
+                     double& h0, double y[], double rwk[], void *v_params) const;
+
+  template <typename KokkosDeviceType>
+  struct KineticsType
+  {
+    typename ArrayTypes<KokkosDeviceType>::t_float_1d Arr, nArr, Ea;
+    typename ArrayTypes<KokkosDeviceType>::t_float_2d stoich, stoichReactants, stoichProducts;
+  };
+
+  //!< Kokkos versions of the kinetics data.
+  KineticsType<LMPHostType> h_kinetics_data;
+  KineticsType<DeviceType>  d_kinetics_data;
+
+  bool update_kinetics_data;
+
+  void create_kinetics_data(void);
+
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+*/

From 41d3903f5a7226ef4b30d3fd6b818123354300d9 Mon Sep 17 00:00:00 2001
From: Christopher Stone <chris.stone@computational-science.com>
Date: Sun, 22 Jan 2017 22:49:21 -0500
Subject: [PATCH 109/267] Added kokkos-managed parameters for FixRxKokkos.

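- Added Kokkos-managed parameter data (h_kineticsData / d_kineticsData)
  for the kinetics equations, covering both dense and sparse forms.
- Removed dependencies in rhs_dense() and rhs_sparse() on the atom and
  domain objects; VDPD is now computed in pre_force() and stored as a
  class member.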

TODO:
  1. Switch to using Kokkos data for dvector.
  2. Port computeLocalTemperature() to Kokkos (needs pairing algorithm).
---
 src/KOKKOS/fix_rx_kokkos.cpp | 135 +++++++++++++++++++++++------------
 src/KOKKOS/fix_rx_kokkos.h   |  13 +++-
 2 files changed, 101 insertions(+), 47 deletions(-)

diff --git a/src/KOKKOS/fix_rx_kokkos.cpp b/src/KOKKOS/fix_rx_kokkos.cpp
index f8a10dff93..b989d6b2d4 100644
--- a/src/KOKKOS/fix_rx_kokkos.cpp
+++ b/src/KOKKOS/fix_rx_kokkos.cpp
@@ -475,8 +475,8 @@ int FixRxKokkos<DeviceType>::rhs_dense(double t, const double *y, double *dydt,
   double *rxnRateLaw = userData->rxnRateLaw;
   double *kFor       = userData->kFor;
 
-  const double VDPD = domain->xprd * domain->yprd * domain->zprd / atom->natoms;
-  const int nspecies = atom->nspecies_dpd;
+  //const double VDPD = domain->xprd * domain->yprd * domain->zprd / atom->natoms;
+  //const int nspecies = atom->nspecies_dpd;
 
   for(int ispecies=0; ispecies<nspecies; ispecies++)
     dydt[ispecies] = 0.0;
@@ -487,7 +487,7 @@ int FixRxKokkos<DeviceType>::rhs_dense(double t, const double *y, double *dydt,
 
     for(int ispecies=0; ispecies<nspecies; ispecies++){
       const double concentration = y[ispecies]/VDPD;
-      rxnRateLawForward *= pow( concentration, d_kinetics_data.stoichReactants(jrxn,ispecies) );
+      rxnRateLawForward *= pow( concentration, d_kineticsData.stoichReactants(jrxn,ispecies) );
       //rxnRateLawForward *= pow(concentration,stoichReactants[jrxn][ispecies]);
     }
     rxnRateLaw[jrxn] = rxnRateLawForward;
@@ -497,7 +497,7 @@ int FixRxKokkos<DeviceType>::rhs_dense(double t, const double *y, double *dydt,
   for(int ispecies=0; ispecies<nspecies; ispecies++)
     for(int jrxn=0; jrxn<nreactions; jrxn++)
     {
-      dydt[ispecies] += d_kinetics_data.stoich(jrxn,ispecies) *VDPD*rxnRateLaw[jrxn];
+      dydt[ispecies] += d_kineticsData.stoich(jrxn,ispecies) *VDPD*rxnRateLaw[jrxn];
       //dydt[ispecies] += stoich[jrxn][ispecies]*VDPD*rxnRateLaw[jrxn];
     }
 
@@ -511,7 +511,7 @@ int FixRxKokkos<DeviceType>::rhs_sparse(double t, const double *y, double *dydt,
 {
    UserRHSData *userData = (UserRHSData *) v_params;
 
-   const double VDPD = domain->xprd * domain->yprd * domain->zprd / atom->natoms;
+   //const double VDPD = domain->xprd * domain->yprd * domain->zprd / atom->natoms;
 
    #define kFor         (userData->kFor)
    #define kRev         (NULL)
@@ -519,11 +519,11 @@ int FixRxKokkos<DeviceType>::rhs_sparse(double t, const double *y, double *dydt,
    #define conc         (dydt)
    #define maxReactants (this->sparseKinetics_maxReactants)
    #define maxSpecies   (this->sparseKinetics_maxSpecies)
-   #define nuk          (this->sparseKinetics_nuk)
-   #define nu           (this->sparseKinetics_nu)
-   #define inu          (this->sparseKinetics_inu)
-   #define isIntegral(idx) (SparseKinetics_enableIntegralReactions \
-                             && this->sparseKinetics_isIntegralReaction[idx])
+   #define nuk          (this->d_kineticsData.nuk)
+   #define nu           (this->d_kineticsData.nu)
+   #define inu          (this->d_kineticsData.inu)
+   #define isIntegral(idx) ( SparseKinetics_enableIntegralReactions \
+                             && this->d_kineticsData.isIntegral(idx) )
 
    for (int k = 0; k < nspecies; ++k)
       conc[k] = y[k] / VDPD;
@@ -533,20 +533,20 @@ int FixRxKokkos<DeviceType>::rhs_sparse(double t, const double *y, double *dydt,
    {
       double rxnRateLawForward;
       if (isIntegral(i)){
-         rxnRateLawForward = kFor[i] * powint( conc[ nuk[i][0] ], inu[i][0]);
+         rxnRateLawForward = kFor[i] * powint( conc[ nuk(i,0) ], inu(i,0) );
          for (int kk = 1; kk < maxReactants; ++kk){
-            const int k = nuk[i][kk];
+            const int k = nuk(i,kk);
             if (k == SparseKinetics_invalidIndex) break;
             //if (k != SparseKinetics_invalidIndex)
-               rxnRateLawForward *= powint( conc[k], inu[i][kk] );
+               rxnRateLawForward *= powint( conc[k], inu(i,kk) );
          }
       } else {
-         rxnRateLawForward = kFor[i] * pow( conc[ nuk[i][0] ], nu[i][0]);
+         rxnRateLawForward = kFor[i] * pow( conc[ nuk(i,0) ], nu(i,0) );
          for (int kk = 1; kk < maxReactants; ++kk){
-            const int k = nuk[i][kk];
+            const int k = nuk(i,kk);
             if (k == SparseKinetics_invalidIndex) break;
             //if (k != SparseKinetics_invalidIndex)
-               rxnRateLawForward *= pow( conc[k], nu[i][kk] );
+               rxnRateLawForward *= pow( conc[k], nu(i,kk) );
          }
       }
 
@@ -560,21 +560,21 @@ int FixRxKokkos<DeviceType>::rhs_sparse(double t, const double *y, double *dydt,
 
    for (int i = 0; i < nreactions; ++i){
       // Reactants ...
-      dydt[ nuk[i][0] ] -= nu[i][0] * rxnRateLaw[i];
+      dydt[ nuk(i,0) ] -= nu(i,0) * rxnRateLaw[i];
       for (int kk = 1; kk < maxReactants; ++kk){
-         const int k = nuk[i][kk];
+         const int k = nuk(i,kk);
          if (k == SparseKinetics_invalidIndex) break;
          //if (k != SparseKinetics_invalidIndex)
-            dydt[k] -= nu[i][kk] * rxnRateLaw[i];
+            dydt[k] -= nu(i,kk) * rxnRateLaw[i];
       }
 
       // Products ...
-      dydt[ nuk[i][maxReactants] ] += nu[i][maxReactants] * rxnRateLaw[i];
+      dydt[ nuk(i,maxReactants) ] += nu(i,maxReactants) * rxnRateLaw[i];
       for (int kk = maxReactants+1; kk < maxSpecies; ++kk){
-         const int k = nuk[i][kk];
+         const int k = nuk(i,kk);
          if (k == SparseKinetics_invalidIndex) break;
          //if (k != SparseKinetics_invalidIndex)
-            dydt[k] += nu[i][kk] * rxnRateLaw[i];
+            dydt[k] += nu(i,kk) * rxnRateLaw[i];
       }
    }
 
@@ -674,34 +674,76 @@ void FixRxKokkos<DeviceType>::create_kinetics_data(void)
 {
   printf("Inside FixRxKokkos::create_kinetics_data\n");
 
-  memory->create_kokkos( d_kinetics_data.Arr, h_kinetics_data.Arr, nreactions, "KineticsType::Arr");
-  memory->create_kokkos( d_kinetics_data.nArr, h_kinetics_data.nArr, nreactions, "KineticsType::nArr");
-  memory->create_kokkos( d_kinetics_data.Ea, h_kinetics_data.Ea, nreactions, "KineticsType::Ea");
-
-  memory->create_kokkos( d_kinetics_data.stoich, h_kinetics_data.stoich, nreactions, nspecies, "KineticsType::stoich");
-  memory->create_kokkos( d_kinetics_data.stoichReactants, h_kinetics_data.stoichReactants, nreactions, nspecies, "KineticsType::stoichReactants");
-  memory->create_kokkos( d_kinetics_data.stoichProducts, h_kinetics_data.stoichProducts, nreactions, nspecies, "KineticsType::stoichProducts");
+  memory->create_kokkos( d_kineticsData.Arr, h_kineticsData.Arr, nreactions, "KineticsType::Arr");
+  memory->create_kokkos( d_kineticsData.nArr, h_kineticsData.nArr, nreactions, "KineticsType::nArr");
+  memory->create_kokkos( d_kineticsData.Ea, h_kineticsData.Ea, nreactions, "KineticsType::Ea");
 
   for (int i = 0; i < nreactions; ++i)
   {
-    h_kinetics_data.Arr[i]  = Arr[i];
-    h_kinetics_data.nArr[i] = nArr[i];
-    h_kinetics_data.Ea[i]   = Ea[i];
+    h_kineticsData.Arr[i]  = Arr[i];
+    h_kineticsData.nArr[i] = nArr[i];
+    h_kineticsData.Ea[i]   = Ea[i];
+  }
 
-    for (int k = 0; k < nspecies; ++k)
+  Kokkos::deep_copy( d_kineticsData.Arr, h_kineticsData.Arr );
+  Kokkos::deep_copy( d_kineticsData.nArr, h_kineticsData.nArr );
+  Kokkos::deep_copy( d_kineticsData.Ea, h_kineticsData.Ea );
+
+  if (useSparseKinetics)
+  {
+
+    memory->create_kokkos( d_kineticsData.nu , h_kineticsData.nu , nreactions, sparseKinetics_maxSpecies, "KineticsType::nu");
+    memory->create_kokkos( d_kineticsData.nuk, h_kineticsData.nuk, nreactions, sparseKinetics_maxSpecies, "KineticsType::nuk");
+
+    for (int i = 0; i < nreactions; ++i)
+      for (int k = 0; k < sparseKinetics_maxSpecies; ++k)
+      {
+        h_kineticsData.nu (i,k) = sparseKinetics_nu [i][k];
+        h_kineticsData.nuk(i,k) = sparseKinetics_nuk[i][k];
+      }
+
+    Kokkos::deep_copy( d_kineticsData.nu, h_kineticsData.nu );
+    Kokkos::deep_copy( d_kineticsData.nuk, h_kineticsData.nuk );
+
+    if (SparseKinetics_enableIntegralReactions)
     {
-      h_kinetics_data.stoich(i,k) = stoich[i][k];
-      h_kinetics_data.stoichReactants(i,k) = stoichReactants[i][k];
-      h_kinetics_data.stoichProducts(i,k) = stoichProducts[i][k];
+      memory->create_kokkos( d_kineticsData.inu, h_kineticsData.inu, nreactions, sparseKinetics_maxSpecies, "KineticsType::inu");
+      memory->create_kokkos( d_kineticsData.isIntegral, h_kineticsData.isIntegral, nreactions, "KineticsType::isIntegral");
+
+      for (int i = 0; i < nreactions; ++i)
+      {
+        h_kineticsData.isIntegral(i) = sparseKinetics_isIntegralReaction[i];
+
+        for (int k = 0; k < sparseKinetics_maxSpecies; ++k)
+          h_kineticsData.inu(i,k) = sparseKinetics_inu[i][k];
+      }
+
+      Kokkos::deep_copy( d_kineticsData.inu, h_kineticsData.inu );
+      Kokkos::deep_copy( d_kineticsData.isIntegral, h_kineticsData.isIntegral );
     }
   }
 
-  Kokkos::deep_copy( d_kinetics_data.Arr, h_kinetics_data.Arr );
-  Kokkos::deep_copy( d_kinetics_data.nArr, h_kinetics_data.nArr );
-  Kokkos::deep_copy( d_kinetics_data.Ea, h_kinetics_data.Ea );
-  Kokkos::deep_copy( d_kinetics_data.stoich, h_kinetics_data.stoich );
-  Kokkos::deep_copy( d_kinetics_data.stoichReactants, h_kinetics_data.stoichReactants );
-  Kokkos::deep_copy( d_kinetics_data.stoichProducts, h_kinetics_data.stoichProducts );
+  //else
+  //{
+
+    // Dense option
+    memory->create_kokkos( d_kineticsData.stoich, h_kineticsData.stoich, nreactions, nspecies, "KineticsType::stoich");
+    memory->create_kokkos( d_kineticsData.stoichReactants, h_kineticsData.stoichReactants, nreactions, nspecies, "KineticsType::stoichReactants");
+    memory->create_kokkos( d_kineticsData.stoichProducts, h_kineticsData.stoichProducts, nreactions, nspecies, "KineticsType::stoichProducts");
+
+    for (int i = 0; i < nreactions; ++i)
+      for (int k = 0; k < nspecies; ++k)
+      {
+        h_kineticsData.stoich(i,k) = stoich[i][k];
+        h_kineticsData.stoichReactants(i,k) = stoichReactants[i][k];
+        h_kineticsData.stoichProducts(i,k) = stoichProducts[i][k];
+      }
+
+    Kokkos::deep_copy( d_kineticsData.stoich, h_kineticsData.stoich );
+    Kokkos::deep_copy( d_kineticsData.stoichReactants, h_kineticsData.stoichReactants );
+    Kokkos::deep_copy( d_kineticsData.stoichProducts, h_kineticsData.stoichProducts );
+
+  //}
 
   update_kinetics_data = false;
 }
@@ -743,6 +785,9 @@ void FixRxKokkos<DeviceType>::pre_force(int vflag)
   const double boltz = force->boltz;
   const double t_stop = update->dt; // DPD time-step and integration length.
 
+  // Average DPD volume. Used in the RHS function.
+  this->VDPD = domain->xprd * domain->yprd * domain->zprd / atom->natoms;
+
   /*if (odeIntegrationFlag == ODE_LAMMPS_RKF45 && diagnosticFrequency == 1)
   {
     memory->create( diagnosticCounterPerODE[StepSum], nlocal, "FixRX::diagnosticCounterPerODE");
@@ -771,9 +816,9 @@ void FixRxKokkos<DeviceType>::pre_force(int vflag)
             userData.kFor[irxn] = 0.0;
           else
           {
-            userData.kFor[irxn] = d_kinetics_data.Arr(irxn) *
-                                   pow(theta, d_kinetics_data.nArr(irxn)) *
-                                   exp(-d_kinetics_data.Ea(irxn) / boltz / theta);
+            userData.kFor[irxn] = d_kineticsData.Arr(irxn) *
+                                   pow(theta, d_kineticsData.nArr(irxn)) *
+                                   exp(-d_kineticsData.Ea(irxn) / boltz / theta);
             //userData.kFor[irxn] = Arr[irxn]*pow(theta,nArr[irxn])*exp(-Ea[irxn]/boltz/theta);
           }
         }
diff --git a/src/KOKKOS/fix_rx_kokkos.h b/src/KOKKOS/fix_rx_kokkos.h
index 4a41644257..95872c67e9 100644
--- a/src/KOKKOS/fix_rx_kokkos.h
+++ b/src/KOKKOS/fix_rx_kokkos.h
@@ -75,6 +75,7 @@ class FixRxKokkos : public FixRX {
 
  protected:
   PairDPDfdtEnergyKokkos<DeviceType>* pairDPDEKK;
+  double VDPD;
 
   void solve_reactions(void);
 
@@ -100,13 +101,21 @@ class FixRxKokkos : public FixRX {
   template <typename KokkosDeviceType>
   struct KineticsType
   {
+    // Arrhenius rate coefficients.
     typename ArrayTypes<KokkosDeviceType>::t_float_1d Arr, nArr, Ea;
+
+    // Dense versions.
     typename ArrayTypes<KokkosDeviceType>::t_float_2d stoich, stoichReactants, stoichProducts;
+
+    // Sparse versions.
+    typename ArrayTypes<KokkosDeviceType>::t_int_2d   nuk, inu;
+    typename ArrayTypes<KokkosDeviceType>::t_float_2d nu;
+    typename ArrayTypes<KokkosDeviceType>::t_int_1d   isIntegral;
   };
 
   //!< Kokkos versions of the kinetics data.
-  KineticsType<LMPHostType> h_kinetics_data;
-  KineticsType<DeviceType>  d_kinetics_data;
+  KineticsType<LMPHostType> h_kineticsData;
+  KineticsType<DeviceType>  d_kineticsData;
 
   bool update_kinetics_data;
 

From 70fa9189a8c8d74ac1dc09084c15260aaa204612 Mon Sep 17 00:00:00 2001
From: Christopher Stone <chris.stone@computational-science.com>
Date: Tue, 24 Jan 2017 21:49:16 -0500
Subject: [PATCH 110/267] Updated KOKKOS installer and updated USER-DPD FixRx
 to match KOKKOS version.

- Updated the KOKKOS installer (Install.sh) to include
  fix_rx_kokkos.{cpp,h}.
- Updated the USER-DPD version of fix_rx.{cpp,h} to sync with the Kokkos
  version. This resolves the dependency of the child class (FixRxKokkos)
  on the parent class (FixRX) interface.
---
 src/KOKKOS/Install.sh   |   2 +
 src/USER-DPD/fix_rx.cpp | 244 +++++++++++++++++++++++++++++-----------
 src/USER-DPD/fix_rx.h   |  23 ++--
 3 files changed, 193 insertions(+), 76 deletions(-)

diff --git a/src/KOKKOS/Install.sh b/src/KOKKOS/Install.sh
index f53f8624c4..db4fcf8ddc 100644
--- a/src/KOKKOS/Install.sh
+++ b/src/KOKKOS/Install.sh
@@ -103,6 +103,8 @@ action fix_wall_reflect_kokkos.cpp
 action fix_wall_reflect_kokkos.h
 action fix_dpd_energy_kokkos.cpp fix_dpd_energy.cpp
 action fix_dpd_energy_kokkos.h fix_dpd_energy.h
+action fix_rx_kokkos.cpp fix_rx.cpp
+action fix_rx_kokkos.h fix_rx.h
 action gridcomm_kokkos.cpp gridcomm.cpp
 action gridcomm_kokkos.h gridcomm.h
 action improper_harmonic_kokkos.cpp improper_harmonic.cpp
diff --git a/src/USER-DPD/fix_rx.cpp b/src/USER-DPD/fix_rx.cpp
index a55ae78110..28321dbecf 100644
--- a/src/USER-DPD/fix_rx.cpp
+++ b/src/USER-DPD/fix_rx.cpp
@@ -673,7 +673,17 @@ void FixRX::setup_pre_force(int vflag)
 
   if(restartFlag){
     restartFlag = 0;
-  } else {
+  }
+  else
+  {
+    int ode_counter[4] = {0};
+
+    UserRHSData userData;
+    userData.kFor = new double[nreactions];
+    userData.rxnRateLaw = new double[nreactions];
+
+    double *rwork = new double[8*nspecies];
+
     if(localTempFlag){
       int count = nlocal + (newton_pair ? nghost : 0);
       dpdThetaLocal = new double[count];
@@ -686,22 +696,27 @@ void FixRX::setup_pre_force(int vflag)
         tmp = atom->dvector[ispecies][id];
         atom->dvector[ispecies+nspecies][id] = tmp;
       }
+
     for (int i = 0; i < nlocal; i++)
       if (mask[i] & groupbit){
 
         // Set the reaction rate constants to zero:  no reactions occur at step 0
         for(int irxn=0;irxn<nreactions;irxn++)
-          kR[irxn] = 0.0;
+          userData.kFor[irxn] = 0.0;
 
         if (odeIntegrationFlag == ODE_LAMMPS_RK4)
-          rk4(i,NULL);
+          rk4(i, rwork, &userData);
         else if (odeIntegrationFlag == ODE_LAMMPS_RKF45)
-          rkf45(i,NULL);
+          rkf45(i, rwork, &userData, ode_counter);
       }
 
     // Communicate the updated momenta and velocities to all nodes
     comm->forward_comm_fix(this);
     if(localTempFlag) delete [] dpdThetaLocal;
+
+    delete [] userData.kFor;
+    delete [] userData.rxnRateLaw;
+    delete [] rwork;
   }
 }
 
@@ -709,12 +724,13 @@ void FixRX::setup_pre_force(int vflag)
 
 void FixRX::pre_force(int vflag)
 {
+  TimerType timer_start = getTimeStamp();
+
   int nlocal = atom->nlocal;
   int nghost = atom->nghost;
   int *mask = atom->mask;
   double *dpdTheta = atom->dpdTheta;
   int newton_pair = force->newton_pair;
-  double theta;
 
   if(localTempFlag){
     int count = nlocal + (newton_pair ? nghost : 0);
@@ -726,7 +742,10 @@ void FixRX::pre_force(int vflag)
   TimerType timer_localTemperature = getTimeStamp();
 
   // Zero the counters for the ODE solvers.
-  this->nSteps = this->nIters = this->nFuncs = this->nFails = 0;
+  int nSteps = 0;
+  int nIters = 0;
+  int nFuncs = 0;
+  int nFails = 0;
 
   if (odeIntegrationFlag == ODE_LAMMPS_RKF45 && diagnosticFrequency == 1)
   {
@@ -734,35 +753,66 @@ void FixRX::pre_force(int vflag)
     memory->create( diagnosticCounterPerODE[FuncSum], nlocal, "FixRX::diagnosticCounterPerODE");
   }
 
-  double *rwork = new double[8*nspecies + nreactions];
+  #pragma omp parallel \
+     reduction(+: nSteps, nIters, nFuncs, nFails )
+  {
+    double *rwork = new double[8*nspecies];
 
-  for (int i = 0; i < nlocal; i++)
-    if (mask[i] & groupbit){
-      if (localTempFlag)
-        theta = dpdThetaLocal[i];
-      else
-        theta = dpdTheta[i];
+    UserRHSData userData;
+    userData.kFor = new double[nreactions];
+    userData.rxnRateLaw = new double[nreactions];
 
-      //Compute the reaction rate constants
-      for (int irxn = 0; irxn < nreactions; irxn++)
-        kR[irxn] = Arr[irxn]*pow(theta,nArr[irxn])*exp(-Ea[irxn]/force->boltz/theta);
+    int ode_counter[4] = { 0 };
 
-      if (odeIntegrationFlag == ODE_LAMMPS_RK4)
-        rk4(i,rwork);
-      else if (odeIntegrationFlag == ODE_LAMMPS_RKF45)
-        rkf45(i,rwork);
+    #pragma omp for schedule(runtime)
+    for (int i = 0; i < nlocal; i++)
+    {
+      if (mask[i] & groupbit)
+      {
+        double theta;
+        if (localTempFlag)
+          theta = dpdThetaLocal[i];
+        else
+          theta = dpdTheta[i];
+
+        //Compute the reaction rate constants
+        for (int irxn = 0; irxn < nreactions; irxn++)
+          userData.kFor[irxn] = Arr[irxn]*pow(theta,nArr[irxn])*exp(-Ea[irxn]/force->boltz/theta);
+
+        if (odeIntegrationFlag == ODE_LAMMPS_RK4)
+          rk4(i, rwork, &userData);
+        else if (odeIntegrationFlag == ODE_LAMMPS_RKF45)
+          rkf45(i, rwork, &userData, ode_counter);
+      }
     }
 
-  TimerType timer_ODE = getTimeStamp();
+    nSteps += ode_counter[0];
+    nIters += ode_counter[1];
+    nFuncs += ode_counter[2];
+    nFails += ode_counter[3];
 
-  delete [] rwork;
+    delete [] rwork;
+    delete [] userData.kFor;
+    delete [] userData.rxnRateLaw;
+
+  } // end parallel region
+
+  TimerType timer_ODE = getTimeStamp();
 
   // Communicate the updated momenta and velocities to all nodes
   comm->forward_comm_fix(this);
   if(localTempFlag) delete [] dpdThetaLocal;
 
+  TimerType timer_stop = getTimeStamp();
+
   double time_ODE = getElapsedTime(timer_localTemperature, timer_ODE);
 
+  printf("me= %d total= %g temp= %g ode= %g comm= %g nlocal= %d nfc= %d %d\n", comm->me,
+                         getElapsedTime(timer_start, timer_stop),
+                         getElapsedTime(timer_start, timer_localTemperature),
+                         getElapsedTime(timer_localTemperature, timer_ODE),
+                         getElapsedTime(timer_ODE, timer_stop), nlocal, nFuncs, nSteps);
+
   // Warn the user if a failure was detected in the ODE solver.
   if (nFails > 0){
     char sbuf[128];
@@ -958,21 +1008,15 @@ void FixRX::setupParams()
 
 /* ---------------------------------------------------------------------- */
 
-void FixRX::rk4(int id, double *rwork)
+void FixRX::rk4(int id, double *rwork, void* v_params)
 {
-  double *k1 = NULL;
-  if (rwork == NULL)
-    k1 = new double[6*nspecies + nreactions];
-  else
-    k1 = rwork;
+  double *k1 = rwork;
   double *k2 = k1 + nspecies;
   double *k3 = k2 + nspecies;
   double *k4 = k3 + nspecies;
   double *y  = k4 + nspecies;
   double *yp = y  + nspecies;
 
-  double *dummyArray = yp + nspecies; // Passed to the rhs function.
-
   const int numSteps = minSteps;
 
   const double h = update->dt / double(numSteps);
@@ -989,25 +1033,25 @@ void FixRX::rk4(int id, double *rwork)
   for (int step = 0; step < numSteps; step++)
   {
     // k1
-    rhs(0.0,y,k1,dummyArray);
+    rhs(0.0,y,k1,v_params);
 
     // k2
     for (int ispecies = 0; ispecies < nspecies; ispecies++)
       yp[ispecies] = y[ispecies] + 0.5*h*k1[ispecies];
 
-    rhs(0.0,yp,k2,dummyArray);
+    rhs(0.0,yp,k2,v_params);
 
     // k3
     for (int ispecies = 0; ispecies < nspecies; ispecies++)
       yp[ispecies] = y[ispecies] + 0.5*h*k2[ispecies];
 
-    rhs(0.0,yp,k3,dummyArray);
+    rhs(0.0,yp,k3,v_params);
 
     // k4
     for (int ispecies = 0; ispecies < nspecies; ispecies++)
       yp[ispecies] = y[ispecies] + h*k3[ispecies];
 
-    rhs(0.0,yp,k4,dummyArray);
+    rhs(0.0,yp,k4,v_params);
 
     for (int ispecies = 0; ispecies < nspecies; ispecies++)
       y[ispecies] += h*(k1[ispecies]/6.0 + k2[ispecies]/3.0 + k3[ispecies]/3.0 + k4[ispecies]/6.0);
@@ -1022,9 +1066,6 @@ void FixRX::rk4(int id, double *rwork)
       y[ispecies] = 0.0;
     atom->dvector[ispecies][id] = y[ispecies];
   }
-
-  if (rwork == NULL)
-    delete [] k1;
 }
 
 /* ---------------------------------------------------------------------- */
@@ -1274,6 +1315,78 @@ void FixRX::odeDiagnostics(void)
   double max_per_proc[numCounters];
   double min_per_proc[numCounters];
 
+  if(1)
+  {
+     static bool firstStep = true;
+
+     static TimerType oldTimeStamp (-1);
+
+     TimerType now = getTimeStamp();
+
+     // Query the fix database and look for rx_weight for the balance fix.
+     int type_flag = -1;
+     int rx_weight_index = atom->find_custom( "rx_weight", /*0:int, 1:float*/ type_flag );
+
+     // Compute the average # of neighbors.
+     double averageNumNeighbors = 0;
+     {
+        const int inum = pairDPDE->list->inum;
+        const int* ilist = pairDPDE->list->ilist;
+        const int* numneigh = pairDPDE->list->numneigh;
+
+        for (int ii = 0; ii < inum; ++ii)
+        {
+           const int i = ilist[ii];
+           averageNumNeighbors += numneigh[i];
+        }
+
+        averageNumNeighbors /= inum;
+     }
+
+     printf("me= %d nst= %g nfc= %g time= %g nlocal= %g lmpnst= %g weight_idx= %d 1st= %d aveNeigh= %g\n", comm->me, this->diagnosticCounter[0], this->diagnosticCounter[1], this->diagnosticCounter[2], this->diagnosticCounter[3], this->diagnosticCounter[4], rx_weight_index, firstStep, averageNumNeighbors);
+
+     if (rx_weight_index != -1 && !firstStep && 0)
+     {
+        double *rx_weight = atom->dvector[rx_weight_index];
+
+        const int nlocal = atom->nlocal;
+        const int *mask = atom->mask;
+
+        if (odeIntegrationFlag == ODE_LAMMPS_RKF45 && diagnosticFrequency == 1)
+        {
+          const double total_time = getElapsedTime( oldTimeStamp, now );
+          const double fixrx_time = this->diagnosticCounter[TimeSum];
+          const double time_ratio = fixrx_time / total_time;
+
+          double tsum = 0.0;
+          double tmin = 100000, tmax = 0;
+          for (int i = 0; i < nlocal; ++i)
+            if (mask[i] & groupbit)
+            {
+              double nfunc_ratio = double( diagnosticCounterPerODE[FuncSum][i] ) / diagnosticCounter[FuncSum];
+              rx_weight[i] = nfunc_ratio * fixrx_time + (total_time - fixrx_time) / nlocal;
+              tmin = fmin( tmin, rx_weight[i] );
+              tmax = fmax( tmax, rx_weight[i] );
+              tsum += rx_weight[i];
+              //rx_weight[i] = (double) diagnosticCounterPerODE[FuncSum][i];
+            }
+
+          printf("me= %d total= %g fixrx= %g ratio= %g tsum= %g %g %g %g\n", comm->me, total_time, fixrx_time, time_ratio, tsum, (total_time - fixrx_time) / nlocal, tmin, tmax);
+        }
+        else
+        {
+          error->warning(FLERR, "Dynamic load balancing enabled but per-atom weights not available.");
+
+          for (int i = 0; i < nlocal; ++i)
+            if (mask[i] & groupbit)
+              rx_weight[i] = 1.0;
+        }
+     }
+
+     firstStep = false;
+     oldTimeStamp = now;
+  }
+
   // Compute counters per dpd time-step.
   for (int i = 0; i < numCounters; ++i){
     my_vals[i] = this->diagnosticCounter[i] / nTimes;
@@ -1347,7 +1460,7 @@ void FixRX::odeDiagnostics(void)
     if (screen)  fprintf(screen,"%s\n", smesg); \
     if (logfile) fprintf(logfile,"%s\n", smesg); }
 
-    sprintf(smesg, "FixRX::ODE Diagnostics:  # of steps  |# of rhs evals| run-time (sec)");
+    sprintf(smesg, "FixRX::ODE Diagnostics:  # of iters  |# of rhs evals| run-time (sec) | # atoms");
     print_mesg(smesg);
 
     sprintf(smesg, "         AVG per ODE  : %-12.5g | %-12.5g | %-12.5g", avg_per_atom[0], avg_per_atom[1], avg_per_atom[2]);
@@ -1369,7 +1482,7 @@ void FixRX::odeDiagnostics(void)
       print_mesg(smesg);
     }
 
-    sprintf(smesg, "         AVG per Proc : %-12.5g | %-12.5g | %-12.5g", avg_per_proc[0], avg_per_proc[1], avg_per_proc[2]);
+    sprintf(smesg, "         AVG per Proc : %-12.5g | %-12.5g | %-12.5g | %-12.5g", avg_per_proc[StepSum], avg_per_proc[FuncSum], avg_per_proc[TimeSum], avg_per_proc[AtomSum]);
     print_mesg(smesg);
 
     if (comm->nprocs > 1){
@@ -1377,13 +1490,13 @@ void FixRX::odeDiagnostics(void)
       for (int i = 0; i < numCounters; ++i)
         rms_per_proc[i] = sqrt( sum_sq[i] / comm->nprocs );
 
-      sprintf(smesg, "         RMS per Proc : %-12.5g | %-12.5g | %-12.5g", rms_per_proc[0], rms_per_proc[1], rms_per_proc[2]);
+      sprintf(smesg, "         RMS per Proc : %-12.5g | %-12.5g | %-12.5g | %-12.5g", rms_per_proc[0], rms_per_proc[1], rms_per_proc[2], rms_per_proc[AtomSum]);
       print_mesg(smesg);
 
-      sprintf(smesg, "         MAX per Proc : %-12.5g | %-12.5g | %-12.5g", max_per_proc[0], max_per_proc[1], max_per_proc[2]);
+      sprintf(smesg, "         MAX per Proc : %-12.5g | %-12.5g | %-12.5g | %-12.5g", max_per_proc[0], max_per_proc[1], max_per_proc[2], max_per_proc[AtomSum]);
       print_mesg(smesg);
 
-      sprintf(smesg, "         MIN per Proc : %-12.5g | %-12.5g | %-12.5g", min_per_proc[0], min_per_proc[1], min_per_proc[2]);
+      sprintf(smesg, "         MIN per Proc : %-12.5g | %-12.5g | %-12.5g | %-12.5g", min_per_proc[0], min_per_proc[1], min_per_proc[2], min_per_proc[AtomSum]);
       print_mesg(smesg);
     }
 
@@ -1403,7 +1516,7 @@ void FixRX::odeDiagnostics(void)
   return;
 }
 
-void FixRX::rkf45(int id, double *rwork)
+void FixRX::rkf45(int id, double *rwork, void *v_param, int ode_counter[])
 {
   // Rounding coefficient.
   const double uround = DBL_EPSILON;
@@ -1412,12 +1525,7 @@ void FixRX::rkf45(int id, double *rwork)
   const double adaption_limit = 4.0;
 
   //double *y = new double[8*nspecies + nreactions];
-  double *y = NULL;
-  if (rwork == NULL)
-    y = new double[8*nspecies + nreactions];
-  else
-    y = rwork;
-  double *rhstmp = y + 8*nspecies;
+  double *y = rwork;
 
   const int neq = nspecies;
 
@@ -1454,7 +1562,7 @@ void FixRX::rkf45(int id, double *rwork)
   if (h < h_min){
     //fprintf(stderr,"hin not implemented yet\n");
     //exit(-1);
-    nfe = rkf45_h0 (neq, t, t_stop, h_min, h_max, h, y, y + neq, rhstmp);
+    nfe = rkf45_h0 (neq, t, t_stop, h_min, h_max, h, y, y + neq, v_param);
   }
 
   //printf("t= %e t_stop= %e h= %e\n", t, t_stop, h);
@@ -1465,7 +1573,7 @@ void FixRX::rkf45(int id, double *rwork)
     double *eout = yout + neq;
 
     // Take a trial step.
-    rkf45_step (neq, h, y, yout, eout, rhstmp);
+    rkf45_step (neq, h, y, yout, eout, v_param);
 
     // Estimate the solution error.
       // ... weighted 2-norm of the error.
@@ -1513,16 +1621,17 @@ void FixRX::rkf45(int id, double *rwork)
 
     if (maxIters && nit > maxIters){
       //fprintf(stderr,"atom[%d] took too many iterations in rkf45 %d %e %e\n", id, nit, t, t_stop);
-      nFails ++;
+      //nFails ++;
+      ode_counter[3] ++;
       break;
       // We should set an error here so that the solution is not used!
     }
 
   } // end while
 
-  nSteps += nst;
-  nIters += nit;
-  nFuncs += nfe;
+  ode_counter[0] += nst;
+  ode_counter[1] += nit;
+  ode_counter[2] += nfe;
 
   //if (diagnosticFrequency == 1 && diagnosticCounterPerODE[StepSum] != NULL)
   if (diagnosticCounterPerODE[StepSum] != NULL){
@@ -1539,9 +1648,6 @@ void FixRX::rkf45(int id, double *rwork)
       y[ispecies] = 0.0;
     atom->dvector[ispecies][id] = y[ispecies];
   }
-
-  if (rwork == NULL)
-    delete [] y;
 }
 
 /* ---------------------------------------------------------------------- */
@@ -1559,21 +1665,23 @@ int FixRX::rhs(double t, const double *y, double *dydt, void *params)
 
 int FixRX::rhs_dense(double t, const double *y, double *dydt, void *params)
 {
-  double rxnRateLawForward;
-  double *rxnRateLaw = (double *) params;
-  double VDPD = domain->xprd * domain->yprd * domain->zprd / atom->natoms;
-  double concentration;
-  int nspecies = atom->nspecies_dpd;
+  UserRHSData *userData = (UserRHSData *) params;
+
+  double *rxnRateLaw = userData->rxnRateLaw;
+  double *kFor       = userData->kFor;
+
+  const double VDPD = domain->xprd * domain->yprd * domain->zprd / atom->natoms;
+  const int nspecies = atom->nspecies_dpd;
 
   for(int ispecies=0; ispecies<nspecies; ispecies++)
     dydt[ispecies] = 0.0;
 
   // Construct the reaction rate laws
   for(int jrxn=0; jrxn<nreactions; jrxn++){
-    rxnRateLawForward = kR[jrxn];
+    double rxnRateLawForward = kFor[jrxn];
 
     for(int ispecies=0; ispecies<nspecies; ispecies++){
-      concentration = y[ispecies]/VDPD;
+      const double concentration = y[ispecies]/VDPD;
       rxnRateLawForward *= pow(concentration,stoichReactants[jrxn][ispecies]);
     }
     rxnRateLaw[jrxn] = rxnRateLawForward;
@@ -1591,13 +1699,13 @@ int FixRX::rhs_dense(double t, const double *y, double *dydt, void *params)
 
 int FixRX::rhs_sparse(double t, const double *y, double *dydt, void *v_params) const
 {
-   double *_rxnRateLaw = (double *) v_params;
+   UserRHSData *userData = (UserRHSData *) v_params;
 
    const double VDPD = domain->xprd * domain->yprd * domain->zprd / atom->natoms;
 
-   #define kFor         (this->kR)
+   #define kFor         (userData->kFor)
    #define kRev         (NULL)
-   #define rxnRateLaw   (_rxnRateLaw)
+   #define rxnRateLaw   (userData->rxnRateLaw)
    #define conc         (dydt)
    #define maxReactants (this->sparseKinetics_maxReactants)
    #define maxSpecies   (this->sparseKinetics_maxSpecies)
diff --git a/src/USER-DPD/fix_rx.h b/src/USER-DPD/fix_rx.h
index c35c9afabf..5e226aec73 100644
--- a/src/USER-DPD/fix_rx.h
+++ b/src/USER-DPD/fix_rx.h
@@ -66,19 +66,19 @@ class FixRX : public Fix {
   double *kR;
 
   //!< Classic Runge-Kutta 4th-order stepper.
-  void rk4(int,double*);
+  void rk4(int, double*, void*);
 
   //!< Runge-Kutta-Fehlberg ODE Solver.
-  void rkf45(int,double*);
+  void rkf45(int, double*, void*, int ode_counter[]);
 
   //!< Runge-Kutta-Fehlberg ODE stepper function.
   void rkf45_step (const int neq, const double h, double y[], double y_out[],
-                   double rwk[], void* v_param);
+                   double rwk[], void *);
 
   //!< Initial step size estimation for the Runge-Kutta-Fehlberg ODE solver.
   int rkf45_h0 (const int neq, const double t, const double t_stop,
                      const double hmin, const double hmax,
-                     double& h0, double y[], double rwk[], void* v_params);
+                     double& h0, double y[], double rwk[], void *v_params);
 
   class PairDPDfdtEnergy *pairDPDE;
   double *dpdThetaLocal;
@@ -90,6 +90,13 @@ class FixRX : public Fix {
   int rhs(double, const double *, double *, void *);
   int rhs_dense (double, const double *, double *, void *);
 
+  // User-defined data container needed in rhs.
+  struct UserRHSData
+  {
+    double *kFor;
+    double *rxnRateLaw;
+  };
+
   // Sparse stoichiometric matrix storage format and methods.
   bool useSparseKinetics;
   //SparseKinetics sparseKinetics;
@@ -116,10 +123,10 @@ class FixRX : public Fix {
   double relTol, absTol; //!< Relative and absolute tolerances for the ODE solver(s).
 
   // ODE Diagnostics
-  int nSteps; //!< # of accepted steps taken over all atoms.
-  int nIters; //!< # of attemped steps for all atoms.
-  int nFuncs; //!< # of RHS evaluations for all atoms.
-  int nFails; //!< # of ODE systems that failed (for some reason).
+  //int nSteps; //!< # of accepted steps taken over all atoms.
+  //int nIters; //!< # of attemped steps for all atoms.
+  //int nFuncs; //!< # of RHS evaluations for all atoms.
+  //int nFails; //!< # of ODE systems that failed (for some reason).
 
   int diagnosticFrequency; //!< Frequency (LMP steps) that run-time diagnostics will be printed to the log.
   enum { numDiagnosticCounters = 5 };

From 2ea900df007e59905720503adbc4955c5c45b574 Mon Sep 17 00:00:00 2001
From: Christopher Stone <chris.stone@computational-science.com>
Date: Sat, 28 Jan 2017 10:41:16 -0500
Subject: [PATCH 111/267] Updated FixRxKokkos to use kokkos-managed data
 objects.

- Switched to use kokkos dvector, mask, and dpdTheta views
  from atomKK.
---
 src/KOKKOS/fix_rx_kokkos.cpp | 54 ++++++++++++++++++++++++------------
 1 file changed, 36 insertions(+), 18 deletions(-)

diff --git a/src/KOKKOS/fix_rx_kokkos.cpp b/src/KOKKOS/fix_rx_kokkos.cpp
index b989d6b2d4..19da344db8 100644
--- a/src/KOKKOS/fix_rx_kokkos.cpp
+++ b/src/KOKKOS/fix_rx_kokkos.cpp
@@ -760,9 +760,9 @@ void FixRxKokkos<DeviceType>::pre_force(int vflag)
 
   TimerType timer_start = getTimeStamp();
 
-  int nlocal = atom->nlocal;
-  int nghost = atom->nghost;
-  int newton_pair = force->newton_pair;
+  const int nlocal = atom->nlocal;
+  const int nghost = atom->nghost;
+  const int newton_pair = force->newton_pair;
 
   const bool setToZero = false; // don't set the forward rates to zero.
 
@@ -776,12 +776,23 @@ void FixRxKokkos<DeviceType>::pre_force(int vflag)
   TimerType timer_localTemperature = getTimeStamp();
 
   // Total counters from the ODE solvers.
-  CounterType Counters;
+  CounterType TotalCounters;
 
   // Set data needed in the operators.
-  int *mask = atom->mask;
-  double *dpdTheta = atom->dpdTheta;
+  // ...
 
+  //int *mask = atom->mask;
+  //double *dpdTheta = atom->dpdTheta;
+
+  // Local references to the atomKK objects.
+  typename ArrayTypes<DeviceType>::t_efloat_1d d_dpdTheta = atomKK->k_dpdTheta.view<DeviceType>(); 
+  typename ArrayTypes<DeviceType>::t_float_2d d_dvector   = atomKK->k_dvector.view<DeviceType>();
+  typename ArrayTypes<DeviceType>::t_int_1d   d_mask      = atomKK->k_mask.view<DeviceType>();
+
+  // Get up-to-date data.
+  atomKK->sync( execution_space, MASK_MASK | DVECTOR_MASK | DPDTHETA_MASK );
+
+  // Set some constants outside of the parallel_for
   const double boltz = force->boltz;
   const double t_stop = update->dt; // DPD time-step and integration length.
 
@@ -796,7 +807,7 @@ void FixRxKokkos<DeviceType>::pre_force(int vflag)
 
   Kokkos::parallel_reduce( nlocal, LAMMPS_LAMBDA(int i, CounterType &counter)
     {
-      if (mask[i] & groupbit)
+      if (d_mask(i) & groupbit)
       {
         double *y = new double[8*nspecies];
         double *rwork = y + nspecies;
@@ -807,7 +818,7 @@ void FixRxKokkos<DeviceType>::pre_force(int vflag)
 
         CounterType counter_i;
 
-        const double theta = (localTempFlag) ? dpdThetaLocal[i] : dpdTheta[i];
+        const double theta = (localTempFlag) ? dpdThetaLocal[i] : d_dpdTheta(i);
 
         //Compute the reaction rate constants
         for (int irxn = 0; irxn < nreactions; irxn++)
@@ -819,14 +830,13 @@ void FixRxKokkos<DeviceType>::pre_force(int vflag)
             userData.kFor[irxn] = d_kineticsData.Arr(irxn) *
                                    pow(theta, d_kineticsData.nArr(irxn)) *
                                    exp(-d_kineticsData.Ea(irxn) / boltz / theta);
-            //userData.kFor[irxn] = Arr[irxn]*pow(theta,nArr[irxn])*exp(-Ea[irxn]/boltz/theta);
           }
         }
 
         // Update ConcOld and initialize the ODE solution vector y[].
         for (int ispecies = 0; ispecies < nspecies; ispecies++){
-          const double tmp = atom->dvector[ispecies][i];
-          atom->dvector[ispecies+nspecies][i] = tmp;
+          const double tmp = d_dvector(ispecies, i);
+          d_dvector(ispecies+nspecies, i) = tmp;
           y[ispecies] = tmp;
         }
 
@@ -845,7 +855,7 @@ void FixRxKokkos<DeviceType>::pre_force(int vflag)
               error->one(FLERR,"Computed concentration in RK4 solver is < -10*DBL_EPSILON");
             else if(y[ispecies] < MY_EPSILON)
               y[ispecies] = 0.0;
-            atom->dvector[ispecies][i] = y[ispecies];
+            d_dvector(ispecies,i) = y[ispecies];
           }
         }
         else if (odeIntegrationFlag == ODE_LAMMPS_RKF45)
@@ -858,7 +868,7 @@ void FixRxKokkos<DeviceType>::pre_force(int vflag)
               error->one(FLERR,"Computed concentration in RKF45 solver is < -1.0e-10");
             else if(y[ispecies] < MY_EPSILON)
               y[ispecies] = 0.0;
-            atom->dvector[ispecies][i] = y[ispecies];
+            d_dvector(ispecies,i) = y[ispecies];
           }
 
           //if (diagnosticFrequency == 1 && diagnosticCounterPerODE[StepSum] != NULL)
@@ -877,13 +887,21 @@ void FixRxKokkos<DeviceType>::pre_force(int vflag)
       } // if
     } // parallel_for lambda-body
 
-    , Counters // reduction value
+    , TotalCounters // reduction value
   );
 
   TimerType timer_ODE = getTimeStamp();
 
-  // Communicate the updated momenta and velocities to all nodes
+  // Signal that dvector has been modified on this execution space.
+  atomKK->modified( execution_space, DVECTOR_MASK );
+
+  // Communicate the updated species data to all nodes
+  atomKK->sync ( Host, DVECTOR_MASK );
+
   comm->forward_comm_fix(this);
+
+  atomKK->modified ( Host, DVECTOR_MASK );
+
   if(localTempFlag) delete [] dpdThetaLocal;
 
   TimerType timer_stop = getTimeStamp();
@@ -894,12 +912,12 @@ void FixRxKokkos<DeviceType>::pre_force(int vflag)
                          getElapsedTime(timer_start, timer_stop),
                          getElapsedTime(timer_start, timer_localTemperature),
                          getElapsedTime(timer_localTemperature, timer_ODE),
-                         getElapsedTime(timer_ODE, timer_stop), nlocal, Counters.nFuncs, Counters.nSteps);
+                         getElapsedTime(timer_ODE, timer_stop), nlocal, TotalCounters.nFuncs, TotalCounters.nSteps);
 
   // Warn the user if a failure was detected in the ODE solver.
-  if (Counters.nFails > 0){
+  if (TotalCounters.nFails > 0){
     char sbuf[128];
-    sprintf(sbuf,"in FixRX::pre_force, ODE solver failed for %d atoms.", Counters.nFails);
+    sprintf(sbuf,"in FixRX::pre_force, ODE solver failed for %d atoms.", TotalCounters.nFails);
     error->warning(FLERR, sbuf);
   }
 

From 843f3a9192564bc863d739636453598f046a8555 Mon Sep 17 00:00:00 2001
From: Christopher Stone <chris.stone@computational-science.com>
Date: Sat, 28 Jan 2017 12:02:49 -0500
Subject: [PATCH 112/267] Updates to FixRxKokkos ...

- Added templated computeLocalTemperature<>() to FixRxKokkos, but it is
  still using the original host data pointers.
- Updated the copy-back to dvector operation to be the same for
  RK4 and RKF45 per discussion with J. Larentzos.
TODO:
  - Add kokkos data for computeLocalTemperature and the parallel_for loop.
---
 src/KOKKOS/fix_rx_kokkos.cpp | 163 ++++++++++++++++++++++++++++-------
 src/KOKKOS/fix_rx_kokkos.h   |   3 +
 2 files changed, 136 insertions(+), 30 deletions(-)

diff --git a/src/KOKKOS/fix_rx_kokkos.cpp b/src/KOKKOS/fix_rx_kokkos.cpp
index 19da344db8..45af816810 100644
--- a/src/KOKKOS/fix_rx_kokkos.cpp
+++ b/src/KOKKOS/fix_rx_kokkos.cpp
@@ -39,6 +39,10 @@ using namespace MathSpecial;
 #define SparseKinetics_enableIntegralReactions (true)
 #define SparseKinetics_invalidIndex (-1)
 
+// From fix_rx.cpp ... this should be lifted into fix_rx.h or fix_rx_kokkos.h?
+enum{NONE,HARMONIC};
+enum{LUCY};
+
 namespace /* anonymous */
 {
 
@@ -770,7 +774,19 @@ void FixRxKokkos<DeviceType>::pre_force(int vflag)
     int count = nlocal + (newton_pair ? nghost : 0);
     dpdThetaLocal = new double[count];
     memset(dpdThetaLocal, 0, sizeof(double)*count);
-    computeLocalTemperature();
+    //FixRx::computeLocalTemperature();
+
+    // Are there is no other options than wtFlag = (0)LUCY and localTempFlag = NONE : HARMONIC?
+    if (localTempFlag == HARMONIC)
+      if (newton_pair)
+        computeLocalTemperature<LUCY, HARMONIC, true > ();
+      else
+        computeLocalTemperature<LUCY, HARMONIC, false> ();
+    else
+      if (newton_pair)
+        computeLocalTemperature<LUCY, NONE    , true > ();
+      else
+        computeLocalTemperature<LUCY, NONE    , false> ();
   }
 
   TimerType timer_localTemperature = getTimeStamp();
@@ -834,7 +850,8 @@ void FixRxKokkos<DeviceType>::pre_force(int vflag)
         }
 
         // Update ConcOld and initialize the ODE solution vector y[].
-        for (int ispecies = 0; ispecies < nspecies; ispecies++){
+        for (int ispecies = 0; ispecies < nspecies; ispecies++)
+        {
           const double tmp = d_dvector(ispecies, i);
           d_dvector(ispecies+nspecies, i) = tmp;
           y[ispecies] = tmp;
@@ -844,50 +861,41 @@ void FixRxKokkos<DeviceType>::pre_force(int vflag)
         if (odeIntegrationFlag == ODE_LAMMPS_RK4)
         {
           rk4(t_stop, y, rwork, &userData);
-
-          /* This should be a duplicate of the copy-out in the 
-             rkf45 block but for the MY_EPSILON v. -1e-10 (literal)
-             difference. Can these be merged? */
-
-          // Store the solution back in atom->dvector.
-          for (int ispecies = 0; ispecies < nspecies; ispecies++){
-            if(y[ispecies] < -MY_EPSILON)
-              error->one(FLERR,"Computed concentration in RK4 solver is < -10*DBL_EPSILON");
-            else if(y[ispecies] < MY_EPSILON)
-              y[ispecies] = 0.0;
-            d_dvector(ispecies,i) = y[ispecies];
-          }
         }
         else if (odeIntegrationFlag == ODE_LAMMPS_RKF45)
         {
           rkf45(nspecies, t_stop, y, rwork, &userData, counter_i);
 
-          // Store the solution back in atom->dvector.
-          for (int ispecies = 0; ispecies < nspecies; ispecies++){
-            if(y[ispecies] < -1.0e-10)
-              error->one(FLERR,"Computed concentration in RKF45 solver is < -1.0e-10");
-            else if(y[ispecies] < MY_EPSILON)
-              y[ispecies] = 0.0;
-            d_dvector(ispecies,i) = y[ispecies];
-          }
-
           //if (diagnosticFrequency == 1 && diagnosticCounterPerODE[StepSum] != NULL)
-          if (diagnosticCounterPerODE[StepSum] != NULL)
-          {
-            diagnosticCounterPerODE[StepSum][i] = counter_i.nSteps;
-            diagnosticCounterPerODE[FuncSum][i] = counter_i.nFuncs;
-          }
+          //if (diagnosticCounterPerODE[StepSum] != NULL)
+          //{
+          //  diagnosticCounterPerODE[StepSum][i] = counter_i.nSteps;
+          //  diagnosticCounterPerODE[FuncSum][i] = counter_i.nFuncs;
+          //}
+        }
+
+        // Store the solution back in dvector.
+        for (int ispecies = 0; ispecies < nspecies; ispecies++)
+        {
+          if (y[ispecies] < -MY_EPSILON)
+            error->one(FLERR,"Computed concentration in RK solver is < -10*DBL_EPSILON");
+          else if (y[ispecies] < MY_EPSILON)
+            y[ispecies] = 0.0;
+
+          d_dvector(ispecies,i) = y[ispecies];
         }
 
         delete [] y;
         delete [] userData.kFor;
         delete [] userData.rxnRateLaw;
 
+        // Update the iteration statistics counter. Is this unique for each iteration?
         counter += counter_i;
+
       } // if
     } // parallel_for lambda-body
 
-    , TotalCounters // reduction value
+    , TotalCounters // reduction value for all iterations.
   );
 
   TimerType timer_ODE = getTimeStamp();
@@ -942,6 +950,101 @@ void FixRxKokkos<DeviceType>::pre_force(int vflag)
   } */
 }
 
+/* ---------------------------------------------------------------------- */
+
+template <typename DeviceType>
+  template <int WT_FLAG, int LOCAL_TEMP_FLAG, bool IS_NEWTON_PAIR>
+void FixRxKokkos<DeviceType>::computeLocalTemperature()
+{
+  int i,j,ii,jj,inum,jnum,itype,jtype;
+  double xtmp,ytmp,ztmp,delx,dely,delz;
+  double rsq;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  double **x = atom->x;
+  int *type = atom->type;
+  int nlocal = atom->nlocal;
+  int nghost = atom->nghost;
+  //int newton_pair = force->newton_pair;
+
+  // local temperature variables
+  double wij=0.0;
+  double *dpdTheta = atom->dpdTheta;
+
+  // Initialize the local temperature weight array
+  int sumWeightsCt = nlocal + (IS_NEWTON_PAIR ? nghost : 0);
+  sumWeights = new double[sumWeightsCt];
+  memset(sumWeights, 0, sizeof(double)*sumWeightsCt);
+
+  inum = list->inum;
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+  for (ii = 0; ii < inum; ii++) {
+    i = ilist[ii];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      j &= NEIGHMASK;
+      jtype = type[j];
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+
+      if (rsq < pairDPDE->cutsq[itype][jtype]) {
+        double rcut = sqrt(pairDPDE->cutsq[itype][jtype]);
+        double rij = sqrt(rsq);
+        double ratio = rij/rcut;
+
+        // Lucy's Weight Function
+        if (WT_FLAG == LUCY)
+	{
+          wij = (1.0+3.0*ratio) * (1.0-ratio)*(1.0-ratio)*(1.0-ratio);
+          dpdThetaLocal[i] += wij/dpdTheta[j];
+          if (IS_NEWTON_PAIR || j < nlocal)
+            dpdThetaLocal[j] += wij/dpdTheta[i];
+        }
+
+        sumWeights[i] += wij;
+        if (IS_NEWTON_PAIR || j < nlocal)
+          sumWeights[j] += wij;
+      }
+    }
+  }
+  if (IS_NEWTON_PAIR) comm->reverse_comm_fix(this);
+
+  // self-interaction for local temperature
+  for (i = 0; i < nlocal; i++){
+
+    // Lucy Weight Function
+    if (WT_FLAG == LUCY)
+    {
+      wij = 1.0;
+      dpdThetaLocal[i] += wij / dpdTheta[i];
+    }
+    sumWeights[i] += wij;
+
+    // Normalized local temperature
+    dpdThetaLocal[i] = dpdThetaLocal[i] / sumWeights[i];
+
+    if (LOCAL_TEMP_FLAG == HARMONIC)
+      dpdThetaLocal[i] = 1.0 / dpdThetaLocal[i];
+
+  }
+
+  delete [] sumWeights;
+}
+
 namespace LAMMPS_NS {
 template class FixRxKokkos<LMPDeviceType>;
 #ifdef KOKKOS_HAVE_CUDA
diff --git a/src/KOKKOS/fix_rx_kokkos.h b/src/KOKKOS/fix_rx_kokkos.h
index 95872c67e9..ec9a8fa976 100644
--- a/src/KOKKOS/fix_rx_kokkos.h
+++ b/src/KOKKOS/fix_rx_kokkos.h
@@ -121,6 +121,9 @@ class FixRxKokkos : public FixRX {
 
   void create_kinetics_data(void);
 
+  template <int WT_FLAG, int LOCAL_TEMP_FLAG, bool IS_NEWTON_PAIR>
+  void computeLocalTemperature();
+
 };
 
 }

From acba25c3831249f54353f1a0a76b63f42881da6f Mon Sep 17 00:00:00 2001
From: Christopher Stone <chris.stone@computational-science.com>
Date: Sat, 28 Jan 2017 15:58:21 -0500
Subject: [PATCH 113/267] Added kokkos datatypes to
 FixRxKokkos::computeLocalTemperature(...)

Added kokkos dual-view datatypes used in computeLocalTemperature and
pre_force (e.g., dpdThetaLocal) but still using the original host
pointers for the pack/unpack operations.

TODO:
- The Kokkos neighbor list is not working. Need to request a Kokkos
  neighbor list in ::init(). Then, replace objects like list->ilist[]
  with k_list->d_ilist().
- Add another template parameter for HALFTHREAD and create (automatic)
  atomic view of dpdThetaLocal and sumWeights.
- Add modify/sync comments and replace the host-only pointers in the
  pack/unpack methods.
---
 src/KOKKOS/fix_rx_kokkos.cpp | 220 +++++++++++++++++++++++++----------
 src/KOKKOS/fix_rx_kokkos.h   |   6 +-
 2 files changed, 166 insertions(+), 60 deletions(-)

diff --git a/src/KOKKOS/fix_rx_kokkos.cpp b/src/KOKKOS/fix_rx_kokkos.cpp
index 45af816810..491b32e01d 100644
--- a/src/KOKKOS/fix_rx_kokkos.cpp
+++ b/src/KOKKOS/fix_rx_kokkos.cpp
@@ -95,6 +95,15 @@ void FixRxKokkos<DeviceType>::init()
 
 /* ---------------------------------------------------------------------- */
 
+//template <typename DeviceType>
+//void FixRXKokkos<DeviceType>::init_list(int, class NeighList* ptr)
+//{
+//  printf("Inside FixRxKokkos::init_list\n");
+//  this->list = ptr;
+//}
+
+/* ---------------------------------------------------------------------- */
+
 template <typename DeviceType>
 void FixRxKokkos<DeviceType>::rk4(const double t_stop, double *y, double *rwork, void* v_params) const
 {
@@ -770,12 +779,17 @@ void FixRxKokkos<DeviceType>::pre_force(int vflag)
 
   const bool setToZero = false; // don't set the forward rates to zero.
 
-  if(localTempFlag){
-    int count = nlocal + (newton_pair ? nghost : 0);
-    dpdThetaLocal = new double[count];
-    memset(dpdThetaLocal, 0, sizeof(double)*count);
+  if (localTempFlag)
+  {
+    const int count = nlocal + (newton_pair ? nghost : 0);
+
+    //dpdThetaLocal = new double[count];
+    //memset(dpdThetaLocal, 0, sizeof(double)*count);
     //FixRx::computeLocalTemperature();
 
+    memory->create_kokkos (k_dpdThetaLocal, dpdThetaLocal, count, "FixRxKokkos::dpdThetaLocal");
+    d_dpdThetaLocal = k_dpdThetaLocal.d_view;
+
     // Are there is no other options than wtFlag = (0)LUCY and localTempFlag = NONE : HARMONIC?
     if (localTempFlag == HARMONIC)
       if (newton_pair)
@@ -802,8 +816,8 @@ void FixRxKokkos<DeviceType>::pre_force(int vflag)
 
   // Local references to the atomKK objects.
   typename ArrayTypes<DeviceType>::t_efloat_1d d_dpdTheta = atomKK->k_dpdTheta.view<DeviceType>(); 
-  typename ArrayTypes<DeviceType>::t_float_2d d_dvector   = atomKK->k_dvector.view<DeviceType>();
-  typename ArrayTypes<DeviceType>::t_int_1d   d_mask      = atomKK->k_mask.view<DeviceType>();
+  typename ArrayTypes<DeviceType>::t_float_2d  d_dvector  = atomKK->k_dvector.view<DeviceType>();
+  typename ArrayTypes<DeviceType>::t_int_1d    d_mask     = atomKK->k_mask.view<DeviceType>();
 
   // Get up-to-date data.
   atomKK->sync( execution_space, MASK_MASK | DVECTOR_MASK | DPDTHETA_MASK );
@@ -834,7 +848,8 @@ void FixRxKokkos<DeviceType>::pre_force(int vflag)
 
         CounterType counter_i;
 
-        const double theta = (localTempFlag) ? dpdThetaLocal[i] : d_dpdTheta(i);
+        //const double theta = (localTempFlag) ? dpdThetaLocal[i] : d_dpdTheta(i);
+        const double theta = (localTempFlag) ? d_dpdThetaLocal(i) : d_dpdTheta(i);
 
         //Compute the reaction rate constants
         for (int irxn = 0; irxn < nreactions; irxn++)
@@ -910,7 +925,11 @@ void FixRxKokkos<DeviceType>::pre_force(int vflag)
 
   atomKK->modified ( Host, DVECTOR_MASK );
 
-  if(localTempFlag) delete [] dpdThetaLocal;
+  if (localTempFlag)
+  {
+    //delete [] dpdThetaLocal;
+    memory->destroy_kokkos(k_dpdThetaLocal, dpdThetaLocal);
+  }
 
   TimerType timer_stop = getTimeStamp();
 
@@ -953,96 +972,179 @@ void FixRxKokkos<DeviceType>::pre_force(int vflag)
 /* ---------------------------------------------------------------------- */
 
 template <typename DeviceType>
-  template <int WT_FLAG, int LOCAL_TEMP_FLAG, bool IS_NEWTON_PAIR>
+  template <int WT_FLAG, int LOCAL_TEMP_FLAG, bool NEWTON_PAIR>
 void FixRxKokkos<DeviceType>::computeLocalTemperature()
 {
-  int i,j,ii,jj,inum,jnum,itype,jtype;
-  double xtmp,ytmp,ztmp,delx,dely,delz;
-  double rsq;
-  int *ilist,*jlist,*numneigh,**firstneigh;
+  printf("Inside FixRxKokkos::computeLocalTemperature: %d %d %d %d\n", WT_FLAG, LOCAL_TEMP_FLAG, NEWTON_PAIR, (int)lmp->kokkos->neighflag);
 
-  double **x = atom->x;
-  int *type = atom->type;
-  int nlocal = atom->nlocal;
-  int nghost = atom->nghost;
-  //int newton_pair = force->newton_pair;
+  //int inum,jnum,itype,jtype;
+  //double xtmp,ytmp,ztmp,delx,dely,delz;
+  //double rsq;
+  //int *ilist,*jlist,*numneigh,**firstneigh;
+
+  //double **x = atom->x;
+  //int *type = atom->type;
+  //double *dpdTheta = atom->dpdTheta;
+
+  typename ArrayTypes<DeviceType>::t_x_array_randomread d_x        = atomKK->k_x.view<DeviceType>();
+  typename ArrayTypes<DeviceType>::t_int_1d_randomread  d_type     = atomKK->k_type.view<DeviceType>();
+  typename ArrayTypes<DeviceType>::t_efloat_1d          d_dpdTheta = atomKK->k_dpdTheta.view<DeviceType>(); 
+
+  atomKK->sync(execution_space, X_MASK | TYPE_MASK | DPDTHETA_MASK );
+
+  const int nlocal = atom->nlocal;
+  const int nghost = atom->nghost;
+  //const int newton_pair = force->newton_pair;
 
   // local temperature variables
-  double wij=0.0;
-  double *dpdTheta = atom->dpdTheta;
+  //double wij=0.0;
+
+  // Pull from pairDPDE. The pairDPDEKK objects are producted so recreate here for now.
+  //pairDPDEKK->k_cutsq.template sync<DeviceType>();
+  //typename ArrayTypes<DeviceType>::t_ffloat_2d d_cutsq = pairDPDEKK->k_cutsq.template view<DeviceType();
+
+  //!< Copies pulled from pairDPDE for local use since pairDPDEKK's objects are protected.
+  typename ArrayTypes<DeviceType>::tdual_ffloat_2d k_cutsq;
+  typename ArrayTypes<DeviceType>::t_ffloat_2d     d_cutsq;
+  double **h_cutsq;
+
+  {
+    const int ntypes = atom->ntypes;
+
+    memory->create_kokkos (k_cutsq, h_cutsq, ntypes+1, ntypes+1, "pair:cutsq");
+    d_cutsq = k_cutsq.template view<DeviceType>();
+
+    for (int i = 1; i <= ntypes; ++i)
+      for (int j = i; j <= ntypes; ++j)
+      {
+        k_cutsq.h_view(i,j) = pairDPDE->cutsq[i][j];
+        k_cutsq.h_view(j,i) = k_cutsq.h_view(i,j);
+      }
+
+    k_cutsq.template modify<LMPHostType>();
+    k_cutsq.template sync<DeviceType>();
+  }
 
   // Initialize the local temperature weight array
-  int sumWeightsCt = nlocal + (IS_NEWTON_PAIR ? nghost : 0);
-  sumWeights = new double[sumWeightsCt];
-  memset(sumWeights, 0, sizeof(double)*sumWeightsCt);
+  int sumWeightsCt = nlocal + (NEWTON_PAIR ? nghost : 0);
+  //sumWeights = new double[sumWeightsCt];
+  //memset(sumWeights, 0, sizeof(double)*sumWeightsCt);
 
-  inum = list->inum;
-  ilist = list->ilist;
-  numneigh = list->numneigh;
-  firstneigh = list->firstneigh;
+  memory->create_kokkos (k_sumWeights, sumWeights, sumWeightsCt, "FixRxKokkos::sumWeights");
+  d_sumWeights = k_sumWeights.d_view;
+
+  // Initialize the accumulator to zero ...
+  Kokkos::parallel_for(nlocal,
+		  LAMMPS_LAMBDA(int i)
+		  {
+		    d_sumWeights(i) = 0.0;
+		  }
+		  );
+
+  const int inum = list->inum;
+
+  // Local list views. (This isn't working!)
+  //NeighListKokkos<DeviceType>* k_list = static_cast<NeighListKokkos<DeviceType>*>(list);
+  //if (not(list->kokkos))
+  //{
+  //  error->one(FLERR,"list is not a Kokkos list\n");
+  //}
+
+  //typename ArrayTypes<DeviceType>::t_neighbors_2d d_neighbors = k_list->d_neighbors;
+  //typename ArrayTypes<DeviceType>::t_int_1d       d_ilist     = k_list->d_ilist;
+  //typename ArrayTypes<DeviceType>::t_int_1d       d_numneigh  = k_list->d_numneigh;
+
+  int* ilist = list->ilist;
+  int* numneigh = list->numneigh;
+  int** firstneigh = list->firstneigh;
 
   // loop over neighbors of my atoms
-  for (ii = 0; ii < inum; ii++) {
-    i = ilist[ii];
-    xtmp = x[i][0];
-    ytmp = x[i][1];
-    ztmp = x[i][2];
-    itype = type[i];
-    jlist = firstneigh[i];
-    jnum = numneigh[i];
+  for (int ii = 0; ii < inum; ii++)
+  {
+    const int i = ilist[ii];
+    //const int i = d_ilist(ii);
+ 
+    //const double xtmp = x[i][0];
+    //const double ytmp = x[i][1];
+    //const double ztmp = x[i][2];
+    //const int itype = type[i];
+    const double xtmp = d_x(i,0);
+    const double ytmp = d_x(i,1);
+    const double ztmp = d_x(i,2);
+    const int itype = d_type(i);
 
-    for (jj = 0; jj < jnum; jj++) {
-      j = jlist[jj];
-      j &= NEIGHMASK;
-      jtype = type[j];
+    int *jlist = firstneigh[i];
+    const int jnum = numneigh[i];
+    //const int jnum = d_numneigh(i);
 
-      delx = xtmp - x[j][0];
-      dely = ytmp - x[j][1];
-      delz = ztmp - x[j][2];
-      rsq = delx*delx + dely*dely + delz*delz;
+    for (int jj = 0; jj < jnum; jj++)
+    {
+      const int j = (jlist[jj] & NEIGHMASK);
+      //const int j = (d_neighbors(i,jj) & NEIGHMASK);
+      //const int jtype = type[j];
+      const int jtype = d_type(j);
 
-      if (rsq < pairDPDE->cutsq[itype][jtype]) {
-        double rcut = sqrt(pairDPDE->cutsq[itype][jtype]);
+      //const double delx = xtmp - x[j][0];
+      //const double dely = ytmp - x[j][1];
+      //const double delz = ztmp - x[j][2];
+      const double delx = xtmp - d_x(j,0);
+      const double dely = ytmp - d_x(j,1);
+      const double delz = ztmp - d_x(j,2);
+      const double rsq = delx*delx + dely*dely + delz*delz;
+
+      const double cutsq_ij = d_cutsq(itype,jtype);
+
+      if (rsq < cutsq_ij)
+      {
+        const double rcut = sqrt( cutsq_ij );
         double rij = sqrt(rsq);
         double ratio = rij/rcut;
 
+        double wij = 0.0;
+
         // Lucy's Weight Function
         if (WT_FLAG == LUCY)
 	{
           wij = (1.0+3.0*ratio) * (1.0-ratio)*(1.0-ratio)*(1.0-ratio);
-          dpdThetaLocal[i] += wij/dpdTheta[j];
-          if (IS_NEWTON_PAIR || j < nlocal)
-            dpdThetaLocal[j] += wij/dpdTheta[i];
+          d_dpdThetaLocal(i) += wij / d_dpdTheta(j);
+          if (NEWTON_PAIR || j < nlocal)
+            d_dpdThetaLocal(j) += wij / d_dpdTheta(i);
         }
 
-        sumWeights[i] += wij;
-        if (IS_NEWTON_PAIR || j < nlocal)
-          sumWeights[j] += wij;
+        d_sumWeights(i) += wij;
+        if (NEWTON_PAIR || j < nlocal)
+          d_sumWeights(j) += wij;
       }
     }
   }
-  if (IS_NEWTON_PAIR) comm->reverse_comm_fix(this);
+
+  if (NEWTON_PAIR) comm->reverse_comm_fix(this);
 
   // self-interaction for local temperature
-  for (i = 0; i < nlocal; i++){
+  for (int i = 0; i < nlocal; i++)
+  {
+    double wij = 0.0;
 
     // Lucy Weight Function
     if (WT_FLAG == LUCY)
     {
       wij = 1.0;
-      dpdThetaLocal[i] += wij / dpdTheta[i];
+      d_dpdThetaLocal(i) += wij / d_dpdTheta(i);
     }
-    sumWeights[i] += wij;
+    d_sumWeights(i) += wij;
 
     // Normalized local temperature
-    dpdThetaLocal[i] = dpdThetaLocal[i] / sumWeights[i];
+    d_dpdThetaLocal(i) = d_dpdThetaLocal(i) / d_sumWeights(i);
 
     if (LOCAL_TEMP_FLAG == HARMONIC)
-      dpdThetaLocal[i] = 1.0 / dpdThetaLocal[i];
-
+      d_dpdThetaLocal(i) = 1.0 / d_dpdThetaLocal(i);
   }
 
-  delete [] sumWeights;
+  // Clean up the local kokkos data.
+  memory->destroy_kokkos(k_cutsq, h_cutsq);
+  memory->destroy_kokkos(k_sumWeights, sumWeights);
+
+  //delete [] sumWeights;
 }
 
 namespace LAMMPS_NS {
diff --git a/src/KOKKOS/fix_rx_kokkos.h b/src/KOKKOS/fix_rx_kokkos.h
index ec9a8fa976..9d60f2b99e 100644
--- a/src/KOKKOS/fix_rx_kokkos.h
+++ b/src/KOKKOS/fix_rx_kokkos.h
@@ -121,7 +121,11 @@ class FixRxKokkos : public FixRX {
 
   void create_kinetics_data(void);
 
-  template <int WT_FLAG, int LOCAL_TEMP_FLAG, bool IS_NEWTON_PAIR>
+  // Need a dual-view and device-view for dpdThetaLocal and sumWeights since they're used in several callbacks.
+  DAT::tdual_efloat_1d k_dpdThetaLocal, k_sumWeights;
+  typename ArrayTypes<DeviceType>::t_efloat_1d d_dpdThetaLocal, d_sumWeights;
+
+  template <int WT_FLAG, int LOCAL_TEMP_FLAG, bool NEWTON_PAIR>
   void computeLocalTemperature();
 
 };

From 0d57a1d831e6a73e55491adb982c44175be4c76f Mon Sep 17 00:00:00 2001
From: Christopher Stone <chris.stone@computational-science.com>
Date: Fri, 3 Feb 2017 16:09:06 -0500
Subject: [PATCH 114/267] Added setup_pre_force, pack/unpack methods to
 FixRxKokkos.

- Added a kokkos version of setup_pre_force that only sets dvector
  and then communicates that.
- Converted all for loops to parallel_for's in computeLocalTemperature()
  and setup_pre_force.
- Added pack/unpack forward/reverse methods with Kokkos host views.

TODO:
 - The Kokkos neighbor list is not working. Need to request a Kokkos
   neighbor list in ::init(). Then, replace objects like list->ilist[]
   with k_list->d_ilist().
---
 src/KOKKOS/fix_rx_kokkos.cpp | 343 ++++++++++++++++++++++++++---------
 src/KOKKOS/fix_rx_kokkos.h   |  12 +-
 2 files changed, 272 insertions(+), 83 deletions(-)

diff --git a/src/KOKKOS/fix_rx_kokkos.cpp b/src/KOKKOS/fix_rx_kokkos.cpp
index 491b32e01d..167f2713ea 100644
--- a/src/KOKKOS/fix_rx_kokkos.cpp
+++ b/src/KOKKOS/fix_rx_kokkos.cpp
@@ -77,6 +77,18 @@ FixRxKokkos<DeviceType>::~FixRxKokkos()
 
 /* ---------------------------------------------------------------------- */
 
+template <typename DeviceType>
+void FixRxKokkos<DeviceType>::post_constructor()
+{
+  // Run the parents and then reset one value.
+  FixRX::post_constructor();
+
+  // Need a copy of this
+  this->my_restartFlag = modify->fix[modify->nfix-1]->restart_reset;
+}
+
+/* ---------------------------------------------------------------------- */
+
 template <typename DeviceType>
 void FixRxKokkos<DeviceType>::init()
 {
@@ -763,6 +775,51 @@ void FixRxKokkos<DeviceType>::create_kinetics_data(void)
 
 /* ---------------------------------------------------------------------- */
 
+template <typename DeviceType>
+void FixRxKokkos<DeviceType>::setup_pre_force(int vflag)
+{
+  printf("Inside FixRxKokkos<DeviceType>::setup_pre_force restartFlag= %d\n", my_restartFlag);
+
+  if (my_restartFlag)
+    my_restartFlag = 0;
+  else
+  {
+    const int nlocal = atom->nlocal;
+    //const int nghost = atom->nghost;
+    //const int *mask = atom->mask;
+    //const int newton_pair = force->newton_pair;
+
+    typename ArrayTypes<DeviceType>::t_float_2d  d_dvector = atomKK->k_dvector.view<DeviceType>();
+
+    // Get up-to-date data.
+    atomKK->sync( execution_space, DVECTOR_MASK );
+
+    // The only net effect from fix_rx.cpp is to set dvector[nspecies:2*nspecies]
+    // since the reactions are set to zero for step 0.
+    Kokkos::parallel_for ( nlocal,
+        LAMMPS_LAMBDA(const int i)
+        {
+          for (int ispecies = 0; ispecies < nspecies; ispecies++)
+            d_dvector(ispecies+nspecies,i) = d_dvector(ispecies,i);
+        }
+      );
+
+    // Signal that dvector has been modified on this execution space.
+    atomKK->modified( execution_space, DVECTOR_MASK );
+
+    // Communicate the updated species data to all nodes
+    atomKK->sync ( Host, DVECTOR_MASK );
+
+    // Communicate the dvector to all nodes
+    comm->forward_comm_fix(this);
+
+    // Flag that dvector was updated on the host in the comm.
+    atomKK->modified ( Host, DVECTOR_MASK );
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
 template <typename DeviceType>
 void FixRxKokkos<DeviceType>::pre_force(int vflag)
 {
@@ -789,18 +846,31 @@ void FixRxKokkos<DeviceType>::pre_force(int vflag)
 
     memory->create_kokkos (k_dpdThetaLocal, dpdThetaLocal, count, "FixRxKokkos::dpdThetaLocal");
     d_dpdThetaLocal = k_dpdThetaLocal.d_view;
+    h_dpdThetaLocal = k_dpdThetaLocal.h_view;
+
+    const int neighflag = lmp->kokkos->neighflag;
+
+#define _template_switch(_wtflag, _localTempFlag) { \
+       if (neighflag == HALF) \
+          if (newton_pair) \
+             computeLocalTemperature<_wtflag, _localTempFlag, true , HALF> (); \
+          else \
+             computeLocalTemperature<_wtflag, _localTempFlag, false, HALF> (); \
+       else if (neighflag == HALFTHREAD) \
+          if (newton_pair) \
+             computeLocalTemperature<_wtflag, _localTempFlag, true , HALFTHREAD> (); \
+          else \
+             computeLocalTemperature<_wtflag, _localTempFlag, false, HALFTHREAD> (); \
+    }
 
     // Are there is no other options than wtFlag = (0)LUCY and localTempFlag = NONE : HARMONIC?
-    if (localTempFlag == HARMONIC)
-      if (newton_pair)
-        computeLocalTemperature<LUCY, HARMONIC, true > ();
-      else
-        computeLocalTemperature<LUCY, HARMONIC, false> ();
-    else
-      if (newton_pair)
-        computeLocalTemperature<LUCY, NONE    , true > ();
-      else
-        computeLocalTemperature<LUCY, NONE    , false> ();
+    if (localTempFlag == HARMONIC) {
+       _template_switch(LUCY, HARMONIC)
+    }
+    else {
+       _template_switch(LUCY, NONE)
+    }
+#undef _template_switch
   }
 
   TimerType timer_localTemperature = getTimeStamp();
@@ -972,10 +1042,9 @@ void FixRxKokkos<DeviceType>::pre_force(int vflag)
 /* ---------------------------------------------------------------------- */
 
 template <typename DeviceType>
-  template <int WT_FLAG, int LOCAL_TEMP_FLAG, bool NEWTON_PAIR>
+  template <int WT_FLAG, int LOCAL_TEMP_FLAG, bool NEWTON_PAIR, int NEIGHFLAG>
 void FixRxKokkos<DeviceType>::computeLocalTemperature()
 {
-  printf("Inside FixRxKokkos::computeLocalTemperature: %d %d %d %d\n", WT_FLAG, LOCAL_TEMP_FLAG, NEWTON_PAIR, (int)lmp->kokkos->neighflag);
 
   //int inum,jnum,itype,jtype;
   //double xtmp,ytmp,ztmp,delx,dely,delz;
@@ -996,10 +1065,12 @@ void FixRxKokkos<DeviceType>::computeLocalTemperature()
   const int nghost = atom->nghost;
   //const int newton_pair = force->newton_pair;
 
+  printf("Inside FixRxKokkos::computeLocalTemperature: %d %d %d %d %d\n", WT_FLAG, LOCAL_TEMP_FLAG, NEWTON_PAIR, (int)lmp->kokkos->neighflag, NEIGHFLAG, nlocal, nghost);
+
   // local temperature variables
   //double wij=0.0;
 
-  // Pull from pairDPDE. The pairDPDEKK objects are producted so recreate here for now.
+  // Pull from pairDPDE. The pairDPDEKK objects are protected so recreate here for now.
   //pairDPDEKK->k_cutsq.template sync<DeviceType>();
   //typename ArrayTypes<DeviceType>::t_ffloat_2d d_cutsq = pairDPDEKK->k_cutsq.template view<DeviceType();
 
@@ -1032,14 +1103,15 @@ void FixRxKokkos<DeviceType>::computeLocalTemperature()
 
   memory->create_kokkos (k_sumWeights, sumWeights, sumWeightsCt, "FixRxKokkos::sumWeights");
   d_sumWeights = k_sumWeights.d_view;
+  h_sumWeights = k_sumWeights.h_view;
 
   // Initialize the accumulator to zero ...
-  Kokkos::parallel_for(nlocal,
-		  LAMMPS_LAMBDA(int i)
-		  {
-		    d_sumWeights(i) = 0.0;
-		  }
-		  );
+  Kokkos::parallel_for (sumWeightsCt,
+        LAMMPS_LAMBDA(const int i)
+        {
+           d_sumWeights(i) = 0.0;
+        }
+     );
 
   const int inum = list->inum;
 
@@ -1059,86 +1131,106 @@ void FixRxKokkos<DeviceType>::computeLocalTemperature()
   int** firstneigh = list->firstneigh;
 
   // loop over neighbors of my atoms
-  for (int ii = 0; ii < inum; ii++)
-  {
-    const int i = ilist[ii];
-    //const int i = d_ilist(ii);
+  Kokkos::parallel_for ( inum,
+        LAMMPS_LAMBDA(const int ii)
+        {
+          // Create an atomic view of sumWeights and dpdThetaLocal. Only needed
+          // for Half/thread scenarios.
+          typedef Kokkos::View< E_FLOAT*, typename DAT::t_efloat_1d::array_layout, DeviceType, Kokkos::MemoryTraits< AtomicF< NEIGHFLAG >::value> > AtomicViewType;
+
+          AtomicViewType a_dpdThetaLocal = d_dpdThetaLocal;
+          AtomicViewType a_sumWeights    = d_sumWeights;
+
+          // Local scalar accumulators.
+          double i_dpdThetaLocal = 0.0;
+          double i_sumWeights    = 0.0;
+
+          const int i = ilist[ii];
+          //const int i = d_ilist(ii);
  
-    //const double xtmp = x[i][0];
-    //const double ytmp = x[i][1];
-    //const double ztmp = x[i][2];
-    //const int itype = type[i];
-    const double xtmp = d_x(i,0);
-    const double ytmp = d_x(i,1);
-    const double ztmp = d_x(i,2);
-    const int itype = d_type(i);
+          const double xtmp = d_x(i,0);
+          const double ytmp = d_x(i,1);
+          const double ztmp = d_x(i,2);
+          const int itype = d_type(i);
 
-    int *jlist = firstneigh[i];
-    const int jnum = numneigh[i];
-    //const int jnum = d_numneigh(i);
+          int *jlist = firstneigh[i];
+          const int jnum = numneigh[i];
+          //const int jnum = d_numneigh(i);
 
-    for (int jj = 0; jj < jnum; jj++)
-    {
-      const int j = (jlist[jj] & NEIGHMASK);
-      //const int j = (d_neighbors(i,jj) & NEIGHMASK);
-      //const int jtype = type[j];
-      const int jtype = d_type(j);
+          for (int jj = 0; jj < jnum; jj++)
+          {
+            const int j = (jlist[jj] & NEIGHMASK);
+            //const int j = (d_neighbors(i,jj) & NEIGHMASK);
+            const int jtype = d_type(j);
 
-      //const double delx = xtmp - x[j][0];
-      //const double dely = ytmp - x[j][1];
-      //const double delz = ztmp - x[j][2];
-      const double delx = xtmp - d_x(j,0);
-      const double dely = ytmp - d_x(j,1);
-      const double delz = ztmp - d_x(j,2);
-      const double rsq = delx*delx + dely*dely + delz*delz;
+            const double delx = xtmp - d_x(j,0);
+            const double dely = ytmp - d_x(j,1);
+            const double delz = ztmp - d_x(j,2);
+            const double rsq = delx*delx + dely*dely + delz*delz;
 
-      const double cutsq_ij = d_cutsq(itype,jtype);
+            const double cutsq_ij = d_cutsq(itype,jtype);
 
-      if (rsq < cutsq_ij)
-      {
-        const double rcut = sqrt( cutsq_ij );
-        double rij = sqrt(rsq);
-        double ratio = rij/rcut;
+            if (rsq < cutsq_ij)
+            {
+              const double rcut = sqrt( cutsq_ij );
+              double rij = sqrt(rsq);
+              double ratio = rij/rcut;
 
-        double wij = 0.0;
+              double wij = 0.0;
 
-        // Lucy's Weight Function
-        if (WT_FLAG == LUCY)
-	{
-          wij = (1.0+3.0*ratio) * (1.0-ratio)*(1.0-ratio)*(1.0-ratio);
-          d_dpdThetaLocal(i) += wij / d_dpdTheta(j);
-          if (NEWTON_PAIR || j < nlocal)
-            d_dpdThetaLocal(j) += wij / d_dpdTheta(i);
+              // Lucy's Weight Function
+              if (WT_FLAG == LUCY)
+              {
+                wij = (1.0+3.0*ratio) * (1.0-ratio)*(1.0-ratio)*(1.0-ratio);
+                i_dpdThetaLocal += wij / d_dpdTheta(j);
+                if (NEWTON_PAIR || j < nlocal)
+                  a_dpdThetaLocal(j) += wij / d_dpdTheta(i);
+              }
+
+              i_sumWeights += wij;
+              if (NEWTON_PAIR || j < nlocal)
+                a_sumWeights(j) += wij;
+            }
+          }
+
+          // Update, don't assign, the array value (because another iteration may have hit it).
+          a_dpdThetaLocal(i) += i_dpdThetaLocal;
+          a_sumWeights(i) += i_sumWeights;
         }
+     );
 
-        d_sumWeights(i) += wij;
-        if (NEWTON_PAIR || j < nlocal)
-          d_sumWeights(j) += wij;
-      }
-    }
-  }
+  // Signal that dpdThetaLocal and sumWeights have been modified.
+  k_dpdThetaLocal.template modify<DeviceType>();
+  k_sumWeights.   template modify<DeviceType>();
 
+  // Communicate the sum dpdTheta and the weights on the host.
   if (NEWTON_PAIR) comm->reverse_comm_fix(this);
 
+  // Update the device view in case they got changed.
+  k_dpdThetaLocal.template sync<DeviceType>();
+  k_sumWeights.   template sync<DeviceType>();
+
   // self-interaction for local temperature
-  for (int i = 0; i < nlocal; i++)
-  {
-    double wij = 0.0;
+  Kokkos::parallel_for ( nlocal,
+        LAMMPS_LAMBDA(const int i)
+        {
+          double wij = 0.0;
 
-    // Lucy Weight Function
-    if (WT_FLAG == LUCY)
-    {
-      wij = 1.0;
-      d_dpdThetaLocal(i) += wij / d_dpdTheta(i);
-    }
-    d_sumWeights(i) += wij;
+          // Lucy Weight Function
+          if (WT_FLAG == LUCY)
+          {
+            wij = 1.0;
+            d_dpdThetaLocal(i) += wij / d_dpdTheta(i);
+          }
+          d_sumWeights(i) += wij;
 
-    // Normalized local temperature
-    d_dpdThetaLocal(i) = d_dpdThetaLocal(i) / d_sumWeights(i);
+          // Normalized local temperature
+          d_dpdThetaLocal(i) = d_dpdThetaLocal(i) / d_sumWeights(i);
 
-    if (LOCAL_TEMP_FLAG == HARMONIC)
-      d_dpdThetaLocal(i) = 1.0 / d_dpdThetaLocal(i);
-  }
+          if (LOCAL_TEMP_FLAG == HARMONIC)
+            d_dpdThetaLocal(i) = 1.0 / d_dpdThetaLocal(i);
+        }
+     );
 
   // Clean up the local kokkos data.
   memory->destroy_kokkos(k_cutsq, h_cutsq);
@@ -1147,6 +1239,93 @@ void FixRxKokkos<DeviceType>::computeLocalTemperature()
   //delete [] sumWeights;
 }
 
+/* ---------------------------------------------------------------------- */
+
+template <typename DeviceType>
+int FixRxKokkos<DeviceType>::pack_forward_comm(int n, int *list, double *buf, int pbc_flag, int *pbc)
+{
+  //printf("inside FixRxKokkos::pack_forward_comm %d\n", comm->me);
+
+  HAT::t_float_2d h_dvector = atomKK->k_dvector.h_view;
+
+  int m = 0;
+  for (int ii = 0; ii < n; ii++) {
+    const int jj = list[ii];
+    for(int ispecies = 0; ispecies < nspecies; ispecies++){
+      buf[m++] = h_dvector(ispecies,jj);
+      buf[m++] = h_dvector(ispecies+nspecies,jj);
+    }
+  }
+
+  //printf("done with FixRxKokkos::pack_forward_comm %d\n", comm->me);
+
+  return m;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <typename DeviceType>
+void FixRxKokkos<DeviceType>::unpack_forward_comm(int n, int first, double *buf)
+{
+  //printf("inside FixRxKokkos::unpack_forward_comm %d\n", comm->me);
+
+  HAT::t_float_2d h_dvector = atomKK->k_dvector.h_view;
+
+  const int last = first + n ;
+  int m = 0;
+  for (int ii = first; ii < last; ii++){
+    for (int ispecies = 0; ispecies < nspecies; ispecies++){
+      h_dvector(ispecies,ii) = buf[m++];
+      h_dvector(ispecies+nspecies,ii) = buf[m++];
+    }
+  }
+
+  //printf("done with FixRxKokkos::unpack_forward_comm %d\n", comm->me);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <typename DeviceType>
+int FixRxKokkos<DeviceType>::pack_reverse_comm(int n, int first, double *buf)
+{
+  //printf("inside FixRxKokkos::pack_reverse_comm %d %d %d\n", comm->me, first, n);
+  // Sync the host view.
+  k_dpdThetaLocal.template sync<LMPHostType>();
+  k_sumWeights.   template sync<LMPHostType>();
+
+  const int last = first + n;
+  int m = 0;
+  for (int i = first; i < last; ++i)
+  {
+    buf[m++] = h_dpdThetaLocal(i);
+    buf[m++] = h_sumWeights(i);
+  }
+  //printf("done with FixRxKokkos::pack_reverse_comm %d\n", comm->me);
+
+  return m;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <typename DeviceType>
+void FixRxKokkos<DeviceType>::unpack_reverse_comm(int n, int *list, double *buf)
+{
+  // printf("inside FixRxKokkos::unpack_reverse_comm %d\n", comm->me);
+  int m = 0;
+  for (int i = 0; i < n; i++) {
+    const int j = list[i];
+
+    h_dpdThetaLocal(j) += buf[m++];
+    h_sumWeights(j) += buf[m++];
+  }
+
+  // Signal that the host view has been modified.
+  k_dpdThetaLocal.template modify<LMPHostType>();
+  k_sumWeights.   template modify<LMPHostType>();
+
+  // printf("done with FixRxKokkos::unpack_reverse_comm %d\n", comm->me);
+}
+
 namespace LAMMPS_NS {
 template class FixRxKokkos<LMPDeviceType>;
 #ifdef KOKKOS_HAVE_CUDA
diff --git a/src/KOKKOS/fix_rx_kokkos.h b/src/KOKKOS/fix_rx_kokkos.h
index 9d60f2b99e..d397d91499 100644
--- a/src/KOKKOS/fix_rx_kokkos.h
+++ b/src/KOKKOS/fix_rx_kokkos.h
@@ -40,6 +40,8 @@ class FixRxKokkos : public FixRX {
   FixRxKokkos(class LAMMPS *, int, char **);
   virtual ~FixRxKokkos();
   virtual void init();
+  void post_constructor();
+  virtual void setup_pre_force(int);
   virtual void pre_force(int);
 
   //template <typename SolverTag>
@@ -124,10 +126,18 @@ class FixRxKokkos : public FixRX {
   // Need a dual-view and device-view for dpdThetaLocal and sumWeights since they're used in several callbacks.
   DAT::tdual_efloat_1d k_dpdThetaLocal, k_sumWeights;
   typename ArrayTypes<DeviceType>::t_efloat_1d d_dpdThetaLocal, d_sumWeights;
+  typename HAT::t_efloat_1d h_dpdThetaLocal, h_sumWeights;
 
-  template <int WT_FLAG, int LOCAL_TEMP_FLAG, bool NEWTON_PAIR>
+  template <int WT_FLAG, int LOCAL_TEMP_FLAG, bool NEWTON_PAIR, int NEIGHFLAG>
   void computeLocalTemperature();
 
+  int pack_reverse_comm(int, int, double *);
+  void unpack_reverse_comm(int, int *, double *);
+  int pack_forward_comm(int , int *, double *, int, int *);
+  void unpack_forward_comm(int , int , double *);
+
+ private: // replicate a few from FixRX
+  int my_restartFlag;
 };
 
 }

From f2d005fb8db00fa90a151c2237985a77c7473b2a Mon Sep 17 00:00:00 2001
From: Christopher Stone <chris.stone@computational-science.com>
Date: Tue, 7 Feb 2017 16:24:59 -0500
Subject: [PATCH 115/267] Fixed errors in FixRxKokkos kokkos neighbor lists
 initialization and usage and calls to computeLocalTemperature.

- Created request for kokkos neighbor list for fix and switched to
  that neighbor list datatype in computeLocalTemperature.
- Reconfigured pre_force and setup_pre_force to call a common
  solve_reactions() method to avoid duplicate code.
TODO:
  - Clean-up
  - Provide per-problem scratch data within kokkos framework (instead
    of C++ new/delete data).
---
 src/KOKKOS/fix_rx_kokkos.cpp | 195 ++++++++++++++++++++++++++---------
 src/KOKKOS/fix_rx_kokkos.h   |  11 +-
 2 files changed, 148 insertions(+), 58 deletions(-)

diff --git a/src/KOKKOS/fix_rx_kokkos.cpp b/src/KOKKOS/fix_rx_kokkos.cpp
index 167f2713ea..1497fea6c1 100644
--- a/src/KOKKOS/fix_rx_kokkos.cpp
+++ b/src/KOKKOS/fix_rx_kokkos.cpp
@@ -21,6 +21,9 @@
 #include "update.h"
 #include "respa.h"
 #include "modify.h"
+#include "neighbor.h"
+#include "neigh_list_kokkos.h"
+#include "neigh_request.h"
 #include "error.h"
 #include "math_special.h"
 
@@ -95,24 +98,61 @@ void FixRxKokkos<DeviceType>::init()
   printf("Inside FixRxKokkos::init\n");
 
   // Call the parent's version.
-  FixRX::init();
+  //FixRX::init();
+
+  pairDPDE = (PairDPDfdtEnergy *) force->pair_match("dpd/fdt/energy",1);
+  if (pairDPDE == NULL)
+    pairDPDE = (PairDPDfdtEnergy *) force->pair_match("dpd/fdt/energy/kk",1);
+
+  if (pairDPDE == NULL)
+    error->all(FLERR,"Must use pair_style dpd/fdt/energy with fix rx");
 
   pairDPDEKK = dynamic_cast<decltype(pairDPDEKK)>(pairDPDE);
   if (pairDPDEKK == NULL)
     error->all(FLERR,"Must use pair_style dpd/fdt/energy/kk with fix rx/kk");
 
+  bool eos_flag = false;
+  for (int i = 0; i < modify->nfix; i++)
+    if (strcmp(modify->fix[i]->style,"eos/table/rx") == 0) eos_flag = true;
+  if(!eos_flag) error->all(FLERR,"fix rx requires fix eos/table/rx to be specified");
+
   if (update_kinetics_data)
     create_kinetics_data();
+
+  // From FixRX::init()
+  // need a half neighbor list
+  // built whenever re-neighboring occurs
+
+  int irequest = neighbor->request(this,instance_me);
+  neighbor->requests[irequest]->pair = 0;
+  neighbor->requests[irequest]->fix = 1;
+
+  // Update the neighbor data for Kokkos.
+  int neighflag = lmp->kokkos->neighflag;
+
+  neighbor->requests[irequest]->
+    kokkos_host = Kokkos::Impl::is_same<DeviceType,LMPHostType>::value &&
+    !Kokkos::Impl::is_same<DeviceType,LMPDeviceType>::value;
+  neighbor->requests[irequest]->
+    kokkos_device = Kokkos::Impl::is_same<DeviceType,LMPDeviceType>::value;
+ 
+  if (neighflag == FULL) {
+    neighbor->requests[irequest]->full = 1;
+    neighbor->requests[irequest]->half = 0;
+  } else { //if (neighflag == HALF || neighflag == HALFTHREAD)
+    neighbor->requests[irequest]->full = 0;
+    neighbor->requests[irequest]->half = 1;
+  }
 }
 
 /* ---------------------------------------------------------------------- */
 
-//template <typename DeviceType>
-//void FixRXKokkos<DeviceType>::init_list(int, class NeighList* ptr)
-//{
-//  printf("Inside FixRxKokkos::init_list\n");
-//  this->list = ptr;
-//}
+template <typename DeviceType>
+void FixRxKokkos<DeviceType>::init_list(int, class NeighList* ptr)
+{
+  printf("Inside FixRxKokkos::init_list\n");
+  this->list = ptr;
+}
 
 /* ---------------------------------------------------------------------- */
 
@@ -663,37 +703,6 @@ void FixRxKokkos<DeviceType>::operator()(SolverType, const int &i) const
 
 /* ---------------------------------------------------------------------- */
 
-template <typename DeviceType>
-void FixRxKokkos<DeviceType>::solve_reactions(void)
-{
-/*  int nlocal = atom->nlocal;
-  if (igroup == atom->firstgroup) nlocal = atom->nfirst;
-
-  using AT = ArrayTypes<DeviceType>;
-
-  atomKK->sync(execution_space, UCOND_MASK);
-  typename AT::t_efloat_1d uCond = atomKK->k_uCond.view<DeviceType>();
-  atomKK->sync(execution_space, UMECH_MASK);
-  typename AT::t_efloat_1d uMech = atomKK->k_uMech.view<DeviceType>();
-
-  pairDPDEKK->k_duCond.template sync<DeviceType>();
-  typename AT::t_efloat_1d_const duCond = pairDPDEKK->k_duCond.template view<DeviceType>();
-  pairDPDEKK->k_duMech.template sync<DeviceType>();
-  typename AT::t_efloat_1d_const duMech = pairDPDEKK->k_duMech.template view<DeviceType>();
-
-  auto dt = update->dt;
-
-  Kokkos::parallel_for(nlocal, LAMMPS_LAMBDA(int i) {
-    uCond(i) += 0.5*dt*duCond(i);
-    uMech(i) += 0.5*dt*duMech(i);
-  });
-
-  atomKK->modified(execution_space, UCOND_MASK);
-  atomKK->modified(execution_space, UMECH_MASK); */
-}
-
-/* ---------------------------------------------------------------------- */
-
 template <typename DeviceType>
 void FixRxKokkos<DeviceType>::create_kinetics_data(void)
 {
@@ -784,6 +793,9 @@ void FixRxKokkos<DeviceType>::setup_pre_force(int vflag)
     my_restartFlag = 0;
   else
   {
+#if 1
+    this->solve_reactions( vflag, false );
+#else
     const int nlocal = atom->nlocal;
     //const int nghost = atom->nghost;
     //const int *mask = atom->mask;
@@ -815,6 +827,7 @@ void FixRxKokkos<DeviceType>::setup_pre_force(int vflag)
 
     // Flag that dvector was updated on the host in the comm.
     atomKK->modified ( Host, DVECTOR_MASK );
+#endif
   }
 }
 
@@ -825,6 +838,15 @@ void FixRxKokkos<DeviceType>::pre_force(int vflag)
 {
   printf("Inside FixRxKokkos<DeviceType>::pre_force localTempFlag= %d\n", localTempFlag);
 
+  this->solve_reactions( vflag, true );
+}
+/* ---------------------------------------------------------------------- */
+
+template <typename DeviceType>
+void FixRxKokkos<DeviceType>::solve_reactions(const int vflag, const bool isPreForce)
+{
+  printf("Inside FixRxKokkos<DeviceType>::solve_reactions localTempFlag= %d isPreForce= %s\n", localTempFlag, isPreForce ? "True" : "false");
+
   if (update_kinetics_data)
     create_kinetics_data();
 
@@ -834,7 +856,8 @@ void FixRxKokkos<DeviceType>::pre_force(int vflag)
   const int nghost = atom->nghost;
   const int newton_pair = force->newton_pair;
 
-  const bool setToZero = false; // don't set the forward rates to zero.
+  //const bool setToZero = false; // don't set the forward rates to zero.
+  const bool setToZero = isPreForce == false; // Set the forward rates to zero if acting as setup_pre_force.
 
   if (localTempFlag)
   {
@@ -1115,16 +1138,71 @@ void FixRxKokkos<DeviceType>::computeLocalTemperature()
 
   const int inum = list->inum;
 
-  // Local list views. (This isn't working!)
-  //NeighListKokkos<DeviceType>* k_list = static_cast<NeighListKokkos<DeviceType>*>(list);
-  //if (not(list->kokkos))
-  //{
-  //  error->one(FLERR,"list is not a Kokkos list\n");
-  //}
+  bool useKokkosLists = false;
 
-  //typename ArrayTypes<DeviceType>::t_neighbors_2d d_neighbors = k_list->d_neighbors;
-  //typename ArrayTypes<DeviceType>::t_int_1d       d_ilist     = k_list->d_ilist;
-  //typename ArrayTypes<DeviceType>::t_int_1d       d_numneigh  = k_list->d_numneigh;
+  // Local list views. (This isn't working!)
+  NeighListKokkos<DeviceType>* k_list = static_cast<NeighListKokkos<DeviceType>*>(list);
+  if (not(list->kokkos))
+  {
+     //error->one(FLERR,"list is not a Kokkos list\n");
+     printf("list is NOT a Kokkos list\n");
+
+     int* ilist = list->ilist;
+     int* numneigh = list->numneigh;
+     int** firstneigh = list->firstneigh;
+     printf("inum= %d ilist= %x\n", inum, ilist);
+     for (int ii = 0; ii < std::min(inum,10); ++ii)
+     {
+        const int i = ilist[ii];
+        int *jlist = firstneigh[i];
+        const int jnum = numneigh[i];
+        const int j = (jlist[0] & NEIGHMASK);
+        printf("  ilist[%d]= %d j= %d jnum= %d\n", ii, i, j, jnum);
+     }
+  }
+  else
+  {
+     printf("It's a kokkos list\n");
+
+     useKokkosLists = true;
+
+     typename ArrayTypes<DeviceType>::t_neighbors_2d d_neighbors = k_list->d_neighbors;
+     typename ArrayTypes<DeviceType>::t_int_1d       d_ilist     = k_list->d_ilist;
+     typename ArrayTypes<DeviceType>::t_int_1d       d_numneigh  = k_list->d_numneigh;
+
+     static FILE *fp1 = NULL;
+
+     //if (fp1 == NULL)
+     //   fp1 = fopen("kokkos_list.txt","w");
+
+     if (fp1 != NULL)
+     {
+        const int inum = list->inum;
+        fprintf(fp1, "inum= %d\n", inum);
+        for (int ii = 0; ii < inum; ++ii)
+        {
+           const int i = d_ilist[ii];
+           const int jnum = d_numneigh[i];
+           fprintf(fp1, "  %d %d %d\n", ii, i, jnum);
+           for (int jj = 0; jj < jnum; ++jj)
+           {
+              const int j = (d_neighbors(i,jj) & NEIGHMASK);
+              fprintf(fp1, "    %d %d\n", jj, j);
+           }
+        }
+     }
+  }
+
+  typename ArrayTypes<DeviceType>::t_neighbors_2d d_neighbors;
+  typename ArrayTypes<DeviceType>::t_int_1d       d_ilist;
+  typename ArrayTypes<DeviceType>::t_int_1d       d_numneigh;
+
+  if (useKokkosLists)
+  {
+     d_neighbors = k_list->d_neighbors;
+     d_ilist     = k_list->d_ilist;
+     d_numneigh  = k_list->d_numneigh;
+  }
 
   int* ilist = list->ilist;
   int* numneigh = list->numneigh;
@@ -1145,8 +1223,9 @@ void FixRxKokkos<DeviceType>::computeLocalTemperature()
           double i_dpdThetaLocal = 0.0;
           double i_sumWeights    = 0.0;
 
-          const int i = ilist[ii];
+          //const int i = ilist[ii];
           //const int i = d_ilist(ii);
+          const int i = (useKokkosLists) ? d_ilist(ii) : ilist[ii];
  
           const double xtmp = d_x(i,0);
           const double ytmp = d_x(i,1);
@@ -1154,13 +1233,15 @@ void FixRxKokkos<DeviceType>::computeLocalTemperature()
           const int itype = d_type(i);
 
           int *jlist = firstneigh[i];
-          const int jnum = numneigh[i];
+          //const int jnum = numneigh[i];
           //const int jnum = d_numneigh(i);
+          const int jnum = (useKokkosLists) ? d_numneigh(i) : numneigh[i];
 
           for (int jj = 0; jj < jnum; jj++)
           {
-            const int j = (jlist[jj] & NEIGHMASK);
+            //const int j = (jlist[jj] & NEIGHMASK);
             //const int j = (d_neighbors(i,jj) & NEIGHMASK);
+            const int j = (useKokkosLists) ? (d_neighbors(i,jj) & NEIGHMASK) : (jlist[jj] & NEIGHMASK);
             const int jtype = d_type(j);
 
             const double delx = xtmp - d_x(j,0);
@@ -1232,6 +1313,18 @@ void FixRxKokkos<DeviceType>::computeLocalTemperature()
         }
      );
 
+  if (false)
+  {
+     static FILE *fp = NULL;
+
+     if (fp == NULL)
+        fp = fopen("kokkos_temp.txt","w");
+
+     fprintf(fp, "nlocal= %d %d\n", nlocal, nghost);
+     for (int i = 0; i < nlocal; ++i)
+        fprintf(fp, "%d %15.9e %15.9e\n", i, d_dpdThetaLocal[i], d_sumWeights[i]);
+  }
+
   // Clean up the local kokkos data.
   memory->destroy_kokkos(k_cutsq, h_cutsq);
   memory->destroy_kokkos(k_sumWeights, sumWeights);
diff --git a/src/KOKKOS/fix_rx_kokkos.h b/src/KOKKOS/fix_rx_kokkos.h
index d397d91499..36b05cb210 100644
--- a/src/KOKKOS/fix_rx_kokkos.h
+++ b/src/KOKKOS/fix_rx_kokkos.h
@@ -25,21 +25,18 @@ FixStyle(rx/kk/host,FixRxKokkos<LMPHostType>)
 #include "fix_rx.h"
 #include "pair_dpd_fdt_energy_kokkos.h"
 #include "kokkos_type.h"
+#include "neigh_list.h"
+#include "neigh_list_kokkos.h"
 
 namespace LAMMPS_NS {
 
-template <bool _setToZero>
-struct TagFixRxKokkosSolver
-{
-  enum { setToZero = (_setToZero == true) ? 1 : 0 };
-};
-
 template <typename DeviceType>
 class FixRxKokkos : public FixRX {
  public:
   FixRxKokkos(class LAMMPS *, int, char **);
   virtual ~FixRxKokkos();
   virtual void init();
+  void init_list(int, class NeighList *);
   void post_constructor();
   virtual void setup_pre_force(int);
   virtual void pre_force(int);
@@ -79,7 +76,7 @@ class FixRxKokkos : public FixRX {
   PairDPDfdtEnergyKokkos<DeviceType>* pairDPDEKK;
   double VDPD;
 
-  void solve_reactions(void);
+  void solve_reactions(const int vflag, const bool isPreForce = true);
 
   int rhs(double, const double *, double *, void *) const;
   int rhs_dense (double, const double *, double *, void *) const;

From 4e8351d9c8cb26328b667882290742029d5bbdfe Mon Sep 17 00:00:00 2001
From: Christopher Stone <chris.stone@computational-science.com>
Date: Tue, 7 Feb 2017 17:53:36 -0500
Subject: [PATCH 116/267] Code clean-up for FixRxKokkos.

- Removed dead code and old errors.
TODO:
  - Per-thread scratch data in kokkos.
  - ODE Diagnostics in kokkos.
---
 src/KOKKOS/fix_rx_kokkos.cpp | 166 +++--------------------------------
 1 file changed, 12 insertions(+), 154 deletions(-)

diff --git a/src/KOKKOS/fix_rx_kokkos.cpp b/src/KOKKOS/fix_rx_kokkos.cpp
index 1497fea6c1..b5055191c4 100644
--- a/src/KOKKOS/fix_rx_kokkos.cpp
+++ b/src/KOKKOS/fix_rx_kokkos.cpp
@@ -792,43 +792,7 @@ void FixRxKokkos<DeviceType>::setup_pre_force(int vflag)
   if (my_restartFlag)
     my_restartFlag = 0;
   else
-  {
-#if 1
     this->solve_reactions( vflag, false );
-#else
-    const int nlocal = atom->nlocal;
-    //const int nghost = atom->nghost;
-    //const int *mask = atom->mask;
-    //const int newton_pair = force->newton_pair;
-
-    typename ArrayTypes<DeviceType>::t_float_2d  d_dvector = atomKK->k_dvector.view<DeviceType>();
-
-    // Get up-to-date data.
-    atomKK->sync( execution_space, DVECTOR_MASK );
-
-    // The only net effect from fix_rx.cpp is to set dvector[nspecies:2*nspecies]
-    // since the reactions are set to zero for step 0.
-    Kokkos::parallel_for ( nlocal,
-        LAMMPS_LAMBDA(const int i)
-        {
-          for (int ispecies = 0; ispecies < nspecies; ispecies++)
-            d_dvector(ispecies+nspecies,i) = d_dvector(ispecies,i);
-        }
-      );
-
-    // Signal that dvector has been modified on this execution space.
-    atomKK->modified( execution_space, DVECTOR_MASK );
-
-    // Communicate the updated species data to all nodes
-    atomKK->sync ( Host, DVECTOR_MASK );
-
-    // Communicate the dvector to all nodes
-    comm->forward_comm_fix(this);
-
-    // Flag that dvector was updated on the host in the comm.
-    atomKK->modified ( Host, DVECTOR_MASK );
-#endif
-  }
 }
 
 /* ---------------------------------------------------------------------- */
@@ -856,17 +820,13 @@ void FixRxKokkos<DeviceType>::solve_reactions(const int vflag, const bool isPreF
   const int nghost = atom->nghost;
   const int newton_pair = force->newton_pair;
 
-  //const bool setToZero = false; // don't set the forward rates to zero.
-  const bool setToZero = isPreForce == false; // Set the forward rates to zero if acting as setup_pre_force.
+  // Set the forward rates to zero if acting as setup_pre_force.
+  const bool setRatesToZero = (isPreForce == false);
 
   if (localTempFlag)
   {
     const int count = nlocal + (newton_pair ? nghost : 0);
 
-    //dpdThetaLocal = new double[count];
-    //memset(dpdThetaLocal, 0, sizeof(double)*count);
-    //FixRx::computeLocalTemperature();
-
     memory->create_kokkos (k_dpdThetaLocal, dpdThetaLocal, count, "FixRxKokkos::dpdThetaLocal");
     d_dpdThetaLocal = k_dpdThetaLocal.d_view;
     h_dpdThetaLocal = k_dpdThetaLocal.h_view;
@@ -904,9 +864,6 @@ void FixRxKokkos<DeviceType>::solve_reactions(const int vflag, const bool isPreF
   // Set data needed in the operators.
   // ...
 
-  //int *mask = atom->mask;
-  //double *dpdTheta = atom->dpdTheta;
-
   // Local references to the atomKK objects.
   typename ArrayTypes<DeviceType>::t_efloat_1d d_dpdTheta = atomKK->k_dpdTheta.view<DeviceType>(); 
   typename ArrayTypes<DeviceType>::t_float_2d  d_dvector  = atomKK->k_dvector.view<DeviceType>();
@@ -941,13 +898,12 @@ void FixRxKokkos<DeviceType>::solve_reactions(const int vflag, const bool isPreF
 
         CounterType counter_i;
 
-        //const double theta = (localTempFlag) ? dpdThetaLocal[i] : d_dpdTheta(i);
         const double theta = (localTempFlag) ? d_dpdThetaLocal(i) : d_dpdTheta(i);
 
         //Compute the reaction rate constants
         for (int irxn = 0; irxn < nreactions; irxn++)
         {
-          if (setToZero)
+          if (setRatesToZero)
             userData.kFor[irxn] = 0.0;
           else
           {
@@ -1068,16 +1024,6 @@ template <typename DeviceType>
   template <int WT_FLAG, int LOCAL_TEMP_FLAG, bool NEWTON_PAIR, int NEIGHFLAG>
 void FixRxKokkos<DeviceType>::computeLocalTemperature()
 {
-
-  //int inum,jnum,itype,jtype;
-  //double xtmp,ytmp,ztmp,delx,dely,delz;
-  //double rsq;
-  //int *ilist,*jlist,*numneigh,**firstneigh;
-
-  //double **x = atom->x;
-  //int *type = atom->type;
-  //double *dpdTheta = atom->dpdTheta;
-
   typename ArrayTypes<DeviceType>::t_x_array_randomread d_x        = atomKK->k_x.view<DeviceType>();
   typename ArrayTypes<DeviceType>::t_int_1d_randomread  d_type     = atomKK->k_type.view<DeviceType>();
   typename ArrayTypes<DeviceType>::t_efloat_1d          d_dpdTheta = atomKK->k_dpdTheta.view<DeviceType>(); 
@@ -1086,12 +1032,8 @@ void FixRxKokkos<DeviceType>::computeLocalTemperature()
 
   const int nlocal = atom->nlocal;
   const int nghost = atom->nghost;
-  //const int newton_pair = force->newton_pair;
 
-  printf("Inside FixRxKokkos::computeLocalTemperature: %d %d %d %d %d\n", WT_FLAG, LOCAL_TEMP_FLAG, NEWTON_PAIR, (int)lmp->kokkos->neighflag, NEIGHFLAG, nlocal, nghost);
-
-  // local temperature variables
-  //double wij=0.0;
+  printf("Inside FixRxKokkos::computeLocalTemperature: %d %d %d %d %d %d %d\n", WT_FLAG, LOCAL_TEMP_FLAG, NEWTON_PAIR, (int)lmp->kokkos->neighflag, NEIGHFLAG, nlocal, nghost);
 
   // Pull from pairDPDE. The pairDPDEKK objects are protected so recreate here for now.
   //pairDPDEKK->k_cutsq.template sync<DeviceType>();
@@ -1121,8 +1063,6 @@ void FixRxKokkos<DeviceType>::computeLocalTemperature()
 
   // Initialize the local temperature weight array
   int sumWeightsCt = nlocal + (NEWTON_PAIR ? nghost : 0);
-  //sumWeights = new double[sumWeightsCt];
-  //memset(sumWeights, 0, sizeof(double)*sumWeightsCt);
 
   memory->create_kokkos (k_sumWeights, sumWeights, sumWeightsCt, "FixRxKokkos::sumWeights");
   d_sumWeights = k_sumWeights.d_view;
@@ -1136,77 +1076,16 @@ void FixRxKokkos<DeviceType>::computeLocalTemperature()
         }
      );
 
-  const int inum = list->inum;
-
-  bool useKokkosLists = false;
-
   // Local list views. (This isn't working!)
   NeighListKokkos<DeviceType>* k_list = static_cast<NeighListKokkos<DeviceType>*>(list);
   if (not(list->kokkos))
-  {
-     //error->one(FLERR,"list is not a Kokkos list\n");
-     printf("list is NOT a Kokkos list\n");
+     error->one(FLERR,"list is not a Kokkos list\n");
 
-     int* ilist = list->ilist;
-     int* numneigh = list->numneigh;
-     int** firstneigh = list->firstneigh;
-     printf("inum= %d ilist= %x\n", inum, ilist);
-     for (int ii = 0; ii < std::min(inum,10); ++ii)
-     {
-        const int i = ilist[ii];
-        int *jlist = firstneigh[i];
-        const int jnum = numneigh[i];
-        const int j = (jlist[0] & NEIGHMASK);
-        printf("  ilist[%d]= %d j= %d jnum= %d\n", ii, i, j, jnum);
-     }
-  }
-  else
-  {
-     printf("It's a kokkos list\n");
+  typename ArrayTypes<DeviceType>::t_neighbors_2d d_neighbors = k_list->d_neighbors;
+  typename ArrayTypes<DeviceType>::t_int_1d       d_ilist     = k_list->d_ilist;
+  typename ArrayTypes<DeviceType>::t_int_1d       d_numneigh  = k_list->d_numneigh;
 
-     useKokkosLists = true;
-
-     typename ArrayTypes<DeviceType>::t_neighbors_2d d_neighbors = k_list->d_neighbors;
-     typename ArrayTypes<DeviceType>::t_int_1d       d_ilist     = k_list->d_ilist;
-     typename ArrayTypes<DeviceType>::t_int_1d       d_numneigh  = k_list->d_numneigh;
-
-     static FILE *fp1 = NULL;
-
-     //if (fp1 == NULL)
-     //   fp1 = fopen("kokkos_list.txt","w");
-
-     if (fp1 != NULL)
-     {
-        const int inum = list->inum;
-        fprintf(fp1, "inum= %d\n", inum);
-        for (int ii = 0; ii < inum; ++ii)
-        {
-           const int i = d_ilist[ii];
-           const int jnum = d_numneigh[i];
-           fprintf(fp1, "  %d %d %d\n", ii, i, jnum);
-           for (int jj = 0; jj < jnum; ++jj)
-           {
-              const int j = (d_neighbors(i,jj) & NEIGHMASK);
-              fprintf(fp1, "    %d %d\n", jj, j);
-           }
-        }
-     }
-  }
-
-  typename ArrayTypes<DeviceType>::t_neighbors_2d d_neighbors;
-  typename ArrayTypes<DeviceType>::t_int_1d       d_ilist;
-  typename ArrayTypes<DeviceType>::t_int_1d       d_numneigh;
-
-  if (useKokkosLists)
-  {
-     d_neighbors = k_list->d_neighbors;
-     d_ilist     = k_list->d_ilist;
-     d_numneigh  = k_list->d_numneigh;
-  }
-
-  int* ilist = list->ilist;
-  int* numneigh = list->numneigh;
-  int** firstneigh = list->firstneigh;
+  const int inum = list->inum;
 
   // loop over neighbors of my atoms
   Kokkos::parallel_for ( inum,
@@ -1223,25 +1102,18 @@ void FixRxKokkos<DeviceType>::computeLocalTemperature()
           double i_dpdThetaLocal = 0.0;
           double i_sumWeights    = 0.0;
 
-          //const int i = ilist[ii];
-          //const int i = d_ilist(ii);
-          const int i = (useKokkosLists) ? d_ilist(ii) : ilist[ii];
+          const int i = d_ilist(ii);
  
           const double xtmp = d_x(i,0);
           const double ytmp = d_x(i,1);
           const double ztmp = d_x(i,2);
           const int itype = d_type(i);
 
-          int *jlist = firstneigh[i];
-          //const int jnum = numneigh[i];
-          //const int jnum = d_numneigh(i);
-          const int jnum = (useKokkosLists) ? d_numneigh(i) : numneigh[i];
+          const int jnum = d_numneigh(i);
 
           for (int jj = 0; jj < jnum; jj++)
           {
-            //const int j = (jlist[jj] & NEIGHMASK);
-            //const int j = (d_neighbors(i,jj) & NEIGHMASK);
-            const int j = (useKokkosLists) ? (d_neighbors(i,jj) & NEIGHMASK) : (jlist[jj] & NEIGHMASK);
+            const int j = (d_neighbors(i,jj) & NEIGHMASK);
             const int jtype = d_type(j);
 
             const double delx = xtmp - d_x(j,0);
@@ -1313,23 +1185,9 @@ void FixRxKokkos<DeviceType>::computeLocalTemperature()
         }
      );
 
-  if (false)
-  {
-     static FILE *fp = NULL;
-
-     if (fp == NULL)
-        fp = fopen("kokkos_temp.txt","w");
-
-     fprintf(fp, "nlocal= %d %d\n", nlocal, nghost);
-     for (int i = 0; i < nlocal; ++i)
-        fprintf(fp, "%d %15.9e %15.9e\n", i, d_dpdThetaLocal[i], d_sumWeights[i]);
-  }
-
   // Clean up the local kokkos data.
   memory->destroy_kokkos(k_cutsq, h_cutsq);
   memory->destroy_kokkos(k_sumWeights, sumWeights);
-
-  //delete [] sumWeights;
 }
 
 /* ---------------------------------------------------------------------- */

From 93d99ec8d0576aebebbc7658a891013326de6f6c Mon Sep 17 00:00:00 2001
From: Christopher Stone <chris.stone@computational-science.com>
Date: Thu, 9 Feb 2017 22:38:58 -0500
Subject: [PATCH 117/267] Added ODE diagnostics to FixRxKokkos using Kokkos
 managed data.

- Added the diagnostics performance analysis routine to FixRxKokkos
  using Kokkos views.
TODO:
  - Switch to using Kokkos data for the per-iteration scratch data.
    How to allocate only enough for each work-unit and not all
    iterations? Can the shared-memory scratch memory work for this,
    even for large sizes?
---
 src/KOKKOS/fix_rx_kokkos.cpp | 231 +++++++++++++++++++++++++++++++----
 src/KOKKOS/fix_rx_kokkos.h   |  13 ++
 2 files changed, 223 insertions(+), 21 deletions(-)

diff --git a/src/KOKKOS/fix_rx_kokkos.cpp b/src/KOKKOS/fix_rx_kokkos.cpp
index b5055191c4..2a3fc7547a 100644
--- a/src/KOKKOS/fix_rx_kokkos.cpp
+++ b/src/KOKKOS/fix_rx_kokkos.cpp
@@ -879,11 +879,22 @@ void FixRxKokkos<DeviceType>::solve_reactions(const int vflag, const bool isPreF
   // Average DPD volume. Used in the RHS function.
   this->VDPD = domain->xprd * domain->yprd * domain->zprd / atom->natoms;
 
-  /*if (odeIntegrationFlag == ODE_LAMMPS_RKF45 && diagnosticFrequency == 1)
+  if (odeIntegrationFlag == ODE_LAMMPS_RKF45 && diagnosticFrequency == 1)
   {
-    memory->create( diagnosticCounterPerODE[StepSum], nlocal, "FixRX::diagnosticCounterPerODE");
-    memory->create( diagnosticCounterPerODE[FuncSum], nlocal, "FixRX::diagnosticCounterPerODE");
-  }*/
+    memory->create_kokkos (k_diagnosticCounterPerODEnSteps, diagnosticCounterPerODEnSteps, nlocal, "FixRxKokkos::diagnosticCounterPerODEnSteps");
+    memory->create_kokkos (k_diagnosticCounterPerODEnFuncs, diagnosticCounterPerODEnFuncs, nlocal, "FixRxKokkos::diagnosticCounterPerODEnFuncs");
+
+    d_diagnosticCounterPerODEnSteps = k_diagnosticCounterPerODEnSteps.d_view;
+    d_diagnosticCounterPerODEnFuncs = k_diagnosticCounterPerODEnFuncs.d_view;
+
+    Kokkos::parallel_for ( nlocal,
+          LAMMPS_LAMBDA(const int i)
+          {
+             d_diagnosticCounterPerODEnSteps(i) = 0;
+             d_diagnosticCounterPerODEnFuncs(i) = 0;
+          }
+       );
+  }
 
   Kokkos::parallel_reduce( nlocal, LAMMPS_LAMBDA(int i, CounterType &counter)
     {
@@ -930,12 +941,11 @@ void FixRxKokkos<DeviceType>::solve_reactions(const int vflag, const bool isPreF
         {
           rkf45(nspecies, t_stop, y, rwork, &userData, counter_i);
 
-          //if (diagnosticFrequency == 1 && diagnosticCounterPerODE[StepSum] != NULL)
-          //if (diagnosticCounterPerODE[StepSum] != NULL)
-          //{
-          //  diagnosticCounterPerODE[StepSum][i] = counter_i.nSteps;
-          //  diagnosticCounterPerODE[FuncSum][i] = counter_i.nFuncs;
-          //}
+          if (diagnosticFrequency == 1)
+          {
+            d_diagnosticCounterPerODEnSteps(i) = counter_i.nSteps;
+            d_diagnosticCounterPerODEnFuncs(i) = counter_i.nFuncs;
+          }
         }
 
         // Store the solution back in dvector.
@@ -975,10 +985,7 @@ void FixRxKokkos<DeviceType>::solve_reactions(const int vflag, const bool isPreF
   atomKK->modified ( Host, DVECTOR_MASK );
 
   if (localTempFlag)
-  {
-    //delete [] dpdThetaLocal;
     memory->destroy_kokkos(k_dpdThetaLocal, dpdThetaLocal);
-  }
 
   TimerType timer_stop = getTimeStamp();
 
@@ -997,12 +1004,12 @@ void FixRxKokkos<DeviceType>::solve_reactions(const int vflag, const bool isPreF
     error->warning(FLERR, sbuf);
   }
 
-/*
   // Compute and report ODE diagnostics, if requested.
-  if (odeIntegrationFlag == ODE_LAMMPS_RKF45 && diagnosticFrequency != 0){
+  if (odeIntegrationFlag == ODE_LAMMPS_RKF45 && diagnosticFrequency != 0)
+  {
     // Update the counters.
-    diagnosticCounter[StepSum] += nSteps;
-    diagnosticCounter[FuncSum] += nFuncs;
+    diagnosticCounter[StepSum] += TotalCounters.nSteps;
+    diagnosticCounter[FuncSum] += TotalCounters.nFuncs;
     diagnosticCounter[TimeSum] += time_ODE;
     diagnosticCounter[AtomSum] += nlocal;
     diagnosticCounter[numDiagnosticCounters-1] ++;
@@ -1011,11 +1018,193 @@ void FixRxKokkos<DeviceType>::solve_reactions(const int vflag, const bool isPreF
                ((update->ntimestep - update->firststep) % diagnosticFrequency) == 0) ||
          (diagnosticFrequency < 0 && update->ntimestep == update->laststep) )
       this->odeDiagnostics();
+  }
+}
 
-    for (int i = 0; i < numDiagnosticCounters; ++i)
-      if (diagnosticCounterPerODE[i])
-        memory->destroy( diagnosticCounterPerODE[i] );
-  } */
+/* ---------------------------------------------------------------------- */
+
+template <typename DeviceType>
+void FixRxKokkos<DeviceType>::odeDiagnostics(void)
+{
+  TimerType timer_start = getTimeStamp();
+
+  // Compute:
+  // 1) Average # of ODE integrator steps and RHS evaluations per atom globally.
+  // 2) RMS     # of  ...
+  // 3) Average # of ODE steps and RHS evaluations per MPI task.
+  // 4) RMS     # of ODE steps and RHS evaluations per MPI task.
+  // 5) MAX     # of ODE steps and RHS evaluations per MPI task.
+  //
+  // ... 1,2 are for ODE control diagnostics.
+  // ... 3-5 are for load balancing diagnostics.
+  //
+  // To do this, we'll need to
+  // a) Allreduce (sum) the sum of nSteps / nFuncs. Dividing by atom->natoms
+  //    gives the avg # of steps/funcs per atom globally.
+  // b) Reduce (sum) to root the sum of squares of the differences.
+  //    i) Sum_i (steps_i - avg_steps_global)^2
+  //   ii) Sum_i (funcs_i - avg_funcs_global)^2
+  //  iii) (avg_steps_local - avg_steps_global)^2
+  //   iv) (avg_funcs_local - avg_funcs_global)^2
+
+  const int numCounters = numDiagnosticCounters-1;
+
+  // # of time-steps for averaging.
+  const int nTimes = this->diagnosticCounter[numDiagnosticCounters-1];
+
+  // # of ODE's per time-step (on average).
+  //const int nODEs  = this->diagnosticCounter[AtomSum] / nTimes;
+
+  // Sum up the sums from each task.
+  double sums[numCounters];
+  double my_vals[numCounters];
+  double max_per_proc[numCounters];
+  double min_per_proc[numCounters];
+
+  // Compute counters per dpd time-step.
+  for (int i = 0; i < numCounters; ++i){
+    my_vals[i] = this->diagnosticCounter[i] / nTimes;
+    //printf("my sum[%d] = %f %d\n", i, my_vals[i], comm->me);
+  }
+
+  MPI_Allreduce (my_vals, sums, numCounters, MPI_DOUBLE, MPI_SUM, world);
+
+  MPI_Reduce (my_vals, max_per_proc, numCounters, MPI_DOUBLE, MPI_MAX, 0, world);
+  MPI_Reduce (my_vals, min_per_proc, numCounters, MPI_DOUBLE, MPI_MIN, 0, world);
+
+  const double nODEs = sums[numCounters-1];
+
+  double avg_per_atom[numCounters], avg_per_proc[numCounters];
+
+  // Averages per-ODE and per-proc per time-step.
+  for (int i = 0; i < numCounters; ++i){
+    avg_per_atom[i] = sums[i] / nODEs;
+    avg_per_proc[i] = sums[i] / comm->nprocs;
+  }
+
+  // Sum up the differences from each task.
+  double sum_sq[2*numCounters];
+  double my_sum_sq[2*numCounters];
+  for (int i = 0; i < numCounters; ++i){
+    double diff_i = my_vals[i] - avg_per_proc[i];
+    my_sum_sq[i] = diff_i * diff_i;
+  }
+
+  double max_per_ODE[numCounters], min_per_ODE[numCounters];
+
+  // Process the per-ODE RMS of the # of steps/funcs
+  if (diagnosticFrequency == 1)
+  {
+    h_diagnosticCounterPerODEnSteps = k_diagnosticCounterPerODEnSteps.h_view;
+    h_diagnosticCounterPerODEnFuncs = k_diagnosticCounterPerODEnFuncs.h_view;
+
+    Kokkos::deep_copy( h_diagnosticCounterPerODEnSteps, d_diagnosticCounterPerODEnSteps );
+    Kokkos::deep_copy( h_diagnosticCounterPerODEnFuncs, d_diagnosticCounterPerODEnFuncs );
+
+    double my_max[numCounters], my_min[numCounters];
+
+    const int nlocal = atom->nlocal;
+    HAT::t_int_1d h_mask = atomKK->k_mask.h_view;
+
+    for (int i = 0; i < numCounters; ++i)
+    {
+      my_sum_sq[i+numCounters] = 0;
+      my_max[i] = 0;
+      my_min[i] = DBL_MAX;
+    }
+
+    for (int j = 0; j < nlocal; ++j)
+      if (h_mask(j) & groupbit)
+      {
+        int nSteps = h_diagnosticCounterPerODEnSteps(j);
+        double diff_nSteps = double( nSteps ) - avg_per_atom[StepSum];
+        my_sum_sq[StepSum+numCounters] += diff_nSteps*diff_nSteps;
+        my_max[StepSum] = std::max( my_max[StepSum], (double)nSteps );
+        my_min[StepSum] = std::min( my_min[StepSum], (double)nSteps );
+
+        int nFuncs = h_diagnosticCounterPerODEnFuncs(j);
+        double diff_nFuncs = double( nFuncs ) - avg_per_atom[FuncSum];
+        my_sum_sq[FuncSum+numCounters] += diff_nFuncs*diff_nFuncs;
+
+        my_max[FuncSum] = std::max( my_max[FuncSum], (double)nFuncs );
+        my_min[FuncSum] = std::min( my_min[FuncSum], (double)nFuncs );
+      }
+
+    memory->destroy_kokkos( k_diagnosticCounterPerODEnSteps, diagnosticCounterPerODEnSteps );
+    memory->destroy_kokkos( k_diagnosticCounterPerODEnFuncs, diagnosticCounterPerODEnFuncs );
+
+    MPI_Reduce (my_sum_sq, sum_sq, 2*numCounters, MPI_DOUBLE, MPI_SUM, 0, world);
+
+    MPI_Reduce (my_max, max_per_ODE, numCounters, MPI_DOUBLE, MPI_MAX, 0, world);
+    MPI_Reduce (my_min, min_per_ODE, numCounters, MPI_DOUBLE, MPI_MIN, 0, world);
+  }
+  else
+    MPI_Reduce (my_sum_sq, sum_sq, numCounters, MPI_DOUBLE, MPI_SUM, 0, world);
+
+  TimerType timer_stop = getTimeStamp();
+  double time_local = getElapsedTime( timer_start, timer_stop );
+
+  if (comm->me == 0){
+    char smesg[128];
+
+#define print_mesg(smesg) {\
+    if (screen)  fprintf(screen,"%s\n", smesg); \
+    if (logfile) fprintf(logfile,"%s\n", smesg); }
+
+    sprintf(smesg, "FixRX::ODE Diagnostics:  # of iters  |# of rhs evals| run-time (sec) | # atoms");
+    print_mesg(smesg);
+
+    sprintf(smesg, "         AVG per ODE  : %-12.5g | %-12.5g | %-12.5g", avg_per_atom[0], avg_per_atom[1], avg_per_atom[2]);
+    print_mesg(smesg);
+
+    // only valid for single time-step!
+    if (diagnosticFrequency == 1){
+      double rms_per_ODE[numCounters];
+      for (int i = 0; i < numCounters; ++i)
+        rms_per_ODE[i] = sqrt( sum_sq[i+numCounters] / nODEs );
+
+      sprintf(smesg, "         RMS per ODE  : %-12.5g | %-12.5g ", rms_per_ODE[0], rms_per_ODE[1]);
+      print_mesg(smesg);
+
+      sprintf(smesg, "         MAX per ODE  : %-12.5g | %-12.5g ", max_per_ODE[0], max_per_ODE[1]);
+      print_mesg(smesg);
+
+      sprintf(smesg, "         MIN per ODE  : %-12.5g | %-12.5g ", min_per_ODE[0], min_per_ODE[1]);
+      print_mesg(smesg);
+    }
+
+    sprintf(smesg, "         AVG per Proc : %-12.5g | %-12.5g | %-12.5g | %-12.5g", avg_per_proc[StepSum], avg_per_proc[FuncSum], avg_per_proc[TimeSum], avg_per_proc[AtomSum]);
+    print_mesg(smesg);
+
+    if (comm->nprocs > 1){
+      double rms_per_proc[numCounters];
+      for (int i = 0; i < numCounters; ++i)
+        rms_per_proc[i] = sqrt( sum_sq[i] / comm->nprocs );
+
+      sprintf(smesg, "         RMS per Proc : %-12.5g | %-12.5g | %-12.5g | %-12.5g", rms_per_proc[0], rms_per_proc[1], rms_per_proc[2], rms_per_proc[AtomSum]);
+      print_mesg(smesg);
+
+      sprintf(smesg, "         MAX per Proc : %-12.5g | %-12.5g | %-12.5g | %-12.5g", max_per_proc[0], max_per_proc[1], max_per_proc[2], max_per_proc[AtomSum]);
+      print_mesg(smesg);
+
+      sprintf(smesg, "         MIN per Proc : %-12.5g | %-12.5g | %-12.5g | %-12.5g", min_per_proc[0], min_per_proc[1], min_per_proc[2], min_per_proc[AtomSum]);
+      print_mesg(smesg);
+    }
+
+    sprintf(smesg, "  AVG'd over %d time-steps", nTimes);
+    print_mesg(smesg);
+    sprintf(smesg, "  AVG'ing took %g sec", time_local);
+    print_mesg(smesg);
+
+#undef print_mesg
+
+  }
+
+  // Reset the counters.
+  for (int i = 0; i < numDiagnosticCounters; ++i)
+    diagnosticCounter[i] = 0;
+
+  return;
 }
 
 /* ---------------------------------------------------------------------- */
diff --git a/src/KOKKOS/fix_rx_kokkos.h b/src/KOKKOS/fix_rx_kokkos.h
index 36b05cb210..4a11ac9fb9 100644
--- a/src/KOKKOS/fix_rx_kokkos.h
+++ b/src/KOKKOS/fix_rx_kokkos.h
@@ -97,6 +97,19 @@ class FixRxKokkos : public FixRX {
                      const double hmin, const double hmax,
                      double& h0, double y[], double rwk[], void *v_params) const;
 
+  //!< ODE Solver diagnostics.
+  void odeDiagnostics(void);
+
+  //!< Special counters per-ode.
+  int *diagnosticCounterPerODEnSteps;
+  int *diagnosticCounterPerODEnFuncs;
+  DAT::tdual_int_1d k_diagnosticCounterPerODEnSteps;
+  DAT::tdual_int_1d k_diagnosticCounterPerODEnFuncs;
+  typename ArrayTypes<DeviceType>::t_int_1d d_diagnosticCounterPerODEnSteps;
+  typename ArrayTypes<DeviceType>::t_int_1d d_diagnosticCounterPerODEnFuncs;
+  typename HAT::t_int_1d h_diagnosticCounterPerODEnSteps;
+  typename HAT::t_int_1d h_diagnosticCounterPerODEnFuncs;
+
   template <typename KokkosDeviceType>
   struct KineticsType
   {

From 4ac7a5d1f2e6132595c8999090e7b4159aa6971a Mon Sep 17 00:00:00 2001
From: Christopher Stone <chris.stone@computational-science.com>
Date: Sun, 12 Feb 2017 21:21:11 -0500
Subject: [PATCH 118/267] Added Kokkos-like array datatype into RK4 and RHS in
 FixRXKokkos.

- Created an Array class that provides strided access via operator[]
  without needing Kokkos views. This was designed to avoid the performance
  issues encountered with Views and sub-views throughout the RHS and
  ODE solver functions.
---
 src/KOKKOS/fix_rx_kokkos.cpp | 520 ++++++++++++++++++++++++++++++++++-
 src/KOKKOS/fix_rx_kokkos.h   |  52 +++-
 2 files changed, 570 insertions(+), 2 deletions(-)

diff --git a/src/KOKKOS/fix_rx_kokkos.cpp b/src/KOKKOS/fix_rx_kokkos.cpp
index 2a3fc7547a..a6da0306bb 100644
--- a/src/KOKKOS/fix_rx_kokkos.cpp
+++ b/src/KOKKOS/fix_rx_kokkos.cpp
@@ -202,6 +202,373 @@ void FixRxKokkos<DeviceType>::rk4(const double t_stop, double *y, double *rwork,
 
 /* ---------------------------------------------------------------------- */
 
+template <typename DeviceType>
+  template <typename UserDataType>
+void FixRxKokkos<DeviceType>::k_rk4(const double t_stop, double *y, double *rwork, UserDataType& userData) const
+{
+  double *k1 = rwork;
+  double *k2 = k1 + nspecies;
+  double *k3 = k2 + nspecies;
+  double *k4 = k3 + nspecies;
+  double *yp = k4 + nspecies;
+
+  const int numSteps = minSteps;
+
+  const double h = t_stop / double(numSteps); // fixed step size: t_stop divided evenly over numSteps
+
+  // Classic RK4: advance y in place through numSteps steps of size h; rwork holds k1..k4 and yp (5*nspecies doubles).
+  for (int step = 0; step < numSteps; step++)
+  {
+    // k1: slope at the current state y (k_rhs is always given 0.0 as its time argument).
+    k_rhs(0.0,y,k1, userData);
+
+    // k2: slope at the trial state yp = y + (h/2)*k1.
+    for (int ispecies = 0; ispecies < nspecies; ispecies++)
+      yp[ispecies] = y[ispecies] + 0.5*h*k1[ispecies];
+
+    k_rhs(0.0,yp,k2, userData);
+
+    // k3: slope at the trial state yp = y + (h/2)*k2.
+    for (int ispecies = 0; ispecies < nspecies; ispecies++)
+      yp[ispecies] = y[ispecies] + 0.5*h*k2[ispecies];
+
+    k_rhs(0.0,yp,k3, userData);
+
+    // k4: slope at the trial state yp = y + h*k3.
+    for (int ispecies = 0; ispecies < nspecies; ispecies++)
+      yp[ispecies] = y[ispecies] + h*k3[ispecies];
+
+    k_rhs(0.0,yp,k4, userData);
+
+    for (int ispecies = 0; ispecies < nspecies; ispecies++) // combine the four slopes with weights 1/6, 1/3, 1/3, 1/6
+      y[ispecies] += h*(k1[ispecies]/6.0 + k2[ispecies]/3.0 + k3[ispecies]/3.0 + k4[ispecies]/6.0);
+
+  } // end of the fixed-step loop
+
+}
+
+/* ---------------------------------------------------------------------- */
+
+//     f1 = dt*f(t,x)
+//     f2 = dt*f(t+ c20*dt,x + c21*f1)
+//     f3 = dt*f(t+ c30*dt,x + c31*f1 + c32*f2)
+//     f4 = dt*f(t+ c40*dt,x + c41*f1 + c42*f2 + c43*f3)
+//     f5 = dt*f(t+dt,x + c51*f1 + c52*f2 + c53*f3 + c54*f4)
+//     f6 = dt*f(t+ c60*dt,x + c61*f1 + c62*f2 + c63*f3 + c64*f4 + c65*f5)
+//
+//     fifth-order runge-kutta integration
+//        x5 = x + b1*f1 + b3*f3 + b4*f4 + b5*f5 + b6*f6
+//     fourth-order runge-kutta integration
+//        x  = x + a1*f1 + a3*f3 + a4*f4 + a5*f5
+
+template <typename DeviceType>
+  template <typename UserDataType>
+void FixRxKokkos<DeviceType>::k_rkf45_step (const int neq, const double h, double y[], double y_out[], double rwk[], UserDataType& userData) const
+{
+   const double c21=0.25;
+   const double c31=0.09375;
+   const double c32=0.28125; // cIJ: weight of fJ in the trial state that stage I is evaluated at
+   const double c41=0.87938097405553;
+   const double c42=-3.2771961766045;
+   const double c43=3.3208921256258;
+   const double c51=2.0324074074074;
+   const double c52=-8.0;
+   const double c53=7.1734892787524;
+   const double c54=-0.20589668615984;
+   const double c61=-0.2962962962963;
+   const double c62=2.0;
+   const double c63=-1.3816764132554;
+   const double c64=0.45297270955166;
+   const double c65=-0.275;
+   const double a1=0.11574074074074; // a*: weights of the 4th-order combination (r4 below)
+   const double a3=0.54892787524366;
+   const double a4=0.5353313840156;
+   const double a5=-0.2;
+   const double b1=0.11851851851852; // b*: weights of the 5th-order combination (r5 below)
+   const double b3=0.51898635477583;
+   const double b4=0.50613149034201;
+   const double b5=-0.18;
+   const double b6=0.036363636363636;
+
+   // local dependent variables (5 total)
+   double* f1 = &rwk[    0];
+   double* f2 = &rwk[  neq];
+   double* f3 = &rwk[2*neq];
+   double* f4 = &rwk[3*neq];
+   double* f5 = &rwk[4*neq];
+   double* f6 = &rwk[5*neq];
+
+   // scratch for the intermediate solution.
+   //double* ytmp = &rwk[6*neq];
+   double* ytmp = y_out;
+
+   // 1)
+   k_rhs (0.0, y, f1, userData);
+
+   for (int k = 0; k < neq; k++){
+      f1[k] *= h; // f1 = h * rhs(y)
+      ytmp[k] = y[k] + c21 * f1[k]; // trial state for stage 2
+   }
+
+   // 2) f2 = h * rhs(ytmp); then build the trial state for stage 3.
+   k_rhs(0.0, ytmp, f2, userData);
+
+   for (int k = 0; k < neq; k++){
+      f2[k] *= h;
+      ytmp[k] = y[k] + c31 * f1[k] + c32 * f2[k];
+   }
+
+   // 3) f3 = h * rhs(ytmp); then build the trial state for stage 4.
+   k_rhs(0.0, ytmp, f3, userData);
+
+   for (int k = 0; k < neq; k++) {
+      f3[k] *= h;
+      ytmp[k] = y[k] + c41 * f1[k] + c42 * f2[k] + c43 * f3[k];
+   }
+
+   // 4) f4 = h * rhs(ytmp); then build the trial state for stage 5.
+   k_rhs(0.0, ytmp, f4, userData);
+
+   for (int k = 0; k < neq; k++) {
+      f4[k] *= h;
+      ytmp[k] = y[k] + c51 * f1[k] + c52 * f2[k] + c53 * f3[k] + c54 * f4[k];
+   }
+
+   // 5) f5 = h * rhs(ytmp); then build the trial state for stage 6.
+   k_rhs(0.0, ytmp, f5, userData);
+
+   for (int k = 0; k < neq; k++) {
+      f5[k] *= h;
+      ytmp[k] = y[k] + c61*f1[k] + c62*f2[k] + c63*f3[k] + c64*f4[k] + c65*f5[k];
+   }
+
+   // 6) f6 = rhs(ytmp); it is scaled by h inside the final loop below.
+   k_rhs(0.0, ytmp, f6, userData);
+
+   for (int k = 0; k < neq; k++)
+   {
+      // Scale the last stage by h, as was done for f1..f5.
+      f6[k] *= h;
+
+      // 5th-order solution.
+      const double r5 = b1*f1[k] + b3*f3[k] + b4*f4[k] + b5*f5[k] + b6*f6[k];
+
+      // 4th-order solution.
+      const double r4 = a1*f1[k] + a3*f3[k] + a4*f4[k] + a5*f5[k];
+
+      // Truncation error: difference between 4th and 5th-order solutions.
+      rwk[k] = fabs(r5 - r4); // overwrites f1[k] (f1 aliases rwk); f1[k] was already consumed above for this k
+
+      // Update solution.
+    //y_out[k] = y[k] + r5; // Local extrapolation
+      y_out[k] = y[k] + r4; // the 4th-order result is what the caller receives (this also overwrites ytmp, which aliases y_out)
+   }
+
+   return;
+}
+
+template <typename DeviceType>
+  template <typename UserDataType>
+int FixRxKokkos<DeviceType>::k_rkf45_h0
+                    (const int neq, const double t, const double t_stop,
+                     const double hmin, const double hmax,
+                     double& h0, double y[], double rwk[], UserDataType& userData) const
+{
+   // Set lower and upper bounds on h0, and take geometric mean as first trial value.
+   // Exit with this value if the bounds cross each other.
+
+   // First trial value: the geometric mean of the two bounds.
+   double hg = sqrt(hmin*hmax);
+
+   //if (hmax < hmin)
+   //{
+   //   h0 = hg;
+   //   return;
+   //}
+
+   // Start iteration to find solution to ... {WRMS norm of (h0^2 y'' / 2)} = 1
+
+   double *ydot  = rwk;
+   double *y1    = ydot + neq;
+   double *ydot1 = y1 + neq;
+
+   const int max_iters = 10;
+   bool hnew_is_ok = false;
+   double hnew = hg;
+   int iter = 0;
+
+   // compute ydot at t=t0 (rwk is split into ydot, y1 and ydot1, i.e. 3*neq doubles are used)
+   k_rhs (t, y, ydot, userData);
+
+   while(1)
+   {
+      // Estimate y'' by differencing y' across an explicit Euler step of size hg.
+
+      for (int k = 0; k < neq; k++)
+         y1[k] = y[k] + hg * ydot[k];
+
+      // compute y' at t1
+      k_rhs (t + hg, y1, ydot1, userData);
+
+      // Compute WRMS norm of y'': each component is divided by (relTol*|y[k]| + absTol) before the RMS is taken.
+      double yddnrm = 0.0;
+      for (int k = 0; k < neq; k++){
+         double ydd = (ydot1[k] - ydot[k]) / hg;
+         double wterr = ydd / (relTol * fabs( y[k] ) + absTol);
+         yddnrm += wterr * wterr;
+      }
+
+      yddnrm = sqrt( yddnrm / double(neq) );
+
+      //std::cout << "iter " << _iter << " hg " << hg << " y'' " << yddnrm << std::endl;
+      //std::cout << "ydot " << ydot[neq-1] << std::endl;
+
+      // should we accept this?
+      if (hnew_is_ok || iter == max_iters){
+         hnew = hg;
+         if (iter == max_iters)
+            fprintf(stderr, "ERROR_HIN_MAX_ITERS\n");
+         break;
+      }
+
+      // New trial step: sqrt(2/yddnrm) when yddnrm*hmax^2 > 2, otherwise the geometric mean of hg and hmax.
+      hnew = (yddnrm*hmax*hmax > 2.0) ? sqrt(2.0 / yddnrm) : sqrt(hg * hmax);
+
+      // test the stopping conditions.
+      double hrat = hnew / hg;
+
+      // Accept this value ... the bias factor should bring it within range.
+      if ( (hrat > 0.5) && (hrat < 2.0) )
+         hnew_is_ok = true;
+
+      // If y'' is still bad after a few iterations, just accept h and give up.
+      if ( (iter > 1) && hrat > 2.0 ) {
+         hnew = hg;
+         hnew_is_ok = true;
+      }
+
+      //printf("iter=%d, yddnrw=%e, hnew=%e, hmin=%e, hmax=%e\n", iter, yddnrm, hnew, hmin, hmax);
+
+      hg = hnew;
+      iter ++;
+   }
+
+   // Bias the estimate by a factor of 1/2, then clamp it to [hmin, hmax].
+   h0 = hnew * 0.5;
+   h0 = fmax(h0, hmin);
+   h0 = fmin(h0, hmax);
+   //printf("h0=%e, hmin=%e, hmax=%e\n", h0, hmin, hmax);
+
+   return (iter + 1); // number of k_rhs calls made inside the loop; the single call before the loop is not included
+}
+
+template <typename DeviceType>
+  template <typename UserDataType>
+void FixRxKokkos<DeviceType>::k_rkf45(const int neq, const double t_stop, double *y, double *rwork, UserDataType& userData, CounterType& counter) const
+{
+  // Rounding coefficient.
+  const double uround = DBL_EPSILON;
+
+  // Adaption limit (shrink or grow)
+  const double adaption_limit = 4.0;
+
+  // Safety factor applied to the step-size adaption; this value is 0.5^(1/4) (0.9 is another common choice).
+  const double hsafe = 0.840896415;
+
+  // Time rounding factor.
+  const double tround = t_stop * uround;
+
+  // Counters for diagnostics.
+  int nst = 0; // # of steps (accepted)
+  int nit = 0; // # of iterations total
+  int nfe = 0; // # of RHS evaluations
+
+  // Min/Max step-size limits.
+  const double h_min = 100.0 * tround;
+  const double h_max = (minSteps > 0) ? t_stop / double(minSteps) : t_stop;
+
+  // Set the initial step-size. 0 forces an internal estimate ... stable Euler step size.
+  double h = (minSteps > 0) ? t_stop / double(minSteps) : 0.0;
+
+  double t = 0.0;
+
+  if (h < h_min){
+    // The requested step is below h_min (for example h = 0 because minSteps <= 0):
+    // estimate a starting step instead; the estimator's return value seeds the RHS-evaluation counter.
+    nfe = k_rkf45_h0 (neq, t, t_stop, h_min, h_max, h, y, rwork, userData);
+  }
+
+  //printf("t= %e t_stop= %e h= %e\n", t, t_stop, h);
+
+  // Integrate until we reach the end time.
+  while (fabs(t - t_stop) > tround){
+    double *yout = rwork;
+    double *eout = yout + neq;
+
+    // Take a trial step.
+    k_rkf45_step (neq, h, y, yout, eout, userData);
+
+    // Estimate the solution error (k_rkf45_step leaves |5th-order - 4th-order| for component k in eout[k]).
+      // ... weighted 2-norm of the error.
+      double err2 = 0.0;
+      for (int k = 0; k < neq; k++){
+        const double wterr = eout[k] / (relTol * fabs( y[k] ) + absTol);
+        err2 += wterr * wterr;
+      }
+
+    double err = fmax( uround, sqrt( err2 / double(neq) )); // RMS over the neq terms summed above; the uround floor keeps 1/err finite
+
+    // Accept the solution? A step taken with h already at h_min is accepted regardless of err.
+    if (err <= 1.0 || h <= h_min){
+      t += h;
+      nst++;
+
+      for (int k = 0; k < neq; k++)
+        y[k] = yout[k];
+    }
+
+    // Adjust h for the next step: hfac = hsafe * (1/err)^(1/4).
+    double hfac = hsafe * sqrt( sqrt( 1.0 / err ) );
+
+    // Limit the adaption.
+    hfac = fmax( hfac, 1.0 / adaption_limit );
+    hfac = fmin( hfac,       adaption_limit );
+
+    // Apply the adaption factor...
+    h *= hfac;
+
+    // Limit h.
+    h = fmin( h, h_max );
+    h = fmax( h, h_min );
+
+    // Stretch h if we're within 5% ... and we didn't just fail.
+    if (err <= 1.0 && (t + 1.05*h) > t_stop)
+      h = t_stop - t;
+
+    // And don't overshoot the end.
+    if (t + h > t_stop)
+      h = t_stop - t;
+
+    nit++;
+    nfe += 6;
+
+    if (maxIters && nit > maxIters){
+      //fprintf(stderr,"atom[%d] took too many iterations in rkf45 %d %e %e\n", id, nit, t, t_stop);
+      counter.nFails ++;
+      break;
+      // We should set an error here so that the solution is not used!
+    }
+
+  } // end while
+
+  counter.nSteps += nst;
+  counter.nIters += nit;
+  counter.nFuncs += nfe;
+
+  //printf("id= %d nst= %d nit= %d\n", id, nst, nit);
+}
+/* ---------------------------------------------------------------------- */
+
 //     f1 = dt*f(t,x)
 //     f2 = dt*f(t+ c20*dt,x + c21*f1)
 //     f3 = dt*f(t+ c30*dt,x + c31*f1 + c32*f2)
@@ -664,6 +1031,152 @@ int FixRxKokkos<DeviceType>::rhs_sparse(double t, const double *y, double *dydt,
 
 /* ---------------------------------------------------------------------- */
 
+template <typename DeviceType>
+  template <typename VectorType, typename UserDataType>
+int FixRxKokkos<DeviceType>::k_rhs(double t, const VectorType& y, VectorType& dydt, UserDataType& userData) const
+{
+  //StridedArrayType<double,1> _y( const_cast<double *>( y ) ), _dydt( dydt );
+
+  // Use the sparse format instead.
+  if (useSparseKinetics)
+    return this->k_rhs_sparse( t, y, dydt, userData);
+  else // useSparseKinetics is off: fall back to the dense form
+    return this->k_rhs_dense ( t, y, dydt, userData); // loops over every (reaction, species) pair
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <typename DeviceType>
+  template <typename VectorType, typename UserDataType>
+int FixRxKokkos<DeviceType>::k_rhs_dense(double t, const VectorType& y, VectorType& dydt, UserDataType& userData) const
+{
+  #define rxnRateLaw (userData.rxnRateLaw) // per-reaction scratch, filled in the first loop nest below
+  #define kFor       (userData.kFor      ) // per-reaction forward coefficients, only read here
+
+  //const double VDPD = domain->xprd * domain->yprd * domain->zprd / atom->natoms;
+  //const int nspecies = atom->nspecies_dpd;
+
+  for(int ispecies=0; ispecies<nspecies; ispecies++)
+    dydt[ispecies] = 0.0;
+
+  // Forward rate of each reaction: kFor[j] * product over species of (y[i]/VDPD)^stoichReactants(j,i).
+  for(int jrxn=0; jrxn<nreactions; jrxn++){
+    double rxnRateLawForward = kFor[jrxn];
+
+    for(int ispecies=0; ispecies<nspecies; ispecies++){
+      const double concentration = y[ispecies]/VDPD;
+      rxnRateLawForward *= pow( concentration, d_kineticsData.stoichReactants(jrxn,ispecies) );
+      //rxnRateLawForward *= pow(concentration,stoichReactants[jrxn][ispecies]);
+    }
+    rxnRateLaw[jrxn] = rxnRateLawForward;
+  }
+
+  // Rate of change of each species: dydt[i] = sum over reactions of stoich(j,i) * VDPD * rxnRateLaw[j].
+  for(int ispecies=0; ispecies<nspecies; ispecies++)
+    for(int jrxn=0; jrxn<nreactions; jrxn++)
+    {
+      dydt[ispecies] += d_kineticsData.stoich(jrxn,ispecies) *VDPD*rxnRateLaw[jrxn];
+      //dydt[ispecies] += stoich[jrxn][ispecies]*VDPD*rxnRateLaw[jrxn];
+    }
+
+  #undef rxnRateLaw
+  #undef kFor
+
+  return 0; // the time argument t is never used; 0 is the only value returned
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <typename DeviceType>
+  template <typename VectorType, typename UserDataType>
+int FixRxKokkos<DeviceType>::k_rhs_sparse(double t, const VectorType& y, VectorType& dydt, UserDataType& userData) const
+{
+   #define kFor         (userData.kFor)
+   #define kRev         (NULL) // not referenced anywhere in this function
+   #define rxnRateLaw   (userData.rxnRateLaw)
+   #define conc         (dydt) // dydt doubles as scratch for the concentrations until it is zeroed further down
+   #define maxReactants (this->sparseKinetics_maxReactants)
+   #define maxSpecies   (this->sparseKinetics_maxSpecies)
+   #define nuk          (this->d_kineticsData.nuk)
+   #define nu           (this->d_kineticsData.nu)
+   #define inu          (this->d_kineticsData.inu)
+   #define isIntegral(idx) ( SparseKinetics_enableIntegralReactions \
+                             && this->d_kineticsData.isIntegral(idx) )
+
+   for (int k = 0; k < nspecies; ++k)
+      conc[k] = y[k] / VDPD;
+
+   // Forward rate of each reaction: kFor[i] times each listed reactant concentration raised to its exponent.
+   for (int i = 0; i < nreactions; ++i)
+   {
+      double rxnRateLawForward; // entry 0 is read unchecked below - presumably every reaction has at least one reactant (confirm)
+      if (isIntegral(i)){ // integer exponents taken from inu, evaluated with powint
+         rxnRateLawForward = kFor[i] * powint( conc[ nuk(i,0) ], inu(i,0) );
+         for (int kk = 1; kk < maxReactants; ++kk){
+            const int k = nuk(i,kk);
+            if (k == SparseKinetics_invalidIndex) break;
+            //if (k != SparseKinetics_invalidIndex)
+               rxnRateLawForward *= powint( conc[k], inu(i,kk) );
+         }
+      } else { // exponents taken from nu, evaluated with pow
+         rxnRateLawForward = kFor[i] * pow( conc[ nuk(i,0) ], nu(i,0) );
+         for (int kk = 1; kk < maxReactants; ++kk){
+            const int k = nuk(i,kk);
+            if (k == SparseKinetics_invalidIndex) break;
+            //if (k != SparseKinetics_invalidIndex)
+               rxnRateLawForward *= pow( conc[k], nu(i,kk) );
+         }
+      }
+
+      rxnRateLaw[i] = rxnRateLawForward;
+   }
+
+   // Construct the reaction rates for each species from the
+   // Stoichiometric matrix and ROP vector.
+   for (int k = 0; k < nspecies; ++k)
+      dydt[k] = 0.0;
+
+   for (int i = 0; i < nreactions; ++i){
+      // Reactants: columns [0, maxReactants) of nuk/nu, subtracted from dydt; the first invalid index ends the list.
+      dydt[ nuk(i,0) ] -= nu(i,0) * rxnRateLaw[i];
+      for (int kk = 1; kk < maxReactants; ++kk){
+         const int k = nuk(i,kk);
+         if (k == SparseKinetics_invalidIndex) break;
+         //if (k != SparseKinetics_invalidIndex)
+            dydt[k] -= nu(i,kk) * rxnRateLaw[i];
+      }
+
+      // Products: columns [maxReactants, maxSpecies) of nuk/nu, added to dydt; the first column is read unchecked.
+      dydt[ nuk(i,maxReactants) ] += nu(i,maxReactants) * rxnRateLaw[i];
+      for (int kk = maxReactants+1; kk < maxSpecies; ++kk){
+         const int k = nuk(i,kk);
+         if (k == SparseKinetics_invalidIndex) break;
+         //if (k != SparseKinetics_invalidIndex)
+            dydt[k] += nu(i,kk) * rxnRateLaw[i];
+      }
+   }
+
+   // Add in the volume factor to convert to the proper units.
+   for (int k = 0; k < nspecies; ++k)
+      dydt[k] *= VDPD;
+
+   #undef kFor
+   #undef kRev
+   #undef rxnRateLaw
+   #undef conc
+   #undef maxReactants
+   #undef maxSpecies
+   #undef nuk
+   #undef nu
+   #undef inu
+   #undef isIntegral
+   //#undef invalidIndex
+
+   return 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
 /*template <typename DeviceType>
   template <typename SolverType>
     KOKKOS_INLINE_FUNCTION
@@ -907,6 +1420,10 @@ void FixRxKokkos<DeviceType>::solve_reactions(const int vflag, const bool isPreF
         userData.kFor = new double[nreactions];
         userData.rxnRateLaw = new double[nreactions];
 
+        UserRHSDataKokkos<1> userDataKokkos;
+        userDataKokkos.kFor.m_data = userData.kFor;
+        userDataKokkos.rxnRateLaw.m_data = userData.rxnRateLaw;
+
         CounterType counter_i;
 
         const double theta = (localTempFlag) ? d_dpdThetaLocal(i) : d_dpdTheta(i);
@@ -935,7 +1452,8 @@ void FixRxKokkos<DeviceType>::solve_reactions(const int vflag, const bool isPreF
         // Solver the ODE system.
         if (odeIntegrationFlag == ODE_LAMMPS_RK4)
         {
-          rk4(t_stop, y, rwork, &userData);
+          //rk4(t_stop, y, rwork, &userData);
+          k_rk4(t_stop, y, rwork, userDataKokkos);
         }
         else if (odeIntegrationFlag == ODE_LAMMPS_RKF45)
         {
diff --git a/src/KOKKOS/fix_rx_kokkos.h b/src/KOKKOS/fix_rx_kokkos.h
index 4a11ac9fb9..e36d606525 100644
--- a/src/KOKKOS/fix_rx_kokkos.h
+++ b/src/KOKKOS/fix_rx_kokkos.h
@@ -76,12 +76,43 @@ class FixRxKokkos : public FixRX {
   PairDPDfdtEnergyKokkos<DeviceType>* pairDPDEKK;
   double VDPD;
 
+  template <typename T, int stride = 1>
+  struct StridedArrayType // wraps a raw pointer it neither allocates nor frees: index idx maps to m_data[Stride*idx]
+  {
+    typedef T value_type; // element type
+    enum { Stride = stride }; // distance, in elements of m_data, between consecutive indices
+
+    value_type *m_data;
+
+    StridedArrayType() : m_data(NULL) {}
+    StridedArrayType(value_type *ptr) : m_data(ptr) {}
+
+    inline       value_type& operator()(const int idx)       { return m_data[Stride*idx]; }
+    inline const value_type& operator()(const int idx) const { return m_data[Stride*idx]; }
+    inline       value_type& operator[](const int idx)       { return m_data[Stride*idx]; }
+    inline const value_type& operator[](const int idx) const { return m_data[Stride*idx]; }
+  };
+
+  template <int stride = 1>
+  struct UserRHSDataKokkos
+  {
+    StridedArrayType<double,stride> kFor;       // per-reaction coefficients, read as kFor[j] by the k_rhs_* routines
+    StridedArrayType<double,stride> rxnRateLaw; // per-reaction scratch, written as rxnRateLaw[j] by the k_rhs_* routines
+  };
+
   void solve_reactions(const int vflag, const bool isPreForce = true);
 
-  int rhs(double, const double *, double *, void *) const;
+  int rhs       (double, const double *, double *, void *) const;
   int rhs_dense (double, const double *, double *, void *) const;
   int rhs_sparse(double, const double *, double *, void *) const;
 
+  template <typename VectorType, typename UserDataType>
+  int k_rhs       (double, const VectorType&, VectorType&, UserDataType& ) const;
+  template <typename VectorType, typename UserDataType>
+  int k_rhs_dense (double, const VectorType&, VectorType&, UserDataType& ) const;
+  template <typename VectorType, typename UserDataType>
+  int k_rhs_sparse(double, const VectorType&, VectorType&, UserDataType& ) const;
+
   //!< Classic Runge-Kutta 4th-order stepper.
   void rk4(const double t_stop, double *y, double *rwork, void *v_params) const;
 
@@ -97,6 +128,25 @@ class FixRxKokkos : public FixRX {
                      const double hmin, const double hmax,
                      double& h0, double y[], double rwk[], void *v_params) const;
 
+  //!< Classic Runge-Kutta 4th-order stepper.
+  template <typename UserDataType>
+  void k_rk4(const double t_stop, double *y, double *rwork, UserDataType& userData) const;
+
+  //!< Runge-Kutta-Fehlberg ODE Solver.
+  template <typename UserDataType>
+  void k_rkf45(const int neq, const double t_stop, double *y, double *rwork, UserDataType& userData, CounterType& counter) const;
+
+  //!< Runge-Kutta-Fehlberg ODE stepper function.
+  template <typename UserDataType>
+  void k_rkf45_step (const int neq, const double h, double y[], double y_out[],
+                     double rwk[], UserDataType& userData) const;
+
+  //!< Initial step size estimation for the Runge-Kutta-Fehlberg ODE solver.
+  template <typename UserDataType>
+  int k_rkf45_h0 (const int neq, const double t, const double t_stop,
+                  const double hmin, const double hmax,
+                  double& h0, double y[], double rwk[], UserDataType& userData) const;
+
   //!< ODE Solver diagnostics.
   void odeDiagnostics(void);
 

From 2f32c1a9af6f8a8bb39c69f051552263dc313572 Mon Sep 17 00:00:00 2001
From: Christopher Stone <chris.stone@computational-science.com>
Date: Sun, 12 Feb 2017 22:48:02 -0500
Subject: [PATCH 119/267] Switched to using Kokkos device data for ODE scratch
 data.

- Finished porting all scratch arrays to use the StridedArrayType
  template.
- Created a single, large Kokkos device array and now use it for all
  scratch data passed into the StridedArrayType objects.
---
 src/KOKKOS/fix_rx_kokkos.cpp | 101 ++++++++++++++++++++---------------
 src/KOKKOS/fix_rx_kokkos.h   |  18 +++----
 2 files changed, 67 insertions(+), 52 deletions(-)

diff --git a/src/KOKKOS/fix_rx_kokkos.cpp b/src/KOKKOS/fix_rx_kokkos.cpp
index a6da0306bb..09a122a108 100644
--- a/src/KOKKOS/fix_rx_kokkos.cpp
+++ b/src/KOKKOS/fix_rx_kokkos.cpp
@@ -203,14 +203,14 @@ void FixRxKokkos<DeviceType>::rk4(const double t_stop, double *y, double *rwork,
 /* ---------------------------------------------------------------------- */
 
 template <typename DeviceType>
-  template <typename UserDataType>
-void FixRxKokkos<DeviceType>::k_rk4(const double t_stop, double *y, double *rwork, UserDataType& userData) const
+  template <typename VectorType, typename UserDataType>
+void FixRxKokkos<DeviceType>::k_rk4(const double t_stop, VectorType& y, VectorType& rwork, UserDataType& userData) const
 {
-  double *k1 = rwork;
-  double *k2 = k1 + nspecies;
-  double *k3 = k2 + nspecies;
-  double *k4 = k3 + nspecies;
-  double *yp = k4 + nspecies;
+  VectorType k1( rwork );
+  VectorType k2( &k1[nspecies] );
+  VectorType k3( &k2[nspecies] );
+  VectorType k4( &k3[nspecies] );
+  VectorType yp( &k4[nspecies] );
 
   const int numSteps = minSteps;
 
@@ -262,8 +262,8 @@ void FixRxKokkos<DeviceType>::k_rk4(const double t_stop, double *y, double *rwor
 //        x  = x + a1*f1 + a3*f3 + a4*f4 + a5*f5
 
 template <typename DeviceType>
-  template <typename UserDataType>
-void FixRxKokkos<DeviceType>::k_rkf45_step (const int neq, const double h, double y[], double y_out[], double rwk[], UserDataType& userData) const
+  template <typename VectorType, typename UserDataType>
+void FixRxKokkos<DeviceType>::k_rkf45_step (const int neq, const double h, VectorType& y, VectorType& y_out, VectorType& rwk, UserDataType& userData) const
 {
    const double c21=0.25;
    const double c31=0.09375;
@@ -291,16 +291,15 @@ void FixRxKokkos<DeviceType>::k_rkf45_step (const int neq, const double h, doubl
    const double b6=0.036363636363636;
 
    // local dependent variables (5 total)
-   double* f1 = &rwk[    0];
-   double* f2 = &rwk[  neq];
-   double* f3 = &rwk[2*neq];
-   double* f4 = &rwk[3*neq];
-   double* f5 = &rwk[4*neq];
-   double* f6 = &rwk[5*neq];
+   VectorType& f1 = rwk;
+   VectorType  f2( &rwk[  neq] );
+   VectorType  f3( &rwk[2*neq] );
+   VectorType  f4( &rwk[3*neq] );
+   VectorType  f5( &rwk[4*neq] );
+   VectorType  f6( &rwk[5*neq] );
 
    // scratch for the intermediate solution.
-   //double* ytmp = &rwk[6*neq];
-   double* ytmp = y_out;
+   VectorType& ytmp = y_out;
 
    // 1)
    k_rhs (0.0, y, f1, userData);
@@ -368,11 +367,11 @@ void FixRxKokkos<DeviceType>::k_rkf45_step (const int neq, const double h, doubl
 }
 
 template <typename DeviceType>
-  template <typename UserDataType>
+  template <typename VectorType, typename UserDataType>
 int FixRxKokkos<DeviceType>::k_rkf45_h0
                     (const int neq, const double t, const double t_stop,
                      const double hmin, const double hmax,
-                     double& h0, double y[], double rwk[], UserDataType& userData) const
+                     double& h0, VectorType& y, VectorType& rwk, UserDataType& userData) const
 {
    // Set lower and upper bounds on h0, and take geometric mean as first trial value.
    // Exit with this value if the bounds cross each other.
@@ -388,9 +387,9 @@ int FixRxKokkos<DeviceType>::k_rkf45_h0
 
    // Start iteration to find solution to ... {WRMS norm of (h0^2 y'' / 2)} = 1
 
-   double *ydot  = rwk;
-   double *y1    = ydot + neq;
-   double *ydot1 = y1 + neq;
+   VectorType& ydot  = rwk;
+   VectorType  y1    ( &ydot[  neq] );
+   VectorType  ydot1 ( &ydot[2*neq] );
 
    const int max_iters = 10;
    bool hnew_is_ok = false;
@@ -463,8 +462,8 @@ int FixRxKokkos<DeviceType>::k_rkf45_h0
 }
 
 template <typename DeviceType>
-  template <typename UserDataType>
-void FixRxKokkos<DeviceType>::k_rkf45(const int neq, const double t_stop, double *y, double *rwork, UserDataType& userData, CounterType& counter) const
+  template <typename VectorType, typename UserDataType>
+void FixRxKokkos<DeviceType>::k_rkf45(const int neq, const double t_stop, VectorType& y, VectorType& rwork, UserDataType& userData, CounterType& counter) const
 {
   // Rounding coefficient.
   const double uround = DBL_EPSILON;
@@ -501,9 +500,10 @@ void FixRxKokkos<DeviceType>::k_rkf45(const int neq, const double t_stop, double
   //printf("t= %e t_stop= %e h= %e\n", t, t_stop, h);
 
   // Integrate until we reach the end time.
-  while (fabs(t - t_stop) > tround){
-    double *yout = rwork;
-    double *eout = yout + neq;
+  while (fabs(t - t_stop) > tround)
+  {
+    VectorType& yout = rwork;
+    VectorType  eout ( &yout[neq] );
 
     // Take a trial step.
     k_rkf45_step (neq, h, y, yout, eout, userData);
@@ -1035,8 +1035,6 @@ template <typename DeviceType>
   template <typename VectorType, typename UserDataType>
 int FixRxKokkos<DeviceType>::k_rhs(double t, const VectorType& y, VectorType& dydt, UserDataType& userData) const
 {
-  //StridedArrayType<double,1> _y( const_cast<double *>( y ) ), _dydt( dydt );
-
   // Use the sparse format instead.
   if (useSparseKinetics)
     return this->k_rhs_sparse( t, y, dydt, userData);
@@ -1409,20 +1407,36 @@ void FixRxKokkos<DeviceType>::solve_reactions(const int vflag, const bool isPreF
        );
   }
 
+  // Create scratch array space.
+  const size_t scratchSpaceSize = (8*nspecies + 2*nreactions);
+  //double *scratchSpace = new double[ scratchSpaceSize * nlocal ];
+
+  typename ArrayTypes<DeviceType>::t_double_1d d_scratchSpace("d_scratchSpace", scratchSpaceSize * nlocal);
+
   Kokkos::parallel_reduce( nlocal, LAMMPS_LAMBDA(int i, CounterType &counter)
     {
       if (d_mask(i) & groupbit)
       {
-        double *y = new double[8*nspecies];
-        double *rwork = y + nspecies;
+        //double *y = new double[8*nspecies];
+        //double *rwork = y + nspecies;
 
-        UserRHSData userData;
-        userData.kFor = new double[nreactions];
-        userData.rxnRateLaw = new double[nreactions];
+        //StridedArrayType<double,1> _y( y );
+        //StridedArrayType<double,1> _rwork( rwork );
 
-        UserRHSDataKokkos<1> userDataKokkos;
-        userDataKokkos.kFor.m_data = userData.kFor;
-        userDataKokkos.rxnRateLaw.m_data = userData.rxnRateLaw;
+        StridedArrayType<double,1> y( d_scratchSpace.ptr_on_device() + scratchSpaceSize * i );
+        StridedArrayType<double,1> rwork( &y[nspecies] );
+
+        //UserRHSData userData;
+        //userData.kFor = new double[nreactions];
+        //userData.rxnRateLaw = new double[nreactions];
+
+        //UserRHSDataKokkos<1> userDataKokkos;
+        //userDataKokkos.kFor.m_data = userData.kFor;
+        //userDataKokkos.rxnRateLaw.m_data = userData.rxnRateLaw;
+
+        UserRHSDataKokkos<1> userData;
+        userData.kFor.m_data = &( rwork[7*nspecies] );
+        userData.rxnRateLaw.m_data = &( userData.kFor[ nreactions ] );
 
         CounterType counter_i;
 
@@ -1452,12 +1466,11 @@ void FixRxKokkos<DeviceType>::solve_reactions(const int vflag, const bool isPreF
         // Solver the ODE system.
         if (odeIntegrationFlag == ODE_LAMMPS_RK4)
         {
-          //rk4(t_stop, y, rwork, &userData);
-          k_rk4(t_stop, y, rwork, userDataKokkos);
+          k_rk4(t_stop, y, rwork, userData);
         }
         else if (odeIntegrationFlag == ODE_LAMMPS_RKF45)
         {
-          rkf45(nspecies, t_stop, y, rwork, &userData, counter_i);
+          k_rkf45(nspecies, t_stop, y, rwork, userData, counter_i);
 
           if (diagnosticFrequency == 1)
           {
@@ -1477,9 +1490,9 @@ void FixRxKokkos<DeviceType>::solve_reactions(const int vflag, const bool isPreF
           d_dvector(ispecies,i) = y[ispecies];
         }
 
-        delete [] y;
-        delete [] userData.kFor;
-        delete [] userData.rxnRateLaw;
+        //delete [] y;
+        //delete [] userData.kFor;
+        //delete [] userData.rxnRateLaw;
 
         // Update the iteration statistics counter. Is this unique for each iteration?
         counter += counter_i;
@@ -1490,6 +1503,8 @@ void FixRxKokkos<DeviceType>::solve_reactions(const int vflag, const bool isPreF
     , TotalCounters // reduction value for all iterations.
   );
 
+  //delete [] scratchSpace;
+
   TimerType timer_ODE = getTimeStamp();
 
   // Signal that dvector has been modified on this execution space.
diff --git a/src/KOKKOS/fix_rx_kokkos.h b/src/KOKKOS/fix_rx_kokkos.h
index e36d606525..9ac944c6a5 100644
--- a/src/KOKKOS/fix_rx_kokkos.h
+++ b/src/KOKKOS/fix_rx_kokkos.h
@@ -129,23 +129,23 @@ class FixRxKokkos : public FixRX {
                      double& h0, double y[], double rwk[], void *v_params) const;
 
   //!< Classic Runge-Kutta 4th-order stepper.
-  template <typename UserDataType>
-  void k_rk4(const double t_stop, double *y, double *rwork, UserDataType& userData) const;
+  template <typename VectorType, typename UserDataType>
+  void k_rk4(const double t_stop, VectorType& y, VectorType& rwork, UserDataType& userData) const;
 
   //!< Runge-Kutta-Fehlberg ODE Solver.
-  template <typename UserDataType>
-  void k_rkf45(const int neq, const double t_stop, double *y, double *rwork, UserDataType& userData, CounterType& counter) const;
+  template <typename VectorType, typename UserDataType>
+  void k_rkf45(const int neq, const double t_stop, VectorType& y, VectorType& rwork, UserDataType& userData, CounterType& counter) const;
 
   //!< Runge-Kutta-Fehlberg ODE stepper function.
-  template <typename UserDataType>
-  void k_rkf45_step (const int neq, const double h, double y[], double y_out[],
-                     double rwk[], UserDataType& userData) const;
+  template <typename VectorType, typename UserDataType>
+  void k_rkf45_step (const int neq, const double h, VectorType& y, VectorType& y_out,
+                     VectorType& rwk, UserDataType& userData) const;
 
   //!< Initial step size estimation for the Runge-Kutta-Fehlberg ODE solver.
-  template <typename UserDataType>
+  template <typename VectorType, typename UserDataType>
   int k_rkf45_h0 (const int neq, const double t, const double t_stop,
                   const double hmin, const double hmax,
-                  double& h0, double y[], double rwk[], UserDataType& userData) const;
+                  double& h0, VectorType& y, VectorType& rwk, UserDataType& userData) const;
 
   //!< ODE Solver diagnostics.
   void odeDiagnostics(void);

From 4e9c8f496235016a5277a43e22f7bca5b85b4f10 Mon Sep 17 00:00:00 2001
From: Christopher Stone <chris.stone@computational-science.com>
Date: Mon, 13 Feb 2017 10:48:30 -0500
Subject: [PATCH 120/267] Update FixRXKokkos for Cuda build. Added inline and
 other KOKKOS macros.

- Updated the function prototypes to include the necessary KOKKOS
macros for __host__ and __device__ functions and inlined functions.
- Changed several View definitions to match the disjoint memory spaces
that only come up with Cuda builds.
---
 src/KOKKOS/fix_rx_kokkos.cpp | 31 +++++++++++++++++++++++++------
 src/KOKKOS/fix_rx_kokkos.h   | 35 +++++++++++++++++++++++++----------
 2 files changed, 50 insertions(+), 16 deletions(-)

diff --git a/src/KOKKOS/fix_rx_kokkos.cpp b/src/KOKKOS/fix_rx_kokkos.cpp
index 09a122a108..71897157f3 100644
--- a/src/KOKKOS/fix_rx_kokkos.cpp
+++ b/src/KOKKOS/fix_rx_kokkos.cpp
@@ -25,13 +25,13 @@
 #include "neigh_list_kokkos.h"
 #include "neigh_request.h"
 #include "error.h"
-#include "math_special.h"
+#include "math_special_kokkos.h"
 
 #include <float.h> // DBL_EPSILON
 
 using namespace LAMMPS_NS;
 using namespace FixConst;
-using namespace MathSpecial;
+using namespace MathSpecialKokkos;
 
 #ifdef DBL_EPSILON
   #define MY_EPSILON (10.0*DBL_EPSILON)
@@ -425,8 +425,8 @@ int FixRxKokkos<DeviceType>::k_rkf45_h0
       // should we accept this?
       if (hnew_is_ok || iter == max_iters){
          hnew = hg;
-         if (iter == max_iters)
-            fprintf(stderr, "ERROR_HIN_MAX_ITERS\n");
+         //if (iter == max_iters)
+         //   fprintf(stderr, "ERROR_HIN_MAX_ITERS\n");
          break;
       }
 
@@ -1407,6 +1407,14 @@ void FixRxKokkos<DeviceType>::solve_reactions(const int vflag, const bool isPreF
        );
   }
 
+  // Error flag for any failures.
+  DAT::tdual_int_scalar k_error_flag("pair:error_flag");
+
+  // Initialize and sync the device flag.
+  k_error_flag.h_view() = 0;
+  k_error_flag.template modify<LMPHostType>();
+  k_error_flag.template sync<DeviceType>();
+
   // Create scratch array space.
   const size_t scratchSpaceSize = (8*nspecies + 2*nreactions);
   //double *scratchSpace = new double[ scratchSpaceSize * nlocal ];
@@ -1483,7 +1491,11 @@ void FixRxKokkos<DeviceType>::solve_reactions(const int vflag, const bool isPreF
         for (int ispecies = 0; ispecies < nspecies; ispecies++)
         {
           if (y[ispecies] < -MY_EPSILON)
-            error->one(FLERR,"Computed concentration in RK solver is < -10*DBL_EPSILON");
+          {
+            //error->one(FLERR,"Computed concentration in RK solver is < -10*DBL_EPSILON");
+            k_error_flag.d_view() = 2;
+            // This should be an atomic update.
+          }
           else if (y[ispecies] < MY_EPSILON)
             y[ispecies] = 0.0;
 
@@ -1507,6 +1519,12 @@ void FixRxKokkos<DeviceType>::solve_reactions(const int vflag, const bool isPreF
 
   TimerType timer_ODE = getTimeStamp();
 
+  // Check the error flag for any failures.
+  k_error_flag.template modify<DeviceType>();
+  k_error_flag.template sync<LMPHostType>();
+  if (k_error_flag.h_view() == 2)
+    error->one(FLERR,"Computed concentration in RK solver is < -10*DBL_EPSILON");
+
   // Signal that dvector has been modified on this execution space.
   atomKK->modified( execution_space, DVECTOR_MASK );
 
@@ -1815,7 +1833,8 @@ void FixRxKokkos<DeviceType>::computeLocalTemperature()
         {
           // Create an atomic view of sumWeights and dpdThetaLocal. Only needed
           // for Half/thread scenarios.
-          typedef Kokkos::View< E_FLOAT*, typename DAT::t_efloat_1d::array_layout, DeviceType, Kokkos::MemoryTraits< AtomicF< NEIGHFLAG >::value> > AtomicViewType;
+          //typedef Kokkos::View< E_FLOAT*, typename DAT::t_efloat_1d::array_layout, DeviceType, Kokkos::MemoryTraits< AtomicF< NEIGHFLAG >::value> > AtomicViewType;
+          typedef Kokkos::View< E_FLOAT*, typename DAT::t_efloat_1d::array_layout, typename DAT::t_efloat_1d::device_type, Kokkos::MemoryTraits< AtomicF< NEIGHFLAG >::value> > AtomicViewType;
 
           AtomicViewType a_dpdThetaLocal = d_dpdThetaLocal;
           AtomicViewType a_sumWeights    = d_sumWeights;
diff --git a/src/KOKKOS/fix_rx_kokkos.h b/src/KOKKOS/fix_rx_kokkos.h
index 9ac944c6a5..c18ce6f151 100644
--- a/src/KOKKOS/fix_rx_kokkos.h
+++ b/src/KOKKOS/fix_rx_kokkos.h
@@ -49,6 +49,7 @@ class FixRxKokkos : public FixRX {
   {
     int nSteps, nIters, nFuncs, nFails;
 
+    KOKKOS_INLINE_FUNCTION
     CounterType() : nSteps(0), nIters(0), nFuncs(0), nFails(0) {};
 
     KOKKOS_INLINE_FUNCTION
@@ -72,7 +73,7 @@ class FixRxKokkos : public FixRX {
     }
   };
 
- protected:
+ //protected:
   PairDPDfdtEnergyKokkos<DeviceType>* pairDPDEKK;
   double VDPD;
 
@@ -84,13 +85,15 @@ class FixRxKokkos : public FixRX {
 
     value_type *m_data;
 
+    KOKKOS_INLINE_FUNCTION
     StridedArrayType() : m_data(NULL) {}
+    KOKKOS_INLINE_FUNCTION
     StridedArrayType(value_type *ptr) : m_data(ptr) {}
 
-    inline       value_type& operator()(const int idx)       { return m_data[Stride*idx]; }
-    inline const value_type& operator()(const int idx) const { return m_data[Stride*idx]; }
-    inline       value_type& operator[](const int idx)       { return m_data[Stride*idx]; }
-    inline const value_type& operator[](const int idx) const { return m_data[Stride*idx]; }
+    KOKKOS_INLINE_FUNCTION       value_type& operator()(const int idx)       { return m_data[Stride*idx]; }
+    KOKKOS_INLINE_FUNCTION const value_type& operator()(const int idx) const { return m_data[Stride*idx]; }
+    KOKKOS_INLINE_FUNCTION       value_type& operator[](const int idx)       { return m_data[Stride*idx]; }
+    KOKKOS_INLINE_FUNCTION const value_type& operator[](const int idx) const { return m_data[Stride*idx]; }
   };
 
   template <int stride = 1>
@@ -100,17 +103,22 @@ class FixRxKokkos : public FixRX {
     StridedArrayType<double,1> rxnRateLaw;
   };
 
-  void solve_reactions(const int vflag, const bool isPreForce = true);
+  void solve_reactions(const int vflag, const bool isPreForce);
 
   int rhs       (double, const double *, double *, void *) const;
   int rhs_dense (double, const double *, double *, void *) const;
   int rhs_sparse(double, const double *, double *, void *) const;
 
   template <typename VectorType, typename UserDataType>
+    KOKKOS_INLINE_FUNCTION
   int k_rhs       (double, const VectorType&, VectorType&, UserDataType& ) const;
+
   template <typename VectorType, typename UserDataType>
+    KOKKOS_INLINE_FUNCTION
   int k_rhs_dense (double, const VectorType&, VectorType&, UserDataType& ) const;
+
   template <typename VectorType, typename UserDataType>
+    KOKKOS_INLINE_FUNCTION
   int k_rhs_sparse(double, const VectorType&, VectorType&, UserDataType& ) const;
 
   //!< Classic Runge-Kutta 4th-order stepper.
@@ -130,19 +138,23 @@ class FixRxKokkos : public FixRX {
 
   //!< Classic Runge-Kutta 4th-order stepper.
   template <typename VectorType, typename UserDataType>
+    KOKKOS_INLINE_FUNCTION
   void k_rk4(const double t_stop, VectorType& y, VectorType& rwork, UserDataType& userData) const;
 
   //!< Runge-Kutta-Fehlberg ODE Solver.
   template <typename VectorType, typename UserDataType>
+    KOKKOS_INLINE_FUNCTION
   void k_rkf45(const int neq, const double t_stop, VectorType& y, VectorType& rwork, UserDataType& userData, CounterType& counter) const;
 
   //!< Runge-Kutta-Fehlberg ODE stepper function.
   template <typename VectorType, typename UserDataType>
+    KOKKOS_INLINE_FUNCTION
   void k_rkf45_step (const int neq, const double h, VectorType& y, VectorType& y_out,
                      VectorType& rwk, UserDataType& userData) const;
 
   //!< Initial step size estimation for the Runge-Kutta-Fehlberg ODE solver.
   template <typename VectorType, typename UserDataType>
+    KOKKOS_INLINE_FUNCTION
   int k_rkf45_h0 (const int neq, const double t, const double t_stop,
                   const double hmin, const double hmax,
                   double& h0, VectorType& y, VectorType& rwk, UserDataType& userData) const;
@@ -155,8 +167,10 @@ class FixRxKokkos : public FixRX {
   int *diagnosticCounterPerODEnFuncs;
   DAT::tdual_int_1d k_diagnosticCounterPerODEnSteps;
   DAT::tdual_int_1d k_diagnosticCounterPerODEnFuncs;
-  typename ArrayTypes<DeviceType>::t_int_1d d_diagnosticCounterPerODEnSteps;
-  typename ArrayTypes<DeviceType>::t_int_1d d_diagnosticCounterPerODEnFuncs;
+  //typename ArrayTypes<DeviceType>::t_int_1d d_diagnosticCounterPerODEnSteps;
+  //typename ArrayTypes<DeviceType>::t_int_1d d_diagnosticCounterPerODEnFuncs;
+  typename DAT::t_int_1d d_diagnosticCounterPerODEnSteps;
+  typename DAT::t_int_1d d_diagnosticCounterPerODEnFuncs;
   typename HAT::t_int_1d h_diagnosticCounterPerODEnSteps;
   typename HAT::t_int_1d h_diagnosticCounterPerODEnFuncs;
 
@@ -185,7 +199,8 @@ class FixRxKokkos : public FixRX {
 
   // Need a dual-view and device-view for dpdThetaLocal and sumWeights since they're used in several callbacks.
   DAT::tdual_efloat_1d k_dpdThetaLocal, k_sumWeights;
-  typename ArrayTypes<DeviceType>::t_efloat_1d d_dpdThetaLocal, d_sumWeights;
+  //typename ArrayTypes<DeviceType>::t_efloat_1d d_dpdThetaLocal, d_sumWeights;
+  typename DAT::t_efloat_1d d_dpdThetaLocal, d_sumWeights;
   typename HAT::t_efloat_1d h_dpdThetaLocal, h_sumWeights;
 
   template <int WT_FLAG, int LOCAL_TEMP_FLAG, bool NEWTON_PAIR, int NEIGHFLAG>
@@ -196,7 +211,7 @@ class FixRxKokkos : public FixRX {
   int pack_forward_comm(int , int *, double *, int, int *);
   void unpack_forward_comm(int , int , double *);
 
- private: // replicate a few from FixRX
+ //private: // replicate a few from FixRX
   int my_restartFlag;
 };
 

From 799d55e0971331c6b54527b38ec991b2f1a08212 Mon Sep 17 00:00:00 2001
From: Christopher Stone <chris.stone@computational-science.com>
Date: Mon, 13 Feb 2017 14:24:51 -0500
Subject: [PATCH 121/267] Switched to operator()'s and Tag's for the Kokkos
 launch objects.

- Switched from using lambda functions to operator()'s with type tags
  in FixRxKokkos. The lambdas were giving big problems in Cuda with
  the memory objects. This required that all referenced views be members
  of the FixRxKokkos class.
- Added copymode controls to solve_reactions() to avoid the destructor
  freeing pointers carried forward from the copy constructor. Added
  the same check to the FixRX destructor since it's called, too.
---
 src/KOKKOS/fix_rx_kokkos.cpp | 316 ++++++++++++++++++++++++++++++-----
 src/KOKKOS/fix_rx_kokkos.h   | 109 +++++++++---
 src/USER-DPD/fix_rx.cpp      |   3 +
 3 files changed, 361 insertions(+), 67 deletions(-)

diff --git a/src/KOKKOS/fix_rx_kokkos.cpp b/src/KOKKOS/fix_rx_kokkos.cpp
index 71897157f3..77e948be35 100644
--- a/src/KOKKOS/fix_rx_kokkos.cpp
+++ b/src/KOKKOS/fix_rx_kokkos.cpp
@@ -69,13 +69,16 @@ FixRxKokkos<DeviceType>::FixRxKokkos(LAMMPS *lmp, int narg, char **arg) :
   datamask_read = EMPTY_MASK;
   datamask_modify = EMPTY_MASK;
 
+  k_error_flag = DAT::tdual_int_scalar("FixRxKokkos::k_error_flag");
+
   printf("Inside FixRxKokkos::FixRxKokkos\n");
 }
 
 template <typename DeviceType>
 FixRxKokkos<DeviceType>::~FixRxKokkos()
 {
-  printf("Inside FixRxKokkos::~FixRxKokkos\n");
+  printf("Inside FixRxKokkos::~FixRxKokkos copymode= %d\n", copymode);
+  if (copymode) return;
 }
 
 /* ---------------------------------------------------------------------- */
@@ -1315,6 +1318,95 @@ void FixRxKokkos<DeviceType>::pre_force(int vflag)
 
   this->solve_reactions( vflag, true );
 }
+
+/* ---------------------------------------------------------------------- */
+
+template <typename DeviceType>
+  KOKKOS_INLINE_FUNCTION
+void FixRxKokkos<DeviceType>::operator()(Tag_FixRxKokkos_zeroCounterViews, const int& i) const
+{
+  d_diagnosticCounterPerODEnSteps(i) = 0;
+  d_diagnosticCounterPerODEnFuncs(i) = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <typename DeviceType>
+  template <bool ZERO_RATES>
+  KOKKOS_INLINE_FUNCTION
+void FixRxKokkos<DeviceType>::operator()(Tag_FixRxKokkos_solveSystems<ZERO_RATES>, const int& i, CounterType& counter) const
+{
+  if (d_mask(i) & groupbit)
+  {
+    StridedArrayType<double,1> y( d_scratchSpace.ptr_on_device() + scratchSpaceSize * i );
+    StridedArrayType<double,1> rwork( &y[nspecies] );
+
+    UserRHSDataKokkos<1> userData;
+    userData.kFor.m_data = &( rwork[7*nspecies] );
+    userData.rxnRateLaw.m_data = &( userData.kFor[ nreactions ] );
+
+    CounterType counter_i;
+
+    const double theta = (localTempFlag) ? d_dpdThetaLocal(i) : d_dpdTheta(i);
+
+    //Compute the reaction rate constants
+    for (int irxn = 0; irxn < nreactions; irxn++)
+    {
+      if (ZERO_RATES)
+        userData.kFor[irxn] = 0.0;
+      else
+      {
+        userData.kFor[irxn] = d_kineticsData.Arr(irxn) *
+                               pow(theta, d_kineticsData.nArr(irxn)) *
+                               exp(-d_kineticsData.Ea(irxn) / boltz / theta);
+      }
+    }
+
+    // Update ConcOld and initialize the ODE solution vector y[].
+    for (int ispecies = 0; ispecies < nspecies; ispecies++)
+    {
+      const double tmp = d_dvector(ispecies, i);
+      d_dvector(ispecies+nspecies, i) = tmp;
+      y[ispecies] = tmp;
+    }
+
+    // Solver the ODE system.
+    if (odeIntegrationFlag == ODE_LAMMPS_RK4)
+    {
+      k_rk4(t_stop, y, rwork, userData);
+    }
+    else if (odeIntegrationFlag == ODE_LAMMPS_RKF45)
+    {
+      k_rkf45(nspecies, t_stop, y, rwork, userData, counter_i);
+
+      if (diagnosticFrequency == 1)
+      {
+        d_diagnosticCounterPerODEnSteps(i) = counter_i.nSteps;
+        d_diagnosticCounterPerODEnFuncs(i) = counter_i.nFuncs;
+      }
+    }
+
+    // Store the solution back in dvector.
+    for (int ispecies = 0; ispecies < nspecies; ispecies++)
+    {
+      if (y[ispecies] < -MY_EPSILON)
+      {
+        //error->one(FLERR,"Computed concentration in RK solver is < -10*DBL_EPSILON");
+        k_error_flag.d_view() = 2;
+        // This should be an atomic update.
+      }
+      else if (y[ispecies] < MY_EPSILON)
+        y[ispecies] = 0.0;
+
+      d_dvector(ispecies,i) = y[ispecies];
+    }
+
+    // Update the iteration statistics counter. Is this unique for each iteration?
+    counter += counter_i;
+
+  } // if
+}
+
 /* ---------------------------------------------------------------------- */
 
 template <typename DeviceType>
@@ -1322,12 +1414,15 @@ void FixRxKokkos<DeviceType>::solve_reactions(const int vflag, const bool isPreF
 {
   printf("Inside FixRxKokkos<DeviceType>::solve_reactions localTempFlag= %d isPreForce= %s\n", localTempFlag, isPreForce ? "True" : "false");
 
+  copymode = 1;
+
   if (update_kinetics_data)
     create_kinetics_data();
 
   TimerType timer_start = getTimeStamp();
 
-  const int nlocal = atom->nlocal;
+  //const int nlocal = atom->nlocal;
+  this->nlocal = atom->nlocal;
   const int nghost = atom->nghost;
   const int newton_pair = force->newton_pair;
 
@@ -1339,8 +1434,8 @@ void FixRxKokkos<DeviceType>::solve_reactions(const int vflag, const bool isPreF
     const int count = nlocal + (newton_pair ? nghost : 0);
 
     memory->create_kokkos (k_dpdThetaLocal, dpdThetaLocal, count, "FixRxKokkos::dpdThetaLocal");
-    d_dpdThetaLocal = k_dpdThetaLocal.d_view;
-    h_dpdThetaLocal = k_dpdThetaLocal.h_view;
+    this->d_dpdThetaLocal = k_dpdThetaLocal.d_view;
+    this->h_dpdThetaLocal = k_dpdThetaLocal.h_view;
 
     const int neighflag = lmp->kokkos->neighflag;
 
@@ -1376,16 +1471,21 @@ void FixRxKokkos<DeviceType>::solve_reactions(const int vflag, const bool isPreF
   // ...
 
   // Local references to the atomKK objects.
-  typename ArrayTypes<DeviceType>::t_efloat_1d d_dpdTheta = atomKK->k_dpdTheta.view<DeviceType>(); 
-  typename ArrayTypes<DeviceType>::t_float_2d  d_dvector  = atomKK->k_dvector.view<DeviceType>();
-  typename ArrayTypes<DeviceType>::t_int_1d    d_mask     = atomKK->k_mask.view<DeviceType>();
+  //typename ArrayTypes<DeviceType>::t_efloat_1d d_dpdTheta = atomKK->k_dpdTheta.view<DeviceType>(); 
+  //typename ArrayTypes<DeviceType>::t_float_2d  d_dvector  = atomKK->k_dvector.view<DeviceType>();
+  //typename ArrayTypes<DeviceType>::t_int_1d    d_mask     = atomKK->k_mask.view<DeviceType>();
+  this->d_dpdTheta = atomKK->k_dpdTheta.view<DeviceType>(); 
+  this->d_dvector  = atomKK->k_dvector.view<DeviceType>();
+  this->d_mask     = atomKK->k_mask.view<DeviceType>();
 
   // Get up-to-date data.
   atomKK->sync( execution_space, MASK_MASK | DVECTOR_MASK | DPDTHETA_MASK );
 
   // Set some constants outside of the parallel_for
-  const double boltz = force->boltz;
-  const double t_stop = update->dt; // DPD time-step and integration length.
+  //const double boltz = force->boltz;
+  //const double t_stop = update->dt; // DPD time-step and integration length.
+  this->boltz = force->boltz;
+  this->t_stop = update->dt; // DPD time-step and integration length.
 
   // Average DPD volume. Used in the RHS function.
   this->VDPD = domain->xprd * domain->yprd * domain->zprd / atom->natoms;
@@ -1398,17 +1498,18 @@ void FixRxKokkos<DeviceType>::solve_reactions(const int vflag, const bool isPreF
     d_diagnosticCounterPerODEnSteps = k_diagnosticCounterPerODEnSteps.d_view;
     d_diagnosticCounterPerODEnFuncs = k_diagnosticCounterPerODEnFuncs.d_view;
 
-    Kokkos::parallel_for ( nlocal,
-          LAMMPS_LAMBDA(const int i)
-          {
-             d_diagnosticCounterPerODEnSteps(i) = 0;
-             d_diagnosticCounterPerODEnFuncs(i) = 0;
-          }
-       );
+    Kokkos::parallel_for ( Kokkos::RangePolicy<DeviceType, Tag_FixRxKokkos_zeroCounterViews>(0,nlocal), *this);
+    //Kokkos::parallel_for ( nlocal,
+    //      LAMMPS_LAMBDA(const int i)
+    //      {
+    //         d_diagnosticCounterPerODEnSteps(i) = 0;
+    //         d_diagnosticCounterPerODEnFuncs(i) = 0;
+    //      }
+    //   );
   }
 
   // Error flag for any failures.
-  DAT::tdual_int_scalar k_error_flag("pair:error_flag");
+  //DAT::tdual_int_scalar k_error_flag("pair:error_flag");
 
   // Initialize and sync the device flag.
   k_error_flag.h_view() = 0;
@@ -1416,11 +1517,14 @@ void FixRxKokkos<DeviceType>::solve_reactions(const int vflag, const bool isPreF
   k_error_flag.template sync<DeviceType>();
 
   // Create scratch array space.
-  const size_t scratchSpaceSize = (8*nspecies + 2*nreactions);
+  //const size_t scratchSpaceSize = (8*nspecies + 2*nreactions);
+  this->scratchSpaceSize = (8*nspecies + 2*nreactions);
   //double *scratchSpace = new double[ scratchSpaceSize * nlocal ];
 
-  typename ArrayTypes<DeviceType>::t_double_1d d_scratchSpace("d_scratchSpace", scratchSpaceSize * nlocal);
+  //typename ArrayTypes<DeviceType>::t_double_1d d_scratchSpace("d_scratchSpace", scratchSpaceSize * nlocal);
+  memory->create_kokkos (d_scratchSpace, nlocal*scratchSpaceSize, "FixRxKokkos::d_scratchSpace");
 
+#if 0
   Kokkos::parallel_reduce( nlocal, LAMMPS_LAMBDA(int i, CounterType &counter)
     {
       if (d_mask(i) & groupbit)
@@ -1514,8 +1618,15 @@ void FixRxKokkos<DeviceType>::solve_reactions(const int vflag, const bool isPreF
 
     , TotalCounters // reduction value for all iterations.
   );
+#else
+  if (setRatesToZero)
+    Kokkos::parallel_reduce( Kokkos::RangePolicy<DeviceType, Tag_FixRxKokkos_solveSystems<true > >(0,nlocal), *this, TotalCounters);
+  else
+    Kokkos::parallel_reduce( Kokkos::RangePolicy<DeviceType, Tag_FixRxKokkos_solveSystems<false> >(0,nlocal), *this, TotalCounters);
+#endif
 
   //delete [] scratchSpace;
+  memory->destroy_kokkos (d_scratchSpace);
 
   TimerType timer_ODE = getTimeStamp();
 
@@ -1570,6 +1681,8 @@ void FixRxKokkos<DeviceType>::solve_reactions(const int vflag, const bool isPreF
          (diagnosticFrequency < 0 && update->ntimestep == update->laststep) )
       this->odeDiagnostics();
   }
+
+  copymode = 0;
 }
 
 /* ---------------------------------------------------------------------- */
@@ -1654,7 +1767,8 @@ void FixRxKokkos<DeviceType>::odeDiagnostics(void)
 
     double my_max[numCounters], my_min[numCounters];
 
-    const int nlocal = atom->nlocal;
+    //const int nlocal = atom->nlocal;
+    nlocal = atom->nlocal;
     HAT::t_int_1d h_mask = atomKK->k_mask.h_view;
 
     for (int i = 0; i < numCounters; ++i)
@@ -1760,17 +1874,122 @@ void FixRxKokkos<DeviceType>::odeDiagnostics(void)
 
 /* ---------------------------------------------------------------------- */
 
+template <typename DeviceType>
+  KOKKOS_INLINE_FUNCTION
+void FixRxKokkos<DeviceType>::operator()(Tag_FixRxKokkos_zeroTemperatureViews, const int& i) const
+{
+  d_sumWeights(i) = 0.0;
+  d_dpdThetaLocal(i) = 0.0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <typename DeviceType>
+  template <int WT_FLAG, bool NEWTON_PAIR, int NEIGHFLAG>
+  KOKKOS_INLINE_FUNCTION
+void FixRxKokkos<DeviceType>::operator()(Tag_FixRxKokkos_firstPairOperator<WT_FLAG,NEWTON_PAIR,NEIGHFLAG>, const int& ii) const
+{
+  // Create an atomic view of sumWeights and dpdThetaLocal. Only needed
+  // for Half/thread scenarios.
+  typedef Kokkos::View< E_FLOAT*, typename DAT::t_efloat_1d::array_layout, typename DAT::t_efloat_1d::device_type, Kokkos::MemoryTraits< AtomicF< NEIGHFLAG >::value> > AtomicViewType;
+
+  AtomicViewType a_dpdThetaLocal = d_dpdThetaLocal;
+  AtomicViewType a_sumWeights    = d_sumWeights;
+
+  // Local scalar accumulators.
+  double i_dpdThetaLocal = 0.0;
+  double i_sumWeights    = 0.0;
+
+  const int i = d_ilist(ii);
+ 
+  const double xtmp = d_x(i,0);
+  const double ytmp = d_x(i,1);
+  const double ztmp = d_x(i,2);
+  const int itype = d_type(i);
+
+  const int jnum = d_numneigh(i);
+
+  for (int jj = 0; jj < jnum; jj++)
+  {
+    const int j = (d_neighbors(i,jj) & NEIGHMASK);
+    const int jtype = d_type(j);
+
+    const double delx = xtmp - d_x(j,0);
+    const double dely = ytmp - d_x(j,1);
+    const double delz = ztmp - d_x(j,2);
+    const double rsq = delx*delx + dely*dely + delz*delz;
+
+    const double cutsq_ij = d_cutsq(itype,jtype);
+
+    if (rsq < cutsq_ij)
+    {
+      const double rcut = sqrt( cutsq_ij );
+      double rij = sqrt(rsq);
+      double ratio = rij/rcut;
+
+      double wij = 0.0;
+
+      // Lucy's Weight Function
+      if (WT_FLAG == LUCY)
+      {
+        wij = (1.0+3.0*ratio) * (1.0-ratio)*(1.0-ratio)*(1.0-ratio);
+        i_dpdThetaLocal += wij / d_dpdTheta(j);
+        if (NEWTON_PAIR || j < nlocal)
+          a_dpdThetaLocal(j) += wij / d_dpdTheta(i);
+      }
+
+      i_sumWeights += wij;
+      if (NEWTON_PAIR || j < nlocal)
+        a_sumWeights(j) += wij;
+    }
+  }
+
+  // Update, don't assign, the array value (because another iteration may have hit it).
+  a_dpdThetaLocal(i) += i_dpdThetaLocal;
+  a_sumWeights(i) += i_sumWeights;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <typename DeviceType>
+  template <int WT_FLAG, int LOCAL_TEMP_FLAG>
+  KOKKOS_INLINE_FUNCTION
+void FixRxKokkos<DeviceType>::operator()(Tag_FixRxKokkos_2ndPairOperator<WT_FLAG,LOCAL_TEMP_FLAG>, const int& i) const
+{
+  double wij = 0.0;
+
+  // Lucy Weight Function
+  if (WT_FLAG == LUCY)
+  {
+    wij = 1.0;
+    d_dpdThetaLocal(i) += wij / d_dpdTheta(i);
+  }
+  d_sumWeights(i) += wij;
+
+  // Normalized local temperature
+  d_dpdThetaLocal(i) = d_dpdThetaLocal(i) / d_sumWeights(i);
+
+  if (LOCAL_TEMP_FLAG == HARMONIC)
+    d_dpdThetaLocal(i) = 1.0 / d_dpdThetaLocal(i);
+}
+
+/* ---------------------------------------------------------------------- */
+
 template <typename DeviceType>
   template <int WT_FLAG, int LOCAL_TEMP_FLAG, bool NEWTON_PAIR, int NEIGHFLAG>
 void FixRxKokkos<DeviceType>::computeLocalTemperature()
 {
-  typename ArrayTypes<DeviceType>::t_x_array_randomread d_x        = atomKK->k_x.view<DeviceType>();
-  typename ArrayTypes<DeviceType>::t_int_1d_randomread  d_type     = atomKK->k_type.view<DeviceType>();
-  typename ArrayTypes<DeviceType>::t_efloat_1d          d_dpdTheta = atomKK->k_dpdTheta.view<DeviceType>(); 
+  //typename ArrayTypes<DeviceType>::t_x_array_randomread d_x        = atomKK->k_x.view<DeviceType>();
+  //typename ArrayTypes<DeviceType>::t_int_1d_randomread  d_type     = atomKK->k_type.view<DeviceType>();
+  //typename ArrayTypes<DeviceType>::t_efloat_1d          d_dpdTheta = atomKK->k_dpdTheta.view<DeviceType>(); 
+  d_x        = atomKK->k_x.view<DeviceType>();
+  d_type     = atomKK->k_type.view<DeviceType>();
+  d_dpdTheta = atomKK->k_dpdTheta.view<DeviceType>(); 
 
   atomKK->sync(execution_space, X_MASK | TYPE_MASK | DPDTHETA_MASK );
 
-  const int nlocal = atom->nlocal;
+  //const int nlocal = atom->nlocal;
+  nlocal = atom->nlocal;
   const int nghost = atom->nghost;
 
   printf("Inside FixRxKokkos::computeLocalTemperature: %d %d %d %d %d %d %d\n", WT_FLAG, LOCAL_TEMP_FLAG, NEWTON_PAIR, (int)lmp->kokkos->neighflag, NEIGHFLAG, nlocal, nghost);
@@ -1780,14 +1999,15 @@ void FixRxKokkos<DeviceType>::computeLocalTemperature()
   //typename ArrayTypes<DeviceType>::t_ffloat_2d d_cutsq = pairDPDEKK->k_cutsq.template view<DeviceType();
 
   //!< Copies pulled from pairDPDE for local use since pairDPDEKK's objects are protected.
-  typename ArrayTypes<DeviceType>::tdual_ffloat_2d k_cutsq;
-  typename ArrayTypes<DeviceType>::t_ffloat_2d     d_cutsq;
-  double **h_cutsq;
+  //typename ArrayTypes<DeviceType>::tdual_ffloat_2d k_cutsq;
+  //typename ArrayTypes<DeviceType>::t_ffloat_2d     d_cutsq;
+  //double **h_cutsq;
 
   {
     const int ntypes = atom->ntypes;
 
-    memory->create_kokkos (k_cutsq, h_cutsq, ntypes+1, ntypes+1, "pair:cutsq");
+    //memory->create_kokkos (k_cutsq, h_cutsq, ntypes+1, ntypes+1, "pair:cutsq");
+    memory->create_kokkos (k_cutsq, ntypes+1, ntypes+1, "FixRxKokkos::k_cutsq");
     d_cutsq = k_cutsq.template view<DeviceType>();
 
     for (int i = 1; i <= ntypes; ++i)
@@ -1804,30 +2024,37 @@ void FixRxKokkos<DeviceType>::computeLocalTemperature()
   // Initialize the local temperature weight array
   int sumWeightsCt = nlocal + (NEWTON_PAIR ? nghost : 0);
 
-  memory->create_kokkos (k_sumWeights, sumWeights, sumWeightsCt, "FixRxKokkos::sumWeights");
+  //memory->create_kokkos (k_sumWeights, sumWeights, sumWeightsCt, "FixRxKokkos::sumWeights");
+  memory->create_kokkos (k_sumWeights, sumWeightsCt, "FixRxKokkos::sumWeights");
   d_sumWeights = k_sumWeights.d_view;
   h_sumWeights = k_sumWeights.h_view;
 
   // Initialize the accumulator to zero ...
-  Kokkos::parallel_for (sumWeightsCt,
-        LAMMPS_LAMBDA(const int i)
-        {
-           d_sumWeights(i) = 0.0;
-        }
-     );
+  //Kokkos::parallel_for (sumWeightsCt,
+  //      LAMMPS_LAMBDA(const int i)
+  //      {
+  //         d_sumWeights(i) = 0.0;
+  //      }
+  //   );
+
+  Kokkos::parallel_for (Kokkos::RangePolicy<DeviceType, Tag_FixRxKokkos_zeroTemperatureViews>(0, sumWeightsCt), *this);
 
   // Local list views. (This isn't working!)
   NeighListKokkos<DeviceType>* k_list = static_cast<NeighListKokkos<DeviceType>*>(list);
   if (not(list->kokkos))
      error->one(FLERR,"list is not a Kokkos list\n");
 
-  typename ArrayTypes<DeviceType>::t_neighbors_2d d_neighbors = k_list->d_neighbors;
-  typename ArrayTypes<DeviceType>::t_int_1d       d_ilist     = k_list->d_ilist;
-  typename ArrayTypes<DeviceType>::t_int_1d       d_numneigh  = k_list->d_numneigh;
+  //typename ArrayTypes<DeviceType>::t_neighbors_2d d_neighbors = k_list->d_neighbors;
+  //typename ArrayTypes<DeviceType>::t_int_1d       d_ilist     = k_list->d_ilist;
+  //typename ArrayTypes<DeviceType>::t_int_1d       d_numneigh  = k_list->d_numneigh;
+  d_neighbors = k_list->d_neighbors;
+  d_ilist     = k_list->d_ilist;
+  d_numneigh  = k_list->d_numneigh;
 
   const int inum = list->inum;
 
   // loop over neighbors of my atoms
+#if 0
   Kokkos::parallel_for ( inum,
         LAMMPS_LAMBDA(const int ii)
         {
@@ -1892,6 +2119,9 @@ void FixRxKokkos<DeviceType>::computeLocalTemperature()
           a_sumWeights(i) += i_sumWeights;
         }
      );
+#else
+  Kokkos::parallel_for (Kokkos::RangePolicy<DeviceType, Tag_FixRxKokkos_firstPairOperator<WT_FLAG, NEWTON_PAIR, NEIGHFLAG> >(0, inum), *this);
+#endif
 
   // Signal that dpdThetaLocal and sumWeights have been modified.
   k_dpdThetaLocal.template modify<DeviceType>();
@@ -1905,6 +2135,7 @@ void FixRxKokkos<DeviceType>::computeLocalTemperature()
   k_sumWeights.   template sync<DeviceType>();
 
   // self-interaction for local temperature
+#if 0
   Kokkos::parallel_for ( nlocal,
         LAMMPS_LAMBDA(const int i)
         {
@@ -1925,10 +2156,15 @@ void FixRxKokkos<DeviceType>::computeLocalTemperature()
             d_dpdThetaLocal(i) = 1.0 / d_dpdThetaLocal(i);
         }
      );
+#else
+  Kokkos::parallel_for (Kokkos::RangePolicy<DeviceType, Tag_FixRxKokkos_2ndPairOperator<WT_FLAG, LOCAL_TEMP_FLAG> >(0, nlocal), *this);
+#endif
 
   // Clean up the local kokkos data.
-  memory->destroy_kokkos(k_cutsq, h_cutsq);
-  memory->destroy_kokkos(k_sumWeights, sumWeights);
+  //memory->destroy_kokkos(k_cutsq, h_cutsq);
+  memory->destroy_kokkos(k_cutsq);
+  //memory->destroy_kokkos(k_sumWeights, sumWeights);
+  memory->destroy_kokkos(k_sumWeights);
 }
 
 /* ---------------------------------------------------------------------- */
diff --git a/src/KOKKOS/fix_rx_kokkos.h b/src/KOKKOS/fix_rx_kokkos.h
index c18ce6f151..169a87a2f9 100644
--- a/src/KOKKOS/fix_rx_kokkos.h
+++ b/src/KOKKOS/fix_rx_kokkos.h
@@ -30,6 +30,47 @@ FixStyle(rx/kk/host,FixRxKokkos<LMPHostType>)
 
 namespace LAMMPS_NS {
 
+struct Tag_FixRxKokkos_zeroTemperatureViews {};
+struct Tag_FixRxKokkos_zeroCounterViews {};
+
+template <int WT_FLAG, bool NEWTON_PAIR, int NEIGHFLAG>
+struct Tag_FixRxKokkos_firstPairOperator {};
+
+template <int WT_FLAG, int LOCAL_TEMP_FLAG>
+struct Tag_FixRxKokkos_2ndPairOperator {};
+
+template <bool ZERO_RATES>
+struct Tag_FixRxKokkos_solveSystems {};
+
+struct s_CounterType
+{
+  int nSteps, nIters, nFuncs, nFails;
+
+  KOKKOS_INLINE_FUNCTION
+  s_CounterType() : nSteps(0), nIters(0), nFuncs(0), nFails(0) {};
+
+  KOKKOS_INLINE_FUNCTION
+  s_CounterType& operator+=(const s_CounterType &rhs)
+  {
+    nSteps += rhs.nSteps;
+    nIters += rhs.nIters;
+    nFuncs += rhs.nFuncs;
+    nFails += rhs.nFails;
+    return *this;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  volatile s_CounterType& operator+=(const volatile s_CounterType &rhs) volatile
+  {
+    nSteps += rhs.nSteps;
+    nIters += rhs.nIters;
+    nFuncs += rhs.nFuncs;
+    nFails += rhs.nFails;
+    return *this;
+  }
+};
+typedef struct s_CounterType CounterType;
+
 template <typename DeviceType>
 class FixRxKokkos : public FixRX {
  public:
@@ -41,42 +82,34 @@ class FixRxKokkos : public FixRX {
   virtual void setup_pre_force(int);
   virtual void pre_force(int);
 
-  //template <typename SolverTag>
-  //  KOKKOS_INLINE_FUNCTION
-  //void operator()(SolverTag, const int&) const;
+  // Define a value_type here for the reduction operator on CounterType.
+  typedef CounterType value_type;
 
-  struct CounterType
-  {
-    int nSteps, nIters, nFuncs, nFails;
+  KOKKOS_INLINE_FUNCTION
+  void operator()(Tag_FixRxKokkos_zeroCounterViews, const int&) const;
 
-    KOKKOS_INLINE_FUNCTION
-    CounterType() : nSteps(0), nIters(0), nFuncs(0), nFails(0) {};
+  KOKKOS_INLINE_FUNCTION
+  void operator()(Tag_FixRxKokkos_zeroTemperatureViews, const int&) const;
 
-    KOKKOS_INLINE_FUNCTION
-    CounterType& operator+=(const CounterType &rhs)
-    {
-      nSteps += rhs.nSteps;
-      nIters += rhs.nIters;
-      nFuncs += rhs.nFuncs;
-      nFails += rhs.nFails;
-      return *this;
-    }
+  template <int WT_FLAG, bool NEWTON_PAIR, int NEIGHFLAG>
+  KOKKOS_INLINE_FUNCTION
+  void operator()(Tag_FixRxKokkos_firstPairOperator<WT_FLAG,NEWTON_PAIR,NEIGHFLAG>, const int&) const;
 
-    KOKKOS_INLINE_FUNCTION
-    volatile CounterType& operator+=(const volatile CounterType &rhs) volatile
-    {
-      nSteps += rhs.nSteps;
-      nIters += rhs.nIters;
-      nFuncs += rhs.nFuncs;
-      nFails += rhs.nFails;
-      return *this;
-    }
-  };
+  template <int WT_FLAG, int LOCAL_TEMP_FLAG>
+  KOKKOS_INLINE_FUNCTION
+  void operator()(Tag_FixRxKokkos_2ndPairOperator<WT_FLAG,LOCAL_TEMP_FLAG>, const int&) const;
+
+  template <bool ZERO_RATES>
+  KOKKOS_INLINE_FUNCTION
+  void operator()(Tag_FixRxKokkos_solveSystems<ZERO_RATES>, const int&, CounterType&) const;
 
  //protected:
   PairDPDfdtEnergyKokkos<DeviceType>* pairDPDEKK;
   double VDPD;
 
+  double boltz;
+  double t_stop;
+
   template <typename T, int stride = 1>
   struct StridedArrayType
   {
@@ -203,6 +236,27 @@ class FixRxKokkos : public FixRX {
   typename DAT::t_efloat_1d d_dpdThetaLocal, d_sumWeights;
   typename HAT::t_efloat_1d h_dpdThetaLocal, h_sumWeights;
 
+  typename ArrayTypes<DeviceType>::t_x_array_randomread d_x       ;
+  typename ArrayTypes<DeviceType>::t_int_1d_randomread  d_type    ;
+  typename ArrayTypes<DeviceType>::t_efloat_1d          d_dpdTheta;
+
+  typename ArrayTypes<DeviceType>::tdual_ffloat_2d k_cutsq;
+  typename ArrayTypes<DeviceType>::t_ffloat_2d     d_cutsq;
+  //double **h_cutsq;
+
+  typename ArrayTypes<DeviceType>::t_neighbors_2d d_neighbors;
+  typename ArrayTypes<DeviceType>::t_int_1d       d_ilist    ;
+  typename ArrayTypes<DeviceType>::t_int_1d       d_numneigh ;
+
+  typename ArrayTypes<DeviceType>::t_float_2d  d_dvector;
+  typename ArrayTypes<DeviceType>::t_int_1d    d_mask   ;
+
+  typename ArrayTypes<DeviceType>::t_double_1d d_scratchSpace;
+  size_t scratchSpaceSize;
+
+  // Error flag for any failures.
+  DAT::tdual_int_scalar k_error_flag;
+
   template <int WT_FLAG, int LOCAL_TEMP_FLAG, bool NEWTON_PAIR, int NEIGHFLAG>
   void computeLocalTemperature();
 
@@ -213,6 +267,7 @@ class FixRxKokkos : public FixRX {
 
  //private: // replicate a few from FixRX
   int my_restartFlag;
+  int nlocal;
 };
 
 }
diff --git a/src/USER-DPD/fix_rx.cpp b/src/USER-DPD/fix_rx.cpp
index 28321dbecf..8a8195da19 100644
--- a/src/USER-DPD/fix_rx.cpp
+++ b/src/USER-DPD/fix_rx.cpp
@@ -220,6 +220,9 @@ FixRX::FixRX(LAMMPS *lmp, int narg, char **arg) :
 
 FixRX::~FixRX()
 {
+  printf("Inside FixRX::~FixRX copymode= %d\n", copymode);
+  if (copymode) return;
+
   // De-Allocate memory to prevent memory leak
   for (int ii = 0; ii < nreactions; ii++){
     delete [] stoich[ii];

From acc5bde0fe53a2e9052ee7a27ceafb42acbea114 Mon Sep 17 00:00:00 2001
From: Christopher Stone <chris.stone@computational-science.com>
Date: Mon, 13 Feb 2017 16:36:30 -0500
Subject: [PATCH 122/267] Removed printf's from FixRXKokkos and FixRX.

- Commented out the printf's in FixRXKokkos and FixRX used for
  active debugging.
- Also commented out the OpenMP pragmas in FixRX::pre_force().
---
 src/KOKKOS/fix_rx_kokkos.cpp | 28 ++++++++++++++--------------
 src/USER-DPD/fix_rx.cpp      | 18 +++++++++---------
 2 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/src/KOKKOS/fix_rx_kokkos.cpp b/src/KOKKOS/fix_rx_kokkos.cpp
index 77e948be35..08a20ac9a7 100644
--- a/src/KOKKOS/fix_rx_kokkos.cpp
+++ b/src/KOKKOS/fix_rx_kokkos.cpp
@@ -71,13 +71,13 @@ FixRxKokkos<DeviceType>::FixRxKokkos(LAMMPS *lmp, int narg, char **arg) :
 
   k_error_flag = DAT::tdual_int_scalar("FixRxKokkos::k_error_flag");
 
-  printf("Inside FixRxKokkos::FixRxKokkos\n");
+  //printf("Inside FixRxKokkos::FixRxKokkos\n");
 }
 
 template <typename DeviceType>
 FixRxKokkos<DeviceType>::~FixRxKokkos()
 {
-  printf("Inside FixRxKokkos::~FixRxKokkos copymode= %d\n", copymode);
+  //printf("Inside FixRxKokkos::~FixRxKokkos copymode= %d\n", copymode);
   if (copymode) return;
 }
 
@@ -98,7 +98,7 @@ void FixRxKokkos<DeviceType>::post_constructor()
 template <typename DeviceType>
 void FixRxKokkos<DeviceType>::init()
 {
-  printf("Inside FixRxKokkos::init\n");
+  //printf("Inside FixRxKokkos::init\n");
 
   // Call the parent's version.
   //FixRX::init();
@@ -153,7 +153,7 @@ void FixRxKokkos<DeviceType>::init()
 template <typename DeviceType>
 void FixRxKokkos<DeviceType>::init_list(int, class NeighList* ptr)
 {
-  printf("Inside FixRxKokkos::init_list\n");
+  //printf("Inside FixRxKokkos::init_list\n");
   this->list = ptr;
 }
 
@@ -1220,7 +1220,7 @@ void FixRxKokkos<DeviceType>::operator()(SolverType, const int &i) const
 template <typename DeviceType>
 void FixRxKokkos<DeviceType>::create_kinetics_data(void)
 {
-  printf("Inside FixRxKokkos::create_kinetics_data\n");
+  //printf("Inside FixRxKokkos::create_kinetics_data\n");
 
   memory->create_kokkos( d_kineticsData.Arr, h_kineticsData.Arr, nreactions, "KineticsType::Arr");
   memory->create_kokkos( d_kineticsData.nArr, h_kineticsData.nArr, nreactions, "KineticsType::nArr");
@@ -1301,7 +1301,7 @@ void FixRxKokkos<DeviceType>::create_kinetics_data(void)
 template <typename DeviceType>
 void FixRxKokkos<DeviceType>::setup_pre_force(int vflag)
 {
-  printf("Inside FixRxKokkos<DeviceType>::setup_pre_force restartFlag= %d\n", my_restartFlag);
+  //printf("Inside FixRxKokkos<DeviceType>::setup_pre_force restartFlag= %d\n", my_restartFlag);
 
   if (my_restartFlag)
     my_restartFlag = 0;
@@ -1314,7 +1314,7 @@ void FixRxKokkos<DeviceType>::setup_pre_force(int vflag)
 template <typename DeviceType>
 void FixRxKokkos<DeviceType>::pre_force(int vflag)
 {
-  printf("Inside FixRxKokkos<DeviceType>::pre_force localTempFlag= %d\n", localTempFlag);
+  //printf("Inside FixRxKokkos<DeviceType>::pre_force localTempFlag= %d\n", localTempFlag);
 
   this->solve_reactions( vflag, true );
 }
@@ -1412,7 +1412,7 @@ void FixRxKokkos<DeviceType>::operator()(Tag_FixRxKokkos_solveSystems<ZERO_RATES
 template <typename DeviceType>
 void FixRxKokkos<DeviceType>::solve_reactions(const int vflag, const bool isPreForce)
 {
-  printf("Inside FixRxKokkos<DeviceType>::solve_reactions localTempFlag= %d isPreForce= %s\n", localTempFlag, isPreForce ? "True" : "false");
+  //printf("Inside FixRxKokkos<DeviceType>::solve_reactions localTempFlag= %d isPreForce= %s\n", localTempFlag, isPreForce ? "True" : "false");
 
   copymode = 1;
 
@@ -1653,11 +1653,11 @@ void FixRxKokkos<DeviceType>::solve_reactions(const int vflag, const bool isPreF
 
   double time_ODE = getElapsedTime(timer_localTemperature, timer_ODE);
 
-  printf("me= %d kokkos total= %g temp= %g ode= %g comm= %g nlocal= %d nfc= %d %d\n", comm->me,
-                         getElapsedTime(timer_start, timer_stop),
-                         getElapsedTime(timer_start, timer_localTemperature),
-                         getElapsedTime(timer_localTemperature, timer_ODE),
-                         getElapsedTime(timer_ODE, timer_stop), nlocal, TotalCounters.nFuncs, TotalCounters.nSteps);
+  //printf("me= %d kokkos total= %g temp= %g ode= %g comm= %g nlocal= %d nfc= %d %d\n", comm->me,
+  //                       getElapsedTime(timer_start, timer_stop),
+  //                       getElapsedTime(timer_start, timer_localTemperature),
+  //                       getElapsedTime(timer_localTemperature, timer_ODE),
+  //                       getElapsedTime(timer_ODE, timer_stop), nlocal, TotalCounters.nFuncs, TotalCounters.nSteps);
 
   // Warn the user if a failure was detected in the ODE solver.
   if (TotalCounters.nFails > 0){
@@ -1992,7 +1992,7 @@ void FixRxKokkos<DeviceType>::computeLocalTemperature()
   nlocal = atom->nlocal;
   const int nghost = atom->nghost;
 
-  printf("Inside FixRxKokkos::computeLocalTemperature: %d %d %d %d %d %d %d\n", WT_FLAG, LOCAL_TEMP_FLAG, NEWTON_PAIR, (int)lmp->kokkos->neighflag, NEIGHFLAG, nlocal, nghost);
+  //printf("Inside FixRxKokkos::computeLocalTemperature: %d %d %d %d %d %d %d\n", WT_FLAG, LOCAL_TEMP_FLAG, NEWTON_PAIR, (int)lmp->kokkos->neighflag, NEIGHFLAG, nlocal, nghost);
 
   // Pull from pairDPDE. The pairDPDEKK objects are protected so recreate here for now.
   //pairDPDEKK->k_cutsq.template sync<DeviceType>();
diff --git a/src/USER-DPD/fix_rx.cpp b/src/USER-DPD/fix_rx.cpp
index 8a8195da19..a8939e27f2 100644
--- a/src/USER-DPD/fix_rx.cpp
+++ b/src/USER-DPD/fix_rx.cpp
@@ -220,7 +220,7 @@ FixRX::FixRX(LAMMPS *lmp, int narg, char **arg) :
 
 FixRX::~FixRX()
 {
-  printf("Inside FixRX::~FixRX copymode= %d\n", copymode);
+  //printf("Inside FixRX::~FixRX copymode= %d\n", copymode);
   if (copymode) return;
 
   // De-Allocate memory to prevent memory leak
@@ -756,8 +756,8 @@ void FixRX::pre_force(int vflag)
     memory->create( diagnosticCounterPerODE[FuncSum], nlocal, "FixRX::diagnosticCounterPerODE");
   }
 
-  #pragma omp parallel \
-     reduction(+: nSteps, nIters, nFuncs, nFails )
+  //#pragma omp parallel \
+  //   reduction(+: nSteps, nIters, nFuncs, nFails )
   {
     double *rwork = new double[8*nspecies];
 
@@ -767,7 +767,7 @@ void FixRX::pre_force(int vflag)
 
     int ode_counter[4] = { 0 };
 
-    #pragma omp for schedule(runtime)
+    //#pragma omp for schedule(runtime)
     for (int i = 0; i < nlocal; i++)
     {
       if (mask[i] & groupbit)
@@ -810,11 +810,11 @@ void FixRX::pre_force(int vflag)
 
   double time_ODE = getElapsedTime(timer_localTemperature, timer_ODE);
 
-  printf("me= %d total= %g temp= %g ode= %g comm= %g nlocal= %d nfc= %d %d\n", comm->me,
-                         getElapsedTime(timer_start, timer_stop),
-                         getElapsedTime(timer_start, timer_localTemperature),
-                         getElapsedTime(timer_localTemperature, timer_ODE),
-                         getElapsedTime(timer_ODE, timer_stop), nlocal, nFuncs, nSteps);
+  //printf("me= %d total= %g temp= %g ode= %g comm= %g nlocal= %d nfc= %d %d\n", comm->me,
+  //                       getElapsedTime(timer_start, timer_stop),
+  //                       getElapsedTime(timer_start, timer_localTemperature),
+  //                       getElapsedTime(timer_localTemperature, timer_ODE),
+  //                       getElapsedTime(timer_ODE, timer_stop), nlocal, nFuncs, nSteps);
 
   // Warn the user if a failure was detected in the ODE solver.
   if (nFails > 0){

From 0a751c59012ec2ef97d5d5313512983ef77f2c0f Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Wed, 22 Feb 2017 11:52:20 -0500
Subject: [PATCH 123/267] KOKKOS: fix a compile-time error caused by merge of
 patch 21Feb17. Remove the unused PairHybridOverlayKokkos::modify_requests()
 method. The patch removed the parent PairHybridOverlay::modify_requests().

---
 src/KOKKOS/pair_hybrid_overlay_kokkos.cpp | 35 -----------------------
 src/KOKKOS/pair_hybrid_overlay_kokkos.h   |  3 --
 2 files changed, 38 deletions(-)

diff --git a/src/KOKKOS/pair_hybrid_overlay_kokkos.cpp b/src/KOKKOS/pair_hybrid_overlay_kokkos.cpp
index 79d9c63221..aa5d895155 100644
--- a/src/KOKKOS/pair_hybrid_overlay_kokkos.cpp
+++ b/src/KOKKOS/pair_hybrid_overlay_kokkos.cpp
@@ -105,38 +105,3 @@ void PairHybridOverlayKokkos::coeff(int narg, char **arg)
 
   if (count == 0) error->all(FLERR,"Incorrect args for pair coefficients");
 }
-
-/* ----------------------------------------------------------------------
-   combine sub-style neigh list requests and create new ones if needed
-------------------------------------------------------------------------- */
-
-void PairHybridOverlayKokkos::modify_requests()
-{
-  int i,j;
-  NeighRequest *irq,*jrq;
-
-  // loop over pair requests only
-  // if a previous list is same kind with same skip attributes
-  // then make this one a copy list of that one
-  // works whether both lists are no-skip or yes-skip
-  // will not point a list at a copy list, but at copy list's parent
-
-  for (i = 0; i < neighbor->nrequest; i++) {
-    if (!neighbor->requests[i]->pair) continue;
-
-    irq = neighbor->requests[i];
-    for (j = 0; j < i; j++) {
-      if (!neighbor->requests[j]->pair) continue;
-      jrq = neighbor->requests[j];
-      if (irq->same_kind(jrq) && irq->same_skip(jrq)) {
-        irq->copy = 1;
-        irq->otherlist = j;
-        break;
-      }
-    }
-  }
-
-  // perform same operations on skip lists as pair style = hybrid
-
-  PairHybrid::modify_requests();
-}
diff --git a/src/KOKKOS/pair_hybrid_overlay_kokkos.h b/src/KOKKOS/pair_hybrid_overlay_kokkos.h
index 2e4899a1f3..6bec57c453 100644
--- a/src/KOKKOS/pair_hybrid_overlay_kokkos.h
+++ b/src/KOKKOS/pair_hybrid_overlay_kokkos.h
@@ -29,9 +29,6 @@ class PairHybridOverlayKokkos : public PairHybridKokkos {
   PairHybridOverlayKokkos(class LAMMPS *);
   virtual ~PairHybridOverlayKokkos() {}
   void coeff(int, char **);
-
- private:
-  void modify_requests();
 };
 
 }

From 2db66e49b444c829a27e7a874d0fba49faf0387b Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Fri, 30 Dec 2016 12:16:54 -0500
Subject: [PATCH 124/267] USER-DPD: make pair_dpd_fdt* check more generically
 for use of fix_shardlow. Allows easier experimentation with alternative
 shardlow implementations.

---
 src/USER-DPD/pair_dpd_fdt.cpp        | 2 +-
 src/USER-DPD/pair_dpd_fdt_energy.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/USER-DPD/pair_dpd_fdt.cpp b/src/USER-DPD/pair_dpd_fdt.cpp
index e7e9febd82..90aa4f1eaf 100644
--- a/src/USER-DPD/pair_dpd_fdt.cpp
+++ b/src/USER-DPD/pair_dpd_fdt.cpp
@@ -325,7 +325,7 @@ void PairDPDfdt::init_style()
   splitFDT_flag = false;
   int irequest = neighbor->request(this,instance_me);
   for (int i = 0; i < modify->nfix; i++)
-    if (strcmp(modify->fix[i]->style,"shardlow") == 0){
+    if (strncmp(modify->fix[i]->style,"shardlow", 8) == 0){
       splitFDT_flag = true;
     }
 }
diff --git a/src/USER-DPD/pair_dpd_fdt_energy.cpp b/src/USER-DPD/pair_dpd_fdt_energy.cpp
index 9d08393b9d..ad6310a283 100644
--- a/src/USER-DPD/pair_dpd_fdt_energy.cpp
+++ b/src/USER-DPD/pair_dpd_fdt_energy.cpp
@@ -414,7 +414,7 @@ void PairDPDfdtEnergy::init_style()
   splitFDT_flag = false;
   int irequest = neighbor->request(this,instance_me);
   for (int i = 0; i < modify->nfix; i++)
-    if (strcmp(modify->fix[i]->style,"shardlow") == 0){
+    if (strncmp(modify->fix[i]->style,"shardlow", 8) == 0){
       splitFDT_flag = true;
     }
 

From 0512e7886067fbed5d9178654ffe74b16020e258 Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Fri, 30 Dec 2016 14:42:21 -0500
Subject: [PATCH 125/267] USER-DPD: new neighbor list code for SSA that gives
 neighbors to ghosts. This simplifies the processing of the neighbor list in
 fix_shardlow. NOTE: pair evaluation order changes, causing numerical
 differences!

---
 src/USER-DPD/fix_shardlow.cpp                 |  18 ++-
 src/USER-DPD/nbin_ssa.cpp                     |  27 ++--
 src/USER-DPD/nbin_ssa.h                       |   2 +-
 src/USER-DPD/npair_half_bin_newton_ssa.cpp    | 153 ++++++++----------
 src/USER-DPD/npair_half_bin_newton_ssa.h      |   2 +-
 src/USER-DPD/npair_halffull_newton_ssa.cpp    |   4 +
 .../nstencil_half_bin_2d_newton_ssa.cpp       |  12 +-
 .../nstencil_half_bin_2d_newton_ssa.h         |   2 +-
 .../nstencil_half_bin_3d_newton_ssa.cpp       |  18 ++-
 .../nstencil_half_bin_3d_newton_ssa.h         |   2 +-
 src/USER-DPD/nstencil_ssa.h                   |   2 +-
 11 files changed, 129 insertions(+), 113 deletions(-)

diff --git a/src/USER-DPD/fix_shardlow.cpp b/src/USER-DPD/fix_shardlow.cpp
index bf8959fa9f..56597697f7 100644
--- a/src/USER-DPD/fix_shardlow.cpp
+++ b/src/USER-DPD/fix_shardlow.cpp
@@ -148,6 +148,7 @@ void FixShardlow::init()
   int irequest = neighbor->request(this,instance_me);
   neighbor->requests[irequest]->pair = 0;
   neighbor->requests[irequest]->fix  = 1;
+  neighbor->requests[irequest]->ghost= 1;
   neighbor->requests[irequest]->ssa  = 1;
 }
 
@@ -498,7 +499,7 @@ void FixShardlow::ssa_update_dpde(
 
 void FixShardlow::initial_integrate(int vflag)
 {
-  int i,ii,inum;
+  int i,ii,inum,anum;
   int *ilist;
 
   int nlocal = atom->nlocal;
@@ -531,10 +532,12 @@ void FixShardlow::initial_integrate(int vflag)
   v_t0 = (double (*)[3]) memory->smalloc(sizeof(double)*3*nghost, "FixShardlow:v_t0");
 
   inum = list->inum;
+  anum = inum + list->gnum;
   ilist = list->ilist;
 
   dtsqrt = sqrt(update->dt);
 
+  ii = 0;
   //Loop over all 14 directions (8 stages)
   for (airnum = 1; airnum <=8; airnum++){
 
@@ -549,15 +552,16 @@ void FixShardlow::initial_integrate(int vflag)
       }
     }
 
-    // Loop over neighbors of my atoms
-    for (ii = 0; ii < inum; ii++) {
+    // process neighbors in this AIR
+    while (ii < anum) {
       i = ilist[ii];
-      int start = (airnum < 2) ? 0 : list->ndxAIR_ssa[i][airnum - 2];
-      int len = list->ndxAIR_ssa[i][airnum - 1] - start;
+      if (atom->ssaAIR[i] > airnum) break; /* done with curent AIR */
+      int len = list->numneigh[i];
       if (len > 0) {
-        if (useDPDE) ssa_update_dpde(i, &(list->firstneigh[i][start]), len);
-        else ssa_update_dpd(i, &(list->firstneigh[i][start]), len);
+        if (useDPDE) ssa_update_dpde(i, &(list->firstneigh[i][0]), len);
+        else ssa_update_dpd(i, &(list->firstneigh[i][0]), len);
       }
+      ii++;
     }
 
     // Communicate the ghost deltas to the atom owners
diff --git a/src/USER-DPD/nbin_ssa.cpp b/src/USER-DPD/nbin_ssa.cpp
index 73da5e0df3..c2d780bac6 100644
--- a/src/USER-DPD/nbin_ssa.cpp
+++ b/src/USER-DPD/nbin_ssa.cpp
@@ -33,14 +33,13 @@ NBinSSA::NBinSSA(LAMMPS *lmp) : NBinStandard(lmp)
   bins_ssa = NULL;
   maxhead_ssa = 0;
   binhead_ssa = NULL;
-  gbinhead_ssa = NULL;
+  for (int i = 0; i < 9; i++) gairhead_ssa[i] = -1;
 }
 
 NBinSSA::~NBinSSA()
 {
   memory->destroy(bins_ssa);
   memory->destroy(binhead_ssa);
-  memory->destroy(gbinhead_ssa);
 }
 
 /* ----------------------------------------------------------------------
@@ -62,8 +61,11 @@ void NBinSSA::bin_atoms()
 
   last_bin = update->ntimestep;
 
+  for (i = 0; i < 9; i++) {
+    gairhead_ssa[i] = -1;
+  }
+
   for (i = 0; i < mbins; i++) {
-    gbinhead_ssa[i] = -1;
     binhead_ssa[i] = -1;
   }
 
@@ -73,19 +75,19 @@ void NBinSSA::bin_atoms()
     int bitmask = group->bitmask[includegroup];
     int nowned = atom->nlocal; // NOTE: nlocal was set to atom->nfirst above
     for (i = nall-1; i >= nowned; i--) {
-      if (ssaAIR[i] < 2) continue; // skip ghost atoms not in AIR
+      ibin = ssaAIR[i];
+      if (ibin < 2) continue; // skip ghost atoms not in AIR
       if (mask[i] & bitmask) {
-        ibin = coord2bin(x[i]);
-        bins_ssa[i] = gbinhead_ssa[ibin];
-        gbinhead_ssa[ibin] = i;
+        bins_ssa[i] = gairhead_ssa[ibin];
+        gairhead_ssa[ibin] = i;
       }
     }
   } else {
     for (i = nall-1; i >= nlocal; i--) {
-      if (ssaAIR[i] < 2) continue; // skip ghost atoms not in AIR
-      ibin = coord2bin(x[i]);
-      bins_ssa[i] = gbinhead_ssa[ibin];
-      gbinhead_ssa[ibin] = i;
+      ibin = ssaAIR[i];
+      if (ibin < 2) continue; // skip ghost atoms not in AIR
+      bins_ssa[i] = gairhead_ssa[ibin];
+      gairhead_ssa[ibin] = i;
     }
   }
   for (i = nlocal-1; i >= 0; i--) {
@@ -103,10 +105,8 @@ void NBinSSA::bin_atoms_setup(int nall)
 
   if (mbins > maxhead_ssa) {
     maxhead_ssa = mbins;
-    memory->destroy(gbinhead_ssa);
     memory->destroy(binhead_ssa);
     memory->create(binhead_ssa,maxhead_ssa,"binhead_ssa");
-    memory->create(gbinhead_ssa,maxhead_ssa,"gbinhead_ssa");
   }
 
   if (nall > maxbin_ssa) {
@@ -125,7 +125,6 @@ bigint NBinSSA::memory_usage()
   if (maxbin_ssa) bytes += memory->usage(bins_ssa,maxbin_ssa);
   if (maxhead_ssa) {
     bytes += memory->usage(binhead_ssa,maxhead_ssa);
-    bytes += memory->usage(gbinhead_ssa,maxhead_ssa);
   }
   return bytes;
 }
diff --git a/src/USER-DPD/nbin_ssa.h b/src/USER-DPD/nbin_ssa.h
index f0699b3a7a..5a2562d305 100644
--- a/src/USER-DPD/nbin_ssa.h
+++ b/src/USER-DPD/nbin_ssa.h
@@ -32,7 +32,7 @@ class NBinSSA : public NBinStandard {
   int *bins_ssa;             // index of next atom in each bin
   int maxbin_ssa;            // size of bins_ssa array
   int *binhead_ssa;          // index of 1st local atom in each bin
-  int *gbinhead_ssa;         // index of 1st ghost atom in each bin
+  int gairhead_ssa[9];       // index of 1st ghost atom in each AIR
   int maxhead_ssa;           // size of binhead_ssa and gbinhead_ssa arrays
 
   NBinSSA(class LAMMPS *);
diff --git a/src/USER-DPD/npair_half_bin_newton_ssa.cpp b/src/USER-DPD/npair_half_bin_newton_ssa.cpp
index fd67b66e9b..4c9dc95308 100644
--- a/src/USER-DPD/npair_half_bin_newton_ssa.cpp
+++ b/src/USER-DPD/npair_half_bin_newton_ssa.cpp
@@ -32,12 +32,6 @@
 
 using namespace LAMMPS_NS;
 
-// allocate space for static class variable
-// prototype for non-class function
-
-static int *ssaAIRptr;
-static int cmp_ssaAIR(const void *, const void *);
-
 /* ---------------------------------------------------------------------- */
 
 NPairHalfBinNewtonSSA::NPairHalfBinNewtonSSA(LAMMPS *lmp) : NPair(lmp) {}
@@ -64,9 +58,7 @@ void NPairHalfBinNewtonSSA::build(NeighList *list)
   tagint **special = atom->special;
   int **nspecial = atom->nspecial;
   int nlocal = atom->nlocal;
-  int nall = nlocal + atom->nghost;
   if (includegroup) nlocal = atom->nfirst;
-  int *ssaAIR = atom->ssaAIR;
 
   int *molindex = atom->molindex;
   int *molatom = atom->molatom;
@@ -89,16 +81,18 @@ void NPairHalfBinNewtonSSA::build(NeighList *list)
   if (!nb_ssa) error->one(FLERR, "NBin wasn't a NBinSSA object");
   int *bins_ssa = nb_ssa->bins_ssa;
   int *binhead_ssa = nb_ssa->binhead_ssa;
-  int *gbinhead_ssa = nb_ssa->gbinhead_ssa;
+  int *gairhead_ssa = &(nb_ssa->gairhead_ssa[0]);
 
   int inum = 0;
+  int gnum = 0;
+  int xbin,ybin,zbin,xbin2,ybin2,zbin2;
+  int **stencilxyz = ns_ssa->stencilxyz;
 
   ipage->reset();
 
   // loop over owned atoms, storing half of the neighbors
 
   for (i = 0; i < nlocal; i++) {
-    int AIRct[8] = { 0 };
     n = 0;
     neighptr = ipage->vget();
 
@@ -175,51 +169,6 @@ void NPairHalfBinNewtonSSA::build(NeighList *list)
         }
       }
     }
-    AIRct[0] = n;
-
-    // loop over AIR ghost atoms in all bins in "full" stencil
-    // Note: the non-AIR ghost atoms have already been filtered out
-    // That is a significant time savings because of the "full" stencil
-    // Note2: only non-pure locals can have ghosts as neighbors
-
-    if (ssaAIR[i] == 1) for (k = 0; k < nstencil_full; k++) {
-      for (j = gbinhead_ssa[ibin+stencil[k]]; j >= 0;
-           j = bins_ssa[j]) {
-
-        jtype = type[j];
-        if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
-
-        delx = xtmp - x[j][0];
-        dely = ytmp - x[j][1];
-        delz = ztmp - x[j][2];
-        rsq = delx*delx + dely*dely + delz*delz;
-
-        if (rsq <= cutneighsq[itype][jtype]) {
-          if (molecular) {
-            if (!moltemplate)
-              which = find_special(special[i],nspecial[i],tag[j]);
-            else if (imol >= 0)
-              which = find_special(onemols[imol]->special[iatom],
-                                   onemols[imol]->nspecial[iatom],
-                                   tag[j]-tagprev);
-            else which = 0;
-            if (which == 0) {
-              neighptr[n++] = j;
-              ++(AIRct[ssaAIR[j] - 1]);
-            } else if (domain->minimum_image_check(delx,dely,delz)) {
-              neighptr[n++] = j;
-              ++(AIRct[ssaAIR[j] - 1]);
-            } else if (which > 0) {
-              neighptr[n++] = j ^ (which << SBBITS);
-              ++(AIRct[ssaAIR[j] - 1]);
-            }
-          } else {
-            neighptr[n++] = j;
-            ++(AIRct[ssaAIR[j] - 1]);
-          }
-        }
-      }
-    }
 
     ilist[inum++] = i;
     firstneigh[i] = neighptr;
@@ -227,34 +176,74 @@ void NPairHalfBinNewtonSSA::build(NeighList *list)
     ipage->vgot(n);
     if (ipage->status())
       error->one(FLERR,"Neighbor list overflow, boost neigh_modify one");
-
-    // sort the ghosts in the neighbor list by their ssaAIR number
-
-    ssaAIRptr = atom->ssaAIR;
-    qsort(&(neighptr[AIRct[0]]), n - AIRct[0], sizeof(int), cmp_ssaAIR);
-
-    // do a prefix sum on the counts to turn them into indexes
-
-    list->ndxAIR_ssa[i][0] = AIRct[0];
-    for (int ndx = 1; ndx < 8; ++ndx) {
-      list->ndxAIR_ssa[i][ndx] = AIRct[ndx] + list->ndxAIR_ssa[i][ndx - 1];
-    }
   }
 
   list->inum = inum;
+
+  // loop over AIR ghost atoms, storing their local neighbors
+  // since these are ghosts, must check if stencil bin is out of bounds
+  for (int airnum = 2; airnum <= 8; airnum++) {
+    for (i = gairhead_ssa[airnum]; i >= 0; i = bins_ssa[i]) {
+      n = 0;
+      neighptr = ipage->vget();
+
+      itype = type[i];
+      xtmp = x[i][0];
+      ytmp = x[i][1];
+      ztmp = x[i][2];
+      if (moltemplate) {
+        imol = molindex[i];
+        iatom = molatom[i];
+        tagprev = tag[i] - iatom - 1;
+      }
+
+      ibin = coord2bin(x[i],xbin,ybin,zbin);
+
+      // loop over AIR ghost atoms in all bins in "full" stencil
+      // Note: the non-AIR ghost atoms have already been filtered out
+      for (k = 0; k < nstencil_full; k++) {
+        xbin2 = xbin + stencilxyz[k][0];
+        ybin2 = ybin + stencilxyz[k][1];
+        zbin2 = zbin + stencilxyz[k][2];
+        // since we only care about ghost to local neighbors, these "bounds" could be inset
+        if (xbin2 < 0 || xbin2 >= mbinx ||
+            ybin2 < 0 || ybin2 >= mbiny ||
+            zbin2 < 0 || zbin2 >= mbinz) continue;
+        for (j = binhead_ssa[ibin+stencil[k]]; j >= 0; j = bins_ssa[j]) {
+
+          jtype = type[j];
+          if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
+
+          delx = xtmp - x[j][0];
+          dely = ytmp - x[j][1];
+          delz = ztmp - x[j][2];
+          rsq = delx*delx + dely*dely + delz*delz;
+
+          if (rsq <= cutneighsq[itype][jtype]) {
+            if (molecular) {
+              if (!moltemplate)
+                which = find_special(special[i],nspecial[i],tag[j]);
+              else if (imol >= 0)
+                which = find_special(onemols[imol]->special[iatom],
+                                     onemols[imol]->nspecial[iatom],
+                                     tag[j]-tagprev);
+              else which = 0;
+              if (which == 0) neighptr[n++] = j;
+              else if (domain->minimum_image_check(delx,dely,delz))
+                neighptr[n++] = j;
+              else if (which > 0) neighptr[n++] = j ^ (which << SBBITS);
+            } else neighptr[n++] = j;
+          }
+        }
+      }
+
+      if (n > 0) ilist[inum + (gnum++)] = i;
+      firstneigh[i] = neighptr;
+      numneigh[i] = n;
+      ipage->vgot(n);
+      if (ipage->status())
+        error->one(FLERR,"Neighbor (ghost) list overflow, boost neigh_modify one");
+    }
+  }
+  list->gnum = gnum;
 }
-
-/* ----------------------------------------------------------------------
-   comparison function invoked by qsort()
-   accesses static class member ssaAIRptr, set before call to qsort()
-------------------------------------------------------------------------- */
-
-static int cmp_ssaAIR(const void *iptr, const void *jptr)
-{
-  int i = NEIGHMASK & *((int *) iptr);
-  int j = NEIGHMASK & *((int *) jptr);
-  if (ssaAIRptr[i] < ssaAIRptr[j]) return -1;
-  if (ssaAIRptr[i] > ssaAIRptr[j]) return 1;
-  return 0;
-}
-
diff --git a/src/USER-DPD/npair_half_bin_newton_ssa.h b/src/USER-DPD/npair_half_bin_newton_ssa.h
index 13347b33b0..c9ccbc4bd9 100644
--- a/src/USER-DPD/npair_half_bin_newton_ssa.h
+++ b/src/USER-DPD/npair_half_bin_newton_ssa.h
@@ -15,7 +15,7 @@
 
 NPairStyle(half/bin/newton/ssa,
            NPairHalfBinNewtonSSA,
-           NP_HALF | NP_BIN | NP_NEWTON | NP_ORTHO | NP_SSA)
+           NP_HALF | NP_BIN | NP_NEWTON | NP_ORTHO | NP_SSA | NP_GHOST)
 
 #else
 
diff --git a/src/USER-DPD/npair_halffull_newton_ssa.cpp b/src/USER-DPD/npair_halffull_newton_ssa.cpp
index 2c9de3e50f..d0be1685b6 100644
--- a/src/USER-DPD/npair_halffull_newton_ssa.cpp
+++ b/src/USER-DPD/npair_halffull_newton_ssa.cpp
@@ -64,6 +64,10 @@ void NPairHalffullNewtonSSA::build(NeighList *list)
   int inum_full = list->listfull->inum;
 
   int inum = 0;
+
+  error->one(FLERR,"NPairHalffullNewtonSSA not yet implemented for ghosts with neighbors.");
+  return;
+
   ipage->reset();
 
   // loop over parent full list
diff --git a/src/USER-DPD/nstencil_half_bin_2d_newton_ssa.cpp b/src/USER-DPD/nstencil_half_bin_2d_newton_ssa.cpp
index df379a109a..254339bffc 100644
--- a/src/USER-DPD/nstencil_half_bin_2d_newton_ssa.cpp
+++ b/src/USER-DPD/nstencil_half_bin_2d_newton_ssa.cpp
@@ -46,8 +46,12 @@ void NStencilHalfBin2dNewtonSSA::create()
   for (j = 0; j <= sy; j++)
     for (i = -sx; i <= sx; i++)
       if (j > 0 || (j == 0 && i > 0))
-        if (bin_distance(i,j,0) < cutneighmaxsq)
+        if (bin_distance(i,j,0) < cutneighmaxsq) {
+          stencilxyz[pos][0] = i;
+          stencilxyz[pos][1] = j;
+          stencilxyz[pos][2] = 0;
           stencil[pos++] = j*mbinx + i;
+        }
 
   nstencil_half = pos; // record where normal half stencil ends
 
@@ -56,8 +60,12 @@ void NStencilHalfBin2dNewtonSSA::create()
   for (j = -sy; j <= 0; j++)
     for (i = -sx; i <= sx; i++) {
       if (j == 0 && i > 0) continue;
-      if (bin_distance(i,j,0) < cutneighmaxsq)
+      if (bin_distance(i,j,0) < cutneighmaxsq) {
+        stencilxyz[pos][0] = i;
+        stencilxyz[pos][1] = j;
+        stencilxyz[pos][2] = 0;
         stencil[pos++] = j*mbinx + i;
+      }
     }
 
   nstencil = pos; // record where full stencil ends
diff --git a/src/USER-DPD/nstencil_half_bin_2d_newton_ssa.h b/src/USER-DPD/nstencil_half_bin_2d_newton_ssa.h
index 30901bb3e2..1d5cc3f6b2 100644
--- a/src/USER-DPD/nstencil_half_bin_2d_newton_ssa.h
+++ b/src/USER-DPD/nstencil_half_bin_2d_newton_ssa.h
@@ -15,7 +15,7 @@
 
 NStencilStyle(half/bin/2d/newton/ssa,
               NStencilHalfBin2dNewtonSSA,
-              NS_HALF | NS_BIN | NS_2D | NS_NEWTON | NS_SSA | NS_ORTHO)
+              NS_HALF | NS_BIN | NS_2D | NS_NEWTON | NS_SSA | NS_ORTHO | NS_GHOST)
 
 #else
 
diff --git a/src/USER-DPD/nstencil_half_bin_3d_newton_ssa.cpp b/src/USER-DPD/nstencil_half_bin_3d_newton_ssa.cpp
index 76c9931ab2..1e2c18c66a 100644
--- a/src/USER-DPD/nstencil_half_bin_3d_newton_ssa.cpp
+++ b/src/USER-DPD/nstencil_half_bin_3d_newton_ssa.cpp
@@ -47,8 +47,12 @@ void NStencilHalfBin3dNewtonSSA::create()
     for (j = -sy; j <= sy; j++)
       for (i = -sx; i <= sx; i++)
         if (k > 0 || j > 0 || (j == 0 && i > 0))
-          if (bin_distance(i,j,k) < cutneighmaxsq)
+          if (bin_distance(i,j,k) < cutneighmaxsq) {
+            stencilxyz[pos][0] = i;
+            stencilxyz[pos][1] = j;
+            stencilxyz[pos][2] = k;
             stencil[pos++] = k*mbiny*mbinx + j*mbinx + i;
+          }
 
   nstencil_half = pos; // record where normal half stencil ends
 
@@ -57,8 +61,12 @@ void NStencilHalfBin3dNewtonSSA::create()
   for (k = -sz; k < 0; k++)
     for (j = -sy; j <= sy; j++)
       for (i = -sx; i <= sx; i++)
-        if (bin_distance(i,j,k) < cutneighmaxsq)
+        if (bin_distance(i,j,k) < cutneighmaxsq) {
+          stencilxyz[pos][0] = i;
+          stencilxyz[pos][1] = j;
+          stencilxyz[pos][2] = k;
           stencil[pos++] = k*mbiny*mbinx + j*mbinx + i;
+        }
 
   // For k==0, make sure to skip already included bins
 
@@ -66,8 +74,12 @@ void NStencilHalfBin3dNewtonSSA::create()
   for (j = -sy; j <= 0; j++)
     for (i = -sx; i <= sx; i++) {
       if (j == 0 && i > 0) continue;
-      if (bin_distance(i,j,k) < cutneighmaxsq)
+      if (bin_distance(i,j,k) < cutneighmaxsq) {
+        stencilxyz[pos][0] = i;
+        stencilxyz[pos][1] = j;
+        stencilxyz[pos][2] = k;
         stencil[pos++] = k*mbiny*mbinx + j*mbinx + i;
+      }
     }
 
   nstencil = pos; // record where full stencil ends
diff --git a/src/USER-DPD/nstencil_half_bin_3d_newton_ssa.h b/src/USER-DPD/nstencil_half_bin_3d_newton_ssa.h
index 7765b256d3..450a696e46 100644
--- a/src/USER-DPD/nstencil_half_bin_3d_newton_ssa.h
+++ b/src/USER-DPD/nstencil_half_bin_3d_newton_ssa.h
@@ -15,7 +15,7 @@
 
 NStencilStyle(half/bin/3d/newton/ssa,
               NStencilHalfBin3dNewtonSSA,
-              NS_HALF | NS_BIN | NS_3D | NS_NEWTON | NS_SSA | NS_ORTHO)
+              NS_HALF | NS_BIN | NS_3D | NS_NEWTON | NS_SSA | NS_ORTHO | NS_GHOST)
 
 #else
 
diff --git a/src/USER-DPD/nstencil_ssa.h b/src/USER-DPD/nstencil_ssa.h
index 9fcd19ee26..e6dfce60f4 100644
--- a/src/USER-DPD/nstencil_ssa.h
+++ b/src/USER-DPD/nstencil_ssa.h
@@ -20,7 +20,7 @@ namespace LAMMPS_NS {
 
 class NStencilSSA : public NStencil {
  public:
-  NStencilSSA(class LAMMPS *lmp) : NStencil(lmp) { }
+  NStencilSSA(class LAMMPS *lmp) : NStencil(lmp) { xyzflag = 1; }
   ~NStencilSSA() {}
   virtual void create() = 0;
 

From 638448676404468bda5253124d27ccf3c13a043e Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Thu, 26 Jan 2017 13:12:28 -0500
Subject: [PATCH 126/267] USER-DPD: Copy inline coord2bin() functions from
 nbin_kokkos into nbin_ssa

---
 src/USER-DPD/nbin_ssa.cpp |  5 ++-
 src/USER-DPD/nbin_ssa.h   | 72 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 76 insertions(+), 1 deletion(-)

diff --git a/src/USER-DPD/nbin_ssa.cpp b/src/USER-DPD/nbin_ssa.cpp
index c2d780bac6..82cf6e7cac 100644
--- a/src/USER-DPD/nbin_ssa.cpp
+++ b/src/USER-DPD/nbin_ssa.cpp
@@ -61,6 +61,9 @@ void NBinSSA::bin_atoms()
 
   last_bin = update->ntimestep;
 
+  bboxlo_[0] = bboxlo[0]; bboxlo_[1] = bboxlo[1]; bboxlo_[2] = bboxlo[2];
+  bboxhi_[0] = bboxhi[0]; bboxhi_[1] = bboxhi[1]; bboxhi_[2] = bboxhi[2];
+
   for (i = 0; i < 9; i++) {
     gairhead_ssa[i] = -1;
   }
@@ -91,7 +94,7 @@ void NBinSSA::bin_atoms()
     }
   }
   for (i = nlocal-1; i >= 0; i--) {
-    ibin = coord2bin(x[i]);
+    ibin = coord2bin(x[i][0], x[i][1], x[i][2]);
     bins_ssa[i] = binhead_ssa[ibin];
     binhead_ssa[ibin] = i;
   }
diff --git a/src/USER-DPD/nbin_ssa.h b/src/USER-DPD/nbin_ssa.h
index 5a2562d305..c39d7c7bce 100644
--- a/src/USER-DPD/nbin_ssa.h
+++ b/src/USER-DPD/nbin_ssa.h
@@ -42,6 +42,78 @@ class NBinSSA : public NBinStandard {
   void bin_atoms();
 
   bigint memory_usage();
+
+  inline
+  int coord2bin(const double & x,const double & y,const double & z) const
+  {
+    int ix,iy,iz;
+
+    if (x >= bboxhi_[0])
+      ix = static_cast<int> ((x-bboxhi_[0])*bininvx) + nbinx;
+    else if (x >= bboxlo_[0]) {
+      ix = static_cast<int> ((x-bboxlo_[0])*bininvx);
+      ix = MIN(ix,nbinx-1);
+    } else
+      ix = static_cast<int> ((x-bboxlo_[0])*bininvx) - 1;
+
+    if (y >= bboxhi_[1])
+      iy = static_cast<int> ((y-bboxhi_[1])*bininvy) + nbiny;
+    else if (y >= bboxlo_[1]) {
+      iy = static_cast<int> ((y-bboxlo_[1])*bininvy);
+      iy = MIN(iy,nbiny-1);
+    } else
+      iy = static_cast<int> ((y-bboxlo_[1])*bininvy) - 1;
+
+    if (z >= bboxhi_[2])
+      iz = static_cast<int> ((z-bboxhi_[2])*bininvz) + nbinz;
+    else if (z >= bboxlo_[2]) {
+      iz = static_cast<int> ((z-bboxlo_[2])*bininvz);
+      iz = MIN(iz,nbinz-1);
+    } else
+      iz = static_cast<int> ((z-bboxlo_[2])*bininvz) - 1;
+
+    return (iz-mbinzlo)*mbiny*mbinx + (iy-mbinylo)*mbinx + (ix-mbinxlo);
+  }
+
+  inline
+  int coord2bin(const double & x,const double & y,const double & z, int* i) const
+  {
+    int ix,iy,iz;
+
+    if (x >= bboxhi_[0])
+      ix = static_cast<int> ((x-bboxhi_[0])*bininvx) + nbinx;
+    else if (x >= bboxlo_[0]) {
+      ix = static_cast<int> ((x-bboxlo_[0])*bininvx);
+      ix = MIN(ix,nbinx-1);
+    } else
+      ix = static_cast<int> ((x-bboxlo_[0])*bininvx) - 1;
+
+    if (y >= bboxhi_[1])
+      iy = static_cast<int> ((y-bboxhi_[1])*bininvy) + nbiny;
+    else if (y >= bboxlo_[1]) {
+      iy = static_cast<int> ((y-bboxlo_[1])*bininvy);
+      iy = MIN(iy,nbiny-1);
+    } else
+      iy = static_cast<int> ((y-bboxlo_[1])*bininvy) - 1;
+
+    if (z >= bboxhi_[2])
+      iz = static_cast<int> ((z-bboxhi_[2])*bininvz) + nbinz;
+    else if (z >= bboxlo_[2]) {
+      iz = static_cast<int> ((z-bboxlo_[2])*bininvz);
+      iz = MIN(iz,nbinz-1);
+    } else
+      iz = static_cast<int> ((z-bboxlo_[2])*bininvz) - 1;
+
+    i[0] = ix - mbinxlo;
+    i[1] = iy - mbinylo;
+    i[2] = iz - mbinzlo;
+
+    return (iz-mbinzlo)*mbiny*mbinx + (iy-mbinylo)*mbinx + (ix-mbinxlo);
+  }
+
+ private:
+  double bboxlo_[3],bboxhi_[3];
+
 };
 
 }

From ff2786c86c4c3c1e103fefeccd0bdaeee826a4d4 Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Thu, 26 Jan 2017 14:28:54 -0500
Subject: [PATCH 127/267] USER-DPD: Make another version of coord2bin() for
 nbin_ssa

---
 src/USER-DPD/nbin_ssa.h | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/src/USER-DPD/nbin_ssa.h b/src/USER-DPD/nbin_ssa.h
index c39d7c7bce..75766ebcd2 100644
--- a/src/USER-DPD/nbin_ssa.h
+++ b/src/USER-DPD/nbin_ssa.h
@@ -111,6 +111,42 @@ class NBinSSA : public NBinStandard {
     return (iz-mbinzlo)*mbiny*mbinx + (iy-mbinylo)*mbinx + (ix-mbinxlo);
   }
 
+  inline
+  int coord2bin(const double & x,const double & y,const double & z, int &ixo, int &iyo, int &izo) const
+  {
+    int ix,iy,iz;
+
+    if (x >= bboxhi_[0])
+      ix = static_cast<int> ((x-bboxhi_[0])*bininvx) + nbinx;
+    else if (x >= bboxlo_[0]) {
+      ix = static_cast<int> ((x-bboxlo_[0])*bininvx);
+      ix = MIN(ix,nbinx-1);
+    } else
+      ix = static_cast<int> ((x-bboxlo_[0])*bininvx) - 1;
+
+    if (y >= bboxhi_[1])
+      iy = static_cast<int> ((y-bboxhi_[1])*bininvy) + nbiny;
+    else if (y >= bboxlo_[1]) {
+      iy = static_cast<int> ((y-bboxlo_[1])*bininvy);
+      iy = MIN(iy,nbiny-1);
+    } else
+      iy = static_cast<int> ((y-bboxlo_[1])*bininvy) - 1;
+
+    if (z >= bboxhi_[2])
+      iz = static_cast<int> ((z-bboxhi_[2])*bininvz) + nbinz;
+    else if (z >= bboxlo_[2]) {
+      iz = static_cast<int> ((z-bboxlo_[2])*bininvz);
+      iz = MIN(iz,nbinz-1);
+    } else
+      iz = static_cast<int> ((z-bboxlo_[2])*bininvz) - 1;
+
+    ixo = ix - mbinxlo;
+    iyo = iy - mbinylo;
+    izo = iz - mbinzlo;
+
+    return (iz-mbinzlo)*mbiny*mbinx + (iy-mbinylo)*mbinx + (ix-mbinxlo);
+  }
+
  private:
   double bboxlo_[3],bboxhi_[3];
 

From e42678ed517d8bc95f16da08329a2ae63bffe7bb Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Thu, 26 Jan 2017 16:20:12 -0500
Subject: [PATCH 128/267] USER-DPD: track & use the extent of the local atoms
 in the bins

---
 src/USER-DPD/nbin_ssa.cpp                  | 73 ++++++++++++++--------
 src/USER-DPD/nbin_ssa.h                    | 16 +++--
 src/USER-DPD/npair_half_bin_newton_ssa.cpp | 49 ++++++++++-----
 src/neigh_list.h                           |  1 +
 4 files changed, 93 insertions(+), 46 deletions(-)

diff --git a/src/USER-DPD/nbin_ssa.cpp b/src/USER-DPD/nbin_ssa.cpp
index 82cf6e7cac..321baf771a 100644
--- a/src/USER-DPD/nbin_ssa.cpp
+++ b/src/USER-DPD/nbin_ssa.cpp
@@ -30,22 +30,24 @@ using namespace LAMMPS_NS;
 NBinSSA::NBinSSA(LAMMPS *lmp) : NBinStandard(lmp)
 {
   maxbin_ssa = 0;
-  bins_ssa = NULL;
-  maxhead_ssa = 0;
-  binhead_ssa = NULL;
-  for (int i = 0; i < 9; i++) gairhead_ssa[i] = -1;
+  binlist_ssa = NULL;
+  binct_ssa = NULL;
+  for (int i = 0; i < 9; i++) {
+    gairhead_ssa[i] = -1;
+    gairct_ssa[i] = 0;
+  }
 }
 
 NBinSSA::~NBinSSA()
 {
-  memory->destroy(bins_ssa);
-  memory->destroy(binhead_ssa);
+  memory->destroy(binlist_ssa);
+  memory->destroy(binct_ssa);
 }
 
 /* ----------------------------------------------------------------------
    bin owned and ghost atoms for the Shardlow Splitting Algorithm (SSA)
-   local atoms are in distinct bins (binhead_ssa) from the ghosts
-   ghost atoms are in distinct bins (gbinhead_ssa) from the locals
+   local atoms are in distinct bins (binhead[]) from the ghosts
+   ghost atoms are "binned" in gairhead_ssa[] instead
      ghosts which are not in an Active Interaction Region (AIR) are skipped
 ------------------------------------------------------------------------- */
 
@@ -58,6 +60,7 @@ void NBinSSA::bin_atoms()
   double **x = atom->x;
   int *mask = atom->mask;
   int *ssaAIR = atom->ssaAIR;
+  int xbin,ybin,zbin;
 
   last_bin = update->ntimestep;
 
@@ -66,10 +69,13 @@ void NBinSSA::bin_atoms()
 
   for (i = 0; i < 9; i++) {
     gairhead_ssa[i] = -1;
+    gairct_ssa[i] = 0;
   }
 
   for (i = 0; i < mbins; i++) {
-    binhead_ssa[i] = -1;
+    binhead[i] = -1;
+    binlist_ssa[i] = -1;
+    binct_ssa[i] = 0;
   }
 
   // bin in reverse order so linked list will be in forward order
@@ -81,23 +87,34 @@ void NBinSSA::bin_atoms()
       ibin = ssaAIR[i];
       if (ibin < 2) continue; // skip ghost atoms not in AIR
       if (mask[i] & bitmask) {
-        bins_ssa[i] = gairhead_ssa[ibin];
+        bins[i] = gairhead_ssa[ibin];
         gairhead_ssa[ibin] = i;
+        ++(gairct_ssa[ibin]);
       }
     }
   } else {
     for (i = nall-1; i >= nlocal; i--) {
       ibin = ssaAIR[i];
       if (ibin < 2) continue; // skip ghost atoms not in AIR
-      bins_ssa[i] = gairhead_ssa[ibin];
+      bins[i] = gairhead_ssa[ibin];
       gairhead_ssa[ibin] = i;
+      ++(gairct_ssa[ibin]);
     }
   }
   for (i = nlocal-1; i >= 0; i--) {
-    ibin = coord2bin(x[i][0], x[i][1], x[i][2]);
-    bins_ssa[i] = binhead_ssa[ibin];
-    binhead_ssa[ibin] = i;
+    ibin = coord2bin(x[i][0], x[i][1], x[i][2], xbin, ybin, zbin);
+    // Find the bounding box of the local atoms in the bins
+    if (xbin < lbinxlo) lbinxlo = xbin;
+    if (xbin >= lbinxhi) lbinxhi = xbin + 1;
+    if (ybin < lbinylo) lbinylo = ybin;
+    if (ybin >= lbinyhi) lbinyhi = ybin + 1;
+    if (zbin < lbinzlo) lbinzlo = zbin;
+    if (zbin >= lbinzhi) lbinzhi = zbin + 1;
+    bins[i] = binhead[ibin];
+    binhead[ibin] = i;
+    ++(binct_ssa[ibin]);
   }
+
 }
 
 /* ---------------------------------------------------------------------- */
@@ -106,17 +123,21 @@ void NBinSSA::bin_atoms_setup(int nall)
 {
   NBinStandard::bin_atoms_setup(nall); // Setup the parent class's data too
 
-  if (mbins > maxhead_ssa) {
-    maxhead_ssa = mbins;
-    memory->destroy(binhead_ssa);
-    memory->create(binhead_ssa,maxhead_ssa,"binhead_ssa");
+  if (mbins > maxbin_ssa) {
+    maxbin_ssa = mbins;
+    memory->destroy(binlist_ssa);
+    memory->destroy(binct_ssa);
+    memory->create(binlist_ssa,maxbin_ssa,"binlist_ssa");
+    memory->create(binct_ssa,maxbin_ssa,"binct_ssa");
   }
 
-  if (nall > maxbin_ssa) {
-    maxbin_ssa = nall;
-    memory->destroy(bins_ssa);
-    memory->create(bins_ssa,maxbin_ssa,"bins_ssa");
-  }
+  // Clear the local bin extent bounding box.
+  lbinxlo = mbinx - 1; // Safe to = stencil->sx + 1
+  lbinylo = mbiny - 1; // Safe to = stencil->sy + 1
+  lbinzlo = mbinz - 1; // Safe to = stencil->sz + 1
+  lbinxhi = 0; // Safe to = mbinx - stencil->sx - 1
+  lbinyhi = 0; // Safe to = mbiny - stencil->sy - 1
+  lbinzhi = 0; // Safe to = mbinz - stencil->sz - 1
 }
 
 /* ---------------------------------------------------------------------- */
@@ -125,9 +146,9 @@ bigint NBinSSA::memory_usage()
 {
   bigint bytes = NBinStandard::memory_usage(); // Count the parent's usage too
 
-  if (maxbin_ssa) bytes += memory->usage(bins_ssa,maxbin_ssa);
-  if (maxhead_ssa) {
-    bytes += memory->usage(binhead_ssa,maxhead_ssa);
+  if (maxbin_ssa) {
+    bytes += memory->usage(binlist_ssa,maxbin_ssa);
+    bytes += memory->usage(binct_ssa,maxbin_ssa);
   }
   return bytes;
 }
diff --git a/src/USER-DPD/nbin_ssa.h b/src/USER-DPD/nbin_ssa.h
index 75766ebcd2..48694370b9 100644
--- a/src/USER-DPD/nbin_ssa.h
+++ b/src/USER-DPD/nbin_ssa.h
@@ -29,11 +29,19 @@ namespace LAMMPS_NS {
 class NBinSSA : public NBinStandard {
  public:
 
-  int *bins_ssa;             // index of next atom in each bin
-  int maxbin_ssa;            // size of bins_ssa array
-  int *binhead_ssa;          // index of 1st local atom in each bin
+  int *binlist_ssa;          // index in neighlist of 1st local atom in each bin
+  int *binct_ssa;            // count of local atoms in each bin
   int gairhead_ssa[9];       // index of 1st ghost atom in each AIR
-  int maxhead_ssa;           // size of binhead_ssa and gbinhead_ssa arrays
+  int gairct_ssa[9];         // count of ghost atoms in each AIR
+  int maxbin_ssa;            // size of binlist_ssa and binct_ssa arrays
+
+  // Bounds of the local atoms in the binhead array
+  int lbinxlo;               // lowest local bin x-dim coordinate
+  int lbinylo;               // lowest local bin y-dim coordinate
+  int lbinzlo;               // lowest local bin z-dim coordinate
+  int lbinxhi;               // highest local bin x-dim coordinate
+  int lbinyhi;               // highest local bin y-dim coordinate
+  int lbinzhi;               // highest local bin z-dim coordinate
 
   NBinSSA(class LAMMPS *);
   ~NBinSSA();
diff --git a/src/USER-DPD/npair_half_bin_newton_ssa.cpp b/src/USER-DPD/npair_half_bin_newton_ssa.cpp
index 4c9dc95308..f0860cba4b 100644
--- a/src/USER-DPD/npair_half_bin_newton_ssa.cpp
+++ b/src/USER-DPD/npair_half_bin_newton_ssa.cpp
@@ -79,20 +79,32 @@ void NPairHalfBinNewtonSSA::build(NeighList *list)
 
   NBinSSA *nb_ssa = dynamic_cast<NBinSSA*>(nb);
   if (!nb_ssa) error->one(FLERR, "NBin wasn't a NBinSSA object");
-  int *bins_ssa = nb_ssa->bins_ssa;
-  int *binhead_ssa = nb_ssa->binhead_ssa;
+  int *bins = nb_ssa->bins;
+  int *binhead = nb_ssa->binhead;
+  int *binlist_ssa = nb_ssa->binlist_ssa;
+  int *binct_ssa = nb_ssa->binct_ssa;
   int *gairhead_ssa = &(nb_ssa->gairhead_ssa[0]);
 
   int inum = 0;
   int gnum = 0;
   int xbin,ybin,zbin,xbin2,ybin2,zbin2;
   int **stencilxyz = ns_ssa->stencilxyz;
+  int lbinxlo = nb_ssa->lbinxlo;
+  int lbinxhi = nb_ssa->lbinxhi;
+  int lbinylo = nb_ssa->lbinylo;
+  int lbinyhi = nb_ssa->lbinyhi;
+  int lbinzlo = nb_ssa->lbinzlo;
+  int lbinzhi = nb_ssa->lbinzhi;
 
   ipage->reset();
 
-  // loop over owned atoms, storing half of the neighbors
-
-  for (i = 0; i < nlocal; i++) {
+  // loop over bins with local atoms, storing half of the neighbors
+  for (zbin = lbinzlo; zbin < lbinzhi; zbin++) {
+  for (ybin = lbinylo; ybin < lbinyhi; ybin++) {
+  for (xbin = lbinxlo; xbin < lbinxhi; xbin++) {
+  ibin = zbin*mbiny*mbinx + ybin*mbinx + xbin;
+  binlist_ssa[ibin] = inum; // record where ibin starts in ilist
+  for (i = binhead[ibin]; i >= 0; i = bins[i]) {
     n = 0;
     neighptr = ipage->vget();
 
@@ -109,7 +121,7 @@ void NPairHalfBinNewtonSSA::build(NeighList *list)
     // loop over rest of local atoms in i's bin
     // just store them, since j is beyond i in linked list
 
-    for (j = bins_ssa[i]; j >= 0; j = bins_ssa[j]) {
+    for (j = bins[i]; j >= 0; j = bins[j]) {
 
       jtype = type[j];
       if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
@@ -136,13 +148,11 @@ void NPairHalfBinNewtonSSA::build(NeighList *list)
       }
     }
 
-    ibin = coord2bin(x[i]);
-
     // loop over all local atoms in other bins in "half" stencil
 
     for (k = 0; k < nstencil_half; k++) {
-      for (j = binhead_ssa[ibin+stencil[k]]; j >= 0;
-           j = bins_ssa[j]) {
+      for (j = binhead[ibin+stencil[k]]; j >= 0;
+           j = bins[j]) {
 
         jtype = type[j];
         if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
@@ -177,13 +187,20 @@ void NPairHalfBinNewtonSSA::build(NeighList *list)
     if (ipage->status())
       error->one(FLERR,"Neighbor list overflow, boost neigh_modify one");
   }
+  // verify count of atoms in ibin
+  if (binct_ssa[ibin] != (inum - binlist_ssa[ibin]))
+    error->one(FLERR,"binct_ssa didn't agree with lenght in ilist");
+  }
+  }
+  }
 
   list->inum = inum;
 
   // loop over AIR ghost atoms, storing their local neighbors
   // since these are ghosts, must check if stencil bin is out of bounds
   for (int airnum = 2; airnum <= 8; airnum++) {
-    for (i = gairhead_ssa[airnum]; i >= 0; i = bins_ssa[i]) {
+    list->AIRct_ssa[airnum - 1] = nb_ssa->gairct_ssa[airnum];
+    for (i = gairhead_ssa[airnum]; i >= 0; i = bins[i]) {
       n = 0;
       neighptr = ipage->vget();
 
@@ -205,11 +222,11 @@ void NPairHalfBinNewtonSSA::build(NeighList *list)
         xbin2 = xbin + stencilxyz[k][0];
         ybin2 = ybin + stencilxyz[k][1];
         zbin2 = zbin + stencilxyz[k][2];
-        // since we only care about ghost to local neighbors, these "bounds" could be inset
-        if (xbin2 < 0 || xbin2 >= mbinx ||
-            ybin2 < 0 || ybin2 >= mbiny ||
-            zbin2 < 0 || zbin2 >= mbinz) continue;
-        for (j = binhead_ssa[ibin+stencil[k]]; j >= 0; j = bins_ssa[j]) {
+        // Skip it if this bin is outside the extent of local bins
+        if (xbin2 < lbinxlo || xbin2 >= lbinxhi ||
+            ybin2 < lbinylo || ybin2 >= lbinyhi ||
+            zbin2 < lbinzlo || zbin2 >= lbinzhi) continue;
+        for (j = binhead[ibin+stencil[k]]; j >= 0; j = bins[j]) {
 
           jtype = type[j];
           if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
diff --git a/src/neigh_list.h b/src/neigh_list.h
index 9a77a0311d..7649245e99 100644
--- a/src/neigh_list.h
+++ b/src/neigh_list.h
@@ -80,6 +80,7 @@ class NeighList : protected Pointers {
 
   // USER-DPD package and Shardlow Splitting Algorithm (SSA) support
 
+  int AIRct_ssa[8]; // count of how many atoms in each AIR
   uint16_t (*ndxAIR_ssa)[8]; // for each atom, last neighbor index of each AIR
 
   // methods

From e9d46f4e7acb79d57c982f59cbcd335e96beb10e Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Fri, 27 Jan 2017 12:31:13 -0500
Subject: [PATCH 129/267] USER-DPD: Correct an error message typo.

---
 src/USER-DPD/npair_half_bin_newton_ssa.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/USER-DPD/npair_half_bin_newton_ssa.cpp b/src/USER-DPD/npair_half_bin_newton_ssa.cpp
index f0860cba4b..b9306ee3b1 100644
--- a/src/USER-DPD/npair_half_bin_newton_ssa.cpp
+++ b/src/USER-DPD/npair_half_bin_newton_ssa.cpp
@@ -189,7 +189,7 @@ void NPairHalfBinNewtonSSA::build(NeighList *list)
   }
   // verify count of atoms in ibin
   if (binct_ssa[ibin] != (inum - binlist_ssa[ibin]))
-    error->one(FLERR,"binct_ssa didn't agree with lenght in ilist");
+    error->one(FLERR,"binct_ssa didn't agree with length in ilist");
   }
   }
   }

From fb279a87f5eac44e02319e4464a05fa62aa87794 Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Fri, 27 Jan 2017 13:24:46 -0500
Subject: [PATCH 130/267] USER-DPD: properly compute AIRct_ssa values, and use
 them in fix_shardlow. Eliminates last use of per-atom ssaAIR values within
 initial_integrate()

---
 src/USER-DPD/fix_shardlow.cpp              | 36 ++++++++++++++--------
 src/USER-DPD/npair_half_bin_newton_ssa.cpp |  8 +++--
 2 files changed, 29 insertions(+), 15 deletions(-)

diff --git a/src/USER-DPD/fix_shardlow.cpp b/src/USER-DPD/fix_shardlow.cpp
index 56597697f7..9253d17317 100644
--- a/src/USER-DPD/fix_shardlow.cpp
+++ b/src/USER-DPD/fix_shardlow.cpp
@@ -538,24 +538,34 @@ void FixShardlow::initial_integrate(int vflag)
   dtsqrt = sqrt(update->dt);
 
   ii = 0;
-  //Loop over all 14 directions (8 stages)
-  for (airnum = 1; airnum <=8; airnum++){
+  // process neighbors in the local AIR
+  while (ii < inum) {
+    i = ilist[ii];
+    int len = list->numneigh[i];
+    if (len > 0) {
+      if (useDPDE) ssa_update_dpde(i, &(list->firstneigh[i][0]), len);
+      else ssa_update_dpd(i, &(list->firstneigh[i][0]), len);
+    }
+    ii++;
+  }
 
-    if (airnum > 1) {
-      // Communicate the updated velocities to all nodes
-      comm->forward_comm_fix(this);
+  ii = inum;
+  //Loop over all 13 outward directions (7 stages)
+  for (airnum = 1; airnum <=7; airnum++){
+    int ct = list->AIRct_ssa[airnum];
 
-      if(useDPDE){
-        // Zero out the ghosts' uCond & uMech to be used as delta accumulators
-        memset(&(atom->uCond[nlocal]), 0, sizeof(double)*nghost);
-        memset(&(atom->uMech[nlocal]), 0, sizeof(double)*nghost);
-      }
+    // Communicate the updated velocities to all nodes
+    comm->forward_comm_fix(this);
+
+    if(useDPDE){
+      // Zero out the ghosts' uCond & uMech to be used as delta accumulators
+      memset(&(atom->uCond[nlocal]), 0, sizeof(double)*nghost);
+      memset(&(atom->uMech[nlocal]), 0, sizeof(double)*nghost);
     }
 
     // process neighbors in this AIR
-    while (ii < anum) {
+    while (ct-- > 0) {
       i = ilist[ii];
-      if (atom->ssaAIR[i] > airnum) break; /* done with curent AIR */
       int len = list->numneigh[i];
       if (len > 0) {
         if (useDPDE) ssa_update_dpde(i, &(list->firstneigh[i][0]), len);
@@ -565,7 +575,7 @@ void FixShardlow::initial_integrate(int vflag)
     }
 
     // Communicate the ghost deltas to the atom owners
-    if (airnum > 1) comm->reverse_comm_fix(this);
+    comm->reverse_comm_fix(this);
 
   }  //End Loop over all directions For airnum = Top, Top-Right, Right, Bottom-Right, Back
 
diff --git a/src/USER-DPD/npair_half_bin_newton_ssa.cpp b/src/USER-DPD/npair_half_bin_newton_ssa.cpp
index b9306ee3b1..cc107a55c4 100644
--- a/src/USER-DPD/npair_half_bin_newton_ssa.cpp
+++ b/src/USER-DPD/npair_half_bin_newton_ssa.cpp
@@ -199,7 +199,7 @@ void NPairHalfBinNewtonSSA::build(NeighList *list)
   // loop over AIR ghost atoms, storing their local neighbors
   // since these are ghosts, must check if stencil bin is out of bounds
   for (int airnum = 2; airnum <= 8; airnum++) {
-    list->AIRct_ssa[airnum - 1] = nb_ssa->gairct_ssa[airnum];
+    int locAIRct = 0;
     for (i = gairhead_ssa[airnum]; i >= 0; i = bins[i]) {
       n = 0;
       neighptr = ipage->vget();
@@ -254,13 +254,17 @@ void NPairHalfBinNewtonSSA::build(NeighList *list)
         }
       }
 
-      if (n > 0) ilist[inum + (gnum++)] = i;
+      if (n > 0) {
+        ilist[inum + (gnum++)] = i;
+        ++locAIRct;
+      }
       firstneigh[i] = neighptr;
       numneigh[i] = n;
       ipage->vgot(n);
       if (ipage->status())
         error->one(FLERR,"Neighbor (ghost) list overflow, boost neigh_modify one");
     }
+    list->AIRct_ssa[airnum - 1] = locAIRct;
   }
   list->gnum = gnum;
 }

From 3dddeef365cfcab63e909d8388cc0f674419f6cd Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Fri, 27 Jan 2017 14:02:56 -0500
Subject: [PATCH 131/267] USER-DPD: remove unneeded gairct_ssa[] & anum vars,
 and some > 0 guards

---
 src/USER-DPD/fix_shardlow.cpp              | 15 +++++----------
 src/USER-DPD/nbin_ssa.cpp                  |  4 ----
 src/USER-DPD/nbin_ssa.h                    |  1 -
 src/USER-DPD/npair_half_bin_newton_ssa.cpp |  9 ++++-----
 4 files changed, 9 insertions(+), 20 deletions(-)

diff --git a/src/USER-DPD/fix_shardlow.cpp b/src/USER-DPD/fix_shardlow.cpp
index 9253d17317..4fa323a9d8 100644
--- a/src/USER-DPD/fix_shardlow.cpp
+++ b/src/USER-DPD/fix_shardlow.cpp
@@ -499,7 +499,7 @@ void FixShardlow::ssa_update_dpde(
 
 void FixShardlow::initial_integrate(int vflag)
 {
-  int i,ii,inum,anum;
+  int i,ii,inum;
   int *ilist;
 
   int nlocal = atom->nlocal;
@@ -532,7 +532,6 @@ void FixShardlow::initial_integrate(int vflag)
   v_t0 = (double (*)[3]) memory->smalloc(sizeof(double)*3*nghost, "FixShardlow:v_t0");
 
   inum = list->inum;
-  anum = inum + list->gnum;
   ilist = list->ilist;
 
   dtsqrt = sqrt(update->dt);
@@ -542,10 +541,8 @@ void FixShardlow::initial_integrate(int vflag)
   while (ii < inum) {
     i = ilist[ii];
     int len = list->numneigh[i];
-    if (len > 0) {
-      if (useDPDE) ssa_update_dpde(i, &(list->firstneigh[i][0]), len);
-      else ssa_update_dpd(i, &(list->firstneigh[i][0]), len);
-    }
+    if (useDPDE) ssa_update_dpde(i, &(list->firstneigh[i][0]), len);
+    else ssa_update_dpd(i, &(list->firstneigh[i][0]), len);
     ii++;
   }
 
@@ -567,10 +564,8 @@ void FixShardlow::initial_integrate(int vflag)
     while (ct-- > 0) {
       i = ilist[ii];
       int len = list->numneigh[i];
-      if (len > 0) {
-        if (useDPDE) ssa_update_dpde(i, &(list->firstneigh[i][0]), len);
-        else ssa_update_dpd(i, &(list->firstneigh[i][0]), len);
-      }
+      if (useDPDE) ssa_update_dpde(i, &(list->firstneigh[i][0]), len);
+      else ssa_update_dpd(i, &(list->firstneigh[i][0]), len);
       ii++;
     }
 
diff --git a/src/USER-DPD/nbin_ssa.cpp b/src/USER-DPD/nbin_ssa.cpp
index 321baf771a..7ea2117300 100644
--- a/src/USER-DPD/nbin_ssa.cpp
+++ b/src/USER-DPD/nbin_ssa.cpp
@@ -34,7 +34,6 @@ NBinSSA::NBinSSA(LAMMPS *lmp) : NBinStandard(lmp)
   binct_ssa = NULL;
   for (int i = 0; i < 9; i++) {
     gairhead_ssa[i] = -1;
-    gairct_ssa[i] = 0;
   }
 }
 
@@ -69,7 +68,6 @@ void NBinSSA::bin_atoms()
 
   for (i = 0; i < 9; i++) {
     gairhead_ssa[i] = -1;
-    gairct_ssa[i] = 0;
   }
 
   for (i = 0; i < mbins; i++) {
@@ -89,7 +87,6 @@ void NBinSSA::bin_atoms()
       if (mask[i] & bitmask) {
         bins[i] = gairhead_ssa[ibin];
         gairhead_ssa[ibin] = i;
-        ++(gairct_ssa[ibin]);
       }
     }
   } else {
@@ -98,7 +95,6 @@ void NBinSSA::bin_atoms()
       if (ibin < 2) continue; // skip ghost atoms not in AIR
       bins[i] = gairhead_ssa[ibin];
       gairhead_ssa[ibin] = i;
-      ++(gairct_ssa[ibin]);
     }
   }
   for (i = nlocal-1; i >= 0; i--) {
diff --git a/src/USER-DPD/nbin_ssa.h b/src/USER-DPD/nbin_ssa.h
index 48694370b9..4ec376200c 100644
--- a/src/USER-DPD/nbin_ssa.h
+++ b/src/USER-DPD/nbin_ssa.h
@@ -32,7 +32,6 @@ class NBinSSA : public NBinStandard {
   int *binlist_ssa;          // index in neighlist of 1st local atom in each bin
   int *binct_ssa;            // count of local atoms in each bin
   int gairhead_ssa[9];       // index of 1st ghost atom in each AIR
-  int gairct_ssa[9];         // count of ghost atoms in each AIR
   int maxbin_ssa;            // size of binlist_ssa and binct_ssa arrays
 
   // Bounds of the local atoms in the binhead array
diff --git a/src/USER-DPD/npair_half_bin_newton_ssa.cpp b/src/USER-DPD/npair_half_bin_newton_ssa.cpp
index cc107a55c4..ccc41d1fc4 100644
--- a/src/USER-DPD/npair_half_bin_newton_ssa.cpp
+++ b/src/USER-DPD/npair_half_bin_newton_ssa.cpp
@@ -180,21 +180,20 @@ void NPairHalfBinNewtonSSA::build(NeighList *list)
       }
     }
 
-    ilist[inum++] = i;
+    if (n > 0) {
+      ilist[inum++] = i;
+    }
     firstneigh[i] = neighptr;
     numneigh[i] = n;
     ipage->vgot(n);
     if (ipage->status())
       error->one(FLERR,"Neighbor list overflow, boost neigh_modify one");
   }
-  // verify count of atoms in ibin
-  if (binct_ssa[ibin] != (inum - binlist_ssa[ibin]))
-    error->one(FLERR,"binct_ssa didn't agree with length in ilist");
   }
   }
   }
 
-  list->inum = inum;
+  list->AIRct_ssa[0] = list->inum = inum;
 
   // loop over AIR ghost atoms, storing their local neighbors
   // since these are ghosts, must check if stencil bin is out of bounds

From f73c9a43aba9b5ff0837360ea7c3d4925906e5f6 Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Fri, 27 Jan 2017 16:45:15 -0500
Subject: [PATCH 132/267] USER-DPD: remove broken code for building SSA half
 neighbor list from full

---
 src/USER-DPD/npair_halffull_newton_ssa.cpp | 136 ---------------------
 src/USER-DPD/npair_halffull_newton_ssa.h   |  44 -------
 2 files changed, 180 deletions(-)
 delete mode 100644 src/USER-DPD/npair_halffull_newton_ssa.cpp
 delete mode 100644 src/USER-DPD/npair_halffull_newton_ssa.h

diff --git a/src/USER-DPD/npair_halffull_newton_ssa.cpp b/src/USER-DPD/npair_halffull_newton_ssa.cpp
deleted file mode 100644
index d0be1685b6..0000000000
--- a/src/USER-DPD/npair_halffull_newton_ssa.cpp
+++ /dev/null
@@ -1,136 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   Contributing authors:
-   James Larentzos and Timothy I. Mattox (Engility Corporation)
-------------------------------------------------------------------------- */
-
-#include "npair_halffull_newton_ssa.h"
-#include "neighbor.h"
-#include "neigh_list.h"
-#include "atom.h"
-#include "atom_vec.h"
-#include "molecule.h"
-#include "domain.h"
-#include "my_page.h"
-#include "error.h"
-
-using namespace LAMMPS_NS;
-
-// allocate space for static class variable
-// prototype for non-class function
-
-static int *ssaAIRptr;
-static int cmp_ssaAIR(const void *, const void *);
-
-/* ---------------------------------------------------------------------- */
-
-NPairHalffullNewtonSSA::NPairHalffullNewtonSSA(LAMMPS *lmp) : NPair(lmp) {}
-
-/* ----------------------------------------------------------------------
-   build half list from full list for use by Shardlow Spliting Algorithm
-   pair stored once if i,j are both owned and i < j
-   if j is ghost, only store if j coords are "above and to the right" of i
-   works if full list is a skip list
-------------------------------------------------------------------------- */
-
-void NPairHalffullNewtonSSA::build(NeighList *list)
-{
-  int i,j,ii,jj,n,jnum,joriginal;
-  int *neighptr,*jlist;
-
-  int nlocal = atom->nlocal;
-  int *ssaAIR = atom->ssaAIR;
-
-  int *ilist = list->ilist;
-  int *numneigh = list->numneigh;
-  int **firstneigh = list->firstneigh;
-  MyPage<int> *ipage = list->ipage;
-
-  int *ilist_full = list->listfull->ilist;
-  int *numneigh_full = list->listfull->numneigh;
-  int **firstneigh_full = list->listfull->firstneigh;
-  int inum_full = list->listfull->inum;
-
-  int inum = 0;
-
-  error->one(FLERR,"NPairHalffullNewtonSSA not yet implemented for ghosts with neighbors.");
-  return;
-
-  ipage->reset();
-
-  // loop over parent full list
-
-  for (ii = 0; ii < inum_full; ii++) {
-    int AIRct[8] = { 0 };
-    n = 0;
-    neighptr = ipage->vget();
-
-    i = ilist_full[ii];
-
-    // loop over full neighbor list
-
-    jlist = firstneigh_full[i];
-    jnum = numneigh_full[i];
-
-    for (jj = 0; jj < jnum; jj++) {
-      joriginal = jlist[jj];
-      j = joriginal & NEIGHMASK;
-      if (j < nlocal) {
-        if (i > j) continue;
-        ++(AIRct[0]);
-      } else {
-        if (ssaAIR[j] < 2) continue; // skip ghost atoms not in AIR
-        ++(AIRct[ssaAIR[j] - 1]);
-      }
-      neighptr[n++] = joriginal;
-    }
-
-    ilist[inum++] = i;
-    firstneigh[i] = neighptr;
-    numneigh[i] = n;
-    ipage->vgot(n);
-    if (ipage->status())
-      error->one(FLERR,"Neighbor list overflow, boost neigh_modify one");
-
-    // sort the locals+ghosts in the neighbor list by their ssaAIR number
-
-    ssaAIRptr = atom->ssaAIR;
-    qsort(&(neighptr[0]), n, sizeof(int), cmp_ssaAIR);
-
-    // do a prefix sum on the counts to turn them into indexes
-
-    list->ndxAIR_ssa[i][0] = AIRct[0];
-    for (int ndx = 1; ndx < 8; ++ndx) {
-      list->ndxAIR_ssa[i][ndx] = AIRct[ndx] + list->ndxAIR_ssa[i][ndx - 1];
-    }
-  }
-
-  list->inum = inum;
-}
-
-/* ----------------------------------------------------------------------
-   comparison function invoked by qsort()
-   accesses static class member ssaAIRptr, set before call to qsort()
-------------------------------------------------------------------------- */
-
-static int cmp_ssaAIR(const void *iptr, const void *jptr)
-{
-  int i = NEIGHMASK & *((int *) iptr);
-  int j = NEIGHMASK & *((int *) jptr);
-  if (ssaAIRptr[i] < ssaAIRptr[j]) return -1;
-  if (ssaAIRptr[i] > ssaAIRptr[j]) return 1;
-  return 0;
-}
-
diff --git a/src/USER-DPD/npair_halffull_newton_ssa.h b/src/USER-DPD/npair_halffull_newton_ssa.h
deleted file mode 100644
index 03903815b1..0000000000
--- a/src/USER-DPD/npair_halffull_newton_ssa.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-#ifdef NPAIR_CLASS
-
-NPairStyle(halffull/newton/ssa,
-           NPairHalffullNewtonSSA,
-           NP_HALF_FULL | NP_NSQ | NP_BIN | NP_MULTI | NP_NEWTON |
-           NP_ORTHO | NP_TRI | NP_SSA)
-
-#else
-
-#ifndef LMP_NPAIR_HALFFULL_NEWTON_SSA_H
-#define LMP_NPAIR_HALFFULL_NEWTON_SSA_H
-
-#include "npair.h"
-
-namespace LAMMPS_NS {
-
-class NPairHalffullNewtonSSA : public NPair {
- public:
-  NPairHalffullNewtonSSA(class LAMMPS *);
-  ~NPairHalffullNewtonSSA() {}
-  void build(class NeighList *);
-};
-
-}
-
-#endif
-#endif
-
-/* ERROR/WARNING messages:
-
-*/

From 641bb4bb16c8f852af189ba1e71cd7b035e0c8e2 Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Fri, 27 Jan 2017 16:47:19 -0500
Subject: [PATCH 133/267] USER-DPD: remove use of ssaAIR[], move coord2ssaAIR()
 to nbin_ssa.cpp. Saves an int per atom and ghost; also simplifies and reduces
 code size.

---
 src/USER-DPD/fix_shardlow.cpp | 116 ----------------------------------
 src/USER-DPD/fix_shardlow.h   |  13 ----
 src/USER-DPD/nbin_ssa.cpp     |  40 +++++++++++-
 src/USER-DPD/nbin_ssa.h       |   1 +
 4 files changed, 38 insertions(+), 132 deletions(-)

diff --git a/src/USER-DPD/fix_shardlow.cpp b/src/USER-DPD/fix_shardlow.cpp
index 4fa323a9d8..05bf1602f9 100644
--- a/src/USER-DPD/fix_shardlow.cpp
+++ b/src/USER-DPD/fix_shardlow.cpp
@@ -109,26 +109,12 @@ FixShardlow::FixShardlow(LAMMPS *lmp, int narg, char **arg) :
   if(pairDPD == NULL && pairDPDE == NULL)
     error->all(FLERR,"Must use pair_style dpd/fdt or dpd/fdt/energy with fix shardlow");
 
-  // Setup the ssaAIR array
-  atom->ssaAIR = NULL;
-  grow_arrays(atom->nmax);
-  memset(atom->ssaAIR, 0, sizeof(int)*atom->nlocal);
-
-  // Setup callbacks for maintaining atom->ssaAIR[]
-  atom->add_callback(0); // grow (aka exchange)
-  atom->add_callback(1); // restart
-  atom->add_callback(2); // border
 }
 
 /* ---------------------------------------------------------------------- */
 
 FixShardlow::~FixShardlow()
 {
-  atom->delete_callback(id, 0);
-  atom->delete_callback(id, 1);
-  atom->delete_callback(id, 2);
-
-  memory->destroy(atom->ssaAIR);
 }
 
 /* ---------------------------------------------------------------------- */
@@ -137,7 +123,6 @@ int FixShardlow::setmask()
 {
   int mask = 0;
   mask |= INITIAL_INTEGRATE;
-  mask |= PRE_EXCHANGE | MIN_PRE_EXCHANGE;
   return mask;
 }
 
@@ -161,27 +146,6 @@ void FixShardlow::init_list(int id, NeighList *ptr)
 
 /* ---------------------------------------------------------------------- */
 
-void FixShardlow::pre_exchange()
-{
-  memset(atom->ssaAIR, 0, sizeof(int)*atom->nlocal);
-}
-
-/* ---------------------------------------------------------------------- */
-
-void FixShardlow::setup_pre_exchange()
-{
-  memset(atom->ssaAIR, 0, sizeof(int)*atom->nlocal);
-}
-
-/* ---------------------------------------------------------------------- */
-
-void FixShardlow::min_pre_exchange()
-{
-  memset(atom->ssaAIR, 0, sizeof(int)*atom->nlocal);
-}
-
-/* ---------------------------------------------------------------------- */
-
 void FixShardlow::setup(int vflag)
 {
   bool fixShardlow = false;
@@ -659,91 +623,11 @@ void FixShardlow::unpack_reverse_comm(int n, int *list, double *buf)
   }
 }
 
-/* ----------------------------------------------------------------------
-   convert atom coords into the ssa active interaction region number
-------------------------------------------------------------------------- */
-
-int FixShardlow::coord2ssaAIR(double *x)
-{
-  int ix, iy, iz;
-
-  ix = iy = iz = 0;
-  if (x[2] < domain->sublo[2]) iz = -1;
-  if (x[2] >= domain->subhi[2]) iz = 1;
-  if (x[1] < domain->sublo[1]) iy = -1;
-  if (x[1] >= domain->subhi[1]) iy = 1;
-  if (x[0] < domain->sublo[0]) ix = -1;
-  if (x[0] >= domain->subhi[0]) ix = 1;
-
-  if(iz < 0){
-    return -1;
-  } else if(iz == 0){
-    if( iy<0 ) return -1; // bottom left/middle/right
-    if( (iy==0) && (ix<0)  ) return -1; // left atoms
-    if( (iy==0) && (ix==0) ) return 0; // Locally owned atoms
-    if( (iy==0) && (ix>0)  ) return 3; // Right atoms
-    if( (iy>0)  && (ix==0) ) return 2; // Top-middle atoms
-    if( (iy>0)  && (ix!=0) ) return 4; // Top-right and top-left atoms
-  } else { // iz > 0
-    if((ix==0) && (iy==0)) return 5; // Back atoms
-    if((ix==0) && (iy!=0)) return 6; // Top-back and bottom-back atoms
-    if((ix!=0) && (iy==0)) return 7; // Left-back and right-back atoms
-    if((ix!=0) && (iy!=0)) return 8; // Back corner atoms
-  }
-
-  return -2;
-}
-
 /* ---------------------------------------------------------------------- */
 
-void FixShardlow::grow_arrays(int nmax)
-{
-  memory->grow(atom->ssaAIR,nmax,"fix_shardlow:ssaAIR");
-}
-
-void FixShardlow::copy_arrays(int i, int j, int delflag)
-{
-  atom->ssaAIR[j] = atom->ssaAIR[i];
-}
-
-void FixShardlow::set_arrays(int i)
-{
-  atom->ssaAIR[i] = 0; /* coord2ssaAIR(x[i]) */
-}
-
-int FixShardlow::pack_border(int n, int *list, double *buf)
-{
-  for (int i = 0; i < n; i++) {
-    int j = list[i];
-    if (atom->ssaAIR[j] == 0) atom->ssaAIR[j] = 1; // not purely local anymore
-  }
-  return 0;
-}
-
-int FixShardlow::unpack_border(int n, int first, double *buf)
-{
-  int i,last = first + n;
-  for (i = first; i < last; i++) {
-    atom->ssaAIR[i] = coord2ssaAIR(atom->x[i]);
-  }
-  return 0;
-}
-
-int FixShardlow::unpack_exchange(int i, double *buf)
-{
-  atom->ssaAIR[i] = 0; /* coord2ssaAIR(x[i]) */
-  return 0;
-}
-
-void FixShardlow::unpack_restart(int i, int nth)
-{
-  atom->ssaAIR[i] = 0; /* coord2ssaAIR(x[i]) */
-}
-
 double FixShardlow::memory_usage()
 {
   double bytes = 0.0;
-  bytes += memory->usage(atom->ssaAIR,atom->nmax);
   bytes += sizeof(double)*3*atom->nghost; // v_t0[]
   return bytes;
 }
diff --git a/src/USER-DPD/fix_shardlow.h b/src/USER-DPD/fix_shardlow.h
index 2ffb96ae7c..6fd438b8f0 100644
--- a/src/USER-DPD/fix_shardlow.h
+++ b/src/USER-DPD/fix_shardlow.h
@@ -35,18 +35,6 @@ class FixShardlow : public Fix {
   virtual void init_list(int, class NeighList *);
   virtual void setup(int);
   virtual void initial_integrate(int);
-  void setup_pre_exchange();
-  void pre_exchange();
-  void min_pre_exchange();
-
-  void grow_arrays(int);
-  void copy_arrays(int, int, int);
-  void set_arrays(int);
-
-  int pack_border(int, int *, double *);
-  int unpack_border(int, int, double *);
-  int unpack_exchange(int, double *);
-  void unpack_restart(int, int);
 
   double memory_usage();
 
@@ -63,7 +51,6 @@ class FixShardlow : public Fix {
  private:
   double dtsqrt; // = sqrt(update->dt);
 
-  int coord2ssaAIR(double *);  // map atom coord to an AIR number
   void ssa_update_dpd(int, int *, int);  // Constant Temperature
   void ssa_update_dpde(int, int *, int); // Constant Energy
 
diff --git a/src/USER-DPD/nbin_ssa.cpp b/src/USER-DPD/nbin_ssa.cpp
index 7ea2117300..25a2fb3b35 100644
--- a/src/USER-DPD/nbin_ssa.cpp
+++ b/src/USER-DPD/nbin_ssa.cpp
@@ -20,6 +20,7 @@
 #include "atom.h"
 #include "update.h"
 #include "group.h"
+#include "domain.h"
 #include "memory.h"
 #include "error.h"
 
@@ -58,7 +59,6 @@ void NBinSSA::bin_atoms()
   if (includegroup) nlocal = atom->nfirst;
   double **x = atom->x;
   int *mask = atom->mask;
-  int *ssaAIR = atom->ssaAIR;
   int xbin,ybin,zbin;
 
   last_bin = update->ntimestep;
@@ -82,7 +82,7 @@ void NBinSSA::bin_atoms()
     int bitmask = group->bitmask[includegroup];
     int nowned = atom->nlocal; // NOTE: nlocal was set to atom->nfirst above
     for (i = nall-1; i >= nowned; i--) {
-      ibin = ssaAIR[i];
+      ibin = coord2ssaAIR(x[i]);
       if (ibin < 2) continue; // skip ghost atoms not in AIR
       if (mask[i] & bitmask) {
         bins[i] = gairhead_ssa[ibin];
@@ -91,7 +91,7 @@ void NBinSSA::bin_atoms()
     }
   } else {
     for (i = nall-1; i >= nlocal; i--) {
-      ibin = ssaAIR[i];
+      ibin = coord2ssaAIR(x[i]);
       if (ibin < 2) continue; // skip ghost atoms not in AIR
       bins[i] = gairhead_ssa[ibin];
       gairhead_ssa[ibin] = i;
@@ -148,3 +148,37 @@ bigint NBinSSA::memory_usage()
   }
   return bytes;
 }
+
+/* ----------------------------------------------------------------------
+   convert atom coords into the ssa active interaction region number
+------------------------------------------------------------------------- */
+int NBinSSA::coord2ssaAIR(const double *x)
+{
+  int ix, iy, iz;
+
+  ix = iy = iz = 0;
+  if (x[2] < domain->sublo[2]) iz = -1;
+  if (x[2] >= domain->subhi[2]) iz = 1;
+  if (x[1] < domain->sublo[1]) iy = -1;
+  if (x[1] >= domain->subhi[1]) iy = 1;
+  if (x[0] < domain->sublo[0]) ix = -1;
+  if (x[0] >= domain->subhi[0]) ix = 1;
+
+  if(iz < 0){
+    return -1;
+  } else if(iz == 0){
+    if( iy<0 ) return -1; // bottom left/middle/right
+    if( (iy==0) && (ix<0)  ) return -1; // left atoms
+    if( (iy==0) && (ix==0) ) return 0; // Locally owned atoms
+    if( (iy==0) && (ix>0)  ) return 3; // Right atoms
+    if( (iy>0)  && (ix==0) ) return 2; // Top-middle atoms
+    if( (iy>0)  && (ix!=0) ) return 4; // Top-right and top-left atoms
+  } else { // iz > 0
+    if((ix==0) && (iy==0)) return 5; // Back atoms
+    if((ix==0) && (iy!=0)) return 6; // Top-back and bottom-back atoms
+    if((ix!=0) && (iy==0)) return 7; // Left-back and right-back atoms
+    if((ix!=0) && (iy!=0)) return 8; // Back corner atoms
+  }
+
+  return -2;
+}
diff --git a/src/USER-DPD/nbin_ssa.h b/src/USER-DPD/nbin_ssa.h
index 4ec376200c..5db5a0fa41 100644
--- a/src/USER-DPD/nbin_ssa.h
+++ b/src/USER-DPD/nbin_ssa.h
@@ -155,6 +155,7 @@ class NBinSSA : public NBinStandard {
   }
 
  private:
+  int coord2ssaAIR(const double *);  // map atom coord to an AIR number
   double bboxlo_[3],bboxhi_[3];
 
 };

From ce2da5068b6e77c5a508e554ab824dc54328bd21 Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Mon, 30 Jan 2017 13:01:28 -0500
Subject: [PATCH 134/267] USER-DPD: renumber AIRs back to 1-7 for ghosts, and
 just 0 for locals. This removes the distinction between pure and impure
 locals. Pure and impure locals messed up the directionality of half neighbor
 lists, which turns out to be crucial to the approach for SSA with Kokkos.

---
 src/USER-DPD/nbin_ssa.cpp                  | 22 +++++++++++-----------
 src/USER-DPD/nbin_ssa.h                    |  2 +-
 src/USER-DPD/npair_half_bin_newton_ssa.cpp |  4 ++--
 3 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/src/USER-DPD/nbin_ssa.cpp b/src/USER-DPD/nbin_ssa.cpp
index 25a2fb3b35..7e603af714 100644
--- a/src/USER-DPD/nbin_ssa.cpp
+++ b/src/USER-DPD/nbin_ssa.cpp
@@ -33,7 +33,7 @@ NBinSSA::NBinSSA(LAMMPS *lmp) : NBinStandard(lmp)
   maxbin_ssa = 0;
   binlist_ssa = NULL;
   binct_ssa = NULL;
-  for (int i = 0; i < 9; i++) {
+  for (int i = 0; i < 8; i++) {
     gairhead_ssa[i] = -1;
   }
 }
@@ -66,7 +66,7 @@ void NBinSSA::bin_atoms()
   bboxlo_[0] = bboxlo[0]; bboxlo_[1] = bboxlo[1]; bboxlo_[2] = bboxlo[2];
   bboxhi_[0] = bboxhi[0]; bboxhi_[1] = bboxhi[1]; bboxhi_[2] = bboxhi[2];
 
-  for (i = 0; i < 9; i++) {
+  for (i = 0; i < 8; i++) {
     gairhead_ssa[i] = -1;
   }
 
@@ -83,7 +83,7 @@ void NBinSSA::bin_atoms()
     int nowned = atom->nlocal; // NOTE: nlocal was set to atom->nfirst above
     for (i = nall-1; i >= nowned; i--) {
       ibin = coord2ssaAIR(x[i]);
-      if (ibin < 2) continue; // skip ghost atoms not in AIR
+      if (ibin < 1) continue; // skip ghost atoms not in AIR
       if (mask[i] & bitmask) {
         bins[i] = gairhead_ssa[ibin];
         gairhead_ssa[ibin] = i;
@@ -92,7 +92,7 @@ void NBinSSA::bin_atoms()
   } else {
     for (i = nall-1; i >= nlocal; i--) {
       ibin = coord2ssaAIR(x[i]);
-      if (ibin < 2) continue; // skip ghost atoms not in AIR
+      if (ibin < 1) continue; // skip ghost atoms not in AIR
       bins[i] = gairhead_ssa[ibin];
       gairhead_ssa[ibin] = i;
     }
@@ -170,14 +170,14 @@ int NBinSSA::coord2ssaAIR(const double *x)
     if( iy<0 ) return -1; // bottom left/middle/right
     if( (iy==0) && (ix<0)  ) return -1; // left atoms
     if( (iy==0) && (ix==0) ) return 0; // Locally owned atoms
-    if( (iy==0) && (ix>0)  ) return 3; // Right atoms
-    if( (iy>0)  && (ix==0) ) return 2; // Top-middle atoms
-    if( (iy>0)  && (ix!=0) ) return 4; // Top-right and top-left atoms
+    if( (iy==0) && (ix>0)  ) return 2; // Right atoms
+    if( (iy>0)  && (ix==0) ) return 1; // Top-middle atoms
+    if( (iy>0)  && (ix!=0) ) return 3; // Top-right and top-left atoms
   } else { // iz > 0
-    if((ix==0) && (iy==0)) return 5; // Back atoms
-    if((ix==0) && (iy!=0)) return 6; // Top-back and bottom-back atoms
-    if((ix!=0) && (iy==0)) return 7; // Left-back and right-back atoms
-    if((ix!=0) && (iy!=0)) return 8; // Back corner atoms
+    if((ix==0) && (iy==0)) return 4; // Back atoms
+    if((ix==0) && (iy!=0)) return 5; // Top-back and bottom-back atoms
+    if((ix!=0) && (iy==0)) return 6; // Left-back and right-back atoms
+    if((ix!=0) && (iy!=0)) return 7; // Back corner atoms
   }
 
   return -2;
diff --git a/src/USER-DPD/nbin_ssa.h b/src/USER-DPD/nbin_ssa.h
index 5db5a0fa41..f26f8c77f0 100644
--- a/src/USER-DPD/nbin_ssa.h
+++ b/src/USER-DPD/nbin_ssa.h
@@ -31,7 +31,7 @@ class NBinSSA : public NBinStandard {
 
   int *binlist_ssa;          // index in neighlist of 1st local atom in each bin
   int *binct_ssa;            // count of local atoms in each bin
-  int gairhead_ssa[9];       // index of 1st ghost atom in each AIR
+  int gairhead_ssa[8];       // index of 1st ghost atom in each AIR
   int maxbin_ssa;            // size of binlist_ssa and binct_ssa arrays
 
   // Bounds of the local atoms in the binhead array
diff --git a/src/USER-DPD/npair_half_bin_newton_ssa.cpp b/src/USER-DPD/npair_half_bin_newton_ssa.cpp
index ccc41d1fc4..f3b7094bd8 100644
--- a/src/USER-DPD/npair_half_bin_newton_ssa.cpp
+++ b/src/USER-DPD/npair_half_bin_newton_ssa.cpp
@@ -197,7 +197,7 @@ void NPairHalfBinNewtonSSA::build(NeighList *list)
 
   // loop over AIR ghost atoms, storing their local neighbors
   // since these are ghosts, must check if stencil bin is out of bounds
-  for (int airnum = 2; airnum <= 8; airnum++) {
+  for (int airnum = 1; airnum <= 7; airnum++) {
     int locAIRct = 0;
     for (i = gairhead_ssa[airnum]; i >= 0; i = bins[i]) {
       n = 0;
@@ -263,7 +263,7 @@ void NPairHalfBinNewtonSSA::build(NeighList *list)
       if (ipage->status())
         error->one(FLERR,"Neighbor (ghost) list overflow, boost neigh_modify one");
     }
-    list->AIRct_ssa[airnum - 1] = locAIRct;
+    list->AIRct_ssa[airnum] = locAIRct;
   }
   list->gnum = gnum;
 }

From ee83b755eae4f3167fd2ab8b50bbf86cab3407ee Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Mon, 30 Jan 2017 14:32:18 -0500
Subject: [PATCH 135/267] USER-DPD: Split the SSA stencil and neighbor list
 into subphases. NOTE: pair evaluation order changes, causing numerical
 differences! This enables processing neighbors in subphase groups that
 enforce a geometrical separation of pairs, allowing greater parallelism once
 fix_shardlow (SSA) is converted to Kokkos.

---
 src/USER-DPD/npair_half_bin_newton_ssa.cpp    |  52 +++++----
 .../nstencil_half_bin_2d_newton_ssa.cpp       |  60 ++++++++--
 .../nstencil_half_bin_3d_newton_ssa.cpp       | 110 ++++++++++++++----
 src/USER-DPD/nstencil_ssa.h                   |   2 +-
 4 files changed, 165 insertions(+), 59 deletions(-)

diff --git a/src/USER-DPD/npair_half_bin_newton_ssa.cpp b/src/USER-DPD/npair_half_bin_newton_ssa.cpp
index f3b7094bd8..77b20966b0 100644
--- a/src/USER-DPD/npair_half_bin_newton_ssa.cpp
+++ b/src/USER-DPD/npair_half_bin_newton_ssa.cpp
@@ -74,7 +74,7 @@ void NPairHalfBinNewtonSSA::build(NeighList *list)
 
   NStencilSSA *ns_ssa = dynamic_cast<NStencilSSA*>(ns);
   if (!ns_ssa) error->one(FLERR, "NStencil wasn't a NStencilSSA object");
-  int nstencil_half = ns_ssa->nstencil_half;
+  int *nstencil_ssa = &(ns_ssa->nstencil_ssa[0]);
   int nstencil_full = ns_ssa->nstencil;
 
   NBinSSA *nb_ssa = dynamic_cast<NBinSSA*>(nb);
@@ -150,34 +150,38 @@ void NPairHalfBinNewtonSSA::build(NeighList *list)
 
     // loop over all local atoms in other bins in "half" stencil
 
-    for (k = 0; k < nstencil_half; k++) {
-      for (j = binhead[ibin+stencil[k]]; j >= 0;
-           j = bins[j]) {
+    k = 0;
+    for (int subphase = 0; subphase < 4; subphase++) {
+      for (; k < nstencil_ssa[subphase]; k++) {
+        for (j = binhead[ibin+stencil[k]]; j >= 0;
+             j = bins[j]) {
 
-        jtype = type[j];
-        if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
+          jtype = type[j];
+          if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
 
-        delx = xtmp - x[j][0];
-        dely = ytmp - x[j][1];
-        delz = ztmp - x[j][2];
-        rsq = delx*delx + dely*dely + delz*delz;
+          delx = xtmp - x[j][0];
+          dely = ytmp - x[j][1];
+          delz = ztmp - x[j][2];
+          rsq = delx*delx + dely*dely + delz*delz;
 
-        if (rsq <= cutneighsq[itype][jtype]) {
-          if (molecular) {
-            if (!moltemplate)
-              which = find_special(special[i],nspecial[i],tag[j]);
-            else if (imol >= 0)
-              which = find_special(onemols[imol]->special[iatom],
-                                   onemols[imol]->nspecial[iatom],
-                                   tag[j]-tagprev);
-            else which = 0;
-            if (which == 0) neighptr[n++] = j;
-            else if (domain->minimum_image_check(delx,dely,delz))
-              neighptr[n++] = j;
-            else if (which > 0) neighptr[n++] = j ^ (which << SBBITS);
-          } else neighptr[n++] = j;
+          if (rsq <= cutneighsq[itype][jtype]) {
+            if (molecular) {
+              if (!moltemplate)
+                which = find_special(special[i],nspecial[i],tag[j]);
+              else if (imol >= 0)
+                which = find_special(onemols[imol]->special[iatom],
+                                     onemols[imol]->nspecial[iatom],
+                                     tag[j]-tagprev);
+              else which = 0;
+              if (which == 0) neighptr[n++] = j;
+              else if (domain->minimum_image_check(delx,dely,delz))
+                neighptr[n++] = j;
+              else if (which > 0) neighptr[n++] = j ^ (which << SBBITS);
+            } else neighptr[n++] = j;
+          }
         }
       }
+      list->ndxAIR_ssa[i][subphase] = n; // record end of this subphase
     }
 
     if (n > 0) {
diff --git a/src/USER-DPD/nstencil_half_bin_2d_newton_ssa.cpp b/src/USER-DPD/nstencil_half_bin_2d_newton_ssa.cpp
index 254339bffc..af337a38c6 100644
--- a/src/USER-DPD/nstencil_half_bin_2d_newton_ssa.cpp
+++ b/src/USER-DPD/nstencil_half_bin_2d_newton_ssa.cpp
@@ -42,31 +42,69 @@ NStencilHalfBin2dNewtonSSA::NStencilHalfBin2dNewtonSSA(LAMMPS *lmp) :
 void NStencilHalfBin2dNewtonSSA::create()
 {
   int i,j,pos = 0;
-
+  // Subphase 0: upper right front bins (red)
   for (j = 0; j <= sy; j++)
-    for (i = -sx; i <= sx; i++)
-      if (j > 0 || (j == 0 && i > 0))
+    for (i = 0; i <= sx; i++)
+      if (j > 0 || i > 0) // skip the centroid
         if (bin_distance(i,j,0) < cutneighmaxsq) {
           stencilxyz[pos][0] = i;
           stencilxyz[pos][1] = j;
           stencilxyz[pos][2] = 0;
           stencil[pos++] = j*mbinx + i;
         }
+  nstencil_ssa[0] = pos;
 
-  nstencil_half = pos; // record where normal half stencil ends
-
-  // include additional bins for AIR ghosts only
-
-  for (j = -sy; j <= 0; j++)
-    for (i = -sx; i <= sx; i++) {
-      if (j == 0 && i > 0) continue;
+  // Subphase 1: upper left front bins (light blue)
+  for (j = 1; j <= sy; j++)
+    for (i = -sx; i < 0; i++)
       if (bin_distance(i,j,0) < cutneighmaxsq) {
         stencilxyz[pos][0] = i;
         stencilxyz[pos][1] = j;
         stencilxyz[pos][2] = 0;
         stencil[pos++] = j*mbinx + i;
       }
-    }
+  nstencil_ssa[1] = pos;
+
+  // Subphase 2: lower left front bins (blue)
+  nstencil_ssa[2] = pos;
+
+  // Subphase 3: lower right front bins (yellow)
+  nstencil_ssa[3] = pos;
+
+  // Now include additional bins for AIR ghosts, and impure-to-pure locals
+  // Subphase 4: upper right back bins (pink)
+  nstencil_ssa[4] = pos;
+
+  // Subphase 5: upper left back bins (light green)
+  nstencil_ssa[5] = pos;
+
+  // Subphase 6: lower left back bins (purple)
+  for (j = -sy; j <= 0; j++)
+    for (i = -sx; i < 0; i++)
+      if (bin_distance(i,j,0) < cutneighmaxsq) {
+        stencilxyz[pos][0] = i;
+        stencilxyz[pos][1] = j;
+        stencilxyz[pos][2] = 0;
+        stencil[pos++] = j*mbinx + i;
+      }
+  nstencil_ssa[6] = pos;
+
+  // Subphase 7: lower right back bins (white)
+  for (j = -sy; j < 0; j++)
+    for (i = 0; i <= sx; i++)
+      if (bin_distance(i,j,0) < cutneighmaxsq) {
+        stencilxyz[pos][0] = i;
+        stencilxyz[pos][1] = j;
+        stencilxyz[pos][2] = 0;
+        stencil[pos++] = j*mbinx + i;
+      }
+  nstencil_ssa[7] = pos;
+
+  // Also, include the centroid for the AIR ghosts.
+  stencilxyz[pos][0] = 0;
+  stencilxyz[pos][1] = 0;
+  stencilxyz[pos][2] = 0;
+  stencil[pos++] = 0;
 
   nstencil = pos; // record where full stencil ends
 }
diff --git a/src/USER-DPD/nstencil_half_bin_3d_newton_ssa.cpp b/src/USER-DPD/nstencil_half_bin_3d_newton_ssa.cpp
index 1e2c18c66a..a2911a6d7b 100644
--- a/src/USER-DPD/nstencil_half_bin_3d_newton_ssa.cpp
+++ b/src/USER-DPD/nstencil_half_bin_3d_newton_ssa.cpp
@@ -42,45 +42,109 @@ NStencilHalfBin3dNewtonSSA::NStencilHalfBin3dNewtonSSA(LAMMPS *lmp) :
 void NStencilHalfBin3dNewtonSSA::create()
 {
   int i,j,k,pos = 0;
-
+  // Subphase 0: upper right front bins (red)
   for (k = 0; k <= sz; k++)
-    for (j = -sy; j <= sy; j++)
-      for (i = -sx; i <= sx; i++)
-        if (k > 0 || j > 0 || (j == 0 && i > 0))
+    for (j = 0; j <= sy; j++)
+      for (i = 0; i <= sx; i++)
+        if (k > 0 || j > 0 || i > 0) // skip the centroid
           if (bin_distance(i,j,k) < cutneighmaxsq) {
             stencilxyz[pos][0] = i;
             stencilxyz[pos][1] = j;
             stencilxyz[pos][2] = k;
             stencil[pos++] = k*mbiny*mbinx + j*mbinx + i;
           }
+  nstencil_ssa[0] = pos;
 
-  nstencil_half = pos; // record where normal half stencil ends
-
-  // include additional bins for AIR ghosts only
-
-  for (k = -sz; k < 0; k++)
-    for (j = -sy; j <= sy; j++)
-      for (i = -sx; i <= sx; i++)
+  // Subphase 1: upper left front bins (light blue)
+  for (k = 0; k <= sz; k++)
+    for (j = 1; j <= sy; j++)
+      for (i = -sx; i < 0; i++)
         if (bin_distance(i,j,k) < cutneighmaxsq) {
           stencilxyz[pos][0] = i;
           stencilxyz[pos][1] = j;
           stencilxyz[pos][2] = k;
           stencil[pos++] = k*mbiny*mbinx + j*mbinx + i;
         }
+  nstencil_ssa[1] = pos;
 
-  // For k==0, make sure to skip already included bins
+  // Subphase 2: lower left front bins (blue)
+  for (k = 1; k <= sz; k++)
+    for (j = -sy; j <= 0; j++)
+      for (i = -sx; i < 0; i++)
+        if (bin_distance(i,j,k) < cutneighmaxsq) {
+          stencilxyz[pos][0] = i;
+          stencilxyz[pos][1] = j;
+          stencilxyz[pos][2] = k;
+          stencil[pos++] = k*mbiny*mbinx + j*mbinx + i;
+        }
+  nstencil_ssa[2] = pos;
 
-  k = 0;
-  for (j = -sy; j <= 0; j++)
-    for (i = -sx; i <= sx; i++) {
-      if (j == 0 && i > 0) continue;
-      if (bin_distance(i,j,k) < cutneighmaxsq) {
-        stencilxyz[pos][0] = i;
-        stencilxyz[pos][1] = j;
-        stencilxyz[pos][2] = k;
-        stencil[pos++] = k*mbiny*mbinx + j*mbinx + i;
-      }
-    }
+  // Subphase 3: lower right front bins (yellow)
+  for (k = 1; k <= sz; k++)
+    for (j = -sy; j < 0; j++)
+      for (i = 0; i <= sx; i++)
+        if (bin_distance(i,j,k) < cutneighmaxsq) {
+          stencilxyz[pos][0] = i;
+          stencilxyz[pos][1] = j;
+          stencilxyz[pos][2] = k;
+          stencil[pos++] = k*mbiny*mbinx + j*mbinx + i;
+        }
+  nstencil_ssa[3] = pos;
+
+  // Now include additional bins for AIR ghosts, and impure-to-pure locals
+  // Subphase 4: upper right back bins (pink)
+  for (k = -sz; k < 0; k++)
+    for (j = 0; j <= sy; j++)
+      for (i = 0; i <= sx; i++)
+        if (bin_distance(i,j,k) < cutneighmaxsq) {
+          stencilxyz[pos][0] = i;
+          stencilxyz[pos][1] = j;
+          stencilxyz[pos][2] = k;
+          stencil[pos++] = k*mbiny*mbinx + j*mbinx + i;
+        }
+  nstencil_ssa[4] = pos;
+
+  // Subphase 5: upper left back bins (light green)
+  for (k = -sz; k < 0; k++)
+    for (j = 1; j <= sy; j++)
+      for (i = -sx; i < 0; i++)
+        if (bin_distance(i,j,k) < cutneighmaxsq) {
+          stencilxyz[pos][0] = i;
+          stencilxyz[pos][1] = j;
+          stencilxyz[pos][2] = k;
+          stencil[pos++] = k*mbiny*mbinx + j*mbinx + i;
+        }
+  nstencil_ssa[5] = pos;
+
+  // Subphase 6: lower left back bins (purple)
+  for (k = -sz; k <= 0; k++)
+    for (j = -sy; j <= 0; j++)
+      for (i = -sx; i < 0; i++)
+        if (bin_distance(i,j,k) < cutneighmaxsq) {
+          stencilxyz[pos][0] = i;
+          stencilxyz[pos][1] = j;
+          stencilxyz[pos][2] = k;
+          stencil[pos++] = k*mbiny*mbinx + j*mbinx + i;
+        }
+  nstencil_ssa[6] = pos;
+
+  // Subphase 7: lower right back bins (white)
+  for (k = -sz; k <= 0; k++)
+    for (j = -sy; j < 0; j++)
+      for (i = 0; i <= sx; i++)
+        if (bin_distance(i,j,k) < cutneighmaxsq) {
+          stencilxyz[pos][0] = i;
+          stencilxyz[pos][1] = j;
+          stencilxyz[pos][2] = k;
+          stencil[pos++] = k*mbiny*mbinx + j*mbinx + i;
+        }
+  nstencil_ssa[7] = pos;
+
+  // Also, include the centroid for the AIR ghosts.
+  stencilxyz[pos][0] = 0;
+  stencilxyz[pos][1] = 0;
+  stencilxyz[pos][2] = 0;
+  stencil[pos++] = 0;
 
   nstencil = pos; // record where full stencil ends
 }
diff --git a/src/USER-DPD/nstencil_ssa.h b/src/USER-DPD/nstencil_ssa.h
index e6dfce60f4..a5e3723271 100644
--- a/src/USER-DPD/nstencil_ssa.h
+++ b/src/USER-DPD/nstencil_ssa.h
@@ -24,7 +24,7 @@ class NStencilSSA : public NStencil {
   ~NStencilSSA() {}
   virtual void create() = 0;
 
-  int nstencil_half;   // where the half stencil ends
+  int nstencil_ssa[8];  // last stencil index for each subphase
 };
 
 }

From be166cb5bf3743b0598009570d44cfe96b327979 Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Mon, 30 Jan 2017 15:03:43 -0500
Subject: [PATCH 136/267] USER-DPD: Use subphases when processing AIR zero
 (locals) in SSA. NOTE: pair ordering was NOT changed, but tiny differences
 could occur.

---
 src/USER-DPD/fix_shardlow.cpp | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/src/USER-DPD/fix_shardlow.cpp b/src/USER-DPD/fix_shardlow.cpp
index 05bf1602f9..4220760a9b 100644
--- a/src/USER-DPD/fix_shardlow.cpp
+++ b/src/USER-DPD/fix_shardlow.cpp
@@ -504,9 +504,14 @@ void FixShardlow::initial_integrate(int vflag)
   // process neighbors in the local AIR
   while (ii < inum) {
     i = ilist[ii];
-    int len = list->numneigh[i];
-    if (useDPDE) ssa_update_dpde(i, &(list->firstneigh[i][0]), len);
-    else ssa_update_dpd(i, &(list->firstneigh[i][0]), len);
+    for (int subphase = 0; subphase < 4; subphase++) {
+      int start = (subphase > 0) ? list->ndxAIR_ssa[i][subphase - 1] : 0;
+      int len = list->ndxAIR_ssa[i][subphase] - start;
+      if (len > 0) {
+        if (useDPDE) ssa_update_dpde(i, &(list->firstneigh[i][start]), len);
+        else ssa_update_dpd(i, &(list->firstneigh[i][start]), len);
+      }
+    }
     ii++;
   }
 

From 52aaad907f356e6ab1e553512d343fcf641a547a Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Tue, 7 Feb 2017 12:18:27 -0500
Subject: [PATCH 137/267] USER-DPD: SSA with Kokkos: Reorder stencil subphases
 to make things easier.

---
 .../nstencil_half_bin_2d_newton_ssa.cpp       | 16 ++++++-------
 .../nstencil_half_bin_3d_newton_ssa.cpp       | 24 +++++++++----------
 2 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/src/USER-DPD/nstencil_half_bin_2d_newton_ssa.cpp b/src/USER-DPD/nstencil_half_bin_2d_newton_ssa.cpp
index af337a38c6..084d5b0602 100644
--- a/src/USER-DPD/nstencil_half_bin_2d_newton_ssa.cpp
+++ b/src/USER-DPD/nstencil_half_bin_2d_newton_ssa.cpp
@@ -65,10 +65,10 @@ void NStencilHalfBin2dNewtonSSA::create()
       }
   nstencil_ssa[1] = pos;
 
-  // Subphase 2: lower left front bins (blue)
+  // Subphase 2: lower right front bins (yellow)
   nstencil_ssa[2] = pos;
 
-  // Subphase 3: lower right front bins (yellow)
+  // Subphase 3: lower left front bins (blue)
   nstencil_ssa[3] = pos;
 
   // Now include additional bins for AIR ghosts, and impure-to-pure locals
@@ -78,9 +78,9 @@ void NStencilHalfBin2dNewtonSSA::create()
   // Subphase 5: upper left back bins (light green)
   nstencil_ssa[5] = pos;
 
-  // Subphase 6: lower left back bins (purple)
-  for (j = -sy; j <= 0; j++)
-    for (i = -sx; i < 0; i++)
+  // Subphase 6: lower right back bins (white)
+  for (j = -sy; j < 0; j++)
+    for (i = 0; i <= sx; i++)
       if (bin_distance(i,j,0) < cutneighmaxsq) {
         stencilxyz[pos][0] = i;
         stencilxyz[pos][1] = j;
@@ -89,9 +89,9 @@ void NStencilHalfBin2dNewtonSSA::create()
       }
   nstencil_ssa[6] = pos;
 
-  // Subphase 7: lower right back bins (white)
-  for (j = -sy; j < 0; j++)
-    for (i = 0; i <= sx; i++)
+  // Subphase 7: lower left back bins (purple)
+  for (j = -sy; j <= 0; j++)
+    for (i = -sx; i < 0; i++)
       if (bin_distance(i,j,0) < cutneighmaxsq) {
         stencilxyz[pos][0] = i;
         stencilxyz[pos][1] = j;
diff --git a/src/USER-DPD/nstencil_half_bin_3d_newton_ssa.cpp b/src/USER-DPD/nstencil_half_bin_3d_newton_ssa.cpp
index a2911a6d7b..1741a1e847 100644
--- a/src/USER-DPD/nstencil_half_bin_3d_newton_ssa.cpp
+++ b/src/USER-DPD/nstencil_half_bin_3d_newton_ssa.cpp
@@ -67,10 +67,10 @@ void NStencilHalfBin3dNewtonSSA::create()
         }
   nstencil_ssa[1] = pos;
 
-  // Subphase 2: lower left front bins (blue)
+  // Subphase 2: lower right front bins (yellow)
   for (k = 1; k <= sz; k++)
-    for (j = -sy; j <= 0; j++)
-      for (i = -sx; i < 0; i++)
+    for (j = -sy; j < 0; j++)
+      for (i = 0; i <= sx; i++)
         if (bin_distance(i,j,k) < cutneighmaxsq) {
           stencilxyz[pos][0] = i;
           stencilxyz[pos][1] = j;
@@ -79,10 +79,10 @@ void NStencilHalfBin3dNewtonSSA::create()
         }
   nstencil_ssa[2] = pos;
 
-  // Subphase 3: lower right front bins (yellow)
+  // Subphase 3: lower left front bins (blue)
   for (k = 1; k <= sz; k++)
-    for (j = -sy; j < 0; j++)
-      for (i = 0; i <= sx; i++)
+    for (j = -sy; j <= 0; j++)
+      for (i = -sx; i < 0; i++)
         if (bin_distance(i,j,k) < cutneighmaxsq) {
           stencilxyz[pos][0] = i;
           stencilxyz[pos][1] = j;
@@ -116,10 +116,10 @@ void NStencilHalfBin3dNewtonSSA::create()
         }
   nstencil_ssa[5] = pos;
 
-  // Subphase 6: lower left back bins (purple)
+  // Subphase 6: lower right back bins (white)
   for (k = -sz; k <= 0; k++)
-    for (j = -sy; j <= 0; j++)
-      for (i = -sx; i < 0; i++)
+    for (j = -sy; j < 0; j++)
+      for (i = 0; i <= sx; i++)
         if (bin_distance(i,j,k) < cutneighmaxsq) {
           stencilxyz[pos][0] = i;
           stencilxyz[pos][1] = j;
@@ -128,10 +128,10 @@ void NStencilHalfBin3dNewtonSSA::create()
         }
   nstencil_ssa[6] = pos;
 
-  // Subphase 7: lower right back bins (white)
+  // Subphase 7: lower left back bins (purple)
   for (k = -sz; k <= 0; k++)
-    for (j = -sy; j < 0; j++)
-      for (i = 0; i <= sx; i++)
+    for (j = -sy; j <= 0; j++)
+      for (i = -sx; i < 0; i++)
         if (bin_distance(i,j,k) < cutneighmaxsq) {
           stencilxyz[pos][0] = i;
           stencilxyz[pos][1] = j;

From 151b3f552bfdfcc915a4ed429b210342a3e1837f Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Tue, 7 Feb 2017 12:53:45 -0500
Subject: [PATCH 138/267] USER-DPD: Save pointer to the NPair used to create
 the NeighList Gives a user of NeighList access to data stored in a custom
 NPair

---
 src/neigh_list.cpp | 1 +
 src/neigh_list.h   | 1 +
 src/neighbor.cpp   | 2 +-
 3 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/neigh_list.cpp b/src/neigh_list.cpp
index edc8634373..e8fd4130fc 100644
--- a/src/neigh_list.cpp
+++ b/src/neigh_list.cpp
@@ -79,6 +79,7 @@ NeighList::NeighList(LAMMPS *lmp) : Pointers(lmp)
   // USER-DPD package
 
   ndxAIR_ssa = NULL;
+  np = NULL;
 }
 
 /* ---------------------------------------------------------------------- */
diff --git a/src/neigh_list.h b/src/neigh_list.h
index 7649245e99..ea88e9b28b 100644
--- a/src/neigh_list.h
+++ b/src/neigh_list.h
@@ -82,6 +82,7 @@ class NeighList : protected Pointers {
 
   int AIRct_ssa[8]; // count of how many atoms in each AIR
   uint16_t (*ndxAIR_ssa)[8]; // for each atom, last neighbor index of each AIR
+  class NPair *np;           // ptr to NPair instance I depend on
 
   // methods
 
diff --git a/src/neighbor.cpp b/src/neighbor.cpp
index e0b84cc410..148dcbd7e9 100644
--- a/src/neighbor.cpp
+++ b/src/neighbor.cpp
@@ -785,7 +785,7 @@ int Neighbor::init_pair()
     }
 
     PairCreator pair_creator = pairclass[flag-1];
-    neigh_pair[i] = pair_creator(lmp);
+    lists[i]->np = neigh_pair[i] = pair_creator(lmp);
     neigh_pair[i]->post_constructor(requests[i]);
     neigh_pair[i]->istyle = flag;
 

From ab32d136b97e0f83e4a8e21e60251394623f1787 Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Tue, 7 Feb 2017 13:03:07 -0500
Subject: [PATCH 139/267] USER-DPD: SSA with Kokkos: make stencil's sx, sy, sz
 variables public

---
 src/nstencil.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/nstencil.h b/src/nstencil.h
index 7985d23202..a4c6a4af66 100644
--- a/src/nstencil.h
+++ b/src/nstencil.h
@@ -30,6 +30,7 @@ class NStencil : protected Pointers {
   int *nstencil_multi;             // # bins in each type-based multi stencil
   int **stencil_multi;             // list of bin offsets in each stencil
   double **distsq_multi;           // sq distances to bins in each stencil
+  int sx,sy,sz;                    // extent of stencil in each dim
 
   double cutoff_custom;            // cutoff set by requestor
 
@@ -64,7 +65,6 @@ class NStencil : protected Pointers {
   int xyzflag;                     // 1 if stencilxyz is allocated
   int maxstencil;                  // max size of stencil
   int maxstencil_multi;            // max sizes of stencils
-  int sx,sy,sz;                    // extent of stencil in each dim
 
   int dimension;
 

From 4b3197202ba2fc56f946376719bce24f1452897f Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Tue, 7 Feb 2017 13:38:49 -0500
Subject: [PATCH 140/267] USER-DPD: Rework SSA to use a new neighbor list
 structure, ready for Kokkos NOTE: pair evaluation order changes, causing
 numerical differences! Atom pair processing order is fully planned out in
 npair_half_bin_newton_ssa Makes the SSA neighbor list structure very
 different. Do not use by others! Each local is in ilist, numneigh, and
 firstneigh four times instead of once.

Changes LAMMPS core code that had been previously changed for USER-DPD/SSA:
Removes ssaAIR[] from class Atom as it is now unused.
Removes ndxAIR_ssa[] from class NeighList as it is now unused.
Increases length of ilist[], numneigh[], and firstneigh[] if SSA flag set.
---
 src/USER-DPD/fix_shardlow.cpp              |  39 ++--
 src/USER-DPD/nbin_ssa.cpp                  |  20 --
 src/USER-DPD/nbin_ssa.h                    |   3 -
 src/USER-DPD/npair_half_bin_newton_ssa.cpp | 204 ++++++++++++++-------
 src/USER-DPD/npair_half_bin_newton_ssa.h   |  11 +-
 src/atom.cpp                               |   2 -
 src/atom.h                                 |   1 -
 src/neigh_list.cpp                         |  22 +--
 src/neigh_list.h                           |   1 -
 9 files changed, 174 insertions(+), 129 deletions(-)

diff --git a/src/USER-DPD/fix_shardlow.cpp b/src/USER-DPD/fix_shardlow.cpp
index 4220760a9b..4a7fff66cf 100644
--- a/src/USER-DPD/fix_shardlow.cpp
+++ b/src/USER-DPD/fix_shardlow.cpp
@@ -55,6 +55,7 @@
 #include "pair_dpd_fdt.h"
 #include "pair_dpd_fdt_energy.h"
 #include "pair.h"
+#include "npair_half_bin_newton_ssa.h"
 #include "citeme.h"
 
 using namespace LAMMPS_NS;
@@ -500,19 +501,30 @@ void FixShardlow::initial_integrate(int vflag)
 
   dtsqrt = sqrt(update->dt);
 
-  ii = 0;
+  NPairHalfBinNewtonSSA *np_ssa = dynamic_cast<NPairHalfBinNewtonSSA*>(list->np);
+  if (!np_ssa) error->one(FLERR, "NPair wasn't a NPairHalfBinNewtonSSA object");
+  int ssa_phaseCt = np_ssa->ssa_phaseCt;
+  int *ssa_phaseLen = np_ssa->ssa_phaseLen;
+  int **ssa_itemLoc = np_ssa->ssa_itemLoc;
+  int **ssa_itemLen = np_ssa->ssa_itemLen;
+
   // process neighbors in the local AIR
-  while (ii < inum) {
-    i = ilist[ii];
-    for (int subphase = 0; subphase < 4; subphase++) {
-      int start = (subphase > 0) ? list->ndxAIR_ssa[i][subphase - 1] : 0;
-      int len = list->ndxAIR_ssa[i][subphase] - start;
-      if (len > 0) {
-        if (useDPDE) ssa_update_dpde(i, &(list->firstneigh[i][start]), len);
-        else ssa_update_dpd(i, &(list->firstneigh[i][start]), len);
+  for (int workPhase = 0; workPhase < ssa_phaseCt; ++workPhase) {
+    int workItemCt = ssa_phaseLen[workPhase];
+
+    for (int workItem = 0; workItem < workItemCt; ++workItem) {
+      int ct = ssa_itemLen[workPhase][workItem];
+      ii = ssa_itemLoc[workPhase][workItem];
+
+      while (ct-- > 0) {
+        int len = list->numneigh[ii];
+        if (len > 0) {
+          if (useDPDE) ssa_update_dpde(ilist[ii], list->firstneigh[ii], len);
+          else ssa_update_dpd(ilist[ii], list->firstneigh[ii], len);
+        }
+        ii++;
       }
     }
-    ii++;
   }
 
   ii = inum;
@@ -531,10 +543,9 @@ void FixShardlow::initial_integrate(int vflag)
 
     // process neighbors in this AIR
     while (ct-- > 0) {
-      i = ilist[ii];
-      int len = list->numneigh[i];
-      if (useDPDE) ssa_update_dpde(i, &(list->firstneigh[i][0]), len);
-      else ssa_update_dpd(i, &(list->firstneigh[i][0]), len);
+      int len = list->numneigh[ii];
+      if (useDPDE) ssa_update_dpde(ilist[ii], list->firstneigh[ii], len);
+      else ssa_update_dpd(ilist[ii], list->firstneigh[ii], len);
       ii++;
     }
 
diff --git a/src/USER-DPD/nbin_ssa.cpp b/src/USER-DPD/nbin_ssa.cpp
index 7e603af714..4c57a8e70f 100644
--- a/src/USER-DPD/nbin_ssa.cpp
+++ b/src/USER-DPD/nbin_ssa.cpp
@@ -30,9 +30,6 @@ using namespace LAMMPS_NS;
 
 NBinSSA::NBinSSA(LAMMPS *lmp) : NBinStandard(lmp)
 {
-  maxbin_ssa = 0;
-  binlist_ssa = NULL;
-  binct_ssa = NULL;
   for (int i = 0; i < 8; i++) {
     gairhead_ssa[i] = -1;
   }
@@ -40,8 +37,6 @@ NBinSSA::NBinSSA(LAMMPS *lmp) : NBinStandard(lmp)
 
 NBinSSA::~NBinSSA()
 {
-  memory->destroy(binlist_ssa);
-  memory->destroy(binct_ssa);
 }
 
 /* ----------------------------------------------------------------------
@@ -72,8 +67,6 @@ void NBinSSA::bin_atoms()
 
   for (i = 0; i < mbins; i++) {
     binhead[i] = -1;
-    binlist_ssa[i] = -1;
-    binct_ssa[i] = 0;
   }
 
   // bin in reverse order so linked list will be in forward order
@@ -108,7 +101,6 @@ void NBinSSA::bin_atoms()
     if (zbin >= lbinzhi) lbinzhi = zbin + 1;
     bins[i] = binhead[ibin];
     binhead[ibin] = i;
-    ++(binct_ssa[ibin]);
   }
 
 }
@@ -119,14 +111,6 @@ void NBinSSA::bin_atoms_setup(int nall)
 {
   NBinStandard::bin_atoms_setup(nall); // Setup the parent class's data too
 
-  if (mbins > maxbin_ssa) {
-    maxbin_ssa = mbins;
-    memory->destroy(binlist_ssa);
-    memory->destroy(binct_ssa);
-    memory->create(binlist_ssa,maxbin_ssa,"binlist_ssa");
-    memory->create(binct_ssa,maxbin_ssa,"binct_ssa");
-  }
-
   // Clear the local bin extent bounding box.
   lbinxlo = mbinx - 1; // Safe to = stencil->sx + 1
   lbinylo = mbiny - 1; // Safe to = stencil->sy + 1
@@ -142,10 +126,6 @@ bigint NBinSSA::memory_usage()
 {
   bigint bytes = NBinStandard::memory_usage(); // Count the parent's usage too
 
-  if (maxbin_ssa) {
-    bytes += memory->usage(binlist_ssa,maxbin_ssa);
-    bytes += memory->usage(binct_ssa,maxbin_ssa);
-  }
   return bytes;
 }
 
diff --git a/src/USER-DPD/nbin_ssa.h b/src/USER-DPD/nbin_ssa.h
index f26f8c77f0..2a0175081e 100644
--- a/src/USER-DPD/nbin_ssa.h
+++ b/src/USER-DPD/nbin_ssa.h
@@ -29,10 +29,7 @@ namespace LAMMPS_NS {
 class NBinSSA : public NBinStandard {
  public:
 
-  int *binlist_ssa;          // index in neighlist of 1st local atom in each bin
-  int *binct_ssa;            // count of local atoms in each bin
   int gairhead_ssa[8];       // index of 1st ghost atom in each AIR
-  int maxbin_ssa;            // size of binlist_ssa and binct_ssa arrays
 
   // Bounds of the local atoms in the binhead array
   int lbinxlo;               // lowest local bin x-dim coordinate
diff --git a/src/USER-DPD/npair_half_bin_newton_ssa.cpp b/src/USER-DPD/npair_half_bin_newton_ssa.cpp
index 77b20966b0..2c787d6398 100644
--- a/src/USER-DPD/npair_half_bin_newton_ssa.cpp
+++ b/src/USER-DPD/npair_half_bin_newton_ssa.cpp
@@ -34,7 +34,27 @@ using namespace LAMMPS_NS;
 
 /* ---------------------------------------------------------------------- */
 
-NPairHalfBinNewtonSSA::NPairHalfBinNewtonSSA(LAMMPS *lmp) : NPair(lmp) {}
+NPairHalfBinNewtonSSA::NPairHalfBinNewtonSSA(LAMMPS *lmp) : NPair(lmp)
+{
+  ssa_maxPhaseCt = 0;   // no work-plan storage yet; build() allocates on demand
+  ssa_maxPhaseLen = 0;  // per-phase item capacity, likewise grown by build()
+  ssa_phaseCt = 0;      // number of phases in the current plan
+  ssa_phaseLen = NULL;  // NULL so the first memory->destroy() in build() is harmless
+  ssa_itemLoc = NULL;
+  ssa_itemLen = NULL;
+}
+
+/* ---------------------------------------------------------------------- */
+
+NPairHalfBinNewtonSSA::~NPairHalfBinNewtonSSA()
+{
+  ssa_maxPhaseCt = 0;
+  ssa_maxPhaseLen = 0;
+  ssa_phaseCt = 0;
+  memory->destroy(ssa_phaseLen);  // release the work-plan arrays created in build()
+  memory->destroy(ssa_itemLoc);
+  memory->destroy(ssa_itemLen);
+}
 
 /* ----------------------------------------------------------------------
    binned neighbor list construction with full Newton's 3rd law
@@ -81,8 +101,6 @@ void NPairHalfBinNewtonSSA::build(NeighList *list)
   if (!nb_ssa) error->one(FLERR, "NBin wasn't a NBinSSA object");
   int *bins = nb_ssa->bins;
   int *binhead = nb_ssa->binhead;
-  int *binlist_ssa = nb_ssa->binlist_ssa;
-  int *binct_ssa = nb_ssa->binct_ssa;
   int *gairhead_ssa = &(nb_ssa->gairhead_ssa[0]);
 
   int inum = 0;
@@ -96,74 +114,81 @@ void NPairHalfBinNewtonSSA::build(NeighList *list)
   int lbinzlo = nb_ssa->lbinzlo;
   int lbinzhi = nb_ssa->lbinzhi;
 
+  int sx1 = ns_ssa->sx + 1;
+  int sy1 = ns_ssa->sy + 1;
+  int sz1 = ns_ssa->sz + 1;
+
+  ssa_phaseCt = sz1*sy1*sx1;
+
+  xbin = (lbinxhi - lbinxlo + sx1 - 1) / sx1 + 1;
+  ybin = (lbinyhi - lbinylo + sy1 - 1) / sy1 + 1;
+  zbin = (lbinzhi - lbinzlo + sz1 - 1) / sz1 + 1;
+
+  int phaseLenEstimate = xbin*ybin*zbin;
+
+  if (ssa_phaseCt > ssa_maxPhaseCt) {
+    ssa_maxPhaseCt = ssa_phaseCt;
+    ssa_maxPhaseLen = 0;
+    memory->destroy(ssa_phaseLen);
+    memory->destroy(ssa_itemLoc);
+    memory->destroy(ssa_itemLen);
+    memory->create(ssa_phaseLen,ssa_maxPhaseCt,"NPairHalfBinNewtonSSA:ssa_phaseLen");
+  }
+
+  if (phaseLenEstimate > ssa_maxPhaseLen) {
+    ssa_maxPhaseLen = phaseLenEstimate;
+    memory->destroy(ssa_itemLoc);
+    memory->destroy(ssa_itemLen);
+    memory->create(ssa_itemLoc,ssa_maxPhaseCt,ssa_maxPhaseLen,"NPairHalfBinNewtonSSA:ssa_itemLoc");
+    memory->create(ssa_itemLen,ssa_maxPhaseCt,ssa_maxPhaseLen,"NPairHalfBinNewtonSSA:ssa_itemLen");
+  }
+
   ipage->reset();
 
+  int workPhase = 0;
   // loop over bins with local atoms, storing half of the neighbors
-  for (zbin = lbinzlo; zbin < lbinzhi; zbin++) {
-  for (ybin = lbinylo; ybin < lbinyhi; ybin++) {
-  for (xbin = lbinxlo; xbin < lbinxhi; xbin++) {
-  ibin = zbin*mbiny*mbinx + ybin*mbinx + xbin;
-  binlist_ssa[ibin] = inum; // record where ibin starts in ilist
-  for (i = binhead[ibin]; i >= 0; i = bins[i]) {
-    n = 0;
-    neighptr = ipage->vget();
+  for (int zoff = ns_ssa->sz; zoff >= 0; --zoff) {
+  for (int yoff = ns_ssa->sy; yoff >= 0; --yoff) {
+  for (int xoff = ns_ssa->sx; xoff >= 0; --xoff) {
+    int workItem = 0;
+  for (zbin = lbinzlo + zoff; zbin < lbinzhi; zbin += sz1) {
+  for (ybin = lbinylo + yoff - ns_ssa->sy; ybin < lbinyhi; ybin += sy1) {
+  for (xbin = lbinxlo + xoff - ns_ssa->sx; xbin < lbinxhi; xbin += sx1) { // x stride is sx1 (was sz1)
+    if (workItem >= phaseLenEstimate) error->one(FLERR,"phaseLenEstimate was too small");
+    ssa_itemLoc[workPhase][workItem] = inum; // record where workItem starts in ilist
 
-    itype = type[i];
-    xtmp = x[i][0];
-    ytmp = x[i][1];
-    ztmp = x[i][2];
-    if (moltemplate) {
-      imol = molindex[i];
-      iatom = molatom[i];
-      tagprev = tag[i] - iatom - 1;
-    }
-
-    // loop over rest of local atoms in i's bin
-    // just store them, since j is beyond i in linked list
-
-    for (j = bins[i]; j >= 0; j = bins[j]) {
-
-      jtype = type[j];
-      if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
-
-      delx = xtmp - x[j][0];
-      dely = ytmp - x[j][1];
-      delz = ztmp - x[j][2];
-      rsq = delx*delx + dely*dely + delz*delz;
-
-      if (rsq <= cutneighsq[itype][jtype]) {
-        if (molecular) {
-          if (!moltemplate)
-            which = find_special(special[i],nspecial[i],tag[j]);
-          else if (imol >= 0)
-            which = find_special(onemols[imol]->special[iatom],
-                                 onemols[imol]->nspecial[iatom],
-                                 tag[j]-tagprev);
-          else which = 0;
-          if (which == 0) neighptr[n++] = j;
-          else if (domain->minimum_image_check(delx,dely,delz))
-            neighptr[n++] = j;
-          else if (which > 0) neighptr[n++] = j ^ (which << SBBITS);
-        } else neighptr[n++] = j;
-      }
-    }
-
-    // loop over all local atoms in other bins in "half" stencil
-
-    k = 0;
     for (int subphase = 0; subphase < 4; subphase++) {
-      for (; k < nstencil_ssa[subphase]; k++) {
-        for (j = binhead[ibin+stencil[k]]; j >= 0;
-             j = bins[j]) {
+      int s_ybin = ybin + ((subphase & 0x2) ? ns_ssa->sy : 0);
+      int s_xbin = xbin + ((subphase & 0x1) ? ns_ssa->sx : 0);
+      int ibin, ct;
 
+      if ((s_ybin < lbinylo) || (s_ybin >= lbinyhi)) continue;
+      if ((s_xbin < lbinxlo) || (s_xbin >= lbinxhi)) continue;
+      ibin = zbin*nb_ssa->mbiny*nb_ssa->mbinx
+           + s_ybin*nb_ssa->mbinx
+           + s_xbin;
+
+      for (i = binhead[ibin]; i >= 0; i = bins[i]) {
+        n = 0;
+        neighptr = ipage->vget();
+        itype = type[i];
+        xtmp = x[i][0];
+        ytmp = x[i][1];
+        ztmp = x[i][2];
+        if (moltemplate) {
+          imol = molindex[i];
+          iatom = molatom[i];
+          tagprev = tag[i] - iatom - 1;
+        }
+        // loop over rest of local atoms in i's bin if this is subphase 0
+        // just store them, since j is beyond i in linked list
+        if (subphase == 0) for (j = bins[i]; j >= 0; j = bins[j]) {
           jtype = type[j];
           if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
-
           delx = xtmp - x[j][0];
           dely = ytmp - x[j][1];
           delz = ztmp - x[j][2];
           rsq = delx*delx + dely*dely + delz*delz;
-
           if (rsq <= cutneighsq[itype][jtype]) {
             if (molecular) {
               if (!moltemplate)
@@ -180,22 +205,59 @@ void NPairHalfBinNewtonSSA::build(NeighList *list)
             } else neighptr[n++] = j;
           }
         }
-      }
-      list->ndxAIR_ssa[i][subphase] = n; // record end of this subphase
-    }
 
-    if (n > 0) {
-      ilist[inum++] = i;
+        // loop over all local atoms in other bins in "subphase" of stencil
+        k = (subphase > 0) ? nstencil_ssa[subphase - 1] : 0;
+        for (; k < nstencil_ssa[subphase]; k++) {
+          for (j = binhead[ibin+stencil[k]]; j >= 0; j = bins[j]) {
+            jtype = type[j];
+            if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
+            delx = xtmp - x[j][0];
+            dely = ytmp - x[j][1];
+            delz = ztmp - x[j][2];
+            rsq = delx*delx + dely*dely + delz*delz;
+            if (rsq <= cutneighsq[itype][jtype]) {
+              if (molecular) {
+                if (!moltemplate)
+                  which = find_special(special[i],nspecial[i],tag[j]);
+                else if (imol >= 0)
+                  which = find_special(onemols[imol]->special[iatom],
+                                       onemols[imol]->nspecial[iatom],
+                                       tag[j]-tagprev);
+                else which = 0;
+                if (which == 0) neighptr[n++] = j;
+                else if (domain->minimum_image_check(delx,dely,delz))
+                  neighptr[n++] = j;
+                else if (which > 0) neighptr[n++] = j ^ (which << SBBITS);
+              } else neighptr[n++] = j;
+            }
+          }
+        }
+
+        if (n > 0) {
+          firstneigh[inum] = neighptr;
+          numneigh[inum] = n;
+          ilist[inum++] = i;
+        }
+        ipage->vgot(n);
+        if (ipage->status())
+          error->one(FLERR,"Neighbor list overflow, boost neigh_modify one");
+      }
     }
-    firstneigh[i] = neighptr;
-    numneigh[i] = n;
-    ipage->vgot(n);
-    if (ipage->status())
-      error->one(FLERR,"Neighbor list overflow, boost neigh_modify one");
+    // record where workItem ends in ilist
+    ssa_itemLen[workPhase][workItem] = inum - ssa_itemLoc[workPhase][workItem];
+    if (ssa_itemLen[workPhase][workItem] > 0) workItem++;
   }
   }
   }
+
+    // record where workPhase ends
+    ssa_phaseLen[workPhase++] = workItem;
   }
+  }
+  }
+
+  if (ssa_phaseCt != workPhase) error->one(FLERR,"ssa_phaseCt was wrong");
 
   list->AIRct_ssa[0] = list->inum = inum;
 
@@ -258,11 +320,11 @@ void NPairHalfBinNewtonSSA::build(NeighList *list)
       }
 
       if (n > 0) {
+        firstneigh[inum + gnum] = neighptr;
+        numneigh[inum + gnum] = n;
         ilist[inum + (gnum++)] = i;
         ++locAIRct;
       }
-      firstneigh[i] = neighptr;
-      numneigh[i] = n;
       ipage->vgot(n);
       if (ipage->status())
         error->one(FLERR,"Neighbor (ghost) list overflow, boost neigh_modify one");
diff --git a/src/USER-DPD/npair_half_bin_newton_ssa.h b/src/USER-DPD/npair_half_bin_newton_ssa.h
index c9ccbc4bd9..ea292316ca 100644
--- a/src/USER-DPD/npair_half_bin_newton_ssa.h
+++ b/src/USER-DPD/npair_half_bin_newton_ssa.h
@@ -28,9 +28,18 @@ namespace LAMMPS_NS {
 
 class NPairHalfBinNewtonSSA : public NPair {
  public:
+  // SSA Work plan data structures
+  int ssa_phaseCt;      // number of work phases in the plan; set by build()
+  int *ssa_phaseLen;    // [phase] number of work items recorded for that phase
+  int **ssa_itemLoc;    // [phase][item] index in ilist where the work item starts
+  int **ssa_itemLen;    // [phase][item] number of ilist entries in the work item
+
   NPairHalfBinNewtonSSA(class LAMMPS *);
-  ~NPairHalfBinNewtonSSA() {}
+  ~NPairHalfBinNewtonSSA();
   void build(class NeighList *);
+ private:
+  int ssa_maxPhaseCt;   // allocated phase capacity of the arrays above
+  int ssa_maxPhaseLen;  // allocated per-phase item capacity of ssa_itemLoc/ssa_itemLen
 };
 
 }
diff --git a/src/atom.cpp b/src/atom.cpp
index 0920dc3a02..de98b65470 100644
--- a/src/atom.cpp
+++ b/src/atom.cpp
@@ -99,7 +99,6 @@ Atom::Atom(LAMMPS *lmp) : Pointers(lmp)
   uCond = uMech = uChem = uCG = uCGnew = NULL;
   duChem = NULL;
   dpdTheta = NULL;
-  ssaAIR = NULL;
 
   // USER-SMD
 
@@ -296,7 +295,6 @@ Atom::~Atom()
   memory->destroy(uCG);
   memory->destroy(uCGnew);
   memory->destroy(duChem);
-  memory->destroy(ssaAIR);
 
   memory->destroy(nspecial);
   memory->destroy(special);
diff --git a/src/atom.h b/src/atom.h
index de7cda06ac..745377cee1 100644
--- a/src/atom.h
+++ b/src/atom.h
@@ -93,7 +93,6 @@ class Atom : protected Pointers {
   double *duChem;
   double *dpdTheta;
   int nspecies_dpd;
-  int *ssaAIR; // Shardlow Splitting Algorithm Active Interaction Region number
 
   // molecular info
 
diff --git a/src/neigh_list.cpp b/src/neigh_list.cpp
index e8fd4130fc..6376637832 100644
--- a/src/neigh_list.cpp
+++ b/src/neigh_list.cpp
@@ -78,7 +78,7 @@ NeighList::NeighList(LAMMPS *lmp) : Pointers(lmp)
 
   // USER-DPD package
 
-  ndxAIR_ssa = NULL;
+  for (int i = 0; i < 8; i++) AIRct_ssa[i] = 0;
   np = NULL;
 }
 
@@ -98,10 +98,6 @@ NeighList::~NeighList()
 
   delete [] iskip;
   memory->destroy(ijskip);
-
-  if (ssa) {
-    memory->sfree(ndxAIR_ssa);
-  }
 }
 
 /* ----------------------------------------------------------------------
@@ -202,14 +198,16 @@ void NeighList::grow(int nlocal, int nall)
   if (listmiddle) listmiddle->grow(nlocal,nall);
 
   // skip if data structs are already big enough
-
-  if (ghost) {
+  if (ssa) {
+    if ((nlocal * 3) + nall <= maxatom) return;
+  } else if (ghost) {
     if (nall <= maxatom) return;
   } else {
     if (nlocal <= maxatom) return;
   }
 
-  maxatom = atom->nmax;
+  if (ssa) maxatom = (nlocal * 3) + nall;
+  else maxatom = atom->nmax;
 
   memory->destroy(ilist);
   memory->destroy(numneigh);
@@ -223,12 +221,6 @@ void NeighList::grow(int nlocal, int nall)
     firstdouble = (double **) memory->smalloc(maxatom*sizeof(double *),
                                               "neighlist:firstdouble");
   }
-
-  if (ssa) {
-    if (ndxAIR_ssa) memory->sfree(ndxAIR_ssa);
-    ndxAIR_ssa = (uint16_t (*)[8]) memory->smalloc(sizeof(uint16_t)*8*maxatom,
-      "neighlist:ndxAIR_ssa");
-  }
 }
 
 /* ----------------------------------------------------------------------
@@ -305,7 +297,5 @@ bigint NeighList::memory_usage()
     }
   }
 
-  if (ndxAIR_ssa) bytes += sizeof(uint16_t) * 8 * maxatom;
-
   return bytes;
 }
diff --git a/src/neigh_list.h b/src/neigh_list.h
index ea88e9b28b..bef512512c 100644
--- a/src/neigh_list.h
+++ b/src/neigh_list.h
@@ -81,7 +81,6 @@ class NeighList : protected Pointers {
   // USER-DPD package and Shardlow Splitting Algorithm (SSA) support
 
   int AIRct_ssa[8]; // count of how many atoms in each AIR
-  uint16_t (*ndxAIR_ssa)[8]; // for each atom, last neighbor index of each AIR
   class NPair *np;           // ptr to NPair instance I depend on
 
   // methods

From e0bafa499d55d0f273b23adf87cbaf824d5f9f11 Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Wed, 15 Feb 2017 15:03:40 -0500
Subject: [PATCH 141/267] indentation fixes in npair_kokkos.cpp, plus a comment
 question

---
 src/KOKKOS/npair_kokkos.cpp | 38 ++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/src/KOKKOS/npair_kokkos.cpp b/src/KOKKOS/npair_kokkos.cpp
index f49e44c352..4f17835717 100644
--- a/src/KOKKOS/npair_kokkos.cpp
+++ b/src/KOKKOS/npair_kokkos.cpp
@@ -170,7 +170,7 @@ void NPairKokkos<DeviceType,HALF_NEIGH,GHOST,TRI>::build(NeighList *list_)
   data.special_flag[2] = special_flag[2];
   data.special_flag[3] = special_flag[3];
 
-  if(list->d_neighbors.dimension_0()<nall) {
+  if(list->d_neighbors.dimension_0()<nall) { // Can this EVER be true??? - TIM 20170215
     list->d_neighbors = typename ArrayTypes<DeviceType>::t_neighbors_2d("neighbors", nall*1.1, list->maxneighs);
     list->d_numneigh = typename ArrayTypes<DeviceType>::t_int_1d("numneigh", nall*1.1);
     data.neigh_list.d_neighbors = list->d_neighbors;
@@ -179,10 +179,10 @@ void NPairKokkos<DeviceType,HALF_NEIGH,GHOST,TRI>::build(NeighList *list_)
   data.h_resize()=1;
   while(data.h_resize()) {
     data.h_new_maxneighs() = list->maxneighs;
-  data.h_resize() = 0;
+    data.h_resize() = 0;
 
-  Kokkos::deep_copy(data.resize, data.h_resize);
-  Kokkos::deep_copy(data.new_maxneighs, data.h_new_maxneighs);
+    Kokkos::deep_copy(data.resize, data.h_resize);
+    Kokkos::deep_copy(data.new_maxneighs, data.h_new_maxneighs);
 #ifdef KOKKOS_HAVE_CUDA
     #define BINS_PER_BLOCK 2
     const int factor = atoms_per_bin<64?2:1;
@@ -191,27 +191,27 @@ void NPairKokkos<DeviceType,HALF_NEIGH,GHOST,TRI>::build(NeighList *list_)
     const int factor = 1;
 #endif
 
-if (GHOST) {
-  NPairKokkosBuildFunctorGhost<DeviceType,HALF_NEIGH> f(data,atoms_per_bin * 5 * sizeof(X_FLOAT) * factor);
-  Kokkos::parallel_for(nall, f);
-} else {
-  if (newton_pair) {
-    NPairKokkosBuildFunctor<DeviceType,TRI?0:HALF_NEIGH,1,TRI> f(data,atoms_per_bin * 5 * sizeof(X_FLOAT) * factor);
+    if (GHOST) {
+      NPairKokkosBuildFunctorGhost<DeviceType,HALF_NEIGH> f(data,atoms_per_bin * 5 * sizeof(X_FLOAT) * factor);
+      Kokkos::parallel_for(nall, f);
+    } else {
+      if (newton_pair) {
+        NPairKokkosBuildFunctor<DeviceType,TRI?0:HALF_NEIGH,1,TRI> f(data,atoms_per_bin * 5 * sizeof(X_FLOAT) * factor);
 #ifdef KOKKOS_HAVE_CUDA
-    Kokkos::parallel_for(config, f);
+        Kokkos::parallel_for(config, f);
 #else
-    Kokkos::parallel_for(nall, f);
+        Kokkos::parallel_for(nall, f);
 #endif
-  } else {
-    NPairKokkosBuildFunctor<DeviceType,HALF_NEIGH,0,0> f(data,atoms_per_bin * 5 * sizeof(X_FLOAT) * factor);
+      } else {
+        NPairKokkosBuildFunctor<DeviceType,HALF_NEIGH,0,0> f(data,atoms_per_bin * 5 * sizeof(X_FLOAT) * factor);
 #ifdef KOKKOS_HAVE_CUDA
-    Kokkos::parallel_for(config, f);
+        Kokkos::parallel_for(config, f);
 #else
-    Kokkos::parallel_for(nall, f);
+        Kokkos::parallel_for(nall, f);
 #endif
-  }
-}
-  DeviceType::fence();
+      }
+    }
+    DeviceType::fence();
     deep_copy(data.h_resize, data.resize);
 
     if(data.h_resize()) {

From 5289ec0b39f2c6600da1c246bcb59c2a668e5b56 Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Fri, 17 Feb 2017 18:39:04 -0500
Subject: [PATCH 142/267] cleanup: remove unused binatomsItem() declaration in
 npair_kokkos.h

---
 src/KOKKOS/npair_kokkos.h | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/KOKKOS/npair_kokkos.h b/src/KOKKOS/npair_kokkos.h
index 54726cb971..87fa0b8aee 100644
--- a/src/KOKKOS/npair_kokkos.h
+++ b/src/KOKKOS/npair_kokkos.h
@@ -277,9 +277,6 @@ class NeighborKokkosExecute
   void build_ItemCuda(typename Kokkos::TeamPolicy<DeviceType>::member_type dev) const;
 #endif
 
-  KOKKOS_INLINE_FUNCTION
-  void binatomsItem(const int &i) const;
-
   KOKKOS_INLINE_FUNCTION
   int coord2bin(const X_FLOAT & x,const X_FLOAT & y,const X_FLOAT & z) const
   {

From c2ee3285fc79729d03e458ca2d5118699bd40c7a Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Fri, 17 Feb 2017 21:54:41 -0500
Subject: [PATCH 143/267] USER-DPD: change nstencil_ssa[] to eliminate a corner
 case Saves a conditional inside an NPairHalfBinNewtonSSA::build() inner loop

---
 src/USER-DPD/npair_half_bin_newton_ssa.cpp      |  3 +--
 .../nstencil_half_bin_2d_newton_ssa.cpp         | 17 +++++++++--------
 .../nstencil_half_bin_3d_newton_ssa.cpp         | 17 +++++++++--------
 src/USER-DPD/nstencil_ssa.h                     |  3 ++-
 4 files changed, 21 insertions(+), 19 deletions(-)

diff --git a/src/USER-DPD/npair_half_bin_newton_ssa.cpp b/src/USER-DPD/npair_half_bin_newton_ssa.cpp
index 2c787d6398..8d260dd2be 100644
--- a/src/USER-DPD/npair_half_bin_newton_ssa.cpp
+++ b/src/USER-DPD/npair_half_bin_newton_ssa.cpp
@@ -207,8 +207,7 @@ void NPairHalfBinNewtonSSA::build(NeighList *list)
         }
 
         // loop over all local atoms in other bins in "subphase" of stencil
-        k = (subphase > 0) ? nstencil_ssa[subphase - 1] : 0;
-        for (; k < nstencil_ssa[subphase]; k++) {
+        for (k = nstencil_ssa[subphase]; k < nstencil_ssa[subphase+1]; k++) {
           for (j = binhead[ibin+stencil[k]]; j >= 0; j = bins[j]) {
             jtype = type[j];
             if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
diff --git a/src/USER-DPD/nstencil_half_bin_2d_newton_ssa.cpp b/src/USER-DPD/nstencil_half_bin_2d_newton_ssa.cpp
index 084d5b0602..5df65918d3 100644
--- a/src/USER-DPD/nstencil_half_bin_2d_newton_ssa.cpp
+++ b/src/USER-DPD/nstencil_half_bin_2d_newton_ssa.cpp
@@ -42,6 +42,7 @@ NStencilHalfBin2dNewtonSSA::NStencilHalfBin2dNewtonSSA(LAMMPS *lmp) :
 void NStencilHalfBin2dNewtonSSA::create()
 {
   int i,j,pos = 0;
+  nstencil_ssa[0] = 0; // redundant info, but saves a conditional
   // Subphase 0: upper right front bins (red)
   for (j = 0; j <= sy; j++)
     for (i = 0; i <= sx; i++)
@@ -52,8 +53,8 @@ void NStencilHalfBin2dNewtonSSA::create()
           stencilxyz[pos][2] = 0;
           stencil[pos++] = j*mbinx + i;
         }
-  nstencil_ssa[0] = pos;
 
+  nstencil_ssa[1] = pos;
   // Subphase 1: upper left front bins (light blue)
   for (j = 1; j <= sy; j++)
     for (i = -sx; i < 0; i++)
@@ -63,21 +64,21 @@ void NStencilHalfBin2dNewtonSSA::create()
         stencilxyz[pos][2] = 0;
         stencil[pos++] = j*mbinx + i;
       }
-  nstencil_ssa[1] = pos;
 
-  // Subphase 2: lower right front bins (yellow)
   nstencil_ssa[2] = pos;
+  // Subphase 2: lower right front bins (yellow)
 
-  // Subphase 3: lower left front bins (blue)
   nstencil_ssa[3] = pos;
+  // Subphase 3: lower left front bins (blue)
 
+  nstencil_ssa[4] = pos; // record end of half stencil
   // Now include additional bins for AIR ghosts, and impure-to-pure locals
   // Subphase 4: upper right back bins (pink)
-  nstencil_ssa[4] = pos;
 
+  // nstencil_ssa[5] = pos;
   // Subphase 5: upper left back bins (light green)
-  nstencil_ssa[5] = pos;
 
+  // nstencil_ssa[6] = pos;
   // Subphase 6: lower right back bins (white)
   for (j = -sy; j < 0; j++)
     for (i = 0; i <= sx; i++)
@@ -87,8 +88,8 @@ void NStencilHalfBin2dNewtonSSA::create()
         stencilxyz[pos][2] = 0;
         stencil[pos++] = j*mbinx + i;
       }
-  nstencil_ssa[6] = pos;
 
+  // nstencil_ssa[7] = pos;
   // Subphase 7: lower left back bins (purple)
   for (j = -sy; j <= 0; j++)
     for (i = -sx; i < 0; i++)
@@ -98,7 +99,7 @@ void NStencilHalfBin2dNewtonSSA::create()
         stencilxyz[pos][2] = 0;
         stencil[pos++] = j*mbinx + i;
       }
-  nstencil_ssa[7] = pos;
+  // nstencil_ssa[8] = pos;
 
   // Also, include the centroid for the AIR ghosts.
   stencilxyz[pos][0] = 0;
diff --git a/src/USER-DPD/nstencil_half_bin_3d_newton_ssa.cpp b/src/USER-DPD/nstencil_half_bin_3d_newton_ssa.cpp
index 1741a1e847..3b1c85bdc1 100644
--- a/src/USER-DPD/nstencil_half_bin_3d_newton_ssa.cpp
+++ b/src/USER-DPD/nstencil_half_bin_3d_newton_ssa.cpp
@@ -42,6 +42,7 @@ NStencilHalfBin3dNewtonSSA::NStencilHalfBin3dNewtonSSA(LAMMPS *lmp) :
 void NStencilHalfBin3dNewtonSSA::create()
 {
   int i,j,k,pos = 0;
+  nstencil_ssa[0] = 0; // redundant info, but saves a conditional
   // Subphase 0: upper right front bins (red)
   for (k = 0; k <= sz; k++)
     for (j = 0; j <= sy; j++)
@@ -53,8 +54,8 @@ void NStencilHalfBin3dNewtonSSA::create()
             stencilxyz[pos][2] = k;
             stencil[pos++] = k*mbiny*mbinx + j*mbinx + i;
           }
-  nstencil_ssa[0] = pos;
 
+  nstencil_ssa[1] = pos;
   // Subphase 1: upper left front bins (light blue)
   for (k = 0; k <= sz; k++)
     for (j = 1; j <= sy; j++)
@@ -65,8 +66,8 @@ void NStencilHalfBin3dNewtonSSA::create()
           stencilxyz[pos][2] = k;
           stencil[pos++] = k*mbiny*mbinx + j*mbinx + i;
         }
-  nstencil_ssa[1] = pos;
 
+  nstencil_ssa[2] = pos;
   // Subphase 2: lower right front bins (yellow)
   for (k = 1; k <= sz; k++)
     for (j = -sy; j < 0; j++)
@@ -77,8 +78,8 @@ void NStencilHalfBin3dNewtonSSA::create()
           stencilxyz[pos][2] = k;
           stencil[pos++] = k*mbiny*mbinx + j*mbinx + i;
         }
-  nstencil_ssa[2] = pos;
 
+  nstencil_ssa[3] = pos;
   // Subphase 3: lower left front bins (blue)
   for (k = 1; k <= sz; k++)
     for (j = -sy; j <= 0; j++)
@@ -89,8 +90,8 @@ void NStencilHalfBin3dNewtonSSA::create()
           stencilxyz[pos][2] = k;
           stencil[pos++] = k*mbiny*mbinx + j*mbinx + i;
         }
-  nstencil_ssa[3] = pos;
 
+  nstencil_ssa[4] = pos; // record end of half stencil
   // Now include additional bins for AIR ghosts, and impure-to-pure locals
   // Subphase 4: upper right back bins (pink)
   for (k = -sz; k < 0; k++)
@@ -102,8 +103,8 @@ void NStencilHalfBin3dNewtonSSA::create()
           stencilxyz[pos][2] = k;
           stencil[pos++] = k*mbiny*mbinx + j*mbinx + i;
         }
-  nstencil_ssa[4] = pos;
 
+  // nstencil_ssa[5] = pos;
   // Subphase 5: upper left back bins (light green)
   for (k = -sz; k < 0; k++)
     for (j = 1; j <= sy; j++)
@@ -114,8 +115,8 @@ void NStencilHalfBin3dNewtonSSA::create()
           stencilxyz[pos][2] = k;
           stencil[pos++] = k*mbiny*mbinx + j*mbinx + i;
         }
-  nstencil_ssa[5] = pos;
 
+  // nstencil_ssa[6] = pos;
   // Subphase 6: lower right back bins (white)
   for (k = -sz; k <= 0; k++)
     for (j = -sy; j < 0; j++)
@@ -126,8 +127,8 @@ void NStencilHalfBin3dNewtonSSA::create()
           stencilxyz[pos][2] = k;
           stencil[pos++] = k*mbiny*mbinx + j*mbinx + i;
         }
-  nstencil_ssa[6] = pos;
 
+  // nstencil_ssa[7] = pos;
   // Subphase 7: lower left back bins (purple)
   for (k = -sz; k <= 0; k++)
     for (j = -sy; j <= 0; j++)
@@ -138,7 +139,7 @@ void NStencilHalfBin3dNewtonSSA::create()
           stencilxyz[pos][2] = k;
           stencil[pos++] = k*mbiny*mbinx + j*mbinx + i;
         }
-  nstencil_ssa[7] = pos;
+  //nstencil_ssa[8] = pos;
 
   // Also, include the centroid for the AIR ghosts.
   stencilxyz[pos][0] = 0;
diff --git a/src/USER-DPD/nstencil_ssa.h b/src/USER-DPD/nstencil_ssa.h
index a5e3723271..f6f91fefde 100644
--- a/src/USER-DPD/nstencil_ssa.h
+++ b/src/USER-DPD/nstencil_ssa.h
@@ -24,7 +24,8 @@ class NStencilSSA : public NStencil {
   ~NStencilSSA() {}
   virtual void create() = 0;
 
-  int nstencil_ssa[8];  // last stencil index for each subphase
+  // first stencil index for each subphase, with last index at end
+  int nstencil_ssa[5];
 };
 
 }

From d1a0a3e1c369254f173845ca6c4200f956eed0f5 Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Fri, 17 Feb 2017 22:00:56 -0500
Subject: [PATCH 144/267] USER-DPD: first attempt at nbin_ssa_kokkos... It
 compiles!

---
 src/KOKKOS/nbin_ssa_kokkos.cpp | 233 +++++++++++++++++++++++++++++++++
 src/KOKKOS/nbin_ssa_kokkos.h   | 193 +++++++++++++++++++++++++++
 2 files changed, 426 insertions(+)
 create mode 100644 src/KOKKOS/nbin_ssa_kokkos.cpp
 create mode 100644 src/KOKKOS/nbin_ssa_kokkos.h

diff --git a/src/KOKKOS/nbin_ssa_kokkos.cpp b/src/KOKKOS/nbin_ssa_kokkos.cpp
new file mode 100644
index 0000000000..6ed8e9f3e4
--- /dev/null
+++ b/src/KOKKOS/nbin_ssa_kokkos.cpp
@@ -0,0 +1,233 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors:
+   James Larentzos (ARL) and Timothy I. Mattox (Engility Corporation)
+------------------------------------------------------------------------- */
+
+#include "nbin_ssa_kokkos.h"
+#include "neighbor.h"
+#include "atom_kokkos.h"
+#include "group.h"
+#include "domain.h"
+#include "comm.h"
+#include "update.h"
+#include "error.h"
+#include "atom_masks.h"
+
+// #include "memory.h"
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+// Construct the SSA binner: set starting capacities, the resize flag views and the ghost counters.
+template<class DeviceType>
+NBinSSAKokkos<DeviceType>::NBinSSAKokkos(LAMMPS *lmp) : NBinStandard(lmp)
+{
+  atoms_per_bin = ghosts_per_gbin = 16; // starting capacities (entries per bin / per ghost bin)
+  // NOTE(review): d_lbin*/h_lbin* are not allocated here, yet bin_atoms_setup() writes h_lbin*() - confirm they get storage elsewhere
+  d_resize = typename AT::t_int_scalar("NBinSSAKokkos::d_resize");
+#ifndef KOKKOS_USE_CUDA_UVM
+  h_resize = Kokkos::create_mirror_view(d_resize);
+#else
+  h_resize = d_resize;
+#endif
+  h_resize() = 1;
+  // ghost counters: 8 slots, indexed by the AIR number returned by coord2ssaAIR()
+  k_gbincount = DAT::tdual_int_1d("NBinSSAKokkos::gbincount",8);
+  gbincount = k_gbincount.view<DeviceType>();
+}
+
+/* ---------------------------------------------------------------------- */
+// Size the bin and ghost-bin storage for this build and reset the local-bin bounding box.
+template<class DeviceType>
+void NBinSSAKokkos<DeviceType>::bin_atoms_setup(int nall)
+{
+  if (mbins > (int) k_bins.d_view.dimension_0()) { // reallocate only when more bins are needed
+    k_bins = DAT::tdual_int_2d("NBinSSAKokkos::bins",mbins,atoms_per_bin);
+    bins = k_bins.view<DeviceType>();
+
+    k_bincount = DAT::tdual_int_1d("NBinSSAKokkos::bincount",mbins);
+    bincount = k_bincount.view<DeviceType>();
+  }
+  // nall is not referenced in this function; the ghost storage is sized from atom->nghost
+  ghosts_per_gbin = atom->nghost / 7; // estimate needed size
+
+  if (ghosts_per_gbin > (int) k_gbins.d_view.dimension_1()) {
+    k_gbins = DAT::tdual_int_2d("NBinSSAKokkos::gbins",8,ghosts_per_gbin);
+    gbins = k_gbins.view<DeviceType>();
+  }
+
+  // Clear the local bin extent bounding box (lo starts at its largest value, hi at 0).
+  h_lbinxlo() = mbinx - 1; // Safe to = stencil->sx + 1
+  h_lbinylo() = mbiny - 1; // Safe to = stencil->sy + 1
+  h_lbinzlo() = mbinz - 1; // Safe to = stencil->sz + 1
+  h_lbinxhi() = 0; // Safe to = mbinx - stencil->sx - 1
+  h_lbinyhi() = 0; // Safe to = mbiny - stencil->sy - 1
+  h_lbinzhi() = 0; // Safe to = mbinz - stencil->sz - 1
+  deep_copy(d_lbinxlo, h_lbinxlo); // copy the reset bounds from the h_ views to the d_ views
+  deep_copy(d_lbinylo, h_lbinylo);
+  deep_copy(d_lbinzlo, h_lbinzlo);
+  deep_copy(d_lbinxhi, h_lbinxhi);
+  deep_copy(d_lbinyhi, h_lbinyhi);
+  deep_copy(d_lbinzhi, h_lbinzhi);
+}
+
+/* ----------------------------------------------------------------------
+   bin owned and ghost atoms for the Shardlow Splitting Algorithm (SSA)
+   local atoms go into bins(ibin,*); ghost atoms go into gbins(iAIR,*)
+     ghosts which are not in an Active Interaction Region (AIR) are skipped
+   each pass is repeated with larger storage while a kernel sets d_resize
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void NBinSSAKokkos<DeviceType>::bin_atoms()
+{
+  last_bin = update->ntimestep;
+
+  int i;
+
+  // bin the ghost atoms
+  h_resize() = 1;
+  while(h_resize() > 0) {
+    h_resize() = 0;
+    deep_copy(d_resize, h_resize);
+
+    for (i = 0; i < 8; i++) {
+      k_gbincount.h_view(i) = 0;
+    }
+    k_gbincount.modify<LMPHostType>();
+    k_gbincount.sync<DeviceType>();
+    DeviceType::fence(); // FIXME?
+
+    atomKK->sync(ExecutionSpaceFromDevice<DeviceType>::space,X_MASK);
+    x = atomKK->k_x.view<DeviceType>();
+
+    // I don't think these two lines need to be repeated here... - TIM 20170216
+    sublo_[0] = domain->sublo[0];
+    sublo_[1] = domain->sublo[1];
+    sublo_[2] = domain->sublo[2];
+    subhi_[0] = domain->subhi[0];
+    subhi_[1] = domain->subhi[1];
+    subhi_[2] = domain->subhi[2];
+
+    NPairSSAKokkosBinGhostsFunctor<DeviceType> f(*this);
+
+    Kokkos::parallel_for(atom->nghost, f);
+    DeviceType::fence();
+
+    deep_copy(h_resize, d_resize);
+    if(h_resize()) {
+      k_gbincount.modify<DeviceType>();
+      k_gbincount.sync<LMPHostType>(); // the kernel wrote the device copy; h_view is read just below
+      for (i = 1; i < 8; i++) {
+        if (k_gbincount.h_view(i) > ghosts_per_gbin) {
+          ghosts_per_gbin = k_gbincount.h_view(i);
+        }
+      }
+      k_gbins = DAT::tdual_int_2d("gbins", 8, ghosts_per_gbin);
+      gbins = k_gbins.view<DeviceType>();
+    }
+  }
+  c_gbins = gbins; // gbins won't change until the next bin_atoms
+
+  // bin the local atoms
+  h_resize() = 1;
+  while(h_resize() > 0) {
+    h_resize() = 0;
+    deep_copy(d_resize, h_resize);
+
+    MemsetZeroFunctor<DeviceType> f_zero;
+    f_zero.ptr = (void*) k_bincount.view<DeviceType>().ptr_on_device();
+    Kokkos::parallel_for(mbins, f_zero);
+    DeviceType::fence();
+
+    atomKK->sync(ExecutionSpaceFromDevice<DeviceType>::space,X_MASK);
+    x = atomKK->k_x.view<DeviceType>();
+
+    // I don't think these two lines need to be repeated here... - TIM 20170216
+    bboxlo_[0] = bboxlo[0]; bboxlo_[1] = bboxlo[1]; bboxlo_[2] = bboxlo[2];
+    bboxhi_[0] = bboxhi[0]; bboxhi_[1] = bboxhi[1]; bboxhi_[2] = bboxhi[2];
+
+    NPairSSAKokkosBinAtomsFunctor<DeviceType> f(*this);
+
+    Kokkos::parallel_for(atom->nlocal, f);
+    DeviceType::fence();
+
+    deep_copy(h_resize, d_resize);
+    if(h_resize()) {
+      // a bin overflowed: widen every bin by 16 slots and rerun the pass
+      atoms_per_bin += 16;
+      k_bins = DAT::tdual_int_2d("bins", mbins, atoms_per_bin);
+      bins = k_bins.view<DeviceType>();
+    }
+  }
+  deep_copy(h_lbinxlo, d_lbinxlo); // copy the local-bin bounds found by the kernel into the h_ views
+  deep_copy(h_lbinylo, d_lbinylo);
+  deep_copy(h_lbinzlo, d_lbinzlo);
+  deep_copy(h_lbinxhi, d_lbinxhi);
+  deep_copy(h_lbinyhi, d_lbinyhi);
+  deep_copy(h_lbinzhi, d_lbinzhi);
+  c_bins = bins; // bins won't change until the next bin_atoms
+}
+
+/* ---------------------------------------------------------------------- */
+// Kernel body for one ghost: append ghost number i_ to the gbins row of its AIR.
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void NBinSSAKokkos<DeviceType>::binGhostsItem(const int &i_) const
+{
+  const int i = i_ + atom->nlocal; // NOTE(review): reads atom->nlocal inside the kernel - confirm this is reachable from DeviceType
+  const int iAIR = coord2ssaAIR(x(i, 0), x(i, 1), x(i, 2));
+  if (iAIR > 0) { // include only ghost atoms in an AIR
+    const int ac = Kokkos::atomic_fetch_add(&gbincount[iAIR], (int)1); // previous count = slot claimed in row iAIR
+    if(ac < (int) gbins.dimension_1()) {
+      gbins(iAIR, ac) = i;
+    } else {
+      d_resize() = 1; // row is full: bin_atoms() enlarges gbins and reruns the pass
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+// Kernel body for one local atom: widen the local-bin bounding box and append atom i to its bin.
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void NBinSSAKokkos<DeviceType>::binAtomsItem(const int &i) const
+{
+  int loc[3];
+  const int ibin = coord2bin(x(i, 0), x(i, 1), x(i, 2), &(loc[0]));
+
+  // Find the bounding box of the local atoms in the bins (hi bounds store loc + 1)
+  if (loc[0] < d_lbinxlo()) Kokkos::atomic_fetch_min(&d_lbinxlo(),loc[0]);
+  if (loc[0] >= d_lbinxhi()) Kokkos::atomic_fetch_max(&d_lbinxhi(),loc[0] + 1);
+  if (loc[1] < d_lbinylo()) Kokkos::atomic_fetch_min(&d_lbinylo(),loc[1]);
+  if (loc[1] >= d_lbinyhi()) Kokkos::atomic_fetch_max(&d_lbinyhi(),loc[1] + 1);
+  if (loc[2] < d_lbinzlo()) Kokkos::atomic_fetch_min(&d_lbinzlo(),loc[2]);
+  if (loc[2] >= d_lbinzhi()) Kokkos::atomic_fetch_max(&d_lbinzhi(),loc[2] + 1);
+  // the value returned by the fetch-add is the slot this atom takes in row ibin
+  const int ac = Kokkos::atomic_fetch_add(&(bincount[ibin]), (int)1);
+  if(ac < (int) bins.dimension_1()) {
+    bins(ibin, ac) = i;
+  } else {
+    d_resize() = 1; // bin is full: bin_atoms() enlarges bins and reruns the pass
+  }
+}
+
+namespace LAMMPS_NS { // explicit template instantiations of NBinSSAKokkos
+template class NBinSSAKokkos<LMPDeviceType>;
+#ifdef KOKKOS_HAVE_CUDA
+template class NBinSSAKokkos<LMPHostType>; // host variant is instantiated only when KOKKOS_HAVE_CUDA is defined
+#endif
+}
diff --git a/src/KOKKOS/nbin_ssa_kokkos.h b/src/KOKKOS/nbin_ssa_kokkos.h
new file mode 100644
index 0000000000..a16cb2d0b7
--- /dev/null
+++ b/src/KOKKOS/nbin_ssa_kokkos.h
@@ -0,0 +1,193 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef NBIN_CLASS
+
+NBinStyle(ssa/kk/host,
+          NBinSSAKokkos<LMPHostType>,
+          NB_SSA | NB_KOKKOS_HOST)
+
+NBinStyle(ssa/kk/device,
+          NBinSSAKokkos<LMPDeviceType>,
+          NB_SSA | NB_KOKKOS_DEVICE)
+
+#else
+
+#ifndef LMP_NBIN_SSA_KOKKOS_H
+#define LMP_NBIN_SSA_KOKKOS_H
+
+#include "nbin_standard.h"
+#include "kokkos_type.h"
+
+namespace LAMMPS_NS {
+
+template<class DeviceType>
+class NBinSSAKokkos : public NBinStandard { // Kokkos binning of local and ghost atoms for SSA
+ public:
+  typedef ArrayTypes<DeviceType> AT;
+
+  NBinSSAKokkos(class LAMMPS *);
+  ~NBinSSAKokkos() {}
+  void bin_atoms_setup(int);
+  void bin_atoms();
+  // local atoms: bins(ibin,slot) holds atom indices, bincount[ibin] counts the entries of bin ibin
+  int atoms_per_bin;
+  DAT::tdual_int_1d k_bincount;
+  DAT::tdual_int_2d k_bins;
+  typename AT::t_int_1d bincount;
+  typename AT::t_int_2d bins;
+  typename AT::t_int_2d_const c_bins;
+  // ghost atoms: gbins(iAIR,slot) holds atom indices, gbincount[iAIR] counts the entries of row iAIR
+  int ghosts_per_gbin;
+  DAT::tdual_int_1d k_gbincount;
+  DAT::tdual_int_2d k_gbins;
+  typename AT::t_int_1d gbincount;
+  typename AT::t_int_2d gbins;
+  typename AT::t_int_2d_const c_gbins;
+  // d_resize is set to 1 by the binning kernels when a row is full; h_resize is read by bin_atoms()
+  typename AT::t_int_scalar d_resize;
+  typename ArrayTypes<LMPHostType>::t_int_scalar h_resize;
+  typename AT::t_x_array_randomread x; // atom coordinates read by the kernels
+
+  // Bounds of the local atoms in the bins array
+  typename AT::t_int_scalar d_lbinxlo;  // lowest local bin x-dim coordinate
+  typename AT::t_int_scalar d_lbinylo;  // lowest local bin y-dim coordinate
+  typename AT::t_int_scalar d_lbinzlo;  // lowest local bin z-dim coordinate
+  typename AT::t_int_scalar d_lbinxhi;  // highest local bin x-dim coordinate
+  typename AT::t_int_scalar d_lbinyhi;  // highest local bin y-dim coordinate
+  typename AT::t_int_scalar d_lbinzhi;  // highest local bin z-dim coordinate
+  typename ArrayTypes<LMPHostType>::t_int_scalar h_lbinxlo;
+  typename ArrayTypes<LMPHostType>::t_int_scalar h_lbinylo;
+  typename ArrayTypes<LMPHostType>::t_int_scalar h_lbinzlo;
+  typename ArrayTypes<LMPHostType>::t_int_scalar h_lbinxhi;
+  typename ArrayTypes<LMPHostType>::t_int_scalar h_lbinyhi;
+  typename ArrayTypes<LMPHostType>::t_int_scalar h_lbinzhi;
+
+  // per-item kernel bodies, invoked through the functors defined after this class
+  KOKKOS_INLINE_FUNCTION
+  void binAtomsItem(const int &i) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void binGhostsItem(const int &i) const;
+
+/* ----------------------------------------------------------------------
+   convert atom coords into the ssa active interaction region number
+   (0 = locally owned, 1..7 = AIR number, negative = not in any AIR)
+------------------------------------------------------------------------- */
+  KOKKOS_INLINE_FUNCTION
+  int coord2ssaAIR(const X_FLOAT & x,const X_FLOAT & y,const X_FLOAT & z) const
+  {
+    int ix, iy, iz;
+    ix = iy = iz = 0; // per dimension: -1 below sublo_, +1 at or above subhi_, else 0
+    if (z < sublo_[2]) iz = -1;
+    if (z >= subhi_[2]) iz = 1;
+    if (y < sublo_[1]) iy = -1;
+    if (y >= subhi_[1]) iy = 1;
+    if (x < sublo_[0]) ix = -1;
+    if (x >= subhi_[0]) ix = 1;
+    if(iz < 0){
+      return -1;
+    } else if(iz == 0){
+      if( iy<0 ) return -1; // bottom left/middle/right
+      if( (iy==0) && (ix<0)  ) return -1; // left atoms
+      if( (iy==0) && (ix==0) ) return 0; // Locally owned atoms
+      if( (iy==0) && (ix>0)  ) return 2; // Right atoms
+      if( (iy>0)  && (ix==0) ) return 1; // Top-middle atoms
+      if( (iy>0)  && (ix!=0) ) return 3; // Top-right and top-left atoms
+    } else { // iz > 0
+      if((ix==0) && (iy==0)) return 4; // Back atoms
+      if((ix==0) && (iy!=0)) return 5; // Top-back and bottom-back atoms
+      if((ix!=0) && (iy==0)) return 6; // Left-back and right-back atoms
+      if((ix!=0) && (iy!=0)) return 7; // Back corner atoms
+    }
+    return -2; // fallthrough value; every ix/iy/iz combination above already returns
+  }
+  // map coords to a flattened bin index (returned) and per-dimension bin coordinates (written to i[0..2])
+  KOKKOS_INLINE_FUNCTION
+  int coord2bin(const X_FLOAT & x,const X_FLOAT & y,const X_FLOAT & z, int* i) const
+  {
+    int ix,iy,iz;
+    // each dimension: at/above bboxhi_, inside [bboxlo_,bboxhi_) clamped to nbin-1, or below bboxlo_
+    if (x >= bboxhi_[0])
+      ix = static_cast<int> ((x-bboxhi_[0])*bininvx) + nbinx;
+    else if (x >= bboxlo_[0]) {
+      ix = static_cast<int> ((x-bboxlo_[0])*bininvx);
+      ix = MIN(ix,nbinx-1);
+    } else
+      ix = static_cast<int> ((x-bboxlo_[0])*bininvx) - 1;
+
+    if (y >= bboxhi_[1])
+      iy = static_cast<int> ((y-bboxhi_[1])*bininvy) + nbiny;
+    else if (y >= bboxlo_[1]) {
+      iy = static_cast<int> ((y-bboxlo_[1])*bininvy);
+      iy = MIN(iy,nbiny-1);
+    } else
+      iy = static_cast<int> ((y-bboxlo_[1])*bininvy) - 1;
+
+    if (z >= bboxhi_[2])
+      iz = static_cast<int> ((z-bboxhi_[2])*bininvz) + nbinz;
+    else if (z >= bboxlo_[2]) {
+      iz = static_cast<int> ((z-bboxlo_[2])*bininvz);
+      iz = MIN(iz,nbinz-1);
+    } else
+      iz = static_cast<int> ((z-bboxlo_[2])*bininvz) - 1;
+    // shift by the mbin*lo offsets to get the per-dimension bin coordinates
+    i[0] = ix - mbinxlo;
+    i[1] = iy - mbinylo;
+    i[2] = iz - mbinzlo;
+
+    return (iz-mbinzlo)*mbiny*mbinx + (iy-mbinylo)*mbinx + (ix-mbinxlo);
+  }
+
+ private:
+  double bboxlo_[3],bboxhi_[3]; // copies of bboxlo/bboxhi filled in bin_atoms()
+  double sublo_[3], subhi_[3]; // copies of domain->sublo/subhi filled in bin_atoms()
+};
+
+template<class DeviceType>
+struct NPairSSAKokkosBinGhostsFunctor { // parallel_for functor that forwards to binGhostsItem()
+  typedef DeviceType device_type;
+  // by-value copy of the binner, taken when the functor is constructed
+  const NBinSSAKokkos<DeviceType> c;
+
+  NPairSSAKokkosBinGhostsFunctor(const NBinSSAKokkos<DeviceType> &_c):
+    c(_c) {};
+  ~NPairSSAKokkosBinGhostsFunctor() {}
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int & i) const { // i is the work-item index handed over by parallel_for
+    c.binGhostsItem(i);
+  }
+};
+
+template<class DeviceType>
+struct NPairSSAKokkosBinAtomsFunctor { // parallel_for functor that forwards to binAtomsItem()
+  typedef DeviceType device_type;
+  // by-value copy of the binner, taken when the functor is constructed
+  const NBinSSAKokkos<DeviceType> c;
+
+  NPairSSAKokkosBinAtomsFunctor(const NBinSSAKokkos<DeviceType> &_c):
+    c(_c) {};
+  ~NPairSSAKokkosBinAtomsFunctor() {}
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int & i) const { // i is the work-item index handed over by parallel_for
+    c.binAtomsItem(i);
+  }
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+*/

From 7feb6c2853b6f2b8f67f84bda07cd9d0ab287e8b Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Fri, 17 Feb 2017 22:41:32 -0500
Subject: [PATCH 145/267] USER-DPD: fix a bug in
 AtomVecDPDKokkos::unpack_restart()

---
 src/KOKKOS/atom_vec_dpd_kokkos.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/KOKKOS/atom_vec_dpd_kokkos.cpp b/src/KOKKOS/atom_vec_dpd_kokkos.cpp
index 820f11c215..f46f284f14 100644
--- a/src/KOKKOS/atom_vec_dpd_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_dpd_kokkos.cpp
@@ -1668,7 +1668,7 @@ int AtomVecDPDKokkos::unpack_restart(double *buf)
 
   double **extra = atom->extra;
   if (atom->nextra_store) {
-    int size = static_cast<int> (ubuf(buf[m++]).i) - m;
+    int size = static_cast<int> (buf[0]) - m;
     for (int i = 0; i < size; i++) extra[nlocal][i] = buf[m++];
   }
 

From 37810bdc530209dc776f79bdbadf41c232919d04 Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Fri, 17 Feb 2017 23:06:53 -0500
Subject: [PATCH 146/267] USER-DPD: move centroid bin of stencil_ssa to the
 first slot. Eliminates a special case version of a loop just for Subphase 0.
 NOTE: pair evaluation order changes, causing numerical differences! This
 changed the order that close neighbors of ghosts are processed.

---
 src/USER-DPD/npair_half_bin_newton_ssa.cpp    | 32 +++----------------
 .../nstencil_half_bin_2d_newton_ssa.cpp       | 14 ++++----
 .../nstencil_half_bin_3d_newton_ssa.cpp       | 14 ++++----
 3 files changed, 21 insertions(+), 39 deletions(-)

diff --git a/src/USER-DPD/npair_half_bin_newton_ssa.cpp b/src/USER-DPD/npair_half_bin_newton_ssa.cpp
index 8d260dd2be..14095bf349 100644
--- a/src/USER-DPD/npair_half_bin_newton_ssa.cpp
+++ b/src/USER-DPD/npair_half_bin_newton_ssa.cpp
@@ -180,35 +180,13 @@ void NPairHalfBinNewtonSSA::build(NeighList *list)
           iatom = molatom[i];
           tagprev = tag[i] - iatom - 1;
         }
-        // loop over rest of local atoms in i's bin if this is subphase 0
-        // just store them, since j is beyond i in linked list
-        if (subphase == 0) for (j = bins[i]; j >= 0; j = bins[j]) {
-          jtype = type[j];
-          if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
-          delx = xtmp - x[j][0];
-          dely = ytmp - x[j][1];
-          delz = ztmp - x[j][2];
-          rsq = delx*delx + dely*dely + delz*delz;
-          if (rsq <= cutneighsq[itype][jtype]) {
-            if (molecular) {
-              if (!moltemplate)
-                which = find_special(special[i],nspecial[i],tag[j]);
-              else if (imol >= 0)
-                which = find_special(onemols[imol]->special[iatom],
-                                     onemols[imol]->nspecial[iatom],
-                                     tag[j]-tagprev);
-              else which = 0;
-              if (which == 0) neighptr[n++] = j;
-              else if (domain->minimum_image_check(delx,dely,delz))
-                neighptr[n++] = j;
-              else if (which > 0) neighptr[n++] = j ^ (which << SBBITS);
-            } else neighptr[n++] = j;
-          }
-        }
 
-        // loop over all local atoms in other bins in "subphase" of stencil
+        // loop over all local atoms in the current stencil "subphase"
         for (k = nstencil_ssa[subphase]; k < nstencil_ssa[subphase+1]; k++) {
-          for (j = binhead[ibin+stencil[k]]; j >= 0; j = bins[j]) {
+          const int jbin = ibin+stencil[k];
+          if (jbin != ibin) j = binhead[jbin];
+          else j = bins[i]; // same bin as i, so start just past i in the bin
+          for (; j >= 0; j = bins[j]) {
             jtype = type[j];
             if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
             delx = xtmp - x[j][0];
diff --git a/src/USER-DPD/nstencil_half_bin_2d_newton_ssa.cpp b/src/USER-DPD/nstencil_half_bin_2d_newton_ssa.cpp
index 5df65918d3..451381c104 100644
--- a/src/USER-DPD/nstencil_half_bin_2d_newton_ssa.cpp
+++ b/src/USER-DPD/nstencil_half_bin_2d_newton_ssa.cpp
@@ -43,6 +43,14 @@ void NStencilHalfBin2dNewtonSSA::create()
 {
   int i,j,pos = 0;
   nstencil_ssa[0] = 0; // redundant info, but saves a conditional
+
+  // Include the centroid at the start.
+  // It will be handled as part of Subphase 0.
+  stencilxyz[pos][0] = 0;
+  stencilxyz[pos][1] = 0;
+  stencilxyz[pos][2] = 0;
+  stencil[pos++] = 0;
+
   // Subphase 0: upper right front bins (red)
   for (j = 0; j <= sy; j++)
     for (i = 0; i <= sx; i++)
@@ -101,11 +109,5 @@ void NStencilHalfBin2dNewtonSSA::create()
       }
   // nstencil_ssa[8] = pos;
 
-  // Also, include the centroid for the AIR ghosts.
-  stencilxyz[pos][0] = 0;
-  stencilxyz[pos][1] = 0;
-  stencilxyz[pos][2] = 0;
-  stencil[pos++] = 0;
-
   nstencil = pos; // record where full stencil ends
 }
diff --git a/src/USER-DPD/nstencil_half_bin_3d_newton_ssa.cpp b/src/USER-DPD/nstencil_half_bin_3d_newton_ssa.cpp
index 3b1c85bdc1..cdd3b8856f 100644
--- a/src/USER-DPD/nstencil_half_bin_3d_newton_ssa.cpp
+++ b/src/USER-DPD/nstencil_half_bin_3d_newton_ssa.cpp
@@ -43,6 +43,14 @@ void NStencilHalfBin3dNewtonSSA::create()
 {
   int i,j,k,pos = 0;
   nstencil_ssa[0] = 0; // redundant info, but saves a conditional
+
+  // Include the centroid at the start.
+  // It will be handled as part of Subphase 0.
+  stencilxyz[pos][0] = 0;
+  stencilxyz[pos][1] = 0;
+  stencilxyz[pos][2] = 0;
+  stencil[pos++] = 0;
+
   // Subphase 0: upper right front bins (red)
   for (k = 0; k <= sz; k++)
     for (j = 0; j <= sy; j++)
@@ -141,11 +149,5 @@ void NStencilHalfBin3dNewtonSSA::create()
         }
   //nstencil_ssa[8] = pos;
 
-  // Also, include the centroid for the AIR ghosts.
-  stencilxyz[pos][0] = 0;
-  stencilxyz[pos][1] = 0;
-  stencilxyz[pos][2] = 0;
-  stencil[pos++] = 0;
-
   nstencil = pos; // record where full stencil ends
 }

From 19ffe5931529b9c49cf9bc656c2e9fa01248aa3c Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Sat, 18 Feb 2017 00:32:14 -0500
Subject: [PATCH 147/267] USER-DPD: fix typo in NPairHalfBinNewtonSSA::build():
 sz1 instead of sx1 Luckily, no real change, since sz1 and sx1 are normally
 identical.

---
 src/USER-DPD/npair_half_bin_newton_ssa.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/USER-DPD/npair_half_bin_newton_ssa.cpp b/src/USER-DPD/npair_half_bin_newton_ssa.cpp
index 14095bf349..ab439d3731 100644
--- a/src/USER-DPD/npair_half_bin_newton_ssa.cpp
+++ b/src/USER-DPD/npair_half_bin_newton_ssa.cpp
@@ -153,7 +153,7 @@ void NPairHalfBinNewtonSSA::build(NeighList *list)
     int workItem = 0;
   for (zbin = lbinzlo + zoff; zbin < lbinzhi; zbin += sz1) {
   for (ybin = lbinylo + yoff - ns_ssa->sy; ybin < lbinyhi; ybin += sy1) {
-  for (xbin = lbinxlo + xoff - ns_ssa->sx; xbin < lbinxhi; xbin += sz1) {
+  for (xbin = lbinxlo + xoff - ns_ssa->sx; xbin < lbinxhi; xbin += sx1) {
     if (workItem >= phaseLenEstimate) error->one(FLERR,"phaseLenEstimate was too small");
     ssa_itemLoc[workPhase][workItem] = inum; // record where workItem starts in ilist
 

From 5c6e7b12c647a21d45e8d6460e53a5ff64f277d6 Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Sat, 18 Feb 2017 01:09:02 -0500
Subject: [PATCH 148/267] BUGFIX: fix a copy-o in build_Item_Ghost(): xbin2,
 etc. should be an int xbin2, ybin2, and zbin2 are temporary integer bin
 coordinates, not floats!

---
 src/KOKKOS/npair_kokkos.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/KOKKOS/npair_kokkos.cpp b/src/KOKKOS/npair_kokkos.cpp
index 4f17835717..c750918695 100644
--- a/src/KOKKOS/npair_kokkos.cpp
+++ b/src/KOKKOS/npair_kokkos.cpp
@@ -724,9 +724,9 @@ void NeighborKokkosExecute<DeviceType>::
     const int ybin = binxyz[1];
     const int zbin = binxyz[2];
     for (int k = 0; k < nstencil; k++) {
-      const X_FLOAT xbin2 = xbin + stencilxyz(k,0);
-      const X_FLOAT ybin2 = ybin + stencilxyz(k,1);
-      const X_FLOAT zbin2 = zbin + stencilxyz(k,2);
+      const int xbin2 = xbin + stencilxyz(k,0);
+      const int ybin2 = ybin + stencilxyz(k,1);
+      const int zbin2 = zbin + stencilxyz(k,2);
       if (xbin2 < 0 || xbin2 >= mbinx ||
           ybin2 < 0 || ybin2 >= mbiny ||
           zbin2 < 0 || zbin2 >= mbinz) continue;

From 01d0a5c4a210617fb47b9aa85a79eeccd715fa5f Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Sat, 18 Feb 2017 01:38:55 -0500
Subject: [PATCH 149/267] BUGFIX: use Kokkos::atomic_fetch_max() to avoid a
 race on new_maxneighs

---
 src/KOKKOS/npair_kokkos.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/KOKKOS/npair_kokkos.cpp b/src/KOKKOS/npair_kokkos.cpp
index c750918695..5bfa147def 100644
--- a/src/KOKKOS/npair_kokkos.cpp
+++ b/src/KOKKOS/npair_kokkos.cpp
@@ -422,10 +422,10 @@ void NeighborKokkosExecute<DeviceType>::
 
   neigh_list.d_numneigh(i) = n;
 
-  if(n >= neigh_list.maxneighs) {
+  if(n > neigh_list.maxneighs) {
     resize() = 1;
 
-    if(n >= new_maxneighs()) new_maxneighs() = n;
+    if(n > new_maxneighs()) Kokkos::atomic_fetch_max(&new_maxneighs(),n);
   }
 
   neigh_list.d_ilist(i) = i;
@@ -632,10 +632,10 @@ void NeighborKokkosExecute<DeviceType>::build_ItemCuda(typename Kokkos::TeamPoli
     neigh_list.d_ilist(i) = i;
   }
 
-  if(n >= neigh_list.maxneighs) {
+  if(n > neigh_list.maxneighs) {
     resize() = 1;
 
-    if(n >= new_maxneighs()) new_maxneighs() = n;
+    if(n > new_maxneighs()) Kokkos::atomic_fetch_max(&new_maxneighs(),n);
   }
   }
 }
@@ -755,10 +755,10 @@ void NeighborKokkosExecute<DeviceType>::
 
   neigh_list.d_numneigh(i) = n;
 
-  if(n >= neigh_list.maxneighs) {
+  if(n > neigh_list.maxneighs) {
     resize() = 1;
 
-    if(n >= new_maxneighs()) new_maxneighs() = n;
+    if(n > new_maxneighs()) Kokkos::atomic_fetch_max(&new_maxneighs(),n);
   }
   neigh_list.d_ilist(i) = i;
 }

From 8065d967612a04ce0f6e3f8286d7c66fe1e48f0d Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Sat, 18 Feb 2017 03:14:32 -0500
Subject: [PATCH 150/267] USER-DPD: first attempt at npair_ssa_kokkos... It
 compiles!

---
 src/KOKKOS/npair_ssa_kokkos.cpp | 539 ++++++++++++++++++++++++++++++++
 src/KOKKOS/npair_ssa_kokkos.h   | 334 ++++++++++++++++++++
 2 files changed, 873 insertions(+)
 create mode 100644 src/KOKKOS/npair_ssa_kokkos.cpp
 create mode 100644 src/KOKKOS/npair_ssa_kokkos.h

diff --git a/src/KOKKOS/npair_ssa_kokkos.cpp b/src/KOKKOS/npair_ssa_kokkos.cpp
new file mode 100644
index 0000000000..752fc0c938
--- /dev/null
+++ b/src/KOKKOS/npair_ssa_kokkos.cpp
@@ -0,0 +1,539 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors:
+   James Larentzos and Timothy I. Mattox (Engility Corporation)
+------------------------------------------------------------------------- */
+
+#include "npair_ssa_kokkos.h"
+#include "neigh_list.h"
+#include "atom_kokkos.h"
+#include "atom_masks.h"
+#include "domain_kokkos.h"
+#include "neighbor_kokkos.h"
+#include "nbin_ssa_kokkos.h"
+#include "nstencil_ssa.h"
+#include "error.h"
+
+namespace LAMMPS_NS {
+
+/* construct with a placeholder phase count of 27; build() recomputes it */
+
+template<class DeviceType>
+NPairSSAKokkos<DeviceType>::NPairSSAKokkos(LAMMPS *lmp) : NPair(lmp), ssa_phaseCt(27)
+{
+}
+
+/* ----------------------------------------------------------------------
+   copy needed info from Neighbor class to this build class
+   (the dual views are taken from the Kokkos variant of Neighbor)
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void NPairSSAKokkos<DeviceType>::copy_neighbor_info()
+{
+  NPair::copy_neighbor_info();
+
+  // unchecked downcast: neighbor is assumed to be a NeighborKokkos instance
+  NeighborKokkos* const src = (NeighborKokkos*) neighbor;
+
+  // cutoffs compared against squared pair distances during the build
+  k_cutneighsq = src->k_cutneighsq;
+
+  // exclusion tables: by type, by group bitmask, and by molecule
+  k_ex_type = src->k_ex_type;
+  k_ex1_type = src->k_ex1_type;
+  k_ex2_type = src->k_ex2_type;
+  k_ex1_group = src->k_ex1_group;
+  k_ex2_group = src->k_ex2_group;
+  k_ex1_bit = src->k_ex1_bit;
+  k_ex2_bit = src->k_ex2_bit;
+  k_ex_mol_group = src->k_ex_mol_group;
+  k_ex_mol_bit = src->k_ex_mol_bit;
+}
+
+/* ----------------------------------------------------------------------
+   copy per-atom and per-bin vectors from NBinSSAKokkos class to this
+   build class; reports an error if nb is not a NBinSSAKokkos object
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void NPairSSAKokkos<DeviceType>::copy_bin_info()
+{
+  NPair::copy_bin_info();
+
+  NBinSSAKokkos<DeviceType>* const ssaBins =
+    dynamic_cast<NBinSSAKokkos<DeviceType>*>(nb);
+  if (!ssaBins) error->one(FLERR, "NBin wasn't a NBinSSAKokkos object");
+
+  // bins holding local atoms
+  atoms_per_bin = ssaBins->atoms_per_bin;
+  k_bincount = ssaBins->k_bincount;
+  k_bins = ssaBins->k_bins;
+  // bins holding ghost atoms
+  ghosts_per_gbin = ssaBins->ghosts_per_gbin;
+  k_gbincount = ssaBins->k_gbincount;
+  k_gbins = ssaBins->k_gbins;
+  // bounds of the bin index range looped over by the local-atom build
+  lbinxlo = ssaBins->d_lbinxlo();  lbinxhi = ssaBins->d_lbinxhi();
+  lbinylo = ssaBins->d_lbinylo();  lbinyhi = ssaBins->d_lbinyhi();
+  lbinzlo = ssaBins->d_lbinzlo();  lbinzhi = ssaBins->d_lbinzhi();
+}
+
+/* ----------------------------------------------------------------------
+   copy needed info from NStencil class to this build class
+   the stencil tables are copied into dual views and synced to DeviceType
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void NPairSSAKokkos<DeviceType>::copy_stencil_info()
+{
+  NPair::copy_stencil_info();
+
+  nstencil = ns->nstencil;
+  const int nslots = ns->get_maxstencil();
+
+  // per stencil slot: the bin-index offset and its x,y,z components
+  k_stencil = DAT::tdual_int_1d("NPairSSAKokkos:stencil",nslots);
+  k_stencilxyz = DAT::tdual_int_1d_3("NPairSSAKokkos:stencilxyz",nslots);
+  for (int s = 0; s < nslots; ++s) {
+    k_stencil.h_view(s) = ns->stencil[s];
+    for (int d = 0; d < 3; ++d)
+      k_stencilxyz.h_view(s,d) = ns->stencilxyz[s][d];
+  }
+  k_stencil.modify<LMPHostType>();
+  k_stencil.sync<DeviceType>();
+  k_stencilxyz.modify<LMPHostType>();
+  k_stencilxyz.sync<DeviceType>();
+
+  // the remaining members are specific to the SSA stencil class
+  NStencilSSA* const ssaStencil = dynamic_cast<NStencilSSA*>(ns);
+  if (!ssaStencil) error->one(FLERR, "NStencil wasn't a NStencilSSA object");
+
+  // 8 boundaries; the build loops read entries k and k+1 as a stencil sub-range
+  const int nbounds = 8;
+  k_nstencil_ssa = DAT::tdual_int_1d("NPairSSAKokkos:nstencil_ssa",nbounds);
+  for (int b = 0; b < nbounds; ++b)
+    k_nstencil_ssa.h_view(b) = ssaStencil->nstencil_ssa[b];
+  k_nstencil_ssa.modify<LMPHostType>();
+  k_nstencil_ssa.sync<DeviceType>();
+  sx1 = ssaStencil->sx + 1;
+  sy1 = ssaStencil->sy + 1;
+  sz1 = ssaStencil->sz + 1;
+}
+
+/* ----------------------------------------------------------------------
+   search atom i's special list for the tag of atom j
+   a match at position k selects special_flag[1] (k < n1), [2] (k < n2) or [3]
+   return -1 if that flag is 0, 0 if it is 1, otherwise the selected index
+   return 0 if the tag is not among the first n3 list entries
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+int NPairSSAKokkosExecute<DeviceType>::find_special(const int &i, const int &j) const
+{
+  const int n1 = nspecial(i,0);
+  const int n2 = nspecial(i,1);
+  const int n3 = nspecial(i,2);
+
+  for (int k = 0; k < n3; k++) {
+    if (special(i,k) != tag(j)) continue;
+
+    // the first matching entry decides the result
+    int level = 3;
+    if (k < n1) level = 1;
+    else if (k < n2) level = 2;
+    const int flag = special_flag[level];
+    if (flag == 0) return -1;
+    if (flag == 1) return 0;
+    return level;
+  }
+  return 0;
+}
+
+/* ----------------------------------------------------------------------
+   return 1 if pair i,j matches a type, group or molecule exclusion
+   else return 0
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+int NPairSSAKokkosExecute<DeviceType>::exclusion(const int &i,const int &j,
+                                             const int &itype,const int &jtype) const
+{
+  if (nex_type && ex_type(itype,jtype)) return 1;
+
+  // group exclusions: the two bitmasks may match i and j in either order
+  for (int g = 0; g < nex_group; ++g) {
+    if (((mask(i) & ex1_bit(g)) && (mask(j) & ex2_bit(g))) ||
+        ((mask(i) & ex2_bit(g)) && (mask(j) & ex1_bit(g)))) return 1;
+  }
+
+  // molecule exclusions: both atoms carry the bit and share a molecule ID
+  for (int g = 0; g < nex_mol; ++g) {
+    if (!(mask(i) & ex_mol_bit(g))) continue;
+    if (!(mask(j) & ex_mol_bit(g))) continue;
+    if (molecule(i) == molecule(j)) return 1;
+  }
+  return 0;
+}
+
+/* ----------------------------------------------------------------------
+   binned neighbor list construction with full Newton's 3rd law
+   for use by Shardlow Splitting Algorithm
+   each owned atom i checks its own bin and other bins in Newton stencil
+   every pair stored exactly once by some processor
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void NPairSSAKokkos<DeviceType>::build(NeighList *list_)
+{
+  NeighListKokkos<DeviceType>* list = (NeighListKokkos<DeviceType>*) list_;
+  const int nlocal = includegroup?atom->nfirst:atom->nlocal;
+  const int nl_size = (nlocal + atom->nghost) * 4;
+  list->grow(nl_size); // Make special larger SSA neighbor list
+
+  ssa_phaseCt = sz1*sy1*sx1; // one work phase per (xoff,yoff,zoff) offset combination
+
+  int xbin = (lbinxhi - lbinxlo + sx1 - 1) / sx1 + 1;
+  int ybin = (lbinyhi - lbinylo + sy1 - 1) / sy1 + 1;
+  int zbin = (lbinzhi - lbinzlo + sz1 - 1) / sz1 + 1;
+  int phaseLenEstimate = xbin*ybin*zbin; // work-item capacity reserved for each phase
+
+  if (ssa_phaseCt > (int) k_ssa_phaseLen.dimension_0()) {
+    k_ssa_phaseLen = DAT::tdual_int_1d("NPairSSAKokkos:ssa_phaseLen",ssa_phaseCt);
+    ssa_phaseLen = k_ssa_phaseLen.view<DeviceType>();
+  }
+  if ((ssa_phaseCt > (int) k_ssa_itemLoc.dimension_0()) ||
+      (phaseLenEstimate > (int) k_ssa_itemLoc.dimension_1())) {
+    k_ssa_itemLoc = DAT::tdual_int_2d("NPairSSAKokkos::ssa_itemLoc",ssa_phaseCt,phaseLenEstimate);
+    ssa_itemLoc = k_ssa_itemLoc.view<DeviceType>();
+    k_ssa_itemLen = DAT::tdual_int_2d("NPairSSAKokkos::ssa_itemLen",ssa_phaseCt,phaseLenEstimate);
+    ssa_itemLen = k_ssa_itemLen.view<DeviceType>();
+  }
+
+  NPairSSAKokkosExecute<DeviceType>
+    data(*list,
+         k_cutneighsq.view<DeviceType>(),
+         k_bincount.view<DeviceType>(),
+         k_bins.view<DeviceType>(),
+         k_gbincount.view<DeviceType>(),
+         k_gbins.view<DeviceType>(),
+         lbinxlo, lbinxhi, lbinylo, lbinyhi, lbinzlo, lbinzhi,
+         nstencil, sx1, sy1, sz1,
+         k_stencil.view<DeviceType>(),
+         k_stencilxyz.view<DeviceType>(),
+         k_nstencil_ssa.view<DeviceType>(),
+         ssa_phaseCt,
+         k_ssa_phaseLen.view<DeviceType>(),
+         k_ssa_itemLoc.view<DeviceType>(),
+         k_ssa_itemLen.view<DeviceType>(),
+         nlocal,
+         atomKK->k_x.view<DeviceType>(),
+         atomKK->k_type.view<DeviceType>(),
+         atomKK->k_mask.view<DeviceType>(),
+         atomKK->k_molecule.view<DeviceType>(),
+         atomKK->k_tag.view<DeviceType>(),
+         atomKK->k_special.view<DeviceType>(),
+         atomKK->k_nspecial.view<DeviceType>(),
+         atomKK->molecular,
+         nbinx,nbiny,nbinz,mbinx,mbiny,mbinz,mbinxlo,mbinylo,mbinzlo,
+         bininvx,bininvy,bininvz,
+         exclude, nex_type,
+         k_ex1_type.view<DeviceType>(),
+         k_ex2_type.view<DeviceType>(),
+         k_ex_type.view<DeviceType>(),
+         nex_group,
+         k_ex1_group.view<DeviceType>(),
+         k_ex2_group.view<DeviceType>(),
+         k_ex1_bit.view<DeviceType>(),
+         k_ex2_bit.view<DeviceType>(),
+         nex_mol,
+         k_ex_mol_group.view<DeviceType>(),
+         k_ex_mol_bit.view<DeviceType>(),
+         bboxhi,bboxlo,
+         domain->xperiodic,domain->yperiodic,domain->zperiodic,
+         domain->xprd_half,domain->yprd_half,domain->zprd_half);
+
+  k_cutneighsq.sync<DeviceType>();
+  k_ex1_type.sync<DeviceType>();
+  k_ex2_type.sync<DeviceType>();
+  k_ex_type.sync<DeviceType>();
+  k_ex1_group.sync<DeviceType>();
+  k_ex2_group.sync<DeviceType>();
+  k_ex1_bit.sync<DeviceType>();
+  k_ex2_bit.sync<DeviceType>();
+  k_ex_mol_group.sync<DeviceType>();
+  k_ex_mol_bit.sync<DeviceType>();
+  k_bincount.sync<DeviceType>();
+  k_bins.sync<DeviceType>();
+  k_gbincount.sync<DeviceType>();
+  k_gbins.sync<DeviceType>();
+  atomKK->sync(Device,X_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK|TAG_MASK|SPECIAL_MASK);
+
+  for (int k = 0; k < 4; ++k) data.special_flag[k] = special_flag[k];
+
+  data.h_resize()=1; // force the first pass through the loop
+  while(data.h_resize()) {
+    data.h_new_maxneighs() = list->maxneighs;
+    data.h_resize() = 0;
+
+    Kokkos::deep_copy(data.resize, data.h_resize);
+    Kokkos::deep_copy(data.new_maxneighs, data.h_new_maxneighs);
+
+#ifdef NOTYET
+    NPairSSAKokkosBuildFunctor<DeviceType> f(data,atoms_per_bin*5*sizeof(X_FLOAT));
+    Kokkos::parallel_for(nall, f);
+#endif
+    data.build_locals(); // plain member calls, not parallel kernels
+    data.build_ghosts();
+
+    DeviceType::fence();
+    deep_copy(data.h_resize, data.resize);
+
+    if(data.h_resize()) { // some atom overflowed its row: enlarge and redo the build
+      deep_copy(data.h_new_maxneighs, data.new_maxneighs);
+      list->maxneighs = data.h_new_maxneighs() * 1.2;
+      list->d_neighbors = typename ArrayTypes<DeviceType>::t_neighbors_2d("neighbors", list->d_neighbors.dimension_0(), list->maxneighs);
+      data.neigh_list.d_neighbors = list->d_neighbors;
+      data.neigh_list.maxneighs = list->maxneighs;
+    }
+  }
+
+  // "data" works on a by-value copy of *list, so the counts stored by
+  // build_locals()/build_ghosts() have to be copied back to the real list
+  list->inum = data.neigh_list.inum;
+  list->gnum = data.neigh_list.gnum;
+  for (int k = 0; k < 8; ++k) list->AIRct_ssa[k] = data.neigh_list.AIRct_ssa[k];
+
+  k_ssa_phaseLen.modify<DeviceType>();
+  k_ssa_itemLoc.modify<DeviceType>();
+  k_ssa_itemLen.modify<DeviceType>();
+
+  list->k_ilist.template modify<DeviceType>();
+}
+
+// build_locals(): serial pass over the bins of local atoms; fills ilist/numneigh/neighbors and the SSA work plan
+template<class DeviceType>
+void NPairSSAKokkosExecute<DeviceType>::build_locals()
+{
+  int n = 0;
+  int which = 0;
+  int inum = 0;
+
+  int workPhase = 0;
+  // loop over bins with local atoms, storing half of the neighbors
+  // the three outer loops enumerate the phases, one per (zoff,yoff,xoff) offset
+  for (int zoff = sz1 - 1; zoff >= 0; --zoff) {
+  for (int yoff = sy1 - 1; yoff >= 0; --yoff) {
+  for (int xoff = sx1 - 1; xoff >= 0; --xoff) {
+    int workItem = 0;
+  // bins of one phase are sz1/sy1/sx1 apart; y and x start below lbin?lo so the shifted subphases can reach the low edge
+  for (int zbin = lbinzlo + zoff; zbin < lbinzhi; zbin += sz1) {
+  for (int ybin = lbinylo + yoff - sy1 + 1; ybin < lbinyhi; ybin += sy1) {
+  for (int xbin = lbinxlo + xoff - sx1 + 1; xbin < lbinxhi; xbin += sx1) {
+//    if (workItem >= phaseLenEstimate) error->one(FLERR,"phaseLenEstimate was too small");
+    d_ssa_itemLoc(workPhase, workItem) = inum; // record where workItem starts in ilist
+
+    // bit 0x1 of subphase shifts the bin by sx1-1 in x, bit 0x2 by sy1-1 in y
+    for (int subphase = 0; subphase < 4; subphase++) {
+      int s_ybin = ybin + ((subphase & 0x2) ? sy1 - 1 : 0);
+      int s_xbin = xbin + ((subphase & 0x1) ? sx1 - 1 : 0);
+      if ((s_ybin < lbinylo) || (s_ybin >= lbinyhi)) continue;
+      if ((s_xbin < lbinxlo) || (s_xbin >= lbinxhi)) continue;
+
+      int ibin = zbin*mbiny*mbinx + s_ybin*mbinx + s_xbin;
+      for (int il = 0; il < c_bincount(ibin); ++il) {
+        const int i = c_bins(ibin, il);
+        n = 0; // neighbors found for atom i, counted even beyond maxneighs
+
+        const AtomNeighbors neighbors_i = neigh_list.get_neighbors(inum); // row for the next ilist slot
+        const X_FLOAT xtmp = x(i, 0);
+        const X_FLOAT ytmp = x(i, 1);
+        const X_FLOAT ztmp = x(i, 2);
+        const int itype = type(i);
+
+        const typename ArrayTypes<DeviceType>::t_int_1d_const_um stencil
+          = d_stencil;
+
+        // loop over all local atoms in the current stencil "subphase"
+        for (int k = d_nstencil_ssa(subphase); k < d_nstencil_ssa(subphase+1); k++) {
+          const int jbin = ibin+stencil(k);
+          int jl;
+          if (jbin != ibin) jl = 0;
+          else jl = il + 1; // same bin as i, so start just past i in the bin
+          for (; jl < c_bincount(jbin); ++jl) {
+            const int j = c_bins(jbin, jl);
+            const int jtype = type(j);
+            if(exclude && exclusion(i,j,itype,jtype)) continue;
+
+            const X_FLOAT delx = xtmp - x(j, 0);
+            const X_FLOAT dely = ytmp - x(j, 1);
+            const X_FLOAT delz = ztmp - x(j, 2);
+            const X_FLOAT rsq = delx*delx + dely*dely + delz*delz;
+            if(rsq <= cutneighsq(itype,jtype)) {
+              if (molecular) {
+                if (!moltemplate)
+                  which = find_special(i,j);
+                // NOTE: molecule templates (moltemplate != 0) are not handled:
+                // "which" then keeps the value left by the previous pair.
+                // The disabled reference code called find_special() on
+                // onemols[imol]->special[iatom] / nspecial[iatom] with
+                // tag[j]-tagprev when imol >= 0, and used which = 0 otherwise.
+                if (which == 0){
+                  if(n<neigh_list.maxneighs) neighbors_i(n++) = j;
+                  else n++;
+                }else if (minimum_image_check(delx,dely,delz)){
+                  if(n<neigh_list.maxneighs) neighbors_i(n++) = j;
+                  else n++;
+                }
+                else if (which > 0) { // store the find_special() result in the bits above SBBITS
+                  if(n<neigh_list.maxneighs) neighbors_i(n++) = j ^ (which << SBBITS);
+                  else n++;
+                }
+              } else {
+                if(n<neigh_list.maxneighs) neighbors_i(n++) = j;
+                else n++;
+              }
+            }
+          }
+        }
+
+        if (n > 0) { // atoms without any stored neighbor get no ilist entry
+          neigh_list.d_numneigh(inum) = n;
+          neigh_list.d_ilist(inum++) = i;
+          if(n > neigh_list.maxneighs) { // row overflowed: ask the caller to grow and rebuild
+            resize() = 1;
+            if(n > new_maxneighs()) Kokkos::atomic_fetch_max(&new_maxneighs(),n);
+          }
+        }
+      }
+    }
+    // record where workItem ends in ilist
+    d_ssa_itemLen(workPhase,workItem) = inum - d_ssa_itemLoc(workPhase,workItem);
+    if (d_ssa_itemLen(workPhase,workItem) > 0) workItem++; // an empty item's slot is reused by the next one
+  }
+  }
+  }
+
+    // record where workPhase ends
+    d_ssa_phaseLen(workPhase++) = workItem;
+  }
+  }
+  }
+
+//FIXME  if (ssa_phaseCt != workPhase) error->one(FLERR,"ssa_phaseCt was wrong");
+
+  neigh_list.inum = inum; //FIXME: written to this object's own neigh_list member
+}
+
+// build_ghosts(): serial pass over the ghost atoms binned by AIR (1-7); appends their local neighbors after the local entries
+template<class DeviceType>
+void NPairSSAKokkosExecute<DeviceType>::build_ghosts()
+{
+  int n = 0;
+  int which = 0;
+  int inum = neigh_list.inum; // ghost entries are appended after the inum local ones
+  int gnum = 0;
+  neigh_list.AIRct_ssa[0] = inum; //FIXME
+
+  // loop over AIR ghost atoms, storing their local neighbors
+  // since these are ghosts, must check if stencil bin is out of bounds
+  for (int airnum = 1; airnum <= 7; airnum++) {
+    int locAIRct = 0; // ghosts of this AIR that received an ilist entry
+    for (int il = 0; il < c_gbincount(airnum); ++il) {
+      const int i = c_gbins(airnum, il);
+      n = 0; // neighbors found for atom i, counted even beyond maxneighs
+
+      const AtomNeighbors neighbors_i = neigh_list.get_neighbors(inum + gnum);
+      const X_FLOAT xtmp = x(i, 0);
+      const X_FLOAT ytmp = x(i, 1);
+      const X_FLOAT ztmp = x(i, 2);
+      const int itype = type(i);
+
+      const typename ArrayTypes<DeviceType>::t_int_1d_const_um stencil
+        = d_stencil;
+
+      int loc[3]; // per-dimension bin coordinates of atom i, filled by coord2bin()
+      const int ibin = coord2bin(x(i, 0), x(i, 1), x(i, 2), &(loc[0]));
+
+      // loop over AIR ghost atoms in all bins in "full" stencil
+      // Note: the non-AIR ghost atoms have already been filtered out
+      for (int k = 0; k < nstencil; k++) {
+        int xbin2 = loc[0] + d_stencilxyz(k,0);
+        int ybin2 = loc[1] + d_stencilxyz(k,1);
+        int zbin2 = loc[2] + d_stencilxyz(k,2);
+        // Skip it if this bin is outside the extent of local bins
+        if (xbin2 < lbinxlo || xbin2 >= lbinxhi ||
+            ybin2 < lbinylo || ybin2 >= lbinyhi ||
+            zbin2 < lbinzlo || zbin2 >= lbinzhi) continue;
+        const int jbin = ibin+stencil(k);
+        for (int jl = 0; jl < c_bincount(jbin); ++jl) {
+          const int j = c_bins(jbin, jl);
+          const int jtype = type(j);
+          if(exclude && exclusion(i,j,itype,jtype)) continue;
+
+          const X_FLOAT delx = xtmp - x(j, 0);
+          const X_FLOAT dely = ytmp - x(j, 1);
+          const X_FLOAT delz = ztmp - x(j, 2);
+          const X_FLOAT rsq = delx*delx + dely*dely + delz*delz;
+          if(rsq <= cutneighsq(itype,jtype)) {
+            if (molecular) {
+              if (!moltemplate)
+                which = find_special(i,j);
+              // NOTE: molecule templates (moltemplate != 0) are not handled:
+              // "which" then keeps the value left by the previous pair.
+              // The disabled reference code called find_special() on
+              // onemols[imol]->special[iatom] / nspecial[iatom] with
+              // tag[j]-tagprev when imol >= 0, and used which = 0 otherwise.
+              if (which == 0){
+                if(n<neigh_list.maxneighs) neighbors_i(n++) = j;
+                else n++;
+              }else if (minimum_image_check(delx,dely,delz)){
+                if(n<neigh_list.maxneighs) neighbors_i(n++) = j;
+                else n++;
+              }
+              else if (which > 0) { // store the find_special() result in the bits above SBBITS
+                if(n<neigh_list.maxneighs) neighbors_i(n++) = j ^ (which << SBBITS);
+                else n++;
+              }
+            } else {
+              if(n<neigh_list.maxneighs) neighbors_i(n++) = j;
+              else n++;
+            }
+          }
+        }
+      }
+
+      if (n > 0) { // ghosts without any stored neighbor get no ilist entry
+        neigh_list.d_numneigh(inum + gnum) = n;
+        neigh_list.d_ilist(inum + (gnum++)) = i;
+        if(n > neigh_list.maxneighs) { // row overflowed: ask the caller to grow and rebuild
+          resize() = 1;
+          if(n > new_maxneighs()) Kokkos::atomic_fetch_max(&new_maxneighs(),n);
+        }
+        ++locAIRct;
+      }
+    }
+    neigh_list.AIRct_ssa[airnum] = locAIRct; //FIXME
+  }
+  neigh_list.gnum = gnum; //FIXME: written to this object's own neigh_list member
+}
+
+}
+
+namespace LAMMPS_NS {
+template class NPairSSAKokkos<LMPDeviceType>;  // explicit instantiation, always emitted
+#ifdef KOKKOS_HAVE_CUDA
+template class NPairSSAKokkos<LMPHostType>;  // host variant only in CUDA builds (presumably the two types coincide otherwise -- confirm)
+#endif
+}
diff --git a/src/KOKKOS/npair_ssa_kokkos.h b/src/KOKKOS/npair_ssa_kokkos.h
new file mode 100644
index 0000000000..a656fe32ba
--- /dev/null
+++ b/src/KOKKOS/npair_ssa_kokkos.h
@@ -0,0 +1,334 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef NPAIR_CLASS
+// style-table entries for the host and device instantiations (seen only when NPAIR_CLASS is defined)
+typedef NPairSSAKokkos<LMPHostType> NPairSSAKokkosHost;
+NPairStyle(half/bin/newton/ssa/kk/host,
+           NPairSSAKokkosHost,
+           NP_HALF | NP_BIN | NP_NEWTON | NP_ORTHO | NP_SSA | NP_GHOST | NP_KOKKOS_HOST)
+
+typedef NPairSSAKokkos<LMPDeviceType> NPairSSAKokkosDevice;
+NPairStyle(half/bin/newton/ssa/kk/device,
+           NPairSSAKokkosDevice,
+           NP_HALF | NP_BIN | NP_NEWTON | NP_ORTHO | NP_SSA | NP_GHOST | NP_KOKKOS_DEVICE)
+
+#else
+
+#ifndef LMP_NPAIR_SSA_KOKKOS_H
+#define LMP_NPAIR_SSA_KOKKOS_H
+
+#include "npair.h"
+#include "neigh_list_kokkos.h"
+
+namespace LAMMPS_NS {
+
+template<class DeviceType>
+class NPairSSAKokkos : public NPair {  // SSA neighbor list builder; gathers inputs and runs NPairSSAKokkosExecute
+ public:
+  typedef ArrayTypes<DeviceType> AT;
+
+  // SSA Work plan data structures
+  int ssa_phaseCt;                   // number of phases, set to sz1*sy1*sx1 by build()
+  DAT::tdual_int_1d k_ssa_phaseLen;  // per phase: number of work items recorded
+  DAT::tdual_int_2d k_ssa_itemLoc;   // per (phase,item): first ilist index of the item
+  DAT::tdual_int_2d k_ssa_itemLen;   // per (phase,item): number of ilist entries of the item
+  typename AT::t_int_1d ssa_phaseLen;  // DeviceType views of the three dual views above
+  typename AT::t_int_2d ssa_itemLoc;
+  typename AT::t_int_2d ssa_itemLen;
+
+  NPairSSAKokkos(class LAMMPS *);
+  ~NPairSSAKokkos() {}
+  void copy_neighbor_info();
+  void copy_bin_info();
+  void copy_stencil_info();
+  void build(class NeighList *);
+ private:
+  // data from Neighbor class
+
+  DAT::tdual_xfloat_2d k_cutneighsq;
+
+  // exclusion data from Neighbor class
+
+  DAT::tdual_int_1d k_ex1_type,k_ex2_type;
+  DAT::tdual_int_2d k_ex_type;
+  DAT::tdual_int_1d k_ex1_group,k_ex2_group;
+  DAT::tdual_int_1d k_ex1_bit,k_ex2_bit;
+  DAT::tdual_int_1d k_ex_mol_group;
+  DAT::tdual_int_1d k_ex_mol_bit;
+
+  // data from NBinSSA class
+
+  int atoms_per_bin;
+  DAT::tdual_int_1d k_bincount;   // per bin: number of local atoms stored in k_bins
+  DAT::tdual_int_2d k_bins;       // (bin, slot) -> local atom index
+  int ghosts_per_gbin;
+  DAT::tdual_int_1d k_gbincount;  // per AIR (1-7): number of ghost atoms stored in k_gbins
+  DAT::tdual_int_2d k_gbins;      // (AIR, slot) -> ghost atom index
+  int lbinxlo, lbinxhi, lbinylo, lbinyhi, lbinzlo, lbinzhi;  // bin index range scanned for local atoms (hi is exclusive)
+
+  // data from NStencilSSA class
+
+  int nstencil;
+  DAT::tdual_int_1d k_stencil;  // bin-index offset of each stencil entry
+  DAT::tdual_int_1d_3 k_stencilxyz;  // x,y,z bin offsets of each stencil entry
+  DAT::tdual_int_1d k_nstencil_ssa;  // 8 boundaries delimiting sub-ranges of the stencil
+  int sx1, sy1, sz1;  // NStencilSSA sx,sy,sz plus one
+};
+
+template<class DeviceType>
+class NPairSSAKokkosExecute  // holds every input of one SSA neighbor build and the build routines themselves
+{
+  typedef ArrayTypes<DeviceType> AT;
+
+ public:
+  NeighListKokkos<DeviceType> neigh_list;  // by-value copy of the list object passed to the constructor
+
+  // data from Neighbor class
+
+  const typename AT::t_xfloat_2d_randomread cutneighsq;
+
+  // exclusion data from Neighbor class
+
+  const int exclude;
+
+  const int nex_type;
+  const typename AT::t_int_1d_const ex1_type,ex2_type;
+  const typename AT::t_int_2d_const ex_type;
+
+  const int nex_group;
+  const typename AT::t_int_1d_const ex1_group,ex2_group;
+  const typename AT::t_int_1d_const ex1_bit,ex2_bit;
+
+  const int nex_mol;
+  const typename AT::t_int_1d_const ex_mol_group;
+  const typename AT::t_int_1d_const ex_mol_bit;
+
+  // data from NBinSSA class
+
+  const typename AT::t_int_1d bincount;
+  const typename AT::t_int_1d_const c_bincount;  // read-only alias of bincount
+  typename AT::t_int_2d bins;
+  typename AT::t_int_2d_const c_bins;  // read-only alias of bins
+  const typename AT::t_int_1d gbincount;
+  const typename AT::t_int_1d_const c_gbincount;  // read-only alias of gbincount
+  typename AT::t_int_2d gbins;
+  typename AT::t_int_2d_const c_gbins;  // read-only alias of gbins
+  const int lbinxlo, lbinxhi, lbinylo, lbinyhi, lbinzlo, lbinzhi;
+
+
+  // data from NStencil class
+
+  const int nstencil;
+  const int sx1, sy1, sz1;
+  typename AT::t_int_1d d_stencil;  // bin-index offset of each stencil entry
+  typename AT::t_int_1d_3 d_stencilxyz;  // x,y,z bin offsets of each stencil entry
+  typename AT::t_int_1d d_nstencil_ssa;  // boundaries of the per-subphase stencil sub-ranges
+
+  // data from Atom class
+
+  const typename AT::t_x_array_randomread x;
+  const typename AT::t_int_1d_const type,mask;
+  const typename AT::t_tagint_1d_const molecule;
+  const typename AT::t_tagint_1d_const tag;
+  const typename AT::t_tagint_2d_const special;
+  const typename AT::t_int_2d_const nspecial;
+  const int molecular;
+  int moltemplate;  // 1 when molecular == 2, else 0 (set in the constructor)
+
+  int special_flag[4];  // filled in by the caller after construction; entries 1-3 are read by find_special()
+
+  const int nbinx,nbiny,nbinz;
+  const int mbinx,mbiny,mbinz;
+  const int mbinxlo,mbinylo,mbinzlo;
+  const X_FLOAT bininvx,bininvy,bininvz;
+  X_FLOAT bboxhi[3],bboxlo[3];
+
+  const int nlocal;
+
+  typename AT::t_int_scalar resize;  // set to 1 by the build when an atom exceeds maxneighs
+  typename AT::t_int_scalar new_maxneighs;  // raised to the neighbor count of overflowing atoms
+  typename ArrayTypes<LMPHostType>::t_int_scalar h_resize;  // host-side counterparts of the two scalars above
+  typename ArrayTypes<LMPHostType>::t_int_scalar h_new_maxneighs;
+
+  const int xperiodic, yperiodic, zperiodic;
+  const X_FLOAT xprd_half, yprd_half, zprd_half;  // floating point: an int here truncated the half box lengths
+
+  // SSA Work plan data structures
+  int ssa_phaseCt;
+  typename AT::t_int_1d d_ssa_phaseLen;
+  typename AT::t_int_2d d_ssa_itemLoc;
+  typename AT::t_int_2d d_ssa_itemLen;
+
+  NPairSSAKokkosExecute(
+        const NeighListKokkos<DeviceType> &_neigh_list,
+        const typename AT::t_xfloat_2d_randomread &_cutneighsq,
+        const typename AT::t_int_1d &_bincount,
+        const typename AT::t_int_2d &_bins,
+        const typename AT::t_int_1d &_gbincount,
+        const typename AT::t_int_2d &_gbins,
+        const int _lbinxlo, const int _lbinxhi,
+        const int _lbinylo, const int _lbinyhi,
+        const int _lbinzlo, const int _lbinzhi,
+        const int _nstencil, const int _sx1, const int _sy1, const int _sz1,
+        const typename AT::t_int_1d &_d_stencil,
+        const typename AT::t_int_1d_3 &_d_stencilxyz,
+        const typename AT::t_int_1d &_d_nstencil_ssa,
+        const int _ssa_phaseCt,
+        const typename AT::t_int_1d &_d_ssa_phaseLen,
+        const typename AT::t_int_2d &_d_ssa_itemLoc,
+        const typename AT::t_int_2d &_d_ssa_itemLen,
+        const int _nlocal,
+        const typename AT::t_x_array_randomread &_x,
+        const typename AT::t_int_1d_const &_type,
+        const typename AT::t_int_1d_const &_mask,
+        const typename AT::t_tagint_1d_const &_molecule,
+        const typename AT::t_tagint_1d_const &_tag,
+        const typename AT::t_tagint_2d_const &_special,
+        const typename AT::t_int_2d_const &_nspecial,
+        const int &_molecular,
+        const int & _nbinx,const int & _nbiny,const int & _nbinz,
+        const int & _mbinx,const int & _mbiny,const int & _mbinz,
+        const int & _mbinxlo,const int & _mbinylo,const int & _mbinzlo,
+        const X_FLOAT &_bininvx,const X_FLOAT &_bininvy,const X_FLOAT &_bininvz,
+        const int & _exclude,const int & _nex_type,
+        const typename AT::t_int_1d_const & _ex1_type,
+        const typename AT::t_int_1d_const & _ex2_type,
+        const typename AT::t_int_2d_const & _ex_type,
+        const int & _nex_group,
+        const typename AT::t_int_1d_const & _ex1_group,
+        const typename AT::t_int_1d_const & _ex2_group,
+        const typename AT::t_int_1d_const & _ex1_bit,
+        const typename AT::t_int_1d_const & _ex2_bit,
+        const int & _nex_mol,
+        const typename AT::t_int_1d_const & _ex_mol_group,
+        const typename AT::t_int_1d_const & _ex_mol_bit,
+        const X_FLOAT *_bboxhi, const X_FLOAT* _bboxlo,
+        const int & _xperiodic, const int & _yperiodic, const int & _zperiodic,
+        const X_FLOAT & _xprd_half, const X_FLOAT & _yprd_half, const X_FLOAT & _zprd_half):
+    neigh_list(_neigh_list), cutneighsq(_cutneighsq),
+    bincount(_bincount),c_bincount(_bincount),bins(_bins),c_bins(_bins),
+    gbincount(_gbincount),c_gbincount(_gbincount),gbins(_gbins),c_gbins(_gbins),
+    lbinxlo(_lbinxlo),lbinxhi(_lbinxhi),
+    lbinylo(_lbinylo),lbinyhi(_lbinyhi),
+    lbinzlo(_lbinzlo),lbinzhi(_lbinzhi),
+    nstencil(_nstencil),sx1(_sx1),sy1(_sy1),sz1(_sz1),
+    d_stencil(_d_stencil),d_stencilxyz(_d_stencilxyz),d_nstencil_ssa(_d_nstencil_ssa),
+    ssa_phaseCt(_ssa_phaseCt),
+    d_ssa_phaseLen(_d_ssa_phaseLen),
+    d_ssa_itemLoc(_d_ssa_itemLoc),
+    d_ssa_itemLen(_d_ssa_itemLen),
+    nlocal(_nlocal),
+    x(_x),type(_type),mask(_mask),molecule(_molecule),
+    tag(_tag),special(_special),nspecial(_nspecial),molecular(_molecular),
+    nbinx(_nbinx),nbiny(_nbiny),nbinz(_nbinz),
+    mbinx(_mbinx),mbiny(_mbiny),mbinz(_mbinz),
+    mbinxlo(_mbinxlo),mbinylo(_mbinylo),mbinzlo(_mbinzlo),
+    bininvx(_bininvx),bininvy(_bininvy),bininvz(_bininvz),
+    exclude(_exclude),nex_type(_nex_type),
+    ex1_type(_ex1_type),ex2_type(_ex2_type),ex_type(_ex_type),
+    nex_group(_nex_group),
+    ex1_group(_ex1_group),ex2_group(_ex2_group),
+    ex1_bit(_ex1_bit),ex2_bit(_ex2_bit),nex_mol(_nex_mol),
+    ex_mol_group(_ex_mol_group),ex_mol_bit(_ex_mol_bit),
+    xperiodic(_xperiodic),yperiodic(_yperiodic),zperiodic(_zperiodic),
+    xprd_half(_xprd_half),yprd_half(_yprd_half),zprd_half(_zprd_half) {
+
+    if (molecular == 2) moltemplate = 1;
+    else moltemplate = 0;
+
+    bboxlo[0] = _bboxlo[0]; bboxlo[1] = _bboxlo[1]; bboxlo[2] = _bboxlo[2];
+    bboxhi[0] = _bboxhi[0]; bboxhi[1] = _bboxhi[1]; bboxhi[2] = _bboxhi[2];
+
+    resize = typename AT::t_int_scalar("NeighborKokkosFunctor::resize");
+#ifndef KOKKOS_USE_CUDA_UVM
+    h_resize = Kokkos::create_mirror_view(resize);
+#else
+    h_resize = resize;
+#endif
+    h_resize() = 1;
+    new_maxneighs = typename AT::
+      t_int_scalar("NeighborKokkosFunctor::new_maxneighs");
+#ifndef KOKKOS_USE_CUDA_UVM
+    h_new_maxneighs = Kokkos::create_mirror_view(new_maxneighs);
+#else
+    h_new_maxneighs = new_maxneighs;
+#endif
+    h_new_maxneighs() = neigh_list.maxneighs;
+  };
+
+  ~NPairSSAKokkosExecute() {neigh_list.clean_copy();};
+
+  void build_locals();  // neighbors of local atoms, plus the SSA work plan tables
+  void build_ghosts();  // local neighbors of the ghost atoms stored in gbins
+
+  KOKKOS_INLINE_FUNCTION
+  int coord2bin(const X_FLOAT & x,const X_FLOAT & y,const X_FLOAT & z, int* i) const
+  {
+    int ix,iy,iz;
+
+    // per dimension: at/above bboxhi, inside [bboxlo,bboxhi) (clamped to n-1), or below bboxlo
+    if (x >= bboxhi[0])
+      ix = static_cast<int> ((x-bboxhi[0])*bininvx) + nbinx;
+    else if (x >= bboxlo[0]) {
+      ix = static_cast<int> ((x-bboxlo[0])*bininvx);
+      ix = MIN(ix,nbinx-1);
+    } else
+      ix = static_cast<int> ((x-bboxlo[0])*bininvx) - 1;
+
+    if (y >= bboxhi[1])
+      iy = static_cast<int> ((y-bboxhi[1])*bininvy) + nbiny;
+    else if (y >= bboxlo[1]) {
+      iy = static_cast<int> ((y-bboxlo[1])*bininvy);
+      iy = MIN(iy,nbiny-1);
+    } else
+      iy = static_cast<int> ((y-bboxlo[1])*bininvy) - 1;
+
+    if (z >= bboxhi[2])
+      iz = static_cast<int> ((z-bboxhi[2])*bininvz) + nbinz;
+    else if (z >= bboxlo[2]) {
+      iz = static_cast<int> ((z-bboxlo[2])*bininvz);
+      iz = MIN(iz,nbinz-1);
+    } else
+      iz = static_cast<int> ((z-bboxlo[2])*bininvz) - 1;
+
+    // i[0..2] receive the per-dimension indices relative to mbin?lo
+    i[0] = ix - mbinxlo;
+    i[1] = iy - mbinylo;
+    i[2] = iz - mbinzlo;
+
+    return (iz-mbinzlo)*mbiny*mbinx + (iy-mbinylo)*mbinx + (ix-mbinxlo);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  int exclusion(const int &i,const int &j, const int &itype,const int &jtype) const;
+
+  KOKKOS_INLINE_FUNCTION
+  int find_special(const int &i, const int &j) const;
+
+  KOKKOS_INLINE_FUNCTION
+  int minimum_image_check(double dx, double dy, double dz) const {  // 1 if any periodic component exceeds half the box length
+    if (xperiodic && fabs(dx) > xprd_half) return 1;
+    if (yperiodic && fabs(dy) > yprd_half) return 1;
+    if (zperiodic && fabs(dz) > zprd_half) return 1;
+    return 0;
+  }
+
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+*/

From b27cc8f474e1a1284d242d209e5fa3ba0e77c5f7 Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Mon, 20 Feb 2017 14:09:11 -0500
Subject: [PATCH 151/267] USER-DPD: use LAMBDA instead of functor for ghost
 binning in nbin_ssa_kokkos

---
 src/KOKKOS/nbin_ssa_kokkos.cpp | 32 +++++++++++---------------------
 src/KOKKOS/nbin_ssa_kokkos.h   | 18 ------------------
 2 files changed, 11 insertions(+), 39 deletions(-)

diff --git a/src/KOKKOS/nbin_ssa_kokkos.cpp b/src/KOKKOS/nbin_ssa_kokkos.cpp
index 6ed8e9f3e4..32a77119de 100644
--- a/src/KOKKOS/nbin_ssa_kokkos.cpp
+++ b/src/KOKKOS/nbin_ssa_kokkos.cpp
@@ -122,9 +122,17 @@ void NBinSSAKokkos<DeviceType>::bin_atoms()
     subhi_[1] = domain->subhi[1];
     subhi_[2] = domain->subhi[2];
 
-    NPairSSAKokkosBinGhostsFunctor<DeviceType> f(*this);
-
-    Kokkos::parallel_for(atom->nghost, f);
+    Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType>(atom->nlocal,atom->nlocal+atom->nghost), KOKKOS_LAMBDA (const int i) {
+      const int iAIR = coord2ssaAIR(x(i, 0), x(i, 1), x(i, 2)); // i runs over [nlocal, nlocal+nghost); old functor added nlocal itself
+      if (iAIR > 0) { // include only ghost atoms in an AIR
+        const int ac = Kokkos::atomic_fetch_add(&gbincount[iAIR], (int)1); // atomically reserve the next slot index for this AIR
+        if(ac < (int) gbins.dimension_1()) {
+          gbins(iAIR, ac) = i;
+        } else {
+          d_resize() = 1; // row of gbins is full: raise the flag that is copied into h_resize after the fence
+        }
+      }
+    }); // NOTE(review): body uses class members, so the lambda presumably captures `this` - confirm that is valid for device execution
     DeviceType::fence();
 
     deep_copy(h_resize, d_resize);
@@ -184,24 +192,6 @@ void NBinSSAKokkos<DeviceType>::bin_atoms()
 
 /* ---------------------------------------------------------------------- */
 
-template<class DeviceType>
-KOKKOS_INLINE_FUNCTION
-void NBinSSAKokkos<DeviceType>::binGhostsItem(const int &i_) const
-{
-  const int i = i_ + atom->nlocal;
-  const int iAIR = coord2ssaAIR(x(i, 0), x(i, 1), x(i, 2));
-  if (iAIR > 0) { // include only ghost atoms in an AIR
-    const int ac = Kokkos::atomic_fetch_add(&gbincount[iAIR], (int)1);
-    if(ac < (int) gbins.dimension_1()) {
-      gbins(iAIR, ac) = i;
-    } else {
-      d_resize() = 1;
-    }
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
 void NBinSSAKokkos<DeviceType>::binAtomsItem(const int &i) const
diff --git a/src/KOKKOS/nbin_ssa_kokkos.h b/src/KOKKOS/nbin_ssa_kokkos.h
index a16cb2d0b7..488c1034f5 100644
--- a/src/KOKKOS/nbin_ssa_kokkos.h
+++ b/src/KOKKOS/nbin_ssa_kokkos.h
@@ -77,9 +77,6 @@ class NBinSSAKokkos : public NBinStandard {
   KOKKOS_INLINE_FUNCTION
   void binAtomsItem(const int &i) const;
 
-  KOKKOS_INLINE_FUNCTION
-  void binGhostsItem(const int &i) const;
-
 /* ----------------------------------------------------------------------
    convert atom coords into the ssa active interaction region number
 ------------------------------------------------------------------------- */
@@ -153,21 +150,6 @@ class NBinSSAKokkos : public NBinStandard {
   double sublo_[3], subhi_[3];
 };
 
-template<class DeviceType>
-struct NPairSSAKokkosBinGhostsFunctor {
-  typedef DeviceType device_type;
-
-  const NBinSSAKokkos<DeviceType> c;
-
-  NPairSSAKokkosBinGhostsFunctor(const NBinSSAKokkos<DeviceType> &_c):
-    c(_c) {};
-  ~NPairSSAKokkosBinGhostsFunctor() {}
-  KOKKOS_INLINE_FUNCTION
-  void operator() (const int & i) const {
-    c.binGhostsItem(i);
-  }
-};
-
 template<class DeviceType>
 struct NPairSSAKokkosBinAtomsFunctor {
   typedef DeviceType device_type;

From 1db62a57b5ddbd579f1040d977ee659b2c377f89 Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Fri, 24 Feb 2017 13:17:49 -0500
Subject: [PATCH 152/267] USER-DPD: pair_dpd_fdt_energy_kokkos: enable
 STACKPARAMS specialization

---
 src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp | 117 ++++++++++++++--------
 src/KOKKOS/pair_dpd_fdt_energy_kokkos.h   |  24 ++---
 2 files changed, 89 insertions(+), 52 deletions(-)

diff --git a/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp
index 84a489bcc3..aaf638fac3 100644
--- a/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp
+++ b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp
@@ -49,7 +49,6 @@ PairDPDfdtEnergyKokkos<DeviceType>::PairDPDfdtEnergyKokkos(LAMMPS *lmp) :
   execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
   datamask_read = EMPTY_MASK;
   datamask_modify = EMPTY_MASK;
-  STACKPARAMS = 0;
 }
 
 /* ---------------------------------------------------------------------- */
@@ -171,21 +170,41 @@ void PairDPDfdtEnergyKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
 
   if (splitFDT_flag) {
     if (!a0_is_zero) {
-      if (neighflag == HALF) {
-        if (newton_pair) {
-          if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALF,1,1> >(0,inum),*this,ev);
-          else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALF,1,0> >(0,inum),*this);
-        } else {
-          if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALF,0,1> >(0,inum),*this,ev);
-          else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALF,0,0> >(0,inum),*this);
+      if(atom->ntypes > MAX_TYPES_STACKPARAMS) {
+        if (neighflag == HALF) {
+          if (newton_pair) {
+            if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALF,1,1,false> >(0,inum),*this,ev);
+            else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALF,1,0,false> >(0,inum),*this);
+          } else {
+            if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALF,0,1,false> >(0,inum),*this,ev);
+            else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALF,0,0,false> >(0,inum),*this);
+          }
+        } else if (neighflag == HALFTHREAD) {
+          if (newton_pair) {
+            if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALFTHREAD,1,1,false> >(0,inum),*this,ev);
+            else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALFTHREAD,1,0,false> >(0,inum),*this);
+          } else {
+            if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALFTHREAD,0,1,false> >(0,inum),*this,ev);
+            else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALFTHREAD,0,0,false> >(0,inum),*this);
+          }
         }
-      } else if (neighflag == HALFTHREAD) {
-        if (newton_pair) {
-          if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALFTHREAD,1,1> >(0,inum),*this,ev);
-          else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALFTHREAD,1,0> >(0,inum),*this);
-        } else {
-          if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALFTHREAD,0,1> >(0,inum),*this,ev);
-          else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALFTHREAD,0,0> >(0,inum),*this);
+      } else {
+        if (neighflag == HALF) {
+          if (newton_pair) {
+            if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALF,1,1,true> >(0,inum),*this,ev);
+            else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALF,1,0,true> >(0,inum),*this);
+          } else {
+            if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALF,0,1,true> >(0,inum),*this,ev);
+            else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALF,0,0,true> >(0,inum),*this);
+          }
+        } else if (neighflag == HALFTHREAD) {
+          if (newton_pair) {
+            if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALFTHREAD,1,1,true> >(0,inum),*this,ev);
+            else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALFTHREAD,1,0,true> >(0,inum),*this);
+          } else {
+            if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALFTHREAD,0,1,true> >(0,inum),*this,ev);
+            else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALFTHREAD,0,0,true> >(0,inum),*this);
+          }
         }
       }
     }
@@ -209,21 +228,41 @@ void PairDPDfdtEnergyKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
 
     // loop over neighbors of my atoms
 
-    if (neighflag == HALF) {
-      if (newton_pair) {
-        if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<HALF,1,1> >(0,inum),*this,ev);
-        else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<HALF,1,0> >(0,inum),*this);
-      } else {
-        if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<HALF,0,1> >(0,inum),*this,ev);
-        else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<HALF,0,0> >(0,inum),*this);
+    if(atom->ntypes > MAX_TYPES_STACKPARAMS) { // more types than the fixed-size m_params/m_cutsq tables hold: STACKPARAMS=false kernels
+      if (neighflag == HALF) {
+        if (newton_pair) {
+          if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<HALF,1,1,false> >(0,inum),*this,ev);
+          else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<HALF,1,0,false> >(0,inum),*this);
+        } else {
+          if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<HALF,0,1,false> >(0,inum),*this,ev);
+          else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<HALF,0,0,false> >(0,inum),*this);
+        }
+      } else if (neighflag == HALFTHREAD) {
+        if (newton_pair) {
+          if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<HALFTHREAD,1,1,false> >(0,inum),*this,ev);
+          else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<HALFTHREAD,1,0,false> >(0,inum),*this);
+        } else {
+          if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<HALFTHREAD,0,1,false> >(0,inum),*this,ev);
+          else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<HALFTHREAD,0,0,false> >(0,inum),*this);
+        }
       }
-    } else if (neighflag == HALFTHREAD) {
-      if (newton_pair) {
-        if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<HALFTHREAD,1,1> >(0,inum),*this,ev);
-        else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<HALFTHREAD,1,0> >(0,inum),*this);
-      } else {
-        if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<HALFTHREAD,0,1> >(0,inum),*this,ev);
-        else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<HALFTHREAD,0,0> >(0,inum),*this);
+    } else { // type count fits the fixed-size tables: dispatch the STACKPARAMS=true kernels, mirroring the Split path above
+      if (neighflag == HALF) {
+        if (newton_pair) {
+          if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<HALF,1,1,true> >(0,inum),*this,ev);
+          else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<HALF,1,0,true> >(0,inum),*this);
+        } else {
+          if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<HALF,0,1,true> >(0,inum),*this,ev);
+          else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<HALF,0,0,true> >(0,inum),*this);
+        }
+      } else if (neighflag == HALFTHREAD) {
+        if (newton_pair) {
+          if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<HALFTHREAD,1,1,true> >(0,inum),*this,ev);
+          else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<HALFTHREAD,1,0,true> >(0,inum),*this);
+        } else {
+          if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<HALFTHREAD,0,1,true> >(0,inum),*this,ev);
+          else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<HALFTHREAD,0,0,true> >(0,inum),*this);
+        }
       }
     }
 
@@ -270,9 +309,9 @@ void PairDPDfdtEnergyKokkos<DeviceType>::operator()(TagPairDPDfdtEnergyZero, con
 }
 
 template<class DeviceType>
-template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
+template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG, bool STACKPARAMS>
 KOKKOS_INLINE_FUNCTION
-void PairDPDfdtEnergyKokkos<DeviceType>::operator()(TagPairDPDfdtEnergyComputeSplit<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int &ii, EV_FLOAT& ev) const {
+void PairDPDfdtEnergyKokkos<DeviceType>::operator()(TagPairDPDfdtEnergyComputeSplit<NEIGHFLAG,NEWTON_PAIR,EVFLAG,STACKPARAMS>, const int &ii, EV_FLOAT& ev) const {
 
   // The f array is atomic for Half/Thread neighbor style
   Kokkos::View<F_FLOAT*[3], typename DAT::t_f_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_f = f;
@@ -346,17 +385,17 @@ void PairDPDfdtEnergyKokkos<DeviceType>::operator()(TagPairDPDfdtEnergyComputeSp
 }
 
 template<class DeviceType>
-template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
+template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG, bool STACKPARAMS>
 KOKKOS_INLINE_FUNCTION
-void PairDPDfdtEnergyKokkos<DeviceType>::operator()(TagPairDPDfdtEnergyComputeSplit<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int &ii) const {
+void PairDPDfdtEnergyKokkos<DeviceType>::operator()(TagPairDPDfdtEnergyComputeSplit<NEIGHFLAG,NEWTON_PAIR,EVFLAG,STACKPARAMS>, const int &ii) const {
   EV_FLOAT ev;
-  this->template operator()<NEIGHFLAG,NEWTON_PAIR,EVFLAG>(TagPairDPDfdtEnergyComputeSplit<NEIGHFLAG,NEWTON_PAIR,EVFLAG>(), ii, ev);
+  this->template operator()<NEIGHFLAG,NEWTON_PAIR,EVFLAG>(TagPairDPDfdtEnergyComputeSplit<NEIGHFLAG,NEWTON_PAIR,EVFLAG,STACKPARAMS>(), ii, ev);
 }
 
 template<class DeviceType>
-template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
+template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG, bool STACKPARAMS>
 KOKKOS_INLINE_FUNCTION
-void PairDPDfdtEnergyKokkos<DeviceType>::operator()(TagPairDPDfdtEnergyComputeNoSplit<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int &ii, EV_FLOAT& ev) const {
+void PairDPDfdtEnergyKokkos<DeviceType>::operator()(TagPairDPDfdtEnergyComputeNoSplit<NEIGHFLAG,NEWTON_PAIR,EVFLAG,STACKPARAMS>, const int &ii, EV_FLOAT& ev) const {
 
   // These array are atomic for Half/Thread neighbor style
   Kokkos::View<F_FLOAT*[3], typename DAT::t_f_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_f = f;
@@ -503,11 +542,11 @@ void PairDPDfdtEnergyKokkos<DeviceType>::operator()(TagPairDPDfdtEnergyComputeNo
 }
 
 template<class DeviceType>
-template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
+template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG, bool STACKPARAMS>
 KOKKOS_INLINE_FUNCTION
-void PairDPDfdtEnergyKokkos<DeviceType>::operator()(TagPairDPDfdtEnergyComputeNoSplit<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int &ii) const {
+void PairDPDfdtEnergyKokkos<DeviceType>::operator()(TagPairDPDfdtEnergyComputeNoSplit<NEIGHFLAG,NEWTON_PAIR,EVFLAG,STACKPARAMS>, const int &ii) const {
   EV_FLOAT ev;
-  this->template operator()<NEIGHFLAG,NEWTON_PAIR,EVFLAG>(TagPairDPDfdtEnergyComputeNoSplit<NEIGHFLAG,NEWTON_PAIR,EVFLAG>(), ii, ev);
+  this->template operator()<NEIGHFLAG,NEWTON_PAIR,EVFLAG>(TagPairDPDfdtEnergyComputeNoSplit<NEIGHFLAG,NEWTON_PAIR,EVFLAG,STACKPARAMS>(), ii, ev);
 }
 
 /* ----------------------------------------------------------------------
diff --git a/src/KOKKOS/pair_dpd_fdt_energy_kokkos.h b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.h
index 7d1749eb94..9689712273 100644
--- a/src/KOKKOS/pair_dpd_fdt_energy_kokkos.h
+++ b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.h
@@ -32,10 +32,10 @@ namespace LAMMPS_NS {
 
 struct TagPairDPDfdtEnergyZero{};
 
-template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
+template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG, bool STACKPARAMS>
 struct TagPairDPDfdtEnergyComputeSplit{};
 
-template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
+template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG, bool STACKPARAMS>
 struct TagPairDPDfdtEnergyComputeNoSplit{};
 
 template<class DeviceType>
@@ -54,21 +54,21 @@ class PairDPDfdtEnergyKokkos : public PairDPDfdtEnergy {
   KOKKOS_INLINE_FUNCTION
   void operator()(TagPairDPDfdtEnergyZero, const int&) const;
 
-  template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
+  template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG, bool STACKPARAMS>
   KOKKOS_INLINE_FUNCTION
-  void operator()(TagPairDPDfdtEnergyComputeSplit<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int&, EV_FLOAT&) const;
+  void operator()(TagPairDPDfdtEnergyComputeSplit<NEIGHFLAG,NEWTON_PAIR,EVFLAG,STACKPARAMS>, const int&, EV_FLOAT&) const;
 
-  template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
+  template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG, bool STACKPARAMS>
   KOKKOS_INLINE_FUNCTION
-  void operator()(TagPairDPDfdtEnergyComputeSplit<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int&) const;
+  void operator()(TagPairDPDfdtEnergyComputeSplit<NEIGHFLAG,NEWTON_PAIR,EVFLAG,STACKPARAMS>, const int&) const;
 
-  template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
+  template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG, bool STACKPARAMS>
   KOKKOS_INLINE_FUNCTION
-  void operator()(TagPairDPDfdtEnergyComputeNoSplit<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int&, EV_FLOAT&) const;
+  void operator()(TagPairDPDfdtEnergyComputeNoSplit<NEIGHFLAG,NEWTON_PAIR,EVFLAG,STACKPARAMS>, const int&, EV_FLOAT&) const;
 
-  template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
+  template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG, bool STACKPARAMS>
   KOKKOS_INLINE_FUNCTION
-  void operator()(TagPairDPDfdtEnergyComputeNoSplit<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int&) const;
+  void operator()(TagPairDPDfdtEnergyComputeNoSplit<NEIGHFLAG,NEWTON_PAIR,EVFLAG,STACKPARAMS>, const int&) const;
 
   template<int NEIGHFLAG, int NEWTON_PAIR>
   KOKKOS_INLINE_FUNCTION
@@ -92,7 +92,6 @@ class PairDPDfdtEnergyKokkos : public PairDPDfdtEnergy {
  protected:
   int eflag,vflag;
   int nlocal,neighflag;
-  int STACKPARAMS;
   double dtinvsqrt;
   double boltz,ftm2v;
   double special_lj[4];
@@ -102,11 +101,10 @@ class PairDPDfdtEnergyKokkos : public PairDPDfdtEnergy {
   Kokkos::DualView<params_dpd**,Kokkos::LayoutRight,DeviceType> k_params;
   typename Kokkos::DualView<params_dpd**,
     Kokkos::LayoutRight,DeviceType>::t_dev_const_um params;
-  // hardwired to space for 15 atom types
+  // hardwired to space for MAX_TYPES_STACKPARAMS (12) atom types
   params_dpd m_params[MAX_TYPES_STACKPARAMS+1][MAX_TYPES_STACKPARAMS+1];
 
   F_FLOAT m_cutsq[MAX_TYPES_STACKPARAMS+1][MAX_TYPES_STACKPARAMS+1];
-  F_FLOAT m_cut[MAX_TYPES_STACKPARAMS+1][MAX_TYPES_STACKPARAMS+1];
   typename ArrayTypes<DeviceType>::t_x_array_randomread x;
   typename ArrayTypes<DeviceType>::t_x_array c_x;
   typename ArrayTypes<DeviceType>::t_v_array_randomread v;

From aecafecaa2f89f6db8c90ec5af0db429e736b82e Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Fri, 24 Feb 2017 13:21:26 -0500
Subject: [PATCH 153/267] USER-DPD: fix missing host prefixes in
 AtomVecDPDKokkos::pack_comm

---
 src/KOKKOS/atom_vec_dpd_kokkos.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/KOKKOS/atom_vec_dpd_kokkos.cpp b/src/KOKKOS/atom_vec_dpd_kokkos.cpp
index f46f284f14..18f63599e4 100644
--- a/src/KOKKOS/atom_vec_dpd_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_dpd_kokkos.cpp
@@ -563,10 +563,10 @@ int AtomVecDPDKokkos::pack_comm(int n, int *list, double *buf,
       buf[m++] = h_x(j,0);
       buf[m++] = h_x(j,1);
       buf[m++] = h_x(j,2);
-      buf[m++] = dpdTheta[j];
-      buf[m++] = uCond[j];
-      buf[m++] = uMech[j];
-      buf[m++] = uChem[j];
+      buf[m++] = h_dpdTheta[j];
+      buf[m++] = h_uCond[j];
+      buf[m++] = h_uMech[j];
+      buf[m++] = h_uChem[j];
     }
   } else {
     if (domain->triclinic == 0) {

From 2f04e87d0794c66e9fbe0073690e64f7353cfcec Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Fri, 24 Feb 2017 13:24:18 -0500
Subject: [PATCH 154/267] USER-DPD: make PairDPDfdtEnergyKokkos's rand_pool
 public so it can be reused

---
 src/KOKKOS/pair_dpd_fdt_energy_kokkos.h | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/KOKKOS/pair_dpd_fdt_energy_kokkos.h b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.h
index 9689712273..deb264c37e 100644
--- a/src/KOKKOS/pair_dpd_fdt_energy_kokkos.h
+++ b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.h
@@ -89,6 +89,15 @@ class PairDPDfdtEnergyKokkos : public PairDPDfdtEnergy {
 
   DAT::tdual_efloat_1d k_duCond,k_duMech;
 
+  Kokkos::Random_XorShift64_Pool<DeviceType> rand_pool;
+  typedef typename Kokkos::Random_XorShift64_Pool<DeviceType>::generator_type rand_type;
+
+  // RandPoolWrap rand_pool;
+  // typedef RandWrap rand_type;
+
+  typename ArrayTypes<DeviceType>::tdual_ffloat_2d k_cutsq;
+  typename ArrayTypes<DeviceType>::t_ffloat_2d d_cutsq;
+
  protected:
   int eflag,vflag;
   int nlocal,neighflag;
@@ -125,15 +134,6 @@ class PairDPDfdtEnergyKokkos : public PairDPDfdtEnergy {
   typename AT::t_int_1d_randomread d_ilist;
   typename AT::t_int_1d_randomread d_numneigh;
 
-  typename ArrayTypes<DeviceType>::tdual_ffloat_2d k_cutsq;
-  typename ArrayTypes<DeviceType>::t_ffloat_2d d_cutsq;
-
-  /**/Kokkos::Random_XorShift64_Pool<DeviceType> rand_pool;
-  typedef typename Kokkos::Random_XorShift64_Pool<DeviceType>::generator_type rand_type;/**/
-
-  /**RandPoolWrap rand_pool;
-  typedef RandWrap rand_type;/**/
-
   friend void pair_virial_fdotr_compute<PairDPDfdtEnergyKokkos>(PairDPDfdtEnergyKokkos*);
 };
 

From a341a6bca927e84a9fc947e402459466ceded503 Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Fri, 24 Feb 2017 13:27:27 -0500
Subject: [PATCH 155/267] USER-DPD: make locals & ghosts use similar SSA work
 plan data structure Kokkos SSA won't use AIRct_ssa[], but still used for
 non-Kokkos for now.

---
 src/KOKKOS/npair_ssa_kokkos.cpp | 31 +++++++++++++++++++++++++------
 src/KOKKOS/npair_ssa_kokkos.h   | 20 ++++++++++++++++++++
 2 files changed, 45 insertions(+), 6 deletions(-)

diff --git a/src/KOKKOS/npair_ssa_kokkos.cpp b/src/KOKKOS/npair_ssa_kokkos.cpp
index 752fc0c938..c70fd0087e 100644
--- a/src/KOKKOS/npair_ssa_kokkos.cpp
+++ b/src/KOKKOS/npair_ssa_kokkos.cpp
@@ -31,7 +31,7 @@ namespace LAMMPS_NS {
 /* ---------------------------------------------------------------------- */
 
 template<class DeviceType>
-NPairSSAKokkos<DeviceType>::NPairSSAKokkos(LAMMPS *lmp) : NPair(lmp), ssa_phaseCt(27)
+NPairSSAKokkos<DeviceType>::NPairSSAKokkos(LAMMPS *lmp) : NPair(lmp), ssa_phaseCt(27), ssa_gphaseCt(7)
 {
 }
 
@@ -214,6 +214,7 @@ void NPairSSAKokkos<DeviceType>::build(NeighList *list_)
   int ybin = (lbinyhi - lbinylo + sy1 - 1) / sy1 + 1;
   int zbin = (lbinzhi - lbinzlo + sz1 - 1) / sz1 + 1;
   int phaseLenEstimate = xbin*ybin*zbin;
+  int gphaseLenEstimate = 1; //FIXME make this 4 eventually
 
   if (ssa_phaseCt > (int) k_ssa_phaseLen.dimension_0()) {
     k_ssa_phaseLen = DAT::tdual_int_1d("NPairSSAKokkos:ssa_phaseLen",ssa_phaseCt);
@@ -227,6 +228,18 @@ void NPairSSAKokkos<DeviceType>::build(NeighList *list_)
     ssa_itemLen = k_ssa_itemLen.view<DeviceType>();
   }
 
+  if (ssa_gphaseCt > (int) k_ssa_gphaseLen.dimension_0()) {
+    k_ssa_gphaseLen = DAT::tdual_int_1d("NPairSSAKokkos:ssa_gphaseLen",ssa_gphaseCt);
+    ssa_gphaseLen = k_ssa_gphaseLen.view<DeviceType>();
+  }
+  if ((ssa_gphaseCt > (int) k_ssa_gitemLoc.dimension_0()) ||
+      (gphaseLenEstimate > (int) k_ssa_gitemLoc.dimension_1())) {
+    k_ssa_gitemLoc = DAT::tdual_int_2d("NPairSSAKokkos::ssa_gitemLoc",ssa_gphaseCt,gphaseLenEstimate);
+    ssa_gitemLoc = k_ssa_gitemLoc.view<DeviceType>();
+    k_ssa_gitemLen = DAT::tdual_int_2d("NPairSSAKokkos::ssa_gitemLen",ssa_gphaseCt,gphaseLenEstimate);
+    ssa_gitemLen = k_ssa_gitemLen.view<DeviceType>();
+  }
+
   NPairSSAKokkosExecute<DeviceType>
     data(*list,
          k_cutneighsq.view<DeviceType>(),
@@ -243,6 +256,10 @@ void NPairSSAKokkos<DeviceType>::build(NeighList *list_)
          k_ssa_phaseLen.view<DeviceType>(),
          k_ssa_itemLoc.view<DeviceType>(),
          k_ssa_itemLen.view<DeviceType>(),
+         ssa_gphaseCt,
+         k_ssa_gphaseLen.view<DeviceType>(),
+         k_ssa_gitemLoc.view<DeviceType>(),
+         k_ssa_gitemLen.view<DeviceType>(),
          nlocal,
          atomKK->k_x.view<DeviceType>(),
          atomKK->k_type.view<DeviceType>(),
@@ -444,12 +461,13 @@ void NPairSSAKokkosExecute<DeviceType>::build_ghosts()
   int which = 0;
   int inum = neigh_list.inum;
   int gnum = 0;
-  neigh_list.AIRct_ssa[0] = inum; //FIXME
 
   // loop over AIR ghost atoms, storing their local neighbors
   // since these are ghosts, must check if stencil bin is out of bounds
-  for (int airnum = 1; airnum <= 7; airnum++) {
-    int locAIRct = 0;
+  for (int workPhase = 0; workPhase < ssa_gphaseCt; workPhase++) {
+    int airnum = workPhase + 1;
+    int workItem = 0; //FIXME for now, there is only 1 workItem for each ghost AIR
+    d_ssa_gitemLoc(workPhase, workItem) = inum + gnum; // record where workItem starts in ilist
     for (int il = 0; il < c_gbincount(airnum); ++il) {
       const int i = c_gbins(airnum, il);
       n = 0;
@@ -521,10 +539,11 @@ void NPairSSAKokkosExecute<DeviceType>::build_ghosts()
           resize() = 1;
           if(n > new_maxneighs()) Kokkos::atomic_fetch_max(&new_maxneighs(),n);
         }
-        ++locAIRct;
       }
     }
-    neigh_list.AIRct_ssa[airnum] = locAIRct; //FIXME
+    // record where workItem ends in ilist
+    d_ssa_gitemLen(workPhase,workItem) = inum + gnum - d_ssa_gitemLoc(workPhase,workItem);
+    if (d_ssa_gitemLen(workPhase,workItem) > 0) workItem++;
   }
   neigh_list.gnum = gnum; //FIXME
 }
diff --git a/src/KOKKOS/npair_ssa_kokkos.h b/src/KOKKOS/npair_ssa_kokkos.h
index a656fe32ba..e38d648984 100644
--- a/src/KOKKOS/npair_ssa_kokkos.h
+++ b/src/KOKKOS/npair_ssa_kokkos.h
@@ -47,6 +47,14 @@ class NPairSSAKokkos : public NPair {
   typename AT::t_int_2d ssa_itemLoc;
   typename AT::t_int_2d ssa_itemLen;
 
+  const int ssa_gphaseCt;
+  DAT::tdual_int_1d k_ssa_gphaseLen;
+  DAT::tdual_int_2d k_ssa_gitemLoc;
+  DAT::tdual_int_2d k_ssa_gitemLen;
+  typename AT::t_int_1d ssa_gphaseLen;
+  typename AT::t_int_2d ssa_gitemLoc;
+  typename AT::t_int_2d ssa_gitemLen;
+
   NPairSSAKokkos(class LAMMPS *);
   ~NPairSSAKokkos() {}
   void copy_neighbor_info();
@@ -169,6 +177,10 @@ class NPairSSAKokkosExecute
   typename AT::t_int_1d d_ssa_phaseLen;
   typename AT::t_int_2d d_ssa_itemLoc;
   typename AT::t_int_2d d_ssa_itemLen;
+  int ssa_gphaseCt;
+  typename AT::t_int_1d d_ssa_gphaseLen;
+  typename AT::t_int_2d d_ssa_gitemLoc;
+  typename AT::t_int_2d d_ssa_gitemLen;
 
   NPairSSAKokkosExecute(
         const NeighListKokkos<DeviceType> &_neigh_list,
@@ -188,6 +200,10 @@ class NPairSSAKokkosExecute
         const typename AT::t_int_1d &_d_ssa_phaseLen,
         const typename AT::t_int_2d &_d_ssa_itemLoc,
         const typename AT::t_int_2d &_d_ssa_itemLen,
+        const int _ssa_gphaseCt,
+        const typename AT::t_int_1d &_d_ssa_gphaseLen,
+        const typename AT::t_int_2d &_d_ssa_gitemLoc,
+        const typename AT::t_int_2d &_d_ssa_gitemLen,
         const int _nlocal,
         const typename AT::t_x_array_randomread &_x,
         const typename AT::t_int_1d_const &_type,
@@ -228,6 +244,10 @@ class NPairSSAKokkosExecute
     d_ssa_phaseLen(_d_ssa_phaseLen),
     d_ssa_itemLoc(_d_ssa_itemLoc),
     d_ssa_itemLen(_d_ssa_itemLen),
+    ssa_gphaseCt(_ssa_gphaseCt),
+    d_ssa_gphaseLen(_d_ssa_gphaseLen),
+    d_ssa_gitemLoc(_d_ssa_gitemLoc),
+    d_ssa_gitemLen(_d_ssa_gitemLen),
     nlocal(_nlocal),
     x(_x),type(_type),mask(_mask),molecule(_molecule),
     tag(_tag),special(_special),nspecial(_nspecial),molecular(_molecular),

From f7a48719adba859eede3808e69556f1e33e4dbf0 Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Fri, 24 Feb 2017 13:35:48 -0500
Subject: [PATCH 156/267] USER-DPD: first attempt at fix_shardlow_kokkos... It
 compiles!

---
 src/KOKKOS/fix_shardlow_kokkos.cpp | 718 +++++++++++++++++++++++++++++
 src/KOKKOS/fix_shardlow_kokkos.h   | 154 +++++++
 2 files changed, 872 insertions(+)
 create mode 100644 src/KOKKOS/fix_shardlow_kokkos.cpp
 create mode 100644 src/KOKKOS/fix_shardlow_kokkos.h

diff --git a/src/KOKKOS/fix_shardlow_kokkos.cpp b/src/KOKKOS/fix_shardlow_kokkos.cpp
new file mode 100644
index 0000000000..7b2810bb4c
--- /dev/null
+++ b/src/KOKKOS/fix_shardlow_kokkos.cpp
@@ -0,0 +1,718 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors:
+   James Larentzos (U.S. Army Research Laboratory)
+   and Timothy I. Mattox (Engility Corporation)
+
+   Martin Lisal (Institute of Chemical Process Fundamentals
+   of the Czech Academy of Sciences and J. E. Purkinje University)
+
+   John Brennan, Joshua Moore and William Mattson (Army Research Lab)
+
+   Please cite the related publications:
+   J. P. Larentzos, J. K. Brennan, J. D. Moore, M. Lisal, W. D. Mattson,
+   "Parallel implementation of isothermal and isoenergetic Dissipative
+   Particle Dynamics using Shardlow-like splitting algorithms",
+   Computer Physics Communications, 2014, 185, pp 1987--1998.
+
+   M. Lisal, J. K. Brennan, J. Bonet Avalos, "Dissipative particle dynamics
+   at isothermal, isobaric, isoenergetic, and isoenthalpic conditions using
+   Shardlow-like splitting algorithms", Journal of Chemical Physics, 2011,
+   135, 204105.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "fix_shardlow_kokkos.h"
+#include "atom.h"
+#include "atom_masks.h"
+#include "atom_kokkos.h"
+#include "force.h"
+#include "update.h"
+#include "respa.h"
+#include "error.h"
+#include <math.h>
+#include "atom_vec.h"
+#include "comm.h"
+#include "neighbor.h"
+#include "neigh_list_kokkos.h"
+#include "neigh_request.h"
+#include "random_mars.h"
+#include "memory.h"
+#include "domain.h"
+#include "modify.h"
+// #include "pair_dpd_fdt.h"
+#include "pair_dpd_fdt_energy_kokkos.h"
+#include "pair.h"
+#include "npair_ssa_kokkos.h"
+#include "citeme.h"
+
+using namespace LAMMPS_NS;
+using namespace FixConst;
+
+#define EPSILON 1.0e-10
+#define EPSILON_SQUARED ((EPSILON) * (EPSILON))
+
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+FixShardlowKokkos<DeviceType>::FixShardlowKokkos(LAMMPS *lmp, int narg, char **arg) :
+  FixShardlow(lmp, narg, arg), k_pairDPDE(NULL), ghostmax(0), nlocal(0) , nghost(0)
+{
+  kokkosable = 1;
+//  atomKK = (AtomKokkos *) atom;
+//  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
+
+//  datamask_read = X_MASK | V_MASK | F_MASK | MASK_MASK | Q_MASK | TYPE_MASK;
+//  datamask_modify = Q_MASK | X_MASK;
+
+  if (narg != 3) error->all(FLERR,"Illegal fix shardlow command");
+
+  // locate the Kokkos constant-energy DPD pair style this fix relies on
+//  k_pairDPD = (PairDPDfdtKokkos *) force->pair_match("dpd/fdt",1);
+  k_pairDPDE = (PairDPDfdtEnergyKokkos<DeviceType> *) force->pair_match("dpd/fdt/energy/kk",1);
+
+  // the pair style must be validated BEFORE any of its members are touched:
+  // the rand_pool access below would otherwise go through a NULL pointer
+  if(/* k_pairDPD == NULL &&*/ k_pairDPDE == NULL)
+    error->all(FLERR,"Must use pair_style "/*"dpd/fdt/kk or "*/"dpd/fdt/energy/kk with fix shardlow/kk");
+
+//   if(k_pairDPDE){
+    comm_forward = 3;  // values per atom written by pack_forward_comm
+    comm_reverse = 5;  // values per atom written by pack_reverse_comm
+    p_rand_pool = &(k_pairDPDE->rand_pool);
+//   } else {
+//     comm_forward = 3;
+//     comm_reverse = 3;
+//     p_rand_pool = &(k_pairDPD->rand_pool);
+//   }
+
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+FixShardlowKokkos<DeviceType>::~FixShardlowKokkos()
+{
+  this->ghostmax = 0;  // nothing is freed explicitly here; only the ghost capacity is reset
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+int FixShardlowKokkos<DeviceType>::setmask()
+{
+  // this fix takes part in two stages: initial_integrate and
+  // pre_neighbor
+  return INITIAL_INTEGRATE | PRE_NEIGHBOR;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void FixShardlowKokkos<DeviceType>::init()
+{
+  FixShardlow::init(); // base-class setup; presumably issues the neighbor request used below -- confirm
+
+  int irequest = neighbor->nrequest - 1; // index of the most recently added request
+
+  neighbor->requests[irequest]->
+    kokkos_host = Kokkos::Impl::is_same<DeviceType,LMPHostType>::value &&
+    !Kokkos::Impl::is_same<DeviceType,LMPDeviceType>::value;
+  neighbor->requests[irequest]->
+    kokkos_device = Kokkos::Impl::is_same<DeviceType,LMPDeviceType>::value;
+
+//  neighbor->requests[irequest]->pair = 0;
+//  neighbor->requests[irequest]->fix  = 1;
+//  neighbor->requests[irequest]->ghost= 1;
+//  neighbor->requests[irequest]->ssa  = 1;
+
+  int ntypes = atom->ntypes;
+  k_params = Kokkos::DualView<params_ssa**,Kokkos::LayoutRight,DeviceType>
+    ("FixShardlowKokkos::params",ntypes+1,ntypes+1); // (ntypes+1)^2 table, indexed from 1 below
+  params = k_params.template view<DeviceType>();
+//FIXME either create cutsq and fill it in, or just point to pairDPD's...
+//  memory->destroy(cutsq); //FIXME
+//  memory->create_kokkos(k_cutsq,cutsq,ntypes+1,ntypes+1,"FixShardlowKokkos:cutsq");
+  d_cutsq = k_pairDPDE->k_cutsq.template view<DeviceType>(); //FIXME
+
+  const double boltz2 = 2.0*force->boltz;
+  for (int i = 1; i <= ntypes; i++) {
+    for (int j = i; j <= ntypes; j++) { // upper triangle only; mirrored to (j,i) below
+      F_FLOAT cutone = k_pairDPDE->cut[i][j];
+//      k_cutsq.h_view(i,j) = k_cutsq.h_view(j,i) = cutone*cutone; //FIXME
+      if (cutone > EPSILON) k_params.h_view(i,j).cutinv = 1.0/cutone;
+      else k_params.h_view(i,j).cutinv = FLT_MAX; // (near-)zero cutoff: store a sentinel instead of dividing
+      k_params.h_view(i,j).halfsigma = 0.5*k_pairDPDE->sigma[i][j];
+      k_params.h_view(i,j).kappa = k_pairDPDE->kappa[i][j];
+      k_params.h_view(i,j).alpha = sqrt(boltz2*k_pairDPDE->kappa[i][j]);
+
+      k_params.h_view(j,i) = k_params.h_view(i,j); // table is symmetric in (i,j)
+
+      if(i<MAX_TYPES_STACKPARAMS+1 && j<MAX_TYPES_STACKPARAMS+1) { // also fill the fixed-size member tables
+        m_params[i][j] = m_params[j][i] = k_params.h_view(i,j);
+        m_cutsq[j][i] = m_cutsq[i][j] = k_pairDPDE->k_cutsq.h_view(i,j);
+      }
+    }
+  }
+
+  // k_cutsq.template modify<LMPHostType>();
+  k_params.template modify<LMPHostType>(); // flag the host side as modified; NOTE(review): no sync to device is issued here
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void FixShardlowKokkos<DeviceType>::init_list(int id, NeighList *ptr)
+{
+  k_list = static_cast<NeighListKokkos<DeviceType>*>(ptr);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void FixShardlowKokkos<DeviceType>::pre_neighbor()
+{
+  // NOTE: this logic is specific to orthogonal boxes, not triclinic
+
+  // Enforce the constraint that ghosts must be contained in the nearest sub-domains
+  double bbx = domain->subhi[0] - domain->sublo[0]; // sub-domain extent in x
+  double bby = domain->subhi[1] - domain->sublo[1]; // sub-domain extent in y
+  double bbz = domain->subhi[2] - domain->sublo[2]; // sub-domain extent in z
+
+  double rcut = 2.0*neighbor->cutneighmax; // twice the largest neighbor cutoff
+
+  if (domain->triclinic)
+    error->all(FLERR,"Fix shardlow does not yet support triclinic geometries");
+
+  if(rcut >= bbx || rcut >= bby || rcut>= bbz )
+  {
+    char fmt[] = {"Shardlow algorithm requires sub-domain length > 2*(rcut+skin). Either reduce the number of processors requested, or change the cutoff/skin: rcut= %e bbx= %e bby= %e bbz= %e\n"};
+    char *msg = (char *) malloc(sizeof(fmt) + 4*15); // room for the format plus 15 extra chars per %e value
+    sprintf(msg, fmt, rcut, bbx, bby, bbz);
+    error->one(FLERR, msg); // msg is never freed; presumably error->one does not return -- confirm
+  }
+
+  nlocal = atomKK->nlocal;
+  nghost = atomKK->nghost;
+
+  // Allocate memory for h_v_t0 to hold the initial velocities for the ghosts
+  if (nghost > ghostmax) { // grow only; the buffer is never shrunk
+    ghostmax = nghost;
+    k_v_t0 = DAT::tdual_v_array("FixShardlowKokkos:v_t0", ghostmax);
+    // d_v_t0 = k_v_t0.template view<DeviceType>();
+    h_v_t0 = k_v_t0.h_view;
+  }
+
+  // Setup views of relevant data
+  x = atomKK->k_x.template view<DeviceType>();
+  v = atomKK->k_v.template view<DeviceType>();
+  h_v = atomKK->k_v.h_view;
+  uCond = atomKK->k_uCond.template view<DeviceType>();
+  h_uCond = atomKK->k_uCond.h_view;
+  uMech = atomKK->k_uMech.template view<DeviceType>();
+  h_uMech = atomKK->k_uMech.h_view;
+  type = atomKK->k_type.view<DeviceType>();
+  if (atomKK->rmass) { // per-atom masses present: index `masses` by atom
+    massPerI = true;
+    masses = atomKK->k_rmass.view<DeviceType>();
+  } else { // otherwise index `masses` by atom type
+    massPerI = false;
+    masses = atomKK->k_mass.view<DeviceType>();
+  }
+//   if(k_pairDPDE){
+  dpdTheta = atomKK->k_dpdTheta.view<DeviceType>();
+
+//} else {
+//}
+}
+
+template<class DeviceType>
+void FixShardlowKokkos<DeviceType>::setup_pre_neighbor()
+{
+  this->pre_neighbor();  // setup does exactly the same work as a regular pre_neighbor call
+}
+
+/* ---------------------------------------------------------------------- */
+
+#ifdef NOTNOW
+/* ----------------------------------------------------------------------
+   Shardlow stochastic update (constant temperature) for the `count`
+   neighbor-list rows starting at row start_ii; handles per-type and
+   per-atom mass.  STACKPARAMS picks the member tables over device views.
+   NOTE: only implemented for orthogonal boxes, not triclinic
+------------------------------------------------------------------------- */
+template<class DeviceType>
+template<bool STACKPARAMS>
+void FixShardlowKokkos<DeviceType>::ssa_update_dpd(
+  int start_ii, int count
+)
+{
+  rand_type rand_gen = p_rand_pool->get_state();
+
+  const double theta_ij_inv = 1.0/k_pairDPD->temperature; // independent of i,j
+  const double boltz_inv = 1.0/force->boltz;
+  const double ftm2v = force->ftm2v;
+  const double dt     = update->dt;
+  int ct = count;
+  int ii = start_ii;
+
+  for (; ct-- > 0; ii++) { // step to the next row each pass (ii was never advanced before)
+    const int i = d_ilist(ii);
+    const int jlen = d_numneigh(ii);
+
+    const double xtmp = x(i, 0);
+    const double ytmp = x(i, 1);
+    const double ztmp = x(i, 2);
+
+    // load velocity for i from memory
+    double vxi = v(i, 0);
+    double vyi = v(i, 1);
+    double vzi = v(i, 2);
+
+    const int itype = type(i);
+
+    const double mass_i = masses(massPerI ? i : itype);
+    const double massinv_i = 1.0 / mass_i;
+
+    // Loop over Directional Neighbors only
+    for (int jj = 0; jj < jlen; jj++) {
+      const int j = d_neighbors(ii,jj) & NEIGHMASK;
+      int jtype = type(j);
+
+      const X_FLOAT delx = xtmp - x(j, 0);
+      const X_FLOAT dely = ytmp - x(j, 1);
+      const X_FLOAT delz = ztmp - x(j, 2);
+      const F_FLOAT rsq = delx*delx + dely*dely + delz*delz;
+
+      // NOTE: r can be 0.0 in DPD systems, so do EPSILON_SQUARED test
+      if ((rsq < (STACKPARAMS?m_cutsq[itype][jtype]:d_cutsq(itype,jtype))) // parenthesized: ?: binds looser than <
+        && (rsq >= EPSILON_SQUARED)) {
+        double r = sqrt(rsq);
+        double rinv = 1.0/r;
+        double delx_rinv = delx*rinv;
+        double dely_rinv = dely*rinv;
+        double delz_rinv = delz*rinv;
+
+        double wr = 1.0 - r*(STACKPARAMS?m_params[itype][jtype].cutinv:params(itype,jtype).cutinv);
+        double wdt = wr*wr*dt;
+
+        double halfsigma_ij = STACKPARAMS?m_params[itype][jtype].halfsigma:params(itype,jtype).halfsigma;
+        double halfgamma_ij = halfsigma_ij*halfsigma_ij*boltz_inv*theta_ij_inv;
+
+        double sigmaRand = halfsigma_ij*wr*dtsqrt*ftm2v * rand_gen.normal(); // draw from the pool state acquired above
+
+        const double mass_j = masses(massPerI ? j : jtype);
+        double massinv_j = 1.0 / mass_j;
+
+        double gammaFactor = halfgamma_ij*wdt*ftm2v;
+        double inv_1p_mu_gammaFactor = 1.0/(1.0 + (massinv_i + massinv_j)*gammaFactor);
+
+        double vxj = v(j, 0);
+        double vyj = v(j, 1);
+        double vzj = v(j, 2);
+
+        // Compute the initial velocity difference between atom i and atom j
+        double delvx = vxi - vxj;
+        double delvy = vyi - vyj;
+        double delvz = vzi - vzj;
+        double dot_rinv = (delx_rinv*delvx + dely_rinv*delvy + delz_rinv*delvz);
+
+        // Compute momentum change between t and t+dt
+        double factorA = sigmaRand - gammaFactor*dot_rinv;
+
+        // Update the velocity on i
+        vxi += delx_rinv*factorA*massinv_i;
+        vyi += dely_rinv*factorA*massinv_i;
+        vzi += delz_rinv*factorA*massinv_i;
+
+        // Update the velocity on j
+        vxj -= delx_rinv*factorA*massinv_j;
+        vyj -= dely_rinv*factorA*massinv_j;
+        vzj -= delz_rinv*factorA*massinv_j;
+
+        //ii.   Compute the new velocity diff
+        delvx = vxi - vxj;
+        delvy = vyi - vyj;
+        delvz = vzi - vzj;
+        dot_rinv = delx_rinv*delvx + dely_rinv*delvy + delz_rinv*delvz;
+
+        // Compute the new momentum change between t and t+dt
+        double factorB = (sigmaRand - gammaFactor*dot_rinv)*inv_1p_mu_gammaFactor;
+
+        // Update the velocity on i
+        vxi += delx_rinv*factorB*massinv_i;
+        vyi += dely_rinv*factorB*massinv_i;
+        vzi += delz_rinv*factorB*massinv_i;
+
+        // Update the velocity on j
+        vxj -= delx_rinv*factorB*massinv_j;
+        vyj -= dely_rinv*factorB*massinv_j;
+        vzj -= delz_rinv*factorB*massinv_j;
+
+        // Store updated velocity for j
+        v(j, 0) = vxj;
+        v(j, 1) = vyj;
+        v(j, 2) = vzj;
+      }
+    }
+    // store updated velocity for i
+    v(i, 0) = vxi;
+    v(i, 1) = vyi;
+    v(i, 2) = vzi;
+  }
+
+  p_rand_pool->free_state(rand_gen);
+}
+#endif
+
+/* ----------------------------------------------------------------------
+   Shardlow stochastic update (constant energy) for the `count` neighbor-list
+   rows starting at row start_ii; handles per-type and per-atom mass.
+   STACKPARAMS selects the member-array tables over the device views.
+   NOTE: only implemented for orthogonal boxes, not triclinic
+------------------------------------------------------------------------- */
+template<class DeviceType>
+template<bool STACKPARAMS>
+void FixShardlowKokkos<DeviceType>::ssa_update_dpde(
+  int start_ii, int count
+)
+{
+  rand_type rand_gen = p_rand_pool->get_state();
+
+  const double boltz_inv = 1.0/force->boltz;
+  const double ftm2v = force->ftm2v;
+  const double dt     = update->dt;
+  int ct = count;
+  int ii = start_ii;
+
+  while (ct-- > 0) {
+    const int i = d_ilist(ii);
+    const int jlen = d_numneigh(ii);
+
+    const double xtmp = x(i, 0);
+    const double ytmp = x(i, 1);
+    const double ztmp = x(i, 2);
+
+    // load velocity for i from memory
+    double vxi = v(i, 0);
+    double vyi = v(i, 1);
+    double vzi = v(i, 2);
+
+    double uMech_i = uMech(i);
+    double uCond_i = uCond(i);
+    const int itype = type(i);
+
+    const double theta_i_inv = 1.0/dpdTheta(i);
+    const double mass_i = masses(massPerI ? i : itype);
+    const double massinv_i = 1.0 / mass_i;
+    const double mass_i_div_neg4_ftm2v = mass_i*(-0.25)/ftm2v;
+
+    // Loop over Directional Neighbors only
+    for (int jj = 0; jj < jlen; jj++) {
+      const int j = d_neighbors(ii,jj) & NEIGHMASK;
+      const int jtype = type(j);
+
+      const X_FLOAT delx = xtmp - x(j, 0);
+      const X_FLOAT dely = ytmp - x(j, 1);
+      const X_FLOAT delz = ztmp - x(j, 2);
+      const F_FLOAT rsq = delx*delx + dely*dely + delz*delz;
+
+      // NOTE: r can be 0.0 in DPD systems, so do EPSILON_SQUARED test
+      if ((rsq < (STACKPARAMS?m_cutsq[itype][jtype]:d_cutsq(itype,jtype))) // parenthesized: ?: binds looser than <
+        && (rsq >= EPSILON_SQUARED)) {
+        double r = sqrt(rsq);
+        double rinv = 1.0/r;
+        double delx_rinv = delx*rinv;
+        double dely_rinv = dely*rinv;
+        double delz_rinv = delz*rinv;
+
+        double wr = 1.0 - r*(STACKPARAMS?m_params[itype][jtype].cutinv:params(itype,jtype).cutinv);
+        double wdt = wr*wr*dt;
+
+        // Compute the current temperature
+        double theta_j_inv = 1.0/dpdTheta(j);
+        double theta_ij_inv = 0.5*(theta_i_inv + theta_j_inv);
+
+        double halfsigma_ij = STACKPARAMS?m_params[itype][jtype].halfsigma:params(itype,jtype).halfsigma;
+        double halfgamma_ij = halfsigma_ij*halfsigma_ij*boltz_inv*theta_ij_inv;
+
+        double sigmaRand = halfsigma_ij*wr*dtsqrt*ftm2v * rand_gen.normal();
+
+        const double mass_j = masses(massPerI ? j : jtype);
+        double mass_ij_div_neg4_ftm2v = mass_j*mass_i_div_neg4_ftm2v;
+        double massinv_j = 1.0 / mass_j;
+
+        // Compute uCond
+        double kappa_ij = STACKPARAMS?m_params[itype][jtype].kappa:params(itype,jtype).kappa;
+        double alpha_ij = STACKPARAMS?m_params[itype][jtype].alpha:params(itype,jtype).alpha;
+        double del_uCond = alpha_ij*wr*dtsqrt * rand_gen.normal();
+
+        del_uCond += kappa_ij*(theta_i_inv - theta_j_inv)*wdt;
+        uCond(j) -= del_uCond; // what i gains, j loses
+        uCond_i += del_uCond;
+
+        double gammaFactor = halfgamma_ij*wdt*ftm2v;
+        double inv_1p_mu_gammaFactor = 1.0/(1.0 + (massinv_i + massinv_j)*gammaFactor);
+
+        double vxj = v(j, 0);
+        double vyj = v(j, 1);
+        double vzj = v(j, 2);
+        double dot4 = vxj*vxj + vyj*vyj + vzj*vzj; // |v_j|^2 before the update
+        double dot3 = vxi*vxi + vyi*vyi + vzi*vzi; // |v_i|^2 before the update
+
+        // Compute the initial velocity difference between atom i and atom j
+        double delvx = vxi - vxj;
+        double delvy = vyi - vyj;
+        double delvz = vzi - vzj;
+        double dot_rinv = (delx_rinv*delvx + dely_rinv*delvy + delz_rinv*delvz);
+
+        // Compute momentum change between t and t+dt
+        double factorA = sigmaRand - gammaFactor*dot_rinv;
+
+        // Update the velocity on i
+        vxi += delx_rinv*factorA*massinv_i;
+        vyi += dely_rinv*factorA*massinv_i;
+        vzi += delz_rinv*factorA*massinv_i;
+
+        // Update the velocity on j
+        vxj -= delx_rinv*factorA*massinv_j;
+        vyj -= dely_rinv*factorA*massinv_j;
+        vzj -= delz_rinv*factorA*massinv_j;
+
+        //ii.   Compute the new velocity diff
+        delvx = vxi - vxj;
+        delvy = vyi - vyj;
+        delvz = vzi - vzj;
+        dot_rinv = delx_rinv*delvx + dely_rinv*delvy + delz_rinv*delvz;
+
+        // Compute the new momentum change between t and t+dt
+        double factorB = (sigmaRand - gammaFactor*dot_rinv)*inv_1p_mu_gammaFactor;
+
+        // Update the velocity on i
+        vxi += delx_rinv*factorB*massinv_i;
+        vyi += dely_rinv*factorB*massinv_i;
+        vzi += delz_rinv*factorB*massinv_i;
+        double partial_uMech = (vxi*vxi + vyi*vyi + vzi*vzi - dot3)*massinv_j;
+
+        // Update the velocity on j
+        vxj -= delx_rinv*factorB*massinv_j;
+        vyj -= dely_rinv*factorB*massinv_j;
+        vzj -= delz_rinv*factorB*massinv_j;
+        partial_uMech += (vxj*vxj + vyj*vyj + vzj*vzj - dot4)*massinv_i;
+
+        // Store updated velocity for j
+        v(j, 0) = vxj;
+        v(j, 1) = vyj;
+        v(j, 2) = vzj;
+
+        // Compute uMech
+        double del_uMech = partial_uMech*mass_ij_div_neg4_ftm2v;
+        uMech_i += del_uMech;
+        uMech(j) += del_uMech;
+      }
+    }
+    // store updated velocity for i
+    v(i, 0) = vxi;
+    v(i, 1) = vyi;
+    v(i, 2) = vzi;
+    // store updated uMech and uCond for i
+    uMech(i) = uMech_i;
+    uCond(i) = uCond_i;
+    ii++;
+  }
+
+  p_rand_pool->free_state(rand_gen);
+}
+
+
+template<class DeviceType>
+void FixShardlowKokkos<DeviceType>::initial_integrate(int vflag)
+{
+  d_numneigh = k_list->d_numneigh;
+  d_neighbors = k_list->d_neighbors;
+  d_ilist = k_list->d_ilist;
+
+  k_list->clean_copy();
+  //cleanup_copy();
+  copymode = 1; // restored to 0 at the end of this function
+
+  dtsqrt = sqrt(update->dt);
+
+  NPairSSAKokkos<DeviceType> *np_ssa = dynamic_cast<NPairSSAKokkos<DeviceType>*>(list->np);
+  if (!np_ssa) error->one(FLERR, "NPair wasn't a NPairSSAKokkos object");
+  ssa_phaseCt = np_ssa->ssa_phaseCt;
+  ssa_phaseLen = np_ssa->ssa_phaseLen;
+  ssa_itemLoc = np_ssa->ssa_itemLoc;
+  ssa_itemLen = np_ssa->ssa_itemLen;
+  ssa_gphaseCt = np_ssa->ssa_gphaseCt;
+  ssa_gphaseLen = np_ssa->ssa_gphaseLen;
+  ssa_gitemLoc = np_ssa->ssa_gitemLoc;
+  ssa_gitemLen = np_ssa->ssa_gitemLen;
+
+  // process neighbors in the local AIR
+  for (int workPhase = 0; workPhase < ssa_phaseCt; ++workPhase) {
+    int workItemCt = ssa_phaseLen[workPhase];
+
+    if(atom->ntypes > MAX_TYPES_STACKPARAMS) { // too many types for the fixed-size member tables
+      Kokkos::parallel_for(workItemCt, KOKKOS_LAMBDA (const int workItem ) {
+        int ct = ssa_itemLen(workPhase, workItem);
+        int ii = ssa_itemLoc(workPhase, workItem);
+        ssa_update_dpde<false>(ii, ct);
+      });
+    } else {
+      Kokkos::parallel_for(workItemCt, KOKKOS_LAMBDA (const int workItem ) {
+        int ct = ssa_itemLen(workPhase, workItem);
+        int ii = ssa_itemLoc(workPhase, workItem);
+        ssa_update_dpde<true>(ii, ct);
+      });
+    }
+  }
+
+  //Loop over all 13 outward directions (7 stages)
+  for (int workPhase = 0; workPhase < ssa_gphaseCt; ++workPhase) {
+    // int airnum = workPhase + 1;
+    int workItemCt = ssa_gphaseLen[workPhase];
+
+    // Communicate the updated velocities to all nodes
+    comm->forward_comm_fix(this);
+
+    if(k_pairDPDE){
+      // Zero out the ghosts' uCond & uMech to be used as delta accumulators
+//      memset(&(atom->uCond[nlocal]), 0, sizeof(double)*nghost);
+//      memset(&(atom->uMech[nlocal]), 0, sizeof(double)*nghost);
+
+      Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType>(nlocal,nlocal+nghost), KOKKOS_LAMBDA (const int i) {
+        uCond(i) = 0.0;
+        uMech(i) = 0.0;
+      });
+      DeviceType::fence();
+    }
+
+    // process neighbors in this AIR
+    if(atom->ntypes > MAX_TYPES_STACKPARAMS) {
+      Kokkos::parallel_for(workItemCt, KOKKOS_LAMBDA (const int workItem ) {
+        int ct = ssa_gitemLen(workPhase, workItem);
+        int ii = ssa_gitemLoc(workPhase, workItem);
+        ssa_update_dpde<false>(ii, ct);
+      });
+    } else {
+      Kokkos::parallel_for(workItemCt, KOKKOS_LAMBDA (const int workItem ) {
+        int ct = ssa_gitemLen(workPhase, workItem);
+        int ii = ssa_gitemLoc(workPhase, workItem);
+        ssa_update_dpde<true>(ii, ct);
+      });
+    }
+
+    // Communicate the ghost deltas to the atom owners
+    comm->reverse_comm_fix(this);
+
+  }  //End Loop over all directions For airnum = Top, Top-Right, Right, Bottom-Right, Back
+  copymode = 0;  // restore the flag raised at the top so it does not stay set after this call
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+int FixShardlowKokkos<DeviceType>::pack_forward_comm(int n, int *list, double *buf, int pbc_flag, int *pbc)
+{
+  // copy the host-side velocity of every listed atom into buf, three
+  // values per atom; pbc_flag and pbc are not used
+  int nbuf = 0;
+  for (int k = 0; k < n; ++k) {
+    const int atomIdx = list[k];
+    for (int d = 0; d < 3; ++d)
+      buf[nbuf++] = h_v(atomIdx, d);
+  }
+
+  return nbuf;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void FixShardlowKokkos<DeviceType>::unpack_forward_comm(int n, int first, double *buf)
+{
+  // write the received velocities into h_v for atoms [first, first+n) and
+  // keep a second copy in h_v_t0, indexed relative to nlocal; the copy is
+  // what pack_reverse_comm later subtracts to obtain the velocity change
+  const double *in = buf;
+  const int last = first + n;
+  for (int g = first; g < last; ++g) {
+    for (int d = 0; d < 3; ++d)
+      h_v_t0(g - nlocal, d) = h_v(g, d) = *in++;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+int FixShardlowKokkos<DeviceType>::pack_reverse_comm(int n, int first, double *buf)
+{
+  // for each atom in [first, first+n): send its velocity change relative to
+  // the snapshot in h_v_t0, followed (constant-energy case) by the uCond
+  // and uMech values, which for ghosts hold accumulated deltas
+  const bool sendEnergy = (k_pairDPDE != NULL);
+  double *out = buf;
+  for (int g = first; g < first + n; ++g) {
+    for (int d = 0; d < 3; ++d)
+      *out++ = h_v(g, d) - h_v_t0(g - nlocal, d);
+    if (!sendEnergy) continue;
+    *out++ = h_uCond(g);
+    *out++ = h_uMech(g);
+  }
+
+  return static_cast<int>(out - buf);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void FixShardlowKokkos<DeviceType>::unpack_reverse_comm(int n, int *list, double *buf)
+{
+  // add each received per-atom record onto the atom named in list[]:
+  // three velocity increments, then (constant-energy case) the
+  // accumulated uCond and uMech deltas
+  const bool haveEnergy = (k_pairDPDE != NULL);
+  const double *in = buf;
+  for (int k = 0; k < n; ++k) {
+    const int owner = list[k];
+    for (int d = 0; d < 3; ++d)
+      h_v(owner, d) += *in++;
+
+    if (!haveEnergy) continue;
+    h_uCond(owner) += *in++;
+    h_uMech(owner) += *in++;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+double FixShardlowKokkos<DeviceType>::memory_usage()
+{
+  // reports only the ghost velocity snapshot v_t0[]: three doubles for
+  // each of the ghostmax slots
+  return 0.0 + sizeof(double)*3*ghostmax;
+}
+
+namespace LAMMPS_NS {
+template class FixShardlowKokkos<LMPDeviceType>; // explicit instantiation for the device type
+#ifdef KOKKOS_HAVE_CUDA
+template class FixShardlowKokkos<LMPHostType>; // separate host instantiation, only when KOKKOS_HAVE_CUDA is defined
+#endif
+}
diff --git a/src/KOKKOS/fix_shardlow_kokkos.h b/src/KOKKOS/fix_shardlow_kokkos.h
new file mode 100644
index 0000000000..08d9034fdf
--- /dev/null
+++ b/src/KOKKOS/fix_shardlow_kokkos.h
@@ -0,0 +1,154 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+
+FixStyle(shardlow/kk,FixShardlowKokkos<LMPDeviceType>)
+FixStyle(shardlow/kk/device,FixShardlowKokkos<LMPDeviceType>)
+FixStyle(shardlow/kk/host,FixShardlowKokkos<LMPHostType>)
+
+#else
+
+#ifndef LMP_FIX_SHARDLOW_KOKKOS_H
+#define LMP_FIX_SHARDLOW_KOKKOS_H
+
+#include "float.h"
+#include "fix_shardlow.h"
+#include "kokkos_type.h"
+#include "neigh_list_kokkos.h"
+#include "pair_dpd_fdt_energy_kokkos.h"
+
+namespace LAMMPS_NS {
+
+template<class DeviceType>
+class FixShardlowKokkos : public FixShardlow {
+ public:
+  typedef ArrayTypes<DeviceType> AT;
+  NeighListKokkos<DeviceType> *k_list; // The SSA specific neighbor list
+
+  FixShardlowKokkos(class LAMMPS *, int, char **);
+  ~FixShardlowKokkos();
+  int setmask();
+  virtual void init();
+  virtual void init_list(int, class NeighList *);
+  virtual void initial_integrate(int);
+  void setup_pre_neighbor();
+  void pre_neighbor();
+
+  double memory_usage();
+
+  int pack_reverse_comm(int, int, double *);
+  void unpack_reverse_comm(int, int *, double *);
+  int pack_forward_comm(int , int *, double *, int, int *);
+  void unpack_forward_comm(int , int , double *);
+
+  struct params_ssa { // per type-pair coefficients filled in by init()
+    KOKKOS_INLINE_FUNCTION
+    params_ssa(){cutinv=FLT_MAX;halfsigma=0;kappa=0;alpha=0;};
+    KOKKOS_INLINE_FUNCTION
+    params_ssa(int i){cutinv=FLT_MAX;halfsigma=0;kappa=0;alpha=0;}; // argument is ignored; same defaults as above
+    F_FLOAT cutinv,halfsigma,kappa,alpha;
+  };
+
+ protected:
+//  class PairDPDfdt *pairDPD;
+  PairDPDfdtEnergyKokkos<DeviceType> *k_pairDPDE; // pair style supplying cut, sigma, kappa, k_cutsq and rand_pool
+  Kokkos::Random_XorShift64_Pool<DeviceType> *p_rand_pool; // points at the pair style's rand_pool
+  typedef typename Kokkos::Random_XorShift64_Pool<DeviceType>::generator_type rand_type;
+
+  Kokkos::DualView<params_ssa**,Kokkos::LayoutRight,DeviceType> k_params;
+  typename Kokkos::DualView<params_ssa**,
+    Kokkos::LayoutRight,DeviceType>::t_dev_const_um params;
+  // hardwired to space for MAX_TYPES_STACKPARAMS (12) atom types
+  params_ssa m_params[MAX_TYPES_STACKPARAMS+1][MAX_TYPES_STACKPARAMS+1];
+
+  F_FLOAT m_cutsq[MAX_TYPES_STACKPARAMS+1][MAX_TYPES_STACKPARAMS+1];
+  typename ArrayTypes<DeviceType>::t_ffloat_2d d_cutsq;
+
+  typename DAT::tdual_v_array k_v_t0; // ghost velocity snapshot, sized to ghostmax
+  // typename AT::t_v_array d_v_t0; v_t0 only used in comm routines (on host)
+  typename HAT::t_v_array h_v_t0;
+
+  typename AT::t_x_array x;
+  typename AT::t_v_array v;
+  typename HAT::t_v_array h_v;
+  typename AT::t_efloat_1d uCond, uMech;
+  typename HAT::t_efloat_1d h_uCond, h_uMech;
+  typename AT::t_int_1d type;
+  bool massPerI; // true: `masses` is indexed by atom; false: by atom type
+  typename AT::t_float_1d_randomread masses;
+  typename AT::t_efloat_1d dpdTheta;
+
+  double dtsqrt; // = sqrt(update->dt);
+  int ghostmax; // current capacity of the v_t0 buffer
+  int nlocal, nghost; // cached from atomKK in pre_neighbor()
+
+  typename AT::t_neighbors_2d d_neighbors;
+  typename AT::t_int_1d_randomread d_ilist, d_numneigh;
+
+  int ssa_phaseCt; // local work phases, copied from the NPairSSAKokkos object
+  typename AT::t_int_1d ssa_phaseLen;
+  typename AT::t_int_2d ssa_itemLoc, ssa_itemLen;
+
+  int ssa_gphaseCt; // ghost work phases, copied from the NPairSSAKokkos object
+  typename AT::t_int_1d ssa_gphaseLen;
+  typename AT::t_int_2d ssa_gitemLoc, ssa_gitemLen;
+
+
+//  template<bool STACKPARAMS>
+//  void ssa_update_dpd(int, int);  // Constant Temperature
+  template<bool STACKPARAMS>
+  void ssa_update_dpde(int, int); // Constant Energy
+
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Illegal ... command
+
+Self-explanatory.  Check the input script syntax and compare to the
+documentation for the command.  You can use -echo screen as a
+command-line option when running LAMMPS to see the offending line.
+
+E: Must use dpd/fdt pair_style with fix shardlow
+
+Self-explanatory.
+
+E: Must use pair_style dpd/fdt or dpd/fdt/energy with fix shardlow
+
+E: A deterministic integrator must be specified after fix shardlow in input
+file (e.g. fix nve or fix nph).
+
+Self-explanatory.
+
+E: Cannot use constant temperature integration routines with DPD
+
+Self-explanatory.  Must use deterministic integrators such as nve or nph
+
+E: Fix shardlow does not yet support triclinic geometries
+
+Self-explanatory.
+
+E:  Shardlow algorithm requires sub-domain length > 2*(rcut+skin). Either
+reduce the number of processors requested, or change the cutoff/skin
+
+The Shardlow splitting algorithm requires the sub-domain lengths
+to be larger than twice the cutoff+skin.  Generally, the domain decomposition
+is dependent on the number of processors requested.
+
+*/

From 71379487abc7062a698e960c68a97766829bffdf Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Fri, 24 Feb 2017 17:35:58 -0500
Subject: [PATCH 157/267] USER-DPD: variety of fixes for new SSA Kokkos code.
 Still not functional.

---
 src/KOKKOS/fix_shardlow_kokkos.cpp |  1 +
 src/KOKKOS/nbin_ssa_kokkos.cpp     | 18 ++++++++++++++++++
 src/KOKKOS/npair_ssa_kokkos.cpp    | 10 ++++++++--
 3 files changed, 27 insertions(+), 2 deletions(-)

diff --git a/src/KOKKOS/fix_shardlow_kokkos.cpp b/src/KOKKOS/fix_shardlow_kokkos.cpp
index 7b2810bb4c..a01cc36c3e 100644
--- a/src/KOKKOS/fix_shardlow_kokkos.cpp
+++ b/src/KOKKOS/fix_shardlow_kokkos.cpp
@@ -179,6 +179,7 @@ void FixShardlowKokkos<DeviceType>::init()
 template<class DeviceType>
 void FixShardlowKokkos<DeviceType>::init_list(int id, NeighList *ptr)
 {
+  FixShardlow::init_list(id, ptr);
   k_list = static_cast<NeighListKokkos<DeviceType>*>(ptr);
 }
 
diff --git a/src/KOKKOS/nbin_ssa_kokkos.cpp b/src/KOKKOS/nbin_ssa_kokkos.cpp
index 32a77119de..ebd07752b0 100644
--- a/src/KOKKOS/nbin_ssa_kokkos.cpp
+++ b/src/KOKKOS/nbin_ssa_kokkos.cpp
@@ -38,10 +38,28 @@ NBinSSAKokkos<DeviceType>::NBinSSAKokkos(LAMMPS *lmp) : NBinStandard(lmp)
   atoms_per_bin = ghosts_per_gbin = 16;
 
   d_resize = typename AT::t_int_scalar("NBinSSAKokkos::d_resize");
+  d_lbinxlo = typename AT::t_int_scalar("NBinSSAKokkos::d_lbinxlo");
+  d_lbinylo = typename AT::t_int_scalar("NBinSSAKokkos::d_lbinylo");
+  d_lbinzlo = typename AT::t_int_scalar("NBinSSAKokkos::d_lbinzlo");
+  d_lbinxhi = typename AT::t_int_scalar("NBinSSAKokkos::d_lbinxhi");
+  d_lbinyhi = typename AT::t_int_scalar("NBinSSAKokkos::d_lbinyhi");
+  d_lbinzhi = typename AT::t_int_scalar("NBinSSAKokkos::d_lbinzhi");
 #ifndef KOKKOS_USE_CUDA_UVM
   h_resize = Kokkos::create_mirror_view(d_resize);
+  h_lbinxlo = Kokkos::create_mirror_view(d_lbinxlo);
+  h_lbinylo = Kokkos::create_mirror_view(d_lbinylo);
+  h_lbinzlo = Kokkos::create_mirror_view(d_lbinzlo);
+  h_lbinxhi = Kokkos::create_mirror_view(d_lbinxhi);
+  h_lbinyhi = Kokkos::create_mirror_view(d_lbinyhi);
+  h_lbinzhi = Kokkos::create_mirror_view(d_lbinzhi);
 #else
   h_resize = d_resize;
+  h_lbinxlo = d_lbinxlo;
+  h_lbinylo = d_lbinylo;
+  h_lbinzlo = d_lbinzlo;
+  h_lbinxhi = d_lbinxhi;
+  h_lbinyhi = d_lbinyhi;
+  h_lbinzhi = d_lbinzhi;
 #endif
   h_resize() = 1;
 
diff --git a/src/KOKKOS/npair_ssa_kokkos.cpp b/src/KOKKOS/npair_ssa_kokkos.cpp
index c70fd0087e..f94d51197a 100644
--- a/src/KOKKOS/npair_ssa_kokkos.cpp
+++ b/src/KOKKOS/npair_ssa_kokkos.cpp
@@ -338,6 +338,12 @@ void NPairSSAKokkos<DeviceType>::build(NeighList *list_)
   k_ssa_phaseLen.modify<DeviceType>();
   k_ssa_itemLoc.modify<DeviceType>();
   k_ssa_itemLen.modify<DeviceType>();
+  k_ssa_gphaseLen.modify<DeviceType>();
+  k_ssa_gitemLoc.modify<DeviceType>();
+  k_ssa_gitemLen.modify<DeviceType>();
+
+  list->inum = data.neigh_list.inum; //FIXME once the above is in a parallel_for
+  list->gnum = data.neigh_list.gnum; // it will need a deep_copy or something
 
   list->k_ilist.template modify<DeviceType>();
 }
@@ -450,7 +456,7 @@ void NPairSSAKokkosExecute<DeviceType>::build_locals()
 
 //FIXME  if (ssa_phaseCt != workPhase) error->one(FLERR,"ssa_phaseCt was wrong");
 
-  neigh_list.inum = inum; //FIXME
+  neigh_list.inum = inum;
 }
 
 
@@ -545,7 +551,7 @@ void NPairSSAKokkosExecute<DeviceType>::build_ghosts()
     d_ssa_gitemLen(workPhase,workItem) = inum + gnum - d_ssa_gitemLoc(workPhase,workItem);
     if (d_ssa_gitemLen(workPhase,workItem) > 0) workItem++;
   }
-  neigh_list.gnum = gnum; //FIXME
+  neigh_list.gnum = gnum;
 }
 
 }

From c56e0692b9141d1f4442b61f95a7e47d998a44dc Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Fri, 24 Feb 2017 17:38:46 -0500
Subject: [PATCH 158/267] USER-DPD Kokkos: enable install of SSA Kokkos code

---
 src/KOKKOS/Install.sh | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/KOKKOS/Install.sh b/src/KOKKOS/Install.sh
index ea70ae4ca1..dda1ba011b 100644
--- a/src/KOKKOS/Install.sh
+++ b/src/KOKKOS/Install.sh
@@ -103,6 +103,8 @@ action fix_reaxc_species_kokkos.cpp fix_reaxc_species.cpp
 action fix_reaxc_species_kokkos.h fix_reaxc_species.h
 action fix_setforce_kokkos.cpp
 action fix_setforce_kokkos.h
+action fix_shardlow_kokkos.cpp fix_shardlow.cpp
+action fix_shardlow_kokkos.h fix_shardlow.h
 action fix_momentum_kokkos.cpp
 action fix_momentum_kokkos.h
 action fix_wall_reflect_kokkos.cpp
@@ -134,8 +136,12 @@ action npair_copy_kokkos.cpp
 action npair_copy_kokkos.h
 action npair_kokkos.cpp
 action npair_kokkos.h
+action npair_ssa_kokkos.cpp npair_half_bin_newton_ssa.cpp
+action npair_ssa_kokkos.h npair_half_bin_newton_ssa.h
 action nbin_kokkos.cpp
 action nbin_kokkos.h
+action nbin_ssa_kokkos.cpp nbin_ssa.cpp
+action nbin_ssa_kokkos.h nbin_ssa.h
 action math_special_kokkos.cpp
 action math_special_kokkos.h
 action pair_buck_coul_cut_kokkos.cpp

From 6ea290a69963a9e6619c51e81f004906194870fe Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Fri, 24 Feb 2017 17:41:57 -0500
Subject: [PATCH 159/267] DEBUG: make FixShardlowKokkos have its own
 rand_pool,  plus debug code. ssa_update_dpde() hangs on first use of
 rand_gen.normal() Switching to not using a pointer to
 PairDPDfdtEnergyKokkos's rand_pool had no noticeable effect.

---
 src/KOKKOS/fix_shardlow_kokkos.cpp | 37 ++++++++++++++++++++----------
 src/KOKKOS/fix_shardlow_kokkos.h   |  5 ++--
 2 files changed, 28 insertions(+), 14 deletions(-)

diff --git a/src/KOKKOS/fix_shardlow_kokkos.cpp b/src/KOKKOS/fix_shardlow_kokkos.cpp
index a01cc36c3e..fe05db6d33 100644
--- a/src/KOKKOS/fix_shardlow_kokkos.cpp
+++ b/src/KOKKOS/fix_shardlow_kokkos.cpp
@@ -71,7 +71,7 @@ using namespace FixConst;
 
 template<class DeviceType>
 FixShardlowKokkos<DeviceType>::FixShardlowKokkos(LAMMPS *lmp, int narg, char **arg) :
-  FixShardlow(lmp, narg, arg), k_pairDPDE(NULL), ghostmax(0), nlocal(0) , nghost(0)
+  FixShardlow(lmp, narg, arg), k_pairDPDE(NULL), ghostmax(0), nlocal(0) , nghost(0), rand_pool(comm->me)
 {
   kokkosable = 1;
 //  atomKK = (AtomKokkos *) atom;
@@ -85,12 +85,12 @@ FixShardlowKokkos<DeviceType>::FixShardlowKokkos(LAMMPS *lmp, int narg, char **a
 //  k_pairDPD = NULL;
   k_pairDPDE = NULL;
 //  k_pairDPD = (PairDPDfdtKokkos *) force->pair_match("dpd/fdt",1);
-  k_pairDPDE = (PairDPDfdtEnergyKokkos<DeviceType> *) force->pair_match("dpd/fdt/energy/kk",1);
+  k_pairDPDE = dynamic_cast<PairDPDfdtEnergyKokkos<DeviceType> *>(force->pair_match("dpd/fdt/energy",0));
 
 //   if(k_pairDPDE){
     comm_forward = 3;
     comm_reverse = 5;
-    p_rand_pool = &(k_pairDPDE->rand_pool);
+//    p_rand_pool = &(k_pairDPDE->rand_pool);
 //   } else {
 //     comm_forward = 3;
 //     comm_reverse = 3;
@@ -263,7 +263,8 @@ void FixShardlowKokkos<DeviceType>::ssa_update_dpd(
   int start_ii, int count
 )
 {
-  rand_type rand_gen = p_rand_pool->get_state();
+  rand_type rand_gen = rand_pool.get_state();
+//  rand_type rand_gen = p_rand_pool->get_state();
 
   const double theta_ij_inv = 1.0/k_pairDPD->temperature; // independent of i,j
   const double boltz_inv = 1.0/force->boltz;
@@ -377,7 +378,8 @@ void FixShardlowKokkos<DeviceType>::ssa_update_dpd(
     v(i, 2) = vzi;
   }
 
-  p_rand_pool->free_state(rand_gen);
+//  p_rand_pool->free_state(rand_gen);
+  rand_pool.free_state(rand_gen);
 }
 #endif
 
@@ -390,10 +392,13 @@ void FixShardlowKokkos<DeviceType>::ssa_update_dpd(
 template<class DeviceType>
 template<bool STACKPARAMS>
 void FixShardlowKokkos<DeviceType>::ssa_update_dpde(
-  int start_ii, int count
+  int start_ii, int count, int id
 )
 {
-  rand_type rand_gen = p_rand_pool->get_state();
+  rand_type rand_gen = rand_pool.get_state();
+//  rand_type rand_gen = p_rand_pool->get_state();
+
+//fprintf(stderr, "ssa_update_dpde(%d,%d,%d)\n", start_ii, count, id);
 
   const double boltz_inv = 1.0/force->boltz;
   const double ftm2v = force->ftm2v;
@@ -401,6 +406,11 @@ void FixShardlowKokkos<DeviceType>::ssa_update_dpde(
   int ct = count;
   int ii = start_ii;
 
+//  double randnum1 = rand_gen.normal();
+//fprintf(stderr, "randnum1 = %g\n", randnum1);
+//  double randnum2 = rand_gen.normal();
+//fprintf(stderr, "randnum2 = %g\n", randnum2);
+
   while (ct-- > 0) {
     const int i = d_ilist(ii);
     const int jlen = d_numneigh(ii);
@@ -453,6 +463,7 @@ void FixShardlowKokkos<DeviceType>::ssa_update_dpde(
         double halfgamma_ij = halfsigma_ij*halfsigma_ij*boltz_inv*theta_ij_inv;
 
         double sigmaRand = halfsigma_ij*wr*dtsqrt*ftm2v * rand_gen.normal();
+//        double sigmaRand = halfsigma_ij*wr*dtsqrt*ftm2v * randnum1;//rand_gen.normal();
 
         const double mass_j = masses(massPerI ? j : jtype);
         double mass_ij_div_neg4_ftm2v = mass_j*mass_i_div_neg4_ftm2v;
@@ -462,6 +473,7 @@ void FixShardlowKokkos<DeviceType>::ssa_update_dpde(
         double kappa_ij = STACKPARAMS?m_params[itype][jtype].kappa:params(itype,jtype).kappa;
         double alpha_ij = STACKPARAMS?m_params[itype][jtype].alpha:params(itype,jtype).alpha;
         double del_uCond = alpha_ij*wr*dtsqrt * rand_gen.normal();
+//        double del_uCond = alpha_ij*wr*dtsqrt * randnum2;//rand_gen.normal();
 
         del_uCond += kappa_ij*(theta_i_inv - theta_j_inv)*wdt;
         uCond[j] -= del_uCond;
@@ -537,7 +549,8 @@ void FixShardlowKokkos<DeviceType>::ssa_update_dpde(
     ii++;
   }
 
-  p_rand_pool->free_state(rand_gen);
+  rand_pool.free_state(rand_gen);
+//  p_rand_pool->free_state(rand_gen);
 }
 
 
@@ -573,13 +586,13 @@ void FixShardlowKokkos<DeviceType>::initial_integrate(int vflag)
       Kokkos::parallel_for(workItemCt, KOKKOS_LAMBDA (const int workItem ) {
         int ct = ssa_itemLen(workPhase, workItem);
         int ii = ssa_itemLoc(workPhase, workItem);
-        ssa_update_dpde<false>(ii, ct);
+        ssa_update_dpde<false>(ii, ct, workItem);
       });
     } else {
       Kokkos::parallel_for(workItemCt, KOKKOS_LAMBDA (const int workItem ) {
         int ct = ssa_itemLen(workPhase, workItem);
         int ii = ssa_itemLoc(workPhase, workItem);
-        ssa_update_dpde<true>(ii, ct);
+        ssa_update_dpde<true>(ii, ct, workItem);
       });
     }
   }
@@ -609,13 +622,13 @@ void FixShardlowKokkos<DeviceType>::initial_integrate(int vflag)
       Kokkos::parallel_for(workItemCt, KOKKOS_LAMBDA (const int workItem ) {
         int ct = ssa_gitemLen(workPhase, workItem);
         int ii = ssa_gitemLoc(workPhase, workItem);
-        ssa_update_dpde<false>(ii, ct);
+        ssa_update_dpde<false>(ii, ct, workItem);
       });
     } else {
       Kokkos::parallel_for(workItemCt, KOKKOS_LAMBDA (const int workItem ) {
         int ct = ssa_gitemLen(workPhase, workItem);
         int ii = ssa_gitemLoc(workPhase, workItem);
-        ssa_update_dpde<true>(ii, ct);
+        ssa_update_dpde<true>(ii, ct, workItem);
       });
     }
 
diff --git a/src/KOKKOS/fix_shardlow_kokkos.h b/src/KOKKOS/fix_shardlow_kokkos.h
index 08d9034fdf..b4267226e6 100644
--- a/src/KOKKOS/fix_shardlow_kokkos.h
+++ b/src/KOKKOS/fix_shardlow_kokkos.h
@@ -63,7 +63,8 @@ class FixShardlowKokkos : public FixShardlow {
  protected:
 //  class PairDPDfdt *pairDPD;
   PairDPDfdtEnergyKokkos<DeviceType> *k_pairDPDE;
-  Kokkos::Random_XorShift64_Pool<DeviceType> *p_rand_pool;
+  Kokkos::Random_XorShift64_Pool<DeviceType> rand_pool;
+//  Kokkos::Random_XorShift64_Pool<DeviceType> *p_rand_pool;
   typedef typename Kokkos::Random_XorShift64_Pool<DeviceType>::generator_type rand_type;
 
   Kokkos::DualView<params_ssa**,Kokkos::LayoutRight,DeviceType> k_params;
@@ -108,7 +109,7 @@ class FixShardlowKokkos : public FixShardlow {
 //  template<bool STACKPARAMS>
 //  void ssa_update_dpd(int, int);  // Constant Temperature
   template<bool STACKPARAMS>
-  void ssa_update_dpde(int, int); // Constant Energy
+  void ssa_update_dpde(int, int, int); // Constant Energy
 
 };
 

From c2e3a76225f421bee13b7256b8be8f1730049214 Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Fri, 24 Feb 2017 19:07:55 -0500
Subject: [PATCH 160/267] USER-DPD Kokkos: rand seed can't be zero, so add some
 salt.

---
 src/KOKKOS/fix_shardlow_kokkos.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/KOKKOS/fix_shardlow_kokkos.cpp b/src/KOKKOS/fix_shardlow_kokkos.cpp
index fe05db6d33..65bb7033bb 100644
--- a/src/KOKKOS/fix_shardlow_kokkos.cpp
+++ b/src/KOKKOS/fix_shardlow_kokkos.cpp
@@ -71,7 +71,7 @@ using namespace FixConst;
 
 template<class DeviceType>
 FixShardlowKokkos<DeviceType>::FixShardlowKokkos(LAMMPS *lmp, int narg, char **arg) :
-  FixShardlow(lmp, narg, arg), k_pairDPDE(NULL), ghostmax(0), nlocal(0) , nghost(0), rand_pool(comm->me)
+  FixShardlow(lmp, narg, arg), k_pairDPDE(NULL), ghostmax(0), nlocal(0) , nghost(0), rand_pool(1234567 + comm->me)
 {
   kokkosable = 1;
 //  atomKK = (AtomKokkos *) atom;

From b053c367ea1edd00138e68f673a8928dd9d42151 Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Fri, 24 Feb 2017 19:09:07 -0500
Subject: [PATCH 161/267] USER-DPD Kokkos: remove extraneous debugging code

---
 src/KOKKOS/fix_shardlow_kokkos.cpp | 15 ---------------
 1 file changed, 15 deletions(-)

diff --git a/src/KOKKOS/fix_shardlow_kokkos.cpp b/src/KOKKOS/fix_shardlow_kokkos.cpp
index 65bb7033bb..1ec1455b23 100644
--- a/src/KOKKOS/fix_shardlow_kokkos.cpp
+++ b/src/KOKKOS/fix_shardlow_kokkos.cpp
@@ -90,11 +90,9 @@ FixShardlowKokkos<DeviceType>::FixShardlowKokkos(LAMMPS *lmp, int narg, char **a
 //   if(k_pairDPDE){
     comm_forward = 3;
     comm_reverse = 5;
-//    p_rand_pool = &(k_pairDPDE->rand_pool);
 //   } else {
 //     comm_forward = 3;
 //     comm_reverse = 3;
-//     p_rand_pool = &(k_pairDPD->rand_pool);
 //   }
 
 
@@ -264,7 +262,6 @@ void FixShardlowKokkos<DeviceType>::ssa_update_dpd(
 )
 {
   rand_type rand_gen = rand_pool.get_state();
-//  rand_type rand_gen = p_rand_pool->get_state();
 
   const double theta_ij_inv = 1.0/k_pairDPD->temperature; // independent of i,j
   const double boltz_inv = 1.0/force->boltz;
@@ -378,7 +375,6 @@ void FixShardlowKokkos<DeviceType>::ssa_update_dpd(
     v(i, 2) = vzi;
   }
 
-//  p_rand_pool->free_state(rand_gen);
   rand_pool.free_state(rand_gen);
 }
 #endif
@@ -396,9 +392,6 @@ void FixShardlowKokkos<DeviceType>::ssa_update_dpde(
 )
 {
   rand_type rand_gen = rand_pool.get_state();
-//  rand_type rand_gen = p_rand_pool->get_state();
-
-//fprintf(stderr, "ssa_update_dpde(%d,%d,%d)\n", start_ii, count, id);
 
   const double boltz_inv = 1.0/force->boltz;
   const double ftm2v = force->ftm2v;
@@ -406,11 +399,6 @@ void FixShardlowKokkos<DeviceType>::ssa_update_dpde(
   int ct = count;
   int ii = start_ii;
 
-//  double randnum1 = rand_gen.normal();
-//fprintf(stderr, "randnum1 = %g\n", randnum1);
-//  double randnum2 = rand_gen.normal();
-//fprintf(stderr, "randnum2 = %g\n", randnum2);
-
   while (ct-- > 0) {
     const int i = d_ilist(ii);
     const int jlen = d_numneigh(ii);
@@ -463,7 +451,6 @@ void FixShardlowKokkos<DeviceType>::ssa_update_dpde(
         double halfgamma_ij = halfsigma_ij*halfsigma_ij*boltz_inv*theta_ij_inv;
 
         double sigmaRand = halfsigma_ij*wr*dtsqrt*ftm2v * rand_gen.normal();
-//        double sigmaRand = halfsigma_ij*wr*dtsqrt*ftm2v * randnum1;//rand_gen.normal();
 
         const double mass_j = masses(massPerI ? j : jtype);
         double mass_ij_div_neg4_ftm2v = mass_j*mass_i_div_neg4_ftm2v;
@@ -473,7 +460,6 @@ void FixShardlowKokkos<DeviceType>::ssa_update_dpde(
         double kappa_ij = STACKPARAMS?m_params[itype][jtype].kappa:params(itype,jtype).kappa;
         double alpha_ij = STACKPARAMS?m_params[itype][jtype].alpha:params(itype,jtype).alpha;
         double del_uCond = alpha_ij*wr*dtsqrt * rand_gen.normal();
-//        double del_uCond = alpha_ij*wr*dtsqrt * randnum2;//rand_gen.normal();
 
         del_uCond += kappa_ij*(theta_i_inv - theta_j_inv)*wdt;
         uCond[j] -= del_uCond;
@@ -550,7 +536,6 @@ void FixShardlowKokkos<DeviceType>::ssa_update_dpde(
   }
 
   rand_pool.free_state(rand_gen);
-//  p_rand_pool->free_state(rand_gen);
 }
 
 

From 21619b29768553bfcf9d31347c00904973e155d4 Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Fri, 24 Feb 2017 22:16:33 -0500
Subject: [PATCH 162/267] USER-DPD Kokkos: correct the setup of the ghost SSA
 workplan

---
 src/KOKKOS/npair_ssa_kokkos.cpp | 130 ++++++++++++++++----------------
 1 file changed, 67 insertions(+), 63 deletions(-)

diff --git a/src/KOKKOS/npair_ssa_kokkos.cpp b/src/KOKKOS/npair_ssa_kokkos.cpp
index f94d51197a..7eea57d492 100644
--- a/src/KOKKOS/npair_ssa_kokkos.cpp
+++ b/src/KOKKOS/npair_ssa_kokkos.cpp
@@ -472,84 +472,88 @@ void NPairSSAKokkosExecute<DeviceType>::build_ghosts()
   // since these are ghosts, must check if stencil bin is out of bounds
   for (int workPhase = 0; workPhase < ssa_gphaseCt; workPhase++) {
     int airnum = workPhase + 1;
-    int workItem = 0; //FIXME for now, there is only 1 workItem for each ghost AIR
-    d_ssa_gitemLoc(workPhase, workItem) = inum + gnum; // record where workItem starts in ilist
-    for (int il = 0; il < c_gbincount(airnum); ++il) {
-      const int i = c_gbins(airnum, il);
-      n = 0;
+    //FIXME for now, there is only 1 workItem for each ghost AIR
+    int workItem;
+    for (workItem = 0; workItem < 1; ++workItem) {
+      d_ssa_gitemLoc(workPhase, workItem) = inum + gnum; // record where workItem starts in ilist
+      for (int il = 0; il < c_gbincount(airnum); ++il) {
+        const int i = c_gbins(airnum, il);
+        n = 0;
 
-      const AtomNeighbors neighbors_i = neigh_list.get_neighbors(inum + gnum);
-      const X_FLOAT xtmp = x(i, 0);
-      const X_FLOAT ytmp = x(i, 1);
-      const X_FLOAT ztmp = x(i, 2);
-      const int itype = type(i);
+        const AtomNeighbors neighbors_i = neigh_list.get_neighbors(inum + gnum);
+        const X_FLOAT xtmp = x(i, 0);
+        const X_FLOAT ytmp = x(i, 1);
+        const X_FLOAT ztmp = x(i, 2);
+        const int itype = type(i);
 
-      const typename ArrayTypes<DeviceType>::t_int_1d_const_um stencil
-        = d_stencil;
+        const typename ArrayTypes<DeviceType>::t_int_1d_const_um stencil
+          = d_stencil;
 
-      int loc[3];
-      const int ibin = coord2bin(x(i, 0), x(i, 1), x(i, 2), &(loc[0]));
+        int loc[3];
+        const int ibin = coord2bin(x(i, 0), x(i, 1), x(i, 2), &(loc[0]));
 
-      // loop over AIR ghost atoms in all bins in "full" stencil
-      // Note: the non-AIR ghost atoms have already been filtered out
-      for (int k = 0; k < nstencil; k++) {
-        int xbin2 = loc[0] + d_stencilxyz(k,0);
-        int ybin2 = loc[1] + d_stencilxyz(k,1);
-        int zbin2 = loc[2] + d_stencilxyz(k,2);
-        // Skip it if this bin is outside the extent of local bins
-        if (xbin2 < lbinxlo || xbin2 >= lbinxhi ||
-            ybin2 < lbinylo || ybin2 >= lbinyhi ||
-            zbin2 < lbinzlo || zbin2 >= lbinzhi) continue;
-        const int jbin = ibin+stencil(k);
-        for (int jl = 0; jl < c_bincount(jbin); ++jl) {
-          const int j = c_bins(jbin, jl);
-          const int jtype = type(j);
-          if(exclude && exclusion(i,j,itype,jtype)) continue;
+        // loop over AIR ghost atoms in all bins in "full" stencil
+        // Note: the non-AIR ghost atoms have already been filtered out
+        for (int k = 0; k < nstencil; k++) {
+          int xbin2 = loc[0] + d_stencilxyz(k,0);
+          int ybin2 = loc[1] + d_stencilxyz(k,1);
+          int zbin2 = loc[2] + d_stencilxyz(k,2);
+          // Skip it if this bin is outside the extent of local bins
+          if (xbin2 < lbinxlo || xbin2 >= lbinxhi ||
+              ybin2 < lbinylo || ybin2 >= lbinyhi ||
+              zbin2 < lbinzlo || zbin2 >= lbinzhi) continue;
+          const int jbin = ibin+stencil(k);
+          for (int jl = 0; jl < c_bincount(jbin); ++jl) {
+            const int j = c_bins(jbin, jl);
+            const int jtype = type(j);
+            if(exclude && exclusion(i,j,itype,jtype)) continue;
 
-          const X_FLOAT delx = xtmp - x(j, 0);
-          const X_FLOAT dely = ytmp - x(j, 1);
-          const X_FLOAT delz = ztmp - x(j, 2);
-          const X_FLOAT rsq = delx*delx + dely*dely + delz*delz;
-          if(rsq <= cutneighsq(itype,jtype)) {
-            if (molecular) {
-              if (!moltemplate)
-                which = find_special(i,j);
-                  /* else if (imol >= 0) */
-                  /*   which = find_special(onemols[imol]->special[iatom], */
-                  /*                        onemols[imol]->nspecial[iatom], */
-                  /*                        tag[j]-tagprev); */
-                  /* else which = 0; */
-              if (which == 0){
-                if(n<neigh_list.maxneighs) neighbors_i(n++) = j;
-                else n++;
-              }else if (minimum_image_check(delx,dely,delz)){
+            const X_FLOAT delx = xtmp - x(j, 0);
+            const X_FLOAT dely = ytmp - x(j, 1);
+            const X_FLOAT delz = ztmp - x(j, 2);
+            const X_FLOAT rsq = delx*delx + dely*dely + delz*delz;
+            if(rsq <= cutneighsq(itype,jtype)) {
+              if (molecular) {
+                if (!moltemplate)
+                  which = find_special(i,j);
+                    /* else if (imol >= 0) */
+                    /*   which = find_special(onemols[imol]->special[iatom], */
+                    /*                        onemols[imol]->nspecial[iatom], */
+                    /*                        tag[j]-tagprev); */
+                    /* else which = 0; */
+                if (which == 0){
+                  if(n<neigh_list.maxneighs) neighbors_i(n++) = j;
+                  else n++;
+                }else if (minimum_image_check(delx,dely,delz)){
+                  if(n<neigh_list.maxneighs) neighbors_i(n++) = j;
+                  else n++;
+                }
+                else if (which > 0) {
+                  if(n<neigh_list.maxneighs) neighbors_i(n++) = j ^ (which << SBBITS);
+                  else n++;
+                }
+              } else {
                 if(n<neigh_list.maxneighs) neighbors_i(n++) = j;
                 else n++;
               }
-              else if (which > 0) {
-                if(n<neigh_list.maxneighs) neighbors_i(n++) = j ^ (which << SBBITS);
-                else n++;
-              }
-            } else {
-              if(n<neigh_list.maxneighs) neighbors_i(n++) = j;
-              else n++;
             }
           }
         }
-      }
 
-      if (n > 0) {
-        neigh_list.d_numneigh(inum + gnum) = n;
-        neigh_list.d_ilist(inum + (gnum++)) = i;
-        if(n > neigh_list.maxneighs) {
-          resize() = 1;
-          if(n > new_maxneighs()) Kokkos::atomic_fetch_max(&new_maxneighs(),n);
+        if (n > 0) {
+          neigh_list.d_numneigh(inum + gnum) = n;
+          neigh_list.d_ilist(inum + (gnum++)) = i;
+          if(n > neigh_list.maxneighs) {
+            resize() = 1;
+            if(n > new_maxneighs()) Kokkos::atomic_fetch_max(&new_maxneighs(),n);
+          }
         }
       }
+      // record where workItem ends in ilist
+      d_ssa_gitemLen(workPhase,workItem) = inum + gnum - d_ssa_gitemLoc(workPhase,workItem);
+      // if (d_ssa_gitemLen(workPhase,workItem) > 0) workItem++;
     }
-    // record where workItem ends in ilist
-    d_ssa_gitemLen(workPhase,workItem) = inum + gnum - d_ssa_gitemLoc(workPhase,workItem);
-    if (d_ssa_gitemLen(workPhase,workItem) > 0) workItem++;
+    d_ssa_gphaseLen(workPhase) = workItem;
   }
   neigh_list.gnum = gnum;
 }

From fd1523c7561e98d61b02c55f74939f5dda97cfe5 Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Fri, 24 Feb 2017 22:19:53 -0500
Subject: [PATCH 163/267] USER-DPD Kokkos: add missing () in STACKPARAMS check
 in ssa_update_*

---
 src/KOKKOS/fix_shardlow_kokkos.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/KOKKOS/fix_shardlow_kokkos.cpp b/src/KOKKOS/fix_shardlow_kokkos.cpp
index 1ec1455b23..79e40dee98 100644
--- a/src/KOKKOS/fix_shardlow_kokkos.cpp
+++ b/src/KOKKOS/fix_shardlow_kokkos.cpp
@@ -299,7 +299,7 @@ void FixShardlowKokkos<DeviceType>::ssa_update_dpd(
       const F_FLOAT rsq = delx*delx + dely*dely + delz*delz;
 
       // NOTE: r can be 0.0 in DPD systems, so do EPSILON_SQUARED test
-      if ((rsq < STACKPARAMS?m_cutsq[itype][jtype]:d_cutsq(itype,jtype))
+      if ((rsq < (STACKPARAMS?m_cutsq[itype][jtype]:d_cutsq(itype,jtype)))
         && (rsq >= EPSILON_SQUARED)) {
         double r = sqrt(rsq);
         double rinv = 1.0/r;
@@ -432,7 +432,7 @@ void FixShardlowKokkos<DeviceType>::ssa_update_dpde(
       const F_FLOAT rsq = delx*delx + dely*dely + delz*delz;
 
       // NOTE: r can be 0.0 in DPD systems, so do EPSILON_SQUARED test
-      if ((rsq < STACKPARAMS?m_cutsq[itype][jtype]:d_cutsq(itype,jtype))
+      if ((rsq < (STACKPARAMS?m_cutsq[itype][jtype]:d_cutsq(itype,jtype)))
         && (rsq >= EPSILON_SQUARED)) {
         double r = sqrt(rsq);
         double rinv = 1.0/r;

From e4500859a3e2388a2b3275ae1491f28589781e00 Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Fri, 24 Feb 2017 22:24:29 -0500
Subject: [PATCH 164/267] USER-DPD: add "#ifdef DEBUG_PAIR_CT" debugging code
 to fix_shardlow*

---
 src/KOKKOS/fix_shardlow_kokkos.cpp | 61 ++++++++++++++++++++++++++++++
 src/KOKKOS/fix_shardlow_kokkos.h   |  7 ++++
 src/USER-DPD/fix_shardlow.cpp      | 53 ++++++++++++++++++++++++++
 src/USER-DPD/fix_shardlow.h        |  5 +++
 4 files changed, 126 insertions(+)

diff --git a/src/KOKKOS/fix_shardlow_kokkos.cpp b/src/KOKKOS/fix_shardlow_kokkos.cpp
index 79e40dee98..1459819430 100644
--- a/src/KOKKOS/fix_shardlow_kokkos.cpp
+++ b/src/KOKKOS/fix_shardlow_kokkos.cpp
@@ -99,6 +99,17 @@ FixShardlowKokkos<DeviceType>::FixShardlowKokkos(LAMMPS *lmp, int narg, char **a
   if(/* k_pairDPD == NULL &&*/ k_pairDPDE == NULL)
     error->all(FLERR,"Must use pair_style "/*"dpd/fdt/kk or "*/"dpd/fdt/energy/kk with fix shardlow/kk");
 
+#ifdef DEBUG_PAIR_CT
+  d_counters = typename AT::t_int_2d("FixShardlowKokkos::d_counters", 2, 3);
+  d_hist = typename AT::t_int_1d("FixShardlowKokkos::d_hist", 32);
+#ifndef KOKKOS_USE_CUDA_UVM
+  h_counters = Kokkos::create_mirror_view(d_counters);
+  h_hist = Kokkos::create_mirror_view(d_hist);
+#else
+  h_counters = d_counters;
+  h_hist = d_hist;
+#endif
+#endif
 }
 
 /* ---------------------------------------------------------------------- */
@@ -297,10 +308,24 @@ void FixShardlowKokkos<DeviceType>::ssa_update_dpd(
       const X_FLOAT dely = ytmp - x(j, 1);
       const X_FLOAT delz = ztmp - x(j, 2);
       const F_FLOAT rsq = delx*delx + dely*dely + delz*delz;
+#ifdef DEBUG_PAIR_CT
+      if ((i < nlocal) && (j < nlocal)) Kokkos::atomic_increment(&(d_counters(0, 0)));
+      else Kokkos::atomic_increment(&(d_counters(0, 1)));
+      Kokkos::atomic_increment(&(d_counters(0, 2)));
+      int rsqi = rsq / 8;
+      if (rsqi < 0) rsqi = 0;
+      else if (rsqi > 31) rsqi = 31;
+      Kokkos::atomic_increment(&(d_hist(rsqi)));
+#endif
 
       // NOTE: r can be 0.0 in DPD systems, so do EPSILON_SQUARED test
       if ((rsq < (STACKPARAMS?m_cutsq[itype][jtype]:d_cutsq(itype,jtype)))
         && (rsq >= EPSILON_SQUARED)) {
+#ifdef DEBUG_PAIR_CT
+        if ((i < nlocal) && (j < nlocal)) Kokkos::atomic_increment(&(d_counters(1, 0)));
+        else Kokkos::atomic_increment(&(d_counters(1, 1)));
+        Kokkos::atomic_increment(&(d_counters(1, 2)));
+#endif
         double r = sqrt(rsq);
         double rinv = 1.0/r;
         double delx_rinv = delx*rinv;
@@ -430,10 +455,25 @@ void FixShardlowKokkos<DeviceType>::ssa_update_dpde(
       const X_FLOAT dely = ytmp - x(j, 1);
       const X_FLOAT delz = ztmp - x(j, 2);
       const F_FLOAT rsq = delx*delx + dely*dely + delz*delz;
+#ifdef DEBUG_PAIR_CT
+      if ((i < nlocal) && (j < nlocal)) Kokkos::atomic_increment(&(d_counters(0, 0)));
+      else Kokkos::atomic_increment(&(d_counters(0, 1)));
+      Kokkos::atomic_increment(&(d_counters(0, 2)));
+      int rsqi = rsq / 8;
+      if (rsqi < 0) rsqi = 0;
+      else if (rsqi > 31) rsqi = 31;
+      Kokkos::atomic_increment(&(d_hist(rsqi)));
+#endif
 
       // NOTE: r can be 0.0 in DPD systems, so do EPSILON_SQUARED test
       if ((rsq < (STACKPARAMS?m_cutsq[itype][jtype]:d_cutsq(itype,jtype)))
         && (rsq >= EPSILON_SQUARED)) {
+#ifdef DEBUG_PAIR_CT
+        if ((i < nlocal) && (j < nlocal)) Kokkos::atomic_increment(&(d_counters(1, 0)));
+        else Kokkos::atomic_increment(&(d_counters(1, 1)));
+        Kokkos::atomic_increment(&(d_counters(1, 2)));
+#endif
+
         double r = sqrt(rsq);
         double rinv = 1.0/r;
         double delx_rinv = delx*rinv;
@@ -563,6 +603,15 @@ void FixShardlowKokkos<DeviceType>::initial_integrate(int vflag)
   ssa_gitemLoc = np_ssa->ssa_gitemLoc;
   ssa_gitemLen = np_ssa->ssa_gitemLen;
 
+#ifdef DEBUG_PAIR_CT
+  for (int i = 0; i < 2; ++i)
+    for (int j = 0; j < 3; ++j)
+      h_counters(i,j) = 0;
+  for (int i = 0; i < 32; ++i) h_hist[i] = 0;
+  deep_copy(d_counters, h_counters);
+  deep_copy(d_hist, h_hist);
+#endif
+
   // process neighbors in the local AIR
   for (int workPhase = 0; workPhase < ssa_phaseCt; ++workPhase) {
     int workItemCt = ssa_phaseLen[workPhase];
@@ -622,6 +671,18 @@ void FixShardlowKokkos<DeviceType>::initial_integrate(int vflag)
 
   }  //End Loop over all directions For airnum = Top, Top-Right, Right, Bottom-Right, Back
 
+#ifdef DEBUG_PAIR_CT
+deep_copy(h_counters, d_counters);
+deep_copy(h_hist, d_hist);
+for (int i = 0; i < 32; ++i) fprintf(stdout, "%8d", h_hist[i]);
+fprintf(stdout, "\n%6d %6d,%6d %6d: "
+  ,h_counters(0, 2)
+  ,h_counters(1, 2)
+  ,h_counters(0, 1)
+  ,h_counters(1, 1)
+);
+#endif
+
 }
 
 /* ---------------------------------------------------------------------- */
diff --git a/src/KOKKOS/fix_shardlow_kokkos.h b/src/KOKKOS/fix_shardlow_kokkos.h
index b4267226e6..ddd4f5b1ba 100644
--- a/src/KOKKOS/fix_shardlow_kokkos.h
+++ b/src/KOKKOS/fix_shardlow_kokkos.h
@@ -60,6 +60,13 @@ class FixShardlowKokkos : public FixShardlow {
     F_FLOAT cutinv,halfsigma,kappa,alpha;
   };
 
+#ifdef DEBUG_PAIR_CT
+  typename AT::t_int_2d d_counters;
+  typename HAT::t_int_2d h_counters;
+  typename AT::t_int_1d d_hist;
+  typename HAT::t_int_1d h_hist;
+#endif
+
  protected:
 //  class PairDPDfdt *pairDPD;
   PairDPDfdtEnergyKokkos<DeviceType> *k_pairDPDE;
diff --git a/src/USER-DPD/fix_shardlow.cpp b/src/USER-DPD/fix_shardlow.cpp
index 4a7fff66cf..5132d937ea 100644
--- a/src/USER-DPD/fix_shardlow.cpp
+++ b/src/USER-DPD/fix_shardlow.cpp
@@ -211,6 +211,10 @@ void FixShardlow::ssa_update_dpd(
   const double mass_i = (rmass) ? rmass[i] : mass[itype];
   const double massinv_i = 1.0 / mass_i;
 
+#ifdef DEBUG_PAIR_CT
+  const int nlocal = atom->nlocal;
+#endif
+
   // Loop over Directional Neighbors only
   for (int jj = 0; jj < jlen; jj++) {
     int j = jlist[jj] & NEIGHMASK;
@@ -220,9 +224,23 @@ void FixShardlow::ssa_update_dpd(
     double dely = ytmp - x[j][1];
     double delz = ztmp - x[j][2];
     double rsq = delx*delx + dely*dely + delz*delz;
+#ifdef DEBUG_PAIR_CT
+    if ((i < nlocal) && (j < nlocal)) ++(counters[0][0]);
+    else ++(counters[0][1]);
+    ++(counters[0][2]);
+    int rsqi = rsq / 8;
+    if (rsqi < 0) rsqi = 0;
+    else if (rsqi > 31) rsqi = 31;
+    ++(hist[rsqi]);
+#endif
 
     // NOTE: r can be 0.0 in DPD systems, so do EPSILON_SQUARED test
     if ((rsq < cut2_i[jtype]) && (rsq >= EPSILON_SQUARED)) {
+#ifdef DEBUG_PAIR_CT
+      if ((i < nlocal) && (j < nlocal)) ++(counters[1][0]);
+      else ++(counters[1][1]);
+      ++(counters[1][2]);
+#endif
       double r = sqrt(rsq);
       double rinv = 1.0/r;
       double delx_rinv = delx*rinv;
@@ -350,6 +368,10 @@ void FixShardlow::ssa_update_dpde(
   const double massinv_i = 1.0 / mass_i;
   const double mass_i_div_neg4_ftm2v = mass_i*(-0.25)/ftm2v;
 
+#ifdef DEBUG_PAIR_CT
+  const int nlocal = atom->nlocal;
+#endif
+
   // Loop over Directional Neighbors only
   for (int jj = 0; jj < jlen; jj++) {
     int j = jlist[jj] & NEIGHMASK;
@@ -359,9 +381,23 @@ void FixShardlow::ssa_update_dpde(
     double dely = ytmp - x[j][1];
     double delz = ztmp - x[j][2];
     double rsq = delx*delx + dely*dely + delz*delz;
+#ifdef DEBUG_PAIR_CT
+    if ((i < nlocal) && (j < nlocal)) ++(counters[0][0]);
+    else ++(counters[0][1]);
+    ++(counters[0][2]);
+    int rsqi = rsq / 8;
+    if (rsqi < 0) rsqi = 0;
+    else if (rsqi > 31) rsqi = 31;
+    ++(hist[rsqi]);
+#endif
 
     // NOTE: r can be 0.0 in DPD systems, so do EPSILON_SQUARED test
     if ((rsq < cut2_i[jtype]) && (rsq >= EPSILON_SQUARED)) {
+#ifdef DEBUG_PAIR_CT
+      if ((i < nlocal) && (j < nlocal)) ++(counters[1][0]);
+      else ++(counters[1][1]);
+      ++(counters[1][2]);
+#endif
       double r = sqrt(rsq);
       double rinv = 1.0/r;
       double delx_rinv = delx*rinv;
@@ -493,6 +529,13 @@ void FixShardlow::initial_integrate(int vflag)
     error->one(FLERR, msg);
   }
 
+#ifdef DEBUG_PAIR_CT
+  for (int i = 0; i < 2; ++i)
+    for (int j = 0; j < 3; ++j)
+      counters[i][j] = 0;
+  for (int i = 0; i < 32; ++i) hist[i] = 0;
+#endif
+
   // Allocate memory for v_t0 to hold the initial velocities for the ghosts
   v_t0 = (double (*)[3]) memory->smalloc(sizeof(double)*3*nghost, "FixShardlow:v_t0");
 
@@ -554,6 +597,16 @@ void FixShardlow::initial_integrate(int vflag)
 
   }  //End Loop over all directions For airnum = Top, Top-Right, Right, Bottom-Right, Back
 
+#ifdef DEBUG_PAIR_CT
+for (int i = 0; i < 32; ++i) fprintf(stdout, "%8d", hist[i]);
+fprintf(stdout, "\n%6d %6d,%6d %6d: "
+  ,counters[0][2]
+  ,counters[1][2]
+  ,counters[0][1]
+  ,counters[1][1]
+);
+#endif
+
   memory->sfree(v_t0);
   v_t0 = NULL;
 }
diff --git a/src/USER-DPD/fix_shardlow.h b/src/USER-DPD/fix_shardlow.h
index 6fd438b8f0..e87ae3c9cf 100644
--- a/src/USER-DPD/fix_shardlow.h
+++ b/src/USER-DPD/fix_shardlow.h
@@ -38,6 +38,11 @@ class FixShardlow : public Fix {
 
   double memory_usage();
 
+#ifdef DEBUG_PAIR_CT
+  int counters[2][3];
+  int hist[32];
+#endif
+
  protected:
   int pack_reverse_comm(int, int, double *);
   void unpack_reverse_comm(int, int *, double *);

From 35ee24cfad501b694ceba49b7104100101d0a5cb Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Sun, 26 Feb 2017 14:50:58 -0500
Subject: [PATCH 165/267] use RandWrap in pair_dpd_fdt_energy_kokkos and
 fix_shardlow_kokkos

---
 src/KOKKOS/fix_shardlow_kokkos.cpp        | 25 ++++++++++++++++++-----
 src/KOKKOS/fix_shardlow_kokkos.h          |  8 ++++++--
 src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp |  6 +++---
 src/KOKKOS/pair_dpd_fdt_energy_kokkos.h   |  8 ++++----
 4 files changed, 33 insertions(+), 14 deletions(-)

diff --git a/src/KOKKOS/fix_shardlow_kokkos.cpp b/src/KOKKOS/fix_shardlow_kokkos.cpp
index 1459819430..e82991bcba 100644
--- a/src/KOKKOS/fix_shardlow_kokkos.cpp
+++ b/src/KOKKOS/fix_shardlow_kokkos.cpp
@@ -71,7 +71,7 @@ using namespace FixConst;
 
 template<class DeviceType>
 FixShardlowKokkos<DeviceType>::FixShardlowKokkos(LAMMPS *lmp, int narg, char **arg) :
-  FixShardlow(lmp, narg, arg), k_pairDPDE(NULL), ghostmax(0), nlocal(0) , nghost(0), rand_pool(1234567 + comm->me)
+  FixShardlow(lmp, narg, arg), k_pairDPDE(NULL), ghostmax(0), nlocal(0) , nghost(0)
 {
   kokkosable = 1;
 //  atomKK = (AtomKokkos *) atom;
@@ -90,6 +90,7 @@ FixShardlowKokkos<DeviceType>::FixShardlowKokkos(LAMMPS *lmp, int narg, char **a
 //   if(k_pairDPDE){
     comm_forward = 3;
     comm_reverse = 5;
+    p_rand_pool = &(k_pairDPDE->rand_pool);
 //   } else {
 //     comm_forward = 3;
 //     comm_reverse = 3;
@@ -272,7 +273,7 @@ void FixShardlowKokkos<DeviceType>::ssa_update_dpd(
   int start_ii, int count
 )
 {
-  rand_type rand_gen = rand_pool.get_state();
+  rand_type rand_gen = p_rand_pool->get_state();
 
   const double theta_ij_inv = 1.0/k_pairDPD->temperature; // independent of i,j
   const double boltz_inv = 1.0/force->boltz;
@@ -400,7 +401,7 @@ void FixShardlowKokkos<DeviceType>::ssa_update_dpd(
     v(i, 2) = vzi;
   }
 
-  rand_pool.free_state(rand_gen);
+  p_rand_pool->free_state(rand_gen);
 }
 #endif
 
@@ -416,7 +417,11 @@ void FixShardlowKokkos<DeviceType>::ssa_update_dpde(
   int start_ii, int count, int id
 )
 {
-  rand_type rand_gen = rand_pool.get_state();
+#ifdef USE_RAND_MARS
+  class RanMars *pRNG = k_pairDPDE->random;
+#else
+  rand_type rand_gen = p_rand_pool->get_state();
+#endif
 
   const double boltz_inv = 1.0/force->boltz;
   const double ftm2v = force->ftm2v;
@@ -490,7 +495,11 @@ void FixShardlowKokkos<DeviceType>::ssa_update_dpde(
         double halfsigma_ij = STACKPARAMS?m_params[itype][jtype].halfsigma:params(itype,jtype).halfsigma;
         double halfgamma_ij = halfsigma_ij*halfsigma_ij*boltz_inv*theta_ij_inv;
 
+#ifdef USE_RAND_MARS
+        double sigmaRand = halfsigma_ij*wr*dtsqrt*ftm2v * pRNG->gaussian();
+#else
         double sigmaRand = halfsigma_ij*wr*dtsqrt*ftm2v * rand_gen.normal();
+#endif
 
         const double mass_j = masses(massPerI ? j : jtype);
         double mass_ij_div_neg4_ftm2v = mass_j*mass_i_div_neg4_ftm2v;
@@ -499,7 +508,11 @@ void FixShardlowKokkos<DeviceType>::ssa_update_dpde(
         // Compute uCond
         double kappa_ij = STACKPARAMS?m_params[itype][jtype].kappa:params(itype,jtype).kappa;
         double alpha_ij = STACKPARAMS?m_params[itype][jtype].alpha:params(itype,jtype).alpha;
+#ifdef USE_RAND_MARS
+        double del_uCond = alpha_ij*wr*dtsqrt * pRNG->gaussian();
+#else
         double del_uCond = alpha_ij*wr*dtsqrt * rand_gen.normal();
+#endif
 
         del_uCond += kappa_ij*(theta_i_inv - theta_j_inv)*wdt;
         uCond[j] -= del_uCond;
@@ -575,7 +588,9 @@ void FixShardlowKokkos<DeviceType>::ssa_update_dpde(
     ii++;
   }
 
-  rand_pool.free_state(rand_gen);
+#ifndef USE_RAND_MARS
+  p_rand_pool->free_state(rand_gen);
+#endif
 }
 
 
diff --git a/src/KOKKOS/fix_shardlow_kokkos.h b/src/KOKKOS/fix_shardlow_kokkos.h
index ddd4f5b1ba..f71ca1ce11 100644
--- a/src/KOKKOS/fix_shardlow_kokkos.h
+++ b/src/KOKKOS/fix_shardlow_kokkos.h
@@ -70,9 +70,13 @@ class FixShardlowKokkos : public FixShardlow {
  protected:
 //  class PairDPDfdt *pairDPD;
   PairDPDfdtEnergyKokkos<DeviceType> *k_pairDPDE;
-  Kokkos::Random_XorShift64_Pool<DeviceType> rand_pool;
 //  Kokkos::Random_XorShift64_Pool<DeviceType> *p_rand_pool;
-  typedef typename Kokkos::Random_XorShift64_Pool<DeviceType>::generator_type rand_type;
+
+//  Kokkos::Random_XorShift64_Pool<DeviceType> rand_pool;
+//  typedef typename Kokkos::Random_XorShift64_Pool<DeviceType>::generator_type rand_type;
+
+  RandPoolWrap *p_rand_pool;
+  typedef RandWrap rand_type;
 
   Kokkos::DualView<params_ssa**,Kokkos::LayoutRight,DeviceType> k_params;
   typename Kokkos::DualView<params_ssa**,
diff --git a/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp
index aaf638fac3..89e5cd69f1 100644
--- a/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp
+++ b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp
@@ -43,7 +43,7 @@ using namespace LAMMPS_NS;
 
 template<class DeviceType>
 PairDPDfdtEnergyKokkos<DeviceType>::PairDPDfdtEnergyKokkos(LAMMPS *lmp) :
-  PairDPDfdtEnergy(lmp),rand_pool(seed + comm->me /** , lmp/**/)
+  PairDPDfdtEnergy(lmp),rand_pool(12345 /* not actually used, seed + comm->me */, lmp)
 {
   atomKK = (AtomKokkos *) atom;
   execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
@@ -68,7 +68,7 @@ PairDPDfdtEnergyKokkos<DeviceType>::~PairDPDfdtEnergyKokkos()
 
   memory->destroy_kokkos(k_cutsq,cutsq);
 
-  /** rand_pool.destroy();/**/
+  rand_pool.destroy();
 }
 
 /* ----------------------------------------------------------------------
@@ -101,7 +101,7 @@ void PairDPDfdtEnergyKokkos<DeviceType>::init_style()
     error->all(FLERR,"Cannot use chosen neighbor list style with reax/c/kk");
   }
 
-  /** rand_pool.init(random,seed);/**/
+  rand_pool.init(random,seed);
 }
 
 /* ---------------------------------------------------------------------- */
diff --git a/src/KOKKOS/pair_dpd_fdt_energy_kokkos.h b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.h
index deb264c37e..e065d71d3e 100644
--- a/src/KOKKOS/pair_dpd_fdt_energy_kokkos.h
+++ b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.h
@@ -89,11 +89,11 @@ class PairDPDfdtEnergyKokkos : public PairDPDfdtEnergy {
 
   DAT::tdual_efloat_1d k_duCond,k_duMech;
 
-  Kokkos::Random_XorShift64_Pool<DeviceType> rand_pool;
-  typedef typename Kokkos::Random_XorShift64_Pool<DeviceType>::generator_type rand_type;
+  // Kokkos::Random_XorShift64_Pool<DeviceType> rand_pool;
+  // typedef typename Kokkos::Random_XorShift64_Pool<DeviceType>::generator_type rand_type;
 
-  // RandPoolWrap rand_pool;
-  // typedef RandWrap rand_type;
+  RandPoolWrap rand_pool;
+  typedef RandWrap rand_type;
 
   typename ArrayTypes<DeviceType>::tdual_ffloat_2d k_cutsq;
   typename ArrayTypes<DeviceType>::t_ffloat_2d d_cutsq;

From e4b544f934bd816a31759b654f3c26c9ecf36ebd Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Sun, 26 Feb 2017 17:53:45 -0500
Subject: [PATCH 166/267] Make pair_dpd_fdt_energy's random seed public so
 fix_shardlow can use it.

---
 src/USER-DPD/pair_dpd_fdt_energy.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/USER-DPD/pair_dpd_fdt_energy.h b/src/USER-DPD/pair_dpd_fdt_energy.h
index f8303d4854..dce39f83f0 100644
--- a/src/USER-DPD/pair_dpd_fdt_energy.h
+++ b/src/USER-DPD/pair_dpd_fdt_energy.h
@@ -46,11 +46,11 @@ class PairDPDfdtEnergy : public Pair {
   double **sigma,**kappa;
   double *duCond,*duMech;
 
+  int seed;
   class RanMars *random;
 
  protected:
   double cut_global;
-  int seed;
   bool splitFDT_flag;
   bool a0_is_zero;
 

From 3eba3e5a1b756e4a125e0f88201a2c54399e42ce Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Sun, 26 Feb 2017 17:57:13 -0500
Subject: [PATCH 167/267] USER-DPD Kokkos: for deterministic results, serialize
 bin_atoms() for now.

---
 src/KOKKOS/nbin_ssa_kokkos.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/KOKKOS/nbin_ssa_kokkos.cpp b/src/KOKKOS/nbin_ssa_kokkos.cpp
index ebd07752b0..98ec638be9 100644
--- a/src/KOKKOS/nbin_ssa_kokkos.cpp
+++ b/src/KOKKOS/nbin_ssa_kokkos.cpp
@@ -140,7 +140,7 @@ void NBinSSAKokkos<DeviceType>::bin_atoms()
     subhi_[1] = domain->subhi[1];
     subhi_[2] = domain->subhi[2];
 
-    Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType>(atom->nlocal,atom->nlocal+atom->nghost), KOKKOS_LAMBDA (const int i) {
+    Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::Serial>(atom->nlocal,atom->nlocal+atom->nghost), KOKKOS_LAMBDA (const int i) {
       const int iAIR = coord2ssaAIR(x(i, 0), x(i, 1), x(i, 2));
       if (iAIR > 0) { // include only ghost atoms in an AIR
         const int ac = Kokkos::atomic_fetch_add(&gbincount[iAIR], (int)1);
@@ -188,7 +188,7 @@ void NBinSSAKokkos<DeviceType>::bin_atoms()
 
     NPairSSAKokkosBinAtomsFunctor<DeviceType> f(*this);
 
-    Kokkos::parallel_for(atom->nlocal, f);
+    Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::Serial>(0, atom->nlocal), f);
     DeviceType::fence();
 
     deep_copy(h_resize, d_resize);

From a5507b291d3d78d93136053d530f8be51d57728c Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Sun, 26 Feb 2017 18:00:20 -0500
Subject: [PATCH 168/267] USER-DPD Kokkos: give each workItem index a unique
 instance of RanMars. Makes fix_shardlow_kokkos deterministic across runs and
 thread count.

---
 src/KOKKOS/fix_shardlow_kokkos.cpp | 27 ++++++++++++++++++++++++++-
 src/KOKKOS/fix_shardlow_kokkos.h   |  6 ++++--
 2 files changed, 30 insertions(+), 3 deletions(-)

diff --git a/src/KOKKOS/fix_shardlow_kokkos.cpp b/src/KOKKOS/fix_shardlow_kokkos.cpp
index e82991bcba..a5e02f3dd8 100644
--- a/src/KOKKOS/fix_shardlow_kokkos.cpp
+++ b/src/KOKKOS/fix_shardlow_kokkos.cpp
@@ -90,7 +90,12 @@ FixShardlowKokkos<DeviceType>::FixShardlowKokkos(LAMMPS *lmp, int narg, char **a
 //   if(k_pairDPDE){
     comm_forward = 3;
     comm_reverse = 5;
+#ifdef USE_RAND_MARS
+    maxRNG = 0;
+    pp_random = NULL;
+#else
     p_rand_pool = &(k_pairDPDE->rand_pool);
+#endif
 //   } else {
 //     comm_forward = 3;
 //     comm_reverse = 3;
@@ -119,6 +124,11 @@ template<class DeviceType>
 FixShardlowKokkos<DeviceType>::~FixShardlowKokkos()
 {
   ghostmax = 0;
+  if (pp_random) {
+    for (int i = 1; i < maxRNG; ++i) delete pp_random[i];
+    delete[] pp_random;
+    pp_random = NULL;
+  }
 }
 
 /* ---------------------------------------------------------------------- */
@@ -418,7 +428,7 @@ void FixShardlowKokkos<DeviceType>::ssa_update_dpde(
 )
 {
 #ifdef USE_RAND_MARS
-  class RanMars *pRNG = k_pairDPDE->random;
+  class RanMars *pRNG = pp_random[id];
 #else
   rand_type rand_gen = p_rand_pool->get_state();
 #endif
@@ -618,6 +628,21 @@ void FixShardlowKokkos<DeviceType>::initial_integrate(int vflag)
   ssa_gitemLoc = np_ssa->ssa_gitemLoc;
   ssa_gitemLen = np_ssa->ssa_gitemLen;
 
+  int maxWorkItemCt = (int) ssa_itemLoc.dimension_1();
+  if (maxWorkItemCt > maxRNG) {
+    if (pp_random) {
+      for (int i = 1; i < maxRNG; ++i) delete pp_random[i];
+      delete[] pp_random;
+      pp_random = NULL;
+    }
+    maxRNG = maxWorkItemCt;
+    pp_random = new RanMars*[maxRNG];
+    for (int i = 1; i < maxRNG; ++i) {
+      pp_random[i] = new RanMars(lmp, k_pairDPDE->seed + comm->me + comm->nprocs*i);
+    }
+    pp_random[0] = k_pairDPDE->random;
+  }
+
 #ifdef DEBUG_PAIR_CT
   for (int i = 0; i < 2; ++i)
     for (int j = 0; j < 3; ++j)
diff --git a/src/KOKKOS/fix_shardlow_kokkos.h b/src/KOKKOS/fix_shardlow_kokkos.h
index f71ca1ce11..95e8add64a 100644
--- a/src/KOKKOS/fix_shardlow_kokkos.h
+++ b/src/KOKKOS/fix_shardlow_kokkos.h
@@ -75,8 +75,10 @@ class FixShardlowKokkos : public FixShardlow {
 //  Kokkos::Random_XorShift64_Pool<DeviceType> rand_pool;
 //  typedef typename Kokkos::Random_XorShift64_Pool<DeviceType>::generator_type rand_type;
 
-  RandPoolWrap *p_rand_pool;
-  typedef RandWrap rand_type;
+//  RandPoolWrap *p_rand_pool;
+//  typedef RandWrap rand_type;
+  int maxRNG;
+  class RanMars **pp_random;
 
   Kokkos::DualView<params_ssa**,Kokkos::LayoutRight,DeviceType> k_params;
   typename Kokkos::DualView<params_ssa**,

From 2b78ac2146321e1e7f974e9812093214cb7e7f8a Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Tue, 28 Feb 2017 12:49:11 -0500
Subject: [PATCH 169/267] USER-DPD Kokkos: Add "#ifdef DPD_USE_RAN_MARS" toggle.
 Also, initialize the rand_pool with a seed in init_style()

---
 src/KOKKOS/fix_shardlow_kokkos.cpp        | 14 ++++++++-----
 src/KOKKOS/fix_shardlow_kokkos.h          | 11 +++++------
 src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp | 13 +++++++++++-
 src/KOKKOS/pair_dpd_fdt_energy_kokkos.h   | 24 +++++++++++++++++++----
 4 files changed, 46 insertions(+), 16 deletions(-)

diff --git a/src/KOKKOS/fix_shardlow_kokkos.cpp b/src/KOKKOS/fix_shardlow_kokkos.cpp
index a5e02f3dd8..9bac6250da 100644
--- a/src/KOKKOS/fix_shardlow_kokkos.cpp
+++ b/src/KOKKOS/fix_shardlow_kokkos.cpp
@@ -90,7 +90,7 @@ FixShardlowKokkos<DeviceType>::FixShardlowKokkos(LAMMPS *lmp, int narg, char **a
 //   if(k_pairDPDE){
     comm_forward = 3;
     comm_reverse = 5;
-#ifdef USE_RAND_MARS
+#ifdef DPD_USE_RAN_MARS
     maxRNG = 0;
     pp_random = NULL;
 #else
@@ -124,11 +124,13 @@ template<class DeviceType>
 FixShardlowKokkos<DeviceType>::~FixShardlowKokkos()
 {
   ghostmax = 0;
+#ifdef DPD_USE_RAN_MARS
   if (pp_random) {
     for (int i = 1; i < maxRNG; ++i) delete pp_random[i];
     delete[] pp_random;
     pp_random = NULL;
   }
+#endif
 }
 
 /* ---------------------------------------------------------------------- */
@@ -427,7 +429,7 @@ void FixShardlowKokkos<DeviceType>::ssa_update_dpde(
   int start_ii, int count, int id
 )
 {
-#ifdef USE_RAND_MARS
+#ifdef DPD_USE_RAN_MARS
   class RanMars *pRNG = pp_random[id];
 #else
   rand_type rand_gen = p_rand_pool->get_state();
@@ -505,7 +507,7 @@ void FixShardlowKokkos<DeviceType>::ssa_update_dpde(
         double halfsigma_ij = STACKPARAMS?m_params[itype][jtype].halfsigma:params(itype,jtype).halfsigma;
         double halfgamma_ij = halfsigma_ij*halfsigma_ij*boltz_inv*theta_ij_inv;
 
-#ifdef USE_RAND_MARS
+#ifdef DPD_USE_RAN_MARS
         double sigmaRand = halfsigma_ij*wr*dtsqrt*ftm2v * pRNG->gaussian();
 #else
         double sigmaRand = halfsigma_ij*wr*dtsqrt*ftm2v * rand_gen.normal();
@@ -518,7 +520,7 @@ void FixShardlowKokkos<DeviceType>::ssa_update_dpde(
         // Compute uCond
         double kappa_ij = STACKPARAMS?m_params[itype][jtype].kappa:params(itype,jtype).kappa;
         double alpha_ij = STACKPARAMS?m_params[itype][jtype].alpha:params(itype,jtype).alpha;
-#ifdef USE_RAND_MARS
+#ifdef DPD_USE_RAN_MARS
         double del_uCond = alpha_ij*wr*dtsqrt * pRNG->gaussian();
 #else
         double del_uCond = alpha_ij*wr*dtsqrt * rand_gen.normal();
@@ -598,7 +600,7 @@ void FixShardlowKokkos<DeviceType>::ssa_update_dpde(
     ii++;
   }
 
-#ifndef USE_RAND_MARS
+#ifndef DPD_USE_RAN_MARS
   p_rand_pool->free_state(rand_gen);
 #endif
 }
@@ -628,6 +630,7 @@ void FixShardlowKokkos<DeviceType>::initial_integrate(int vflag)
   ssa_gitemLoc = np_ssa->ssa_gitemLoc;
   ssa_gitemLen = np_ssa->ssa_gitemLen;
 
+#ifdef DPD_USE_RAN_MARS
   int maxWorkItemCt = (int) ssa_itemLoc.dimension_1();
   if (maxWorkItemCt > maxRNG) {
     if (pp_random) {
@@ -642,6 +645,7 @@ void FixShardlowKokkos<DeviceType>::initial_integrate(int vflag)
     }
     pp_random[0] = k_pairDPDE->random;
   }
+#endif
 
 #ifdef DEBUG_PAIR_CT
   for (int i = 0; i < 2; ++i)
diff --git a/src/KOKKOS/fix_shardlow_kokkos.h b/src/KOKKOS/fix_shardlow_kokkos.h
index 95e8add64a..011c16dc60 100644
--- a/src/KOKKOS/fix_shardlow_kokkos.h
+++ b/src/KOKKOS/fix_shardlow_kokkos.h
@@ -70,15 +70,14 @@ class FixShardlowKokkos : public FixShardlow {
  protected:
 //  class PairDPDfdt *pairDPD;
   PairDPDfdtEnergyKokkos<DeviceType> *k_pairDPDE;
-//  Kokkos::Random_XorShift64_Pool<DeviceType> *p_rand_pool;
 
-//  Kokkos::Random_XorShift64_Pool<DeviceType> rand_pool;
-//  typedef typename Kokkos::Random_XorShift64_Pool<DeviceType>::generator_type rand_type;
-
-//  RandPoolWrap *p_rand_pool;
-//  typedef RandWrap rand_type;
+#ifdef DPD_USE_RAN_MARS
   int maxRNG;
   class RanMars **pp_random;
+#else
+  Kokkos::Random_XorShift64_Pool<DeviceType> *p_rand_pool;
+  typedef typename Kokkos::Random_XorShift64_Pool<DeviceType>::generator_type rand_type;
+#endif
 
   Kokkos::DualView<params_ssa**,Kokkos::LayoutRight,DeviceType> k_params;
   typename Kokkos::DualView<params_ssa**,
diff --git a/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp
index 89e5cd69f1..e534f97391 100644
--- a/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp
+++ b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp
@@ -43,7 +43,12 @@ using namespace LAMMPS_NS;
 
 template<class DeviceType>
 PairDPDfdtEnergyKokkos<DeviceType>::PairDPDfdtEnergyKokkos(LAMMPS *lmp) :
-  PairDPDfdtEnergy(lmp),rand_pool(12345 /* not actually used, seed + comm->me */, lmp)
+  PairDPDfdtEnergy(lmp),
+#ifdef DPD_USE_RAN_MARS
+  rand_pool(0 /* unused */, lmp)
+#else
+  rand_pool()
+#endif
 {
   atomKK = (AtomKokkos *) atom;
   execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
@@ -68,7 +73,9 @@ PairDPDfdtEnergyKokkos<DeviceType>::~PairDPDfdtEnergyKokkos()
 
   memory->destroy_kokkos(k_cutsq,cutsq);
 
+#ifdef DPD_USE_RAN_MARS
   rand_pool.destroy();
+#endif
 }
 
 /* ----------------------------------------------------------------------
@@ -101,7 +108,11 @@ void PairDPDfdtEnergyKokkos<DeviceType>::init_style()
     error->all(FLERR,"Cannot use chosen neighbor list style with reax/c/kk");
   }
 
+#ifdef DPD_USE_RAN_MARS
   rand_pool.init(random,seed);
+#else
+  rand_pool.init(seed + comm->me,DeviceType::max_hardware_threads());
+#endif
 }
 
 /* ---------------------------------------------------------------------- */
diff --git a/src/KOKKOS/pair_dpd_fdt_energy_kokkos.h b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.h
index e065d71d3e..a32539242a 100644
--- a/src/KOKKOS/pair_dpd_fdt_energy_kokkos.h
+++ b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.h
@@ -22,11 +22,25 @@ PairStyle(dpd/fdt/energy/kk/host,PairDPDfdtEnergyKokkos<LMPHostType>)
 #ifndef LMP_PAIR_DPD_FDT_ENERGY_KOKKOS_H
 #define LMP_PAIR_DPD_FDT_ENERGY_KOKKOS_H
 
+
+#ifndef ALLOW_NON_DETERMINISTIC_DPD
+#ifdef KOKKOS_HAVE_CUDA
+//FIXME print some warning
+#endif
+#ifndef DPD_USE_RAN_MARS
+#define DPD_USE_RAN_MARS
+#endif
+#endif
+
+
 #include "pair_dpd_fdt_energy.h"
 #include "pair_kokkos.h"
 #include "kokkos_type.h"
-#include "Kokkos_Random.hpp"
+#ifdef DPD_USE_RAN_MARS
 #include "rand_pool_wrap_kokkos.h"
+#else
+#include "Kokkos_Random.hpp"
+#endif
 
 namespace LAMMPS_NS {
 
@@ -89,11 +103,13 @@ class PairDPDfdtEnergyKokkos : public PairDPDfdtEnergy {
 
   DAT::tdual_efloat_1d k_duCond,k_duMech;
 
-  // Kokkos::Random_XorShift64_Pool<DeviceType> rand_pool;
-  // typedef typename Kokkos::Random_XorShift64_Pool<DeviceType>::generator_type rand_type;
-
+#ifdef DPD_USE_RAN_MARS
   RandPoolWrap rand_pool;
   typedef RandWrap rand_type;
+#else
+  Kokkos::Random_XorShift64_Pool<DeviceType> rand_pool;
+  typedef typename Kokkos::Random_XorShift64_Pool<DeviceType>::generator_type rand_type;
+#endif
 
   typename ArrayTypes<DeviceType>::tdual_ffloat_2d k_cutsq;
   typename ArrayTypes<DeviceType>::t_ffloat_2d d_cutsq;

From b26a434a502d953b3b7fd2772a4bbc5a5091f468 Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Tue, 28 Feb 2017 12:53:56 -0500
Subject: [PATCH 170/267] USER-DPD Kokkos: Add "#ifdef
 ALLOW_NON_DETERMINISTIC_SSA" toggle SSA atom binning algorithm was adjusted
 to do as much work as possible in parallel while preserving deterministic behavior.  The
 final step is done serially to preserve deterministic behavior. An
 alternative would be to sort the contents of the bins so that they are always
 in the same order.

---
 src/KOKKOS/nbin_ssa_kokkos.cpp | 189 +++++++++++++++++++--------------
 src/KOKKOS/nbin_ssa_kokkos.h   |  65 ++++++++++++
 2 files changed, 172 insertions(+), 82 deletions(-)

diff --git a/src/KOKKOS/nbin_ssa_kokkos.cpp b/src/KOKKOS/nbin_ssa_kokkos.cpp
index 98ec638be9..53f3f2fc80 100644
--- a/src/KOKKOS/nbin_ssa_kokkos.cpp
+++ b/src/KOKKOS/nbin_ssa_kokkos.cpp
@@ -115,89 +115,31 @@ void NBinSSAKokkos<DeviceType>::bin_atoms()
   last_bin = update->ntimestep;
 
   int i;
+  int nlocal = atom->nlocal;
+  int nghost = atom->nghost;
+  int nall = nlocal + nghost;
 
-  // bin the ghost atoms
-  h_resize() = 1;
-  while(h_resize() > 0) {
-    h_resize() = 0;
-    deep_copy(d_resize, h_resize);
+  atomKK->sync(ExecutionSpaceFromDevice<DeviceType>::space,X_MASK);
+  x = atomKK->k_x.view<DeviceType>();
 
-    for (int i = 0; i < 8; i++) {
-      k_gbincount.h_view(i) = 0;
-    }
-    k_gbincount.modify<LMPHostType>();
-    k_gbincount.sync<DeviceType>();
-    DeviceType::fence(); // FIXME?
+  sublo_[0] = domain->sublo[0];
+  sublo_[1] = domain->sublo[1];
+  sublo_[2] = domain->sublo[2];
+  subhi_[0] = domain->subhi[0];
+  subhi_[1] = domain->subhi[1];
+  subhi_[2] = domain->subhi[2];
 
-    atomKK->sync(ExecutionSpaceFromDevice<DeviceType>::space,X_MASK);
-    x = atomKK->k_x.view<DeviceType>();
+  bboxlo_[0] = bboxlo[0]; bboxlo_[1] = bboxlo[1]; bboxlo_[2] = bboxlo[2];
+  bboxhi_[0] = bboxhi[0]; bboxhi_[1] = bboxhi[1]; bboxhi_[2] = bboxhi[2];
 
-    // I don't think these two lines need to be repeated here... - TIM 20170216
-    sublo_[0] = domain->sublo[0];
-    sublo_[1] = domain->sublo[1];
-    sublo_[2] = domain->sublo[2];
-    subhi_[0] = domain->subhi[0];
-    subhi_[1] = domain->subhi[1];
-    subhi_[2] = domain->subhi[2];
+  k_binID = DAT::tdual_int_1d("NBinSSAKokkos::binID",nall);
+  binID = k_binID.view<DeviceType>();
 
-    Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::Serial>(atom->nlocal,atom->nlocal+atom->nghost), KOKKOS_LAMBDA (const int i) {
-      const int iAIR = coord2ssaAIR(x(i, 0), x(i, 1), x(i, 2));
-      if (iAIR > 0) { // include only ghost atoms in an AIR
-        const int ac = Kokkos::atomic_fetch_add(&gbincount[iAIR], (int)1);
-        if(ac < (int) gbins.dimension_1()) {
-          gbins(iAIR, ac) = i;
-        } else {
-          d_resize() = 1;
-        }
-      }
-    });
-    DeviceType::fence();
-
-    deep_copy(h_resize, d_resize);
-    if(h_resize()) {
-      k_gbincount.modify<DeviceType>();
-      k_gbincount.sync<DeviceType>();
-      for (i = 1; i < 8; i++) {
-        if (k_gbincount.h_view(i) > ghosts_per_gbin) {
-          ghosts_per_gbin = k_gbincount.h_view(i);
-        }
-      }
-      k_gbins = DAT::tdual_int_2d("gbins", 8, ghosts_per_gbin);
-      gbins = k_gbins.view<DeviceType>();
-    }
-  }
-  c_gbins = gbins; // gbins won't change until the next bin_atoms
-
-  // bin the local atoms
-  h_resize() = 1;
-  while(h_resize() > 0) {
-    h_resize() = 0;
-    deep_copy(d_resize, h_resize);
-
-    MemsetZeroFunctor<DeviceType> f_zero;
-    f_zero.ptr = (void*) k_bincount.view<DeviceType>().ptr_on_device();
-    Kokkos::parallel_for(mbins, f_zero);
-    DeviceType::fence();
-
-    atomKK->sync(ExecutionSpaceFromDevice<DeviceType>::space,X_MASK);
-    x = atomKK->k_x.view<DeviceType>();
-
-    // I don't think these two lines need to be repeated here... - TIM 20170216
-    bboxlo_[0] = bboxlo[0]; bboxlo_[1] = bboxlo[1]; bboxlo_[2] = bboxlo[2];
-    bboxhi_[0] = bboxhi[0]; bboxhi_[1] = bboxhi[1]; bboxhi_[2] = bboxhi[2];
-
-    NPairSSAKokkosBinAtomsFunctor<DeviceType> f(*this);
-
-    Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::Serial>(0, atom->nlocal), f);
-    DeviceType::fence();
-
-    deep_copy(h_resize, d_resize);
-    if(h_resize()) {
-
-      atoms_per_bin += 16;
-      k_bins = DAT::tdual_int_2d("bins", mbins, atoms_per_bin);
-      bins = k_bins.view<DeviceType>();
-    }
+  // find each local atom's binID
+  {
+    atoms_per_bin = 0;
+    NPairSSAKokkosBinIDAtomsFunctor<DeviceType> f(*this);
+    Kokkos::parallel_reduce(nlocal, f, atoms_per_bin);
   }
   deep_copy(h_lbinxlo, d_lbinxlo);
   deep_copy(h_lbinylo, d_lbinylo);
@@ -205,7 +147,72 @@ void NBinSSAKokkos<DeviceType>::bin_atoms()
   deep_copy(h_lbinxhi, d_lbinxhi);
   deep_copy(h_lbinyhi, d_lbinyhi);
   deep_copy(h_lbinzhi, d_lbinzhi);
+
+  // find each ghost's binID (AIR number)
+  {
+    for (int i = 0; i < 8; i++) k_gbincount.h_view(i) = 0;
+    k_gbincount.modify<LMPHostType>();
+    k_gbincount.sync<DeviceType>();
+    DeviceType::fence(); // FIXME?
+    ghosts_per_gbin = 0;
+    NPairSSAKokkosBinIDGhostsFunctor<DeviceType> f(*this);
+    Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType>(nlocal,nall), f, ghosts_per_gbin);
+  }
+
+  // actually bin the ghost atoms
+  {
+    if(ghosts_per_gbin > (int) gbins.dimension_1()) {
+      k_gbins = DAT::tdual_int_2d("gbins", 8, ghosts_per_gbin);
+      gbins = k_gbins.view<DeviceType>();
+    }
+    for (int i = 0; i < 8; i++) k_gbincount.h_view(i) = 0;
+    k_gbincount.modify<LMPHostType>();
+    k_gbincount.sync<DeviceType>();
+    DeviceType::fence(); // FIXME?
+
+    Kokkos::parallel_for(
+#ifdef ALLOW_NON_DETERMINISTIC_SSA
+      Kokkos::RangePolicy<DeviceType>(nlocal,nall)
+#else
+      Kokkos::RangePolicy<Kokkos::Serial>(nlocal,nall)
+#endif
+      , KOKKOS_LAMBDA (const int i) {
+      const int iAIR = binID(i);
+      if (iAIR > 0) { // include only ghost atoms in an AIR
+        const int ac = Kokkos::atomic_fetch_add(&gbincount[iAIR], (int)1);
+        gbins(iAIR, ac) = i;
+      }
+    });
+    DeviceType::fence();
+  }
+  c_gbins = gbins; // gbins won't change until the next bin_atoms
+
+  // actually bin the local atoms
+  {
+    if ((mbins > (int) bins.dimension_0()) ||
+        (atoms_per_bin > (int) bins.dimension_1())) {
+      k_bins = DAT::tdual_int_2d("bins", mbins, atoms_per_bin);
+      bins = k_bins.view<DeviceType>();
+    }
+    MemsetZeroFunctor<DeviceType> f_zero;
+    f_zero.ptr = (void*) k_bincount.view<DeviceType>().ptr_on_device();
+    Kokkos::parallel_for(mbins, f_zero);
+    DeviceType::fence();
+
+    NPairSSAKokkosBinAtomsFunctor<DeviceType> f(*this);
+#ifdef ALLOW_NON_DETERMINISTIC_SSA
+    Kokkos::parallel_for(nlocal, f);
+#else
+    Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::Serial>(0, nlocal), f);
+#endif
+    DeviceType::fence();
+
+  }
   c_bins = bins; // bins won't change until the next bin_atoms
+
+//now dispose of the k_binID array
+  k_binID = DAT::tdual_int_1d("NBinSSAKokkos::binID",0);
+  binID = k_binID.view<DeviceType>();
 }
 
 /* ---------------------------------------------------------------------- */
@@ -213,9 +220,19 @@ void NBinSSAKokkos<DeviceType>::bin_atoms()
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
 void NBinSSAKokkos<DeviceType>::binAtomsItem(const int &i) const
+{
+  const int ibin = binID(i);
+  const int ac = Kokkos::atomic_fetch_add(&(bincount[ibin]), (int)1);
+  bins(ibin, ac) = i;
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void NBinSSAKokkos<DeviceType>::binIDAtomsItem(const int &i, int &update) const
 {
   int loc[3];
   const int ibin = coord2bin(x(i, 0), x(i, 1), x(i, 2), &(loc[0]));
+  binID(i) = ibin;
 
   // Find the bounding box of the local atoms in the bins
   if (loc[0] < d_lbinxlo()) Kokkos::atomic_fetch_min(&d_lbinxlo(),loc[0]);
@@ -226,10 +243,18 @@ void NBinSSAKokkos<DeviceType>::binAtomsItem(const int &i) const
   if (loc[2] >= d_lbinzhi()) Kokkos::atomic_fetch_max(&d_lbinzhi(),loc[2] + 1);
 
   const int ac = Kokkos::atomic_fetch_add(&(bincount[ibin]), (int)1);
-  if(ac < (int) bins.dimension_1()) {
-    bins(ibin, ac) = i;
-  } else {
-    d_resize() = 1;
+  if (update <= ac) update = ac + 1;
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void NBinSSAKokkos<DeviceType>::binIDGhostsItem(const int &i, int &update) const
+{
+  const int iAIR = coord2ssaAIR(x(i, 0), x(i, 1), x(i, 2));
+  binID(i) = iAIR;
+  if (iAIR > 0) { // include only ghost atoms in an AIR
+    const int ac = Kokkos::atomic_fetch_add(&gbincount[iAIR], (int)1);
+    if (update <= ac) update = ac + 1;
   }
 }
 
diff --git a/src/KOKKOS/nbin_ssa_kokkos.h b/src/KOKKOS/nbin_ssa_kokkos.h
index 488c1034f5..69f05c9304 100644
--- a/src/KOKKOS/nbin_ssa_kokkos.h
+++ b/src/KOKKOS/nbin_ssa_kokkos.h
@@ -41,6 +41,11 @@ class NBinSSAKokkos : public NBinStandard {
   void bin_atoms_setup(int);
   void bin_atoms();
 
+   // temporary array to hold the binID for each atom
+  DAT::tdual_int_1d k_binID;
+  typename AT::t_int_1d binID;
+  typename AT::t_int_1d_const c_binID;
+
   int atoms_per_bin;
   DAT::tdual_int_1d k_bincount;
   DAT::tdual_int_2d k_bins;
@@ -77,6 +82,12 @@ class NBinSSAKokkos : public NBinStandard {
   KOKKOS_INLINE_FUNCTION
   void binAtomsItem(const int &i) const;
 
+  KOKKOS_INLINE_FUNCTION
+  void binIDAtomsItem(const int &i, int &update) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void binIDGhostsItem(const int &i, int &update) const;
+
 /* ----------------------------------------------------------------------
    convert atom coords into the ssa active interaction region number
 ------------------------------------------------------------------------- */
@@ -165,6 +176,60 @@ struct NPairSSAKokkosBinAtomsFunctor {
   }
 };
 
+template<class DeviceType>
+struct NPairSSAKokkosBinIDAtomsFunctor {
+  typedef DeviceType device_type;
+  typedef int value_type;
+
+  const NBinSSAKokkos<DeviceType> c;
+
+  NPairSSAKokkosBinIDAtomsFunctor(const NBinSSAKokkos<DeviceType> &_c):
+    c(_c) {};
+  ~NPairSSAKokkosBinIDAtomsFunctor() {}
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int & i, value_type& update) const {
+    c.binIDAtomsItem(i, update);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join (volatile value_type& dst,
+             const volatile value_type& src) const {
+    if (dst < src) dst = src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void init (value_type& dst) const {
+    dst = INT_MIN;
+  }
+};
+
+template<class DeviceType>
+struct NPairSSAKokkosBinIDGhostsFunctor {
+  typedef DeviceType device_type;
+  typedef int value_type;
+
+  const NBinSSAKokkos<DeviceType> c;
+
+  NPairSSAKokkosBinIDGhostsFunctor(const NBinSSAKokkos<DeviceType> &_c):
+    c(_c) {};
+  ~NPairSSAKokkosBinIDGhostsFunctor() {}
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int & i, value_type& update) const {
+    c.binIDGhostsItem(i, update);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join (volatile value_type& dst,
+             const volatile value_type& src) const {
+    if (dst < src) dst = src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void init (value_type& dst) const {
+    dst = INT_MIN;
+  }
+};
+
 }
 
 #endif

From 0982331c71fbb9d9420e26c47dfad1a0392fd028 Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Wed, 1 Mar 2017 09:49:24 -0500
Subject: [PATCH 171/267] USER-DPD Kokkos: replicate 7a593c2f bugfix to
 pair_table_rx_kokkos.cpp

---
 src/KOKKOS/pair_table_rx_kokkos.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/KOKKOS/pair_table_rx_kokkos.cpp b/src/KOKKOS/pair_table_rx_kokkos.cpp
index 58108c9308..2a1ee2c0b1 100644
--- a/src/KOKKOS/pair_table_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_table_rx_kokkos.cpp
@@ -984,7 +984,7 @@ void PairTableRXKokkos<DeviceType>::coeff(int narg, char **arg)
   nspecies = atom->nspecies_dpd;
   if(nspecies==0) error->all(FLERR,"There are no rx species specified.");
   int n;
-  n = strlen(arg[3]) + 1;
+  n = strlen(arg[4]) + 1;
   site1 = new char[n];
   strcpy(site1,arg[4]);
 
@@ -995,7 +995,7 @@ void PairTableRXKokkos<DeviceType>::coeff(int narg, char **arg)
   if (ispecies == nspecies && strcmp(site1,"1fluid") != 0)
     error->all(FLERR,"Site1 name not recognized in pair coefficients");
 
-  n = strlen(arg[4]) + 1;
+  n = strlen(arg[5]) + 1;
   site2 = new char[n];
   strcpy(site2,arg[5]);
 

From 6e26358ec3eade9bb35f4dc1ad48b5374cec417d Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Wed, 1 Mar 2017 11:46:26 -0500
Subject: [PATCH 172/267] lib kokkos bugfix: on a CUDA host, the random state
 wasn't preserved. Random_XorShift*_Pool<Kokkos::Cuda>::free_state() has two
 purposes: 1) update the state value kept in the pool; 2) unlock the state. For
 a CUDA host thread, ONLY skip step 2, not both.

---
 lib/kokkos/algorithms/src/Kokkos_Random.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/kokkos/algorithms/src/Kokkos_Random.hpp b/lib/kokkos/algorithms/src/Kokkos_Random.hpp
index d376173bf1..2fb6b553c2 100644
--- a/lib/kokkos/algorithms/src/Kokkos_Random.hpp
+++ b/lib/kokkos/algorithms/src/Kokkos_Random.hpp
@@ -1204,8 +1204,8 @@ Random_XorShift64<Kokkos::Cuda> Random_XorShift64_Pool<Kokkos::Cuda>::get_state(
 template<>
 KOKKOS_INLINE_FUNCTION
 void Random_XorShift64_Pool<Kokkos::Cuda>::free_state(const Random_XorShift64<Kokkos::Cuda> &state) const {
-#ifdef __CUDA_ARCH__
   state_(state.state_idx_) = state.state_;
+#ifdef __CUDA_ARCH__
   locks_(state.state_idx_) = 0;
   return;
 #endif
@@ -1240,9 +1240,9 @@ Random_XorShift1024<Kokkos::Cuda> Random_XorShift1024_Pool<Kokkos::Cuda>::get_st
 template<>
 KOKKOS_INLINE_FUNCTION
 void Random_XorShift1024_Pool<Kokkos::Cuda>::free_state(const Random_XorShift1024<Kokkos::Cuda> &state) const {
-#ifdef __CUDA_ARCH__
   for(int i=0; i<16; i++)
     state_(state.state_idx_,i) = state.state_[i];
+#ifdef __CUDA_ARCH__
   locks_(state.state_idx_) = 0;
   return;
 #endif

From 641bf72f2030f78d1ecfbbedc60704f4215d9662 Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Wed, 1 Mar 2017 11:52:33 -0500
Subject: [PATCH 173/267] lib kokkos: Enable deterministic use of
 Random_XorShift*_Pool. Add support for lock-free and deterministic use of
 Random_XorShift*_Pool by giving state_idx selection and lock responsibility
 up to the application.  Done by an overload of get_state() to take state_idx
 as an argument that the application guarantees is concurrently unique and
 within the range of num_states that the application passed to init(). In
 other words, this allows the RNG state to be associated with some application
 specific index, rather than a runtime arbitrary thread ID, and thus the
 application can control which work is performed using which RNG in a
 deterministic manner, regardless of which thread performs the work.

---
 lib/kokkos/algorithms/src/Kokkos_Random.hpp | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/lib/kokkos/algorithms/src/Kokkos_Random.hpp b/lib/kokkos/algorithms/src/Kokkos_Random.hpp
index 2fb6b553c2..a0d666183c 100644
--- a/lib/kokkos/algorithms/src/Kokkos_Random.hpp
+++ b/lib/kokkos/algorithms/src/Kokkos_Random.hpp
@@ -752,6 +752,12 @@ namespace Kokkos {
       return Random_XorShift64<DeviceType>(state_(i),i);
     }
 
+    // NOTE: state_idx MUST be unique and less than num_states
+    KOKKOS_INLINE_FUNCTION
+    Random_XorShift64<DeviceType> get_state(const int state_idx) const {
+      return Random_XorShift64<DeviceType>(state_(state_idx),state_idx);
+    }
+
     KOKKOS_INLINE_FUNCTION
     void free_state(const Random_XorShift64<DeviceType>& state) const {
       state_(state.state_idx_) = state.state_;
@@ -1006,6 +1012,12 @@ namespace Kokkos {
       return Random_XorShift1024<DeviceType>(state_,p_(i),i);
     };
 
+    // NOTE: state_idx MUST be unique and less than num_states
+    KOKKOS_INLINE_FUNCTION
+    Random_XorShift1024<DeviceType> get_state(const int state_idx) const {
+      return Random_XorShift1024<DeviceType>(state_,p_(state_idx),state_idx);
+    }
+
     KOKKOS_INLINE_FUNCTION
     void free_state(const Random_XorShift1024<DeviceType>& state) const {
       for(int i = 0; i<16; i++)

From 268e855a151360861f7b6059414356af1323e997 Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Wed, 1 Mar 2017 14:14:29 -0500
Subject: [PATCH 174/267] USER-DPD Kokkos: bugfix for the rare case where the
 SSA ghost processing has more parallelism than for the locals.

---
 src/KOKKOS/fix_shardlow_kokkos.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/KOKKOS/fix_shardlow_kokkos.cpp b/src/KOKKOS/fix_shardlow_kokkos.cpp
index 9bac6250da..8b6432e2dc 100644
--- a/src/KOKKOS/fix_shardlow_kokkos.cpp
+++ b/src/KOKKOS/fix_shardlow_kokkos.cpp
@@ -50,7 +50,6 @@
 #include "neighbor.h"
 #include "neigh_list_kokkos.h"
 #include "neigh_request.h"
-#include "random_mars.h"
 #include "memory.h"
 #include "domain.h"
 #include "modify.h"
@@ -632,6 +631,9 @@ void FixShardlowKokkos<DeviceType>::initial_integrate(int vflag)
 
 #ifdef DPD_USE_RAN_MARS
   int maxWorkItemCt = (int) ssa_itemLoc.dimension_1();
+  if (maxWorkItemCt < (int) ssa_gitemLoc.dimension_1()) {
+    maxWorkItemCt = (int) ssa_gitemLoc.dimension_1();
+  }
   if (maxWorkItemCt > maxRNG) {
     if (pp_random) {
       for (int i = 1; i < maxRNG; ++i) delete pp_random[i];

From ed089c34cfda77885d235ade1a285d30109ce157 Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Wed, 1 Mar 2017 14:18:14 -0500
Subject: [PATCH 175/267] USER-DPD Kokkos: Now use the deterministic
 Random_XorShift64() for SSA

---
 src/KOKKOS/fix_shardlow_kokkos.cpp        | 49 ++++++++++++++---------
 src/KOKKOS/fix_shardlow_kokkos.h          |  7 +++-
 src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp |  1 -
 src/KOKKOS/pair_dpd_fdt_energy_kokkos.h   | 17 ++++----
 4 files changed, 44 insertions(+), 30 deletions(-)

diff --git a/src/KOKKOS/fix_shardlow_kokkos.cpp b/src/KOKKOS/fix_shardlow_kokkos.cpp
index 8b6432e2dc..996f37257d 100644
--- a/src/KOKKOS/fix_shardlow_kokkos.cpp
+++ b/src/KOKKOS/fix_shardlow_kokkos.cpp
@@ -89,11 +89,9 @@ FixShardlowKokkos<DeviceType>::FixShardlowKokkos(LAMMPS *lmp, int narg, char **a
 //   if(k_pairDPDE){
     comm_forward = 3;
     comm_reverse = 5;
-#ifdef DPD_USE_RAN_MARS
     maxRNG = 0;
+#ifdef DPD_USE_RAN_MARS
     pp_random = NULL;
-#else
-    p_rand_pool = &(k_pairDPDE->rand_pool);
 #endif
 //   } else {
 //     comm_forward = 3;
@@ -281,10 +279,14 @@ void FixShardlowKokkos<DeviceType>::setup_pre_neighbor()
 template<class DeviceType>
 template<bool STACKPARAMS>
 void FixShardlowKokkos<DeviceType>::ssa_update_dpd(
-  int start_ii, int count
+  int start_ii, int count, int id
 )
 {
-  rand_type rand_gen = p_rand_pool->get_state();
+#ifdef DPD_USE_RAN_MARS
+  class RanMars *pRNG = pp_random[id];
+#else
+  rand_type rand_gen = rand_pool.get_state(id);
+#endif
 
   const double theta_ij_inv = 1.0/k_pairDPD->temperature; // independent of i,j
   const double boltz_inv = 1.0/force->boltz;
@@ -350,7 +352,12 @@ void FixShardlowKokkos<DeviceType>::ssa_update_dpd(
         double halfsigma_ij = STACKPARAMS?m_params[itype][jtype].halfsigma:params(itype,jtype).halfsigma;
         double halfgamma_ij = halfsigma_ij*halfsigma_ij*boltz_inv*theta_ij_inv;
 
-        double sigmaRand = halfsigma_ij*wr*dtsqrt*ftm2v * pRNG->gaussian();
+        double sigmaRand = halfsigma_ij*wr*dtsqrt*ftm2v *
+#ifdef DPD_USE_RAN_MARS
+            pRNG->gaussian();
+#else
+            rand_gen.normal();
+#endif
 
         const double mass_j = masses(massPerI ? j : jtype);
         double massinv_j = 1.0 / mass_j;
@@ -412,7 +419,9 @@ void FixShardlowKokkos<DeviceType>::ssa_update_dpd(
     v(i, 2) = vzi;
   }
 
-  p_rand_pool->free_state(rand_gen);
+#ifndef DPD_USE_RAN_MARS
+  rand_pool.free_state(rand_gen);
+#endif
 }
 #endif
 
@@ -431,7 +440,7 @@ void FixShardlowKokkos<DeviceType>::ssa_update_dpde(
 #ifdef DPD_USE_RAN_MARS
   class RanMars *pRNG = pp_random[id];
 #else
-  rand_type rand_gen = p_rand_pool->get_state();
+  rand_type rand_gen = rand_pool.get_state(id);
 #endif
 
   const double boltz_inv = 1.0/force->boltz;
@@ -506,10 +515,11 @@ void FixShardlowKokkos<DeviceType>::ssa_update_dpde(
         double halfsigma_ij = STACKPARAMS?m_params[itype][jtype].halfsigma:params(itype,jtype).halfsigma;
         double halfgamma_ij = halfsigma_ij*halfsigma_ij*boltz_inv*theta_ij_inv;
 
+        double sigmaRand = halfsigma_ij*wr*dtsqrt*ftm2v *
 #ifdef DPD_USE_RAN_MARS
-        double sigmaRand = halfsigma_ij*wr*dtsqrt*ftm2v * pRNG->gaussian();
+            pRNG->gaussian();
 #else
-        double sigmaRand = halfsigma_ij*wr*dtsqrt*ftm2v * rand_gen.normal();
+            rand_gen.normal();
 #endif
 
         const double mass_j = masses(massPerI ? j : jtype);
@@ -519,10 +529,11 @@ void FixShardlowKokkos<DeviceType>::ssa_update_dpde(
         // Compute uCond
         double kappa_ij = STACKPARAMS?m_params[itype][jtype].kappa:params(itype,jtype).kappa;
         double alpha_ij = STACKPARAMS?m_params[itype][jtype].alpha:params(itype,jtype).alpha;
+        double del_uCond = alpha_ij*wr*dtsqrt *
 #ifdef DPD_USE_RAN_MARS
-        double del_uCond = alpha_ij*wr*dtsqrt * pRNG->gaussian();
+            pRNG->gaussian();
 #else
-        double del_uCond = alpha_ij*wr*dtsqrt * rand_gen.normal();
+            rand_gen.normal();
 #endif
 
         del_uCond += kappa_ij*(theta_i_inv - theta_j_inv)*wdt;
@@ -600,7 +611,7 @@ void FixShardlowKokkos<DeviceType>::ssa_update_dpde(
   }
 
 #ifndef DPD_USE_RAN_MARS
-  p_rand_pool->free_state(rand_gen);
+  rand_pool.free_state(rand_gen);
 #endif
 }
 
@@ -629,25 +640,27 @@ void FixShardlowKokkos<DeviceType>::initial_integrate(int vflag)
   ssa_gitemLoc = np_ssa->ssa_gitemLoc;
   ssa_gitemLen = np_ssa->ssa_gitemLen;
 
-#ifdef DPD_USE_RAN_MARS
   int maxWorkItemCt = (int) ssa_itemLoc.dimension_1();
   if (maxWorkItemCt < (int) ssa_gitemLoc.dimension_1()) {
     maxWorkItemCt = (int) ssa_gitemLoc.dimension_1();
   }
   if (maxWorkItemCt > maxRNG) {
+#ifdef DPD_USE_RAN_MARS
     if (pp_random) {
       for (int i = 1; i < maxRNG; ++i) delete pp_random[i];
       delete[] pp_random;
       pp_random = NULL;
     }
-    maxRNG = maxWorkItemCt;
-    pp_random = new RanMars*[maxRNG];
-    for (int i = 1; i < maxRNG; ++i) {
+    pp_random = new RanMars*[maxWorkItemCt];
+    for (int i = 1; i < maxWorkItemCt; ++i) {
       pp_random[i] = new RanMars(lmp, k_pairDPDE->seed + comm->me + comm->nprocs*i);
     }
     pp_random[0] = k_pairDPDE->random;
-  }
+#else
+    rand_pool.init(k_pairDPDE->seed + comm->me, maxWorkItemCt);
 #endif
+    maxRNG = maxWorkItemCt;
+  }
 
 #ifdef DEBUG_PAIR_CT
   for (int i = 0; i < 2; ++i)
diff --git a/src/KOKKOS/fix_shardlow_kokkos.h b/src/KOKKOS/fix_shardlow_kokkos.h
index 011c16dc60..c4711f5b8b 100644
--- a/src/KOKKOS/fix_shardlow_kokkos.h
+++ b/src/KOKKOS/fix_shardlow_kokkos.h
@@ -71,11 +71,14 @@ class FixShardlowKokkos : public FixShardlow {
 //  class PairDPDfdt *pairDPD;
   PairDPDfdtEnergyKokkos<DeviceType> *k_pairDPDE;
 
-#ifdef DPD_USE_RAN_MARS
   int maxRNG;
+#ifdef DPD_USE_RAN_MARS
   class RanMars **pp_random;
+#elif defined(DPD_USE_Random_XorShift1024)
+  Kokkos::Random_XorShift1024_Pool<DeviceType> rand_pool;
+  typedef typename Kokkos::Random_XorShift1024_Pool<DeviceType>::generator_type rand_type;
 #else
-  Kokkos::Random_XorShift64_Pool<DeviceType> *p_rand_pool;
+  Kokkos::Random_XorShift64_Pool<DeviceType> rand_pool;
   typedef typename Kokkos::Random_XorShift64_Pool<DeviceType>::generator_type rand_type;
 #endif
 
diff --git a/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp
index e534f97391..ba61185a57 100644
--- a/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp
+++ b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp
@@ -28,7 +28,6 @@
 #include "neighbor.h"
 #include "neigh_list.h"
 #include "neigh_request.h"
-#include "random_mars.h"
 #include "memory.h"
 #include "modify.h"
 #include "pair_dpd_fdt_energy_kokkos.h"
diff --git a/src/KOKKOS/pair_dpd_fdt_energy_kokkos.h b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.h
index a32539242a..74fe5a63b8 100644
--- a/src/KOKKOS/pair_dpd_fdt_energy_kokkos.h
+++ b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.h
@@ -22,16 +22,12 @@ PairStyle(dpd/fdt/energy/kk/host,PairDPDfdtEnergyKokkos<LMPHostType>)
 #ifndef LMP_PAIR_DPD_FDT_ENERGY_KOKKOS_H
 #define LMP_PAIR_DPD_FDT_ENERGY_KOKKOS_H
 
+//#define DPD_USE_RAN_MARS
+#define DPD_USE_Random_XorShift64
 
-#ifndef ALLOW_NON_DETERMINISTIC_DPD
-#ifdef KOKKOS_HAVE_CUDA
-//FIXME print some warning
+#if !defined(DPD_USE_RAN_MARS) && !defined(DPD_USE_Random_XorShift64) && !defined(Random_XorShift1024)
+#define DPD_USE_Random_XorShift64
 #endif
-#ifndef DPD_USE_RAN_MARS
-#define DPD_USE_RAN_MARS
-#endif
-#endif
-
 
 #include "pair_dpd_fdt_energy.h"
 #include "pair_kokkos.h"
@@ -106,9 +102,12 @@ class PairDPDfdtEnergyKokkos : public PairDPDfdtEnergy {
 #ifdef DPD_USE_RAN_MARS
   RandPoolWrap rand_pool;
   typedef RandWrap rand_type;
-#else
+#elif defined(DPD_USE_Random_XorShift64)
   Kokkos::Random_XorShift64_Pool<DeviceType> rand_pool;
   typedef typename Kokkos::Random_XorShift64_Pool<DeviceType>::generator_type rand_type;
+#elif defined(DPD_USE_Random_XorShift1024)
+  Kokkos::Random_XorShift1024_Pool<DeviceType> rand_pool;
+  typedef typename Kokkos::Random_XorShift1024_Pool<DeviceType>::generator_type rand_type;
 #endif
 
   typename ArrayTypes<DeviceType>::tdual_ffloat_2d k_cutsq;

From 8210b25fb848c15484a2dd4dd46af2c933e28bd1 Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Wed, 1 Mar 2017 15:34:24 -0500
Subject: [PATCH 176/267] USER-DPD Kokkos: replicate 9a560b90 bugfix to
 atom_vec_dpd_kokkos.cpp

---
 src/KOKKOS/atom_vec_dpd_kokkos.cpp | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/KOKKOS/atom_vec_dpd_kokkos.cpp b/src/KOKKOS/atom_vec_dpd_kokkos.cpp
index 820f11c215..146ae8f7dd 100644
--- a/src/KOKKOS/atom_vec_dpd_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_dpd_kokkos.cpp
@@ -1048,8 +1048,6 @@ int AtomVecDPDKokkos::pack_comm_hybrid(int n, int *list, double *buf)
     buf[m++] = h_uCond[j];
     buf[m++] = h_uMech[j];
     buf[m++] = h_uChem[j];
-    buf[m++] = h_uCG[j];
-    buf[m++] = h_uCGnew[j];
   }
   return m;
 }
@@ -1245,8 +1243,6 @@ int AtomVecDPDKokkos::unpack_comm_hybrid(int n, int first, double *buf)
     h_uCond(i) = buf[m++];
     h_uMech(i) = buf[m++];
     h_uChem(i) = buf[m++];
-    h_uCG(i) = buf[m++];
-    h_uCGnew(i) = buf[m++];
   }
 
   modified(Host,DPDTHETA_MASK | UCOND_MASK |

From d95fbf3a5e4afb3aaef5d9d931893a06dd609d94 Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Thu, 2 Mar 2017 15:01:41 -0500
Subject: [PATCH 177/267] USER-DPD Kokkos: use Random_XorShift64() by default,
 but allow overrides

---
 src/KOKKOS/pair_dpd_fdt_energy_kokkos.h | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/KOKKOS/pair_dpd_fdt_energy_kokkos.h b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.h
index 74fe5a63b8..fcf4b33a7a 100644
--- a/src/KOKKOS/pair_dpd_fdt_energy_kokkos.h
+++ b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.h
@@ -22,9 +22,6 @@ PairStyle(dpd/fdt/energy/kk/host,PairDPDfdtEnergyKokkos<LMPHostType>)
 #ifndef LMP_PAIR_DPD_FDT_ENERGY_KOKKOS_H
 #define LMP_PAIR_DPD_FDT_ENERGY_KOKKOS_H
 
-//#define DPD_USE_RAN_MARS
-#define DPD_USE_Random_XorShift64
-
 #if !defined(DPD_USE_RAN_MARS) && !defined(DPD_USE_Random_XorShift64) && !defined(Random_XorShift1024)
 #define DPD_USE_Random_XorShift64
 #endif

From 27d2e9bf56f04fbb598443b84fb1c8f18f53a9aa Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Thu, 2 Mar 2017 15:03:33 -0500
Subject: [PATCH 178/267] USER-DPD: add npair_halffull_newton_ssa to Purge.list.
 With the new SSA neighbor list, half from full can't work, and will break
 compiles if the old files are in the src directory.

---
 src/Purge.list | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/Purge.list b/src/Purge.list
index 554c5df824..772961bbdf 100644
--- a/src/Purge.list
+++ b/src/Purge.list
@@ -16,6 +16,9 @@ style_region.h
 style_neigh_bin.h
 style_neigh_pair.h
 style_neigh_stencil.h
+# deleted on 01 Mar 2017
+npair_halffull_newton_ssa.cpp
+npair_halffull_newton_ssa.h
 # deleted on ## XXX 2016
 accelerator_intel.h
 neigh_bond.cpp

From 3820c5881d5c8af44baa45c12771a7971fb668ca Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Thu, 2 Mar 2017 14:02:49 -0700
Subject: [PATCH 179/267] Adding fix_wall_lj93_kokkos

---
 src/KOKKOS/Install.sh               |   2 +
 src/KOKKOS/fix_wall_lj93_kokkos.cpp | 104 ++++++++++++++++++++++++++++
 src/KOKKOS/fix_wall_lj93_kokkos.h   |  83 ++++++++++++++++++++++
 src/fix_wall.cpp                    |   2 +
 src/fix_wall_lj93.h                 |   4 +-
 5 files changed, 193 insertions(+), 2 deletions(-)
 create mode 100644 src/KOKKOS/fix_wall_lj93_kokkos.cpp
 create mode 100644 src/KOKKOS/fix_wall_lj93_kokkos.h

diff --git a/src/KOKKOS/Install.sh b/src/KOKKOS/Install.sh
index ea70ae4ca1..10245631ab 100644
--- a/src/KOKKOS/Install.sh
+++ b/src/KOKKOS/Install.sh
@@ -105,6 +105,8 @@ action fix_setforce_kokkos.cpp
 action fix_setforce_kokkos.h
 action fix_momentum_kokkos.cpp
 action fix_momentum_kokkos.h
+action fix_wall_lj93_kokkos.cpp
+action fix_wall_lj93_kokkos.h
 action fix_wall_reflect_kokkos.cpp
 action fix_wall_reflect_kokkos.h
 action fix_dpd_energy_kokkos.cpp fix_dpd_energy.cpp
diff --git a/src/KOKKOS/fix_wall_lj93_kokkos.cpp b/src/KOKKOS/fix_wall_lj93_kokkos.cpp
new file mode 100644
index 0000000000..38c7347e97
--- /dev/null
+++ b/src/KOKKOS/fix_wall_lj93_kokkos.cpp
@@ -0,0 +1,104 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include <math.h>
+#include "fix_wall_lj93_kokkos.h"
+#include "atom_kokkos.h"
+#include "error.h"
+#include "atom_masks.h"
+
+using namespace LAMMPS_NS;
+using namespace FixConst;
+
+/* ---------------------------------------------------------------------- */
+
+template <class DeviceType>
+FixWallLJ93Kokkos<DeviceType>::FixWallLJ93Kokkos(LAMMPS *lmp, int narg, char **arg) :
+  FixWallLJ93(lmp, narg, arg)
+{
+  kokkosable = 1;
+  atomKK = (AtomKokkos *) atom;
+  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
+  datamask_read = EMPTY_MASK;
+  datamask_modify = EMPTY_MASK;
+}
+
+/* ----------------------------------------------------------------------
+   interaction of all particles in group with a wall
+   m = index of wall coeffs
+   which = xlo,xhi,ylo,yhi,zlo,zhi
+   error if any particle is on or behind wall
+------------------------------------------------------------------------- */
+
+template <class DeviceType>
+void FixWallLJ93Kokkos<DeviceType>::wall_particle(int m_in, int which, double coord_in)
+{
+  m = m_in;
+  coord = coord_in;
+
+  atomKK->sync(execution_space, X_MASK|F_MASK|MASK_MASK);
+  x = atomKK->k_x.view<DeviceType>();
+  f = atomKK->k_f.view<DeviceType>();
+  mask = atomKK->k_mask.view<DeviceType>();
+  DAT::tdual_int_scalar k_oneflag = DAT::tdual_int_scalar("fix:oneflag");
+  d_oneflag = k_oneflag.view<DeviceType>();
+
+  int nlocal = atom->nlocal;
+
+  dim = which / 2;
+  side = which % 2;
+  if (side == 0) side = -1;
+
+  copymode = 1;
+  FixWallLJ93KokkosFunctor<DeviceType> wp_functor(this);
+  Kokkos::parallel_reduce(nlocal,wp_functor,ewall);
+  DeviceType::fence();
+  copymode = 0;
+
+  atomKK->modified(execution_space, F_MASK);
+
+  k_oneflag.template modify<DeviceType>();
+  k_oneflag.template sync<LMPHostType>();
+  if (k_oneflag.h_view()) error->one(FLERR,"Particle on or inside fix wall surface");
+}
+
+template <class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void FixWallLJ93Kokkos<DeviceType>::wall_particle_item(int i, value_type ewall) const {
+  if (mask(i) & groupbit) {
+    double delta;
+    if (side < 0) delta = x(i,dim) - coord;
+    else delta = coord - x(i,dim);
+    if (delta >= cutoff[m]) return;
+    if (delta <= 0.0) {
+      d_oneflag() = 1;
+      return;
+    }
+    double rinv = 1.0/delta;
+    double r2inv = rinv*rinv;
+    double r4inv = r2inv*r2inv;
+    double r10inv = r4inv*r4inv*r2inv;
+    double fwall = side * (coeff1[m]*r10inv - coeff2[m]*r4inv);
+    f(i,dim) -= fwall;
+    ewall[0] += coeff3[m]*r4inv*r4inv*rinv -
+      coeff4[m]*r2inv*rinv - offset[m];
+    ewall[m+1] += fwall;
+  }
+}
+
+namespace LAMMPS_NS {
+template class FixWallLJ93Kokkos<LMPDeviceType>;
+#ifdef KOKKOS_HAVE_CUDA
+template class FixWallLJ93Kokkos<LMPHostType>;
+#endif
+}
diff --git a/src/KOKKOS/fix_wall_lj93_kokkos.h b/src/KOKKOS/fix_wall_lj93_kokkos.h
new file mode 100644
index 0000000000..3cb0a2d44c
--- /dev/null
+++ b/src/KOKKOS/fix_wall_lj93_kokkos.h
@@ -0,0 +1,83 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+
+FixStyle(wall/lj93/kk,FixWallLJ93Kokkos<LMPDeviceType>)
+FixStyle(wall/lj93/kk/device,FixWallLJ93Kokkos<LMPDeviceType>)
+FixStyle(wall/lj93/kk/host,FixWallLJ93Kokkos<LMPHostType>)
+
+#else
+
+#ifndef LMP_FIX_WALL_LJ93_KOKKOS_H
+#define LMP_FIX_WALL_LJ93_KOKKOS_H
+
+#include "fix_wall_lj93.h"
+#include "kokkos_type.h"
+
+namespace LAMMPS_NS {
+
+template <class DeviceType>
+class FixWallLJ93Kokkos : public FixWallLJ93 {
+ public:
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;
+  typedef double value_type[];
+
+  FixWallLJ93Kokkos(class LAMMPS *, int, char **);
+  void wall_particle(int, int, double);
+
+  int m;
+
+  KOKKOS_INLINE_FUNCTION
+  void wall_particle_item(int, value_type) const;
+
+ private:
+  int dim,side;
+  double coord;
+
+  typename AT::t_x_array x;
+  typename AT::t_f_array f;
+  typename AT::t_int_1d mask;
+  typename AT::t_int_scalar d_oneflag;
+};
+
+template <class DeviceType>
+struct FixWallLJ93KokkosFunctor  {
+  typedef DeviceType device_type ;
+  typedef double value_type[];
+  const int value_count;
+
+  FixWallLJ93Kokkos<DeviceType> c;
+  FixWallLJ93KokkosFunctor(FixWallLJ93Kokkos<DeviceType>* c_ptr):
+    c(*c_ptr),
+    value_count(c.m) {}
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int i, value_type ewall) const {
+    c.wall_particle_item(i,ewall);
+  }
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Particle on or inside fix wall surface
+
+Particles must be "exterior" to the wall in order for energy/force to
+be calculated.
+
+*/
diff --git a/src/fix_wall.cpp b/src/fix_wall.cpp
index 503b87f4a7..8b569cafc6 100644
--- a/src/fix_wall.cpp
+++ b/src/fix_wall.cpp
@@ -201,6 +201,8 @@ FixWall::FixWall(LAMMPS *lmp, int narg, char **arg) :
 
 FixWall::~FixWall()
 {
+  if (copymode) return;
+
   for (int m = 0; m < nwall; m++) {
     delete [] xstr[m];
     delete [] estr[m];
diff --git a/src/fix_wall_lj93.h b/src/fix_wall_lj93.h
index 40337a5176..3763a02910 100644
--- a/src/fix_wall_lj93.h
+++ b/src/fix_wall_lj93.h
@@ -28,9 +28,9 @@ class FixWallLJ93 : public FixWall {
  public:
   FixWallLJ93(class LAMMPS *, int, char **);
   void precompute(int);
-  void wall_particle(int, int, double);
+  virtual void wall_particle(int, int, double);
 
- private:
+ protected:
   double coeff1[6],coeff2[6],coeff3[6],coeff4[6],offset[6];
 };
 

From 7e78921c96a5f822dd5c8942bc1fb204a3c747f3 Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Fri, 3 Mar 2017 10:12:44 -0500
Subject: [PATCH 180/267] USER-DPD Kokkos: propagate 763a00e8 bugfix to
 pair_multi_lucy_rx_kokkos.cpp

---
 src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp b/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp
index 30b49a8e8d..d087546619 100644
--- a/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp
@@ -538,15 +538,15 @@ void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXComputeLoca
         rho_i_contrib += factor;
         if (NEWTON_PAIR || j < nlocal)
           a_rho[j] += factor;
-      } else if (rsq < d_cutsq(itype,jtype)) {
-        const double rcut = sqrt(d_cutsq(itype,jtype));
-        const double tmpFactor = 1.0-sqrt(rsq)/rcut;
-        const double tmpFactor4 = tmpFactor*tmpFactor*tmpFactor*tmpFactor;
-        const double factor = (84.0/(5.0*pi*rcut*rcut*rcut))*(1.0+3.0*sqrt(rsq)/(2.0*rcut))*tmpFactor4;
-        rho_i_contrib += factor;
-        if (NEWTON_PAIR || j < nlocal)
-          a_rho[j] += factor;
       }
+    } else if (rsq < d_cutsq(itype,jtype)) {
+      const double rcut = sqrt(d_cutsq(itype,jtype));
+      const double tmpFactor = 1.0-sqrt(rsq)/rcut;
+      const double tmpFactor4 = tmpFactor*tmpFactor*tmpFactor*tmpFactor;
+      const double factor = (84.0/(5.0*pi*rcut*rcut*rcut))*(1.0+3.0*sqrt(rsq)/(2.0*rcut))*tmpFactor4;
+      rho_i_contrib += factor;
+      if (NEWTON_PAIR || j < nlocal)
+        a_rho[j] += factor;
     }
   }
 

From a7e855096215b5204b39bc7e54b080095909d6c4 Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Fri, 3 Mar 2017 10:38:45 -0500
Subject: [PATCH 181/267] USER-DPD Kokkos: turn one_type optimization into a
 template specialization

---
 src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp | 28 +++++++++++++++++-------
 src/KOKKOS/pair_multi_lucy_rx_kokkos.h   |  7 +++---
 2 files changed, 23 insertions(+), 12 deletions(-)

diff --git a/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp b/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp
index d087546619..11dbfabf3a 100644
--- a/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp
@@ -454,7 +454,7 @@ void PairMultiLucyRXKokkos<DeviceType>::computeLocalDensity()
   const double pi = MathConst::MY_PI;
 
   const bool newton_pair = force->newton_pair;
-  one_type = (atom->ntypes == 1);
+  const bool one_type = (atom->ntypes == 1);
 
   // Special cut-off values for when there's only one type.
   cutsq_type11 = cutsq[1][1];
@@ -471,14 +471,26 @@ void PairMultiLucyRXKokkos<DeviceType>::computeLocalDensity()
 
   if (neighflag == HALF) {
     if (newton_pair)
-      Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXComputeLocalDensity<HALF,1> >(0,inum),*this);
+      if (one_type)
+        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXComputeLocalDensity<HALF,1,true> >(0,inum),*this);
+      else
+        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXComputeLocalDensity<HALF,1,false> >(0,inum),*this);
     else
-      Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXComputeLocalDensity<HALF,0> >(0,inum),*this);
+      if (one_type)
+        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXComputeLocalDensity<HALF,0,true> >(0,inum),*this);
+      else
+        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXComputeLocalDensity<HALF,0,false> >(0,inum),*this);
   } else if (neighflag == HALFTHREAD) {
     if (newton_pair)
-      Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXComputeLocalDensity<HALFTHREAD,1> >(0,inum),*this);
+      if (one_type)
+        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXComputeLocalDensity<HALFTHREAD,1,true> >(0,inum),*this);
+      else
+        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXComputeLocalDensity<HALFTHREAD,1,false> >(0,inum),*this);
     else
-      Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXComputeLocalDensity<HALFTHREAD,0> >(0,inum),*this);
+      if (one_type)
+        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXComputeLocalDensity<HALFTHREAD,0,true> >(0,inum),*this);
+      else
+        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXComputeLocalDensity<HALFTHREAD,0,false> >(0,inum),*this);
   }
 
   atomKK->modified(execution_space,DPDRHO_MASK);
@@ -498,9 +510,9 @@ void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXZero, const
 }
 
 template<class DeviceType>
-template<int NEIGHFLAG, int NEWTON_PAIR>
+template<int NEIGHFLAG, int NEWTON_PAIR, bool ONE_TYPE>
 KOKKOS_INLINE_FUNCTION
-void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXComputeLocalDensity<NEIGHFLAG,NEWTON_PAIR>, const int &ii) const {
+void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXComputeLocalDensity<NEIGHFLAG,NEWTON_PAIR,ONE_TYPE>, const int &ii) const {
 
 
   // The rho array is atomic for Half/Thread neighbor style
@@ -528,7 +540,7 @@ void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXComputeLoca
     const double delz = ztmp - x(j,2);
     const double rsq = delx*delx + dely*dely + delz*delz;
 
-    if (one_type) {
+    if (ONE_TYPE) {
       if (rsq < cutsq_type11) {
         const double rcut = rcut_type11;
         const double r_over_rcut = sqrt(rsq) / rcut;
diff --git a/src/KOKKOS/pair_multi_lucy_rx_kokkos.h b/src/KOKKOS/pair_multi_lucy_rx_kokkos.h
index 1e84e3efd8..8556319531 100644
--- a/src/KOKKOS/pair_multi_lucy_rx_kokkos.h
+++ b/src/KOKKOS/pair_multi_lucy_rx_kokkos.h
@@ -39,7 +39,7 @@ struct TagPairMultiLucyRXCompute{};
 
 struct TagPairMultiLucyRXZero{};
 
-template<int NEIGHFLAG, int NEWTON_PAIR>
+template<int NEIGHFLAG, int NEWTON_PAIR, bool ONE_TYPE>
 struct TagPairMultiLucyRXComputeLocalDensity{};
 
 template<class DeviceType>
@@ -88,9 +88,9 @@ class PairMultiLucyRXKokkos : public PairMultiLucyRX {
   KOKKOS_INLINE_FUNCTION
   void operator()(TagPairMultiLucyRXZero, const int&) const;
 
-  template<int NEIGHFLAG, int NEWTON_PAIR>
+  template<int NEIGHFLAG, int NEWTON_PAIR, bool ONE_TYPE>
   KOKKOS_INLINE_FUNCTION
-  void operator()(TagPairMultiLucyRXComputeLocalDensity<NEIGHFLAG,NEWTON_PAIR>, const int&) const;
+  void operator()(TagPairMultiLucyRXComputeLocalDensity<NEIGHFLAG,NEWTON_PAIR,ONE_TYPE>, const int&) const;
 
   template<int NEIGHFLAG, int NEWTON_PAIR>
   KOKKOS_INLINE_FUNCTION
@@ -103,7 +103,6 @@ class PairMultiLucyRXKokkos : public PairMultiLucyRX {
   int neighflag;
   int eflag,vflag;
 
-  bool one_type;
   double cutsq_type11;
   double rcut_type11;
   double factor_type11;

From c468727db0927c45c25a43d0285265ed01a5a765 Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Fri, 3 Mar 2017 10:49:15 -0700
Subject: [PATCH 182/267] Fixing issue in fix_wall_lj93_kokkos

---
 src/KOKKOS/fix_wall_lj93_kokkos.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/KOKKOS/fix_wall_lj93_kokkos.h b/src/KOKKOS/fix_wall_lj93_kokkos.h
index 3cb0a2d44c..64f3c59a62 100644
--- a/src/KOKKOS/fix_wall_lj93_kokkos.h
+++ b/src/KOKKOS/fix_wall_lj93_kokkos.h
@@ -61,7 +61,7 @@ struct FixWallLJ93KokkosFunctor  {
   FixWallLJ93Kokkos<DeviceType> c;
   FixWallLJ93KokkosFunctor(FixWallLJ93Kokkos<DeviceType>* c_ptr):
     c(*c_ptr),
-    value_count(c.m) {}
+    value_count(c_ptr->m+1) {}
   KOKKOS_INLINE_FUNCTION
   void operator()(const int i, value_type ewall) const {
     c.wall_particle_item(i,ewall);

From 0651ea7f69c33ed026199928476297f9a485e00b Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Fri, 3 Mar 2017 12:50:13 -0500
Subject: [PATCH 183/267] USER-DPD Kokkos: work around CUDA not having
 max_hardware_threads()

---
 src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp | 36 +++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp
index ba61185a57..99a364eb86 100644
--- a/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp
+++ b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp
@@ -114,6 +114,42 @@ void PairDPDfdtEnergyKokkos<DeviceType>::init_style()
 #endif
 }
 
+#if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDACC__)
+// CUDA specialization of init_style(): calls rand_pool.init() with a hard-coded thread count
+template<>
+void PairDPDfdtEnergyKokkos<Kokkos::Cuda>::init_style()
+{
+  PairDPDfdtEnergy::init_style();
+
+  // irequest = neigh request made by parent class (the most recently added request)
+
+  neighflag = lmp->kokkos->neighflag;
+  int irequest = neighbor->nrequest - 1;
+
+  neighbor->requests[irequest]->
+    kokkos_host = Kokkos::Impl::is_same<Kokkos::Cuda,LMPHostType>::value &&
+    !Kokkos::Impl::is_same<Kokkos::Cuda,LMPDeviceType>::value;
+  neighbor->requests[irequest]->
+    kokkos_device = Kokkos::Impl::is_same<Kokkos::Cuda,LMPDeviceType>::value;
+  // map the Kokkos neighflag onto a full or half list request; other styles are rejected
+  if (neighflag == FULL) {
+    neighbor->requests[irequest]->full = 1;
+    neighbor->requests[irequest]->half = 0;
+  } else if (neighflag == HALF || neighflag == HALFTHREAD) {
+    neighbor->requests[irequest]->full = 0;
+    neighbor->requests[irequest]->half = 1;
+  } else {
+    error->all(FLERR,"Cannot use chosen neighbor list style with dpd/fdt/energy/kk");
+  }
+  // 4*32768 below stands in for max_hardware_threads(), which is not available for CUDA
+#ifdef DPD_USE_RAN_MARS
+  rand_pool.init(random,seed);
+#else
+  rand_pool.init(seed + comm->me,4*32768 /*fake max_hardware_threads()*/);
+#endif
+}
+#endif
+
 /* ---------------------------------------------------------------------- */
 
 template<class DeviceType>

From 635c448b61bd0279c567ee4426fff6c6ee1ef88d Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Fri, 3 Mar 2017 14:57:35 -0500
Subject: [PATCH 184/267] USER-DPD: sort bins for deterministic SSA instead of
 using Kokkos::Serial

---
 src/KOKKOS/nbin_ssa_kokkos.cpp | 85 +++++++++++++++++++++++++++++-----
 src/KOKKOS/nbin_ssa_kokkos.h   |  6 +++
 2 files changed, 79 insertions(+), 12 deletions(-)

diff --git a/src/KOKKOS/nbin_ssa_kokkos.cpp b/src/KOKKOS/nbin_ssa_kokkos.cpp
index 53f3f2fc80..afe016c3f7 100644
--- a/src/KOKKOS/nbin_ssa_kokkos.cpp
+++ b/src/KOKKOS/nbin_ssa_kokkos.cpp
@@ -114,7 +114,6 @@ void NBinSSAKokkos<DeviceType>::bin_atoms()
 {
   last_bin = update->ntimestep;
 
-  int i;
   int nlocal = atom->nlocal;
   int nghost = atom->nghost;
   int nall = nlocal + nghost;
@@ -170,19 +169,17 @@ void NBinSSAKokkos<DeviceType>::bin_atoms()
     k_gbincount.sync<DeviceType>();
     DeviceType::fence(); // FIXME?
 
-    Kokkos::parallel_for(
-#ifdef ALLOW_NON_DETERMINISTIC_SSA
-      Kokkos::RangePolicy<DeviceType>(nlocal,nall)
-#else
-      Kokkos::RangePolicy<Kokkos::Serial>(nlocal,nall)
-#endif
-      , KOKKOS_LAMBDA (const int i) {
+    Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType>(nlocal,nall),
+      KOKKOS_LAMBDA (const int i) {
       const int iAIR = binID(i);
       if (iAIR > 0) { // include only ghost atoms in an AIR
         const int ac = Kokkos::atomic_fetch_add(&gbincount[iAIR], (int)1);
         gbins(iAIR, ac) = i;
       }
     });
+#ifndef ALLOW_NON_DETERMINISTIC_DPD
+    Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType>(1,8), KOKKOS_LAMBDA (const int i) { sortGhostBin(i); });
+#endif
     DeviceType::fence();
   }
   c_gbins = gbins; // gbins won't change until the next bin_atoms
@@ -200,13 +197,11 @@ void NBinSSAKokkos<DeviceType>::bin_atoms()
     DeviceType::fence();
 
     NPairSSAKokkosBinAtomsFunctor<DeviceType> f(*this);
-#ifdef ALLOW_NON_DETERMINISTIC_SSA
     Kokkos::parallel_for(nlocal, f);
-#else
-    Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::Serial>(0, nlocal), f);
+#ifndef ALLOW_NON_DETERMINISTIC_DPD
+    Kokkos::parallel_for(mbins, KOKKOS_LAMBDA (const int i) { sortAtomBin(i); });
 #endif
     DeviceType::fence();
-
   }
   c_bins = bins; // bins won't change until the next bin_atoms
 
@@ -258,6 +253,72 @@ void NBinSSAKokkos<DeviceType>::binIDGhostsItem(const int &i, int &update) const
   }
 }
 
+// An implementation of heapsort without recursion
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void NBinSSAKokkos<DeviceType>::sortAtomBin(const int &ibin) const
+{
+  int n = bincount(ibin);
+  int i = n/2;
+  int t;
+
+  do { /* Loops until bin is sorted */
+    if (i > 0) { /* First stage - Sorting the heap */
+      i--;           /* Save its index to i */
+      t = bins(ibin, i);    /* Save parent value to t */
+    } else {     /* Second stage - Extracting elements in-place */
+      if ((--n) <= 0) return; /* When the heap is empty, we are done */
+      t = bins(ibin, n);    /* Save last value (it will be overwritten) */
+      bins(ibin, n) = bins(ibin, 0); /* Save largest value at the end of the bin */
+    }
+    int parent = i; /* We will start pushing down t from parent */
+    int child = i*2 + 1; /* parent's left child */
+    /* Sift operation - pushing the value of t down the heap */
+    while (child < n) {
+      /* Choose the largest child */
+      if ((child + 1 < n) && (bins(ibin, child + 1) > bins(ibin, child))) ++child;
+      if (bins(ibin, child) <= t) break; /* t's place is found */
+      bins(ibin, parent) = bins(ibin, child); /* Move the largest child up */
+      parent = child; /* Move parent pointer to this child */
+      child = parent*2+1; /* Find the next child */
+    }
+    bins(ibin, parent) = t; /* We save t in the heap */
+  } while(1);
+}
+
+// An implementation of heapsort without recursion
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void NBinSSAKokkos<DeviceType>::sortGhostBin(const int &ibin) const
+{
+  int n = gbincount(ibin);
+  int i = n/2;
+  int t;
+
+  do { /* Loops until bin is sorted */
+    if (i > 0) { /* First stage - Sorting the heap */
+      i--;           /* Save its index to i */
+      t = gbins(ibin, i);    /* Save parent value to t */
+    } else {     /* Second stage - Extracting elements in-place */
+      if (--n <= 0) return; /* When the heap is empty, we are done */
+      t = gbins(ibin, n);    /* Save last value (it will be overwritten) */
+      gbins(ibin, n) = gbins(ibin, 0); /* Save largest value at the end of the bin */
+    }
+    int parent = i; /* We will start pushing down t from parent */
+    int child = i*2 + 1; /* parent's left child */
+    /* Sift operation - pushing the value of t down the heap */
+    while (child < n) {
+      /* Choose the largest child */
+      if ((child + 1 < n) && (gbins(ibin, child + 1) > gbins(ibin, child))) ++child;
+      if (gbins(ibin, child) <= t) break; /* t's place is found */
+      gbins(ibin, parent) = gbins(ibin, child); /* Move the largest child up */
+      parent = child; /* Move parent pointer to this child */
+      child = parent*2+1; /* Find the next child */
+    }
+    gbins(ibin, parent) = t; /* We save t in the heap */
+  } while(1);
+}
+
 namespace LAMMPS_NS {
 template class NBinSSAKokkos<LMPDeviceType>;
 #ifdef KOKKOS_HAVE_CUDA
diff --git a/src/KOKKOS/nbin_ssa_kokkos.h b/src/KOKKOS/nbin_ssa_kokkos.h
index 69f05c9304..add400c573 100644
--- a/src/KOKKOS/nbin_ssa_kokkos.h
+++ b/src/KOKKOS/nbin_ssa_kokkos.h
@@ -88,6 +88,12 @@ class NBinSSAKokkos : public NBinStandard {
   KOKKOS_INLINE_FUNCTION
   void binIDGhostsItem(const int &i, int &update) const;
 
+  KOKKOS_INLINE_FUNCTION
+  void sortAtomBin(const int &ibin) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void sortGhostBin(const int &ibin) const;
+
 /* ----------------------------------------------------------------------
    convert atom coords into the ssa active interaction region number
 ------------------------------------------------------------------------- */

From b35895ca128b375595d3da401a88f842b8ad63bf Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Fri, 3 Mar 2017 15:21:09 -0500
Subject: [PATCH 185/267] USER-DPD Kokkos: Remove the SSA's
 ALLOW_NON_DETERMINISTIC_DPD option. There was no measurable performance
 benefit to turning it on.

---
 src/KOKKOS/nbin_ssa_kokkos.cpp | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/KOKKOS/nbin_ssa_kokkos.cpp b/src/KOKKOS/nbin_ssa_kokkos.cpp
index afe016c3f7..0f4a3b8d4f 100644
--- a/src/KOKKOS/nbin_ssa_kokkos.cpp
+++ b/src/KOKKOS/nbin_ssa_kokkos.cpp
@@ -177,9 +177,7 @@ void NBinSSAKokkos<DeviceType>::bin_atoms()
         gbins(iAIR, ac) = i;
       }
     });
-#ifndef ALLOW_NON_DETERMINISTIC_DPD
     Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType>(1,8), KOKKOS_LAMBDA (const int i) { sortGhostBin(i); });
-#endif
     DeviceType::fence();
   }
   c_gbins = gbins; // gbins won't change until the next bin_atoms
@@ -198,9 +196,7 @@ void NBinSSAKokkos<DeviceType>::bin_atoms()
 
     NPairSSAKokkosBinAtomsFunctor<DeviceType> f(*this);
     Kokkos::parallel_for(nlocal, f);
-#ifndef ALLOW_NON_DETERMINISTIC_DPD
     Kokkos::parallel_for(mbins, KOKKOS_LAMBDA (const int i) { sortAtomBin(i); });
-#endif
     DeviceType::fence();
   }
   c_bins = bins; // bins won't change until the next bin_atoms

From c2c22fc2ede651fe5b9da1f7c45a4c05ab543951 Mon Sep 17 00:00:00 2001
From: Dan Ibanez <daibane@sandia.gov>
Date: Mon, 6 Mar 2017 10:57:19 -0700
Subject: [PATCH 186/267] add missing KOKKOS_INLINE_FUNCTION to fix_shardlow

---
 src/KOKKOS/Install.sh              | 4 ++--
 src/KOKKOS/fix_shardlow_kokkos.cpp | 1 +
 src/KOKKOS/fix_shardlow_kokkos.h   | 1 +
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/KOKKOS/Install.sh b/src/KOKKOS/Install.sh
index 68bd8d2ea8..5707a4e53c 100644
--- a/src/KOKKOS/Install.sh
+++ b/src/KOKKOS/Install.sh
@@ -28,8 +28,8 @@ action () {
 
 # force rebuild of files with LMP_KOKKOS switch
 
-touch ../accelerator_kokkos.h
-touch ../memory.h
+#touch ../accelerator_kokkos.h
+#touch ../memory.h
 
 # list of files with optional dependcies
 
diff --git a/src/KOKKOS/fix_shardlow_kokkos.cpp b/src/KOKKOS/fix_shardlow_kokkos.cpp
index 996f37257d..0dfbce5033 100644
--- a/src/KOKKOS/fix_shardlow_kokkos.cpp
+++ b/src/KOKKOS/fix_shardlow_kokkos.cpp
@@ -433,6 +433,7 @@ void FixShardlowKokkos<DeviceType>::ssa_update_dpd(
 ------------------------------------------------------------------------- */
 template<class DeviceType>
 template<bool STACKPARAMS>
+KOKKOS_INLINE_FUNCTION
 void FixShardlowKokkos<DeviceType>::ssa_update_dpde(
   int start_ii, int count, int id
 )
diff --git a/src/KOKKOS/fix_shardlow_kokkos.h b/src/KOKKOS/fix_shardlow_kokkos.h
index c4711f5b8b..4dc47709e1 100644
--- a/src/KOKKOS/fix_shardlow_kokkos.h
+++ b/src/KOKKOS/fix_shardlow_kokkos.h
@@ -124,6 +124,7 @@ class FixShardlowKokkos : public FixShardlow {
 //  template<bool STACKPARAMS>
 //  void ssa_update_dpd(int, int);  // Constant Temperature
   template<bool STACKPARAMS>
+  KOKKOS_INLINE_FUNCTION
   void ssa_update_dpde(int, int, int); // Constant Energy
 
 };

From 3e8cfb8247fdf7bb19ac21edfd971b3188f4881c Mon Sep 17 00:00:00 2001
From: Dan Ibanez <dan.a.ibanez@gmail.com>
Date: Mon, 6 Mar 2017 11:04:47 -0700
Subject: [PATCH 187/267] The wonders of git commit -a

---
 src/KOKKOS/Install.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/KOKKOS/Install.sh b/src/KOKKOS/Install.sh
index 5707a4e53c..68bd8d2ea8 100644
--- a/src/KOKKOS/Install.sh
+++ b/src/KOKKOS/Install.sh
@@ -28,8 +28,8 @@ action () {
 
 # force rebuild of files with LMP_KOKKOS switch
 
-#touch ../accelerator_kokkos.h
-#touch ../memory.h
+touch ../accelerator_kokkos.h
+touch ../memory.h
 
 # list of files with optional dependcies
 

From 4a6f27935d26567203c595fdf8214c89f4e94643 Mon Sep 17 00:00:00 2001
From: Dan Ibanez <daibane@sandia.gov>
Date: Mon, 6 Mar 2017 14:58:40 -0700
Subject: [PATCH 188/267] fix lambda syntax for CUDA

KOKKOS_LAMBDA doesn't quite work on CUDA,
you have to use LAMMPS_LAMBDA.
Also, if you do use LAMMPS_LAMBDA, you need
to run on the default device type,
i.e. no using lambdas to run on OpenMP
when LAMMPS has been compiled for CUDA.
---
 src/KOKKOS/fix_shardlow_kokkos.cpp | 10 +++++-----
 src/KOKKOS/nbin_ssa_kokkos.cpp     | 10 +++++-----
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/KOKKOS/fix_shardlow_kokkos.cpp b/src/KOKKOS/fix_shardlow_kokkos.cpp
index 0dfbce5033..bf026552fa 100644
--- a/src/KOKKOS/fix_shardlow_kokkos.cpp
+++ b/src/KOKKOS/fix_shardlow_kokkos.cpp
@@ -677,13 +677,13 @@ void FixShardlowKokkos<DeviceType>::initial_integrate(int vflag)
     int workItemCt = ssa_phaseLen[workPhase];
 
     if(atom->ntypes > MAX_TYPES_STACKPARAMS) {
-      Kokkos::parallel_for(workItemCt, KOKKOS_LAMBDA (const int workItem ) {
+      Kokkos::parallel_for(workItemCt, LAMMPS_LAMBDA (const int workItem ) {
         int ct = ssa_itemLen(workPhase, workItem);
         int ii = ssa_itemLoc(workPhase, workItem);
         ssa_update_dpde<false>(ii, ct, workItem);
       });
     } else {
-      Kokkos::parallel_for(workItemCt, KOKKOS_LAMBDA (const int workItem ) {
+      Kokkos::parallel_for(workItemCt, LAMMPS_LAMBDA (const int workItem ) {
         int ct = ssa_itemLen(workPhase, workItem);
         int ii = ssa_itemLoc(workPhase, workItem);
         ssa_update_dpde<true>(ii, ct, workItem);
@@ -704,7 +704,7 @@ void FixShardlowKokkos<DeviceType>::initial_integrate(int vflag)
 //      memset(&(atom->uCond[nlocal]), 0, sizeof(double)*nghost);
 //      memset(&(atom->uMech[nlocal]), 0, sizeof(double)*nghost);
 
-      Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType>(nlocal,nlocal+nghost), KOKKOS_LAMBDA (const int i) {
+      Kokkos::parallel_for(Kokkos::RangePolicy<LMPDeviceType>(nlocal,nlocal+nghost), LAMMPS_LAMBDA (const int i) {
         uCond(i) = 0.0;
         uMech(i) = 0.0;
       });
@@ -713,13 +713,13 @@ void FixShardlowKokkos<DeviceType>::initial_integrate(int vflag)
 
     // process neighbors in this AIR
     if(atom->ntypes > MAX_TYPES_STACKPARAMS) {
-      Kokkos::parallel_for(workItemCt, KOKKOS_LAMBDA (const int workItem ) {
+      Kokkos::parallel_for(workItemCt, LAMMPS_LAMBDA (const int workItem ) {
         int ct = ssa_gitemLen(workPhase, workItem);
         int ii = ssa_gitemLoc(workPhase, workItem);
         ssa_update_dpde<false>(ii, ct, workItem);
       });
     } else {
-      Kokkos::parallel_for(workItemCt, KOKKOS_LAMBDA (const int workItem ) {
+      Kokkos::parallel_for(workItemCt, LAMMPS_LAMBDA (const int workItem ) {
         int ct = ssa_gitemLen(workPhase, workItem);
         int ii = ssa_gitemLoc(workPhase, workItem);
         ssa_update_dpde<true>(ii, ct, workItem);
diff --git a/src/KOKKOS/nbin_ssa_kokkos.cpp b/src/KOKKOS/nbin_ssa_kokkos.cpp
index 0f4a3b8d4f..b0e2d5be88 100644
--- a/src/KOKKOS/nbin_ssa_kokkos.cpp
+++ b/src/KOKKOS/nbin_ssa_kokkos.cpp
@@ -155,7 +155,7 @@ void NBinSSAKokkos<DeviceType>::bin_atoms()
     DeviceType::fence(); // FIXME?
     ghosts_per_gbin = 0;
     NPairSSAKokkosBinIDGhostsFunctor<DeviceType> f(*this);
-    Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType>(nlocal,nall), f, ghosts_per_gbin);
+    Kokkos::parallel_reduce(Kokkos::RangePolicy<LMPDeviceType>(nlocal,nall), f, ghosts_per_gbin);
   }
 
   // actually bin the ghost atoms
@@ -169,15 +169,15 @@ void NBinSSAKokkos<DeviceType>::bin_atoms()
     k_gbincount.sync<DeviceType>();
     DeviceType::fence(); // FIXME?
 
-    Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType>(nlocal,nall),
-      KOKKOS_LAMBDA (const int i) {
+    Kokkos::parallel_for(Kokkos::RangePolicy<LMPDeviceType>(nlocal,nall),
+      LAMMPS_LAMBDA (const int i) {
       const int iAIR = binID(i);
       if (iAIR > 0) { // include only ghost atoms in an AIR
         const int ac = Kokkos::atomic_fetch_add(&gbincount[iAIR], (int)1);
         gbins(iAIR, ac) = i;
       }
     });
-    Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType>(1,8), KOKKOS_LAMBDA (const int i) { sortGhostBin(i); });
+    Kokkos::parallel_for(Kokkos::RangePolicy<LMPDeviceType>(1,8), LAMMPS_LAMBDA (const int i) { sortGhostBin(i); });
     DeviceType::fence();
   }
   c_gbins = gbins; // gbins won't change until the next bin_atoms
@@ -196,7 +196,7 @@ void NBinSSAKokkos<DeviceType>::bin_atoms()
 
     NPairSSAKokkosBinAtomsFunctor<DeviceType> f(*this);
     Kokkos::parallel_for(nlocal, f);
-    Kokkos::parallel_for(mbins, KOKKOS_LAMBDA (const int i) { sortAtomBin(i); });
+    Kokkos::parallel_for(mbins, LAMMPS_LAMBDA (const int i) { sortAtomBin(i); });
     DeviceType::fence();
   }
   c_bins = bins; // bins won't change until the next bin_atoms

From a7d1b571be0ae8969911cba9cf4b4d5a9b879948 Mon Sep 17 00:00:00 2001
From: Dan Ibanez <daibane@sandia.gov>
Date: Mon, 6 Mar 2017 15:07:07 -0700
Subject: [PATCH 189/267] don't capture "this" in lambdas

CUDA lambdas can't capture the calling
object very well.
make local shallow copies of variables needed.
---
 src/KOKKOS/nbin_ssa_kokkos.cpp | 21 +++++++++++++++------
 src/KOKKOS/nbin_ssa_kokkos.h   |  7 +++++--
 2 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/src/KOKKOS/nbin_ssa_kokkos.cpp b/src/KOKKOS/nbin_ssa_kokkos.cpp
index b0e2d5be88..8c991cc0c2 100644
--- a/src/KOKKOS/nbin_ssa_kokkos.cpp
+++ b/src/KOKKOS/nbin_ssa_kokkos.cpp
@@ -169,16 +169,22 @@ void NBinSSAKokkos<DeviceType>::bin_atoms()
     k_gbincount.sync<DeviceType>();
     DeviceType::fence(); // FIXME?
 
+    auto binID_ = binID;
+    auto gbincount_ = gbincount;
+    auto gbins_ = gbins;
+
     Kokkos::parallel_for(Kokkos::RangePolicy<LMPDeviceType>(nlocal,nall),
       LAMMPS_LAMBDA (const int i) {
-      const int iAIR = binID(i);
+      const int iAIR = binID_(i);
       if (iAIR > 0) { // include only ghost atoms in an AIR
-        const int ac = Kokkos::atomic_fetch_add(&gbincount[iAIR], (int)1);
-        gbins(iAIR, ac) = i;
+        const int ac = Kokkos::atomic_fetch_add(&gbincount_[iAIR], (int)1);
+        gbins_(iAIR, ac) = i;
       }
     });
-    Kokkos::parallel_for(Kokkos::RangePolicy<LMPDeviceType>(1,8), LAMMPS_LAMBDA (const int i) { sortGhostBin(i); });
-    DeviceType::fence();
+    Kokkos::parallel_for(Kokkos::RangePolicy<LMPDeviceType>(1,8),
+      LAMMPS_LAMBDA (const int i) {
+      sortGhostBin(gbincount_, gbins_, i);
+    });
   }
   c_gbins = gbins; // gbins won't change until the next bin_atoms
 
@@ -285,7 +291,10 @@ void NBinSSAKokkos<DeviceType>::sortAtomBin(const int &ibin) const
 // An implementation of heapsort without recursion
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
-void NBinSSAKokkos<DeviceType>::sortGhostBin(const int &ibin) const
+void NBinSSAKokkos<DeviceType>::sortGhostBin(
+      typename AT::t_int_1d gbincount,
+      typename AT::t_int_2d gbins,
+      const int &ibin)
 {
   int n = gbincount(ibin);
   int i = n/2;
diff --git a/src/KOKKOS/nbin_ssa_kokkos.h b/src/KOKKOS/nbin_ssa_kokkos.h
index add400c573..ca1f81953f 100644
--- a/src/KOKKOS/nbin_ssa_kokkos.h
+++ b/src/KOKKOS/nbin_ssa_kokkos.h
@@ -91,8 +91,11 @@ class NBinSSAKokkos : public NBinStandard {
   KOKKOS_INLINE_FUNCTION
   void sortAtomBin(const int &ibin) const;
 
-  KOKKOS_INLINE_FUNCTION
-  void sortGhostBin(const int &ibin) const;
+  static KOKKOS_INLINE_FUNCTION
+  void sortGhostBin(
+      typename AT::t_int_1d gbincount,
+      typename AT::t_int_2d gbins,
+      const int &ibin);
 
 /* ----------------------------------------------------------------------
    convert atom coords into the ssa active interaction region number

From 3e3a24da48d41227f5bfc12f3337a0a39ce7e948 Mon Sep 17 00:00:00 2001
From: Dan Ibanez <daibane@sandia.gov>
Date: Mon, 6 Mar 2017 15:28:25 -0700
Subject: [PATCH 190/267] consolidate sorting functions

two sort functions with different
names but identical functionality.
making them the same function
until we decide to use a different
algorithm for atoms and ghosts
---
 src/KOKKOS/nbin_ssa_kokkos.cpp | 48 ++++++++--------------------------
 src/KOKKOS/nbin_ssa_kokkos.h   |  5 +---
 2 files changed, 12 insertions(+), 41 deletions(-)

diff --git a/src/KOKKOS/nbin_ssa_kokkos.cpp b/src/KOKKOS/nbin_ssa_kokkos.cpp
index 8c991cc0c2..1fcbbed601 100644
--- a/src/KOKKOS/nbin_ssa_kokkos.cpp
+++ b/src/KOKKOS/nbin_ssa_kokkos.cpp
@@ -183,7 +183,7 @@ void NBinSSAKokkos<DeviceType>::bin_atoms()
     });
     Kokkos::parallel_for(Kokkos::RangePolicy<LMPDeviceType>(1,8),
       LAMMPS_LAMBDA (const int i) {
-      sortGhostBin(gbincount_, gbins_, i);
+      sortBin(gbincount_, gbins_, i);
     });
   }
   c_gbins = gbins; // gbins won't change until the next bin_atoms
@@ -200,9 +200,16 @@ void NBinSSAKokkos<DeviceType>::bin_atoms()
     Kokkos::parallel_for(mbins, f_zero);
     DeviceType::fence();
 
+    auto bincount_ = bincount;
+    auto bins_ = bins;
+
     NPairSSAKokkosBinAtomsFunctor<DeviceType> f(*this);
     Kokkos::parallel_for(nlocal, f);
-    Kokkos::parallel_for(mbins, LAMMPS_LAMBDA (const int i) { sortAtomBin(i); });
+
+    Kokkos::parallel_for(mbins,
+      LAMMPS_LAMBDA (const int i) {
+      sortBin(bincount_, bins_, i);
+    });
     DeviceType::fence();
   }
   c_bins = bins; // bins won't change until the next bin_atoms
@@ -258,40 +265,7 @@ void NBinSSAKokkos<DeviceType>::binIDGhostsItem(const int &i, int &update) const
 // An implementation of heapsort without recursion
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
-void NBinSSAKokkos<DeviceType>::sortAtomBin(const int &ibin) const
-{
-  int n = bincount(ibin);
-  int i = n/2;
-  int t;
-
-  do { /* Loops until bin is sorted */
-    if (i > 0) { /* First stage - Sorting the heap */
-      i--;           /* Save its index to i */
-      t = bins(ibin, i);    /* Save parent value to t */
-    } else {     /* Second stage - Extracting elements in-place */
-      if ((--n) <= 0) return; /* When the heap is empty, we are done */
-      t = bins(ibin, n);    /* Save last value (it will be overwritten) */
-      bins(ibin, n) = bins(ibin, 0); /* Save largest value at the end of the bin */
-    }
-    int parent = i; /* We will start pushing down t from parent */
-    int child = i*2 + 1; /* parent's left child */
-    /* Sift operation - pushing the value of t down the heap */
-    while (child < n) {
-      /* Choose the largest child */
-      if ((child + 1 < n) && (bins(ibin, child + 1) > bins(ibin, child))) ++child;
-      if (bins(ibin, child) <= t) break; /* t's place is found */
-      bins(ibin, parent) = bins(ibin, child); /* Move the largest child up */
-      parent = child; /* Move parent pointer to this child */
-      child = parent*2+1; /* Find the next child */
-    }
-    bins(ibin, parent) = t; /* We save t in the heap */
-  } while(1);
-}
-
-// An implementation of heapsort without recursion
-template<class DeviceType>
-KOKKOS_INLINE_FUNCTION
-void NBinSSAKokkos<DeviceType>::sortGhostBin(
+void NBinSSAKokkos<DeviceType>::sortBin(
       typename AT::t_int_1d gbincount,
       typename AT::t_int_2d gbins,
       const int &ibin)
@@ -305,7 +279,7 @@ void NBinSSAKokkos<DeviceType>::sortGhostBin(
       i--;           /* Save its index to i */
       t = gbins(ibin, i);    /* Save parent value to t */
     } else {     /* Second stage - Extracting elements in-place */
-      if (--n <= 0) return; /* When the heap is empty, we are done */
+      if ((--n) <= 0) return; /* When the heap is empty, we are done */
       t = gbins(ibin, n);    /* Save last value (it will be overwritten) */
       gbins(ibin, n) = gbins(ibin, 0); /* Save largest value at the end of the bin */
     }
diff --git a/src/KOKKOS/nbin_ssa_kokkos.h b/src/KOKKOS/nbin_ssa_kokkos.h
index ca1f81953f..cc98859913 100644
--- a/src/KOKKOS/nbin_ssa_kokkos.h
+++ b/src/KOKKOS/nbin_ssa_kokkos.h
@@ -88,11 +88,8 @@ class NBinSSAKokkos : public NBinStandard {
   KOKKOS_INLINE_FUNCTION
   void binIDGhostsItem(const int &i, int &update) const;
 
-  KOKKOS_INLINE_FUNCTION
-  void sortAtomBin(const int &ibin) const;
-
   static KOKKOS_INLINE_FUNCTION
-  void sortGhostBin(
+  void sortBin(
       typename AT::t_int_1d gbincount,
       typename AT::t_int_2d gbins,
       const int &ibin);

From 527a573026d14fc068f0e44e8b676d98cd1816d6 Mon Sep 17 00:00:00 2001
From: Dan Ibanez <daibane@sandia.gov>
Date: Mon, 6 Mar 2017 15:42:26 -0700
Subject: [PATCH 191/267] don't use device views to measure dimensions

---
 src/KOKKOS/nbin_ssa_kokkos.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/KOKKOS/nbin_ssa_kokkos.cpp b/src/KOKKOS/nbin_ssa_kokkos.cpp
index 1fcbbed601..6c9e3a3446 100644
--- a/src/KOKKOS/nbin_ssa_kokkos.cpp
+++ b/src/KOKKOS/nbin_ssa_kokkos.cpp
@@ -72,7 +72,7 @@ NBinSSAKokkos<DeviceType>::NBinSSAKokkos(LAMMPS *lmp) : NBinStandard(lmp)
 template<class DeviceType>
 void NBinSSAKokkos<DeviceType>::bin_atoms_setup(int nall)
 {
-  if (mbins > (int) k_bins.d_view.dimension_0()) {
+  if (mbins > (int) k_bins.h_view.dimension_0()) {
     k_bins = DAT::tdual_int_2d("NBinSSAKokkos::bins",mbins,atoms_per_bin);
     bins = k_bins.view<DeviceType>();
 
@@ -82,7 +82,7 @@ void NBinSSAKokkos<DeviceType>::bin_atoms_setup(int nall)
 
   ghosts_per_gbin = atom->nghost / 7; // estimate needed size
 
-  if (ghosts_per_gbin > (int) k_gbins.d_view.dimension_1()) {
+  if (ghosts_per_gbin > (int) k_gbins.h_view.dimension_1()) {
     k_gbins = DAT::tdual_int_2d("NBinSSAKokkos::gbins",8,ghosts_per_gbin);
     gbins = k_gbins.view<DeviceType>();
   }

From b8c72c7bdb547c75c3fc4077353b8f15e4b7b240 Mon Sep 17 00:00:00 2001
From: Dan Ibanez <daibane@sandia.gov>
Date: Mon, 6 Mar 2017 15:51:09 -0700
Subject: [PATCH 192/267] don't query device variables from the host

---
 src/KOKKOS/npair_ssa_kokkos.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/KOKKOS/npair_ssa_kokkos.cpp b/src/KOKKOS/npair_ssa_kokkos.cpp
index 7eea57d492..a9b59bfc96 100644
--- a/src/KOKKOS/npair_ssa_kokkos.cpp
+++ b/src/KOKKOS/npair_ssa_kokkos.cpp
@@ -83,12 +83,12 @@ void NPairSSAKokkos<DeviceType>::copy_bin_info()
   k_gbincount = nbKK->k_gbincount;
   k_gbins = nbKK->k_gbins;
 
-  lbinxlo = nbKK->d_lbinxlo();
-  lbinxhi = nbKK->d_lbinxhi();
-  lbinylo = nbKK->d_lbinylo();
-  lbinyhi = nbKK->d_lbinyhi();
-  lbinzlo = nbKK->d_lbinzlo();
-  lbinzhi = nbKK->d_lbinzhi();
+  lbinxlo = nbKK->h_lbinxlo();
+  lbinxhi = nbKK->h_lbinxhi();
+  lbinylo = nbKK->h_lbinylo();
+  lbinyhi = nbKK->h_lbinyhi();
+  lbinzlo = nbKK->h_lbinzlo();
+  lbinzhi = nbKK->h_lbinzhi();
 }
 
 /* ----------------------------------------------------------------------

From d01f09dce237dabe50e22d78881380676db3451a Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Tue, 7 Mar 2017 15:23:17 -0500
Subject: [PATCH 193/267] Turn off use of OpenMP in MPIIO/dump_custom_mpiio.cpp
 if Kokkos is in use. The convert_string_omp() method breaks when Kokkos is
 also using OpenMP.

---
 src/MPIIO/dump_custom_mpiio.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/MPIIO/dump_custom_mpiio.cpp b/src/MPIIO/dump_custom_mpiio.cpp
index 6e48bfa146..0b282b77ef 100644
--- a/src/MPIIO/dump_custom_mpiio.cpp
+++ b/src/MPIIO/dump_custom_mpiio.cpp
@@ -542,8 +542,8 @@ void DumpCustomMPIIO::write_string(int n, double *mybuf)
 
 #if defined(_OPENMP)
     int nthreads = omp_get_max_threads();
-    if (nthreads > 1)
-      nsme = convert_string_omp(n,mybuf);
+    if ((nthreads > 1) && !(lmp->kokkos))
+      nsme = convert_string_omp(n,mybuf); // not (yet) compatible with Kokkos
     else
       nsme = convert_string(n,mybuf);
 #else

From fc23f9cfe897f383db0fa48ce28ea7a2dceb34e8 Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Wed, 8 Mar 2017 13:07:52 -0700
Subject: [PATCH 194/267] Disable allocation of per-atom arrays in ev_setup for
 USER-DPD Kokkos styles

---
 src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp | 2 +-
 src/KOKKOS/pair_exp6_rx_kokkos.cpp        | 2 +-
 src/KOKKOS/pair_hybrid_kokkos.cpp         | 2 +-
 src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp  | 2 +-
 src/KOKKOS/pair_table_rx_kokkos.cpp       | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp
index 99a364eb86..bd0f08efa6 100644
--- a/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp
+++ b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp
@@ -161,7 +161,7 @@ void PairDPDfdtEnergyKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
   vflag = vflag_in;
 
   if (neighflag == FULL) no_virial_fdotr_compute = 1;
-  if (eflag || vflag) ev_setup(eflag,vflag);
+  if (eflag || vflag) ev_setup(eflag,vflag,0);
   else evflag = vflag_fdotr = 0;
 
   // reallocate per-atom arrays if necessary
diff --git a/src/KOKKOS/pair_exp6_rx_kokkos.cpp b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
index 962dcfd031..4b0748721c 100644
--- a/src/KOKKOS/pair_exp6_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
@@ -127,7 +127,7 @@ void PairExp6rxKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
   vflag = vflag_in;
 
   if (neighflag == FULL) no_virial_fdotr_compute = 1;
-  if (eflag || vflag) ev_setup(eflag,vflag);
+  if (eflag || vflag) ev_setup(eflag,vflag,0);
   else evflag = vflag_fdotr = 0;
 
   // reallocate per-atom arrays if necessary
diff --git a/src/KOKKOS/pair_hybrid_kokkos.cpp b/src/KOKKOS/pair_hybrid_kokkos.cpp
index 337b56c6ce..629eee156a 100644
--- a/src/KOKKOS/pair_hybrid_kokkos.cpp
+++ b/src/KOKKOS/pair_hybrid_kokkos.cpp
@@ -77,7 +77,7 @@ void PairHybridKokkos::compute(int eflag, int vflag)
 
   if (no_virial_fdotr_compute && vflag % 4 == 2) vflag = 1 + vflag/4 * 4;
 
-  if (eflag || vflag) ev_setup(eflag,vflag);
+  if (eflag || vflag) ev_setup(eflag,vflag,0);
   else evflag = vflag_fdotr = eflag_global = vflag_global =
          eflag_atom = vflag_atom = 0;
 
diff --git a/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp b/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp
index 11dbfabf3a..4379cc4001 100644
--- a/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp
@@ -147,7 +147,7 @@ void PairMultiLucyRXKokkos<DeviceType>::compute_style(int eflag_in, int vflag_in
   vflag = vflag_in;
 
   if (neighflag == FULL) no_virial_fdotr_compute = 1;
-  if (eflag || vflag) ev_setup(eflag,vflag);
+  if (eflag || vflag) ev_setup(eflag,vflag,0);
   else evflag = vflag_fdotr = 0;
 
   // reallocate per-atom arrays if necessary
diff --git a/src/KOKKOS/pair_table_rx_kokkos.cpp b/src/KOKKOS/pair_table_rx_kokkos.cpp
index 2a1ee2c0b1..cbb1096712 100644
--- a/src/KOKKOS/pair_table_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_table_rx_kokkos.cpp
@@ -627,7 +627,7 @@ void PairTableRXKokkos<DeviceType>::compute_style(int eflag_in, int vflag_in)
 
   if (neighflag == FULL) no_virial_fdotr_compute = 1;
 
-  if (eflag || vflag) ev_setup(eflag,vflag);
+  if (eflag || vflag) ev_setup(eflag,vflag,0);
   else evflag = vflag_fdotr = 0;
 
   if (eflag_atom) {

From 35e1cf1d6e006b4c4508c9eb34caa8563a0418e3 Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Wed, 8 Mar 2017 20:02:02 -0700
Subject: [PATCH 195/267] Fixing issue with ev_setup in pair_hybrid_kokkos

---
 src/KOKKOS/pair_hybrid_kokkos.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/KOKKOS/pair_hybrid_kokkos.cpp b/src/KOKKOS/pair_hybrid_kokkos.cpp
index 629eee156a..337b56c6ce 100644
--- a/src/KOKKOS/pair_hybrid_kokkos.cpp
+++ b/src/KOKKOS/pair_hybrid_kokkos.cpp
@@ -77,7 +77,7 @@ void PairHybridKokkos::compute(int eflag, int vflag)
 
   if (no_virial_fdotr_compute && vflag % 4 == 2) vflag = 1 + vflag/4 * 4;
 
-  if (eflag || vflag) ev_setup(eflag,vflag,0);
+  if (eflag || vflag) ev_setup(eflag,vflag);
   else evflag = vflag_fdotr = eflag_global = vflag_global =
          eflag_atom = vflag_atom = 0;
 

From 6f71275db30fcea912d9fb37fb13ab0608cc9d1b Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Thu, 9 Mar 2017 15:35:07 -0700
Subject: [PATCH 196/267] Add Kokkos version of atom_vec_hybrid_kokkos, without
 CUDA support

---
 src/KOKKOS/Install.sh                 |    2 +
 src/KOKKOS/atom_kokkos.cpp            |    1 +
 src/KOKKOS/atom_kokkos.h              |    1 +
 src/KOKKOS/atom_vec_hybrid_kokkos.cpp | 1191 +++++++++++++++++++++++++
 src/KOKKOS/atom_vec_hybrid_kokkos.h   |  161 ++++
 5 files changed, 1356 insertions(+)
 create mode 100644 src/KOKKOS/atom_vec_hybrid_kokkos.cpp
 create mode 100644 src/KOKKOS/atom_vec_hybrid_kokkos.h

diff --git a/src/KOKKOS/Install.sh b/src/KOKKOS/Install.sh
index 68bd8d2ea8..9c11e9321b 100644
--- a/src/KOKKOS/Install.sh
+++ b/src/KOKKOS/Install.sh
@@ -53,6 +53,8 @@ action atom_vec_dpd_kokkos.cpp atom_vec_dpd.cpp
 action atom_vec_dpd_kokkos.h atom_vec_dpd.h
 action atom_vec_full_kokkos.cpp atom_vec_full.cpp
 action atom_vec_full_kokkos.h atom_vec_full.h
+action atom_vec_hybrid_kokkos.cpp
+action atom_vec_hybrid_kokkos.h
 action atom_vec_kokkos.cpp
 action atom_vec_kokkos.h
 action atom_vec_molecular_kokkos.cpp atom_vec_molecular.cpp
diff --git a/src/KOKKOS/atom_kokkos.cpp b/src/KOKKOS/atom_kokkos.cpp
index 97b76ba67c..31b33dbdc9 100644
--- a/src/KOKKOS/atom_kokkos.cpp
+++ b/src/KOKKOS/atom_kokkos.cpp
@@ -49,6 +49,7 @@ AtomKokkos::~AtomKokkos()
   memory->destroy_kokkos(k_radius, radius);
   memory->destroy_kokkos(k_rmass, rmass);
   memory->destroy_kokkos(k_omega, omega);
+  memory->destroy_kokkos(k_angmom, angmom);
   memory->destroy_kokkos(k_torque, torque);
 
   memory->destroy_kokkos(k_nspecial, nspecial);
diff --git a/src/KOKKOS/atom_kokkos.h b/src/KOKKOS/atom_kokkos.h
index cf454bcd0c..2245023189 100644
--- a/src/KOKKOS/atom_kokkos.h
+++ b/src/KOKKOS/atom_kokkos.h
@@ -34,6 +34,7 @@ class AtomKokkos : public Atom {
   DAT::tdual_float_1d k_radius;
   DAT::tdual_float_1d k_rmass;
   DAT::tdual_v_array k_omega;
+  DAT::tdual_v_array k_angmom;
   DAT::tdual_f_array k_torque;
   DAT::tdual_tagint_1d k_molecule;
   DAT::tdual_int_2d k_nspecial;
diff --git a/src/KOKKOS/atom_vec_hybrid_kokkos.cpp b/src/KOKKOS/atom_vec_hybrid_kokkos.cpp
new file mode 100644
index 0000000000..0c9d261be5
--- /dev/null
+++ b/src/KOKKOS/atom_vec_hybrid_kokkos.cpp
@@ -0,0 +1,1191 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include <stdlib.h>
+#include <string.h>
+#include "atom_vec_kokkos.h"
+#include "atom_vec_hybrid_kokkos.h"
+#include "atom_kokkos.h"
+#include "domain.h"
+#include "modify.h"
+#include "fix.h"
+#include "memory.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+// start with an empty sub-style list so the destructor is safe before process_args()
+AtomVecHybridKokkos::AtomVecHybridKokkos(LAMMPS *lmp) : AtomVecKokkos(lmp) { nstyles = 0; styles = NULL; keywords = NULL; }
+
+/* ---------------------------------------------------------------------- */
+
+AtomVecHybridKokkos::~AtomVecHybridKokkos()
+{
+  // free each sub-style and its keyword copy, then the arrays that held them
+  for (int istyle = 0; istyle < nstyles; istyle++) { delete styles[istyle]; delete [] keywords[istyle]; }
+  delete [] styles;
+  delete [] keywords;
+}
+
+/* ----------------------------------------------------------------------
+   process sub-style args
+------------------------------------------------------------------------- */
+
+void AtomVecHybridKokkos::process_args(int narg, char **arg)
+{
+  // build list of all known atom styles
+
+  build_styles();
+
+  // size both lists for narg entries: the most sub-styles possible (none has extra args)
+
+  styles = new AtomVec*[narg];
+  keywords = new char*[narg];
+
+  // allocate each sub-style and keep a private copy of its name in keywords
+  // call its process_args() with the args that are not atom style names
+  // use known_style() to determine which args these are
+
+  int i,jarg,dummy;
+
+  int iarg = 0;
+  nstyles = 0;
+  while (iarg < narg) {
+    if (strcmp(arg[iarg],"hybrid") == 0)
+      error->all(FLERR,"Atom style hybrid cannot have hybrid as an argument");
+    for (i = 0; i < nstyles; i++)
+      if (strcmp(arg[iarg],keywords[i]) == 0)
+        error->all(FLERR,"Atom style hybrid cannot use same atom style twice");
+    styles[nstyles] = atom->new_avec(arg[iarg],1,dummy);
+    keywords[nstyles] = new char[strlen(arg[iarg])+1];
+    strcpy(keywords[nstyles],arg[iarg]);
+    jarg = iarg + 1;
+    while (jarg < narg && !known_style(arg[jarg])) jarg++;  // skip this sub-style's own args
+    styles[nstyles]->process_args(jarg-iarg-1,&arg[iarg+1]);
+    iarg = jarg;
+    nstyles++;
+  }
+
+  // free allstyles created by build_styles()
+
+  for (int i = 0; i < nallstyles; i++) delete [] allstyles[i];
+  delete [] allstyles;
+
+  // hybrid settings are MAX or MIN of sub-style settings
+  // hybrid sizes are the base values below plus each sub-style's excess over that base
+
+  molecular = 0;
+  comm_x_only = comm_f_only = 1;
+
+  size_forward = 3;
+  size_reverse = 3;
+  size_border = 6;
+  size_data_atom = 5;
+  size_data_vel = 4;
+  xcol_data = 3;
+
+  for (int k = 0; k < nstyles; k++) {
+    if ((styles[k]->molecular == 1 && molecular == 2) ||
+        (styles[k]->molecular == 2 && molecular == 1))
+      error->all(FLERR,"Cannot mix molecular and molecule template "
+                 "atom styles");
+    molecular = MAX(molecular,styles[k]->molecular);
+
+    bonds_allow = MAX(bonds_allow,styles[k]->bonds_allow);
+    angles_allow = MAX(angles_allow,styles[k]->angles_allow);
+    dihedrals_allow = MAX(dihedrals_allow,styles[k]->dihedrals_allow);
+    impropers_allow = MAX(impropers_allow,styles[k]->impropers_allow);
+    mass_type = MAX(mass_type,styles[k]->mass_type);
+    dipole_type = MAX(dipole_type,styles[k]->dipole_type);
+    forceclearflag = MAX(forceclearflag,styles[k]->forceclearflag);
+
+    if (styles[k]->molecular == 2) onemols = styles[k]->onemols;
+
+    comm_x_only = MIN(comm_x_only,styles[k]->comm_x_only);
+    comm_f_only = MIN(comm_f_only,styles[k]->comm_f_only);
+    size_forward += styles[k]->size_forward - 3;
+    size_reverse += styles[k]->size_reverse - 3;
+    size_border += styles[k]->size_border - 6;
+    size_data_atom += styles[k]->size_data_atom - 5;
+    size_data_vel += styles[k]->size_data_vel - 4;
+  }
+  // velocity size: 3 for v, plus 3 each for omega and angmom when those flags are set
+  size_velocity = 3;
+  if (atom->omega_flag) size_velocity += 3;
+  if (atom->angmom_flag) size_velocity += 3;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecHybridKokkos::init()
+{
+  AtomVec::init();
+  for (int istyle = 0; istyle < nstyles; istyle++) styles[istyle]->init();
+  // a CUDA-enabled Kokkos build is rejected here with an error
+#ifdef KOKKOS_HAVE_CUDA
+  error->all(FLERR,"AtomVecHybridKokkos doesn't yet support CUDA");
+#endif
+}
+
+/* ----------------------------------------------------------------------
+   grow atom arrays
+   n = 0 grows arrays by a chunk
+   n > 0 allocates arrays to size n
+------------------------------------------------------------------------- */
+
+void AtomVecHybridKokkos::grow(int n)
+{
+  if (n != 0) nmax = n;
+  else grow_nmax();
+  atom->nmax = nmax;
+  if (nmax < 0 || nmax > MAXSMALLINT)
+    error->one(FLERR,"Per-processor system is too big");
+
+  // every sub-style reallocates its own arrays
+  // nextra_grow is zeroed meanwhile so fix arrays are grown once, by the loop at the end
+
+  const int nextra_saved = atom->nextra_grow;
+  atom->nextra_grow = 0;
+  for (int istyle = 0; istyle < nstyles; istyle++) styles[istyle]->grow(nmax);
+  atom->nextra_grow = nextra_saved;
+
+  // ensure hybrid local ptrs and sub-style ptrs are up to date
+  // sub-styles are refreshed too, because several of them
+  //   may have reallocated the same array
+
+  grow_reset();
+
+  for (int iextra = 0; iextra < atom->nextra_grow; iextra++) {
+    modify->fix[atom->extra_grow[iextra]]->grow_arrays(nmax);
+  }
+}
+
+/* ----------------------------------------------------------------------
+   reset local array ptrs
+------------------------------------------------------------------------- */
+
+void AtomVecHybridKokkos::grow_reset()
+{
+  // for every array the hybrid style accesses directly, re-fetch the raw
+  //   pointer plus the d_view and h_view of the matching k_ array in atomKK
+
+  tag = atomKK->tag;
+  d_tag = atomKK->k_tag.d_view;
+  h_tag = atomKK->k_tag.h_view;
+
+  type = atomKK->type;
+  d_type = atomKK->k_type.d_view;
+  h_type = atomKK->k_type.h_view;
+
+  mask = atomKK->mask;
+  d_mask = atomKK->k_mask.d_view;
+  h_mask = atomKK->k_mask.h_view;
+
+  image = atomKK->image;
+  d_image = atomKK->k_image.d_view;
+  h_image = atomKK->k_image.h_view;
+
+  x = atomKK->x;
+  d_x = atomKK->k_x.d_view;
+  h_x = atomKK->k_x.h_view;
+
+  v = atomKK->v;
+  d_v = atomKK->k_v.d_view;
+  h_v = atomKK->k_v.h_view;
+
+  f = atomKK->f;
+  d_f = atomKK->k_f.d_view;
+  h_f = atomKK->k_f.h_view;
+
+  omega = atomKK->omega;
+  d_omega = atomKK->k_omega.d_view;
+  h_omega = atomKK->k_omega.h_view;
+
+  angmom = atomKK->angmom;
+  d_angmom = atomKK->k_angmom.d_view;
+  h_angmom = atomKK->k_angmom.h_view;
+
+  // each sub-style refreshes its own pointers as well
+  for (int k = 0; k < nstyles; k++) styles[k]->grow_reset();
+}
+
+/* ----------------------------------------------------------------------
+   copy atom I info to atom J for all sub-styles
+------------------------------------------------------------------------- */
+
+void AtomVecHybridKokkos::copy(int i, int j, int delflag)
+{
+  // nextra_grow is zeroed while sub-styles copy, so fix arrays are copied once below
+  const int nextra_saved = atom->nextra_grow;
+  atom->nextra_grow = 0;
+  for (int istyle = 0; istyle < nstyles; istyle++) styles[istyle]->copy(i,j,delflag);
+  atom->nextra_grow = nextra_saved;
+
+  for (int iextra = 0; iextra < atom->nextra_grow; iextra++)
+    modify->fix[atom->extra_grow[iextra]]->copy_arrays(i,j,delflag);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecHybridKokkos::clear_bonus()
+{
+  for (int istyle = 0; istyle < nstyles; ++istyle) { styles[istyle]->clear_bonus(); }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecHybridKokkos::force_clear(int n, size_t nbytes)
+{
+  // forward only to the sub-styles that set forceclearflag
+  for (int istyle = 0; istyle < nstyles; istyle++) if (styles[istyle]->forceclearflag) styles[istyle]->force_clear(n,nbytes);
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Kokkos buffer comm is not implemented for the hybrid style: each stub below only raises an error
+int AtomVecHybridKokkos::pack_comm_kokkos(const int &n, const DAT::tdual_int_2d &k_sendlist,
+                     const int & iswap, const DAT::tdual_xfloat_2d &buf,
+                     const int &pbc_flag, const int pbc[])
+{
+  error->all(FLERR,"AtomVecHybridKokkos doesn't yet support threaded comm");
+  return 0;  // keeps this non-void function from falling off its end
+}
+void AtomVecHybridKokkos::unpack_comm_kokkos(const int &n, const int &nfirst,
+                        const DAT::tdual_xfloat_2d &buf)
+{
+  error->all(FLERR,"AtomVecHybridKokkos doesn't yet support threaded comm");
+}
+int AtomVecHybridKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &list,
+                   const int & iswap, const int nfirst, const int &pbc_flag, const int pbc[])
+{
+  error->all(FLERR,"AtomVecHybridKokkos doesn't yet support threaded comm");
+  return 0;
+}
+int AtomVecHybridKokkos::pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist,
+                       DAT::tdual_xfloat_2d buf,int iswap, int pbc_flag, int *pbc, ExecutionSpace space)
+{
+  error->all(FLERR,"AtomVecHybridKokkos doesn't yet support threaded comm");
+  return 0;
+}
+void AtomVecHybridKokkos::unpack_border_kokkos(const int &n, const int &nfirst,
+                          const DAT::tdual_xfloat_2d &buf,
+                          ExecutionSpace space)
+{
+  error->all(FLERR,"AtomVecHybridKokkos doesn't yet support threaded comm");
+}
+int AtomVecHybridKokkos::pack_exchange_kokkos(const int &nsend,DAT::tdual_xfloat_2d &buf,
+                         DAT::tdual_int_1d k_sendlist, DAT::tdual_int_1d k_copylist,
+                         ExecutionSpace space, int dim, X_FLOAT lo, X_FLOAT hi)
+{
+  error->all(FLERR,"AtomVecHybridKokkos doesn't yet support threaded comm");
+  return 0;
+}
+int AtomVecHybridKokkos::unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf, int nrecv,
+                           int nlocal, int dim, X_FLOAT lo, X_FLOAT hi, ExecutionSpace space)
+{
+  error->all(FLERR,"AtomVecHybridKokkos doesn't yet support threaded comm");
+  return 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecHybridKokkos::pack_comm(int n, int *list, double *buf,
+                             int pbc_flag, int *pbc)
+{
+  int m = 0;
+
+  if (!pbc_flag) {
+    // no periodic shift requested: coordinates go out unchanged
+    for (int ilist = 0; ilist < n; ilist++) {
+      const int iatom = list[ilist];
+      buf[m++] = h_x(iatom,0);
+      buf[m++] = h_x(iatom,1);
+      buf[m++] = h_x(iatom,2);
+    }
+  } else {
+    // shift by pbc[] multiples of the box lengths; a triclinic box adds the tilt terms
+    double dx,dy,dz;
+    if (domain->triclinic) {
+      dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
+      dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
+      dz = pbc[2]*domain->zprd;
+    } else {
+      dx = pbc[0]*domain->xprd;
+      dy = pbc[1]*domain->yprd;
+      dz = pbc[2]*domain->zprd;
+    }
+    for (int ilist = 0; ilist < n; ilist++) {
+      const int iatom = list[ilist];
+      buf[m++] = h_x(iatom,0) + dx;
+      buf[m++] = h_x(iatom,1) + dy;
+      buf[m++] = h_x(iatom,2) + dz;
+    }
+  }
+
+  // each sub-style appends its own values as one contiguous chunk
+  for (int istyle = 0; istyle < nstyles; istyle++)
+    m += styles[istyle]->pack_comm_hybrid(n,list,&buf[m]);
+
+  return m;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecHybridKokkos::pack_comm_vel(int n, int *list, double *buf,
+                                 int pbc_flag, int *pbc)
+{
+  int i,j,k,m;
+  double dx,dy,dz,dvx,dvy,dvz;
+  int omega_flag = atom->omega_flag;
+  int angmom_flag = atom->angmom_flag;
+  // per-atom layout in buf: x, v, then omega and angmom when those flags are set
+  m = 0;
+  if (pbc_flag == 0) {
+    for (i = 0; i < n; i++) {
+      j = list[i];
+      buf[m++] = h_x(j,0);
+      buf[m++] = h_x(j,1);
+      buf[m++] = h_x(j,2);
+      buf[m++] = h_v(j,0);
+      buf[m++] = h_v(j,1);
+      buf[m++] = h_v(j,2);
+      if (omega_flag) {
+        buf[m++] = h_omega(j,0);
+        buf[m++] = h_omega(j,1);
+        buf[m++] = h_omega(j,2);
+      }
+      if (angmom_flag) {
+        buf[m++] = h_angmom(j,0);
+        buf[m++] = h_angmom(j,1);
+        buf[m++] = h_angmom(j,2);
+      }
+    }
+  } else {
+    if (domain->triclinic == 0) {
+      dx = pbc[0]*domain->xprd;
+      dy = pbc[1]*domain->yprd;
+      dz = pbc[2]*domain->zprd;
+    } else {
+      dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
+      dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
+      dz = pbc[2]*domain->zprd;
+    }
+    if (!deform_vremap) {
+      for (i = 0; i < n; i++) {
+        j = list[i];
+        buf[m++] = h_x(j,0) + dx;
+        buf[m++] = h_x(j,1) + dy;
+        buf[m++] = h_x(j,2) + dz;
+        buf[m++] = h_v(j,0);
+        buf[m++] = h_v(j,1);
+        buf[m++] = h_v(j,2);
+        if (omega_flag) {
+          buf[m++] = h_omega(j,0);
+          buf[m++] = h_omega(j,1);
+          buf[m++] = h_omega(j,2);
+        }
+        if (angmom_flag) {
+          buf[m++] = h_angmom(j,0);
+          buf[m++] = h_angmom(j,1);
+          buf[m++] = h_angmom(j,2);
+        }
+      }
+    } else {
+      dvx = pbc[0]*h_rate[0] + pbc[5]*h_rate[5] + pbc[4]*h_rate[4];
+      dvy = pbc[1]*h_rate[1] + pbc[3]*h_rate[3];
+      dvz = pbc[2]*h_rate[2];
+      for (i = 0; i < n; i++) {
+        j = list[i];
+        buf[m++] = h_x(j,0) + dx;
+        buf[m++] = h_x(j,1) + dy;
+        buf[m++] = h_x(j,2) + dz;
+        if (h_mask[j] & deform_groupbit) {  // test the packed atom j, not send-list slot i
+          buf[m++] = h_v(j,0) + dvx;
+          buf[m++] = h_v(j,1) + dvy;
+          buf[m++] = h_v(j,2) + dvz;
+        } else {
+          buf[m++] = h_v(j,0);
+          buf[m++] = h_v(j,1);
+          buf[m++] = h_v(j,2);
+        }
+        if (omega_flag) {
+          buf[m++] = h_omega(j,0);
+          buf[m++] = h_omega(j,1);
+          buf[m++] = h_omega(j,2);
+        }
+        if (angmom_flag) {
+          buf[m++] = h_angmom(j,0);
+          buf[m++] = h_angmom(j,1);
+          buf[m++] = h_angmom(j,2);
+        }
+      }
+    }
+  }
+
+  // pack sub-style contributions as contiguous chunks
+
+  for (k = 0; k < nstyles; k++)
+    m += styles[k]->pack_comm_hybrid(n,list,&buf[m]);
+
+  return m;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecHybridKokkos::unpack_comm(int n, int first, double *buf)
+{
+  int m = 0;
+  const int last = first + n;
+
+  // coordinates of the n atoms starting at index first
+
+  for (int iatom = first; iatom < last; iatom++) {
+    h_x(iatom,0) = buf[m++];
+    h_x(iatom,1) = buf[m++];
+    h_x(iatom,2) = buf[m++];
+  }
+
+  // then one contiguous chunk per sub-style
+  for (int istyle = 0; istyle < nstyles; istyle++)
+    m += styles[istyle]->unpack_comm_hybrid(n,first,&buf[m]);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecHybridKokkos::unpack_comm_vel(int n, int first, double *buf)
+{
+  const int has_omega = atom->omega_flag;
+  const int has_angmom = atom->angmom_flag;
+  const int last = first + n;
+  int m = 0;
+
+  // per-atom layout in buf: x, v, then omega and angmom when those flags are set
+  for (int iatom = first; iatom < last; iatom++) {
+    h_x(iatom,0) = buf[m++];
+    h_x(iatom,1) = buf[m++];
+    h_x(iatom,2) = buf[m++];
+    h_v(iatom,0) = buf[m++];
+    h_v(iatom,1) = buf[m++];
+    h_v(iatom,2) = buf[m++];
+    if (has_omega) {
+      h_omega(iatom,0) = buf[m++];
+      h_omega(iatom,1) = buf[m++];
+      h_omega(iatom,2) = buf[m++];
+    }
+    if (has_angmom) {
+      h_angmom(iatom,0) = buf[m++];
+      h_angmom(iatom,1) = buf[m++];
+      h_angmom(iatom,2) = buf[m++];
+    }
+  }
+
+  // sub-style values follow, one contiguous chunk per sub-style
+
+  for (int istyle = 0; istyle < nstyles; istyle++)
+    m += styles[istyle]->unpack_comm_hybrid(n,first,&buf[m]);
+}
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecHybridKokkos::pack_reverse(int n, int first, double *buf)
+{
+  int m = 0;
+  const int last = first + n;
+
+  // forces of the n atoms starting at index first
+  for (int iatom = first; iatom < last; iatom++) {
+    buf[m++] = h_f(iatom,0);
+    buf[m++] = h_f(iatom,1);
+    buf[m++] = h_f(iatom,2);
+  }
+
+  // sub-style values follow, one contiguous chunk per sub-style
+
+  for (int istyle = 0; istyle < nstyles; istyle++)
+    m += styles[istyle]->pack_reverse_hybrid(n,first,&buf[m]);
+
+  return m;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecHybridKokkos::unpack_reverse(int n, int *list, double *buf)
+{
+  int m = 0;
+
+  // incoming forces are accumulated (+=) onto the listed atoms
+  for (int ilist = 0; ilist < n; ilist++) {
+    const int iatom = list[ilist];
+    h_f(iatom,0) += buf[m++];
+    h_f(iatom,1) += buf[m++];
+    h_f(iatom,2) += buf[m++];
+  }
+
+  // sub-style values follow, one contiguous chunk per sub-style
+
+  for (int istyle = 0; istyle < nstyles; istyle++)
+    m += styles[istyle]->unpack_reverse_hybrid(n,list,&buf[m]);
+}
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecHybridKokkos::pack_border(int n, int *list, double *buf,
+                               int pbc_flag, int *pbc)
+{
+  int m = 0;
+
+  if (!pbc_flag) {
+    for (int ilist = 0; ilist < n; ilist++) {
+      const int iatom = list[ilist];
+      buf[m++] = h_x(iatom,0);
+      buf[m++] = h_x(iatom,1);
+      buf[m++] = h_x(iatom,2);
+      buf[m++] = ubuf(h_tag[iatom]).d;
+      buf[m++] = ubuf(h_type[iatom]).d;
+      buf[m++] = ubuf(h_mask[iatom]).d;
+    }
+  } else {
+    // orthogonal box: shift by box lengths; triclinic box: the pbc[] values are added as-is
+    double dx,dy,dz;
+    if (domain->triclinic) {
+      dx = pbc[0];
+      dy = pbc[1];
+      dz = pbc[2];
+    } else {
+      dx = pbc[0]*domain->xprd;
+      dy = pbc[1]*domain->yprd;
+      dz = pbc[2]*domain->zprd;
+    }
+    for (int ilist = 0; ilist < n; ilist++) {
+      const int iatom = list[ilist];
+      buf[m++] = h_x(iatom,0) + dx;
+      buf[m++] = h_x(iatom,1) + dy;
+      buf[m++] = h_x(iatom,2) + dz;
+      buf[m++] = ubuf(h_tag[iatom]).d;
+      buf[m++] = ubuf(h_type[iatom]).d;
+      buf[m++] = ubuf(h_mask[iatom]).d;
+    }
+  }
+
+  // sub-style values follow, one contiguous chunk per sub-style
+
+  for (int istyle = 0; istyle < nstyles; istyle++)
+    m += styles[istyle]->pack_border_hybrid(n,list,&buf[m]);
+
+  // fixes registered in atom->extra_border append their data last
+  for (int iextra = 0; iextra < atom->nextra_border; iextra++)
+    m += modify->fix[atom->extra_border[iextra]]->pack_border(n,list,&buf[m]);
+
+  return m;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecHybridKokkos::pack_border_vel(int n, int *list, double *buf,
+                                   int pbc_flag, int *pbc)
+{
+  int i,j,k,m;
+  double dx,dy,dz,dvx,dvy,dvz;
+  int omega_flag = atom->omega_flag;
+  int angmom_flag = atom->angmom_flag;
+  // per-atom layout in buf: x, tag/type/mask, v, then omega and angmom when flagged
+  m = 0;
+  if (pbc_flag == 0) {
+    for (i = 0; i < n; i++) {
+      j = list[i];
+      buf[m++] = h_x(j,0);
+      buf[m++] = h_x(j,1);
+      buf[m++] = h_x(j,2);
+      buf[m++] = ubuf(h_tag[j]).d;
+      buf[m++] = ubuf(h_type[j]).d;
+      buf[m++] = ubuf(h_mask[j]).d;
+      buf[m++] = h_v(j,0);
+      buf[m++] = h_v(j,1);
+      buf[m++] = h_v(j,2);
+      if (omega_flag) {
+        buf[m++] = h_omega(j,0);
+        buf[m++] = h_omega(j,1);
+        buf[m++] = h_omega(j,2);
+      }
+      if (angmom_flag) {
+        buf[m++] = h_angmom(j,0);
+        buf[m++] = h_angmom(j,1);
+        buf[m++] = h_angmom(j,2);
+      }
+    }
+  } else {
+    if (domain->triclinic == 0) {
+      dx = pbc[0]*domain->xprd;
+      dy = pbc[1]*domain->yprd;
+      dz = pbc[2]*domain->zprd;
+    } else {
+      dx = pbc[0];
+      dy = pbc[1];
+      dz = pbc[2];
+    }
+    if (!deform_vremap) {
+      for (i = 0; i < n; i++) {
+        j = list[i];
+        buf[m++] = h_x(j,0) + dx;
+        buf[m++] = h_x(j,1) + dy;
+        buf[m++] = h_x(j,2) + dz;
+        buf[m++] = ubuf(h_tag[j]).d;
+        buf[m++] = ubuf(h_type[j]).d;
+        buf[m++] = ubuf(h_mask[j]).d;
+        buf[m++] = h_v(j,0);
+        buf[m++] = h_v(j,1);
+        buf[m++] = h_v(j,2);
+        if (omega_flag) {
+          buf[m++] = h_omega(j,0);
+          buf[m++] = h_omega(j,1);
+          buf[m++] = h_omega(j,2);
+        }
+        if (angmom_flag) {
+          buf[m++] = h_angmom(j,0);
+          buf[m++] = h_angmom(j,1);
+          buf[m++] = h_angmom(j,2);
+        }
+      }
+    } else {
+      dvx = pbc[0]*h_rate[0] + pbc[5]*h_rate[5] + pbc[4]*h_rate[4];
+      dvy = pbc[1]*h_rate[1] + pbc[3]*h_rate[3];
+      dvz = pbc[2]*h_rate[2];
+      for (i = 0; i < n; i++) {
+        j = list[i];
+        buf[m++] = h_x(j,0) + dx;
+        buf[m++] = h_x(j,1) + dy;
+        buf[m++] = h_x(j,2) + dz;
+        buf[m++] = ubuf(h_tag[j]).d;
+        buf[m++] = ubuf(h_type[j]).d;
+        buf[m++] = ubuf(h_mask[j]).d;
+        if (h_mask[j] & deform_groupbit) {  // test the packed atom j, not send-list slot i
+          buf[m++] = h_v(j,0) + dvx;
+          buf[m++] = h_v(j,1) + dvy;
+          buf[m++] = h_v(j,2) + dvz;
+        } else {
+          buf[m++] = h_v(j,0);
+          buf[m++] = h_v(j,1);
+          buf[m++] = h_v(j,2);
+        }
+        if (omega_flag) {
+          buf[m++] = h_omega(j,0);
+          buf[m++] = h_omega(j,1);
+          buf[m++] = h_omega(j,2);
+        }
+        if (angmom_flag) {
+          buf[m++] = h_angmom(j,0);
+          buf[m++] = h_angmom(j,1);
+          buf[m++] = h_angmom(j,2);
+        }
+      }
+    }
+  }
+
+  // pack sub-style contributions as contiguous chunks
+
+  for (k = 0; k < nstyles; k++)
+    m += styles[k]->pack_border_hybrid(n,list,&buf[m]);
+
+  if (atom->nextra_border)
+    for (int iextra = 0; iextra < atom->nextra_border; iextra++)
+      m += modify->fix[atom->extra_border[iextra]]->pack_border(n,list,&buf[m]);
+
+  return m;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecHybridKokkos::unpack_border(int n, int first, double *buf)
+{
+  int m = 0;
+  const int last = first + n;
+
+  // arrays are grown as soon as the write index reaches nmax
+  for (int iatom = first; iatom < last; iatom++) {
+    if (iatom == nmax) grow(0);
+    h_x(iatom,0) = buf[m++];
+    h_x(iatom,1) = buf[m++];
+    h_x(iatom,2) = buf[m++];
+    h_tag[iatom] = (tagint) ubuf(buf[m++]).i;
+    h_type[iatom] = (int) ubuf(buf[m++]).i;
+    h_mask[iatom] = (int) ubuf(buf[m++]).i;
+  }
+
+  // sub-style values follow, one contiguous chunk per sub-style
+
+  for (int istyle = 0; istyle < nstyles; istyle++)
+    m += styles[istyle]->unpack_border_hybrid(n,first,&buf[m]);
+
+  // fixes registered in atom->extra_border read their data last
+  for (int iextra = 0; iextra < atom->nextra_border; iextra++)
+    m += modify->fix[atom->extra_border[iextra]]->
+      unpack_border(n,first,&buf[m]);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecHybridKokkos::unpack_border_vel(int n, int first, double *buf)
+{
+  const int has_omega = atom->omega_flag;
+  const int has_angmom = atom->angmom_flag;
+  const int last = first + n;
+  int m = 0;
+
+  // per-atom layout in buf: x, tag/type/mask, v, then omega and angmom when flagged
+  for (int iatom = first; iatom < last; iatom++) {
+    if (iatom == nmax) grow(0);
+    h_x(iatom,0) = buf[m++];
+    h_x(iatom,1) = buf[m++];
+    h_x(iatom,2) = buf[m++];
+    h_tag[iatom] = (tagint) ubuf(buf[m++]).i;
+    h_type[iatom] = (int) ubuf(buf[m++]).i;
+    h_mask[iatom] = (int) ubuf(buf[m++]).i;
+    h_v(iatom,0) = buf[m++];
+    h_v(iatom,1) = buf[m++];
+    h_v(iatom,2) = buf[m++];
+    if (has_omega) {
+      h_omega(iatom,0) = buf[m++];
+      h_omega(iatom,1) = buf[m++];
+      h_omega(iatom,2) = buf[m++];
+    }
+    if (has_angmom) {
+      h_angmom(iatom,0) = buf[m++];
+      h_angmom(iatom,1) = buf[m++];
+      h_angmom(iatom,2) = buf[m++];
+    }
+  }
+
+  // sub-style values follow, one contiguous chunk per sub-style
+
+  for (int istyle = 0; istyle < nstyles; istyle++)
+    m += styles[istyle]->unpack_border_hybrid(n,first,&buf[m]);
+
+  // fixes registered in atom->extra_border read their data last
+  for (int iextra = 0; iextra < atom->nextra_border; iextra++)
+    m += modify->fix[atom->extra_border[iextra]]->
+      unpack_border(n,first,&buf[m]);
+}
+
+/* ----------------------------------------------------------------------
+   pack data for atom I for sending to another proc
+   pack each sub-style one after the other
+------------------------------------------------------------------------- */
+
+int AtomVecHybridKokkos::pack_exchange(int i, double *buf)
+{
+  // nextra_grow is zeroed while sub-styles pack, so fix data is appended once below
+  const int nextra_saved = atom->nextra_grow;
+  atom->nextra_grow = 0;
+
+  int m = 0;
+  for (int istyle = 0; istyle < nstyles; istyle++)
+    m += styles[istyle]->pack_exchange(i,&buf[m]);
+
+  atom->nextra_grow = nextra_saved;
+
+  // fixes registered in atom->extra_grow append after the sub-styles
+  for (int iextra = 0; iextra < atom->nextra_grow; iextra++)
+    m += modify->fix[atom->extra_grow[iextra]]->pack_exchange(i,&buf[m]);
+
+  // the first value of the record is overwritten with its total length
+  buf[0] = m;
+  return m;
+}
+
+/* ----------------------------------------------------------------------
+   unpack data for single atom received from another proc
+   unpack each sub-style one after the other
+   grow() occurs here so arrays for all sub-styles are grown
+------------------------------------------------------------------------- */
+
+int AtomVecHybridKokkos::unpack_exchange(double *buf)
+{
+  const int nlocal = atom->nlocal;
+  if (nlocal == nmax) grow(0);
+
+  const int nextra_saved = atom->nextra_grow;
+  atom->nextra_grow = 0;
+
+  // atom->nlocal is decremented after each sub-style, which is expected to have incremented it
+  int m = 0;
+  for (int istyle = 0; istyle < nstyles; istyle++) {
+    m += styles[istyle]->unpack_exchange(&buf[m]);
+    atom->nlocal--;
+  }
+
+  atom->nextra_grow = nextra_saved;
+
+  // fix data for the new atom follows the sub-style data
+  for (int iextra = 0; iextra < atom->nextra_grow; iextra++)
+    m += modify->fix[atom->extra_grow[iextra]]->
+      unpack_exchange(nlocal,&buf[m]);
+
+  // count the new atom exactly once
+  atom->nlocal++;
+  return m;
+}
+
+/* ----------------------------------------------------------------------
+   size of restart data for all atoms owned by this proc
+   include extra data stored by fixes
+------------------------------------------------------------------------- */
+
+int AtomVecHybridKokkos::size_restart()
+{
+  const int nextra_saved = atom->nextra_restart;
+  atom->nextra_restart = 0;
+
+  int total = 0;
+  for (int istyle = 0; istyle < nstyles; istyle++)
+    total += styles[istyle]->size_restart();
+
+  atom->nextra_restart = nextra_saved;
+
+  const int nlocal = atom->nlocal;
+  // fix contributions are added here, once per fix and per owned atom
+  for (int iextra = 0; iextra < atom->nextra_restart; iextra++)
+    for (int iatom = 0; iatom < nlocal; iatom++)
+      total += modify->fix[atom->extra_restart[iextra]]->size_restart(iatom);
+
+  return total;
+}
+
+/* ----------------------------------------------------------------------
+   pack atom I's data for restart file including extra quantities
+   xyz must be 1st 3 values, so that read_restart can test on them
+   pack each sub-style one after the other
+------------------------------------------------------------------------- */
+
+int AtomVecHybridKokkos::pack_restart(int i, double *buf)
+{
+  const int nextra_saved = atom->nextra_restart;
+  atom->nextra_restart = 0;
+
+  int m = 0;
+  for (int istyle = 0; istyle < nstyles; istyle++)
+    m += styles[istyle]->pack_restart(i,&buf[m]);
+
+  atom->nextra_restart = nextra_saved;
+
+  // fix data goes after the sub-styles; buf[0] then receives the total length
+  for (int iextra = 0; iextra < atom->nextra_restart; iextra++)
+    m += modify->fix[atom->extra_restart[iextra]]->pack_restart(i,&buf[m]);
+
+  buf[0] = m;
+  return m;
+}
+
+/* ----------------------------------------------------------------------
+   unpack data for one atom from restart file including extra quantities
+   unpack each sub-style one after the other
+   grow() occurs here so arrays for all sub-styles are grown
+------------------------------------------------------------------------- */
+
+int AtomVecHybridKokkos::unpack_restart(double *buf)
+{
+  const int nlocal = atom->nlocal;
+  if (nlocal == nmax) {
+    grow(0);
+    if (atom->nextra_store)
+      memory->grow(atom->extra,nmax,atom->nextra_store,"atom:extra");
+  }
+
+  const int nstore_saved = atom->nextra_store;
+  atom->nextra_store = 0;
+
+  int m = 0;
+  for (int istyle = 0; istyle < nstyles; istyle++) {
+    m += styles[istyle]->unpack_restart(&buf[m]);
+    atom->nlocal--;
+  }
+  atom->nextra_store = nstore_saved;
+
+  // the rest of the record (total length is buf[0]) is kept verbatim in atom->extra
+  if (atom->nextra_store) {
+    const int nleft = static_cast<int> (buf[0]) - m;
+    for (int ileft = 0; ileft < nleft; ileft++) atom->extra[nlocal][ileft] = buf[m++];
+  }
+
+  atom->nlocal++;
+  return m;
+}
+
+/* ----------------------------------------------------------------------
+   create one atom of itype at coord
+   create each sub-style one after the other
+   grow() occurs here so arrays for all sub-styles are grown
+------------------------------------------------------------------------- */
+
+void AtomVecHybridKokkos::create_atom(int itype, double *coord)
+{
+  if (atom->nlocal == nmax) grow(0);
+
+  // atom->nlocal is decremented after each sub-style, then incremented once at the end
+  for (int istyle = 0; istyle < nstyles; istyle++) {
+    styles[istyle]->create_atom(itype,coord);
+    atom->nlocal--;
+  }
+  atom->nlocal++;
+}
+
+/* ----------------------------------------------------------------------
+   unpack one line from Atoms section of data file
+   grow() occurs here so arrays for all sub-styles are grown
+------------------------------------------------------------------------- */
+
+void AtomVecHybridKokkos::data_atom(double *coord, imageint imagetmp, char **values)
+{
+  int nlocal = atom->nlocal;
+  if (nlocal == nmax) grow(0);
+  // values[0] is the atom ID, values[1] the atom type (must lie in 1..atom->ntypes)
+  h_tag[nlocal] = ATOTAGINT(values[0]);
+  h_type[nlocal] = atoi(values[1]);
+  if (h_type[nlocal] <= 0 || h_type[nlocal] > atom->ntypes)
+    error->one(FLERR,"Invalid atom type in Atoms section of data file");
+
+  h_x(nlocal,0) = coord[0];
+  h_x(nlocal,1) = coord[1];
+  h_x(nlocal,2) = coord[2];
+
+  h_image[nlocal] = imagetmp;
+  h_mask[nlocal] = 1;
+  // velocities (and omega/angmom when flagged) start at zero
+  h_v(nlocal,0) = 0.0;
+  h_v(nlocal,1) = 0.0;
+  h_v(nlocal,2) = 0.0;
+  if (atom->omega_flag) {
+    h_omega(nlocal,0) = 0.0;
+    h_omega(nlocal,1) = 0.0;
+    h_omega(nlocal,2) = 0.0;
+  }
+  if (atom->angmom_flag) {
+    h_angmom(nlocal,0) = 0.0;
+    h_angmom(nlocal,1) = 0.0;
+    h_angmom(nlocal,2) = 0.0;
+  }
+
+  // each sub-style parses sub-style specific values, starting at values[5]
+
+  int m = 5;
+  for (int k = 0; k < nstyles; k++)
+    m += styles[k]->data_atom_hybrid(nlocal,&values[m]);
+
+  atom->nlocal++;
+}
+
+/* ----------------------------------------------------------------------
+   unpack one line from Velocities section of data file
+------------------------------------------------------------------------- */
+
+void AtomVecHybridKokkos::data_vel(int m, char **values)
+{
+  // the first three values are the velocity components of atom m
+  for (int dim = 0; dim < 3; dim++) h_v(m,dim) = atof(values[dim]);
+
+  // the remaining values belong to the sub-styles
+  // each one reports how many values it consumed
+
+  int ivalue = 3;
+  for (int istyle = 0; istyle < nstyles; istyle++)
+    ivalue += styles[istyle]->data_vel_hybrid(m,&values[ivalue]);
+}
+
+/* ----------------------------------------------------------------------
+   pack atom info for data file including 3 image flags
+------------------------------------------------------------------------- */
+
+void AtomVecHybridKokkos::pack_data(double **buf)
+{
+  const int nlocal = atom->nlocal;
+  for (int iatom = 0; iatom < nlocal; iatom++) {
+    double *row = buf[iatom];
+    row[0] = ubuf(h_tag[iatom]).d;
+    row[1] = ubuf(h_type[iatom]).d;
+    row[2] = h_x(iatom,0);
+    row[3] = h_x(iatom,1);
+    row[4] = h_x(iatom,2);
+
+    // sub-style columns come next, then the 3 image flags extracted from h_image
+    int m = 5;
+    for (int istyle = 0; istyle < nstyles; istyle++)
+      m += styles[istyle]->pack_data_hybrid(iatom,&row[m]);
+
+    row[m] = ubuf((h_image[iatom] & IMGMASK) - IMGMAX).d;
+    row[m+1] = ubuf((h_image[iatom] >> IMGBITS & IMGMASK) - IMGMAX).d;
+    row[m+2] = ubuf((h_image[iatom] >> IMG2BITS) - IMGMAX).d;
+  }
+}
+
+/* ----------------------------------------------------------------------
+   write atom info to data file including 3 image flags
+------------------------------------------------------------------------- */
+
+void AtomVecHybridKokkos::write_data(FILE *fp, int n, double **buf)
+{
+  for (int irow = 0; irow < n; irow++) {
+    double *row = buf[irow];
+    fprintf(fp,TAGINT_FORMAT " %d %-1.16e %-1.16e %-1.16e",
+            (tagint) ubuf(row[0]).i,(int) ubuf(row[1]).i,
+            row[2],row[3],row[4]);
+
+    // sub-style columns start at 5; each return value advances the offset
+    int col = 5;
+    for (int istyle = 0; istyle < nstyles; istyle++)
+      col += styles[istyle]->write_data_hybrid(fp,&row[col]);
+
+    // the three image flags close the line
+    fprintf(fp," %d %d %d\n",(int) ubuf(row[col]).i,
+            (int) ubuf(row[col+1]).i,(int) ubuf(row[col+2]).i);
+  }
+}
+
+/* ----------------------------------------------------------------------
+   pack velocity info for data file
+------------------------------------------------------------------------- */
+
+void AtomVecHybridKokkos::pack_vel(double **buf)
+{
+  const int nowned = atom->nlocal;
+
+  for (int i = 0; i < nowned; i++) {
+    double *row = buf[i];
+    // column 0 is the tag, columns 1-3 the velocity components
+    row[0] = ubuf(h_tag[i]).d;
+    for (int dim = 0; dim < 3; dim++) row[1+dim] = h_v(i,dim);
+
+    // sub-style columns start at 4; each return value advances the offset
+    int col = 4;
+    for (int istyle = 0; istyle < nstyles; istyle++)
+      col += styles[istyle]->pack_vel_hybrid(i,&row[col]);
+  }
+}
+
+/* ----------------------------------------------------------------------
+   write velocity info to data file
+------------------------------------------------------------------------- */
+
+void AtomVecHybridKokkos::write_vel(FILE *fp, int n, double **buf)
+{
+  for (int irow = 0; irow < n; irow++) {
+    double *row = buf[irow];
+    fprintf(fp,TAGINT_FORMAT " %g %g %g",
+            (tagint) ubuf(row[0]).i,row[1],row[2],row[3]);
+
+    // sub-style columns start at 4; each return value advances the offset
+    int col = 4;
+    for (int istyle = 0; istyle < nstyles; istyle++)
+      col += styles[istyle]->write_vel_hybrid(fp,&row[col]);
+
+    fprintf(fp,"\n");
+  }
+}
+
+/* ----------------------------------------------------------------------
+   assign an index to named atom property and return index
+   returned value encodes which sub-style and index returned by sub-style
+   return -1 if name is unknown to any sub-styles
+------------------------------------------------------------------------- */
+
+int AtomVecHybridKokkos::property_atom(char *name)
+{
+  for (int istyle = 0; istyle < nstyles; istyle++) {   // first match wins
+    const int sub = styles[istyle]->property_atom(name);
+    if (sub >= 0) return sub*nstyles + istyle;   // fold sub-style id into result
+  }
+  return -1;   // no sub-style recognized the name
+}
+
+/* ----------------------------------------------------------------------
+   pack per-atom data into buf for ComputePropertyAtom
+   index maps to data specific to this atom style
+------------------------------------------------------------------------- */
+
+void AtomVecHybridKokkos::pack_property_atom(int multiindex, double *buf,
+                                       int nvalues, int groupbit)
+{
+  // split multiindex: remainder selects the sub-style, quotient is its index
+  AtomVec *sub = styles[multiindex % nstyles];
+  sub->pack_property_atom(multiindex/nstyles,buf,nvalues,groupbit);
+}
+
+/* ----------------------------------------------------------------------
+   allstyles = list of all atom styles in this LAMMPS executable
+------------------------------------------------------------------------- */
+
+void AtomVecHybridKokkos::build_styles()
+{
+  nallstyles = 0;   // pass 1: every AtomStyle() expansion from style_atom.h bumps the count
+#define ATOM_CLASS
+#define AtomStyle(key,Class) nallstyles++;
+#include "style_atom.h"
+#undef AtomStyle
+#undef ATOM_CLASS
+
+  allstyles = new char*[nallstyles];   // one name slot per counted entry
+
+  int n;
+  nallstyles = 0;   // pass 2: reset, then reuse as the fill index while copying each key
+#define ATOM_CLASS
+#define AtomStyle(key,Class)                \
+  n = strlen(#key) + 1;                     \
+  allstyles[nallstyles] = new char[n];      \
+  strcpy(allstyles[nallstyles],#key);       \
+  nallstyles++;
+#include "style_atom.h"
+#undef AtomStyle
+#undef ATOM_CLASS
+}
+
+/* ----------------------------------------------------------------------
+   return 1 if str matches a name in allstyles, else 0
+------------------------------------------------------------------------- */
+
+int AtomVecHybridKokkos::known_style(char *str)
+{
+  int istyle = 0;   // advance until a name matches or the list is exhausted
+  while (istyle < nallstyles && strcmp(str,allstyles[istyle]) != 0) istyle++;
+  return (istyle < nallstyles) ? 1 : 0;
+}
+
+/* ----------------------------------------------------------------------
+   return # of bytes of allocated memory
+------------------------------------------------------------------------- */
+
+bigint AtomVecHybridKokkos::memory_usage()
+{
+  bigint total = 0;   // only the sub-styles' reported usage is tallied
+  for (int istyle = 0; istyle < nstyles; istyle++) total = total + styles[istyle]->memory_usage();
+  return total;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecHybridKokkos::sync(ExecutionSpace space, unsigned int h_mask)
+{ // relay the request unchanged to every sub-style, in list order
+  for (int is = 0; is < nstyles; is++) static_cast<AtomVecKokkos*>(styles[is])->sync(space,h_mask);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecHybridKokkos::sync_overlapping_device(ExecutionSpace space, unsigned int h_mask)
+{ // relay the request unchanged to every sub-style, in list order
+  for (int is = 0; is < nstyles; is++) static_cast<AtomVecKokkos*>(styles[is])->sync_overlapping_device(space,h_mask);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecHybridKokkos::modified(ExecutionSpace space, unsigned int h_mask)
+{ // relay the notification unchanged to every sub-style, in list order
+  for (int is = 0; is < nstyles; is++) static_cast<AtomVecKokkos*>(styles[is])->modified(space,h_mask);
+}
diff --git a/src/KOKKOS/atom_vec_hybrid_kokkos.h b/src/KOKKOS/atom_vec_hybrid_kokkos.h
new file mode 100644
index 0000000000..802314bfa6
--- /dev/null
+++ b/src/KOKKOS/atom_vec_hybrid_kokkos.h
@@ -0,0 +1,161 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef ATOM_CLASS
+
+AtomStyle(hybrid/kk,AtomVecHybridKokkos)
+
+#else
+
+#ifndef LMP_ATOM_VEC_HYBRID_KOKKOS_H
+#define LMP_ATOM_VEC_HYBRID_KOKKOS_H
+
+#include <stdio.h>
+#include "atom_vec.h"
+#include "kokkos_type.h"
+
+namespace LAMMPS_NS {
+
+class AtomVecHybridKokkos : public AtomVecKokkos {   // atom style that delegates to a list of sub-styles
+ public:
+  int nstyles;               // number of entries in styles[]
+  class AtomVec **styles;    // sub-style instances, used as styles[0..nstyles-1]
+  char **keywords;
+
+  AtomVecHybridKokkos(class LAMMPS *);
+  ~AtomVecHybridKokkos();
+  void process_args(int, char **);
+  void init();
+  void grow(int);
+  void grow_reset();
+  void copy(int, int, int);
+  void clear_bonus();
+  void force_clear(int, size_t);
+  int pack_comm(int, int *, double *, int, int *);
+  int pack_comm_vel(int, int *, double *, int, int *);
+  void unpack_comm(int, int, double *);
+  void unpack_comm_vel(int, int, double *);
+  int pack_reverse(int, int, double *);
+  void unpack_reverse(int, int *, double *);
+  int pack_border(int, int *, double *, int, int *);
+  int pack_border_vel(int, int *, double *, int, int *);
+  void unpack_border(int, int, double *);
+  void unpack_border_vel(int, int, double *);
+  int pack_exchange(int, double *);
+  int unpack_exchange(double *);
+  int size_restart();
+  int pack_restart(int, double *);
+  int unpack_restart(double *);
+  void create_atom(int, double *);
+  void data_atom(double *, imageint, char **);
+  int data_atom_hybrid(int, char **) {return 0;}   // always reports 0 values consumed
+  void data_vel(int, char **);                     // vx,vy,vz first, then sub-style fields
+  void pack_data(double **);
+  void write_data(FILE *, int, double **);
+  void pack_vel(double **);
+  void write_vel(FILE *, int, double **);
+  int property_atom(char *);                       // index*nstyles + k, or -1 if unknown
+  void pack_property_atom(int, double *, int, int);   // splits that value back into (k, index)
+  bigint memory_usage();                           // sum of the sub-styles' memory_usage()
+
+  int pack_comm_kokkos(const int &n, const DAT::tdual_int_2d &k_sendlist,
+                       const int & iswap,
+                       const DAT::tdual_xfloat_2d &buf,
+                       const int &pbc_flag, const int pbc[]);
+  void unpack_comm_kokkos(const int &n, const int &nfirst,
+                          const DAT::tdual_xfloat_2d &buf);
+  int pack_comm_self(const int &n, const DAT::tdual_int_2d &list,
+                     const int & iswap, const int nfirst,
+                     const int &pbc_flag, const int pbc[]);
+  int pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist,
+                         DAT::tdual_xfloat_2d buf,int iswap,
+                         int pbc_flag, int *pbc, ExecutionSpace space);
+  void unpack_border_kokkos(const int &n, const int &nfirst,
+                            const DAT::tdual_xfloat_2d &buf,
+                            ExecutionSpace space);
+  int pack_exchange_kokkos(const int &nsend,DAT::tdual_xfloat_2d &buf,
+                           DAT::tdual_int_1d k_sendlist,
+                           DAT::tdual_int_1d k_copylist,
+                           ExecutionSpace space, int dim,
+                           X_FLOAT lo, X_FLOAT hi);
+  int unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf, int nrecv,
+                             int nlocal, int dim, X_FLOAT lo, X_FLOAT hi,
+                             ExecutionSpace space);
+
+  void sync(ExecutionSpace space, unsigned int mask);       // forwarded to every sub-style
+  void modified(ExecutionSpace space, unsigned int mask);   // forwarded to every sub-style
+  void sync_overlapping_device(ExecutionSpace space, unsigned int mask);   // forwarded likewise
+
+ private:
+  tagint *tag;
+  int *type,*mask;
+  imageint *image;
+  double **x,**v,**f;
+  double **omega,**angmom;
+
+  DAT::t_tagint_1d d_tag;          // NOTE(review): d_* look like device views, h_* host views — confirm
+  DAT::t_int_1d d_type, d_mask;
+  HAT::t_tagint_1d h_tag;          // h_* views are what the data-file read/write routines index
+  HAT::t_int_1d h_type, h_mask;
+
+  DAT::t_imageint_1d d_image;
+  HAT::t_imageint_1d h_image;
+
+  DAT::t_x_array d_x;
+  DAT::t_v_array d_v;
+  DAT::t_f_array d_f;
+  HAT::t_x_array h_x;
+  HAT::t_v_array h_v;
+  HAT::t_f_array h_f;
+
+  DAT::t_v_array d_omega, d_angmom;
+  HAT::t_v_array h_omega, h_angmom;
+
+  DAT::tdual_int_1d k_count;
+
+  int nallstyles;      // number of names stored in allstyles
+  char **allstyles;    // style-name strings allocated by build_styles()
+
+  void build_styles();        // fills allstyles/nallstyles from style_atom.h
+  int known_style(char *);    // 1 if the name is in allstyles, else 0
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Atom style hybrid cannot have hybrid as an argument
+
+Self-explanatory.
+
+E: Atom style hybrid cannot use same atom style twice
+
+Self-explanatory.
+
+E: Cannot mix molecular and molecule template atom styles
+
+Self-explanatory.
+
+E: Per-processor system is too big
+
+The number of owned atoms plus ghost atoms on a single
+processor must fit in 32-bit integer.
+
+E: Invalid atom type in Atoms section of data file
+
+Atom types must range from 1 to specified # of types.
+
+*/

From d6f6c6faf1ecd6b25b1297b2f546632f3864fa45 Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Sun, 12 Mar 2017 16:05:28 -0400
Subject: [PATCH 197/267] USER-DPD: Make newton-off warning in pair_dpd_fdt* be
 more selective. If using fix_shardlow, the pair_dpd_fdt* styles are okay with
 newton off, because the stochastic forces are thus only done in fix_shardlow.

---
 src/USER-DPD/pair_dpd_fdt.cpp        | 11 +++++------
 src/USER-DPD/pair_dpd_fdt_energy.cpp | 11 +++++------
 2 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/src/USER-DPD/pair_dpd_fdt.cpp b/src/USER-DPD/pair_dpd_fdt.cpp
index 90aa4f1eaf..987755db8a 100644
--- a/src/USER-DPD/pair_dpd_fdt.cpp
+++ b/src/USER-DPD/pair_dpd_fdt.cpp
@@ -316,18 +316,17 @@ void PairDPDfdt::init_style()
   if (comm->ghost_velocity == 0)
     error->all(FLERR,"Pair dpd/fdt requires ghost atoms store velocity");
 
-  // if newton off, forces between atoms ij will be double computed
-  // using different random numbers
-
-  if (force->newton_pair == 0 && comm->me == 0) error->warning(FLERR,
-      "Pair dpd/fdt requires newton pair on");
-
   splitFDT_flag = false;
   int irequest = neighbor->request(this,instance_me);
   for (int i = 0; i < modify->nfix; i++)
     if (strncmp(modify->fix[i]->style,"shardlow", 8) == 0){
       splitFDT_flag = true;
     }
+
+  // if newton off, forces between atoms ij will be double computed
+  // using different random numbers if splitFDT_flag is false
+  if (!splitFDT_flag && (force->newton_pair == 0) && (comm->me == 0)) error->warning(FLERR,
+      "Pair dpd/fdt requires newton pair on if not also using fix shardlow");
 }
 
 /* ----------------------------------------------------------------------
diff --git a/src/USER-DPD/pair_dpd_fdt_energy.cpp b/src/USER-DPD/pair_dpd_fdt_energy.cpp
index ad6310a283..bf86f95b5f 100644
--- a/src/USER-DPD/pair_dpd_fdt_energy.cpp
+++ b/src/USER-DPD/pair_dpd_fdt_energy.cpp
@@ -405,12 +405,6 @@ void PairDPDfdtEnergy::init_style()
   if (comm->ghost_velocity == 0)
     error->all(FLERR,"Pair dpd/fdt/energy requires ghost atoms store velocity");
 
-  // if newton off, forces between atoms ij will be double computed
-  // using different random numbers
-
-  if (force->newton_pair == 0 && comm->me == 0) error->warning(FLERR,
-      "Pair dpd/fdt/energy requires newton pair on");
-
   splitFDT_flag = false;
   int irequest = neighbor->request(this,instance_me);
   for (int i = 0; i < modify->nfix; i++)
@@ -418,6 +412,11 @@ void PairDPDfdtEnergy::init_style()
       splitFDT_flag = true;
     }
 
+  // if newton off, forces between atoms ij will be double computed
+  // using different random numbers if splitFDT_flag is false
+  if (!splitFDT_flag && (force->newton_pair == 0) && (comm->me == 0)) error->warning(FLERR,
+      "Pair dpd/fdt/energy requires newton pair on if not also using fix shardlow");
+
   bool eos_flag = false;
   for (int i = 0; i < modify->nfix; i++)
     if (strncmp(modify->fix[i]->style,"eos",3) == 0) eos_flag = true;

From e908b8dbea0284460ea070cd98862dc4abd5d4c1 Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Sun, 12 Mar 2017 16:20:09 -0400
Subject: [PATCH 198/267] USER-DPD Kokkos: correct some error messages

---
 src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp | 4 ++--
 src/KOKKOS/pair_exp6_rx_kokkos.cpp        | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp
index bd0f08efa6..1c63b9af95 100644
--- a/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp
+++ b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp
@@ -104,7 +104,7 @@ void PairDPDfdtEnergyKokkos<DeviceType>::init_style()
     neighbor->requests[irequest]->full = 0;
     neighbor->requests[irequest]->half = 1;
   } else {
-    error->all(FLERR,"Cannot use chosen neighbor list style with reax/c/kk");
+    error->all(FLERR,"Cannot use chosen neighbor list style with dpd/fdt/energy/kk");
   }
 
 #ifdef DPD_USE_RAN_MARS
@@ -139,7 +139,7 @@ void PairDPDfdtEnergyKokkos<Kokkos::Cuda>::init_style()
     neighbor->requests[irequest]->full = 0;
     neighbor->requests[irequest]->half = 1;
   } else {
-    error->all(FLERR,"Cannot use chosen neighbor list style with reax/c/kk");
+    error->all(FLERR,"Cannot use chosen neighbor list style with dpd/fdt/energy/kk");
   }
 
 #ifdef DPD_USE_RAN_MARS
diff --git a/src/KOKKOS/pair_exp6_rx_kokkos.cpp b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
index 4b0748721c..e22a4bff22 100644
--- a/src/KOKKOS/pair_exp6_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
@@ -112,7 +112,7 @@ void PairExp6rxKokkos<DeviceType>::init_style()
     neighbor->requests[irequest]->full = 0;
     neighbor->requests[irequest]->half = 1;
   } else {
-    error->all(FLERR,"Cannot use chosen neighbor list style with reax/c/kk");
+    error->all(FLERR,"Cannot use chosen neighbor list style with exp6/rx/kk");
   }
 }
 
@@ -1242,4 +1242,4 @@ template class PairExp6rxKokkos<LMPDeviceType>;
 #ifdef KOKKOS_HAVE_CUDA
 template class PairExp6rxKokkos<LMPHostType>;
 #endif
-}
\ No newline at end of file
+}

From b1b377cb594738d35b31ba33ea2d125e78483ee3 Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Sun, 12 Mar 2017 17:48:51 -0400
Subject: [PATCH 199/267] USER-DPD: fix_shardlow's neighbor request needs
 "newton on" override. Even if other stuff is doing newton off, SSA must have
 it turned on.

---
 src/USER-DPD/fix_shardlow.cpp | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/USER-DPD/fix_shardlow.cpp b/src/USER-DPD/fix_shardlow.cpp
index 5132d937ea..2b7ef9314b 100644
--- a/src/USER-DPD/fix_shardlow.cpp
+++ b/src/USER-DPD/fix_shardlow.cpp
@@ -132,10 +132,11 @@ int FixShardlow::setmask()
 void FixShardlow::init()
 {
   int irequest = neighbor->request(this,instance_me);
-  neighbor->requests[irequest]->pair = 0;
-  neighbor->requests[irequest]->fix  = 1;
-  neighbor->requests[irequest]->ghost= 1;
-  neighbor->requests[irequest]->ssa  = 1;
+  neighbor->requests[irequest]->pair   = 0;
+  neighbor->requests[irequest]->fix    = 1;
+  neighbor->requests[irequest]->ghost  = 1;
+  neighbor->requests[irequest]->ssa    = 1;
+  neighbor->requests[irequest]->newton = 1; // SSA requires newton on
 }
 
 /* ---------------------------------------------------------------------- */

From d5eceebf3283cd460a4230672c582b952bef36f0 Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Mon, 13 Mar 2017 01:56:00 -0400
Subject: [PATCH 200/267] USER-DPD Kokkos: add support for full neighbor lists.
 Note: "newton on" still required if using non-kokkos pair styles or fixes.
 Non-kokkos pairs/fixes don't expect their half lists with newton off, which
 happens if newton is turned off globally by kokkos via commandline. Note2:
 Regardless, fix_shardlow* will still use half lists and newton on.

---
 src/KOKKOS/fix_rx_kokkos.cpp              |  9 +++--
 src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp | 44 +++++++++++++++++++----
 src/KOKKOS/pair_exp6_rx_kokkos.cpp        | 16 ++++++---
 src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp  | 27 +++++++++++---
 src/KOKKOS/pair_table_rx_kokkos.cpp       | 16 +++++++++
 5 files changed, 96 insertions(+), 16 deletions(-)

diff --git a/src/KOKKOS/fix_rx_kokkos.cpp b/src/KOKKOS/fix_rx_kokkos.cpp
index 08a20ac9a7..ac81e5c2a7 100644
--- a/src/KOKKOS/fix_rx_kokkos.cpp
+++ b/src/KOKKOS/fix_rx_kokkos.cpp
@@ -1450,6 +1450,11 @@ void FixRxKokkos<DeviceType>::solve_reactions(const int vflag, const bool isPreF
              computeLocalTemperature<_wtflag, _localTempFlag, true , HALFTHREAD> (); \
           else \
              computeLocalTemperature<_wtflag, _localTempFlag, false, HALFTHREAD> (); \
+       else if (neighflag == FULL) \
+          if (newton_pair) \
+             computeLocalTemperature<_wtflag, _localTempFlag, true , FULL> (); \
+          else \
+             computeLocalTemperature<_wtflag, _localTempFlag, false, FULL> (); \
     }
 
     // Are there is no other options than wtFlag = (0)LUCY and localTempFlag = NONE : HARMONIC?
@@ -1934,12 +1939,12 @@ void FixRxKokkos<DeviceType>::operator()(Tag_FixRxKokkos_firstPairOperator<WT_FL
       {
         wij = (1.0+3.0*ratio) * (1.0-ratio)*(1.0-ratio)*(1.0-ratio);
         i_dpdThetaLocal += wij / d_dpdTheta(j);
-        if (NEWTON_PAIR || j < nlocal)
+        if ((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR || j < nlocal))
           a_dpdThetaLocal(j) += wij / d_dpdTheta(i);
       }
 
       i_sumWeights += wij;
-      if (NEWTON_PAIR || j < nlocal)
+      if ((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR || j < nlocal))
         a_sumWeights(j) += wij;
     }
   }
diff --git a/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp
index 1c63b9af95..03bf1a8b61 100644
--- a/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp
+++ b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp
@@ -233,6 +233,14 @@ void PairDPDfdtEnergyKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
             if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALFTHREAD,0,1,false> >(0,inum),*this,ev);
             else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALFTHREAD,0,0,false> >(0,inum),*this);
           }
+        } else if (neighflag == FULL) {
+          if (newton_pair) {
+            if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<FULL,1,1,false> >(0,inum),*this,ev);
+            else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<FULL,1,0,false> >(0,inum),*this);
+          } else {
+            if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<FULL,0,1,false> >(0,inum),*this,ev);
+            else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<FULL,0,0,false> >(0,inum),*this);
+          }
         }
       } else {
         if (neighflag == HALF) {
@@ -251,6 +259,14 @@ void PairDPDfdtEnergyKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
             if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALFTHREAD,0,1,true> >(0,inum),*this,ev);
             else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALFTHREAD,0,0,true> >(0,inum),*this);
           }
+        } else if (neighflag == FULL) {
+          if (newton_pair) {
+            if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<FULL,1,1,true> >(0,inum),*this,ev);
+            else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<FULL,1,0,true> >(0,inum),*this);
+          } else {
+            if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<FULL,0,1,true> >(0,inum),*this,ev);
+            else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<FULL,0,0,true> >(0,inum),*this);
+          }
         }
       }
     }
@@ -291,6 +307,14 @@ void PairDPDfdtEnergyKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
           if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<HALFTHREAD,0,1,false> >(0,inum),*this,ev);
           else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<HALFTHREAD,0,0,false> >(0,inum),*this);
         }
+      } else if (neighflag == FULL) {
+        if (newton_pair) {
+          if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<FULL,1,1,false> >(0,inum),*this,ev);
+          else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<FULL,1,0,false> >(0,inum),*this);
+        } else {
+          if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<FULL,0,1,false> >(0,inum),*this,ev);
+          else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<FULL,0,0,false> >(0,inum),*this);
+        }
       }
     } else {
       if (neighflag == HALF) {
@@ -309,6 +333,14 @@ void PairDPDfdtEnergyKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
           if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<HALFTHREAD,0,1,false> >(0,inum),*this,ev);
           else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<HALFTHREAD,0,0,false> >(0,inum),*this);
         }
+      } else if (neighflag == FULL) {
+        if (newton_pair) {
+          if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<FULL,1,1,false> >(0,inum),*this,ev);
+          else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<FULL,1,0,false> >(0,inum),*this);
+        } else {
+          if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<FULL,0,1,false> >(0,inum),*this,ev);
+          else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<FULL,0,0,false> >(0,inum),*this);
+        }
       }
     }
 
@@ -405,7 +437,7 @@ void PairDPDfdtEnergyKokkos<DeviceType>::operator()(TagPairDPDfdtEnergyComputeSp
       fx_i += delx*fpair;
       fy_i += dely*fpair;
       fz_i += delz*fpair;
-      if (NEWTON_PAIR || j < nlocal) {
+      if ((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR || j < nlocal)) {
         a_f(j,0) -= delx*fpair;
         a_f(j,1) -= dely*fpair;
         a_f(j,2) -= delz*fpair;
@@ -418,7 +450,7 @@ void PairDPDfdtEnergyKokkos<DeviceType>::operator()(TagPairDPDfdtEnergyComputeSp
         evdwl = 0.5*a0_ij*cut_ij * wd;
         evdwl *= factor_dpd;
         if (EVFLAG)
-          ev.evdwl += ((NEWTON_PAIR||(j<nlocal))?1.0:0.5)*evdwl;
+          ev.evdwl += (((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR||(j<nlocal)))?1.0:0.5)*evdwl;
       }
 
       if (EVFLAG) this->template ev_tally<NEIGHFLAG,NEWTON_PAIR>(ev,i,j,evdwl,fpair,delx,dely,delz);
@@ -522,7 +554,7 @@ void PairDPDfdtEnergyKokkos<DeviceType>::operator()(TagPairDPDfdtEnergyComputeNo
       fx_i += delx*fpair;
       fy_i += dely*fpair;
       fz_i += delz*fpair;
-      if (NEWTON_PAIR || j < nlocal) {
+      if ((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR || j < nlocal)) {
         a_f(j,0) -= delx*fpair;
         a_f(j,1) -= dely*fpair;
         a_f(j,2) -= delz*fpair;
@@ -548,7 +580,7 @@ void PairDPDfdtEnergyKokkos<DeviceType>::operator()(TagPairDPDfdtEnergyComputeNo
       uTmp *= 0.5;
 
       a_duMech[i] += uTmp;
-      if (NEWTON_PAIR || j < nlocal) {
+      if ((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR || j < nlocal)) {
         a_duMech[j] += uTmp;
       }
 
@@ -562,7 +594,7 @@ void PairDPDfdtEnergyKokkos<DeviceType>::operator()(TagPairDPDfdtEnergyComputeNo
       uTmp += randPair;
 
       a_duCond[i] += uTmp;
-      if (NEWTON_PAIR || j < nlocal) {
+      if ((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR || j < nlocal)) {
         a_duCond[j] -= uTmp;
       }
 
@@ -573,7 +605,7 @@ void PairDPDfdtEnergyKokkos<DeviceType>::operator()(TagPairDPDfdtEnergyComputeNo
         evdwl = 0.5*a0_ij*cut_ij * wd;
         evdwl *= factor_dpd;
         if (EVFLAG)
-          ev.evdwl += ((NEWTON_PAIR||(j<nlocal))?1.0:0.5)*evdwl;
+          ev.evdwl += (((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR||(j<nlocal)))?1.0:0.5)*evdwl;
       }
 
       if (EVFLAG) this->template ev_tally<NEIGHFLAG,NEWTON_PAIR>(ev,i,j,evdwl,fpair,delx,dely,delz);
diff --git a/src/KOKKOS/pair_exp6_rx_kokkos.cpp b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
index e22a4bff22..abc158d72c 100644
--- a/src/KOKKOS/pair_exp6_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
@@ -221,6 +221,14 @@ void PairExp6rxKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
       if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairExp6rxCompute<HALFTHREAD,0,1> >(0,inum),*this,ev);
       else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairExp6rxCompute<HALFTHREAD,0,0> >(0,inum),*this);
     }
+  } else if (neighflag == FULL) {
+    if (newton_pair) {
+      if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairExp6rxCompute<FULL,1,1> >(0,inum),*this,ev);
+      else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairExp6rxCompute<FULL,1,0> >(0,inum),*this);
+    } else {
+      if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairExp6rxCompute<FULL,0,1> >(0,inum),*this,ev);
+      else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairExp6rxCompute<FULL,0,0> >(0,inum),*this);
+    }
   }
 
   k_error_flag.template modify<DeviceType>();
@@ -509,7 +517,7 @@ void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxCompute<NEIGHFLAG,NEW
         evdwlOld *= factor_lj;
 
         uCG_i += 0.5*evdwlOld;
-        if (NEWTON_PAIR || j < nlocal)
+        if ((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR || j < nlocal))
           a_uCG[j] += 0.5*evdwlOld;
       }
 
@@ -592,7 +600,7 @@ void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxCompute<NEIGHFLAG,NEW
       fx_i += delx*fpair;
       fy_i += dely*fpair;
       fz_i += delz*fpair;
-      if (NEWTON_PAIR || j < nlocal) {
+      if ((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR || j < nlocal)) {
         a_f(j,0) -= delx*fpair;
         a_f(j,1) -= dely*fpair;
         a_f(j,2) -= delz*fpair;
@@ -603,11 +611,11 @@ void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxCompute<NEIGHFLAG,NEW
       evdwl *= factor_lj;
 
       uCGnew_i   += 0.5*evdwl;
-      if (NEWTON_PAIR || j < nlocal)
+      if ((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR || j < nlocal))
         a_uCGnew[j] += 0.5*evdwl;
       evdwl = evdwlOld;
       if (EVFLAG)
-        ev.evdwl += ((NEWTON_PAIR||(j<nlocal))?1.0:0.5)*evdwl;
+        ev.evdwl += (((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR||(j<nlocal)))?1.0:0.5)*evdwl;
       //if (vflag_either || eflag_atom) 
       if (EVFLAG) this->template ev_tally<NEIGHFLAG,NEWTON_PAIR>(ev,i,j,evdwl,fpair,delx,dely,delz);
     }
diff --git a/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp b/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp
index 4379cc4001..ef30fdc6f6 100644
--- a/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp
@@ -216,6 +216,14 @@ void PairMultiLucyRXKokkos<DeviceType>::compute_style(int eflag_in, int vflag_in
       if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXCompute<HALFTHREAD,0,1,TABSTYLE> >(0,inum),*this,ev);
       else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXCompute<HALFTHREAD,0,0,TABSTYLE> >(0,inum),*this);
     }
+  } else if (neighflag == FULL) {
+    if (newton_pair) {
+      if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXCompute<FULL,1,1,TABSTYLE> >(0,inum),*this,ev);
+      else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXCompute<FULL,1,0,TABSTYLE> >(0,inum),*this);
+    } else {
+      if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXCompute<FULL,0,1,TABSTYLE> >(0,inum),*this,ev);
+      else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXCompute<FULL,0,0,TABSTYLE> >(0,inum),*this);
+    }
   }
 
   if (evflag) atomKK->modified(execution_space,F_MASK | ENERGY_MASK | VIRIAL_MASK | UCG_MASK | UCGNEW_MASK);
@@ -378,7 +386,7 @@ void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXCompute<NEI
       fx_i += delx*fpair;
       fy_i += dely*fpair;
       fz_i += delz*fpair;
-      if (NEWTON_PAIR || j < nlocal) {
+      if ((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR || j < nlocal)) {
         a_f(j,0) -= delx*fpair;
         a_f(j,1) -= dely*fpair;
         a_f(j,2) -= delz*fpair;
@@ -421,7 +429,7 @@ void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXCompute<NEI
 
   //if (evflag) ev_tally(0,0,nlocal,newton_pair,evdwl,0.0,0.0,0.0,0.0,0.0);
   if (EVFLAG)
-    ev.evdwl += (NEWTON_PAIR?1.0:0.5)*evdwl;
+    ev.evdwl += ((/*FIXME??? (NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) && */ NEWTON_PAIR)?1.0:0.5)*evdwl;
 }
 
 template<class DeviceType>
@@ -491,6 +499,17 @@ void PairMultiLucyRXKokkos<DeviceType>::computeLocalDensity()
         Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXComputeLocalDensity<HALFTHREAD,0,true> >(0,inum),*this);
       else
         Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXComputeLocalDensity<HALFTHREAD,0,false> >(0,inum),*this);
+  } else if (neighflag == FULL) {
+    if (newton_pair)
+      if (one_type)
+        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXComputeLocalDensity<FULL,1,true> >(0,inum),*this);
+      else
+        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXComputeLocalDensity<FULL,1,false> >(0,inum),*this);
+    else
+      if (one_type)
+        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXComputeLocalDensity<FULL,0,true> >(0,inum),*this);
+      else
+        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXComputeLocalDensity<FULL,0,false> >(0,inum),*this);
   }
 
   atomKK->modified(execution_space,DPDRHO_MASK);
@@ -548,7 +567,7 @@ void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXComputeLoca
         const double tmpFactor4 = tmpFactor*tmpFactor*tmpFactor*tmpFactor;
         const double factor = factor_type11*(1.0 + 1.5*r_over_rcut)*tmpFactor4;
         rho_i_contrib += factor;
-        if (NEWTON_PAIR || j < nlocal)
+        if ((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR || j < nlocal))
           a_rho[j] += factor;
       }
     } else if (rsq < d_cutsq(itype,jtype)) {
@@ -557,7 +576,7 @@ void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXComputeLoca
       const double tmpFactor4 = tmpFactor*tmpFactor*tmpFactor*tmpFactor;
       const double factor = (84.0/(5.0*pi*rcut*rcut*rcut))*(1.0+3.0*sqrt(rsq)/(2.0*rcut))*tmpFactor4;
       rho_i_contrib += factor;
-      if (NEWTON_PAIR || j < nlocal)
+      if ((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR || j < nlocal))
         a_rho[j] += factor;
     }
   }
diff --git a/src/KOKKOS/pair_table_rx_kokkos.cpp b/src/KOKKOS/pair_table_rx_kokkos.cpp
index cbb1096712..e3d416f293 100644
--- a/src/KOKKOS/pair_table_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_table_rx_kokkos.cpp
@@ -693,6 +693,14 @@ void PairTableRXKokkos<DeviceType>::compute_style(int eflag_in, int vflag_in)
           special_lj_local, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
           d_table_const, eflag, eflag_atom,
           vflag, vflag_global, vflag_atom, d_vatom, d_eatom);
+    } else if (neighflag == FULL) {
+      compute_all_items<DeviceType,FULL,false,TABSTYLE>(
+          newton_pair, ev, nlocal,
+          l->inum, l->d_ilist, l->d_neighbors, l->d_numneigh,
+          x, type, mixWtSite1old, mixWtSite2old, mixWtSite1, mixWtSite2,
+          special_lj_local, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
+          d_table_const, eflag, eflag_atom,
+          vflag, vflag_global, vflag_atom, d_vatom, d_eatom);
     }
   } else {
     if (neighflag == HALFTHREAD) {
@@ -711,6 +719,14 @@ void PairTableRXKokkos<DeviceType>::compute_style(int eflag_in, int vflag_in)
           special_lj_local, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
           d_table_const, eflag, eflag_atom,
           vflag, vflag_global, vflag_atom, d_vatom, d_eatom);
+    } else if (neighflag == FULL) {
+      compute_all_items<DeviceType,FULL,true,TABSTYLE>(
+          newton_pair, ev, nlocal,
+          l->inum, l->d_ilist, l->d_neighbors, l->d_numneigh,
+          x, type, mixWtSite1old, mixWtSite2old, mixWtSite1, mixWtSite2,
+          special_lj_local, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
+          d_table_const, eflag, eflag_atom,
+          vflag, vflag_global, vflag_atom, d_vatom, d_eatom);
     }
   }
 

From 4b4bc7dc3bd11d52e6ad49e99749f80405d7dbbb Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Mon, 13 Mar 2017 03:03:27 -0400
Subject: [PATCH 201/267] USER-DPD: specialize PairTableRXKokkos's
 compute_all_items() on NEWTON_PAIR. No noticeable performance change, but it
 does eliminate a deep conditional.

---
 src/KOKKOS/pair_table_rx_kokkos.cpp | 110 +++++++++++++++++-----------
 1 file changed, 69 insertions(+), 41 deletions(-)

diff --git a/src/KOKKOS/pair_table_rx_kokkos.cpp b/src/KOKKOS/pair_table_rx_kokkos.cpp
index e3d416f293..e93ea53fa4 100644
--- a/src/KOKKOS/pair_table_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_table_rx_kokkos.cpp
@@ -514,9 +514,8 @@ compute_item(
   return ev;
 }
 
-template<class DeviceType, int NEIGHFLAG, bool STACKPARAMS, int TABSTYLE>
+template<class DeviceType, int NEIGHFLAG, bool STACKPARAMS, int TABSTYLE, int NEWTON_PAIR>
 static void compute_all_items(
-    int newton_pair,
     EV_FLOAT& ev,
     int nlocal,
     int inum,
@@ -560,42 +559,23 @@ static void compute_all_items(
   if (eflag || vflag) {
     Kokkos::parallel_reduce(inum,
     LAMMPS_LAMBDA(int i, EV_FLOAT& energy_virial) {
-      if (newton_pair) {
         energy_virial +=
-          compute_item<DeviceType,NEIGHFLAG,STACKPARAMS,TABSTYLE,1,1>(
+          compute_item<DeviceType,NEIGHFLAG,STACKPARAMS,TABSTYLE,1,NEWTON_PAIR>(
             i, nlocal, d_ilist, d_neighbors, d_numneigh, x, type,
             mixWtSite1old, mixWtSite2old, mixWtSite1, mixWtSite2,
             special_lj, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
             d_table_const, eflag, eflag_atom,
             vflag, vflag_global, vflag_atom, v_vatom, v_eatom);
-      } else {
-        energy_virial +=
-          compute_item<DeviceType,NEIGHFLAG,STACKPARAMS,TABSTYLE,1,0>(
-            i, nlocal, d_ilist, d_neighbors, d_numneigh, x, type,
-            mixWtSite1old, mixWtSite2old, mixWtSite1, mixWtSite2,
-            special_lj, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
-            d_table_const, eflag, eflag_atom,
-            vflag, vflag_global, vflag_atom, v_vatom, v_eatom);
-      }
     }, ev);
   } else {
     Kokkos::parallel_for(inum,
     LAMMPS_LAMBDA(int i) {
-      if (newton_pair) {
-        compute_item<DeviceType,NEIGHFLAG,STACKPARAMS,TABSTYLE,0,1>(
+        compute_item<DeviceType,NEIGHFLAG,STACKPARAMS,TABSTYLE,0,NEWTON_PAIR>(
             i, nlocal, d_ilist, d_neighbors, d_numneigh, x, type,
             mixWtSite1old, mixWtSite2old, mixWtSite1, mixWtSite2,
             special_lj, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
             d_table_const, eflag, eflag_atom,
             vflag, vflag_global, vflag_atom, v_vatom, v_eatom);
-      } else {
-        compute_item<DeviceType,NEIGHFLAG,STACKPARAMS,TABSTYLE,0,0>(
-            i, nlocal, d_ilist, d_neighbors, d_numneigh, x, type,
-            mixWtSite1old, mixWtSite2old, mixWtSite1, mixWtSite2,
-            special_lj, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
-            d_table_const, eflag, eflag_atom,
-            vflag, vflag_global, vflag_atom, v_vatom, v_eatom);
-      }
     });
   }
 }
@@ -678,55 +658,103 @@ void PairTableRXKokkos<DeviceType>::compute_style(int eflag_in, int vflag_in)
   EV_FLOAT ev;
   if(atom->ntypes > MAX_TYPES_STACKPARAMS) {
     if (neighflag == HALFTHREAD) {
-      compute_all_items<DeviceType,HALFTHREAD,false,TABSTYLE>(
-          newton_pair, ev, nlocal,
-          l->inum, l->d_ilist, l->d_neighbors, l->d_numneigh,
+      if (newton_pair) {
+        compute_all_items<DeviceType,HALFTHREAD,false,TABSTYLE,1>(
+          ev, nlocal, l->inum, l->d_ilist, l->d_neighbors, l->d_numneigh,
           x, type, mixWtSite1old, mixWtSite2old, mixWtSite1, mixWtSite2,
           special_lj_local, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
           d_table_const, eflag, eflag_atom,
           vflag, vflag_global, vflag_atom, d_vatom, d_eatom);
+      } else {
+        compute_all_items<DeviceType,HALFTHREAD,false,TABSTYLE,0>(
+          ev, nlocal, l->inum, l->d_ilist, l->d_neighbors, l->d_numneigh,
+          x, type, mixWtSite1old, mixWtSite2old, mixWtSite1, mixWtSite2,
+          special_lj_local, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
+          d_table_const, eflag, eflag_atom,
+          vflag, vflag_global, vflag_atom, d_vatom, d_eatom);
+      }
     } else if (neighflag == HALF) {
-      compute_all_items<DeviceType,HALF,false,TABSTYLE>(
-          newton_pair, ev, nlocal,
-          l->inum, l->d_ilist, l->d_neighbors, l->d_numneigh,
+      if (newton_pair) {
+        compute_all_items<DeviceType,HALF,false,TABSTYLE,1>(
+          ev, nlocal, l->inum, l->d_ilist, l->d_neighbors, l->d_numneigh,
           x, type, mixWtSite1old, mixWtSite2old, mixWtSite1, mixWtSite2,
           special_lj_local, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
           d_table_const, eflag, eflag_atom,
           vflag, vflag_global, vflag_atom, d_vatom, d_eatom);
+      } else {
+        compute_all_items<DeviceType,HALF,false,TABSTYLE,0>(
+          ev, nlocal, l->inum, l->d_ilist, l->d_neighbors, l->d_numneigh,
+          x, type, mixWtSite1old, mixWtSite2old, mixWtSite1, mixWtSite2,
+          special_lj_local, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
+          d_table_const, eflag, eflag_atom,
+          vflag, vflag_global, vflag_atom, d_vatom, d_eatom);
+      }
     } else if (neighflag == FULL) {
-      compute_all_items<DeviceType,FULL,false,TABSTYLE>(
-          newton_pair, ev, nlocal,
-          l->inum, l->d_ilist, l->d_neighbors, l->d_numneigh,
+      if (newton_pair) {
+        compute_all_items<DeviceType,FULL,false,TABSTYLE,1>(
+          ev, nlocal, l->inum, l->d_ilist, l->d_neighbors, l->d_numneigh,
           x, type, mixWtSite1old, mixWtSite2old, mixWtSite1, mixWtSite2,
           special_lj_local, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
           d_table_const, eflag, eflag_atom,
           vflag, vflag_global, vflag_atom, d_vatom, d_eatom);
+      } else {
+        compute_all_items<DeviceType,FULL,false,TABSTYLE,0>(
+          ev, nlocal, l->inum, l->d_ilist, l->d_neighbors, l->d_numneigh,
+          x, type, mixWtSite1old, mixWtSite2old, mixWtSite1, mixWtSite2,
+          special_lj_local, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
+          d_table_const, eflag, eflag_atom,
+          vflag, vflag_global, vflag_atom, d_vatom, d_eatom);
+      }
     }
   } else {
     if (neighflag == HALFTHREAD) {
-      compute_all_items<DeviceType,HALFTHREAD,true,TABSTYLE>(
-          newton_pair, ev, nlocal,
-          l->inum, l->d_ilist, l->d_neighbors, l->d_numneigh,
+      if (newton_pair) {
+        compute_all_items<DeviceType,HALFTHREAD,true,TABSTYLE,1>(
+          ev, nlocal, l->inum, l->d_ilist, l->d_neighbors, l->d_numneigh,
           x, type, mixWtSite1old, mixWtSite2old, mixWtSite1, mixWtSite2,
           special_lj_local, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
           d_table_const, eflag, eflag_atom,
           vflag, vflag_global, vflag_atom, d_vatom, d_eatom);
+      } else {
+        compute_all_items<DeviceType,HALFTHREAD,true,TABSTYLE,0>(
+          ev, nlocal, l->inum, l->d_ilist, l->d_neighbors, l->d_numneigh,
+          x, type, mixWtSite1old, mixWtSite2old, mixWtSite1, mixWtSite2,
+          special_lj_local, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
+          d_table_const, eflag, eflag_atom,
+          vflag, vflag_global, vflag_atom, d_vatom, d_eatom);
+      }
     } else if (neighflag == HALF) {
-      compute_all_items<DeviceType,HALF,true,TABSTYLE>(
-          newton_pair, ev, nlocal,
-          l->inum, l->d_ilist, l->d_neighbors, l->d_numneigh,
+      if (newton_pair) {
+        compute_all_items<DeviceType,HALF,true,TABSTYLE,1>(
+          ev, nlocal, l->inum, l->d_ilist, l->d_neighbors, l->d_numneigh,
           x, type, mixWtSite1old, mixWtSite2old, mixWtSite1, mixWtSite2,
           special_lj_local, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
           d_table_const, eflag, eflag_atom,
           vflag, vflag_global, vflag_atom, d_vatom, d_eatom);
+      } else {
+        compute_all_items<DeviceType,HALF,true,TABSTYLE,0>(
+          ev, nlocal, l->inum, l->d_ilist, l->d_neighbors, l->d_numneigh,
+          x, type, mixWtSite1old, mixWtSite2old, mixWtSite1, mixWtSite2,
+          special_lj_local, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
+          d_table_const, eflag, eflag_atom,
+          vflag, vflag_global, vflag_atom, d_vatom, d_eatom);
+      }
     } else if (neighflag == FULL) {
-      compute_all_items<DeviceType,FULL,true,TABSTYLE>(
-          newton_pair, ev, nlocal,
-          l->inum, l->d_ilist, l->d_neighbors, l->d_numneigh,
+      if (newton_pair) {
+        compute_all_items<DeviceType,FULL,true,TABSTYLE,1>(
+          ev, nlocal, l->inum, l->d_ilist, l->d_neighbors, l->d_numneigh,
           x, type, mixWtSite1old, mixWtSite2old, mixWtSite1, mixWtSite2,
           special_lj_local, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
           d_table_const, eflag, eflag_atom,
           vflag, vflag_global, vflag_atom, d_vatom, d_eatom);
+      } else {
+        compute_all_items<DeviceType,FULL,true,TABSTYLE,0>(
+          ev, nlocal, l->inum, l->d_ilist, l->d_neighbors, l->d_numneigh,
+          x, type, mixWtSite1old, mixWtSite2old, mixWtSite1, mixWtSite2,
+          special_lj_local, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
+          d_table_const, eflag, eflag_atom,
+          vflag, vflag_global, vflag_atom, d_vatom, d_eatom);
+      }
     }
   }
 

From d2cbfef13bac634b99459f3f4f78465ed03e712d Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Mon, 13 Mar 2017 09:01:35 -0600
Subject: [PATCH 202/267] Add CUDA support to atom_vec_hybrid_kokkos

---
 src/KOKKOS/atom_vec_hybrid_kokkos.cpp | 37 +++++++++++++++++++++++----
 src/KOKKOS/atom_vec_hybrid_kokkos.h   |  2 +-
 2 files changed, 33 insertions(+), 6 deletions(-)

diff --git a/src/KOKKOS/atom_vec_hybrid_kokkos.cpp b/src/KOKKOS/atom_vec_hybrid_kokkos.cpp
index 0c9d261be5..e5e361e70a 100644
--- a/src/KOKKOS/atom_vec_hybrid_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_hybrid_kokkos.cpp
@@ -13,7 +13,6 @@
 
 #include <stdlib.h>
 #include <string.h>
-#include "atom_vec_kokkos.h"
 #include "atom_vec_hybrid_kokkos.h"
 #include "atom_kokkos.h"
 #include "domain.h"
@@ -21,6 +20,7 @@
 #include "fix.h"
 #include "memory.h"
 #include "error.h"
+#include "atom_masks.h"
 
 using namespace LAMMPS_NS;
 
@@ -132,10 +132,6 @@ void AtomVecHybridKokkos::init()
 {
   AtomVec::init();
   for (int k = 0; k < nstyles; k++) styles[k]->init();
-
-#ifdef KOKKOS_HAVE_CUDA
-  error->all(FLERR,"AtomVecHybridKokkos doesn't yet support CUDA");
-#endif
 }
 
 /* ----------------------------------------------------------------------
@@ -303,6 +299,8 @@ int AtomVecHybridKokkos::unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf, int
 int AtomVecHybridKokkos::pack_comm(int n, int *list, double *buf,
                              int pbc_flag, int *pbc)
 {
+  sync(Host,X_MASK);
+
   int i,j,k,m;
   double dx,dy,dz;
 
@@ -345,6 +343,8 @@ int AtomVecHybridKokkos::pack_comm(int n, int *list, double *buf,
 int AtomVecHybridKokkos::pack_comm_vel(int n, int *list, double *buf,
                                  int pbc_flag, int *pbc)
 {
+  sync(Host,X_MASK|V_MASK|OMEGA_MASK/*|ANGMOM_MASK*/);
+
   int i,j,k,m;
   double dx,dy,dz,dvx,dvy,dvz;
   int omega_flag = atom->omega_flag;
@@ -455,6 +455,8 @@ void AtomVecHybridKokkos::unpack_comm(int n, int first, double *buf)
     h_x(i,2) = buf[m++];
   }
 
+  modified(Host,X_MASK);
+
   // unpack sub-style contributions as contiguous chunks
 
   for (k = 0; k < nstyles; k++)
@@ -490,6 +492,8 @@ void AtomVecHybridKokkos::unpack_comm_vel(int n, int first, double *buf)
     }
   }
 
+  modified(Host,X_MASK|V_MASK|OMEGA_MASK/*|ANGMOM_MASK*/);
+
   // unpack sub-style contributions as contiguous chunks
 
   for (k = 0; k < nstyles; k++)
@@ -500,6 +504,8 @@ void AtomVecHybridKokkos::unpack_comm_vel(int n, int first, double *buf)
 
 int AtomVecHybridKokkos::pack_reverse(int n, int first, double *buf)
 {
+  sync(Host,F_MASK);
+
   int i,k,m,last;
 
   m = 0;
@@ -532,6 +538,8 @@ void AtomVecHybridKokkos::unpack_reverse(int n, int *list, double *buf)
     h_f(j,2) += buf[m++];
   }
 
+  modified(Host,F_MASK);
+
   // unpack sub-style contributions as contiguous chunks
 
   for (k = 0; k < nstyles; k++)
@@ -543,6 +551,8 @@ void AtomVecHybridKokkos::unpack_reverse(int n, int *list, double *buf)
 int AtomVecHybridKokkos::pack_border(int n, int *list, double *buf,
                                int pbc_flag, int *pbc)
 {
+  sync(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK);
+
   int i,j,k,m;
   double dx,dy,dz;
 
@@ -595,6 +605,7 @@ int AtomVecHybridKokkos::pack_border(int n, int *list, double *buf,
 int AtomVecHybridKokkos::pack_border_vel(int n, int *list, double *buf,
                                    int pbc_flag, int *pbc)
 {
+  sync(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|V_MASK|OMEGA_MASK/*|ANGMOM_MASK*/);
   int i,j,k,m;
   double dx,dy,dz,dvx,dvy,dvz;
   int omega_flag = atom->omega_flag;
@@ -722,6 +733,8 @@ void AtomVecHybridKokkos::unpack_border(int n, int first, double *buf)
     h_mask[i] = (int) ubuf(buf[m++]).i;
   }
 
+  modified(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK);
+
   // unpack sub-style contributions as contiguous chunks
 
   for (k = 0; k < nstyles; k++)
@@ -766,6 +779,8 @@ void AtomVecHybridKokkos::unpack_border_vel(int n, int first, double *buf)
     }
   }
 
+  modified(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|V_MASK|OMEGA_MASK/*|ANGMOM_MASK*/);
+
   // unpack sub-style contributions as contiguous chunks
 
   for (k = 0; k < nstyles; k++)
@@ -946,6 +961,8 @@ void AtomVecHybridKokkos::create_atom(int itype, double *coord)
 
 void AtomVecHybridKokkos::data_atom(double *coord, imageint imagetmp, char **values)
 {
+  sync(Host,X_MASK|TAG_MASK|TYPE_MASK|IMAGE_MASK|MASK_MASK|V_MASK|OMEGA_MASK/*|ANGMOM_MASK*/);
+
   int nlocal = atom->nlocal;
   if (nlocal == nmax) grow(0);
 
@@ -975,6 +992,8 @@ void AtomVecHybridKokkos::data_atom(double *coord, imageint imagetmp, char **val
     h_angmom(nlocal,2) = 0.0;
   }
 
+  modified(Host,X_MASK|TAG_MASK|TYPE_MASK|IMAGE_MASK|MASK_MASK|V_MASK|OMEGA_MASK/*|ANGMOM_MASK*/);
+
   // each sub-style parses sub-style specific values
 
   int m = 5;
@@ -990,10 +1009,14 @@ void AtomVecHybridKokkos::data_atom(double *coord, imageint imagetmp, char **val
 
 void AtomVecHybridKokkos::data_vel(int m, char **values)
 {
+  sync(Host,V_MASK);
+
   h_v(m,0) = atof(values[0]);
   h_v(m,1) = atof(values[1]);
   h_v(m,2) = atof(values[2]);
 
+  modified(Host,V_MASK);
+
   // each sub-style parses sub-style specific values
 
   int n = 3;
@@ -1007,6 +1030,8 @@ void AtomVecHybridKokkos::data_vel(int m, char **values)
 
 void AtomVecHybridKokkos::pack_data(double **buf)
 {
+  sync(Host,TAG_MASK|TYPE_MASK|X_MASK);
+
   int k,m;
 
   int nlocal = atom->nlocal;
@@ -1056,6 +1081,8 @@ void AtomVecHybridKokkos::write_data(FILE *fp, int n, double **buf)
 
 void AtomVecHybridKokkos::pack_vel(double **buf)
 {
+  sync(Host,V_MASK);
+
   int k,m;
 
   int nlocal = atom->nlocal;
diff --git a/src/KOKKOS/atom_vec_hybrid_kokkos.h b/src/KOKKOS/atom_vec_hybrid_kokkos.h
index 802314bfa6..fcf48f6c74 100644
--- a/src/KOKKOS/atom_vec_hybrid_kokkos.h
+++ b/src/KOKKOS/atom_vec_hybrid_kokkos.h
@@ -21,7 +21,7 @@ AtomStyle(hybrid/kk,AtomVecHybridKokkos)
 #define LMP_ATOM_VEC_HYBRID_KOKKOS_H
 
 #include <stdio.h>
-#include "atom_vec.h"
+#include "atom_vec_kokkos.h"
 #include "kokkos_type.h"
 
 namespace LAMMPS_NS {

From 5925460a275fe4cf588e86eab45242351f5e86cf Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Tue, 14 Mar 2017 14:27:23 -0500
Subject: [PATCH 203/267] Improve the performance of read_data of gzip'ed files
 using taskset. Normally, the gzip process would be pinned to the same core as
 the MPI rank 0 process, which makes the pipe stay in one core's cache, but
 forces the two processes to fight for that core, slowing things down.

---
 src/read_data.cpp | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/read_data.cpp b/src/read_data.cpp
index d6a33d6e9d..3e180b7aeb 100644
--- a/src/read_data.cpp
+++ b/src/read_data.cpp
@@ -50,7 +50,7 @@ using namespace LAMMPS_NS;
 
 #define MAXLINE 256
 #define LB_FACTOR 1.1
-#define CHUNK 1024
+#define CHUNK 4096
 #define DELTA 4            // must be 2 or larger
 #define MAXBODY 32         // max # of lines in one body
 
@@ -1856,8 +1856,12 @@ void ReadData::open(char *file)
   if (!compressed) fp = fopen(file,"r");
   else {
 #ifdef LAMMPS_GZIP
-    char gunzip[128];
-    sprintf(gunzip,"gzip -c -d %s",file);
+    char gunzip[2048];
+    // Use taskset to force the gzip process to NOT run on the 0th "CPU", which should
+    // keep it from thrashing with the MPI rank zero process (the one reading the pipe).
+    // This is Linux specific, and the 1023 upper range might also be system specific.
+    // Use of something like hwloc would be more portable... but more complicated.
+    sprintf(gunzip,"taskset -c 1-1023 gzip -c -d %s",file);
 
 #ifdef _WIN32
     fp = _popen(gunzip,"rb");

From f4a08ba4fcaed73f5bc9660266a75ddc868c9c3d Mon Sep 17 00:00:00 2001
From: Dan Ibanez <daibane@sandia.gov>
Date: Wed, 15 Mar 2017 09:25:16 -0600
Subject: [PATCH 204/267] pass Views by reference for pair_table_rx_kokkos

this greatly speeds up pair_table_rx_kokkos,
and should put it on par with pair_table_rx
in the Serial case
---
 src/KOKKOS/neigh_list_kokkos.h      |  4 +--
 src/KOKKOS/pair_table_rx_kokkos.cpp | 44 ++++++++++++++---------------
 2 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/src/KOKKOS/neigh_list_kokkos.h b/src/KOKKOS/neigh_list_kokkos.h
index b43e1106f2..cece97197d 100644
--- a/src/KOKKOS/neigh_list_kokkos.h
+++ b/src/KOKKOS/neigh_list_kokkos.h
@@ -89,8 +89,8 @@ public:
 
   KOKKOS_INLINE_FUNCTION
   static AtomNeighborsConst static_neighbors_const(int i,
-           typename ArrayTypes<Device>::t_neighbors_2d_const d_neighbors,
-           typename ArrayTypes<Device>::t_int_1d_const d_numneigh) {
+           typename ArrayTypes<Device>::t_neighbors_2d_const const& d_neighbors,
+           typename ArrayTypes<Device>::t_int_1d_const const& d_numneigh) {
     return AtomNeighborsConst(&d_neighbors(i,0),d_numneigh(i),
                               &d_neighbors(i,1)-&d_neighbors(i,0));
   }
diff --git a/src/KOKKOS/pair_table_rx_kokkos.cpp b/src/KOKKOS/pair_table_rx_kokkos.cpp
index e93ea53fa4..044f303bf5 100644
--- a/src/KOKKOS/pair_table_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_table_rx_kokkos.cpp
@@ -193,7 +193,7 @@ KOKKOS_INLINE_FUNCTION
 static F_FLOAT
 compute_fpair(F_FLOAT rsq,
               int itype, int jtype,
-              typename PairTableRXKokkos<DeviceType>::TableDeviceConst d_table_const
+              typename PairTableRXKokkos<DeviceType>::TableDeviceConst const& d_table_const
               ) {
   Pair::union_int_float_t rsq_lookup;
   double fpair;
@@ -228,7 +228,7 @@ static F_FLOAT
 compute_evdwl(
     F_FLOAT rsq,
     int itype, int jtype,
-    typename PairTableRXKokkos<DeviceType>::TableDeviceConst d_table_const
+    typename PairTableRXKokkos<DeviceType>::TableDeviceConst const& d_table_const
     ) {
   double evdwl;
   Pair::union_int_float_t rsq_lookup;
@@ -274,11 +274,11 @@ ev_tally(
     Kokkos::View<F_FLOAT*[6],
                  typename ArrayTypes<DeviceType>::t_virial_array::array_layout,
                  DeviceType,
-                 Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > v_vatom,
+                 Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > const& v_vatom,
     Kokkos::View<E_FLOAT*,
                  typename ArrayTypes<DeviceType>::t_efloat_1d::array_layout,
                  DeviceType,
-                 Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > v_eatom)
+                 Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > const& v_eatom)
 {
   if (eflag) {
     if (eflag_atom) {
@@ -374,32 +374,32 @@ static EV_FLOAT
 compute_item(
     int ii,
     int nlocal,
-    typename ArrayTypes<DeviceType>::t_int_1d_const d_ilist,
-    typename ArrayTypes<DeviceType>::t_neighbors_2d_const d_neighbors,
-    typename ArrayTypes<DeviceType>::t_int_1d_const d_numneigh,
-    typename ArrayTypes<DeviceType>::t_x_array_randomread x,
-    typename ArrayTypes<DeviceType>::t_int_1d_randomread type,
-    Kokkos::View<double*, DeviceType> mixWtSite1old,
-    Kokkos::View<double*, DeviceType> mixWtSite2old,
-    Kokkos::View<double*, DeviceType> mixWtSite1,
-    Kokkos::View<double*, DeviceType> mixWtSite2,
-    Few<int, 4> special_lj,
-    Few<Few<F_FLOAT, MAX_TYPES_STACKPARAMS+1>, MAX_TYPES_STACKPARAMS+1> m_cutsq,
-    typename ArrayTypes<DeviceType>::t_ffloat_2d d_cutsq,
+    typename ArrayTypes<DeviceType>::t_int_1d_const const& d_ilist,
+    typename ArrayTypes<DeviceType>::t_neighbors_2d_const const& d_neighbors,
+    typename ArrayTypes<DeviceType>::t_int_1d_const const& d_numneigh,
+    typename ArrayTypes<DeviceType>::t_x_array_randomread const& x,
+    typename ArrayTypes<DeviceType>::t_int_1d_randomread const& type,
+    Kokkos::View<double*, DeviceType> const& mixWtSite1old,
+    Kokkos::View<double*, DeviceType> const& mixWtSite2old,
+    Kokkos::View<double*, DeviceType> const& mixWtSite1,
+    Kokkos::View<double*, DeviceType> const& mixWtSite2,
+    Few<int, 4> const& special_lj,
+    Few<Few<F_FLOAT, MAX_TYPES_STACKPARAMS+1>, MAX_TYPES_STACKPARAMS+1> const& m_cutsq,
+    typename ArrayTypes<DeviceType>::t_ffloat_2d const& d_cutsq,
     Kokkos::View<F_FLOAT*[3],
       typename ArrayTypes<DeviceType>::t_f_array::array_layout,
       DeviceType,
-      Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > f,
+      Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > const& f,
     Kokkos::View<E_FLOAT*,
                  typename ArrayTypes<DeviceType>::t_efloat_1d::array_layout,
                  DeviceType,
-                 Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > uCG,
+                 Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > const& uCG,
     Kokkos::View<E_FLOAT*,
                  typename ArrayTypes<DeviceType>::t_efloat_1d::array_layout,
                  DeviceType,
-                 Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > uCGnew,
+                 Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > const& uCGnew,
     int isite1, int isite2,
-    typename PairTableRXKokkos<DeviceType>::TableDeviceConst d_table_const,
+    typename PairTableRXKokkos<DeviceType>::TableDeviceConst const& d_table_const,
     int eflag,
     int eflag_atom,
     int vflag,
@@ -408,11 +408,11 @@ compute_item(
     Kokkos::View<F_FLOAT*[6],
                  typename ArrayTypes<DeviceType>::t_virial_array::array_layout,
                  DeviceType,
-                 Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > v_vatom,
+                 Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > const& v_vatom,
     Kokkos::View<E_FLOAT*,
                  typename ArrayTypes<DeviceType>::t_efloat_1d::array_layout,
                  DeviceType,
-                 Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > v_eatom) {
+                 Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > const& v_eatom) {
   EV_FLOAT ev;
   auto i = d_ilist(ii);
   auto xtmp = x(i,0);

From 7ebed717de983c351208b8e2080b37fbf761d522 Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Wed, 15 Mar 2017 16:05:51 -0600
Subject: [PATCH 205/267] Adding gb_test

---
 src/KOKKOS/kokkos.cpp              |  10 +-
 src/KOKKOS/kokkos.h                |   1 +
 src/KOKKOS/pair_exp6_rx_kokkos.cpp | 425 +++++++++++++++++++++++++++++
 src/KOKKOS/pair_exp6_rx_kokkos.h   |  27 ++
 4 files changed, 462 insertions(+), 1 deletion(-)

diff --git a/src/KOKKOS/kokkos.cpp b/src/KOKKOS/kokkos.cpp
index b8be74ac1e..a000ad5550 100644
--- a/src/KOKKOS/kokkos.cpp
+++ b/src/KOKKOS/kokkos.cpp
@@ -34,6 +34,7 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
   lmp->kokkos = this;
 
   auto_sync = 1;
+  gb_test = 0;
 
   int me = 0;
   MPI_Comm_rank(world,&me);
@@ -156,6 +157,7 @@ void KokkosLMP::accelerator(int narg, char **arg)
   neighflag = FULL;
   neighflag_qeq = FULL;
   neighflag_qeq_set = 0;
+  gb_test = 0;
   int newtonflag = 0;
   double binsize = 0.0;
   exchange_comm_classic = forward_comm_classic = 0;
@@ -197,6 +199,12 @@ void KokkosLMP::accelerator(int narg, char **arg)
       else if (strcmp(arg[iarg+1],"on") == 0) newtonflag = 1;
       else error->all(FLERR,"Illegal package kokkos command");
       iarg += 2;
+    } else if (strcmp(arg[iarg],"gb/test") == 0) {
+      if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command");
+      if (strcmp(arg[iarg+1],"off") == 0) gb_test = 0;
+      else if (strcmp(arg[iarg+1],"on") == 0) gb_test = 1;
+      else error->all(FLERR,"Illegal package kokkos command");
+      iarg += 2;
     } else if (strcmp(arg[iarg],"comm") == 0) {
       if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command");
       if (strcmp(arg[iarg+1],"no") == 0) {
@@ -293,4 +301,4 @@ void KokkosLMP::my_signal_handler(int sig)
   if (sig == SIGSEGV) {
     kill(getpid(),SIGABRT);
   }
-}
\ No newline at end of file
+}
diff --git a/src/KOKKOS/kokkos.h b/src/KOKKOS/kokkos.h
index 8e28b38cbf..3784d806bf 100644
--- a/src/KOKKOS/kokkos.h
+++ b/src/KOKKOS/kokkos.h
@@ -32,6 +32,7 @@ class KokkosLMP : protected Pointers {
   int num_threads,ngpu;
   int numa;
   int auto_sync;
+  int gb_test;
 
   KokkosLMP(class LAMMPS *, int, char **);
   ~KokkosLMP();
diff --git a/src/KOKKOS/pair_exp6_rx_kokkos.cpp b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
index abc158d72c..8cf235964c 100644
--- a/src/KOKKOS/pair_exp6_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
@@ -205,6 +205,8 @@ void PairExp6rxKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
 
   EV_FLOAT ev;
 
+  if (!lmp->kokkos->gb_test) {
+
   if (neighflag == HALF) {
     if (newton_pair) {
       if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairExp6rxCompute<HALF,1,1> >(0,inum),*this,ev);
@@ -231,6 +233,48 @@ void PairExp6rxKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
     }
   }
 
+  } else { // No atomics
+
+  num_threads = lmp->kokkos->num_threads;
+  int nmax = f.dimension_1();
+  if (nmax > t_f.dimension_1()) {
+    t_f = t_f_array_thread("pair_exp6_rx:t_f",num_threads,nmax);
+    t_uCG = t_efloat_1d_thread("pair_exp6_rx:t_uCG",num_threads,nmax);
+    t_uCGnew = t_efloat_1d_thread("pair_exp6_rx:t_UCGnew",num_threads,nmax);
+  }
+
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairExp6rxZeroDupViews>(0,nmax),*this);
+
+  if (neighflag == HALF) {
+    if (newton_pair) {
+      if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairExp6rxComputeNoAtomics<HALF,1,1> >(0,inum),*this,ev);
+      else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairExp6rxComputeNoAtomics<HALF,1,0> >(0,inum),*this);
+    } else {
+      if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairExp6rxComputeNoAtomics<HALF,0,1> >(0,inum),*this,ev);
+      else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairExp6rxComputeNoAtomics<HALF,0,0> >(0,inum),*this);
+    }
+  } else if (neighflag == HALFTHREAD) {
+    if (newton_pair) {
+      if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairExp6rxComputeNoAtomics<HALFTHREAD,1,1> >(0,inum),*this,ev);
+      else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairExp6rxComputeNoAtomics<HALFTHREAD,1,0> >(0,inum),*this);
+    } else {
+      if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairExp6rxComputeNoAtomics<HALFTHREAD,0,1> >(0,inum),*this,ev);
+      else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairExp6rxComputeNoAtomics<HALFTHREAD,0,0> >(0,inum),*this);
+    }
+  } else if (neighflag == FULL) {
+    if (newton_pair) {
+      if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairExp6rxComputeNoAtomics<FULL,1,1> >(0,inum),*this,ev);
+      else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairExp6rxComputeNoAtomics<FULL,1,0> >(0,inum),*this);
+    } else {
+      if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairExp6rxComputeNoAtomics<FULL,0,1> >(0,inum),*this,ev);
+      else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairExp6rxComputeNoAtomics<FULL,0,0> >(0,inum),*this);
+    }
+  }
+
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairExp6rxCollapseDupViews>(0,nmax),*this);
+
+  }
+
   k_error_flag.template modify<DeviceType>();
   k_error_flag.template sync<LMPHostType>();
   if (k_error_flag.h_view())
@@ -636,6 +680,387 @@ void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxCompute<NEIGHFLAG,NEW
   this->template operator()<NEIGHFLAG,NEWTON_PAIR,EVFLAG>(TagPairExp6rxCompute<NEIGHFLAG,NEWTON_PAIR,EVFLAG>(), ii, ev);
 }
 
+// Experimental thread-safety using duplicated data instead of atomics
+
+template<class DeviceType>
+template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
+KOKKOS_INLINE_FUNCTION
+void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxComputeNoAtomics<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int &ii, EV_FLOAT& ev) const {
+
+  int tid = 0;
+#ifndef KOKKOS_HAVE_CUDA
+  tid = DeviceType::hardware_thread_id();
+#endif
+
+  int i,jj,jnum,itype,jtype;
+  double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,evdwlOld,fpair;
+  double rsq,r2inv,r6inv,forceExp6,factor_lj;
+  double rCut,rCutInv,rCut2inv,rCut6inv,rCutExp,urc,durc;
+  double rm2ij,rm6ij;
+  double r,rexp;
+
+  double alphaOld12_ij, rmOld12_ij, epsilonOld12_ij;
+  double alphaOld21_ij, rmOld21_ij, epsilonOld21_ij;
+  double alpha12_ij, rm12_ij, epsilon12_ij;
+  double alpha21_ij, rm21_ij, epsilon21_ij;
+  double rminv, buck1, buck2;
+  double epsilonOld1_i,alphaOld1_i,rmOld1_i;
+  double epsilonOld1_j,alphaOld1_j,rmOld1_j;
+  double epsilonOld2_i,alphaOld2_i,rmOld2_i;
+  double epsilonOld2_j,alphaOld2_j,rmOld2_j;
+  double epsilon1_i,alpha1_i,rm1_i;
+  double epsilon1_j,alpha1_j,rm1_j;
+  double epsilon2_i,alpha2_i,rm2_i;
+  double epsilon2_j,alpha2_j,rm2_j;
+  double evdwlOldEXP6_12, evdwlOldEXP6_21, fpairOldEXP6_12, fpairOldEXP6_21;
+  double evdwlEXP6_12, evdwlEXP6_21;
+  double mixWtSite1old_i, mixWtSite1old_j;
+  double mixWtSite2old_i, mixWtSite2old_j;
+  double mixWtSite1_i, mixWtSite1_j;
+  double mixWtSite2_i, mixWtSite2_j;
+
+  const int nRep = 12;
+  const double shift = 1.05;
+  double rin1, aRep, uin1, win1, uin1rep, rin1exp, rin6, rin6inv;
+
+  evdwlOld = 0.0;
+  evdwl = 0.0;
+
+  i = d_ilist[ii];
+  xtmp = x(i,0);
+  ytmp = x(i,1);
+  ztmp = x(i,2);
+  itype = type[i];
+  jnum = d_numneigh[i];
+
+  double fx_i = 0.0;
+  double fy_i = 0.0;
+  double fz_i = 0.0;
+  double uCG_i = 0.0;
+  double uCGnew_i = 0.0;
+
+  {
+     epsilon1_i     = PairExp6ParamData.epsilon1[i];
+     alpha1_i       = PairExp6ParamData.alpha1[i];
+     rm1_i          = PairExp6ParamData.rm1[i];
+     mixWtSite1_i    = PairExp6ParamData.mixWtSite1[i];
+     epsilon2_i     = PairExp6ParamData.epsilon2[i];
+     alpha2_i       = PairExp6ParamData.alpha2[i];
+     rm2_i          = PairExp6ParamData.rm2[i];
+     mixWtSite2_i    = PairExp6ParamData.mixWtSite2[i];
+     epsilonOld1_i  = PairExp6ParamData.epsilonOld1[i];
+     alphaOld1_i    = PairExp6ParamData.alphaOld1[i];
+     rmOld1_i       = PairExp6ParamData.rmOld1[i];
+     mixWtSite1old_i = PairExp6ParamData.mixWtSite1old[i];
+     epsilonOld2_i  = PairExp6ParamData.epsilonOld2[i];
+     alphaOld2_i    = PairExp6ParamData.alphaOld2[i];
+     rmOld2_i       = PairExp6ParamData.rmOld2[i];
+     mixWtSite2old_i = PairExp6ParamData.mixWtSite2old[i];
+  }
+
+  for (jj = 0; jj < jnum; jj++) {
+    int j = d_neighbors(i,jj);
+    factor_lj = special_lj[sbmask(j)];
+    j &= NEIGHMASK;
+
+    delx = xtmp - x(j,0);
+    dely = ytmp - x(j,1);
+    delz = ztmp - x(j,2);
+
+    rsq = delx*delx + dely*dely + delz*delz;
+    jtype = type[j];
+
+    if (rsq < d_cutsq(itype,jtype)) { // optimize
+      r2inv = 1.0/rsq;
+      r6inv = r2inv*r2inv*r2inv;
+
+      r = sqrt(rsq);
+      rCut2inv = 1.0/d_cutsq(itype,jtype);
+      rCut6inv = rCut2inv*rCut2inv*rCut2inv;
+      rCut = sqrt(d_cutsq(itype,jtype));
+      rCutInv = 1.0/rCut;
+
+      //
+      // A. Compute the exp-6 potential
+      //
+
+      // A1.  Get alpha, epsilon and rm for particle j
+
+      {
+         epsilon1_j     = PairExp6ParamData.epsilon1[j];
+         alpha1_j       = PairExp6ParamData.alpha1[j];
+         rm1_j          = PairExp6ParamData.rm1[j];
+         mixWtSite1_j    = PairExp6ParamData.mixWtSite1[j];
+         epsilon2_j     = PairExp6ParamData.epsilon2[j];
+         alpha2_j       = PairExp6ParamData.alpha2[j];
+         rm2_j          = PairExp6ParamData.rm2[j];
+         mixWtSite2_j    = PairExp6ParamData.mixWtSite2[j];
+         epsilonOld1_j  = PairExp6ParamData.epsilonOld1[j];
+         alphaOld1_j    = PairExp6ParamData.alphaOld1[j];
+         rmOld1_j       = PairExp6ParamData.rmOld1[j];
+         mixWtSite1old_j = PairExp6ParamData.mixWtSite1old[j];
+         epsilonOld2_j  = PairExp6ParamData.epsilonOld2[j];
+         alphaOld2_j    = PairExp6ParamData.alphaOld2[j];
+         rmOld2_j       = PairExp6ParamData.rmOld2[j];
+         mixWtSite2old_j = PairExp6ParamData.mixWtSite2old[j];
+      }
+
+      // A2.  Apply Lorentz-Berthelot mixing rules for the i-j pair
+      alphaOld12_ij = sqrt(alphaOld1_i*alphaOld2_j);
+      rmOld12_ij = 0.5*(rmOld1_i + rmOld2_j);
+      epsilonOld12_ij = sqrt(epsilonOld1_i*epsilonOld2_j);
+      alphaOld21_ij = sqrt(alphaOld2_i*alphaOld1_j);
+      rmOld21_ij = 0.5*(rmOld2_i + rmOld1_j);
+      epsilonOld21_ij = sqrt(epsilonOld2_i*epsilonOld1_j);
+
+      alpha12_ij = sqrt(alpha1_i*alpha2_j);
+      rm12_ij = 0.5*(rm1_i + rm2_j);
+      epsilon12_ij = sqrt(epsilon1_i*epsilon2_j);
+      alpha21_ij = sqrt(alpha2_i*alpha1_j);
+      rm21_ij = 0.5*(rm2_i + rm1_j);
+      epsilon21_ij = sqrt(epsilon2_i*epsilon1_j);
+
+      evdwlOldEXP6_12 = 0.0;
+      evdwlOldEXP6_21 = 0.0;
+      evdwlEXP6_12 = 0.0;
+      evdwlEXP6_21 = 0.0;
+      fpairOldEXP6_12 = 0.0;
+      fpairOldEXP6_21 = 0.0;
+
+      if(rmOld12_ij!=0.0 && rmOld21_ij!=0.0){
+        if(alphaOld21_ij == 6.0 || alphaOld12_ij == 6.0)
+          k_error_flag.d_view() = 1;
+
+        // A3.  Compute some convenient quantities for evaluating the force
+        rminv = 1.0/rmOld12_ij;
+        buck1 = epsilonOld12_ij / (alphaOld12_ij - 6.0);
+        rexp = expValue(alphaOld12_ij*(1.0-r*rminv));
+        rm2ij = rmOld12_ij*rmOld12_ij;
+        rm6ij = rm2ij*rm2ij*rm2ij;
+
+        // Compute the shifted potential
+        rCutExp = expValue(alphaOld12_ij*(1.0-rCut*rminv));
+        buck2 = 6.0*alphaOld12_ij;
+        urc = buck1*(6.0*rCutExp - alphaOld12_ij*rm6ij*rCut6inv);
+        durc = -buck1*buck2*(rCutExp* rminv - rCutInv*rm6ij*rCut6inv);
+        rin1 = shift*rmOld12_ij*func_rin(alphaOld12_ij);
+        if(r < rin1){
+          rin6 = rin1*rin1*rin1*rin1*rin1*rin1;
+          rin6inv = 1.0/rin6;
+
+          rin1exp = expValue(alphaOld12_ij*(1.0-rin1*rminv));
+
+          uin1 = buck1*(6.0*rin1exp - alphaOld12_ij*rm6ij*rin6inv) - urc - durc*(rin1-rCut);
+
+          win1 = buck1*buck2*(rin1*rin1exp*rminv - rm6ij*rin6inv) + rin1*durc;
+
+          aRep = win1*powint(rin1,nRep)/nRep;
+
+          uin1rep = aRep/powint(rin1,nRep);
+
+          forceExp6 = double(nRep)*aRep/powint(r,nRep);
+          fpairOldEXP6_12 = factor_lj*forceExp6*r2inv;
+
+          evdwlOldEXP6_12 = uin1 - uin1rep + aRep/powint(r,nRep);
+        } else {
+          forceExp6 = buck1*buck2*(r*rexp*rminv - rm6ij*r6inv) + r*durc;
+          fpairOldEXP6_12 = factor_lj*forceExp6*r2inv;
+
+          evdwlOldEXP6_12 = buck1*(6.0*rexp - alphaOld12_ij*rm6ij*r6inv) - urc - durc*(r-rCut);
+        }
+
+        // A3.  Compute some convenient quantities for evaluating the force
+        rminv = 1.0/rmOld21_ij;
+        buck1 = epsilonOld21_ij / (alphaOld21_ij - 6.0);
+        buck2 = 6.0*alphaOld21_ij;
+        rexp = expValue(alphaOld21_ij*(1.0-r*rminv));
+        rm2ij = rmOld21_ij*rmOld21_ij;
+        rm6ij = rm2ij*rm2ij*rm2ij;
+
+        // Compute the shifted potential
+        rCutExp = expValue(alphaOld21_ij*(1.0-rCut*rminv));
+        buck2 = 6.0*alphaOld21_ij;
+        urc = buck1*(6.0*rCutExp - alphaOld21_ij*rm6ij*rCut6inv);
+        durc = -buck1*buck2*(rCutExp* rminv - rCutInv*rm6ij*rCut6inv);
+        rin1 = shift*rmOld21_ij*func_rin(alphaOld21_ij);
+
+        if(r < rin1){
+          rin6 = rin1*rin1*rin1*rin1*rin1*rin1;
+          rin6inv = 1.0/rin6;
+
+          rin1exp = expValue(alphaOld21_ij*(1.0-rin1*rminv));
+
+          uin1 = buck1*(6.0*rin1exp - alphaOld21_ij*rm6ij*rin6inv) - urc - durc*(rin1-rCut);
+
+          win1 = buck1*buck2*(rin1*rin1exp*rminv - rm6ij*rin6inv) + rin1*durc;
+
+          aRep = win1*powint(rin1,nRep)/nRep;
+
+          uin1rep = aRep/powint(rin1,nRep);
+
+          forceExp6 = double(nRep)*aRep/powint(r,nRep);
+          fpairOldEXP6_21 = factor_lj*forceExp6*r2inv;
+
+          evdwlOldEXP6_21 = uin1 - uin1rep + aRep/powint(r,nRep);
+        } else {
+          forceExp6 = buck1*buck2*(r*rexp*rminv - rm6ij*r6inv) + r*durc;
+          fpairOldEXP6_21 = factor_lj*forceExp6*r2inv;
+
+          evdwlOldEXP6_21 = buck1*(6.0*rexp - alphaOld21_ij*rm6ij*r6inv) - urc - durc*(r-rCut);
+        }
+
+        if (isite1 == isite2)
+          evdwlOld = sqrt(mixWtSite1old_i*mixWtSite2old_j)*evdwlOldEXP6_12;
+        else
+          evdwlOld = sqrt(mixWtSite1old_i*mixWtSite2old_j)*evdwlOldEXP6_12 + sqrt(mixWtSite2old_i*mixWtSite1old_j)*evdwlOldEXP6_21;
+
+        evdwlOld *= factor_lj;
+
+        uCG_i += 0.5*evdwlOld;
+        if ((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR || j < nlocal))
+          t_uCG(tid,j) += 0.5*evdwlOld;
+      }
+
+      if(rm12_ij!=0.0 && rm21_ij!=0.0){
+        if(alpha21_ij == 6.0 || alpha12_ij == 6.0)
+          k_error_flag.d_view() = 1;
+
+        // A3.  Compute some convenient quantities for evaluating the force
+        rminv = 1.0/rm12_ij;
+        buck1 = epsilon12_ij / (alpha12_ij - 6.0);
+        buck2 = 6.0*alpha12_ij;
+        rexp = expValue(alpha12_ij*(1.0-r*rminv));
+        rm2ij = rm12_ij*rm12_ij;
+        rm6ij = rm2ij*rm2ij*rm2ij;
+
+        // Compute the shifted potential
+        rCutExp = expValue(alpha12_ij*(1.0-rCut*rminv));
+        urc = buck1*(6.0*rCutExp - alpha12_ij*rm6ij*rCut6inv);
+        durc = -buck1*buck2*(rCutExp*rminv - rCutInv*rm6ij*rCut6inv);
+        rin1 = shift*rm12_ij*func_rin(alpha12_ij);
+
+        if(r < rin1){
+          rin6 = rin1*rin1*rin1*rin1*rin1*rin1;
+          rin6inv = 1.0/rin6;
+
+          rin1exp = expValue(alpha12_ij*(1.0-rin1*rminv));
+
+          uin1 = buck1*(6.0*rin1exp - alpha12_ij*rm6ij*rin6inv) - urc - durc*(rin1-rCut);
+
+          win1 = buck1*buck2*(rin1*rin1exp*rminv - rm6ij*rin6inv) + rin1*durc;
+
+          aRep = win1*powint(rin1,nRep)/nRep;
+
+          uin1rep = aRep/powint(rin1,nRep);
+
+          evdwlEXP6_12 = uin1 - uin1rep + aRep/powint(r,nRep);
+        } else {
+          evdwlEXP6_12 = buck1*(6.0*rexp - alpha12_ij*rm6ij*r6inv) - urc - durc*(r-rCut);
+        }
+
+        rminv = 1.0/rm21_ij;
+        buck1 = epsilon21_ij / (alpha21_ij - 6.0);
+        buck2 = 6.0*alpha21_ij;
+        rexp = expValue(alpha21_ij*(1.0-r*rminv));
+        rm2ij = rm21_ij*rm21_ij;
+        rm6ij = rm2ij*rm2ij*rm2ij;
+
+        // Compute the shifted potential
+        rCutExp = expValue(alpha21_ij*(1.0-rCut*rminv));
+        urc = buck1*(6.0*rCutExp - alpha21_ij*rm6ij*rCut6inv);
+        durc = -buck1*buck2*(rCutExp*rminv - rCutInv*rm6ij*rCut6inv);
+        rin1 = shift*rm21_ij*func_rin(alpha21_ij);
+
+        if(r < rin1){
+          rin6 = rin1*rin1*rin1*rin1*rin1*rin1;
+          rin6inv = 1.0/rin6;
+
+          rin1exp = expValue(alpha21_ij*(1.0-rin1*rminv));
+
+          uin1 = buck1*(6.0*rin1exp - alpha21_ij*rm6ij*rin6inv) - urc - durc*(rin1-rCut);
+
+          win1 = buck1*buck2*(rin1*rin1exp*rminv - rm6ij*rin6inv) + rin1*durc;
+
+          aRep = win1*powint(rin1,nRep)/nRep;
+
+          uin1rep = aRep/powint(rin1,nRep);
+
+          evdwlEXP6_21 = uin1 - uin1rep + aRep/powint(r,nRep);
+        } else {
+          evdwlEXP6_21 = buck1*(6.0*rexp - alpha21_ij*rm6ij*r6inv) - urc - durc*(r-rCut);
+        }
+      }
+
+      //
+      // Apply Mixing Rule to get the overall force for the CG pair
+      //
+      if (isite1 == isite2) fpair = sqrt(mixWtSite1old_i*mixWtSite2old_j)*fpairOldEXP6_12;
+      else fpair = sqrt(mixWtSite1old_i*mixWtSite2old_j)*fpairOldEXP6_12 + sqrt(mixWtSite2old_i*mixWtSite1old_j)*fpairOldEXP6_21;
+
+      fx_i += delx*fpair;
+      fy_i += dely*fpair;
+      fz_i += delz*fpair;
+      if ((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR || j < nlocal)) {
+        t_f(tid,j,0) -= delx*fpair;
+        t_f(tid,j,1) -= dely*fpair;
+        t_f(tid,j,2) -= delz*fpair;
+      }
+
+      if (isite1 == isite2) evdwl = sqrt(mixWtSite1_i*mixWtSite2_j)*evdwlEXP6_12;
+      else evdwl = sqrt(mixWtSite1_i*mixWtSite2_j)*evdwlEXP6_12 + sqrt(mixWtSite2_i*mixWtSite1_j)*evdwlEXP6_21;
+      evdwl *= factor_lj;
+
+      uCGnew_i += 0.5*evdwl;
+      if ((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR || j < nlocal))
+        t_uCGnew(tid,j) += 0.5*evdwl;
+      evdwl = evdwlOld;
+      if (EVFLAG)
+        ev.evdwl += (((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR||(j<nlocal)))?1.0:0.5)*evdwl;
+      //if (vflag_either || eflag_atom) 
+      if (EVFLAG) this->template ev_tally<NEIGHFLAG,NEWTON_PAIR>(ev,i,j,evdwl,fpair,delx,dely,delz);
+    }
+  }
+
+  t_f(tid,i,0) += fx_i;
+  t_f(tid,i,1) += fy_i;
+  t_f(tid,i,2) += fz_i;
+  t_uCG(tid,i) += uCG_i;
+  t_uCGnew(tid,i) += uCGnew_i;
+}
+// Overload without a caller-supplied EV_FLOAT: calls the EV_FLOAT overload with a local accumulator whose contents are dropped on return.
+template<class DeviceType>
+template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
+KOKKOS_INLINE_FUNCTION
+void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxComputeNoAtomics<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int &ii) const {
+  EV_FLOAT ev;
+  this->template operator()<NEIGHFLAG,NEWTON_PAIR,EVFLAG>(TagPairExp6rxComputeNoAtomics<NEIGHFLAG,NEWTON_PAIR,EVFLAG>(), ii, ev);
+}
+// For index i, adds rows 0..num_threads-1 of t_f, t_uCG and t_uCGnew onto f, uCG and uCGnew.
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxCollapseDupViews, const int &i) const {
+  for (int n = 0; n < num_threads; n++) {
+    f(i,0) += t_f(n,i,0);
+    f(i,1) += t_f(n,i,1);
+    f(i,2) += t_f(n,i,2);
+    uCG(i) += t_uCG(n,i);
+    uCGnew(i) += t_uCGnew(n,i);
+  }
+}
+// For index i, sets the t_f, t_uCG and t_uCGnew entries of rows 0..num_threads-1 to zero.
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxZeroDupViews, const int &i) const {
+  for (int n = 0; n < num_threads; n++) {
+    t_f(n,i,0) = 0.0;
+    t_f(n,i,1) = 0.0;
+    t_f(n,i,2) = 0.0;
+    t_uCG(n,i) = 0.0;
+    t_uCGnew(n,i) = 0.0;
+  }
+}
+
+
 /* ----------------------------------------------------------------------
    allocate all arrays
 ------------------------------------------------------------------------- */
diff --git a/src/KOKKOS/pair_exp6_rx_kokkos.h b/src/KOKKOS/pair_exp6_rx_kokkos.h
index 488c9d0039..8754a73c96 100644
--- a/src/KOKKOS/pair_exp6_rx_kokkos.h
+++ b/src/KOKKOS/pair_exp6_rx_kokkos.h
@@ -57,6 +57,12 @@ struct TagPairExp6rxgetMixingWeights{};
 template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
 struct TagPairExp6rxCompute{};
 
+template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
+struct TagPairExp6rxComputeNoAtomics{}; // tag selecting the pair kernel that writes to the per-thread views t_f/t_uCG/t_uCGnew
+// Tags selecting the kernels that add the per-thread views onto f/uCG/uCGnew (Collapse) or set them to zero (Zero).
+struct TagPairExp6rxCollapseDupViews{};
+struct TagPairExp6rxZeroDupViews{};
+
 template<class DeviceType>
 class PairExp6rxKokkos : public PairExp6rx {
  public:
@@ -81,6 +87,20 @@ class PairExp6rxKokkos : public PairExp6rx {
   KOKKOS_INLINE_FUNCTION
   void operator()(TagPairExp6rxCompute<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int&) const;
 
+  template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPairExp6rxComputeNoAtomics<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int&, EV_FLOAT&) const; // pair kernel accumulating into the per-thread views
+  // Same kernel without a caller-supplied EV_FLOAT.
+  template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPairExp6rxComputeNoAtomics<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int&) const;
+  // Adds the per-thread views onto f, uCG and uCGnew for one index.
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPairExp6rxCollapseDupViews, const int&) const;
+  // Sets the per-thread views to zero for one index.
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPairExp6rxZeroDupViews, const int&) const;
+
   template<int NEIGHFLAG, int NEWTON_PAIR>
   KOKKOS_INLINE_FUNCTION
   void ev_tally(EV_FLOAT &ev, const int &i, const int &j,
@@ -94,6 +114,7 @@ class PairExp6rxKokkos : public PairExp6rx {
   int eflag,vflag;
   int nlocal,newton_pair,neighflag;
   double special_lj[4];
+  int num_threads;
 
   typename AT::t_x_array_randomread x;
   typename AT::t_f_array f;
@@ -101,6 +122,12 @@ class PairExp6rxKokkos : public PairExp6rx {
   typename AT::t_efloat_1d uCG, uCGnew;
   typename AT::t_float_2d dvector;
 
+  typedef Kokkos::View<F_FLOAT**[3],Kokkos::LayoutRight,DeviceType> t_f_array_thread;
+  typedef Kokkos::View<E_FLOAT**,Kokkos::LayoutRight,DeviceType> t_efloat_1d_thread;
+  // Per-thread copies used by the no-atomics kernels: first index is the thread row, second the atom (t_f adds the x/y/z component).
+  t_f_array_thread t_f;
+  t_efloat_1d_thread t_uCG, t_uCGnew;
+
   DAT::tdual_efloat_1d k_eatom;
   DAT::tdual_virial_array k_vatom;
   DAT::t_efloat_1d d_eatom;

From acdb932c4ec56b2ce56a71280dab5c17b39f2c03 Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Thu, 16 Mar 2017 09:28:27 -0600
Subject: [PATCH 206/267] Fixing index issue in pair_exp6_rx_kokkos

---
 src/KOKKOS/pair_exp6_rx_kokkos.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/KOKKOS/pair_exp6_rx_kokkos.cpp b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
index 8cf235964c..577d5261a3 100644
--- a/src/KOKKOS/pair_exp6_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
@@ -236,7 +236,7 @@ void PairExp6rxKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
   } else { // No atomics
 
   num_threads = lmp->kokkos->num_threads;
-  int nmax = f.dimension_1();
+  int nmax = f.dimension_0();
   if (nmax > t_f.dimension_1()) {
     t_f = t_f_array_thread("pair_exp6_rx:t_f",num_threads,nmax);
     t_uCG = t_efloat_1d_thread("pair_exp6_rx:t_uCG",num_threads,nmax);

From f5b7361ef6b6dbeedb2ad2181a44db64943001bb Mon Sep 17 00:00:00 2001
From: "Christopher P. Stone" <chris.stone@computational-science.com>
Date: Thu, 16 Mar 2017 21:31:30 -0400
Subject: [PATCH 207/267] Non-kokkos candidate of
 PairExp6rxKokkos::getMixingWeights to improve vectorization on the KNL.

- Moved the particle loop inside a replica of getMixingWeights, getMixingWeightsVect,
  and refactored to improve vectorization.
- Added OMP SIMD and OMP threading directly inside that function but will replace with
  kokkos parallel_for and parallel_reduce methods later.
---
 src/KOKKOS/pair_exp6_rx_kokkos.cpp | 473 ++++++++++++++++++++++++++++-
 src/KOKKOS/pair_exp6_rx_kokkos.h   |   3 +
 2 files changed, 475 insertions(+), 1 deletion(-)

diff --git a/src/KOKKOS/pair_exp6_rx_kokkos.cpp b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
index abc158d72c..df663c9df9 100644
--- a/src/KOKKOS/pair_exp6_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
@@ -53,6 +53,22 @@ using namespace MathSpecialKokkos;
 #define exp6PotentialType (1)
 #define isExp6PotentialType(_type) ( (_type) == exp6PotentialType )
 
+namespace /* anonymous */
+{
+// File-local timing helpers. Alternative variant based on MPI_Wtime, kept commented out:
+//typedef double TimerType;
+//TimerType getTimeStamp(void) { return MPI_Wtime(); }
+//double getElapsedTime( const TimerType &t0, const TimerType &t1) { return t1-t0; }
+// Active variant: timestamps come from clock_gettime(CLOCK_MONOTONIC); getElapsedTime returns t1 - t0 in seconds.
+typedef struct timespec TimerType;
+TimerType getTimeStamp(void) { TimerType tick; clock_gettime( CLOCK_MONOTONIC, &tick); return tick; }
+double getElapsedTime( const TimerType &t0, const TimerType &t1)
+{
+   return (t1.tv_sec - t0.tv_sec) + 1e-9*(t1.tv_nsec - t0.tv_nsec);
+}
+
+} // end namespace
+
 /* ---------------------------------------------------------------------- */
 
 template<class DeviceType>
@@ -121,6 +137,8 @@ void PairExp6rxKokkos<DeviceType>::init_style()
 template<class DeviceType>
 void PairExp6rxKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
 {
+  TimerType t_start = getTimeStamp();
+
   copymode = 1;
 
   eflag = eflag_in;
@@ -165,6 +183,7 @@ void PairExp6rxKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
   // and ghost atoms. Make the parameter data persistent
   // and exchange like any other atom property later.
 
+  TimerType t_mix_start = getTimeStamp();
   {
      const int np_total = nlocal + atom->nghost;
 
@@ -185,8 +204,77 @@ void PairExp6rxKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
      PairExp6ParamData.rmOld2       = typename AT::t_float_1d("PairExp6ParamData.rmOld2"      ,np_total);
      PairExp6ParamData.mixWtSite2old = typename AT::t_float_1d("PairExp6ParamData.mixWtSite2old",np_total);
 
-     Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairExp6rxgetMixingWeights>(0,np_total),*this);
+     //Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairExp6rxgetMixingWeights>(0,np_total),*this);
+
+     //typename AT::t_float_1d epsilon1     ("epsilon1"    ,np_total);
+     //typename AT::t_float_1d alpha1       ("alpha1"      ,np_total);
+     //typename AT::t_float_1d rm1          ("rm1"         ,np_total);
+     //typename AT::t_float_1d mixWtSite1   ("mixWtSite1"   ,np_total);
+     //typename AT::t_float_1d epsilon2     ("epsilon2"    ,np_total);
+     //typename AT::t_float_1d alpha2       ("alpha2"      ,np_total);
+     //typename AT::t_float_1d rm2          ("rm2"         ,np_total);
+     //typename AT::t_float_1d mixWtSite2   ("mixWtSite2"   ,np_total);
+     //typename AT::t_float_1d epsilonOld1  ("epsilonOld1" ,np_total);
+     //typename AT::t_float_1d alphaOld1    ("alphaOld1"   ,np_total);
+     //typename AT::t_float_1d rmOld1       ("rmOld1"      ,np_total);
+     //typename AT::t_float_1d mixWtSite1old("mixWtSite1old",np_total);
+     //typename AT::t_float_1d epsilonOld2  ("epsilonOld2" ,np_total);
+     //typename AT::t_float_1d alphaOld2    ("alphaOld2"   ,np_total);
+     //typename AT::t_float_1d rmOld2       ("rmOld2"      ,np_total);
+     //typename AT::t_float_1d mixWtSite2old("mixWtSite2old",np_total);
+
+     int errorFlag = 0;
+     getMixingWeightsVect (np_total, errorFlag, PairExp6ParamData.epsilon1,
+                                                PairExp6ParamData.alpha1,
+                                                PairExp6ParamData.rm1,
+                                                PairExp6ParamData.mixWtSite1,
+                                                PairExp6ParamData.epsilon2,
+                                                PairExp6ParamData.alpha2,
+                                                PairExp6ParamData.rm2,
+                                                PairExp6ParamData.mixWtSite2,
+                                                PairExp6ParamData.epsilonOld1,
+                                                PairExp6ParamData.alphaOld1,
+                                                PairExp6ParamData.rmOld1,
+                                                PairExp6ParamData.mixWtSite1old,
+                                                PairExp6ParamData.epsilonOld2,
+                                                PairExp6ParamData.alphaOld2,
+                                                PairExp6ParamData.rmOld2,
+                                                PairExp6ParamData.mixWtSite2old);
+     if (errorFlag == 1)
+       error->all(FLERR,"The number of molecules in CG particle is less than 10*DBL_EPSILON.");
+     else if (errorFlag == 2)
+       error->all(FLERR,"Computed fraction less than -10*DBL_EPSILON");
+
+     //#define _test_var(var) { \
+     //  double ref2 = 0, err2 = 0; \
+     //  for (int id = 0; id < np_total; ++id) \
+     //  { \
+     //     double ref = PairExp6ParamData. var [id]; \
+     //     double diff = ref - var[id]; \
+     //     ref2 += ref*ref; \
+     //     err2 += diff*diff; \
+     //  } \
+     //  if (ref2 < 1e-20) ref2 = 1.0; \
+     //  if (sqrt(err2)/sqrt(ref2) > 1e-12) \
+     //     printf("%s: %e %e %e\n", # var, sqrt(ref2), sqrt(err2), sqrt(err2)/sqrt(ref2)); \
+     //}
+     //_test_var( epsilon1);
+     //_test_var( alpha1);
+     //_test_var( rm1);
+     //_test_var( epsilon2);
+     //_test_var( alpha2);
+     //_test_var( rm2);
+     //_test_var( mixWtSite2);
+     //_test_var( epsilonOld1);
+     //_test_var( alphaOld1);
+     //_test_var( rmOld1);
+     //_test_var( mixWtSite1old);
+     //_test_var( epsilonOld2);
+     //_test_var( alphaOld2);
+     //_test_var( rmOld2);
+     //_test_var( mixWtSite2old);
   }
+  TimerType t_mix_stop = getTimeStamp();
 
   k_error_flag.template modify<DeviceType>();
   k_error_flag.template sync<LMPHostType>();
@@ -259,6 +347,9 @@ void PairExp6rxKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
   }
 
   copymode = 0;
+
+  TimerType t_stop = getTimeStamp();
+  printf("PairExp6rxKokkos::compute %f %f\n", getElapsedTime(t_start, t_stop), getElapsedTime(t_mix_start, t_mix_stop));
 }
 
 template<class DeviceType>
@@ -917,6 +1008,7 @@ void PairExp6rxKokkos<DeviceType>::getMixingWeights(int id,double &epsilon1,doub
       nMoleculesOld2 = dvector(ispecies+nspecies,id);
       nMolecules2 = dvector(ispecies,id);
       fractionOld2 = dvector(ispecies+nspecies,id)/nTotalold;
+      fraction2 = nMolecules2/nTotal;
     }
 
     // If Site1 or Site2 matches is a fluid, then compute the paramters
@@ -1072,6 +1164,385 @@ void PairExp6rxKokkos<DeviceType>::getMixingWeights(int id,double &epsilon1,doub
   }
 }
 
+#ifdef _OPENMP
+static void partition_range( const int begin, const int end, int &thread_begin, int &thread_end, const int chunkSize = 1)
+{  // File-local helper: gives the calling OpenMP thread its share [thread_begin,thread_end) of [begin,end), in whole blocks of chunkSize.
+   int threadId = omp_get_thread_num();
+   int nThreads = omp_get_num_threads();
+   // Count the blocks (the last one may be partial) and split them as evenly as possible over the team.
+   const int len = end - begin;
+   const int nBlocks = (len + (chunkSize - 1)) / chunkSize;
+   const int nBlocksPerThread = nBlocks / nThreads;
+   const int nRemaining = nBlocks - nBlocksPerThread * nThreads;
+   int block_lo, block_hi;
+   if (threadId < nRemaining)
+   {
+      block_lo = threadId * nBlocksPerThread + threadId;
+      block_hi = block_lo + nBlocksPerThread + 1;
+   }
+   else
+   {
+      block_lo = threadId * nBlocksPerThread + nRemaining;
+      block_hi = block_lo + nBlocksPerThread;
+   }
+   // Convert block indices back to element indices; clamp to end because the last block may be short.
+   thread_begin = std::min(begin + block_lo * chunkSize, end);
+   thread_end   = std::min(begin + block_hi * chunkSize, end);
+   //printf("tid: %d %d %d %d %d\n", threadId, block_lo, block_hi, thread_begin, thread_end);
+}
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+  template<class ArrayT>
+void PairExp6rxKokkos<DeviceType>::getMixingWeightsVect(const int np_total, int errorFlag, 
+                          ArrayT &epsilon1, ArrayT &alpha1, ArrayT &rm1,  ArrayT &mixWtSite1, ArrayT &epsilon2, ArrayT &alpha2, ArrayT &rm2, ArrayT &mixWtSite2, ArrayT &epsilon1_old, ArrayT &alpha1_old, ArrayT &rm1_old,  ArrayT &mixWtSite1old, ArrayT &epsilon2_old, ArrayT &alpha2_old, ArrayT &rm2_old, ArrayT &mixWtSite2old) const
+{
+  ArrayT epsilon("PairExp6ParamData.epsilon",  np_total);
+  ArrayT rm3("PairExp6ParamData.rm3",  np_total);
+  ArrayT alpha("PairExp6ParamData.alpha",  np_total);
+  ArrayT xMolei("PairExp6ParamData.xMolei",  np_total);
+
+  ArrayT epsilon_old("PairExp6ParamData.epsilon_old",  np_total);
+  ArrayT rm3_old("PairExp6ParamData.rm3_old",  np_total);
+  ArrayT alpha_old("PairExp6ParamData.alpha_old",  np_total);
+  ArrayT xMolei_old("PairExp6ParamData.xMolei_old",  np_total);
+
+  ArrayT fractionOFA("PairExp6ParamData.fractionOFA",  np_total);
+  ArrayT fraction1("PairExp6ParamData.fraction1",  np_total);
+  ArrayT fraction2("PairExp6ParamData.fraction2",  np_total);
+  ArrayT nMoleculesOFA("PairExp6ParamData.nMoleculesOFA",  np_total);
+  ArrayT nMolecules1("PairExp6ParamData.nMolecules1",  np_total);
+  ArrayT nMolecules2("PairExp6ParamData.nMolecules2",  np_total);
+  ArrayT nTotal("PairExp6ParamData.nTotal",  np_total);
+
+  ArrayT fractionOFAold("PairExp6ParamData.fractionOFAold",  np_total);
+  ArrayT fractionOld1("PairExp6ParamData.fractionOld1",  np_total);
+  ArrayT fractionOld2("PairExp6ParamData.fractionOld2",  np_total);
+  ArrayT nMoleculesOFAold("PairExp6ParamData.nMoleculesOFAold",  np_total);
+  ArrayT nMoleculesOld1("PairExp6ParamData.nMoleculesOld1",  np_total);
+  ArrayT nMoleculesOld2("PairExp6ParamData.nMoleculesOld2",  np_total);
+  ArrayT nTotalold("PairExp6ParamData.nTotalold",  np_total);
+
+  int errorFlag1 = 0, errorFlag2 = 0;
+
+#ifdef _OPENMP
+  #pragma omp parallel reduction(+: errorFlag1, errorFlag2)
+#endif
+  {
+    int idx_begin = 0, idx_end = np_total;
+#ifdef _OPENMP
+    partition_range( 0, np_total, idx_begin, idx_end, 16 );
+#endif
+
+  // Zero out all of the terms first.
+  #pragma ivdep
+  for (int id = idx_begin; id < idx_end; ++id)
+  {
+     rm3[id] = 0.0;
+     epsilon[id] = 0.0;
+     alpha[id] = 0.0;
+     epsilon_old[id] = 0.0;
+     rm3_old[id] = 0.0;
+     alpha_old[id] = 0.0;
+     fractionOFA[id] = 0.0;
+     fractionOFAold[id] = 0.0;
+     nMoleculesOFA[id] = 0.0;
+     nMoleculesOFAold[id] = 0.0;
+     nTotal[id] = 0.0;
+     nTotalold[id] = 0.0;
+  }
+
+  // Compute the total number of molecules in the old and new CG particle as well as the total number of molecules in the fluid portion of the old and new CG particle
+  for (int ispecies = 0; ispecies < nspecies; ispecies++)
+  {
+    #pragma ivdep
+    for (int id = idx_begin; id < idx_end; ++id)
+    {
+      nTotal[id] += dvector(ispecies,id);
+      nTotalold[id] += dvector(ispecies+nspecies,id);
+    }
+
+    const int iparam = d_mol2param[ispecies];
+
+    if (iparam < 0 || d_params[iparam].potentialType != exp6PotentialType ) continue;
+    if (isOneFluidApprox(isite1) || isOneFluidApprox(isite2)) {
+      if (isite1 == d_params[iparam].ispecies || isite2 == d_params[iparam].ispecies) continue;
+
+      #pragma ivdep
+      for (int id = idx_begin; id < idx_end; ++id)
+      {
+        nMoleculesOFAold[id] += dvector(ispecies+nspecies,id);
+        nMoleculesOFA[id] += dvector(ispecies,id);
+      }
+    }
+  }
+
+  // Make a reduction.
+  #pragma omp simd reduction(+:errorFlag1)
+  for (int id = idx_begin; id < idx_end; ++id)
+  {
+    if ( nTotal[id] < MY_EPSILON || nTotalold[id] < MY_EPSILON )
+      errorFlag1 = 1;
+
+    // Compute the mole fraction of molecules within the fluid portion of the particle (One Fluid Approximation)
+    fractionOFAold[id] = nMoleculesOFAold[id] / nTotalold[id];
+    fractionOFA[id] = nMoleculesOFA[id] / nTotal[id];
+  }
+
+  for (int ispecies = 0; ispecies < nspecies; ispecies++) {
+    const int iparam = d_mol2param[ispecies];
+    if (iparam < 0 || d_params[iparam].potentialType != exp6PotentialType ) continue;
+
+    // If Site1 matches a pure species, then grab the parameters
+    if (isite1 == d_params[iparam].ispecies)
+    {
+      #pragma ivdep
+      for (int id = idx_begin; id < idx_end; ++id)
+      {
+        rm1_old[id] = d_params[iparam].rm;
+        rm1[id] = d_params[iparam].rm;
+        epsilon1_old[id] = d_params[iparam].epsilon;
+        epsilon1[id] = d_params[iparam].epsilon;
+        alpha1_old[id] = d_params[iparam].alpha;
+        alpha1[id] = d_params[iparam].alpha;
+
+        // Compute the mole fraction of Site1
+        nMoleculesOld1[id] = dvector(ispecies+nspecies,id);
+        nMolecules1[id] = dvector(ispecies,id);
+        fractionOld1[id] = nMoleculesOld1[id]/nTotalold[id];
+        fraction1[id] = nMolecules1[id]/nTotal[id];
+      }
+    }
+
+    // If Site2 matches a pure species, then grab the parameters
+    if (isite2 == d_params[iparam].ispecies)
+    {
+      #pragma ivdep
+      for (int id = idx_begin; id < idx_end; ++id)
+      {
+        rm2_old[id] = d_params[iparam].rm;
+        rm2[id] = d_params[iparam].rm;
+        epsilon2_old[id] = d_params[iparam].epsilon;
+        epsilon2[id] = d_params[iparam].epsilon;
+        alpha2_old[id] = d_params[iparam].alpha;
+        alpha2[id] = d_params[iparam].alpha;
+
+        // Compute the mole fraction of Site2
+        nMoleculesOld2[id] = dvector(ispecies+nspecies,id);
+        nMolecules2[id] = dvector(ispecies,id);
+        fractionOld2[id] = nMoleculesOld2[id]/nTotalold[id];
+        fraction2[id] = nMolecules2[id]/nTotal[id];
+      }
+    }
+
+    // If Site1 or Site2 matches is a fluid, then compute the paramters
+    if (isOneFluidApprox(isite1) || isOneFluidApprox(isite2)) {
+      if (isite1 == d_params[iparam].ispecies || isite2 == d_params[iparam].ispecies) continue;
+
+      const double rmi = d_params[iparam].rm;
+      const double epsiloni = d_params[iparam].epsilon;
+      const double alphai = d_params[iparam].alpha;
+
+      #pragma ivdep
+      for (int id = idx_begin; id < idx_end; ++id)
+      {
+        if(nMoleculesOFA[id]<MY_EPSILON) xMolei[id] = 0.0;
+        else xMolei[id] = dvector(ispecies,id)/nMoleculesOFA[id];
+        if(nMoleculesOFAold[id]<MY_EPSILON) xMolei_old[id] = 0.0;
+        else xMolei_old[id] = dvector(ispecies+nspecies,id)/nMoleculesOFAold[id];
+      }
+
+      for (int jspecies = 0; jspecies < nspecies; jspecies++) {
+        const int jparam = d_mol2param[jspecies];
+        if (jparam < 0 || d_params[jparam].potentialType != exp6PotentialType ) continue;
+        if (isite1 == d_params[jparam].ispecies || isite2 == d_params[jparam].ispecies) continue;
+
+        const double rmj = d_params[jparam].rm;
+        const double epsilonj = d_params[jparam].epsilon;
+        const double alphaj = d_params[jparam].alpha;
+
+        const double rmij = (rmi+rmj)/2.0;
+        const double rm3ij = rmij*rmij*rmij;
+        const double epsilonij = sqrt(epsiloni*epsilonj);
+        const double alphaij = sqrt(alphai*alphaj);
+
+        #pragma ivdep
+        for (int id = idx_begin; id < idx_end; ++id)
+        {
+          double xMolej, xMolej_old;
+          if(nMoleculesOFA[id]<MY_EPSILON) xMolej = 0.0;
+          else xMolej = dvector(jspecies,id)/nMoleculesOFA[id];
+          if(nMoleculesOFAold[id]<MY_EPSILON) xMolej_old = 0.0;
+          else xMolej_old = dvector(jspecies+nspecies,id)/nMoleculesOFAold[id];
+
+          if(fractionOFAold[id] > 0.0){
+            rm3_old[id] += xMolei_old[id]*xMolej_old*rm3ij;
+            epsilon_old[id] += xMolei_old[id]*xMolej_old*rm3ij*epsilonij;
+            alpha_old[id] += xMolei_old[id]*xMolej_old*rm3ij*epsilonij*alphaij;
+          }
+          if(fractionOFA[id] > 0.0){
+            rm3[id] += xMolei[id]*xMolej*rm3ij;
+            epsilon[id] += xMolei[id]*xMolej*rm3ij*epsilonij;
+            alpha[id] += xMolei[id]*xMolej*rm3ij*epsilonij*alphaij;
+          }
+        }
+      }
+    }
+  }
+
+  if (isOneFluidApprox(isite1))
+  {
+    #pragma ivdep
+    for (int id = idx_begin; id < idx_end; ++id)
+    {
+      rm1[id] = cbrt(rm3[id]);
+      if(rm1[id] < MY_EPSILON) {
+        rm1[id] = 0.0;
+        epsilon1[id] = 0.0;
+        alpha1[id] = 0.0;
+      } else {
+        epsilon1[id] = epsilon[id] / rm3[id];
+        alpha1[id] = alpha[id] / epsilon1[id] / rm3[id];
+      }
+      nMolecules1[id] = 1.0-(nTotal[id]-nMoleculesOFA[id]);
+      fraction1[id] = fractionOFA[id];
+
+      rm1_old[id] = cbrt(rm3_old[id]);
+      if(rm1_old[id] < MY_EPSILON) {
+        rm1_old[id] = 0.0;
+        epsilon1_old[id] = 0.0;
+        alpha1_old[id] = 0.0;
+      } else {
+        epsilon1_old[id] = epsilon_old[id] / rm3_old[id];
+        alpha1_old[id] = alpha_old[id] / epsilon1_old[id] / rm3_old[id];
+      }
+      nMoleculesOld1[id] = 1.0-(nTotalold[id]-nMoleculesOFAold[id]);
+      fractionOld1[id] = fractionOFAold[id];
+    }
+
+    if(scalingFlag == EXPONENT) {
+      #pragma ivdep
+      for (int id = idx_begin; id < idx_end; ++id)
+      {
+        exponentScaling(nMoleculesOFA[id],epsilon1[id],rm1[id]);
+        exponentScaling(nMoleculesOFAold[id],epsilon1_old[id],rm1_old[id]);
+      }
+    }
+    else if(scalingFlag == POLYNOMIAL){
+      #pragma ivdep
+      for (int id = idx_begin; id < idx_end; ++id)
+      {
+        polynomialScaling(nMoleculesOFA[id],alpha1[id],epsilon1[id],rm1[id]);
+        polynomialScaling(nMoleculesOFAold[id],alpha1_old[id],epsilon1_old[id],rm1_old[id]);
+      }
+    }
+  }
+
+  if (isOneFluidApprox(isite2))
+  {
+    #pragma ivdep
+    for (int id = idx_begin; id < idx_end; ++id)
+    {
+      rm2[id] = cbrt(rm3[id]);
+      if(rm2[id] < MY_EPSILON) {
+        rm2[id] = 0.0;
+        epsilon2[id] = 0.0;
+        alpha2[id] = 0.0;
+      } else {
+        epsilon2[id] = epsilon[id] / rm3[id];
+        alpha2[id] = alpha[id] / epsilon2[id] / rm3[id];
+      }
+      nMolecules2[id] = 1.0-(nTotal[id]-nMoleculesOFA[id]);
+      fraction2[id] = fractionOFA[id];
+
+      rm2_old[id] = cbrt(rm3_old[id]);
+      if(rm2_old[id] < MY_EPSILON) {
+        rm2_old[id] = 0.0;
+        epsilon2_old[id] = 0.0;
+        alpha2_old[id] = 0.0;
+      } else {
+        epsilon2_old[id] = epsilon_old[id] / rm3_old[id];
+        alpha2_old[id] = alpha_old[id] / epsilon2_old[id] / rm3_old[id];
+      }
+      nMoleculesOld2[id] = 1.0-(nTotalold[id]-nMoleculesOFAold[id]);
+      fractionOld2[id] = fractionOFAold[id];
+    }
+
+    if(scalingFlag == EXPONENT){
+      #pragma ivdep
+      for (int id = idx_begin; id < idx_end; ++id)
+      {
+        exponentScaling(nMoleculesOFA[id],epsilon2[id],rm2[id]);
+        exponentScaling(nMoleculesOFAold[id],epsilon2_old[id],rm2_old[id]);
+      }
+    }
+    else if(scalingFlag == POLYNOMIAL){
+      #pragma ivdep
+      for (int id = idx_begin; id < idx_end; ++id)
+      {
+        polynomialScaling(nMoleculesOFA[id],alpha2[id],epsilon2[id],rm2[id]);
+        polynomialScaling(nMoleculesOFAold[id],alpha2_old[id],epsilon2_old[id],rm2_old[id]);
+      }
+    }
+  }
+
+  // Check that no fractions are less than zero
+  #pragma omp simd reduction(+:errorFlag2)
+  for (int id = idx_begin; id < idx_end; ++id)
+  {
+    if(fraction1[id] < 0.0 || nMolecules1[id] < 0.0){
+      if(fraction1[id] < -MY_EPSILON || nMolecules1[id] < -MY_EPSILON){
+        errorFlag2 = 2;
+      }
+      nMolecules1[id] = 0.0;
+      fraction1[id] = 0.0;
+    }
+    if(fraction2[id] < 0.0 || nMolecules2[id] < 0.0){
+      if(fraction2[id] < -MY_EPSILON || nMolecules2[id] < -MY_EPSILON){
+        errorFlag2 = 2;
+      }
+      nMolecules2[id] = 0.0;
+      fraction2[id] = 0.0;
+    }
+    if(fractionOld1[id] < 0.0 || nMoleculesOld1[id] < 0.0){
+      if(fractionOld1[id] < -MY_EPSILON || nMoleculesOld1[id] < -MY_EPSILON){
+        errorFlag2 = 2;
+      }
+      nMoleculesOld1[id] = 0.0;
+      fractionOld1[id] = 0.0;
+    }
+    if(fractionOld2[id] < 0.0 || nMoleculesOld2[id] < 0.0){
+      if(fractionOld2[id] < -MY_EPSILON || nMoleculesOld2[id] < -MY_EPSILON){
+        errorFlag2 = 2;
+      }
+      nMoleculesOld2[id] = 0.0;
+      fractionOld2[id] = 0.0;
+    }
+
+    if(fractionalWeighting){
+      mixWtSite1old[id] = fractionOld1[id];
+      mixWtSite1[id] = fraction1[id];
+      mixWtSite2old[id] = fractionOld2[id];
+      mixWtSite2[id] = fraction2[id];
+    } else {
+      mixWtSite1old[id] = nMoleculesOld1[id];
+      mixWtSite1[id] = nMolecules1[id];
+      mixWtSite2old[id] = nMoleculesOld2[id];
+      mixWtSite2[id] = nMolecules2[id];
+    }
+  }
+
+  } // end parallel region
+
+  if (errorFlag1 > 0)
+    errorFlag = 1;
+
+  if (errorFlag2 > 0)
+    errorFlag = 2;
+}
+
 /* ---------------------------------------------------------------------- */
 
 template<class DeviceType>
diff --git a/src/KOKKOS/pair_exp6_rx_kokkos.h b/src/KOKKOS/pair_exp6_rx_kokkos.h
index 488c9d0039..55b29f559b 100644
--- a/src/KOKKOS/pair_exp6_rx_kokkos.h
+++ b/src/KOKKOS/pair_exp6_rx_kokkos.h
@@ -133,6 +133,9 @@ class PairExp6rxKokkos : public PairExp6rx {
   KOKKOS_INLINE_FUNCTION
   void getMixingWeights(int, double &, double &, double &, double &, double &, double &, double &, double &, double &, double &, double &, double &, double &, double &, double &, double &) const;
 
+  template <class ArrayT>
+  void getMixingWeightsVect(const int, int, ArrayT &, ArrayT &, ArrayT &, ArrayT &, ArrayT &, ArrayT &, ArrayT &, ArrayT &, ArrayT &, ArrayT &, ArrayT &, ArrayT &, ArrayT &, ArrayT &, ArrayT &, ArrayT &) const;
+
   KOKKOS_INLINE_FUNCTION
   void exponentScaling(double, double &, double &) const;
 

From ec192a95cb1465184f29f9f6bae06da9815411dc Mon Sep 17 00:00:00 2001
From: "Christopher P. Stone" <chris.stone@computational-science.com>
Date: Thu, 16 Mar 2017 22:28:19 -0400
Subject: [PATCH 208/267] Cleaned up the non-kokkos part of
 KOKKOS/pair_exp6_rx_kokkos.cpp

---
 src/KOKKOS/pair_exp6_rx_kokkos.cpp | 78 +++++++-----------------------
 1 file changed, 17 insertions(+), 61 deletions(-)

diff --git a/src/KOKKOS/pair_exp6_rx_kokkos.cpp b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
index df663c9df9..d1481e6a44 100644
--- a/src/KOKKOS/pair_exp6_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
@@ -187,42 +187,26 @@ void PairExp6rxKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
   {
      const int np_total = nlocal + atom->nghost;
 
-     PairExp6ParamData.epsilon1     = typename AT::t_float_1d("PairExp6ParamData.epsilon1"    ,np_total);
-     PairExp6ParamData.alpha1       = typename AT::t_float_1d("PairExp6ParamData.alpha1"      ,np_total);
-     PairExp6ParamData.rm1          = typename AT::t_float_1d("PairExp6ParamData.rm1"         ,np_total);
+     PairExp6ParamData.epsilon1      = typename AT::t_float_1d("PairExp6ParamData.epsilon1"     ,np_total);
+     PairExp6ParamData.alpha1        = typename AT::t_float_1d("PairExp6ParamData.alpha1"       ,np_total);
+     PairExp6ParamData.rm1           = typename AT::t_float_1d("PairExp6ParamData.rm1"          ,np_total);
      PairExp6ParamData.mixWtSite1    = typename AT::t_float_1d("PairExp6ParamData.mixWtSite1"   ,np_total);
-     PairExp6ParamData.epsilon2     = typename AT::t_float_1d("PairExp6ParamData.epsilon2"    ,np_total);
-     PairExp6ParamData.alpha2       = typename AT::t_float_1d("PairExp6ParamData.alpha2"      ,np_total);
-     PairExp6ParamData.rm2          = typename AT::t_float_1d("PairExp6ParamData.rm2"         ,np_total);
+     PairExp6ParamData.epsilon2      = typename AT::t_float_1d("PairExp6ParamData.epsilon2"     ,np_total);
+     PairExp6ParamData.alpha2        = typename AT::t_float_1d("PairExp6ParamData.alpha2"       ,np_total);
+     PairExp6ParamData.rm2           = typename AT::t_float_1d("PairExp6ParamData.rm2"          ,np_total);
      PairExp6ParamData.mixWtSite2    = typename AT::t_float_1d("PairExp6ParamData.mixWtSite2"   ,np_total);
-     PairExp6ParamData.epsilonOld1  = typename AT::t_float_1d("PairExp6ParamData.epsilonOld1" ,np_total);
-     PairExp6ParamData.alphaOld1    = typename AT::t_float_1d("PairExp6ParamData.alphaOld1"   ,np_total);
-     PairExp6ParamData.rmOld1       = typename AT::t_float_1d("PairExp6ParamData.rmOld1"      ,np_total);
+     PairExp6ParamData.epsilonOld1   = typename AT::t_float_1d("PairExp6ParamData.epsilonOld1"  ,np_total);
+     PairExp6ParamData.alphaOld1     = typename AT::t_float_1d("PairExp6ParamData.alphaOld1"    ,np_total);
+     PairExp6ParamData.rmOld1        = typename AT::t_float_1d("PairExp6ParamData.rmOld1"       ,np_total);
      PairExp6ParamData.mixWtSite1old = typename AT::t_float_1d("PairExp6ParamData.mixWtSite1old",np_total);
-     PairExp6ParamData.epsilonOld2  = typename AT::t_float_1d("PairExp6ParamData.epsilonOld2" ,np_total);
-     PairExp6ParamData.alphaOld2    = typename AT::t_float_1d("PairExp6ParamData.alphaOld2"   ,np_total);
-     PairExp6ParamData.rmOld2       = typename AT::t_float_1d("PairExp6ParamData.rmOld2"      ,np_total);
+     PairExp6ParamData.epsilonOld2   = typename AT::t_float_1d("PairExp6ParamData.epsilonOld2"  ,np_total);
+     PairExp6ParamData.alphaOld2     = typename AT::t_float_1d("PairExp6ParamData.alphaOld2"    ,np_total);
+     PairExp6ParamData.rmOld2        = typename AT::t_float_1d("PairExp6ParamData.rmOld2"       ,np_total);
      PairExp6ParamData.mixWtSite2old = typename AT::t_float_1d("PairExp6ParamData.mixWtSite2old",np_total);
 
-     //Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairExp6rxgetMixingWeights>(0,np_total),*this);
-
-     //typename AT::t_float_1d epsilon1     ("epsilon1"    ,np_total);
-     //typename AT::t_float_1d alpha1       ("alpha1"      ,np_total);
-     //typename AT::t_float_1d rm1          ("rm1"         ,np_total);
-     //typename AT::t_float_1d mixWtSite1   ("mixWtSite1"   ,np_total);
-     //typename AT::t_float_1d epsilon2     ("epsilon2"    ,np_total);
-     //typename AT::t_float_1d alpha2       ("alpha2"      ,np_total);
-     //typename AT::t_float_1d rm2          ("rm2"         ,np_total);
-     //typename AT::t_float_1d mixWtSite2   ("mixWtSite2"   ,np_total);
-     //typename AT::t_float_1d epsilonOld1  ("epsilonOld1" ,np_total);
-     //typename AT::t_float_1d alphaOld1    ("alphaOld1"   ,np_total);
-     //typename AT::t_float_1d rmOld1       ("rmOld1"      ,np_total);
-     //typename AT::t_float_1d mixWtSite1old("mixWtSite1old",np_total);
-     //typename AT::t_float_1d epsilonOld2  ("epsilonOld2" ,np_total);
-     //typename AT::t_float_1d alphaOld2    ("alphaOld2"   ,np_total);
-     //typename AT::t_float_1d rmOld2       ("rmOld2"      ,np_total);
-     //typename AT::t_float_1d mixWtSite2old("mixWtSite2old",np_total);
-
+#ifdef KOKKOS_HAVE_CUDA
+     Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairExp6rxgetMixingWeights>(0,np_total),*this);
+#else
      int errorFlag = 0;
      getMixingWeightsVect (np_total, errorFlag, PairExp6ParamData.epsilon1,
                                                 PairExp6ParamData.alpha1,
@@ -244,35 +228,7 @@ void PairExp6rxKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
        error->all(FLERR,"The number of molecules in CG particle is less than 10*DBL_EPSILON.");
      else if (errorFlag == 2)
        error->all(FLERR,"Computed fraction less than -10*DBL_EPSILON");
-
-     //#define _test_var(var) { \
-     //  double ref2 = 0, err2 = 0; \
-     //  for (int id = 0; id < np_total; ++id) \
-     //  { \
-     //     double ref = PairExp6ParamData. var [id]; \
-     //     double diff = ref - var[id]; \
-     //     ref2 += ref*ref; \
-     //     err2 += diff*diff; \
-     //  } \
-     //  if (ref2 < 1e-20) ref2 = 1.0; \
-     //  if (sqrt(err2)/sqrt(ref2) > 1e-12) \
-     //     printf("%s: %e %e %e\n", # var, sqrt(ref2), sqrt(err2), sqrt(err2)/sqrt(ref2)); \
-     //}
-     //_test_var( epsilon1);
-     //_test_var( alpha1);
-     //_test_var( rm1);
-     //_test_var( epsilon2);
-     //_test_var( alpha2);
-     //_test_var( rm2);
-     //_test_var( mixWtSite2);
-     //_test_var( epsilonOld1);
-     //_test_var( alphaOld1);
-     //_test_var( rmOld1);
-     //_test_var( mixWtSite1old);
-     //_test_var( epsilonOld2);
-     //_test_var( alphaOld2);
-     //_test_var( rmOld2);
-     //_test_var( mixWtSite2old);
+#endif
   }
   TimerType t_mix_stop = getTimeStamp();
 
@@ -349,7 +305,7 @@ void PairExp6rxKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
   copymode = 0;
 
   TimerType t_stop = getTimeStamp();
-  printf("PairExp6rxKokkos::compute %f %f\n", getElapsedTime(t_start, t_stop), getElapsedTime(t_mix_start, t_mix_stop));
+  //printf("PairExp6rxKokkos::compute %f %f\n", getElapsedTime(t_start, t_stop), getElapsedTime(t_mix_start, t_mix_stop));
 }
 
 template<class DeviceType>

From 64fdb1f528bcaacc8c6a7ad1ea1b4824533af838 Mon Sep 17 00:00:00 2001
From: "Christopher P. Stone" <chris.stone@computational-science.com>
Date: Fri, 17 Mar 2017 15:52:40 -0400
Subject: [PATCH 209/267] Kokkos/pair_exp6_rx_kokkos optimized for SIMD on the
 inner j-loop.

---
 src/KOKKOS/pair_exp6_rx_kokkos.cpp | 518 ++++++++++++++++++++++++++++-
 src/KOKKOS/pair_exp6_rx_kokkos.h   |   4 +
 2 files changed, 521 insertions(+), 1 deletion(-)

diff --git a/src/KOKKOS/pair_exp6_rx_kokkos.cpp b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
index 64a91c9e65..85d919091f 100644
--- a/src/KOKKOS/pair_exp6_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
@@ -349,7 +349,7 @@ void PairExp6rxKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
   copymode = 0;
 
   TimerType t_stop = getTimeStamp();
-  //printf("PairExp6rxKokkos::compute %f %f\n", getElapsedTime(t_start, t_stop), getElapsedTime(t_mix_start, t_mix_stop));
+  printf("PairExp6rxKokkos::compute %f %f\n", getElapsedTime(t_start, t_stop), getElapsedTime(t_mix_start, t_mix_stop));
 }
 
 template<class DeviceType>
@@ -378,6 +378,14 @@ template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
 KOKKOS_INLINE_FUNCTION
 void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxCompute<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int &ii, EV_FLOAT& ev) const {
 
+  {
+    if (isite1 == isite2)
+      this->vectorized_operator<NEIGHFLAG,NEWTON_PAIR,EVFLAG,true, true>(ii, ev);
+    else
+      this->vectorized_operator<NEIGHFLAG,NEWTON_PAIR,EVFLAG,false,true>(ii, ev);
+    return;
+  }
+
   // These arrays are atomic for Half/Thread neighbor style
   Kokkos::View<F_FLOAT*[3], typename DAT::t_f_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_f = f;
   Kokkos::View<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_uCG = uCG;
@@ -734,6 +742,14 @@ template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
 KOKKOS_INLINE_FUNCTION
 void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxComputeNoAtomics<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int &ii, EV_FLOAT& ev) const {
 
+  {
+    if (isite1 == isite2)
+      this->vectorized_operator<NEIGHFLAG,NEWTON_PAIR,EVFLAG,true, false>(ii, ev);
+    else
+      this->vectorized_operator<NEIGHFLAG,NEWTON_PAIR,EVFLAG,false,false>(ii, ev);
+    return;
+  }
+
   int tid = 0;
 #ifndef KOKKOS_HAVE_CUDA
   tid = DeviceType::hardware_thread_id();
@@ -1075,6 +1091,506 @@ void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxComputeNoAtomics<NEIG
   t_uCGnew(tid,i) += uCGnew_i;
 }
 
+// Experimental thread-safe approach using duplicated data instead of atomics and
+// temporary local short vector arrays for the inner j-loop to increase vectorization.
+
+// Integer power x^n resolved at compile time; only n == 12 is implemented.
+template<int n>
+  KOKKOS_INLINE_FUNCTION
+double __powint(const double& x, const int)
+{
+   // Every instantiation other than n == 12 is rejected right here, so the
+   // body below needs no branch on n.  The unnamed int argument is ignored.
+   static_assert(n == 12, "__powint<> only supports specific integer powers.");
+
+   // x^12 = (x^3)^4
+   const double cube = x*x*x;
+   return cube*cube*cube*cube;
+}
+
+template<class DeviceType>
+  template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG, bool Site1EqSite2, bool UseAtomics>
+KOKKOS_INLINE_FUNCTION
+void PairExp6rxKokkos<DeviceType>::vectorized_operator(const int &ii, EV_FLOAT& ev) const
+{
+  // These arrays are atomic for Half/Thread neighbor style
+  Kokkos::View<F_FLOAT*[3], typename DAT::t_f_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_f = f;
+  Kokkos::View<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_uCG = uCG;
+  Kokkos::View<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_uCGnew = uCGnew;
+
+  int tid = 0;
+#ifndef KOKKOS_HAVE_CUDA
+  tid = DeviceType::hardware_thread_id();
+#endif
+
+  const int nRep = 12;
+  const double shift = 1.05;
+
+  const int i = d_ilist[ii];
+  const double xtmp = x(i,0);
+  const double ytmp = x(i,1);
+  const double ztmp = x(i,2);
+  const int itype = type[i];
+  const int jnum = d_numneigh[i];
+
+  double fx_i = 0.0;
+  double fy_i = 0.0;
+  double fz_i = 0.0;
+  double uCG_i = 0.0;
+  double uCGnew_i = 0.0;
+
+  // Constant values for this atom.
+  const double epsilon1_i      = PairExp6ParamData.epsilon1[i];
+  const double alpha1_i        = PairExp6ParamData.alpha1[i];
+  const double rm1_i           = PairExp6ParamData.rm1[i];
+  const double mixWtSite1_i    = PairExp6ParamData.mixWtSite1[i];
+  const double epsilon2_i      = PairExp6ParamData.epsilon2[i];
+  const double alpha2_i        = PairExp6ParamData.alpha2[i];
+  const double rm2_i           = PairExp6ParamData.rm2[i];
+  const double mixWtSite2_i    = PairExp6ParamData.mixWtSite2[i];
+  const double epsilonOld1_i   = PairExp6ParamData.epsilonOld1[i];
+  const double alphaOld1_i     = PairExp6ParamData.alphaOld1[i];
+  const double rmOld1_i        = PairExp6ParamData.rmOld1[i];
+  const double mixWtSite1old_i = PairExp6ParamData.mixWtSite1old[i];
+  const double epsilonOld2_i   = PairExp6ParamData.epsilonOld2[i];
+  const double alphaOld2_i     = PairExp6ParamData.alphaOld2[i];
+  const double rmOld2_i        = PairExp6ParamData.rmOld2[i];
+  const double mixWtSite2old_i = PairExp6ParamData.mixWtSite2old[i];
+
+  // Do error testing locally.
+  bool hasError = false;
+
+  // Process this many neighbors concurrently -- if possible.
+  const int batchSize = 8;
+
+  int neigh_j[batchSize];
+  double evdwlOld_j[batchSize];
+  double uCGnew_j[batchSize];
+  double fpair_j[batchSize];
+  double delx_j[batchSize];
+  double dely_j[batchSize];
+  double delz_j[batchSize];
+  double cutsq_j[batchSize];
+  //double j_epsilon1[batchSize]      ;
+  //double j_alpha1[batchSize]        ;
+  //double j_rm1[batchSize]           ;
+  //double j_mixWtSite1[batchSize]    ;
+  //double j_epsilon2[batchSize]      ;
+  //double j_alpha2[batchSize]        ;
+  //double j_rm2[batchSize]           ;
+  //double j_mixWtSite2[batchSize]    ;
+  //double j_epsilonOld1[batchSize]   ;
+  //double j_alphaOld1[batchSize]     ;
+  //double j_rmOld1[batchSize]        ;
+  //double j_mixWtSite1old[batchSize] ;
+  //double j_epsilonOld2[batchSize]   ;
+  //double j_alphaOld2[batchSize]     ;
+  //double j_rmOld2[batchSize]        ;
+  //double j_mixWtSite2old[batchSize] ;
+
+  for (int jptr = 0; jptr < jnum; )
+  {
+    // The core computation here is very expensive so let's only bother with
+    // those that pass rsq < cutsq.
+
+    for (int j = 0; j < batchSize; ++j)
+    {
+      evdwlOld_j[j] = 0.0;
+      uCGnew_j[j] = 0.0;
+      fpair_j[j] = 0.0;
+      //delx_j[j] = 0.0;
+      //dely_j[j] = 0.0;
+      //delz_j[j] = 0.0;
+      //cutsq_j[j] = 0.0;
+    }
+
+    int niters = 0;
+
+    for (; (jptr < jnum) && (niters < batchSize); ++jptr)
+    {
+      const int j = d_neighbors(i,jptr) & NEIGHMASK;
+
+      const double delx = xtmp - x(j,0);
+      const double dely = ytmp - x(j,1);
+      const double delz = ztmp - x(j,2);
+
+      const double rsq = delx*delx + dely*dely + delz*delz;
+      const int jtype = type[j];
+
+      if (rsq < d_cutsq(itype,jtype))
+      {
+        delx_j [niters] = delx;
+        dely_j [niters] = dely;
+        delz_j [niters] = delz;
+        cutsq_j[niters] = d_cutsq(itype,jtype);
+
+        neigh_j[niters] = d_neighbors(i,jptr);
+
+        //j_epsilon1[niters]      = PairExp6ParamData.epsilon1[j];
+        //j_alpha1[niters]        = PairExp6ParamData.alpha1[j];
+        //j_rm1[niters]           = PairExp6ParamData.rm1[j];
+        //j_mixWtSite1[niters]    = PairExp6ParamData.mixWtSite1[j];
+        //j_epsilon2[niters]      = PairExp6ParamData.epsilon2[j];
+        //j_alpha2[niters]        = PairExp6ParamData.alpha2[j];
+        //j_rm2[niters]           = PairExp6ParamData.rm2[j];
+        //j_mixWtSite2[niters]    = PairExp6ParamData.mixWtSite2[j];
+        //j_epsilonOld1[niters]   = PairExp6ParamData.epsilonOld1[j];
+        //j_alphaOld1[niters]     = PairExp6ParamData.alphaOld1[j];
+        //j_rmOld1[niters]        = PairExp6ParamData.rmOld1[j];
+        //j_mixWtSite1old[niters] = PairExp6ParamData.mixWtSite1old[j];
+        //j_epsilonOld2[niters]   = PairExp6ParamData.epsilonOld2[j];
+        //j_alphaOld2[niters]     = PairExp6ParamData.alphaOld2[j];
+        //j_rmOld2[niters]        = PairExp6ParamData.rmOld2[j];
+        //j_mixWtSite2old[niters] = PairExp6ParamData.mixWtSite2old[j];
+
+        ++niters;
+      }
+    }
+
+    // reduction here.
+    #pragma simd reduction(+: fx_i, fy_i, fz_i, uCG_i, uCGnew_i) reduction(|: hasError)
+    for (int jlane = 0; jlane < niters; jlane++)
+    {
+      int j = neigh_j[jlane];
+      const double factor_lj = special_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
+      const double delx = delx_j[jlane];
+      const double dely = dely_j[jlane];
+      const double delz = delz_j[jlane];
+
+      const double rsq = delx*delx + dely*dely + delz*delz;
+      // const int jtype = type[j];
+
+      // if (rsq < d_cutsq(itype,jtype)) // optimize
+      {
+        const double r2inv = 1.0/rsq;
+        const double r6inv = r2inv*r2inv*r2inv;
+
+        const double r = sqrt(rsq);
+        const double rCut2inv = 1.0/ cutsq_j[jlane];
+        const double rCut6inv = rCut2inv*rCut2inv*rCut2inv;
+        const double rCut = sqrt( cutsq_j[jlane] );
+        const double rCutInv = 1.0/rCut;
+
+        //
+        // A. Compute the exp-6 potential
+        //
+
+        // A1.  Get alpha, epsilon and rm for particle j
+
+        const double epsilon1_j      = PairExp6ParamData.epsilon1[j];
+        const double alpha1_j        = PairExp6ParamData.alpha1[j];
+        const double rm1_j           = PairExp6ParamData.rm1[j];
+        const double mixWtSite1_j    = PairExp6ParamData.mixWtSite1[j];
+        const double epsilon2_j      = PairExp6ParamData.epsilon2[j];
+        const double alpha2_j        = PairExp6ParamData.alpha2[j];
+        const double rm2_j           = PairExp6ParamData.rm2[j];
+        const double mixWtSite2_j    = PairExp6ParamData.mixWtSite2[j];
+        const double epsilonOld1_j   = PairExp6ParamData.epsilonOld1[j];
+        const double alphaOld1_j     = PairExp6ParamData.alphaOld1[j];
+        const double rmOld1_j        = PairExp6ParamData.rmOld1[j];
+        const double mixWtSite1old_j = PairExp6ParamData.mixWtSite1old[j];
+        const double epsilonOld2_j   = PairExp6ParamData.epsilonOld2[j];
+        const double alphaOld2_j     = PairExp6ParamData.alphaOld2[j];
+        const double rmOld2_j        = PairExp6ParamData.rmOld2[j];
+        const double mixWtSite2old_j = PairExp6ParamData.mixWtSite2old[j];
+        //const double epsilon1_j      = j_epsilon1[jlane];
+        //const double alpha1_j        = j_alpha1[jlane];
+        //const double rm1_j           = j_rm1[jlane];
+        //const double mixWtSite1_j    = j_mixWtSite1[jlane];
+        //const double epsilon2_j      = j_epsilon2[jlane];
+        //const double alpha2_j        = j_alpha2[jlane];
+        //const double rm2_j           = j_rm2[jlane];
+        //const double mixWtSite2_j    = j_mixWtSite2[jlane];
+        //const double epsilonOld1_j   = j_epsilonOld1[jlane];
+        //const double alphaOld1_j     = j_alphaOld1[jlane];
+        //const double rmOld1_j        = j_rmOld1[jlane];
+        //const double mixWtSite1old_j = j_mixWtSite1old[jlane];
+        //const double epsilonOld2_j   = j_epsilonOld2[jlane];
+        //const double alphaOld2_j     = j_alphaOld2[jlane];
+        //const double rmOld2_j        = j_rmOld2[jlane];
+        //const double mixWtSite2old_j = j_mixWtSite2old[jlane];
+
+        // A2.  Apply Lorentz-Berthelot mixing rules for the i-j pair
+        const double alphaOld12_ij = sqrt(alphaOld1_i*alphaOld2_j);
+        const double rmOld12_ij = 0.5*(rmOld1_i + rmOld2_j);
+        const double epsilonOld12_ij = sqrt(epsilonOld1_i*epsilonOld2_j);
+        const double alphaOld21_ij = sqrt(alphaOld2_i*alphaOld1_j);
+        const double rmOld21_ij = 0.5*(rmOld2_i + rmOld1_j);
+        const double epsilonOld21_ij = sqrt(epsilonOld2_i*epsilonOld1_j);
+
+        const double alpha12_ij = sqrt(alpha1_i*alpha2_j);
+        const double rm12_ij = 0.5*(rm1_i + rm2_j);
+        const double epsilon12_ij = sqrt(epsilon1_i*epsilon2_j);
+        const double alpha21_ij = sqrt(alpha2_i*alpha1_j);
+        const double rm21_ij = 0.5*(rm2_i + rm1_j);
+        const double epsilon21_ij = sqrt(epsilon2_i*epsilon1_j);
+
+        double evdwlOldEXP6_12 = 0.0;
+        double evdwlOldEXP6_21 = 0.0;
+        double evdwlEXP6_12 = 0.0;
+        double evdwlEXP6_21 = 0.0;
+        double fpairOldEXP6_12 = 0.0;
+        double fpairOldEXP6_21 = 0.0;
+
+        if(rmOld12_ij!=0.0 && rmOld21_ij!=0.0)
+        {
+          hasError |= (alphaOld21_ij == 6.0 || alphaOld12_ij == 6.0);
+
+          // A3.  Compute some convenient quantities for evaluating the force
+          double rminv = 1.0/rmOld12_ij;
+          double buck1 = epsilonOld12_ij / (alphaOld12_ij - 6.0);
+          double rexp = expValue(alphaOld12_ij*(1.0-r*rminv));
+          double rm2ij = rmOld12_ij*rmOld12_ij;
+          double rm6ij = rm2ij*rm2ij*rm2ij;
+
+          // Compute the shifted potential
+          double rCutExp = expValue(alphaOld12_ij*(1.0-rCut*rminv));
+          double buck2 = 6.0*alphaOld12_ij;
+          double urc = buck1*(6.0*rCutExp - alphaOld12_ij*rm6ij*rCut6inv);
+          double durc = -buck1*buck2*(rCutExp* rminv - rCutInv*rm6ij*rCut6inv);
+          double rin1 = shift*rmOld12_ij*func_rin(alphaOld12_ij);
+
+          if(r < rin1){
+            const double rin6 = rin1*rin1*rin1*rin1*rin1*rin1;
+            const double rin6inv = 1.0/rin6;
+
+            const double rin1exp = expValue(alphaOld12_ij*(1.0-rin1*rminv));
+
+            const double uin1 = buck1*(6.0*rin1exp - alphaOld12_ij*rm6ij*rin6inv) - urc - durc*(rin1-rCut);
+
+            const double win1 = buck1*buck2*(rin1*rin1exp*rminv - rm6ij*rin6inv) + rin1*durc;
+
+            const double aRep = win1*__powint<12>(rin1,nRep)/nRep;
+
+            const double uin1rep = aRep/__powint<12>(rin1,nRep);
+
+            const double forceExp6 = double(nRep)*aRep/__powint<12>(r,nRep);
+            fpairOldEXP6_12 = factor_lj*forceExp6*r2inv;
+
+            evdwlOldEXP6_12 = uin1 - uin1rep + aRep/__powint<12>(r,nRep);
+          } else {
+            const double forceExp6 = buck1*buck2*(r*rexp*rminv - rm6ij*r6inv) + r*durc;
+            fpairOldEXP6_12 = factor_lj*forceExp6*r2inv;
+
+            evdwlOldEXP6_12 = buck1*(6.0*rexp - alphaOld12_ij*rm6ij*r6inv) - urc - durc*(r-rCut);
+          }
+
+          // A3.  Compute some convenient quantities for evaluating the force
+          rminv = 1.0/rmOld21_ij;
+          buck1 = epsilonOld21_ij / (alphaOld21_ij - 6.0);
+          buck2 = 6.0*alphaOld21_ij;
+          rexp = expValue(alphaOld21_ij*(1.0-r*rminv));
+          rm2ij = rmOld21_ij*rmOld21_ij;
+          rm6ij = rm2ij*rm2ij*rm2ij;
+
+          // Compute the shifted potential
+          rCutExp = expValue(alphaOld21_ij*(1.0-rCut*rminv));
+          buck2 = 6.0*alphaOld21_ij;
+          urc = buck1*(6.0*rCutExp - alphaOld21_ij*rm6ij*rCut6inv);
+          durc = -buck1*buck2*(rCutExp* rminv - rCutInv*rm6ij*rCut6inv);
+          rin1 = shift*rmOld21_ij*func_rin(alphaOld21_ij);
+
+          if(r < rin1){
+            const double rin6 = rin1*rin1*rin1*rin1*rin1*rin1;
+            const double rin6inv = 1.0/rin6;
+
+            const double rin1exp = expValue(alphaOld21_ij*(1.0-rin1*rminv));
+
+            const double uin1 = buck1*(6.0*rin1exp - alphaOld21_ij*rm6ij*rin6inv) - urc - durc*(rin1-rCut);
+
+            const double win1 = buck1*buck2*(rin1*rin1exp*rminv - rm6ij*rin6inv) + rin1*durc;
+
+            const double aRep = win1*__powint<12>(rin1,nRep)/nRep;
+
+            const double uin1rep = aRep/__powint<12>(rin1,nRep);
+
+            const double forceExp6 = double(nRep)*aRep/__powint<12>(r,nRep);
+            fpairOldEXP6_21 = factor_lj*forceExp6*r2inv;
+
+            evdwlOldEXP6_21 = uin1 - uin1rep + aRep/__powint<12>(r,nRep);
+          } else {
+            const double forceExp6 = buck1*buck2*(r*rexp*rminv - rm6ij*r6inv) + r*durc;
+            fpairOldEXP6_21 = factor_lj*forceExp6*r2inv;
+
+            evdwlOldEXP6_21 = buck1*(6.0*rexp - alphaOld21_ij*rm6ij*r6inv) - urc - durc*(r-rCut);
+          }
+
+          double evdwlOld;
+          if (Site1EqSite2)
+            evdwlOld = sqrt(mixWtSite1old_i*mixWtSite2old_j)*evdwlOldEXP6_12;
+          else
+            evdwlOld = sqrt(mixWtSite1old_i*mixWtSite2old_j)*evdwlOldEXP6_12 + sqrt(mixWtSite2old_i*mixWtSite1old_j)*evdwlOldEXP6_21;
+
+          evdwlOld *= factor_lj;
+
+          uCG_i += 0.5*evdwlOld;
+
+          evdwlOld_j[jlane] = evdwlOld;
+        }
+
+        if(rm12_ij!=0.0 && rm21_ij!=0.0)
+        {
+          hasError |= (alpha21_ij == 6.0 || alpha12_ij == 6.0);
+
+          // A3.  Compute some convenient quantities for evaluating the force
+          double rminv = 1.0/rm12_ij;
+          double buck1 = epsilon12_ij / (alpha12_ij - 6.0);
+          double buck2 = 6.0*alpha12_ij;
+          double rexp = expValue(alpha12_ij*(1.0-r*rminv));
+          double rm2ij = rm12_ij*rm12_ij;
+          double rm6ij = rm2ij*rm2ij*rm2ij;
+
+          // Compute the shifted potential
+          double rCutExp = expValue(alpha12_ij*(1.0-rCut*rminv));
+          double urc = buck1*(6.0*rCutExp - alpha12_ij*rm6ij*rCut6inv);
+          double durc = -buck1*buck2*(rCutExp*rminv - rCutInv*rm6ij*rCut6inv);
+          double rin1 = shift*rm12_ij*func_rin(alpha12_ij);
+
+          if(r < rin1){
+            const double rin6 = rin1*rin1*rin1*rin1*rin1*rin1;
+            const double rin6inv = 1.0/rin6;
+
+            const double rin1exp = expValue(alpha12_ij*(1.0-rin1*rminv));
+
+            const double uin1 = buck1*(6.0*rin1exp - alpha12_ij*rm6ij*rin6inv) - urc - durc*(rin1-rCut);
+
+            const double win1 = buck1*buck2*(rin1*rin1exp*rminv - rm6ij*rin6inv) + rin1*durc;
+
+            const double aRep = win1*__powint<12>(rin1,nRep)/nRep;
+
+            const double uin1rep = aRep/__powint<12>(rin1,nRep);
+
+            evdwlEXP6_12 = uin1 - uin1rep + aRep/__powint<12>(r,nRep);
+          } else {
+            evdwlEXP6_12 = buck1*(6.0*rexp - alpha12_ij*rm6ij*r6inv) - urc - durc*(r-rCut);
+          }
+
+          rminv = 1.0/rm21_ij;
+          buck1 = epsilon21_ij / (alpha21_ij - 6.0);
+          buck2 = 6.0*alpha21_ij;
+          rexp = expValue(alpha21_ij*(1.0-r*rminv));
+          rm2ij = rm21_ij*rm21_ij;
+          rm6ij = rm2ij*rm2ij*rm2ij;
+
+          // Compute the shifted potential
+          rCutExp = expValue(alpha21_ij*(1.0-rCut*rminv));
+          urc = buck1*(6.0*rCutExp - alpha21_ij*rm6ij*rCut6inv);
+          durc = -buck1*buck2*(rCutExp*rminv - rCutInv*rm6ij*rCut6inv);
+          rin1 = shift*rm21_ij*func_rin(alpha21_ij);
+
+          if(r < rin1){
+            const double rin6 = rin1*rin1*rin1*rin1*rin1*rin1;
+            const double rin6inv = 1.0/rin6;
+
+            const double rin1exp = expValue(alpha21_ij*(1.0-rin1*rminv));
+
+            const double uin1 = buck1*(6.0*rin1exp - alpha21_ij*rm6ij*rin6inv) - urc - durc*(rin1-rCut);
+
+            const double win1 = buck1*buck2*(rin1*rin1exp*rminv - rm6ij*rin6inv) + rin1*durc;
+
+            const double aRep = win1*__powint<12>(rin1,nRep)/nRep;
+
+            const double uin1rep = aRep/__powint<12>(rin1,nRep);
+
+            evdwlEXP6_21 = uin1 - uin1rep + aRep/__powint<12>(r,nRep);
+          } else {
+            evdwlEXP6_21 = buck1*(6.0*rexp - alpha21_ij*rm6ij*r6inv) - urc - durc*(r-rCut);
+          }
+        }
+
+        //
+        // Apply Mixing Rule to get the overall force for the CG pair
+        //
+        double fpair;
+        if (Site1EqSite2)
+          fpair = sqrt(mixWtSite1old_i*mixWtSite2old_j)*fpairOldEXP6_12;
+        else
+          fpair = sqrt(mixWtSite1old_i*mixWtSite2old_j)*fpairOldEXP6_12 + sqrt(mixWtSite2old_i*mixWtSite1old_j)*fpairOldEXP6_21;
+
+        double evdwl;
+        if (Site1EqSite2)
+          evdwl = sqrt(mixWtSite1_i*mixWtSite2_j)*evdwlEXP6_12;
+        else
+          evdwl = sqrt(mixWtSite1_i*mixWtSite2_j)*evdwlEXP6_12 + sqrt(mixWtSite2_i*mixWtSite1_j)*evdwlEXP6_21;
+
+        evdwl *= factor_lj;
+
+        fpair_j[jlane] = fpair;
+
+        fx_i += delx*fpair;
+        fy_i += dely*fpair;
+        fz_i += delz*fpair;
+
+        uCGnew_i += 0.5*evdwl;
+        if ((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD))
+          uCGnew_j[jlane] = 0.5*evdwl;
+
+      } // if rsq < cutsq
+
+    } // end jlane loop.
+
+    for (int jlane = 0; jlane < niters; jlane++)
+    {
+      const int j = neigh_j[jlane] & NEIGHMASK;
+
+      if ((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR || j < nlocal))
+        if (UseAtomics)
+          a_uCG(j) += 0.5*evdwlOld_j[jlane];
+        else
+          t_uCG(tid,j) += 0.5*evdwlOld_j[jlane];
+
+      if ((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR || j < nlocal))
+        if (UseAtomics)
+          a_uCGnew(j) += uCGnew_j[jlane];
+        else
+          t_uCGnew(tid,j) += uCGnew_j[jlane];
+
+      if ((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR || j < nlocal)) {
+        if (UseAtomics)
+        {
+          a_f(j,0) -= delx_j[jlane]*fpair_j[jlane];
+          a_f(j,1) -= dely_j[jlane]*fpair_j[jlane];
+          a_f(j,2) -= delz_j[jlane]*fpair_j[jlane];
+        }
+        else
+        {
+          t_f(tid,j,0) -= delx_j[jlane]*fpair_j[jlane];
+          t_f(tid,j,1) -= dely_j[jlane]*fpair_j[jlane];
+          t_f(tid,j,2) -= delz_j[jlane]*fpair_j[jlane];
+        }
+      }
+
+      double evdwl = evdwlOld_j[jlane];
+      if (EVFLAG)
+        ev.evdwl += (((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR||(j<nlocal)))?1.0:0.5)*evdwl;
+      //if (vflag_either || eflag_atom) 
+      if (EVFLAG) this->template ev_tally<NEIGHFLAG,NEWTON_PAIR>(ev,i,j,evdwl,fpair_j[jlane],delx_j[jlane],dely_j[jlane],delz_j[jlane]);
+    }
+  }
+
+  if (hasError)
+    k_error_flag.d_view() = 1;
+
+  if (UseAtomics)
+  {
+    a_f(i,0) += fx_i;
+    a_f(i,1) += fy_i;
+    a_f(i,2) += fz_i;
+    a_uCG(i) += uCG_i;
+    a_uCGnew(i) += uCGnew_i;
+  }
+  else
+  {
+    t_f(tid,i,0) += fx_i;
+    t_f(tid,i,1) += fy_i;
+    t_f(tid,i,2) += fz_i;
+    t_uCG(tid,i) += uCG_i;
+    t_uCGnew(tid,i) += uCGnew_i;
+  }
+}
+
 template<class DeviceType>
 template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
 KOKKOS_INLINE_FUNCTION
diff --git a/src/KOKKOS/pair_exp6_rx_kokkos.h b/src/KOKKOS/pair_exp6_rx_kokkos.h
index ebbc26ea20..6899e5ff62 100644
--- a/src/KOKKOS/pair_exp6_rx_kokkos.h
+++ b/src/KOKKOS/pair_exp6_rx_kokkos.h
@@ -91,6 +91,10 @@ class PairExp6rxKokkos : public PairExp6rx {
   KOKKOS_INLINE_FUNCTION
   void operator()(TagPairExp6rxComputeNoAtomics<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int&, EV_FLOAT&) const;
 
+  template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG, bool Site1EqSite2, bool UseAtomics>
+  KOKKOS_INLINE_FUNCTION
+  void vectorized_operator(const int&, EV_FLOAT&) const;
+
   template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
   KOKKOS_INLINE_FUNCTION
   void operator()(TagPairExp6rxComputeNoAtomics<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int&) const;

From 75670244bb5d30c407e8fc3635f06cf9b08ba817 Mon Sep 17 00:00:00 2001
From: "Christopher P. Stone" <chris.stone@computational-science.com>
Date: Fri, 17 Mar 2017 17:02:47 -0400
Subject: [PATCH 210/267] Added ONE-TYPE template capability to
 vectorized_operator and cleaned up timers.

---
 src/KOKKOS/pair_exp6_rx_kokkos.cpp | 98 +++++++++++-------------------
 src/KOKKOS/pair_exp6_rx_kokkos.h   |  2 +-
 2 files changed, 37 insertions(+), 63 deletions(-)

diff --git a/src/KOKKOS/pair_exp6_rx_kokkos.cpp b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
index 85d919091f..5c74cba8c7 100644
--- a/src/KOKKOS/pair_exp6_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
@@ -348,8 +348,8 @@ void PairExp6rxKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
 
   copymode = 0;
 
-  TimerType t_stop = getTimeStamp();
-  printf("PairExp6rxKokkos::compute %f %f\n", getElapsedTime(t_start, t_stop), getElapsedTime(t_mix_start, t_mix_stop));
+  //TimerType t_stop = getTimeStamp();
+  //printf("PairExp6rxKokkos::compute %f %f\n", getElapsedTime(t_start, t_stop), getElapsedTime(t_mix_start, t_mix_stop));
 }
 
 template<class DeviceType>
@@ -379,10 +379,17 @@ KOKKOS_INLINE_FUNCTION
 void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxCompute<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int &ii, EV_FLOAT& ev) const {
 
   {
+    const bool one_type = (atom->ntypes == 1);
     if (isite1 == isite2)
-      this->vectorized_operator<NEIGHFLAG,NEWTON_PAIR,EVFLAG,true, true>(ii, ev);
+      if (one_type)
+        this->vectorized_operator<NEIGHFLAG,NEWTON_PAIR,EVFLAG,true, true, true>(ii, ev);
+      else
+        this->vectorized_operator<NEIGHFLAG,NEWTON_PAIR,EVFLAG,true, true,false>(ii, ev);
     else
-      this->vectorized_operator<NEIGHFLAG,NEWTON_PAIR,EVFLAG,false,true>(ii, ev);
+      if (one_type)
+        this->vectorized_operator<NEIGHFLAG,NEWTON_PAIR,EVFLAG,false,true, true>(ii, ev);
+      else
+        this->vectorized_operator<NEIGHFLAG,NEWTON_PAIR,EVFLAG,false,true,false>(ii, ev);
     return;
   }
 
@@ -743,10 +750,17 @@ KOKKOS_INLINE_FUNCTION
 void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxComputeNoAtomics<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int &ii, EV_FLOAT& ev) const {
 
   {
+    const bool one_type = (atom->ntypes == 1);
     if (isite1 == isite2)
-      this->vectorized_operator<NEIGHFLAG,NEWTON_PAIR,EVFLAG,true, false>(ii, ev);
+      if (one_type)
+        this->vectorized_operator<NEIGHFLAG,NEWTON_PAIR,EVFLAG,true, false, true>(ii, ev);
+      else
+        this->vectorized_operator<NEIGHFLAG,NEWTON_PAIR,EVFLAG,true, false,false>(ii, ev);
     else
-      this->vectorized_operator<NEIGHFLAG,NEWTON_PAIR,EVFLAG,false,false>(ii, ev);
+      if (one_type)
+        this->vectorized_operator<NEIGHFLAG,NEWTON_PAIR,EVFLAG,false,false, true>(ii, ev);
+      else
+        this->vectorized_operator<NEIGHFLAG,NEWTON_PAIR,EVFLAG,false,false,false>(ii, ev);
     return;
   }
 
@@ -1109,7 +1123,7 @@ double __powint(const double& x, const int)
 }
 
 template<class DeviceType>
-  template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG, bool Site1EqSite2, bool UseAtomics>
+  template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG, bool Site1EqSite2, bool UseAtomics, bool OneType>
 KOKKOS_INLINE_FUNCTION
 void PairExp6rxKokkos<DeviceType>::vectorized_operator(const int &ii, EV_FLOAT& ev) const
 {
@@ -1157,6 +1171,12 @@ void PairExp6rxKokkos<DeviceType>::vectorized_operator(const int &ii, EV_FLOAT&
   const double rmOld2_i        = PairExp6ParamData.rmOld2[i];
   const double mixWtSite2old_i = PairExp6ParamData.mixWtSite2old[i];
 
+  const double cutsq_type11 = d_cutsq(1,1);
+  const double rCut2inv_type11 = 1.0/ cutsq_type11;
+  const double rCut6inv_type11 = rCut2inv_type11*rCut2inv_type11*rCut2inv_type11;
+  const double rCut_type11 = sqrt( cutsq_type11 );
+  const double rCutInv_type11 = 1.0/rCut_type11;
+
   // Do error testing locally.
   bool hasError = false;
 
@@ -1171,22 +1191,6 @@ void PairExp6rxKokkos<DeviceType>::vectorized_operator(const int &ii, EV_FLOAT&
   double dely_j[batchSize];
   double delz_j[batchSize];
   double cutsq_j[batchSize];
-  //double j_epsilon1[batchSize]      ;
-  //double j_alpha1[batchSize]        ;
-  //double j_rm1[batchSize]           ;
-  //double j_mixWtSite1[batchSize]    ;
-  //double j_epsilon2[batchSize]      ;
-  //double j_alpha2[batchSize]        ;
-  //double j_rm2[batchSize]           ;
-  //double j_mixWtSite2[batchSize]    ;
-  //double j_epsilonOld1[batchSize]   ;
-  //double j_alphaOld1[batchSize]     ;
-  //double j_rmOld1[batchSize]        ;
-  //double j_mixWtSite1old[batchSize] ;
-  //double j_epsilonOld2[batchSize]   ;
-  //double j_alphaOld2[batchSize]     ;
-  //double j_rmOld2[batchSize]        ;
-  //double j_mixWtSite2old[batchSize] ;
 
   for (int jptr = 0; jptr < jnum; )
   {
@@ -1217,32 +1221,18 @@ void PairExp6rxKokkos<DeviceType>::vectorized_operator(const int &ii, EV_FLOAT&
       const double rsq = delx*delx + dely*dely + delz*delz;
       const int jtype = type[j];
 
-      if (rsq < d_cutsq(itype,jtype))
+      const double cutsq_ij = (OneType) ? cutsq_type11 : d_cutsq(itype,jtype);
+
+      if (rsq < cutsq_ij)
       {
         delx_j [niters] = delx;
         dely_j [niters] = dely;
         delz_j [niters] = delz;
-        cutsq_j[niters] = d_cutsq(itype,jtype);
+        if (OneType == false)
+          cutsq_j[niters] = cutsq_ij;
 
         neigh_j[niters] = d_neighbors(i,jptr);
 
-        //j_epsilon1[niters]      = PairExp6ParamData.epsilon1[j];
-        //j_alpha1[niters]        = PairExp6ParamData.alpha1[j];
-        //j_rm1[niters]           = PairExp6ParamData.rm1[j];
-        //j_mixWtSite1[niters]    = PairExp6ParamData.mixWtSite1[j];
-        //j_epsilon2[niters]      = PairExp6ParamData.epsilon2[j];
-        //j_alpha2[niters]        = PairExp6ParamData.alpha2[j];
-        //j_rm2[niters]           = PairExp6ParamData.rm2[j];
-        //j_mixWtSite2[niters]    = PairExp6ParamData.mixWtSite2[j];
-        //j_epsilonOld1[niters]   = PairExp6ParamData.epsilonOld1[j];
-        //j_alphaOld1[niters]     = PairExp6ParamData.alphaOld1[j];
-        //j_rmOld1[niters]        = PairExp6ParamData.rmOld1[j];
-        //j_mixWtSite1old[niters] = PairExp6ParamData.mixWtSite1old[j];
-        //j_epsilonOld2[niters]   = PairExp6ParamData.epsilonOld2[j];
-        //j_alphaOld2[niters]     = PairExp6ParamData.alphaOld2[j];
-        //j_rmOld2[niters]        = PairExp6ParamData.rmOld2[j];
-        //j_mixWtSite2old[niters] = PairExp6ParamData.mixWtSite2old[j];
-
         ++niters;
       }
     }
@@ -1268,10 +1258,10 @@ void PairExp6rxKokkos<DeviceType>::vectorized_operator(const int &ii, EV_FLOAT&
         const double r6inv = r2inv*r2inv*r2inv;
 
         const double r = sqrt(rsq);
-        const double rCut2inv = 1.0/ cutsq_j[jlane];
-        const double rCut6inv = rCut2inv*rCut2inv*rCut2inv;
-        const double rCut = sqrt( cutsq_j[jlane] );
-        const double rCutInv = 1.0/rCut;
+        const double rCut2inv = (OneType) ? rCut2inv_type11 : (1.0/ cutsq_j[jlane]);
+        const double rCut6inv = (OneType) ? rCut6inv_type11 : (rCut2inv*rCut2inv*rCut2inv);
+        const double rCut =     (OneType) ? rCut_type11     : (sqrt( cutsq_j[jlane] ));
+        const double rCutInv =  (OneType) ? rCutInv_type11  : (1.0/rCut);
 
         //
         // A. Compute the exp-6 potential
@@ -1295,22 +1285,6 @@ void PairExp6rxKokkos<DeviceType>::vectorized_operator(const int &ii, EV_FLOAT&
         const double alphaOld2_j     = PairExp6ParamData.alphaOld2[j];
         const double rmOld2_j        = PairExp6ParamData.rmOld2[j];
         const double mixWtSite2old_j = PairExp6ParamData.mixWtSite2old[j];
-        //const double epsilon1_j      = j_epsilon1[jlane];
-        //const double alpha1_j        = j_alpha1[jlane];
-        //const double rm1_j           = j_rm1[jlane];
-        //const double mixWtSite1_j    = j_mixWtSite1[jlane];
-        //const double epsilon2_j      = j_epsilon2[jlane];
-        //const double alpha2_j        = j_alpha2[jlane];
-        //const double rm2_j           = j_rm2[jlane];
-        //const double mixWtSite2_j    = j_mixWtSite2[jlane];
-        //const double epsilonOld1_j   = j_epsilonOld1[jlane];
-        //const double alphaOld1_j     = j_alphaOld1[jlane];
-        //const double rmOld1_j        = j_rmOld1[jlane];
-        //const double mixWtSite1old_j = j_mixWtSite1old[jlane];
-        //const double epsilonOld2_j   = j_epsilonOld2[jlane];
-        //const double alphaOld2_j     = j_alphaOld2[jlane];
-        //const double rmOld2_j        = j_rmOld2[jlane];
-        //const double mixWtSite2old_j = j_mixWtSite2old[jlane];
 
         // A2.  Apply Lorentz-Berthelot mixing rules for the i-j pair
         const double alphaOld12_ij = sqrt(alphaOld1_i*alphaOld2_j);
diff --git a/src/KOKKOS/pair_exp6_rx_kokkos.h b/src/KOKKOS/pair_exp6_rx_kokkos.h
index 6899e5ff62..9f38732c32 100644
--- a/src/KOKKOS/pair_exp6_rx_kokkos.h
+++ b/src/KOKKOS/pair_exp6_rx_kokkos.h
@@ -91,7 +91,7 @@ class PairExp6rxKokkos : public PairExp6rx {
   KOKKOS_INLINE_FUNCTION
   void operator()(TagPairExp6rxComputeNoAtomics<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int&, EV_FLOAT&) const;
 
-  template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG, bool Site1EqSite2, bool UseAtomics>
+  template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG, bool Site1EqSite2, bool UseAtomics, bool OneType>
   KOKKOS_INLINE_FUNCTION
   void vectorized_operator(const int&, EV_FLOAT&) const;
 

From 0cd3f0cd63f2305d2408bea36a18302ee11d9326 Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Fri, 17 Mar 2017 19:11:39 -0400
Subject: [PATCH 211/267] USER-DPD: bugfix for npair_half_bin_newton_ssa when
 bonds are involved. Only locals have valid special[] arrays, so when finding
 neighbors of ghosts, we have to swap the arguments to find_special().

---
 src/USER-DPD/npair_half_bin_newton_ssa.cpp | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/src/USER-DPD/npair_half_bin_newton_ssa.cpp b/src/USER-DPD/npair_half_bin_newton_ssa.cpp
index ab439d3731..a6479d4c4f 100644
--- a/src/USER-DPD/npair_half_bin_newton_ssa.cpp
+++ b/src/USER-DPD/npair_half_bin_newton_ssa.cpp
@@ -250,11 +250,6 @@ void NPairHalfBinNewtonSSA::build(NeighList *list)
       xtmp = x[i][0];
       ytmp = x[i][1];
       ztmp = x[i][2];
-      if (moltemplate) {
-        imol = molindex[i];
-        iatom = molatom[i];
-        tagprev = tag[i] - iatom - 1;
-      }
 
       ibin = coord2bin(x[i],xbin,ybin,zbin);
 
@@ -281,12 +276,16 @@ void NPairHalfBinNewtonSSA::build(NeighList *list)
           if (rsq <= cutneighsq[itype][jtype]) {
             if (molecular) {
               if (!moltemplate)
-                which = find_special(special[i],nspecial[i],tag[j]);
-              else if (imol >= 0)
-                which = find_special(onemols[imol]->special[iatom],
-                                     onemols[imol]->nspecial[iatom],
-                                     tag[j]-tagprev);
-              else which = 0;
+                which = find_special(special[j],nspecial[j],tag[i]);
+              else {
+                int jmol = molindex[j];
+                if (jmol >= 0) {
+                  int jatom = molatom[j];
+                  which = find_special(onemols[jmol]->special[jatom],
+                                     onemols[jmol]->nspecial[jatom],
+                                     tag[i] - (tag[j] - jatom - 1));
+                } else which = 0;
+              }
               if (which == 0) neighptr[n++] = j;
               else if (domain->minimum_image_check(delx,dely,delz))
                 neighptr[n++] = j;

From fff43a4604a29de23f23f066027b5f8e41cd33f4 Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Fri, 17 Mar 2017 19:33:04 -0400
Subject: [PATCH 212/267] USER-DPD Kokkos: bugfix for npair_ssa_kokkos.cpp
 corresponding to 0cd3f0cd

---
 src/KOKKOS/npair_ssa_kokkos.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/KOKKOS/npair_ssa_kokkos.cpp b/src/KOKKOS/npair_ssa_kokkos.cpp
index a9b59bfc96..7b5a569051 100644
--- a/src/KOKKOS/npair_ssa_kokkos.cpp
+++ b/src/KOKKOS/npair_ssa_kokkos.cpp
@@ -515,11 +515,11 @@ void NPairSSAKokkosExecute<DeviceType>::build_ghosts()
             if(rsq <= cutneighsq(itype,jtype)) {
               if (molecular) {
                 if (!moltemplate)
-                  which = find_special(i,j);
-                    /* else if (imol >= 0) */
-                    /*   which = find_special(onemols[imol]->special[iatom], */
-                    /*                        onemols[imol]->nspecial[iatom], */
-                    /*                        tag[j]-tagprev); */
+                  which = find_special(j,i);
+                    /* else if (jmol >= 0) */
+                    /*   which = find_special(onemols[jmol]->special[jatom], */
+                    /*                        onemols[jmol]->nspecial[jatom], */
+                    /*                        tag[i]-jtagprev); */
                     /* else which = 0; */
                 if (which == 0){
                   if(n<neigh_list.maxneighs) neighbors_i(n++) = j;

From a68f3a93e53a99ddfc8ffe371cf84f3ed31a613c Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Sun, 19 Mar 2017 21:12:52 -0400
Subject: [PATCH 213/267] USER-DPD Kokkos: bugfix, add a missing line of code
 in pair_exp6_rx_kokkos.cpp

---
 src/KOKKOS/pair_exp6_rx_kokkos.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/KOKKOS/pair_exp6_rx_kokkos.cpp b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
index abc158d72c..23fb4f59e5 100644
--- a/src/KOKKOS/pair_exp6_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
@@ -917,6 +917,7 @@ void PairExp6rxKokkos<DeviceType>::getMixingWeights(int id,double &epsilon1,doub
       nMoleculesOld2 = dvector(ispecies+nspecies,id);
       nMolecules2 = dvector(ispecies,id);
       fractionOld2 = dvector(ispecies+nspecies,id)/nTotalold;
+      fraction2 = nMolecules2/nTotal;
     }
 
     // If Site1 or Site2 matches is a fluid, then compute the paramters

From 3c91f9734dbb97f293718b0c3f5e5d5d98accc38 Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Tue, 21 Mar 2017 17:12:09 -0400
Subject: [PATCH 214/267] make RK solver check in fix_rx_kokkos.cpp be as
 lenient as in fix_rx.cpp. NOTE: the (y < -MY_EPSILON) test was too strict, but
 we don't know by how much. This needs to be revisited before merging back to
 LAMMPS master.

---
 src/KOKKOS/fix_rx_kokkos.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/KOKKOS/fix_rx_kokkos.cpp b/src/KOKKOS/fix_rx_kokkos.cpp
index ac81e5c2a7..d994b2c5d1 100644
--- a/src/KOKKOS/fix_rx_kokkos.cpp
+++ b/src/KOKKOS/fix_rx_kokkos.cpp
@@ -1389,9 +1389,9 @@ void FixRxKokkos<DeviceType>::operator()(Tag_FixRxKokkos_solveSystems<ZERO_RATES
     // Store the solution back in dvector.
     for (int ispecies = 0; ispecies < nspecies; ispecies++)
     {
-      if (y[ispecies] < -MY_EPSILON)
+      if (y[ispecies] < -1.0e-10)
       {
-        //error->one(FLERR,"Computed concentration in RK solver is < -10*DBL_EPSILON");
+        //error->one(FLERR,"Computed concentration in RK solver is < -1.0e-10");
         k_error_flag.d_view() = 2;
         // This should be an atomic update.
       }
@@ -1599,9 +1599,9 @@ void FixRxKokkos<DeviceType>::solve_reactions(const int vflag, const bool isPreF
         // Store the solution back in dvector.
         for (int ispecies = 0; ispecies < nspecies; ispecies++)
         {
-          if (y[ispecies] < -MY_EPSILON)
+          if (y[ispecies] < -1.0e-10)
           {
-            //error->one(FLERR,"Computed concentration in RK solver is < -10*DBL_EPSILON");
+            //error->one(FLERR,"Computed concentration in RK solver is < -1.0e-10");
             k_error_flag.d_view() = 2;
             // This should be an atomic update.
           }
@@ -1639,7 +1639,7 @@ void FixRxKokkos<DeviceType>::solve_reactions(const int vflag, const bool isPreF
   k_error_flag.template modify<DeviceType>();
   k_error_flag.template sync<LMPHostType>();
   if (k_error_flag.h_view() == 2)
-    error->one(FLERR,"Computed concentration in RK solver is < -10*DBL_EPSILON");
+    error->one(FLERR,"Computed concentration in RK solver is < -1.0e-10");
 
   // Signal that dvector has been modified on this execution space.
   atomKK->modified( execution_space, DVECTOR_MASK );

From b418b46a03acf427c5d9eb015d1ea202557caed8 Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Sun, 26 Mar 2017 23:07:48 -0400
Subject: [PATCH 215/267] USER-DPD: bugfix for an array that changed length in
 the non-kokkos version.

---
 src/KOKKOS/npair_ssa_kokkos.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/KOKKOS/npair_ssa_kokkos.cpp b/src/KOKKOS/npair_ssa_kokkos.cpp
index 7b5a569051..699c2d3269 100644
--- a/src/KOKKOS/npair_ssa_kokkos.cpp
+++ b/src/KOKKOS/npair_ssa_kokkos.cpp
@@ -122,8 +122,8 @@ void NPairSSAKokkos<DeviceType>::copy_stencil_info()
   NStencilSSA *ns_ssa = dynamic_cast<NStencilSSA*>(ns);
   if (!ns_ssa) error->one(FLERR, "NStencil wasn't a NStencilSSA object");
 
-  k_nstencil_ssa = DAT::tdual_int_1d("NPairSSAKokkos:nstencil_ssa",8);
-  for (int k = 0; k < 8; ++k) {
+  k_nstencil_ssa = DAT::tdual_int_1d("NPairSSAKokkos:nstencil_ssa",5);
+  for (int k = 0; k < 5; ++k) {
     k_nstencil_ssa.h_view(k) = ns_ssa->nstencil_ssa[k];
   }
   k_nstencil_ssa.modify<LMPHostType>();

From 5f0823172c4a2f76f6385e011e90876a08e7390c Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Mon, 27 Mar 2017 06:35:19 -0400
Subject: [PATCH 216/267] Make read_restart properly size the atom_vec_* data
 when reading via mpiio

---
 src/read_restart.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/read_restart.cpp b/src/read_restart.cpp
index 6a950353ef..92d21a7062 100644
--- a/src/read_restart.cpp
+++ b/src/read_restart.cpp
@@ -208,6 +208,7 @@ void ReadRestart::command(int narg, char **arg)
     mpiio->read((headerOffset+assignedChunkOffset),assignedChunkSize,buf);
     mpiio->close();
 
+    if (assignedChunkSize > atom->nmax) avec->grow(assignedChunkSize);
     m = 0;
     while (m < assignedChunkSize) m += avec->unpack_restart(&buf[m]);
   }

From 28784a4ce2de90728365717a768c5d0f5b17772a Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Mon, 27 Mar 2017 08:38:40 -0500
Subject: [PATCH 217/267] Now with the correct math, make read_restart properly
 size the atom_vec_* data

---
 src/read_restart.cpp | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/src/read_restart.cpp b/src/read_restart.cpp
index 92d21a7062..f29a603ef6 100644
--- a/src/read_restart.cpp
+++ b/src/read_restart.cpp
@@ -207,8 +207,23 @@ void ReadRestart::command(int narg, char **arg)
     memory->create(buf,assignedChunkSize,"read_restart:buf");
     mpiio->read((headerOffset+assignedChunkOffset),assignedChunkSize,buf);
     mpiio->close();
-
-    if (assignedChunkSize > atom->nmax) avec->grow(assignedChunkSize);
+    if (!nextra) { // We can actually calculate number of atoms from assignedChunkSize
+      atom->nlocal = 1; // temporarily claim there is one atom...
+      int perAtomSize = avec->size_restart(); // ...so we can get its size
+      atom->nlocal = 0; // restore nlocal to zero atoms
+      int atomCt = (int) (assignedChunkSize / perAtomSize);
+#ifdef DEBUG_ME_NOTNOW
+fprintf(stdout, "ReadRestart::command %04d: pAS %d, aCt %d, nmax %d, chunckSize %12.0f, %12.0f\n"
+  ,me
+  ,perAtomSize
+  ,atomCt
+  ,atom->nmax
+  ,(double) assignedChunkSize
+  ,((double) perAtomSize) * atomCt
+);
+#endif
+      if (atomCt > atom->nmax) avec->grow(atomCt);
+    }
     m = 0;
     while (m < assignedChunkSize) m += avec->unpack_restart(&buf[m]);
   }

From 0463923e330b070e578a38f7a2a0eda2b033c8c4 Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Mon, 27 Mar 2017 10:41:32 -0500
Subject: [PATCH 218/267] USER-DPD Kokkos: tighten up the SSA data allocation
 to what is needed. A future version was planned to use more space for a ghost
 work queue.

---
 src/KOKKOS/npair_ssa_kokkos.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/KOKKOS/npair_ssa_kokkos.cpp b/src/KOKKOS/npair_ssa_kokkos.cpp
index 699c2d3269..59470189bc 100644
--- a/src/KOKKOS/npair_ssa_kokkos.cpp
+++ b/src/KOKKOS/npair_ssa_kokkos.cpp
@@ -205,7 +205,7 @@ void NPairSSAKokkos<DeviceType>::build(NeighList *list_)
 {
   NeighListKokkos<DeviceType>* list = (NeighListKokkos<DeviceType>*) list_;
   const int nlocal = includegroup?atom->nfirst:atom->nlocal;
-  const int nl_size = (nlocal + atom->nghost) * 4;
+  const int nl_size = (nlocal * 4) + atom->nghost;
   list->grow(nl_size); // Make special larger SSA neighbor list
 
   ssa_phaseCt = sz1*sy1*sx1;

From 661bd37e15bcb83266dbd28b294889fe0ece9554 Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Mon, 27 Mar 2017 14:53:48 -0500
Subject: [PATCH 219/267] Make read_restart evenly divide the work of reading
 when using mpiio. Currently only affects restart files written without any
 per-atom fix data.

---
 src/read_restart.cpp | 90 +++++++++++++++++++++++++++++++++-----------
 1 file changed, 69 insertions(+), 21 deletions(-)

diff --git a/src/read_restart.cpp b/src/read_restart.cpp
index f29a603ef6..331a5d6cda 100644
--- a/src/read_restart.cpp
+++ b/src/read_restart.cpp
@@ -212,7 +212,8 @@ void ReadRestart::command(int narg, char **arg)
       int perAtomSize = avec->size_restart(); // ...so we can get its size
       atom->nlocal = 0; // restore nlocal to zero atoms
       int atomCt = (int) (assignedChunkSize / perAtomSize);
-#ifdef DEBUG_ME_NOTNOW
+//#define DEBUG_PRE_GROW
+#ifdef DEBUG_PRE_GROW
 fprintf(stdout, "ReadRestart::command %04d: pAS %d, aCt %d, nmax %d, chunckSize %12.0f, %12.0f\n"
   ,me
   ,perAtomSize
@@ -1026,6 +1027,7 @@ void ReadRestart::file_layout()
         // if the number of ranks that did the writing is different
 
         if (me == 0) {
+          int ndx;
           int *all_written_send_sizes;
           memory->create(all_written_send_sizes,nprocs_file,
                          "write_restart:all_written_send_sizes");
@@ -1035,30 +1037,76 @@ void ReadRestart::file_layout()
 
           fread(all_written_send_sizes,sizeof(int),nprocs_file,fp);
 
-          int init_chunk_number = nprocs_file/nprocs;
-          int num_extra_chunks = nprocs_file - (nprocs*init_chunk_number);
+          if ((nprocs != nprocs_file) && !(atom->nextra_store)) {
+            // nprocs differ, but atom sizes are fixed length, yeah!
+            atom->nlocal = 1; // temporarily claim there is one atom...
+            int perAtomSize = atom->avec->size_restart(); // ...so we can get its size
+            atom->nlocal = 0; // restore nlocal to zero atoms
 
-          for (int i = 0; i < nprocs; i++) {
-            if (i < num_extra_chunks)
-              nproc_chunk_number[i] = init_chunk_number+1;
-            else
-              nproc_chunk_number[i] = init_chunk_number;
-          }
+            bigint total_size = 0;
+            for (int i = 0; i < nprocs_file; ++i) {
+              total_size += all_written_send_sizes[i];
+            }
+            bigint total_ct = total_size / perAtomSize;
 
-          int all_written_send_sizes_index = 0;
-          bigint current_offset = 0;
-          for (int i=0;i<nprocs;i++) {
-            nproc_chunk_offsets[i] = current_offset;
-            nproc_chunk_sizes[i] = 0;
-            for (int j=0;j<nproc_chunk_number[i];j++) {
-              nproc_chunk_sizes[i] +=
-                all_written_send_sizes[all_written_send_sizes_index];
-              current_offset +=
-                (all_written_send_sizes[all_written_send_sizes_index] *
-                 sizeof(double));
-              all_written_send_sizes_index++;
+            bigint base_ct = total_ct / nprocs;
+            bigint leftover_ct = total_ct  - (base_ct * nprocs);
+            bigint current_ByteOffset = 0;
+            base_ct += 1;
+            bigint base_ByteOffset = base_ct * (perAtomSize * sizeof(double));
+            for (ndx = 0; ndx < leftover_ct; ++ndx) {
+              nproc_chunk_offsets[ndx] = current_ByteOffset;
+              nproc_chunk_sizes[ndx] = base_ct * perAtomSize;
+              current_ByteOffset += base_ByteOffset;
+            }
+            base_ct -= 1;
+            base_ByteOffset -= (perAtomSize * sizeof(double));
+            for (; ndx < nprocs; ++ndx) {
+              nproc_chunk_offsets[ndx] = current_ByteOffset;
+              nproc_chunk_sizes[ndx] = base_ct * perAtomSize;
+              current_ByteOffset += base_ByteOffset;
+            }
+//#define DEBUG_FILE_LAYOUT
+#ifdef DEBUG_FILE_LAYOUT
+fprintf(stdout, "ReadRestart::file_layout: %15.0f/%d = %15.0f totCt, %15.0f natoms, %12.0f baseCt, %12.0f leftover, %d np != %d npf %c%c\n"
+  ,(double) total_size
+  ,perAtomSize
+  ,(double) total_ct
+  ,(double) atom->natoms
+  ,(double) base_ct
+  ,(double) leftover_ct
+  ,nprocs
+  ,nprocs_file
+  ,(total_size == (total_ct * perAtomSize)) ? ' ' : 'E'
+  ,(total_ct == (base_ct * nprocs + leftover_ct)) ? ' ' : 'F'
+);
+#endif
+          } else { // Bummer, we have to read in based on how it was written
+            int init_chunk_number = nprocs_file/nprocs;
+            int num_extra_chunks = nprocs_file - (nprocs*init_chunk_number);
+
+            for (int i = 0; i < nprocs; i++) {
+              if (i < num_extra_chunks)
+                nproc_chunk_number[i] = init_chunk_number+1;
+              else
+                nproc_chunk_number[i] = init_chunk_number;
             }
 
+            int all_written_send_sizes_index = 0;
+            bigint current_offset = 0;
+            for (int i=0;i<nprocs;i++) {
+              nproc_chunk_offsets[i] = current_offset;
+              nproc_chunk_sizes[i] = 0;
+              for (int j=0;j<nproc_chunk_number[i];j++) {
+                nproc_chunk_sizes[i] +=
+                  all_written_send_sizes[all_written_send_sizes_index];
+                current_offset +=
+                  (all_written_send_sizes[all_written_send_sizes_index] *
+                   sizeof(double));
+                all_written_send_sizes_index++;
+              }
+
+            }
           }
           memory->destroy(all_written_send_sizes);
           memory->destroy(nproc_chunk_number);

From 20ae05055dbb04054f0c1c6a0b4cda3260d7fbad Mon Sep 17 00:00:00 2001
From: Dan Ibanez <daibane@sandia.gov>
Date: Tue, 28 Mar 2017 11:38:26 -0600
Subject: [PATCH 220/267] fix memory leak via NeighListKokkos::clean_copy()

There were several clean_copy() calls in pair
styles *outside device code*.
They seem to have been left over from an abandoned
effort to copy the Kokkos neighbor list as
a member of the pair style, instead of copying
out the individual views needed.
These leftover clean_copy() calls were setting
pointers to NULL that had not been freed,
leading to large memory leaks.
I've removed the clean_copy() function entirely,
and replaced it with the copymode flag system used
in many other Kokkos objects.
The copymode flag is only set to one in
functors that hold copies of the neighbor list.
---
 src/KOKKOS/fix_qeq_reax_kokkos.cpp     |  2 --
 src/KOKKOS/fix_shardlow_kokkos.cpp     |  2 --
 src/KOKKOS/neigh_list_kokkos.cpp       | 19 ++++++-------------
 src/KOKKOS/neigh_list_kokkos.h         |  7 +------
 src/KOKKOS/npair_kokkos.h              |  2 +-
 src/KOKKOS/npair_ssa_kokkos.h          |  2 +-
 src/KOKKOS/pair_coul_dsf_kokkos.cpp    |  3 ---
 src/KOKKOS/pair_coul_wolf_kokkos.cpp   |  3 ---
 src/KOKKOS/pair_eam_alloy_kokkos.cpp   |  3 ---
 src/KOKKOS/pair_eam_fs_kokkos.cpp      |  3 ---
 src/KOKKOS/pair_eam_kokkos.cpp         |  5 +----
 src/KOKKOS/pair_kokkos.h               |  4 ++--
 src/KOKKOS/pair_reax_c_kokkos.cpp      |  3 ---
 src/KOKKOS/pair_sw_kokkos.cpp          |  1 -
 src/KOKKOS/pair_tersoff_kokkos.cpp     |  1 -
 src/KOKKOS/pair_tersoff_mod_kokkos.cpp |  1 -
 src/KOKKOS/pair_tersoff_zbl_kokkos.cpp |  1 -
 src/neigh_list.cpp                     |  2 ++
 src/neigh_list.h                       |  3 ++-
 19 files changed, 16 insertions(+), 51 deletions(-)

diff --git a/src/KOKKOS/fix_qeq_reax_kokkos.cpp b/src/KOKKOS/fix_qeq_reax_kokkos.cpp
index 3b8d5a85ea..fbc6e0a298 100644
--- a/src/KOKKOS/fix_qeq_reax_kokkos.cpp
+++ b/src/KOKKOS/fix_qeq_reax_kokkos.cpp
@@ -217,8 +217,6 @@ void FixQEqReaxKokkos<DeviceType>::pre_force(int vflag)
   d_ilist = k_list->d_ilist;
   inum = list->inum;
 
-  k_list->clean_copy();
-  //cleanup_copy();
   copymode = 1;
 
   int teamsize = TEAMSIZE;
diff --git a/src/KOKKOS/fix_shardlow_kokkos.cpp b/src/KOKKOS/fix_shardlow_kokkos.cpp
index bf026552fa..676df07b61 100644
--- a/src/KOKKOS/fix_shardlow_kokkos.cpp
+++ b/src/KOKKOS/fix_shardlow_kokkos.cpp
@@ -624,8 +624,6 @@ void FixShardlowKokkos<DeviceType>::initial_integrate(int vflag)
   d_neighbors = k_list->d_neighbors;
   d_ilist = k_list->d_ilist;
 
-  k_list->clean_copy();
-  //cleanup_copy();
   copymode = 1;
 
   dtsqrt = sqrt(update->dt);
diff --git a/src/KOKKOS/neigh_list_kokkos.cpp b/src/KOKKOS/neigh_list_kokkos.cpp
index b1b4e4467a..caf2dfee56 100644
--- a/src/KOKKOS/neigh_list_kokkos.cpp
+++ b/src/KOKKOS/neigh_list_kokkos.cpp
@@ -22,21 +22,14 @@ enum{NSQ,BIN,MULTI};
 /* ---------------------------------------------------------------------- */
 
 template<class Device>
-void NeighListKokkos<Device>::clean_copy()
+NeighListKokkos<Device>::NeighListKokkos(class LAMMPS *lmp):NeighList(lmp)
 {
-  ilist = NULL;
-  numneigh = NULL;
-  firstneigh = NULL;
-  firstdouble = NULL;
-  dnum = 0;
-  iskip = NULL;
-  ijskip = NULL;
-
-  ipage = NULL;
-  dpage = NULL;
-
+  _stride = 1;
+  maxneighs = 16;
+  kokkos = 1;
   maxatoms = 0;
-}
+  execution_space = ExecutionSpaceFromDevice<Device>::space;
+};
 
 /* ---------------------------------------------------------------------- */
 
diff --git a/src/KOKKOS/neigh_list_kokkos.h b/src/KOKKOS/neigh_list_kokkos.h
index cece97197d..1c433f321c 100644
--- a/src/KOKKOS/neigh_list_kokkos.h
+++ b/src/KOKKOS/neigh_list_kokkos.h
@@ -68,18 +68,13 @@ class NeighListKokkos: public NeighList {
 public:
   int maxneighs;
 
-  void clean_copy();
   void grow(int nmax);
   typename ArrayTypes<Device>::t_neighbors_2d d_neighbors;
   typename DAT::tdual_int_1d k_ilist;   // local indices of I atoms
   typename ArrayTypes<Device>::t_int_1d d_ilist;
   typename ArrayTypes<Device>::t_int_1d d_numneigh; // # of J neighs for each I
 
-  NeighListKokkos(class LAMMPS *lmp):
-  NeighList(lmp) {_stride = 1; maxneighs = 16; kokkos = 1; maxatoms = 0;
-                  execution_space = ExecutionSpaceFromDevice<Device>::space;
-  };
-  ~NeighListKokkos() {numneigh = NULL; ilist = NULL;};
+  NeighListKokkos(class LAMMPS *lmp);
 
   KOKKOS_INLINE_FUNCTION
   AtomNeighbors get_neighbors(const int &i) const {
diff --git a/src/KOKKOS/npair_kokkos.h b/src/KOKKOS/npair_kokkos.h
index b31ef2ebbf..ab094e68eb 100644
--- a/src/KOKKOS/npair_kokkos.h
+++ b/src/KOKKOS/npair_kokkos.h
@@ -265,7 +265,7 @@ class NeighborKokkosExecute
     h_new_maxneighs() = neigh_list.maxneighs;
   };
 
-  ~NeighborKokkosExecute() {neigh_list.clean_copy();};
+  ~NeighborKokkosExecute() {neigh_list.copymode = 1;};
 
   template<int HalfNeigh, int Newton, int Tri>
   KOKKOS_FUNCTION
diff --git a/src/KOKKOS/npair_ssa_kokkos.h b/src/KOKKOS/npair_ssa_kokkos.h
index e38d648984..96efd7404b 100644
--- a/src/KOKKOS/npair_ssa_kokkos.h
+++ b/src/KOKKOS/npair_ssa_kokkos.h
@@ -287,7 +287,7 @@ class NPairSSAKokkosExecute
     h_new_maxneighs() = neigh_list.maxneighs;
   };
 
-  ~NPairSSAKokkosExecute() {neigh_list.clean_copy();};
+  ~NPairSSAKokkosExecute() {neigh_list.copymode = 1;};
 
   void build_locals();
   void build_ghosts();
diff --git a/src/KOKKOS/pair_coul_dsf_kokkos.cpp b/src/KOKKOS/pair_coul_dsf_kokkos.cpp
index f2063bdc08..e6f5407f2d 100644
--- a/src/KOKKOS/pair_coul_dsf_kokkos.cpp
+++ b/src/KOKKOS/pair_coul_dsf_kokkos.cpp
@@ -120,9 +120,6 @@ void PairCoulDSFKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
 
   int inum = list->inum;
 
-  // Call cleanup_copy which sets allocations NULL which are destructed by the PairStyle
-
-  k_list->clean_copy();
   copymode = 1;
 
   // loop over neighbors of my atoms
diff --git a/src/KOKKOS/pair_coul_wolf_kokkos.cpp b/src/KOKKOS/pair_coul_wolf_kokkos.cpp
index 8049ba0031..75177e2d81 100644
--- a/src/KOKKOS/pair_coul_wolf_kokkos.cpp
+++ b/src/KOKKOS/pair_coul_wolf_kokkos.cpp
@@ -121,9 +121,6 @@ void PairCoulWolfKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
 
   int inum = list->inum;
 
-  // Call cleanup_copy which sets allocations NULL which are destructed by the PairStyle
-
-  k_list->clean_copy();
   copymode = 1;
 
   // loop over neighbors of my atoms
diff --git a/src/KOKKOS/pair_eam_alloy_kokkos.cpp b/src/KOKKOS/pair_eam_alloy_kokkos.cpp
index 45c320bc51..acf9b27963 100644
--- a/src/KOKKOS/pair_eam_alloy_kokkos.cpp
+++ b/src/KOKKOS/pair_eam_alloy_kokkos.cpp
@@ -122,9 +122,6 @@ void PairEAMAlloyKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
   d_ilist = k_list->d_ilist;
   int inum = list->inum;
 
-  // Call cleanup_copy which sets allocations NULL which are destructed by the PairStyle
-
-  k_list->clean_copy();
   copymode = 1;
 
   // zero out density
diff --git a/src/KOKKOS/pair_eam_fs_kokkos.cpp b/src/KOKKOS/pair_eam_fs_kokkos.cpp
index b9fa82740a..a31263dfcd 100644
--- a/src/KOKKOS/pair_eam_fs_kokkos.cpp
+++ b/src/KOKKOS/pair_eam_fs_kokkos.cpp
@@ -122,9 +122,6 @@ void PairEAMFSKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
   d_ilist = k_list->d_ilist;
   int inum = list->inum;
 
-  // Call cleanup_copy which sets allocations NULL which are destructed by the PairStyle
-
-  k_list->clean_copy();
   copymode = 1;
 
   // zero out density
diff --git a/src/KOKKOS/pair_eam_kokkos.cpp b/src/KOKKOS/pair_eam_kokkos.cpp
index e4128de722..006c9582c5 100644
--- a/src/KOKKOS/pair_eam_kokkos.cpp
+++ b/src/KOKKOS/pair_eam_kokkos.cpp
@@ -117,9 +117,6 @@ void PairEAMKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
   d_ilist = k_list->d_ilist;
   int inum = list->inum;
 
-  // Call cleanup_copy which sets allocations NULL which are destructed by the PairStyle
-
-  k_list->clean_copy();
   copymode = 1;
 
   // zero out density
@@ -870,4 +867,4 @@ template class PairEAMKokkos<LMPDeviceType>;
 #ifdef KOKKOS_HAVE_CUDA
 template class PairEAMKokkos<LMPHostType>;
 #endif
-}
\ No newline at end of file
+}
diff --git a/src/KOKKOS/pair_kokkos.h b/src/KOKKOS/pair_kokkos.h
index 1e01b3df15..b0614a934b 100644
--- a/src/KOKKOS/pair_kokkos.h
+++ b/src/KOKKOS/pair_kokkos.h
@@ -87,7 +87,7 @@ struct PairComputeFunctor  {
   vatom(c.d_vatom),list(*list_ptr) {};
 
   // Call cleanup_copy which sets allocations NULL which are destructed by the PairStyle
-  ~PairComputeFunctor() {c.cleanup_copy();list.clean_copy();};
+  ~PairComputeFunctor() {c.cleanup_copy();list.copymode = 1;};
 
   KOKKOS_INLINE_FUNCTION int sbmask(const int& j) const {
     return j >> SBBITS & 3;
@@ -344,7 +344,7 @@ struct PairComputeFunctor<PairStyle,N2,STACKPARAMS,Specialisation>  {
   PairComputeFunctor(PairStyle* c_ptr,
                           NeighListKokkos<device_type>* list_ptr):
   c(*c_ptr),list(*list_ptr) {};
-  ~PairComputeFunctor() {c.cleanup_copy();list.clean_copy();};
+  ~PairComputeFunctor() {c.cleanup_copy();list.copymode = 1;};
 
   KOKKOS_INLINE_FUNCTION int sbmask(const int& j) const {
     return j >> SBBITS & 3;
diff --git a/src/KOKKOS/pair_reax_c_kokkos.cpp b/src/KOKKOS/pair_reax_c_kokkos.cpp
index acf9c754cd..87915dce3e 100644
--- a/src/KOKKOS/pair_reax_c_kokkos.cpp
+++ b/src/KOKKOS/pair_reax_c_kokkos.cpp
@@ -709,8 +709,6 @@ void PairReaxCKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
   d_neighbors = k_list->d_neighbors;
   d_ilist = k_list->d_ilist;
 
-  k_list->clean_copy();
-
   if (eflag_global) {
     for (int i = 0; i < 14; i++)
       pvector[i] = 0.0;
@@ -4012,7 +4010,6 @@ void PairReaxCKokkos<DeviceType>::FindBond(int &numbonds)
   const int inum = list->inum;
   NeighListKokkos<DeviceType>* k_list = static_cast<NeighListKokkos<DeviceType>*>(list);
   d_ilist = k_list->d_ilist;
-  k_list->clean_copy();
 
   numbonds = 0;
   PairReaxCKokkosFindBondFunctor<DeviceType> find_bond_functor(this);
diff --git a/src/KOKKOS/pair_sw_kokkos.cpp b/src/KOKKOS/pair_sw_kokkos.cpp
index a8950a0c79..e5c947cc8e 100644
--- a/src/KOKKOS/pair_sw_kokkos.cpp
+++ b/src/KOKKOS/pair_sw_kokkos.cpp
@@ -115,7 +115,6 @@ void PairSWKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
   d_numneigh = k_list->d_numneigh;
   d_neighbors = k_list->d_neighbors;
 
-  k_list->clean_copy();
   copymode = 1;
 
   EV_FLOAT ev;
diff --git a/src/KOKKOS/pair_tersoff_kokkos.cpp b/src/KOKKOS/pair_tersoff_kokkos.cpp
index 75280c8f7c..833c815ad9 100644
--- a/src/KOKKOS/pair_tersoff_kokkos.cpp
+++ b/src/KOKKOS/pair_tersoff_kokkos.cpp
@@ -200,7 +200,6 @@ void PairTersoffKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
   d_neighbors = k_list->d_neighbors;
   d_ilist = k_list->d_ilist;
 
-  k_list->clean_copy();
   copymode = 1;
 
   EV_FLOAT ev;
diff --git a/src/KOKKOS/pair_tersoff_mod_kokkos.cpp b/src/KOKKOS/pair_tersoff_mod_kokkos.cpp
index d16a7fc4d7..d77ba2f141 100644
--- a/src/KOKKOS/pair_tersoff_mod_kokkos.cpp
+++ b/src/KOKKOS/pair_tersoff_mod_kokkos.cpp
@@ -200,7 +200,6 @@ void PairTersoffMODKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
   d_neighbors = k_list->d_neighbors;
   d_ilist = k_list->d_ilist;
 
-  k_list->clean_copy();
   copymode = 1;
 
   EV_FLOAT ev;
diff --git a/src/KOKKOS/pair_tersoff_zbl_kokkos.cpp b/src/KOKKOS/pair_tersoff_zbl_kokkos.cpp
index e9bae49fb7..040d8c5230 100644
--- a/src/KOKKOS/pair_tersoff_zbl_kokkos.cpp
+++ b/src/KOKKOS/pair_tersoff_zbl_kokkos.cpp
@@ -214,7 +214,6 @@ void PairTersoffZBLKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
   d_neighbors = k_list->d_neighbors;
   d_ilist = k_list->d_ilist;
 
-  k_list->clean_copy();
   copymode = 1;
 
   EV_FLOAT ev;
diff --git a/src/neigh_list.cpp b/src/neigh_list.cpp
index 6376637832..dde544a69f 100644
--- a/src/neigh_list.cpp
+++ b/src/neigh_list.cpp
@@ -48,6 +48,7 @@ NeighList::NeighList(LAMMPS *lmp) : Pointers(lmp)
   ghost = 0;
   ssa = 0;
   copy = 0;
+  copymode = 0;
   dnum = 0;
 
   // ptrs
@@ -86,6 +87,7 @@ NeighList::NeighList(LAMMPS *lmp) : Pointers(lmp)
 
 NeighList::~NeighList()
 {
+  if (copymode) return;
   if (!copy) {
     memory->destroy(ilist);
     memory->destroy(numneigh);
diff --git a/src/neigh_list.h b/src/neigh_list.h
index bef512512c..4010a68857 100644
--- a/src/neigh_list.h
+++ b/src/neigh_list.h
@@ -34,7 +34,8 @@ class NeighList : protected Pointers {
   int occasional;                  // 0 if build every reneighbor, 1 if not
   int ghost;                       // 1 if list stores neighbors of ghosts
   int ssa;                         // 1 if list stores Shardlow data
-  int copy;                        // 1 if this list copied from another list
+  int copy;                        // 1 if this list is (host) copied from another list
+  int copymode;                    // 1 if this is a Kokkos on-device copy
   int dnum;                        // # of doubles per neighbor, 0 if none
 
   // data structs to store neighbor pairs I,J and associated values

From b3d6d9f8cf3e66437c7471a214fb668a7601c9fd Mon Sep 17 00:00:00 2001
From: Dan Ibanez <daibane@sandia.gov>
Date: Tue, 28 Mar 2017 11:38:26 -0600
Subject: [PATCH 221/267] fix memory leak via NeighListKokkos::clean_copy()

There were several clean_copy() calls in pair
styles *outside device code*.
They seem to have been left over from an abandoned
effort to copy the Kokkos neighbor list as
a member of the pair style, instead of copying
out the individual views needed.
These leftover clean_copy() calls were setting
pointers to NULL that had not been freed,
leading to large memory leaks.
I've removed the clean_copy() function entirely,
and replaced it with the copymode flag system used
in many other Kokkos objects.
The copymode flag is only set to one in
functors that hold copies of the neighbor list.
---
 src/KOKKOS/fix_qeq_reax_kokkos.cpp     |  2 --
 src/KOKKOS/fix_shardlow_kokkos.cpp     |  2 --
 src/KOKKOS/neigh_list_kokkos.cpp       | 19 ++++++-------------
 src/KOKKOS/neigh_list_kokkos.h         |  7 +------
 src/KOKKOS/npair_kokkos.h              |  2 +-
 src/KOKKOS/npair_ssa_kokkos.h          |  2 +-
 src/KOKKOS/pair_coul_dsf_kokkos.cpp    |  3 ---
 src/KOKKOS/pair_coul_wolf_kokkos.cpp   |  3 ---
 src/KOKKOS/pair_eam_alloy_kokkos.cpp   |  3 ---
 src/KOKKOS/pair_eam_fs_kokkos.cpp      |  3 ---
 src/KOKKOS/pair_eam_kokkos.cpp         |  5 +----
 src/KOKKOS/pair_kokkos.h               |  4 ++--
 src/KOKKOS/pair_reax_c_kokkos.cpp      |  3 ---
 src/KOKKOS/pair_sw_kokkos.cpp          |  1 -
 src/KOKKOS/pair_tersoff_kokkos.cpp     |  1 -
 src/KOKKOS/pair_tersoff_mod_kokkos.cpp |  1 -
 src/KOKKOS/pair_tersoff_zbl_kokkos.cpp |  1 -
 src/neigh_list.cpp                     |  2 ++
 src/neigh_list.h                       |  3 ++-
 19 files changed, 16 insertions(+), 51 deletions(-)

diff --git a/src/KOKKOS/fix_qeq_reax_kokkos.cpp b/src/KOKKOS/fix_qeq_reax_kokkos.cpp
index 3b8d5a85ea..fbc6e0a298 100644
--- a/src/KOKKOS/fix_qeq_reax_kokkos.cpp
+++ b/src/KOKKOS/fix_qeq_reax_kokkos.cpp
@@ -217,8 +217,6 @@ void FixQEqReaxKokkos<DeviceType>::pre_force(int vflag)
   d_ilist = k_list->d_ilist;
   inum = list->inum;
 
-  k_list->clean_copy();
-  //cleanup_copy();
   copymode = 1;
 
   int teamsize = TEAMSIZE;
diff --git a/src/KOKKOS/fix_shardlow_kokkos.cpp b/src/KOKKOS/fix_shardlow_kokkos.cpp
index bf026552fa..676df07b61 100644
--- a/src/KOKKOS/fix_shardlow_kokkos.cpp
+++ b/src/KOKKOS/fix_shardlow_kokkos.cpp
@@ -624,8 +624,6 @@ void FixShardlowKokkos<DeviceType>::initial_integrate(int vflag)
   d_neighbors = k_list->d_neighbors;
   d_ilist = k_list->d_ilist;
 
-  k_list->clean_copy();
-  //cleanup_copy();
   copymode = 1;
 
   dtsqrt = sqrt(update->dt);
diff --git a/src/KOKKOS/neigh_list_kokkos.cpp b/src/KOKKOS/neigh_list_kokkos.cpp
index b1b4e4467a..caf2dfee56 100644
--- a/src/KOKKOS/neigh_list_kokkos.cpp
+++ b/src/KOKKOS/neigh_list_kokkos.cpp
@@ -22,21 +22,14 @@ enum{NSQ,BIN,MULTI};
 /* ---------------------------------------------------------------------- */
 
 template<class Device>
-void NeighListKokkos<Device>::clean_copy()
+NeighListKokkos<Device>::NeighListKokkos(class LAMMPS *lmp):NeighList(lmp)
 {
-  ilist = NULL;
-  numneigh = NULL;
-  firstneigh = NULL;
-  firstdouble = NULL;
-  dnum = 0;
-  iskip = NULL;
-  ijskip = NULL;
-
-  ipage = NULL;
-  dpage = NULL;
-
+  _stride = 1;
+  maxneighs = 16;
+  kokkos = 1;
   maxatoms = 0;
-}
+  execution_space = ExecutionSpaceFromDevice<Device>::space;
+};
 
 /* ---------------------------------------------------------------------- */
 
diff --git a/src/KOKKOS/neigh_list_kokkos.h b/src/KOKKOS/neigh_list_kokkos.h
index cece97197d..1c433f321c 100644
--- a/src/KOKKOS/neigh_list_kokkos.h
+++ b/src/KOKKOS/neigh_list_kokkos.h
@@ -68,18 +68,13 @@ class NeighListKokkos: public NeighList {
 public:
   int maxneighs;
 
-  void clean_copy();
   void grow(int nmax);
   typename ArrayTypes<Device>::t_neighbors_2d d_neighbors;
   typename DAT::tdual_int_1d k_ilist;   // local indices of I atoms
   typename ArrayTypes<Device>::t_int_1d d_ilist;
   typename ArrayTypes<Device>::t_int_1d d_numneigh; // # of J neighs for each I
 
-  NeighListKokkos(class LAMMPS *lmp):
-  NeighList(lmp) {_stride = 1; maxneighs = 16; kokkos = 1; maxatoms = 0;
-                  execution_space = ExecutionSpaceFromDevice<Device>::space;
-  };
-  ~NeighListKokkos() {numneigh = NULL; ilist = NULL;};
+  NeighListKokkos(class LAMMPS *lmp);
 
   KOKKOS_INLINE_FUNCTION
   AtomNeighbors get_neighbors(const int &i) const {
diff --git a/src/KOKKOS/npair_kokkos.h b/src/KOKKOS/npair_kokkos.h
index b31ef2ebbf..ab094e68eb 100644
--- a/src/KOKKOS/npair_kokkos.h
+++ b/src/KOKKOS/npair_kokkos.h
@@ -265,7 +265,7 @@ class NeighborKokkosExecute
     h_new_maxneighs() = neigh_list.maxneighs;
   };
 
-  ~NeighborKokkosExecute() {neigh_list.clean_copy();};
+  ~NeighborKokkosExecute() {neigh_list.copymode = 1;};
 
   template<int HalfNeigh, int Newton, int Tri>
   KOKKOS_FUNCTION
diff --git a/src/KOKKOS/npair_ssa_kokkos.h b/src/KOKKOS/npair_ssa_kokkos.h
index e38d648984..96efd7404b 100644
--- a/src/KOKKOS/npair_ssa_kokkos.h
+++ b/src/KOKKOS/npair_ssa_kokkos.h
@@ -287,7 +287,7 @@ class NPairSSAKokkosExecute
     h_new_maxneighs() = neigh_list.maxneighs;
   };
 
-  ~NPairSSAKokkosExecute() {neigh_list.clean_copy();};
+  ~NPairSSAKokkosExecute() {neigh_list.copymode = 1;};
 
   void build_locals();
   void build_ghosts();
diff --git a/src/KOKKOS/pair_coul_dsf_kokkos.cpp b/src/KOKKOS/pair_coul_dsf_kokkos.cpp
index f2063bdc08..e6f5407f2d 100644
--- a/src/KOKKOS/pair_coul_dsf_kokkos.cpp
+++ b/src/KOKKOS/pair_coul_dsf_kokkos.cpp
@@ -120,9 +120,6 @@ void PairCoulDSFKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
 
   int inum = list->inum;
 
-  // Call cleanup_copy which sets allocations NULL which are destructed by the PairStyle
-
-  k_list->clean_copy();
   copymode = 1;
 
   // loop over neighbors of my atoms
diff --git a/src/KOKKOS/pair_coul_wolf_kokkos.cpp b/src/KOKKOS/pair_coul_wolf_kokkos.cpp
index 8049ba0031..75177e2d81 100644
--- a/src/KOKKOS/pair_coul_wolf_kokkos.cpp
+++ b/src/KOKKOS/pair_coul_wolf_kokkos.cpp
@@ -121,9 +121,6 @@ void PairCoulWolfKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
 
   int inum = list->inum;
 
-  // Call cleanup_copy which sets allocations NULL which are destructed by the PairStyle
-
-  k_list->clean_copy();
   copymode = 1;
 
   // loop over neighbors of my atoms
diff --git a/src/KOKKOS/pair_eam_alloy_kokkos.cpp b/src/KOKKOS/pair_eam_alloy_kokkos.cpp
index 45c320bc51..acf9b27963 100644
--- a/src/KOKKOS/pair_eam_alloy_kokkos.cpp
+++ b/src/KOKKOS/pair_eam_alloy_kokkos.cpp
@@ -122,9 +122,6 @@ void PairEAMAlloyKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
   d_ilist = k_list->d_ilist;
   int inum = list->inum;
 
-  // Call cleanup_copy which sets allocations NULL which are destructed by the PairStyle
-
-  k_list->clean_copy();
   copymode = 1;
 
   // zero out density
diff --git a/src/KOKKOS/pair_eam_fs_kokkos.cpp b/src/KOKKOS/pair_eam_fs_kokkos.cpp
index b9fa82740a..a31263dfcd 100644
--- a/src/KOKKOS/pair_eam_fs_kokkos.cpp
+++ b/src/KOKKOS/pair_eam_fs_kokkos.cpp
@@ -122,9 +122,6 @@ void PairEAMFSKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
   d_ilist = k_list->d_ilist;
   int inum = list->inum;
 
-  // Call cleanup_copy which sets allocations NULL which are destructed by the PairStyle
-
-  k_list->clean_copy();
   copymode = 1;
 
   // zero out density
diff --git a/src/KOKKOS/pair_eam_kokkos.cpp b/src/KOKKOS/pair_eam_kokkos.cpp
index e4128de722..006c9582c5 100644
--- a/src/KOKKOS/pair_eam_kokkos.cpp
+++ b/src/KOKKOS/pair_eam_kokkos.cpp
@@ -117,9 +117,6 @@ void PairEAMKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
   d_ilist = k_list->d_ilist;
   int inum = list->inum;
 
-  // Call cleanup_copy which sets allocations NULL which are destructed by the PairStyle
-
-  k_list->clean_copy();
   copymode = 1;
 
   // zero out density
@@ -870,4 +867,4 @@ template class PairEAMKokkos<LMPDeviceType>;
 #ifdef KOKKOS_HAVE_CUDA
 template class PairEAMKokkos<LMPHostType>;
 #endif
-}
\ No newline at end of file
+}
diff --git a/src/KOKKOS/pair_kokkos.h b/src/KOKKOS/pair_kokkos.h
index 1e01b3df15..b0614a934b 100644
--- a/src/KOKKOS/pair_kokkos.h
+++ b/src/KOKKOS/pair_kokkos.h
@@ -87,7 +87,7 @@ struct PairComputeFunctor  {
   vatom(c.d_vatom),list(*list_ptr) {};
 
   // Call cleanup_copy which sets allocations NULL which are destructed by the PairStyle
-  ~PairComputeFunctor() {c.cleanup_copy();list.clean_copy();};
+  ~PairComputeFunctor() {c.cleanup_copy();list.copymode = 1;};
 
   KOKKOS_INLINE_FUNCTION int sbmask(const int& j) const {
     return j >> SBBITS & 3;
@@ -344,7 +344,7 @@ struct PairComputeFunctor<PairStyle,N2,STACKPARAMS,Specialisation>  {
   PairComputeFunctor(PairStyle* c_ptr,
                           NeighListKokkos<device_type>* list_ptr):
   c(*c_ptr),list(*list_ptr) {};
-  ~PairComputeFunctor() {c.cleanup_copy();list.clean_copy();};
+  ~PairComputeFunctor() {c.cleanup_copy();list.copymode = 1;};
 
   KOKKOS_INLINE_FUNCTION int sbmask(const int& j) const {
     return j >> SBBITS & 3;
diff --git a/src/KOKKOS/pair_reax_c_kokkos.cpp b/src/KOKKOS/pair_reax_c_kokkos.cpp
index acf9c754cd..87915dce3e 100644
--- a/src/KOKKOS/pair_reax_c_kokkos.cpp
+++ b/src/KOKKOS/pair_reax_c_kokkos.cpp
@@ -709,8 +709,6 @@ void PairReaxCKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
   d_neighbors = k_list->d_neighbors;
   d_ilist = k_list->d_ilist;
 
-  k_list->clean_copy();
-
   if (eflag_global) {
     for (int i = 0; i < 14; i++)
       pvector[i] = 0.0;
@@ -4012,7 +4010,6 @@ void PairReaxCKokkos<DeviceType>::FindBond(int &numbonds)
   const int inum = list->inum;
   NeighListKokkos<DeviceType>* k_list = static_cast<NeighListKokkos<DeviceType>*>(list);
   d_ilist = k_list->d_ilist;
-  k_list->clean_copy();
 
   numbonds = 0;
   PairReaxCKokkosFindBondFunctor<DeviceType> find_bond_functor(this);
diff --git a/src/KOKKOS/pair_sw_kokkos.cpp b/src/KOKKOS/pair_sw_kokkos.cpp
index a8950a0c79..e5c947cc8e 100644
--- a/src/KOKKOS/pair_sw_kokkos.cpp
+++ b/src/KOKKOS/pair_sw_kokkos.cpp
@@ -115,7 +115,6 @@ void PairSWKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
   d_numneigh = k_list->d_numneigh;
   d_neighbors = k_list->d_neighbors;
 
-  k_list->clean_copy();
   copymode = 1;
 
   EV_FLOAT ev;
diff --git a/src/KOKKOS/pair_tersoff_kokkos.cpp b/src/KOKKOS/pair_tersoff_kokkos.cpp
index 75280c8f7c..833c815ad9 100644
--- a/src/KOKKOS/pair_tersoff_kokkos.cpp
+++ b/src/KOKKOS/pair_tersoff_kokkos.cpp
@@ -200,7 +200,6 @@ void PairTersoffKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
   d_neighbors = k_list->d_neighbors;
   d_ilist = k_list->d_ilist;
 
-  k_list->clean_copy();
   copymode = 1;
 
   EV_FLOAT ev;
diff --git a/src/KOKKOS/pair_tersoff_mod_kokkos.cpp b/src/KOKKOS/pair_tersoff_mod_kokkos.cpp
index d16a7fc4d7..d77ba2f141 100644
--- a/src/KOKKOS/pair_tersoff_mod_kokkos.cpp
+++ b/src/KOKKOS/pair_tersoff_mod_kokkos.cpp
@@ -200,7 +200,6 @@ void PairTersoffMODKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
   d_neighbors = k_list->d_neighbors;
   d_ilist = k_list->d_ilist;
 
-  k_list->clean_copy();
   copymode = 1;
 
   EV_FLOAT ev;
diff --git a/src/KOKKOS/pair_tersoff_zbl_kokkos.cpp b/src/KOKKOS/pair_tersoff_zbl_kokkos.cpp
index e9bae49fb7..040d8c5230 100644
--- a/src/KOKKOS/pair_tersoff_zbl_kokkos.cpp
+++ b/src/KOKKOS/pair_tersoff_zbl_kokkos.cpp
@@ -214,7 +214,6 @@ void PairTersoffZBLKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
   d_neighbors = k_list->d_neighbors;
   d_ilist = k_list->d_ilist;
 
-  k_list->clean_copy();
   copymode = 1;
 
   EV_FLOAT ev;
diff --git a/src/neigh_list.cpp b/src/neigh_list.cpp
index 6376637832..dde544a69f 100644
--- a/src/neigh_list.cpp
+++ b/src/neigh_list.cpp
@@ -48,6 +48,7 @@ NeighList::NeighList(LAMMPS *lmp) : Pointers(lmp)
   ghost = 0;
   ssa = 0;
   copy = 0;
+  copymode = 0;
   dnum = 0;
 
   // ptrs
@@ -86,6 +87,7 @@ NeighList::NeighList(LAMMPS *lmp) : Pointers(lmp)
 
 NeighList::~NeighList()
 {
+  if (copymode) return;
   if (!copy) {
     memory->destroy(ilist);
     memory->destroy(numneigh);
diff --git a/src/neigh_list.h b/src/neigh_list.h
index bef512512c..4010a68857 100644
--- a/src/neigh_list.h
+++ b/src/neigh_list.h
@@ -34,7 +34,8 @@ class NeighList : protected Pointers {
   int occasional;                  // 0 if build every reneighbor, 1 if not
   int ghost;                       // 1 if list stores neighbors of ghosts
   int ssa;                         // 1 if list stores Shardlow data
-  int copy;                        // 1 if this list copied from another list
+  int copy;                        // 1 if this list is (host) copied from another list
+  int copymode;                    // 1 if this is a Kokkos on-device copy
   int dnum;                        // # of doubles per neighbor, 0 if none
 
   // data structs to store neighbor pairs I,J and associated values

From bf4f0817d4f85bc22a6f1b3a01b40b8f23b46a6b Mon Sep 17 00:00:00 2001
From: Dan Ibanez <daibane@sandia.gov>
Date: Fri, 31 Mar 2017 15:57:00 -0600
Subject: [PATCH 222/267] fix memory leaks in pair_table_rx_kokkos

---
 src/KOKKOS/pair_table_rx_kokkos.cpp | 19 ++++++++++++++-----
 src/USER-DPD/pair_table_rx.cpp      | 10 ++++++++++
 src/USER-DPD/pair_table_rx.h        |  2 +-
 3 files changed, 25 insertions(+), 6 deletions(-)

diff --git a/src/KOKKOS/pair_table_rx_kokkos.cpp b/src/KOKKOS/pair_table_rx_kokkos.cpp
index 044f303bf5..eacaf83cf5 100644
--- a/src/KOKKOS/pair_table_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_table_rx_kokkos.cpp
@@ -147,6 +147,9 @@ PairTableRXKokkos<DeviceType>::PairTableRXKokkos(LAMMPS *lmp) : PairTable(lmp)
   h_table = new TableHost();
   d_table = new TableDevice();
   fractionalWeighting = true;
+
+  site1 = nullptr;
+  site2 = nullptr;
 }
 
 /* ---------------------------------------------------------------------- */
@@ -156,14 +159,21 @@ PairTableRXKokkos<DeviceType>::~PairTableRXKokkos()
 {
   if (copymode) return;
 
+  delete [] site1;
+  delete [] site2;
+
   memory->destroy_kokkos(k_eatom,eatom);
   memory->destroy_kokkos(k_vatom,vatom);
 
+  if (allocated) {
+    memory->destroy_kokkos(d_table->cutsq, cutsq);
+    memory->destroy_kokkos(d_table->tabindex, tabindex);
+  }
+
   delete h_table;
   h_table = nullptr;
   delete d_table;
   d_table = nullptr;
-  copymode = true; //prevents base class destructor from running
 }
 
 /* ---------------------------------------------------------------------- */
@@ -981,6 +991,8 @@ void PairTableRXKokkos<DeviceType>::settings(int narg, char **arg)
 
   for (int m = 0; m < ntables; m++) free_table(&tables[m]);
   memory->sfree(tables);
+  ntables = 0;
+  tables = NULL;
 
   if (allocated) {
     memory->destroy(setflag);
@@ -990,11 +1002,8 @@ void PairTableRXKokkos<DeviceType>::settings(int narg, char **arg)
 
     d_table_const.cutsq = d_table->cutsq = typename ArrayTypes<DeviceType>::t_ffloat_2d();
     h_table->cutsq = typename ArrayTypes<LMPHostType>::t_ffloat_2d();
+    allocated = 0;
   }
-  allocated = 0;
-
-  ntables = 0;
-  tables = NULL;
 }
 
 /* ----------------------------------------------------------------------
diff --git a/src/USER-DPD/pair_table_rx.cpp b/src/USER-DPD/pair_table_rx.cpp
index cf85fe2e60..89d09e7322 100644
--- a/src/USER-DPD/pair_table_rx.cpp
+++ b/src/USER-DPD/pair_table_rx.cpp
@@ -47,6 +47,16 @@ enum{NONE,RLINEAR,RSQ,BMP};
 PairTableRX::PairTableRX(LAMMPS *lmp) : PairTable(lmp)
 {
   fractionalWeighting = true;
+  site1 = NULL;
+  site2 = NULL;
+}
+
+/* ---------------------------------------------------------------------- */
+
+PairTableRX::~PairTableRX()
+{
+  delete [] site1;
+  delete [] site2;
 }
 
 /* ---------------------------------------------------------------------- */
diff --git a/src/USER-DPD/pair_table_rx.h b/src/USER-DPD/pair_table_rx.h
index 9dee5df266..da7889e99a 100644
--- a/src/USER-DPD/pair_table_rx.h
+++ b/src/USER-DPD/pair_table_rx.h
@@ -27,7 +27,7 @@ namespace LAMMPS_NS {
 class PairTableRX : public PairTable {
  public:
   PairTableRX(class LAMMPS *);
-  virtual ~PairTableRX() {}
+  virtual ~PairTableRX();
 
   virtual void compute(int, int);
   void settings(int, char **);

From 5edbd63920681f585b054d4aebf2fb7eb462f5ce Mon Sep 17 00:00:00 2001
From: Dan Ibanez <daibane@sandia.gov>
Date: Fri, 31 Mar 2017 16:03:05 -0600
Subject: [PATCH 223/267] fix memory leak in fix_shardlow_kokkos

---
 src/KOKKOS/fix_shardlow_kokkos.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/KOKKOS/fix_shardlow_kokkos.cpp b/src/KOKKOS/fix_shardlow_kokkos.cpp
index 676df07b61..52287d586c 100644
--- a/src/KOKKOS/fix_shardlow_kokkos.cpp
+++ b/src/KOKKOS/fix_shardlow_kokkos.cpp
@@ -741,6 +741,7 @@ fprintf(stdout, "\n%6d %6d,%6d %6d: "
 );
 #endif
 
+  copymode = 0;
 }
 
 /* ---------------------------------------------------------------------- */

From fe82926c1f41d2a99ad75ca3d07312ad0945e52a Mon Sep 17 00:00:00 2001
From: Dan Ibanez <daibane@sandia.gov>
Date: Fri, 31 Mar 2017 15:57:00 -0600
Subject: [PATCH 224/267] fix memory leaks in pair_table_rx_kokkos

---
 src/KOKKOS/pair_table_rx_kokkos.cpp | 19 ++++++++++++++-----
 src/USER-DPD/pair_table_rx.cpp      | 10 ++++++++++
 src/USER-DPD/pair_table_rx.h        |  2 +-
 3 files changed, 25 insertions(+), 6 deletions(-)

diff --git a/src/KOKKOS/pair_table_rx_kokkos.cpp b/src/KOKKOS/pair_table_rx_kokkos.cpp
index 044f303bf5..eacaf83cf5 100644
--- a/src/KOKKOS/pair_table_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_table_rx_kokkos.cpp
@@ -147,6 +147,9 @@ PairTableRXKokkos<DeviceType>::PairTableRXKokkos(LAMMPS *lmp) : PairTable(lmp)
   h_table = new TableHost();
   d_table = new TableDevice();
   fractionalWeighting = true;
+
+  site1 = nullptr;
+  site2 = nullptr;
 }
 
 /* ---------------------------------------------------------------------- */
@@ -156,14 +159,21 @@ PairTableRXKokkos<DeviceType>::~PairTableRXKokkos()
 {
   if (copymode) return;
 
+  delete [] site1;
+  delete [] site2;
+
   memory->destroy_kokkos(k_eatom,eatom);
   memory->destroy_kokkos(k_vatom,vatom);
 
+  if (allocated) {
+    memory->destroy_kokkos(d_table->cutsq, cutsq);
+    memory->destroy_kokkos(d_table->tabindex, tabindex);
+  }
+
   delete h_table;
   h_table = nullptr;
   delete d_table;
   d_table = nullptr;
-  copymode = true; //prevents base class destructor from running
 }
 
 /* ---------------------------------------------------------------------- */
@@ -981,6 +991,8 @@ void PairTableRXKokkos<DeviceType>::settings(int narg, char **arg)
 
   for (int m = 0; m < ntables; m++) free_table(&tables[m]);
   memory->sfree(tables);
+  ntables = 0;
+  tables = NULL;
 
   if (allocated) {
     memory->destroy(setflag);
@@ -990,11 +1002,8 @@ void PairTableRXKokkos<DeviceType>::settings(int narg, char **arg)
 
     d_table_const.cutsq = d_table->cutsq = typename ArrayTypes<DeviceType>::t_ffloat_2d();
     h_table->cutsq = typename ArrayTypes<LMPHostType>::t_ffloat_2d();
+    allocated = 0;
   }
-  allocated = 0;
-
-  ntables = 0;
-  tables = NULL;
 }
 
 /* ----------------------------------------------------------------------
diff --git a/src/USER-DPD/pair_table_rx.cpp b/src/USER-DPD/pair_table_rx.cpp
index cf85fe2e60..89d09e7322 100644
--- a/src/USER-DPD/pair_table_rx.cpp
+++ b/src/USER-DPD/pair_table_rx.cpp
@@ -47,6 +47,16 @@ enum{NONE,RLINEAR,RSQ,BMP};
 PairTableRX::PairTableRX(LAMMPS *lmp) : PairTable(lmp)
 {
   fractionalWeighting = true;
+  site1 = NULL;
+  site2 = NULL;
+}
+
+/* ---------------------------------------------------------------------- */
+
+PairTableRX::~PairTableRX()
+{
+  delete [] site1;
+  delete [] site2;
 }
 
 /* ---------------------------------------------------------------------- */
diff --git a/src/USER-DPD/pair_table_rx.h b/src/USER-DPD/pair_table_rx.h
index 9dee5df266..da7889e99a 100644
--- a/src/USER-DPD/pair_table_rx.h
+++ b/src/USER-DPD/pair_table_rx.h
@@ -27,7 +27,7 @@ namespace LAMMPS_NS {
 class PairTableRX : public PairTable {
  public:
   PairTableRX(class LAMMPS *);
-  virtual ~PairTableRX() {}
+  virtual ~PairTableRX();
 
   virtual void compute(int, int);
   void settings(int, char **);

From 6ba59cb4583c86af3f0104bb10e1ecd324bf9cce Mon Sep 17 00:00:00 2001
From: Dan Ibanez <daibane@sandia.gov>
Date: Fri, 31 Mar 2017 16:03:05 -0600
Subject: [PATCH 225/267] fix memory leak in fix_shardlow_kokkos

---
 src/KOKKOS/fix_shardlow_kokkos.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/KOKKOS/fix_shardlow_kokkos.cpp b/src/KOKKOS/fix_shardlow_kokkos.cpp
index 676df07b61..52287d586c 100644
--- a/src/KOKKOS/fix_shardlow_kokkos.cpp
+++ b/src/KOKKOS/fix_shardlow_kokkos.cpp
@@ -741,6 +741,7 @@ fprintf(stdout, "\n%6d %6d,%6d %6d: "
 );
 #endif
 
+  copymode = 0;
 }
 
 /* ---------------------------------------------------------------------- */

From ac64183ecfdd1a7cdd82770c96e1fbe05934967e Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Sat, 1 Apr 2017 12:11:55 -0400
Subject: [PATCH 226/267] USER-DPD Kokkos: WIP on preflighting SSA neighbor
 list build, with debugging

---
 src/KOKKOS/npair_ssa_kokkos.cpp | 73 +++++++++++++++++++++++++++++++--
 1 file changed, 69 insertions(+), 4 deletions(-)

diff --git a/src/KOKKOS/npair_ssa_kokkos.cpp b/src/KOKKOS/npair_ssa_kokkos.cpp
index 59470189bc..87cc02e734 100644
--- a/src/KOKKOS/npair_ssa_kokkos.cpp
+++ b/src/KOKKOS/npair_ssa_kokkos.cpp
@@ -240,6 +240,58 @@ void NPairSSAKokkos<DeviceType>::build(NeighList *list_)
     ssa_gitemLen = k_ssa_gitemLen.view<DeviceType>();
   }
 
+{ // Preflight the neighbor list build
+  const typename ArrayTypes<DeviceType>::t_int_1d_const c_bincount = k_bincount.view<DeviceType>();
+  int inum = 0;
+
+  int workPhase = 0;
+  // loop over bins with local atoms, storing half of the neighbors
+  for (int zoff = sz1 - 1; zoff >= 0; --zoff) {
+  for (int yoff = sy1 - 1; yoff >= 0; --yoff) {
+  for (int xoff = sx1 - 1; xoff >= 0; --xoff) {
+    int workItem = 0;
+  for (int zbin = lbinzlo + zoff; zbin < lbinzhi; zbin += sz1) {
+  for (int ybin = lbinylo + yoff - sy1 + 1; ybin < lbinyhi; ybin += sy1) {
+  for (int xbin = lbinxlo + xoff - sx1 + 1; xbin < lbinxhi; xbin += sx1) {
+//    if (workItem >= phaseLenEstimate) error->one(FLERR,"phaseLenEstimate was too small");
+    ssa_itemLoc(workPhase, workItem) = inum; // record where workItem starts in ilist
+
+    for (int subphase = 0; subphase < 4; subphase++) {
+      int s_ybin = ybin + ((subphase & 0x2) ? sy1 - 1 : 0);
+      int s_xbin = xbin + ((subphase & 0x1) ? sx1 - 1 : 0);
+      if ((s_ybin < lbinylo) || (s_ybin >= lbinyhi)) continue;
+      if ((s_xbin < lbinxlo) || (s_xbin >= lbinxhi)) continue;
+
+      int ibin = zbin*mbiny*mbinx + s_ybin*mbinx + s_xbin;
+      inum += c_bincount(ibin);
+    }
+    // record where workItem ends in ilist
+    ssa_itemLen(workPhase,workItem) = inum - ssa_itemLoc(workPhase,workItem);
+    if (ssa_itemLen(workPhase,workItem) > 0) workItem++;
+  }
+  }
+  }
+
+fprintf(stdout, "phase %3d could use %6d inums, expected %6d inums. maxworkItems = %3d, inums/workItems = %g\n"
+  ,workPhase
+  ,inum - ssa_itemLoc(workPhase, 0)
+  ,(nlocal*4 + ssa_phaseCt - 1) / ssa_phaseCt
+  ,workItem
+  ,(inum - ssa_itemLoc(workPhase, 0)) / (double) workItem
+);
+    // record where workPhase ends
+    ssa_phaseLen(workPhase++) = workItem;
+  }
+  }
+  }
+fprintf(stdout, "total %3d could use %6d inums, expected %6d inums. inums/phase = %g\n"
+  ,workPhase
+  ,inum
+  ,nlocal*4
+  ,inum / (double) workPhase
+);
+}
+
   NPairSSAKokkosExecute<DeviceType>
     data(*list,
          k_cutneighsq.view<DeviceType>(),
@@ -355,18 +407,18 @@ void NPairSSAKokkosExecute<DeviceType>::build_locals()
   int n = 0;
   int which = 0;
   int inum = 0;
-
   int workPhase = 0;
+
   // loop over bins with local atoms, storing half of the neighbors
   for (int zoff = sz1 - 1; zoff >= 0; --zoff) {
   for (int yoff = sy1 - 1; yoff >= 0; --yoff) {
   for (int xoff = sx1 - 1; xoff >= 0; --xoff) {
     int workItem = 0;
+    inum = d_ssa_itemLoc(workPhase, workItem); // get where workPhase starts in ilist
   for (int zbin = lbinzlo + zoff; zbin < lbinzhi; zbin += sz1) {
   for (int ybin = lbinylo + yoff - sy1 + 1; ybin < lbinyhi; ybin += sy1) {
   for (int xbin = lbinxlo + xoff - sx1 + 1; xbin < lbinxhi; xbin += sx1) {
-//    if (workItem >= phaseLenEstimate) error->one(FLERR,"phaseLenEstimate was too small");
-    d_ssa_itemLoc(workPhase, workItem) = inum; // record where workItem starts in ilist
+    d_ssa_itemLoc(workPhase, workItem) = inum; // record where workItem actually starts in ilist
 
     for (int subphase = 0; subphase < 4; subphase++) {
       int s_ybin = ybin + ((subphase & 0x2) ? sy1 - 1 : 0);
@@ -441,18 +493,31 @@ void NPairSSAKokkosExecute<DeviceType>::build_locals()
         }
       }
     }
-    // record where workItem ends in ilist
+    // record where workItem actually ends in ilist
     d_ssa_itemLen(workPhase,workItem) = inum - d_ssa_itemLoc(workPhase,workItem);
     if (d_ssa_itemLen(workPhase,workItem) > 0) workItem++;
   }
   }
   }
 
+fprintf(stdout, "phase %3d used %6d inums, expected %6d inums. workItems = %3d, inums/workItems = %g\n"
+  ,workPhase
+  ,inum - d_ssa_itemLoc(workPhase, 0)
+  ,(nlocal*4 + ssa_phaseCt - 1) / ssa_phaseCt
+  ,workItem
+  ,(inum - d_ssa_itemLoc(workPhase, 0)) / (double) workItem
+);
     // record where workPhase ends
     d_ssa_phaseLen(workPhase++) = workItem;
   }
   }
   }
+fprintf(stdout, "Total %3d could use %6d inums, expected %6d inums. inums/phase = %g\n"
+  ,workPhase
+  ,inum
+  ,nlocal*4
+  ,inum / (double) workPhase
+);
 
 //FIXME  if (ssa_phaseCt != workPhase) error->one(FLERR,"ssa_phaseCt was wrong");
 

From ac4c35ce8d1caf0d7deaa1f0c816c0f5f5d9c523 Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Sat, 1 Apr 2017 13:45:29 -0400
Subject: [PATCH 227/267] USER-DPD Kokkos: more WIP on preflighting SSA
 neighbor list build, with debugging

---
 src/KOKKOS/npair_ssa_kokkos.cpp | 26 ++++++++++++++++++++------
 1 file changed, 20 insertions(+), 6 deletions(-)

diff --git a/src/KOKKOS/npair_ssa_kokkos.cpp b/src/KOKKOS/npair_ssa_kokkos.cpp
index 87cc02e734..5c20f1c270 100644
--- a/src/KOKKOS/npair_ssa_kokkos.cpp
+++ b/src/KOKKOS/npair_ssa_kokkos.cpp
@@ -205,8 +205,7 @@ void NPairSSAKokkos<DeviceType>::build(NeighList *list_)
 {
   NeighListKokkos<DeviceType>* list = (NeighListKokkos<DeviceType>*) list_;
   const int nlocal = includegroup?atom->nfirst:atom->nlocal;
-  const int nl_size = (nlocal * 4) + atom->nghost;
-  list->grow(nl_size); // Make special larger SSA neighbor list
+  int nl_size = atom->nghost;
 
   ssa_phaseCt = sz1*sy1*sx1;
 
@@ -240,8 +239,11 @@ void NPairSSAKokkos<DeviceType>::build(NeighList *list_)
     ssa_gitemLen = k_ssa_gitemLen.view<DeviceType>();
   }
 
-{ // Preflight the neighbor list build
+{ // Preflight the neighbor list workplan
   const typename ArrayTypes<DeviceType>::t_int_1d_const c_bincount = k_bincount.view<DeviceType>();
+  const typename ArrayTypes<DeviceType>::t_int_2d_const c_bins     = k_bins.view<DeviceType>();
+  const typename ArrayTypes<DeviceType>::t_int_1d_const_um c_stencil = k_stencil.view<DeviceType>();
+  const typename ArrayTypes<DeviceType>::t_int_1d_const c_nstencil_ssa = k_nstencil_ssa.view<DeviceType>();
   int inum = 0;
 
   int workPhase = 0;
@@ -263,7 +265,17 @@ void NPairSSAKokkos<DeviceType>::build(NeighList *list_)
       if ((s_xbin < lbinxlo) || (s_xbin >= lbinxhi)) continue;
 
       int ibin = zbin*mbiny*mbinx + s_ybin*mbinx + s_xbin;
-      inum += c_bincount(ibin);
+      for (int il = 0; il < c_bincount(ibin); ++il) {
+        int n = 0;
+
+        // count all local atoms in the current stencil "subphase" as potential neighbors
+        for (int k = c_nstencil_ssa(subphase); k < c_nstencil_ssa(subphase+1); k++) {
+          const int jbin = ibin+c_stencil(k);
+          int jl = (jbin != ibin) ? 0 : (il + 1); // same bin as il, so start just past il in the bin
+          n +=  c_bincount(jbin) - jl;
+        }
+        if (n > 0) inum++;
+      }
     }
     // record where workItem ends in ilist
     ssa_itemLen(workPhase,workItem) = inum - ssa_itemLoc(workPhase,workItem);
@@ -290,8 +302,11 @@ fprintf(stdout, "total %3d could use %6d inums, expected %6d inums. inums/phase
   ,nlocal*4
   ,inum / (double) workPhase
 );
+  nl_size += inum;
 }
 
+  list->grow(nl_size); // Make special larger SSA neighbor list
+
   NPairSSAKokkosExecute<DeviceType>
     data(*list,
          k_cutneighsq.view<DeviceType>(),
@@ -404,7 +419,6 @@ fprintf(stdout, "total %3d could use %6d inums, expected %6d inums. inums/phase
 template<class DeviceType>
 void NPairSSAKokkosExecute<DeviceType>::build_locals()
 {
-  int n = 0;
   int which = 0;
   int inum = 0;
   int workPhase = 0;
@@ -429,7 +443,7 @@ void NPairSSAKokkosExecute<DeviceType>::build_locals()
       int ibin = zbin*mbiny*mbinx + s_ybin*mbinx + s_xbin;
       for (int il = 0; il < c_bincount(ibin); ++il) {
         const int i = c_bins(ibin, il);
-        n = 0;
+        int n = 0;
 
         const AtomNeighbors neighbors_i = neigh_list.get_neighbors(inum);
         const X_FLOAT xtmp = x(i, 0);

From e0021a3ff51702ed4b5c79720dfe69dd247988fa Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Sat, 1 Apr 2017 14:41:52 -0400
Subject: [PATCH 228/267] USER-DPD Kokkos: preflight SSA neigh list workplan to
 reduce allocated storage

---
 src/KOKKOS/npair_ssa_kokkos.cpp | 36 +++++++++++++++++++++------------
 1 file changed, 23 insertions(+), 13 deletions(-)

diff --git a/src/KOKKOS/npair_ssa_kokkos.cpp b/src/KOKKOS/npair_ssa_kokkos.cpp
index 5c20f1c270..1c7095c9b4 100644
--- a/src/KOKKOS/npair_ssa_kokkos.cpp
+++ b/src/KOKKOS/npair_ssa_kokkos.cpp
@@ -205,7 +205,7 @@ void NPairSSAKokkos<DeviceType>::build(NeighList *list_)
 {
   NeighListKokkos<DeviceType>* list = (NeighListKokkos<DeviceType>*) list_;
   const int nlocal = includegroup?atom->nfirst:atom->nlocal;
-  int nl_size = atom->nghost;
+  int nl_size;
 
   ssa_phaseCt = sz1*sy1*sx1;
 
@@ -265,17 +265,17 @@ void NPairSSAKokkos<DeviceType>::build(NeighList *list_)
       if ((s_xbin < lbinxlo) || (s_xbin >= lbinxhi)) continue;
 
       int ibin = zbin*mbiny*mbinx + s_ybin*mbinx + s_xbin;
-      for (int il = 0; il < c_bincount(ibin); ++il) {
-        int n = 0;
-
-        // count all local atoms in the current stencil "subphase" as potential neighbors
-        for (int k = c_nstencil_ssa(subphase); k < c_nstencil_ssa(subphase+1); k++) {
-          const int jbin = ibin+c_stencil(k);
-          int jl = (jbin != ibin) ? 0 : (il + 1); // same bin as il, so start just past il in the bin
-          n +=  c_bincount(jbin) - jl;
-        }
-        if (n > 0) inum++;
+      int base_n = 0;
+      bool include_same = false;
+      // count all local atoms in the current stencil "subphase" as potential neighbors
+      for (int k = c_nstencil_ssa(subphase); k < c_nstencil_ssa(subphase+1); k++) {
+        const int jbin = ibin+c_stencil(k);
+        if (jbin != ibin) base_n += c_bincount(jbin);
+        else include_same = true;
       }
+      // Calculate how many ibin particles would have had some neighbors
+      if (base_n > 0) inum += c_bincount(ibin);
+      else if (include_same) inum += c_bincount(ibin) - 1;
     }
     // record where workItem ends in ilist
     ssa_itemLen(workPhase,workItem) = inum - ssa_itemLoc(workPhase,workItem);
@@ -302,9 +302,12 @@ fprintf(stdout, "total %3d could use %6d inums, expected %6d inums. inums/phase
   ,nlocal*4
   ,inum / (double) workPhase
 );
-  nl_size += inum;
+  nl_size = inum; // record how much space is needed for the local work plan
 }
-
+  // count how many ghosts are likely to have neighbors, and increase the work plan storage
+  for (int workPhase = 0; workPhase < ssa_gphaseCt; workPhase++) {
+     nl_size += k_gbincount.h_view(workPhase + 1);
+  }
   list->grow(nl_size); // Make special larger SSA neighbor list
 
   NPairSSAKokkosExecute<DeviceType>
@@ -412,6 +415,13 @@ fprintf(stdout, "total %3d could use %6d inums, expected %6d inums. inums/phase
   list->inum = data.neigh_list.inum; //FIXME once the above is in a parallel_for
   list->gnum = data.neigh_list.gnum; // it will need a deep_copy or something
 
+fprintf(stdout, "%6d inum %6d gnum, total used %6d, allocated %6d\n"
+  ,list->inum
+  ,list->gnum
+  ,list->inum + list->gnum
+  ,nl_size
+);
+
   list->k_ilist.template modify<DeviceType>();
 }
 

From c4c3d490c7e0e4c416a119f852ef2973229c2815 Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Sat, 1 Apr 2017 23:52:14 -0400
Subject: [PATCH 229/267] USER-DPD Kokkos: preflight storage needed for SSA
 threaded neigh list build

---
 src/KOKKOS/npair_ssa_kokkos.cpp | 122 +++++++++++++++++++++++---------
 src/KOKKOS/npair_ssa_kokkos.h   |   2 +-
 2 files changed, 90 insertions(+), 34 deletions(-)

diff --git a/src/KOKKOS/npair_ssa_kokkos.cpp b/src/KOKKOS/npair_ssa_kokkos.cpp
index 1c7095c9b4..042c48fbac 100644
--- a/src/KOKKOS/npair_ssa_kokkos.cpp
+++ b/src/KOKKOS/npair_ssa_kokkos.cpp
@@ -25,6 +25,7 @@
 #include "nbin_ssa_kokkos.h"
 #include "nstencil_ssa.h"
 #include "error.h"
+#include "comm.h"
 
 namespace LAMMPS_NS {
 
@@ -255,8 +256,8 @@ void NPairSSAKokkos<DeviceType>::build(NeighList *list_)
   for (int zbin = lbinzlo + zoff; zbin < lbinzhi; zbin += sz1) {
   for (int ybin = lbinylo + yoff - sy1 + 1; ybin < lbinyhi; ybin += sy1) {
   for (int xbin = lbinxlo + xoff - sx1 + 1; xbin < lbinxhi; xbin += sx1) {
+    int inum_start = inum;
 //    if (workItem >= phaseLenEstimate) error->one(FLERR,"phaseLenEstimate was too small");
-    ssa_itemLoc(workPhase, workItem) = inum; // record where workItem starts in ilist
 
     for (int subphase = 0; subphase < 4; subphase++) {
       int s_ybin = ybin + ((subphase & 0x2) ? sy1 - 1 : 0);
@@ -264,27 +265,40 @@ void NPairSSAKokkos<DeviceType>::build(NeighList *list_)
       if ((s_ybin < lbinylo) || (s_ybin >= lbinyhi)) continue;
       if ((s_xbin < lbinxlo) || (s_xbin >= lbinxhi)) continue;
 
-      int ibin = zbin*mbiny*mbinx + s_ybin*mbinx + s_xbin;
-      int base_n = 0;
-      bool include_same = false;
-      // count all local atoms in the current stencil "subphase" as potential neighbors
-      for (int k = c_nstencil_ssa(subphase); k < c_nstencil_ssa(subphase+1); k++) {
-        const int jbin = ibin+c_stencil(k);
-        if (jbin != ibin) base_n += c_bincount(jbin);
-        else include_same = true;
+      const int ibin = zbin*mbiny*mbinx + s_ybin*mbinx + s_xbin;
+      const int ibinCt = c_bincount(ibin);
+      if (ibinCt > 0) {
+        int base_n = 0;
+        bool include_same = false;
+        // count all local atoms in the current stencil "subphase" as potential neighbors
+        for (int k = c_nstencil_ssa(subphase); k < c_nstencil_ssa(subphase+1); k++) {
+          const int jbin = ibin+c_stencil(k);
+          if (jbin != ibin) base_n += c_bincount(jbin);
+          else include_same = true;
+        }
+        // Calculate how many ibin particles would have had some neighbors
+        if (base_n > 0) inum += ibinCt;
+        else if (include_same) inum += ibinCt - 1;
       }
-      // Calculate how many ibin particles would have had some neighbors
-      if (base_n > 0) inum += c_bincount(ibin);
-      else if (include_same) inum += c_bincount(ibin) - 1;
     }
-    // record where workItem ends in ilist
-    ssa_itemLen(workPhase,workItem) = inum - ssa_itemLoc(workPhase,workItem);
-    if (ssa_itemLen(workPhase,workItem) > 0) workItem++;
+    /* if (inum > inum_start) */ {
+      ssa_itemLoc(workPhase,workItem) = inum_start; // record where workItem starts in ilist
+      ssa_itemLen(workPhase,workItem) = inum - inum_start; // record workItem length
+if (ssa_itemLen(workPhase,workItem) < 0) fprintf(stdout, "undr%03d phase (%3d,%3d) inum %d - inum_start %d UNDERFLOW\n"
+  ,comm->me
+  ,workPhase
+  ,workItem
+  ,inum
+  ,inum_start
+);
+      workItem++;
+    }
   }
   }
   }
 
-fprintf(stdout, "phase %3d could use %6d inums, expected %6d inums. maxworkItems = %3d, inums/workItems = %g\n"
+fprintf(stdout, "phas%03d phase %3d could use %6d inums, expected %6d inums. maxworkItems = %3d, inums/workItems = %g\n"
+  ,comm->me
   ,workPhase
   ,inum - ssa_itemLoc(workPhase, 0)
   ,(nlocal*4 + ssa_phaseCt - 1) / ssa_phaseCt
@@ -296,7 +310,8 @@ fprintf(stdout, "phase %3d could use %6d inums, expected %6d inums. maxworkItems
   }
   }
   }
-fprintf(stdout, "total %3d could use %6d inums, expected %6d inums. inums/phase = %g\n"
+fprintf(stdout, "tota%03d total %3d could use %6d inums, expected %6d inums. inums/phase = %g\n"
+  ,comm->me
   ,workPhase
   ,inum
   ,nlocal*4
@@ -378,6 +393,7 @@ fprintf(stdout, "total %3d could use %6d inums, expected %6d inums. inums/phase
   data.special_flag[2] = special_flag[2];
   data.special_flag[3] = special_flag[3];
 
+  bool firstTry = true;
   data.h_resize()=1;
   while(data.h_resize()) {
     data.h_new_maxneighs() = list->maxneighs;
@@ -390,8 +406,9 @@ fprintf(stdout, "total %3d could use %6d inums, expected %6d inums. inums/phase
     NPairSSAKokkosBuildFunctor<DeviceType> f(data,atoms_per_bin*5*sizeof(X_FLOAT));
     Kokkos::parallel_for(nall, f);
 #endif
-    data.build_locals();
+    data.build_locals(firstTry, comm->me);
     data.build_ghosts();
+    firstTry = false;
 
     DeviceType::fence();
     deep_copy(data.h_resize, data.resize);
@@ -415,7 +432,8 @@ fprintf(stdout, "total %3d could use %6d inums, expected %6d inums. inums/phase
   list->inum = data.neigh_list.inum; //FIXME once the above is in a parallel_for
   list->gnum = data.neigh_list.gnum; // it will need a deep_copy or something
 
-fprintf(stdout, "%6d inum %6d gnum, total used %6d, allocated %6d\n"
+fprintf(stdout, "Fina%03d: %6d inum %6d gnum, total used %6d, allocated %6d\n"
+  ,comm->me
   ,list->inum
   ,list->gnum
   ,list->inum + list->gnum
@@ -427,8 +445,9 @@ fprintf(stdout, "%6d inum %6d gnum, total used %6d, allocated %6d\n"
 
 
 template<class DeviceType>
-void NPairSSAKokkosExecute<DeviceType>::build_locals()
+void NPairSSAKokkosExecute<DeviceType>::build_locals(const bool firstTry, int me)
 {
+  const typename ArrayTypes<DeviceType>::t_int_1d_const_um stencil = d_stencil;
   int which = 0;
   int inum = 0;
   int workPhase = 0;
@@ -438,11 +457,29 @@ void NPairSSAKokkosExecute<DeviceType>::build_locals()
   for (int yoff = sy1 - 1; yoff >= 0; --yoff) {
   for (int xoff = sx1 - 1; xoff >= 0; --xoff) {
     int workItem = 0;
-    inum = d_ssa_itemLoc(workPhase, workItem); // get where workPhase starts in ilist
+    int skippedItems = 0;
+//    inum = d_ssa_itemLoc(workPhase, workItem); // get where workPhase starts in ilist
   for (int zbin = lbinzlo + zoff; zbin < lbinzhi; zbin += sz1) {
   for (int ybin = lbinylo + yoff - sy1 + 1; ybin < lbinyhi; ybin += sy1) {
   for (int xbin = lbinxlo + xoff - sx1 + 1; xbin < lbinxhi; xbin += sx1) {
-    d_ssa_itemLoc(workPhase, workItem) = inum; // record where workItem actually starts in ilist
+    if (d_ssa_itemLen(workPhase, workItem + skippedItems) == 0) {
+      if (firstTry) ++skippedItems;
+      else ++workItem; // phase is done,should break out of three loops here if we could...
+      continue;
+    }
+    int inum_start = d_ssa_itemLoc(workPhase, workItem + skippedItems);
+    if (inum > inum_start) { // This shouldn't happen!
+fprintf(stdout, "Rank%03d workphase (%2d,%3d,%3d): inum = %4d, but ssa_itemLoc = %4d OVERFLOW\n"
+  ,me
+  ,workPhase
+  ,workItem
+  ,workItem + skippedItems
+  ,inum
+  ,d_ssa_itemLoc(workPhase, workItem + skippedItems)
+);
+      inum_start = inum;
+    } else inum = inum_start;
+    // d_ssa_itemLoc(workPhase, workItem) = inum; // record where workItem actually starts in ilist
 
     for (int subphase = 0; subphase < 4; subphase++) {
       int s_ybin = ybin + ((subphase & 0x2) ? sy1 - 1 : 0);
@@ -461,9 +498,6 @@ void NPairSSAKokkosExecute<DeviceType>::build_locals()
         const X_FLOAT ztmp = x(i, 2);
         const int itype = type(i);
 
-        const typename ArrayTypes<DeviceType>::t_int_1d_const_um stencil
-          = d_stencil;
-
         // loop over all local atoms in the current stencil "subphase"
         for (int k = d_nstencil_ssa(subphase); k < d_nstencil_ssa(subphase+1); k++) {
           const int jbin = ibin+stencil(k);
@@ -517,26 +551,48 @@ void NPairSSAKokkosExecute<DeviceType>::build_locals()
         }
       }
     }
-    // record where workItem actually ends in ilist
-    d_ssa_itemLen(workPhase,workItem) = inum - d_ssa_itemLoc(workPhase,workItem);
-    if (d_ssa_itemLen(workPhase,workItem) > 0) workItem++;
+    int len = inum - inum_start;
+    if (len != d_ssa_itemLen(workPhase, workItem + skippedItems)) {
+fprintf(stdout, "Leng%03d workphase (%2d,%3d,%3d): len  = %4d, but ssa_itemLen = %4d%s\n"
+  ,me
+  ,workPhase
+  ,workItem
+  ,workItem + skippedItems
+  ,len
+  ,d_ssa_itemLen(workPhase, workItem + skippedItems)
+  ,(len > d_ssa_itemLen(workPhase, workItem + skippedItems)) ? " OVERFLOW" : ""
+);
+    }
+    if (inum > inum_start) {
+      d_ssa_itemLoc(workPhase,workItem) = inum_start; // record where workItem starts in ilist
+      d_ssa_itemLen(workPhase,workItem) = inum - inum_start; // record actual workItem length
+      workItem++;
+    } else if (firstTry) ++skippedItems;
   }
   }
   }
 
-fprintf(stdout, "phase %3d used %6d inums, expected %6d inums. workItems = %3d, inums/workItems = %g\n"
+fprintf(stdout, "Phas%03d phase %3d used %6d inums, workItems = %3d, skipped = %3d, inums/workItems = %g\n"
+  ,me
   ,workPhase
   ,inum - d_ssa_itemLoc(workPhase, 0)
-  ,(nlocal*4 + ssa_phaseCt - 1) / ssa_phaseCt
   ,workItem
+  ,skippedItems
   ,(inum - d_ssa_itemLoc(workPhase, 0)) / (double) workItem
 );
-    // record where workPhase ends
-    d_ssa_phaseLen(workPhase++) = workItem;
+    // record where workPhase actually ends
+    if (firstTry) {
+      d_ssa_phaseLen(workPhase) = workItem;
+      while (workItem < (int) d_ssa_itemLen.dimension_1()) {
+        d_ssa_itemLen(workPhase,workItem++) = 0;
+      }
+    }
+    ++workPhase;
   }
   }
   }
-fprintf(stdout, "Total %3d could use %6d inums, expected %6d inums. inums/phase = %g\n"
+fprintf(stdout, "Totl%03d %3d could use %6d inums, expected %6d inums. inums/phase = %g\n"
+  ,me
   ,workPhase
   ,inum
   ,nlocal*4
diff --git a/src/KOKKOS/npair_ssa_kokkos.h b/src/KOKKOS/npair_ssa_kokkos.h
index 96efd7404b..2c2ae15fb8 100644
--- a/src/KOKKOS/npair_ssa_kokkos.h
+++ b/src/KOKKOS/npair_ssa_kokkos.h
@@ -289,7 +289,7 @@ class NPairSSAKokkosExecute
 
   ~NPairSSAKokkosExecute() {neigh_list.copymode = 1;};
 
-  void build_locals();
+  void build_locals(const bool firstTry, int me);
   void build_ghosts();
 
   KOKKOS_INLINE_FUNCTION

From 2b2f3bd57c1d42a87334812cdab06f7e75405b7e Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Sun, 2 Apr 2017 00:07:24 -0400
Subject: [PATCH 230/267] USER-DPD Kokkos: #ifdef DEBUG_SSA_BUILD_LOCALS the
 new debug output

---
 src/KOKKOS/npair_ssa_kokkos.cpp | 31 +++++++++++++++++++++++--------
 1 file changed, 23 insertions(+), 8 deletions(-)

diff --git a/src/KOKKOS/npair_ssa_kokkos.cpp b/src/KOKKOS/npair_ssa_kokkos.cpp
index 042c48fbac..4c3218a08a 100644
--- a/src/KOKKOS/npair_ssa_kokkos.cpp
+++ b/src/KOKKOS/npair_ssa_kokkos.cpp
@@ -281,9 +281,9 @@ void NPairSSAKokkos<DeviceType>::build(NeighList *list_)
         else if (include_same) inum += ibinCt - 1;
       }
     }
-    /* if (inum > inum_start) */ {
-      ssa_itemLoc(workPhase,workItem) = inum_start; // record where workItem starts in ilist
-      ssa_itemLen(workPhase,workItem) = inum - inum_start; // record workItem length
+    ssa_itemLoc(workPhase,workItem) = inum_start; // record where workItem starts in ilist
+    ssa_itemLen(workPhase,workItem) = inum - inum_start; // record workItem length
+#ifdef DEBUG_SSA_BUILD_LOCALS
 if (ssa_itemLen(workPhase,workItem) < 0) fprintf(stdout, "undr%03d phase (%3d,%3d) inum %d - inum_start %d UNDERFLOW\n"
   ,comm->me
   ,workPhase
@@ -291,12 +291,13 @@ if (ssa_itemLen(workPhase,workItem) < 0) fprintf(stdout, "undr%03d phase (%3d,%3
   ,inum
   ,inum_start
 );
-      workItem++;
-    }
+#endif
+    workItem++;
   }
   }
   }
 
+#ifdef DEBUG_SSA_BUILD_LOCALS
 fprintf(stdout, "phas%03d phase %3d could use %6d inums, expected %6d inums. maxworkItems = %3d, inums/workItems = %g\n"
   ,comm->me
   ,workPhase
@@ -305,11 +306,13 @@ fprintf(stdout, "phas%03d phase %3d could use %6d inums, expected %6d inums. max
   ,workItem
   ,(inum - ssa_itemLoc(workPhase, 0)) / (double) workItem
 );
+#endif
     // record where workPhase ends
     ssa_phaseLen(workPhase++) = workItem;
   }
   }
   }
+#ifdef DEBUG_SSA_BUILD_LOCALS
 fprintf(stdout, "tota%03d total %3d could use %6d inums, expected %6d inums. inums/phase = %g\n"
   ,comm->me
   ,workPhase
@@ -317,9 +320,10 @@ fprintf(stdout, "tota%03d total %3d could use %6d inums, expected %6d inums. inu
   ,nlocal*4
   ,inum / (double) workPhase
 );
+#endif
   nl_size = inum; // record how much space is needed for the local work plan
 }
-  // count how many ghosts are likely to have neighbors, and increase the work plan storage
+  // count how many ghosts might have neighbors, and increase the work plan storage
   for (int workPhase = 0; workPhase < ssa_gphaseCt; workPhase++) {
      nl_size += k_gbincount.h_view(workPhase + 1);
   }
@@ -432,13 +436,15 @@ fprintf(stdout, "tota%03d total %3d could use %6d inums, expected %6d inums. inu
   list->inum = data.neigh_list.inum; //FIXME once the above is in a parallel_for
   list->gnum = data.neigh_list.gnum; // it will need a deep_copy or something
 
-fprintf(stdout, "Fina%03d: %6d inum %6d gnum, total used %6d, allocated %6d\n"
+#ifdef DEBUG_SSA_BUILD_LOCALS
+fprintf(stdout, "Fina%03d %6d inum %6d gnum, total used %6d, allocated %6d\n"
   ,comm->me
   ,list->inum
   ,list->gnum
   ,list->inum + list->gnum
   ,nl_size
 );
+#endif
 
   list->k_ilist.template modify<DeviceType>();
 }
@@ -468,6 +474,7 @@ void NPairSSAKokkosExecute<DeviceType>::build_locals(const bool firstTry, int me
       continue;
     }
     int inum_start = d_ssa_itemLoc(workPhase, workItem + skippedItems);
+#ifdef DEBUG_SSA_BUILD_LOCALS
     if (inum > inum_start) { // This shouldn't happen!
 fprintf(stdout, "Rank%03d workphase (%2d,%3d,%3d): inum = %4d, but ssa_itemLoc = %4d OVERFLOW\n"
   ,me
@@ -478,7 +485,9 @@ fprintf(stdout, "Rank%03d workphase (%2d,%3d,%3d): inum = %4d, but ssa_itemLoc =
   ,d_ssa_itemLoc(workPhase, workItem + skippedItems)
 );
       inum_start = inum;
-    } else inum = inum_start;
+    } else
+#endif
+    inum = inum_start;
     // d_ssa_itemLoc(workPhase, workItem) = inum; // record where workItem actually starts in ilist
 
     for (int subphase = 0; subphase < 4; subphase++) {
@@ -552,6 +561,7 @@ fprintf(stdout, "Rank%03d workphase (%2d,%3d,%3d): inum = %4d, but ssa_itemLoc =
       }
     }
     int len = inum - inum_start;
+#ifdef DEBUG_SSA_BUILD_LOCALS
     if (len != d_ssa_itemLen(workPhase, workItem + skippedItems)) {
 fprintf(stdout, "Leng%03d workphase (%2d,%3d,%3d): len  = %4d, but ssa_itemLen = %4d%s\n"
   ,me
@@ -563,6 +573,7 @@ fprintf(stdout, "Leng%03d workphase (%2d,%3d,%3d): len  = %4d, but ssa_itemLen =
   ,(len > d_ssa_itemLen(workPhase, workItem + skippedItems)) ? " OVERFLOW" : ""
 );
     }
+#endif
     if (inum > inum_start) {
       d_ssa_itemLoc(workPhase,workItem) = inum_start; // record where workItem starts in ilist
       d_ssa_itemLen(workPhase,workItem) = inum - inum_start; // record actual workItem length
@@ -572,6 +583,7 @@ fprintf(stdout, "Leng%03d workphase (%2d,%3d,%3d): len  = %4d, but ssa_itemLen =
   }
   }
 
+#ifdef DEBUG_SSA_BUILD_LOCALS
 fprintf(stdout, "Phas%03d phase %3d used %6d inums, workItems = %3d, skipped = %3d, inums/workItems = %g\n"
   ,me
   ,workPhase
@@ -580,6 +592,7 @@ fprintf(stdout, "Phas%03d phase %3d used %6d inums, workItems = %3d, skipped = %
   ,skippedItems
   ,(inum - d_ssa_itemLoc(workPhase, 0)) / (double) workItem
 );
+#endif
     // record where workPhase actually ends
     if (firstTry) {
       d_ssa_phaseLen(workPhase) = workItem;
@@ -591,6 +604,7 @@ fprintf(stdout, "Phas%03d phase %3d used %6d inums, workItems = %3d, skipped = %
   }
   }
   }
+#ifdef DEBUG_SSA_BUILD_LOCALS
 fprintf(stdout, "Totl%03d %3d could use %6d inums, expected %6d inums. inums/phase = %g\n"
   ,me
   ,workPhase
@@ -598,6 +612,7 @@ fprintf(stdout, "Totl%03d %3d could use %6d inums, expected %6d inums. inums/pha
   ,nlocal*4
   ,inum / (double) workPhase
 );
+#endif
 
 //FIXME  if (ssa_phaseCt != workPhase) error->one(FLERR,"ssa_phaseCt was wrong");
 

From aedd7c57f3f78596f8b737972aed5f241ef4f7f4 Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Mon, 3 Apr 2017 16:42:18 -0600
Subject: [PATCH 231/267] Reset atom map values from restart file

---
 src/read_restart.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/read_restart.cpp b/src/read_restart.cpp
index 331a5d6cda..fcbd8d186d 100644
--- a/src/read_restart.cpp
+++ b/src/read_restart.cpp
@@ -905,8 +905,10 @@ void ReadRestart::header(int incompatible)
       atom->tag_enable = read_int();
     } else if (flag == ATOM_MAP_STYLE) {
       atom->map_style = read_int();
+      atom->map_style = 0;
     } else if (flag == ATOM_MAP_USER) {
       atom->map_user  = read_int();
+      atom->map_user = 0;
     } else if (flag == ATOM_SORTFREQ) {
       atom->sortfreq = read_int();
     } else if (flag == ATOM_SORTBIN) {

From 4d4b6f66b7139be46a9f292d7bad2403b22117f3 Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Wed, 5 Apr 2017 11:42:25 -0600
Subject: [PATCH 232/267] Changing default gb/test to on

---
 src/KOKKOS/kokkos.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/KOKKOS/kokkos.cpp b/src/KOKKOS/kokkos.cpp
index a000ad5550..10e7bda4e0 100644
--- a/src/KOKKOS/kokkos.cpp
+++ b/src/KOKKOS/kokkos.cpp
@@ -34,7 +34,7 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
   lmp->kokkos = this;
 
   auto_sync = 1;
-  gb_test = 0;
+  gb_test = 1;
 
   int me = 0;
   MPI_Comm_rank(world,&me);
@@ -157,7 +157,7 @@ void KokkosLMP::accelerator(int narg, char **arg)
   neighflag = FULL;
   neighflag_qeq = FULL;
   neighflag_qeq_set = 0;
-  gb_test = 0;
+  gb_test = 1;
   int newtonflag = 0;
   double binsize = 0.0;
   exchange_comm_classic = forward_comm_classic = 0;

From 9e272cb393fdce5697267a2b629cd5a3a3fdc0b2 Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Thu, 6 Apr 2017 02:31:45 -0400
Subject: [PATCH 233/267] USER-DPD Kokkos: use a parallel_for() to build the
 locals workplan for SSA

---
 src/KOKKOS/npair_ssa_kokkos.cpp | 131 ++++++++++++++------------------
 src/KOKKOS/npair_ssa_kokkos.h   |  27 ++++++-
 2 files changed, 84 insertions(+), 74 deletions(-)

diff --git a/src/KOKKOS/npair_ssa_kokkos.cpp b/src/KOKKOS/npair_ssa_kokkos.cpp
index 4c3218a08a..2b33256599 100644
--- a/src/KOKKOS/npair_ssa_kokkos.cpp
+++ b/src/KOKKOS/npair_ssa_kokkos.cpp
@@ -34,6 +34,14 @@ namespace LAMMPS_NS {
 template<class DeviceType>
 NPairSSAKokkos<DeviceType>::NPairSSAKokkos(LAMMPS *lmp) : NPair(lmp), ssa_phaseCt(27), ssa_gphaseCt(7)
 {
+  const int gphaseLenEstimate = 1; //FIXME make this 4 eventually
+  k_ssa_gphaseLen = DAT::tdual_int_1d("NPairSSAKokkos:ssa_gphaseLen",ssa_gphaseCt);
+  ssa_gphaseLen = k_ssa_gphaseLen.view<DeviceType>();
+
+  k_ssa_gitemLoc = DAT::tdual_int_2d("NPairSSAKokkos::ssa_gitemLoc",ssa_gphaseCt,gphaseLenEstimate);
+  ssa_gitemLoc = k_ssa_gitemLoc.view<DeviceType>();
+  k_ssa_gitemLen = DAT::tdual_int_2d("NPairSSAKokkos::ssa_gitemLen",ssa_gphaseCt,gphaseLenEstimate);
+  ssa_gitemLen = k_ssa_gitemLen.view<DeviceType>();
 }
 
 /* ----------------------------------------------------------------------
@@ -132,6 +140,27 @@ void NPairSSAKokkos<DeviceType>::copy_stencil_info()
   sx1 = ns_ssa->sx + 1;
   sy1 = ns_ssa->sy + 1;
   sz1 = ns_ssa->sz + 1;
+
+  // Setup the phases of the workplan for locals
+  ssa_phaseCt = sz1*sy1*sx1;
+  if (ssa_phaseCt > (int) k_ssa_phaseLen.dimension_0()) {
+    k_ssa_phaseLen = DAT::tdual_int_1d("NPairSSAKokkos:ssa_phaseLen",ssa_phaseCt);
+    ssa_phaseLen = k_ssa_phaseLen.view<DeviceType>();
+    k_ssa_phaseOff = DAT::tdual_int_1d_3("NPairSSAKokkos:ssa_phaseOff",ssa_phaseCt);
+    ssa_phaseOff = k_ssa_phaseOff.view<DeviceType>();
+  }
+  int workPhase = 0;
+  for (int zoff = sz1 - 1; zoff >= 0; --zoff) {
+    for (int yoff = sy1 - 1; yoff >= 0; --yoff) {
+      for (int xoff = sx1 - 1; xoff >= 0; --xoff) {
+        ssa_phaseOff(workPhase, 0) = xoff;
+        ssa_phaseOff(workPhase, 1) = yoff;
+        ssa_phaseOff(workPhase, 2) = zoff;
+        workPhase++;
+      }
+    }
+  }
+
 }
 
 /* ---------------------------------------------------------------------- */
@@ -208,18 +237,11 @@ void NPairSSAKokkos<DeviceType>::build(NeighList *list_)
   const int nlocal = includegroup?atom->nfirst:atom->nlocal;
   int nl_size;
 
-  ssa_phaseCt = sz1*sy1*sx1;
+  int xbinCt = (lbinxhi - lbinxlo + sx1 - 1) / sx1 + 1;
+  int ybinCt = (lbinyhi - lbinylo + sy1 - 1) / sy1 + 1;
+  int zbinCt = (lbinzhi - lbinzlo + sz1 - 1) / sz1 + 1;
+  int phaseLenEstimate = xbinCt*ybinCt*zbinCt;
 
-  int xbin = (lbinxhi - lbinxlo + sx1 - 1) / sx1 + 1;
-  int ybin = (lbinyhi - lbinylo + sy1 - 1) / sy1 + 1;
-  int zbin = (lbinzhi - lbinzlo + sz1 - 1) / sz1 + 1;
-  int phaseLenEstimate = xbin*ybin*zbin;
-  int gphaseLenEstimate = 1; //FIXME make this 4 eventually
-
-  if (ssa_phaseCt > (int) k_ssa_phaseLen.dimension_0()) {
-    k_ssa_phaseLen = DAT::tdual_int_1d("NPairSSAKokkos:ssa_phaseLen",ssa_phaseCt);
-    ssa_phaseLen = k_ssa_phaseLen.view<DeviceType>();
-  }
   if ((ssa_phaseCt > (int) k_ssa_itemLoc.dimension_0()) ||
       (phaseLenEstimate > (int) k_ssa_itemLoc.dimension_1())) {
     k_ssa_itemLoc = DAT::tdual_int_2d("NPairSSAKokkos::ssa_itemLoc",ssa_phaseCt,phaseLenEstimate);
@@ -228,18 +250,6 @@ void NPairSSAKokkos<DeviceType>::build(NeighList *list_)
     ssa_itemLen = k_ssa_itemLen.view<DeviceType>();
   }
 
-  if (ssa_gphaseCt > (int) k_ssa_gphaseLen.dimension_0()) {
-    k_ssa_gphaseLen = DAT::tdual_int_1d("NPairSSAKokkos:ssa_gphaseLen",ssa_gphaseCt);
-    ssa_gphaseLen = k_ssa_gphaseLen.view<DeviceType>();
-  }
-  if ((ssa_gphaseCt > (int) k_ssa_gitemLoc.dimension_0()) ||
-      (gphaseLenEstimate > (int) k_ssa_gitemLoc.dimension_1())) {
-    k_ssa_gitemLoc = DAT::tdual_int_2d("NPairSSAKokkos::ssa_gitemLoc",ssa_gphaseCt,gphaseLenEstimate);
-    ssa_gitemLoc = k_ssa_gitemLoc.view<DeviceType>();
-    k_ssa_gitemLen = DAT::tdual_int_2d("NPairSSAKokkos::ssa_gitemLen",ssa_gphaseCt,gphaseLenEstimate);
-    ssa_gitemLen = k_ssa_gitemLen.view<DeviceType>();
-  }
-
 { // Preflight the neighbor list workplan
   const typename ArrayTypes<DeviceType>::t_int_1d_const c_bincount = k_bincount.view<DeviceType>();
   const typename ArrayTypes<DeviceType>::t_int_2d_const c_bins     = k_bins.view<DeviceType>();
@@ -247,11 +257,11 @@ void NPairSSAKokkos<DeviceType>::build(NeighList *list_)
   const typename ArrayTypes<DeviceType>::t_int_1d_const c_nstencil_ssa = k_nstencil_ssa.view<DeviceType>();
   int inum = 0;
 
-  int workPhase = 0;
-  // loop over bins with local atoms, storing half of the neighbors
-  for (int zoff = sz1 - 1; zoff >= 0; --zoff) {
-  for (int yoff = sy1 - 1; yoff >= 0; --yoff) {
-  for (int xoff = sx1 - 1; xoff >= 0; --xoff) {
+  // loop over bins with local atoms, counting half of the neighbors
+  for (int workPhase = 0; workPhase < ssa_phaseCt; ++workPhase) {
+    int zoff = ssa_phaseOff(workPhase, 2);
+    int yoff = ssa_phaseOff(workPhase, 1);
+    int xoff = ssa_phaseOff(workPhase, 0);
     int workItem = 0;
   for (int zbin = lbinzlo + zoff; zbin < lbinzhi; zbin += sz1) {
   for (int ybin = lbinylo + yoff - sy1 + 1; ybin < lbinyhi; ybin += sy1) {
@@ -308,9 +318,7 @@ fprintf(stdout, "phas%03d phase %3d could use %6d inums, expected %6d inums. max
 );
 #endif
     // record where workPhase ends
-    ssa_phaseLen(workPhase++) = workItem;
-  }
-  }
+    ssa_phaseLen(workPhase) = workItem;
   }
 #ifdef DEBUG_SSA_BUILD_LOCALS
 fprintf(stdout, "tota%03d total %3d could use %6d inums, expected %6d inums. inums/phase = %g\n"
@@ -343,6 +351,7 @@ fprintf(stdout, "tota%03d total %3d could use %6d inums, expected %6d inums. inu
          k_nstencil_ssa.view<DeviceType>(),
          ssa_phaseCt,
          k_ssa_phaseLen.view<DeviceType>(),
+         k_ssa_phaseOff.view<DeviceType>(),
          k_ssa_itemLoc.view<DeviceType>(),
          k_ssa_itemLen.view<DeviceType>(),
          ssa_gphaseCt,
@@ -410,7 +419,17 @@ fprintf(stdout, "tota%03d total %3d could use %6d inums, expected %6d inums. inu
     NPairSSAKokkosBuildFunctor<DeviceType> f(data,atoms_per_bin*5*sizeof(X_FLOAT));
     Kokkos::parallel_for(nall, f);
 #endif
-    data.build_locals(firstTry, comm->me);
+    // loop over bins with local atoms, storing half of the neighbors
+#ifdef USE_LAMBDA_BUILD
+    Kokkos::parallel_for(ssa_phaseCt, LAMMPS_LAMBDA (const int workPhase) {
+      data.build_locals_onePhase(firstTry, comm->me, workPhase);
+    });
+#else
+    NPairSSAKokkosBuildFunctor<DeviceType> f(data, firstTry, comm->me);
+    Kokkos::parallel_for(ssa_phaseCt, f);
+#endif
+    data.neigh_list.inum = ssa_itemLoc(ssa_phaseCt-1,ssa_phaseLen(ssa_phaseCt-1)-1) +
+      ssa_itemLen(ssa_phaseCt-1,ssa_phaseLen(ssa_phaseCt-1)-1);
     data.build_ghosts();
     firstTry = false;
 
@@ -451,20 +470,16 @@ fprintf(stdout, "Fina%03d %6d inum %6d gnum, total used %6d, allocated %6d\n"
 
 
 template<class DeviceType>
-void NPairSSAKokkosExecute<DeviceType>::build_locals(const bool firstTry, int me)
+void NPairSSAKokkosExecute<DeviceType>::build_locals_onePhase(const bool firstTry, int me, int workPhase) const
 {
   const typename ArrayTypes<DeviceType>::t_int_1d_const_um stencil = d_stencil;
   int which = 0;
-  int inum = 0;
-  int workPhase = 0;
 
-  // loop over bins with local atoms, storing half of the neighbors
-  for (int zoff = sz1 - 1; zoff >= 0; --zoff) {
-  for (int yoff = sy1 - 1; yoff >= 0; --yoff) {
-  for (int xoff = sx1 - 1; xoff >= 0; --xoff) {
-    int workItem = 0;
-    int skippedItems = 0;
-//    inum = d_ssa_itemLoc(workPhase, workItem); // get where workPhase starts in ilist
+  int zoff = d_ssa_phaseOff(workPhase, 2);
+  int yoff = d_ssa_phaseOff(workPhase, 1);
+  int xoff = d_ssa_phaseOff(workPhase, 0);
+  int workItem = 0;
+  int skippedItems = 0;
   for (int zbin = lbinzlo + zoff; zbin < lbinzhi; zbin += sz1) {
   for (int ybin = lbinylo + yoff - sy1 + 1; ybin < lbinyhi; ybin += sy1) {
   for (int xbin = lbinxlo + xoff - sx1 + 1; xbin < lbinxhi; xbin += sx1) {
@@ -474,21 +489,7 @@ void NPairSSAKokkosExecute<DeviceType>::build_locals(const bool firstTry, int me
       continue;
     }
     int inum_start = d_ssa_itemLoc(workPhase, workItem + skippedItems);
-#ifdef DEBUG_SSA_BUILD_LOCALS
-    if (inum > inum_start) { // This shouldn't happen!
-fprintf(stdout, "Rank%03d workphase (%2d,%3d,%3d): inum = %4d, but ssa_itemLoc = %4d OVERFLOW\n"
-  ,me
-  ,workPhase
-  ,workItem
-  ,workItem + skippedItems
-  ,inum
-  ,d_ssa_itemLoc(workPhase, workItem + skippedItems)
-);
-      inum_start = inum;
-    } else
-#endif
-    inum = inum_start;
-    // d_ssa_itemLoc(workPhase, workItem) = inum; // record where workItem actually starts in ilist
+    int inum = inum_start;
 
     for (int subphase = 0; subphase < 4; subphase++) {
       int s_ybin = ybin + ((subphase & 0x2) ? sy1 - 1 : 0);
@@ -600,23 +601,7 @@ fprintf(stdout, "Phas%03d phase %3d used %6d inums, workItems = %3d, skipped = %
         d_ssa_itemLen(workPhase,workItem++) = 0;
       }
     }
-    ++workPhase;
-  }
-  }
-  }
-#ifdef DEBUG_SSA_BUILD_LOCALS
-fprintf(stdout, "Totl%03d %3d could use %6d inums, expected %6d inums. inums/phase = %g\n"
-  ,me
-  ,workPhase
-  ,inum
-  ,nlocal*4
-  ,inum / (double) workPhase
-);
-#endif
 
-//FIXME  if (ssa_phaseCt != workPhase) error->one(FLERR,"ssa_phaseCt was wrong");
-
-  neigh_list.inum = inum;
 }
 
 
diff --git a/src/KOKKOS/npair_ssa_kokkos.h b/src/KOKKOS/npair_ssa_kokkos.h
index 2c2ae15fb8..62c4135cc7 100644
--- a/src/KOKKOS/npair_ssa_kokkos.h
+++ b/src/KOKKOS/npair_ssa_kokkos.h
@@ -41,9 +41,11 @@ class NPairSSAKokkos : public NPair {
   // SSA Work plan data structures
   int ssa_phaseCt;
   DAT::tdual_int_1d k_ssa_phaseLen;
+  DAT::tdual_int_1d_3 k_ssa_phaseOff;
   DAT::tdual_int_2d k_ssa_itemLoc;
   DAT::tdual_int_2d k_ssa_itemLen;
   typename AT::t_int_1d ssa_phaseLen;
+  typename AT::t_int_1d_3 ssa_phaseOff;
   typename AT::t_int_2d ssa_itemLoc;
   typename AT::t_int_2d ssa_itemLen;
 
@@ -175,6 +177,7 @@ class NPairSSAKokkosExecute
   // SSA Work plan data structures
   int ssa_phaseCt;
   typename AT::t_int_1d d_ssa_phaseLen;
+  typename AT::t_int_1d_3_const d_ssa_phaseOff;
   typename AT::t_int_2d d_ssa_itemLoc;
   typename AT::t_int_2d d_ssa_itemLen;
   int ssa_gphaseCt;
@@ -198,6 +201,7 @@ class NPairSSAKokkosExecute
         const typename AT::t_int_1d &_d_nstencil_ssa,
         const int _ssa_phaseCt,
         const typename AT::t_int_1d &_d_ssa_phaseLen,
+        const typename AT::t_int_1d_3 &_d_ssa_phaseOff,
         const typename AT::t_int_2d &_d_ssa_itemLoc,
         const typename AT::t_int_2d &_d_ssa_itemLen,
         const int _ssa_gphaseCt,
@@ -242,6 +246,7 @@ class NPairSSAKokkosExecute
     d_stencil(_d_stencil),d_stencilxyz(_d_stencilxyz),d_nstencil_ssa(_d_nstencil_ssa),
     ssa_phaseCt(_ssa_phaseCt),
     d_ssa_phaseLen(_d_ssa_phaseLen),
+    d_ssa_phaseOff(_d_ssa_phaseOff),
     d_ssa_itemLoc(_d_ssa_itemLoc),
     d_ssa_itemLen(_d_ssa_itemLen),
     ssa_gphaseCt(_ssa_gphaseCt),
@@ -289,7 +294,9 @@ class NPairSSAKokkosExecute
 
   ~NPairSSAKokkosExecute() {neigh_list.copymode = 1;};
 
-  void build_locals(const bool firstTry, int me);
+  KOKKOS_FUNCTION
+  void build_locals_onePhase(const bool firstTry, int me, int workPhase) const;
+
   void build_ghosts();
 
   KOKKOS_INLINE_FUNCTION
@@ -344,6 +351,24 @@ class NPairSSAKokkosExecute
 
 };
 
+template<class DeviceType>
+struct NPairSSAKokkosBuildFunctor {
+  typedef DeviceType device_type;
+
+  const NPairSSAKokkosExecute<DeviceType> c;
+  const bool firstTry;
+  const int me;
+
+  NPairSSAKokkosBuildFunctor(const NPairSSAKokkosExecute<DeviceType> &_c,
+                             const bool _firstTry, const int _me):c(_c),
+                             firstTry(_firstTry), me(_me) {};
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int & i) const {
+    c.build_locals_onePhase(firstTry, me, i);
+  }
+};
+
 }
 
 #endif

From 178af2ec9e7225daff3ce853af749f6fdb6e58a9 Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Thu, 6 Apr 2017 03:53:57 -0400
Subject: [PATCH 234/267] USER-DPD Kokkos: use a parallel_for() to build the
 ghosts workplan for SSA

---
 src/KOKKOS/npair_ssa_kokkos.cpp | 47 +++++++++++++--------------------
 src/KOKKOS/npair_ssa_kokkos.h   | 25 +++---------------
 2 files changed, 23 insertions(+), 49 deletions(-)

diff --git a/src/KOKKOS/npair_ssa_kokkos.cpp b/src/KOKKOS/npair_ssa_kokkos.cpp
index 2b33256599..ba4bc9171c 100644
--- a/src/KOKKOS/npair_ssa_kokkos.cpp
+++ b/src/KOKKOS/npair_ssa_kokkos.cpp
@@ -333,7 +333,10 @@ fprintf(stdout, "tota%03d total %3d could use %6d inums, expected %6d inums. inu
 }
   // count how many ghosts might have neighbors, and increase the work plan storage
   for (int workPhase = 0; workPhase < ssa_gphaseCt; workPhase++) {
-     nl_size += k_gbincount.h_view(workPhase + 1);
+    int len = k_gbincount.h_view(workPhase + 1);
+    ssa_gitemLoc(workPhase,0) = nl_size; // record where workItem starts in ilist
+    ssa_gitemLen(workPhase,0) = len;
+    nl_size += len;
   }
   list->grow(nl_size); // Make special larger SSA neighbor list
 
@@ -415,22 +418,19 @@ fprintf(stdout, "tota%03d total %3d could use %6d inums, expected %6d inums. inu
     Kokkos::deep_copy(data.resize, data.h_resize);
     Kokkos::deep_copy(data.new_maxneighs, data.h_new_maxneighs);
 
-#ifdef NOTYET
-    NPairSSAKokkosBuildFunctor<DeviceType> f(data,atoms_per_bin*5*sizeof(X_FLOAT));
-    Kokkos::parallel_for(nall, f);
-#endif
     // loop over bins with local atoms, storing half of the neighbors
-#ifdef USE_LAMBDA_BUILD
     Kokkos::parallel_for(ssa_phaseCt, LAMMPS_LAMBDA (const int workPhase) {
       data.build_locals_onePhase(firstTry, comm->me, workPhase);
     });
-#else
-    NPairSSAKokkosBuildFunctor<DeviceType> f(data, firstTry, comm->me);
-    Kokkos::parallel_for(ssa_phaseCt, f);
-#endif
     data.neigh_list.inum = ssa_itemLoc(ssa_phaseCt-1,ssa_phaseLen(ssa_phaseCt-1)-1) +
       ssa_itemLen(ssa_phaseCt-1,ssa_phaseLen(ssa_phaseCt-1)-1);
-    data.build_ghosts();
+
+    // loop over AIR ghost atoms, storing their local neighbors
+    Kokkos::parallel_for(ssa_gphaseCt, LAMMPS_LAMBDA (const int workPhase) {
+      data.build_ghosts_onePhase(workPhase);
+    });
+    data.neigh_list.gnum = ssa_gitemLoc(ssa_gphaseCt-1,ssa_gphaseLen(ssa_gphaseCt-1)-1) +
+      ssa_gitemLen(ssa_gphaseCt-1,ssa_gphaseLen(ssa_gphaseCt-1)-1) - data.neigh_list.inum;
     firstTry = false;
 
     DeviceType::fence();
@@ -606,34 +606,27 @@ fprintf(stdout, "Phas%03d phase %3d used %6d inums, workItems = %3d, skipped = %
 
 
 template<class DeviceType>
-void NPairSSAKokkosExecute<DeviceType>::build_ghosts()
+void NPairSSAKokkosExecute<DeviceType>::build_ghosts_onePhase(int workPhase) const
 {
-  int n = 0;
+  const typename ArrayTypes<DeviceType>::t_int_1d_const_um stencil = d_stencil;
   int which = 0;
-  int inum = neigh_list.inum;
-  int gnum = 0;
 
-  // loop over AIR ghost atoms, storing their local neighbors
   // since these are ghosts, must check if stencil bin is out of bounds
-  for (int workPhase = 0; workPhase < ssa_gphaseCt; workPhase++) {
     int airnum = workPhase + 1;
     //FIXME for now, there is only 1 workItem for each ghost AIR
     int workItem;
     for (workItem = 0; workItem < 1; ++workItem) {
-      d_ssa_gitemLoc(workPhase, workItem) = inum + gnum; // record where workItem starts in ilist
+      int gNdx = d_ssa_gitemLoc(workPhase, workItem); // look up where this workItem starts in ilist
       for (int il = 0; il < c_gbincount(airnum); ++il) {
         const int i = c_gbins(airnum, il);
-        n = 0;
+        int n = 0;
 
-        const AtomNeighbors neighbors_i = neigh_list.get_neighbors(inum + gnum);
+        const AtomNeighbors neighbors_i = neigh_list.get_neighbors(gNdx);
         const X_FLOAT xtmp = x(i, 0);
         const X_FLOAT ytmp = x(i, 1);
         const X_FLOAT ztmp = x(i, 2);
         const int itype = type(i);
 
-        const typename ArrayTypes<DeviceType>::t_int_1d_const_um stencil
-          = d_stencil;
-
         int loc[3];
         const int ibin = coord2bin(x(i, 0), x(i, 1), x(i, 2), &(loc[0]));
 
@@ -686,8 +679,8 @@ void NPairSSAKokkosExecute<DeviceType>::build_ghosts()
         }
 
         if (n > 0) {
-          neigh_list.d_numneigh(inum + gnum) = n;
-          neigh_list.d_ilist(inum + (gnum++)) = i;
+          neigh_list.d_numneigh(gNdx) = n;
+          neigh_list.d_ilist(gNdx++) = i;
           if(n > neigh_list.maxneighs) {
             resize() = 1;
             if(n > new_maxneighs()) Kokkos::atomic_fetch_max(&new_maxneighs(),n);
@@ -695,12 +688,10 @@ void NPairSSAKokkosExecute<DeviceType>::build_ghosts()
         }
       }
       // record where workItem ends in ilist
-      d_ssa_gitemLen(workPhase,workItem) = inum + gnum - d_ssa_gitemLoc(workPhase,workItem);
+      d_ssa_gitemLen(workPhase,workItem) = gNdx - d_ssa_gitemLoc(workPhase,workItem);
       // if (d_ssa_gitemLen(workPhase,workItem) > 0) workItem++;
     }
     d_ssa_gphaseLen(workPhase) = workItem;
-  }
-  neigh_list.gnum = gnum;
 }
 
 }
diff --git a/src/KOKKOS/npair_ssa_kokkos.h b/src/KOKKOS/npair_ssa_kokkos.h
index 62c4135cc7..98046feba8 100644
--- a/src/KOKKOS/npair_ssa_kokkos.h
+++ b/src/KOKKOS/npair_ssa_kokkos.h
@@ -275,7 +275,7 @@ class NPairSSAKokkosExecute
     bboxlo[0] = _bboxlo[0]; bboxlo[1] = _bboxlo[1]; bboxlo[2] = _bboxlo[2];
     bboxhi[0] = _bboxhi[0]; bboxhi[1] = _bboxhi[1]; bboxhi[2] = _bboxhi[2];
 
-    resize = typename AT::t_int_scalar("NeighborKokkosFunctor::resize");
+    resize = typename AT::t_int_scalar("NPairSSAKokkosExecute::resize");
 #ifndef KOKKOS_USE_CUDA_UVM
     h_resize = Kokkos::create_mirror_view(resize);
 #else
@@ -283,7 +283,7 @@ class NPairSSAKokkosExecute
 #endif
     h_resize() = 1;
     new_maxneighs = typename AT::
-      t_int_scalar("NeighborKokkosFunctor::new_maxneighs");
+      t_int_scalar("NPairSSAKokkosExecute::new_maxneighs");
 #ifndef KOKKOS_USE_CUDA_UVM
     h_new_maxneighs = Kokkos::create_mirror_view(new_maxneighs);
 #else
@@ -297,7 +297,8 @@ class NPairSSAKokkosExecute
   KOKKOS_FUNCTION
   void build_locals_onePhase(const bool firstTry, int me, int workPhase) const;
 
-  void build_ghosts();
+  KOKKOS_FUNCTION
+  void build_ghosts_onePhase(int workPhase) const;
 
   KOKKOS_INLINE_FUNCTION
   int coord2bin(const X_FLOAT & x,const X_FLOAT & y,const X_FLOAT & z, int* i) const
@@ -351,24 +352,6 @@ class NPairSSAKokkosExecute
 
 };
 
-template<class DeviceType>
-struct NPairSSAKokkosBuildFunctor {
-  typedef DeviceType device_type;
-
-  const NPairSSAKokkosExecute<DeviceType> c;
-  const bool firstTry;
-  const int me;
-
-  NPairSSAKokkosBuildFunctor(const NPairSSAKokkosExecute<DeviceType> &_c,
-                             const bool _firstTry, const int _me):c(_c),
-                             firstTry(_firstTry), me(_me) {};
-
-  KOKKOS_INLINE_FUNCTION
-  void operator() (const int & i) const {
-    c.build_locals_onePhase(firstTry, me, i);
-  }
-};
-
 }
 
 #endif

From 035d0a80d7bc8375886c3c6989a85c1bda12de67 Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Mon, 10 Apr 2017 16:38:58 -0600
Subject: [PATCH 235/267] Reducing memory churn in pair_exp6_rx_kokkos

---
 src/KOKKOS/pair_exp6_rx_kokkos.cpp | 56 +++++++++++++++++++++---------
 src/KOKKOS/pair_exp6_rx_kokkos.h   |  4 +++
 2 files changed, 44 insertions(+), 16 deletions(-)

diff --git a/src/KOKKOS/pair_exp6_rx_kokkos.cpp b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
index 5c74cba8c7..312f1c6076 100644
--- a/src/KOKKOS/pair_exp6_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
@@ -187,22 +187,25 @@ void PairExp6rxKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
   {
      const int np_total = nlocal + atom->nghost;
 
-     PairExp6ParamData.epsilon1      = typename AT::t_float_1d("PairExp6ParamData.epsilon1"     ,np_total);
-     PairExp6ParamData.alpha1        = typename AT::t_float_1d("PairExp6ParamData.alpha1"       ,np_total);
-     PairExp6ParamData.rm1           = typename AT::t_float_1d("PairExp6ParamData.rm1"          ,np_total);
-     PairExp6ParamData.mixWtSite1    = typename AT::t_float_1d("PairExp6ParamData.mixWtSite1"   ,np_total);
-     PairExp6ParamData.epsilon2      = typename AT::t_float_1d("PairExp6ParamData.epsilon2"     ,np_total);
-     PairExp6ParamData.alpha2        = typename AT::t_float_1d("PairExp6ParamData.alpha2"       ,np_total);
-     PairExp6ParamData.rm2           = typename AT::t_float_1d("PairExp6ParamData.rm2"          ,np_total);
-     PairExp6ParamData.mixWtSite2    = typename AT::t_float_1d("PairExp6ParamData.mixWtSite2"   ,np_total);
-     PairExp6ParamData.epsilonOld1   = typename AT::t_float_1d("PairExp6ParamData.epsilonOld1"  ,np_total);
-     PairExp6ParamData.alphaOld1     = typename AT::t_float_1d("PairExp6ParamData.alphaOld1"    ,np_total);
-     PairExp6ParamData.rmOld1        = typename AT::t_float_1d("PairExp6ParamData.rmOld1"       ,np_total);
-     PairExp6ParamData.mixWtSite1old = typename AT::t_float_1d("PairExp6ParamData.mixWtSite1old",np_total);
-     PairExp6ParamData.epsilonOld2   = typename AT::t_float_1d("PairExp6ParamData.epsilonOld2"  ,np_total);
-     PairExp6ParamData.alphaOld2     = typename AT::t_float_1d("PairExp6ParamData.alphaOld2"    ,np_total);
-     PairExp6ParamData.rmOld2        = typename AT::t_float_1d("PairExp6ParamData.rmOld2"       ,np_total);
-     PairExp6ParamData.mixWtSite2old = typename AT::t_float_1d("PairExp6ParamData.mixWtSite2old",np_total);
+     if (np_total > PairExp6ParamData.epsilon1.dimension_0()) {
+       PairExp6ParamData.epsilon1      = typename AT::t_float_1d("PairExp6ParamData.epsilon1"     ,np_total);
+       PairExp6ParamData.alpha1        = typename AT::t_float_1d("PairExp6ParamData.alpha1"       ,np_total);
+       PairExp6ParamData.rm1           = typename AT::t_float_1d("PairExp6ParamData.rm1"          ,np_total);
+       PairExp6ParamData.mixWtSite1    = typename AT::t_float_1d("PairExp6ParamData.mixWtSite1"   ,np_total);
+       PairExp6ParamData.epsilon2      = typename AT::t_float_1d("PairExp6ParamData.epsilon2"     ,np_total);
+       PairExp6ParamData.alpha2        = typename AT::t_float_1d("PairExp6ParamData.alpha2"       ,np_total);
+       PairExp6ParamData.rm2           = typename AT::t_float_1d("PairExp6ParamData.rm2"          ,np_total);
+       PairExp6ParamData.mixWtSite2    = typename AT::t_float_1d("PairExp6ParamData.mixWtSite2"   ,np_total);
+       PairExp6ParamData.epsilonOld1   = typename AT::t_float_1d("PairExp6ParamData.epsilonOld1"  ,np_total);
+       PairExp6ParamData.alphaOld1     = typename AT::t_float_1d("PairExp6ParamData.alphaOld1"    ,np_total);
+       PairExp6ParamData.rmOld1        = typename AT::t_float_1d("PairExp6ParamData.rmOld1"       ,np_total);
+       PairExp6ParamData.mixWtSite1old = typename AT::t_float_1d("PairExp6ParamData.mixWtSite1old",np_total);
+       PairExp6ParamData.epsilonOld2   = typename AT::t_float_1d("PairExp6ParamData.epsilonOld2"  ,np_total);
+       PairExp6ParamData.alphaOld2     = typename AT::t_float_1d("PairExp6ParamData.alphaOld2"    ,np_total);
+       PairExp6ParamData.rmOld2        = typename AT::t_float_1d("PairExp6ParamData.rmOld2"       ,np_total);
+       PairExp6ParamData.mixWtSite2old = typename AT::t_float_1d("PairExp6ParamData.mixWtSite2old",np_total);
+     } else
+       Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairExp6rxZeroMixingWeights>(0,np_total),*this);
 
 #ifdef KOKKOS_HAVE_CUDA
      Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairExp6rxgetMixingWeights>(0,np_total),*this);
@@ -352,6 +355,27 @@ void PairExp6rxKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
   //printf("PairExp6rxKokkos::compute %f %f\n", getElapsedTime(t_start, t_stop), getElapsedTime(t_mix_start, t_mix_stop));
 }
 
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxZeroMixingWeights, const int &i) const {
+  PairExp6ParamData.epsilon1[i] = 0.0;
+  PairExp6ParamData.alpha1[i] = 0.0;
+  PairExp6ParamData.rm1[i] = 0.0;
+  PairExp6ParamData.mixWtSite1[i] = 0.0;
+  PairExp6ParamData.epsilon2[i] = 0.0;
+  PairExp6ParamData.alpha2[i] = 0.0;
+  PairExp6ParamData.rm2[i] = 0.0;
+  PairExp6ParamData.mixWtSite2[i] = 0.0;
+  PairExp6ParamData.epsilonOld1[i] = 0.0;
+  PairExp6ParamData.alphaOld1[i] = 0.0;
+  PairExp6ParamData.rmOld1[i] = 0.0;
+  PairExp6ParamData.mixWtSite1old[i] = 0.0;
+  PairExp6ParamData.epsilonOld2[i] = 0.0;
+  PairExp6ParamData.alphaOld2[i] = 0.0;
+  PairExp6ParamData.rmOld2[i] = 0.0;
+  PairExp6ParamData.mixWtSite2old[i] = 0.0;
+}
+
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
 void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxgetMixingWeights, const int &i) const {
diff --git a/src/KOKKOS/pair_exp6_rx_kokkos.h b/src/KOKKOS/pair_exp6_rx_kokkos.h
index 9f38732c32..5e9fb4e3e3 100644
--- a/src/KOKKOS/pair_exp6_rx_kokkos.h
+++ b/src/KOKKOS/pair_exp6_rx_kokkos.h
@@ -52,6 +52,7 @@ struct PairExp6ParamDataTypeKokkos
    {}
 };
 
+struct TagPairExp6rxZeroMixingWeights{};
 struct TagPairExp6rxgetMixingWeights{};
 
 template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
@@ -76,6 +77,9 @@ class PairExp6rxKokkos : public PairExp6rx {
   void coeff(int, char **);
   void init_style();
 
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPairExp6rxZeroMixingWeights, const int&) const;
+
   KOKKOS_INLINE_FUNCTION
   void operator()(TagPairExp6rxgetMixingWeights, const int&) const;
 

From 6c0b6918821ac738327b0aee398873546a929340 Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Tue, 11 Apr 2017 09:12:46 -0600
Subject: [PATCH 236/267] Removing more memory churn in pair_exp6_rx_kokkos

---
 src/KOKKOS/pair_exp6_rx_kokkos.cpp | 67 ++++++++++++++++++++----------
 src/KOKKOS/pair_exp6_rx_kokkos.h   | 24 +++++++++++
 2 files changed, 69 insertions(+), 22 deletions(-)

diff --git a/src/KOKKOS/pair_exp6_rx_kokkos.cpp b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
index 312f1c6076..51cf1a72e7 100644
--- a/src/KOKKOS/pair_exp6_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
@@ -204,6 +204,29 @@ void PairExp6rxKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
        PairExp6ParamData.alphaOld2     = typename AT::t_float_1d("PairExp6ParamData.alphaOld2"    ,np_total);
        PairExp6ParamData.rmOld2        = typename AT::t_float_1d("PairExp6ParamData.rmOld2"       ,np_total);
        PairExp6ParamData.mixWtSite2old = typename AT::t_float_1d("PairExp6ParamData.mixWtSite2old",np_total);
+
+       PairExp6ParamDataVect.epsilon          = typename AT::t_float_1d("PairExp6ParamDataVect.epsilon"         ,np_total);;
+       PairExp6ParamDataVect.rm3              = typename AT::t_float_1d("PairExp6ParamDataVect.rm3"             ,np_total);;
+       PairExp6ParamDataVect.alpha            = typename AT::t_float_1d("PairExp6ParamDataVect.alpha"           ,np_total);;
+       PairExp6ParamDataVect.xMolei           = typename AT::t_float_1d("PairExp6ParamDataVect.xMolei"          ,np_total);;
+       PairExp6ParamDataVect.epsilon_old      = typename AT::t_float_1d("PairExp6ParamDataVect.epsilon_old"     ,np_total);;
+       PairExp6ParamDataVect.rm3_old          = typename AT::t_float_1d("PairExp6ParamDataVect.rm3_old"         ,np_total);;
+       PairExp6ParamDataVect.alpha_old        = typename AT::t_float_1d("PairExp6ParamDataVect.alpha_old"       ,np_total);;
+       PairExp6ParamDataVect.xMolei_old       = typename AT::t_float_1d("PairExp6ParamDataVect.xMolei_old"      ,np_total);;
+       PairExp6ParamDataVect.fractionOFA      = typename AT::t_float_1d("PairExp6ParamDataVect.fractionOFA"     ,np_total);;
+       PairExp6ParamDataVect.fraction1        = typename AT::t_float_1d("PairExp6ParamDataVect.fraction1"       ,np_total);;
+       PairExp6ParamDataVect.fraction2        = typename AT::t_float_1d("PairExp6ParamDataVect.fraction2"       ,np_total);;
+       PairExp6ParamDataVect.nMoleculesOFA    = typename AT::t_float_1d("PairExp6ParamDataVect.nMoleculesOFA"   ,np_total);;
+       PairExp6ParamDataVect.nMolecules1      = typename AT::t_float_1d("PairExp6ParamDataVect.nMolecules1"     ,np_total);;
+       PairExp6ParamDataVect.nMolecules2      = typename AT::t_float_1d("PairExp6ParamDataVect.nMolecules2"     ,np_total);;
+       PairExp6ParamDataVect.nTotal           = typename AT::t_float_1d("PairExp6ParamDataVect.nTotal"          ,np_total);;
+       PairExp6ParamDataVect.fractionOFAold   = typename AT::t_float_1d("PairExp6ParamDataVect.fractionOFAold"  ,np_total);;
+       PairExp6ParamDataVect.fractionOld1     = typename AT::t_float_1d("PairExp6ParamDataVect.fractionOld1"    ,np_total);;
+       PairExp6ParamDataVect.fractionOld2     = typename AT::t_float_1d("PairExp6ParamDataVect.fractionOld2"    ,np_total);;
+       PairExp6ParamDataVect.nMoleculesOFAold = typename AT::t_float_1d("PairExp6ParamDataVect.nMoleculesOFAold",np_total);;
+       PairExp6ParamDataVect.nMoleculesOld1   = typename AT::t_float_1d("PairExp6ParamDataVect.nMoleculesOld1"  ,np_total);;
+       PairExp6ParamDataVect.nMoleculesOld2   = typename AT::t_float_1d("PairExp6ParamDataVect.nMoleculesOld2"  ,np_total);;
+       PairExp6ParamDataVect.nTotalold        = typename AT::t_float_1d("PairExp6ParamDataVect.nTotalold"       ,np_total);;
      } else
        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairExp6rxZeroMixingWeights>(0,np_total),*this);
 
@@ -2094,31 +2117,31 @@ template<class DeviceType>
 void PairExp6rxKokkos<DeviceType>::getMixingWeightsVect(const int np_total, int errorFlag, 
                           ArrayT &epsilon1, ArrayT &alpha1, ArrayT &rm1,  ArrayT &mixWtSite1, ArrayT &epsilon2, ArrayT &alpha2, ArrayT &rm2, ArrayT &mixWtSite2, ArrayT &epsilon1_old, ArrayT &alpha1_old, ArrayT &rm1_old,  ArrayT &mixWtSite1old, ArrayT &epsilon2_old, ArrayT &alpha2_old, ArrayT &rm2_old, ArrayT &mixWtSite2old) const
 {
-  ArrayT epsilon("PairExp6ParamData.epsilon",  np_total);
-  ArrayT rm3("PairExp6ParamData.rm3",  np_total);
-  ArrayT alpha("PairExp6ParamData.alpha",  np_total);
-  ArrayT xMolei("PairExp6ParamData.xMolei",  np_total);
+  ArrayT epsilon          = PairExp6ParamDataVect.epsilon         ;
+  ArrayT rm3              = PairExp6ParamDataVect.rm3             ;
+  ArrayT alpha            = PairExp6ParamDataVect.alpha           ;
+  ArrayT xMolei           = PairExp6ParamDataVect.xMolei          ;
 
-  ArrayT epsilon_old("PairExp6ParamData.epsilon_old",  np_total);
-  ArrayT rm3_old("PairExp6ParamData.rm3_old",  np_total);
-  ArrayT alpha_old("PairExp6ParamData.alpha_old",  np_total);
-  ArrayT xMolei_old("PairExp6ParamData.xMolei_old",  np_total);
+  ArrayT epsilon_old      = PairExp6ParamDataVect.epsilon_old     ;
+  ArrayT rm3_old          = PairExp6ParamDataVect.rm3_old         ;
+  ArrayT alpha_old        = PairExp6ParamDataVect.alpha_old       ;
+  ArrayT xMolei_old       = PairExp6ParamDataVect.xMolei_old      ;
 
-  ArrayT fractionOFA("PairExp6ParamData.fractionOFA",  np_total);
-  ArrayT fraction1("PairExp6ParamData.fraction1",  np_total);
-  ArrayT fraction2("PairExp6ParamData.fraction2",  np_total);
-  ArrayT nMoleculesOFA("PairExp6ParamData.nMoleculesOFA",  np_total);
-  ArrayT nMolecules1("PairExp6ParamData.nMolecules1",  np_total);
-  ArrayT nMolecules2("PairExp6ParamData.nMolecules2",  np_total);
-  ArrayT nTotal("PairExp6ParamData.nTotal",  np_total);
+  ArrayT fractionOFA      = PairExp6ParamDataVect.fractionOFA     ;
+  ArrayT fraction1        = PairExp6ParamDataVect.fraction1       ;
+  ArrayT fraction2        = PairExp6ParamDataVect.fraction2       ;
+  ArrayT nMoleculesOFA    = PairExp6ParamDataVect.nMoleculesOFA   ;
+  ArrayT nMolecules1      = PairExp6ParamDataVect.nMolecules1     ;
+  ArrayT nMolecules2      = PairExp6ParamDataVect.nMolecules2     ;
+  ArrayT nTotal           = PairExp6ParamDataVect.nTotal          ;
 
-  ArrayT fractionOFAold("PairExp6ParamData.fractionOFAold",  np_total);
-  ArrayT fractionOld1("PairExp6ParamData.fractionOld1",  np_total);
-  ArrayT fractionOld2("PairExp6ParamData.fractionOld2",  np_total);
-  ArrayT nMoleculesOFAold("PairExp6ParamData.nMoleculesOFAold",  np_total);
-  ArrayT nMoleculesOld1("PairExp6ParamData.nMoleculesOld1",  np_total);
-  ArrayT nMoleculesOld2("PairExp6ParamData.nMoleculesOld2",  np_total);
-  ArrayT nTotalold("PairExp6ParamData.nTotalold",  np_total);
+  ArrayT fractionOFAold   = PairExp6ParamDataVect.fractionOFAold  ;
+  ArrayT fractionOld1     = PairExp6ParamDataVect.fractionOld1    ;
+  ArrayT fractionOld2     = PairExp6ParamDataVect.fractionOld2    ;
+  ArrayT nMoleculesOFAold = PairExp6ParamDataVect.nMoleculesOFAold;
+  ArrayT nMoleculesOld1   = PairExp6ParamDataVect.nMoleculesOld1  ;
+  ArrayT nMoleculesOld2   = PairExp6ParamDataVect.nMoleculesOld2  ;
+  ArrayT nTotalold        = PairExp6ParamDataVect.nTotalold       ;
 
   int errorFlag1 = 0, errorFlag2 = 0;
 
diff --git a/src/KOKKOS/pair_exp6_rx_kokkos.h b/src/KOKKOS/pair_exp6_rx_kokkos.h
index 5e9fb4e3e3..09283662a2 100644
--- a/src/KOKKOS/pair_exp6_rx_kokkos.h
+++ b/src/KOKKOS/pair_exp6_rx_kokkos.h
@@ -52,6 +52,29 @@ struct PairExp6ParamDataTypeKokkos
    {}
 };
 
+template<class DeviceType>
+struct PairExp6ParamDataTypeKokkosVect
+{
+  typedef ArrayTypes<DeviceType> AT;
+
+   typename AT::t_float_1d epsilon, rm3, alpha, xMolei, epsilon_old, rm3_old,
+                           alpha_old, xMolei_old, fractionOFA, fraction1,
+                           fraction2, nMoleculesOFA, nMolecules1, nMolecules2,
+                           nTotal, fractionOFAold, fractionOld1, fractionOld2,
+                           nMoleculesOFAold, nMoleculesOld1, nMoleculesOld2,
+                           nTotalold;
+
+   // Default constructor -- nullify everything.
+   PairExp6ParamDataTypeKokkosVect<DeviceType>(void)
+      : epsilon(NULL), rm3(NULL), alpha(NULL), xMolei(NULL), epsilon_old(NULL), rm3_old(NULL),
+        alpha_old(NULL), xMolei_old(NULL), fractionOFA(NULL), fraction1(NULL),
+        fraction2(NULL), nMoleculesOFA(NULL), nMolecules1(NULL), nMolecules2(NULL),
+        nTotal(NULL), fractionOFAold(NULL), fractionOld1(NULL), fractionOld2(NULL),
+        nMoleculesOFAold(NULL), nMoleculesOld1(NULL), nMoleculesOld2(NULL),
+        nTotalold(NULL)
+   {}
+};
+
 struct TagPairExp6rxZeroMixingWeights{};
 struct TagPairExp6rxgetMixingWeights{};
 
@@ -148,6 +171,7 @@ class PairExp6rxKokkos : public PairExp6rx {
   typename AT::t_int_1d_randomread d_numneigh;
 
   PairExp6ParamDataTypeKokkos<DeviceType> PairExp6ParamData;
+  PairExp6ParamDataTypeKokkosVect<DeviceType> PairExp6ParamDataVect;
 
   void allocate();
   DAT::tdual_int_1d k_mol2param;               // mapping from molecule to parameters

From ca4619e22791294cb7e63a0043869504350772ed Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Tue, 11 Apr 2017 09:14:21 -0600
Subject: [PATCH 237/267] Fix format issue in pair_exp6_rx_kokkos

---
 src/KOKKOS/pair_exp6_rx_kokkos.cpp | 44 +++++++++++++++---------------
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/src/KOKKOS/pair_exp6_rx_kokkos.cpp b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
index 51cf1a72e7..5b84f09fd6 100644
--- a/src/KOKKOS/pair_exp6_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
@@ -205,28 +205,28 @@ void PairExp6rxKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
        PairExp6ParamData.rmOld2        = typename AT::t_float_1d("PairExp6ParamData.rmOld2"       ,np_total);
        PairExp6ParamData.mixWtSite2old = typename AT::t_float_1d("PairExp6ParamData.mixWtSite2old",np_total);
 
-       PairExp6ParamDataVect.epsilon          = typename AT::t_float_1d("PairExp6ParamDataVect.epsilon"         ,np_total);;
-       PairExp6ParamDataVect.rm3              = typename AT::t_float_1d("PairExp6ParamDataVect.rm3"             ,np_total);;
-       PairExp6ParamDataVect.alpha            = typename AT::t_float_1d("PairExp6ParamDataVect.alpha"           ,np_total);;
-       PairExp6ParamDataVect.xMolei           = typename AT::t_float_1d("PairExp6ParamDataVect.xMolei"          ,np_total);;
-       PairExp6ParamDataVect.epsilon_old      = typename AT::t_float_1d("PairExp6ParamDataVect.epsilon_old"     ,np_total);;
-       PairExp6ParamDataVect.rm3_old          = typename AT::t_float_1d("PairExp6ParamDataVect.rm3_old"         ,np_total);;
-       PairExp6ParamDataVect.alpha_old        = typename AT::t_float_1d("PairExp6ParamDataVect.alpha_old"       ,np_total);;
-       PairExp6ParamDataVect.xMolei_old       = typename AT::t_float_1d("PairExp6ParamDataVect.xMolei_old"      ,np_total);;
-       PairExp6ParamDataVect.fractionOFA      = typename AT::t_float_1d("PairExp6ParamDataVect.fractionOFA"     ,np_total);;
-       PairExp6ParamDataVect.fraction1        = typename AT::t_float_1d("PairExp6ParamDataVect.fraction1"       ,np_total);;
-       PairExp6ParamDataVect.fraction2        = typename AT::t_float_1d("PairExp6ParamDataVect.fraction2"       ,np_total);;
-       PairExp6ParamDataVect.nMoleculesOFA    = typename AT::t_float_1d("PairExp6ParamDataVect.nMoleculesOFA"   ,np_total);;
-       PairExp6ParamDataVect.nMolecules1      = typename AT::t_float_1d("PairExp6ParamDataVect.nMolecules1"     ,np_total);;
-       PairExp6ParamDataVect.nMolecules2      = typename AT::t_float_1d("PairExp6ParamDataVect.nMolecules2"     ,np_total);;
-       PairExp6ParamDataVect.nTotal           = typename AT::t_float_1d("PairExp6ParamDataVect.nTotal"          ,np_total);;
-       PairExp6ParamDataVect.fractionOFAold   = typename AT::t_float_1d("PairExp6ParamDataVect.fractionOFAold"  ,np_total);;
-       PairExp6ParamDataVect.fractionOld1     = typename AT::t_float_1d("PairExp6ParamDataVect.fractionOld1"    ,np_total);;
-       PairExp6ParamDataVect.fractionOld2     = typename AT::t_float_1d("PairExp6ParamDataVect.fractionOld2"    ,np_total);;
-       PairExp6ParamDataVect.nMoleculesOFAold = typename AT::t_float_1d("PairExp6ParamDataVect.nMoleculesOFAold",np_total);;
-       PairExp6ParamDataVect.nMoleculesOld1   = typename AT::t_float_1d("PairExp6ParamDataVect.nMoleculesOld1"  ,np_total);;
-       PairExp6ParamDataVect.nMoleculesOld2   = typename AT::t_float_1d("PairExp6ParamDataVect.nMoleculesOld2"  ,np_total);;
-       PairExp6ParamDataVect.nTotalold        = typename AT::t_float_1d("PairExp6ParamDataVect.nTotalold"       ,np_total);;
+       PairExp6ParamDataVect.epsilon          = typename AT::t_float_1d("PairExp6ParamDataVect.epsilon"         ,np_total);
+       PairExp6ParamDataVect.rm3              = typename AT::t_float_1d("PairExp6ParamDataVect.rm3"             ,np_total);
+       PairExp6ParamDataVect.alpha            = typename AT::t_float_1d("PairExp6ParamDataVect.alpha"           ,np_total);
+       PairExp6ParamDataVect.xMolei           = typename AT::t_float_1d("PairExp6ParamDataVect.xMolei"          ,np_total);
+       PairExp6ParamDataVect.epsilon_old      = typename AT::t_float_1d("PairExp6ParamDataVect.epsilon_old"     ,np_total);
+       PairExp6ParamDataVect.rm3_old          = typename AT::t_float_1d("PairExp6ParamDataVect.rm3_old"         ,np_total);
+       PairExp6ParamDataVect.alpha_old        = typename AT::t_float_1d("PairExp6ParamDataVect.alpha_old"       ,np_total);
+       PairExp6ParamDataVect.xMolei_old       = typename AT::t_float_1d("PairExp6ParamDataVect.xMolei_old"      ,np_total);
+       PairExp6ParamDataVect.fractionOFA      = typename AT::t_float_1d("PairExp6ParamDataVect.fractionOFA"     ,np_total);
+       PairExp6ParamDataVect.fraction1        = typename AT::t_float_1d("PairExp6ParamDataVect.fraction1"       ,np_total);
+       PairExp6ParamDataVect.fraction2        = typename AT::t_float_1d("PairExp6ParamDataVect.fraction2"       ,np_total);
+       PairExp6ParamDataVect.nMoleculesOFA    = typename AT::t_float_1d("PairExp6ParamDataVect.nMoleculesOFA"   ,np_total);
+       PairExp6ParamDataVect.nMolecules1      = typename AT::t_float_1d("PairExp6ParamDataVect.nMolecules1"     ,np_total);
+       PairExp6ParamDataVect.nMolecules2      = typename AT::t_float_1d("PairExp6ParamDataVect.nMolecules2"     ,np_total);
+       PairExp6ParamDataVect.nTotal           = typename AT::t_float_1d("PairExp6ParamDataVect.nTotal"          ,np_total);
+       PairExp6ParamDataVect.fractionOFAold   = typename AT::t_float_1d("PairExp6ParamDataVect.fractionOFAold"  ,np_total);
+       PairExp6ParamDataVect.fractionOld1     = typename AT::t_float_1d("PairExp6ParamDataVect.fractionOld1"    ,np_total);
+       PairExp6ParamDataVect.fractionOld2     = typename AT::t_float_1d("PairExp6ParamDataVect.fractionOld2"    ,np_total);
+       PairExp6ParamDataVect.nMoleculesOFAold = typename AT::t_float_1d("PairExp6ParamDataVect.nMoleculesOFAold",np_total);
+       PairExp6ParamDataVect.nMoleculesOld1   = typename AT::t_float_1d("PairExp6ParamDataVect.nMoleculesOld1"  ,np_total);
+       PairExp6ParamDataVect.nMoleculesOld2   = typename AT::t_float_1d("PairExp6ParamDataVect.nMoleculesOld2"  ,np_total);
+       PairExp6ParamDataVect.nTotalold        = typename AT::t_float_1d("PairExp6ParamDataVect.nTotalold"       ,np_total);
      } else
        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairExp6rxZeroMixingWeights>(0,np_total),*this);
 

From 36cbe439780dc8b44ecbb25036327853033aab68 Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Tue, 6 Jun 2017 10:51:26 -0600
Subject: [PATCH 238/267] Fixing some CUDA runtime issues in npair_ssa_kokkos

---
 src/KOKKOS/npair_ssa_kokkos.cpp | 100 +++++++++++++++++++++++---------
 1 file changed, 74 insertions(+), 26 deletions(-)

diff --git a/src/KOKKOS/npair_ssa_kokkos.cpp b/src/KOKKOS/npair_ssa_kokkos.cpp
index ba4bc9171c..0c3a5985ff 100644
--- a/src/KOKKOS/npair_ssa_kokkos.cpp
+++ b/src/KOKKOS/npair_ssa_kokkos.cpp
@@ -149,17 +149,21 @@ void NPairSSAKokkos<DeviceType>::copy_stencil_info()
     k_ssa_phaseOff = DAT::tdual_int_1d_3("NPairSSAKokkos:ssa_phaseOff",ssa_phaseCt);
     ssa_phaseOff = k_ssa_phaseOff.view<DeviceType>();
   }
+  auto h_ssa_phaseOff = k_ssa_phaseOff.h_view;
+  k_ssa_phaseOff.sync<LMPHostType>();
   int workPhase = 0;
   for (int zoff = sz1 - 1; zoff >= 0; --zoff) {
     for (int yoff = sy1 - 1; yoff >= 0; --yoff) {
       for (int xoff = sx1 - 1; xoff >= 0; --xoff) {
-        ssa_phaseOff(workPhase, 0) = xoff;
-        ssa_phaseOff(workPhase, 1) = yoff;
-        ssa_phaseOff(workPhase, 2) = zoff;
+        h_ssa_phaseOff(workPhase, 0) = xoff;
+        h_ssa_phaseOff(workPhase, 1) = yoff;
+        h_ssa_phaseOff(workPhase, 2) = zoff;
         workPhase++;
       }
     }
   }
+  k_ssa_phaseOff.modify<LMPHostType>();
+  k_ssa_phaseOff.sync<DeviceType>();
 
 }
 
@@ -250,8 +254,25 @@ void NPairSSAKokkos<DeviceType>::build(NeighList *list_)
     ssa_itemLen = k_ssa_itemLen.view<DeviceType>();
   }
 
+  k_ssa_itemLoc.sync<LMPHostType>();
+  k_ssa_itemLen.sync<LMPHostType>();
+  k_ssa_gitemLoc.sync<LMPHostType>();
+  k_ssa_gitemLen.sync<LMPHostType>();
+  k_ssa_phaseOff.sync<LMPHostType>();
+  k_ssa_phaseLen.sync<LMPHostType>();
+  k_ssa_gphaseLen.sync<LMPHostType>();
+  auto h_ssa_itemLoc = k_ssa_itemLoc.h_view;
+  auto h_ssa_itemLen = k_ssa_itemLen.h_view;
+  auto h_ssa_gitemLoc = k_ssa_gitemLoc.h_view;
+  auto h_ssa_gitemLen = k_ssa_gitemLen.h_view;
+  auto h_ssa_phaseOff = k_ssa_phaseOff.h_view;
+  auto h_ssa_phaseLen = k_ssa_phaseLen.h_view;
+  auto h_ssa_gphaseLen = k_ssa_gphaseLen.h_view;
+
 { // Preflight the neighbor list workplan
   const typename ArrayTypes<DeviceType>::t_int_1d_const c_bincount = k_bincount.view<DeviceType>();
+  k_bincount.sync<LMPHostType>();
+  auto h_bincount = k_bincount.h_view;
   const typename ArrayTypes<DeviceType>::t_int_2d_const c_bins     = k_bins.view<DeviceType>();
   const typename ArrayTypes<DeviceType>::t_int_1d_const_um c_stencil = k_stencil.view<DeviceType>();
   const typename ArrayTypes<DeviceType>::t_int_1d_const c_nstencil_ssa = k_nstencil_ssa.view<DeviceType>();
@@ -259,9 +280,9 @@ void NPairSSAKokkos<DeviceType>::build(NeighList *list_)
 
   // loop over bins with local atoms, counting half of the neighbors
   for (int workPhase = 0; workPhase < ssa_phaseCt; ++workPhase) {
-    int zoff = ssa_phaseOff(workPhase, 2);
-    int yoff = ssa_phaseOff(workPhase, 1);
-    int xoff = ssa_phaseOff(workPhase, 0);
+    int zoff = h_ssa_phaseOff(workPhase, 2);
+    int yoff = h_ssa_phaseOff(workPhase, 1);
+    int xoff = h_ssa_phaseOff(workPhase, 0);
     int workItem = 0;
   for (int zbin = lbinzlo + zoff; zbin < lbinzhi; zbin += sz1) {
   for (int ybin = lbinylo + yoff - sy1 + 1; ybin < lbinyhi; ybin += sy1) {
@@ -276,14 +297,14 @@ void NPairSSAKokkos<DeviceType>::build(NeighList *list_)
       if ((s_xbin < lbinxlo) || (s_xbin >= lbinxhi)) continue;
 
       const int ibin = zbin*mbiny*mbinx + s_ybin*mbinx + s_xbin;
-      const int ibinCt = c_bincount(ibin);
+      const int ibinCt = h_bincount(ibin);
       if (ibinCt > 0) {
         int base_n = 0;
         bool include_same = false;
         // count all local atoms in the current stencil "subphase" as potential neighbors
         for (int k = c_nstencil_ssa(subphase); k < c_nstencil_ssa(subphase+1); k++) {
           const int jbin = ibin+c_stencil(k);
-          if (jbin != ibin) base_n += c_bincount(jbin);
+          if (jbin != ibin) base_n += h_bincount(jbin);
           else include_same = true;
         }
         // Calculate how many ibin particles would have had some neighbors
@@ -291,10 +312,10 @@ void NPairSSAKokkos<DeviceType>::build(NeighList *list_)
         else if (include_same) inum += ibinCt - 1;
       }
     }
-    ssa_itemLoc(workPhase,workItem) = inum_start; // record where workItem starts in ilist
-    ssa_itemLen(workPhase,workItem) = inum - inum_start; // record workItem length
+    h_ssa_itemLoc(workPhase,workItem) = inum_start; // record where workItem starts in ilist
+    h_ssa_itemLen(workPhase,workItem) = inum - inum_start; // record workItem length
 #ifdef DEBUG_SSA_BUILD_LOCALS
-if (ssa_itemLen(workPhase,workItem) < 0) fprintf(stdout, "undr%03d phase (%3d,%3d) inum %d - inum_start %d UNDERFLOW\n"
+if (h_ssa_itemLen(workPhase,workItem) < 0) fprintf(stdout, "undr%03d phase (%3d,%3d) inum %d - inum_start %d UNDERFLOW\n"
   ,comm->me
   ,workPhase
   ,workItem
@@ -311,14 +332,14 @@ if (ssa_itemLen(workPhase,workItem) < 0) fprintf(stdout, "undr%03d phase (%3d,%3
 fprintf(stdout, "phas%03d phase %3d could use %6d inums, expected %6d inums. maxworkItems = %3d, inums/workItems = %g\n"
   ,comm->me
   ,workPhase
-  ,inum - ssa_itemLoc(workPhase, 0)
+  ,inum - h_ssa_itemLoc(workPhase, 0)
   ,(nlocal*4 + ssa_phaseCt - 1) / ssa_phaseCt
   ,workItem
-  ,(inum - ssa_itemLoc(workPhase, 0)) / (double) workItem
+  ,(inum - h_ssa_itemLoc(workPhase, 0)) / (double) workItem
 );
 #endif
     // record where workPhase ends
-    ssa_phaseLen(workPhase) = workItem;
+    h_ssa_phaseLen(workPhase) = workItem;
   }
 #ifdef DEBUG_SSA_BUILD_LOCALS
 fprintf(stdout, "tota%03d total %3d could use %6d inums, expected %6d inums. inums/phase = %g\n"
@@ -331,15 +352,30 @@ fprintf(stdout, "tota%03d total %3d could use %6d inums, expected %6d inums. inu
 #endif
   nl_size = inum; // record how much space is needed for the local work plan
 }
+
   // count how many ghosts might have neighbors, and increase the work plan storage
   for (int workPhase = 0; workPhase < ssa_gphaseCt; workPhase++) {
     int len = k_gbincount.h_view(workPhase + 1);
-    ssa_gitemLoc(workPhase,0) = nl_size; // record where workItem starts in ilist
-    ssa_gitemLen(workPhase,0) = len;
+    h_ssa_gitemLoc(workPhase,0) = nl_size; // record where workItem starts in ilist
+    h_ssa_gitemLen(workPhase,0) = len;
     nl_size += len;
   }
   list->grow(nl_size); // Make special larger SSA neighbor list
 
+  k_ssa_itemLoc.modify<LMPHostType>();
+  k_ssa_itemLen.modify<LMPHostType>();
+  k_ssa_gitemLoc.modify<LMPHostType>();
+  k_ssa_gitemLen.modify<LMPHostType>();
+  k_ssa_phaseOff.modify<LMPHostType>();
+  k_ssa_phaseLen.modify<LMPHostType>();
+  k_ssa_itemLoc.sync<DeviceType>();
+  k_ssa_itemLen.sync<DeviceType>();
+  k_ssa_gitemLen.sync<DeviceType>();
+  k_ssa_gitemLoc.sync<DeviceType>();
+  k_ssa_phaseOff.sync<DeviceType>();
+  k_ssa_phaseLen.sync<DeviceType>();
+  k_ssa_gphaseLen.sync<DeviceType>();
+
   NPairSSAKokkosExecute<DeviceType>
     data(*list,
          k_cutneighsq.view<DeviceType>(),
@@ -422,15 +458,27 @@ fprintf(stdout, "tota%03d total %3d could use %6d inums, expected %6d inums. inu
     Kokkos::parallel_for(ssa_phaseCt, LAMMPS_LAMBDA (const int workPhase) {
       data.build_locals_onePhase(firstTry, comm->me, workPhase);
     });
-    data.neigh_list.inum = ssa_itemLoc(ssa_phaseCt-1,ssa_phaseLen(ssa_phaseCt-1)-1) +
-      ssa_itemLen(ssa_phaseCt-1,ssa_phaseLen(ssa_phaseCt-1)-1);
+    k_ssa_itemLoc.modify<DeviceType>();
+    k_ssa_itemLen.modify<DeviceType>();
+    k_ssa_phaseLen.modify<DeviceType>();
+    k_ssa_itemLoc.sync<LMPHostType>();
+    k_ssa_itemLen.sync<LMPHostType>();
+    k_ssa_phaseLen.sync<LMPHostType>();
+    data.neigh_list.inum = h_ssa_itemLoc(ssa_phaseCt-1,h_ssa_phaseLen(ssa_phaseCt-1)-1) +
+      h_ssa_itemLen(ssa_phaseCt-1,h_ssa_phaseLen(ssa_phaseCt-1)-1);
 
     // loop over AIR ghost atoms, storing their local neighbors
     Kokkos::parallel_for(ssa_gphaseCt, LAMMPS_LAMBDA (const int workPhase) {
       data.build_ghosts_onePhase(workPhase);
     });
-    data.neigh_list.gnum = ssa_gitemLoc(ssa_gphaseCt-1,ssa_gphaseLen(ssa_gphaseCt-1)-1) +
-      ssa_gitemLen(ssa_gphaseCt-1,ssa_gphaseLen(ssa_gphaseCt-1)-1) - data.neigh_list.inum;
+    k_ssa_gitemLoc.modify<DeviceType>();
+    k_ssa_gitemLen.modify<DeviceType>();
+    k_ssa_gphaseLen.modify<DeviceType>();
+    k_ssa_gitemLoc.sync<LMPHostType>();
+    k_ssa_gitemLen.sync<LMPHostType>();
+    k_ssa_gphaseLen.sync<LMPHostType>();
+    data.neigh_list.gnum = h_ssa_gitemLoc(ssa_gphaseCt-1,h_ssa_gphaseLen(ssa_gphaseCt-1)-1) +
+      h_ssa_gitemLen(ssa_gphaseCt-1,h_ssa_gphaseLen(ssa_gphaseCt-1)-1) - data.neigh_list.inum;
     firstTry = false;
 
     DeviceType::fence();
@@ -445,12 +493,12 @@ fprintf(stdout, "tota%03d total %3d could use %6d inums, expected %6d inums. inu
     }
   }
 
-  k_ssa_phaseLen.modify<DeviceType>();
-  k_ssa_itemLoc.modify<DeviceType>();
-  k_ssa_itemLen.modify<DeviceType>();
-  k_ssa_gphaseLen.modify<DeviceType>();
-  k_ssa_gitemLoc.modify<DeviceType>();
-  k_ssa_gitemLen.modify<DeviceType>();
+  //k_ssa_phaseLen.modify<DeviceType>();
+  //k_ssa_itemLoc.modify<DeviceType>();
+  //k_ssa_itemLen.modify<DeviceType>();
+  //k_ssa_gphaseLen.modify<DeviceType>();
+  //k_ssa_gitemLoc.modify<DeviceType>();
+  //k_ssa_gitemLen.modify<DeviceType>();
 
   list->inum = data.neigh_list.inum; //FIXME once the above is in a parallel_for
   list->gnum = data.neigh_list.gnum; // it will need a deep_copy or something

From efe60bf991c69d0cdd0e1f960f060c53abb62457 Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Tue, 6 Jun 2017 13:10:04 -0600
Subject: [PATCH 239/267] Fixing more CUDA runtime issues

---
 src/KOKKOS/nbin_ssa_kokkos.cpp  | 2 ++
 src/KOKKOS/npair_ssa_kokkos.cpp | 8 ++++++--
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/KOKKOS/nbin_ssa_kokkos.cpp b/src/KOKKOS/nbin_ssa_kokkos.cpp
index 6c9e3a3446..f11d7e18ef 100644
--- a/src/KOKKOS/nbin_ssa_kokkos.cpp
+++ b/src/KOKKOS/nbin_ssa_kokkos.cpp
@@ -212,6 +212,8 @@ void NBinSSAKokkos<DeviceType>::bin_atoms()
     });
     DeviceType::fence();
   }
+  k_bins.modify<DeviceType>();
+  k_bincount.modify<DeviceType>();
   c_bins = bins; // bins won't change until the next bin_atoms
 
 //now dispose of the k_binID array
diff --git a/src/KOKKOS/npair_ssa_kokkos.cpp b/src/KOKKOS/npair_ssa_kokkos.cpp
index 0c3a5985ff..368fb1a6ed 100644
--- a/src/KOKKOS/npair_ssa_kokkos.cpp
+++ b/src/KOKKOS/npair_ssa_kokkos.cpp
@@ -275,7 +275,11 @@ void NPairSSAKokkos<DeviceType>::build(NeighList *list_)
   auto h_bincount = k_bincount.h_view;
   const typename ArrayTypes<DeviceType>::t_int_2d_const c_bins     = k_bins.view<DeviceType>();
   const typename ArrayTypes<DeviceType>::t_int_1d_const_um c_stencil = k_stencil.view<DeviceType>();
+  k_stencil.sync<LMPHostType>();
+  auto h_stencil = k_stencil.h_view;
   const typename ArrayTypes<DeviceType>::t_int_1d_const c_nstencil_ssa = k_nstencil_ssa.view<DeviceType>();
+  k_nstencil_ssa.sync<LMPHostType>();
+  auto h_nstencil_ssa = k_nstencil_ssa.h_view;
   int inum = 0;
 
   // loop over bins with local atoms, counting half of the neighbors
@@ -302,8 +306,8 @@ void NPairSSAKokkos<DeviceType>::build(NeighList *list_)
         int base_n = 0;
         bool include_same = false;
         // count all local atoms in the current stencil "subphase" as potential neighbors
-        for (int k = c_nstencil_ssa(subphase); k < c_nstencil_ssa(subphase+1); k++) {
-          const int jbin = ibin+c_stencil(k);
+        for (int k = h_nstencil_ssa(subphase); k < h_nstencil_ssa(subphase+1); k++) {
+          const int jbin = ibin+h_stencil(k);
           if (jbin != ibin) base_n += h_bincount(jbin);
           else include_same = true;
         }

From 520ab26bd966b5fda778b5e30f4cbdeb95d8e842 Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Wed, 7 Jun 2017 15:07:53 -0600
Subject: [PATCH 240/267] Fixing more CUDA runtime issues

---
 src/KOKKOS/nbin_ssa_kokkos.cpp  | 3 +++
 src/KOKKOS/npair_ssa_kokkos.cpp | 9 ++-------
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/src/KOKKOS/nbin_ssa_kokkos.cpp b/src/KOKKOS/nbin_ssa_kokkos.cpp
index f11d7e18ef..883ba25b24 100644
--- a/src/KOKKOS/nbin_ssa_kokkos.cpp
+++ b/src/KOKKOS/nbin_ssa_kokkos.cpp
@@ -216,6 +216,9 @@ void NBinSSAKokkos<DeviceType>::bin_atoms()
   k_bincount.modify<DeviceType>();
   c_bins = bins; // bins won't change until the next bin_atoms
 
+  k_gbins.modify<DeviceType>();
+  k_gbincount.modify<DeviceType>();
+
 //now dispose of the k_binID array
   k_binID = DAT::tdual_int_1d("NBinSSAKokkos::binID",0);
   binID = k_binID.view<DeviceType>();
diff --git a/src/KOKKOS/npair_ssa_kokkos.cpp b/src/KOKKOS/npair_ssa_kokkos.cpp
index 368fb1a6ed..aec482993d 100644
--- a/src/KOKKOS/npair_ssa_kokkos.cpp
+++ b/src/KOKKOS/npair_ssa_kokkos.cpp
@@ -260,24 +260,18 @@ void NPairSSAKokkos<DeviceType>::build(NeighList *list_)
   k_ssa_gitemLen.sync<LMPHostType>();
   k_ssa_phaseOff.sync<LMPHostType>();
   k_ssa_phaseLen.sync<LMPHostType>();
-  k_ssa_gphaseLen.sync<LMPHostType>();
   auto h_ssa_itemLoc = k_ssa_itemLoc.h_view;
   auto h_ssa_itemLen = k_ssa_itemLen.h_view;
   auto h_ssa_gitemLoc = k_ssa_gitemLoc.h_view;
   auto h_ssa_gitemLen = k_ssa_gitemLen.h_view;
   auto h_ssa_phaseOff = k_ssa_phaseOff.h_view;
   auto h_ssa_phaseLen = k_ssa_phaseLen.h_view;
-  auto h_ssa_gphaseLen = k_ssa_gphaseLen.h_view;
 
 { // Preflight the neighbor list workplan
-  const typename ArrayTypes<DeviceType>::t_int_1d_const c_bincount = k_bincount.view<DeviceType>();
   k_bincount.sync<LMPHostType>();
   auto h_bincount = k_bincount.h_view;
-  const typename ArrayTypes<DeviceType>::t_int_2d_const c_bins     = k_bins.view<DeviceType>();
-  const typename ArrayTypes<DeviceType>::t_int_1d_const_um c_stencil = k_stencil.view<DeviceType>();
   k_stencil.sync<LMPHostType>();
   auto h_stencil = k_stencil.h_view;
-  const typename ArrayTypes<DeviceType>::t_int_1d_const c_nstencil_ssa = k_nstencil_ssa.view<DeviceType>();
   k_nstencil_ssa.sync<LMPHostType>();
   auto h_nstencil_ssa = k_nstencil_ssa.h_view;
   int inum = 0;
@@ -358,6 +352,7 @@ fprintf(stdout, "tota%03d total %3d could use %6d inums, expected %6d inums. inu
 }
 
   // count how many ghosts might have neighbors, and increase the work plan storage
+  k_gbincount.sync<LMPHostType>();
   for (int workPhase = 0; workPhase < ssa_gphaseCt; workPhase++) {
     int len = k_gbincount.h_view(workPhase + 1);
     h_ssa_gitemLoc(workPhase,0) = nl_size; // record where workItem starts in ilist
@@ -370,7 +365,6 @@ fprintf(stdout, "tota%03d total %3d could use %6d inums, expected %6d inums. inu
   k_ssa_itemLen.modify<LMPHostType>();
   k_ssa_gitemLoc.modify<LMPHostType>();
   k_ssa_gitemLen.modify<LMPHostType>();
-  k_ssa_phaseOff.modify<LMPHostType>();
   k_ssa_phaseLen.modify<LMPHostType>();
   k_ssa_itemLoc.sync<DeviceType>();
   k_ssa_itemLen.sync<DeviceType>();
@@ -481,6 +475,7 @@ fprintf(stdout, "tota%03d total %3d could use %6d inums, expected %6d inums. inu
     k_ssa_gitemLoc.sync<LMPHostType>();
     k_ssa_gitemLen.sync<LMPHostType>();
     k_ssa_gphaseLen.sync<LMPHostType>();
+    auto h_ssa_gphaseLen = k_ssa_gphaseLen.h_view;
     data.neigh_list.gnum = h_ssa_gitemLoc(ssa_gphaseCt-1,h_ssa_gphaseLen(ssa_gphaseCt-1)-1) +
       h_ssa_gitemLen(ssa_gphaseCt-1,h_ssa_gphaseLen(ssa_gphaseCt-1)-1) - data.neigh_list.inum;
     firstTry = false;

From 611bb6f130355d88c1b89e710cf963b629b2a443 Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Thu, 8 Jun 2017 09:31:51 -0600
Subject: [PATCH 241/267] Reduce memory churn in pair_table_rx_kokkos

---
 src/KOKKOS/pair_table_rx_kokkos.cpp | 26 ++++++++++++++------------
 src/KOKKOS/pair_table_rx_kokkos.h   |  5 +++++
 2 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/src/KOKKOS/pair_table_rx_kokkos.cpp b/src/KOKKOS/pair_table_rx_kokkos.cpp
index eacaf83cf5..2f5a670537 100644
--- a/src/KOKKOS/pair_table_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_table_rx_kokkos.cpp
@@ -534,10 +534,10 @@ static void compute_all_items(
     typename ArrayTypes<DeviceType>::t_int_1d_const d_numneigh,
     typename ArrayTypes<DeviceType>::t_x_array_randomread x,
     typename ArrayTypes<DeviceType>::t_int_1d_randomread type,
-    Kokkos::View<double*, DeviceType> mixWtSite1old,
-    Kokkos::View<double*, DeviceType> mixWtSite2old,
-    Kokkos::View<double*, DeviceType> mixWtSite1,
-    Kokkos::View<double*, DeviceType> mixWtSite2,
+    Kokkos::View<double*, DeviceType> const& mixWtSite1old,
+    Kokkos::View<double*, DeviceType> const& mixWtSite2old,
+    Kokkos::View<double*, DeviceType> const& mixWtSite1,
+    Kokkos::View<double*, DeviceType> const& mixWtSite2,
     Few<int, 4> special_lj,
     Few<Few<F_FLOAT, MAX_TYPES_STACKPARAMS+1>, MAX_TYPES_STACKPARAMS+1> m_cutsq,
     typename ArrayTypes<DeviceType>::t_ffloat_2d d_cutsq,
@@ -597,10 +597,10 @@ static void getAllMixingWeights(
     int nspecies,
     int isite1, int isite2,
     bool fractionalWeighting,
-    Kokkos::View<double*, DeviceType> mixWtSite1old,
-    Kokkos::View<double*, DeviceType> mixWtSite2old,
-    Kokkos::View<double*, DeviceType> mixWtSite1,
-    Kokkos::View<double*, DeviceType> mixWtSite2) {
+    Kokkos::View<double*, DeviceType> const& mixWtSite1old,
+    Kokkos::View<double*, DeviceType> const& mixWtSite2old,
+    Kokkos::View<double*, DeviceType> const& mixWtSite1,
+    Kokkos::View<double*, DeviceType> const& mixWtSite2) {
   Kokkos::parallel_for(ntotal,
   LAMMPS_LAMBDA(int i) {
       getMixingWeights<DeviceType>(dvector,nspecies,isite1,isite2,fractionalWeighting,
@@ -651,10 +651,12 @@ void PairTableRXKokkos<DeviceType>::compute_style(int eflag_in, int vflag_in)
   // loop over neighbors of my atoms
 
   const int ntotal = atom->nlocal + atom->nghost;
-  auto mixWtSite1old = Kokkos::View<double*, DeviceType>("PairTableRXKokkos::mixWtSite1old", ntotal);
-  auto mixWtSite2old = Kokkos::View<double*, DeviceType>("PairTableRXKokkos::mixWtSite2old", ntotal);
-  auto mixWtSite1 = Kokkos::View<double*, DeviceType>("PairTableRXKokkos::mixWtSite1", ntotal);
-  auto mixWtSite2 = Kokkos::View<double*, DeviceType>("PairTableRXKokkos::mixWtSite2", ntotal);
+  if (ntotal > mixWtSite1.dimension_0()) {
+    mixWtSite1old = Kokkos::View<double*, DeviceType>("PairTableRXKokkos::mixWtSite1old", ntotal);
+    mixWtSite2old = Kokkos::View<double*, DeviceType>("PairTableRXKokkos::mixWtSite2old", ntotal);
+    mixWtSite1 = Kokkos::View<double*, DeviceType>("PairTableRXKokkos::mixWtSite1", ntotal);
+    mixWtSite2 = Kokkos::View<double*, DeviceType>("PairTableRXKokkos::mixWtSite2", ntotal);
+  }
 
   getAllMixingWeights(ntotal, atomKK->k_dvector.template view<DeviceType>(),
       nspecies, isite1, isite2, fractionalWeighting,
diff --git a/src/KOKKOS/pair_table_rx_kokkos.h b/src/KOKKOS/pair_table_rx_kokkos.h
index 54c114a433..4230263dc9 100644
--- a/src/KOKKOS/pair_table_rx_kokkos.h
+++ b/src/KOKKOS/pair_table_rx_kokkos.h
@@ -96,6 +96,11 @@ class PairTableRXKokkos : public PairTable {
 
   /* PairTableRX members */
 
+  Kokkos::View<double*, DeviceType> mixWtSite1old;
+  Kokkos::View<double*, DeviceType> mixWtSite2old;
+  Kokkos::View<double*, DeviceType> mixWtSite1;
+  Kokkos::View<double*, DeviceType> mixWtSite2;
+
   int nspecies;
   char *site1, *site2;
   int isite1, isite2;

From 6f24c58c1a31d6f7fe8cac237e22b21c8a159660 Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Thu, 8 Jun 2017 09:52:00 -0600
Subject: [PATCH 242/267] Reduce memory churn in fix_rx_kokkos

---
 src/KOKKOS/fix_rx_kokkos.cpp | 52 ++++++++++++++++++++++--------------
 1 file changed, 32 insertions(+), 20 deletions(-)

diff --git a/src/KOKKOS/fix_rx_kokkos.cpp b/src/KOKKOS/fix_rx_kokkos.cpp
index d994b2c5d1..92db54d234 100644
--- a/src/KOKKOS/fix_rx_kokkos.cpp
+++ b/src/KOKKOS/fix_rx_kokkos.cpp
@@ -79,6 +79,17 @@ FixRxKokkos<DeviceType>::~FixRxKokkos()
 {
   //printf("Inside FixRxKokkos::~FixRxKokkos copymode= %d\n", copymode);
   if (copymode) return;
+
+  if (localTempFlag)
+    memory->destroy_kokkos(k_dpdThetaLocal, dpdThetaLocal);
+
+  memory->destroy_kokkos(k_sumWeights, sumWeights);
+  //memory->destroy_kokkos(k_sumWeights);
+
+  //delete [] scratchSpace;
+  memory->destroy_kokkos(d_scratchSpace);
+
+  memory->destroy_kokkos(k_cutsq);
 }
 
 /* ---------------------------------------------------------------------- */
@@ -1433,9 +1444,12 @@ void FixRxKokkos<DeviceType>::solve_reactions(const int vflag, const bool isPreF
   {
     const int count = nlocal + (newton_pair ? nghost : 0);
 
-    memory->create_kokkos (k_dpdThetaLocal, dpdThetaLocal, count, "FixRxKokkos::dpdThetaLocal");
-    this->d_dpdThetaLocal = k_dpdThetaLocal.d_view;
-    this->h_dpdThetaLocal = k_dpdThetaLocal.h_view;
+    if (count > k_dpdThetaLocal.d_view.dimension_0()) {
+      memory->destroy_kokkos (k_dpdThetaLocal, dpdThetaLocal);
+      memory->create_kokkos (k_dpdThetaLocal, dpdThetaLocal, count, "FixRxKokkos::dpdThetaLocal");
+      this->d_dpdThetaLocal = k_dpdThetaLocal.d_view;
+      this->h_dpdThetaLocal = k_dpdThetaLocal.h_view;
+    }
 
     const int neighflag = lmp->kokkos->neighflag;
 
@@ -1527,7 +1541,10 @@ void FixRxKokkos<DeviceType>::solve_reactions(const int vflag, const bool isPreF
   //double *scratchSpace = new double[ scratchSpaceSize * nlocal ];
 
   //typename ArrayTypes<DeviceType>::t_double_1d d_scratchSpace("d_scratchSpace", scratchSpaceSize * nlocal);
-  memory->create_kokkos (d_scratchSpace, nlocal*scratchSpaceSize, "FixRxKokkos::d_scratchSpace");
+  if (nlocal*scratchSpaceSize > d_scratchSpace.dimension_0()) {
+    memory->destroy_kokkos (d_scratchSpace);
+    memory->create_kokkos (d_scratchSpace, nlocal*scratchSpaceSize, "FixRxKokkos::d_scratchSpace");
+  }
 
 #if 0
   Kokkos::parallel_reduce( nlocal, LAMMPS_LAMBDA(int i, CounterType &counter)
@@ -1630,9 +1647,6 @@ void FixRxKokkos<DeviceType>::solve_reactions(const int vflag, const bool isPreF
     Kokkos::parallel_reduce( Kokkos::RangePolicy<DeviceType, Tag_FixRxKokkos_solveSystems<false> >(0,nlocal), *this, TotalCounters);
 #endif
 
-  //delete [] scratchSpace;
-  memory->destroy_kokkos (d_scratchSpace);
-
   TimerType timer_ODE = getTimeStamp();
 
   // Check the error flag for any failures.
@@ -1651,9 +1665,6 @@ void FixRxKokkos<DeviceType>::solve_reactions(const int vflag, const bool isPreF
 
   atomKK->modified ( Host, DVECTOR_MASK );
 
-  if (localTempFlag)
-    memory->destroy_kokkos(k_dpdThetaLocal, dpdThetaLocal);
-
   TimerType timer_stop = getTimeStamp();
 
   double time_ODE = getElapsedTime(timer_localTemperature, timer_ODE);
@@ -2012,8 +2023,11 @@ void FixRxKokkos<DeviceType>::computeLocalTemperature()
     const int ntypes = atom->ntypes;
 
     //memory->create_kokkos (k_cutsq, h_cutsq, ntypes+1, ntypes+1, "pair:cutsq");
-    memory->create_kokkos (k_cutsq, ntypes+1, ntypes+1, "FixRxKokkos::k_cutsq");
-    d_cutsq = k_cutsq.template view<DeviceType>();
+    if (ntypes+1 > k_cutsq.dimension_0()) {
+      memory->destroy_kokkos (k_cutsq);
+      memory->create_kokkos (k_cutsq, ntypes+1, ntypes+1, "FixRxKokkos::k_cutsq");
+      d_cutsq = k_cutsq.template view<DeviceType>();
+    }
 
     for (int i = 1; i <= ntypes; ++i)
       for (int j = i; j <= ntypes; ++j)
@@ -2030,9 +2044,12 @@ void FixRxKokkos<DeviceType>::computeLocalTemperature()
   int sumWeightsCt = nlocal + (NEWTON_PAIR ? nghost : 0);
 
   //memory->create_kokkos (k_sumWeights, sumWeights, sumWeightsCt, "FixRxKokkos::sumWeights");
-  memory->create_kokkos (k_sumWeights, sumWeightsCt, "FixRxKokkos::sumWeights");
-  d_sumWeights = k_sumWeights.d_view;
-  h_sumWeights = k_sumWeights.h_view;
+  if (sumWeightsCt > k_sumWeights.d_view.dimension_0()) {
+    memory->destroy_kokkos(k_sumWeights, sumWeights);
+    memory->create_kokkos (k_sumWeights, sumWeightsCt, "FixRxKokkos::sumWeights");
+    d_sumWeights = k_sumWeights.d_view;
+    h_sumWeights = k_sumWeights.h_view;
+  }
 
   // Initialize the accumulator to zero ...
   //Kokkos::parallel_for (sumWeightsCt,
@@ -2165,11 +2182,6 @@ void FixRxKokkos<DeviceType>::computeLocalTemperature()
   Kokkos::parallel_for (Kokkos::RangePolicy<DeviceType, Tag_FixRxKokkos_2ndPairOperator<WT_FLAG, LOCAL_TEMP_FLAG> >(0, nlocal), *this);
 #endif
 
-  // Clean up the local kokkos data.
-  //memory->destroy_kokkos(k_cutsq, h_cutsq);
-  memory->destroy_kokkos(k_cutsq);
-  //memory->destroy_kokkos(k_sumWeights, sumWeights);
-  memory->destroy_kokkos(k_sumWeights);
 }
 
 /* ---------------------------------------------------------------------- */

From 43cfa10ea48df7323ce4c15996aacddcb66b2228 Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Thu, 8 Jun 2017 09:58:10 -0600
Subject: [PATCH 243/267] Reduce memory churn in pair_multi_lucy_rx_kokkos

---
 src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp b/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp
index ef30fdc6f6..f7e1bad056 100644
--- a/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp
@@ -180,10 +180,12 @@ void PairMultiLucyRXKokkos<DeviceType>::compute_style(int eflag_in, int vflag_in
 
   {
     const int ntotal = nlocal + nghost;
-    d_mixWtSite1old = typename AT::t_float_1d("PairMultiLucyRX::mixWtSite1old",ntotal);
-    d_mixWtSite2old = typename AT::t_float_1d("PairMultiLucyRX::mixWtSite2old",ntotal);
-    d_mixWtSite1 = typename AT::t_float_1d("PairMultiLucyRX::mixWtSite1",ntotal);
-    d_mixWtSite2 = typename AT::t_float_1d("PairMultiLucyRX::mixWtSite2",ntotal);
+    if (ntotal > d_mixWtSite1.dimension_0()) {
+      d_mixWtSite1old = typename AT::t_float_1d("PairMultiLucyRX::mixWtSite1old",ntotal);
+      d_mixWtSite2old = typename AT::t_float_1d("PairMultiLucyRX::mixWtSite2old",ntotal);
+      d_mixWtSite1 = typename AT::t_float_1d("PairMultiLucyRX::mixWtSite1",ntotal);
+      d_mixWtSite2 = typename AT::t_float_1d("PairMultiLucyRX::mixWtSite2",ntotal);
+    }
 
     Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXgetMixingWeights>(0,ntotal),*this);
   }

From b4b7310884382a18f9439983a4c241c24998d88c Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Thu, 8 Jun 2017 13:33:23 -0600
Subject: [PATCH 244/267] Fixing CUDA runtime issues in pair_exp6_rx_kokkos

---
 src/KOKKOS/pair_exp6_rx_kokkos.cpp | 16 ++++++++--------
 src/KOKKOS/pair_exp6_rx_kokkos.h   |  2 +-
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/KOKKOS/pair_exp6_rx_kokkos.cpp b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
index 5b84f09fd6..1eb1c6c770 100644
--- a/src/KOKKOS/pair_exp6_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
@@ -426,7 +426,7 @@ KOKKOS_INLINE_FUNCTION
 void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxCompute<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int &ii, EV_FLOAT& ev) const {
 
   {
-    const bool one_type = (atom->ntypes == 1);
+    const bool one_type = (ntypes == 1);
     if (isite1 == isite2)
       if (one_type)
         this->vectorized_operator<NEIGHFLAG,NEWTON_PAIR,EVFLAG,true, true, true>(ii, ev);
@@ -797,7 +797,7 @@ KOKKOS_INLINE_FUNCTION
 void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxComputeNoAtomics<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int &ii, EV_FLOAT& ev) const {
 
   {
-    const bool one_type = (atom->ntypes == 1);
+    const bool one_type = (ntypes == 1);
     if (isite1 == isite2)
       if (one_type)
         this->vectorized_operator<NEIGHFLAG,NEWTON_PAIR,EVFLAG,true, false, true>(ii, ev);
@@ -1653,18 +1653,18 @@ template<class DeviceType>
 void PairExp6rxKokkos<DeviceType>::allocate()
 {
   allocated = 1;
-  int n = atom->ntypes;
+  ntypes = atom->ntypes;
 
-  memory->create(setflag,n+1,n+1,"pair:setflag");
-  for (int i = 1; i <= n; i++)
-    for (int j = i; j <= n; j++)
+  memory->create(setflag,ntypes+1,ntypes+1,"pair:setflag");
+  for (int i = 1; i <= ntypes; i++)
+    for (int j = i; j <= ntypes; j++)
       setflag[i][j] = 0;
 
-  memory->create_kokkos(k_cutsq,cutsq,n+1,n+1,"pair:cutsq");
+  memory->create_kokkos(k_cutsq,cutsq,ntypes+1,ntypes+1,"pair:cutsq");
   d_cutsq = k_cutsq.template view<DeviceType>();
   k_cutsq.template modify<LMPHostType>();
 
-  memory->create(cut,n+1,n+1,"pair:cut_lj");
+  memory->create(cut,ntypes+1,ntypes+1,"pair:cut_lj");
 }
 
 
diff --git a/src/KOKKOS/pair_exp6_rx_kokkos.h b/src/KOKKOS/pair_exp6_rx_kokkos.h
index 09283662a2..4c35c76851 100644
--- a/src/KOKKOS/pair_exp6_rx_kokkos.h
+++ b/src/KOKKOS/pair_exp6_rx_kokkos.h
@@ -145,7 +145,7 @@ class PairExp6rxKokkos : public PairExp6rx {
   int eflag,vflag;
   int nlocal,newton_pair,neighflag;
   double special_lj[4];
-  int num_threads;
+  int num_threads,ntypes;
 
   typename AT::t_x_array_randomread x;
   typename AT::t_f_array f;

From 86497949f20a2a6ae0609172e9aabf4e7221390d Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Thu, 8 Jun 2017 13:40:20 -0600
Subject: [PATCH 245/267] Fixing CUDA runtime issues in fix_shardlow_kokkos

---
 src/KOKKOS/fix_shardlow_kokkos.cpp | 21 ++++++++++++++++-----
 src/KOKKOS/fix_shardlow_kokkos.h   |  2 ++
 2 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/src/KOKKOS/fix_shardlow_kokkos.cpp b/src/KOKKOS/fix_shardlow_kokkos.cpp
index 52287d586c..b3d4e86244 100644
--- a/src/KOKKOS/fix_shardlow_kokkos.cpp
+++ b/src/KOKKOS/fix_shardlow_kokkos.cpp
@@ -444,9 +444,6 @@ void FixShardlowKokkos<DeviceType>::ssa_update_dpde(
   rand_type rand_gen = rand_pool.get_state(id);
 #endif
 
-  const double boltz_inv = 1.0/force->boltz;
-  const double ftm2v = force->ftm2v;
-  const double dt     = update->dt;
   int ct = count;
   int ii = start_ii;
 
@@ -639,6 +636,16 @@ void FixShardlowKokkos<DeviceType>::initial_integrate(int vflag)
   ssa_gitemLoc = np_ssa->ssa_gitemLoc;
   ssa_gitemLen = np_ssa->ssa_gitemLen;
 
+  np_ssa->k_ssa_itemLoc.template sync<DeviceType>();
+  np_ssa->k_ssa_itemLen.template sync<DeviceType>();
+  np_ssa->k_ssa_gitemLoc.template sync<DeviceType>();
+  np_ssa->k_ssa_gitemLen.template sync<DeviceType>();
+
+  np_ssa->k_ssa_phaseLen.template sync<LMPHostType>();
+  np_ssa->k_ssa_gphaseLen.template sync<LMPHostType>();
+  auto h_ssa_phaseLen = np_ssa->k_ssa_phaseLen.h_view;
+  auto h_ssa_gphaseLen = np_ssa->k_ssa_gphaseLen.h_view;
+
   int maxWorkItemCt = (int) ssa_itemLoc.dimension_1();
   if (maxWorkItemCt < (int) ssa_gitemLoc.dimension_1()) {
     maxWorkItemCt = (int) ssa_gitemLoc.dimension_1();
@@ -670,9 +677,13 @@ void FixShardlowKokkos<DeviceType>::initial_integrate(int vflag)
   deep_copy(d_hist, h_hist);
 #endif
 
+  boltz_inv = 1.0/force->boltz;
+  ftm2v = force->ftm2v;
+  dt     = update->dt;
+
   // process neighbors in the local AIR
   for (int workPhase = 0; workPhase < ssa_phaseCt; ++workPhase) {
-    int workItemCt = ssa_phaseLen[workPhase];
+    int workItemCt = h_ssa_phaseLen[workPhase];
 
     if(atom->ntypes > MAX_TYPES_STACKPARAMS) {
       Kokkos::parallel_for(workItemCt, LAMMPS_LAMBDA (const int workItem ) {
@@ -692,7 +703,7 @@ void FixShardlowKokkos<DeviceType>::initial_integrate(int vflag)
   //Loop over all 13 outward directions (7 stages)
   for (int workPhase = 0; workPhase < ssa_gphaseCt; ++workPhase) {
     // int airnum = workPhase + 1;
-    int workItemCt = ssa_gphaseLen[workPhase];
+    int workItemCt = h_ssa_gphaseLen[workPhase];
 
     // Communicate the updated velocities to all nodes
     comm->forward_comm_fix(this);
diff --git a/src/KOKKOS/fix_shardlow_kokkos.h b/src/KOKKOS/fix_shardlow_kokkos.h
index 4dc47709e1..df8849d80b 100644
--- a/src/KOKKOS/fix_shardlow_kokkos.h
+++ b/src/KOKKOS/fix_shardlow_kokkos.h
@@ -68,6 +68,8 @@ class FixShardlowKokkos : public FixShardlow {
 #endif
 
  protected:
+  double boltz_inv,ftm2v,dt;
+
 //  class PairDPDfdt *pairDPD;
   PairDPDfdtEnergyKokkos<DeviceType> *k_pairDPDE;
 

From c51cadcc6c38ff2c939fb0bed46dd73c09873c2d Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Fri, 9 Jun 2017 09:31:37 -0600
Subject: [PATCH 246/267] Fixing CUDA runtime issues in fix_shardlow_kokkos

---
 src/KOKKOS/fix_shardlow_kokkos.cpp | 66 ++++++++++++++++--------------
 src/KOKKOS/fix_shardlow_kokkos.h   | 17 +++++++-
 2 files changed, 51 insertions(+), 32 deletions(-)

diff --git a/src/KOKKOS/fix_shardlow_kokkos.cpp b/src/KOKKOS/fix_shardlow_kokkos.cpp
index b3d4e86244..d2fb937a57 100644
--- a/src/KOKKOS/fix_shardlow_kokkos.cpp
+++ b/src/KOKKOS/fix_shardlow_kokkos.cpp
@@ -436,7 +436,7 @@ template<bool STACKPARAMS>
 KOKKOS_INLINE_FUNCTION
 void FixShardlowKokkos<DeviceType>::ssa_update_dpde(
   int start_ii, int count, int id
-)
+) const
 {
 #ifdef DPD_USE_RAN_MARS
   class RanMars *pRNG = pp_random[id];
@@ -682,26 +682,18 @@ void FixShardlowKokkos<DeviceType>::initial_integrate(int vflag)
   dt     = update->dt;
 
   // process neighbors in the local AIR
-  for (int workPhase = 0; workPhase < ssa_phaseCt; ++workPhase) {
+  for (workPhase = 0; workPhase < ssa_phaseCt; ++workPhase) {
     int workItemCt = h_ssa_phaseLen[workPhase];
 
-    if(atom->ntypes > MAX_TYPES_STACKPARAMS) {
-      Kokkos::parallel_for(workItemCt, LAMMPS_LAMBDA (const int workItem ) {
-        int ct = ssa_itemLen(workPhase, workItem);
-        int ii = ssa_itemLoc(workPhase, workItem);
-        ssa_update_dpde<false>(ii, ct, workItem);
-      });
-    } else {
-      Kokkos::parallel_for(workItemCt, LAMMPS_LAMBDA (const int workItem ) {
-        int ct = ssa_itemLen(workPhase, workItem);
-        int ii = ssa_itemLoc(workPhase, workItem);
-        ssa_update_dpde<true>(ii, ct, workItem);
-      });
-    }
+
+    if(atom->ntypes > MAX_TYPES_STACKPARAMS)
+      Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType,TagFixShardlowSSAUpdateDPDE<false> >(0,workItemCt),*this);
+    else
+      Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType,TagFixShardlowSSAUpdateDPDE<true> >(0,workItemCt),*this);
   }
 
   //Loop over all 13 outward directions (7 stages)
-  for (int workPhase = 0; workPhase < ssa_gphaseCt; ++workPhase) {
+  for (workPhase = 0; workPhase < ssa_gphaseCt; ++workPhase) {
     // int airnum = workPhase + 1;
     int workItemCt = h_ssa_gphaseLen[workPhase];
 
@@ -713,27 +705,21 @@ void FixShardlowKokkos<DeviceType>::initial_integrate(int vflag)
 //      memset(&(atom->uCond[nlocal]), 0, sizeof(double)*nghost);
 //      memset(&(atom->uMech[nlocal]), 0, sizeof(double)*nghost);
 
+      // must capture local variables, not class variables
+      auto l_uCond = uCond;
+      auto l_uMech = uMech;
       Kokkos::parallel_for(Kokkos::RangePolicy<LMPDeviceType>(nlocal,nlocal+nghost), LAMMPS_LAMBDA (const int i) {
-        uCond(i) = 0.0;
-        uMech(i) = 0.0;
+        l_uCond(i) = 0.0;
+        l_uMech(i) = 0.0;
       });
       DeviceType::fence();
     }
 
     // process neighbors in this AIR
-    if(atom->ntypes > MAX_TYPES_STACKPARAMS) {
-      Kokkos::parallel_for(workItemCt, LAMMPS_LAMBDA (const int workItem ) {
-        int ct = ssa_gitemLen(workPhase, workItem);
-        int ii = ssa_gitemLoc(workPhase, workItem);
-        ssa_update_dpde<false>(ii, ct, workItem);
-      });
-    } else {
-      Kokkos::parallel_for(workItemCt, LAMMPS_LAMBDA (const int workItem ) {
-        int ct = ssa_gitemLen(workPhase, workItem);
-        int ii = ssa_gitemLoc(workPhase, workItem);
-        ssa_update_dpde<true>(ii, ct, workItem);
-      });
-    }
+    if(atom->ntypes > MAX_TYPES_STACKPARAMS)
+      Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType,TagFixShardlowSSAUpdateDPDEGhost<false> >(0,workItemCt),*this);
+    else
+      Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType,TagFixShardlowSSAUpdateDPDEGhost<true> >(0,workItemCt),*this);
 
     // Communicate the ghost deltas to the atom owners
     comm->reverse_comm_fix(this);
@@ -755,6 +741,24 @@ fprintf(stdout, "\n%6d %6d,%6d %6d: "
   copymode = 0;
 }
 
+template<class DeviceType>
+template<bool STACKPARAMS>
+KOKKOS_INLINE_FUNCTION
+void FixShardlowKokkos<DeviceType>::operator()(TagFixShardlowSSAUpdateDPDE<STACKPARAMS>, const int &workItem) const {
+  const int ct = ssa_itemLen(workPhase, workItem);
+  const int ii = ssa_itemLoc(workPhase, workItem);
+  ssa_update_dpde<STACKPARAMS>(ii, ct, workItem);
+}
+
+template<class DeviceType>
+template<bool STACKPARAMS>
+KOKKOS_INLINE_FUNCTION
+void FixShardlowKokkos<DeviceType>::operator()(TagFixShardlowSSAUpdateDPDEGhost<STACKPARAMS>, const int &workItem) const {
+  const int ct = ssa_gitemLen(workPhase, workItem);
+  const int ii = ssa_gitemLoc(workPhase, workItem);
+  ssa_update_dpde<STACKPARAMS>(ii, ct, workItem);
+}
+
 /* ---------------------------------------------------------------------- */
 
 template<class DeviceType>
diff --git a/src/KOKKOS/fix_shardlow_kokkos.h b/src/KOKKOS/fix_shardlow_kokkos.h
index df8849d80b..91a2fdbc97 100644
--- a/src/KOKKOS/fix_shardlow_kokkos.h
+++ b/src/KOKKOS/fix_shardlow_kokkos.h
@@ -30,6 +30,12 @@ FixStyle(shardlow/kk/host,FixShardlowKokkos<LMPHostType>)
 
 namespace LAMMPS_NS {
 
+template<bool STACKPARAMS>
+struct TagFixShardlowSSAUpdateDPDE{};
+
+template<bool STACKPARAMS>
+struct TagFixShardlowSSAUpdateDPDEGhost{};
+
 template<class DeviceType>
 class FixShardlowKokkos : public FixShardlow {
  public:
@@ -60,6 +66,14 @@ class FixShardlowKokkos : public FixShardlow {
     F_FLOAT cutinv,halfsigma,kappa,alpha;
   };
 
+  template<bool STACKPARAMS>
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagFixShardlowSSAUpdateDPDE<STACKPARAMS>, const int&) const;
+
+  template<bool STACKPARAMS>
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagFixShardlowSSAUpdateDPDEGhost<STACKPARAMS>, const int&) const;
+
 #ifdef DEBUG_PAIR_CT
   typename AT::t_int_2d d_counters;
   typename HAT::t_int_2d h_counters;
@@ -68,6 +82,7 @@ class FixShardlowKokkos : public FixShardlow {
 #endif
 
  protected:
+  int workPhase;
   double boltz_inv,ftm2v,dt;
 
 //  class PairDPDfdt *pairDPD;
@@ -127,7 +142,7 @@ class FixShardlowKokkos : public FixShardlow {
 //  void ssa_update_dpd(int, int);  // Constant Temperature
   template<bool STACKPARAMS>
   KOKKOS_INLINE_FUNCTION
-  void ssa_update_dpde(int, int, int); // Constant Energy
+  void ssa_update_dpde(int, int, int) const; // Constant Energy
 
 };
 

From 3c8e75ad590ae35be1002ce88281d88d8bbfc6f9 Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Fri, 9 Jun 2017 10:57:35 -0600
Subject: [PATCH 247/267] Add missing sync/modify to fix_shardlow_kokkos

---
 src/KOKKOS/fix_shardlow_kokkos.cpp | 27 ++++++++++++++++++---------
 src/KOKKOS/fix_shardlow_kokkos.h   |  2 +-
 2 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/src/KOKKOS/fix_shardlow_kokkos.cpp b/src/KOKKOS/fix_shardlow_kokkos.cpp
index d2fb937a57..0c7c51c821 100644
--- a/src/KOKKOS/fix_shardlow_kokkos.cpp
+++ b/src/KOKKOS/fix_shardlow_kokkos.cpp
@@ -73,11 +73,11 @@ FixShardlowKokkos<DeviceType>::FixShardlowKokkos(LAMMPS *lmp, int narg, char **a
   FixShardlow(lmp, narg, arg), k_pairDPDE(NULL), ghostmax(0), nlocal(0) , nghost(0)
 {
   kokkosable = 1;
-//  atomKK = (AtomKokkos *) atom;
-//  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
+  atomKK = (AtomKokkos *) atom;
+  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
 
-//  datamask_read = X_MASK | V_MASK | F_MASK | MASK_MASK | Q_MASK | TYPE_MASK;
-//  datamask_modify = Q_MASK | X_MASK;
+  datamask_read = EMPTY_MASK;
+  datamask_modify = EMPTY_MASK;
 
   if (narg != 3) error->all(FLERR,"Illegal fix shardlow command");
 
@@ -167,6 +167,7 @@ void FixShardlowKokkos<DeviceType>::init()
 //FIXME either create cutsq and fill it in, or just point to pairDPD's...
 //  memory->destroy(cutsq); //FIXME
 //  memory->create_kokkos(k_cutsq,cutsq,ntypes+1,ntypes+1,"FixShardlowKokkos:cutsq");
+  k_pairDPDE->k_cutsq.template sync<DeviceType>();
   d_cutsq = k_pairDPDE->k_cutsq.template view<DeviceType>(); //FIXME
 
   const double boltz2 = 2.0*force->boltz;
@@ -288,10 +289,6 @@ void FixShardlowKokkos<DeviceType>::ssa_update_dpd(
   rand_type rand_gen = rand_pool.get_state(id);
 #endif
 
-  const double theta_ij_inv = 1.0/k_pairDPD->temperature; // independent of i,j
-  const double boltz_inv = 1.0/force->boltz;
-  const double ftm2v = force->ftm2v;
-  const double dt     = update->dt;
   int ct = count;
   int ii = start_ii;
 
@@ -677,20 +674,24 @@ void FixShardlowKokkos<DeviceType>::initial_integrate(int vflag)
   deep_copy(d_hist, h_hist);
 #endif
 
+  //theta_ij_inv = 1.0/k_pairDPD->temperature; // independent of i,j
   boltz_inv = 1.0/force->boltz;
   ftm2v = force->ftm2v;
   dt     = update->dt;
 
+  k_params.template sync<DeviceType>();
+
   // process neighbors in the local AIR
+  atomKK->sync(execution_space,X_MASK | V_MASK | TYPE_MASK | RMASS_MASK | UCOND_MASK | UMECH_MASK | DPDTHETA_MASK);
   for (workPhase = 0; workPhase < ssa_phaseCt; ++workPhase) {
     int workItemCt = h_ssa_phaseLen[workPhase];
 
-
     if(atom->ntypes > MAX_TYPES_STACKPARAMS)
       Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType,TagFixShardlowSSAUpdateDPDE<false> >(0,workItemCt),*this);
     else
       Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType,TagFixShardlowSSAUpdateDPDE<true> >(0,workItemCt),*this);
   }
+  atomKK->modified(execution_space,V_MASK | UCOND_MASK | UMECH_MASK);
 
   //Loop over all 13 outward directions (7 stages)
   for (workPhase = 0; workPhase < ssa_gphaseCt; ++workPhase) {
@@ -698,7 +699,9 @@ void FixShardlowKokkos<DeviceType>::initial_integrate(int vflag)
     int workItemCt = h_ssa_gphaseLen[workPhase];
 
     // Communicate the updated velocities to all nodes
+    atomKK->sync(Host,V_MASK);
     comm->forward_comm_fix(this);
+    atomKK->modified(Host,V_MASK);
 
     if(k_pairDPDE){
       // Zero out the ghosts' uCond & uMech to be used as delta accumulators
@@ -706,6 +709,7 @@ void FixShardlowKokkos<DeviceType>::initial_integrate(int vflag)
 //      memset(&(atom->uMech[nlocal]), 0, sizeof(double)*nghost);
 
       // must capture local variables, not class variables
+      atomKK->sync(execution_space,UCOND_MASK | UMECH_MASK);
       auto l_uCond = uCond;
       auto l_uMech = uMech;
       Kokkos::parallel_for(Kokkos::RangePolicy<LMPDeviceType>(nlocal,nlocal+nghost), LAMMPS_LAMBDA (const int i) {
@@ -713,16 +717,21 @@ void FixShardlowKokkos<DeviceType>::initial_integrate(int vflag)
         l_uMech(i) = 0.0;
       });
       DeviceType::fence();
+      atomKK->modified(execution_space,UCOND_MASK | UMECH_MASK);
     }
 
     // process neighbors in this AIR
+    atomKK->sync(execution_space,X_MASK | V_MASK | TYPE_MASK | RMASS_MASK | UCOND_MASK | UMECH_MASK | DPDTHETA_MASK);
     if(atom->ntypes > MAX_TYPES_STACKPARAMS)
       Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType,TagFixShardlowSSAUpdateDPDEGhost<false> >(0,workItemCt),*this);
     else
       Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType,TagFixShardlowSSAUpdateDPDEGhost<true> >(0,workItemCt),*this);
+    atomKK->modified(execution_space,V_MASK | UCOND_MASK | UMECH_MASK);
 
     // Communicate the ghost deltas to the atom owners
+    atomKK->sync(Host,V_MASK | UCOND_MASK | UMECH_MASK);
     comm->reverse_comm_fix(this);
+    atomKK->modified(Host,V_MASK | UCOND_MASK | UMECH_MASK);
 
   }  //End Loop over all directions For airnum = Top, Top-Right, Right, Bottom-Right, Back
 
diff --git a/src/KOKKOS/fix_shardlow_kokkos.h b/src/KOKKOS/fix_shardlow_kokkos.h
index 91a2fdbc97..3dbbaaa61c 100644
--- a/src/KOKKOS/fix_shardlow_kokkos.h
+++ b/src/KOKKOS/fix_shardlow_kokkos.h
@@ -83,7 +83,7 @@ class FixShardlowKokkos : public FixShardlow {
 
  protected:
   int workPhase;
-  double boltz_inv,ftm2v,dt;
+  double theta_ij_inv,boltz_inv,ftm2v,dt;
 
 //  class PairDPDfdt *pairDPD;
   PairDPDfdtEnergyKokkos<DeviceType> *k_pairDPDE;

From b96b6b9cd775b43007777ba2182949c331ca9fb2 Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Mon, 19 Jun 2017 14:04:16 -0600
Subject: [PATCH 248/267] Fixing error checks

---
 src/KOKKOS/fix_rx_kokkos.cpp  | 2 +-
 src/USER-DPD/fix_shardlow.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/KOKKOS/fix_rx_kokkos.cpp b/src/KOKKOS/fix_rx_kokkos.cpp
index ac81e5c2a7..6fbdfad289 100644
--- a/src/KOKKOS/fix_rx_kokkos.cpp
+++ b/src/KOKKOS/fix_rx_kokkos.cpp
@@ -116,7 +116,7 @@ void FixRxKokkos<DeviceType>::init()
 
   bool eos_flag = false;
   for (int i = 0; i < modify->nfix; i++)
-    if (strcmp(modify->fix[i]->style,"eos/table/rx") == 0) eos_flag = true;
+    if (strncmp(modify->fix[i]->style,"eos/table/rx",3) == 0) eos_flag = true;
   if(!eos_flag) error->all(FLERR,"fix rx requires fix eos/table/rx to be specified");
 
   if (update_kinetics_data)
diff --git a/src/USER-DPD/fix_shardlow.cpp b/src/USER-DPD/fix_shardlow.cpp
index a1059e2fb0..f3057a6563 100644
--- a/src/USER-DPD/fix_shardlow.cpp
+++ b/src/USER-DPD/fix_shardlow.cpp
@@ -157,7 +157,7 @@ void FixShardlow::setup(int vflag)
       error->all(FLERR,"Cannot use constant temperature integration routines with DPD.");
 
   for (int i = 0; i < modify->nfix; i++){
-    if (strcmp(modify->fix[i]->style,"shardlow") == 0) fixShardlow = true;
+    if (strncmp(modify->fix[i]->style,"shardlow",3) == 0) fixShardlow = true;
     if (strncmp(modify->fix[i]->style,"nve",3) == 0 || (strncmp(modify->fix[i]->style,"nph",3) == 0)){
       if(fixShardlow) break;
       else error->all(FLERR,"The deterministic integrator must follow fix shardlow in the input file.");

From 67a0183b333225a89902aba88c4bdc69160709d6 Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Mon, 19 Jun 2017 15:23:33 -0600
Subject: [PATCH 249/267] Removing atom2bin change since ssa neighlists aren't
 being used for occasional lists

---
 src/USER-DPD/nbin_ssa.cpp                  | 3 ---
 src/USER-DPD/npair_half_bin_newton_ssa.cpp | 2 +-
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/USER-DPD/nbin_ssa.cpp b/src/USER-DPD/nbin_ssa.cpp
index 5dacf52ee1..4c57a8e70f 100644
--- a/src/USER-DPD/nbin_ssa.cpp
+++ b/src/USER-DPD/nbin_ssa.cpp
@@ -76,7 +76,6 @@ void NBinSSA::bin_atoms()
     int nowned = atom->nlocal; // NOTE: nlocal was set to atom->nfirst above
     for (i = nall-1; i >= nowned; i--) {
       ibin = coord2ssaAIR(x[i]);
-      atom2bin[i] = ibin;
       if (ibin < 1) continue; // skip ghost atoms not in AIR
       if (mask[i] & bitmask) {
         bins[i] = gairhead_ssa[ibin];
@@ -86,7 +85,6 @@ void NBinSSA::bin_atoms()
   } else {
     for (i = nall-1; i >= nlocal; i--) {
       ibin = coord2ssaAIR(x[i]);
-      atom2bin[i] = ibin;
       if (ibin < 1) continue; // skip ghost atoms not in AIR
       bins[i] = gairhead_ssa[ibin];
       gairhead_ssa[ibin] = i;
@@ -94,7 +92,6 @@ void NBinSSA::bin_atoms()
   }
   for (i = nlocal-1; i >= 0; i--) {
     ibin = coord2bin(x[i][0], x[i][1], x[i][2], xbin, ybin, zbin);
-    atom2bin[i] = ibin;
     // Find the bounding box of the local atoms in the bins
     if (xbin < lbinxlo) lbinxlo = xbin;
     if (xbin >= lbinxhi) lbinxhi = xbin + 1;
diff --git a/src/USER-DPD/npair_half_bin_newton_ssa.cpp b/src/USER-DPD/npair_half_bin_newton_ssa.cpp
index 221aa5b454..a6479d4c4f 100644
--- a/src/USER-DPD/npair_half_bin_newton_ssa.cpp
+++ b/src/USER-DPD/npair_half_bin_newton_ssa.cpp
@@ -251,7 +251,7 @@ void NPairHalfBinNewtonSSA::build(NeighList *list)
       ytmp = x[i][1];
       ztmp = x[i][2];
 
-      ibin = atom2bin[i];
+      ibin = coord2bin(x[i],xbin,ybin,zbin);
 
       // loop over AIR ghost atoms in all bins in "full" stencil
       // Note: the non-AIR ghost atoms have already been filtered out

From 270abff2a2a1923fe1cd66de92a64caecf9579b3 Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Wed, 23 Aug 2017 14:59:19 -0600
Subject: [PATCH 250/267] Fix compile error for CUDA in pair_exp6_rx_kokkos

---
 src/KOKKOS/pair_exp6_rx_kokkos.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/KOKKOS/pair_exp6_rx_kokkos.cpp b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
index 1eb1c6c770..46e06ca200 100644
--- a/src/KOKKOS/pair_exp6_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
@@ -2112,6 +2112,7 @@ void partition_range( const int begin, const int end, int &thread_begin, int &th
 
 /* ---------------------------------------------------------------------- */
 
+#ifndef KOKKOS_HAVE_CUDA
 template<class DeviceType>
   template<class ArrayT>
 void PairExp6rxKokkos<DeviceType>::getMixingWeightsVect(const int np_total, int errorFlag, 
@@ -2460,6 +2461,7 @@ void PairExp6rxKokkos<DeviceType>::getMixingWeightsVect(const int np_total, int
   if (errorFlag2 > 0)
     errorFlag = 2;
 }
+#endif
 
 /* ---------------------------------------------------------------------- */
 

From 4784506ba907a8f209e6872e9d5a4e020bb30fcd Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Wed, 23 Aug 2017 15:02:26 -0600
Subject: [PATCH 251/267] Remove unused function in rand_pool_wrap_kokkos

---
 src/KOKKOS/rand_pool_wrap_kokkos.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/KOKKOS/rand_pool_wrap_kokkos.h b/src/KOKKOS/rand_pool_wrap_kokkos.h
index ce134e5215..975ce0c89a 100644
--- a/src/KOKKOS/rand_pool_wrap_kokkos.h
+++ b/src/KOKKOS/rand_pool_wrap_kokkos.h
@@ -69,8 +69,6 @@ class RandPoolWrap : protected Pointers {
 
   }
 
-  void clean_copy() { random_thr = NULL; }
-
  private:
   class RanMars **random_thr;
   int nthreads;

From f5a99dece766a3dc35ae1c2b63cf4d7c4f75795d Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Wed, 23 Aug 2017 15:08:44 -0600
Subject: [PATCH 252/267] Remove unnecessary thread fences

---
 src/KOKKOS/fix_shardlow_kokkos.cpp       | 1 -
 src/KOKKOS/fix_wall_lj93_kokkos.cpp      | 1 -
 src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp | 2 --
 3 files changed, 4 deletions(-)

diff --git a/src/KOKKOS/fix_shardlow_kokkos.cpp b/src/KOKKOS/fix_shardlow_kokkos.cpp
index 0c7c51c821..e3d9723c53 100644
--- a/src/KOKKOS/fix_shardlow_kokkos.cpp
+++ b/src/KOKKOS/fix_shardlow_kokkos.cpp
@@ -716,7 +716,6 @@ void FixShardlowKokkos<DeviceType>::initial_integrate(int vflag)
         l_uCond(i) = 0.0;
         l_uMech(i) = 0.0;
       });
-      DeviceType::fence();
       atomKK->modified(execution_space,UCOND_MASK | UMECH_MASK);
     }
 
diff --git a/src/KOKKOS/fix_wall_lj93_kokkos.cpp b/src/KOKKOS/fix_wall_lj93_kokkos.cpp
index 38c7347e97..b0f7e0bda4 100644
--- a/src/KOKKOS/fix_wall_lj93_kokkos.cpp
+++ b/src/KOKKOS/fix_wall_lj93_kokkos.cpp
@@ -62,7 +62,6 @@ void FixWallLJ93Kokkos<DeviceType>::wall_particle(int m_in, int which, double co
   copymode = 1;
   FixWallLJ93KokkosFunctor<DeviceType> wp_functor(this);
   Kokkos::parallel_reduce(nlocal,wp_functor,ewall);
-  DeviceType::fence();
   copymode = 0;
 
   atomKK->modified(execution_space, F_MASK);
diff --git a/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp b/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp
index f7e1bad056..08e0f5096e 100644
--- a/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp
@@ -673,7 +673,6 @@ int PairMultiLucyRXKokkos<DeviceType>::pack_forward_comm_kokkos(int n, DAT::tdua
   iswap = iswap_in;
   v_buf = buf.view<DeviceType>();
   Kokkos::parallel_for(Kokkos::RangePolicy<LMPDeviceType, TagPairMultiLucyRXPackForwardComm>(0,n),*this);
-  DeviceType::fence();
   return n;
 }
 
@@ -692,7 +691,6 @@ void PairMultiLucyRXKokkos<DeviceType>::unpack_forward_comm_kokkos(int n, int fi
   first = first_in;
   v_buf = buf.view<DeviceType>();
   Kokkos::parallel_for(Kokkos::RangePolicy<LMPDeviceType, TagPairMultiLucyRXUnpackForwardComm>(0,n),*this);
-  DeviceType::fence();
 
   atomKK->modified(execution_space,DPDRHO_MASK);
 }

From a641289d5ba30cf60c0c6ff4f3177b0f03836138 Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Fri, 25 Aug 2017 12:36:53 -0600
Subject: [PATCH 253/267] Must use atomics for GPUs in pair_exp6_rx_kokkos

---
 src/KOKKOS/kokkos.cpp              | 8 --------
 src/KOKKOS/kokkos.h                | 1 -
 src/KOKKOS/pair_exp6_rx_kokkos.cpp | 6 +++---
 3 files changed, 3 insertions(+), 12 deletions(-)

diff --git a/src/KOKKOS/kokkos.cpp b/src/KOKKOS/kokkos.cpp
index 10e7bda4e0..072a802b54 100644
--- a/src/KOKKOS/kokkos.cpp
+++ b/src/KOKKOS/kokkos.cpp
@@ -34,7 +34,6 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
   lmp->kokkos = this;
 
   auto_sync = 1;
-  gb_test = 1;
 
   int me = 0;
   MPI_Comm_rank(world,&me);
@@ -157,7 +156,6 @@ void KokkosLMP::accelerator(int narg, char **arg)
   neighflag = FULL;
   neighflag_qeq = FULL;
   neighflag_qeq_set = 0;
-  gb_test = 1;
   int newtonflag = 0;
   double binsize = 0.0;
   exchange_comm_classic = forward_comm_classic = 0;
@@ -199,12 +197,6 @@ void KokkosLMP::accelerator(int narg, char **arg)
       else if (strcmp(arg[iarg+1],"on") == 0) newtonflag = 1;
       else error->all(FLERR,"Illegal package kokkos command");
       iarg += 2;
-    } else if (strcmp(arg[iarg],"gb/test") == 0) {
-      if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command");
-      if (strcmp(arg[iarg+1],"off") == 0) gb_test = 0;
-      else if (strcmp(arg[iarg+1],"on") == 0) gb_test = 1;
-      else error->all(FLERR,"Illegal package kokkos command");
-      iarg += 2;
     } else if (strcmp(arg[iarg],"comm") == 0) {
       if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command");
       if (strcmp(arg[iarg+1],"no") == 0) {
diff --git a/src/KOKKOS/kokkos.h b/src/KOKKOS/kokkos.h
index 3784d806bf..8e28b38cbf 100644
--- a/src/KOKKOS/kokkos.h
+++ b/src/KOKKOS/kokkos.h
@@ -32,7 +32,6 @@ class KokkosLMP : protected Pointers {
   int num_threads,ngpu;
   int numa;
   int auto_sync;
-  int gb_test;
 
   KokkosLMP(class LAMMPS *, int, char **);
   ~KokkosLMP();
diff --git a/src/KOKKOS/pair_exp6_rx_kokkos.cpp b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
index 46e06ca200..b3e413428d 100644
--- a/src/KOKKOS/pair_exp6_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
@@ -275,7 +275,7 @@ void PairExp6rxKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
 
   EV_FLOAT ev;
 
-  if (!lmp->kokkos->gb_test) {
+#ifdef KOKKOS_HAVE_CUDA  // Use atomics
 
   if (neighflag == HALF) {
     if (newton_pair) {
@@ -303,7 +303,7 @@ void PairExp6rxKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
     }
   }
 
-  } else { // No atomics
+#else // No atomics
 
   num_threads = lmp->kokkos->num_threads;
   int nmax = f.dimension_0();
@@ -343,7 +343,7 @@ void PairExp6rxKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
 
   Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairExp6rxCollapseDupViews>(0,nmax),*this);
 
-  }
+#endif
 
   k_error_flag.template modify<DeviceType>();
   k_error_flag.template sync<LMPHostType>();

From a062944de95e013abafb4604f0b3e1d830d8a161 Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Wed, 23 Aug 2017 16:08:01 -0600
Subject: [PATCH 254/267] Fix execution space issues

---
 src/KOKKOS/atom_vec_dpd_kokkos.h          |  2 ++
 src/KOKKOS/fix_eos_table_rx_kokkos.cpp    | 12 ++++++------
 src/KOKKOS/fix_rx_kokkos.cpp              | 20 +++++++++----------
 src/KOKKOS/fix_rx_kokkos.h                | 14 +++++++------
 src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp |  6 +++---
 src/KOKKOS/pair_dpd_fdt_energy_kokkos.h   |  4 ++--
 src/KOKKOS/pair_exp6_rx_kokkos.cpp        | 24 +++++++++++------------
 src/KOKKOS/pair_exp6_rx_kokkos.h          |  4 ++--
 src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp  | 16 +++++++--------
 src/KOKKOS/pair_multi_lucy_rx_kokkos.h    |  4 ++--
 10 files changed, 55 insertions(+), 51 deletions(-)

diff --git a/src/KOKKOS/atom_vec_dpd_kokkos.h b/src/KOKKOS/atom_vec_dpd_kokkos.h
index d108e58ae7..372404cc7d 100644
--- a/src/KOKKOS/atom_vec_dpd_kokkos.h
+++ b/src/KOKKOS/atom_vec_dpd_kokkos.h
@@ -14,6 +14,8 @@
 #ifdef ATOM_CLASS
 
 AtomStyle(dpd/kk,AtomVecDPDKokkos)
+AtomStyle(dpd/kk/device,AtomVecDPDKokkos)
+AtomStyle(dpd/kk/host,AtomVecDPDKokkos)
 
 #else
 
diff --git a/src/KOKKOS/fix_eos_table_rx_kokkos.cpp b/src/KOKKOS/fix_eos_table_rx_kokkos.cpp
index 8487fd4c4f..552141ced2 100644
--- a/src/KOKKOS/fix_eos_table_rx_kokkos.cpp
+++ b/src/KOKKOS/fix_eos_table_rx_kokkos.cpp
@@ -197,7 +197,7 @@ void FixEOStableRXKokkos<DeviceType>::operator()(TagFixEOStableRXInit, const int
   double tmp;
   if (mask[i] & groupbit) {
     if(dpdTheta[i] <= 0.0)
-      k_error_flag.d_view() = 1;
+      k_error_flag.template view<DeviceType>()() = 1;
     energy_lookup(i,dpdTheta[i],tmp);
     uCond[i] = 0.0;
     uMech[i] = tmp;
@@ -239,7 +239,7 @@ void FixEOStableRXKokkos<DeviceType>::operator()(TagFixEOStableRXTemperatureLook
   if (mask[i] & groupbit){
     temperature_lookup(i,uCond[i]+uMech[i]+uChem[i],dpdTheta[i]);
     if (dpdTheta[i] <= 0.0)
-      k_error_flag.d_view() = 1;
+      k_error_flag.template view<DeviceType>()() = 1;
   }
 }
 
@@ -387,11 +387,11 @@ void FixEOStableRXKokkos<DeviceType>::temperature_lookup(int id, double ui, doub
   // Apply the Secant Method
   for(it=0; it<maxit; it++){
     if(fabs(f2-f1) < MY_EPSILON){
-      if(isnan(f1) || isnan(f2)) k_error_flag.d_view() = 2;
+      if(isnan(f1) || isnan(f2)) k_error_flag.template view<DeviceType>()() = 2;
       temp = t1;
       temp = MAX(temp,lo);
       temp = MIN(temp,hi);
-      k_warning_flag.d_view() = 1;
+      k_warning_flag.template view<DeviceType>()() = 1;
       break;
     }
     temp = t2 - f2*(t2-t1)/(f2-f1);
@@ -404,9 +404,9 @@ void FixEOStableRXKokkos<DeviceType>::temperature_lookup(int id, double ui, doub
   }
   if(it==maxit){
     if(isnan(f1) || isnan(f2) || isnan(ui) || isnan(thetai) || isnan(t1) || isnan(t2))
-      k_error_flag.d_view() = 2;
+      k_error_flag.template view<DeviceType>()() = 2;
     else
-      k_error_flag.d_view() = 3;
+      k_error_flag.template view<DeviceType>()() = 3;
   }
   thetai = temp;
 }
diff --git a/src/KOKKOS/fix_rx_kokkos.cpp b/src/KOKKOS/fix_rx_kokkos.cpp
index f04b1a3a49..b1cfd20be2 100644
--- a/src/KOKKOS/fix_rx_kokkos.cpp
+++ b/src/KOKKOS/fix_rx_kokkos.cpp
@@ -1403,7 +1403,7 @@ void FixRxKokkos<DeviceType>::operator()(Tag_FixRxKokkos_solveSystems<ZERO_RATES
       if (y[ispecies] < -1.0e-10)
       {
         //error->one(FLERR,"Computed concentration in RK solver is < -1.0e-10");
-        k_error_flag.d_view() = 2;
+        k_error_flag.template view<DeviceType>()() = 2;
         // This should be an atomic update.
       }
       else if (y[ispecies] < MY_EPSILON)
@@ -1444,10 +1444,10 @@ void FixRxKokkos<DeviceType>::solve_reactions(const int vflag, const bool isPreF
   {
     const int count = nlocal + (newton_pair ? nghost : 0);
 
-    if (count > k_dpdThetaLocal.d_view.dimension_0()) {
+    if (count > k_dpdThetaLocal.template view<DeviceType>().dimension_0()) {
       memory->destroy_kokkos (k_dpdThetaLocal, dpdThetaLocal);
       memory->create_kokkos (k_dpdThetaLocal, dpdThetaLocal, count, "FixRxKokkos::dpdThetaLocal");
-      this->d_dpdThetaLocal = k_dpdThetaLocal.d_view;
+      this->d_dpdThetaLocal = k_dpdThetaLocal.template view<DeviceType>();
       this->h_dpdThetaLocal = k_dpdThetaLocal.h_view;
     }
 
@@ -1514,8 +1514,8 @@ void FixRxKokkos<DeviceType>::solve_reactions(const int vflag, const bool isPreF
     memory->create_kokkos (k_diagnosticCounterPerODEnSteps, diagnosticCounterPerODEnSteps, nlocal, "FixRxKokkos::diagnosticCounterPerODEnSteps");
     memory->create_kokkos (k_diagnosticCounterPerODEnFuncs, diagnosticCounterPerODEnFuncs, nlocal, "FixRxKokkos::diagnosticCounterPerODEnFuncs");
 
-    d_diagnosticCounterPerODEnSteps = k_diagnosticCounterPerODEnSteps.d_view;
-    d_diagnosticCounterPerODEnFuncs = k_diagnosticCounterPerODEnFuncs.d_view;
+    d_diagnosticCounterPerODEnSteps = k_diagnosticCounterPerODEnSteps.template view<DeviceType>();
+    d_diagnosticCounterPerODEnFuncs = k_diagnosticCounterPerODEnFuncs.template view<DeviceType>();
 
     Kokkos::parallel_for ( Kokkos::RangePolicy<DeviceType, Tag_FixRxKokkos_zeroCounterViews>(0,nlocal), *this);
     //Kokkos::parallel_for ( nlocal,
@@ -1619,7 +1619,7 @@ void FixRxKokkos<DeviceType>::solve_reactions(const int vflag, const bool isPreF
           if (y[ispecies] < -1.0e-10)
           {
             //error->one(FLERR,"Computed concentration in RK solver is < -1.0e-10");
-            k_error_flag.d_view() = 2;
+            k_error_flag.template view<DeviceType>()() = 2;
             // This should be an atomic update.
           }
           else if (y[ispecies] < MY_EPSILON)
@@ -1907,7 +1907,7 @@ void FixRxKokkos<DeviceType>::operator()(Tag_FixRxKokkos_firstPairOperator<WT_FL
 {
   // Create an atomic view of sumWeights and dpdThetaLocal. Only needed
   // for Half/thread scenarios.
-  typedef Kokkos::View< E_FLOAT*, typename DAT::t_efloat_1d::array_layout, typename DAT::t_efloat_1d::device_type, Kokkos::MemoryTraits< AtomicF< NEIGHFLAG >::value> > AtomicViewType;
+  typedef Kokkos::View< E_FLOAT*, typename DAT::t_efloat_1d::array_layout, DeviceType, Kokkos::MemoryTraits< AtomicF< NEIGHFLAG >::value> > AtomicViewType;
 
   AtomicViewType a_dpdThetaLocal = d_dpdThetaLocal;
   AtomicViewType a_sumWeights    = d_sumWeights;
@@ -2044,10 +2044,10 @@ void FixRxKokkos<DeviceType>::computeLocalTemperature()
   int sumWeightsCt = nlocal + (NEWTON_PAIR ? nghost : 0);
 
   //memory->create_kokkos (k_sumWeights, sumWeights, sumWeightsCt, "FixRxKokkos::sumWeights");
-  if (sumWeightsCt > k_sumWeights.d_view.dimension_0()) {
+  if (sumWeightsCt > k_sumWeights.template view<DeviceType>().dimension_0()) {
     memory->destroy_kokkos(k_sumWeights, sumWeights);
     memory->create_kokkos (k_sumWeights, sumWeightsCt, "FixRxKokkos::sumWeights");
-    d_sumWeights = k_sumWeights.d_view;
+    d_sumWeights = k_sumWeights.template view<DeviceType>();
     h_sumWeights = k_sumWeights.h_view;
   }
 
@@ -2083,7 +2083,7 @@ void FixRxKokkos<DeviceType>::computeLocalTemperature()
           // Create an atomic view of sumWeights and dpdThetaLocal. Only needed
           // for Half/thread scenarios.
           //typedef Kokkos::View< E_FLOAT*, typename DAT::t_efloat_1d::array_layout, DeviceType, Kokkos::MemoryTraits< AtomicF< NEIGHFLAG >::value> > AtomicViewType;
-          typedef Kokkos::View< E_FLOAT*, typename DAT::t_efloat_1d::array_layout, typename DAT::t_efloat_1d::device_type, Kokkos::MemoryTraits< AtomicF< NEIGHFLAG >::value> > AtomicViewType;
+          typedef Kokkos::View< E_FLOAT*, typename DAT::t_efloat_1d::array_layout, DeviceType, Kokkos::MemoryTraits< AtomicF< NEIGHFLAG >::value> > AtomicViewType;
 
           AtomicViewType a_dpdThetaLocal = d_dpdThetaLocal;
           AtomicViewType a_sumWeights    = d_sumWeights;
diff --git a/src/KOKKOS/fix_rx_kokkos.h b/src/KOKKOS/fix_rx_kokkos.h
index 169a87a2f9..92b715f34d 100644
--- a/src/KOKKOS/fix_rx_kokkos.h
+++ b/src/KOKKOS/fix_rx_kokkos.h
@@ -74,6 +74,8 @@ typedef struct s_CounterType CounterType;
 template <typename DeviceType>
 class FixRxKokkos : public FixRX {
  public:
+  typedef ArrayTypes<DeviceType> AT;
+
   FixRxKokkos(class LAMMPS *, int, char **);
   virtual ~FixRxKokkos();
   virtual void init();
@@ -202,10 +204,10 @@ class FixRxKokkos : public FixRX {
   DAT::tdual_int_1d k_diagnosticCounterPerODEnFuncs;
   //typename ArrayTypes<DeviceType>::t_int_1d d_diagnosticCounterPerODEnSteps;
   //typename ArrayTypes<DeviceType>::t_int_1d d_diagnosticCounterPerODEnFuncs;
-  typename DAT::t_int_1d d_diagnosticCounterPerODEnSteps;
-  typename DAT::t_int_1d d_diagnosticCounterPerODEnFuncs;
-  typename HAT::t_int_1d h_diagnosticCounterPerODEnSteps;
-  typename HAT::t_int_1d h_diagnosticCounterPerODEnFuncs;
+  typename AT::t_int_1d d_diagnosticCounterPerODEnSteps;
+  typename AT::t_int_1d d_diagnosticCounterPerODEnFuncs;
+  HAT::t_int_1d h_diagnosticCounterPerODEnSteps;
+  HAT::t_int_1d h_diagnosticCounterPerODEnFuncs;
 
   template <typename KokkosDeviceType>
   struct KineticsType
@@ -233,8 +235,8 @@ class FixRxKokkos : public FixRX {
   // Need a dual-view and device-view for dpdThetaLocal and sumWeights since they're used in several callbacks.
   DAT::tdual_efloat_1d k_dpdThetaLocal, k_sumWeights;
   //typename ArrayTypes<DeviceType>::t_efloat_1d d_dpdThetaLocal, d_sumWeights;
-  typename DAT::t_efloat_1d d_dpdThetaLocal, d_sumWeights;
-  typename HAT::t_efloat_1d h_dpdThetaLocal, h_sumWeights;
+  typename AT::t_efloat_1d d_dpdThetaLocal, d_sumWeights;
+  HAT::t_efloat_1d h_dpdThetaLocal, h_sumWeights;
 
   typename ArrayTypes<DeviceType>::t_x_array_randomread d_x       ;
   typename ArrayTypes<DeviceType>::t_int_1d_randomread  d_type    ;
diff --git a/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp
index 03bf1a8b61..c559ab412f 100644
--- a/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp
+++ b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp
@@ -169,12 +169,12 @@ void PairDPDfdtEnergyKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
   if (eflag_atom) {
     memory->destroy_kokkos(k_eatom,eatom);
     memory->create_kokkos(k_eatom,eatom,maxeatom,"pair:eatom");
-    d_eatom = k_eatom.d_view;
+    d_eatom = k_eatom.template view<DeviceType>();
   }
   if (vflag_atom) {
     memory->destroy_kokkos(k_vatom,vatom);
     memory->create_kokkos(k_vatom,vatom,maxvatom,6,"pair:vatom");
-    d_vatom = k_vatom.d_view;
+    d_vatom = k_vatom.template view<DeviceType>();
   }
 
   x = atomKK->k_x.view<DeviceType>();
@@ -645,7 +645,7 @@ void PairDPDfdtEnergyKokkos<DeviceType>::allocate()
   d_cutsq = k_cutsq.template view<DeviceType>();
 
   k_params = Kokkos::DualView<params_dpd**,Kokkos::LayoutRight,DeviceType>("PairDPDfdtEnergy::params",n+1,n+1);
-  params = k_params.d_view;
+  params = k_params.template view<DeviceType>();
 
   if (!splitFDT_flag) {
     memory->destroy(duCond);
diff --git a/src/KOKKOS/pair_dpd_fdt_energy_kokkos.h b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.h
index fcf4b33a7a..424779f839 100644
--- a/src/KOKKOS/pair_dpd_fdt_energy_kokkos.h
+++ b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.h
@@ -139,8 +139,8 @@ class PairDPDfdtEnergyKokkos : public PairDPDfdtEnergy {
 
   DAT::tdual_efloat_1d k_eatom;
   DAT::tdual_virial_array k_vatom;
-  DAT::t_efloat_1d d_eatom;
-  DAT::t_virial_array d_vatom;
+  typename AT::t_efloat_1d d_eatom;
+  typename AT::t_virial_array d_vatom;
 
   typename AT::t_neighbors_2d d_neighbors;
   typename AT::t_int_1d_randomread d_ilist;
diff --git a/src/KOKKOS/pair_exp6_rx_kokkos.cpp b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
index b3e413428d..8d65be23af 100644
--- a/src/KOKKOS/pair_exp6_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
@@ -153,12 +153,12 @@ void PairExp6rxKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
   if (eflag_atom) {
     memory->destroy_kokkos(k_eatom,eatom);
     memory->create_kokkos(k_eatom,eatom,maxeatom,"pair:eatom");
-    d_eatom = k_eatom.d_view;
+    d_eatom = k_eatom.template view<DeviceType>();
   }
   if (vflag_atom) {
     memory->destroy_kokkos(k_vatom,vatom);
     memory->create_kokkos(k_vatom,vatom,maxvatom,6,"pair:vatom");
-    d_vatom = k_vatom.d_view;
+    d_vatom = k_vatom.template view<DeviceType>();
   }
 
   x = atomKK->k_x.view<DeviceType>();
@@ -582,7 +582,7 @@ void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxCompute<NEIGHFLAG,NEW
 
       if(rmOld12_ij!=0.0 && rmOld21_ij!=0.0){
         if(alphaOld21_ij == 6.0 || alphaOld12_ij == 6.0)
-          k_error_flag.d_view() = 1;
+          k_error_flag.template view<DeviceType>()() = 1;
 
         // A3.  Compute some convenient quantities for evaluating the force
         rminv = 1.0/rmOld12_ij;
@@ -676,7 +676,7 @@ void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxCompute<NEIGHFLAG,NEW
 
       if(rm12_ij!=0.0 && rm21_ij!=0.0){
         if(alpha21_ij == 6.0 || alpha12_ij == 6.0)
-          k_error_flag.d_view() = 1;
+          k_error_flag.template view<DeviceType>()() = 1;
 
         // A3.  Compute some convenient quantities for evaluating the force
         rminv = 1.0/rm12_ij;
@@ -953,7 +953,7 @@ void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxComputeNoAtomics<NEIG
 
       if(rmOld12_ij!=0.0 && rmOld21_ij!=0.0){
         if(alphaOld21_ij == 6.0 || alphaOld12_ij == 6.0)
-          k_error_flag.d_view() = 1;
+          k_error_flag.template view<DeviceType>()() = 1;
 
         // A3.  Compute some convenient quantities for evaluating the force
         rminv = 1.0/rmOld12_ij;
@@ -1047,7 +1047,7 @@ void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxComputeNoAtomics<NEIG
 
       if(rm12_ij!=0.0 && rm21_ij!=0.0){
         if(alpha21_ij == 6.0 || alpha12_ij == 6.0)
-          k_error_flag.d_view() = 1;
+          k_error_flag.template view<DeviceType>()() = 1;
 
         // A3.  Compute some convenient quantities for evaluating the force
         rminv = 1.0/rm12_ij;
@@ -1592,7 +1592,7 @@ void PairExp6rxKokkos<DeviceType>::vectorized_operator(const int &ii, EV_FLOAT&
   }
 
   if (hasError)
-    k_error_flag.d_view() = 1;
+    k_error_flag.template view<DeviceType>()() = 1;
 
   if (UseAtomics)
   {
@@ -1887,7 +1887,7 @@ void PairExp6rxKokkos<DeviceType>::getMixingWeights(int id,double &epsilon1,doub
     }
   }
   if(nTotal < MY_EPSILON || nTotalold < MY_EPSILON)
-    k_error_flag.d_view() = 1;
+    k_error_flag.template view<DeviceType>()() = 1;
 
   // Compute the mole fraction of molecules within the fluid portion of the particle (One Fluid Approximation)
   fractionOFAold = nMoleculesOFAold / nTotalold;
@@ -2042,28 +2042,28 @@ void PairExp6rxKokkos<DeviceType>::getMixingWeights(int id,double &epsilon1,doub
   // Check that no fractions are less than zero
   if(fraction1 < 0.0 || nMolecules1 < 0.0){
     if(fraction1 < -MY_EPSILON || nMolecules1 < -MY_EPSILON){
-      k_error_flag.d_view() = 2;
+      k_error_flag.template view<DeviceType>()() = 2;
     }
     nMolecules1 = 0.0;
     fraction1 = 0.0;
   }
   if(fraction2 < 0.0 || nMolecules2 < 0.0){
     if(fraction2 < -MY_EPSILON || nMolecules2 < -MY_EPSILON){
-      k_error_flag.d_view() = 2;
+      k_error_flag.template view<DeviceType>()() = 2;
     }
     nMolecules2 = 0.0;
     fraction2 = 0.0;
   }
   if(fractionOld1 < 0.0 || nMoleculesOld1 < 0.0){
     if(fractionOld1 < -MY_EPSILON || nMoleculesOld1 < -MY_EPSILON){
-      k_error_flag.d_view() = 2;
+      k_error_flag.template view<DeviceType>()() = 2;
     }
     nMoleculesOld1 = 0.0;
     fractionOld1 = 0.0;
   }
   if(fractionOld2 < 0.0 || nMoleculesOld2 < 0.0){
     if(fractionOld2 < -MY_EPSILON || nMoleculesOld2 < -MY_EPSILON){
-      k_error_flag.d_view() = 2;
+      k_error_flag.template view<DeviceType>()() = 2;
     }
     nMoleculesOld2 = 0.0;
     fractionOld2 = 0.0;
diff --git a/src/KOKKOS/pair_exp6_rx_kokkos.h b/src/KOKKOS/pair_exp6_rx_kokkos.h
index 4c35c76851..5e44048ae2 100644
--- a/src/KOKKOS/pair_exp6_rx_kokkos.h
+++ b/src/KOKKOS/pair_exp6_rx_kokkos.h
@@ -161,8 +161,8 @@ class PairExp6rxKokkos : public PairExp6rx {
 
   DAT::tdual_efloat_1d k_eatom;
   DAT::tdual_virial_array k_vatom;
-  DAT::t_efloat_1d d_eatom;
-  DAT::t_virial_array d_vatom;
+  typename AT::t_efloat_1d d_eatom;
+  typename AT::t_virial_array d_vatom;
 
   DAT::tdual_int_scalar k_error_flag;
 
diff --git a/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp b/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp
index 08e0f5096e..d9a4f1ab83 100644
--- a/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp
@@ -155,12 +155,12 @@ void PairMultiLucyRXKokkos<DeviceType>::compute_style(int eflag_in, int vflag_in
   if (eflag_atom) {
     memory->destroy_kokkos(k_eatom,eatom);
     memory->create_kokkos(k_eatom,eatom,maxeatom,"pair:eatom");
-    d_eatom = k_eatom.d_view;
+    d_eatom = k_eatom.template view<DeviceType>();
   }
   if (vflag_atom) {
     memory->destroy_kokkos(k_vatom,vatom);
     memory->create_kokkos(k_vatom,vatom,maxvatom,6,"pair:vatom");
-    d_vatom = k_vatom.d_view;
+    d_vatom = k_vatom.template view<DeviceType>();
   }
 
   x = atomKK->k_x.view<DeviceType>();
@@ -328,7 +328,7 @@ void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXCompute<NEI
 
       //if (rho[i]*rho[i] < tb->innersq || rho[j]*rho[j] < tb->innersq){
       if (rho[i]*rho[i] < d_table_const.innersq(tidx) || rho[j]*rho[j] < d_table_const.innersq(tidx)){
-        k_error_flag.d_view() = 1;
+        k_error_flag.template view<DeviceType>()() = 1;
       }
 
       if (TABSTYLE == LOOKUP) {
@@ -337,7 +337,7 @@ void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXCompute<NEI
         //jtable = static_cast<int> (((rho[j]*rho[j]) - tb->innersq) * tb->invdelta);
         jtable = static_cast<int> (((rho[j]*rho[j]) - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx));
         if (itable >= tlm1 || jtable >= tlm1){
-          k_error_flag.d_view() = 2;
+          k_error_flag.template view<DeviceType>()() = 2;
         }
         //A_i = tb->f[itable];
         A_i = d_table_const.f(tidx,itable);
@@ -355,7 +355,7 @@ void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXCompute<NEI
         //jtable = static_cast<int> (((rho[j]*rho[j]) - tb->innersq) * tb->invdelta);
         jtable = static_cast<int> ((rho[j]*rho[j] - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx));
         if (itable >= tlm1 || jtable >= tlm1){
-          k_error_flag.d_view() = 2;
+          k_error_flag.template view<DeviceType>()() = 2;
         }
         if(itable<0) itable=0;
         if(itable>=tlm1) itable=tlm1;
@@ -380,7 +380,7 @@ void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXCompute<NEI
         fpair = 0.5*(A_i + A_j)*(4.0-3.0*rfactor)*rfactor*rfactor*rfactor;
         fpair /= sqrt(rsq);
 
-      } else k_error_flag.d_view() = 3;
+      } else k_error_flag.template view<DeviceType>()() = 3;
 
       if (isite1 == isite2) fpair = sqrt(mixWtSite1old_i*mixWtSite2old_j)*fpair;
       else fpair = (sqrt(mixWtSite1old_i*mixWtSite2old_j) + sqrt(mixWtSite2old_i*mixWtSite1old_j))*fpair;
@@ -411,14 +411,14 @@ void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXCompute<NEI
     evdwl = d_table_const.e(tidx,itable);
   } else if (TABSTYLE == LINEAR) {
     if (itable >= tlm1){
-      k_error_flag.d_view() = 2;
+      k_error_flag.template view<DeviceType>()() = 2;
     }
     if(itable==0) fraction_i=0.0;
     //else fraction_i = (((rho[i]*rho[i]) - tb->rsq[itable]) * tb->invdelta);
     else fraction_i = (((rho[i]*rho[i]) - d_table_const.rsq(tidx,itable)) * d_table_const.invdelta(tidx));
     //evdwl = tb->e[itable] + fraction_i*tb->de[itable];
     evdwl = d_table_const.e(tidx,itable) + fraction_i*d_table_const.de(tidx,itable);
-  } else k_error_flag.d_view() = 3;
+  } else k_error_flag.template view<DeviceType>()() = 3;
 
   evdwl *=(pi*d_cutsq(itype,itype)*d_cutsq(itype,itype))/84.0;
   evdwlOld = mixWtSite1old_i*evdwl;
diff --git a/src/KOKKOS/pair_multi_lucy_rx_kokkos.h b/src/KOKKOS/pair_multi_lucy_rx_kokkos.h
index 8556319531..b8ced4c847 100644
--- a/src/KOKKOS/pair_multi_lucy_rx_kokkos.h
+++ b/src/KOKKOS/pair_multi_lucy_rx_kokkos.h
@@ -167,8 +167,8 @@ class PairMultiLucyRXKokkos : public PairMultiLucyRX {
 
   DAT::tdual_efloat_1d k_eatom;
   DAT::tdual_virial_array k_vatom;
-  DAT::t_efloat_1d d_eatom;
-  DAT::t_virial_array d_vatom;
+  typename AT::t_efloat_1d d_eatom;
+  typename AT::t_virial_array d_vatom;
 
   typename AT::t_neighbors_2d d_neighbors;
   typename AT::t_int_1d_randomread d_ilist;

From 1e16fed9ab94435321f81188abbe001f2320e1b8 Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Fri, 25 Aug 2017 15:16:19 -0600
Subject: [PATCH 255/267] Error out if using pair hybrid with Kokkos, but not
 pair hybrid/overlay

---
 src/pair_hybrid.cpp | 3 +++
 src/pair_hybrid.h   | 4 ++++
 2 files changed, 7 insertions(+)

diff --git a/src/pair_hybrid.cpp b/src/pair_hybrid.cpp
index 4a98cca614..751560deff 100644
--- a/src/pair_hybrid.cpp
+++ b/src/pair_hybrid.cpp
@@ -379,6 +379,9 @@ void PairHybrid::coeff(int narg, char **arg)
   if (narg < 3) error->all(FLERR,"Incorrect args for pair coefficients");
   if (!allocated) allocate();
 
+  if (lmp->kokkos)
+    error->all(FLERR,"Cannot yet use pair hybrid with Kokkos");
+
   int ilo,ihi,jlo,jhi;
   force->bounds(FLERR,arg[0],atom->ntypes,ilo,ihi);
   force->bounds(FLERR,arg[1],atom->ntypes,jlo,jhi);
diff --git a/src/pair_hybrid.h b/src/pair_hybrid.h
index 2364b16f46..463ae00eca 100644
--- a/src/pair_hybrid.h
+++ b/src/pair_hybrid.h
@@ -90,6 +90,10 @@ class PairHybrid : public Pair {
 
 /* ERROR/WARNING messages:
 
+E: Cannot yet use pair hybrid with Kokkos
+
+This feature is not yet supported.
+
 E: Illegal ... command
 
 Self-explanatory.  Check the input script syntax and compare to the

From b73999ef215c101c24c54627102f2fc44cfd581b Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Fri, 25 Aug 2017 15:25:41 -0600
Subject: [PATCH 256/267] Revert change to read_data.cpp

---
 src/read_data.cpp | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/src/read_data.cpp b/src/read_data.cpp
index 6f0a229ed9..b1a42608c0 100644
--- a/src/read_data.cpp
+++ b/src/read_data.cpp
@@ -50,7 +50,7 @@ using namespace LAMMPS_NS;
 
 #define MAXLINE 256
 #define LB_FACTOR 1.1
-#define CHUNK 4096
+#define CHUNK 1024
 #define DELTA 4            // must be 2 or larger
 #define MAXBODY 32         // max # of lines in one body
 
@@ -1904,12 +1904,8 @@ void ReadData::open(char *file)
   if (!compressed) fp = fopen(file,"r");
   else {
 #ifdef LAMMPS_GZIP
-    char gunzip[2048];
-    // Use taskset to force the gzip process to NOT run on the 0th "CPU", which should
-    // keep it from thrashing with the MPI rank zero process (the one reading the pipe).
-    // This is Linux specific, and the 1023 upper range might also be system specific.
-    // Use of something like hwloc would be more portable... but more complicated.
-    sprintf(gunzip,"taskset -c 1-1023 gzip -c -d %s",file);
+    char gunzip[128];
+    sprintf(gunzip,"gzip -c -d %s",file);
 
 #ifdef _WIN32
     fp = _popen(gunzip,"rb");

From 1f8c4f2c62dfaf24340673a99175cf84e478c698 Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Fri, 25 Aug 2017 15:31:00 -0600
Subject: [PATCH 257/267] Remove hardcoded map variables and debug output

---
 src/read_restart.cpp | 30 +-----------------------------
 1 file changed, 1 insertion(+), 29 deletions(-)

diff --git a/src/read_restart.cpp b/src/read_restart.cpp
index fcbd8d186d..82583bfe01 100644
--- a/src/read_restart.cpp
+++ b/src/read_restart.cpp
@@ -212,17 +212,6 @@ void ReadRestart::command(int narg, char **arg)
       int perAtomSize = avec->size_restart(); // ...so we can get its size
       atom->nlocal = 0; // restore nlocal to zero atoms
       int atomCt = (int) (assignedChunkSize / perAtomSize);
-//#define DEBUG_PRE_GROW
-#ifdef DEBUG_PRE_GROW
-fprintf(stdout, "ReadRestart::command %04d: pAS %d, aCt %d, nmax %d, chunckSize %12.0f, %12.0f\n"
-  ,me
-  ,perAtomSize
-  ,atomCt
-  ,atom->nmax
-  ,(double) assignedChunkSize
-  ,((double) perAtomSize) * atomCt
-);
-#endif
       if (atomCt > atom->nmax) avec->grow(atomCt);
     }
     m = 0;
@@ -905,10 +894,8 @@ void ReadRestart::header(int incompatible)
       atom->tag_enable = read_int();
     } else if (flag == ATOM_MAP_STYLE) {
       atom->map_style = read_int();
-      atom->map_style = 0;
     } else if (flag == ATOM_MAP_USER) {
       atom->map_user  = read_int();
-      atom->map_user = 0;
     } else if (flag == ATOM_SORTFREQ) {
       atom->sortfreq = read_int();
     } else if (flag == ATOM_SORTBIN) {
@@ -1068,22 +1055,7 @@ void ReadRestart::file_layout()
               nproc_chunk_sizes[ndx] = base_ct * perAtomSize;
               current_ByteOffset += base_ByteOffset;
             }
-//#define DEBUG_FILE_LAYOUT
-#ifdef DEBUG_FILE_LAYOUT
-fprintf(stdout, "ReadRestart::file_layout: %15.0f/%d = %15.0f totCt, %15.0f natoms, %12.0f baseCt, %12.0f leftover, %d np != %d npf %c%c\n"
-  ,(double) total_size
-  ,perAtomSize
-  ,(double) total_ct
-  ,(double) atom->natoms
-  ,(double) base_ct
-  ,(double) leftover_ct
-  ,nprocs
-  ,nprocs_file
-  ,(total_size == (total_ct * perAtomSize)) ? ' ' : 'E'
-  ,(total_ct == (base_ct * nprocs + leftover_ct)) ? ' ' : 'F'
-);
-#endif
-          } else { // Bummer, we have to read in based on how it was written
+          } else { // we have to read in based on how it was written
             int init_chunk_number = nprocs_file/nprocs;
             int num_extra_chunks = nprocs_file - (nprocs*init_chunk_number);
 

From e52a28f8afaf404ae84f3b377a6a060a9d43fe17 Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Fri, 25 Aug 2017 16:20:42 -0600
Subject: [PATCH 258/267] Update docs for Kokkos version of USER-DPD package

---
 doc/src/fix_dpd_energy.txt     | 24 ++++++++++++++++++++++++
 doc/src/fix_eos_table_rx.txt   | 24 ++++++++++++++++++++++++
 doc/src/fix_rx.txt             | 24 ++++++++++++++++++++++++
 doc/src/fix_shardlow.txt       | 24 ++++++++++++++++++++++++
 doc/src/fix_wall.txt           | 26 ++++++++++++++++++++++++++
 doc/src/pair_dpd_fdt.txt       | 24 ++++++++++++++++++++++++
 doc/src/pair_exp6_rx.txt       | 26 ++++++++++++++++++++++++++
 doc/src/pair_hybrid.txt        |  1 +
 doc/src/pair_multi_lucy_rx.txt | 24 ++++++++++++++++++++++++
 doc/src/pair_table_rx.txt      | 24 ++++++++++++++++++++++++
 10 files changed, 221 insertions(+)

diff --git a/doc/src/fix_dpd_energy.txt b/doc/src/fix_dpd_energy.txt
index ed49e5a671..1c10d954d6 100644
--- a/doc/src/fix_dpd_energy.txt
+++ b/doc/src/fix_dpd_energy.txt
@@ -7,6 +7,7 @@
 :line
 
 fix dpd/energy command :h3
+fix dpd/energy/kk command :h3
 
 [Syntax:]
 
@@ -46,6 +47,29 @@ examples/USER/dpd directory.
 
 :line
 
+Styles with a {gpu}, {intel}, {kk}, {omp}, or {opt} suffix are
+functionally the same as the corresponding style without the suffix.
+They have been optimized to run faster, depending on your available
+hardware, as discussed in "Section 5"_Section_accelerate.html
+of the manual.  The accelerated styles take the same arguments and
+should produce the same results, except for round-off and precision
+issues.
+
+These accelerated styles are part of the GPU, USER-INTEL, KOKKOS,
+USER-OMP and OPT packages, respectively.  They are only enabled if
+LAMMPS was built with those packages.  See the "Making
+LAMMPS"_Section_start.html#start_3 section for more info.
+
+You can specify the accelerated styles explicitly in your input script
+by including their suffix, or you can use the "-suffix command-line
+switch"_Section_start.html#start_6 when you invoke LAMMPS, or you can
+use the "suffix"_suffix.html command in your input script.
+
+See "Section 5"_Section_accelerate.html of the manual for
+more instructions on how to use the accelerated styles effectively.
+
+:line
+
 [Restrictions:]
 
 This command is part of the USER-DPD package.  It is only enabled if
diff --git a/doc/src/fix_eos_table_rx.txt b/doc/src/fix_eos_table_rx.txt
index e5e4f772f6..0c87874347 100644
--- a/doc/src/fix_eos_table_rx.txt
+++ b/doc/src/fix_eos_table_rx.txt
@@ -7,6 +7,7 @@
 :line
 
 fix eos/table/rx command :h3
+fix eos/table/rx/kk command :h3
 
 [Syntax:]
 
@@ -152,6 +153,29 @@ no      0.93 0.00 0.000 -1.76 :pre
 
 :line
 
+Styles with a {gpu}, {intel}, {kk}, {omp}, or {opt} suffix are
+functionally the same as the corresponding style without the suffix.
+They have been optimized to run faster, depending on your available
+hardware, as discussed in "Section 5"_Section_accelerate.html
+of the manual.  The accelerated styles take the same arguments and
+should produce the same results, except for round-off and precision
+issues.
+
+These accelerated styles are part of the GPU, USER-INTEL, KOKKOS,
+USER-OMP and OPT packages, respectively.  They are only enabled if
+LAMMPS was built with those packages.  See the "Making
+LAMMPS"_Section_start.html#start_3 section for more info.
+
+You can specify the accelerated styles explicitly in your input script
+by including their suffix, or you can use the "-suffix command-line
+switch"_Section_start.html#start_6 when you invoke LAMMPS, or you can
+use the "suffix"_suffix.html command in your input script.
+
+See "Section 5"_Section_accelerate.html of the manual for
+more instructions on how to use the accelerated styles effectively.
+
+:line
+
 [Restrictions:]
 
 This command is part of the USER-DPD package.  It is only enabled if
diff --git a/doc/src/fix_rx.txt b/doc/src/fix_rx.txt
index 6a800f3865..0810a34740 100644
--- a/doc/src/fix_rx.txt
+++ b/doc/src/fix_rx.txt
@@ -7,6 +7,7 @@
 :line
 
 fix rx command :h3
+fix rx/kk command :h3
 
 [Syntax:]
 
@@ -182,6 +183,29 @@ read_data    data.dpd fix foo_SPECIES NULL Species
 
 :line
 
+Styles with a {gpu}, {intel}, {kk}, {omp}, or {opt} suffix are
+functionally the same as the corresponding style without the suffix.
+They have been optimized to run faster, depending on your available
+hardware, as discussed in "Section 5"_Section_accelerate.html
+of the manual.  The accelerated styles take the same arguments and
+should produce the same results, except for round-off and precision
+issues.
+
+These accelerated styles are part of the GPU, USER-INTEL, KOKKOS,
+USER-OMP and OPT packages, respectively.  They are only enabled if
+LAMMPS was built with those packages.  See the "Making
+LAMMPS"_Section_start.html#start_3 section for more info.
+
+You can specify the accelerated styles explicitly in your input script
+by including their suffix, or you can use the "-suffix command-line
+switch"_Section_start.html#start_6 when you invoke LAMMPS, or you can
+use the "suffix"_suffix.html command in your input script.
+
+See "Section 5"_Section_accelerate.html of the manual for
+more instructions on how to use the accelerated styles effectively.
+
+:line
+
 [Restrictions:]
 
 This command is part of the USER-DPD package.  It is only enabled if
diff --git a/doc/src/fix_shardlow.txt b/doc/src/fix_shardlow.txt
index 8354b4c41c..24726d8610 100644
--- a/doc/src/fix_shardlow.txt
+++ b/doc/src/fix_shardlow.txt
@@ -7,6 +7,7 @@
 :line
 
 fix shardlow command :h3
+fix shardlow/kk command :h3
 
 [Syntax:]
 
@@ -52,6 +53,29 @@ examples/USER/dpd directory.
 
 :line
 
+Styles with a {gpu}, {intel}, {kk}, {omp}, or {opt} suffix are
+functionally the same as the corresponding style without the suffix.
+They have been optimized to run faster, depending on your available
+hardware, as discussed in "Section 5"_Section_accelerate.html
+of the manual.  The accelerated styles take the same arguments and
+should produce the same results, except for round-off and precision
+issues.
+
+These accelerated styles are part of the GPU, USER-INTEL, KOKKOS,
+USER-OMP and OPT packages, respectively.  They are only enabled if
+LAMMPS was built with those packages.  See the "Making
+LAMMPS"_Section_start.html#start_3 section for more info.
+
+You can specify the accelerated styles explicitly in your input script
+by including their suffix, or you can use the "-suffix command-line
+switch"_Section_start.html#start_6 when you invoke LAMMPS, or you can
+use the "suffix"_suffix.html command in your input script.
+
+See "Section 5"_Section_accelerate.html of the manual for
+more instructions on how to use the accelerated styles effectively.
+
+:line
+
 [Restrictions:]
 
 This command is part of the USER-DPD package.  It is only enabled if
diff --git a/doc/src/fix_wall.txt b/doc/src/fix_wall.txt
index 6d76956620..6bbfccf9db 100644
--- a/doc/src/fix_wall.txt
+++ b/doc/src/fix_wall.txt
@@ -7,6 +7,7 @@
 :line
 
 fix wall/lj93 command :h3
+fix wall/lj93/kk command :h3
 fix wall/lj126 command :h3
 fix wall/lj1043 command :h3
 fix wall/colloid command :h3
@@ -277,6 +278,31 @@ the total potential energy of the system (the quantity being
 minimized), you MUST enable the "fix_modify"_fix_modify.html {energy}
 option for this fix.
 
+:line
+
+Styles with a {gpu}, {intel}, {kk}, {omp}, or {opt} suffix are
+functionally the same as the corresponding style without the suffix.
+They have been optimized to run faster, depending on your available
+hardware, as discussed in "Section 5"_Section_accelerate.html
+of the manual.  The accelerated styles take the same arguments and
+should produce the same results, except for round-off and precision
+issues.
+
+These accelerated styles are part of the GPU, USER-INTEL, KOKKOS,
+USER-OMP and OPT packages, respectively.  They are only enabled if
+LAMMPS was built with those packages.  See the "Making
+LAMMPS"_Section_start.html#start_3 section for more info.
+
+You can specify the accelerated styles explicitly in your input script
+by including their suffix, or you can use the "-suffix command-line
+switch"_Section_start.html#start_6 when you invoke LAMMPS, or you can
+use the "suffix"_suffix.html command in your input script.
+
+See "Section 5"_Section_accelerate.html of the manual for
+more instructions on how to use the accelerated styles effectively.
+
+:line
+
 [Restrictions:] none
 
 [Related commands:]
diff --git a/doc/src/pair_dpd_fdt.txt b/doc/src/pair_dpd_fdt.txt
index b75e7c323c..867f3f2315 100644
--- a/doc/src/pair_dpd_fdt.txt
+++ b/doc/src/pair_dpd_fdt.txt
@@ -8,6 +8,7 @@
 
 pair_style dpd/fdt command :h3
 pair_style dpd/fdt/energy command :h3
+pair_style dpd/fdt/energy/kk command :h3
 
 [Syntax:]
 
@@ -125,6 +126,29 @@ significantly larger timesteps to be taken.
 
 :line
 
+Styles with a {gpu}, {intel}, {kk}, {omp}, or {opt} suffix are
+functionally the same as the corresponding style without the suffix.
+They have been optimized to run faster, depending on your available
+hardware, as discussed in "Section 5"_Section_accelerate.html
+of the manual.  The accelerated styles take the same arguments and
+should produce the same results, except for round-off and precision
+issues.
+
+These accelerated styles are part of the GPU, USER-INTEL, KOKKOS,
+USER-OMP and OPT packages, respectively.  They are only enabled if
+LAMMPS was built with those packages.  See the "Making
+LAMMPS"_Section_start.html#start_3 section for more info.
+
+You can specify the accelerated styles explicitly in your input script
+by including their suffix, or you can use the "-suffix command-line
+switch"_Section_start.html#start_6 when you invoke LAMMPS, or you can
+use the "suffix"_suffix.html command in your input script.
+
+See "Section 5"_Section_accelerate.html of the manual for
+more instructions on how to use the accelerated styles effectively.
+
+:line
+
 [Restrictions:]
 
 These commands are part of the USER-DPD package.  They are only
diff --git a/doc/src/pair_exp6_rx.txt b/doc/src/pair_exp6_rx.txt
index cbc17d357d..7eafa23543 100644
--- a/doc/src/pair_exp6_rx.txt
+++ b/doc/src/pair_exp6_rx.txt
@@ -7,6 +7,7 @@
 :line
 
 pair_style exp6/rx command :h3
+pair_style exp6/rx/kk command :h3
 
 [Syntax:]
 
@@ -147,6 +148,31 @@ This style does not support the pair_modify tail option for adding long-range
 tail corrections to energy and pressure for the A,C terms in the
 pair interaction.
 
+:line
+
+Styles with a {gpu}, {intel}, {kk}, {omp}, or {opt} suffix are
+functionally the same as the corresponding style without the suffix.
+They have been optimized to run faster, depending on your available
+hardware, as discussed in "Section 5"_Section_accelerate.html
+of the manual.  The accelerated styles take the same arguments and
+should produce the same results, except for round-off and precision
+issues.
+
+These accelerated styles are part of the GPU, USER-INTEL, KOKKOS,
+USER-OMP and OPT packages, respectively.  They are only enabled if
+LAMMPS was built with those packages.  See the "Making
+LAMMPS"_Section_start.html#start_3 section for more info.
+
+You can specify the accelerated styles explicitly in your input script
+by including their suffix, or you can use the "-suffix command-line
+switch"_Section_start.html#start_6 when you invoke LAMMPS, or you can
+use the "suffix"_suffix.html command in your input script.
+
+See "Section 5"_Section_accelerate.html of the manual for
+more instructions on how to use the accelerated styles effectively.
+
+:line
+
 [Restrictions:]
 
 This command is part of the USER-DPD package.  It is only enabled if
diff --git a/doc/src/pair_hybrid.txt b/doc/src/pair_hybrid.txt
index fc1824cf62..d37dedc709 100644
--- a/doc/src/pair_hybrid.txt
+++ b/doc/src/pair_hybrid.txt
@@ -10,6 +10,7 @@ pair_style hybrid command :h3
 pair_style hybrid/omp command :h3
 pair_style hybrid/overlay command :h3
 pair_style hybrid/overlay/omp command :h3
+pair_style hybrid/overlay/kk command :h3
 
 [Syntax:]
 
diff --git a/doc/src/pair_multi_lucy_rx.txt b/doc/src/pair_multi_lucy_rx.txt
index 77ed223e2a..57abcf4a4c 100644
--- a/doc/src/pair_multi_lucy_rx.txt
+++ b/doc/src/pair_multi_lucy_rx.txt
@@ -7,6 +7,7 @@
 :line
 
 pair_style multi/lucy/rx command :h3
+pair_style multi/lucy/rx/kk command :h3
 
 [Syntax:]
 
@@ -200,6 +201,29 @@ This pair style can only be used via the {pair} keyword of the
 
 :line
 
+Styles with a {gpu}, {intel}, {kk}, {omp}, or {opt} suffix are
+functionally the same as the corresponding style without the suffix.
+They have been optimized to run faster, depending on your available
+hardware, as discussed in "Section 5"_Section_accelerate.html
+of the manual.  The accelerated styles take the same arguments and
+should produce the same results, except for round-off and precision
+issues.
+
+These accelerated styles are part of the GPU, USER-INTEL, KOKKOS,
+USER-OMP and OPT packages, respectively.  They are only enabled if
+LAMMPS was built with those packages.  See the "Making
+LAMMPS"_Section_start.html#start_3 section for more info.
+
+You can specify the accelerated styles explicitly in your input script
+by including their suffix, or you can use the "-suffix command-line
+switch"_Section_start.html#start_6 when you invoke LAMMPS, or you can
+use the "suffix"_suffix.html command in your input script.
+
+See "Section 5"_Section_accelerate.html of the manual for
+more instructions on how to use the accelerated styles effectively.
+
+:line
+
 [Restrictions:]
 
 This command is part of the USER-DPD package.  It is only enabled if
diff --git a/doc/src/pair_table_rx.txt b/doc/src/pair_table_rx.txt
index f93af21da4..cd3a7ef31b 100644
--- a/doc/src/pair_table_rx.txt
+++ b/doc/src/pair_table_rx.txt
@@ -7,6 +7,7 @@
 :line
 
 pair_style table/rx command :h3
+pair_style table/rx/kk command :h3
 
 [Syntax:]
 
@@ -223,6 +224,29 @@ This pair style can only be used via the {pair} keyword of the
 
 :line
 
+Styles with a {gpu}, {intel}, {kk}, {omp}, or {opt} suffix are
+functionally the same as the corresponding style without the suffix.
+They have been optimized to run faster, depending on your available
+hardware, as discussed in "Section 5"_Section_accelerate.html
+of the manual.  The accelerated styles take the same arguments and
+should produce the same results, except for round-off and precision
+issues.
+
+These accelerated styles are part of the GPU, USER-INTEL, KOKKOS,
+USER-OMP and OPT packages, respectively.  They are only enabled if
+LAMMPS was built with those packages.  See the "Making
+LAMMPS"_Section_start.html#start_3 section for more info.
+
+You can specify the accelerated styles explicitly in your input script
+by including their suffix, or you can use the "-suffix command-line
+switch"_Section_start.html#start_6 when you invoke LAMMPS, or you can
+use the "suffix"_suffix.html command in your input script.
+
+See "Section 5"_Section_accelerate.html of the manual for
+more instructions on how to use the accelerated styles effectively.
+
+:line
+
 [Restrictions:]
 
 This command is part of the USER-DPD package.  It is only enabled if

From a4a45f1d9cfbd52f41b69dc160b73a316dad2229 Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Tue, 29 Aug 2017 17:25:13 -0600
Subject: [PATCH 259/267] Remove unnecessary check in npair_kokkos

---
 src/KOKKOS/npair_kokkos.cpp | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/src/KOKKOS/npair_kokkos.cpp b/src/KOKKOS/npair_kokkos.cpp
index 462a4b8424..d5ea8376f6 100644
--- a/src/KOKKOS/npair_kokkos.cpp
+++ b/src/KOKKOS/npair_kokkos.cpp
@@ -173,12 +173,6 @@ void NPairKokkos<DeviceType,HALF_NEIGH,GHOST,TRI>::build(NeighList *list_)
   data.special_flag[2] = special_flag[2];
   data.special_flag[3] = special_flag[3];
 
-  if(list->d_neighbors.dimension_0()<nall) { // Can this EVER be true??? - TIM 20170215
-    list->d_neighbors = typename ArrayTypes<DeviceType>::t_neighbors_2d("neighbors", nall*1.1, list->maxneighs);
-    list->d_numneigh = typename ArrayTypes<DeviceType>::t_int_1d("numneigh", nall*1.1);
-    data.neigh_list.d_neighbors = list->d_neighbors;
-    data.neigh_list.d_numneigh = list->d_numneigh;
-  }
   data.h_resize()=1;
   while(data.h_resize()) {
     data.h_new_maxneighs() = list->maxneighs;

From f2d8c37f27b142606741780f436f250da74bab09 Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Wed, 30 Aug 2017 10:24:29 -0500
Subject: [PATCH 260/267] Rename SSA specific debug #ifdef to DEBUG_SSA_PAIR_CT

---
 src/KOKKOS/fix_shardlow_kokkos.cpp | 14 +++++++-------
 src/KOKKOS/fix_shardlow_kokkos.h   |  2 +-
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/KOKKOS/fix_shardlow_kokkos.cpp b/src/KOKKOS/fix_shardlow_kokkos.cpp
index e3d9723c53..a09c8bc1ba 100644
--- a/src/KOKKOS/fix_shardlow_kokkos.cpp
+++ b/src/KOKKOS/fix_shardlow_kokkos.cpp
@@ -102,7 +102,7 @@ FixShardlowKokkos<DeviceType>::FixShardlowKokkos(LAMMPS *lmp, int narg, char **a
   if(/* k_pairDPD == NULL &&*/ k_pairDPDE == NULL)
     error->all(FLERR,"Must use pair_style "/*"dpd/fdt/kk or "*/"dpd/fdt/energy/kk with fix shardlow/kk");
 
-#ifdef DEBUG_PAIR_CT
+#ifdef DEBUG_SSA_PAIR_CT
   d_counters = typename AT::t_int_2d("FixShardlowKokkos::d_counters", 2, 3);
   d_hist = typename AT::t_int_1d("FixShardlowKokkos::d_hist", 32);
 #ifndef KOKKOS_USE_CUDA_UVM
@@ -319,7 +319,7 @@ void FixShardlowKokkos<DeviceType>::ssa_update_dpd(
       const X_FLOAT dely = ytmp - x(j, 1);
       const X_FLOAT delz = ztmp - x(j, 2);
       const F_FLOAT rsq = delx*delx + dely*dely + delz*delz;
-#ifdef DEBUG_PAIR_CT
+#ifdef DEBUG_SSA_PAIR_CT
       if ((i < nlocal) && (j < nlocal)) Kokkos::atomic_increment(&(d_counters(0, 0)));
       else Kokkos::atomic_increment(&(d_counters(0, 1)));
       Kokkos::atomic_increment(&(d_counters(0, 2)));
@@ -332,7 +332,7 @@ void FixShardlowKokkos<DeviceType>::ssa_update_dpd(
       // NOTE: r can be 0.0 in DPD systems, so do EPSILON_SQUARED test
       if ((rsq < (STACKPARAMS?m_cutsq[itype][jtype]:d_cutsq(itype,jtype)))
         && (rsq >= EPSILON_SQUARED)) {
-#ifdef DEBUG_PAIR_CT
+#ifdef DEBUG_SSA_PAIR_CT
         if ((i < nlocal) && (j < nlocal)) Kokkos::atomic_increment(&(d_counters(1, 0)));
         else Kokkos::atomic_increment(&(d_counters(1, 1)));
         Kokkos::atomic_increment(&(d_counters(1, 2)));
@@ -475,7 +475,7 @@ void FixShardlowKokkos<DeviceType>::ssa_update_dpde(
       const X_FLOAT dely = ytmp - x(j, 1);
       const X_FLOAT delz = ztmp - x(j, 2);
       const F_FLOAT rsq = delx*delx + dely*dely + delz*delz;
-#ifdef DEBUG_PAIR_CT
+#ifdef DEBUG_SSA_PAIR_CT
       if ((i < nlocal) && (j < nlocal)) Kokkos::atomic_increment(&(d_counters(0, 0)));
       else Kokkos::atomic_increment(&(d_counters(0, 1)));
       Kokkos::atomic_increment(&(d_counters(0, 2)));
@@ -488,7 +488,7 @@ void FixShardlowKokkos<DeviceType>::ssa_update_dpde(
       // NOTE: r can be 0.0 in DPD systems, so do EPSILON_SQUARED test
       if ((rsq < (STACKPARAMS?m_cutsq[itype][jtype]:d_cutsq(itype,jtype)))
         && (rsq >= EPSILON_SQUARED)) {
-#ifdef DEBUG_PAIR_CT
+#ifdef DEBUG_SSA_PAIR_CT
         if ((i < nlocal) && (j < nlocal)) Kokkos::atomic_increment(&(d_counters(1, 0)));
         else Kokkos::atomic_increment(&(d_counters(1, 1)));
         Kokkos::atomic_increment(&(d_counters(1, 2)));
@@ -665,7 +665,7 @@ void FixShardlowKokkos<DeviceType>::initial_integrate(int vflag)
     maxRNG = maxWorkItemCt;
   }
 
-#ifdef DEBUG_PAIR_CT
+#ifdef DEBUG_SSA_PAIR_CT
   for (int i = 0; i < 2; ++i)
     for (int j = 0; j < 3; ++j)
       h_counters(i,j) = 0;
@@ -734,7 +734,7 @@ void FixShardlowKokkos<DeviceType>::initial_integrate(int vflag)
 
   }  //End Loop over all directions For airnum = Top, Top-Right, Right, Bottom-Right, Back
 
-#ifdef DEBUG_PAIR_CT
+#ifdef DEBUG_SSA_PAIR_CT
 deep_copy(h_counters, d_counters);
 deep_copy(h_hist, d_hist);
 for (int i = 0; i < 32; ++i) fprintf(stdout, "%8d", h_hist[i]);
diff --git a/src/KOKKOS/fix_shardlow_kokkos.h b/src/KOKKOS/fix_shardlow_kokkos.h
index 3dbbaaa61c..1ff94d5eec 100644
--- a/src/KOKKOS/fix_shardlow_kokkos.h
+++ b/src/KOKKOS/fix_shardlow_kokkos.h
@@ -74,7 +74,7 @@ class FixShardlowKokkos : public FixShardlow {
   KOKKOS_INLINE_FUNCTION
   void operator()(TagFixShardlowSSAUpdateDPDEGhost<STACKPARAMS>, const int&) const;
 
-#ifdef DEBUG_PAIR_CT
+#ifdef DEBUG_SSA_PAIR_CT
   typename AT::t_int_2d d_counters;
   typename HAT::t_int_2d h_counters;
   typename AT::t_int_1d d_hist;

From 2dd202cc761de7afb9419294a52652edc8cf278d Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Wed, 30 Aug 2017 10:34:09 -0500
Subject: [PATCH 261/267] USER-DPD: remove some out-of-date FIXME comments in
 fix_shardlow_kokkos.cpp

---
 src/KOKKOS/fix_shardlow_kokkos.cpp | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/src/KOKKOS/fix_shardlow_kokkos.cpp b/src/KOKKOS/fix_shardlow_kokkos.cpp
index a09c8bc1ba..ea7cc21fff 100644
--- a/src/KOKKOS/fix_shardlow_kokkos.cpp
+++ b/src/KOKKOS/fix_shardlow_kokkos.cpp
@@ -164,17 +164,13 @@ void FixShardlowKokkos<DeviceType>::init()
   k_params = Kokkos::DualView<params_ssa**,Kokkos::LayoutRight,DeviceType>
     ("FixShardlowKokkos::params",ntypes+1,ntypes+1);
   params = k_params.template view<DeviceType>();
-//FIXME either create cutsq and fill it in, or just point to pairDPD's...
-//  memory->destroy(cutsq); //FIXME
-//  memory->create_kokkos(k_cutsq,cutsq,ntypes+1,ntypes+1,"FixShardlowKokkos:cutsq");
   k_pairDPDE->k_cutsq.template sync<DeviceType>();
-  d_cutsq = k_pairDPDE->k_cutsq.template view<DeviceType>(); //FIXME
+  d_cutsq = k_pairDPDE->k_cutsq.template view<DeviceType>();
 
   const double boltz2 = 2.0*force->boltz;
   for (int i = 1; i <= ntypes; i++) {
     for (int j = i; j <= ntypes; j++) {
       F_FLOAT cutone = k_pairDPDE->cut[i][j];
-//      k_cutsq.h_view(i,j) = k_cutsq.h_view(j,i) = cutone*cutone; //FIXME
       if (cutone > EPSILON) k_params.h_view(i,j).cutinv = 1.0/cutone;
       else k_params.h_view(i,j).cutinv = FLT_MAX;
       k_params.h_view(i,j).halfsigma = 0.5*k_pairDPDE->sigma[i][j];
@@ -190,7 +186,6 @@ void FixShardlowKokkos<DeviceType>::init()
     }
   }
 
-  // k_cutsq.template modify<LMPHostType>();
   k_params.template modify<LMPHostType>();
 }
 

From dc7f1281b83214d360cac981b33fdca2d559f935 Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Wed, 30 Aug 2017 11:01:17 -0500
Subject: [PATCH 262/267] USER-DPD: Kokkos version of Const Temperature DPD
 isn't implemented yet

Constant Energy DPD (DPDE) was our primary use case, so the Kokkos code
so far includes only stubs for the Constant Temperature case.
The non-Kokkos version works fine for Constant Temperature DPD.
---
 src/KOKKOS/fix_shardlow_kokkos.cpp |  3 ++-
 src/KOKKOS/fix_shardlow_kokkos.h   | 14 +++++++++++---
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/src/KOKKOS/fix_shardlow_kokkos.cpp b/src/KOKKOS/fix_shardlow_kokkos.cpp
index ea7cc21fff..98bbb02714 100644
--- a/src/KOKKOS/fix_shardlow_kokkos.cpp
+++ b/src/KOKKOS/fix_shardlow_kokkos.cpp
@@ -265,7 +265,8 @@ void FixShardlowKokkos<DeviceType>::setup_pre_neighbor()
 
 /* ---------------------------------------------------------------------- */
 
-#ifdef NOTNOW
+#ifdef ENABLE_KOKKOS_DPD_CONSTANT_TEMPERATURE
+#error "FixShardlowKokkos::ssa_update_dpd() is not functional yet - TIM 20170830"
 /* ----------------------------------------------------------------------
    Perform the stochastic integration and Shardlow update for constant temperature
    Allow for both per-type and per-atom mass
diff --git a/src/KOKKOS/fix_shardlow_kokkos.h b/src/KOKKOS/fix_shardlow_kokkos.h
index 1ff94d5eec..70dccf2e2d 100644
--- a/src/KOKKOS/fix_shardlow_kokkos.h
+++ b/src/KOKKOS/fix_shardlow_kokkos.h
@@ -26,6 +26,9 @@ FixStyle(shardlow/kk/host,FixShardlowKokkos<LMPHostType>)
 #include "fix_shardlow.h"
 #include "kokkos_type.h"
 #include "neigh_list_kokkos.h"
+#ifdef ENABLE_KOKKOS_DPD_CONSTANT_TEMPERATURE
+#include "pair_dpd_fdt_kokkos.h"
+#endif
 #include "pair_dpd_fdt_energy_kokkos.h"
 
 namespace LAMMPS_NS {
@@ -85,7 +88,9 @@ class FixShardlowKokkos : public FixShardlow {
   int workPhase;
   double theta_ij_inv,boltz_inv,ftm2v,dt;
 
-//  class PairDPDfdt *pairDPD;
+#ifdef ENABLE_KOKKOS_DPD_CONSTANT_TEMPERATURE
+//  class PairDPDfdt *pairDPD; FIXME as per k_pairDPDE below
+#endif
   PairDPDfdtEnergyKokkos<DeviceType> *k_pairDPDE;
 
   int maxRNG;
@@ -138,8 +143,11 @@ class FixShardlowKokkos : public FixShardlow {
   typename AT::t_int_2d ssa_gitemLoc, ssa_gitemLen;
 
 
-//  template<bool STACKPARAMS>
-//  void ssa_update_dpd(int, int);  // Constant Temperature
+#ifdef ENABLE_KOKKOS_DPD_CONSTANT_TEMPERATURE
+  template<bool STACKPARAMS>
+  KOKKOS_INLINE_FUNCTION
+  void ssa_update_dpd(int, int, int) const;  // Constant Temperature
+#endif
   template<bool STACKPARAMS>
   KOKKOS_INLINE_FUNCTION
   void ssa_update_dpde(int, int, int) const; // Constant Energy

From 3e6cdd1400117c1a99008f9649e05fdcd10fc6e1 Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Wed, 30 Aug 2017 14:58:38 -0500
Subject: [PATCH 263/267] USER-DPD: finish renaming #ifdef DEBUG_PAIR_CT to
 DEBUG_SSA_PAIR_CT

---
 src/USER-DPD/fix_shardlow.cpp | 16 ++++++++--------
 src/USER-DPD/fix_shardlow.h   |  2 +-
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/USER-DPD/fix_shardlow.cpp b/src/USER-DPD/fix_shardlow.cpp
index f3057a6563..cec53ab15f 100644
--- a/src/USER-DPD/fix_shardlow.cpp
+++ b/src/USER-DPD/fix_shardlow.cpp
@@ -212,7 +212,7 @@ void FixShardlow::ssa_update_dpd(
   const double mass_i = (rmass) ? rmass[i] : mass[itype];
   const double massinv_i = 1.0 / mass_i;
 
-#ifdef DEBUG_PAIR_CT
+#ifdef DEBUG_SSA_PAIR_CT
   const int nlocal = atom->nlocal;
 #endif
 
@@ -225,7 +225,7 @@ void FixShardlow::ssa_update_dpd(
     double dely = ytmp - x[j][1];
     double delz = ztmp - x[j][2];
     double rsq = delx*delx + dely*dely + delz*delz;
-#ifdef DEBUG_PAIR_CT
+#ifdef DEBUG_SSA_PAIR_CT
     if ((i < nlocal) && (j < nlocal)) ++(counters[0][0]);
     else ++(counters[0][1]);
     ++(counters[0][2]);
@@ -237,7 +237,7 @@ void FixShardlow::ssa_update_dpd(
 
     // NOTE: r can be 0.0 in DPD systems, so do EPSILON_SQUARED test
     if ((rsq < cut2_i[jtype]) && (rsq >= EPSILON_SQUARED)) {
-#ifdef DEBUG_PAIR_CT
+#ifdef DEBUG_SSA_PAIR_CT
       if ((i < nlocal) && (j < nlocal)) ++(counters[1][0]);
       else ++(counters[1][1]);
       ++(counters[1][2]);
@@ -369,7 +369,7 @@ void FixShardlow::ssa_update_dpde(
   const double massinv_i = 1.0 / mass_i;
   const double mass_i_div_neg4_ftm2v = mass_i*(-0.25)/ftm2v;
 
-#ifdef DEBUG_PAIR_CT
+#ifdef DEBUG_SSA_PAIR_CT
   const int nlocal = atom->nlocal;
 #endif
 
@@ -382,7 +382,7 @@ void FixShardlow::ssa_update_dpde(
     double dely = ytmp - x[j][1];
     double delz = ztmp - x[j][2];
     double rsq = delx*delx + dely*dely + delz*delz;
-#ifdef DEBUG_PAIR_CT
+#ifdef DEBUG_SSA_PAIR_CT
     if ((i < nlocal) && (j < nlocal)) ++(counters[0][0]);
     else ++(counters[0][1]);
     ++(counters[0][2]);
@@ -394,7 +394,7 @@ void FixShardlow::ssa_update_dpde(
 
     // NOTE: r can be 0.0 in DPD systems, so do EPSILON_SQUARED test
     if ((rsq < cut2_i[jtype]) && (rsq >= EPSILON_SQUARED)) {
-#ifdef DEBUG_PAIR_CT
+#ifdef DEBUG_SSA_PAIR_CT
       if ((i < nlocal) && (j < nlocal)) ++(counters[1][0]);
       else ++(counters[1][1]);
       ++(counters[1][2]);
@@ -530,7 +530,7 @@ void FixShardlow::initial_integrate(int vflag)
     error->one(FLERR, msg);
   }
 
-#ifdef DEBUG_PAIR_CT
+#ifdef DEBUG_SSA_PAIR_CT
   for (int i = 0; i < 2; ++i)
     for (int j = 0; j < 3; ++j)
       counters[i][j] = 0;
@@ -598,7 +598,7 @@ void FixShardlow::initial_integrate(int vflag)
 
   }  //End Loop over all directions For airnum = Top, Top-Right, Right, Bottom-Right, Back
 
-#ifdef DEBUG_PAIR_CT
+#ifdef DEBUG_SSA_PAIR_CT
 for (int i = 0; i < 32; ++i) fprintf(stdout, "%8d", hist[i]);
 fprintf(stdout, "\n%6d %6d,%6d %6d: "
   ,counters[0][2]
diff --git a/src/USER-DPD/fix_shardlow.h b/src/USER-DPD/fix_shardlow.h
index e87ae3c9cf..e8e5f484a0 100644
--- a/src/USER-DPD/fix_shardlow.h
+++ b/src/USER-DPD/fix_shardlow.h
@@ -38,7 +38,7 @@ class FixShardlow : public Fix {
 
   double memory_usage();
 
-#ifdef DEBUG_PAIR_CT
+#ifdef DEBUG_SSA_PAIR_CT
   int counters[2][3];
   int hist[32];
 #endif

From d95a5f219e69c41461971912130a20c4c5b11efd Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Thu, 31 Aug 2017 10:38:36 -0600
Subject: [PATCH 264/267] Remove all thread fences except one in verlet_kokkos

---
 src/KOKKOS/atom_vec_angle_kokkos.cpp     | 16 ----------------
 src/KOKKOS/atom_vec_atomic_kokkos.cpp    | 16 ----------------
 src/KOKKOS/atom_vec_bond_kokkos.cpp      | 16 ----------------
 src/KOKKOS/atom_vec_charge_kokkos.cpp    | 16 ----------------
 src/KOKKOS/atom_vec_dpd_kokkos.cpp       | 16 ----------------
 src/KOKKOS/atom_vec_full_kokkos.cpp      | 16 ----------------
 src/KOKKOS/atom_vec_molecular_kokkos.cpp | 16 ----------------
 src/KOKKOS/comm_kokkos.cpp               |  7 -------
 src/KOKKOS/domain_kokkos.cpp             |  6 ------
 src/KOKKOS/nbin_kokkos.cpp               |  2 --
 src/KOKKOS/nbin_ssa_kokkos.cpp           |  4 ----
 src/KOKKOS/neigh_bond_kokkos.cpp         | 11 -----------
 src/KOKKOS/neighbor_kokkos.cpp           |  2 --
 src/KOKKOS/npair_kokkos.cpp              |  1 -
 src/KOKKOS/npair_ssa_kokkos.cpp          |  1 -
 src/KOKKOS/region_block_kokkos.cpp       |  1 -
 16 files changed, 147 deletions(-)

diff --git a/src/KOKKOS/atom_vec_angle_kokkos.cpp b/src/KOKKOS/atom_vec_angle_kokkos.cpp
index 34b868aadc..05414cf2e4 100644
--- a/src/KOKKOS/atom_vec_angle_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_angle_kokkos.cpp
@@ -308,7 +308,6 @@ int AtomVecAngleKokkos::pack_comm_kokkos(const int &n,
         Kokkos::parallel_for(n,f);
       }
     }
-    LMPHostType::fence();
   } else {
     sync(Device,X_MASK);
     if(pbc_flag) {
@@ -336,7 +335,6 @@ int AtomVecAngleKokkos::pack_comm_kokkos(const int &n,
         Kokkos::parallel_for(n,f);
       }
     }
-    LMPDeviceType::fence();
   }
 
   return n*size_forward;
@@ -430,7 +428,6 @@ int AtomVecAngleKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &li
       Kokkos::parallel_for(n,f);
       }
     }
-    LMPHostType::fence();
   } else {
     sync(Device,X_MASK);
     modified(Device,X_MASK);
@@ -463,7 +460,6 @@ int AtomVecAngleKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &li
       Kokkos::parallel_for(n,f);
       }
     }
-    LMPDeviceType::fence();
   }
 	return n*3;
 }
@@ -501,13 +497,11 @@ void AtomVecAngleKokkos::unpack_comm_kokkos(const int &n, const int &first,
     modified(Host,X_MASK);
     struct AtomVecAngleKokkos_UnpackComm<LMPHostType> f(atomKK->k_x,buf,first);
     Kokkos::parallel_for(n,f);
-    LMPDeviceType::fence();
   } else {
     sync(Device,X_MASK);
     modified(Device,X_MASK);
     struct AtomVecAngleKokkos_UnpackComm<LMPDeviceType> f(atomKK->k_x,buf,first);
     Kokkos::parallel_for(n,f);
-    LMPDeviceType::fence();
   }
 }
 
@@ -753,13 +747,11 @@ int AtomVecAngleKokkos::pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist,
         buf.view<LMPHostType>(), k_sendlist.view<LMPHostType>(),
         iswap,h_x,h_tag,h_type,h_mask,h_molecule,dx,dy,dz);
       Kokkos::parallel_for(n,f);
-      LMPHostType::fence();
     } else {
       AtomVecAngleKokkos_PackBorder<LMPDeviceType,1> f(
         buf.view<LMPDeviceType>(), k_sendlist.view<LMPDeviceType>(),
         iswap,d_x,d_tag,d_type,d_mask,d_molecule,dx,dy,dz);
       Kokkos::parallel_for(n,f);
-      LMPDeviceType::fence();
     }
 
   } else {
@@ -769,13 +761,11 @@ int AtomVecAngleKokkos::pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist,
         buf.view<LMPHostType>(), k_sendlist.view<LMPHostType>(),
         iswap,h_x,h_tag,h_type,h_mask,h_molecule,dx,dy,dz);
       Kokkos::parallel_for(n,f);
-      LMPHostType::fence();
     } else {
       AtomVecAngleKokkos_PackBorder<LMPDeviceType,0> f(
         buf.view<LMPDeviceType>(), k_sendlist.view<LMPDeviceType>(),
         iswap,d_x,d_tag,d_type,d_mask,d_molecule,dx,dy,dz);
       Kokkos::parallel_for(n,f);
-      LMPDeviceType::fence();
     }
   }
   return n*size_border;
@@ -977,12 +967,10 @@ void AtomVecAngleKokkos::unpack_border_kokkos(const int &n, const int &first,
     struct AtomVecAngleKokkos_UnpackBorder<LMPHostType>
       f(buf.view<LMPHostType>(),h_x,h_tag,h_type,h_mask,h_molecule,first);
     Kokkos::parallel_for(n,f);
-    LMPHostType::fence();
   } else {
     struct AtomVecAngleKokkos_UnpackBorder<LMPDeviceType>
       f(buf.view<LMPDeviceType>(),d_x,d_tag,d_type,d_mask,d_molecule,first);
     Kokkos::parallel_for(n,f);
-    LMPDeviceType::fence();
   }
 }
 
@@ -1241,13 +1229,11 @@ int AtomVecAngleKokkos::pack_exchange_kokkos(const int &nsend,DAT::tdual_xfloat_
     AtomVecAngleKokkos_PackExchangeFunctor<LMPHostType>
       f(atomKK,k_buf,k_sendlist,k_copylist,atom->nlocal,dim,lo,hi);
     Kokkos::parallel_for(nsend,f);
-    LMPHostType::fence();
     return nsend*elements;
   } else {
     AtomVecAngleKokkos_PackExchangeFunctor<LMPDeviceType>
       f(atomKK,k_buf,k_sendlist,k_copylist,atom->nlocal,dim,lo,hi);
     Kokkos::parallel_for(nsend,f);
-    LMPDeviceType::fence();
     return nsend*elements;
   }
 }
@@ -1405,7 +1391,6 @@ int AtomVecAngleKokkos::unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf,int n
     AtomVecAngleKokkos_UnpackExchangeFunctor<LMPHostType>
       f(atomKK,k_buf,k_count,dim,lo,hi);
     Kokkos::parallel_for(nrecv/elements,f);
-    LMPHostType::fence();
     return k_count.h_view(0);
   } else {
     k_count.h_view(0) = nlocal;
@@ -1414,7 +1399,6 @@ int AtomVecAngleKokkos::unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf,int n
     AtomVecAngleKokkos_UnpackExchangeFunctor<LMPDeviceType>
       f(atomKK,k_buf,k_count,dim,lo,hi);
     Kokkos::parallel_for(nrecv/elements,f);
-    LMPDeviceType::fence();
     k_count.modify<LMPDeviceType>();
     k_count.sync<LMPHostType>();
 
diff --git a/src/KOKKOS/atom_vec_atomic_kokkos.cpp b/src/KOKKOS/atom_vec_atomic_kokkos.cpp
index d040bd3553..b63dc5fb8c 100644
--- a/src/KOKKOS/atom_vec_atomic_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_atomic_kokkos.cpp
@@ -224,7 +224,6 @@ int AtomVecAtomicKokkos::pack_comm_kokkos(const int &n,
         Kokkos::parallel_for(n,f);
       }
     }
-    LMPHostType::fence();
   } else {
     sync(Device,X_MASK);
     if(pbc_flag) {
@@ -252,7 +251,6 @@ int AtomVecAtomicKokkos::pack_comm_kokkos(const int &n,
         Kokkos::parallel_for(n,f);
       }
     }
-    LMPDeviceType::fence();
   }
 
 	return n*size_forward;
@@ -340,7 +338,6 @@ int AtomVecAtomicKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &l
       Kokkos::parallel_for(n,f);
       }
     }
-    LMPHostType::fence();
   } else {
     sync(Device,X_MASK);
     modified(Device,X_MASK);
@@ -369,7 +366,6 @@ int AtomVecAtomicKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &l
       Kokkos::parallel_for(n,f);
       }
     }
-    LMPDeviceType::fence();
   }
 	return n*3;
 }
@@ -407,13 +403,11 @@ void AtomVecAtomicKokkos::unpack_comm_kokkos(const int &n, const int &first,
     modified(Host,X_MASK);
     struct AtomVecAtomicKokkos_UnpackComm<LMPHostType> f(atomKK->k_x,buf,first);
     Kokkos::parallel_for(n,f);
-    LMPDeviceType::fence();
   } else {
     sync(Device,X_MASK);
     modified(Device,X_MASK);
     struct AtomVecAtomicKokkos_UnpackComm<LMPDeviceType> f(atomKK->k_x,buf,first);
     Kokkos::parallel_for(n,f);
-    LMPDeviceType::fence();
   }
 }
 
@@ -655,13 +649,11 @@ int AtomVecAtomicKokkos::pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist,
         buf.view<LMPHostType>(), k_sendlist.view<LMPHostType>(),
         iswap,h_x,h_tag,h_type,h_mask,dx,dy,dz);
       Kokkos::parallel_for(n,f);
-      LMPHostType::fence();
     } else {
       AtomVecAtomicKokkos_PackBorder<LMPDeviceType,1> f(
         buf.view<LMPDeviceType>(), k_sendlist.view<LMPDeviceType>(),
         iswap,d_x,d_tag,d_type,d_mask,dx,dy,dz);
       Kokkos::parallel_for(n,f);
-      LMPDeviceType::fence();
     }
 
   } else {
@@ -671,13 +663,11 @@ int AtomVecAtomicKokkos::pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist,
         buf.view<LMPHostType>(), k_sendlist.view<LMPHostType>(),
         iswap,h_x,h_tag,h_type,h_mask,dx,dy,dz);
       Kokkos::parallel_for(n,f);
-      LMPHostType::fence();
     } else {
       AtomVecAtomicKokkos_PackBorder<LMPDeviceType,0> f(
         buf.view<LMPDeviceType>(), k_sendlist.view<LMPDeviceType>(),
         iswap,d_x,d_tag,d_type,d_mask,dx,dy,dz);
       Kokkos::parallel_for(n,f);
-      LMPDeviceType::fence();
     }
   }
   return n*6;
@@ -853,11 +843,9 @@ void AtomVecAtomicKokkos::unpack_border_kokkos(const int &n, const int &first,
   if(space==Host) {
     struct AtomVecAtomicKokkos_UnpackBorder<LMPHostType> f(buf.view<LMPHostType>(),h_x,h_tag,h_type,h_mask,first);
     Kokkos::parallel_for(n,f);
-    LMPHostType::fence();
   } else {
     struct AtomVecAtomicKokkos_UnpackBorder<LMPDeviceType> f(buf.view<LMPDeviceType>(),d_x,d_tag,d_type,d_mask,first);
     Kokkos::parallel_for(n,f);
-    LMPDeviceType::fence();
   }
 }
 
@@ -1009,12 +997,10 @@ int AtomVecAtomicKokkos::pack_exchange_kokkos(const int &nsend,DAT::tdual_xfloat
   if(space == Host) {
     AtomVecAtomicKokkos_PackExchangeFunctor<LMPHostType> f(atomKK,k_buf,k_sendlist,k_copylist,atom->nlocal,dim,lo,hi);
     Kokkos::parallel_for(nsend,f);
-    LMPHostType::fence();
     return nsend*11;
   } else {
     AtomVecAtomicKokkos_PackExchangeFunctor<LMPDeviceType> f(atomKK,k_buf,k_sendlist,k_copylist,atom->nlocal,dim,lo,hi);
     Kokkos::parallel_for(nsend,f);
-    LMPDeviceType::fence();
     return nsend*11;
   }
 }
@@ -1106,7 +1092,6 @@ int AtomVecAtomicKokkos::unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf,int
     k_count.h_view(0) = nlocal;
     AtomVecAtomicKokkos_UnpackExchangeFunctor<LMPHostType> f(atomKK,k_buf,k_count,dim,lo,hi);
     Kokkos::parallel_for(nrecv/11,f);
-    LMPHostType::fence();
     return k_count.h_view(0);
   } else {
     k_count.h_view(0) = nlocal;
@@ -1114,7 +1099,6 @@ int AtomVecAtomicKokkos::unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf,int
     k_count.sync<LMPDeviceType>();
     AtomVecAtomicKokkos_UnpackExchangeFunctor<LMPDeviceType> f(atomKK,k_buf,k_count,dim,lo,hi);
     Kokkos::parallel_for(nrecv/11,f);
-    LMPDeviceType::fence();
     k_count.modify<LMPDeviceType>();
     k_count.sync<LMPHostType>();
 
diff --git a/src/KOKKOS/atom_vec_bond_kokkos.cpp b/src/KOKKOS/atom_vec_bond_kokkos.cpp
index c46c49cb29..e0f29a27bb 100644
--- a/src/KOKKOS/atom_vec_bond_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_bond_kokkos.cpp
@@ -266,7 +266,6 @@ int AtomVecBondKokkos::pack_comm_kokkos(const int &n,
         Kokkos::parallel_for(n,f);
       }
     }
-    LMPHostType::fence();
   } else {
     sync(Device,X_MASK);
     if(pbc_flag) {
@@ -294,7 +293,6 @@ int AtomVecBondKokkos::pack_comm_kokkos(const int &n,
         Kokkos::parallel_for(n,f);
       }
     }
-    LMPDeviceType::fence();
   }
 
 	return n*size_forward;
@@ -382,7 +380,6 @@ int AtomVecBondKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &lis
       Kokkos::parallel_for(n,f);
       }
     }
-    LMPHostType::fence();
   } else {
     sync(Device,X_MASK);
     modified(Device,X_MASK);
@@ -411,7 +408,6 @@ int AtomVecBondKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &lis
       Kokkos::parallel_for(n,f);
       }
     }
-    LMPDeviceType::fence();
   }
 	return n*3;
 }
@@ -449,13 +445,11 @@ void AtomVecBondKokkos::unpack_comm_kokkos(const int &n, const int &first,
     modified(Host,X_MASK);
     struct AtomVecBondKokkos_UnpackComm<LMPHostType> f(atomKK->k_x,buf,first);
     Kokkos::parallel_for(n,f);
-    LMPDeviceType::fence();
   } else {
     sync(Device,X_MASK);
     modified(Device,X_MASK);
     struct AtomVecBondKokkos_UnpackComm<LMPDeviceType> f(atomKK->k_x,buf,first);
     Kokkos::parallel_for(n,f);
-    LMPDeviceType::fence();
   }
 }
 
@@ -701,13 +695,11 @@ int AtomVecBondKokkos::pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist,
         buf.view<LMPHostType>(), k_sendlist.view<LMPHostType>(),
         iswap,h_x,h_tag,h_type,h_mask,h_molecule,dx,dy,dz);
       Kokkos::parallel_for(n,f);
-      LMPHostType::fence();
     } else {
       AtomVecBondKokkos_PackBorder<LMPDeviceType,1> f(
         buf.view<LMPDeviceType>(), k_sendlist.view<LMPDeviceType>(),
         iswap,d_x,d_tag,d_type,d_mask,d_molecule,dx,dy,dz);
       Kokkos::parallel_for(n,f);
-      LMPDeviceType::fence();
     }
 
   } else {
@@ -717,13 +709,11 @@ int AtomVecBondKokkos::pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist,
         buf.view<LMPHostType>(), k_sendlist.view<LMPHostType>(),
         iswap,h_x,h_tag,h_type,h_mask,h_molecule,dx,dy,dz);
       Kokkos::parallel_for(n,f);
-      LMPHostType::fence();
     } else {
       AtomVecBondKokkos_PackBorder<LMPDeviceType,0> f(
         buf.view<LMPDeviceType>(), k_sendlist.view<LMPDeviceType>(),
         iswap,d_x,d_tag,d_type,d_mask,d_molecule,dx,dy,dz);
       Kokkos::parallel_for(n,f);
-      LMPDeviceType::fence();
     }
   }
   return n*size_border;
@@ -925,12 +915,10 @@ void AtomVecBondKokkos::unpack_border_kokkos(const int &n, const int &first,
     struct AtomVecBondKokkos_UnpackBorder<LMPHostType>
       f(buf.view<LMPHostType>(),h_x,h_tag,h_type,h_mask,h_molecule,first);
     Kokkos::parallel_for(n,f);
-    LMPHostType::fence();
   } else {
     struct AtomVecBondKokkos_UnpackBorder<LMPDeviceType>
       f(buf.view<LMPDeviceType>(),d_x,d_tag,d_type,d_mask,d_molecule,first);
     Kokkos::parallel_for(n,f);
-    LMPDeviceType::fence();
   }
 }
 
@@ -1157,13 +1145,11 @@ int AtomVecBondKokkos::pack_exchange_kokkos(const int &nsend,DAT::tdual_xfloat_2
     AtomVecBondKokkos_PackExchangeFunctor<LMPHostType>
       f(atomKK,k_buf,k_sendlist,k_copylist,atom->nlocal,dim,lo,hi);
     Kokkos::parallel_for(nsend,f);
-    LMPHostType::fence();
     return nsend*elements;
   } else {
     AtomVecBondKokkos_PackExchangeFunctor<LMPDeviceType>
       f(atomKK,k_buf,k_sendlist,k_copylist,atom->nlocal,dim,lo,hi);
     Kokkos::parallel_for(nsend,f);
-    LMPDeviceType::fence();
     return nsend*elements;
   }
 }
@@ -1299,7 +1285,6 @@ int AtomVecBondKokkos::unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf,int nr
     AtomVecBondKokkos_UnpackExchangeFunctor<LMPHostType>
       f(atomKK,k_buf,k_count,dim,lo,hi);
     Kokkos::parallel_for(nrecv/elements,f);
-    LMPHostType::fence();
     return k_count.h_view(0);
   } else {
     k_count.h_view(0) = nlocal;
@@ -1308,7 +1293,6 @@ int AtomVecBondKokkos::unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf,int nr
     AtomVecBondKokkos_UnpackExchangeFunctor<LMPDeviceType>
       f(atomKK,k_buf,k_count,dim,lo,hi);
     Kokkos::parallel_for(nrecv/elements,f);
-    LMPDeviceType::fence();
     k_count.modify<LMPDeviceType>();
     k_count.sync<LMPHostType>();
 
diff --git a/src/KOKKOS/atom_vec_charge_kokkos.cpp b/src/KOKKOS/atom_vec_charge_kokkos.cpp
index 856660d1e9..89f7e91c2b 100644
--- a/src/KOKKOS/atom_vec_charge_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_charge_kokkos.cpp
@@ -236,7 +236,6 @@ int AtomVecChargeKokkos::pack_comm_kokkos(const int &n,
         Kokkos::parallel_for(n,f);
       }
     }
-    LMPHostType::fence();
   } else {
     sync(Device,X_MASK);
     if(pbc_flag) {
@@ -264,7 +263,6 @@ int AtomVecChargeKokkos::pack_comm_kokkos(const int &n,
         Kokkos::parallel_for(n,f);
       }
     }
-    LMPDeviceType::fence();
   }
 
 	return n*size_forward;
@@ -352,7 +350,6 @@ int AtomVecChargeKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &l
       Kokkos::parallel_for(n,f);
       }
     }
-    LMPHostType::fence();
   } else {
     sync(Device,X_MASK);
     modified(Device,X_MASK);
@@ -381,7 +378,6 @@ int AtomVecChargeKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &l
       Kokkos::parallel_for(n,f);
       }
     }
-    LMPDeviceType::fence();
   }
 	return n*3;
 }
@@ -419,13 +415,11 @@ void AtomVecChargeKokkos::unpack_comm_kokkos(const int &n, const int &first,
     modified(Host,X_MASK);
     struct AtomVecChargeKokkos_UnpackComm<LMPHostType> f(atomKK->k_x,buf,first);
     Kokkos::parallel_for(n,f);
-    LMPDeviceType::fence();
   } else {
     sync(Device,X_MASK);
     modified(Device,X_MASK);
     struct AtomVecChargeKokkos_UnpackComm<LMPDeviceType> f(atomKK->k_x,buf,first);
     Kokkos::parallel_for(n,f);
-    LMPDeviceType::fence();
   }
 }
 
@@ -669,13 +663,11 @@ int AtomVecChargeKokkos::pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist,
         buf.view<LMPHostType>(), k_sendlist.view<LMPHostType>(),
         iswap,h_x,h_tag,h_type,h_mask,h_q,dx,dy,dz);
       Kokkos::parallel_for(n,f);
-      LMPHostType::fence();
     } else {
       AtomVecChargeKokkos_PackBorder<LMPDeviceType,1> f(
         buf.view<LMPDeviceType>(), k_sendlist.view<LMPDeviceType>(),
         iswap,d_x,d_tag,d_type,d_mask,d_q,dx,dy,dz);
       Kokkos::parallel_for(n,f);
-      LMPDeviceType::fence();
     }
 
   } else {
@@ -685,13 +677,11 @@ int AtomVecChargeKokkos::pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist,
         buf.view<LMPHostType>(), k_sendlist.view<LMPHostType>(),
         iswap,h_x,h_tag,h_type,h_mask,h_q,dx,dy,dz);
       Kokkos::parallel_for(n,f);
-      LMPHostType::fence();
     } else {
       AtomVecChargeKokkos_PackBorder<LMPDeviceType,0> f(
         buf.view<LMPDeviceType>(), k_sendlist.view<LMPDeviceType>(),
         iswap,d_x,d_tag,d_type,d_mask,d_q,dx,dy,dz);
       Kokkos::parallel_for(n,f);
-      LMPDeviceType::fence();
     }
   }
   return n*size_border;
@@ -890,12 +880,10 @@ void AtomVecChargeKokkos::unpack_border_kokkos(const int &n, const int &first,
     struct AtomVecChargeKokkos_UnpackBorder<LMPHostType>
       f(buf.view<LMPHostType>(),h_x,h_tag,h_type,h_mask,h_q,first);
     Kokkos::parallel_for(n,f);
-    LMPHostType::fence();
   } else {
     struct AtomVecChargeKokkos_UnpackBorder<LMPDeviceType>
       f(buf.view<LMPDeviceType>(),d_x,d_tag,d_type,d_mask,d_q,first);
     Kokkos::parallel_for(n,f);
-    LMPDeviceType::fence();
   }
   modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK);
 }
@@ -1078,13 +1066,11 @@ int AtomVecChargeKokkos::pack_exchange_kokkos(const int &nsend,DAT::tdual_xfloat
     AtomVecChargeKokkos_PackExchangeFunctor<LMPHostType>
       f(atomKK,k_buf,k_sendlist,k_copylist,atom->nlocal,dim,lo,hi);
     Kokkos::parallel_for(nsend,f);
-    LMPHostType::fence();
     return nsend*12;
   } else {
     AtomVecChargeKokkos_PackExchangeFunctor<LMPDeviceType>
       f(atomKK,k_buf,k_sendlist,k_copylist,atom->nlocal,dim,lo,hi);
     Kokkos::parallel_for(nsend,f);
-    LMPDeviceType::fence();
     return nsend*12;
   }
 }
@@ -1181,7 +1167,6 @@ int AtomVecChargeKokkos::unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf,int
     k_count.h_view(0) = nlocal;
     AtomVecChargeKokkos_UnpackExchangeFunctor<LMPHostType> f(atomKK,k_buf,k_count,dim,lo,hi);
     Kokkos::parallel_for(nrecv/12,f);
-    LMPHostType::fence();
     return k_count.h_view(0);
   } else {
     k_count.h_view(0) = nlocal;
@@ -1190,7 +1175,6 @@ int AtomVecChargeKokkos::unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf,int
     AtomVecChargeKokkos_UnpackExchangeFunctor<LMPDeviceType>
       f(atomKK,k_buf,k_count,dim,lo,hi);
     Kokkos::parallel_for(nrecv/12,f);
-    LMPDeviceType::fence();
     k_count.modify<LMPDeviceType>();
     k_count.sync<LMPHostType>();
 
diff --git a/src/KOKKOS/atom_vec_dpd_kokkos.cpp b/src/KOKKOS/atom_vec_dpd_kokkos.cpp
index 2090e924ec..c4e493bd85 100644
--- a/src/KOKKOS/atom_vec_dpd_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_dpd_kokkos.cpp
@@ -298,7 +298,6 @@ int AtomVecDPDKokkos::pack_comm_kokkos(const int &n,
         Kokkos::parallel_for(n,f);
       }
     }
-    LMPHostType::fence();
   } else {
     sync(Device,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
     if(pbc_flag) {
@@ -334,7 +333,6 @@ int AtomVecDPDKokkos::pack_comm_kokkos(const int &n,
         Kokkos::parallel_for(n,f);
       }
     }
-    LMPDeviceType::fence();
   }
 
 	return n*size_forward;
@@ -443,7 +441,6 @@ int AtomVecDPDKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &list
       Kokkos::parallel_for(n,f);
       }
     }
-    LMPHostType::fence();
   } else {
     sync(Device,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
     modified(Device,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
@@ -480,7 +477,6 @@ int AtomVecDPDKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &list
       Kokkos::parallel_for(n,f);
       }
     }
-    LMPDeviceType::fence();
   }
 	return n*3;
 }
@@ -534,7 +530,6 @@ void AtomVecDPDKokkos::unpack_comm_kokkos(const int &n, const int &first,
     atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,
     buf,first);
     Kokkos::parallel_for(n,f);
-    LMPDeviceType::fence();
   } else {
     sync(Device,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
     modified(Device,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
@@ -542,7 +537,6 @@ void AtomVecDPDKokkos::unpack_comm_kokkos(const int &n, const int &first,
     atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,
     buf,first);
     Kokkos::parallel_for(n,f);
-    LMPDeviceType::fence();
   }
 }
 
@@ -840,7 +834,6 @@ int AtomVecDPDKokkos::pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist, DA
         h_dpdTheta,h_uCond,h_uMech,h_uChem,h_uCG,h_uCGnew,
         dx,dy,dz);
       Kokkos::parallel_for(n,f);
-      LMPHostType::fence();
     } else {
       AtomVecDPDKokkos_PackBorder<LMPDeviceType,1> f(
         buf.view<LMPDeviceType>(), k_sendlist.view<LMPDeviceType>(),
@@ -848,7 +841,6 @@ int AtomVecDPDKokkos::pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist, DA
         d_dpdTheta,d_uCond,d_uMech,d_uChem,d_uCG,d_uCGnew,
         dx,dy,dz);
       Kokkos::parallel_for(n,f);
-      LMPDeviceType::fence();
     }
 
   } else {
@@ -860,7 +852,6 @@ int AtomVecDPDKokkos::pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist, DA
         h_dpdTheta,h_uCond,h_uMech,h_uChem,h_uCG,h_uCGnew,
         dx,dy,dz);
       Kokkos::parallel_for(n,f);
-      LMPHostType::fence();
     } else {
       AtomVecDPDKokkos_PackBorder<LMPDeviceType,0> f(
         buf.view<LMPDeviceType>(), k_sendlist.view<LMPDeviceType>(),
@@ -868,7 +859,6 @@ int AtomVecDPDKokkos::pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist, DA
         d_dpdTheta,d_uCond,d_uMech,d_uChem,d_uCG,d_uCGnew,
         dx,dy,dz);
       Kokkos::parallel_for(n,f);
-      LMPDeviceType::fence();
     }
   }
   return n*6;
@@ -1146,14 +1136,12 @@ void AtomVecDPDKokkos::unpack_border_kokkos(const int &n, const int &first,
       h_dpdTheta,h_uCond,h_uMech,h_uChem,h_uCG,h_uCGnew,
       first);
     Kokkos::parallel_for(n,f);
-    LMPHostType::fence();
   } else {
     struct AtomVecDPDKokkos_UnpackBorder<LMPDeviceType> f(buf.view<LMPDeviceType>(),
       d_x,d_tag,d_type,d_mask,
       d_dpdTheta,d_uCond,d_uMech,d_uChem,d_uCG,d_uCGnew,
       first);
     Kokkos::parallel_for(n,f);
-    LMPDeviceType::fence();
   }
 }
 
@@ -1399,11 +1387,9 @@ int AtomVecDPDKokkos::pack_exchange_kokkos(const int &nsend,DAT::tdual_xfloat_2d
   if(space == Host) {
     AtomVecDPDKokkos_PackExchangeFunctor<LMPHostType> f(atomKK,k_buf,k_sendlist,k_copylist,atom->nlocal,dim,lo,hi);
     Kokkos::parallel_for(nsend,f);
-    LMPHostType::fence();
   } else {
     AtomVecDPDKokkos_PackExchangeFunctor<LMPDeviceType> f(atomKK,k_buf,k_sendlist,k_copylist,atom->nlocal,dim,lo,hi);
     Kokkos::parallel_for(nsend,f);
-    LMPDeviceType::fence();
   }
   return nsend*17;
 }
@@ -1518,14 +1504,12 @@ int AtomVecDPDKokkos::unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf,int nre
     k_count.h_view(0) = nlocal;
     AtomVecDPDKokkos_UnpackExchangeFunctor<LMPHostType> f(atomKK,k_buf,k_count,dim,lo,hi);
     Kokkos::parallel_for(nrecv/17,f);
-    LMPHostType::fence();
   } else {
     k_count.h_view(0) = nlocal;
     k_count.modify<LMPHostType>();
     k_count.sync<LMPDeviceType>();
     AtomVecDPDKokkos_UnpackExchangeFunctor<LMPDeviceType> f(atomKK,k_buf,k_count,dim,lo,hi);
     Kokkos::parallel_for(nrecv/17,f);
-    LMPDeviceType::fence();
     k_count.modify<LMPDeviceType>();
     k_count.sync<LMPHostType>();
   }
diff --git a/src/KOKKOS/atom_vec_full_kokkos.cpp b/src/KOKKOS/atom_vec_full_kokkos.cpp
index fa4cf18ae3..fd7eaf7c81 100644
--- a/src/KOKKOS/atom_vec_full_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_full_kokkos.cpp
@@ -396,7 +396,6 @@ int AtomVecFullKokkos::pack_comm_kokkos(const int &n,
         Kokkos::parallel_for(n,f);
       }
     }
-    LMPHostType::fence();
   } else {
     sync(Device,X_MASK);
     if(pbc_flag) {
@@ -424,7 +423,6 @@ int AtomVecFullKokkos::pack_comm_kokkos(const int &n,
         Kokkos::parallel_for(n,f);
       }
     }
-    LMPDeviceType::fence();
   }
 
 	return n*size_forward;
@@ -515,7 +513,6 @@ int AtomVecFullKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &lis
       Kokkos::parallel_for(n,f);
       }
     }
-    LMPHostType::fence();
   } else {
     sync(Device,X_MASK);
     modified(Device,X_MASK);
@@ -544,7 +541,6 @@ int AtomVecFullKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &lis
       Kokkos::parallel_for(n,f);
       }
     }
-    LMPDeviceType::fence();
   }
 	return n*3;
 }
@@ -582,13 +578,11 @@ void AtomVecFullKokkos::unpack_comm_kokkos(const int &n, const int &first,
     modified(Host,X_MASK);
     struct AtomVecFullKokkos_UnpackComm<LMPHostType> f(atomKK->k_x,buf,first);
     Kokkos::parallel_for(n,f);
-    LMPDeviceType::fence();
   } else {
     sync(Device,X_MASK);
     modified(Device,X_MASK);
     struct AtomVecFullKokkos_UnpackComm<LMPDeviceType> f(atomKK->k_x,buf,first);
     Kokkos::parallel_for(n,f);
-    LMPDeviceType::fence();
   }
 }
 
@@ -838,13 +832,11 @@ int AtomVecFullKokkos::pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist,
         buf.view<LMPHostType>(), k_sendlist.view<LMPHostType>(),
         iswap,h_x,h_tag,h_type,h_mask,h_q,h_molecule,dx,dy,dz);
       Kokkos::parallel_for(n,f);
-      LMPHostType::fence();
     } else {
       AtomVecFullKokkos_PackBorder<LMPDeviceType,1> f(
         buf.view<LMPDeviceType>(), k_sendlist.view<LMPDeviceType>(),
         iswap,d_x,d_tag,d_type,d_mask,d_q,d_molecule,dx,dy,dz);
       Kokkos::parallel_for(n,f);
-      LMPDeviceType::fence();
     }
 
   } else {
@@ -854,13 +846,11 @@ int AtomVecFullKokkos::pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist,
         buf.view<LMPHostType>(), k_sendlist.view<LMPHostType>(),
         iswap,h_x,h_tag,h_type,h_mask,h_q,h_molecule,dx,dy,dz);
       Kokkos::parallel_for(n,f);
-      LMPHostType::fence();
     } else {
       AtomVecFullKokkos_PackBorder<LMPDeviceType,0> f(
         buf.view<LMPDeviceType>(), k_sendlist.view<LMPDeviceType>(),
         iswap,d_x,d_tag,d_type,d_mask,d_q,d_molecule,dx,dy,dz);
       Kokkos::parallel_for(n,f);
-      LMPDeviceType::fence();
     }
   }
   return n*size_border;
@@ -1071,12 +1061,10 @@ void AtomVecFullKokkos::unpack_border_kokkos(const int &n, const int &first,
     struct AtomVecFullKokkos_UnpackBorder<LMPHostType>
       f(buf.view<LMPHostType>(),h_x,h_tag,h_type,h_mask,h_q,h_molecule,first);
     Kokkos::parallel_for(n,f);
-    LMPHostType::fence();
   } else {
     struct AtomVecFullKokkos_UnpackBorder<LMPDeviceType>
       f(buf.view<LMPDeviceType>(),d_x,d_tag,d_type,d_mask,d_q,d_molecule,first);
     Kokkos::parallel_for(n,f);
-    LMPDeviceType::fence();
   }
 }
 
@@ -1422,13 +1410,11 @@ int AtomVecFullKokkos::pack_exchange_kokkos(const int &nsend,DAT::tdual_xfloat_2
     AtomVecFullKokkos_PackExchangeFunctor<LMPHostType>
       f(atomKK,k_buf,k_sendlist,k_copylist,atom->nlocal,dim,lo,hi);
     Kokkos::parallel_for(nsend,f);
-    LMPHostType::fence();
     return nsend*elements;
   } else {
     AtomVecFullKokkos_PackExchangeFunctor<LMPDeviceType>
       f(atomKK,k_buf,k_sendlist,k_copylist,atom->nlocal,dim,lo,hi);
     Kokkos::parallel_for(nsend,f);
-    LMPDeviceType::fence();
     return nsend*elements;
   }
 }
@@ -1643,7 +1629,6 @@ int AtomVecFullKokkos::unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf,int nr
     AtomVecFullKokkos_UnpackExchangeFunctor<LMPHostType>
       f(atomKK,k_buf,k_count,dim,lo,hi);
     Kokkos::parallel_for(nrecv/elements,f);
-    LMPHostType::fence();
     return k_count.h_view(0);
   } else {
     k_count.h_view(0) = nlocal;
@@ -1652,7 +1637,6 @@ int AtomVecFullKokkos::unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf,int nr
     AtomVecFullKokkos_UnpackExchangeFunctor<LMPDeviceType>
       f(atomKK,k_buf,k_count,dim,lo,hi);
     Kokkos::parallel_for(nrecv/elements,f);
-    LMPDeviceType::fence();
     k_count.modify<LMPDeviceType>();
     k_count.sync<LMPHostType>();
 
diff --git a/src/KOKKOS/atom_vec_molecular_kokkos.cpp b/src/KOKKOS/atom_vec_molecular_kokkos.cpp
index 5c16ac1513..dbf6a857b2 100644
--- a/src/KOKKOS/atom_vec_molecular_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_molecular_kokkos.cpp
@@ -387,7 +387,6 @@ int AtomVecMolecularKokkos::pack_comm_kokkos(const int &n,
         Kokkos::parallel_for(n,f);
       }
     }
-    LMPHostType::fence();
   } else {
     sync(Device,X_MASK);
     if(pbc_flag) {
@@ -415,7 +414,6 @@ int AtomVecMolecularKokkos::pack_comm_kokkos(const int &n,
         Kokkos::parallel_for(n,f);
       }
     }
-    LMPDeviceType::fence();
   }
 
 	return n*size_forward;
@@ -506,7 +504,6 @@ int AtomVecMolecularKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d
       Kokkos::parallel_for(n,f);
       }
     }
-    LMPHostType::fence();
   } else {
     sync(Device,X_MASK);
     modified(Device,X_MASK);
@@ -535,7 +532,6 @@ int AtomVecMolecularKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d
       Kokkos::parallel_for(n,f);
       }
     }
-    LMPDeviceType::fence();
   }
 	return n*3;
 }
@@ -573,13 +569,11 @@ void AtomVecMolecularKokkos::unpack_comm_kokkos(const int &n, const int &first,
     modified(Host,X_MASK);
     struct AtomVecMolecularKokkos_UnpackComm<LMPHostType> f(atomKK->k_x,buf,first);
     Kokkos::parallel_for(n,f);
-    LMPDeviceType::fence();
   } else {
     sync(Device,X_MASK);
     modified(Device,X_MASK);
     struct AtomVecMolecularKokkos_UnpackComm<LMPDeviceType> f(atomKK->k_x,buf,first);
     Kokkos::parallel_for(n,f);
-    LMPDeviceType::fence();
   }
 }
 
@@ -825,13 +819,11 @@ int AtomVecMolecularKokkos::pack_border_kokkos(int n, DAT::tdual_int_2d k_sendli
         buf.view<LMPHostType>(), k_sendlist.view<LMPHostType>(),
         iswap,h_x,h_tag,h_type,h_mask,h_molecule,dx,dy,dz);
       Kokkos::parallel_for(n,f);
-      LMPHostType::fence();
     } else {
       AtomVecMolecularKokkos_PackBorder<LMPDeviceType,1> f(
         buf.view<LMPDeviceType>(), k_sendlist.view<LMPDeviceType>(),
         iswap,d_x,d_tag,d_type,d_mask,d_molecule,dx,dy,dz);
       Kokkos::parallel_for(n,f);
-      LMPDeviceType::fence();
     }
 
   } else {
@@ -841,13 +833,11 @@ int AtomVecMolecularKokkos::pack_border_kokkos(int n, DAT::tdual_int_2d k_sendli
         buf.view<LMPHostType>(), k_sendlist.view<LMPHostType>(),
         iswap,h_x,h_tag,h_type,h_mask,h_molecule,dx,dy,dz);
       Kokkos::parallel_for(n,f);
-      LMPHostType::fence();
     } else {
       AtomVecMolecularKokkos_PackBorder<LMPDeviceType,0> f(
         buf.view<LMPDeviceType>(), k_sendlist.view<LMPDeviceType>(),
         iswap,d_x,d_tag,d_type,d_mask,d_molecule,dx,dy,dz);
       Kokkos::parallel_for(n,f);
-      LMPDeviceType::fence();
     }
   }
   return n*size_border;
@@ -1049,12 +1039,10 @@ void AtomVecMolecularKokkos::unpack_border_kokkos(const int &n, const int &first
     struct AtomVecMolecularKokkos_UnpackBorder<LMPHostType>
       f(buf.view<LMPHostType>(),h_x,h_tag,h_type,h_mask,h_molecule,first);
     Kokkos::parallel_for(n,f);
-    LMPHostType::fence();
   } else {
     struct AtomVecMolecularKokkos_UnpackBorder<LMPDeviceType>
       f(buf.view<LMPDeviceType>(),d_x,d_tag,d_type,d_mask,d_molecule,first);
     Kokkos::parallel_for(n,f);
-    LMPDeviceType::fence();
   }
 }
 
@@ -1389,13 +1377,11 @@ int AtomVecMolecularKokkos::pack_exchange_kokkos(const int &nsend,DAT::tdual_xfl
     AtomVecMolecularKokkos_PackExchangeFunctor<LMPHostType>
       f(atomKK,k_buf,k_sendlist,k_copylist,atom->nlocal,dim,lo,hi);
     Kokkos::parallel_for(nsend,f);
-    LMPHostType::fence();
     return nsend*elements;
   } else {
     AtomVecMolecularKokkos_PackExchangeFunctor<LMPDeviceType>
       f(atomKK,k_buf,k_sendlist,k_copylist,atom->nlocal,dim,lo,hi);
     Kokkos::parallel_for(nsend,f);
-    LMPDeviceType::fence();
     return nsend*elements;
   }
 }
@@ -1608,7 +1594,6 @@ int AtomVecMolecularKokkos::unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf,i
     AtomVecMolecularKokkos_UnpackExchangeFunctor<LMPHostType>
       f(atomKK,k_buf,k_count,dim,lo,hi);
     Kokkos::parallel_for(nrecv/elements,f);
-    LMPHostType::fence();
     return k_count.h_view(0);
   } else {
     k_count.h_view(0) = nlocal;
@@ -1617,7 +1602,6 @@ int AtomVecMolecularKokkos::unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf,i
     AtomVecMolecularKokkos_UnpackExchangeFunctor<LMPDeviceType>
       f(atomKK,k_buf,k_count,dim,lo,hi);
     Kokkos::parallel_for(nrecv/elements,f);
-    LMPDeviceType::fence();
     k_count.modify<LMPDeviceType>();
     k_count.sync<LMPHostType>();
 
diff --git a/src/KOKKOS/comm_kokkos.cpp b/src/KOKKOS/comm_kokkos.cpp
index 2b19908396..da1f4a89fe 100644
--- a/src/KOKKOS/comm_kokkos.cpp
+++ b/src/KOKKOS/comm_kokkos.cpp
@@ -499,7 +499,6 @@ void CommKokkos::exchange_device()
           f(atomKK->k_x,k_exchange_sendlist,k_count,k_sendflag,
             nlocal,dim,lo,hi);
         Kokkos::parallel_for(nlocal,f);
-        DeviceType::fence();
         k_exchange_sendlist.modify<DeviceType>();
         k_sendflag.modify<DeviceType>();
         k_count.modify<DeviceType>();
@@ -535,7 +534,6 @@ void CommKokkos::exchange_device()
                                    k_exchange_sendlist,k_exchange_copylist,
                                    ExecutionSpaceFromDevice<DeviceType>::
                                    space,dim,lo,hi);
-      DeviceType::fence();
 
     } else {
       while (i < nlocal) {
@@ -560,7 +558,6 @@ void CommKokkos::exchange_device()
         atom->nlocal=avec->
           unpack_exchange_kokkos(k_buf_send,nrecv,atom->nlocal,dim,lo,hi,
                                  ExecutionSpaceFromDevice<DeviceType>::space);
-        DeviceType::fence();
       }
     } else {
       MPI_Sendrecv(&nsend,1,MPI_INT,procneigh[dim][0],0,
@@ -593,7 +590,6 @@ void CommKokkos::exchange_device()
         atom->nlocal = avec->
           unpack_exchange_kokkos(k_buf_recv,nrecv,atom->nlocal,dim,lo,hi,
                                  ExecutionSpaceFromDevice<DeviceType>::space);
-        DeviceType::fence();
       }
     }
 
@@ -765,7 +761,6 @@ void CommKokkos::borders_device() {
                 total_send,nfirst,nlast,dim,lo,hi,iswap,maxsendlist[iswap]);
             Kokkos::TeamPolicy<DeviceType> config((nlast-nfirst+127)/128,128);
             Kokkos::parallel_for(config,f);
-            DeviceType::fence();
 
             total_send.template modify<DeviceType>();
             total_send.template sync<LMPHostType>();
@@ -782,7 +777,6 @@ void CommKokkos::borders_device() {
                   total_send,nfirst,nlast,dim,lo,hi,iswap,maxsendlist[iswap]);
               Kokkos::TeamPolicy<DeviceType> config((nlast-nfirst+127)/128,128);
               Kokkos::parallel_for(config,f);
-              DeviceType::fence();
               total_send.template modify<DeviceType>();
               total_send.template sync<LMPHostType>();
             }
@@ -911,7 +905,6 @@ void CommKokkos::borders_device() {
 
   if (exec_space == Host) k_sendlist.sync<LMPDeviceType>();
   atomKK->modified(exec_space,ALL_MASK);
-  DeviceType::fence();
   atomKK->sync(Host,TAG_MASK);
   if (map_style) atom->map_set();
 }
diff --git a/src/KOKKOS/domain_kokkos.cpp b/src/KOKKOS/domain_kokkos.cpp
index 4bf8dc9841..d9c1332778 100644
--- a/src/KOKKOS/domain_kokkos.cpp
+++ b/src/KOKKOS/domain_kokkos.cpp
@@ -99,7 +99,6 @@ void DomainKokkos::reset_box()
     DomainResetBoxFunctor<LMPDeviceType>
       f(atomKK->k_x);
     Kokkos::parallel_reduce(nlocal,f,result);
-    LMPDeviceType::fence();
 
     double (*extent)[2] = result.value;
     double all[3][2];
@@ -384,7 +383,6 @@ void DomainKokkos::pbc()
       Kokkos::parallel_for(nlocal,f);
     }
   }
-  LMPDeviceType::fence();
 
   atomKK->modified(Device,X_MASK|V_MASK|IMAGE_MASK);
 }
@@ -424,7 +422,6 @@ void DomainKokkos::remap_all()
 
   copymode = 1;
   Kokkos::parallel_for(Kokkos::RangePolicy<LMPDeviceType, TagDomain_remap_all>(0,nlocal),*this);
-  LMPDeviceType::fence();
   copymode = 0;
 
   atomKK->modified(Device,X_MASK | IMAGE_MASK);
@@ -528,7 +525,6 @@ void DomainKokkos::image_flip(int m_in, int n_in, int p_in)
 
   copymode = 1;
   Kokkos::parallel_for(Kokkos::RangePolicy<LMPDeviceType, TagDomain_image_flip>(0,nlocal),*this);
-  LMPDeviceType::fence();
   copymode = 0;
 
   atomKK->modified(Device,IMAGE_MASK);
@@ -561,7 +557,6 @@ void DomainKokkos::lamda2x(int n)
 
   copymode = 1;
   Kokkos::parallel_for(Kokkos::RangePolicy<LMPDeviceType, TagDomain_lamda2x>(0,n),*this);
-  LMPDeviceType::fence();
   copymode = 0;
 
   atomKK->modified(Device,X_MASK);
@@ -587,7 +582,6 @@ void DomainKokkos::x2lamda(int n)
 
   copymode = 1;
   Kokkos::parallel_for(Kokkos::RangePolicy<LMPDeviceType, TagDomain_x2lamda>(0,n),*this);
-  LMPDeviceType::fence();
   copymode = 0;
 
   atomKK->modified(Device,X_MASK);
diff --git a/src/KOKKOS/nbin_kokkos.cpp b/src/KOKKOS/nbin_kokkos.cpp
index 5e41787247..c7e815928a 100644
--- a/src/KOKKOS/nbin_kokkos.cpp
+++ b/src/KOKKOS/nbin_kokkos.cpp
@@ -95,7 +95,6 @@ void NBinKokkos<DeviceType>::bin_atoms()
     MemsetZeroFunctor<DeviceType> f_zero;
     f_zero.ptr = (void*) k_bincount.view<DeviceType>().ptr_on_device();
     Kokkos::parallel_for(mbins, f_zero);
-    DeviceType::fence();
 
     atomKK->sync(ExecutionSpaceFromDevice<DeviceType>::space,X_MASK);
     x = atomKK->k_x.view<DeviceType>();
@@ -106,7 +105,6 @@ void NBinKokkos<DeviceType>::bin_atoms()
     NPairKokkosBinAtomsFunctor<DeviceType> f(*this);
 
     Kokkos::parallel_for(atom->nlocal+atom->nghost, f);
-    DeviceType::fence();
 
     deep_copy(h_resize, d_resize);
     if(h_resize()) {
diff --git a/src/KOKKOS/nbin_ssa_kokkos.cpp b/src/KOKKOS/nbin_ssa_kokkos.cpp
index 883ba25b24..ab97cb5848 100644
--- a/src/KOKKOS/nbin_ssa_kokkos.cpp
+++ b/src/KOKKOS/nbin_ssa_kokkos.cpp
@@ -152,7 +152,6 @@ void NBinSSAKokkos<DeviceType>::bin_atoms()
     for (int i = 0; i < 8; i++) k_gbincount.h_view(i) = 0;
     k_gbincount.modify<LMPHostType>();
     k_gbincount.sync<DeviceType>();
-    DeviceType::fence(); // FIXME?
     ghosts_per_gbin = 0;
     NPairSSAKokkosBinIDGhostsFunctor<DeviceType> f(*this);
     Kokkos::parallel_reduce(Kokkos::RangePolicy<LMPDeviceType>(nlocal,nall), f, ghosts_per_gbin);
@@ -167,7 +166,6 @@ void NBinSSAKokkos<DeviceType>::bin_atoms()
     for (int i = 0; i < 8; i++) k_gbincount.h_view(i) = 0;
     k_gbincount.modify<LMPHostType>();
     k_gbincount.sync<DeviceType>();
-    DeviceType::fence(); // FIXME?
 
     auto binID_ = binID;
     auto gbincount_ = gbincount;
@@ -198,7 +196,6 @@ void NBinSSAKokkos<DeviceType>::bin_atoms()
     MemsetZeroFunctor<DeviceType> f_zero;
     f_zero.ptr = (void*) k_bincount.view<DeviceType>().ptr_on_device();
     Kokkos::parallel_for(mbins, f_zero);
-    DeviceType::fence();
 
     auto bincount_ = bincount;
     auto bins_ = bins;
@@ -210,7 +207,6 @@ void NBinSSAKokkos<DeviceType>::bin_atoms()
       LAMMPS_LAMBDA (const int i) {
       sortBin(bincount_, bins_, i);
     });
-    DeviceType::fence();
   }
   k_bins.modify<DeviceType>();
   k_bincount.modify<DeviceType>();
diff --git a/src/KOKKOS/neigh_bond_kokkos.cpp b/src/KOKKOS/neigh_bond_kokkos.cpp
index a8c230fa59..a674e7cec4 100644
--- a/src/KOKKOS/neigh_bond_kokkos.cpp
+++ b/src/KOKKOS/neigh_bond_kokkos.cpp
@@ -274,7 +274,6 @@ void NeighBondKokkos<DeviceType>::bond_all()
     k_fail_flag.template sync<DeviceType>();
 
     Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagNeighBondBondAll>(0,nlocal),*this,nmissing);
-    DeviceType::fence();
 
     k_nlist.template modify<DeviceType>();
     k_nlist.template sync<LMPHostType>();
@@ -370,7 +369,6 @@ void NeighBondKokkos<DeviceType>::bond_partial()
     k_fail_flag.template sync<DeviceType>();
 
     Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagNeighBondBondPartial>(0,nlocal),*this,nmissing);
-    DeviceType::fence();
 
     k_nlist.template modify<DeviceType>();
     k_nlist.template sync<LMPHostType>();
@@ -443,7 +441,6 @@ void NeighBondKokkos<DeviceType>::bond_check()
   k_bondlist.sync<DeviceType>();
 
   Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagNeighBondBondCheck>(0,neighbor->nbondlist),*this,flag);
-  DeviceType::fence();
 
   int flag_all;
   MPI_Allreduce(&flag,&flag_all,1,MPI_INT,MPI_SUM,world);
@@ -494,7 +491,6 @@ void NeighBondKokkos<DeviceType>::angle_all()
     k_fail_flag.template sync<DeviceType>();
 
     Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagNeighBondAngleAll>(0,nlocal),*this,nmissing);
-    DeviceType::fence();
 
     k_nlist.template modify<DeviceType>();
     k_nlist.template sync<LMPHostType>();
@@ -597,7 +593,6 @@ void NeighBondKokkos<DeviceType>::angle_partial()
     k_fail_flag.template sync<DeviceType>();
 
     Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagNeighBondAnglePartial>(0,nlocal),*this,nmissing);
-    DeviceType::fence();
 
     k_nlist.template modify<DeviceType>();
     k_nlist.template sync<LMPHostType>();
@@ -678,7 +673,6 @@ void NeighBondKokkos<DeviceType>::angle_check()
   k_anglelist.sync<DeviceType>();
 
   Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagNeighBondAngleCheck>(0,neighbor->nanglelist),*this,flag);
-  DeviceType::fence();
 
   int flag_all;
   MPI_Allreduce(&flag,&flag_all,1,MPI_INT,MPI_SUM,world);
@@ -741,7 +735,6 @@ void NeighBondKokkos<DeviceType>::dihedral_all()
     k_fail_flag.template sync<DeviceType>();
 
     Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagNeighBondDihedralAll>(0,nlocal),*this,nmissing);
-    DeviceType::fence();
 
     k_nlist.template modify<DeviceType>();
     k_nlist.template sync<LMPHostType>();
@@ -849,7 +842,6 @@ void NeighBondKokkos<DeviceType>::dihedral_partial()
     k_fail_flag.template sync<DeviceType>();
 
     Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagNeighBondDihedralPartial>(0,nlocal),*this,nmissing);
-    DeviceType::fence();
 
     k_nlist.template modify<DeviceType>();
     k_nlist.template sync<LMPHostType>();
@@ -935,7 +927,6 @@ void NeighBondKokkos<DeviceType>::dihedral_check(int nlist, typename AT::t_int_2
   k_dihedrallist.sync<DeviceType>();
 
   Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagNeighBondDihedralCheck>(0,nlist),*this,flag);
-  DeviceType::fence();
 
   int flag_all;
   MPI_Allreduce(&flag,&flag_all,1,MPI_INT,MPI_SUM,world);
@@ -1015,7 +1006,6 @@ void NeighBondKokkos<DeviceType>::improper_all()
     k_fail_flag.template sync<DeviceType>();
 
     Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagNeighBondImproperAll>(0,nlocal),*this,nmissing);
-    DeviceType::fence();
 
     k_nlist.template modify<DeviceType>();
     k_nlist.template sync<LMPHostType>();
@@ -1123,7 +1113,6 @@ void NeighBondKokkos<DeviceType>::improper_partial()
     k_fail_flag.template sync<DeviceType>();
 
     Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagNeighBondImproperPartial>(0,nlocal),*this,nmissing);
-    DeviceType::fence();
 
     k_nlist.template modify<DeviceType>();
     k_nlist.template sync<LMPHostType>();
diff --git a/src/KOKKOS/neighbor_kokkos.cpp b/src/KOKKOS/neighbor_kokkos.cpp
index 8eda7ee55c..9a40808052 100644
--- a/src/KOKKOS/neighbor_kokkos.cpp
+++ b/src/KOKKOS/neighbor_kokkos.cpp
@@ -206,7 +206,6 @@ int NeighborKokkos::check_distance_kokkos()
   int flag = 0;
   copymode = 1;
   Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagNeighborCheckDistance<DeviceType> >(0,nlocal),*this,flag);
-  DeviceType::fence();
   copymode = 0;
 
   int flagall;
@@ -273,7 +272,6 @@ void NeighborKokkos::build_kokkos(int topoflag)
     }
     copymode = 1;
     Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagNeighborXhold<DeviceType> >(0,nlocal),*this);
-    DeviceType::fence();
     copymode = 0;
     xhold.modify<DeviceType>();
     if (boxcheck) {
diff --git a/src/KOKKOS/npair_kokkos.cpp b/src/KOKKOS/npair_kokkos.cpp
index d5ea8376f6..b568bd5c93 100644
--- a/src/KOKKOS/npair_kokkos.cpp
+++ b/src/KOKKOS/npair_kokkos.cpp
@@ -214,7 +214,6 @@ void NPairKokkos<DeviceType,HALF_NEIGH,GHOST,TRI>::build(NeighList *list_)
 #endif
       }
     }
-    DeviceType::fence();
     deep_copy(data.h_resize, data.resize);
 
     if(data.h_resize()) {
diff --git a/src/KOKKOS/npair_ssa_kokkos.cpp b/src/KOKKOS/npair_ssa_kokkos.cpp
index aec482993d..b73e54e33f 100644
--- a/src/KOKKOS/npair_ssa_kokkos.cpp
+++ b/src/KOKKOS/npair_ssa_kokkos.cpp
@@ -480,7 +480,6 @@ fprintf(stdout, "tota%03d total %3d could use %6d inums, expected %6d inums. inu
       h_ssa_gitemLen(ssa_gphaseCt-1,h_ssa_gphaseLen(ssa_gphaseCt-1)-1) - data.neigh_list.inum;
     firstTry = false;
 
-    DeviceType::fence();
     deep_copy(data.h_resize, data.resize);
 
     if(data.h_resize()) {
diff --git a/src/KOKKOS/region_block_kokkos.cpp b/src/KOKKOS/region_block_kokkos.cpp
index 90fd47ab06..eed4272f23 100644
--- a/src/KOKKOS/region_block_kokkos.cpp
+++ b/src/KOKKOS/region_block_kokkos.cpp
@@ -67,7 +67,6 @@ void RegBlockKokkos<DeviceType>::match_all_kokkos(int groupbit_in, DAT::tdual_in
 
   copymode = 1;
   Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagRegBlockMatchAll>(0,nlocal),*this);
-  DeviceType::fence();
   copymode = 0;
 
   k_match_in.template modify<DeviceType>();

From c8f92c1a617a16dafb010458bcb8d7711a1d7b73 Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Tue, 5 Sep 2017 16:42:58 -0400
Subject: [PATCH 265/267] add a couple deleted files from USER-DPD to
 Purge.list

---
 src/Purge.list | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/Purge.list b/src/Purge.list
index 15d20fc71a..d6f5010d49 100644
--- a/src/Purge.list
+++ b/src/Purge.list
@@ -16,6 +16,9 @@ style_region.h
 style_neigh_bin.h
 style_neigh_pair.h
 style_neigh_stencil.h
+# deleted on 5 September 2017
+npair_halffull_newton_ssa.cpp
+npair_halffull_newton_ssa.f
 # deleted on 6 June 2017
 pair_lj_sf.cpp
 pair_lj_sf.h

From 4c5d901e2b0489ae8cb23a5d320ead5803c120ba Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Tue, 5 Sep 2017 16:45:03 -0400
Subject: [PATCH 266/267] fix stupid typo (too much compiling of fortran
 codes...)

---
 src/Purge.list | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Purge.list b/src/Purge.list
index d6f5010d49..4ccde5f4b5 100644
--- a/src/Purge.list
+++ b/src/Purge.list
@@ -18,7 +18,7 @@ style_neigh_pair.h
 style_neigh_stencil.h
 # deleted on 5 September 2017
 npair_halffull_newton_ssa.cpp
-npair_halffull_newton_ssa.f
+npair_halffull_newton_ssa.h
 # deleted on 6 June 2017
 pair_lj_sf.cpp
 pair_lj_sf.h

From 0248a7b98e9eac8ef560b70141da7a50cf5f779a Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Wed, 6 Sep 2017 09:24:05 -0500
Subject: [PATCH 267/267] remove duplicate listing of deleted USER-DPD files
 from Purge.list

---
 src/Purge.list | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/Purge.list b/src/Purge.list
index 4ccde5f4b5..315e5e3424 100644
--- a/src/Purge.list
+++ b/src/Purge.list
@@ -45,9 +45,6 @@ fix_reax_c_bonds_kokkos.cpp
 fix_reax_c_bonds_kokkos.h
 fix_reax_c_species_kokkos.cpp
 fix_reax_c_species_kokkos.h
-# deleted on 01 Mar 2017
-npair_halffull_newton_ssa.cpp
-npair_halffull_newton_ssa.h
 # deleted on 19 April 2017
 vmdplugin.h
 molfile_plugin.h