From 8b76e47d6bb084cbef50332ef1af0caad5d75854 Mon Sep 17 00:00:00 2001 From: Steve Plimpton Date: Wed, 5 Aug 2020 16:44:56 -0600 Subject: [PATCH 01/38] support for tiled decompositions in PPPM --- src/KSPACE/gridcomm2.cpp | 1093 ++++++++++++ src/KSPACE/gridcomm2.h | 201 +++ src/KSPACE/pppm2.cpp | 3524 ++++++++++++++++++++++++++++++++++++++ src/KSPACE/pppm2.h | 360 ++++ src/force.cpp | 6 +- src/kspace.h | 5 + 6 files changed, 5186 insertions(+), 3 deletions(-) create mode 100644 src/KSPACE/gridcomm2.cpp create mode 100644 src/KSPACE/gridcomm2.h create mode 100644 src/KSPACE/pppm2.cpp create mode 100644 src/KSPACE/pppm2.h diff --git a/src/KSPACE/gridcomm2.cpp b/src/KSPACE/gridcomm2.cpp new file mode 100644 index 0000000000..ce9a1e7568 --- /dev/null +++ b/src/KSPACE/gridcomm2.cpp @@ -0,0 +1,1093 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#include "gridcomm2.h" +#include +#include "comm.h" +#include "kspace.h" +#include "irregular.h" +#include "memory.h" + +using namespace LAMMPS_NS; + +enum{REGULAR,TILED}; + +#define SWAPDELTA 8 + +// NOTE: gridcomm needs to be world for TILED, will it work with MSM? +// NOTE: Tiled implementation here only works for RCB, not general tiled + +/* ---------------------------------------------------------------------- + gcomm = MPI communicator that shares this grid + does not have to be world, see MSM + gn xyz = size of global grid + i xyz lohi = portion of global grid this proc owns, 0 <= index < N + o xyz lohi = owned grid portion + ghost grid cells needed in all directions + if o indices are < 0 or hi indices are >= N, + then grid is treated as periodic in that dimension, + communication is done across the periodic boundaries +------------------------------------------------------------------------- */ + +GridComm2::GridComm2(LAMMPS *lmp, MPI_Comm gcomm, + int gnx, int gny, int gnz, + int ixlo, int ixhi, int iylo, int iyhi, int izlo, int izhi, + int oxlo, int oxhi, int oylo, int oyhi, int ozlo, int ozhi) + : Pointers(lmp) +{ + gridcomm = gcomm; + MPI_Comm_rank(gridcomm,&me); + MPI_Comm_size(gridcomm,&nprocs); + + nx = gnx; + ny = gny; + nz = gnz; + + inxlo = ixlo; + inxhi = ixhi; + inylo = iylo; + inyhi = iyhi; + inzlo = izlo; + inzhi = izhi; + + outxlo = oxlo; + outxhi = oxhi; + outylo = oylo; + outyhi = oyhi; + outzlo = ozlo; + outzhi = ozhi; + + // layout == REGULAR or TILED + // for REGULAR, proc xyz lohi = my 6 neighbor procs + + layout = REGULAR; + if (comm->layout == Comm::LAYOUT_TILED) layout = TILED; + + outxlo_max = oxlo; + outxhi_max = oxhi; + outylo_max = oylo; + outyhi_max = oyhi; + outzlo_max = ozlo; + outzhi_max = ozhi; + + if (layout == REGULAR) { + int (*procneigh)[2] = comm->procneigh; + + procxlo = procneigh[0][0]; + procxhi = procneigh[0][1]; + procylo = procneigh[1][0]; + procyhi = procneigh[1][1]; + proczlo = procneigh[2][0]; + proczhi = procneigh[2][1]; + } + + nswap = maxswap = 0; + swap = NULL; + + nsend = nrecv = ncopy = 0; + send = NULL; + recv = NULL; + copy 
= NULL; + requests = NULL; +} + +/* ---------------------------------------------------------------------- + same as first constructor except o xyz lohi max are added arguments + this is for case when caller stores grid in a larger array than o xyz lohi + only affects indices() method which generates indices into the caller's array +------------------------------------------------------------------------- */ + +GridComm2::GridComm2(LAMMPS *lmp, MPI_Comm gcomm, + int gnx, int gny, int gnz, + int ixlo, int ixhi, int iylo, int iyhi, int izlo, int izhi, + int oxlo, int oxhi, int oylo, int oyhi, int ozlo, int ozhi, + int oxlo_max, int oxhi_max, int oylo_max, int oyhi_max, + int ozlo_max, int ozhi_max) + : Pointers(lmp) +{ + gridcomm = gcomm; + MPI_Comm_rank(gridcomm,&me); + MPI_Comm_size(gridcomm,&nprocs); + + nx = gnx; + ny = gny; + nz = gnz; + + inxlo = ixlo; + inxhi = ixhi; + inylo = iylo; + inyhi = iyhi; + inzlo = izlo; + inzhi = izhi; + + outxlo = oxlo; + outxhi = oxhi; + outylo = oylo; + outyhi = oyhi; + outzlo = ozlo; + outzhi = ozhi; + + outxlo_max = oxlo_max; + outxhi_max = oxhi_max; + outylo_max = oylo_max; + outyhi_max = oyhi_max; + outzlo_max = ozlo_max; + outzhi_max = ozhi_max; + + // layout == REGULAR or TILED + // for REGULAR, proc xyz lohi = my 6 neighbor procs + + layout = REGULAR; + if (comm->layout == Comm::LAYOUT_TILED) layout = TILED; + + if (layout == REGULAR) { + int (*procneigh)[2] = comm->procneigh; + + procxlo = procneigh[0][0]; + procxhi = procneigh[0][1]; + procylo = procneigh[1][0]; + procyhi = procneigh[1][1]; + proczlo = procneigh[2][0]; + proczhi = procneigh[2][1]; + } + + nswap = maxswap = 0; + swap = NULL; + + nsend = nrecv = ncopy = 0; + send = NULL; + recv = NULL; + copy = NULL; + requests = NULL; +} + +/* ---------------------------------------------------------------------- */ + +GridComm2::~GridComm2() +{ + // regular comm data struct + + for (int i = 0; i < nswap; i++) { + memory->destroy(swap[i].packlist); + memory->destroy(swap[i].unpacklist); + } + memory->sfree(swap); + + // tiled comm data structs + + for (int i = 0; i < nsend; i++) + memory->destroy(send[i].packlist); + memory->sfree(send); + + for (int i = 0; i < nrecv; i++) + memory->destroy(recv[i].unpacklist); + memory->sfree(recv); + + for (int i = 0; i < ncopy; i++) { + memory->destroy(copy[i].packlist); + memory->destroy(copy[i].unpacklist); + } + memory->sfree(copy); + + delete [] requests; +} + +/* ---------------------------------------------------------------------- */ + +void GridComm2::setup(int &nbuf1, int &nbuf2) +{ + if (layout == REGULAR) setup_regular(nbuf1,nbuf2); + else setup_tiled(nbuf1,nbuf2); +} + +/* ---------------------------------------------------------------------- */ + +void GridComm2::setup_regular(int &nbuf1, int &nbuf2) +{ + int nsent,sendfirst,sendlast,recvfirst,recvlast; + int sendplanes,recvplanes; + int notdoneme,notdone; + + // notify 6 neighbor procs how many ghost grid planes I need from them + // ghost xyz lo = # of my lower grid planes that proc xyz lo needs as its ghosts + // ghost xyz hi = # of my upper grid planes that proc xyz hi needs as its ghosts + // if this proc is its own neighbor across periodic bounary, value is from self + + int nplanes = inxlo - outxlo; + if (procxlo != me) + MPI_Sendrecv(&nplanes,1,MPI_INT,procxlo,0, + &ghostxhi,1,MPI_INT,procxhi,0,gridcomm,MPI_STATUS_IGNORE); + else ghostxhi = nplanes; + + nplanes = outxhi - inxhi; + if (procxhi != me) + MPI_Sendrecv(&nplanes,1,MPI_INT,procxhi,0, + 
&ghostxlo,1,MPI_INT,procxlo,0,gridcomm,MPI_STATUS_IGNORE); + else ghostxlo = nplanes; + + nplanes = inylo - outylo; + if (procylo != me) + MPI_Sendrecv(&nplanes,1,MPI_INT,procylo,0, + &ghostyhi,1,MPI_INT,procyhi,0,gridcomm,MPI_STATUS_IGNORE); + else ghostyhi = nplanes; + + nplanes = outyhi - inyhi; + if (procyhi != me) + MPI_Sendrecv(&nplanes,1,MPI_INT,procyhi,0, + &ghostylo,1,MPI_INT,procylo,0,gridcomm,MPI_STATUS_IGNORE); + else ghostylo = nplanes; + + nplanes = inzlo - outzlo; + if (proczlo != me) + MPI_Sendrecv(&nplanes,1,MPI_INT,proczlo,0, + &ghostzhi,1,MPI_INT,proczhi,0,gridcomm,MPI_STATUS_IGNORE); + else ghostzhi = nplanes; + + nplanes = outzhi - inzhi; + if (proczhi != me) + MPI_Sendrecv(&nplanes,1,MPI_INT,proczhi,0, + &ghostzlo,1,MPI_INT,proczlo,0,gridcomm,MPI_STATUS_IGNORE); + else ghostzlo = nplanes; + + // setup swaps = exchange of grid data with one of 6 neighobr procs + // can be more than one in a direction if ghost region extends beyond neigh proc + // all procs have same swap count, but swapsize npack/nunpack can be empty + + nswap = 0; + + // send own grid pts to -x processor, recv ghost grid pts from +x processor + + nsent = 0; + sendfirst = inxlo; + sendlast = inxhi; + recvfirst = inxhi+1; + notdone = 1; + + while (notdone) { + if (nswap == maxswap) grow_swap(); + + swap[nswap].sendproc = procxlo; + swap[nswap].recvproc = procxhi; + sendplanes = MIN(sendlast-sendfirst+1,ghostxlo-nsent); + swap[nswap].npack = + indices(swap[nswap].packlist, + sendfirst,sendfirst+sendplanes-1,inylo,inyhi,inzlo,inzhi); + + if (procxlo != me) + MPI_Sendrecv(&sendplanes,1,MPI_INT,procxlo,0, + &recvplanes,1,MPI_INT,procxhi,0,gridcomm,MPI_STATUS_IGNORE); + else recvplanes = sendplanes; + + swap[nswap].nunpack = + indices(swap[nswap].unpacklist, + recvfirst,recvfirst+recvplanes-1,inylo,inyhi,inzlo,inzhi); + + nsent += sendplanes; + sendfirst += sendplanes; + sendlast += recvplanes; + recvfirst += recvplanes; + nswap++; + + if (nsent < ghostxlo) notdoneme = 1; + else notdoneme = 0; + MPI_Allreduce(¬doneme,¬done,1,MPI_INT,MPI_SUM,gridcomm); + } + + // send own grid pts to +x processor, recv ghost grid pts from -x processor + + nsent = 0; + sendfirst = inxlo; + sendlast = inxhi; + recvlast = inxlo-1; + notdone = 1; + + while (notdone) { + if (nswap == maxswap) grow_swap(); + + swap[nswap].sendproc = procxhi; + swap[nswap].recvproc = procxlo; + sendplanes = MIN(sendlast-sendfirst+1,ghostxhi-nsent); + swap[nswap].npack = + indices(swap[nswap].packlist, + sendlast-sendplanes+1,sendlast,inylo,inyhi,inzlo,inzhi); + + if (procxhi != me) + MPI_Sendrecv(&sendplanes,1,MPI_INT,procxhi,0, + &recvplanes,1,MPI_INT,procxlo,0,gridcomm,MPI_STATUS_IGNORE); + else recvplanes = sendplanes; + + swap[nswap].nunpack = + indices(swap[nswap].unpacklist, + recvlast-recvplanes+1,recvlast,inylo,inyhi,inzlo,inzhi); + + nsent += sendplanes; + sendfirst -= recvplanes; + sendlast -= sendplanes; + recvlast -= recvplanes; + nswap++; + + if (nsent < ghostxhi) notdoneme = 1; + else notdoneme = 0; + MPI_Allreduce(¬doneme,¬done,1,MPI_INT,MPI_SUM,gridcomm); + } + + // send own grid pts to -y processor, recv ghost grid pts from +y processor + + nsent = 0; + sendfirst = inylo; + sendlast = inyhi; + recvfirst = inyhi+1; + notdone = 1; + + while (notdone) { + if (nswap == maxswap) grow_swap(); + + swap[nswap].sendproc = procylo; + swap[nswap].recvproc = procyhi; + sendplanes = MIN(sendlast-sendfirst+1,ghostylo-nsent); + swap[nswap].npack = + indices(swap[nswap].packlist, + outxlo,outxhi,sendfirst,sendfirst+sendplanes-1,inzlo,inzhi); + + if 
(procylo != me) + MPI_Sendrecv(&sendplanes,1,MPI_INT,procylo,0, + &recvplanes,1,MPI_INT,procyhi,0,gridcomm,MPI_STATUS_IGNORE); + else recvplanes = sendplanes; + + swap[nswap].nunpack = + indices(swap[nswap].unpacklist, + outxlo,outxhi,recvfirst,recvfirst+recvplanes-1,inzlo,inzhi); + + nsent += sendplanes; + sendfirst += sendplanes; + sendlast += recvplanes; + recvfirst += recvplanes; + nswap++; + + if (nsent < ghostylo) notdoneme = 1; + else notdoneme = 0; + MPI_Allreduce(¬doneme,¬done,1,MPI_INT,MPI_SUM,gridcomm); + } + + // send own grid pts to +y processor, recv ghost grid pts from -y processor + + nsent = 0; + sendfirst = inylo; + sendlast = inyhi; + recvlast = inylo-1; + notdone = 1; + + while (notdone) { + if (nswap == maxswap) grow_swap(); + + swap[nswap].sendproc = procyhi; + swap[nswap].recvproc = procylo; + sendplanes = MIN(sendlast-sendfirst+1,ghostyhi-nsent); + swap[nswap].npack = + indices(swap[nswap].packlist, + outxlo,outxhi,sendlast-sendplanes+1,sendlast,inzlo,inzhi); + + if (procyhi != me) + MPI_Sendrecv(&sendplanes,1,MPI_INT,procyhi,0, + &recvplanes,1,MPI_INT,procylo,0,gridcomm,MPI_STATUS_IGNORE); + else recvplanes = sendplanes; + + swap[nswap].nunpack = + indices(swap[nswap].unpacklist, + outxlo,outxhi,recvlast-recvplanes+1,recvlast,inzlo,inzhi); + + nsent += sendplanes; + sendfirst -= recvplanes; + sendlast -= sendplanes; + recvlast -= recvplanes; + nswap++; + + if (nsent < ghostyhi) notdoneme = 1; + else notdoneme = 0; + MPI_Allreduce(¬doneme,¬done,1,MPI_INT,MPI_SUM,gridcomm); + } + + // send own grid pts to -z processor, recv ghost grid pts from +z processor + + nsent = 0; + sendfirst = inzlo; + sendlast = inzhi; + recvfirst = inzhi+1; + notdone = 1; + + while (notdone) { + if (nswap == maxswap) grow_swap(); + + swap[nswap].sendproc = proczlo; + swap[nswap].recvproc = proczhi; + sendplanes = MIN(sendlast-sendfirst+1,ghostzlo-nsent); + swap[nswap].npack = + indices(swap[nswap].packlist, + outxlo,outxhi,outylo,outyhi,sendfirst,sendfirst+sendplanes-1); + + if (proczlo != me) + MPI_Sendrecv(&sendplanes,1,MPI_INT,proczlo,0, + &recvplanes,1,MPI_INT,proczhi,0,gridcomm,MPI_STATUS_IGNORE); + else recvplanes = sendplanes; + + swap[nswap].nunpack = + indices(swap[nswap].unpacklist, + outxlo,outxhi,outylo,outyhi,recvfirst,recvfirst+recvplanes-1); + + nsent += sendplanes; + sendfirst += sendplanes; + sendlast += recvplanes; + recvfirst += recvplanes; + nswap++; + + if (nsent < ghostzlo) notdoneme = 1; + else notdoneme = 0; + MPI_Allreduce(¬doneme,¬done,1,MPI_INT,MPI_SUM,gridcomm); + } + + // send own grid pts to +z processor, recv ghost grid pts from -z processor + + nsent = 0; + sendfirst = inzlo; + sendlast = inzhi; + recvlast = inzlo-1; + notdone = 1; + + while (notdone) { + if (nswap == maxswap) grow_swap(); + + swap[nswap].sendproc = proczhi; + swap[nswap].recvproc = proczlo; + sendplanes = MIN(sendlast-sendfirst+1,ghostzhi-nsent); + swap[nswap].npack = + indices(swap[nswap].packlist, + outxlo,outxhi,outylo,outyhi,sendlast-sendplanes+1,sendlast); + + if (proczhi != me) + MPI_Sendrecv(&sendplanes,1,MPI_INT,proczhi,0, + &recvplanes,1,MPI_INT,proczlo,0,gridcomm,MPI_STATUS_IGNORE); + else recvplanes = sendplanes; + + swap[nswap].nunpack = + indices(swap[nswap].unpacklist, + outxlo,outxhi,outylo,outyhi,recvlast-recvplanes+1,recvlast); + + nsent += sendplanes; + sendfirst -= recvplanes; + sendlast -= sendplanes; + recvlast -= recvplanes; + nswap++; + + if (nsent < ghostzhi) notdoneme = 1; + else notdoneme = 0; + MPI_Allreduce(¬doneme,¬done,1,MPI_INT,MPI_SUM,gridcomm); + } + + // 
ngrid = max of any forward/reverse pack/unpack grid points + + int ngrid = 0; + for (int i = 0; i < nswap; i++) { + ngrid = MAX(ngrid,swap[i].npack); + ngrid = MAX(ngrid,swap[i].nunpack); + } + + nbuf1 = nbuf2 = ngrid; +} + +/* ---------------------------------------------------------------------- +------------------------------------------------------------------------- */ + +void GridComm2::setup_tiled(int &nbuf1, int &nbuf2) +{ + int i,m; + double xlo,xhi,ylo,yhi,zlo,zhi; + int ghostbox[6],pbc[3]; + + // setup RCB tree of cut info for grid + // access CommTiled to get cut dimension + // cut = this proc's inlo in that dim + // dim is -1 for proc 0, but never accessed + + rcbinfo = (RCBinfo *) + memory->smalloc(nprocs*sizeof(RCBinfo),"GridComm:rcbinfo"); + RCBinfo rcbone; + rcbone.dim = comm->rcbcutdim; + if (rcbone.dim <= 0) rcbone.cut = inxlo; + else if (rcbone.dim == 1) rcbone.cut = inylo; + else if (rcbone.dim == 2) rcbone.cut = inzlo; + MPI_Allgather(&rcbone,sizeof(RCBinfo),MPI_CHAR, + rcbinfo,sizeof(RCBinfo),MPI_CHAR,gridcomm); + + // find overlaps of my extended ghost box with all other procs + // accounts for crossings of periodic boundaries + // noverlap = # of overlaps, including self + // overlap = vector of overlap info using Overlap data struct + + ghostbox[0] = outxlo; + ghostbox[1] = outxhi; + ghostbox[2] = outylo; + ghostbox[3] = outyhi; + ghostbox[4] = outzlo; + ghostbox[5] = outzhi; + + pbc[0] = pbc[1] = pbc[2] = 0; + + memory->create(overlap_procs,nprocs,"GridComm:overlap_procs"); + noverlap = maxoverlap = 0; + overlap = NULL; + + ghost_box_drop(ghostbox,pbc); + + // send each proc an overlap message + // content: me, index of my overlap, box that overlaps with its owned cells + // ncopy = # of overlaps with myself, across a periodic boundary + + int *proclist; + memory->create(proclist,noverlap,"GridComm:proclist"); + srequest = (Request *) + memory->smalloc(noverlap*sizeof(Request),"GridComm:srequest"); + + int nsend_request = 0; + ncopy = 0; + + for (m = 0; m < noverlap; m++) { + if (overlap[m].proc == me) ncopy++; + else { + proclist[nsend_request] = overlap[m].proc; + srequest[nsend_request].sender = me; + srequest[nsend_request].index = m; + for (i = 0; i < 6; i++) + srequest[nsend_request].box[i] = overlap[m].box[i]; + nsend_request++; + } + } + + Irregular *irregular = new Irregular(lmp); + int nrecv_request = irregular->create_data(nsend_request,proclist,1); + Request *rrequest = + (Request *) memory->smalloc(nrecv_request*sizeof(Request),"GridComm:rrequest"); + irregular->exchange_data((char *) srequest,sizeof(Request),(char *) rrequest); + irregular->destroy_data(); + + // compute overlaps between received ghost boxes and my owned box + // overlap box used to setup my Send data struct and respond to requests + + send = (Send *) memory->smalloc(nrecv_request*sizeof(Send),"GridComm:send"); + sresponse = (Response *) + memory->smalloc(nrecv_request*sizeof(Response),"GridComm:sresponse"); + memory->destroy(proclist); + memory->create(proclist,nrecv_request,"GridComm:proclist"); + + for (m = 0; m < nrecv_request; m++) { + send[m].proc = rrequest[m].sender; + xlo = MAX(rrequest[m].box[0],inxlo); + xhi = MIN(rrequest[m].box[1],inxhi); + ylo = MAX(rrequest[m].box[2],inylo); + yhi = MIN(rrequest[m].box[3],inyhi); + zlo = MAX(rrequest[m].box[4],inzlo); + zhi = MIN(rrequest[m].box[5],inzhi); + send[m].npack = indices(send[m].packlist,xlo,xhi,ylo,yhi,zlo,zhi); + + proclist[m] = rrequest[m].sender; + sresponse[m].index = rrequest[m].index; + sresponse[m].box[0] = xlo; + 
sresponse[m].box[1] = xhi; + sresponse[m].box[2] = ylo; + sresponse[m].box[3] = yhi; + sresponse[m].box[4] = zlo; + sresponse[m].box[5] = zhi; + } + + nsend = nrecv_request; + + // reply to each Request message with a Response message + // content: index for the overlap on requestor, overlap box on my owned grid + + int nsend_response = nrecv_request; + int nrecv_response = irregular->create_data(nsend_response,proclist,1); + Response *rresponse = + (Response *) memory->smalloc(nrecv_response*sizeof(Response),"GridComm:rresponse"); + irregular->exchange_data((char *) sresponse,sizeof(Response),(char *) rresponse); + irregular->destroy_data(); + delete irregular; + + // process received responses + // box used to setup my Recv data struct after unwrapping via PBC + // adjacent = 0 if any box of ghost cells does not adjoin my owned cells + + recv = (Recv *) memory->smalloc(nrecv_response*sizeof(Recv),"CommGrid:recv"); + adjacent = 1; + + for (i = 0; i < nrecv_response; i++) { + m = rresponse[i].index; + recv[i].proc = overlap[m].proc; + xlo = rresponse[i].box[0] + overlap[m].pbc[0] * nx; + xhi = rresponse[i].box[1] + overlap[m].pbc[0] * nx; + ylo = rresponse[i].box[2] + overlap[m].pbc[1] * ny; + yhi = rresponse[i].box[3] + overlap[m].pbc[1] * ny; + zlo = rresponse[i].box[4] + overlap[m].pbc[2] * nz; + zhi = rresponse[i].box[5] + overlap[m].pbc[2] * nz; + recv[i].nunpack = indices(recv[i].unpacklist,xlo,xhi,ylo,yhi,zlo,zhi); + + if (xlo != inxhi+1 && xhi != inxlo-1 && + ylo != inyhi+1 && yhi != inylo-1 && + zlo != inzhi+1 && zhi != inzlo-1) adjacent = 0; + } + + nrecv = nrecv_response; + + // create Copy data struct from overlaps with self + + copy = (Copy *) memory->smalloc(ncopy*sizeof(Copy),"CommGrid:copy"); + + ncopy = 0; + for (m = 0; m < noverlap; m++) { + if (overlap[m].proc != me) continue; + xlo = overlap[m].box[0]; + xhi = overlap[m].box[1]; + ylo = overlap[m].box[2]; + yhi = overlap[m].box[3]; + zlo = overlap[m].box[4]; + zhi = overlap[m].box[5]; + copy[ncopy].npack = indices(copy[ncopy].packlist,xlo,xhi,ylo,yhi,zlo,zhi); + xlo = overlap[m].box[0] + overlap[m].pbc[0] * nx; + xhi = overlap[m].box[1] + overlap[m].pbc[0] * nx; + ylo = overlap[m].box[2] + overlap[m].pbc[1] * ny; + yhi = overlap[m].box[3] + overlap[m].pbc[1] * ny; + zlo = overlap[m].box[4] + overlap[m].pbc[2] * nz; + zhi = overlap[m].box[5] + overlap[m].pbc[2] * nz; + copy[ncopy].nunpack = indices(copy[ncopy].unpacklist,xlo,xhi,ylo,yhi,zlo,zhi); + ncopy++; + } + + // set offsets for received data + + int offset = 0; + for (m = 0; m < nsend; m++) { + send[m].offset = offset; + offset += send[m].npack; + } + + offset = 0; + for (m = 0; m < nrecv; m++) { + recv[m].offset = offset; + offset += recv[m].nunpack; + } + + // length of MPI requests vector is max of nsend, nrecv + + int nrequest = MAX(nsend,nrecv); + requests = new MPI_Request[nrequest]; + + // clean-up + + memory->sfree(rcbinfo); + memory->destroy(proclist); + memory->destroy(overlap_procs); + memory->sfree(overlap); + memory->sfree(srequest); + memory->sfree(rrequest); + memory->sfree(sresponse); + memory->sfree(rresponse); + + // nbuf1 = largest pack or unpack in any Send or Recv or Copy + // nbuf2 = larget of sum of all packs or unpacks in Send or Recv + + nbuf1 = 0; + + for (m = 0; m < ncopy; m++) { + nbuf1 = MAX(nbuf1,copy[m].npack); + nbuf1 = MAX(nbuf1,copy[m].nunpack); + } + + int nbufs = 0; + for (m = 0; m < nsend; m++) { + nbuf1 = MAX(nbuf1,send[m].npack); + nbufs += send[m].npack; + } + + int nbufr = 0; + for (m = 0; m < nrecv; m++) { + nbuf1 = 
MAX(nbuf1,recv[m].nunpack); + nbufr += recv[m].nunpack; + } + + nbuf2 = MAX(nbufs,nbufr); +} + +/* ---------------------------------------------------------------------- +------------------------------------------------------------------------- */ + +void GridComm2::ghost_box_drop(int *box, int *pbc) +{ + int i,m; + + // newbox12 and newpbc are initially copies of caller box and pbc + + int newbox1[6],newbox2[6],newpbc[3]; + + for (i = 0; i < 6; i++) newbox1[i] = newbox2[i] = box[i]; + for (i = 0; i < 3; i++) newpbc[i] = pbc[i]; + + // 6 if tests to see if box needs to be split across a periodic boundary + // final else is no split + + int splitflag = 1; + + if (box[0] < 0) { + newbox1[0] = 0; + newbox2[0] = box[0] + nx; + newbox2[1] = nx - 1; + newpbc[0]--; + } else if (box[1] >= nx) { + newbox1[1] = nx - 1; + newbox2[0] = 0; + newbox2[1] = box[1] - nx; + newpbc[0]++; + } else if (box[2] < 0) { + newbox1[2] = 0; + newbox2[2] = box[2] + ny; + newbox2[3] = ny - 1; + newpbc[1]--; + } else if (box[3] >= ny) { + newbox1[3] = ny - 1; + newbox2[2] = 0; + newbox2[3] = box[3] - ny; + newpbc[1]++; + } else if (box[4] < 0) { + newbox1[4] = 0; + newbox2[4] = box[4] + nz; + newbox2[5] = nz - 1; + newpbc[2]--; + } else if (box[5] >= nz) { + newbox1[5] = nz - 1; + newbox2[4] = 0; + newbox2[5] = box[5] - nz; + newpbc[2]++; + + // box is not split, drop on RCB tree + // returns nprocs = # of procs it overlaps, including self + // returns proc_overlap = list of proc IDs it overlaps + // skip self overlap if no crossing of periodic boundaries + + } else { + splitflag = 0; + int np = 0; + box_drop_grid(box,0,nprocs-1,np,overlap_procs); + for (m = 0; m < np; m++) { + if (noverlap == maxoverlap) grow_overlap(); + if (overlap_procs[m] == me && + pbc[0] == 0 && pbc[1] == 0 && pbc[2] == 0) continue; + overlap[noverlap].proc = overlap_procs[m]; + for (i = 0; i < 6; i++) overlap[noverlap].box[i] = box[i]; + for (i = 0; i < 3; i++) overlap[noverlap].pbc[i] = pbc[i]; + noverlap++; + } + } + + // recurse with 2 split boxes + + if (splitflag) { + ghost_box_drop(newbox1,pbc); + ghost_box_drop(newbox2,newpbc); + } +} + +/* ---------------------------------------------------------------------- +------------------------------------------------------------------------- */ + +void GridComm2::box_drop_grid(int *box, int proclower, int procupper, + int &np, int *plist) +{ + // end recursion when partition is a single proc + // add proclower to plist + + if (proclower == procupper) { + plist[np++] = proclower; + return; + } + + // drop box on each side of cut it extends beyond + // use < and >= criteria so does not include a box it only touches + // procmid = 1st processor in upper half of partition + // = location in tree that stores this cut + // cut = index of first grid cell in upper partition + // dim = 0,1,2 dimension of cut + + int procmid = proclower + (procupper - proclower) / 2 + 1; + int dim = rcbinfo[procmid].dim; + int cut = rcbinfo[procmid].cut; + + if (box[2*dim] < cut) box_drop_grid(box,proclower,procmid-1,np,plist); + if (box[2*dim+1] >= cut) box_drop_grid(box,procmid,procupper,np,plist); +} + +/* ---------------------------------------------------------------------- + check if all procs only need ghost info from adjacent procs + return 1 if yes, 0 if no +------------------------------------------------------------------------- */ + +int GridComm2::ghost_adjacent() +{ + if (layout == REGULAR) return ghost_adjacent_regular(); + return ghost_adjacent_tiled(); +} + +/* 
---------------------------------------------------------------------- + adjacent = 0 if a proc's ghost xyz lohi values exceed its subdomain size + return 0 if adjacent=0 for any proc, else 1 +------------------------------------------------------------------------- */ + +int GridComm2::ghost_adjacent_regular() +{ + adjacent = 1; + if (ghostxlo > inxhi-inxlo+1) adjacent = 0; + if (ghostxhi > inxhi-inxlo+1) adjacent = 0; + if (ghostylo > inyhi-inylo+1) adjacent = 0; + if (ghostyhi > inyhi-inylo+1) adjacent = 0; + if (ghostzlo > inzhi-inzlo+1) adjacent = 0; + if (ghostzhi > inzhi-inzlo+1) adjacent = 0; + + int adjacent_all; + MPI_Allreduce(&adjacent,&adjacent_all,1,MPI_INT,MPI_MIN,gridcomm); + return adjacent_all; +} + +/* ---------------------------------------------------------------------- + adjacent = 0 if a proc's received ghosts were flagged + as non-adjacent in setup_tiled() + return 0 if adjacent=0 for any proc, else 1 +------------------------------------------------------------------------- */ + +int GridComm2::ghost_adjacent_tiled() +{ + int adjacent_all; + MPI_Allreduce(&adjacent,&adjacent_all,1,MPI_INT,MPI_MIN,gridcomm); + return adjacent_all; +} + +/* ---------------------------------------------------------------------- + use swap list in forward order to acquire copy of all needed ghost grid pts +------------------------------------------------------------------------- */ + +void GridComm2::forward_comm(KSpace *kspace, int nper, int nbyte, int which, + void *buf1, void *buf2, MPI_Datatype datatype) +{ + if (layout == REGULAR) + forward_comm_regular(kspace,nper,nbyte,which,buf1,buf2,datatype); + else + forward_comm_tiled(kspace,nper,nbyte,which,buf1,buf2,datatype); +} + +/* ---------------------------------------------------------------------- */ + +void GridComm2::forward_comm_regular(KSpace *kspace, int nper, int nbyte, int which, + void *buf1, void *buf2, MPI_Datatype datatype) +{ + int m; + MPI_Request request; + + for (m = 0; m < nswap; m++) { + if (swap[m].sendproc == me) + kspace->pack_forward2(which,buf2,swap[m].npack,swap[m].packlist); + else + kspace->pack_forward2(which,buf1,swap[m].npack,swap[m].packlist); + + if (swap[m].sendproc != me) { + if (swap[m].nunpack) MPI_Irecv(buf2,nper*swap[m].nunpack,datatype, + swap[m].recvproc,0,gridcomm,&request); + if (swap[m].npack) MPI_Send(buf1,nper*swap[m].npack,datatype, + swap[m].sendproc,0,gridcomm); + if (swap[m].nunpack) MPI_Wait(&request,MPI_STATUS_IGNORE); + } + + kspace->unpack_forward2(which,buf2,swap[m].nunpack,swap[m].unpacklist); + } +} + +/* ---------------------------------------------------------------------- */ + +void GridComm2::forward_comm_tiled(KSpace *kspace, int nper, int nbyte, int which, + void *buf1, void *vbuf2, MPI_Datatype datatype) +{ + int i,m,offset; + + char *buf2 = (char *) vbuf2; + + // post all receives + + for (m = 0; m < nrecv; m++) { + offset = nper * recv[m].offset * nbyte; + MPI_Irecv((void *) &buf2[offset],nper*recv[m].nunpack,datatype, + recv[m].proc,0,gridcomm,&requests[m]); + } + + // perform all sends to other procs + + for (m = 0; m < nsend; m++) { + kspace->pack_forward2(which,buf1,send[m].npack,send[m].packlist); + MPI_Send(buf1,nper*send[m].npack,datatype,send[m].proc,0,gridcomm); + } + + // perform all copies to self + + for (m = 0; m < ncopy; m++) { + kspace->pack_forward2(which,buf1,copy[m].npack,copy[m].packlist); + kspace->unpack_forward2(which,buf1,copy[m].nunpack,copy[m].unpacklist); + } + + // unpack all received data + + for (i = 0; i < nrecv; i++) { + 
MPI_Waitany(nrecv,requests,&m,MPI_STATUS_IGNORE); + offset = nper * recv[m].offset * nbyte; + kspace->unpack_forward2(which,(void *) &buf2[offset], + recv[m].nunpack,recv[m].unpacklist); + } +} + +/* ---------------------------------------------------------------------- + use swap list in reverse order to compute fully summed value + for each owned grid pt that some other proc has copy of as a ghost grid pt +------------------------------------------------------------------------- */ + +void GridComm2::reverse_comm(KSpace *kspace, int nper, int nbyte, int which, + void *buf1, void *buf2, MPI_Datatype datatype) +{ + if (layout == REGULAR) + reverse_comm_regular(kspace,nper,nbyte,which,buf1,buf2,datatype); + else + reverse_comm_tiled(kspace,nper,nbyte,which,buf1,buf2,datatype); +} + +/* ---------------------------------------------------------------------- */ + +void GridComm2::reverse_comm_regular(KSpace *kspace, int nper, int nbyte, int which, + void *buf1, void *buf2, MPI_Datatype datatype) +{ + int m; + MPI_Request request; + + for (m = nswap-1; m >= 0; m--) { + if (swap[m].recvproc == me) + kspace->pack_reverse2(which,buf2,swap[m].nunpack,swap[m].unpacklist); + else + kspace->pack_reverse2(which,buf1,swap[m].nunpack,swap[m].unpacklist); + + if (swap[m].recvproc != me) { + if (swap[m].npack) MPI_Irecv(buf2,nper*swap[m].npack,datatype, + swap[m].sendproc,0,gridcomm,&request); + if (swap[m].nunpack) MPI_Send(buf1,nper*swap[m].nunpack,datatype, + swap[m].recvproc,0,gridcomm); + if (swap[m].npack) MPI_Wait(&request,MPI_STATUS_IGNORE); + } + + kspace->unpack_reverse2(which,buf2,swap[m].npack,swap[m].packlist); + } +} + +/* ---------------------------------------------------------------------- */ + +void GridComm2::reverse_comm_tiled(KSpace *kspace, int nper, int nbyte, int which, + void *buf1, void *vbuf2, MPI_Datatype datatype) +{ + int i,m,offset; + + char *buf2 = (char *) vbuf2; + + // post all receives + + for (m = 0; m < nsend; m++) { + offset = nper * send[m].offset * nbyte; + MPI_Irecv((void *) &buf2[offset],nper*send[m].npack,datatype, + send[m].proc,0,gridcomm,&requests[m]); + } + + // perform all sends to other procs + + for (m = 0; m < nrecv; m++) { + kspace->pack_reverse2(which,buf1,recv[m].nunpack,recv[m].unpacklist); + MPI_Send(buf1,nper*recv[m].nunpack,datatype,recv[m].proc,0,gridcomm); + } + + // perform all copies to self + + for (m = 0; m < ncopy; m++) { + kspace->pack_reverse2(which,buf1,copy[m].nunpack,copy[m].unpacklist); + kspace->unpack_reverse2(which,buf1,copy[m].npack,copy[m].packlist); + } + + // unpack all received data + + for (i = 0; i < nsend; i++) { + MPI_Waitany(nsend,requests,&m,MPI_STATUS_IGNORE); + offset = nper * send[m].offset * nbyte; + kspace->unpack_reverse2(which,(void *) &buf2[offset], + send[m].npack,send[m].packlist); + } +} + +/* ---------------------------------------------------------------------- + create swap stencil for grid own/ghost communication + swaps covers all 3 dimensions and both directions + swaps cover multiple iterations in a direction if need grid pts + from further away than nearest-neighbor proc + same swap list used by forward and reverse communication +------------------------------------------------------------------------- */ + +void GridComm2::grow_swap() +{ + maxswap += SWAPDELTA; + swap = (Swap *) + memory->srealloc(swap,maxswap*sizeof(Swap),"CommGrid:swap"); +} + +/* ---------------------------------------------------------------------- + create swap stencil for grid own/ghost communication + swaps covers all 3 
dimensions and both directions + swaps cover multiple iterations in a direction if need grid pts + from further away than nearest-neighbor proc + same swap list used by forward and reverse communication +------------------------------------------------------------------------- */ + +void GridComm2::grow_overlap() +{ + maxoverlap += SWAPDELTA; + overlap = (Overlap *) + memory->srealloc(overlap,maxoverlap*sizeof(Overlap),"CommGrid:overlap"); +} + +/* ---------------------------------------------------------------------- + create 1d list of offsets into 3d array section (xlo:xhi,ylo:yhi,zlo:zhi) + assume 3d array is allocated as (outxlo_max:outxhi_max,outylo_max:outyhi_max, + outzlo_max:outzhi_max) +------------------------------------------------------------------------- */ + +int GridComm2::indices(int *&list, + int xlo, int xhi, int ylo, int yhi, int zlo, int zhi) +{ + int nmax = (xhi-xlo+1) * (yhi-ylo+1) * (zhi-zlo+1); + memory->create(list,nmax,"CommGrid:indices"); + if (nmax == 0) return 0; + + int nx = (outxhi_max-outxlo_max+1); + int ny = (outyhi_max-outylo_max+1); + + int n = 0; + int ix,iy,iz; + for (iz = zlo; iz <= zhi; iz++) + for (iy = ylo; iy <= yhi; iy++) + for (ix = xlo; ix <= xhi; ix++) + list[n++] = (iz-outzlo_max)*ny*nx + (iy-outylo_max)*nx + (ix-outxlo_max); + + return nmax; +} diff --git a/src/KSPACE/gridcomm2.h b/src/KSPACE/gridcomm2.h new file mode 100644 index 0000000000..eeba990d2d --- /dev/null +++ b/src/KSPACE/gridcomm2.h @@ -0,0 +1,201 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifndef LMP_GRIDCOMM2_H +#define LMP_GRIDCOMM2_H + +#include "pointers.h" + +namespace LAMMPS_NS { + +class GridComm2 : protected Pointers { + public: + GridComm2(class LAMMPS *, MPI_Comm, int, int, int, + int, int, int, int, int, int, + int, int, int, int, int, int); + GridComm2(class LAMMPS *, MPI_Comm, int, int, int, + int, int, int, int, int, int, + int, int, int, int, int, int, + int, int, int, int, int, int); + ~GridComm2(); + void setup(int &, int &); + int ghost_adjacent(); + void forward_comm(class KSpace *, int, int, int, void *, void *, MPI_Datatype); + void reverse_comm(class KSpace *, int, int, int, void *, void *, MPI_Datatype); + + private: + int me,nprocs; + int layout; // REGULAR or TILED + MPI_Comm gridcomm; + + // inputs from caller via constructor + + int nx,ny,nz; // size of global grid in all 3 dims + int inxlo,inxhi; // inclusive extent of my grid chunk + int inylo,inyhi; // 0 <= in <= N-1 + int inzlo,inzhi; + int outxlo,outxhi; // inclusive extent of my grid chunk plus + int outylo,outyhi; // ghost cells in all 6 directions + int outzlo,outzhi; // lo indices can be < 0, hi indices can be >= N + int outxlo_max,outxhi_max; // ?? 
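+  // out xyz lohi max = extent of the caller's allocated grid array
+  //   equal to out xyz lohi (1st constructor) or larger (2nd constructor)
+  //   used only by indices() to compute offsets into the caller's array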
+ int outylo_max,outyhi_max; + int outzlo_max,outzhi_max; + + // ------------------------------------------- + // internal variables for REGULAR layout + // ------------------------------------------- + + int procxlo,procxhi; // 6 neighbor procs that adjoin me + int procylo,procyhi; // not used for comm_style = tiled + int proczlo,proczhi; + + int ghostxlo,ghostxhi; // # of my owned grid planes needed + int ghostylo,ghostyhi; // by neighobr procs in each dir as their ghost planes + int ghostzlo,ghostzhi; + + // swap = exchange of owned and ghost grid cells between 2 procs, including self + + struct Swap { + int sendproc; // proc to send to for forward comm + int recvproc; // proc to recv from for forward comm + int npack; // # of datums to pack + int nunpack; // # of datums to unpack + int *packlist; // 3d array offsets to pack + int *unpacklist; // 3d array offsets to unpack + }; + + int nswap,maxswap; + Swap *swap; + + // ------------------------------------------- + // internal variables for TILED layout + // ------------------------------------------- + + int *overlap_procs; + MPI_Request *requests; + + // RCB tree of cut info + // each proc contributes one value, except proc 0 + + struct RCBinfo { + int dim; // 0,1,2 = which dim the cut is in + int cut; // grid index of lowest cell in upper half of cut + }; + + RCBinfo *rcbinfo; + + // overlap = a proc whose owned cells overlap with my extended ghost box + // includes overlaps across periodic boundaries, can also be self + + struct Overlap { + int proc; // proc whose owned cells overlap my ghost cells + int box[6]; // box that overlaps otherproc's owned cells + // this box is wholly contained within global grid + int pbc[3]; // PBC offsets to convert box to a portion of my ghost box + // my ghost box may extend beyond global grid + }; + + int noverlap,maxoverlap; + Overlap *overlap; + + // request = sent to each proc whose owned cells overlap my ghost cells + + struct Request { + int sender; // sending proc + int index; // index of overlap on sender + int box[6]; // box that overlaps receiver's owned cells + // wholly contained within global grid + }; + + Request *srequest,*rrequest; + + // response = reply from each proc whose owned cells overlap my ghost cells + + struct Response { + int index; // index of my overlap for the initial request + int box[6]; // box that overlaps responder's owned cells + // wholly contained within global grid + // has to unwrapped by PBC to map to my ghost cells + }; + + Response *sresponse,*rresponse; + + // send = proc to send a subset of my owned cells to, for forward comm + // for reverse comm, proc I receive ghost overlaps with my owned cells from + // offset used in reverse comm to recv a message in middle of a large buffer + + struct Send { + int proc; + int npack; + int *packlist; + int offset; + }; + + // recv = proc to recv a subset of my ghost cells from, for forward comm + // for reverse comm, proc I send a subset of my ghost cells to + // offset used in forward comm to recv a message in middle of a large buffer + + struct Recv { + int proc; + int nunpack; + int *unpacklist; + int offset; + }; + + int adjacent; // 0 on a proc who receives ghosts from a non-neighbor proc + + // copy = subset of my owned cells to copy into subset of my ghost cells + // that describes forward comm, for reverse comm it is the opposite + + struct Copy { + int npack; + int nunpack; + int *packlist; + int *unpacklist; + }; + + int nsend,nrecv,ncopy; + Send *send; + Recv *recv; + Copy *copy; + + // 
------------------------------------------- + // internal methods + // ------------------------------------------- + + void setup_regular(int &, int &); + void setup_tiled(int &, int &); + void ghost_box_drop(int *, int *); + void box_drop_grid(int *, int, int, int &, int *); + + int ghost_adjacent_regular(); + int ghost_adjacent_tiled(); + + void forward_comm_regular(class KSpace *, int, int, int, + void *, void *, MPI_Datatype); + void forward_comm_tiled(class KSpace *, int, int, int, + void *, void *, MPI_Datatype); + void reverse_comm_regular(class KSpace *, int, int, int, + void *, void *, MPI_Datatype); + void reverse_comm_tiled(class KSpace *, int, int, int, + void *, void *, MPI_Datatype); + + void grow_swap(); + void grow_overlap(); + + int indices(int *&, int, int, int, int, int, int); +}; + +} + +#endif diff --git a/src/KSPACE/pppm2.cpp b/src/KSPACE/pppm2.cpp new file mode 100644 index 0000000000..19baa13f16 --- /dev/null +++ b/src/KSPACE/pppm2.cpp @@ -0,0 +1,3524 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Roy Pollock (LLNL), Paul Crozier (SNL) + per-atom energy/virial & group/group energy/force added by Stan Moore (BYU) + analytic diff (2 FFT) option added by Rolf Isele-Holder (Aachen University) + triclinic added by Stan Moore (SNL) +------------------------------------------------------------------------- */ + +#include "pppm2.h" +#include +#include +#include +#include +#include "atom.h" +#include "comm.h" +#include "gridcomm2.h" +#include "neighbor.h" +#include "force.h" +#include "pair.h" +#include "bond.h" +#include "angle.h" +#include "domain.h" +#include "fft3d_wrap.h" +#include "remap_wrap.h" +#include "memory.h" +#include "error.h" +#include "utils.h" +#include "fmt/format.h" + +#include "math_const.h" +#include "math_special.h" + +using namespace LAMMPS_NS; +using namespace MathConst; +using namespace MathSpecial; + +#define MAXORDER 7 +#define OFFSET 16384 +#define LARGE 10000.0 +#define SMALL 0.00001 +#define EPS_HOC 1.0e-7 + +enum{REVERSE_RHO}; +enum{FORWARD_IK,FORWARD_AD,FORWARD_IK_PERATOM,FORWARD_AD_PERATOM}; + +#ifdef FFT_SINGLE +#define ZEROF 0.0f +#define ONEF 1.0f +#else +#define ZEROF 0.0 +#define ONEF 1.0 +#endif + +/* ---------------------------------------------------------------------- */ + +PPPM2::PPPM2(LAMMPS *lmp) : KSpace(lmp), + factors(NULL), density_brick(NULL), vdx_brick(NULL), vdy_brick(NULL), vdz_brick(NULL), + u_brick(NULL), v0_brick(NULL), v1_brick(NULL), v2_brick(NULL), v3_brick(NULL), + v4_brick(NULL), v5_brick(NULL), greensfn(NULL), vg(NULL), fkx(NULL), fky(NULL), + fkz(NULL), density_fft(NULL), work1(NULL), work2(NULL), gf_b(NULL), rho1d(NULL), + rho_coeff(NULL), drho1d(NULL), drho_coeff(NULL), + sf_precoeff1(NULL), sf_precoeff2(NULL), sf_precoeff3(NULL), + sf_precoeff4(NULL), sf_precoeff5(NULL), sf_precoeff6(NULL), + acons(NULL), density_A_brick(NULL), 
density_B_brick(NULL), density_A_fft(NULL), + density_B_fft(NULL), fft1(NULL), fft2(NULL), remap(NULL), gc(NULL), + gc_buf1(NULL), gc_buf2(NULL), part2grid(NULL), boxlo(NULL) +{ + peratom_allocate_flag = 0; + group_allocate_flag = 0; + + pppmflag = 1; + group_group_enable = 1; + triclinic = domain->triclinic; + + nfactors = 3; + factors = new int[nfactors]; + factors[0] = 2; + factors[1] = 3; + factors[2] = 5; + + MPI_Comm_rank(world,&me); + MPI_Comm_size(world,&nprocs); + + nfft_both = 0; + nxhi_in = nxlo_in = nxhi_out = nxlo_out = 0; + nyhi_in = nylo_in = nyhi_out = nylo_out = 0; + nzhi_in = nzlo_in = nzhi_out = nzlo_out = 0; + + density_brick = vdx_brick = vdy_brick = vdz_brick = NULL; + density_fft = NULL; + u_brick = NULL; + v0_brick = v1_brick = v2_brick = v3_brick = v4_brick = v5_brick = NULL; + greensfn = NULL; + work1 = work2 = NULL; + vg = NULL; + fkx = fky = fkz = NULL; + + sf_precoeff1 = sf_precoeff2 = sf_precoeff3 = + sf_precoeff4 = sf_precoeff5 = sf_precoeff6 = NULL; + + density_A_brick = density_B_brick = NULL; + density_A_fft = density_B_fft = NULL; + + gf_b = NULL; + rho1d = rho_coeff = drho1d = drho_coeff = NULL; + + fft1 = fft2 = NULL; + remap = NULL; + gc = NULL; + gc_buf1 = gc_buf2 = NULL; + + nmax = 0; + part2grid = NULL; + + // define acons coefficients for estimation of kspace errors + // see JCP 109, pg 7698 for derivation of coefficients + // higher order coefficients may be computed if needed + + memory->create(acons,8,7,"pppm:acons"); + acons[1][0] = 2.0 / 3.0; + acons[2][0] = 1.0 / 50.0; + acons[2][1] = 5.0 / 294.0; + acons[3][0] = 1.0 / 588.0; + acons[3][1] = 7.0 / 1440.0; + acons[3][2] = 21.0 / 3872.0; + acons[4][0] = 1.0 / 4320.0; + acons[4][1] = 3.0 / 1936.0; + acons[4][2] = 7601.0 / 2271360.0; + acons[4][3] = 143.0 / 28800.0; + acons[5][0] = 1.0 / 23232.0; + acons[5][1] = 7601.0 / 13628160.0; + acons[5][2] = 143.0 / 69120.0; + acons[5][3] = 517231.0 / 106536960.0; + acons[5][4] = 106640677.0 / 11737571328.0; + acons[6][0] = 691.0 / 68140800.0; + acons[6][1] = 13.0 / 57600.0; + acons[6][2] = 47021.0 / 35512320.0; + acons[6][3] = 9694607.0 / 2095994880.0; + acons[6][4] = 733191589.0 / 59609088000.0; + acons[6][5] = 326190917.0 / 11700633600.0; + acons[7][0] = 1.0 / 345600.0; + acons[7][1] = 3617.0 / 35512320.0; + acons[7][2] = 745739.0 / 838397952.0; + acons[7][3] = 56399353.0 / 12773376000.0; + acons[7][4] = 25091609.0 / 1560084480.0; + acons[7][5] = 1755948832039.0 / 36229939200000.0; + acons[7][6] = 4887769399.0 / 37838389248.0; +} + +/* ---------------------------------------------------------------------- */ + +void PPPM2::settings(int narg, char **arg) +{ + if (narg < 1) error->all(FLERR,"Illegal kspace_style pppm command"); + accuracy_relative = fabs(force->numeric(FLERR,arg[0])); +} + +/* ---------------------------------------------------------------------- + free all memory +------------------------------------------------------------------------- */ + +PPPM2::~PPPM2() +{ + if (copymode) return; + + delete [] factors; + deallocate(); + if (peratom_allocate_flag) deallocate_peratom(); + if (group_allocate_flag) deallocate_groups(); + memory->destroy(part2grid); + memory->destroy(acons); +} + +/* ---------------------------------------------------------------------- + called once before run +------------------------------------------------------------------------- */ + +void PPPM2::init() +{ + if (me == 0) utils::logmesg(lmp,"PPPM initialization ...\n"); + + // error check + + triclinic_check(); + + if (triclinic != domain->triclinic) + 
error->all(FLERR,"Must redefine kspace_style after changing to triclinic box"); + + if (domain->triclinic && differentiation_flag == 1) + error->all(FLERR,"Cannot (yet) use PPPM with triclinic box " + "and kspace_modify diff ad"); + if (domain->triclinic && slabflag) + error->all(FLERR,"Cannot (yet) use PPPM with triclinic box and " + "slab correction"); + if (domain->dimension == 2) + error->all(FLERR,"Cannot use PPPM with 2d simulation"); + + if (!atom->q_flag) + error->all(FLERR,"Kspace style requires atom attribute q"); + + if (slabflag == 0 && domain->nonperiodic > 0) + error->all(FLERR,"Cannot use non-periodic boundaries with PPPM"); + if (slabflag) { + if (domain->xperiodic != 1 || domain->yperiodic != 1 || + domain->boundary[2][0] != 1 || domain->boundary[2][1] != 1) + error->all(FLERR,"Incorrect boundaries with slab PPPM"); + } + + if (order < 2 || order > MAXORDER) + error->all(FLERR,fmt::format("PPPM order cannot be < 2 or > {}",MAXORDER)); + + // compute two charge force + + two_charge(); + + // extract short-range Coulombic cutoff from pair style + + triclinic = domain->triclinic; + pair_check(); + + int itmp = 0; + double *p_cutoff = (double *) force->pair->extract("cut_coul",itmp); + if (p_cutoff == NULL) + error->all(FLERR,"KSpace style is incompatible with Pair style"); + cutoff = *p_cutoff; + + // if kspace is TIP4P, extract TIP4P params from pair style + // bond/angle are not yet init(), so insure equilibrium request is valid + + qdist = 0.0; + + if (tip4pflag) { + if (me == 0) utils::logmesg(lmp," extracting TIP4P info from pair style\n"); + + double *p_qdist = (double *) force->pair->extract("qdist",itmp); + int *p_typeO = (int *) force->pair->extract("typeO",itmp); + int *p_typeH = (int *) force->pair->extract("typeH",itmp); + int *p_typeA = (int *) force->pair->extract("typeA",itmp); + int *p_typeB = (int *) force->pair->extract("typeB",itmp); + if (!p_qdist || !p_typeO || !p_typeH || !p_typeA || !p_typeB) + error->all(FLERR,"Pair style is incompatible with TIP4P KSpace style"); + qdist = *p_qdist; + typeO = *p_typeO; + typeH = *p_typeH; + int typeA = *p_typeA; + int typeB = *p_typeB; + + if (force->angle == NULL || force->bond == NULL || + force->angle->setflag == NULL || force->bond->setflag == NULL) + error->all(FLERR,"Bond and angle potentials must be defined for TIP4P"); + if (typeA < 1 || typeA > atom->nangletypes || + force->angle->setflag[typeA] == 0) + error->all(FLERR,"Bad TIP4P angle type for PPPM/TIP4P"); + if (typeB < 1 || typeB > atom->nbondtypes || + force->bond->setflag[typeB] == 0) + error->all(FLERR,"Bad TIP4P bond type for PPPM/TIP4P"); + double theta = force->angle->equilibrium_angle(typeA); + double blen = force->bond->equilibrium_distance(typeB); + alpha = qdist / (cos(0.5*theta) * blen); + } + + // compute qsum & qsqsum and warn if not charge-neutral + + scale = 1.0; + qqrd2e = force->qqrd2e; + qsum_qsq(); + natoms_original = atom->natoms; + + // set accuracy (force units) from accuracy_relative or accuracy_absolute + + if (accuracy_absolute >= 0.0) accuracy = accuracy_absolute; + else accuracy = accuracy_relative * two_charge_force; + + // free all arrays previously allocated + + deallocate(); + if (peratom_allocate_flag) deallocate_peratom(); + if (group_allocate_flag) deallocate_groups(); + + // setup FFT grid resolution and g_ewald + // normally one iteration thru while loop is all that is required + // if grid stencil does not extend beyond neighbor proc + // or overlap is allowed, then done + // else reduce order and try again + + int 
(*procneigh)[2] = comm->procneigh; + + GridComm2 *gctmp = NULL; + int iteration = 0; + + while (order >= minorder) { + if (iteration && me == 0) + error->warning(FLERR,"Reducing PPPM order b/c stencil extends " + "beyond nearest neighbor processor"); + + if (stagger_flag && !differentiation_flag) compute_gf_denom(); + set_grid_global(); + set_grid_local(); + if (overlap_allowed) break; + + gctmp = new GridComm2(lmp,world,nx_pppm,ny_pppm,nz_pppm, + nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, + nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out); + + int tmp1,tmp2; + gctmp->setup(tmp1,tmp2); + if (gctmp->ghost_adjacent()) break; + delete gctmp; + + order--; + iteration++; + } + + if (order < minorder) error->all(FLERR,"PPPM order < minimum allowed order"); + if (!overlap_allowed && !gctmp->ghost_adjacent()) + error->all(FLERR,"PPPM grid stencil extends " + "beyond nearest neighbor processor"); + if (gctmp) delete gctmp; + + // adjust g_ewald + + if (!gewaldflag) adjust_gewald(); + + // calculate the final accuracy + + double estimated_accuracy = final_accuracy(); + + // print stats + + int ngrid_max,nfft_both_max; + MPI_Allreduce(&ngrid,&ngrid_max,1,MPI_INT,MPI_MAX,world); + MPI_Allreduce(&nfft_both,&nfft_both_max,1,MPI_INT,MPI_MAX,world); + + if (me == 0) { + std::string mesg = fmt::format(" G vector (1/distance) = {:.8g}\n",g_ewald); + mesg += fmt::format(" grid = {} {} {}\n",nx_pppm,ny_pppm,nz_pppm); + mesg += fmt::format(" stencil order = {}\n",order); + mesg += fmt::format(" estimated absolute RMS force accuracy = {:.8g}\n", + estimated_accuracy); + mesg += fmt::format(" estimated relative force accuracy = {:.8g}\n", + estimated_accuracy/two_charge_force); + mesg += " using " LMP_FFT_PREC " precision " LMP_FFT_LIB "\n"; + mesg += fmt::format(" 3d grid and FFT values/proc = {} {}\n", + ngrid_max,nfft_both_max); + utils::logmesg(lmp,mesg); + } + + // allocate K-space dependent memory + // don't invoke allocate peratom() or group(), will be allocated when needed + + allocate(); + + // pre-compute Green's function denomiator expansion + // pre-compute 1d charge distribution coefficients + + compute_gf_denom(); + if (differentiation_flag == 1) compute_sf_precoeff(); + compute_rho_coeff(); +} + +/* ---------------------------------------------------------------------- + adjust PPPM coeffs, called initially and whenever volume has changed +------------------------------------------------------------------------- */ + +void PPPM2::setup() +{ + if (triclinic) { + setup_triclinic(); + return; + } + + // perform some checks to avoid illegal boundaries with read_data + + if (slabflag == 0 && domain->nonperiodic > 0) + error->all(FLERR,"Cannot use non-periodic boundaries with PPPM"); + if (slabflag) { + if (domain->xperiodic != 1 || domain->yperiodic != 1 || + domain->boundary[2][0] != 1 || domain->boundary[2][1] != 1) + error->all(FLERR,"Incorrect boundaries with slab PPPM"); + } + + int i,j,k,n; + double *prd; + + // volume-dependent factors + // adjust z dimension for 2d slab PPPM + // z dimension for 3d PPPM is zprd since slab_volfactor = 1.0 + + if (triclinic == 0) prd = domain->prd; + else prd = domain->prd_lamda; + + double xprd = prd[0]; + double yprd = prd[1]; + double zprd = prd[2]; + double zprd_slab = zprd*slab_volfactor; + volume = xprd * yprd * zprd_slab; + + delxinv = nx_pppm/xprd; + delyinv = ny_pppm/yprd; + delzinv = nz_pppm/zprd_slab; + + delvolinv = delxinv*delyinv*delzinv; + + double unitkx = (MY_2PI/xprd); + double unitky = (MY_2PI/yprd); + double unitkz = 
(MY_2PI/zprd_slab); + + // fkx,fky,fkz for my FFT grid pts + + double per; + + for (i = nxlo_fft; i <= nxhi_fft; i++) { + per = i - nx_pppm*(2*i/nx_pppm); + fkx[i] = unitkx*per; + } + + for (i = nylo_fft; i <= nyhi_fft; i++) { + per = i - ny_pppm*(2*i/ny_pppm); + fky[i] = unitky*per; + } + + for (i = nzlo_fft; i <= nzhi_fft; i++) { + per = i - nz_pppm*(2*i/nz_pppm); + fkz[i] = unitkz*per; + } + + // virial coefficients + + double sqk,vterm; + + n = 0; + for (k = nzlo_fft; k <= nzhi_fft; k++) { + for (j = nylo_fft; j <= nyhi_fft; j++) { + for (i = nxlo_fft; i <= nxhi_fft; i++) { + sqk = fkx[i]*fkx[i] + fky[j]*fky[j] + fkz[k]*fkz[k]; + if (sqk == 0.0) { + vg[n][0] = 0.0; + vg[n][1] = 0.0; + vg[n][2] = 0.0; + vg[n][3] = 0.0; + vg[n][4] = 0.0; + vg[n][5] = 0.0; + } else { + vterm = -2.0 * (1.0/sqk + 0.25/(g_ewald*g_ewald)); + vg[n][0] = 1.0 + vterm*fkx[i]*fkx[i]; + vg[n][1] = 1.0 + vterm*fky[j]*fky[j]; + vg[n][2] = 1.0 + vterm*fkz[k]*fkz[k]; + vg[n][3] = vterm*fkx[i]*fky[j]; + vg[n][4] = vterm*fkx[i]*fkz[k]; + vg[n][5] = vterm*fky[j]*fkz[k]; + } + n++; + } + } + } + + if (differentiation_flag == 1) compute_gf_ad(); + else compute_gf_ik(); +} + +/* ---------------------------------------------------------------------- + adjust PPPM coeffs, called initially and whenever volume has changed + for a triclinic system +------------------------------------------------------------------------- */ + +void PPPM2::setup_triclinic() +{ + int i,j,k,n; + double *prd; + + // volume-dependent factors + // adjust z dimension for 2d slab PPPM + // z dimension for 3d PPPM is zprd since slab_volfactor = 1.0 + + prd = domain->prd; + + double xprd = prd[0]; + double yprd = prd[1]; + double zprd = prd[2]; + double zprd_slab = zprd*slab_volfactor; + volume = xprd * yprd * zprd_slab; + + // use lamda (0-1) coordinates + + delxinv = nx_pppm; + delyinv = ny_pppm; + delzinv = nz_pppm; + delvolinv = delxinv*delyinv*delzinv/volume; + + // fkx,fky,fkz for my FFT grid pts + + double per_i,per_j,per_k; + + n = 0; + for (k = nzlo_fft; k <= nzhi_fft; k++) { + per_k = k - nz_pppm*(2*k/nz_pppm); + for (j = nylo_fft; j <= nyhi_fft; j++) { + per_j = j - ny_pppm*(2*j/ny_pppm); + for (i = nxlo_fft; i <= nxhi_fft; i++) { + per_i = i - nx_pppm*(2*i/nx_pppm); + + double unitk_lamda[3]; + unitk_lamda[0] = 2.0*MY_PI*per_i; + unitk_lamda[1] = 2.0*MY_PI*per_j; + unitk_lamda[2] = 2.0*MY_PI*per_k; + x2lamdaT(&unitk_lamda[0],&unitk_lamda[0]); + fkx[n] = unitk_lamda[0]; + fky[n] = unitk_lamda[1]; + fkz[n] = unitk_lamda[2]; + n++; + } + } + } + + // virial coefficients + + double sqk,vterm; + + for (n = 0; n < nfft; n++) { + sqk = fkx[n]*fkx[n] + fky[n]*fky[n] + fkz[n]*fkz[n]; + if (sqk == 0.0) { + vg[n][0] = 0.0; + vg[n][1] = 0.0; + vg[n][2] = 0.0; + vg[n][3] = 0.0; + vg[n][4] = 0.0; + vg[n][5] = 0.0; + } else { + vterm = -2.0 * (1.0/sqk + 0.25/(g_ewald*g_ewald)); + vg[n][0] = 1.0 + vterm*fkx[n]*fkx[n]; + vg[n][1] = 1.0 + vterm*fky[n]*fky[n]; + vg[n][2] = 1.0 + vterm*fkz[n]*fkz[n]; + vg[n][3] = vterm*fkx[n]*fky[n]; + vg[n][4] = vterm*fkx[n]*fkz[n]; + vg[n][5] = vterm*fky[n]*fkz[n]; + } + } + + compute_gf_ik_triclinic(); +} + +/* ---------------------------------------------------------------------- + reset local grid arrays and communication stencils + called by fix balance b/c it changed sizes of processor sub-domains +------------------------------------------------------------------------- */ + +void PPPM2::setup_grid() +{ + // free all arrays previously allocated + + deallocate(); + if (peratom_allocate_flag) deallocate_peratom(); + if 
(group_allocate_flag) deallocate_groups(); + + // reset portion of global grid that each proc owns + + set_grid_local(); + + // reallocate K-space dependent memory + // check if grid communication is now overlapping if not allowed + // don't invoke allocate peratom() or group(), will be allocated when needed + + allocate(); + + if (!overlap_allowed && !gc->ghost_adjacent()) + error->all(FLERR,"PPPM grid stencil extends " + "beyond nearest neighbor processor"); + + // pre-compute Green's function denomiator expansion + // pre-compute 1d charge distribution coefficients + + compute_gf_denom(); + if (differentiation_flag == 1) compute_sf_precoeff(); + compute_rho_coeff(); + + // pre-compute volume-dependent coeffs for portion of grid I now own + + setup(); +} + +/* ---------------------------------------------------------------------- + compute the PPPM long-range force, energy, virial +------------------------------------------------------------------------- */ + +void PPPM2::compute(int eflag, int vflag) +{ + int i,j; + + // set energy/virial flags + // invoke allocate_peratom() if needed for first time + + ev_init(eflag,vflag); + + if (evflag_atom && !peratom_allocate_flag) allocate_peratom(); + + // if atom count has changed, update qsum and qsqsum + + if (atom->natoms != natoms_original) { + qsum_qsq(); + natoms_original = atom->natoms; + } + + // return if there are no charges + + if (qsqsum == 0.0) return; + + // convert atoms from box to lamda coords + + if (triclinic == 0) boxlo = domain->boxlo; + else { + boxlo = domain->boxlo_lamda; + domain->x2lamda(atom->nlocal); + } + + // extend size of per-atom arrays if necessary + + if (atom->nmax > nmax) { + memory->destroy(part2grid); + nmax = atom->nmax; + memory->create(part2grid,nmax,3,"pppm:part2grid"); + } + + // find grid points for all my particles + // map my particle charge onto my local 3d density grid + + particle_map(); + make_rho(); + + // all procs communicate density values from their ghost cells + // to fully sum contribution in their 3d bricks + // remap from 3d decomposition to FFT decomposition + + gc->reverse_comm(this,1,sizeof(FFT_SCALAR),REVERSE_RHO, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); + brick2fft(); + + // compute potential gradient on my FFT grid and + // portion of e_long on this proc's FFT grid + // return gradients (electric fields) in 3d brick decomposition + // also performs per-atom calculations via poisson_peratom() + + poisson(); + + // all procs communicate E-field values + // to fill ghost cells surrounding their 3d bricks + + if (differentiation_flag == 1) + gc->forward_comm(this,1,sizeof(FFT_SCALAR),FORWARD_AD, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); + else + gc->forward_comm(this,3,sizeof(FFT_SCALAR),FORWARD_IK, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); + + // extra per-atom energy/virial communication + + if (evflag_atom) { + if (differentiation_flag == 1 && vflag_atom) + gc->forward_comm(this,6,sizeof(FFT_SCALAR),FORWARD_AD_PERATOM, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); + else if (differentiation_flag == 0) + gc->forward_comm(this,7,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); + } + + // calculate the force on my particles + + fieldforce(); + + // extra per-atom energy/virial communication + + if (evflag_atom) fieldforce_peratom(); + + // sum global energy across procs and add in volume-dependent term + + const double qscale = qqrd2e * scale; + + if (eflag_global) { + double energy_all; + MPI_Allreduce(&energy,&energy_all,1,MPI_DOUBLE,MPI_SUM,world); + energy = energy_all; + + energy *= 
0.5*volume; + energy -= g_ewald*qsqsum/MY_PIS + + MY_PI2*qsum*qsum / (g_ewald*g_ewald*volume); + energy *= qscale; + } + + // sum global virial across procs + + if (vflag_global) { + double virial_all[6]; + MPI_Allreduce(virial,virial_all,6,MPI_DOUBLE,MPI_SUM,world); + for (i = 0; i < 6; i++) virial[i] = 0.5*qscale*volume*virial_all[i]; + } + + // per-atom energy/virial + // energy includes self-energy correction + // ntotal accounts for TIP4P tallying eatom/vatom for ghost atoms + + if (evflag_atom) { + double *q = atom->q; + int nlocal = atom->nlocal; + int ntotal = nlocal; + if (tip4pflag) ntotal += atom->nghost; + + if (eflag_atom) { + for (i = 0; i < nlocal; i++) { + eatom[i] *= 0.5; + eatom[i] -= g_ewald*q[i]*q[i]/MY_PIS + MY_PI2*q[i]*qsum / + (g_ewald*g_ewald*volume); + eatom[i] *= qscale; + } + for (i = nlocal; i < ntotal; i++) eatom[i] *= 0.5*qscale; + } + + if (vflag_atom) { + for (i = 0; i < ntotal; i++) + for (j = 0; j < 6; j++) vatom[i][j] *= 0.5*qscale; + } + } + + // 2d slab correction + + if (slabflag == 1) slabcorr(); + + // convert atoms back from lamda to box coords + + if (triclinic) domain->lamda2x(atom->nlocal); +} + +/* ---------------------------------------------------------------------- + allocate memory that depends on # of K-vectors and order +------------------------------------------------------------------------- */ + +void PPPM2::allocate() +{ + memory->create3d_offset(density_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, + nxlo_out,nxhi_out,"pppm:density_brick"); + + memory->create(density_fft,nfft_both,"pppm:density_fft"); + memory->create(greensfn,nfft_both,"pppm:greensfn"); + memory->create(work1,2*nfft_both,"pppm:work1"); + memory->create(work2,2*nfft_both,"pppm:work2"); + memory->create(vg,nfft_both,6,"pppm:vg"); + + if (triclinic == 0) { + memory->create1d_offset(fkx,nxlo_fft,nxhi_fft,"pppm:fkx"); + memory->create1d_offset(fky,nylo_fft,nyhi_fft,"pppm:fky"); + memory->create1d_offset(fkz,nzlo_fft,nzhi_fft,"pppm:fkz"); + } else { + memory->create(fkx,nfft_both,"pppm:fkx"); + memory->create(fky,nfft_both,"pppm:fky"); + memory->create(fkz,nfft_both,"pppm:fkz"); + } + + if (differentiation_flag == 1) { + memory->create3d_offset(u_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, + nxlo_out,nxhi_out,"pppm:u_brick"); + + memory->create(sf_precoeff1,nfft_both,"pppm:sf_precoeff1"); + memory->create(sf_precoeff2,nfft_both,"pppm:sf_precoeff2"); + memory->create(sf_precoeff3,nfft_both,"pppm:sf_precoeff3"); + memory->create(sf_precoeff4,nfft_both,"pppm:sf_precoeff4"); + memory->create(sf_precoeff5,nfft_both,"pppm:sf_precoeff5"); + memory->create(sf_precoeff6,nfft_both,"pppm:sf_precoeff6"); + + } else { + memory->create3d_offset(vdx_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, + nxlo_out,nxhi_out,"pppm:vdx_brick"); + memory->create3d_offset(vdy_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, + nxlo_out,nxhi_out,"pppm:vdy_brick"); + memory->create3d_offset(vdz_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, + nxlo_out,nxhi_out,"pppm:vdz_brick"); + } + + // summation coeffs + + order_allocated = order; + if (!stagger_flag) memory->create(gf_b,order,"pppm:gf_b"); + memory->create2d_offset(rho1d,3,-order/2,order/2,"pppm:rho1d"); + memory->create2d_offset(drho1d,3,-order/2,order/2,"pppm:drho1d"); + memory->create2d_offset(rho_coeff,order,(1-order)/2,order/2,"pppm:rho_coeff"); + memory->create2d_offset(drho_coeff,order,(1-order)/2,order/2, + "pppm:drho_coeff"); + + // create 2 FFTs and a Remap + // 1st FFT keeps data in FFT decomposition + // 2nd FFT returns data in 3d brick decomposition + 
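//    per solve, fft1 takes the summed charge density to k-space once,
+  //    it is multiplied there by greensfn, and fft2 brings each requested
+  //    field component (and any per-atom terms) back to the 3d brick grid
+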
// remap takes data from 3d brick to FFT decomposition + + int tmp; + + fft1 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm, + nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, + nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, + 0,0,&tmp,collective_flag); + + fft2 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm, + nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, + nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, + 0,0,&tmp,collective_flag); + + remap = new Remap(lmp,world, + nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, + nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, + 1,0,0,FFT_PRECISION,collective_flag); + + // create ghost grid object for rho and electric field communication + // also create 2 bufs for ghost grid cell comm, passed to GridComm methods + + gc = new GridComm2(lmp,world,nx_pppm,ny_pppm,nz_pppm, + nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, + nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out); + + gc->setup(ngc_buf1,ngc_buf2); + + if (differentiation_flag) npergrid = 1; + else npergrid = 3; + + memory->create(gc_buf1,npergrid*ngc_buf1,"pppm:gc_buf1"); + memory->create(gc_buf2,npergrid*ngc_buf2,"pppm:gc_buf2"); +} + +/* ---------------------------------------------------------------------- + deallocate memory that depends on # of K-vectors and order +------------------------------------------------------------------------- */ + +void PPPM2::deallocate() +{ + memory->destroy3d_offset(density_brick,nzlo_out,nylo_out,nxlo_out); + + if (differentiation_flag == 1) { + memory->destroy3d_offset(u_brick,nzlo_out,nylo_out,nxlo_out); + memory->destroy(sf_precoeff1); + memory->destroy(sf_precoeff2); + memory->destroy(sf_precoeff3); + memory->destroy(sf_precoeff4); + memory->destroy(sf_precoeff5); + memory->destroy(sf_precoeff6); + } else { + memory->destroy3d_offset(vdx_brick,nzlo_out,nylo_out,nxlo_out); + memory->destroy3d_offset(vdy_brick,nzlo_out,nylo_out,nxlo_out); + memory->destroy3d_offset(vdz_brick,nzlo_out,nylo_out,nxlo_out); + } + + memory->destroy(density_fft); + memory->destroy(greensfn); + memory->destroy(work1); + memory->destroy(work2); + memory->destroy(vg); + + if (triclinic == 0) { + memory->destroy1d_offset(fkx,nxlo_fft); + memory->destroy1d_offset(fky,nylo_fft); + memory->destroy1d_offset(fkz,nzlo_fft); + } else { + memory->destroy(fkx); + memory->destroy(fky); + memory->destroy(fkz); + } + + memory->destroy(gf_b); + if (stagger_flag) gf_b = NULL; + memory->destroy2d_offset(rho1d,-order_allocated/2); + memory->destroy2d_offset(drho1d,-order_allocated/2); + memory->destroy2d_offset(rho_coeff,(1-order_allocated)/2); + memory->destroy2d_offset(drho_coeff,(1-order_allocated)/2); + + delete fft1; + delete fft2; + delete remap; + delete gc; + memory->destroy(gc_buf1); + memory->destroy(gc_buf2); +} + +/* ---------------------------------------------------------------------- + allocate per-atom memory that depends on # of K-vectors and order +------------------------------------------------------------------------- */ + +void PPPM2::allocate_peratom() +{ + peratom_allocate_flag = 1; + + if (differentiation_flag != 1) + memory->create3d_offset(u_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, + nxlo_out,nxhi_out,"pppm:u_brick"); + + memory->create3d_offset(v0_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, + nxlo_out,nxhi_out,"pppm:v0_brick"); + + memory->create3d_offset(v1_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, + nxlo_out,nxhi_out,"pppm:v1_brick"); + memory->create3d_offset(v2_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, + 
nxlo_out,nxhi_out,"pppm:v2_brick"); + memory->create3d_offset(v3_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, + nxlo_out,nxhi_out,"pppm:v3_brick"); + memory->create3d_offset(v4_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, + nxlo_out,nxhi_out,"pppm:v4_brick"); + memory->create3d_offset(v5_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, + nxlo_out,nxhi_out,"pppm:v5_brick"); + + // use same GC ghost grid object for peratom grid communication + // but need to reallocate a larger gc_buf1 and gc_buf2 + + if (differentiation_flag) npergrid = 6; + else npergrid = 7; + + memory->destroy(gc_buf1); + memory->destroy(gc_buf2); + memory->create(gc_buf1,npergrid*ngc_buf1,"pppm:gc_buf1"); + memory->create(gc_buf2,npergrid*ngc_buf2,"pppm:gc_buf2"); +} + +/* ---------------------------------------------------------------------- + deallocate per-atom memory that depends on # of K-vectors and order +------------------------------------------------------------------------- */ + +void PPPM2::deallocate_peratom() +{ + peratom_allocate_flag = 0; + + memory->destroy3d_offset(v0_brick,nzlo_out,nylo_out,nxlo_out); + memory->destroy3d_offset(v1_brick,nzlo_out,nylo_out,nxlo_out); + memory->destroy3d_offset(v2_brick,nzlo_out,nylo_out,nxlo_out); + memory->destroy3d_offset(v3_brick,nzlo_out,nylo_out,nxlo_out); + memory->destroy3d_offset(v4_brick,nzlo_out,nylo_out,nxlo_out); + memory->destroy3d_offset(v5_brick,nzlo_out,nylo_out,nxlo_out); + + if (differentiation_flag != 1) + memory->destroy3d_offset(u_brick,nzlo_out,nylo_out,nxlo_out); +} + +/* ---------------------------------------------------------------------- + set global size of PPPM grid = nx,ny,nz_pppm + used for charge accumulation, FFTs, and electric field interpolation +------------------------------------------------------------------------- */ + +void PPPM2::set_grid_global() +{ + // use xprd,yprd,zprd (even if triclinic, and then scale later) + // adjust z dimension for 2d slab PPPM + // 3d PPPM just uses zprd since slab_volfactor = 1.0 + + double xprd = domain->xprd; + double yprd = domain->yprd; + double zprd = domain->zprd; + double zprd_slab = zprd*slab_volfactor; + + // make initial g_ewald estimate + // based on desired accuracy and real space cutoff + // fluid-occupied volume used to estimate real-space error + // zprd used rather than zprd_slab + + double h; + bigint natoms = atom->natoms; + + if (!gewaldflag) { + if (accuracy <= 0.0) + error->all(FLERR,"KSpace accuracy must be > 0"); + if (q2 == 0.0) + error->all(FLERR,"Must use kspace_modify gewald for uncharged system"); + g_ewald = accuracy*sqrt(natoms*cutoff*xprd*yprd*zprd) / (2.0*q2); + if (g_ewald >= 1.0) g_ewald = (1.35 - 0.15*log(accuracy))/cutoff; + else g_ewald = sqrt(-log(g_ewald)) / cutoff; + } + + // set optimal nx_pppm,ny_pppm,nz_pppm based on order and accuracy + // nz_pppm uses extended zprd_slab instead of zprd + // reduce it until accuracy target is met + + if (!gridflag) { + + if (differentiation_flag == 1 || stagger_flag) { + + h = h_x = h_y = h_z = 4.0/g_ewald; + int count = 0; + while (1) { + + // set grid dimensions + + nx_pppm = static_cast (xprd/h_x); + ny_pppm = static_cast (yprd/h_y); + nz_pppm = static_cast (zprd_slab/h_z); + + if (nx_pppm <= 1) nx_pppm = 2; + if (ny_pppm <= 1) ny_pppm = 2; + if (nz_pppm <= 1) nz_pppm = 2; + + // estimate Kspace force error + + double df_kspace = compute_df_kspace(); + + // break loop if the accuracy has been reached or + // too many loops have been performed + + count++; + if (df_kspace <= accuracy) break; + + if (count > 500) error->all(FLERR, 
"Could not compute grid size"); + h *= 0.95; + h_x = h_y = h_z = h; + } + + } else { + + double err; + h_x = h_y = h_z = 1.0/g_ewald; + + nx_pppm = static_cast (xprd/h_x) + 1; + ny_pppm = static_cast (yprd/h_y) + 1; + nz_pppm = static_cast (zprd_slab/h_z) + 1; + + err = estimate_ik_error(h_x,xprd,natoms); + while (err > accuracy) { + err = estimate_ik_error(h_x,xprd,natoms); + nx_pppm++; + h_x = xprd/nx_pppm; + } + + err = estimate_ik_error(h_y,yprd,natoms); + while (err > accuracy) { + err = estimate_ik_error(h_y,yprd,natoms); + ny_pppm++; + h_y = yprd/ny_pppm; + } + + err = estimate_ik_error(h_z,zprd_slab,natoms); + while (err > accuracy) { + err = estimate_ik_error(h_z,zprd_slab,natoms); + nz_pppm++; + h_z = zprd_slab/nz_pppm; + } + } + + // scale grid for triclinic skew + + if (triclinic) { + double tmp[3]; + tmp[0] = nx_pppm/xprd; + tmp[1] = ny_pppm/yprd; + tmp[2] = nz_pppm/zprd; + lamda2xT(&tmp[0],&tmp[0]); + nx_pppm = static_cast(tmp[0]) + 1; + ny_pppm = static_cast(tmp[1]) + 1; + nz_pppm = static_cast(tmp[2]) + 1; + } + } + + // boost grid size until it is factorable + + while (!factorable(nx_pppm)) nx_pppm++; + while (!factorable(ny_pppm)) ny_pppm++; + while (!factorable(nz_pppm)) nz_pppm++; + + if (triclinic == 0) { + h_x = xprd/nx_pppm; + h_y = yprd/ny_pppm; + h_z = zprd_slab/nz_pppm; + } else { + double tmp[3]; + tmp[0] = nx_pppm; + tmp[1] = ny_pppm; + tmp[2] = nz_pppm; + x2lamdaT(&tmp[0],&tmp[0]); + h_x = 1.0/tmp[0]; + h_y = 1.0/tmp[1]; + h_z = 1.0/tmp[2]; + } + + if (nx_pppm >= OFFSET || ny_pppm >= OFFSET || nz_pppm >= OFFSET) + error->all(FLERR,"PPPM grid is too large"); +} + +/* ---------------------------------------------------------------------- + check if all factors of n are in list of factors + return 1 if yes, 0 if no +------------------------------------------------------------------------- */ + +int PPPM2::factorable(int n) +{ + int i; + + while (n > 1) { + for (i = 0; i < nfactors; i++) { + if (n % factors[i] == 0) { + n /= factors[i]; + break; + } + } + if (i == nfactors) return 0; + } + + return 1; +} + +/* ---------------------------------------------------------------------- + compute estimated kspace force error +------------------------------------------------------------------------- */ + +double PPPM2::compute_df_kspace() +{ + double xprd = domain->xprd; + double yprd = domain->yprd; + double zprd = domain->zprd; + double zprd_slab = zprd*slab_volfactor; + bigint natoms = atom->natoms; + double df_kspace = 0.0; + if (differentiation_flag == 1 || stagger_flag) { + double qopt = compute_qopt(); + df_kspace = sqrt(qopt/natoms)*q2/(xprd*yprd*zprd_slab); + } else { + double lprx = estimate_ik_error(h_x,xprd,natoms); + double lpry = estimate_ik_error(h_y,yprd,natoms); + double lprz = estimate_ik_error(h_z,zprd_slab,natoms); + df_kspace = sqrt(lprx*lprx + lpry*lpry + lprz*lprz) / sqrt(3.0); + } + return df_kspace; +} + +/* ---------------------------------------------------------------------- + compute qopt +------------------------------------------------------------------------- */ + +double PPPM2::compute_qopt() +{ + int k,l,m,nx,ny,nz; + double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz; + double u1,u2,sqk; + double sum1,sum2,sum3,sum4,dot2; + + double *prd = domain->prd; + + const double xprd = prd[0]; + const double yprd = prd[1]; + const double zprd = prd[2]; + const double zprd_slab = zprd*slab_volfactor; + volume = xprd * yprd * zprd_slab; + + const double unitkx = (MY_2PI/xprd); + const double unitky = (MY_2PI/yprd); + const double unitkz = 
(MY_2PI/zprd_slab); + + const int twoorder = 2*order; + + // loop over entire FFT grid + // each proc calculates contributions from every Pth grid point + + bigint ngridtotal = (bigint) nx_pppm * ny_pppm * nz_pppm; + int nxy_pppm = nx_pppm * ny_pppm; + + double qopt = 0.0; + + for (bigint i = me; i < ngridtotal; i += nprocs) { + k = i % nx_pppm; + l = (i/nx_pppm) % ny_pppm; + m = i / nxy_pppm; + + const int kper = k - nx_pppm*(2*k/nx_pppm); + const int lper = l - ny_pppm*(2*l/ny_pppm); + const int mper = m - nz_pppm*(2*m/nz_pppm); + + sqk = square(unitkx*kper) + square(unitky*lper) + square(unitkz*mper); + if (sqk == 0.0) continue; + + sum1 = sum2 = sum3 = sum4 = 0.0; + + for (nx = -2; nx <= 2; nx++) { + qx = unitkx*(kper+nx_pppm*nx); + sx = exp(-0.25*square(qx/g_ewald)); + argx = 0.5*qx*xprd/nx_pppm; + wx = powsinxx(argx,twoorder); + qx *= qx; + + for (ny = -2; ny <= 2; ny++) { + qy = unitky*(lper+ny_pppm*ny); + sy = exp(-0.25*square(qy/g_ewald)); + argy = 0.5*qy*yprd/ny_pppm; + wy = powsinxx(argy,twoorder); + qy *= qy; + + for (nz = -2; nz <= 2; nz++) { + qz = unitkz*(mper+nz_pppm*nz); + sz = exp(-0.25*square(qz/g_ewald)); + argz = 0.5*qz*zprd_slab/nz_pppm; + wz = powsinxx(argz,twoorder); + qz *= qz; + + dot2 = qx+qy+qz; + u1 = sx*sy*sz; + u2 = wx*wy*wz; + + sum1 += u1*u1/dot2*MY_4PI*MY_4PI; + sum2 += u1 * u2 * MY_4PI; + sum3 += u2; + sum4 += dot2*u2; + } + } + } + + sum2 *= sum2; + qopt += sum1 - sum2/(sum3*sum4); + } + + // sum qopt over all procs + + double qopt_all; + MPI_Allreduce(&qopt,&qopt_all,1,MPI_DOUBLE,MPI_SUM,world); + return qopt_all; +} + +/* ---------------------------------------------------------------------- + estimate kspace force error for ik method +------------------------------------------------------------------------- */ + +double PPPM2::estimate_ik_error(double h, double prd, bigint natoms) +{ + double sum = 0.0; + if (natoms == 0) return 0.0; + for (int m = 0; m < order; m++) + sum += acons[order][m] * pow(h*g_ewald,2.0*m); + double value = q2 * pow(h*g_ewald,(double)order) * + sqrt(g_ewald*prd*sqrt(MY_2PI)*sum/natoms) / (prd*prd); + + return value; +} + +/* ---------------------------------------------------------------------- + adjust the g_ewald parameter to near its optimal value + using a Newton-Raphson solver +------------------------------------------------------------------------- */ + +void PPPM2::adjust_gewald() +{ + double dx; + + for (int i = 0; i < LARGE; i++) { + dx = newton_raphson_f() / derivf(); + g_ewald -= dx; + if (fabs(newton_raphson_f()) < SMALL) return; + } + error->all(FLERR, "Could not compute g_ewald"); +} + +/* ---------------------------------------------------------------------- + calculate f(x) using Newton-Raphson solver +------------------------------------------------------------------------- */ + +double PPPM2::newton_raphson_f() +{ + double xprd = domain->xprd; + double yprd = domain->yprd; + double zprd = domain->zprd; + bigint natoms = atom->natoms; + + double df_rspace = 2.0*q2*exp(-g_ewald*g_ewald*cutoff*cutoff) / + sqrt(natoms*cutoff*xprd*yprd*zprd); + + double df_kspace = compute_df_kspace(); + + return df_rspace - df_kspace; +} + +/* ---------------------------------------------------------------------- + calculate numerical derivative f'(x) using forward difference + [f(x + h) - f(x)] / h +------------------------------------------------------------------------- */ + +double PPPM2::derivf() +{ + double h = 0.000001; //Derivative step-size + double df,f1,f2,g_ewald_old; + + f1 = newton_raphson_f(); + g_ewald_old = 
g_ewald;
+  g_ewald += h;
+  f2 = newton_raphson_f();
+  g_ewald = g_ewald_old;
+  df = (f2 - f1)/h;
+
+  return df;
+}
+
+/* ----------------------------------------------------------------------
+   calculate the final estimate of the accuracy
+------------------------------------------------------------------------- */
+
+double PPPM2::final_accuracy()
+{
+  double xprd = domain->xprd;
+  double yprd = domain->yprd;
+  double zprd = domain->zprd;
+  bigint natoms = atom->natoms;
+  if (natoms == 0) natoms = 1;   // avoid division by zero
+
+  double df_kspace = compute_df_kspace();
+  double q2_over_sqrt = q2 / sqrt(natoms*cutoff*xprd*yprd*zprd);
+  double df_rspace = 2.0 * q2_over_sqrt * exp(-g_ewald*g_ewald*cutoff*cutoff);
+  double df_table = estimate_table_accuracy(q2_over_sqrt,df_rspace);
+  double estimated_accuracy = sqrt(df_kspace*df_kspace + df_rspace*df_rspace +
+                                   df_table*df_table);
+
+  return estimated_accuracy;
+}
+
+/* ----------------------------------------------------------------------
+   set local subset of PPPM/FFT grid that I own
+   n xyz lo/hi in = 3d brick that I own (inclusive)
+   n xyz lo/hi out = 3d brick + ghost cells in 6 directions (inclusive)
+   n xyz lo/hi fft = FFT columns that I own (all of x dim, 2d decomp in yz)
+------------------------------------------------------------------------- */
+
+void PPPM2::set_grid_local()
+{
+  // global indices of PPPM grid range from 0 to N-1
+  // nlo_in,nhi_in = lower/upper limits of the 3d sub-brick of
+  // global PPPM grid that I own without ghost cells
+  // for slab PPPM, assign z grid as if it were not extended
+  // both non-tiled and tiled proc layouts use 0-1 fractional subdomain info
+
+  if (comm->layout != Comm::LAYOUT_TILED) {
+    nxlo_in = static_cast<int> (comm->xsplit[comm->myloc[0]] * nx_pppm);
+    nxhi_in = static_cast<int> (comm->xsplit[comm->myloc[0]+1] * nx_pppm) - 1;
+
+    nylo_in = static_cast<int> (comm->ysplit[comm->myloc[1]] * ny_pppm);
+    nyhi_in = static_cast<int> (comm->ysplit[comm->myloc[1]+1] * ny_pppm) - 1;
+
+    nzlo_in = static_cast<int>
+      (comm->zsplit[comm->myloc[2]] * nz_pppm/slab_volfactor);
+    nzhi_in = static_cast<int>
+      (comm->zsplit[comm->myloc[2]+1] * nz_pppm/slab_volfactor) - 1;
+
+  } else {
+    nxlo_in = static_cast<int> (comm->mysplit[0][0] * nx_pppm);
+    nxhi_in = static_cast<int> (comm->mysplit[0][1] * nx_pppm) - 1;
+
+    nylo_in = static_cast<int> (comm->mysplit[1][0] * ny_pppm);
+    nyhi_in = static_cast<int> (comm->mysplit[1][1] * ny_pppm) - 1;
+
+    nzlo_in = static_cast<int> (comm->mysplit[2][0] * nz_pppm/slab_volfactor);
+    nzhi_in = static_cast<int> (comm->mysplit[2][1] * nz_pppm/slab_volfactor) - 1;
+  }
+
+  // nlower,nupper = stencil size for mapping particles to PPPM grid
+
+  nlower = -(order-1)/2;
+  nupper = order/2;
+
+  // shift values for particle <-> grid mapping
+  // add/subtract OFFSET to avoid int(-0.75) = 0 when want it to be -1
+
+  if (order % 2) shift = OFFSET + 0.5;
+  else shift = OFFSET;
+  if (order % 2) shiftone = 0.0;
+  else shiftone = 0.5;
+
+  // nlo_out,nhi_out = lower/upper limits of the 3d sub-brick of
+  // global PPPM grid that my particles can contribute charge to
+  // effectively nlo_in,nhi_in + ghost cells
+  // nlo,nhi = global coords of grid pt to "lower left" of smallest/largest
+  // position a particle in my box can be at
+  // dist[3] = particle position bound = subbox + skin/2.0 + qdist
+  // qdist = offset due to TIP4P fictitious charge
+  // convert to triclinic if necessary
+  // nlo_out,nhi_out = nlo,nhi + stencil size for particle mapping
+  // for slab PPPM, assign z grid as if it were not extended
+
+  double *prd,*sublo,*subhi;
+
+  if (triclinic == 0) {
+    prd = domain->prd;
+    boxlo = domain->boxlo;
+    sublo = domain->sublo;
+    subhi = domain->subhi;
+  } else {
+    prd = domain->prd_lamda;
+    boxlo = domain->boxlo_lamda;
+    sublo = domain->sublo_lamda;
+    subhi = domain->subhi_lamda;
+  }
+
+  double xprd = prd[0];
+  double yprd = prd[1];
+  double zprd = prd[2];
+  double zprd_slab = zprd*slab_volfactor;
+
+  double dist[3] = {0.0,0.0,0.0};
+  double cuthalf = 0.5*neighbor->skin + qdist;
+  if (triclinic == 0) dist[0] = dist[1] = dist[2] = cuthalf;
+  else kspacebbox(cuthalf,&dist[0]);
+
+  int nlo,nhi;
+  nlo = nhi = 0;
+
+  nlo = static_cast<int> ((sublo[0]-dist[0]-boxlo[0]) *
+                          nx_pppm/xprd + shift) - OFFSET;
+  nhi = static_cast<int> ((subhi[0]+dist[0]-boxlo[0]) *
+                          nx_pppm/xprd + shift) - OFFSET;
+  nxlo_out = nlo + nlower;
+  nxhi_out = nhi + nupper;
+
+  nlo = static_cast<int> ((sublo[1]-dist[1]-boxlo[1]) *
+                          ny_pppm/yprd + shift) - OFFSET;
+  nhi = static_cast<int> ((subhi[1]+dist[1]-boxlo[1]) *
+                          ny_pppm/yprd + shift) - OFFSET;
+  nylo_out = nlo + nlower;
+  nyhi_out = nhi + nupper;
+
+  nlo = static_cast<int> ((sublo[2]-dist[2]-boxlo[2]) *
+                          nz_pppm/zprd_slab + shift) - OFFSET;
+  nhi = static_cast<int> ((subhi[2]+dist[2]-boxlo[2]) *
+                          nz_pppm/zprd_slab + shift) - OFFSET;
+  nzlo_out = nlo + nlower;
+  nzhi_out = nhi + nupper;
+
+  if (stagger_flag) {
+    nxhi_out++;
+    nyhi_out++;
+    nzhi_out++;
+  }
+
+  // for slab PPPM, change the grid boundary for processors at +z end
+  // to include the empty volume between periodically repeating slabs
+  // for slab PPPM, want charge data communicated from -z proc to +z proc,
+  // but not vice versa, also want field data communicated from +z proc to
+  // -z proc, but not vice versa
+  // this is accomplished by nzhi_in = nzhi_out on +z end (no ghost cells)
+  // also ensure no other procs use ghost cells beyond +z limit
+  // different logic for non-tiled vs tiled decomposition
+
+  if (slabflag == 1) {
+    if (comm->layout != Comm::LAYOUT_TILED) {
+      if (comm->myloc[2] == comm->procgrid[2]-1) nzhi_in = nzhi_out = nz_pppm - 1;
+    } else {
+      if (comm->mysplit[2][1] == 1.0) nzhi_in = nzhi_out = nz_pppm - 1;
+    }
+    nzhi_out = MIN(nzhi_out,nz_pppm-1);
+  }
+
+  // x-pencil decomposition of FFT mesh
+  // global indices range from 0 to N-1
+  // each proc owns entire x-dimension, clumps of columns in y,z dimensions
+  // npey_fft,npez_fft = # of procs in y,z dims
+  // if nprocs is small enough, proc can own 1 or more entire xy planes,
+  // else proc owns 2d sub-blocks of yz plane
+  // me_y,me_z = which proc (0-npe_fft-1) I am in y,z dimensions
+  // nlo_fft,nhi_fft = lower/upper limit of the section
+  // of the global FFT mesh that I own in x-pencil decomposition
+
+  int npey_fft,npez_fft;
+  if (nz_pppm >= nprocs) {
+    npey_fft = 1;
+    npez_fft = nprocs;
+  } else procs2grid2d(nprocs,ny_pppm,nz_pppm,&npey_fft,&npez_fft);
+
+  int me_y = me % npey_fft;
+  int me_z = me / npey_fft;
+
+  nxlo_fft = 0;
+  nxhi_fft = nx_pppm - 1;
+  nylo_fft = me_y*ny_pppm/npey_fft;
+  nyhi_fft = (me_y+1)*ny_pppm/npey_fft - 1;
+  nzlo_fft = me_z*nz_pppm/npez_fft;
+  nzhi_fft = (me_z+1)*nz_pppm/npez_fft - 1;
+
+  // ngrid = count of PPPM grid pts owned by this proc, including ghosts
+
+  ngrid = (nxhi_out-nxlo_out+1) * (nyhi_out-nylo_out+1) *
+    (nzhi_out-nzlo_out+1);
+
+  // count of FFT grid pts owned by this proc, without ghosts
+  // nfft = FFT points in x-pencil FFT decomposition on this proc
+  // nfft_brick = FFT points in 3d brick-decomposition on this proc
+  // nfft_both = greater of 2 values
+
+  nfft = (nxhi_fft-nxlo_fft+1) * (nyhi_fft-nylo_fft+1) *
+    (nzhi_fft-nzlo_fft+1);
+  int
nfft_brick = (nxhi_in-nxlo_in+1) * (nyhi_in-nylo_in+1) * + (nzhi_in-nzlo_in+1); + nfft_both = MAX(nfft,nfft_brick); +} + +/* ---------------------------------------------------------------------- + pre-compute Green's function denominator expansion coeffs, Gamma(2n) +------------------------------------------------------------------------- */ + +void PPPM2::compute_gf_denom() +{ + int k,l,m; + + for (l = 1; l < order; l++) gf_b[l] = 0.0; + gf_b[0] = 1.0; + + for (m = 1; m < order; m++) { + for (l = m; l > 0; l--) + gf_b[l] = 4.0 * (gf_b[l]*(l-m)*(l-m-0.5)-gf_b[l-1]*(l-m-1)*(l-m-1)); + gf_b[0] = 4.0 * (gf_b[0]*(l-m)*(l-m-0.5)); + } + + bigint ifact = 1; + for (k = 1; k < 2*order; k++) ifact *= k; + double gaminv = 1.0/ifact; + for (l = 0; l < order; l++) gf_b[l] *= gaminv; +} + +/* ---------------------------------------------------------------------- + pre-compute modified (Hockney-Eastwood) Coulomb Green's function +------------------------------------------------------------------------- */ + +void PPPM2::compute_gf_ik() +{ + const double * const prd = domain->prd; + + const double xprd = prd[0]; + const double yprd = prd[1]; + const double zprd = prd[2]; + const double zprd_slab = zprd*slab_volfactor; + const double unitkx = (MY_2PI/xprd); + const double unitky = (MY_2PI/yprd); + const double unitkz = (MY_2PI/zprd_slab); + + double snx,sny,snz; + double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz; + double sum1,dot1,dot2; + double numerator,denominator; + double sqk; + + int k,l,m,n,nx,ny,nz,kper,lper,mper; + + const int nbx = static_cast ((g_ewald*xprd/(MY_PI*nx_pppm)) * + pow(-log(EPS_HOC),0.25)); + const int nby = static_cast ((g_ewald*yprd/(MY_PI*ny_pppm)) * + pow(-log(EPS_HOC),0.25)); + const int nbz = static_cast ((g_ewald*zprd_slab/(MY_PI*nz_pppm)) * + pow(-log(EPS_HOC),0.25)); + const int twoorder = 2*order; + + n = 0; + for (m = nzlo_fft; m <= nzhi_fft; m++) { + mper = m - nz_pppm*(2*m/nz_pppm); + snz = square(sin(0.5*unitkz*mper*zprd_slab/nz_pppm)); + + for (l = nylo_fft; l <= nyhi_fft; l++) { + lper = l - ny_pppm*(2*l/ny_pppm); + sny = square(sin(0.5*unitky*lper*yprd/ny_pppm)); + + for (k = nxlo_fft; k <= nxhi_fft; k++) { + kper = k - nx_pppm*(2*k/nx_pppm); + snx = square(sin(0.5*unitkx*kper*xprd/nx_pppm)); + + sqk = square(unitkx*kper) + square(unitky*lper) + square(unitkz*mper); + + if (sqk != 0.0) { + numerator = 12.5663706/sqk; + denominator = gf_denom(snx,sny,snz); + sum1 = 0.0; + + for (nx = -nbx; nx <= nbx; nx++) { + qx = unitkx*(kper+nx_pppm*nx); + sx = exp(-0.25*square(qx/g_ewald)); + argx = 0.5*qx*xprd/nx_pppm; + wx = powsinxx(argx,twoorder); + + for (ny = -nby; ny <= nby; ny++) { + qy = unitky*(lper+ny_pppm*ny); + sy = exp(-0.25*square(qy/g_ewald)); + argy = 0.5*qy*yprd/ny_pppm; + wy = powsinxx(argy,twoorder); + + for (nz = -nbz; nz <= nbz; nz++) { + qz = unitkz*(mper+nz_pppm*nz); + sz = exp(-0.25*square(qz/g_ewald)); + argz = 0.5*qz*zprd_slab/nz_pppm; + wz = powsinxx(argz,twoorder); + + dot1 = unitkx*kper*qx + unitky*lper*qy + unitkz*mper*qz; + dot2 = qx*qx+qy*qy+qz*qz; + sum1 += (dot1/dot2) * sx*sy*sz * wx*wy*wz; + } + } + } + greensfn[n++] = numerator*sum1/denominator; + } else greensfn[n++] = 0.0; + } + } + } +} + +/* ---------------------------------------------------------------------- + pre-compute modified (Hockney-Eastwood) Coulomb Green's function + for a triclinic system +------------------------------------------------------------------------- */ + +void PPPM2::compute_gf_ik_triclinic() +{ + double snx,sny,snz; + double 
argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz; + double sum1,dot1,dot2; + double numerator,denominator; + double sqk; + + int k,l,m,n,nx,ny,nz,kper,lper,mper; + + double tmp[3]; + tmp[0] = (g_ewald/(MY_PI*nx_pppm)) * pow(-log(EPS_HOC),0.25); + tmp[1] = (g_ewald/(MY_PI*ny_pppm)) * pow(-log(EPS_HOC),0.25); + tmp[2] = (g_ewald/(MY_PI*nz_pppm)) * pow(-log(EPS_HOC),0.25); + lamda2xT(&tmp[0],&tmp[0]); + const int nbx = static_cast (tmp[0]); + const int nby = static_cast (tmp[1]); + const int nbz = static_cast (tmp[2]); + + const int twoorder = 2*order; + + n = 0; + for (m = nzlo_fft; m <= nzhi_fft; m++) { + mper = m - nz_pppm*(2*m/nz_pppm); + snz = square(sin(MY_PI*mper/nz_pppm)); + + for (l = nylo_fft; l <= nyhi_fft; l++) { + lper = l - ny_pppm*(2*l/ny_pppm); + sny = square(sin(MY_PI*lper/ny_pppm)); + + for (k = nxlo_fft; k <= nxhi_fft; k++) { + kper = k - nx_pppm*(2*k/nx_pppm); + snx = square(sin(MY_PI*kper/nx_pppm)); + + double unitk_lamda[3]; + unitk_lamda[0] = 2.0*MY_PI*kper; + unitk_lamda[1] = 2.0*MY_PI*lper; + unitk_lamda[2] = 2.0*MY_PI*mper; + x2lamdaT(&unitk_lamda[0],&unitk_lamda[0]); + + sqk = square(unitk_lamda[0]) + square(unitk_lamda[1]) + square(unitk_lamda[2]); + + if (sqk != 0.0) { + numerator = 12.5663706/sqk; + denominator = gf_denom(snx,sny,snz); + sum1 = 0.0; + + for (nx = -nbx; nx <= nbx; nx++) { + argx = MY_PI*kper/nx_pppm + MY_PI*nx; + wx = powsinxx(argx,twoorder); + + for (ny = -nby; ny <= nby; ny++) { + argy = MY_PI*lper/ny_pppm + MY_PI*ny; + wy = powsinxx(argy,twoorder); + + for (nz = -nbz; nz <= nbz; nz++) { + argz = MY_PI*mper/nz_pppm + MY_PI*nz; + wz = powsinxx(argz,twoorder); + + double b[3]; + b[0] = 2.0*MY_PI*nx_pppm*nx; + b[1] = 2.0*MY_PI*ny_pppm*ny; + b[2] = 2.0*MY_PI*nz_pppm*nz; + x2lamdaT(&b[0],&b[0]); + + qx = unitk_lamda[0]+b[0]; + sx = exp(-0.25*square(qx/g_ewald)); + + qy = unitk_lamda[1]+b[1]; + sy = exp(-0.25*square(qy/g_ewald)); + + qz = unitk_lamda[2]+b[2]; + sz = exp(-0.25*square(qz/g_ewald)); + + dot1 = unitk_lamda[0]*qx + unitk_lamda[1]*qy + unitk_lamda[2]*qz; + dot2 = qx*qx+qy*qy+qz*qz; + sum1 += (dot1/dot2) * sx*sy*sz * wx*wy*wz; + } + } + } + greensfn[n++] = numerator*sum1/denominator; + } else greensfn[n++] = 0.0; + } + } + } +} + +/* ---------------------------------------------------------------------- + compute optimized Green's function for energy calculation +------------------------------------------------------------------------- */ + +void PPPM2::compute_gf_ad() +{ + const double * const prd = domain->prd; + + const double xprd = prd[0]; + const double yprd = prd[1]; + const double zprd = prd[2]; + const double zprd_slab = zprd*slab_volfactor; + const double unitkx = (MY_2PI/xprd); + const double unitky = (MY_2PI/yprd); + const double unitkz = (MY_2PI/zprd_slab); + + double snx,sny,snz,sqk; + double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz; + double numerator,denominator; + int k,l,m,n,kper,lper,mper; + + const int twoorder = 2*order; + + for (int i = 0; i < 6; i++) sf_coeff[i] = 0.0; + + n = 0; + for (m = nzlo_fft; m <= nzhi_fft; m++) { + mper = m - nz_pppm*(2*m/nz_pppm); + qz = unitkz*mper; + snz = square(sin(0.5*qz*zprd_slab/nz_pppm)); + sz = exp(-0.25*square(qz/g_ewald)); + argz = 0.5*qz*zprd_slab/nz_pppm; + wz = powsinxx(argz,twoorder); + + for (l = nylo_fft; l <= nyhi_fft; l++) { + lper = l - ny_pppm*(2*l/ny_pppm); + qy = unitky*lper; + sny = square(sin(0.5*qy*yprd/ny_pppm)); + sy = exp(-0.25*square(qy/g_ewald)); + argy = 0.5*qy*yprd/ny_pppm; + wy = powsinxx(argy,twoorder); + + for (k = nxlo_fft; k <= nxhi_fft; k++) { + kper = k - 
nx_pppm*(2*k/nx_pppm); + qx = unitkx*kper; + snx = square(sin(0.5*qx*xprd/nx_pppm)); + sx = exp(-0.25*square(qx/g_ewald)); + argx = 0.5*qx*xprd/nx_pppm; + wx = powsinxx(argx,twoorder); + + sqk = qx*qx + qy*qy + qz*qz; + + if (sqk != 0.0) { + numerator = MY_4PI/sqk; + denominator = gf_denom(snx,sny,snz); + greensfn[n] = numerator*sx*sy*sz*wx*wy*wz/denominator; + sf_coeff[0] += sf_precoeff1[n]*greensfn[n]; + sf_coeff[1] += sf_precoeff2[n]*greensfn[n]; + sf_coeff[2] += sf_precoeff3[n]*greensfn[n]; + sf_coeff[3] += sf_precoeff4[n]*greensfn[n]; + sf_coeff[4] += sf_precoeff5[n]*greensfn[n]; + sf_coeff[5] += sf_precoeff6[n]*greensfn[n]; + n++; + } else { + greensfn[n] = 0.0; + sf_coeff[0] += sf_precoeff1[n]*greensfn[n]; + sf_coeff[1] += sf_precoeff2[n]*greensfn[n]; + sf_coeff[2] += sf_precoeff3[n]*greensfn[n]; + sf_coeff[3] += sf_precoeff4[n]*greensfn[n]; + sf_coeff[4] += sf_precoeff5[n]*greensfn[n]; + sf_coeff[5] += sf_precoeff6[n]*greensfn[n]; + n++; + } + } + } + } + + // compute the coefficients for the self-force correction + + double prex, prey, prez; + prex = prey = prez = MY_PI/volume; + prex *= nx_pppm/xprd; + prey *= ny_pppm/yprd; + prez *= nz_pppm/zprd_slab; + sf_coeff[0] *= prex; + sf_coeff[1] *= prex*2; + sf_coeff[2] *= prey; + sf_coeff[3] *= prey*2; + sf_coeff[4] *= prez; + sf_coeff[5] *= prez*2; + + // communicate values with other procs + + double tmp[6]; + MPI_Allreduce(sf_coeff,tmp,6,MPI_DOUBLE,MPI_SUM,world); + for (n = 0; n < 6; n++) sf_coeff[n] = tmp[n]; +} + +/* ---------------------------------------------------------------------- + compute self force coefficients for ad-differentiation scheme +------------------------------------------------------------------------- */ + +void PPPM2::compute_sf_precoeff() +{ + int i,k,l,m,n; + int nx,ny,nz,kper,lper,mper; + double wx0[5],wy0[5],wz0[5],wx1[5],wy1[5],wz1[5],wx2[5],wy2[5],wz2[5]; + double qx0,qy0,qz0,qx1,qy1,qz1,qx2,qy2,qz2; + double u0,u1,u2,u3,u4,u5,u6; + double sum1,sum2,sum3,sum4,sum5,sum6; + + n = 0; + for (m = nzlo_fft; m <= nzhi_fft; m++) { + mper = m - nz_pppm*(2*m/nz_pppm); + + for (l = nylo_fft; l <= nyhi_fft; l++) { + lper = l - ny_pppm*(2*l/ny_pppm); + + for (k = nxlo_fft; k <= nxhi_fft; k++) { + kper = k - nx_pppm*(2*k/nx_pppm); + + sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = 0.0; + for (i = 0; i < 5; i++) { + + qx0 = MY_2PI*(kper+nx_pppm*(i-2)); + qx1 = MY_2PI*(kper+nx_pppm*(i-1)); + qx2 = MY_2PI*(kper+nx_pppm*(i )); + wx0[i] = powsinxx(0.5*qx0/nx_pppm,order); + wx1[i] = powsinxx(0.5*qx1/nx_pppm,order); + wx2[i] = powsinxx(0.5*qx2/nx_pppm,order); + + qy0 = MY_2PI*(lper+ny_pppm*(i-2)); + qy1 = MY_2PI*(lper+ny_pppm*(i-1)); + qy2 = MY_2PI*(lper+ny_pppm*(i )); + wy0[i] = powsinxx(0.5*qy0/ny_pppm,order); + wy1[i] = powsinxx(0.5*qy1/ny_pppm,order); + wy2[i] = powsinxx(0.5*qy2/ny_pppm,order); + + qz0 = MY_2PI*(mper+nz_pppm*(i-2)); + qz1 = MY_2PI*(mper+nz_pppm*(i-1)); + qz2 = MY_2PI*(mper+nz_pppm*(i )); + + wz0[i] = powsinxx(0.5*qz0/nz_pppm,order); + wz1[i] = powsinxx(0.5*qz1/nz_pppm,order); + wz2[i] = powsinxx(0.5*qz2/nz_pppm,order); + } + + for (nx = 0; nx < 5; nx++) { + for (ny = 0; ny < 5; ny++) { + for (nz = 0; nz < 5; nz++) { + u0 = wx0[nx]*wy0[ny]*wz0[nz]; + u1 = wx1[nx]*wy0[ny]*wz0[nz]; + u2 = wx2[nx]*wy0[ny]*wz0[nz]; + u3 = wx0[nx]*wy1[ny]*wz0[nz]; + u4 = wx0[nx]*wy2[ny]*wz0[nz]; + u5 = wx0[nx]*wy0[ny]*wz1[nz]; + u6 = wx0[nx]*wy0[ny]*wz2[nz]; + + sum1 += u0*u1; + sum2 += u0*u2; + sum3 += u0*u3; + sum4 += u0*u4; + sum5 += u0*u5; + sum6 += u0*u6; + } + } + } + + // store values + + sf_precoeff1[n] = sum1; + 
sf_precoeff2[n] = sum2; + sf_precoeff3[n] = sum3; + sf_precoeff4[n] = sum4; + sf_precoeff5[n] = sum5; + sf_precoeff6[n++] = sum6; + } + } + } +} + +/* ---------------------------------------------------------------------- + find center grid pt for each of my particles + check that full stencil for the particle will fit in my 3d brick + store central grid pt indices in part2grid array +------------------------------------------------------------------------- */ + +void PPPM2::particle_map() +{ + int nx,ny,nz; + + double **x = atom->x; + int nlocal = atom->nlocal; + + int flag = 0; + + if (!std::isfinite(boxlo[0]) || !std::isfinite(boxlo[1]) || !std::isfinite(boxlo[2])) + error->one(FLERR,"Non-numeric box dimensions - simulation unstable"); + + for (int i = 0; i < nlocal; i++) { + + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // current particle coord can be outside global and local box + // add/subtract OFFSET to avoid int(-0.75) = 0 when want it to be -1 + + nx = static_cast ((x[i][0]-boxlo[0])*delxinv+shift) - OFFSET; + ny = static_cast ((x[i][1]-boxlo[1])*delyinv+shift) - OFFSET; + nz = static_cast ((x[i][2]-boxlo[2])*delzinv+shift) - OFFSET; + + part2grid[i][0] = nx; + part2grid[i][1] = ny; + part2grid[i][2] = nz; + + // check that entire stencil around nx,ny,nz will fit in my 3d brick + + if (nx+nlower < nxlo_out || nx+nupper > nxhi_out || + ny+nlower < nylo_out || ny+nupper > nyhi_out || + nz+nlower < nzlo_out || nz+nupper > nzhi_out) + flag = 1; + } + + if (flag) error->one(FLERR,"Out of range atoms - cannot compute PPPM"); +} + +/* ---------------------------------------------------------------------- + create discretized "density" on section of global grid due to my particles + density(x,y,z) = charge "density" at grid points of my 3d brick + (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts) + in global grid +------------------------------------------------------------------------- */ + +void PPPM2::make_rho() +{ + int l,m,n,nx,ny,nz,mx,my,mz; + FFT_SCALAR dx,dy,dz,x0,y0,z0; + + // clear 3d density array + + memset(&(density_brick[nzlo_out][nylo_out][nxlo_out]),0, + ngrid*sizeof(FFT_SCALAR)); + + // loop over my charges, add their contribution to nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + + double *q = atom->q; + double **x = atom->x; + int nlocal = atom->nlocal; + + for (int i = 0; i < nlocal; i++) { + + nx = part2grid[i][0]; + ny = part2grid[i][1]; + nz = part2grid[i][2]; + dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv; + dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv; + dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv; + + compute_rho1d(dx,dy,dz); + + z0 = delvolinv * q[i]; + for (n = nlower; n <= nupper; n++) { + mz = n+nz; + y0 = z0*rho1d[2][n]; + for (m = nlower; m <= nupper; m++) { + my = m+ny; + x0 = y0*rho1d[1][m]; + for (l = nlower; l <= nupper; l++) { + mx = l+nx; + density_brick[mz][my][mx] += x0*rho1d[0][l]; + } + } + } + } +} + +/* ---------------------------------------------------------------------- + remap density from 3d brick decomposition to FFT decomposition +------------------------------------------------------------------------- */ + +void PPPM2::brick2fft() +{ + int n,ix,iy,iz; + + // copy grabs inner portion of density from 3d brick + // remap could be done as pre-stage of FFT, + // but this works optimally on only double values, not complex values + + n = 0; + for 
(iz = nzlo_in; iz <= nzhi_in; iz++) + for (iy = nylo_in; iy <= nyhi_in; iy++) + for (ix = nxlo_in; ix <= nxhi_in; ix++) + density_fft[n++] = density_brick[iz][iy][ix]; + + remap->perform(density_fft,density_fft,work1); +} + +/* ---------------------------------------------------------------------- + FFT-based Poisson solver +------------------------------------------------------------------------- */ + +void PPPM2::poisson() +{ + if (differentiation_flag == 1) poisson_ad(); + else poisson_ik(); +} + +/* ---------------------------------------------------------------------- + FFT-based Poisson solver for ik +------------------------------------------------------------------------- */ + +void PPPM2::poisson_ik() +{ + int i,j,k,n; + double eng; + + // transform charge density (r -> k) + + n = 0; + for (i = 0; i < nfft; i++) { + work1[n++] = density_fft[i]; + work1[n++] = ZEROF; + } + + fft1->compute(work1,work1,1); + + // global energy and virial contribution + + double scaleinv = 1.0/(nx_pppm*ny_pppm*nz_pppm); + double s2 = scaleinv*scaleinv; + + if (eflag_global || vflag_global) { + if (vflag_global) { + n = 0; + for (i = 0; i < nfft; i++) { + eng = s2 * greensfn[i] * (work1[n]*work1[n] + work1[n+1]*work1[n+1]); + for (j = 0; j < 6; j++) virial[j] += eng*vg[i][j]; + if (eflag_global) energy += eng; + n += 2; + } + } else { + n = 0; + for (i = 0; i < nfft; i++) { + energy += + s2 * greensfn[i] * (work1[n]*work1[n] + work1[n+1]*work1[n+1]); + n += 2; + } + } + } + + // scale by 1/total-grid-pts to get rho(k) + // multiply by Green's function to get V(k) + + n = 0; + for (i = 0; i < nfft; i++) { + work1[n++] *= scaleinv * greensfn[i]; + work1[n++] *= scaleinv * greensfn[i]; + } + + // extra FFTs for per-atom energy/virial + + if (evflag_atom) poisson_peratom(); + + // triclinic system + + if (triclinic) { + poisson_ik_triclinic(); + return; + } + + // compute gradients of V(r) in each of 3 dims by transformimg -ik*V(k) + // FFT leaves data in 3d brick decomposition + // copy it into inner portion of vdx,vdy,vdz arrays + + // x direction gradient + + n = 0; + for (k = nzlo_fft; k <= nzhi_fft; k++) + for (j = nylo_fft; j <= nyhi_fft; j++) + for (i = nxlo_fft; i <= nxhi_fft; i++) { + work2[n] = fkx[i]*work1[n+1]; + work2[n+1] = -fkx[i]*work1[n]; + n += 2; + } + + fft2->compute(work2,work2,-1); + + n = 0; + for (k = nzlo_in; k <= nzhi_in; k++) + for (j = nylo_in; j <= nyhi_in; j++) + for (i = nxlo_in; i <= nxhi_in; i++) { + vdx_brick[k][j][i] = work2[n]; + n += 2; + } + + // y direction gradient + + n = 0; + for (k = nzlo_fft; k <= nzhi_fft; k++) + for (j = nylo_fft; j <= nyhi_fft; j++) + for (i = nxlo_fft; i <= nxhi_fft; i++) { + work2[n] = fky[j]*work1[n+1]; + work2[n+1] = -fky[j]*work1[n]; + n += 2; + } + + fft2->compute(work2,work2,-1); + + n = 0; + for (k = nzlo_in; k <= nzhi_in; k++) + for (j = nylo_in; j <= nyhi_in; j++) + for (i = nxlo_in; i <= nxhi_in; i++) { + vdy_brick[k][j][i] = work2[n]; + n += 2; + } + + // z direction gradient + + n = 0; + for (k = nzlo_fft; k <= nzhi_fft; k++) + for (j = nylo_fft; j <= nyhi_fft; j++) + for (i = nxlo_fft; i <= nxhi_fft; i++) { + work2[n] = fkz[k]*work1[n+1]; + work2[n+1] = -fkz[k]*work1[n]; + n += 2; + } + + fft2->compute(work2,work2,-1); + + n = 0; + for (k = nzlo_in; k <= nzhi_in; k++) + for (j = nylo_in; j <= nyhi_in; j++) + for (i = nxlo_in; i <= nxhi_in; i++) { + vdz_brick[k][j][i] = work2[n]; + n += 2; + } +} + +/* ---------------------------------------------------------------------- + FFT-based Poisson solver for ik for a triclinic system 
+------------------------------------------------------------------------- */ + +void PPPM2::poisson_ik_triclinic() +{ + int i,j,k,n; + + // compute gradients of V(r) in each of 3 dims by transformimg -ik*V(k) + // FFT leaves data in 3d brick decomposition + // copy it into inner portion of vdx,vdy,vdz arrays + + // x direction gradient + + n = 0; + for (i = 0; i < nfft; i++) { + work2[n] = fkx[i]*work1[n+1]; + work2[n+1] = -fkx[i]*work1[n]; + n += 2; + } + + fft2->compute(work2,work2,-1); + + n = 0; + for (k = nzlo_in; k <= nzhi_in; k++) + for (j = nylo_in; j <= nyhi_in; j++) + for (i = nxlo_in; i <= nxhi_in; i++) { + vdx_brick[k][j][i] = work2[n]; + n += 2; + } + + // y direction gradient + + n = 0; + for (i = 0; i < nfft; i++) { + work2[n] = fky[i]*work1[n+1]; + work2[n+1] = -fky[i]*work1[n]; + n += 2; + } + + fft2->compute(work2,work2,-1); + + n = 0; + for (k = nzlo_in; k <= nzhi_in; k++) + for (j = nylo_in; j <= nyhi_in; j++) + for (i = nxlo_in; i <= nxhi_in; i++) { + vdy_brick[k][j][i] = work2[n]; + n += 2; + } + + // z direction gradient + + n = 0; + for (i = 0; i < nfft; i++) { + work2[n] = fkz[i]*work1[n+1]; + work2[n+1] = -fkz[i]*work1[n]; + n += 2; + } + + fft2->compute(work2,work2,-1); + + n = 0; + for (k = nzlo_in; k <= nzhi_in; k++) + for (j = nylo_in; j <= nyhi_in; j++) + for (i = nxlo_in; i <= nxhi_in; i++) { + vdz_brick[k][j][i] = work2[n]; + n += 2; + } +} + +/* ---------------------------------------------------------------------- + FFT-based Poisson solver for ad +------------------------------------------------------------------------- */ + +void PPPM2::poisson_ad() +{ + int i,j,k,n; + double eng; + + // transform charge density (r -> k) + + n = 0; + for (i = 0; i < nfft; i++) { + work1[n++] = density_fft[i]; + work1[n++] = ZEROF; + } + + fft1->compute(work1,work1,1); + + // global energy and virial contribution + + double scaleinv = 1.0/(nx_pppm*ny_pppm*nz_pppm); + double s2 = scaleinv*scaleinv; + + if (eflag_global || vflag_global) { + if (vflag_global) { + n = 0; + for (i = 0; i < nfft; i++) { + eng = s2 * greensfn[i] * (work1[n]*work1[n] + work1[n+1]*work1[n+1]); + for (j = 0; j < 6; j++) virial[j] += eng*vg[i][j]; + if (eflag_global) energy += eng; + n += 2; + } + } else { + n = 0; + for (i = 0; i < nfft; i++) { + energy += + s2 * greensfn[i] * (work1[n]*work1[n] + work1[n+1]*work1[n+1]); + n += 2; + } + } + } + + // scale by 1/total-grid-pts to get rho(k) + // multiply by Green's function to get V(k) + + n = 0; + for (i = 0; i < nfft; i++) { + work1[n++] *= scaleinv * greensfn[i]; + work1[n++] *= scaleinv * greensfn[i]; + } + + // extra FFTs for per-atom energy/virial + + if (vflag_atom) poisson_peratom(); + + n = 0; + for (i = 0; i < nfft; i++) { + work2[n] = work1[n]; + work2[n+1] = work1[n+1]; + n += 2; + } + + fft2->compute(work2,work2,-1); + + n = 0; + for (k = nzlo_in; k <= nzhi_in; k++) + for (j = nylo_in; j <= nyhi_in; j++) + for (i = nxlo_in; i <= nxhi_in; i++) { + u_brick[k][j][i] = work2[n]; + n += 2; + } +} + +/* ---------------------------------------------------------------------- + FFT-based Poisson solver for per-atom energy/virial +------------------------------------------------------------------------- */ + +void PPPM2::poisson_peratom() +{ + int i,j,k,n; + + // energy + + if (eflag_atom && differentiation_flag != 1) { + n = 0; + for (i = 0; i < nfft; i++) { + work2[n] = work1[n]; + work2[n+1] = work1[n+1]; + n += 2; + } + + fft2->compute(work2,work2,-1); + + n = 0; + for (k = nzlo_in; k <= nzhi_in; k++) + for (j = nylo_in; j <= nyhi_in; j++) + 
for (i = nxlo_in; i <= nxhi_in; i++) { + u_brick[k][j][i] = work2[n]; + n += 2; + } + } + + // 6 components of virial in v0 thru v5 + + if (!vflag_atom) return; + + n = 0; + for (i = 0; i < nfft; i++) { + work2[n] = work1[n]*vg[i][0]; + work2[n+1] = work1[n+1]*vg[i][0]; + n += 2; + } + + fft2->compute(work2,work2,-1); + + n = 0; + for (k = nzlo_in; k <= nzhi_in; k++) + for (j = nylo_in; j <= nyhi_in; j++) + for (i = nxlo_in; i <= nxhi_in; i++) { + v0_brick[k][j][i] = work2[n]; + n += 2; + } + + n = 0; + for (i = 0; i < nfft; i++) { + work2[n] = work1[n]*vg[i][1]; + work2[n+1] = work1[n+1]*vg[i][1]; + n += 2; + } + + fft2->compute(work2,work2,-1); + + n = 0; + for (k = nzlo_in; k <= nzhi_in; k++) + for (j = nylo_in; j <= nyhi_in; j++) + for (i = nxlo_in; i <= nxhi_in; i++) { + v1_brick[k][j][i] = work2[n]; + n += 2; + } + + n = 0; + for (i = 0; i < nfft; i++) { + work2[n] = work1[n]*vg[i][2]; + work2[n+1] = work1[n+1]*vg[i][2]; + n += 2; + } + + fft2->compute(work2,work2,-1); + + n = 0; + for (k = nzlo_in; k <= nzhi_in; k++) + for (j = nylo_in; j <= nyhi_in; j++) + for (i = nxlo_in; i <= nxhi_in; i++) { + v2_brick[k][j][i] = work2[n]; + n += 2; + } + + n = 0; + for (i = 0; i < nfft; i++) { + work2[n] = work1[n]*vg[i][3]; + work2[n+1] = work1[n+1]*vg[i][3]; + n += 2; + } + + fft2->compute(work2,work2,-1); + + n = 0; + for (k = nzlo_in; k <= nzhi_in; k++) + for (j = nylo_in; j <= nyhi_in; j++) + for (i = nxlo_in; i <= nxhi_in; i++) { + v3_brick[k][j][i] = work2[n]; + n += 2; + } + + n = 0; + for (i = 0; i < nfft; i++) { + work2[n] = work1[n]*vg[i][4]; + work2[n+1] = work1[n+1]*vg[i][4]; + n += 2; + } + + fft2->compute(work2,work2,-1); + + n = 0; + for (k = nzlo_in; k <= nzhi_in; k++) + for (j = nylo_in; j <= nyhi_in; j++) + for (i = nxlo_in; i <= nxhi_in; i++) { + v4_brick[k][j][i] = work2[n]; + n += 2; + } + + n = 0; + for (i = 0; i < nfft; i++) { + work2[n] = work1[n]*vg[i][5]; + work2[n+1] = work1[n+1]*vg[i][5]; + n += 2; + } + + fft2->compute(work2,work2,-1); + + n = 0; + for (k = nzlo_in; k <= nzhi_in; k++) + for (j = nylo_in; j <= nyhi_in; j++) + for (i = nxlo_in; i <= nxhi_in; i++) { + v5_brick[k][j][i] = work2[n]; + n += 2; + } +} + +/* ---------------------------------------------------------------------- + interpolate from grid to get electric field & force on my particles +------------------------------------------------------------------------- */ + +void PPPM2::fieldforce() +{ + if (differentiation_flag == 1) fieldforce_ad(); + else fieldforce_ik(); +} + +/* ---------------------------------------------------------------------- + interpolate from grid to get electric field & force on my particles for ik +------------------------------------------------------------------------- */ + +void PPPM2::fieldforce_ik() +{ + int i,l,m,n,nx,ny,nz,mx,my,mz; + FFT_SCALAR dx,dy,dz,x0,y0,z0; + FFT_SCALAR ekx,eky,ekz; + + // loop over my charges, interpolate electric field from nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + // ek = 3 components of E-field on particle + + double *q = atom->q; + double **x = atom->x; + double **f = atom->f; + + int nlocal = atom->nlocal; + + for (i = 0; i < nlocal; i++) { + nx = part2grid[i][0]; + ny = part2grid[i][1]; + nz = part2grid[i][2]; + dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv; + dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv; + dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv; + + compute_rho1d(dx,dy,dz); + + 
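// accumulate the E-field at the particle from vdx/vdy/vdz_brick,
+    // weighting each stencil grid point by the same rho1d charge-assignment
+    // weights used to spread the charge in make_rho()
+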
ekx = eky = ekz = ZEROF; + for (n = nlower; n <= nupper; n++) { + mz = n+nz; + z0 = rho1d[2][n]; + for (m = nlower; m <= nupper; m++) { + my = m+ny; + y0 = z0*rho1d[1][m]; + for (l = nlower; l <= nupper; l++) { + mx = l+nx; + x0 = y0*rho1d[0][l]; + ekx -= x0*vdx_brick[mz][my][mx]; + eky -= x0*vdy_brick[mz][my][mx]; + ekz -= x0*vdz_brick[mz][my][mx]; + } + } + } + + // convert E-field to force + + const double qfactor = qqrd2e * scale * q[i]; + f[i][0] += qfactor*ekx; + f[i][1] += qfactor*eky; + if (slabflag != 2) f[i][2] += qfactor*ekz; + } +} + +/* ---------------------------------------------------------------------- + interpolate from grid to get electric field & force on my particles for ad +------------------------------------------------------------------------- */ + +void PPPM2::fieldforce_ad() +{ + int i,l,m,n,nx,ny,nz,mx,my,mz; + FFT_SCALAR dx,dy,dz; + FFT_SCALAR ekx,eky,ekz; + double s1,s2,s3; + double sf = 0.0; + double *prd; + + prd = domain->prd; + double xprd = prd[0]; + double yprd = prd[1]; + double zprd = prd[2]; + + double hx_inv = nx_pppm/xprd; + double hy_inv = ny_pppm/yprd; + double hz_inv = nz_pppm/zprd; + + // loop over my charges, interpolate electric field from nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + // ek = 3 components of E-field on particle + + double *q = atom->q; + double **x = atom->x; + double **f = atom->f; + + int nlocal = atom->nlocal; + + for (i = 0; i < nlocal; i++) { + nx = part2grid[i][0]; + ny = part2grid[i][1]; + nz = part2grid[i][2]; + dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv; + dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv; + dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv; + + compute_rho1d(dx,dy,dz); + compute_drho1d(dx,dy,dz); + + ekx = eky = ekz = ZEROF; + for (n = nlower; n <= nupper; n++) { + mz = n+nz; + for (m = nlower; m <= nupper; m++) { + my = m+ny; + for (l = nlower; l <= nupper; l++) { + mx = l+nx; + ekx += drho1d[0][l]*rho1d[1][m]*rho1d[2][n]*u_brick[mz][my][mx]; + eky += rho1d[0][l]*drho1d[1][m]*rho1d[2][n]*u_brick[mz][my][mx]; + ekz += rho1d[0][l]*rho1d[1][m]*drho1d[2][n]*u_brick[mz][my][mx]; + } + } + } + ekx *= hx_inv; + eky *= hy_inv; + ekz *= hz_inv; + + // convert E-field to force and subtract self forces + + const double qfactor = qqrd2e * scale; + + s1 = x[i][0]*hx_inv; + s2 = x[i][1]*hy_inv; + s3 = x[i][2]*hz_inv; + sf = sf_coeff[0]*sin(2*MY_PI*s1); + sf += sf_coeff[1]*sin(4*MY_PI*s1); + sf *= 2*q[i]*q[i]; + f[i][0] += qfactor*(ekx*q[i] - sf); + + sf = sf_coeff[2]*sin(2*MY_PI*s2); + sf += sf_coeff[3]*sin(4*MY_PI*s2); + sf *= 2*q[i]*q[i]; + f[i][1] += qfactor*(eky*q[i] - sf); + + + sf = sf_coeff[4]*sin(2*MY_PI*s3); + sf += sf_coeff[5]*sin(4*MY_PI*s3); + sf *= 2*q[i]*q[i]; + if (slabflag != 2) f[i][2] += qfactor*(ekz*q[i] - sf); + } +} + +/* ---------------------------------------------------------------------- + interpolate from grid to get per-atom energy/virial +------------------------------------------------------------------------- */ + +void PPPM2::fieldforce_peratom() +{ + int i,l,m,n,nx,ny,nz,mx,my,mz; + FFT_SCALAR dx,dy,dz,x0,y0,z0; + FFT_SCALAR u,v0,v1,v2,v3,v4,v5; + + // loop over my charges, interpolate from nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + + double *q = atom->q; + double **x = atom->x; + + int nlocal 
= atom->nlocal; + + for (i = 0; i < nlocal; i++) { + nx = part2grid[i][0]; + ny = part2grid[i][1]; + nz = part2grid[i][2]; + dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv; + dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv; + dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv; + + compute_rho1d(dx,dy,dz); + + u = v0 = v1 = v2 = v3 = v4 = v5 = ZEROF; + for (n = nlower; n <= nupper; n++) { + mz = n+nz; + z0 = rho1d[2][n]; + for (m = nlower; m <= nupper; m++) { + my = m+ny; + y0 = z0*rho1d[1][m]; + for (l = nlower; l <= nupper; l++) { + mx = l+nx; + x0 = y0*rho1d[0][l]; + if (eflag_atom) u += x0*u_brick[mz][my][mx]; + if (vflag_atom) { + v0 += x0*v0_brick[mz][my][mx]; + v1 += x0*v1_brick[mz][my][mx]; + v2 += x0*v2_brick[mz][my][mx]; + v3 += x0*v3_brick[mz][my][mx]; + v4 += x0*v4_brick[mz][my][mx]; + v5 += x0*v5_brick[mz][my][mx]; + } + } + } + } + + if (eflag_atom) eatom[i] += q[i]*u; + if (vflag_atom) { + vatom[i][0] += q[i]*v0; + vatom[i][1] += q[i]*v1; + vatom[i][2] += q[i]*v2; + vatom[i][3] += q[i]*v3; + vatom[i][4] += q[i]*v4; + vatom[i][5] += q[i]*v5; + } + } +} + +/* ---------------------------------------------------------------------- + pack own values to buf to send to another proc +------------------------------------------------------------------------- */ + +void PPPM2::pack_forward2(int flag, void *pbuf, int nlist, int *list) +{ + FFT_SCALAR *buf = (FFT_SCALAR *) pbuf; + + int n = 0; + + if (flag == FORWARD_IK) { + FFT_SCALAR *xsrc = &vdx_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *ysrc = &vdy_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *zsrc = &vdz_brick[nzlo_out][nylo_out][nxlo_out]; + for (int i = 0; i < nlist; i++) { + buf[n++] = xsrc[list[i]]; + buf[n++] = ysrc[list[i]]; + buf[n++] = zsrc[list[i]]; + } + } else if (flag == FORWARD_AD) { + FFT_SCALAR *src = &u_brick[nzlo_out][nylo_out][nxlo_out]; + for (int i = 0; i < nlist; i++) + buf[i] = src[list[i]]; + } else if (flag == FORWARD_IK_PERATOM) { + FFT_SCALAR *esrc = &u_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v0src = &v0_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v1src = &v1_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v2src = &v2_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v3src = &v3_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v4src = &v4_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v5src = &v5_brick[nzlo_out][nylo_out][nxlo_out]; + for (int i = 0; i < nlist; i++) { + if (eflag_atom) buf[n++] = esrc[list[i]]; + if (vflag_atom) { + buf[n++] = v0src[list[i]]; + buf[n++] = v1src[list[i]]; + buf[n++] = v2src[list[i]]; + buf[n++] = v3src[list[i]]; + buf[n++] = v4src[list[i]]; + buf[n++] = v5src[list[i]]; + } + } + } else if (flag == FORWARD_AD_PERATOM) { + FFT_SCALAR *v0src = &v0_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v1src = &v1_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v2src = &v2_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v3src = &v3_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v4src = &v4_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v5src = &v5_brick[nzlo_out][nylo_out][nxlo_out]; + for (int i = 0; i < nlist; i++) { + buf[n++] = v0src[list[i]]; + buf[n++] = v1src[list[i]]; + buf[n++] = v2src[list[i]]; + buf[n++] = v3src[list[i]]; + buf[n++] = v4src[list[i]]; + buf[n++] = v5src[list[i]]; + } + } +} + +/* ---------------------------------------------------------------------- + unpack another proc's own values from buf and set own ghost values +------------------------------------------------------------------------- */ + +void 
PPPM2::unpack_forward2(int flag, void *pbuf, int nlist, int *list) +{ + FFT_SCALAR *buf = (FFT_SCALAR *) pbuf; + + int n = 0; + + if (flag == FORWARD_IK) { + FFT_SCALAR *xdest = &vdx_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *ydest = &vdy_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *zdest = &vdz_brick[nzlo_out][nylo_out][nxlo_out]; + for (int i = 0; i < nlist; i++) { + xdest[list[i]] = buf[n++]; + ydest[list[i]] = buf[n++]; + zdest[list[i]] = buf[n++]; + } + } else if (flag == FORWARD_AD) { + FFT_SCALAR *dest = &u_brick[nzlo_out][nylo_out][nxlo_out]; + for (int i = 0; i < nlist; i++) + dest[list[i]] = buf[i]; + } else if (flag == FORWARD_IK_PERATOM) { + FFT_SCALAR *esrc = &u_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v0src = &v0_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v1src = &v1_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v2src = &v2_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v3src = &v3_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v4src = &v4_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v5src = &v5_brick[nzlo_out][nylo_out][nxlo_out]; + for (int i = 0; i < nlist; i++) { + if (eflag_atom) esrc[list[i]] = buf[n++]; + if (vflag_atom) { + v0src[list[i]] = buf[n++]; + v1src[list[i]] = buf[n++]; + v2src[list[i]] = buf[n++]; + v3src[list[i]] = buf[n++]; + v4src[list[i]] = buf[n++]; + v5src[list[i]] = buf[n++]; + } + } + } else if (flag == FORWARD_AD_PERATOM) { + FFT_SCALAR *v0src = &v0_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v1src = &v1_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v2src = &v2_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v3src = &v3_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v4src = &v4_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v5src = &v5_brick[nzlo_out][nylo_out][nxlo_out]; + for (int i = 0; i < nlist; i++) { + v0src[list[i]] = buf[n++]; + v1src[list[i]] = buf[n++]; + v2src[list[i]] = buf[n++]; + v3src[list[i]] = buf[n++]; + v4src[list[i]] = buf[n++]; + v5src[list[i]] = buf[n++]; + } + } +} + +/* ---------------------------------------------------------------------- + pack ghost values into buf to send to another proc +------------------------------------------------------------------------- */ + +void PPPM2::pack_reverse2(int flag, void *pbuf, int nlist, int *list) +{ + FFT_SCALAR *buf = (FFT_SCALAR *) pbuf; + + if (flag == REVERSE_RHO) { + FFT_SCALAR *src = &density_brick[nzlo_out][nylo_out][nxlo_out]; + for (int i = 0; i < nlist; i++) + buf[i] = src[list[i]]; + } +} + +/* ---------------------------------------------------------------------- + unpack another proc's ghost values from buf and add to own values +------------------------------------------------------------------------- */ + +void PPPM2::unpack_reverse2(int flag, void *pbuf, int nlist, int *list) +{ + FFT_SCALAR *buf = (FFT_SCALAR *) pbuf; + + if (flag == REVERSE_RHO) { + FFT_SCALAR *dest = &density_brick[nzlo_out][nylo_out][nxlo_out]; + for (int i = 0; i < nlist; i++) + dest[list[i]] += buf[i]; + } +} + +/* ---------------------------------------------------------------------- + map nprocs to NX by NY grid as PX by PY procs - return optimal px,py +------------------------------------------------------------------------- */ + +void PPPM2::procs2grid2d(int nprocs, int nx, int ny, int *px, int *py) +{ + // loop thru all possible factorizations of nprocs + // surf = surface area of largest proc sub-domain + // innermost if test minimizes surface area and surface/volume ratio + + int bestsurf = 2 * (nx + ny); + int 
bestboxx = 0; + int bestboxy = 0; + + int boxx,boxy,surf,ipx,ipy; + + ipx = 1; + while (ipx <= nprocs) { + if (nprocs % ipx == 0) { + ipy = nprocs/ipx; + boxx = nx/ipx; + if (nx % ipx) boxx++; + boxy = ny/ipy; + if (ny % ipy) boxy++; + surf = boxx + boxy; + if (surf < bestsurf || + (surf == bestsurf && boxx*boxy > bestboxx*bestboxy)) { + bestsurf = surf; + bestboxx = boxx; + bestboxy = boxy; + *px = ipx; + *py = ipy; + } + } + ipx++; + } +} + +/* ---------------------------------------------------------------------- + charge assignment into rho1d + dx,dy,dz = distance of particle from "lower left" grid point +------------------------------------------------------------------------- */ + +void PPPM2::compute_rho1d(const FFT_SCALAR &dx, const FFT_SCALAR &dy, + const FFT_SCALAR &dz) +{ + int k,l; + FFT_SCALAR r1,r2,r3; + + for (k = (1-order)/2; k <= order/2; k++) { + r1 = r2 = r3 = ZEROF; + + for (l = order-1; l >= 0; l--) { + r1 = rho_coeff[l][k] + r1*dx; + r2 = rho_coeff[l][k] + r2*dy; + r3 = rho_coeff[l][k] + r3*dz; + } + rho1d[0][k] = r1; + rho1d[1][k] = r2; + rho1d[2][k] = r3; + } +} + +/* ---------------------------------------------------------------------- + charge assignment into drho1d + dx,dy,dz = distance of particle from "lower left" grid point +------------------------------------------------------------------------- */ + +void PPPM2::compute_drho1d(const FFT_SCALAR &dx, const FFT_SCALAR &dy, + const FFT_SCALAR &dz) +{ + int k,l; + FFT_SCALAR r1,r2,r3; + + for (k = (1-order)/2; k <= order/2; k++) { + r1 = r2 = r3 = ZEROF; + + for (l = order-2; l >= 0; l--) { + r1 = drho_coeff[l][k] + r1*dx; + r2 = drho_coeff[l][k] + r2*dy; + r3 = drho_coeff[l][k] + r3*dz; + } + drho1d[0][k] = r1; + drho1d[1][k] = r2; + drho1d[2][k] = r3; + } +} + +/* ---------------------------------------------------------------------- + generate coeffients for the weight function of order n + + (n-1) + Wn(x) = Sum wn(k,x) , Sum is over every other integer + k=-(n-1) + For k=-(n-1),-(n-1)+2, ....., (n-1)-2,n-1 + k is odd integers if n is even and even integers if n is odd + --- + | n-1 + | Sum a(l,j)*(x-k/2)**l if abs(x-k/2) < 1/2 + wn(k,x) = < l=0 + | + | 0 otherwise + --- + a coeffients are packed into the array rho_coeff to eliminate zeros + rho_coeff(l,((k+mod(n+1,2))/2) = a(l,k) +------------------------------------------------------------------------- */ + +void PPPM2::compute_rho_coeff() +{ + int j,k,l,m; + FFT_SCALAR s; + + FFT_SCALAR **a; + memory->create2d_offset(a,order,-order,order,"pppm:a"); + + for (k = -order; k <= order; k++) + for (l = 0; l < order; l++) + a[l][k] = 0.0; + + a[0][0] = 1.0; + for (j = 1; j < order; j++) { + for (k = -j; k <= j; k += 2) { + s = 0.0; + for (l = 0; l < j; l++) { + a[l+1][k] = (a[l][k+1]-a[l][k-1]) / (l+1); +#ifdef FFT_SINGLE + s += powf(0.5,(float) l+1) * + (a[l][k-1] + powf(-1.0,(float) l) * a[l][k+1]) / (l+1); +#else + s += pow(0.5,(double) l+1) * + (a[l][k-1] + pow(-1.0,(double) l) * a[l][k+1]) / (l+1); +#endif + } + a[0][k] = s; + } + } + + m = (1-order)/2; + for (k = -(order-1); k < order; k += 2) { + for (l = 0; l < order; l++) + rho_coeff[l][m] = a[l][k]; + for (l = 1; l < order; l++) + drho_coeff[l-1][m] = l*a[l][k]; + m++; + } + + memory->destroy2d_offset(a,-order); +} + +/* ---------------------------------------------------------------------- + Slab-geometry correction term to dampen inter-slab interactions between + periodically repeating slabs. Yields good approximation to 2D Ewald if + adequate empty space is left between repeating slabs (J. Chem. 
Phys. + 111, 3155). Slabs defined here to be parallel to the xy plane. Also + extended to non-neutral systems (J. Chem. Phys. 131, 094107). +------------------------------------------------------------------------- */ + +void PPPM2::slabcorr() +{ + // compute local contribution to global dipole moment + + double *q = atom->q; + double **x = atom->x; + double zprd = domain->zprd; + int nlocal = atom->nlocal; + + double dipole = 0.0; + for (int i = 0; i < nlocal; i++) dipole += q[i]*x[i][2]; + + // sum local contributions to get global dipole moment + + double dipole_all; + MPI_Allreduce(&dipole,&dipole_all,1,MPI_DOUBLE,MPI_SUM,world); + + // need to make non-neutral systems and/or + // per-atom energy translationally invariant + + double dipole_r2 = 0.0; + if (eflag_atom || fabs(qsum) > SMALL) { + for (int i = 0; i < nlocal; i++) + dipole_r2 += q[i]*x[i][2]*x[i][2]; + + // sum local contributions + + double tmp; + MPI_Allreduce(&dipole_r2,&tmp,1,MPI_DOUBLE,MPI_SUM,world); + dipole_r2 = tmp; + } + + // compute corrections + + const double e_slabcorr = MY_2PI*(dipole_all*dipole_all - + qsum*dipole_r2 - qsum*qsum*zprd*zprd/12.0)/volume; + const double qscale = qqrd2e * scale; + + if (eflag_global) energy += qscale * e_slabcorr; + + // per-atom energy + + if (eflag_atom) { + double efact = qscale * MY_2PI/volume; + for (int i = 0; i < nlocal; i++) + eatom[i] += efact * q[i]*(x[i][2]*dipole_all - 0.5*(dipole_r2 + + qsum*x[i][2]*x[i][2]) - qsum*zprd*zprd/12.0); + } + + // add on force corrections + + double ffact = qscale * (-4.0*MY_PI/volume); + double **f = atom->f; + + for (int i = 0; i < nlocal; i++) f[i][2] += ffact * q[i]*(dipole_all - qsum*x[i][2]); +} + +/* ---------------------------------------------------------------------- + perform and time the 1d FFTs required for N timesteps +------------------------------------------------------------------------- */ + +int PPPM2::timing_1d(int n, double &time1d) +{ + double time1,time2; + + for (int i = 0; i < 2*nfft_both; i++) work1[i] = ZEROF; + + MPI_Barrier(world); + time1 = MPI_Wtime(); + + for (int i = 0; i < n; i++) { + fft1->timing1d(work1,nfft_both,1); + fft2->timing1d(work1,nfft_both,-1); + if (differentiation_flag != 1) { + fft2->timing1d(work1,nfft_both,-1); + fft2->timing1d(work1,nfft_both,-1); + } + } + + MPI_Barrier(world); + time2 = MPI_Wtime(); + time1d = time2 - time1; + + if (differentiation_flag) return 2; + return 4; +} + +/* ---------------------------------------------------------------------- + perform and time the 3d FFTs required for N timesteps +------------------------------------------------------------------------- */ + +int PPPM2::timing_3d(int n, double &time3d) +{ + double time1,time2; + + for (int i = 0; i < 2*nfft_both; i++) work1[i] = ZEROF; + + MPI_Barrier(world); + time1 = MPI_Wtime(); + + for (int i = 0; i < n; i++) { + fft1->compute(work1,work1,1); + fft2->compute(work1,work1,-1); + if (differentiation_flag != 1) { + fft2->compute(work1,work1,-1); + fft2->compute(work1,work1,-1); + } + } + + MPI_Barrier(world); + time2 = MPI_Wtime(); + time3d = time2 - time1; + + if (differentiation_flag) return 2; + return 4; +} + +/* ---------------------------------------------------------------------- + memory usage of local arrays +------------------------------------------------------------------------- */ + +double PPPM2::memory_usage() +{ + double bytes = nmax*3 * sizeof(double); + + int nbrick = (nxhi_out-nxlo_out+1) * (nyhi_out-nylo_out+1) * + (nzhi_out-nzlo_out+1); + if (differentiation_flag == 1) { + bytes += 2 
* nbrick * sizeof(FFT_SCALAR); + } else { + bytes += 4 * nbrick * sizeof(FFT_SCALAR); + } + + if (triclinic) bytes += 3 * nfft_both * sizeof(double); + bytes += 6 * nfft_both * sizeof(double); + bytes += nfft_both * sizeof(double); + bytes += nfft_both*5 * sizeof(FFT_SCALAR); + + if (peratom_allocate_flag) + bytes += 6 * nbrick * sizeof(FFT_SCALAR); + + if (group_allocate_flag) { + bytes += 2 * nbrick * sizeof(FFT_SCALAR); + bytes += 2 * nfft_both * sizeof(FFT_SCALAR);; + } + + // two GridComm bufs + + bytes += (ngc_buf1 + ngc_buf2) * npergrid * sizeof(FFT_SCALAR); + + return bytes; +} + +/* ---------------------------------------------------------------------- + group-group interactions + ------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + compute the PPPM total long-range force and energy for groups A and B + ------------------------------------------------------------------------- */ + +void PPPM2::compute_group_group(int groupbit_A, int groupbit_B, int AA_flag) +{ + if (slabflag && triclinic) + error->all(FLERR,"Cannot (yet) use K-space slab " + "correction with compute group/group for triclinic systems"); + + if (differentiation_flag) + error->all(FLERR,"Cannot (yet) use kspace_modify " + "diff ad with compute group/group"); + + if (!group_allocate_flag) allocate_groups(); + + // convert atoms from box to lamda coords + + if (triclinic == 0) boxlo = domain->boxlo; + else { + boxlo = domain->boxlo_lamda; + domain->x2lamda(atom->nlocal); + } + + e2group = 0.0; //energy + f2group[0] = 0.0; //force in x-direction + f2group[1] = 0.0; //force in y-direction + f2group[2] = 0.0; //force in z-direction + + // map my particle charge onto my local 3d density grid + + make_rho_groups(groupbit_A,groupbit_B,AA_flag); + + // all procs communicate density values from their ghost cells + // to fully sum contribution in their 3d bricks + // remap from 3d decomposition to FFT decomposition + + // temporarily store and switch pointers so we can + // use brick2fft() for groups A and B (without + // writing an additional function) + + FFT_SCALAR ***density_brick_real = density_brick; + FFT_SCALAR *density_fft_real = density_fft; + + // group A + + density_brick = density_A_brick; + density_fft = density_A_fft; + + gc->reverse_comm(this,1,sizeof(FFT_SCALAR),REVERSE_RHO, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); + brick2fft(); + + // group B + + density_brick = density_B_brick; + density_fft = density_B_fft; + + gc->reverse_comm(this,1,sizeof(FFT_SCALAR),REVERSE_RHO, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); + brick2fft(); + + // switch back pointers + + density_brick = density_brick_real; + density_fft = density_fft_real; + + // compute potential gradient on my FFT grid and + // portion of group-group energy/force on this proc's FFT grid + + poisson_groups(AA_flag); + + const double qscale = qqrd2e * scale; + + // total group A <--> group B energy + // self and boundary correction terms are in compute_group_group.cpp + + double e2group_all; + MPI_Allreduce(&e2group,&e2group_all,1,MPI_DOUBLE,MPI_SUM,world); + e2group = e2group_all; + + e2group *= qscale*0.5*volume; + + // total group A <--> group B force + + double f2group_all[3]; + MPI_Allreduce(f2group,f2group_all,3,MPI_DOUBLE,MPI_SUM,world); + + f2group[0] = qscale*volume*f2group_all[0]; + f2group[1] = qscale*volume*f2group_all[1]; + if (slabflag != 2) f2group[2] = qscale*volume*f2group_all[2]; + + // convert atoms back from lamda to box coords + + if (triclinic) 
domain->lamda2x(atom->nlocal); + + if (slabflag == 1) + slabcorr_groups(groupbit_A, groupbit_B, AA_flag); +} + +/* ---------------------------------------------------------------------- + allocate group-group memory that depends on # of K-vectors and order + ------------------------------------------------------------------------- */ + +void PPPM2::allocate_groups() +{ + group_allocate_flag = 1; + + memory->create3d_offset(density_A_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, + nxlo_out,nxhi_out,"pppm:density_A_brick"); + memory->create3d_offset(density_B_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, + nxlo_out,nxhi_out,"pppm:density_B_brick"); + memory->create(density_A_fft,nfft_both,"pppm:density_A_fft"); + memory->create(density_B_fft,nfft_both,"pppm:density_B_fft"); +} + +/* ---------------------------------------------------------------------- + deallocate group-group memory that depends on # of K-vectors and order + ------------------------------------------------------------------------- */ + +void PPPM2::deallocate_groups() +{ + group_allocate_flag = 0; + + memory->destroy3d_offset(density_A_brick,nzlo_out,nylo_out,nxlo_out); + memory->destroy3d_offset(density_B_brick,nzlo_out,nylo_out,nxlo_out); + memory->destroy(density_A_fft); + memory->destroy(density_B_fft); +} + +/* ---------------------------------------------------------------------- + create discretized "density" on section of global grid due to my particles + density(x,y,z) = charge "density" at grid points of my 3d brick + (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts) + in global grid for group-group interactions + ------------------------------------------------------------------------- */ + +void PPPM2::make_rho_groups(int groupbit_A, int groupbit_B, int AA_flag) +{ + int l,m,n,nx,ny,nz,mx,my,mz; + FFT_SCALAR dx,dy,dz,x0,y0,z0; + + // clear 3d density arrays + + memset(&(density_A_brick[nzlo_out][nylo_out][nxlo_out]),0, + ngrid*sizeof(FFT_SCALAR)); + + memset(&(density_B_brick[nzlo_out][nylo_out][nxlo_out]),0, + ngrid*sizeof(FFT_SCALAR)); + + // loop over my charges, add their contribution to nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + + double *q = atom->q; + double **x = atom->x; + int nlocal = atom->nlocal; + int *mask = atom->mask; + + for (int i = 0; i < nlocal; i++) { + + if (!((mask[i] & groupbit_A) && (mask[i] & groupbit_B))) + if (AA_flag) continue; + + if ((mask[i] & groupbit_A) || (mask[i] & groupbit_B)) { + + nx = part2grid[i][0]; + ny = part2grid[i][1]; + nz = part2grid[i][2]; + dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv; + dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv; + dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv; + + compute_rho1d(dx,dy,dz); + + z0 = delvolinv * q[i]; + for (n = nlower; n <= nupper; n++) { + mz = n+nz; + y0 = z0*rho1d[2][n]; + for (m = nlower; m <= nupper; m++) { + my = m+ny; + x0 = y0*rho1d[1][m]; + for (l = nlower; l <= nupper; l++) { + mx = l+nx; + + // group A + + if (mask[i] & groupbit_A) + density_A_brick[mz][my][mx] += x0*rho1d[0][l]; + + // group B + + if (mask[i] & groupbit_B) + density_B_brick[mz][my][mx] += x0*rho1d[0][l]; + } + } + } + } + } +} + +/* ---------------------------------------------------------------------- + FFT-based Poisson solver for group-group interactions + ------------------------------------------------------------------------- */ + +void PPPM2::poisson_groups(int AA_flag) +{ + 
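// Descriptive note on the loops below: with A(k),B(k) the forward FFTs of the
// two group charge densities and s = 1.0/(nx_pppm*ny_pppm*nz_pppm),
//   e2group accumulates  s^2 * G(k) * [ReA*ReB + ImA*ImB]
//   f2group accumulates  s^2 * G(k) * (fkx,fky,fkz) * [ImA*ReB - ReA*ImB]
// summed over the k points on this proc's FFT grid. Everything stays in
// reciprocal space, so no inverse FFTs are needed; the qscale and volume
// prefactors are applied afterwards in compute_group_group().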
int i,j,k,n; + + // reuse memory (already declared) + + FFT_SCALAR *work_A = work1; + FFT_SCALAR *work_B = work2; + + // transform charge density (r -> k) + + // group A + + n = 0; + for (i = 0; i < nfft; i++) { + work_A[n++] = density_A_fft[i]; + work_A[n++] = ZEROF; + } + + fft1->compute(work_A,work_A,1); + + // group B + + n = 0; + for (i = 0; i < nfft; i++) { + work_B[n++] = density_B_fft[i]; + work_B[n++] = ZEROF; + } + + fft1->compute(work_B,work_B,1); + + // group-group energy and force contribution, + // keep everything in reciprocal space so + // no inverse FFTs needed + + double scaleinv = 1.0/(nx_pppm*ny_pppm*nz_pppm); + double s2 = scaleinv*scaleinv; + + // energy + + n = 0; + for (i = 0; i < nfft; i++) { + e2group += s2 * greensfn[i] * + (work_A[n]*work_B[n] + work_A[n+1]*work_B[n+1]); + n += 2; + } + + if (AA_flag) return; + + + // multiply by Green's function and s2 + // (only for work_A so it is not squared below) + + n = 0; + for (i = 0; i < nfft; i++) { + work_A[n++] *= s2 * greensfn[i]; + work_A[n++] *= s2 * greensfn[i]; + } + + // triclinic system + + if (triclinic) { + poisson_groups_triclinic(); + return; + } + + double partial_group; + + // force, x direction + + n = 0; + for (k = nzlo_fft; k <= nzhi_fft; k++) + for (j = nylo_fft; j <= nyhi_fft; j++) + for (i = nxlo_fft; i <= nxhi_fft; i++) { + partial_group = work_A[n+1]*work_B[n] - work_A[n]*work_B[n+1]; + f2group[0] += fkx[i] * partial_group; + n += 2; + } + + // force, y direction + + n = 0; + for (k = nzlo_fft; k <= nzhi_fft; k++) + for (j = nylo_fft; j <= nyhi_fft; j++) + for (i = nxlo_fft; i <= nxhi_fft; i++) { + partial_group = work_A[n+1]*work_B[n] - work_A[n]*work_B[n+1]; + f2group[1] += fky[j] * partial_group; + n += 2; + } + + // force, z direction + + n = 0; + for (k = nzlo_fft; k <= nzhi_fft; k++) + for (j = nylo_fft; j <= nyhi_fft; j++) + for (i = nxlo_fft; i <= nxhi_fft; i++) { + partial_group = work_A[n+1]*work_B[n] - work_A[n]*work_B[n+1]; + f2group[2] += fkz[k] * partial_group; + n += 2; + } +} + +/* ---------------------------------------------------------------------- + FFT-based Poisson solver for group-group interactions + for a triclinic system + ------------------------------------------------------------------------- */ + +void PPPM2::poisson_groups_triclinic() +{ + int i,n; + + // reuse memory (already declared) + + FFT_SCALAR *work_A = work1; + FFT_SCALAR *work_B = work2; + + double partial_group; + + // force, x direction + + n = 0; + for (i = 0; i < nfft; i++) { + partial_group = work_A[n+1]*work_B[n] - work_A[n]*work_B[n+1]; + f2group[0] += fkx[i] * partial_group; + n += 2; + } + + // force, y direction + + n = 0; + for (i = 0; i < nfft; i++) { + partial_group = work_A[n+1]*work_B[n] - work_A[n]*work_B[n+1]; + f2group[1] += fky[i] * partial_group; + n += 2; + } + + // force, z direction + + n = 0; + for (i = 0; i < nfft; i++) { + partial_group = work_A[n+1]*work_B[n] - work_A[n]*work_B[n+1]; + f2group[2] += fkz[i] * partial_group; + n += 2; + } +} + +/* ---------------------------------------------------------------------- + Slab-geometry correction term to dampen inter-slab interactions between + periodically repeating slabs. Yields good approximation to 2D Ewald if + adequate empty space is left between repeating slabs (J. Chem. Phys. + 111, 3155). Slabs defined here to be parallel to the xy plane. Also + extended to non-neutral systems (J. Chem. Phys. 131, 094107). 
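   For the group-group variant implemented below, the correction evaluates to
     E_corr  =  qscale * (2*pi/V) *
                [ M_A*M_B - (q_A*R_B + q_B*R_A)/2 - q_A*q_B*Lz^2/12 ]
     Fz_corr = -qscale * (4*pi/V) * (q_A*M_B - q_B*M_A)
   where M = sum q_i*z_i, R = sum q_i*z_i^2, and q = total charge of each group.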
+------------------------------------------------------------------------- */ + +void PPPM2::slabcorr_groups(int groupbit_A, int groupbit_B, int AA_flag) +{ + // compute local contribution to global dipole moment + + double *q = atom->q; + double **x = atom->x; + double zprd = domain->zprd; + int *mask = atom->mask; + int nlocal = atom->nlocal; + + double qsum_A = 0.0; + double qsum_B = 0.0; + double dipole_A = 0.0; + double dipole_B = 0.0; + double dipole_r2_A = 0.0; + double dipole_r2_B = 0.0; + + for (int i = 0; i < nlocal; i++) { + if (!((mask[i] & groupbit_A) && (mask[i] & groupbit_B))) + if (AA_flag) continue; + + if (mask[i] & groupbit_A) { + qsum_A += q[i]; + dipole_A += q[i]*x[i][2]; + dipole_r2_A += q[i]*x[i][2]*x[i][2]; + } + + if (mask[i] & groupbit_B) { + qsum_B += q[i]; + dipole_B += q[i]*x[i][2]; + dipole_r2_B += q[i]*x[i][2]*x[i][2]; + } + } + + // sum local contributions to get total charge and global dipole moment + // for each group + + double tmp; + MPI_Allreduce(&qsum_A,&tmp,1,MPI_DOUBLE,MPI_SUM,world); + qsum_A = tmp; + + MPI_Allreduce(&qsum_B,&tmp,1,MPI_DOUBLE,MPI_SUM,world); + qsum_B = tmp; + + MPI_Allreduce(&dipole_A,&tmp,1,MPI_DOUBLE,MPI_SUM,world); + dipole_A = tmp; + + MPI_Allreduce(&dipole_B,&tmp,1,MPI_DOUBLE,MPI_SUM,world); + dipole_B = tmp; + + MPI_Allreduce(&dipole_r2_A,&tmp,1,MPI_DOUBLE,MPI_SUM,world); + dipole_r2_A = tmp; + + MPI_Allreduce(&dipole_r2_B,&tmp,1,MPI_DOUBLE,MPI_SUM,world); + dipole_r2_B = tmp; + + // compute corrections + + const double qscale = qqrd2e * scale; + const double efact = qscale * MY_2PI/volume; + + e2group += efact * (dipole_A*dipole_B - 0.5*(qsum_A*dipole_r2_B + + qsum_B*dipole_r2_A) - qsum_A*qsum_B*zprd*zprd/12.0); + + // add on force corrections + + const double ffact = qscale * (-4.0*MY_PI/volume); + f2group[2] += ffact * (qsum_A*dipole_B - qsum_B*dipole_A); +} diff --git a/src/KSPACE/pppm2.h b/src/KSPACE/pppm2.h new file mode 100644 index 0000000000..31b8534735 --- /dev/null +++ b/src/KSPACE/pppm2.h @@ -0,0 +1,360 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#ifdef KSPACE_CLASS + +KSpaceStyle(pppm2,PPPM2) + +#else + +#ifndef LMP_PPPM2_H +#define LMP_PPPM2_H + +#include "kspace.h" + +#if defined(FFT_FFTW3) +#define LMP_FFT_LIB "FFTW3" +#elif defined(FFT_MKL) +#define LMP_FFT_LIB "MKL FFT" +#elif defined(FFT_CUFFT) +#define LMP_FFT_LIB "cuFFT" +#else +#define LMP_FFT_LIB "KISS FFT" +#endif + +#ifdef FFT_SINGLE +typedef float FFT_SCALAR; +#define LMP_FFT_PREC "single" +#define MPI_FFT_SCALAR MPI_FLOAT +#else + +typedef double FFT_SCALAR; +#define LMP_FFT_PREC "double" +#define MPI_FFT_SCALAR MPI_DOUBLE +#endif + +namespace LAMMPS_NS { + +class PPPM2 : public KSpace { + public: + PPPM2(class LAMMPS *); + virtual ~PPPM2(); + virtual void settings(int, char **); + virtual void init(); + virtual void setup(); + virtual void setup_grid(); + virtual void compute(int, int); + virtual int timing_1d(int, double &); + virtual int timing_3d(int, double &); + virtual double memory_usage(); + + virtual void compute_group_group(int, int, int); + + protected: + int me,nprocs; + int nfactors; + int *factors; + double cutoff; + double volume; + double delxinv,delyinv,delzinv,delvolinv; + double h_x,h_y,h_z; + double shift,shiftone; + int peratom_allocate_flag; + + int nxlo_in,nylo_in,nzlo_in,nxhi_in,nyhi_in,nzhi_in; + int nxlo_out,nylo_out,nzlo_out,nxhi_out,nyhi_out,nzhi_out; + int nxlo_ghost,nxhi_ghost,nylo_ghost,nyhi_ghost,nzlo_ghost,nzhi_ghost; + int nxlo_fft,nylo_fft,nzlo_fft,nxhi_fft,nyhi_fft,nzhi_fft; + int nlower,nupper; + int ngrid,nfft,nfft_both; + + FFT_SCALAR ***density_brick; + FFT_SCALAR ***vdx_brick,***vdy_brick,***vdz_brick; + FFT_SCALAR ***u_brick; + FFT_SCALAR ***v0_brick,***v1_brick,***v2_brick; + FFT_SCALAR ***v3_brick,***v4_brick,***v5_brick; + double *greensfn; + double **vg; + double *fkx,*fky,*fkz; + FFT_SCALAR *density_fft; + FFT_SCALAR *work1,*work2; + + double *gf_b; + FFT_SCALAR **rho1d,**rho_coeff,**drho1d,**drho_coeff; + double *sf_precoeff1, *sf_precoeff2, *sf_precoeff3; + double *sf_precoeff4, *sf_precoeff5, *sf_precoeff6; + double sf_coeff[6]; // coefficients for calculating ad self-forces + double **acons; + + // FFTs and grid communication + + class FFT3d *fft1,*fft2; + class Remap *remap; + class GridComm2 *gc; + FFT_SCALAR *gc_buf1,*gc_buf2; + int ngc_buf1,ngc_buf2,npergrid; + + // group-group interactions + + int group_allocate_flag; + FFT_SCALAR ***density_A_brick,***density_B_brick; + FFT_SCALAR *density_A_fft,*density_B_fft; + + int **part2grid; // storage for particle -> grid mapping + int nmax; + + double *boxlo; + // TIP4P settings + int typeH,typeO; // atom types of TIP4P water H and O atoms + double qdist; // distance from O site to negative charge + double alpha; // geometric factor + + virtual void set_grid_global(); + void set_grid_local(); + void adjust_gewald(); + virtual double newton_raphson_f(); + double derivf(); + double final_accuracy(); + + virtual void allocate(); + virtual void allocate_peratom(); + virtual void deallocate(); + virtual void deallocate_peratom(); + int factorable(int); + double compute_df_kspace(); + double estimate_ik_error(double, double, bigint); + virtual double compute_qopt(); + virtual void compute_gf_denom(); + virtual void compute_gf_ik(); + virtual void compute_gf_ad(); + void compute_sf_precoeff(); + + virtual void particle_map(); + virtual void make_rho(); + virtual void brick2fft(); + + virtual void poisson(); + virtual void poisson_ik(); + virtual void poisson_ad(); + + virtual void fieldforce(); 
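  // the two flavors below are the dispatch targets of fieldforce() above:
  // fieldforce_ik() for ik differentiation (differentiation_flag == 0),
  // fieldforce_ad() for analytic (ad) differentiation (differentiation_flag == 1)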
+ virtual void fieldforce_ik(); + virtual void fieldforce_ad(); + + virtual void poisson_peratom(); + virtual void fieldforce_peratom(); + void procs2grid2d(int,int,int,int *, int*); + void compute_rho1d(const FFT_SCALAR &, const FFT_SCALAR &, + const FFT_SCALAR &); + void compute_drho1d(const FFT_SCALAR &, const FFT_SCALAR &, + const FFT_SCALAR &); + void compute_rho_coeff(); + virtual void slabcorr(); + + // grid communication + + virtual void pack_forward2(int, void *, int, int *); + virtual void unpack_forward2(int, void *, int, int *); + virtual void pack_reverse2(int, void *, int, int *); + virtual void unpack_reverse2(int, void *, int, int *); + + // triclinic + + int triclinic; // domain settings, orthog or triclinic + void setup_triclinic(); + void compute_gf_ik_triclinic(); + void poisson_ik_triclinic(); + void poisson_groups_triclinic(); + + // group-group interactions + + virtual void allocate_groups(); + virtual void deallocate_groups(); + virtual void make_rho_groups(int, int, int); + virtual void poisson_groups(int); + virtual void slabcorr_groups(int,int,int); + +/* ---------------------------------------------------------------------- + denominator for Hockney-Eastwood Green's function + of x,y,z = sin(kx*deltax/2), etc + + inf n-1 + S(n,k) = Sum W(k+pi*j)**2 = Sum b(l)*(z*z)**l + j=-inf l=0 + + = -(z*z)**n /(2n-1)! * (d/dx)**(2n-1) cot(x) at z = sin(x) + gf_b = denominator expansion coeffs +------------------------------------------------------------------------- */ + + inline double gf_denom(const double &x, const double &y, + const double &z) const { + double sx,sy,sz; + sz = sy = sx = 0.0; + for (int l = order-1; l >= 0; l--) { + sx = gf_b[l] + sx*x; + sy = gf_b[l] + sy*y; + sz = gf_b[l] + sz*z; + } + double s = sx*sy*sz; + return s*s; + }; +}; + +} + +#endif +#endif + +/* ERROR/WARNING messages: + +E: Illegal ... command + +Self-explanatory. Check the input script syntax and compare to the +documentation for the command. You can use -echo screen as a +command-line option when running LAMMPS to see the offending line. + +E: Must redefine kspace_style after changing to triclinic box + +UNDOCUMENTED + +E: Cannot (yet) use PPPM with triclinic box and kspace_modify diff ad + +This feature is not yet supported. + +E: Cannot (yet) use PPPM with triclinic box and slab correction + +This feature is not yet supported. + +E: Cannot use PPPM with 2d simulation + +The kspace style pppm cannot be used in 2d simulations. You can use +2d PPPM in a 3d simulation; see the kspace_modify command. + +E: PPPM can only currently be used with comm_style brick + +This is a current restriction in LAMMPS. + +E: Kspace style requires atom attribute q + +The atom style defined does not have these attributes. + +E: Cannot use non-periodic boundaries with PPPM + +For kspace style pppm, all 3 dimensions must have periodic boundaries +unless you use the kspace_modify command to define a 2d slab with a +non-periodic z dimension. + +E: Incorrect boundaries with slab PPPM + +Must have periodic x,y dimensions and non-periodic z dimension to use +2d slab option with PPPM. + +E: PPPM order cannot be < 2 or > than %d + +This is a limitation of the PPPM implementation in LAMMPS. + +E: KSpace style is incompatible with Pair style + +Setting a kspace style requires that a pair style with matching +long-range Coulombic or dispersion components be used. + +E: Pair style is incompatible with TIP4P KSpace style + +The pair style does not have the requires TIP4P settings. 
+ +E: Bond and angle potentials must be defined for TIP4P + +Cannot use TIP4P pair potential unless bond and angle potentials +are defined. + +E: Bad TIP4P angle type for PPPM/TIP4P + +Specified angle type is not valid. + +E: Bad TIP4P bond type for PPPM/TIP4P + +Specified bond type is not valid. + +W: Reducing PPPM order b/c stencil extends beyond nearest neighbor processor + +This may lead to a larger grid than desired. See the kspace_modify overlap +command to prevent changing of the PPPM order. + +E: PPPM order < minimum allowed order + +The default minimum order is 2. This can be reset by the +kspace_modify minorder command. + +E: PPPM grid stencil extends beyond nearest neighbor processor + +This is not allowed if the kspace_modify overlap setting is no. + +E: KSpace accuracy must be > 0 + +The kspace accuracy designated in the input must be greater than zero. + +E: Must use kspace_modify gewald for uncharged system + +UNDOCUMENTED + +E: Could not compute grid size + +The code is unable to compute a grid size consistent with the desired +accuracy. This error should not occur for typical problems. Please +send an email to the developers. + +E: PPPM grid is too large + +The global PPPM grid is larger than OFFSET in one or more dimensions. +OFFSET is currently set to 4096. You likely need to decrease the +requested accuracy. + +E: Could not compute g_ewald + +The Newton-Raphson solver failed to converge to a good value for +g_ewald. This error should not occur for typical problems. Please +send an email to the developers. + +E: Non-numeric box dimensions - simulation unstable + +The box size has apparently blown up. + +E: Out of range atoms - cannot compute PPPM + +One or more atoms are attempting to map their charge to a PPPM grid +point that is not owned by a processor. This is likely for one of two +reasons, both of them bad. First, it may mean that an atom near the +boundary of a processor's sub-domain has moved more than 1/2 the +"neighbor skin distance"_neighbor.html without neighbor lists being +rebuilt and atoms being migrated to new processors. This also means +you may be missing pairwise interactions that need to be computed. +The solution is to change the re-neighboring criteria via the +"neigh_modify"_neigh_modify command. The safest settings are "delay 0 +every 1 check yes". Second, it may mean that an atom has moved far +outside a processor's sub-domain or even the entire simulation box. +This indicates bad physics, e.g. due to highly overlapping atoms, too +large a timestep, etc. + +E: Cannot (yet) use K-space slab correction with compute group/group for triclinic systems + +This option is not yet supported. + +E: Cannot (yet) use kspace_modify diff ad with compute group/group + +This option is not yet supported. + +U: Cannot (yet) use PPPM with triclinic box and TIP4P + +This feature is not yet supported. 
+ +*/ diff --git a/src/force.cpp b/src/force.cpp index d2eb137d06..595ffd3140 100644 --- a/src/force.cpp +++ b/src/force.cpp @@ -665,9 +665,9 @@ void Force::create_kspace(const std::string &style, int trysuffix) kspace = new_kspace(style,trysuffix,sflag); store_style(kspace_style,style,sflag); - if (comm->style == 1 && !kspace_match("ewald",0)) - error->all(FLERR, - "Cannot yet use KSpace solver with grid with comm style tiled"); + //if (comm->style == 1 && !kspace_match("ewald",0)) + // error->all(FLERR, + // "Cannot yet use KSpace solver with grid with comm style tiled"); } /* ---------------------------------------------------------------------- diff --git a/src/kspace.h b/src/kspace.h index e26b9490d6..4d686d59f0 100644 --- a/src/kspace.h +++ b/src/kspace.h @@ -126,6 +126,11 @@ class KSpace : protected Pointers { virtual void pack_reverse(int, FFT_SCALAR *, int, int *) {}; virtual void unpack_reverse(int, FFT_SCALAR *, int, int *) {}; + virtual void pack_forward2(int, void *, int, int *) {}; + virtual void unpack_forward2(int, void *, int, int *) {}; + virtual void pack_reverse2(int, void *, int, int *) {}; + virtual void unpack_reverse2(int, void *, int, int *) {}; + virtual int timing(int, double &, double &) {return 0;} virtual int timing_1d(int, double &) {return 0;} virtual int timing_3d(int, double &) {return 0;} From 5caeb2c2ee2f13b533008276bd0d5945993eb819 Mon Sep 17 00:00:00 2001 From: Steve Plimpton Date: Thu, 6 Aug 2020 08:36:59 -0600 Subject: [PATCH 02/38] generalize to allow GridComm to be called from Fixes or Computes --- src/KSPACE/gridcomm2.cpp | 68 +++++++++++++++++++++------------------- src/KSPACE/gridcomm2.h | 22 +++++++------ src/KSPACE/pppm2.cpp | 36 ++++++++++----------- src/KSPACE/pppm2.h | 8 ++--- src/kspace.h | 9 +++--- 5 files changed, 75 insertions(+), 68 deletions(-) diff --git a/src/KSPACE/gridcomm2.cpp b/src/KSPACE/gridcomm2.cpp index ce9a1e7568..15ca165c74 100644 --- a/src/KSPACE/gridcomm2.cpp +++ b/src/KSPACE/gridcomm2.cpp @@ -879,28 +879,29 @@ int GridComm2::ghost_adjacent_tiled() use swap list in forward order to acquire copy of all needed ghost grid pts ------------------------------------------------------------------------- */ -void GridComm2::forward_comm(KSpace *kspace, int nper, int nbyte, int which, - void *buf1, void *buf2, MPI_Datatype datatype) +void GridComm2::forward_comm_kspace(KSpace *kspace, int nper, int nbyte, int which, + void *buf1, void *buf2, MPI_Datatype datatype) { if (layout == REGULAR) - forward_comm_regular(kspace,nper,nbyte,which,buf1,buf2,datatype); + forward_comm_kspace_regular(kspace,nper,nbyte,which,buf1,buf2,datatype); else - forward_comm_tiled(kspace,nper,nbyte,which,buf1,buf2,datatype); + forward_comm_kspace_tiled(kspace,nper,nbyte,which,buf1,buf2,datatype); } /* ---------------------------------------------------------------------- */ -void GridComm2::forward_comm_regular(KSpace *kspace, int nper, int nbyte, int which, - void *buf1, void *buf2, MPI_Datatype datatype) +void GridComm2:: +forward_comm_kspace_regular(KSpace *kspace, int nper, int nbyte, int which, + void *buf1, void *buf2, MPI_Datatype datatype) { int m; MPI_Request request; for (m = 0; m < nswap; m++) { if (swap[m].sendproc == me) - kspace->pack_forward2(which,buf2,swap[m].npack,swap[m].packlist); + kspace->pack_forward_grid(which,buf2,swap[m].npack,swap[m].packlist); else - kspace->pack_forward2(which,buf1,swap[m].npack,swap[m].packlist); + kspace->pack_forward_grid(which,buf1,swap[m].npack,swap[m].packlist); if (swap[m].sendproc != me) { if 
(swap[m].nunpack) MPI_Irecv(buf2,nper*swap[m].nunpack,datatype, @@ -910,14 +911,15 @@ void GridComm2::forward_comm_regular(KSpace *kspace, int nper, int nbyte, int wh if (swap[m].nunpack) MPI_Wait(&request,MPI_STATUS_IGNORE); } - kspace->unpack_forward2(which,buf2,swap[m].nunpack,swap[m].unpacklist); + kspace->unpack_forward_grid(which,buf2,swap[m].nunpack,swap[m].unpacklist); } } /* ---------------------------------------------------------------------- */ -void GridComm2::forward_comm_tiled(KSpace *kspace, int nper, int nbyte, int which, - void *buf1, void *vbuf2, MPI_Datatype datatype) +void GridComm2:: +forward_comm_kspace_tiled(KSpace *kspace, int nper, int nbyte, int which, + void *buf1, void *vbuf2, MPI_Datatype datatype) { int i,m,offset; @@ -934,15 +936,15 @@ void GridComm2::forward_comm_tiled(KSpace *kspace, int nper, int nbyte, int whic // perform all sends to other procs for (m = 0; m < nsend; m++) { - kspace->pack_forward2(which,buf1,send[m].npack,send[m].packlist); + kspace->pack_forward_grid(which,buf1,send[m].npack,send[m].packlist); MPI_Send(buf1,nper*send[m].npack,datatype,send[m].proc,0,gridcomm); } // perform all copies to self for (m = 0; m < ncopy; m++) { - kspace->pack_forward2(which,buf1,copy[m].npack,copy[m].packlist); - kspace->unpack_forward2(which,buf1,copy[m].nunpack,copy[m].unpacklist); + kspace->pack_forward_grid(which,buf1,copy[m].npack,copy[m].packlist); + kspace->unpack_forward_grid(which,buf1,copy[m].nunpack,copy[m].unpacklist); } // unpack all received data @@ -950,8 +952,8 @@ void GridComm2::forward_comm_tiled(KSpace *kspace, int nper, int nbyte, int whic for (i = 0; i < nrecv; i++) { MPI_Waitany(nrecv,requests,&m,MPI_STATUS_IGNORE); offset = nper * recv[m].offset * nbyte; - kspace->unpack_forward2(which,(void *) &buf2[offset], - recv[m].nunpack,recv[m].unpacklist); + kspace->unpack_forward_grid(which,(void *) &buf2[offset], + recv[m].nunpack,recv[m].unpacklist); } } @@ -960,28 +962,29 @@ void GridComm2::forward_comm_tiled(KSpace *kspace, int nper, int nbyte, int whic for each owned grid pt that some other proc has copy of as a ghost grid pt ------------------------------------------------------------------------- */ -void GridComm2::reverse_comm(KSpace *kspace, int nper, int nbyte, int which, - void *buf1, void *buf2, MPI_Datatype datatype) +void GridComm2::reverse_comm_kspace(KSpace *kspace, int nper, int nbyte, int which, + void *buf1, void *buf2, MPI_Datatype datatype) { if (layout == REGULAR) - reverse_comm_regular(kspace,nper,nbyte,which,buf1,buf2,datatype); + reverse_comm_kspace_regular(kspace,nper,nbyte,which,buf1,buf2,datatype); else - reverse_comm_tiled(kspace,nper,nbyte,which,buf1,buf2,datatype); + reverse_comm_kspace_tiled(kspace,nper,nbyte,which,buf1,buf2,datatype); } /* ---------------------------------------------------------------------- */ -void GridComm2::reverse_comm_regular(KSpace *kspace, int nper, int nbyte, int which, - void *buf1, void *buf2, MPI_Datatype datatype) +void GridComm2:: +reverse_comm_kspace_regular(KSpace *kspace, int nper, int nbyte, int which, + void *buf1, void *buf2, MPI_Datatype datatype) { int m; MPI_Request request; for (m = nswap-1; m >= 0; m--) { if (swap[m].recvproc == me) - kspace->pack_reverse2(which,buf2,swap[m].nunpack,swap[m].unpacklist); + kspace->pack_reverse_grid(which,buf2,swap[m].nunpack,swap[m].unpacklist); else - kspace->pack_reverse2(which,buf1,swap[m].nunpack,swap[m].unpacklist); + kspace->pack_reverse_grid(which,buf1,swap[m].nunpack,swap[m].unpacklist); if (swap[m].recvproc != me) { if 
(swap[m].npack) MPI_Irecv(buf2,nper*swap[m].npack,datatype, @@ -991,14 +994,15 @@ void GridComm2::reverse_comm_regular(KSpace *kspace, int nper, int nbyte, int wh if (swap[m].npack) MPI_Wait(&request,MPI_STATUS_IGNORE); } - kspace->unpack_reverse2(which,buf2,swap[m].npack,swap[m].packlist); + kspace->unpack_reverse_grid(which,buf2,swap[m].npack,swap[m].packlist); } } /* ---------------------------------------------------------------------- */ -void GridComm2::reverse_comm_tiled(KSpace *kspace, int nper, int nbyte, int which, - void *buf1, void *vbuf2, MPI_Datatype datatype) +void GridComm2:: +reverse_comm_kspace_tiled(KSpace *kspace, int nper, int nbyte, int which, + void *buf1, void *vbuf2, MPI_Datatype datatype) { int i,m,offset; @@ -1015,15 +1019,15 @@ void GridComm2::reverse_comm_tiled(KSpace *kspace, int nper, int nbyte, int whic // perform all sends to other procs for (m = 0; m < nrecv; m++) { - kspace->pack_reverse2(which,buf1,recv[m].nunpack,recv[m].unpacklist); + kspace->pack_reverse_grid(which,buf1,recv[m].nunpack,recv[m].unpacklist); MPI_Send(buf1,nper*recv[m].nunpack,datatype,recv[m].proc,0,gridcomm); } // perform all copies to self for (m = 0; m < ncopy; m++) { - kspace->pack_reverse2(which,buf1,copy[m].nunpack,copy[m].unpacklist); - kspace->unpack_reverse2(which,buf1,copy[m].npack,copy[m].packlist); + kspace->pack_reverse_grid(which,buf1,copy[m].nunpack,copy[m].unpacklist); + kspace->unpack_reverse_grid(which,buf1,copy[m].npack,copy[m].packlist); } // unpack all received data @@ -1031,8 +1035,8 @@ void GridComm2::reverse_comm_tiled(KSpace *kspace, int nper, int nbyte, int whic for (i = 0; i < nsend; i++) { MPI_Waitany(nsend,requests,&m,MPI_STATUS_IGNORE); offset = nper * send[m].offset * nbyte; - kspace->unpack_reverse2(which,(void *) &buf2[offset], - send[m].npack,send[m].packlist); + kspace->unpack_reverse_grid(which,(void *) &buf2[offset], + send[m].npack,send[m].packlist); } } diff --git a/src/KSPACE/gridcomm2.h b/src/KSPACE/gridcomm2.h index eeba990d2d..66cf9d42e5 100644 --- a/src/KSPACE/gridcomm2.h +++ b/src/KSPACE/gridcomm2.h @@ -30,8 +30,10 @@ class GridComm2 : protected Pointers { ~GridComm2(); void setup(int &, int &); int ghost_adjacent(); - void forward_comm(class KSpace *, int, int, int, void *, void *, MPI_Datatype); - void reverse_comm(class KSpace *, int, int, int, void *, void *, MPI_Datatype); + void forward_comm_kspace(class KSpace *, int, int, int, + void *, void *, MPI_Datatype); + void reverse_comm_kspace(class KSpace *, int, int, int, + void *, void *, MPI_Datatype); private: int me,nprocs; @@ -181,14 +183,14 @@ class GridComm2 : protected Pointers { int ghost_adjacent_regular(); int ghost_adjacent_tiled(); - void forward_comm_regular(class KSpace *, int, int, int, - void *, void *, MPI_Datatype); - void forward_comm_tiled(class KSpace *, int, int, int, - void *, void *, MPI_Datatype); - void reverse_comm_regular(class KSpace *, int, int, int, - void *, void *, MPI_Datatype); - void reverse_comm_tiled(class KSpace *, int, int, int, - void *, void *, MPI_Datatype); + void forward_comm_kspace_regular(class KSpace *, int, int, int, + void *, void *, MPI_Datatype); + void forward_comm_kspace_tiled(class KSpace *, int, int, int, + void *, void *, MPI_Datatype); + void reverse_comm_kspace_regular(class KSpace *, int, int, int, + void *, void *, MPI_Datatype); + void reverse_comm_kspace_tiled(class KSpace *, int, int, int, + void *, void *, MPI_Datatype); void grow_swap(); void grow_overlap(); diff --git a/src/KSPACE/pppm2.cpp b/src/KSPACE/pppm2.cpp index 
19baa13f16..927c9edee5 100644 --- a/src/KSPACE/pppm2.cpp +++ b/src/KSPACE/pppm2.cpp @@ -642,8 +642,8 @@ void PPPM2::compute(int eflag, int vflag) // to fully sum contribution in their 3d bricks // remap from 3d decomposition to FFT decomposition - gc->reverse_comm(this,1,sizeof(FFT_SCALAR),REVERSE_RHO, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc->reverse_comm_kspace(this,1,sizeof(FFT_SCALAR),REVERSE_RHO, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); brick2fft(); // compute potential gradient on my FFT grid and @@ -657,21 +657,21 @@ void PPPM2::compute(int eflag, int vflag) // to fill ghost cells surrounding their 3d bricks if (differentiation_flag == 1) - gc->forward_comm(this,1,sizeof(FFT_SCALAR),FORWARD_AD, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc->forward_comm_kspace(this,1,sizeof(FFT_SCALAR),FORWARD_AD, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); else - gc->forward_comm(this,3,sizeof(FFT_SCALAR),FORWARD_IK, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc->forward_comm_kspace(this,3,sizeof(FFT_SCALAR),FORWARD_IK, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); // extra per-atom energy/virial communication if (evflag_atom) { if (differentiation_flag == 1 && vflag_atom) - gc->forward_comm(this,6,sizeof(FFT_SCALAR),FORWARD_AD_PERATOM, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc->forward_comm_kspace(this,6,sizeof(FFT_SCALAR),FORWARD_AD_PERATOM, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); else if (differentiation_flag == 0) - gc->forward_comm(this,7,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc->forward_comm_kspace(this,7,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); } // calculate the force on my particles @@ -2633,7 +2633,7 @@ void PPPM2::fieldforce_peratom() pack own values to buf to send to another proc ------------------------------------------------------------------------- */ -void PPPM2::pack_forward2(int flag, void *pbuf, int nlist, int *list) +void PPPM2::pack_forward_grid(int flag, void *pbuf, int nlist, int *list) { FFT_SCALAR *buf = (FFT_SCALAR *) pbuf; @@ -2693,7 +2693,7 @@ void PPPM2::pack_forward2(int flag, void *pbuf, int nlist, int *list) unpack another proc's own values from buf and set own ghost values ------------------------------------------------------------------------- */ -void PPPM2::unpack_forward2(int flag, void *pbuf, int nlist, int *list) +void PPPM2::unpack_forward_grid(int flag, void *pbuf, int nlist, int *list) { FFT_SCALAR *buf = (FFT_SCALAR *) pbuf; @@ -2753,7 +2753,7 @@ void PPPM2::unpack_forward2(int flag, void *pbuf, int nlist, int *list) pack ghost values into buf to send to another proc ------------------------------------------------------------------------- */ -void PPPM2::pack_reverse2(int flag, void *pbuf, int nlist, int *list) +void PPPM2::pack_reverse_grid(int flag, void *pbuf, int nlist, int *list) { FFT_SCALAR *buf = (FFT_SCALAR *) pbuf; @@ -2768,7 +2768,7 @@ void PPPM2::pack_reverse2(int flag, void *pbuf, int nlist, int *list) unpack another proc's ghost values from buf and add to own values ------------------------------------------------------------------------- */ -void PPPM2::unpack_reverse2(int flag, void *pbuf, int nlist, int *list) +void PPPM2::unpack_reverse_grid(int flag, void *pbuf, int nlist, int *list) { FFT_SCALAR *buf = (FFT_SCALAR *) pbuf; @@ -3142,8 +3142,8 @@ void PPPM2::compute_group_group(int groupbit_A, int groupbit_B, int AA_flag) density_brick = density_A_brick; density_fft = density_A_fft; - gc->reverse_comm(this,1,sizeof(FFT_SCALAR),REVERSE_RHO, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + 
gc->reverse_comm_kspace(this,1,sizeof(FFT_SCALAR),REVERSE_RHO, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); brick2fft(); // group B @@ -3151,8 +3151,8 @@ void PPPM2::compute_group_group(int groupbit_A, int groupbit_B, int AA_flag) density_brick = density_B_brick; density_fft = density_B_fft; - gc->reverse_comm(this,1,sizeof(FFT_SCALAR),REVERSE_RHO, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc->reverse_comm_kspace(this,1,sizeof(FFT_SCALAR),REVERSE_RHO, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); brick2fft(); // switch back pointers diff --git a/src/KSPACE/pppm2.h b/src/KSPACE/pppm2.h index 31b8534735..11c9e74737 100644 --- a/src/KSPACE/pppm2.h +++ b/src/KSPACE/pppm2.h @@ -163,10 +163,10 @@ class PPPM2 : public KSpace { // grid communication - virtual void pack_forward2(int, void *, int, int *); - virtual void unpack_forward2(int, void *, int, int *); - virtual void pack_reverse2(int, void *, int, int *); - virtual void unpack_reverse2(int, void *, int, int *); + virtual void pack_forward_grid(int, void *, int, int *); + virtual void unpack_forward_grid(int, void *, int, int *); + virtual void pack_reverse_grid(int, void *, int, int *); + virtual void unpack_reverse_grid(int, void *, int, int *); // triclinic diff --git a/src/kspace.h b/src/kspace.h index 4d686d59f0..f5229c57a7 100644 --- a/src/kspace.h +++ b/src/kspace.h @@ -121,15 +121,16 @@ class KSpace : protected Pointers { virtual void compute(int, int) = 0; virtual void compute_group_group(int, int, int) {}; + // can remove these 4 when done with new GridComm virtual void pack_forward(int, FFT_SCALAR *, int, int *) {}; virtual void unpack_forward(int, FFT_SCALAR *, int, int *) {}; virtual void pack_reverse(int, FFT_SCALAR *, int, int *) {}; virtual void unpack_reverse(int, FFT_SCALAR *, int, int *) {}; - virtual void pack_forward2(int, void *, int, int *) {}; - virtual void unpack_forward2(int, void *, int, int *) {}; - virtual void pack_reverse2(int, void *, int, int *) {}; - virtual void unpack_reverse2(int, void *, int, int *) {}; + virtual void pack_forward_grid(int, void *, int, int *) {}; + virtual void unpack_forward_grid(int, void *, int, int *) {}; + virtual void pack_reverse_grid(int, void *, int, int *) {}; + virtual void unpack_reverse_grid(int, void *, int, int *) {}; virtual int timing(int, double &, double &) {return 0;} virtual int timing_1d(int, double &) {return 0;} From eae7312680d3e3c0787dacf4542b78ce44b06547 Mon Sep 17 00:00:00 2001 From: Steve Plimpton Date: Wed, 12 Aug 2020 14:25:28 -0600 Subject: [PATCH 03/38] changes to more PPPM variants for tiled support --- src/GPU/pppm_gpu.cpp | 48 +- src/GPU/pppm_gpu.h | 8 +- src/KSPACE/gridcomm.cpp | 831 +++++++++++++--- src/KSPACE/gridcomm.h | 195 +++- src/KSPACE/pppm.cpp | 342 +++---- src/KSPACE/pppm.h | 22 +- src/KSPACE/pppm_cg.cpp | 26 +- src/KSPACE/pppm_dipole.cpp | 117 +-- src/KSPACE/pppm_dipole.h | 15 +- src/KSPACE/pppm_dipole_spin.cpp | 44 +- src/KSPACE/pppm_dipole_spin.h | 1 - src/KSPACE/pppm_disp.cpp | 1489 +++++++++++++++------------- src/KSPACE/pppm_disp.h | 50 +- src/KSPACE/pppm_stagger.cpp | 23 +- src/KSPACE/pppm_stagger.h | 1 - src/USER-INTEL/pppm_disp_intel.cpp | 167 ++-- src/USER-INTEL/pppm_intel.cpp | 23 +- src/force.cpp | 4 - src/kspace.h | 6 - 19 files changed, 2093 insertions(+), 1319 deletions(-) diff --git a/src/GPU/pppm_gpu.cpp b/src/GPU/pppm_gpu.cpp index 7c58f6d406..4dcfbdfee2 100644 --- a/src/GPU/pppm_gpu.cpp +++ b/src/GPU/pppm_gpu.cpp @@ -203,11 +203,7 @@ void PPPMGPU::compute(int eflag, int vflag) // If need per-atom energies/virials, allocate per-atom arrays here 
// so that particle map on host can be done concurrently with GPU calculations - if (evflag_atom && !peratom_allocate_flag) { - allocate_peratom(); - cg_peratom->ghost_notify(); - cg_peratom->setup(); - } + if (evflag_atom && !peratom_allocate_flag) allocate_peratom(); if (triclinic == 0) { bool success = true; @@ -258,10 +254,12 @@ void PPPMGPU::compute(int eflag, int vflag) // remap from 3d decomposition to FFT decomposition if (triclinic == 0) { - cg->reverse_comm(this,REVERSE_RHO_GPU); + gc->reverse_comm_kspace(this,1,sizeof(FFT_SCALAR),REVERSE_RHO_GPU, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); brick2fft_gpu(); } else { - cg->reverse_comm(this,REVERSE_RHO); + gc->reverse_comm_kspace(this,1,sizeof(FFT_SCALAR),REVERSE_RHO, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); PPPM::brick2fft(); } @@ -274,16 +272,22 @@ void PPPMGPU::compute(int eflag, int vflag) // all procs communicate E-field values // to fill ghost cells surrounding their 3d bricks - if (differentiation_flag == 1) cg->forward_comm(this,FORWARD_AD); - else cg->forward_comm(this,FORWARD_IK); + if (differentiation_flag == 1) + gc->forward_comm_kspace(this,1,sizeof(FFT_SCALAR),FORWARD_AD, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); + else + gc->forward_comm_kspace(this,3,sizeof(FFT_SCALAR),FORWARD_IK, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); // extra per-atom energy/virial communication if (evflag_atom) { if (differentiation_flag == 1 && vflag_atom) - cg_peratom->forward_comm(this,FORWARD_AD_PERATOM); + gc->forward_comm_kspace(this,6,sizeof(FFT_SCALAR),FORWARD_AD_PERATOM, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); else if (differentiation_flag == 0) - cg_peratom->forward_comm(this,FORWARD_IK_PERATOM); + gc->forward_comm_kspace(this,7,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); } poisson_time += MPI_Wtime()-t3; @@ -510,8 +514,10 @@ void PPPMGPU::poisson_ik() pack own values to buf to send to another proc ------------------------------------------------------------------------- */ -void PPPMGPU::pack_forward(int flag, FFT_SCALAR *buf, int nlist, int *list) +void PPPMGPU::pack_forward_grid(int flag, void *vbuf, int nlist, int *list) { + FFT_SCALAR *buf = (FFT_SCALAR *) vbuf; + int n = 0; if (flag == FORWARD_IK) { @@ -568,8 +574,10 @@ void PPPMGPU::pack_forward(int flag, FFT_SCALAR *buf, int nlist, int *list) unpack another proc's own values from buf and set own ghost values ------------------------------------------------------------------------- */ -void PPPMGPU::unpack_forward(int flag, FFT_SCALAR *buf, int nlist, int *list) +void PPPMGPU::unpack_forward_grid(int flag, void *vbuf, int nlist, int *list) { + FFT_SCALAR *buf = (FFT_SCALAR *) vbuf; + int n = 0; if (flag == FORWARD_IK) { @@ -626,8 +634,10 @@ void PPPMGPU::unpack_forward(int flag, FFT_SCALAR *buf, int nlist, int *list) pack ghost values into buf to send to another proc ------------------------------------------------------------------------- */ -void PPPMGPU::pack_reverse(int flag, FFT_SCALAR *buf, int nlist, int *list) +void PPPMGPU::pack_reverse_grid(int flag, void *vbuf, int nlist, int *list) { + FFT_SCALAR *buf = (FFT_SCALAR *) vbuf; + if (flag == REVERSE_RHO_GPU) { FFT_SCALAR *src = &density_brick_gpu[nzlo_out][nylo_out][nxlo_out]; for (int i = 0; i < nlist; i++) @@ -643,8 +653,10 @@ void PPPMGPU::pack_reverse(int flag, FFT_SCALAR *buf, int nlist, int *list) unpack another proc's ghost values from buf and add to own values ------------------------------------------------------------------------- */ -void PPPMGPU::unpack_reverse(int flag, FFT_SCALAR *buf, int nlist, int *list) 
+void PPPMGPU::unpack_reverse_grid(int flag, void *buf, int nlist, int *list) { + FFT_SCALAR *buf = (FFT_SCALAR *) vbuf; + if (flag == REVERSE_RHO_GPU) { FFT_SCALAR *dest = &density_brick_gpu[nzlo_out][nylo_out][nxlo_out]; for (int i = 0; i < nlist; i++) @@ -818,7 +830,8 @@ void PPPMGPU::compute_group_group(int groupbit_A, int groupbit_B, int AA_flag) density_brick = density_A_brick; density_fft = density_A_fft; - cg->reverse_comm(this,REVERSE_RHO); + gc->reverse_comm_kspace(this,1,sizeof(FFT_SCALAR),REVERSE_RHO, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); brick2fft(); // group B @@ -826,7 +839,8 @@ void PPPMGPU::compute_group_group(int groupbit_A, int groupbit_B, int AA_flag) density_brick = density_B_brick; density_fft = density_B_fft; - cg->reverse_comm(this,REVERSE_RHO); + gc->reverse_comm_kspace(this,1,sizeof(FFT_SCALAR),REVERSE_RHO, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); brick2fft(); // switch back pointers diff --git a/src/GPU/pppm_gpu.h b/src/GPU/pppm_gpu.h index 737d6c1816..60b7e06d09 100644 --- a/src/GPU/pppm_gpu.h +++ b/src/GPU/pppm_gpu.h @@ -46,10 +46,10 @@ class PPPMGPU : public PPPM { void brick2fft_gpu(); virtual void poisson_ik(); - void pack_forward(int, FFT_SCALAR *, int, int *); - void unpack_forward(int, FFT_SCALAR *, int, int *); - void pack_reverse(int, FFT_SCALAR *, int, int *); - void unpack_reverse(int, FFT_SCALAR *, int, int *); + void pack_forward_grid(int, void *, int, int *); + void unpack_forward_grid(int, void *, int, int *); + void pack_reverse_grid(int, void *, int, int *); + void unpack_reverse_grid(int, void *, int, int *); FFT_SCALAR ***create_3d_offset(int, int, int, int, int, int, const char *, FFT_SCALAR *, int); diff --git a/src/KSPACE/gridcomm.cpp b/src/KSPACE/gridcomm.cpp index 368e7370fe..b292fd53da 100644 --- a/src/KSPACE/gridcomm.cpp +++ b/src/KSPACE/gridcomm.cpp @@ -13,27 +13,45 @@ #include "gridcomm.h" #include +#include "comm.h" #include "kspace.h" +#include "irregular.h" #include "memory.h" using namespace LAMMPS_NS; +enum{REGULAR,TILED}; + #define SWAPDELTA 8 -/* ---------------------------------------------------------------------- */ +// NOTE: gridcomm needs to be world for TILED, will it work with MSM? 
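// layout recap: REGULAR = procs form a logical 3d grid and exchange ghost
//   planes only with their 6 face neighbors via the swap list (setup_regular);
//   TILED = procs own RCB-style bricks and ghosts are exchanged with
//   point-to-point sends/recvs plus self copies (setup_tiled)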
+// NOTE: Tiled implementation here only works for RCB, not general tiled -GridComm::GridComm(LAMMPS *lmp, MPI_Comm gcomm, int forward, int reverse, - int ixlo, int ixhi, int iylo, int iyhi, int izlo, int izhi, - int oxlo, int oxhi, int oylo, int oyhi, int ozlo, int ozhi, - int pxlo, int pxhi, int pylo, int pyhi, int pzlo, int pzhi) +/* ---------------------------------------------------------------------- + gcomm = MPI communicator that shares this grid + does not have to be world, see MSM + gn xyz = size of global grid + i xyz lohi = portion of global grid this proc owns, 0 <= index < N + o xyz lohi = owned grid portion + ghost grid cells needed in all directions + if o indices are < 0 or hi indices are >= N, + then grid is treated as periodic in that dimension, + communication is done across the periodic boundaries +------------------------------------------------------------------------- */ + +GridComm::GridComm(LAMMPS *lmp, MPI_Comm gcomm, + int gnx, int gny, int gnz, + int ixlo, int ixhi, int iylo, int iyhi, int izlo, int izhi, + int oxlo, int oxhi, int oylo, int oyhi, int ozlo, int ozhi) : Pointers(lmp) { gridcomm = gcomm; MPI_Comm_rank(gridcomm,&me); + MPI_Comm_size(gridcomm,&nprocs); - nforward = forward; - nreverse = reverse; - + nx = gnx; + ny = gny; + nz = gnz; + inxlo = ixlo; inxhi = ixhi; inylo = iylo; @@ -48,6 +66,12 @@ GridComm::GridComm(LAMMPS *lmp, MPI_Comm gcomm, int forward, int reverse, outzlo = ozlo; outzhi = ozhi; + // layout == REGULAR or TILED + // for REGULAR, proc xyz lohi = my 6 neighbor procs + + layout = REGULAR; + if (comm->layout == Comm::LAYOUT_TILED) layout = TILED; + outxlo_max = oxlo; outxhi_max = oxhi; outylo_max = oylo; @@ -55,33 +79,48 @@ GridComm::GridComm(LAMMPS *lmp, MPI_Comm gcomm, int forward, int reverse, outzlo_max = ozlo; outzhi_max = ozhi; - procxlo = pxlo; - procxhi = pxhi; - procylo = pylo; - procyhi = pyhi; - proczlo = pzlo; - proczhi = pzhi; + if (layout == REGULAR) { + int (*procneigh)[2] = comm->procneigh; - nswap = 0; + procxlo = procneigh[0][0]; + procxhi = procneigh[0][1]; + procylo = procneigh[1][0]; + procyhi = procneigh[1][1]; + proczlo = procneigh[2][0]; + proczhi = procneigh[2][1]; + } + + nswap = maxswap = 0; swap = NULL; - buf1 = buf2 = NULL; + + nsend = nrecv = ncopy = 0; + send = NULL; + recv = NULL; + copy = NULL; + requests = NULL; } -/* ---------------------------------------------------------------------- */ +/* ---------------------------------------------------------------------- + same as first constructor except o xyz lohi max are added arguments + this is for case when caller stores grid in a larger array than o xyz lohi + only affects indices() method which generates indices into the caller's array +------------------------------------------------------------------------- */ -GridComm::GridComm(LAMMPS *lmp, MPI_Comm gcomm, int forward, int reverse, - int ixlo, int ixhi, int iylo, int iyhi, int izlo, int izhi, - int oxlo, int oxhi, int oylo, int oyhi, int ozlo, int ozhi, - int oxlo_max, int oxhi_max, int oylo_max, int oyhi_max, - int ozlo_max, int ozhi_max, - int pxlo, int pxhi, int pylo, int pyhi, int pzlo, int pzhi) +GridComm::GridComm(LAMMPS *lmp, MPI_Comm gcomm, + int gnx, int gny, int gnz, + int ixlo, int ixhi, int iylo, int iyhi, int izlo, int izhi, + int oxlo, int oxhi, int oylo, int oyhi, int ozlo, int ozhi, + int oxlo_max, int oxhi_max, int oylo_max, int oyhi_max, + int ozlo_max, int ozhi_max) : Pointers(lmp) { gridcomm = gcomm; MPI_Comm_rank(gridcomm,&me); + MPI_Comm_size(gridcomm,&nprocs); - nforward = 
forward; - nreverse = reverse; + nx = gnx; + ny = gny; + nz = gnz; inxlo = ixlo; inxhi = ixhi; @@ -104,43 +143,85 @@ GridComm::GridComm(LAMMPS *lmp, MPI_Comm gcomm, int forward, int reverse, outzlo_max = ozlo_max; outzhi_max = ozhi_max; - procxlo = pxlo; - procxhi = pxhi; - procylo = pylo; - procyhi = pyhi; - proczlo = pzlo; - proczhi = pzhi; + // layout == REGULAR or TILED + // for REGULAR, proc xyz lohi = my 6 neighbor procs - nswap = 0; + layout = REGULAR; + if (comm->layout == Comm::LAYOUT_TILED) layout = TILED; + + if (layout == REGULAR) { + int (*procneigh)[2] = comm->procneigh; + + procxlo = procneigh[0][0]; + procxhi = procneigh[0][1]; + procylo = procneigh[1][0]; + procyhi = procneigh[1][1]; + proczlo = procneigh[2][0]; + proczhi = procneigh[2][1]; + } + + nswap = maxswap = 0; swap = NULL; - buf1 = buf2 = NULL; + + nsend = nrecv = ncopy = 0; + send = NULL; + recv = NULL; + copy = NULL; + requests = NULL; } /* ---------------------------------------------------------------------- */ GridComm::~GridComm() { + // regular comm data struct + for (int i = 0; i < nswap; i++) { memory->destroy(swap[i].packlist); memory->destroy(swap[i].unpacklist); } memory->sfree(swap); - memory->destroy(buf1); - memory->destroy(buf2); + // tiled comm data structs + + for (int i = 0; i < nsend; i++) + memory->destroy(send[i].packlist); + memory->sfree(send); + + for (int i = 0; i < nrecv; i++) + memory->destroy(recv[i].unpacklist); + memory->sfree(recv); + + for (int i = 0; i < ncopy; i++) { + memory->destroy(copy[i].packlist); + memory->destroy(copy[i].unpacklist); + } + memory->sfree(copy); + + delete [] requests; } -/* ---------------------------------------------------------------------- - notify 6 neighbor procs how many ghost grid planes I need from them - ghostxlo = # of lower grid planes I own that are needed from me - by procxlo to become its upper ghost planes - ghostxhi = # of upper grid planes I own that are needed from me - by procxhi to become its lower ghost planes - if no neighbor proc, value is from self -------------------------------------------------------------------------- */ +/* ---------------------------------------------------------------------- */ -void GridComm::ghost_notify() +void GridComm::setup(int &nbuf1, int &nbuf2) { + if (layout == REGULAR) setup_regular(nbuf1,nbuf2); + else setup_tiled(nbuf1,nbuf2); +} + +/* ---------------------------------------------------------------------- */ + +void GridComm::setup_regular(int &nbuf1, int &nbuf2) +{ + int nsent,sendfirst,sendlast,recvfirst,recvlast; + int sendplanes,recvplanes; + int notdoneme,notdone; + + // notify 6 neighbor procs how many ghost grid planes I need from them + // ghost xyz lo = # of my lower grid planes that proc xyz lo needs as its ghosts + // ghost xyz hi = # of my upper grid planes that proc xyz hi needs as its ghosts + // if this proc is its own neighbor across periodic bounary, value is from self + int nplanes = inxlo - outxlo; if (procxlo != me) MPI_Sendrecv(&nplanes,1,MPI_INT,procxlo,0, @@ -176,45 +257,11 @@ void GridComm::ghost_notify() MPI_Sendrecv(&nplanes,1,MPI_INT,proczhi,0, &ghostzlo,1,MPI_INT,proczlo,0,gridcomm,MPI_STATUS_IGNORE); else ghostzlo = nplanes; -} -/* ---------------------------------------------------------------------- - check if all ghost grid comm needs overlap into non nearest-neighbor proc - if yes, return 1, else return 0 -------------------------------------------------------------------------- */ - -int GridComm::ghost_overlap() -{ - int nearest = 0; - if (ghostxlo > 
inxhi-inxlo+1) nearest = 1; - if (ghostxhi > inxhi-inxlo+1) nearest = 1; - if (ghostylo > inyhi-inylo+1) nearest = 1; - if (ghostyhi > inyhi-inylo+1) nearest = 1; - if (ghostzlo > inzhi-inzlo+1) nearest = 1; - if (ghostzhi > inzhi-inzlo+1) nearest = 1; - - int nearest_all; - MPI_Allreduce(&nearest,&nearest_all,1,MPI_INT,MPI_MIN,gridcomm); - - return nearest_all; -} - -/* ---------------------------------------------------------------------- - create swap stencil for grid own/ghost communication - swaps covers all 3 dimensions and both directions - swaps cover multiple iterations in a direction if need grid pts - from further away than nearest-neighbor proc - same swap list used by forward and reverse communication -------------------------------------------------------------------------- */ - -void GridComm::setup() -{ - int nsent,sendfirst,sendlast,recvfirst,recvlast; - int sendplanes,recvplanes; - int notdoneme,notdone; - - int maxswap = 6; - swap = (Swap *) memory->smalloc(maxswap*sizeof(Swap),"Commgrid:swap"); + // setup swaps = exchange of grid data with one of 6 neighobr procs + // can be more than one in a direction if ghost region extends beyond neigh proc + // all procs have same swap count, but swapsize npack/nunpack can be empty + nswap = 0; // send own grid pts to -x processor, recv ghost grid pts from +x processor @@ -226,11 +273,7 @@ void GridComm::setup() notdone = 1; while (notdone) { - if (nswap == maxswap) { - maxswap += SWAPDELTA; - swap = (Swap *) - memory->srealloc(swap,maxswap*sizeof(Swap),"Commgrid:swap"); - } + if (nswap == maxswap) grow_swap(); swap[nswap].sendproc = procxlo; swap[nswap].recvproc = procxhi; @@ -268,11 +311,7 @@ void GridComm::setup() notdone = 1; while (notdone) { - if (nswap == maxswap) { - maxswap += 1; - swap = (Swap *) - memory->srealloc(swap,maxswap*sizeof(Swap),"Commgrid:swap"); - } + if (nswap == maxswap) grow_swap(); swap[nswap].sendproc = procxhi; swap[nswap].recvproc = procxlo; @@ -310,11 +349,7 @@ void GridComm::setup() notdone = 1; while (notdone) { - if (nswap == maxswap) { - maxswap += SWAPDELTA; - swap = (Swap *) - memory->srealloc(swap,maxswap*sizeof(Swap),"Commgrid:swap"); - } + if (nswap == maxswap) grow_swap(); swap[nswap].sendproc = procylo; swap[nswap].recvproc = procyhi; @@ -352,11 +387,7 @@ void GridComm::setup() notdone = 1; while (notdone) { - if (nswap == maxswap) { - maxswap += 1; - swap = (Swap *) - memory->srealloc(swap,maxswap*sizeof(Swap),"Commgrid:swap"); - } + if (nswap == maxswap) grow_swap(); swap[nswap].sendproc = procyhi; swap[nswap].recvproc = procylo; @@ -394,11 +425,7 @@ void GridComm::setup() notdone = 1; while (notdone) { - if (nswap == maxswap) { - maxswap += SWAPDELTA; - swap = (Swap *) - memory->srealloc(swap,maxswap*sizeof(Swap),"Commgrid:swap"); - } + if (nswap == maxswap) grow_swap(); swap[nswap].sendproc = proczlo; swap[nswap].recvproc = proczhi; @@ -436,11 +463,7 @@ void GridComm::setup() notdone = 1; while (notdone) { - if (nswap == maxswap) { - maxswap += 1; - swap = (Swap *) - memory->srealloc(swap,maxswap*sizeof(Swap),"Commgrid:swap"); - } + if (nswap == maxswap) grow_swap(); swap[nswap].sendproc = proczhi; swap[nswap].recvproc = proczlo; @@ -469,39 +492,468 @@ void GridComm::setup() MPI_Allreduce(¬doneme,¬done,1,MPI_INT,MPI_SUM,gridcomm); } - // nbuf = max of any forward/reverse pack/unpack + // ngrid = max of any forward/reverse pack/unpack grid points - nbuf = 0; + int ngrid = 0; for (int i = 0; i < nswap; i++) { - nbuf = MAX(nbuf,swap[i].npack); - nbuf = MAX(nbuf,swap[i].nunpack); + ngrid = 
MAX(ngrid,swap[i].npack); + ngrid = MAX(ngrid,swap[i].nunpack); } - nbuf *= MAX(nforward,nreverse); - memory->create(buf1,nbuf,"Commgrid:buf1"); - memory->create(buf2,nbuf,"Commgrid:buf2"); + + nbuf1 = nbuf2 = ngrid; +} + +/* ---------------------------------------------------------------------- +------------------------------------------------------------------------- */ + +void GridComm::setup_tiled(int &nbuf1, int &nbuf2) +{ + int i,m; + double xlo,xhi,ylo,yhi,zlo,zhi; + int ghostbox[6],pbc[3]; + + // setup RCB tree of cut info for grid + // access CommTiled to get cut dimension + // cut = this proc's inlo in that dim + // dim is -1 for proc 0, but never accessed + + rcbinfo = (RCBinfo *) + memory->smalloc(nprocs*sizeof(RCBinfo),"GridComm:rcbinfo"); + RCBinfo rcbone; + rcbone.dim = comm->rcbcutdim; + if (rcbone.dim <= 0) rcbone.cut = inxlo; + else if (rcbone.dim == 1) rcbone.cut = inylo; + else if (rcbone.dim == 2) rcbone.cut = inzlo; + MPI_Allgather(&rcbone,sizeof(RCBinfo),MPI_CHAR, + rcbinfo,sizeof(RCBinfo),MPI_CHAR,gridcomm); + + // find overlaps of my extended ghost box with all other procs + // accounts for crossings of periodic boundaries + // noverlap = # of overlaps, including self + // overlap = vector of overlap info using Overlap data struct + + ghostbox[0] = outxlo; + ghostbox[1] = outxhi; + ghostbox[2] = outylo; + ghostbox[3] = outyhi; + ghostbox[4] = outzlo; + ghostbox[5] = outzhi; + + pbc[0] = pbc[1] = pbc[2] = 0; + + memory->create(overlap_procs,nprocs,"GridComm:overlap_procs"); + noverlap = maxoverlap = 0; + overlap = NULL; + + ghost_box_drop(ghostbox,pbc); + + // send each proc an overlap message + // content: me, index of my overlap, box that overlaps with its owned cells + // ncopy = # of overlaps with myself, across a periodic boundary + + int *proclist; + memory->create(proclist,noverlap,"GridComm:proclist"); + srequest = (Request *) + memory->smalloc(noverlap*sizeof(Request),"GridComm:srequest"); + + int nsend_request = 0; + ncopy = 0; + + for (m = 0; m < noverlap; m++) { + if (overlap[m].proc == me) ncopy++; + else { + proclist[nsend_request] = overlap[m].proc; + srequest[nsend_request].sender = me; + srequest[nsend_request].index = m; + for (i = 0; i < 6; i++) + srequest[nsend_request].box[i] = overlap[m].box[i]; + nsend_request++; + } + } + + Irregular *irregular = new Irregular(lmp); + int nrecv_request = irregular->create_data(nsend_request,proclist,1); + Request *rrequest = + (Request *) memory->smalloc(nrecv_request*sizeof(Request),"GridComm:rrequest"); + irregular->exchange_data((char *) srequest,sizeof(Request),(char *) rrequest); + irregular->destroy_data(); + + // compute overlaps between received ghost boxes and my owned box + // overlap box used to setup my Send data struct and respond to requests + + send = (Send *) memory->smalloc(nrecv_request*sizeof(Send),"GridComm:send"); + sresponse = (Response *) + memory->smalloc(nrecv_request*sizeof(Response),"GridComm:sresponse"); + memory->destroy(proclist); + memory->create(proclist,nrecv_request,"GridComm:proclist"); + + for (m = 0; m < nrecv_request; m++) { + send[m].proc = rrequest[m].sender; + xlo = MAX(rrequest[m].box[0],inxlo); + xhi = MIN(rrequest[m].box[1],inxhi); + ylo = MAX(rrequest[m].box[2],inylo); + yhi = MIN(rrequest[m].box[3],inyhi); + zlo = MAX(rrequest[m].box[4],inzlo); + zhi = MIN(rrequest[m].box[5],inzhi); + send[m].npack = indices(send[m].packlist,xlo,xhi,ylo,yhi,zlo,zhi); + + proclist[m] = rrequest[m].sender; + sresponse[m].index = rrequest[m].index; + sresponse[m].box[0] = xlo; + 
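/* ----------------------------------------------------------------------
   worked example of the ghost-box handshake being set up here, with
   hypothetical numbers: a requestor whose ghost box spans x = -2..3 on an
   nx = 32 grid has already had that box split by ghost_box_drop(), so the
   Request I receive carries box[0..1] = 30..31 while the requestor keeps
   pbc[0] = -1 in its Overlap entry; I clamp 30..31 to my owned extent
   (say inxlo..inxhi = 24..31), build my packlist from those cells, and
   echo 30..31 back in the Response; the requestor then unwraps the box
   with pbc[0]*nx = -32, giving -2..-1, which indexes its own ghost cells
------------------------------------------------------------------------- */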
sresponse[m].box[1] = xhi; + sresponse[m].box[2] = ylo; + sresponse[m].box[3] = yhi; + sresponse[m].box[4] = zlo; + sresponse[m].box[5] = zhi; + } + + nsend = nrecv_request; + + // reply to each Request message with a Response message + // content: index for the overlap on requestor, overlap box on my owned grid + + int nsend_response = nrecv_request; + int nrecv_response = irregular->create_data(nsend_response,proclist,1); + Response *rresponse = + (Response *) memory->smalloc(nrecv_response*sizeof(Response),"GridComm:rresponse"); + irregular->exchange_data((char *) sresponse,sizeof(Response),(char *) rresponse); + irregular->destroy_data(); + delete irregular; + + // process received responses + // box used to setup my Recv data struct after unwrapping via PBC + // adjacent = 0 if any box of ghost cells does not adjoin my owned cells + + recv = (Recv *) memory->smalloc(nrecv_response*sizeof(Recv),"CommGrid:recv"); + adjacent = 1; + + for (i = 0; i < nrecv_response; i++) { + m = rresponse[i].index; + recv[i].proc = overlap[m].proc; + xlo = rresponse[i].box[0] + overlap[m].pbc[0] * nx; + xhi = rresponse[i].box[1] + overlap[m].pbc[0] * nx; + ylo = rresponse[i].box[2] + overlap[m].pbc[1] * ny; + yhi = rresponse[i].box[3] + overlap[m].pbc[1] * ny; + zlo = rresponse[i].box[4] + overlap[m].pbc[2] * nz; + zhi = rresponse[i].box[5] + overlap[m].pbc[2] * nz; + recv[i].nunpack = indices(recv[i].unpacklist,xlo,xhi,ylo,yhi,zlo,zhi); + + if (xlo != inxhi+1 && xhi != inxlo-1 && + ylo != inyhi+1 && yhi != inylo-1 && + zlo != inzhi+1 && zhi != inzlo-1) adjacent = 0; + } + + nrecv = nrecv_response; + + // create Copy data struct from overlaps with self + + copy = (Copy *) memory->smalloc(ncopy*sizeof(Copy),"CommGrid:copy"); + + ncopy = 0; + for (m = 0; m < noverlap; m++) { + if (overlap[m].proc != me) continue; + xlo = overlap[m].box[0]; + xhi = overlap[m].box[1]; + ylo = overlap[m].box[2]; + yhi = overlap[m].box[3]; + zlo = overlap[m].box[4]; + zhi = overlap[m].box[5]; + copy[ncopy].npack = indices(copy[ncopy].packlist,xlo,xhi,ylo,yhi,zlo,zhi); + xlo = overlap[m].box[0] + overlap[m].pbc[0] * nx; + xhi = overlap[m].box[1] + overlap[m].pbc[0] * nx; + ylo = overlap[m].box[2] + overlap[m].pbc[1] * ny; + yhi = overlap[m].box[3] + overlap[m].pbc[1] * ny; + zlo = overlap[m].box[4] + overlap[m].pbc[2] * nz; + zhi = overlap[m].box[5] + overlap[m].pbc[2] * nz; + copy[ncopy].nunpack = indices(copy[ncopy].unpacklist,xlo,xhi,ylo,yhi,zlo,zhi); + ncopy++; + } + + // set offsets for received data + + int offset = 0; + for (m = 0; m < nsend; m++) { + send[m].offset = offset; + offset += send[m].npack; + } + + offset = 0; + for (m = 0; m < nrecv; m++) { + recv[m].offset = offset; + offset += recv[m].nunpack; + } + + // length of MPI requests vector is max of nsend, nrecv + + int nrequest = MAX(nsend,nrecv); + requests = new MPI_Request[nrequest]; + + // clean-up + + memory->sfree(rcbinfo); + memory->destroy(proclist); + memory->destroy(overlap_procs); + memory->sfree(overlap); + memory->sfree(srequest); + memory->sfree(rrequest); + memory->sfree(sresponse); + memory->sfree(rresponse); + + // nbuf1 = largest pack or unpack in any Send or Recv or Copy + // nbuf2 = larget of sum of all packs or unpacks in Send or Recv + + nbuf1 = 0; + + for (m = 0; m < ncopy; m++) { + nbuf1 = MAX(nbuf1,copy[m].npack); + nbuf1 = MAX(nbuf1,copy[m].nunpack); + } + + int nbufs = 0; + for (m = 0; m < nsend; m++) { + nbuf1 = MAX(nbuf1,send[m].npack); + nbufs += send[m].npack; + } + + int nbufr = 0; + for (m = 0; m < nrecv; m++) { + nbuf1 = 
MAX(nbuf1,recv[m].nunpack); + nbufr += recv[m].nunpack; + } + + nbuf2 = MAX(nbufs,nbufr); +} + +/* ---------------------------------------------------------------------- +------------------------------------------------------------------------- */ + +void GridComm::ghost_box_drop(int *box, int *pbc) +{ + int i,m; + + // newbox12 and newpbc are initially copies of caller box and pbc + + int newbox1[6],newbox2[6],newpbc[3]; + + for (i = 0; i < 6; i++) newbox1[i] = newbox2[i] = box[i]; + for (i = 0; i < 3; i++) newpbc[i] = pbc[i]; + + // 6 if tests to see if box needs to be split across a periodic boundary + // final else is no split + + int splitflag = 1; + + if (box[0] < 0) { + newbox1[0] = 0; + newbox2[0] = box[0] + nx; + newbox2[1] = nx - 1; + newpbc[0]--; + } else if (box[1] >= nx) { + newbox1[1] = nx - 1; + newbox2[0] = 0; + newbox2[1] = box[1] - nx; + newpbc[0]++; + } else if (box[2] < 0) { + newbox1[2] = 0; + newbox2[2] = box[2] + ny; + newbox2[3] = ny - 1; + newpbc[1]--; + } else if (box[3] >= ny) { + newbox1[3] = ny - 1; + newbox2[2] = 0; + newbox2[3] = box[3] - ny; + newpbc[1]++; + } else if (box[4] < 0) { + newbox1[4] = 0; + newbox2[4] = box[4] + nz; + newbox2[5] = nz - 1; + newpbc[2]--; + } else if (box[5] >= nz) { + newbox1[5] = nz - 1; + newbox2[4] = 0; + newbox2[5] = box[5] - nz; + newpbc[2]++; + + // box is not split, drop on RCB tree + // returns nprocs = # of procs it overlaps, including self + // returns proc_overlap = list of proc IDs it overlaps + // skip self overlap if no crossing of periodic boundaries + + } else { + splitflag = 0; + int np = 0; + box_drop_grid(box,0,nprocs-1,np,overlap_procs); + for (m = 0; m < np; m++) { + if (noverlap == maxoverlap) grow_overlap(); + if (overlap_procs[m] == me && + pbc[0] == 0 && pbc[1] == 0 && pbc[2] == 0) continue; + overlap[noverlap].proc = overlap_procs[m]; + for (i = 0; i < 6; i++) overlap[noverlap].box[i] = box[i]; + for (i = 0; i < 3; i++) overlap[noverlap].pbc[i] = pbc[i]; + noverlap++; + } + } + + // recurse with 2 split boxes + + if (splitflag) { + ghost_box_drop(newbox1,pbc); + ghost_box_drop(newbox2,newpbc); + } +} + +/* ---------------------------------------------------------------------- +------------------------------------------------------------------------- */ + +void GridComm::box_drop_grid(int *box, int proclower, int procupper, + int &np, int *plist) +{ + // end recursion when partition is a single proc + // add proclower to plist + + if (proclower == procupper) { + plist[np++] = proclower; + return; + } + + // drop box on each side of cut it extends beyond + // use < and >= criteria so does not include a box it only touches + // procmid = 1st processor in upper half of partition + // = location in tree that stores this cut + // cut = index of first grid cell in upper partition + // dim = 0,1,2 dimension of cut + + int procmid = proclower + (procupper - proclower) / 2 + 1; + int dim = rcbinfo[procmid].dim; + int cut = rcbinfo[procmid].cut; + + if (box[2*dim] < cut) box_drop_grid(box,proclower,procmid-1,np,plist); + if (box[2*dim+1] >= cut) box_drop_grid(box,procmid,procupper,np,plist); +} + +/* ---------------------------------------------------------------------- + check if all procs only need ghost info from adjacent procs + return 1 if yes, 0 if no +------------------------------------------------------------------------- */ + +int GridComm::ghost_adjacent() +{ + if (layout == REGULAR) return ghost_adjacent_regular(); + return ghost_adjacent_tiled(); +} + +/* 
---------------------------------------------------------------------- + adjacent = 0 if a proc's ghost xyz lohi values exceed its subdomain size + return 0 if adjacent=0 for any proc, else 1 +------------------------------------------------------------------------- */ + +int GridComm::ghost_adjacent_regular() +{ + adjacent = 1; + if (ghostxlo > inxhi-inxlo+1) adjacent = 0; + if (ghostxhi > inxhi-inxlo+1) adjacent = 0; + if (ghostylo > inyhi-inylo+1) adjacent = 0; + if (ghostyhi > inyhi-inylo+1) adjacent = 0; + if (ghostzlo > inzhi-inzlo+1) adjacent = 0; + if (ghostzhi > inzhi-inzlo+1) adjacent = 0; + + int adjacent_all; + MPI_Allreduce(&adjacent,&adjacent_all,1,MPI_INT,MPI_MIN,gridcomm); + return adjacent_all; +} + +/* ---------------------------------------------------------------------- + adjacent = 0 if a proc's received ghosts were flagged + as non-adjacent in setup_tiled() + return 0 if adjacent=0 for any proc, else 1 +------------------------------------------------------------------------- */ + +int GridComm::ghost_adjacent_tiled() +{ + int adjacent_all; + MPI_Allreduce(&adjacent,&adjacent_all,1,MPI_INT,MPI_MIN,gridcomm); + return adjacent_all; } /* ---------------------------------------------------------------------- use swap list in forward order to acquire copy of all needed ghost grid pts ------------------------------------------------------------------------- */ -void GridComm::forward_comm(KSpace *kspace, int which) +void GridComm::forward_comm_kspace(KSpace *kspace, int nper, int nbyte, int which, + void *buf1, void *buf2, MPI_Datatype datatype) { - for (int m = 0; m < nswap; m++) { + if (layout == REGULAR) + forward_comm_kspace_regular(kspace,nper,nbyte,which,buf1,buf2,datatype); + else + forward_comm_kspace_tiled(kspace,nper,nbyte,which,buf1,buf2,datatype); +} + +/* ---------------------------------------------------------------------- */ + +void GridComm:: +forward_comm_kspace_regular(KSpace *kspace, int nper, int nbyte, int which, + void *buf1, void *buf2, MPI_Datatype datatype) +{ + int m; + MPI_Request request; + + for (m = 0; m < nswap; m++) { if (swap[m].sendproc == me) - kspace->pack_forward(which,buf2,swap[m].npack,swap[m].packlist); + kspace->pack_forward_grid(which,buf2,swap[m].npack,swap[m].packlist); else - kspace->pack_forward(which,buf1,swap[m].npack,swap[m].packlist); + kspace->pack_forward_grid(which,buf1,swap[m].npack,swap[m].packlist); if (swap[m].sendproc != me) { - MPI_Irecv(buf2,nforward*swap[m].nunpack,MPI_FFT_SCALAR, - swap[m].recvproc,0,gridcomm,&request); - MPI_Send(buf1,nforward*swap[m].npack,MPI_FFT_SCALAR, - swap[m].sendproc,0,gridcomm); - MPI_Wait(&request,MPI_STATUS_IGNORE); + if (swap[m].nunpack) MPI_Irecv(buf2,nper*swap[m].nunpack,datatype, + swap[m].recvproc,0,gridcomm,&request); + if (swap[m].npack) MPI_Send(buf1,nper*swap[m].npack,datatype, + swap[m].sendproc,0,gridcomm); + if (swap[m].nunpack) MPI_Wait(&request,MPI_STATUS_IGNORE); } - kspace->unpack_forward(which,buf2,swap[m].nunpack,swap[m].unpacklist); + kspace->unpack_forward_grid(which,buf2,swap[m].nunpack,swap[m].unpacklist); + } +} + +/* ---------------------------------------------------------------------- */ + +void GridComm:: +forward_comm_kspace_tiled(KSpace *kspace, int nper, int nbyte, int which, + void *buf1, void *vbuf2, MPI_Datatype datatype) +{ + int i,m,offset; + + char *buf2 = (char *) vbuf2; + + // post all receives + + for (m = 0; m < nrecv; m++) { + offset = nper * recv[m].offset * nbyte; + MPI_Irecv((void *) &buf2[offset],nper*recv[m].nunpack,datatype, + 
recv[m].proc,0,gridcomm,&requests[m]); + } + + // perform all sends to other procs + + for (m = 0; m < nsend; m++) { + kspace->pack_forward_grid(which,buf1,send[m].npack,send[m].packlist); + MPI_Send(buf1,nper*send[m].npack,datatype,send[m].proc,0,gridcomm); + } + + // perform all copies to self + + for (m = 0; m < ncopy; m++) { + kspace->pack_forward_grid(which,buf1,copy[m].npack,copy[m].packlist); + kspace->unpack_forward_grid(which,buf1,copy[m].nunpack,copy[m].unpacklist); + } + + // unpack all received data + + for (i = 0; i < nrecv; i++) { + MPI_Waitany(nrecv,requests,&m,MPI_STATUS_IGNORE); + offset = nper * recv[m].offset * nbyte; + kspace->unpack_forward_grid(which,(void *) &buf2[offset], + recv[m].nunpack,recv[m].unpacklist); } } @@ -510,26 +962,114 @@ void GridComm::forward_comm(KSpace *kspace, int which) for each owned grid pt that some other proc has copy of as a ghost grid pt ------------------------------------------------------------------------- */ -void GridComm::reverse_comm(KSpace *kspace, int which) +void GridComm::reverse_comm_kspace(KSpace *kspace, int nper, int nbyte, int which, + void *buf1, void *buf2, MPI_Datatype datatype) { - for (int m = nswap-1; m >= 0; m--) { + if (layout == REGULAR) + reverse_comm_kspace_regular(kspace,nper,nbyte,which,buf1,buf2,datatype); + else + reverse_comm_kspace_tiled(kspace,nper,nbyte,which,buf1,buf2,datatype); +} + +/* ---------------------------------------------------------------------- */ + +void GridComm:: +reverse_comm_kspace_regular(KSpace *kspace, int nper, int nbyte, int which, + void *buf1, void *buf2, MPI_Datatype datatype) +{ + int m; + MPI_Request request; + + for (m = nswap-1; m >= 0; m--) { if (swap[m].recvproc == me) - kspace->pack_reverse(which,buf2,swap[m].nunpack,swap[m].unpacklist); + kspace->pack_reverse_grid(which,buf2,swap[m].nunpack,swap[m].unpacklist); else - kspace->pack_reverse(which,buf1,swap[m].nunpack,swap[m].unpacklist); + kspace->pack_reverse_grid(which,buf1,swap[m].nunpack,swap[m].unpacklist); if (swap[m].recvproc != me) { - MPI_Irecv(buf2,nreverse*swap[m].npack,MPI_FFT_SCALAR, - swap[m].sendproc,0,gridcomm,&request); - MPI_Send(buf1,nreverse*swap[m].nunpack,MPI_FFT_SCALAR, - swap[m].recvproc,0,gridcomm); - MPI_Wait(&request,MPI_STATUS_IGNORE); + if (swap[m].npack) MPI_Irecv(buf2,nper*swap[m].npack,datatype, + swap[m].sendproc,0,gridcomm,&request); + if (swap[m].nunpack) MPI_Send(buf1,nper*swap[m].nunpack,datatype, + swap[m].recvproc,0,gridcomm); + if (swap[m].npack) MPI_Wait(&request,MPI_STATUS_IGNORE); } - kspace->unpack_reverse(which,buf2,swap[m].npack,swap[m].packlist); + kspace->unpack_reverse_grid(which,buf2,swap[m].npack,swap[m].packlist); } } +/* ---------------------------------------------------------------------- */ + +void GridComm:: +reverse_comm_kspace_tiled(KSpace *kspace, int nper, int nbyte, int which, + void *buf1, void *vbuf2, MPI_Datatype datatype) +{ + int i,m,offset; + + char *buf2 = (char *) vbuf2; + + // post all receives + + for (m = 0; m < nsend; m++) { + offset = nper * send[m].offset * nbyte; + MPI_Irecv((void *) &buf2[offset],nper*send[m].npack,datatype, + send[m].proc,0,gridcomm,&requests[m]); + } + + // perform all sends to other procs + + for (m = 0; m < nrecv; m++) { + kspace->pack_reverse_grid(which,buf1,recv[m].nunpack,recv[m].unpacklist); + MPI_Send(buf1,nper*recv[m].nunpack,datatype,recv[m].proc,0,gridcomm); + } + + // perform all copies to self + + for (m = 0; m < ncopy; m++) { + kspace->pack_reverse_grid(which,buf1,copy[m].nunpack,copy[m].unpacklist); + 
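/* ----------------------------------------------------------------------
   note on the offset arithmetic used in the tiled send/recv loops: vbuf2
   is addressed in bytes (hence the cast to char *) because one large
   buffer holds every incoming message back-to-back; with hypothetical
   values nper = 3 (ik E-field), nbyte = sizeof(FFT_SCALAR) = 8 for
   double-precision FFTs, and recv[m].offset = 100 grid points, that
   message lands at byte 3*100*8 = 2400 of buf2, while the matching MPI
   receive count is nper*recv[m].nunpack elements of the caller-supplied
   datatype
------------------------------------------------------------------------- */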
kspace->unpack_reverse_grid(which,buf1,copy[m].npack,copy[m].packlist); + } + + // unpack all received data + + for (i = 0; i < nsend; i++) { + MPI_Waitany(nsend,requests,&m,MPI_STATUS_IGNORE); + offset = nper * send[m].offset * nbyte; + kspace->unpack_reverse_grid(which,(void *) &buf2[offset], + send[m].npack,send[m].packlist); + } +} + +/* ---------------------------------------------------------------------- + create swap stencil for grid own/ghost communication + swaps covers all 3 dimensions and both directions + swaps cover multiple iterations in a direction if need grid pts + from further away than nearest-neighbor proc + same swap list used by forward and reverse communication +------------------------------------------------------------------------- */ + +void GridComm::grow_swap() +{ + maxswap += SWAPDELTA; + swap = (Swap *) + memory->srealloc(swap,maxswap*sizeof(Swap),"CommGrid:swap"); +} + +/* ---------------------------------------------------------------------- + create swap stencil for grid own/ghost communication + swaps covers all 3 dimensions and both directions + swaps cover multiple iterations in a direction if need grid pts + from further away than nearest-neighbor proc + same swap list used by forward and reverse communication +------------------------------------------------------------------------- */ + +void GridComm::grow_overlap() +{ + maxoverlap += SWAPDELTA; + overlap = (Overlap *) + memory->srealloc(overlap,maxoverlap*sizeof(Overlap),"CommGrid:overlap"); +} + /* ---------------------------------------------------------------------- create 1d list of offsets into 3d array section (xlo:xhi,ylo:yhi,zlo:zhi) assume 3d array is allocated as (outxlo_max:outxhi_max,outylo_max:outyhi_max, @@ -540,7 +1080,8 @@ int GridComm::indices(int *&list, int xlo, int xhi, int ylo, int yhi, int zlo, int zhi) { int nmax = (xhi-xlo+1) * (yhi-ylo+1) * (zhi-zlo+1); - memory->create(list,nmax,"Commgrid:list"); + memory->create(list,nmax,"CommGrid:indices"); + if (nmax == 0) return 0; int nx = (outxhi_max-outxlo_max+1); int ny = (outyhi_max-outylo_max+1); @@ -554,13 +1095,3 @@ int GridComm::indices(int *&list, return nmax; } - -/* ---------------------------------------------------------------------- - memory usage of send/recv bufs -------------------------------------------------------------------------- */ - -double GridComm::memory_usage() -{ - double bytes = 2*nbuf * sizeof(double); - return bytes; -} diff --git a/src/KSPACE/gridcomm.h b/src/KSPACE/gridcomm.h index faa6d5c4fb..1cdfe28da2 100644 --- a/src/KSPACE/gridcomm.h +++ b/src/KSPACE/gridcomm.h @@ -16,56 +16,57 @@ #include "pointers.h" -#ifdef FFT_SINGLE -typedef float FFT_SCALAR; -#define MPI_FFT_SCALAR MPI_FLOAT -#else -typedef double FFT_SCALAR; -#define MPI_FFT_SCALAR MPI_DOUBLE -#endif - namespace LAMMPS_NS { class GridComm : protected Pointers { public: - GridComm(class LAMMPS *, MPI_Comm, int, int, - int, int, int, int, int, int, - int, int, int, int, int, int, - int, int, int, int, int, int); - GridComm(class LAMMPS *, MPI_Comm, int, int, - int, int, int, int, int, int, - int, int, int, int, int, int, - int, int, int, int, int, int, - int, int, int, int, int, int); + GridComm(class LAMMPS *, MPI_Comm, int, int, int, + int, int, int, int, int, int, + int, int, int, int, int, int); + GridComm(class LAMMPS *, MPI_Comm, int, int, int, + int, int, int, int, int, int, + int, int, int, int, int, int, + int, int, int, int, int, int); ~GridComm(); - void ghost_notify(); - int ghost_overlap(); - void setup(); - void 
forward_comm(class KSpace *, int); - void reverse_comm(class KSpace *, int); - double memory_usage(); + void setup(int &, int &); + int ghost_adjacent(); + void forward_comm_kspace(class KSpace *, int, int, int, + void *, void *, MPI_Datatype); + void reverse_comm_kspace(class KSpace *, int, int, int, + void *, void *, MPI_Datatype); private: - int me; - int nforward,nreverse; + int me,nprocs; + int layout; // REGULAR or TILED MPI_Comm gridcomm; - MPI_Request request; - // in = inclusive indices of 3d grid chunk that I own - // out = inclusive indices of 3d grid chunk I own plus ghosts I use - // proc = 6 neighbor procs that surround me - // ghost = # of my owned grid planes needed from me - // by each of 6 neighbor procs to become their ghost planes + // inputs from caller via constructor - int inxlo,inxhi,inylo,inyhi,inzlo,inzhi; - int outxlo,outxhi,outylo,outyhi,outzlo,outzhi; - int outxlo_max,outxhi_max,outylo_max,outyhi_max,outzlo_max,outzhi_max; - int procxlo,procxhi,procylo,procyhi,proczlo,proczhi; - int ghostxlo,ghostxhi,ghostylo,ghostyhi,ghostzlo,ghostzhi; + int nx,ny,nz; // size of global grid in all 3 dims + int inxlo,inxhi; // inclusive extent of my grid chunk + int inylo,inyhi; // 0 <= in <= N-1 + int inzlo,inzhi; + int outxlo,outxhi; // inclusive extent of my grid chunk plus + int outylo,outyhi; // ghost cells in all 6 directions + int outzlo,outzhi; // lo indices can be < 0, hi indices can be >= N + int outxlo_max,outxhi_max; // ?? + int outylo_max,outyhi_max; + int outzlo_max,outzhi_max; - int nbuf; - FFT_SCALAR *buf1,*buf2; + // ------------------------------------------- + // internal variables for REGULAR layout + // ------------------------------------------- + int procxlo,procxhi; // 6 neighbor procs that adjoin me + int procylo,procyhi; // not used for comm_style = tiled + int proczlo,proczhi; + + int ghostxlo,ghostxhi; // # of my owned grid planes needed + int ghostylo,ghostyhi; // by neighobr procs in each dir as their ghost planes + int ghostzlo,ghostzhi; + + // swap = exchange of owned and ghost grid cells between 2 procs, including self + struct Swap { int sendproc; // proc to send to for forward comm int recvproc; // proc to recv from for forward comm @@ -75,9 +76,125 @@ class GridComm : protected Pointers { int *unpacklist; // 3d array offsets to unpack }; - int nswap; + int nswap,maxswap; Swap *swap; + // ------------------------------------------- + // internal variables for TILED layout + // ------------------------------------------- + + int *overlap_procs; + MPI_Request *requests; + + // RCB tree of cut info + // each proc contributes one value, except proc 0 + + struct RCBinfo { + int dim; // 0,1,2 = which dim the cut is in + int cut; // grid index of lowest cell in upper half of cut + }; + + RCBinfo *rcbinfo; + + // overlap = a proc whose owned cells overlap with my extended ghost box + // includes overlaps across periodic boundaries, can also be self + + struct Overlap { + int proc; // proc whose owned cells overlap my ghost cells + int box[6]; // box that overlaps otherproc's owned cells + // this box is wholly contained within global grid + int pbc[3]; // PBC offsets to convert box to a portion of my ghost box + // my ghost box may extend beyond global grid + }; + + int noverlap,maxoverlap; + Overlap *overlap; + + // request = sent to each proc whose owned cells overlap my ghost cells + + struct Request { + int sender; // sending proc + int index; // index of overlap on sender + int box[6]; // box that overlaps receiver's owned cells + // wholly 
contained within global grid + }; + + Request *srequest,*rrequest; + + // response = reply from each proc whose owned cells overlap my ghost cells + + struct Response { + int index; // index of my overlap for the initial request + int box[6]; // box that overlaps responder's owned cells + // wholly contained within global grid + // has to unwrapped by PBC to map to my ghost cells + }; + + Response *sresponse,*rresponse; + + // send = proc to send a subset of my owned cells to, for forward comm + // for reverse comm, proc I receive ghost overlaps with my owned cells from + // offset used in reverse comm to recv a message in middle of a large buffer + + struct Send { + int proc; + int npack; + int *packlist; + int offset; + }; + + // recv = proc to recv a subset of my ghost cells from, for forward comm + // for reverse comm, proc I send a subset of my ghost cells to + // offset used in forward comm to recv a message in middle of a large buffer + + struct Recv { + int proc; + int nunpack; + int *unpacklist; + int offset; + }; + + int adjacent; // 0 on a proc who receives ghosts from a non-neighbor proc + + // copy = subset of my owned cells to copy into subset of my ghost cells + // that describes forward comm, for reverse comm it is the opposite + + struct Copy { + int npack; + int nunpack; + int *packlist; + int *unpacklist; + }; + + int nsend,nrecv,ncopy; + Send *send; + Recv *recv; + Copy *copy; + + // ------------------------------------------- + // internal methods + // ------------------------------------------- + + void setup_regular(int &, int &); + void setup_tiled(int &, int &); + void ghost_box_drop(int *, int *); + void box_drop_grid(int *, int, int, int &, int *); + + int ghost_adjacent_regular(); + int ghost_adjacent_tiled(); + + void forward_comm_kspace_regular(class KSpace *, int, int, int, + void *, void *, MPI_Datatype); + void forward_comm_kspace_tiled(class KSpace *, int, int, int, + void *, void *, MPI_Datatype); + void reverse_comm_kspace_regular(class KSpace *, int, int, int, + void *, void *, MPI_Datatype); + void reverse_comm_kspace_tiled(class KSpace *, int, int, int, + void *, void *, MPI_Datatype); + + void grow_swap(); + void grow_overlap(); + int indices(int *&, int, int, int, int, int, int); }; diff --git a/src/KSPACE/pppm.cpp b/src/KSPACE/pppm.cpp index a44fc4ab6c..173893c22f 100644 --- a/src/KSPACE/pppm.cpp +++ b/src/KSPACE/pppm.cpp @@ -70,11 +70,12 @@ PPPM::PPPM(LAMMPS *lmp) : KSpace(lmp), u_brick(NULL), v0_brick(NULL), v1_brick(NULL), v2_brick(NULL), v3_brick(NULL), v4_brick(NULL), v5_brick(NULL), greensfn(NULL), vg(NULL), fkx(NULL), fky(NULL), fkz(NULL), density_fft(NULL), work1(NULL), work2(NULL), gf_b(NULL), rho1d(NULL), - rho_coeff(NULL), drho1d(NULL), drho_coeff(NULL), sf_precoeff1(NULL), sf_precoeff2(NULL), - sf_precoeff3(NULL), sf_precoeff4(NULL), sf_precoeff5(NULL), sf_precoeff6(NULL), + rho_coeff(NULL), drho1d(NULL), drho_coeff(NULL), + sf_precoeff1(NULL), sf_precoeff2(NULL), sf_precoeff3(NULL), + sf_precoeff4(NULL), sf_precoeff5(NULL), sf_precoeff6(NULL), acons(NULL), density_A_brick(NULL), density_B_brick(NULL), density_A_fft(NULL), - density_B_fft(NULL), fft1(NULL), fft2(NULL), remap(NULL), cg(NULL), cg_peratom(NULL), - part2grid(NULL), boxlo(NULL) + density_B_fft(NULL), fft1(NULL), fft2(NULL), remap(NULL), gc(NULL), + gc_buf1(NULL), gc_buf2(NULL), part2grid(NULL), boxlo(NULL) { peratom_allocate_flag = 0; group_allocate_flag = 0; @@ -117,8 +118,8 @@ PPPM::PPPM(LAMMPS *lmp) : KSpace(lmp), fft1 = fft2 = NULL; remap = NULL; - cg = NULL; - 
cg_peratom = NULL; + gc = NULL; + gc_buf1 = gc_buf2 = NULL; nmax = 0; part2grid = NULL; @@ -205,9 +206,6 @@ void PPPM::init() "slab correction"); if (domain->dimension == 2) error->all(FLERR,"Cannot use PPPM with 2d simulation"); - if (comm->style != 0) - error->universe_all(FLERR,"PPPM can only currently be used with " - "comm_style brick"); if (!atom->q_flag) error->all(FLERR,"Kspace style requires atom attribute q"); @@ -297,9 +295,7 @@ void PPPM::init() // or overlap is allowed, then done // else reduce order and try again - int (*procneigh)[2] = comm->procneigh; - - GridComm *cgtmp = NULL; + GridComm *gctmp = NULL; int iteration = 0; while (order >= minorder) { @@ -312,24 +308,24 @@ void PPPM::init() set_grid_local(); if (overlap_allowed) break; - cgtmp = new GridComm(lmp,world,1,1, - nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, - nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out, - procneigh[0][0],procneigh[0][1],procneigh[1][0], - procneigh[1][1],procneigh[2][0],procneigh[2][1]); - cgtmp->ghost_notify(); - if (!cgtmp->ghost_overlap()) break; - delete cgtmp; + gctmp = new GridComm(lmp,world,nx_pppm,ny_pppm,nz_pppm, + nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, + nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out); + + int tmp1,tmp2; + gctmp->setup(tmp1,tmp2); + if (gctmp->ghost_adjacent()) break; + delete gctmp; order--; iteration++; } if (order < minorder) error->all(FLERR,"PPPM order < minimum allowed order"); - if (!overlap_allowed && cgtmp->ghost_overlap()) + if (!overlap_allowed && !gctmp->ghost_adjacent()) error->all(FLERR,"PPPM grid stencil extends " "beyond nearest neighbor processor"); - if (cgtmp) delete cgtmp; + if (gctmp) delete gctmp; // adjust g_ewald @@ -363,8 +359,6 @@ void PPPM::init() // don't invoke allocate peratom() or group(), will be allocated when needed allocate(); - cg->ghost_notify(); - cg->setup(); // pre-compute Green's function denomiator expansion // pre-compute 1d charge distribution coefficients @@ -578,11 +572,9 @@ void PPPM::setup_grid() allocate(); - cg->ghost_notify(); - if (overlap_allowed == 0 && cg->ghost_overlap()) + if (!overlap_allowed && !gc->ghost_adjacent()) error->all(FLERR,"PPPM grid stencil extends " "beyond nearest neighbor processor"); - cg->setup(); // pre-compute Green's function denomiator expansion // pre-compute 1d charge distribution coefficients @@ -591,7 +583,7 @@ void PPPM::setup_grid() if (differentiation_flag == 1) compute_sf_precoeff(); compute_rho_coeff(); - // pre-compute volume-dependent coeffs + // pre-compute volume-dependent coeffs for portion of grid I now own setup(); } @@ -609,11 +601,7 @@ void PPPM::compute(int eflag, int vflag) ev_init(eflag,vflag); - if (evflag_atom && !peratom_allocate_flag) { - allocate_peratom(); - cg_peratom->ghost_notify(); - cg_peratom->setup(); - } + if (evflag_atom && !peratom_allocate_flag) allocate_peratom(); // if atom count has changed, update qsum and qsqsum @@ -652,7 +640,8 @@ void PPPM::compute(int eflag, int vflag) // to fully sum contribution in their 3d bricks // remap from 3d decomposition to FFT decomposition - cg->reverse_comm(this,REVERSE_RHO); + gc->reverse_comm_kspace(this,1,sizeof(FFT_SCALAR),REVERSE_RHO, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); brick2fft(); // compute potential gradient on my FFT grid and @@ -665,16 +654,22 @@ void PPPM::compute(int eflag, int vflag) // all procs communicate E-field values // to fill ghost cells surrounding their 3d bricks - if (differentiation_flag == 1) cg->forward_comm(this,FORWARD_AD); - else 
cg->forward_comm(this,FORWARD_IK); + if (differentiation_flag == 1) + gc->forward_comm_kspace(this,1,sizeof(FFT_SCALAR),FORWARD_AD, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); + else + gc->forward_comm_kspace(this,3,sizeof(FFT_SCALAR),FORWARD_IK, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); // extra per-atom energy/virial communication if (evflag_atom) { if (differentiation_flag == 1 && vflag_atom) - cg_peratom->forward_comm(this,FORWARD_AD_PERATOM); + gc->forward_comm_kspace(this,6,sizeof(FFT_SCALAR),FORWARD_AD_PERATOM, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); else if (differentiation_flag == 0) - cg_peratom->forward_comm(this,FORWARD_IK_PERATOM); + gc->forward_comm_kspace(this,7,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); } // calculate the force on my particles @@ -821,21 +816,19 @@ void PPPM::allocate() 1,0,0,FFT_PRECISION,collective_flag); // create ghost grid object for rho and electric field communication + // also create 2 bufs for ghost grid cell comm, passed to GridComm methods + + gc = new GridComm(lmp,world,nx_pppm,ny_pppm,nz_pppm, + nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, + nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out); - int (*procneigh)[2] = comm->procneigh; + gc->setup(ngc_buf1,ngc_buf2); - if (differentiation_flag == 1) - cg = new GridComm(lmp,world,1,1, - nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, - nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out, - procneigh[0][0],procneigh[0][1],procneigh[1][0], - procneigh[1][1],procneigh[2][0],procneigh[2][1]); - else - cg = new GridComm(lmp,world,3,1, - nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, - nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out, - procneigh[0][0],procneigh[0][1],procneigh[1][0], - procneigh[1][1],procneigh[2][0],procneigh[2][1]); + if (differentiation_flag) npergrid = 1; + else npergrid = 3; + + memory->create(gc_buf1,npergrid*ngc_buf1,"pppm:gc_buf1"); + memory->create(gc_buf2,npergrid*ngc_buf2,"pppm:gc_buf2"); } /* ---------------------------------------------------------------------- @@ -886,7 +879,9 @@ void PPPM::deallocate() delete fft1; delete fft2; delete remap; - delete cg; + delete gc; + memory->destroy(gc_buf1); + memory->destroy(gc_buf2); } /* ---------------------------------------------------------------------- @@ -915,24 +910,16 @@ void PPPM::allocate_peratom() memory->create3d_offset(v5_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, nxlo_out,nxhi_out,"pppm:v5_brick"); - // create ghost grid object for rho and electric field communication + // use same GC ghost grid object for peratom grid communication + // but need to reallocate a larger gc_buf1 and gc_buf2 - int (*procneigh)[2] = comm->procneigh; + if (differentiation_flag) npergrid = 6; + else npergrid = 7; - if (differentiation_flag == 1) - cg_peratom = - new GridComm(lmp,world,6,1, - nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, - nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out, - procneigh[0][0],procneigh[0][1],procneigh[1][0], - procneigh[1][1],procneigh[2][0],procneigh[2][1]); - else - cg_peratom = - new GridComm(lmp,world,7,1, - nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, - nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out, - procneigh[0][0],procneigh[0][1],procneigh[1][0], - procneigh[1][1],procneigh[2][0],procneigh[2][1]); + memory->destroy(gc_buf1); + memory->destroy(gc_buf2); + memory->create(gc_buf1,npergrid*ngc_buf1,"pppm:gc_buf1"); + memory->create(gc_buf2,npergrid*ngc_buf2,"pppm:gc_buf2"); } /* ---------------------------------------------------------------------- @@ -952,8 
+939,6 @@ void PPPM::deallocate_peratom() if (differentiation_flag != 1) memory->destroy3d_offset(u_brick,nzlo_out,nylo_out,nxlo_out); - - delete cg_peratom; } /* ---------------------------------------------------------------------- @@ -1002,7 +987,8 @@ void PPPM::set_grid_global() int count = 0; while (1) { - // set grid dimension + // set grid dimensions + nx_pppm = static_cast (xprd/h_x); ny_pppm = static_cast (yprd/h_y); nz_pppm = static_cast (zprd_slab/h_z); @@ -1011,31 +997,16 @@ void PPPM::set_grid_global() if (ny_pppm <= 1) ny_pppm = 2; if (nz_pppm <= 1) nz_pppm = 2; - //set local grid dimension - int npey_fft,npez_fft; - if (nz_pppm >= nprocs) { - npey_fft = 1; - npez_fft = nprocs; - } else procs2grid2d(nprocs,ny_pppm,nz_pppm,&npey_fft,&npez_fft); - - int me_y = me % npey_fft; - int me_z = me / npey_fft; - - nxlo_fft = 0; - nxhi_fft = nx_pppm - 1; - nylo_fft = me_y*ny_pppm/npey_fft; - nyhi_fft = (me_y+1)*ny_pppm/npey_fft - 1; - nzlo_fft = me_z*nz_pppm/npez_fft; - nzhi_fft = (me_z+1)*nz_pppm/npez_fft - 1; - + // estimate Kspace force error + double df_kspace = compute_df_kspace(); - count++; - // break loop if the accuracy has been reached or // too many loops have been performed + count++; if (df_kspace <= accuracy) break; + if (count > 500) error->all(FLERR, "Could not compute grid size"); h *= 0.95; h_x = h_y = h_z = h; @@ -1163,7 +1134,11 @@ double PPPM::compute_df_kspace() double PPPM::compute_qopt() { - double qopt = 0.0; + int k,l,m,nx,ny,nz; + double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz; + double u1,u2,sqk; + double sum1,sum2,sum3,sum4,dot2; + double *prd = domain->prd; const double xprd = prd[0]; @@ -1176,67 +1151,69 @@ double PPPM::compute_qopt() const double unitky = (MY_2PI/yprd); const double unitkz = (MY_2PI/zprd_slab); - double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz; - double u1, u2, sqk; - double sum1,sum2,sum3,sum4,dot2; - - int k,l,m,nx,ny,nz; const int twoorder = 2*order; - for (m = nzlo_fft; m <= nzhi_fft; m++) { + // loop over entire FFT grid + // each proc calculates contributions from every Pth grid point + + bigint ngridtotal = (bigint) nx_pppm * ny_pppm * nz_pppm; + int nxy_pppm = nx_pppm * ny_pppm; + + double qopt = 0.0; + + for (bigint i = me; i < ngridtotal; i += nprocs) { + k = i % nx_pppm; + l = (i/nx_pppm) % ny_pppm; + m = i / nxy_pppm; + + const int kper = k - nx_pppm*(2*k/nx_pppm); + const int lper = l - ny_pppm*(2*l/ny_pppm); const int mper = m - nz_pppm*(2*m/nz_pppm); - for (l = nylo_fft; l <= nyhi_fft; l++) { - const int lper = l - ny_pppm*(2*l/ny_pppm); + sqk = square(unitkx*kper) + square(unitky*lper) + square(unitkz*mper); + if (sqk == 0.0) continue; - for (k = nxlo_fft; k <= nxhi_fft; k++) { - const int kper = k - nx_pppm*(2*k/nx_pppm); - - sqk = square(unitkx*kper) + square(unitky*lper) + square(unitkz*mper); - - if (sqk != 0.0) { - - sum1 = 0.0; - sum2 = 0.0; - sum3 = 0.0; - sum4 = 0.0; - for (nx = -2; nx <= 2; nx++) { - qx = unitkx*(kper+nx_pppm*nx); - sx = exp(-0.25*square(qx/g_ewald)); - argx = 0.5*qx*xprd/nx_pppm; - wx = powsinxx(argx,twoorder); - qx *= qx; - - for (ny = -2; ny <= 2; ny++) { - qy = unitky*(lper+ny_pppm*ny); - sy = exp(-0.25*square(qy/g_ewald)); - argy = 0.5*qy*yprd/ny_pppm; - wy = powsinxx(argy,twoorder); - qy *= qy; - - for (nz = -2; nz <= 2; nz++) { - qz = unitkz*(mper+nz_pppm*nz); - sz = exp(-0.25*square(qz/g_ewald)); - argz = 0.5*qz*zprd_slab/nz_pppm; - wz = powsinxx(argz,twoorder); - qz *= qz; - - dot2 = qx+qy+qz; - u1 = sx*sy*sz; - u2 = wx*wy*wz; - sum1 += u1*u1/dot2*MY_4PI*MY_4PI; - sum2 += u1 * u2 * 
MY_4PI; - sum3 += u2; - sum4 += dot2*u2; - } - } - } - sum2 *= sum2; - qopt += sum1 - sum2/(sum3*sum4); - } + sum1 = sum2 = sum3 = sum4 = 0.0; + + for (nx = -2; nx <= 2; nx++) { + qx = unitkx*(kper+nx_pppm*nx); + sx = exp(-0.25*square(qx/g_ewald)); + argx = 0.5*qx*xprd/nx_pppm; + wx = powsinxx(argx,twoorder); + qx *= qx; + + for (ny = -2; ny <= 2; ny++) { + qy = unitky*(lper+ny_pppm*ny); + sy = exp(-0.25*square(qy/g_ewald)); + argy = 0.5*qy*yprd/ny_pppm; + wy = powsinxx(argy,twoorder); + qy *= qy; + + for (nz = -2; nz <= 2; nz++) { + qz = unitkz*(mper+nz_pppm*nz); + sz = exp(-0.25*square(qz/g_ewald)); + argz = 0.5*qz*zprd_slab/nz_pppm; + wz = powsinxx(argz,twoorder); + qz *= qz; + + dot2 = qx+qy+qz; + u1 = sx*sy*sz; + u2 = wx*wy*wz; + + sum1 += u1*u1/dot2*MY_4PI*MY_4PI; + sum2 += u1 * u2 * MY_4PI; + sum3 += u2; + sum4 += dot2*u2; + } } } + + sum2 *= sum2; + qopt += sum1 - sum2/(sum3*sum4); } + + // sum qopt over all procs + double qopt_all; MPI_Allreduce(&qopt,&qopt_all,1,MPI_DOUBLE,MPI_SUM,world); return qopt_all; @@ -1349,17 +1326,30 @@ void PPPM::set_grid_local() // nlo_in,nhi_in = lower/upper limits of the 3d sub-brick of // global PPPM grid that I own without ghost cells // for slab PPPM, assign z grid as if it were not extended + // both non-tiled and tiled proc layouts use 0-1 fractional sumdomain info + + if (comm->layout != Comm::LAYOUT_TILED) { + nxlo_in = static_cast (comm->xsplit[comm->myloc[0]] * nx_pppm); + nxhi_in = static_cast (comm->xsplit[comm->myloc[0]+1] * nx_pppm) - 1; - nxlo_in = static_cast (comm->xsplit[comm->myloc[0]] * nx_pppm); - nxhi_in = static_cast (comm->xsplit[comm->myloc[0]+1] * nx_pppm) - 1; + nylo_in = static_cast (comm->ysplit[comm->myloc[1]] * ny_pppm); + nyhi_in = static_cast (comm->ysplit[comm->myloc[1]+1] * ny_pppm) - 1; - nylo_in = static_cast (comm->ysplit[comm->myloc[1]] * ny_pppm); - nyhi_in = static_cast (comm->ysplit[comm->myloc[1]+1] * ny_pppm) - 1; - - nzlo_in = static_cast + nzlo_in = static_cast (comm->zsplit[comm->myloc[2]] * nz_pppm/slab_volfactor); - nzhi_in = static_cast + nzhi_in = static_cast (comm->zsplit[comm->myloc[2]+1] * nz_pppm/slab_volfactor) - 1; + + } else { + nxlo_in = static_cast (comm->mysplit[0][0] * nx_pppm); + nxhi_in = static_cast (comm->mysplit[0][1] * nx_pppm) - 1; + + nylo_in = static_cast (comm->mysplit[1][0] * ny_pppm); + nyhi_in = static_cast (comm->mysplit[1][1] * ny_pppm) - 1; + + nzlo_in = static_cast (comm->mysplit[2][0] * nz_pppm/slab_volfactor); + nzhi_in = static_cast (comm->mysplit[2][1] * nz_pppm/slab_volfactor) - 1; + } // nlower,nupper = stencil size for mapping particles to PPPM grid @@ -1446,22 +1436,26 @@ void PPPM::set_grid_local() // -z proc, but not vice versa // this is accomplished by nzhi_in = nzhi_out on +z end (no ghost cells) // also insure no other procs use ghost cells beyond +z limit - + // differnet logic for non-tiled vs tiled decomposition + if (slabflag == 1) { - if (comm->myloc[2] == comm->procgrid[2]-1) - nzhi_in = nzhi_out = nz_pppm - 1; + if (comm->layout != Comm::LAYOUT_TILED) { + if (comm->myloc[2] == comm->procgrid[2]-1) nzhi_in = nzhi_out = nz_pppm - 1; + } else { + if (comm->mysplit[2][1] == 1.0) nzhi_in = nzhi_out = nz_pppm - 1; + } nzhi_out = MIN(nzhi_out,nz_pppm-1); } - // decomposition of FFT mesh + // x-pencil decomposition of FFT mesh // global indices range from 0 to N-1 - // proc owns entire x-dimension, clumps of columns in y,z dimensions + // each proc owns entire x-dimension, clumps of columns in y,z dimensions // npey_fft,npez_fft = # of procs in y,z dims // if 
nprocs is small enough, proc can own 1 or more entire xy planes, // else proc owns 2d sub-blocks of yz plane // me_y,me_z = which proc (0-npe_fft-1) I am in y,z dimensions // nlo_fft,nhi_fft = lower/upper limit of the section - // of the global FFT mesh that I own + // of the global FFT mesh that I own in x-pencil decomposition int npey_fft,npez_fft; if (nz_pppm >= nprocs) { @@ -1479,13 +1473,13 @@ void PPPM::set_grid_local() nzlo_fft = me_z*nz_pppm/npez_fft; nzhi_fft = (me_z+1)*nz_pppm/npez_fft - 1; - // PPPM grid pts owned by this proc, including ghosts + // ngrid = count of PPPM grid pts owned by this proc, including ghosts ngrid = (nxhi_out-nxlo_out+1) * (nyhi_out-nylo_out+1) * (nzhi_out-nzlo_out+1); - // FFT grids owned by this proc, without ghosts - // nfft = FFT points in FFT decomposition on this proc + // count of FFT grids pts owned by this proc, without ghosts + // nfft = FFT points in x-pencil FFT decomposition on this proc // nfft_brick = FFT points in 3d brick-decomposition on this proc // nfft_both = greater of 2 values @@ -2637,8 +2631,10 @@ void PPPM::fieldforce_peratom() pack own values to buf to send to another proc ------------------------------------------------------------------------- */ -void PPPM::pack_forward(int flag, FFT_SCALAR *buf, int nlist, int *list) +void PPPM::pack_forward_grid(int flag, void *vbuf, int nlist, int *list) { + FFT_SCALAR *buf = (FFT_SCALAR *) vbuf; + int n = 0; if (flag == FORWARD_IK) { @@ -2695,8 +2691,10 @@ void PPPM::pack_forward(int flag, FFT_SCALAR *buf, int nlist, int *list) unpack another proc's own values from buf and set own ghost values ------------------------------------------------------------------------- */ -void PPPM::unpack_forward(int flag, FFT_SCALAR *buf, int nlist, int *list) +void PPPM::unpack_forward_grid(int flag, void *vbuf, int nlist, int *list) { + FFT_SCALAR *buf = (FFT_SCALAR *) vbuf; + int n = 0; if (flag == FORWARD_IK) { @@ -2753,8 +2751,10 @@ void PPPM::unpack_forward(int flag, FFT_SCALAR *buf, int nlist, int *list) pack ghost values into buf to send to another proc ------------------------------------------------------------------------- */ -void PPPM::pack_reverse(int flag, FFT_SCALAR *buf, int nlist, int *list) +void PPPM::pack_reverse_grid(int flag, void *vbuf, int nlist, int *list) { + FFT_SCALAR *buf = (FFT_SCALAR *) vbuf; + if (flag == REVERSE_RHO) { FFT_SCALAR *src = &density_brick[nzlo_out][nylo_out][nxlo_out]; for (int i = 0; i < nlist; i++) @@ -2766,8 +2766,10 @@ void PPPM::pack_reverse(int flag, FFT_SCALAR *buf, int nlist, int *list) unpack another proc's ghost values from buf and add to own values ------------------------------------------------------------------------- */ -void PPPM::unpack_reverse(int flag, FFT_SCALAR *buf, int nlist, int *list) +void PPPM::unpack_reverse_grid(int flag, void *vbuf, int nlist, int *list) { + FFT_SCALAR *buf = (FFT_SCALAR *) vbuf; + if (flag == REVERSE_RHO) { FFT_SCALAR *dest = &density_brick[nzlo_out][nylo_out][nxlo_out]; for (int i = 0; i < nlist; i++) @@ -3056,6 +3058,7 @@ int PPPM::timing_3d(int n, double &time3d) double PPPM::memory_usage() { double bytes = nmax*3 * sizeof(double); + int nbrick = (nxhi_out-nxlo_out+1) * (nyhi_out-nylo_out+1) * (nzhi_out-nzlo_out+1); if (differentiation_flag == 1) { @@ -3063,6 +3066,7 @@ double PPPM::memory_usage() } else { bytes += 4 * nbrick * sizeof(FFT_SCALAR); } + if (triclinic) bytes += 3 * nfft_both * sizeof(double); bytes += 6 * nfft_both * sizeof(double); bytes += nfft_both * sizeof(double); @@ -3076,8 +3080,10 @@ 
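/* ----------------------------------------------------------------------
   worked example of the owned-grid assignment in set_grid_local() above,
   with hypothetical numbers: for nx_pppm = 32 and a tiled sub-domain whose
   fractional x bounds are comm->mysplit[0][0] = 0.25 and
   comm->mysplit[0][1] = 0.5, this proc owns grid columns
   nxlo_in = int(0.25*32) = 8 through nxhi_in = int(0.5*32) - 1 = 15, so
   adjacent sub-domains tile the global grid with no gaps or overlaps
------------------------------------------------------------------------- */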
double PPPM::memory_usage() bytes += 2 * nfft_both * sizeof(FFT_SCALAR);; } - if (cg) bytes += cg->memory_usage(); + // two GridComm bufs + bytes += (ngc_buf1 + ngc_buf2) * npergrid * sizeof(FFT_SCALAR); + return bytes; } @@ -3134,7 +3140,8 @@ void PPPM::compute_group_group(int groupbit_A, int groupbit_B, int AA_flag) density_brick = density_A_brick; density_fft = density_A_fft; - cg->reverse_comm(this,REVERSE_RHO); + gc->reverse_comm_kspace(this,1,sizeof(FFT_SCALAR),REVERSE_RHO, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); brick2fft(); // group B @@ -3142,7 +3149,8 @@ void PPPM::compute_group_group(int groupbit_A, int groupbit_B, int AA_flag) density_brick = density_B_brick; density_fft = density_B_fft; - cg->reverse_comm(this,REVERSE_RHO); + gc->reverse_comm_kspace(this,1,sizeof(FFT_SCALAR),REVERSE_RHO, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); brick2fft(); // switch back pointers diff --git a/src/KSPACE/pppm.h b/src/KSPACE/pppm.h index f73fb49ad3..b7416a0a9c 100644 --- a/src/KSPACE/pppm.h +++ b/src/KSPACE/pppm.h @@ -96,17 +96,21 @@ class PPPM : public KSpace { double sf_coeff[6]; // coefficients for calculating ad self-forces double **acons; + // FFTs and grid communication + + class FFT3d *fft1,*fft2; + class Remap *remap; + class GridComm *gc; + + FFT_SCALAR *gc_buf1,*gc_buf2; + int ngc_buf1,ngc_buf2,npergrid; + // group-group interactions int group_allocate_flag; FFT_SCALAR ***density_A_brick,***density_B_brick; FFT_SCALAR *density_A_fft,*density_B_fft; - class FFT3d *fft1,*fft2; - class Remap *remap; - class GridComm *cg; - class GridComm *cg_peratom; - int **part2grid; // storage for particle -> grid mapping int nmax; @@ -160,10 +164,10 @@ class PPPM : public KSpace { // grid communication - virtual void pack_forward(int, FFT_SCALAR *, int, int *); - virtual void unpack_forward(int, FFT_SCALAR *, int, int *); - virtual void pack_reverse(int, FFT_SCALAR *, int, int *); - virtual void unpack_reverse(int, FFT_SCALAR *, int, int *); + virtual void pack_forward_grid(int, void *, int, int *); + virtual void unpack_forward_grid(int, void *, int, int *); + virtual void pack_reverse_grid(int, void *, int, int *); + virtual void unpack_reverse_grid(int, void *, int, int *); // triclinic diff --git a/src/KSPACE/pppm_cg.cpp b/src/KSPACE/pppm_cg.cpp index 14631c480e..081113ea0d 100644 --- a/src/KSPACE/pppm_cg.cpp +++ b/src/KSPACE/pppm_cg.cpp @@ -90,12 +90,8 @@ void PPPMCG::compute(int eflag, int vflag) ev_init(eflag,vflag); - if (evflag_atom && !peratom_allocate_flag) { - allocate_peratom(); - cg_peratom->ghost_notify(); - cg_peratom->setup(); - } - + if (evflag_atom && !peratom_allocate_flag) allocate_peratom(); + // if atom count has changed, update qsum and qsqsum if (atom->natoms != natoms_original) { @@ -162,6 +158,7 @@ void PPPMCG::compute(int eflag, int vflag) } // only need to rebuild this list after a neighbor list update + if (neighbor->ago == 0) { num_charged = 0; for (int i = 0; i < atom->nlocal; ++i) { @@ -182,7 +179,8 @@ void PPPMCG::compute(int eflag, int vflag) // to fully sum contribution in their 3d bricks // remap from 3d decomposition to FFT decomposition - cg->reverse_comm(this,REVERSE_RHO); + gc->reverse_comm_kspace(this,1,sizeof(FFT_SCALAR),REVERSE_RHO, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); brick2fft(); // compute potential gradient on my FFT grid and @@ -195,16 +193,22 @@ void PPPMCG::compute(int eflag, int vflag) // all procs communicate E-field values // to fill ghost cells surrounding their 3d bricks - if (differentiation_flag == 1) cg->forward_comm(this,FORWARD_AD); - else 
cg->forward_comm(this,FORWARD_IK); + if (differentiation_flag == 1) + gc->forward_comm_kspace(this,1,sizeof(FFT_SCALAR),FORWARD_AD, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); + else + gc->forward_comm_kspace(this,3,sizeof(FFT_SCALAR),FORWARD_IK, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); // extra per-atom energy/virial communication if (evflag_atom) { if (differentiation_flag == 1 && vflag_atom) - cg_peratom->forward_comm(this,FORWARD_AD_PERATOM); + gc->forward_comm_kspace(this,6,sizeof(FFT_SCALAR),FORWARD_AD_PERATOM, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); else if (differentiation_flag == 0) - cg_peratom->forward_comm(this,FORWARD_IK_PERATOM); + gc->forward_comm_kspace(this,7,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); } // calculate the force on my particles diff --git a/src/KSPACE/pppm_dipole.cpp b/src/KSPACE/pppm_dipole.cpp index 314e8704b3..312da6c304 100644 --- a/src/KSPACE/pppm_dipole.cpp +++ b/src/KSPACE/pppm_dipole.cpp @@ -79,8 +79,7 @@ PPPMDipole::PPPMDipole(LAMMPS *lmp) : PPPM(lmp), dipoleflag = 1; group_group_enable = 0; - cg_dipole = NULL; - cg_peratom_dipole = NULL; + gc_dipole = NULL; } /* ---------------------------------------------------------------------- @@ -93,10 +92,6 @@ PPPMDipole::~PPPMDipole() deallocate(); if (peratom_allocate_flag) deallocate_peratom(); - fft1 = NULL; - fft2 = NULL; - remap = NULL; - cg_dipole = NULL; } /* ---------------------------------------------------------------------- @@ -195,9 +190,7 @@ void PPPMDipole::init() // or overlap is allowed, then done // else reduce order and try again - int (*procneigh)[2] = comm->procneigh; - - GridComm *cgtmp = NULL; + GridComm *gctmp = NULL; int iteration = 0; while (order >= minorder) { @@ -210,24 +203,24 @@ void PPPMDipole::init() set_grid_local(); if (overlap_allowed) break; - cgtmp = new GridComm(lmp,world,1,1, + gctmp = new GridComm(lmp,world,nx_pppm,ny_pppm,nz_pppm, nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, - nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out, - procneigh[0][0],procneigh[0][1],procneigh[1][0], - procneigh[1][1],procneigh[2][0],procneigh[2][1]); - cgtmp->ghost_notify(); - if (!cgtmp->ghost_overlap()) break; - delete cgtmp; + nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out); + + int tmp1,tmp2; + gctmp->setup(tmp1,tmp2); + if (gctmp->ghost_adjacent()) break; + delete gctmp; order--; iteration++; } if (order < minorder) error->all(FLERR,"PPPMDipole order < minimum allowed order"); - if (!overlap_allowed && cgtmp->ghost_overlap()) + if (!overlap_allowed && !gctmp->ghost_adjacent()) error->all(FLERR,"PPPMDipole grid stencil extends " "beyond nearest neighbor processor"); - if (cgtmp) delete cgtmp; + if (gctmp) delete gctmp; // adjust g_ewald @@ -261,8 +254,6 @@ void PPPMDipole::init() // don't invoke allocate peratom(), will be allocated when needed allocate(); - cg_dipole->ghost_notify(); - cg_dipole->setup(); // pre-compute Green's function denomiator expansion // pre-compute 1d charge distribution coefficients @@ -385,11 +376,9 @@ void PPPMDipole::setup_grid() allocate(); - cg_dipole->ghost_notify(); - if (overlap_allowed == 0 && cg_dipole->ghost_overlap()) + if (!overlap_allowed && !gc_dipole->ghost_adjacent()) error->all(FLERR,"PPPMDipole grid stencil extends " "beyond nearest neighbor processor"); - cg_dipole->setup(); // pre-compute Green's function denomiator expansion // pre-compute 1d charge distribution coefficients @@ -421,11 +410,7 @@ void PPPMDipole::compute(int eflag, int vflag) error->all(FLERR,"Cannot (yet) compute per-atom virial " "with kspace 
style pppm/dipole"); - if (evflag_atom && !peratom_allocate_flag) { - allocate_peratom(); - cg_peratom_dipole->ghost_notify(); - cg_peratom_dipole->setup(); - } + if (evflag_atom && !peratom_allocate_flag) allocate_peratom(); // if atom count has changed, update qsum and qsqsum @@ -459,8 +444,9 @@ void PPPMDipole::compute(int eflag, int vflag) // all procs communicate density values from their ghost cells // to fully sum contribution in their 3d bricks // remap from 3d decomposition to FFT decomposition - - cg_dipole->reverse_comm(this,REVERSE_MU); + + gc_dipole->reverse_comm_kspace(this,3,sizeof(FFT_SCALAR),REVERSE_MU, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); brick2fft_dipole(); // compute potential gradient on my FFT grid and @@ -473,13 +459,14 @@ void PPPMDipole::compute(int eflag, int vflag) // all procs communicate E-field values // to fill ghost cells surrounding their 3d bricks - cg_dipole->forward_comm(this,FORWARD_MU); + gc_dipole->forward_comm_kspace(this,9,sizeof(FFT_SCALAR),FORWARD_MU, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); // extra per-atom energy/virial communication - if (evflag_atom) { - cg_peratom_dipole->forward_comm(this,FORWARD_MU_PERATOM); - } + if (evflag_atom) + gc->forward_comm_kspace(this,18,sizeof(FFT_SCALAR),FORWARD_MU_PERATOM, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); // calculate the force on my particles @@ -522,7 +509,8 @@ void PPPMDipole::compute(int eflag, int vflag) if (eflag_atom) { for (i = 0; i < nlocal; i++) { eatom[i] *= 0.5; - eatom[i] -= (mu[i][0]*mu[i][0] + mu[i][1]*mu[i][1] + mu[i][2]*mu[i][2])*2.0*g3/3.0/MY_PIS; + eatom[i] -= (mu[i][0]*mu[i][0] + mu[i][1]*mu[i][1] + + mu[i][2]*mu[i][2])*2.0*g3/3.0/MY_PIS; eatom[i] *= qscale; } } @@ -619,14 +607,18 @@ void PPPMDipole::allocate() 1,0,0,FFT_PRECISION,collective_flag); // create ghost grid object for rho and electric field communication + // also create 2 bufs for ghost grid cell comm, passed to GridComm methods - int (*procneigh)[2] = comm->procneigh; - - cg_dipole = new GridComm(lmp,world,9,3, + gc_dipole = new GridComm(lmp,world,nx_pppm,ny_pppm,nz_pppm, nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, - nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out, - procneigh[0][0],procneigh[0][1],procneigh[1][0], - procneigh[1][1],procneigh[2][0],procneigh[2][1]); + nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out); + + gc->setup(ngc_buf1,ngc_buf2); + + npergrid = 9; + + memory->create(gc_buf1,npergrid*ngc_buf1,"pppm:gc_buf1"); + memory->create(gc_buf2,npergrid*ngc_buf2,"pppm:gc_buf2"); } /* ---------------------------------------------------------------------- @@ -674,7 +666,9 @@ void PPPMDipole::deallocate() delete fft1; delete fft2; delete remap; - delete cg_dipole; + delete gc_dipole; + memory->destroy(gc_buf1); + memory->destroy(gc_buf2); } /* ---------------------------------------------------------------------- @@ -724,16 +718,15 @@ void PPPMDipole::allocate_peratom() memory->create3d_offset(v5z_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out, nxlo_out,nxhi_out,"pppm_dipole:v5z_brick_dipole"); - // create ghost grid object for rho and electric field communication + // use same GC ghost grid object for peratom grid communication + // but need to reallocate a larger gc_buf1 and gc_buf2 - int (*procneigh)[2] = comm->procneigh; + npergrid = 18; - cg_peratom_dipole = - new GridComm(lmp,world,18,1, - nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, - nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out, - procneigh[0][0],procneigh[0][1],procneigh[1][0], - procneigh[1][1],procneigh[2][0],procneigh[2][1]); + 
memory->destroy(gc_buf1); + memory->destroy(gc_buf2); + memory->create(gc_buf1,npergrid*ngc_buf1,"pppm:gc_buf1"); + memory->create(gc_buf2,npergrid*ngc_buf2,"pppm:gc_buf2"); } /* ---------------------------------------------------------------------- @@ -764,8 +757,6 @@ void PPPMDipole::deallocate_peratom() memory->destroy3d_offset(v3z_brick_dipole,nzlo_out,nylo_out,nxlo_out); memory->destroy3d_offset(v4z_brick_dipole,nzlo_out,nylo_out,nxlo_out); memory->destroy3d_offset(v5z_brick_dipole,nzlo_out,nylo_out,nxlo_out); - - delete cg_peratom_dipole; } /* ---------------------------------------------------------------------- @@ -2171,8 +2162,10 @@ void PPPMDipole::fieldforce_peratom_dipole() pack own values to buf to send to another proc ------------------------------------------------------------------------- */ -void PPPMDipole::pack_forward(int flag, FFT_SCALAR *buf, int nlist, int *list) +void PPPMDipole::pack_forward_grid(int flag, void *vbuf, int nlist, int *list) { + FFT_SCALAR *buf = (FFT_SCALAR *) vbuf; + int n = 0; if (flag == FORWARD_MU) { @@ -2242,8 +2235,10 @@ void PPPMDipole::pack_forward(int flag, FFT_SCALAR *buf, int nlist, int *list) unpack another proc's own values from buf and set own ghost values ------------------------------------------------------------------------- */ -void PPPMDipole::unpack_forward(int flag, FFT_SCALAR *buf, int nlist, int *list) +void PPPMDipole::unpack_forward_grid(int flag, void *vbuf, int nlist, int *list) { + FFT_SCALAR *buf = (FFT_SCALAR *) vbuf; + int n = 0; if (flag == FORWARD_MU) { @@ -2313,8 +2308,10 @@ void PPPMDipole::unpack_forward(int flag, FFT_SCALAR *buf, int nlist, int *list) pack ghost values into buf to send to another proc ------------------------------------------------------------------------- */ -void PPPMDipole::pack_reverse(int flag, FFT_SCALAR *buf, int nlist, int *list) +void PPPMDipole::pack_reverse_grid(int flag, void *vbuf, int nlist, int *list) { + FFT_SCALAR *buf = (FFT_SCALAR *) vbuf; + int n = 0; if (flag == REVERSE_MU) { FFT_SCALAR *src_dipole0 = &densityx_brick_dipole[nzlo_out][nylo_out][nxlo_out]; @@ -2332,8 +2329,10 @@ void PPPMDipole::pack_reverse(int flag, FFT_SCALAR *buf, int nlist, int *list) unpack another proc's ghost values from buf and add to own values ------------------------------------------------------------------------- */ -void PPPMDipole::unpack_reverse(int flag, FFT_SCALAR *buf, int nlist, int *list) +void PPPMDipole::unpack_reverse_grid(int flag, void *vbuf, int nlist, int *list) { + FFT_SCALAR *buf = (FFT_SCALAR *) vbuf; + int n = 0; if (flag == REVERSE_MU) { FFT_SCALAR *dest_dipole0 = &densityx_brick_dipole[nzlo_out][nylo_out][nxlo_out]; @@ -2484,6 +2483,7 @@ int PPPMDipole::timing_3d(int n, double &time3d) double PPPMDipole::memory_usage() { double bytes = nmax*3 * sizeof(double); + int nbrick = (nxhi_out-nxlo_out+1) * (nyhi_out-nylo_out+1) * (nzhi_out-nzlo_out+1); bytes += 6 * nfft_both * sizeof(double); // vg @@ -2495,8 +2495,9 @@ double PPPMDipole::memory_usage() if (peratom_allocate_flag) bytes += 21 * nbrick * sizeof(FFT_SCALAR); - if (cg_dipole) bytes += cg_dipole->memory_usage(); - if (cg_peratom_dipole) bytes += cg_peratom_dipole->memory_usage(); + // two GridComm bufs + + bytes += (ngc_buf1 + ngc_buf2) * npergrid * sizeof(FFT_SCALAR); return bytes; } diff --git a/src/KSPACE/pppm_dipole.h b/src/KSPACE/pppm_dipole.h index a767f8b4c2..f7a8b63930 100644 --- a/src/KSPACE/pppm_dipole.h +++ b/src/KSPACE/pppm_dipole.h @@ -50,10 +50,10 @@ class PPPMDipole : public PPPM { // grid communication - 
void pack_forward(int, FFT_SCALAR *, int, int *); - void unpack_forward(int, FFT_SCALAR *, int, int *); - void pack_reverse(int, FFT_SCALAR *, int, int *); - void unpack_reverse(int, FFT_SCALAR *, int, int *); + void pack_forward_grid(int, void *, int, int *); + void unpack_forward_grid(int, void *, int, int *); + void pack_reverse_grid(int, void *, int, int *); + void unpack_reverse_grid(int, void *, int, int *); // dipole @@ -69,10 +69,12 @@ class PPPMDipole : public PPPM { FFT_SCALAR ***v3z_brick_dipole,***v4z_brick_dipole,***v5z_brick_dipole; FFT_SCALAR *work3,*work4; FFT_SCALAR *densityx_fft_dipole,*densityy_fft_dipole,*densityz_fft_dipole; - class GridComm *cg_dipole; - class GridComm *cg_peratom_dipole; + + class GridComm *gc_dipole; + int only_dipole_flag; double musum,musqsum,mu2; + double find_gewald_dipole(double, double, bigint, double, double); double newton_raphson_f_dipole(double, double, bigint, double, double); double derivf_dipole(double, double, bigint, double, double); @@ -87,7 +89,6 @@ class PPPMDipole : public PPPM { void fieldforce_peratom_dipole(); double final_accuracy_dipole(); void musum_musq(); - }; } diff --git a/src/KSPACE/pppm_dipole_spin.cpp b/src/KSPACE/pppm_dipole_spin.cpp index a4373eb276..c8cebdfeef 100644 --- a/src/KSPACE/pppm_dipole_spin.cpp +++ b/src/KSPACE/pppm_dipole_spin.cpp @@ -81,7 +81,7 @@ PPPMDipoleSpin::~PPPMDipoleSpin() fft1 = NULL; fft2 = NULL; remap = NULL; - cg_dipole = NULL; + gc_dipole = NULL; } /* ---------------------------------------------------------------------- @@ -175,9 +175,7 @@ void PPPMDipoleSpin::init() // or overlap is allowed, then done // else reduce order and try again - int (*procneigh)[2] = comm->procneigh; - - GridComm *cgtmp = NULL; + GridComm *gctmp = NULL; int iteration = 0; while (order >= minorder) { @@ -190,24 +188,24 @@ void PPPMDipoleSpin::init() set_grid_local(); if (overlap_allowed) break; - cgtmp = new GridComm(lmp,world,1,1, + gctmp = new GridComm(lmp,world,nx_pppm,ny_pppm,nz_pppm, nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, - nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out, - procneigh[0][0],procneigh[0][1],procneigh[1][0], - procneigh[1][1],procneigh[2][0],procneigh[2][1]); - cgtmp->ghost_notify(); - if (!cgtmp->ghost_overlap()) break; - delete cgtmp; + nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out); + + int tmp1,tmp2; + gctmp->setup(tmp1,tmp2); + if (gctmp->ghost_adjacent()) break; + delete gctmp; order--; iteration++; } if (order < minorder) error->all(FLERR,"PPPMDipoleSpin order < minimum allowed order"); - if (!overlap_allowed && cgtmp->ghost_overlap()) + if (!overlap_allowed && !gctmp->ghost_adjacent()) error->all(FLERR,"PPPMDipoleSpin grid stencil extends " "beyond nearest neighbor processor"); - if (cgtmp) delete cgtmp; + if (gctmp) delete gctmp; // adjust g_ewald @@ -241,8 +239,6 @@ void PPPMDipoleSpin::init() // don't invoke allocate peratom(), will be allocated when needed allocate(); - cg_dipole->ghost_notify(); - cg_dipole->setup(); // pre-compute Green's function denominator expansion // pre-compute 1d charge distribution coefficients @@ -270,11 +266,7 @@ void PPPMDipoleSpin::compute(int eflag, int vflag) error->all(FLERR,"Cannot (yet) compute per-atom virial " "with kspace style pppm/dipole/spin"); - if (evflag_atom && !peratom_allocate_flag) { - allocate_peratom(); - cg_peratom_dipole->ghost_notify(); - cg_peratom_dipole->setup(); - } + if (evflag_atom && !peratom_allocate_flag) allocate_peratom(); // if atom count has changed, update qsum and qsqsum @@ -309,7 +301,8 @@ 
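// ----------------------------------------------------------------------
// [editor's note: illustrative sketch, not part of this patch]
// The change repeated throughout these files is the switch from the old
// per-purpose GridComm objects (cg, cg_peratom, cg_dipole, cg_6, ...), which
// each carried a fixed value count, to a single generic GridComm per grid.
// The caller now supplies the number of values per grid point, the datum
// size, and two caller-owned buffers. A condensed view of that calling
// convention, using only the interface this patch itself introduces
// (constructor taking the global grid size plus owned/ghost bounds, setup()
// returning the two buffer lengths in grid points, ghost_adjacent()
// replacing the old ghost_notify()/ghost_overlap() pair):
//
//   old:  cg->reverse_comm(this,REVERSE_RHO);
//   new:  gc->reverse_comm_kspace(this,1,sizeof(FFT_SCALAR),REVERSE_RHO,
//                                 gc_buf1,gc_buf2,MPI_FFT_SCALAR);

GridComm *gc = new GridComm(lmp,world,nx_pppm,ny_pppm,nz_pppm,
                            nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
                            nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out);

int ngc_buf1,ngc_buf2;
gc->setup(ngc_buf1,ngc_buf2);                  // buffer lengths in grid points

int npergrid = (differentiation_flag) ? 1 : 3; // FFT_SCALARs per grid point
FFT_SCALAR *gc_buf1,*gc_buf2;
memory->create(gc_buf1,npergrid*ngc_buf1,"pppm:gc_buf1");
memory->create(gc_buf2,npergrid*ngc_buf2,"pppm:gc_buf2");

// ghost -> owned accumulation of charge density (1 value per grid point)
gc->reverse_comm_kspace(this,1,sizeof(FFT_SCALAR),REVERSE_RHO,
                        gc_buf1,gc_buf2,MPI_FFT_SCALAR);

// owned -> ghost fill of the ik electric field (3 values per grid point)
gc->forward_comm_kspace(this,3,sizeof(FFT_SCALAR),FORWARD_IK,
                        gc_buf1,gc_buf2,MPI_FFT_SCALAR);
// [end of editor's sketch]
// ----------------------------------------------------------------------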
void PPPMDipoleSpin::compute(int eflag, int vflag) // to fully sum contribution in their 3d bricks // remap from 3d decomposition to FFT decomposition - cg_dipole->reverse_comm(this,REVERSE_MU); + gc_dipole->reverse_comm_kspace(this,3,sizeof(FFT_SCALAR),REVERSE_MU, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); brick2fft_dipole(); // compute potential gradient on my FFT grid and @@ -322,13 +315,14 @@ void PPPMDipoleSpin::compute(int eflag, int vflag) // all procs communicate E-field values // to fill ghost cells surrounding their 3d bricks - cg_dipole->forward_comm(this,FORWARD_MU); + gc_dipole->forward_comm_kspace(this,9,sizeof(FFT_SCALAR),FORWARD_MU, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); // extra per-atom energy/virial communication - if (evflag_atom) { - cg_peratom_dipole->forward_comm(this,FORWARD_MU_PERATOM); - } + if (evflag_atom) + gc->forward_comm_kspace(this,18,sizeof(FFT_SCALAR),FORWARD_MU_PERATOM, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); // calculate the force on my particles diff --git a/src/KSPACE/pppm_dipole_spin.h b/src/KSPACE/pppm_dipole_spin.h index fe88fc75ce..e50b342b19 100644 --- a/src/KSPACE/pppm_dipole_spin.h +++ b/src/KSPACE/pppm_dipole_spin.h @@ -46,7 +46,6 @@ class PPPMDipoleSpin : public PPPMDipole { void fieldforce_ik_spin(); void fieldforce_peratom_spin(); void spsum_spsq(); - }; } diff --git a/src/KSPACE/pppm_disp.cpp b/src/KSPACE/pppm_disp.cpp index b5fd14796c..67bfc170ee 100644 --- a/src/KSPACE/pppm_disp.cpp +++ b/src/KSPACE/pppm_disp.cpp @@ -51,8 +51,8 @@ enum{REVERSE_RHO, REVERSE_RHO_G, REVERSE_RHO_A, REVERSE_RHO_NONE}; enum{FORWARD_IK, FORWARD_AD, FORWARD_IK_PERATOM, FORWARD_AD_PERATOM, FORWARD_IK_G, FORWARD_AD_G, FORWARD_IK_PERATOM_G, FORWARD_AD_PERATOM_G, FORWARD_IK_A, FORWARD_AD_A, FORWARD_IK_PERATOM_A, FORWARD_AD_PERATOM_A, - FORWARD_IK_NONE, FORWARD_AD_NONE, FORWARD_IK_PERATOM_NONE, FORWARD_AD_PERATOM_NONE}; - + FORWARD_IK_NONE, FORWARD_AD_NONE, FORWARD_IK_PERATOM_NONE, + FORWARD_AD_PERATOM_NONE}; #ifdef FFT_SINGLE #define ZEROF 0.0f @@ -104,8 +104,8 @@ PPPMDisp::PPPMDisp(LAMMPS *lmp) : KSpace(lmp), sf_precoeff6_6(NULL), rho1d(NULL), rho_coeff(NULL), drho1d(NULL), drho_coeff(NULL), rho1d_6(NULL), rho_coeff_6(NULL), drho1d_6(NULL), drho_coeff_6(NULL), work1(NULL), work2(NULL), work1_6(NULL), work2_6(NULL), fft1(NULL), fft2(NULL), fft1_6(NULL), - fft2_6(NULL), remap(NULL), remap_6(NULL), cg(NULL), cg_peratom(NULL), cg_6(NULL), - cg_peratom_6(NULL), part2grid(NULL), part2grid_6(NULL), boxlo(NULL) + fft2_6(NULL), remap(NULL), remap_6(NULL), gc(NULL), gc6(NULL), + part2grid(NULL), part2grid_6(NULL), boxlo(NULL) { triclinic_support = 0; pppmflag = dispersionflag = 1; @@ -210,17 +210,15 @@ PPPMDisp::PPPMDisp(LAMMPS *lmp) : KSpace(lmp), fft1_6 = fft2_6 = NULL; remap = NULL; remap_6 = NULL; + gc = gc6 = NULL; + gc_buf1 = gc_buf2 = NULL; + gc6_buf1 = gc6_buf2 = NULL; nmax = 0; part2grid = NULL; part2grid_6 = NULL; - cg = NULL; - cg_peratom = NULL; - cg_6 = NULL; - cg_peratom_6 = NULL; - - memset(function, 0, EWALD_FUNCS*sizeof(int)); + memset(function,0,EWALD_FUNCS*sizeof(int)); } /* ---------------------------------------------------------------------- */ @@ -410,11 +408,10 @@ void PPPMDisp::init() if (accuracy_absolute >= 0.0) accuracy = accuracy_absolute; else accuracy = accuracy_relative * two_charge_force; - int (*procneigh)[2] = comm->procneigh; - int iteration = 0; if (function[0]) { - GridComm *cgtmp = NULL; + + GridComm *gctmp = NULL; while (order >= minorder) { if (iteration && me == 0) @@ -442,23 +439,24 @@ void PPPMDisp::init() if (overlap_allowed) break; - cgtmp = new 
GridComm(lmp, world,1,1, + gctmp = new GridComm(lmp,world,nx_pppm,ny_pppm,nz_pppm, nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, - nxlo_out,nxhi_out,nylo_out,nyhi_out, - nzlo_out,nzhi_out, - procneigh[0][0],procneigh[0][1],procneigh[1][0], - procneigh[1][1],procneigh[2][0],procneigh[2][1]); - cgtmp->ghost_notify(); - if (!cgtmp->ghost_overlap()) break; - delete cgtmp; + nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out); + + int tmp1,tmp2; + gctmp->setup(tmp1,tmp2); + if (gctmp->ghost_adjacent()) break; + delete gctmp; order--; } if (order < minorder) - error->all(FLERR, - "Coulomb PPPMDisp order has been reduced below minorder"); - if (cgtmp) delete cgtmp; + error->all(FLERR,"Coulomb PPPMDisp order has been reduced below minorder"); + if (!overlap_allowed && !gctmp->ghost_adjacent()) + error->all(FLERR,"PPPMDisp grid stencil extends " + "beyond nearest neighbor processor"); + if (gctmp) delete gctmp; // adjust g_ewald @@ -493,7 +491,8 @@ void PPPMDisp::init() iteration = 0; if (function[1] + function[2] + function[3]) { - GridComm *cgtmp = NULL; + + GridComm *gctmp = NULL; while (order_6 >= minorder) { if (iteration && me == 0) @@ -519,23 +518,27 @@ void PPPMDisp::init() if (overlap_allowed) break; - cgtmp = new GridComm(lmp,world,1,1, + gctmp = new GridComm(lmp,world,nx_pppm_6,ny_pppm_6,nz_pppm_6, nxlo_in_6,nxhi_in_6,nylo_in_6,nyhi_in_6, nzlo_in_6,nzhi_in_6, nxlo_out_6,nxhi_out_6,nylo_out_6,nyhi_out_6, - nzlo_out_6,nzhi_out_6, - procneigh[0][0],procneigh[0][1],procneigh[1][0], - procneigh[1][1],procneigh[2][0],procneigh[2][1]); - cgtmp->ghost_notify(); - if (!cgtmp->ghost_overlap()) break; - delete cgtmp; + nzlo_out_6,nzhi_out_6); + + int tmp1,tmp2; + gctmp->setup(tmp1,tmp2); + if (gctmp->ghost_adjacent()) break; + delete gctmp; + order_6--; } if (order_6 < minorder) error->all(FLERR,"Dispersion PPPMDisp order has been " "reduced below minorder"); - if (cgtmp) delete cgtmp; + if (!overlap_allowed && !gctmp->ghost_adjacent()) + error->all(FLERR,"Dispersion PPPMDisp grid stencil extends " + "beyond nearest neighbor processor"); + if (gctmp) delete gctmp; // adjust g_ewald_6 @@ -547,7 +550,6 @@ void PPPMDisp::init() double acc, acc_real, acc_kspace; final_accuracy_6(acc, acc_real, acc_kspace); - // print stats int ngrid_max,nfft_both_max; @@ -581,8 +583,6 @@ void PPPMDisp::init() if (function[0]) { compute_gf_denom(gf_b, order); compute_rho_coeff(rho_coeff, drho_coeff, order); - cg->ghost_notify(); - cg->setup(); if (differentiation_flag == 1) compute_sf_precoeff(nx_pppm, ny_pppm, nz_pppm, order, nxlo_fft, nylo_fft, nzlo_fft, @@ -593,8 +593,6 @@ void PPPMDisp::init() if (function[1] + function[2] + function[3]) { compute_gf_denom(gf_b_6, order_6); compute_rho_coeff(rho_coeff_6, drho_coeff_6, order_6); - cg_6->ghost_notify(); - cg_6->setup(); if (differentiation_flag == 1) compute_sf_precoeff(nx_pppm_6, ny_pppm_6, nz_pppm_6, order_6, nxlo_fft_6, nylo_fft_6, nzlo_fft_6, @@ -602,7 +600,6 @@ void PPPMDisp::init() sf_precoeff1_6, sf_precoeff2_6, sf_precoeff3_6, sf_precoeff4_6, sf_precoeff5_6, sf_precoeff6_6); } - } /* ---------------------------------------------------------------------- @@ -611,7 +608,6 @@ void PPPMDisp::init() void PPPMDisp::setup() { - if (slabflag == 0 && domain->nonperiodic > 0) error->all(FLERR,"Cannot use non-periodic boundaries with PPPMDisp"); if (slabflag == 1) { @@ -642,7 +638,8 @@ void PPPMDisp::setup() double unitkz = (2.0*MY_PI/zprd_slab); //compute the virial coefficients and green functions - if (function[0]){ + + if (function[0]) { delxinv = nx_pppm/xprd; 
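// ----------------------------------------------------------------------
// [editor's note: illustrative sketch, not part of this patch]
// Both order-reduction loops in PPPMDisp::init() above now follow the same
// shape: build a throwaway GridComm for the current stencil order and ask
// ghost_adjacent() whether the ghost region stays within the nearest
// neighbor procs, instead of the old ghost_notify()/ghost_overlap() pair.
// Condensed form, assuming set_grid_global()/set_grid_local() recompute the
// in/out bounds for the current order (the error strings below are generic
// placeholders; each class uses its own wording):

GridComm *gctmp = NULL;
while (order >= minorder) {
  set_grid_global();
  set_grid_local();
  if (overlap_allowed) break;

  gctmp = new GridComm(lmp,world,nx_pppm,ny_pppm,nz_pppm,
                       nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
                       nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out);

  int tmp1,tmp2;
  gctmp->setup(tmp1,tmp2);            // buffer sizes are discarded for the trial object
  if (gctmp->ghost_adjacent()) break; // stencil fits on nearest-neighbor procs
  delete gctmp;
  order--;
}

if (order < minorder) error->all(FLERR,"PPPM order < minimum allowed order");
if (!overlap_allowed && !gctmp->ghost_adjacent())
  error->all(FLERR,"PPPM grid stencil extends beyond nearest neighbor processor");
if (gctmp) delete gctmp;
// [end of editor's sketch]
// ----------------------------------------------------------------------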
delyinv = ny_pppm/yprd; @@ -831,18 +828,14 @@ void PPPMDisp::setup_grid() allocate(); if (function[0]) { - cg->ghost_notify(); - if (overlap_allowed == 0 && cg->ghost_overlap()) - error->all(FLERR,"PPPM grid stencil extends " + if (!overlap_allowed && !gc->ghost_adjacent()) + error->all(FLERR,"PPPMDisp grid stencil extends " "beyond nearest neighbor processor"); - cg->setup(); } if (function[1] + function[2] + function[3]) { - cg_6->ghost_notify(); - if (overlap_allowed == 0 && cg_6->ghost_overlap()) - error->all(FLERR,"PPPM grid stencil extends " + if (!overlap_allowed && !gc6->ghost_adjacent()) + error->all(FLERR,"Dispersion PPPMDisp grid stencil extends " "beyond nearest neighbor processor"); - cg_6->setup(); } // pre-compute Green's function denomiator expansion @@ -880,34 +873,26 @@ void PPPMDisp::setup_grid() void PPPMDisp::compute(int eflag, int vflag) { - int i; - // convert atoms from box to lamda coords + + // set energy/virial flags + // invoke allocate_peratom() if needed for first time ev_init(eflag,vflag); - if (evflag_atom && !peratom_allocate_flag) { - allocate_peratom(); - if (function[0]) { - cg_peratom->ghost_notify(); - cg_peratom->setup(); - } - if (function[1] + function[2] + function[3]) { - cg_peratom_6->ghost_notify(); - cg_peratom_6->setup(); - } - peratom_allocate_flag = 1; - } + if (evflag_atom && !peratom_allocate_flag) allocate_peratom(); + + // convert atoms from box to lamda coords if (triclinic == 0) boxlo = domain->boxlo; else { boxlo = domain->boxlo_lamda; domain->x2lamda(atom->nlocal); } + // extend size of per-atom arrays if necessary if (atom->nmax > nmax) { - if (function[0]) memory->destroy(part2grid); if (function[1] + function[2] + function[3]) memory->destroy(part2grid_6); nmax = atom->nmax; @@ -916,7 +901,6 @@ void PPPMDisp::compute(int eflag, int vflag) memory->create(part2grid_6,nmax,3,"pppm/disp:part2grid_6"); } - energy = 0.0; energy_1 = 0.0; energy_6 = 0.0; @@ -931,20 +915,20 @@ void PPPMDisp::compute(int eflag, int vflag) if (function[0]) { - //perform calculations for coulomb interactions only + // perform calculations for coulomb interactions only particle_map_c(delxinv, delyinv, delzinv, shift, part2grid, nupper, nlower, - nxlo_out, nylo_out, nzlo_out, nxhi_out, nyhi_out, nzhi_out); + nxlo_out, nylo_out, nzlo_out, nxhi_out, nyhi_out, nzhi_out); make_rho_c(); - - cg->reverse_comm(this,REVERSE_RHO); + + gc->reverse_comm_kspace(this,1,sizeof(FFT_SCALAR),REVERSE_RHO, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); brick2fft(nxlo_in, nylo_in, nzlo_in, nxhi_in, nyhi_in, nzhi_in, density_brick, density_fft, work1,remap); if (differentiation_flag == 1) { - poisson_ad(work1, work2, density_fft, fft1, fft2, nx_pppm, ny_pppm, nz_pppm, nfft, nxlo_fft, nylo_fft, nzlo_fft, nxhi_fft, nyhi_fft, nzhi_fft, @@ -953,11 +937,14 @@ void PPPMDisp::compute(int eflag, int vflag) virial_1, vg,vg2, u_brick, v0_brick, v1_brick, v2_brick, v3_brick, v4_brick, v5_brick); - cg->forward_comm(this,FORWARD_AD); + gc->forward_comm_kspace(this,1,sizeof(FFT_SCALAR),FORWARD_AD, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); fieldforce_c_ad(); - if (vflag_atom) cg_peratom->forward_comm(this, FORWARD_AD_PERATOM); + if (vflag_atom) + gc->forward_comm_kspace(this,6,sizeof(FFT_SCALAR),FORWARD_AD_PERATOM, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); } else { poisson_ik(work1, work2, density_fft, fft1, fft2, @@ -969,42 +956,54 @@ void PPPMDisp::compute(int eflag, int vflag) vdx_brick, vdy_brick, vdz_brick, virial_1, vg,vg2, u_brick, v0_brick, v1_brick, v2_brick, v3_brick, v4_brick, v5_brick); - cg->forward_comm(this, 
FORWARD_IK); + gc->forward_comm_kspace(this,3,sizeof(FFT_SCALAR),FORWARD_IK, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); fieldforce_c_ik(); - if (evflag_atom) cg_peratom->forward_comm(this, FORWARD_IK_PERATOM); + if (evflag_atom) + gc->forward_comm_kspace(this,7,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); } + if (evflag_atom) fieldforce_c_peratom(); } if (function[1]) { - //perform calculations for geometric mixing - particle_map(delxinv_6, delyinv_6, delzinv_6, shift_6, part2grid_6, nupper_6, nlower_6, - nxlo_out_6, nylo_out_6, nzlo_out_6, nxhi_out_6, nyhi_out_6, nzhi_out_6); + + // perform calculations for geometric mixing + + particle_map(delxinv_6, delyinv_6, delzinv_6, shift_6, part2grid_6, + nupper_6, nlower_6, + nxlo_out_6, nylo_out_6, nzlo_out_6, + nxhi_out_6, nyhi_out_6, nzhi_out_6); + make_rho_g(); - - cg_6->reverse_comm(this, REVERSE_RHO_G); + gc6->reverse_comm_kspace(this,1,sizeof(FFT_SCALAR),REVERSE_RHO_G, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); brick2fft(nxlo_in_6, nylo_in_6, nzlo_in_6, nxhi_in_6, nyhi_in_6, nzhi_in_6, density_brick_g, density_fft_g, work1_6,remap_6); if (differentiation_flag == 1) { - poisson_ad(work1_6, work2_6, density_fft_g, fft1_6, fft2_6, nx_pppm_6, ny_pppm_6, nz_pppm_6, nfft_6, nxlo_fft_6, nylo_fft_6, nzlo_fft_6, nxhi_fft_6, nyhi_fft_6, nzhi_fft_6, nxlo_in_6, nylo_in_6, nzlo_in_6, nxhi_in_6, nyhi_in_6, nzhi_in_6, energy_6, greensfn_6, virial_6, vg_6, vg2_6, - u_brick_g, v0_brick_g, v1_brick_g, v2_brick_g, v3_brick_g, v4_brick_g, v5_brick_g); + u_brick_g, v0_brick_g, v1_brick_g, v2_brick_g, + v3_brick_g, v4_brick_g, v5_brick_g); - cg_6->forward_comm(this,FORWARD_AD_G); + gc6->forward_comm_kspace(this,1,sizeof(FFT_SCALAR),FORWARD_AD_G, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); fieldforce_g_ad(); - if (vflag_atom) cg_peratom_6->forward_comm(this,FORWARD_AD_PERATOM_G); + if (vflag_atom) + gc6->forward_comm_kspace(this,7,sizeof(FFT_SCALAR),FORWARD_AD_PERATOM_G, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); } else { poisson_ik(work1_6, work2_6, density_fft_g, fft1_6, fft2_6, @@ -1014,55 +1013,73 @@ void PPPMDisp::compute(int eflag, int vflag) energy_6, greensfn_6, fkx_6, fky_6, fkz_6,fkx2_6, fky2_6, fkz2_6, vdx_brick_g, vdy_brick_g, vdz_brick_g, virial_6, vg_6, vg2_6, - u_brick_g, v0_brick_g, v1_brick_g, v2_brick_g, v3_brick_g, v4_brick_g, v5_brick_g); + u_brick_g, v0_brick_g, v1_brick_g, v2_brick_g, + v3_brick_g, v4_brick_g, v5_brick_g); - cg_6->forward_comm(this,FORWARD_IK_G); + gc6->forward_comm_kspace(this,3,sizeof(FFT_SCALAR),FORWARD_IK_G, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); fieldforce_g_ik(); - - if (evflag_atom) cg_peratom_6->forward_comm(this, FORWARD_IK_PERATOM_G); + if (evflag_atom) + gc6->forward_comm_kspace(this,6,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM_G, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); } + if (evflag_atom) fieldforce_g_peratom(); } if (function[2]) { - //perform calculations for arithmetic mixing - particle_map(delxinv_6, delyinv_6, delzinv_6, shift_6, part2grid_6, nupper_6, nlower_6, - nxlo_out_6, nylo_out_6, nzlo_out_6, nxhi_out_6, nyhi_out_6, nzhi_out_6); + + // perform calculations for arithmetic mixing + + particle_map(delxinv_6, delyinv_6, delzinv_6, shift_6, part2grid_6, + nupper_6, nlower_6, + nxlo_out_6, nylo_out_6, nzlo_out_6, + nxhi_out_6, nyhi_out_6, nzhi_out_6); + make_rho_a(); - cg_6->reverse_comm(this, REVERSE_RHO_A); + gc->reverse_comm_kspace(this,7,sizeof(FFT_SCALAR),REVERSE_RHO_A, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); brick2fft_a(); - if ( differentiation_flag == 1) { - + if (differentiation_flag == 1) { poisson_ad(work1_6, 
work2_6, density_fft_a3, fft1_6, fft2_6, nx_pppm_6, ny_pppm_6, nz_pppm_6, nfft_6, nxlo_fft_6, nylo_fft_6, nzlo_fft_6, nxhi_fft_6, nyhi_fft_6, nzhi_fft_6, nxlo_in_6, nylo_in_6, nzlo_in_6, nxhi_in_6, nyhi_in_6, nzhi_in_6, energy_6, greensfn_6, virial_6, vg_6, vg2_6, - u_brick_a3, v0_brick_a3, v1_brick_a3, v2_brick_a3, v3_brick_a3, v4_brick_a3, v5_brick_a3); + u_brick_a3, v0_brick_a3, v1_brick_a3, v2_brick_a3, + v3_brick_a3, v4_brick_a3, v5_brick_a3); poisson_2s_ad(density_fft_a0, density_fft_a6, - u_brick_a0, v0_brick_a0, v1_brick_a0, v2_brick_a0, v3_brick_a0, v4_brick_a0, v5_brick_a0, - u_brick_a6, v0_brick_a6, v1_brick_a6, v2_brick_a6, v3_brick_a6, v4_brick_a6, v5_brick_a6); + u_brick_a0, v0_brick_a0, v1_brick_a0, v2_brick_a0, + v3_brick_a0, v4_brick_a0, v5_brick_a0, + u_brick_a6, v0_brick_a6, v1_brick_a6, v2_brick_a6, + v3_brick_a6, v4_brick_a6, v5_brick_a6); poisson_2s_ad(density_fft_a1, density_fft_a5, - u_brick_a1, v0_brick_a1, v1_brick_a1, v2_brick_a1, v3_brick_a1, v4_brick_a1, v5_brick_a1, - u_brick_a5, v0_brick_a5, v1_brick_a5, v2_brick_a5, v3_brick_a5, v4_brick_a5, v5_brick_a5); + u_brick_a1, v0_brick_a1, v1_brick_a1, v2_brick_a1, + v3_brick_a1, v4_brick_a1, v5_brick_a1, + u_brick_a5, v0_brick_a5, v1_brick_a5, v2_brick_a5, + v3_brick_a5, v4_brick_a5, v5_brick_a5); poisson_2s_ad(density_fft_a2, density_fft_a4, - u_brick_a2, v0_brick_a2, v1_brick_a2, v2_brick_a2, v3_brick_a2, v4_brick_a2, v5_brick_a2, - u_brick_a4, v0_brick_a4, v1_brick_a4, v2_brick_a4, v3_brick_a4, v4_brick_a4, v5_brick_a4); + u_brick_a2, v0_brick_a2, v1_brick_a2, v2_brick_a2, + v3_brick_a2, v4_brick_a2, v5_brick_a2, + u_brick_a4, v0_brick_a4, v1_brick_a4, v2_brick_a4, + v3_brick_a4, v4_brick_a4, v5_brick_a4); - cg_6->forward_comm(this, FORWARD_AD_A); + gc6->forward_comm_kspace(this,7,sizeof(FFT_SCALAR),FORWARD_AD_A, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); fieldforce_a_ad(); - if (evflag_atom) cg_peratom_6->forward_comm(this, FORWARD_AD_PERATOM_A); + if (evflag_atom) + gc6->forward_comm_kspace(this,42,sizeof(FFT_SCALAR),FORWARD_AD_PERATOM_A, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); } else { - poisson_ik(work1_6, work2_6, density_fft_a3, fft1_6, fft2_6, nx_pppm_6, ny_pppm_6, nz_pppm_6, nfft_6, nxlo_fft_6, nylo_fft_6, nzlo_fft_6, nxhi_fft_6, nyhi_fft_6, nzhi_fft_6, @@ -1070,45 +1087,60 @@ void PPPMDisp::compute(int eflag, int vflag) energy_6, greensfn_6, fkx_6, fky_6, fkz_6,fkx2_6, fky2_6, fkz2_6, vdx_brick_a3, vdy_brick_a3, vdz_brick_a3, virial_6, vg_6, vg2_6, - u_brick_a3, v0_brick_a3, v1_brick_a3, v2_brick_a3, v3_brick_a3, v4_brick_a3, v5_brick_a3); + u_brick_a3, v0_brick_a3, v1_brick_a3, v2_brick_a3, + v3_brick_a3, v4_brick_a3, v5_brick_a3); poisson_2s_ik(density_fft_a0, density_fft_a6, vdx_brick_a0, vdy_brick_a0, vdz_brick_a0, vdx_brick_a6, vdy_brick_a6, vdz_brick_a6, - u_brick_a0, v0_brick_a0, v1_brick_a0, v2_brick_a0, v3_brick_a0, v4_brick_a0, v5_brick_a0, - u_brick_a6, v0_brick_a6, v1_brick_a6, v2_brick_a6, v3_brick_a6, v4_brick_a6, v5_brick_a6); + u_brick_a0, v0_brick_a0, v1_brick_a0, v2_brick_a0, + v3_brick_a0, v4_brick_a0, v5_brick_a0, + u_brick_a6, v0_brick_a6, v1_brick_a6, v2_brick_a6, + v3_brick_a6, v4_brick_a6, v5_brick_a6); poisson_2s_ik(density_fft_a1, density_fft_a5, vdx_brick_a1, vdy_brick_a1, vdz_brick_a1, vdx_brick_a5, vdy_brick_a5, vdz_brick_a5, - u_brick_a1, v0_brick_a1, v1_brick_a1, v2_brick_a1, v3_brick_a1, v4_brick_a1, v5_brick_a1, - u_brick_a5, v0_brick_a5, v1_brick_a5, v2_brick_a5, v3_brick_a5, v4_brick_a5, v5_brick_a5); + u_brick_a1, v0_brick_a1, v1_brick_a1, v2_brick_a1, + v3_brick_a1, v4_brick_a1, 
v5_brick_a1, + u_brick_a5, v0_brick_a5, v1_brick_a5, v2_brick_a5, + v3_brick_a5, v4_brick_a5, v5_brick_a5); poisson_2s_ik(density_fft_a2, density_fft_a4, vdx_brick_a2, vdy_brick_a2, vdz_brick_a2, vdx_brick_a4, vdy_brick_a4, vdz_brick_a4, - u_brick_a2, v0_brick_a2, v1_brick_a2, v2_brick_a2, v3_brick_a2, v4_brick_a2, v5_brick_a2, - u_brick_a4, v0_brick_a4, v1_brick_a4, v2_brick_a4, v3_brick_a4, v4_brick_a4, v5_brick_a4); + u_brick_a2, v0_brick_a2, v1_brick_a2, v2_brick_a2, + v3_brick_a2, v4_brick_a2, v5_brick_a2, + u_brick_a4, v0_brick_a4, v1_brick_a4, v2_brick_a4, + v3_brick_a4, v4_brick_a4, v5_brick_a4); - cg_6->forward_comm(this, FORWARD_IK_A); + gc6->forward_comm_kspace(this,18,sizeof(FFT_SCALAR),FORWARD_IK_A, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); fieldforce_a_ik(); - if (evflag_atom) cg_peratom_6->forward_comm(this, FORWARD_IK_PERATOM_A); + if (evflag_atom) + gc6->forward_comm_kspace(this,49,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM_A, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); } + if (evflag_atom) fieldforce_a_peratom(); } if (function[3]) { - //perform calculations if no mixing rule applies - particle_map(delxinv_6, delyinv_6, delzinv_6, shift_6, part2grid_6, nupper_6, nlower_6, - nxlo_out_6, nylo_out_6, nzlo_out_6, nxhi_out_6, nyhi_out_6, nzhi_out_6); + + // perform calculations if no mixing rule applies + + particle_map(delxinv_6, delyinv_6, delzinv_6, shift_6, part2grid_6, + nupper_6, nlower_6, + nxlo_out_6, nylo_out_6, nzlo_out_6, + nxhi_out_6, nyhi_out_6, nzhi_out_6); make_rho_none(); - cg_6->reverse_comm(this, REVERSE_RHO_NONE); + gc->reverse_comm_kspace(this,1,sizeof(FFT_SCALAR),REVERSE_RHO_NONE, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); brick2fft_none(); if (differentiation_flag == 1) { - int n = 0; for (int k = 0; kforward_comm(this,FORWARD_AD_NONE); + gc6->forward_comm_kspace(this,1,sizeof(FFT_SCALAR),FORWARD_AD_NONE, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); fieldforce_none_ad(); - if (vflag_atom) cg_peratom_6->forward_comm(this,FORWARD_AD_PERATOM_NONE); + if (vflag_atom) + gc6->forward_comm_kspace(this,6,sizeof(FFT_SCALAR),FORWARD_AD_PERATOM_NONE, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); } else { int n = 0; @@ -1136,13 +1171,16 @@ void PPPMDisp::compute(int eflag, int vflag) n += 2; } - cg_6->forward_comm(this,FORWARD_IK_NONE); + gc6->forward_comm_kspace(this,3,sizeof(FFT_SCALAR),FORWARD_IK_NONE, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); fieldforce_none_ik(); if (evflag_atom) - cg_peratom_6->forward_comm(this, FORWARD_IK_PERATOM_NONE); + gc6->forward_comm_kspace(this,7,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM_NONE, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); } + if (evflag_atom) fieldforce_none_peratom(); } @@ -1156,6 +1194,7 @@ void PPPMDisp::compute(int eflag, int vflag) // sum energy across procs and add in volume-dependent term const double qscale = force->qqrd2e * scale; + if (eflag_global) { double energy_all; MPI_Allreduce(&energy_1,&energy_all,1,MPI_DOUBLE,MPI_SUM,world); @@ -1192,8 +1231,10 @@ void PPPMDisp::compute(int eflag, int vflag) if (eflag_atom) { if (function[0]) { double *q = atom->q; + // coulomb self energy correction for (i = 0; i < atom->nlocal; i++) { - eatom[i] -= qscale*g_ewald*q[i]*q[i]/MY_PIS + qscale*MY_PI2*q[i]*qsum / (g_ewald*g_ewald*volume); //coulomb self energy correction + eatom[i] -= qscale*g_ewald*q[i]*q[i]/MY_PIS + + qscale*MY_PI2*q[i]*qsum / (g_ewald*g_ewald*volume); } } if (function[1] + function[2] + function[3]) { @@ -1209,9 +1250,11 @@ void PPPMDisp::compute(int eflag, int vflag) if (vflag_atom) { if (function[1] + function[2] + function[3]) { int tmp; + // dispersion self virial 
correction for (i = 0; i < atom->nlocal; i++) { tmp = atom->type[i]; - for (int n = 0; n < 3; n++) vatom[i][n] -= MY_PI*MY_PIS/(6*volume)*pow(g_ewald_6,3)*csumi[tmp]; //dispersion self virial correction + for (int n = 0; n < 3; n++) + vatom[i][n] -= MY_PI*MY_PIS/(6*volume)*pow(g_ewald_6,3)*csumi[tmp]; } } } @@ -1605,9 +1648,6 @@ int PPPMDisp::check_convergence(double** A,double** Q,double** A0, void _noopt PPPMDisp::allocate() { - - int (*procneigh)[2] = comm->procneigh; - if (function[0]) { memory->create(work1,2*nfft_both,"pppm/disp:work1"); memory->create(work2,2*nfft_both,"pppm/disp:work2"); @@ -1620,12 +1660,12 @@ void _noopt PPPMDisp::allocate() memory->create1d_offset(fky2,nylo_fft,nyhi_fft,"pppm/disp:fky2"); memory->create1d_offset(fkz2,nzlo_fft,nzhi_fft,"pppm/disp:fkz2"); - memory->create(gf_b,order,"pppm/disp:gf_b"); memory->create2d_offset(rho1d,3,-order/2,order/2,"pppm/disp:rho1d"); memory->create2d_offset(rho_coeff,order,(1-order)/2,order/2,"pppm/disp:rho_coeff"); memory->create2d_offset(drho1d,3,-order/2,order/2,"pppm/disp:rho1d"); - memory->create2d_offset(drho_coeff,order,(1-order)/2,order/2,"pppm/disp:drho_coeff"); + memory->create2d_offset(drho_coeff,order,(1-order)/2,order/2, + "pppm/disp:drho_coeff"); memory->create(greensfn,nfft_both,"pppm/disp:greensfn"); memory->create(vg,nfft_both,6,"pppm/disp:vg"); @@ -1633,7 +1673,7 @@ void _noopt PPPMDisp::allocate() memory->create3d_offset(density_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, nxlo_out,nxhi_out,"pppm/disp:density_brick"); - if ( differentiation_flag == 1) { + if (differentiation_flag == 1) { memory->create3d_offset(u_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, nxlo_out,nxhi_out,"pppm/disp:u_brick"); memory->create(sf_precoeff1,nfft_both,"pppm/disp:sf_precoeff1"); @@ -1670,20 +1710,20 @@ void _noopt PPPMDisp::allocate() nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, 1,0,0,FFT_PRECISION,collective_flag); - // create ghost grid object for rho and electric field communication + // create ghost grid object for rho and electric field communication + // also create 2 bufs for ghost grid cell comm, passed to GridComm methods - if (differentiation_flag == 1) - cg = new GridComm(lmp,world,1,1, + gc = new GridComm(lmp,world,nx_pppm,ny_pppm,nz_pppm, nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, - nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out, - procneigh[0][0],procneigh[0][1],procneigh[1][0], - procneigh[1][1],procneigh[2][0],procneigh[2][1]); - else - cg = new GridComm(lmp,world,3,1, - nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, - nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out, - procneigh[0][0],procneigh[0][1],procneigh[1][0], - procneigh[1][1],procneigh[2][0],procneigh[2][1]); + nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out); + + gc->setup(ngc_buf1,ngc_buf2); + + if (differentiation_flag) npergrid = 1; + else npergrid = 3; + + memory->create(gc_buf1,npergrid*ngc_buf1,"pppm:gc_buf1"); + memory->create(gc_buf2,npergrid*ngc_buf2,"pppm:gc_buf2"); } if (function[1]) { @@ -1700,9 +1740,11 @@ void _noopt PPPMDisp::allocate() memory->create(gf_b_6,order_6,"pppm/disp:gf_b_6"); memory->create2d_offset(rho1d_6,3,-order_6/2,order_6/2,"pppm/disp:rho1d_6"); - memory->create2d_offset(rho_coeff_6,order_6,(1-order_6)/2,order_6/2,"pppm/disp:rho_coeff_6"); + memory->create2d_offset(rho_coeff_6,order_6,(1-order_6)/2,order_6/2, + "pppm/disp:rho_coeff_6"); memory->create2d_offset(drho1d_6,3,-order_6/2,order_6/2,"pppm/disp:drho1d_6"); - 
memory->create2d_offset(drho_coeff_6,order_6,(1-order_6)/2,order_6/2,"pppm/disp:drho_coeff_6"); + memory->create2d_offset(drho_coeff_6,order_6,(1-order_6)/2,order_6/2, + "pppm/disp:drho_coeff_6"); memory->create(greensfn_6,nfft_both_6,"pppm/disp:greensfn_6"); memory->create(vg_6,nfft_both_6,6,"pppm/disp:vg_6"); @@ -1710,7 +1752,7 @@ void _noopt PPPMDisp::allocate() memory->create3d_offset(density_brick_g,nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, nxlo_out_6,nxhi_out_6,"pppm/disp:density_brick_g"); - if ( differentiation_flag == 1) { + if (differentiation_flag == 1) { memory->create3d_offset(u_brick_g,nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, nxlo_out_6,nxhi_out_6,"pppm/disp:u_brick_g"); @@ -1721,7 +1763,7 @@ void _noopt PPPMDisp::allocate() memory->create(sf_precoeff5_6,nfft_both_6,"pppm/disp:sf_precoeff5_6"); memory->create(sf_precoeff6_6,nfft_both_6,"pppm/disp:sf_precoeff6_6"); - } else { + } else { memory->create3d_offset(vdx_brick_g,nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, nxlo_out_6,nxhi_out_6,"pppm/disp:vdx_brick_g"); memory->create3d_offset(vdy_brick_g,nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, @@ -1731,38 +1773,41 @@ void _noopt PPPMDisp::allocate() } memory->create(density_fft_g,nfft_both_6,"pppm/disp:density_fft_g"); - int tmp; - fft1_6 = new FFT3d(lmp,world,nx_pppm_6,ny_pppm_6,nz_pppm_6, - nxlo_fft_6,nxhi_fft_6,nylo_fft_6,nyhi_fft_6,nzlo_fft_6,nzhi_fft_6, - nxlo_fft_6,nxhi_fft_6,nylo_fft_6,nyhi_fft_6,nzlo_fft_6,nzhi_fft_6, - 0,0,&tmp,collective_flag); + fft1_6 = + new FFT3d(lmp,world,nx_pppm_6,ny_pppm_6,nz_pppm_6, + nxlo_fft_6,nxhi_fft_6,nylo_fft_6,nyhi_fft_6,nzlo_fft_6,nzhi_fft_6, + nxlo_fft_6,nxhi_fft_6,nylo_fft_6,nyhi_fft_6,nzlo_fft_6,nzhi_fft_6, + 0,0,&tmp,collective_flag); - fft2_6 = new FFT3d(lmp,world,nx_pppm_6,ny_pppm_6,nz_pppm_6, - nxlo_fft_6,nxhi_fft_6,nylo_fft_6,nyhi_fft_6,nzlo_fft_6,nzhi_fft_6, - nxlo_in_6,nxhi_in_6,nylo_in_6,nyhi_in_6,nzlo_in_6,nzhi_in_6, - 0,0,&tmp,collective_flag); + fft2_6 = + new FFT3d(lmp,world,nx_pppm_6,ny_pppm_6,nz_pppm_6, + nxlo_fft_6,nxhi_fft_6,nylo_fft_6,nyhi_fft_6,nzlo_fft_6,nzhi_fft_6, + nxlo_in_6,nxhi_in_6,nylo_in_6,nyhi_in_6,nzlo_in_6,nzhi_in_6, + 0,0,&tmp,collective_flag); - remap_6 = new Remap(lmp,world, - nxlo_in_6,nxhi_in_6,nylo_in_6,nyhi_in_6,nzlo_in_6,nzhi_in_6, - nxlo_fft_6,nxhi_fft_6,nylo_fft_6,nyhi_fft_6,nzlo_fft_6,nzhi_fft_6, - 1,0,0,FFT_PRECISION,collective_flag); + remap_6 = + new Remap(lmp,world, + nxlo_in_6,nxhi_in_6,nylo_in_6,nyhi_in_6,nzlo_in_6,nzhi_in_6, + nxlo_fft_6,nxhi_fft_6,nylo_fft_6,nyhi_fft_6,nzlo_fft_6,nzhi_fft_6, + 1,0,0,FFT_PRECISION,collective_flag); // create ghost grid object for rho and electric field communication + // also create 2 bufs for ghost grid cell comm, passed to GridComm methods - if (differentiation_flag == 1) - cg_6 = new GridComm(lmp,world,1,1, - nxlo_in_6,nxhi_in_6,nylo_in_6,nyhi_in_6,nzlo_in_6,nzhi_in_6, - nxlo_out_6,nxhi_out_6,nylo_out_6,nyhi_out_6,nzlo_out_6,nzhi_out_6, - procneigh[0][0],procneigh[0][1],procneigh[1][0], - procneigh[1][1],procneigh[2][0],procneigh[2][1]); - else - cg_6 = new GridComm(lmp,world,3,1, - nxlo_in_6,nxhi_in_6,nylo_in_6,nyhi_in_6,nzlo_in_6,nzhi_in_6, - nxlo_out_6,nxhi_out_6,nylo_out_6,nyhi_out_6,nzlo_out_6,nzhi_out_6, - procneigh[0][0],procneigh[0][1],procneigh[1][0], - procneigh[1][1],procneigh[2][0],procneigh[2][1]); + gc6 = + new GridComm(lmp,world,nx_pppm_6,ny_pppm_6,nz_pppm_6, + nxlo_in_6,nxhi_in_6,nylo_in_6,nyhi_in_6,nzlo_in_6,nzhi_in_6, + nxlo_out_6,nxhi_out_6,nylo_out_6,nyhi_out_6,nzlo_out_6,nzhi_out_6); + + gc6->setup(ngc6_buf1,ngc6_buf2); + 
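// ----------------------------------------------------------------------
// [editor's note: illustrative sketch, not part of this patch]
// Buffer sizing convention used for every GridComm in this patch: setup()
// reports how many grid points each of the two exchange buffers must hold,
// and the caller scales that by the number of FFT_SCALAR values moved per
// grid point. When the per-atom energy/virial communication later needs more
// values per point, the same GridComm object is kept and only the buffers
// are regrown. Shown here with the coulomb-grid values; the dispersion grid
// (gc6, npergrid6, gc6_buf1/gc6_buf2) follows the same pattern:

// in allocate(): density/field communication only
if (differentiation_flag) npergrid = 1;      // ad: potential on the grid
else npergrid = 3;                           // ik: 3 field components
memory->create(gc_buf1,npergrid*ngc_buf1,"pppm:gc_buf1");
memory->create(gc_buf2,npergrid*ngc_buf2,"pppm:gc_buf2");

// in allocate_peratom(): per-atom energy/virial needs a wider buffer
if (differentiation_flag) npergrid = 6;
else npergrid = 7;
memory->destroy(gc_buf1);
memory->destroy(gc_buf2);
memory->create(gc_buf1,npergrid*ngc_buf1,"pppm:gc_buf1");
memory->create(gc_buf2,npergrid*ngc_buf2,"pppm:gc_buf2");
// [end of editor's sketch]
// ----------------------------------------------------------------------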
+ if (differentiation_flag) npergrid6 = 1; + else npergrid6 = 7; + + memory->create(gc6_buf1,npergrid6*ngc6_buf1,"pppm:gc_buf1"); + memory->create(gc6_buf2,npergrid6*ngc6_buf2,"pppm:gc_buf2"); } if (function[2]) { @@ -1779,27 +1824,36 @@ void _noopt PPPMDisp::allocate() memory->create(gf_b_6,order_6,"pppm/disp:gf_b_6"); memory->create2d_offset(rho1d_6,3,-order_6/2,order_6/2,"pppm/disp:rho1d_6"); - memory->create2d_offset(rho_coeff_6,order_6,(1-order_6)/2,order_6/2,"pppm/disp:rho_coeff_6"); + memory->create2d_offset(rho_coeff_6,order_6,(1-order_6)/2,order_6/2, + "pppm/disp:rho_coeff_6"); memory->create2d_offset(drho1d_6,3,-order_6/2,order_6/2,"pppm/disp:drho1d_6"); - memory->create2d_offset(drho_coeff_6,order_6,(1-order_6)/2,order_6/2,"pppm/disp:drho_coeff_6"); + memory->create2d_offset(drho_coeff_6,order_6,(1-order_6)/2,order_6/2, + "pppm/disp:drho_coeff_6"); memory->create(greensfn_6,nfft_both_6,"pppm/disp:greensfn_6"); memory->create(vg_6,nfft_both_6,6,"pppm/disp:vg_6"); memory->create(vg2_6,nfft_both_6,3,"pppm/disp:vg2_6"); - memory->create3d_offset(density_brick_a0,nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, + memory->create3d_offset(density_brick_a0,nzlo_out_6, + nzhi_out_6,nylo_out_6,nyhi_out_6, nxlo_out_6,nxhi_out_6,"pppm/disp:density_brick_a0"); - memory->create3d_offset(density_brick_a1,nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, + memory->create3d_offset(density_brick_a1,nzlo_out_6, + nzhi_out_6,nylo_out_6,nyhi_out_6, nxlo_out_6,nxhi_out_6,"pppm/disp:density_brick_a1"); - memory->create3d_offset(density_brick_a2,nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, + memory->create3d_offset(density_brick_a2,nzlo_out_6,nzhi_out_6, + nylo_out_6,nyhi_out_6, nxlo_out_6,nxhi_out_6,"pppm/disp:density_brick_a2"); - memory->create3d_offset(density_brick_a3,nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, + memory->create3d_offset(density_brick_a3,nzlo_out_6,nzhi_out_6, + nylo_out_6,nyhi_out_6, nxlo_out_6,nxhi_out_6,"pppm/disp:density_brick_a3"); - memory->create3d_offset(density_brick_a4,nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, + memory->create3d_offset(density_brick_a4,nzlo_out_6,nzhi_out_6, + nylo_out_6,nyhi_out_6, nxlo_out_6,nxhi_out_6,"pppm/disp:density_brick_a4"); - memory->create3d_offset(density_brick_a5,nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, + memory->create3d_offset(density_brick_a5,nzlo_out_6,nzhi_out_6, + nylo_out_6,nyhi_out_6, nxlo_out_6,nxhi_out_6,"pppm/disp:density_brick_a5"); - memory->create3d_offset(density_brick_a6,nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, + memory->create3d_offset(density_brick_a6,nzlo_out_6,nzhi_out_6, + nylo_out_6,nyhi_out_6, nxlo_out_6,nxhi_out_6,"pppm/disp:density_brick_a6"); memory->create(density_fft_a0,nfft_both_6,"pppm/disp:density_fft_a0"); @@ -1811,7 +1865,7 @@ void _noopt PPPMDisp::allocate() memory->create(density_fft_a6,nfft_both_6,"pppm/disp:density_fft_a6"); - if ( differentiation_flag == 1 ) { + if (differentiation_flag == 1) { memory->create3d_offset(u_brick_a0,nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, nxlo_out_6,nxhi_out_6,"pppm/disp:u_brick_a0"); memory->create3d_offset(u_brick_a1,nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, @@ -1886,8 +1940,6 @@ void _noopt PPPMDisp::allocate() nxlo_out_6,nxhi_out_6,"pppm/disp:vdz_brick_a6"); } - - int tmp; fft1_6 = new FFT3d(lmp,world,nx_pppm_6,ny_pppm_6,nz_pppm_6, @@ -1906,20 +1958,20 @@ void _noopt PPPMDisp::allocate() 1,0,0,FFT_PRECISION,collective_flag); // create ghost grid object for rho and electric field communication + // also create 2 bufs for ghost grid cell comm, passed to GridComm 
methods + + gc6 = + new GridComm(lmp,world,nx_pppm_6,ny_pppm_6,nz_pppm_6, + nxlo_in_6,nxhi_in_6,nylo_in_6,nyhi_in_6,nzlo_in_6,nzhi_in_6, + nxlo_out_6,nxhi_out_6,nylo_out_6,nyhi_out_6,nzlo_out_6,nzhi_out_6); + gc6->setup(ngc6_buf1,ngc6_buf2); - if (differentiation_flag == 1) - cg_6 = new GridComm(lmp,world,7,7, - nxlo_in_6,nxhi_in_6,nylo_in_6,nyhi_in_6,nzlo_in_6,nzhi_in_6, - nxlo_out_6,nxhi_out_6,nylo_out_6,nyhi_out_6,nzlo_out_6,nzhi_out_6, - procneigh[0][0],procneigh[0][1],procneigh[1][0], - procneigh[1][1],procneigh[2][0],procneigh[2][1]); - else - cg_6 = new GridComm(lmp,world,21,7, - nxlo_in_6,nxhi_in_6,nylo_in_6,nyhi_in_6,nzlo_in_6,nzhi_in_6, - nxlo_out_6,nxhi_out_6,nylo_out_6,nyhi_out_6,nzlo_out_6,nzhi_out_6, - procneigh[0][0],procneigh[0][1],procneigh[1][0], - procneigh[1][1],procneigh[2][0],procneigh[2][1]); + if (differentiation_flag) npergrid6 = 7; + else npergrid6 = 18; + + memory->create(gc6_buf1,npergrid6*ngc6_buf1,"pppm:gc_buf1"); + memory->create(gc6_buf2,npergrid6*ngc6_buf2,"pppm:gc_buf2"); } if (function[3]) { @@ -1936,18 +1988,22 @@ void _noopt PPPMDisp::allocate() memory->create(gf_b_6,order_6,"pppm/disp:gf_b_6"); memory->create2d_offset(rho1d_6,3,-order_6/2,order_6/2,"pppm/disp:rho1d_6"); - memory->create2d_offset(rho_coeff_6,order_6,(1-order_6)/2,order_6/2,"pppm/disp:rho_coeff_6"); + memory->create2d_offset(rho_coeff_6,order_6,(1-order_6)/2,order_6/2, + "pppm/disp:rho_coeff_6"); memory->create2d_offset(drho1d_6,3,-order_6/2,order_6/2,"pppm/disp:drho1d_6"); - memory->create2d_offset(drho_coeff_6,order_6,(1-order_6)/2,order_6/2,"pppm/disp:drho_coeff_6"); + memory->create2d_offset(drho_coeff_6,order_6,(1-order_6)/2,order_6/2, + "pppm/disp:drho_coeff_6"); memory->create(greensfn_6,nfft_both_6,"pppm/disp:greensfn_6"); memory->create(vg_6,nfft_both_6,6,"pppm/disp:vg_6"); memory->create(vg2_6,nfft_both_6,3,"pppm/disp:vg2_6"); - memory->create4d_offset(density_brick_none,nsplit_alloc,nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, + memory->create4d_offset(density_brick_none,nsplit_alloc, + nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, nxlo_out_6,nxhi_out_6,"pppm/disp:density_brick_none"); if ( differentiation_flag == 1) { - memory->create4d_offset(u_brick_none,nsplit_alloc,nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, + memory->create4d_offset(u_brick_none,nsplit_alloc, + nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, nxlo_out_6,nxhi_out_6,"pppm/disp:u_brick_none"); memory->create(sf_precoeff1_6,nfft_both_6,"pppm/disp:sf_precoeff1_6"); @@ -1958,15 +2014,18 @@ void _noopt PPPMDisp::allocate() memory->create(sf_precoeff6_6,nfft_both_6,"pppm/disp:sf_precoeff6_6"); } else { - memory->create4d_offset(vdx_brick_none,nsplit_alloc,nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, + memory->create4d_offset(vdx_brick_none,nsplit_alloc, + nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, nxlo_out_6,nxhi_out_6,"pppm/disp:vdx_brick_none"); - memory->create4d_offset(vdy_brick_none,nsplit_alloc,nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, + memory->create4d_offset(vdy_brick_none,nsplit_alloc, + nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, nxlo_out_6,nxhi_out_6,"pppm/disp:vdy_brick_none"); - memory->create4d_offset(vdz_brick_none,nsplit_alloc,nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, + memory->create4d_offset(vdz_brick_none,nsplit_alloc, + nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, nxlo_out_6,nxhi_out_6,"pppm/disp:vdz_brick_none"); } - memory->create(density_fft_none,nsplit_alloc,nfft_both_6,"pppm/disp:density_fft_none"); - + memory->create(density_fft_none,nsplit_alloc,nfft_both_6, + 
"pppm/disp:density_fft_none"); int tmp; @@ -1986,21 +2045,21 @@ void _noopt PPPMDisp::allocate() 1,0,0,FFT_PRECISION,collective_flag); // create ghost grid object for rho and electric field communication + // also create 2 bufs for ghost grid cell comm, passed to GridComm methods + + gc6 = + new GridComm(lmp,world,nx_pppm_6,ny_pppm_6,nz_pppm_6, + nxlo_in_6,nxhi_in_6,nylo_in_6,nyhi_in_6,nzlo_in_6,nzhi_in_6, + nxlo_out_6,nxhi_out_6,nylo_out_6,nyhi_out_6,nzlo_out_6,nzhi_out_6); - if (differentiation_flag == 1) - cg_6 = new GridComm(lmp,world,nsplit_alloc,nsplit_alloc, - nxlo_in_6,nxhi_in_6,nylo_in_6,nyhi_in_6,nzlo_in_6,nzhi_in_6, - nxlo_out_6,nxhi_out_6,nylo_out_6,nyhi_out_6,nzlo_out_6,nzhi_out_6, - procneigh[0][0],procneigh[0][1],procneigh[1][0], - procneigh[1][1],procneigh[2][0],procneigh[2][1]); - else - cg_6 = new GridComm(lmp,world,3*nsplit_alloc,nsplit_alloc, - nxlo_in_6,nxhi_in_6,nylo_in_6,nyhi_in_6,nzlo_in_6,nzhi_in_6, - nxlo_out_6,nxhi_out_6,nylo_out_6,nyhi_out_6,nzlo_out_6,nzhi_out_6, - procneigh[0][0],procneigh[0][1],procneigh[1][0], - procneigh[1][1],procneigh[2][0],procneigh[2][1]); + gc6->setup(ngc6_buf1,ngc6_buf2); + + if (differentiation_flag) npergrid6 = 1; + else npergrid6 = 3; + + memory->create(gc6_buf1,npergrid6*ngc6_buf1,"pppm:gc_buf1"); + memory->create(gc6_buf2,npergrid6*ngc6_buf2,"pppm:gc_buf2"); } - } /* ---------------------------------------------------------------------- @@ -2010,15 +2069,12 @@ void _noopt PPPMDisp::allocate() void PPPMDisp::allocate_peratom() { - - int (*procneigh)[2] = comm->procneigh; + peratom_allocate_flag = 1; if (function[0]) { - if (differentiation_flag != 1) memory->create3d_offset(u_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, nxlo_out,nxhi_out,"pppm/disp:u_brick"); - memory->create3d_offset(v0_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, nxlo_out,nxhi_out,"pppm/disp:v0_brick"); memory->create3d_offset(v1_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, @@ -2032,32 +2088,22 @@ void PPPMDisp::allocate_peratom() memory->create3d_offset(v5_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, nxlo_out,nxhi_out,"pppm/disp:v5_brick"); - // create ghost grid object for rho and electric field communication + // use same GC ghost grid object for peratom grid communication + // but need to reallocate a larger gc_buf1 and gc_buf2 - if (differentiation_flag == 1) - cg_peratom = - new GridComm(lmp,world,6,1, - nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, - nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out, - procneigh[0][0],procneigh[0][1],procneigh[1][0], - procneigh[1][1],procneigh[2][0],procneigh[2][1]); - else - cg_peratom = - new GridComm(lmp,world,7,1, - nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, - nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out, - procneigh[0][0],procneigh[0][1],procneigh[1][0], - procneigh[1][1],procneigh[2][0],procneigh[2][1]); + if (differentiation_flag) npergrid = 6; + else npergrid = 7; + memory->destroy(gc_buf1); + memory->destroy(gc_buf2); + memory->create(gc_buf1,npergrid*ngc_buf1,"pppm:gc_buf1"); + memory->create(gc_buf2,npergrid*ngc_buf2,"pppm:gc_buf2"); } - if (function[1]) { - - if ( differentiation_flag != 1 ) + if (differentiation_flag != 1 ) memory->create3d_offset(u_brick_g,nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, nxlo_out_6,nxhi_out_6,"pppm/disp:u_brick_g"); - memory->create3d_offset(v0_brick_g,nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, nxlo_out_6,nxhi_out_6,"pppm/disp:v0_brick_g"); memory->create3d_offset(v1_brick_g,nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, @@ -2071,28 +2117,20 @@ void 
PPPMDisp::allocate_peratom() memory->create3d_offset(v5_brick_g,nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, nxlo_out_6,nxhi_out_6,"pppm/disp:v5_brick_g"); - // create ghost grid object for rho and electric field communication + // use same GC ghost grid object for peratom grid communication + // but need to reallocate a larger gc_buf1 and gc_buf2 - if (differentiation_flag == 1) - cg_peratom_6 = - new GridComm(lmp,world,6,1, - nxlo_in_6,nxhi_in_6,nylo_in_6,nyhi_in_6,nzlo_in_6,nzhi_in_6, - nxlo_out_6,nxhi_out_6,nylo_out_6,nyhi_out_6,nzlo_out_6,nzhi_out_6, - procneigh[0][0],procneigh[0][1],procneigh[1][0], - procneigh[1][1],procneigh[2][0],procneigh[2][1]); - else - cg_peratom_6 = - new GridComm(lmp,world,7,1, - nxlo_in_6,nxhi_in_6,nylo_in_6,nyhi_in_6,nzlo_in_6,nzhi_in_6, - nxlo_out_6,nxhi_out_6,nylo_out_6,nyhi_out_6,nzlo_out_6,nzhi_out_6, - procneigh[0][0],procneigh[0][1],procneigh[1][0], - procneigh[1][1],procneigh[2][0],procneigh[2][1]); + if (differentiation_flag) npergrid = 6; + else npergrid = 7; + memory->destroy(gc_buf1); + memory->destroy(gc_buf2); + memory->create(gc_buf1,npergrid*ngc_buf1,"pppm:gc_buf1"); + memory->create(gc_buf2,npergrid*ngc_buf2,"pppm:gc_buf2"); } if (function[2]) { - - if ( differentiation_flag != 1 ) { + if (differentiation_flag != 1 ) { memory->create3d_offset(u_brick_a0,nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, nxlo_out_6,nxhi_out_6,"pppm/disp:u_brick_a0"); memory->create3d_offset(u_brick_a1,nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, @@ -2200,65 +2238,56 @@ void PPPMDisp::allocate_peratom() memory->create3d_offset(v5_brick_a6,nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, nxlo_out_6,nxhi_out_6,"pppm/disp:v5_brick_a6"); - // create ghost grid object for rho and electric field communication + // use same GC ghost grid object for peratom grid communication + // but need to reallocate a larger gc_buf1 and gc_buf2 - if (differentiation_flag == 1) - cg_peratom_6 = - new GridComm(lmp,world,42,1, - nxlo_in_6,nxhi_in_6,nylo_in_6,nyhi_in_6,nzlo_in_6,nzhi_in_6, - nxlo_out_6,nxhi_out_6,nylo_out_6,nyhi_out_6,nzlo_out_6,nzhi_out_6, - procneigh[0][0],procneigh[0][1],procneigh[1][0], - procneigh[1][1],procneigh[2][0],procneigh[2][1]); - else - cg_peratom_6 = - new GridComm(lmp,world,49,1, - nxlo_in_6,nxhi_in_6,nylo_in_6,nyhi_in_6,nzlo_in_6,nzhi_in_6, - nxlo_out_6,nxhi_out_6,nylo_out_6,nyhi_out_6,nzlo_out_6,nzhi_out_6, - procneigh[0][0],procneigh[0][1],procneigh[1][0], - procneigh[1][1],procneigh[2][0],procneigh[2][1]); + if (differentiation_flag) npergrid = 42; + else npergrid = 49; + memory->destroy(gc_buf1); + memory->destroy(gc_buf2); + memory->create(gc_buf1,npergrid*ngc_buf1,"pppm:gc_buf1"); + memory->create(gc_buf2,npergrid*ngc_buf2,"pppm:gc_buf2"); } if (function[3]) { - - if ( differentiation_flag != 1 ) - memory->create4d_offset(u_brick_none,nsplit_alloc,nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, + if (differentiation_flag != 1) + memory->create4d_offset(u_brick_none,nsplit_alloc, + nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, nxlo_out_6,nxhi_out_6,"pppm/disp:u_brick_none"); - memory->create4d_offset(v0_brick_none,nsplit_alloc,nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, + memory->create4d_offset(v0_brick_none,nsplit_alloc, + nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, nxlo_out_6,nxhi_out_6,"pppm/disp:v0_brick_none"); - memory->create4d_offset(v1_brick_none,nsplit_alloc,nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, + memory->create4d_offset(v1_brick_none,nsplit_alloc, + nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, 
nxlo_out_6,nxhi_out_6,"pppm/disp:v1_brick_none"); - memory->create4d_offset(v2_brick_none,nsplit_alloc,nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, + memory->create4d_offset(v2_brick_none,nsplit_alloc, + nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, nxlo_out_6,nxhi_out_6,"pppm/disp:v2_brick_none"); - memory->create4d_offset(v3_brick_none,nsplit_alloc,nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, + memory->create4d_offset(v3_brick_none,nsplit_alloc, + nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, nxlo_out_6,nxhi_out_6,"pppm/disp:v3_brick_none"); - memory->create4d_offset(v4_brick_none,nsplit_alloc,nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, + memory->create4d_offset(v4_brick_none,nsplit_alloc, + nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, nxlo_out_6,nxhi_out_6,"pppm/disp:v4_brick_none"); - memory->create4d_offset(v5_brick_none,nsplit_alloc,nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, + memory->create4d_offset(v5_brick_none,nsplit_alloc, + nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, nxlo_out_6,nxhi_out_6,"pppm/disp:v5_brick_none"); - // create ghost grid object for rho and electric field communication + // use same GC ghost grid object for peratom grid communication + // but need to reallocate a larger gc_buf1 and gc_buf2 - if (differentiation_flag == 1) - cg_peratom_6 = - new GridComm(lmp,world,6*nsplit_alloc,1, - nxlo_in_6,nxhi_in_6,nylo_in_6,nyhi_in_6,nzlo_in_6,nzhi_in_6, - nxlo_out_6,nxhi_out_6,nylo_out_6,nyhi_out_6,nzlo_out_6,nzhi_out_6, - procneigh[0][0],procneigh[0][1],procneigh[1][0], - procneigh[1][1],procneigh[2][0],procneigh[2][1]); - else - cg_peratom_6 = - new GridComm(lmp,world,7*nsplit_alloc,1, - nxlo_in_6,nxhi_in_6,nylo_in_6,nyhi_in_6,nzlo_in_6,nzhi_in_6, - nxlo_out_6,nxhi_out_6,nylo_out_6,nyhi_out_6,nzlo_out_6,nzhi_out_6, - procneigh[0][0],procneigh[0][1],procneigh[1][0], - procneigh[1][1],procneigh[2][0],procneigh[2][1]); + if (differentiation_flag) npergrid = 6; + else npergrid = 7; + memory->destroy(gc_buf1); + memory->destroy(gc_buf2); + memory->create(gc_buf1,npergrid*ngc_buf1,"pppm:gc_buf1"); + memory->create(gc_buf2,npergrid*ngc_buf2,"pppm:gc_buf2"); } } - /* ---------------------------------------------------------------------- deallocate memory that depends on # of K-vectors and order ------------------------------------------------------------------------- */ @@ -2351,7 +2380,8 @@ void PPPMDisp::deallocate() memory->destroy(sf_precoeff4); memory->destroy(sf_precoeff5); memory->destroy(sf_precoeff6); - sf_precoeff1 = sf_precoeff2 = sf_precoeff3 = sf_precoeff4 = sf_precoeff5 = sf_precoeff6 = NULL; + sf_precoeff1 = sf_precoeff2 = sf_precoeff3 = + sf_precoeff4 = sf_precoeff5 = sf_precoeff6 = NULL; memory->destroy(sf_precoeff1_6); memory->destroy(sf_precoeff2_6); @@ -2359,7 +2389,8 @@ void PPPMDisp::deallocate() memory->destroy(sf_precoeff4_6); memory->destroy(sf_precoeff5_6); memory->destroy(sf_precoeff6_6); - sf_precoeff1_6 = sf_precoeff2_6 = sf_precoeff3_6 = sf_precoeff4_6 = sf_precoeff5_6 = sf_precoeff6_6 = NULL; + sf_precoeff1_6 = sf_precoeff2_6 = sf_precoeff3_6 = + sf_precoeff4_6 = sf_precoeff5_6 = sf_precoeff6_6 = NULL; memory->destroy(greensfn); memory->destroy(greensfn_6); @@ -2395,7 +2426,6 @@ void PPPMDisp::deallocate() memory->destroy1d_offset(fkz2_6,nzlo_fft_6); fkx2_6 = fky2_6 = fkz2_6 = NULL; - memory->destroy(gf_b); memory->destroy2d_offset(rho1d,-order/2); memory->destroy2d_offset(rho_coeff,(1-order)/2); @@ -2415,21 +2445,14 @@ void PPPMDisp::deallocate() delete fft1; delete fft2; delete remap; - delete cg; - fft1 = fft2 = NULL; - remap = NULL; - cg = 
NULL; + delete gc; delete fft1_6; delete fft2_6; delete remap_6; - delete cg_6; - fft1_6 = fft2_6 = NULL; - remap_6 = NULL; - cg_6 = NULL; + delete gc6; } - /* ---------------------------------------------------------------------- deallocate memory that depends on # of K-vectors and order for per atom calculations @@ -2446,7 +2469,8 @@ void PPPMDisp::deallocate_peratom() memory->destroy3d_offset(v3_brick, nzlo_out, nylo_out, nxlo_out); memory->destroy3d_offset(v4_brick, nzlo_out, nylo_out, nxlo_out); memory->destroy3d_offset(v5_brick, nzlo_out, nylo_out, nxlo_out); - u_brick = v0_brick = v1_brick = v2_brick = v3_brick = v4_brick = v5_brick = NULL; + u_brick = v0_brick = v1_brick = v2_brick = + v3_brick = v4_brick = v5_brick = NULL; memory->destroy3d_offset(u_brick_g, nzlo_out_6, nylo_out_6, nxlo_out_6); memory->destroy3d_offset(v0_brick_g, nzlo_out_6, nylo_out_6, nxlo_out_6); @@ -2455,7 +2479,8 @@ void PPPMDisp::deallocate_peratom() memory->destroy3d_offset(v3_brick_g, nzlo_out_6, nylo_out_6, nxlo_out_6); memory->destroy3d_offset(v4_brick_g, nzlo_out_6, nylo_out_6, nxlo_out_6); memory->destroy3d_offset(v5_brick_g, nzlo_out_6, nylo_out_6, nxlo_out_6); - u_brick_g = v0_brick_g = v1_brick_g = v2_brick_g = v3_brick_g = v4_brick_g = v5_brick_g = NULL; + u_brick_g = v0_brick_g = v1_brick_g = v2_brick_g = + v3_brick_g = v4_brick_g = v5_brick_g = NULL; memory->destroy3d_offset(u_brick_a0, nzlo_out_6, nylo_out_6, nxlo_out_6); memory->destroy3d_offset(v0_brick_a0, nzlo_out_6, nylo_out_6, nxlo_out_6); @@ -2464,7 +2489,8 @@ void PPPMDisp::deallocate_peratom() memory->destroy3d_offset(v3_brick_a0, nzlo_out_6, nylo_out_6, nxlo_out_6); memory->destroy3d_offset(v4_brick_a0, nzlo_out_6, nylo_out_6, nxlo_out_6); memory->destroy3d_offset(v5_brick_a0, nzlo_out_6, nylo_out_6, nxlo_out_6); - u_brick_a0 = v0_brick_a0 = v1_brick_a0 = v2_brick_a0 = v3_brick_a0 = v4_brick_a0 = v5_brick_a0 = NULL; + u_brick_a0 = v0_brick_a0 = v1_brick_a0 = v2_brick_a0 = + v3_brick_a0 = v4_brick_a0 = v5_brick_a0 = NULL; memory->destroy3d_offset(u_brick_a1, nzlo_out_6, nylo_out_6, nxlo_out_6); memory->destroy3d_offset(v0_brick_a1, nzlo_out_6, nylo_out_6, nxlo_out_6); @@ -2473,7 +2499,8 @@ void PPPMDisp::deallocate_peratom() memory->destroy3d_offset(v3_brick_a1, nzlo_out_6, nylo_out_6, nxlo_out_6); memory->destroy3d_offset(v4_brick_a1, nzlo_out_6, nylo_out_6, nxlo_out_6); memory->destroy3d_offset(v5_brick_a1, nzlo_out_6, nylo_out_6, nxlo_out_6); - u_brick_a1 = v0_brick_a1 = v1_brick_a1 = v2_brick_a1 = v3_brick_a1 = v4_brick_a1 = v5_brick_a1 = NULL; + u_brick_a1 = v0_brick_a1 = v1_brick_a1 = v2_brick_a1 = + v3_brick_a1 = v4_brick_a1 = v5_brick_a1 = NULL; memory->destroy3d_offset(u_brick_a2, nzlo_out_6, nylo_out_6, nxlo_out_6); memory->destroy3d_offset(v0_brick_a2, nzlo_out_6, nylo_out_6, nxlo_out_6); @@ -2500,7 +2527,8 @@ void PPPMDisp::deallocate_peratom() memory->destroy3d_offset(v3_brick_a4, nzlo_out_6, nylo_out_6, nxlo_out_6); memory->destroy3d_offset(v4_brick_a4, nzlo_out_6, nylo_out_6, nxlo_out_6); memory->destroy3d_offset(v5_brick_a4, nzlo_out_6, nylo_out_6, nxlo_out_6); - u_brick_a4 = v0_brick_a4 = v1_brick_a4 = v2_brick_a4 = v3_brick_a4 = v4_brick_a4 = v5_brick_a4 = NULL; + u_brick_a4 = v0_brick_a4 = v1_brick_a4 = v2_brick_a4 = + v3_brick_a4 = v4_brick_a4 = v5_brick_a4 = NULL; memory->destroy3d_offset(u_brick_a5, nzlo_out_6, nylo_out_6, nxlo_out_6); memory->destroy3d_offset(v0_brick_a5, nzlo_out_6, nylo_out_6, nxlo_out_6); @@ -2509,7 +2537,8 @@ void PPPMDisp::deallocate_peratom() memory->destroy3d_offset(v3_brick_a5, 
nzlo_out_6, nylo_out_6, nxlo_out_6); memory->destroy3d_offset(v4_brick_a5, nzlo_out_6, nylo_out_6, nxlo_out_6); memory->destroy3d_offset(v5_brick_a5, nzlo_out_6, nylo_out_6, nxlo_out_6); - u_brick_a5 = v0_brick_a5 = v1_brick_a5 = v2_brick_a5 = v3_brick_a5 = v4_brick_a5 = v5_brick_a5 = NULL; + u_brick_a5 = v0_brick_a5 = v1_brick_a5 = v2_brick_a5 = + v3_brick_a5 = v4_brick_a5 = v5_brick_a5 = NULL; memory->destroy3d_offset(u_brick_a6, nzlo_out_6, nylo_out_6, nxlo_out_6); memory->destroy3d_offset(v0_brick_a6, nzlo_out_6, nylo_out_6, nxlo_out_6); @@ -2518,7 +2547,8 @@ void PPPMDisp::deallocate_peratom() memory->destroy3d_offset(v3_brick_a6, nzlo_out_6, nylo_out_6, nxlo_out_6); memory->destroy3d_offset(v4_brick_a6, nzlo_out_6, nylo_out_6, nxlo_out_6); memory->destroy3d_offset(v5_brick_a6, nzlo_out_6, nylo_out_6, nxlo_out_6); - u_brick_a6 = v0_brick_a6 = v1_brick_a6 = v2_brick_a6 = v3_brick_a6 = v4_brick_a6 = v5_brick_a6 = NULL; + u_brick_a6 = v0_brick_a6 = v1_brick_a6 = v2_brick_a6 = + v3_brick_a6 = v4_brick_a6 = v5_brick_a6 = NULL; memory->destroy4d_offset(u_brick_none, nzlo_out_6, nylo_out_6, nxlo_out_6); memory->destroy4d_offset(v0_brick_none, nzlo_out_6, nylo_out_6, nxlo_out_6); @@ -2527,11 +2557,8 @@ void PPPMDisp::deallocate_peratom() memory->destroy4d_offset(v3_brick_none, nzlo_out_6, nylo_out_6, nxlo_out_6); memory->destroy4d_offset(v4_brick_none, nzlo_out_6, nylo_out_6, nxlo_out_6); memory->destroy4d_offset(v5_brick_none, nzlo_out_6, nylo_out_6, nxlo_out_6); - u_brick_none = v0_brick_none = v1_brick_none = v2_brick_none = v3_brick_none = v4_brick_none = v5_brick_none = NULL; - - delete cg_peratom; - delete cg_peratom_6; - cg_peratom = cg_peratom_6 = NULL; + u_brick_none = v0_brick_none = v1_brick_none = v2_brick_none = + v3_brick_none = v4_brick_none = v5_brick_none = NULL; } /* ---------------------------------------------------------------------- @@ -2577,6 +2604,7 @@ void PPPMDisp::set_grid() while (1) { // set grid dimension + nx_pppm = static_cast (xprd/h_x); ny_pppm = static_cast (yprd/h_y); nz_pppm = static_cast (zprd_slab/h_z); @@ -2585,32 +2613,18 @@ void PPPMDisp::set_grid() if (ny_pppm <= 1) ny_pppm = 2; if (nz_pppm <= 1) nz_pppm = 2; - //set local grid dimension - int npey_fft,npez_fft; - if (nz_pppm >= nprocs) { - npey_fft = 1; - npez_fft = nprocs; - } else procs2grid2d(nprocs,ny_pppm,nz_pppm,&npey_fft,&npez_fft); - - int me_y = me % npey_fft; - int me_z = me / npey_fft; - - nxlo_fft = 0; - nxhi_fft = nx_pppm - 1; - nylo_fft = me_y*ny_pppm/npey_fft; - nyhi_fft = (me_y+1)*ny_pppm/npey_fft - 1; - nzlo_fft = me_z*nz_pppm/npez_fft; - nzhi_fft = (me_z+1)*nz_pppm/npez_fft - 1; + // estimate Kspace force error double qopt = compute_qopt(); - double dfkspace = sqrt(qopt/natoms)*q2/(xprd*yprd*zprd_slab); - count++; + // break loop if the accuracy has been reached or + // too many loops have been performed - // break loop if the accuracy has been reached or too many loops have been performed + count++; if (dfkspace <= accuracy) break; - if (count > 500) error->all(FLERR, "Could not compute grid size for Coulomb interaction"); + + if (count > 500) error->all(FLERR, "Could not compute grid size"); h *= 0.95; h_x = h_y = h_z = h; } @@ -2628,15 +2642,15 @@ void PPPMDisp::set_grid() ------------------------------------------------------------------------- */ void PPPMDisp::set_fft_parameters(int& nx_p,int& ny_p,int& nz_p, - int& nxlo_f,int& nylo_f,int& nzlo_f, - int& nxhi_f,int& nyhi_f,int& nzhi_f, - int& nxlo_i,int& nylo_i,int& nzlo_i, - int& nxhi_i,int& nyhi_i,int& nzhi_i, - int& 
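/* ----------------------------------------------------------------------
   NOTE (illustrative outline, not applied by this patch): the reorganized
   set_grid() above keeps the original search strategy: shrink the target
   grid spacing h in 5% steps until the estimated Kspace error drops below
   the requested accuracy, or give up after 500 iterations:

     int count = 0;
     while (1) {
       nx_pppm = static_cast<int> (xprd/h_x);
       ny_pppm = static_cast<int> (yprd/h_y);
       nz_pppm = static_cast<int> (zprd_slab/h_z);
       if (nx_pppm <= 1) nx_pppm = 2;
       if (ny_pppm <= 1) ny_pppm = 2;
       if (nz_pppm <= 1) nz_pppm = 2;

       double qopt = compute_qopt();
       double dfkspace = sqrt(qopt/natoms)*q2/(xprd*yprd*zprd_slab);

       count++;
       if (dfkspace <= accuracy) break;
       if (count > 500) error->all(FLERR,"Could not compute grid size");
       h *= 0.95;
       h_x = h_y = h_z = h;
     }
------------------------------------------------------------------------- */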
nxlo_o,int& nylo_o,int& nzlo_o, - int& nxhi_o,int& nyhi_o,int& nzhi_o, - int& nlow, int& nupp, - int& ng, int& nf, int& nfb, - double& sft,double& sftone, int& ord) + int& nxlo_f,int& nylo_f,int& nzlo_f, + int& nxhi_f,int& nyhi_f,int& nzhi_f, + int& nxlo_i,int& nylo_i,int& nzlo_i, + int& nxhi_i,int& nyhi_i,int& nzhi_i, + int& nxlo_o,int& nylo_o,int& nzlo_o, + int& nxhi_o,int& nyhi_o,int& nzhi_o, + int& nlow, int& nupp, + int& ng, int& nf, int& nfb, + double& sft,double& sftone, int& ord) { // global indices of PPPM grid range from 0 to N-1 // nlo_in,nhi_in = lower/upper limits of the 3d sub-brick of @@ -2770,20 +2784,16 @@ void PPPMDisp::set_fft_parameters(int& nx_p,int& ny_p,int& nz_p, // PPPM grid for this proc, including ghosts - ng = (nxhi_o-nxlo_o+1) * (nyhi_o-nylo_o+1) * - (nzhi_o-nzlo_o+1); + ng = (nxhi_o-nxlo_o+1) * (nyhi_o-nylo_o+1) * (nzhi_o-nzlo_o+1); // FFT arrays on this proc, without ghosts // nfft = FFT points in FFT decomposition on this proc // nfft_brick = FFT points in 3d brick-decomposition on this proc // nfft_both = greater of 2 values - nf = (nxhi_f-nxlo_f+1) * (nyhi_f-nylo_f+1) * - (nzhi_f-nzlo_f+1); - int nfft_brick = (nxhi_i-nxlo_i+1) * (nyhi_i-nylo_i+1) * - (nzhi_i-nzlo_i+1); + nf = (nxhi_f-nxlo_f+1) * (nyhi_f-nylo_f+1) * (nzhi_f-nzlo_f+1); + int nfft_brick = (nxhi_i-nxlo_i+1) * (nyhi_i-nylo_i+1) * (nzhi_i-nzlo_i+1); nfb = MAX(nf,nfft_brick); - } /* ---------------------------------------------------------------------- @@ -2811,9 +2821,9 @@ int PPPMDisp::factorable(int n) /* ---------------------------------------------------------------------- pre-compute Green's function denominator expansion coeffs, Gamma(2n) ------------------------------------------------------------------------- */ + void PPPMDisp::adjust_gewald() { - // Use Newton solver to find g_ewald double dx; @@ -2829,12 +2839,11 @@ void PPPMDisp::adjust_gewald() // Failed to converge error->all(FLERR, "Could not compute g_ewald"); - } /* ---------------------------------------------------------------------- - Calculate f(x) - ------------------------------------------------------------------------- */ + calculate f(x) +------------------------------------------------------------------------- */ double PPPMDisp::f() { @@ -2856,9 +2865,9 @@ double PPPMDisp::f() } /* ---------------------------------------------------------------------- - Calculate numerical derivative f'(x) using forward difference - [f(x + h) - f(x)] / h - ------------------------------------------------------------------------- */ + calculate numerical derivative f'(x) using forward difference + [f(x + h) - f(x)] / h +------------------------------------------------------------------------- */ double PPPMDisp::derivf() { @@ -2876,7 +2885,7 @@ double PPPMDisp::derivf() } /* ---------------------------------------------------------------------- - Calculate the final estimator for the accuracy + calculate the final estimator for the accuracy ------------------------------------------------------------------------- */ double PPPMDisp::final_accuracy() @@ -2900,7 +2909,7 @@ double PPPMDisp::final_accuracy() } /* ---------------------------------------------------------------------- - Calculate the final estimator for the Dispersion accuracy + calculate the final estimator for the Dispersion accuracy ------------------------------------------------------------------------- */ void PPPMDisp::final_accuracy_6(double& acc, double& acc_real, double& acc_kspace) @@ -2913,54 +2922,46 @@ void PPPMDisp::final_accuracy_6(double& acc, double& 
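/* ----------------------------------------------------------------------
   NOTE (condensed illustration, not applied by this patch): adjust_gewald()
   above is a plain Newton iteration on g_ewald.  f() measures how far the
   current g_ewald is from the accuracy target and derivf() supplies a
   forward-difference derivative; folded into one loop the logic is roughly

     for (int i = 0; i < LARGE; i++) {
       double h = 0.000001;            // derivative step size
       double f0 = f();                // residual at current g_ewald
       double g_save = g_ewald;
       g_ewald = g_save + h;
       double df = (f() - f0)/h;       // forward difference, cf. derivf()
       g_ewald = g_save - f0/df;       // Newton update
       if (fabs(f()) < SMALL) return;  // converged
     }
     error->all(FLERR,"Could not compute g_ewald");
------------------------------------------------------------------------- */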
acc_real, double& acc_kspac acc_real = lj_rspace_error(); double qopt = compute_qopt_6(); - acc_kspace = sqrt(qopt/natoms)*csum/(xprd*yprd*zprd_slab); - acc = sqrt(acc_real*acc_real + acc_kspace*acc_kspace); + return; } /* ---------------------------------------------------------------------- - Compute qopt for Coulomb interactions + compute qopt for Coulomb interactions ------------------------------------------------------------------------- */ double PPPMDisp::compute_qopt() { double qopt; - if (differentiation_flag == 1) { - qopt = compute_qopt_ad(); - } else { - qopt = compute_qopt_ik(); - } + if (differentiation_flag == 1) qopt = compute_qopt_ad(); + else qopt = compute_qopt_ik(); double qopt_all; MPI_Allreduce(&qopt,&qopt_all,1,MPI_DOUBLE,MPI_SUM,world); return qopt_all; } /* ---------------------------------------------------------------------- - Compute qopt for Dispersion interactions + compute qopt for Dispersion interactions ------------------------------------------------------------------------- */ double PPPMDisp::compute_qopt_6() { double qopt; - if (differentiation_flag == 1) { - qopt = compute_qopt_6_ad(); - } else { - qopt = compute_qopt_6_ik(); - } + if (differentiation_flag == 1) qopt = compute_qopt_6_ad(); + else qopt = compute_qopt_6_ik(); double qopt_all; MPI_Allreduce(&qopt,&qopt_all,1,MPI_DOUBLE,MPI_SUM,world); return qopt_all; } /* ---------------------------------------------------------------------- - Compute qopt for the ik differentiation scheme and Coulomb interaction + compute qopt for the ik differentiation scheme and Coulomb interaction ------------------------------------------------------------------------- */ double PPPMDisp::compute_qopt_ik() { - double qopt = 0.0; int k,l,m; double *prd; @@ -2985,67 +2986,73 @@ double PPPMDisp::compute_qopt_ik() int nby = 2; int nbz = 2; - for (m = nzlo_fft; m <= nzhi_fft; m++) { - mper = m - nz_pppm*(2*m/nz_pppm); + // loop over entire FFT grid + // each proc calculates contributions from every Pth grid point + + bigint ngridtotal = (bigint) nx_pppm * ny_pppm * nz_pppm; + int nxy_pppm = nx_pppm * ny_pppm; - for (l = nylo_fft; l <= nyhi_fft; l++) { - lper = l - ny_pppm*(2*l/ny_pppm); + double qopt = 0.0; - for (k = nxlo_fft; k <= nxhi_fft; k++) { - kper = k - nx_pppm*(2*k/nx_pppm); + for (bigint i = me; i < ngridtotal; i += nprocs) { + k = i % nx_pppm; + l = (i/nx_pppm) % ny_pppm; + m = i / nxy_pppm; - sqk = pow(unitkx*kper,2.0) + pow(unitky*lper,2.0) + - pow(unitkz*mper,2.0); + const int kper = k - nx_pppm*(2*k/nx_pppm); + const int lper = l - ny_pppm*(2*l/ny_pppm); + const int mper = m - nz_pppm*(2*m/nz_pppm); - if (sqk != 0.0) { - sum1 = 0.0; - sum2 = 0.0; - sum3 = 0.0; - for (nx = -nbx; nx <= nbx; nx++) { - qx = unitkx*(kper+nx_pppm*nx); - sx = exp(-0.25*pow(qx/g_ewald,2.0)); - wx = 1.0; - argx = 0.5*qx*xprd/nx_pppm; - if (argx != 0.0) wx = pow(sin(argx)/argx,order); - for (ny = -nby; ny <= nby; ny++) { - qy = unitky*(lper+ny_pppm*ny); - sy = exp(-0.25*pow(qy/g_ewald,2.0)); - wy = 1.0; - argy = 0.5*qy*yprd/ny_pppm; - if (argy != 0.0) wy = pow(sin(argy)/argy,order); - for (nz = -nbz; nz <= nbz; nz++) { - qz = unitkz*(mper+nz_pppm*nz); - sz = exp(-0.25*pow(qz/g_ewald,2.0)); - wz = 1.0; - argz = 0.5*qz*zprd_slab/nz_pppm; - if (argz != 0.0) wz = pow(sin(argz)/argz,order); - - dot1 = unitkx*kper*qx + unitky*lper*qy + unitkz*mper*qz; - dot2 = qx*qx+qy*qy+qz*qz; - u2 = pow(wx*wy*wz,2.0); - sum1 += sx*sy*sz*sx*sy*sz/dot2*4.0*4.0*MY_PI*MY_PI; - sum2 += u2*sx*sy*sz*4.0*MY_PI/dot2*dot1; - sum3 += u2; - } - } - } - sum2 *= 
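/* ----------------------------------------------------------------------
   NOTE (illustrative sketch, not applied by this patch): the rewritten
   compute_qopt_*() routines no longer loop over a per-proc FFT slab (the
   old code built a temporary npey_fft/npez_fft partition inside set_grid()
   just for this purpose).  Instead every proc walks the full global grid
   with a stride of nprocs, decodes the flat index into (k,l,m), and the
   per-proc partial sums are combined afterwards in compute_qopt():

     bigint ngridtotal = (bigint) nx_pppm * ny_pppm * nz_pppm;
     double qopt = 0.0;
     for (bigint i = me; i < ngridtotal; i += nprocs) {
       int k = i % nx_pppm;                          // fastest-varying index
       int l = (i/nx_pppm) % ny_pppm;
       int m = i / ((bigint) nx_pppm*ny_pppm);
       // ... accumulate this grid point's contribution to qopt ...
     }
     double qopt_all;
     MPI_Allreduce(&qopt,&qopt_all,1,MPI_DOUBLE,MPI_SUM,world);
------------------------------------------------------------------------- */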
sum2; - sum3 *= sum3*sqk; - qopt += sum1 -sum2/sum3; - } + sqk = pow(unitkx*kper,2.0) + pow(unitky*lper,2.0) + pow(unitkz*mper,2.0); + if (sqk == 0.0) continue; + + sum1 = sum2 = sum3 = 0.0; + + for (nx = -nbx; nx <= nbx; nx++) { + qx = unitkx*(kper+nx_pppm*nx); + sx = exp(-0.25*pow(qx/g_ewald,2.0)); + wx = 1.0; + argx = 0.5*qx*xprd/nx_pppm; + if (argx != 0.0) wx = pow(sin(argx)/argx,order); + + for (ny = -nby; ny <= nby; ny++) { + qy = unitky*(lper+ny_pppm*ny); + sy = exp(-0.25*pow(qy/g_ewald,2.0)); + wy = 1.0; + argy = 0.5*qy*yprd/ny_pppm; + if (argy != 0.0) wy = pow(sin(argy)/argy,order); + + for (nz = -nbz; nz <= nbz; nz++) { + qz = unitkz*(mper+nz_pppm*nz); + sz = exp(-0.25*pow(qz/g_ewald,2.0)); + wz = 1.0; + argz = 0.5*qz*zprd_slab/nz_pppm; + if (argz != 0.0) wz = pow(sin(argz)/argz,order); + + dot1 = unitkx*kper*qx + unitky*lper*qy + unitkz*mper*qz; + dot2 = qx*qx+qy*qy+qz*qz; + u2 = pow(wx*wy*wz,2.0); + sum1 += sx*sy*sz*sx*sy*sz/dot2*4.0*4.0*MY_PI*MY_PI; + sum2 += u2*sx*sy*sz*4.0*MY_PI/dot2*dot1; + sum3 += u2; + } } } + + sum2 *= sum2; + sum3 *= sum3*sqk; + qopt += sum1 -sum2/sum3; } + return qopt; } /* ---------------------------------------------------------------------- - Compute qopt for the ad differentiation scheme and Coulomb interaction + compute qopt for the ad differentiation scheme and Coulomb interaction ------------------------------------------------------------------------- */ double PPPMDisp::compute_qopt_ad() { - double qopt = 0.0; int k,l,m; double *prd; @@ -3057,7 +3064,6 @@ double PPPMDisp::compute_qopt_ad() double zprd = prd[2]; double zprd_slab = zprd*slab_volfactor; - double unitkx = (2.0*MY_PI/xprd); double unitky = (2.0*MY_PI/yprd); double unitkz = (2.0*MY_PI/zprd_slab); @@ -3071,68 +3077,72 @@ double PPPMDisp::compute_qopt_ad() int nby = 2; int nbz = 2; - for (m = nzlo_fft; m <= nzhi_fft; m++) { - mper = m - nz_pppm*(2*m/nz_pppm); + // loop over entire FFT grid + // each proc calculates contributions from every Pth grid point + + bigint ngridtotal = (bigint) nx_pppm * ny_pppm * nz_pppm; + int nxy_pppm = nx_pppm * ny_pppm; - for (l = nylo_fft; l <= nyhi_fft; l++) { - lper = l - ny_pppm*(2*l/ny_pppm); + double qopt = 0.0; - for (k = nxlo_fft; k <= nxhi_fft; k++) { - kper = k - nx_pppm*(2*k/nx_pppm); + for (bigint i = me; i < ngridtotal; i += nprocs) { + k = i % nx_pppm; + l = (i/nx_pppm) % ny_pppm; + m = i / nxy_pppm; - sqk = pow(unitkx*kper,2.0) + pow(unitky*lper,2.0) + - pow(unitkz*mper,2.0); + const int kper = k - nx_pppm*(2*k/nx_pppm); + const int lper = l - ny_pppm*(2*l/ny_pppm); + const int mper = m - nz_pppm*(2*m/nz_pppm); - if (sqk != 0.0) { + sqk = pow(unitkx*kper,2.0) + pow(unitky*lper,2.0) + pow(unitkz*mper,2.0); + if (sqk == 0.0) continue; - sum1 = 0.0; - sum2 = 0.0; - sum3 = 0.0; - sum4 = 0.0; - for (nx = -nbx; nx <= nbx; nx++) { - qx = unitkx*(kper+nx_pppm*nx); - sx = exp(-0.25*pow(qx/g_ewald,2.0)); - wx = 1.0; - argx = 0.5*qx*xprd/nx_pppm; - if (argx != 0.0) wx = pow(sin(argx)/argx,order); - for (ny = -nby; ny <= nby; ny++) { - qy = unitky*(lper+ny_pppm*ny); - sy = exp(-0.25*pow(qy/g_ewald,2.0)); - wy = 1.0; - argy = 0.5*qy*yprd/ny_pppm; - if (argy != 0.0) wy = pow(sin(argy)/argy,order); - for (nz = -nbz; nz <= nbz; nz++) { - qz = unitkz*(mper+nz_pppm*nz); - sz = exp(-0.25*pow(qz/g_ewald,2.0)); - wz = 1.0; - argz = 0.5*qz*zprd_slab/nz_pppm; - if (argz != 0.0) wz = pow(sin(argz)/argz,order); + sum1 = sum2 = sum3 = sum4 = 0.0; - dot2 = qx*qx+qy*qy+qz*qz; - u2 = pow(wx*wy*wz,2.0); - sum1 += sx*sy*sz*sx*sy*sz/dot2*4.0*4.0*MY_PI*MY_PI; - sum2 += 
sx*sy*sz * u2*4.0*MY_PI; - sum3 += u2; - sum4 += dot2*u2; - } - } - } - sum2 *= sum2; - qopt += sum1 - sum2/(sum3*sum4); - } + for (nx = -nbx; nx <= nbx; nx++) { + qx = unitkx*(kper+nx_pppm*nx); + sx = exp(-0.25*pow(qx/g_ewald,2.0)); + wx = 1.0; + argx = 0.5*qx*xprd/nx_pppm; + if (argx != 0.0) wx = pow(sin(argx)/argx,order); + + for (ny = -nby; ny <= nby; ny++) { + qy = unitky*(lper+ny_pppm*ny); + sy = exp(-0.25*pow(qy/g_ewald,2.0)); + wy = 1.0; + argy = 0.5*qy*yprd/ny_pppm; + if (argy != 0.0) wy = pow(sin(argy)/argy,order); + + for (nz = -nbz; nz <= nbz; nz++) { + qz = unitkz*(mper+nz_pppm*nz); + sz = exp(-0.25*pow(qz/g_ewald,2.0)); + wz = 1.0; + argz = 0.5*qz*zprd_slab/nz_pppm; + if (argz != 0.0) wz = pow(sin(argz)/argz,order); + + dot2 = qx*qx+qy*qy+qz*qz; + u2 = pow(wx*wy*wz,2.0); + sum1 += sx*sy*sz*sx*sy*sz/dot2*4.0*4.0*MY_PI*MY_PI; + sum2 += sx*sy*sz * u2*4.0*MY_PI; + sum3 += u2; + sum4 += dot2*u2; + } } } + + sum2 *= sum2; + qopt += sum1 - sum2/(sum3*sum4); } + return qopt; } /* ---------------------------------------------------------------------- - Compute qopt for the ik differentiation scheme and Dispersion interaction + compute qopt for the ik differentiation scheme and Dispersion interaction ------------------------------------------------------------------------- */ double PPPMDisp::compute_qopt_6_ik() { - double qopt = 0.0; int k,l,m; double *prd; @@ -3161,71 +3171,76 @@ double PPPMDisp::compute_qopt_6_ik() int nby = 2; int nbz = 2; - for (m = nzlo_fft_6; m <= nzhi_fft_6; m++) { - mper = m - nz_pppm_6*(2*m/nz_pppm_6); + // loop over entire FFT grid + // each proc calculates contributions from every Pth grid point + + bigint ngridtotal = (bigint) nx_pppm_6 * ny_pppm_6 * nz_pppm_6; + int nxy_pppm_6 = nx_pppm_6 * ny_pppm_6; - for (l = nylo_fft_6; l <= nyhi_fft_6; l++) { - lper = l - ny_pppm_6*(2*l/ny_pppm_6); + double qopt = 0.0; - for (k = nxlo_fft_6; k <= nxhi_fft_6; k++) { - kper = k - nx_pppm_6*(2*k/nx_pppm_6); + for (bigint i = me; i < ngridtotal; i += nprocs) { + k = i % nx_pppm_6; + l = (i/nx_pppm_6) % ny_pppm_6; + m = i / nxy_pppm_6; - sqk = pow(unitkx*kper,2.0) + pow(unitky*lper,2.0) + - pow(unitkz*mper,2.0); + const int kper = k - nx_pppm_6*(2*k/nx_pppm); + const int lper = l - ny_pppm_6*(2*l/ny_pppm); + const int mper = m - nz_pppm_6*(2*m/nz_pppm); - if (sqk != 0.0) { - sum1 = 0.0; - sum2 = 0.0; - sum3 = 0.0; - for (nx = -nbx; nx <= nbx; nx++) { - qx = unitkx*(kper+nx_pppm_6*nx); - sx = exp(-qx*qx*inv2ew*inv2ew); - wx = 1.0; - argx = 0.5*qx*xprd/nx_pppm_6; - if (argx != 0.0) wx = pow(sin(argx)/argx,order_6); - for (ny = -nby; ny <= nby; ny++) { - qy = unitky*(lper+ny_pppm_6*ny); - sy = exp(-qy*qy*inv2ew*inv2ew); - wy = 1.0; - argy = 0.5*qy*yprd/ny_pppm_6; - if (argy != 0.0) wy = pow(sin(argy)/argy,order_6); - for (nz = -nbz; nz <= nbz; nz++) { - qz = unitkz*(mper+nz_pppm_6*nz); - sz = exp(-qz*qz*inv2ew*inv2ew); - wz = 1.0; - argz = 0.5*qz*zprd_slab/nz_pppm_6; - if (argz != 0.0) wz = pow(sin(argz)/argz,order_6); + sqk = pow(unitkx*kper,2.0) + pow(unitky*lper,2.0) + pow(unitkz*mper,2.0); + if (sqk == 0.0) continue; + + sum1 = sum2 = sum3 = 0.0; - dot1 = unitkx*kper*qx + unitky*lper*qy + unitkz*mper*qz; - dot2 = qx*qx+qy*qy+qz*qz; - rtdot2 = sqrt(dot2); - term = (1-2*dot2*inv2ew*inv2ew)*sx*sy*sz + - 2*dot2*rtdot2*inv2ew*inv2ew*inv2ew*rtpi*erfc(rtdot2*inv2ew); - term *= g_ewald_6*g_ewald_6*g_ewald_6; - u2 = pow(wx*wy*wz,2.0); - sum1 += term*term*MY_PI*MY_PI*MY_PI/9.0 * dot2; - sum2 += -u2*term*MY_PI*rtpi/3.0*dot1; - sum3 += u2; - } - } - } - sum2 *= sum2; - sum3 *= 
sum3*sqk; - qopt += sum1 -sum2/sum3; - } + for (nx = -nbx; nx <= nbx; nx++) { + qx = unitkx*(kper+nx_pppm_6*nx); + sx = exp(-qx*qx*inv2ew*inv2ew); + wx = 1.0; + argx = 0.5*qx*xprd/nx_pppm_6; + if (argx != 0.0) wx = pow(sin(argx)/argx,order_6); + + for (ny = -nby; ny <= nby; ny++) { + qy = unitky*(lper+ny_pppm_6*ny); + sy = exp(-qy*qy*inv2ew*inv2ew); + wy = 1.0; + argy = 0.5*qy*yprd/ny_pppm_6; + if (argy != 0.0) wy = pow(sin(argy)/argy,order_6); + + for (nz = -nbz; nz <= nbz; nz++) { + qz = unitkz*(mper+nz_pppm_6*nz); + sz = exp(-qz*qz*inv2ew*inv2ew); + wz = 1.0; + argz = 0.5*qz*zprd_slab/nz_pppm_6; + if (argz != 0.0) wz = pow(sin(argz)/argz,order_6); + + dot1 = unitkx*kper*qx + unitky*lper*qy + unitkz*mper*qz; + dot2 = qx*qx+qy*qy+qz*qz; + rtdot2 = sqrt(dot2); + term = (1-2*dot2*inv2ew*inv2ew)*sx*sy*sz + + 2*dot2*rtdot2*inv2ew*inv2ew*inv2ew*rtpi*erfc(rtdot2*inv2ew); + term *= g_ewald_6*g_ewald_6*g_ewald_6; + u2 = pow(wx*wy*wz,2.0); + sum1 += term*term*MY_PI*MY_PI*MY_PI/9.0 * dot2; + sum2 += -u2*term*MY_PI*rtpi/3.0*dot1; + sum3 += u2; + } } } + sum2 *= sum2; + sum3 *= sum3*sqk; + qopt += sum1 -sum2/sum3; } + return qopt; } /* ---------------------------------------------------------------------- - Compute qopt for the ad differentiation scheme and Dispersion interaction + compute qopt for the ad differentiation scheme and Dispersion interaction ------------------------------------------------------------------------- */ double PPPMDisp::compute_qopt_6_ad() { - double qopt = 0.0; int k,l,m; double *prd; @@ -3254,62 +3269,66 @@ double PPPMDisp::compute_qopt_6_ad() int nby = 2; int nbz = 2; - for (m = nzlo_fft_6; m <= nzhi_fft_6; m++) { - mper = m - nz_pppm_6*(2*m/nz_pppm_6); + // loop over entire FFT grid + // each proc calculates contributions from every Pth grid point + + bigint ngridtotal = (bigint) nx_pppm_6 * ny_pppm_6 * nz_pppm_6; + int nxy_pppm_6 = nx_pppm_6 * ny_pppm_6; - for (l = nylo_fft_6; l <= nyhi_fft_6; l++) { - lper = l - ny_pppm_6*(2*l/ny_pppm_6); + double qopt = 0.0; - for (k = nxlo_fft_6; k <= nxhi_fft_6; k++) { - kper = k - nx_pppm_6*(2*k/nx_pppm_6); + for (bigint i = me; i < ngridtotal; i += nprocs) { + k = i % nx_pppm_6; + l = (i/nx_pppm_6) % ny_pppm_6; + m = i / nxy_pppm_6; - sqk = pow(unitkx*kper,2.0) + pow(unitky*lper,2.0) + - pow(unitkz*mper,2.0); + const int kper = k - nx_pppm_6*(2*k/nx_pppm); + const int lper = l - ny_pppm_6*(2*l/ny_pppm); + const int mper = m - nz_pppm_6*(2*m/nz_pppm); - if (sqk != 0.0) { + sqk = pow(unitkx*kper,2.0) + pow(unitky*lper,2.0) + pow(unitkz*mper,2.0); + if (sqk == 0.0) continue; + + sum1 = sum2 = sum3 = sum4 = 0.0; - sum1 = 0.0; - sum2 = 0.0; - sum3 = 0.0; - sum4 = 0.0; - for (nx = -nbx; nx <= nbx; nx++) { - qx = unitkx*(kper+nx_pppm_6*nx); - sx = exp(-qx*qx*inv2ew*inv2ew); - wx = 1.0; - argx = 0.5*qx*xprd/nx_pppm_6; - if (argx != 0.0) wx = pow(sin(argx)/argx,order_6); - for (ny = -nby; ny <= nby; ny++) { - qy = unitky*(lper+ny_pppm_6*ny); - sy = exp(-qy*qy*inv2ew*inv2ew); - wy = 1.0; - argy = 0.5*qy*yprd/ny_pppm_6; - if (argy != 0.0) wy = pow(sin(argy)/argy,order_6); - for (nz = -nbz; nz <= nbz; nz++) { - qz = unitkz*(mper+nz_pppm_6*nz); - sz = exp(-qz*qz*inv2ew*inv2ew); - wz = 1.0; - argz = 0.5*qz*zprd_slab/nz_pppm_6; - if (argz != 0.0) wz = pow(sin(argz)/argz,order_6); + for (nx = -nbx; nx <= nbx; nx++) { + qx = unitkx*(kper+nx_pppm_6*nx); + sx = exp(-qx*qx*inv2ew*inv2ew); + wx = 1.0; + argx = 0.5*qx*xprd/nx_pppm_6; + if (argx != 0.0) wx = pow(sin(argx)/argx,order_6); - dot2 = qx*qx+qy*qy+qz*qz; - rtdot2 = sqrt(dot2); - term = 
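/* ----------------------------------------------------------------------
   NOTE: in the dispersion variants compute_qopt_6_ik() and
   compute_qopt_6_ad() above, the index folding is written as

     const int kper = k - nx_pppm_6*(2*k/nx_pppm);

   i.e. the divisor is the Coulomb grid dimension.  Unless the two grids
   are guaranteed to coincide, the dispersion dimension is presumably
   intended throughout, as in the Coulomb routines:

     const int kper = k - nx_pppm_6*(2*k/nx_pppm_6);
     const int lper = l - ny_pppm_6*(2*l/ny_pppm_6);
     const int mper = m - nz_pppm_6*(2*m/nz_pppm_6);
------------------------------------------------------------------------- */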
(1-2*dot2*inv2ew*inv2ew)*sx*sy*sz + - 2*dot2*rtdot2*inv2ew*inv2ew*inv2ew*rtpi*erfc(rtdot2*inv2ew); - term *= g_ewald_6*g_ewald_6*g_ewald_6; - u2 = pow(wx*wy*wz,2.0); - sum1 += term*term*MY_PI*MY_PI*MY_PI/9.0 * dot2; - sum2 += -term*MY_PI*rtpi/3.0 * u2 * dot2; - sum3 += u2; - sum4 += dot2*u2; - } - } - } - sum2 *= sum2; - qopt += sum1 - sum2/(sum3*sum4); - } + for (ny = -nby; ny <= nby; ny++) { + qy = unitky*(lper+ny_pppm_6*ny); + sy = exp(-qy*qy*inv2ew*inv2ew); + wy = 1.0; + argy = 0.5*qy*yprd/ny_pppm_6; + if (argy != 0.0) wy = pow(sin(argy)/argy,order_6); + + for (nz = -nbz; nz <= nbz; nz++) { + qz = unitkz*(mper+nz_pppm_6*nz); + sz = exp(-qz*qz*inv2ew*inv2ew); + wz = 1.0; + argz = 0.5*qz*zprd_slab/nz_pppm_6; + if (argz != 0.0) wz = pow(sin(argz)/argz,order_6); + + dot2 = qx*qx+qy*qy+qz*qz; + rtdot2 = sqrt(dot2); + term = (1-2*dot2*inv2ew*inv2ew)*sx*sy*sz + + 2*dot2*rtdot2*inv2ew*inv2ew*inv2ew*rtpi*erfc(rtdot2*inv2ew); + term *= g_ewald_6*g_ewald_6*g_ewald_6; + u2 = pow(wx*wy*wz,2.0); + sum1 += term*term*MY_PI*MY_PI*MY_PI/9.0 * dot2; + sum2 += -term*MY_PI*rtpi/3.0 * u2 * dot2; + sum3 += u2; + sum4 += dot2*u2; + } } } + sum2 *= sum2; + qopt += sum1 - sum2/(sum3*sum4); } + return qopt; } @@ -3320,7 +3339,8 @@ double PPPMDisp::compute_qopt_6_ad() void PPPMDisp::set_grid_6() { - // Calculate csum + // calculate csum + if (!csumflag) calc_csum(); if (!gewaldflag_6) set_init_g6(); if (!gridflag_6) set_n_pppm_6(); @@ -3352,8 +3372,7 @@ void PPPMDisp::calc_csum() int *neach = new int[ntypes+1]; for (i = 0; i<=ntypes; i++) neach[i] = 0; - //the following variables are needed to distinguish between arithmetic - // and geometric mixing + // following variables distinguish between arithmetic and geometric mixing if (function[1]) { for (i = 1; i <= ntypes; i++) @@ -3388,7 +3407,6 @@ void PPPMDisp::calc_csum() } } - double tmp2; MPI_Allreduce(&csum,&tmp2,1,MPI_DOUBLE,MPI_SUM,world); csum = tmp2; @@ -3398,8 +3416,10 @@ void PPPMDisp::calc_csum() MPI_Allreduce(neach,neach_all,ntypes+1,MPI_INT,MPI_SUM,world); // copmute csumij and csumi + double d1, d2; - if (function[1]){ + + if (function[1]) { for (i=1; i<=ntypes; i++) { for (j=1; j<=ntypes; j++) { csumi[i] += neach_all[j]*B[i]*B[j]; @@ -3410,6 +3430,7 @@ void PPPMDisp::calc_csum() } } } + if (function[2]) { for (i=1; i<=ntypes; i++) { for (j=1; j<=ntypes; j++) { @@ -3423,6 +3444,7 @@ void PPPMDisp::calc_csum() } } } + if (function[3]) { for (i=1; i<=ntypes; i++) { for (j=1; j<=ntypes; j++) { @@ -3446,10 +3468,11 @@ void PPPMDisp::calc_csum() void PPPMDisp::adjust_gewald_6() { - // Use Newton solver to find g_ewald_6 + // use Newton solver to find g_ewald_6 + double dx; - // Start loop + // start loop for (int i = 0; i < LARGE; i++) { dx = f_6() / derivf_6(); @@ -3457,15 +3480,14 @@ void PPPMDisp::adjust_gewald_6() if (fabs(f_6()) < SMALL) return; } - // Failed to converge + // failed to converge error->all(FLERR, "Could not adjust g_ewald_6"); - } /* ---------------------------------------------------------------------- - Calculate f(x) for Dispersion interaction - ------------------------------------------------------------------------- */ + calculate f(x) for Dispersion interaction +------------------------------------------------------------------------- */ double PPPMDisp::f_6() { @@ -3490,13 +3512,13 @@ double PPPMDisp::f_6() } /* ---------------------------------------------------------------------- - Calculate numerical derivative f'(x) using forward difference - [f(x + h) - f(x)] / h - 
------------------------------------------------------------------------- */ + calculate numerical derivative f'(x) using forward difference + [f(x + h) - f(x)] / h +------------------------------------------------------------------------- */ double PPPMDisp::derivf_6() { - double h = 0.000001; //Derivative step-size + double h = 0.000001; // derivative step-size double df,f1,f2,g_ewald_old; f1 = f_6(); @@ -3512,7 +3534,7 @@ double PPPMDisp::derivf_6() /* ---------------------------------------------------------------------- calculate an initial value for g_ewald_6 - ---------------------------------------------------------------------- */ + ---------------------------------------------------------------------- */ void PPPMDisp::set_init_g6() { @@ -3527,11 +3549,13 @@ void PPPMDisp::set_init_g6() // if df_real > 0, repeat divide g_ewald_6 by 2 until df_real < 0 // else, repeat multiply g_ewald_6 by 2 until df_real > 0 // perform bisection for the last two values of + double df_real; double g_ewald_old; double gmin, gmax; // check if there is a user defined accuracy + double acc_rspace = accuracy; if (accuracy_real_6 > 0) acc_rspace = accuracy_real_6; @@ -3570,12 +3594,11 @@ void PPPMDisp::set_init_g6() g_ewald_6 = gmin + 0.5*(gmax-gmin); } if (counter >= LARGE-1) error->all(FLERR,"Cannot compute initial g_ewald_disp"); - } /* ---------------------------------------------------------------------- calculate nx_pppm, ny_pppm, nz_pppm for dispersion interaction - ---------------------------------------------------------------------- */ + ---------------------------------------------------------------------- */ void PPPMDisp::set_n_pppm_6() { @@ -3596,10 +3619,13 @@ void PPPMDisp::set_n_pppm_6() if (accuracy_kspace_6 > 0.0) acc_kspace = accuracy_kspace_6; // initial value for the grid spacing + h = h_x = h_y = h_z = 4.0/g_ewald_6; + // decrease grid spacing until required precision is obtained + int count = 0; - while(1) { + while (1) { // set grid dimension nx_pppm_6 = static_cast (xprd/h_x); @@ -3633,7 +3659,9 @@ void PPPMDisp::set_n_pppm_6() count++; - // break loop if the accuracy has been reached or too many loops have been performed + // break loop if the accuracy has been reached or + // too many loops have been performed + if (df_kspace <= acc_kspace) break; if (count > 500) error->all(FLERR, "Could not compute grid size for Dispersion"); h *= 0.95; @@ -3643,7 +3671,7 @@ void PPPMDisp::set_n_pppm_6() /* ---------------------------------------------------------------------- calculate the real space error for dispersion interactions - ---------------------------------------------------------------------- */ +---------------------------------------------------------------------- */ double PPPMDisp::lj_rspace_error() { @@ -3657,15 +3685,15 @@ double PPPMDisp::lj_rspace_error() double rgs = (cutoff_lj*g_ewald_6); rgs *= rgs; double rgs_inv = 1.0/rgs; - deltaf = csum/sqrt(natoms*xprd*yprd*zprd_slab*cutoff_lj)*sqrt(MY_PI)*pow(g_ewald_6, 5)* + deltaf = csum/sqrt(natoms*xprd*yprd*zprd_slab*cutoff_lj)* + sqrt(MY_PI)*pow(g_ewald_6, 5)* exp(-rgs)*(1+rgs_inv*(3+rgs_inv*(6+rgs_inv*6))); return deltaf; } - /* ---------------------------------------------------------------------- - Compyute the modified (hockney-eastwood) coulomb green function - ---------------------------------------------------------------------- */ + compute the modified (hockney-eastwood) coulomb green function +---------------------------------------------------------------------- */ void PPPMDisp::compute_gf() { @@ -3691,7 
+3719,6 @@ void PPPMDisp::compute_gf() double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz; double numerator,denominator; - n = 0; for (m = nzlo_fft; m <= nzhi_fft; m++) { mper = m - nz_pppm*(2*m/nz_pppm); @@ -3744,12 +3771,11 @@ void PPPMDisp::compute_gf() ------------------------------------------------------------------------- */ void PPPMDisp::compute_sf_precoeff(int nxp, int nyp, int nzp, int ord, - int nxlo_ft, int nylo_ft, int nzlo_ft, - int nxhi_ft, int nyhi_ft, int nzhi_ft, - double *sf_pre1, double *sf_pre2, double *sf_pre3, - double *sf_pre4, double *sf_pre5, double *sf_pre6) + int nxlo_ft, int nylo_ft, int nzlo_ft, + int nxhi_ft, int nyhi_ft, int nzhi_ft, + double *sf_pre1, double *sf_pre2, double *sf_pre3, + double *sf_pre4, double *sf_pre5, double *sf_pre6) { - int i,k,l,m,n; double *prd; @@ -3866,8 +3892,8 @@ void PPPMDisp::compute_sf_precoeff(int nxp, int nyp, int nzp, int ord, } /* ---------------------------------------------------------------------- - Compute the modified (hockney-eastwood) dispersion green function - ---------------------------------------------------------------------- */ + compute the modified (hockney-eastwood) dispersion green function + ---------------------------------------------------------------------- */ void PPPMDisp::compute_gf_6() { @@ -3986,7 +4012,7 @@ void PPPMDisp::compute_sf_coeff() } } - // Compute the coefficients for the self-force correction + // compute the coefficients for the self-force correction double prex, prey, prez; prex = prey = prez = MY_PI/volume; @@ -4063,7 +4089,6 @@ void PPPMDisp::compute_sf_coeff_6() double tmp[6]; MPI_Allreduce(sf_coeff_6,tmp,6,MPI_DOUBLE,MPI_SUM,world); for (n = 0; n < 6; n++) sf_coeff_6[n] = tmp[n]; - } /* ---------------------------------------------------------------------- @@ -4498,20 +4523,22 @@ void PPPMDisp::make_rho_none() ------------------------------------------------------------------------- */ void PPPMDisp::poisson_ik(FFT_SCALAR* wk1, FFT_SCALAR* wk2, - FFT_SCALAR* dfft, LAMMPS_NS::FFT3d* ft1,LAMMPS_NS::FFT3d* ft2, - int nx_p, int ny_p, int nz_p, int nft, - int nxlo_ft, int nylo_ft, int nzlo_ft, - int nxhi_ft, int nyhi_ft, int nzhi_ft, - int nxlo_i, int nylo_i, int nzlo_i, - int nxhi_i, int nyhi_i, int nzhi_i, - double& egy, double* gfn, - double* kx, double* ky, double* kz, - double* kx2, double* ky2, double* kz2, - FFT_SCALAR*** vx_brick, FFT_SCALAR*** vy_brick, FFT_SCALAR*** vz_brick, - double* vir, double** vcoeff, double** vcoeff2, - FFT_SCALAR*** u_pa, FFT_SCALAR*** v0_pa, FFT_SCALAR*** v1_pa, FFT_SCALAR*** v2_pa, - FFT_SCALAR*** v3_pa, FFT_SCALAR*** v4_pa, FFT_SCALAR*** v5_pa) - + FFT_SCALAR* dfft, LAMMPS_NS::FFT3d* ft1, + LAMMPS_NS::FFT3d* ft2, + int nx_p, int ny_p, int nz_p, int nft, + int nxlo_ft, int nylo_ft, int nzlo_ft, + int nxhi_ft, int nyhi_ft, int nzhi_ft, + int nxlo_i, int nylo_i, int nzlo_i, + int nxhi_i, int nyhi_i, int nzhi_i, + double& egy, double* gfn, + double* kx, double* ky, double* kz, + double* kx2, double* ky2, double* kz2, + FFT_SCALAR*** vx_brick, FFT_SCALAR*** vy_brick, + FFT_SCALAR*** vz_brick, + double* vir, double** vcoeff, double** vcoeff2, + FFT_SCALAR*** u_pa, FFT_SCALAR*** v0_pa, + FFT_SCALAR*** v1_pa, FFT_SCALAR*** v2_pa, + FFT_SCALAR*** v3_pa, FFT_SCALAR*** v4_pa, FFT_SCALAR*** v5_pa) { int i,j,k,n; @@ -4642,18 +4669,18 @@ void PPPMDisp::poisson_ik(FFT_SCALAR* wk1, FFT_SCALAR* wk2, ------------------------------------------------------------------------- */ void PPPMDisp::poisson_ad(FFT_SCALAR* wk1, FFT_SCALAR* wk2, - FFT_SCALAR* dfft, 
LAMMPS_NS::FFT3d* ft1,LAMMPS_NS::FFT3d* ft2, - int nx_p, int ny_p, int nz_p, int nft, - int nxlo_ft, int nylo_ft, int nzlo_ft, - int nxhi_ft, int nyhi_ft, int nzhi_ft, - int nxlo_i, int nylo_i, int nzlo_i, - int nxhi_i, int nyhi_i, int nzhi_i, - double& egy, double* gfn, - double* vir, double** vcoeff, double** vcoeff2, - FFT_SCALAR*** u_pa, FFT_SCALAR*** v0_pa, FFT_SCALAR*** v1_pa, FFT_SCALAR*** v2_pa, - FFT_SCALAR*** v3_pa, FFT_SCALAR*** v4_pa, FFT_SCALAR*** v5_pa) - - + FFT_SCALAR* dfft, LAMMPS_NS::FFT3d* ft1,LAMMPS_NS::FFT3d* ft2, + int nx_p, int ny_p, int nz_p, int nft, + int nxlo_ft, int nylo_ft, int nzlo_ft, + int nxhi_ft, int nyhi_ft, int nzhi_ft, + int nxlo_i, int nylo_i, int nzlo_i, + int nxhi_i, int nyhi_i, int nzhi_i, + double& egy, double* gfn, + double* vir, double** vcoeff, double** vcoeff2, + FFT_SCALAR*** u_pa, FFT_SCALAR*** v0_pa, + FFT_SCALAR*** v1_pa, FFT_SCALAR*** v2_pa, + FFT_SCALAR*** v3_pa, FFT_SCALAR*** v4_pa, + FFT_SCALAR*** v5_pa) { int i,j,k,n; double eng; @@ -4733,11 +4760,13 @@ void PPPMDisp::poisson_ad(FFT_SCALAR* wk1, FFT_SCALAR* wk2, ------------------------------------------------------------------------- */ void PPPMDisp:: poisson_peratom(FFT_SCALAR* wk1, FFT_SCALAR* wk2, LAMMPS_NS::FFT3d* ft2, - double** vcoeff, double** vcoeff2, int nft, - int nxlo_i, int nylo_i, int nzlo_i, - int nxhi_i, int nyhi_i, int nzhi_i, - FFT_SCALAR*** v0_pa, FFT_SCALAR*** v1_pa, FFT_SCALAR*** v2_pa, - FFT_SCALAR*** v3_pa, FFT_SCALAR*** v4_pa, FFT_SCALAR*** v5_pa) + double** vcoeff, double** vcoeff2, int nft, + int nxlo_i, int nylo_i, int nzlo_i, + int nxhi_i, int nyhi_i, int nzhi_i, + FFT_SCALAR*** v0_pa, FFT_SCALAR*** v1_pa, + FFT_SCALAR*** v2_pa, + FFT_SCALAR*** v3_pa, FFT_SCALAR*** v4_pa, + FFT_SCALAR*** v5_pa) { //v0 & v1 term int n, i, j, k; @@ -4803,13 +4832,16 @@ void PPPMDisp:: poisson_peratom(FFT_SCALAR* wk1, FFT_SCALAR* wk2, LAMMPS_NS::FFT for ik scheme ------------------------------------------------------------------------- */ -void PPPMDisp::poisson_2s_ik(FFT_SCALAR* dfft_1, FFT_SCALAR* dfft_2, - FFT_SCALAR*** vxbrick_1, FFT_SCALAR*** vybrick_1, FFT_SCALAR*** vzbrick_1, - FFT_SCALAR*** vxbrick_2, FFT_SCALAR*** vybrick_2, FFT_SCALAR*** vzbrick_2, - FFT_SCALAR*** u_pa_1, FFT_SCALAR*** v0_pa_1, FFT_SCALAR*** v1_pa_1, FFT_SCALAR*** v2_pa_1, - FFT_SCALAR*** v3_pa_1, FFT_SCALAR*** v4_pa_1, FFT_SCALAR*** v5_pa_1, - FFT_SCALAR*** u_pa_2, FFT_SCALAR*** v0_pa_2, FFT_SCALAR*** v1_pa_2, FFT_SCALAR*** v2_pa_2, - FFT_SCALAR*** v3_pa_2, FFT_SCALAR*** v4_pa_2, FFT_SCALAR*** v5_pa_2) +void PPPMDisp:: +poisson_2s_ik(FFT_SCALAR* dfft_1, FFT_SCALAR* dfft_2, + FFT_SCALAR*** vxbrick_1, FFT_SCALAR*** vybrick_1, FFT_SCALAR*** vzbrick_1, + FFT_SCALAR*** vxbrick_2, FFT_SCALAR*** vybrick_2, FFT_SCALAR*** vzbrick_2, + FFT_SCALAR*** u_pa_1, FFT_SCALAR*** v0_pa_1, + FFT_SCALAR*** v1_pa_1, FFT_SCALAR*** v2_pa_1, + FFT_SCALAR*** v3_pa_1, FFT_SCALAR*** v4_pa_1, FFT_SCALAR*** v5_pa_1, + FFT_SCALAR*** u_pa_2, FFT_SCALAR*** v0_pa_2, + FFT_SCALAR*** v1_pa_2, FFT_SCALAR*** v2_pa_2, + FFT_SCALAR*** v3_pa_2, FFT_SCALAR*** v4_pa_2, FFT_SCALAR*** v5_pa_2) { int i,j,k,n; @@ -4848,7 +4880,8 @@ void PPPMDisp::poisson_2s_ik(FFT_SCALAR* dfft_1, FFT_SCALAR* dfft_2, if (vflag_global) { n = 0; for (i = 0; i < nfft_6; i++) { - eng = 2 * s2 * greensfn_6[i] * (work1_6[n]*work2_6[n+1] - work1_6[n+1]*work2_6[n]); + eng = 2 * s2 * greensfn_6[i] * + (work1_6[n]*work2_6[n+1] - work1_6[n+1]*work2_6[n]); for (j = 0; j < 6; j++) virial_6[j] += eng*vg_6[i][j]; if (eflag_global)energy_6 += eng; n += 2; @@ -4861,7 +4894,9 @@ 
void PPPMDisp::poisson_2s_ik(FFT_SCALAR* dfft_1, FFT_SCALAR* dfft_2, n += 2; } } + // unify the two transformed vectors for efficient calculations later + for ( i = 0; i < 2*nfft_6; i++) { work1_6[i] += work2_6[i]; } @@ -4961,8 +4996,10 @@ void PPPMDisp::poisson_2s_ik(FFT_SCALAR* dfft_1, FFT_SCALAR* dfft_2, } } - if (vflag_atom) poisson_2s_peratom(v0_pa_1, v1_pa_1, v2_pa_1, v3_pa_1, v4_pa_1, v5_pa_1, - v0_pa_2, v1_pa_2, v2_pa_2, v3_pa_2, v4_pa_2, v5_pa_2); + if (vflag_atom) poisson_2s_peratom(v0_pa_1, v1_pa_1, v2_pa_1, + v3_pa_1, v4_pa_1, v5_pa_1, + v0_pa_2, v1_pa_2, v2_pa_2, + v3_pa_2, v4_pa_2, v5_pa_2); } @@ -4971,11 +5008,15 @@ void PPPMDisp::poisson_2s_ik(FFT_SCALAR* dfft_1, FFT_SCALAR* dfft_2, for ik scheme ------------------------------------------------------------------------- */ -void PPPMDisp::poisson_none_ik(int n1, int n2,FFT_SCALAR* dfft_1, FFT_SCALAR* dfft_2, - FFT_SCALAR*** vxbrick_1, FFT_SCALAR*** vybrick_1, FFT_SCALAR*** vzbrick_1, - FFT_SCALAR*** vxbrick_2, FFT_SCALAR*** vybrick_2, FFT_SCALAR*** vzbrick_2, - FFT_SCALAR**** u_pa, FFT_SCALAR**** v0_pa, FFT_SCALAR**** v1_pa, FFT_SCALAR**** v2_pa, - FFT_SCALAR**** v3_pa, FFT_SCALAR**** v4_pa, FFT_SCALAR**** v5_pa) +void PPPMDisp:: +poisson_none_ik(int n1, int n2,FFT_SCALAR* dfft_1, FFT_SCALAR* dfft_2, + FFT_SCALAR*** vxbrick_1, FFT_SCALAR*** vybrick_1, + FFT_SCALAR*** vzbrick_1, + FFT_SCALAR*** vxbrick_2, FFT_SCALAR*** vybrick_2, + FFT_SCALAR*** vzbrick_2, + FFT_SCALAR**** u_pa, FFT_SCALAR**** v0_pa, + FFT_SCALAR**** v1_pa, FFT_SCALAR**** v2_pa, + FFT_SCALAR**** v3_pa, FFT_SCALAR**** v4_pa, FFT_SCALAR**** v5_pa) { int i,j,k,n; double eng; @@ -4983,8 +5024,8 @@ void PPPMDisp::poisson_none_ik(int n1, int n2,FFT_SCALAR* dfft_1, FFT_SCALAR* df double scaleinv = 1.0/(nx_pppm_6*ny_pppm_6*nz_pppm_6); // transform charge/dispersion density (r -> k) - // only one tansform required when energies and pressures do not - // need to be calculated + // only one tansform required when energies and pressures not needed + if (eflag_global + vflag_global == 0) { n = 0; for (i = 0; i < nfft_6; i++) { @@ -4995,9 +5036,8 @@ void PPPMDisp::poisson_none_ik(int n1, int n2,FFT_SCALAR* dfft_1, FFT_SCALAR* df fft1_6->compute(work1_6,work1_6,1); } - - // two transforms are required when energies and pressures are - // calculated + // two transforms are required when energies and pressures are calculated + else { n = 0; for (i = 0; i < nfft_6; i++) { @@ -5016,7 +5056,9 @@ void PPPMDisp::poisson_none_ik(int n1, int n2,FFT_SCALAR* dfft_1, FFT_SCALAR* df if (vflag_global) { n = 0; for (i = 0; i < nfft_6; i++) { - eng = s2 * greensfn_6[i] * (B[n1]*(work1_6[n]*work1_6[n] + work1_6[n+1]*work1_6[n+1]) + B[n2]*(work2_6[n]*work2_6[n] + work2_6[n+1]*work2_6[n+1])); + eng = s2 * greensfn_6[i] * + (B[n1]*(work1_6[n]*work1_6[n] + work1_6[n+1]*work1_6[n+1]) + + B[n2]*(work2_6[n]*work2_6[n] + work2_6[n+1]*work2_6[n+1])); for (j = 0; j < 6; j++) virial_6[j] += eng*vg_6[i][j]; if (eflag_global)energy_6 += eng; n += 2; @@ -5025,11 +5067,15 @@ void PPPMDisp::poisson_none_ik(int n1, int n2,FFT_SCALAR* dfft_1, FFT_SCALAR* df n = 0; for (i = 0; i < nfft_6; i++) { energy_6 += - s2 * greensfn_6[i] * (B[n1]*(work1_6[n]*work1_6[n] + work1_6[n+1]*work1_6[n+1]) + B[n2]*(work2_6[n]*work2_6[n] + work2_6[n+1]*work2_6[n+1])); + s2 * greensfn_6[i] * + (B[n1]*(work1_6[n]*work1_6[n] + work1_6[n+1]*work1_6[n+1]) + + B[n2]*(work2_6[n]*work2_6[n] + work2_6[n+1]*work2_6[n+1])); n += 2; } } + // unify the two transformed vectors for efficient calculations later + for ( i = 0; i < 2*nfft_6; i++) 
{ work1_6[i] += work2_6[i]; } @@ -5139,12 +5185,14 @@ void PPPMDisp::poisson_none_ik(int n1, int n2,FFT_SCALAR* dfft_1, FFT_SCALAR* df for ad scheme ------------------------------------------------------------------------- */ -void PPPMDisp::poisson_2s_ad(FFT_SCALAR* dfft_1, FFT_SCALAR* dfft_2, - FFT_SCALAR*** u_pa_1, FFT_SCALAR*** v0_pa_1, FFT_SCALAR*** v1_pa_1, FFT_SCALAR*** v2_pa_1, - FFT_SCALAR*** v3_pa_1, FFT_SCALAR*** v4_pa_1, FFT_SCALAR*** v5_pa_1, - FFT_SCALAR*** u_pa_2, FFT_SCALAR*** v0_pa_2, FFT_SCALAR*** v1_pa_2, FFT_SCALAR*** v2_pa_2, - FFT_SCALAR*** v3_pa_2, FFT_SCALAR*** v4_pa_2, FFT_SCALAR*** v5_pa_2) - +void PPPMDisp:: +poisson_2s_ad(FFT_SCALAR* dfft_1, FFT_SCALAR* dfft_2, + FFT_SCALAR*** u_pa_1, FFT_SCALAR*** v0_pa_1, + FFT_SCALAR*** v1_pa_1, FFT_SCALAR*** v2_pa_1, + FFT_SCALAR*** v3_pa_1, FFT_SCALAR*** v4_pa_1, FFT_SCALAR*** v5_pa_1, + FFT_SCALAR*** u_pa_2, FFT_SCALAR*** v0_pa_2, + FFT_SCALAR*** v1_pa_2, FFT_SCALAR*** v2_pa_2, + FFT_SCALAR*** v3_pa_2, FFT_SCALAR*** v4_pa_2, FFT_SCALAR*** v5_pa_2) { int i,j,k,n; double eng; @@ -5152,8 +5200,8 @@ void PPPMDisp::poisson_2s_ad(FFT_SCALAR* dfft_1, FFT_SCALAR* dfft_2, double scaleinv = 1.0/(nx_pppm_6*ny_pppm_6*nz_pppm_6); // transform charge/dispersion density (r -> k) - // only one tansform required when energies and pressures do not - // need to be calculated + // only one tansform required when energies and pressures not needed + if (eflag_global + vflag_global == 0) { n = 0; for (i = 0; i < nfft_6; i++) { @@ -5163,8 +5211,9 @@ void PPPMDisp::poisson_2s_ad(FFT_SCALAR* dfft_1, FFT_SCALAR* dfft_2, fft1_6->compute(work1_6,work1_6,1); } - // two transforms are required when energies and pressures are - // calculated + + // two transforms are required when energies and pressures are calculated + else { n = 0; for (i = 0; i < nfft_6; i++) { @@ -5182,7 +5231,8 @@ void PPPMDisp::poisson_2s_ad(FFT_SCALAR* dfft_1, FFT_SCALAR* dfft_2, if (vflag_global) { n = 0; for (i = 0; i < nfft_6; i++) { - eng = 2 * s2 * greensfn_6[i] * (work1_6[n]*work2_6[n+1] - work1_6[n+1]*work2_6[n]); + eng = 2 * s2 * greensfn_6[i] * + (work1_6[n]*work2_6[n+1] - work1_6[n+1]*work2_6[n]); for (j = 0; j < 6; j++) virial_6[j] += eng*vg_6[i][j]; if (eflag_global)energy_6 += eng; n += 2; @@ -5226,8 +5276,10 @@ void PPPMDisp::poisson_2s_ad(FFT_SCALAR* dfft_1, FFT_SCALAR* dfft_2, u_pa_2[k][j][i] = work2_6[n++]; } - if (vflag_atom) poisson_2s_peratom(v0_pa_1, v1_pa_1, v2_pa_1, v3_pa_1, v4_pa_1, v5_pa_1, - v0_pa_2, v1_pa_2, v2_pa_2, v3_pa_2, v4_pa_2, v5_pa_2); + if (vflag_atom) poisson_2s_peratom(v0_pa_1, v1_pa_1, v2_pa_1, + v3_pa_1, v4_pa_1, v5_pa_1, + v0_pa_2, v1_pa_2, v2_pa_2, + v3_pa_2, v4_pa_2, v5_pa_2); } /* ---------------------------------------------------------------------- @@ -5235,10 +5287,11 @@ void PPPMDisp::poisson_2s_ad(FFT_SCALAR* dfft_1, FFT_SCALAR* dfft_2, for ad scheme ------------------------------------------------------------------------- */ -void PPPMDisp::poisson_none_ad(int n1, int n2, FFT_SCALAR* dfft_1, FFT_SCALAR* dfft_2, - FFT_SCALAR*** u_pa_1, FFT_SCALAR*** u_pa_2, - FFT_SCALAR**** v0_pa, FFT_SCALAR**** v1_pa, FFT_SCALAR**** v2_pa, - FFT_SCALAR**** v3_pa, FFT_SCALAR**** v4_pa, FFT_SCALAR**** v5_pa) +void PPPMDisp:: +poisson_none_ad(int n1, int n2, FFT_SCALAR* dfft_1, FFT_SCALAR* dfft_2, + FFT_SCALAR*** u_pa_1, FFT_SCALAR*** u_pa_2, + FFT_SCALAR**** v0_pa, FFT_SCALAR**** v1_pa, FFT_SCALAR**** v2_pa, + FFT_SCALAR**** v3_pa, FFT_SCALAR**** v4_pa, FFT_SCALAR**** v5_pa) { int i,j,k,n; double eng; @@ -5246,8 +5299,8 @@ void 
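/* ----------------------------------------------------------------------
   NOTE (illustrative sketch, not applied by this patch): the
   "only one transform required" branches in the poisson_2s_* and
   poisson_none_* routines use the standard two-for-one trick: both real
   densities are packed into one complex work array, so a single complex
   FFT transforms them together.  Schematically, with the names used above:

     int n = 0;
     for (int i = 0; i < nfft_6; i++) {
       work1_6[n]   = dfft_1[i];       // real part      <- first density
       work1_6[n+1] = dfft_2[i];       // imaginary part <- second density
       n += 2;
     }
     fft1_6->compute(work1_6,work1_6,1);   // one forward FFT for both

   When global energies/virials are needed, the two densities are
   transformed separately into work1_6 and work2_6 and then summed
   element-wise, which is the "unify the two transformed vectors" loop
   above; by linearity the result is the same packed spectrum.
------------------------------------------------------------------------- */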
PPPMDisp::poisson_none_ad(int n1, int n2, FFT_SCALAR* dfft_1, FFT_SCALAR* d double scaleinv = 1.0/(nx_pppm_6*ny_pppm_6*nz_pppm_6); // transform charge/dispersion density (r -> k) - // only one tansform required when energies and pressures do not - // need to be calculated + // only one tansform required when energies and pressures not needed + if (eflag_global + vflag_global == 0) { n = 0; for (i = 0; i < nfft_6; i++) { @@ -5257,8 +5310,9 @@ void PPPMDisp::poisson_none_ad(int n1, int n2, FFT_SCALAR* dfft_1, FFT_SCALAR* d fft1_6->compute(work1_6,work1_6,1); } - // two transforms are required when energies and pressures are - // calculated + + // two transforms are required when energies and pressures are calculated + else { n = 0; for (i = 0; i < nfft_6; i++) { @@ -5276,7 +5330,9 @@ void PPPMDisp::poisson_none_ad(int n1, int n2, FFT_SCALAR* dfft_1, FFT_SCALAR* d if (vflag_global) { n = 0; for (i = 0; i < nfft_6; i++) { - eng = s2 * greensfn_6[i] * (B[n1]*(work1_6[n]*work1_6[n] + work1_6[n+1]*work1_6[n+1]) + B[n2]*(work2_6[n]*work2_6[n] + work2_6[n+1]*work2_6[n+1])); + eng = s2 * greensfn_6[i] * + (B[n1]*(work1_6[n]*work1_6[n] + work1_6[n+1]*work1_6[n+1]) + + B[n2]*(work2_6[n]*work2_6[n] + work2_6[n+1]*work2_6[n+1])); for (j = 0; j < 6; j++) virial_6[j] += eng*vg_6[i][j]; if (eflag_global)energy_6 += eng; n += 2; @@ -5285,11 +5341,15 @@ void PPPMDisp::poisson_none_ad(int n1, int n2, FFT_SCALAR* dfft_1, FFT_SCALAR* d n = 0; for (i = 0; i < nfft_6; i++) { energy_6 += - s2 * greensfn_6[i] * (B[n1]*(work1_6[n]*work1_6[n] + work1_6[n+1]*work1_6[n+1]) + B[n2]*(work2_6[n]*work2_6[n] + work2_6[n+1]*work2_6[n+1])); + s2 * greensfn_6[i] * + (B[n1]*(work1_6[n]*work1_6[n] + work1_6[n+1]*work1_6[n+1]) + + B[n2]*(work2_6[n]*work2_6[n] + work2_6[n+1]*work2_6[n+1])); n += 2; } } + // unify the two transformed vectors for efficient calculations later + for ( i = 0; i < 2*nfft_6; i++) { work1_6[i] += work2_6[i]; } @@ -5329,12 +5389,14 @@ void PPPMDisp::poisson_none_ad(int n1, int n2, FFT_SCALAR* dfft_1, FFT_SCALAR* d Fourier Transform for per atom virial calculations ------------------------------------------------------------------------- */ -void PPPMDisp::poisson_2s_peratom(FFT_SCALAR*** v0_pa_1, FFT_SCALAR*** v1_pa_1, FFT_SCALAR*** v2_pa_1, - FFT_SCALAR*** v3_pa_1, FFT_SCALAR*** v4_pa_1, FFT_SCALAR*** v5_pa_1, - FFT_SCALAR*** v0_pa_2, FFT_SCALAR*** v1_pa_2, FFT_SCALAR*** v2_pa_2, - FFT_SCALAR*** v3_pa_2, FFT_SCALAR*** v4_pa_2, FFT_SCALAR*** v5_pa_2) +void PPPMDisp:: +poisson_2s_peratom(FFT_SCALAR*** v0_pa_1, FFT_SCALAR*** v1_pa_1, FFT_SCALAR*** v2_pa_1, + FFT_SCALAR*** v3_pa_1, FFT_SCALAR*** v4_pa_1, FFT_SCALAR*** v5_pa_1, + FFT_SCALAR*** v0_pa_2, FFT_SCALAR*** v1_pa_2, FFT_SCALAR*** v2_pa_2, + FFT_SCALAR*** v3_pa_2, FFT_SCALAR*** v4_pa_2, FFT_SCALAR*** v5_pa_2) { //Compute first virial term v0 + int n, i, j, k; n = 0; @@ -5454,11 +5516,16 @@ void PPPMDisp::poisson_2s_peratom(FFT_SCALAR*** v0_pa_1, FFT_SCALAR*** v1_pa_1, Fourier Transform for per atom virial calculations ------------------------------------------------------------------------- */ -void PPPMDisp::poisson_none_peratom(int n1, int n2, - FFT_SCALAR*** v0_pa_1, FFT_SCALAR*** v1_pa_1, FFT_SCALAR*** v2_pa_1, - FFT_SCALAR*** v3_pa_1, FFT_SCALAR*** v4_pa_1, FFT_SCALAR*** v5_pa_1, - FFT_SCALAR*** v0_pa_2, FFT_SCALAR*** v1_pa_2, FFT_SCALAR*** v2_pa_2, - FFT_SCALAR*** v3_pa_2, FFT_SCALAR*** v4_pa_2, FFT_SCALAR*** v5_pa_2) +void PPPMDisp:: +poisson_none_peratom(int n1, int n2, + FFT_SCALAR*** v0_pa_1, FFT_SCALAR*** v1_pa_1, + FFT_SCALAR*** v2_pa_1, + 
FFT_SCALAR*** v3_pa_1, FFT_SCALAR*** v4_pa_1, + FFT_SCALAR*** v5_pa_1, + FFT_SCALAR*** v0_pa_2, FFT_SCALAR*** v1_pa_2, + FFT_SCALAR*** v2_pa_2, + FFT_SCALAR*** v3_pa_2, FFT_SCALAR*** v4_pa_2, + FFT_SCALAR*** v5_pa_2) { //Compute first virial term v0 int n, i, j, k; @@ -5946,7 +6013,6 @@ void PPPMDisp::fieldforce_g_ad() sf += sf_coeff_6[5]*sin(4*MY_PI*s3); sf *= 2*lj*lj; if (slabflag != 2) f[i][2] += ekz*lj - sf; - } } @@ -6259,17 +6325,21 @@ void PPPMDisp::fieldforce_a_ad() sf = sf_coeff_6[0]*sin(2*MY_PI*s1); sf += sf_coeff_6[1]*sin(4*MY_PI*s1); sf *= 4*lj0*lj6 + 4*lj1*lj5 + 4*lj2*lj4 + 2*lj3*lj3; - f[i][0] += lj0*ekx0 + lj1*ekx1 + lj2*ekx2 + lj3*ekx3 + lj4*ekx4 + lj5*ekx5 + lj6*ekx6 - sf; + f[i][0] += lj0*ekx0 + lj1*ekx1 + lj2*ekx2 + lj3*ekx3 + + lj4*ekx4 + lj5*ekx5 + lj6*ekx6 - sf; sf = sf_coeff_6[2]*sin(2*MY_PI*s2); sf += sf_coeff_6[3]*sin(4*MY_PI*s2); sf *= 4*lj0*lj6 + 4*lj1*lj5 + 4*lj2*lj4 + 2*lj3*lj3; - f[i][1] += lj0*eky0 + lj1*eky1 + lj2*eky2 + lj3*eky3 + lj4*eky4 + lj5*eky5 + lj6*eky6 - sf; + f[i][1] += lj0*eky0 + lj1*eky1 + lj2*eky2 + lj3*eky3 + + lj4*eky4 + lj5*eky5 + lj6*eky6 - sf; sf = sf_coeff_6[4]*sin(2*MY_PI*s3); sf += sf_coeff_6[5]*sin(4*MY_PI*s3); sf *= 4*lj0*lj6 + 4*lj1*lj5 + 4*lj2*lj4 + 2*lj3*lj3; - if (slabflag != 2) f[i][2] += lj0*ekz0 + lj1*ekz1 + lj2*ekz2 + lj3*ekz3 + lj4*ekz4 + lj5*ekz5 + lj6*ekz6 - sf; + if (slabflag != 2) + f[i][2] += lj0*ekz0 + lj1*ekz1 + + lj2*ekz2 + lj3*ekz3 + lj4*ekz4 + lj5*ekz5 + lj6*ekz6 - sf; } } @@ -6709,8 +6779,10 @@ void PPPMDisp::fieldforce_none_peratom() pack values to buf to send to another proc ------------------------------------------------------------------------- */ -void PPPMDisp::pack_forward(int flag, FFT_SCALAR *buf, int nlist, int *list) +void PPPMDisp::pack_forward_grid(int flag, void *vbuf, int nlist, int *list) { + FFT_SCALAR *buf = (FFT_SCALAR *) vbuf; + int n = 0; switch (flag) { @@ -7213,7 +7285,6 @@ void PPPMDisp::pack_forward(int flag, FFT_SCALAR *buf, int nlist, int *list) } break; } - } } @@ -7221,8 +7292,10 @@ void PPPMDisp::pack_forward(int flag, FFT_SCALAR *buf, int nlist, int *list) unpack another proc's own values from buf and set own ghost values ------------------------------------------------------------------------- */ -void PPPMDisp::unpack_forward(int flag, FFT_SCALAR *buf, int nlist, int *list) +void PPPMDisp::unpack_forward_grid(int flag, void *vbuf, int nlist, int *list) { + FFT_SCALAR *buf = (FFT_SCALAR *) vbuf; + int n = 0; switch (flag) { @@ -7725,7 +7798,6 @@ void PPPMDisp::unpack_forward(int flag, FFT_SCALAR *buf, int nlist, int *list) } break; } - } } @@ -7733,8 +7805,10 @@ void PPPMDisp::unpack_forward(int flag, FFT_SCALAR *buf, int nlist, int *list) pack ghost values into buf to send to another proc ------------------------------------------------------------------------- */ -void PPPMDisp::pack_reverse(int flag, FFT_SCALAR *buf, int nlist, int *list) +void PPPMDisp::pack_reverse_grid(int flag, void *vbuf, int nlist, int *list) { + FFT_SCALAR *buf = (FFT_SCALAR *) vbuf; + int n = 0; //Coulomb interactions @@ -7787,8 +7861,10 @@ void PPPMDisp::pack_reverse(int flag, FFT_SCALAR *buf, int nlist, int *list) unpack another proc's ghost values from buf and add to own values ------------------------------------------------------------------------- */ -void PPPMDisp::unpack_reverse(int flag, FFT_SCALAR *buf, int nlist, int *list) +void PPPMDisp::unpack_reverse_grid(int flag, void *vbuf, int nlist, int *list) { + FFT_SCALAR *buf = (FFT_SCALAR *) vbuf; + int n = 0; //Coulomb interactions @@ -8172,6 
+8248,7 @@ int PPPMDisp::timing_3d(int n, double &time3d) double PPPMDisp::memory_usage() { double bytes = nmax*3 * sizeof(double); + int mixing = 1; int diff = 3; //depends on differentiation int per = 7; //depends on per atom calculations @@ -8190,17 +8267,23 @@ double PPPMDisp::memory_usage() bytes += 6 * nfft_both * sizeof(double); // vg bytes += nfft_both * sizeof(double); // greensfn bytes += nfft_both * 3 * sizeof(FFT_SCALAR); // density_FFT, work1, work2 - if (cg) bytes += cg->memory_usage(); } if (function[1] + function[2] + function[3]) { int nbrick = (nxhi_out_6-nxlo_out_6+1) * (nyhi_out_6-nylo_out_6+1) * (nzhi_out_6-nzlo_out_6+1); - bytes += (1 + diff + per ) * nbrick * sizeof(FFT_SCALAR) * mixing; // density_brick + vd_brick + per atom bricks + // density_brick + vd_brick + per atom bricks + bytes += (1 + diff + per ) * nbrick * sizeof(FFT_SCALAR) * mixing; bytes += 6 * nfft_both_6 * sizeof(double); // vg bytes += nfft_both_6 * sizeof(double); // greensfn - bytes += nfft_both_6 * (mixing + 2) * sizeof(FFT_SCALAR); // density_FFT, work1, work2 - if (cg_6) bytes += cg_6->memory_usage(); + // density_FFT, work1, work2 + bytes += nfft_both_6 * (mixing + 2) * sizeof(FFT_SCALAR); } + + // four GridComm bufs + + bytes += (ngc_buf1 + ngc_buf2) * npergrid * sizeof(FFT_SCALAR); + bytes += (ngc6_buf1 + ngc6_buf2) * npergrid * sizeof(FFT_SCALAR); + return bytes; } diff --git a/src/KSPACE/pppm_disp.h b/src/KSPACE/pppm_disp.h index 130671fa28..37170478b0 100644 --- a/src/KSPACE/pppm_disp.h +++ b/src/KSPACE/pppm_disp.h @@ -42,7 +42,6 @@ typedef double FFT_SCALAR; namespace LAMMPS_NS { - #define EWALD_MAXORDER 6 #define EWALD_FUNCS 4 @@ -190,15 +189,14 @@ Variables needed for calculating the 1/r and 1/r^6 potential FFT_SCALAR *work1,*work2; FFT_SCALAR *work1_6, *work2_6; - class FFT3d *fft1,*fft2 ; - class FFT3d *fft1_6, *fft2_6; - class Remap *remap; - class Remap *remap_6; - class GridComm *cg; - class GridComm *cg_peratom; - class GridComm *cg_6; - class GridComm *cg_peratom_6; + class FFT3d *fft1_6,*fft2_6; + class Remap *remap,*remap_6; + class GridComm *gc,*gc6; + + FFT_SCALAR *gc_buf1,*gc_buf2,*gc6_buf1,*gc6_buf2; + int ngc_buf1,ngc_buf2,npergrid; + int ngc6_buf1,ngc6_buf2,npergrid6; int **part2grid; // storage for particle -> grid mapping int **part2grid_6; @@ -257,7 +255,6 @@ Variables needed for calculating the 1/r and 1/r^6 potential void compute_gf_denom(double*, int); double gf_denom(double, double, double, double*, int); - void compute_sf_precoeff(int, int, int, int, int, int, int, int, int, int, @@ -268,7 +265,6 @@ Variables needed for calculating the 1/r and 1/r^6 potential void compute_gf_6(); void compute_sf_coeff_6(); - virtual void particle_map(double, double, double, double, int **, int, int, int, int, int, @@ -295,8 +291,10 @@ Variables needed for calculating the 1/r and 1/r^6 potential int, int, int, double&, double *, double *, double *, double *, double *, double *, double *, - FFT_SCALAR ***, FFT_SCALAR ***, FFT_SCALAR ***, double *, double **, double **, - FFT_SCALAR ***, FFT_SCALAR ***, FFT_SCALAR ***, FFT_SCALAR ***, + FFT_SCALAR ***, FFT_SCALAR ***, + FFT_SCALAR ***, double *, double **, double **, + FFT_SCALAR ***, FFT_SCALAR ***, + FFT_SCALAR ***, FFT_SCALAR ***, FFT_SCALAR ***, FFT_SCALAR ***, FFT_SCALAR ***); virtual void poisson_ad(FFT_SCALAR*, FFT_SCALAR*, @@ -317,14 +315,18 @@ Variables needed for calculating the 1/r and 1/r^6 potential virtual void poisson_2s_ik(FFT_SCALAR *, FFT_SCALAR *, FFT_SCALAR ***, FFT_SCALAR ***, FFT_SCALAR ***, FFT_SCALAR ***, 
FFT_SCALAR ***, FFT_SCALAR ***, - FFT_SCALAR ***, FFT_SCALAR ***, FFT_SCALAR ***, FFT_SCALAR ***, FFT_SCALAR ***, FFT_SCALAR ***, FFT_SCALAR ***, - FFT_SCALAR ***, FFT_SCALAR ***, FFT_SCALAR ***, FFT_SCALAR ***, + FFT_SCALAR ***, + FFT_SCALAR ***, FFT_SCALAR ***, FFT_SCALAR ***, + FFT_SCALAR ***, FFT_SCALAR ***, FFT_SCALAR ***, + FFT_SCALAR ***, FFT_SCALAR ***, FFT_SCALAR ***, FFT_SCALAR ***); virtual void poisson_2s_ad(FFT_SCALAR *, FFT_SCALAR *, - FFT_SCALAR ***, FFT_SCALAR ***, FFT_SCALAR ***, FFT_SCALAR ***, FFT_SCALAR ***, FFT_SCALAR ***, FFT_SCALAR ***, - FFT_SCALAR ***, FFT_SCALAR ***, FFT_SCALAR ***, FFT_SCALAR ***, + FFT_SCALAR ***, + FFT_SCALAR ***, FFT_SCALAR ***, FFT_SCALAR ***, + FFT_SCALAR ***, FFT_SCALAR ***, FFT_SCALAR ***, + FFT_SCALAR ***, FFT_SCALAR ***, FFT_SCALAR ***, FFT_SCALAR ***); virtual void poisson_2s_peratom(FFT_SCALAR***, FFT_SCALAR***, FFT_SCALAR***, @@ -339,9 +341,11 @@ Variables needed for calculating the 1/r and 1/r^6 potential virtual void poisson_none_ik(int, int, FFT_SCALAR *, FFT_SCALAR *, FFT_SCALAR ***, FFT_SCALAR ***, FFT_SCALAR ***, FFT_SCALAR ***, FFT_SCALAR ***, FFT_SCALAR ***, - FFT_SCALAR ****, FFT_SCALAR ****, FFT_SCALAR ****, FFT_SCALAR ****, + FFT_SCALAR ****, FFT_SCALAR ****, FFT_SCALAR ****, + FFT_SCALAR ****, FFT_SCALAR ****, FFT_SCALAR ****, FFT_SCALAR ****); - virtual void poisson_none_peratom(int, int, FFT_SCALAR***, FFT_SCALAR***, FFT_SCALAR***, + virtual void poisson_none_peratom(int, int, + FFT_SCALAR***, FFT_SCALAR***, FFT_SCALAR***, FFT_SCALAR***, FFT_SCALAR***, FFT_SCALAR***, FFT_SCALAR***, FFT_SCALAR***, FFT_SCALAR***, FFT_SCALAR***, FFT_SCALAR***, FFT_SCALAR***); @@ -369,10 +373,10 @@ Variables needed for calculating the 1/r and 1/r^6 potential // grid communication - void pack_forward(int, FFT_SCALAR *, int, int *); - void unpack_forward(int, FFT_SCALAR *, int, int *); - void pack_reverse(int, FFT_SCALAR *, int, int *); - void unpack_reverse(int, FFT_SCALAR *, int, int *); + void pack_forward_grid(int, void *, int, int *); + void unpack_forward_grid(int, void *, int, int *); + void pack_reverse_grid(int, void *, int, int *); + void unpack_reverse_grid(int, void *, int, int *); }; } diff --git a/src/KSPACE/pppm_stagger.cpp b/src/KSPACE/pppm_stagger.cpp index d7466ee0d4..0444b93a71 100644 --- a/src/KSPACE/pppm_stagger.cpp +++ b/src/KSPACE/pppm_stagger.cpp @@ -123,11 +123,7 @@ void PPPMStagger::compute(int eflag, int vflag) ev_init(eflag,vflag); - if (evflag_atom && !peratom_allocate_flag) { - allocate_peratom(); - cg_peratom->ghost_notify(); - cg_peratom->setup(); - } + if (evflag_atom && !peratom_allocate_flag) allocate_peratom(); // convert atoms from box to lamda coords @@ -160,7 +156,8 @@ void PPPMStagger::compute(int eflag, int vflag) // to fully sum contribution in their 3d bricks // remap from 3d decomposition to FFT decomposition - cg->reverse_comm(this,REVERSE_RHO); + gc->reverse_comm_kspace(this,1,sizeof(FFT_SCALAR),REVERSE_RHO, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); brick2fft(); // compute potential gradient on my FFT grid and @@ -173,16 +170,22 @@ void PPPMStagger::compute(int eflag, int vflag) // all procs communicate E-field values // to fill ghost cells surrounding their 3d bricks - if (differentiation_flag == 1) cg->forward_comm(this,FORWARD_AD); - else cg->forward_comm(this,FORWARD_IK); + if (differentiation_flag == 1) + gc->forward_comm_kspace(this,1,sizeof(FFT_SCALAR),FORWARD_AD, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); + else + gc->forward_comm_kspace(this,3,sizeof(FFT_SCALAR),FORWARD_IK, + 
gc_buf1,gc_buf2,MPI_FFT_SCALAR); // extra per-atom energy/virial communication if (evflag_atom) { if (differentiation_flag == 1 && vflag_atom) - cg_peratom->forward_comm(this,FORWARD_AD_PERATOM); + gc->forward_comm_kspace(this,6,sizeof(FFT_SCALAR),FORWARD_AD_PERATOM, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); else if (differentiation_flag == 0) - cg_peratom->forward_comm(this,FORWARD_IK_PERATOM); + gc->forward_comm_kspace(this,7,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); } // calculate the force on my particles diff --git a/src/KSPACE/pppm_stagger.h b/src/KSPACE/pppm_stagger.h index 80161a3707..2f4cf9fb9b 100644 --- a/src/KSPACE/pppm_stagger.h +++ b/src/KSPACE/pppm_stagger.h @@ -50,7 +50,6 @@ class PPPMStagger : public PPPM { virtual void fieldforce_ad(); virtual void fieldforce_peratom(); - inline double gf_denom2(const double &x, const double &y, const double &z) const { diff --git a/src/USER-INTEL/pppm_disp_intel.cpp b/src/USER-INTEL/pppm_disp_intel.cpp index 75fdc3b1df..97e5c57d6e 100644 --- a/src/USER-INTEL/pppm_disp_intel.cpp +++ b/src/USER-INTEL/pppm_disp_intel.cpp @@ -173,23 +173,15 @@ void PPPMDispIntel::compute(int eflag, int vflag) return; } #endif + int i; + // convert atoms from box to lamda coords ev_init(eflag,vflag); - if (evflag_atom && !peratom_allocate_flag) { - allocate_peratom(); - if (function[0]) { - cg_peratom->ghost_notify(); - cg_peratom->setup(); - } - if (function[1] + function[2] + function[3]) { - cg_peratom_6->ghost_notify(); - cg_peratom_6->setup(); - } - peratom_allocate_flag = 1; - } + if (evflag_atom && !peratom_allocate_flag) allocate_peratom(); + if (triclinic == 0) boxlo = domain->boxlo; else { boxlo = domain->boxlo_lamda; @@ -299,7 +291,8 @@ void PPPMDispIntel::compute(int eflag, int vflag) make_rho_c(fix->get_single_buffers()); } - cg->reverse_comm(this,REVERSE_RHO); + gc->reverse_comm_kspace(this,1,sizeof(FFT_SCALAR),REVERSE_RHO, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); brick2fft(nxlo_in, nylo_in, nzlo_in, nxhi_in, nyhi_in, nzhi_in, density_brick, density_fft, work1,remap); @@ -312,7 +305,8 @@ void PPPMDispIntel::compute(int eflag, int vflag) energy_1, greensfn, virial_1, vg,vg2, u_brick, v0_brick, v1_brick, v2_brick, v3_brick, v4_brick, v5_brick); - cg->forward_comm(this,FORWARD_AD); + gc->forward_comm_kspace(this,1,sizeof(FFT_SCALAR),FORWARD_AD, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); if (fix->precision() == FixIntel::PREC_MODE_MIXED) { fieldforce_c_ad(fix->get_mixed_buffers()); @@ -322,7 +316,9 @@ void PPPMDispIntel::compute(int eflag, int vflag) fieldforce_c_ad(fix->get_single_buffers()); } - if (vflag_atom) cg_peratom->forward_comm(this, FORWARD_AD_PERATOM); + if (vflag_atom) + gc->forward_comm_kspace(this,6,sizeof(FFT_SCALAR),FORWARD_AD_PERATOM, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); } else { poisson_ik(work1, work2, density_fft, fft1, fft2, @@ -334,7 +330,8 @@ void PPPMDispIntel::compute(int eflag, int vflag) u_brick, v0_brick, v1_brick, v2_brick, v3_brick, v4_brick, v5_brick); - cg->forward_comm(this, FORWARD_IK); + gc->forward_comm_kspace(this,3,sizeof(FFT_SCALAR),FORWARD_IK, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); if (fix->precision() == FixIntel::PREC_MODE_MIXED) { fieldforce_c_ik(fix->get_mixed_buffers()); @@ -344,12 +341,15 @@ void PPPMDispIntel::compute(int eflag, int vflag) fieldforce_c_ik(fix->get_single_buffers()); } - if (evflag_atom) cg_peratom->forward_comm(this, FORWARD_IK_PERATOM); + if (evflag_atom) + gc->forward_comm_kspace(this,7,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); } if (evflag_atom) 
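/* ----------------------------------------------------------------------
   NOTE (summary of the conversion pattern, not applied by this patch):
   the ghost-grid communication calls are all updated the same way.  The
   old form

     cg->forward_comm(this,FORWARD_IK);
     cg_peratom->forward_comm(this,FORWARD_IK_PERATOM);

   becomes, with the new GridComm interface,

     gc->forward_comm_kspace(this,3,sizeof(FFT_SCALAR),FORWARD_IK,
                             gc_buf1,gc_buf2,MPI_FFT_SCALAR);
     gc->forward_comm_kspace(this,7,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM,
                             gc_buf1,gc_buf2,MPI_FFT_SCALAR);

   The second argument is the number of values exchanged per grid point
   and must agree with what pack_forward_grid()/unpack_forward_grid()
   (which now receive a void* buffer and cast it to FFT_SCALAR*) read and
   write for that flag.  The caller also supplies the communication
   buffers and the MPI datatype explicitly, since the dedicated per-atom
   GridComm objects are gone.
------------------------------------------------------------------------- */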
fieldforce_c_peratom(); } if (function[1]) { + //perform calculations for geometric mixing if (fix->precision() == FixIntel::PREC_MODE_MIXED) { @@ -375,14 +375,13 @@ void PPPMDispIntel::compute(int eflag, int vflag) make_rho_g(fix->get_single_buffers()); } - - cg_6->reverse_comm(this, REVERSE_RHO_G); + gc6->reverse_comm_kspace(this,1,sizeof(FFT_SCALAR),REVERSE_RHO_G, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); brick2fft(nxlo_in_6, nylo_in_6, nzlo_in_6, nxhi_in_6, nyhi_in_6, nzhi_in_6, density_brick_g, density_fft_g, work1_6,remap_6); if (differentiation_flag == 1) { - poisson_ad(work1_6, work2_6, density_fft_g, fft1_6, fft2_6, nx_pppm_6, ny_pppm_6, nz_pppm_6, nfft_6, nxlo_fft_6, nylo_fft_6, nzlo_fft_6, nxhi_fft_6, @@ -391,17 +390,20 @@ void PPPMDispIntel::compute(int eflag, int vflag) virial_6, vg_6, vg2_6, u_brick_g, v0_brick_g, v1_brick_g, v2_brick_g, v3_brick_g, v4_brick_g, v5_brick_g); - cg_6->forward_comm(this,FORWARD_AD_G); + gc6->forward_comm_kspace(this,1,sizeof(FFT_SCALAR),FORWARD_AD_G, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); - if (fix->precision() == FixIntel::PREC_MODE_MIXED) { - fieldforce_g_ad(fix->get_mixed_buffers()); - } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { - fieldforce_g_ad(fix->get_double_buffers()); - } else { - fieldforce_g_ad(fix->get_single_buffers()); - } + if (fix->precision() == FixIntel::PREC_MODE_MIXED) { + fieldforce_g_ad(fix->get_mixed_buffers()); + } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { + fieldforce_g_ad(fix->get_double_buffers()); + } else { + fieldforce_g_ad(fix->get_single_buffers()); + } - if (vflag_atom) cg_peratom_6->forward_comm(this,FORWARD_AD_PERATOM_G); + if (vflag_atom) + gc6->forward_comm_kspace(this,7,sizeof(FFT_SCALAR),FORWARD_AD_PERATOM_G, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); } else { poisson_ik(work1_6, work2_6, density_fft_g, fft1_6, fft2_6, @@ -413,19 +415,22 @@ void PPPMDispIntel::compute(int eflag, int vflag) vdz_brick_g, virial_6, vg_6, vg2_6, u_brick_g, v0_brick_g, v1_brick_g, v2_brick_g, v3_brick_g, v4_brick_g, v5_brick_g); - cg_6->forward_comm(this,FORWARD_IK_G); + gc6->forward_comm_kspace(this,3,sizeof(FFT_SCALAR),FORWARD_IK_G, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); - if (fix->precision() == FixIntel::PREC_MODE_MIXED) { - fieldforce_g_ik(fix->get_mixed_buffers()); - } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { - fieldforce_g_ik(fix->get_double_buffers()); - } else { - fieldforce_g_ik(fix->get_single_buffers()); + if (fix->precision() == FixIntel::PREC_MODE_MIXED) { + fieldforce_g_ik(fix->get_mixed_buffers()); + } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { + fieldforce_g_ik(fix->get_double_buffers()); + } else { + fieldforce_g_ik(fix->get_single_buffers()); + } + + if (evflag_atom) + gc6->forward_comm_kspace(this,6,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM_G, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); } - - if (evflag_atom) cg_peratom_6->forward_comm(this, FORWARD_IK_PERATOM_G); - } if (evflag_atom) fieldforce_g_peratom(); } @@ -455,12 +460,12 @@ void PPPMDispIntel::compute(int eflag, int vflag) make_rho_a(fix->get_single_buffers()); } - cg_6->reverse_comm(this, REVERSE_RHO_A); + gc->reverse_comm_kspace(this,7,sizeof(FFT_SCALAR),REVERSE_RHO_A, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); brick2fft_a(); - if ( differentiation_flag == 1) { - + if (differentiation_flag == 1) { poisson_ad(work1_6, work2_6, density_fft_a3, fft1_6, fft2_6, nx_pppm_6, ny_pppm_6, nz_pppm_6, nfft_6, nxlo_fft_6, nylo_fft_6, nzlo_fft_6, nxhi_fft_6, nyhi_fft_6, nzhi_fft_6, @@ -481,20 +486,22 @@ void PPPMDispIntel::compute(int 
eflag, int vflag) v5_brick_a2, u_brick_a4, v0_brick_a4, v1_brick_a4, v2_brick_a4, v3_brick_a4, v4_brick_a4, v5_brick_a4); - cg_6->forward_comm(this, FORWARD_AD_A); + gc6->forward_comm_kspace(this,7,sizeof(FFT_SCALAR),FORWARD_AD_A, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); - if (fix->precision() == FixIntel::PREC_MODE_MIXED) { - fieldforce_a_ad(fix->get_mixed_buffers()); - } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { - fieldforce_a_ad(fix->get_double_buffers()); - } else { - fieldforce_a_ad(fix->get_single_buffers()); - } + if (fix->precision() == FixIntel::PREC_MODE_MIXED) { + fieldforce_a_ad(fix->get_mixed_buffers()); + } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { + fieldforce_a_ad(fix->get_double_buffers()); + } else { + fieldforce_a_ad(fix->get_single_buffers()); + } - if (evflag_atom) cg_peratom_6->forward_comm(this, FORWARD_AD_PERATOM_A); + if (evflag_atom) + gc6->forward_comm_kspace(this,42,sizeof(FFT_SCALAR),FORWARD_AD_PERATOM_A, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); } else { - poisson_ik(work1_6, work2_6, density_fft_a3, fft1_6, fft2_6, nx_pppm_6, ny_pppm_6, nz_pppm_6, nfft_6, nxlo_fft_6, nylo_fft_6, nzlo_fft_6, nxhi_fft_6, nyhi_fft_6, nzhi_fft_6, @@ -522,7 +529,8 @@ void PPPMDispIntel::compute(int eflag, int vflag) u_brick_a4, v0_brick_a4, v1_brick_a4, v2_brick_a4, v3_brick_a4, v4_brick_a4, v5_brick_a4); - cg_6->forward_comm(this, FORWARD_IK_A); + gc6->forward_comm_kspace(this,18,sizeof(FFT_SCALAR),FORWARD_IK_A, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); if (fix->precision() == FixIntel::PREC_MODE_MIXED) { fieldforce_a_ik(fix->get_mixed_buffers()); @@ -532,13 +540,17 @@ void PPPMDispIntel::compute(int eflag, int vflag) fieldforce_a_ik(fix->get_single_buffers()); } - if (evflag_atom) cg_peratom_6->forward_comm(this, FORWARD_IK_PERATOM_A); + if (evflag_atom) + gc6->forward_comm_kspace(this,49,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM_A, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); } + if (evflag_atom) fieldforce_a_peratom(); } if (function[3]) { - //perform calculations if no mixing rule applies + + // perform calculations if no mixing rule applies if (fix->precision() == FixIntel::PREC_MODE_MIXED) { particle_map(delxinv_6, delyinv_6, delzinv_6, shift_6, @@ -563,7 +575,8 @@ void PPPMDispIntel::compute(int eflag, int vflag) make_rho_none(fix->get_single_buffers()); } - cg_6->reverse_comm(this, REVERSE_RHO_NONE); + gc->reverse_comm_kspace(this,1,sizeof(FFT_SCALAR),REVERSE_RHO_NONE, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); brick2fft_none(); @@ -578,17 +591,20 @@ void PPPMDispIntel::compute(int eflag, int vflag) n += 2; } - cg_6->forward_comm(this,FORWARD_AD_NONE); + gc6->forward_comm_kspace(this,1,sizeof(FFT_SCALAR),FORWARD_AD_NONE, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); - if (fix->precision() == FixIntel::PREC_MODE_MIXED) { - fieldforce_none_ad(fix->get_mixed_buffers()); - } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { - fieldforce_none_ad(fix->get_double_buffers()); - } else { - fieldforce_none_ad(fix->get_single_buffers()); - } + if (fix->precision() == FixIntel::PREC_MODE_MIXED) { + fieldforce_none_ad(fix->get_mixed_buffers()); + } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { + fieldforce_none_ad(fix->get_double_buffers()); + } else { + fieldforce_none_ad(fix->get_single_buffers()); + } - if (vflag_atom) cg_peratom_6->forward_comm(this,FORWARD_AD_PERATOM_NONE); + if (vflag_atom) + gc6->forward_comm_kspace(this,6,sizeof(FFT_SCALAR),FORWARD_AD_PERATOM_NONE, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); } else { int n = 0; @@ -604,19 +620,22 @@ void PPPMDispIntel::compute(int 
eflag, int vflag) n += 2; } - cg_6->forward_comm(this,FORWARD_IK_NONE); + gc6->forward_comm_kspace(this,3,sizeof(FFT_SCALAR),FORWARD_IK_NONE, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); - if (fix->precision() == FixIntel::PREC_MODE_MIXED) { - fieldforce_none_ik(fix->get_mixed_buffers()); - } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { - fieldforce_none_ik(fix->get_double_buffers()); - } else { - fieldforce_none_ik(fix->get_single_buffers()); - } + if (fix->precision() == FixIntel::PREC_MODE_MIXED) { + fieldforce_none_ik(fix->get_mixed_buffers()); + } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { + fieldforce_none_ik(fix->get_double_buffers()); + } else { + fieldforce_none_ik(fix->get_single_buffers()); + } if (evflag_atom) - cg_peratom_6->forward_comm(this, FORWARD_IK_PERATOM_NONE); + gc6->forward_comm_kspace(this,7,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM_NONE, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); } + if (evflag_atom) fieldforce_none_peratom(); } diff --git a/src/USER-INTEL/pppm_intel.cpp b/src/USER-INTEL/pppm_intel.cpp index d643da96b2..26af974086 100644 --- a/src/USER-INTEL/pppm_intel.cpp +++ b/src/USER-INTEL/pppm_intel.cpp @@ -164,11 +164,7 @@ void PPPMIntel::compute_first(int eflag, int vflag) ev_init(eflag,vflag); - if (evflag_atom && !peratom_allocate_flag) { - allocate_peratom(); - cg_peratom->ghost_notify(); - cg_peratom->setup(); - } + if (evflag_atom && !peratom_allocate_flag) allocate_peratom(); // if atom count has changed, update qsum and qsqsum @@ -232,7 +228,8 @@ void PPPMIntel::compute_first(int eflag, int vflag) // to fully sum contribution in their 3d bricks // remap from 3d decomposition to FFT decomposition - cg->reverse_comm(this,REVERSE_RHO); + gc->reverse_comm_kspace(this,1,sizeof(FFT_SCALAR),REVERSE_RHO, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); brick2fft(); // compute potential gradient on my FFT grid and @@ -246,16 +243,22 @@ void PPPMIntel::compute_first(int eflag, int vflag) // all procs communicate E-field values // to fill ghost cells surrounding their 3d bricks - if (differentiation_flag == 1) cg->forward_comm(this,FORWARD_AD); - else cg->forward_comm(this,FORWARD_IK); + if (differentiation_flag == 1) + gc->forward_comm_kspace(this,1,sizeof(FFT_SCALAR),FORWARD_AD, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); + else + gc->forward_comm_kspace(this,3,sizeof(FFT_SCALAR),FORWARD_IK, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); // extra per-atom energy/virial communication if (evflag_atom) { if (differentiation_flag == 1 && vflag_atom) - cg_peratom->forward_comm(this,FORWARD_AD_PERATOM); + gc->forward_comm_kspace(this,6,sizeof(FFT_SCALAR),FORWARD_AD_PERATOM, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); else if (differentiation_flag == 0) - cg_peratom->forward_comm(this,FORWARD_IK_PERATOM); + gc->forward_comm_kspace(this,7,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); } } diff --git a/src/force.cpp b/src/force.cpp index 595ffd3140..babb052e84 100644 --- a/src/force.cpp +++ b/src/force.cpp @@ -664,10 +664,6 @@ void Force::create_kspace(const std::string &style, int trysuffix) int sflag; kspace = new_kspace(style,trysuffix,sflag); store_style(kspace_style,style,sflag); - - //if (comm->style == 1 && !kspace_match("ewald",0)) - // error->all(FLERR, - // "Cannot yet use KSpace solver with grid with comm style tiled"); } /* ---------------------------------------------------------------------- diff --git a/src/kspace.h b/src/kspace.h index f5229c57a7..110a790dfe 100644 --- a/src/kspace.h +++ b/src/kspace.h @@ -121,12 +121,6 @@ class KSpace : protected Pointers { 
virtual void compute(int, int) = 0; virtual void compute_group_group(int, int, int) {}; - // can remove these 4 when done with new GridComm - virtual void pack_forward(int, FFT_SCALAR *, int, int *) {}; - virtual void unpack_forward(int, FFT_SCALAR *, int, int *) {}; - virtual void pack_reverse(int, FFT_SCALAR *, int, int *) {}; - virtual void unpack_reverse(int, FFT_SCALAR *, int, int *) {}; - virtual void pack_forward_grid(int, void *, int, int *) {}; virtual void unpack_forward_grid(int, void *, int, int *) {}; virtual void pack_reverse_grid(int, void *, int, int *) {}; From ecec36cc155dd299f77e43b7a94b1450ecb7f825 Mon Sep 17 00:00:00 2001 From: Steve Plimpton Date: Wed, 12 Aug 2020 14:26:15 -0600 Subject: [PATCH 04/38] remove trial versions of PPPM2 and GridComm2 --- src/KSPACE/gridcomm2.cpp | 1097 ------------ src/KSPACE/gridcomm2.h | 203 --- src/KSPACE/pppm2.cpp | 3524 -------------------------------------- src/KSPACE/pppm2.h | 360 ---- 4 files changed, 5184 deletions(-) delete mode 100644 src/KSPACE/gridcomm2.cpp delete mode 100644 src/KSPACE/gridcomm2.h delete mode 100644 src/KSPACE/pppm2.cpp delete mode 100644 src/KSPACE/pppm2.h diff --git a/src/KSPACE/gridcomm2.cpp b/src/KSPACE/gridcomm2.cpp deleted file mode 100644 index 15ca165c74..0000000000 --- a/src/KSPACE/gridcomm2.cpp +++ /dev/null @@ -1,1097 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - Copyright (2003) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. - - See the README file in the top-level LAMMPS directory. -------------------------------------------------------------------------- */ - -#include "gridcomm2.h" -#include -#include "comm.h" -#include "kspace.h" -#include "irregular.h" -#include "memory.h" - -using namespace LAMMPS_NS; - -enum{REGULAR,TILED}; - -#define SWAPDELTA 8 - -// NOTE: gridcomm needs to be world for TILED, will it work with MSM? 
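
Every call-site conversion in the patches above follows the same pattern that this trial GridComm2 class (and the GridComm that supersedes it) expects from a KSpace style: construct the communicator from the owned and ghost extents of the grid brick, let setup() report the two buffer sizes, allocate the buffers once, then route every ghost exchange through forward_comm_kspace()/reverse_comm_kspace() with a values-per-grid-point count and a flag that the style's own pack/unpack callbacks interpret. Below is a minimal caller-side sketch of that pattern, assuming the PPPM2 members shown elsewhere in this patch series (gc, gc_buf1, gc_buf2, the grid bounds, the REVERSE_RHO/FORWARD_IK enums); the helper names and the nper_max sizing are illustrative only, not code from the patch.

// Sketch of the caller-side pattern used throughout this patch series.
// The helper names and nper_max choice are made up for illustration;
// the GridComm2 calls match the signatures in the (deleted) gridcomm2.h.

void PPPM2::setup_grid_comm_sketch()
{
  // communicator spans my owned brick plus the ghost cells around it
  gc = new GridComm2(lmp,world,nx_pppm,ny_pppm,nz_pppm,
                     nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
                     nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out);

  // setup() returns buffer sizes in grid points:
  // nbuf1 = largest single pack/unpack, nbuf2 = largest total per exchange
  int nbuf1,nbuf2;
  gc->setup(nbuf1,nbuf2);

  // size buffers for the largest values-per-point any later call will use
  // (3 here for the ik E-field; per-atom exchanges would need 6 or 7)
  int nper_max = 3;
  memory->create(gc_buf1,nper_max*nbuf1,"pppm:gc_buf1");
  memory->create(gc_buf2,nper_max*nbuf2,"pppm:gc_buf2");
}

void PPPM2::grid_comm_usage_sketch()
{
  // sum ghost-cell charge density back onto owned grid points: 1 value/point
  gc->reverse_comm_kspace(this,1,sizeof(FFT_SCALAR),REVERSE_RHO,
                          gc_buf1,gc_buf2,MPI_FFT_SCALAR);

  // ... FFTs and Poisson solve ...

  // fill ghost cells with the 3-component E-field: 3 values/point
  gc->forward_comm_kspace(this,3,sizeof(FFT_SCALAR),FORWARD_IK,
                          gc_buf1,gc_buf2,MPI_FFT_SCALAR);
}
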
-// NOTE: Tiled implementation here only works for RCB, not general tiled - -/* ---------------------------------------------------------------------- - gcomm = MPI communicator that shares this grid - does not have to be world, see MSM - gn xyz = size of global grid - i xyz lohi = portion of global grid this proc owns, 0 <= index < N - o xyz lohi = owned grid portion + ghost grid cells needed in all directions - if o indices are < 0 or hi indices are >= N, - then grid is treated as periodic in that dimension, - communication is done across the periodic boundaries -------------------------------------------------------------------------- */ - -GridComm2::GridComm2(LAMMPS *lmp, MPI_Comm gcomm, - int gnx, int gny, int gnz, - int ixlo, int ixhi, int iylo, int iyhi, int izlo, int izhi, - int oxlo, int oxhi, int oylo, int oyhi, int ozlo, int ozhi) - : Pointers(lmp) -{ - gridcomm = gcomm; - MPI_Comm_rank(gridcomm,&me); - MPI_Comm_size(gridcomm,&nprocs); - - nx = gnx; - ny = gny; - nz = gnz; - - inxlo = ixlo; - inxhi = ixhi; - inylo = iylo; - inyhi = iyhi; - inzlo = izlo; - inzhi = izhi; - - outxlo = oxlo; - outxhi = oxhi; - outylo = oylo; - outyhi = oyhi; - outzlo = ozlo; - outzhi = ozhi; - - // layout == REGULAR or TILED - // for REGULAR, proc xyz lohi = my 6 neighbor procs - - layout = REGULAR; - if (comm->layout == Comm::LAYOUT_TILED) layout = TILED; - - outxlo_max = oxlo; - outxhi_max = oxhi; - outylo_max = oylo; - outyhi_max = oyhi; - outzlo_max = ozlo; - outzhi_max = ozhi; - - if (layout == REGULAR) { - int (*procneigh)[2] = comm->procneigh; - - procxlo = procneigh[0][0]; - procxhi = procneigh[0][1]; - procylo = procneigh[1][0]; - procyhi = procneigh[1][1]; - proczlo = procneigh[2][0]; - proczhi = procneigh[2][1]; - } - - nswap = maxswap = 0; - swap = NULL; - - nsend = nrecv = ncopy = 0; - send = NULL; - recv = NULL; - copy = NULL; - requests = NULL; -} - -/* ---------------------------------------------------------------------- - same as first constructor except o xyz lohi max are added arguments - this is for case when caller stores grid in a larger array than o xyz lohi - only affects indices() method which generates indices into the caller's array -------------------------------------------------------------------------- */ - -GridComm2::GridComm2(LAMMPS *lmp, MPI_Comm gcomm, - int gnx, int gny, int gnz, - int ixlo, int ixhi, int iylo, int iyhi, int izlo, int izhi, - int oxlo, int oxhi, int oylo, int oyhi, int ozlo, int ozhi, - int oxlo_max, int oxhi_max, int oylo_max, int oyhi_max, - int ozlo_max, int ozhi_max) - : Pointers(lmp) -{ - gridcomm = gcomm; - MPI_Comm_rank(gridcomm,&me); - MPI_Comm_size(gridcomm,&nprocs); - - nx = gnx; - ny = gny; - nz = gnz; - - inxlo = ixlo; - inxhi = ixhi; - inylo = iylo; - inyhi = iyhi; - inzlo = izlo; - inzhi = izhi; - - outxlo = oxlo; - outxhi = oxhi; - outylo = oylo; - outyhi = oyhi; - outzlo = ozlo; - outzhi = ozhi; - - outxlo_max = oxlo_max; - outxhi_max = oxhi_max; - outylo_max = oylo_max; - outyhi_max = oyhi_max; - outzlo_max = ozlo_max; - outzhi_max = ozhi_max; - - // layout == REGULAR or TILED - // for REGULAR, proc xyz lohi = my 6 neighbor procs - - layout = REGULAR; - if (comm->layout == Comm::LAYOUT_TILED) layout = TILED; - - if (layout == REGULAR) { - int (*procneigh)[2] = comm->procneigh; - - procxlo = procneigh[0][0]; - procxhi = procneigh[0][1]; - procylo = procneigh[1][0]; - procyhi = procneigh[1][1]; - proczlo = procneigh[2][0]; - proczhi = procneigh[2][1]; - } - - nswap = maxswap = 0; - swap = NULL; - - nsend = nrecv = ncopy = 0; - 
send = NULL; - recv = NULL; - copy = NULL; - requests = NULL; -} - -/* ---------------------------------------------------------------------- */ - -GridComm2::~GridComm2() -{ - // regular comm data struct - - for (int i = 0; i < nswap; i++) { - memory->destroy(swap[i].packlist); - memory->destroy(swap[i].unpacklist); - } - memory->sfree(swap); - - // tiled comm data structs - - for (int i = 0; i < nsend; i++) - memory->destroy(send[i].packlist); - memory->sfree(send); - - for (int i = 0; i < nrecv; i++) - memory->destroy(recv[i].unpacklist); - memory->sfree(recv); - - for (int i = 0; i < ncopy; i++) { - memory->destroy(copy[i].packlist); - memory->destroy(copy[i].unpacklist); - } - memory->sfree(copy); - - delete [] requests; -} - -/* ---------------------------------------------------------------------- */ - -void GridComm2::setup(int &nbuf1, int &nbuf2) -{ - if (layout == REGULAR) setup_regular(nbuf1,nbuf2); - else setup_tiled(nbuf1,nbuf2); -} - -/* ---------------------------------------------------------------------- */ - -void GridComm2::setup_regular(int &nbuf1, int &nbuf2) -{ - int nsent,sendfirst,sendlast,recvfirst,recvlast; - int sendplanes,recvplanes; - int notdoneme,notdone; - - // notify 6 neighbor procs how many ghost grid planes I need from them - // ghost xyz lo = # of my lower grid planes that proc xyz lo needs as its ghosts - // ghost xyz hi = # of my upper grid planes that proc xyz hi needs as its ghosts - // if this proc is its own neighbor across periodic bounary, value is from self - - int nplanes = inxlo - outxlo; - if (procxlo != me) - MPI_Sendrecv(&nplanes,1,MPI_INT,procxlo,0, - &ghostxhi,1,MPI_INT,procxhi,0,gridcomm,MPI_STATUS_IGNORE); - else ghostxhi = nplanes; - - nplanes = outxhi - inxhi; - if (procxhi != me) - MPI_Sendrecv(&nplanes,1,MPI_INT,procxhi,0, - &ghostxlo,1,MPI_INT,procxlo,0,gridcomm,MPI_STATUS_IGNORE); - else ghostxlo = nplanes; - - nplanes = inylo - outylo; - if (procylo != me) - MPI_Sendrecv(&nplanes,1,MPI_INT,procylo,0, - &ghostyhi,1,MPI_INT,procyhi,0,gridcomm,MPI_STATUS_IGNORE); - else ghostyhi = nplanes; - - nplanes = outyhi - inyhi; - if (procyhi != me) - MPI_Sendrecv(&nplanes,1,MPI_INT,procyhi,0, - &ghostylo,1,MPI_INT,procylo,0,gridcomm,MPI_STATUS_IGNORE); - else ghostylo = nplanes; - - nplanes = inzlo - outzlo; - if (proczlo != me) - MPI_Sendrecv(&nplanes,1,MPI_INT,proczlo,0, - &ghostzhi,1,MPI_INT,proczhi,0,gridcomm,MPI_STATUS_IGNORE); - else ghostzhi = nplanes; - - nplanes = outzhi - inzhi; - if (proczhi != me) - MPI_Sendrecv(&nplanes,1,MPI_INT,proczhi,0, - &ghostzlo,1,MPI_INT,proczlo,0,gridcomm,MPI_STATUS_IGNORE); - else ghostzlo = nplanes; - - // setup swaps = exchange of grid data with one of 6 neighobr procs - // can be more than one in a direction if ghost region extends beyond neigh proc - // all procs have same swap count, but swapsize npack/nunpack can be empty - - nswap = 0; - - // send own grid pts to -x processor, recv ghost grid pts from +x processor - - nsent = 0; - sendfirst = inxlo; - sendlast = inxhi; - recvfirst = inxhi+1; - notdone = 1; - - while (notdone) { - if (nswap == maxswap) grow_swap(); - - swap[nswap].sendproc = procxlo; - swap[nswap].recvproc = procxhi; - sendplanes = MIN(sendlast-sendfirst+1,ghostxlo-nsent); - swap[nswap].npack = - indices(swap[nswap].packlist, - sendfirst,sendfirst+sendplanes-1,inylo,inyhi,inzlo,inzhi); - - if (procxlo != me) - MPI_Sendrecv(&sendplanes,1,MPI_INT,procxlo,0, - &recvplanes,1,MPI_INT,procxhi,0,gridcomm,MPI_STATUS_IGNORE); - else recvplanes = sendplanes; - - swap[nswap].nunpack = - 
indices(swap[nswap].unpacklist, - recvfirst,recvfirst+recvplanes-1,inylo,inyhi,inzlo,inzhi); - - nsent += sendplanes; - sendfirst += sendplanes; - sendlast += recvplanes; - recvfirst += recvplanes; - nswap++; - - if (nsent < ghostxlo) notdoneme = 1; - else notdoneme = 0; - MPI_Allreduce(¬doneme,¬done,1,MPI_INT,MPI_SUM,gridcomm); - } - - // send own grid pts to +x processor, recv ghost grid pts from -x processor - - nsent = 0; - sendfirst = inxlo; - sendlast = inxhi; - recvlast = inxlo-1; - notdone = 1; - - while (notdone) { - if (nswap == maxswap) grow_swap(); - - swap[nswap].sendproc = procxhi; - swap[nswap].recvproc = procxlo; - sendplanes = MIN(sendlast-sendfirst+1,ghostxhi-nsent); - swap[nswap].npack = - indices(swap[nswap].packlist, - sendlast-sendplanes+1,sendlast,inylo,inyhi,inzlo,inzhi); - - if (procxhi != me) - MPI_Sendrecv(&sendplanes,1,MPI_INT,procxhi,0, - &recvplanes,1,MPI_INT,procxlo,0,gridcomm,MPI_STATUS_IGNORE); - else recvplanes = sendplanes; - - swap[nswap].nunpack = - indices(swap[nswap].unpacklist, - recvlast-recvplanes+1,recvlast,inylo,inyhi,inzlo,inzhi); - - nsent += sendplanes; - sendfirst -= recvplanes; - sendlast -= sendplanes; - recvlast -= recvplanes; - nswap++; - - if (nsent < ghostxhi) notdoneme = 1; - else notdoneme = 0; - MPI_Allreduce(¬doneme,¬done,1,MPI_INT,MPI_SUM,gridcomm); - } - - // send own grid pts to -y processor, recv ghost grid pts from +y processor - - nsent = 0; - sendfirst = inylo; - sendlast = inyhi; - recvfirst = inyhi+1; - notdone = 1; - - while (notdone) { - if (nswap == maxswap) grow_swap(); - - swap[nswap].sendproc = procylo; - swap[nswap].recvproc = procyhi; - sendplanes = MIN(sendlast-sendfirst+1,ghostylo-nsent); - swap[nswap].npack = - indices(swap[nswap].packlist, - outxlo,outxhi,sendfirst,sendfirst+sendplanes-1,inzlo,inzhi); - - if (procylo != me) - MPI_Sendrecv(&sendplanes,1,MPI_INT,procylo,0, - &recvplanes,1,MPI_INT,procyhi,0,gridcomm,MPI_STATUS_IGNORE); - else recvplanes = sendplanes; - - swap[nswap].nunpack = - indices(swap[nswap].unpacklist, - outxlo,outxhi,recvfirst,recvfirst+recvplanes-1,inzlo,inzhi); - - nsent += sendplanes; - sendfirst += sendplanes; - sendlast += recvplanes; - recvfirst += recvplanes; - nswap++; - - if (nsent < ghostylo) notdoneme = 1; - else notdoneme = 0; - MPI_Allreduce(¬doneme,¬done,1,MPI_INT,MPI_SUM,gridcomm); - } - - // send own grid pts to +y processor, recv ghost grid pts from -y processor - - nsent = 0; - sendfirst = inylo; - sendlast = inyhi; - recvlast = inylo-1; - notdone = 1; - - while (notdone) { - if (nswap == maxswap) grow_swap(); - - swap[nswap].sendproc = procyhi; - swap[nswap].recvproc = procylo; - sendplanes = MIN(sendlast-sendfirst+1,ghostyhi-nsent); - swap[nswap].npack = - indices(swap[nswap].packlist, - outxlo,outxhi,sendlast-sendplanes+1,sendlast,inzlo,inzhi); - - if (procyhi != me) - MPI_Sendrecv(&sendplanes,1,MPI_INT,procyhi,0, - &recvplanes,1,MPI_INT,procylo,0,gridcomm,MPI_STATUS_IGNORE); - else recvplanes = sendplanes; - - swap[nswap].nunpack = - indices(swap[nswap].unpacklist, - outxlo,outxhi,recvlast-recvplanes+1,recvlast,inzlo,inzhi); - - nsent += sendplanes; - sendfirst -= recvplanes; - sendlast -= sendplanes; - recvlast -= recvplanes; - nswap++; - - if (nsent < ghostyhi) notdoneme = 1; - else notdoneme = 0; - MPI_Allreduce(¬doneme,¬done,1,MPI_INT,MPI_SUM,gridcomm); - } - - // send own grid pts to -z processor, recv ghost grid pts from +z processor - - nsent = 0; - sendfirst = inzlo; - sendlast = inzhi; - recvfirst = inzhi+1; - notdone = 1; - - while (notdone) { - if (nswap == 
maxswap) grow_swap(); - - swap[nswap].sendproc = proczlo; - swap[nswap].recvproc = proczhi; - sendplanes = MIN(sendlast-sendfirst+1,ghostzlo-nsent); - swap[nswap].npack = - indices(swap[nswap].packlist, - outxlo,outxhi,outylo,outyhi,sendfirst,sendfirst+sendplanes-1); - - if (proczlo != me) - MPI_Sendrecv(&sendplanes,1,MPI_INT,proczlo,0, - &recvplanes,1,MPI_INT,proczhi,0,gridcomm,MPI_STATUS_IGNORE); - else recvplanes = sendplanes; - - swap[nswap].nunpack = - indices(swap[nswap].unpacklist, - outxlo,outxhi,outylo,outyhi,recvfirst,recvfirst+recvplanes-1); - - nsent += sendplanes; - sendfirst += sendplanes; - sendlast += recvplanes; - recvfirst += recvplanes; - nswap++; - - if (nsent < ghostzlo) notdoneme = 1; - else notdoneme = 0; - MPI_Allreduce(¬doneme,¬done,1,MPI_INT,MPI_SUM,gridcomm); - } - - // send own grid pts to +z processor, recv ghost grid pts from -z processor - - nsent = 0; - sendfirst = inzlo; - sendlast = inzhi; - recvlast = inzlo-1; - notdone = 1; - - while (notdone) { - if (nswap == maxswap) grow_swap(); - - swap[nswap].sendproc = proczhi; - swap[nswap].recvproc = proczlo; - sendplanes = MIN(sendlast-sendfirst+1,ghostzhi-nsent); - swap[nswap].npack = - indices(swap[nswap].packlist, - outxlo,outxhi,outylo,outyhi,sendlast-sendplanes+1,sendlast); - - if (proczhi != me) - MPI_Sendrecv(&sendplanes,1,MPI_INT,proczhi,0, - &recvplanes,1,MPI_INT,proczlo,0,gridcomm,MPI_STATUS_IGNORE); - else recvplanes = sendplanes; - - swap[nswap].nunpack = - indices(swap[nswap].unpacklist, - outxlo,outxhi,outylo,outyhi,recvlast-recvplanes+1,recvlast); - - nsent += sendplanes; - sendfirst -= recvplanes; - sendlast -= sendplanes; - recvlast -= recvplanes; - nswap++; - - if (nsent < ghostzhi) notdoneme = 1; - else notdoneme = 0; - MPI_Allreduce(¬doneme,¬done,1,MPI_INT,MPI_SUM,gridcomm); - } - - // ngrid = max of any forward/reverse pack/unpack grid points - - int ngrid = 0; - for (int i = 0; i < nswap; i++) { - ngrid = MAX(ngrid,swap[i].npack); - ngrid = MAX(ngrid,swap[i].nunpack); - } - - nbuf1 = nbuf2 = ngrid; -} - -/* ---------------------------------------------------------------------- -------------------------------------------------------------------------- */ - -void GridComm2::setup_tiled(int &nbuf1, int &nbuf2) -{ - int i,m; - double xlo,xhi,ylo,yhi,zlo,zhi; - int ghostbox[6],pbc[3]; - - // setup RCB tree of cut info for grid - // access CommTiled to get cut dimension - // cut = this proc's inlo in that dim - // dim is -1 for proc 0, but never accessed - - rcbinfo = (RCBinfo *) - memory->smalloc(nprocs*sizeof(RCBinfo),"GridComm:rcbinfo"); - RCBinfo rcbone; - rcbone.dim = comm->rcbcutdim; - if (rcbone.dim <= 0) rcbone.cut = inxlo; - else if (rcbone.dim == 1) rcbone.cut = inylo; - else if (rcbone.dim == 2) rcbone.cut = inzlo; - MPI_Allgather(&rcbone,sizeof(RCBinfo),MPI_CHAR, - rcbinfo,sizeof(RCBinfo),MPI_CHAR,gridcomm); - - // find overlaps of my extended ghost box with all other procs - // accounts for crossings of periodic boundaries - // noverlap = # of overlaps, including self - // overlap = vector of overlap info using Overlap data struct - - ghostbox[0] = outxlo; - ghostbox[1] = outxhi; - ghostbox[2] = outylo; - ghostbox[3] = outyhi; - ghostbox[4] = outzlo; - ghostbox[5] = outzhi; - - pbc[0] = pbc[1] = pbc[2] = 0; - - memory->create(overlap_procs,nprocs,"GridComm:overlap_procs"); - noverlap = maxoverlap = 0; - overlap = NULL; - - ghost_box_drop(ghostbox,pbc); - - // send each proc an overlap message - // content: me, index of my overlap, box that overlaps with its owned cells - // ncopy = # 
of overlaps with myself, across a periodic boundary - - int *proclist; - memory->create(proclist,noverlap,"GridComm:proclist"); - srequest = (Request *) - memory->smalloc(noverlap*sizeof(Request),"GridComm:srequest"); - - int nsend_request = 0; - ncopy = 0; - - for (m = 0; m < noverlap; m++) { - if (overlap[m].proc == me) ncopy++; - else { - proclist[nsend_request] = overlap[m].proc; - srequest[nsend_request].sender = me; - srequest[nsend_request].index = m; - for (i = 0; i < 6; i++) - srequest[nsend_request].box[i] = overlap[m].box[i]; - nsend_request++; - } - } - - Irregular *irregular = new Irregular(lmp); - int nrecv_request = irregular->create_data(nsend_request,proclist,1); - Request *rrequest = - (Request *) memory->smalloc(nrecv_request*sizeof(Request),"GridComm:rrequest"); - irregular->exchange_data((char *) srequest,sizeof(Request),(char *) rrequest); - irregular->destroy_data(); - - // compute overlaps between received ghost boxes and my owned box - // overlap box used to setup my Send data struct and respond to requests - - send = (Send *) memory->smalloc(nrecv_request*sizeof(Send),"GridComm:send"); - sresponse = (Response *) - memory->smalloc(nrecv_request*sizeof(Response),"GridComm:sresponse"); - memory->destroy(proclist); - memory->create(proclist,nrecv_request,"GridComm:proclist"); - - for (m = 0; m < nrecv_request; m++) { - send[m].proc = rrequest[m].sender; - xlo = MAX(rrequest[m].box[0],inxlo); - xhi = MIN(rrequest[m].box[1],inxhi); - ylo = MAX(rrequest[m].box[2],inylo); - yhi = MIN(rrequest[m].box[3],inyhi); - zlo = MAX(rrequest[m].box[4],inzlo); - zhi = MIN(rrequest[m].box[5],inzhi); - send[m].npack = indices(send[m].packlist,xlo,xhi,ylo,yhi,zlo,zhi); - - proclist[m] = rrequest[m].sender; - sresponse[m].index = rrequest[m].index; - sresponse[m].box[0] = xlo; - sresponse[m].box[1] = xhi; - sresponse[m].box[2] = ylo; - sresponse[m].box[3] = yhi; - sresponse[m].box[4] = zlo; - sresponse[m].box[5] = zhi; - } - - nsend = nrecv_request; - - // reply to each Request message with a Response message - // content: index for the overlap on requestor, overlap box on my owned grid - - int nsend_response = nrecv_request; - int nrecv_response = irregular->create_data(nsend_response,proclist,1); - Response *rresponse = - (Response *) memory->smalloc(nrecv_response*sizeof(Response),"GridComm:rresponse"); - irregular->exchange_data((char *) sresponse,sizeof(Response),(char *) rresponse); - irregular->destroy_data(); - delete irregular; - - // process received responses - // box used to setup my Recv data struct after unwrapping via PBC - // adjacent = 0 if any box of ghost cells does not adjoin my owned cells - - recv = (Recv *) memory->smalloc(nrecv_response*sizeof(Recv),"CommGrid:recv"); - adjacent = 1; - - for (i = 0; i < nrecv_response; i++) { - m = rresponse[i].index; - recv[i].proc = overlap[m].proc; - xlo = rresponse[i].box[0] + overlap[m].pbc[0] * nx; - xhi = rresponse[i].box[1] + overlap[m].pbc[0] * nx; - ylo = rresponse[i].box[2] + overlap[m].pbc[1] * ny; - yhi = rresponse[i].box[3] + overlap[m].pbc[1] * ny; - zlo = rresponse[i].box[4] + overlap[m].pbc[2] * nz; - zhi = rresponse[i].box[5] + overlap[m].pbc[2] * nz; - recv[i].nunpack = indices(recv[i].unpacklist,xlo,xhi,ylo,yhi,zlo,zhi); - - if (xlo != inxhi+1 && xhi != inxlo-1 && - ylo != inyhi+1 && yhi != inylo-1 && - zlo != inzhi+1 && zhi != inzlo-1) adjacent = 0; - } - - nrecv = nrecv_response; - - // create Copy data struct from overlaps with self - - copy = (Copy *) memory->smalloc(ncopy*sizeof(Copy),"CommGrid:copy"); - - 
ncopy = 0; - for (m = 0; m < noverlap; m++) { - if (overlap[m].proc != me) continue; - xlo = overlap[m].box[0]; - xhi = overlap[m].box[1]; - ylo = overlap[m].box[2]; - yhi = overlap[m].box[3]; - zlo = overlap[m].box[4]; - zhi = overlap[m].box[5]; - copy[ncopy].npack = indices(copy[ncopy].packlist,xlo,xhi,ylo,yhi,zlo,zhi); - xlo = overlap[m].box[0] + overlap[m].pbc[0] * nx; - xhi = overlap[m].box[1] + overlap[m].pbc[0] * nx; - ylo = overlap[m].box[2] + overlap[m].pbc[1] * ny; - yhi = overlap[m].box[3] + overlap[m].pbc[1] * ny; - zlo = overlap[m].box[4] + overlap[m].pbc[2] * nz; - zhi = overlap[m].box[5] + overlap[m].pbc[2] * nz; - copy[ncopy].nunpack = indices(copy[ncopy].unpacklist,xlo,xhi,ylo,yhi,zlo,zhi); - ncopy++; - } - - // set offsets for received data - - int offset = 0; - for (m = 0; m < nsend; m++) { - send[m].offset = offset; - offset += send[m].npack; - } - - offset = 0; - for (m = 0; m < nrecv; m++) { - recv[m].offset = offset; - offset += recv[m].nunpack; - } - - // length of MPI requests vector is max of nsend, nrecv - - int nrequest = MAX(nsend,nrecv); - requests = new MPI_Request[nrequest]; - - // clean-up - - memory->sfree(rcbinfo); - memory->destroy(proclist); - memory->destroy(overlap_procs); - memory->sfree(overlap); - memory->sfree(srequest); - memory->sfree(rrequest); - memory->sfree(sresponse); - memory->sfree(rresponse); - - // nbuf1 = largest pack or unpack in any Send or Recv or Copy - // nbuf2 = larget of sum of all packs or unpacks in Send or Recv - - nbuf1 = 0; - - for (m = 0; m < ncopy; m++) { - nbuf1 = MAX(nbuf1,copy[m].npack); - nbuf1 = MAX(nbuf1,copy[m].nunpack); - } - - int nbufs = 0; - for (m = 0; m < nsend; m++) { - nbuf1 = MAX(nbuf1,send[m].npack); - nbufs += send[m].npack; - } - - int nbufr = 0; - for (m = 0; m < nrecv; m++) { - nbuf1 = MAX(nbuf1,recv[m].nunpack); - nbufr += recv[m].nunpack; - } - - nbuf2 = MAX(nbufs,nbufr); -} - -/* ---------------------------------------------------------------------- -------------------------------------------------------------------------- */ - -void GridComm2::ghost_box_drop(int *box, int *pbc) -{ - int i,m; - - // newbox12 and newpbc are initially copies of caller box and pbc - - int newbox1[6],newbox2[6],newpbc[3]; - - for (i = 0; i < 6; i++) newbox1[i] = newbox2[i] = box[i]; - for (i = 0; i < 3; i++) newpbc[i] = pbc[i]; - - // 6 if tests to see if box needs to be split across a periodic boundary - // final else is no split - - int splitflag = 1; - - if (box[0] < 0) { - newbox1[0] = 0; - newbox2[0] = box[0] + nx; - newbox2[1] = nx - 1; - newpbc[0]--; - } else if (box[1] >= nx) { - newbox1[1] = nx - 1; - newbox2[0] = 0; - newbox2[1] = box[1] - nx; - newpbc[0]++; - } else if (box[2] < 0) { - newbox1[2] = 0; - newbox2[2] = box[2] + ny; - newbox2[3] = ny - 1; - newpbc[1]--; - } else if (box[3] >= ny) { - newbox1[3] = ny - 1; - newbox2[2] = 0; - newbox2[3] = box[3] - ny; - newpbc[1]++; - } else if (box[4] < 0) { - newbox1[4] = 0; - newbox2[4] = box[4] + nz; - newbox2[5] = nz - 1; - newpbc[2]--; - } else if (box[5] >= nz) { - newbox1[5] = nz - 1; - newbox2[4] = 0; - newbox2[5] = box[5] - nz; - newpbc[2]++; - - // box is not split, drop on RCB tree - // returns nprocs = # of procs it overlaps, including self - // returns proc_overlap = list of proc IDs it overlaps - // skip self overlap if no crossing of periodic boundaries - - } else { - splitflag = 0; - int np = 0; - box_drop_grid(box,0,nprocs-1,np,overlap_procs); - for (m = 0; m < np; m++) { - if (noverlap == maxoverlap) grow_overlap(); - if (overlap_procs[m] == me 
&& - pbc[0] == 0 && pbc[1] == 0 && pbc[2] == 0) continue; - overlap[noverlap].proc = overlap_procs[m]; - for (i = 0; i < 6; i++) overlap[noverlap].box[i] = box[i]; - for (i = 0; i < 3; i++) overlap[noverlap].pbc[i] = pbc[i]; - noverlap++; - } - } - - // recurse with 2 split boxes - - if (splitflag) { - ghost_box_drop(newbox1,pbc); - ghost_box_drop(newbox2,newpbc); - } -} - -/* ---------------------------------------------------------------------- -------------------------------------------------------------------------- */ - -void GridComm2::box_drop_grid(int *box, int proclower, int procupper, - int &np, int *plist) -{ - // end recursion when partition is a single proc - // add proclower to plist - - if (proclower == procupper) { - plist[np++] = proclower; - return; - } - - // drop box on each side of cut it extends beyond - // use < and >= criteria so does not include a box it only touches - // procmid = 1st processor in upper half of partition - // = location in tree that stores this cut - // cut = index of first grid cell in upper partition - // dim = 0,1,2 dimension of cut - - int procmid = proclower + (procupper - proclower) / 2 + 1; - int dim = rcbinfo[procmid].dim; - int cut = rcbinfo[procmid].cut; - - if (box[2*dim] < cut) box_drop_grid(box,proclower,procmid-1,np,plist); - if (box[2*dim+1] >= cut) box_drop_grid(box,procmid,procupper,np,plist); -} - -/* ---------------------------------------------------------------------- - check if all procs only need ghost info from adjacent procs - return 1 if yes, 0 if no -------------------------------------------------------------------------- */ - -int GridComm2::ghost_adjacent() -{ - if (layout == REGULAR) return ghost_adjacent_regular(); - return ghost_adjacent_tiled(); -} - -/* ---------------------------------------------------------------------- - adjacent = 0 if a proc's ghost xyz lohi values exceed its subdomain size - return 0 if adjacent=0 for any proc, else 1 -------------------------------------------------------------------------- */ - -int GridComm2::ghost_adjacent_regular() -{ - adjacent = 1; - if (ghostxlo > inxhi-inxlo+1) adjacent = 0; - if (ghostxhi > inxhi-inxlo+1) adjacent = 0; - if (ghostylo > inyhi-inylo+1) adjacent = 0; - if (ghostyhi > inyhi-inylo+1) adjacent = 0; - if (ghostzlo > inzhi-inzlo+1) adjacent = 0; - if (ghostzhi > inzhi-inzlo+1) adjacent = 0; - - int adjacent_all; - MPI_Allreduce(&adjacent,&adjacent_all,1,MPI_INT,MPI_MIN,gridcomm); - return adjacent_all; -} - -/* ---------------------------------------------------------------------- - adjacent = 0 if a proc's received ghosts were flagged - as non-adjacent in setup_tiled() - return 0 if adjacent=0 for any proc, else 1 -------------------------------------------------------------------------- */ - -int GridComm2::ghost_adjacent_tiled() -{ - int adjacent_all; - MPI_Allreduce(&adjacent,&adjacent_all,1,MPI_INT,MPI_MIN,gridcomm); - return adjacent_all; -} - -/* ---------------------------------------------------------------------- - use swap list in forward order to acquire copy of all needed ghost grid pts -------------------------------------------------------------------------- */ - -void GridComm2::forward_comm_kspace(KSpace *kspace, int nper, int nbyte, int which, - void *buf1, void *buf2, MPI_Datatype datatype) -{ - if (layout == REGULAR) - forward_comm_kspace_regular(kspace,nper,nbyte,which,buf1,buf2,datatype); - else - forward_comm_kspace_tiled(kspace,nper,nbyte,which,buf1,buf2,datatype); -} - -/* 
---------------------------------------------------------------------- */ - -void GridComm2:: -forward_comm_kspace_regular(KSpace *kspace, int nper, int nbyte, int which, - void *buf1, void *buf2, MPI_Datatype datatype) -{ - int m; - MPI_Request request; - - for (m = 0; m < nswap; m++) { - if (swap[m].sendproc == me) - kspace->pack_forward_grid(which,buf2,swap[m].npack,swap[m].packlist); - else - kspace->pack_forward_grid(which,buf1,swap[m].npack,swap[m].packlist); - - if (swap[m].sendproc != me) { - if (swap[m].nunpack) MPI_Irecv(buf2,nper*swap[m].nunpack,datatype, - swap[m].recvproc,0,gridcomm,&request); - if (swap[m].npack) MPI_Send(buf1,nper*swap[m].npack,datatype, - swap[m].sendproc,0,gridcomm); - if (swap[m].nunpack) MPI_Wait(&request,MPI_STATUS_IGNORE); - } - - kspace->unpack_forward_grid(which,buf2,swap[m].nunpack,swap[m].unpacklist); - } -} - -/* ---------------------------------------------------------------------- */ - -void GridComm2:: -forward_comm_kspace_tiled(KSpace *kspace, int nper, int nbyte, int which, - void *buf1, void *vbuf2, MPI_Datatype datatype) -{ - int i,m,offset; - - char *buf2 = (char *) vbuf2; - - // post all receives - - for (m = 0; m < nrecv; m++) { - offset = nper * recv[m].offset * nbyte; - MPI_Irecv((void *) &buf2[offset],nper*recv[m].nunpack,datatype, - recv[m].proc,0,gridcomm,&requests[m]); - } - - // perform all sends to other procs - - for (m = 0; m < nsend; m++) { - kspace->pack_forward_grid(which,buf1,send[m].npack,send[m].packlist); - MPI_Send(buf1,nper*send[m].npack,datatype,send[m].proc,0,gridcomm); - } - - // perform all copies to self - - for (m = 0; m < ncopy; m++) { - kspace->pack_forward_grid(which,buf1,copy[m].npack,copy[m].packlist); - kspace->unpack_forward_grid(which,buf1,copy[m].nunpack,copy[m].unpacklist); - } - - // unpack all received data - - for (i = 0; i < nrecv; i++) { - MPI_Waitany(nrecv,requests,&m,MPI_STATUS_IGNORE); - offset = nper * recv[m].offset * nbyte; - kspace->unpack_forward_grid(which,(void *) &buf2[offset], - recv[m].nunpack,recv[m].unpacklist); - } -} - -/* ---------------------------------------------------------------------- - use swap list in reverse order to compute fully summed value - for each owned grid pt that some other proc has copy of as a ghost grid pt -------------------------------------------------------------------------- */ - -void GridComm2::reverse_comm_kspace(KSpace *kspace, int nper, int nbyte, int which, - void *buf1, void *buf2, MPI_Datatype datatype) -{ - if (layout == REGULAR) - reverse_comm_kspace_regular(kspace,nper,nbyte,which,buf1,buf2,datatype); - else - reverse_comm_kspace_tiled(kspace,nper,nbyte,which,buf1,buf2,datatype); -} - -/* ---------------------------------------------------------------------- */ - -void GridComm2:: -reverse_comm_kspace_regular(KSpace *kspace, int nper, int nbyte, int which, - void *buf1, void *buf2, MPI_Datatype datatype) -{ - int m; - MPI_Request request; - - for (m = nswap-1; m >= 0; m--) { - if (swap[m].recvproc == me) - kspace->pack_reverse_grid(which,buf2,swap[m].nunpack,swap[m].unpacklist); - else - kspace->pack_reverse_grid(which,buf1,swap[m].nunpack,swap[m].unpacklist); - - if (swap[m].recvproc != me) { - if (swap[m].npack) MPI_Irecv(buf2,nper*swap[m].npack,datatype, - swap[m].sendproc,0,gridcomm,&request); - if (swap[m].nunpack) MPI_Send(buf1,nper*swap[m].nunpack,datatype, - swap[m].recvproc,0,gridcomm); - if (swap[m].npack) MPI_Wait(&request,MPI_STATUS_IGNORE); - } - - kspace->unpack_reverse_grid(which,buf2,swap[m].npack,swap[m].packlist); - } -} - 
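
Neither the regular nor the tiled path interprets the data it moves: the communicator ships nper values per grid point and hands the KSpace style a list of 1d offsets (built by indices() further below) into its ghost-extended brick, so the meaning of the which flag lives entirely in the style's pack/unpack callbacks. As an illustration of how the two sides fit together, here is a sketch of an ik-style forward pack/unpack pair for nper = 3; it mirrors what PPPM does for FORWARD_IK and is written against the brick arrays and bounds declared in the deleted pppm2.cpp, but it is an example, not code taken from this patch.

// Sketch of the KSpace-side callbacks invoked by forward_comm_kspace()
// with nper = 3 and which = FORWARD_IK.  list[] holds 1d offsets into the
// ghost-extended brick, as produced by GridComm2::indices().

void PPPM2::pack_forward_grid(int which, void *vbuf, int nlist, int *list)
{
  FFT_SCALAR *buf = (FFT_SCALAR *) vbuf;

  if (which == FORWARD_IK) {
    FFT_SCALAR *xsrc = &vdx_brick[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *ysrc = &vdy_brick[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *zsrc = &vdz_brick[nzlo_out][nylo_out][nxlo_out];
    for (int i = 0; i < nlist; i++) {
      buf[3*i]   = xsrc[list[i]];     // 3 values per grid point = nper
      buf[3*i+1] = ysrc[list[i]];
      buf[3*i+2] = zsrc[list[i]];
    }
  }
}

void PPPM2::unpack_forward_grid(int which, void *vbuf, int nlist, int *list)
{
  FFT_SCALAR *buf = (FFT_SCALAR *) vbuf;

  if (which == FORWARD_IK) {
    FFT_SCALAR *xdest = &vdx_brick[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *ydest = &vdy_brick[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *zdest = &vdz_brick[nzlo_out][nylo_out][nxlo_out];
    for (int i = 0; i < nlist; i++) {
      xdest[list[i]] = buf[3*i];
      ydest[list[i]] = buf[3*i+1];
      zdest[list[i]] = buf[3*i+2];
    }
  }
}

An ad-style exchange would use nper = 1 and copy only u_brick, and the per-atom exchanges carry the energy/virial bricks, which is where the 1, 3, 6, and 7 counts in the converted call sites earlier in this series come from.
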
-/* ---------------------------------------------------------------------- */ - -void GridComm2:: -reverse_comm_kspace_tiled(KSpace *kspace, int nper, int nbyte, int which, - void *buf1, void *vbuf2, MPI_Datatype datatype) -{ - int i,m,offset; - - char *buf2 = (char *) vbuf2; - - // post all receives - - for (m = 0; m < nsend; m++) { - offset = nper * send[m].offset * nbyte; - MPI_Irecv((void *) &buf2[offset],nper*send[m].npack,datatype, - send[m].proc,0,gridcomm,&requests[m]); - } - - // perform all sends to other procs - - for (m = 0; m < nrecv; m++) { - kspace->pack_reverse_grid(which,buf1,recv[m].nunpack,recv[m].unpacklist); - MPI_Send(buf1,nper*recv[m].nunpack,datatype,recv[m].proc,0,gridcomm); - } - - // perform all copies to self - - for (m = 0; m < ncopy; m++) { - kspace->pack_reverse_grid(which,buf1,copy[m].nunpack,copy[m].unpacklist); - kspace->unpack_reverse_grid(which,buf1,copy[m].npack,copy[m].packlist); - } - - // unpack all received data - - for (i = 0; i < nsend; i++) { - MPI_Waitany(nsend,requests,&m,MPI_STATUS_IGNORE); - offset = nper * send[m].offset * nbyte; - kspace->unpack_reverse_grid(which,(void *) &buf2[offset], - send[m].npack,send[m].packlist); - } -} - -/* ---------------------------------------------------------------------- - create swap stencil for grid own/ghost communication - swaps covers all 3 dimensions and both directions - swaps cover multiple iterations in a direction if need grid pts - from further away than nearest-neighbor proc - same swap list used by forward and reverse communication -------------------------------------------------------------------------- */ - -void GridComm2::grow_swap() -{ - maxswap += SWAPDELTA; - swap = (Swap *) - memory->srealloc(swap,maxswap*sizeof(Swap),"CommGrid:swap"); -} - -/* ---------------------------------------------------------------------- - create swap stencil for grid own/ghost communication - swaps covers all 3 dimensions and both directions - swaps cover multiple iterations in a direction if need grid pts - from further away than nearest-neighbor proc - same swap list used by forward and reverse communication -------------------------------------------------------------------------- */ - -void GridComm2::grow_overlap() -{ - maxoverlap += SWAPDELTA; - overlap = (Overlap *) - memory->srealloc(overlap,maxoverlap*sizeof(Overlap),"CommGrid:overlap"); -} - -/* ---------------------------------------------------------------------- - create 1d list of offsets into 3d array section (xlo:xhi,ylo:yhi,zlo:zhi) - assume 3d array is allocated as (outxlo_max:outxhi_max,outylo_max:outyhi_max, - outzlo_max:outzhi_max) -------------------------------------------------------------------------- */ - -int GridComm2::indices(int *&list, - int xlo, int xhi, int ylo, int yhi, int zlo, int zhi) -{ - int nmax = (xhi-xlo+1) * (yhi-ylo+1) * (zhi-zlo+1); - memory->create(list,nmax,"CommGrid:indices"); - if (nmax == 0) return 0; - - int nx = (outxhi_max-outxlo_max+1); - int ny = (outyhi_max-outylo_max+1); - - int n = 0; - int ix,iy,iz; - for (iz = zlo; iz <= zhi; iz++) - for (iy = ylo; iy <= yhi; iy++) - for (ix = xlo; ix <= xhi; ix++) - list[n++] = (iz-outzlo_max)*ny*nx + (iy-outylo_max)*nx + (ix-outxlo_max); - - return nmax; -} diff --git a/src/KSPACE/gridcomm2.h b/src/KSPACE/gridcomm2.h deleted file mode 100644 index 66cf9d42e5..0000000000 --- a/src/KSPACE/gridcomm2.h +++ /dev/null @@ -1,203 +0,0 @@ -/* -*- c++ -*- ---------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel 
Simulator - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - Copyright (2003) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. - - See the README file in the top-level LAMMPS directory. -------------------------------------------------------------------------- */ - -#ifndef LMP_GRIDCOMM2_H -#define LMP_GRIDCOMM2_H - -#include "pointers.h" - -namespace LAMMPS_NS { - -class GridComm2 : protected Pointers { - public: - GridComm2(class LAMMPS *, MPI_Comm, int, int, int, - int, int, int, int, int, int, - int, int, int, int, int, int); - GridComm2(class LAMMPS *, MPI_Comm, int, int, int, - int, int, int, int, int, int, - int, int, int, int, int, int, - int, int, int, int, int, int); - ~GridComm2(); - void setup(int &, int &); - int ghost_adjacent(); - void forward_comm_kspace(class KSpace *, int, int, int, - void *, void *, MPI_Datatype); - void reverse_comm_kspace(class KSpace *, int, int, int, - void *, void *, MPI_Datatype); - - private: - int me,nprocs; - int layout; // REGULAR or TILED - MPI_Comm gridcomm; - - // inputs from caller via constructor - - int nx,ny,nz; // size of global grid in all 3 dims - int inxlo,inxhi; // inclusive extent of my grid chunk - int inylo,inyhi; // 0 <= in <= N-1 - int inzlo,inzhi; - int outxlo,outxhi; // inclusive extent of my grid chunk plus - int outylo,outyhi; // ghost cells in all 6 directions - int outzlo,outzhi; // lo indices can be < 0, hi indices can be >= N - int outxlo_max,outxhi_max; // ?? - int outylo_max,outyhi_max; - int outzlo_max,outzhi_max; - - // ------------------------------------------- - // internal variables for REGULAR layout - // ------------------------------------------- - - int procxlo,procxhi; // 6 neighbor procs that adjoin me - int procylo,procyhi; // not used for comm_style = tiled - int proczlo,proczhi; - - int ghostxlo,ghostxhi; // # of my owned grid planes needed - int ghostylo,ghostyhi; // by neighobr procs in each dir as their ghost planes - int ghostzlo,ghostzhi; - - // swap = exchange of owned and ghost grid cells between 2 procs, including self - - struct Swap { - int sendproc; // proc to send to for forward comm - int recvproc; // proc to recv from for forward comm - int npack; // # of datums to pack - int nunpack; // # of datums to unpack - int *packlist; // 3d array offsets to pack - int *unpacklist; // 3d array offsets to unpack - }; - - int nswap,maxswap; - Swap *swap; - - // ------------------------------------------- - // internal variables for TILED layout - // ------------------------------------------- - - int *overlap_procs; - MPI_Request *requests; - - // RCB tree of cut info - // each proc contributes one value, except proc 0 - - struct RCBinfo { - int dim; // 0,1,2 = which dim the cut is in - int cut; // grid index of lowest cell in upper half of cut - }; - - RCBinfo *rcbinfo; - - // overlap = a proc whose owned cells overlap with my extended ghost box - // includes overlaps across periodic boundaries, can also be self - - struct Overlap { - int proc; // proc whose owned cells overlap my ghost cells - int box[6]; // box that overlaps otherproc's owned cells - // this box is wholly contained within global grid - int pbc[3]; // PBC offsets to convert box to a portion of my ghost box - // my ghost box may extend beyond global grid - }; - - int noverlap,maxoverlap; - Overlap *overlap; - - // 
request = sent to each proc whose owned cells overlap my ghost cells - - struct Request { - int sender; // sending proc - int index; // index of overlap on sender - int box[6]; // box that overlaps receiver's owned cells - // wholly contained within global grid - }; - - Request *srequest,*rrequest; - - // response = reply from each proc whose owned cells overlap my ghost cells - - struct Response { - int index; // index of my overlap for the initial request - int box[6]; // box that overlaps responder's owned cells - // wholly contained within global grid - // has to unwrapped by PBC to map to my ghost cells - }; - - Response *sresponse,*rresponse; - - // send = proc to send a subset of my owned cells to, for forward comm - // for reverse comm, proc I receive ghost overlaps with my owned cells from - // offset used in reverse comm to recv a message in middle of a large buffer - - struct Send { - int proc; - int npack; - int *packlist; - int offset; - }; - - // recv = proc to recv a subset of my ghost cells from, for forward comm - // for reverse comm, proc I send a subset of my ghost cells to - // offset used in forward comm to recv a message in middle of a large buffer - - struct Recv { - int proc; - int nunpack; - int *unpacklist; - int offset; - }; - - int adjacent; // 0 on a proc who receives ghosts from a non-neighbor proc - - // copy = subset of my owned cells to copy into subset of my ghost cells - // that describes forward comm, for reverse comm it is the opposite - - struct Copy { - int npack; - int nunpack; - int *packlist; - int *unpacklist; - }; - - int nsend,nrecv,ncopy; - Send *send; - Recv *recv; - Copy *copy; - - // ------------------------------------------- - // internal methods - // ------------------------------------------- - - void setup_regular(int &, int &); - void setup_tiled(int &, int &); - void ghost_box_drop(int *, int *); - void box_drop_grid(int *, int, int, int &, int *); - - int ghost_adjacent_regular(); - int ghost_adjacent_tiled(); - - void forward_comm_kspace_regular(class KSpace *, int, int, int, - void *, void *, MPI_Datatype); - void forward_comm_kspace_tiled(class KSpace *, int, int, int, - void *, void *, MPI_Datatype); - void reverse_comm_kspace_regular(class KSpace *, int, int, int, - void *, void *, MPI_Datatype); - void reverse_comm_kspace_tiled(class KSpace *, int, int, int, - void *, void *, MPI_Datatype); - - void grow_swap(); - void grow_overlap(); - - int indices(int *&, int, int, int, int, int, int); -}; - -} - -#endif diff --git a/src/KSPACE/pppm2.cpp b/src/KSPACE/pppm2.cpp deleted file mode 100644 index 927c9edee5..0000000000 --- a/src/KSPACE/pppm2.cpp +++ /dev/null @@ -1,3524 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - Copyright (2003) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. - - See the README file in the top-level LAMMPS directory. 
-------------------------------------------------------------------------- */ - -/* ---------------------------------------------------------------------- - Contributing authors: Roy Pollock (LLNL), Paul Crozier (SNL) - per-atom energy/virial & group/group energy/force added by Stan Moore (BYU) - analytic diff (2 FFT) option added by Rolf Isele-Holder (Aachen University) - triclinic added by Stan Moore (SNL) -------------------------------------------------------------------------- */ - -#include "pppm2.h" -#include -#include -#include -#include -#include "atom.h" -#include "comm.h" -#include "gridcomm2.h" -#include "neighbor.h" -#include "force.h" -#include "pair.h" -#include "bond.h" -#include "angle.h" -#include "domain.h" -#include "fft3d_wrap.h" -#include "remap_wrap.h" -#include "memory.h" -#include "error.h" -#include "utils.h" -#include "fmt/format.h" - -#include "math_const.h" -#include "math_special.h" - -using namespace LAMMPS_NS; -using namespace MathConst; -using namespace MathSpecial; - -#define MAXORDER 7 -#define OFFSET 16384 -#define LARGE 10000.0 -#define SMALL 0.00001 -#define EPS_HOC 1.0e-7 - -enum{REVERSE_RHO}; -enum{FORWARD_IK,FORWARD_AD,FORWARD_IK_PERATOM,FORWARD_AD_PERATOM}; - -#ifdef FFT_SINGLE -#define ZEROF 0.0f -#define ONEF 1.0f -#else -#define ZEROF 0.0 -#define ONEF 1.0 -#endif - -/* ---------------------------------------------------------------------- */ - -PPPM2::PPPM2(LAMMPS *lmp) : KSpace(lmp), - factors(NULL), density_brick(NULL), vdx_brick(NULL), vdy_brick(NULL), vdz_brick(NULL), - u_brick(NULL), v0_brick(NULL), v1_brick(NULL), v2_brick(NULL), v3_brick(NULL), - v4_brick(NULL), v5_brick(NULL), greensfn(NULL), vg(NULL), fkx(NULL), fky(NULL), - fkz(NULL), density_fft(NULL), work1(NULL), work2(NULL), gf_b(NULL), rho1d(NULL), - rho_coeff(NULL), drho1d(NULL), drho_coeff(NULL), - sf_precoeff1(NULL), sf_precoeff2(NULL), sf_precoeff3(NULL), - sf_precoeff4(NULL), sf_precoeff5(NULL), sf_precoeff6(NULL), - acons(NULL), density_A_brick(NULL), density_B_brick(NULL), density_A_fft(NULL), - density_B_fft(NULL), fft1(NULL), fft2(NULL), remap(NULL), gc(NULL), - gc_buf1(NULL), gc_buf2(NULL), part2grid(NULL), boxlo(NULL) -{ - peratom_allocate_flag = 0; - group_allocate_flag = 0; - - pppmflag = 1; - group_group_enable = 1; - triclinic = domain->triclinic; - - nfactors = 3; - factors = new int[nfactors]; - factors[0] = 2; - factors[1] = 3; - factors[2] = 5; - - MPI_Comm_rank(world,&me); - MPI_Comm_size(world,&nprocs); - - nfft_both = 0; - nxhi_in = nxlo_in = nxhi_out = nxlo_out = 0; - nyhi_in = nylo_in = nyhi_out = nylo_out = 0; - nzhi_in = nzlo_in = nzhi_out = nzlo_out = 0; - - density_brick = vdx_brick = vdy_brick = vdz_brick = NULL; - density_fft = NULL; - u_brick = NULL; - v0_brick = v1_brick = v2_brick = v3_brick = v4_brick = v5_brick = NULL; - greensfn = NULL; - work1 = work2 = NULL; - vg = NULL; - fkx = fky = fkz = NULL; - - sf_precoeff1 = sf_precoeff2 = sf_precoeff3 = - sf_precoeff4 = sf_precoeff5 = sf_precoeff6 = NULL; - - density_A_brick = density_B_brick = NULL; - density_A_fft = density_B_fft = NULL; - - gf_b = NULL; - rho1d = rho_coeff = drho1d = drho_coeff = NULL; - - fft1 = fft2 = NULL; - remap = NULL; - gc = NULL; - gc_buf1 = gc_buf2 = NULL; - - nmax = 0; - part2grid = NULL; - - // define acons coefficients for estimation of kspace errors - // see JCP 109, pg 7698 for derivation of coefficients - // higher order coefficients may be computed if needed - - memory->create(acons,8,7,"pppm:acons"); - acons[1][0] = 2.0 / 3.0; - acons[2][0] = 1.0 / 50.0; - 
acons[2][1] = 5.0 / 294.0; - acons[3][0] = 1.0 / 588.0; - acons[3][1] = 7.0 / 1440.0; - acons[3][2] = 21.0 / 3872.0; - acons[4][0] = 1.0 / 4320.0; - acons[4][1] = 3.0 / 1936.0; - acons[4][2] = 7601.0 / 2271360.0; - acons[4][3] = 143.0 / 28800.0; - acons[5][0] = 1.0 / 23232.0; - acons[5][1] = 7601.0 / 13628160.0; - acons[5][2] = 143.0 / 69120.0; - acons[5][3] = 517231.0 / 106536960.0; - acons[5][4] = 106640677.0 / 11737571328.0; - acons[6][0] = 691.0 / 68140800.0; - acons[6][1] = 13.0 / 57600.0; - acons[6][2] = 47021.0 / 35512320.0; - acons[6][3] = 9694607.0 / 2095994880.0; - acons[6][4] = 733191589.0 / 59609088000.0; - acons[6][5] = 326190917.0 / 11700633600.0; - acons[7][0] = 1.0 / 345600.0; - acons[7][1] = 3617.0 / 35512320.0; - acons[7][2] = 745739.0 / 838397952.0; - acons[7][3] = 56399353.0 / 12773376000.0; - acons[7][4] = 25091609.0 / 1560084480.0; - acons[7][5] = 1755948832039.0 / 36229939200000.0; - acons[7][6] = 4887769399.0 / 37838389248.0; -} - -/* ---------------------------------------------------------------------- */ - -void PPPM2::settings(int narg, char **arg) -{ - if (narg < 1) error->all(FLERR,"Illegal kspace_style pppm command"); - accuracy_relative = fabs(force->numeric(FLERR,arg[0])); -} - -/* ---------------------------------------------------------------------- - free all memory -------------------------------------------------------------------------- */ - -PPPM2::~PPPM2() -{ - if (copymode) return; - - delete [] factors; - deallocate(); - if (peratom_allocate_flag) deallocate_peratom(); - if (group_allocate_flag) deallocate_groups(); - memory->destroy(part2grid); - memory->destroy(acons); -} - -/* ---------------------------------------------------------------------- - called once before run -------------------------------------------------------------------------- */ - -void PPPM2::init() -{ - if (me == 0) utils::logmesg(lmp,"PPPM initialization ...\n"); - - // error check - - triclinic_check(); - - if (triclinic != domain->triclinic) - error->all(FLERR,"Must redefine kspace_style after changing to triclinic box"); - - if (domain->triclinic && differentiation_flag == 1) - error->all(FLERR,"Cannot (yet) use PPPM with triclinic box " - "and kspace_modify diff ad"); - if (domain->triclinic && slabflag) - error->all(FLERR,"Cannot (yet) use PPPM with triclinic box and " - "slab correction"); - if (domain->dimension == 2) - error->all(FLERR,"Cannot use PPPM with 2d simulation"); - - if (!atom->q_flag) - error->all(FLERR,"Kspace style requires atom attribute q"); - - if (slabflag == 0 && domain->nonperiodic > 0) - error->all(FLERR,"Cannot use non-periodic boundaries with PPPM"); - if (slabflag) { - if (domain->xperiodic != 1 || domain->yperiodic != 1 || - domain->boundary[2][0] != 1 || domain->boundary[2][1] != 1) - error->all(FLERR,"Incorrect boundaries with slab PPPM"); - } - - if (order < 2 || order > MAXORDER) - error->all(FLERR,fmt::format("PPPM order cannot be < 2 or > {}",MAXORDER)); - - // compute two charge force - - two_charge(); - - // extract short-range Coulombic cutoff from pair style - - triclinic = domain->triclinic; - pair_check(); - - int itmp = 0; - double *p_cutoff = (double *) force->pair->extract("cut_coul",itmp); - if (p_cutoff == NULL) - error->all(FLERR,"KSpace style is incompatible with Pair style"); - cutoff = *p_cutoff; - - // if kspace is TIP4P, extract TIP4P params from pair style - // bond/angle are not yet init(), so insure equilibrium request is valid - - qdist = 0.0; - - if (tip4pflag) { - if (me == 0) utils::logmesg(lmp," extracting 
TIP4P info from pair style\n"); - - double *p_qdist = (double *) force->pair->extract("qdist",itmp); - int *p_typeO = (int *) force->pair->extract("typeO",itmp); - int *p_typeH = (int *) force->pair->extract("typeH",itmp); - int *p_typeA = (int *) force->pair->extract("typeA",itmp); - int *p_typeB = (int *) force->pair->extract("typeB",itmp); - if (!p_qdist || !p_typeO || !p_typeH || !p_typeA || !p_typeB) - error->all(FLERR,"Pair style is incompatible with TIP4P KSpace style"); - qdist = *p_qdist; - typeO = *p_typeO; - typeH = *p_typeH; - int typeA = *p_typeA; - int typeB = *p_typeB; - - if (force->angle == NULL || force->bond == NULL || - force->angle->setflag == NULL || force->bond->setflag == NULL) - error->all(FLERR,"Bond and angle potentials must be defined for TIP4P"); - if (typeA < 1 || typeA > atom->nangletypes || - force->angle->setflag[typeA] == 0) - error->all(FLERR,"Bad TIP4P angle type for PPPM/TIP4P"); - if (typeB < 1 || typeB > atom->nbondtypes || - force->bond->setflag[typeB] == 0) - error->all(FLERR,"Bad TIP4P bond type for PPPM/TIP4P"); - double theta = force->angle->equilibrium_angle(typeA); - double blen = force->bond->equilibrium_distance(typeB); - alpha = qdist / (cos(0.5*theta) * blen); - } - - // compute qsum & qsqsum and warn if not charge-neutral - - scale = 1.0; - qqrd2e = force->qqrd2e; - qsum_qsq(); - natoms_original = atom->natoms; - - // set accuracy (force units) from accuracy_relative or accuracy_absolute - - if (accuracy_absolute >= 0.0) accuracy = accuracy_absolute; - else accuracy = accuracy_relative * two_charge_force; - - // free all arrays previously allocated - - deallocate(); - if (peratom_allocate_flag) deallocate_peratom(); - if (group_allocate_flag) deallocate_groups(); - - // setup FFT grid resolution and g_ewald - // normally one iteration thru while loop is all that is required - // if grid stencil does not extend beyond neighbor proc - // or overlap is allowed, then done - // else reduce order and try again - - int (*procneigh)[2] = comm->procneigh; - - GridComm2 *gctmp = NULL; - int iteration = 0; - - while (order >= minorder) { - if (iteration && me == 0) - error->warning(FLERR,"Reducing PPPM order b/c stencil extends " - "beyond nearest neighbor processor"); - - if (stagger_flag && !differentiation_flag) compute_gf_denom(); - set_grid_global(); - set_grid_local(); - if (overlap_allowed) break; - - gctmp = new GridComm2(lmp,world,nx_pppm,ny_pppm,nz_pppm, - nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, - nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out); - - int tmp1,tmp2; - gctmp->setup(tmp1,tmp2); - if (gctmp->ghost_adjacent()) break; - delete gctmp; - - order--; - iteration++; - } - - if (order < minorder) error->all(FLERR,"PPPM order < minimum allowed order"); - if (!overlap_allowed && !gctmp->ghost_adjacent()) - error->all(FLERR,"PPPM grid stencil extends " - "beyond nearest neighbor processor"); - if (gctmp) delete gctmp; - - // adjust g_ewald - - if (!gewaldflag) adjust_gewald(); - - // calculate the final accuracy - - double estimated_accuracy = final_accuracy(); - - // print stats - - int ngrid_max,nfft_both_max; - MPI_Allreduce(&ngrid,&ngrid_max,1,MPI_INT,MPI_MAX,world); - MPI_Allreduce(&nfft_both,&nfft_both_max,1,MPI_INT,MPI_MAX,world); - - if (me == 0) { - std::string mesg = fmt::format(" G vector (1/distance) = {:.8g}\n",g_ewald); - mesg += fmt::format(" grid = {} {} {}\n",nx_pppm,ny_pppm,nz_pppm); - mesg += fmt::format(" stencil order = {}\n",order); - mesg += fmt::format(" estimated absolute RMS force accuracy = 
{:.8g}\n", - estimated_accuracy); - mesg += fmt::format(" estimated relative force accuracy = {:.8g}\n", - estimated_accuracy/two_charge_force); - mesg += " using " LMP_FFT_PREC " precision " LMP_FFT_LIB "\n"; - mesg += fmt::format(" 3d grid and FFT values/proc = {} {}\n", - ngrid_max,nfft_both_max); - utils::logmesg(lmp,mesg); - } - - // allocate K-space dependent memory - // don't invoke allocate peratom() or group(), will be allocated when needed - - allocate(); - - // pre-compute Green's function denomiator expansion - // pre-compute 1d charge distribution coefficients - - compute_gf_denom(); - if (differentiation_flag == 1) compute_sf_precoeff(); - compute_rho_coeff(); -} - -/* ---------------------------------------------------------------------- - adjust PPPM coeffs, called initially and whenever volume has changed -------------------------------------------------------------------------- */ - -void PPPM2::setup() -{ - if (triclinic) { - setup_triclinic(); - return; - } - - // perform some checks to avoid illegal boundaries with read_data - - if (slabflag == 0 && domain->nonperiodic > 0) - error->all(FLERR,"Cannot use non-periodic boundaries with PPPM"); - if (slabflag) { - if (domain->xperiodic != 1 || domain->yperiodic != 1 || - domain->boundary[2][0] != 1 || domain->boundary[2][1] != 1) - error->all(FLERR,"Incorrect boundaries with slab PPPM"); - } - - int i,j,k,n; - double *prd; - - // volume-dependent factors - // adjust z dimension for 2d slab PPPM - // z dimension for 3d PPPM is zprd since slab_volfactor = 1.0 - - if (triclinic == 0) prd = domain->prd; - else prd = domain->prd_lamda; - - double xprd = prd[0]; - double yprd = prd[1]; - double zprd = prd[2]; - double zprd_slab = zprd*slab_volfactor; - volume = xprd * yprd * zprd_slab; - - delxinv = nx_pppm/xprd; - delyinv = ny_pppm/yprd; - delzinv = nz_pppm/zprd_slab; - - delvolinv = delxinv*delyinv*delzinv; - - double unitkx = (MY_2PI/xprd); - double unitky = (MY_2PI/yprd); - double unitkz = (MY_2PI/zprd_slab); - - // fkx,fky,fkz for my FFT grid pts - - double per; - - for (i = nxlo_fft; i <= nxhi_fft; i++) { - per = i - nx_pppm*(2*i/nx_pppm); - fkx[i] = unitkx*per; - } - - for (i = nylo_fft; i <= nyhi_fft; i++) { - per = i - ny_pppm*(2*i/ny_pppm); - fky[i] = unitky*per; - } - - for (i = nzlo_fft; i <= nzhi_fft; i++) { - per = i - nz_pppm*(2*i/nz_pppm); - fkz[i] = unitkz*per; - } - - // virial coefficients - - double sqk,vterm; - - n = 0; - for (k = nzlo_fft; k <= nzhi_fft; k++) { - for (j = nylo_fft; j <= nyhi_fft; j++) { - for (i = nxlo_fft; i <= nxhi_fft; i++) { - sqk = fkx[i]*fkx[i] + fky[j]*fky[j] + fkz[k]*fkz[k]; - if (sqk == 0.0) { - vg[n][0] = 0.0; - vg[n][1] = 0.0; - vg[n][2] = 0.0; - vg[n][3] = 0.0; - vg[n][4] = 0.0; - vg[n][5] = 0.0; - } else { - vterm = -2.0 * (1.0/sqk + 0.25/(g_ewald*g_ewald)); - vg[n][0] = 1.0 + vterm*fkx[i]*fkx[i]; - vg[n][1] = 1.0 + vterm*fky[j]*fky[j]; - vg[n][2] = 1.0 + vterm*fkz[k]*fkz[k]; - vg[n][3] = vterm*fkx[i]*fky[j]; - vg[n][4] = vterm*fkx[i]*fkz[k]; - vg[n][5] = vterm*fky[j]*fkz[k]; - } - n++; - } - } - } - - if (differentiation_flag == 1) compute_gf_ad(); - else compute_gf_ik(); -} - -/* ---------------------------------------------------------------------- - adjust PPPM coeffs, called initially and whenever volume has changed - for a triclinic system -------------------------------------------------------------------------- */ - -void PPPM2::setup_triclinic() -{ - int i,j,k,n; - double *prd; - - // volume-dependent factors - // adjust z dimension for 2d slab PPPM - // z dimension 
for 3d PPPM is zprd since slab_volfactor = 1.0 - - prd = domain->prd; - - double xprd = prd[0]; - double yprd = prd[1]; - double zprd = prd[2]; - double zprd_slab = zprd*slab_volfactor; - volume = xprd * yprd * zprd_slab; - - // use lamda (0-1) coordinates - - delxinv = nx_pppm; - delyinv = ny_pppm; - delzinv = nz_pppm; - delvolinv = delxinv*delyinv*delzinv/volume; - - // fkx,fky,fkz for my FFT grid pts - - double per_i,per_j,per_k; - - n = 0; - for (k = nzlo_fft; k <= nzhi_fft; k++) { - per_k = k - nz_pppm*(2*k/nz_pppm); - for (j = nylo_fft; j <= nyhi_fft; j++) { - per_j = j - ny_pppm*(2*j/ny_pppm); - for (i = nxlo_fft; i <= nxhi_fft; i++) { - per_i = i - nx_pppm*(2*i/nx_pppm); - - double unitk_lamda[3]; - unitk_lamda[0] = 2.0*MY_PI*per_i; - unitk_lamda[1] = 2.0*MY_PI*per_j; - unitk_lamda[2] = 2.0*MY_PI*per_k; - x2lamdaT(&unitk_lamda[0],&unitk_lamda[0]); - fkx[n] = unitk_lamda[0]; - fky[n] = unitk_lamda[1]; - fkz[n] = unitk_lamda[2]; - n++; - } - } - } - - // virial coefficients - - double sqk,vterm; - - for (n = 0; n < nfft; n++) { - sqk = fkx[n]*fkx[n] + fky[n]*fky[n] + fkz[n]*fkz[n]; - if (sqk == 0.0) { - vg[n][0] = 0.0; - vg[n][1] = 0.0; - vg[n][2] = 0.0; - vg[n][3] = 0.0; - vg[n][4] = 0.0; - vg[n][5] = 0.0; - } else { - vterm = -2.0 * (1.0/sqk + 0.25/(g_ewald*g_ewald)); - vg[n][0] = 1.0 + vterm*fkx[n]*fkx[n]; - vg[n][1] = 1.0 + vterm*fky[n]*fky[n]; - vg[n][2] = 1.0 + vterm*fkz[n]*fkz[n]; - vg[n][3] = vterm*fkx[n]*fky[n]; - vg[n][4] = vterm*fkx[n]*fkz[n]; - vg[n][5] = vterm*fky[n]*fkz[n]; - } - } - - compute_gf_ik_triclinic(); -} - -/* ---------------------------------------------------------------------- - reset local grid arrays and communication stencils - called by fix balance b/c it changed sizes of processor sub-domains -------------------------------------------------------------------------- */ - -void PPPM2::setup_grid() -{ - // free all arrays previously allocated - - deallocate(); - if (peratom_allocate_flag) deallocate_peratom(); - if (group_allocate_flag) deallocate_groups(); - - // reset portion of global grid that each proc owns - - set_grid_local(); - - // reallocate K-space dependent memory - // check if grid communication is now overlapping if not allowed - // don't invoke allocate peratom() or group(), will be allocated when needed - - allocate(); - - if (!overlap_allowed && !gc->ghost_adjacent()) - error->all(FLERR,"PPPM grid stencil extends " - "beyond nearest neighbor processor"); - - // pre-compute Green's function denomiator expansion - // pre-compute 1d charge distribution coefficients - - compute_gf_denom(); - if (differentiation_flag == 1) compute_sf_precoeff(); - compute_rho_coeff(); - - // pre-compute volume-dependent coeffs for portion of grid I now own - - setup(); -} - -/* ---------------------------------------------------------------------- - compute the PPPM long-range force, energy, virial -------------------------------------------------------------------------- */ - -void PPPM2::compute(int eflag, int vflag) -{ - int i,j; - - // set energy/virial flags - // invoke allocate_peratom() if needed for first time - - ev_init(eflag,vflag); - - if (evflag_atom && !peratom_allocate_flag) allocate_peratom(); - - // if atom count has changed, update qsum and qsqsum - - if (atom->natoms != natoms_original) { - qsum_qsq(); - natoms_original = atom->natoms; - } - - // return if there are no charges - - if (qsqsum == 0.0) return; - - // convert atoms from box to lamda coords - - if (triclinic == 0) boxlo = domain->boxlo; - else { - boxlo = domain->boxlo_lamda; - 
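/* ----------------------------------------------------------------------
   illustrative sketch (not part of this patch): the index folding used
   for fkx/fky/fkz in setup() and setup_triclinic() above,
   per = i - N*(2*i/N) with integer division, maps a grid index i in
   [0,N) onto the signed frequency index in [-N/2,N/2) that the FFT
   convention implies.  the function and driver names are hypothetical.
------------------------------------------------------------------------- */

#include <cstdio>

// signed frequency index for grid point i on an N-point periodic grid
static int fold_index(int i, int N)
{
  return i - N*(2*i/N);   // integer division: i < N/2 -> i, else i - N
}

int main()
{
  const int N = 8;
  for (int i = 0; i < N; i++)
    printf("i = %d  ->  per = %d\n", i, fold_index(i,N));
  // prints 0 1 2 3 -4 -3 -2 -1, i.e. wave numbers k = 2*pi*per/L
  return 0;
}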
domain->x2lamda(atom->nlocal); - } - - // extend size of per-atom arrays if necessary - - if (atom->nmax > nmax) { - memory->destroy(part2grid); - nmax = atom->nmax; - memory->create(part2grid,nmax,3,"pppm:part2grid"); - } - - // find grid points for all my particles - // map my particle charge onto my local 3d density grid - - particle_map(); - make_rho(); - - // all procs communicate density values from their ghost cells - // to fully sum contribution in their 3d bricks - // remap from 3d decomposition to FFT decomposition - - gc->reverse_comm_kspace(this,1,sizeof(FFT_SCALAR),REVERSE_RHO, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); - brick2fft(); - - // compute potential gradient on my FFT grid and - // portion of e_long on this proc's FFT grid - // return gradients (electric fields) in 3d brick decomposition - // also performs per-atom calculations via poisson_peratom() - - poisson(); - - // all procs communicate E-field values - // to fill ghost cells surrounding their 3d bricks - - if (differentiation_flag == 1) - gc->forward_comm_kspace(this,1,sizeof(FFT_SCALAR),FORWARD_AD, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); - else - gc->forward_comm_kspace(this,3,sizeof(FFT_SCALAR),FORWARD_IK, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); - - // extra per-atom energy/virial communication - - if (evflag_atom) { - if (differentiation_flag == 1 && vflag_atom) - gc->forward_comm_kspace(this,6,sizeof(FFT_SCALAR),FORWARD_AD_PERATOM, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); - else if (differentiation_flag == 0) - gc->forward_comm_kspace(this,7,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); - } - - // calculate the force on my particles - - fieldforce(); - - // extra per-atom energy/virial communication - - if (evflag_atom) fieldforce_peratom(); - - // sum global energy across procs and add in volume-dependent term - - const double qscale = qqrd2e * scale; - - if (eflag_global) { - double energy_all; - MPI_Allreduce(&energy,&energy_all,1,MPI_DOUBLE,MPI_SUM,world); - energy = energy_all; - - energy *= 0.5*volume; - energy -= g_ewald*qsqsum/MY_PIS + - MY_PI2*qsum*qsum / (g_ewald*g_ewald*volume); - energy *= qscale; - } - - // sum global virial across procs - - if (vflag_global) { - double virial_all[6]; - MPI_Allreduce(virial,virial_all,6,MPI_DOUBLE,MPI_SUM,world); - for (i = 0; i < 6; i++) virial[i] = 0.5*qscale*volume*virial_all[i]; - } - - // per-atom energy/virial - // energy includes self-energy correction - // ntotal accounts for TIP4P tallying eatom/vatom for ghost atoms - - if (evflag_atom) { - double *q = atom->q; - int nlocal = atom->nlocal; - int ntotal = nlocal; - if (tip4pflag) ntotal += atom->nghost; - - if (eflag_atom) { - for (i = 0; i < nlocal; i++) { - eatom[i] *= 0.5; - eatom[i] -= g_ewald*q[i]*q[i]/MY_PIS + MY_PI2*q[i]*qsum / - (g_ewald*g_ewald*volume); - eatom[i] *= qscale; - } - for (i = nlocal; i < ntotal; i++) eatom[i] *= 0.5*qscale; - } - - if (vflag_atom) { - for (i = 0; i < ntotal; i++) - for (j = 0; j < 6; j++) vatom[i][j] *= 0.5*qscale; - } - } - - // 2d slab correction - - if (slabflag == 1) slabcorr(); - - // convert atoms back from lamda to box coords - - if (triclinic) domain->lamda2x(atom->nlocal); -} - -/* ---------------------------------------------------------------------- - allocate memory that depends on # of K-vectors and order -------------------------------------------------------------------------- */ - -void PPPM2::allocate() -{ - memory->create3d_offset(density_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, - nxlo_out,nxhi_out,"pppm:density_brick"); - - 
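/* ----------------------------------------------------------------------
   illustrative sketch (not part of this patch): the volume scaling,
   self-energy, and neutralizing-background corrections that compute()
   above applies to the reduced k-space energy.  the helper name and its
   free-standing form are hypothetical; the formula mirrors the code:
     E = qscale * ( 0.5*V*sum - g_ewald*sum(q^2)/sqrt(pi)
                    - (pi/2)*(sum q)^2/(g_ewald^2*V) )
------------------------------------------------------------------------- */

#include <cmath>
#include <cstdio>

static double finish_kspace_energy(double esum, double volume, double g_ewald,
                                   double qsqsum, double qsum, double qscale)
{
  double e = esum * 0.5 * volume;                    // volume-dependent prefactor
  e -= g_ewald*qsqsum/sqrt(M_PI);                    // self-energy of point charges
  e -= 0.5*M_PI*qsum*qsum/(g_ewald*g_ewald*volume);  // background term for non-neutral cell
  return qscale*e;
}

int main()
{
  // one +1/-1 charge pair in a 10^3 box, g_ewald = 0.3, raw k-space sum 0.0
  printf("corrected k-space energy = %g\n",
         finish_kspace_energy(0.0,1000.0,0.3,2.0,0.0,1.0));
  return 0;
}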
memory->create(density_fft,nfft_both,"pppm:density_fft"); - memory->create(greensfn,nfft_both,"pppm:greensfn"); - memory->create(work1,2*nfft_both,"pppm:work1"); - memory->create(work2,2*nfft_both,"pppm:work2"); - memory->create(vg,nfft_both,6,"pppm:vg"); - - if (triclinic == 0) { - memory->create1d_offset(fkx,nxlo_fft,nxhi_fft,"pppm:fkx"); - memory->create1d_offset(fky,nylo_fft,nyhi_fft,"pppm:fky"); - memory->create1d_offset(fkz,nzlo_fft,nzhi_fft,"pppm:fkz"); - } else { - memory->create(fkx,nfft_both,"pppm:fkx"); - memory->create(fky,nfft_both,"pppm:fky"); - memory->create(fkz,nfft_both,"pppm:fkz"); - } - - if (differentiation_flag == 1) { - memory->create3d_offset(u_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, - nxlo_out,nxhi_out,"pppm:u_brick"); - - memory->create(sf_precoeff1,nfft_both,"pppm:sf_precoeff1"); - memory->create(sf_precoeff2,nfft_both,"pppm:sf_precoeff2"); - memory->create(sf_precoeff3,nfft_both,"pppm:sf_precoeff3"); - memory->create(sf_precoeff4,nfft_both,"pppm:sf_precoeff4"); - memory->create(sf_precoeff5,nfft_both,"pppm:sf_precoeff5"); - memory->create(sf_precoeff6,nfft_both,"pppm:sf_precoeff6"); - - } else { - memory->create3d_offset(vdx_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, - nxlo_out,nxhi_out,"pppm:vdx_brick"); - memory->create3d_offset(vdy_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, - nxlo_out,nxhi_out,"pppm:vdy_brick"); - memory->create3d_offset(vdz_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, - nxlo_out,nxhi_out,"pppm:vdz_brick"); - } - - // summation coeffs - - order_allocated = order; - if (!stagger_flag) memory->create(gf_b,order,"pppm:gf_b"); - memory->create2d_offset(rho1d,3,-order/2,order/2,"pppm:rho1d"); - memory->create2d_offset(drho1d,3,-order/2,order/2,"pppm:drho1d"); - memory->create2d_offset(rho_coeff,order,(1-order)/2,order/2,"pppm:rho_coeff"); - memory->create2d_offset(drho_coeff,order,(1-order)/2,order/2, - "pppm:drho_coeff"); - - // create 2 FFTs and a Remap - // 1st FFT keeps data in FFT decomposition - // 2nd FFT returns data in 3d brick decomposition - // remap takes data from 3d brick to FFT decomposition - - int tmp; - - fft1 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm, - nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, - nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, - 0,0,&tmp,collective_flag); - - fft2 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm, - nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, - nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, - 0,0,&tmp,collective_flag); - - remap = new Remap(lmp,world, - nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, - nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, - 1,0,0,FFT_PRECISION,collective_flag); - - // create ghost grid object for rho and electric field communication - // also create 2 bufs for ghost grid cell comm, passed to GridComm methods - - gc = new GridComm2(lmp,world,nx_pppm,ny_pppm,nz_pppm, - nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, - nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out); - - gc->setup(ngc_buf1,ngc_buf2); - - if (differentiation_flag) npergrid = 1; - else npergrid = 3; - - memory->create(gc_buf1,npergrid*ngc_buf1,"pppm:gc_buf1"); - memory->create(gc_buf2,npergrid*ngc_buf2,"pppm:gc_buf2"); -} - -/* ---------------------------------------------------------------------- - deallocate memory that depends on # of K-vectors and order -------------------------------------------------------------------------- */ - -void PPPM2::deallocate() -{ - memory->destroy3d_offset(density_brick,nzlo_out,nylo_out,nxlo_out); - - if 
(differentiation_flag == 1) { - memory->destroy3d_offset(u_brick,nzlo_out,nylo_out,nxlo_out); - memory->destroy(sf_precoeff1); - memory->destroy(sf_precoeff2); - memory->destroy(sf_precoeff3); - memory->destroy(sf_precoeff4); - memory->destroy(sf_precoeff5); - memory->destroy(sf_precoeff6); - } else { - memory->destroy3d_offset(vdx_brick,nzlo_out,nylo_out,nxlo_out); - memory->destroy3d_offset(vdy_brick,nzlo_out,nylo_out,nxlo_out); - memory->destroy3d_offset(vdz_brick,nzlo_out,nylo_out,nxlo_out); - } - - memory->destroy(density_fft); - memory->destroy(greensfn); - memory->destroy(work1); - memory->destroy(work2); - memory->destroy(vg); - - if (triclinic == 0) { - memory->destroy1d_offset(fkx,nxlo_fft); - memory->destroy1d_offset(fky,nylo_fft); - memory->destroy1d_offset(fkz,nzlo_fft); - } else { - memory->destroy(fkx); - memory->destroy(fky); - memory->destroy(fkz); - } - - memory->destroy(gf_b); - if (stagger_flag) gf_b = NULL; - memory->destroy2d_offset(rho1d,-order_allocated/2); - memory->destroy2d_offset(drho1d,-order_allocated/2); - memory->destroy2d_offset(rho_coeff,(1-order_allocated)/2); - memory->destroy2d_offset(drho_coeff,(1-order_allocated)/2); - - delete fft1; - delete fft2; - delete remap; - delete gc; - memory->destroy(gc_buf1); - memory->destroy(gc_buf2); -} - -/* ---------------------------------------------------------------------- - allocate per-atom memory that depends on # of K-vectors and order -------------------------------------------------------------------------- */ - -void PPPM2::allocate_peratom() -{ - peratom_allocate_flag = 1; - - if (differentiation_flag != 1) - memory->create3d_offset(u_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, - nxlo_out,nxhi_out,"pppm:u_brick"); - - memory->create3d_offset(v0_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, - nxlo_out,nxhi_out,"pppm:v0_brick"); - - memory->create3d_offset(v1_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, - nxlo_out,nxhi_out,"pppm:v1_brick"); - memory->create3d_offset(v2_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, - nxlo_out,nxhi_out,"pppm:v2_brick"); - memory->create3d_offset(v3_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, - nxlo_out,nxhi_out,"pppm:v3_brick"); - memory->create3d_offset(v4_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, - nxlo_out,nxhi_out,"pppm:v4_brick"); - memory->create3d_offset(v5_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, - nxlo_out,nxhi_out,"pppm:v5_brick"); - - // use same GC ghost grid object for peratom grid communication - // but need to reallocate a larger gc_buf1 and gc_buf2 - - if (differentiation_flag) npergrid = 6; - else npergrid = 7; - - memory->destroy(gc_buf1); - memory->destroy(gc_buf2); - memory->create(gc_buf1,npergrid*ngc_buf1,"pppm:gc_buf1"); - memory->create(gc_buf2,npergrid*ngc_buf2,"pppm:gc_buf2"); -} - -/* ---------------------------------------------------------------------- - deallocate per-atom memory that depends on # of K-vectors and order -------------------------------------------------------------------------- */ - -void PPPM2::deallocate_peratom() -{ - peratom_allocate_flag = 0; - - memory->destroy3d_offset(v0_brick,nzlo_out,nylo_out,nxlo_out); - memory->destroy3d_offset(v1_brick,nzlo_out,nylo_out,nxlo_out); - memory->destroy3d_offset(v2_brick,nzlo_out,nylo_out,nxlo_out); - memory->destroy3d_offset(v3_brick,nzlo_out,nylo_out,nxlo_out); - memory->destroy3d_offset(v4_brick,nzlo_out,nylo_out,nxlo_out); - memory->destroy3d_offset(v5_brick,nzlo_out,nylo_out,nxlo_out); - - if (differentiation_flag != 1) - memory->destroy3d_offset(u_brick,nzlo_out,nylo_out,nxlo_out); -} - -/* 
---------------------------------------------------------------------- - set global size of PPPM grid = nx,ny,nz_pppm - used for charge accumulation, FFTs, and electric field interpolation -------------------------------------------------------------------------- */ - -void PPPM2::set_grid_global() -{ - // use xprd,yprd,zprd (even if triclinic, and then scale later) - // adjust z dimension for 2d slab PPPM - // 3d PPPM just uses zprd since slab_volfactor = 1.0 - - double xprd = domain->xprd; - double yprd = domain->yprd; - double zprd = domain->zprd; - double zprd_slab = zprd*slab_volfactor; - - // make initial g_ewald estimate - // based on desired accuracy and real space cutoff - // fluid-occupied volume used to estimate real-space error - // zprd used rather than zprd_slab - - double h; - bigint natoms = atom->natoms; - - if (!gewaldflag) { - if (accuracy <= 0.0) - error->all(FLERR,"KSpace accuracy must be > 0"); - if (q2 == 0.0) - error->all(FLERR,"Must use kspace_modify gewald for uncharged system"); - g_ewald = accuracy*sqrt(natoms*cutoff*xprd*yprd*zprd) / (2.0*q2); - if (g_ewald >= 1.0) g_ewald = (1.35 - 0.15*log(accuracy))/cutoff; - else g_ewald = sqrt(-log(g_ewald)) / cutoff; - } - - // set optimal nx_pppm,ny_pppm,nz_pppm based on order and accuracy - // nz_pppm uses extended zprd_slab instead of zprd - // reduce it until accuracy target is met - - if (!gridflag) { - - if (differentiation_flag == 1 || stagger_flag) { - - h = h_x = h_y = h_z = 4.0/g_ewald; - int count = 0; - while (1) { - - // set grid dimensions - - nx_pppm = static_cast (xprd/h_x); - ny_pppm = static_cast (yprd/h_y); - nz_pppm = static_cast (zprd_slab/h_z); - - if (nx_pppm <= 1) nx_pppm = 2; - if (ny_pppm <= 1) ny_pppm = 2; - if (nz_pppm <= 1) nz_pppm = 2; - - // estimate Kspace force error - - double df_kspace = compute_df_kspace(); - - // break loop if the accuracy has been reached or - // too many loops have been performed - - count++; - if (df_kspace <= accuracy) break; - - if (count > 500) error->all(FLERR, "Could not compute grid size"); - h *= 0.95; - h_x = h_y = h_z = h; - } - - } else { - - double err; - h_x = h_y = h_z = 1.0/g_ewald; - - nx_pppm = static_cast (xprd/h_x) + 1; - ny_pppm = static_cast (yprd/h_y) + 1; - nz_pppm = static_cast (zprd_slab/h_z) + 1; - - err = estimate_ik_error(h_x,xprd,natoms); - while (err > accuracy) { - err = estimate_ik_error(h_x,xprd,natoms); - nx_pppm++; - h_x = xprd/nx_pppm; - } - - err = estimate_ik_error(h_y,yprd,natoms); - while (err > accuracy) { - err = estimate_ik_error(h_y,yprd,natoms); - ny_pppm++; - h_y = yprd/ny_pppm; - } - - err = estimate_ik_error(h_z,zprd_slab,natoms); - while (err > accuracy) { - err = estimate_ik_error(h_z,zprd_slab,natoms); - nz_pppm++; - h_z = zprd_slab/nz_pppm; - } - } - - // scale grid for triclinic skew - - if (triclinic) { - double tmp[3]; - tmp[0] = nx_pppm/xprd; - tmp[1] = ny_pppm/yprd; - tmp[2] = nz_pppm/zprd; - lamda2xT(&tmp[0],&tmp[0]); - nx_pppm = static_cast(tmp[0]) + 1; - ny_pppm = static_cast(tmp[1]) + 1; - nz_pppm = static_cast(tmp[2]) + 1; - } - } - - // boost grid size until it is factorable - - while (!factorable(nx_pppm)) nx_pppm++; - while (!factorable(ny_pppm)) ny_pppm++; - while (!factorable(nz_pppm)) nz_pppm++; - - if (triclinic == 0) { - h_x = xprd/nx_pppm; - h_y = yprd/ny_pppm; - h_z = zprd_slab/nz_pppm; - } else { - double tmp[3]; - tmp[0] = nx_pppm; - tmp[1] = ny_pppm; - tmp[2] = nz_pppm; - x2lamdaT(&tmp[0],&tmp[0]); - h_x = 1.0/tmp[0]; - h_y = 1.0/tmp[1]; - h_z = 1.0/tmp[2]; - } - - if (nx_pppm >= OFFSET || 
ny_pppm >= OFFSET || nz_pppm >= OFFSET) - error->all(FLERR,"PPPM grid is too large"); -} - -/* ---------------------------------------------------------------------- - check if all factors of n are in list of factors - return 1 if yes, 0 if no -------------------------------------------------------------------------- */ - -int PPPM2::factorable(int n) -{ - int i; - - while (n > 1) { - for (i = 0; i < nfactors; i++) { - if (n % factors[i] == 0) { - n /= factors[i]; - break; - } - } - if (i == nfactors) return 0; - } - - return 1; -} - -/* ---------------------------------------------------------------------- - compute estimated kspace force error -------------------------------------------------------------------------- */ - -double PPPM2::compute_df_kspace() -{ - double xprd = domain->xprd; - double yprd = domain->yprd; - double zprd = domain->zprd; - double zprd_slab = zprd*slab_volfactor; - bigint natoms = atom->natoms; - double df_kspace = 0.0; - if (differentiation_flag == 1 || stagger_flag) { - double qopt = compute_qopt(); - df_kspace = sqrt(qopt/natoms)*q2/(xprd*yprd*zprd_slab); - } else { - double lprx = estimate_ik_error(h_x,xprd,natoms); - double lpry = estimate_ik_error(h_y,yprd,natoms); - double lprz = estimate_ik_error(h_z,zprd_slab,natoms); - df_kspace = sqrt(lprx*lprx + lpry*lpry + lprz*lprz) / sqrt(3.0); - } - return df_kspace; -} - -/* ---------------------------------------------------------------------- - compute qopt -------------------------------------------------------------------------- */ - -double PPPM2::compute_qopt() -{ - int k,l,m,nx,ny,nz; - double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz; - double u1,u2,sqk; - double sum1,sum2,sum3,sum4,dot2; - - double *prd = domain->prd; - - const double xprd = prd[0]; - const double yprd = prd[1]; - const double zprd = prd[2]; - const double zprd_slab = zprd*slab_volfactor; - volume = xprd * yprd * zprd_slab; - - const double unitkx = (MY_2PI/xprd); - const double unitky = (MY_2PI/yprd); - const double unitkz = (MY_2PI/zprd_slab); - - const int twoorder = 2*order; - - // loop over entire FFT grid - // each proc calculates contributions from every Pth grid point - - bigint ngridtotal = (bigint) nx_pppm * ny_pppm * nz_pppm; - int nxy_pppm = nx_pppm * ny_pppm; - - double qopt = 0.0; - - for (bigint i = me; i < ngridtotal; i += nprocs) { - k = i % nx_pppm; - l = (i/nx_pppm) % ny_pppm; - m = i / nxy_pppm; - - const int kper = k - nx_pppm*(2*k/nx_pppm); - const int lper = l - ny_pppm*(2*l/ny_pppm); - const int mper = m - nz_pppm*(2*m/nz_pppm); - - sqk = square(unitkx*kper) + square(unitky*lper) + square(unitkz*mper); - if (sqk == 0.0) continue; - - sum1 = sum2 = sum3 = sum4 = 0.0; - - for (nx = -2; nx <= 2; nx++) { - qx = unitkx*(kper+nx_pppm*nx); - sx = exp(-0.25*square(qx/g_ewald)); - argx = 0.5*qx*xprd/nx_pppm; - wx = powsinxx(argx,twoorder); - qx *= qx; - - for (ny = -2; ny <= 2; ny++) { - qy = unitky*(lper+ny_pppm*ny); - sy = exp(-0.25*square(qy/g_ewald)); - argy = 0.5*qy*yprd/ny_pppm; - wy = powsinxx(argy,twoorder); - qy *= qy; - - for (nz = -2; nz <= 2; nz++) { - qz = unitkz*(mper+nz_pppm*nz); - sz = exp(-0.25*square(qz/g_ewald)); - argz = 0.5*qz*zprd_slab/nz_pppm; - wz = powsinxx(argz,twoorder); - qz *= qz; - - dot2 = qx+qy+qz; - u1 = sx*sy*sz; - u2 = wx*wy*wz; - - sum1 += u1*u1/dot2*MY_4PI*MY_4PI; - sum2 += u1 * u2 * MY_4PI; - sum3 += u2; - sum4 += dot2*u2; - } - } - } - - sum2 *= sum2; - qopt += sum1 - sum2/(sum3*sum4); - } - - // sum qopt over all procs - - double qopt_all; - 
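/* ----------------------------------------------------------------------
   illustrative sketch (not part of this patch): the work-division
   pattern compute_qopt() uses -- every proc visits every Pth point of
   the global grid and the partial sums are combined with MPI_Allreduce,
   so no grid data ever needs to be distributed.  the variable names and
   the per-point term below are hypothetical stand-ins.
------------------------------------------------------------------------- */

#include <mpi.h>
#include <cstdio>
#include <cstdint>

int main(int argc, char **argv)
{
  MPI_Init(&argc,&argv);
  int me,nprocs;
  MPI_Comm_rank(MPI_COMM_WORLD,&me);
  MPI_Comm_size(MPI_COMM_WORLD,&nprocs);

  const int64_t ntotal = 1000000;       // stand-in for nx*ny*nz grid points
  double mysum = 0.0;

  // each rank handles grid points me, me+nprocs, me+2*nprocs, ...
  for (int64_t i = me; i < ntotal; i += nprocs)
    mysum += 1.0/(1.0 + (double) i);    // stand-in for the per-point qopt term

  double allsum;
  MPI_Allreduce(&mysum,&allsum,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD);

  if (me == 0) printf("sum over %lld points = %g\n",(long long) ntotal,allsum);
  MPI_Finalize();
  return 0;
}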
MPI_Allreduce(&qopt,&qopt_all,1,MPI_DOUBLE,MPI_SUM,world); - return qopt_all; -} - -/* ---------------------------------------------------------------------- - estimate kspace force error for ik method -------------------------------------------------------------------------- */ - -double PPPM2::estimate_ik_error(double h, double prd, bigint natoms) -{ - double sum = 0.0; - if (natoms == 0) return 0.0; - for (int m = 0; m < order; m++) - sum += acons[order][m] * pow(h*g_ewald,2.0*m); - double value = q2 * pow(h*g_ewald,(double)order) * - sqrt(g_ewald*prd*sqrt(MY_2PI)*sum/natoms) / (prd*prd); - - return value; -} - -/* ---------------------------------------------------------------------- - adjust the g_ewald parameter to near its optimal value - using a Newton-Raphson solver -------------------------------------------------------------------------- */ - -void PPPM2::adjust_gewald() -{ - double dx; - - for (int i = 0; i < LARGE; i++) { - dx = newton_raphson_f() / derivf(); - g_ewald -= dx; - if (fabs(newton_raphson_f()) < SMALL) return; - } - error->all(FLERR, "Could not compute g_ewald"); -} - -/* ---------------------------------------------------------------------- - calculate f(x) using Newton-Raphson solver -------------------------------------------------------------------------- */ - -double PPPM2::newton_raphson_f() -{ - double xprd = domain->xprd; - double yprd = domain->yprd; - double zprd = domain->zprd; - bigint natoms = atom->natoms; - - double df_rspace = 2.0*q2*exp(-g_ewald*g_ewald*cutoff*cutoff) / - sqrt(natoms*cutoff*xprd*yprd*zprd); - - double df_kspace = compute_df_kspace(); - - return df_rspace - df_kspace; -} - -/* ---------------------------------------------------------------------- - calculate numerical derivative f'(x) using forward difference - [f(x + h) - f(x)] / h -------------------------------------------------------------------------- */ - -double PPPM2::derivf() -{ - double h = 0.000001; //Derivative step-size - double df,f1,f2,g_ewald_old; - - f1 = newton_raphson_f(); - g_ewald_old = g_ewald; - g_ewald += h; - f2 = newton_raphson_f(); - g_ewald = g_ewald_old; - df = (f2 - f1)/h; - - return df; -} - -/* ---------------------------------------------------------------------- - calculate the final estimate of the accuracy -------------------------------------------------------------------------- */ - -double PPPM2::final_accuracy() -{ - double xprd = domain->xprd; - double yprd = domain->yprd; - double zprd = domain->zprd; - bigint natoms = atom->natoms; - if (natoms == 0) natoms = 1; // avoid division by zero - - double df_kspace = compute_df_kspace(); - double q2_over_sqrt = q2 / sqrt(natoms*cutoff*xprd*yprd*zprd); - double df_rspace = 2.0 * q2_over_sqrt * exp(-g_ewald*g_ewald*cutoff*cutoff); - double df_table = estimate_table_accuracy(q2_over_sqrt,df_rspace); - double estimated_accuracy = sqrt(df_kspace*df_kspace + df_rspace*df_rspace + - df_table*df_table); - - return estimated_accuracy; -} - -/* ---------------------------------------------------------------------- - set local subset of PPPM/FFT grid that I own - n xyz lo/hi in = 3d brick that I own (inclusive) - n xyz lo/hi out = 3d brick + ghost cells in 6 directions (inclusive) - n xyz lo/hi fft = FFT columns that I own (all of x dim, 2d decomp in yz) -------------------------------------------------------------------------- */ - -void PPPM2::set_grid_local() -{ - // global indices of PPPM grid range from 0 to N-1 - // nlo_in,nhi_in = lower/upper limits of the 3d sub-brick of - // global PPPM grid that I 
own without ghost cells - // for slab PPPM, assign z grid as if it were not extended - // both non-tiled and tiled proc layouts use 0-1 fractional sumdomain info - - if (comm->layout != Comm::LAYOUT_TILED) { - nxlo_in = static_cast (comm->xsplit[comm->myloc[0]] * nx_pppm); - nxhi_in = static_cast (comm->xsplit[comm->myloc[0]+1] * nx_pppm) - 1; - - nylo_in = static_cast (comm->ysplit[comm->myloc[1]] * ny_pppm); - nyhi_in = static_cast (comm->ysplit[comm->myloc[1]+1] * ny_pppm) - 1; - - nzlo_in = static_cast - (comm->zsplit[comm->myloc[2]] * nz_pppm/slab_volfactor); - nzhi_in = static_cast - (comm->zsplit[comm->myloc[2]+1] * nz_pppm/slab_volfactor) - 1; - - } else { - nxlo_in = static_cast (comm->mysplit[0][0] * nx_pppm); - nxhi_in = static_cast (comm->mysplit[0][1] * nx_pppm) - 1; - - nylo_in = static_cast (comm->mysplit[1][0] * ny_pppm); - nyhi_in = static_cast (comm->mysplit[1][1] * ny_pppm) - 1; - - nzlo_in = static_cast (comm->mysplit[2][0] * nz_pppm/slab_volfactor); - nzhi_in = static_cast (comm->mysplit[2][1] * nz_pppm/slab_volfactor) - 1; - } - - // nlower,nupper = stencil size for mapping particles to PPPM grid - - nlower = -(order-1)/2; - nupper = order/2; - - // shift values for particle <-> grid mapping - // add/subtract OFFSET to avoid int(-0.75) = 0 when want it to be -1 - - if (order % 2) shift = OFFSET + 0.5; - else shift = OFFSET; - if (order % 2) shiftone = 0.0; - else shiftone = 0.5; - - // nlo_out,nhi_out = lower/upper limits of the 3d sub-brick of - // global PPPM grid that my particles can contribute charge to - // effectively nlo_in,nhi_in + ghost cells - // nlo,nhi = global coords of grid pt to "lower left" of smallest/largest - // position a particle in my box can be at - // dist[3] = particle position bound = subbox + skin/2.0 + qdist - // qdist = offset due to TIP4P fictitious charge - // convert to triclinic if necessary - // nlo_out,nhi_out = nlo,nhi + stencil size for particle mapping - // for slab PPPM, assign z grid as if it were not extended - - double *prd,*sublo,*subhi; - - if (triclinic == 0) { - prd = domain->prd; - boxlo = domain->boxlo; - sublo = domain->sublo; - subhi = domain->subhi; - } else { - prd = domain->prd_lamda; - boxlo = domain->boxlo_lamda; - sublo = domain->sublo_lamda; - subhi = domain->subhi_lamda; - } - - double xprd = prd[0]; - double yprd = prd[1]; - double zprd = prd[2]; - double zprd_slab = zprd*slab_volfactor; - - double dist[3] = {0.0,0.0,0.0}; - double cuthalf = 0.5*neighbor->skin + qdist; - if (triclinic == 0) dist[0] = dist[1] = dist[2] = cuthalf; - else kspacebbox(cuthalf,&dist[0]); - - int nlo,nhi; - nlo = nhi = 0; - - nlo = static_cast ((sublo[0]-dist[0]-boxlo[0]) * - nx_pppm/xprd + shift) - OFFSET; - nhi = static_cast ((subhi[0]+dist[0]-boxlo[0]) * - nx_pppm/xprd + shift) - OFFSET; - nxlo_out = nlo + nlower; - nxhi_out = nhi + nupper; - - nlo = static_cast ((sublo[1]-dist[1]-boxlo[1]) * - ny_pppm/yprd + shift) - OFFSET; - nhi = static_cast ((subhi[1]+dist[1]-boxlo[1]) * - ny_pppm/yprd + shift) - OFFSET; - nylo_out = nlo + nlower; - nyhi_out = nhi + nupper; - - nlo = static_cast ((sublo[2]-dist[2]-boxlo[2]) * - nz_pppm/zprd_slab + shift) - OFFSET; - nhi = static_cast ((subhi[2]+dist[2]-boxlo[2]) * - nz_pppm/zprd_slab + shift) - OFFSET; - nzlo_out = nlo + nlower; - nzhi_out = nhi + nupper; - - if (stagger_flag) { - nxhi_out++; - nyhi_out++; - nzhi_out++; - } - - // for slab PPPM, change the grid boundary for processors at +z end - // to include the empty volume between periodically repeating slabs - // for slab PPPM, want 
charge data communicated from -z proc to +z proc, - // but not vice versa, also want field data communicated from +z proc to - // -z proc, but not vice versa - // this is accomplished by nzhi_in = nzhi_out on +z end (no ghost cells) - // also insure no other procs use ghost cells beyond +z limit - // differnet logic for non-tiled vs tiled decomposition - - if (slabflag == 1) { - if (comm->layout != Comm::LAYOUT_TILED) { - if (comm->myloc[2] == comm->procgrid[2]-1) nzhi_in = nzhi_out = nz_pppm - 1; - } else { - if (comm->mysplit[2][1] == 1.0) nzhi_in = nzhi_out = nz_pppm - 1; - } - nzhi_out = MIN(nzhi_out,nz_pppm-1); - } - - // x-pencil decomposition of FFT mesh - // global indices range from 0 to N-1 - // each proc owns entire x-dimension, clumps of columns in y,z dimensions - // npey_fft,npez_fft = # of procs in y,z dims - // if nprocs is small enough, proc can own 1 or more entire xy planes, - // else proc owns 2d sub-blocks of yz plane - // me_y,me_z = which proc (0-npe_fft-1) I am in y,z dimensions - // nlo_fft,nhi_fft = lower/upper limit of the section - // of the global FFT mesh that I own in x-pencil decomposition - - int npey_fft,npez_fft; - if (nz_pppm >= nprocs) { - npey_fft = 1; - npez_fft = nprocs; - } else procs2grid2d(nprocs,ny_pppm,nz_pppm,&npey_fft,&npez_fft); - - int me_y = me % npey_fft; - int me_z = me / npey_fft; - - nxlo_fft = 0; - nxhi_fft = nx_pppm - 1; - nylo_fft = me_y*ny_pppm/npey_fft; - nyhi_fft = (me_y+1)*ny_pppm/npey_fft - 1; - nzlo_fft = me_z*nz_pppm/npez_fft; - nzhi_fft = (me_z+1)*nz_pppm/npez_fft - 1; - - // ngrid = count of PPPM grid pts owned by this proc, including ghosts - - ngrid = (nxhi_out-nxlo_out+1) * (nyhi_out-nylo_out+1) * - (nzhi_out-nzlo_out+1); - - // count of FFT grids pts owned by this proc, without ghosts - // nfft = FFT points in x-pencil FFT decomposition on this proc - // nfft_brick = FFT points in 3d brick-decomposition on this proc - // nfft_both = greater of 2 values - - nfft = (nxhi_fft-nxlo_fft+1) * (nyhi_fft-nylo_fft+1) * - (nzhi_fft-nzlo_fft+1); - int nfft_brick = (nxhi_in-nxlo_in+1) * (nyhi_in-nylo_in+1) * - (nzhi_in-nzlo_in+1); - nfft_both = MAX(nfft,nfft_brick); -} - -/* ---------------------------------------------------------------------- - pre-compute Green's function denominator expansion coeffs, Gamma(2n) -------------------------------------------------------------------------- */ - -void PPPM2::compute_gf_denom() -{ - int k,l,m; - - for (l = 1; l < order; l++) gf_b[l] = 0.0; - gf_b[0] = 1.0; - - for (m = 1; m < order; m++) { - for (l = m; l > 0; l--) - gf_b[l] = 4.0 * (gf_b[l]*(l-m)*(l-m-0.5)-gf_b[l-1]*(l-m-1)*(l-m-1)); - gf_b[0] = 4.0 * (gf_b[0]*(l-m)*(l-m-0.5)); - } - - bigint ifact = 1; - for (k = 1; k < 2*order; k++) ifact *= k; - double gaminv = 1.0/ifact; - for (l = 0; l < order; l++) gf_b[l] *= gaminv; -} - -/* ---------------------------------------------------------------------- - pre-compute modified (Hockney-Eastwood) Coulomb Green's function -------------------------------------------------------------------------- */ - -void PPPM2::compute_gf_ik() -{ - const double * const prd = domain->prd; - - const double xprd = prd[0]; - const double yprd = prd[1]; - const double zprd = prd[2]; - const double zprd_slab = zprd*slab_volfactor; - const double unitkx = (MY_2PI/xprd); - const double unitky = (MY_2PI/yprd); - const double unitkz = (MY_2PI/zprd_slab); - - double snx,sny,snz; - double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz; - double sum1,dot1,dot2; - double numerator,denominator; - double sqk; - - int 
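/* ----------------------------------------------------------------------
   illustrative sketch (not part of this patch): the x-pencil bookkeeping
   in set_grid_local() above.  given a factorization
   nprocs = npey_fft*npez_fft, each rank owns all of x, a contiguous
   block of y columns, and a contiguous block of z planes.  the fixed
   2x3 proc grid and names below are assumptions for demonstration only.
------------------------------------------------------------------------- */

#include <cstdio>

int main()
{
  const int nprocs = 6, npey = 2, npez = 3;   // assumed 2x3 proc grid in y,z
  const int ny = 10, nz = 9;                  // global FFT mesh in y,z

  for (int me = 0; me < nprocs; me++) {
    int me_y = me % npey;                     // position in y dimension
    int me_z = me / npey;                     // position in z dimension
    int nylo = me_y*ny/npey,     nyhi = (me_y+1)*ny/npey - 1;
    int nzlo = me_z*nz/npez,     nzhi = (me_z+1)*nz/npez - 1;
    printf("rank %d: y %d..%d  z %d..%d (all of x)\n",me,nylo,nyhi,nzlo,nzhi);
  }
  return 0;
}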
k,l,m,n,nx,ny,nz,kper,lper,mper; - - const int nbx = static_cast ((g_ewald*xprd/(MY_PI*nx_pppm)) * - pow(-log(EPS_HOC),0.25)); - const int nby = static_cast ((g_ewald*yprd/(MY_PI*ny_pppm)) * - pow(-log(EPS_HOC),0.25)); - const int nbz = static_cast ((g_ewald*zprd_slab/(MY_PI*nz_pppm)) * - pow(-log(EPS_HOC),0.25)); - const int twoorder = 2*order; - - n = 0; - for (m = nzlo_fft; m <= nzhi_fft; m++) { - mper = m - nz_pppm*(2*m/nz_pppm); - snz = square(sin(0.5*unitkz*mper*zprd_slab/nz_pppm)); - - for (l = nylo_fft; l <= nyhi_fft; l++) { - lper = l - ny_pppm*(2*l/ny_pppm); - sny = square(sin(0.5*unitky*lper*yprd/ny_pppm)); - - for (k = nxlo_fft; k <= nxhi_fft; k++) { - kper = k - nx_pppm*(2*k/nx_pppm); - snx = square(sin(0.5*unitkx*kper*xprd/nx_pppm)); - - sqk = square(unitkx*kper) + square(unitky*lper) + square(unitkz*mper); - - if (sqk != 0.0) { - numerator = 12.5663706/sqk; - denominator = gf_denom(snx,sny,snz); - sum1 = 0.0; - - for (nx = -nbx; nx <= nbx; nx++) { - qx = unitkx*(kper+nx_pppm*nx); - sx = exp(-0.25*square(qx/g_ewald)); - argx = 0.5*qx*xprd/nx_pppm; - wx = powsinxx(argx,twoorder); - - for (ny = -nby; ny <= nby; ny++) { - qy = unitky*(lper+ny_pppm*ny); - sy = exp(-0.25*square(qy/g_ewald)); - argy = 0.5*qy*yprd/ny_pppm; - wy = powsinxx(argy,twoorder); - - for (nz = -nbz; nz <= nbz; nz++) { - qz = unitkz*(mper+nz_pppm*nz); - sz = exp(-0.25*square(qz/g_ewald)); - argz = 0.5*qz*zprd_slab/nz_pppm; - wz = powsinxx(argz,twoorder); - - dot1 = unitkx*kper*qx + unitky*lper*qy + unitkz*mper*qz; - dot2 = qx*qx+qy*qy+qz*qz; - sum1 += (dot1/dot2) * sx*sy*sz * wx*wy*wz; - } - } - } - greensfn[n++] = numerator*sum1/denominator; - } else greensfn[n++] = 0.0; - } - } - } -} - -/* ---------------------------------------------------------------------- - pre-compute modified (Hockney-Eastwood) Coulomb Green's function - for a triclinic system -------------------------------------------------------------------------- */ - -void PPPM2::compute_gf_ik_triclinic() -{ - double snx,sny,snz; - double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz; - double sum1,dot1,dot2; - double numerator,denominator; - double sqk; - - int k,l,m,n,nx,ny,nz,kper,lper,mper; - - double tmp[3]; - tmp[0] = (g_ewald/(MY_PI*nx_pppm)) * pow(-log(EPS_HOC),0.25); - tmp[1] = (g_ewald/(MY_PI*ny_pppm)) * pow(-log(EPS_HOC),0.25); - tmp[2] = (g_ewald/(MY_PI*nz_pppm)) * pow(-log(EPS_HOC),0.25); - lamda2xT(&tmp[0],&tmp[0]); - const int nbx = static_cast (tmp[0]); - const int nby = static_cast (tmp[1]); - const int nbz = static_cast (tmp[2]); - - const int twoorder = 2*order; - - n = 0; - for (m = nzlo_fft; m <= nzhi_fft; m++) { - mper = m - nz_pppm*(2*m/nz_pppm); - snz = square(sin(MY_PI*mper/nz_pppm)); - - for (l = nylo_fft; l <= nyhi_fft; l++) { - lper = l - ny_pppm*(2*l/ny_pppm); - sny = square(sin(MY_PI*lper/ny_pppm)); - - for (k = nxlo_fft; k <= nxhi_fft; k++) { - kper = k - nx_pppm*(2*k/nx_pppm); - snx = square(sin(MY_PI*kper/nx_pppm)); - - double unitk_lamda[3]; - unitk_lamda[0] = 2.0*MY_PI*kper; - unitk_lamda[1] = 2.0*MY_PI*lper; - unitk_lamda[2] = 2.0*MY_PI*mper; - x2lamdaT(&unitk_lamda[0],&unitk_lamda[0]); - - sqk = square(unitk_lamda[0]) + square(unitk_lamda[1]) + square(unitk_lamda[2]); - - if (sqk != 0.0) { - numerator = 12.5663706/sqk; - denominator = gf_denom(snx,sny,snz); - sum1 = 0.0; - - for (nx = -nbx; nx <= nbx; nx++) { - argx = MY_PI*kper/nx_pppm + MY_PI*nx; - wx = powsinxx(argx,twoorder); - - for (ny = -nby; ny <= nby; ny++) { - argy = MY_PI*lper/ny_pppm + MY_PI*ny; - wy = powsinxx(argy,twoorder); - - for (nz = -nbz; nz 
<= nbz; nz++) { - argz = MY_PI*mper/nz_pppm + MY_PI*nz; - wz = powsinxx(argz,twoorder); - - double b[3]; - b[0] = 2.0*MY_PI*nx_pppm*nx; - b[1] = 2.0*MY_PI*ny_pppm*ny; - b[2] = 2.0*MY_PI*nz_pppm*nz; - x2lamdaT(&b[0],&b[0]); - - qx = unitk_lamda[0]+b[0]; - sx = exp(-0.25*square(qx/g_ewald)); - - qy = unitk_lamda[1]+b[1]; - sy = exp(-0.25*square(qy/g_ewald)); - - qz = unitk_lamda[2]+b[2]; - sz = exp(-0.25*square(qz/g_ewald)); - - dot1 = unitk_lamda[0]*qx + unitk_lamda[1]*qy + unitk_lamda[2]*qz; - dot2 = qx*qx+qy*qy+qz*qz; - sum1 += (dot1/dot2) * sx*sy*sz * wx*wy*wz; - } - } - } - greensfn[n++] = numerator*sum1/denominator; - } else greensfn[n++] = 0.0; - } - } - } -} - -/* ---------------------------------------------------------------------- - compute optimized Green's function for energy calculation -------------------------------------------------------------------------- */ - -void PPPM2::compute_gf_ad() -{ - const double * const prd = domain->prd; - - const double xprd = prd[0]; - const double yprd = prd[1]; - const double zprd = prd[2]; - const double zprd_slab = zprd*slab_volfactor; - const double unitkx = (MY_2PI/xprd); - const double unitky = (MY_2PI/yprd); - const double unitkz = (MY_2PI/zprd_slab); - - double snx,sny,snz,sqk; - double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz; - double numerator,denominator; - int k,l,m,n,kper,lper,mper; - - const int twoorder = 2*order; - - for (int i = 0; i < 6; i++) sf_coeff[i] = 0.0; - - n = 0; - for (m = nzlo_fft; m <= nzhi_fft; m++) { - mper = m - nz_pppm*(2*m/nz_pppm); - qz = unitkz*mper; - snz = square(sin(0.5*qz*zprd_slab/nz_pppm)); - sz = exp(-0.25*square(qz/g_ewald)); - argz = 0.5*qz*zprd_slab/nz_pppm; - wz = powsinxx(argz,twoorder); - - for (l = nylo_fft; l <= nyhi_fft; l++) { - lper = l - ny_pppm*(2*l/ny_pppm); - qy = unitky*lper; - sny = square(sin(0.5*qy*yprd/ny_pppm)); - sy = exp(-0.25*square(qy/g_ewald)); - argy = 0.5*qy*yprd/ny_pppm; - wy = powsinxx(argy,twoorder); - - for (k = nxlo_fft; k <= nxhi_fft; k++) { - kper = k - nx_pppm*(2*k/nx_pppm); - qx = unitkx*kper; - snx = square(sin(0.5*qx*xprd/nx_pppm)); - sx = exp(-0.25*square(qx/g_ewald)); - argx = 0.5*qx*xprd/nx_pppm; - wx = powsinxx(argx,twoorder); - - sqk = qx*qx + qy*qy + qz*qz; - - if (sqk != 0.0) { - numerator = MY_4PI/sqk; - denominator = gf_denom(snx,sny,snz); - greensfn[n] = numerator*sx*sy*sz*wx*wy*wz/denominator; - sf_coeff[0] += sf_precoeff1[n]*greensfn[n]; - sf_coeff[1] += sf_precoeff2[n]*greensfn[n]; - sf_coeff[2] += sf_precoeff3[n]*greensfn[n]; - sf_coeff[3] += sf_precoeff4[n]*greensfn[n]; - sf_coeff[4] += sf_precoeff5[n]*greensfn[n]; - sf_coeff[5] += sf_precoeff6[n]*greensfn[n]; - n++; - } else { - greensfn[n] = 0.0; - sf_coeff[0] += sf_precoeff1[n]*greensfn[n]; - sf_coeff[1] += sf_precoeff2[n]*greensfn[n]; - sf_coeff[2] += sf_precoeff3[n]*greensfn[n]; - sf_coeff[3] += sf_precoeff4[n]*greensfn[n]; - sf_coeff[4] += sf_precoeff5[n]*greensfn[n]; - sf_coeff[5] += sf_precoeff6[n]*greensfn[n]; - n++; - } - } - } - } - - // compute the coefficients for the self-force correction - - double prex, prey, prez; - prex = prey = prez = MY_PI/volume; - prex *= nx_pppm/xprd; - prey *= ny_pppm/yprd; - prez *= nz_pppm/zprd_slab; - sf_coeff[0] *= prex; - sf_coeff[1] *= prex*2; - sf_coeff[2] *= prey; - sf_coeff[3] *= prey*2; - sf_coeff[4] *= prez; - sf_coeff[5] *= prez*2; - - // communicate values with other procs - - double tmp[6]; - MPI_Allreduce(sf_coeff,tmp,6,MPI_DOUBLE,MPI_SUM,world); - for (n = 0; n < 6; n++) sf_coeff[n] = tmp[n]; -} - -/* 
---------------------------------------------------------------------- - compute self force coefficients for ad-differentiation scheme -------------------------------------------------------------------------- */ - -void PPPM2::compute_sf_precoeff() -{ - int i,k,l,m,n; - int nx,ny,nz,kper,lper,mper; - double wx0[5],wy0[5],wz0[5],wx1[5],wy1[5],wz1[5],wx2[5],wy2[5],wz2[5]; - double qx0,qy0,qz0,qx1,qy1,qz1,qx2,qy2,qz2; - double u0,u1,u2,u3,u4,u5,u6; - double sum1,sum2,sum3,sum4,sum5,sum6; - - n = 0; - for (m = nzlo_fft; m <= nzhi_fft; m++) { - mper = m - nz_pppm*(2*m/nz_pppm); - - for (l = nylo_fft; l <= nyhi_fft; l++) { - lper = l - ny_pppm*(2*l/ny_pppm); - - for (k = nxlo_fft; k <= nxhi_fft; k++) { - kper = k - nx_pppm*(2*k/nx_pppm); - - sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = 0.0; - for (i = 0; i < 5; i++) { - - qx0 = MY_2PI*(kper+nx_pppm*(i-2)); - qx1 = MY_2PI*(kper+nx_pppm*(i-1)); - qx2 = MY_2PI*(kper+nx_pppm*(i )); - wx0[i] = powsinxx(0.5*qx0/nx_pppm,order); - wx1[i] = powsinxx(0.5*qx1/nx_pppm,order); - wx2[i] = powsinxx(0.5*qx2/nx_pppm,order); - - qy0 = MY_2PI*(lper+ny_pppm*(i-2)); - qy1 = MY_2PI*(lper+ny_pppm*(i-1)); - qy2 = MY_2PI*(lper+ny_pppm*(i )); - wy0[i] = powsinxx(0.5*qy0/ny_pppm,order); - wy1[i] = powsinxx(0.5*qy1/ny_pppm,order); - wy2[i] = powsinxx(0.5*qy2/ny_pppm,order); - - qz0 = MY_2PI*(mper+nz_pppm*(i-2)); - qz1 = MY_2PI*(mper+nz_pppm*(i-1)); - qz2 = MY_2PI*(mper+nz_pppm*(i )); - - wz0[i] = powsinxx(0.5*qz0/nz_pppm,order); - wz1[i] = powsinxx(0.5*qz1/nz_pppm,order); - wz2[i] = powsinxx(0.5*qz2/nz_pppm,order); - } - - for (nx = 0; nx < 5; nx++) { - for (ny = 0; ny < 5; ny++) { - for (nz = 0; nz < 5; nz++) { - u0 = wx0[nx]*wy0[ny]*wz0[nz]; - u1 = wx1[nx]*wy0[ny]*wz0[nz]; - u2 = wx2[nx]*wy0[ny]*wz0[nz]; - u3 = wx0[nx]*wy1[ny]*wz0[nz]; - u4 = wx0[nx]*wy2[ny]*wz0[nz]; - u5 = wx0[nx]*wy0[ny]*wz1[nz]; - u6 = wx0[nx]*wy0[ny]*wz2[nz]; - - sum1 += u0*u1; - sum2 += u0*u2; - sum3 += u0*u3; - sum4 += u0*u4; - sum5 += u0*u5; - sum6 += u0*u6; - } - } - } - - // store values - - sf_precoeff1[n] = sum1; - sf_precoeff2[n] = sum2; - sf_precoeff3[n] = sum3; - sf_precoeff4[n] = sum4; - sf_precoeff5[n] = sum5; - sf_precoeff6[n++] = sum6; - } - } - } -} - -/* ---------------------------------------------------------------------- - find center grid pt for each of my particles - check that full stencil for the particle will fit in my 3d brick - store central grid pt indices in part2grid array -------------------------------------------------------------------------- */ - -void PPPM2::particle_map() -{ - int nx,ny,nz; - - double **x = atom->x; - int nlocal = atom->nlocal; - - int flag = 0; - - if (!std::isfinite(boxlo[0]) || !std::isfinite(boxlo[1]) || !std::isfinite(boxlo[2])) - error->one(FLERR,"Non-numeric box dimensions - simulation unstable"); - - for (int i = 0; i < nlocal; i++) { - - // (nx,ny,nz) = global coords of grid pt to "lower left" of charge - // current particle coord can be outside global and local box - // add/subtract OFFSET to avoid int(-0.75) = 0 when want it to be -1 - - nx = static_cast ((x[i][0]-boxlo[0])*delxinv+shift) - OFFSET; - ny = static_cast ((x[i][1]-boxlo[1])*delyinv+shift) - OFFSET; - nz = static_cast ((x[i][2]-boxlo[2])*delzinv+shift) - OFFSET; - - part2grid[i][0] = nx; - part2grid[i][1] = ny; - part2grid[i][2] = nz; - - // check that entire stencil around nx,ny,nz will fit in my 3d brick - - if (nx+nlower < nxlo_out || nx+nupper > nxhi_out || - ny+nlower < nylo_out || ny+nupper > nyhi_out || - nz+nlower < nzlo_out || nz+nupper > nzhi_out) - flag = 1; - } - - 
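/* ----------------------------------------------------------------------
   illustrative sketch (not part of this patch): why particle_map() adds
   OFFSET before the int cast and subtracts it afterwards.  a bare cast
   truncates toward zero, so int(-0.75) == 0; shifting into positive
   territory first makes the truncation behave like floor() for
   coordinates slightly below the box.  names below are hypothetical.
------------------------------------------------------------------------- */

#include <cstdio>

int main()
{
  const int OFFSET = 16384;        // same constant the PPPM code uses
  double u = -0.75;                // scaled grid coordinate just below 0

  int naive  = static_cast<int>(u);                     // 0  (truncates toward zero)
  int offset = static_cast<int>(u + OFFSET) - OFFSET;   // -1 (floor-like)

  printf("naive cast = %d, offset trick = %d\n",naive,offset);
  return 0;
}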
if (flag) error->one(FLERR,"Out of range atoms - cannot compute PPPM"); -} - -/* ---------------------------------------------------------------------- - create discretized "density" on section of global grid due to my particles - density(x,y,z) = charge "density" at grid points of my 3d brick - (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts) - in global grid -------------------------------------------------------------------------- */ - -void PPPM2::make_rho() -{ - int l,m,n,nx,ny,nz,mx,my,mz; - FFT_SCALAR dx,dy,dz,x0,y0,z0; - - // clear 3d density array - - memset(&(density_brick[nzlo_out][nylo_out][nxlo_out]),0, - ngrid*sizeof(FFT_SCALAR)); - - // loop over my charges, add their contribution to nearby grid points - // (nx,ny,nz) = global coords of grid pt to "lower left" of charge - // (dx,dy,dz) = distance to "lower left" grid pt - // (mx,my,mz) = global coords of moving stencil pt - - double *q = atom->q; - double **x = atom->x; - int nlocal = atom->nlocal; - - for (int i = 0; i < nlocal; i++) { - - nx = part2grid[i][0]; - ny = part2grid[i][1]; - nz = part2grid[i][2]; - dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv; - dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv; - dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv; - - compute_rho1d(dx,dy,dz); - - z0 = delvolinv * q[i]; - for (n = nlower; n <= nupper; n++) { - mz = n+nz; - y0 = z0*rho1d[2][n]; - for (m = nlower; m <= nupper; m++) { - my = m+ny; - x0 = y0*rho1d[1][m]; - for (l = nlower; l <= nupper; l++) { - mx = l+nx; - density_brick[mz][my][mx] += x0*rho1d[0][l]; - } - } - } - } -} - -/* ---------------------------------------------------------------------- - remap density from 3d brick decomposition to FFT decomposition -------------------------------------------------------------------------- */ - -void PPPM2::brick2fft() -{ - int n,ix,iy,iz; - - // copy grabs inner portion of density from 3d brick - // remap could be done as pre-stage of FFT, - // but this works optimally on only double values, not complex values - - n = 0; - for (iz = nzlo_in; iz <= nzhi_in; iz++) - for (iy = nylo_in; iy <= nyhi_in; iy++) - for (ix = nxlo_in; ix <= nxhi_in; ix++) - density_fft[n++] = density_brick[iz][iy][ix]; - - remap->perform(density_fft,density_fft,work1); -} - -/* ---------------------------------------------------------------------- - FFT-based Poisson solver -------------------------------------------------------------------------- */ - -void PPPM2::poisson() -{ - if (differentiation_flag == 1) poisson_ad(); - else poisson_ik(); -} - -/* ---------------------------------------------------------------------- - FFT-based Poisson solver for ik -------------------------------------------------------------------------- */ - -void PPPM2::poisson_ik() -{ - int i,j,k,n; - double eng; - - // transform charge density (r -> k) - - n = 0; - for (i = 0; i < nfft; i++) { - work1[n++] = density_fft[i]; - work1[n++] = ZEROF; - } - - fft1->compute(work1,work1,1); - - // global energy and virial contribution - - double scaleinv = 1.0/(nx_pppm*ny_pppm*nz_pppm); - double s2 = scaleinv*scaleinv; - - if (eflag_global || vflag_global) { - if (vflag_global) { - n = 0; - for (i = 0; i < nfft; i++) { - eng = s2 * greensfn[i] * (work1[n]*work1[n] + work1[n+1]*work1[n+1]); - for (j = 0; j < 6; j++) virial[j] += eng*vg[i][j]; - if (eflag_global) energy += eng; - n += 2; - } - } else { - n = 0; - for (i = 0; i < nfft; i++) { - energy += - s2 * greensfn[i] * (work1[n]*work1[n] + work1[n+1]*work1[n+1]); - n += 2; - } - } - } - - // scale by 
1/total-grid-pts to get rho(k) - // multiply by Green's function to get V(k) - - n = 0; - for (i = 0; i < nfft; i++) { - work1[n++] *= scaleinv * greensfn[i]; - work1[n++] *= scaleinv * greensfn[i]; - } - - // extra FFTs for per-atom energy/virial - - if (evflag_atom) poisson_peratom(); - - // triclinic system - - if (triclinic) { - poisson_ik_triclinic(); - return; - } - - // compute gradients of V(r) in each of 3 dims by transformimg -ik*V(k) - // FFT leaves data in 3d brick decomposition - // copy it into inner portion of vdx,vdy,vdz arrays - - // x direction gradient - - n = 0; - for (k = nzlo_fft; k <= nzhi_fft; k++) - for (j = nylo_fft; j <= nyhi_fft; j++) - for (i = nxlo_fft; i <= nxhi_fft; i++) { - work2[n] = fkx[i]*work1[n+1]; - work2[n+1] = -fkx[i]*work1[n]; - n += 2; - } - - fft2->compute(work2,work2,-1); - - n = 0; - for (k = nzlo_in; k <= nzhi_in; k++) - for (j = nylo_in; j <= nyhi_in; j++) - for (i = nxlo_in; i <= nxhi_in; i++) { - vdx_brick[k][j][i] = work2[n]; - n += 2; - } - - // y direction gradient - - n = 0; - for (k = nzlo_fft; k <= nzhi_fft; k++) - for (j = nylo_fft; j <= nyhi_fft; j++) - for (i = nxlo_fft; i <= nxhi_fft; i++) { - work2[n] = fky[j]*work1[n+1]; - work2[n+1] = -fky[j]*work1[n]; - n += 2; - } - - fft2->compute(work2,work2,-1); - - n = 0; - for (k = nzlo_in; k <= nzhi_in; k++) - for (j = nylo_in; j <= nyhi_in; j++) - for (i = nxlo_in; i <= nxhi_in; i++) { - vdy_brick[k][j][i] = work2[n]; - n += 2; - } - - // z direction gradient - - n = 0; - for (k = nzlo_fft; k <= nzhi_fft; k++) - for (j = nylo_fft; j <= nyhi_fft; j++) - for (i = nxlo_fft; i <= nxhi_fft; i++) { - work2[n] = fkz[k]*work1[n+1]; - work2[n+1] = -fkz[k]*work1[n]; - n += 2; - } - - fft2->compute(work2,work2,-1); - - n = 0; - for (k = nzlo_in; k <= nzhi_in; k++) - for (j = nylo_in; j <= nyhi_in; j++) - for (i = nxlo_in; i <= nxhi_in; i++) { - vdz_brick[k][j][i] = work2[n]; - n += 2; - } -} - -/* ---------------------------------------------------------------------- - FFT-based Poisson solver for ik for a triclinic system -------------------------------------------------------------------------- */ - -void PPPM2::poisson_ik_triclinic() -{ - int i,j,k,n; - - // compute gradients of V(r) in each of 3 dims by transformimg -ik*V(k) - // FFT leaves data in 3d brick decomposition - // copy it into inner portion of vdx,vdy,vdz arrays - - // x direction gradient - - n = 0; - for (i = 0; i < nfft; i++) { - work2[n] = fkx[i]*work1[n+1]; - work2[n+1] = -fkx[i]*work1[n]; - n += 2; - } - - fft2->compute(work2,work2,-1); - - n = 0; - for (k = nzlo_in; k <= nzhi_in; k++) - for (j = nylo_in; j <= nyhi_in; j++) - for (i = nxlo_in; i <= nxhi_in; i++) { - vdx_brick[k][j][i] = work2[n]; - n += 2; - } - - // y direction gradient - - n = 0; - for (i = 0; i < nfft; i++) { - work2[n] = fky[i]*work1[n+1]; - work2[n+1] = -fky[i]*work1[n]; - n += 2; - } - - fft2->compute(work2,work2,-1); - - n = 0; - for (k = nzlo_in; k <= nzhi_in; k++) - for (j = nylo_in; j <= nyhi_in; j++) - for (i = nxlo_in; i <= nxhi_in; i++) { - vdy_brick[k][j][i] = work2[n]; - n += 2; - } - - // z direction gradient - - n = 0; - for (i = 0; i < nfft; i++) { - work2[n] = fkz[i]*work1[n+1]; - work2[n+1] = -fkz[i]*work1[n]; - n += 2; - } - - fft2->compute(work2,work2,-1); - - n = 0; - for (k = nzlo_in; k <= nzhi_in; k++) - for (j = nylo_in; j <= nyhi_in; j++) - for (i = nxlo_in; i <= nxhi_in; i++) { - vdz_brick[k][j][i] = work2[n]; - n += 2; - } -} - -/* ---------------------------------------------------------------------- - FFT-based Poisson solver 
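/* ----------------------------------------------------------------------
   illustrative sketch (not part of this patch): the gradient step in
   poisson_ik() above multiplies V(k), stored as interleaved (re,im)
   pairs, by -i*k in each dimension.  writing
   (re + i*im)*(-i*k) = k*im - i*k*re explains the swapped components
   and the sign.  the function and driver names are hypothetical.
------------------------------------------------------------------------- */

#include <cstdio>

// multiply an interleaved complex array by -i*k, element by element
static void mult_by_minus_ik(const double *in, double *out, const double *k, int n)
{
  for (int i = 0; i < n; i++) {
    out[2*i]   =  k[i]*in[2*i+1];   // new real part =  k * old imaginary
    out[2*i+1] = -k[i]*in[2*i];     // new imag part = -k * old real
  }
}

int main()
{
  double in[2] = {1.0, 2.0};        // 1 + 2i
  double k[1]  = {3.0};
  double out[2];
  mult_by_minus_ik(in,out,k,1);
  printf("(1+2i)*(-3i) = %g + %gi\n",out[0],out[1]);   // 6 - 3i
  return 0;
}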
for ad -------------------------------------------------------------------------- */ - -void PPPM2::poisson_ad() -{ - int i,j,k,n; - double eng; - - // transform charge density (r -> k) - - n = 0; - for (i = 0; i < nfft; i++) { - work1[n++] = density_fft[i]; - work1[n++] = ZEROF; - } - - fft1->compute(work1,work1,1); - - // global energy and virial contribution - - double scaleinv = 1.0/(nx_pppm*ny_pppm*nz_pppm); - double s2 = scaleinv*scaleinv; - - if (eflag_global || vflag_global) { - if (vflag_global) { - n = 0; - for (i = 0; i < nfft; i++) { - eng = s2 * greensfn[i] * (work1[n]*work1[n] + work1[n+1]*work1[n+1]); - for (j = 0; j < 6; j++) virial[j] += eng*vg[i][j]; - if (eflag_global) energy += eng; - n += 2; - } - } else { - n = 0; - for (i = 0; i < nfft; i++) { - energy += - s2 * greensfn[i] * (work1[n]*work1[n] + work1[n+1]*work1[n+1]); - n += 2; - } - } - } - - // scale by 1/total-grid-pts to get rho(k) - // multiply by Green's function to get V(k) - - n = 0; - for (i = 0; i < nfft; i++) { - work1[n++] *= scaleinv * greensfn[i]; - work1[n++] *= scaleinv * greensfn[i]; - } - - // extra FFTs for per-atom energy/virial - - if (vflag_atom) poisson_peratom(); - - n = 0; - for (i = 0; i < nfft; i++) { - work2[n] = work1[n]; - work2[n+1] = work1[n+1]; - n += 2; - } - - fft2->compute(work2,work2,-1); - - n = 0; - for (k = nzlo_in; k <= nzhi_in; k++) - for (j = nylo_in; j <= nyhi_in; j++) - for (i = nxlo_in; i <= nxhi_in; i++) { - u_brick[k][j][i] = work2[n]; - n += 2; - } -} - -/* ---------------------------------------------------------------------- - FFT-based Poisson solver for per-atom energy/virial -------------------------------------------------------------------------- */ - -void PPPM2::poisson_peratom() -{ - int i,j,k,n; - - // energy - - if (eflag_atom && differentiation_flag != 1) { - n = 0; - for (i = 0; i < nfft; i++) { - work2[n] = work1[n]; - work2[n+1] = work1[n+1]; - n += 2; - } - - fft2->compute(work2,work2,-1); - - n = 0; - for (k = nzlo_in; k <= nzhi_in; k++) - for (j = nylo_in; j <= nyhi_in; j++) - for (i = nxlo_in; i <= nxhi_in; i++) { - u_brick[k][j][i] = work2[n]; - n += 2; - } - } - - // 6 components of virial in v0 thru v5 - - if (!vflag_atom) return; - - n = 0; - for (i = 0; i < nfft; i++) { - work2[n] = work1[n]*vg[i][0]; - work2[n+1] = work1[n+1]*vg[i][0]; - n += 2; - } - - fft2->compute(work2,work2,-1); - - n = 0; - for (k = nzlo_in; k <= nzhi_in; k++) - for (j = nylo_in; j <= nyhi_in; j++) - for (i = nxlo_in; i <= nxhi_in; i++) { - v0_brick[k][j][i] = work2[n]; - n += 2; - } - - n = 0; - for (i = 0; i < nfft; i++) { - work2[n] = work1[n]*vg[i][1]; - work2[n+1] = work1[n+1]*vg[i][1]; - n += 2; - } - - fft2->compute(work2,work2,-1); - - n = 0; - for (k = nzlo_in; k <= nzhi_in; k++) - for (j = nylo_in; j <= nyhi_in; j++) - for (i = nxlo_in; i <= nxhi_in; i++) { - v1_brick[k][j][i] = work2[n]; - n += 2; - } - - n = 0; - for (i = 0; i < nfft; i++) { - work2[n] = work1[n]*vg[i][2]; - work2[n+1] = work1[n+1]*vg[i][2]; - n += 2; - } - - fft2->compute(work2,work2,-1); - - n = 0; - for (k = nzlo_in; k <= nzhi_in; k++) - for (j = nylo_in; j <= nyhi_in; j++) - for (i = nxlo_in; i <= nxhi_in; i++) { - v2_brick[k][j][i] = work2[n]; - n += 2; - } - - n = 0; - for (i = 0; i < nfft; i++) { - work2[n] = work1[n]*vg[i][3]; - work2[n+1] = work1[n+1]*vg[i][3]; - n += 2; - } - - fft2->compute(work2,work2,-1); - - n = 0; - for (k = nzlo_in; k <= nzhi_in; k++) - for (j = nylo_in; j <= nyhi_in; j++) - for (i = nxlo_in; i <= nxhi_in; i++) { - v3_brick[k][j][i] = work2[n]; - n += 2; - } 
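/* ----------------------------------------------------------------------
   illustrative sketch (not part of this patch): the six per-atom virial
   transforms in poisson_peratom() all share one kernel -- weight rho(k)
   by one column of vg, inverse FFT, scatter into the matching brick.
   the standalone helper below shows only the weighting step; its name,
   arguments, and the toy data in main() are assumptions.
------------------------------------------------------------------------- */

#include <cstdio>

// apply virial weight column c of vg to interleaved complex k-space data
static void weight_component(const double *rho_k, double *out,
                             double **vg, int nfft, int c)
{
  int n = 0;
  for (int i = 0; i < nfft; i++) {
    out[n]   = rho_k[n]*vg[i][c];
    out[n+1] = rho_k[n+1]*vg[i][c];
    n += 2;
  }
}

int main()
{
  const int nfft = 2;
  double rho_k[4] = {1.0,0.5, -2.0,0.25};   // two interleaved complex values
  double out[4];
  double vg0[6] = {1.1,0,0,0,0,0}, vg1[6] = {0.9,0,0,0,0,0};
  double *vg[2] = {vg0,vg1};
  weight_component(rho_k,out,vg,nfft,0);
  printf("%g %g %g %g\n",out[0],out[1],out[2],out[3]);   // 1.1 0.55 -1.8 0.225
  return 0;
}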
- - n = 0; - for (i = 0; i < nfft; i++) { - work2[n] = work1[n]*vg[i][4]; - work2[n+1] = work1[n+1]*vg[i][4]; - n += 2; - } - - fft2->compute(work2,work2,-1); - - n = 0; - for (k = nzlo_in; k <= nzhi_in; k++) - for (j = nylo_in; j <= nyhi_in; j++) - for (i = nxlo_in; i <= nxhi_in; i++) { - v4_brick[k][j][i] = work2[n]; - n += 2; - } - - n = 0; - for (i = 0; i < nfft; i++) { - work2[n] = work1[n]*vg[i][5]; - work2[n+1] = work1[n+1]*vg[i][5]; - n += 2; - } - - fft2->compute(work2,work2,-1); - - n = 0; - for (k = nzlo_in; k <= nzhi_in; k++) - for (j = nylo_in; j <= nyhi_in; j++) - for (i = nxlo_in; i <= nxhi_in; i++) { - v5_brick[k][j][i] = work2[n]; - n += 2; - } -} - -/* ---------------------------------------------------------------------- - interpolate from grid to get electric field & force on my particles -------------------------------------------------------------------------- */ - -void PPPM2::fieldforce() -{ - if (differentiation_flag == 1) fieldforce_ad(); - else fieldforce_ik(); -} - -/* ---------------------------------------------------------------------- - interpolate from grid to get electric field & force on my particles for ik -------------------------------------------------------------------------- */ - -void PPPM2::fieldforce_ik() -{ - int i,l,m,n,nx,ny,nz,mx,my,mz; - FFT_SCALAR dx,dy,dz,x0,y0,z0; - FFT_SCALAR ekx,eky,ekz; - - // loop over my charges, interpolate electric field from nearby grid points - // (nx,ny,nz) = global coords of grid pt to "lower left" of charge - // (dx,dy,dz) = distance to "lower left" grid pt - // (mx,my,mz) = global coords of moving stencil pt - // ek = 3 components of E-field on particle - - double *q = atom->q; - double **x = atom->x; - double **f = atom->f; - - int nlocal = atom->nlocal; - - for (i = 0; i < nlocal; i++) { - nx = part2grid[i][0]; - ny = part2grid[i][1]; - nz = part2grid[i][2]; - dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv; - dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv; - dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv; - - compute_rho1d(dx,dy,dz); - - ekx = eky = ekz = ZEROF; - for (n = nlower; n <= nupper; n++) { - mz = n+nz; - z0 = rho1d[2][n]; - for (m = nlower; m <= nupper; m++) { - my = m+ny; - y0 = z0*rho1d[1][m]; - for (l = nlower; l <= nupper; l++) { - mx = l+nx; - x0 = y0*rho1d[0][l]; - ekx -= x0*vdx_brick[mz][my][mx]; - eky -= x0*vdy_brick[mz][my][mx]; - ekz -= x0*vdz_brick[mz][my][mx]; - } - } - } - - // convert E-field to force - - const double qfactor = qqrd2e * scale * q[i]; - f[i][0] += qfactor*ekx; - f[i][1] += qfactor*eky; - if (slabflag != 2) f[i][2] += qfactor*ekz; - } -} - -/* ---------------------------------------------------------------------- - interpolate from grid to get electric field & force on my particles for ad -------------------------------------------------------------------------- */ - -void PPPM2::fieldforce_ad() -{ - int i,l,m,n,nx,ny,nz,mx,my,mz; - FFT_SCALAR dx,dy,dz; - FFT_SCALAR ekx,eky,ekz; - double s1,s2,s3; - double sf = 0.0; - double *prd; - - prd = domain->prd; - double xprd = prd[0]; - double yprd = prd[1]; - double zprd = prd[2]; - - double hx_inv = nx_pppm/xprd; - double hy_inv = ny_pppm/yprd; - double hz_inv = nz_pppm/zprd; - - // loop over my charges, interpolate electric field from nearby grid points - // (nx,ny,nz) = global coords of grid pt to "lower left" of charge - // (dx,dy,dz) = distance to "lower left" grid pt - // (mx,my,mz) = global coords of moving stencil pt - // ek = 3 components of E-field on particle - - double *q = atom->q; - double **x = atom->x; - double 
**f = atom->f; - - int nlocal = atom->nlocal; - - for (i = 0; i < nlocal; i++) { - nx = part2grid[i][0]; - ny = part2grid[i][1]; - nz = part2grid[i][2]; - dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv; - dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv; - dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv; - - compute_rho1d(dx,dy,dz); - compute_drho1d(dx,dy,dz); - - ekx = eky = ekz = ZEROF; - for (n = nlower; n <= nupper; n++) { - mz = n+nz; - for (m = nlower; m <= nupper; m++) { - my = m+ny; - for (l = nlower; l <= nupper; l++) { - mx = l+nx; - ekx += drho1d[0][l]*rho1d[1][m]*rho1d[2][n]*u_brick[mz][my][mx]; - eky += rho1d[0][l]*drho1d[1][m]*rho1d[2][n]*u_brick[mz][my][mx]; - ekz += rho1d[0][l]*rho1d[1][m]*drho1d[2][n]*u_brick[mz][my][mx]; - } - } - } - ekx *= hx_inv; - eky *= hy_inv; - ekz *= hz_inv; - - // convert E-field to force and subtract self forces - - const double qfactor = qqrd2e * scale; - - s1 = x[i][0]*hx_inv; - s2 = x[i][1]*hy_inv; - s3 = x[i][2]*hz_inv; - sf = sf_coeff[0]*sin(2*MY_PI*s1); - sf += sf_coeff[1]*sin(4*MY_PI*s1); - sf *= 2*q[i]*q[i]; - f[i][0] += qfactor*(ekx*q[i] - sf); - - sf = sf_coeff[2]*sin(2*MY_PI*s2); - sf += sf_coeff[3]*sin(4*MY_PI*s2); - sf *= 2*q[i]*q[i]; - f[i][1] += qfactor*(eky*q[i] - sf); - - - sf = sf_coeff[4]*sin(2*MY_PI*s3); - sf += sf_coeff[5]*sin(4*MY_PI*s3); - sf *= 2*q[i]*q[i]; - if (slabflag != 2) f[i][2] += qfactor*(ekz*q[i] - sf); - } -} - -/* ---------------------------------------------------------------------- - interpolate from grid to get per-atom energy/virial -------------------------------------------------------------------------- */ - -void PPPM2::fieldforce_peratom() -{ - int i,l,m,n,nx,ny,nz,mx,my,mz; - FFT_SCALAR dx,dy,dz,x0,y0,z0; - FFT_SCALAR u,v0,v1,v2,v3,v4,v5; - - // loop over my charges, interpolate from nearby grid points - // (nx,ny,nz) = global coords of grid pt to "lower left" of charge - // (dx,dy,dz) = distance to "lower left" grid pt - // (mx,my,mz) = global coords of moving stencil pt - - double *q = atom->q; - double **x = atom->x; - - int nlocal = atom->nlocal; - - for (i = 0; i < nlocal; i++) { - nx = part2grid[i][0]; - ny = part2grid[i][1]; - nz = part2grid[i][2]; - dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv; - dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv; - dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv; - - compute_rho1d(dx,dy,dz); - - u = v0 = v1 = v2 = v3 = v4 = v5 = ZEROF; - for (n = nlower; n <= nupper; n++) { - mz = n+nz; - z0 = rho1d[2][n]; - for (m = nlower; m <= nupper; m++) { - my = m+ny; - y0 = z0*rho1d[1][m]; - for (l = nlower; l <= nupper; l++) { - mx = l+nx; - x0 = y0*rho1d[0][l]; - if (eflag_atom) u += x0*u_brick[mz][my][mx]; - if (vflag_atom) { - v0 += x0*v0_brick[mz][my][mx]; - v1 += x0*v1_brick[mz][my][mx]; - v2 += x0*v2_brick[mz][my][mx]; - v3 += x0*v3_brick[mz][my][mx]; - v4 += x0*v4_brick[mz][my][mx]; - v5 += x0*v5_brick[mz][my][mx]; - } - } - } - } - - if (eflag_atom) eatom[i] += q[i]*u; - if (vflag_atom) { - vatom[i][0] += q[i]*v0; - vatom[i][1] += q[i]*v1; - vatom[i][2] += q[i]*v2; - vatom[i][3] += q[i]*v3; - vatom[i][4] += q[i]*v4; - vatom[i][5] += q[i]*v5; - } - } -} - -/* ---------------------------------------------------------------------- - pack own values to buf to send to another proc -------------------------------------------------------------------------- */ - -void PPPM2::pack_forward_grid(int flag, void *pbuf, int nlist, int *list) -{ - FFT_SCALAR *buf = (FFT_SCALAR *) pbuf; - - int n = 0; - - if (flag == FORWARD_IK) { - FFT_SCALAR *xsrc = 
&vdx_brick[nzlo_out][nylo_out][nxlo_out]; - FFT_SCALAR *ysrc = &vdy_brick[nzlo_out][nylo_out][nxlo_out]; - FFT_SCALAR *zsrc = &vdz_brick[nzlo_out][nylo_out][nxlo_out]; - for (int i = 0; i < nlist; i++) { - buf[n++] = xsrc[list[i]]; - buf[n++] = ysrc[list[i]]; - buf[n++] = zsrc[list[i]]; - } - } else if (flag == FORWARD_AD) { - FFT_SCALAR *src = &u_brick[nzlo_out][nylo_out][nxlo_out]; - for (int i = 0; i < nlist; i++) - buf[i] = src[list[i]]; - } else if (flag == FORWARD_IK_PERATOM) { - FFT_SCALAR *esrc = &u_brick[nzlo_out][nylo_out][nxlo_out]; - FFT_SCALAR *v0src = &v0_brick[nzlo_out][nylo_out][nxlo_out]; - FFT_SCALAR *v1src = &v1_brick[nzlo_out][nylo_out][nxlo_out]; - FFT_SCALAR *v2src = &v2_brick[nzlo_out][nylo_out][nxlo_out]; - FFT_SCALAR *v3src = &v3_brick[nzlo_out][nylo_out][nxlo_out]; - FFT_SCALAR *v4src = &v4_brick[nzlo_out][nylo_out][nxlo_out]; - FFT_SCALAR *v5src = &v5_brick[nzlo_out][nylo_out][nxlo_out]; - for (int i = 0; i < nlist; i++) { - if (eflag_atom) buf[n++] = esrc[list[i]]; - if (vflag_atom) { - buf[n++] = v0src[list[i]]; - buf[n++] = v1src[list[i]]; - buf[n++] = v2src[list[i]]; - buf[n++] = v3src[list[i]]; - buf[n++] = v4src[list[i]]; - buf[n++] = v5src[list[i]]; - } - } - } else if (flag == FORWARD_AD_PERATOM) { - FFT_SCALAR *v0src = &v0_brick[nzlo_out][nylo_out][nxlo_out]; - FFT_SCALAR *v1src = &v1_brick[nzlo_out][nylo_out][nxlo_out]; - FFT_SCALAR *v2src = &v2_brick[nzlo_out][nylo_out][nxlo_out]; - FFT_SCALAR *v3src = &v3_brick[nzlo_out][nylo_out][nxlo_out]; - FFT_SCALAR *v4src = &v4_brick[nzlo_out][nylo_out][nxlo_out]; - FFT_SCALAR *v5src = &v5_brick[nzlo_out][nylo_out][nxlo_out]; - for (int i = 0; i < nlist; i++) { - buf[n++] = v0src[list[i]]; - buf[n++] = v1src[list[i]]; - buf[n++] = v2src[list[i]]; - buf[n++] = v3src[list[i]]; - buf[n++] = v4src[list[i]]; - buf[n++] = v5src[list[i]]; - } - } -} - -/* ---------------------------------------------------------------------- - unpack another proc's own values from buf and set own ghost values -------------------------------------------------------------------------- */ - -void PPPM2::unpack_forward_grid(int flag, void *pbuf, int nlist, int *list) -{ - FFT_SCALAR *buf = (FFT_SCALAR *) pbuf; - - int n = 0; - - if (flag == FORWARD_IK) { - FFT_SCALAR *xdest = &vdx_brick[nzlo_out][nylo_out][nxlo_out]; - FFT_SCALAR *ydest = &vdy_brick[nzlo_out][nylo_out][nxlo_out]; - FFT_SCALAR *zdest = &vdz_brick[nzlo_out][nylo_out][nxlo_out]; - for (int i = 0; i < nlist; i++) { - xdest[list[i]] = buf[n++]; - ydest[list[i]] = buf[n++]; - zdest[list[i]] = buf[n++]; - } - } else if (flag == FORWARD_AD) { - FFT_SCALAR *dest = &u_brick[nzlo_out][nylo_out][nxlo_out]; - for (int i = 0; i < nlist; i++) - dest[list[i]] = buf[i]; - } else if (flag == FORWARD_IK_PERATOM) { - FFT_SCALAR *esrc = &u_brick[nzlo_out][nylo_out][nxlo_out]; - FFT_SCALAR *v0src = &v0_brick[nzlo_out][nylo_out][nxlo_out]; - FFT_SCALAR *v1src = &v1_brick[nzlo_out][nylo_out][nxlo_out]; - FFT_SCALAR *v2src = &v2_brick[nzlo_out][nylo_out][nxlo_out]; - FFT_SCALAR *v3src = &v3_brick[nzlo_out][nylo_out][nxlo_out]; - FFT_SCALAR *v4src = &v4_brick[nzlo_out][nylo_out][nxlo_out]; - FFT_SCALAR *v5src = &v5_brick[nzlo_out][nylo_out][nxlo_out]; - for (int i = 0; i < nlist; i++) { - if (eflag_atom) esrc[list[i]] = buf[n++]; - if (vflag_atom) { - v0src[list[i]] = buf[n++]; - v1src[list[i]] = buf[n++]; - v2src[list[i]] = buf[n++]; - v3src[list[i]] = buf[n++]; - v4src[list[i]] = buf[n++]; - v5src[list[i]] = buf[n++]; - } - } - } else if (flag == FORWARD_AD_PERATOM) { - FFT_SCALAR *v0src = 
&v0_brick[nzlo_out][nylo_out][nxlo_out]; - FFT_SCALAR *v1src = &v1_brick[nzlo_out][nylo_out][nxlo_out]; - FFT_SCALAR *v2src = &v2_brick[nzlo_out][nylo_out][nxlo_out]; - FFT_SCALAR *v3src = &v3_brick[nzlo_out][nylo_out][nxlo_out]; - FFT_SCALAR *v4src = &v4_brick[nzlo_out][nylo_out][nxlo_out]; - FFT_SCALAR *v5src = &v5_brick[nzlo_out][nylo_out][nxlo_out]; - for (int i = 0; i < nlist; i++) { - v0src[list[i]] = buf[n++]; - v1src[list[i]] = buf[n++]; - v2src[list[i]] = buf[n++]; - v3src[list[i]] = buf[n++]; - v4src[list[i]] = buf[n++]; - v5src[list[i]] = buf[n++]; - } - } -} - -/* ---------------------------------------------------------------------- - pack ghost values into buf to send to another proc -------------------------------------------------------------------------- */ - -void PPPM2::pack_reverse_grid(int flag, void *pbuf, int nlist, int *list) -{ - FFT_SCALAR *buf = (FFT_SCALAR *) pbuf; - - if (flag == REVERSE_RHO) { - FFT_SCALAR *src = &density_brick[nzlo_out][nylo_out][nxlo_out]; - for (int i = 0; i < nlist; i++) - buf[i] = src[list[i]]; - } -} - -/* ---------------------------------------------------------------------- - unpack another proc's ghost values from buf and add to own values -------------------------------------------------------------------------- */ - -void PPPM2::unpack_reverse_grid(int flag, void *pbuf, int nlist, int *list) -{ - FFT_SCALAR *buf = (FFT_SCALAR *) pbuf; - - if (flag == REVERSE_RHO) { - FFT_SCALAR *dest = &density_brick[nzlo_out][nylo_out][nxlo_out]; - for (int i = 0; i < nlist; i++) - dest[list[i]] += buf[i]; - } -} - -/* ---------------------------------------------------------------------- - map nprocs to NX by NY grid as PX by PY procs - return optimal px,py -------------------------------------------------------------------------- */ - -void PPPM2::procs2grid2d(int nprocs, int nx, int ny, int *px, int *py) -{ - // loop thru all possible factorizations of nprocs - // surf = surface area of largest proc sub-domain - // innermost if test minimizes surface area and surface/volume ratio - - int bestsurf = 2 * (nx + ny); - int bestboxx = 0; - int bestboxy = 0; - - int boxx,boxy,surf,ipx,ipy; - - ipx = 1; - while (ipx <= nprocs) { - if (nprocs % ipx == 0) { - ipy = nprocs/ipx; - boxx = nx/ipx; - if (nx % ipx) boxx++; - boxy = ny/ipy; - if (ny % ipy) boxy++; - surf = boxx + boxy; - if (surf < bestsurf || - (surf == bestsurf && boxx*boxy > bestboxx*bestboxy)) { - bestsurf = surf; - bestboxx = boxx; - bestboxy = boxy; - *px = ipx; - *py = ipy; - } - } - ipx++; - } -} - -/* ---------------------------------------------------------------------- - charge assignment into rho1d - dx,dy,dz = distance of particle from "lower left" grid point -------------------------------------------------------------------------- */ - -void PPPM2::compute_rho1d(const FFT_SCALAR &dx, const FFT_SCALAR &dy, - const FFT_SCALAR &dz) -{ - int k,l; - FFT_SCALAR r1,r2,r3; - - for (k = (1-order)/2; k <= order/2; k++) { - r1 = r2 = r3 = ZEROF; - - for (l = order-1; l >= 0; l--) { - r1 = rho_coeff[l][k] + r1*dx; - r2 = rho_coeff[l][k] + r2*dy; - r3 = rho_coeff[l][k] + r3*dz; - } - rho1d[0][k] = r1; - rho1d[1][k] = r2; - rho1d[2][k] = r3; - } -} - -/* ---------------------------------------------------------------------- - charge assignment into drho1d - dx,dy,dz = distance of particle from "lower left" grid point -------------------------------------------------------------------------- */ - -void PPPM2::compute_drho1d(const FFT_SCALAR &dx, const FFT_SCALAR &dy, - const FFT_SCALAR &dz) -{ 
- int k,l; - FFT_SCALAR r1,r2,r3; - - for (k = (1-order)/2; k <= order/2; k++) { - r1 = r2 = r3 = ZEROF; - - for (l = order-2; l >= 0; l--) { - r1 = drho_coeff[l][k] + r1*dx; - r2 = drho_coeff[l][k] + r2*dy; - r3 = drho_coeff[l][k] + r3*dz; - } - drho1d[0][k] = r1; - drho1d[1][k] = r2; - drho1d[2][k] = r3; - } -} - -/* ---------------------------------------------------------------------- - generate coeffients for the weight function of order n - - (n-1) - Wn(x) = Sum wn(k,x) , Sum is over every other integer - k=-(n-1) - For k=-(n-1),-(n-1)+2, ....., (n-1)-2,n-1 - k is odd integers if n is even and even integers if n is odd - --- - | n-1 - | Sum a(l,j)*(x-k/2)**l if abs(x-k/2) < 1/2 - wn(k,x) = < l=0 - | - | 0 otherwise - --- - a coeffients are packed into the array rho_coeff to eliminate zeros - rho_coeff(l,((k+mod(n+1,2))/2) = a(l,k) -------------------------------------------------------------------------- */ - -void PPPM2::compute_rho_coeff() -{ - int j,k,l,m; - FFT_SCALAR s; - - FFT_SCALAR **a; - memory->create2d_offset(a,order,-order,order,"pppm:a"); - - for (k = -order; k <= order; k++) - for (l = 0; l < order; l++) - a[l][k] = 0.0; - - a[0][0] = 1.0; - for (j = 1; j < order; j++) { - for (k = -j; k <= j; k += 2) { - s = 0.0; - for (l = 0; l < j; l++) { - a[l+1][k] = (a[l][k+1]-a[l][k-1]) / (l+1); -#ifdef FFT_SINGLE - s += powf(0.5,(float) l+1) * - (a[l][k-1] + powf(-1.0,(float) l) * a[l][k+1]) / (l+1); -#else - s += pow(0.5,(double) l+1) * - (a[l][k-1] + pow(-1.0,(double) l) * a[l][k+1]) / (l+1); -#endif - } - a[0][k] = s; - } - } - - m = (1-order)/2; - for (k = -(order-1); k < order; k += 2) { - for (l = 0; l < order; l++) - rho_coeff[l][m] = a[l][k]; - for (l = 1; l < order; l++) - drho_coeff[l-1][m] = l*a[l][k]; - m++; - } - - memory->destroy2d_offset(a,-order); -} - -/* ---------------------------------------------------------------------- - Slab-geometry correction term to dampen inter-slab interactions between - periodically repeating slabs. Yields good approximation to 2D Ewald if - adequate empty space is left between repeating slabs (J. Chem. Phys. - 111, 3155). Slabs defined here to be parallel to the xy plane. Also - extended to non-neutral systems (J. Chem. Phys. 131, 094107). 
-------------------------------------------------------------------------- */ - -void PPPM2::slabcorr() -{ - // compute local contribution to global dipole moment - - double *q = atom->q; - double **x = atom->x; - double zprd = domain->zprd; - int nlocal = atom->nlocal; - - double dipole = 0.0; - for (int i = 0; i < nlocal; i++) dipole += q[i]*x[i][2]; - - // sum local contributions to get global dipole moment - - double dipole_all; - MPI_Allreduce(&dipole,&dipole_all,1,MPI_DOUBLE,MPI_SUM,world); - - // need to make non-neutral systems and/or - // per-atom energy translationally invariant - - double dipole_r2 = 0.0; - if (eflag_atom || fabs(qsum) > SMALL) { - for (int i = 0; i < nlocal; i++) - dipole_r2 += q[i]*x[i][2]*x[i][2]; - - // sum local contributions - - double tmp; - MPI_Allreduce(&dipole_r2,&tmp,1,MPI_DOUBLE,MPI_SUM,world); - dipole_r2 = tmp; - } - - // compute corrections - - const double e_slabcorr = MY_2PI*(dipole_all*dipole_all - - qsum*dipole_r2 - qsum*qsum*zprd*zprd/12.0)/volume; - const double qscale = qqrd2e * scale; - - if (eflag_global) energy += qscale * e_slabcorr; - - // per-atom energy - - if (eflag_atom) { - double efact = qscale * MY_2PI/volume; - for (int i = 0; i < nlocal; i++) - eatom[i] += efact * q[i]*(x[i][2]*dipole_all - 0.5*(dipole_r2 + - qsum*x[i][2]*x[i][2]) - qsum*zprd*zprd/12.0); - } - - // add on force corrections - - double ffact = qscale * (-4.0*MY_PI/volume); - double **f = atom->f; - - for (int i = 0; i < nlocal; i++) f[i][2] += ffact * q[i]*(dipole_all - qsum*x[i][2]); -} - -/* ---------------------------------------------------------------------- - perform and time the 1d FFTs required for N timesteps -------------------------------------------------------------------------- */ - -int PPPM2::timing_1d(int n, double &time1d) -{ - double time1,time2; - - for (int i = 0; i < 2*nfft_both; i++) work1[i] = ZEROF; - - MPI_Barrier(world); - time1 = MPI_Wtime(); - - for (int i = 0; i < n; i++) { - fft1->timing1d(work1,nfft_both,1); - fft2->timing1d(work1,nfft_both,-1); - if (differentiation_flag != 1) { - fft2->timing1d(work1,nfft_both,-1); - fft2->timing1d(work1,nfft_both,-1); - } - } - - MPI_Barrier(world); - time2 = MPI_Wtime(); - time1d = time2 - time1; - - if (differentiation_flag) return 2; - return 4; -} - -/* ---------------------------------------------------------------------- - perform and time the 3d FFTs required for N timesteps -------------------------------------------------------------------------- */ - -int PPPM2::timing_3d(int n, double &time3d) -{ - double time1,time2; - - for (int i = 0; i < 2*nfft_both; i++) work1[i] = ZEROF; - - MPI_Barrier(world); - time1 = MPI_Wtime(); - - for (int i = 0; i < n; i++) { - fft1->compute(work1,work1,1); - fft2->compute(work1,work1,-1); - if (differentiation_flag != 1) { - fft2->compute(work1,work1,-1); - fft2->compute(work1,work1,-1); - } - } - - MPI_Barrier(world); - time2 = MPI_Wtime(); - time3d = time2 - time1; - - if (differentiation_flag) return 2; - return 4; -} - -/* ---------------------------------------------------------------------- - memory usage of local arrays -------------------------------------------------------------------------- */ - -double PPPM2::memory_usage() -{ - double bytes = nmax*3 * sizeof(double); - - int nbrick = (nxhi_out-nxlo_out+1) * (nyhi_out-nylo_out+1) * - (nzhi_out-nzlo_out+1); - if (differentiation_flag == 1) { - bytes += 2 * nbrick * sizeof(FFT_SCALAR); - } else { - bytes += 4 * nbrick * sizeof(FFT_SCALAR); - } - - if (triclinic) bytes += 3 * nfft_both * 
sizeof(double); - bytes += 6 * nfft_both * sizeof(double); - bytes += nfft_both * sizeof(double); - bytes += nfft_both*5 * sizeof(FFT_SCALAR); - - if (peratom_allocate_flag) - bytes += 6 * nbrick * sizeof(FFT_SCALAR); - - if (group_allocate_flag) { - bytes += 2 * nbrick * sizeof(FFT_SCALAR); - bytes += 2 * nfft_both * sizeof(FFT_SCALAR);; - } - - // two GridComm bufs - - bytes += (ngc_buf1 + ngc_buf2) * npergrid * sizeof(FFT_SCALAR); - - return bytes; -} - -/* ---------------------------------------------------------------------- - group-group interactions - ------------------------------------------------------------------------- */ - -/* ---------------------------------------------------------------------- - compute the PPPM total long-range force and energy for groups A and B - ------------------------------------------------------------------------- */ - -void PPPM2::compute_group_group(int groupbit_A, int groupbit_B, int AA_flag) -{ - if (slabflag && triclinic) - error->all(FLERR,"Cannot (yet) use K-space slab " - "correction with compute group/group for triclinic systems"); - - if (differentiation_flag) - error->all(FLERR,"Cannot (yet) use kspace_modify " - "diff ad with compute group/group"); - - if (!group_allocate_flag) allocate_groups(); - - // convert atoms from box to lamda coords - - if (triclinic == 0) boxlo = domain->boxlo; - else { - boxlo = domain->boxlo_lamda; - domain->x2lamda(atom->nlocal); - } - - e2group = 0.0; //energy - f2group[0] = 0.0; //force in x-direction - f2group[1] = 0.0; //force in y-direction - f2group[2] = 0.0; //force in z-direction - - // map my particle charge onto my local 3d density grid - - make_rho_groups(groupbit_A,groupbit_B,AA_flag); - - // all procs communicate density values from their ghost cells - // to fully sum contribution in their 3d bricks - // remap from 3d decomposition to FFT decomposition - - // temporarily store and switch pointers so we can - // use brick2fft() for groups A and B (without - // writing an additional function) - - FFT_SCALAR ***density_brick_real = density_brick; - FFT_SCALAR *density_fft_real = density_fft; - - // group A - - density_brick = density_A_brick; - density_fft = density_A_fft; - - gc->reverse_comm_kspace(this,1,sizeof(FFT_SCALAR),REVERSE_RHO, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); - brick2fft(); - - // group B - - density_brick = density_B_brick; - density_fft = density_B_fft; - - gc->reverse_comm_kspace(this,1,sizeof(FFT_SCALAR),REVERSE_RHO, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); - brick2fft(); - - // switch back pointers - - density_brick = density_brick_real; - density_fft = density_fft_real; - - // compute potential gradient on my FFT grid and - // portion of group-group energy/force on this proc's FFT grid - - poisson_groups(AA_flag); - - const double qscale = qqrd2e * scale; - - // total group A <--> group B energy - // self and boundary correction terms are in compute_group_group.cpp - - double e2group_all; - MPI_Allreduce(&e2group,&e2group_all,1,MPI_DOUBLE,MPI_SUM,world); - e2group = e2group_all; - - e2group *= qscale*0.5*volume; - - // total group A <--> group B force - - double f2group_all[3]; - MPI_Allreduce(f2group,f2group_all,3,MPI_DOUBLE,MPI_SUM,world); - - f2group[0] = qscale*volume*f2group_all[0]; - f2group[1] = qscale*volume*f2group_all[1]; - if (slabflag != 2) f2group[2] = qscale*volume*f2group_all[2]; - - // convert atoms back from lamda to box coords - - if (triclinic) domain->lamda2x(atom->nlocal); - - if (slabflag == 1) - slabcorr_groups(groupbit_A, groupbit_B, AA_flag); -} - -/* 
---------------------------------------------------------------------- - allocate group-group memory that depends on # of K-vectors and order - ------------------------------------------------------------------------- */ - -void PPPM2::allocate_groups() -{ - group_allocate_flag = 1; - - memory->create3d_offset(density_A_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, - nxlo_out,nxhi_out,"pppm:density_A_brick"); - memory->create3d_offset(density_B_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, - nxlo_out,nxhi_out,"pppm:density_B_brick"); - memory->create(density_A_fft,nfft_both,"pppm:density_A_fft"); - memory->create(density_B_fft,nfft_both,"pppm:density_B_fft"); -} - -/* ---------------------------------------------------------------------- - deallocate group-group memory that depends on # of K-vectors and order - ------------------------------------------------------------------------- */ - -void PPPM2::deallocate_groups() -{ - group_allocate_flag = 0; - - memory->destroy3d_offset(density_A_brick,nzlo_out,nylo_out,nxlo_out); - memory->destroy3d_offset(density_B_brick,nzlo_out,nylo_out,nxlo_out); - memory->destroy(density_A_fft); - memory->destroy(density_B_fft); -} - -/* ---------------------------------------------------------------------- - create discretized "density" on section of global grid due to my particles - density(x,y,z) = charge "density" at grid points of my 3d brick - (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts) - in global grid for group-group interactions - ------------------------------------------------------------------------- */ - -void PPPM2::make_rho_groups(int groupbit_A, int groupbit_B, int AA_flag) -{ - int l,m,n,nx,ny,nz,mx,my,mz; - FFT_SCALAR dx,dy,dz,x0,y0,z0; - - // clear 3d density arrays - - memset(&(density_A_brick[nzlo_out][nylo_out][nxlo_out]),0, - ngrid*sizeof(FFT_SCALAR)); - - memset(&(density_B_brick[nzlo_out][nylo_out][nxlo_out]),0, - ngrid*sizeof(FFT_SCALAR)); - - // loop over my charges, add their contribution to nearby grid points - // (nx,ny,nz) = global coords of grid pt to "lower left" of charge - // (dx,dy,dz) = distance to "lower left" grid pt - // (mx,my,mz) = global coords of moving stencil pt - - double *q = atom->q; - double **x = atom->x; - int nlocal = atom->nlocal; - int *mask = atom->mask; - - for (int i = 0; i < nlocal; i++) { - - if (!((mask[i] & groupbit_A) && (mask[i] & groupbit_B))) - if (AA_flag) continue; - - if ((mask[i] & groupbit_A) || (mask[i] & groupbit_B)) { - - nx = part2grid[i][0]; - ny = part2grid[i][1]; - nz = part2grid[i][2]; - dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv; - dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv; - dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv; - - compute_rho1d(dx,dy,dz); - - z0 = delvolinv * q[i]; - for (n = nlower; n <= nupper; n++) { - mz = n+nz; - y0 = z0*rho1d[2][n]; - for (m = nlower; m <= nupper; m++) { - my = m+ny; - x0 = y0*rho1d[1][m]; - for (l = nlower; l <= nupper; l++) { - mx = l+nx; - - // group A - - if (mask[i] & groupbit_A) - density_A_brick[mz][my][mx] += x0*rho1d[0][l]; - - // group B - - if (mask[i] & groupbit_B) - density_B_brick[mz][my][mx] += x0*rho1d[0][l]; - } - } - } - } - } -} - -/* ---------------------------------------------------------------------- - FFT-based Poisson solver for group-group interactions - ------------------------------------------------------------------------- */ - -void PPPM2::poisson_groups(int AA_flag) -{ - int i,j,k,n; - - // reuse memory (already declared) - - FFT_SCALAR *work_A = work1; - FFT_SCALAR *work_B = work2; - 
- // transform charge density (r -> k) - - // group A - - n = 0; - for (i = 0; i < nfft; i++) { - work_A[n++] = density_A_fft[i]; - work_A[n++] = ZEROF; - } - - fft1->compute(work_A,work_A,1); - - // group B - - n = 0; - for (i = 0; i < nfft; i++) { - work_B[n++] = density_B_fft[i]; - work_B[n++] = ZEROF; - } - - fft1->compute(work_B,work_B,1); - - // group-group energy and force contribution, - // keep everything in reciprocal space so - // no inverse FFTs needed - - double scaleinv = 1.0/(nx_pppm*ny_pppm*nz_pppm); - double s2 = scaleinv*scaleinv; - - // energy - - n = 0; - for (i = 0; i < nfft; i++) { - e2group += s2 * greensfn[i] * - (work_A[n]*work_B[n] + work_A[n+1]*work_B[n+1]); - n += 2; - } - - if (AA_flag) return; - - - // multiply by Green's function and s2 - // (only for work_A so it is not squared below) - - n = 0; - for (i = 0; i < nfft; i++) { - work_A[n++] *= s2 * greensfn[i]; - work_A[n++] *= s2 * greensfn[i]; - } - - // triclinic system - - if (triclinic) { - poisson_groups_triclinic(); - return; - } - - double partial_group; - - // force, x direction - - n = 0; - for (k = nzlo_fft; k <= nzhi_fft; k++) - for (j = nylo_fft; j <= nyhi_fft; j++) - for (i = nxlo_fft; i <= nxhi_fft; i++) { - partial_group = work_A[n+1]*work_B[n] - work_A[n]*work_B[n+1]; - f2group[0] += fkx[i] * partial_group; - n += 2; - } - - // force, y direction - - n = 0; - for (k = nzlo_fft; k <= nzhi_fft; k++) - for (j = nylo_fft; j <= nyhi_fft; j++) - for (i = nxlo_fft; i <= nxhi_fft; i++) { - partial_group = work_A[n+1]*work_B[n] - work_A[n]*work_B[n+1]; - f2group[1] += fky[j] * partial_group; - n += 2; - } - - // force, z direction - - n = 0; - for (k = nzlo_fft; k <= nzhi_fft; k++) - for (j = nylo_fft; j <= nyhi_fft; j++) - for (i = nxlo_fft; i <= nxhi_fft; i++) { - partial_group = work_A[n+1]*work_B[n] - work_A[n]*work_B[n+1]; - f2group[2] += fkz[k] * partial_group; - n += 2; - } -} - -/* ---------------------------------------------------------------------- - FFT-based Poisson solver for group-group interactions - for a triclinic system - ------------------------------------------------------------------------- */ - -void PPPM2::poisson_groups_triclinic() -{ - int i,n; - - // reuse memory (already declared) - - FFT_SCALAR *work_A = work1; - FFT_SCALAR *work_B = work2; - - double partial_group; - - // force, x direction - - n = 0; - for (i = 0; i < nfft; i++) { - partial_group = work_A[n+1]*work_B[n] - work_A[n]*work_B[n+1]; - f2group[0] += fkx[i] * partial_group; - n += 2; - } - - // force, y direction - - n = 0; - for (i = 0; i < nfft; i++) { - partial_group = work_A[n+1]*work_B[n] - work_A[n]*work_B[n+1]; - f2group[1] += fky[i] * partial_group; - n += 2; - } - - // force, z direction - - n = 0; - for (i = 0; i < nfft; i++) { - partial_group = work_A[n+1]*work_B[n] - work_A[n]*work_B[n+1]; - f2group[2] += fkz[i] * partial_group; - n += 2; - } -} - -/* ---------------------------------------------------------------------- - Slab-geometry correction term to dampen inter-slab interactions between - periodically repeating slabs. Yields good approximation to 2D Ewald if - adequate empty space is left between repeating slabs (J. Chem. Phys. - 111, 3155). Slabs defined here to be parallel to the xy plane. Also - extended to non-neutral systems (J. Chem. Phys. 131, 094107). 
-------------------------------------------------------------------------- */ - -void PPPM2::slabcorr_groups(int groupbit_A, int groupbit_B, int AA_flag) -{ - // compute local contribution to global dipole moment - - double *q = atom->q; - double **x = atom->x; - double zprd = domain->zprd; - int *mask = atom->mask; - int nlocal = atom->nlocal; - - double qsum_A = 0.0; - double qsum_B = 0.0; - double dipole_A = 0.0; - double dipole_B = 0.0; - double dipole_r2_A = 0.0; - double dipole_r2_B = 0.0; - - for (int i = 0; i < nlocal; i++) { - if (!((mask[i] & groupbit_A) && (mask[i] & groupbit_B))) - if (AA_flag) continue; - - if (mask[i] & groupbit_A) { - qsum_A += q[i]; - dipole_A += q[i]*x[i][2]; - dipole_r2_A += q[i]*x[i][2]*x[i][2]; - } - - if (mask[i] & groupbit_B) { - qsum_B += q[i]; - dipole_B += q[i]*x[i][2]; - dipole_r2_B += q[i]*x[i][2]*x[i][2]; - } - } - - // sum local contributions to get total charge and global dipole moment - // for each group - - double tmp; - MPI_Allreduce(&qsum_A,&tmp,1,MPI_DOUBLE,MPI_SUM,world); - qsum_A = tmp; - - MPI_Allreduce(&qsum_B,&tmp,1,MPI_DOUBLE,MPI_SUM,world); - qsum_B = tmp; - - MPI_Allreduce(&dipole_A,&tmp,1,MPI_DOUBLE,MPI_SUM,world); - dipole_A = tmp; - - MPI_Allreduce(&dipole_B,&tmp,1,MPI_DOUBLE,MPI_SUM,world); - dipole_B = tmp; - - MPI_Allreduce(&dipole_r2_A,&tmp,1,MPI_DOUBLE,MPI_SUM,world); - dipole_r2_A = tmp; - - MPI_Allreduce(&dipole_r2_B,&tmp,1,MPI_DOUBLE,MPI_SUM,world); - dipole_r2_B = tmp; - - // compute corrections - - const double qscale = qqrd2e * scale; - const double efact = qscale * MY_2PI/volume; - - e2group += efact * (dipole_A*dipole_B - 0.5*(qsum_A*dipole_r2_B + - qsum_B*dipole_r2_A) - qsum_A*qsum_B*zprd*zprd/12.0); - - // add on force corrections - - const double ffact = qscale * (-4.0*MY_PI/volume); - f2group[2] += ffact * (qsum_A*dipole_B - qsum_B*dipole_A); -} diff --git a/src/KSPACE/pppm2.h b/src/KSPACE/pppm2.h deleted file mode 100644 index 11c9e74737..0000000000 --- a/src/KSPACE/pppm2.h +++ /dev/null @@ -1,360 +0,0 @@ -/* -*- c++ -*- ---------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - Copyright (2003) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. - - See the README file in the top-level LAMMPS directory. 
-------------------------------------------------------------------------- */ - -#ifdef KSPACE_CLASS - -KSpaceStyle(pppm2,PPPM2) - -#else - -#ifndef LMP_PPPM2_H -#define LMP_PPPM2_H - -#include "kspace.h" - -#if defined(FFT_FFTW3) -#define LMP_FFT_LIB "FFTW3" -#elif defined(FFT_MKL) -#define LMP_FFT_LIB "MKL FFT" -#elif defined(FFT_CUFFT) -#define LMP_FFT_LIB "cuFFT" -#else -#define LMP_FFT_LIB "KISS FFT" -#endif - -#ifdef FFT_SINGLE -typedef float FFT_SCALAR; -#define LMP_FFT_PREC "single" -#define MPI_FFT_SCALAR MPI_FLOAT -#else - -typedef double FFT_SCALAR; -#define LMP_FFT_PREC "double" -#define MPI_FFT_SCALAR MPI_DOUBLE -#endif - -namespace LAMMPS_NS { - -class PPPM2 : public KSpace { - public: - PPPM2(class LAMMPS *); - virtual ~PPPM2(); - virtual void settings(int, char **); - virtual void init(); - virtual void setup(); - virtual void setup_grid(); - virtual void compute(int, int); - virtual int timing_1d(int, double &); - virtual int timing_3d(int, double &); - virtual double memory_usage(); - - virtual void compute_group_group(int, int, int); - - protected: - int me,nprocs; - int nfactors; - int *factors; - double cutoff; - double volume; - double delxinv,delyinv,delzinv,delvolinv; - double h_x,h_y,h_z; - double shift,shiftone; - int peratom_allocate_flag; - - int nxlo_in,nylo_in,nzlo_in,nxhi_in,nyhi_in,nzhi_in; - int nxlo_out,nylo_out,nzlo_out,nxhi_out,nyhi_out,nzhi_out; - int nxlo_ghost,nxhi_ghost,nylo_ghost,nyhi_ghost,nzlo_ghost,nzhi_ghost; - int nxlo_fft,nylo_fft,nzlo_fft,nxhi_fft,nyhi_fft,nzhi_fft; - int nlower,nupper; - int ngrid,nfft,nfft_both; - - FFT_SCALAR ***density_brick; - FFT_SCALAR ***vdx_brick,***vdy_brick,***vdz_brick; - FFT_SCALAR ***u_brick; - FFT_SCALAR ***v0_brick,***v1_brick,***v2_brick; - FFT_SCALAR ***v3_brick,***v4_brick,***v5_brick; - double *greensfn; - double **vg; - double *fkx,*fky,*fkz; - FFT_SCALAR *density_fft; - FFT_SCALAR *work1,*work2; - - double *gf_b; - FFT_SCALAR **rho1d,**rho_coeff,**drho1d,**drho_coeff; - double *sf_precoeff1, *sf_precoeff2, *sf_precoeff3; - double *sf_precoeff4, *sf_precoeff5, *sf_precoeff6; - double sf_coeff[6]; // coefficients for calculating ad self-forces - double **acons; - - // FFTs and grid communication - - class FFT3d *fft1,*fft2; - class Remap *remap; - class GridComm2 *gc; - FFT_SCALAR *gc_buf1,*gc_buf2; - int ngc_buf1,ngc_buf2,npergrid; - - // group-group interactions - - int group_allocate_flag; - FFT_SCALAR ***density_A_brick,***density_B_brick; - FFT_SCALAR *density_A_fft,*density_B_fft; - - int **part2grid; // storage for particle -> grid mapping - int nmax; - - double *boxlo; - // TIP4P settings - int typeH,typeO; // atom types of TIP4P water H and O atoms - double qdist; // distance from O site to negative charge - double alpha; // geometric factor - - virtual void set_grid_global(); - void set_grid_local(); - void adjust_gewald(); - virtual double newton_raphson_f(); - double derivf(); - double final_accuracy(); - - virtual void allocate(); - virtual void allocate_peratom(); - virtual void deallocate(); - virtual void deallocate_peratom(); - int factorable(int); - double compute_df_kspace(); - double estimate_ik_error(double, double, bigint); - virtual double compute_qopt(); - virtual void compute_gf_denom(); - virtual void compute_gf_ik(); - virtual void compute_gf_ad(); - void compute_sf_precoeff(); - - virtual void particle_map(); - virtual void make_rho(); - virtual void brick2fft(); - - virtual void poisson(); - virtual void poisson_ik(); - virtual void poisson_ad(); - - virtual void fieldforce(); 
- virtual void fieldforce_ik(); - virtual void fieldforce_ad(); - - virtual void poisson_peratom(); - virtual void fieldforce_peratom(); - void procs2grid2d(int,int,int,int *, int*); - void compute_rho1d(const FFT_SCALAR &, const FFT_SCALAR &, - const FFT_SCALAR &); - void compute_drho1d(const FFT_SCALAR &, const FFT_SCALAR &, - const FFT_SCALAR &); - void compute_rho_coeff(); - virtual void slabcorr(); - - // grid communication - - virtual void pack_forward_grid(int, void *, int, int *); - virtual void unpack_forward_grid(int, void *, int, int *); - virtual void pack_reverse_grid(int, void *, int, int *); - virtual void unpack_reverse_grid(int, void *, int, int *); - - // triclinic - - int triclinic; // domain settings, orthog or triclinic - void setup_triclinic(); - void compute_gf_ik_triclinic(); - void poisson_ik_triclinic(); - void poisson_groups_triclinic(); - - // group-group interactions - - virtual void allocate_groups(); - virtual void deallocate_groups(); - virtual void make_rho_groups(int, int, int); - virtual void poisson_groups(int); - virtual void slabcorr_groups(int,int,int); - -/* ---------------------------------------------------------------------- - denominator for Hockney-Eastwood Green's function - of x,y,z = sin(kx*deltax/2), etc - - inf n-1 - S(n,k) = Sum W(k+pi*j)**2 = Sum b(l)*(z*z)**l - j=-inf l=0 - - = -(z*z)**n /(2n-1)! * (d/dx)**(2n-1) cot(x) at z = sin(x) - gf_b = denominator expansion coeffs -------------------------------------------------------------------------- */ - - inline double gf_denom(const double &x, const double &y, - const double &z) const { - double sx,sy,sz; - sz = sy = sx = 0.0; - for (int l = order-1; l >= 0; l--) { - sx = gf_b[l] + sx*x; - sy = gf_b[l] + sy*y; - sz = gf_b[l] + sz*z; - } - double s = sx*sy*sz; - return s*s; - }; -}; - -} - -#endif -#endif - -/* ERROR/WARNING messages: - -E: Illegal ... command - -Self-explanatory. Check the input script syntax and compare to the -documentation for the command. You can use -echo screen as a -command-line option when running LAMMPS to see the offending line. - -E: Must redefine kspace_style after changing to triclinic box - -UNDOCUMENTED - -E: Cannot (yet) use PPPM with triclinic box and kspace_modify diff ad - -This feature is not yet supported. - -E: Cannot (yet) use PPPM with triclinic box and slab correction - -This feature is not yet supported. - -E: Cannot use PPPM with 2d simulation - -The kspace style pppm cannot be used in 2d simulations. You can use -2d PPPM in a 3d simulation; see the kspace_modify command. - -E: PPPM can only currently be used with comm_style brick - -This is a current restriction in LAMMPS. - -E: Kspace style requires atom attribute q - -The atom style defined does not have these attributes. - -E: Cannot use non-periodic boundaries with PPPM - -For kspace style pppm, all 3 dimensions must have periodic boundaries -unless you use the kspace_modify command to define a 2d slab with a -non-periodic z dimension. - -E: Incorrect boundaries with slab PPPM - -Must have periodic x,y dimensions and non-periodic z dimension to use -2d slab option with PPPM. - -E: PPPM order cannot be < 2 or > than %d - -This is a limitation of the PPPM implementation in LAMMPS. - -E: KSpace style is incompatible with Pair style - -Setting a kspace style requires that a pair style with matching -long-range Coulombic or dispersion components be used. - -E: Pair style is incompatible with TIP4P KSpace style - -The pair style does not have the requires TIP4P settings. 
- -E: Bond and angle potentials must be defined for TIP4P - -Cannot use TIP4P pair potential unless bond and angle potentials -are defined. - -E: Bad TIP4P angle type for PPPM/TIP4P - -Specified angle type is not valid. - -E: Bad TIP4P bond type for PPPM/TIP4P - -Specified bond type is not valid. - -W: Reducing PPPM order b/c stencil extends beyond nearest neighbor processor - -This may lead to a larger grid than desired. See the kspace_modify overlap -command to prevent changing of the PPPM order. - -E: PPPM order < minimum allowed order - -The default minimum order is 2. This can be reset by the -kspace_modify minorder command. - -E: PPPM grid stencil extends beyond nearest neighbor processor - -This is not allowed if the kspace_modify overlap setting is no. - -E: KSpace accuracy must be > 0 - -The kspace accuracy designated in the input must be greater than zero. - -E: Must use kspace_modify gewald for uncharged system - -UNDOCUMENTED - -E: Could not compute grid size - -The code is unable to compute a grid size consistent with the desired -accuracy. This error should not occur for typical problems. Please -send an email to the developers. - -E: PPPM grid is too large - -The global PPPM grid is larger than OFFSET in one or more dimensions. -OFFSET is currently set to 4096. You likely need to decrease the -requested accuracy. - -E: Could not compute g_ewald - -The Newton-Raphson solver failed to converge to a good value for -g_ewald. This error should not occur for typical problems. Please -send an email to the developers. - -E: Non-numeric box dimensions - simulation unstable - -The box size has apparently blown up. - -E: Out of range atoms - cannot compute PPPM - -One or more atoms are attempting to map their charge to a PPPM grid -point that is not owned by a processor. This is likely for one of two -reasons, both of them bad. First, it may mean that an atom near the -boundary of a processor's sub-domain has moved more than 1/2 the -"neighbor skin distance"_neighbor.html without neighbor lists being -rebuilt and atoms being migrated to new processors. This also means -you may be missing pairwise interactions that need to be computed. -The solution is to change the re-neighboring criteria via the -"neigh_modify"_neigh_modify command. The safest settings are "delay 0 -every 1 check yes". Second, it may mean that an atom has moved far -outside a processor's sub-domain or even the entire simulation box. -This indicates bad physics, e.g. due to highly overlapping atoms, too -large a timestep, etc. - -E: Cannot (yet) use K-space slab correction with compute group/group for triclinic systems - -This option is not yet supported. - -E: Cannot (yet) use kspace_modify diff ad with compute group/group - -This option is not yet supported. - -U: Cannot (yet) use PPPM with triclinic box and TIP4P - -This feature is not yet supported. 
- -*/ From 3a1b88c57fcd02fa671c5d30652a67579b951bd0 Mon Sep 17 00:00:00 2001 From: Steve Plimpton Date: Wed, 12 Aug 2020 17:41:31 -0600 Subject: [PATCH 05/38] enable MSM to work with the new GridComm class --- src/KSPACE/gridcomm.cpp | 253 +++++++++++++++++--------------- src/KSPACE/gridcomm.h | 28 ++-- src/KSPACE/msm.cpp | 310 ++++++++++++++++++++++------------------ src/KSPACE/msm.h | 27 ++-- src/KSPACE/msm_cg.cpp | 52 ++++--- src/KSPACE/pppm.cpp | 2 +- 6 files changed, 368 insertions(+), 304 deletions(-) diff --git a/src/KSPACE/gridcomm.cpp b/src/KSPACE/gridcomm.cpp index b292fd53da..06d786e309 100644 --- a/src/KSPACE/gridcomm.cpp +++ b/src/KSPACE/gridcomm.cpp @@ -17,6 +17,7 @@ #include "kspace.h" #include "irregular.h" #include "memory.h" +#include "error.h" using namespace LAMMPS_NS; @@ -24,12 +25,17 @@ enum{REGULAR,TILED}; #define SWAPDELTA 8 -// NOTE: gridcomm needs to be world for TILED, will it work with MSM? -// NOTE: Tiled implementation here only works for RCB, not general tiled +/* ---------------------------------------------------------------------- + NOTES + tiled implementation only currently works for RCB, not general tiled + if o indices for ghosts are < 0 or hi indices are >= N, + then grid is treated as periodic in that dimension, + communication is done across the periodic boundaries +------------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- - gcomm = MPI communicator that shares this grid - does not have to be world, see MSM + constructor called by all classes except MSM + gcomm = world communicator gn xyz = size of global grid i xyz lohi = portion of global grid this proc owns, 0 <= index < N o xyz lohi = owned grid portion + ghost grid cells needed in all directions @@ -44,130 +50,79 @@ GridComm::GridComm(LAMMPS *lmp, MPI_Comm gcomm, int oxlo, int oxhi, int oylo, int oyhi, int ozlo, int ozhi) : Pointers(lmp) { - gridcomm = gcomm; - MPI_Comm_rank(gridcomm,&me); - MPI_Comm_size(gridcomm,&nprocs); - - nx = gnx; - ny = gny; - nz = gnz; - - inxlo = ixlo; - inxhi = ixhi; - inylo = iylo; - inyhi = iyhi; - inzlo = izlo; - inzhi = izhi; - - outxlo = oxlo; - outxhi = oxhi; - outylo = oylo; - outyhi = oyhi; - outzlo = ozlo; - outzhi = ozhi; - - // layout == REGULAR or TILED - // for REGULAR, proc xyz lohi = my 6 neighbor procs - - layout = REGULAR; if (comm->layout == Comm::LAYOUT_TILED) layout = TILED; - - outxlo_max = oxlo; - outxhi_max = oxhi; - outylo_max = oylo; - outyhi_max = oyhi; - outzlo_max = ozlo; - outzhi_max = ozhi; + else layout = REGULAR; if (layout == REGULAR) { int (*procneigh)[2] = comm->procneigh; - - procxlo = procneigh[0][0]; - procxhi = procneigh[0][1]; - procylo = procneigh[1][0]; - procyhi = procneigh[1][1]; - proczlo = procneigh[2][0]; - proczhi = procneigh[2][1]; + initialize(gcomm,gnx,gny,gnz, + ixlo,ixhi,iylo,iyhi,izlo,izhi, + oxlo,oxhi,oylo,oyhi,ozlo,ozhi, + oxlo,oxhi,oylo,oyhi,ozlo,ozhi, + procneigh[0][0],procneigh[0][1], + procneigh[1][0],procneigh[1][1], + procneigh[2][0],procneigh[2][1]); + } else { + initialize(gcomm,gnx,gny,gnz, + ixlo,ixhi,iylo,iyhi,izlo,izhi, + oxlo,oxhi,oylo,oyhi,ozlo,ozhi, + oxlo,oxhi,oylo,oyhi,ozlo,ozhi, + 0,0,0,0,0,0); } - - nswap = maxswap = 0; - swap = NULL; - - nsend = nrecv = ncopy = 0; - send = NULL; - recv = NULL; - copy = NULL; - requests = NULL; } /* ---------------------------------------------------------------------- - same as first constructor except o xyz lohi max are added arguments - this is for case when caller
stores grid in a larger array than o xyz lohi - only affects indices() method which generates indices into the caller's array + constructor called by MSM + gcomm = world communicator or sub-communicator for a hierarchical grid + flag = 1 if e xyz lohi values = larger grid stored by caller in gcomm = world + flag = 2 if e xyz lohi values = 6 neighbor procs in gcomm + gn xyz = size of global grid + i xyz lohi = portion of global grid this proc owns, 0 <= index < N + o xyz lohi = owned grid portion + ghost grid cells needed in all directions + e xyz lohi for flag = 1: extent of larger grid stored by caller + e xyz lohi for flag = 2: 6 neighbor procs ------------------------------------------------------------------------- */ -GridComm::GridComm(LAMMPS *lmp, MPI_Comm gcomm, +GridComm::GridComm(LAMMPS *lmp, MPI_Comm gcomm, int flag, int gnx, int gny, int gnz, int ixlo, int ixhi, int iylo, int iyhi, int izlo, int izhi, int oxlo, int oxhi, int oylo, int oyhi, int ozlo, int ozhi, - int oxlo_max, int oxhi_max, int oylo_max, int oyhi_max, - int ozlo_max, int ozhi_max) + int exlo, int exhi, int eylo, int eyhi, int ezlo, int ezhi) : Pointers(lmp) { - gridcomm = gcomm; - MPI_Comm_rank(gridcomm,&me); - MPI_Comm_size(gridcomm,&nprocs); - - nx = gnx; - ny = gny; - nz = gnz; - - inxlo = ixlo; - inxhi = ixhi; - inylo = iylo; - inyhi = iyhi; - inzlo = izlo; - inzhi = izhi; - - outxlo = oxlo; - outxhi = oxhi; - outylo = oylo; - outyhi = oyhi; - outzlo = ozlo; - outzhi = ozhi; - - outxlo_max = oxlo_max; - outxhi_max = oxhi_max; - outylo_max = oylo_max; - outyhi_max = oyhi_max; - outzlo_max = ozlo_max; - outzhi_max = ozhi_max; - - // layout == REGULAR or TILED - // for REGULAR, proc xyz lohi = my 6 neighbor procs - - layout = REGULAR; if (comm->layout == Comm::LAYOUT_TILED) layout = TILED; + else layout = REGULAR; - if (layout == REGULAR) { - int (*procneigh)[2] = comm->procneigh; - - procxlo = procneigh[0][0]; - procxhi = procneigh[0][1]; - procylo = procneigh[1][0]; - procyhi = procneigh[1][1]; - proczlo = procneigh[2][0]; - proczhi = procneigh[2][1]; + if (flag == 1) { + if (layout == REGULAR) { + // this assumes gcomm = world + int (*procneigh)[2] = comm->procneigh; + initialize(gcomm,gnx,gny,gnz, + ixlo,ixhi,iylo,iyhi,izlo,izhi, + oxlo,oxhi,oylo,oyhi,ozlo,ozhi, + exlo,exhi,eylo,eyhi,ezlo,ezhi, + procneigh[0][0],procneigh[0][1], + procneigh[1][0],procneigh[1][1], + procneigh[2][0],procneigh[2][1]); + } else { + initialize(gcomm,gnx,gny,gnz, + ixlo,ixhi,iylo,iyhi,izlo,izhi, + oxlo,oxhi,oylo,oyhi,ozlo,ozhi, + exlo,exhi,eylo,eyhi,ezlo,ezhi, + 0,0,0,0,0,0); + } + + } else if (flag == 2) { + if (layout == REGULAR) { + initialize(gcomm,gnx,gny,gnz, + ixlo,ixhi,iylo,iyhi,izlo,izhi, + oxlo,oxhi,oylo,oyhi,ozlo,ozhi, + oxlo,oxhi,oylo,oyhi,ozlo,ozhi, + exlo,exhi,eylo,eyhi,ezlo,ezhi); + } else { + error->all(FLERR,"GridComm does not support tiled layout with neighbor procs"); + } } - - nswap = maxswap = 0; - swap = NULL; - - nsend = nrecv = ncopy = 0; - send = NULL; - recv = NULL; - copy = NULL; - requests = NULL; } /* ---------------------------------------------------------------------- */ @@ -201,6 +156,69 @@ GridComm::~GridComm() delete [] requests; } +/* ---------------------------------------------------------------------- + store constructor args in local variables +------------------------------------------------------------------------- */ + +void GridComm::initialize(MPI_Comm gcomm, + int gnx, int gny, int gnz, + int ixlo, int ixhi, int iylo, int iyhi, int izlo, int izhi, + int oxlo, int oxhi, int oylo, int 
oyhi, int ozlo, int ozhi, + int fxlo, int fxhi, int fylo, int fyhi, int fzlo, int fzhi, + int pxlo, int pxhi, int pylo, int pyhi, int pzlo, int pzhi) +{ + gridcomm = gcomm; + MPI_Comm_rank(gridcomm,&me); + MPI_Comm_size(gridcomm,&nprocs); + + nx = gnx; + ny = gny; + nz = gnz; + + inxlo = ixlo; + inxhi = ixhi; + inylo = iylo; + inyhi = iyhi; + inzlo = izlo; + inzhi = izhi; + + outxlo = oxlo; + outxhi = oxhi; + outylo = oylo; + outyhi = oyhi; + outzlo = ozlo; + outzhi = ozhi; + + fullxlo = fxlo; + fullxhi = fxhi; + fullylo = fylo; + fullyhi = fyhi; + fullzlo = fzlo; + fullzhi = fzhi; + + // for REGULAR layout, proc xyz lohi = my 6 neighbor procs in this MPI_Comm + + if (layout == REGULAR) { + procxlo = pxlo; + procxhi = pxhi; + procylo = pylo; + procyhi = pyhi; + proczlo = pzlo; + proczhi = pzhi; + } + + // internal data initializations + + nswap = maxswap = 0; + swap = NULL; + + nsend = nrecv = ncopy = 0; + send = NULL; + recv = NULL; + copy = NULL; + requests = NULL; +} + /* ---------------------------------------------------------------------- */ void GridComm::setup(int &nbuf1, int &nbuf2) @@ -504,6 +522,7 @@ void GridComm::setup_regular(int &nbuf1, int &nbuf2) } /* ---------------------------------------------------------------------- + NOTE: need to doc this header ------------------------------------------------------------------------- */ void GridComm::setup_tiled(int &nbuf1, int &nbuf2) @@ -725,6 +744,8 @@ void GridComm::setup_tiled(int &nbuf1, int &nbuf2) } /* ---------------------------------------------------------------------- + NOTE: need to doc this header + recursive ... ------------------------------------------------------------------------- */ void GridComm::ghost_box_drop(int *box, int *pbc) @@ -803,10 +824,12 @@ void GridComm::ghost_box_drop(int *box, int *pbc) } /* ---------------------------------------------------------------------- + NOTE: need to doc this header + recursive ... 
------------------------------------------------------------------------- */ void GridComm::box_drop_grid(int *box, int proclower, int procupper, - int &np, int *plist) + int &np, int *plist) { // end recursion when partition is a single proc // add proclower to plist @@ -880,7 +903,7 @@ int GridComm::ghost_adjacent_tiled() ------------------------------------------------------------------------- */ void GridComm::forward_comm_kspace(KSpace *kspace, int nper, int nbyte, int which, - void *buf1, void *buf2, MPI_Datatype datatype) + void *buf1, void *buf2, MPI_Datatype datatype) { if (layout == REGULAR) forward_comm_kspace_regular(kspace,nper,nbyte,which,buf1,buf2,datatype); @@ -1083,15 +1106,15 @@ int GridComm::indices(int *&list, memory->create(list,nmax,"CommGrid:indices"); if (nmax == 0) return 0; - int nx = (outxhi_max-outxlo_max+1); - int ny = (outyhi_max-outylo_max+1); + int nx = (fullxhi-fullxlo+1); + int ny = (fullyhi-fullylo+1); int n = 0; int ix,iy,iz; for (iz = zlo; iz <= zhi; iz++) for (iy = ylo; iy <= yhi; iy++) for (ix = xlo; ix <= xhi; ix++) - list[n++] = (iz-outzlo_max)*ny*nx + (iy-outylo_max)*nx + (ix-outxlo_max); + list[n++] = (iz-fullzlo)*ny*nx + (iy-fullylo)*nx + (ix-fullxlo); return nmax; } diff --git a/src/KSPACE/gridcomm.h b/src/KSPACE/gridcomm.h index 1cdfe28da2..8b2539b977 100644 --- a/src/KSPACE/gridcomm.h +++ b/src/KSPACE/gridcomm.h @@ -23,7 +23,7 @@ class GridComm : protected Pointers { GridComm(class LAMMPS *, MPI_Comm, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int); - GridComm(class LAMMPS *, MPI_Comm, int, int, int, + GridComm(class LAMMPS *, MPI_Comm, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int); @@ -38,7 +38,8 @@ class GridComm : protected Pointers { private: int me,nprocs; int layout; // REGULAR or TILED - MPI_Comm gridcomm; + MPI_Comm gridcomm; // communicator for this class + // usually world, but MSM calls with subset // inputs from caller via constructor @@ -48,21 +49,21 @@ class GridComm : protected Pointers { int inzlo,inzhi; int outxlo,outxhi; // inclusive extent of my grid chunk plus int outylo,outyhi; // ghost cells in all 6 directions - int outzlo,outzhi; // lo indices can be < 0, hi indices can be >= N - int outxlo_max,outxhi_max; // ?? 
- int outylo_max,outyhi_max; - int outzlo_max,outzhi_max; + int outzlo,outzhi; // lo indices can be < 0, hi indices can be >= N + int fullxlo,fullxhi; // extent of grid chunk that caller stores + int fullylo,fullyhi; // can be same as out indices or larger + int fullzlo,fullzhi; // ------------------------------------------- // internal variables for REGULAR layout // ------------------------------------------- int procxlo,procxhi; // 6 neighbor procs that adjoin me - int procylo,procyhi; // not used for comm_style = tiled + int procylo,procyhi; // not used for comm_style = tiled int proczlo,proczhi; int ghostxlo,ghostxhi; // # of my owned grid planes needed - int ghostylo,ghostyhi; // by neighobr procs in each dir as their ghost planes + int ghostylo,ghostyhi; // by neighobr procs in each dir as their ghost planes int ghostzlo,ghostzhi; // swap = exchange of owned and ghost grid cells between 2 procs, including self @@ -83,8 +84,8 @@ class GridComm : protected Pointers { // internal variables for TILED layout // ------------------------------------------- - int *overlap_procs; - MPI_Request *requests; + int *overlap_procs; // length of Nprocs in communicator + MPI_Request *requests; // length of max messages this proc receives // RCB tree of cut info // each proc contributes one value, except proc 0 @@ -174,7 +175,12 @@ class GridComm : protected Pointers { // ------------------------------------------- // internal methods // ------------------------------------------- - + + void initialize(MPI_Comm, int, int, int, + int, int, int, int, int, int, + int, int, int, int, int, int, + int, int, int, int, int, int, + int, int, int, int, int, int); void setup_regular(int &, int &); void setup_tiled(int &, int &); void ghost_box_drop(int *, int *); diff --git a/src/KSPACE/msm.cpp b/src/KSPACE/msm.cpp index e522ccb7ad..93ecce6904 100644 --- a/src/KSPACE/msm.cpp +++ b/src/KSPACE/msm.cpp @@ -42,6 +42,7 @@ using namespace MathConst; enum{REVERSE_RHO,REVERSE_AD,REVERSE_AD_PERATOM}; enum{FORWARD_RHO,FORWARD_AD,FORWARD_AD_PERATOM}; + /* ---------------------------------------------------------------------- */ MSM::MSM(LAMMPS *lmp) : KSpace(lmp), @@ -49,14 +50,15 @@ MSM::MSM(LAMMPS *lmp) : KSpace(lmp), ny_msm(NULL), nz_msm(NULL), nxlo_in(NULL), nylo_in(NULL), nzlo_in(NULL), nxhi_in(NULL), nyhi_in(NULL), nzhi_in(NULL), nxlo_out(NULL), nylo_out(NULL), nzlo_out(NULL), nxhi_out(NULL), nyhi_out(NULL), nzhi_out(NULL), ngrid(NULL), - active_flag(NULL), alpha(NULL), betax(NULL), betay(NULL), betaz(NULL), peratom_allocate_flag(0), + active_flag(NULL), alpha(NULL), betax(NULL), betay(NULL), betaz(NULL), + peratom_allocate_flag(0), levels(0), world_levels(NULL), qgrid(NULL), egrid(NULL), v0grid(NULL), v1grid(NULL), v2grid(NULL), v3grid(NULL), v4grid(NULL), v5grid(NULL), g_direct(NULL), v0_direct(NULL), v1_direct(NULL), v2_direct(NULL), v3_direct(NULL), v4_direct(NULL), v5_direct(NULL), g_direct_top(NULL), v0_direct_top(NULL), v1_direct_top(NULL), v2_direct_top(NULL), v3_direct_top(NULL), v4_direct_top(NULL), v5_direct_top(NULL), - phi1d(NULL), dphi1d(NULL), procneigh_levels(NULL), cg(NULL), cg_peratom(NULL), - cg_all(NULL), cg_peratom_all(NULL), part2grid(NULL), boxlo(NULL) + phi1d(NULL), dphi1d(NULL), procneigh_levels(NULL), gc(NULL), + gcall(NULL), part2grid(NULL), boxlo(NULL) { msmflag = 1; @@ -117,6 +119,7 @@ MSM::~MSM() delete [] factors; deallocate(); if (peratom_allocate_flag) deallocate_peratom(); + deallocate_levels(); memory->destroy(part2grid); memory->destroy(g_direct); memory->destroy(g_direct_top); @@ 
-132,7 +135,6 @@ MSM::~MSM() memory->destroy(v3_direct_top); memory->destroy(v4_direct_top); memory->destroy(v5_direct_top); - deallocate_levels(); } /* ---------------------------------------------------------------------- @@ -397,17 +399,6 @@ void MSM::setup() // don't invoke allocate_peratom(), compute() will allocate when needed allocate(); - - // setup commgrid - - cg_all->ghost_notify(); - cg_all->setup(); - for (int n=0; nghost_notify(); - cg[n]->setup(); - } - } /* ---------------------------------------------------------------------- @@ -448,16 +439,7 @@ void MSM::compute(int eflag, int vflag) // invoke allocate_peratom() if needed for first time - if (vflag_atom && !peratom_allocate_flag) { - allocate_peratom(); - cg_peratom_all->ghost_notify(); - cg_peratom_all->setup(); - for (int n=0; nghost_notify(); - cg_peratom[n]->setup(); - } - } + if (vflag_atom && !peratom_allocate_flag) allocate_peratom(); // convert atoms from box to lamda coords @@ -483,7 +465,8 @@ void MSM::compute(int eflag, int vflag) // to fully sum contribution in their 3d grid current_level = 0; - cg_all->reverse_comm(this,REVERSE_RHO); + gcall->reverse_comm_kspace(this,1,sizeof(double),REVERSE_RHO, + gcall_buf1,gcall_buf2,MPI_DOUBLE); // forward communicate charge density values to fill ghost grid points // compute direct sum interaction and then restrict to coarser grid @@ -491,8 +474,8 @@ void MSM::compute(int eflag, int vflag) for (int n=0; n<=levels-2; n++) { if (!active_flag[n]) continue; current_level = n; - cg[n]->forward_comm(this,FORWARD_RHO); - + gc[n]->forward_comm_kspace(this,1,sizeof(double),FORWARD_RHO, + gc_buf1[n],gc_buf2[n],MPI_DOUBLE); direct(n); restriction(n); } @@ -503,11 +486,18 @@ void MSM::compute(int eflag, int vflag) if (active_flag[levels-1]) { if (domain->nonperiodic) { current_level = levels-1; - cg[levels-1]->forward_comm(this,FORWARD_RHO); + gc[levels-1]-> + forward_comm_kspace(this,1,sizeof(double),FORWARD_RHO, + gc_buf1[levels-1],gc_buf2[levels-1],MPI_DOUBLE); direct_top(levels-1); - cg[levels-1]->reverse_comm(this,REVERSE_AD); + gc[levels-1]-> + reverse_comm_kspace(this,1,sizeof(double),REVERSE_AD, + gc_buf1[levels-1],gc_buf2[levels-1],MPI_DOUBLE); if (vflag_atom) - cg_peratom[levels-1]->reverse_comm(this,REVERSE_AD_PERATOM); + gc[levels-1]-> + reverse_comm_kspace(this,6,sizeof(double),REVERSE_AD_PERATOM, + gc_buf1[levels-1],gc_buf2[levels-1],MPI_DOUBLE); + } else { // Here using MPI_Allreduce is cheaper than using commgrid grid_swap_forward(levels-1,qgrid[levels-1]); @@ -515,7 +505,9 @@ void MSM::compute(int eflag, int vflag) grid_swap_reverse(levels-1,egrid[levels-1]); current_level = levels-1; if (vflag_atom) - cg_peratom[levels-1]->reverse_comm(this,REVERSE_AD_PERATOM); + gc[levels-1]-> + reverse_comm_kspace(this,6,sizeof(double),REVERSE_AD_PERATOM, + gc_buf1[levels-1],gc_buf2[levels-1],MPI_DOUBLE); } } @@ -527,24 +519,28 @@ void MSM::compute(int eflag, int vflag) prolongation(n); current_level = n; - cg[n]->reverse_comm(this,REVERSE_AD); + gc[n]->reverse_comm_kspace(this,1,sizeof(double),REVERSE_AD, + gc_buf1[n],gc_buf2[n],MPI_DOUBLE); // extra per-atom virial communication if (vflag_atom) - cg_peratom[n]->reverse_comm(this,REVERSE_AD_PERATOM); + gc[n]->reverse_comm_kspace(this,6,sizeof(double),REVERSE_AD_PERATOM, + gc_buf1[n],gc_buf2[n],MPI_DOUBLE); } // all procs communicate E-field values // to fill ghost cells surrounding their 3d bricks current_level = 0; - cg_all->forward_comm(this,FORWARD_AD); + gcall->forward_comm_kspace(this,1,sizeof(double),FORWARD_AD, + 
gcall_buf1,gcall_buf2,MPI_DOUBLE); // extra per-atom energy/virial communication if (vflag_atom) - cg_peratom_all->forward_comm(this,FORWARD_AD_PERATOM); + gcall->forward_comm_kspace(this,6,sizeof(double),FORWARD_AD_PERATOM, + gcall_buf1,gcall_buf2,MPI_DOUBLE); // calculate the force on my particles (interpolation) @@ -603,8 +599,7 @@ void MSM::compute(int eflag, int vflag) // convert atoms back from lamda to box coords - if (triclinic) - domain->lamda2x(atom->nlocal); + if (triclinic) domain->lamda2x(atom->nlocal); } /* ---------------------------------------------------------------------- @@ -621,15 +616,18 @@ void MSM::allocate() // commgrid using all processors for finest grid level - int (*procneigh_all)[2] = comm->procneigh; + gcall = new GridComm(lmp,world,1,nx_msm[0],ny_msm[0],nz_msm[0], + nxlo_in[0],nxhi_in[0],nylo_in[0], + nyhi_in[0],nzlo_in[0],nzhi_in[0], + nxlo_out_all,nxhi_out_all,nylo_out_all, + nyhi_out_all,nzlo_out_all,nzhi_out_all, + nxlo_out[0],nxhi_out[0],nylo_out[0], + nyhi_out[0],nzlo_out[0],nzhi_out[0]); - - cg_all = new GridComm(lmp,world,1,1, - nxlo_in[0],nxhi_in[0],nylo_in[0],nyhi_in[0],nzlo_in[0],nzhi_in[0], - nxlo_out_all,nxhi_out_all,nylo_out_all,nyhi_out_all,nzlo_out_all,nzhi_out_all, - nxlo_out[0],nxhi_out[0],nylo_out[0],nyhi_out[0],nzlo_out[0],nzhi_out[0], - procneigh_all[0][0],procneigh_all[0][1],procneigh_all[1][0], - procneigh_all[1][1],procneigh_all[2][0],procneigh_all[2][1]); + gcall->setup(ngcall_buf1,ngcall_buf2); + npergrid = 1; + memory->create(gcall_buf1,npergrid*ngcall_buf1,"msm:gcall_buf1"); + memory->create(gcall_buf2,npergrid*ngcall_buf2,"msm:gcall_buf2"); // allocate memory for each grid level @@ -644,12 +642,23 @@ void MSM::allocate() if (active_flag[n]) { int **procneigh = procneigh_levels[n]; - cg[n] = new GridComm(lmp,world_levels[n],1,1, - nxlo_in[n],nxhi_in[n],nylo_in[n],nyhi_in[n],nzlo_in[n],nzhi_in[n], - nxlo_out[n],nxhi_out[n],nylo_out[n],nyhi_out[n],nzlo_out[n],nzhi_out[n], - procneigh[0][0],procneigh[0][1],procneigh[1][0], - procneigh[1][1],procneigh[2][0],procneigh[2][1]); - } else cg[n] = nullptr; + gc[n] = new GridComm(lmp,world_levels[n],2,nx_msm[n],ny_msm[n],nz_msm[n], + nxlo_in[n],nxhi_in[n],nylo_in[n],nyhi_in[n], + nzlo_in[n],nzhi_in[n], + nxlo_out[n],nxhi_out[n],nylo_out[n],nyhi_out[n], + nzlo_out[n],nzhi_out[n], + procneigh[0][0],procneigh[0][1],procneigh[1][0], + procneigh[1][1],procneigh[2][0],procneigh[2][1]); + + gc[n]->setup(ngc_buf1[n],ngc_buf2[n]); + npergrid = 1; + memory->create(gc_buf1[n],npergrid*ngc_buf1[n],"msm:gc_buf1"); + memory->create(gc_buf2[n],npergrid*ngc_buf2[n],"msm:gc_buf2"); + + } else { + gc[n] = nullptr; + gc_buf1[n] = gc_buf2[n] = nullptr; + } } } @@ -662,9 +671,12 @@ void MSM::deallocate() memory->destroy2d_offset(phi1d,-order_allocated); memory->destroy2d_offset(dphi1d,-order_allocated); - if (cg_all) delete cg_all; - cg_all = nullptr; - + if (gcall) delete gcall; + memory->destroy(gcall_buf1); + memory->destroy(gcall_buf2); + gcall = nullptr; + gcall_buf1 = gcall_buf2 = nullptr; + for (int n=0; ndestroy3d_offset(qgrid[n],nzlo_out[n],nylo_out[n],nxlo_out[n]); @@ -676,10 +688,13 @@ void MSM::deallocate() if (world_levels[n] != MPI_COMM_NULL) MPI_Comm_free(&world_levels[n]); - if (cg) { - if (cg[n]) { - delete cg[n]; - cg[n] = nullptr; + if (gc) { + if (gc[n]) { + delete gc[n]; + memory->destroy(gc_buf1[n]); + memory->destroy(gc_buf2[n]); + gc[n] = nullptr; + gc_buf1[n] = gc_buf2[n] = nullptr; } } } @@ -695,15 +710,11 @@ void MSM::allocate_peratom() // create commgrid object for per-atom virial 
using all processors - int (*procneigh_all)[2] = comm->procneigh; - - cg_peratom_all = - new GridComm(lmp,world,6,6, - nxlo_in[0],nxhi_in[0],nylo_in[0],nyhi_in[0],nzlo_in[0],nzhi_in[0], - nxlo_out_all,nxhi_out_all,nylo_out_all,nyhi_out_all,nzlo_out_all,nzhi_out_all, - nxlo_out[0],nxhi_out[0],nylo_out[0],nyhi_out[0],nzlo_out[0],nzhi_out[0], - procneigh_all[0][0],procneigh_all[0][1],procneigh_all[1][0], - procneigh_all[1][1],procneigh_all[2][0],procneigh_all[2][1]); + npergrid = 6; + memory->destroy(gcall_buf1); + memory->destroy(gcall_buf2); + memory->create(gcall_buf1,npergrid*ngcall_buf1,"pppm:gcall_buf1"); + memory->create(gcall_buf2,npergrid*ngcall_buf2,"pppm:gcall_buf2"); // allocate memory for each grid level @@ -724,13 +735,11 @@ void MSM::allocate_peratom() // create commgrid object for per-atom virial if (active_flag[n]) { - int **procneigh = procneigh_levels[n]; - cg_peratom[n] = - new GridComm(lmp,world_levels[n],6,6, - nxlo_in[n],nxhi_in[n],nylo_in[n],nyhi_in[n],nzlo_in[n],nzhi_in[n], - nxlo_out[n],nxhi_out[n],nylo_out[n],nyhi_out[n],nzlo_out[n],nzhi_out[n], - procneigh[0][0],procneigh[0][1],procneigh[1][0], - procneigh[1][1],procneigh[2][0],procneigh[2][1]); + npergrid = 6; + memory->destroy(gc_buf1[n]); + memory->destroy(gc_buf2[n]); + memory->create(gc_buf1[n],npergrid*ngc_buf1[n],"pppm:gc_buf1"); + memory->create(gc_buf2[n],npergrid*ngc_buf2[n],"pppm:gc_buf2"); } } } @@ -743,8 +752,6 @@ void MSM::deallocate_peratom() { peratom_allocate_flag = 0; - if (cg_peratom_all) delete cg_peratom_all; - for (int n=0; ndestroy3d_offset(v0grid[n],nzlo_out[n],nylo_out[n],nxlo_out[n]); @@ -758,9 +765,6 @@ void MSM::deallocate_peratom() memory->destroy3d_offset(v4grid[n],nzlo_out[n],nylo_out[n],nxlo_out[n]); if (v5grid[n]) memory->destroy3d_offset(v5grid[n],nzlo_out[n],nylo_out[n],nxlo_out[n]); - - if (cg_peratom) - if (cg_peratom[n]) delete cg_peratom[n]; } } @@ -771,10 +775,13 @@ void MSM::deallocate_peratom() void MSM::allocate_levels() { ngrid = new int[levels]; - - cg = new GridComm*[levels]; - cg_peratom = new GridComm*[levels]; - + + gc = new GridComm*[levels]; + gc_buf1 = new double*[levels]; + gc_buf2 = new double*[levels]; + ngc_buf1 = new int[levels]; + ngc_buf2 = new int[levels]; + memory->create(procneigh_levels,levels,3,2,"msm:procneigh_levels"); world_levels = new MPI_Comm[levels]; active_flag = new int[levels]; @@ -819,9 +826,8 @@ void MSM::allocate_levels() v5grid = new double***[levels]; for (int n=0; ndestroy(procneigh_levels); delete [] world_levels; delete [] active_flag; - delete [] cg; - delete [] cg_peratom; + + delete [] gc; + delete [] gc_buf1; + delete [] gc_buf2; + delete [] ngc_buf1; + delete [] ngc_buf2; delete [] alpha; delete [] betax; @@ -893,8 +901,8 @@ void MSM::deallocate_levels() world_levels = nullptr; active_flag = nullptr; - cg = nullptr; - cg_peratom = nullptr; + gc = nullptr; + gc_buf1 = gc_buf2 = nullptr; alpha = nullptr; betax = nullptr; @@ -1377,7 +1385,7 @@ void MSM::set_proc_grid(int n) // define a new MPI communicator for this grid level that only includes active procs - if(world_levels[n] != MPI_COMM_NULL) MPI_Comm_free(&world_levels[n]); + if (world_levels[n] != MPI_COMM_NULL) MPI_Comm_free(&world_levels[n]); MPI_Comm_split(world,color,me,&world_levels[n]); if (!active_flag[n]) return; @@ -2434,6 +2442,7 @@ void MSM::prolongation(int n) be cheaper than using nearest-neighbor communication (commgrid), right now only works for periodic boundary conditions ------------------------------------------------------------------------- */ + void 
MSM::grid_swap_forward(int n, double*** &gridn) { double ***gridn_tmp; @@ -2533,32 +2542,31 @@ void MSM::grid_swap_reverse(int n, double*** &gridn) pack own values to buf to send to another proc (used by commgrid) ------------------------------------------------------------------------- */ -void MSM::pack_forward(int flag, double *buf, int nlist, int *list) +void MSM::pack_forward_grid(int flag, void *vbuf, int nlist, int *list) { + double *buf = (double *) vbuf; + int n = current_level; - - double ***qgridn = qgrid[n]; - double ***egridn = egrid[n]; - - double ***v0gridn = v0grid[n]; - double ***v1gridn = v1grid[n]; - double ***v2gridn = v2grid[n]; - double ***v3gridn = v3grid[n]; - double ***v4gridn = v4grid[n]; - double ***v5gridn = v5grid[n]; - int k = 0; - + if (flag == FORWARD_RHO) { + double ***qgridn = qgrid[n]; double *qsrc = &qgridn[nzlo_out[n]][nylo_out[n]][nxlo_out[n]]; for (int i = 0; i < nlist; i++) { buf[k++] = qsrc[list[i]]; } } else if (flag == FORWARD_AD) { + double ***egridn = egrid[n]; double *src = &egridn[nzlo_out[n]][nylo_out[n]][nxlo_out[n]]; for (int i = 0; i < nlist; i++) buf[i] = src[list[i]]; } else if (flag == FORWARD_AD_PERATOM) { + double ***v0gridn = v0grid[n]; + double ***v1gridn = v1grid[n]; + double ***v2gridn = v2grid[n]; + double ***v3gridn = v3grid[n]; + double ***v4gridn = v4grid[n]; + double ***v5gridn = v5grid[n]; double *v0src = &v0gridn[nzlo_out[n]][nylo_out[n]][nxlo_out[n]]; double *v1src = &v1gridn[nzlo_out[n]][nylo_out[n]][nxlo_out[n]]; double *v2src = &v2gridn[nzlo_out[n]][nylo_out[n]][nxlo_out[n]]; @@ -2580,32 +2588,31 @@ void MSM::pack_forward(int flag, double *buf, int nlist, int *list) unpack another proc's own values from buf and set own ghost values ------------------------------------------------------------------------- */ -void MSM::unpack_forward(int flag, double *buf, int nlist, int *list) +void MSM::unpack_forward_grid(int flag, void *vbuf, int nlist, int *list) { + double *buf = (double *) vbuf; + int n = current_level; - - double ***qgridn = qgrid[n]; - double ***egridn = egrid[n]; - - double ***v0gridn = v0grid[n]; - double ***v1gridn = v1grid[n]; - double ***v2gridn = v2grid[n]; - double ***v3gridn = v3grid[n]; - double ***v4gridn = v4grid[n]; - double ***v5gridn = v5grid[n]; - int k = 0; if (flag == FORWARD_RHO) { + double ***qgridn = qgrid[n]; double *dest = &qgridn[nzlo_out[n]][nylo_out[n]][nxlo_out[n]]; for (int i = 0; i < nlist; i++) { dest[list[i]] = buf[k++]; } } else if (flag == FORWARD_AD) { + double ***egridn = egrid[n]; double *dest = &egridn[nzlo_out[n]][nylo_out[n]][nxlo_out[n]]; for (int i = 0; i < nlist; i++) dest[list[i]] = buf[k++]; } else if (flag == FORWARD_AD_PERATOM) { + double ***v0gridn = v0grid[n]; + double ***v1gridn = v1grid[n]; + double ***v2gridn = v2grid[n]; + double ***v3gridn = v3grid[n]; + double ***v4gridn = v4grid[n]; + double ***v5gridn = v5grid[n]; double *v0src = &v0gridn[nzlo_out[n]][nylo_out[n]][nxlo_out[n]]; double *v1src = &v1gridn[nzlo_out[n]][nylo_out[n]][nxlo_out[n]]; double *v2src = &v2gridn[nzlo_out[n]][nylo_out[n]][nxlo_out[n]]; @@ -2627,32 +2634,31 @@ void MSM::unpack_forward(int flag, double *buf, int nlist, int *list) pack ghost values into buf to send to another proc ------------------------------------------------------------------------- */ -void MSM::pack_reverse(int flag, double *buf, int nlist, int *list) +void MSM::pack_reverse_grid(int flag, void *vbuf, int nlist, int *list) { + double *buf = (double *) vbuf; + int n = current_level; - - double ***qgridn = qgrid[n]; - 
double ***egridn = egrid[n]; - - double ***v0gridn = v0grid[n]; - double ***v1gridn = v1grid[n]; - double ***v2gridn = v2grid[n]; - double ***v3gridn = v3grid[n]; - double ***v4gridn = v4grid[n]; - double ***v5gridn = v5grid[n]; - int k = 0; if (flag == REVERSE_RHO) { + double ***qgridn = qgrid[n]; double *qsrc = &qgridn[nzlo_out[n]][nylo_out[n]][nxlo_out[n]]; for (int i = 0; i < nlist; i++) { buf[k++] = qsrc[list[i]]; } } else if (flag == REVERSE_AD) { + double ***egridn = egrid[n]; double *src = &egridn[nzlo_out[n]][nylo_out[n]][nxlo_out[n]]; for (int i = 0; i < nlist; i++) buf[i] = src[list[i]]; } else if (flag == REVERSE_AD_PERATOM) { + double ***v0gridn = v0grid[n]; + double ***v1gridn = v1grid[n]; + double ***v2gridn = v2grid[n]; + double ***v3gridn = v3grid[n]; + double ***v4gridn = v4grid[n]; + double ***v5gridn = v5grid[n]; double *v0src = &v0gridn[nzlo_out[n]][nylo_out[n]][nxlo_out[n]]; double *v1src = &v1gridn[nzlo_out[n]][nylo_out[n]][nxlo_out[n]]; double *v2src = &v2gridn[nzlo_out[n]][nylo_out[n]][nxlo_out[n]]; @@ -2674,32 +2680,31 @@ void MSM::pack_reverse(int flag, double *buf, int nlist, int *list) unpack another proc's ghost values from buf and add to own values ------------------------------------------------------------------------- */ -void MSM::unpack_reverse(int flag, double *buf, int nlist, int *list) +void MSM::unpack_reverse_grid(int flag, void *vbuf, int nlist, int *list) { + double *buf = (double *) vbuf; + int n = current_level; - - double ***qgridn = qgrid[n]; - double ***egridn = egrid[n]; - - double ***v0gridn = v0grid[n]; - double ***v1gridn = v1grid[n]; - double ***v2gridn = v2grid[n]; - double ***v3gridn = v3grid[n]; - double ***v4gridn = v4grid[n]; - double ***v5gridn = v5grid[n]; - int k = 0; if (flag == REVERSE_RHO) { + double ***qgridn = qgrid[n]; double *dest = &qgridn[nzlo_out[n]][nylo_out[n]][nxlo_out[n]]; for (int i = 0; i < nlist; i++) { dest[list[i]] += buf[k++]; } } else if (flag == REVERSE_AD) { + double ***egridn = egrid[n]; double *dest = &egridn[nzlo_out[n]][nylo_out[n]][nxlo_out[n]]; for (int i = 0; i < nlist; i++) dest[list[i]] += buf[k++]; } else if (flag == REVERSE_AD_PERATOM) { + double ***v0gridn = v0grid[n]; + double ***v1gridn = v1grid[n]; + double ***v2gridn = v2grid[n]; + double ***v3gridn = v3grid[n]; + double ***v4gridn = v4grid[n]; + double ***v5gridn = v5grid[n]; double *v0src = &v0gridn[nzlo_out[n]][nylo_out[n]][nxlo_out[n]]; double *v1src = &v1gridn[nzlo_out[n]][nylo_out[n]][nxlo_out[n]]; double *v2src = &v2gridn[nzlo_out[n]][nylo_out[n]][nxlo_out[n]]; @@ -3416,3 +3421,24 @@ void MSM::get_virial_direct_top(int n) } } } + +/* ---------------------------------------------------------------------- + memory usage of local arrays +------------------------------------------------------------------------- */ + +double MSM::memory_usage() +{ + double bytes = 0; + + // NOTE: Stan, fill in other memory allocations here + + // all GridComm bufs + + bytes += (ngcall_buf1 + ngcall_buf2) * npergrid * sizeof(double); + + for (int n=0; n grid mapping int nmax; + int triclinic; double *boxlo; void set_grid_global(); @@ -126,15 +132,12 @@ class MSM : public KSpace { void get_g_direct_top(int); void get_virial_direct_top(int); - // triclinic - - int triclinic; - // grid communication - void pack_forward(int, double *, int, int *); - void unpack_forward(int, double *, int, int *); - void pack_reverse(int, double *, int, int *); - void unpack_reverse(int, double *, int, int *); + + void pack_forward_grid(int, void *, int, int *); + void 
unpack_forward_grid(int, void *, int, int *); + void pack_reverse_grid(int, void *, int, int *); + void unpack_reverse_grid(int, void *, int, int *); }; } diff --git a/src/KSPACE/msm_cg.cpp b/src/KSPACE/msm_cg.cpp index 43bb106051..8236f93c9c 100644 --- a/src/KSPACE/msm_cg.cpp +++ b/src/KSPACE/msm_cg.cpp @@ -91,17 +91,7 @@ void MSMCG::compute(int eflag, int vflag) // invoke allocate_peratom() if needed for first time - if (vflag_atom && !peratom_allocate_flag) { - allocate_peratom(); - cg_peratom_all->ghost_notify(); - cg_peratom_all->setup(); - for (int n=0; nghost_notify(); - cg_peratom[n]->setup(); - } - peratom_allocate_flag = 1; - } + if (vflag_atom && !peratom_allocate_flag) allocate_peratom(); // extend size of per-atom arrays if necessary @@ -171,7 +161,8 @@ void MSMCG::compute(int eflag, int vflag) // to fully sum contribution in their 3d grid current_level = 0; - cg_all->reverse_comm(this,REVERSE_RHO); + gcall->reverse_comm_kspace(this,1,sizeof(double),REVERSE_RHO, + gcall_buf1,gcall_buf2,MPI_DOUBLE); // forward communicate charge density values to fill ghost grid points // compute direct sum interaction and then restrict to coarser grid @@ -179,24 +170,30 @@ void MSMCG::compute(int eflag, int vflag) for (int n=0; n<=levels-2; n++) { if (!active_flag[n]) continue; current_level = n; - cg[n]->forward_comm(this,FORWARD_RHO); - + gc[n]->forward_comm_kspace(this,1,sizeof(double),FORWARD_RHO, + gc_buf1[n],gc_buf2[n],MPI_DOUBLE); direct(n); restriction(n); } - // compute direct interaction for top grid level for non-periodic // and for second from top grid level for periodic if (active_flag[levels-1]) { if (domain->nonperiodic) { current_level = levels-1; - cg[levels-1]->forward_comm(this,FORWARD_RHO); + gc[levels-1]-> + forward_comm_kspace(this,1,sizeof(double),FORWARD_RHO, + gc_buf1[levels-1],gc_buf2[levels-1],MPI_DOUBLE); direct_top(levels-1); - cg[levels-1]->reverse_comm(this,REVERSE_AD); + gc[levels-1]-> + reverse_comm_kspace(this,1,sizeof(double),REVERSE_AD, + gc_buf1[levels-1],gc_buf2[levels-1],MPI_DOUBLE); if (vflag_atom) - cg_peratom[levels-1]->reverse_comm(this,REVERSE_AD_PERATOM); + gc[levels-1]-> + reverse_comm_kspace(this,6,sizeof(double),REVERSE_AD_PERATOM, + gc_buf1[levels-1],gc_buf2[levels-1],MPI_DOUBLE); + } else { // Here using MPI_Allreduce is cheaper than using commgrid grid_swap_forward(levels-1,qgrid[levels-1]); @@ -204,7 +201,9 @@ void MSMCG::compute(int eflag, int vflag) grid_swap_reverse(levels-1,egrid[levels-1]); current_level = levels-1; if (vflag_atom) - cg_peratom[levels-1]->reverse_comm(this,REVERSE_AD_PERATOM); + gc[levels-1]-> + reverse_comm_kspace(this,6,sizeof(double),REVERSE_AD_PERATOM, + gc_buf1[levels-1],gc_buf2[levels-1],MPI_DOUBLE); } } @@ -216,24 +215,28 @@ void MSMCG::compute(int eflag, int vflag) prolongation(n); current_level = n; - cg[n]->reverse_comm(this,REVERSE_AD); + gc[n]->reverse_comm_kspace(this,1,sizeof(double),REVERSE_AD, + gc_buf1[n],gc_buf2[n],MPI_DOUBLE); // extra per-atom virial communication if (vflag_atom) - cg_peratom[n]->reverse_comm(this,REVERSE_AD_PERATOM); + gc[n]->reverse_comm_kspace(this,6,sizeof(double),REVERSE_AD_PERATOM, + gc_buf1[n],gc_buf2[n],MPI_DOUBLE); } // all procs communicate E-field values // to fill ghost cells surrounding their 3d bricks current_level = 0; - cg_all->forward_comm(this,FORWARD_AD); + gcall->forward_comm_kspace(this,1,sizeof(double),FORWARD_AD, + gcall_buf1,gcall_buf2,MPI_DOUBLE); // extra per-atom energy/virial communication if (vflag_atom) - 
cg_peratom_all->forward_comm(this,FORWARD_AD_PERATOM); + gcall->forward_comm_kspace(this,6,sizeof(double),FORWARD_AD_PERATOM, + gcall_buf1,gcall_buf2,MPI_DOUBLE); // calculate the force on my particles (interpolation) @@ -536,6 +539,9 @@ void MSMCG::fieldforce_peratom() } } +/* ---------------------------------------------------------------------- + memory usage of local arrays +------------------------------------------------------------------------- */ double MSMCG::memory_usage() { diff --git a/src/KSPACE/pppm.cpp b/src/KSPACE/pppm.cpp index 173893c22f..d643ce12e2 100644 --- a/src/KSPACE/pppm.cpp +++ b/src/KSPACE/pppm.cpp @@ -3444,7 +3444,7 @@ void PPPM::poisson_groups_triclinic() } /* ---------------------------------------------------------------------- - Slab-geometry correction term to dampen inter-slab interactions between + slab-geometry correction term to dampen inter-slab interactions between periodically repeating slabs. Yields good approximation to 2D Ewald if adequate empty space is left between repeating slabs (J. Chem. Phys. 111, 3155). Slabs defined here to be parallel to the xy plane. Also From bd7917919bd442883f5717f5f0300ed61c925987 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Thu, 13 Aug 2020 09:34:02 -0400 Subject: [PATCH 06/38] update list of pointer variables to be initialized to NULL in the msm constructor --- src/KSPACE/msm.cpp | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/src/KSPACE/msm.cpp b/src/KSPACE/msm.cpp index 93ecce6904..5c401edb8a 100644 --- a/src/KSPACE/msm.cpp +++ b/src/KSPACE/msm.cpp @@ -45,20 +45,21 @@ enum{FORWARD_RHO,FORWARD_AD,FORWARD_AD_PERATOM}; /* ---------------------------------------------------------------------- */ -MSM::MSM(LAMMPS *lmp) : KSpace(lmp), - factors(NULL), delxinv(NULL), delyinv(NULL), delzinv(NULL), nx_msm(NULL), - ny_msm(NULL), nz_msm(NULL), nxlo_in(NULL), nylo_in(NULL), nzlo_in(NULL), - nxhi_in(NULL), nyhi_in(NULL), nzhi_in(NULL), nxlo_out(NULL), nylo_out(NULL), - nzlo_out(NULL), nxhi_out(NULL), nyhi_out(NULL), nzhi_out(NULL), ngrid(NULL), - active_flag(NULL), alpha(NULL), betax(NULL), betay(NULL), betaz(NULL), - peratom_allocate_flag(0), - levels(0), world_levels(NULL), qgrid(NULL), egrid(NULL), v0grid(NULL), v1grid(NULL), - v2grid(NULL), v3grid(NULL), v4grid(NULL), v5grid(NULL), g_direct(NULL), - v0_direct(NULL), v1_direct(NULL), v2_direct(NULL), v3_direct(NULL), v4_direct(NULL), - v5_direct(NULL), g_direct_top(NULL), v0_direct_top(NULL), v1_direct_top(NULL), - v2_direct_top(NULL), v3_direct_top(NULL), v4_direct_top(NULL), v5_direct_top(NULL), - phi1d(NULL), dphi1d(NULL), procneigh_levels(NULL), gc(NULL), - gcall(NULL), part2grid(NULL), boxlo(NULL) +MSM::MSM(LAMMPS *lmp) + : KSpace(lmp), + factors(NULL), delxinv(NULL), delyinv(NULL), delzinv(NULL), nx_msm(NULL), + ny_msm(NULL), nz_msm(NULL), nxlo_in(NULL), nylo_in(NULL), nzlo_in(NULL), + nxhi_in(NULL), nyhi_in(NULL), nzhi_in(NULL), nxlo_out(NULL), nylo_out(NULL), + nzlo_out(NULL), nxhi_out(NULL), nyhi_out(NULL), nzhi_out(NULL), ngrid(NULL), + active_flag(NULL), alpha(NULL), betax(NULL), betay(NULL), betaz(NULL), + peratom_allocate_flag(0),levels(0),world_levels(NULL),qgrid(NULL),egrid(NULL), + v0grid(NULL), v1grid(NULL),v2grid(NULL),v3grid(NULL),v4grid(NULL),v5grid(NULL), + g_direct(NULL),v0_direct(NULL),v1_direct(NULL),v2_direct(NULL),v3_direct(NULL), + v4_direct(NULL),v5_direct(NULL),g_direct_top(NULL),v0_direct_top(NULL), + v1_direct_top(NULL),v2_direct_top(NULL),v3_direct_top(NULL),v4_direct_top(NULL), + 
v5_direct_top(NULL),phi1d(NULL),dphi1d(NULL),procneigh_levels(NULL),gcall(NULL), + gc(NULL),gcall_buf1(NULL),gcall_buf2(NULL),gc_buf1(NULL),gc_buf2(NULL), + ngc_buf1(NULL),ngc_buf2(NULL),part2grid(NULL),boxlo(NULL) { msmflag = 1; From a1ca4ecbe84874d10b417d089723da68a6f0e691 Mon Sep 17 00:00:00 2001 From: Steve Plimpton Date: Thu, 13 Aug 2020 12:03:32 -0600 Subject: [PATCH 07/38] add GridComm API changes to msm/cg/omp --- src/USER-OMP/msm_cg_omp.cpp | 50 ++++++++++++++++++++----------------- 1 file changed, 27 insertions(+), 23 deletions(-) diff --git a/src/USER-OMP/msm_cg_omp.cpp b/src/USER-OMP/msm_cg_omp.cpp index 8ff954e83f..95f9cdb3e8 100644 --- a/src/USER-OMP/msm_cg_omp.cpp +++ b/src/USER-OMP/msm_cg_omp.cpp @@ -95,17 +95,7 @@ void MSMCGOMP::compute(int eflag, int vflag) // invoke allocate_peratom() if needed for first time - if (vflag_atom && !peratom_allocate_flag) { - allocate_peratom(); - cg_peratom_all->ghost_notify(); - cg_peratom_all->setup(); - for (int n=0; nghost_notify(); - cg_peratom[n]->setup(); - } - peratom_allocate_flag = 1; - } + if (vflag_atom && !peratom_allocate_flag) allocate_peratom(); // extend size of per-atom arrays if necessary @@ -175,7 +165,8 @@ void MSMCGOMP::compute(int eflag, int vflag) // to fully sum contribution in their 3d grid current_level = 0; - cg_all->reverse_comm(this,REVERSE_RHO); + gcall->reverse_comm_kspace(this,1,sizeof(double),REVERSE_RHO, + gcall_buf1,gcall_buf2,MPI_DOUBLE); // forward communicate charge density values to fill ghost grid points // compute direct sum interaction and then restrict to coarser grid @@ -183,24 +174,30 @@ void MSMCGOMP::compute(int eflag, int vflag) for (int n=0; n<=levels-2; n++) { if (!active_flag[n]) continue; current_level = n; - cg[n]->forward_comm(this,FORWARD_RHO); - + gc[n]->forward_comm_kspace(this,1,sizeof(double),FORWARD_RHO, + gc_buf1[n],gc_buf2[n],MPI_DOUBLE); direct(n); restriction(n); } - // compute direct interaction for top grid level for non-periodic // and for second from top grid level for periodic if (active_flag[levels-1]) { if (domain->nonperiodic) { current_level = levels-1; - cg[levels-1]->forward_comm(this,FORWARD_RHO); + gc[levels-1]-> + forward_comm_kspace(this,1,sizeof(double),FORWARD_RHO, + gc_buf1[levels-1],gc_buf2[levels-1],MPI_DOUBLE); direct_top(levels-1); - cg[levels-1]->reverse_comm(this,REVERSE_AD); + gc[levels-1]-> + reverse_comm_kspace(this,1,sizeof(double),REVERSE_AD, + gc_buf1[levels-1],gc_buf2[levels-1],MPI_DOUBLE); if (vflag_atom) - cg_peratom[levels-1]->reverse_comm(this,REVERSE_AD_PERATOM); + gc[levels-1]-> + reverse_comm_kspace(this,6,sizeof(double),REVERSE_AD_PERATOM, + gc_buf1[levels-1],gc_buf2[levels-1],MPI_DOUBLE); + } else { // Here using MPI_Allreduce is cheaper than using commgrid grid_swap_forward(levels-1,qgrid[levels-1]); @@ -208,7 +205,9 @@ void MSMCGOMP::compute(int eflag, int vflag) grid_swap_reverse(levels-1,egrid[levels-1]); current_level = levels-1; if (vflag_atom) - cg_peratom[levels-1]->reverse_comm(this,REVERSE_AD_PERATOM); + gc[levels-1]-> + reverse_comm_kspace(this,6,sizeof(double),REVERSE_AD_PERATOM, + gc_buf1[levels-1],gc_buf2[levels-1],MPI_DOUBLE); } } @@ -220,24 +219,28 @@ void MSMCGOMP::compute(int eflag, int vflag) prolongation(n); current_level = n; - cg[n]->reverse_comm(this,REVERSE_AD); + gc[n]->reverse_comm_kspace(this,1,sizeof(double),REVERSE_AD, + gc_buf1[n],gc_buf2[n],MPI_DOUBLE); // extra per-atom virial communication if (vflag_atom) - cg_peratom[n]->reverse_comm(this,REVERSE_AD_PERATOM); + 
gc[n]->reverse_comm_kspace(this,6,sizeof(double),REVERSE_AD_PERATOM, + gc_buf1[n],gc_buf2[n],MPI_DOUBLE); } // all procs communicate E-field values // to fill ghost cells surrounding their 3d bricks current_level = 0; - cg_all->forward_comm(this,FORWARD_AD); + gcall->forward_comm_kspace(this,1,sizeof(double),FORWARD_AD, + gcall_buf1,gcall_buf2,MPI_DOUBLE); // extra per-atom energy/virial communication if (vflag_atom) - cg_peratom_all->forward_comm(this,FORWARD_AD_PERATOM); + gcall->forward_comm_kspace(this,6,sizeof(double),FORWARD_AD_PERATOM, + gcall_buf1,gcall_buf2,MPI_DOUBLE); // calculate the force on my particles (interpolation) @@ -556,6 +559,7 @@ void MSMCGOMP::fieldforce_peratom() } } +/* ---------------------------------------------------------------------- */ double MSMCGOMP::memory_usage() { From 40e55af8d7e818db2d2136dc845e5404e3b5a549 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Thu, 13 Aug 2020 14:51:58 -0600 Subject: [PATCH 08/38] Port GridComm changes to Kokkos --- src/KOKKOS/gridcomm_kokkos.cpp | 847 ++++++++++++++++++++++----------- src/KOKKOS/gridcomm_kokkos.h | 94 ++-- src/KOKKOS/kokkos_base_fft.h | 4 +- src/KOKKOS/pppm_kokkos.cpp | 18 +- src/KOKKOS/pppm_kokkos.h | 5 +- src/KSPACE/gridcomm.cpp | 4 +- src/KSPACE/gridcomm.h | 38 +- 7 files changed, 654 insertions(+), 356 deletions(-) diff --git a/src/KOKKOS/gridcomm_kokkos.cpp b/src/KOKKOS/gridcomm_kokkos.cpp index 024428366e..4bf93a8de9 100644 --- a/src/KOKKOS/gridcomm_kokkos.cpp +++ b/src/KOKKOS/gridcomm_kokkos.cpp @@ -15,6 +15,7 @@ #include #include "comm.h" #include "kspace.h" +#include "irregular.h" #include "memory_kokkos.h" #include "error.h" #include "kokkos_base_fft.h" @@ -22,104 +23,64 @@ using namespace LAMMPS_NS; +enum{REGULAR,TILED}; + #define SWAPDELTA 8 -/* ---------------------------------------------------------------------- */ +/* ---------------------------------------------------------------------- + NOTES + tiled implementation only currently works for RCB, not general tiled + if o indices for ghosts are < 0 or hi indices are >= N, + then grid is treated as periodic in that dimension, + communication is done across the periodic boundaries +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + constructor called by all classes except MSM + gcomm = world communicator + gn xyz = size of global grid + i xyz lohi = portion of global grid this proc owns, 0 <= index < N + o xyz lohi = owned grid portion + ghost grid cells needed in all directions + if o indices are < 0 or hi indices are >= N, + then grid is treated as periodic in that dimension, + communication is done across the periodic boundaries +------------------------------------------------------------------------- */ template -GridCommKokkos::GridCommKokkos(LAMMPS *lmp, MPI_Comm gcomm, int forward, int reverse, - int ixlo, int ixhi, int iylo, int iyhi, int izlo, int izhi, - int oxlo, int oxhi, int oylo, int oyhi, int ozlo, int ozhi, - int pxlo, int pxhi, int pylo, int pyhi, int pzlo, int pzhi) - : Pointers(lmp) +GridCommKokkos::GridCommKokkos(LAMMPS *lmp, MPI_Comm gcomm, + int gnx, int gny, int gnz, + int ixlo, int ixhi, int iylo, int iyhi, int izlo, int izhi, + int oxlo, int oxhi, int oylo, int oyhi, int ozlo, int ozhi) + : GridComm(lmp, gcomm, + gnx, gny, gnz, + ixlo,ixhi, iylo, iyhi, izlo, izhi, + oxlo, oxhi, oylo, oyhi, ozlo, ozhi) { - gridcomm = gcomm; - MPI_Comm_rank(gridcomm,&me); - - nforward = forward; - nreverse = reverse; - - inxlo = 
ixlo; - inxhi = ixhi; - inylo = iylo; - inyhi = iyhi; - inzlo = izlo; - inzhi = izhi; - - outxlo = oxlo; - outxhi = oxhi; - outylo = oylo; - outyhi = oyhi; - outzlo = ozlo; - outzhi = ozhi; - - outxlo_max = oxlo; - outxhi_max = oxhi; - outylo_max = oylo; - outyhi_max = oyhi; - outzlo_max = ozlo; - outzhi_max = ozhi; - - procxlo = pxlo; - procxhi = pxhi; - procylo = pylo; - procyhi = pyhi; - proczlo = pzlo; - proczhi = pzhi; - - nswap = 0; - swap = NULL; - //buf1 = buf2 = NULL; } -/* ---------------------------------------------------------------------- */ +/* ---------------------------------------------------------------------- + constructor called by MSM + gcomm = world communicator or sub-communicator for a hierarchical grid + flag = 1 if e xyz lohi values = larger grid stored by caller in gcomm = world + flag = 2 if e xyz lohi values = 6 neighbor procs in gcomm + gn xyz = size of global grid + i xyz lohi = portion of global grid this proc owns, 0 <= index < N + o xyz lohi = owned grid portion + ghost grid cells needed in all directions + e xyz lohi for flag = 1: extent of larger grid stored by caller + e xyz lohi for flag = 2: 6 neighbor procs +------------------------------------------------------------------------- */ template -GridCommKokkos::GridCommKokkos(LAMMPS *lmp, MPI_Comm gcomm, int forward, int reverse, - int ixlo, int ixhi, int iylo, int iyhi, int izlo, int izhi, - int oxlo, int oxhi, int oylo, int oyhi, int ozlo, int ozhi, - int oxlo_max, int oxhi_max, int oylo_max, int oyhi_max, - int ozlo_max, int ozhi_max, - int pxlo, int pxhi, int pylo, int pyhi, int pzlo, int pzhi) - : Pointers(lmp) +GridCommKokkos::GridCommKokkos(LAMMPS *lmp, MPI_Comm gcomm, int flag, + int gnx, int gny, int gnz, + int ixlo, int ixhi, int iylo, int iyhi, int izlo, int izhi, + int oxlo, int oxhi, int oylo, int oyhi, int ozlo, int ozhi, + int exlo, int exhi, int eylo, int eyhi, int ezlo, int ezhi) + : GridComm(lmp, gcomm, + gnx, gny, gnz, + ixlo,ixhi, iylo, iyhi, izlo, izhi, + oxlo, oxhi, oylo, oyhi, ozlo, ozhi) { - gridcomm = gcomm; - MPI_Comm_rank(gridcomm,&me); - - nforward = forward; - nreverse = reverse; - - inxlo = ixlo; - inxhi = ixhi; - inylo = iylo; - inyhi = iyhi; - inzlo = izlo; - inzhi = izhi; - - outxlo = oxlo; - outxhi = oxhi; - outylo = oylo; - outyhi = oyhi; - outzlo = ozlo; - outzhi = ozhi; - - outxlo_max = oxlo_max; - outxhi_max = oxhi_max; - outylo_max = oylo_max; - outyhi_max = oyhi_max; - outzlo_max = ozlo_max; - outzhi_max = ozhi_max; - - procxlo = pxlo; - procxhi = pxhi; - procylo = pylo; - procyhi = pyhi; - proczlo = pzlo; - proczhi = pzhi; - - nswap = 0; - swap = NULL; - //buf1 = buf2 = NULL; } /* ---------------------------------------------------------------------- */ @@ -127,28 +88,42 @@ GridCommKokkos::GridCommKokkos(LAMMPS *lmp, MPI_Comm gcomm, int forw template GridCommKokkos::~GridCommKokkos() { + // regular comm data struct + for (int i = 0; i < nswap; i++) { - //memoryKK->destroy_kokkos(swap[i].k_packlist,swap[i].packlist); - //memoryKK->destroy_kokkos(swap[i].k_unpacklist,swap[i].unpacklist); + swap[i].packlist = NULL; + swap[i].unpacklist = NULL; + } + + // tiled comm data structs + + for (int i = 0; i < nsend; i++) + send[i].packlist = NULL; + + for (int i = 0; i < nrecv; i++) + k_recv_unpacklist,i = NULL; + + for (int i = 0; i < ncopy; i++) { + copy[i].packlist = NULL; + copy[i].unpacklist = NULL; } - memory->sfree(swap); - //memory->destroy(buf1); - //memory->destroy(buf2); } -/* ---------------------------------------------------------------------- - notify 6 
neighbor procs how many ghost grid planes I need from them - ghostxlo = # of lower grid planes I own that are needed from me - by procxlo to become its upper ghost planes - ghostxhi = # of upper grid planes I own that are needed from me - by procxhi to become its lower ghost planes - if no neighbor proc, value is from self -------------------------------------------------------------------------- */ +/* ---------------------------------------------------------------------- */ template -void GridCommKokkos::ghost_notify() +void GridCommKokkos::setup_regular(int &nbuf1, int &nbuf2) { + int nsent,sendfirst,sendlast,recvfirst,recvlast; + int sendplanes,recvplanes; + int notdoneme,notdone; + + // notify 6 neighbor procs how many ghost grid planes I need from them + // ghost xyz lo = # of my lower grid planes that proc xyz lo needs as its ghosts + // ghost xyz hi = # of my upper grid planes that proc xyz hi needs as its ghosts + // if this proc is its own neighbor across periodic bounary, value is from self + int nplanes = inxlo - outxlo; if (procxlo != me) MPI_Sendrecv(&nplanes,1,MPI_INT,procxlo,0, @@ -184,49 +159,11 @@ void GridCommKokkos::ghost_notify() MPI_Sendrecv(&nplanes,1,MPI_INT,proczhi,0, &ghostzlo,1,MPI_INT,proczlo,0,gridcomm,MPI_STATUS_IGNORE); else ghostzlo = nplanes; -} -/* ---------------------------------------------------------------------- - check if all ghost grid comm needs overlap into non nearest-neighbor proc - if yes, return 1, else return 0 -------------------------------------------------------------------------- */ - -template -int GridCommKokkos::ghost_overlap() -{ - int nearest = 0; - if (ghostxlo > inxhi-inxlo+1) nearest = 1; - if (ghostxhi > inxhi-inxlo+1) nearest = 1; - if (ghostylo > inyhi-inylo+1) nearest = 1; - if (ghostyhi > inyhi-inylo+1) nearest = 1; - if (ghostzlo > inzhi-inzlo+1) nearest = 1; - if (ghostzhi > inzhi-inzlo+1) nearest = 1; - - int nearest_all; - MPI_Allreduce(&nearest,&nearest_all,1,MPI_INT,MPI_MIN,gridcomm); - - return nearest_all; -} - -/* ---------------------------------------------------------------------- - create swap stencil for grid own/ghost communication - swaps covers all 3 dimensions and both directions - swaps cover multiple iterations in a direction if need grid pts - from further away than nearest-neighbor proc - same swap list used by forward and reverse communication -------------------------------------------------------------------------- */ - -template -void GridCommKokkos::setup() -{ - int nsent,sendfirst,sendlast,recvfirst,recvlast; - int sendplanes,recvplanes; - int notdoneme,notdone; - - int maxswap = 6; - swap = (Swap *) memory->smalloc(maxswap*sizeof(Swap),"Commgrid:swap"); - k_packlist = DAT::tdual_int_2d("Commgrid:packlist",maxswap,1); - k_unpacklist = DAT::tdual_int_2d("Commgrid:unpacklist",maxswap,1); + // setup swaps = exchange of grid data with one of 6 neighobr procs + // can be more than one in a direction if ghost region extends beyond neigh proc + // all procs have same swap count, but swapsize npack/nunpack can be empty + nswap = 0; // send own grid pts to -x processor, recv ghost grid pts from +x processor @@ -238,19 +175,13 @@ void GridCommKokkos::setup() notdone = 1; while (notdone) { - if (nswap == maxswap) { - maxswap += SWAPDELTA; - swap = (Swap *) - memory->srealloc(swap,maxswap*sizeof(Swap),"Commgrid:swap"); - k_packlist.resize(maxswap,k_packlist.extent(1)); - k_unpacklist.resize(maxswap,k_unpacklist.extent(1)); - } + if (nswap == maxswap) grow_swap(); swap[nswap].sendproc = procxlo; 
swap[nswap].recvproc = procxhi; sendplanes = MIN(sendlast-sendfirst+1,ghostxlo-nsent); swap[nswap].npack = - indices(k_packlist,nswap, + indices_kokkos(k_swap_packlist,nswap, sendfirst,sendfirst+sendplanes-1,inylo,inyhi,inzlo,inzhi); if (procxlo != me) @@ -259,7 +190,7 @@ void GridCommKokkos::setup() else recvplanes = sendplanes; swap[nswap].nunpack = - indices(k_unpacklist,nswap, + indices_kokkos(k_swap_unpacklist,nswap, recvfirst,recvfirst+recvplanes-1,inylo,inyhi,inzlo,inzhi); nsent += sendplanes; @@ -282,19 +213,13 @@ void GridCommKokkos::setup() notdone = 1; while (notdone) { - if (nswap == maxswap) { - maxswap += 1; - swap = (Swap *) - memory->srealloc(swap,maxswap*sizeof(Swap),"Commgrid:swap"); - k_packlist.resize(maxswap,k_packlist.extent(1)); - k_unpacklist.resize(maxswap,k_unpacklist.extent(1)); - } + if (nswap == maxswap) grow_swap(); swap[nswap].sendproc = procxhi; swap[nswap].recvproc = procxlo; sendplanes = MIN(sendlast-sendfirst+1,ghostxhi-nsent); swap[nswap].npack = - indices(k_packlist,nswap, + indices_kokkos(k_swap_packlist,nswap, sendlast-sendplanes+1,sendlast,inylo,inyhi,inzlo,inzhi); if (procxhi != me) @@ -303,7 +228,7 @@ void GridCommKokkos::setup() else recvplanes = sendplanes; swap[nswap].nunpack = - indices(k_unpacklist,nswap, + indices_kokkos(k_swap_unpacklist,nswap, recvlast-recvplanes+1,recvlast,inylo,inyhi,inzlo,inzhi); nsent += sendplanes; @@ -326,19 +251,13 @@ void GridCommKokkos::setup() notdone = 1; while (notdone) { - if (nswap == maxswap) { - maxswap += SWAPDELTA; - swap = (Swap *) - memory->srealloc(swap,maxswap*sizeof(Swap),"Commgrid:swap"); - k_packlist.resize(maxswap,k_packlist.extent(1)); - k_unpacklist.resize(maxswap,k_unpacklist.extent(1)); - } + if (nswap == maxswap) grow_swap(); swap[nswap].sendproc = procylo; swap[nswap].recvproc = procyhi; sendplanes = MIN(sendlast-sendfirst+1,ghostylo-nsent); swap[nswap].npack = - indices(k_packlist,nswap, + indices_kokkos(k_swap_packlist,nswap, outxlo,outxhi,sendfirst,sendfirst+sendplanes-1,inzlo,inzhi); if (procylo != me) @@ -347,7 +266,7 @@ void GridCommKokkos::setup() else recvplanes = sendplanes; swap[nswap].nunpack = - indices(k_unpacklist,nswap, + indices_kokkos(k_swap_unpacklist,nswap, outxlo,outxhi,recvfirst,recvfirst+recvplanes-1,inzlo,inzhi); nsent += sendplanes; @@ -370,19 +289,13 @@ void GridCommKokkos::setup() notdone = 1; while (notdone) { - if (nswap == maxswap) { - maxswap += 1; - swap = (Swap *) - memory->srealloc(swap,maxswap*sizeof(Swap),"Commgrid:swap"); - k_packlist.resize(maxswap,k_packlist.extent(1)); - k_unpacklist.resize(maxswap,k_unpacklist.extent(1)); - } + if (nswap == maxswap) grow_swap(); swap[nswap].sendproc = procyhi; swap[nswap].recvproc = procylo; sendplanes = MIN(sendlast-sendfirst+1,ghostyhi-nsent); swap[nswap].npack = - indices(k_packlist,nswap, + indices_kokkos(k_swap_packlist,nswap, outxlo,outxhi,sendlast-sendplanes+1,sendlast,inzlo,inzhi); if (procyhi != me) @@ -391,7 +304,7 @@ void GridCommKokkos::setup() else recvplanes = sendplanes; swap[nswap].nunpack = - indices(k_unpacklist,nswap, + indices_kokkos(k_swap_unpacklist,nswap, outxlo,outxhi,recvlast-recvplanes+1,recvlast,inzlo,inzhi); nsent += sendplanes; @@ -414,19 +327,13 @@ void GridCommKokkos::setup() notdone = 1; while (notdone) { - if (nswap == maxswap) { - maxswap += SWAPDELTA; - swap = (Swap *) - memory->srealloc(swap,maxswap*sizeof(Swap),"Commgrid:swap"); - k_packlist.resize(maxswap,k_packlist.extent(1)); - k_unpacklist.resize(maxswap,k_unpacklist.extent(1)); - } + if (nswap == maxswap) grow_swap(); 
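/* ----------------------------------------------------------------------
   made-up example of why one direction can need more than one swap in
   setup_regular(): suppose ghostxlo = 12, i.e. the -x neighbor needs 12 of
   my grid planes as its ghosts, but I only own 10 planes in x.  The first
   pass sends those 10 (sendplanes = MIN(10,12-0)) and leaves nsent = 10 <
   ghostxlo, so notdoneme stays set; the second pass forwards 2 more planes
   that I myself just received as ghosts from the +x neighbor.  The
   MPI_Allreduce on notdone keeps all procs iterating in lockstep, which is
   why every proc ends up with the same swap count even if some of its
   npack/nunpack values are zero.
------------------------------------------------------------------------- */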
swap[nswap].sendproc = proczlo; swap[nswap].recvproc = proczhi; sendplanes = MIN(sendlast-sendfirst+1,ghostzlo-nsent); swap[nswap].npack = - indices(k_packlist,nswap, + indices_kokkos(k_swap_packlist,nswap, outxlo,outxhi,outylo,outyhi,sendfirst,sendfirst+sendplanes-1); if (proczlo != me) @@ -435,7 +342,7 @@ void GridCommKokkos::setup() else recvplanes = sendplanes; swap[nswap].nunpack = - indices(k_unpacklist,nswap, + indices_kokkos(k_swap_unpacklist,nswap, outxlo,outxhi,outylo,outyhi,recvfirst,recvfirst+recvplanes-1); nsent += sendplanes; @@ -458,19 +365,13 @@ void GridCommKokkos::setup() notdone = 1; while (notdone) { - if (nswap == maxswap) { - maxswap += 1; - swap = (Swap *) - memory->srealloc(swap,maxswap*sizeof(Swap),"Commgrid:swap"); - k_packlist.resize(maxswap,k_packlist.extent(1)); - k_unpacklist.resize(maxswap,k_unpacklist.extent(1)); - } + if (nswap == maxswap) grow_swap(); swap[nswap].sendproc = proczhi; swap[nswap].recvproc = proczlo; sendplanes = MIN(sendlast-sendfirst+1,ghostzhi-nsent); swap[nswap].npack = - indices(k_packlist,nswap, + indices_kokkos(k_swap_packlist,nswap, outxlo,outxhi,outylo,outyhi,sendlast-sendplanes+1,sendlast); if (proczhi != me) @@ -479,7 +380,7 @@ void GridCommKokkos::setup() else recvplanes = sendplanes; swap[nswap].nunpack = - indices(k_unpacklist,nswap, + indices_kokkos(k_swap_unpacklist,nswap, outxlo,outxhi,outylo,outyhi,recvlast-recvplanes+1,recvlast); nsent += sendplanes; @@ -493,18 +394,247 @@ void GridCommKokkos::setup() MPI_Allreduce(¬doneme,¬done,1,MPI_INT,MPI_SUM,gridcomm); } - // nbuf = max of any forward/reverse pack/unpack + // ngrid = max of any forward/reverse pack/unpack grid points - nbuf = 0; + int ngrid = 0; for (int i = 0; i < nswap; i++) { - nbuf = MAX(nbuf,swap[i].npack); - nbuf = MAX(nbuf,swap[i].nunpack); + ngrid = MAX(ngrid,swap[i].npack); + ngrid = MAX(ngrid,swap[i].nunpack); } - nbuf *= MAX(nforward,nreverse); - //memory->create(buf1,nbuf,"Commgrid:buf1"); - k_buf1 = FFT_DAT::tdual_FFT_SCALAR_1d("Commgrid:buf1",nbuf); - //memory->create(buf2,nbuf,"Commgrid:buf2"); - k_buf2 = FFT_DAT::tdual_FFT_SCALAR_1d("Commgrid:buf2",nbuf); + + nbuf1 = nbuf2 = ngrid; +} + +/* ---------------------------------------------------------------------- + NOTE: need to doc this header +------------------------------------------------------------------------- */ + +template +void GridCommKokkos::setup_tiled(int &nbuf1, int &nbuf2) +{ + int i,m; + double xlo,xhi,ylo,yhi,zlo,zhi; + int ghostbox[6],pbc[3]; + + // setup RCB tree of cut info for grid + // access CommTiled to get cut dimension + // cut = this proc's inlo in that dim + // dim is -1 for proc 0, but never accessed + + rcbinfo = (RCBinfo *) + memory->smalloc(nprocs*sizeof(RCBinfo),"GridCommKokkos:rcbinfo"); + RCBinfo rcbone; + rcbone.dim = comm->rcbcutdim; + if (rcbone.dim <= 0) rcbone.cut = inxlo; + else if (rcbone.dim == 1) rcbone.cut = inylo; + else if (rcbone.dim == 2) rcbone.cut = inzlo; + MPI_Allgather(&rcbone,sizeof(RCBinfo),MPI_CHAR, + rcbinfo,sizeof(RCBinfo),MPI_CHAR,gridcomm); + + // find overlaps of my extended ghost box with all other procs + // accounts for crossings of periodic boundaries + // noverlap = # of overlaps, including self + // overlap = vector of overlap info using Overlap data struct + + ghostbox[0] = outxlo; + ghostbox[1] = outxhi; + ghostbox[2] = outylo; + ghostbox[3] = outyhi; + ghostbox[4] = outzlo; + ghostbox[5] = outzhi; + + pbc[0] = pbc[1] = pbc[2] = 0; + + memory->create(overlap_procs,nprocs,"GridCommKokkos:overlap_procs"); + noverlap = maxoverlap = 0; + 
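/* ----------------------------------------------------------------------
   descriptive road map for the tiled setup below: each proc contributes
   its RCB cut to an Allgather-ed tree, drops its own+ghost box through
   that tree to find the procs it overlaps, sends each of them a Request
   via Irregular communication, clips the Requests it receives against its
   own owned cells to build its Send list, answers with Responses that
   (after unwrapping periodic shifts) become its Recv list, and turns
   overlaps with itself into the Copy list.  nbuf1/nbuf2 are then sized
   from the largest single pack/unpack and the largest summed pack/unpack
   over all Sends or Recvs.
------------------------------------------------------------------------- */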
overlap = NULL; + + ghost_box_drop(ghostbox,pbc); + + // send each proc an overlap message + // content: me, index of my overlap, box that overlaps with its owned cells + // ncopy = # of overlaps with myself, across a periodic boundary + + int *proclist; + memory->create(proclist,noverlap,"GridCommKokkos:proclist"); + srequest = (Request *) + memory->smalloc(noverlap*sizeof(Request),"GridCommKokkos:srequest"); + + int nsend_request = 0; + ncopy = 0; + + for (m = 0; m < noverlap; m++) { + if (overlap[m].proc == me) ncopy++; + else { + proclist[nsend_request] = overlap[m].proc; + srequest[nsend_request].sender = me; + srequest[nsend_request].index = m; + for (i = 0; i < 6; i++) + srequest[nsend_request].box[i] = overlap[m].box[i]; + nsend_request++; + } + } + + Irregular *irregular = new Irregular(lmp); + int nrecv_request = irregular->create_data(nsend_request,proclist,1); + Request *rrequest = + (Request *) memory->smalloc(nrecv_request*sizeof(Request),"GridCommKokkos:rrequest"); + irregular->exchange_data((char *) srequest,sizeof(Request),(char *) rrequest); + irregular->destroy_data(); + + // compute overlaps between received ghost boxes and my owned box + // overlap box used to setup my Send data struct and respond to requests + + send = (Send *) memory->smalloc(nrecv_request*sizeof(Send),"GridCommKokkos:send"); + + k_send_packlist = DAT::t_int_1d("GridCommKokkos:send_packlist",nrecv_request,k_send_packlist.extent(1)); + + sresponse = (Response *) + memory->smalloc(nrecv_request*sizeof(Response),"GridCommKokkos:sresponse"); + memory->destroy(proclist); + memory->create(proclist,nrecv_request,"GridCommKokkos:proclist"); + + for (m = 0; m < nrecv_request; m++) { + send[m].proc = rrequest[m].sender; + xlo = MAX(rrequest[m].box[0],inxlo); + xhi = MIN(rrequest[m].box[1],inxhi); + ylo = MAX(rrequest[m].box[2],inylo); + yhi = MIN(rrequest[m].box[3],inyhi); + zlo = MAX(rrequest[m].box[4],inzlo); + zhi = MIN(rrequest[m].box[5],inzhi); + send[m].npack = indices_kokkos(k_send_packlist,m,xlo,xhi,ylo,yhi,zlo,zhi); + + proclist[m] = rrequest[m].sender; + sresponse[m].index = rrequest[m].index; + sresponse[m].box[0] = xlo; + sresponse[m].box[1] = xhi; + sresponse[m].box[2] = ylo; + sresponse[m].box[3] = yhi; + sresponse[m].box[4] = zlo; + sresponse[m].box[5] = zhi; + } + + nsend = nrecv_request; + + // reply to each Request message with a Response message + // content: index for the overlap on requestor, overlap box on my owned grid + + int nsend_response = nrecv_request; + int nrecv_response = irregular->create_data(nsend_response,proclist,1); + Response *rresponse = + (Response *) memory->smalloc(nrecv_response*sizeof(Response),"GridCommKokkos:rresponse"); + irregular->exchange_data((char *) sresponse,sizeof(Response),(char *) rresponse); + irregular->destroy_data(); + delete irregular; + + // process received responses + // box used to setup my Recv data struct after unwrapping via PBC + // adjacent = 0 if any box of ghost cells does not adjoin my owned cells + + recv = (Recv *) memory->smalloc(nrecv_response*sizeof(Recv),"GridCommKokkos:recv"); + + k_recv_unpacklist = DAT::t_int_1d("GridCommKokkos:recv_unpacklist",nrecv_response,k_recv_unpacklist.extent(1)); + + adjacent = 1; + + for (i = 0; i < nrecv_response; i++) { + m = rresponse[i].index; + recv[i].proc = overlap[m].proc; + xlo = rresponse[i].box[0] + overlap[m].pbc[0] * nx; + xhi = rresponse[i].box[1] + overlap[m].pbc[0] * nx; + ylo = rresponse[i].box[2] + overlap[m].pbc[1] * ny; + yhi = rresponse[i].box[3] + overlap[m].pbc[1] * ny; + zlo = 
rresponse[i].box[4] + overlap[m].pbc[2] * nz; + zhi = rresponse[i].box[5] + overlap[m].pbc[2] * nz; + recv[i].nunpack = indices(k_recv_unpacklist,i,xlo,xhi,ylo,yhi,zlo,zhi); + + if (xlo != inxhi+1 && xhi != inxlo-1 && + ylo != inyhi+1 && yhi != inylo-1 && + zlo != inzhi+1 && zhi != inzlo-1) adjacent = 0; + } + + nrecv = nrecv_response; + + // create Copy data struct from overlaps with self + + copy = (Copy *) memory->smalloc(ncopy*sizeof(Copy),"GridCommKokkos:copy"); + + k_copy_packlist = DAT::t_int_2d("GridCommKokkos:copy_packlist",ncopy,k_copy_packlist.extent(1)); + k_copy_unpacklist = DAT::t_int_2d("GridCommKokkos:copy_unpacklist",ncopy,k_copy_unpacklist.extent(1)); + + ncopy = 0; + for (m = 0; m < noverlap; m++) { + if (overlap[m].proc != me) continue; + xlo = overlap[m].box[0]; + xhi = overlap[m].box[1]; + ylo = overlap[m].box[2]; + yhi = overlap[m].box[3]; + zlo = overlap[m].box[4]; + zhi = overlap[m].box[5]; + copy[ncopy].npack = indices_kokkos(k_copy_packlist,ncopy,xlo,xhi,ylo,yhi,zlo,zhi); + xlo = overlap[m].box[0] + overlap[m].pbc[0] * nx; + xhi = overlap[m].box[1] + overlap[m].pbc[0] * nx; + ylo = overlap[m].box[2] + overlap[m].pbc[1] * ny; + yhi = overlap[m].box[3] + overlap[m].pbc[1] * ny; + zlo = overlap[m].box[4] + overlap[m].pbc[2] * nz; + zhi = overlap[m].box[5] + overlap[m].pbc[2] * nz; + copy[ncopy].nunpack = indices_kokkos(k_copy_unpacklist,ncopy,xlo,xhi,ylo,yhi,zlo,zhi); + ncopy++; + } + + // set offsets for received data + + int offset = 0; + for (m = 0; m < nsend; m++) { + send[m].offset = offset; + offset += send[m].npack; + } + + offset = 0; + for (m = 0; m < nrecv; m++) { + recv[m].offset = offset; + offset += recv[m].nunpack; + } + + // length of MPI requests vector is max of nsend, nrecv + + int nrequest = MAX(nsend,nrecv); + requests = new MPI_Request[nrequest]; + + // clean-up + + memory->sfree(rcbinfo); + memory->destroy(proclist); + memory->destroy(overlap_procs); + memory->sfree(overlap); + memory->sfree(srequest); + memory->sfree(rrequest); + memory->sfree(sresponse); + memory->sfree(rresponse); + + // nbuf1 = largest pack or unpack in any Send or Recv or Copy + // nbuf2 = larget of sum of all packs or unpacks in Send or Recv + + nbuf1 = 0; + + for (m = 0; m < ncopy; m++) { + nbuf1 = MAX(nbuf1,copy[m].npack); + nbuf1 = MAX(nbuf1,copy[m].nunpack); + } + + int nbufs = 0; + for (m = 0; m < nsend; m++) { + nbuf1 = MAX(nbuf1,send[m].npack); + nbufs += send[m].npack; + } + + int nbufr = 0; + for (m = 0; m < nrecv; m++) { + nbuf1 = MAX(nbuf1,recv[m].nunpack); + nbufr += recv[m].nunpack; + } + + nbuf2 = MAX(nbufs,nbufr); } /* ---------------------------------------------------------------------- @@ -512,38 +642,55 @@ void GridCommKokkos::setup() ------------------------------------------------------------------------- */ template -void GridCommKokkos::forward_comm(KSpace *kspace, int which) +void GridCommKokkos::forward_comm_kspace(KSpace *kspace, int nper, int nbyte, int which, + FFT_DAT::tdual_FFT_SCALAR_1d k_buf1, FFT_DAT::tdual_FFT_SCALAR_1d k_buf2, MPI_Datatype datatype) { - k_packlist.sync(); - k_unpacklist.sync(); + if (layout == REGULAR) + forward_comm_kspace_regular(kspace,nper,nbyte,which,k_buf1,k_buf2,datatype); + else + forward_comm_kspace_tiled(kspace,nper,nbyte,which,k_buf1,k_buf2,datatype); +} + +/* ---------------------------------------------------------------------- */ + +template +void GridCommKokkos:: +forward_comm_kspace_regular(KSpace *kspace, int nper, int nbyte, int which, + FFT_DAT::tdual_FFT_SCALAR_1d k_buf1, FFT_DAT::tdual_FFT_SCALAR_1d 
k_buf2, MPI_Datatype datatype) +{ + int m; + MPI_Request request; KokkosBaseFFT* kspaceKKBase = dynamic_cast(kspace); + FFT_SCALAR* buf1; + FFT_SCALAR* buf2; + if (lmp->kokkos->cuda_aware_flag) { + buf1 = k_buf1.view().data(); + buf2 = k_buf2.view().data(); + } else { + buf1 = k_buf1.h_view.data(); + buf2 = k_buf2.h_view.data(); + } - for (int m = 0; m < nswap; m++) { + for (m = 0; m < nswap; m++) { if (swap[m].sendproc == me) - kspaceKKBase->pack_forward_kspace_kokkos(which,k_buf2,swap[m].npack,k_packlist,m); + KokkosBaseFFT->pack_forward_grid_kokkos(which,k_buf2,swap[m].npack,k_swap_packlist,m); else - kspaceKKBase->pack_forward_kspace_kokkos(which,k_buf1,swap[m].npack,k_packlist,m); + KokkosBaseFFT->pack_forward_grid_kokkos(which,k_buf1,swap[m].npack,k_swap_packlist,m); DeviceType().fence(); if (swap[m].sendproc != me) { - FFT_SCALAR* buf1; - FFT_SCALAR* buf2; - if (lmp->kokkos->cuda_aware_flag) { - buf1 = k_buf1.view().data(); - buf2 = k_buf2.view().data(); - } else { + + if (!lmp->kokkos->cuda_aware_flag) { k_buf1.modify(); k_buf1.sync(); - buf1 = k_buf1.h_view.data(); - buf2 = k_buf2.h_view.data(); } - MPI_Irecv(buf2,nforward*swap[m].nunpack,MPI_FFT_SCALAR, - swap[m].recvproc,0,gridcomm,&request); - MPI_Send(buf1,nforward*swap[m].npack,MPI_FFT_SCALAR, - swap[m].sendproc,0,gridcomm); - MPI_Wait(&request,MPI_STATUS_IGNORE); + if (swap[m].nunpack) MPI_Irecv(buf2,nper*swap[m].nunpack,datatype, + swap[m].recvproc,0,gridcomm,&request); + if (swap[m].npack) MPI_Send(buf1,nper*swap[m].npack,datatype, + swap[m].sendproc,0,gridcomm); + if (swap[m].nunpack) MPI_Wait(&request,MPI_STATUS_IGNORE); if (!lmp->kokkos->cuda_aware_flag) { k_buf2.modify(); @@ -551,7 +698,75 @@ void GridCommKokkos::forward_comm(KSpace *kspace, int which) } } - kspaceKKBase->unpack_forward_kspace_kokkos(which,k_buf2,swap[m].nunpack,k_unpacklist,m); + KokkosBaseFFT->unpack_forward_grid_kokkos(which,k_buf2,0,swap[m].nunpack,k_swap_unpacklist,m); + DeviceType().fence(); + } +} + +/* ---------------------------------------------------------------------- */ + +template +void GridCommKokkos:: +forward_comm_kspace_tiled(KSpace *kspace, int nper, int nbyte, int which, + FFT_DAT::tdual_FFT_SCALAR_1d k_buf1, FFT_DAT::tdual_FFT_SCALAR_1d k_buf2, MPI_Datatype datatype) +{ + int i,m,offset; + + KokkosBaseFFT* kspaceKKBase = dynamic_cast(kspace); + + KokkosBaseFFT* kspaceKKBase = dynamic_cast(kspace); + FFT_SCALAR* buf1; + FFT_SCALAR* buf2; + if (lmp->kokkos->cuda_aware_flag) { + buf1 = k_buf1.view().data(); + buf2 = k_buf2.view().data(); + } else { + buf1 = k_buf1.h_view.data(); + buf2 = k_buf2.h_view.data(); + } + + // post all receives + + for (m = 0; m < nrecv; m++) { + offset = nper * recv[m].offset * nbyte; + MPI_Irecv(buf2[offset],nper*recv[m].nunpack,datatype, + recv[m].proc,0,gridcomm,&requests[m]); + } + + // perform all sends to other procs + + for (m = 0; m < nsend; m++) { + KokkosBaseFFT->pack_forward_grid_kokkos(which,k_buf1,send[m].npack,k_send_packlist,m); + DeviceType().fence(); + + if (!lmp->kokkos->cuda_aware_flag) { + k_buf1.modify(); + k_buf1.sync(); + } + + MPI_Send(buf1,nper*send[m].npack,datatype,send[m].proc,0,gridcomm); + } + + // perform all copies to self + + for (m = 0; m < ncopy; m++) { + KokkosBaseFFT->pack_forward_grid_kokkos(which,k_buf1,copy[m].npack,k_copy_packlist,m); + KokkosBaseFFT->unpack_forward_grid_kokkos(which,k_buf1,0,copy[m].nunpack,k_copy_unpacklist,m); + } + + // unpack all received data + + for (i = 0; i < nrecv; i++) { + MPI_Waitany(nrecv,requests,&m,MPI_STATUS_IGNORE); + + if 
(!lmp->kokkos->cuda_aware_flag) { + k_buf2.modify(); + k_buf2.sync(); + } + + offset = nper * recv[m].offset * nbyte; + KokkosBaseFFT->unpack_forward_grid_kokkos(which,k_buf2,offset, + recv[m].nunpack,k_recv_unpacklist,m); DeviceType().fence(); } } @@ -562,38 +777,56 @@ void GridCommKokkos::forward_comm(KSpace *kspace, int which) ------------------------------------------------------------------------- */ template -void GridCommKokkos::reverse_comm(KSpace *kspace, int which) +void GridCommKokkos::reverse_comm_kspace(KSpace *kspace, int nper, int nbyte, int which, + FFT_DAT::tdual_FFT_SCALAR_1d k_buf1, FFT_DAT::tdual_FFT_SCALAR_1d k_buf2, MPI_Datatype datatype) { - k_packlist.sync(); - k_unpacklist.sync(); + if (layout == REGULAR) + reverse_comm_kspace_regular(kspace,nper,nbyte,which,k_buf1,k_buf2,datatype); + else + reverse_comm_kspace_tiled(kspace,nper,nbyte,which,k_buf1,k_buf2,datatype); +} + +/* ---------------------------------------------------------------------- */ + +template +void GridCommKokkos:: +reverse_comm_kspace_regular(KSpace *kspace, int nper, int nbyte, int which, + FFT_DAT::tdual_FFT_SCALAR_1d k_buf1, FFT_DAT::tdual_FFT_SCALAR_1d k_buf2, MPI_Datatype datatype) +{ + int m; + MPI_Request request; KokkosBaseFFT* kspaceKKBase = dynamic_cast(kspace); + FFT_SCALAR* buf1; + FFT_SCALAR* buf2; + if (lmp->kokkos->cuda_aware_flag) { + buf1 = k_buf1.view().data(); + buf2 = k_buf2.view().data(); + } else { + buf1 = k_buf1.h_view.data(); + buf2 = k_buf2.h_view.data(); + } - for (int m = nswap-1; m >= 0; m--) { + for (m = nswap-1; m >= 0; m--) { if (swap[m].recvproc == me) - kspaceKKBase->pack_reverse_kspace_kokkos(which,k_buf2,swap[m].nunpack,k_unpacklist,m); + KokkosBaseFFT->pack_reverse_grid(which,k_buf2,swap[m].nunpack,k_swap_unpacklist,m); else - kspaceKKBase->pack_reverse_kspace_kokkos(which,k_buf1,swap[m].nunpack,k_unpacklist,m); + KokkosBaseFFT->pack_reverse_grid(which,k_buf1,swap[m].nunpack,k_swap_unpacklist,m); DeviceType().fence(); if (swap[m].recvproc != me) { - FFT_SCALAR* buf1; - FFT_SCALAR* buf2; - if (lmp->kokkos->cuda_aware_flag) { - buf1 = k_buf1.view().data(); - buf2 = k_buf2.view().data(); - } else { + + if (!lmp->kokkos->cuda_aware_flag) { k_buf1.modify(); k_buf1.sync(); - buf1 = k_buf1.h_view.data(); - buf2 = k_buf2.h_view.data(); } - MPI_Irecv(buf2,nreverse*swap[m].npack,MPI_FFT_SCALAR, - swap[m].sendproc,0,gridcomm,&request); - MPI_Send(buf1,nreverse*swap[m].nunpack,MPI_FFT_SCALAR, - swap[m].recvproc,0,gridcomm); - MPI_Wait(&request,MPI_STATUS_IGNORE); + if (swap[m].npack) MPI_Irecv(buf2,nper*swap[m].npack,datatype, + swap[m].sendproc,0,gridcomm,&request); + if (swap[m].nunpack) MPI_Send(buf1,nper*swap[m].nunpack,datatype, + swap[m].recvproc,0,gridcomm); + if (swap[m].npack) MPI_Wait(&request,MPI_STATUS_IGNORE); + if (!lmp->kokkos->cuda_aware_flag) { k_buf2.modify(); @@ -601,27 +834,120 @@ void GridCommKokkos::reverse_comm(KSpace *kspace, int which) } } - kspaceKKBase->unpack_reverse_kspace_kokkos(which,k_buf2,swap[m].npack,k_packlist,m); + KokkosBaseFFT->unpack_reverse_grid_kokkos(which,k_buf2,0,swap[m].npack,k_swap_packlist,m); + DeviceType().fence(); + } +} + +/* ---------------------------------------------------------------------- */ + +template +void GridCommKokkos:: +reverse_comm_kspace_tiled(KSpace *kspace, int nper, int nbyte, int which, + FFT_DAT::tdual_FFT_SCALAR_1d k_buf1, FFT_DAT::tdual_FFT_SCALAR_1d k_buf2, MPI_Datatype datatype) +{ + int i,m,offset; + + KokkosBaseFFT* kspaceKKBase = dynamic_cast(kspace); + + FFT_SCALAR* buf1; + FFT_SCALAR* buf2; + 
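/* ----------------------------------------------------------------------
   minimal non-Kokkos sketch of the tiled forward-communication pattern
   used above: post all receives at per-message offsets into buf2, pack and
   send each Send, handle overlaps with self as direct pack/unpack copies,
   then unpack arrivals as MPI_Waitany completes them.  Struct and variable
   names mirror the Send/Recv/Copy data in this patch; the snippet is
   illustrative, not literal patch code.
------------------------------------------------------------------------- */

for (int m = 0; m < nrecv; m++)
  MPI_Irecv(&buf2[nper*recv[m].offset],nper*recv[m].nunpack,MPI_DOUBLE,
            recv[m].proc,0,gridcomm,&requests[m]);

for (int m = 0; m < nsend; m++) {
  kspace->pack_forward_grid(which,buf1,send[m].npack,send[m].packlist);
  MPI_Send(buf1,nper*send[m].npack,MPI_DOUBLE,send[m].proc,0,gridcomm);
}

for (int m = 0; m < ncopy; m++) {
  kspace->pack_forward_grid(which,buf1,copy[m].npack,copy[m].packlist);
  kspace->unpack_forward_grid(which,buf1,copy[m].nunpack,copy[m].unpacklist);
}

for (int i = 0; i < nrecv; i++) {
  int m;
  MPI_Waitany(nrecv,requests,&m,MPI_STATUS_IGNORE);
  kspace->unpack_forward_grid(which,&buf2[nper*recv[m].offset],
                              recv[m].nunpack,recv[m].unpacklist);
}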
if (lmp->kokkos->cuda_aware_flag) { + buf1 = k_buf1.view().data(); + buf2 = k_buf2.view().data(); + } else { + k_buf1.modify(); + k_buf1.sync(); + buf1 = k_buf1.h_view.data(); + buf2 = k_buf2.h_view.data(); + } + + // post all receives + + for (m = 0; m < nsend; m++) { + offset = nper * send[m].offset * nbyte; + MPI_Irecv(buf2[offset],nper*send[m].npack,datatype, + send[m].proc,0,gridcomm,&requests[m]); + } + + // perform all sends to other procs + + for (m = 0; m < nrecv; m++) { + KokkosBaseFFT->pack_reverse_grid(which,k_buf1,recv[m].nunpack,k_recv_unpacklist,m); + DeviceType().fence(); + + if (!lmp->kokkos->cuda_aware_flag) { + k_buf1.modify(); + k_buf1.sync(); + } + + MPI_Send(buf1,nper*recv[m].nunpack,datatype,recv[m].proc,0,gridcomm); + } + + // perform all copies to self + + for (m = 0; m < ncopy; m++) { + KokkosBaseFFT->pack_reverse_grid(which,k_buf1,copy[m].nunpack,k_copy_unpacklist,m); + KokkosBaseFFT->unpack_reverse_grid_kokkos(which,k_buf1,0,copy[m].npack,k_copy_packlist,m); + } + + // unpack all received data + + for (i = 0; i < nsend; i++) { + MPI_Waitany(nsend,requests,&m,MPI_STATUS_IGNORE); + + if (!lmp->kokkos->cuda_aware_flag) { + k_buf2.modify(); + k_buf2.sync(); + } + + offset = nper * send[m].offset * nbyte; + KokkosBaseFFT->unpack_reverse_grid_kokkos(which,k_buf2,offset, + send[m].npack,k_send_packlist,m); DeviceType().fence(); } } /* ---------------------------------------------------------------------- - create 1d list of offsets into 3d array section (xlo:xhi,ylo:yhi,zlo:zhi) - assume 3d array is allocated as (0:outxhi_max-outxlo_max+1,0:outyhi_max-outylo_max+1, - 0:outzhi_max-outzlo_max+1) + create swap stencil for grid own/ghost communication + swaps covers all 3 dimensions and both directions + swaps cover multiple iterations in a direction if need grid pts + from further away than nearest-neighbor proc + same swap list used by forward and reverse communication ------------------------------------------------------------------------- */ template -int GridCommKokkos::indices(DAT::tdual_int_2d &k_list, int index, +void GridCommKokkos::grow_swap() +{ + maxswap += SWAPDELTA; + swap = (Swap *) + memory->srealloc(swap,maxswap*sizeof(Swap),"GridCommKokkos:swap"); + + if (!k_swap_packlist.data() { + k_swap_packlist = DAT::t_int_1d("GridCommKokkos:swap_packlist",maxswap,k_swap_packlist.extent(1)); + k_swap_unpacklist = DAT::t_int_1d("GridCommKokkos:swap_unpacklist",maxswap,k_swap_unpacklist.extent(1)); + } else { + k_swap_packlist.resize(maxswap,k_swap_packlist.extent(1)); + k_swap_unpacklist.resize(maxswap,k_swap_unpacklist.extent(1)); + } +} + +/* ---------------------------------------------------------------------- + create 1d list of offsets into 3d array section (xlo:xhi,ylo:yhi,zlo:zhi) + assume 3d array is allocated as (0:fullxhi-fullxlo+1,0:fullyhi-fullylo+1, + 0:fullzhi-fullzlo+1) +------------------------------------------------------------------------- */ + +template +int GridCommKokkos::indices_kokkos(DAT::tdual_int_2d &k_list, int index, int xlo, int xhi, int ylo, int yhi, int zlo, int zhi) { int nmax = (xhi-xlo+1) * (yhi-ylo+1) * (zhi-zlo+1); if (k_list.extent(1) < nmax) k_list.resize(k_list.extent(0),nmax); - int nx = (outxhi_max-outxlo_max+1); - int ny = (outyhi_max-outylo_max+1); + int nx = (fullxhi-fullxlo+1); + int ny = (fullyhi-fullylo+1); k_list.sync(); @@ -630,7 +956,7 @@ int GridCommKokkos::indices(DAT::tdual_int_2d &k_list, int index, for (iz = zlo; iz <= zhi; iz++) for (iy = ylo; iy <= yhi; iy++) for (ix = xlo; ix <= xhi; ix++) - 
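/* ----------------------------------------------------------------------
   Editor's sketch (not part of the patch): the tiled grid-communication
   routines in this file all follow one pattern -- post one MPI_Irecv per
   neighbor into a disjoint slice of a single receive buffer (slice start =
   values-per-cell times that neighbor's cell offset), send packed data to
   each partner, handle copies to self, then MPI_Waitany and unpack each
   slice as it completes.  Hypothetical standalone illustration; the
   Neighbor struct, counts, and packing are invented for the example.
------------------------------------------------------------------------- */
#include <mpi.h>
#include <vector>

struct Neighbor { int proc, ncells, offset; };   // offset counted in grid cells

void exchange(const std::vector<Neighbor> &recvs,
              const std::vector<Neighbor> &sends,
              std::vector<double> &sendbuf, std::vector<double> &recvbuf,
              int nper, MPI_Comm comm)
{
  std::vector<MPI_Request> req(recvs.size());

  // post all receives, each into its own slice of the one receive buffer
  for (std::size_t m = 0; m < recvs.size(); m++)
    MPI_Irecv(&recvbuf[nper*recvs[m].offset], nper*recvs[m].ncells,
              MPI_DOUBLE, recvs[m].proc, 0, comm, &req[m]);

  // send to every partner (packing of sendbuf per partner not shown)
  for (std::size_t m = 0; m < sends.size(); m++)
    MPI_Send(sendbuf.data(), nper*sends[m].ncells,
             MPI_DOUBLE, sends[m].proc, 0, comm);

  // unpack each incoming message as soon as it arrives
  for (std::size_t i = 0; i < recvs.size(); i++) {
    int m;
    MPI_Waitany((int) req.size(), req.data(), &m, MPI_STATUS_IGNORE);
    // unpack nper*recvs[m].ncells values starting at nper*recvs[m].offset
  }
}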
k_list.h_view(index,n++) = (iz-outzlo_max)*ny*nx + (iy-outylo_max)*nx + (ix-outxlo_max); + k_list.h_view(index,n++) = (iz-fullzlo)*ny*nx + (iy-fullylo)*nx + (ix-fullxlo); k_list.modify(); k_list.sync(); @@ -638,21 +964,10 @@ int GridCommKokkos::indices(DAT::tdual_int_2d &k_list, int index, return nmax; } - -/* ---------------------------------------------------------------------- - memory usage of send/recv bufs -------------------------------------------------------------------------- */ - -template -double GridCommKokkos::memory_usage() -{ - double bytes = 2*nbuf * sizeof(double); - return bytes; -} - namespace LAMMPS_NS { template class GridCommKokkos; #ifdef LMP_KOKKOS_GPU template class GridCommKokkos; #endif } + diff --git a/src/KOKKOS/gridcomm_kokkos.h b/src/KOKKOS/gridcomm_kokkos.h index 086834b467..5a057f8c6b 100644 --- a/src/KOKKOS/gridcomm_kokkos.h +++ b/src/KOKKOS/gridcomm_kokkos.h @@ -14,83 +14,63 @@ #ifndef LMP_GRIDCOMM_KOKKOS_H #define LMP_GRIDCOMM_KOKKOS_H -#include "pointers.h" +#include "gridcomm.h" #include "kokkos_type.h" #include "fftdata_kokkos.h" -#ifdef FFT_SINGLE -typedef float FFT_SCALAR; -#define MPI_FFT_SCALAR MPI_FLOAT -#else -typedef double FFT_SCALAR; -#define MPI_FFT_SCALAR MPI_DOUBLE -#endif - namespace LAMMPS_NS { template -class GridCommKokkos : protected Pointers { +class GridCommKokkos : public GridComm { public: typedef DeviceType device_type; typedef ArrayTypes AT; typedef FFTArrayTypes FFT_AT; - - GridCommKokkos(class LAMMPS *, MPI_Comm, int, int, - int, int, int, int, int, int, - int, int, int, int, int, int, - int, int, int, int, int, int); - GridCommKokkos(class LAMMPS *, MPI_Comm, int, int, - int, int, int, int, int, int, - int, int, int, int, int, int, - int, int, int, int, int, int, - int, int, int, int, int, int); + GridCommKokkos(class LAMMPS *, MPI_Comm, int, int, int, + int, int, int, int, int, int, + int, int, int, int, int, int); + GridCommKokkos(class LAMMPS *, MPI_Comm, int, int, int, int, + int, int, int, int, int, int, + int, int, int, int, int, int, + int, int, int, int, int, int); ~GridCommKokkos(); - void ghost_notify(); - int ghost_overlap(); - void setup(); - void forward_comm(class KSpace *, int); - void reverse_comm(class KSpace *, int); - double memory_usage(); + void setup(int &, int &); + int ghost_adjacent(); + void forward_comm_kspace(class KSpace *, int, int, int, + void *, void *, MPI_Datatype); + void reverse_comm_kspace(class KSpace *, int, int, int, + void *, void *, MPI_Datatype); private: - int me; - int nforward,nreverse; - MPI_Comm gridcomm; - MPI_Request request; + DAT::tdual_int_2d k_swap_packlist; + DAT::tdual_int_2d k_swap_unpacklist; - // in = inclusive indices of 3d grid chunk that I own - // out = inclusive indices of 3d grid chunk I own plus ghosts I use - // proc = 6 neighbor procs that surround me - // ghost = # of my owned grid planes needed from me - // by each of 6 neighbor procs to become their ghost planes + DAT::tdual_int_2d k_send_packlist; - int inxlo,inxhi,inylo,inyhi,inzlo,inzhi; - int outxlo,outxhi,outylo,outyhi,outzlo,outzhi; - int outxlo_max,outxhi_max,outylo_max,outyhi_max,outzlo_max,outzhi_max; - int procxlo,procxhi,procylo,procyhi,proczlo,proczhi; - int ghostxlo,ghostxhi,ghostylo,ghostyhi,ghostzlo,ghostzhi; + DAT::tdual_int_2d k_recv_unpacklist; - int nbuf; - //FFT_SCALAR *buf1,*buf2; - FFT_DAT::tdual_FFT_SCALAR_1d k_buf1; - FFT_DAT::tdual_FFT_SCALAR_1d k_buf2; + DAT::tdual_int_2d k_copy_packlist; + DAT::tdual_int_2d k_copy_unpacklist; - struct Swap { - int sendproc; // proc to send to 
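/* ----------------------------------------------------------------------
   Editor's sketch (not part of the patch): indices()/indices_kokkos() above
   flatten an (ix,iy,iz) grid cell of the full local brick (owned + ghost,
   origin at fullxlo/fullylo/fullzlo) into a row-major 1d offset; the
   unpack functors in pppm_kokkos.cpp further down invert that mapping.
   Small self-contained check of both directions; the grid extents used
   here are made up.
------------------------------------------------------------------------- */
#include <cassert>

int main()
{
  const int nx = 5, ny = 4;                           // full brick extent in x and y
  const int fullxlo = -1, fullylo = -1, fullzlo = -1; // brick origin includes ghosts

  int ix = 2, iy = 1, iz = 0;                         // a grid cell to flatten

  // forward: 3d -> 1d, as stored in the pack/unpack index lists
  int off = (iz-fullzlo)*ny*nx + (iy-fullylo)*nx + (ix-fullxlo);

  // inverse: 1d -> brick-local 3d, as done in unpack_forward/unpack_reverse
  int kz = off/(nx*ny);
  int ky = (off - kz*nx*ny)/nx;
  int kx = off - kz*nx*ny - ky*nx;

  assert(kx == ix-fullxlo && ky == iy-fullylo && kz == iz-fullzlo);
  return 0;
}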
for forward comm - int recvproc; // proc to recv from for forward comm - int npack; // # of datums to pack - int nunpack; // # of datums to unpack - //int *packlist; // 3d array offsets to pack - //int *unpacklist; // 3d array offsets to unpack - }; + // ------------------------------------------- + // internal methods + // ------------------------------------------- - DAT::tdual_int_2d k_packlist; - DAT::tdual_int_2d k_unpacklist; + void setup_regular(int &, int &); + void setup_tiled(int &, int &); - int nswap; - Swap *swap; + void forward_comm_kspace_regular(class KSpace *, int, int, int, + void *, void *, MPI_Datatype); + void forward_comm_kspace_tiled(class KSpace *, int, int, int, + void *, void *, MPI_Datatype); + void reverse_comm_kspace_regular(class KSpace *, int, int, int, + void *, void *, MPI_Datatype); + void reverse_comm_kspace_tiled(class KSpace *, int, int, int, + void *, void *, MPI_Datatype); - int indices(DAT::tdual_int_2d &, int, int, int, int, int, int, int); + void grow_swap(); + + int indices_kokkos(DAT::tdual_int_2d &, int, int, int, int, int, int, int); }; } diff --git a/src/KOKKOS/kokkos_base_fft.h b/src/KOKKOS/kokkos_base_fft.h index 7d0829e4d6..2814e9c0bc 100644 --- a/src/KOKKOS/kokkos_base_fft.h +++ b/src/KOKKOS/kokkos_base_fft.h @@ -24,9 +24,9 @@ class KokkosBaseFFT { //Kspace virtual void pack_forward_kspace_kokkos(int, FFT_DAT::tdual_FFT_SCALAR_1d &, int, DAT::tdual_int_2d &, int) {}; - virtual void unpack_forward_kspace_kokkos(int, FFT_DAT::tdual_FFT_SCALAR_1d &, int, DAT::tdual_int_2d &, int) {}; + virtual void unpack_forward_kspace_kokkos(int, FFT_DAT::tdual_FFT_SCALAR_1d &, int, int, DAT::tdual_int_2d &, int) {}; virtual void pack_reverse_kspace_kokkos(int, FFT_DAT::tdual_FFT_SCALAR_1d &, int, DAT::tdual_int_2d &, int) {}; - virtual void unpack_reverse_kspace_kokkos(int, FFT_DAT::tdual_FFT_SCALAR_1d &, int, DAT::tdual_int_2d &, int) {}; + virtual void unpack_reverse_kspace_kokkos(int, FFT_DAT::tdual_FFT_SCALAR_1d &, int, int, DAT::tdual_int_2d &, int) {}; }; } diff --git a/src/KOKKOS/pppm_kokkos.cpp b/src/KOKKOS/pppm_kokkos.cpp index 743f2daec9..2893c5c5b3 100644 --- a/src/KOKKOS/pppm_kokkos.cpp +++ b/src/KOKKOS/pppm_kokkos.cpp @@ -2568,7 +2568,7 @@ void PPPMKokkos::operator()(TagPPPM_fieldforce_peratom, const int &i ------------------------------------------------------------------------- */ template -void PPPMKokkos::pack_forward_kspace_kokkos(int flag, FFT_DAT::tdual_FFT_SCALAR_1d &k_buf, int nlist, DAT::tdual_int_2d &k_list, int index) +void PPPMKokkos::pack_forward_grid_kokkos(int flag, FFT_DAT::tdual_FFT_SCALAR_1d &k_buf, int nlist, DAT::tdual_int_2d &k_list, int index) { typename AT::t_int_2d_um d_list = k_list.view(); d_list_index = Kokkos::subview(d_list,index,Kokkos::ALL()); @@ -2624,11 +2624,12 @@ void PPPMKokkos::operator()(TagPPPM_pack_forward2, const int &i) con ------------------------------------------------------------------------- */ template -void PPPMKokkos::unpack_forward_kspace_kokkos(int flag, FFT_DAT::tdual_FFT_SCALAR_1d &k_buf, int nlist, DAT::tdual_int_2d &k_list, int index) +void PPPMKokkos::unpack_forward_grid_kokkos(int flag, FFT_DAT::tdual_FFT_SCALAR_1d &k_buf, int offset, int nlist, DAT::tdual_int_2d &k_list, int index) { typename AT::t_int_2d_um d_list = k_list.view(); d_list_index = Kokkos::subview(d_list,index,Kokkos::ALL()); d_buf = k_buf.view(); + unpack_offset = offset; nx = (nxhi_out-nxlo_out+1); ny = (nyhi_out-nylo_out+1); @@ -2652,9 +2653,9 @@ void PPPMKokkos::operator()(TagPPPM_unpack_forward1, const int &i) c 
const int iz = (int) (dlist/(nx*ny)); const int iy = (int) ((dlist - iz*nx*ny)/nx); const int ix = d_list_index[i] - iz*nx*ny - iy*nx; - d_vdx_brick(iz,iy,ix) = d_buf[3*i]; - d_vdy_brick(iz,iy,ix) = d_buf[3*i+1]; - d_vdz_brick(iz,iy,ix) = d_buf[3*i+2]; + d_vdx_brick(iz,iy,ix) = d_buf[3*i + unpack_offset]; + d_vdy_brick(iz,iy,ix) = d_buf[3*i+1 + unpack_offset]; + d_vdz_brick(iz,iy,ix) = d_buf[3*i+2 + unpack_offset]; } template @@ -2681,7 +2682,7 @@ void PPPMKokkos::operator()(TagPPPM_unpack_forward2, const int &i) c ------------------------------------------------------------------------- */ template -void PPPMKokkos::pack_reverse_kspace_kokkos(int flag, FFT_DAT::tdual_FFT_SCALAR_1d &k_buf, int nlist, DAT::tdual_int_2d &k_list, int index) +void PPPMKokkos::pack_reverse_grid_kokkos(int flag, FFT_DAT::tdual_FFT_SCALAR_1d &k_buf, int nlist, DAT::tdual_int_2d &k_list, int index) { typename AT::t_int_2d_um d_list = k_list.view(); d_list_index = Kokkos::subview(d_list,index,Kokkos::ALL()); @@ -2711,11 +2712,12 @@ void PPPMKokkos::operator()(TagPPPM_pack_reverse, const int &i) cons ------------------------------------------------------------------------- */ template -void PPPMKokkos::unpack_reverse_kspace_kokkos(int flag, FFT_DAT::tdual_FFT_SCALAR_1d &k_buf, int nlist, DAT::tdual_int_2d &k_list, int index) +void PPPMKokkos::unpack_reverse_grid_kokkos(int flag, FFT_DAT::tdual_FFT_SCALAR_1d &k_buf, int offset, int nlist, DAT::tdual_int_2d &k_list, int index) { typename AT::t_int_2d_um d_list = k_list.view(); d_list_index = Kokkos::subview(d_list,index,Kokkos::ALL()); d_buf = k_buf.view(); + unpack_offset = offset; nx = (nxhi_out-nxlo_out+1); ny = (nyhi_out-nylo_out+1); @@ -2733,7 +2735,7 @@ void PPPMKokkos::operator()(TagPPPM_unpack_reverse, const int &i) co const int iz = (int) (dlist/(nx*ny)); const int iy = (int) ((dlist - iz*nx*ny)/nx); const int ix = d_list_index[i] - iz*nx*ny - iy*nx; - d_density_brick(iz,iy,ix) += d_buf[i]; + d_density_brick(iz,iy,ix) += d_buf[i + unpack_offset]; } /* ---------------------------------------------------------------------- diff --git a/src/KOKKOS/pppm_kokkos.h b/src/KOKKOS/pppm_kokkos.h index 56cd6f5140..4e98826105 100644 --- a/src/KOKKOS/pppm_kokkos.h +++ b/src/KOKKOS/pppm_kokkos.h @@ -311,6 +311,7 @@ class PPPMKokkos : public PPPM, public KokkosBaseFFT { int nx,ny,nz; typename AT::t_int_1d_um d_list_index; typename FFT_AT::t_FFT_SCALAR_1d_um d_buf; + int unpack_offset; DAT::tdual_int_scalar k_flag; @@ -404,9 +405,9 @@ class PPPMKokkos : public PPPM, public KokkosBaseFFT { // grid communication void pack_forward_kspace_kokkos(int, FFT_DAT::tdual_FFT_SCALAR_1d &, int, DAT::tdual_int_2d &, int); - void unpack_forward_kspace_kokkos(int, FFT_DAT::tdual_FFT_SCALAR_1d &, int, DAT::tdual_int_2d &, int); + void unpack_forward_kspace_kokkos(int, FFT_DAT::tdual_FFT_SCALAR_1d &, int, int, DAT::tdual_int_2d &, int); void pack_reverse_kspace_kokkos(int, FFT_DAT::tdual_FFT_SCALAR_1d &, int, DAT::tdual_int_2d &, int); - void unpack_reverse_kspace_kokkos(int, FFT_DAT::tdual_FFT_SCALAR_1d &, int, DAT::tdual_int_2d &, int); + void unpack_reverse_kspace_kokkos(int, FFT_DAT::tdual_FFT_SCALAR_1d &, int, int, DAT::tdual_int_2d &, int); // triclinic diff --git a/src/KSPACE/gridcomm.cpp b/src/KSPACE/gridcomm.cpp index 06d786e309..51623691a3 100644 --- a/src/KSPACE/gridcomm.cpp +++ b/src/KSPACE/gridcomm.cpp @@ -1095,8 +1095,8 @@ void GridComm::grow_overlap() /* ---------------------------------------------------------------------- create 1d list of offsets into 3d array section 
(xlo:xhi,ylo:yhi,zlo:zhi) - assume 3d array is allocated as (outxlo_max:outxhi_max,outylo_max:outyhi_max, - outzlo_max:outzhi_max) + assume 3d array is allocated as (fullxlo:fullxhi,fullylo:fullyhi, + fullzlo:fullzhi) ------------------------------------------------------------------------- */ int GridComm::indices(int *&list, diff --git a/src/KSPACE/gridcomm.h b/src/KSPACE/gridcomm.h index 8b2539b977..4fc410d435 100644 --- a/src/KSPACE/gridcomm.h +++ b/src/KSPACE/gridcomm.h @@ -27,15 +27,15 @@ class GridComm : protected Pointers { int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int); - ~GridComm(); - void setup(int &, int &); - int ghost_adjacent(); - void forward_comm_kspace(class KSpace *, int, int, int, - void *, void *, MPI_Datatype); - void reverse_comm_kspace(class KSpace *, int, int, int, - void *, void *, MPI_Datatype); + virtual ~GridComm(); + virtual void setup(int &, int &); + virtual int ghost_adjacent(); + virtual void forward_comm_kspace(class KSpace *, int, int, int, + void *, void *, MPI_Datatype); + virtual void reverse_comm_kspace(class KSpace *, int, int, int, + void *, void *, MPI_Datatype); - private: + protected: int me,nprocs; int layout; // REGULAR or TILED MPI_Comm gridcomm; // communicator for this class @@ -181,24 +181,24 @@ class GridComm : protected Pointers { int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int); - void setup_regular(int &, int &); - void setup_tiled(int &, int &); + virtual void setup_regular(int &, int &); + virtual void setup_tiled(int &, int &); void ghost_box_drop(int *, int *); void box_drop_grid(int *, int, int, int &, int *); int ghost_adjacent_regular(); int ghost_adjacent_tiled(); - void forward_comm_kspace_regular(class KSpace *, int, int, int, - void *, void *, MPI_Datatype); - void forward_comm_kspace_tiled(class KSpace *, int, int, int, - void *, void *, MPI_Datatype); - void reverse_comm_kspace_regular(class KSpace *, int, int, int, - void *, void *, MPI_Datatype); - void reverse_comm_kspace_tiled(class KSpace *, int, int, int, - void *, void *, MPI_Datatype); + virtual void forward_comm_kspace_regular(class KSpace *, int, int, int, + void *, void *, MPI_Datatype); + virtual void forward_comm_kspace_tiled(class KSpace *, int, int, int, + void *, void *, MPI_Datatype); + virtual void reverse_comm_kspace_regular(class KSpace *, int, int, int, + void *, void *, MPI_Datatype); + virtual void reverse_comm_kspace_tiled(class KSpace *, int, int, int, + void *, void *, MPI_Datatype); - void grow_swap(); + virtual void grow_swap(); void grow_overlap(); int indices(int *&, int, int, int, int, int, int); From eb78f8842fe894c8ae8fd0faf10e109d9e8fa49a Mon Sep 17 00:00:00 2001 From: Steve Plimpton Date: Fri, 14 Aug 2020 09:41:30 -0600 Subject: [PATCH 09/38] another change to pppm/stagger --- src/KSPACE/pppm_stagger.cpp | 225 ++++++++++++++++++------------------ 1 file changed, 115 insertions(+), 110 deletions(-) diff --git a/src/KSPACE/pppm_stagger.cpp b/src/KSPACE/pppm_stagger.cpp index 0444b93a71..03a7374587 100644 --- a/src/KSPACE/pppm_stagger.cpp +++ b/src/KSPACE/pppm_stagger.cpp @@ -271,10 +271,16 @@ void PPPMStagger::compute(int eflag, int vflag) double PPPMStagger::compute_qopt() { - if (differentiation_flag == 1) - return compute_qopt_ad(); + if (differentiation_flag == 1) return compute_qopt_ad(); + + int k,l,m,nx,ny,nz,kper,lper,mper; + double snx,sny,snz; + double cnx,cny,cnz; + double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz; + double sum1,sum2,dot1,dot2; + 
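/* ----------------------------------------------------------------------
   Editor's sketch (not part of the patch): the rewritten compute_qopt() and
   compute_qopt_ad() below replace the triple loop over this proc's FFT
   sub-domain with a single loop over the whole nx*ny*nz mesh, strided by
   nprocs so every rank handles every P-th grid point; (k,l,m) are recovered
   from the linear index and then folded into signed wave indices (points in
   the upper half of the mesh map to negative values).  Hypothetical
   standalone illustration with toy sizes.
------------------------------------------------------------------------- */
#include <cstdio>

int main()
{
  const int nx_pppm = 4, ny_pppm = 3, nz_pppm = 2;    // toy global FFT mesh
  const int me = 1, nprocs = 3;                       // toy rank and comm size
  const long ngridtotal = (long) nx_pppm * ny_pppm * nz_pppm;

  for (long i = me; i < ngridtotal; i += nprocs) {
    int k = i % nx_pppm;                     // x varies fastest
    int l = (i / nx_pppm) % ny_pppm;         // then y
    int m = i / ((long) nx_pppm * ny_pppm);  // z varies slowest

    // fold: grid indices k >= N/2 wrap to k - N (e.g. N=4: 0,1,2,3 -> 0,1,-2,-1)
    int kper = k - nx_pppm*(2*k/nx_pppm);
    printf("rank %d: i=%ld -> (k,l,m)=(%d,%d,%d), kper=%d\n", me, i, k, l, m, kper);
  }
  return 0;
}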
double numerator,denominator; + double u1,u2,u3,sqk; - double qopt = 0.0; const double * const prd = domain->prd; const double xprd = prd[0]; @@ -285,77 +291,76 @@ double PPPMStagger::compute_qopt() const double unitky = (MY_2PI/yprd); const double unitkz = (MY_2PI/zprd_slab); - double snx,sny,snz; - double cnx,cny,cnz; - double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz; - double sum1,sum2,dot1,dot2; - double numerator,denominator; - double u1,u2,u3,sqk; - - int k,l,m,nx,ny,nz,kper,lper,mper; - const int nbx = 2; const int nby = 2; const int nbz = 2; const int twoorder = 2*order; - for (m = nzlo_fft; m <= nzhi_fft; m++) { - mper = m - nz_pppm*(2*m/nz_pppm); + // loop over entire FFT grid + // each proc calculates contributions from every Pth grid point + + bigint ngridtotal = (bigint) nx_pppm * ny_pppm * nz_pppm; + int nxy_pppm = nx_pppm * ny_pppm; + + double qopt = 0.0; + + for (bigint i = me; i < ngridtotal; i += nprocs) { + k = i % nx_pppm; + l = (i/nx_pppm) % ny_pppm; + m = i / nxy_pppm; + + const int kper = k - nx_pppm*(2*k/nx_pppm); + const int lper = l - ny_pppm*(2*l/ny_pppm); + const int mper = m - nz_pppm*(2*m/nz_pppm); + + sqk = square(unitkx*kper) + square(unitky*lper) + square(unitkz*mper); + if (sqk == 0.0) continue; + + snx = square(sin(0.5*unitkx*kper*xprd/nx_pppm)); + cnx = cos(0.5*unitkx*kper*xprd/nx_pppm); + sny = square(sin(0.5*unitky*lper*yprd/ny_pppm)); + cny = cos(0.5*unitky*lper*yprd/ny_pppm); snz = square(sin(0.5*unitkz*mper*zprd_slab/nz_pppm)); cnz = cos(0.5*unitkz*mper*zprd_slab/nz_pppm); - for (l = nylo_fft; l <= nyhi_fft; l++) { - lper = l - ny_pppm*(2*l/ny_pppm); - sny = square(sin(0.5*unitky*lper*yprd/ny_pppm)); - cny = cos(0.5*unitky*lper*yprd/ny_pppm); + numerator = MY_4PI/sqk; + denominator = 0.5*(gf_denom(snx,sny,snz) + gf_denom2(cnx,cny,cnz)); - for (k = nxlo_fft; k <= nxhi_fft; k++) { - kper = k - nx_pppm*(2*k/nx_pppm); - snx = square(sin(0.5*unitkx*kper*xprd/nx_pppm)); - cnx = cos(0.5*unitkx*kper*xprd/nx_pppm); + sum1 = sum2 = 0.0; - sqk = square(unitkx*kper) + square(unitky*lper) + square(unitkz*mper); + for (nx = -nbx; nx <= nbx; nx++) { + qx = unitkx*(kper+nx_pppm*nx); + sx = exp(-0.25*square(qx/g_ewald)); + argx = 0.5*qx*xprd/nx_pppm; + wx = powsinxx(argx,twoorder); - if (sqk != 0.0) { - numerator = MY_4PI/sqk; - denominator = 0.5*(gf_denom(snx,sny,snz) + gf_denom2(cnx,cny,cnz)); - sum1 = 0.0; - sum2 = 0.0; + for (ny = -nby; ny <= nby; ny++) { + qy = unitky*(lper+ny_pppm*ny); + sy = exp(-0.25*square(qy/g_ewald)); + argy = 0.5*qy*yprd/ny_pppm; + wy = powsinxx(argy,twoorder); - for (nx = -nbx; nx <= nbx; nx++) { - qx = unitkx*(kper+nx_pppm*nx); - sx = exp(-0.25*square(qx/g_ewald)); - argx = 0.5*qx*xprd/nx_pppm; - wx = powsinxx(argx,twoorder); + for (nz = -nbz; nz <= nbz; nz++) { + qz = unitkz*(mper+nz_pppm*nz); + sz = exp(-0.25*square(qz/g_ewald)); + argz = 0.5*qz*zprd_slab/nz_pppm; + wz = powsinxx(argz,twoorder); - for (ny = -nby; ny <= nby; ny++) { - qy = unitky*(lper+ny_pppm*ny); - sy = exp(-0.25*square(qy/g_ewald)); - argy = 0.5*qy*yprd/ny_pppm; - wy = powsinxx(argy,twoorder); - - for (nz = -nbz; nz <= nbz; nz++) { - qz = unitkz*(mper+nz_pppm*nz); - sz = exp(-0.25*square(qz/g_ewald)); - argz = 0.5*qz*zprd_slab/nz_pppm; - wz = powsinxx(argz,twoorder); - - dot1 = unitkx*kper*qx + unitky*lper*qy + unitkz*mper*qz; - dot2 = qx*qx + qy*qy + qz*qz; - u1 = sx*sy*sz; - u2 = wx*wy*wz; - u3 = numerator*u1*u2*dot1; - sum1 += u1*u1*MY_4PI*MY_4PI/dot2; - sum2 += u3*u3/dot2; - } - } - } - qopt += sum1 - sum2/denominator; - } + dot1 = unitkx*kper*qx + 
unitky*lper*qy + unitkz*mper*qz; + dot2 = qx*qx + qy*qy + qz*qz; + u1 = sx*sy*sz; + u2 = wx*wy*wz; + u3 = numerator*u1*u2*dot1; + sum1 += u1*u1*MY_4PI*MY_4PI/dot2; + sum2 += u3*u3/dot2; + } } } + + qopt += sum1 - sum2/denominator; } + double qopt_all; MPI_Allreduce(&qopt,&qopt_all,1,MPI_DOUBLE,MPI_SUM,world); return qopt_all; @@ -367,7 +372,11 @@ double PPPMStagger::compute_qopt() double PPPMStagger::compute_qopt_ad() { - double qopt = 0.0; + int k,l,m,nx,ny,nz,kper,lper,mper; + double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz; + double sum1,sum2,sum3,sum4,sum5,sum6,dot2; + double u1,u2,sqk; + const double * const prd = domain->prd; const double xprd = prd[0]; @@ -378,72 +387,68 @@ double PPPMStagger::compute_qopt_ad() const double unitky = (MY_2PI/yprd); const double unitkz = (MY_2PI/zprd_slab); - double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz; - double sum1,sum2,sum3,sum4,sum5,sum6,dot2; - double u1,u2,sqk; - - int k,l,m,nx,ny,nz,kper,lper,mper; - const int nbx = 2; const int nby = 2; const int nbz = 2; const int twoorder = 2*order; - for (m = nzlo_fft; m <= nzhi_fft; m++) { - mper = m - nz_pppm*(2*m/nz_pppm); + // loop over entire FFT grid + // each proc calculates contributions from every Pth grid point + + bigint ngridtotal = (bigint) nx_pppm * ny_pppm * nz_pppm; + int nxy_pppm = nx_pppm * ny_pppm; + + double qopt = 0.0; - for (l = nylo_fft; l <= nyhi_fft; l++) { - lper = l - ny_pppm*(2*l/ny_pppm); + for (bigint i = me; i < ngridtotal; i += nprocs) { + k = i % nx_pppm; + l = (i/nx_pppm) % ny_pppm; + m = i / nxy_pppm; - for (k = nxlo_fft; k <= nxhi_fft; k++) { - kper = k - nx_pppm*(2*k/nx_pppm); + const int kper = k - nx_pppm*(2*k/nx_pppm); + const int lper = l - ny_pppm*(2*l/ny_pppm); + const int mper = m - nz_pppm*(2*m/nz_pppm); - sqk = square(unitkx*kper) + square(unitky*lper) + square(unitkz*mper); + sqk = square(unitkx*kper) + square(unitky*lper) + square(unitkz*mper); + if (sqk == 0.0) continue; - if (sqk != 0.0) { - sum1 = 0.0; - sum2 = 0.0; - sum3 = 0.0; - sum4 = 0.0; - sum5 = 0.0; - sum6 = 0.0; + sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = 0.0; + + for (nx = -nbx; nx <= nbx; nx++) { + qx = unitkx*(kper+nx_pppm*nx); + sx = exp(-0.25*square(qx/g_ewald)); + argx = 0.5*qx*xprd/nx_pppm; + wx = powsinxx(argx,twoorder); - for (nx = -nbx; nx <= nbx; nx++) { - qx = unitkx*(kper+nx_pppm*nx); - sx = exp(-0.25*square(qx/g_ewald)); - argx = 0.5*qx*xprd/nx_pppm; - wx = powsinxx(argx,twoorder); - - for (ny = -nby; ny <= nby; ny++) { - qy = unitky*(lper+ny_pppm*ny); - sy = exp(-0.25*square(qy/g_ewald)); - argy = 0.5*qy*yprd/ny_pppm; - wy = powsinxx(argy,twoorder); - - for (nz = -nbz; nz <= nbz; nz++) { - qz = unitkz*(mper+nz_pppm*nz); - sz = exp(-0.25*square(qz/g_ewald)); - argz = 0.5*qz*zprd_slab/nz_pppm; - wz = powsinxx(argz,twoorder); - - dot2 = qx*qx + qy*qy + qz*qz; - u1 = sx*sy*sz; - u2 = wx*wy*wz; - sum1 += u1*u1/dot2*MY_4PI*MY_4PI; - sum2 += u1*u1*u2*u2*MY_4PI*MY_4PI; - sum3 += u2; - sum4 += dot2*u2; - sum5 += u2*powint(-1.0,nx+ny+nz); - sum6 += dot2*u2*powint(-1.0,nx+ny+nz); - } - } - } - qopt += sum1 - sum2/(0.5*(sum3*sum4 + sum5*sum6)); - } + for (ny = -nby; ny <= nby; ny++) { + qy = unitky*(lper+ny_pppm*ny); + sy = exp(-0.25*square(qy/g_ewald)); + argy = 0.5*qy*yprd/ny_pppm; + wy = powsinxx(argy,twoorder); + + for (nz = -nbz; nz <= nbz; nz++) { + qz = unitkz*(mper+nz_pppm*nz); + sz = exp(-0.25*square(qz/g_ewald)); + argz = 0.5*qz*zprd_slab/nz_pppm; + wz = powsinxx(argz,twoorder); + + dot2 = qx*qx + qy*qy + qz*qz; + u1 = sx*sy*sz; + u2 = wx*wy*wz; + sum1 += 
u1*u1/dot2*MY_4PI*MY_4PI; + sum2 += u1*u1*u2*u2*MY_4PI*MY_4PI; + sum3 += u2; + sum4 += dot2*u2; + sum5 += u2*powint(-1.0,nx+ny+nz); + sum6 += dot2*u2*powint(-1.0,nx+ny+nz); + } } } + + qopt += sum1 - sum2/(0.5*(sum3*sum4 + sum5*sum6)); } + double qopt_all; MPI_Allreduce(&qopt,&qopt_all,1,MPI_DOUBLE,MPI_SUM,world); return qopt_all; From 4d31afce2d61f9c15091fad52c01f51976f3aeb7 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Tue, 18 Aug 2020 16:06:43 -0600 Subject: [PATCH 10/38] Port changes to PPPMKokkos --- src/KOKKOS/gridcomm_kokkos.cpp | 64 ++++----- src/KOKKOS/gridcomm_kokkos.h | 14 +- src/KOKKOS/kokkos_base_fft.h | 8 +- src/KOKKOS/pppm_kokkos.cpp | 253 +++++++-------------------------- src/KOKKOS/pppm_kokkos.h | 16 ++- 5 files changed, 104 insertions(+), 251 deletions(-) diff --git a/src/KOKKOS/gridcomm_kokkos.cpp b/src/KOKKOS/gridcomm_kokkos.cpp index 4bf93a8de9..3c9e61ed00 100644 --- a/src/KOKKOS/gridcomm_kokkos.cpp +++ b/src/KOKKOS/gridcomm_kokkos.cpp @@ -101,7 +101,7 @@ GridCommKokkos::~GridCommKokkos() send[i].packlist = NULL; for (int i = 0; i < nrecv; i++) - k_recv_unpacklist,i = NULL; + recv[i].unpacklist = NULL; for (int i = 0; i < ncopy; i++) { copy[i].packlist = NULL; @@ -487,7 +487,7 @@ void GridCommKokkos::setup_tiled(int &nbuf1, int &nbuf2) send = (Send *) memory->smalloc(nrecv_request*sizeof(Send),"GridCommKokkos:send"); - k_send_packlist = DAT::t_int_1d("GridCommKokkos:send_packlist",nrecv_request,k_send_packlist.extent(1)); + k_send_packlist = DAT::tdual_int_2d("GridCommKokkos:send_packlist",nrecv_request,k_send_packlist.extent(1)); sresponse = (Response *) memory->smalloc(nrecv_request*sizeof(Response),"GridCommKokkos:sresponse"); @@ -533,7 +533,7 @@ void GridCommKokkos::setup_tiled(int &nbuf1, int &nbuf2) recv = (Recv *) memory->smalloc(nrecv_response*sizeof(Recv),"GridCommKokkos:recv"); - k_recv_unpacklist = DAT::t_int_1d("GridCommKokkos:recv_unpacklist",nrecv_response,k_recv_unpacklist.extent(1)); + k_recv_unpacklist = DAT::tdual_int_2d("GridCommKokkos:recv_unpacklist",nrecv_response,k_recv_unpacklist.extent(1)); adjacent = 1; @@ -546,7 +546,7 @@ void GridCommKokkos::setup_tiled(int &nbuf1, int &nbuf2) yhi = rresponse[i].box[3] + overlap[m].pbc[1] * ny; zlo = rresponse[i].box[4] + overlap[m].pbc[2] * nz; zhi = rresponse[i].box[5] + overlap[m].pbc[2] * nz; - recv[i].nunpack = indices(k_recv_unpacklist,i,xlo,xhi,ylo,yhi,zlo,zhi); + recv[i].nunpack = indices_kokkos(k_recv_unpacklist,i,xlo,xhi,ylo,yhi,zlo,zhi); if (xlo != inxhi+1 && xhi != inxlo-1 && ylo != inyhi+1 && yhi != inylo-1 && @@ -559,8 +559,8 @@ void GridCommKokkos::setup_tiled(int &nbuf1, int &nbuf2) copy = (Copy *) memory->smalloc(ncopy*sizeof(Copy),"GridCommKokkos:copy"); - k_copy_packlist = DAT::t_int_2d("GridCommKokkos:copy_packlist",ncopy,k_copy_packlist.extent(1)); - k_copy_unpacklist = DAT::t_int_2d("GridCommKokkos:copy_unpacklist",ncopy,k_copy_unpacklist.extent(1)); + k_copy_packlist = DAT::tdual_int_2d("GridCommKokkos:copy_packlist",ncopy,k_copy_packlist.extent(1)); + k_copy_unpacklist = DAT::tdual_int_2d("GridCommKokkos:copy_unpacklist",ncopy,k_copy_unpacklist.extent(1)); ncopy = 0; for (m = 0; m < noverlap; m++) { @@ -643,7 +643,7 @@ void GridCommKokkos::setup_tiled(int &nbuf1, int &nbuf2) template void GridCommKokkos::forward_comm_kspace(KSpace *kspace, int nper, int nbyte, int which, - FFT_DAT::tdual_FFT_SCALAR_1d k_buf1, FFT_DAT::tdual_FFT_SCALAR_1d k_buf2, MPI_Datatype datatype) + FFT_DAT::tdual_FFT_SCALAR_1d &k_buf1, FFT_DAT::tdual_FFT_SCALAR_1d &k_buf2, MPI_Datatype datatype) { if (layout == 
REGULAR) forward_comm_kspace_regular(kspace,nper,nbyte,which,k_buf1,k_buf2,datatype); @@ -656,7 +656,7 @@ void GridCommKokkos::forward_comm_kspace(KSpace *kspace, int nper, i template void GridCommKokkos:: forward_comm_kspace_regular(KSpace *kspace, int nper, int nbyte, int which, - FFT_DAT::tdual_FFT_SCALAR_1d k_buf1, FFT_DAT::tdual_FFT_SCALAR_1d k_buf2, MPI_Datatype datatype) + FFT_DAT::tdual_FFT_SCALAR_1d &k_buf1, FFT_DAT::tdual_FFT_SCALAR_1d &k_buf2, MPI_Datatype datatype) { int m; MPI_Request request; @@ -674,9 +674,9 @@ forward_comm_kspace_regular(KSpace *kspace, int nper, int nbyte, int which, for (m = 0; m < nswap; m++) { if (swap[m].sendproc == me) - KokkosBaseFFT->pack_forward_grid_kokkos(which,k_buf2,swap[m].npack,k_swap_packlist,m); + kspaceKKBase->pack_forward_grid_kokkos(which,k_buf2,swap[m].npack,k_swap_packlist,m); else - KokkosBaseFFT->pack_forward_grid_kokkos(which,k_buf1,swap[m].npack,k_swap_packlist,m); + kspaceKKBase->pack_forward_grid_kokkos(which,k_buf1,swap[m].npack,k_swap_packlist,m); DeviceType().fence(); if (swap[m].sendproc != me) { @@ -698,7 +698,7 @@ forward_comm_kspace_regular(KSpace *kspace, int nper, int nbyte, int which, } } - KokkosBaseFFT->unpack_forward_grid_kokkos(which,k_buf2,0,swap[m].nunpack,k_swap_unpacklist,m); + kspaceKKBase->unpack_forward_grid_kokkos(which,k_buf2,0,swap[m].nunpack,k_swap_unpacklist,m); DeviceType().fence(); } } @@ -708,12 +708,10 @@ forward_comm_kspace_regular(KSpace *kspace, int nper, int nbyte, int which, template void GridCommKokkos:: forward_comm_kspace_tiled(KSpace *kspace, int nper, int nbyte, int which, - FFT_DAT::tdual_FFT_SCALAR_1d k_buf1, FFT_DAT::tdual_FFT_SCALAR_1d k_buf2, MPI_Datatype datatype) + FFT_DAT::tdual_FFT_SCALAR_1d &k_buf1, FFT_DAT::tdual_FFT_SCALAR_1d &k_buf2, MPI_Datatype datatype) { int i,m,offset; - KokkosBaseFFT* kspaceKKBase = dynamic_cast(kspace); - KokkosBaseFFT* kspaceKKBase = dynamic_cast(kspace); FFT_SCALAR* buf1; FFT_SCALAR* buf2; @@ -729,14 +727,14 @@ forward_comm_kspace_tiled(KSpace *kspace, int nper, int nbyte, int which, for (m = 0; m < nrecv; m++) { offset = nper * recv[m].offset * nbyte; - MPI_Irecv(buf2[offset],nper*recv[m].nunpack,datatype, + MPI_Irecv(&buf2[offset],nper*recv[m].nunpack,datatype, recv[m].proc,0,gridcomm,&requests[m]); } // perform all sends to other procs for (m = 0; m < nsend; m++) { - KokkosBaseFFT->pack_forward_grid_kokkos(which,k_buf1,send[m].npack,k_send_packlist,m); + kspaceKKBase->pack_forward_grid_kokkos(which,k_buf1,send[m].npack,k_send_packlist,m); DeviceType().fence(); if (!lmp->kokkos->cuda_aware_flag) { @@ -750,8 +748,8 @@ forward_comm_kspace_tiled(KSpace *kspace, int nper, int nbyte, int which, // perform all copies to self for (m = 0; m < ncopy; m++) { - KokkosBaseFFT->pack_forward_grid_kokkos(which,k_buf1,copy[m].npack,k_copy_packlist,m); - KokkosBaseFFT->unpack_forward_grid_kokkos(which,k_buf1,0,copy[m].nunpack,k_copy_unpacklist,m); + kspaceKKBase->pack_forward_grid_kokkos(which,k_buf1,copy[m].npack,k_copy_packlist,m); + kspaceKKBase->unpack_forward_grid_kokkos(which,k_buf1,0,copy[m].nunpack,k_copy_unpacklist,m); } // unpack all received data @@ -765,7 +763,7 @@ forward_comm_kspace_tiled(KSpace *kspace, int nper, int nbyte, int which, } offset = nper * recv[m].offset * nbyte; - KokkosBaseFFT->unpack_forward_grid_kokkos(which,k_buf2,offset, + kspaceKKBase->unpack_forward_grid_kokkos(which,k_buf2,offset, recv[m].nunpack,k_recv_unpacklist,m); DeviceType().fence(); } @@ -778,7 +776,7 @@ forward_comm_kspace_tiled(KSpace *kspace, int nper, int nbyte, int 
which, template void GridCommKokkos::reverse_comm_kspace(KSpace *kspace, int nper, int nbyte, int which, - FFT_DAT::tdual_FFT_SCALAR_1d k_buf1, FFT_DAT::tdual_FFT_SCALAR_1d k_buf2, MPI_Datatype datatype) + FFT_DAT::tdual_FFT_SCALAR_1d &k_buf1, FFT_DAT::tdual_FFT_SCALAR_1d &k_buf2, MPI_Datatype datatype) { if (layout == REGULAR) reverse_comm_kspace_regular(kspace,nper,nbyte,which,k_buf1,k_buf2,datatype); @@ -791,7 +789,7 @@ void GridCommKokkos::reverse_comm_kspace(KSpace *kspace, int nper, i template void GridCommKokkos:: reverse_comm_kspace_regular(KSpace *kspace, int nper, int nbyte, int which, - FFT_DAT::tdual_FFT_SCALAR_1d k_buf1, FFT_DAT::tdual_FFT_SCALAR_1d k_buf2, MPI_Datatype datatype) + FFT_DAT::tdual_FFT_SCALAR_1d &k_buf1, FFT_DAT::tdual_FFT_SCALAR_1d &k_buf2, MPI_Datatype datatype) { int m; MPI_Request request; @@ -809,9 +807,9 @@ reverse_comm_kspace_regular(KSpace *kspace, int nper, int nbyte, int which, for (m = nswap-1; m >= 0; m--) { if (swap[m].recvproc == me) - KokkosBaseFFT->pack_reverse_grid(which,k_buf2,swap[m].nunpack,k_swap_unpacklist,m); + kspaceKKBase->pack_reverse_grid_kokkos(which,k_buf2,swap[m].nunpack,k_swap_unpacklist,m); else - KokkosBaseFFT->pack_reverse_grid(which,k_buf1,swap[m].nunpack,k_swap_unpacklist,m); + kspaceKKBase->pack_reverse_grid_kokkos(which,k_buf1,swap[m].nunpack,k_swap_unpacklist,m); DeviceType().fence(); if (swap[m].recvproc != me) { @@ -834,7 +832,7 @@ reverse_comm_kspace_regular(KSpace *kspace, int nper, int nbyte, int which, } } - KokkosBaseFFT->unpack_reverse_grid_kokkos(which,k_buf2,0,swap[m].npack,k_swap_packlist,m); + kspaceKKBase->unpack_reverse_grid_kokkos(which,k_buf2,0,swap[m].npack,k_swap_packlist,m); DeviceType().fence(); } } @@ -844,7 +842,7 @@ reverse_comm_kspace_regular(KSpace *kspace, int nper, int nbyte, int which, template void GridCommKokkos:: reverse_comm_kspace_tiled(KSpace *kspace, int nper, int nbyte, int which, - FFT_DAT::tdual_FFT_SCALAR_1d k_buf1, FFT_DAT::tdual_FFT_SCALAR_1d k_buf2, MPI_Datatype datatype) + FFT_DAT::tdual_FFT_SCALAR_1d &k_buf1, FFT_DAT::tdual_FFT_SCALAR_1d &k_buf2, MPI_Datatype datatype) { int i,m,offset; @@ -866,14 +864,14 @@ reverse_comm_kspace_tiled(KSpace *kspace, int nper, int nbyte, int which, for (m = 0; m < nsend; m++) { offset = nper * send[m].offset * nbyte; - MPI_Irecv(buf2[offset],nper*send[m].npack,datatype, + MPI_Irecv(&buf2[offset],nper*send[m].npack,datatype, send[m].proc,0,gridcomm,&requests[m]); } // perform all sends to other procs for (m = 0; m < nrecv; m++) { - KokkosBaseFFT->pack_reverse_grid(which,k_buf1,recv[m].nunpack,k_recv_unpacklist,m); + kspaceKKBase->pack_reverse_grid_kokkos(which,k_buf1,recv[m].nunpack,k_recv_unpacklist,m); DeviceType().fence(); if (!lmp->kokkos->cuda_aware_flag) { @@ -887,8 +885,8 @@ reverse_comm_kspace_tiled(KSpace *kspace, int nper, int nbyte, int which, // perform all copies to self for (m = 0; m < ncopy; m++) { - KokkosBaseFFT->pack_reverse_grid(which,k_buf1,copy[m].nunpack,k_copy_unpacklist,m); - KokkosBaseFFT->unpack_reverse_grid_kokkos(which,k_buf1,0,copy[m].npack,k_copy_packlist,m); + kspaceKKBase->pack_reverse_grid_kokkos(which,k_buf1,copy[m].nunpack,k_copy_unpacklist,m); + kspaceKKBase->unpack_reverse_grid_kokkos(which,k_buf1,0,copy[m].npack,k_copy_packlist,m); } // unpack all received data @@ -902,7 +900,7 @@ reverse_comm_kspace_tiled(KSpace *kspace, int nper, int nbyte, int which, } offset = nper * send[m].offset * nbyte; - KokkosBaseFFT->unpack_reverse_grid_kokkos(which,k_buf2,offset, + 
kspaceKKBase->unpack_reverse_grid_kokkos(which,k_buf2,offset, send[m].npack,k_send_packlist,m); DeviceType().fence(); } @@ -923,9 +921,9 @@ void GridCommKokkos::grow_swap() swap = (Swap *) memory->srealloc(swap,maxswap*sizeof(Swap),"GridCommKokkos:swap"); - if (!k_swap_packlist.data() { - k_swap_packlist = DAT::t_int_1d("GridCommKokkos:swap_packlist",maxswap,k_swap_packlist.extent(1)); - k_swap_unpacklist = DAT::t_int_1d("GridCommKokkos:swap_unpacklist",maxswap,k_swap_unpacklist.extent(1)); + if (!k_swap_packlist.d_view.data()) { + k_swap_packlist = DAT::tdual_int_2d("GridCommKokkos:swap_packlist",maxswap,k_swap_packlist.extent(1)); + k_swap_unpacklist = DAT::tdual_int_2d("GridCommKokkos:swap_unpacklist",maxswap,k_swap_unpacklist.extent(1)); } else { k_swap_packlist.resize(maxswap,k_swap_packlist.extent(1)); k_swap_unpacklist.resize(maxswap,k_swap_unpacklist.extent(1)); diff --git a/src/KOKKOS/gridcomm_kokkos.h b/src/KOKKOS/gridcomm_kokkos.h index 5a057f8c6b..3148df36a5 100644 --- a/src/KOKKOS/gridcomm_kokkos.h +++ b/src/KOKKOS/gridcomm_kokkos.h @@ -34,12 +34,10 @@ class GridCommKokkos : public GridComm { int, int, int, int, int, int, int, int, int, int, int, int); ~GridCommKokkos(); - void setup(int &, int &); - int ghost_adjacent(); void forward_comm_kspace(class KSpace *, int, int, int, - void *, void *, MPI_Datatype); + FFT_DAT::tdual_FFT_SCALAR_1d &, FFT_DAT::tdual_FFT_SCALAR_1d &, MPI_Datatype); void reverse_comm_kspace(class KSpace *, int, int, int, - void *, void *, MPI_Datatype); + FFT_DAT::tdual_FFT_SCALAR_1d &, FFT_DAT::tdual_FFT_SCALAR_1d &, MPI_Datatype); private: DAT::tdual_int_2d k_swap_packlist; @@ -60,13 +58,13 @@ class GridCommKokkos : public GridComm { void setup_tiled(int &, int &); void forward_comm_kspace_regular(class KSpace *, int, int, int, - void *, void *, MPI_Datatype); + FFT_DAT::tdual_FFT_SCALAR_1d &, FFT_DAT::tdual_FFT_SCALAR_1d &, MPI_Datatype); void forward_comm_kspace_tiled(class KSpace *, int, int, int, - void *, void *, MPI_Datatype); + FFT_DAT::tdual_FFT_SCALAR_1d &, FFT_DAT::tdual_FFT_SCALAR_1d &, MPI_Datatype); void reverse_comm_kspace_regular(class KSpace *, int, int, int, - void *, void *, MPI_Datatype); + FFT_DAT::tdual_FFT_SCALAR_1d &, FFT_DAT::tdual_FFT_SCALAR_1d &, MPI_Datatype); void reverse_comm_kspace_tiled(class KSpace *, int, int, int, - void *, void *, MPI_Datatype); + FFT_DAT::tdual_FFT_SCALAR_1d &, FFT_DAT::tdual_FFT_SCALAR_1d &, MPI_Datatype); void grow_swap(); diff --git a/src/KOKKOS/kokkos_base_fft.h b/src/KOKKOS/kokkos_base_fft.h index 2814e9c0bc..de3a293126 100644 --- a/src/KOKKOS/kokkos_base_fft.h +++ b/src/KOKKOS/kokkos_base_fft.h @@ -23,10 +23,10 @@ class KokkosBaseFFT { KokkosBaseFFT() {} //Kspace - virtual void pack_forward_kspace_kokkos(int, FFT_DAT::tdual_FFT_SCALAR_1d &, int, DAT::tdual_int_2d &, int) {}; - virtual void unpack_forward_kspace_kokkos(int, FFT_DAT::tdual_FFT_SCALAR_1d &, int, int, DAT::tdual_int_2d &, int) {}; - virtual void pack_reverse_kspace_kokkos(int, FFT_DAT::tdual_FFT_SCALAR_1d &, int, DAT::tdual_int_2d &, int) {}; - virtual void unpack_reverse_kspace_kokkos(int, FFT_DAT::tdual_FFT_SCALAR_1d &, int, int, DAT::tdual_int_2d &, int) {}; + virtual void pack_forward_grid_kokkos(int, FFT_DAT::tdual_FFT_SCALAR_1d &, int, DAT::tdual_int_2d &, int) {}; + virtual void unpack_forward_grid_kokkos(int, FFT_DAT::tdual_FFT_SCALAR_1d &, int, int, DAT::tdual_int_2d &, int) {}; + virtual void pack_reverse_grid_kokkos(int, FFT_DAT::tdual_FFT_SCALAR_1d &, int, DAT::tdual_int_2d &, int) {}; + virtual void 
unpack_reverse_grid_kokkos(int, FFT_DAT::tdual_FFT_SCALAR_1d &, int, int, DAT::tdual_int_2d &, int) {}; }; } diff --git a/src/KOKKOS/pppm_kokkos.cpp b/src/KOKKOS/pppm_kokkos.cpp index 2893c5c5b3..2167b1d8c7 100644 --- a/src/KOKKOS/pppm_kokkos.cpp +++ b/src/KOKKOS/pppm_kokkos.cpp @@ -96,8 +96,7 @@ PPPMKokkos::PPPMKokkos(LAMMPS *lmp) : PPPM(lmp) fft1 = fft2 = NULL; remap = NULL; - cg = NULL; - cg_peratom = NULL; + gc = NULL; nmax = 0; //part2grid = NULL; @@ -255,9 +254,7 @@ void PPPMKokkos::init() // or overlap is allowed, then done // else reduce order and try again - int (*procneigh)[2] = comm->procneigh; - - GridCommKokkos *cgtmp = NULL; + GridCommKokkos *gctmp = NULL; int iteration = 0; while (order >= minorder) { @@ -269,24 +266,23 @@ void PPPMKokkos::init() set_grid_local(); if (overlap_allowed) break; - cgtmp = new GridCommKokkos(lmp,world,1,1, + gctmp = new GridCommKokkos(lmp,world,nx_pppm,ny_pppm,nz_pppm, nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, - nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out, - procneigh[0][0],procneigh[0][1],procneigh[1][0], - procneigh[1][1],procneigh[2][0],procneigh[2][1]); - cgtmp->ghost_notify(); - if (!cgtmp->ghost_overlap()) break; - delete cgtmp; + nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out); + int tmp1,tmp2; + gctmp->setup(tmp1,tmp2); + if (!gctmp->ghost_adjacent()) break; + delete gctmp; order--; iteration++; } if (order < minorder) error->all(FLERR,"PPPM order < minimum allowed order"); - if (!overlap_allowed && cgtmp->ghost_overlap()) + if (!overlap_allowed && gctmp->ghost_adjacent()) error->all(FLERR,"PPPM grid stencil extends " "beyond nearest neighbor processor"); - if (cgtmp) delete cgtmp; + if (gctmp) delete gctmp; // adjust g_ewald @@ -320,8 +316,6 @@ void PPPMKokkos::init() // don't invoke allocate peratom(), will be allocated when needed allocate(); - cg->ghost_notify(); - cg->setup(); // pre-compute Green's function denomiator expansion // pre-compute 1d charge distribution coefficients @@ -564,11 +558,9 @@ void PPPMKokkos::setup_grid() allocate(); - cg->ghost_notify(); - if (overlap_allowed == 0 && cg->ghost_overlap()) + if (!overlap_allowed && !gc->ghost_adjacent()) error->all(FLERR,"PPPM grid stencil extends " "beyond nearest neighbor processor"); - cg->setup(); // pre-compute Green's function denomiator expansion // pre-compute 1d charge distribution coefficients @@ -576,7 +568,7 @@ void PPPMKokkos::setup_grid() compute_gf_denom(); compute_rho_coeff(); - // pre-compute volume-dependent coeffs + // pre-compute volume-dependent coeffs for portion of grid I now own setup(); } @@ -609,11 +601,8 @@ void PPPMKokkos::compute(int eflag, int vflag) d_vatom = k_vatom.view(); } - if (evflag_atom && !peratom_allocate_flag) { + if (evflag_atom && !peratom_allocate_flag) allocate_peratom(); - cg_peratom->ghost_notify(); - cg_peratom->setup(); - } x = atomKK->k_x.view(); f = atomKK->k_f.view(); @@ -667,7 +656,8 @@ void PPPMKokkos::compute(int eflag, int vflag) // to fully sum contribution in their 3d bricks // remap from 3d decomposition to FFT decomposition - cg->reverse_comm(this,REVERSE_RHO); + gc->reverse_comm_kspace(this,1,sizeof(FFT_SCALAR),REVERSE_RHO, + k_gc_buf1,k_gc_buf2,MPI_FFT_SCALAR); brick2fft(); // compute potential gradient on my FFT grid and @@ -680,12 +670,14 @@ void PPPMKokkos::compute(int eflag, int vflag) // all procs communicate E-field values // to fill ghost cells surrounding their 3d bricks - cg->forward_comm(this,FORWARD_IK); + gc->forward_comm_kspace(this,3,sizeof(FFT_SCALAR),FORWARD_IK, + 
k_gc_buf1,k_gc_buf2,MPI_FFT_SCALAR); // extra per-atom energy/virial communication if (evflag_atom) - cg_peratom->forward_comm(this,FORWARD_IK_PERATOM); + gc->forward_comm_kspace(this,7,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM, + k_gc_buf1,k_gc_buf2,MPI_FFT_SCALAR); // calculate the force on my particles @@ -731,6 +723,7 @@ void PPPMKokkos::compute(int eflag, int vflag) copymode = 1; Kokkos::parallel_for(Kokkos::RangePolicy(0,nlocal),*this); copymode = 0; + copymode = 0; //for (i = nlocal; i < ntotal; i++) d_eatom[i] *= 0.5*qscale; } @@ -844,14 +837,19 @@ void PPPMKokkos::allocate() 1,0,0,FFT_PRECISION,collective_flag,cuda_aware_flag); // create ghost grid object for rho and electric field communication + // also create 2 bufs for ghost grid cell comm, passed to GridComm methods - int (*procneigh)[2] = comm->procneigh; - - cg = new GridCommKokkos(lmp,world,3,1, + gc = new GridCommKokkos(lmp,world,nx_pppm,ny_pppm,nz_pppm, nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, - nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out, - procneigh[0][0],procneigh[0][1],procneigh[1][0], - procneigh[1][1],procneigh[2][0],procneigh[2][1]); + nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out); + + gc->setup(ngc_buf1,ngc_buf2); + + if (differentiation_flag) npergrid = 1; + else npergrid = 3; + + k_gc_buf1 = FFT_DAT::tdual_FFT_SCALAR_1d("pppm:gc_buf1",npergrid*ngc_buf1); + k_gc_buf2 = FFT_DAT::tdual_FFT_SCALAR_1d("pppm:gc_buf2",npergrid*ngc_buf2); } /* ---------------------------------------------------------------------- @@ -876,8 +874,8 @@ void PPPMKokkos::deallocate() fft2 = NULL; delete remap; remap = NULL; - delete cg; - cg = NULL; + delete gc; + gc = NULL; } /* ---------------------------------------------------------------------- @@ -899,16 +897,14 @@ void PPPMKokkos::allocate_peratom() d_v5_brick = typename FFT_AT::t_FFT_SCALAR_3d("pppm:d_v5_brick",nzhi_out-nzlo_out+1,nyhi_out-nylo_out+1,nxhi_out-nxlo_out+1); - // create ghost grid object for rho and electric field communication + // use same GC ghost grid object for peratom grid communication + // but need to reallocate a larger gc_buf1 and gc_buf2 - int (*procneigh)[2] = comm->procneigh; + if (differentiation_flag) npergrid = 6; + else npergrid = 7; - cg_peratom = - new GridCommKokkos(lmp,world,7,1, - nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, - nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out, - procneigh[0][0],procneigh[0][1],procneigh[1][0], - procneigh[1][1],procneigh[2][0],procneigh[2][1]); + k_gc_buf1 = FFT_DAT::tdual_FFT_SCALAR_1d("pppm:gc_buf1",npergrid*ngc_buf1); + k_gc_buf2 = FFT_DAT::tdual_FFT_SCALAR_1d("pppm:gc_buf2",npergrid*ngc_buf2); } /* ---------------------------------------------------------------------- @@ -919,9 +915,6 @@ template void PPPMKokkos::deallocate_peratom() { peratom_allocate_flag = 0; - - delete cg_peratom; - cg_peratom = NULL; } /* ---------------------------------------------------------------------- @@ -1185,153 +1178,11 @@ double PPPMKokkos::final_accuracy() template void PPPMKokkos::set_grid_local() { - // global indices of PPPM grid range from 0 to N-1 - // nlo_in,nhi_in = lower/upper limits of the 3d sub-brick of - // global PPPM grid that I own without ghost cells - // for slab PPPM, assign z grid as if it were not extended + PPPM::set_grid_local(); - nxlo_in = static_cast (comm->xsplit[comm->myloc[0]] * nx_pppm); - nxhi_in = static_cast (comm->xsplit[comm->myloc[0]+1] * nx_pppm) - 1; - - nylo_in = static_cast (comm->ysplit[comm->myloc[1]] * ny_pppm); - nyhi_in = static_cast 
(comm->ysplit[comm->myloc[1]+1] * ny_pppm) - 1; - - nzlo_in = static_cast - (comm->zsplit[comm->myloc[2]] * nz_pppm/slab_volfactor); - nzhi_in = static_cast - (comm->zsplit[comm->myloc[2]+1] * nz_pppm/slab_volfactor) - 1; - - // nlower,nupper = stencil size for mapping particles to PPPM grid - - nlower = -(order-1)/2; - nupper = order/2; - - // shift values for particle <-> grid mapping - // add/subtract OFFSET to avoid int(-0.75) = 0 when want it to be -1 - - if (order % 2) shift = OFFSET + 0.5; - else shift = OFFSET; - if (order % 2) shiftone = 0.0; - else shiftone = 0.5; - - // nlo_out,nhi_out = lower/upper limits of the 3d sub-brick of - // global PPPM grid that my particles can contribute charge to - // effectively nlo_in,nhi_in + ghost cells - // nlo,nhi = global coords of grid pt to "lower left" of smallest/largest - // position a particle in my box can be at - // dist[3] = particle position bound = subbox + skin/2.0 + qdist - // qdist = offset due to TIP4P fictitious charge - // convert to triclinic if necessary - // nlo_out,nhi_out = nlo,nhi + stencil size for particle mapping - // for slab PPPM, assign z grid as if it were not extended - - double *prd,*sublo,*subhi; - - if (triclinic == 0) { - prd = domain->prd; - boxlo[0] = domain->boxlo[0]; - boxlo[1] = domain->boxlo[1]; - boxlo[2] = domain->boxlo[2]; - sublo = domain->sublo; - subhi = domain->subhi; - } else { - prd = domain->prd_lamda; - boxlo[0] = domain->boxlo_lamda[0]; - boxlo[1] = domain->boxlo_lamda[1]; - boxlo[2] = domain->boxlo_lamda[2]; - domain->x2lamda(atomKK->nlocal); - sublo = domain->sublo_lamda; - subhi = domain->subhi_lamda; - } - - double xprd = prd[0]; - double yprd = prd[1]; - double zprd = prd[2]; - double zprd_slab = zprd*slab_volfactor; - - double dist[3]; - double cuthalf = 0.5*neighbor->skin + qdist; - if (triclinic == 0) dist[0] = dist[1] = dist[2] = cuthalf; - else kspacebbox(cuthalf,&dist[0]); - - int nlo,nhi; - - nlo = static_cast ((sublo[0]-dist[0]-boxlo[0]) * - nx_pppm/xprd + shift) - OFFSET; - nhi = static_cast ((subhi[0]+dist[0]-boxlo[0]) * - nx_pppm/xprd + shift) - OFFSET; - nxlo_out = nlo + nlower; - nxhi_out = nhi + nupper; - - nlo = static_cast ((sublo[1]-dist[1]-boxlo[1]) * - ny_pppm/yprd + shift) - OFFSET; - nhi = static_cast ((subhi[1]+dist[1]-boxlo[1]) * - ny_pppm/yprd + shift) - OFFSET; - nylo_out = nlo + nlower; - nyhi_out = nhi + nupper; - - nlo = static_cast ((sublo[2]-dist[2]-boxlo[2]) * - nz_pppm/zprd_slab + shift) - OFFSET; - nhi = static_cast ((subhi[2]+dist[2]-boxlo[2]) * - nz_pppm/zprd_slab + shift) - OFFSET; - nzlo_out = nlo + nlower; - nzhi_out = nhi + nupper; - - // for slab PPPM, change the grid boundary for processors at +z end - // to include the empty volume between periodically repeating slabs - // for slab PPPM, want charge data communicated from -z proc to +z proc, - // but not vice versa, also want field data communicated from +z proc to - // -z proc, but not vice versa - // this is accomplished by nzhi_in = nzhi_out on +z end (no ghost cells) - // also insure no other procs use ghost cells beyond +z limit - - if (slabflag == 1) { - if (comm->myloc[2] == comm->procgrid[2]-1) - nzhi_in = nzhi_out = nz_pppm - 1; - nzhi_out = MIN(nzhi_out,nz_pppm-1); - } - - // decomposition of FFT mesh - // global indices range from 0 to N-1 - // proc owns entire x-dimension, clumps of columns in y,z dimensions - // npey_fft,npez_fft = # of procs in y,z dims - // if nprocs is small enough, proc can own 1 or more entire xy planes, - // else proc owns 2d sub-blocks of yz plane - // 
me_y,me_z = which proc (0-npe_fft-1) I am in y,z dimensions - // nlo_fft,nhi_fft = lower/upper limit of the section - // of the global FFT mesh that I own - - int npey_fft,npez_fft; - if (nz_pppm >= nprocs) { - npey_fft = 1; - npez_fft = nprocs; - } else procs2grid2d(nprocs,ny_pppm,nz_pppm,&npey_fft,&npez_fft); - - int me_y = me % npey_fft; - int me_z = me / npey_fft; - - nxlo_fft = 0; - nxhi_fft = nx_pppm - 1; - nylo_fft = me_y*ny_pppm/npey_fft; - nyhi_fft = (me_y+1)*ny_pppm/npey_fft - 1; - nzlo_fft = me_z*nz_pppm/npez_fft; - nzhi_fft = (me_z+1)*nz_pppm/npez_fft - 1; - - // PPPM grid pts owned by this proc, including ghosts - - ngrid = (nxhi_out-nxlo_out+1) * (nyhi_out-nylo_out+1) * - (nzhi_out-nzlo_out+1); - - // FFT grids owned by this proc, without ghosts - // nfft = FFT points in FFT decomposition on this proc - // nfft_brick = FFT points in 3d brick-decomposition on this proc - // nfft_both = greater of 2 values - - nfft = (nxhi_fft-nxlo_fft+1) * (nyhi_fft-nylo_fft+1) * - (nzhi_fft-nzlo_fft+1); - int nfft_brick = (nxhi_in-nxlo_in+1) * (nyhi_in-nylo_in+1) * - (nzhi_in-nzlo_in+1); - nfft_both = MAX(nfft,nfft_brick); + boxlo[0] = domain->boxlo[0]; + boxlo[1] = domain->boxlo[1]; + boxlo[2] = domain->boxlo[2]; } /* ---------------------------------------------------------------------- @@ -2653,7 +2504,7 @@ void PPPMKokkos::operator()(TagPPPM_unpack_forward1, const int &i) c const int iz = (int) (dlist/(nx*ny)); const int iy = (int) ((dlist - iz*nx*ny)/nx); const int ix = d_list_index[i] - iz*nx*ny - iy*nx; - d_vdx_brick(iz,iy,ix) = d_buf[3*i + unpack_offset]; + d_vdx_brick(iz,iy,ix) = d_buf[3*i + unpack_offset]; d_vdy_brick(iz,iy,ix) = d_buf[3*i+1 + unpack_offset]; d_vdz_brick(iz,iy,ix) = d_buf[3*i+2 + unpack_offset]; } @@ -2668,12 +2519,12 @@ void PPPMKokkos::operator()(TagPPPM_unpack_forward2, const int &i) c const int ix = d_list_index[i] - iz*nx*ny - iy*nx; if (eflag_atom) d_u_brick(iz,iy,ix) = d_buf[7*i]; if (vflag_atom) { - d_v0_brick(iz,iy,ix) = d_buf[7*i+1]; - d_v1_brick(iz,iy,ix) = d_buf[7*i+2]; - d_v2_brick(iz,iy,ix) = d_buf[7*i+3]; - d_v3_brick(iz,iy,ix) = d_buf[7*i+4]; - d_v4_brick(iz,iy,ix) = d_buf[7*i+5]; - d_v5_brick(iz,iy,ix) = d_buf[7*i+6]; + d_v0_brick(iz,iy,ix) = d_buf[7*i+1 + unpack_offset]; + d_v1_brick(iz,iy,ix) = d_buf[7*i+2 + unpack_offset]; + d_v2_brick(iz,iy,ix) = d_buf[7*i+3 + unpack_offset]; + d_v3_brick(iz,iy,ix) = d_buf[7*i+4 + unpack_offset]; + d_v4_brick(iz,iy,ix) = d_buf[7*i+5 + unpack_offset]; + d_v5_brick(iz,iy,ix) = d_buf[7*i+6 + unpack_offset]; } } @@ -3046,7 +2897,9 @@ double PPPMKokkos::memory_usage() if (peratom_allocate_flag) bytes += 6 * nbrick * sizeof(FFT_SCALAR); - if (cg) bytes += cg->memory_usage(); + // two GridComm bufs + + bytes += (ngc_buf1 + ngc_buf2) * npergrid * sizeof(FFT_SCALAR); return bytes; } diff --git a/src/KOKKOS/pppm_kokkos.h b/src/KOKKOS/pppm_kokkos.h index 4e98826105..b2c7dbe852 100644 --- a/src/KOKKOS/pppm_kokkos.h +++ b/src/KOKKOS/pppm_kokkos.h @@ -354,10 +354,14 @@ class PPPMKokkos : public PPPM, public KokkosBaseFFT { //double **acons; typename Kokkos::DualView::t_host acons; + // FFTs and grid communication + FFT3dKokkos *fft1,*fft2; RemapKokkos *remap; - GridCommKokkos *cg; - GridCommKokkos *cg_peratom; + GridCommKokkos *gc; + + FFT_DAT::tdual_FFT_SCALAR_1d k_gc_buf1,k_gc_buf2; + int ngc_buf1,ngc_buf2,npergrid; //int **part2grid; // storage for particle -> grid mapping typename AT::t_int_1d_3 d_part2grid; @@ -404,10 +408,10 @@ class PPPMKokkos : public PPPM, public KokkosBaseFFT { // grid communication - void 
pack_forward_kspace_kokkos(int, FFT_DAT::tdual_FFT_SCALAR_1d &, int, DAT::tdual_int_2d &, int); - void unpack_forward_kspace_kokkos(int, FFT_DAT::tdual_FFT_SCALAR_1d &, int, int, DAT::tdual_int_2d &, int); - void pack_reverse_kspace_kokkos(int, FFT_DAT::tdual_FFT_SCALAR_1d &, int, DAT::tdual_int_2d &, int); - void unpack_reverse_kspace_kokkos(int, FFT_DAT::tdual_FFT_SCALAR_1d &, int, int, DAT::tdual_int_2d &, int); + void pack_forward_grid_kokkos(int, FFT_DAT::tdual_FFT_SCALAR_1d &, int, DAT::tdual_int_2d &, int); + void unpack_forward_grid_kokkos(int, FFT_DAT::tdual_FFT_SCALAR_1d &, int, int, DAT::tdual_int_2d &, int); + void pack_reverse_grid_kokkos(int, FFT_DAT::tdual_FFT_SCALAR_1d &, int, DAT::tdual_int_2d &, int); + void unpack_reverse_grid_kokkos(int, FFT_DAT::tdual_FFT_SCALAR_1d &, int, int, DAT::tdual_int_2d &, int); // triclinic From fcec5c35ea4eba4ca6d8a7b31ba6427a5237e6e0 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Wed, 19 Aug 2020 08:57:46 -0600 Subject: [PATCH 11/38] Remove error check in pppm_kokkos --- src/KOKKOS/pppm_kokkos.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/KOKKOS/pppm_kokkos.cpp b/src/KOKKOS/pppm_kokkos.cpp index 2167b1d8c7..ccff40ca0c 100644 --- a/src/KOKKOS/pppm_kokkos.cpp +++ b/src/KOKKOS/pppm_kokkos.cpp @@ -191,9 +191,6 @@ void PPPMKokkos::init() "slab correction"); if (domain->dimension == 2) error->all(FLERR, "Cannot use PPPM with 2d simulation"); - if (comm->style != 0) - error->universe_all(FLERR,"PPPM can only currently be used with " - "comm_style brick"); if (!atomKK->q_flag) error->all(FLERR,"Kspace style requires atom attribute q"); From 3a8b2aef17237b41b0d5b1981c733e49bd25a86d Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Wed, 19 Aug 2020 09:45:19 -0600 Subject: [PATCH 12/38] Small tweaks --- src/KOKKOS/gridcomm_kokkos.cpp | 38 ++++++++++++++++++---------------- src/KSPACE/gridcomm.cpp | 10 ++++----- 2 files changed, 25 insertions(+), 23 deletions(-) diff --git a/src/KOKKOS/gridcomm_kokkos.cpp b/src/KOKKOS/gridcomm_kokkos.cpp index 3c9e61ed00..f65a0324a5 100644 --- a/src/KOKKOS/gridcomm_kokkos.cpp +++ b/src/KOKKOS/gridcomm_kokkos.cpp @@ -422,7 +422,7 @@ void GridCommKokkos::setup_tiled(int &nbuf1, int &nbuf2) // dim is -1 for proc 0, but never accessed rcbinfo = (RCBinfo *) - memory->smalloc(nprocs*sizeof(RCBinfo),"GridCommKokkos:rcbinfo"); + memory->smalloc(nprocs*sizeof(RCBinfo),"GridComm:rcbinfo"); RCBinfo rcbone; rcbone.dim = comm->rcbcutdim; if (rcbone.dim <= 0) rcbone.cut = inxlo; @@ -445,7 +445,7 @@ void GridCommKokkos::setup_tiled(int &nbuf1, int &nbuf2) pbc[0] = pbc[1] = pbc[2] = 0; - memory->create(overlap_procs,nprocs,"GridCommKokkos:overlap_procs"); + memory->create(overlap_procs,nprocs,"GridComm:overlap_procs"); noverlap = maxoverlap = 0; overlap = NULL; @@ -456,9 +456,9 @@ void GridCommKokkos::setup_tiled(int &nbuf1, int &nbuf2) // ncopy = # of overlaps with myself, across a periodic boundary int *proclist; - memory->create(proclist,noverlap,"GridCommKokkos:proclist"); + memory->create(proclist,noverlap,"GridComm:proclist"); srequest = (Request *) - memory->smalloc(noverlap*sizeof(Request),"GridCommKokkos:srequest"); + memory->smalloc(noverlap*sizeof(Request),"GridComm:srequest"); int nsend_request = 0; ncopy = 0; @@ -478,21 +478,21 @@ void GridCommKokkos::setup_tiled(int &nbuf1, int &nbuf2) Irregular *irregular = new Irregular(lmp); int nrecv_request = irregular->create_data(nsend_request,proclist,1); Request *rrequest = - (Request *) memory->smalloc(nrecv_request*sizeof(Request),"GridCommKokkos:rrequest"); + 
(Request *) memory->smalloc(nrecv_request*sizeof(Request),"GridComm:rrequest"); irregular->exchange_data((char *) srequest,sizeof(Request),(char *) rrequest); irregular->destroy_data(); // compute overlaps between received ghost boxes and my owned box // overlap box used to setup my Send data struct and respond to requests - send = (Send *) memory->smalloc(nrecv_request*sizeof(Send),"GridCommKokkos:send"); + send = (Send *) memory->smalloc(nrecv_request*sizeof(Send),"GridComm:send"); - k_send_packlist = DAT::tdual_int_2d("GridCommKokkos:send_packlist",nrecv_request,k_send_packlist.extent(1)); + k_send_packlist = DAT::tdual_int_2d("GridComm:send_packlist",nrecv_request,k_send_packlist.extent(1)); sresponse = (Response *) - memory->smalloc(nrecv_request*sizeof(Response),"GridCommKokkos:sresponse"); + memory->smalloc(nrecv_request*sizeof(Response),"GridComm:sresponse"); memory->destroy(proclist); - memory->create(proclist,nrecv_request,"GridCommKokkos:proclist"); + memory->create(proclist,nrecv_request,"GridComm:proclist"); for (m = 0; m < nrecv_request; m++) { send[m].proc = rrequest[m].sender; @@ -522,7 +522,7 @@ void GridCommKokkos::setup_tiled(int &nbuf1, int &nbuf2) int nsend_response = nrecv_request; int nrecv_response = irregular->create_data(nsend_response,proclist,1); Response *rresponse = - (Response *) memory->smalloc(nrecv_response*sizeof(Response),"GridCommKokkos:rresponse"); + (Response *) memory->smalloc(nrecv_response*sizeof(Response),"GridComm:rresponse"); irregular->exchange_data((char *) sresponse,sizeof(Response),(char *) rresponse); irregular->destroy_data(); delete irregular; @@ -531,9 +531,9 @@ void GridCommKokkos::setup_tiled(int &nbuf1, int &nbuf2) // box used to setup my Recv data struct after unwrapping via PBC // adjacent = 0 if any box of ghost cells does not adjoin my owned cells - recv = (Recv *) memory->smalloc(nrecv_response*sizeof(Recv),"GridCommKokkos:recv"); + recv = (Recv *) memory->smalloc(nrecv_response*sizeof(Recv),"GridComm:recv"); - k_recv_unpacklist = DAT::tdual_int_2d("GridCommKokkos:recv_unpacklist",nrecv_response,k_recv_unpacklist.extent(1)); + k_recv_unpacklist = DAT::tdual_int_2d("GridComm:recv_unpacklist",nrecv_response,k_recv_unpacklist.extent(1)); adjacent = 1; @@ -557,10 +557,10 @@ void GridCommKokkos::setup_tiled(int &nbuf1, int &nbuf2) // create Copy data struct from overlaps with self - copy = (Copy *) memory->smalloc(ncopy*sizeof(Copy),"GridCommKokkos:copy"); + copy = (Copy *) memory->smalloc(ncopy*sizeof(Copy),"GridComm:copy"); - k_copy_packlist = DAT::tdual_int_2d("GridCommKokkos:copy_packlist",ncopy,k_copy_packlist.extent(1)); - k_copy_unpacklist = DAT::tdual_int_2d("GridCommKokkos:copy_unpacklist",ncopy,k_copy_unpacklist.extent(1)); + k_copy_packlist = DAT::tdual_int_2d("GridComm:copy_packlist",ncopy,k_copy_packlist.extent(1)); + k_copy_unpacklist = DAT::tdual_int_2d("GridComm:copy_unpacklist",ncopy,k_copy_unpacklist.extent(1)); ncopy = 0; for (m = 0; m < noverlap; m++) { @@ -919,11 +919,11 @@ void GridCommKokkos::grow_swap() { maxswap += SWAPDELTA; swap = (Swap *) - memory->srealloc(swap,maxswap*sizeof(Swap),"GridCommKokkos:swap"); + memory->srealloc(swap,maxswap*sizeof(Swap),"GridComm:swap"); if (!k_swap_packlist.d_view.data()) { - k_swap_packlist = DAT::tdual_int_2d("GridCommKokkos:swap_packlist",maxswap,k_swap_packlist.extent(1)); - k_swap_unpacklist = DAT::tdual_int_2d("GridCommKokkos:swap_unpacklist",maxswap,k_swap_unpacklist.extent(1)); + k_swap_packlist = 
DAT::tdual_int_2d("GridComm:swap_packlist",maxswap,k_swap_packlist.extent(1)); + k_swap_unpacklist = DAT::tdual_int_2d("GridComm:swap_unpacklist",maxswap,k_swap_unpacklist.extent(1)); } else { k_swap_packlist.resize(maxswap,k_swap_packlist.extent(1)); k_swap_unpacklist.resize(maxswap,k_swap_unpacklist.extent(1)); @@ -944,6 +944,8 @@ int GridCommKokkos::indices_kokkos(DAT::tdual_int_2d &k_list, int in if (k_list.extent(1) < nmax) k_list.resize(k_list.extent(0),nmax); + if (nmax == 0) return 0; + int nx = (fullxhi-fullxlo+1); int ny = (fullyhi-fullylo+1); diff --git a/src/KSPACE/gridcomm.cpp b/src/KSPACE/gridcomm.cpp index 51623691a3..49067b09e5 100644 --- a/src/KSPACE/gridcomm.cpp +++ b/src/KSPACE/gridcomm.cpp @@ -643,7 +643,7 @@ void GridComm::setup_tiled(int &nbuf1, int &nbuf2) // box used to setup my Recv data struct after unwrapping via PBC // adjacent = 0 if any box of ghost cells does not adjoin my owned cells - recv = (Recv *) memory->smalloc(nrecv_response*sizeof(Recv),"CommGrid:recv"); + recv = (Recv *) memory->smalloc(nrecv_response*sizeof(Recv),"GridComm:recv"); adjacent = 1; for (i = 0; i < nrecv_response; i++) { @@ -666,7 +666,7 @@ void GridComm::setup_tiled(int &nbuf1, int &nbuf2) // create Copy data struct from overlaps with self - copy = (Copy *) memory->smalloc(ncopy*sizeof(Copy),"CommGrid:copy"); + copy = (Copy *) memory->smalloc(ncopy*sizeof(Copy),"GridComm:copy"); ncopy = 0; for (m = 0; m < noverlap; m++) { @@ -1075,7 +1075,7 @@ void GridComm::grow_swap() { maxswap += SWAPDELTA; swap = (Swap *) - memory->srealloc(swap,maxswap*sizeof(Swap),"CommGrid:swap"); + memory->srealloc(swap,maxswap*sizeof(Swap),"GridComm:swap"); } /* ---------------------------------------------------------------------- @@ -1090,7 +1090,7 @@ void GridComm::grow_overlap() { maxoverlap += SWAPDELTA; overlap = (Overlap *) - memory->srealloc(overlap,maxoverlap*sizeof(Overlap),"CommGrid:overlap"); + memory->srealloc(overlap,maxoverlap*sizeof(Overlap),"GridComm:overlap"); } /* ---------------------------------------------------------------------- @@ -1103,7 +1103,7 @@ int GridComm::indices(int *&list, int xlo, int xhi, int ylo, int yhi, int zlo, int zhi) { int nmax = (xhi-xlo+1) * (yhi-ylo+1) * (zhi-zlo+1); - memory->create(list,nmax,"CommGrid:indices"); + memory->create(list,nmax,"GridComm:indices"); if (nmax == 0) return 0; int nx = (fullxhi-fullxlo+1); From cc0d294478aff1f55f96d6727b71a526686571ae Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Wed, 19 Aug 2020 13:05:23 -0600 Subject: [PATCH 13/38] Fix segfault --- src/KOKKOS/gridcomm_kokkos.cpp | 30 ++++++++++++++---------------- src/KOKKOS/gridcomm_kokkos.h | 12 ++++++------ src/KOKKOS/pppm_kokkos.cpp | 12 +++++------- 3 files changed, 25 insertions(+), 29 deletions(-) diff --git a/src/KOKKOS/gridcomm_kokkos.cpp b/src/KOKKOS/gridcomm_kokkos.cpp index f65a0324a5..04f7b6ae7c 100644 --- a/src/KOKKOS/gridcomm_kokkos.cpp +++ b/src/KOKKOS/gridcomm_kokkos.cpp @@ -642,20 +642,20 @@ void GridCommKokkos::setup_tiled(int &nbuf1, int &nbuf2) ------------------------------------------------------------------------- */ template -void GridCommKokkos::forward_comm_kspace(KSpace *kspace, int nper, int nbyte, int which, +void GridCommKokkos::forward_comm_kspace(KSpace *kspace, int nper, int which, FFT_DAT::tdual_FFT_SCALAR_1d &k_buf1, FFT_DAT::tdual_FFT_SCALAR_1d &k_buf2, MPI_Datatype datatype) { if (layout == REGULAR) - forward_comm_kspace_regular(kspace,nper,nbyte,which,k_buf1,k_buf2,datatype); + 
forward_comm_kspace_regular(kspace,nper,which,k_buf1,k_buf2,datatype); else - forward_comm_kspace_tiled(kspace,nper,nbyte,which,k_buf1,k_buf2,datatype); + forward_comm_kspace_tiled(kspace,nper,which,k_buf1,k_buf2,datatype); } /* ---------------------------------------------------------------------- */ template void GridCommKokkos:: -forward_comm_kspace_regular(KSpace *kspace, int nper, int nbyte, int which, +forward_comm_kspace_regular(KSpace *kspace, int nper, int which, FFT_DAT::tdual_FFT_SCALAR_1d &k_buf1, FFT_DAT::tdual_FFT_SCALAR_1d &k_buf2, MPI_Datatype datatype) { int m; @@ -707,7 +707,7 @@ forward_comm_kspace_regular(KSpace *kspace, int nper, int nbyte, int which, template void GridCommKokkos:: -forward_comm_kspace_tiled(KSpace *kspace, int nper, int nbyte, int which, +forward_comm_kspace_tiled(KSpace *kspace, int nper, int which, FFT_DAT::tdual_FFT_SCALAR_1d &k_buf1, FFT_DAT::tdual_FFT_SCALAR_1d &k_buf2, MPI_Datatype datatype) { int i,m,offset; @@ -726,7 +726,7 @@ forward_comm_kspace_tiled(KSpace *kspace, int nper, int nbyte, int which, // post all receives for (m = 0; m < nrecv; m++) { - offset = nper * recv[m].offset * nbyte; + offset = nper * recv[m].offset; MPI_Irecv(&buf2[offset],nper*recv[m].nunpack,datatype, recv[m].proc,0,gridcomm,&requests[m]); } @@ -762,7 +762,7 @@ forward_comm_kspace_tiled(KSpace *kspace, int nper, int nbyte, int which, k_buf2.sync(); } - offset = nper * recv[m].offset * nbyte; + offset = nper * recv[m].offset; kspaceKKBase->unpack_forward_grid_kokkos(which,k_buf2,offset, recv[m].nunpack,k_recv_unpacklist,m); DeviceType().fence(); @@ -775,20 +775,20 @@ forward_comm_kspace_tiled(KSpace *kspace, int nper, int nbyte, int which, ------------------------------------------------------------------------- */ template -void GridCommKokkos::reverse_comm_kspace(KSpace *kspace, int nper, int nbyte, int which, +void GridCommKokkos::reverse_comm_kspace(KSpace *kspace, int nper, int which, FFT_DAT::tdual_FFT_SCALAR_1d &k_buf1, FFT_DAT::tdual_FFT_SCALAR_1d &k_buf2, MPI_Datatype datatype) { if (layout == REGULAR) - reverse_comm_kspace_regular(kspace,nper,nbyte,which,k_buf1,k_buf2,datatype); + reverse_comm_kspace_regular(kspace,nper,which,k_buf1,k_buf2,datatype); else - reverse_comm_kspace_tiled(kspace,nper,nbyte,which,k_buf1,k_buf2,datatype); + reverse_comm_kspace_tiled(kspace,nper,which,k_buf1,k_buf2,datatype); } /* ---------------------------------------------------------------------- */ template void GridCommKokkos:: -reverse_comm_kspace_regular(KSpace *kspace, int nper, int nbyte, int which, +reverse_comm_kspace_regular(KSpace *kspace, int nper, int which, FFT_DAT::tdual_FFT_SCALAR_1d &k_buf1, FFT_DAT::tdual_FFT_SCALAR_1d &k_buf2, MPI_Datatype datatype) { int m; @@ -841,7 +841,7 @@ reverse_comm_kspace_regular(KSpace *kspace, int nper, int nbyte, int which, template void GridCommKokkos:: -reverse_comm_kspace_tiled(KSpace *kspace, int nper, int nbyte, int which, +reverse_comm_kspace_tiled(KSpace *kspace, int nper, int which, FFT_DAT::tdual_FFT_SCALAR_1d &k_buf1, FFT_DAT::tdual_FFT_SCALAR_1d &k_buf2, MPI_Datatype datatype) { int i,m,offset; @@ -854,8 +854,6 @@ reverse_comm_kspace_tiled(KSpace *kspace, int nper, int nbyte, int which, buf1 = k_buf1.view().data(); buf2 = k_buf2.view().data(); } else { - k_buf1.modify(); - k_buf1.sync(); buf1 = k_buf1.h_view.data(); buf2 = k_buf2.h_view.data(); } @@ -863,7 +861,7 @@ reverse_comm_kspace_tiled(KSpace *kspace, int nper, int nbyte, int which, // post all receives for (m = 0; m < nsend; m++) { - offset = nper * send[m].offset * 
nbyte; + offset = nper * send[m].offset; MPI_Irecv(&buf2[offset],nper*send[m].npack,datatype, send[m].proc,0,gridcomm,&requests[m]); } @@ -899,7 +897,7 @@ reverse_comm_kspace_tiled(KSpace *kspace, int nper, int nbyte, int which, k_buf2.sync(); } - offset = nper * send[m].offset * nbyte; + offset = nper * send[m].offset; kspaceKKBase->unpack_reverse_grid_kokkos(which,k_buf2,offset, send[m].npack,k_send_packlist,m); DeviceType().fence(); diff --git a/src/KOKKOS/gridcomm_kokkos.h b/src/KOKKOS/gridcomm_kokkos.h index 3148df36a5..ad2f9c7995 100644 --- a/src/KOKKOS/gridcomm_kokkos.h +++ b/src/KOKKOS/gridcomm_kokkos.h @@ -34,9 +34,9 @@ class GridCommKokkos : public GridComm { int, int, int, int, int, int, int, int, int, int, int, int); ~GridCommKokkos(); - void forward_comm_kspace(class KSpace *, int, int, int, + void forward_comm_kspace(class KSpace *, int, int, FFT_DAT::tdual_FFT_SCALAR_1d &, FFT_DAT::tdual_FFT_SCALAR_1d &, MPI_Datatype); - void reverse_comm_kspace(class KSpace *, int, int, int, + void reverse_comm_kspace(class KSpace *, int, int, FFT_DAT::tdual_FFT_SCALAR_1d &, FFT_DAT::tdual_FFT_SCALAR_1d &, MPI_Datatype); private: @@ -57,13 +57,13 @@ class GridCommKokkos : public GridComm { void setup_regular(int &, int &); void setup_tiled(int &, int &); - void forward_comm_kspace_regular(class KSpace *, int, int, int, + void forward_comm_kspace_regular(class KSpace *, int, int, FFT_DAT::tdual_FFT_SCALAR_1d &, FFT_DAT::tdual_FFT_SCALAR_1d &, MPI_Datatype); - void forward_comm_kspace_tiled(class KSpace *, int, int, int, + void forward_comm_kspace_tiled(class KSpace *, int, int, FFT_DAT::tdual_FFT_SCALAR_1d &, FFT_DAT::tdual_FFT_SCALAR_1d &, MPI_Datatype); - void reverse_comm_kspace_regular(class KSpace *, int, int, int, + void reverse_comm_kspace_regular(class KSpace *, int, int, FFT_DAT::tdual_FFT_SCALAR_1d &, FFT_DAT::tdual_FFT_SCALAR_1d &, MPI_Datatype); - void reverse_comm_kspace_tiled(class KSpace *, int, int, int, + void reverse_comm_kspace_tiled(class KSpace *, int, int, FFT_DAT::tdual_FFT_SCALAR_1d &, FFT_DAT::tdual_FFT_SCALAR_1d &, MPI_Datatype); void grow_swap(); diff --git a/src/KOKKOS/pppm_kokkos.cpp b/src/KOKKOS/pppm_kokkos.cpp index ccff40ca0c..53279a9b71 100644 --- a/src/KOKKOS/pppm_kokkos.cpp +++ b/src/KOKKOS/pppm_kokkos.cpp @@ -653,7 +653,7 @@ void PPPMKokkos::compute(int eflag, int vflag) // to fully sum contribution in their 3d bricks // remap from 3d decomposition to FFT decomposition - gc->reverse_comm_kspace(this,1,sizeof(FFT_SCALAR),REVERSE_RHO, + gc->reverse_comm_kspace(this,1,REVERSE_RHO, k_gc_buf1,k_gc_buf2,MPI_FFT_SCALAR); brick2fft(); @@ -667,13 +667,13 @@ void PPPMKokkos::compute(int eflag, int vflag) // all procs communicate E-field values // to fill ghost cells surrounding their 3d bricks - gc->forward_comm_kspace(this,3,sizeof(FFT_SCALAR),FORWARD_IK, + gc->forward_comm_kspace(this,3,FORWARD_IK, k_gc_buf1,k_gc_buf2,MPI_FFT_SCALAR); // extra per-atom energy/virial communication if (evflag_atom) - gc->forward_comm_kspace(this,7,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM, + gc->forward_comm_kspace(this,7,FORWARD_IK_PERATOM, k_gc_buf1,k_gc_buf2,MPI_FFT_SCALAR); // calculate the force on my particles @@ -842,8 +842,7 @@ void PPPMKokkos::allocate() gc->setup(ngc_buf1,ngc_buf2); - if (differentiation_flag) npergrid = 1; - else npergrid = 3; + npergrid = 3; k_gc_buf1 = FFT_DAT::tdual_FFT_SCALAR_1d("pppm:gc_buf1",npergrid*ngc_buf1); k_gc_buf2 = FFT_DAT::tdual_FFT_SCALAR_1d("pppm:gc_buf2",npergrid*ngc_buf2); @@ -897,8 +896,7 @@ void PPPMKokkos::allocate_peratom() // use same 
GC ghost grid object for peratom grid communication // but need to reallocate a larger gc_buf1 and gc_buf2 - if (differentiation_flag) npergrid = 6; - else npergrid = 7; + npergrid = 7; k_gc_buf1 = FFT_DAT::tdual_FFT_SCALAR_1d("pppm:gc_buf1",npergrid*ngc_buf1); k_gc_buf2 = FFT_DAT::tdual_FFT_SCALAR_1d("pppm:gc_buf2",npergrid*ngc_buf2); From 33cc932d03489883a1fdaa3078b6d2f825f731b3 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Tue, 18 Aug 2020 21:29:43 -0400 Subject: [PATCH 14/38] Eigen3 is now hosted at GitLab. The bitbucket repo is gone. Update URLs and MD5s --- cmake/Modules/Packages/USER-SMD.cmake | 4 ++-- lib/smd/Install.py | 16 +++++++--------- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/cmake/Modules/Packages/USER-SMD.cmake b/cmake/Modules/Packages/USER-SMD.cmake index 158df83712..67f4aae99d 100644 --- a/cmake/Modules/Packages/USER-SMD.cmake +++ b/cmake/Modules/Packages/USER-SMD.cmake @@ -9,8 +9,8 @@ if(DOWNLOAD_EIGEN3) message(STATUS "Eigen3 download requested - we will build our own") include(ExternalProject) ExternalProject_Add(Eigen3_build - URL https://bitbucket.org/eigen/eigen/get/3.3.7.tar.gz - URL_MD5 f2a417d083fe8ca4b8ed2bc613d20f07 + URL https://gitlab.com/libeigen/eigen/-/archive/3.3.7/eigen-3.3.7.tar.gz + URL_MD5 9e30f67e8531477de4117506fe44669b CONFIGURE_COMMAND "" BUILD_COMMAND "" INSTALL_COMMAND "" ) ExternalProject_get_property(Eigen3_build SOURCE_DIR) diff --git a/lib/smd/Install.py b/lib/smd/Install.py index 40a8bca523..16dd3038a8 100644 --- a/lib/smd/Install.py +++ b/lib/smd/Install.py @@ -22,12 +22,10 @@ tarball = "eigen.tar.gz" # known checksums for different Eigen versions. used to validate the download. checksums = { \ - '3.3.4' : '1a47e78efe365a97de0c022d127607c3', \ - '3.3.5' : 'ee48cafede2f51fe33984ff5c9f48026', \ - '3.3.6' : 'd1be14064b50310b0eb2b49e402c64d7', \ - '3.3.7' : 'f2a417d083fe8ca4b8ed2bc613d20f07' \ + '3.3.7' : '9e30f67e8531477de4117506fe44669b' \ } + # help message HELP = """ @@ -36,7 +34,7 @@ Syntax from src dir: make lib-smd args="-b" Syntax from lib dir: python Install.py -b or: python Install.py -p /usr/include/eigen3" - or: python Install.py -v 3.3.4 -b + or: python Install.py -v 3.3.7 -b Example: @@ -78,7 +76,7 @@ if pathflag: if buildflag: print("Downloading Eigen ...") eigentar = os.path.join(homepath, tarball) - url = "https://bitbucket.org/eigen/eigen/get/%s.tar.gz" % version + url = "https://gitlab.com/libeigen/eigen/-/archive/%s/eigen-%s.tar.gz" % (version,version) geturl(url, eigentar) # verify downloaded archive integrity via md5 checksum, if known. 
@@ -89,7 +87,7 @@ if buildflag: print("Cleaning up old folders ...") - edir = glob.glob(os.path.join(homepath, "eigen-eigen-*")) + edir = glob.glob(os.path.join(homepath, "eigen-*")) edir.append(eigenpath) for one in edir: if os.path.isdir(one): @@ -102,8 +100,8 @@ if buildflag: os.remove(eigentar) else: sys.exit("File %s is not a supported archive" % eigentar) - edir = glob.glob(os.path.join(homepath, "eigen-eigen-*")) - os.rename(edir[0], eigenpath) + edir = os.path.join(homepath, "eigen-%s" % version) + os.rename(edir, eigenpath) # create link in lib/smd to Eigen src dir From 073c7ee1387a7e7636f2a1f028297f0d3f1da87a Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Wed, 19 Aug 2020 13:29:11 -0600 Subject: [PATCH 15/38] Fix warnings related to virtual function override --- src/KOKKOS/gridcomm_kokkos.cpp | 34 +++++++++++++++++----------------- src/KOKKOS/gridcomm_kokkos.h | 4 ++-- src/KSPACE/gridcomm.h | 28 ++++++++++++++-------------- 3 files changed, 33 insertions(+), 33 deletions(-) diff --git a/src/KOKKOS/gridcomm_kokkos.cpp b/src/KOKKOS/gridcomm_kokkos.cpp index 04f7b6ae7c..81e4435083 100644 --- a/src/KOKKOS/gridcomm_kokkos.cpp +++ b/src/KOKKOS/gridcomm_kokkos.cpp @@ -181,7 +181,7 @@ void GridCommKokkos::setup_regular(int &nbuf1, int &nbuf2) swap[nswap].recvproc = procxhi; sendplanes = MIN(sendlast-sendfirst+1,ghostxlo-nsent); swap[nswap].npack = - indices_kokkos(k_swap_packlist,nswap, + indices(k_swap_packlist,nswap, sendfirst,sendfirst+sendplanes-1,inylo,inyhi,inzlo,inzhi); if (procxlo != me) @@ -190,7 +190,7 @@ void GridCommKokkos::setup_regular(int &nbuf1, int &nbuf2) else recvplanes = sendplanes; swap[nswap].nunpack = - indices_kokkos(k_swap_unpacklist,nswap, + indices(k_swap_unpacklist,nswap, recvfirst,recvfirst+recvplanes-1,inylo,inyhi,inzlo,inzhi); nsent += sendplanes; @@ -219,7 +219,7 @@ void GridCommKokkos::setup_regular(int &nbuf1, int &nbuf2) swap[nswap].recvproc = procxlo; sendplanes = MIN(sendlast-sendfirst+1,ghostxhi-nsent); swap[nswap].npack = - indices_kokkos(k_swap_packlist,nswap, + indices(k_swap_packlist,nswap, sendlast-sendplanes+1,sendlast,inylo,inyhi,inzlo,inzhi); if (procxhi != me) @@ -228,7 +228,7 @@ void GridCommKokkos::setup_regular(int &nbuf1, int &nbuf2) else recvplanes = sendplanes; swap[nswap].nunpack = - indices_kokkos(k_swap_unpacklist,nswap, + indices(k_swap_unpacklist,nswap, recvlast-recvplanes+1,recvlast,inylo,inyhi,inzlo,inzhi); nsent += sendplanes; @@ -257,7 +257,7 @@ void GridCommKokkos::setup_regular(int &nbuf1, int &nbuf2) swap[nswap].recvproc = procyhi; sendplanes = MIN(sendlast-sendfirst+1,ghostylo-nsent); swap[nswap].npack = - indices_kokkos(k_swap_packlist,nswap, + indices(k_swap_packlist,nswap, outxlo,outxhi,sendfirst,sendfirst+sendplanes-1,inzlo,inzhi); if (procylo != me) @@ -266,7 +266,7 @@ void GridCommKokkos::setup_regular(int &nbuf1, int &nbuf2) else recvplanes = sendplanes; swap[nswap].nunpack = - indices_kokkos(k_swap_unpacklist,nswap, + indices(k_swap_unpacklist,nswap, outxlo,outxhi,recvfirst,recvfirst+recvplanes-1,inzlo,inzhi); nsent += sendplanes; @@ -295,7 +295,7 @@ void GridCommKokkos::setup_regular(int &nbuf1, int &nbuf2) swap[nswap].recvproc = procylo; sendplanes = MIN(sendlast-sendfirst+1,ghostyhi-nsent); swap[nswap].npack = - indices_kokkos(k_swap_packlist,nswap, + indices(k_swap_packlist,nswap, outxlo,outxhi,sendlast-sendplanes+1,sendlast,inzlo,inzhi); if (procyhi != me) @@ -304,7 +304,7 @@ void GridCommKokkos::setup_regular(int &nbuf1, int &nbuf2) else recvplanes = sendplanes; swap[nswap].nunpack = - 
indices_kokkos(k_swap_unpacklist,nswap, + indices(k_swap_unpacklist,nswap, outxlo,outxhi,recvlast-recvplanes+1,recvlast,inzlo,inzhi); nsent += sendplanes; @@ -333,7 +333,7 @@ void GridCommKokkos::setup_regular(int &nbuf1, int &nbuf2) swap[nswap].recvproc = proczhi; sendplanes = MIN(sendlast-sendfirst+1,ghostzlo-nsent); swap[nswap].npack = - indices_kokkos(k_swap_packlist,nswap, + indices(k_swap_packlist,nswap, outxlo,outxhi,outylo,outyhi,sendfirst,sendfirst+sendplanes-1); if (proczlo != me) @@ -342,7 +342,7 @@ void GridCommKokkos::setup_regular(int &nbuf1, int &nbuf2) else recvplanes = sendplanes; swap[nswap].nunpack = - indices_kokkos(k_swap_unpacklist,nswap, + indices(k_swap_unpacklist,nswap, outxlo,outxhi,outylo,outyhi,recvfirst,recvfirst+recvplanes-1); nsent += sendplanes; @@ -371,7 +371,7 @@ void GridCommKokkos::setup_regular(int &nbuf1, int &nbuf2) swap[nswap].recvproc = proczlo; sendplanes = MIN(sendlast-sendfirst+1,ghostzhi-nsent); swap[nswap].npack = - indices_kokkos(k_swap_packlist,nswap, + indices(k_swap_packlist,nswap, outxlo,outxhi,outylo,outyhi,sendlast-sendplanes+1,sendlast); if (proczhi != me) @@ -380,7 +380,7 @@ void GridCommKokkos::setup_regular(int &nbuf1, int &nbuf2) else recvplanes = sendplanes; swap[nswap].nunpack = - indices_kokkos(k_swap_unpacklist,nswap, + indices(k_swap_unpacklist,nswap, outxlo,outxhi,outylo,outyhi,recvlast-recvplanes+1,recvlast); nsent += sendplanes; @@ -502,7 +502,7 @@ void GridCommKokkos::setup_tiled(int &nbuf1, int &nbuf2) yhi = MIN(rrequest[m].box[3],inyhi); zlo = MAX(rrequest[m].box[4],inzlo); zhi = MIN(rrequest[m].box[5],inzhi); - send[m].npack = indices_kokkos(k_send_packlist,m,xlo,xhi,ylo,yhi,zlo,zhi); + send[m].npack = indices(k_send_packlist,m,xlo,xhi,ylo,yhi,zlo,zhi); proclist[m] = rrequest[m].sender; sresponse[m].index = rrequest[m].index; @@ -546,7 +546,7 @@ void GridCommKokkos::setup_tiled(int &nbuf1, int &nbuf2) yhi = rresponse[i].box[3] + overlap[m].pbc[1] * ny; zlo = rresponse[i].box[4] + overlap[m].pbc[2] * nz; zhi = rresponse[i].box[5] + overlap[m].pbc[2] * nz; - recv[i].nunpack = indices_kokkos(k_recv_unpacklist,i,xlo,xhi,ylo,yhi,zlo,zhi); + recv[i].nunpack = indices(k_recv_unpacklist,i,xlo,xhi,ylo,yhi,zlo,zhi); if (xlo != inxhi+1 && xhi != inxlo-1 && ylo != inyhi+1 && yhi != inylo-1 && @@ -571,14 +571,14 @@ void GridCommKokkos::setup_tiled(int &nbuf1, int &nbuf2) yhi = overlap[m].box[3]; zlo = overlap[m].box[4]; zhi = overlap[m].box[5]; - copy[ncopy].npack = indices_kokkos(k_copy_packlist,ncopy,xlo,xhi,ylo,yhi,zlo,zhi); + copy[ncopy].npack = indices(k_copy_packlist,ncopy,xlo,xhi,ylo,yhi,zlo,zhi); xlo = overlap[m].box[0] + overlap[m].pbc[0] * nx; xhi = overlap[m].box[1] + overlap[m].pbc[0] * nx; ylo = overlap[m].box[2] + overlap[m].pbc[1] * ny; yhi = overlap[m].box[3] + overlap[m].pbc[1] * ny; zlo = overlap[m].box[4] + overlap[m].pbc[2] * nz; zhi = overlap[m].box[5] + overlap[m].pbc[2] * nz; - copy[ncopy].nunpack = indices_kokkos(k_copy_unpacklist,ncopy,xlo,xhi,ylo,yhi,zlo,zhi); + copy[ncopy].nunpack = indices(k_copy_unpacklist,ncopy,xlo,xhi,ylo,yhi,zlo,zhi); ncopy++; } @@ -935,7 +935,7 @@ void GridCommKokkos::grow_swap() ------------------------------------------------------------------------- */ template -int GridCommKokkos::indices_kokkos(DAT::tdual_int_2d &k_list, int index, +int GridCommKokkos::indices(DAT::tdual_int_2d &k_list, int index, int xlo, int xhi, int ylo, int yhi, int zlo, int zhi) { int nmax = (xhi-xlo+1) * (yhi-ylo+1) * (zhi-zlo+1); diff --git a/src/KOKKOS/gridcomm_kokkos.h b/src/KOKKOS/gridcomm_kokkos.h 
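For readers unfamiliar with the diagnostic these renames and the virtual-keyword changes address, the pattern looks roughly like the standalone sketch below (illustrative names only, not LAMMPS code). With -Woverloaded-virtual, a derived-class method that shares a name with a base-class virtual but takes different arguments is reported as hiding the virtual rather than overriding it; keeping the base declarations non-virtual, or keeping the signatures distinct, avoids the warning.

    // minimal sketch of the -Woverloaded-virtual pattern; names are made up
    struct Base {
      virtual ~Base() {}
      virtual int indices(int *&list, int lo, int hi) { (void)list; return hi - lo; }
    };
    struct Derived : Base {
      // with -Woverloaded-virtual this declaration is reported as hiding Base::indices,
      // because the parameter list differs and it therefore does not override it
      int indices(double *&list, int lo, int hi) { (void)list; return hi - lo; }
    };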
index ad2f9c7995..1c82ab18e4 100644 --- a/src/KOKKOS/gridcomm_kokkos.h +++ b/src/KOKKOS/gridcomm_kokkos.h @@ -33,7 +33,7 @@ class GridCommKokkos : public GridComm { int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int); - ~GridCommKokkos(); + virtual ~GridCommKokkos(); void forward_comm_kspace(class KSpace *, int, int, FFT_DAT::tdual_FFT_SCALAR_1d &, FFT_DAT::tdual_FFT_SCALAR_1d &, MPI_Datatype); void reverse_comm_kspace(class KSpace *, int, int, @@ -68,7 +68,7 @@ class GridCommKokkos : public GridComm { void grow_swap(); - int indices_kokkos(DAT::tdual_int_2d &, int, int, int, int, int, int, int); + int indices(DAT::tdual_int_2d &, int, int, int, int, int, int, int); }; } diff --git a/src/KSPACE/gridcomm.h b/src/KSPACE/gridcomm.h index 4fc410d435..941b84651f 100644 --- a/src/KSPACE/gridcomm.h +++ b/src/KSPACE/gridcomm.h @@ -28,12 +28,12 @@ class GridComm : protected Pointers { int, int, int, int, int, int, int, int, int, int, int, int); virtual ~GridComm(); - virtual void setup(int &, int &); - virtual int ghost_adjacent(); - virtual void forward_comm_kspace(class KSpace *, int, int, int, - void *, void *, MPI_Datatype); - virtual void reverse_comm_kspace(class KSpace *, int, int, int, - void *, void *, MPI_Datatype); + void setup(int &, int &); + int ghost_adjacent(); + void forward_comm_kspace(class KSpace *, int, int, int, + void *, void *, MPI_Datatype); + void reverse_comm_kspace(class KSpace *, int, int, int, + void *, void *, MPI_Datatype); protected: int me,nprocs; @@ -189,14 +189,14 @@ class GridComm : protected Pointers { int ghost_adjacent_regular(); int ghost_adjacent_tiled(); - virtual void forward_comm_kspace_regular(class KSpace *, int, int, int, - void *, void *, MPI_Datatype); - virtual void forward_comm_kspace_tiled(class KSpace *, int, int, int, - void *, void *, MPI_Datatype); - virtual void reverse_comm_kspace_regular(class KSpace *, int, int, int, - void *, void *, MPI_Datatype); - virtual void reverse_comm_kspace_tiled(class KSpace *, int, int, int, - void *, void *, MPI_Datatype); + void forward_comm_kspace_regular(class KSpace *, int, int, int, + void *, void *, MPI_Datatype); + void forward_comm_kspace_tiled(class KSpace *, int, int, int, + void *, void *, MPI_Datatype); + void reverse_comm_kspace_regular(class KSpace *, int, int, int, + void *, void *, MPI_Datatype); + void reverse_comm_kspace_tiled(class KSpace *, int, int, int, + void *, void *, MPI_Datatype); virtual void grow_swap(); void grow_overlap(); From 5ea7ca94f40fb993b2edb5d2cffabadb921e930f Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Wed, 19 Aug 2020 13:54:06 -0600 Subject: [PATCH 16/38] Revert accidental change --- src/KOKKOS/pppm_kokkos.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/KOKKOS/pppm_kokkos.cpp b/src/KOKKOS/pppm_kokkos.cpp index 53279a9b71..b6552523a3 100644 --- a/src/KOKKOS/pppm_kokkos.cpp +++ b/src/KOKKOS/pppm_kokkos.cpp @@ -720,7 +720,6 @@ void PPPMKokkos::compute(int eflag, int vflag) copymode = 1; Kokkos::parallel_for(Kokkos::RangePolicy(0,nlocal),*this); copymode = 0; - copymode = 0; //for (i = nlocal; i < ntotal; i++) d_eatom[i] *= 0.5*qscale; } From 6f37da4eb28572bb1d0188522c24a089a5fab5e2 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Wed, 19 Aug 2020 14:10:35 -0600 Subject: [PATCH 17/38] Whitespace cleanup --- src/KSPACE/msm.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/KSPACE/msm.cpp b/src/KSPACE/msm.cpp index 5c401edb8a..f2c3f6c820 100644 --- a/src/KSPACE/msm.cpp +++ b/src/KSPACE/msm.cpp @@ 
-2549,7 +2549,7 @@ void MSM::pack_forward_grid(int flag, void *vbuf, int nlist, int *list) int n = current_level; int k = 0; - + if (flag == FORWARD_RHO) { double ***qgridn = qgrid[n]; double *qsrc = &qgridn[nzlo_out[n]][nylo_out[n]][nxlo_out[n]]; From 8f156bfee60a493af0e42d7c764587ee6e5ebe59 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Wed, 19 Aug 2020 14:25:06 -0600 Subject: [PATCH 18/38] Fix typo in pppm_gpu.cpp --- src/GPU/pppm_gpu.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/GPU/pppm_gpu.cpp b/src/GPU/pppm_gpu.cpp index 4dcfbdfee2..cb5e08a46d 100644 --- a/src/GPU/pppm_gpu.cpp +++ b/src/GPU/pppm_gpu.cpp @@ -653,7 +653,7 @@ void PPPMGPU::pack_reverse_grid(int flag, void *vbuf, int nlist, int *list) unpack another proc's ghost values from buf and add to own values ------------------------------------------------------------------------- */ -void PPPMGPU::unpack_reverse_grid(int flag, void *buf, int nlist, int *list) +void PPPMGPU::unpack_reverse_grid(int flag, void *vbuf, int nlist, int *list) { FFT_SCALAR *buf = (FFT_SCALAR *) vbuf; From ba06a7bf0c6f262f54fa39e9b4aa7a9077d80370 Mon Sep 17 00:00:00 2001 From: Steve Plimpton Date: Wed, 19 Aug 2020 16:05:30 -0600 Subject: [PATCH 19/38] final gridcomm comments and flip FFT forward/reverse --- src/KSPACE/fft3d.cpp | 33 ++++++++++----------- src/KSPACE/gridcomm.cpp | 64 ++++++++++++++++++++++++++++++----------- 2 files changed, 64 insertions(+), 33 deletions(-) diff --git a/src/KSPACE/fft3d.cpp b/src/KSPACE/fft3d.cpp index 3e4d131d3a..0b4bb85032 100644 --- a/src/KSPACE/fft3d.cpp +++ b/src/KSPACE/fft3d.cpp @@ -103,18 +103,18 @@ void fft_3d(FFT_DATA *in, FFT_DATA *out, int flag, struct fft_plan_3d *plan) length = plan->length1; #if defined(FFT_MKL) - if (flag == -1) + if (flag == 1) DftiComputeForward(plan->handle_fast,data); else DftiComputeBackward(plan->handle_fast,data); #elif defined(FFT_FFTW3) - if (flag == -1) + if (flag == 1) theplan=plan->plan_fast_forward; else theplan=plan->plan_fast_backward; FFTW_API(execute_dft)(theplan,data,data); #else - if (flag == -1) + if (flag == 1) for (offset = 0; offset < total; offset += length) kiss_fft(plan->cfg_fast_forward,&data[offset],&data[offset]); else @@ -137,18 +137,18 @@ void fft_3d(FFT_DATA *in, FFT_DATA *out, int flag, struct fft_plan_3d *plan) length = plan->length2; #if defined(FFT_MKL) - if (flag == -1) + if (flag == 1) DftiComputeForward(plan->handle_mid,data); else DftiComputeBackward(plan->handle_mid,data); #elif defined(FFT_FFTW3) - if (flag == -1) + if (flag == 1) theplan=plan->plan_mid_forward; else theplan=plan->plan_mid_backward; FFTW_API(execute_dft)(theplan,data,data); #else - if (flag == -1) + if (flag == 1) for (offset = 0; offset < total; offset += length) kiss_fft(plan->cfg_mid_forward,&data[offset],&data[offset]); else @@ -171,18 +171,18 @@ void fft_3d(FFT_DATA *in, FFT_DATA *out, int flag, struct fft_plan_3d *plan) length = plan->length3; #if defined(FFT_MKL) - if (flag == -1) + if (flag == 1) DftiComputeForward(plan->handle_slow,data); else DftiComputeBackward(plan->handle_slow,data); #elif defined(FFT_FFTW3) - if (flag == -1) + if (flag == 1) theplan=plan->plan_slow_forward; else theplan=plan->plan_slow_backward; FFTW_API(execute_dft)(theplan,data,data); #else - if (flag == -1) + if (flag == 1) for (offset = 0; offset < total; offset += length) kiss_fft(plan->cfg_slow_forward,&data[offset],&data[offset]); else @@ -198,7 +198,8 @@ void fft_3d(FFT_DATA *in, FFT_DATA *out, int flag, struct fft_plan_3d *plan) (FFT_SCALAR *) plan->scratch, 
plan->post_plan); // scaling if required - if (flag == 1 && plan->scaled) { + + if (flag == -1 && plan->scaled) { norm = plan->norm; num = plan->normnum; #if defined(FFT_FFTW3) @@ -745,7 +746,7 @@ void fft_1d_only(FFT_DATA *data, int nsize, int flag, struct fft_plan_3d *plan) // data is just an array of 0.0 #if defined(FFT_MKL) - if (flag == -1) { + if (flag == 1) { DftiComputeForward(plan->handle_fast,data); DftiComputeForward(plan->handle_mid,data); DftiComputeForward(plan->handle_slow,data); @@ -756,23 +757,23 @@ void fft_1d_only(FFT_DATA *data, int nsize, int flag, struct fft_plan_3d *plan) } #elif defined(FFT_FFTW3) FFTW_API(plan) theplan; - if (flag == -1) + if (flag == 1) theplan=plan->plan_fast_forward; else theplan=plan->plan_fast_backward; FFTW_API(execute_dft)(theplan,data,data); - if (flag == -1) + if (flag == 1) theplan=plan->plan_mid_forward; else theplan=plan->plan_mid_backward; FFTW_API(execute_dft)(theplan,data,data); - if (flag == -1) + if (flag == 1) theplan=plan->plan_slow_forward; else theplan=plan->plan_slow_backward; FFTW_API(execute_dft)(theplan,data,data); #else - if (flag == -1) { + if (flag == 1) { for (int offset = 0; offset < total1; offset += length1) kiss_fft(plan->cfg_fast_forward,&data[offset],&data[offset]); for (int offset = 0; offset < total2; offset += length2) @@ -792,7 +793,7 @@ void fft_1d_only(FFT_DATA *data, int nsize, int flag, struct fft_plan_3d *plan) // scaling if required // limit num to size of data - if (flag == 1 && plan->scaled) { + if (flag == -1 && plan->scaled) { norm = plan->norm; num = MIN(plan->normnum,nsize); #if defined(FFT_FFTW3) diff --git a/src/KSPACE/gridcomm.cpp b/src/KSPACE/gridcomm.cpp index 49067b09e5..f0ec0bc768 100644 --- a/src/KSPACE/gridcomm.cpp +++ b/src/KSPACE/gridcomm.cpp @@ -23,11 +23,12 @@ using namespace LAMMPS_NS; enum{REGULAR,TILED}; -#define SWAPDELTA 8 +#define DELTA 16 /* ---------------------------------------------------------------------- NOTES tiled implementation only currently works for RCB, not general tiled + b/c RCB tree is used to find neighboring tiles if o indices for ghosts are < 0 or hi indices are >= N, then grid is treated as periodic in that dimension, communication is done across the periodic boundaries @@ -227,7 +228,14 @@ void GridComm::setup(int &nbuf1, int &nbuf2) else setup_tiled(nbuf1,nbuf2); } -/* ---------------------------------------------------------------------- */ +/* ---------------------------------------------------------------------- + setup comm for a regular grid of procs + each proc has 6 neighbors + comm pattern = series of swaps with one of those 6 procs + can be multiple swaps with same proc if ghost extent is large + swap may not be symmetric if both procs do not need same layers of ghosts + all procs perform same # of swaps in a direction, even if some don't need it +------------------------------------------------------------------------- */ void GridComm::setup_regular(int &nbuf1, int &nbuf2) { @@ -522,7 +530,12 @@ void GridComm::setup_regular(int &nbuf1, int &nbuf2) } /* ---------------------------------------------------------------------- - NOTE: need to doc this header + setup comm for RCB tiled proc domains + each proc has arbitrary # of neighbors that overlap its ghost extent + identify which procs will send me ghost cells, and vice versa + may not be symmetric if both procs do not need same layers of ghosts + comm pattern = post recvs for all my ghosts, send my owned, wait on recvs + no exchanges by dimension, unlike CommTiled forward/reverse comm of 
particles ------------------------------------------------------------------------- */ void GridComm::setup_tiled(int &nbuf1, int &nbuf2) @@ -744,8 +757,13 @@ void GridComm::setup_tiled(int &nbuf1, int &nbuf2) } /* ---------------------------------------------------------------------- - NOTE: need to doc this header - recursive ... + recursively split a box until it doesn't overlap any periodic boundaries + box = 6 integers = (xlo,xhi,ylo,yhi,zlo,zhi) + each lo/hi value may extend beyond 0 to N-1 into another periodic image + pbc = flags in each dim of which periodic image the caller box was in + when a box straddles a periodic boundary, split it in two + when a box does not straddle, drop it down RCB tree + add all the procs it overlaps with to Overlap list ------------------------------------------------------------------------- */ void GridComm::ghost_box_drop(int *box, int *pbc) @@ -760,6 +778,7 @@ void GridComm::ghost_box_drop(int *box, int *pbc) for (i = 0; i < 3; i++) newpbc[i] = pbc[i]; // 6 if tests to see if box needs to be split across a periodic boundary + // newbox1 and 2 = new split boxes, newpbc increments current pbc // final else is no split int splitflag = 1; @@ -799,6 +818,7 @@ // returns nprocs = # of procs it overlaps, including self // returns proc_overlap = list of proc IDs it overlaps // skip self overlap if no crossing of periodic boundaries + // do not skip self if overlap is in another periodic image } else { splitflag = 0; @@ -824,8 +844,11 @@ } /* ---------------------------------------------------------------------- - NOTE: need to doc this header - recursive ... + recursively drop a box down the RCB tree to find all procs it overlaps with + box = 6 integers = (xlo,xhi,ylo,yhi,zlo,zhi) + each lo/hi value ranges from 0 to N-1 in a dim, N = grid size in that dim + box is guaranteed to be wholly within the global domain + return Np = # of procs, plist = proc IDs ------------------------------------------------------------------------- */ void GridComm::box_drop_grid(int *box, int proclower, int procupper, @@ -899,7 +922,7 @@ int GridComm::ghost_adjacent_tiled() } /* ---------------------------------------------------------------------- - use swap list in forward order to acquire copy of all needed ghost grid pts + forward comm of my owned cells to other's ghost cells ------------------------------------------------------------------------- */ void GridComm::forward_comm_kspace(KSpace *kspace, int nper, int nbyte, int which, @@ -911,7 +934,9 @@ void GridComm::forward_comm_kspace(KSpace *kspace, int nper, int nbyte, int whic forward_comm_kspace_tiled(kspace,nper,nbyte,which,buf1,buf2,datatype); } -/* ---------------------------------------------------------------------- */ +/* ---------------------------------------------------------------------- + forward comm on regular grid of procs via list of swaps with 6 neighbor procs +------------------------------------------------------------------------- */ void GridComm:: forward_comm_kspace_regular(KSpace *kspace, int nper, int nbyte, int which, @@ -938,7 +963,9 @@ forward_comm_kspace_regular(KSpace *kspace, int nper, int nbyte, int which, } } -/* ---------------------------------------------------------------------- */ +/* ---------------------------------------------------------------------- + forward comm on tiled grid decomp via Send/Recv lists of each neighbor proc 
+------------------------------------------------------------------------- */ void GridComm:: forward_comm_kspace_tiled(KSpace *kspace, int nper, int nbyte, int which, @@ -981,8 +1008,7 @@ forward_comm_kspace_tiled(KSpace *kspace, int nper, int nbyte, int which, } /* ---------------------------------------------------------------------- - use swap list in reverse order to compute fully summed value - for each owned grid pt that some other proc has copy of as a ghost grid pt + reverse comm of my ghost cells to sum to owner cells ------------------------------------------------------------------------- */ void GridComm::reverse_comm_kspace(KSpace *kspace, int nper, int nbyte, int which, @@ -994,6 +1020,8 @@ void GridComm::reverse_comm_kspace(KSpace *kspace, int nper, int nbyte, int whic reverse_comm_kspace_tiled(kspace,nper,nbyte,which,buf1,buf2,datatype); } +/* ---------------------------------------------------------------------- + reverse comm on regular grid of procs via list of swaps with 6 neighbor procs /* ---------------------------------------------------------------------- */ void GridComm:: @@ -1021,7 +1049,9 @@ reverse_comm_kspace_regular(KSpace *kspace, int nper, int nbyte, int which, } } -/* ---------------------------------------------------------------------- */ +/* ---------------------------------------------------------------------- + reverse comm on tiled grid decomp via Send/Recv lists of each neighbor proc +------------------------------------------------------------------------- */ void GridComm:: reverse_comm_kspace_tiled(KSpace *kspace, int nper, int nbyte, int which, @@ -1073,7 +1103,7 @@ reverse_comm_kspace_tiled(KSpace *kspace, int nper, int nbyte, int which, void GridComm::grow_swap() { - maxswap += SWAPDELTA; + maxswap += DELTA; swap = (Swap *) memory->srealloc(swap,maxswap*sizeof(Swap),"GridComm:swap"); } @@ -1088,15 +1118,15 @@ void GridComm::grow_swap() void GridComm::grow_overlap() { - maxoverlap += SWAPDELTA; + maxoverlap += DELTA; overlap = (Overlap *) memory->srealloc(overlap,maxoverlap*sizeof(Overlap),"GridComm:overlap"); } /* ---------------------------------------------------------------------- create 1d list of offsets into 3d array section (xlo:xhi,ylo:yhi,zlo:zhi) - assume 3d array is allocated as (fullxlo:fullxhi,fullylo:fullyhi, - fullzlo:fullzhi) + assume 3d array is allocated as + (fullxlo:fullxhi,fullylo:fullyhi,fullzlo:fullzhi) ------------------------------------------------------------------------- */ int GridComm::indices(int *&list, From 284cdceae82248f62fefddf1c0c40ecb450c5b0b Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Wed, 19 Aug 2020 18:19:28 -0400 Subject: [PATCH 20/38] silence compiler warnings --- src/KSPACE/gridcomm.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/KSPACE/gridcomm.cpp b/src/KSPACE/gridcomm.cpp index f0ec0bc768..e5b907f071 100644 --- a/src/KSPACE/gridcomm.cpp +++ b/src/KSPACE/gridcomm.cpp @@ -939,7 +939,7 @@ void GridComm::forward_comm_kspace(KSpace *kspace, int nper, int nbyte, int whic ------------------------------------------------------------------------- */ void GridComm:: -forward_comm_kspace_regular(KSpace *kspace, int nper, int nbyte, int which, +forward_comm_kspace_regular(KSpace *kspace, int nper, int /*nbyte*/, int which, void *buf1, void *buf2, MPI_Datatype datatype) { int m; @@ -968,7 +968,7 @@ forward_comm_kspace_regular(KSpace *kspace, int nper, int nbyte, int which, ------------------------------------------------------------------------- */ void GridComm:: 
-forward_comm_kspace_tiled(KSpace *kspace, int nper, int nbyte, int which, +forward_comm_kspace_tiled(KSpace *kspace, int nper, int /*nbyte*/, int which, void *buf1, void *vbuf2, MPI_Datatype datatype) { int i,m,offset; From fdb1e69768e3566c7f074bda25f83180b12b316c Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Wed, 19 Aug 2020 18:36:28 -0400 Subject: [PATCH 21/38] silence compiler warnings --- src/KOKKOS/gridcomm_kokkos.cpp | 4 ++-- src/KSPACE/gridcomm.cpp | 6 +++--- src/KSPACE/pppm.cpp | 6 +++--- src/KSPACE/pppm_disp.cpp | 8 ++++---- src/KSPACE/pppm_stagger.cpp | 4 ++-- 5 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/KOKKOS/gridcomm_kokkos.cpp b/src/KOKKOS/gridcomm_kokkos.cpp index 81e4435083..21f2bb915f 100644 --- a/src/KOKKOS/gridcomm_kokkos.cpp +++ b/src/KOKKOS/gridcomm_kokkos.cpp @@ -71,11 +71,11 @@ GridCommKokkos::GridCommKokkos(LAMMPS *lmp, MPI_Comm gcomm, ------------------------------------------------------------------------- */ template -GridCommKokkos::GridCommKokkos(LAMMPS *lmp, MPI_Comm gcomm, int flag, +GridCommKokkos::GridCommKokkos(LAMMPS *lmp, MPI_Comm gcomm, int /*flag*/, int gnx, int gny, int gnz, int ixlo, int ixhi, int iylo, int iyhi, int izlo, int izhi, int oxlo, int oxhi, int oylo, int oyhi, int ozlo, int ozhi, - int exlo, int exhi, int eylo, int eyhi, int ezlo, int ezhi) + int /*exlo*/, int /*exhi*/, int /*eylo*/, int /*eyhi*/, int /*ezlo*/, int /*ezhi*/) : GridComm(lmp, gcomm, gnx, gny, gnz, ixlo,ixhi, iylo, iyhi, izlo, izhi, diff --git a/src/KSPACE/gridcomm.cpp b/src/KSPACE/gridcomm.cpp index e5b907f071..27952879ce 100644 --- a/src/KSPACE/gridcomm.cpp +++ b/src/KSPACE/gridcomm.cpp @@ -968,7 +968,7 @@ forward_comm_kspace_regular(KSpace *kspace, int nper, int /*nbyte*/, int which, ------------------------------------------------------------------------- */ void GridComm:: -forward_comm_kspace_tiled(KSpace *kspace, int nper, int /*nbyte*/, int which, +forward_comm_kspace_tiled(KSpace *kspace, int nper, int nbyte, int which, void *buf1, void *vbuf2, MPI_Datatype datatype) { int i,m,offset; @@ -1022,10 +1022,10 @@ void GridComm::reverse_comm_kspace(KSpace *kspace, int nper, int nbyte, int whic /* ---------------------------------------------------------------------- reverse comm on regular grid of procs via list of swaps with 6 neighbor procs -/* ---------------------------------------------------------------------- */ +------------------------------------------------------------------------- */ void GridComm:: -reverse_comm_kspace_regular(KSpace *kspace, int nper, int nbyte, int which, +reverse_comm_kspace_regular(KSpace *kspace, int nper, int /*nbyte*/, int which, void *buf1, void *buf2, MPI_Datatype datatype) { int m; diff --git a/src/KSPACE/pppm.cpp b/src/KSPACE/pppm.cpp index d643ce12e2..84b5d9ecb0 100644 --- a/src/KSPACE/pppm.cpp +++ b/src/KSPACE/pppm.cpp @@ -73,9 +73,9 @@ PPPM::PPPM(LAMMPS *lmp) : KSpace(lmp), rho_coeff(NULL), drho1d(NULL), drho_coeff(NULL), sf_precoeff1(NULL), sf_precoeff2(NULL), sf_precoeff3(NULL), sf_precoeff4(NULL), sf_precoeff5(NULL), sf_precoeff6(NULL), - acons(NULL), density_A_brick(NULL), density_B_brick(NULL), density_A_fft(NULL), - density_B_fft(NULL), fft1(NULL), fft2(NULL), remap(NULL), gc(NULL), - gc_buf1(NULL), gc_buf2(NULL), part2grid(NULL), boxlo(NULL) + acons(NULL), fft1(NULL), fft2(NULL), remap(NULL), gc(NULL), + gc_buf1(NULL), gc_buf2(NULL), density_A_brick(NULL), density_B_brick(NULL), density_A_fft(NULL), + density_B_fft(NULL), part2grid(NULL), boxlo(NULL) { peratom_allocate_flag = 0; 
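The pppm.cpp hunk above only shuffles the order of the constructor's member initializers; a plausible reading is that it quiets -Wreorder, which fires when the initializer list is written in a different order than the members are declared, since members are always constructed in declaration order. A minimal sketch of that diagnostic, with made-up member names:

    // members are constructed in declaration order, not initializer-list order
    struct GridBufs {
      double *fft;       // declared first
      double *density;   // declared second
      // -Wreorder: 'density' will be initialized after 'fft' despite being listed first
      GridBufs() : density(nullptr), fft(nullptr) {}
    };
    // writing the list as fft(nullptr), density(nullptr) matches the declaration order,
    // silencing the warning without changing behavior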
group_allocate_flag = 0; diff --git a/src/KSPACE/pppm_disp.cpp b/src/KSPACE/pppm_disp.cpp index 67bfc170ee..313f41d2cc 100644 --- a/src/KSPACE/pppm_disp.cpp +++ b/src/KSPACE/pppm_disp.cpp @@ -2977,7 +2977,7 @@ double PPPMDisp::compute_qopt_ik() double unitky = (2.0*MY_PI/yprd); double unitkz = (2.0*MY_PI/zprd_slab); - int nx,ny,nz,kper,lper,mper; + int nx,ny,nz; double sqk, u2; double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz; double sum1,sum2, sum3,dot1,dot2; @@ -3068,7 +3068,7 @@ double PPPMDisp::compute_qopt_ad() double unitky = (2.0*MY_PI/yprd); double unitkz = (2.0*MY_PI/zprd_slab); - int nx,ny,nz,kper,lper,mper; + int nx,ny,nz; double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz; double u2, sqk; double sum1,sum2,sum3,sum4,dot2; @@ -3158,7 +3158,7 @@ double PPPMDisp::compute_qopt_6_ik() double unitky = (2.0*MY_PI/yprd); double unitkz = (2.0*MY_PI/zprd_slab); - int nx,ny,nz,kper,lper,mper; + int nx,ny,nz; double sqk, u2; double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz; double sum1,sum2, sum3; @@ -3256,7 +3256,7 @@ double PPPMDisp::compute_qopt_6_ad() double unitky = (2.0*MY_PI/yprd); double unitkz = (2.0*MY_PI/zprd_slab); - int nx,ny,nz,kper,lper,mper; + int nx,ny,nz; double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz; double u2, sqk; double sum1,sum2,sum3,sum4; diff --git a/src/KSPACE/pppm_stagger.cpp b/src/KSPACE/pppm_stagger.cpp index 03a7374587..837644f0e3 100644 --- a/src/KSPACE/pppm_stagger.cpp +++ b/src/KSPACE/pppm_stagger.cpp @@ -273,7 +273,7 @@ double PPPMStagger::compute_qopt() { if (differentiation_flag == 1) return compute_qopt_ad(); - int k,l,m,nx,ny,nz,kper,lper,mper; + int k,l,m,nx,ny,nz; double snx,sny,snz; double cnx,cny,cnz; double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz; @@ -372,7 +372,7 @@ double PPPMStagger::compute_qopt() double PPPMStagger::compute_qopt_ad() { - int k,l,m,nx,ny,nz,kper,lper,mper; + int k,l,m,nx,ny,nz; double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz; double sum1,sum2,sum3,sum4,sum5,sum6,dot2; double u1,u2,sqk; From 729b3d2717bf39d8fad41f377776056a573a024a Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Wed, 19 Aug 2020 18:46:35 -0400 Subject: [PATCH 22/38] major whitespace cleanup --- doc/src/bond_special.rst | 4 +- src/GRANULAR/fix_wall_gran.cpp | 2 +- src/GRANULAR/fix_wall_gran_region.cpp | 2 +- src/KOKKOS/gridcomm_kokkos.cpp | 90 ++-- src/KOKKOS/gridcomm_kokkos.h | 16 +- src/KOKKOS/pair_snap_kokkos_impl.h | 6 +- src/KOKKOS/sna_kokkos.h | 2 +- src/KOKKOS/sna_kokkos_impl.h | 20 +- src/KSPACE/fft3d.cpp | 2 +- src/KSPACE/gridcomm.cpp | 176 +++---- src/KSPACE/gridcomm.h | 50 +- src/KSPACE/msm.cpp | 74 +-- src/KSPACE/msm.h | 4 +- src/KSPACE/pppm.cpp | 100 ++-- src/KSPACE/pppm.h | 2 +- src/KSPACE/pppm_cg.cpp | 14 +- src/KSPACE/pppm_dipole.cpp | 12 +- src/KSPACE/pppm_dipole.h | 2 +- src/KSPACE/pppm_dipole_spin.cpp | 6 +- src/KSPACE/pppm_disp.cpp | 696 +++++++++++++------------- src/KSPACE/pppm_stagger.cpp | 106 ++-- src/MISC/fix_gld.cpp | 2 +- src/MISC/fix_ttm.cpp | 2 +- src/MOLECULE/fix_cmap.cpp | 2 +- src/USER-INTEL/pppm_disp_intel.cpp | 94 ++-- src/USER-MISC/fix_gle.cpp | 2 +- src/USER-MISC/fix_srp.cpp | 2 +- src/USER-MISC/pair_meam_spline.cpp | 2 +- src/USER-OMP/ewald_omp.cpp | 4 +- src/fix_neigh_history.cpp | 2 +- 30 files changed, 749 insertions(+), 749 deletions(-) diff --git a/doc/src/bond_special.rst b/doc/src/bond_special.rst index f7dc43a1b2..5f2cebbb5a 100644 --- a/doc/src/bond_special.rst +++ b/doc/src/bond_special.rst @@ -73,7 +73,7 @@ ensure that the new bonds created by this style do not create spurious Specifically 1-2 interactions 
must have weights of zero, 1-3 interactions must either have weights of unity or :doc:`special_bonds angle yes ` must be used, and 1-4 interactions must -have weights of unity or :doc:`special_bonds dihedral yes ` +have weights of unity or :doc:`special_bonds dihedral yes ` must be used. If this command is used to create bonded interactions between @@ -95,7 +95,7 @@ compute interactions for individual pairs of atoms. Manybody potentials are not compatible in general, but also some other pair styles are missing the required functionality and thus will cause an error. -This command is not compatible with long-range Coulombic interactions. If a +This command is not compatible with long-range Coulombic interactions. If a `kspace_style ` is declared, an error will be issued. Related commands diff --git a/src/GRANULAR/fix_wall_gran.cpp b/src/GRANULAR/fix_wall_gran.cpp index 80a121c035..d7cbf0362a 100644 --- a/src/GRANULAR/fix_wall_gran.cpp +++ b/src/GRANULAR/fix_wall_gran.cpp @@ -1560,7 +1560,7 @@ void FixWallGran::unpack_restart(int nlocal, int nth) // skip to Nth set of extra values // unpack the Nth first values this way because other fixes pack them - + int m = 0; for (int i = 0; i < nth; i++) m += static_cast (extra[nlocal][m]); m++; diff --git a/src/GRANULAR/fix_wall_gran_region.cpp b/src/GRANULAR/fix_wall_gran_region.cpp index 6953165af6..543df02fc2 100644 --- a/src/GRANULAR/fix_wall_gran_region.cpp +++ b/src/GRANULAR/fix_wall_gran_region.cpp @@ -498,7 +498,7 @@ void FixWallGranRegion::unpack_restart(int nlocal, int nth) // skip to Nth set of extra values // unpack the Nth first values this way because other fixes pack them - + int m = 0; for (int i = 0; i < nth; i++) m += static_cast (extra[nlocal][m]); m++; diff --git a/src/KOKKOS/gridcomm_kokkos.cpp b/src/KOKKOS/gridcomm_kokkos.cpp index 21f2bb915f..c4e4a55da2 100644 --- a/src/KOKKOS/gridcomm_kokkos.cpp +++ b/src/KOKKOS/gridcomm_kokkos.cpp @@ -48,9 +48,9 @@ enum{REGULAR,TILED}; template GridCommKokkos::GridCommKokkos(LAMMPS *lmp, MPI_Comm gcomm, - int gnx, int gny, int gnz, - int ixlo, int ixhi, int iylo, int iyhi, int izlo, int izhi, - int oxlo, int oxhi, int oylo, int oyhi, int ozlo, int ozhi) + int gnx, int gny, int gnz, + int ixlo, int ixhi, int iylo, int iyhi, int izlo, int izhi, + int oxlo, int oxhi, int oylo, int oyhi, int ozlo, int ozhi) : GridComm(lmp, gcomm, gnx, gny, gnz, ixlo,ixhi, iylo, iyhi, izlo, izhi, @@ -72,9 +72,9 @@ GridCommKokkos::GridCommKokkos(LAMMPS *lmp, MPI_Comm gcomm, template GridCommKokkos::GridCommKokkos(LAMMPS *lmp, MPI_Comm gcomm, int /*flag*/, - int gnx, int gny, int gnz, - int ixlo, int ixhi, int iylo, int iyhi, int izlo, int izhi, - int oxlo, int oxhi, int oylo, int oyhi, int ozlo, int ozhi, + int gnx, int gny, int gnz, + int ixlo, int ixhi, int iylo, int iyhi, int izlo, int izhi, + int oxlo, int oxhi, int oylo, int oyhi, int ozlo, int ozhi, int /*exlo*/, int /*exhi*/, int /*eylo*/, int /*eyhi*/, int /*ezlo*/, int /*ezhi*/) : GridComm(lmp, gcomm, gnx, gny, gnz, @@ -89,14 +89,14 @@ template GridCommKokkos::~GridCommKokkos() { // regular comm data struct - + for (int i = 0; i < nswap; i++) { swap[i].packlist = NULL; swap[i].unpacklist = NULL; } // tiled comm data structs - + for (int i = 0; i < nsend; i++) send[i].packlist = NULL; @@ -163,7 +163,7 @@ void GridCommKokkos::setup_regular(int &nbuf1, int &nbuf2) // setup swaps = exchange of grid data with one of 6 neighobr procs // can be more than one in a direction if ghost region extends beyond neigh proc // all procs have same swap count, but swapsize 
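The setup_regular comment just above notes that a single direction can need more than one swap when the ghost region is deeper than what the neighboring proc owns. A rough way to picture that count, as a simplified sketch rather than the actual setup_regular loop (function and variable names are invented, and equal-size procs are assumed):

    #include <algorithm>
    // with equal-size procs, each swap delivers at most the planes one proc owns,
    // so deep ghost regions take several swaps to fill
    int count_swaps(int owned_planes, int ghost_planes) {
      int received = 0, nswaps = 0;
      while (received < ghost_planes) {
        received += std::min(owned_planes, ghost_planes - received);
        ++nswaps;
      }
      return nswaps;
    }
    // e.g. count_swaps(2,5) == 3: a 5-plane ghost slab next to procs owning 2 planes each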
npack/nunpack can be empty - + nswap = 0; // send own grid pts to -x processor, recv ghost grid pts from +x processor @@ -420,7 +420,7 @@ void GridCommKokkos::setup_tiled(int &nbuf1, int &nbuf2) // access CommTiled to get cut dimension // cut = this proc's inlo in that dim // dim is -1 for proc 0, but never accessed - + rcbinfo = (RCBinfo *) memory->smalloc(nprocs*sizeof(RCBinfo),"GridComm:rcbinfo"); RCBinfo rcbone; @@ -435,14 +435,14 @@ void GridCommKokkos::setup_tiled(int &nbuf1, int &nbuf2) // accounts for crossings of periodic boundaries // noverlap = # of overlaps, including self // overlap = vector of overlap info using Overlap data struct - + ghostbox[0] = outxlo; ghostbox[1] = outxhi; ghostbox[2] = outylo; ghostbox[3] = outyhi; ghostbox[4] = outzlo; ghostbox[5] = outzhi; - + pbc[0] = pbc[1] = pbc[2] = 0; memory->create(overlap_procs,nprocs,"GridComm:overlap_procs"); @@ -459,10 +459,10 @@ void GridCommKokkos::setup_tiled(int &nbuf1, int &nbuf2) memory->create(proclist,noverlap,"GridComm:proclist"); srequest = (Request *) memory->smalloc(noverlap*sizeof(Request),"GridComm:srequest"); - + int nsend_request = 0; ncopy = 0; - + for (m = 0; m < noverlap; m++) { if (overlap[m].proc == me) ncopy++; else { @@ -470,7 +470,7 @@ void GridCommKokkos::setup_tiled(int &nbuf1, int &nbuf2) srequest[nsend_request].sender = me; srequest[nsend_request].index = m; for (i = 0; i < 6; i++) - srequest[nsend_request].box[i] = overlap[m].box[i]; + srequest[nsend_request].box[i] = overlap[m].box[i]; nsend_request++; } } @@ -481,7 +481,7 @@ void GridCommKokkos::setup_tiled(int &nbuf1, int &nbuf2) (Request *) memory->smalloc(nrecv_request*sizeof(Request),"GridComm:rrequest"); irregular->exchange_data((char *) srequest,sizeof(Request),(char *) rrequest); irregular->destroy_data(); - + // compute overlaps between received ghost boxes and my owned box // overlap box used to setup my Send data struct and respond to requests @@ -515,7 +515,7 @@ void GridCommKokkos::setup_tiled(int &nbuf1, int &nbuf2) } nsend = nrecv_request; - + // reply to each Request message with a Response message // content: index for the overlap on requestor, overlap box on my owned grid @@ -530,13 +530,13 @@ void GridCommKokkos::setup_tiled(int &nbuf1, int &nbuf2) // process received responses // box used to setup my Recv data struct after unwrapping via PBC // adjacent = 0 if any box of ghost cells does not adjoin my owned cells - + recv = (Recv *) memory->smalloc(nrecv_response*sizeof(Recv),"GridComm:recv"); k_recv_unpacklist = DAT::tdual_int_2d("GridComm:recv_unpacklist",nrecv_response,k_recv_unpacklist.extent(1)); adjacent = 1; - + for (i = 0; i < nrecv_response; i++) { m = rresponse[i].index; recv[i].proc = overlap[m].proc; @@ -547,21 +547,21 @@ void GridCommKokkos::setup_tiled(int &nbuf1, int &nbuf2) zlo = rresponse[i].box[4] + overlap[m].pbc[2] * nz; zhi = rresponse[i].box[5] + overlap[m].pbc[2] * nz; recv[i].nunpack = indices(k_recv_unpacklist,i,xlo,xhi,ylo,yhi,zlo,zhi); - + if (xlo != inxhi+1 && xhi != inxlo-1 && - ylo != inyhi+1 && yhi != inylo-1 && - zlo != inzhi+1 && zhi != inzlo-1) adjacent = 0; + ylo != inyhi+1 && yhi != inylo-1 && + zlo != inzhi+1 && zhi != inzlo-1) adjacent = 0; } nrecv = nrecv_response; // create Copy data struct from overlaps with self - + copy = (Copy *) memory->smalloc(ncopy*sizeof(Copy),"GridComm:copy"); k_copy_packlist = DAT::tdual_int_2d("GridComm:copy_packlist",ncopy,k_copy_packlist.extent(1)); k_copy_unpacklist = DAT::tdual_int_2d("GridComm:copy_unpacklist",ncopy,k_copy_unpacklist.extent(1)); - + 
ncopy = 0; for (m = 0; m < noverlap; m++) { if (overlap[m].proc != me) continue; @@ -600,7 +600,7 @@ void GridCommKokkos::setup_tiled(int &nbuf1, int &nbuf2) int nrequest = MAX(nsend,nrecv); requests = new MPI_Request[nrequest]; - + // clean-up memory->sfree(rcbinfo); @@ -614,7 +614,7 @@ void GridCommKokkos::setup_tiled(int &nbuf1, int &nbuf2) // nbuf1 = largest pack or unpack in any Send or Recv or Copy // nbuf2 = larget of sum of all packs or unpacks in Send or Recv - + nbuf1 = 0; for (m = 0; m < ncopy; m++) { @@ -643,7 +643,7 @@ void GridCommKokkos::setup_tiled(int &nbuf1, int &nbuf2) template void GridCommKokkos::forward_comm_kspace(KSpace *kspace, int nper, int which, - FFT_DAT::tdual_FFT_SCALAR_1d &k_buf1, FFT_DAT::tdual_FFT_SCALAR_1d &k_buf2, MPI_Datatype datatype) + FFT_DAT::tdual_FFT_SCALAR_1d &k_buf1, FFT_DAT::tdual_FFT_SCALAR_1d &k_buf2, MPI_Datatype datatype) { if (layout == REGULAR) forward_comm_kspace_regular(kspace,nper,which,k_buf1,k_buf2,datatype); @@ -656,7 +656,7 @@ void GridCommKokkos::forward_comm_kspace(KSpace *kspace, int nper, i template void GridCommKokkos:: forward_comm_kspace_regular(KSpace *kspace, int nper, int which, - FFT_DAT::tdual_FFT_SCALAR_1d &k_buf1, FFT_DAT::tdual_FFT_SCALAR_1d &k_buf2, MPI_Datatype datatype) + FFT_DAT::tdual_FFT_SCALAR_1d &k_buf1, FFT_DAT::tdual_FFT_SCALAR_1d &k_buf2, MPI_Datatype datatype) { int m; MPI_Request request; @@ -687,9 +687,9 @@ forward_comm_kspace_regular(KSpace *kspace, int nper, int which, } if (swap[m].nunpack) MPI_Irecv(buf2,nper*swap[m].nunpack,datatype, - swap[m].recvproc,0,gridcomm,&request); + swap[m].recvproc,0,gridcomm,&request); if (swap[m].npack) MPI_Send(buf1,nper*swap[m].npack,datatype, - swap[m].sendproc,0,gridcomm); + swap[m].sendproc,0,gridcomm); if (swap[m].nunpack) MPI_Wait(&request,MPI_STATUS_IGNORE); if (!lmp->kokkos->cuda_aware_flag) { @@ -708,7 +708,7 @@ forward_comm_kspace_regular(KSpace *kspace, int nper, int which, template void GridCommKokkos:: forward_comm_kspace_tiled(KSpace *kspace, int nper, int which, - FFT_DAT::tdual_FFT_SCALAR_1d &k_buf1, FFT_DAT::tdual_FFT_SCALAR_1d &k_buf2, MPI_Datatype datatype) + FFT_DAT::tdual_FFT_SCALAR_1d &k_buf1, FFT_DAT::tdual_FFT_SCALAR_1d &k_buf2, MPI_Datatype datatype) { int i,m,offset; @@ -724,11 +724,11 @@ forward_comm_kspace_tiled(KSpace *kspace, int nper, int which, } // post all receives - + for (m = 0; m < nrecv; m++) { offset = nper * recv[m].offset; MPI_Irecv(&buf2[offset],nper*recv[m].nunpack,datatype, - recv[m].proc,0,gridcomm,&requests[m]); + recv[m].proc,0,gridcomm,&requests[m]); } // perform all sends to other procs @@ -753,7 +753,7 @@ forward_comm_kspace_tiled(KSpace *kspace, int nper, int which, } // unpack all received data - + for (i = 0; i < nrecv; i++) { MPI_Waitany(nrecv,requests,&m,MPI_STATUS_IGNORE); @@ -764,7 +764,7 @@ forward_comm_kspace_tiled(KSpace *kspace, int nper, int which, offset = nper * recv[m].offset; kspaceKKBase->unpack_forward_grid_kokkos(which,k_buf2,offset, - recv[m].nunpack,k_recv_unpacklist,m); + recv[m].nunpack,k_recv_unpacklist,m); DeviceType().fence(); } } @@ -776,7 +776,7 @@ forward_comm_kspace_tiled(KSpace *kspace, int nper, int which, template void GridCommKokkos::reverse_comm_kspace(KSpace *kspace, int nper, int which, - FFT_DAT::tdual_FFT_SCALAR_1d &k_buf1, FFT_DAT::tdual_FFT_SCALAR_1d &k_buf2, MPI_Datatype datatype) + FFT_DAT::tdual_FFT_SCALAR_1d &k_buf1, FFT_DAT::tdual_FFT_SCALAR_1d &k_buf2, MPI_Datatype datatype) { if (layout == REGULAR) reverse_comm_kspace_regular(kspace,nper,which,k_buf1,k_buf2,datatype); @@ 
-789,7 +789,7 @@ void GridCommKokkos::reverse_comm_kspace(KSpace *kspace, int nper, i template void GridCommKokkos:: reverse_comm_kspace_regular(KSpace *kspace, int nper, int which, - FFT_DAT::tdual_FFT_SCALAR_1d &k_buf1, FFT_DAT::tdual_FFT_SCALAR_1d &k_buf2, MPI_Datatype datatype) + FFT_DAT::tdual_FFT_SCALAR_1d &k_buf1, FFT_DAT::tdual_FFT_SCALAR_1d &k_buf2, MPI_Datatype datatype) { int m; MPI_Request request; @@ -820,9 +820,9 @@ reverse_comm_kspace_regular(KSpace *kspace, int nper, int which, } if (swap[m].npack) MPI_Irecv(buf2,nper*swap[m].npack,datatype, - swap[m].sendproc,0,gridcomm,&request); + swap[m].sendproc,0,gridcomm,&request); if (swap[m].nunpack) MPI_Send(buf1,nper*swap[m].nunpack,datatype, - swap[m].recvproc,0,gridcomm); + swap[m].recvproc,0,gridcomm); if (swap[m].npack) MPI_Wait(&request,MPI_STATUS_IGNORE); @@ -842,7 +842,7 @@ reverse_comm_kspace_regular(KSpace *kspace, int nper, int which, template void GridCommKokkos:: reverse_comm_kspace_tiled(KSpace *kspace, int nper, int which, - FFT_DAT::tdual_FFT_SCALAR_1d &k_buf1, FFT_DAT::tdual_FFT_SCALAR_1d &k_buf2, MPI_Datatype datatype) + FFT_DAT::tdual_FFT_SCALAR_1d &k_buf1, FFT_DAT::tdual_FFT_SCALAR_1d &k_buf2, MPI_Datatype datatype) { int i,m,offset; @@ -859,11 +859,11 @@ reverse_comm_kspace_tiled(KSpace *kspace, int nper, int which, } // post all receives - + for (m = 0; m < nsend; m++) { offset = nper * send[m].offset; MPI_Irecv(&buf2[offset],nper*send[m].npack,datatype, - send[m].proc,0,gridcomm,&requests[m]); + send[m].proc,0,gridcomm,&requests[m]); } // perform all sends to other procs @@ -888,7 +888,7 @@ reverse_comm_kspace_tiled(KSpace *kspace, int nper, int which, } // unpack all received data - + for (i = 0; i < nsend; i++) { MPI_Waitany(nsend,requests,&m,MPI_STATUS_IGNORE); @@ -899,7 +899,7 @@ reverse_comm_kspace_tiled(KSpace *kspace, int nper, int which, offset = nper * send[m].offset; kspaceKKBase->unpack_reverse_grid_kokkos(which,k_buf2,offset, - send[m].npack,k_send_packlist,m); + send[m].npack,k_send_packlist,m); DeviceType().fence(); } } @@ -942,7 +942,7 @@ int GridCommKokkos::indices(DAT::tdual_int_2d &k_list, int index, if (k_list.extent(1) < nmax) k_list.resize(k_list.extent(0),nmax); - if (nmax == 0) return 0; + if (nmax == 0) return 0; int nx = (fullxhi-fullxlo+1); int ny = (fullyhi-fullylo+1); diff --git a/src/KOKKOS/gridcomm_kokkos.h b/src/KOKKOS/gridcomm_kokkos.h index 1c82ab18e4..1f93c111ca 100644 --- a/src/KOKKOS/gridcomm_kokkos.h +++ b/src/KOKKOS/gridcomm_kokkos.h @@ -27,17 +27,17 @@ class GridCommKokkos : public GridComm { typedef ArrayTypes AT; typedef FFTArrayTypes FFT_AT; GridCommKokkos(class LAMMPS *, MPI_Comm, int, int, int, - int, int, int, int, int, int, - int, int, int, int, int, int); + int, int, int, int, int, int, + int, int, int, int, int, int); GridCommKokkos(class LAMMPS *, MPI_Comm, int, int, int, int, - int, int, int, int, int, int, - int, int, int, int, int, int, - int, int, int, int, int, int); + int, int, int, int, int, int, + int, int, int, int, int, int, + int, int, int, int, int, int); virtual ~GridCommKokkos(); void forward_comm_kspace(class KSpace *, int, int, - FFT_DAT::tdual_FFT_SCALAR_1d &, FFT_DAT::tdual_FFT_SCALAR_1d &, MPI_Datatype); + FFT_DAT::tdual_FFT_SCALAR_1d &, FFT_DAT::tdual_FFT_SCALAR_1d &, MPI_Datatype); void reverse_comm_kspace(class KSpace *, int, int, - FFT_DAT::tdual_FFT_SCALAR_1d &, FFT_DAT::tdual_FFT_SCALAR_1d &, MPI_Datatype); + FFT_DAT::tdual_FFT_SCALAR_1d &, FFT_DAT::tdual_FFT_SCALAR_1d &, MPI_Datatype); private: DAT::tdual_int_2d k_swap_packlist; @@ 
-67,7 +67,7 @@ class GridCommKokkos : public GridComm { FFT_DAT::tdual_FFT_SCALAR_1d &, FFT_DAT::tdual_FFT_SCALAR_1d &, MPI_Datatype); void grow_swap(); - + int indices(DAT::tdual_int_2d &, int, int, int, int, int, int, int); }; diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h index e2a66ee28c..fbdb23a79b 100644 --- a/src/KOKKOS/pair_snap_kokkos_impl.h +++ b/src/KOKKOS/pair_snap_kokkos_impl.h @@ -718,7 +718,7 @@ void PairSNAPKokkos::operator() (TagPairSNAPTransformUi,const int ia const int iatom = iatom_mod + iatom_div * 32; if (iatom >= chunk_size) return; - if (j > twojmax) return; + if (j > twojmax) return; int elem_count = chemflag ? nelements : 1; @@ -739,7 +739,7 @@ void PairSNAPKokkos::operator() (TagPairSNAPTransformUi,const int ia // Store my_sna.ulisttot_pack(iatom_mod, idxu, ielem, iatom_div) = { utot_re, utot_im }; - + // Also zero yi my_sna.ylist_pack_re(iatom_mod, idxu_half, ielem, iatom_div) = 0.; my_sna.ylist_pack_im(iatom_mod, idxu_half, ielem, iatom_div) = 0.; @@ -944,7 +944,7 @@ void PairSNAPKokkos::operator() (TagPairSNAPTransformUiCPU, const in if (iatom >= chunk_size) return; - if (j > twojmax) return; + if (j > twojmax) return; int elem_count = chemflag ? nelements : 1; diff --git a/src/KOKKOS/sna_kokkos.h b/src/KOKKOS/sna_kokkos.h index 765ec1a05a..2c712a1c84 100644 --- a/src/KOKKOS/sna_kokkos.h +++ b/src/KOKKOS/sna_kokkos.h @@ -55,7 +55,7 @@ public: typedef Kokkos::View t_sna_3c3; typedef Kokkos::View t_sna_5c; - typedef Kokkos::View t_sna_2ckp; + typedef Kokkos::View t_sna_2ckp; inline SNAKokkos() {}; diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h index 817617090d..0a7dae04a4 100644 --- a/src/KOKKOS/sna_kokkos_impl.h +++ b/src/KOKKOS/sna_kokkos_impl.h @@ -120,7 +120,7 @@ void SNAKokkos::build_indexlist() idxu_max = idxu_count; Kokkos::deep_copy(idxu_block,h_idxu_block); - // index list for half uarray + // index list for half uarray idxu_half_block = Kokkos::View("SNAKokkos::idxu_half_block",jdim); auto h_idxu_half_block = Kokkos::create_mirror_view(idxu_half_block); @@ -316,7 +316,7 @@ void SNAKokkos::grow_rij(int newnatom, int newnmax) /* ---------------------------------------------------------------------- Precompute the Cayley-Klein parameters and the derivatives thereof. - This routine better exploits parallelism than the GPU ComputeUi and + This routine better exploits parallelism than the GPU ComputeUi and ComputeFusedDeidrj, which are one warp per atom-neighbor pair. ------------------------------------------------------------------------- */ @@ -339,7 +339,7 @@ void SNAKokkos::compute_cayley_klein(const int& iatom, const int& jn const double dz0dr = z0 / r - (r*rscale0) * (rsq + z0 * z0) / rsq; //const double wj_local = wj(iatom, jnbor); - double sfac, dsfac; + double sfac, dsfac; compute_s_dsfac(r, rcut, sfac, dsfac); sfac *= wj_local; dsfac *= wj_local; @@ -639,7 +639,7 @@ void SNAKokkos::compute_zi(const int& iatom_mod, const int& jjz, con /* ---------------------------------------------------------------------- compute Bi by summing conj(Ui)*Zi AoSoA data layout to take advantage of coalescing, avoiding warp - divergence. + divergence. 
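The packed SNAP views touched in the hunks above are addressed with an (iatom_mod, iatom_div) pair derived from the flat atom index (iatom = iatom_mod + iatom_div * 32), so consecutive threads touch consecutive iatom_mod slots of the same tile and their loads coalesce, which is what the "AoSoA data layout" comment refers to. Below is a minimal standalone C++ sketch of that addressing; the tile width of 32 and the index order mirror the diff, while the plain std::vector storage, the PackedU name, and the sizes in main() are illustrative assumptions only.

#include <cstdio>
#include <vector>

// Hypothetical flat AoSoA buffer: 32-atom tiles are innermost, loosely
// mirroring ulisttot_pack(iatom_mod, idxu, ielem, iatom_div) in the diff.
constexpr int VEC = 32;

struct PackedU {
  int nidxu, nelem, ndiv;
  std::vector<double> data;               // size = VEC * nidxu * nelem * ndiv
  PackedU(int nu, int ne, int natom)
    : nidxu(nu), nelem(ne), ndiv((natom + VEC - 1) / VEC),
      data((size_t)VEC * nu * ne * ndiv, 0.0) {}
  // flatten (iatom_mod, idxu, ielem, iatom_div) with iatom_mod fastest, so
  // consecutive iatom_mod values (consecutive threads) hit adjacent memory
  double &at(int iatom_mod, int idxu, int ielem, int iatom_div) {
    return data[(((size_t)iatom_div * nelem + ielem) * nidxu + idxu) * VEC + iatom_mod];
  }
};

int main() {
  int natom = 100, nidxu = 4, nelem = 1;   // illustrative sizes
  PackedU u(nidxu, nelem, natom);
  for (int iatom = 0; iatom < natom; ++iatom) {
    int iatom_mod = iatom % VEC;           // slot inside the 32-wide tile
    int iatom_div = iatom / VEC;           // which tile
    u.at(iatom_mod, 0, 0, iatom_div) = iatom;
  }
  std::printf("atom 37 stored at tile %d slot %d -> %g\n",
              37 / VEC, 37 % VEC, u.at(37 % VEC, 0, 0, 37 / VEC));
  return 0;
}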
------------------------------------------------------------------------- */ template @@ -1406,7 +1406,7 @@ void SNAKokkos::compute_deidrj_cpu(const typename Kokkos::TeamPolicy for(int ma = 0; ma <= j; ma++) { sum_tmp.x += dulist(jju_cache,iatom,jnbor,0).re * ylist(jju_half,jelem,iatom).re + dulist(jju_cache,iatom,jnbor,0).im * ylist(jju_half,jelem,iatom).im; - sum_tmp.y += dulist(jju_cache,iatom,jnbor,1).re * ylist(jju_half,jelem,iatom).re + + sum_tmp.y += dulist(jju_cache,iatom,jnbor,1).re * ylist(jju_half,jelem,iatom).re + dulist(jju_cache,iatom,jnbor,1).im * ylist(jju_half,jelem,iatom).im; sum_tmp.z += dulist(jju_cache,iatom,jnbor,2).re * ylist(jju_half,jelem,iatom).re + dulist(jju_cache,iatom,jnbor,2).im * ylist(jju_half,jelem,iatom).im; @@ -1421,9 +1421,9 @@ void SNAKokkos::compute_deidrj_cpu(const typename Kokkos::TeamPolicy for(int ma = 0; ma < mb; ma++) { sum_tmp.x += dulist(jju_cache,iatom,jnbor,0).re * ylist(jju_half,jelem,iatom).re + dulist(jju_cache,iatom,jnbor,0).im * ylist(jju_half,jelem,iatom).im; - sum_tmp.y += dulist(jju_cache,iatom,jnbor,1).re * ylist(jju_half,jelem,iatom).re + + sum_tmp.y += dulist(jju_cache,iatom,jnbor,1).re * ylist(jju_half,jelem,iatom).re + dulist(jju_cache,iatom,jnbor,1).im * ylist(jju_half,jelem,iatom).im; - sum_tmp.z += dulist(jju_cache,iatom,jnbor,2).re * ylist(jju_half,jelem,iatom).re + + sum_tmp.z += dulist(jju_cache,iatom,jnbor,2).re * ylist(jju_half,jelem,iatom).re + dulist(jju_cache,iatom,jnbor,2).im * ylist(jju_half,jelem,iatom).im; jju_half++; jju_cache++; } @@ -1431,9 +1431,9 @@ void SNAKokkos::compute_deidrj_cpu(const typename Kokkos::TeamPolicy //int ma = mb; sum_tmp.x += (dulist(jju_cache,iatom,jnbor,0).re * ylist(jju_half,jelem,iatom).re + dulist(jju_cache,iatom,jnbor,0).im * ylist(jju_half,jelem,iatom).im)*0.5; - sum_tmp.y += (dulist(jju_cache,iatom,jnbor,1).re * ylist(jju_half,jelem,iatom).re + + sum_tmp.y += (dulist(jju_cache,iatom,jnbor,1).re * ylist(jju_half,jelem,iatom).re + dulist(jju_cache,iatom,jnbor,1).im * ylist(jju_half,jelem,iatom).im)*0.5; - sum_tmp.z += (dulist(jju_cache,iatom,jnbor,2).re * ylist(jju_half,jelem,iatom).re + + sum_tmp.z += (dulist(jju_cache,iatom,jnbor,2).re * ylist(jju_half,jelem,iatom).re + dulist(jju_cache,iatom,jnbor,2).im * ylist(jju_half,jelem,iatom).im)*0.5; } // end if jeven @@ -2162,7 +2162,7 @@ double SNAKokkos::memory_usage() #ifdef LMP_KOKKOS_GPU if (std::is_same::value) { - + auto natom_pad = (natom+32-1)/32; bytes += natom * idxu_half_max * nelements * sizeof(double); // ulisttot_re diff --git a/src/KSPACE/fft3d.cpp b/src/KSPACE/fft3d.cpp index 0b4bb85032..7c555e99b5 100644 --- a/src/KSPACE/fft3d.cpp +++ b/src/KSPACE/fft3d.cpp @@ -198,7 +198,7 @@ void fft_3d(FFT_DATA *in, FFT_DATA *out, int flag, struct fft_plan_3d *plan) (FFT_SCALAR *) plan->scratch, plan->post_plan); // scaling if required - + if (flag == -1 && plan->scaled) { norm = plan->norm; num = plan->normnum; diff --git a/src/KSPACE/gridcomm.cpp b/src/KSPACE/gridcomm.cpp index 27952879ce..4024670b9f 100644 --- a/src/KSPACE/gridcomm.cpp +++ b/src/KSPACE/gridcomm.cpp @@ -46,9 +46,9 @@ enum{REGULAR,TILED}; ------------------------------------------------------------------------- */ GridComm::GridComm(LAMMPS *lmp, MPI_Comm gcomm, - int gnx, int gny, int gnz, - int ixlo, int ixhi, int iylo, int iyhi, int izlo, int izhi, - int oxlo, int oxhi, int oylo, int oyhi, int ozlo, int ozhi) + int gnx, int gny, int gnz, + int ixlo, int ixhi, int iylo, int iyhi, int izlo, int izhi, + int oxlo, int oxhi, int oylo, int oyhi, int ozlo, int ozhi) : 
Pointers(lmp) { if (comm->layout == Comm::LAYOUT_TILED) layout = TILED; @@ -57,18 +57,18 @@ GridComm::GridComm(LAMMPS *lmp, MPI_Comm gcomm, if (layout == REGULAR) { int (*procneigh)[2] = comm->procneigh; initialize(gcomm,gnx,gny,gnz, - ixlo,ixhi,iylo,iyhi,izlo,izhi, - oxlo,oxhi,oylo,oyhi,ozlo,ozhi, - oxlo,oxhi,oylo,oyhi,ozlo,ozhi, - procneigh[0][0],procneigh[0][1], - procneigh[1][0],procneigh[1][1], - procneigh[2][0],procneigh[2][1]); + ixlo,ixhi,iylo,iyhi,izlo,izhi, + oxlo,oxhi,oylo,oyhi,ozlo,ozhi, + oxlo,oxhi,oylo,oyhi,ozlo,ozhi, + procneigh[0][0],procneigh[0][1], + procneigh[1][0],procneigh[1][1], + procneigh[2][0],procneigh[2][1]); } else { initialize(gcomm,gnx,gny,gnz, - ixlo,ixhi,iylo,iyhi,izlo,izhi, - oxlo,oxhi,oylo,oyhi,ozlo,ozhi, - oxlo,oxhi,oylo,oyhi,ozlo,ozhi, - 0,0,0,0,0,0); + ixlo,ixhi,iylo,iyhi,izlo,izhi, + oxlo,oxhi,oylo,oyhi,ozlo,ozhi, + oxlo,oxhi,oylo,oyhi,ozlo,ozhi, + 0,0,0,0,0,0); } } @@ -85,10 +85,10 @@ GridComm::GridComm(LAMMPS *lmp, MPI_Comm gcomm, ------------------------------------------------------------------------- */ GridComm::GridComm(LAMMPS *lmp, MPI_Comm gcomm, int flag, - int gnx, int gny, int gnz, - int ixlo, int ixhi, int iylo, int iyhi, int izlo, int izhi, - int oxlo, int oxhi, int oylo, int oyhi, int ozlo, int ozhi, - int exlo, int exhi, int eylo, int eyhi, int ezlo, int ezhi) + int gnx, int gny, int gnz, + int ixlo, int ixhi, int iylo, int iyhi, int izlo, int izhi, + int oxlo, int oxhi, int oylo, int oyhi, int ozlo, int ozhi, + int exlo, int exhi, int eylo, int eyhi, int ezlo, int ezhi) : Pointers(lmp) { if (comm->layout == Comm::LAYOUT_TILED) layout = TILED; @@ -99,27 +99,27 @@ GridComm::GridComm(LAMMPS *lmp, MPI_Comm gcomm, int flag, // this assumes gcomm = world int (*procneigh)[2] = comm->procneigh; initialize(gcomm,gnx,gny,gnz, - ixlo,ixhi,iylo,iyhi,izlo,izhi, - oxlo,oxhi,oylo,oyhi,ozlo,ozhi, - exlo,exhi,eylo,eyhi,ezlo,ezhi, - procneigh[0][0],procneigh[0][1], - procneigh[1][0],procneigh[1][1], - procneigh[2][0],procneigh[2][1]); + ixlo,ixhi,iylo,iyhi,izlo,izhi, + oxlo,oxhi,oylo,oyhi,ozlo,ozhi, + exlo,exhi,eylo,eyhi,ezlo,ezhi, + procneigh[0][0],procneigh[0][1], + procneigh[1][0],procneigh[1][1], + procneigh[2][0],procneigh[2][1]); } else { initialize(gcomm,gnx,gny,gnz, - ixlo,ixhi,iylo,iyhi,izlo,izhi, - oxlo,oxhi,oylo,oyhi,ozlo,ozhi, - exlo,exhi,eylo,eyhi,ezlo,ezhi, - 0,0,0,0,0,0); + ixlo,ixhi,iylo,iyhi,izlo,izhi, + oxlo,oxhi,oylo,oyhi,ozlo,ozhi, + exlo,exhi,eylo,eyhi,ezlo,ezhi, + 0,0,0,0,0,0); } - + } else if (flag == 2) { if (layout == REGULAR) { initialize(gcomm,gnx,gny,gnz, - ixlo,ixhi,iylo,iyhi,izlo,izhi, - oxlo,oxhi,oylo,oyhi,ozlo,ozhi, - oxlo,oxhi,oylo,oyhi,ozlo,ozhi, - exlo,exhi,eylo,eyhi,ezlo,ezhi); + ixlo,ixhi,iylo,iyhi,izlo,izhi, + oxlo,oxhi,oylo,oyhi,ozlo,ozhi, + oxlo,oxhi,oylo,oyhi,ozlo,ozhi, + exlo,exhi,eylo,eyhi,ezlo,ezhi); } else { error->all(FLERR,"GridComm does not support tiled layout with neighbor procs"); } @@ -131,7 +131,7 @@ GridComm::GridComm(LAMMPS *lmp, MPI_Comm gcomm, int flag, GridComm::~GridComm() { // regular comm data struct - + for (int i = 0; i < nswap; i++) { memory->destroy(swap[i].packlist); memory->destroy(swap[i].unpacklist); @@ -139,7 +139,7 @@ GridComm::~GridComm() memory->sfree(swap); // tiled comm data structs - + for (int i = 0; i < nsend; i++) memory->destroy(send[i].packlist); memory->sfree(send); @@ -162,11 +162,11 @@ GridComm::~GridComm() ------------------------------------------------------------------------- */ void GridComm::initialize(MPI_Comm gcomm, - int gnx, int gny, int gnz, - int ixlo, int 
ixhi, int iylo, int iyhi, int izlo, int izhi, - int oxlo, int oxhi, int oylo, int oyhi, int ozlo, int ozhi, - int fxlo, int fxhi, int fylo, int fyhi, int fzlo, int fzhi, - int pxlo, int pxhi, int pylo, int pyhi, int pzlo, int pzhi) + int gnx, int gny, int gnz, + int ixlo, int ixhi, int iylo, int iyhi, int izlo, int izhi, + int oxlo, int oxhi, int oylo, int oyhi, int ozlo, int ozhi, + int fxlo, int fxhi, int fylo, int fyhi, int fzlo, int fzhi, + int pxlo, int pxhi, int pylo, int pyhi, int pzlo, int pzhi) { gridcomm = gcomm; MPI_Comm_rank(gridcomm,&me); @@ -175,7 +175,7 @@ void GridComm::initialize(MPI_Comm gcomm, nx = gnx; ny = gny; nz = gnz; - + inxlo = ixlo; inxhi = ixhi; inylo = iylo; @@ -209,7 +209,7 @@ void GridComm::initialize(MPI_Comm gcomm, } // internal data initializations - + nswap = maxswap = 0; swap = NULL; @@ -287,7 +287,7 @@ void GridComm::setup_regular(int &nbuf1, int &nbuf2) // setup swaps = exchange of grid data with one of 6 neighobr procs // can be more than one in a direction if ghost region extends beyond neigh proc // all procs have same swap count, but swapsize npack/nunpack can be empty - + nswap = 0; // send own grid pts to -x processor, recv ghost grid pts from +x processor @@ -548,7 +548,7 @@ void GridComm::setup_tiled(int &nbuf1, int &nbuf2) // access CommTiled to get cut dimension // cut = this proc's inlo in that dim // dim is -1 for proc 0, but never accessed - + rcbinfo = (RCBinfo *) memory->smalloc(nprocs*sizeof(RCBinfo),"GridComm:rcbinfo"); RCBinfo rcbone; @@ -563,14 +563,14 @@ void GridComm::setup_tiled(int &nbuf1, int &nbuf2) // accounts for crossings of periodic boundaries // noverlap = # of overlaps, including self // overlap = vector of overlap info using Overlap data struct - + ghostbox[0] = outxlo; ghostbox[1] = outxhi; ghostbox[2] = outylo; ghostbox[3] = outyhi; ghostbox[4] = outzlo; ghostbox[5] = outzhi; - + pbc[0] = pbc[1] = pbc[2] = 0; memory->create(overlap_procs,nprocs,"GridComm:overlap_procs"); @@ -587,10 +587,10 @@ void GridComm::setup_tiled(int &nbuf1, int &nbuf2) memory->create(proclist,noverlap,"GridComm:proclist"); srequest = (Request *) memory->smalloc(noverlap*sizeof(Request),"GridComm:srequest"); - + int nsend_request = 0; ncopy = 0; - + for (m = 0; m < noverlap; m++) { if (overlap[m].proc == me) ncopy++; else { @@ -598,7 +598,7 @@ void GridComm::setup_tiled(int &nbuf1, int &nbuf2) srequest[nsend_request].sender = me; srequest[nsend_request].index = m; for (i = 0; i < 6; i++) - srequest[nsend_request].box[i] = overlap[m].box[i]; + srequest[nsend_request].box[i] = overlap[m].box[i]; nsend_request++; } } @@ -609,7 +609,7 @@ void GridComm::setup_tiled(int &nbuf1, int &nbuf2) (Request *) memory->smalloc(nrecv_request*sizeof(Request),"GridComm:rrequest"); irregular->exchange_data((char *) srequest,sizeof(Request),(char *) rrequest); irregular->destroy_data(); - + // compute overlaps between received ghost boxes and my owned box // overlap box used to setup my Send data struct and respond to requests @@ -640,7 +640,7 @@ void GridComm::setup_tiled(int &nbuf1, int &nbuf2) } nsend = nrecv_request; - + // reply to each Request message with a Response message // content: index for the overlap on requestor, overlap box on my owned grid @@ -655,10 +655,10 @@ void GridComm::setup_tiled(int &nbuf1, int &nbuf2) // process received responses // box used to setup my Recv data struct after unwrapping via PBC // adjacent = 0 if any box of ghost cells does not adjoin my owned cells - + recv = (Recv *) 
memory->smalloc(nrecv_response*sizeof(Recv),"GridComm:recv"); adjacent = 1; - + for (i = 0; i < nrecv_response; i++) { m = rresponse[i].index; recv[i].proc = overlap[m].proc; @@ -669,18 +669,18 @@ void GridComm::setup_tiled(int &nbuf1, int &nbuf2) zlo = rresponse[i].box[4] + overlap[m].pbc[2] * nz; zhi = rresponse[i].box[5] + overlap[m].pbc[2] * nz; recv[i].nunpack = indices(recv[i].unpacklist,xlo,xhi,ylo,yhi,zlo,zhi); - + if (xlo != inxhi+1 && xhi != inxlo-1 && - ylo != inyhi+1 && yhi != inylo-1 && - zlo != inzhi+1 && zhi != inzlo-1) adjacent = 0; + ylo != inyhi+1 && yhi != inylo-1 && + zlo != inzhi+1 && zhi != inzlo-1) adjacent = 0; } nrecv = nrecv_response; // create Copy data struct from overlaps with self - + copy = (Copy *) memory->smalloc(ncopy*sizeof(Copy),"GridComm:copy"); - + ncopy = 0; for (m = 0; m < noverlap; m++) { if (overlap[m].proc != me) continue; @@ -719,7 +719,7 @@ void GridComm::setup_tiled(int &nbuf1, int &nbuf2) int nrequest = MAX(nsend,nrecv); requests = new MPI_Request[nrequest]; - + // clean-up memory->sfree(rcbinfo); @@ -733,7 +733,7 @@ void GridComm::setup_tiled(int &nbuf1, int &nbuf2) // nbuf1 = largest pack or unpack in any Send or Recv or Copy // nbuf2 = larget of sum of all packs or unpacks in Send or Recv - + nbuf1 = 0; for (m = 0; m < ncopy; m++) { @@ -769,9 +769,9 @@ void GridComm::setup_tiled(int &nbuf1, int &nbuf2) void GridComm::ghost_box_drop(int *box, int *pbc) { int i,m; - + // newbox12 and newpbc are initially copies of caller box and pbc - + int newbox1[6],newbox2[6],newpbc[3]; for (i = 0; i < 6; i++) newbox1[i] = newbox2[i] = box[i]; @@ -780,9 +780,9 @@ void GridComm::ghost_box_drop(int *box, int *pbc) // 6 if tests to see if box needs to be split across a periodic boundary // newbox1 and 2 = new split boxes, newpbc increments current pbc // final else is no split - + int splitflag = 1; - + if (box[0] < 0) { newbox1[0] = 0; newbox2[0] = box[0] + nx; @@ -801,7 +801,7 @@ void GridComm::ghost_box_drop(int *box, int *pbc) } else if (box[3] >= ny) { newbox1[3] = ny - 1; newbox2[2] = 0; - newbox2[3] = box[3] - ny; + newbox2[3] = box[3] - ny; newpbc[1]++; } else if (box[4] < 0) { newbox1[4] = 0; @@ -819,7 +819,7 @@ void GridComm::ghost_box_drop(int *box, int *pbc) // returns proc_overlap = list of proc IDs it overlaps // skip self overlap if no crossing of periodic boundaries // do not skip self if overlap is in another periodic image - + } else { splitflag = 0; int np = 0; @@ -827,7 +827,7 @@ void GridComm::ghost_box_drop(int *box, int *pbc) for (m = 0; m < np; m++) { if (noverlap == maxoverlap) grow_overlap(); if (overlap_procs[m] == me && - pbc[0] == 0 && pbc[1] == 0 && pbc[2] == 0) continue; + pbc[0] == 0 && pbc[1] == 0 && pbc[2] == 0) continue; overlap[noverlap].proc = overlap_procs[m]; for (i = 0; i < 6; i++) overlap[noverlap].box[i] = box[i]; for (i = 0; i < 3; i++) overlap[noverlap].pbc[i] = pbc[i]; @@ -836,7 +836,7 @@ void GridComm::ghost_box_drop(int *box, int *pbc) } // recurse with 2 split boxes - + if (splitflag) { ghost_box_drop(newbox1,pbc); ghost_box_drop(newbox2,newpbc); @@ -852,7 +852,7 @@ void GridComm::ghost_box_drop(int *box, int *pbc) ------------------------------------------------------------------------- */ void GridComm::box_drop_grid(int *box, int proclower, int procupper, - int &np, int *plist) + int &np, int *plist) { // end recursion when partition is a single proc // add proclower to plist @@ -926,7 +926,7 @@ int GridComm::ghost_adjacent_tiled() ------------------------------------------------------------------------- */ 
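For the tiled layout handled above, ghost_box_drop() recursively splits any ghost box that extends past a periodic grid boundary: the out-of-range part is shifted back into 0..N-1 while its pbc image counter is adjusted, and only pieces that lie entirely inside the grid are matched against other procs' owned boxes. A minimal one-dimensional sketch of that recursion follows, assuming a periodic grid of nx cells; drop_overlap() is a stand-in for the real overlap search against the RCB tree.

#include <cstdio>

const int nx = 16;   // global grid size in this dimension (illustrative)

// stand-in for the overlap search done once a piece fits inside 0..nx-1
void drop_overlap(int lo, int hi, int pbc) {
  std::printf("piece [%d,%d] pbc=%d\n", lo, hi, pbc);
}

// recursively split [lo,hi] at the periodic boundaries, as ghost_box_drop does
void box_drop_1d(int lo, int hi, int pbc) {
  if (lo < 0) {                              // sticks out past the low edge
    box_drop_1d(0, hi, pbc);                 // in-domain part
    box_drop_1d(lo + nx, nx - 1, pbc - 1);   // wrapped part, one image lower
  } else if (hi >= nx) {                     // sticks out past the high edge
    box_drop_1d(lo, nx - 1, pbc);
    box_drop_1d(0, hi - nx, pbc + 1);        // wrapped part, one image higher
  } else {
    drop_overlap(lo, hi, pbc);               // fully inside: record the overlap
  }
}

int main() {
  box_drop_1d(-2, 5, 0);   // e.g. owned cells 0..3 plus 2 ghost planes each side
  return 0;
}

The same split is applied independently in x, y, and z, which is why one ghost region can produce several Overlap entries, including overlaps with the owning proc itself in another periodic image.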
void GridComm::forward_comm_kspace(KSpace *kspace, int nper, int nbyte, int which, - void *buf1, void *buf2, MPI_Datatype datatype) + void *buf1, void *buf2, MPI_Datatype datatype) { if (layout == REGULAR) forward_comm_kspace_regular(kspace,nper,nbyte,which,buf1,buf2,datatype); @@ -940,7 +940,7 @@ void GridComm::forward_comm_kspace(KSpace *kspace, int nper, int nbyte, int whic void GridComm:: forward_comm_kspace_regular(KSpace *kspace, int nper, int /*nbyte*/, int which, - void *buf1, void *buf2, MPI_Datatype datatype) + void *buf1, void *buf2, MPI_Datatype datatype) { int m; MPI_Request request; @@ -953,9 +953,9 @@ forward_comm_kspace_regular(KSpace *kspace, int nper, int /*nbyte*/, int which, if (swap[m].sendproc != me) { if (swap[m].nunpack) MPI_Irecv(buf2,nper*swap[m].nunpack,datatype, - swap[m].recvproc,0,gridcomm,&request); + swap[m].recvproc,0,gridcomm,&request); if (swap[m].npack) MPI_Send(buf1,nper*swap[m].npack,datatype, - swap[m].sendproc,0,gridcomm); + swap[m].sendproc,0,gridcomm); if (swap[m].nunpack) MPI_Wait(&request,MPI_STATUS_IGNORE); } @@ -969,18 +969,18 @@ forward_comm_kspace_regular(KSpace *kspace, int nper, int /*nbyte*/, int which, void GridComm:: forward_comm_kspace_tiled(KSpace *kspace, int nper, int nbyte, int which, - void *buf1, void *vbuf2, MPI_Datatype datatype) + void *buf1, void *vbuf2, MPI_Datatype datatype) { int i,m,offset; char *buf2 = (char *) vbuf2; - + // post all receives - + for (m = 0; m < nrecv; m++) { offset = nper * recv[m].offset * nbyte; MPI_Irecv((void *) &buf2[offset],nper*recv[m].nunpack,datatype, - recv[m].proc,0,gridcomm,&requests[m]); + recv[m].proc,0,gridcomm,&requests[m]); } // perform all sends to other procs @@ -998,12 +998,12 @@ forward_comm_kspace_tiled(KSpace *kspace, int nper, int nbyte, int which, } // unpack all received data - + for (i = 0; i < nrecv; i++) { MPI_Waitany(nrecv,requests,&m,MPI_STATUS_IGNORE); offset = nper * recv[m].offset * nbyte; kspace->unpack_forward_grid(which,(void *) &buf2[offset], - recv[m].nunpack,recv[m].unpacklist); + recv[m].nunpack,recv[m].unpacklist); } } @@ -1012,7 +1012,7 @@ forward_comm_kspace_tiled(KSpace *kspace, int nper, int nbyte, int which, ------------------------------------------------------------------------- */ void GridComm::reverse_comm_kspace(KSpace *kspace, int nper, int nbyte, int which, - void *buf1, void *buf2, MPI_Datatype datatype) + void *buf1, void *buf2, MPI_Datatype datatype) { if (layout == REGULAR) reverse_comm_kspace_regular(kspace,nper,nbyte,which,buf1,buf2,datatype); @@ -1026,7 +1026,7 @@ void GridComm::reverse_comm_kspace(KSpace *kspace, int nper, int nbyte, int whic void GridComm:: reverse_comm_kspace_regular(KSpace *kspace, int nper, int /*nbyte*/, int which, - void *buf1, void *buf2, MPI_Datatype datatype) + void *buf1, void *buf2, MPI_Datatype datatype) { int m; MPI_Request request; @@ -1039,9 +1039,9 @@ reverse_comm_kspace_regular(KSpace *kspace, int nper, int /*nbyte*/, int which, if (swap[m].recvproc != me) { if (swap[m].npack) MPI_Irecv(buf2,nper*swap[m].npack,datatype, - swap[m].sendproc,0,gridcomm,&request); + swap[m].sendproc,0,gridcomm,&request); if (swap[m].nunpack) MPI_Send(buf1,nper*swap[m].nunpack,datatype, - swap[m].recvproc,0,gridcomm); + swap[m].recvproc,0,gridcomm); if (swap[m].npack) MPI_Wait(&request,MPI_STATUS_IGNORE); } @@ -1055,18 +1055,18 @@ reverse_comm_kspace_regular(KSpace *kspace, int nper, int /*nbyte*/, int which, void GridComm:: reverse_comm_kspace_tiled(KSpace *kspace, int nper, int nbyte, int which, - void *buf1, void *vbuf2, 
MPI_Datatype datatype) + void *buf1, void *vbuf2, MPI_Datatype datatype) { int i,m,offset; char *buf2 = (char *) vbuf2; // post all receives - + for (m = 0; m < nsend; m++) { offset = nper * send[m].offset * nbyte; MPI_Irecv((void *) &buf2[offset],nper*send[m].npack,datatype, - send[m].proc,0,gridcomm,&requests[m]); + send[m].proc,0,gridcomm,&requests[m]); } // perform all sends to other procs @@ -1084,12 +1084,12 @@ reverse_comm_kspace_tiled(KSpace *kspace, int nper, int nbyte, int which, } // unpack all received data - + for (i = 0; i < nsend; i++) { MPI_Waitany(nsend,requests,&m,MPI_STATUS_IGNORE); offset = nper * send[m].offset * nbyte; kspace->unpack_reverse_grid(which,(void *) &buf2[offset], - send[m].npack,send[m].packlist); + send[m].npack,send[m].packlist); } } @@ -1125,7 +1125,7 @@ void GridComm::grow_overlap() /* ---------------------------------------------------------------------- create 1d list of offsets into 3d array section (xlo:xhi,ylo:yhi,zlo:zhi) - assume 3d array is allocated as + assume 3d array is allocated as (fullxlo:fullxhi,fullylo:fullyhi,fullzlo:fullzhi) ------------------------------------------------------------------------- */ diff --git a/src/KSPACE/gridcomm.h b/src/KSPACE/gridcomm.h index 941b84651f..97c914999f 100644 --- a/src/KSPACE/gridcomm.h +++ b/src/KSPACE/gridcomm.h @@ -21,12 +21,12 @@ namespace LAMMPS_NS { class GridComm : protected Pointers { public: GridComm(class LAMMPS *, MPI_Comm, int, int, int, - int, int, int, int, int, int, - int, int, int, int, int, int); + int, int, int, int, int, int, + int, int, int, int, int, int); GridComm(class LAMMPS *, MPI_Comm, int, int, int, int, - int, int, int, int, int, int, - int, int, int, int, int, int, - int, int, int, int, int, int); + int, int, int, int, int, int, + int, int, int, int, int, int, + int, int, int, int, int, int); virtual ~GridComm(); void setup(int &, int &); int ghost_adjacent(); @@ -46,7 +46,7 @@ class GridComm : protected Pointers { int nx,ny,nz; // size of global grid in all 3 dims int inxlo,inxhi; // inclusive extent of my grid chunk int inylo,inyhi; // 0 <= in <= N-1 - int inzlo,inzhi; + int inzlo,inzhi; int outxlo,outxhi; // inclusive extent of my grid chunk plus int outylo,outyhi; // ghost cells in all 6 directions int outzlo,outzhi; // lo indices can be < 0, hi indices can be >= N @@ -61,13 +61,13 @@ class GridComm : protected Pointers { int procxlo,procxhi; // 6 neighbor procs that adjoin me int procylo,procyhi; // not used for comm_style = tiled int proczlo,proczhi; - + int ghostxlo,ghostxhi; // # of my owned grid planes needed int ghostylo,ghostyhi; // by neighobr procs in each dir as their ghost planes int ghostzlo,ghostzhi; // swap = exchange of owned and ghost grid cells between 2 procs, including self - + struct Swap { int sendproc; // proc to send to for forward comm int recvproc; // proc to recv from for forward comm @@ -89,17 +89,17 @@ class GridComm : protected Pointers { // RCB tree of cut info // each proc contributes one value, except proc 0 - + struct RCBinfo { int dim; // 0,1,2 = which dim the cut is in int cut; // grid index of lowest cell in upper half of cut }; RCBinfo *rcbinfo; - + // overlap = a proc whose owned cells overlap with my extended ghost box // includes overlaps across periodic boundaries, can also be self - + struct Overlap { int proc; // proc whose owned cells overlap my ghost cells int box[6]; // box that overlaps otherproc's owned cells @@ -110,9 +110,9 @@ class GridComm : protected Pointers { int noverlap,maxoverlap; Overlap *overlap; - + // 
request = sent to each proc whose owned cells overlap my ghost cells - + struct Request { int sender; // sending proc int index; // index of overlap on sender @@ -121,9 +121,9 @@ class GridComm : protected Pointers { }; Request *srequest,*rrequest; - + // response = reply from each proc whose owned cells overlap my ghost cells - + struct Response { int index; // index of my overlap for the initial request int box[6]; // box that overlaps responder's owned cells @@ -132,7 +132,7 @@ class GridComm : protected Pointers { }; Response *sresponse,*rresponse; - + // send = proc to send a subset of my owned cells to, for forward comm // for reverse comm, proc I receive ghost overlaps with my owned cells from // offset used in reverse comm to recv a message in middle of a large buffer @@ -147,7 +147,7 @@ class GridComm : protected Pointers { // recv = proc to recv a subset of my ghost cells from, for forward comm // for reverse comm, proc I send a subset of my ghost cells to // offset used in forward comm to recv a message in middle of a large buffer - + struct Recv { int proc; int nunpack; @@ -159,7 +159,7 @@ class GridComm : protected Pointers { // copy = subset of my owned cells to copy into subset of my ghost cells // that describes forward comm, for reverse comm it is the opposite - + struct Copy { int npack; int nunpack; @@ -177,18 +177,18 @@ class GridComm : protected Pointers { // ------------------------------------------- void initialize(MPI_Comm, int, int, int, - int, int, int, int, int, int, - int, int, int, int, int, int, - int, int, int, int, int, int, - int, int, int, int, int, int); + int, int, int, int, int, int, + int, int, int, int, int, int, + int, int, int, int, int, int, + int, int, int, int, int, int); virtual void setup_regular(int &, int &); virtual void setup_tiled(int &, int &); void ghost_box_drop(int *, int *); void box_drop_grid(int *, int, int, int &, int *); - + int ghost_adjacent_regular(); int ghost_adjacent_tiled(); - + void forward_comm_kspace_regular(class KSpace *, int, int, int, void *, void *, MPI_Datatype); void forward_comm_kspace_tiled(class KSpace *, int, int, int, @@ -200,7 +200,7 @@ class GridComm : protected Pointers { virtual void grow_swap(); void grow_overlap(); - + int indices(int *&, int, int, int, int, int, int); }; diff --git a/src/KSPACE/msm.cpp b/src/KSPACE/msm.cpp index f2c3f6c820..973302a054 100644 --- a/src/KSPACE/msm.cpp +++ b/src/KSPACE/msm.cpp @@ -467,7 +467,7 @@ void MSM::compute(int eflag, int vflag) current_level = 0; gcall->reverse_comm_kspace(this,1,sizeof(double),REVERSE_RHO, - gcall_buf1,gcall_buf2,MPI_DOUBLE); + gcall_buf1,gcall_buf2,MPI_DOUBLE); // forward communicate charge density values to fill ghost grid points // compute direct sum interaction and then restrict to coarser grid @@ -476,7 +476,7 @@ void MSM::compute(int eflag, int vflag) if (!active_flag[n]) continue; current_level = n; gc[n]->forward_comm_kspace(this,1,sizeof(double),FORWARD_RHO, - gc_buf1[n],gc_buf2[n],MPI_DOUBLE); + gc_buf1[n],gc_buf2[n],MPI_DOUBLE); direct(n); restriction(n); } @@ -488,17 +488,17 @@ void MSM::compute(int eflag, int vflag) if (domain->nonperiodic) { current_level = levels-1; gc[levels-1]-> - forward_comm_kspace(this,1,sizeof(double),FORWARD_RHO, - gc_buf1[levels-1],gc_buf2[levels-1],MPI_DOUBLE); + forward_comm_kspace(this,1,sizeof(double),FORWARD_RHO, + gc_buf1[levels-1],gc_buf2[levels-1],MPI_DOUBLE); direct_top(levels-1); gc[levels-1]-> - reverse_comm_kspace(this,1,sizeof(double),REVERSE_AD, - 
gc_buf1[levels-1],gc_buf2[levels-1],MPI_DOUBLE); + reverse_comm_kspace(this,1,sizeof(double),REVERSE_AD, + gc_buf1[levels-1],gc_buf2[levels-1],MPI_DOUBLE); if (vflag_atom) - gc[levels-1]-> - reverse_comm_kspace(this,6,sizeof(double),REVERSE_AD_PERATOM, - gc_buf1[levels-1],gc_buf2[levels-1],MPI_DOUBLE); - + gc[levels-1]-> + reverse_comm_kspace(this,6,sizeof(double),REVERSE_AD_PERATOM, + gc_buf1[levels-1],gc_buf2[levels-1],MPI_DOUBLE); + } else { // Here using MPI_Allreduce is cheaper than using commgrid grid_swap_forward(levels-1,qgrid[levels-1]); @@ -506,9 +506,9 @@ void MSM::compute(int eflag, int vflag) grid_swap_reverse(levels-1,egrid[levels-1]); current_level = levels-1; if (vflag_atom) - gc[levels-1]-> - reverse_comm_kspace(this,6,sizeof(double),REVERSE_AD_PERATOM, - gc_buf1[levels-1],gc_buf2[levels-1],MPI_DOUBLE); + gc[levels-1]-> + reverse_comm_kspace(this,6,sizeof(double),REVERSE_AD_PERATOM, + gc_buf1[levels-1],gc_buf2[levels-1],MPI_DOUBLE); } } @@ -521,13 +521,13 @@ void MSM::compute(int eflag, int vflag) current_level = n; gc[n]->reverse_comm_kspace(this,1,sizeof(double),REVERSE_AD, - gc_buf1[n],gc_buf2[n],MPI_DOUBLE); + gc_buf1[n],gc_buf2[n],MPI_DOUBLE); // extra per-atom virial communication if (vflag_atom) gc[n]->reverse_comm_kspace(this,6,sizeof(double),REVERSE_AD_PERATOM, - gc_buf1[n],gc_buf2[n],MPI_DOUBLE); + gc_buf1[n],gc_buf2[n],MPI_DOUBLE); } // all procs communicate E-field values @@ -535,13 +535,13 @@ void MSM::compute(int eflag, int vflag) current_level = 0; gcall->forward_comm_kspace(this,1,sizeof(double),FORWARD_AD, - gcall_buf1,gcall_buf2,MPI_DOUBLE); + gcall_buf1,gcall_buf2,MPI_DOUBLE); // extra per-atom energy/virial communication if (vflag_atom) gcall->forward_comm_kspace(this,6,sizeof(double),FORWARD_AD_PERATOM, - gcall_buf1,gcall_buf2,MPI_DOUBLE); + gcall_buf1,gcall_buf2,MPI_DOUBLE); // calculate the force on my particles (interpolation) @@ -618,12 +618,12 @@ void MSM::allocate() // commgrid using all processors for finest grid level gcall = new GridComm(lmp,world,1,nx_msm[0],ny_msm[0],nz_msm[0], - nxlo_in[0],nxhi_in[0],nylo_in[0], - nyhi_in[0],nzlo_in[0],nzhi_in[0], - nxlo_out_all,nxhi_out_all,nylo_out_all, - nyhi_out_all,nzlo_out_all,nzhi_out_all, - nxlo_out[0],nxhi_out[0],nylo_out[0], - nyhi_out[0],nzlo_out[0],nzhi_out[0]); + nxlo_in[0],nxhi_in[0],nylo_in[0], + nyhi_in[0],nzlo_in[0],nzhi_in[0], + nxlo_out_all,nxhi_out_all,nylo_out_all, + nyhi_out_all,nzlo_out_all,nzhi_out_all, + nxlo_out[0],nxhi_out[0],nylo_out[0], + nyhi_out[0],nzlo_out[0],nzhi_out[0]); gcall->setup(ngcall_buf1,ngcall_buf2); npergrid = 1; @@ -644,12 +644,12 @@ void MSM::allocate() if (active_flag[n]) { int **procneigh = procneigh_levels[n]; gc[n] = new GridComm(lmp,world_levels[n],2,nx_msm[n],ny_msm[n],nz_msm[n], - nxlo_in[n],nxhi_in[n],nylo_in[n],nyhi_in[n], - nzlo_in[n],nzhi_in[n], - nxlo_out[n],nxhi_out[n],nylo_out[n],nyhi_out[n], - nzlo_out[n],nzhi_out[n], - procneigh[0][0],procneigh[0][1],procneigh[1][0], - procneigh[1][1],procneigh[2][0],procneigh[2][1]); + nxlo_in[n],nxhi_in[n],nylo_in[n],nyhi_in[n], + nzlo_in[n],nzhi_in[n], + nxlo_out[n],nxhi_out[n],nylo_out[n],nyhi_out[n], + nzlo_out[n],nzhi_out[n], + procneigh[0][0],procneigh[0][1],procneigh[1][0], + procneigh[1][1],procneigh[2][0],procneigh[2][1]); gc[n]->setup(ngc_buf1[n],ngc_buf2[n]); npergrid = 1; @@ -677,7 +677,7 @@ void MSM::deallocate() memory->destroy(gcall_buf2); gcall = nullptr; gcall_buf1 = gcall_buf2 = nullptr; - + for (int n=0; ndestroy3d_offset(qgrid[n],nzlo_out[n],nylo_out[n],nxlo_out[n]); @@ -692,10 +692,10 @@ 
void MSM::deallocate() if (gc) { if (gc[n]) { delete gc[n]; - memory->destroy(gc_buf1[n]); - memory->destroy(gc_buf2[n]); + memory->destroy(gc_buf1[n]); + memory->destroy(gc_buf2[n]); gc[n] = nullptr; - gc_buf1[n] = gc_buf2[n] = nullptr; + gc_buf1[n] = gc_buf2[n] = nullptr; } } } @@ -776,13 +776,13 @@ void MSM::deallocate_peratom() void MSM::allocate_levels() { ngrid = new int[levels]; - + gc = new GridComm*[levels]; gc_buf1 = new double*[levels]; gc_buf2 = new double*[levels]; ngc_buf1 = new int[levels]; ngc_buf2 = new int[levels]; - + memory->create(procneigh_levels,levels,3,2,"msm:procneigh_levels"); world_levels = new MPI_Comm[levels]; active_flag = new int[levels]; @@ -2546,7 +2546,7 @@ void MSM::grid_swap_reverse(int n, double*** &gridn) void MSM::pack_forward_grid(int flag, void *vbuf, int nlist, int *list) { double *buf = (double *) vbuf; - + int n = current_level; int k = 0; @@ -3432,7 +3432,7 @@ double MSM::memory_usage() double bytes = 0; // NOTE: Stan, fill in other memory allocations here - + // all GridComm bufs bytes += (ngcall_buf1 + ngcall_buf2) * npergrid * sizeof(double); diff --git a/src/KSPACE/msm.h b/src/KSPACE/msm.h index 12b0cbf309..a239e6f139 100644 --- a/src/KSPACE/msm.h +++ b/src/KSPACE/msm.h @@ -80,7 +80,7 @@ class MSM : public KSpace { int procgrid[3]; // procs assigned in each dim of 3d grid int myloc[3]; // which proc I am in each dim int ***procneigh_levels; // my 6 neighboring procs, 0/1 = left/right - + class GridComm *gcall; // GridComm class for finest level grid class GridComm **gc; // GridComm classes for each hierarchical level @@ -133,7 +133,7 @@ class MSM : public KSpace { void get_virial_direct_top(int); // grid communication - + void pack_forward_grid(int, void *, int, int *); void unpack_forward_grid(int, void *, int, int *); void pack_reverse_grid(int, void *, int, int *); diff --git a/src/KSPACE/pppm.cpp b/src/KSPACE/pppm.cpp index 84b5d9ecb0..e399727001 100644 --- a/src/KSPACE/pppm.cpp +++ b/src/KSPACE/pppm.cpp @@ -309,8 +309,8 @@ void PPPM::init() if (overlap_allowed) break; gctmp = new GridComm(lmp,world,nx_pppm,ny_pppm,nz_pppm, - nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, - nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out); + nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, + nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out); int tmp1,tmp2; gctmp->setup(tmp1,tmp2); @@ -641,7 +641,7 @@ void PPPM::compute(int eflag, int vflag) // remap from 3d decomposition to FFT decomposition gc->reverse_comm_kspace(this,1,sizeof(FFT_SCALAR),REVERSE_RHO, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc_buf1,gc_buf2,MPI_FFT_SCALAR); brick2fft(); // compute potential gradient on my FFT grid and @@ -656,20 +656,20 @@ void PPPM::compute(int eflag, int vflag) if (differentiation_flag == 1) gc->forward_comm_kspace(this,1,sizeof(FFT_SCALAR),FORWARD_AD, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc_buf1,gc_buf2,MPI_FFT_SCALAR); else gc->forward_comm_kspace(this,3,sizeof(FFT_SCALAR),FORWARD_IK, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc_buf1,gc_buf2,MPI_FFT_SCALAR); // extra per-atom energy/virial communication if (evflag_atom) { if (differentiation_flag == 1 && vflag_atom) gc->forward_comm_kspace(this,6,sizeof(FFT_SCALAR),FORWARD_AD_PERATOM, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc_buf1,gc_buf2,MPI_FFT_SCALAR); else if (differentiation_flag == 0) gc->forward_comm_kspace(this,7,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc_buf1,gc_buf2,MPI_FFT_SCALAR); } // calculate the force on my particles @@ -817,10 +817,10 @@ void PPPM::allocate() 
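The gc_buf1/gc_buf2 allocations in the PPPM and MSM hunks follow the sizing that setup()/setup_tiled() computes above: nbuf1 is the largest single pack or unpack over any Send, Recv, or Copy, nbuf2 is the larger of the summed packs and the summed unpacks, and the caller scales both by npergrid, the number of values communicated per grid point (for example 3 for FORWARD_IK, 1 for FORWARD_AD). A small self-contained sketch of that sizing rule, using hypothetical message counts:

#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  // hypothetical pack/unpack counts (grid cells per message) for one proc;
  // self-copies would also enter the nbuf1 maximum but are omitted here
  std::vector<int> npack   = {120, 80, 200};   // cells packed for each Send
  std::vector<int> nunpack = {150, 90};        // cells unpacked from each Recv
  int npergrid = 3;                            // e.g. 3 values per cell for FORWARD_IK

  int nbuf1 = 0, sum_pack = 0, sum_unpack = 0;
  for (int n : npack)   { nbuf1 = std::max(nbuf1, n); sum_pack   += n; }
  for (int n : nunpack) { nbuf1 = std::max(nbuf1, n); sum_unpack += n; }
  int nbuf2 = std::max(sum_pack, sum_unpack);

  // the caller then allocates npergrid*nbuf1 and npergrid*nbuf2 values,
  // analogous to the gc_buf1/gc_buf2 creation in PPPM::allocate()
  std::printf("buf1 = %d values, buf2 = %d values\n",
              npergrid * nbuf1, npergrid * nbuf2);
  return 0;
}

Sized this way once, the same two buffers are reused by every forward and reverse call issued from compute(), as the calls in the surrounding hunks show.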
// create ghost grid object for rho and electric field communication // also create 2 bufs for ghost grid cell comm, passed to GridComm methods - + gc = new GridComm(lmp,world,nx_pppm,ny_pppm,nz_pppm, - nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, - nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out); + nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, + nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out); gc->setup(ngc_buf1,ngc_buf2); @@ -988,7 +988,7 @@ void PPPM::set_grid_global() while (1) { // set grid dimensions - + nx_pppm = static_cast (xprd/h_x); ny_pppm = static_cast (yprd/h_y); nz_pppm = static_cast (zprd_slab/h_z); @@ -997,14 +997,14 @@ void PPPM::set_grid_global() if (ny_pppm <= 1) ny_pppm = 2; if (nz_pppm <= 1) nz_pppm = 2; - // estimate Kspace force error - + // estimate Kspace force error + double df_kspace = compute_df_kspace(); // break loop if the accuracy has been reached or // too many loops have been performed - count++; + count++; if (df_kspace <= accuracy) break; if (count > 500) error->all(FLERR, "Could not compute grid size"); @@ -1155,10 +1155,10 @@ double PPPM::compute_qopt() // loop over entire FFT grid // each proc calculates contributions from every Pth grid point - + bigint ngridtotal = (bigint) nx_pppm * ny_pppm * nz_pppm; int nxy_pppm = nx_pppm * ny_pppm; - + double qopt = 0.0; for (bigint i = me; i < ngridtotal; i += nprocs) { @@ -1174,46 +1174,46 @@ double PPPM::compute_qopt() if (sqk == 0.0) continue; sum1 = sum2 = sum3 = sum4 = 0.0; - + for (nx = -2; nx <= 2; nx++) { qx = unitkx*(kper+nx_pppm*nx); sx = exp(-0.25*square(qx/g_ewald)); argx = 0.5*qx*xprd/nx_pppm; wx = powsinxx(argx,twoorder); qx *= qx; - + for (ny = -2; ny <= 2; ny++) { - qy = unitky*(lper+ny_pppm*ny); - sy = exp(-0.25*square(qy/g_ewald)); - argy = 0.5*qy*yprd/ny_pppm; - wy = powsinxx(argy,twoorder); - qy *= qy; - - for (nz = -2; nz <= 2; nz++) { - qz = unitkz*(mper+nz_pppm*nz); - sz = exp(-0.25*square(qz/g_ewald)); - argz = 0.5*qz*zprd_slab/nz_pppm; - wz = powsinxx(argz,twoorder); - qz *= qz; - - dot2 = qx+qy+qz; - u1 = sx*sy*sz; - u2 = wx*wy*wz; - - sum1 += u1*u1/dot2*MY_4PI*MY_4PI; - sum2 += u1 * u2 * MY_4PI; - sum3 += u2; - sum4 += dot2*u2; - } + qy = unitky*(lper+ny_pppm*ny); + sy = exp(-0.25*square(qy/g_ewald)); + argy = 0.5*qy*yprd/ny_pppm; + wy = powsinxx(argy,twoorder); + qy *= qy; + + for (nz = -2; nz <= 2; nz++) { + qz = unitkz*(mper+nz_pppm*nz); + sz = exp(-0.25*square(qz/g_ewald)); + argz = 0.5*qz*zprd_slab/nz_pppm; + wz = powsinxx(argz,twoorder); + qz *= qz; + + dot2 = qx+qy+qz; + u1 = sx*sy*sz; + u2 = wx*wy*wz; + + sum1 += u1*u1/dot2*MY_4PI*MY_4PI; + sum2 += u1 * u2 * MY_4PI; + sum3 += u2; + sum4 += dot2*u2; + } } } - + sum2 *= sum2; qopt += sum1 - sum2/(sum3*sum4); } // sum qopt over all procs - + double qopt_all; MPI_Allreduce(&qopt,&qopt_all,1,MPI_DOUBLE,MPI_SUM,world); return qopt_all; @@ -1327,7 +1327,7 @@ void PPPM::set_grid_local() // global PPPM grid that I own without ghost cells // for slab PPPM, assign z grid as if it were not extended // both non-tiled and tiled proc layouts use 0-1 fractional sumdomain info - + if (comm->layout != Comm::LAYOUT_TILED) { nxlo_in = static_cast (comm->xsplit[comm->myloc[0]] * nx_pppm); nxhi_in = static_cast (comm->xsplit[comm->myloc[0]+1] * nx_pppm) - 1; @@ -1339,7 +1339,7 @@ void PPPM::set_grid_local() (comm->zsplit[comm->myloc[2]] * nz_pppm/slab_volfactor); nzhi_in = static_cast (comm->zsplit[comm->myloc[2]+1] * nz_pppm/slab_volfactor) - 1; - + } else { nxlo_in = static_cast (comm->mysplit[0][0] * nx_pppm); nxhi_in = 
static_cast (comm->mysplit[0][1] * nx_pppm) - 1; @@ -1437,7 +1437,7 @@ void PPPM::set_grid_local() // this is accomplished by nzhi_in = nzhi_out on +z end (no ghost cells) // also insure no other procs use ghost cells beyond +z limit // differnet logic for non-tiled vs tiled decomposition - + if (slabflag == 1) { if (comm->layout != Comm::LAYOUT_TILED) { if (comm->myloc[2] == comm->procgrid[2]-1) nzhi_in = nzhi_out = nz_pppm - 1; @@ -2634,7 +2634,7 @@ void PPPM::fieldforce_peratom() void PPPM::pack_forward_grid(int flag, void *vbuf, int nlist, int *list) { FFT_SCALAR *buf = (FFT_SCALAR *) vbuf; - + int n = 0; if (flag == FORWARD_IK) { @@ -2754,7 +2754,7 @@ void PPPM::unpack_forward_grid(int flag, void *vbuf, int nlist, int *list) void PPPM::pack_reverse_grid(int flag, void *vbuf, int nlist, int *list) { FFT_SCALAR *buf = (FFT_SCALAR *) vbuf; - + if (flag == REVERSE_RHO) { FFT_SCALAR *src = &density_brick[nzlo_out][nylo_out][nxlo_out]; for (int i = 0; i < nlist; i++) @@ -3083,7 +3083,7 @@ double PPPM::memory_usage() // two GridComm bufs bytes += (ngc_buf1 + ngc_buf2) * npergrid * sizeof(FFT_SCALAR); - + return bytes; } @@ -3141,7 +3141,7 @@ void PPPM::compute_group_group(int groupbit_A, int groupbit_B, int AA_flag) density_fft = density_A_fft; gc->reverse_comm_kspace(this,1,sizeof(FFT_SCALAR),REVERSE_RHO, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc_buf1,gc_buf2,MPI_FFT_SCALAR); brick2fft(); // group B @@ -3150,7 +3150,7 @@ void PPPM::compute_group_group(int groupbit_A, int groupbit_B, int AA_flag) density_fft = density_B_fft; gc->reverse_comm_kspace(this,1,sizeof(FFT_SCALAR),REVERSE_RHO, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc_buf1,gc_buf2,MPI_FFT_SCALAR); brick2fft(); // switch back pointers diff --git a/src/KSPACE/pppm.h b/src/KSPACE/pppm.h index b7416a0a9c..7451af47b4 100644 --- a/src/KSPACE/pppm.h +++ b/src/KSPACE/pppm.h @@ -101,7 +101,7 @@ class PPPM : public KSpace { class FFT3d *fft1,*fft2; class Remap *remap; class GridComm *gc; - + FFT_SCALAR *gc_buf1,*gc_buf2; int ngc_buf1,ngc_buf2,npergrid; diff --git a/src/KSPACE/pppm_cg.cpp b/src/KSPACE/pppm_cg.cpp index 081113ea0d..392d19336a 100644 --- a/src/KSPACE/pppm_cg.cpp +++ b/src/KSPACE/pppm_cg.cpp @@ -91,7 +91,7 @@ void PPPMCG::compute(int eflag, int vflag) ev_init(eflag,vflag); if (evflag_atom && !peratom_allocate_flag) allocate_peratom(); - + // if atom count has changed, update qsum and qsqsum if (atom->natoms != natoms_original) { @@ -158,7 +158,7 @@ void PPPMCG::compute(int eflag, int vflag) } // only need to rebuild this list after a neighbor list update - + if (neighbor->ago == 0) { num_charged = 0; for (int i = 0; i < atom->nlocal; ++i) { @@ -180,7 +180,7 @@ void PPPMCG::compute(int eflag, int vflag) // remap from 3d decomposition to FFT decomposition gc->reverse_comm_kspace(this,1,sizeof(FFT_SCALAR),REVERSE_RHO, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc_buf1,gc_buf2,MPI_FFT_SCALAR); brick2fft(); // compute potential gradient on my FFT grid and @@ -195,20 +195,20 @@ void PPPMCG::compute(int eflag, int vflag) if (differentiation_flag == 1) gc->forward_comm_kspace(this,1,sizeof(FFT_SCALAR),FORWARD_AD, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc_buf1,gc_buf2,MPI_FFT_SCALAR); else gc->forward_comm_kspace(this,3,sizeof(FFT_SCALAR),FORWARD_IK, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc_buf1,gc_buf2,MPI_FFT_SCALAR); // extra per-atom energy/virial communication if (evflag_atom) { if (differentiation_flag == 1 && vflag_atom) gc->forward_comm_kspace(this,6,sizeof(FFT_SCALAR),FORWARD_AD_PERATOM, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + 
gc_buf1,gc_buf2,MPI_FFT_SCALAR); else if (differentiation_flag == 0) gc->forward_comm_kspace(this,7,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc_buf1,gc_buf2,MPI_FFT_SCALAR); } // calculate the force on my particles diff --git a/src/KSPACE/pppm_dipole.cpp b/src/KSPACE/pppm_dipole.cpp index 312da6c304..1a87c81bee 100644 --- a/src/KSPACE/pppm_dipole.cpp +++ b/src/KSPACE/pppm_dipole.cpp @@ -444,9 +444,9 @@ void PPPMDipole::compute(int eflag, int vflag) // all procs communicate density values from their ghost cells // to fully sum contribution in their 3d bricks // remap from 3d decomposition to FFT decomposition - + gc_dipole->reverse_comm_kspace(this,3,sizeof(FFT_SCALAR),REVERSE_MU, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc_buf1,gc_buf2,MPI_FFT_SCALAR); brick2fft_dipole(); // compute potential gradient on my FFT grid and @@ -460,13 +460,13 @@ void PPPMDipole::compute(int eflag, int vflag) // to fill ghost cells surrounding their 3d bricks gc_dipole->forward_comm_kspace(this,9,sizeof(FFT_SCALAR),FORWARD_MU, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc_buf1,gc_buf2,MPI_FFT_SCALAR); // extra per-atom energy/virial communication if (evflag_atom) gc->forward_comm_kspace(this,18,sizeof(FFT_SCALAR),FORWARD_MU_PERATOM, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc_buf1,gc_buf2,MPI_FFT_SCALAR); // calculate the force on my particles @@ -510,7 +510,7 @@ void PPPMDipole::compute(int eflag, int vflag) for (i = 0; i < nlocal; i++) { eatom[i] *= 0.5; eatom[i] -= (mu[i][0]*mu[i][0] + mu[i][1]*mu[i][1] + - mu[i][2]*mu[i][2])*2.0*g3/3.0/MY_PIS; + mu[i][2]*mu[i][2])*2.0*g3/3.0/MY_PIS; eatom[i] *= qscale; } } @@ -2483,7 +2483,7 @@ int PPPMDipole::timing_3d(int n, double &time3d) double PPPMDipole::memory_usage() { double bytes = nmax*3 * sizeof(double); - + int nbrick = (nxhi_out-nxlo_out+1) * (nyhi_out-nylo_out+1) * (nzhi_out-nzlo_out+1); bytes += 6 * nfft_both * sizeof(double); // vg diff --git a/src/KSPACE/pppm_dipole.h b/src/KSPACE/pppm_dipole.h index f7a8b63930..eb49361842 100644 --- a/src/KSPACE/pppm_dipole.h +++ b/src/KSPACE/pppm_dipole.h @@ -74,7 +74,7 @@ class PPPMDipole : public PPPM { int only_dipole_flag; double musum,musqsum,mu2; - + double find_gewald_dipole(double, double, bigint, double, double); double newton_raphson_f_dipole(double, double, bigint, double, double); double derivf_dipole(double, double, bigint, double, double); diff --git a/src/KSPACE/pppm_dipole_spin.cpp b/src/KSPACE/pppm_dipole_spin.cpp index c8cebdfeef..4e33c11793 100644 --- a/src/KSPACE/pppm_dipole_spin.cpp +++ b/src/KSPACE/pppm_dipole_spin.cpp @@ -302,7 +302,7 @@ void PPPMDipoleSpin::compute(int eflag, int vflag) // remap from 3d decomposition to FFT decomposition gc_dipole->reverse_comm_kspace(this,3,sizeof(FFT_SCALAR),REVERSE_MU, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc_buf1,gc_buf2,MPI_FFT_SCALAR); brick2fft_dipole(); // compute potential gradient on my FFT grid and @@ -316,13 +316,13 @@ void PPPMDipoleSpin::compute(int eflag, int vflag) // to fill ghost cells surrounding their 3d bricks gc_dipole->forward_comm_kspace(this,9,sizeof(FFT_SCALAR),FORWARD_MU, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc_buf1,gc_buf2,MPI_FFT_SCALAR); // extra per-atom energy/virial communication if (evflag_atom) gc->forward_comm_kspace(this,18,sizeof(FFT_SCALAR),FORWARD_MU_PERATOM, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc_buf1,gc_buf2,MPI_FFT_SCALAR); // calculate the force on my particles diff --git a/src/KSPACE/pppm_disp.cpp b/src/KSPACE/pppm_disp.cpp index 313f41d2cc..8f141bb695 100644 --- a/src/KSPACE/pppm_disp.cpp +++ 
b/src/KSPACE/pppm_disp.cpp @@ -410,7 +410,7 @@ void PPPMDisp::init() int iteration = 0; if (function[0]) { - + GridComm *gctmp = NULL; while (order >= minorder) { @@ -455,7 +455,7 @@ void PPPMDisp::init() error->all(FLERR,"Coulomb PPPMDisp order has been reduced below minorder"); if (!overlap_allowed && !gctmp->ghost_adjacent()) error->all(FLERR,"PPPMDisp grid stencil extends " - "beyond nearest neighbor processor"); + "beyond nearest neighbor processor"); if (gctmp) delete gctmp; // adjust g_ewald @@ -491,7 +491,7 @@ void PPPMDisp::init() iteration = 0; if (function[1] + function[2] + function[3]) { - + GridComm *gctmp = NULL; while (order_6 >= minorder) { @@ -523,7 +523,7 @@ void PPPMDisp::init() nzlo_in_6,nzhi_in_6, nxlo_out_6,nxhi_out_6,nylo_out_6,nyhi_out_6, nzlo_out_6,nzhi_out_6); - + int tmp1,tmp2; gctmp->setup(tmp1,tmp2); if (gctmp->ghost_adjacent()) break; @@ -537,7 +537,7 @@ void PPPMDisp::init() "reduced below minorder"); if (!overlap_allowed && !gctmp->ghost_adjacent()) error->all(FLERR,"Dispersion PPPMDisp grid stencil extends " - "beyond nearest neighbor processor"); + "beyond nearest neighbor processor"); if (gctmp) delete gctmp; // adjust g_ewald_6 @@ -638,7 +638,7 @@ void PPPMDisp::setup() double unitkz = (2.0*MY_PI/zprd_slab); //compute the virial coefficients and green functions - + if (function[0]) { delxinv = nx_pppm/xprd; @@ -874,7 +874,7 @@ void PPPMDisp::setup_grid() void PPPMDisp::compute(int eflag, int vflag) { int i; - + // set energy/virial flags // invoke allocate_peratom() if needed for first time @@ -889,7 +889,7 @@ void PPPMDisp::compute(int eflag, int vflag) boxlo = domain->boxlo_lamda; domain->x2lamda(atom->nlocal); } - + // extend size of per-atom arrays if necessary if (atom->nmax > nmax) { @@ -918,12 +918,12 @@ void PPPMDisp::compute(int eflag, int vflag) // perform calculations for coulomb interactions only particle_map_c(delxinv, delyinv, delzinv, shift, part2grid, nupper, nlower, - nxlo_out, nylo_out, nzlo_out, nxhi_out, nyhi_out, nzhi_out); + nxlo_out, nylo_out, nzlo_out, nxhi_out, nyhi_out, nzhi_out); make_rho_c(); - + gc->reverse_comm_kspace(this,1,sizeof(FFT_SCALAR),REVERSE_RHO, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc_buf1,gc_buf2,MPI_FFT_SCALAR); brick2fft(nxlo_in, nylo_in, nzlo_in, nxhi_in, nyhi_in, nzhi_in, density_brick, density_fft, work1,remap); @@ -938,13 +938,13 @@ void PPPMDisp::compute(int eflag, int vflag) u_brick, v0_brick, v1_brick, v2_brick, v3_brick, v4_brick, v5_brick); gc->forward_comm_kspace(this,1,sizeof(FFT_SCALAR),FORWARD_AD, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc_buf1,gc_buf2,MPI_FFT_SCALAR); fieldforce_c_ad(); if (vflag_atom) - gc->forward_comm_kspace(this,6,sizeof(FFT_SCALAR),FORWARD_AD_PERATOM, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc->forward_comm_kspace(this,6,sizeof(FFT_SCALAR),FORWARD_AD_PERATOM, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); } else { poisson_ik(work1, work2, density_fft, fft1, fft2, @@ -957,31 +957,31 @@ void PPPMDisp::compute(int eflag, int vflag) u_brick, v0_brick, v1_brick, v2_brick, v3_brick, v4_brick, v5_brick); gc->forward_comm_kspace(this,3,sizeof(FFT_SCALAR),FORWARD_IK, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc_buf1,gc_buf2,MPI_FFT_SCALAR); fieldforce_c_ik(); if (evflag_atom) - gc->forward_comm_kspace(this,7,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc->forward_comm_kspace(this,7,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); } - + if (evflag_atom) fieldforce_c_peratom(); } if (function[1]) { - + // perform calculations for geometric mixing - + 
particle_map(delxinv_6, delyinv_6, delzinv_6, shift_6, part2grid_6, - nupper_6, nlower_6, + nupper_6, nlower_6, nxlo_out_6, nylo_out_6, nzlo_out_6, - nxhi_out_6, nyhi_out_6, nzhi_out_6); - + nxhi_out_6, nyhi_out_6, nzhi_out_6); + make_rho_g(); gc6->reverse_comm_kspace(this,1,sizeof(FFT_SCALAR),REVERSE_RHO_G, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc_buf1,gc_buf2,MPI_FFT_SCALAR); brick2fft(nxlo_in_6, nylo_in_6, nzlo_in_6, nxhi_in_6, nyhi_in_6, nzhi_in_6, density_brick_g, density_fft_g, work1_6,remap_6); @@ -994,16 +994,16 @@ void PPPMDisp::compute(int eflag, int vflag) energy_6, greensfn_6, virial_6, vg_6, vg2_6, u_brick_g, v0_brick_g, v1_brick_g, v2_brick_g, - v3_brick_g, v4_brick_g, v5_brick_g); + v3_brick_g, v4_brick_g, v5_brick_g); gc6->forward_comm_kspace(this,1,sizeof(FFT_SCALAR),FORWARD_AD_G, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc_buf1,gc_buf2,MPI_FFT_SCALAR); fieldforce_g_ad(); if (vflag_atom) - gc6->forward_comm_kspace(this,7,sizeof(FFT_SCALAR),FORWARD_AD_PERATOM_G, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc6->forward_comm_kspace(this,7,sizeof(FFT_SCALAR),FORWARD_AD_PERATOM_G, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); } else { poisson_ik(work1_6, work2_6, density_fft_g, fft1_6, fft2_6, @@ -1014,34 +1014,34 @@ void PPPMDisp::compute(int eflag, int vflag) fkx_6, fky_6, fkz_6,fkx2_6, fky2_6, fkz2_6, vdx_brick_g, vdy_brick_g, vdz_brick_g, virial_6, vg_6, vg2_6, u_brick_g, v0_brick_g, v1_brick_g, v2_brick_g, - v3_brick_g, v4_brick_g, v5_brick_g); + v3_brick_g, v4_brick_g, v5_brick_g); gc6->forward_comm_kspace(this,3,sizeof(FFT_SCALAR),FORWARD_IK_G, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc_buf1,gc_buf2,MPI_FFT_SCALAR); fieldforce_g_ik(); if (evflag_atom) - gc6->forward_comm_kspace(this,6,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM_G, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc6->forward_comm_kspace(this,6,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM_G, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); } - + if (evflag_atom) fieldforce_g_peratom(); } if (function[2]) { - + // perform calculations for arithmetic mixing - + particle_map(delxinv_6, delyinv_6, delzinv_6, shift_6, part2grid_6, - nupper_6, nlower_6, + nupper_6, nlower_6, nxlo_out_6, nylo_out_6, nzlo_out_6, - nxhi_out_6, nyhi_out_6, nzhi_out_6); - + nxhi_out_6, nyhi_out_6, nzhi_out_6); + make_rho_a(); gc->reverse_comm_kspace(this,7,sizeof(FFT_SCALAR),REVERSE_RHO_A, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc_buf1,gc_buf2,MPI_FFT_SCALAR); brick2fft_a(); @@ -1053,31 +1053,31 @@ void PPPMDisp::compute(int eflag, int vflag) energy_6, greensfn_6, virial_6, vg_6, vg2_6, u_brick_a3, v0_brick_a3, v1_brick_a3, v2_brick_a3, - v3_brick_a3, v4_brick_a3, v5_brick_a3); + v3_brick_a3, v4_brick_a3, v5_brick_a3); poisson_2s_ad(density_fft_a0, density_fft_a6, u_brick_a0, v0_brick_a0, v1_brick_a0, v2_brick_a0, - v3_brick_a0, v4_brick_a0, v5_brick_a0, + v3_brick_a0, v4_brick_a0, v5_brick_a0, u_brick_a6, v0_brick_a6, v1_brick_a6, v2_brick_a6, - v3_brick_a6, v4_brick_a6, v5_brick_a6); + v3_brick_a6, v4_brick_a6, v5_brick_a6); poisson_2s_ad(density_fft_a1, density_fft_a5, u_brick_a1, v0_brick_a1, v1_brick_a1, v2_brick_a1, - v3_brick_a1, v4_brick_a1, v5_brick_a1, + v3_brick_a1, v4_brick_a1, v5_brick_a1, u_brick_a5, v0_brick_a5, v1_brick_a5, v2_brick_a5, - v3_brick_a5, v4_brick_a5, v5_brick_a5); + v3_brick_a5, v4_brick_a5, v5_brick_a5); poisson_2s_ad(density_fft_a2, density_fft_a4, u_brick_a2, v0_brick_a2, v1_brick_a2, v2_brick_a2, - v3_brick_a2, v4_brick_a2, v5_brick_a2, + v3_brick_a2, v4_brick_a2, v5_brick_a2, u_brick_a4, v0_brick_a4, v1_brick_a4, v2_brick_a4, - v3_brick_a4, v4_brick_a4, 
v5_brick_a4); + v3_brick_a4, v4_brick_a4, v5_brick_a4); gc6->forward_comm_kspace(this,7,sizeof(FFT_SCALAR),FORWARD_AD_A, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc_buf1,gc_buf2,MPI_FFT_SCALAR); fieldforce_a_ad(); if (evflag_atom) - gc6->forward_comm_kspace(this,42,sizeof(FFT_SCALAR),FORWARD_AD_PERATOM_A, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc6->forward_comm_kspace(this,42,sizeof(FFT_SCALAR),FORWARD_AD_PERATOM_A, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); } else { poisson_ik(work1_6, work2_6, density_fft_a3, fft1_6, fft2_6, @@ -1088,55 +1088,55 @@ void PPPMDisp::compute(int eflag, int vflag) fkx_6, fky_6, fkz_6,fkx2_6, fky2_6, fkz2_6, vdx_brick_a3, vdy_brick_a3, vdz_brick_a3, virial_6, vg_6, vg2_6, u_brick_a3, v0_brick_a3, v1_brick_a3, v2_brick_a3, - v3_brick_a3, v4_brick_a3, v5_brick_a3); + v3_brick_a3, v4_brick_a3, v5_brick_a3); poisson_2s_ik(density_fft_a0, density_fft_a6, vdx_brick_a0, vdy_brick_a0, vdz_brick_a0, vdx_brick_a6, vdy_brick_a6, vdz_brick_a6, u_brick_a0, v0_brick_a0, v1_brick_a0, v2_brick_a0, - v3_brick_a0, v4_brick_a0, v5_brick_a0, + v3_brick_a0, v4_brick_a0, v5_brick_a0, u_brick_a6, v0_brick_a6, v1_brick_a6, v2_brick_a6, - v3_brick_a6, v4_brick_a6, v5_brick_a6); + v3_brick_a6, v4_brick_a6, v5_brick_a6); poisson_2s_ik(density_fft_a1, density_fft_a5, vdx_brick_a1, vdy_brick_a1, vdz_brick_a1, vdx_brick_a5, vdy_brick_a5, vdz_brick_a5, u_brick_a1, v0_brick_a1, v1_brick_a1, v2_brick_a1, - v3_brick_a1, v4_brick_a1, v5_brick_a1, + v3_brick_a1, v4_brick_a1, v5_brick_a1, u_brick_a5, v0_brick_a5, v1_brick_a5, v2_brick_a5, - v3_brick_a5, v4_brick_a5, v5_brick_a5); + v3_brick_a5, v4_brick_a5, v5_brick_a5); poisson_2s_ik(density_fft_a2, density_fft_a4, vdx_brick_a2, vdy_brick_a2, vdz_brick_a2, vdx_brick_a4, vdy_brick_a4, vdz_brick_a4, u_brick_a2, v0_brick_a2, v1_brick_a2, v2_brick_a2, - v3_brick_a2, v4_brick_a2, v5_brick_a2, + v3_brick_a2, v4_brick_a2, v5_brick_a2, u_brick_a4, v0_brick_a4, v1_brick_a4, v2_brick_a4, - v3_brick_a4, v4_brick_a4, v5_brick_a4); + v3_brick_a4, v4_brick_a4, v5_brick_a4); gc6->forward_comm_kspace(this,18,sizeof(FFT_SCALAR),FORWARD_IK_A, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc_buf1,gc_buf2,MPI_FFT_SCALAR); fieldforce_a_ik(); if (evflag_atom) - gc6->forward_comm_kspace(this,49,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM_A, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc6->forward_comm_kspace(this,49,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM_A, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); } - + if (evflag_atom) fieldforce_a_peratom(); } if (function[3]) { - + // perform calculations if no mixing rule applies - + particle_map(delxinv_6, delyinv_6, delzinv_6, shift_6, part2grid_6, - nupper_6, nlower_6, + nupper_6, nlower_6, nxlo_out_6, nylo_out_6, nzlo_out_6, - nxhi_out_6, nyhi_out_6, nzhi_out_6); + nxhi_out_6, nyhi_out_6, nzhi_out_6); make_rho_none(); gc->reverse_comm_kspace(this,1,sizeof(FFT_SCALAR),REVERSE_RHO_NONE, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc_buf1,gc_buf2,MPI_FFT_SCALAR); brick2fft_none(); @@ -1151,13 +1151,13 @@ void PPPMDisp::compute(int eflag, int vflag) } gc6->forward_comm_kspace(this,1,sizeof(FFT_SCALAR),FORWARD_AD_NONE, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc_buf1,gc_buf2,MPI_FFT_SCALAR); fieldforce_none_ad(); if (vflag_atom) - gc6->forward_comm_kspace(this,6,sizeof(FFT_SCALAR),FORWARD_AD_PERATOM_NONE, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc6->forward_comm_kspace(this,6,sizeof(FFT_SCALAR),FORWARD_AD_PERATOM_NONE, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); } else { int n = 0; @@ -1172,15 +1172,15 @@ void PPPMDisp::compute(int eflag, int vflag) } 
gc6->forward_comm_kspace(this,3,sizeof(FFT_SCALAR),FORWARD_IK_NONE, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc_buf1,gc_buf2,MPI_FFT_SCALAR); fieldforce_none_ik(); if (evflag_atom) - gc6->forward_comm_kspace(this,7,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM_NONE, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc6->forward_comm_kspace(this,7,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM_NONE, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); } - + if (evflag_atom) fieldforce_none_peratom(); } @@ -1194,7 +1194,7 @@ void PPPMDisp::compute(int eflag, int vflag) // sum energy across procs and add in volume-dependent term const double qscale = force->qqrd2e * scale; - + if (eflag_global) { double energy_all; MPI_Allreduce(&energy_1,&energy_all,1,MPI_DOUBLE,MPI_SUM,world); @@ -1234,7 +1234,7 @@ void PPPMDisp::compute(int eflag, int vflag) // coulomb self energy correction for (i = 0; i < atom->nlocal; i++) { eatom[i] -= qscale*g_ewald*q[i]*q[i]/MY_PIS + - qscale*MY_PI2*q[i]*qsum / (g_ewald*g_ewald*volume); + qscale*MY_PI2*q[i]*qsum / (g_ewald*g_ewald*volume); } } if (function[1] + function[2] + function[3]) { @@ -1254,7 +1254,7 @@ void PPPMDisp::compute(int eflag, int vflag) for (i = 0; i < atom->nlocal; i++) { tmp = atom->type[i]; for (int n = 0; n < 3; n++) - vatom[i][n] -= MY_PI*MY_PIS/(6*volume)*pow(g_ewald_6,3)*csumi[tmp]; + vatom[i][n] -= MY_PI*MY_PIS/(6*volume)*pow(g_ewald_6,3)*csumi[tmp]; } } } @@ -1665,7 +1665,7 @@ void _noopt PPPMDisp::allocate() memory->create2d_offset(rho_coeff,order,(1-order)/2,order/2,"pppm/disp:rho_coeff"); memory->create2d_offset(drho1d,3,-order/2,order/2,"pppm/disp:rho1d"); memory->create2d_offset(drho_coeff,order,(1-order)/2,order/2, - "pppm/disp:drho_coeff"); + "pppm/disp:drho_coeff"); memory->create(greensfn,nfft_both,"pppm/disp:greensfn"); memory->create(vg,nfft_both,6,"pppm/disp:vg"); @@ -1721,7 +1721,7 @@ void _noopt PPPMDisp::allocate() if (differentiation_flag) npergrid = 1; else npergrid = 3; - + memory->create(gc_buf1,npergrid*ngc_buf1,"pppm:gc_buf1"); memory->create(gc_buf2,npergrid*ngc_buf2,"pppm:gc_buf2"); } @@ -1741,10 +1741,10 @@ void _noopt PPPMDisp::allocate() memory->create(gf_b_6,order_6,"pppm/disp:gf_b_6"); memory->create2d_offset(rho1d_6,3,-order_6/2,order_6/2,"pppm/disp:rho1d_6"); memory->create2d_offset(rho_coeff_6,order_6,(1-order_6)/2,order_6/2, - "pppm/disp:rho_coeff_6"); + "pppm/disp:rho_coeff_6"); memory->create2d_offset(drho1d_6,3,-order_6/2,order_6/2,"pppm/disp:drho1d_6"); memory->create2d_offset(drho_coeff_6,order_6,(1-order_6)/2,order_6/2, - "pppm/disp:drho_coeff_6"); + "pppm/disp:drho_coeff_6"); memory->create(greensfn_6,nfft_both_6,"pppm/disp:greensfn_6"); memory->create(vg_6,nfft_both_6,6,"pppm/disp:vg_6"); @@ -1777,35 +1777,35 @@ void _noopt PPPMDisp::allocate() fft1_6 = new FFT3d(lmp,world,nx_pppm_6,ny_pppm_6,nz_pppm_6, - nxlo_fft_6,nxhi_fft_6,nylo_fft_6,nyhi_fft_6,nzlo_fft_6,nzhi_fft_6, - nxlo_fft_6,nxhi_fft_6,nylo_fft_6,nyhi_fft_6,nzlo_fft_6,nzhi_fft_6, - 0,0,&tmp,collective_flag); + nxlo_fft_6,nxhi_fft_6,nylo_fft_6,nyhi_fft_6,nzlo_fft_6,nzhi_fft_6, + nxlo_fft_6,nxhi_fft_6,nylo_fft_6,nyhi_fft_6,nzlo_fft_6,nzhi_fft_6, + 0,0,&tmp,collective_flag); fft2_6 = new FFT3d(lmp,world,nx_pppm_6,ny_pppm_6,nz_pppm_6, - nxlo_fft_6,nxhi_fft_6,nylo_fft_6,nyhi_fft_6,nzlo_fft_6,nzhi_fft_6, - nxlo_in_6,nxhi_in_6,nylo_in_6,nyhi_in_6,nzlo_in_6,nzhi_in_6, - 0,0,&tmp,collective_flag); + nxlo_fft_6,nxhi_fft_6,nylo_fft_6,nyhi_fft_6,nzlo_fft_6,nzhi_fft_6, + nxlo_in_6,nxhi_in_6,nylo_in_6,nyhi_in_6,nzlo_in_6,nzhi_in_6, + 0,0,&tmp,collective_flag); remap_6 = new Remap(lmp,world, - 
nxlo_in_6,nxhi_in_6,nylo_in_6,nyhi_in_6,nzlo_in_6,nzhi_in_6, - nxlo_fft_6,nxhi_fft_6,nylo_fft_6,nyhi_fft_6,nzlo_fft_6,nzhi_fft_6, - 1,0,0,FFT_PRECISION,collective_flag); + nxlo_in_6,nxhi_in_6,nylo_in_6,nyhi_in_6,nzlo_in_6,nzhi_in_6, + nxlo_fft_6,nxhi_fft_6,nylo_fft_6,nyhi_fft_6,nzlo_fft_6,nzhi_fft_6, + 1,0,0,FFT_PRECISION,collective_flag); // create ghost grid object for rho and electric field communication // also create 2 bufs for ghost grid cell comm, passed to GridComm methods gc6 = new GridComm(lmp,world,nx_pppm_6,ny_pppm_6,nz_pppm_6, - nxlo_in_6,nxhi_in_6,nylo_in_6,nyhi_in_6,nzlo_in_6,nzhi_in_6, - nxlo_out_6,nxhi_out_6,nylo_out_6,nyhi_out_6,nzlo_out_6,nzhi_out_6); + nxlo_in_6,nxhi_in_6,nylo_in_6,nyhi_in_6,nzlo_in_6,nzhi_in_6, + nxlo_out_6,nxhi_out_6,nylo_out_6,nyhi_out_6,nzlo_out_6,nzhi_out_6); gc6->setup(ngc6_buf1,ngc6_buf2); if (differentiation_flag) npergrid6 = 1; else npergrid6 = 7; - + memory->create(gc6_buf1,npergrid6*ngc6_buf1,"pppm:gc_buf1"); memory->create(gc6_buf2,npergrid6*ngc6_buf2,"pppm:gc_buf2"); } @@ -1825,35 +1825,35 @@ void _noopt PPPMDisp::allocate() memory->create(gf_b_6,order_6,"pppm/disp:gf_b_6"); memory->create2d_offset(rho1d_6,3,-order_6/2,order_6/2,"pppm/disp:rho1d_6"); memory->create2d_offset(rho_coeff_6,order_6,(1-order_6)/2,order_6/2, - "pppm/disp:rho_coeff_6"); + "pppm/disp:rho_coeff_6"); memory->create2d_offset(drho1d_6,3,-order_6/2,order_6/2,"pppm/disp:drho1d_6"); memory->create2d_offset(drho_coeff_6,order_6,(1-order_6)/2,order_6/2, - "pppm/disp:drho_coeff_6"); + "pppm/disp:drho_coeff_6"); memory->create(greensfn_6,nfft_both_6,"pppm/disp:greensfn_6"); memory->create(vg_6,nfft_both_6,6,"pppm/disp:vg_6"); memory->create(vg2_6,nfft_both_6,3,"pppm/disp:vg2_6"); memory->create3d_offset(density_brick_a0,nzlo_out_6, - nzhi_out_6,nylo_out_6,nyhi_out_6, + nzhi_out_6,nylo_out_6,nyhi_out_6, nxlo_out_6,nxhi_out_6,"pppm/disp:density_brick_a0"); memory->create3d_offset(density_brick_a1,nzlo_out_6, - nzhi_out_6,nylo_out_6,nyhi_out_6, + nzhi_out_6,nylo_out_6,nyhi_out_6, nxlo_out_6,nxhi_out_6,"pppm/disp:density_brick_a1"); memory->create3d_offset(density_brick_a2,nzlo_out_6,nzhi_out_6, - nylo_out_6,nyhi_out_6, + nylo_out_6,nyhi_out_6, nxlo_out_6,nxhi_out_6,"pppm/disp:density_brick_a2"); memory->create3d_offset(density_brick_a3,nzlo_out_6,nzhi_out_6, - nylo_out_6,nyhi_out_6, + nylo_out_6,nyhi_out_6, nxlo_out_6,nxhi_out_6,"pppm/disp:density_brick_a3"); memory->create3d_offset(density_brick_a4,nzlo_out_6,nzhi_out_6, - nylo_out_6,nyhi_out_6, + nylo_out_6,nyhi_out_6, nxlo_out_6,nxhi_out_6,"pppm/disp:density_brick_a4"); memory->create3d_offset(density_brick_a5,nzlo_out_6,nzhi_out_6, - nylo_out_6,nyhi_out_6, + nylo_out_6,nyhi_out_6, nxlo_out_6,nxhi_out_6,"pppm/disp:density_brick_a5"); memory->create3d_offset(density_brick_a6,nzlo_out_6,nzhi_out_6, - nylo_out_6,nyhi_out_6, + nylo_out_6,nyhi_out_6, nxlo_out_6,nxhi_out_6,"pppm/disp:density_brick_a6"); memory->create(density_fft_a0,nfft_both_6,"pppm/disp:density_fft_a0"); @@ -1959,17 +1959,17 @@ void _noopt PPPMDisp::allocate() // create ghost grid object for rho and electric field communication // also create 2 bufs for ghost grid cell comm, passed to GridComm methods - + gc6 = new GridComm(lmp,world,nx_pppm_6,ny_pppm_6,nz_pppm_6, - nxlo_in_6,nxhi_in_6,nylo_in_6,nyhi_in_6,nzlo_in_6,nzhi_in_6, - nxlo_out_6,nxhi_out_6,nylo_out_6,nyhi_out_6,nzlo_out_6,nzhi_out_6); + nxlo_in_6,nxhi_in_6,nylo_in_6,nyhi_in_6,nzlo_in_6,nzhi_in_6, + nxlo_out_6,nxhi_out_6,nylo_out_6,nyhi_out_6,nzlo_out_6,nzhi_out_6); gc6->setup(ngc6_buf1,ngc6_buf2); if 
(differentiation_flag) npergrid6 = 7; else npergrid6 = 18; - + memory->create(gc6_buf1,npergrid6*ngc6_buf1,"pppm:gc_buf1"); memory->create(gc6_buf2,npergrid6*ngc6_buf2,"pppm:gc_buf2"); } @@ -1989,21 +1989,21 @@ void _noopt PPPMDisp::allocate() memory->create(gf_b_6,order_6,"pppm/disp:gf_b_6"); memory->create2d_offset(rho1d_6,3,-order_6/2,order_6/2,"pppm/disp:rho1d_6"); memory->create2d_offset(rho_coeff_6,order_6,(1-order_6)/2,order_6/2, - "pppm/disp:rho_coeff_6"); + "pppm/disp:rho_coeff_6"); memory->create2d_offset(drho1d_6,3,-order_6/2,order_6/2,"pppm/disp:drho1d_6"); memory->create2d_offset(drho_coeff_6,order_6,(1-order_6)/2,order_6/2, - "pppm/disp:drho_coeff_6"); + "pppm/disp:drho_coeff_6"); memory->create(greensfn_6,nfft_both_6,"pppm/disp:greensfn_6"); memory->create(vg_6,nfft_both_6,6,"pppm/disp:vg_6"); memory->create(vg2_6,nfft_both_6,3,"pppm/disp:vg2_6"); memory->create4d_offset(density_brick_none,nsplit_alloc, - nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, + nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, nxlo_out_6,nxhi_out_6,"pppm/disp:density_brick_none"); if ( differentiation_flag == 1) { memory->create4d_offset(u_brick_none,nsplit_alloc, - nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, + nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, nxlo_out_6,nxhi_out_6,"pppm/disp:u_brick_none"); memory->create(sf_precoeff1_6,nfft_both_6,"pppm/disp:sf_precoeff1_6"); @@ -2015,17 +2015,17 @@ void _noopt PPPMDisp::allocate() } else { memory->create4d_offset(vdx_brick_none,nsplit_alloc, - nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, + nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, nxlo_out_6,nxhi_out_6,"pppm/disp:vdx_brick_none"); memory->create4d_offset(vdy_brick_none,nsplit_alloc, - nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, + nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, nxlo_out_6,nxhi_out_6,"pppm/disp:vdy_brick_none"); memory->create4d_offset(vdz_brick_none,nsplit_alloc, - nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, + nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, nxlo_out_6,nxhi_out_6,"pppm/disp:vdz_brick_none"); } memory->create(density_fft_none,nsplit_alloc,nfft_both_6, - "pppm/disp:density_fft_none"); + "pppm/disp:density_fft_none"); int tmp; @@ -2046,17 +2046,17 @@ void _noopt PPPMDisp::allocate() // create ghost grid object for rho and electric field communication // also create 2 bufs for ghost grid cell comm, passed to GridComm methods - + gc6 = new GridComm(lmp,world,nx_pppm_6,ny_pppm_6,nz_pppm_6, - nxlo_in_6,nxhi_in_6,nylo_in_6,nyhi_in_6,nzlo_in_6,nzhi_in_6, - nxlo_out_6,nxhi_out_6,nylo_out_6,nyhi_out_6,nzlo_out_6,nzhi_out_6); + nxlo_in_6,nxhi_in_6,nylo_in_6,nyhi_in_6,nzlo_in_6,nzhi_in_6, + nxlo_out_6,nxhi_out_6,nylo_out_6,nyhi_out_6,nzlo_out_6,nzhi_out_6); gc6->setup(ngc6_buf1,ngc6_buf2); if (differentiation_flag) npergrid6 = 1; else npergrid6 = 3; - + memory->create(gc6_buf1,npergrid6*ngc6_buf1,"pppm:gc_buf1"); memory->create(gc6_buf2,npergrid6*ngc6_buf2,"pppm:gc_buf2"); } @@ -2253,26 +2253,26 @@ void PPPMDisp::allocate_peratom() if (function[3]) { if (differentiation_flag != 1) memory->create4d_offset(u_brick_none,nsplit_alloc, - nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, + nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, nxlo_out_6,nxhi_out_6,"pppm/disp:u_brick_none"); memory->create4d_offset(v0_brick_none,nsplit_alloc, - nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, + nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, nxlo_out_6,nxhi_out_6,"pppm/disp:v0_brick_none"); memory->create4d_offset(v1_brick_none,nsplit_alloc, - nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, + 
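/* ----------------------------------------------------------------------
   minimal standalone sketch, with hypothetical names, of the brick layout
   behind the create3d_offset() allocations above: a ghost-extended box
   indexed as [z][y][x] over offset index ranges, stored contiguously with
   x varying fastest, plus packing of a sub-box into a flat comm buffer
------------------------------------------------------------------------- */

#include <cstddef>
#include <vector>

struct Brick {
  int xlo,xhi,ylo,yhi,zlo,zhi;       // inclusive bounds, may be negative
  std::vector<double> data;

  Brick(int xl,int xh,int yl,int yh,int zl,int zh) :
    xlo(xl),xhi(xh),ylo(yl),yhi(yh),zlo(zl),zhi(zh),
    data((std::size_t)(zh-zl+1)*(yh-yl+1)*(xh-xl+1),0.0) {}

  // map offset indices (x,y,z) to the flat storage location, x fastest
  double &operator()(int x, int y, int z) {
    std::size_t nx = xhi-xlo+1, ny = yhi-ylo+1;
    return data[((std::size_t)(z-zlo)*ny + (y-ylo))*nx + (x-xlo)];
  }
};

// pack the sub-box [sxlo:sxhi][sylo:syhi][szlo:szhi] into buf
void pack(Brick &b, int sxlo, int sxhi, int sylo, int syhi,
          int szlo, int szhi, std::vector<double> &buf)
{
  buf.clear();
  for (int z = szlo; z <= szhi; z++)
    for (int y = sylo; y <= syhi; y++)
      for (int x = sxlo; x <= sxhi; x++)
        buf.push_back(b(x,y,z));
}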
nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, nxlo_out_6,nxhi_out_6,"pppm/disp:v1_brick_none"); memory->create4d_offset(v2_brick_none,nsplit_alloc, - nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, + nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, nxlo_out_6,nxhi_out_6,"pppm/disp:v2_brick_none"); memory->create4d_offset(v3_brick_none,nsplit_alloc, - nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, + nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, nxlo_out_6,nxhi_out_6,"pppm/disp:v3_brick_none"); memory->create4d_offset(v4_brick_none,nsplit_alloc, - nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, + nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, nxlo_out_6,nxhi_out_6,"pppm/disp:v4_brick_none"); memory->create4d_offset(v5_brick_none,nsplit_alloc, - nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, + nzlo_out_6,nzhi_out_6,nylo_out_6,nyhi_out_6, nxlo_out_6,nxhi_out_6,"pppm/disp:v5_brick_none"); // use same GC ghost grid object for peratom grid communication @@ -2604,7 +2604,7 @@ void PPPMDisp::set_grid() while (1) { // set grid dimension - + nx_pppm = static_cast (xprd/h_x); ny_pppm = static_cast (yprd/h_y); nz_pppm = static_cast (zprd_slab/h_z); @@ -2623,7 +2623,7 @@ void PPPMDisp::set_grid() count++; if (dfkspace <= accuracy) break; - + if (count > 500) error->all(FLERR, "Could not compute grid size"); h *= 0.95; h_x = h_y = h_z = h; @@ -2642,15 +2642,15 @@ void PPPMDisp::set_grid() ------------------------------------------------------------------------- */ void PPPMDisp::set_fft_parameters(int& nx_p,int& ny_p,int& nz_p, - int& nxlo_f,int& nylo_f,int& nzlo_f, - int& nxhi_f,int& nyhi_f,int& nzhi_f, - int& nxlo_i,int& nylo_i,int& nzlo_i, - int& nxhi_i,int& nyhi_i,int& nzhi_i, - int& nxlo_o,int& nylo_o,int& nzlo_o, - int& nxhi_o,int& nyhi_o,int& nzhi_o, - int& nlow, int& nupp, - int& ng, int& nf, int& nfb, - double& sft,double& sftone, int& ord) + int& nxlo_f,int& nylo_f,int& nzlo_f, + int& nxhi_f,int& nyhi_f,int& nzhi_f, + int& nxlo_i,int& nylo_i,int& nzlo_i, + int& nxhi_i,int& nyhi_i,int& nzhi_i, + int& nxlo_o,int& nylo_o,int& nzlo_o, + int& nxhi_o,int& nyhi_o,int& nzhi_o, + int& nlow, int& nupp, + int& ng, int& nf, int& nfb, + double& sft,double& sftone, int& ord) { // global indices of PPPM grid range from 0 to N-1 // nlo_in,nhi_in = lower/upper limits of the 3d sub-brick of @@ -2988,7 +2988,7 @@ double PPPMDisp::compute_qopt_ik() // loop over entire FFT grid // each proc calculates contributions from every Pth grid point - + bigint ngridtotal = (bigint) nx_pppm * ny_pppm * nz_pppm; int nxy_pppm = nx_pppm * ny_pppm; @@ -3005,45 +3005,45 @@ double PPPMDisp::compute_qopt_ik() sqk = pow(unitkx*kper,2.0) + pow(unitky*lper,2.0) + pow(unitkz*mper,2.0); if (sqk == 0.0) continue; - + sum1 = sum2 = sum3 = 0.0; - + for (nx = -nbx; nx <= nbx; nx++) { qx = unitkx*(kper+nx_pppm*nx); sx = exp(-0.25*pow(qx/g_ewald,2.0)); wx = 1.0; argx = 0.5*qx*xprd/nx_pppm; if (argx != 0.0) wx = pow(sin(argx)/argx,order); - + for (ny = -nby; ny <= nby; ny++) { - qy = unitky*(lper+ny_pppm*ny); - sy = exp(-0.25*pow(qy/g_ewald,2.0)); - wy = 1.0; - argy = 0.5*qy*yprd/ny_pppm; - if (argy != 0.0) wy = pow(sin(argy)/argy,order); - - for (nz = -nbz; nz <= nbz; nz++) { - qz = unitkz*(mper+nz_pppm*nz); - sz = exp(-0.25*pow(qz/g_ewald,2.0)); - wz = 1.0; - argz = 0.5*qz*zprd_slab/nz_pppm; - if (argz != 0.0) wz = pow(sin(argz)/argz,order); - - dot1 = unitkx*kper*qx + unitky*lper*qy + unitkz*mper*qz; - dot2 = qx*qx+qy*qy+qz*qz; - u2 = pow(wx*wy*wz,2.0); - sum1 += sx*sy*sz*sx*sy*sz/dot2*4.0*4.0*MY_PI*MY_PI; - sum2 += u2*sx*sy*sz*4.0*MY_PI/dot2*dot1; - sum3 += 
u2; - } + qy = unitky*(lper+ny_pppm*ny); + sy = exp(-0.25*pow(qy/g_ewald,2.0)); + wy = 1.0; + argy = 0.5*qy*yprd/ny_pppm; + if (argy != 0.0) wy = pow(sin(argy)/argy,order); + + for (nz = -nbz; nz <= nbz; nz++) { + qz = unitkz*(mper+nz_pppm*nz); + sz = exp(-0.25*pow(qz/g_ewald,2.0)); + wz = 1.0; + argz = 0.5*qz*zprd_slab/nz_pppm; + if (argz != 0.0) wz = pow(sin(argz)/argz,order); + + dot1 = unitkx*kper*qx + unitky*lper*qy + unitkz*mper*qz; + dot2 = qx*qx+qy*qy+qz*qz; + u2 = pow(wx*wy*wz,2.0); + sum1 += sx*sy*sz*sx*sy*sz/dot2*4.0*4.0*MY_PI*MY_PI; + sum2 += u2*sx*sy*sz*4.0*MY_PI/dot2*dot1; + sum3 += u2; + } } } - + sum2 *= sum2; sum3 *= sum3*sqk; qopt += sum1 -sum2/sum3; } - + return qopt; } @@ -3079,7 +3079,7 @@ double PPPMDisp::compute_qopt_ad() // loop over entire FFT grid // each proc calculates contributions from every Pth grid point - + bigint ngridtotal = (bigint) nx_pppm * ny_pppm * nz_pppm; int nxy_pppm = nx_pppm * ny_pppm; @@ -3105,31 +3105,31 @@ double PPPMDisp::compute_qopt_ad() wx = 1.0; argx = 0.5*qx*xprd/nx_pppm; if (argx != 0.0) wx = pow(sin(argx)/argx,order); - - for (ny = -nby; ny <= nby; ny++) { - qy = unitky*(lper+ny_pppm*ny); - sy = exp(-0.25*pow(qy/g_ewald,2.0)); - wy = 1.0; - argy = 0.5*qy*yprd/ny_pppm; - if (argy != 0.0) wy = pow(sin(argy)/argy,order); - - for (nz = -nbz; nz <= nbz; nz++) { - qz = unitkz*(mper+nz_pppm*nz); - sz = exp(-0.25*pow(qz/g_ewald,2.0)); - wz = 1.0; - argz = 0.5*qz*zprd_slab/nz_pppm; - if (argz != 0.0) wz = pow(sin(argz)/argz,order); - dot2 = qx*qx+qy*qy+qz*qz; - u2 = pow(wx*wy*wz,2.0); - sum1 += sx*sy*sz*sx*sy*sz/dot2*4.0*4.0*MY_PI*MY_PI; - sum2 += sx*sy*sz * u2*4.0*MY_PI; - sum3 += u2; - sum4 += dot2*u2; - } + for (ny = -nby; ny <= nby; ny++) { + qy = unitky*(lper+ny_pppm*ny); + sy = exp(-0.25*pow(qy/g_ewald,2.0)); + wy = 1.0; + argy = 0.5*qy*yprd/ny_pppm; + if (argy != 0.0) wy = pow(sin(argy)/argy,order); + + for (nz = -nbz; nz <= nbz; nz++) { + qz = unitkz*(mper+nz_pppm*nz); + sz = exp(-0.25*pow(qz/g_ewald,2.0)); + wz = 1.0; + argz = 0.5*qz*zprd_slab/nz_pppm; + if (argz != 0.0) wz = pow(sin(argz)/argz,order); + + dot2 = qx*qx+qy*qy+qz*qz; + u2 = pow(wx*wy*wz,2.0); + sum1 += sx*sy*sz*sx*sy*sz/dot2*4.0*4.0*MY_PI*MY_PI; + sum2 += sx*sy*sz * u2*4.0*MY_PI; + sum3 += u2; + sum4 += dot2*u2; + } } } - + sum2 *= sum2; qopt += sum1 - sum2/(sum3*sum4); } @@ -3173,7 +3173,7 @@ double PPPMDisp::compute_qopt_6_ik() // loop over entire FFT grid // each proc calculates contributions from every Pth grid point - + bigint ngridtotal = (bigint) nx_pppm_6 * ny_pppm_6 * nz_pppm_6; int nxy_pppm_6 = nx_pppm_6 * ny_pppm_6; @@ -3190,7 +3190,7 @@ double PPPMDisp::compute_qopt_6_ik() sqk = pow(unitkx*kper,2.0) + pow(unitky*lper,2.0) + pow(unitkz*mper,2.0); if (sqk == 0.0) continue; - + sum1 = sum2 = sum3 = 0.0; for (nx = -nbx; nx <= nbx; nx++) { @@ -3199,32 +3199,32 @@ double PPPMDisp::compute_qopt_6_ik() wx = 1.0; argx = 0.5*qx*xprd/nx_pppm_6; if (argx != 0.0) wx = pow(sin(argx)/argx,order_6); - - for (ny = -nby; ny <= nby; ny++) { - qy = unitky*(lper+ny_pppm_6*ny); - sy = exp(-qy*qy*inv2ew*inv2ew); - wy = 1.0; - argy = 0.5*qy*yprd/ny_pppm_6; - if (argy != 0.0) wy = pow(sin(argy)/argy,order_6); - - for (nz = -nbz; nz <= nbz; nz++) { - qz = unitkz*(mper+nz_pppm_6*nz); - sz = exp(-qz*qz*inv2ew*inv2ew); - wz = 1.0; - argz = 0.5*qz*zprd_slab/nz_pppm_6; - if (argz != 0.0) wz = pow(sin(argz)/argz,order_6); - dot1 = unitkx*kper*qx + unitky*lper*qy + unitkz*mper*qz; - dot2 = qx*qx+qy*qy+qz*qz; - rtdot2 = sqrt(dot2); - term = (1-2*dot2*inv2ew*inv2ew)*sx*sy*sz + - 
2*dot2*rtdot2*inv2ew*inv2ew*inv2ew*rtpi*erfc(rtdot2*inv2ew); - term *= g_ewald_6*g_ewald_6*g_ewald_6; - u2 = pow(wx*wy*wz,2.0); - sum1 += term*term*MY_PI*MY_PI*MY_PI/9.0 * dot2; - sum2 += -u2*term*MY_PI*rtpi/3.0*dot1; - sum3 += u2; - } + for (ny = -nby; ny <= nby; ny++) { + qy = unitky*(lper+ny_pppm_6*ny); + sy = exp(-qy*qy*inv2ew*inv2ew); + wy = 1.0; + argy = 0.5*qy*yprd/ny_pppm_6; + if (argy != 0.0) wy = pow(sin(argy)/argy,order_6); + + for (nz = -nbz; nz <= nbz; nz++) { + qz = unitkz*(mper+nz_pppm_6*nz); + sz = exp(-qz*qz*inv2ew*inv2ew); + wz = 1.0; + argz = 0.5*qz*zprd_slab/nz_pppm_6; + if (argz != 0.0) wz = pow(sin(argz)/argz,order_6); + + dot1 = unitkx*kper*qx + unitky*lper*qy + unitkz*mper*qz; + dot2 = qx*qx+qy*qy+qz*qz; + rtdot2 = sqrt(dot2); + term = (1-2*dot2*inv2ew*inv2ew)*sx*sy*sz + + 2*dot2*rtdot2*inv2ew*inv2ew*inv2ew*rtpi*erfc(rtdot2*inv2ew); + term *= g_ewald_6*g_ewald_6*g_ewald_6; + u2 = pow(wx*wy*wz,2.0); + sum1 += term*term*MY_PI*MY_PI*MY_PI/9.0 * dot2; + sum2 += -u2*term*MY_PI*rtpi/3.0*dot1; + sum3 += u2; + } } } sum2 *= sum2; @@ -3271,7 +3271,7 @@ double PPPMDisp::compute_qopt_6_ad() // loop over entire FFT grid // each proc calculates contributions from every Pth grid point - + bigint ngridtotal = (bigint) nx_pppm_6 * ny_pppm_6 * nz_pppm_6; int nxy_pppm_6 = nx_pppm_6 * ny_pppm_6; @@ -3288,7 +3288,7 @@ double PPPMDisp::compute_qopt_6_ad() sqk = pow(unitkx*kper,2.0) + pow(unitky*lper,2.0) + pow(unitkz*mper,2.0); if (sqk == 0.0) continue; - + sum1 = sum2 = sum3 = sum4 = 0.0; for (nx = -nbx; nx <= nbx; nx++) { @@ -3299,30 +3299,30 @@ double PPPMDisp::compute_qopt_6_ad() if (argx != 0.0) wx = pow(sin(argx)/argx,order_6); for (ny = -nby; ny <= nby; ny++) { - qy = unitky*(lper+ny_pppm_6*ny); - sy = exp(-qy*qy*inv2ew*inv2ew); - wy = 1.0; - argy = 0.5*qy*yprd/ny_pppm_6; - if (argy != 0.0) wy = pow(sin(argy)/argy,order_6); + qy = unitky*(lper+ny_pppm_6*ny); + sy = exp(-qy*qy*inv2ew*inv2ew); + wy = 1.0; + argy = 0.5*qy*yprd/ny_pppm_6; + if (argy != 0.0) wy = pow(sin(argy)/argy,order_6); - for (nz = -nbz; nz <= nbz; nz++) { - qz = unitkz*(mper+nz_pppm_6*nz); - sz = exp(-qz*qz*inv2ew*inv2ew); - wz = 1.0; - argz = 0.5*qz*zprd_slab/nz_pppm_6; - if (argz != 0.0) wz = pow(sin(argz)/argz,order_6); + for (nz = -nbz; nz <= nbz; nz++) { + qz = unitkz*(mper+nz_pppm_6*nz); + sz = exp(-qz*qz*inv2ew*inv2ew); + wz = 1.0; + argz = 0.5*qz*zprd_slab/nz_pppm_6; + if (argz != 0.0) wz = pow(sin(argz)/argz,order_6); - dot2 = qx*qx+qy*qy+qz*qz; - rtdot2 = sqrt(dot2); - term = (1-2*dot2*inv2ew*inv2ew)*sx*sy*sz + - 2*dot2*rtdot2*inv2ew*inv2ew*inv2ew*rtpi*erfc(rtdot2*inv2ew); - term *= g_ewald_6*g_ewald_6*g_ewald_6; - u2 = pow(wx*wy*wz,2.0); - sum1 += term*term*MY_PI*MY_PI*MY_PI/9.0 * dot2; - sum2 += -term*MY_PI*rtpi/3.0 * u2 * dot2; - sum3 += u2; - sum4 += dot2*u2; - } + dot2 = qx*qx+qy*qy+qz*qz; + rtdot2 = sqrt(dot2); + term = (1-2*dot2*inv2ew*inv2ew)*sx*sy*sz + + 2*dot2*rtdot2*inv2ew*inv2ew*inv2ew*rtpi*erfc(rtdot2*inv2ew); + term *= g_ewald_6*g_ewald_6*g_ewald_6; + u2 = pow(wx*wy*wz,2.0); + sum1 += term*term*MY_PI*MY_PI*MY_PI/9.0 * dot2; + sum2 += -term*MY_PI*rtpi/3.0 * u2 * dot2; + sum3 += u2; + sum4 += dot2*u2; + } } } sum2 *= sum2; @@ -3340,7 +3340,7 @@ double PPPMDisp::compute_qopt_6_ad() void PPPMDisp::set_grid_6() { // calculate csum - + if (!csumflag) calc_csum(); if (!gewaldflag_6) set_init_g6(); if (!gridflag_6) set_n_pppm_6(); @@ -3416,9 +3416,9 @@ void PPPMDisp::calc_csum() MPI_Allreduce(neach,neach_all,ntypes+1,MPI_INT,MPI_SUM,world); // copmute csumij and csumi - + double d1, d2; - + if 
(function[1]) { for (i=1; i<=ntypes; i++) { for (j=1; j<=ntypes; j++) { @@ -3430,7 +3430,7 @@ void PPPMDisp::calc_csum() } } } - + if (function[2]) { for (i=1; i<=ntypes; i++) { for (j=1; j<=ntypes; j++) { @@ -3444,7 +3444,7 @@ void PPPMDisp::calc_csum() } } } - + if (function[3]) { for (i=1; i<=ntypes; i++) { for (j=1; j<=ntypes; j++) { @@ -3469,7 +3469,7 @@ void PPPMDisp::calc_csum() void PPPMDisp::adjust_gewald_6() { // use Newton solver to find g_ewald_6 - + double dx; // start loop @@ -3549,13 +3549,13 @@ void PPPMDisp::set_init_g6() // if df_real > 0, repeat divide g_ewald_6 by 2 until df_real < 0 // else, repeat multiply g_ewald_6 by 2 until df_real > 0 // perform bisection for the last two values of - + double df_real; double g_ewald_old; double gmin, gmax; // check if there is a user defined accuracy - + double acc_rspace = accuracy; if (accuracy_real_6 > 0) acc_rspace = accuracy_real_6; @@ -3619,7 +3619,7 @@ void PPPMDisp::set_n_pppm_6() if (accuracy_kspace_6 > 0.0) acc_kspace = accuracy_kspace_6; // initial value for the grid spacing - + h = h_x = h_y = h_z = 4.0/g_ewald_6; // decrease grid spacing until required precision is obtained @@ -3661,7 +3661,7 @@ void PPPMDisp::set_n_pppm_6() // break loop if the accuracy has been reached or // too many loops have been performed - + if (df_kspace <= acc_kspace) break; if (count > 500) error->all(FLERR, "Could not compute grid size for Dispersion"); h *= 0.95; @@ -3771,10 +3771,10 @@ void PPPMDisp::compute_gf() ------------------------------------------------------------------------- */ void PPPMDisp::compute_sf_precoeff(int nxp, int nyp, int nzp, int ord, - int nxlo_ft, int nylo_ft, int nzlo_ft, - int nxhi_ft, int nyhi_ft, int nzhi_ft, - double *sf_pre1, double *sf_pre2, double *sf_pre3, - double *sf_pre4, double *sf_pre5, double *sf_pre6) + int nxlo_ft, int nylo_ft, int nzlo_ft, + int nxhi_ft, int nyhi_ft, int nzhi_ft, + double *sf_pre1, double *sf_pre2, double *sf_pre3, + double *sf_pre4, double *sf_pre5, double *sf_pre6) { int i,k,l,m,n; double *prd; @@ -4523,22 +4523,22 @@ void PPPMDisp::make_rho_none() ------------------------------------------------------------------------- */ void PPPMDisp::poisson_ik(FFT_SCALAR* wk1, FFT_SCALAR* wk2, - FFT_SCALAR* dfft, LAMMPS_NS::FFT3d* ft1, - LAMMPS_NS::FFT3d* ft2, - int nx_p, int ny_p, int nz_p, int nft, - int nxlo_ft, int nylo_ft, int nzlo_ft, - int nxhi_ft, int nyhi_ft, int nzhi_ft, - int nxlo_i, int nylo_i, int nzlo_i, - int nxhi_i, int nyhi_i, int nzhi_i, - double& egy, double* gfn, - double* kx, double* ky, double* kz, - double* kx2, double* ky2, double* kz2, - FFT_SCALAR*** vx_brick, FFT_SCALAR*** vy_brick, - FFT_SCALAR*** vz_brick, - double* vir, double** vcoeff, double** vcoeff2, - FFT_SCALAR*** u_pa, FFT_SCALAR*** v0_pa, - FFT_SCALAR*** v1_pa, FFT_SCALAR*** v2_pa, - FFT_SCALAR*** v3_pa, FFT_SCALAR*** v4_pa, FFT_SCALAR*** v5_pa) + FFT_SCALAR* dfft, LAMMPS_NS::FFT3d* ft1, + LAMMPS_NS::FFT3d* ft2, + int nx_p, int ny_p, int nz_p, int nft, + int nxlo_ft, int nylo_ft, int nzlo_ft, + int nxhi_ft, int nyhi_ft, int nzhi_ft, + int nxlo_i, int nylo_i, int nzlo_i, + int nxhi_i, int nyhi_i, int nzhi_i, + double& egy, double* gfn, + double* kx, double* ky, double* kz, + double* kx2, double* ky2, double* kz2, + FFT_SCALAR*** vx_brick, FFT_SCALAR*** vy_brick, + FFT_SCALAR*** vz_brick, + double* vir, double** vcoeff, double** vcoeff2, + FFT_SCALAR*** u_pa, FFT_SCALAR*** v0_pa, + FFT_SCALAR*** v1_pa, FFT_SCALAR*** v2_pa, + FFT_SCALAR*** v3_pa, FFT_SCALAR*** v4_pa, FFT_SCALAR*** v5_pa) { int i,j,k,n; 
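/* ----------------------------------------------------------------------
   sketch of the bracket-then-bisect pattern described in set_init_g6()
   above: scale g by factors of 2 until the error estimate changes sign,
   then bisect the bracketing interval.  err() is a hypothetical stand-in
   for (estimated real-space error - target accuracy) and is assumed to
   change sign somewhere on (0,inf).
------------------------------------------------------------------------- */

#include <functional>

double bracket_and_bisect(const std::function<double(double)> &err, double g)
{
  double glo, ghi;
  if (err(g) > 0.0) {                  // error too large: shrink g
    while (err(g) > 0.0) g *= 0.5;
    glo = g; ghi = 2.0*g;
  } else {                             // more accurate than needed: grow g
    while (err(g) < 0.0) g *= 2.0;
    glo = 0.5*g; ghi = g;
  }
  for (int i = 0; i < 60; i++) {       // keep err(glo) < 0 < err(ghi)
    double gmid = 0.5*(glo+ghi);
    if (err(gmid) > 0.0) ghi = gmid;
    else glo = gmid;
  }
  return 0.5*(glo+ghi);
}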
@@ -4669,18 +4669,18 @@ void PPPMDisp::poisson_ik(FFT_SCALAR* wk1, FFT_SCALAR* wk2, ------------------------------------------------------------------------- */ void PPPMDisp::poisson_ad(FFT_SCALAR* wk1, FFT_SCALAR* wk2, - FFT_SCALAR* dfft, LAMMPS_NS::FFT3d* ft1,LAMMPS_NS::FFT3d* ft2, - int nx_p, int ny_p, int nz_p, int nft, - int nxlo_ft, int nylo_ft, int nzlo_ft, - int nxhi_ft, int nyhi_ft, int nzhi_ft, - int nxlo_i, int nylo_i, int nzlo_i, - int nxhi_i, int nyhi_i, int nzhi_i, - double& egy, double* gfn, - double* vir, double** vcoeff, double** vcoeff2, - FFT_SCALAR*** u_pa, FFT_SCALAR*** v0_pa, - FFT_SCALAR*** v1_pa, FFT_SCALAR*** v2_pa, - FFT_SCALAR*** v3_pa, FFT_SCALAR*** v4_pa, - FFT_SCALAR*** v5_pa) + FFT_SCALAR* dfft, LAMMPS_NS::FFT3d* ft1,LAMMPS_NS::FFT3d* ft2, + int nx_p, int ny_p, int nz_p, int nft, + int nxlo_ft, int nylo_ft, int nzlo_ft, + int nxhi_ft, int nyhi_ft, int nzhi_ft, + int nxlo_i, int nylo_i, int nzlo_i, + int nxhi_i, int nyhi_i, int nzhi_i, + double& egy, double* gfn, + double* vir, double** vcoeff, double** vcoeff2, + FFT_SCALAR*** u_pa, FFT_SCALAR*** v0_pa, + FFT_SCALAR*** v1_pa, FFT_SCALAR*** v2_pa, + FFT_SCALAR*** v3_pa, FFT_SCALAR*** v4_pa, + FFT_SCALAR*** v5_pa) { int i,j,k,n; double eng; @@ -4760,13 +4760,13 @@ void PPPMDisp::poisson_ad(FFT_SCALAR* wk1, FFT_SCALAR* wk2, ------------------------------------------------------------------------- */ void PPPMDisp:: poisson_peratom(FFT_SCALAR* wk1, FFT_SCALAR* wk2, LAMMPS_NS::FFT3d* ft2, - double** vcoeff, double** vcoeff2, int nft, - int nxlo_i, int nylo_i, int nzlo_i, - int nxhi_i, int nyhi_i, int nzhi_i, - FFT_SCALAR*** v0_pa, FFT_SCALAR*** v1_pa, - FFT_SCALAR*** v2_pa, - FFT_SCALAR*** v3_pa, FFT_SCALAR*** v4_pa, - FFT_SCALAR*** v5_pa) + double** vcoeff, double** vcoeff2, int nft, + int nxlo_i, int nylo_i, int nzlo_i, + int nxhi_i, int nyhi_i, int nzhi_i, + FFT_SCALAR*** v0_pa, FFT_SCALAR*** v1_pa, + FFT_SCALAR*** v2_pa, + FFT_SCALAR*** v3_pa, FFT_SCALAR*** v4_pa, + FFT_SCALAR*** v5_pa) { //v0 & v1 term int n, i, j, k; @@ -4834,14 +4834,14 @@ void PPPMDisp:: poisson_peratom(FFT_SCALAR* wk1, FFT_SCALAR* wk2, LAMMPS_NS::FFT void PPPMDisp:: poisson_2s_ik(FFT_SCALAR* dfft_1, FFT_SCALAR* dfft_2, - FFT_SCALAR*** vxbrick_1, FFT_SCALAR*** vybrick_1, FFT_SCALAR*** vzbrick_1, - FFT_SCALAR*** vxbrick_2, FFT_SCALAR*** vybrick_2, FFT_SCALAR*** vzbrick_2, - FFT_SCALAR*** u_pa_1, FFT_SCALAR*** v0_pa_1, - FFT_SCALAR*** v1_pa_1, FFT_SCALAR*** v2_pa_1, - FFT_SCALAR*** v3_pa_1, FFT_SCALAR*** v4_pa_1, FFT_SCALAR*** v5_pa_1, - FFT_SCALAR*** u_pa_2, FFT_SCALAR*** v0_pa_2, - FFT_SCALAR*** v1_pa_2, FFT_SCALAR*** v2_pa_2, - FFT_SCALAR*** v3_pa_2, FFT_SCALAR*** v4_pa_2, FFT_SCALAR*** v5_pa_2) + FFT_SCALAR*** vxbrick_1, FFT_SCALAR*** vybrick_1, FFT_SCALAR*** vzbrick_1, + FFT_SCALAR*** vxbrick_2, FFT_SCALAR*** vybrick_2, FFT_SCALAR*** vzbrick_2, + FFT_SCALAR*** u_pa_1, FFT_SCALAR*** v0_pa_1, + FFT_SCALAR*** v1_pa_1, FFT_SCALAR*** v2_pa_1, + FFT_SCALAR*** v3_pa_1, FFT_SCALAR*** v4_pa_1, FFT_SCALAR*** v5_pa_1, + FFT_SCALAR*** u_pa_2, FFT_SCALAR*** v0_pa_2, + FFT_SCALAR*** v1_pa_2, FFT_SCALAR*** v2_pa_2, + FFT_SCALAR*** v3_pa_2, FFT_SCALAR*** v4_pa_2, FFT_SCALAR*** v5_pa_2) { int i,j,k,n; @@ -4881,7 +4881,7 @@ poisson_2s_ik(FFT_SCALAR* dfft_1, FFT_SCALAR* dfft_2, n = 0; for (i = 0; i < nfft_6; i++) { eng = 2 * s2 * greensfn_6[i] * - (work1_6[n]*work2_6[n+1] - work1_6[n+1]*work2_6[n]); + (work1_6[n]*work2_6[n+1] - work1_6[n+1]*work2_6[n]); for (j = 0; j < 6; j++) virial_6[j] += eng*vg_6[i][j]; if (eflag_global)energy_6 += eng; n += 2; @@ 
-4894,9 +4894,9 @@ poisson_2s_ik(FFT_SCALAR* dfft_1, FFT_SCALAR* dfft_2, n += 2; } } - + // unify the two transformed vectors for efficient calculations later - + for ( i = 0; i < 2*nfft_6; i++) { work1_6[i] += work2_6[i]; } @@ -4997,9 +4997,9 @@ poisson_2s_ik(FFT_SCALAR* dfft_1, FFT_SCALAR* dfft_2, } if (vflag_atom) poisson_2s_peratom(v0_pa_1, v1_pa_1, v2_pa_1, - v3_pa_1, v4_pa_1, v5_pa_1, + v3_pa_1, v4_pa_1, v5_pa_1, v0_pa_2, v1_pa_2, v2_pa_2, - v3_pa_2, v4_pa_2, v5_pa_2); + v3_pa_2, v4_pa_2, v5_pa_2); } @@ -5010,13 +5010,13 @@ poisson_2s_ik(FFT_SCALAR* dfft_1, FFT_SCALAR* dfft_2, void PPPMDisp:: poisson_none_ik(int n1, int n2,FFT_SCALAR* dfft_1, FFT_SCALAR* dfft_2, - FFT_SCALAR*** vxbrick_1, FFT_SCALAR*** vybrick_1, - FFT_SCALAR*** vzbrick_1, - FFT_SCALAR*** vxbrick_2, FFT_SCALAR*** vybrick_2, - FFT_SCALAR*** vzbrick_2, - FFT_SCALAR**** u_pa, FFT_SCALAR**** v0_pa, - FFT_SCALAR**** v1_pa, FFT_SCALAR**** v2_pa, - FFT_SCALAR**** v3_pa, FFT_SCALAR**** v4_pa, FFT_SCALAR**** v5_pa) + FFT_SCALAR*** vxbrick_1, FFT_SCALAR*** vybrick_1, + FFT_SCALAR*** vzbrick_1, + FFT_SCALAR*** vxbrick_2, FFT_SCALAR*** vybrick_2, + FFT_SCALAR*** vzbrick_2, + FFT_SCALAR**** u_pa, FFT_SCALAR**** v0_pa, + FFT_SCALAR**** v1_pa, FFT_SCALAR**** v2_pa, + FFT_SCALAR**** v3_pa, FFT_SCALAR**** v4_pa, FFT_SCALAR**** v5_pa) { int i,j,k,n; double eng; @@ -5025,7 +5025,7 @@ poisson_none_ik(int n1, int n2,FFT_SCALAR* dfft_1, FFT_SCALAR* dfft_2, // transform charge/dispersion density (r -> k) // only one tansform required when energies and pressures not needed - + if (eflag_global + vflag_global == 0) { n = 0; for (i = 0; i < nfft_6; i++) { @@ -5037,7 +5037,7 @@ poisson_none_ik(int n1, int n2,FFT_SCALAR* dfft_1, FFT_SCALAR* dfft_2, } // two transforms are required when energies and pressures are calculated - + else { n = 0; for (i = 0; i < nfft_6; i++) { @@ -5057,8 +5057,8 @@ poisson_none_ik(int n1, int n2,FFT_SCALAR* dfft_1, FFT_SCALAR* dfft_2, n = 0; for (i = 0; i < nfft_6; i++) { eng = s2 * greensfn_6[i] * - (B[n1]*(work1_6[n]*work1_6[n] + work1_6[n+1]*work1_6[n+1]) + - B[n2]*(work2_6[n]*work2_6[n] + work2_6[n+1]*work2_6[n+1])); + (B[n1]*(work1_6[n]*work1_6[n] + work1_6[n+1]*work1_6[n+1]) + + B[n2]*(work2_6[n]*work2_6[n] + work2_6[n+1]*work2_6[n+1])); for (j = 0; j < 6; j++) virial_6[j] += eng*vg_6[i][j]; if (eflag_global)energy_6 += eng; n += 2; @@ -5068,14 +5068,14 @@ poisson_none_ik(int n1, int n2,FFT_SCALAR* dfft_1, FFT_SCALAR* dfft_2, for (i = 0; i < nfft_6; i++) { energy_6 += s2 * greensfn_6[i] * - (B[n1]*(work1_6[n]*work1_6[n] + work1_6[n+1]*work1_6[n+1]) + - B[n2]*(work2_6[n]*work2_6[n] + work2_6[n+1]*work2_6[n+1])); + (B[n1]*(work1_6[n]*work1_6[n] + work1_6[n+1]*work1_6[n+1]) + + B[n2]*(work2_6[n]*work2_6[n] + work2_6[n+1]*work2_6[n+1])); n += 2; } } - + // unify the two transformed vectors for efficient calculations later - + for ( i = 0; i < 2*nfft_6; i++) { work1_6[i] += work2_6[i]; } @@ -5187,12 +5187,12 @@ poisson_none_ik(int n1, int n2,FFT_SCALAR* dfft_1, FFT_SCALAR* dfft_2, void PPPMDisp:: poisson_2s_ad(FFT_SCALAR* dfft_1, FFT_SCALAR* dfft_2, - FFT_SCALAR*** u_pa_1, FFT_SCALAR*** v0_pa_1, - FFT_SCALAR*** v1_pa_1, FFT_SCALAR*** v2_pa_1, - FFT_SCALAR*** v3_pa_1, FFT_SCALAR*** v4_pa_1, FFT_SCALAR*** v5_pa_1, - FFT_SCALAR*** u_pa_2, FFT_SCALAR*** v0_pa_2, - FFT_SCALAR*** v1_pa_2, FFT_SCALAR*** v2_pa_2, - FFT_SCALAR*** v3_pa_2, FFT_SCALAR*** v4_pa_2, FFT_SCALAR*** v5_pa_2) + FFT_SCALAR*** u_pa_1, FFT_SCALAR*** v0_pa_1, + FFT_SCALAR*** v1_pa_1, FFT_SCALAR*** v2_pa_1, + FFT_SCALAR*** v3_pa_1, FFT_SCALAR*** v4_pa_1, 
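/* ----------------------------------------------------------------------
   sketch of the packing idea behind "only one transform required when
   energies and pressures not needed" above: two real fields can share a
   single complex transform by storing one in the real part and one in
   the imaginary part, then splitting the spectrum via conjugate symmetry.
   Naive O(N^2) DFT and hypothetical names, for illustration only.
------------------------------------------------------------------------- */

#include <cmath>
#include <complex>
#include <vector>

using cplx = std::complex<double>;

static std::vector<cplx> dft(const std::vector<cplx> &x)
{
  const int n = (int) x.size();
  const double pi = std::acos(-1.0);
  std::vector<cplx> X(n);
  for (int k = 0; k < n; k++)
    for (int j = 0; j < n; j++)
      X[k] += x[j] * std::polar(1.0, -2.0*pi*k*j/n);
  return X;
}

// pack a -> real part, b -> imaginary part, transform once, then recover
// A = DFT(a) and B = DFT(b) from the conjugate symmetry of real inputs
static void two_for_one(const std::vector<double> &a,
                        const std::vector<double> &b,
                        std::vector<cplx> &A, std::vector<cplx> &B)
{
  const int n = (int) a.size();
  std::vector<cplx> c(n);
  for (int j = 0; j < n; j++) c[j] = cplx(a[j],b[j]);
  std::vector<cplx> C = dft(c);                    // one complex transform
  A.resize(n); B.resize(n);
  for (int k = 0; k < n; k++) {
    cplx Cm = std::conj(C[(n-k) % n]);
    A[k] = 0.5*(C[k] + Cm);                        // spectrum of field a
    B[k] = cplx(0.0,-0.5)*(C[k] - Cm);             // spectrum of field b
  }
}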
FFT_SCALAR*** v5_pa_1, + FFT_SCALAR*** u_pa_2, FFT_SCALAR*** v0_pa_2, + FFT_SCALAR*** v1_pa_2, FFT_SCALAR*** v2_pa_2, + FFT_SCALAR*** v3_pa_2, FFT_SCALAR*** v4_pa_2, FFT_SCALAR*** v5_pa_2) { int i,j,k,n; double eng; @@ -5211,9 +5211,9 @@ poisson_2s_ad(FFT_SCALAR* dfft_1, FFT_SCALAR* dfft_2, fft1_6->compute(work1_6,work1_6,1); } - + // two transforms are required when energies and pressures are calculated - + else { n = 0; for (i = 0; i < nfft_6; i++) { @@ -5232,7 +5232,7 @@ poisson_2s_ad(FFT_SCALAR* dfft_1, FFT_SCALAR* dfft_2, n = 0; for (i = 0; i < nfft_6; i++) { eng = 2 * s2 * greensfn_6[i] * - (work1_6[n]*work2_6[n+1] - work1_6[n+1]*work2_6[n]); + (work1_6[n]*work2_6[n+1] - work1_6[n+1]*work2_6[n]); for (j = 0; j < 6; j++) virial_6[j] += eng*vg_6[i][j]; if (eflag_global)energy_6 += eng; n += 2; @@ -5277,9 +5277,9 @@ poisson_2s_ad(FFT_SCALAR* dfft_1, FFT_SCALAR* dfft_2, } if (vflag_atom) poisson_2s_peratom(v0_pa_1, v1_pa_1, v2_pa_1, - v3_pa_1, v4_pa_1, v5_pa_1, + v3_pa_1, v4_pa_1, v5_pa_1, v0_pa_2, v1_pa_2, v2_pa_2, - v3_pa_2, v4_pa_2, v5_pa_2); + v3_pa_2, v4_pa_2, v5_pa_2); } /* ---------------------------------------------------------------------- @@ -5289,9 +5289,9 @@ poisson_2s_ad(FFT_SCALAR* dfft_1, FFT_SCALAR* dfft_2, void PPPMDisp:: poisson_none_ad(int n1, int n2, FFT_SCALAR* dfft_1, FFT_SCALAR* dfft_2, - FFT_SCALAR*** u_pa_1, FFT_SCALAR*** u_pa_2, - FFT_SCALAR**** v0_pa, FFT_SCALAR**** v1_pa, FFT_SCALAR**** v2_pa, - FFT_SCALAR**** v3_pa, FFT_SCALAR**** v4_pa, FFT_SCALAR**** v5_pa) + FFT_SCALAR*** u_pa_1, FFT_SCALAR*** u_pa_2, + FFT_SCALAR**** v0_pa, FFT_SCALAR**** v1_pa, FFT_SCALAR**** v2_pa, + FFT_SCALAR**** v3_pa, FFT_SCALAR**** v4_pa, FFT_SCALAR**** v5_pa) { int i,j,k,n; double eng; @@ -5310,9 +5310,9 @@ poisson_none_ad(int n1, int n2, FFT_SCALAR* dfft_1, FFT_SCALAR* dfft_2, fft1_6->compute(work1_6,work1_6,1); } - + // two transforms are required when energies and pressures are calculated - + else { n = 0; for (i = 0; i < nfft_6; i++) { @@ -5331,8 +5331,8 @@ poisson_none_ad(int n1, int n2, FFT_SCALAR* dfft_1, FFT_SCALAR* dfft_2, n = 0; for (i = 0; i < nfft_6; i++) { eng = s2 * greensfn_6[i] * - (B[n1]*(work1_6[n]*work1_6[n] + work1_6[n+1]*work1_6[n+1]) + - B[n2]*(work2_6[n]*work2_6[n] + work2_6[n+1]*work2_6[n+1])); + (B[n1]*(work1_6[n]*work1_6[n] + work1_6[n+1]*work1_6[n+1]) + + B[n2]*(work2_6[n]*work2_6[n] + work2_6[n+1]*work2_6[n+1])); for (j = 0; j < 6; j++) virial_6[j] += eng*vg_6[i][j]; if (eflag_global)energy_6 += eng; n += 2; @@ -5342,14 +5342,14 @@ poisson_none_ad(int n1, int n2, FFT_SCALAR* dfft_1, FFT_SCALAR* dfft_2, for (i = 0; i < nfft_6; i++) { energy_6 += s2 * greensfn_6[i] * - (B[n1]*(work1_6[n]*work1_6[n] + work1_6[n+1]*work1_6[n+1]) + - B[n2]*(work2_6[n]*work2_6[n] + work2_6[n+1]*work2_6[n+1])); + (B[n1]*(work1_6[n]*work1_6[n] + work1_6[n+1]*work1_6[n+1]) + + B[n2]*(work2_6[n]*work2_6[n] + work2_6[n+1]*work2_6[n+1])); n += 2; } } - + // unify the two transformed vectors for efficient calculations later - + for ( i = 0; i < 2*nfft_6; i++) { work1_6[i] += work2_6[i]; } @@ -5391,12 +5391,12 @@ poisson_none_ad(int n1, int n2, FFT_SCALAR* dfft_1, FFT_SCALAR* dfft_2, void PPPMDisp:: poisson_2s_peratom(FFT_SCALAR*** v0_pa_1, FFT_SCALAR*** v1_pa_1, FFT_SCALAR*** v2_pa_1, - FFT_SCALAR*** v3_pa_1, FFT_SCALAR*** v4_pa_1, FFT_SCALAR*** v5_pa_1, - FFT_SCALAR*** v0_pa_2, FFT_SCALAR*** v1_pa_2, FFT_SCALAR*** v2_pa_2, - FFT_SCALAR*** v3_pa_2, FFT_SCALAR*** v4_pa_2, FFT_SCALAR*** v5_pa_2) + FFT_SCALAR*** v3_pa_1, FFT_SCALAR*** v4_pa_1, FFT_SCALAR*** v5_pa_1, + FFT_SCALAR*** 
v0_pa_2, FFT_SCALAR*** v1_pa_2, FFT_SCALAR*** v2_pa_2, + FFT_SCALAR*** v3_pa_2, FFT_SCALAR*** v4_pa_2, FFT_SCALAR*** v5_pa_2) { //Compute first virial term v0 - + int n, i, j, k; n = 0; @@ -5518,14 +5518,14 @@ poisson_2s_peratom(FFT_SCALAR*** v0_pa_1, FFT_SCALAR*** v1_pa_1, FFT_SCALAR*** v void PPPMDisp:: poisson_none_peratom(int n1, int n2, - FFT_SCALAR*** v0_pa_1, FFT_SCALAR*** v1_pa_1, - FFT_SCALAR*** v2_pa_1, - FFT_SCALAR*** v3_pa_1, FFT_SCALAR*** v4_pa_1, - FFT_SCALAR*** v5_pa_1, - FFT_SCALAR*** v0_pa_2, FFT_SCALAR*** v1_pa_2, - FFT_SCALAR*** v2_pa_2, - FFT_SCALAR*** v3_pa_2, FFT_SCALAR*** v4_pa_2, - FFT_SCALAR*** v5_pa_2) + FFT_SCALAR*** v0_pa_1, FFT_SCALAR*** v1_pa_1, + FFT_SCALAR*** v2_pa_1, + FFT_SCALAR*** v3_pa_1, FFT_SCALAR*** v4_pa_1, + FFT_SCALAR*** v5_pa_1, + FFT_SCALAR*** v0_pa_2, FFT_SCALAR*** v1_pa_2, + FFT_SCALAR*** v2_pa_2, + FFT_SCALAR*** v3_pa_2, FFT_SCALAR*** v4_pa_2, + FFT_SCALAR*** v5_pa_2) { //Compute first virial term v0 int n, i, j, k; @@ -6339,7 +6339,7 @@ void PPPMDisp::fieldforce_a_ad() sf *= 4*lj0*lj6 + 4*lj1*lj5 + 4*lj2*lj4 + 2*lj3*lj3; if (slabflag != 2) f[i][2] += lj0*ekz0 + lj1*ekz1 + - lj2*ekz2 + lj3*ekz3 + lj4*ekz4 + lj5*ekz5 + lj6*ekz6 - sf; + lj2*ekz2 + lj3*ekz3 + lj4*ekz4 + lj5*ekz5 + lj6*ekz6 - sf; } } @@ -8248,7 +8248,7 @@ int PPPMDisp::timing_3d(int n, double &time3d) double PPPMDisp::memory_usage() { double bytes = nmax*3 * sizeof(double); - + int mixing = 1; int diff = 3; //depends on differentiation int per = 7; //depends on per atom calculations @@ -8273,7 +8273,7 @@ double PPPMDisp::memory_usage() int nbrick = (nxhi_out_6-nxlo_out_6+1) * (nyhi_out_6-nylo_out_6+1) * (nzhi_out_6-nzlo_out_6+1); // density_brick + vd_brick + per atom bricks - bytes += (1 + diff + per ) * nbrick * sizeof(FFT_SCALAR) * mixing; + bytes += (1 + diff + per ) * nbrick * sizeof(FFT_SCALAR) * mixing; bytes += 6 * nfft_both_6 * sizeof(double); // vg bytes += nfft_both_6 * sizeof(double); // greensfn // density_FFT, work1, work2 diff --git a/src/KSPACE/pppm_stagger.cpp b/src/KSPACE/pppm_stagger.cpp index 837644f0e3..f71529ae83 100644 --- a/src/KSPACE/pppm_stagger.cpp +++ b/src/KSPACE/pppm_stagger.cpp @@ -157,7 +157,7 @@ void PPPMStagger::compute(int eflag, int vflag) // remap from 3d decomposition to FFT decomposition gc->reverse_comm_kspace(this,1,sizeof(FFT_SCALAR),REVERSE_RHO, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc_buf1,gc_buf2,MPI_FFT_SCALAR); brick2fft(); // compute potential gradient on my FFT grid and @@ -172,20 +172,20 @@ void PPPMStagger::compute(int eflag, int vflag) if (differentiation_flag == 1) gc->forward_comm_kspace(this,1,sizeof(FFT_SCALAR),FORWARD_AD, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc_buf1,gc_buf2,MPI_FFT_SCALAR); else gc->forward_comm_kspace(this,3,sizeof(FFT_SCALAR),FORWARD_IK, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc_buf1,gc_buf2,MPI_FFT_SCALAR); // extra per-atom energy/virial communication if (evflag_atom) { if (differentiation_flag == 1 && vflag_atom) - gc->forward_comm_kspace(this,6,sizeof(FFT_SCALAR),FORWARD_AD_PERATOM, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc->forward_comm_kspace(this,6,sizeof(FFT_SCALAR),FORWARD_AD_PERATOM, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); else if (differentiation_flag == 0) - gc->forward_comm_kspace(this,7,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc->forward_comm_kspace(this,7,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); } // calculate the force on my particles @@ -299,10 +299,10 @@ double PPPMStagger::compute_qopt() // loop over entire FFT grid // each proc 
calculates contributions from every Pth grid point - + bigint ngridtotal = (bigint) nx_pppm * ny_pppm * nz_pppm; int nxy_pppm = nx_pppm * ny_pppm; - + double qopt = 0.0; for (bigint i = me; i < ngridtotal; i += nprocs) { @@ -336,31 +336,31 @@ double PPPMStagger::compute_qopt() wx = powsinxx(argx,twoorder); for (ny = -nby; ny <= nby; ny++) { - qy = unitky*(lper+ny_pppm*ny); - sy = exp(-0.25*square(qy/g_ewald)); - argy = 0.5*qy*yprd/ny_pppm; - wy = powsinxx(argy,twoorder); + qy = unitky*(lper+ny_pppm*ny); + sy = exp(-0.25*square(qy/g_ewald)); + argy = 0.5*qy*yprd/ny_pppm; + wy = powsinxx(argy,twoorder); - for (nz = -nbz; nz <= nbz; nz++) { - qz = unitkz*(mper+nz_pppm*nz); - sz = exp(-0.25*square(qz/g_ewald)); - argz = 0.5*qz*zprd_slab/nz_pppm; - wz = powsinxx(argz,twoorder); + for (nz = -nbz; nz <= nbz; nz++) { + qz = unitkz*(mper+nz_pppm*nz); + sz = exp(-0.25*square(qz/g_ewald)); + argz = 0.5*qz*zprd_slab/nz_pppm; + wz = powsinxx(argz,twoorder); - dot1 = unitkx*kper*qx + unitky*lper*qy + unitkz*mper*qz; - dot2 = qx*qx + qy*qy + qz*qz; - u1 = sx*sy*sz; - u2 = wx*wy*wz; - u3 = numerator*u1*u2*dot1; - sum1 += u1*u1*MY_4PI*MY_4PI/dot2; - sum2 += u3*u3/dot2; - } + dot1 = unitkx*kper*qx + unitky*lper*qy + unitkz*mper*qz; + dot2 = qx*qx + qy*qy + qz*qz; + u1 = sx*sy*sz; + u2 = wx*wy*wz; + u3 = numerator*u1*u2*dot1; + sum1 += u1*u1*MY_4PI*MY_4PI/dot2; + sum2 += u3*u3/dot2; + } } } - + qopt += sum1 - sum2/denominator; } - + double qopt_all; MPI_Allreduce(&qopt,&qopt_all,1,MPI_DOUBLE,MPI_SUM,world); return qopt_all; @@ -395,10 +395,10 @@ double PPPMStagger::compute_qopt_ad() // loop over entire FFT grid // each proc calculates contributions from every Pth grid point - + bigint ngridtotal = (bigint) nx_pppm * ny_pppm * nz_pppm; int nxy_pppm = nx_pppm * ny_pppm; - + double qopt = 0.0; for (bigint i = me; i < ngridtotal; i += nprocs) { @@ -414,7 +414,7 @@ double PPPMStagger::compute_qopt_ad() if (sqk == 0.0) continue; sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = 0.0; - + for (nx = -nbx; nx <= nbx; nx++) { qx = unitkx*(kper+nx_pppm*nx); sx = exp(-0.25*square(qx/g_ewald)); @@ -422,30 +422,30 @@ double PPPMStagger::compute_qopt_ad() wx = powsinxx(argx,twoorder); for (ny = -nby; ny <= nby; ny++) { - qy = unitky*(lper+ny_pppm*ny); - sy = exp(-0.25*square(qy/g_ewald)); - argy = 0.5*qy*yprd/ny_pppm; - wy = powsinxx(argy,twoorder); - - for (nz = -nbz; nz <= nbz; nz++) { - qz = unitkz*(mper+nz_pppm*nz); - sz = exp(-0.25*square(qz/g_ewald)); - argz = 0.5*qz*zprd_slab/nz_pppm; - wz = powsinxx(argz,twoorder); - - dot2 = qx*qx + qy*qy + qz*qz; - u1 = sx*sy*sz; - u2 = wx*wy*wz; - sum1 += u1*u1/dot2*MY_4PI*MY_4PI; - sum2 += u1*u1*u2*u2*MY_4PI*MY_4PI; - sum3 += u2; - sum4 += dot2*u2; - sum5 += u2*powint(-1.0,nx+ny+nz); - sum6 += dot2*u2*powint(-1.0,nx+ny+nz); - } + qy = unitky*(lper+ny_pppm*ny); + sy = exp(-0.25*square(qy/g_ewald)); + argy = 0.5*qy*yprd/ny_pppm; + wy = powsinxx(argy,twoorder); + + for (nz = -nbz; nz <= nbz; nz++) { + qz = unitkz*(mper+nz_pppm*nz); + sz = exp(-0.25*square(qz/g_ewald)); + argz = 0.5*qz*zprd_slab/nz_pppm; + wz = powsinxx(argz,twoorder); + + dot2 = qx*qx + qy*qy + qz*qz; + u1 = sx*sy*sz; + u2 = wx*wy*wz; + sum1 += u1*u1/dot2*MY_4PI*MY_4PI; + sum2 += u1*u1*u2*u2*MY_4PI*MY_4PI; + sum3 += u2; + sum4 += dot2*u2; + sum5 += u2*powint(-1.0,nx+ny+nz); + sum6 += dot2*u2*powint(-1.0,nx+ny+nz); + } } } - + qopt += sum1 - sum2/(0.5*(sum3*sum4 + sum5*sum6)); } diff --git a/src/MISC/fix_gld.cpp b/src/MISC/fix_gld.cpp index ab601ae8cb..0e4c61813b 100644 --- a/src/MISC/fix_gld.cpp +++ b/src/MISC/fix_gld.cpp 
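/* ----------------------------------------------------------------------
   minimal MPI sketch of the work distribution used by the compute_qopt*
   loops above: each proc handles every Pth global grid point (a cyclic
   stride of nprocs starting at its rank) and the partial sums are then
   combined with MPI_Allreduce.  f() is a hypothetical per-point term.
------------------------------------------------------------------------- */

#include <mpi.h>
#include <cstdint>

static double f(std::int64_t i) { return 1.0/(1.0 + (double) i); }  // placeholder

double qopt_sum(std::int64_t ngridtotal, MPI_Comm comm)
{
  int me,nprocs;
  MPI_Comm_rank(comm,&me);
  MPI_Comm_size(comm,&nprocs);

  double qopt = 0.0;
  for (std::int64_t i = me; i < ngridtotal; i += nprocs)  // every Pth point
    qopt += f(i);

  double qopt_all;
  MPI_Allreduce(&qopt,&qopt_all,1,MPI_DOUBLE,MPI_SUM,comm);
  return qopt_all;
}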
@@ -568,7 +568,7 @@ void FixGLD::unpack_restart(int nlocal, int nth) // skip to the nth set of extended variables // unpack the Nth first values this way because other fixes pack them - + int m = 0; for (int i = 0; i< nth; i++) m += static_cast (extra[nlocal][m]); m++; diff --git a/src/MISC/fix_ttm.cpp b/src/MISC/fix_ttm.cpp index 529914ec34..896eb24b7c 100644 --- a/src/MISC/fix_ttm.cpp +++ b/src/MISC/fix_ttm.cpp @@ -674,7 +674,7 @@ void FixTTM::unpack_restart(int nlocal, int nth) // skip to Nth set of extra values // unpack the Nth first values this way because other fixes pack them - + int m = 0; for (int i = 0; i < nth; i++) m += static_cast (extra[nlocal][m]); m++; diff --git a/src/MOLECULE/fix_cmap.cpp b/src/MOLECULE/fix_cmap.cpp index d3b071ebc6..5d95ae73c2 100644 --- a/src/MOLECULE/fix_cmap.cpp +++ b/src/MOLECULE/fix_cmap.cpp @@ -1307,7 +1307,7 @@ void FixCMAP::unpack_restart(int nlocal, int nth) // skip to Nth set of extra values // unpack the Nth first values this way because other fixes pack them - + int n = 0; for (int i = 0; i < nth; i++) n += static_cast (extra[nlocal][n]); diff --git a/src/USER-INTEL/pppm_disp_intel.cpp b/src/USER-INTEL/pppm_disp_intel.cpp index 97e5c57d6e..3229c462aa 100644 --- a/src/USER-INTEL/pppm_disp_intel.cpp +++ b/src/USER-INTEL/pppm_disp_intel.cpp @@ -173,9 +173,9 @@ void PPPMDispIntel::compute(int eflag, int vflag) return; } #endif - + int i; - + // convert atoms from box to lamda coords ev_init(eflag,vflag); @@ -292,7 +292,7 @@ void PPPMDispIntel::compute(int eflag, int vflag) } gc->reverse_comm_kspace(this,1,sizeof(FFT_SCALAR),REVERSE_RHO, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc_buf1,gc_buf2,MPI_FFT_SCALAR); brick2fft(nxlo_in, nylo_in, nzlo_in, nxhi_in, nyhi_in, nzhi_in, density_brick, density_fft, work1,remap); @@ -306,7 +306,7 @@ void PPPMDispIntel::compute(int eflag, int vflag) v1_brick, v2_brick, v3_brick, v4_brick, v5_brick); gc->forward_comm_kspace(this,1,sizeof(FFT_SCALAR),FORWARD_AD, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc_buf1,gc_buf2,MPI_FFT_SCALAR); if (fix->precision() == FixIntel::PREC_MODE_MIXED) { fieldforce_c_ad(fix->get_mixed_buffers()); @@ -317,8 +317,8 @@ void PPPMDispIntel::compute(int eflag, int vflag) } if (vflag_atom) - gc->forward_comm_kspace(this,6,sizeof(FFT_SCALAR),FORWARD_AD_PERATOM, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc->forward_comm_kspace(this,6,sizeof(FFT_SCALAR),FORWARD_AD_PERATOM, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); } else { poisson_ik(work1, work2, density_fft, fft1, fft2, @@ -331,7 +331,7 @@ void PPPMDispIntel::compute(int eflag, int vflag) v5_brick); gc->forward_comm_kspace(this,3,sizeof(FFT_SCALAR),FORWARD_IK, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc_buf1,gc_buf2,MPI_FFT_SCALAR); if (fix->precision() == FixIntel::PREC_MODE_MIXED) { fieldforce_c_ik(fix->get_mixed_buffers()); @@ -342,14 +342,14 @@ void PPPMDispIntel::compute(int eflag, int vflag) } if (evflag_atom) - gc->forward_comm_kspace(this,7,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc->forward_comm_kspace(this,7,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); } if (evflag_atom) fieldforce_c_peratom(); } if (function[1]) { - + //perform calculations for geometric mixing if (fix->precision() == FixIntel::PREC_MODE_MIXED) { @@ -376,7 +376,7 @@ void PPPMDispIntel::compute(int eflag, int vflag) } gc6->reverse_comm_kspace(this,1,sizeof(FFT_SCALAR),REVERSE_RHO_G, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc_buf1,gc_buf2,MPI_FFT_SCALAR); brick2fft(nxlo_in_6, nylo_in_6, nzlo_in_6, nxhi_in_6, nyhi_in_6, 
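/* ----------------------------------------------------------------------
   sketch of the buffer layout the unpack_restart() comments above rely
   on: every fix appends one length-prefixed record, where the first
   value of a record is the record length (including itself), so the Nth
   record is reached by hopping from length word to length word.
   Standalone illustration with hypothetical names.
------------------------------------------------------------------------- */

#include <vector>

// return the index of the first payload value of the nth record (0-based)
// in a buffer laid out as [len0, payload0..., len1, payload1..., ...]
int skip_to_nth(const std::vector<double> &extra, int nth)
{
  int m = 0;
  for (int i = 0; i < nth; i++)
    m += static_cast<int>(extra[m]);   // jump over one whole record
  return m + 1;                        // step past the nth record's length
}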
nzhi_in_6, density_brick_g, density_fft_g, work1_6,remap_6); @@ -391,19 +391,19 @@ void PPPMDispIntel::compute(int eflag, int vflag) v2_brick_g, v3_brick_g, v4_brick_g, v5_brick_g); gc6->forward_comm_kspace(this,1,sizeof(FFT_SCALAR),FORWARD_AD_G, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc_buf1,gc_buf2,MPI_FFT_SCALAR); if (fix->precision() == FixIntel::PREC_MODE_MIXED) { - fieldforce_g_ad(fix->get_mixed_buffers()); + fieldforce_g_ad(fix->get_mixed_buffers()); } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { - fieldforce_g_ad(fix->get_double_buffers()); + fieldforce_g_ad(fix->get_double_buffers()); } else { - fieldforce_g_ad(fix->get_single_buffers()); + fieldforce_g_ad(fix->get_single_buffers()); } if (vflag_atom) - gc6->forward_comm_kspace(this,7,sizeof(FFT_SCALAR),FORWARD_AD_PERATOM_G, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc6->forward_comm_kspace(this,7,sizeof(FFT_SCALAR),FORWARD_AD_PERATOM_G, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); } else { poisson_ik(work1_6, work2_6, density_fft_g, fft1_6, fft2_6, @@ -416,19 +416,19 @@ void PPPMDispIntel::compute(int eflag, int vflag) v1_brick_g, v2_brick_g, v3_brick_g, v4_brick_g, v5_brick_g); gc6->forward_comm_kspace(this,3,sizeof(FFT_SCALAR),FORWARD_IK_G, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc_buf1,gc_buf2,MPI_FFT_SCALAR); if (fix->precision() == FixIntel::PREC_MODE_MIXED) { - fieldforce_g_ik(fix->get_mixed_buffers()); + fieldforce_g_ik(fix->get_mixed_buffers()); } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { - fieldforce_g_ik(fix->get_double_buffers()); + fieldforce_g_ik(fix->get_double_buffers()); } else { - fieldforce_g_ik(fix->get_single_buffers()); + fieldforce_g_ik(fix->get_single_buffers()); } if (evflag_atom) - gc6->forward_comm_kspace(this,6,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM_G, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc6->forward_comm_kspace(this,6,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM_G, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); } if (evflag_atom) fieldforce_g_peratom(); @@ -461,7 +461,7 @@ void PPPMDispIntel::compute(int eflag, int vflag) } gc->reverse_comm_kspace(this,7,sizeof(FFT_SCALAR),REVERSE_RHO_A, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc_buf1,gc_buf2,MPI_FFT_SCALAR); brick2fft_a(); @@ -487,19 +487,19 @@ void PPPMDispIntel::compute(int eflag, int vflag) v2_brick_a4, v3_brick_a4, v4_brick_a4, v5_brick_a4); gc6->forward_comm_kspace(this,7,sizeof(FFT_SCALAR),FORWARD_AD_A, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc_buf1,gc_buf2,MPI_FFT_SCALAR); if (fix->precision() == FixIntel::PREC_MODE_MIXED) { - fieldforce_a_ad(fix->get_mixed_buffers()); + fieldforce_a_ad(fix->get_mixed_buffers()); } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { - fieldforce_a_ad(fix->get_double_buffers()); + fieldforce_a_ad(fix->get_double_buffers()); } else { - fieldforce_a_ad(fix->get_single_buffers()); + fieldforce_a_ad(fix->get_single_buffers()); } if (evflag_atom) - gc6->forward_comm_kspace(this,42,sizeof(FFT_SCALAR),FORWARD_AD_PERATOM_A, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc6->forward_comm_kspace(this,42,sizeof(FFT_SCALAR),FORWARD_AD_PERATOM_A, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); } else { poisson_ik(work1_6, work2_6, density_fft_a3, fft1_6, fft2_6, @@ -530,7 +530,7 @@ void PPPMDispIntel::compute(int eflag, int vflag) v3_brick_a4, v4_brick_a4, v5_brick_a4); gc6->forward_comm_kspace(this,18,sizeof(FFT_SCALAR),FORWARD_IK_A, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc_buf1,gc_buf2,MPI_FFT_SCALAR); if (fix->precision() == FixIntel::PREC_MODE_MIXED) { fieldforce_a_ik(fix->get_mixed_buffers()); @@ -541,15 +541,15 @@ void PPPMDispIntel::compute(int 
eflag, int vflag) } if (evflag_atom) - gc6->forward_comm_kspace(this,49,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM_A, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc6->forward_comm_kspace(this,49,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM_A, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); } if (evflag_atom) fieldforce_a_peratom(); } if (function[3]) { - + // perform calculations if no mixing rule applies if (fix->precision() == FixIntel::PREC_MODE_MIXED) { @@ -576,7 +576,7 @@ void PPPMDispIntel::compute(int eflag, int vflag) } gc->reverse_comm_kspace(this,1,sizeof(FFT_SCALAR),REVERSE_RHO_NONE, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc_buf1,gc_buf2,MPI_FFT_SCALAR); brick2fft_none(); @@ -592,19 +592,19 @@ void PPPMDispIntel::compute(int eflag, int vflag) } gc6->forward_comm_kspace(this,1,sizeof(FFT_SCALAR),FORWARD_AD_NONE, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc_buf1,gc_buf2,MPI_FFT_SCALAR); if (fix->precision() == FixIntel::PREC_MODE_MIXED) { - fieldforce_none_ad(fix->get_mixed_buffers()); + fieldforce_none_ad(fix->get_mixed_buffers()); } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { - fieldforce_none_ad(fix->get_double_buffers()); + fieldforce_none_ad(fix->get_double_buffers()); } else { - fieldforce_none_ad(fix->get_single_buffers()); + fieldforce_none_ad(fix->get_single_buffers()); } if (vflag_atom) - gc6->forward_comm_kspace(this,6,sizeof(FFT_SCALAR),FORWARD_AD_PERATOM_NONE, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc6->forward_comm_kspace(this,6,sizeof(FFT_SCALAR),FORWARD_AD_PERATOM_NONE, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); } else { int n = 0; @@ -621,19 +621,19 @@ void PPPMDispIntel::compute(int eflag, int vflag) } gc6->forward_comm_kspace(this,3,sizeof(FFT_SCALAR),FORWARD_IK_NONE, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc_buf1,gc_buf2,MPI_FFT_SCALAR); if (fix->precision() == FixIntel::PREC_MODE_MIXED) { - fieldforce_none_ik(fix->get_mixed_buffers()); + fieldforce_none_ik(fix->get_mixed_buffers()); } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { - fieldforce_none_ik(fix->get_double_buffers()); + fieldforce_none_ik(fix->get_double_buffers()); } else { - fieldforce_none_ik(fix->get_single_buffers()); + fieldforce_none_ik(fix->get_single_buffers()); } if (evflag_atom) - gc6->forward_comm_kspace(this,7,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM_NONE, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc6->forward_comm_kspace(this,7,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM_NONE, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); } if (evflag_atom) fieldforce_none_peratom(); diff --git a/src/USER-MISC/fix_gle.cpp b/src/USER-MISC/fix_gle.cpp index 8459ddf29b..87dbb19496 100644 --- a/src/USER-MISC/fix_gle.cpp +++ b/src/USER-MISC/fix_gle.cpp @@ -843,7 +843,7 @@ void FixGLE::unpack_restart(int nlocal, int nth) // skip to the nth set of extended variables // unpack the Nth first values this way because other fixes pack them - + int m = 0; for (int i = 0; i< nth; i++) m += static_cast (extra[nlocal][m]); m++; diff --git a/src/USER-MISC/fix_srp.cpp b/src/USER-MISC/fix_srp.cpp index 131fd7d27a..e72ea61b01 100644 --- a/src/USER-MISC/fix_srp.cpp +++ b/src/USER-MISC/fix_srp.cpp @@ -572,7 +572,7 @@ void FixSRP::unpack_restart(int nlocal, int nth) // skip to Nth set of extra values // unpack the Nth first values this way because other fixes pack them - + int m = 0; for (int i = 0; i < nth; i++){ m += static_cast (extra[nlocal][m]); diff --git a/src/USER-MISC/pair_meam_spline.cpp b/src/USER-MISC/pair_meam_spline.cpp index 5f35070130..5764b09d8a 100644 --- a/src/USER-MISC/pair_meam_spline.cpp +++ b/src/USER-MISC/pair_meam_spline.cpp @@ -448,7 +448,7 @@ 
void PairMEAMSpline::coeff(int narg, char **arg) if (map[j] == i) count++; if (count != 1) error->all(FLERR,"Pair style meam/spline requires one atom type per element"); - } + } } #define MAXLINE 1024 diff --git a/src/USER-OMP/ewald_omp.cpp b/src/USER-OMP/ewald_omp.cpp index e023daf2db..ec8d708da2 100644 --- a/src/USER-OMP/ewald_omp.cpp +++ b/src/USER-OMP/ewald_omp.cpp @@ -438,14 +438,14 @@ void EwaldOMP::eik_dot_r_triclinic() #pragma omp parallel LMP_DEFAULT_NONE #endif { - + int i,ifrom,ito,k,l,m,n,ic,tid; double cstr1,sstr1; double sqk,clpm,slpm; double unitk_lamda[3]; loop_setup_thr(ifrom,ito,tid,nlocal,nthreads); - + double max_kvecs[3]; max_kvecs[0] = kxmax; max_kvecs[1] = kymax; diff --git a/src/fix_neigh_history.cpp b/src/fix_neigh_history.cpp index 933bcdc265..e3d0d2830a 100644 --- a/src/fix_neigh_history.cpp +++ b/src/fix_neigh_history.cpp @@ -853,7 +853,7 @@ int FixNeighHistory::pack_restart(int i, double *buf) memcpy(&buf[m],&valuepartner[i][dnum*n],dnumbytes); m += dnum; } - // pack buf[0] this way because other fixes unpack it + // pack buf[0] this way because other fixes unpack it buf[0] = m; return m; } From 2feccc28370b3bcc13a1cad0f67901deba23c8b0 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Wed, 19 Aug 2020 20:12:43 -0400 Subject: [PATCH 23/38] fix dispersion gridcomm buffer bug --- src/KSPACE/pppm_disp.cpp | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/KSPACE/pppm_disp.cpp b/src/KSPACE/pppm_disp.cpp index 8f141bb695..79fb989a16 100644 --- a/src/KSPACE/pppm_disp.cpp +++ b/src/KSPACE/pppm_disp.cpp @@ -981,7 +981,7 @@ void PPPMDisp::compute(int eflag, int vflag) make_rho_g(); gc6->reverse_comm_kspace(this,1,sizeof(FFT_SCALAR),REVERSE_RHO_G, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc6_buf1,gc6_buf2,MPI_FFT_SCALAR); brick2fft(nxlo_in_6, nylo_in_6, nzlo_in_6, nxhi_in_6, nyhi_in_6, nzhi_in_6, density_brick_g, density_fft_g, work1_6,remap_6); @@ -997,13 +997,13 @@ void PPPMDisp::compute(int eflag, int vflag) v3_brick_g, v4_brick_g, v5_brick_g); gc6->forward_comm_kspace(this,1,sizeof(FFT_SCALAR),FORWARD_AD_G, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc6_buf1,gc6_buf2,MPI_FFT_SCALAR); fieldforce_g_ad(); if (vflag_atom) gc6->forward_comm_kspace(this,7,sizeof(FFT_SCALAR),FORWARD_AD_PERATOM_G, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc6_buf1,gc6_buf2,MPI_FFT_SCALAR); } else { poisson_ik(work1_6, work2_6, density_fft_g, fft1_6, fft2_6, @@ -1017,13 +1017,13 @@ void PPPMDisp::compute(int eflag, int vflag) v3_brick_g, v4_brick_g, v5_brick_g); gc6->forward_comm_kspace(this,3,sizeof(FFT_SCALAR),FORWARD_IK_G, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc6_buf1,gc6_buf2,MPI_FFT_SCALAR); fieldforce_g_ik(); if (evflag_atom) gc6->forward_comm_kspace(this,6,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM_G, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc6_buf1,gc6_buf2,MPI_FFT_SCALAR); } if (evflag_atom) fieldforce_g_peratom(); @@ -1071,13 +1071,13 @@ void PPPMDisp::compute(int eflag, int vflag) v3_brick_a4, v4_brick_a4, v5_brick_a4); gc6->forward_comm_kspace(this,7,sizeof(FFT_SCALAR),FORWARD_AD_A, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc6_buf1,gc6_buf2,MPI_FFT_SCALAR); fieldforce_a_ad(); if (evflag_atom) gc6->forward_comm_kspace(this,42,sizeof(FFT_SCALAR),FORWARD_AD_PERATOM_A, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc6_buf1,gc6_buf2,MPI_FFT_SCALAR); } else { poisson_ik(work1_6, work2_6, density_fft_a3, fft1_6, fft2_6, @@ -1112,13 +1112,13 @@ void PPPMDisp::compute(int eflag, int vflag) v3_brick_a4, v4_brick_a4, v5_brick_a4); 
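/* ----------------------------------------------------------------------
   illustration of the buffer bug fixed by this patch: gc (Coulomb grid)
   and gc6 (dispersion grid) describe different ghost extents, so a
   buffer sized from one communicator's setup() is generally too small
   for the other's exchanges.  Hypothetical standalone sketch; the names
   mirror the idea only, not the GridComm interface.
------------------------------------------------------------------------- */

#include <cassert>
#include <cstddef>
#include <vector>

struct GhostComm {
  int npoints_max;        // largest single message for this grid, in points
};

std::vector<double> make_buf(const GhostComm &gc, int nper)
{
  // one buffer per communicator, sized by its own ghost extent
  return std::vector<double>((std::size_t) nper * gc.npoints_max);
}

void pack_into(const GhostComm &gc, int nper, std::vector<double> &buf)
{
  // reusing a buffer created for a different grid can silently overflow
  assert(buf.size() >= (std::size_t) nper * gc.npoints_max);
}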
gc6->forward_comm_kspace(this,18,sizeof(FFT_SCALAR),FORWARD_IK_A, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc6_buf1,gc6_buf2,MPI_FFT_SCALAR); fieldforce_a_ik(); if (evflag_atom) gc6->forward_comm_kspace(this,49,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM_A, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc6_buf1,gc6_buf2,MPI_FFT_SCALAR); } if (evflag_atom) fieldforce_a_peratom(); @@ -1151,13 +1151,13 @@ void PPPMDisp::compute(int eflag, int vflag) } gc6->forward_comm_kspace(this,1,sizeof(FFT_SCALAR),FORWARD_AD_NONE, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc6_buf1,gc6_buf2,MPI_FFT_SCALAR); fieldforce_none_ad(); if (vflag_atom) gc6->forward_comm_kspace(this,6,sizeof(FFT_SCALAR),FORWARD_AD_PERATOM_NONE, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc6_buf1,gc6_buf2,MPI_FFT_SCALAR); } else { int n = 0; @@ -1172,13 +1172,13 @@ void PPPMDisp::compute(int eflag, int vflag) } gc6->forward_comm_kspace(this,3,sizeof(FFT_SCALAR),FORWARD_IK_NONE, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc6_buf1,gc6_buf2,MPI_FFT_SCALAR); fieldforce_none_ik(); if (evflag_atom) gc6->forward_comm_kspace(this,7,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM_NONE, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc6_buf1,gc6_buf2,MPI_FFT_SCALAR); } if (evflag_atom) fieldforce_none_peratom(); From 2ed18bb8412c34780e3ccec8bf0bfe4fda310c19 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Wed, 19 Aug 2020 20:15:00 -0400 Subject: [PATCH 24/38] tweak output format --- src/KSPACE/pppm_disp.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/KSPACE/pppm_disp.cpp b/src/KSPACE/pppm_disp.cpp index 79fb989a16..a0f5a3d2f3 100644 --- a/src/KSPACE/pppm_disp.cpp +++ b/src/KSPACE/pppm_disp.cpp @@ -563,9 +563,9 @@ void PPPMDisp::init() nx_pppm_6,ny_pppm_6,nz_pppm_6); mesg += fmt::format(" Dispersion stencil order = {}\n",order_6); mesg += fmt::format(" Dispersion estimated absolute RMS force accuracy " - "= {}\n",acc); + "= {:.8g}\n",acc); mesg += fmt::format(" Dispersion estimated relative force accuracy " - "= {}\n",acc/two_charge_force); + "= {:.8g}\n",acc/two_charge_force); mesg += " using " LMP_FFT_PREC " precision " LMP_FFT_LIB "\n"; mesg += fmt::format(" 3d grid and FFT values/proc = {} {}\n", ngrid_max, nfft_both_max); From ed26d8ef8f267e0fe469233a34bdebfd2fd29036 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Wed, 19 Aug 2020 20:24:03 -0400 Subject: [PATCH 25/38] revert change to fft3d.cpp that breaks all PPPM styles --- src/KSPACE/fft3d.cpp | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/src/KSPACE/fft3d.cpp b/src/KSPACE/fft3d.cpp index 7c555e99b5..477f8c759e 100644 --- a/src/KSPACE/fft3d.cpp +++ b/src/KSPACE/fft3d.cpp @@ -103,18 +103,18 @@ void fft_3d(FFT_DATA *in, FFT_DATA *out, int flag, struct fft_plan_3d *plan) length = plan->length1; #if defined(FFT_MKL) - if (flag == 1) + if (flag == -1) DftiComputeForward(plan->handle_fast,data); else DftiComputeBackward(plan->handle_fast,data); #elif defined(FFT_FFTW3) - if (flag == 1) + if (flag == -1) theplan=plan->plan_fast_forward; else theplan=plan->plan_fast_backward; FFTW_API(execute_dft)(theplan,data,data); #else - if (flag == 1) + if (flag == -1) for (offset = 0; offset < total; offset += length) kiss_fft(plan->cfg_fast_forward,&data[offset],&data[offset]); else @@ -137,18 +137,18 @@ void fft_3d(FFT_DATA *in, FFT_DATA *out, int flag, struct fft_plan_3d *plan) length = plan->length2; #if defined(FFT_MKL) - if (flag == 1) + if (flag == -1) DftiComputeForward(plan->handle_mid,data); else DftiComputeBackward(plan->handle_mid,data); #elif 
defined(FFT_FFTW3) - if (flag == 1) + if (flag == -1) theplan=plan->plan_mid_forward; else theplan=plan->plan_mid_backward; FFTW_API(execute_dft)(theplan,data,data); #else - if (flag == 1) + if (flag == -1) for (offset = 0; offset < total; offset += length) kiss_fft(plan->cfg_mid_forward,&data[offset],&data[offset]); else @@ -171,18 +171,18 @@ void fft_3d(FFT_DATA *in, FFT_DATA *out, int flag, struct fft_plan_3d *plan) length = plan->length3; #if defined(FFT_MKL) - if (flag == 1) + if (flag == -1) DftiComputeForward(plan->handle_slow,data); else DftiComputeBackward(plan->handle_slow,data); #elif defined(FFT_FFTW3) - if (flag == 1) + if (flag == -1) theplan=plan->plan_slow_forward; else theplan=plan->plan_slow_backward; FFTW_API(execute_dft)(theplan,data,data); #else - if (flag == 1) + if (flag == -1) for (offset = 0; offset < total; offset += length) kiss_fft(plan->cfg_slow_forward,&data[offset],&data[offset]); else @@ -199,7 +199,7 @@ void fft_3d(FFT_DATA *in, FFT_DATA *out, int flag, struct fft_plan_3d *plan) // scaling if required - if (flag == -1 && plan->scaled) { + if (flag == 1 && plan->scaled) { norm = plan->norm; num = plan->normnum; #if defined(FFT_FFTW3) @@ -746,7 +746,7 @@ void fft_1d_only(FFT_DATA *data, int nsize, int flag, struct fft_plan_3d *plan) // data is just an array of 0.0 #if defined(FFT_MKL) - if (flag == 1) { + if (flag == -1) { DftiComputeForward(plan->handle_fast,data); DftiComputeForward(plan->handle_mid,data); DftiComputeForward(plan->handle_slow,data); @@ -757,23 +757,23 @@ void fft_1d_only(FFT_DATA *data, int nsize, int flag, struct fft_plan_3d *plan) } #elif defined(FFT_FFTW3) FFTW_API(plan) theplan; - if (flag == 1) + if (flag == -1) theplan=plan->plan_fast_forward; else theplan=plan->plan_fast_backward; FFTW_API(execute_dft)(theplan,data,data); - if (flag == 1) + if (flag == -1) theplan=plan->plan_mid_forward; else theplan=plan->plan_mid_backward; FFTW_API(execute_dft)(theplan,data,data); - if (flag == 1) + if (flag == -1) theplan=plan->plan_slow_forward; else theplan=plan->plan_slow_backward; FFTW_API(execute_dft)(theplan,data,data); #else - if (flag == 1) { + if (flag == -1) { for (int offset = 0; offset < total1; offset += length1) kiss_fft(plan->cfg_fast_forward,&data[offset],&data[offset]); for (int offset = 0; offset < total2; offset += length2) @@ -793,7 +793,7 @@ void fft_1d_only(FFT_DATA *data, int nsize, int flag, struct fft_plan_3d *plan) // scaling if required // limit num to size of data - if (flag == -1 && plan->scaled) { + if (flag == 1 && plan->scaled) { norm = plan->norm; num = MIN(plan->normnum,nsize); #if defined(FFT_FFTW3) From 394b2da5851c8d7b97ff21e07ed4ab79153859fa Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Thu, 20 Aug 2020 11:09:21 -0600 Subject: [PATCH 26/38] Port changes to Kokkos --- src/KOKKOS/gridcomm_kokkos.cpp | 45 ++++++++++++++++++++++++---------- 1 file changed, 32 insertions(+), 13 deletions(-) diff --git a/src/KOKKOS/gridcomm_kokkos.cpp b/src/KOKKOS/gridcomm_kokkos.cpp index c4e4a55da2..800e62a6de 100644 --- a/src/KOKKOS/gridcomm_kokkos.cpp +++ b/src/KOKKOS/gridcomm_kokkos.cpp @@ -25,7 +25,7 @@ using namespace LAMMPS_NS; enum{REGULAR,TILED}; -#define SWAPDELTA 8 +#define DELTA 16 /* ---------------------------------------------------------------------- NOTES @@ -110,7 +110,14 @@ GridCommKokkos::~GridCommKokkos() } -/* ---------------------------------------------------------------------- */ +/* ---------------------------------------------------------------------- + setup comm for a regular grid of procs + each proc 
has 6 neighbors + comm pattern = series of swaps with one of those 6 procs + can be multiple swaps with same proc if ghost extent is large + swap may not be symmetric if both procs do not need same layers of ghosts + all procs perform same # of swaps in a direction, even if some don't need it +------------------------------------------------------------------------- */ template void GridCommKokkos::setup_regular(int &nbuf1, int &nbuf2) @@ -406,7 +413,12 @@ void GridCommKokkos::setup_regular(int &nbuf1, int &nbuf2) } /* ---------------------------------------------------------------------- - NOTE: need to doc this header + setup comm for RCB tiled proc domains + each proc has arbitrary # of neighbors that overlap its ghost extent + identify which procs will send me ghost cells, and vice versa + may not be symmetric if both procs do not need same layers of ghosts + comm pattern = post recvs for all my ghosts, send my owned, wait on recvs + no exchanges by dimension, unlike CommTiled forward/reverse comm of particles ------------------------------------------------------------------------- */ template @@ -638,7 +650,7 @@ void GridCommKokkos::setup_tiled(int &nbuf1, int &nbuf2) } /* ---------------------------------------------------------------------- - use swap list in forward order to acquire copy of all needed ghost grid pts + forward comm of my owned cells to other's ghost cells ------------------------------------------------------------------------- */ template @@ -651,7 +663,9 @@ void GridCommKokkos::forward_comm_kspace(KSpace *kspace, int nper, i forward_comm_kspace_tiled(kspace,nper,which,k_buf1,k_buf2,datatype); } -/* ---------------------------------------------------------------------- */ +/* ---------------------------------------------------------------------- + forward comm on regular grid of procs via list of swaps with 6 neighbor procs +------------------------------------------------------------------------- */ template void GridCommKokkos:: @@ -703,7 +717,9 @@ forward_comm_kspace_regular(KSpace *kspace, int nper, int which, } } -/* ---------------------------------------------------------------------- */ +/* ---------------------------------------------------------------------- + forward comm on tiled grid decomp via Send/Recv lists of each neighbor proc +------------------------------------------------------------------------- */ template void GridCommKokkos:: @@ -770,8 +786,7 @@ forward_comm_kspace_tiled(KSpace *kspace, int nper, int which, } /* ---------------------------------------------------------------------- - use swap list in reverse order to compute fully summed value - for each owned grid pt that some other proc has copy of as a ghost grid pt + reverse comm of my ghost cells to sum to owner cells ------------------------------------------------------------------------- */ template @@ -784,7 +799,9 @@ void GridCommKokkos::reverse_comm_kspace(KSpace *kspace, int nper, i reverse_comm_kspace_tiled(kspace,nper,which,k_buf1,k_buf2,datatype); } -/* ---------------------------------------------------------------------- */ +/* ---------------------------------------------------------------------- + reverse comm on regular grid of procs via list of swaps with 6 neighbor procs +------------------------------------------------------------------------- */ template void GridCommKokkos:: @@ -837,7 +854,9 @@ reverse_comm_kspace_regular(KSpace *kspace, int nper, int which, } } -/* ---------------------------------------------------------------------- */ +/* 
---------------------------------------------------------------------- + reverse comm on tiled grid decomp via Send/Recv lists of each neighbor proc +------------------------------------------------------------------------- */ template void GridCommKokkos:: @@ -915,7 +934,7 @@ reverse_comm_kspace_tiled(KSpace *kspace, int nper, int which, template void GridCommKokkos::grow_swap() { - maxswap += SWAPDELTA; + maxswap += DELTA; swap = (Swap *) memory->srealloc(swap,maxswap*sizeof(Swap),"GridComm:swap"); @@ -930,8 +949,8 @@ void GridCommKokkos::grow_swap() /* ---------------------------------------------------------------------- create 1d list of offsets into 3d array section (xlo:xhi,ylo:yhi,zlo:zhi) - assume 3d array is allocated as (0:fullxhi-fullxlo+1,0:fullyhi-fullylo+1, - 0:fullzhi-fullzlo+1) + assume 3d array is allocated as + (fullxlo:fullxhi,fullylo:fullyhi,fullzlo:fullzhi) ------------------------------------------------------------------------- */ template From 6efc51d863274c0e18373229155004ae37ae09b9 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Thu, 20 Aug 2020 11:19:34 -0600 Subject: [PATCH 27/38] Update comment --- src/KOKKOS/gridcomm_kokkos.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/KOKKOS/gridcomm_kokkos.cpp b/src/KOKKOS/gridcomm_kokkos.cpp index 800e62a6de..a1debe95a6 100644 --- a/src/KOKKOS/gridcomm_kokkos.cpp +++ b/src/KOKKOS/gridcomm_kokkos.cpp @@ -30,6 +30,7 @@ enum{REGULAR,TILED}; /* ---------------------------------------------------------------------- NOTES tiled implementation only currently works for RCB, not general tiled + b/c RCB tree is used to find neighboring tiles if o indices for ghosts are < 0 or hi indices are >= N, then grid is treated as periodic in that dimension, communication is done across the periodic boundaries From 17b998edb33f34d2c3ce0e1cf6bded324486ff11 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Thu, 20 Aug 2020 15:51:41 -0400 Subject: [PATCH 28/38] fix whitespace issue --- src/KOKKOS/gridcomm_kokkos.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/KOKKOS/gridcomm_kokkos.cpp b/src/KOKKOS/gridcomm_kokkos.cpp index a1debe95a6..5f32c5b4ae 100644 --- a/src/KOKKOS/gridcomm_kokkos.cpp +++ b/src/KOKKOS/gridcomm_kokkos.cpp @@ -30,7 +30,7 @@ enum{REGULAR,TILED}; /* ---------------------------------------------------------------------- NOTES tiled implementation only currently works for RCB, not general tiled - b/c RCB tree is used to find neighboring tiles + b/c RCB tree is used to find neighboring tiles if o indices for ghosts are < 0 or hi indices are >= N, then grid is treated as periodic in that dimension, communication is done across the periodic boundaries From 4fde9022a3f5cb600fc4cd1a1e6a5420f8ef931c Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Thu, 20 Aug 2020 17:54:26 -0400 Subject: [PATCH 29/38] remove tabs --- src/KOKKOS/pppm_kokkos.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/KOKKOS/pppm_kokkos.cpp b/src/KOKKOS/pppm_kokkos.cpp index b6552523a3..9aa13343f6 100644 --- a/src/KOKKOS/pppm_kokkos.cpp +++ b/src/KOKKOS/pppm_kokkos.cpp @@ -668,7 +668,7 @@ void PPPMKokkos::compute(int eflag, int vflag) // to fill ghost cells surrounding their 3d bricks gc->forward_comm_kspace(this,3,FORWARD_IK, - k_gc_buf1,k_gc_buf2,MPI_FFT_SCALAR); + k_gc_buf1,k_gc_buf2,MPI_FFT_SCALAR); // extra per-atom energy/virial communication From a742db236976264103e312644207af790582b074 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Thu, 20 Aug 2020 18:07:09 -0400 Subject: [PATCH 30/38] 
add a couple of kspace tests using tiled communication --- .../tests/kspace-pppm_cg_tiled.yaml | 94 +++++++++++++++++++ .../tests/kspace-pppm_stagger_tiled.yaml | 90 ++++++++++++++++++ .../force-styles/tests/kspace-pppm_tiled.yaml | 90 ++++++++++++++++++ 3 files changed, 274 insertions(+) create mode 100644 unittest/force-styles/tests/kspace-pppm_cg_tiled.yaml create mode 100644 unittest/force-styles/tests/kspace-pppm_stagger_tiled.yaml create mode 100644 unittest/force-styles/tests/kspace-pppm_tiled.yaml diff --git a/unittest/force-styles/tests/kspace-pppm_cg_tiled.yaml b/unittest/force-styles/tests/kspace-pppm_cg_tiled.yaml new file mode 100644 index 0000000000..ca61591bab --- /dev/null +++ b/unittest/force-styles/tests/kspace-pppm_cg_tiled.yaml @@ -0,0 +1,94 @@ +--- +lammps_version: 30 Jun 2020 +date_generated: Sun Jul 12 19:14:30 202 +epsilon: 7.5e-14 +prerequisites: ! | + atom full + pair coul/long + kspace pppm/cg +pre_commands: ! "" +post_commands: ! | + set atom 22*23 charge 0.0 + set atom 25*26 charge 0.0 + set atom 28*29 charge 0.0 + set type 5 charge 0.0 + comm_style tiled + pair_modify compute no + kspace_style pppm/cg 1.0e-6 + kspace_modify gewald 0.3 + balance 0.0 rcb +input_file: in.fourmol +pair_style: coul/long 8.0 +pair_coeff: ! | + * * +extract: ! "" +natoms: 29 +init_vdwl: 0 +init_coul: 0 +init_stress: ! |2- + 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 +init_forces: ! |2 + 1 -2.6277259148670029e-01 6.3371291789473080e-02 2.5711876152341273e-01 + 2 7.2367296933642605e-02 -2.6377287441660424e-01 -7.6051513821789138e-02 + 3 -2.0405952297641711e-02 -7.8572370620207509e-03 1.6901994166300244e-02 + 4 1.3913539877995293e-01 2.2591757798275315e-03 -4.0397816193944613e-02 + 5 7.8220840962476321e-02 5.8018673691041286e-02 -6.0298251586383647e-02 + 6 1.6642601835640428e-02 5.4813636460583826e-01 -5.9948263006137181e-01 + 7 7.4422159409041455e-02 -5.2976037824286260e-01 2.8745376504092163e-01 + 8 5.1715645698484136e-01 -8.4224353212928849e-01 3.9252943496496401e-01 + 9 -2.8698074414725644e-01 3.8846246184711297e-01 -8.8735165696061394e-02 + 10 -1.4119088048501882e-01 1.7101059256234674e-01 -2.1093128275588843e-02 + 11 -1.9802109162868770e-01 2.3799905238132363e-01 -6.9205308272233729e-02 + 12 5.9829442228003460e-01 -6.6570589694338889e-01 1.8052289946052447e-01 + 13 -1.4432183109305963e-01 2.1119613677613439e-01 -3.7953547334912435e-02 + 14 -2.1057539442776216e-01 1.9833568369952351e-01 -4.3863660550604380e-02 + 15 -1.8510184013940739e-01 1.4558620355697388e-01 -6.3736986322272177e-02 + 16 -8.7068540567994446e-01 8.0781838347116530e-01 7.4023620199055895e-01 + 17 6.8573068239024604e-01 -5.4365714223217854e-01 -9.1523962812961923e-01 + 18 9.0711753065407197e-01 1.6157336473023263e+00 -1.6708191809635238e+00 + 19 -3.2812987976759644e-01 -8.1654277506424433e-01 9.6628327138680947e-01 + 20 -4.4090177907687206e-01 -7.7838783137249912e-01 8.4583048867481425e-01 + 21 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 22 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 23 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 24 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 25 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 26 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 27 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 28 
0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 29 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 +run_vdwl: 0 +run_coul: 0 +run_stress: ! |2- + 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 +run_forces: ! |2 + 1 -2.5272870059014418e-01 7.7993693891669019e-02 2.7413192480528514e-01 + 2 6.0317817543279174e-02 -2.7726219497773258e-01 -9.2328466795041211e-02 + 3 -2.0381925802263216e-02 -7.4169903634883827e-03 1.7718770630259381e-02 + 4 1.4007637687058852e-01 6.0294233349724097e-04 -4.4118635982017075e-02 + 5 7.7661555892458781e-02 5.6660597072423394e-02 -6.2527393985284924e-02 + 6 1.8472776517392236e-02 5.4080910460464371e-01 -6.2683322637258831e-01 + 7 7.1172361943358894e-02 -5.3339808852688553e-01 3.1182624682757953e-01 + 8 5.1931682876258478e-01 -8.4009257264924786e-01 4.1437713723636871e-01 + 9 -2.8926485235033661e-01 3.8545218869697995e-01 -9.6445715585120301e-02 + 10 -1.4130393273230371e-01 1.7154601844601167e-01 -2.4319589810512892e-02 + 11 -1.9820651458435656e-01 2.3998861307221755e-01 -7.2850168561931314e-02 + 12 6.0180481293740029e-01 -6.6352386739919189e-01 1.9241766558675899e-01 + 13 -1.4512210035912929e-01 2.1001807770143230e-01 -4.0833849803347325e-02 + 14 -2.1221325118484802e-01 1.9911652392243312e-01 -4.7359593324180368e-02 + 15 -1.8584239277230100e-01 1.4355785962059972e-01 -6.8625732013099072e-02 + 16 -8.6818324744564379e-01 8.1248951063969599e-01 7.2669223824162077e-01 + 17 6.8330419262914510e-01 -5.4635175697697436e-01 -9.0599066050100585e-01 + 18 9.4795190549104413e-01 1.6607978960168899e+00 -1.6003547491287762e+00 + 19 -3.4101870572315529e-01 -8.2840317570845334e-01 9.3657904772766076e-01 + 20 -4.6581300504277079e-01 -8.0258437941651817e-01 8.0884475080737095e-01 + 21 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 22 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 23 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 24 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 25 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 26 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 27 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 28 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 29 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 +... diff --git a/unittest/force-styles/tests/kspace-pppm_stagger_tiled.yaml b/unittest/force-styles/tests/kspace-pppm_stagger_tiled.yaml new file mode 100644 index 0000000000..95cee4cef0 --- /dev/null +++ b/unittest/force-styles/tests/kspace-pppm_stagger_tiled.yaml @@ -0,0 +1,90 @@ +--- +lammps_version: 21 Jul 2020 +date_generated: Mon Aug 3 23:27:35 202 +epsilon: 7.5e-14 +prerequisites: ! | + atom full + pair coul/long + kspace pppm/stagger +pre_commands: ! "" +post_commands: ! | + comm_style tiled + pair_modify compute no + kspace_style pppm/stagger 1.0e-6 + kspace_modify gewald 0.3 + balance 0.0 rcb +input_file: in.fourmol +pair_style: coul/long 8.0 +pair_coeff: ! | + * * +extract: ! "" +natoms: 29 +init_vdwl: 0 +init_coul: 0 +init_stress: ! |2- + 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 +init_forces: ! 
|2 + 1 -5.2227715900845750e-01 8.1950519891037896e-02 2.1568864750376832e-01 + 2 2.1709329984201631e-01 -2.7910826043908610e-01 -1.3501796562404628e-01 + 3 -3.4431410110235108e-02 -9.3026609475793179e-03 1.9970153652663108e-02 + 4 1.6294612553322482e-01 2.8820208915782589e-02 -7.8041537992552090e-02 + 5 1.6018614143950777e-01 7.5432356753115826e-02 -3.7722588054429150e-02 + 6 5.6492988309033354e-01 4.1696814706534630e-01 -6.7673476362799612e-01 + 7 -3.4216205471660566e-01 -4.0009165524095192e-01 3.9325085501474977e-01 + 8 -1.4121658810415072e-01 -6.1694926607232958e-01 3.3967088097622000e-01 + 9 1.8213420211015810e-01 3.2023550501078568e-01 5.0865716213608053e-02 + 10 -5.1659753410710330e-02 1.1063713004312387e-01 -1.4434062323990951e-02 + 11 -8.4675426404111437e-02 1.5093767469541053e-01 -3.9273222694910556e-02 + 12 4.5743278086428246e-01 -4.2657803621344026e-01 3.4765791147615285e-02 + 13 -1.5598326968651899e-01 1.1609333380277490e-01 2.6827585674639255e-02 + 14 -1.7229830712699734e-01 1.3660265515995373e-01 1.0364293545061238e-02 + 15 -1.3779624948415931e-01 8.5611514314178239e-02 -1.4417936578185790e-02 + 16 -3.4309620718952316e-01 4.3358259515061448e-01 5.3264911488862143e-01 + 17 1.3394866253126306e-01 -4.1287506953147796e-01 -7.8819987038465467e-01 + 18 7.3032854805187175e-01 1.5459002190369358e+00 -1.3876618467094484e+00 + 19 -2.5946241349817661e-01 -7.7450328984918526e-01 7.7107291114413901e-01 + 20 -3.9364379163465191e-01 -7.0318115064463305e-01 7.3145133273582563e-01 + 21 5.1854207039779388e-01 5.4316140431986648e-01 -1.1630561612460866e+00 + 22 -2.9474924657955082e-01 -1.2314722330232061e-01 5.8311514555951505e-01 + 23 -2.8773056143507980e-01 -2.9281856868591627e-01 5.5619506923778372e-01 + 24 6.2752036539684239e-02 1.7441478631220706e+00 -2.7849000426516041e-01 + 25 1.2955239352175907e-01 -7.0427160638100861e-01 2.2582608971039136e-01 + 26 -2.2227598237037974e-01 -9.7470415379148345e-01 7.4514829391794934e-02 + 27 -8.5917766336431800e-01 1.6506499588643100e+00 -9.3704596621935576e-01 + 28 5.7091533654326099e-01 -9.1760299361767794e-01 5.4073727763352530e-01 + 29 4.1187460365847078e-01 -8.0559715142821642e-01 4.4313023169089372e-01 +run_vdwl: 0 +run_coul: 0 +run_stress: ! |2- + 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 +run_forces: ! 
|2 + 1 -5.0888488962950285e-01 8.3430404180325907e-02 2.3883070350075583e-01 + 2 2.0290231222263222e-01 -2.8552521628160221e-01 -1.4795322214007761e-01 + 3 -3.4147227596864727e-02 -9.1044423234983921e-03 2.1050772312966981e-02 + 4 1.6403639693437871e-01 2.7510909903057143e-02 -8.1700222473303191e-02 + 5 1.5793837855428028e-01 7.5134684921661865e-02 -4.3070400148934276e-02 + 6 5.5874989482134696e-01 4.1097082705147381e-01 -7.0880717747024347e-01 + 7 -3.4244419763693645e-01 -4.0336810330768602e-01 4.1416013762933518e-01 + 8 -1.2867011168103898e-01 -6.1360944335177758e-01 3.7373621646565264e-01 + 9 1.7179361391284365e-01 3.1570434896702448e-01 2.9166730933101400e-02 + 10 -5.3386439917058298e-02 1.1157104966535075e-01 -1.9069019877901452e-02 + 11 -8.6641568612711800e-02 1.5447778751280100e-01 -4.3302035139313286e-02 + 12 4.6277613291799979e-01 -4.2640261166652782e-01 5.4642934467963714e-02 + 13 -1.5768096764682599e-01 1.1678927823658505e-01 2.0726708870285715e-02 + 14 -1.7362409264878331e-01 1.3769878891777848e-01 5.6914483198930487e-03 + 15 -1.3804334923445472e-01 8.4232584350085404e-02 -2.2242211531143450e-02 + 16 -3.5505453326610353e-01 4.4007379281236625e-01 5.1108930583976653e-01 + 17 1.4393262437860022e-01 -4.0681625150263201e-01 -7.6550494257092949e-01 + 18 7.7411316765167837e-01 1.6013845677120058e+00 -1.3406161587038208e+00 + 19 -2.7109483507409249e-01 -7.9220736692047411e-01 7.5606765561780620e-01 + 20 -4.2022391372143653e-01 -7.3398564926630305e-01 7.0862056036484811e-01 + 21 5.2066744810063004e-01 4.5601151336785911e-01 -1.1109669200204870e+00 + 22 -2.9131419157235938e-01 -8.0660148622295896e-02 5.6037505807873789e-01 + 23 -2.8851937111607945e-01 -2.5707916023569388e-01 5.3115241105748146e-01 + 24 7.7132406285747662e-02 1.6936935947231240e+00 -2.6197671637095088e-01 + 25 1.1739827770831035e-01 -6.8172720908194795e-01 2.1292828073010400e-01 + 26 -2.2399952098390596e-01 -9.4806186651755442e-01 6.5237310702708273e-02 + 27 -8.6781949013350235e-01 1.6453631212617197e+00 -8.8867885380482659e-01 + 28 5.7526918079966904e-01 -9.1110567278710985e-01 5.1441520143469011e-01 + 29 4.1483886618353916e-01 -8.0439411171811659e-01 4.1599644392583446e-01 +... diff --git a/unittest/force-styles/tests/kspace-pppm_tiled.yaml b/unittest/force-styles/tests/kspace-pppm_tiled.yaml new file mode 100644 index 0000000000..b4796b6c27 --- /dev/null +++ b/unittest/force-styles/tests/kspace-pppm_tiled.yaml @@ -0,0 +1,90 @@ +--- +lammps_version: 30 Jun 2020 +date_generated: Sun Jul 12 19:14:29 202 +epsilon: 7.5e-14 +prerequisites: ! | + atom full + pair coul/long + kspace pppm +pre_commands: ! "" +post_commands: ! | + comm_style tiled + pair_modify compute no + kspace_style pppm 1.0e-6 + kspace_modify gewald 0.3 + balance 0.0 rcb +input_file: in.fourmol +pair_style: coul/long 8.0 +pair_coeff: ! | + * * +extract: ! "" +natoms: 29 +init_vdwl: 0 +init_coul: 0 +init_stress: ! |2- + 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 +init_forces: ! 
|2 + 1 -5.2239274535568314e-01 8.2051545744881466e-02 2.1533594847972076e-01 + 2 2.1712968366442176e-01 -2.7928074334318026e-01 -1.3471540076656802e-01 + 3 -3.4442019165638028e-02 -9.3084265599194874e-03 1.9948062571124484e-02 + 4 1.6298334373562443e-01 2.8852998088186425e-02 -7.8001870103674154e-02 + 5 1.6024289196964533e-01 7.5428818157230709e-02 -3.7746220978715959e-02 + 6 5.6503043686117405e-01 4.1669523647698320e-01 -6.7638762712651512e-01 + 7 -3.4224573570118516e-01 -3.9969025602522534e-01 3.9331747529410527e-01 + 8 -1.4133104801408738e-01 -6.1685378954692482e-01 3.3931746208503027e-01 + 9 1.8219762821810317e-01 3.2009822401929577e-01 5.0881307357289934e-02 + 10 -5.1688860353236589e-02 1.1069131959908671e-01 -1.4422029744161480e-02 + 11 -8.4689878918105269e-02 1.5099315110947911e-01 -3.9231342126204188e-02 + 12 4.5754413540574290e-01 -4.2644798683690410e-01 3.4587713233253971e-02 + 13 -1.5596780753830558e-01 1.1607584778590280e-01 2.6865880696619902e-02 + 14 -1.7231427615749528e-01 1.3653099035839830e-01 1.0392517888507409e-02 + 15 -1.3787738509698347e-01 8.5569383216123673e-02 -1.4365596072224287e-02 + 16 -3.4322564010548312e-01 4.3371633953160166e-01 5.3259611401138551e-01 + 17 1.3414272886699793e-01 -4.1322529572771644e-01 -7.8812435933765979e-01 + 18 7.3073447759345089e-01 1.5456517688814524e+00 -1.3881786173290165e+00 + 19 -2.5943625025418654e-01 -7.7424664728587522e-01 7.7105598737678260e-01 + 20 -3.9409193260988501e-01 -7.0311103001458264e-01 7.3171724652214931e-01 + 21 5.1856078926614546e-01 5.4286369838352699e-01 -1.1629548434823531e+00 + 22 -2.9453203152655405e-01 -1.2298517567747463e-01 5.8298446261040782e-01 + 23 -2.8798525475710529e-01 -2.9277384277527774e-01 5.5631883166904628e-01 + 24 6.2753212217437501e-02 1.7443957830145815e+00 -2.7814103479849506e-01 + 25 1.2986161832727383e-01 -7.0443921770565177e-01 2.2578528867489417e-01 + 26 -2.2254044464386455e-01 -9.7470640011041609e-01 7.4360754308868779e-02 + 27 -8.5917998510192983e-01 1.6512375326941557e+00 -9.3680672362601536e-01 + 28 5.7118802253451917e-01 -9.1790362039827855e-01 5.4063664700585301e-01 + 29 4.1157232663919069e-01 -8.0588020505345637e-01 4.4297396570656278e-01 +run_vdwl: 0 +run_coul: 0 +run_stress: ! |2- + 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 +run_forces: ! 
|2 + 1 -5.0900177524332380e-01 8.3541312360770559e-02 2.3847626933913957e-01 + 2 2.0293760437213748e-01 -2.8570861986354196e-01 -1.4764907529922344e-01 + 3 -3.4158170718364404e-02 -9.1103261103567309e-03 2.1028676255224140e-02 + 4 1.6407381836362508e-01 2.7543938443804754e-02 -8.1661581263582694e-02 + 5 1.5799678256598612e-01 7.5131693181657300e-02 -4.3093731729494773e-02 + 6 5.5884729319027959e-01 4.1070142887655381e-01 -7.0845348552640675e-01 + 7 -3.4252252488516066e-01 -4.0296980848156633e-01 4.1421689597710110e-01 + 8 -1.2877561673829283e-01 -6.1352311819313976e-01 3.7338100957587267e-01 + 9 1.7184965305565320e-01 3.1557352532666366e-01 2.9184166085517198e-02 + 10 -5.3416314160604578e-02 1.1162618970928689e-01 -1.9056641624128120e-02 + 11 -8.6656970143397835e-02 1.5453217862588842e-01 -4.3260381230061914e-02 + 12 4.6288691229484780e-01 -4.2627837697985460e-01 5.4462739307820834e-02 + 13 -1.5766510975402218e-01 1.1677248221389504e-01 2.0765584899621815e-02 + 14 -1.7364037400975366e-01 1.3763067517397995e-01 5.7227400340144263e-03 + 15 -1.3812467553300772e-01 8.4189156813104071e-02 -2.2189341317061961e-02 + 16 -3.5518552554301230e-01 4.4020942653600509e-01 5.1103809754957041e-01 + 17 1.4412978936104159e-01 -4.0716931333084577e-01 -7.6542748703258823e-01 + 18 7.7452008907550252e-01 1.6011444029485182e+00 -1.3411322152836536e+00 + 19 -2.7106696803117064e-01 -7.9194312531475608e-01 7.5606651125297675e-01 + 20 -4.2067655880426469e-01 -7.3392054862184575e-01 7.0888202878660511e-01 + 21 5.2066009642687827e-01 4.5571443252966459e-01 -1.1108598505670140e+00 + 22 -2.9107834596471255e-01 -8.0497477430417727e-02 5.6024739795440914e-01 + 23 -2.8876713232015855e-01 -2.5703387138535833e-01 5.3126570223700098e-01 + 24 7.7134065884385553e-02 1.6939505170091371e+00 -2.6162012529552015e-01 + 25 1.1769649007263155e-01 -6.8190526638151339e-01 2.1287906681296132e-01 + 26 -2.2425351267435142e-01 -9.4806947839154443e-01 6.5072664668127181e-02 + 27 -8.6783221113018416e-01 1.6459489466934927e+00 -8.8842216340655056e-01 + 28 5.7554532106965894e-01 -9.1140444835486800e-01 5.1430957288504053e-01 + 29 4.1454386992115411e-01 -8.0467652760281361e-01 4.1582695595428282e-01 +... 
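
For readers following the tiled GridComm changes above (patch 26), the communication pattern its doc comments describe -- post receives for all needed ghost data, send owned data to the procs that ghost it, then wait -- can be sketched in plain MPI as below. This is an illustrative sketch only, not part of the patch series: the names RecvTile, SendTile and forward_comm_tiled_sketch are simplified placeholders and do not match the actual GridCommKokkos members or its packed Kokkos buffers.

/* ----------------------------------------------------------------------
   illustrative sketch of the tiled forward comm pattern described above
   (hypothetical names; the real code packs/unpacks per-grid-point values)
------------------------------------------------------------------------- */

#include <mpi.h>
#include <cstddef>
#include <vector>

struct RecvTile { int proc; int ncells; int offset; };  // where ghost data lands
struct SendTile { int proc; int ncells; int offset; };  // which owned cells to send

void forward_comm_tiled_sketch(const std::vector<RecvTile> &recvs,
                               const std::vector<SendTile> &sends,
                               double *grid_buf, MPI_Comm comm)
{
  std::vector<MPI_Request> req(recvs.size());

  // post non-blocking receives for every neighbor tile overlapping my ghosts
  for (std::size_t i = 0; i < recvs.size(); i++)
    MPI_Irecv(grid_buf + recvs[i].offset, recvs[i].ncells, MPI_DOUBLE,
              recvs[i].proc, 0, comm, &req[i]);

  // send my owned cells to every proc that holds them as ghosts
  for (std::size_t i = 0; i < sends.size(); i++)
    MPI_Send(grid_buf + sends[i].offset, sends[i].ncells, MPI_DOUBLE,
             sends[i].proc, 0, comm);

  // wait for all ghost data to arrive (the real code unpacks each message)
  MPI_Waitall(static_cast<int>(req.size()), req.data(), MPI_STATUSES_IGNORE);
}
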
From 287112761c172595b3b53838631c759e3d2960f1 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Thu, 20 Aug 2020 18:51:28 -0400 Subject: [PATCH 31/38] tweak load balancing output format --- src/balance.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/balance.cpp b/src/balance.cpp index 49cd6ecad5..d5032d8923 100644 --- a/src/balance.cpp +++ b/src/balance.cpp @@ -387,20 +387,20 @@ void Balance::command(int narg, char **arg) MPI_Wtime()-start_time); mesg += fmt::format(" iteration count = {}\n",niter); for (int i = 0; i < nimbalance; ++i) mesg += imbalances[i]->info(); - mesg += fmt::format(" initial/final maximal load/proc = {} {}\n" - " initial/final imbalance factor = {:.6g} {:.6g}\n", + mesg += fmt::format(" initial/final maximal load/proc = {:.8} {:.8}\n" + " initial/final imbalance factor = {:.8} {:.8}\n", maxinit,maxfinal,imbinit,imbfinal); if (style != BISECTION) { mesg += " x cuts:"; for (int i = 0; i <= comm->procgrid[0]; i++) - mesg += fmt::format(" {}",comm->xsplit[i]); + mesg += fmt::format(" {:.8}",comm->xsplit[i]); mesg += "\n y cuts:"; for (int i = 0; i <= comm->procgrid[1]; i++) - mesg += fmt::format(" {}",comm->ysplit[i]); + mesg += fmt::format(" {:.8}",comm->ysplit[i]); mesg += "\n z cuts:"; for (int i = 0; i <= comm->procgrid[2]; i++) - mesg += fmt::format(" {}",comm->zsplit[i]); + mesg += fmt::format(" {:.8}",comm->zsplit[i]); mesg += "\n"; } From 0a042be6b85822316315d0ae7ab1f058a8c8db25 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Thu, 20 Aug 2020 22:07:30 -0400 Subject: [PATCH 32/38] fix another pppm/disp bug for dispersion-only runs --- src/KSPACE/pppm_disp.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/KSPACE/pppm_disp.cpp b/src/KSPACE/pppm_disp.cpp index a0f5a3d2f3..8c01451b57 100644 --- a/src/KSPACE/pppm_disp.cpp +++ b/src/KSPACE/pppm_disp.cpp @@ -3184,9 +3184,9 @@ double PPPMDisp::compute_qopt_6_ik() l = (i/nx_pppm_6) % ny_pppm_6; m = i / nxy_pppm_6; - const int kper = k - nx_pppm_6*(2*k/nx_pppm); - const int lper = l - ny_pppm_6*(2*l/ny_pppm); - const int mper = m - nz_pppm_6*(2*m/nz_pppm); + const int kper = k - nx_pppm_6*(2*k/nx_pppm_6); + const int lper = l - ny_pppm_6*(2*l/ny_pppm_6); + const int mper = m - nz_pppm_6*(2*m/nz_pppm_6); sqk = pow(unitkx*kper,2.0) + pow(unitky*lper,2.0) + pow(unitkz*mper,2.0); if (sqk == 0.0) continue; @@ -3282,9 +3282,9 @@ double PPPMDisp::compute_qopt_6_ad() l = (i/nx_pppm_6) % ny_pppm_6; m = i / nxy_pppm_6; - const int kper = k - nx_pppm_6*(2*k/nx_pppm); - const int lper = l - ny_pppm_6*(2*l/ny_pppm); - const int mper = m - nz_pppm_6*(2*m/nz_pppm); + const int kper = k - nx_pppm_6*(2*k/nx_pppm_6); + const int lper = l - ny_pppm_6*(2*l/ny_pppm_6); + const int mper = m - nz_pppm_6*(2*m/nz_pppm_6); sqk = pow(unitkx*kper,2.0) + pow(unitky*lper,2.0) + pow(unitkz*mper,2.0); if (sqk == 0.0) continue; From 17a96121260e469c5c0b3f102bfcb5eca4defd58 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Thu, 20 Aug 2020 22:15:26 -0400 Subject: [PATCH 33/38] fix pppm/disp initialization issues --- src/KSPACE/pppm_disp.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/KSPACE/pppm_disp.cpp b/src/KSPACE/pppm_disp.cpp index 8c01451b57..10b215b903 100644 --- a/src/KSPACE/pppm_disp.cpp +++ b/src/KSPACE/pppm_disp.cpp @@ -213,6 +213,8 @@ PPPMDisp::PPPMDisp(LAMMPS *lmp) : KSpace(lmp), gc = gc6 = NULL; gc_buf1 = gc_buf2 = NULL; gc6_buf1 = gc6_buf2 = NULL; + ngc_buf1 = ngc_buf2 = ngc6_buf1 = ngc6_buf2 = 0; + npergrid = npergrid6 = 0; nmax = 
0; part2grid = NULL; @@ -8283,7 +8285,7 @@ double PPPMDisp::memory_usage() // four GridComm bufs bytes += (ngc_buf1 + ngc_buf2) * npergrid * sizeof(FFT_SCALAR); - bytes += (ngc6_buf1 + ngc6_buf2) * npergrid * sizeof(FFT_SCALAR); + bytes += (ngc6_buf1 + ngc6_buf2) * npergrid6 * sizeof(FFT_SCALAR); return bytes; } From 0440b13a6deefab7927ac42e270754be032f3014 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Thu, 20 Aug 2020 22:27:15 -0400 Subject: [PATCH 34/38] fix memory leaks --- src/KSPACE/pppm_disp.cpp | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/src/KSPACE/pppm_disp.cpp b/src/KSPACE/pppm_disp.cpp index 10b215b903..550c5f9bda 100644 --- a/src/KSPACE/pppm_disp.cpp +++ b/src/KSPACE/pppm_disp.cpp @@ -2125,10 +2125,10 @@ void PPPMDisp::allocate_peratom() if (differentiation_flag) npergrid = 6; else npergrid = 7; - memory->destroy(gc_buf1); - memory->destroy(gc_buf2); - memory->create(gc_buf1,npergrid*ngc_buf1,"pppm:gc_buf1"); - memory->create(gc_buf2,npergrid*ngc_buf2,"pppm:gc_buf2"); + memory->destroy(gc6_buf1); + memory->destroy(gc6_buf2); + memory->create(gc6_buf1,npergrid*ngc6_buf1,"pppm:gc6_buf1"); + memory->create(gc6_buf2,npergrid*ngc6_buf2,"pppm:gc6_buf2"); } if (function[2]) { @@ -2246,10 +2246,10 @@ void PPPMDisp::allocate_peratom() if (differentiation_flag) npergrid = 42; else npergrid = 49; - memory->destroy(gc_buf1); - memory->destroy(gc_buf2); - memory->create(gc_buf1,npergrid*ngc_buf1,"pppm:gc_buf1"); - memory->create(gc_buf2,npergrid*ngc_buf2,"pppm:gc_buf2"); + memory->destroy(gc6_buf1); + memory->destroy(gc6_buf2); + memory->create(gc6_buf1,npergrid*ngc6_buf1,"pppm:gc6_buf1"); + memory->create(gc6_buf2,npergrid*ngc6_buf2,"pppm:gc6_buf2"); } if (function[3]) { @@ -2283,10 +2283,10 @@ void PPPMDisp::allocate_peratom() if (differentiation_flag) npergrid = 6; else npergrid = 7; - memory->destroy(gc_buf1); - memory->destroy(gc_buf2); - memory->create(gc_buf1,npergrid*ngc_buf1,"pppm:gc_buf1"); - memory->create(gc_buf2,npergrid*ngc_buf2,"pppm:gc_buf2"); + memory->destroy(gc6_buf1); + memory->destroy(gc6_buf2); + memory->create(gc6_buf1,npergrid*ngc6_buf1,"pppm:gc6_buf1"); + memory->create(gc6_buf2,npergrid*ngc6_buf2,"pppm:gc6_buf2"); } } @@ -2444,11 +2444,15 @@ void PPPMDisp::deallocate() gf_b_6 = NULL; rho1d_6 = rho_coeff_6 = drho1d_6 = drho_coeff_6 = NULL; + memory->destroy(gc_buf1); + memory->destroy(gc_buf2); delete fft1; delete fft2; delete remap; delete gc; + memory->destroy(gc6_buf1); + memory->destroy(gc6_buf2); delete fft1_6; delete fft2_6; delete remap_6; From deb24626d1d58fb538541c0ec67b97b8da6648b9 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Thu, 20 Aug 2020 22:45:37 -0400 Subject: [PATCH 35/38] fix gc vs. 
gc6 issues in fix pppm/disp --- src/USER-INTEL/pppm_disp_intel.cpp | 118 +++++++++++++++-------------- 1 file changed, 61 insertions(+), 57 deletions(-) diff --git a/src/USER-INTEL/pppm_disp_intel.cpp b/src/USER-INTEL/pppm_disp_intel.cpp index 3229c462aa..054b99a11f 100644 --- a/src/USER-INTEL/pppm_disp_intel.cpp +++ b/src/USER-INTEL/pppm_disp_intel.cpp @@ -376,7 +376,7 @@ void PPPMDispIntel::compute(int eflag, int vflag) } gc6->reverse_comm_kspace(this,1,sizeof(FFT_SCALAR),REVERSE_RHO_G, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc6_buf1,gc6_buf2,MPI_FFT_SCALAR); brick2fft(nxlo_in_6, nylo_in_6, nzlo_in_6, nxhi_in_6, nyhi_in_6, nzhi_in_6, density_brick_g, density_fft_g, work1_6,remap_6); @@ -391,7 +391,7 @@ void PPPMDispIntel::compute(int eflag, int vflag) v2_brick_g, v3_brick_g, v4_brick_g, v5_brick_g); gc6->forward_comm_kspace(this,1,sizeof(FFT_SCALAR),FORWARD_AD_G, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc6_buf1,gc6_buf2,MPI_FFT_SCALAR); if (fix->precision() == FixIntel::PREC_MODE_MIXED) { fieldforce_g_ad(fix->get_mixed_buffers()); @@ -403,7 +403,7 @@ void PPPMDispIntel::compute(int eflag, int vflag) if (vflag_atom) gc6->forward_comm_kspace(this,7,sizeof(FFT_SCALAR),FORWARD_AD_PERATOM_G, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc6_buf1,gc6_buf2,MPI_FFT_SCALAR); } else { poisson_ik(work1_6, work2_6, density_fft_g, fft1_6, fft2_6, @@ -416,7 +416,7 @@ void PPPMDispIntel::compute(int eflag, int vflag) v1_brick_g, v2_brick_g, v3_brick_g, v4_brick_g, v5_brick_g); gc6->forward_comm_kspace(this,3,sizeof(FFT_SCALAR),FORWARD_IK_G, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc6_buf1,gc6_buf2,MPI_FFT_SCALAR); if (fix->precision() == FixIntel::PREC_MODE_MIXED) { fieldforce_g_ik(fix->get_mixed_buffers()); @@ -428,7 +428,7 @@ void PPPMDispIntel::compute(int eflag, int vflag) if (evflag_atom) gc6->forward_comm_kspace(this,6,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM_G, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc6_buf1,gc6_buf2,MPI_FFT_SCALAR); } if (evflag_atom) fieldforce_g_peratom(); @@ -487,7 +487,7 @@ void PPPMDispIntel::compute(int eflag, int vflag) v2_brick_a4, v3_brick_a4, v4_brick_a4, v5_brick_a4); gc6->forward_comm_kspace(this,7,sizeof(FFT_SCALAR),FORWARD_AD_A, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc6_buf1,gc6_buf2,MPI_FFT_SCALAR); if (fix->precision() == FixIntel::PREC_MODE_MIXED) { fieldforce_a_ad(fix->get_mixed_buffers()); @@ -499,7 +499,7 @@ void PPPMDispIntel::compute(int eflag, int vflag) if (evflag_atom) gc6->forward_comm_kspace(this,42,sizeof(FFT_SCALAR),FORWARD_AD_PERATOM_A, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc6_buf1,gc6_buf2,MPI_FFT_SCALAR); } else { poisson_ik(work1_6, work2_6, density_fft_a3, fft1_6, fft2_6, @@ -530,7 +530,7 @@ void PPPMDispIntel::compute(int eflag, int vflag) v3_brick_a4, v4_brick_a4, v5_brick_a4); gc6->forward_comm_kspace(this,18,sizeof(FFT_SCALAR),FORWARD_IK_A, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc6_buf1,gc6_buf2,MPI_FFT_SCALAR); if (fix->precision() == FixIntel::PREC_MODE_MIXED) { fieldforce_a_ik(fix->get_mixed_buffers()); @@ -542,7 +542,7 @@ void PPPMDispIntel::compute(int eflag, int vflag) if (evflag_atom) gc6->forward_comm_kspace(this,49,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM_A, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc6_buf1,gc6_buf2,MPI_FFT_SCALAR); } if (evflag_atom) fieldforce_a_peratom(); @@ -592,7 +592,7 @@ void PPPMDispIntel::compute(int eflag, int vflag) } gc6->forward_comm_kspace(this,1,sizeof(FFT_SCALAR),FORWARD_AD_NONE, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc6_buf1,gc6_buf2,MPI_FFT_SCALAR); if (fix->precision() == FixIntel::PREC_MODE_MIXED) { 
fieldforce_none_ad(fix->get_mixed_buffers()); @@ -604,7 +604,7 @@ void PPPMDispIntel::compute(int eflag, int vflag) if (vflag_atom) gc6->forward_comm_kspace(this,6,sizeof(FFT_SCALAR),FORWARD_AD_PERATOM_NONE, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc6_buf1,gc6_buf2,MPI_FFT_SCALAR); } else { int n = 0; @@ -621,7 +621,7 @@ void PPPMDispIntel::compute(int eflag, int vflag) } gc6->forward_comm_kspace(this,3,sizeof(FFT_SCALAR),FORWARD_IK_NONE, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc6_buf1,gc6_buf2,MPI_FFT_SCALAR); if (fix->precision() == FixIntel::PREC_MODE_MIXED) { fieldforce_none_ik(fix->get_mixed_buffers()); @@ -633,7 +633,7 @@ void PPPMDispIntel::compute(int eflag, int vflag) if (evflag_atom) gc6->forward_comm_kspace(this,7,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM_NONE, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc6_buf1,gc6_buf2,MPI_FFT_SCALAR); } if (evflag_atom) fieldforce_none_peratom(); @@ -674,7 +674,7 @@ void PPPMDispIntel::compute(int eflag, int vflag) for (i = 0; i < 6; i++) virial[i] = 0.5*qscale*volume*virial_all[i]; MPI_Allreduce(virial_6,virial_all,6,MPI_DOUBLE,MPI_SUM,world); for (i = 0; i < 6; i++) virial[i] += 0.5*volume*virial_all[i]; - if (function[1]+function[2]+function[3]){ + if (function[1]+function[2]+function[3]) { double a = MY_PI*MY_PIS/(6*volume)*pow(g_ewald_6,3)*csumij; virial[0] -= a; virial[1] -= a; @@ -2983,69 +2983,73 @@ void PPPMDispIntel::fieldforce_none_ad(IntelBuffers * /*buffers*/) void PPPMDispIntel::precompute_rho() { - half_rho_scale = (rho_points - 1.)/2.; half_rho_scale_plus = half_rho_scale + 0.5; - for (int i = 0; i < rho_points; i++) { - FFT_SCALAR dx = -1. + 1./half_rho_scale * (FFT_SCALAR)i; - #if defined(LMP_SIMD_COMPILER) - #pragma simd - #endif - for (int k=nlower; k<=nupper;k++){ - FFT_SCALAR r1 = ZEROF; - for(int l=order-1; l>=0; l--){ - r1 = rho_coeff[l][k] + r1*dx; - } - rho_lookup[i][k-nlower] = r1; - } - for (int k = nupper-nlower+1; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { - rho_lookup[i][k] = 0; - } - if (differentiation_flag == 1) { + if (function[0]) { + for (int i = 0; i < rho_points; i++) { + FFT_SCALAR dx = -1. + 1./half_rho_scale * (FFT_SCALAR)i; #if defined(LMP_SIMD_COMPILER) #pragma simd #endif - for(int k=nlower; k<=nupper;k++){ + for (int k=nlower; k<=nupper;k++){ FFT_SCALAR r1 = ZEROF; - for(int l=order-2; l>=0; l--){ - r1 = drho_coeff[l][k] + r1*dx; + for(int l=order-1; l>=0; l--){ + r1 = rho_coeff[l][k] + r1*dx; } - drho_lookup[i][k-nlower] = r1; + rho_lookup[i][k-nlower] = r1; } for (int k = nupper-nlower+1; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { - drho_lookup[i][k] = 0; + rho_lookup[i][k] = 0; + } + if (differentiation_flag == 1) { + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for(int k=nlower; k<=nupper;k++){ + FFT_SCALAR r1 = ZEROF; + for(int l=order-2; l>=0; l--){ + r1 = drho_coeff[l][k] + r1*dx; + } + drho_lookup[i][k-nlower] = r1; + } + for (int k = nupper-nlower+1; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { + drho_lookup[i][k] = 0; + } } } } - for (int i = 0; i < rho_points; i++) { - FFT_SCALAR dx = -1. + 1./half_rho_scale * (FFT_SCALAR)i; - #if defined(LMP_SIMD_COMPILER) - #pragma simd - #endif - for (int k=nlower_6; k<=nupper_6;k++){ - FFT_SCALAR r1 = ZEROF; - for(int l=order_6-1; l>=0; l--){ - r1 = rho_coeff_6[l][k] + r1*dx; - } - rho6_lookup[i][k-nlower_6] = r1; - } - for (int k = nupper_6-nlower_6+1; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { - rho6_lookup[i][k] = 0; - } - if (differentiation_flag == 1) { + + if (function[1]+function[2]+function[3]) { + for (int i = 0; i < rho_points; i++) { + FFT_SCALAR dx = -1. 
+ 1./half_rho_scale * (FFT_SCALAR)i; #if defined(LMP_SIMD_COMPILER) #pragma simd #endif - for(int k=nlower_6; k<=nupper_6;k++){ + for (int k=nlower_6; k<=nupper_6;k++){ FFT_SCALAR r1 = ZEROF; - for(int l=order_6-2; l>=0; l--){ - r1 = drho_coeff_6[l][k] + r1*dx; + for(int l=order_6-1; l>=0; l--){ + r1 = rho_coeff_6[l][k] + r1*dx; } - drho6_lookup[i][k-nlower_6] = r1; + rho6_lookup[i][k-nlower_6] = r1; } for (int k = nupper_6-nlower_6+1; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { - drho6_lookup[i][k] = 0; + rho6_lookup[i][k] = 0; + } + if (differentiation_flag == 1) { + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for(int k=nlower_6; k<=nupper_6;k++){ + FFT_SCALAR r1 = ZEROF; + for(int l=order_6-2; l>=0; l--){ + r1 = drho_coeff_6[l][k] + r1*dx; + } + drho6_lookup[i][k-nlower_6] = r1; + } + for (int k = nupper_6-nlower_6+1; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { + drho6_lookup[i][k] = 0; + } } } } From 2855f04363e420c79686b3ea81b7029c618dc95e Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Thu, 20 Aug 2020 23:41:32 -0400 Subject: [PATCH 36/38] tweak output format --- src/KSPACE/pppm_disp.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/KSPACE/pppm_disp.cpp b/src/KSPACE/pppm_disp.cpp index 550c5f9bda..e788fe1bc0 100644 --- a/src/KSPACE/pppm_disp.cpp +++ b/src/KSPACE/pppm_disp.cpp @@ -560,14 +560,14 @@ void PPPMDisp::init() if (me == 0) { std::string mesg = fmt::format(" Dispersion G vector (1/distance)= " - "{}\n", g_ewald_6); + "{:.8}\n", g_ewald_6); mesg += fmt::format(" Dispersion grid = {} {} {}\n", nx_pppm_6,ny_pppm_6,nz_pppm_6); mesg += fmt::format(" Dispersion stencil order = {}\n",order_6); mesg += fmt::format(" Dispersion estimated absolute RMS force accuracy " - "= {:.8g}\n",acc); + "= {:.8}\n",acc); mesg += fmt::format(" Dispersion estimated relative force accuracy " - "= {:.8g}\n",acc/two_charge_force); + "= {:.8}\n",acc/two_charge_force); mesg += " using " LMP_FFT_PREC " precision " LMP_FFT_LIB "\n"; mesg += fmt::format(" 3d grid and FFT values/proc = {} {}\n", ngrid_max, nfft_both_max); From 3183af9b85a46afa96836048e670814ebbd940a1 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Thu, 20 Aug 2020 23:42:00 -0400 Subject: [PATCH 37/38] fix a few more initialization issues in pppm/disp and pppm/disp/intel --- src/KSPACE/pppm_disp.cpp | 2 +- src/USER-INTEL/pppm_disp_intel.cpp | 3 ++- src/USER-INTEL/pppm_disp_intel.h | 3 --- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/src/KSPACE/pppm_disp.cpp b/src/KSPACE/pppm_disp.cpp index e788fe1bc0..c81dcbf909 100644 --- a/src/KSPACE/pppm_disp.cpp +++ b/src/KSPACE/pppm_disp.cpp @@ -214,7 +214,7 @@ PPPMDisp::PPPMDisp(LAMMPS *lmp) : KSpace(lmp), gc_buf1 = gc_buf2 = NULL; gc6_buf1 = gc6_buf2 = NULL; ngc_buf1 = ngc_buf2 = ngc6_buf1 = ngc6_buf2 = 0; - npergrid = npergrid6 = 0; + ngrid = ngrid_6 = npergrid = npergrid6 = 0; nmax = 0; part2grid = NULL; diff --git a/src/USER-INTEL/pppm_disp_intel.cpp b/src/USER-INTEL/pppm_disp_intel.cpp index 054b99a11f..270299b4c5 100644 --- a/src/USER-INTEL/pppm_disp_intel.cpp +++ b/src/USER-INTEL/pppm_disp_intel.cpp @@ -131,9 +131,10 @@ void PPPMDispIntel::init() // For vectorization, we need some padding in the end // The first thread computes on the global density if ((comm->nthreads > 1) && !_use_lrt) { + int mygrid = MAX(ngrid,ngrid_6); memory->destroy(perthread_density); memory->create(perthread_density, comm->nthreads-1, - ngrid + INTEL_P3M_ALIGNED_MAXORDER, + mygrid + INTEL_P3M_ALIGNED_MAXORDER, "pppmdispintel:perthread_density"); } diff --git 
a/src/USER-INTEL/pppm_disp_intel.h b/src/USER-INTEL/pppm_disp_intel.h index 7c88c458f8..89f580aa08 100644 --- a/src/USER-INTEL/pppm_disp_intel.h +++ b/src/USER-INTEL/pppm_disp_intel.h @@ -70,8 +70,6 @@ namespace LAMMPS_NS { FFT_SCALAR *particle_eky6; FFT_SCALAR *particle_ekz6; - - int _use_table; int rho_points; FFT_SCALAR **rho_lookup; @@ -82,7 +80,6 @@ namespace LAMMPS_NS { int _use_packing; - #ifdef _LMP_INTEL_OFFLOAD int _use_base; #endif From 42018d3b357062284744ba831c412131718a2a47 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Thu, 20 Aug 2020 23:42:23 -0400 Subject: [PATCH 38/38] add dispersion only kspace style tests --- .../tests/kspace-pppm_disp_ad_only.yaml | 99 +++++++++++++++++++ .../tests/kspace-pppm_disp_only.yaml | 98 ++++++++++++++++++ 2 files changed, 197 insertions(+) create mode 100644 unittest/force-styles/tests/kspace-pppm_disp_ad_only.yaml create mode 100644 unittest/force-styles/tests/kspace-pppm_disp_only.yaml diff --git a/unittest/force-styles/tests/kspace-pppm_disp_ad_only.yaml b/unittest/force-styles/tests/kspace-pppm_disp_ad_only.yaml new file mode 100644 index 0000000000..79264c45bb --- /dev/null +++ b/unittest/force-styles/tests/kspace-pppm_disp_ad_only.yaml @@ -0,0 +1,99 @@ +--- +lammps_version: 21 Jul 2020 +date_generated: Thu Aug 20 23:05:29 202 +epsilon: 2.5e-13 +prerequisites: ! | + atom full + pair lj/long/coul/long + kspace pppm/disp +pre_commands: ! "" +post_commands: ! | + pair_modify compute no + kspace_style pppm/disp 1.0e-4 + kspace_modify gewald 0.5 + kspace_modify force/disp/real 0.001 + kspace_modify force/disp/kspace 0.005 + kspace_modify diff ad +input_file: in.fourmol +pair_style: lj/long/coul/long long off 7.0 +pair_coeff: ! | + 1 1 0.02 2.5 + 2 2 0.005 1.0 + 2 4 0.005 0.5 + 3 3 0.02 3.2 + 4 4 0.015 3.1 + 5 5 0.015 3.1 +extract: ! | + epsilon 2 + sigma 2 + cut_coul 0 +natoms: 29 +init_vdwl: 0 +init_coul: 0 +init_stress: ! |2- + 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 +init_forces: ! 
|2 + 1 6.9724769730763625e-11 -6.2917196472420914e-11 -1.7553582754737637e-11 + 2 7.6094811340731255e-13 -1.1540411942409495e-12 1.6298559231496556e-13 + 3 3.9430185022927007e-11 -1.9017673718233126e-11 -1.6651673285376923e-13 + 4 1.5917087679856494e-12 -7.5219417810712124e-13 3.9732355364959770e-13 + 5 1.3754210730241487e-12 -5.2393248740046831e-13 -5.8786972569898321e-13 + 6 2.4324626429202970e-11 -5.9541975521081885e-12 1.1615602625372913e-11 + 7 3.9335920395601978e-11 -4.5805387197461763e-12 4.9818207226960900e-11 + 8 1.9840262237451370e-11 6.2076746170376731e-12 -4.0963480752166205e-13 + 9 6.7032491388560462e-14 2.8806602906466370e-14 -5.1168576901689580e-13 + 10 -9.5639153188399775e-12 1.6260508185384930e-11 5.9852790016982401e-12 + 11 -2.1047310133618033e-13 7.9345765138879343e-13 7.5666446278699520e-13 + 12 -2.9173061030991263e-11 3.7633064986308384e-12 1.7460466677353034e-11 + 13 -1.4524529065361568e-12 3.3470067557100611e-13 6.2185091609144014e-13 + 14 -7.3138369212027322e-13 1.0248711043717948e-13 1.1305113696402957e-12 + 15 -9.5037504870165340e-13 -3.5714658367712204e-13 3.9441042026885766e-13 + 16 -2.2415639659259037e-11 2.7640113997479121e-11 -1.1642230752605363e-11 + 17 -2.1208473166391679e-11 3.8522058586534765e-11 -4.6846216614724886e-11 + 18 -1.8040308770875740e-11 -5.7279008382924961e-11 7.2376750640813174e-11 + 19 -3.8160168113867275e-14 -1.0975133018216530e-12 1.5219157003617188e-12 + 20 -8.2805108197841928e-13 -1.1974415437385753e-12 1.4978331961150702e-12 + 21 -8.9475625142751869e-11 6.3090972691297525e-11 7.4019219615381729e-11 + 22 -1.5546663673414098e-12 1.1916129263874209e-12 1.5208406092298733e-12 + 23 -1.6319521654857783e-12 1.1826485038559488e-12 1.4904048534083759e-12 + 24 -1.5641127395622623e-11 -5.8519640828036183e-11 -7.6239449591252700e-11 + 25 1.1910315964875356e-13 -1.1889645936751337e-12 -1.4555354381968814e-12 + 26 -7.1757130498965896e-13 -1.1745493969901884e-12 -1.5212605958363234e-12 + 27 8.0820809208387831e-11 6.3170487105530568e-11 -6.5474684007301223e-11 + 28 1.6363504379062180e-12 1.1991903709725124e-12 -1.1721879859607168e-12 + 29 1.5216247551162307e-12 1.1846825159982258e-12 -1.4010202583443579e-12 +run_vdwl: 0 +run_coul: 0 +run_stress: ! |2- + 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 +run_forces: ! 
|2 + 1 6.9638112975627505e-11 -6.2952655433984510e-11 -1.7520579341404558e-11 + 2 7.5441280063952564e-13 -1.1565278235052112e-12 1.5428964869214424e-13 + 3 3.9495752841065062e-11 -1.9092781431624742e-11 -1.4273797798954887e-13 + 4 1.5925753071119834e-12 -7.5018590838065349e-13 4.0694176768714727e-13 + 5 1.3773389032846222e-12 -5.2518353267880336e-13 -5.8790169309916104e-13 + 6 2.4346713154520441e-11 -6.0219217022078622e-12 1.1556637696922417e-11 + 7 3.9348886643182682e-11 -4.5159528947341432e-12 4.9885111934485096e-11 + 8 1.9778099538765962e-11 6.2897367226276710e-12 -4.3131958485496316e-13 + 9 6.3695687649969267e-14 3.3892963902521213e-14 -5.1413748665229894e-13 + 10 -9.6012722021579935e-12 1.6233041014363616e-11 6.0003996378633241e-12 + 11 -2.0949350984685278e-13 7.9562479682529135e-13 7.6275214731002570e-13 + 12 -2.9254571969811829e-11 3.7981192304699482e-12 1.7545839680382277e-11 + 13 -1.4582673940336726e-12 3.2818654537516632e-13 6.2347550527444227e-13 + 14 -7.3106004851428835e-13 1.1022739695233736e-13 1.1369294512424691e-12 + 15 -9.4060413626822731e-13 -3.5126188279055496e-13 3.8676904179416883e-13 + 16 -2.2408460889929651e-11 2.7657483426256059e-11 -1.1741964403647136e-11 + 17 -2.1298916095589507e-11 3.8489167711847335e-11 -4.6910251494943480e-11 + 18 -1.7957271474310740e-11 -5.7234788024451294e-11 7.2383088876515692e-11 + 19 -3.6288894307680100e-14 -1.0943572602766692e-12 1.5210320121776364e-12 + 20 -8.3935575478300129e-13 -1.1966327701053927e-12 1.4966158763399045e-12 + 21 -8.9447202173297866e-11 6.3082241565048134e-11 7.4014350079233694e-11 + 22 -1.5510607489175700e-12 1.1907296946509683e-12 1.5210644252358221e-12 + 23 -1.6318182177140126e-12 1.1841207226717162e-12 1.4885209322729697e-12 + 24 -1.5676362809971050e-11 -5.8529118094744956e-11 -7.6215807688416359e-11 + 25 1.0742373795185689e-13 -1.1875074064094843e-12 -1.4571756878225340e-12 + 26 -7.1286524148732120e-13 -1.1757034686897436e-12 -1.5207002392359802e-12 + 27 8.0828500068665394e-11 6.3165679701490667e-11 -6.5424272562687016e-11 + 28 1.6362386625442489e-12 1.1989853840181747e-12 -1.1753814351696325e-12 + 29 1.5246414975786595e-12 1.1846002567244795e-12 -1.4036250137375415e-12 +... diff --git a/unittest/force-styles/tests/kspace-pppm_disp_only.yaml b/unittest/force-styles/tests/kspace-pppm_disp_only.yaml new file mode 100644 index 0000000000..f258248944 --- /dev/null +++ b/unittest/force-styles/tests/kspace-pppm_disp_only.yaml @@ -0,0 +1,98 @@ +--- +lammps_version: 21 Jul 2020 +date_generated: Thu Aug 20 23:05:29 202 +epsilon: 2.5e-13 +prerequisites: ! | + atom full + pair lj/long/coul/long + kspace pppm/disp +pre_commands: ! "" +post_commands: ! | + pair_modify compute no + kspace_style pppm/disp 1.0e-4 + kspace_modify gewald 0.5 + kspace_modify force/disp/real 0.001 + kspace_modify force/disp/kspace 0.005 +input_file: in.fourmol +pair_style: lj/long/coul/long long off 7.0 +pair_coeff: ! | + 1 1 0.02 2.5 + 2 2 0.005 1.0 + 2 4 0.005 0.5 + 3 3 0.02 3.2 + 4 4 0.015 3.1 + 5 5 0.015 3.1 +extract: ! | + epsilon 2 + sigma 2 + cut_coul 0 +natoms: 29 +init_vdwl: 0 +init_coul: 0 +init_stress: ! |2- + 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 +init_forces: ! 
|2 + 1 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 2 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 3 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 4 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 5 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 6 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 7 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 8 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 9 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 10 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 11 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 12 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 13 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 14 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 15 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 16 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 17 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 18 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 19 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 20 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 21 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 22 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 23 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 24 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 25 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 26 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 27 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 28 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 29 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 +run_vdwl: 0 +run_coul: 0 +run_stress: ! |2- + 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 +run_forces: ! 
|2 + 1 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 2 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 3 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 4 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 5 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 6 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 7 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 8 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 9 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 10 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 11 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 12 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 13 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 14 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 15 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 16 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 17 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 18 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 19 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 20 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 21 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 22 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 23 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 24 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 25 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 26 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 27 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 28 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 + 29 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 +...
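
A side note on the pppm/disp fix in patch 32 above: the expression kper = k - nx_pppm_6*(2*k/nx_pppm_6) folds an FFT grid index k in [0, N) onto its signed periodic image in [-N/2, N/2-1], which is why the wrap must use the dispersion grid dimensions (nx_pppm_6 etc.) rather than the Coulomb ones when the two grids differ. A minimal standalone sketch of the mapping follows; fold_index is a hypothetical helper name, not from the LAMMPS sources.

/* ----------------------------------------------------------------------
   standalone sketch of the periodic index folding used in compute_qopt_6_*
   (hypothetical helper name, illustration only)
------------------------------------------------------------------------- */

#include <cstdio>

// matches  kper = k - n*(2*k/n)  with integer division:
// indices 0..n/2-1 stay as-is, indices n/2..n-1 wrap to -n/2..-1
static int fold_index(int k, int n) { return k - n * (2 * k / n); }

int main()
{
  const int n = 8;                                        // hypothetical grid size
  for (int k = 0; k < n; k++)
    std::printf("k=%d -> kper=%d\n", k, fold_index(k, n)); // 0 1 2 3 -4 -3 -2 -1
  return 0;
}
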