Merge pull request #2280 from lammps/gridcomm-tiled

Support for tiled decompositions in PPPM
Commit 4c46119a48, authored by Axel Kohlmeyer on 2020-08-21 00:16:20 -04:00, committed by GitHub.
50 changed files with 3818 additions and 2289 deletions
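The change that recurs throughout the hunks below: each KSpace style's per-purpose grid-communication objects (cg, cg_peratom, cg_all, ...) and their ghost_notify()/setup() handshake are replaced by a single GridComm object, and every communication call now states how many values per grid point travel, how large one value is, and which caller-owned buffers to use, so one object serves both the regular (brick) and the new tiled decompositions. A minimal before/after sketch of the calling pattern, pieced together from the PPPM hunks below (illustrative only, not a compilable excerpt):

// before this PR: one GridComm per communication purpose,
// ghost handshake and message buffers handled inside the class
cg->ghost_notify();
cg->setup();
cg->forward_comm(this,FORWARD_IK);                  // E-field ghost comm
cg_peratom->forward_comm(this,FORWARD_IK_PERATOM);  // per-atom ghost comm

// after this PR: one GridComm object; the caller passes the number of
// values per grid point, their size, and buffers sized via gc->setup()
gc->forward_comm_kspace(this,3,sizeof(FFT_SCALAR),FORWARD_IK,
                        gc_buf1,gc_buf2,MPI_FFT_SCALAR);
gc->forward_comm_kspace(this,7,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM,
                        gc_buf1,gc_buf2,MPI_FFT_SCALAR);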


@ -203,11 +203,7 @@ void PPPMGPU::compute(int eflag, int vflag)
// If need per-atom energies/virials, allocate per-atom arrays here
// so that particle map on host can be done concurrently with GPU calculations
if (evflag_atom && !peratom_allocate_flag) {
allocate_peratom();
cg_peratom->ghost_notify();
cg_peratom->setup();
}
if (evflag_atom && !peratom_allocate_flag) allocate_peratom();
if (triclinic == 0) {
bool success = true;
@ -258,10 +254,12 @@ void PPPMGPU::compute(int eflag, int vflag)
// remap from 3d decomposition to FFT decomposition
if (triclinic == 0) {
cg->reverse_comm(this,REVERSE_RHO_GPU);
gc->reverse_comm_kspace(this,1,sizeof(FFT_SCALAR),REVERSE_RHO_GPU,
gc_buf1,gc_buf2,MPI_FFT_SCALAR);
brick2fft_gpu();
} else {
cg->reverse_comm(this,REVERSE_RHO);
gc->reverse_comm_kspace(this,1,sizeof(FFT_SCALAR),REVERSE_RHO,
gc_buf1,gc_buf2,MPI_FFT_SCALAR);
PPPM::brick2fft();
}
@ -274,16 +272,22 @@ void PPPMGPU::compute(int eflag, int vflag)
// all procs communicate E-field values
// to fill ghost cells surrounding their 3d bricks
if (differentiation_flag == 1) cg->forward_comm(this,FORWARD_AD);
else cg->forward_comm(this,FORWARD_IK);
if (differentiation_flag == 1)
gc->forward_comm_kspace(this,1,sizeof(FFT_SCALAR),FORWARD_AD,
gc_buf1,gc_buf2,MPI_FFT_SCALAR);
else
gc->forward_comm_kspace(this,3,sizeof(FFT_SCALAR),FORWARD_IK,
gc_buf1,gc_buf2,MPI_FFT_SCALAR);
// extra per-atom energy/virial communication
if (evflag_atom) {
if (differentiation_flag == 1 && vflag_atom)
cg_peratom->forward_comm(this,FORWARD_AD_PERATOM);
gc->forward_comm_kspace(this,6,sizeof(FFT_SCALAR),FORWARD_AD_PERATOM,
gc_buf1,gc_buf2,MPI_FFT_SCALAR);
else if (differentiation_flag == 0)
cg_peratom->forward_comm(this,FORWARD_IK_PERATOM);
gc->forward_comm_kspace(this,7,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM,
gc_buf1,gc_buf2,MPI_FFT_SCALAR);
}
poisson_time += MPI_Wtime()-t3;
@ -510,8 +514,10 @@ void PPPMGPU::poisson_ik()
pack own values to buf to send to another proc
------------------------------------------------------------------------- */
void PPPMGPU::pack_forward(int flag, FFT_SCALAR *buf, int nlist, int *list)
void PPPMGPU::pack_forward_grid(int flag, void *vbuf, int nlist, int *list)
{
FFT_SCALAR *buf = (FFT_SCALAR *) vbuf;
int n = 0;
if (flag == FORWARD_IK) {
@ -568,8 +574,10 @@ void PPPMGPU::pack_forward(int flag, FFT_SCALAR *buf, int nlist, int *list)
unpack another proc's own values from buf and set own ghost values
------------------------------------------------------------------------- */
void PPPMGPU::unpack_forward(int flag, FFT_SCALAR *buf, int nlist, int *list)
void PPPMGPU::unpack_forward_grid(int flag, void *vbuf, int nlist, int *list)
{
FFT_SCALAR *buf = (FFT_SCALAR *) vbuf;
int n = 0;
if (flag == FORWARD_IK) {
@ -626,8 +634,10 @@ void PPPMGPU::unpack_forward(int flag, FFT_SCALAR *buf, int nlist, int *list)
pack ghost values into buf to send to another proc
------------------------------------------------------------------------- */
void PPPMGPU::pack_reverse(int flag, FFT_SCALAR *buf, int nlist, int *list)
void PPPMGPU::pack_reverse_grid(int flag, void *vbuf, int nlist, int *list)
{
FFT_SCALAR *buf = (FFT_SCALAR *) vbuf;
if (flag == REVERSE_RHO_GPU) {
FFT_SCALAR *src = &density_brick_gpu[nzlo_out][nylo_out][nxlo_out];
for (int i = 0; i < nlist; i++)
@ -643,8 +653,10 @@ void PPPMGPU::pack_reverse(int flag, FFT_SCALAR *buf, int nlist, int *list)
unpack another proc's ghost values from buf and add to own values
------------------------------------------------------------------------- */
void PPPMGPU::unpack_reverse(int flag, FFT_SCALAR *buf, int nlist, int *list)
void PPPMGPU::unpack_reverse_grid(int flag, void *vbuf, int nlist, int *list)
{
FFT_SCALAR *buf = (FFT_SCALAR *) vbuf;
if (flag == REVERSE_RHO_GPU) {
FFT_SCALAR *dest = &density_brick_gpu[nzlo_out][nylo_out][nxlo_out];
for (int i = 0; i < nlist; i++)
@ -818,7 +830,8 @@ void PPPMGPU::compute_group_group(int groupbit_A, int groupbit_B, int AA_flag)
density_brick = density_A_brick;
density_fft = density_A_fft;
cg->reverse_comm(this,REVERSE_RHO);
gc->reverse_comm_kspace(this,1,sizeof(FFT_SCALAR),REVERSE_RHO,
gc_buf1,gc_buf2,MPI_FFT_SCALAR);
brick2fft();
// group B
@ -826,7 +839,8 @@ void PPPMGPU::compute_group_group(int groupbit_A, int groupbit_B, int AA_flag)
density_brick = density_B_brick;
density_fft = density_B_fft;
cg->reverse_comm(this,REVERSE_RHO);
gc->reverse_comm_kspace(this,1,sizeof(FFT_SCALAR),REVERSE_RHO,
gc_buf1,gc_buf2,MPI_FFT_SCALAR);
brick2fft();
// switch back pointers


@ -46,10 +46,10 @@ class PPPMGPU : public PPPM {
void brick2fft_gpu();
virtual void poisson_ik();
void pack_forward(int, FFT_SCALAR *, int, int *);
void unpack_forward(int, FFT_SCALAR *, int, int *);
void pack_reverse(int, FFT_SCALAR *, int, int *);
void unpack_reverse(int, FFT_SCALAR *, int, int *);
void pack_forward_grid(int, void *, int, int *);
void unpack_forward_grid(int, void *, int, int *);
void pack_reverse_grid(int, void *, int, int *);
void unpack_reverse_grid(int, void *, int, int *);
FFT_SCALAR ***create_3d_offset(int, int, int, int, int, int, const char *,
FFT_SCALAR *, int);
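The renamed pack/unpack hooks above also change their buffer argument from FFT_SCALAR * to void *, so the same GridComm machinery can drive KSpace styles whose grids use different scalar types; each override simply casts the buffer back, as the pppm_gpu.cpp hunks above and the msm.cpp hunks further below both do. Schematically (following the diff, not a compilable excerpt):

void PPPMGPU::pack_forward_grid(int flag, void *vbuf, int nlist, int *list)
{
  FFT_SCALAR *buf = (FFT_SCALAR *) vbuf;   // PPPM grids hold FFT_SCALAR values
  // ... copy the nlist grid values selected by list[] into buf, as before ...
}

void MSM::pack_forward_grid(int flag, void *vbuf, int nlist, int *list)
{
  double *buf = (double *) vbuf;           // MSM grids hold plain doubles
  // ...
}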

File diff suppressed because it is too large.


@ -14,81 +14,59 @@
#ifndef LMP_GRIDCOMM_KOKKOS_H
#define LMP_GRIDCOMM_KOKKOS_H
#include "pointers.h"
#include "gridcomm.h"
#include "kokkos_type.h"
#include "fftdata_kokkos.h"
#ifdef FFT_SINGLE
typedef float FFT_SCALAR;
#define MPI_FFT_SCALAR MPI_FLOAT
#else
typedef double FFT_SCALAR;
#define MPI_FFT_SCALAR MPI_DOUBLE
#endif
namespace LAMMPS_NS {
template<class DeviceType>
class GridCommKokkos : protected Pointers {
class GridCommKokkos : public GridComm {
public:
typedef DeviceType device_type;
typedef ArrayTypes<DeviceType> AT;
typedef FFTArrayTypes<DeviceType> FFT_AT;
GridCommKokkos(class LAMMPS *, MPI_Comm, int, int,
GridCommKokkos(class LAMMPS *, MPI_Comm, int, int, int,
int, int, int, int, int, int,
int, int, int, int, int, int);
GridCommKokkos(class LAMMPS *, MPI_Comm, int, int, int, int,
int, int, int, int, int, int,
int, int, int, int, int, int,
int, int, int, int, int, int);
GridCommKokkos(class LAMMPS *, MPI_Comm, int, int,
int, int, int, int, int, int,
int, int, int, int, int, int,
int, int, int, int, int, int,
int, int, int, int, int, int);
~GridCommKokkos();
void ghost_notify();
int ghost_overlap();
void setup();
void forward_comm(class KSpace *, int);
void reverse_comm(class KSpace *, int);
double memory_usage();
virtual ~GridCommKokkos();
void forward_comm_kspace(class KSpace *, int, int,
FFT_DAT::tdual_FFT_SCALAR_1d &, FFT_DAT::tdual_FFT_SCALAR_1d &, MPI_Datatype);
void reverse_comm_kspace(class KSpace *, int, int,
FFT_DAT::tdual_FFT_SCALAR_1d &, FFT_DAT::tdual_FFT_SCALAR_1d &, MPI_Datatype);
private:
int me;
int nforward,nreverse;
MPI_Comm gridcomm;
MPI_Request request;
DAT::tdual_int_2d k_swap_packlist;
DAT::tdual_int_2d k_swap_unpacklist;
// in = inclusive indices of 3d grid chunk that I own
// out = inclusive indices of 3d grid chunk I own plus ghosts I use
// proc = 6 neighbor procs that surround me
// ghost = # of my owned grid planes needed from me
// by each of 6 neighbor procs to become their ghost planes
DAT::tdual_int_2d k_send_packlist;
int inxlo,inxhi,inylo,inyhi,inzlo,inzhi;
int outxlo,outxhi,outylo,outyhi,outzlo,outzhi;
int outxlo_max,outxhi_max,outylo_max,outyhi_max,outzlo_max,outzhi_max;
int procxlo,procxhi,procylo,procyhi,proczlo,proczhi;
int ghostxlo,ghostxhi,ghostylo,ghostyhi,ghostzlo,ghostzhi;
DAT::tdual_int_2d k_recv_unpacklist;
int nbuf;
//FFT_SCALAR *buf1,*buf2;
FFT_DAT::tdual_FFT_SCALAR_1d k_buf1;
FFT_DAT::tdual_FFT_SCALAR_1d k_buf2;
DAT::tdual_int_2d k_copy_packlist;
DAT::tdual_int_2d k_copy_unpacklist;
struct Swap {
int sendproc; // proc to send to for forward comm
int recvproc; // proc to recv from for forward comm
int npack; // # of datums to pack
int nunpack; // # of datums to unpack
//int *packlist; // 3d array offsets to pack
//int *unpacklist; // 3d array offsets to unpack
};
// -------------------------------------------
// internal methods
// -------------------------------------------
DAT::tdual_int_2d k_packlist;
DAT::tdual_int_2d k_unpacklist;
void setup_regular(int &, int &);
void setup_tiled(int &, int &);
int nswap;
Swap *swap;
void forward_comm_kspace_regular(class KSpace *, int, int,
FFT_DAT::tdual_FFT_SCALAR_1d &, FFT_DAT::tdual_FFT_SCALAR_1d &, MPI_Datatype);
void forward_comm_kspace_tiled(class KSpace *, int, int,
FFT_DAT::tdual_FFT_SCALAR_1d &, FFT_DAT::tdual_FFT_SCALAR_1d &, MPI_Datatype);
void reverse_comm_kspace_regular(class KSpace *, int, int,
FFT_DAT::tdual_FFT_SCALAR_1d &, FFT_DAT::tdual_FFT_SCALAR_1d &, MPI_Datatype);
void reverse_comm_kspace_tiled(class KSpace *, int, int,
FFT_DAT::tdual_FFT_SCALAR_1d &, FFT_DAT::tdual_FFT_SCALAR_1d &, MPI_Datatype);
void grow_swap();
int indices(DAT::tdual_int_2d &, int, int, int, int, int, int, int);
};


@ -23,10 +23,10 @@ class KokkosBaseFFT {
KokkosBaseFFT() {}
//Kspace
virtual void pack_forward_kspace_kokkos(int, FFT_DAT::tdual_FFT_SCALAR_1d &, int, DAT::tdual_int_2d &, int) {};
virtual void unpack_forward_kspace_kokkos(int, FFT_DAT::tdual_FFT_SCALAR_1d &, int, DAT::tdual_int_2d &, int) {};
virtual void pack_reverse_kspace_kokkos(int, FFT_DAT::tdual_FFT_SCALAR_1d &, int, DAT::tdual_int_2d &, int) {};
virtual void unpack_reverse_kspace_kokkos(int, FFT_DAT::tdual_FFT_SCALAR_1d &, int, DAT::tdual_int_2d &, int) {};
virtual void pack_forward_grid_kokkos(int, FFT_DAT::tdual_FFT_SCALAR_1d &, int, DAT::tdual_int_2d &, int) {};
virtual void unpack_forward_grid_kokkos(int, FFT_DAT::tdual_FFT_SCALAR_1d &, int, int, DAT::tdual_int_2d &, int) {};
virtual void pack_reverse_grid_kokkos(int, FFT_DAT::tdual_FFT_SCALAR_1d &, int, DAT::tdual_int_2d &, int) {};
virtual void unpack_reverse_grid_kokkos(int, FFT_DAT::tdual_FFT_SCALAR_1d &, int, int, DAT::tdual_int_2d &, int) {};
};
}


@ -96,8 +96,7 @@ PPPMKokkos<DeviceType>::PPPMKokkos(LAMMPS *lmp) : PPPM(lmp)
fft1 = fft2 = NULL;
remap = NULL;
cg = NULL;
cg_peratom = NULL;
gc = NULL;
nmax = 0;
//part2grid = NULL;
@ -192,9 +191,6 @@ void PPPMKokkos<DeviceType>::init()
"slab correction");
if (domain->dimension == 2) error->all(FLERR,
"Cannot use PPPM with 2d simulation");
if (comm->style != 0)
error->universe_all(FLERR,"PPPM can only currently be used with "
"comm_style brick");
if (!atomKK->q_flag) error->all(FLERR,"Kspace style requires atom attribute q");
@ -255,9 +251,7 @@ void PPPMKokkos<DeviceType>::init()
// or overlap is allowed, then done
// else reduce order and try again
int (*procneigh)[2] = comm->procneigh;
GridCommKokkos<DeviceType> *cgtmp = NULL;
GridCommKokkos<DeviceType> *gctmp = NULL;
int iteration = 0;
while (order >= minorder) {
@ -269,24 +263,23 @@ void PPPMKokkos<DeviceType>::init()
set_grid_local();
if (overlap_allowed) break;
cgtmp = new GridCommKokkos<DeviceType>(lmp,world,1,1,
gctmp = new GridCommKokkos<DeviceType>(lmp,world,nx_pppm,ny_pppm,nz_pppm,
nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out,
procneigh[0][0],procneigh[0][1],procneigh[1][0],
procneigh[1][1],procneigh[2][0],procneigh[2][1]);
cgtmp->ghost_notify();
if (!cgtmp->ghost_overlap()) break;
delete cgtmp;
nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out);
int tmp1,tmp2;
gctmp->setup(tmp1,tmp2);
if (!gctmp->ghost_adjacent()) break;
delete gctmp;
order--;
iteration++;
}
if (order < minorder) error->all(FLERR,"PPPM order < minimum allowed order");
if (!overlap_allowed && cgtmp->ghost_overlap())
if (!overlap_allowed && gctmp->ghost_adjacent())
error->all(FLERR,"PPPM grid stencil extends "
"beyond nearest neighbor processor");
if (cgtmp) delete cgtmp;
if (gctmp) delete gctmp;
// adjust g_ewald
@ -320,8 +313,6 @@ void PPPMKokkos<DeviceType>::init()
// don't invoke allocate peratom(), will be allocated when needed
allocate();
cg->ghost_notify();
cg->setup();
// pre-compute Green's function denominator expansion
// pre-compute 1d charge distribution coefficients
@ -564,11 +555,9 @@ void PPPMKokkos<DeviceType>::setup_grid()
allocate();
cg->ghost_notify();
if (overlap_allowed == 0 && cg->ghost_overlap())
if (!overlap_allowed && !gc->ghost_adjacent())
error->all(FLERR,"PPPM grid stencil extends "
"beyond nearest neighbor processor");
cg->setup();
// pre-compute Green's function denominator expansion
// pre-compute 1d charge distribution coefficients
@ -576,7 +565,7 @@ void PPPMKokkos<DeviceType>::setup_grid()
compute_gf_denom();
compute_rho_coeff();
// pre-compute volume-dependent coeffs
// pre-compute volume-dependent coeffs for portion of grid I now own
setup();
}
@ -609,11 +598,8 @@ void PPPMKokkos<DeviceType>::compute(int eflag, int vflag)
d_vatom = k_vatom.view<DeviceType>();
}
if (evflag_atom && !peratom_allocate_flag) {
if (evflag_atom && !peratom_allocate_flag)
allocate_peratom();
cg_peratom->ghost_notify();
cg_peratom->setup();
}
x = atomKK->k_x.view<DeviceType>();
f = atomKK->k_f.view<DeviceType>();
@ -667,7 +653,8 @@ void PPPMKokkos<DeviceType>::compute(int eflag, int vflag)
// to fully sum contribution in their 3d bricks
// remap from 3d decomposition to FFT decomposition
cg->reverse_comm(this,REVERSE_RHO);
gc->reverse_comm_kspace(this,1,REVERSE_RHO,
k_gc_buf1,k_gc_buf2,MPI_FFT_SCALAR);
brick2fft();
// compute potential gradient on my FFT grid and
@ -680,12 +667,14 @@ void PPPMKokkos<DeviceType>::compute(int eflag, int vflag)
// all procs communicate E-field values
// to fill ghost cells surrounding their 3d bricks
cg->forward_comm(this,FORWARD_IK);
gc->forward_comm_kspace(this,3,FORWARD_IK,
k_gc_buf1,k_gc_buf2,MPI_FFT_SCALAR);
// extra per-atom energy/virial communication
if (evflag_atom)
cg_peratom->forward_comm(this,FORWARD_IK_PERATOM);
gc->forward_comm_kspace(this,7,FORWARD_IK_PERATOM,
k_gc_buf1,k_gc_buf2,MPI_FFT_SCALAR);
// calculate the force on my particles
@ -844,14 +833,18 @@ void PPPMKokkos<DeviceType>::allocate()
1,0,0,FFT_PRECISION,collective_flag,cuda_aware_flag);
// create ghost grid object for rho and electric field communication
// also create 2 bufs for ghost grid cell comm, passed to GridComm methods
int (*procneigh)[2] = comm->procneigh;
cg = new GridCommKokkos<DeviceType>(lmp,world,3,1,
gc = new GridCommKokkos<DeviceType>(lmp,world,nx_pppm,ny_pppm,nz_pppm,
nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out,
procneigh[0][0],procneigh[0][1],procneigh[1][0],
procneigh[1][1],procneigh[2][0],procneigh[2][1]);
nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out);
gc->setup(ngc_buf1,ngc_buf2);
npergrid = 3;
k_gc_buf1 = FFT_DAT::tdual_FFT_SCALAR_1d("pppm:gc_buf1",npergrid*ngc_buf1);
k_gc_buf2 = FFT_DAT::tdual_FFT_SCALAR_1d("pppm:gc_buf2",npergrid*ngc_buf2);
}
/* ----------------------------------------------------------------------
@ -876,8 +869,8 @@ void PPPMKokkos<DeviceType>::deallocate()
fft2 = NULL;
delete remap;
remap = NULL;
delete cg;
cg = NULL;
delete gc;
gc = NULL;
}
/* ----------------------------------------------------------------------
@ -899,16 +892,13 @@ void PPPMKokkos<DeviceType>::allocate_peratom()
d_v5_brick = typename FFT_AT::t_FFT_SCALAR_3d("pppm:d_v5_brick",nzhi_out-nzlo_out+1,nyhi_out-nylo_out+1,nxhi_out-nxlo_out+1);
// create ghost grid object for rho and electric field communication
// use same GC ghost grid object for peratom grid communication
// but need to reallocate a larger gc_buf1 and gc_buf2
int (*procneigh)[2] = comm->procneigh;
npergrid = 7;
cg_peratom =
new GridCommKokkos<DeviceType>(lmp,world,7,1,
nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out,
procneigh[0][0],procneigh[0][1],procneigh[1][0],
procneigh[1][1],procneigh[2][0],procneigh[2][1]);
k_gc_buf1 = FFT_DAT::tdual_FFT_SCALAR_1d("pppm:gc_buf1",npergrid*ngc_buf1);
k_gc_buf2 = FFT_DAT::tdual_FFT_SCALAR_1d("pppm:gc_buf2",npergrid*ngc_buf2);
}
/* ----------------------------------------------------------------------
@ -919,9 +909,6 @@ template<class DeviceType>
void PPPMKokkos<DeviceType>::deallocate_peratom()
{
peratom_allocate_flag = 0;
delete cg_peratom;
cg_peratom = NULL;
}
/* ----------------------------------------------------------------------
@ -1185,153 +1172,11 @@ double PPPMKokkos<DeviceType>::final_accuracy()
template<class DeviceType>
void PPPMKokkos<DeviceType>::set_grid_local()
{
// global indices of PPPM grid range from 0 to N-1
// nlo_in,nhi_in = lower/upper limits of the 3d sub-brick of
// global PPPM grid that I own without ghost cells
// for slab PPPM, assign z grid as if it were not extended
PPPM::set_grid_local();
nxlo_in = static_cast<int> (comm->xsplit[comm->myloc[0]] * nx_pppm);
nxhi_in = static_cast<int> (comm->xsplit[comm->myloc[0]+1] * nx_pppm) - 1;
nylo_in = static_cast<int> (comm->ysplit[comm->myloc[1]] * ny_pppm);
nyhi_in = static_cast<int> (comm->ysplit[comm->myloc[1]+1] * ny_pppm) - 1;
nzlo_in = static_cast<int>
(comm->zsplit[comm->myloc[2]] * nz_pppm/slab_volfactor);
nzhi_in = static_cast<int>
(comm->zsplit[comm->myloc[2]+1] * nz_pppm/slab_volfactor) - 1;
// nlower,nupper = stencil size for mapping particles to PPPM grid
nlower = -(order-1)/2;
nupper = order/2;
// shift values for particle <-> grid mapping
// add/subtract OFFSET to avoid int(-0.75) = 0 when want it to be -1
if (order % 2) shift = OFFSET + 0.5;
else shift = OFFSET;
if (order % 2) shiftone = 0.0;
else shiftone = 0.5;
// nlo_out,nhi_out = lower/upper limits of the 3d sub-brick of
// global PPPM grid that my particles can contribute charge to
// effectively nlo_in,nhi_in + ghost cells
// nlo,nhi = global coords of grid pt to "lower left" of smallest/largest
// position a particle in my box can be at
// dist[3] = particle position bound = subbox + skin/2.0 + qdist
// qdist = offset due to TIP4P fictitious charge
// convert to triclinic if necessary
// nlo_out,nhi_out = nlo,nhi + stencil size for particle mapping
// for slab PPPM, assign z grid as if it were not extended
double *prd,*sublo,*subhi;
if (triclinic == 0) {
prd = domain->prd;
boxlo[0] = domain->boxlo[0];
boxlo[1] = domain->boxlo[1];
boxlo[2] = domain->boxlo[2];
sublo = domain->sublo;
subhi = domain->subhi;
} else {
prd = domain->prd_lamda;
boxlo[0] = domain->boxlo_lamda[0];
boxlo[1] = domain->boxlo_lamda[1];
boxlo[2] = domain->boxlo_lamda[2];
domain->x2lamda(atomKK->nlocal);
sublo = domain->sublo_lamda;
subhi = domain->subhi_lamda;
}
double xprd = prd[0];
double yprd = prd[1];
double zprd = prd[2];
double zprd_slab = zprd*slab_volfactor;
double dist[3];
double cuthalf = 0.5*neighbor->skin + qdist;
if (triclinic == 0) dist[0] = dist[1] = dist[2] = cuthalf;
else kspacebbox(cuthalf,&dist[0]);
int nlo,nhi;
nlo = static_cast<int> ((sublo[0]-dist[0]-boxlo[0]) *
nx_pppm/xprd + shift) - OFFSET;
nhi = static_cast<int> ((subhi[0]+dist[0]-boxlo[0]) *
nx_pppm/xprd + shift) - OFFSET;
nxlo_out = nlo + nlower;
nxhi_out = nhi + nupper;
nlo = static_cast<int> ((sublo[1]-dist[1]-boxlo[1]) *
ny_pppm/yprd + shift) - OFFSET;
nhi = static_cast<int> ((subhi[1]+dist[1]-boxlo[1]) *
ny_pppm/yprd + shift) - OFFSET;
nylo_out = nlo + nlower;
nyhi_out = nhi + nupper;
nlo = static_cast<int> ((sublo[2]-dist[2]-boxlo[2]) *
nz_pppm/zprd_slab + shift) - OFFSET;
nhi = static_cast<int> ((subhi[2]+dist[2]-boxlo[2]) *
nz_pppm/zprd_slab + shift) - OFFSET;
nzlo_out = nlo + nlower;
nzhi_out = nhi + nupper;
// for slab PPPM, change the grid boundary for processors at +z end
// to include the empty volume between periodically repeating slabs
// for slab PPPM, want charge data communicated from -z proc to +z proc,
// but not vice versa, also want field data communicated from +z proc to
// -z proc, but not vice versa
// this is accomplished by nzhi_in = nzhi_out on +z end (no ghost cells)
// also insure no other procs use ghost cells beyond +z limit
if (slabflag == 1) {
if (comm->myloc[2] == comm->procgrid[2]-1)
nzhi_in = nzhi_out = nz_pppm - 1;
nzhi_out = MIN(nzhi_out,nz_pppm-1);
}
// decomposition of FFT mesh
// global indices range from 0 to N-1
// proc owns entire x-dimension, clumps of columns in y,z dimensions
// npey_fft,npez_fft = # of procs in y,z dims
// if nprocs is small enough, proc can own 1 or more entire xy planes,
// else proc owns 2d sub-blocks of yz plane
// me_y,me_z = which proc (0-npe_fft-1) I am in y,z dimensions
// nlo_fft,nhi_fft = lower/upper limit of the section
// of the global FFT mesh that I own
int npey_fft,npez_fft;
if (nz_pppm >= nprocs) {
npey_fft = 1;
npez_fft = nprocs;
} else procs2grid2d(nprocs,ny_pppm,nz_pppm,&npey_fft,&npez_fft);
int me_y = me % npey_fft;
int me_z = me / npey_fft;
nxlo_fft = 0;
nxhi_fft = nx_pppm - 1;
nylo_fft = me_y*ny_pppm/npey_fft;
nyhi_fft = (me_y+1)*ny_pppm/npey_fft - 1;
nzlo_fft = me_z*nz_pppm/npez_fft;
nzhi_fft = (me_z+1)*nz_pppm/npez_fft - 1;
// PPPM grid pts owned by this proc, including ghosts
ngrid = (nxhi_out-nxlo_out+1) * (nyhi_out-nylo_out+1) *
(nzhi_out-nzlo_out+1);
// FFT grids owned by this proc, without ghosts
// nfft = FFT points in FFT decomposition on this proc
// nfft_brick = FFT points in 3d brick-decomposition on this proc
// nfft_both = greater of 2 values
nfft = (nxhi_fft-nxlo_fft+1) * (nyhi_fft-nylo_fft+1) *
(nzhi_fft-nzlo_fft+1);
int nfft_brick = (nxhi_in-nxlo_in+1) * (nyhi_in-nylo_in+1) *
(nzhi_in-nzlo_in+1);
nfft_both = MAX(nfft,nfft_brick);
}
/* ----------------------------------------------------------------------
@ -2568,7 +2413,7 @@ void PPPMKokkos<DeviceType>::operator()(TagPPPM_fieldforce_peratom, const int &i
------------------------------------------------------------------------- */
template<class DeviceType>
void PPPMKokkos<DeviceType>::pack_forward_kspace_kokkos(int flag, FFT_DAT::tdual_FFT_SCALAR_1d &k_buf, int nlist, DAT::tdual_int_2d &k_list, int index)
void PPPMKokkos<DeviceType>::pack_forward_grid_kokkos(int flag, FFT_DAT::tdual_FFT_SCALAR_1d &k_buf, int nlist, DAT::tdual_int_2d &k_list, int index)
{
typename AT::t_int_2d_um d_list = k_list.view<DeviceType>();
d_list_index = Kokkos::subview(d_list,index,Kokkos::ALL());
@ -2624,11 +2469,12 @@ void PPPMKokkos<DeviceType>::operator()(TagPPPM_pack_forward2, const int &i) con
------------------------------------------------------------------------- */
template<class DeviceType>
void PPPMKokkos<DeviceType>::unpack_forward_kspace_kokkos(int flag, FFT_DAT::tdual_FFT_SCALAR_1d &k_buf, int nlist, DAT::tdual_int_2d &k_list, int index)
void PPPMKokkos<DeviceType>::unpack_forward_grid_kokkos(int flag, FFT_DAT::tdual_FFT_SCALAR_1d &k_buf, int offset, int nlist, DAT::tdual_int_2d &k_list, int index)
{
typename AT::t_int_2d_um d_list = k_list.view<DeviceType>();
d_list_index = Kokkos::subview(d_list,index,Kokkos::ALL());
d_buf = k_buf.view<DeviceType>();
unpack_offset = offset;
nx = (nxhi_out-nxlo_out+1);
ny = (nyhi_out-nylo_out+1);
@ -2652,9 +2498,9 @@ void PPPMKokkos<DeviceType>::operator()(TagPPPM_unpack_forward1, const int &i) c
const int iz = (int) (dlist/(nx*ny));
const int iy = (int) ((dlist - iz*nx*ny)/nx);
const int ix = d_list_index[i] - iz*nx*ny - iy*nx;
d_vdx_brick(iz,iy,ix) = d_buf[3*i];
d_vdy_brick(iz,iy,ix) = d_buf[3*i+1];
d_vdz_brick(iz,iy,ix) = d_buf[3*i+2];
d_vdx_brick(iz,iy,ix) = d_buf[3*i + unpack_offset];
d_vdy_brick(iz,iy,ix) = d_buf[3*i+1 + unpack_offset];
d_vdz_brick(iz,iy,ix) = d_buf[3*i+2 + unpack_offset];
}
template<class DeviceType>
@ -2667,12 +2513,12 @@ void PPPMKokkos<DeviceType>::operator()(TagPPPM_unpack_forward2, const int &i) c
const int ix = d_list_index[i] - iz*nx*ny - iy*nx;
if (eflag_atom) d_u_brick(iz,iy,ix) = d_buf[7*i];
if (vflag_atom) {
d_v0_brick(iz,iy,ix) = d_buf[7*i+1];
d_v1_brick(iz,iy,ix) = d_buf[7*i+2];
d_v2_brick(iz,iy,ix) = d_buf[7*i+3];
d_v3_brick(iz,iy,ix) = d_buf[7*i+4];
d_v4_brick(iz,iy,ix) = d_buf[7*i+5];
d_v5_brick(iz,iy,ix) = d_buf[7*i+6];
d_v0_brick(iz,iy,ix) = d_buf[7*i+1 + unpack_offset];
d_v1_brick(iz,iy,ix) = d_buf[7*i+2 + unpack_offset];
d_v2_brick(iz,iy,ix) = d_buf[7*i+3 + unpack_offset];
d_v3_brick(iz,iy,ix) = d_buf[7*i+4 + unpack_offset];
d_v4_brick(iz,iy,ix) = d_buf[7*i+5 + unpack_offset];
d_v5_brick(iz,iy,ix) = d_buf[7*i+6 + unpack_offset];
}
}
@ -2681,7 +2527,7 @@ void PPPMKokkos<DeviceType>::operator()(TagPPPM_unpack_forward2, const int &i) c
------------------------------------------------------------------------- */
template<class DeviceType>
void PPPMKokkos<DeviceType>::pack_reverse_kspace_kokkos(int /*flag*/, FFT_DAT::tdual_FFT_SCALAR_1d &k_buf, int nlist, DAT::tdual_int_2d &k_list, int index)
void PPPMKokkos<DeviceType>::pack_reverse_grid_kokkos(int /*flag*/, FFT_DAT::tdual_FFT_SCALAR_1d &k_buf, int nlist, DAT::tdual_int_2d &k_list, int index)
{
typename AT::t_int_2d_um d_list = k_list.view<DeviceType>();
d_list_index = Kokkos::subview(d_list,index,Kokkos::ALL());
@ -2711,11 +2557,12 @@ void PPPMKokkos<DeviceType>::operator()(TagPPPM_pack_reverse, const int &i) cons
------------------------------------------------------------------------- */
template<class DeviceType>
void PPPMKokkos<DeviceType>::unpack_reverse_kspace_kokkos(int /*flag*/, FFT_DAT::tdual_FFT_SCALAR_1d &k_buf, int nlist, DAT::tdual_int_2d &k_list, int index)
void PPPMKokkos<DeviceType>::unpack_reverse_grid_kokkos(int /*flag*/, FFT_DAT::tdual_FFT_SCALAR_1d &k_buf, int offset, int nlist, DAT::tdual_int_2d &k_list, int index)
{
typename AT::t_int_2d_um d_list = k_list.view<DeviceType>();
d_list_index = Kokkos::subview(d_list,index,Kokkos::ALL());
d_buf = k_buf.view<DeviceType>();
unpack_offset = offset;
nx = (nxhi_out-nxlo_out+1);
ny = (nyhi_out-nylo_out+1);
@ -2733,7 +2580,7 @@ void PPPMKokkos<DeviceType>::operator()(TagPPPM_unpack_reverse, const int &i) co
const int iz = (int) (dlist/(nx*ny));
const int iy = (int) ((dlist - iz*nx*ny)/nx);
const int ix = d_list_index[i] - iz*nx*ny - iy*nx;
d_density_brick(iz,iy,ix) += d_buf[i];
d_density_brick(iz,iy,ix) += d_buf[i + unpack_offset];
}
/* ----------------------------------------------------------------------
@ -3044,7 +2891,9 @@ double PPPMKokkos<DeviceType>::memory_usage()
if (peratom_allocate_flag)
bytes += 6 * nbrick * sizeof(FFT_SCALAR);
if (cg) bytes += cg->memory_usage();
// two GridComm bufs
bytes += (ngc_buf1 + ngc_buf2) * npergrid * sizeof(FFT_SCALAR);
return bytes;
}


@ -311,6 +311,7 @@ class PPPMKokkos : public PPPM, public KokkosBaseFFT {
int nx,ny,nz;
typename AT::t_int_1d_um d_list_index;
typename FFT_AT::t_FFT_SCALAR_1d_um d_buf;
int unpack_offset;
DAT::tdual_int_scalar k_flag;
@ -353,10 +354,14 @@ class PPPMKokkos : public PPPM, public KokkosBaseFFT {
//double **acons;
typename Kokkos::DualView<F_FLOAT[8][7],Kokkos::LayoutRight,DeviceType>::t_host acons;
// FFTs and grid communication
FFT3dKokkos<DeviceType> *fft1,*fft2;
RemapKokkos<DeviceType> *remap;
GridCommKokkos<DeviceType> *cg;
GridCommKokkos<DeviceType> *cg_peratom;
GridCommKokkos<DeviceType> *gc;
FFT_DAT::tdual_FFT_SCALAR_1d k_gc_buf1,k_gc_buf2;
int ngc_buf1,ngc_buf2,npergrid;
//int **part2grid; // storage for particle -> grid mapping
typename AT::t_int_1d_3 d_part2grid;
@ -403,10 +408,10 @@ class PPPMKokkos : public PPPM, public KokkosBaseFFT {
// grid communication
void pack_forward_kspace_kokkos(int, FFT_DAT::tdual_FFT_SCALAR_1d &, int, DAT::tdual_int_2d &, int);
void unpack_forward_kspace_kokkos(int, FFT_DAT::tdual_FFT_SCALAR_1d &, int, DAT::tdual_int_2d &, int);
void pack_reverse_kspace_kokkos(int, FFT_DAT::tdual_FFT_SCALAR_1d &, int, DAT::tdual_int_2d &, int);
void unpack_reverse_kspace_kokkos(int, FFT_DAT::tdual_FFT_SCALAR_1d &, int, DAT::tdual_int_2d &, int);
void pack_forward_grid_kokkos(int, FFT_DAT::tdual_FFT_SCALAR_1d &, int, DAT::tdual_int_2d &, int);
void unpack_forward_grid_kokkos(int, FFT_DAT::tdual_FFT_SCALAR_1d &, int, int, DAT::tdual_int_2d &, int);
void pack_reverse_grid_kokkos(int, FFT_DAT::tdual_FFT_SCALAR_1d &, int, DAT::tdual_int_2d &, int);
void unpack_reverse_grid_kokkos(int, FFT_DAT::tdual_FFT_SCALAR_1d &, int, int, DAT::tdual_int_2d &, int);
// triclinic


@ -198,6 +198,7 @@ void fft_3d(FFT_DATA *in, FFT_DATA *out, int flag, struct fft_plan_3d *plan)
(FFT_SCALAR *) plan->scratch, plan->post_plan);
// scaling if required
if (flag == 1 && plan->scaled) {
norm = plan->norm;
num = plan->normnum;

File diff suppressed because it is too large.


@ -16,55 +16,57 @@
#include "pointers.h"
#ifdef FFT_SINGLE
typedef float FFT_SCALAR;
#define MPI_FFT_SCALAR MPI_FLOAT
#else
typedef double FFT_SCALAR;
#define MPI_FFT_SCALAR MPI_DOUBLE
#endif
namespace LAMMPS_NS {
class GridComm : protected Pointers {
public:
GridComm(class LAMMPS *, MPI_Comm, int, int,
GridComm(class LAMMPS *, MPI_Comm, int, int, int,
int, int, int, int, int, int,
int, int, int, int, int, int);
GridComm(class LAMMPS *, MPI_Comm, int, int, int, int,
int, int, int, int, int, int,
int, int, int, int, int, int,
int, int, int, int, int, int);
GridComm(class LAMMPS *, MPI_Comm, int, int,
int, int, int, int, int, int,
int, int, int, int, int, int,
int, int, int, int, int, int,
int, int, int, int, int, int);
~GridComm();
void ghost_notify();
int ghost_overlap();
void setup();
void forward_comm(class KSpace *, int);
void reverse_comm(class KSpace *, int);
double memory_usage();
virtual ~GridComm();
void setup(int &, int &);
int ghost_adjacent();
void forward_comm_kspace(class KSpace *, int, int, int,
void *, void *, MPI_Datatype);
void reverse_comm_kspace(class KSpace *, int, int, int,
void *, void *, MPI_Datatype);
private:
int me;
int nforward,nreverse;
MPI_Comm gridcomm;
MPI_Request request;
protected:
int me,nprocs;
int layout; // REGULAR or TILED
MPI_Comm gridcomm; // communicator for this class
// usually world, but MSM calls with subset
// in = inclusive indices of 3d grid chunk that I own
// out = inclusive indices of 3d grid chunk I own plus ghosts I use
// proc = 6 neighbor procs that surround me
// ghost = # of my owned grid planes needed from me
// by each of 6 neighbor procs to become their ghost planes
// inputs from caller via constructor
int inxlo,inxhi,inylo,inyhi,inzlo,inzhi;
int outxlo,outxhi,outylo,outyhi,outzlo,outzhi;
int outxlo_max,outxhi_max,outylo_max,outyhi_max,outzlo_max,outzhi_max;
int procxlo,procxhi,procylo,procyhi,proczlo,proczhi;
int ghostxlo,ghostxhi,ghostylo,ghostyhi,ghostzlo,ghostzhi;
int nx,ny,nz; // size of global grid in all 3 dims
int inxlo,inxhi; // inclusive extent of my grid chunk
int inylo,inyhi; // 0 <= in <= N-1
int inzlo,inzhi;
int outxlo,outxhi; // inclusive extent of my grid chunk plus
int outylo,outyhi; // ghost cells in all 6 directions
int outzlo,outzhi; // lo indices can be < 0, hi indices can be >= N
int fullxlo,fullxhi; // extent of grid chunk that caller stores
int fullylo,fullyhi; // can be same as out indices or larger
int fullzlo,fullzhi;
int nbuf;
FFT_SCALAR *buf1,*buf2;
// -------------------------------------------
// internal variables for REGULAR layout
// -------------------------------------------
int procxlo,procxhi; // 6 neighbor procs that adjoin me
int procylo,procyhi; // not used for comm_style = tiled
int proczlo,proczhi;
int ghostxlo,ghostxhi; // # of my owned grid planes needed
int ghostylo,ghostyhi; // by neighbor procs in each dir as their ghost planes
int ghostzlo,ghostzhi;
// swap = exchange of owned and ghost grid cells between 2 procs, including self
struct Swap {
int sendproc; // proc to send to for forward comm
@ -75,9 +77,130 @@ class GridComm : protected Pointers {
int *unpacklist; // 3d array offsets to unpack
};
int nswap;
int nswap,maxswap;
Swap *swap;
// -------------------------------------------
// internal variables for TILED layout
// -------------------------------------------
int *overlap_procs; // length of Nprocs in communicator
MPI_Request *requests; // length of max messages this proc receives
// RCB tree of cut info
// each proc contributes one value, except proc 0
struct RCBinfo {
int dim; // 0,1,2 = which dim the cut is in
int cut; // grid index of lowest cell in upper half of cut
};
RCBinfo *rcbinfo;
// overlap = a proc whose owned cells overlap with my extended ghost box
// includes overlaps across periodic boundaries, can also be self
struct Overlap {
int proc; // proc whose owned cells overlap my ghost cells
int box[6]; // box that overlaps otherproc's owned cells
// this box is wholly contained within global grid
int pbc[3]; // PBC offsets to convert box to a portion of my ghost box
// my ghost box may extend beyond global grid
};
int noverlap,maxoverlap;
Overlap *overlap;
// request = sent to each proc whose owned cells overlap my ghost cells
struct Request {
int sender; // sending proc
int index; // index of overlap on sender
int box[6]; // box that overlaps receiver's owned cells
// wholly contained within global grid
};
Request *srequest,*rrequest;
// response = reply from each proc whose owned cells overlap my ghost cells
struct Response {
int index; // index of my overlap for the initial request
int box[6]; // box that overlaps responder's owned cells
// wholly contained within global grid
// has to be unwrapped by PBC to map to my ghost cells
};
Response *sresponse,*rresponse;
// send = proc to send a subset of my owned cells to, for forward comm
// for reverse comm, proc I receive ghost overlaps with my owned cells from
// offset used in reverse comm to recv a message in middle of a large buffer
struct Send {
int proc;
int npack;
int *packlist;
int offset;
};
// recv = proc to recv a subset of my ghost cells from, for forward comm
// for reverse comm, proc I send a subset of my ghost cells to
// offset used in forward comm to recv a message in middle of a large buffer
struct Recv {
int proc;
int nunpack;
int *unpacklist;
int offset;
};
int adjacent; // 0 on a proc who receives ghosts from a non-neighbor proc
// copy = subset of my owned cells to copy into subset of my ghost cells
// that describes forward comm, for reverse comm it is the opposite
struct Copy {
int npack;
int nunpack;
int *packlist;
int *unpacklist;
};
int nsend,nrecv,ncopy;
Send *send;
Recv *recv;
Copy *copy;
// -------------------------------------------
// internal methods
// -------------------------------------------
void initialize(MPI_Comm, int, int, int,
int, int, int, int, int, int,
int, int, int, int, int, int,
int, int, int, int, int, int,
int, int, int, int, int, int);
virtual void setup_regular(int &, int &);
virtual void setup_tiled(int &, int &);
void ghost_box_drop(int *, int *);
void box_drop_grid(int *, int, int, int &, int *);
int ghost_adjacent_regular();
int ghost_adjacent_tiled();
void forward_comm_kspace_regular(class KSpace *, int, int, int,
void *, void *, MPI_Datatype);
void forward_comm_kspace_tiled(class KSpace *, int, int, int,
void *, void *, MPI_Datatype);
void reverse_comm_kspace_regular(class KSpace *, int, int, int,
void *, void *, MPI_Datatype);
void reverse_comm_kspace_tiled(class KSpace *, int, int, int,
void *, void *, MPI_Datatype);
virtual void grow_swap();
void grow_overlap();
int indices(int *&, int, int, int, int, int, int);
};
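With this header, buffer management moves to the caller: setup() reports how many grid values each of the two exchange buffers must hold, the KSpace style scales those counts by the largest number of values it will ever send per grid point, and per-atom communication reuses the same GridComm object with enlarged buffers instead of a second object. A sketch of that ownership pattern, with variable names taken from the PPPM/MSM hunks elsewhere in this commit (illustrative, not a compilable excerpt):

// construction: global grid size, owned extent, owned+ghost extent
gc = new GridComm(lmp,world,nx_pppm,ny_pppm,nz_pppm,
                  nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
                  nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out);
gc->setup(ngc_buf1,ngc_buf2);      // required buffer lengths, in grid values

npergrid = 3;                      // rho/E-field comm sends up to 3 values per point
memory->create(gc_buf1,npergrid*ngc_buf1,"pppm:gc_buf1");
memory->create(gc_buf2,npergrid*ngc_buf2,"pppm:gc_buf2");

// later, when per-atom energy/virial comm is needed (up to 7 values per
// point), the buffers are reallocated larger and the object is reused
npergrid = 7;
memory->destroy(gc_buf1);
memory->destroy(gc_buf2);
memory->create(gc_buf1,npergrid*ngc_buf1,"pppm:gc_buf1");
memory->create(gc_buf2,npergrid*ngc_buf2,"pppm:gc_buf2");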


@ -42,21 +42,24 @@ using namespace MathConst;
enum{REVERSE_RHO,REVERSE_AD,REVERSE_AD_PERATOM};
enum{FORWARD_RHO,FORWARD_AD,FORWARD_AD_PERATOM};
/* ---------------------------------------------------------------------- */
MSM::MSM(LAMMPS *lmp) : KSpace(lmp),
MSM::MSM(LAMMPS *lmp)
: KSpace(lmp),
factors(NULL), delxinv(NULL), delyinv(NULL), delzinv(NULL), nx_msm(NULL),
ny_msm(NULL), nz_msm(NULL), nxlo_in(NULL), nylo_in(NULL), nzlo_in(NULL),
nxhi_in(NULL), nyhi_in(NULL), nzhi_in(NULL), nxlo_out(NULL), nylo_out(NULL),
nzlo_out(NULL), nxhi_out(NULL), nyhi_out(NULL), nzhi_out(NULL), ngrid(NULL),
active_flag(NULL), alpha(NULL), betax(NULL), betay(NULL), betaz(NULL), peratom_allocate_flag(0),
levels(0), world_levels(NULL), qgrid(NULL), egrid(NULL), v0grid(NULL), v1grid(NULL),
v2grid(NULL), v3grid(NULL), v4grid(NULL), v5grid(NULL), g_direct(NULL),
v0_direct(NULL), v1_direct(NULL), v2_direct(NULL), v3_direct(NULL), v4_direct(NULL),
v5_direct(NULL), g_direct_top(NULL), v0_direct_top(NULL), v1_direct_top(NULL),
v2_direct_top(NULL), v3_direct_top(NULL), v4_direct_top(NULL), v5_direct_top(NULL),
phi1d(NULL), dphi1d(NULL), procneigh_levels(NULL), cg(NULL), cg_peratom(NULL),
cg_all(NULL), cg_peratom_all(NULL), part2grid(NULL), boxlo(NULL)
active_flag(NULL), alpha(NULL), betax(NULL), betay(NULL), betaz(NULL),
peratom_allocate_flag(0),levels(0),world_levels(NULL),qgrid(NULL),egrid(NULL),
v0grid(NULL), v1grid(NULL),v2grid(NULL),v3grid(NULL),v4grid(NULL),v5grid(NULL),
g_direct(NULL),v0_direct(NULL),v1_direct(NULL),v2_direct(NULL),v3_direct(NULL),
v4_direct(NULL),v5_direct(NULL),g_direct_top(NULL),v0_direct_top(NULL),
v1_direct_top(NULL),v2_direct_top(NULL),v3_direct_top(NULL),v4_direct_top(NULL),
v5_direct_top(NULL),phi1d(NULL),dphi1d(NULL),procneigh_levels(NULL),gcall(NULL),
gc(NULL),gcall_buf1(NULL),gcall_buf2(NULL),gc_buf1(NULL),gc_buf2(NULL),
ngc_buf1(NULL),ngc_buf2(NULL),part2grid(NULL),boxlo(NULL)
{
msmflag = 1;
@ -117,6 +120,7 @@ MSM::~MSM()
delete [] factors;
deallocate();
if (peratom_allocate_flag) deallocate_peratom();
deallocate_levels();
memory->destroy(part2grid);
memory->destroy(g_direct);
memory->destroy(g_direct_top);
@ -132,7 +136,6 @@ MSM::~MSM()
memory->destroy(v3_direct_top);
memory->destroy(v4_direct_top);
memory->destroy(v5_direct_top);
deallocate_levels();
}
/* ----------------------------------------------------------------------
@ -397,17 +400,6 @@ void MSM::setup()
// don't invoke allocate_peratom(), compute() will allocate when needed
allocate();
// setup commgrid
cg_all->ghost_notify();
cg_all->setup();
for (int n=0; n<levels; n++) {
if (!active_flag[n]) continue;
cg[n]->ghost_notify();
cg[n]->setup();
}
}
/* ----------------------------------------------------------------------
@ -448,16 +440,7 @@ void MSM::compute(int eflag, int vflag)
// invoke allocate_peratom() if needed for first time
if (vflag_atom && !peratom_allocate_flag) {
allocate_peratom();
cg_peratom_all->ghost_notify();
cg_peratom_all->setup();
for (int n=0; n<levels; n++) {
if (!active_flag[n]) continue;
cg_peratom[n]->ghost_notify();
cg_peratom[n]->setup();
}
}
if (vflag_atom && !peratom_allocate_flag) allocate_peratom();
// convert atoms from box to lamda coords
@ -483,7 +466,8 @@ void MSM::compute(int eflag, int vflag)
// to fully sum contribution in their 3d grid
current_level = 0;
cg_all->reverse_comm(this,REVERSE_RHO);
gcall->reverse_comm_kspace(this,1,sizeof(double),REVERSE_RHO,
gcall_buf1,gcall_buf2,MPI_DOUBLE);
// forward communicate charge density values to fill ghost grid points
// compute direct sum interaction and then restrict to coarser grid
@ -491,8 +475,8 @@ void MSM::compute(int eflag, int vflag)
for (int n=0; n<=levels-2; n++) {
if (!active_flag[n]) continue;
current_level = n;
cg[n]->forward_comm(this,FORWARD_RHO);
gc[n]->forward_comm_kspace(this,1,sizeof(double),FORWARD_RHO,
gc_buf1[n],gc_buf2[n],MPI_DOUBLE);
direct(n);
restriction(n);
}
@ -503,11 +487,18 @@ void MSM::compute(int eflag, int vflag)
if (active_flag[levels-1]) {
if (domain->nonperiodic) {
current_level = levels-1;
cg[levels-1]->forward_comm(this,FORWARD_RHO);
gc[levels-1]->
forward_comm_kspace(this,1,sizeof(double),FORWARD_RHO,
gc_buf1[levels-1],gc_buf2[levels-1],MPI_DOUBLE);
direct_top(levels-1);
cg[levels-1]->reverse_comm(this,REVERSE_AD);
gc[levels-1]->
reverse_comm_kspace(this,1,sizeof(double),REVERSE_AD,
gc_buf1[levels-1],gc_buf2[levels-1],MPI_DOUBLE);
if (vflag_atom)
cg_peratom[levels-1]->reverse_comm(this,REVERSE_AD_PERATOM);
gc[levels-1]->
reverse_comm_kspace(this,6,sizeof(double),REVERSE_AD_PERATOM,
gc_buf1[levels-1],gc_buf2[levels-1],MPI_DOUBLE);
} else {
// Here using MPI_Allreduce is cheaper than using commgrid
grid_swap_forward(levels-1,qgrid[levels-1]);
@ -515,7 +506,9 @@ void MSM::compute(int eflag, int vflag)
grid_swap_reverse(levels-1,egrid[levels-1]);
current_level = levels-1;
if (vflag_atom)
cg_peratom[levels-1]->reverse_comm(this,REVERSE_AD_PERATOM);
gc[levels-1]->
reverse_comm_kspace(this,6,sizeof(double),REVERSE_AD_PERATOM,
gc_buf1[levels-1],gc_buf2[levels-1],MPI_DOUBLE);
}
}
@ -527,24 +520,28 @@ void MSM::compute(int eflag, int vflag)
prolongation(n);
current_level = n;
cg[n]->reverse_comm(this,REVERSE_AD);
gc[n]->reverse_comm_kspace(this,1,sizeof(double),REVERSE_AD,
gc_buf1[n],gc_buf2[n],MPI_DOUBLE);
// extra per-atom virial communication
if (vflag_atom)
cg_peratom[n]->reverse_comm(this,REVERSE_AD_PERATOM);
gc[n]->reverse_comm_kspace(this,6,sizeof(double),REVERSE_AD_PERATOM,
gc_buf1[n],gc_buf2[n],MPI_DOUBLE);
}
// all procs communicate E-field values
// to fill ghost cells surrounding their 3d bricks
current_level = 0;
cg_all->forward_comm(this,FORWARD_AD);
gcall->forward_comm_kspace(this,1,sizeof(double),FORWARD_AD,
gcall_buf1,gcall_buf2,MPI_DOUBLE);
// extra per-atom energy/virial communication
if (vflag_atom)
cg_peratom_all->forward_comm(this,FORWARD_AD_PERATOM);
gcall->forward_comm_kspace(this,6,sizeof(double),FORWARD_AD_PERATOM,
gcall_buf1,gcall_buf2,MPI_DOUBLE);
// calculate the force on my particles (interpolation)
@ -603,8 +600,7 @@ void MSM::compute(int eflag, int vflag)
// convert atoms back from lamda to box coords
if (triclinic)
domain->lamda2x(atom->nlocal);
if (triclinic) domain->lamda2x(atom->nlocal);
}
/* ----------------------------------------------------------------------
@ -621,15 +617,18 @@ void MSM::allocate()
// commgrid using all processors for finest grid level
int (*procneigh_all)[2] = comm->procneigh;
gcall = new GridComm(lmp,world,1,nx_msm[0],ny_msm[0],nz_msm[0],
nxlo_in[0],nxhi_in[0],nylo_in[0],
nyhi_in[0],nzlo_in[0],nzhi_in[0],
nxlo_out_all,nxhi_out_all,nylo_out_all,
nyhi_out_all,nzlo_out_all,nzhi_out_all,
nxlo_out[0],nxhi_out[0],nylo_out[0],
nyhi_out[0],nzlo_out[0],nzhi_out[0]);
cg_all = new GridComm(lmp,world,1,1,
nxlo_in[0],nxhi_in[0],nylo_in[0],nyhi_in[0],nzlo_in[0],nzhi_in[0],
nxlo_out_all,nxhi_out_all,nylo_out_all,nyhi_out_all,nzlo_out_all,nzhi_out_all,
nxlo_out[0],nxhi_out[0],nylo_out[0],nyhi_out[0],nzlo_out[0],nzhi_out[0],
procneigh_all[0][0],procneigh_all[0][1],procneigh_all[1][0],
procneigh_all[1][1],procneigh_all[2][0],procneigh_all[2][1]);
gcall->setup(ngcall_buf1,ngcall_buf2);
npergrid = 1;
memory->create(gcall_buf1,npergrid*ngcall_buf1,"msm:gcall_buf1");
memory->create(gcall_buf2,npergrid*ngcall_buf2,"msm:gcall_buf2");
// allocate memory for each grid level
@ -644,12 +643,23 @@ void MSM::allocate()
if (active_flag[n]) {
int **procneigh = procneigh_levels[n];
cg[n] = new GridComm(lmp,world_levels[n],1,1,
nxlo_in[n],nxhi_in[n],nylo_in[n],nyhi_in[n],nzlo_in[n],nzhi_in[n],
nxlo_out[n],nxhi_out[n],nylo_out[n],nyhi_out[n],nzlo_out[n],nzhi_out[n],
gc[n] = new GridComm(lmp,world_levels[n],2,nx_msm[n],ny_msm[n],nz_msm[n],
nxlo_in[n],nxhi_in[n],nylo_in[n],nyhi_in[n],
nzlo_in[n],nzhi_in[n],
nxlo_out[n],nxhi_out[n],nylo_out[n],nyhi_out[n],
nzlo_out[n],nzhi_out[n],
procneigh[0][0],procneigh[0][1],procneigh[1][0],
procneigh[1][1],procneigh[2][0],procneigh[2][1]);
} else cg[n] = nullptr;
gc[n]->setup(ngc_buf1[n],ngc_buf2[n]);
npergrid = 1;
memory->create(gc_buf1[n],npergrid*ngc_buf1[n],"msm:gc_buf1");
memory->create(gc_buf2[n],npergrid*ngc_buf2[n],"msm:gc_buf2");
} else {
gc[n] = nullptr;
gc_buf1[n] = gc_buf2[n] = nullptr;
}
}
}
@ -662,8 +672,11 @@ void MSM::deallocate()
memory->destroy2d_offset(phi1d,-order_allocated);
memory->destroy2d_offset(dphi1d,-order_allocated);
if (cg_all) delete cg_all;
cg_all = nullptr;
if (gcall) delete gcall;
memory->destroy(gcall_buf1);
memory->destroy(gcall_buf2);
gcall = nullptr;
gcall_buf1 = gcall_buf2 = nullptr;
for (int n=0; n<levels; n++) {
if (qgrid[n])
@ -676,10 +689,13 @@ void MSM::deallocate()
if (world_levels[n] != MPI_COMM_NULL)
MPI_Comm_free(&world_levels[n]);
if (cg) {
if (cg[n]) {
delete cg[n];
cg[n] = nullptr;
if (gc) {
if (gc[n]) {
delete gc[n];
memory->destroy(gc_buf1[n]);
memory->destroy(gc_buf2[n]);
gc[n] = nullptr;
gc_buf1[n] = gc_buf2[n] = nullptr;
}
}
}
@ -695,15 +711,11 @@ void MSM::allocate_peratom()
// create commgrid object for per-atom virial using all processors
int (*procneigh_all)[2] = comm->procneigh;
cg_peratom_all =
new GridComm(lmp,world,6,6,
nxlo_in[0],nxhi_in[0],nylo_in[0],nyhi_in[0],nzlo_in[0],nzhi_in[0],
nxlo_out_all,nxhi_out_all,nylo_out_all,nyhi_out_all,nzlo_out_all,nzhi_out_all,
nxlo_out[0],nxhi_out[0],nylo_out[0],nyhi_out[0],nzlo_out[0],nzhi_out[0],
procneigh_all[0][0],procneigh_all[0][1],procneigh_all[1][0],
procneigh_all[1][1],procneigh_all[2][0],procneigh_all[2][1]);
npergrid = 6;
memory->destroy(gcall_buf1);
memory->destroy(gcall_buf2);
memory->create(gcall_buf1,npergrid*ngcall_buf1,"pppm:gcall_buf1");
memory->create(gcall_buf2,npergrid*ngcall_buf2,"pppm:gcall_buf2");
// allocate memory for each grid level
@ -724,13 +736,11 @@ void MSM::allocate_peratom()
// create commgrid object for per-atom virial
if (active_flag[n]) {
int **procneigh = procneigh_levels[n];
cg_peratom[n] =
new GridComm(lmp,world_levels[n],6,6,
nxlo_in[n],nxhi_in[n],nylo_in[n],nyhi_in[n],nzlo_in[n],nzhi_in[n],
nxlo_out[n],nxhi_out[n],nylo_out[n],nyhi_out[n],nzlo_out[n],nzhi_out[n],
procneigh[0][0],procneigh[0][1],procneigh[1][0],
procneigh[1][1],procneigh[2][0],procneigh[2][1]);
npergrid = 6;
memory->destroy(gc_buf1[n]);
memory->destroy(gc_buf2[n]);
memory->create(gc_buf1[n],npergrid*ngc_buf1[n],"pppm:gc_buf1");
memory->create(gc_buf2[n],npergrid*ngc_buf2[n],"pppm:gc_buf2");
}
}
}
@ -743,8 +753,6 @@ void MSM::deallocate_peratom()
{
peratom_allocate_flag = 0;
if (cg_peratom_all) delete cg_peratom_all;
for (int n=0; n<levels; n++) {
if (v0grid[n])
memory->destroy3d_offset(v0grid[n],nzlo_out[n],nylo_out[n],nxlo_out[n]);
@ -758,9 +766,6 @@ void MSM::deallocate_peratom()
memory->destroy3d_offset(v4grid[n],nzlo_out[n],nylo_out[n],nxlo_out[n]);
if (v5grid[n])
memory->destroy3d_offset(v5grid[n],nzlo_out[n],nylo_out[n],nxlo_out[n]);
if (cg_peratom)
if (cg_peratom[n]) delete cg_peratom[n];
}
}
@ -772,8 +777,11 @@ void MSM::allocate_levels()
{
ngrid = new int[levels];
cg = new GridComm*[levels];
cg_peratom = new GridComm*[levels];
gc = new GridComm*[levels];
gc_buf1 = new double*[levels];
gc_buf2 = new double*[levels];
ngc_buf1 = new int[levels];
ngc_buf2 = new int[levels];
memory->create(procneigh_levels,levels,3,2,"msm:procneigh_levels");
world_levels = new MPI_Comm[levels];
@ -819,9 +827,8 @@ void MSM::allocate_levels()
v5grid = new double***[levels];
for (int n=0; n<levels; n++) {
cg[n] = NULL;
gc[n] = NULL;
world_levels[n] = MPI_COMM_NULL;
cg_peratom[n] = NULL;
qgrid[n] = NULL;
egrid[n] = NULL;
@ -833,7 +840,6 @@ void MSM::allocate_levels()
v4grid[n] = NULL;
v5grid[n] = NULL;
}
}
/* ----------------------------------------------------------------------
@ -842,15 +848,18 @@ void MSM::allocate_levels()
void MSM::deallocate_levels()
{
if (cg) deallocate();
delete [] ngrid;
ngrid = nullptr;
memory->destroy(procneigh_levels);
delete [] world_levels;
delete [] active_flag;
delete [] cg;
delete [] cg_peratom;
delete [] gc;
delete [] gc_buf1;
delete [] gc_buf2;
delete [] ngc_buf1;
delete [] ngc_buf2;
delete [] alpha;
delete [] betax;
@ -893,8 +902,8 @@ void MSM::deallocate_levels()
world_levels = nullptr;
active_flag = nullptr;
cg = nullptr;
cg_peratom = nullptr;
gc = nullptr;
gc_buf1 = gc_buf2 = nullptr;
alpha = nullptr;
betax = nullptr;
@ -1377,7 +1386,7 @@ void MSM::set_proc_grid(int n)
// define a new MPI communicator for this grid level that only includes active procs
if(world_levels[n] != MPI_COMM_NULL) MPI_Comm_free(&world_levels[n]);
if (world_levels[n] != MPI_COMM_NULL) MPI_Comm_free(&world_levels[n]);
MPI_Comm_split(world,color,me,&world_levels[n]);
if (!active_flag[n]) return;
@ -2434,6 +2443,7 @@ void MSM::prolongation(int n)
be cheaper than using nearest-neighbor communication (commgrid), right
now only works for periodic boundary conditions
------------------------------------------------------------------------- */
void MSM::grid_swap_forward(int n, double*** &gridn)
{
double ***gridn_tmp;
@ -2533,32 +2543,31 @@ void MSM::grid_swap_reverse(int n, double*** &gridn)
pack own values to buf to send to another proc (used by commgrid)
------------------------------------------------------------------------- */
void MSM::pack_forward(int flag, double *buf, int nlist, int *list)
void MSM::pack_forward_grid(int flag, void *vbuf, int nlist, int *list)
{
double *buf = (double *) vbuf;
int n = current_level;
int k = 0;
if (flag == FORWARD_RHO) {
double ***qgridn = qgrid[n];
double *qsrc = &qgridn[nzlo_out[n]][nylo_out[n]][nxlo_out[n]];
for (int i = 0; i < nlist; i++) {
buf[k++] = qsrc[list[i]];
}
} else if (flag == FORWARD_AD) {
double ***egridn = egrid[n];
double *src = &egridn[nzlo_out[n]][nylo_out[n]][nxlo_out[n]];
for (int i = 0; i < nlist; i++)
buf[i] = src[list[i]];
} else if (flag == FORWARD_AD_PERATOM) {
double ***v0gridn = v0grid[n];
double ***v1gridn = v1grid[n];
double ***v2gridn = v2grid[n];
double ***v3gridn = v3grid[n];
double ***v4gridn = v4grid[n];
double ***v5gridn = v5grid[n];
int k = 0;
if (flag == FORWARD_RHO) {
double *qsrc = &qgridn[nzlo_out[n]][nylo_out[n]][nxlo_out[n]];
for (int i = 0; i < nlist; i++) {
buf[k++] = qsrc[list[i]];
}
} else if (flag == FORWARD_AD) {
double *src = &egridn[nzlo_out[n]][nylo_out[n]][nxlo_out[n]];
for (int i = 0; i < nlist; i++)
buf[i] = src[list[i]];
} else if (flag == FORWARD_AD_PERATOM) {
double *v0src = &v0gridn[nzlo_out[n]][nylo_out[n]][nxlo_out[n]];
double *v1src = &v1gridn[nzlo_out[n]][nylo_out[n]][nxlo_out[n]];
double *v2src = &v2gridn[nzlo_out[n]][nylo_out[n]][nxlo_out[n]];
@ -2580,32 +2589,31 @@ void MSM::pack_forward(int flag, double *buf, int nlist, int *list)
unpack another proc's own values from buf and set own ghost values
------------------------------------------------------------------------- */
void MSM::unpack_forward(int flag, double *buf, int nlist, int *list)
void MSM::unpack_forward_grid(int flag, void *vbuf, int nlist, int *list)
{
double *buf = (double *) vbuf;
int n = current_level;
int k = 0;
if (flag == FORWARD_RHO) {
double ***qgridn = qgrid[n];
double *dest = &qgridn[nzlo_out[n]][nylo_out[n]][nxlo_out[n]];
for (int i = 0; i < nlist; i++) {
dest[list[i]] = buf[k++];
}
} else if (flag == FORWARD_AD) {
double ***egridn = egrid[n];
double *dest = &egridn[nzlo_out[n]][nylo_out[n]][nxlo_out[n]];
for (int i = 0; i < nlist; i++)
dest[list[i]] = buf[k++];
} else if (flag == FORWARD_AD_PERATOM) {
double ***v0gridn = v0grid[n];
double ***v1gridn = v1grid[n];
double ***v2gridn = v2grid[n];
double ***v3gridn = v3grid[n];
double ***v4gridn = v4grid[n];
double ***v5gridn = v5grid[n];
int k = 0;
if (flag == FORWARD_RHO) {
double *dest = &qgridn[nzlo_out[n]][nylo_out[n]][nxlo_out[n]];
for (int i = 0; i < nlist; i++) {
dest[list[i]] = buf[k++];
}
} else if (flag == FORWARD_AD) {
double *dest = &egridn[nzlo_out[n]][nylo_out[n]][nxlo_out[n]];
for (int i = 0; i < nlist; i++)
dest[list[i]] = buf[k++];
} else if (flag == FORWARD_AD_PERATOM) {
double *v0src = &v0gridn[nzlo_out[n]][nylo_out[n]][nxlo_out[n]];
double *v1src = &v1gridn[nzlo_out[n]][nylo_out[n]][nxlo_out[n]];
double *v2src = &v2gridn[nzlo_out[n]][nylo_out[n]][nxlo_out[n]];
@ -2627,32 +2635,31 @@ void MSM::unpack_forward(int flag, double *buf, int nlist, int *list)
pack ghost values into buf to send to another proc
------------------------------------------------------------------------- */
void MSM::pack_reverse(int flag, double *buf, int nlist, int *list)
void MSM::pack_reverse_grid(int flag, void *vbuf, int nlist, int *list)
{
double *buf = (double *) vbuf;
int n = current_level;
int k = 0;
if (flag == REVERSE_RHO) {
double ***qgridn = qgrid[n];
double *qsrc = &qgridn[nzlo_out[n]][nylo_out[n]][nxlo_out[n]];
for (int i = 0; i < nlist; i++) {
buf[k++] = qsrc[list[i]];
}
} else if (flag == REVERSE_AD) {
double ***egridn = egrid[n];
double *src = &egridn[nzlo_out[n]][nylo_out[n]][nxlo_out[n]];
for (int i = 0; i < nlist; i++)
buf[i] = src[list[i]];
} else if (flag == REVERSE_AD_PERATOM) {
double ***v0gridn = v0grid[n];
double ***v1gridn = v1grid[n];
double ***v2gridn = v2grid[n];
double ***v3gridn = v3grid[n];
double ***v4gridn = v4grid[n];
double ***v5gridn = v5grid[n];
int k = 0;
if (flag == REVERSE_RHO) {
double *qsrc = &qgridn[nzlo_out[n]][nylo_out[n]][nxlo_out[n]];
for (int i = 0; i < nlist; i++) {
buf[k++] = qsrc[list[i]];
}
} else if (flag == REVERSE_AD) {
double *src = &egridn[nzlo_out[n]][nylo_out[n]][nxlo_out[n]];
for (int i = 0; i < nlist; i++)
buf[i] = src[list[i]];
} else if (flag == REVERSE_AD_PERATOM) {
double *v0src = &v0gridn[nzlo_out[n]][nylo_out[n]][nxlo_out[n]];
double *v1src = &v1gridn[nzlo_out[n]][nylo_out[n]][nxlo_out[n]];
double *v2src = &v2gridn[nzlo_out[n]][nylo_out[n]][nxlo_out[n]];
@ -2674,32 +2681,31 @@ void MSM::pack_reverse(int flag, double *buf, int nlist, int *list)
unpack another proc's ghost values from buf and add to own values
------------------------------------------------------------------------- */
void MSM::unpack_reverse(int flag, double *buf, int nlist, int *list)
void MSM::unpack_reverse_grid(int flag, void *vbuf, int nlist, int *list)
{
double *buf = (double *) vbuf;
int n = current_level;
int k = 0;
if (flag == REVERSE_RHO) {
double ***qgridn = qgrid[n];
double *dest = &qgridn[nzlo_out[n]][nylo_out[n]][nxlo_out[n]];
for (int i = 0; i < nlist; i++) {
dest[list[i]] += buf[k++];
}
} else if (flag == REVERSE_AD) {
double ***egridn = egrid[n];
double *dest = &egridn[nzlo_out[n]][nylo_out[n]][nxlo_out[n]];
for (int i = 0; i < nlist; i++)
dest[list[i]] += buf[k++];
} else if (flag == REVERSE_AD_PERATOM) {
double ***v0gridn = v0grid[n];
double ***v1gridn = v1grid[n];
double ***v2gridn = v2grid[n];
double ***v3gridn = v3grid[n];
double ***v4gridn = v4grid[n];
double ***v5gridn = v5grid[n];
int k = 0;
if (flag == REVERSE_RHO) {
double *dest = &qgridn[nzlo_out[n]][nylo_out[n]][nxlo_out[n]];
for (int i = 0; i < nlist; i++) {
dest[list[i]] += buf[k++];
}
} else if (flag == REVERSE_AD) {
double *dest = &egridn[nzlo_out[n]][nylo_out[n]][nxlo_out[n]];
for (int i = 0; i < nlist; i++)
dest[list[i]] += buf[k++];
} else if (flag == REVERSE_AD_PERATOM) {
double *v0src = &v0gridn[nzlo_out[n]][nylo_out[n]][nxlo_out[n]];
double *v1src = &v1gridn[nzlo_out[n]][nylo_out[n]][nxlo_out[n]];
double *v2src = &v2gridn[nzlo_out[n]][nylo_out[n]][nxlo_out[n]];
@ -3416,3 +3422,24 @@ void MSM::get_virial_direct_top(int n)
}
}
}
/* ----------------------------------------------------------------------
memory usage of local arrays
------------------------------------------------------------------------- */
double MSM::memory_usage()
{
double bytes = 0;
// NOTE: Stan, fill in other memory allocations here
// all GridComm bufs
bytes += (ngcall_buf1 + ngcall_buf2) * npergrid * sizeof(double);
for (int n=0; n<levels; n++)
if (active_flag[n])
bytes += (ngc_buf1[n] + ngc_buf2[n]) * npergrid * sizeof(double);
return bytes;
}


@ -32,6 +32,7 @@ class MSM : public KSpace {
void setup();
virtual void settings(int, char **);
virtual void compute(int, int);
virtual double memory_usage();
protected:
int me,nprocs;
@ -79,16 +80,21 @@ class MSM : public KSpace {
int procgrid[3]; // procs assigned in each dim of 3d grid
int myloc[3]; // which proc I am in each dim
int ***procneigh_levels; // my 6 neighboring procs, 0/1 = left/right
class GridComm **cg;
class GridComm **cg_peratom;
class GridComm *cg_all;
class GridComm *cg_peratom_all;
class GridComm *gcall; // GridComm class for finest level grid
class GridComm **gc; // GridComm classes for each hierarchical level
double *gcall_buf1,*gcall_buf2;
double **gc_buf1,**gc_buf2;
int ngcall_buf1,ngcall_buf2,npergrid;
int *ngc_buf1,*ngc_buf2;
int current_level;
int **part2grid; // storage for particle -> grid mapping
int nmax;
int triclinic;
double *boxlo;
void set_grid_global();
@ -126,15 +132,12 @@ class MSM : public KSpace {
void get_g_direct_top(int);
void get_virial_direct_top(int);
// triclinic
int triclinic;
// grid communication
void pack_forward(int, double *, int, int *);
void unpack_forward(int, double *, int, int *);
void pack_reverse(int, double *, int, int *);
void unpack_reverse(int, double *, int, int *);
void pack_forward_grid(int, void *, int, int *);
void unpack_forward_grid(int, void *, int, int *);
void pack_reverse_grid(int, void *, int, int *);
void unpack_reverse_grid(int, void *, int, int *);
};
}


@ -91,17 +91,7 @@ void MSMCG::compute(int eflag, int vflag)
// invoke allocate_peratom() if needed for first time
if (vflag_atom && !peratom_allocate_flag) {
allocate_peratom();
cg_peratom_all->ghost_notify();
cg_peratom_all->setup();
for (int n=0; n<levels; n++) {
if (!active_flag[n]) continue;
cg_peratom[n]->ghost_notify();
cg_peratom[n]->setup();
}
peratom_allocate_flag = 1;
}
if (vflag_atom && !peratom_allocate_flag) allocate_peratom();
// extend size of per-atom arrays if necessary
@ -171,7 +161,8 @@ void MSMCG::compute(int eflag, int vflag)
// to fully sum contribution in their 3d grid
current_level = 0;
cg_all->reverse_comm(this,REVERSE_RHO);
gcall->reverse_comm_kspace(this,1,sizeof(double),REVERSE_RHO,
gcall_buf1,gcall_buf2,MPI_DOUBLE);
// forward communicate charge density values to fill ghost grid points
// compute direct sum interaction and then restrict to coarser grid
@ -179,24 +170,30 @@ void MSMCG::compute(int eflag, int vflag)
for (int n=0; n<=levels-2; n++) {
if (!active_flag[n]) continue;
current_level = n;
cg[n]->forward_comm(this,FORWARD_RHO);
gc[n]->forward_comm_kspace(this,1,sizeof(double),FORWARD_RHO,
gc_buf1[n],gc_buf2[n],MPI_DOUBLE);
direct(n);
restriction(n);
}
// compute direct interaction for top grid level for non-periodic
// and for second from top grid level for periodic
if (active_flag[levels-1]) {
if (domain->nonperiodic) {
current_level = levels-1;
cg[levels-1]->forward_comm(this,FORWARD_RHO);
gc[levels-1]->
forward_comm_kspace(this,1,sizeof(double),FORWARD_RHO,
gc_buf1[levels-1],gc_buf2[levels-1],MPI_DOUBLE);
direct_top(levels-1);
cg[levels-1]->reverse_comm(this,REVERSE_AD);
gc[levels-1]->
reverse_comm_kspace(this,1,sizeof(double),REVERSE_AD,
gc_buf1[levels-1],gc_buf2[levels-1],MPI_DOUBLE);
if (vflag_atom)
cg_peratom[levels-1]->reverse_comm(this,REVERSE_AD_PERATOM);
gc[levels-1]->
reverse_comm_kspace(this,6,sizeof(double),REVERSE_AD_PERATOM,
gc_buf1[levels-1],gc_buf2[levels-1],MPI_DOUBLE);
} else {
// Here using MPI_Allreduce is cheaper than using commgrid
grid_swap_forward(levels-1,qgrid[levels-1]);
@ -204,7 +201,9 @@ void MSMCG::compute(int eflag, int vflag)
grid_swap_reverse(levels-1,egrid[levels-1]);
current_level = levels-1;
if (vflag_atom)
cg_peratom[levels-1]->reverse_comm(this,REVERSE_AD_PERATOM);
gc[levels-1]->
reverse_comm_kspace(this,6,sizeof(double),REVERSE_AD_PERATOM,
gc_buf1[levels-1],gc_buf2[levels-1],MPI_DOUBLE);
}
}
@ -216,24 +215,28 @@ void MSMCG::compute(int eflag, int vflag)
prolongation(n);
current_level = n;
cg[n]->reverse_comm(this,REVERSE_AD);
gc[n]->reverse_comm_kspace(this,1,sizeof(double),REVERSE_AD,
gc_buf1[n],gc_buf2[n],MPI_DOUBLE);
// extra per-atom virial communication
if (vflag_atom)
cg_peratom[n]->reverse_comm(this,REVERSE_AD_PERATOM);
gc[n]->reverse_comm_kspace(this,6,sizeof(double),REVERSE_AD_PERATOM,
gc_buf1[n],gc_buf2[n],MPI_DOUBLE);
}
// all procs communicate E-field values
// to fill ghost cells surrounding their 3d bricks
current_level = 0;
cg_all->forward_comm(this,FORWARD_AD);
gcall->forward_comm_kspace(this,1,sizeof(double),FORWARD_AD,
gcall_buf1,gcall_buf2,MPI_DOUBLE);
// extra per-atom energy/virial communication
if (vflag_atom)
cg_peratom_all->forward_comm(this,FORWARD_AD_PERATOM);
gcall->forward_comm_kspace(this,6,sizeof(double),FORWARD_AD_PERATOM,
gcall_buf1,gcall_buf2,MPI_DOUBLE);
// calculate the force on my particles (interpolation)
@ -536,6 +539,9 @@ void MSMCG::fieldforce_peratom()
}
}
/* ----------------------------------------------------------------------
memory usage of local arrays
------------------------------------------------------------------------- */
double MSMCG::memory_usage()
{


@ -70,11 +70,12 @@ PPPM::PPPM(LAMMPS *lmp) : KSpace(lmp),
u_brick(NULL), v0_brick(NULL), v1_brick(NULL), v2_brick(NULL), v3_brick(NULL),
v4_brick(NULL), v5_brick(NULL), greensfn(NULL), vg(NULL), fkx(NULL), fky(NULL),
fkz(NULL), density_fft(NULL), work1(NULL), work2(NULL), gf_b(NULL), rho1d(NULL),
rho_coeff(NULL), drho1d(NULL), drho_coeff(NULL), sf_precoeff1(NULL), sf_precoeff2(NULL),
sf_precoeff3(NULL), sf_precoeff4(NULL), sf_precoeff5(NULL), sf_precoeff6(NULL),
acons(NULL), density_A_brick(NULL), density_B_brick(NULL), density_A_fft(NULL),
density_B_fft(NULL), fft1(NULL), fft2(NULL), remap(NULL), cg(NULL), cg_peratom(NULL),
part2grid(NULL), boxlo(NULL)
rho_coeff(NULL), drho1d(NULL), drho_coeff(NULL),
sf_precoeff1(NULL), sf_precoeff2(NULL), sf_precoeff3(NULL),
sf_precoeff4(NULL), sf_precoeff5(NULL), sf_precoeff6(NULL),
acons(NULL), fft1(NULL), fft2(NULL), remap(NULL), gc(NULL),
gc_buf1(NULL), gc_buf2(NULL), density_A_brick(NULL), density_B_brick(NULL), density_A_fft(NULL),
density_B_fft(NULL), part2grid(NULL), boxlo(NULL)
{
peratom_allocate_flag = 0;
group_allocate_flag = 0;
@ -117,8 +118,8 @@ PPPM::PPPM(LAMMPS *lmp) : KSpace(lmp),
fft1 = fft2 = NULL;
remap = NULL;
cg = NULL;
cg_peratom = NULL;
gc = NULL;
gc_buf1 = gc_buf2 = NULL;
nmax = 0;
part2grid = NULL;
@ -205,9 +206,6 @@ void PPPM::init()
"slab correction");
if (domain->dimension == 2)
error->all(FLERR,"Cannot use PPPM with 2d simulation");
if (comm->style != 0)
error->universe_all(FLERR,"PPPM can only currently be used with "
"comm_style brick");
if (!atom->q_flag)
error->all(FLERR,"Kspace style requires atom attribute q");
@ -297,9 +295,7 @@ void PPPM::init()
// or overlap is allowed, then done
// else reduce order and try again
int (*procneigh)[2] = comm->procneigh;
GridComm *cgtmp = NULL;
GridComm *gctmp = NULL;
int iteration = 0;
while (order >= minorder) {
@ -312,24 +308,24 @@ void PPPM::init()
set_grid_local();
if (overlap_allowed) break;
cgtmp = new GridComm(lmp,world,1,1,
gctmp = new GridComm(lmp,world,nx_pppm,ny_pppm,nz_pppm,
nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out,
procneigh[0][0],procneigh[0][1],procneigh[1][0],
procneigh[1][1],procneigh[2][0],procneigh[2][1]);
cgtmp->ghost_notify();
if (!cgtmp->ghost_overlap()) break;
delete cgtmp;
nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out);
int tmp1,tmp2;
gctmp->setup(tmp1,tmp2);
if (gctmp->ghost_adjacent()) break;
delete gctmp;
order--;
iteration++;
}
if (order < minorder) error->all(FLERR,"PPPM order < minimum allowed order");
if (!overlap_allowed && cgtmp->ghost_overlap())
if (!overlap_allowed && !gctmp->ghost_adjacent())
error->all(FLERR,"PPPM grid stencil extends "
"beyond nearest neighbor processor");
if (cgtmp) delete cgtmp;
if (gctmp) delete gctmp;
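This retry loop keeps shrinking the interpolation order until the ghost region produced by set_grid_local() stays within nearest-neighbor processors, as reported by GridComm::ghost_adjacent() on a throwaway GridComm. A standalone sketch of the pattern, with toy stand-ins for those two calls (not LAMMPS code):

#include <cstdio>

static int stencil_extent(int order) { return (order - 1) / 2 + 1; }     // toy ghost extent
static bool ghost_fits(int order) { return stencil_extent(order) <= 3; } // toy adjacency check

int main()
{
  int order = 7;
  const int minorder = 2;
  while (order >= minorder) {
    if (ghost_fits(order)) break;   // ghost cells fit in neighbor procs: done
    order--;                        // else reduce order and try again
  }
  if (order < minorder) printf("order < minimum allowed order\n");
  else printf("using order %d\n", order);
  return 0;
}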
// adjust g_ewald
@ -363,8 +359,6 @@ void PPPM::init()
// don't invoke allocate peratom() or group(), will be allocated when needed
allocate();
cg->ghost_notify();
cg->setup();
// pre-compute Green's function denominator expansion
// pre-compute 1d charge distribution coefficients
@ -578,11 +572,9 @@ void PPPM::setup_grid()
allocate();
cg->ghost_notify();
if (overlap_allowed == 0 && cg->ghost_overlap())
if (!overlap_allowed && !gc->ghost_adjacent())
error->all(FLERR,"PPPM grid stencil extends "
"beyond nearest neighbor processor");
cg->setup();
// pre-compute Green's function denominator expansion
// pre-compute 1d charge distribution coefficients
@ -591,7 +583,7 @@ void PPPM::setup_grid()
if (differentiation_flag == 1) compute_sf_precoeff();
compute_rho_coeff();
// pre-compute volume-dependent coeffs
// pre-compute volume-dependent coeffs for portion of grid I now own
setup();
}
@ -609,11 +601,7 @@ void PPPM::compute(int eflag, int vflag)
ev_init(eflag,vflag);
if (evflag_atom && !peratom_allocate_flag) {
allocate_peratom();
cg_peratom->ghost_notify();
cg_peratom->setup();
}
if (evflag_atom && !peratom_allocate_flag) allocate_peratom();
// if atom count has changed, update qsum and qsqsum
@ -652,7 +640,8 @@ void PPPM::compute(int eflag, int vflag)
// to fully sum contribution in their 3d bricks
// remap from 3d decomposition to FFT decomposition
cg->reverse_comm(this,REVERSE_RHO);
gc->reverse_comm_kspace(this,1,sizeof(FFT_SCALAR),REVERSE_RHO,
gc_buf1,gc_buf2,MPI_FFT_SCALAR);
brick2fft();
// compute potential gradient on my FFT grid and
@ -665,16 +654,22 @@ void PPPM::compute(int eflag, int vflag)
// all procs communicate E-field values
// to fill ghost cells surrounding their 3d bricks
if (differentiation_flag == 1) cg->forward_comm(this,FORWARD_AD);
else cg->forward_comm(this,FORWARD_IK);
if (differentiation_flag == 1)
gc->forward_comm_kspace(this,1,sizeof(FFT_SCALAR),FORWARD_AD,
gc_buf1,gc_buf2,MPI_FFT_SCALAR);
else
gc->forward_comm_kspace(this,3,sizeof(FFT_SCALAR),FORWARD_IK,
gc_buf1,gc_buf2,MPI_FFT_SCALAR);
// extra per-atom energy/virial communication
if (evflag_atom) {
if (differentiation_flag == 1 && vflag_atom)
cg_peratom->forward_comm(this,FORWARD_AD_PERATOM);
gc->forward_comm_kspace(this,6,sizeof(FFT_SCALAR),FORWARD_AD_PERATOM,
gc_buf1,gc_buf2,MPI_FFT_SCALAR);
else if (differentiation_flag == 0)
cg_peratom->forward_comm(this,FORWARD_IK_PERATOM);
gc->forward_comm_kspace(this,7,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM,
gc_buf1,gc_buf2,MPI_FFT_SCALAR);
}
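Each call above now states how many values travel per grid point (1 for the ad potential, 3 for the ik E-field, 6 or 7 for the per-atom terms), the datum size, and two flat caller-owned buffers; the pack/unpack callbacks shown further down in this diff fill and drain those buffers. A self-contained sketch of such a 3-value pack/unpack pair, with hypothetical names rather than the LAMMPS virtuals:

typedef double FFT_SCALAR;          // assumes double-precision FFT_SCALAR

struct FieldBricks {                // hypothetical flattened 3d bricks, ghosts included
  FFT_SCALAR *ex, *ey, *ez;
};

// copy my values for the listed cells into the flat buffer (3 per cell)
static void pack3(const FieldBricks &f, void *vbuf, int nlist, const int *list)
{
  FFT_SCALAR *buf = (FFT_SCALAR *) vbuf;
  int n = 0;
  for (int i = 0; i < nlist; i++) {
    buf[n++] = f.ex[list[i]];
    buf[n++] = f.ey[list[i]];
    buf[n++] = f.ez[list[i]];
  }
}

// copy received values from the flat buffer into my ghost cells
static void unpack3(FieldBricks &f, void *vbuf, int nlist, const int *list)
{
  FFT_SCALAR *buf = (FFT_SCALAR *) vbuf;
  int n = 0;
  for (int i = 0; i < nlist; i++) {
    f.ex[list[i]] = buf[n++];
    f.ey[list[i]] = buf[n++];
    f.ez[list[i]] = buf[n++];
  }
}

int main()
{
  FFT_SCALAR ex[4] = {0,1,2,3}, ey[4] = {0,10,20,30}, ez[4] = {0,100,200,300};
  FieldBricks f = {ex, ey, ez};
  const int list[2] = {1, 3};        // cells touched by this swap
  FFT_SCALAR buf[6];                 // nlist * 3 values
  pack3(f, buf, 2, list);
  unpack3(f, buf, 2, list);          // round trip onto the same cells
  return 0;
}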
// calculate the force on my particles
@ -821,21 +816,19 @@ void PPPM::allocate()
1,0,0,FFT_PRECISION,collective_flag);
// create ghost grid object for rho and electric field communication
// also create 2 bufs for ghost grid cell comm, passed to GridComm methods
int (*procneigh)[2] = comm->procneigh;
gc = new GridComm(lmp,world,nx_pppm,ny_pppm,nz_pppm,
nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out);
if (differentiation_flag == 1)
cg = new GridComm(lmp,world,1,1,
nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out,
procneigh[0][0],procneigh[0][1],procneigh[1][0],
procneigh[1][1],procneigh[2][0],procneigh[2][1]);
else
cg = new GridComm(lmp,world,3,1,
nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out,
procneigh[0][0],procneigh[0][1],procneigh[1][0],
procneigh[1][1],procneigh[2][0],procneigh[2][1]);
gc->setup(ngc_buf1,ngc_buf2);
if (differentiation_flag) npergrid = 1;
else npergrid = 3;
memory->create(gc_buf1,npergrid*ngc_buf1,"pppm:gc_buf1");
memory->create(gc_buf2,npergrid*ngc_buf2,"pppm:gc_buf2");
}
/* ----------------------------------------------------------------------
@ -886,7 +879,9 @@ void PPPM::deallocate()
delete fft1;
delete fft2;
delete remap;
delete cg;
delete gc;
memory->destroy(gc_buf1);
memory->destroy(gc_buf2);
}
/* ----------------------------------------------------------------------
@ -915,24 +910,16 @@ void PPPM::allocate_peratom()
memory->create3d_offset(v5_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out,
nxlo_out,nxhi_out,"pppm:v5_brick");
// create ghost grid object for rho and electric field communication
// use same GC ghost grid object for peratom grid communication
// but need to reallocate a larger gc_buf1 and gc_buf2
int (*procneigh)[2] = comm->procneigh;
if (differentiation_flag) npergrid = 6;
else npergrid = 7;
if (differentiation_flag == 1)
cg_peratom =
new GridComm(lmp,world,6,1,
nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out,
procneigh[0][0],procneigh[0][1],procneigh[1][0],
procneigh[1][1],procneigh[2][0],procneigh[2][1]);
else
cg_peratom =
new GridComm(lmp,world,7,1,
nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out,
procneigh[0][0],procneigh[0][1],procneigh[1][0],
procneigh[1][1],procneigh[2][0],procneigh[2][1]);
memory->destroy(gc_buf1);
memory->destroy(gc_buf2);
memory->create(gc_buf1,npergrid*ngc_buf1,"pppm:gc_buf1");
memory->create(gc_buf2,npergrid*ngc_buf2,"pppm:gc_buf2");
}
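Instead of a second GridComm for per-atom output, the same gc is kept and only gc_buf1/gc_buf2 grow to hold the larger per-point payload (6 values with ad differentiation, 7 with ik). A standalone sketch of that sizing convention, assuming the counts returned by setup() are numbers of grid points and using std::vector purely for illustration:

#include <vector>

typedef double FFT_SCALAR;

int main()
{
  const int ngc_buf1 = 1000, ngc_buf2 = 1200;  // made-up counts from setup()
  std::vector<FFT_SCALAR> buf1, buf2;

  int npergrid = 3;                            // ik field comm: Ex,Ey,Ez
  buf1.resize(npergrid * ngc_buf1);
  buf2.resize(npergrid * ngc_buf2);

  npergrid = 7;                                // ik per-atom comm: u plus 6 virial terms
  buf1.resize(npergrid * ngc_buf1);            // regrow, same comm object reused
  buf2.resize(npergrid * ngc_buf2);
  return 0;
}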
/* ----------------------------------------------------------------------
@ -952,8 +939,6 @@ void PPPM::deallocate_peratom()
if (differentiation_flag != 1)
memory->destroy3d_offset(u_brick,nzlo_out,nylo_out,nxlo_out);
delete cg_peratom;
}
/* ----------------------------------------------------------------------
@ -1002,7 +987,8 @@ void PPPM::set_grid_global()
int count = 0;
while (1) {
// set grid dimension
// set grid dimensions
nx_pppm = static_cast<int> (xprd/h_x);
ny_pppm = static_cast<int> (yprd/h_y);
nz_pppm = static_cast<int> (zprd_slab/h_z);
@ -1011,31 +997,16 @@ void PPPM::set_grid_global()
if (ny_pppm <= 1) ny_pppm = 2;
if (nz_pppm <= 1) nz_pppm = 2;
//set local grid dimension
int npey_fft,npez_fft;
if (nz_pppm >= nprocs) {
npey_fft = 1;
npez_fft = nprocs;
} else procs2grid2d(nprocs,ny_pppm,nz_pppm,&npey_fft,&npez_fft);
int me_y = me % npey_fft;
int me_z = me / npey_fft;
nxlo_fft = 0;
nxhi_fft = nx_pppm - 1;
nylo_fft = me_y*ny_pppm/npey_fft;
nyhi_fft = (me_y+1)*ny_pppm/npey_fft - 1;
nzlo_fft = me_z*nz_pppm/npez_fft;
nzhi_fft = (me_z+1)*nz_pppm/npez_fft - 1;
// estimate Kspace force error
double df_kspace = compute_df_kspace();
count++;
// break loop if the accuracy has been reached or
// too many loops have been performed
count++;
if (df_kspace <= accuracy) break;
if (count > 500) error->all(FLERR, "Could not compute grid size");
h *= 0.95;
h_x = h_y = h_z = h;
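The search above rebuilds the grid from the current spacing h, estimates the k-space force error with compute_df_kspace(), and shrinks h by 5% per pass until the error meets the requested accuracy, giving up after 500 passes. A standalone sketch of that refinement loop with a toy stand-in for the error estimate:

#include <cstdio>

static double err_estimate(double h) { return 0.05 * h * h; }  // toy model of compute_df_kspace()

int main()
{
  const double accuracy = 1.0e-4;
  double h = 1.0;
  int count = 0;
  while (1) {
    count++;
    if (err_estimate(h) <= accuracy) break;   // accuracy reached
    if (count > 500) { printf("Could not compute grid size\n"); return 1; }
    h *= 0.95;                                // finer grid, try again
  }
  printf("h = %g after %d passes\n", h, count);
  return 0;
}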
@ -1163,7 +1134,11 @@ double PPPM::compute_df_kspace()
double PPPM::compute_qopt()
{
double qopt = 0.0;
int k,l,m,nx,ny,nz;
double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz;
double u1,u2,sqk;
double sum1,sum2,sum3,sum4,dot2;
double *prd = domain->prd;
const double xprd = prd[0];
@ -1176,30 +1151,30 @@ double PPPM::compute_qopt()
const double unitky = (MY_2PI/yprd);
const double unitkz = (MY_2PI/zprd_slab);
double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz;
double u1, u2, sqk;
double sum1,sum2,sum3,sum4,dot2;
int k,l,m,nx,ny,nz;
const int twoorder = 2*order;
for (m = nzlo_fft; m <= nzhi_fft; m++) {
// loop over entire FFT grid
// each proc calculates contributions from every Pth grid point
bigint ngridtotal = (bigint) nx_pppm * ny_pppm * nz_pppm;
int nxy_pppm = nx_pppm * ny_pppm;
double qopt = 0.0;
for (bigint i = me; i < ngridtotal; i += nprocs) {
k = i % nx_pppm;
l = (i/nx_pppm) % ny_pppm;
m = i / nxy_pppm;
const int kper = k - nx_pppm*(2*k/nx_pppm);
const int lper = l - ny_pppm*(2*l/ny_pppm);
const int mper = m - nz_pppm*(2*m/nz_pppm);
for (l = nylo_fft; l <= nyhi_fft; l++) {
const int lper = l - ny_pppm*(2*l/ny_pppm);
for (k = nxlo_fft; k <= nxhi_fft; k++) {
const int kper = k - nx_pppm*(2*k/nx_pppm);
sqk = square(unitkx*kper) + square(unitky*lper) + square(unitkz*mper);
if (sqk == 0.0) continue;
if (sqk != 0.0) {
sum1 = sum2 = sum3 = sum4 = 0.0;
sum1 = 0.0;
sum2 = 0.0;
sum3 = 0.0;
sum4 = 0.0;
for (nx = -2; nx <= 2; nx++) {
qx = unitkx*(kper+nx_pppm*nx);
sx = exp(-0.25*square(qx/g_ewald));
@ -1224,6 +1199,7 @@ double PPPM::compute_qopt()
dot2 = qx+qy+qz;
u1 = sx*sy*sz;
u2 = wx*wy*wz;
sum1 += u1*u1/dot2*MY_4PI*MY_4PI;
sum2 += u1 * u2 * MY_4PI;
sum3 += u2;
@ -1231,12 +1207,13 @@ double PPPM::compute_qopt()
}
}
}
sum2 *= sum2;
qopt += sum1 - sum2/(sum3*sum4);
}
}
}
}
// sum qopt over all procs
double qopt_all;
MPI_Allreduce(&qopt,&qopt_all,1,MPI_DOUBLE,MPI_SUM,world);
return qopt_all;
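The rewritten loop no longer walks this proc's own x-pencil portion of the FFT mesh; every rank now strides through the full nx_pppm*ny_pppm*nz_pppm grid, taking every nprocs-th point and recovering (k,l,m) from the flat index, which spreads the qopt work evenly regardless of the decomposition. A standalone sketch of that index arithmetic with illustrative sizes:

#include <cstdint>
#include <cstdio>

int main()
{
  const int nx = 4, ny = 3, nz = 2;        // toy global grid
  const int me = 1, nprocs = 3;            // toy rank and comm size
  const int64_t ntotal = (int64_t) nx * ny * nz;
  const int nxy = nx * ny;

  for (int64_t i = me; i < ntotal; i += nprocs) {
    const int k = (int) (i % nx);          // fastest-varying: x
    const int l = (int) ((i / nx) % ny);   // then y
    const int m = (int) (i / nxy);         // slowest: z
    printf("i=%lld -> (k,l,m) = (%d,%d,%d)\n", (long long) i, k, l, m);
  }
  return 0;
}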
@ -1349,7 +1326,9 @@ void PPPM::set_grid_local()
// nlo_in,nhi_in = lower/upper limits of the 3d sub-brick of
// global PPPM grid that I own without ghost cells
// for slab PPPM, assign z grid as if it were not extended
// both non-tiled and tiled proc layouts use 0-1 fractional subdomain info
if (comm->layout != Comm::LAYOUT_TILED) {
nxlo_in = static_cast<int> (comm->xsplit[comm->myloc[0]] * nx_pppm);
nxhi_in = static_cast<int> (comm->xsplit[comm->myloc[0]+1] * nx_pppm) - 1;
@ -1361,6 +1340,17 @@ void PPPM::set_grid_local()
nzhi_in = static_cast<int>
(comm->zsplit[comm->myloc[2]+1] * nz_pppm/slab_volfactor) - 1;
} else {
nxlo_in = static_cast<int> (comm->mysplit[0][0] * nx_pppm);
nxhi_in = static_cast<int> (comm->mysplit[0][1] * nx_pppm) - 1;
nylo_in = static_cast<int> (comm->mysplit[1][0] * ny_pppm);
nyhi_in = static_cast<int> (comm->mysplit[1][1] * ny_pppm) - 1;
nzlo_in = static_cast<int> (comm->mysplit[2][0] * nz_pppm/slab_volfactor);
nzhi_in = static_cast<int> (comm->mysplit[2][1] * nz_pppm/slab_volfactor) - 1;
}
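Both branches apply the same rule: scale the proc's fractional subdomain bounds (0-1, from xsplit/ysplit/zsplit or mysplit) by the global grid size to get the first and last owned grid points. A standalone sketch of that mapping for one dimension:

#include <cstdio>

// map a fractional subdomain [lo,hi) onto owned global grid indices
static void owned_range(double lo, double hi, int nglobal, int &ilo, int &ihi)
{
  ilo = static_cast<int> (lo * nglobal);       // first owned grid point
  ihi = static_cast<int> (hi * nglobal) - 1;   // last owned grid point
}

int main()
{
  int ilo, ihi;
  owned_range(0.25, 0.50, 32, ilo, ihi);       // own the 2nd quarter of a 32-point axis
  printf("own %d..%d\n", ilo, ihi);            // prints: own 8..15
  return 0;
}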
// nlower,nupper = stencil size for mapping particles to PPPM grid
nlower = -(order-1)/2;
@ -1446,22 +1436,26 @@ void PPPM::set_grid_local()
// -z proc, but not vice versa
// this is accomplished by nzhi_in = nzhi_out on +z end (no ghost cells)
// also insure no other procs use ghost cells beyond +z limit
// different logic for non-tiled vs tiled decomposition
if (slabflag == 1) {
if (comm->myloc[2] == comm->procgrid[2]-1)
nzhi_in = nzhi_out = nz_pppm - 1;
if (comm->layout != Comm::LAYOUT_TILED) {
if (comm->myloc[2] == comm->procgrid[2]-1) nzhi_in = nzhi_out = nz_pppm - 1;
} else {
if (comm->mysplit[2][1] == 1.0) nzhi_in = nzhi_out = nz_pppm - 1;
}
nzhi_out = MIN(nzhi_out,nz_pppm-1);
}
// decomposition of FFT mesh
// x-pencil decomposition of FFT mesh
// global indices range from 0 to N-1
// proc owns entire x-dimension, clumps of columns in y,z dimensions
// each proc owns entire x-dimension, clumps of columns in y,z dimensions
// npey_fft,npez_fft = # of procs in y,z dims
// if nprocs is small enough, proc can own 1 or more entire xy planes,
// else proc owns 2d sub-blocks of yz plane
// me_y,me_z = which proc (0-npe_fft-1) I am in y,z dimensions
// nlo_fft,nhi_fft = lower/upper limit of the section
// of the global FFT mesh that I own
// of the global FFT mesh that I own in x-pencil decomposition
int npey_fft,npez_fft;
if (nz_pppm >= nprocs) {
@ -1479,13 +1473,13 @@ void PPPM::set_grid_local()
nzlo_fft = me_z*nz_pppm/npez_fft;
nzhi_fft = (me_z+1)*nz_pppm/npez_fft - 1;
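The x-pencil ownership described in the comments above boils down to factoring nprocs into npey_fft*npez_fft and slicing only the y and z dimensions. A standalone sketch of that arithmetic, with fixed toy factors in place of procs2grid2d():

#include <cstdio>

int main()
{
  const int ny = 12, nz = 8;          // toy global FFT mesh in y,z
  const int npey = 2, npez = 3;       // assumed y,z proc factors (6 ranks total)
  for (int me = 0; me < npey*npez; me++) {
    const int me_y = me % npey;
    const int me_z = me / npey;
    const int nylo = me_y*ny/npey,  nyhi = (me_y+1)*ny/npey - 1;
    const int nzlo = me_z*nz/npez,  nzhi = (me_z+1)*nz/npez - 1;
    printf("rank %d owns all of x, y %d..%d, z %d..%d\n", me, nylo, nyhi, nzlo, nzhi);
  }
  return 0;
}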
// PPPM grid pts owned by this proc, including ghosts
// ngrid = count of PPPM grid pts owned by this proc, including ghosts
ngrid = (nxhi_out-nxlo_out+1) * (nyhi_out-nylo_out+1) *
(nzhi_out-nzlo_out+1);
// FFT grids owned by this proc, without ghosts
// nfft = FFT points in FFT decomposition on this proc
// count of FFT grid pts owned by this proc, without ghosts
// nfft = FFT points in x-pencil FFT decomposition on this proc
// nfft_brick = FFT points in 3d brick-decomposition on this proc
// nfft_both = greater of 2 values
@ -2637,8 +2631,10 @@ void PPPM::fieldforce_peratom()
pack own values to buf to send to another proc
------------------------------------------------------------------------- */
void PPPM::pack_forward(int flag, FFT_SCALAR *buf, int nlist, int *list)
void PPPM::pack_forward_grid(int flag, void *vbuf, int nlist, int *list)
{
FFT_SCALAR *buf = (FFT_SCALAR *) vbuf;
int n = 0;
if (flag == FORWARD_IK) {
@ -2695,8 +2691,10 @@ void PPPM::pack_forward(int flag, FFT_SCALAR *buf, int nlist, int *list)
unpack another proc's own values from buf and set own ghost values
------------------------------------------------------------------------- */
void PPPM::unpack_forward(int flag, FFT_SCALAR *buf, int nlist, int *list)
void PPPM::unpack_forward_grid(int flag, void *vbuf, int nlist, int *list)
{
FFT_SCALAR *buf = (FFT_SCALAR *) vbuf;
int n = 0;
if (flag == FORWARD_IK) {
@ -2753,8 +2751,10 @@ void PPPM::unpack_forward(int flag, FFT_SCALAR *buf, int nlist, int *list)
pack ghost values into buf to send to another proc
------------------------------------------------------------------------- */
void PPPM::pack_reverse(int flag, FFT_SCALAR *buf, int nlist, int *list)
void PPPM::pack_reverse_grid(int flag, void *vbuf, int nlist, int *list)
{
FFT_SCALAR *buf = (FFT_SCALAR *) vbuf;
if (flag == REVERSE_RHO) {
FFT_SCALAR *src = &density_brick[nzlo_out][nylo_out][nxlo_out];
for (int i = 0; i < nlist; i++)
@ -2766,8 +2766,10 @@ void PPPM::pack_reverse(int flag, FFT_SCALAR *buf, int nlist, int *list)
unpack another proc's ghost values from buf and add to own values
------------------------------------------------------------------------- */
void PPPM::unpack_reverse(int flag, FFT_SCALAR *buf, int nlist, int *list)
void PPPM::unpack_reverse_grid(int flag, void *vbuf, int nlist, int *list)
{
FFT_SCALAR *buf = (FFT_SCALAR *) vbuf;
if (flag == REVERSE_RHO) {
FFT_SCALAR *dest = &density_brick[nzlo_out][nylo_out][nxlo_out];
for (int i = 0; i < nlist; i++)
@ -3056,6 +3058,7 @@ int PPPM::timing_3d(int n, double &time3d)
double PPPM::memory_usage()
{
double bytes = nmax*3 * sizeof(double);
int nbrick = (nxhi_out-nxlo_out+1) * (nyhi_out-nylo_out+1) *
(nzhi_out-nzlo_out+1);
if (differentiation_flag == 1) {
@ -3063,6 +3066,7 @@ double PPPM::memory_usage()
} else {
bytes += 4 * nbrick * sizeof(FFT_SCALAR);
}
if (triclinic) bytes += 3 * nfft_both * sizeof(double);
bytes += 6 * nfft_both * sizeof(double);
bytes += nfft_both * sizeof(double);
@ -3076,7 +3080,9 @@ double PPPM::memory_usage()
bytes += 2 * nfft_both * sizeof(FFT_SCALAR);
}
if (cg) bytes += cg->memory_usage();
// two GridComm bufs
bytes += (ngc_buf1 + ngc_buf2) * npergrid * sizeof(FFT_SCALAR);
return bytes;
}
@ -3134,7 +3140,8 @@ void PPPM::compute_group_group(int groupbit_A, int groupbit_B, int AA_flag)
density_brick = density_A_brick;
density_fft = density_A_fft;
cg->reverse_comm(this,REVERSE_RHO);
gc->reverse_comm_kspace(this,1,sizeof(FFT_SCALAR),REVERSE_RHO,
gc_buf1,gc_buf2,MPI_FFT_SCALAR);
brick2fft();
// group B
@ -3142,7 +3149,8 @@ void PPPM::compute_group_group(int groupbit_A, int groupbit_B, int AA_flag)
density_brick = density_B_brick;
density_fft = density_B_fft;
cg->reverse_comm(this,REVERSE_RHO);
gc->reverse_comm_kspace(this,1,sizeof(FFT_SCALAR),REVERSE_RHO,
gc_buf1,gc_buf2,MPI_FFT_SCALAR);
brick2fft();
// switch back pointers
@ -3436,7 +3444,7 @@ void PPPM::poisson_groups_triclinic()
}
/* ----------------------------------------------------------------------
Slab-geometry correction term to dampen inter-slab interactions between
slab-geometry correction term to dampen inter-slab interactions between
periodically repeating slabs. Yields good approximation to 2D Ewald if
adequate empty space is left between repeating slabs (J. Chem. Phys.
111, 3155). Slabs defined here to be parallel to the xy plane. Also


@ -96,17 +96,21 @@ class PPPM : public KSpace {
double sf_coeff[6]; // coefficients for calculating ad self-forces
double **acons;
// FFTs and grid communication
class FFT3d *fft1,*fft2;
class Remap *remap;
class GridComm *gc;
FFT_SCALAR *gc_buf1,*gc_buf2;
int ngc_buf1,ngc_buf2,npergrid;
// group-group interactions
int group_allocate_flag;
FFT_SCALAR ***density_A_brick,***density_B_brick;
FFT_SCALAR *density_A_fft,*density_B_fft;
class FFT3d *fft1,*fft2;
class Remap *remap;
class GridComm *cg;
class GridComm *cg_peratom;
int **part2grid; // storage for particle -> grid mapping
int nmax;
@ -160,10 +164,10 @@ class PPPM : public KSpace {
// grid communication
virtual void pack_forward(int, FFT_SCALAR *, int, int *);
virtual void unpack_forward(int, FFT_SCALAR *, int, int *);
virtual void pack_reverse(int, FFT_SCALAR *, int, int *);
virtual void unpack_reverse(int, FFT_SCALAR *, int, int *);
virtual void pack_forward_grid(int, void *, int, int *);
virtual void unpack_forward_grid(int, void *, int, int *);
virtual void pack_reverse_grid(int, void *, int, int *);
virtual void unpack_reverse_grid(int, void *, int, int *);
// triclinic


@ -90,11 +90,7 @@ void PPPMCG::compute(int eflag, int vflag)
ev_init(eflag,vflag);
if (evflag_atom && !peratom_allocate_flag) {
allocate_peratom();
cg_peratom->ghost_notify();
cg_peratom->setup();
}
if (evflag_atom && !peratom_allocate_flag) allocate_peratom();
// if atom count has changed, update qsum and qsqsum
@ -162,6 +158,7 @@ void PPPMCG::compute(int eflag, int vflag)
}
// only need to rebuild this list after a neighbor list update
if (neighbor->ago == 0) {
num_charged = 0;
for (int i = 0; i < atom->nlocal; ++i) {
@ -182,7 +179,8 @@ void PPPMCG::compute(int eflag, int vflag)
// to fully sum contribution in their 3d bricks
// remap from 3d decomposition to FFT decomposition
cg->reverse_comm(this,REVERSE_RHO);
gc->reverse_comm_kspace(this,1,sizeof(FFT_SCALAR),REVERSE_RHO,
gc_buf1,gc_buf2,MPI_FFT_SCALAR);
brick2fft();
// compute potential gradient on my FFT grid and
@ -195,16 +193,22 @@ void PPPMCG::compute(int eflag, int vflag)
// all procs communicate E-field values
// to fill ghost cells surrounding their 3d bricks
if (differentiation_flag == 1) cg->forward_comm(this,FORWARD_AD);
else cg->forward_comm(this,FORWARD_IK);
if (differentiation_flag == 1)
gc->forward_comm_kspace(this,1,sizeof(FFT_SCALAR),FORWARD_AD,
gc_buf1,gc_buf2,MPI_FFT_SCALAR);
else
gc->forward_comm_kspace(this,3,sizeof(FFT_SCALAR),FORWARD_IK,
gc_buf1,gc_buf2,MPI_FFT_SCALAR);
// extra per-atom energy/virial communication
if (evflag_atom) {
if (differentiation_flag == 1 && vflag_atom)
cg_peratom->forward_comm(this,FORWARD_AD_PERATOM);
gc->forward_comm_kspace(this,6,sizeof(FFT_SCALAR),FORWARD_AD_PERATOM,
gc_buf1,gc_buf2,MPI_FFT_SCALAR);
else if (differentiation_flag == 0)
cg_peratom->forward_comm(this,FORWARD_IK_PERATOM);
gc->forward_comm_kspace(this,7,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM,
gc_buf1,gc_buf2,MPI_FFT_SCALAR);
}
// calculate the force on my particles


@ -79,8 +79,7 @@ PPPMDipole::PPPMDipole(LAMMPS *lmp) : PPPM(lmp),
dipoleflag = 1;
group_group_enable = 0;
cg_dipole = NULL;
cg_peratom_dipole = NULL;
gc_dipole = NULL;
}
/* ----------------------------------------------------------------------
@ -93,10 +92,6 @@ PPPMDipole::~PPPMDipole()
deallocate();
if (peratom_allocate_flag) deallocate_peratom();
fft1 = NULL;
fft2 = NULL;
remap = NULL;
cg_dipole = NULL;
}
/* ----------------------------------------------------------------------
@ -195,9 +190,7 @@ void PPPMDipole::init()
// or overlap is allowed, then done
// else reduce order and try again
int (*procneigh)[2] = comm->procneigh;
GridComm *cgtmp = NULL;
GridComm *gctmp = NULL;
int iteration = 0;
while (order >= minorder) {
@ -210,24 +203,24 @@ void PPPMDipole::init()
set_grid_local();
if (overlap_allowed) break;
cgtmp = new GridComm(lmp,world,1,1,
gctmp = new GridComm(lmp,world,nx_pppm,ny_pppm,nz_pppm,
nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out,
procneigh[0][0],procneigh[0][1],procneigh[1][0],
procneigh[1][1],procneigh[2][0],procneigh[2][1]);
cgtmp->ghost_notify();
if (!cgtmp->ghost_overlap()) break;
delete cgtmp;
nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out);
int tmp1,tmp2;
gctmp->setup(tmp1,tmp2);
if (gctmp->ghost_adjacent()) break;
delete gctmp;
order--;
iteration++;
}
if (order < minorder) error->all(FLERR,"PPPMDipole order < minimum allowed order");
if (!overlap_allowed && cgtmp->ghost_overlap())
if (!overlap_allowed && !gctmp->ghost_adjacent())
error->all(FLERR,"PPPMDipole grid stencil extends "
"beyond nearest neighbor processor");
if (cgtmp) delete cgtmp;
if (gctmp) delete gctmp;
// adjust g_ewald
@ -261,8 +254,6 @@ void PPPMDipole::init()
// don't invoke allocate peratom(), will be allocated when needed
allocate();
cg_dipole->ghost_notify();
cg_dipole->setup();
// pre-compute Green's function denominator expansion
// pre-compute 1d charge distribution coefficients
@ -385,11 +376,9 @@ void PPPMDipole::setup_grid()
allocate();
cg_dipole->ghost_notify();
if (overlap_allowed == 0 && cg_dipole->ghost_overlap())
if (!overlap_allowed && !gc_dipole->ghost_adjacent())
error->all(FLERR,"PPPMDipole grid stencil extends "
"beyond nearest neighbor processor");
cg_dipole->setup();
// pre-compute Green's function denominator expansion
// pre-compute 1d charge distribution coefficients
@ -421,11 +410,7 @@ void PPPMDipole::compute(int eflag, int vflag)
error->all(FLERR,"Cannot (yet) compute per-atom virial "
"with kspace style pppm/dipole");
if (evflag_atom && !peratom_allocate_flag) {
allocate_peratom();
cg_peratom_dipole->ghost_notify();
cg_peratom_dipole->setup();
}
if (evflag_atom && !peratom_allocate_flag) allocate_peratom();
// if atom count has changed, update qsum and qsqsum
@ -460,7 +445,8 @@ void PPPMDipole::compute(int eflag, int vflag)
// to fully sum contribution in their 3d bricks
// remap from 3d decomposition to FFT decomposition
cg_dipole->reverse_comm(this,REVERSE_MU);
gc_dipole->reverse_comm_kspace(this,3,sizeof(FFT_SCALAR),REVERSE_MU,
gc_buf1,gc_buf2,MPI_FFT_SCALAR);
brick2fft_dipole();
// compute potential gradient on my FFT grid and
@ -473,13 +459,14 @@ void PPPMDipole::compute(int eflag, int vflag)
// all procs communicate E-field values
// to fill ghost cells surrounding their 3d bricks
cg_dipole->forward_comm(this,FORWARD_MU);
gc_dipole->forward_comm_kspace(this,9,sizeof(FFT_SCALAR),FORWARD_MU,
gc_buf1,gc_buf2,MPI_FFT_SCALAR);
// extra per-atom energy/virial communication
if (evflag_atom) {
cg_peratom_dipole->forward_comm(this,FORWARD_MU_PERATOM);
}
if (evflag_atom)
gc->forward_comm_kspace(this,18,sizeof(FFT_SCALAR),FORWARD_MU_PERATOM,
gc_buf1,gc_buf2,MPI_FFT_SCALAR);
// calculate the force on my particles
@ -522,7 +509,8 @@ void PPPMDipole::compute(int eflag, int vflag)
if (eflag_atom) {
for (i = 0; i < nlocal; i++) {
eatom[i] *= 0.5;
eatom[i] -= (mu[i][0]*mu[i][0] + mu[i][1]*mu[i][1] + mu[i][2]*mu[i][2])*2.0*g3/3.0/MY_PIS;
eatom[i] -= (mu[i][0]*mu[i][0] + mu[i][1]*mu[i][1] +
mu[i][2]*mu[i][2])*2.0*g3/3.0/MY_PIS;
eatom[i] *= qscale;
}
}
@ -619,14 +607,18 @@ void PPPMDipole::allocate()
1,0,0,FFT_PRECISION,collective_flag);
// create ghost grid object for rho and electric field communication
// also create 2 bufs for ghost grid cell comm, passed to GridComm methods
int (*procneigh)[2] = comm->procneigh;
cg_dipole = new GridComm(lmp,world,9,3,
gc_dipole = new GridComm(lmp,world,nx_pppm,ny_pppm,nz_pppm,
nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out,
procneigh[0][0],procneigh[0][1],procneigh[1][0],
procneigh[1][1],procneigh[2][0],procneigh[2][1]);
nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out);
gc->setup(ngc_buf1,ngc_buf2);
npergrid = 9;
memory->create(gc_buf1,npergrid*ngc_buf1,"pppm:gc_buf1");
memory->create(gc_buf2,npergrid*ngc_buf2,"pppm:gc_buf2");
}
/* ----------------------------------------------------------------------
@ -674,7 +666,9 @@ void PPPMDipole::deallocate()
delete fft1;
delete fft2;
delete remap;
delete cg_dipole;
delete gc_dipole;
memory->destroy(gc_buf1);
memory->destroy(gc_buf2);
}
/* ----------------------------------------------------------------------
@ -724,16 +718,15 @@ void PPPMDipole::allocate_peratom()
memory->create3d_offset(v5z_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
nxlo_out,nxhi_out,"pppm_dipole:v5z_brick_dipole");
// create ghost grid object for rho and electric field communication
// use same GC ghost grid object for peratom grid communication
// but need to reallocate a larger gc_buf1 and gc_buf2
int (*procneigh)[2] = comm->procneigh;
npergrid = 18;
cg_peratom_dipole =
new GridComm(lmp,world,18,1,
nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out,
procneigh[0][0],procneigh[0][1],procneigh[1][0],
procneigh[1][1],procneigh[2][0],procneigh[2][1]);
memory->destroy(gc_buf1);
memory->destroy(gc_buf2);
memory->create(gc_buf1,npergrid*ngc_buf1,"pppm:gc_buf1");
memory->create(gc_buf2,npergrid*ngc_buf2,"pppm:gc_buf2");
}
/* ----------------------------------------------------------------------
@ -764,8 +757,6 @@ void PPPMDipole::deallocate_peratom()
memory->destroy3d_offset(v3z_brick_dipole,nzlo_out,nylo_out,nxlo_out);
memory->destroy3d_offset(v4z_brick_dipole,nzlo_out,nylo_out,nxlo_out);
memory->destroy3d_offset(v5z_brick_dipole,nzlo_out,nylo_out,nxlo_out);
delete cg_peratom_dipole;
}
/* ----------------------------------------------------------------------
@ -2171,8 +2162,10 @@ void PPPMDipole::fieldforce_peratom_dipole()
pack own values to buf to send to another proc
------------------------------------------------------------------------- */
void PPPMDipole::pack_forward(int flag, FFT_SCALAR *buf, int nlist, int *list)
void PPPMDipole::pack_forward_grid(int flag, void *vbuf, int nlist, int *list)
{
FFT_SCALAR *buf = (FFT_SCALAR *) vbuf;
int n = 0;
if (flag == FORWARD_MU) {
@ -2242,8 +2235,10 @@ void PPPMDipole::pack_forward(int flag, FFT_SCALAR *buf, int nlist, int *list)
unpack another proc's own values from buf and set own ghost values
------------------------------------------------------------------------- */
void PPPMDipole::unpack_forward(int flag, FFT_SCALAR *buf, int nlist, int *list)
void PPPMDipole::unpack_forward_grid(int flag, void *vbuf, int nlist, int *list)
{
FFT_SCALAR *buf = (FFT_SCALAR *) vbuf;
int n = 0;
if (flag == FORWARD_MU) {
@ -2313,8 +2308,10 @@ void PPPMDipole::unpack_forward(int flag, FFT_SCALAR *buf, int nlist, int *list)
pack ghost values into buf to send to another proc
------------------------------------------------------------------------- */
void PPPMDipole::pack_reverse(int flag, FFT_SCALAR *buf, int nlist, int *list)
void PPPMDipole::pack_reverse_grid(int flag, void *vbuf, int nlist, int *list)
{
FFT_SCALAR *buf = (FFT_SCALAR *) vbuf;
int n = 0;
if (flag == REVERSE_MU) {
FFT_SCALAR *src_dipole0 = &densityx_brick_dipole[nzlo_out][nylo_out][nxlo_out];
@ -2332,8 +2329,10 @@ void PPPMDipole::pack_reverse(int flag, FFT_SCALAR *buf, int nlist, int *list)
unpack another proc's ghost values from buf and add to own values
------------------------------------------------------------------------- */
void PPPMDipole::unpack_reverse(int flag, FFT_SCALAR *buf, int nlist, int *list)
void PPPMDipole::unpack_reverse_grid(int flag, void *vbuf, int nlist, int *list)
{
FFT_SCALAR *buf = (FFT_SCALAR *) vbuf;
int n = 0;
if (flag == REVERSE_MU) {
FFT_SCALAR *dest_dipole0 = &densityx_brick_dipole[nzlo_out][nylo_out][nxlo_out];
@ -2484,6 +2483,7 @@ int PPPMDipole::timing_3d(int n, double &time3d)
double PPPMDipole::memory_usage()
{
double bytes = nmax*3 * sizeof(double);
int nbrick = (nxhi_out-nxlo_out+1) * (nyhi_out-nylo_out+1) *
(nzhi_out-nzlo_out+1);
bytes += 6 * nfft_both * sizeof(double); // vg
@ -2495,8 +2495,9 @@ double PPPMDipole::memory_usage()
if (peratom_allocate_flag)
bytes += 21 * nbrick * sizeof(FFT_SCALAR);
if (cg_dipole) bytes += cg_dipole->memory_usage();
if (cg_peratom_dipole) bytes += cg_peratom_dipole->memory_usage();
// two GridComm bufs
bytes += (ngc_buf1 + ngc_buf2) * npergrid * sizeof(FFT_SCALAR);
return bytes;
}


@ -50,10 +50,10 @@ class PPPMDipole : public PPPM {
// grid communication
void pack_forward(int, FFT_SCALAR *, int, int *);
void unpack_forward(int, FFT_SCALAR *, int, int *);
void pack_reverse(int, FFT_SCALAR *, int, int *);
void unpack_reverse(int, FFT_SCALAR *, int, int *);
void pack_forward_grid(int, void *, int, int *);
void unpack_forward_grid(int, void *, int, int *);
void pack_reverse_grid(int, void *, int, int *);
void unpack_reverse_grid(int, void *, int, int *);
// dipole
@ -69,10 +69,12 @@ class PPPMDipole : public PPPM {
FFT_SCALAR ***v3z_brick_dipole,***v4z_brick_dipole,***v5z_brick_dipole;
FFT_SCALAR *work3,*work4;
FFT_SCALAR *densityx_fft_dipole,*densityy_fft_dipole,*densityz_fft_dipole;
class GridComm *cg_dipole;
class GridComm *cg_peratom_dipole;
class GridComm *gc_dipole;
int only_dipole_flag;
double musum,musqsum,mu2;
double find_gewald_dipole(double, double, bigint, double, double);
double newton_raphson_f_dipole(double, double, bigint, double, double);
double derivf_dipole(double, double, bigint, double, double);
@ -87,7 +89,6 @@ class PPPMDipole : public PPPM {
void fieldforce_peratom_dipole();
double final_accuracy_dipole();
void musum_musq();
};
}


@ -81,7 +81,7 @@ PPPMDipoleSpin::~PPPMDipoleSpin()
fft1 = NULL;
fft2 = NULL;
remap = NULL;
cg_dipole = NULL;
gc_dipole = NULL;
}
/* ----------------------------------------------------------------------
@ -175,9 +175,7 @@ void PPPMDipoleSpin::init()
// or overlap is allowed, then done
// else reduce order and try again
int (*procneigh)[2] = comm->procneigh;
GridComm *cgtmp = NULL;
GridComm *gctmp = NULL;
int iteration = 0;
while (order >= minorder) {
@ -190,24 +188,24 @@ void PPPMDipoleSpin::init()
set_grid_local();
if (overlap_allowed) break;
cgtmp = new GridComm(lmp,world,1,1,
gctmp = new GridComm(lmp,world,nx_pppm,ny_pppm,nz_pppm,
nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out,
procneigh[0][0],procneigh[0][1],procneigh[1][0],
procneigh[1][1],procneigh[2][0],procneigh[2][1]);
cgtmp->ghost_notify();
if (!cgtmp->ghost_overlap()) break;
delete cgtmp;
nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out);
int tmp1,tmp2;
gctmp->setup(tmp1,tmp2);
if (gctmp->ghost_adjacent()) break;
delete gctmp;
order--;
iteration++;
}
if (order < minorder) error->all(FLERR,"PPPMDipoleSpin order < minimum allowed order");
if (!overlap_allowed && cgtmp->ghost_overlap())
if (!overlap_allowed && !gctmp->ghost_adjacent())
error->all(FLERR,"PPPMDipoleSpin grid stencil extends "
"beyond nearest neighbor processor");
if (cgtmp) delete cgtmp;
if (gctmp) delete gctmp;
// adjust g_ewald
@ -241,8 +239,6 @@ void PPPMDipoleSpin::init()
// don't invoke allocate peratom(), will be allocated when needed
allocate();
cg_dipole->ghost_notify();
cg_dipole->setup();
// pre-compute Green's function denominator expansion
// pre-compute 1d charge distribution coefficients
@ -270,11 +266,7 @@ void PPPMDipoleSpin::compute(int eflag, int vflag)
error->all(FLERR,"Cannot (yet) compute per-atom virial "
"with kspace style pppm/dipole/spin");
if (evflag_atom && !peratom_allocate_flag) {
allocate_peratom();
cg_peratom_dipole->ghost_notify();
cg_peratom_dipole->setup();
}
if (evflag_atom && !peratom_allocate_flag) allocate_peratom();
// if atom count has changed, update qsum and qsqsum
@ -309,7 +301,8 @@ void PPPMDipoleSpin::compute(int eflag, int vflag)
// to fully sum contribution in their 3d bricks
// remap from 3d decomposition to FFT decomposition
cg_dipole->reverse_comm(this,REVERSE_MU);
gc_dipole->reverse_comm_kspace(this,3,sizeof(FFT_SCALAR),REVERSE_MU,
gc_buf1,gc_buf2,MPI_FFT_SCALAR);
brick2fft_dipole();
// compute potential gradient on my FFT grid and
@ -322,13 +315,14 @@ void PPPMDipoleSpin::compute(int eflag, int vflag)
// all procs communicate E-field values
// to fill ghost cells surrounding their 3d bricks
cg_dipole->forward_comm(this,FORWARD_MU);
gc_dipole->forward_comm_kspace(this,9,sizeof(FFT_SCALAR),FORWARD_MU,
gc_buf1,gc_buf2,MPI_FFT_SCALAR);
// extra per-atom energy/virial communication
if (evflag_atom) {
cg_peratom_dipole->forward_comm(this,FORWARD_MU_PERATOM);
}
if (evflag_atom)
gc->forward_comm_kspace(this,18,sizeof(FFT_SCALAR),FORWARD_MU_PERATOM,
gc_buf1,gc_buf2,MPI_FFT_SCALAR);
// calculate the force on my particles


@ -46,7 +46,6 @@ class PPPMDipoleSpin : public PPPMDipole {
void fieldforce_ik_spin();
void fieldforce_peratom_spin();
void spsum_spsq();
};
}

File diff suppressed because it is too large


@ -42,7 +42,6 @@ typedef double FFT_SCALAR;
namespace LAMMPS_NS {
#define EWALD_MAXORDER 6
#define EWALD_FUNCS 4
@ -190,15 +189,14 @@ Variables needed for calculating the 1/r and 1/r^6 potential
FFT_SCALAR *work1,*work2;
FFT_SCALAR *work1_6, *work2_6;
class FFT3d *fft1,*fft2 ;
class FFT3d *fft1_6, *fft2_6;
class Remap *remap;
class Remap *remap_6;
class GridComm *cg;
class GridComm *cg_peratom;
class GridComm *cg_6;
class GridComm *cg_peratom_6;
class FFT3d *fft1_6,*fft2_6;
class Remap *remap,*remap_6;
class GridComm *gc,*gc6;
FFT_SCALAR *gc_buf1,*gc_buf2,*gc6_buf1,*gc6_buf2;
int ngc_buf1,ngc_buf2,npergrid;
int ngc6_buf1,ngc6_buf2,npergrid6;
int **part2grid; // storage for particle -> grid mapping
int **part2grid_6;
@ -257,7 +255,6 @@ Variables needed for calculating the 1/r and 1/r^6 potential
void compute_gf_denom(double*, int);
double gf_denom(double, double, double, double*, int);
void compute_sf_precoeff(int, int, int, int,
int, int, int,
int, int, int,
@ -268,7 +265,6 @@ Variables needed for calculating the 1/r and 1/r^6 potential
void compute_gf_6();
void compute_sf_coeff_6();
virtual void particle_map(double, double, double,
double, int **, int, int,
int, int, int,
@ -295,8 +291,10 @@ Variables needed for calculating the 1/r and 1/r^6 potential
int, int, int, double&, double *,
double *, double *, double *,
double *, double *, double *,
FFT_SCALAR ***, FFT_SCALAR ***, FFT_SCALAR ***, double *, double **, double **,
FFT_SCALAR ***, FFT_SCALAR ***, FFT_SCALAR ***, FFT_SCALAR ***,
FFT_SCALAR ***, FFT_SCALAR ***,
FFT_SCALAR ***, double *, double **, double **,
FFT_SCALAR ***, FFT_SCALAR ***,
FFT_SCALAR ***, FFT_SCALAR ***,
FFT_SCALAR ***, FFT_SCALAR ***, FFT_SCALAR ***);
virtual void poisson_ad(FFT_SCALAR*, FFT_SCALAR*,
@ -317,14 +315,18 @@ Variables needed for calculating the 1/r and 1/r^6 potential
virtual void poisson_2s_ik(FFT_SCALAR *, FFT_SCALAR *,
FFT_SCALAR ***, FFT_SCALAR ***, FFT_SCALAR ***,
FFT_SCALAR ***, FFT_SCALAR ***, FFT_SCALAR ***,
FFT_SCALAR ***, FFT_SCALAR ***, FFT_SCALAR ***, FFT_SCALAR ***,
FFT_SCALAR ***, FFT_SCALAR ***, FFT_SCALAR ***,
FFT_SCALAR ***, FFT_SCALAR ***, FFT_SCALAR ***, FFT_SCALAR ***,
FFT_SCALAR ***,
FFT_SCALAR ***, FFT_SCALAR ***, FFT_SCALAR ***,
FFT_SCALAR ***, FFT_SCALAR ***, FFT_SCALAR ***,
FFT_SCALAR ***,
FFT_SCALAR ***, FFT_SCALAR ***, FFT_SCALAR ***);
virtual void poisson_2s_ad(FFT_SCALAR *, FFT_SCALAR *,
FFT_SCALAR ***, FFT_SCALAR ***, FFT_SCALAR ***, FFT_SCALAR ***,
FFT_SCALAR ***, FFT_SCALAR ***, FFT_SCALAR ***,
FFT_SCALAR ***, FFT_SCALAR ***, FFT_SCALAR ***, FFT_SCALAR ***,
FFT_SCALAR ***,
FFT_SCALAR ***, FFT_SCALAR ***, FFT_SCALAR ***,
FFT_SCALAR ***, FFT_SCALAR ***, FFT_SCALAR ***,
FFT_SCALAR ***,
FFT_SCALAR ***, FFT_SCALAR ***, FFT_SCALAR ***);
virtual void poisson_2s_peratom(FFT_SCALAR***, FFT_SCALAR***, FFT_SCALAR***,
@ -339,9 +341,11 @@ Variables needed for calculating the 1/r and 1/r^6 potential
virtual void poisson_none_ik(int, int, FFT_SCALAR *, FFT_SCALAR *,
FFT_SCALAR ***, FFT_SCALAR ***, FFT_SCALAR ***,
FFT_SCALAR ***, FFT_SCALAR ***, FFT_SCALAR ***,
FFT_SCALAR ****, FFT_SCALAR ****, FFT_SCALAR ****, FFT_SCALAR ****,
FFT_SCALAR ****, FFT_SCALAR ****, FFT_SCALAR ****,
FFT_SCALAR ****,
FFT_SCALAR ****, FFT_SCALAR ****, FFT_SCALAR ****);
virtual void poisson_none_peratom(int, int, FFT_SCALAR***, FFT_SCALAR***, FFT_SCALAR***,
virtual void poisson_none_peratom(int, int,
FFT_SCALAR***, FFT_SCALAR***, FFT_SCALAR***,
FFT_SCALAR***, FFT_SCALAR***, FFT_SCALAR***,
FFT_SCALAR***, FFT_SCALAR***, FFT_SCALAR***,
FFT_SCALAR***, FFT_SCALAR***, FFT_SCALAR***);
@ -369,10 +373,10 @@ Variables needed for calculating the 1/r and 1/r^6 potential
// grid communication
void pack_forward(int, FFT_SCALAR *, int, int *);
void unpack_forward(int, FFT_SCALAR *, int, int *);
void pack_reverse(int, FFT_SCALAR *, int, int *);
void unpack_reverse(int, FFT_SCALAR *, int, int *);
void pack_forward_grid(int, void *, int, int *);
void unpack_forward_grid(int, void *, int, int *);
void pack_reverse_grid(int, void *, int, int *);
void unpack_reverse_grid(int, void *, int, int *);
};
}


@ -123,11 +123,7 @@ void PPPMStagger::compute(int eflag, int vflag)
ev_init(eflag,vflag);
if (evflag_atom && !peratom_allocate_flag) {
allocate_peratom();
cg_peratom->ghost_notify();
cg_peratom->setup();
}
if (evflag_atom && !peratom_allocate_flag) allocate_peratom();
// convert atoms from box to lamda coords
@ -160,7 +156,8 @@ void PPPMStagger::compute(int eflag, int vflag)
// to fully sum contribution in their 3d bricks
// remap from 3d decomposition to FFT decomposition
cg->reverse_comm(this,REVERSE_RHO);
gc->reverse_comm_kspace(this,1,sizeof(FFT_SCALAR),REVERSE_RHO,
gc_buf1,gc_buf2,MPI_FFT_SCALAR);
brick2fft();
// compute potential gradient on my FFT grid and
@ -173,16 +170,22 @@ void PPPMStagger::compute(int eflag, int vflag)
// all procs communicate E-field values
// to fill ghost cells surrounding their 3d bricks
if (differentiation_flag == 1) cg->forward_comm(this,FORWARD_AD);
else cg->forward_comm(this,FORWARD_IK);
if (differentiation_flag == 1)
gc->forward_comm_kspace(this,1,sizeof(FFT_SCALAR),FORWARD_AD,
gc_buf1,gc_buf2,MPI_FFT_SCALAR);
else
gc->forward_comm_kspace(this,3,sizeof(FFT_SCALAR),FORWARD_IK,
gc_buf1,gc_buf2,MPI_FFT_SCALAR);
// extra per-atom energy/virial communication
if (evflag_atom) {
if (differentiation_flag == 1 && vflag_atom)
cg_peratom->forward_comm(this,FORWARD_AD_PERATOM);
gc->forward_comm_kspace(this,6,sizeof(FFT_SCALAR),FORWARD_AD_PERATOM,
gc_buf1,gc_buf2,MPI_FFT_SCALAR);
else if (differentiation_flag == 0)
cg_peratom->forward_comm(this,FORWARD_IK_PERATOM);
gc->forward_comm_kspace(this,7,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM,
gc_buf1,gc_buf2,MPI_FFT_SCALAR);
}
// calculate the force on my particles
@ -268,10 +271,16 @@ void PPPMStagger::compute(int eflag, int vflag)
double PPPMStagger::compute_qopt()
{
if (differentiation_flag == 1)
return compute_qopt_ad();
if (differentiation_flag == 1) return compute_qopt_ad();
int k,l,m,nx,ny,nz;
double snx,sny,snz;
double cnx,cny,cnz;
double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz;
double sum1,sum2,dot1,dot2;
double numerator,denominator;
double u1,u2,u3,sqk;
double qopt = 0.0;
const double * const prd = domain->prd;
const double xprd = prd[0];
@ -282,43 +291,43 @@ double PPPMStagger::compute_qopt()
const double unitky = (MY_2PI/yprd);
const double unitkz = (MY_2PI/zprd_slab);
double snx,sny,snz;
double cnx,cny,cnz;
double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz;
double sum1,sum2,dot1,dot2;
double numerator,denominator;
double u1,u2,u3,sqk;
int k,l,m,nx,ny,nz,kper,lper,mper;
const int nbx = 2;
const int nby = 2;
const int nbz = 2;
const int twoorder = 2*order;
for (m = nzlo_fft; m <= nzhi_fft; m++) {
mper = m - nz_pppm*(2*m/nz_pppm);
// loop over entire FFT grid
// each proc calculates contributions from every Pth grid point
bigint ngridtotal = (bigint) nx_pppm * ny_pppm * nz_pppm;
int nxy_pppm = nx_pppm * ny_pppm;
double qopt = 0.0;
for (bigint i = me; i < ngridtotal; i += nprocs) {
k = i % nx_pppm;
l = (i/nx_pppm) % ny_pppm;
m = i / nxy_pppm;
const int kper = k - nx_pppm*(2*k/nx_pppm);
const int lper = l - ny_pppm*(2*l/ny_pppm);
const int mper = m - nz_pppm*(2*m/nz_pppm);
sqk = square(unitkx*kper) + square(unitky*lper) + square(unitkz*mper);
if (sqk == 0.0) continue;
snx = square(sin(0.5*unitkx*kper*xprd/nx_pppm));
cnx = cos(0.5*unitkx*kper*xprd/nx_pppm);
sny = square(sin(0.5*unitky*lper*yprd/ny_pppm));
cny = cos(0.5*unitky*lper*yprd/ny_pppm);
snz = square(sin(0.5*unitkz*mper*zprd_slab/nz_pppm));
cnz = cos(0.5*unitkz*mper*zprd_slab/nz_pppm);
for (l = nylo_fft; l <= nyhi_fft; l++) {
lper = l - ny_pppm*(2*l/ny_pppm);
sny = square(sin(0.5*unitky*lper*yprd/ny_pppm));
cny = cos(0.5*unitky*lper*yprd/ny_pppm);
for (k = nxlo_fft; k <= nxhi_fft; k++) {
kper = k - nx_pppm*(2*k/nx_pppm);
snx = square(sin(0.5*unitkx*kper*xprd/nx_pppm));
cnx = cos(0.5*unitkx*kper*xprd/nx_pppm);
sqk = square(unitkx*kper) + square(unitky*lper) + square(unitkz*mper);
if (sqk != 0.0) {
numerator = MY_4PI/sqk;
denominator = 0.5*(gf_denom(snx,sny,snz) + gf_denom2(cnx,cny,cnz));
sum1 = 0.0;
sum2 = 0.0;
sum1 = sum2 = 0.0;
for (nx = -nbx; nx <= nbx; nx++) {
qx = unitkx*(kper+nx_pppm*nx);
@ -348,11 +357,10 @@ double PPPMStagger::compute_qopt()
}
}
}
qopt += sum1 - sum2/denominator;
}
}
}
}
double qopt_all;
MPI_Allreduce(&qopt,&qopt_all,1,MPI_DOUBLE,MPI_SUM,world);
return qopt_all;
@ -364,7 +372,11 @@ double PPPMStagger::compute_qopt()
double PPPMStagger::compute_qopt_ad()
{
double qopt = 0.0;
int k,l,m,nx,ny,nz;
double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz;
double sum1,sum2,sum3,sum4,sum5,sum6,dot2;
double u1,u2,sqk;
const double * const prd = domain->prd;
const double xprd = prd[0];
@ -375,36 +387,33 @@ double PPPMStagger::compute_qopt_ad()
const double unitky = (MY_2PI/yprd);
const double unitkz = (MY_2PI/zprd_slab);
double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz;
double sum1,sum2,sum3,sum4,sum5,sum6,dot2;
double u1,u2,sqk;
int k,l,m,nx,ny,nz,kper,lper,mper;
const int nbx = 2;
const int nby = 2;
const int nbz = 2;
const int twoorder = 2*order;
for (m = nzlo_fft; m <= nzhi_fft; m++) {
mper = m - nz_pppm*(2*m/nz_pppm);
// loop over entire FFT grid
// each proc calculates contributions from every Pth grid point
for (l = nylo_fft; l <= nyhi_fft; l++) {
lper = l - ny_pppm*(2*l/ny_pppm);
bigint ngridtotal = (bigint) nx_pppm * ny_pppm * nz_pppm;
int nxy_pppm = nx_pppm * ny_pppm;
for (k = nxlo_fft; k <= nxhi_fft; k++) {
kper = k - nx_pppm*(2*k/nx_pppm);
double qopt = 0.0;
for (bigint i = me; i < ngridtotal; i += nprocs) {
k = i % nx_pppm;
l = (i/nx_pppm) % ny_pppm;
m = i / nxy_pppm;
const int kper = k - nx_pppm*(2*k/nx_pppm);
const int lper = l - ny_pppm*(2*l/ny_pppm);
const int mper = m - nz_pppm*(2*m/nz_pppm);
sqk = square(unitkx*kper) + square(unitky*lper) + square(unitkz*mper);
if (sqk == 0.0) continue;
if (sqk != 0.0) {
sum1 = 0.0;
sum2 = 0.0;
sum3 = 0.0;
sum4 = 0.0;
sum5 = 0.0;
sum6 = 0.0;
sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = 0.0;
for (nx = -nbx; nx <= nbx; nx++) {
qx = unitkx*(kper+nx_pppm*nx);
@ -436,11 +445,10 @@ double PPPMStagger::compute_qopt_ad()
}
}
}
qopt += sum1 - sum2/(0.5*(sum3*sum4 + sum5*sum6));
}
}
}
}
double qopt_all;
MPI_Allreduce(&qopt,&qopt_all,1,MPI_DOUBLE,MPI_SUM,world);
return qopt_all;


@ -50,7 +50,6 @@ class PPPMStagger : public PPPM {
virtual void fieldforce_ad();
virtual void fieldforce_peratom();
inline double gf_denom2(const double &x, const double &y,
const double &z) const
{


@ -131,9 +131,10 @@ void PPPMDispIntel::init()
// For vectorization, we need some padding at the end
// The first thread computes on the global density
if ((comm->nthreads > 1) && !_use_lrt) {
int mygrid = MAX(ngrid,ngrid_6);
memory->destroy(perthread_density);
memory->create(perthread_density, comm->nthreads-1,
ngrid + INTEL_P3M_ALIGNED_MAXORDER,
mygrid + INTEL_P3M_ALIGNED_MAXORDER,
"pppmdispintel:perthread_density");
}
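The scratch arrays are now sized to the larger of the Coulomb and dispersion bricks, presumably so the same per-thread buffers can serve either density accumulation, plus the usual vector-alignment padding. A standalone sketch of that sizing rule, with PAD standing in for INTEL_P3M_ALIGNED_MAXORDER and all numbers made up:

#include <algorithm>
#include <vector>

int main()
{
  const int ngrid = 4096, ngrid_6 = 5120;  // Coulomb vs dispersion brick sizes
  const int PAD = 8;                       // stand-in for INTEL_P3M_ALIGNED_MAXORDER
  const int nthreads = 4;
  const int mygrid = std::max(ngrid, ngrid_6);
  // one private scratch density per extra thread; thread 0 accumulates on the global brick
  std::vector< std::vector<double> > scratch(nthreads - 1,
      std::vector<double>(mygrid + PAD, 0.0));
  (void) scratch;
  return 0;
}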
@ -173,23 +174,15 @@ void PPPMDispIntel::compute(int eflag, int vflag)
return;
}
#endif
int i;
// convert atoms from box to lamda coords
ev_init(eflag,vflag);
if (evflag_atom && !peratom_allocate_flag) {
allocate_peratom();
if (function[0]) {
cg_peratom->ghost_notify();
cg_peratom->setup();
}
if (function[1] + function[2] + function[3]) {
cg_peratom_6->ghost_notify();
cg_peratom_6->setup();
}
peratom_allocate_flag = 1;
}
if (evflag_atom && !peratom_allocate_flag) allocate_peratom();
if (triclinic == 0) boxlo = domain->boxlo;
else {
boxlo = domain->boxlo_lamda;
@ -299,7 +292,8 @@ void PPPMDispIntel::compute(int eflag, int vflag)
make_rho_c<float,float>(fix->get_single_buffers());
}
cg->reverse_comm(this,REVERSE_RHO);
gc->reverse_comm_kspace(this,1,sizeof(FFT_SCALAR),REVERSE_RHO,
gc_buf1,gc_buf2,MPI_FFT_SCALAR);
brick2fft(nxlo_in, nylo_in, nzlo_in, nxhi_in, nyhi_in, nzhi_in,
density_brick, density_fft, work1,remap);
@ -312,7 +306,8 @@ void PPPMDispIntel::compute(int eflag, int vflag)
energy_1, greensfn, virial_1, vg,vg2, u_brick, v0_brick,
v1_brick, v2_brick, v3_brick, v4_brick, v5_brick);
cg->forward_comm(this,FORWARD_AD);
gc->forward_comm_kspace(this,1,sizeof(FFT_SCALAR),FORWARD_AD,
gc_buf1,gc_buf2,MPI_FFT_SCALAR);
if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
fieldforce_c_ad<float,double>(fix->get_mixed_buffers());
@ -322,7 +317,9 @@ void PPPMDispIntel::compute(int eflag, int vflag)
fieldforce_c_ad<float,float>(fix->get_single_buffers());
}
if (vflag_atom) cg_peratom->forward_comm(this, FORWARD_AD_PERATOM);
if (vflag_atom)
gc->forward_comm_kspace(this,6,sizeof(FFT_SCALAR),FORWARD_AD_PERATOM,
gc_buf1,gc_buf2,MPI_FFT_SCALAR);
} else {
poisson_ik(work1, work2, density_fft, fft1, fft2,
@ -334,7 +331,8 @@ void PPPMDispIntel::compute(int eflag, int vflag)
u_brick, v0_brick, v1_brick, v2_brick, v3_brick, v4_brick,
v5_brick);
cg->forward_comm(this, FORWARD_IK);
gc->forward_comm_kspace(this,3,sizeof(FFT_SCALAR),FORWARD_IK,
gc_buf1,gc_buf2,MPI_FFT_SCALAR);
if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
fieldforce_c_ik<float,double>(fix->get_mixed_buffers());
@ -344,12 +342,15 @@ void PPPMDispIntel::compute(int eflag, int vflag)
fieldforce_c_ik<float,float>(fix->get_single_buffers());
}
if (evflag_atom) cg_peratom->forward_comm(this, FORWARD_IK_PERATOM);
if (evflag_atom)
gc->forward_comm_kspace(this,7,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM,
gc_buf1,gc_buf2,MPI_FFT_SCALAR);
}
if (evflag_atom) fieldforce_c_peratom();
}
if (function[1]) {
//perform calculations for geometric mixing
if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
@ -375,14 +376,13 @@ void PPPMDispIntel::compute(int eflag, int vflag)
make_rho_g<float,float>(fix->get_single_buffers());
}
cg_6->reverse_comm(this, REVERSE_RHO_G);
gc6->reverse_comm_kspace(this,1,sizeof(FFT_SCALAR),REVERSE_RHO_G,
gc6_buf1,gc6_buf2,MPI_FFT_SCALAR);
brick2fft(nxlo_in_6, nylo_in_6, nzlo_in_6, nxhi_in_6, nyhi_in_6, nzhi_in_6,
density_brick_g, density_fft_g, work1_6,remap_6);
if (differentiation_flag == 1) {
poisson_ad(work1_6, work2_6, density_fft_g, fft1_6, fft2_6,
nx_pppm_6, ny_pppm_6, nz_pppm_6, nfft_6,
nxlo_fft_6, nylo_fft_6, nzlo_fft_6, nxhi_fft_6,
@ -391,7 +391,8 @@ void PPPMDispIntel::compute(int eflag, int vflag)
virial_6, vg_6, vg2_6, u_brick_g, v0_brick_g, v1_brick_g,
v2_brick_g, v3_brick_g, v4_brick_g, v5_brick_g);
cg_6->forward_comm(this,FORWARD_AD_G);
gc6->forward_comm_kspace(this,1,sizeof(FFT_SCALAR),FORWARD_AD_G,
gc6_buf1,gc6_buf2,MPI_FFT_SCALAR);
if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
fieldforce_g_ad<float,double>(fix->get_mixed_buffers());
@ -401,7 +402,9 @@ void PPPMDispIntel::compute(int eflag, int vflag)
fieldforce_g_ad<float,float>(fix->get_single_buffers());
}
if (vflag_atom) cg_peratom_6->forward_comm(this,FORWARD_AD_PERATOM_G);
if (vflag_atom)
gc6->forward_comm_kspace(this,7,sizeof(FFT_SCALAR),FORWARD_AD_PERATOM_G,
gc6_buf1,gc6_buf2,MPI_FFT_SCALAR);
} else {
poisson_ik(work1_6, work2_6, density_fft_g, fft1_6, fft2_6,
@ -413,7 +416,8 @@ void PPPMDispIntel::compute(int eflag, int vflag)
vdz_brick_g, virial_6, vg_6, vg2_6, u_brick_g, v0_brick_g,
v1_brick_g, v2_brick_g, v3_brick_g, v4_brick_g, v5_brick_g);
cg_6->forward_comm(this,FORWARD_IK_G);
gc6->forward_comm_kspace(this,3,sizeof(FFT_SCALAR),FORWARD_IK_G,
gc6_buf1,gc6_buf2,MPI_FFT_SCALAR);
if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
fieldforce_g_ik<float,double>(fix->get_mixed_buffers());
@ -423,9 +427,11 @@ void PPPMDispIntel::compute(int eflag, int vflag)
fieldforce_g_ik<float,float>(fix->get_single_buffers());
}
if (evflag_atom) cg_peratom_6->forward_comm(this, FORWARD_IK_PERATOM_G);
if (evflag_atom)
gc6->forward_comm_kspace(this,6,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM_G,
gc6_buf1,gc6_buf2,MPI_FFT_SCALAR);
}
if (evflag_atom) fieldforce_g_peratom();
}
@ -455,12 +461,12 @@ void PPPMDispIntel::compute(int eflag, int vflag)
make_rho_a<float,float>(fix->get_single_buffers());
}
cg_6->reverse_comm(this, REVERSE_RHO_A);
gc->reverse_comm_kspace(this,7,sizeof(FFT_SCALAR),REVERSE_RHO_A,
gc_buf1,gc_buf2,MPI_FFT_SCALAR);
brick2fft_a();
if ( differentiation_flag == 1) {
if (differentiation_flag == 1) {
poisson_ad(work1_6, work2_6, density_fft_a3, fft1_6, fft2_6,
nx_pppm_6, ny_pppm_6, nz_pppm_6, nfft_6, nxlo_fft_6,
nylo_fft_6, nzlo_fft_6, nxhi_fft_6, nyhi_fft_6, nzhi_fft_6,
@ -481,7 +487,8 @@ void PPPMDispIntel::compute(int eflag, int vflag)
v5_brick_a2, u_brick_a4, v0_brick_a4, v1_brick_a4,
v2_brick_a4, v3_brick_a4, v4_brick_a4, v5_brick_a4);
cg_6->forward_comm(this, FORWARD_AD_A);
gc6->forward_comm_kspace(this,7,sizeof(FFT_SCALAR),FORWARD_AD_A,
gc6_buf1,gc6_buf2,MPI_FFT_SCALAR);
if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
fieldforce_a_ad<float,double>(fix->get_mixed_buffers());
@ -491,10 +498,11 @@ void PPPMDispIntel::compute(int eflag, int vflag)
fieldforce_a_ad<float,float>(fix->get_single_buffers());
}
if (evflag_atom) cg_peratom_6->forward_comm(this, FORWARD_AD_PERATOM_A);
if (evflag_atom)
gc6->forward_comm_kspace(this,42,sizeof(FFT_SCALAR),FORWARD_AD_PERATOM_A,
gc6_buf1,gc6_buf2,MPI_FFT_SCALAR);
} else {
poisson_ik(work1_6, work2_6, density_fft_a3, fft1_6, fft2_6,
nx_pppm_6, ny_pppm_6, nz_pppm_6, nfft_6, nxlo_fft_6,
nylo_fft_6, nzlo_fft_6, nxhi_fft_6, nyhi_fft_6, nzhi_fft_6,
@ -522,7 +530,8 @@ void PPPMDispIntel::compute(int eflag, int vflag)
u_brick_a4, v0_brick_a4, v1_brick_a4, v2_brick_a4,
v3_brick_a4, v4_brick_a4, v5_brick_a4);
cg_6->forward_comm(this, FORWARD_IK_A);
gc6->forward_comm_kspace(this,18,sizeof(FFT_SCALAR),FORWARD_IK_A,
gc6_buf1,gc6_buf2,MPI_FFT_SCALAR);
if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
fieldforce_a_ik<float,double>(fix->get_mixed_buffers());
@ -532,13 +541,17 @@ void PPPMDispIntel::compute(int eflag, int vflag)
fieldforce_a_ik<float,float>(fix->get_single_buffers());
}
if (evflag_atom) cg_peratom_6->forward_comm(this, FORWARD_IK_PERATOM_A);
if (evflag_atom)
gc6->forward_comm_kspace(this,49,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM_A,
gc6_buf1,gc6_buf2,MPI_FFT_SCALAR);
}
if (evflag_atom) fieldforce_a_peratom();
}
if (function[3]) {
//perform calculations if no mixing rule applies
// perform calculations if no mixing rule applies
if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
particle_map<float,double>(delxinv_6, delyinv_6, delzinv_6, shift_6,
@ -563,7 +576,8 @@ void PPPMDispIntel::compute(int eflag, int vflag)
make_rho_none<float,float>(fix->get_single_buffers());
}
cg_6->reverse_comm(this, REVERSE_RHO_NONE);
gc->reverse_comm_kspace(this,1,sizeof(FFT_SCALAR),REVERSE_RHO_NONE,
gc_buf1,gc_buf2,MPI_FFT_SCALAR);
brick2fft_none();
@ -578,7 +592,8 @@ void PPPMDispIntel::compute(int eflag, int vflag)
n += 2;
}
cg_6->forward_comm(this,FORWARD_AD_NONE);
gc6->forward_comm_kspace(this,1,sizeof(FFT_SCALAR),FORWARD_AD_NONE,
gc6_buf1,gc6_buf2,MPI_FFT_SCALAR);
if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
fieldforce_none_ad<float,double>(fix->get_mixed_buffers());
@ -588,7 +603,9 @@ void PPPMDispIntel::compute(int eflag, int vflag)
fieldforce_none_ad<float,float>(fix->get_single_buffers());
}
if (vflag_atom) cg_peratom_6->forward_comm(this,FORWARD_AD_PERATOM_NONE);
if (vflag_atom)
gc6->forward_comm_kspace(this,6*nsplit_alloc,sizeof(FFT_SCALAR),FORWARD_AD_PERATOM_NONE,
gc6_buf1,gc6_buf2,MPI_FFT_SCALAR);
} else {
int n = 0;
@ -604,7 +621,8 @@ void PPPMDispIntel::compute(int eflag, int vflag)
n += 2;
}
cg_6->forward_comm(this,FORWARD_IK_NONE);
gc6->forward_comm_kspace(this,3*nsplit_alloc,sizeof(FFT_SCALAR),FORWARD_IK_NONE,
gc6_buf1,gc6_buf2,MPI_FFT_SCALAR);
if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
fieldforce_none_ik<float,double>(fix->get_mixed_buffers());
@ -615,8 +633,10 @@ void PPPMDispIntel::compute(int eflag, int vflag)
}
if (evflag_atom)
cg_peratom_6->forward_comm(this, FORWARD_IK_PERATOM_NONE);
gc6->forward_comm_kspace(this,7*nsplit_alloc,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM_NONE,
gc6_buf1,gc6_buf2,MPI_FFT_SCALAR);
}
if (evflag_atom) fieldforce_none_peratom();
}
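
The second argument of each call has to equal the number of FFT_SCALAR values that pack_forward_grid()/pack_reverse_grid() write per grid point for that message type. A compact reading aid for the dispersion-grid messages above, assuming the PPPMDisp pack routines loop over the seven arithmetic-mixing bricks (a0..a6) and over nsplit_alloc bricks in the no-mixing case; the helper itself is illustrative and not part of the class:

// illustrative only: values per grid point for the dispersion-grid messages above
// (1 potential value for ad, 3 field components for ik, 6 or 7 per-atom terms,
//  multiplied by the number of density bricks of the mixing mode)
static int disp_nper(int which, int nsplit_alloc)
{
  switch (which) {
    case FORWARD_AD_A:            return 7;        // u for bricks a0..a6
    case FORWARD_IK_A:            return 3*7;      // x/y/z field for a0..a6
    case FORWARD_AD_PERATOM_A:    return 6*7;      // v0..v5 for a0..a6
    case FORWARD_IK_PERATOM_A:    return 7*7;      // u + v0..v5 for a0..a6
    case REVERSE_RHO_NONE:        return nsplit_alloc;
    case FORWARD_AD_NONE:         return nsplit_alloc;
    case FORWARD_IK_NONE:         return 3*nsplit_alloc;
    case FORWARD_AD_PERATOM_NONE: return 6*nsplit_alloc;
    case FORWARD_IK_PERATOM_NONE: return 7*nsplit_alloc;
    default:                      return 0;
  }
}
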
@ -655,7 +675,7 @@ void PPPMDispIntel::compute(int eflag, int vflag)
for (i = 0; i < 6; i++) virial[i] = 0.5*qscale*volume*virial_all[i];
MPI_Allreduce(virial_6,virial_all,6,MPI_DOUBLE,MPI_SUM,world);
for (i = 0; i < 6; i++) virial[i] += 0.5*volume*virial_all[i];
if (function[1]+function[2]+function[3]){
if (function[1]+function[2]+function[3]) {
double a = MY_PI*MY_PIS/(6*volume)*pow(g_ewald_6,3)*csumij;
virial[0] -= a;
virial[1] -= a;
@ -2964,10 +2984,10 @@ void PPPMDispIntel::fieldforce_none_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
void PPPMDispIntel::precompute_rho()
{
half_rho_scale = (rho_points - 1.)/2.;
half_rho_scale_plus = half_rho_scale + 0.5;
if (function[0]) {
for (int i = 0; i < rho_points; i++) {
FFT_SCALAR dx = -1. + 1./half_rho_scale * (FFT_SCALAR)i;
#if defined(LMP_SIMD_COMPILER)
@ -2999,6 +3019,9 @@ void PPPMDispIntel::precompute_rho()
}
}
}
}
if (function[1]+function[2]+function[3]) {
for (int i = 0; i < rho_points; i++) {
FFT_SCALAR dx = -1. + 1./half_rho_scale * (FFT_SCALAR)i;
#if defined(LMP_SIMD_COMPILER)
@ -3030,6 +3053,7 @@ void PPPMDispIntel::precompute_rho()
}
}
}
}
}
/* ----------------------------------------------------------------------

View File

@ -70,8 +70,6 @@ namespace LAMMPS_NS {
FFT_SCALAR *particle_eky6;
FFT_SCALAR *particle_ekz6;
int _use_table;
int rho_points;
FFT_SCALAR **rho_lookup;
@ -82,7 +80,6 @@ namespace LAMMPS_NS {
int _use_packing;
#ifdef _LMP_INTEL_OFFLOAD
int _use_base;
#endif

View File

@ -164,11 +164,7 @@ void PPPMIntel::compute_first(int eflag, int vflag)
ev_init(eflag,vflag);
if (evflag_atom && !peratom_allocate_flag) {
allocate_peratom();
cg_peratom->ghost_notify();
cg_peratom->setup();
}
if (evflag_atom && !peratom_allocate_flag) allocate_peratom();
// if atom count has changed, update qsum and qsqsum
@ -232,7 +228,8 @@ void PPPMIntel::compute_first(int eflag, int vflag)
// to fully sum contribution in their 3d bricks
// remap from 3d decomposition to FFT decomposition
cg->reverse_comm(this,REVERSE_RHO);
gc->reverse_comm_kspace(this,1,sizeof(FFT_SCALAR),REVERSE_RHO,
gc_buf1,gc_buf2,MPI_FFT_SCALAR);
brick2fft();
// compute potential gradient on my FFT grid and
@ -246,16 +243,22 @@ void PPPMIntel::compute_first(int eflag, int vflag)
// all procs communicate E-field values
// to fill ghost cells surrounding their 3d bricks
if (differentiation_flag == 1) cg->forward_comm(this,FORWARD_AD);
else cg->forward_comm(this,FORWARD_IK);
if (differentiation_flag == 1)
gc->forward_comm_kspace(this,1,sizeof(FFT_SCALAR),FORWARD_AD,
gc_buf1,gc_buf2,MPI_FFT_SCALAR);
else
gc->forward_comm_kspace(this,3,sizeof(FFT_SCALAR),FORWARD_IK,
gc_buf1,gc_buf2,MPI_FFT_SCALAR);
// extra per-atom energy/virial communication
if (evflag_atom) {
if (differentiation_flag == 1 && vflag_atom)
cg_peratom->forward_comm(this,FORWARD_AD_PERATOM);
gc->forward_comm_kspace(this,6,sizeof(FFT_SCALAR),FORWARD_AD_PERATOM,
gc_buf1,gc_buf2,MPI_FFT_SCALAR);
else if (differentiation_flag == 0)
cg_peratom->forward_comm(this,FORWARD_IK_PERATOM);
gc->forward_comm_kspace(this,7,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM,
gc_buf1,gc_buf2,MPI_FFT_SCALAR);
}
}
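
Dropping the ghost_notify()/setup() pair works because per-atom communication now reuses the same gc object; allocate_peratom() in the PPPM base class only has to regrow the shared buffers for the larger per-atom messages. A rough sketch of that step, assuming the member names visible at the call sites (npergrid and ngc_buf1/2 may be named differently in pppm.cpp):

// per-atom messages carry more values per grid point than the field comms,
// so the shared gc buffers are reallocated once per-atom data is requested
if (differentiation_flag) npergrid = 6;   // v0..v5 only (u already sent by FORWARD_AD)
else npergrid = 7;                        // u plus v0..v5

memory->destroy(gc_buf1);
memory->destroy(gc_buf2);
memory->create(gc_buf1,npergrid*ngc_buf1,"pppm:gc_buf1");
memory->create(gc_buf2,npergrid*ngc_buf2,"pppm:gc_buf2");
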

View File

@ -95,17 +95,7 @@ void MSMCGOMP::compute(int eflag, int vflag)
// invoke allocate_peratom() if needed for first time
if (vflag_atom && !peratom_allocate_flag) {
allocate_peratom();
cg_peratom_all->ghost_notify();
cg_peratom_all->setup();
for (int n=0; n<levels; n++) {
if (!active_flag[n]) continue;
cg_peratom[n]->ghost_notify();
cg_peratom[n]->setup();
}
peratom_allocate_flag = 1;
}
if (vflag_atom && !peratom_allocate_flag) allocate_peratom();
// extend size of per-atom arrays if necessary
@ -175,7 +165,8 @@ void MSMCGOMP::compute(int eflag, int vflag)
// to fully sum contribution in their 3d grid
current_level = 0;
cg_all->reverse_comm(this,REVERSE_RHO);
gcall->reverse_comm_kspace(this,1,sizeof(double),REVERSE_RHO,
gcall_buf1,gcall_buf2,MPI_DOUBLE);
// forward communicate charge density values to fill ghost grid points
// compute direct sum interaction and then restrict to coarser grid
@ -183,24 +174,30 @@ void MSMCGOMP::compute(int eflag, int vflag)
for (int n=0; n<=levels-2; n++) {
if (!active_flag[n]) continue;
current_level = n;
cg[n]->forward_comm(this,FORWARD_RHO);
gc[n]->forward_comm_kspace(this,1,sizeof(double),FORWARD_RHO,
gc_buf1[n],gc_buf2[n],MPI_DOUBLE);
direct(n);
restriction(n);
}
// compute direct interaction for top grid level for non-periodic
// and for second from top grid level for periodic
if (active_flag[levels-1]) {
if (domain->nonperiodic) {
current_level = levels-1;
cg[levels-1]->forward_comm(this,FORWARD_RHO);
gc[levels-1]->
forward_comm_kspace(this,1,sizeof(double),FORWARD_RHO,
gc_buf1[levels-1],gc_buf2[levels-1],MPI_DOUBLE);
direct_top(levels-1);
cg[levels-1]->reverse_comm(this,REVERSE_AD);
gc[levels-1]->
reverse_comm_kspace(this,1,sizeof(double),REVERSE_AD,
gc_buf1[levels-1],gc_buf2[levels-1],MPI_DOUBLE);
if (vflag_atom)
cg_peratom[levels-1]->reverse_comm(this,REVERSE_AD_PERATOM);
gc[levels-1]->
reverse_comm_kspace(this,6,sizeof(double),REVERSE_AD_PERATOM,
gc_buf1[levels-1],gc_buf2[levels-1],MPI_DOUBLE);
} else {
// Here using MPI_Allreduce is cheaper than using commgrid
grid_swap_forward(levels-1,qgrid[levels-1]);
@ -208,7 +205,9 @@ void MSMCGOMP::compute(int eflag, int vflag)
grid_swap_reverse(levels-1,egrid[levels-1]);
current_level = levels-1;
if (vflag_atom)
cg_peratom[levels-1]->reverse_comm(this,REVERSE_AD_PERATOM);
gc[levels-1]->
reverse_comm_kspace(this,6,sizeof(double),REVERSE_AD_PERATOM,
gc_buf1[levels-1],gc_buf2[levels-1],MPI_DOUBLE);
}
}
@ -220,24 +219,28 @@ void MSMCGOMP::compute(int eflag, int vflag)
prolongation(n);
current_level = n;
cg[n]->reverse_comm(this,REVERSE_AD);
gc[n]->reverse_comm_kspace(this,1,sizeof(double),REVERSE_AD,
gc_buf1[n],gc_buf2[n],MPI_DOUBLE);
// extra per-atom virial communication
if (vflag_atom)
cg_peratom[n]->reverse_comm(this,REVERSE_AD_PERATOM);
gc[n]->reverse_comm_kspace(this,6,sizeof(double),REVERSE_AD_PERATOM,
gc_buf1[n],gc_buf2[n],MPI_DOUBLE);
}
// all procs communicate E-field values
// to fill ghost cells surrounding their 3d bricks
current_level = 0;
cg_all->forward_comm(this,FORWARD_AD);
gcall->forward_comm_kspace(this,1,sizeof(double),FORWARD_AD,
gcall_buf1,gcall_buf2,MPI_DOUBLE);
// extra per-atom energy/virial communication
if (vflag_atom)
cg_peratom_all->forward_comm(this,FORWARD_AD_PERATOM);
gcall->forward_comm_kspace(this,6,sizeof(double),FORWARD_AD_PERATOM,
gcall_buf1,gcall_buf2,MPI_DOUBLE);
// calculate the force on my particles (interpolation)
@ -556,6 +559,7 @@ void MSMCGOMP::fieldforce_peratom()
}
}
/* ---------------------------------------------------------------------- */
double MSMCGOMP::memory_usage()
{

View File

@ -387,20 +387,20 @@ void Balance::command(int narg, char **arg)
MPI_Wtime()-start_time);
mesg += fmt::format(" iteration count = {}\n",niter);
for (int i = 0; i < nimbalance; ++i) mesg += imbalances[i]->info();
mesg += fmt::format(" initial/final maximal load/proc = {} {}\n"
" initial/final imbalance factor = {:.6g} {:.6g}\n",
mesg += fmt::format(" initial/final maximal load/proc = {:.8} {:.8}\n"
" initial/final imbalance factor = {:.8} {:.8}\n",
maxinit,maxfinal,imbinit,imbfinal);
if (style != BISECTION) {
mesg += " x cuts:";
for (int i = 0; i <= comm->procgrid[0]; i++)
mesg += fmt::format(" {}",comm->xsplit[i]);
mesg += fmt::format(" {:.8}",comm->xsplit[i]);
mesg += "\n y cuts:";
for (int i = 0; i <= comm->procgrid[1]; i++)
mesg += fmt::format(" {}",comm->ysplit[i]);
mesg += fmt::format(" {:.8}",comm->ysplit[i]);
mesg += "\n z cuts:";
for (int i = 0; i <= comm->procgrid[2]; i++)
mesg += fmt::format(" {}",comm->zsplit[i]);
mesg += fmt::format(" {:.8}",comm->zsplit[i]);
mesg += "\n";
}
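
The format change from {} (and {:.6g}) to {:.8} prints the cut fractions and load numbers in fmt's general floating-point format with 8 significant digits instead of shortest-round-trip output; a self-contained illustration with a made-up value:

// standalone illustration of the fmt format spec used above (example value only)
#include <fmt/format.h>
#include <string>

int main() {
  double xsplit = 1.0/3.0;
  std::string a = fmt::format(" {}",    xsplit);  // " 0.3333333333333333" (shortest round-trip)
  std::string b = fmt::format(" {:.8}", xsplit);  // " 0.33333333" (8 significant digits)
  fmt::print("{}\n{}\n", a, b);
  return 0;
}
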

View File

@ -664,10 +664,6 @@ void Force::create_kspace(const std::string &style, int trysuffix)
int sflag;
kspace = new_kspace(style,trysuffix,sflag);
store_style(kspace_style,style,sflag);
if (comm->style == 1 && !kspace_match("ewald",0))
error->all(FLERR,
"Cannot yet use KSpace solver with grid with comm style tiled");
}
/* ----------------------------------------------------------------------

View File

@ -121,10 +121,10 @@ class KSpace : protected Pointers {
virtual void compute(int, int) = 0;
virtual void compute_group_group(int, int, int) {};
virtual void pack_forward(int, FFT_SCALAR *, int, int *) {};
virtual void unpack_forward(int, FFT_SCALAR *, int, int *) {};
virtual void pack_reverse(int, FFT_SCALAR *, int, int *) {};
virtual void unpack_reverse(int, FFT_SCALAR *, int, int *) {};
virtual void pack_forward_grid(int, void *, int, int *) {};
virtual void unpack_forward_grid(int, void *, int, int *) {};
virtual void pack_reverse_grid(int, void *, int, int *) {};
virtual void unpack_reverse_grid(int, void *, int, int *) {};
virtual int timing(int, double &, double &) {return 0;}
virtual int timing_1d(int, double &) {return 0;}
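
Switching the buffer parameter from FFT_SCALAR * to void * lets one GridComm code path move grid data of any element type; each style casts the buffer back to its own type inside the hook. A minimal, hypothetical override (class and brick names are invented for illustration):

// hypothetical KSpace style, for illustration only
void MyKSpaceStyle::pack_forward_grid(int flag, void *vbuf, int nlist, int *list)
{
  FFT_SCALAR *buf = (FFT_SCALAR *) vbuf;                       // recover the real element type
  FFT_SCALAR *src = &my_brick[nzlo_out][nylo_out][nxlo_out];   // invented grid array
  for (int i = 0; i < nlist; i++)
    buf[i] = src[list[i]];                                     // one value per listed grid point
}

The reverse-direction unpack hooks accumulate into the destination brick instead of assigning, since reverse communication sums ghost-cell contributions back onto the owning processor.
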

View File

@ -0,0 +1,94 @@
---
lammps_version: 30 Jun 2020
date_generated: Sun Jul 12 19:14:30 202
epsilon: 7.5e-14
prerequisites: ! |
atom full
pair coul/long
kspace pppm/cg
pre_commands: ! ""
post_commands: ! |
set atom 22*23 charge 0.0
set atom 25*26 charge 0.0
set atom 28*29 charge 0.0
set type 5 charge 0.0
comm_style tiled
pair_modify compute no
kspace_style pppm/cg 1.0e-6
kspace_modify gewald 0.3
balance 0.0 rcb
input_file: in.fourmol
pair_style: coul/long 8.0
pair_coeff: ! |
* *
extract: ! ""
natoms: 29
init_vdwl: 0
init_coul: 0
init_stress: ! |2-
0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
init_forces: ! |2
1 -2.6277259148670029e-01 6.3371291789473080e-02 2.5711876152341273e-01
2 7.2367296933642605e-02 -2.6377287441660424e-01 -7.6051513821789138e-02
3 -2.0405952297641711e-02 -7.8572370620207509e-03 1.6901994166300244e-02
4 1.3913539877995293e-01 2.2591757798275315e-03 -4.0397816193944613e-02
5 7.8220840962476321e-02 5.8018673691041286e-02 -6.0298251586383647e-02
6 1.6642601835640428e-02 5.4813636460583826e-01 -5.9948263006137181e-01
7 7.4422159409041455e-02 -5.2976037824286260e-01 2.8745376504092163e-01
8 5.1715645698484136e-01 -8.4224353212928849e-01 3.9252943496496401e-01
9 -2.8698074414725644e-01 3.8846246184711297e-01 -8.8735165696061394e-02
10 -1.4119088048501882e-01 1.7101059256234674e-01 -2.1093128275588843e-02
11 -1.9802109162868770e-01 2.3799905238132363e-01 -6.9205308272233729e-02
12 5.9829442228003460e-01 -6.6570589694338889e-01 1.8052289946052447e-01
13 -1.4432183109305963e-01 2.1119613677613439e-01 -3.7953547334912435e-02
14 -2.1057539442776216e-01 1.9833568369952351e-01 -4.3863660550604380e-02
15 -1.8510184013940739e-01 1.4558620355697388e-01 -6.3736986322272177e-02
16 -8.7068540567994446e-01 8.0781838347116530e-01 7.4023620199055895e-01
17 6.8573068239024604e-01 -5.4365714223217854e-01 -9.1523962812961923e-01
18 9.0711753065407197e-01 1.6157336473023263e+00 -1.6708191809635238e+00
19 -3.2812987976759644e-01 -8.1654277506424433e-01 9.6628327138680947e-01
20 -4.4090177907687206e-01 -7.7838783137249912e-01 8.4583048867481425e-01
21 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
22 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
23 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
24 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
25 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
26 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
27 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
28 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
29 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
run_vdwl: 0
run_coul: 0
run_stress: ! |2-
0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
run_forces: ! |2
1 -2.5272870059014418e-01 7.7993693891669019e-02 2.7413192480528514e-01
2 6.0317817543279174e-02 -2.7726219497773258e-01 -9.2328466795041211e-02
3 -2.0381925802263216e-02 -7.4169903634883827e-03 1.7718770630259381e-02
4 1.4007637687058852e-01 6.0294233349724097e-04 -4.4118635982017075e-02
5 7.7661555892458781e-02 5.6660597072423394e-02 -6.2527393985284924e-02
6 1.8472776517392236e-02 5.4080910460464371e-01 -6.2683322637258831e-01
7 7.1172361943358894e-02 -5.3339808852688553e-01 3.1182624682757953e-01
8 5.1931682876258478e-01 -8.4009257264924786e-01 4.1437713723636871e-01
9 -2.8926485235033661e-01 3.8545218869697995e-01 -9.6445715585120301e-02
10 -1.4130393273230371e-01 1.7154601844601167e-01 -2.4319589810512892e-02
11 -1.9820651458435656e-01 2.3998861307221755e-01 -7.2850168561931314e-02
12 6.0180481293740029e-01 -6.6352386739919189e-01 1.9241766558675899e-01
13 -1.4512210035912929e-01 2.1001807770143230e-01 -4.0833849803347325e-02
14 -2.1221325118484802e-01 1.9911652392243312e-01 -4.7359593324180368e-02
15 -1.8584239277230100e-01 1.4355785962059972e-01 -6.8625732013099072e-02
16 -8.6818324744564379e-01 8.1248951063969599e-01 7.2669223824162077e-01
17 6.8330419262914510e-01 -5.4635175697697436e-01 -9.0599066050100585e-01
18 9.4795190549104413e-01 1.6607978960168899e+00 -1.6003547491287762e+00
19 -3.4101870572315529e-01 -8.2840317570845334e-01 9.3657904772766076e-01
20 -4.6581300504277079e-01 -8.0258437941651817e-01 8.0884475080737095e-01
21 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
22 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
23 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
24 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
25 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
26 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
27 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
28 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
29 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
...
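
The new YAML files are reference data for the force-style unit tests: init_forces and run_forces are compared entry by entry against a fresh run of the listed input, with epsilon as the permitted relative deviation. A minimal sketch of such a check, assuming a plain relative-error criterion (the actual harness in unittest/force-styles may differ in detail):

// sketch of the epsilon check the reference data above is meant for; the real
// test harness may apply additional bookkeeping.
#include <cmath>
#include <cstdio>

static bool close_enough(double ref, double val, double epsilon) {
  double denom = std::fabs(ref) > 1.0 ? std::fabs(ref) : 1.0;  // assumed mixed abs/rel test
  return std::fabs(val - ref) / denom <= epsilon;
}

int main() {
  double ref = -2.6277259148670029e-01;   // force component from the table above
  double val = -2.6277259148670100e-01;   // hypothetical freshly computed value
  std::printf("%s\n", close_enough(ref, val, 7.5e-14) ? "ok" : "mismatch");
  return 0;
}
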

View File

@ -0,0 +1,99 @@
---
lammps_version: 21 Jul 2020
date_generated: Thu Aug 20 23:05:29 202
epsilon: 2.5e-13
prerequisites: ! |
atom full
pair lj/long/coul/long
kspace pppm/disp
pre_commands: ! ""
post_commands: ! |
pair_modify compute no
kspace_style pppm/disp 1.0e-4
kspace_modify gewald 0.5
kspace_modify force/disp/real 0.001
kspace_modify force/disp/kspace 0.005
kspace_modify diff ad
input_file: in.fourmol
pair_style: lj/long/coul/long long off 7.0
pair_coeff: ! |
1 1 0.02 2.5
2 2 0.005 1.0
2 4 0.005 0.5
3 3 0.02 3.2
4 4 0.015 3.1
5 5 0.015 3.1
extract: ! |
epsilon 2
sigma 2
cut_coul 0
natoms: 29
init_vdwl: 0
init_coul: 0
init_stress: ! |2-
0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
init_forces: ! |2
1 6.9724769730763625e-11 -6.2917196472420914e-11 -1.7553582754737637e-11
2 7.6094811340731255e-13 -1.1540411942409495e-12 1.6298559231496556e-13
3 3.9430185022927007e-11 -1.9017673718233126e-11 -1.6651673285376923e-13
4 1.5917087679856494e-12 -7.5219417810712124e-13 3.9732355364959770e-13
5 1.3754210730241487e-12 -5.2393248740046831e-13 -5.8786972569898321e-13
6 2.4324626429202970e-11 -5.9541975521081885e-12 1.1615602625372913e-11
7 3.9335920395601978e-11 -4.5805387197461763e-12 4.9818207226960900e-11
8 1.9840262237451370e-11 6.2076746170376731e-12 -4.0963480752166205e-13
9 6.7032491388560462e-14 2.8806602906466370e-14 -5.1168576901689580e-13
10 -9.5639153188399775e-12 1.6260508185384930e-11 5.9852790016982401e-12
11 -2.1047310133618033e-13 7.9345765138879343e-13 7.5666446278699520e-13
12 -2.9173061030991263e-11 3.7633064986308384e-12 1.7460466677353034e-11
13 -1.4524529065361568e-12 3.3470067557100611e-13 6.2185091609144014e-13
14 -7.3138369212027322e-13 1.0248711043717948e-13 1.1305113696402957e-12
15 -9.5037504870165340e-13 -3.5714658367712204e-13 3.9441042026885766e-13
16 -2.2415639659259037e-11 2.7640113997479121e-11 -1.1642230752605363e-11
17 -2.1208473166391679e-11 3.8522058586534765e-11 -4.6846216614724886e-11
18 -1.8040308770875740e-11 -5.7279008382924961e-11 7.2376750640813174e-11
19 -3.8160168113867275e-14 -1.0975133018216530e-12 1.5219157003617188e-12
20 -8.2805108197841928e-13 -1.1974415437385753e-12 1.4978331961150702e-12
21 -8.9475625142751869e-11 6.3090972691297525e-11 7.4019219615381729e-11
22 -1.5546663673414098e-12 1.1916129263874209e-12 1.5208406092298733e-12
23 -1.6319521654857783e-12 1.1826485038559488e-12 1.4904048534083759e-12
24 -1.5641127395622623e-11 -5.8519640828036183e-11 -7.6239449591252700e-11
25 1.1910315964875356e-13 -1.1889645936751337e-12 -1.4555354381968814e-12
26 -7.1757130498965896e-13 -1.1745493969901884e-12 -1.5212605958363234e-12
27 8.0820809208387831e-11 6.3170487105530568e-11 -6.5474684007301223e-11
28 1.6363504379062180e-12 1.1991903709725124e-12 -1.1721879859607168e-12
29 1.5216247551162307e-12 1.1846825159982258e-12 -1.4010202583443579e-12
run_vdwl: 0
run_coul: 0
run_stress: ! |2-
0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
run_forces: ! |2
1 6.9638112975627505e-11 -6.2952655433984510e-11 -1.7520579341404558e-11
2 7.5441280063952564e-13 -1.1565278235052112e-12 1.5428964869214424e-13
3 3.9495752841065062e-11 -1.9092781431624742e-11 -1.4273797798954887e-13
4 1.5925753071119834e-12 -7.5018590838065349e-13 4.0694176768714727e-13
5 1.3773389032846222e-12 -5.2518353267880336e-13 -5.8790169309916104e-13
6 2.4346713154520441e-11 -6.0219217022078622e-12 1.1556637696922417e-11
7 3.9348886643182682e-11 -4.5159528947341432e-12 4.9885111934485096e-11
8 1.9778099538765962e-11 6.2897367226276710e-12 -4.3131958485496316e-13
9 6.3695687649969267e-14 3.3892963902521213e-14 -5.1413748665229894e-13
10 -9.6012722021579935e-12 1.6233041014363616e-11 6.0003996378633241e-12
11 -2.0949350984685278e-13 7.9562479682529135e-13 7.6275214731002570e-13
12 -2.9254571969811829e-11 3.7981192304699482e-12 1.7545839680382277e-11
13 -1.4582673940336726e-12 3.2818654537516632e-13 6.2347550527444227e-13
14 -7.3106004851428835e-13 1.1022739695233736e-13 1.1369294512424691e-12
15 -9.4060413626822731e-13 -3.5126188279055496e-13 3.8676904179416883e-13
16 -2.2408460889929651e-11 2.7657483426256059e-11 -1.1741964403647136e-11
17 -2.1298916095589507e-11 3.8489167711847335e-11 -4.6910251494943480e-11
18 -1.7957271474310740e-11 -5.7234788024451294e-11 7.2383088876515692e-11
19 -3.6288894307680100e-14 -1.0943572602766692e-12 1.5210320121776364e-12
20 -8.3935575478300129e-13 -1.1966327701053927e-12 1.4966158763399045e-12
21 -8.9447202173297866e-11 6.3082241565048134e-11 7.4014350079233694e-11
22 -1.5510607489175700e-12 1.1907296946509683e-12 1.5210644252358221e-12
23 -1.6318182177140126e-12 1.1841207226717162e-12 1.4885209322729697e-12
24 -1.5676362809971050e-11 -5.8529118094744956e-11 -7.6215807688416359e-11
25 1.0742373795185689e-13 -1.1875074064094843e-12 -1.4571756878225340e-12
26 -7.1286524148732120e-13 -1.1757034686897436e-12 -1.5207002392359802e-12
27 8.0828500068665394e-11 6.3165679701490667e-11 -6.5424272562687016e-11
28 1.6362386625442489e-12 1.1989853840181747e-12 -1.1753814351696325e-12
29 1.5246414975786595e-12 1.1846002567244795e-12 -1.4036250137375415e-12
...

View File

@ -0,0 +1,98 @@
---
lammps_version: 21 Jul 2020
date_generated: Thu Aug 20 23:05:29 202
epsilon: 2.5e-13
prerequisites: ! |
atom full
pair lj/long/coul/long
kspace pppm/disp
pre_commands: ! ""
post_commands: ! |
pair_modify compute no
kspace_style pppm/disp 1.0e-4
kspace_modify gewald 0.5
kspace_modify force/disp/real 0.001
kspace_modify force/disp/kspace 0.005
input_file: in.fourmol
pair_style: lj/long/coul/long long off 7.0
pair_coeff: ! |
1 1 0.02 2.5
2 2 0.005 1.0
2 4 0.005 0.5
3 3 0.02 3.2
4 4 0.015 3.1
5 5 0.015 3.1
extract: ! |
epsilon 2
sigma 2
cut_coul 0
natoms: 29
init_vdwl: 0
init_coul: 0
init_stress: ! |2-
0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
init_forces: ! |2
1 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
2 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
3 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
4 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
5 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
6 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
7 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
8 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
9 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
10 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
11 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
12 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
13 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
14 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
15 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
16 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
17 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
18 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
19 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
20 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
21 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
22 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
23 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
24 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
25 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
26 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
27 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
28 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
29 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
run_vdwl: 0
run_coul: 0
run_stress: ! |2-
0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
run_forces: ! |2
1 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
2 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
3 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
4 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
5 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
6 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
7 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
8 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
9 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
10 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
11 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
12 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
13 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
14 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
15 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
16 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
17 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
18 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
19 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
20 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
21 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
22 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
23 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
24 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
25 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
26 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
27 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
28 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
29 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
...

View File

@ -0,0 +1,90 @@
---
lammps_version: 21 Jul 2020
date_generated: Mon Aug 3 23:27:35 202
epsilon: 7.5e-14
prerequisites: ! |
atom full
pair coul/long
kspace pppm/stagger
pre_commands: ! ""
post_commands: ! |
comm_style tiled
pair_modify compute no
kspace_style pppm/stagger 1.0e-6
kspace_modify gewald 0.3
balance 0.0 rcb
input_file: in.fourmol
pair_style: coul/long 8.0
pair_coeff: ! |
* *
extract: ! ""
natoms: 29
init_vdwl: 0
init_coul: 0
init_stress: ! |2-
0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
init_forces: ! |2
1 -5.2227715900845750e-01 8.1950519891037896e-02 2.1568864750376832e-01
2 2.1709329984201631e-01 -2.7910826043908610e-01 -1.3501796562404628e-01
3 -3.4431410110235108e-02 -9.3026609475793179e-03 1.9970153652663108e-02
4 1.6294612553322482e-01 2.8820208915782589e-02 -7.8041537992552090e-02
5 1.6018614143950777e-01 7.5432356753115826e-02 -3.7722588054429150e-02
6 5.6492988309033354e-01 4.1696814706534630e-01 -6.7673476362799612e-01
7 -3.4216205471660566e-01 -4.0009165524095192e-01 3.9325085501474977e-01
8 -1.4121658810415072e-01 -6.1694926607232958e-01 3.3967088097622000e-01
9 1.8213420211015810e-01 3.2023550501078568e-01 5.0865716213608053e-02
10 -5.1659753410710330e-02 1.1063713004312387e-01 -1.4434062323990951e-02
11 -8.4675426404111437e-02 1.5093767469541053e-01 -3.9273222694910556e-02
12 4.5743278086428246e-01 -4.2657803621344026e-01 3.4765791147615285e-02
13 -1.5598326968651899e-01 1.1609333380277490e-01 2.6827585674639255e-02
14 -1.7229830712699734e-01 1.3660265515995373e-01 1.0364293545061238e-02
15 -1.3779624948415931e-01 8.5611514314178239e-02 -1.4417936578185790e-02
16 -3.4309620718952316e-01 4.3358259515061448e-01 5.3264911488862143e-01
17 1.3394866253126306e-01 -4.1287506953147796e-01 -7.8819987038465467e-01
18 7.3032854805187175e-01 1.5459002190369358e+00 -1.3876618467094484e+00
19 -2.5946241349817661e-01 -7.7450328984918526e-01 7.7107291114413901e-01
20 -3.9364379163465191e-01 -7.0318115064463305e-01 7.3145133273582563e-01
21 5.1854207039779388e-01 5.4316140431986648e-01 -1.1630561612460866e+00
22 -2.9474924657955082e-01 -1.2314722330232061e-01 5.8311514555951505e-01
23 -2.8773056143507980e-01 -2.9281856868591627e-01 5.5619506923778372e-01
24 6.2752036539684239e-02 1.7441478631220706e+00 -2.7849000426516041e-01
25 1.2955239352175907e-01 -7.0427160638100861e-01 2.2582608971039136e-01
26 -2.2227598237037974e-01 -9.7470415379148345e-01 7.4514829391794934e-02
27 -8.5917766336431800e-01 1.6506499588643100e+00 -9.3704596621935576e-01
28 5.7091533654326099e-01 -9.1760299361767794e-01 5.4073727763352530e-01
29 4.1187460365847078e-01 -8.0559715142821642e-01 4.4313023169089372e-01
run_vdwl: 0
run_coul: 0
run_stress: ! |2-
0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
run_forces: ! |2
1 -5.0888488962950285e-01 8.3430404180325907e-02 2.3883070350075583e-01
2 2.0290231222263222e-01 -2.8552521628160221e-01 -1.4795322214007761e-01
3 -3.4147227596864727e-02 -9.1044423234983921e-03 2.1050772312966981e-02
4 1.6403639693437871e-01 2.7510909903057143e-02 -8.1700222473303191e-02
5 1.5793837855428028e-01 7.5134684921661865e-02 -4.3070400148934276e-02
6 5.5874989482134696e-01 4.1097082705147381e-01 -7.0880717747024347e-01
7 -3.4244419763693645e-01 -4.0336810330768602e-01 4.1416013762933518e-01
8 -1.2867011168103898e-01 -6.1360944335177758e-01 3.7373621646565264e-01
9 1.7179361391284365e-01 3.1570434896702448e-01 2.9166730933101400e-02
10 -5.3386439917058298e-02 1.1157104966535075e-01 -1.9069019877901452e-02
11 -8.6641568612711800e-02 1.5447778751280100e-01 -4.3302035139313286e-02
12 4.6277613291799979e-01 -4.2640261166652782e-01 5.4642934467963714e-02
13 -1.5768096764682599e-01 1.1678927823658505e-01 2.0726708870285715e-02
14 -1.7362409264878331e-01 1.3769878891777848e-01 5.6914483198930487e-03
15 -1.3804334923445472e-01 8.4232584350085404e-02 -2.2242211531143450e-02
16 -3.5505453326610353e-01 4.4007379281236625e-01 5.1108930583976653e-01
17 1.4393262437860022e-01 -4.0681625150263201e-01 -7.6550494257092949e-01
18 7.7411316765167837e-01 1.6013845677120058e+00 -1.3406161587038208e+00
19 -2.7109483507409249e-01 -7.9220736692047411e-01 7.5606765561780620e-01
20 -4.2022391372143653e-01 -7.3398564926630305e-01 7.0862056036484811e-01
21 5.2066744810063004e-01 4.5601151336785911e-01 -1.1109669200204870e+00
22 -2.9131419157235938e-01 -8.0660148622295896e-02 5.6037505807873789e-01
23 -2.8851937111607945e-01 -2.5707916023569388e-01 5.3115241105748146e-01
24 7.7132406285747662e-02 1.6936935947231240e+00 -2.6197671637095088e-01
25 1.1739827770831035e-01 -6.8172720908194795e-01 2.1292828073010400e-01
26 -2.2399952098390596e-01 -9.4806186651755442e-01 6.5237310702708273e-02
27 -8.6781949013350235e-01 1.6453631212617197e+00 -8.8867885380482659e-01
28 5.7526918079966904e-01 -9.1110567278710985e-01 5.1441520143469011e-01
29 4.1483886618353916e-01 -8.0439411171811659e-01 4.1599644392583446e-01
...

View File

@ -0,0 +1,90 @@
---
lammps_version: 30 Jun 2020
date_generated: Sun Jul 12 19:14:29 202
epsilon: 7.5e-14
prerequisites: ! |
atom full
pair coul/long
kspace pppm
pre_commands: ! ""
post_commands: ! |
comm_style tiled
pair_modify compute no
kspace_style pppm 1.0e-6
kspace_modify gewald 0.3
balance 0.0 rcb
input_file: in.fourmol
pair_style: coul/long 8.0
pair_coeff: ! |
* *
extract: ! ""
natoms: 29
init_vdwl: 0
init_coul: 0
init_stress: ! |2-
0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
init_forces: ! |2
1 -5.2239274535568314e-01 8.2051545744881466e-02 2.1533594847972076e-01
2 2.1712968366442176e-01 -2.7928074334318026e-01 -1.3471540076656802e-01
3 -3.4442019165638028e-02 -9.3084265599194874e-03 1.9948062571124484e-02
4 1.6298334373562443e-01 2.8852998088186425e-02 -7.8001870103674154e-02
5 1.6024289196964533e-01 7.5428818157230709e-02 -3.7746220978715959e-02
6 5.6503043686117405e-01 4.1669523647698320e-01 -6.7638762712651512e-01
7 -3.4224573570118516e-01 -3.9969025602522534e-01 3.9331747529410527e-01
8 -1.4133104801408738e-01 -6.1685378954692482e-01 3.3931746208503027e-01
9 1.8219762821810317e-01 3.2009822401929577e-01 5.0881307357289934e-02
10 -5.1688860353236589e-02 1.1069131959908671e-01 -1.4422029744161480e-02
11 -8.4689878918105269e-02 1.5099315110947911e-01 -3.9231342126204188e-02
12 4.5754413540574290e-01 -4.2644798683690410e-01 3.4587713233253971e-02
13 -1.5596780753830558e-01 1.1607584778590280e-01 2.6865880696619902e-02
14 -1.7231427615749528e-01 1.3653099035839830e-01 1.0392517888507409e-02
15 -1.3787738509698347e-01 8.5569383216123673e-02 -1.4365596072224287e-02
16 -3.4322564010548312e-01 4.3371633953160166e-01 5.3259611401138551e-01
17 1.3414272886699793e-01 -4.1322529572771644e-01 -7.8812435933765979e-01
18 7.3073447759345089e-01 1.5456517688814524e+00 -1.3881786173290165e+00
19 -2.5943625025418654e-01 -7.7424664728587522e-01 7.7105598737678260e-01
20 -3.9409193260988501e-01 -7.0311103001458264e-01 7.3171724652214931e-01
21 5.1856078926614546e-01 5.4286369838352699e-01 -1.1629548434823531e+00
22 -2.9453203152655405e-01 -1.2298517567747463e-01 5.8298446261040782e-01
23 -2.8798525475710529e-01 -2.9277384277527774e-01 5.5631883166904628e-01
24 6.2753212217437501e-02 1.7443957830145815e+00 -2.7814103479849506e-01
25 1.2986161832727383e-01 -7.0443921770565177e-01 2.2578528867489417e-01
26 -2.2254044464386455e-01 -9.7470640011041609e-01 7.4360754308868779e-02
27 -8.5917998510192983e-01 1.6512375326941557e+00 -9.3680672362601536e-01
28 5.7118802253451917e-01 -9.1790362039827855e-01 5.4063664700585301e-01
29 4.1157232663919069e-01 -8.0588020505345637e-01 4.4297396570656278e-01
run_vdwl: 0
run_coul: 0
run_stress: ! |2-
0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
run_forces: ! |2
1 -5.0900177524332380e-01 8.3541312360770559e-02 2.3847626933913957e-01
2 2.0293760437213748e-01 -2.8570861986354196e-01 -1.4764907529922344e-01
3 -3.4158170718364404e-02 -9.1103261103567309e-03 2.1028676255224140e-02
4 1.6407381836362508e-01 2.7543938443804754e-02 -8.1661581263582694e-02
5 1.5799678256598612e-01 7.5131693181657300e-02 -4.3093731729494773e-02
6 5.5884729319027959e-01 4.1070142887655381e-01 -7.0845348552640675e-01
7 -3.4252252488516066e-01 -4.0296980848156633e-01 4.1421689597710110e-01
8 -1.2877561673829283e-01 -6.1352311819313976e-01 3.7338100957587267e-01
9 1.7184965305565320e-01 3.1557352532666366e-01 2.9184166085517198e-02
10 -5.3416314160604578e-02 1.1162618970928689e-01 -1.9056641624128120e-02
11 -8.6656970143397835e-02 1.5453217862588842e-01 -4.3260381230061914e-02
12 4.6288691229484780e-01 -4.2627837697985460e-01 5.4462739307820834e-02
13 -1.5766510975402218e-01 1.1677248221389504e-01 2.0765584899621815e-02
14 -1.7364037400975366e-01 1.3763067517397995e-01 5.7227400340144263e-03
15 -1.3812467553300772e-01 8.4189156813104071e-02 -2.2189341317061961e-02
16 -3.5518552554301230e-01 4.4020942653600509e-01 5.1103809754957041e-01
17 1.4412978936104159e-01 -4.0716931333084577e-01 -7.6542748703258823e-01
18 7.7452008907550252e-01 1.6011444029485182e+00 -1.3411322152836536e+00
19 -2.7106696803117064e-01 -7.9194312531475608e-01 7.5606651125297675e-01
20 -4.2067655880426469e-01 -7.3392054862184575e-01 7.0888202878660511e-01
21 5.2066009642687827e-01 4.5571443252966459e-01 -1.1108598505670140e+00
22 -2.9107834596471255e-01 -8.0497477430417727e-02 5.6024739795440914e-01
23 -2.8876713232015855e-01 -2.5703387138535833e-01 5.3126570223700098e-01
24 7.7134065884385553e-02 1.6939505170091371e+00 -2.6162012529552015e-01
25 1.1769649007263155e-01 -6.8190526638151339e-01 2.1287906681296132e-01
26 -2.2425351267435142e-01 -9.4806947839154443e-01 6.5072664668127181e-02
27 -8.6783221113018416e-01 1.6459489466934927e+00 -8.8842216340655056e-01
28 5.7554532106965894e-01 -9.1140444835486800e-01 5.1430957288504053e-01
29 4.1454386992115411e-01 -8.0467652760281361e-01 4.1582695595428282e-01
...