Merge pull request #1384 from lammps/rendezvous2

Second attempt at adding a rendezvous protocol for some-to-some communication.
2019-03-27 17:44:16 -04:00 · 2019-03-27 17:44:16 -04:00 · 28a6dcd1c3
parent 2ad9355140 8ba1d76055
commit 28a6dcd1c3
19 changed files with 2709 additions and 1222 deletions
--- a/src/RIGID/fix_rigid_small.cpp
+++ b/src/RIGID/fix_rigid_small.cpp
@ -28,12 +28,14 @@
 #include "modify.h"
 #include "group.h"
 #include "comm.h"
+#include "neighbor.h"
 #include "force.h"
 #include "input.h"
 #include "output.h"
 #include "variable.h"
 #include "random_mars.h"
 #include "math_const.h"
+#include "hashlittle.h"
 #include "memory.h"
 #include "error.h"

@ -43,6 +45,8 @@ using namespace LAMMPS_NS;
 using namespace FixConst;
 using namespace MathConst;

+#define RVOUS 1   // 0 for irregular, 1 for all2all
+
 #define MAXLINE 1024
 #define CHUNK 1024
 #define ATTRIBUTE_PERBODY 20
@ -70,8 +74,7 @@ FixRigidSmall::FixRigidSmall(LAMMPS *lmp, int narg, char **arg) :
  xcmimage(NULL), displace(NULL), eflags(NULL), orient(NULL), dorient(NULL),
  avec_ellipsoid(NULL), avec_line(NULL), avec_tri(NULL), counts(NULL),
  itensor(NULL), mass_body(NULL), langextra(NULL), random(NULL),
-  id_dilate(NULL), onemols(NULL), hash(NULL), bbox(NULL), ctr(NULL),
-  idclose(NULL), rsqclose(NULL)
+  id_dilate(NULL), onemols(NULL)
 {
  int i;

@ -107,18 +110,18 @@ FixRigidSmall::FixRigidSmall(LAMMPS *lmp, int narg, char **arg) :
  // parse args for rigid body specification

  int *mask = atom->mask;
-  tagint *bodyid = NULL;
+  tagint *bodyID = NULL;
  int nlocal = atom->nlocal;

  if (narg < 4) error->all(FLERR,"Illegal fix rigid/small command");
  if (strcmp(arg[3],"molecule") == 0) {
    if (atom->molecule_flag == 0)
      error->all(FLERR,"Fix rigid/small requires atom attribute molecule");
-    bodyid = atom->molecule;
+    bodyID = atom->molecule;

  } else if (strcmp(arg[3],"custom") == 0) {
    if (narg < 5) error->all(FLERR,"Illegal fix rigid/small command");
-      bodyid = new tagint[nlocal];
+      bodyID = new tagint[nlocal];
      customflag = 1;

      // determine whether atom-style variable or atom property is used.
@ -126,9 +129,11 @@ FixRigidSmall::FixRigidSmall(LAMMPS *lmp, int narg, char **arg) :
        int is_double=0;
        int custom_index = atom->find_custom(arg[4]+2,is_double);
        if (custom_index == -1)
-          error->all(FLERR,"Fix rigid/small custom requires previously defined property/atom");
+          error->all(FLERR,"Fix rigid/small custom requires "
+                     "previously defined property/atom");
        else if (is_double)
-          error->all(FLERR,"Fix rigid/small custom requires integer-valued property/atom");
+          error->all(FLERR,"Fix rigid/small custom requires "
+                     "integer-valued property/atom");

        int minval = INT_MAX;
        int *value = atom->ivector[custom_index];
@ -139,15 +144,17 @@ FixRigidSmall::FixRigidSmall(LAMMPS *lmp, int narg, char **arg) :

        for (i = 0; i < nlocal; i++)
          if (mask[i] & groupbit)
-            bodyid[i] = (tagint)(value[i] - minval + 1);
-          else bodyid[i] = 0;
+            bodyID[i] = (tagint)(value[i] - minval + 1);
+          else bodyID[i] = 0;

      } else if (strstr(arg[4],"v_") == arg[4]) {
        int ivariable = input->variable->find(arg[4]+2);
        if (ivariable < 0)
-          error->all(FLERR,"Variable name for fix rigid/small custom does not exist");
+          error->all(FLERR,"Variable name for fix rigid/small custom "
+                     "does not exist");
        if (input->variable->atomstyle(ivariable) == 0)
-          error->all(FLERR,"Fix rigid/small custom variable is no atom-style variable");
+          error->all(FLERR,"Fix rigid/small custom variable is not "
+                     "atom-style variable");
        double *value = new double[nlocal];
        input->variable->compute_atom(ivariable,0,value,1,0);
        int minval = INT_MAX;
@ -158,8 +165,8 @@ FixRigidSmall::FixRigidSmall(LAMMPS *lmp, int narg, char **arg) :

        for (i = 0; i < nlocal; i++)
          if (mask[i] & groupbit)
-            bodyid[i] = (tagint)((tagint)value[i] - minval + 1);
-          else bodyid[0] = 0;
+            bodyID[i] = (tagint)((tagint)value[i] - minval + 1);
+          else bodyID[i] = 0;   // atom not in fix group => body ID 0
        delete[] value;
      } else error->all(FLERR,"Unsupported fix rigid custom property");
  } else error->all(FLERR,"Illegal fix rigid/small command");
@ -167,10 +174,11 @@ FixRigidSmall::FixRigidSmall(LAMMPS *lmp, int narg, char **arg) :
  if (atom->map_style == 0)
    error->all(FLERR,"Fix rigid/small requires an atom map, see atom_modify");

-  // maxmol = largest bodyid #
+  // maxmol = largest bodyID #
+
  maxmol = -1;
  for (i = 0; i < nlocal; i++)
-    if (mask[i] & groupbit) maxmol = MAX(maxmol,bodyid[i]);
+    if (mask[i] & groupbit) maxmol = MAX(maxmol,bodyID[i]);

  tagint itmp;
  MPI_Allreduce(&maxmol,&itmp,1,MPI_LMP_TAGINT,MPI_MAX,world);
@ -400,8 +408,19 @@ FixRigidSmall::FixRigidSmall(LAMMPS *lmp, int narg, char **arg) :
  // sets bodytag for owned atoms
  // body attributes are computed later by setup_bodies()

-  create_bodies(bodyid);
-  if (customflag) delete [] bodyid;
+  double time1 = MPI_Wtime();
+
+  create_bodies(bodyID);
+  if (customflag) delete [] bodyID;
+
+  double time2 = MPI_Wtime();
+
+  if (comm->me == 0) {
+    if (screen)
+      fprintf(screen,"  create bodies CPU = %g secs\n",time2-time1);
+    if (logfile)
+      fprintf(logfile,"  create bodies CPU = %g secs\n",time2-time1);
+  }

  // set nlocal_body and allocate bodies I own

@ -569,7 +588,8 @@ void FixRigidSmall::init()
      if (rflag && (modify->fmask[i] & POST_FORCE) &&
          !modify->fix[i]->rigid_flag) {
        char str[128];
-        snprintf(str,128,"Fix %s alters forces after fix rigid",modify->fix[i]->id);
+        snprintf(str,128,"Fix %s alters forces after fix rigid",
+                 modify->fix[i]->id);
        error->warning(FLERR,str);
      }
    }
@ -633,6 +653,16 @@ void FixRigidSmall::setup(int vflag)
 {
  int i,n,ibody;

+  // error if maxextent > comm->cutghost
+  // NOTE: could just warn if an override flag set
+  // NOTE: this could fail for comm multi mode if user sets a wrong cutoff
+  //       for atom types in rigid bodies - need a more careful test
+  // must check here, not in init, b/c neigh/comm values set after fix init
+
+  double cutghost = MAX(neighbor->cutneighmax,comm->cutghostuser);
+  if (maxextent > cutghost)
+    error->all(FLERR,"Rigid body extent > ghost cutoff - use comm_modify cutoff");
+
  //check(1);

  // sum fcm, torque across all rigid bodies
@ -1514,175 +1544,72 @@ void FixRigidSmall::set_v()
   set bodytag for all owned atoms
 ------------------------------------------------------------------------- */

-void FixRigidSmall::create_bodies(tagint *bodyid)
+void FixRigidSmall::create_bodies(tagint *bodyID)
 {
-  int i,m,n;
-  double unwrap[3];
+  int i,m;

-  // error check on image flags of atoms in rigid bodies
+  // allocate buffer for input to rendezvous comm
+  // ncount = # of my atoms in bodies

-  imageint *image = atom->image;
  int *mask = atom->mask;
  int nlocal = atom->nlocal;

-  int *periodicity = domain->periodicity;
-  int xbox,ybox,zbox;
-
-  int flag = 0;
-  for (i = 0; i < nlocal; i++) {
-    if (!(mask[i] & groupbit)) continue;
-    xbox = (image[i] & IMGMASK) - IMGMAX;
-    ybox = (image[i] >> IMGBITS & IMGMASK) - IMGMAX;
-    zbox = (image[i] >> IMG2BITS) - IMGMAX;
-    if ((xbox && !periodicity[0]) || (ybox && !periodicity[1]) ||
-        (zbox && !periodicity[2])) flag = 1;
-  }
-
-  int flagall;
-  MPI_Allreduce(&flag,&flagall,1,MPI_INT,MPI_SUM,world);
-  if (flagall) error->all(FLERR,"Fix rigid/small atom has non-zero image flag "
-                          "in a non-periodic dimension");
-
-  // allocate buffer for passing messages around ring of procs
-  // percount = max number of values to put in buffer for each of ncount
-
  int ncount = 0;
  for (i = 0; i < nlocal; i++)
    if (mask[i] & groupbit) ncount++;

-  int percount = 5;
-  double *buf;
-  memory->create(buf,ncount*percount,"rigid/small:buf");
+  int *proclist;
+  memory->create(proclist,ncount,"rigid/small:proclist");
+  InRvous *inbuf = (InRvous *)
+    memory->smalloc(ncount*sizeof(InRvous),"rigid/small:inbuf");

-  // create map hash for storing unique body IDs of my atoms
-  // key = body ID
-  // value = index into per-body data structure
-  // n = # of entries in hash
-
-  hash = new std::map<tagint,int>();
-  hash->clear();
-
-  // setup hash
-  // key = body ID
-  // value = index into N-length data structure
-  // n = count of unique bodies my atoms are part of
-
-  n = 0;
-  for (i = 0; i < nlocal; i++) {
-    if (!(mask[i] & groupbit)) continue;
-    if (hash->find(bodyid[i]) == hash->end()) (*hash)[bodyid[i]] = n++;
-  }
-
-  // bbox = bounding box of each rigid body my atoms are part of
-
-  memory->create(bbox,n,6,"rigid/small:bbox");
-
-  for (i = 0; i < n; i++) {
-    bbox[i][0] = bbox[i][2] = bbox[i][4] = BIG;
-    bbox[i][1] = bbox[i][3] = bbox[i][5] = -BIG;
-  }
-
-  // pack my atoms into buffer as body ID, unwrapped coords
+  // setup inbuf and proclist to pass to rendezvous comm
+  // one InRvous datum for each constituent atom
+  // datum = me, local index of atom, atomID, bodyID, unwrapped coords
+  // owning proc for each datum = hashlittle() of bodyID, modulo nprocs

  double **x = atom->x;
-
-  m = 0;
-  for (i = 0; i < nlocal; i++) {
-    if (!(mask[i] & groupbit)) continue;
-    domain->unmap(x[i],image[i],unwrap);
-    buf[m++] = bodyid[i];
-    buf[m++] = unwrap[0];
-    buf[m++] = unwrap[1];
-    buf[m++] = unwrap[2];
-  }
-
-  // pass buffer around ring of procs
-  // func = update bbox with atom coords from every proc
-  // when done, have full bbox for every rigid body my atoms are part of
-
-  comm->ring(m,sizeof(double),buf,1,ring_bbox,NULL,(void *)this);
-
-  // check if any bbox is size 0.0, meaning rigid body is a single particle
-
-  flag = 0;
-  for (i = 0; i < n; i++)
-    if (bbox[i][0] == bbox[i][1] && bbox[i][2] == bbox[i][3] &&
-        bbox[i][4] == bbox[i][5]) flag = 1;
-  MPI_Allreduce(&flag,&flagall,1,MPI_INT,MPI_SUM,world);
-  if (flagall)
-    error->all(FLERR,"One or more rigid bodies are a single particle");
-
-  // ctr = center pt of each rigid body my atoms are part of
-
-  memory->create(ctr,n,6,"rigid/small:bbox");
-
-  for (i = 0; i < n; i++) {
-    ctr[i][0] = 0.5 * (bbox[i][0] + bbox[i][1]);
-    ctr[i][1] = 0.5 * (bbox[i][2] + bbox[i][3]);
-    ctr[i][2] = 0.5 * (bbox[i][4] + bbox[i][5]);
-  }
-
-  // idclose = ID of atom in body closest to center pt (smaller ID if tied)
-  // rsqclose = distance squared from idclose to center pt
-
-  memory->create(idclose,n,"rigid/small:idclose");
-  memory->create(rsqclose,n,"rigid/small:rsqclose");
-
-  for (i = 0; i < n; i++) rsqclose[i] = BIG;
-
-  // pack my atoms into buffer as body ID, atom ID, unwrapped coords
-
  tagint *tag = atom->tag;
+  imageint *image = atom->image;

  m = 0;
  for (i = 0; i < nlocal; i++) {
    if (!(mask[i] & groupbit)) continue;
-    domain->unmap(x[i],image[i],unwrap);
-    buf[m++] = bodyid[i];
-    buf[m++] = ubuf(tag[i]).d;
-    buf[m++] = unwrap[0];
-    buf[m++] = unwrap[1];
-    buf[m++] = unwrap[2];
+    proclist[m] = hashlittle(&bodyID[i],sizeof(tagint),0) % nprocs;
+    inbuf[m].me = me;
+    inbuf[m].ilocal = i;
+    inbuf[m].atomID = tag[i];
+    inbuf[m].bodyID = bodyID[i];
+    domain->unmap(x[i],image[i],inbuf[m].x);
+    m++;
  }

-  // pass buffer around ring of procs
-  // func = update idclose,rsqclose with atom IDs from every proc
-  // when done, have idclose for every rigid body my atoms are part of
+  // perform rendezvous operation
+  // each proc owns random subset of bodies
+  // receives all atoms in those bodies
+  // func = compute bbox of each body, find atom closest to geometric center

-  comm->ring(m,sizeof(double),buf,2,ring_nearest,NULL,(void *)this);
+  char *buf;
+  int nreturn = comm->rendezvous(RVOUS,ncount,(char *) inbuf,sizeof(InRvous),
+                                 0,proclist,
+                                 rendezvous_body,0,buf,sizeof(OutRvous),
+                                 (void *) this);
+  OutRvous *outbuf = (OutRvous *) buf;

-  // set bodytag of all owned atoms, based on idclose
-  // find max value of rsqclose across all procs
+  memory->destroy(proclist);
+  memory->sfree(inbuf);

-  double rsqmax = 0.0;
-  for (i = 0; i < nlocal; i++) {
-    bodytag[i] = 0;
-    if (!(mask[i] & groupbit)) continue;
-    m = hash->find(bodyid[i])->second;
-    bodytag[i] = idclose[m];
-    rsqmax = MAX(rsqmax,rsqclose[m]);
-  }
+  // set bodytag of all owned atoms based on outbuf info for constituent atoms

-  // pack my atoms into buffer as bodytag of owning atom, unwrapped coords
+  for (i = 0; i < nlocal; i++)
+    if (!(mask[i] & groupbit)) bodytag[i] = 0;

-  m = 0;
-  for (i = 0; i < nlocal; i++) {
-    if (!(mask[i] & groupbit)) continue;
-    domain->unmap(x[i],image[i],unwrap);
-    buf[m++] = ubuf(bodytag[i]).d;
-    buf[m++] = unwrap[0];
-    buf[m++] = unwrap[1];
-    buf[m++] = unwrap[2];
-  }
+  for (m = 0; m < nreturn; m++)
+    bodytag[outbuf[m].ilocal] = outbuf[m].atomID;

-  // pass buffer around ring of procs
-  // func = update rsqfar for atoms belonging to bodies I own
-  // when done, have rsqfar for all atoms in bodies I own
+  memory->sfree(outbuf);

-  rsqfar = 0.0;
-  comm->ring(m,sizeof(double),buf,3,ring_farthest,NULL,(void *)this);
-
-  // find maxextent of rsqfar across all procs
+  // maxextent = max of rsqfar across all procs
  // if defined, include molecule->maxextent

  MPI_Allreduce(&rsqfar,&maxextent,1,MPI_DOUBLE,MPI_MAX,world);
@ -1691,125 +1618,156 @@ void FixRigidSmall::create_bodies(tagint *bodyid)
    for (int i = 0; i < nmol; i++)
      maxextent = MAX(maxextent,onemols[i]->maxextent);
  }
+}
+
+/* ----------------------------------------------------------------------
+   rendezvous callback: process atoms of the rigid bodies assigned to me
+   inbuf = list of N InRvous datums; sets outbuf to N OutRvous datums
+------------------------------------------------------------------------- */
+
+int FixRigidSmall::rendezvous_body(int n, char *inbuf,
+                                   int &rflag, int *&proclist, char *&outbuf,
+                                   void *ptr)
+{
+  int i,m;
+  double delx,dely,delz,rsq;
+  int *iclose;
+  tagint *idclose;
+  double *x,*xown,*rsqclose;
+  double **bbox,**ctr;
+
+  FixRigidSmall *frsptr = (FixRigidSmall *) ptr;
+  Memory *memory = frsptr->memory;
+  Error *error = frsptr->error;
+  MPI_Comm world = frsptr->world;
+
+  // setup hash
+  // use STL map instead of atom->map
+  //   b/c know nothing about body ID values specified by user
+  // ncount = number of bodies assigned to me
+  // key = body ID
+  // value = index into Ncount-length data structure
+
+  InRvous *in = (InRvous *) inbuf;
+  std::map<tagint,int> hash;
+  tagint id;
+
+  int ncount = 0;
+  for (i = 0; i < n; i++) {
+    id = in[i].bodyID;
+    if (hash.find(id) == hash.end()) hash[id] = ncount++;
+  }
+
+  // bbox = bounding box of each rigid body
+
+  memory->create(bbox,ncount,6,"rigid/small:bbox");
+
+  for (m = 0; m < ncount; m++) {
+    bbox[m][0] = bbox[m][2] = bbox[m][4] = BIG;
+    bbox[m][1] = bbox[m][3] = bbox[m][5] = -BIG;
+  }
+
+  for (i = 0; i < n; i++) {
+    m = hash.find(in[i].bodyID)->second;
+    x = in[i].x;
+    bbox[m][0] = MIN(bbox[m][0],x[0]);
+    bbox[m][1] = MAX(bbox[m][1],x[0]);
+    bbox[m][2] = MIN(bbox[m][2],x[1]);
+    bbox[m][3] = MAX(bbox[m][3],x[1]);
+    bbox[m][4] = MIN(bbox[m][4],x[2]);
+    bbox[m][5] = MAX(bbox[m][5],x[2]);
+  }
+
+  // check if any bbox is size 0.0, meaning rigid body is a single particle
+
+  int flag = 0;
+  for (m = 0; m < ncount; m++)
+    if (bbox[m][0] == bbox[m][1] && bbox[m][2] == bbox[m][3] &&
+        bbox[m][4] == bbox[m][5]) flag = 1;
+  int flagall;
+  MPI_Allreduce(&flag,&flagall,1,MPI_INT,MPI_SUM,world);    // sync here?
+  if (flagall)
+    error->all(FLERR,"One or more rigid bodies are a single particle");
+
+  // ctr = geometric center pt of each rigid body (midpoint of its bbox)
+
+  memory->create(ctr,ncount,3,"rigid/small:ctr");
+
+  for (m = 0; m < ncount; m++) {
+    ctr[m][0] = 0.5 * (bbox[m][0] + bbox[m][1]);
+    ctr[m][1] = 0.5 * (bbox[m][2] + bbox[m][3]);
+    ctr[m][2] = 0.5 * (bbox[m][4] + bbox[m][5]);
+  }
+
+  // idclose = atomID closest to center point of each body
+
+  memory->create(idclose,ncount,"rigid/small:idclose");
+  memory->create(iclose,ncount,"rigid/small:iclose");
+  memory->create(rsqclose,ncount,"rigid/small:rsqclose");
+  for (m = 0; m < ncount; m++) rsqclose[m] = BIG;
+
+  for (i = 0; i < n; i++) {
+    m = hash.find(in[i].bodyID)->second;
+    x = in[i].x;
+    delx = x[0] - ctr[m][0];
+    dely = x[1] - ctr[m][1];
+    delz = x[2] - ctr[m][2];
+    rsq = delx*delx + dely*dely + delz*delz;
+    if (rsq <= rsqclose[m]) {
+      if (rsq == rsqclose[m] && in[i].atomID > idclose[m]) continue;
+      iclose[m] = i;
+      idclose[m] = in[i].atomID;
+      rsqclose[m] = rsq;
+    }
+  }
+
+  // compute rsqfar for all bodies I own
+  // set rsqfar back in caller
+
+  double rsqfar = 0.0;
+
+  for (int i = 0; i < n; i++) {
+    m = hash.find(in[i].bodyID)->second;
+    xown = in[iclose[m]].x;
+    x = in[i].x;
+    delx = x[0] - xown[0];
+    dely = x[1] - xown[1];
+    delz = x[2] - xown[2];
+    rsq = delx*delx + dely*dely + delz*delz;
+    rsqfar = MAX(rsqfar,rsq);
+  }
+
+  frsptr->rsqfar = rsqfar;
+
+  // pass list of OutRvous datums back to comm->rendezvous
+
+  int nout = n;
+  memory->create(proclist,nout,"rigid/small:proclist");
+  OutRvous *out = (OutRvous *)
+    memory->smalloc(nout*sizeof(OutRvous),"rigid/small:out");
+
+  for (int i = 0; i < nout; i++) {
+    proclist[i] = in[i].me;
+    out[i].ilocal = in[i].ilocal;
+    m = hash.find(in[i].bodyID)->second;
+    out[i].atomID = idclose[m];
+  }
+
+  outbuf = (char *) out;

  // clean up
+  // Comm::rendezvous will delete proclist and out (outbuf)

-  delete hash;
-  memory->destroy(buf);
  memory->destroy(bbox);
  memory->destroy(ctr);
  memory->destroy(idclose);
+  memory->destroy(iclose);
  memory->destroy(rsqclose);
-}

-/* ----------------------------------------------------------------------
-   process rigid body atoms from another proc
-   update bounding box for rigid bodies my atoms are part of
------------------------------------------------------------------------- */
+  // flag = 2: new outbuf

-void FixRigidSmall::ring_bbox(int n, char *cbuf, void *ptr)
-{
-  FixRigidSmall *frsptr = (FixRigidSmall *) ptr;
-  std::map<tagint,int> *hash = frsptr->hash;
-  double **bbox = frsptr->bbox;
-
-  double *buf = (double *) cbuf;
-  int ndatums = n/4;
-
-  int j,imol;
-  double *x;
-
-  int m = 0;
-  for (int i = 0; i < ndatums; i++, m += 4) {
-    imol = static_cast<int> (buf[m]);
-    if (hash->find(imol) != hash->end()) {
-      j = hash->find(imol)->second;
-      x = &buf[m+1];
-      bbox[j][0] = MIN(bbox[j][0],x[0]);
-      bbox[j][1] = MAX(bbox[j][1],x[0]);
-      bbox[j][2] = MIN(bbox[j][2],x[1]);
-      bbox[j][3] = MAX(bbox[j][3],x[1]);
-      bbox[j][4] = MIN(bbox[j][4],x[2]);
-      bbox[j][5] = MAX(bbox[j][5],x[2]);
-    }
-  }
-}
-
-/* ----------------------------------------------------------------------
-   process rigid body atoms from another proc
-   update nearest atom to body center for rigid bodies my atoms are part of
------------------------------------------------------------------------- */
-
-void FixRigidSmall::ring_nearest(int n, char *cbuf, void *ptr)
-{
-  FixRigidSmall *frsptr = (FixRigidSmall *) ptr;
-  std::map<tagint,int> *hash = frsptr->hash;
-  double **ctr = frsptr->ctr;
-  tagint *idclose = frsptr->idclose;
-  double *rsqclose = frsptr->rsqclose;
-
-  double *buf = (double *) cbuf;
-  int ndatums = n/5;
-
-  int j,imol;
-  tagint tag;
-  double delx,dely,delz,rsq;
-  double *x;
-
-  int m = 0;
-  for (int i = 0; i < ndatums; i++, m += 5) {
-    imol = static_cast<int> (buf[m]);
-    if (hash->find(imol) != hash->end()) {
-      j = hash->find(imol)->second;
-      tag = (tagint) ubuf(buf[m+1]).i;
-      x = &buf[m+2];
-      delx = x[0] - ctr[j][0];
-      dely = x[1] - ctr[j][1];
-      delz = x[2] - ctr[j][2];
-      rsq = delx*delx + dely*dely + delz*delz;
-      if (rsq <= rsqclose[j]) {
-        if (rsq == rsqclose[j] && tag > idclose[j]) continue;
-        idclose[j] = tag;
-        rsqclose[j] = rsq;
-      }
-    }
-  }
-}
-
-/* ----------------------------------------------------------------------
-   process rigid body atoms from another proc
-   update rsqfar = distance from owning atom to other atom
------------------------------------------------------------------------- */
-
-void FixRigidSmall::ring_farthest(int n, char *cbuf, void *ptr)
-{
-  FixRigidSmall *frsptr = (FixRigidSmall *) ptr;
-  double **x = frsptr->atom->x;
-  imageint *image = frsptr->atom->image;
-  int nlocal = frsptr->atom->nlocal;
-
-  double *buf = (double *) cbuf;
-  int ndatums = n/4;
-
-  int iowner;
-  tagint tag;
-  double delx,dely,delz,rsq;
-  double *xx;
-  double unwrap[3];
-
-  int m = 0;
-  for (int i = 0; i < ndatums; i++, m += 4) {
-    tag = (tagint) ubuf(buf[m]).i;
-    iowner = frsptr->atom->map(tag);
-    if (iowner < 0 || iowner >= nlocal) continue;
-    frsptr->domain->unmap(x[iowner],image[iowner],unwrap);
-    xx = &buf[m+1];
-    delx = xx[0] - unwrap[0];
-    dely = xx[1] - unwrap[1];
-    delz = xx[2] - unwrap[2];
-    rsq = delx*delx + dely*dely + delz*delz;
-    frsptr->rsqfar = MAX(frsptr->rsqfar,rsq);
-  }
+  rflag = 2;
+  return nout;
 }

 /* ----------------------------------------------------------------------
@ -2472,9 +2430,9 @@ void FixRigidSmall::readfile(int which, double **array, int *inbody)

  int nlocal = atom->nlocal;

-  hash = new std::map<tagint,int>();
+  std::map<tagint,int> hash;
  for (i = 0; i < nlocal; i++)
-    if (bodyown[i] >= 0) (*hash)[atom->molecule[i]] = bodyown[i];
+    if (bodyown[i] >= 0) hash[atom->molecule[i]] = bodyown[i];

  // open file and read header

@ -2533,11 +2491,11 @@ void FixRigidSmall::readfile(int which, double **array, int *inbody)
      id = ATOTAGINT(values[0]);
      if (id <= 0 || id > maxmol)
        error->all(FLERR,"Invalid rigid body ID in fix rigid/small file");
-      if (hash->find(id) == hash->end()) {
+      if (hash.find(id) == hash.end()) {
        buf = next + 1;
        continue;
      }
-      m = (*hash)[id];
+      m = hash[id];
      inbody[m] = 1;

      if (which == 0) {
@ -2576,7 +2534,6 @@ void FixRigidSmall::readfile(int which, double **array, int *inbody)

  delete [] buffer;
  delete [] values;
-  delete hash;
 }

 /* ----------------------------------------------------------------------
--- a/src/RIGID/fix_rigid_small.h
+++ b/src/RIGID/fix_rigid_small.h
@ -22,9 +22,6 @@ FixStyle(rigid/small,FixRigidSmall)

 #include "fix.h"

-// replace this later
-#include <map>
-
 namespace LAMMPS_NS {

 class FixRigidSmall : public Fix {
@ -180,13 +177,21 @@ class FixRigidSmall : public Fix {

  // class data used by ring communication callbacks

-  std::map<tagint,int> *hash;
-  double **bbox;
-  double **ctr;
-  tagint *idclose;
-  double *rsqclose;
  double rsqfar;

+  struct InRvous {              // per-atom datum sent to rendezvous proc
+    int me,ilocal;              // sending proc, local atom index on it
+    tagint atomID,bodyID;       // atom tag, ID of body the atom is in
+    double x[3];                // unwrapped atom coords
+  };
+
+  struct OutRvous {             // per-atom datum returned to sending proc
+    int ilocal;                 // local atom index on the sending proc
+    tagint atomID;              // atom closest to body center = bodytag
+  };
+
+  // local methods
+
  void image_shift();
  void set_xv();
  void set_v();
@ -199,11 +204,9 @@ class FixRigidSmall : public Fix {
  void grow_body();
  void reset_atom2body();

-  // callback functions for ring communication
+  // callback function for rendezvous communication

-  static void ring_bbox(int, char *, void *);
-  static void ring_nearest(int, char *, void *);
-  static void ring_farthest(int, char *, void *);
+  static int rendezvous_body(int, char *, int &, int *&, char *&, void *);

  // debug

--- a/src/RIGID/fix_shake.cpp
+++ b/src/RIGID/fix_shake.cpp
@ -39,6 +39,8 @@ using namespace LAMMPS_NS;
 using namespace FixConst;
 using namespace MathConst;

+#define RVOUS 1   // 0 for irregular, 1 for all2all
+
 #define BIG 1.0e20
 #define MASSDELTA 0.1

@ -219,8 +221,19 @@ FixShake::FixShake(LAMMPS *lmp, int narg, char **arg) :

  // identify all SHAKE clusters

+  double time1 = MPI_Wtime();
+
  find_clusters();

+  double time2 = MPI_Wtime();
+
+  if (comm->me == 0) {
+    if (screen)
+      fprintf(screen,"  find clusters CPU = %g secs\n",time2-time1);
+    if (logfile)
+      fprintf(logfile,"  find clusters CPU = %g secs\n",time2-time1);
+  }
+
  // initialize list of SHAKE clusters to constrain

  maxlist = 0;
@ -681,10 +694,9 @@ int FixShake::dof(int igroup)
 void FixShake::find_clusters()
 {
  int i,j,m,n,imol,iatom;
-  int flag,flag_all,nbuf,size;
+  int flag,flag_all;
  tagint tagprev;
  double massone;
-  tagint *buf;

  if (me == 0 && screen) {
    if (!rattle) fprintf(screen,"Finding SHAKE clusters ...\n");
@ -707,13 +719,6 @@ void FixShake::find_clusters()
  int nlocal = atom->nlocal;
  int angles_allow = atom->avec->angles_allow;

-  // setup ring of procs
-
-  int next = me + 1;
-  int prev = me -1;
-  if (next == nprocs) next = 0;
-  if (prev < 0) prev = nprocs - 1;
-
  // -----------------------------------------------------
  // allocate arrays for self (1d) and bond partners (2d)
  // max = max # of bond partners for owned atoms = 2nd dim of partner arrays
@ -755,6 +760,10 @@ void FixShake::find_clusters()
  memory->create(partner_shake,nlocal,max,"shake:partner_shake");
  memory->create(partner_nshake,nlocal,max,"shake:partner_nshake");

+  // setup atomIDs and procowner vectors in rendezvous decomposition
+
+  atom_owners();
+
  // -----------------------------------------------------
  // set npartner and partner_tag from special arrays
  // -----------------------------------------------------
@ -778,86 +787,13 @@ void FixShake::find_clusters()
  }

  // -----------------------------------------------------
-  // set partner_mask, partner_type, partner_massflag, partner_bondtype
-  //   for bonded partners
-  // requires communication for off-proc partners
+  // set partner_mask, partner_type, partner_massflag,
+  //   partner_bondtype for all my bonded partners
+  // requires rendezvous communication for off-proc partners
  // -----------------------------------------------------

-  // fill in mask, type, massflag, bondtype if own bond partner
-  // info to store in buf for each off-proc bond = nper = 6
-  //   2 atoms IDs in bond, space for mask, type, massflag, bondtype
-  // nbufmax = largest buffer needed to hold info from any proc
-
-  int nper = 6;
-
-  nbuf = 0;
-  for (i = 0; i < nlocal; i++) {
-    for (j = 0; j < npartner[i]; j++) {
-      partner_mask[i][j] = 0;
-      partner_type[i][j] = 0;
-      partner_massflag[i][j] = 0;
-      partner_bondtype[i][j] = 0;
-
-      m = atom->map(partner_tag[i][j]);
-      if (m >= 0 && m < nlocal) {
-        partner_mask[i][j] = mask[m];
-        partner_type[i][j] = type[m];
-        if (nmass) {
-          if (rmass) massone = rmass[m];
-          else massone = mass[type[m]];
-          partner_massflag[i][j] = masscheck(massone);
-        }
-        n = bondtype_findset(i,tag[i],partner_tag[i][j],0);
-        if (n) partner_bondtype[i][j] = n;
-        else {
-          n = bondtype_findset(m,tag[i],partner_tag[i][j],0);
-          if (n) partner_bondtype[i][j] = n;
-        }
-      } else nbuf += nper;
-    }
-  }
-
-  memory->create(buf,nbuf,"shake:buf");
-
-  // fill buffer with info
-
-  size = 0;
-  for (i = 0; i < nlocal; i++) {
-    for (j = 0; j < npartner[i]; j++) {
-      m = atom->map(partner_tag[i][j]);
-      if (m < 0 || m >= nlocal) {
-        buf[size] = tag[i];
-        buf[size+1] = partner_tag[i][j];
-        buf[size+2] = 0;
-        buf[size+3] = 0;
-        buf[size+4] = 0;
-        n = bondtype_findset(i,tag[i],partner_tag[i][j],0);
-        if (n) buf[size+5] = n;
-        else buf[size+5] = 0;
-        size += nper;
-      }
-    }
-  }
-
-  // cycle buffer around ring of procs back to self
-
-  comm->ring(size,sizeof(tagint),buf,1,ring_bonds,buf,(void *)this);
-
-  // store partner info returned to me
-
-  m = 0;
-  while (m < size) {
-    i = atom->map(buf[m]);
-    for (j = 0; j < npartner[i]; j++)
-      if (buf[m+1] == partner_tag[i][j]) break;
-    partner_mask[i][j] = buf[m+2];
-    partner_type[i][j] = buf[m+3];
-    partner_massflag[i][j] = buf[m+4];
-    partner_bondtype[i][j] = buf[m+5];
-    m += nper;
-  }
-
-  memory->destroy(buf);
+  partner_info(npartner,partner_tag,partner_mask,partner_type,
+               partner_massflag,partner_bondtype);

  // error check for unfilled partner info
  // if partner_type not set, is an error
@ -868,12 +804,13 @@ void FixShake::find_clusters()
  // else it's an error

  flag = 0;
+  int flag2 = 0;
  for (i = 0; i < nlocal; i++)
    for (j = 0; j < npartner[i]; j++) {
-      if (partner_type[i][j] == 0) flag = 1;
+      if (partner_type[i][j] == 0) flag++;
      if (!(mask[i] & groupbit)) continue;
      if (!(partner_mask[i][j] & groupbit)) continue;
-      if (partner_bondtype[i][j] == 0) flag = 1;
+      if (partner_bondtype[i][j] == 0) flag2++;
    }

  MPI_Allreduce(&flag,&flag_all,1,MPI_INT,MPI_SUM,world);
@ -931,55 +868,10 @@ void FixShake::find_clusters()

  // -----------------------------------------------------
  // set partner_nshake for bonded partners
-  // requires communication for off-proc partners
+  // requires rendezvous communication for off-proc partners
  // -----------------------------------------------------

-  // fill in partner_nshake if own bond partner
-  // info to store in buf for each off-proc bond =
-  //   2 atoms IDs in bond, space for nshake value
-  // nbufmax = largest buffer needed to hold info from any proc
-
-  nbuf = 0;
-  for (i = 0; i < nlocal; i++) {
-    for (j = 0; j < npartner[i]; j++) {
-      m = atom->map(partner_tag[i][j]);
-      if (m >= 0 && m < nlocal) partner_nshake[i][j] = nshake[m];
-      else nbuf += 3;
-    }
-  }
-
-  memory->create(buf,nbuf,"shake:buf");
-
-  // fill buffer with info
-
-  size = 0;
-  for (i = 0; i < nlocal; i++) {
-    for (j = 0; j < npartner[i]; j++) {
-      m = atom->map(partner_tag[i][j]);
-      if (m < 0 || m >= nlocal) {
-        buf[size] = tag[i];
-        buf[size+1] = partner_tag[i][j];
-        size += 3;
-      }
-    }
-  }
-
-  // cycle buffer around ring of procs back to self
-
-  comm->ring(size,sizeof(tagint),buf,2,ring_nshake,buf,(void *)this);
-
-  // store partner info returned to me
-
-  m = 0;
-  while (m < size) {
-    i = atom->map(buf[m]);
-    for (j = 0; j < npartner[i]; j++)
-      if (buf[m+1] == partner_tag[i][j]) break;
-    partner_nshake[i][j] = buf[m+2];
-    m += 3;
-  }
-
-  memory->destroy(buf);
+  nshake_info(npartner,partner_tag,partner_nshake);

  // -----------------------------------------------------
  // error checks
@ -988,7 +880,7 @@ void FixShake::find_clusters()
  // -----------------------------------------------------

  flag = 0;
-  for (i = 0; i < nlocal; i++) if (nshake[i] > 3) flag = 1;
+  for (i = 0; i < nlocal; i++) if (nshake[i] > 3) flag++;
  MPI_Allreduce(&flag,&flag_all,1,MPI_INT,MPI_SUM,world);
  if (flag_all) error->all(FLERR,"Shake cluster of more than 4 atoms");

@ -996,7 +888,7 @@ void FixShake::find_clusters()
  for (i = 0; i < nlocal; i++) {
    if (nshake[i] <= 1) continue;
    for (j = 0; j < npartner[i]; j++)
-      if (partner_shake[i][j] && partner_nshake[i][j] > 1) flag = 1;
+      if (partner_shake[i][j] && partner_nshake[i][j] > 1) flag++;
  }
  MPI_Allreduce(&flag,&flag_all,1,MPI_INT,MPI_SUM,world);
  if (flag_all) error->all(FLERR,"Shake clusters are connected");
@ -1064,68 +956,18 @@ void FixShake::find_clusters()

  // -----------------------------------------------------
  // set shake_flag,shake_atom,shake_type for non-central atoms
-  // requires communication for off-proc atoms
+  // requires rendezvous communication for off-proc atoms
  // -----------------------------------------------------

-  // fill in shake arrays for each bond partner I own
-  // info to store in buf for each off-proc bond =
-  //   all values from shake_flag, shake_atom, shake_type
-  // nbufmax = largest buffer needed to hold info from any proc
-
-  nbuf = 0;
-  for (i = 0; i < nlocal; i++) {
-    if (shake_flag[i] == 0) continue;
-    for (j = 0; j < npartner[i]; j++) {
-      if (partner_shake[i][j] == 0) continue;
-      m = atom->map(partner_tag[i][j]);
-      if (m >= 0 && m < nlocal) {
-        shake_flag[m] = shake_flag[i];
-        shake_atom[m][0] = shake_atom[i][0];
-        shake_atom[m][1] = shake_atom[i][1];
-        shake_atom[m][2] = shake_atom[i][2];
-        shake_atom[m][3] = shake_atom[i][3];
-        shake_type[m][0] = shake_type[i][0];
-        shake_type[m][1] = shake_type[i][1];
-        shake_type[m][2] = shake_type[i][2];
-      } else nbuf += 9;
-    }
-  }
-
-  memory->create(buf,nbuf,"shake:buf");
-
-  // fill buffer with info
-
-  size = 0;
-  for (i = 0; i < nlocal; i++) {
-    if (shake_flag[i] == 0) continue;
-    for (j = 0; j < npartner[i]; j++) {
-      if (partner_shake[i][j] == 0) continue;
-      m = atom->map(partner_tag[i][j]);
-      if (m < 0 || m >= nlocal) {
-        buf[size] = partner_tag[i][j];
-        buf[size+1] = shake_flag[i];
-        buf[size+2] = shake_atom[i][0];
-        buf[size+3] = shake_atom[i][1];
-        buf[size+4] = shake_atom[i][2];
-        buf[size+5] = shake_atom[i][3];
-        buf[size+6] = shake_type[i][0];
-        buf[size+7] = shake_type[i][1];
-        buf[size+8] = shake_type[i][2];
-        size += 9;
-      }
-    }
-  }
-
-  // cycle buffer around ring of procs back to self
-
-  comm->ring(size,sizeof(tagint),buf,3,ring_shake,NULL,(void *)this);
-
-  memory->destroy(buf);
+  shake_info(npartner,partner_tag,partner_shake);

  // -----------------------------------------------------
  // free local memory
  // -----------------------------------------------------

+  memory->destroy(atomIDs);
+  memory->destroy(procowner);
+
  memory->destroy(npartner);
  memory->destroy(nshake);
  memory->destroy(partner_tag);
@ -1199,98 +1041,548 @@ void FixShake::find_clusters()
 }

 /* ----------------------------------------------------------------------
-   when receive buffer, scan bond partner IDs for atoms I own
-   if I own partner:
-     fill in mask and type and massflag
-     search for bond with 1st atom and fill in bondtype
+   setup atomIDs and procowner
+   sends one (owning proc, atomID) datum per owned atom to proc atomID % nprocs
+   rendezvous_ids() callback stores the datums each proc receives
 ------------------------------------------------------------------------- */

-void FixShake::ring_bonds(int ndatum, char *cbuf, void *ptr)
+void FixShake::atom_owners()
 {
-  FixShake *fsptr = (FixShake *)ptr;
-  Atom *atom = fsptr->atom;
+  tagint *tag = atom->tag;
+  int nlocal = atom->nlocal;
+
+  int *proclist;
+  memory->create(proclist,nlocal,"shake:proclist");
+  IDRvous *idbuf = (IDRvous *)
+    memory->smalloc((bigint) nlocal*sizeof(IDRvous),"shake:idbuf");
+
+  // setup input buf to rendezvous comm
+  // one datum for each owned atom: datum = owning proc, atomID
+  // rendezvous proc for each datum = atomID % nprocs
+  // proclist[i] = rendezvous proc that datum i is sent to
+
+  for (int i = 0; i < nlocal; i++) {
+    proclist[i] = tag[i] % nprocs;
+    idbuf[i].me = me;
+    idbuf[i].atomID = tag[i];
+  }
+
+  // perform rendezvous operation
+  // each proc is sent the atoms whose ID % nprocs matches its rank
+
+  char *buf;   // passed by reference; rendezvous_ids() sets flag = 0 and returns 0
+  comm->rendezvous(RVOUS,nlocal,(char *) idbuf,sizeof(IDRvous),
+                   0,proclist,
+                   rendezvous_ids,0,buf,0,(void *) this);
+
+  memory->destroy(proclist);
+  memory->sfree(idbuf);
+}
+
+/* ----------------------------------------------------------------------
+   setup partner_mask, partner_type, partner_massflag, partner_bondtype
+   values for partners I own are set directly from my atom arrays
+   values for off-proc partners are exchanged via rendezvous comm
+   npartner[i] = # of partners of owned atom i, partner_tag[i][j] = their IDs
+------------------------------------------------------------------------- */
+
+void FixShake::partner_info(int *npartner, tagint **partner_tag,
+                            int **partner_mask, int **partner_type,
+                            int **partner_massflag, int **partner_bondtype)
+{
+  int i,j,m,n;
+  int nlocal = atom->nlocal;
+
+  // nsend = # of my datums to send
+  // one datum for every off-processor partner
+
+  int nsend = 0;
+  for (i = 0; i < nlocal; i++) {
+    for (j = 0; j < npartner[i]; j++) {
+      m = atom->map(partner_tag[i][j]);
+      if (m < 0 || m >= nlocal) nsend++;
+    }
+  }
+
+  int *proclist;
+  memory->create(proclist,nsend,"special:proclist");
+  PartnerInfo *inbuf = (PartnerInfo *)
+    memory->smalloc((bigint) nsend*sizeof(PartnerInfo),"special:inbuf");
+
+  // set values in 4 partner arrays for all partner atoms I own
+  // also setup input buf to rendezvous comm
+  // input datums = pair of bonded atoms where I do not own partner
+  // rendezvous proc for each datum = partner_tag % nprocs
+  // datum: atomID = partner_tag (off-proc), partnerID = tag (on-proc)
+  //        4 values for my owned atom
+
  double *rmass = atom->rmass;
  double *mass = atom->mass;
-  int *mask = atom->mask;
  int *type = atom->type;
-  int nlocal = atom->nlocal;
-  int nmass = fsptr->nmass;
+  int *mask = atom->mask;
+  tagint *tag = atom->tag;

-  tagint *buf = (tagint *) cbuf;
-  int m,n;
  double massone;

-  for (int i = 0; i < ndatum; i += 6) {
-    m = atom->map(buf[i+1]);
-    if (m >= 0 && m < nlocal) {
-      buf[i+2] = mask[m];
-      buf[i+3] = type[m];
-      if (nmass) {
-        if (rmass) massone = rmass[m];
-        else massone = mass[type[m]];
-        buf[i+4] = fsptr->masscheck(massone);
-      }
-      if (buf[i+5] == 0) {
-        n = fsptr->bondtype_findset(m,buf[i],buf[i+1],0);
-        if (n) buf[i+5] = n;
+  nsend = 0;
+  for (i = 0; i < nlocal; i++) {
+    for (j = 0; j < npartner[i]; j++) {
+      partner_mask[i][j] = 0;
+      partner_type[i][j] = 0;
+      partner_massflag[i][j] = 0;
+      partner_bondtype[i][j] = 0;
+
+      m = atom->map(partner_tag[i][j]);
+
+      if (m >= 0 && m < nlocal) {
+        partner_mask[i][j] = mask[m];
+        partner_type[i][j] = type[m];
+        if (nmass) {
+          if (rmass) massone = rmass[m];
+          else massone = mass[type[m]];
+          partner_massflag[i][j] = masscheck(massone);
+        }
+        n = bondtype_findset(i,tag[i],partner_tag[i][j],0);   // look on atom i first
+        if (n) partner_bondtype[i][j] = n;
+        else {
+          n = bondtype_findset(m,tag[i],partner_tag[i][j],0);   // else look on partner m
+          if (n) partner_bondtype[i][j] = n;
+        }
+
+      } else {
+        proclist[nsend] = partner_tag[i][j] % nprocs;
+        inbuf[nsend].atomID = partner_tag[i][j];
+        inbuf[nsend].partnerID = tag[i];
+        inbuf[nsend].mask = mask[i];
+        inbuf[nsend].type = type[i];
+        if (nmass) {
+          if (rmass) massone = rmass[i];
+          else massone = mass[type[i]];
+          inbuf[nsend].massflag = masscheck(massone);
+        } else inbuf[nsend].massflag = 0;
+
+        // my atom may own bond, in which case set partner_bondtype
+        // else receiver of this datum will own the bond and return the value
+
+        n = bondtype_findset(i,tag[i],partner_tag[i][j],0);
+        if (n) {
+          partner_bondtype[i][j] = n;
+          inbuf[nsend].bondtype = n;
+        } else inbuf[nsend].bondtype = 0;
+
+        nsend++;
      }
    }
  }
-}

-/* ----------------------------------------------------------------------
-   when receive buffer, scan bond partner IDs for atoms I own
-   if I own partner, fill in nshake value
-------------------------------------------------------------------------- */
+  // perform rendezvous operation
+  // each rendezvous proc handles the atoms whose ID % nprocs matches its rank
+  // receives all data needed to populate un-owned partner 4 values

-void FixShake::ring_nshake(int ndatum, char *cbuf, void *ptr)
-{
-  FixShake *fsptr = (FixShake *)ptr;
-  Atom *atom = fsptr->atom;
-  int nlocal = atom->nlocal;
+  char *buf;
+  int nreturn = comm->rendezvous(RVOUS,nsend,(char *) inbuf,sizeof(PartnerInfo),
+                                 0,proclist,
+                                 rendezvous_partners_info,
+                                 0,buf,sizeof(PartnerInfo),
+                                 (void *) this);
+  PartnerInfo *outbuf = (PartnerInfo *) buf;

-  int *nshake = fsptr->nshake;
+  memory->destroy(proclist);
+  memory->sfree(inbuf);

-  tagint *buf = (tagint *) cbuf;
-  int m;
+  // set partner 4 values for un-owned partners based on output info
+  // outbuf.atomID = my owned atom, outbuf.partnerID = partner the info is for

-  for (int i = 0; i < ndatum; i += 3) {
-    m = atom->map(buf[i+1]);
-    if (m >= 0 && m < nlocal) buf[i+2] = nshake[m];
+  for (m = 0; m < nreturn; m++) {
+    i = atom->map(outbuf[m].atomID);   // NOTE(review): not checked for -1 here
+    for (j = 0; j < npartner[i]; j++)
+      if (partner_tag[i][j] == outbuf[m].partnerID) break;   // NOTE(review): assumes a match exists
+    partner_mask[i][j] = outbuf[m].mask;
+    partner_type[i][j] = outbuf[m].type;
+    partner_massflag[i][j] = outbuf[m].massflag;
+
+    // only set partner_bondtype if my atom did not set it
+    //   when setting up rendezvous
+    // if this proc set it, then sender of this datum set outbuf.bondtype = 0
+
+    if (partner_bondtype[i][j] == 0)
+      partner_bondtype[i][j] = outbuf[m].bondtype;
  }
+
+  memory->sfree(outbuf);
 }

 /* ----------------------------------------------------------------------
-   when receive buffer, scan bond partner IDs for atoms I own
-   if I own partner, fill in nshake value
+   setup partner_nshake
+   nshake of partners I own is copied directly from nshake[]
+   nshake of off-proc partners is exchanged via rendezvous comm
 ------------------------------------------------------------------------- */

-void FixShake::ring_shake(int ndatum, char *cbuf, void *ptr)
+void FixShake::nshake_info(int *npartner, tagint **partner_tag,
+                           int **partner_nshake)
 {
-  FixShake *fsptr = (FixShake *)ptr;
-  Atom *atom = fsptr->atom;
+  int i,j,m;
  int nlocal = atom->nlocal;

-  int *shake_flag = fsptr->shake_flag;
-  tagint **shake_atom = fsptr->shake_atom;
-  int **shake_type = fsptr->shake_type;
+  // nsend = # of my datums to send
+  // one datum for every off-processor partner

-  tagint *buf = (tagint *) cbuf;
-  int m;
-
-  for (int i = 0; i < ndatum; i += 9) {
-    m = atom->map(buf[i]);
-    if (m >= 0 && m < nlocal) {
-      shake_flag[m] = buf[i+1];
-      shake_atom[m][0] = buf[i+2];
-      shake_atom[m][1] = buf[i+3];
-      shake_atom[m][2] = buf[i+4];
-      shake_atom[m][3] = buf[i+5];
-      shake_type[m][0] = buf[i+6];
-      shake_type[m][1] = buf[i+7];
-      shake_type[m][2] = buf[i+8];
+  int nsend = 0;
+  for (i = 0; i < nlocal; i++) {
+    for (j = 0; j < npartner[i]; j++) {
+      m = atom->map(partner_tag[i][j]);
+      if (m < 0 || m >= nlocal) nsend++;
    }
  }
+
+  int *proclist;
+  memory->create(proclist,nsend,"special:proclist");
+  NShakeInfo *inbuf = (NShakeInfo *)
+    memory->smalloc((bigint) nsend*sizeof(NShakeInfo),"special:inbuf");
+
+  // set partner_nshake for all partner atoms I own
+  // also setup input buf to rendezvous comm
+  // input datums = pair of bonded atoms where I do not own partner
+  // rendezvous proc for each datum = partner_tag % nprocs
+  // datum: atomID = partner_tag (off-proc), partnerID = tag (on-proc)
+  //        nshake value for my owned atom
+
+  tagint *tag = atom->tag;
+
+  nsend = 0;
+  for (i = 0; i < nlocal; i++) {
+    for (j = 0; j < npartner[i]; j++) {
+      partner_nshake[i][j] = 0;
+      m = atom->map(partner_tag[i][j]);
+      if (m >= 0 && m < nlocal) {
+        partner_nshake[i][j] = nshake[m];
+      } else {
+        proclist[nsend] = partner_tag[i][j] % nprocs;
+        inbuf[nsend].atomID = partner_tag[i][j];
+        inbuf[nsend].partnerID = tag[i];
+        inbuf[nsend].nshake = nshake[i];
+        nsend++;
+      }
+    }
+  }
+
+  // perform rendezvous operation
+  // each rendezvous proc handles the atoms whose ID % nprocs matches its rank
+  // receives all data needed to populate un-owned partner nshake
+
+  char *buf;
+  int nreturn = comm->rendezvous(RVOUS,nsend,(char *) inbuf,sizeof(NShakeInfo),
+                                 0,proclist,
+                                 rendezvous_nshake,0,buf,sizeof(NShakeInfo),
+                                 (void *) this);
+  NShakeInfo *outbuf = (NShakeInfo *) buf;
+
+  memory->destroy(proclist);
+  memory->sfree(inbuf);
+
+  // set partner nshake for un-owned partners based on output info
+  // outbuf.atomID = my owned atom, outbuf.partnerID = partner the info is for
+
+  for (m = 0; m < nreturn; m++) {
+    i = atom->map(outbuf[m].atomID);   // NOTE(review): not checked for -1 here
+    for (j = 0; j < npartner[i]; j++)
+      if (partner_tag[i][j] == outbuf[m].partnerID) break;   // NOTE(review): assumes a match exists
+    partner_nshake[i][j] = outbuf[m].nshake;
+  }
+
+  memory->sfree(outbuf);
+}
+
+/* ----------------------------------------------------------------------
+   setup shake_flag, shake_atom, shake_type
+   copies the 3 SHAKE arrays of each owned atom with shake_flag set
+     to every partner flagged in partner_shake
+   owned partners are set directly, off-proc partners via rendezvous comm
+------------------------------------------------------------------------- */
+
+void FixShake::shake_info(int *npartner, tagint **partner_tag,
+                          int **partner_shake)
+{
+  int i,j,m;
+  int nlocal = atom->nlocal;
+
+  // nsend = upper bound on # of my datums to send
+  // counts every off-processor partner, fill loop below skips non-SHAKE ones
+
+  int nsend = 0;
+  for (i = 0; i < nlocal; i++) {
+    for (j = 0; j < npartner[i]; j++) {
+      m = atom->map(partner_tag[i][j]);
+      if (m < 0 || m >= nlocal) nsend++;
+    }
+  }
+
+  int *proclist;
+  memory->create(proclist,nsend,"special:proclist");
+  ShakeInfo *inbuf = (ShakeInfo *)
+    memory->smalloc((bigint) nsend*sizeof(ShakeInfo),"special:inbuf");
+
+  // set 3 shake arrays for all partner atoms I own
+  // also setup input buf to rendezvous comm
+  // input datums = partner atom where I do not own partner
+  // rendezvous proc for each datum = partner_tag % nprocs
+  // datum: atomID = partner_tag (off-proc)
+  //        values in 3 shake arrays
+
+  nsend = 0;   // recount: only datums actually filled are sent
+  for (i = 0; i < nlocal; i++) {
+    if (shake_flag[i] == 0) continue;
+    for (j = 0; j < npartner[i]; j++) {
+      if (partner_shake[i][j] == 0) continue;
+      m = atom->map(partner_tag[i][j]);
+
+      if (m >= 0 && m < nlocal) {
+        shake_flag[m] = shake_flag[i];
+        shake_atom[m][0] = shake_atom[i][0];
+        shake_atom[m][1] = shake_atom[i][1];
+        shake_atom[m][2] = shake_atom[i][2];
+        shake_atom[m][3] = shake_atom[i][3];
+        shake_type[m][0] = shake_type[i][0];
+        shake_type[m][1] = shake_type[i][1];
+        shake_type[m][2] = shake_type[i][2];
+
+      } else {
+        proclist[nsend] = partner_tag[i][j] % nprocs;
+        inbuf[nsend].atomID = partner_tag[i][j];
+        inbuf[nsend].shake_flag = shake_flag[i];
+        inbuf[nsend].shake_atom[0] = shake_atom[i][0];
+        inbuf[nsend].shake_atom[1] = shake_atom[i][1];
+        inbuf[nsend].shake_atom[2] = shake_atom[i][2];
+        inbuf[nsend].shake_atom[3] = shake_atom[i][3];
+        inbuf[nsend].shake_type[0] = shake_type[i][0];
+        inbuf[nsend].shake_type[1] = shake_type[i][1];
+        inbuf[nsend].shake_type[2] = shake_type[i][2];
+        nsend++;
+      }
+    }
+  }
+
+  // perform rendezvous operation
+  // each rendezvous proc handles the atoms whose ID % nprocs matches its rank
+  // receives all data needed to populate un-owned shake info
+
+  char *buf;
+  int nreturn = comm->rendezvous(RVOUS,nsend,(char *) inbuf,sizeof(ShakeInfo),
+                                 0,proclist,
+                                 rendezvous_shake,0,buf,sizeof(ShakeInfo),
+                                 (void *) this);
+  ShakeInfo *outbuf = (ShakeInfo *) buf;
+
+  memory->destroy(proclist);
+  memory->sfree(inbuf);
+
+  // set shake info for un-owned partners based on output info
+
+  for (m = 0; m < nreturn; m++) {
+    i = atom->map(outbuf[m].atomID);   // NOTE(review): not checked for -1 here
+    shake_flag[i] = outbuf[m].shake_flag;
+    shake_atom[i][0] = outbuf[m].shake_atom[0];
+    shake_atom[i][1] = outbuf[m].shake_atom[1];
+    shake_atom[i][2] = outbuf[m].shake_atom[2];
+    shake_atom[i][3] = outbuf[m].shake_atom[3];
+    shake_type[i][0] = outbuf[m].shake_type[0];
+    shake_type[i][1] = outbuf[m].shake_type[1];
+    shake_type[i][2] = outbuf[m].shake_type[2];
+  }
+
+  memory->sfree(outbuf);
+}
+
+/* ----------------------------------------------------------------------
+   process data for atoms assigned to me in rendezvous decomposition
+   inbuf = list of N IDRvous datums
+   no outbuf
+   copies atom IDs and their owning procs into newly allocated vectors
+     and stores them in FixShake members nrvous, atomIDs, procowner
+   proclist and outbuf are not set, return value is 0
+------------------------------------------------------------------------- */
+
+int FixShake::rendezvous_ids(int n, char *inbuf,
+                             int &flag, int *&proclist, char *&outbuf,
+                             void *ptr)
+{
+  FixShake *fsptr = (FixShake *) ptr;
+  Memory *memory = fsptr->memory;
+
+  tagint *atomIDs;
+  int *procowner;
+
+  memory->create(atomIDs,n,"special:atomIDs");
+  memory->create(procowner,n,"special:procowner");
+
+  IDRvous *in = (IDRvous *) inbuf;
+
+  for (int i = 0; i < n; i++) {
+    atomIDs[i] = in[i].atomID;
+    procowner[i] = in[i].me;
+  }
+
+  // store rendezvous data in FixShake class
+
+  fsptr->nrvous = n;
+  fsptr->atomIDs = atomIDs;
+  fsptr->procowner = procowner;
+
+  // flag = 0: no second comm needed in rendezvous
+
+  flag = 0;
+  return 0;
+}
+
+/* ----------------------------------------------------------------------
+   process data for atoms assigned to me in rendezvous decomposition
+   inbuf = list of N PartnerInfo datums
+   outbuf = same list of N PartnerInfo datums, routed to different procs
+   proclist[i] = proc stored in procowner for in[i].atomID
+   uses atomIDs and procowner stored in FixShake by rendezvous_ids()
+------------------------------------------------------------------------- */
+
+int FixShake::rendezvous_partners_info(int n, char *inbuf,
+                                       int &flag, int *&proclist, char *&outbuf,
+                                       void *ptr)
+{
+  int i,m;
+
+  FixShake *fsptr = (FixShake *) ptr;
+  Atom *atom = fsptr->atom;
+  Memory *memory = fsptr->memory;
+
+  // clear atom map so it can be used here as a hash table
+  // faster than an STL map for large atom counts
+
+  atom->map_clear();
+
+  // hash atom IDs stored in rendezvous decomposition
+
+  int nrvous = fsptr->nrvous;
+  tagint *atomIDs = fsptr->atomIDs;
+
+  for (i = 0; i < nrvous; i++)
+    atom->map_one(atomIDs[i],i);
+
+  // proclist = owner of atomID in caller decomposition
+  // outbuf = info about owned atomID = 4 values
+
+  PartnerInfo *in = (PartnerInfo *) inbuf;
+  int *procowner = fsptr->procowner;
+  memory->create(proclist,n,"shake:proclist");
+
+  for (i = 0; i < n; i++) {
+    m = atom->map(in[i].atomID);   // index into atomIDs/procowner
+    proclist[i] = procowner[m];    // NOTE(review): m is not checked for -1 here
+  }
+
+  outbuf = inbuf;
+
+  // re-create atom map
+
+  atom->map_init(0);
+  atom->nghost = 0;
+  atom->map_set();
+
+  // flag = 1: outbuf = inbuf
+
+  flag = 1;
+  return n;
+}
+
+/* ----------------------------------------------------------------------
+   process data for atoms assigned to me in rendezvous decomposition
+   inbuf = list of N NShakeInfo datums
+   outbuf = same list of N NShakeInfo datums, routed to different procs
+   proclist[i] = proc stored in procowner for in[i].atomID
+   uses atomIDs and procowner stored in FixShake by rendezvous_ids()
+------------------------------------------------------------------------- */
+
+int FixShake::rendezvous_nshake(int n, char *inbuf,
+                                int &flag, int *&proclist, char *&outbuf,
+                                void *ptr)
+{
+  int i,m;
+
+  FixShake *fsptr = (FixShake *) ptr;
+  Atom *atom = fsptr->atom;
+  Memory *memory = fsptr->memory;
+
+  // clear atom map so it can be used here as a hash table
+  // faster than an STL map for large atom counts
+
+  atom->map_clear();
+
+  // hash atom IDs stored in rendezvous decomposition
+
+  int nrvous = fsptr->nrvous;
+  tagint *atomIDs = fsptr->atomIDs;
+
+  for (i = 0; i < nrvous; i++)
+    atom->map_one(atomIDs[i],i);
+
+  // proclist = owner of atomID in caller decomposition
+  // outbuf = info about owned atomID
+
+  NShakeInfo *in = (NShakeInfo *) inbuf;
+  int *procowner = fsptr->procowner;
+  memory->create(proclist,n,"shake:proclist");
+
+  for (i = 0; i < n; i++) {
+    m = atom->map(in[i].atomID);   // index into atomIDs/procowner
+    proclist[i] = procowner[m];    // NOTE(review): m is not checked for -1 here
+  }
+
+  outbuf = inbuf;
+
+  // re-create atom map
+
+  atom->map_init(0);
+  atom->nghost = 0;
+  atom->map_set();
+
+  // flag = 1: outbuf = inbuf
+
+  flag = 1;
+  return n;
+}
+/* ----------------------------------------------------------------------
+   process data for atoms assigned to me in rendezvous decomposition
+   inbuf = list of N ShakeInfo datums
+   outbuf = same list of N ShakeInfo datums, routed to different procs
+   proclist[i] = proc stored in procowner for in[i].atomID
+   uses atomIDs and procowner stored in FixShake by rendezvous_ids()
+------------------------------------------------------------------------- */
+
+int FixShake::rendezvous_shake(int n, char *inbuf,
+                               int &flag, int *&proclist, char *&outbuf,
+                               void *ptr)
+{
+  int i,m;
+
+  FixShake *fsptr = (FixShake *) ptr;
+  Atom *atom = fsptr->atom;
+  Memory *memory = fsptr->memory;
+
+  // clear atom map so it can be used here as a hash table
+  // faster than an STL map for large atom counts
+
+  atom->map_clear();
+
+  // hash atom IDs stored in rendezvous decomposition
+
+  int nrvous = fsptr->nrvous;
+  tagint *atomIDs = fsptr->atomIDs;
+
+  for (i = 0; i < nrvous; i++)
+    atom->map_one(atomIDs[i],i);
+
+  // proclist = owner of atomID in caller decomposition
+  // outbuf = info about owned atomID
+
+  ShakeInfo *in = (ShakeInfo *) inbuf;
+  int *procowner = fsptr->procowner;
+  memory->create(proclist,n,"shake:proclist");
+
+  for (i = 0; i < n; i++) {
+    m = atom->map(in[i].atomID);   // index into atomIDs/procowner
+    proclist[i] = procowner[m];    // NOTE(review): m is not checked for -1 here
+  }
+
+  outbuf = inbuf;
+
+  // re-create atom map
+
+  atom->map_init(0);
+  atom->nghost = 0;
+  atom->map_set();
+
+  // flag = 1: outbuf = inbuf
+
+  flag = 1;
+  return n;
 }

 /* ----------------------------------------------------------------------
--- a/src/RIGID/fix_shake.h
+++ b/src/RIGID/fix_shake.h
@ -120,6 +120,11 @@ class FixShake : public Fix {
  int nmol;

  void find_clusters();
+  void atom_owners();
+  void partner_info(int *, tagint **, int **, int **, int **, int **);
+  void nshake_info(int *, tagint **, int **);
+  void shake_info(int *, tagint **, int **);
+
  int masscheck(double);
  void unconstrained_update();
  void unconstrained_update_respa(int);
@ -131,12 +136,40 @@ class FixShake : public Fix {
  int bondtype_findset(int, tagint, tagint, int);
  int angletype_findset(int, tagint, tagint, int);

-  // static variable for ring communication callback to access class data
-  // callback functions for ring communication
+  // data used by rendezvous callback methods

-  static void ring_bonds(int, char *, void *);
-  static void ring_nshake(int, char *, void *);
-  static void ring_shake(int, char *, void *);
+  int nrvous;
+  tagint *atomIDs;
+  int *procowner;
+
+  struct IDRvous {     // datum sent by atom_owners(), one per owned atom
+    int me;            // proc that owns atomID in caller decomposition
+    tagint atomID;     // ID of the owned atom
+  };
+
+  struct PartnerInfo {                // datum sent by partner_info(), one per off-proc partner
+    tagint atomID,partnerID;          // atomID = off-proc partner, partnerID = sender's owned atom
+    int mask,type,massflag,bondtype;  // values for sender's atom; bondtype = 0 if sender found no bond
+  };
+
+  struct NShakeInfo {          // datum sent by nshake_info(), one per off-proc partner
+    tagint atomID,partnerID;   // atomID = off-proc partner, partnerID = sender's owned atom
+    int nshake;                // nshake value of sender's owned atom
+  };
+
+  struct ShakeInfo {        // datum sent by shake_info() to an off-proc SHAKE partner
+    tagint atomID;          // off-proc partner atom that receives the values
+    tagint shake_atom[4];   // copy of sender's shake_atom[i][0..3]
+    int shake_flag;         // copy of sender's shake_flag[i]
+    int shake_type[3];      // copy of sender's shake_type[i][0..2]
+  };
+
+  // callback functions for rendezvous communication
+
+  static int rendezvous_ids(int, char *, int &, int *&, char *&, void *);
+  static int rendezvous_partners_info(int, char *, int &, int *&, char *&, void *);
+  static int rendezvous_nshake(int, char *, int &, int *&, char *&, void *);
+  static int rendezvous_shake(int, char *, int &, int *&, char *&, void *);
 };

 }
--- a/src/comm.cpp
+++ b/src/comm.cpp
@ -28,6 +28,7 @@
 #include "dump.h"
 #include "group.h"
 #include "procmap.h"
+#include "irregular.h"
 #include "accelerator_kokkos.h"
 #include "memory.h"
 #include "error.h"
@ -725,6 +726,429 @@ void Comm::ring(int n, int nper, void *inbuf, int messtag,
  memory->destroy(bufcopy);
 }

+/* ----------------------------------------------------------------------
+   rendezvous communication operation
+   three stages:
+     first comm sends inbuf from caller decomp to rvous decomp
+     callback operates on data in rendezvous decomp
+     second comm sends outbuf from rvous decomp back to caller decomp
+   inputs:
+     which = perform (0) irregular or (1) MPI_Alltoallv communication
+     n = # of datums in inbuf
+     inbuf = vector of input datums
+     insize = byte size of each input datum
+     inorder = 0 for inbuf in random proc order, 1 for datums ordered by proc
+     procs: inorder 0 = proc to send each datum to, 1 = # of datums/proc
+     callback = caller function to invoke in rendezvous decomposition
+                takes input datums, returns output datums
+     outorder = same as inorder, but for datums returned by callback()
+     outsize = byte size of each output datum
+     ptr = pointer to caller class, passed to callback()
+     statflag = non-zero to call rendezvous_stats() on the operation
+   outputs:
+     nout = # of output datums (function return)
+     outbuf = vector of output datums
+   callback inputs:
+     nrvous = # of rvous decomp datums in inbuf_rvous
+     inbuf_rvous = vector of rvous decomp input datums
+     ptr = pointer to caller class
+   callback outputs:
+     nrvous_out = # of rvous decomp output datums (function return)
+     flag = 0 for no second comm, 1 for outbuf_rvous = inbuf_rvous,
+            2 for second comm with new outbuf_rvous
+     procs_rvous = outorder 0 = proc to send each datum to, 1 = # of datums/proc
+                   allocated
+     outbuf_rvous = vector of rvous decomp output datums
+   NOTE: could use MPI_INT or MPI_DOUBLE instead of MPI_CHAR
+         to avoid checked-for overflow in MPI_Alltoallv?
+------------------------------------------------------------------------- */
+
+int Comm::
+rendezvous(int which, int n, char *inbuf, int insize,
+           int inorder, int *procs,
+           int (*callback)(int, char *, int &, int *&, char *&, void *),
+           int outorder, char *&outbuf, int outsize, void *ptr, int statflag)
+{
+  if (which == 0)
+    return rendezvous_irregular(n,inbuf,insize,inorder,procs,callback,
+                                outorder,outbuf,outsize,ptr,statflag);
+  else
+    return rendezvous_all2all(n,inbuf,insize,inorder,procs,callback,
+                              outorder,outbuf,outsize,ptr,statflag);
+}
+
+/* ----------------------------------------------------------------------
+   rendezvous operation using the Irregular class for both comm stages
+   args and return value are the same as for rendezvous(), minus which
+   procs_rvous and outbuf_rvous from callback() are freed here
+   outbuf is allocated here when a second comm is performed
+------------------------------------------------------------------------- */
+
+int Comm::
+rendezvous_irregular(int n, char *inbuf, int insize, int inorder, int *procs,
+                     int (*callback)(int, char *, int &, int *&, char *&, void *),
+                     int outorder, char *&outbuf,
+                     int outsize, void *ptr, int statflag)
+{
+  // irregular comm of inbuf from caller decomp to rendezvous decomp
+
+  Irregular *irregular = new Irregular(lmp);
+
+  int nrvous;
+  if (inorder) nrvous = irregular->create_data_grouped(n,procs);
+  else nrvous = irregular->create_data(n,procs);
+
+  char *inbuf_rvous = (char *) memory->smalloc((bigint) nrvous*insize,
+                                               "rendezvous:inbuf");
+  irregular->exchange_data(inbuf,insize,inbuf_rvous);
+
+  bigint irregular1_bytes = irregular->memory_usage();
+  irregular->destroy_data();
+  delete irregular;
+
+  // perform rendezvous computation via callback()
+  // callback() allocates/populates procs_rvous and outbuf_rvous
+
+  int flag;
+  int *procs_rvous;
+  char *outbuf_rvous;
+  int nrvous_out = callback(nrvous,inbuf_rvous,flag,
+                            procs_rvous,outbuf_rvous,ptr);
+
+  if (flag != 1) memory->sfree(inbuf_rvous);  // flag 1: outbuf_rvous = inbuf_rvous, freed below
+  if (flag == 0) {
+    if (statflag) rendezvous_stats(n,0,nrvous,nrvous_out,insize,outsize,
+                                   (bigint) nrvous_out*sizeof(int) +
+                                   irregular1_bytes);
+    return 0;    // all nout_rvous are 0, no 2nd comm stage
+  }
+
+  // irregular comm of outbuf from rendezvous decomp back to caller decomp
+  // caller will free outbuf
+
+  irregular = new Irregular(lmp);
+
+  int nout;
+  if (outorder)
+    nout = irregular->create_data_grouped(nrvous_out,procs_rvous);
+  else nout = irregular->create_data(nrvous_out,procs_rvous);
+
+  outbuf = (char *) memory->smalloc((bigint) nout*outsize,
+                                    "rendezvous:outbuf");
+  irregular->exchange_data(outbuf_rvous,outsize,outbuf);
+
+  bigint irregular2_bytes = irregular->memory_usage();
+  irregular->destroy_data();
+  delete irregular;
+
+  memory->destroy(procs_rvous);
+  memory->sfree(outbuf_rvous);
+
+  // return number of output datums
+  // last arg to stats() = memory for procs_rvous + irregular comm
+
+  if (statflag) rendezvous_stats(n,nout,nrvous,nrvous_out,insize,outsize,
+                                 (bigint) nrvous_out*sizeof(int) +
+                                 MAX(irregular1_bytes,irregular2_bytes));
+  return nout;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int Comm::
+rendezvous_all2all(int n, char *inbuf, int insize, int inorder, int *procs,
+                   int (*callback)(int, char *, int &, int *&, char *&, void *),
+                   int outorder, char *&outbuf, int outsize, void *ptr,
+                   int statflag)
+{
+  int iproc;
+  bigint all2all1_bytes,all2all2_bytes;
+  int *sendcount,*sdispls,*recvcount,*rdispls;
+  int *procs_a2a;
+  bigint *offsets;
+  char *inbuf_a2a,*outbuf_a2a;
+
+  // create procs and inbuf for All2all if necesary
+
+  if (!inorder) {
+    memory->create(procs_a2a,nprocs,"rendezvous:procs");
+    inbuf_a2a = (char *) memory->smalloc((bigint) n*insize,
+                                         "rendezvous:inbuf");
+    memory->create(offsets,nprocs,"rendezvous:offsets");
+
+    for (int i = 0; i < nprocs; i++) procs_a2a[i] = 0;
+    for (int i = 0; i < n; i++) procs_a2a[procs[i]]++;
+
+    offsets[0] = 0;
+    for (int i = 1; i < nprocs; i++)
+      offsets[i] = offsets[i-1] + insize*procs_a2a[i-1];
+
+    bigint offset = 0;
+    for (int i = 0; i < n; i++) {
+      iproc = procs[i];
+      memcpy(&inbuf_a2a[offsets[iproc]],&inbuf[offset],insize);
+      offsets[iproc] += insize;
+      offset += insize;
+    }
+
+    all2all1_bytes = nprocs*sizeof(int) + nprocs*sizeof(bigint) + n*insize;
+
+  } else {
+    procs_a2a = procs;
+    inbuf_a2a = inbuf;
+    all2all1_bytes = 0;
+  }
+
+  // create args for MPI_Alltoallv() on input data
+
+  memory->create(sendcount,nprocs,"rendezvous:sendcount");
+  memcpy(sendcount,procs_a2a,nprocs*sizeof(int));
+
+  memory->create(recvcount,nprocs,"rendezvous:recvcount");
+  MPI_Alltoall(sendcount,1,MPI_INT,recvcount,1,MPI_INT,world);
+
+  memory->create(sdispls,nprocs,"rendezvous:sdispls");
+  memory->create(rdispls,nprocs,"rendezvous:rdispls");
+  sdispls[0] = rdispls[0] = 0;
+  for (int i = 1; i < nprocs; i++) {
+    sdispls[i] = sdispls[i-1] + sendcount[i-1];
+    rdispls[i] = rdispls[i-1] + recvcount[i-1];
+  }
+  int nrvous = rdispls[nprocs-1] + recvcount[nprocs-1];
+
+  // test for overflow of input data due to imbalance or insize
+  // means that individual sdispls or rdispls values overflow
+
+  int overflow = 0;
+  if ((bigint) n*insize > MAXSMALLINT) overflow = 1;
+  if ((bigint) nrvous*insize > MAXSMALLINT) overflow = 1;
+  int overflowall;
+  MPI_Allreduce(&overflow,&overflowall,1,MPI_INT,MPI_MAX,world);
+  if (overflowall) error->all(FLERR,"Overflow input size in rendezvous_a2a");
+
+  for (int i = 0; i < nprocs; i++) {
+    sendcount[i] *= insize;
+    sdispls[i] *= insize;
+    recvcount[i] *= insize;
+    rdispls[i] *= insize;
+  }
+
+  // all2all comm of inbuf from caller decomp to rendezvous decomp
+
+  char *inbuf_rvous = (char *) memory->smalloc((bigint) nrvous*insize,
+                                               "rendezvous:inbuf");
+
+  MPI_Alltoallv(inbuf_a2a,sendcount,sdispls,MPI_CHAR,
+		inbuf_rvous,recvcount,rdispls,MPI_CHAR,world);
+
+  if (!inorder) {
+    memory->destroy(procs_a2a);
+    memory->sfree(inbuf_a2a);
+    memory->destroy(offsets);
+  }
+
+  // perform rendezvous computation via callback()
+  // callback() allocates/populates procs_rvous and outbuf_rvous
+
+  int flag;
+  int *procs_rvous;
+  char *outbuf_rvous;
+
+  int nrvous_out = callback(nrvous,inbuf_rvous,flag,
+                            procs_rvous,outbuf_rvous,ptr);
+
+  if (flag != 1) memory->sfree(inbuf_rvous);  // outbuf_rvous = inbuf_vous
+  if (flag == 0) {
+    memory->destroy(sendcount);
+    memory->destroy(recvcount);
+    memory->destroy(sdispls);
+    memory->destroy(rdispls);
+    if (statflag) rendezvous_stats(n,0,nrvous,nrvous_out,insize,outsize,
+                                   (bigint) nrvous_out*sizeof(int) +
+                                   4*nprocs*sizeof(int) + all2all1_bytes);
+    return 0;    // all nout_rvous are 0, no 2nd irregular
+  }
+
+
+
+
+
+
+  // create procs and outbuf for All2all if necessary
+
+  if (!outorder) {
+    memory->create(procs_a2a,nprocs,"rendezvous_a2a:procs");
+
+    outbuf_a2a = (char *) memory->smalloc((bigint) nrvous_out*outsize,
+                                          "rendezvous:outbuf");
+    memory->create(offsets,nprocs,"rendezvous:offsets");
+
+    for (int i = 0; i < nprocs; i++) procs_a2a[i] = 0;
+    for (int i = 0; i < nrvous_out; i++) procs_a2a[procs_rvous[i]]++;
+
+    offsets[0] = 0;
+    for (int i = 1; i < nprocs; i++)
+      offsets[i] = offsets[i-1] + outsize*procs_a2a[i-1];
+
+    bigint offset = 0;
+    for (int i = 0; i < nrvous_out; i++) {
+      iproc = procs_rvous[i];
+      memcpy(&outbuf_a2a[offsets[iproc]],&outbuf_rvous[offset],outsize);
+      offsets[iproc] += outsize;
+      offset += outsize;
+    }
+
+    all2all2_bytes = nprocs*sizeof(int) + nprocs*sizeof(bigint) +
+      nrvous_out*outsize;
+
+  } else {
+    procs_a2a = procs_rvous;
+    outbuf_a2a = outbuf_rvous;
+    all2all2_bytes = 0;
+  }
+
+  // comm outbuf from rendezvous decomposition back to caller
+
+  memcpy(sendcount,procs_a2a,nprocs*sizeof(int));
+
+  MPI_Alltoall(sendcount,1,MPI_INT,recvcount,1,MPI_INT,world);
+
+  sdispls[0] = rdispls[0] = 0;
+  for (int i = 1; i < nprocs; i++) {
+    sdispls[i] = sdispls[i-1] + sendcount[i-1];
+    rdispls[i] = rdispls[i-1] + recvcount[i-1];
+  }
+  int nout = rdispls[nprocs-1] + recvcount[nprocs-1];
+
+  // test for overflow of outbuf due to imbalance or outsize
+  // means that individual sdispls or rdispls values overflow
+
+  overflow = 0;
+  if ((bigint) nrvous*outsize > MAXSMALLINT) overflow = 1;
+  if ((bigint) nout*outsize > MAXSMALLINT) overflow = 1;
+  MPI_Allreduce(&overflow,&overflowall,1,MPI_INT,MPI_MAX,world);
+  if (overflowall) error->all(FLERR,"Overflow output in rendezvous_a2a");
+
+  for (int i = 0; i < nprocs; i++) {
+    sendcount[i] *= outsize;
+    sdispls[i] *= outsize;
+    recvcount[i] *= outsize;
+    rdispls[i] *= outsize;
+  }
+
+  // all2all comm of outbuf from rendezvous decomp back to caller decomp
+  // caller will free outbuf
+
+  outbuf = (char *) memory->smalloc((bigint) nout*outsize,"rendezvous:outbuf");
+
+  MPI_Alltoallv(outbuf_a2a,sendcount,sdispls,MPI_CHAR,
+		outbuf,recvcount,rdispls,MPI_CHAR,world);
+
+  memory->destroy(procs_rvous);
+  memory->sfree(outbuf_rvous);
+
+  if (!outorder) {
+    memory->destroy(procs_a2a);
+    memory->sfree(outbuf_a2a);
+    memory->destroy(offsets);
+  }
+
+  // clean up
+
+  memory->destroy(sendcount);
+  memory->destroy(recvcount);
+  memory->destroy(sdispls);
+  memory->destroy(rdispls);
+
+  // return number of output datums
+  // last arg to stats() = mem for procs_rvous + per-proc vecs + reordering ops
+
+  if (statflag) rendezvous_stats(n,nout,nrvous,nrvous_out,insize,outsize,
+                                 (bigint) nrvous_out*sizeof(int) +
+                                 4*nprocs*sizeof(int) +
+                                 MAX(all2all1_bytes,all2all2_bytes));
+  return nout;
+}
+
+/* ----------------------------------------------------------------------
+   print balance and memory info for rendezvous operation
+   useful for debugging, all procs must call, only proc 0 prints to screen
+------------------------------------------------------------------------- */
+
+void Comm::rendezvous_stats(int n, int nout, int nrvous, int nrvous_out,
+                            int insize, int outsize, bigint commsize)
+{
+  bigint size_in_all,size_in_max,size_in_min;
+  bigint size_out_all,size_out_max,size_out_min;
+  bigint size_inrvous_all,size_inrvous_max,size_inrvous_min;
+  bigint size_outrvous_all,size_outrvous_max,size_outrvous_min;
+  bigint size_comm_all,size_comm_max,size_comm_min;
+  // each size_* is a byte count reduced over all procs: all = sum, max, min
+  bigint size = (bigint) n*insize;       // n input datums, insize bytes each
+  MPI_Allreduce(&size,&size_in_all,1,MPI_LMP_BIGINT,MPI_SUM,world);
+  MPI_Allreduce(&size,&size_in_max,1,MPI_LMP_BIGINT,MPI_MAX,world);
+  MPI_Allreduce(&size,&size_in_min,1,MPI_LMP_BIGINT,MPI_MIN,world);
+
+  size = (bigint) nout*outsize;          // nout output datums, outsize bytes each
+  MPI_Allreduce(&size,&size_out_all,1,MPI_LMP_BIGINT,MPI_SUM,world);
+  MPI_Allreduce(&size,&size_out_max,1,MPI_LMP_BIGINT,MPI_MAX,world);
+  MPI_Allreduce(&size,&size_out_min,1,MPI_LMP_BIGINT,MPI_MIN,world);
+
+  size = (bigint) nrvous*insize;         // nrvous input datums in rendezvous decomp
+  MPI_Allreduce(&size,&size_inrvous_all,1,MPI_LMP_BIGINT,MPI_SUM,world);
+  MPI_Allreduce(&size,&size_inrvous_max,1,MPI_LMP_BIGINT,MPI_MAX,world);
+  MPI_Allreduce(&size,&size_inrvous_min,1,MPI_LMP_BIGINT,MPI_MIN,world);
+
+  size = (bigint) nrvous_out*outsize;    // output datums: outsize, as divided below
+  MPI_Allreduce(&size,&size_outrvous_all,1,MPI_LMP_BIGINT,MPI_SUM,world);
+  MPI_Allreduce(&size,&size_outrvous_max,1,MPI_LMP_BIGINT,MPI_MAX,world);
+  MPI_Allreduce(&size,&size_outrvous_min,1,MPI_LMP_BIGINT,MPI_MIN,world);
+
+  size = commsize;                       // bytes of work memory tallied by caller
+  MPI_Allreduce(&size,&size_comm_all,1,MPI_LMP_BIGINT,MPI_SUM,world);
+  MPI_Allreduce(&size,&size_comm_max,1,MPI_LMP_BIGINT,MPI_MAX,world);
+  MPI_Allreduce(&size,&size_comm_min,1,MPI_LMP_BIGINT,MPI_MIN,world);
+
+  int mbytes = 1024*1024;
+
+  if (me == 0) {
+    if (screen) {
+      fprintf(screen,"Rendezvous balance and memory info: (tot,ave,max,min) \n");
+      fprintf(screen,"  input datum count: "
+              BIGINT_FORMAT " %g " BIGINT_FORMAT " " BIGINT_FORMAT "\n",
+              size_in_all/insize,1.0*size_in_all/nprocs/insize,
+              size_in_max/insize,size_in_min/insize);
+      fprintf(screen,"  input data (MB): %g %g %g %g\n",
+              1.0*size_in_all/mbytes,1.0*size_in_all/nprocs/mbytes,
+              1.0*size_in_max/mbytes,1.0*size_in_min/mbytes);
+      if (outsize)                       // outsize = 0 would divide by zero
+        fprintf(screen,"  output datum count: "
+                BIGINT_FORMAT " %g " BIGINT_FORMAT " " BIGINT_FORMAT "\n",
+                size_out_all/outsize,1.0*size_out_all/nprocs/outsize,
+                size_out_max/outsize,size_out_min/outsize);
+      else
+        fprintf(screen,"  output datum count: %d %g %d %d\n",0,0.0,0,0);
+      fprintf(screen,"  output data (MB): %g %g %g %g\n",
+              1.0*size_out_all/mbytes,1.0*size_out_all/nprocs/mbytes,
+              1.0*size_out_max/mbytes,1.0*size_out_min/mbytes);
+      fprintf(screen,"  input rvous datum count: "
+              BIGINT_FORMAT " %g " BIGINT_FORMAT " " BIGINT_FORMAT "\n",
+              size_inrvous_all/insize,1.0*size_inrvous_all/nprocs/insize,
+              size_inrvous_max/insize,size_inrvous_min/insize);
+      fprintf(screen,"  input rvous data (MB): %g %g %g %g\n",
+              1.0*size_inrvous_all/mbytes,1.0*size_inrvous_all/nprocs/mbytes,
+              1.0*size_inrvous_max/mbytes,1.0*size_inrvous_min/mbytes);
+      if (outsize)                       // outsize = 0 would divide by zero
+        fprintf(screen,"  output rvous datum count: "
+                BIGINT_FORMAT " %g " BIGINT_FORMAT " " BIGINT_FORMAT "\n",
+                size_outrvous_all/outsize,1.0*size_outrvous_all/nprocs/outsize,
+                size_outrvous_max/outsize,size_outrvous_min/outsize);
+      else
+        fprintf(screen,"  output rvous datum count: %d %g %d %d\n",0,0.0,0,0);
+      fprintf(screen,"  output rvous data (MB): %g %g %g %g\n",
+              1.0*size_outrvous_all/mbytes,1.0*size_outrvous_all/nprocs/mbytes,
+              1.0*size_outrvous_max/mbytes,1.0*size_outrvous_min/mbytes);
+      fprintf(screen,"  rvous comm (MB): %g %g %g %g\n",
+              1.0*size_comm_all/mbytes,1.0*size_comm_all/nprocs/mbytes,
+              1.0*size_comm_max/mbytes,1.0*size_comm_min/mbytes);
+    }
+  }
+}
+
 /* ----------------------------------------------------------------------
   proc 0 reads Nlines from file into buf and bcasts buf to all procs
   caller allocates buf to max size needed
--- a/src/comm.h
+++ b/src/comm.h
@ -109,6 +109,10 @@ class Comm : protected Pointers {

  void ring(int, int, void *, int, void (*)(int, char *, void *),
            void *, void *, int self = 1);
+  int rendezvous(int, int, char *, int, int, int *,
+                 int (*)(int, char *, int &, int *&, char *&, void *),
+                 int, char *&, int, void *, int statflag=0);
+
  int read_lines_from_file(FILE *, int, int, char *);
  int read_lines_from_file_universe(FILE *, int, int, char *);

@ -142,6 +146,15 @@ class Comm : protected Pointers {
  int ncores;                       // # of cores per node
  int coregrid[3];                  // 3d grid of cores within a node
  int user_coregrid[3];             // user request for cores in each dim
+
+  int rendezvous_irregular(int, char *, int, int, int *,
+                           int (*)(int, char *, int &, int *&, char *&, void *),
+                           int, char *&, int, void *, int);
+  int rendezvous_all2all(int, char *, int, int, int *,
+                         int (*)(int, char *, int &, int *&, char *&, void *),
+                         int, char *&, int, void *, int);
+  void rendezvous_stats(int, int, int, int, int, int, bigint);
+
 public:
  enum{MULTIPLE};
 };
--- a/src/create_atoms.cpp
+++ b/src/create_atoms.cpp
@ -514,9 +514,6 @@ void CreateAtoms::command(int narg, char **arg)
    if (domain->triclinic) domain->lamda2x(atom->nlocal);
  }

-  MPI_Barrier(world);
-  double time2 = MPI_Wtime();
-
  // clean up

  delete ranmol;
@ -526,21 +523,6 @@ void CreateAtoms::command(int narg, char **arg)
  delete [] ystr;
  delete [] zstr;

-  // print status
-
-  if (comm->me == 0) {
-    if (screen) {
-      fprintf(screen,"Created " BIGINT_FORMAT " atoms\n",
-              atom->natoms-natoms_previous);
-      fprintf(screen,"  Time spent = %g secs\n",time2-time1);
-    }
-    if (logfile) {
-      fprintf(logfile,"Created " BIGINT_FORMAT " atoms\n",
-              atom->natoms-natoms_previous);
-      fprintf(logfile,"  Time spent = %g secs\n",time2-time1);
-    }
-  }
-
  // for MOLECULE mode:
  // create special bond lists for molecular systems,
  //   but not for atom style template
@ -550,6 +532,25 @@ void CreateAtoms::command(int narg, char **arg)
    if (atom->molecular == 1 && onemol->bondflag && !onemol->specialflag) {
      Special special(lmp);
      special.build();
+
+    }
+  }
+
+  // print status
+
+  MPI_Barrier(world);
+  double time2 = MPI_Wtime();
+
+  if (comm->me == 0) {
+    if (screen) {
+      fprintf(screen,"Created " BIGINT_FORMAT " atoms\n",
+              atom->natoms-natoms_previous);
+      fprintf(screen,"  create_atoms CPU = %g secs\n",time2-time1);
+    }
+    if (logfile) {
+      fprintf(logfile,"Created " BIGINT_FORMAT " atoms\n",
+              atom->natoms-natoms_previous);
+      fprintf(logfile,"  create_atoms CPU = %g secs\n",time2-time1);
    }
  }
 }
--- a/src/hashlittle.cpp
+++ b/src/hashlittle.cpp
@ -0,0 +1,348 @@
+// Hash function hashlittle()
+// from lookup3.c, by Bob Jenkins, May 2006, Public Domain
+// bob_jenkins@burtleburtle.net
+
+#include <cmath>
+#include <stddef.h>
+#include <stdint.h>
+
+// if the system defines the __BYTE_ORDER__ define,
+// we use it instead of guessing the platform
+
+#if defined(__BYTE_ORDER__)
+# if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#  define HASH_LITTLE_ENDIAN 1       // compiler reports little endian
+# else
+#  define HASH_LITTLE_ENDIAN 0       // anything else uses the byte-wise path
+# endif
+#else   // heuristic platform guess
+# if defined(__bg__)
+#  define HASH_LITTLE_ENDIAN 0       // IBM BlueGene is big endian
+# else
+#  define HASH_LITTLE_ENDIAN 1       // Intel and AMD x86 are little endian
+# endif
+#endif
+
+#define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k))))   // rotate 32-bit unsigned x left by k bits, 0 < k < 32
+
+/*
+-------------------------------------------------------------------------------
+mix -- mix 3 32-bit values reversibly.
+
+This is reversible, so any information in (a,b,c) before mix() is
+still in (a,b,c) after mix().
+
+If four pairs of (a,b,c) inputs are run through mix(), or through
+mix() in reverse, there are at least 32 bits of the output that
+are sometimes the same for one pair and different for another pair.
+This was tested for:
+* pairs that differed by one bit, by two bits, in any combination
+  of top bits of (a,b,c), or in any combination of bottom bits of
+  (a,b,c).
+* "differ" is defined as +, -, ^, or ~^.  For + and -, I transformed
+  the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
+  is commonly produced by subtraction) look like a single 1-bit
+  difference.
+* the base values were pseudorandom, all zero but one bit set, or
+  all zero plus a counter that starts at zero.
+
+Some k values for my "a-=c; a^=rot(c,k); c+=b;" arrangement that
+satisfy this are
+    4  6  8 16 19  4
+    9 15  3 18 27 15
+   14  9  3  7 17  3
+Well, "9 15 3 18 27 15" didn't quite get 32 bits diffing
+for "differ" defined as + with a one-bit base and a two-bit delta.  I
+used http://burtleburtle.net/bob/hash/avalanche.html to choose
+the operations, constants, and arrangements of the variables.
+
+This does not achieve avalanche.  There are input bits of (a,b,c)
+that fail to affect some output bits of (a,b,c), especially of a.  The
+most thoroughly mixed value is c, but it doesn't really even achieve
+avalanche in c.
+
+This allows some parallelism.  Read-after-writes are good at doubling
+the number of bits affected, so the goal of mixing pulls in the opposite
+direction as the goal of parallelism.  I did what I could.  Rotates
+seem to cost as much as shifts on every machine I could lay my hands
+on, and rotates are much kinder to the top and bottom bits, so I used
+rotates.
+-------------------------------------------------------------------------------
+*/
+#define mix(a,b,c) /* a,b,c = uint32_t lvalues, updated in place */ \
+{ \
+  a -= c;  a ^= rot(c, 4);  c += b; \
+  b -= a;  b ^= rot(a, 6);  a += c; \
+  c -= b;  c ^= rot(b, 8);  b += a; \
+  a -= c;  a ^= rot(c,16);  c += b; \
+  b -= a;  b ^= rot(a,19);  a += c; \
+  c -= b;  c ^= rot(b, 4);  b += a; \
+}
+
+/*
+-------------------------------------------------------------------------------
+final -- final mixing of 3 32-bit values (a,b,c) into c
+
+Pairs of (a,b,c) values differing in only a few bits will usually
+produce values of c that look totally different.  This was tested for
+* pairs that differed by one bit, by two bits, in any combination
+  of top bits of (a,b,c), or in any combination of bottom bits of
+  (a,b,c).
+* "differ" is defined as +, -, ^, or ~^.  For + and -, I transformed
+  the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
+  is commonly produced by subtraction) look like a single 1-bit
+  difference.
+* the base values were pseudorandom, all zero but one bit set, or
+  all zero plus a counter that starts at zero.
+
+These constants passed:
+ 14 11 25 16 4 14 24
+ 12 14 25 16 4 14 24
+and these came close:
+  4  8 15 26 3 22 24
+ 10  8 15 26 3 22 24
+ 11  8 15 26 3 22 24
+-------------------------------------------------------------------------------
+*/
+#define final(a,b,c) /* a,b,c = uint32_t lvalues, result is left in c */ \
+{ \
+  c ^= b; c -= rot(b,14); \
+  a ^= c; a -= rot(c,11); \
+  b ^= a; b -= rot(a,25); \
+  c ^= b; c -= rot(b,16); \
+  a ^= c; a -= rot(c,4);  \
+  b ^= a; b -= rot(a,14); \
+  c ^= b; c -= rot(b,24); \
+}
+
+/*
+-------------------------------------------------------------------------------
+hashlittle() -- hash a variable-length key into a 32-bit value
+  k       : the key (the unaligned variable-length array of bytes)
+  length  : the length of the key, counting by bytes
+  initval : can be any 4-byte value
+Returns a 32-bit value.  Every bit of the key affects every bit of
+the return value.  Two keys differing by one or two bits will have
+totally different hash values.
+
+The best hash table sizes are powers of 2.  There is no need to do
+mod a prime (mod is sooo slow!).  If you need less than 32 bits,
+use a bitmask.  For example, if you need only 10 bits, do
+  h = (h & hashmask(10));
+In which case, the hash table should have hashsize(10) elements.
+
+If you are hashing n strings (uint8_t **)k, do it like this:
+  for (i=0, h=0; i<n; ++i) h = hashlittle( k[i], len[i], h);
+
+By Bob Jenkins, 2006.  bob_jenkins@burtleburtle.net.  You may use this
+code any way you wish, private, educational, or commercial.  It's free.
+
+Use for hash table lookup, or anything where one collision in 2^^32 is
+acceptable.  Do NOT use for cryptographic purposes.
+-------------------------------------------------------------------------------
+*/
+
+uint32_t hashlittle( const void *key, size_t length, uint32_t initval)
+{   /* hash the length bytes at key into a 32-bit value, seeded with initval */
+#ifndef PURIFY_HATES_HASHLITTLE   /* default build: lookup3 hashlittle */
+
+  uint32_t a,b,c;                                          /* internal state */
+  union { const void *ptr; size_t i; } u;     /* needed for Mac Powerbook G4 */
+
+  /* Set up the internal state */
+  a = b = c = 0xdeadbeef + ((uint32_t)length) + initval;
+
+  u.ptr = key;                 /* read pointer as integer to test its alignment */
+  if (HASH_LITTLE_ENDIAN && ((u.i & 0x3) == 0)) {        /* 4-byte aligned key */
+    const uint32_t *k = (const uint32_t *)key;         /* read 32-bit chunks */
+
+    /*------ all but last block: aligned reads and affect 32 bits of (a,b,c) */
+    while (length > 12)
+    {
+      a += k[0];
+      b += k[1];
+      c += k[2];
+      mix(a,b,c);
+      length -= 12;
+      k += 3;
+    }
+
+    /*----------------------------- handle the last (probably partial) block */
+    /*
+     * "k[2]&0xffffff" actually reads beyond the end of the string, but
+     * then masks off the part it's not allowed to read.  Because the
+     * string is aligned, the masked-off tail is in the same word as the
+     * rest of the string.  Every machine with memory protection I've seen
+     * does it on word boundaries, so is OK with this.  But VALGRIND will
+     * still catch it and complain.  The masking trick does make the hash
+     * noticeably faster for short strings (like English words).
+     */
+#ifndef VALGRIND
+
+    switch(length)
+    {
+    case 12: c+=k[2]; b+=k[1]; a+=k[0]; break;
+    case 11: c+=k[2]&0xffffff; b+=k[1]; a+=k[0]; break;
+    case 10: c+=k[2]&0xffff; b+=k[1]; a+=k[0]; break;
+    case 9 : c+=k[2]&0xff; b+=k[1]; a+=k[0]; break;
+    case 8 : b+=k[1]; a+=k[0]; break;
+    case 7 : b+=k[1]&0xffffff; a+=k[0]; break;
+    case 6 : b+=k[1]&0xffff; a+=k[0]; break;
+    case 5 : b+=k[1]&0xff; a+=k[0]; break;
+    case 4 : a+=k[0]; break;
+    case 3 : a+=k[0]&0xffffff; break;
+    case 2 : a+=k[0]&0xffff; break;
+    case 1 : a+=k[0]&0xff; break;
+    case 0 : return c;              /* zero length strings require no mixing */
+    }
+
+#else /* make valgrind happy */
+
+    const uint8_t  *k8 = (const uint8_t *)k;
+    switch(length)
+    {
+    case 12: c+=k[2]; b+=k[1]; a+=k[0]; break;
+    case 11: c+=((uint32_t)k8[10])<<16;  /* fall through */
+    case 10: c+=((uint32_t)k8[9])<<8;    /* fall through */
+    case 9 : c+=k8[8];                   /* fall through */
+    case 8 : b+=k[1]; a+=k[0]; break;
+    case 7 : b+=((uint32_t)k8[6])<<16;   /* fall through */
+    case 6 : b+=((uint32_t)k8[5])<<8;    /* fall through */
+    case 5 : b+=k8[4];                   /* fall through */
+    case 4 : a+=k[0]; break;
+    case 3 : a+=((uint32_t)k8[2])<<16;   /* fall through */
+    case 2 : a+=((uint32_t)k8[1])<<8;    /* fall through */
+    case 1 : a+=k8[0]; break;
+    case 0 : return c;                   /* zero length requires no mixing */
+    }
+
+#endif /* !valgrind */
+
+  } else if (HASH_LITTLE_ENDIAN && ((u.i & 0x1) == 0)) { /* 2-byte aligned key */
+    const uint16_t *k = (const uint16_t *)key;         /* read 16-bit chunks */
+    const uint8_t  *k8;
+
+    /*--------------- all but last block: aligned reads and different mixing */
+    while (length > 12)
+    {
+      a += k[0] + (((uint32_t)k[1])<<16);
+      b += k[2] + (((uint32_t)k[3])<<16);
+      c += k[4] + (((uint32_t)k[5])<<16);
+      mix(a,b,c);
+      length -= 12;
+      k += 6;
+    }
+
+    /*----------------------------- handle the last (probably partial) block */
+    k8 = (const uint8_t *)k;
+    switch(length)
+    {
+    case 12: c+=k[4]+(((uint32_t)k[5])<<16);
+             b+=k[2]+(((uint32_t)k[3])<<16);
+             a+=k[0]+(((uint32_t)k[1])<<16);
+             break;
+    case 11: c+=((uint32_t)k8[10])<<16;     /* fall through */
+    case 10: c+=k[4];
+             b+=k[2]+(((uint32_t)k[3])<<16);
+             a+=k[0]+(((uint32_t)k[1])<<16);
+             break;
+    case 9 : c+=k8[8];                      /* fall through */
+    case 8 : b+=k[2]+(((uint32_t)k[3])<<16);
+             a+=k[0]+(((uint32_t)k[1])<<16);
+             break;
+    case 7 : b+=((uint32_t)k8[6])<<16;      /* fall through */
+    case 6 : b+=k[2];
+             a+=k[0]+(((uint32_t)k[1])<<16);
+             break;
+    case 5 : b+=k8[4];                      /* fall through */
+    case 4 : a+=k[0]+(((uint32_t)k[1])<<16);
+             break;
+    case 3 : a+=((uint32_t)k8[2])<<16;      /* fall through */
+    case 2 : a+=k[0];
+             break;
+    case 1 : a+=k8[0];
+             break;
+    case 0 : return c;                     /* zero length requires no mixing */
+    }
+
+  } else {                        /* need to read the key one byte at a time */
+    const uint8_t *k = (const uint8_t *)key;
+
+    /*--------------- all but the last block: affect some 32 bits of (a,b,c) */
+    while (length > 12)
+    {
+      a += k[0];
+      a += ((uint32_t)k[1])<<8;
+      a += ((uint32_t)k[2])<<16;
+      a += ((uint32_t)k[3])<<24;
+      b += k[4];
+      b += ((uint32_t)k[5])<<8;
+      b += ((uint32_t)k[6])<<16;
+      b += ((uint32_t)k[7])<<24;
+      c += k[8];
+      c += ((uint32_t)k[9])<<8;
+      c += ((uint32_t)k[10])<<16;
+      c += ((uint32_t)k[11])<<24;
+      mix(a,b,c);
+      length -= 12;
+      k += 12;
+    }
+
+    /*-------------------------------- last block: affect all 32 bits of (c) */
+    switch(length)                   /* all the case statements fall through */
+    {
+    case 12: c+=((uint32_t)k[11])<<24;   /* fall through */
+    case 11: c+=((uint32_t)k[10])<<16;   /* fall through */
+    case 10: c+=((uint32_t)k[9])<<8;     /* fall through */
+    case 9 : c+=k[8];                    /* fall through */
+    case 8 : b+=((uint32_t)k[7])<<24;    /* fall through */
+    case 7 : b+=((uint32_t)k[6])<<16;    /* fall through */
+    case 6 : b+=((uint32_t)k[5])<<8;     /* fall through */
+    case 5 : b+=k[4];                    /* fall through */
+    case 4 : a+=((uint32_t)k[3])<<24;    /* fall through */
+    case 3 : a+=((uint32_t)k[2])<<16;    /* fall through */
+    case 2 : a+=((uint32_t)k[1])<<8;     /* fall through */
+    case 1 : a+=k[0];
+             break;
+    case 0 : return c;                   /* zero length requires no mixing */
+    }
+  }
+
+  final(a,b,c);
+  return c;
+
+#else  /* PURIFY_HATES_HASHLITTLE */
+/* I don't know what it is about Jenkins' hashlittle function, but
+ * it drives purify insane, even with VALGRIND defined.  It makes
+ * purify unusable!!  The code execution doesn't even make sense.
+ * Below is a (probably) weaker hash function that at least allows
+ * testing with purify.
+ */
+#define MAXINT_DIV_PHI  11400714819323198485U   /* wider than 32 bits: h keeps only the low 32 bits of each product */
+
+  uint32_t h, rest, *p, bytes, num_bytes;
+  char *byteptr;
+
+  num_bytes = length;
+
+  /* First hash the uint32_t-sized portions of the key */
+  h = 0;
+  for (p = (uint32_t *)key, bytes=num_bytes;
+       bytes >= (uint32_t) sizeof(uint32_t);
+       bytes-=sizeof(uint32_t), p++){
+    h = (h^(*p))*MAXINT_DIV_PHI;
+  }
+
+  /* Then take care of the remaining bytes, if any */
+  rest = 0;
+  for (byteptr = (char *)p; bytes > 0; bytes--, byteptr++){
+    rest = (rest<<8) | (*byteptr);
+  }
+
+  /* If extra bytes, merge the two parts */
+  if (rest)
+    h = (h^rest)*MAXINT_DIV_PHI;
+
+  return h;
+#endif /* PURIFY_HATES_HASHLITTLE */
+}
--- a/src/hashlittle.h
+++ b/src/hashlittle.h
@ -0,0 +1,15 @@
+// Hash function hashlittle()
+// from lookup3.c, by Bob Jenkins, May 2006, Public Domain
+// bob_jenkins@burtleburtle.net
+
+#ifndef LMP_HASHLITTLE_H
+#define LMP_HASHLITTLE_H
+
+#include <stddef.h>     // size_t
+#include <stdint.h>     // uint32_t
+
+// hash the length bytes starting at key into a 32-bit value
+// initval can be any 4-byte value, it seeds the hash
+uint32_t hashlittle(const void *key, size_t length, uint32_t initval);
+
+#endif
--- a/src/irregular.cpp
+++ b/src/irregular.cpp
@ -501,7 +501,8 @@ int compare_standalone(const int i, const int j, void *ptr)

 void Irregular::exchange_atom(double *sendbuf, int *sizes, double *recvbuf)
 {
-  int i,m,n,offset,count;
+  int i,m,n,count;
+  bigint offset;

  // post all receives

@ -621,6 +622,7 @@ int Irregular::create_data(int n, int *proclist, int sortflag)
  num_send = new int[nsend_proc];
  index_send = new int[n-work1[me]];
  index_self = new int[work1[me]];
+  maxindex = n;

  // proc_send = procs I send to
  // num_send = # of datums I send to each proc
@ -678,8 +680,182 @@ int Irregular::create_data(int n, int *proclist, int sortflag)

  // receive incoming messages
  // proc_recv = procs I recv from
-  // num_recv = total size of message each proc sends me
-  // nrecvdatum = total size of data I recv
+  // num_recv = # of datums each proc sends me
+  // nrecvdatum = total # of datums I recv
+
+  int nrecvdatum = 0;
+  for (i = 0; i < nrecv_proc; i++) {
+    MPI_Recv(&num_recv[i],1,MPI_INT,MPI_ANY_SOURCE,0,world,status);
+    proc_recv[i] = status->MPI_SOURCE;
+    nrecvdatum += num_recv[i];
+  }
+  nrecvdatum += num_self;
+
+  // sort proc_recv and num_recv by proc ID if requested
+  // useful for debugging to insure reproducible ordering of received datums
+
+  if (sortflag) {
+    int *order = new int[nrecv_proc];
+    int *proc_recv_ordered = new int[nrecv_proc];
+    int *num_recv_ordered = new int[nrecv_proc];
+
+    for (i = 0; i < nrecv_proc; i++) order[i] = i;
+
+#if defined(LMP_QSORT)
+    proc_recv_copy = proc_recv;
+    qsort(order,nrecv_proc,sizeof(int),compare_standalone);
+#else
+    merge_sort(order,nrecv_proc,(void *)proc_recv,compare_standalone);
+#endif
+
+    int j;
+    for (i = 0; i < nrecv_proc; i++) {
+      j = order[i];
+      proc_recv_ordered[i] = proc_recv[j];
+      num_recv_ordered[i] = num_recv[j];
+    }
+
+    memcpy(proc_recv,proc_recv_ordered,nrecv_proc*sizeof(int));
+    memcpy(num_recv,num_recv_ordered,nrecv_proc*sizeof(int));
+    delete [] order;
+    delete [] proc_recv_ordered;
+    delete [] num_recv_ordered;
+  }
+
+  // barrier to insure all MPI_ANY_SOURCE messages are received
+  // else another proc could proceed to exchange_data() and send to me
+
+  MPI_Barrier(world);
+
+  // return # of datums I will receive
+
+  return nrecvdatum;
+}
+
+/* ----------------------------------------------------------------------
+   create communication plan based on list of datums of uniform size
+   n = # of datums to send
+   procs = how many datums to send to each proc, must include self
+   sort = flag for sorting order of received messages by proc ID
+   return total # of datums I will recv, including any to self
+------------------------------------------------------------------------- */
+
+int Irregular::create_data_grouped(int n, int *procs, int sortflag)
+{
+  int i,j,k,m;
+
+  // setup for collective comm
+  // work1 = # of datums I send to each proc, set self to 0
+  // work2 = 1 for all procs, used for ReduceScatter
+
+  for (i = 0; i < nprocs; i++) {
+    work1[i] = procs[i];
+    work2[i] = 1;
+  }
+  work1[me] = 0;
+
+  // nrecv_proc = # of procs I receive messages from, not including self
+  // options for performing ReduceScatter operation
+  // some are more efficient on some machines at big sizes
+
+#ifdef LAMMPS_RS_ALLREDUCE_INPLACE
+  MPI_Allreduce(MPI_IN_PLACE,work1,nprocs,MPI_INT,MPI_SUM,world);
+  nrecv_proc = work1[me];
+#else
+#ifdef LAMMPS_RS_ALLREDUCE
+  MPI_Allreduce(work1,work2,nprocs,MPI_INT,MPI_SUM,world);
+  nrecv_proc = work2[me];
+#else
+  MPI_Reduce_scatter(work1,&nrecv_proc,work2,MPI_INT,MPI_SUM,world);
+#endif
+#endif
+
+  // allocate receive arrays
+
+  proc_recv = new int[nrecv_proc];
+  num_recv = new int[nrecv_proc];
+  request = new MPI_Request[nrecv_proc];
+  status = new MPI_Status[nrecv_proc];
+
+  // work1 = # of datums I send to each proc, including self
+  // nsend_proc = # of procs I send messages to, not including self
+
+  for (i = 0; i < nprocs; i++) work1[i] = procs[i];
+
+  nsend_proc = 0;
+  for (i = 0; i < nprocs; i++)
+    if (work1[i]) nsend_proc++;
+  if (work1[me]) nsend_proc--;
+
+  // allocate send and self arrays
+
+  proc_send = new int[nsend_proc];
+  num_send = new int[nsend_proc];
+  index_send = new int[n-work1[me]];
+  index_self = new int[work1[me]];
+  maxindex = n;
+
+  // proc_send = procs I send to
+  // num_send = # of datums I send to each proc
+  // num_self = # of datums I copy to self
+  // to balance pattern of send messages:
+  //   each proc begins with iproc > me, continues until iproc = me
+  // reset work1 to store which send message each proc corresponds to
+
+  int iproc = me;
+  int isend = 0;
+  for (i = 0; i < nprocs; i++) {
+    iproc++;
+    if (iproc == nprocs) iproc = 0;
+    if (iproc == me) {
+      num_self = work1[iproc];
+      work1[iproc] = 0;
+    } else if (work1[iproc] > 0) {
+      proc_send[isend] = iproc;
+      num_send[isend] = work1[iproc];
+      work1[iproc] = isend;
+      isend++;
+    }
+  }
+
+  // work2 = offsets into index_send for each proc I send to
+  // m = ptr into index_self
+  // index_send = list of which datums to send to each proc
+  //   1st N1 values are datum indices for 1st proc,
+  //   next N2 values are datum indices for 2nd proc, etc
+  // index_self = list of which datums to copy to self
+
+  work2[0] = 0;
+  for (i = 1; i < nsend_proc; i++) work2[i] = work2[i-1] + num_send[i-1];
+
+  m = 0;
+  i = 0;
+  for (iproc = 0; iproc < nprocs; iproc++) {
+    k = procs[iproc];
+    for (j = 0; j < k; j++) {
+      if (iproc == me) index_self[m++] = i++;
+      else {
+        isend = work1[iproc];
+        index_send[work2[isend]++] = i++;
+      }
+    }
+  }
+
+  // tell receivers how much data I send
+  // sendmax_proc = largest # of datums I send in a single message
+
+  sendmax_proc = 0;
+  for (i = 0; i < nsend_proc; i++) {
+    MPI_Request tmpReq; // Use non-blocking send to avoid possible deadlock
+    MPI_Isend(&num_send[i],1,MPI_INT,proc_send[i],0,world,&tmpReq);
+    MPI_Request_free(&tmpReq); // the MPI_Barrier below marks completion
+    sendmax_proc = MAX(sendmax_proc,num_send[i]);
+  }
+
+  // receive incoming messages
+  // proc_recv = procs I recv from
+  // num_recv = # of datums each proc sends me
+  // nrecvdatum = total # of datums I recv

  int nrecvdatum = 0;
  for (i = 0; i < nrecv_proc; i++) {
@ -739,11 +915,13 @@ int Irregular::create_data(int n, int *proclist, int sortflag)

 void Irregular::exchange_data(char *sendbuf, int nbytes, char *recvbuf)
 {
-  int i,m,n,offset,count;
+  int i,n,count;
+  bigint m;       // these 2 lines enable send/recv buf to be larger than 2 GB
+  char *dest;

  // post all receives, starting after self copies

-  offset = num_self*nbytes;
+  bigint offset = num_self*nbytes;
  for (int irecv = 0; irecv < nrecv_proc; irecv++) {
    MPI_Irecv(&recvbuf[offset],num_recv[irecv]*nbytes,MPI_CHAR,
              proc_recv[irecv],0,world,&request[irecv]);
@ -765,23 +943,34 @@ void Irregular::exchange_data(char *sendbuf, int nbytes, char *recvbuf)
  n = 0;
  for (int isend = 0; isend < nsend_proc; isend++) {
    count = num_send[isend];
+    dest = buf;
    for (i = 0; i < count; i++) {
      m = index_send[n++];
-      memcpy(&buf[i*nbytes],&sendbuf[m*nbytes],nbytes);
+      memcpy(dest,&sendbuf[m*nbytes],nbytes);
+      dest += nbytes;
    }
    MPI_Send(buf,count*nbytes,MPI_CHAR,proc_send[isend],0,world);
  }

  // copy datums to self, put at beginning of recvbuf

+  dest = recvbuf;
  for (i = 0; i < num_self; i++) {
    m = index_self[i];
-    memcpy(&recvbuf[i*nbytes],&sendbuf[m*nbytes],nbytes);
+    memcpy(dest,&sendbuf[m*nbytes],nbytes);
+    dest += nbytes;
  }

  // wait on all incoming messages

  if (nrecv_proc) MPI_Waitall(nrecv_proc,request,status);
+
+  // approximate memory tally
+  // DEBUG lines
+
+  //bigint irregular_bytes = 2*nprocs*sizeof(int);
+  //irregular_bytes += maxindex*sizeof(int);
+  //irregular_bytes += maxbuf;
 }

 /* ----------------------------------------------------------------------
--- a/src/irregular.h
+++ b/src/irregular.h
@ -33,6 +33,7 @@ class Irregular : protected Pointers {
                     int *procassign = NULL);
  int migrate_check();
  int create_data(int, int *, int sortflag = 0);
+  int create_data_grouped(int, int *, int sortflag = 0);
  void exchange_data(char *, int, char *);
  void destroy_data();
  bigint memory_usage();
@ -48,6 +49,7 @@ class Irregular : protected Pointers {
  double *dbuf;                     // double buf for largest single atom send
  int maxbuf;                       // size of char buf in bytes
  char *buf;                        // char buf for largest single data send
+  int maxindex;                     // combined size of index_send + index_self

  int *mproclist,*msizes;           // persistent vectors in migrate_atoms
  int maxlocal;                     // allocated size of mproclist and msizes
--- a/src/pair_table.cpp
+++ b/src/pair_table.cpp
@ -378,7 +378,7 @@ void PairTable::read_table(Table *tb, char *file, char *keyword)
    utils::sfgets(FLERR,line,MAXLINE,fp,file,error); // no match, skip section
    param_extract(tb,line);
    utils::sfgets(FLERR,line,MAXLINE,fp,file,error);
-    for (int i = 0; i < tb->ninput; i++) 
+    for (int i = 0; i < tb->ninput; i++)
      utils::sfgets(FLERR,line,MAXLINE,fp,file,error);
  }

--- a/src/read_data.cpp
+++ b/src/read_data.cpp
@ -120,6 +120,9 @@ void ReadData::command(int narg, char **arg)
 {
  if (narg < 1) error->all(FLERR,"Illegal read_data command");

+  MPI_Barrier(world);
+  double time1 = MPI_Wtime();
+
  // optional args

  addflag = NONE;
@ -906,6 +909,18 @@ void ReadData::command(int narg, char **arg)

    force->kspace = saved_kspace;
  }
+
+  // total time
+
+  MPI_Barrier(world);
+  double time2 = MPI_Wtime();
+
+  if (comm->me == 0) {
+    if (screen)
+      fprintf(screen,"  read_data CPU = %g secs\n",time2-time1);
+    if (logfile)
+      fprintf(logfile,"  read_data CPU = %g secs\n",time2-time1);
+  }
 }

 /* ----------------------------------------------------------------------
--- a/src/read_dump.cpp
+++ b/src/read_dump.cpp
@ -94,7 +94,7 @@ ReadDump::~ReadDump()

  memory->destroy(fields);
  memory->destroy(buf);
-  
+
  for (int i = 0; i < nreader; i++) delete readers[i];
  delete [] readers;
  delete [] nsnapatoms;
@ -359,7 +359,7 @@ bigint ReadDump::seek(bigint nrequest, int exact)
        readers[i]->skip();
      }

-      if (eofflag) 
+      if (eofflag)
        error->one(FLERR,"Read dump parallel files "
                   "do not all have same timestep");
    }
@ -466,7 +466,7 @@ bigint ReadDump::next(bigint ncurrent, bigint nlast, int nevery, int nskip)
        readers[i]->skip();
      }

-      if (eofflag) 
+      if (eofflag)
        error->one(FLERR,"Read dump parallel files "
                   "do not all have same timestep");
    }
@ -724,7 +724,7 @@ void ReadDump::read_atoms()
      otherproc = 0;
      ofirst = (bigint) otherproc * nsnap/nprocs_cluster;
      olast = (bigint) (otherproc+1) * nsnap/nprocs_cluster;
-      if (olast-ofirst > MAXSMALLINT) 
+      if (olast-ofirst > MAXSMALLINT)
        error->one(FLERR,"Read dump snapshot is too large for a proc");
      nnew = static_cast<int> (olast - ofirst);

@ -765,7 +765,7 @@ void ReadDump::read_atoms()
    } else {
      ofirst = (bigint) me_cluster * nsnap/nprocs_cluster;
      olast = (bigint) (me_cluster+1) * nsnap/nprocs_cluster;
-      if (olast-ofirst > MAXSMALLINT) 
+      if (olast-ofirst > MAXSMALLINT)
        error->one(FLERR,"Read dump snapshot is too large for a proc");
      nnew = static_cast<int> (olast - ofirst);
      if (nnew > maxnew || maxnew == 0) {
@ -791,7 +791,7 @@ void ReadDump::read_atoms()
    bigint sum = 0;
    for (int i = 0; i < nreader; i++)
      sum += nsnapatoms[i];
-    if (sum > MAXSMALLINT) 
+    if (sum > MAXSMALLINT)
      error->one(FLERR,"Read dump snapshot is too large for a proc");
    nnew = static_cast<int> (sum);
    if (nnew > maxnew || maxnew == 0) {
@ -811,7 +811,7 @@ void ReadDump::read_atoms()
      }
      nnew += nsnap;
    }
-  }  
+  }
 }

 /* ----------------------------------------------------------------------
@ -943,7 +943,7 @@ void ReadDump::process_atoms()
        ntrim++;
      } else i++;
    }
-    
+
    atom->nlocal = nlocal;
    bigint nblocal = atom->nlocal;
    MPI_Allreduce(&nblocal,&atom->natoms,1,MPI_LMP_BIGINT,MPI_SUM,world);
--- a/src/read_dump.h
+++ b/src/read_dump.h
@ -86,7 +86,7 @@ private:
  double xlo,xhi,ylo,yhi,zlo,zhi,xy,xz,yz;  // dump snapshot box params
  double xprd,yprd,zprd;

-  bigint *nsnapatoms;       // # of atoms in one snapshot from 
+  bigint *nsnapatoms;       // # of atoms in one snapshot from
                            //   one (parallel) dump file
                            // nreader-length vector b/c a reader proc
                            //   may read from multiple parallel dump files
--- a/src/read_restart.cpp
+++ b/src/read_restart.cpp
@ -81,6 +81,9 @@ void ReadRestart::command(int narg, char **arg)
  if (domain->box_exist)
    error->all(FLERR,"Cannot read_restart after simulation box is defined");

+  MPI_Barrier(world);
+  double time1 = MPI_Wtime();
+
  MPI_Comm_rank(world,&me);
  MPI_Comm_size(world,&nprocs);

@ -562,6 +565,18 @@ void ReadRestart::command(int narg, char **arg)
    Special special(lmp);
    special.build();
  }
+
+  // total time
+
+  MPI_Barrier(world);
+  double time2 = MPI_Wtime();
+
+  if (comm->me == 0) {
+    if (screen)
+      fprintf(screen,"  read_restart CPU = %g secs\n",time2-time1);
+    if (logfile)
+      fprintf(logfile,"  read_restart CPU = %g secs\n",time2-time1);
+  }
 }

 /* ----------------------------------------------------------------------
--- a/src/replicate.cpp
+++ b/src/replicate.cpp
@ -76,7 +76,7 @@ void Replicate::command(int narg, char **arg)
  if (atom->nextra_grow || atom->nextra_restart || atom->nextra_store)
    error->all(FLERR,"Cannot replicate with fixes that store atom quantities");

-  // Record wall time for atom replication
+  // record wall time for atom replication

  MPI_Barrier(world);
  double time1 = MPI_Wtime();
@ -762,15 +762,15 @@ void Replicate::command(int narg, char **arg)
    special.build();
  }

-  // Wall time
+  // total time

  MPI_Barrier(world);
  double time2 = MPI_Wtime();

  if (me == 0) {
    if (screen)
-      fprintf(screen,"  Time spent = %g secs\n",time2-time1);
+      fprintf(screen,"  replicate CPU = %g secs\n",time2-time1);
    if (logfile)
-      fprintf(logfile,"  Time spent = %g secs\n",time2-time1);
+      fprintf(logfile,"  replicate CPU = %g secs\n",time2-time1);
  }
 }
--- a/src/special.cpp
+++ b/src/special.cpp
--- a/src/special.h
+++ b/src/special.h
@ -26,29 +26,43 @@ class Special : protected Pointers {

 private:
  int me,nprocs;
+  int maxall;
  tagint **onetwo,**onethree,**onefour;

-  // data used by ring callback methods
+  // data used by rendezvous callback methods

-  int *count;
-  int **dflag;
+  int nrvous;
+  tagint *atomIDs;
+  int *procowner;
+
+  struct IDRvous {
+    int me;
+    tagint atomID;
+  };
+
+  struct PairRvous {
+    tagint atomID,partnerID;
+  };
+
+  // private methods
+
+  void atom_owners();
+  void onetwo_build_newton();
+  void onetwo_build_newton_off();
+  void onethree_build();
+  void onefour_build();

  void dedup();
  void angle_trim();
  void dihedral_trim();
  void combine();
  void fix_alteration();
+  void timer_output(double);

-  // callback functions for ring communication
+  // callback functions for rendezvous communication

-  static void ring_one(int, char *, void *);
-  static void ring_two(int, char *, void *);
-  static void ring_three(int, char *, void *);
-  static void ring_four(int, char *, void *);
-  static void ring_five(int, char *, void *);
-  static void ring_six(int, char *, void *);
-  static void ring_seven(int, char *, void *);
-  static void ring_eight(int, char *, void *);
+  static int rendezvous_ids(int, char *, int &, int *&, char *&, void *);
+  static int rendezvous_pairs(int, char *, int &, int *&, char *&, void *);
 };

 }