From 31bc11ee34dbab6ed53491121ddddae1e47df768 Mon Sep 17 00:00:00 2001
From: sjplimp <sjplimp@f3b2605a-c512-4ea7-a41b-209d697bcdaa>
Date: Thu, 21 Jun 2012 16:27:12 +0000
Subject: [PATCH] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@8365
 f3b2605a-c512-4ea7-a41b-209d697bcdaa

---
 src/REPLICA/prd.cpp |  14 +--
 src/balance.cpp     | 226 +++++++++++++++++++++++++-------------
 src/balance.h       |  15 +--
 src/fix_balance.cpp | 257 ++++++++++++++++++++++++++++++++++++++++++++
 src/fix_balance.h   |  80 ++++++++++++++
 src/irregular.cpp   | 131 ++++++++++++++++++----
 src/irregular.h     |   3 +-
 src/region.h        |   2 +-
 src/velocity.h      |   2 +-
 9 files changed, 616 insertions(+), 114 deletions(-)
 create mode 100644 src/fix_balance.cpp
 create mode 100644 src/fix_balance.h

diff --git a/src/REPLICA/prd.cpp b/src/REPLICA/prd.cpp
index cd577cba93..4597d974b8 100644
--- a/src/REPLICA/prd.cpp
+++ b/src/REPLICA/prd.cpp
@@ -103,16 +103,18 @@ void PRD::command(int narg, char **arg)
   MPI_Comm_rank(world,&me);
   MPI_Comm_size(world,&nprocs);
 
-  // comm_replica = communicator between same proc across replicas
-  // not used if replicas have unequal number of procs
-  // equal_size_replicas = 1 if all replicas have same # of procs
+  // comm_replica = communicator between all proc 0s across replicas
 
   int color = me;
   MPI_Comm_split(universe->uworld,color,0,&comm_replica);
 
-  flag = 0;
-  if (nreplica*nprocs == nprocs_universe) flag = 1;
-  MPI_Allreduce(&flag,&equal_size_replicas,1,MPI_INT,MPI_MIN,universe->uworld);
+  // equal_size_replicas = 1 if all replicas have same # of procs
+  // no longer used
+
+  //flag = 0;
+  //if (nreplica*nprocs == nprocs_universe) flag = 1;
+  //MPI_Allreduce(&flag,&equal_size_replicas,1,MPI_INT,MPI_MIN,
+  //              universe->uworld);
 
   // workspace for inter-replica communication via gathers
 
diff --git a/src/balance.cpp b/src/balance.cpp
index ecb1fd6187..8e6c61c7c0 100644
--- a/src/balance.cpp
+++ b/src/balance.cpp
@@ -30,9 +30,8 @@ using namespace LAMMPS_NS;
 
 enum{NONE,UNIFORM,USER,DYNAMIC};
 enum{X,Y,Z};
-enum{EXPAND,CONTRACT};
 
-//#define BALANCE_DEBUG 1
+#define BALANCE_DEBUG 1
 
 /* ---------------------------------------------------------------------- */
 
@@ -48,6 +47,7 @@ Balance::Balance(LAMMPS *lmp) : Pointers(lmp)
   dflag = 0;
 
   fp = NULL;
+  firststep = 1;
 }
 
 /* ---------------------------------------------------------------------- */
@@ -69,6 +69,8 @@ Balance::~Balance()
     delete [] onecount;
     delete [] lo;
     delete [] hi;
+    delete [] losum;
+    delete [] hisum;
   }
 
   if (fp) fclose(fp);
@@ -94,7 +96,6 @@ void Balance::command(int narg, char **arg)
   xflag = yflag = zflag = NONE;
   dflag = 0;
   outflag = 0;
-  laststep = -1;
 
   int iarg = 0;
   while (iarg < narg) {
@@ -162,8 +163,8 @@ void Balance::command(int narg, char **arg)
       if (dimension == 3) zflag = DYNAMIC;
       if (strlen(arg[iarg+1]) > 3) error->all(FLERR,"Illegal balance command");
       strcpy(bstr,arg[iarg+1]);
-      niter = atoi(arg[iarg+2]);
-      if (niter <= 0) error->all(FLERR,"Illegal balance command");
+      nitermax = atoi(arg[iarg+2]);
+      if (nitermax <= 0) error->all(FLERR,"Illegal balance command");
       thresh = atof(arg[iarg+3]);
       if (thresh < 1.0) error->all(FLERR,"Illegal balance command");
       iarg += 4;
@@ -229,10 +230,10 @@ void Balance::command(int narg, char **arg)
   // debug output of initial state
 
 #ifdef BALANCE_DEBUG
-  dumpout(update->ntimestep);
+  if (me == 0 && fp) dumpout(update->ntimestep,fp);
 #endif
 
-  int iter = 0;
+  int niter = 0;
 
   // explicit setting of sub-domain sizes
 
@@ -266,13 +267,13 @@ void Balance::command(int narg, char **arg)
   // static load-balance of sub-domain sizes
 
   if (dflag) {
-    dynamic_setup(bstr);
-    iter = dynamic_once();
+    static_setup(bstr);
+    niter = dynamic();
   }
 
   // output of final result
 
-  if (outflag) dumpout(update->ntimestep);
+  if (outflag && me == 0) dumpout(update->ntimestep,fp);
 
   // reset comm->uniform flag if necessary
 
@@ -321,14 +322,14 @@ void Balance::command(int narg, char **arg)
 
   if (me == 0) {
     if (screen) {
-      fprintf(screen,"  iteration count = %d\n",iter);
+      fprintf(screen,"  iteration count = %d\n",niter);
       fprintf(screen,"  initial/final max atoms/proc = %d %d\n",
               maxinit,maxfinal);
       fprintf(screen,"  initial/final imbalance factor = %g %g\n",
               imbinit,imbfinal);
     }
     if (logfile) {
-      fprintf(logfile,"  iteration count = %d\n",iter);
+      fprintf(logfile,"  iteration count = %d\n",niter);
       fprintf(logfile,"  initial/final max atoms/proc = %d %d\n",
               maxinit,maxfinal);
       fprintf(logfile,"  initial/final imbalance factor = %g %g\n",
@@ -424,9 +425,10 @@ double Balance::imbalance_splits(int &max)
 /* ----------------------------------------------------------------------
    setup static load balance operations
    called from command
+   set rho = 0 for static balancing
 ------------------------------------------------------------------------- */
 
-void Balance::dynamic_setup(char *str)
+void Balance::static_setup(char *str)
 {
   ndim = strlen(str);
   bdim = new int[ndim];
@@ -446,27 +448,35 @@ void Balance::dynamic_setup(char *str)
   target = new bigint[max+1];
   lo = new double[max+1];
   hi = new double[max+1];
+  losum = new bigint[max+1];
+  hisum = new bigint[max+1];
+
+  rho = 0;
 }
 
 /* ----------------------------------------------------------------------
    setup dynamic load balance operations
    called from fix balance
+   set rho = 1 for dynamic balancing after call to dynamic_setup()
 ------------------------------------------------------------------------- */
 
-void Balance::dynamic_setup(char *str, int niter_in, double thresh_in)
+void Balance::dynamic_setup(char *str, int nitermax_in, double thresh_in)
 {
-  niter = niter_in;
+  dflag = 1;
+  static_setup(str);
+  nitermax = nitermax_in;
   thresh = thresh_in;
-  dynamic_setup(str);
+
+  rho = 1;
 }
 
 /* ----------------------------------------------------------------------
-   perform static load balance by changing xyz split proc boundaries in Comm
-   called from command
-   return actual iteration count
+   load balance by changing xyz split proc boundaries in Comm
+   called one time from input script command or many times from fix balance
+   return niter = iteration count
 ------------------------------------------------------------------------- */
 
-int Balance::dynamic_once()
+int Balance::dynamic()
 {
   int i,j,k,m,np,max;
   double *split;
@@ -488,7 +498,7 @@ int Balance::dynamic_once()
 
   // loop over dimensions in balance string
 
-  int iter = 0;
+  int niter = 0;
   for (int idim = 0; idim < ndim; idim++) {
 
     // split = ptr to xyz split in Comm
@@ -513,30 +523,40 @@ int Balance::dynamic_once()
 
     lo[0] = hi[0] = 0.0;
     lo[np] = hi[np] = 1.0;
+    losum[0] = hisum[0] = 0;
+    losum[np] = hisum[np] = natoms;
+
     for (i = 1; i < np; i++) {
       for (j = i; j >= 0; j--)
         if (sum[j] <= target[i]) {
           lo[i] = split[j];
+          losum[i] = sum[j];
           break;
         }
       for (j = i; j <= np; j++)
         if (sum[j] >= target[i]) {
           hi[i] = split[j];
+          hisum[i] = sum[j];
           break;
         }
     }
 
     // iterate until balanced
 
+#ifdef BALANCE_DEBUG
+    if (me == 0) debug_output(idim,0,np,split);
+#endif
+
     int doneflag;
     int change = 1;
-    for (m = 0; m < niter; m++) {
+    for (m = 0; m < nitermax; m++) {
       change = adjust(np,split);
       tally(bdim[idim],np,split);
-      iter++;
+      niter++;
 
 #ifdef BALANCE_DEBUG
-      dumpout(update->ntimestep);
+      if (me == 0) debug_output(idim,m+1,np,split);
+      if (me == 0 && fp) dumpout(update->ntimestep,fp);
 #endif
 
       // stop if no change in splits, b/c all targets are met exactly
@@ -554,8 +574,8 @@ int Balance::dynamic_once()
       if (doneflag) break;
     }
 
-    // eliminate adjacent splits that are duplicates
-    // can happen if particle distribution is narrow and Niter is small
+    // eliminate final adjacent splits that are duplicates
+    // can happen if particle distribution is narrow and Nitermax is small
     // set lo = midpt between splits
     // spread duplicates out evenly between bounding midpts with non-duplicates
     // i,j = lo/hi indices of set of duplicate splits
@@ -592,7 +612,7 @@ int Balance::dynamic_once()
     if (bad) error->all(FLERR,"Balance produced bad splits");
     /*
       if (me == 0) {
-      printf("BAD SPLITS %d %d %d\n",np+1,iter,delta);
+      printf("BAD SPLITS %d %d %d\n",np+1,niter,delta);
       for (i = 0; i < np+1; i++)
       printf(" %g",split[i]);
       printf("\n");
@@ -610,18 +630,7 @@ int Balance::dynamic_once()
 
   domain->lamda2x(atom->nlocal);
 
-  return iter;
-}
-
-/* ----------------------------------------------------------------------
-   perform dynamic load balance by changing xyz split proc boundaries in Comm
-   called from fix balance
-   return actual iteration count
-------------------------------------------------------------------------- */
-
-int Balance::dynamic()
-{
-  return 0;
+  return niter;
 }
 
 /* ----------------------------------------------------------------------
@@ -670,6 +679,7 @@ void Balance::tally(int dim, int n, double *split)
 int Balance::adjust(int n, double *split)
 {
   int i;
+  double fraction;
 
   // reset lo/hi based on current sum and splits
   // insure lo is monotonically increasing, ties are OK
@@ -678,19 +688,35 @@ int Balance::adjust(int n, double *split)
   // to possibly tighten bounds on lo/hi
 
   for (i = 1; i < n; i++) {
-    if (sum[i] <= target[i]) lo[i] = split[i];
-    if (sum[i] >= target[i]) hi[i] = split[i];
+    if (sum[i] <= target[i]) {
+      lo[i] = split[i];
+      losum[i] = sum[i];
+    }
+    if (sum[i] >= target[i]) {
+      hi[i] = split[i];
+      hisum[i] = sum[i];
+    }
   }
   for (i = 1; i < n; i++)
-    if (lo[i] < lo[i-1]) lo[i] = lo[i-1];
+    if (lo[i] < lo[i-1]) {
+      lo[i] = lo[i-1];
+      losum[i] = losum[i-1];
+    }
   for (i = n-1; i > 0; i--)
-    if (hi[i] > hi[i+1]) hi[i] = hi[i+1];
+    if (hi[i] > hi[i+1]) {
+      hi[i] = hi[i+1];
+      hisum[i] = hisum[i+1];
+    }
   
   int change = 0;
   for (int i = 1; i < n; i++)
     if (sum[i] != target[i]) {
-      split[i] = 0.5 * (lo[i]+hi[i]);
       change = 1;
+      if (rho == 0) split[i] = 0.5 * (lo[i]+hi[i]);
+      else {
+        fraction = 1.0*(target[i]-losum[i]) / (hisum[i]-losum[i]);
+        split[i] = lo[i] + fraction * (hi[i]-lo[i]);
+      }
     }
   return change;
 }
@@ -840,33 +866,28 @@ int Balance::binary(double value, int n, double *vec)
 }
 
 /* ----------------------------------------------------------------------
-   create dump file of line segments in Pizza.py mdump mesh format
-   invoked by out option or debug flag
-   debug flag requires out flag to specify file
+   write dump snapshot of line segments in Pizza.py mdump mesh format
    write xy lines around each proc's sub-domain for 2d
    write xyz cubes around each proc's sub-domain for 3d
-   all procs but 0 just return
+   only called by proc 0
 ------------------------------------------------------------------------- */
 
-void Balance::dumpout(bigint tstep)
+void Balance::dumpout(bigint tstep, FILE *bfp)
 {
-  if (me) return;
-  if (fp == NULL) error->one(FLERR,"Balance output requires out keyword");
-
   int dimension = domain->dimension;
 
   // write out one square/cube per processor for 2d/3d
   // only write once since topology is static
 
-  if (tstep != laststep) {
-    laststep = tstep;
-    fprintf(fp,"ITEM: TIMESTEP\n");
-    fprintf(fp,"%ld\n",tstep);
-    if (dimension == 2) fprintf(fp,"ITEM: NUMBER OF SQUARES\n");
-    else fprintf(fp,"ITEM: NUMBER OF CUBES\n");
-    fprintf(fp,"%d\n",nprocs);
-    if (dimension == 2) fprintf(fp,"ITEM: SQUARES\n");
-    else fprintf(fp,"ITEM: CUBES\n");
+  if (firststep) {
+    firststep = 0;
+    fprintf(bfp,"ITEM: TIMESTEP\n");
+    fprintf(bfp,BIGINT_FORMAT "\n",tstep);
+    if (dimension == 2) fprintf(bfp,"ITEM: NUMBER OF SQUARES\n");
+    else fprintf(bfp,"ITEM: NUMBER OF CUBES\n");
+    fprintf(bfp,"%d\n",nprocs);
+    if (dimension == 2) fprintf(bfp,"ITEM: SQUARES\n");
+    else fprintf(bfp,"ITEM: CUBES\n");
     
     int nx = comm->procgrid[0] + 1;
     int ny = comm->procgrid[1] + 1;
@@ -880,7 +901,7 @@ void Balance::dumpout(bigint tstep)
           int c2 = c1 + 1;
           int c3 = c2 + nx;
           int c4 = c3 - 1;
-          fprintf(fp,"%d %d %d %d %d %d\n",m+1,m+1,c1,c2,c3,c4);
+          fprintf(bfp,"%d %d %d %d %d %d\n",m+1,m+1,c1,c2,c3,c4);
           m++;
         }
       
@@ -897,7 +918,7 @@ void Balance::dumpout(bigint tstep)
             int c6 = c2 + ny*nx;
             int c7 = c3 + ny*nx;
             int c8 = c4 + ny*nx;
-            fprintf(fp,"%d %d %d %d %d %d %d %d %d %d\n",
+            fprintf(bfp,"%d %d %d %d %d %d %d %d %d %d\n",
                     m+1,m+1,c1,c2,c3,c4,c5,c6,c7,c8);
             m++;
           }
@@ -916,22 +937,22 @@ void Balance::dumpout(bigint tstep)
   double *boxhi = domain->boxhi;
   double *prd = domain->prd;
 
-  fprintf(fp,"ITEM: TIMESTEP\n");
-  fprintf(fp,"%ld\n",tstep);
-  fprintf(fp,"ITEM: NUMBER OF NODES\n");
-  if (dimension == 2) fprintf(fp,"%d\n",nx*ny);
-  else fprintf(fp,"%d\n",nx*ny*nz);
-  fprintf(fp,"ITEM: BOX BOUNDS\n");
-  fprintf(fp,"%g %g\n",boxlo[0],boxhi[0]);
-  fprintf(fp,"%g %g\n",boxlo[1],boxhi[1]);
-  fprintf(fp,"%g %g\n",boxlo[2],boxhi[2]);
-  fprintf(fp,"ITEM: NODES\n");
+  fprintf(bfp,"ITEM: TIMESTEP\n");
+  fprintf(bfp,BIGINT_FORMAT "\n",tstep);
+  fprintf(bfp,"ITEM: NUMBER OF NODES\n");
+  if (dimension == 2) fprintf(bfp,"%d\n",nx*ny);
+  else fprintf(bfp,"%d\n",nx*ny*nz);
+  fprintf(bfp,"ITEM: BOX BOUNDS\n");
+  fprintf(bfp,"%g %g\n",boxlo[0],boxhi[0]);
+  fprintf(bfp,"%g %g\n",boxlo[1],boxhi[1]);
+  fprintf(bfp,"%g %g\n",boxlo[2],boxhi[2]);
+  fprintf(bfp,"ITEM: NODES\n");
 
   if (dimension == 2) {
     int m = 0;
     for (int j = 0; j < ny; j++)
       for (int i = 0; i < nx; i++) {
-        fprintf(fp,"%d %d %g %g %g\n",m+1,1,
+        fprintf(bfp,"%d %d %g %g %g\n",m+1,1,
                 boxlo[0] + prd[0]*comm->xsplit[i],
                 boxlo[1] + prd[1]*comm->ysplit[j],
                 0.0);
@@ -942,7 +963,7 @@ void Balance::dumpout(bigint tstep)
     for (int k = 0; k < nz; k++)
       for (int j = 0; j < ny; j++)
         for (int i = 0; i < nx; i++) {
-          fprintf(fp,"%d %d %g %g %g\n",m+1,1,
+          fprintf(bfp,"%d %d %g %g %g\n",m+1,1,
                   boxlo[0] + prd[0]*comm->xsplit[i],
                   boxlo[1] + prd[1]*comm->ysplit[j],
                   boxlo[2] + prd[2]*comm->zsplit[j]);
@@ -950,3 +971,58 @@ void Balance::dumpout(bigint tstep)
       }
   }
 }
+
+/* ----------------------------------------------------------------------
+   debug output for Idim and count
+   only called by proc 0
+------------------------------------------------------------------------- */
+
+void Balance::debug_output(int idim, int m, int np, double *split)
+{
+  int i;
+  const char *dim;
+
+  double *boxlo = domain->boxlo;
+  double *prd = domain->prd;
+
+  if (bdim[idim] == X) dim = "X";
+  else if (bdim[idim] == Y) dim = "Y";
+  else if (bdim[idim] == Z) dim = "Z";
+  printf("Dimension %s, Iteration %d\n",dim,m);
+
+  printf("  Count:");
+  for (i = 0; i < np; i++) printf(" " BIGINT_FORMAT,count[i]);
+  printf("\n");
+  printf("  Sum:");
+  for (i = 0; i <= np; i++) printf(" " BIGINT_FORMAT,sum[i]);
+  printf("\n");
+  printf("  Target:");
+  for (i = 0; i <= np; i++) printf(" " BIGINT_FORMAT,target[i]);
+  printf("\n");
+  printf("  Actual cut:");
+  for (i = 0; i <= np; i++) 
+    printf(" %g",boxlo[bdim[idim]] + split[i]*prd[bdim[idim]]);
+  printf("\n");
+  printf("  Split:");
+  for (i = 0; i <= np; i++) printf(" %g",split[i]);
+  printf("\n");
+  printf("  Low:");
+  for (i = 0; i <= np; i++) printf(" %g",lo[i]);
+  printf("\n");
+  printf("  Low-sum:");
+  for (i = 0; i <= np; i++) printf(" " BIGINT_FORMAT,losum[i]);
+  printf("\n");
+  printf("  Hi:");
+  for (i = 0; i <= np; i++) printf(" %g",hi[i]);
+  printf("\n");
+  printf("  Hi-sum:");
+  for (i = 0; i <= np; i++) printf(" " BIGINT_FORMAT,hisum[i]);
+  printf("\n");
+  printf("  Delta:");
+  for (i = 0; i < np; i++) printf(" %g",split[i+1]-split[i]);
+  printf("\n");
+
+  bigint max = 0;
+  for (i = 0; i < np; i++) max = MAX(max,count[i]);
+  printf("  Imbalance factor: %g\n",1.0*max*np/target[np]);
+}
diff --git a/src/balance.h b/src/balance.h
index 77dde67678..fa9a4ac8af 100644
--- a/src/balance.h
+++ b/src/balance.h
@@ -20,7 +20,6 @@ CommandStyle(balance,Balance)
 #ifndef LMP_BALANCE_H
 #define LMP_BALANCE_H
 
-#include "mpi.h"
 #include "stdio.h"
 #include "pointers.h"
 
@@ -34,6 +33,7 @@ class Balance : protected Pointers {
   void dynamic_setup(char *, int, double);
   int dynamic();
   double imbalance_nlocal(int &);
+  void dumpout(bigint, FILE *);
 
  private:
   int me,nprocs;
@@ -42,7 +42,7 @@ class Balance : protected Pointers {
   double *user_xsplit,*user_ysplit,*user_zsplit;    // params for xyz LB
 
   int dflag;                 // dynamic LB flag
-  int niter;                 // params for dynamic LB
+  int nitermax;              // params for dynamic LB
   double thresh;
   char bstr[4];
 
@@ -53,23 +53,24 @@ class Balance : protected Pointers {
   bigint *sum;               // cummulative count for slices in one dim
   bigint *target;            // target sum for slices in one dim
   double *lo,*hi;            // lo/hi split coords that bound each target
+  bigint *losum,*hisum;      // cummulative counts at lo/hi coords
+  int rho;                   // 0 for geometric recursion
+                             // 1 for density weighted recursion
 
   int *proccount;            // particle count per processor
   int *allproccount;
 
   int outflag;               // for output of balance results to file
   FILE *fp;
-  bigint laststep;
+  int firststep;
 
-  void dynamic_setup(char *);
-  int dynamic_once();
+  void static_setup(char *);
   double imbalance_splits(int &);
   void tally(int, int, double *);
   int adjust(int, double *);
   void old_adjust(int, int, bigint *, double *);
   int binary(double, int, double *);
-
-  void dumpout(bigint);          // for debug output
+  void debug_output(int, int, int, double *);
 };
 
 }
diff --git a/src/fix_balance.cpp b/src/fix_balance.cpp
new file mode 100644
index 0000000000..d54796e66d
--- /dev/null
+++ b/src/fix_balance.cpp
@@ -0,0 +1,257 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "string.h"
+#include "stdlib.h"
+#include "fix_balance.h"
+#include "balance.h"
+#include "update.h"
+#include "domain.h"
+#include "atom.h"
+#include "comm.h"
+#include "irregular.h"
+#include "force.h"
+#include "kspace.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+using namespace FixConst;
+
+/* ---------------------------------------------------------------------- */
+
+FixBalance::FixBalance(LAMMPS *lmp, int narg, char **arg) :
+  Fix(lmp, narg, arg)
+{
+  if (narg < 7) error->all(FLERR,"Illegal fix balance command");
+
+  box_change = 1;
+  scalar_flag = 1;
+  extscalar = 0;
+  vector_flag = 1;
+  size_vector = 3;
+  extvector = 0;
+  global_freq = 1;
+
+  // parse arguments
+
+  int dimension = domain->dimension;
+
+  nevery = atoi(arg[3]);
+  if (strlen(arg[4]) > 3) error->all(FLERR,"Illegal fix balance command");
+  strcpy(bstr,arg[4]);
+  nitermax = atoi(arg[5]);
+  thresh = atof(arg[6]);
+
+  if (nevery < 0 || nitermax <= 0 || thresh < 1.0) 
+    error->all(FLERR,"Illegal fix balance command");
+
+  for (int i = 0; i < strlen(bstr); i++) {
+    if (bstr[i] != 'x' && bstr[i] != 'y' && bstr[i] != 'z') 
+      error->all(FLERR,"Fix balance string is invalid");
+    if (bstr[i] == 'z' && dimension == 2) 
+      error->all(FLERR,"Fix balance string is invalid for 2d simulation");
+    for (int j = i+1; j < strlen(bstr); j++)
+      if (bstr[i] == bstr[j])
+        error->all(FLERR,"Fix balance string is invalid");
+  }
+
+  // optional args
+
+  int outarg = 0;
+  fp = NULL;
+
+  int iarg = 7;
+  while (iarg < narg) {
+    if (strcmp(arg[iarg],"out") == 0) {
+      if (iarg+2 > narg) error->all(FLERR,"Illegal fix balance command");
+      outarg = iarg+1;
+      iarg += 2;
+    } else error->all(FLERR,"Illegal fix balance command");
+  }
+
+  // create instance of Balance class and initialize it with params
+  // create instance of Irregular class
+
+  balance = new Balance(lmp);
+  balance->dynamic_setup(bstr,nitermax,thresh);
+  irregular = new Irregular(lmp);
+
+  // output file
+
+  if (outarg && comm->me == 0) {
+    fp = fopen(arg[outarg],"w");
+    if (fp == NULL) error->one(FLERR,"Cannot open fix balance output file");
+  }
+
+  // only force reneighboring if nevery > 0
+
+  if (nevery) force_reneighbor = 1;
+
+  // compute initial outputs
+
+  imbfinal = imbprev = balance->imbalance_nlocal(maxperproc);
+  itercount = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+FixBalance::~FixBalance()
+{
+  if (fp) fclose(fp);
+  delete balance;
+  delete irregular;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int FixBalance::setmask()
+{
+  int mask = 0;
+  mask |= PRE_EXCHANGE;
+  return mask;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixBalance::init()
+{
+  // don't allow PPPM for now
+
+  if (force->kspace && strstr(force->kspace_style,"pppm"))
+    error->all(FLERR,"Cannot yet use fix balance with PPPM");
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixBalance::setup_pre_exchange()
+{
+  // insure atoms are in current box & update box via shrink-wrap
+  // no exchange() since doesn't matter if atoms are assigned to correct procs
+
+  if (domain->triclinic) domain->x2lamda(atom->nlocal);
+  domain->pbc();
+  domain->reset_box();
+  if (domain->triclinic) domain->lamda2x(atom->nlocal);
+
+  // perform a rebalance if threshhold exceeded
+
+  imbnow = balance->imbalance_nlocal(maxperproc);
+  if (imbnow > thresh) rebalance();
+
+  // next_reneighbor = next time to force reneighboring
+
+  if (nevery) next_reneighbor = (update->ntimestep/nevery)*nevery + nevery;
+}
+
+/* ----------------------------------------------------------------------
+   perform dynamic load balancing
+------------------------------------------------------------------------- */
+
+void FixBalance::pre_exchange()
+{
+  // return if not a rebalance timestep
+
+  if (nevery && update->ntimestep < next_reneighbor) return;
+
+  // insure atoms are in current box & update box via shrink-wrap
+  // no exchange() since doesn't matter if atoms are assigned to correct procs
+
+  if (domain->triclinic) domain->x2lamda(atom->nlocal);
+  domain->pbc();
+  domain->reset_box();
+  if (domain->triclinic) domain->lamda2x(atom->nlocal);
+
+  // return if imbalance < threshhold
+
+  imbnow = balance->imbalance_nlocal(maxperproc);
+  if (imbnow <= thresh) {
+    if (nevery) next_reneighbor = (update->ntimestep/nevery)*nevery + nevery;
+    return;
+  }
+
+  rebalance();
+
+  // next timestep to rebalance
+
+  if (nevery) next_reneighbor = (update->ntimestep/nevery)*nevery + nevery;
+}
+
+/* ----------------------------------------------------------------------
+   perform dynamic load balancing
+------------------------------------------------------------------------- */
+
+void FixBalance::rebalance()
+{
+  imbprev = imbnow;
+  itercount = balance->dynamic();
+
+  // output of final result
+  
+  if (fp) balance->dumpout(update->ntimestep,fp);
+
+  // reset comm->uniform flag
+
+  comm->uniform = 0;
+
+  // reset proc sub-domains
+
+  if (domain->triclinic) domain->set_lamda_box();
+  domain->set_local_box();
+
+  // if splits moved further than neighboring processor
+  // move atoms to new processors via irregular()
+  // only needed if migrate_check() says an atom moves to far,
+  // else allow comm->exchange that follows in caller to do it
+  
+  if (domain->triclinic) domain->x2lamda(atom->nlocal);
+  if (irregular->migrate_check()) irregular->migrate_atoms();
+  if (domain->triclinic) domain->lamda2x(atom->nlocal);
+
+  // NOTE: still to be implemented
+  // check that sub-domains are valid with KSpace constraints
+  // if (kspace_flag) force->kspace->check();
+
+  // imbfinal = final imbalance based on final nlocal
+
+  imbfinal = balance->imbalance_nlocal(maxperproc);
+}
+
+/* ----------------------------------------------------------------------
+   return imbalance factor after last rebalance
+------------------------------------------------------------------------- */
+
+double FixBalance::compute_scalar()
+{
+  return imbfinal;
+}
+
+/* ----------------------------------------------------------------------
+   return stats for last rebalance
+------------------------------------------------------------------------- */
+
+double FixBalance::compute_vector(int i)
+{
+  if (i == 0) return (double) maxperproc;
+  if (i == 1) return (double) itercount;
+  return imbprev;
+}
+
+/* ----------------------------------------------------------------------
+   return # of bytes of allocated memory
+------------------------------------------------------------------------- */
+
+double FixBalance::memory_usage()
+{
+  double bytes = irregular->memory_usage();
+  return bytes;
+}
diff --git a/src/fix_balance.h b/src/fix_balance.h
new file mode 100644
index 0000000000..74afd965c8
--- /dev/null
+++ b/src/fix_balance.h
@@ -0,0 +1,80 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+
+FixStyle(balance,FixBalance)
+
+#else
+
+#ifndef LMP_FIX_BALANCE_H
+#define LMP_FIX_BALANCE_H
+
+#include "stdio.h"
+#include "fix.h"
+
+namespace LAMMPS_NS {
+
+class FixBalance : public Fix {
+ public:
+  FixBalance(class LAMMPS *, int, char **);
+  ~FixBalance();
+  int setmask();
+  void init();
+  void setup_pre_exchange();
+  void pre_exchange();
+  double compute_scalar();
+  double compute_vector(int);
+  double memory_usage();
+
+ private:
+  int nevery,nitermax;
+  char bstr[3];
+  double thresh;
+  FILE *fp;
+
+  double imbnow;                // current imbalance factor
+  double imbprev;               // imbalance factor before last rebalancing
+  double imbfinal;              // imbalance factor after last rebalancing
+  int maxperproc;               // max atoms on any processor
+  int itercount;                // iteration count of last call to Balance
+  int kspace_flag;              // 1 if KSpace solver defined
+
+  class Balance *balance;
+  class Irregular *irregular;
+
+  void rebalance();
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Illegal ... command
+
+Self-explanatory.  Check the input script syntax and compare to the
+documentation for the command.  You can use -echo screen as a
+command-line option when running LAMMPS to see the offending line.
+
+E: Fix balance string is invalid
+
+The string can only contain the characters "x", "y", or "z".
+
+E: Fix balance string is invalid for 2d simulation
+
+The string cannot contain the letter "z".
+
+*/
diff --git a/src/irregular.cpp b/src/irregular.cpp
index 5e28aa554d..a6888ef3dc 100644
--- a/src/irregular.cpp
+++ b/src/irregular.cpp
@@ -117,13 +117,14 @@ void Irregular::migrate_atoms()
   int nsendatom = 0;
   int *sizes = new int[nlocal];
   int *proclist = new int[nlocal];
+  int igx,igy,igz;
 
   int i = 0;
   while (i < nlocal) {
     if (x[i][0] < sublo[0] || x[i][0] >= subhi[0] ||
         x[i][1] < sublo[1] || x[i][1] >= subhi[1] ||
         x[i][2] < sublo[2] || x[i][2] >= subhi[2]) {
-      proclist[nsendatom] = coord2proc(x[i]);
+      proclist[nsendatom] = coord2proc(x[i],igx,igy,igz);
       if (proclist[nsendatom] != me) {
         if (nsend > maxsend) grow_send(nsend,1);
         sizes[nsendatom] = avec->pack_exchange(i,&buf_send[nsend]);
@@ -157,6 +158,90 @@ void Irregular::migrate_atoms()
   if (map_style) atom->map_set();
 }
 
+/* ----------------------------------------------------------------------
+   check if any atoms need to migrate further than one proc away in any dim
+   if not, caller can decide to use comm->exchange() instead
+   atoms must be remapped to be inside simulation box before this is called
+   for triclinic: atoms must be in lamda coords (0-1) before this is called
+   return 1 if migrate required, 0 if not
+------------------------------------------------------------------------- */
+
+int Irregular::migrate_check()
+{
+  return 1;
+
+  // subbox bounds for orthogonal or triclinic box
+  // other comm/domain data used by coord2proc()
+
+  double *sublo,*subhi;
+  if (triclinic == 0) {
+    sublo = domain->sublo;
+    subhi = domain->subhi;
+  } else {
+    sublo = domain->sublo_lamda;
+    subhi = domain->subhi_lamda;
+  }
+
+  uniform = comm->uniform;
+  xsplit = comm->xsplit;
+  ysplit = comm->ysplit;
+  zsplit = comm->zsplit;
+  boxlo = domain->boxlo;
+  prd = domain->prd;
+
+  // loop over atoms, check for any that are not in my sub-box
+  // assign which proc it belongs to via coord2proc()
+  // if logical igx,igy,igz of newproc > one away from myloc, set flag = 1
+  // this check needs to observe PBC
+  // cannot check via comm->procneigh since it ignores PBC
+
+  AtomVec *avec = atom->avec;
+  double **x = atom->x;
+  int nlocal = atom->nlocal;
+
+  int *periodicity = domain->periodicity;
+  int *myloc = comm->myloc;
+  int *procgrid = comm->procgrid;
+  int newproc,igx,igy,igz,glo,ghi;
+
+  int flag = 0;
+  for (int i = 0; i < nlocal; i++) {
+    if (x[i][0] < sublo[0] || x[i][0] >= subhi[0] ||
+        x[i][1] < sublo[1] || x[i][1] >= subhi[1] ||
+        x[i][2] < sublo[2] || x[i][2] >= subhi[2]) {
+      newproc = coord2proc(x[i],igx,igy,igz);
+
+      glo = myloc[0] - 1;
+      ghi = myloc[0] + 1;
+      if (periodicity[0]) {
+        if (glo < 0) glo = procgrid[0] - 1;
+        if (ghi >= procgrid[0]) ghi = 0;
+      }
+      if (igx != myloc[0] && igx != glo && igx != ghi) flag = 1;
+
+      glo = myloc[1] - 1;
+      ghi = myloc[1] + 1;
+      if (periodicity[1]) {
+        if (glo < 0) glo = procgrid[1] - 1;
+        if (ghi >= procgrid[1]) ghi = 0;
+      }
+      if (igx != myloc[1] && igx != glo && igx != ghi) flag = 1;
+
+      glo = myloc[2] - 1;
+      ghi = myloc[2] + 1;
+      if (periodicity[2]) {
+        if (glo < 0) glo = procgrid[2] - 1;
+        if (ghi >= procgrid[2]) ghi = 0;
+      }
+      if (igx != myloc[2] && igx != glo && igx != ghi) flag = 1;
+    }
+  }
+
+  int flagall;
+  MPI_Allreduce(&flag,&flagall,1,MPI_INT,MPI_MAX,world);
+  return flagall;
+}
+
 /* ----------------------------------------------------------------------
    create a communication plan for atoms
    n = # of atoms to send
@@ -613,43 +698,43 @@ void Irregular::destroy_data()
    x will be in box (orthogonal) or lamda coords (triclinic)
    for uniform = 1, directly calculate owning proc
    for non-uniform, iteratively find owning proc via binary search
+   return owning proc ID via grid2proc
+   return igx,igy,igz = logical grid loc of owing proc within 3d grid of procs
 ------------------------------------------------------------------------- */
 
-int Irregular::coord2proc(double *x)
+int Irregular::coord2proc(double *x, int &igx, int &igy, int &igz)
 {
-  int loc[3];
-
   if (uniform) {
     if (triclinic == 0) {
-      loc[0] = static_cast<int> (procgrid[0] * (x[0]-boxlo[0]) / prd[0]);
-      loc[1] = static_cast<int>        (procgrid[1] * (x[1]-boxlo[1]) / prd[1]);
-      loc[2] = static_cast<int>        (procgrid[2] * (x[2]-boxlo[2]) / prd[2]);
+      igx = static_cast<int> (procgrid[0] * (x[0]-boxlo[0]) / prd[0]);
+      igy = static_cast<int> (procgrid[1] * (x[1]-boxlo[1]) / prd[1]);
+      igz = static_cast<int> (procgrid[2] * (x[2]-boxlo[2]) / prd[2]);
     } else {
-      loc[0] = static_cast<int> (procgrid[0] * x[0]);
-      loc[1] = static_cast<int> (procgrid[1] * x[1]);
-      loc[2] = static_cast<int> (procgrid[2] * x[2]);
+      igx = static_cast<int> (procgrid[0] * x[0]);
+      igy = static_cast<int> (procgrid[1] * x[1]);
+      igz = static_cast<int> (procgrid[2] * x[2]);
     }
 
   } else {
     if (triclinic == 0) {
-      loc[0] = binary((x[0]-boxlo[0])/prd[0],procgrid[0],xsplit);
-      loc[1] = binary((x[1]-boxlo[1])/prd[1],procgrid[1],ysplit);
-      loc[2] = binary((x[2]-boxlo[2])/prd[2],procgrid[2],zsplit);
+      igx = binary((x[0]-boxlo[0])/prd[0],procgrid[0],xsplit);
+      igy = binary((x[1]-boxlo[1])/prd[1],procgrid[1],ysplit);
+      igz = binary((x[2]-boxlo[2])/prd[2],procgrid[2],zsplit);
     } else {
-      loc[0] = binary(x[0],procgrid[0],xsplit);
-      loc[1] = binary(x[1],procgrid[1],ysplit);
-      loc[2] = binary(x[2],procgrid[2],zsplit);
+      igx = binary(x[0],procgrid[0],xsplit);
+      igy = binary(x[1],procgrid[1],ysplit);
+      igz = binary(x[2],procgrid[2],zsplit);
     }
   }
 
-  if (loc[0] < 0) loc[0] = 0;
-  if (loc[0] >= procgrid[0]) loc[0] = procgrid[0] - 1;
-  if (loc[1] < 0) loc[1] = 0;
-  if (loc[1] >= procgrid[1]) loc[1] = procgrid[1] - 1;
-  if (loc[2] < 0) loc[2] = 0;
-  if (loc[2] >= procgrid[2]) loc[2] = procgrid[2] - 1;
+  if (igx < 0) igx = 0;
+  if (igx >= procgrid[0]) igx = procgrid[0] - 1;
+  if (igy < 0) igy = 0;
+  if (igy >= procgrid[1]) igy = procgrid[1] - 1;
+  if (igz < 0) igz = 0;
+  if (igz >= procgrid[2]) igz = procgrid[2] - 1;
 
-  return grid2proc[loc[0]][loc[1]][loc[2]];
+  return grid2proc[igx][igy][igz];
 }
 
 /* ----------------------------------------------------------------------
diff --git a/src/irregular.h b/src/irregular.h
index c3ebcea6c6..2581551c0f 100644
--- a/src/irregular.h
+++ b/src/irregular.h
@@ -23,6 +23,7 @@ class Irregular : protected Pointers {
   Irregular(class LAMMPS *);
   ~Irregular();
   void migrate_atoms();
+  int migrate_check();
   int create_data(int, int *);
   void exchange_data(char *, int, char *);
   void destroy_data();
@@ -84,7 +85,7 @@ class Irregular : protected Pointers {
   int create_atom(int, int *, int *);
   void exchange_atom(double *, int *, double *);
   void destroy_atom();
-  int coord2proc(double *);
+  int coord2proc(double *, int &, int &, int &);
   int binary(double, int, double *);
 
   void grow_send(int,int);          // reallocate send buffer
diff --git a/src/region.h b/src/region.h
index 9b04bfd9c3..2406dc186f 100644
--- a/src/region.h
+++ b/src/region.h
@@ -97,7 +97,7 @@ The sub-regions can be dynamic, but not the combined region.
 
 E: Use of region with undefined lattice
 
-If scale = lattice (the default) for the region command, then a
+If units = lattice (the default) for the region command, then a
 lattice must first be defined via the lattice command.
 
 E: Region cannot have 0 length rotation vector
diff --git a/src/velocity.h b/src/velocity.h
index 7d249d16d5..8fa8be7f04 100644
--- a/src/velocity.h
+++ b/src/velocity.h
@@ -95,7 +95,7 @@ Self-explanatory.
 
 E: Use of velocity with undefined lattice
 
-If scale = lattice (the default) for the velocity set or velocity ramp
+If units = lattice (the default) for the velocity set or velocity ramp
 command, then a lattice must first be defined via the lattice command.
 
 E: Variable name for velocity set does not exist