Optimizations to kspace_style MSM, including improving the single-core performance and increasing the parallel scalability. A bug in MSM for mixed periodic and non-periodic boundary conditions was also fixed.

git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@9597 f3b2605a-c512-4ea7-a41b-209d697bcdaa
2013-03-06 21:12:28 +00:00 · 2013-03-06 21:12:28 +00:00 · ed72c6c5ae
parent a3d29c1665
commit ed72c6c5ae
7 changed files with 1211 additions and 267 deletions
--- a/src/KSPACE/commgrid.cpp
+++ b/src/KSPACE/commgrid.cpp
@ -30,8 +30,8 @@ CommGrid::CommGrid(LAMMPS *lmp, MPI_Comm gcomm, int forward, int reverse,
                   int pxlo, int pxhi, int pylo, int pyhi, int pzlo, int pzhi) 
  : Pointers(lmp)
 {
-  me = comm->me;
  gridcomm = gcomm;
+  MPI_Comm_rank(gridcomm,&me);

  nforward = forward;
  nreverse = reverse;
@ -50,6 +50,61 @@ CommGrid::CommGrid(LAMMPS *lmp, MPI_Comm gcomm, int forward, int reverse,
  outzlo = ozlo;
  outzhi = ozhi;

+  outxlo_max = oxlo;
+  outxhi_max = oxhi;
+  outylo_max = oylo;
+  outyhi_max = oyhi;
+  outzlo_max = ozlo;
+  outzhi_max = ozhi;
+
+  procxlo = pxlo;
+  procxhi = pxhi;
+  procylo = pylo;
+  procyhi = pyhi;
+  proczlo = pzlo;
+  proczhi = pzhi;
+
+  nswap = 0;
+  swap = NULL;
+  buf1 = buf2 = NULL;
+}
+
+/* ---------------------------------------------------------------------- */
+
+CommGrid::CommGrid(LAMMPS *lmp, MPI_Comm gcomm, int forward, int reverse,
+                   int ixlo, int ixhi, int iylo, int iyhi, int izlo, int izhi,
+                   int oxlo, int oxhi, int oylo, int oyhi, int ozlo, int ozhi,
+                   int oxlo_max, int oxhi_max, int oylo_max, int oyhi_max, int ozlo_max, int ozhi_max,
+                   int pxlo, int pxhi, int pylo, int pyhi, int pzlo, int pzhi) 
+  : Pointers(lmp)
+{
+  gridcomm = gcomm;
+  MPI_Comm_rank(gridcomm,&me);
+
+  nforward = forward;
+  nreverse = reverse;
+
+  inxlo = ixlo;
+  inxhi = ixhi;
+  inylo = iylo;
+  inyhi = iyhi;
+  inzlo = izlo;
+  inzhi = izhi;
+
+  outxlo = oxlo;
+  outxhi = oxhi;
+  outylo = oylo;
+  outyhi = oyhi;
+  outzlo = ozlo;
+  outzhi = ozhi;
+
+  outxlo_max = oxlo_max;
+  outxhi_max = oxhi_max;
+  outylo_max = oylo_max;
+  outyhi_max = oyhi_max;
+  outzlo_max = ozlo_max;
+  outzhi_max = ozhi_max;
+
  procxlo = pxlo;
  procxhi = pxhi;
  procylo = pylo;
@ -482,7 +537,8 @@ void CommGrid::reverse_comm(KSpace *kspace, int which)

 /* ----------------------------------------------------------------------
   create 1d list of offsets into 3d array section (xlo:xhi,ylo:yhi,zlo:zhi)
-   assume 3d array is allocated as (outxlo:outxhi,outylo:outyhi,outzlo:outzhi)
+   assume 3d array is allocated as (outxlo_max:outxhi_max,outylo_max:outyhi_max,
+     outzlo_max:outzhi_max)
 ------------------------------------------------------------------------- */

 int CommGrid::indices(int *&list, 
@ -491,15 +547,15 @@ int CommGrid::indices(int *&list,
  int nmax = (xhi-xlo+1) * (yhi-ylo+1) * (zhi-zlo+1);
  memory->create(list,nmax,"Commgrid:list");

-  int nx = (outxhi-outxlo+1);
-  int ny = (outyhi-outylo+1);
+  int nx = (outxhi_max-outxlo_max+1);
+  int ny = (outyhi_max-outylo_max+1);

  int n = 0;
  int ix,iy,iz;
  for (iz = zlo; iz <= zhi; iz++)
    for (iy = ylo; iy <= yhi; iy++)
      for (ix = xlo; ix <= xhi; ix++)
-        list[n++] = (iz-outzlo)*ny*nx + (iy-outylo)*nx + (ix-outxlo);
+        list[n++] = (iz-outzlo_max)*ny*nx + (iy-outylo_max)*nx + (ix-outxlo_max);

  return nmax;
 }
--- a/src/KSPACE/commgrid.h
+++ b/src/KSPACE/commgrid.h
@ -32,6 +32,11 @@ class CommGrid : protected Pointers {
           int, int, int, int, int, int,
           int, int, int, int, int, int,
           int, int, int, int, int, int);
+  CommGrid(class LAMMPS *, MPI_Comm, int, int,
+           int, int, int, int, int, int,
+           int, int, int, int, int, int,
+           int, int, int, int, int, int,
+           int, int, int, int, int, int);
  ~CommGrid();
  void ghost_notify();
  int ghost_overlap();
@ -55,6 +60,7 @@ class CommGrid : protected Pointers {

  int inxlo,inxhi,inylo,inyhi,inzlo,inzhi;
  int outxlo,outxhi,outylo,outyhi,outzlo,outzhi;
+  int outxlo_max,outxhi_max,outylo_max,outyhi_max,outzlo_max,outzhi_max;
  int procxlo,procxhi,procylo,procyhi,proczlo,proczhi;
  int ghostxlo,ghostxhi,ghostylo,ghostyhi,ghostzlo,ghostzhi;

--- a/src/KSPACE/msm.cpp
+++ b/src/KSPACE/msm.cpp
--- a/src/KSPACE/msm.h
+++ b/src/KSPACE/msm.h
@ -51,8 +51,10 @@ class MSM : public KSpace {
  int *nxhi_in,*nyhi_in,*nzhi_in;
  int *nxlo_out,*nylo_out,*nzlo_out;
  int *nxhi_out,*nyhi_out,*nzhi_out;
-  int *ngrid;
+  int *ngrid,*active_flag;
  int *alpha,*betax,*betay,*betaz;
+  int nxlo_out_all,nylo_out_all,nzlo_out_all;
+  int nxhi_out_all,nyhi_out_all,nzhi_out_all;
  int nxlo_direct,nxhi_direct,nylo_direct;
  int nyhi_direct,nzlo_direct,nzhi_direct;
  int nmax_direct;
@ -108,14 +110,19 @@ class MSM : public KSpace {
  void particle_map();
  void make_rho();
  virtual void direct(int);
+  void direct_peratom(int);
  void direct_top(int);
+  void direct_peratom_top(int);
  void restriction(int);
  void prolongation(int);
+  void grid_swap_forward(int,double*** &);
+  void grid_swap_reverse(int,double*** &);
  void fieldforce();
  void fieldforce_peratom();
+  void compute_phis(const double &, const double &, const double &);
  void compute_phis_and_dphis(const double &, const double &, const double &);
-  double compute_phi(const double &);
-  double compute_dphi(const double &);
+  inline double compute_phi(const double &);
+  inline double compute_dphi(const double &);
  void get_g_direct();
  void get_virial_direct();
  void get_g_direct_top(int);
@ -153,9 +160,9 @@ E: Kspace style requires atom attribute q

 The atom style defined does not have these attributes.

-E: Cannot use slab correction with MSM
+E: Slab correction not needed for MSM

-Slab correction can only be used with Ewald and PPPM, not MSM.
+Slab correction can only be used with Ewald and PPPM and is not needed by MSM.

 E: MSM order must be 4, 6, 8, or 10

--- a/src/KSPACE/msm_cg.cpp
+++ b/src/KSPACE/msm_cg.cpp
@ -78,15 +78,19 @@ void MSMCG::compute(int eflag, int vflag)
  int i,j,n;

  // set energy/virial flags
-  // invoke allocate_peratom() if needed for first time

  if (eflag || vflag) ev_setup(eflag,vflag);
  else evflag = evflag_atom = eflag_global = vflag_global =
    eflag_atom = vflag_atom = eflag_either = vflag_either = 0;

+  // invoke allocate_peratom() if needed for first time
+
  if (vflag_atom && !peratom_allocate_flag) {
    allocate_peratom();
-    for (n=0; n<levels; n++) {
+    cg_peratom_all->ghost_notify();
+    cg_peratom_all->setup();
+    for (int n=0; n<levels; n++) {
+      if (!active_flag[n]) continue;
      cg_peratom[n]->ghost_notify();
      cg_peratom[n]->setup();
    }
@ -161,28 +165,52 @@ void MSMCG::compute(int eflag, int vflag)
  particle_map();
  make_rho();

+  // all procs reverse communicate charge density values from their ghost grid points
+  //   to fully sum contribution in their 3d grid
+
  current_level = 0;
-  cg[0]->reverse_comm(this,REVERSE_RHO);
+  cg_all->reverse_comm(this,REVERSE_RHO);

-  // all procs communicate density values from their ghost cells
-  //   to fully sum contribution in their 3d bricks
+  // forward communicate charge density values to fill ghost grid points
+  // compute direct sum interaction and then restrict to coarser grid

-  for (n=0; n<=levels-2; n++) {
+  for (int n=0; n<=levels-2; n++) {
+    if (!active_flag[n]) continue;
    current_level = n;
    cg[n]->forward_comm(this,FORWARD_RHO);

    direct(n);
    restriction(n);
  }
-  
-  // top grid level

-  current_level = levels-1;
-  cg[levels-1]->forward_comm(this,FORWARD_RHO);
-  direct_top(levels-1);

-  for (n=levels-2; n>=0; n--) {
+  // compute direct interation for top grid level for nonperiodic
+  //   and for second from top grid level for periodic

+  if (active_flag[levels-1]) {
+    if (domain->nonperiodic) {
+      current_level = levels-1;
+      cg[levels-1]->forward_comm(this,FORWARD_RHO);
+      direct_top(levels-1);
+      cg[levels-1]->reverse_comm(this,REVERSE_AD);
+      if (vflag_atom)
+        cg_peratom[levels-1]->reverse_comm(this,REVERSE_AD_PERATOM);
+    } else {
+      // Here using MPI_Allreduce is cheaper than using commgrid
+      grid_swap_forward(levels-1,qgrid[levels-1]);
+      direct(levels-1);
+      grid_swap_reverse(levels-1,egrid[levels-1]);
+      current_level = levels-1;
+      if (vflag_atom)
+        cg_peratom[levels-1]->reverse_comm(this,REVERSE_AD_PERATOM);
+    }
+  }
+
+  // prolongate energy/virial from coarser grid to finer grid
+  // reverse communicate from ghost grid points to get full sum
+
+  for (int n=levels-2; n>=0; n--) {
+    if (!active_flag[n]) continue;
    prolongation(n);

    current_level = n;
@ -198,26 +226,25 @@ void MSMCG::compute(int eflag, int vflag)
  // to fill ghost cells surrounding their 3d bricks

  current_level = 0;
-
-  cg[0]->forward_comm(this,FORWARD_AD);
+  cg_all->forward_comm(this,FORWARD_AD);

  // extra per-atom energy/virial communication

  if (vflag_atom)
-    cg_peratom[0]->forward_comm(this,FORWARD_AD_PERATOM);
+    cg_peratom_all->forward_comm(this,FORWARD_AD_PERATOM);

  // calculate the force on my particles (interpolation)

  fieldforce();

-  // calculate the per-atom energy for my particles
+  // calculate the per-atom energy/virial for my particles

  if (evflag_atom) fieldforce_peratom();

+  // total long-range energy
+
  const double qscale = force->qqrd2e * scale;

-  // Total long-range energy
-    
  if (eflag_global) {
    double energy_all;
    MPI_Allreduce(&energy,&energy_all,1,MPI_DOUBLE,MPI_SUM,world);
@ -228,7 +255,7 @@ void MSMCG::compute(int eflag, int vflag)
    energy *= 0.5*qscale;
  }

-  // Total long-range virial
+  // total long-range virial

  if (vflag_global) {
    double virial_all[6];
@ -338,7 +365,7 @@ void MSMCG::make_rho()
    dy = ny - (x[i][1]-boxlo[1])*delyinv[0];
    dz = nz - (x[i][2]-boxlo[2])*delzinv[0];

-    compute_phis_and_dphis(dx,dy,dz);
+    compute_phis(dx,dy,dz);

    z0 = q[i];
    for (n = nlower; n <= nupper; n++) {
--- a/src/STUBS/mpi.h
+++ b/src/STUBS/mpi.h
@ -43,6 +43,9 @@ extern "C" {
 #define MPI_MINLOC 5
 #define MPI_LOR 6

+#define MPI_UNDEFINED -1
+#define MPI_COMM_NULL -1
+
 #define MPI_ANY_SOURCE -1

 #define MPI_Comm int
--- a/src/USER-OMP/msm_cg_omp.cpp
+++ b/src/USER-OMP/msm_cg_omp.cpp
@ -79,15 +79,19 @@ void MSMCGOMP::compute(int eflag, int vflag)
  int i,j,n;

  // set energy/virial flags
-  // invoke allocate_peratom() if needed for first time

  if (eflag || vflag) ev_setup(eflag,vflag);
  else evflag = evflag_atom = eflag_global = vflag_global =
    eflag_atom = vflag_atom = eflag_either = vflag_either = 0;

+  // invoke allocate_peratom() if needed for first time
+
  if (vflag_atom && !peratom_allocate_flag) {
    allocate_peratom();
-    for (n=0; n<levels; n++) {
+    cg_peratom_all->ghost_notify();
+    cg_peratom_all->setup();
+    for (int n=0; n<levels; n++) {
+      if (!active_flag[n]) continue;
      cg_peratom[n]->ghost_notify();
      cg_peratom[n]->setup();
    }
@ -162,13 +166,17 @@ void MSMCGOMP::compute(int eflag, int vflag)
  particle_map();
  make_rho();

+  // all procs reverse communicate charge density values from their ghost grid points
+  //   to fully sum contribution in their 3d grid
+
  current_level = 0;
-  cg[0]->reverse_comm(this,REVERSE_RHO);
+  cg_all->reverse_comm(this,REVERSE_RHO);

-  // all procs communicate density values from their ghost cells
-  //   to fully sum contribution in their 3d bricks
+  // forward communicate charge density values to fill ghost grid points
+  // compute direct sum interaction and then restrict to coarser grid

-  for (n=0; n<=levels-2; n++) {
+  for (int n=0; n<=levels-2; n++) {
+    if (!active_flag[n]) continue;
    current_level = n;
    cg[n]->forward_comm(this,FORWARD_RHO);

@ -176,14 +184,34 @@ void MSMCGOMP::compute(int eflag, int vflag)
    restriction(n);
  }

-  // top grid level

-  current_level = levels-1;
-  cg[levels-1]->forward_comm(this,FORWARD_RHO);
-  direct_top(levels-1);
+  // compute direct interation for top grid level for nonperiodic
+  //   and for second from top grid level for periodic

-  for (n=levels-2; n>=0; n--) {
+  if (active_flag[levels-1]) {
+    if (domain->nonperiodic) {
+      current_level = levels-1;
+      cg[levels-1]->forward_comm(this,FORWARD_RHO);
+      direct_top(levels-1);
+      cg[levels-1]->reverse_comm(this,REVERSE_AD);
+      if (vflag_atom)
+        cg_peratom[levels-1]->reverse_comm(this,REVERSE_AD_PERATOM);
+    } else {
+      // Here using MPI_Allreduce is cheaper than using commgrid
+      grid_swap_forward(levels-1,qgrid[levels-1]);
+      direct(levels-1);
+      grid_swap_reverse(levels-1,egrid[levels-1]);
+      current_level = levels-1;
+      if (vflag_atom)
+        cg_peratom[levels-1]->reverse_comm(this,REVERSE_AD_PERATOM);
+    }
+  }

+  // prolongate energy/virial from coarser grid to finer grid
+  // reverse communicate from ghost grid points to get full sum
+
+  for (int n=levels-2; n>=0; n--) {
+    if (!active_flag[n]) continue;
    prolongation(n);

    current_level = n;
@ -199,25 +227,24 @@ void MSMCGOMP::compute(int eflag, int vflag)
  // to fill ghost cells surrounding their 3d bricks

  current_level = 0;
-
-  cg[0]->forward_comm(this,FORWARD_AD);
+  cg_all->forward_comm(this,FORWARD_AD);

  // extra per-atom energy/virial communication

  if (vflag_atom)
-    cg_peratom[0]->forward_comm(this,FORWARD_AD_PERATOM);
+    cg_peratom_all->forward_comm(this,FORWARD_AD_PERATOM);

  // calculate the force on my particles (interpolation)

  fieldforce();

-  // calculate the per-atom energy for my particles
+  // calculate the per-atom energy/virial for my particles

  if (evflag_atom) fieldforce_peratom();

-  const double qscale = force->qqrd2e * scale;
+  // total long-range energy

-  // Total long-range energy
+  const double qscale = force->qqrd2e * scale;

  if (eflag_global) {
    double energy_all;
@ -229,7 +256,7 @@ void MSMCGOMP::compute(int eflag, int vflag)
    energy *= 0.5*qscale;
  }

-  // Total long-range virial
+  // total long-range virial

  if (vflag_global) {
    double virial_all[6];
@ -352,7 +379,7 @@ void MSMCGOMP::make_rho()
    dy = ny - (x[i][1]-boxlo[1])*delyinv[0];
    dz = nz - (x[i][2]-boxlo[2])*delzinv[0];

-    compute_phis_and_dphis(dx,dy,dz);
+    compute_phis(dx,dy,dz);

    z0 = q[i];
    for (n = nlower; n <= nupper; n++) {