From 54f60bf7177ffc6bf7ef14ce5742d3ecf039b122 Mon Sep 17 00:00:00 2001
From: sjplimp <sjplimp@f3b2605a-c512-4ea7-a41b-209d697bcdaa>
Date: Wed, 2 Jan 2013 16:27:38 +0000
Subject: [PATCH] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@9176
 f3b2605a-c512-4ea7-a41b-209d697bcdaa

---
 src/GPU/gpu_extra.h  |   8 ++++
 src/GPU/pppm_gpu.cpp | 112 ++++++++++++++++++++++++-------------------
 src/GPU/pppm_gpu.h   |   7 ++-
 3 files changed, 78 insertions(+), 49 deletions(-)

diff --git a/src/GPU/gpu_extra.h b/src/GPU/gpu_extra.h
index 2c83806c00..f1b8de65cf 100644
--- a/src/GPU/gpu_extra.h
+++ b/src/GPU/gpu_extra.h
@@ -49,6 +49,9 @@ namespace GPU_EXTRA {
       else if (all_success == -8)
         error->all(FLERR,
                    "GPU particle split must be set to 1 for this pair style.");
+      else if (all_success == -9)
+        error->all(FLERR,
+                   "CPU neighbor lists must be used for ellipsoid/sphere mix.");
       else
         error->all(FLERR,"Unknown error in GPU library");
     }
@@ -102,6 +105,11 @@ E: GPU particle split must be set to 1 for this pair style.
 For this pair style, you cannot run part of the force calculation on
 the host.  See the package command.
 
+E: CPU neighbor lists must be used for ellipsoid/sphere mix.
+
+When using Gay-Berne or RE-squared pair styles with both ellipsoidal and
+spherical particles, the neighbor list must be built on the CPU.
+
 E: Unknown error in GPU library
 
 Self-explanatory.
diff --git a/src/GPU/pppm_gpu.cpp b/src/GPU/pppm_gpu.cpp
index a8138067de..4fef72a0e1 100644
--- a/src/GPU/pppm_gpu.cpp
+++ b/src/GPU/pppm_gpu.cpp
@@ -39,6 +39,7 @@
 #include "error.h"
 #include "update.h"
 #include "universe.h"
+#include "fix.h"
 
 using namespace LAMMPS_NS;
 using namespace MathConst;
@@ -120,9 +121,6 @@ void PPPMGPU::init()
   // NOTE: could free density_brick and vdxyz_brick after PPPM allocates them,
   //       before allocating db_gpu and vd_brick down below, if don't need,
   //       if do this, make sure to set them to NULL
-  // NOTE: delete/realloc of cg necessary b/c packing 4 values per grid pt,
-  //       not 3 as PPPM does - probably a better way to account for this
-  //       in PPPM::init()
 
   destroy_3d_offset(density_brick_gpu,nzlo_out,nylo_out);
   destroy_3d_offset(vd_brick,nzlo_out,nylo_out);
@@ -130,15 +128,11 @@ void PPPMGPU::init()
 
   PPPM::init();
 
-  if (differentiation_flag == 0) {
-    delete cg;
-    int (*procneigh)[2] = comm->procneigh;
-    cg = new CommGrid(lmp,world,4,1,
-                      nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
-                      nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out,
-                      procneigh[0][0],procneigh[0][1],procneigh[1][0],
-                      procneigh[1][1],procneigh[2][0],procneigh[2][1]);
-  }
+  // ensure no conflict with fix balance
+
+  for (int i = 0; i < modify->nfix; i++)
+    if (strcmp(modify->fix[i]->style,"balance") == 0)
+      error->all(FLERR,"Cannot currently use pppm/gpu with fix balance.");
 
   // unsupported option
 
@@ -189,6 +183,8 @@ void PPPMGPU::init()
 
 void PPPMGPU::compute(int eflag, int vflag)
 {
+  int i,j;
+
   int nago;
   if (kspace_split) {
     if (im_real_space) return;
@@ -205,8 +201,12 @@ void PPPMGPU::compute(int eflag, int vflag)
   else evflag = evflag_atom = eflag_global = vflag_global = 
         eflag_atom = vflag_atom = 0;
 
+  // If need per-atom energies/virials, allocate the per-atom arrays and
+  // set up cg_peratom the first time they are requested
   if (evflag_atom && !peratom_allocate_flag) {
     allocate_peratom();
+    cg_peratom->ghost_notify();
+    cg_peratom->setup();
     peratom_allocate_flag = 1;
   }
 
@@ -220,18 +220,6 @@ void PPPMGPU::compute(int eflag, int vflag)
   if (flag != 0)
     error->one(FLERR,"Out of range atoms - cannot compute PPPM");
 
-  // If need per-atom energies/virials, also do particle map on host
-  // concurrently with GPU calculations
-
-  if (evflag_atom) {
-    memory->destroy(part2grid);
-    nmax = atom->nmax;
-    memory->create(part2grid,nmax,3,"pppm:part2grid");
-    particle_map();
-  }
-
-  int i,j;
-
   // convert atoms from box to lamda coords
 
   if (triclinic == 0) boxlo = domain->boxlo;
@@ -240,6 +228,15 @@ void PPPMGPU::compute(int eflag, int vflag)
     domain->x2lamda(atom->nlocal);
   }
 
+  // extend size of per-atom arrays if necessary
+
+  if (evflag_atom && atom->nlocal > nmax) {
+    memory->destroy(part2grid);
+    nmax = atom->nmax;
+    memory->create(part2grid,nmax,3,"pppm:part2grid");
+    particle_map();
+  }
+
   double t3 = MPI_Wtime();
 
   // all procs communicate density values from their ghost cells
@@ -280,24 +277,7 @@ void PPPMGPU::compute(int eflag, int vflag)
   // per-atom energy/virial
   // energy includes self-energy correction
 
-  if (evflag_atom) {
-    double *q = atom->q;
-    int nlocal = atom->nlocal;
-
-    if (eflag_atom) {
-      for (i = 0; i < nlocal; i++) {
-        eatom[i] *= 0.5;
-        eatom[i] -= g_ewald*q[i]*q[i]/MY_PIS + MY_PI2*q[i]*qsum /
-          (g_ewald*g_ewald*volume);
-        eatom[i] *= qscale;
-      }
-    }
-
-    if (vflag_atom) {
-      for (i = 0; i < nlocal; i++)
-        for (j = 0; j < 6; j++) vatom[i][j] *= 0.5*qscale;
-    }
-  }
+  if (evflag_atom) fieldforce_peratom();
 
   // sum energy across procs and add in volume-dependent term
 
@@ -320,6 +300,28 @@ void PPPMGPU::compute(int eflag, int vflag)
     for (i = 0; i < 6; i++) virial[i] = 0.5*qscale*volume*virial_all[i];
   }
 
+  // per-atom energy/virial
+  // energy includes self-energy correction
+
+  if (evflag_atom) {
+    double *q = atom->q;
+    int nlocal = atom->nlocal;
+
+    if (eflag_atom) {
+      for (i = 0; i < nlocal; i++) {
+        eatom[i] *= 0.5;
+        eatom[i] -= g_ewald*q[i]*q[i]/MY_PIS + MY_PI2*q[i]*qsum /
+          (g_ewald*g_ewald*volume);
+        eatom[i] *= qscale;
+      }
+    }
+
+    if (vflag_atom) {
+      for (i = 0; i < nlocal; i++)
+        for (j = 0; j < 6; j++) vatom[i][j] *= 0.5*qscale;
+    }
+  }
+
   // 2d slab correction
 
   if (slabflag) slabcorr();
@@ -555,7 +557,7 @@ void PPPMGPU::unpack_forward(int flag, FFT_SCALAR *buf, int nlist, int *list)
 
   if (flag == FORWARD_IK) {
     int offset;
-    FFT_SCALAR *dest = &vdx_brick[nzlo_out][nylo_out][4*nxlo_out];
+    FFT_SCALAR *dest = &vd_brick[nzlo_out][nylo_out][4*nxlo_out];
     for (int i = 0; i < nlist; i++) {
       offset = 4*list[i];
       dest[offset++] = buf[n++];
@@ -565,7 +567,7 @@ void PPPMGPU::unpack_forward(int flag, FFT_SCALAR *buf, int nlist, int *list)
   } else if (flag == FORWARD_AD) {
     FFT_SCALAR *dest = &u_brick[nzlo_out][nylo_out][nxlo_out];
     for (int i = 0; i < nlist; i++)
-      dest[list[i]] = buf[n++];
+      dest[list[i]] = buf[i];
   } else if (flag == FORWARD_IK_PERATOM) {
     FFT_SCALAR *esrc = &u_brick[nzlo_out][nylo_out][nxlo_out];
     FFT_SCALAR *v0src = &v0_brick[nzlo_out][nylo_out][nxlo_out];
@@ -690,16 +692,30 @@ double PPPMGPU::memory_usage()
 }
 
 /* ----------------------------------------------------------------------
-   perform and time the 4 FFTs required for N timesteps
+   perform and time the 1d FFTs required for N timesteps
 ------------------------------------------------------------------------- */
 
-int PPPMGPU::timing(int n, double &time3d, double &time1d) {
+int PPPMGPU::timing_1d(int n, double &time1d)
+{
   if (im_real_space) {
-    time3d = 1.0;
     time1d = 1.0;
     return 4;
   }
-  PPPM::timing(n,time3d,time1d);
+  PPPM::timing_1d(n,time1d);
+  return 4;
+}
+
+/* ----------------------------------------------------------------------
+   perform and time the 3d FFTs required for N timesteps
+------------------------------------------------------------------------- */
+
+int PPPMGPU::timing_3d(int n, double &time3d)
+{
+  if (im_real_space) {
+    time3d = 1.0;
+    return 4;
+  }
+  PPPM::timing_3d(n,time3d);
   return 4;
 }
 
diff --git a/src/GPU/pppm_gpu.h b/src/GPU/pppm_gpu.h
index 98d62ffa24..7ca80080ef 100644
--- a/src/GPU/pppm_gpu.h
+++ b/src/GPU/pppm_gpu.h
@@ -31,7 +31,8 @@ class PPPMGPU : public PPPM {
   void init();
   void setup();
   void compute(int, int);
-  int timing(int, double &, double &);
+  int timing_1d(int, double &);
+  int timing_3d(int, double &);
   double memory_usage();
 
  protected:
@@ -67,6 +68,10 @@ Self-explanatory.  Check the input script syntax and compare to the
 documentation for the command.  You can use -echo screen as a
 command-line option when running LAMMPS to see the offending line.
 
+E: Cannot currently use pppm/gpu with fix balance.
+
+Self-explanatory.
+
 E: Cannot (yet) do analytic differentiation with pppm/gpu.
 
 Self-explanatory.