git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6068 f3b2605a-c512-4ea7-a41b-209d697bcdaa

sjplimp 2011-05-04 16:23:59 +00:00
parent 6df2d87723
commit b8f1ff821f
11 changed files with 2970 additions and 0 deletions

47
src/GPU/gpu_extra.h Normal file

@@ -0,0 +1,47 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: Mike Brown (ORNL)
------------------------------------------------------------------------- */
#ifndef LMP_GPU_EXTRA_H
#define LMP_GPU_EXTRA_H
#include "error.h"
namespace GPU_EXTRA {
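// reduce each rank's status code with MPI_MIN so the most severe (most
// negative) error wins, then abort all ranks with the matching message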
inline void check_flag(int error_flag, LAMMPS_NS::Error *error,
MPI_Comm &world) {
int all_success;
MPI_Allreduce(&error_flag, &all_success, 1, MPI_INT, MPI_MIN, world);
if (all_success != 0) {
if (all_success == -1)
error->all("Accelerated style in input script but no fix gpu.");
else if (all_success == -2)
error->all("Could not find/initialize a specified accelerator device.");
else if (all_success == -3)
error->all("Insufficient memory on accelerator.");
else if (all_success == -4)
error->all("GPU library not compiled for this accelerator.");
else if (all_success == -5)
error->all("Double precision is not supported on this accelerator.");
else
error->all("Unknown error in GPU library.");
}
}
}
#endif

236
src/GPU/pair_lj_expand_gpu.cpp Normal file

@@ -0,0 +1,236 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: Inderaj Bains (NVIDIA), ibains@nvidia.com
------------------------------------------------------------------------- */
#include "math.h"
#include "stdio.h"
#include "stdlib.h"
#include "pair_lj_expand_gpu.h"
#include "lmptype.h"
#include "atom.h"
#include "atom_vec.h"
#include "comm.h"
#include "force.h"
#include "neighbor.h"
#include "neigh_list.h"
#include "integrate.h"
#include "memory.h"
#include "error.h"
#include "neigh_request.h"
#include "universe.h"
#include "update.h"
#include "domain.h"
#include "string.h"
#include "gpu_extra.h"
#define MIN(a,b) ((a) < (b) ? (a) : (b))
#define MAX(a,b) ((a) > (b) ? (a) : (b))
// External functions from cuda library for atom decomposition
int lje_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double **shift, double *special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, int &gpu_mode,
FILE *screen);
void lje_gpu_clear();
int ** lje_gpu_compute_n(const int ago, const int inum, const int nall,
double **host_x, int *host_type, double *sublo,
double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum,
const double cpu_time, bool &success);
void lje_gpu_compute(const int ago, const int inum, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success);
double lje_gpu_bytes();
using namespace LAMMPS_NS;
/* ---------------------------------------------------------------------- */
PairLJExpandGPU::PairLJExpandGPU(LAMMPS *lmp) : PairLJExpand(lmp), gpu_mode(GPU_PAIR)
{
respa_enable = 0;
cpu_time = 0.0;
}
/* ----------------------------------------------------------------------
free all arrays
------------------------------------------------------------------------- */
PairLJExpandGPU::~PairLJExpandGPU()
{
lje_gpu_clear();
}
/* ---------------------------------------------------------------------- */
void PairLJExpandGPU::compute(int eflag, int vflag)
{
if (eflag || vflag) ev_setup(eflag,vflag);
else evflag = vflag_fdotr = 0;
int nall = atom->nlocal + atom->nghost;
int inum, host_start;
bool success = true;
int *ilist, *numneigh, **firstneigh;
if (gpu_mode == GPU_NEIGH) {
inum = atom->nlocal;
firstneigh = lje_gpu_compute_n(neighbor->ago, inum, nall, atom->x,
atom->type, domain->sublo, domain->subhi,
atom->tag, atom->nspecial, atom->special,
eflag, vflag, eflag_atom, vflag_atom,
host_start, &ilist, &numneigh, cpu_time,
success);
} else {
inum = list->inum;
ilist = list->ilist;
numneigh = list->numneigh;
firstneigh = list->firstneigh;
lje_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type,
ilist, numneigh, firstneigh, eflag, vflag, eflag_atom,
vflag_atom, host_start, cpu_time, success);
}
if (!success)
error->one("Out of memory on GPGPU");
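  // the GPU library handled atoms with index < host_start; any remaining
  // atoms are computed on the CPU so host and device work can overlap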
if (host_start<inum) {
cpu_time = MPI_Wtime();
cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
cpu_time = MPI_Wtime() - cpu_time;
}
}
/* ----------------------------------------------------------------------
init specific to this pair style
------------------------------------------------------------------------- */
void PairLJExpandGPU::init_style()
{
if (force->newton_pair)
error->all("Cannot use newton pair with GPU LJ pair style");
// repeat the cutsq calculation here because init_one(), which normally
// sets it, is called after init_style()
double maxcut = -1.0;
double cut;
for (int i = 1; i <= atom->ntypes; i++) {
for (int j = i; j <= atom->ntypes; j++) {
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
cut = init_one(i,j);
cut *= cut;
if (cut > maxcut)
maxcut = cut;
cutsq[i][j] = cutsq[j][i] = cut;
} else
cutsq[i][j] = cutsq[j][i] = 0.0;
}
}
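  // neighbor cell size for the GPU library: largest pair cutoff plus skin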
double cell_size = sqrt(maxcut) + neighbor->skin;
int maxspecial=0;
if (atom->molecular)
maxspecial=atom->maxspecial;
int success = lje_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4,
offset, shift, force->special_lj, atom->nlocal,
atom->nlocal+atom->nghost, 300, maxspecial,
cell_size, gpu_mode, screen);
GPU_EXTRA::check_flag(success,error,world);
if (gpu_mode != GPU_NEIGH) {
int irequest = neighbor->request(this);
neighbor->requests[irequest]->half = 0;
neighbor->requests[irequest]->full = 1;
}
}
/* ---------------------------------------------------------------------- */
double PairLJExpandGPU::memory_usage()
{
double bytes = Pair::memory_usage();
return bytes + lje_gpu_bytes();
}
/* ---------------------------------------------------------------------- */
void PairLJExpandGPU::cpu_compute(int start, int inum, int eflag, int vflag,
int *ilist, int *numneigh, int **firstneigh)
{
int i,j,ii,jj,jnum,itype,jtype;
double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
double rsq,r2inv,r6inv,forcelj,factor_lj;
double r,rshift,rshiftsq;
int *jlist;
double **x = atom->x;
double **f = atom->f;
int *type = atom->type;
int nlocal = atom->nlocal;
int nall = nlocal + atom->nghost;
double *special_lj = force->special_lj;
// loop over neighbors of my atoms
for (ii = start; ii < inum; ii++) {
i = ilist[ii];
xtmp = x[i][0];
ytmp = x[i][1];
ztmp = x[i][2];
itype = type[i];
jlist = firstneigh[i];
jnum = numneigh[i];
for (jj = 0; jj < jnum; jj++) {
j = jlist[jj];
factor_lj = special_lj[sbmask(j)];
j &= NEIGHMASK;
delx = xtmp - x[j][0];
dely = ytmp - x[j][1];
delz = ztmp - x[j][2];
rsq = delx*delx + dely*dely + delz*delz;
jtype = type[j];
if (rsq < cutsq[itype][jtype]) {
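        // lj/expand: the LJ terms are evaluated at the shifted distance
        // rshift = r - delta; forcelj/rshift is -dU/dr, and the extra 1/r
        // scales the unnormalized separation vector (delx,dely,delz)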
r = sqrt(rsq);
rshift = r - shift[itype][jtype];
rshiftsq = rshift*rshift;
r2inv = 1.0/rshiftsq;
r6inv = r2inv*r2inv*r2inv;
forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]);
fpair = factor_lj*forcelj/rshift/r;
f[i][0] += delx*fpair;
f[i][1] += dely*fpair;
f[i][2] += delz*fpair;
if (eflag) {
evdwl = r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype]) -
offset[itype][jtype];
evdwl *= factor_lj;
}
if (evflag) ev_tally_full(i,evdwl,0.0,fpair,delx,dely,delz);
}
}
}
}

47
src/GPU/pair_lj_expand_gpu.h Normal file

@@ -0,0 +1,47 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#ifdef PAIR_CLASS
PairStyle(lj/expand/gpu,PairLJExpandGPU)
#else
#ifndef LMP_PAIR_LJE_LIGHT_GPU_H
#define LMP_PAIR_LJE_LIGHT_GPU_H
#include "pair_lj_expand.h"
namespace LAMMPS_NS {
class PairLJExpandGPU : public PairLJExpand {
public:
PairLJExpandGPU(LAMMPS *lmp);
~PairLJExpandGPU();
void cpu_compute(int, int, int, int, int *, int *, int **);
void compute(int, int);
void init_style();
double memory_usage();
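  // GPU_NEIGH: neighbor lists are built on the device;
  // GPU_PAIR: host-built full neighbor lists are passed to the device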
enum { GPU_PAIR, GPU_NEIGH };
private:
int gpu_mode;
double cpu_time;
int *gpulist;
};
}
#endif
#endif

230
src/GPU/pair_morse_gpu.cpp Normal file

@@ -0,0 +1,230 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: Mike Brown (SNL)
------------------------------------------------------------------------- */
#include "math.h"
#include "stdio.h"
#include "stdlib.h"
#include "pair_morse_gpu.h"
#include "lmptype.h"
#include "atom.h"
#include "atom_vec.h"
#include "comm.h"
#include "force.h"
#include "neighbor.h"
#include "neigh_list.h"
#include "integrate.h"
#include "memory.h"
#include "error.h"
#include "neigh_request.h"
#include "universe.h"
#include "update.h"
#include "domain.h"
#include "string.h"
#include "gpu_extra.h"
#define MIN(a,b) ((a) < (b) ? (a) : (b))
#define MAX(a,b) ((a) > (b) ? (a) : (b))
// External functions from cuda library for atom decomposition
int mor_gpu_init(const int ntypes, double **cutsq, double **host_morse1,
double **host_r0, double **host_alpha, double **host_d0,
double **offset, double *special_lj, const int nlocal,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen);
void mor_gpu_clear();
int ** mor_gpu_compute_n(const int ago, const int inum,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum,
const double cpu_time, bool &success);
void mor_gpu_compute(const int ago, const int inum, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success);
double mor_gpu_bytes();
using namespace LAMMPS_NS;
/* ---------------------------------------------------------------------- */
PairMorseGPU::PairMorseGPU(LAMMPS *lmp) : PairMorse(lmp), gpu_mode(GPU_PAIR)
{
cpu_time = 0.0;
}
/* ----------------------------------------------------------------------
free all arrays
------------------------------------------------------------------------- */
PairMorseGPU::~PairMorseGPU()
{
mor_gpu_clear();
}
/* ---------------------------------------------------------------------- */
void PairMorseGPU::compute(int eflag, int vflag)
{
if (eflag || vflag) ev_setup(eflag,vflag);
else evflag = vflag_fdotr = 0;
int nall = atom->nlocal + atom->nghost;
int inum, host_start;
bool success = true;
int *ilist, *numneigh, **firstneigh;
if (gpu_mode == GPU_NEIGH) {
inum = atom->nlocal;
firstneigh = mor_gpu_compute_n(neighbor->ago, inum, nall,
atom->x, atom->type, domain->sublo,
domain->subhi, atom->tag, atom->nspecial,
atom->special, eflag, vflag, eflag_atom,
vflag_atom, host_start, &ilist, &numneigh,
cpu_time, success);
} else {
inum = list->inum;
ilist = list->ilist;
numneigh = list->numneigh;
firstneigh = list->firstneigh;
mor_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type,
ilist, numneigh, firstneigh, eflag, vflag, eflag_atom,
vflag_atom, host_start, cpu_time, success);
}
if (!success)
error->one("Out of memory on GPGPU");
if (host_start<inum) {
cpu_time = MPI_Wtime();
cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
cpu_time = MPI_Wtime() - cpu_time;
}
}
/* ----------------------------------------------------------------------
init specific to this pair style
------------------------------------------------------------------------- */
void PairMorseGPU::init_style()
{
if (force->newton_pair)
error->all("Cannot use newton pair with GPU Morse pair style");
// repeat the cutsq calculation here because init_one(), which normally
// sets it, is called after init_style()
double maxcut = -1.0;
double cut;
for (int i = 1; i <= atom->ntypes; i++) {
for (int j = i; j <= atom->ntypes; j++) {
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
cut = init_one(i,j);
cut *= cut;
if (cut > maxcut)
maxcut = cut;
cutsq[i][j] = cutsq[j][i] = cut;
} else
cutsq[i][j] = cutsq[j][i] = 0.0;
}
}
double cell_size = sqrt(maxcut) + neighbor->skin;
int maxspecial=0;
if (atom->molecular)
maxspecial=atom->maxspecial;
int success = mor_gpu_init(atom->ntypes+1, cutsq, morse1, r0, alpha, d0,
offset, force->special_lj, atom->nlocal,
atom->nlocal+atom->nghost, 300, maxspecial,
cell_size, gpu_mode, screen);
GPU_EXTRA::check_flag(success,error,world);
if (gpu_mode != GPU_NEIGH) {
int irequest = neighbor->request(this);
neighbor->requests[irequest]->half = 0;
neighbor->requests[irequest]->full = 1;
}
}
/* ---------------------------------------------------------------------- */
double PairMorseGPU::memory_usage()
{
double bytes = Pair::memory_usage();
return bytes + mor_gpu_bytes();
}
/* ---------------------------------------------------------------------- */
void PairMorseGPU::cpu_compute(int start, int inum, int eflag, int vflag,
int *ilist, int *numneigh, int **firstneigh)
{
int i,j,ii,jj,jnum,itype,jtype;
double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
double rsq,r,dr,dexp,factor_lj;
int *jlist;
double **x = atom->x;
double **f = atom->f;
int *type = atom->type;
int nlocal = atom->nlocal;
int nall = nlocal + atom->nghost;
double *special_lj = force->special_lj;
// loop over neighbors of my atoms
for (ii = start; ii < inum; ii++) {
i = ilist[ii];
xtmp = x[i][0];
ytmp = x[i][1];
ztmp = x[i][2];
itype = type[i];
jlist = firstneigh[i];
jnum = numneigh[i];
for (jj = 0; jj < jnum; jj++) {
j = jlist[jj];
factor_lj = special_lj[sbmask(j)];
j &= NEIGHMASK;
delx = xtmp - x[j][0];
dely = ytmp - x[j][1];
delz = ztmp - x[j][2];
rsq = delx*delx + dely*dely + delz*delz;
jtype = type[j];
if (rsq < cutsq[itype][jtype]) {
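        // Morse: morse1 = 2*alpha*d0, so morse1*(dexp*dexp - dexp) = -dU/dr;
        // dividing by r scales the unnormalized separation vector (delx,dely,delz)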
r = sqrt(rsq);
dr = r - r0[itype][jtype];
dexp = exp(-alpha[itype][jtype] * dr);
fpair = factor_lj * morse1[itype][jtype] * (dexp*dexp - dexp) / r;
f[i][0] += delx*fpair;
f[i][1] += dely*fpair;
f[i][2] += delz*fpair;
if (eflag) {
evdwl = d0[itype][jtype] * (dexp*dexp - 2.0*dexp) -
offset[itype][jtype];
evdwl *= factor_lj;
}
if (evflag) ev_tally_full(i,evdwl,0.0,fpair,delx,dely,delz);
}
}
}
}

47
src/GPU/pair_morse_gpu.h Normal file

@@ -0,0 +1,47 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#ifdef PAIR_CLASS
PairStyle(morse/gpu,PairMorseGPU)
#else
#ifndef LMP_PAIR_MORSE_GPU_H
#define LMP_PAIR_MORSE_GPU_H
#include "pair_morse.h"
namespace LAMMPS_NS {
class PairMorseGPU : public PairMorse {
public:
PairMorseGPU(LAMMPS *lmp);
~PairMorseGPU();
void cpu_compute(int, int, int, int, int *, int *, int **);
void compute(int, int);
void init_style();
double memory_usage();
enum { GPU_PAIR, GPU_NEIGH };
private:
int gpu_mode;
double cpu_time;
int *gpulist;
};
}
#endif
#endif

1743
src/GPU/pppm_gpu.cpp Normal file

File diff suppressed because it is too large

103
src/GPU/pppm_gpu.h Normal file

@@ -0,0 +1,103 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#ifndef LMP_PPPM_GPU_H
#define LMP_PPPM_GPU_H
#include "kspace.h"
#include "lmptype.h"
namespace LAMMPS_NS {
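// grdtyp sets the precision of the grid bricks shared with the GPU library:
// float for pppm/gpu/single, double for pppm/gpu/double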
template <class grdtyp>
class PPPMGPU : public KSpace {
public:
PPPMGPU(class LAMMPS *, int, char **);
virtual ~PPPMGPU();
virtual void init() = 0;
void base_init();
void setup();
virtual void compute(int, int) = 0;
void timing(int, double &, double &);
virtual double memory_usage() = 0;
protected:
int me,nprocs;
double PI;
double precision;
int nfactors;
int *factors;
double qsum,qsqsum;
double qqrd2e;
double cutoff;
double volume;
double delxinv,delyinv,delzinv,delvolinv;
double shift,shiftone;
int nxlo_in,nylo_in,nzlo_in,nxhi_in,nyhi_in,nzhi_in;
int nxlo_out,nylo_out,nzlo_out,nxhi_out,nyhi_out,nzhi_out;
int nxlo_ghost,nxhi_ghost,nylo_ghost,nyhi_ghost,nzlo_ghost,nzhi_ghost;
int nxlo_fft,nylo_fft,nzlo_fft,nxhi_fft,nyhi_fft,nzhi_fft;
int nlower,nupper;
int ngrid,nfft,nbuf,nfft_both;
grdtyp ***density_brick;
grdtyp ***vd_brick;
double *greensfn;
double **vg;
double *fkx,*fky,*fkz;
double *density_fft;
double *work1,*work2;
double *buf1,*buf2;
double *gf_b;
double **rho1d,**rho_coeff;
class FFT3d *fft1,*fft2;
class Remap *remap;
int nmax;
int triclinic; // domain settings, orthog or triclinic
double *boxlo;
// TIP4P settings
int typeH,typeO; // atom types of TIP4P water H and O atoms
double qdist; // distance from O site to negative charge
double alpha; // geometric factor
void set_grid();
void allocate();
void deallocate();
int factorable(int);
double rms(double, double, bigint, double, double **);
double diffpr(double, double, double, double, double **);
void compute_gf_denom();
double gf_denom(double, double, double);
void brick2fft();
void fillbrick();
void poisson(int, int);
void procs2grid2d(int,int,int,int *, int*);
void compute_rho1d(double, double, double);
void compute_rho_coeff();
void slabcorr(int);
double poisson_time;
grdtyp ***create_3d_offset(int, int, int, int, int, int, const char *,
grdtyp *, int);
void destroy_3d_offset(grdtyp ***, int, int);
};
}
#endif

217
src/GPU/pppm_gpu_double.cpp Normal file

@@ -0,0 +1,217 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL)
------------------------------------------------------------------------- */
#include "mpi.h"
#include "string.h"
#include "stdio.h"
#include "stdlib.h"
#include "math.h"
#include "pppm_gpu_double.h"
#include "lmptype.h"
#include "atom.h"
#include "comm.h"
#include "neighbor.h"
#include "force.h"
#include "pair.h"
#include "bond.h"
#include "angle.h"
#include "domain.h"
#include "fft3d_wrap.h"
#include "remap_wrap.h"
#include "memory.h"
#include "error.h"
#include "gpu_extra.h"
#define grdtyp double
// External functions from cuda library for atom decomposition
grdtyp* pppm_gpu_init_d(const int nlocal, const int nall, FILE *screen,
const int order, const int nxlo_out,
const int nylo_out, const int nzlo_out,
const int nxhi_out, const int nyhi_out,
const int nzhi_out, double **rho_coeff,
grdtyp **_vd_brick, const double slab_volfactor,
const int nx_pppm, const int ny_pppm,
const int nz_pppm, int &success);
void pppm_gpu_clear_d(const double poisson_time);
int pppm_gpu_spread_d(const int ago, const int nlocal, const int nall,
double **host_x, int *host_type, bool &success,
double *host_q, double *boxlo, const double delxinv,
const double delyinv, const double delzinv);
void pppm_gpu_interp_d(const grdtyp qqrd2e_scale);
double pppm_gpu_bytes_d();
using namespace LAMMPS_NS;
#define MAXORDER 7
#define OFFSET 16384
#define SMALL 0.00001
#define LARGE 10000.0
#define EPS_HOC 1.0e-7
#define MIN(a,b) ((a) < (b) ? (a) : (b))
#define MAX(a,b) ((a) > (b) ? (a) : (b))
/* ---------------------------------------------------------------------- */
PPPMGPUDouble::PPPMGPUDouble(LAMMPS *lmp, int narg, char **arg) :
PPPMGPU<grdtyp>(lmp, narg, arg)
{
}
/* ----------------------------------------------------------------------
free all memory
------------------------------------------------------------------------- */
PPPMGPUDouble::~PPPMGPUDouble()
{
pppm_gpu_clear_d(poisson_time);
}
/* ----------------------------------------------------------------------
called once before run
------------------------------------------------------------------------- */
void PPPMGPUDouble::init()
{
base_init();
if (order>8)
error->all("Cannot use order greater than 8 with pppm/gpu.");
pppm_gpu_clear_d(poisson_time);
int success;
grdtyp *data, *h_brick;
h_brick = pppm_gpu_init_d(atom->nlocal, atom->nlocal+atom->nghost, screen,
order, nxlo_out, nylo_out, nzlo_out, nxhi_out,
nyhi_out, nzhi_out, rho_coeff, &data,
slab_volfactor, nx_pppm, ny_pppm, nz_pppm,
success);
GPU_EXTRA::check_flag(success,error,world);
density_brick =
create_3d_offset(nzlo_out,nzhi_out,nylo_out,nyhi_out,
nxlo_out,nxhi_out,"pppm:density_brick",h_brick,1);
vd_brick =
create_3d_offset(nzlo_out,nzhi_out,nylo_out,nyhi_out,
nxlo_out,nxhi_out,"pppm:vd_brick",data,4);
poisson_time=0;
}
/* ----------------------------------------------------------------------
compute the PPPMGPU long-range force, energy, virial
------------------------------------------------------------------------- */
void PPPMGPUDouble::compute(int eflag, int vflag)
{
bool success = true;
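  // spread the charges onto the density grid on the device;
  // a nonzero return flag indicates out-of-range atoms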
int flag=pppm_gpu_spread_d(neighbor->ago, atom->nlocal, atom->nlocal +
atom->nghost, atom->x, atom->type, success,
atom->q, domain->boxlo, delxinv, delyinv,
delzinv);
if (!success)
error->one("Out of memory on GPGPU");
if (flag != 0)
error->one("Out of range atoms - cannot compute PPPM");
int i;
// convert atoms from box to lamda coords
if (triclinic == 0) boxlo = domain->boxlo;
else {
boxlo = domain->boxlo_lamda;
domain->x2lamda(atom->nlocal);
}
energy = 0.0;
if (vflag) for (i = 0; i < 6; i++) virial[i] = 0.0;
double t3=MPI_Wtime();
// all procs communicate density values from their ghost cells
// to fully sum contribution in their 3d bricks
// remap from 3d decomposition to FFT decomposition
brick2fft();
// compute potential gradient on my FFT grid and
// portion of e_long on this proc's FFT grid
// return gradients (electric fields) in 3d brick decomposition
poisson(eflag,vflag);
// all procs communicate E-field values to fill ghost cells
// surrounding their 3d bricks
fillbrick();
poisson_time+=MPI_Wtime()-t3;
// calculate the force on my particles
grdtyp qqrd2e_scale=qqrd2e*scale;
pppm_gpu_interp_d(qqrd2e_scale);
// sum energy across procs and add in volume-dependent term
if (eflag) {
double energy_all;
MPI_Allreduce(&energy,&energy_all,1,MPI_DOUBLE,MPI_SUM,world);
energy = energy_all;
energy *= 0.5*volume;
energy -= g_ewald*qsqsum/1.772453851 +
0.5*PI*qsum*qsum / (g_ewald*g_ewald*volume);
energy *= qqrd2e*scale;
}
// sum virial across procs
if (vflag) {
double virial_all[6];
MPI_Allreduce(virial,virial_all,6,MPI_DOUBLE,MPI_SUM,world);
for (i = 0; i < 6; i++) virial[i] = 0.5*qqrd2e*scale*volume*virial_all[i];
}
// 2d slab correction
if (slabflag) slabcorr(eflag);
// convert atoms back from lamda to box coords
if (triclinic) domain->lamda2x(atom->nlocal);
}
/* ----------------------------------------------------------------------
memory usage of local arrays
------------------------------------------------------------------------- */
double PPPMGPUDouble::memory_usage()
{
double bytes = nmax*3 * sizeof(double);
int nbrick = (nxhi_out-nxlo_out+1) * (nyhi_out-nylo_out+1) *
(nzhi_out-nzlo_out+1);
bytes += 4 * nbrick * sizeof(grdtyp);
bytes += 6 * nfft_both * sizeof(double);
bytes += nfft_both*6 * sizeof(double);
bytes += 2 * nbuf * sizeof(double);
return bytes + pppm_gpu_bytes_d();
}

42
src/GPU/pppm_gpu_double.h Normal file

@@ -0,0 +1,42 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#ifdef KSPACE_CLASS
KSpaceStyle(pppm/gpu/double,PPPMGPUDouble)
#else
#ifndef LMP_PPPM_GPU_DOUBLE_H
#define LMP_PPPM_GPU_DOUBLE_H
#include "pppm_gpu.h"
#include "lmptype.h"
namespace LAMMPS_NS {
class PPPMGPUDouble : public PPPMGPU<double> {
public:
PPPMGPUDouble(class LAMMPS *, int, char **);
~PPPMGPUDouble();
void init();
void compute(int, int);
double memory_usage();
protected:
};
}
#endif
#endif

216
src/GPU/pppm_gpu_single.cpp Normal file

@@ -0,0 +1,216 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL)
------------------------------------------------------------------------- */
#include "mpi.h"
#include "string.h"
#include "stdio.h"
#include "stdlib.h"
#include "math.h"
#include "pppm_gpu_single.h"
#include "lmptype.h"
#include "atom.h"
#include "comm.h"
#include "neighbor.h"
#include "force.h"
#include "pair.h"
#include "bond.h"
#include "angle.h"
#include "domain.h"
#include "fft3d_wrap.h"
#include "remap_wrap.h"
#include "memory.h"
#include "error.h"
#include "gpu_extra.h"
#define grdtyp float
// External functions from cuda library for atom decomposition
grdtyp* pppm_gpu_init_f(const int nlocal, const int nall, FILE *screen,
const int order, const int nxlo_out,
const int nylo_out, const int nzlo_out,
const int nxhi_out, const int nyhi_out,
const int nzhi_out, double **rho_coeff,
grdtyp **_vd_brick, const double slab_volfactor,
const int nx_pppm, const int ny_pppm,
const int nz_pppm, int &success);
void pppm_gpu_clear_f(const double poisson_time);
int pppm_gpu_spread_f(const int ago, const int nlocal, const int nall,
double **host_x, int *host_type, bool &success,
double *host_q, double *boxlo, const double delxinv,
const double delyinv, const double delzinv);
void pppm_gpu_interp_f(const grdtyp qqrd2e_scale);
double pppm_gpu_bytes_f();
using namespace LAMMPS_NS;
#define MAXORDER 7
#define OFFSET 16384
#define SMALL 0.00001
#define LARGE 10000.0
#define EPS_HOC 1.0e-7
#define MIN(a,b) ((a) < (b) ? (a) : (b))
#define MAX(a,b) ((a) > (b) ? (a) : (b))
/* ---------------------------------------------------------------------- */
PPPMGPUSingle::PPPMGPUSingle(LAMMPS *lmp, int narg, char **arg) :
PPPMGPU<grdtyp>(lmp, narg, arg)
{
}
/* ----------------------------------------------------------------------
free all memory
------------------------------------------------------------------------- */
PPPMGPUSingle::~PPPMGPUSingle()
{
pppm_gpu_clear_f(poisson_time);
}
/* ----------------------------------------------------------------------
called once before run
------------------------------------------------------------------------- */
void PPPMGPUSingle::init()
{
base_init();
if (order>8)
error->all("Cannot use order greater than 8 with pppm/gpu.");
pppm_gpu_clear_f(poisson_time);
int success;
grdtyp *data, *h_brick;
h_brick = pppm_gpu_init_f(atom->nlocal, atom->nlocal+atom->nghost, screen,
order, nxlo_out, nylo_out, nzlo_out, nxhi_out,
nyhi_out, nzhi_out, rho_coeff, &data,
slab_volfactor,nx_pppm,ny_pppm,nz_pppm,success);
GPU_EXTRA::check_flag(success,error,world);
density_brick =
create_3d_offset(nzlo_out,nzhi_out,nylo_out,nyhi_out,
nxlo_out,nxhi_out,"pppm:density_brick",h_brick,1);
vd_brick =
create_3d_offset(nzlo_out,nzhi_out,nylo_out,nyhi_out,
nxlo_out,nxhi_out,"pppm:vd_brick",data,4);
poisson_time=0;
}
/* ----------------------------------------------------------------------
compute the PPPMGPU long-range force, energy, virial
------------------------------------------------------------------------- */
void PPPMGPUSingle::compute(int eflag, int vflag)
{
bool success = true;
int flag=pppm_gpu_spread_f(neighbor->ago, atom->nlocal, atom->nlocal +
atom->nghost, atom->x, atom->type, success,
atom->q, domain->boxlo, delxinv, delyinv,
delzinv);
if (!success)
error->one("Out of memory on GPGPU");
if (flag != 0)
error->one("Out of range atoms - cannot compute PPPM");
int i;
// convert atoms from box to lamda coords
if (triclinic == 0) boxlo = domain->boxlo;
else {
boxlo = domain->boxlo_lamda;
domain->x2lamda(atom->nlocal);
}
energy = 0.0;
if (vflag) for (i = 0; i < 6; i++) virial[i] = 0.0;
double t3=MPI_Wtime();
// all procs communicate density values from their ghost cells
// to fully sum contribution in their 3d bricks
// remap from 3d decomposition to FFT decomposition
brick2fft();
// compute potential gradient on my FFT grid and
// portion of e_long on this proc's FFT grid
// return gradients (electric fields) in 3d brick decomposition
poisson(eflag,vflag);
// all procs communicate E-field values to fill ghost cells
// surrounding their 3d bricks
fillbrick();
poisson_time+=MPI_Wtime()-t3;
// calculate the force on my particles
grdtyp qqrd2e_scale=qqrd2e*scale;
pppm_gpu_interp_f(qqrd2e_scale);
// sum energy across procs and add in volume-dependent term
if (eflag) {
double energy_all;
MPI_Allreduce(&energy,&energy_all,1,MPI_DOUBLE,MPI_SUM,world);
energy = energy_all;
energy *= 0.5*volume;
energy -= g_ewald*qsqsum/1.772453851 +
0.5*PI*qsum*qsum / (g_ewald*g_ewald*volume);
energy *= qqrd2e*scale;
}
// sum virial across procs
if (vflag) {
double virial_all[6];
MPI_Allreduce(virial,virial_all,6,MPI_DOUBLE,MPI_SUM,world);
for (i = 0; i < 6; i++) virial[i] = 0.5*qqrd2e*scale*volume*virial_all[i];
}
// 2d slab correction
if (slabflag) slabcorr(eflag);
// convert atoms back from lamda to box coords
if (triclinic) domain->lamda2x(atom->nlocal);
}
/* ----------------------------------------------------------------------
memory usage of local arrays
------------------------------------------------------------------------- */
double PPPMGPUSingle::memory_usage()
{
double bytes = nmax*3 * sizeof(double);
int nbrick = (nxhi_out-nxlo_out+1) * (nyhi_out-nylo_out+1) *
(nzhi_out-nzlo_out+1);
bytes += 4 * nbrick * sizeof(grdtyp);
bytes += 6 * nfft_both * sizeof(double);
bytes += nfft_both*6 * sizeof(double);
bytes += 2 * nbuf * sizeof(double);
return bytes + pppm_gpu_bytes_f();
}

42
src/GPU/pppm_gpu_single.h Normal file

@@ -0,0 +1,42 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#ifdef KSPACE_CLASS
KSpaceStyle(pppm/gpu/single,PPPMGPUSingle)
#else
#ifndef LMP_PPPM_GPU_SINGLE_H
#define LMP_PPPM_GPU_SINGLE_H
#include "pppm_gpu.h"
#include "lmptype.h"
namespace LAMMPS_NS {
class PPPMGPUSingle : public PPPMGPU<float> {
public:
PPPMGPUSingle(class LAMMPS *, int, char **);
~PPPMGPUSingle();
void init();
void compute(int, int);
double memory_usage();
protected:
};
}
#endif
#endif