Changes from Mike Brown.

git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@5278 f3b2605a-c512-4ea7-a41b-209d697bcdaa
2010-11-23 00:41:14 +00:00 · 2010-11-23 00:41:14 +00:00 · 4e52d8d927
parent 5a82c99485
commit 4e52d8d927
18 changed files with 3104 additions and 536 deletions
--- a/src/GPU/Install.sh
+++ b/src/GPU/Install.sh
@ -1,6 +1,7 @@
 # Install/unInstall package files in LAMMPS
 # edit Makefile.package to include/exclude GPU library
 # do not copy gayberne files if non-GPU version does not exist
+# do not copy lj/cut/coul/long or cg/cmm files if non-GPU version does not exist

 if (test $1 = 1) then

@ -12,14 +13,36 @@ if (test $1 = 1) then
    sed -i -e 's|^PKG_SYSPATH =[ \t]*|&$(gpu_SYSPATH) |' ../Makefile.package
    sed -i -e 's|^PKG_SYSLIB =[ \t]*|&$(gpu_SYSLIB) |' ../Makefile.package
  fi
-
+  
  if (test -e ../pair_gayberne.cpp) then
    cp pair_gayberne_gpu.cpp ..
    cp pair_gayberne_gpu.h ..
  fi
+  
+  if (test -e ../pair_lj_cut_coul_long.cpp) then
+    cp pair_lj_cut_coul_long_gpu.cpp ..
+    cp pair_lj_cut_coul_long_gpu.h ..
+  fi
+
+  if (test -e ../pair_cg_cmm.cpp) then
+    cp pair_cg_cmm_gpu.cpp ..
+    cp pair_cg_cmm_gpu.h ..
+  fi
+
+  if (test -e ../pair_cg_cmm_coul_long.cpp) then
+    cp pair_cg_cmm_coul_long_gpu.cpp ..
+    cp pair_cg_cmm_coul_long_gpu.h ..
+  fi

  cp pair_lj_cut_gpu.cpp ..
+  cp pair_lj96_cut_gpu.cpp ..
+  cp pair_lj_cut_coul_cut_gpu.cpp ..
  cp pair_lj_cut_gpu.h ..
+  cp pair_lj96_cut_gpu.h ..
+  cp pair_lj_cut_coul_cut_gpu.h ..
+  
+  cp fix_gpu.cpp ..
+  cp fix_gpu.h ..

 elif (test $1 = 0) then

@ -27,11 +50,23 @@ elif (test $1 = 0) then
    sed -i -e 's/[^ \t]*gpu //' ../Makefile.package
    sed -i -e 's/[^ \t]*gpu_[^ \t]*) //' ../Makefile.package
  fi
-
+  
  rm ../pair_gayberne_gpu.cpp
  rm ../pair_lj_cut_gpu.cpp
+  rm ../pair_lj96_cut_gpu.cpp
+  rm ../pair_lj_cut_coul_cut_gpu.cpp
+  rm ../pair_lj_cut_coul_long_gpu.cpp
+  rm ../pair_cg_cmm_gpu.cpp
+  rm ../pair_cg_cmm_coul_long_gpu.cpp
+  rm ../fix_gpu.cpp

  rm ../pair_gayberne_gpu.h
  rm ../pair_lj_cut_gpu.h
-
+  rm ../pair_lj96_cut_gpu.h
+  rm ../pair_lj_cut_coul_cut_gpu.h
+  rm ../pair_lj_cut_coul_long_gpu.h
+  rm ../pair_cg_cmm_gpu.h
+  rm ../pair_cg_cmm_coul_long_gpu.h
+  rm ../fix_gpu.h
+  
 fi
--- a/src/GPU/Package.sh
+++ b/src/GPU/Package.sh
@ -3,10 +3,40 @@
 # do not copy gayberne files if non-GPU version does not exist

 for file in *.cpp *.h; do
-  if (test $file == pair_gayberne_gpu.cpp -a ! -e ../pair_gayberne.cpp) then
+  if (test $file = pair_gayberne_gpu.cpp -a ! -e ../pair_gayberne.cpp) then
    continue
  fi
-  if (test $file == pair_gayberne_gpu.h -a ! -e ../pair_gayberne.cpp) then
+  if (test $file = pair_gayberne_gpu.h -a ! -e ../pair_gayberne.cpp) then
+    continue
+  fi
+  if (test $file = pair_lj_cut_coul_long_gpu.cpp -a ! -e ../pair_lj_cut_coul_long.cpp) then
+    continue
+  fi
+  if (test $file = pair_lj_cut_coul_long_gpu.h -a ! -e ../pair_lj_cut_coul_long.cpp) then
+    continue
+  fi
+  if (test $file = pair_cg_cmm_gpu.cpp -a ! -e ../pair_cg_cmm.cpp) then
+    continue
+  fi
+  if (test $file = pair_cg_cmm_gpu.h -a ! -e ../pair_cg_cmm.cpp) then
+    continue
+  fi
+  if (test $file = pair_cg_cmm_coul_long_gpu.cpp -a ! -e ../pair_cg_cmm_coul_long.cpp) then
+    continue
+  fi
+  if (test $file = pair_cg_cmm_coul_long_gpu.h -a ! -e ../pair_cg_cmm_coul_long.cpp) then
+    continue
+  fi
+  if (test $file = pair_cg_cmm_coul_msm.cpp -a ! -e ../pair_cg_cmm.cpp) then
+    continue
+  fi
+  if (test $file = pair_cg_cmm_coul_msm.h -a ! -e ../pair_cg_cmm.cpp) then
+    continue
+  fi
+  if (test $file = pair_cg_cmm_coul_msm_gpu.cpp -a ! -e ../pair_cg_cmm.cpp) then
+    continue
+  fi
+  if (test $file = pair_cg_cmm_coul_msm_gpu.h -a ! -e ../pair_cg_cmm.cpp) then
    continue
  fi

--- a/src/GPU/fix_gpu.cpp
+++ b/src/GPU/fix_gpu.cpp
@ -0,0 +1,148 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "string.h"
+#include "stdlib.h"
+#include "fix_gpu.h"
+#include "atom.h"
+#include "force.h"
+#include "pair.h"
+#include "respa.h"
+#include "input.h"
+#include "error.h"
+#include "timer.h"
+#include "modify.h"
+#include "domain.h"
+
+using namespace LAMMPS_NS;
+
+// Values for the gpu_mode argument of lmp_init_device(); the constructor
+// picks one from the "force" / "force/neigh" keyword in arg[3].
+enum{GPU_FORCE, GPU_NEIGH};
+
+// Functions defined outside this file.
+// lmp_init_device() is called once from the constructor; a false return is
+// reported there as a failure to find or initialize the device.
+extern bool lmp_init_device(const int first_gpu, const int last_gpu,
+                            const int gpu_mode, const double particle_split);
+// Called from the destructor.
+extern void lmp_clear_device();
+// Called from post_force(); the returned value is added to the pair style's
+// eng_vdwl and the virial array is added to the pair style's virial.
+extern double lmp_gpu_forces(double **f, double **tor, double *eatom,
+                             double **vatom, double *virial, double &ecoul);
+
+/* ---------------------------------------------------------------------- */
+
+// Parse the fix arguments and initialize the accelerator device.
+// Arguments examined here: arg[1] (must be "all"), arg[3] (mode keyword
+// "force" or "force/neigh"), arg[4] / arg[5] (first / last device index)
+// and arg[6] (particle split, rejected when it is 0 or greater than 1).
+FixGPU::FixGPU(LAMMPS *lmp, int narg, char **arg) :
+  Fix(lmp, narg, arg)
+{
+  if (narg < 7) error->all("Illegal fix gpu command");
+
+  const bool group_is_all = (strcmp(arg[1],"all") == 0);
+  if (!group_is_all)
+    error->all("Illegal fix gpu command");
+
+  // translate the mode keyword into the value handed to the device library
+  int gpu_mode;
+  const char *mode_keyword = arg[3];
+  if (strcmp(mode_keyword,"force/neigh") == 0) {
+    gpu_mode = GPU_NEIGH;
+    if (domain->triclinic)
+      error->all("Cannot use force/neigh with triclinic box.");
+  } else if (strcmp(mode_keyword,"force") == 0) {
+    gpu_mode = GPU_FORCE;
+  } else {
+    error->all("Illegal fix gpu command.");
+  }
+
+  const int first_gpu = atoi(arg[4]);
+  const int last_gpu = atoi(arg[5]);
+
+  const double particle_split = force->numeric(arg[6]);
+  if (particle_split > 1 || particle_split == 0)
+    error->all("Illegal fix gpu command.");
+
+  const bool device_ok =
+    lmp_init_device(first_gpu,last_gpu,gpu_mode,particle_split);
+  if (!device_ok)
+    error->one("Could not find or initialize a specified accelerator device.");
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Calls lmp_clear_device(); presumably this undoes the lmp_init_device()
+// call made in the constructor (the function is defined outside this file).
+FixGPU::~FixGPU()
+{
+  lmp_clear_device();
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Report which hooks this fix implements: post_force and min_post_force.
+int FixGPU::setmask()
+{
+  return POST_FORCE | MIN_POST_FORCE;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Verify that this fix sits in slot 0 of the fix list; since only one fix
+// can occupy that slot, this also limits a run to a single gpu fix.
+void FixGPU::init()
+{
+  void *first_fix = (void*)modify->fix[0];
+  void *self = (void*)this;
+  if (first_fix != self)
+    error->all("GPU is not the first fix for this run.");
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Setup hook: forwards to post_force() with the same vflag.
+void FixGPU::setup(int vflag)
+{
+  post_force(vflag);
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Minimizer setup hook: forwards to post_force() with the same vflag.
+void FixGPU::min_setup(int vflag)
+{
+  post_force(vflag);
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Fetch the results of the device computation via lmp_gpu_forces() and fold
+// them into the active pair style: the returned energy goes into eng_vdwl,
+// eng_coul is passed by reference, and the six virial components are added
+// to the pair style's virial. The elapsed time is booked under TIME_PAIR.
+// vflag is not referenced in this routine.
+void FixGPU::post_force(int vflag)
+{
+  timer->stamp();
+
+  Pair *pair = force->pair;
+
+  double gpu_virial[6] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
+  const double gpu_evdwl = lmp_gpu_forces(atom->f, atom->torque, pair->eatom,
+                                          pair->vatom, gpu_virial,
+                                          pair->eng_coul);
+
+  pair->eng_vdwl += gpu_evdwl;
+  for (int k = 0; k < 6; k++) pair->virial[k] += gpu_virial[k];
+
+  if (pair->vflag_fdotr) pair->virial_compute();
+
+  timer->stamp(TIME_PAIR);
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Minimizer variant of the post-force hook: forwards to post_force().
+void FixGPU::min_post_force(int vflag)
+{
+  post_force(vflag);
+}
+
+/* ---------------------------------------------------------------------- */
+
+// This fix reports no memory of its own.
+double FixGPU::memory_usage()
+{
+  // Memory usage currently returned by pair routine
+  return 0.0;
+}
+
--- a/src/GPU/fix_gpu.h
+++ b/src/GPU/fix_gpu.h
@ -0,0 +1,45 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+
+FixStyle(gpu,FixGPU)
+
+#else
+
+#ifndef LMP_FIX_GPU_H
+#define LMP_FIX_GPU_H
+
+#include "fix.h"
+
+namespace LAMMPS_NS {
+
+// Fix registered under the style name "gpu" (see FixStyle above).
+// The constructor initializes the accelerator device from the fix
+// arguments; the post-force hooks collect the device results.
+class FixGPU : public Fix {
+ public:
+  // Parses the fix arguments and initializes the device.
+  FixGPU(class LAMMPS *, int, char **);
+  // Clears the device.
+  ~FixGPU();
+  // Returns the POST_FORCE | MIN_POST_FORCE mask.
+  int setmask();
+  // Errors out unless this fix is the first entry in the fix list.
+  void init();
+  // setup(), min_setup() and min_post_force() all forward to post_force().
+  void setup(int);
+  void min_setup(int);
+  void post_force(int);
+  void min_post_force(int);
+  // Always returns 0; memory is reported by the pair style instead.
+  double memory_usage();
+
+ private:
+};
+
+}
+
+#endif
+#endif
--- a/src/GPU/pair_cg_cmm_coul_long_gpu.cpp
+++ b/src/GPU/pair_cg_cmm_coul_long_gpu.cpp
@ -0,0 +1,499 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+   
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+   
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Mike Brown (SNL)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "stdio.h"
+#include "stdlib.h"
+#include "pair_cg_cmm_coul_long_gpu.h"
+#include "atom.h"
+#include "atom_vec.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "integrate.h"
+#include "memory.h"
+#include "error.h"
+#include "neigh_request.h"
+#include "universe.h"
+#include "update.h"
+#include "domain.h"
+#include "string.h"
+#include "kspace.h"
+
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+// Constants for the polynomial approximation of erfc() used on the host in
+// cpu_compute():
+//   t = 1 / (1 + EWALD_P*x)
+//   erfc(x) ~= t*(A1 + t*(A2 + t*(A3 + t*(A4 + t*A5)))) * exp(-x*x)
+// The values match the coefficients of Abramowitz & Stegun formula 7.1.26.
+// EWALD_F is 2/sqrt(pi); it multiplies x*exp(-x*x) in the force expression.
+#define EWALD_F   1.12837917
+#define EWALD_P   0.3275911
+#define A1        0.254829592
+#define A2       -0.284496736
+#define A3        1.421413741
+#define A4       -1.453152027
+#define A5        1.061405429
+
+// External functions from cuda library for atom decomposition
+
+// Called once from init_style(); a false return is reported there as an
+// error. gpu_mode is passed by reference.
+bool cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type,
+                   double **host_lj1, double **host_lj2, double **host_lj3,
+                   double **host_lj4, double **offset, double *special_lj,
+                   const int nlocal, const int nall, const int max_nbors,
+                   const int maxspecial, const double cell_size, int &gpu_mode,
+                   FILE *screen, double **host_cut_ljsq, double host_cut_coulsq,
+                   double *host_special_coul, const double qqrd2e,
+                   const double g_ewald);
+
+// Called from the destructor.
+void cmml_gpu_clear();
+
+// Used by compute() in GPU_NEIGH mode; the returned pointer is kept in
+// gpulist and handed to the host-side cpu_compute() overload.
+int * cmml_gpu_compute_n(const int timestep, const int ago, const int inum,
+                         const int nall, double **host_x, int *host_type,
+                         double *boxlo, double *boxhi, int *tag, int **nspecial,
+                         int **special, const bool eflag, const bool vflag,
+                         const bool eatom, const bool vatom, int &host_start,
+                         const double cpu_time, bool &success, double *host_q);
+
+// Used by compute() otherwise; takes the neighbor list built on the host.
+void cmml_gpu_compute(const int timestep, const int ago, const int inum,
+                      const int nall, double **host_x, int *host_type,
+                      int *ilist, int *numj, int **firstneigh,
+                      const bool eflag, const bool vflag, const bool eatom,
+                      const bool vatom, int &host_start, const double cpu_time,
+                      bool &success, double *host_q);
+
+// Added to the base-class figure in memory_usage().
+double cmml_gpu_bytes();
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+// Construct the pair style with GPU_PAIR as the initial mode (init_style()
+// passes gpu_mode by reference to cmml_gpu_init()), rRESPA disabled and a
+// zeroed host timer.
+PairCGCMMCoulLongGPU::PairCGCMMCoulLongGPU(LAMMPS *lmp) : PairCGCMMCoulLong(lmp), gpu_mode(GPU_PAIR)
+{
+  respa_enable = 0;
+  cpu_time = 0.0;
+  // gpulist is only assigned by compute() in GPU_NEIGH mode; give it a
+  // defined value so it is never read as an indeterminate pointer.
+  gpulist = NULL;
+}
+
+/* ----------------------------------------------------------------------
+   free all arrays
+------------------------------------------------------------------------- */
+
+PairCGCMMCoulLongGPU::~PairCGCMMCoulLongGPU()
+{
+  // Only the GPU-library state is cleared explicitly; the PairCGCMMCoulLong
+  // base-class destructor runs after this body as usual.
+  cmml_gpu_clear();
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Run the force computation through the GPU library and evaluate on the
+// host whatever the library leaves over (entries from host_start up to
+// inum). The wall time spent in that host part is stored in cpu_time and
+// handed to the library on the next call.
+void PairCGCMMCoulLongGPU::compute(int eflag, int vflag)
+{
+  if (!(eflag || vflag)) evflag = vflag_fdotr = 0;
+  else ev_setup(eflag,vflag);
+
+  const int nall = atom->nlocal + atom->nghost;
+  const bool device_neigh = (gpu_mode == GPU_NEIGH);
+
+  int inum, host_start;
+  bool success = true;
+
+  if (device_neigh) {
+    // the library returns the neighbor data used by the host part
+    inum = atom->nlocal;
+    gpulist = cmml_gpu_compute_n(update->ntimestep, neighbor->ago, inum, nall,
+                                 atom->x, atom->type, domain->sublo,
+                                 domain->subhi, atom->tag, atom->nspecial,
+                                 atom->special, eflag, vflag, eflag_atom,
+                                 vflag_atom, host_start, cpu_time, success,
+                                 atom->q);
+  } else {
+    // the library consumes the neighbor list built on the host
+    inum = list->inum;
+    cmml_gpu_compute(update->ntimestep, neighbor->ago, inum, nall, atom->x,
+                     atom->type, list->ilist, list->numneigh, list->firstneigh,
+                     eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time,
+                     success, atom->q);
+  }
+
+  if (!success)
+    error->one("Out of memory on GPGPU");
+
+  // nothing assigned to the host this step
+  if (!(host_start<inum)) return;
+
+  cpu_time = MPI_Wtime();
+  if (device_neigh)
+    cpu_compute(gpulist, host_start, eflag, vflag);
+  else
+    cpu_compute(host_start, eflag, vflag);
+  cpu_time = MPI_Wtime() - cpu_time;
+}
+
+/* ----------------------------------------------------------------------
+   init specific to this pair style
+------------------------------------------------------------------------- */
+
+// Style-specific initialization: validates the run configuration,
+// recomputes the cutsq table, picks up g_ewald from the KSpace solver,
+// initializes the GPU library and, unless the library builds the neighbor
+// lists itself (GPU_NEIGH), requests a full neighbor list.
+void PairCGCMMCoulLongGPU::init_style()
+{
+  cut_respa = NULL;
+
+  // reject unsupported configurations before the device is initialized
+
+  if (!atom->q_flag)
+    error->all("Pair style cg/cmm/coul/long requires atom attribute q");
+  if (force->pair_match("gpu",0) == NULL)
+    error->all("Cannot use pair hybrid with multiple GPU pair styles");
+  if (force->newton_pair)
+    error->all("Cannot use newton pair with GPU cg/cmm pair style");
+
+  // Repeat cutsq calculation because done after call to init_style
+  double maxcut = -1.0;
+  double cut;
+  for (int i = 1; i <= atom->ntypes; i++) {
+    for (int j = i; j <= atom->ntypes; j++) {
+      if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
+        cut = init_one(i,j);
+        cut *= cut;
+        if (cut > maxcut)
+          maxcut = cut;
+        cutsq[i][j] = cutsq[j][i] = cut;
+      } else
+        cutsq[i][j] = cutsq[j][i] = 0.0;
+    }
+  }
+  // largest cutoff plus the neighbor skin
+  double cell_size = sqrt(maxcut) + neighbor->skin;
+
+  // insure use of KSpace long-range solver, set g_ewald
+
+  if (force->kspace == NULL)
+    error->all("Pair style is incompatible with KSpace style");
+  g_ewald = force->kspace->g_ewald;
+
+  // setup force tables
+
+  if (ncoultablebits) init_tables();
+
+  // maxspecial is only taken from the atom class for molecular systems
+  int maxspecial=0;
+  if (atom->molecular)
+    maxspecial=atom->maxspecial;
+
+  // the literal 300 is the max_nbors argument of cmml_gpu_init()
+  bool init_ok = cmml_gpu_init(atom->ntypes+1, cutsq, cg_type, lj1, lj2, lj3,
+                               lj4, offset, force->special_lj, atom->nlocal,
+                               atom->nlocal+atom->nghost, 300, maxspecial,
+                               cell_size, gpu_mode, screen, cut_ljsq,
+                               cut_coulsq_global, force->special_coul,
+                               force->qqrd2e, g_ewald);
+  if (!init_ok)
+    error->one("Insufficient memory on accelerator (or no fix gpu).\n");
+
+  // host-built lists must be full lists; skipped when gpu_mode is GPU_NEIGH
+  if (gpu_mode != GPU_NEIGH) {
+    int irequest = neighbor->request(this);
+    neighbor->requests[irequest]->half = 0;
+    neighbor->requests[irequest]->full = 1;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Base Pair memory plus whatever the GPU library reports.
+double PairCGCMMCoulLongGPU::memory_usage()
+{
+  const double host_bytes = Pair::memory_usage();
+  const double device_bytes = cmml_gpu_bytes();
+  return host_bytes + device_bytes;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Host-side evaluation for the host-built neighbor list (the branch of
+// compute() taken when gpu_mode is not GPU_NEIGH).
+//   start  first index into list->ilist handled here; entries start..inum-1
+//          are processed
+//   eflag  nonzero to evaluate energies
+//   vflag  not referenced; tallying is controlled by evflag
+// Forces are accumulated on atom i only and tallied with ev_tally_full(),
+// matching the full neighbor list requested in init_style().
+void PairCGCMMCoulLongGPU::cpu_compute(int start, int eflag, int vflag)
+{
+  int i,j,ii,jj,inum,jnum,itype;
+  double qtmp,xtmp,ytmp,ztmp;
+  double factor_coul,factor_lj;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  double **x = atom->x;
+  double **f = atom->f;
+  double *q = atom->q;
+  int *type = atom->type;
+  int nall = atom->nlocal + atom->nghost;
+  double *special_coul = force->special_coul;
+  double *special_lj = force->special_lj;
+  double qqrd2e = force->qqrd2e;
+
+  inum = list->inum;
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+
+  for (ii = start; ii < inum; ii++) {
+    i = ilist[ii];
+    qtmp = q[i];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+
+      // indices >= nall carry a special-bond selector in j/nall;
+      // the atom index itself is j % nall
+      if (j < nall) factor_coul = factor_lj = 1.0;
+      else {
+        factor_coul = special_coul[j/nall];
+        factor_lj = special_lj[j/nall];
+        j %= nall;
+      }
+
+      const double delx = xtmp - x[j][0];
+      const double dely = ytmp - x[j][1];
+      const double delz = ztmp - x[j][2];
+      const double rsq = delx*delx + dely*dely + delz*delz;
+      const int jtype = type[j];
+
+      double evdwl = 0.0;
+      double ecoul = 0.0;
+      double fpair = 0.0;
+
+      if (rsq < cutsq[itype][jtype]) {
+        const double r2inv = 1.0/rsq;
+        const int cgt=cg_type[itype][jtype];
+
+        double forcelj  = 0.0;
+        double forcecoul = 0.0;
+
+        // LJ part: the functional form depends on cg_type
+        if (rsq < cut_ljsq[itype][jtype]) {
+          forcelj=factor_lj;
+          if (eflag) evdwl=factor_lj;
+
+          if (cgt == CG_LJ12_4) {
+            const double r4inv=r2inv*r2inv;
+            forcelj *= r4inv*(lj1[itype][jtype]*r4inv*r4inv
+                       - lj2[itype][jtype]);
+            if (eflag) {
+              evdwl *= r4inv*(lj3[itype][jtype]*r4inv*r4inv
+                       - lj4[itype][jtype]) - offset[itype][jtype];
+            }
+          } else if (cgt == CG_LJ9_6) {
+            const double r3inv = r2inv*sqrt(r2inv);
+            const double r6inv = r3inv*r3inv;
+            forcelj *= r6inv*(lj1[itype][jtype]*r3inv
+                       - lj2[itype][jtype]);
+            if (eflag) {
+              evdwl *= r6inv*(lj3[itype][jtype]*r3inv
+                        - lj4[itype][jtype]) - offset[itype][jtype];
+            }
+          } else {
+            const double r6inv = r2inv*r2inv*r2inv;
+            forcelj *= r6inv*(lj1[itype][jtype]*r6inv
+                       - lj2[itype][jtype]);
+            if (eflag) {
+              evdwl *= r6inv*(lj3[itype][jtype]*r6inv
+                       - lj4[itype][jtype]) - offset[itype][jtype];
+            }
+          }
+        }
+
+        // Coulomb part: analytic erfc approximation, or table lookup when
+        // tables are enabled and rsq is beyond tabinnersq
+        if (rsq < cut_coulsq_global) {
+          if (!ncoultablebits || rsq <= tabinnersq) {
+            const double r = sqrt(rsq);
+            const double grij = g_ewald * r;
+            const double expm2 = exp(-grij*grij);
+            const double t = 1.0 / (1.0 + EWALD_P*grij);
+            const double erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
+            const double prefactor = qqrd2e * qtmp*q[j]/r;
+            forcecoul = prefactor * (erfc + EWALD_F*grij*expm2);
+            if (eflag) ecoul = prefactor*erfc;
+            if (factor_coul < 1.0) {
+              forcecoul -= (1.0-factor_coul)*prefactor;
+              if (eflag) ecoul -= (1.0-factor_coul)*prefactor;
+            }
+          } else {
+            // table index comes from the bit pattern of rsq as a float
+            union_int_float_t rsq_lookup;
+            rsq_lookup.f = rsq;
+            int itable = rsq_lookup.i & ncoulmask;
+            itable >>= ncoulshiftbits;
+            const double fraction = (rsq_lookup.f - rtable[itable]) *
+                                     drtable[itable];
+            const double table = ftable[itable] + fraction*dftable[itable];
+            forcecoul = qtmp*q[j] * table;
+            if (eflag) {
+              const double table2 = etable[itable] + fraction*detable[itable];
+              ecoul = qtmp*q[j] * table2;
+            }
+            if (factor_coul < 1.0) {
+              const double table2 = ctable[itable] + fraction*dctable[itable];
+              const double prefactor = qtmp*q[j] * table2;
+              forcecoul -= (1.0-factor_coul)*prefactor;
+              if (eflag) ecoul -= (1.0-factor_coul)*prefactor;
+            }
+          }
+        }
+        fpair = (forcecoul + forcelj) * r2inv;
+
+        f[i][0] += delx*fpair;
+        f[i][1] += dely*fpair;
+        f[i][2] += delz*fpair;
+
+        if (evflag) ev_tally_full(i,evdwl,ecoul,fpair,delx,dely,delz);
+      }
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Host-side evaluation for the neighbor data returned by the GPU library
+// (the GPU_NEIGH branch of compute()).
+//   nbors  neighbor data for atoms start..nlocal-1, read with a stride of
+//          (nlocal - start): the entry at offset (i - start) is the neighbor
+//          count of atom i, and its neighbor indices follow at successive
+//          strides
+//   start  first local atom index handled here
+//   eflag  nonzero to evaluate energies
+//   vflag  not referenced; tallying is controlled by evflag
+void PairCGCMMCoulLongGPU::cpu_compute(int *nbors, int start, int eflag,
+                                      int vflag)
+{
+  int i,j,jnum,itype,jtype;
+  double qtmp,xtmp,ytmp,ztmp,delx,dely,delz;
+  double factor_coul,factor_lj;
+  double rsq;
+
+  double **x = atom->x;
+  double **f = atom->f;
+  double *q = atom->q;
+  int *type = atom->type;
+  int nlocal = atom->nlocal;
+  int nall = nlocal + atom->nghost;
+  int stride = nlocal-start;
+  double *special_coul = force->special_coul;
+  double *special_lj = force->special_lj;
+  double qqrd2e = force->qqrd2e;
+
+  // loop over neighbors of my atoms
+
+  for (i = start; i < nlocal; i++) {
+    qtmp = q[i];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    int *nbor = nbors + i - start;
+    jnum = *nbor;
+    nbor += stride;
+    int *nbor_end = nbor + stride * jnum;
+
+    for (; nbor<nbor_end; nbor+=stride) {
+      j = *nbor;
+
+      // indices >= nall carry a special-bond selector in j/nall;
+      // the atom index itself is j % nall
+      if (j < nall) factor_coul = factor_lj = 1.0;
+      else {
+        factor_coul = special_coul[j/nall];
+        factor_lj = special_lj[j/nall];
+        j %= nall;
+      }
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      double evdwl = 0.0;
+      double ecoul = 0.0;
+      double fpair = 0.0;
+
+      if (rsq < cutsq[itype][jtype]) {
+        const double r2inv = 1.0/rsq;
+        const int cgt=cg_type[itype][jtype];
+
+        double forcelj  = 0.0;
+        double forcecoul = 0.0;
+
+        // LJ part: the functional form depends on cg_type
+        if (rsq < cut_ljsq[itype][jtype]) {
+          forcelj=factor_lj;
+          if (eflag) evdwl=factor_lj;
+
+          if (cgt == CG_LJ12_4) {
+            const double r4inv=r2inv*r2inv;
+            forcelj *= r4inv*(lj1[itype][jtype]*r4inv*r4inv
+                       - lj2[itype][jtype]);
+            if (eflag) {
+              evdwl *= r4inv*(lj3[itype][jtype]*r4inv*r4inv
+                       - lj4[itype][jtype]) - offset[itype][jtype];
+            }
+          } else if (cgt == CG_LJ9_6) {
+            const double r3inv = r2inv*sqrt(r2inv);
+            const double r6inv = r3inv*r3inv;
+            forcelj *= r6inv*(lj1[itype][jtype]*r3inv
+                       - lj2[itype][jtype]);
+            if (eflag) {
+              evdwl *= r6inv*(lj3[itype][jtype]*r3inv
+                        - lj4[itype][jtype]) - offset[itype][jtype];
+            }
+          } else {
+            const double r6inv = r2inv*r2inv*r2inv;
+            forcelj *= r6inv*(lj1[itype][jtype]*r6inv
+                       - lj2[itype][jtype]);
+            if (eflag) {
+              evdwl *= r6inv*(lj3[itype][jtype]*r6inv
+                       - lj4[itype][jtype]) - offset[itype][jtype];
+            }
+          }
+        }
+
+        // Coulomb part: analytic erfc approximation, or table lookup when
+        // tables are enabled and rsq is beyond tabinnersq
+        if (rsq < cut_coulsq_global) {
+          if (!ncoultablebits || rsq <= tabinnersq) {
+            const double r = sqrt(rsq);
+            const double grij = g_ewald * r;
+            const double expm2 = exp(-grij*grij);
+            const double t = 1.0 / (1.0 + EWALD_P*grij);
+            const double erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
+            const double prefactor = qqrd2e * qtmp*q[j]/r;
+            forcecoul = prefactor * (erfc + EWALD_F*grij*expm2);
+            if (eflag) ecoul = prefactor*erfc;
+            if (factor_coul < 1.0) {
+              forcecoul -= (1.0-factor_coul)*prefactor;
+              if (eflag) ecoul -= (1.0-factor_coul)*prefactor;
+            }
+          } else {
+            // table index comes from the bit pattern of rsq as a float
+            union_int_float_t rsq_lookup;
+            rsq_lookup.f = rsq;
+            int itable = rsq_lookup.i & ncoulmask;
+            itable >>= ncoulshiftbits;
+            const double fraction = (rsq_lookup.f - rtable[itable]) *
+                                     drtable[itable];
+            const double table = ftable[itable] + fraction*dftable[itable];
+            forcecoul = qtmp*q[j] * table;
+            if (eflag) {
+              const double table2 = etable[itable] + fraction*detable[itable];
+              ecoul = qtmp*q[j] * table2;
+            }
+            if (factor_coul < 1.0) {
+              const double table2 = ctable[itable] + fraction*dctable[itable];
+              const double prefactor = qtmp*q[j] * table2;
+              forcecoul -= (1.0-factor_coul)*prefactor;
+              if (eflag) ecoul -= (1.0-factor_coul)*prefactor;
+            }
+          }
+        }
+        fpair = (forcecoul + forcelj) * r2inv;
+
+        f[i][0] += delx*fpair;
+        f[i][1] += dely*fpair;
+        f[i][2] += delz*fpair;
+
+        // neighbors with index below start: tally for atom i only.
+        // otherwise the opposite force is also applied to j when j is a
+        // local atom, and the pair is tallied with ev_tally (newton off).
+        if (j<start) {
+          if (evflag) ev_tally_full(i,evdwl,ecoul,fpair,delx,dely,delz);
+        } else {
+          if (j<nlocal) {
+            f[j][0] -= delx*fpair;
+            f[j][1] -= dely*fpair;
+            f[j][2] -= delz*fpair;
+          }
+          if (evflag) ev_tally(i,j,nlocal,0,
+                               evdwl,ecoul,fpair,delx,dely,delz);
+        }
+      }
+    }
+  }
+}
+
--- a/src/GPU/pair_cg_cmm_coul_long_gpu.h
+++ b/src/GPU/pair_cg_cmm_coul_long_gpu.h
@ -0,0 +1,48 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(cg/cmm/coul/long/gpu,PairCGCMMCoulLongGPU)
+
+#else
+
+#ifndef LMP_PAIR_CG_CMM_COUL_LONG_GPU_H
+#define LMP_PAIR_CG_CMM_COUL_LONG_GPU_H
+
+#include "pair_cg_cmm_coul_long.h"
+
+namespace LAMMPS_NS {
+
+// GPU-accelerated variant of PairCGCMMCoulLong, registered under the style
+// name cg/cmm/coul/long/gpu (see PairStyle above).
+class PairCGCMMCoulLongGPU : public PairCGCMMCoulLong {
+ public:
+  PairCGCMMCoulLongGPU(LAMMPS *lmp);
+  ~PairCGCMMCoulLongGPU();
+  // Host-side evaluation using the host-built neighbor list:
+  // (start, eflag, vflag).
+  void cpu_compute(int, int, int);
+  // Host-side evaluation using neighbor data returned by the GPU library:
+  // (nbors, start, eflag, vflag).
+  void cpu_compute(int *, int, int, int);
+  // Entry point for force computation: (eflag, vflag).
+  void compute(int, int);
+  void init_style();
+  // Base Pair memory plus the amount reported by the GPU library.
+  double memory_usage();
+
+ // Values held in gpu_mode; GPU_NEIGH selects the code path where the GPU
+ // library supplies the neighbor data.
+ enum { GPU_PAIR, GPU_NEIGH };
+
+ private:
+  int gpu_mode;     // one of the enum values above
+  double cpu_time;  // wall time of the last host-side cpu_compute() call
+  int *gpulist;     // neighbor data returned by the library in GPU_NEIGH mode
+};
+
+}
+#endif
+#endif
+
--- a/src/GPU/pair_cg_cmm_gpu.cpp
+++ b/src/GPU/pair_cg_cmm_gpu.cpp
@ -0,0 +1,362 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+   
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+   
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Mike Brown (SNL)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "stdio.h"
+#include "stdlib.h"
+#include "pair_cg_cmm_gpu.h"
+#include "atom.h"
+#include "atom_vec.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "integrate.h"
+#include "memory.h"
+#include "error.h"
+#include "neigh_request.h"
+#include "universe.h"
+#include "update.h"
+#include "domain.h"
+#include "string.h"
+
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+// External functions from cuda library for atom decomposition
+
+bool cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types, 
+                  double **host_lj1, double **host_lj2, double **host_lj3,
+                  double **host_lj4, double **offset, double *special_lj,
+                  const int nlocal, const int nall, const int max_nbors,
+                  const int maxspecial, const double cell_size, int &gpu_mode,
+                  FILE *screen);  // a false return is treated as a fatal error in init_style(); gpu_mode is taken by reference and tested against GPU_NEIGH afterwards
+void cmm_gpu_clear();  // called from the destructor
+int * cmm_gpu_compute_n(const int timestep, const int ago, const int inum,
+	 	        const int nall, double **host_x, int *host_type, 
+                        double *boxlo, double *boxhi, int *tag, int **nspecial,
+                        int **special, const bool eflag, const bool vflag,
+                        const bool eatom, const bool vatom, int &host_start,
+                        const double cpu_time, bool &success);  // used in GPU_NEIGH mode; the returned pointer is stored in gpulist and passed to cpu_compute(int *, ...)
+void cmm_gpu_compute(const int timestep, const int ago, const int inum,
+	 	     const int nall, double **host_x, int *host_type,
+                     int *ilist, int *numj, int **firstneigh,
+		     const bool eflag, const bool vflag, const bool eatom,
+                     const bool vatom, int &host_start, const double cpu_time,
+                     bool &success);  // used otherwise, with the LAMMPS neighbor list; host_start and success are read back by compute()
+double cmm_gpu_bytes();  // added to Pair::memory_usage() in memory_usage()
+
+using namespace LAMMPS_NS;
+
+/* ----------------------------------------------------------------------
+   constructor: start in GPU_PAIR mode with zero accumulated host time
+   and no neighbor data from the GPU library; respa_enable is cleared
+------------------------------------------------------------------------- */
+
+PairCGCMMGPU::PairCGCMMGPU(LAMMPS *lmp) : PairCGCMM(lmp), gpu_mode(GPU_PAIR),
+                                          cpu_time(0.0), gpulist(NULL)
+{
+  // gpulist is only assigned in compute() when gpu_mode == GPU_NEIGH, so it
+  // is given a defined value here rather than left indeterminate
+  respa_enable = 0;
+}
+
+/* ----------------------------------------------------------------------
+   free all arrays
+   (the only action taken here is the call to cmm_gpu_clear() declared
+   at the top of this file)
+------------------------------------------------------------------------- */
+
+PairCGCMMGPU::~PairCGCMMGPU()
+{
+  cmm_gpu_clear();
+}
+
+/* ----------------------------------------------------------------------
+   hand the force computation to the GPU library, then evaluate on the
+   host the atoms from host_start up to inum that the library call
+   reported back through host_start
+------------------------------------------------------------------------- */
+
+void PairCGCMMGPU::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else {
+    vflag_fdotr = 0;
+    evflag = 0;
+  }
+
+  const bool neigh_mode = (gpu_mode == GPU_NEIGH);
+  const int nall = atom->nlocal + atom->nghost;
+  int inum, host_start;
+  bool success = true;
+
+  if (!neigh_mode) {
+    // use the neighbor list requested from LAMMPS in init_style()
+    inum = list->inum;
+    cmm_gpu_compute(update->ntimestep, neighbor->ago, inum, nall, atom->x,
+                    atom->type, list->ilist, list->numneigh, list->firstneigh,
+                    eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time,
+                    success);
+  } else {
+    // no LAMMPS list is requested in this mode; keep the pointer returned
+    // by the library for the host loop below
+    inum = atom->nlocal;
+    gpulist = cmm_gpu_compute_n(update->ntimestep, neighbor->ago, inum, nall,
+                                atom->x, atom->type, domain->sublo,
+                                domain->subhi, atom->tag, atom->nspecial,
+                                atom->special, eflag, vflag, eflag_atom,
+                                vflag_atom, host_start, cpu_time, success);
+  }
+
+  if (!success)
+    error->one("Out of memory on GPGPU");
+
+  // nothing was left for the host
+  if (host_start >= inum) return;
+
+  // time the host part; the result is passed to the library on the next call
+  const double t_begin = MPI_Wtime();
+  if (neigh_mode) cpu_compute(gpulist, host_start, eflag, vflag);
+  else cpu_compute(host_start, eflag, vflag);
+  cpu_time = MPI_Wtime() - t_begin;
+}
+
+/* ----------------------------------------------------------------------
+   init specific to this pair style
+   recomputes cutsq, initializes the GPU library and, unless gpu_mode is
+   GPU_NEIGH after initialization, requests a full neighbor list
+------------------------------------------------------------------------- */
+
+void PairCGCMMGPU::init_style()
+{
+  cut_respa = NULL;
+
+  if (force->pair_match("gpu",0) == NULL)
+    error->all("Cannot use pair hybrid with multiple GPU pair styles");
+
+  // Repeat cutsq calculation because done after call to init_style
+  // maxcut holds the largest squared cutoff seen over all type pairs
+  double maxcut = -1.0;
+  for (int i = 1; i <= atom->ntypes; i++) {
+    for (int j = i; j <= atom->ntypes; j++) {
+      if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
+        double cut = init_one(i,j);
+        cut *= cut;
+        if (cut > maxcut)
+          maxcut = cut;
+        cutsq[i][j] = cutsq[j][i] = cut;
+      } else
+        cutsq[i][j] = cutsq[j][i] = 0.0;
+    }
+  }
+  double cell_size = sqrt(maxcut) + neighbor->skin;
+
+  // maxspecial stays 0 for non-molecular systems
+  int maxspecial=0;
+  if (atom->molecular)
+    maxspecial=atom->maxspecial;
+
+  // value handed to the library as its max_nbors argument
+  const int max_nbors = 300;
+
+  bool init_ok = cmm_gpu_init(atom->ntypes+1,cutsq,cg_type,lj1,lj2,lj3,lj4,
+                              offset, force->special_lj, atom->nlocal,
+                              atom->nlocal+atom->nghost, max_nbors, maxspecial,
+                              cell_size, gpu_mode, screen);
+  if (!init_ok)
+    error->one("Insufficient memory on accelerator (or no fix gpu).");
+
+  if (force->newton_pair)
+    error->all("Cannot use newton pair with GPU CGCMM pair style");
+
+  // a full LAMMPS neighbor list is only needed outside GPU_NEIGH mode
+  if (gpu_mode != GPU_NEIGH) {
+    int irequest = neighbor->request(this);
+    neighbor->requests[irequest]->half = 0;
+    neighbor->requests[irequest]->full = 1;
+  }
+}
+
+/* ----------------------------------------------------------------------
+   memory reported by the base Pair class plus what the GPU library
+   reports through cmm_gpu_bytes()
+------------------------------------------------------------------------- */
+
+double PairCGCMMGPU::memory_usage()
+{
+  const double pair_bytes = Pair::memory_usage();
+  const double library_bytes = cmm_gpu_bytes();
+  return pair_bytes + library_bytes;
+}
+
+/* ----------------------------------------------------------------------
+   host-side force loop using the LAMMPS neighbor list
+   handles atoms ilist[start] .. ilist[inum-1]; the force is added to
+   atom i only and energy/virial go through ev_tally_full()
+   start = first position in ilist to process
+   eflag = nonzero if the pair energy must be evaluated
+   vflag = not read here (tallying is gated by evflag)
+------------------------------------------------------------------------- */
+
+void PairCGCMMGPU::cpu_compute(int start, int eflag, int vflag) {
+  int i,j,ii,jj,jnum,itype,jtype;
+  double xtmp,ytmp,ztmp,delx,dely,delz,fpair;
+  double rsq,r2inv,factor_lj;
+  int *jlist;
+
+  // evdwl is only assigned per pair when eflag is set; start it at zero so
+  // a virial-only step never passes an indeterminate value to the tally
+  double evdwl = 0.0;
+
+  double **x = atom->x;
+  double **f = atom->f;
+  int *type = atom->type;
+  const int nall = atom->nlocal + atom->nghost;
+  double *special_lj = force->special_lj;
+
+  const int inum = list->inum;
+  int *ilist = list->ilist;
+  int *numneigh = list->numneigh;
+  int **firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+
+  for (ii = start; ii < inum; ii++) {
+    i = ilist[ii];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+
+      // an index >= nall selects special_lj[j/nall]; the atom is j % nall
+      if (j < nall) factor_lj = 1.0;
+      else {
+        factor_lj = special_lj[j/nall];
+        j %= nall;
+      }
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq < cutsq[itype][jtype]) {
+        const int cgt=cg_type[itype][jtype];
+        r2inv = 1.0/rsq;
+
+        // fpair and evdwl start as the special-bond factor and are then
+        // multiplied by the functional form selected by cgt
+        fpair = factor_lj;
+        if (eflag) evdwl = factor_lj;
+        if (cgt == CG_LJ12_4) {
+          const double r4inv = r2inv*r2inv;
+          fpair *= r4inv*(lj1[itype][jtype]*r4inv*r4inv
+                          - lj2[itype][jtype]);
+          if (eflag) {
+            evdwl *= r4inv*(lj3[itype][jtype]*r4inv*r4inv
+                            - lj4[itype][jtype]) - offset[itype][jtype];
+          }
+        } else if (cgt == CG_LJ9_6) {
+          const double r3inv = r2inv*sqrt(r2inv);
+          const double r6inv = r3inv*r3inv;
+          fpair *= r6inv*(lj1[itype][jtype]*r3inv
+                          - lj2[itype][jtype]);
+          if (eflag) {
+            evdwl *= r6inv*(lj3[itype][jtype]*r3inv
+                            - lj4[itype][jtype]) - offset[itype][jtype];
+          }
+        } else {
+          const double r6inv = r2inv*r2inv*r2inv;
+          fpair *= r6inv*(lj1[itype][jtype]*r6inv
+                          - lj2[itype][jtype]);
+          if (eflag) {
+            evdwl *= r6inv*(lj3[itype][jtype]*r6inv
+                            - lj4[itype][jtype]) - offset[itype][jtype];
+          }
+        }
+        fpair *= r2inv;
+
+        f[i][0] += delx*fpair;
+        f[i][1] += dely*fpair;
+        f[i][2] += delz*fpair;
+
+        if (evflag) ev_tally_full(i,evdwl,0.0,fpair,delx,dely,delz);
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   host-side force loop using the neighbor data returned by the GPU
+   library (GPU_NEIGH mode); handles atoms start .. nlocal-1
+   nbors = neighbor data: entry (i - start) is the neighbor count of atom
+           i, followed by its neighbors spaced (nlocal - start) apart
+   start = first local atom to process
+   eflag = nonzero if the pair energy must be evaluated
+   vflag = not read here (tallying is gated by evflag)
+------------------------------------------------------------------------- */
+
+void PairCGCMMGPU::cpu_compute(int *nbors, int start, int eflag, int vflag) {
+  int i,j,itype,jtype;
+  int nlocal = atom->nlocal;
+  int nall = nlocal + atom->nghost;
+  int stride = nlocal-start;
+  double xtmp,ytmp,ztmp,delx,dely,delz,fpair;
+  double rsq,r2inv,factor_lj;
+  double *special_lj = force->special_lj;
+
+  // evdwl is only assigned per pair when eflag is set; start it at zero so
+  // a virial-only step never passes an indeterminate value to the tally
+  double evdwl = 0.0;
+
+  double **x = atom->x;
+  double **f = atom->f;
+  int *type = atom->type;
+
+  // loop over neighbors of my atoms
+
+  for (i = start; i < nlocal; i++) {
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+
+    // first entry for atom i is its neighbor count, then one neighbor
+    // every stride entries
+    int *nbor = nbors + i - start;
+    int jnum = *nbor;
+    nbor += stride;
+    int *nbor_end = nbor + stride * jnum;
+
+    for (; nbor<nbor_end; nbor+=stride) {
+      j = *nbor;
+
+      // an index >= nall selects special_lj[j/nall]; the atom is j % nall
+      if (j < nall) factor_lj = 1.0;
+      else {
+        factor_lj = special_lj[j/nall];
+        j %= nall;
+      }
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq < cutsq[itype][jtype]) {
+        const int cgt=cg_type[itype][jtype];
+        r2inv = 1.0/rsq;
+
+        // fpair and evdwl start as the special-bond factor and are then
+        // multiplied by the functional form selected by cgt
+        fpair = factor_lj;
+        if (eflag) evdwl = factor_lj;
+        if (cgt == CG_LJ12_4) {
+          const double r4inv = r2inv*r2inv;
+          fpair *= r4inv*(lj1[itype][jtype]*r4inv*r4inv
+                          - lj2[itype][jtype]);
+          if (eflag) {
+            evdwl *= r4inv*(lj3[itype][jtype]*r4inv*r4inv
+                            - lj4[itype][jtype]) - offset[itype][jtype];
+          }
+        } else if (cgt == CG_LJ9_6) {
+          const double r3inv = r2inv*sqrt(r2inv);
+          const double r6inv = r3inv*r3inv;
+          fpair *= r6inv*(lj1[itype][jtype]*r3inv
+                          - lj2[itype][jtype]);
+          if (eflag) {
+            evdwl *= r6inv*(lj3[itype][jtype]*r3inv
+                            - lj4[itype][jtype]) - offset[itype][jtype];
+          }
+        } else {
+          const double r6inv = r2inv*r2inv*r2inv;
+          fpair *= r6inv*(lj1[itype][jtype]*r6inv
+                          - lj2[itype][jtype]);
+          if (eflag) {
+            evdwl *= r6inv*(lj3[itype][jtype]*r6inv
+                            - lj4[itype][jtype]) - offset[itype][jtype];
+          }
+        }
+        fpair *= r2inv;
+
+        f[i][0] += delx*fpair;
+        f[i][1] += dely*fpair;
+        f[i][2] += delz*fpair;
+
+        if (j<start) {
+          // atom j is outside the range handled by this loop: one-sided tally
+          if (evflag) ev_tally_full(i,evdwl,0.0,fpair,delx,dely,delz);
+        } else {
+          // apply the opposite force to j when it is a local atom
+          if (j<nlocal) {
+            f[j][0] -= delx*fpair;
+            f[j][1] -= dely*fpair;
+            f[j][2] -= delz*fpair;
+          }
+          if (evflag) ev_tally(i,j,nlocal,0,
+                               evdwl,0.0,fpair,delx,dely,delz);
+        }
+      }
+    }
+  }
+}
+
--- a/src/GPU/pair_cg_cmm_gpu.h
+++ b/src/GPU/pair_cg_cmm_gpu.h
@ -0,0 +1,47 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(cg/cmm/gpu,PairCGCMMGPU)
+
+#else
+
+#ifndef LMP_PAIR_CG_CMM_GPU_H
+#define LMP_PAIR_CG_CMM_GPU_H
+
+#include "pair_cg_cmm.h"
+
+namespace LAMMPS_NS {
+
+class PairCGCMMGPU : public PairCGCMM {  // registered above as pair style cg/cmm/gpu
+ public:
+  PairCGCMMGPU(LAMMPS *lmp);
+  ~PairCGCMMGPU();                         // calls cmm_gpu_clear()
+  void cpu_compute(int, int, int);         // (start, eflag, vflag): host loop over the LAMMPS neighbor list
+  void cpu_compute(int *, int, int, int);  // (nbors, start, eflag, vflag): host loop over the list returned by the GPU library
+  void compute(int, int);                  // (eflag, vflag): GPU library call followed by the host loop when host_start < inum
+  void init_style();
+  double memory_usage();                   // Pair::memory_usage() + cmm_gpu_bytes()
+
+ enum { GPU_PAIR, GPU_NEIGH };  // possible values of gpu_mode
+
+ private:
+  int gpu_mode;     // GPU_PAIR in the constructor; passed by reference to cmm_gpu_init()
+  double cpu_time;  // wall time of the last host-side cpu_compute(), passed to the next GPU library call
+  int *gpulist;     // pointer returned by cmm_gpu_compute_n(); only assigned in GPU_NEIGH mode
+};
+
+}
+#endif
+#endif
--- a/src/GPU/pair_gayberne_gpu.cpp
+++ b/src/GPU/pair_gayberne_gpu.cpp
@ -31,123 +31,47 @@
 #include "error.h"
 #include "neigh_request.h"
 #include "universe.h"
-
-#include <string>
-
-#ifdef GB_GPU_OMP
-#include "omp.h"
-#endif
+#include "domain.h"
+#include "update.h"
+#include "string.h"

 #define MIN(a,b) ((a) < (b) ? (a) : (b))
 #define MAX(a,b) ((a) > (b) ? (a) : (b))

-#ifndef WINDLL
-
 // External functions from cuda library for atom decomposition

-bool gb_gpu_init(int &ij_size, const int ntypes, const double gamma,
-                 const double upsilon, const double mu, double **shape,
-                 double **well, double **cutsq, double **sigma, 
-                 double **epsilon, double *host_lshape, int **form,
-                 double **host_lj1, double **host_lj2, double **host_lj3, 
-                 double **host_lj4, double **offset, double *special_lj, 
-                 const int nlocal, const int nall, const int max_nbors, 
-                 const int thread, const int gpu_id);
-void gb_gpu_clear(const int thread);
-int * gb_gpu_reset_nbors(const int nall, const int nlocal, const int inum, 
-                         int *ilist, const int *numj, const int *type,
-                         const int thread, bool &success);
-void gb_gpu_nbors(const int *ij, const int num_ij, const bool eflag, 
-                  const int thread);
-void gb_gpu_atom(double **host_x, double **host_quat, const int *host_type, 
-                 const bool rebuild, const int thread);
-void gb_gpu_gayberne(const bool eflag, const bool vflag, const bool rebuild, 
-                     const int thread);
-double gb_gpu_forces(double **f, double **tor, const int *ilist,
-                     const bool eflag, const bool vflag, const bool eflag_atom,
-                     const bool vflag_atom, double *eatom, double **vatom,
-                     double *virial, const int thread);
-void gb_gpu_name(const int gpu_id, const int max_nbors, char * name);
-void gb_gpu_time(const int thread);
-int gb_gpu_num_devices();
+bool gb_gpu_init(const int ntypes, const double gamma, const double upsilon,
+                 const double mu, double **shape, double **well, double **cutsq,
+                 double **sigma, double **epsilon, double *host_lshape,
+                 int **form, double **host_lj1, double **host_lj2,
+                 double **host_lj3, double **host_lj4, double **offset,
+                 double *special_lj, const int nlocal, const int nall,
+                 const int max_nbors, const double cell_size,
+                 int &gpu_mode, FILE *screen);  // a false return is treated as a fatal error in init_style(); gpu_mode is taken by reference and tested against GPU_NEIGH afterwards
+void gb_gpu_clear();  // called from the destructor
+int * gb_gpu_compute_n(const int timestep, const int ago, const int inum,
+	 	       const int nall, double **host_x, int *host_type,
+                       double *boxlo, double *boxhi, const bool eflag,
+		       const bool vflag, const bool eatom, const bool vatom,
+                       int &host_start, const double cpu_time, bool &success,
+		       double **host_quat);  // used in GPU_NEIGH mode; the returned pointer is stored in gpulist by compute()
+int * gb_gpu_compute(const int timestep, const int ago, const int inum,
+	 	     const int nall, double **host_x, int *host_type,
+                     int *ilist, int *numj, int **firstneigh,
+		     const bool eflag, const bool vflag, const bool eatom,
+                     const bool vatom, int &host_start, const double cpu_time,
+                     bool &success, double **host_quat);  // used otherwise; the returned pointer is stored in olist, which cpu_compute(int,int,int) walks as its ilist
 double gb_gpu_bytes();

-#else
-#include <windows.h>
-
-typedef bool (*_gb_gpu_init)(int &ij_size, const int ntypes, const double gamma,
-                 const double upsilon, const double mu, double **shape,
-                 double **well, double **cutsq, double **sigma, 
-                 double **epsilon, double *host_lshape, int **form,
-                 double **host_lj1, double **host_lj2, double **host_lj3, 
-                 double **host_lj4, double **offset, double *special_lj, 
-                 const int nlocal, const int nall, const int max_nbors, 
-                 const int thread, const int gpu_id);
-typedef void (*_gb_gpu_clear)(const int thread);
-typedef int * (*_gb_gpu_reset_nbors)(const int nall, const int nlocal, 
-                 const int inum, int *ilist, const int *numj, const int *type,
-                 const int thread, bool &success);
-typedef void (*_gb_gpu_nbors)(const int *ij, const int num_ij, const bool eflag, 
-                 const int thread);
-typedef void (*_gb_gpu_atom)(double **host_x, double **host_quat, 
-                 const int *host_type, const bool rebuild, const int thread);
-typedef void (*_gb_gpu_gayberne)(const bool eflag, const bool vflag, 
-                 const bool rebuild, const int thread);
-typedef double (*_gb_gpu_forces)(double **f, double **tor, const int *ilist,
-                 const bool eflag, const bool vflag, const bool eflag_atom,
-                 const bool vflag_atom, double *eatom, double **vatom,
-                 double *virial, const int thread);
-typedef void (*_gb_gpu_name)(const int gpu_id, const int max_nbors, 
-                 char * name);
-typedef void (*_gb_gpu_time)(const int thread);
-typedef int (*_gb_gpu_num_devices)();
-typedef double (*_gb_gpu_bytes)();
-
-_gb_gpu_init gb_gpu_init;
-_gb_gpu_clear gb_gpu_clear;
-_gb_gpu_reset_nbors gb_gpu_reset_nbors;
-_gb_gpu_nbors gb_gpu_nbors;
-_gb_gpu_atom gb_gpu_atom;
-_gb_gpu_gayberne gb_gpu_gayberne;
-_gb_gpu_forces gb_gpu_forces;
-_gb_gpu_name gb_gpu_name;
-_gb_gpu_time gb_gpu_time;
-_gb_gpu_num_devices gb_gpu_num_devices;
-_gb_gpu_bytes gb_gpu_bytes;
-
-#endif
-
 using namespace LAMMPS_NS;

+enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};  // values compared against form[itype][itype] in cpu_compute()
+
 /* ---------------------------------------------------------------------- */

-PairGayBerneGPU::PairGayBerneGPU(LAMMPS *lmp) : PairGayBerne(lmp), my_thread(0),
-                                                omp_chunk(0), nthreads(1),
-                                                multi_gpu_mode(ONE_NODE),
-                                                multi_gpu_param(0),
-                                                output_time(false)
+PairGayBerneGPU::PairGayBerneGPU(LAMMPS *lmp) : PairGayBerne(lmp),
+                                                gpu_mode(GPU_PAIR)
 {
-  ij_new[0]=NULL;
-  
-#ifdef WINDLL
-  HINSTANCE hinstLib = LoadLibrary(TEXT("gpu.dll"));
-  if (hinstLib == NULL) {
-    printf("\nUnable to load gpu.dll\n");
-    exit(1);
-  }
-
-  gb_gpu_init=(_gb_gpu_init)GetProcAddress(hinstLib,"gb_gpu_init");
-  gb_gpu_clear=(_gb_gpu_clear)GetProcAddress(hinstLib,"gb_gpu_clear");
-  gb_gpu_reset_nbors=(_gb_gpu_reset_nbors)GetProcAddress(hinstLib,"gb_gpu_reset_nbors");
-  gb_gpu_nbors=(_gb_gpu_nbors)GetProcAddress(hinstLib,"gb_gpu_nbors");
-  gb_gpu_atom=(_gb_gpu_atom)GetProcAddress(hinstLib,"gb_gpu_atom");
-  gb_gpu_gayberne=(_gb_gpu_gayberne)GetProcAddress(hinstLib,"gb_gpu_gayberne");
-  gb_gpu_forces=(_gb_gpu_forces)GetProcAddress(hinstLib,"gb_gpu_forces");
-  gb_gpu_name=(_gb_gpu_name)GetProcAddress(hinstLib,"gb_gpu_name");
-  gb_gpu_time=(_gb_gpu_time)GetProcAddress(hinstLib,"gb_gpu_time");
-  gb_gpu_num_devices=(_gb_gpu_num_devices)GetProcAddress(hinstLib,"gb_gpu_num_devices");
-  gb_gpu_bytes=(_gb_gpu_bytes)GetProcAddress(hinstLib,"gb_gpu_bytes");
-#endif
+  // compute() passes cpu_time to gb_gpu_compute()/gb_gpu_compute_n() before
+  // it is ever measured, so it needs a defined starting value
+  cpu_time = 0.0;
 }

 /* ----------------------------------------------------------------------
@ -156,26 +80,8 @@ PairGayBerneGPU::PairGayBerneGPU(LAMMPS *lmp) : PairGayBerne(lmp), my_thread(0),

 PairGayBerneGPU::~PairGayBerneGPU()
 {
-  if (output_time) {
-    printf("\n\n-------------------------------------");
-    printf("--------------------------------\n");
-    printf("      GPU Time Stamps (on proc 0): ");
-    printf("\n-------------------------------------");
-    printf("--------------------------------\n");
-    gb_gpu_time(my_thread);
-    printf("-------------------------------------");
-    printf("--------------------------------\n\n");
-  }
-  
-  #pragma omp parallel
-  {
-    #ifdef GB_GPU_OMP
-    int my_thread=omp_get_thread_num();
-    #endif
-    gb_gpu_clear(my_thread);
-    if (ij_new[my_thread]!=NULL)
-      delete [] ij_new[my_thread];
-  }
+  // the only cleanup performed here is the GPU library's clear call
+  gb_gpu_clear();
 }

 /* ---------------------------------------------------------------------- */
@ -183,141 +89,38 @@ PairGayBerneGPU::~PairGayBerneGPU()
 void PairGayBerneGPU::compute(int eflag, int vflag)
 {
  if (eflag || vflag) ev_setup(eflag,vflag);
-  else evflag = vflag_fdotr = eflag_atom = vflag_atom = 0;
-  if (vflag_atom) 
-    error->all("Per-atom virial not available with GPU Gay-Berne");
+  else evflag = vflag_fdotr = 0;

-  int nlocal = atom->nlocal;
-  int nall = nlocal + atom->nghost;
-  int inum = list->inum;
-  
-  bool rebuild=false;
-  if (neighbor->ncalls > last_neighbor) {
-    last_neighbor=neighbor->ncalls;
-    rebuild=true;
+  int nall = atom->nlocal + atom->nghost;
+  int inum, host_start;  // host_start is passed by reference to the gb_gpu_compute* call below
+
+  bool success = true;
+
+  if (gpu_mode == GPU_NEIGH) {
+    inum = atom->nlocal;
+    gpulist = gb_gpu_compute_n(update->ntimestep, neighbor->ago, inum, nall,  // returned pointer is handed to cpu_compute(int *, ...) below
+			       atom->x, atom->type, domain->sublo, domain->subhi,
+			       eflag, vflag, eflag_atom, vflag_atom, host_start,
+                               cpu_time, success, atom->quat);
+  } else {
+    inum = list->inum;
+    olist = gb_gpu_compute(update->ntimestep, neighbor->ago, inum, nall, atom->x,  // returned pointer is used as ilist by cpu_compute(int,int,int)
+			   atom->type, list->ilist, list->numneigh,
+			   list->firstneigh, eflag, vflag, eflag_atom,
+                           vflag_atom, host_start, cpu_time, success,
+                           atom->quat);
  }
-  
-  #pragma omp parallel
-  {
-    
-  bool success=true;
-  #ifdef GB_GPU_OMP
-  int my_thread=omp_get_thread_num();
-  if (rebuild) {
-    omp_chunk=static_cast<int>(ceil(static_cast<double>(inum)/nthreads));
-    if (my_thread==nthreads-1)
-      thread_inum[my_thread]=inum-(nthreads-1)*omp_chunk;
-    else
-      thread_inum[my_thread]=omp_chunk;
-    olist[my_thread]=gb_gpu_reset_nbors(nall, atom->nlocal, 
-                                        thread_inum[my_thread],  
-                                        list->ilist+omp_chunk*my_thread, 
-                                        list->numneigh, atom->type, my_thread,
-                                        success);
-  }
-  #else
-  if (rebuild)
-    olist[my_thread]=gb_gpu_reset_nbors(nall, atom->nlocal, inum, list->ilist, 
-                                        list->numneigh, atom->type, my_thread,
-                                        success);
-  #endif
  if (!success)
    error->one("Out of memory on GPGPU");
-  
-  // copy atom data to GPU
-  gb_gpu_atom(atom->x,atom->quat,atom->type,rebuild,my_thread);

-  int i,j,ii,jj,jnum;
-  double factor_lj;
-  int *jlist;
-
-  if (rebuild==true) {
-    int num_ij = 0;
-
-    // loop over neighbors of my atoms
-    int *ijp=ij_new[my_thread];
-    #ifdef GB_GPU_OMP
-    int mgo=my_thread*omp_chunk;
-    int mgot=mgo+thread_inum[my_thread];
-    #else
-    int mgo=0, mgot=inum;
-    #endif
-    for (ii = mgo; ii<mgot; ii++) {
-      i = olist[my_thread][ii];
-      jlist = list->firstneigh[i];
-      jnum = list->numneigh[i];
-      
-      for (jj = 0; jj < jnum; jj++) {
-        j = jlist[jj];
-
-        *ijp=j;
-        ijp++;
-        num_ij++;
-          
-        if (num_ij==ij_size) {
-          gb_gpu_nbors(ij_new[my_thread],num_ij,eflag,my_thread);
-          ijp=ij_new[my_thread];
-          num_ij=0;
-        }
-      }
-    }
-    if (num_ij>0)
-      gb_gpu_nbors(ij_new[my_thread],num_ij,eflag,my_thread);
+  if (host_start < inum) {  // the host loop runs only when host_start is below inum
+    cpu_time = MPI_Wtime();
+    if (gpu_mode == GPU_NEIGH)
+      cpu_compute(gpulist,host_start,eflag,vflag);
+    else
+      cpu_compute(host_start,eflag,vflag);
+    cpu_time = MPI_Wtime() - cpu_time;  // elapsed host time, passed to the GPU library on the next call
  }
-  
-  gb_gpu_gayberne(eflag,vflag,rebuild,my_thread);
-  double lvirial[6];
-  for (int i=0; i<6; i++) lvirial[i]=0.0;
-  double my_eng=gb_gpu_forces(atom->f,atom->torque,olist[my_thread],eflag,vflag,
-                              eflag_atom, vflag_atom, eatom, vatom, lvirial,
-                              my_thread);
-  #pragma omp critical
-  {
-    eng_vdwl+=my_eng;
-    virial[0]+=lvirial[0];
-    virial[1]+=lvirial[1];
-    virial[2]+=lvirial[2];
-    virial[3]+=lvirial[3];
-    virial[4]+=lvirial[4];
-    virial[5]+=lvirial[5];
-  }
-  
-  } //End parallel
-  
-  if (vflag_fdotr) virial_compute();
-}
-
-/* ----------------------------------------------------------------------
-   global settings
-------------------------------------------------------------------------- */
-
-void PairGayBerneGPU::settings(int narg, char **arg)
-{
-  // strip off GPU keyword/value and send remaining args to parent
-
-  if (narg < 2) error->all("Illegal pair_style command");
-  
-  // set multi_gpu_mode to one/node for multiple gpus on 1 node
-  // -- param is starting gpu id
-  // set multi_gpu_mode to one/gpu to select the same gpu id on every node
-  // -- param is id of gpu
-  // set multi_gpu_mode to multi/gpu to get ma
-  // -- param is number of gpus per node
-
-  if (strcmp(arg[0],"one/node") == 0)
-    multi_gpu_mode = ONE_NODE;
-  else if (strcmp(arg[0],"one/gpu") == 0)
-    multi_gpu_mode = ONE_GPU;
-  else if (strcmp(arg[0],"multi/gpu") == 0)
-    multi_gpu_mode = MULTI_GPU;
-  else error->all("Illegal pair_style command");
-
-  multi_gpu_param = atoi(arg[1]);
-
-  if (multi_gpu_mode == MULTI_GPU && multi_gpu_param < 1)
-    error->all("Illegal pair_style command");
-
-  PairGayBerne::settings(narg-2,&arg[2]);
 }

 /* ----------------------------------------------------------------------
@ -326,8 +129,6 @@ void PairGayBerneGPU::settings(int narg, char **arg)

 void PairGayBerneGPU::init_style()
 {
-  if (comm->me == 0)
-    output_time=true;
  if (force->pair_match("gpu",0) == NULL)
    error->all("Cannot use pair hybrid with multiple GPU pair styles");
  if (!atom->quat_flag || !atom->torque_flag || !atom->avec->shape_type)
@ -335,24 +136,7 @@ void PairGayBerneGPU::init_style()
  if (atom->radius_flag)
    error->all("Pair gayberne cannot be used with atom attribute diameter");

-  // set the GPU ID
-
-  int my_gpu=comm->me+multi_gpu_param;
-  int ngpus=universe->nprocs;
-  if (multi_gpu_mode==ONE_GPU) {
-    my_gpu=multi_gpu_param;
-    ngpus=1;
-  } else if (multi_gpu_mode==MULTI_GPU) {
-    ngpus=multi_gpu_param;
-    my_gpu=comm->me%ngpus;
-    if (ngpus>universe->nprocs)
-      ngpus=universe->nprocs;
-  }
-
-  int irequest = neighbor->request(this);
-
  // per-type shape precalculations
-
  for (int i = 1; i <= atom->ntypes; i++) {
    if (setwell[i]) {
      double *one = atom->shape[i];
@ -364,72 +148,38 @@ void PairGayBerneGPU::init_style()
  }

  // Repeat cutsq calculation because done after call to init_style
+  double maxcut = -1.0;
  double cut;
-  for (int i = 1; i <= atom->ntypes; i++)
+  for (int i = 1; i <= atom->ntypes; i++) {
    for (int j = i; j <= atom->ntypes; j++) {
-      cut = init_one(i,j);
-      cutsq[i][j] = cutsq[j][i] = cut*cut;
+      if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
+        cut = init_one(i,j);
+        cut *= cut;
+        if (cut > maxcut)
+          maxcut = cut;
+        cutsq[i][j] = cutsq[j][i] = cut;
+      } else
+        cutsq[i][j] = cutsq[j][i] = 0.0;
    }
+  }

-  // If compiled with OpenMP and only 1 proc, try to use multiple GPUs w/threads
-  #ifdef GB_GPU_OMP
-  if (multi_gpu_mode!=ONE_GPU)
-    nthreads=ngpus=gb_gpu_num_devices();
-  else
-    nthreads=ngpus=1;
-  if (nthreads>MAX_GPU_THREADS)
-    nthreads=MAX_GPU_THREADS;
-  omp_set_num_threads(nthreads);
-  #endif
-    
-  #pragma omp parallel firstprivate(my_gpu)
-  {
-    #ifdef GB_GPU_OMP
-    int my_thread = omp_get_thread_num();
-    if (multi_gpu_mode!=ONE_GPU)
-      my_gpu=my_thread;
-    if (multi_gpu_mode==ONE_NODE)
-      my_gpu+=multi_gpu_param;
-    #endif
-    
-    bool init_ok=gb_gpu_init(ij_size, atom->ntypes+1, gamma, upsilon, mu, 
+  double cell_size = sqrt(maxcut) + neighbor->skin;
+
+  bool init_ok = gb_gpu_init(atom->ntypes+1, gamma, upsilon, mu, 
                             shape, well, cutsq, sigma, epsilon, lshape, form,
                             lj1, lj2, lj3, lj4, offset, force->special_lj, 
                             atom->nlocal, atom->nlocal+atom->nghost, 300, 
-                             my_thread, my_gpu);
-    if (!init_ok)
-      error->one("At least 1 proc could not allocate a CUDA gpu or memory");
-    
-    if (ij_new[my_thread]!=NULL)
-      delete [] ij_new[my_thread];
-    ij_new[my_thread]=new int[ij_size];
-  }
-  
-  last_neighbor = -1;
-  neighbor->requests[irequest]->half = 0;
-  neighbor->requests[irequest]->full = 1;
+                             cell_size, gpu_mode, screen);
+  if (!init_ok)
+    error->one("Insufficient memory on accelerator (or no fix gpu).");
+
  if (force->newton_pair) 
-    error->all("Cannot use newton pair with GPU GayBerne pair style");
+    error->all("Cannot use newton pair with GPU Gay-Berne pair style");

-  if (comm->me == 0 && screen) {
-    printf("\n-------------------------------------");
-    printf("-------------------------------------\n");
-    printf("- Using GPGPU acceleration for Gay-Berne:\n");
-    printf("-------------------------------------");
-    printf("-------------------------------------\n");
-
-    for (int i=0; i<ngpus; i++) {
-      int gpui=my_gpu;
-      if (multi_gpu_mode==ONE_NODE)
-        gpui=i+multi_gpu_param;
-      else if (multi_gpu_mode==MULTI_GPU)
-        gpui=i;
-      char gpu_string[500];
-      gb_gpu_name(gpui,neighbor->oneatom,gpu_string);
-      printf("GPU %d: %s\n",gpui,gpu_string);         
-    }
-    printf("-------------------------------------");
-    printf("-------------------------------------\n\n");
+  if (gpu_mode != GPU_NEIGH) {
+    int irequest = neighbor->request(this);
+    neighbor->requests[irequest]->half = 0;
+    neighbor->requests[irequest]->full = 1;
  }
 }

@ -437,6 +187,275 @@ void PairGayBerneGPU::init_style()

 double PairGayBerneGPU::memory_usage()
 {
-  double bytes=Pair::memory_usage()+nthreads*ij_size*sizeof(int);
-  return bytes+gb_gpu_bytes();
+  double bytes = Pair::memory_usage();
+  return bytes + gb_gpu_bytes();
 }
+
+/* ----------------------------------------------------------------------
+   host-side Gay-Berne loop over the neighbor list held in "list"
+   processes atoms olist[start] ... olist[inum-1]
+   each pair adds force and torque to atom i only and is tallied through
+   ev_tally_xyz_full()
+   vflag is not read here; tallying is controlled by evflag
+------------------------------------------------------------------------- */
+
+void PairGayBerneGPU::cpu_compute(int start, int eflag, int vflag)
+{
+  int i,j,ii,jj,inum,jnum,itype,jtype;
+  double evdwl,one_eng,rsq,r2inv,r6inv,forcelj,factor_lj;
+  double fforce[3],ttor[3],rtor[3],r12[3];
+  double a1[3][3],b1[3][3],g1[3][3],a2[3][3],b2[3][3],g2[3][3],temp[3][3];
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  // evdwl is passed to the tally call whenever evflag is set, including
+  // the case where eflag is 0 and it is never assigned below, so it must
+  // start out with a defined value
+
+  evdwl = 0.0;
+
+  double **x = atom->x;
+  double **f = atom->f;
+  double **quat = atom->quat;
+  double **tor = atom->torque;
+  int *type = atom->type;
+  int nlocal = atom->nlocal;
+  int nall = nlocal + atom->nghost;
+  double *special_lj = force->special_lj;
+
+  // the atom order comes from the olist member, not from list->ilist
+
+  inum = list->inum;
+  ilist = olist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+
+  for (ii = start; ii < inum; ii++) {
+    i = ilist[ii];
+    itype = type[i];
+
+    // orientation-dependent matrices of atom i, needed only for ellipsoids
+
+    if (form[itype][itype] == ELLIPSE_ELLIPSE) {
+      MathExtra::quat_to_mat_trans(quat[i],a1);
+      MathExtra::diag_times3(well[itype],a1,temp);
+      MathExtra::transpose_times3(a1,temp,b1);
+      MathExtra::diag_times3(shape[itype],a1,temp);
+      MathExtra::transpose_times3(a1,temp,g1);
+    }
+
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+
+      // indices >= nall encode a special_lj slot in the quotient
+
+      if (j < nall) factor_lj = 1.0;
+      else {
+        factor_lj = special_lj[j/nall];
+        j %= nall;
+      }
+
+      // r12 = center to center vector
+
+      r12[0] = x[j][0]-x[i][0];
+      r12[1] = x[j][1]-x[i][1];
+      r12[2] = x[j][2]-x[i][2];
+      rsq = MathExtra::dot3(r12,r12);
+      jtype = type[j];
+
+      // compute if less than cutoff
+
+      if (rsq < cutsq[itype][jtype]) {
+
+        switch (form[itype][jtype]) {
+        case SPHERE_SPHERE:
+          r2inv = 1.0/rsq;
+          r6inv = r2inv*r2inv*r2inv;
+          forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]);
+          forcelj *= -r2inv;
+          if (eflag) one_eng =
+            r6inv*(r6inv*lj3[itype][jtype]-lj4[itype][jtype]) -
+            offset[itype][jtype];
+          fforce[0] = r12[0]*forcelj;
+          fforce[1] = r12[1]*forcelj;
+          fforce[2] = r12[2]*forcelj;
+          ttor[0] = ttor[1] = ttor[2] = 0.0;
+          rtor[0] = rtor[1] = rtor[2] = 0.0;
+          break;
+
+        case SPHERE_ELLIPSE:
+          MathExtra::quat_to_mat_trans(quat[j],a2);
+          MathExtra::diag_times3(well[jtype],a2,temp);
+          MathExtra::transpose_times3(a2,temp,b2);
+          MathExtra::diag_times3(shape[jtype],a2,temp);
+          MathExtra::transpose_times3(a2,temp,g2);
+          one_eng = gayberne_lj(j,i,a2,b2,g2,r12,rsq,fforce,rtor);
+          ttor[0] = ttor[1] = ttor[2] = 0.0;
+          break;
+
+        case ELLIPSE_SPHERE:
+          one_eng = gayberne_lj(i,j,a1,b1,g1,r12,rsq,fforce,ttor);
+          rtor[0] = rtor[1] = rtor[2] = 0.0;
+          break;
+
+        default:
+          MathExtra::quat_to_mat_trans(quat[j],a2);
+          MathExtra::diag_times3(well[jtype],a2,temp);
+          MathExtra::transpose_times3(a2,temp,b2);
+          MathExtra::diag_times3(shape[jtype],a2,temp);
+          MathExtra::transpose_times3(a2,temp,g2);
+          one_eng = gayberne_analytic(i,j,a1,a2,b1,b2,g1,g2,r12,rsq,
+                                      fforce,ttor,rtor);
+          break;
+        }
+
+        fforce[0] *= factor_lj;
+        fforce[1] *= factor_lj;
+        fforce[2] *= factor_lj;
+        ttor[0] *= factor_lj;
+        ttor[1] *= factor_lj;
+        ttor[2] *= factor_lj;
+
+        // only atom i is updated; rtor is computed but not applied here
+
+        f[i][0] += fforce[0];
+        f[i][1] += fforce[1];
+        f[i][2] += fforce[2];
+        tor[i][0] += ttor[0];
+        tor[i][1] += ttor[1];
+        tor[i][2] += ttor[2];
+
+        if (eflag) evdwl = factor_lj*one_eng;
+
+        if (evflag) ev_tally_xyz_full(i,evdwl,0.0,fforce[0],fforce[1],fforce[2],
+                                      -r12[0],-r12[1],-r12[2]);
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   host-side Gay-Berne loop over a neighbor array supplied by the caller
+   processes atoms start ... nlocal-1
+   nbors layout as read here, with stride = nlocal-start:
+     nbors[i-start]                  number of neighbors of atom i
+     nbors[i-start + (k+1)*stride]   k-th neighbor of atom i (k = 0,1,...)
+   neighbors j < start get an i-only tally (ev_tally_xyz_full); for
+   j >= start the pair is tallied with ev_tally_xyz() and, when j is a
+   local atom, the opposite force and the rtor torque are applied to j
+   vflag is not read here; tallying is controlled by evflag
+------------------------------------------------------------------------- */
+
+void PairGayBerneGPU::cpu_compute(int *nbors, int start, int eflag, int vflag)
+{
+  int i,j,itype,jtype;
+  double evdwl,one_eng,rsq,r2inv,r6inv,forcelj,factor_lj;
+  double fforce[3],ttor[3],rtor[3],r12[3];
+  double a1[3][3],b1[3][3],g1[3][3],a2[3][3],b2[3][3],g2[3][3],temp[3][3];
+
+  // evdwl is passed to the tally calls whenever evflag is set, including
+  // the case where eflag is 0 and it is never assigned below, so it must
+  // start out with a defined value
+
+  evdwl = 0.0;
+
+  double **x = atom->x;
+  double **f = atom->f;
+  double **quat = atom->quat;
+  double **tor = atom->torque;
+  int *type = atom->type;
+  int nlocal = atom->nlocal;
+  int nall = nlocal + atom->nghost;
+  int stride = nlocal-start;
+  double *special_lj = force->special_lj;
+
+  // loop over neighbors of my atoms
+
+  for (i = start; i < nlocal; i++) {
+    itype = type[i];
+
+    // orientation-dependent matrices of atom i, needed only for ellipsoids
+
+    if (form[itype][itype] == ELLIPSE_ELLIPSE) {
+      MathExtra::quat_to_mat_trans(quat[i],a1);
+      MathExtra::diag_times3(well[itype],a1,temp);
+      MathExtra::transpose_times3(a1,temp,b1);
+      MathExtra::diag_times3(shape[itype],a1,temp);
+      MathExtra::transpose_times3(a1,temp,g1);
+    }
+
+    // first entry for this atom is the neighbor count; the neighbors
+    // themselves follow at intervals of stride
+
+    int *nbor = nbors+i-start;
+    int jnum = *nbor;
+    nbor += stride;
+    int *nbor_end = nbor + stride * jnum;
+
+    for ( ; nbor < nbor_end; nbor += stride) {
+      j = *nbor;
+
+      // indices >= nall encode a special_lj slot in the quotient
+
+      if (j < nall) factor_lj = 1.0;
+      else {
+        factor_lj = special_lj[j/nall];
+        j %= nall;
+      }
+
+      // r12 = center to center vector
+
+      r12[0] = x[j][0]-x[i][0];
+      r12[1] = x[j][1]-x[i][1];
+      r12[2] = x[j][2]-x[i][2];
+      rsq = MathExtra::dot3(r12,r12);
+      jtype = type[j];
+
+      // compute if less than cutoff
+
+      if (rsq < cutsq[itype][jtype]) {
+
+        switch (form[itype][jtype]) {
+        case SPHERE_SPHERE:
+          r2inv = 1.0/rsq;
+          r6inv = r2inv*r2inv*r2inv;
+          forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]);
+          forcelj *= -r2inv;
+          if (eflag) one_eng =
+            r6inv*(r6inv*lj3[itype][jtype]-lj4[itype][jtype]) -
+            offset[itype][jtype];
+          fforce[0] = r12[0]*forcelj;
+          fforce[1] = r12[1]*forcelj;
+          fforce[2] = r12[2]*forcelj;
+          ttor[0] = ttor[1] = ttor[2] = 0.0;
+          rtor[0] = rtor[1] = rtor[2] = 0.0;
+          break;
+
+        case SPHERE_ELLIPSE:
+          MathExtra::quat_to_mat_trans(quat[j],a2);
+          MathExtra::diag_times3(well[jtype],a2,temp);
+          MathExtra::transpose_times3(a2,temp,b2);
+          MathExtra::diag_times3(shape[jtype],a2,temp);
+          MathExtra::transpose_times3(a2,temp,g2);
+          one_eng = gayberne_lj(j,i,a2,b2,g2,r12,rsq,fforce,rtor);
+          ttor[0] = ttor[1] = ttor[2] = 0.0;
+          break;
+
+        case ELLIPSE_SPHERE:
+          one_eng = gayberne_lj(i,j,a1,b1,g1,r12,rsq,fforce,ttor);
+          rtor[0] = rtor[1] = rtor[2] = 0.0;
+          break;
+
+        default:
+          MathExtra::quat_to_mat_trans(quat[j],a2);
+          MathExtra::diag_times3(well[jtype],a2,temp);
+          MathExtra::transpose_times3(a2,temp,b2);
+          MathExtra::diag_times3(shape[jtype],a2,temp);
+          MathExtra::transpose_times3(a2,temp,g2);
+          one_eng = gayberne_analytic(i,j,a1,a2,b1,b2,g1,g2,r12,rsq,
+                                      fforce,ttor,rtor);
+          break;
+        }
+
+        fforce[0] *= factor_lj;
+        fforce[1] *= factor_lj;
+        fforce[2] *= factor_lj;
+        ttor[0] *= factor_lj;
+        ttor[1] *= factor_lj;
+        ttor[2] *= factor_lj;
+
+        f[i][0] += fforce[0];
+        f[i][1] += fforce[1];
+        f[i][2] += fforce[2];
+        tor[i][0] += ttor[0];
+        tor[i][1] += ttor[1];
+        tor[i][2] += ttor[2];
+
+        if (eflag) evdwl = factor_lj*one_eng;
+
+        if (j<start) {
+          // atoms below start are not looped over here: tally for i only
+          if (evflag) ev_tally_xyz_full(i,evdwl,0.0,fforce[0],fforce[1],
+                                        fforce[2],-r12[0],-r12[1],-r12[2]);
+        } else {
+          if (j < nlocal) {
+            rtor[0] *= factor_lj;
+            rtor[1] *= factor_lj;
+            rtor[2] *= factor_lj;
+            f[j][0] -= fforce[0];
+            f[j][1] -= fforce[1];
+            f[j][2] -= fforce[2];
+            tor[j][0] += rtor[0];
+            tor[j][1] += rtor[1];
+            tor[j][2] += rtor[2];
+          }
+          if (evflag) ev_tally_xyz(i,j,nlocal,0,
+                                   evdwl,0.0,fforce[0],fforce[1],fforce[2],
+                                   -r12[0],-r12[1],-r12[2]);
+        }
+      }
+    }
+  }
+}
+
+
--- a/src/GPU/pair_gayberne_gpu.h
+++ b/src/GPU/pair_gayberne_gpu.h
@ -29,21 +29,19 @@ class PairGayBerneGPU : public PairGayBerne {
 public:
  PairGayBerneGPU(LAMMPS *lmp);
  ~PairGayBerneGPU();
+  void cpu_compute(int, int, int);
+  void cpu_compute(int *, int, int, int);
  void compute(int, int);
-  void settings(int, char **);
  void init_style();
  double memory_usage();
- 
-  enum { ONE_NODE, ONE_GPU, MULTI_GPU };

- private:  
-  int ij_size;
-  int *ij_new[MAX_GPU_THREADS], *olist[MAX_GPU_THREADS];
- 
-  int my_thread, nthreads, thread_inum[MAX_GPU_THREADS], omp_chunk;
- 
-  int last_neighbor, multi_gpu_mode, multi_gpu_param;
-  bool output_time;
+  enum { GPU_PAIR, GPU_NEIGH };
+
+ private:
+  int *olist;
+  int gpu_mode;
+  double cpu_time;
+  int *gpulist;
 };

 }
--- a/src/GPU/pair_lj96_cut_gpu.cpp
+++ b/src/GPU/pair_lj96_cut_gpu.cpp
@ -0,0 +1,318 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+   
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+   
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Mike Brown (SNL)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "stdio.h"
+#include "stdlib.h"
+#include "pair_lj96_cut_gpu.h"
+#include "atom.h"
+#include "atom_vec.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "integrate.h"
+#include "memory.h"
+#include "error.h"
+#include "neigh_request.h"
+#include "universe.h"
+#include "update.h"
+#include "domain.h"
+#include "string.h"
+
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+/* External functions from cuda library for atom decomposition.
+   They are defined outside this file; the notes below describe only how
+   this file uses them:
+     lj96_gpu_init       called from init_style(); gpu_mode is passed by
+                         reference; a false return is treated as fatal
+     lj96_gpu_clear      called from the destructor
+     lj96_gpu_compute_n  called from compute() when gpu_mode == GPU_NEIGH;
+                         the returned pointer is stored in gpulist and
+                         handed to cpu_compute(int *, int, int, int)
+     lj96_gpu_compute    called from compute() otherwise, with the ilist,
+                         numneigh and firstneigh arrays of "list"
+     lj96_gpu_bytes      added to Pair::memory_usage() in memory_usage()
+   Both compute routines receive host_start and success by reference;
+   compute() raises an error when success is false and runs cpu_compute()
+   beginning at host_start when host_start < inum. */
+
+bool lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
+                   double **host_lj2, double **host_lj3, double **host_lj4, 
+                   double **offset, double *special_lj, const int nlocal, 
+                   const int nall, const int max_nbors, const int maxspecial,
+                   const double cell_size, int &gpu_mode, FILE *screen);
+void lj96_gpu_clear();
+int * lj96_gpu_compute_n(const int timestep, const int ago, const int inum,
+	 	         const int nall, double **host_x, int *host_type, 
+                         double *boxlo, double *boxhi, int *tag, int **nspecial,
+                         int **special, const bool eflag, const bool vflag,
+                         const bool eatom, const bool vatom, int &host_start,
+                         const double cpu_time, bool &success);
+void lj96_gpu_compute(const int timestep, const int ago, const int inum,
+	 	      const int nall, double **host_x, int *host_type,
+                      int *ilist, int *numj, int **firstneigh,
+		      const bool eflag, const bool vflag, const bool eatom,
+                      const bool vatom, int &host_start, const double cpu_time,
+                      bool &success);
+double lj96_gpu_bytes();
+
+using namespace LAMMPS_NS;
+
+/* ----------------------------------------------------------------------
+   constructor
+   starts in GPU_PAIR mode; init_style() later passes gpu_mode by
+   reference to lj96_gpu_init()
+------------------------------------------------------------------------- */
+
+PairLJ96CutGPU::PairLJ96CutGPU(LAMMPS *lmp) :
+  PairLJ96Cut(lmp), gpu_mode(GPU_PAIR), cpu_time(0.0), gpulist(NULL)
+{
+  // gpulist is only assigned in compute() when gpu_mode == GPU_NEIGH;
+  // initializing it above keeps the pointer from being indeterminate
+  // in GPU_PAIR mode
+
+  respa_enable = 0;
+}
+
+/* ----------------------------------------------------------------------
+   destructor
+   the only cleanup done here is the call to lj96_gpu_clear(), an external
+   routine declared at the top of this file; what it releases is not
+   visible from here
+------------------------------------------------------------------------- */
+
+PairLJ96CutGPU::~PairLJ96CutGPU()
+{
+  lj96_gpu_clear();
+}
+
+/* ----------------------------------------------------------------------
+   force computation entry point
+   calls the GPU library routine that matches gpu_mode, then evaluates on
+   the host every atom from host_start up to inum, timing that host part
+   into cpu_time for the next call
+------------------------------------------------------------------------- */
+
+void PairLJ96CutGPU::compute(int eflag, int vflag)
+{
+  if (!(eflag || vflag)) evflag = vflag_fdotr = 0;
+  else ev_setup(eflag,vflag);
+
+  const bool gpu_neigh = (gpu_mode == GPU_NEIGH);
+  const int nall = atom->nlocal + atom->nghost;
+
+  int host_start;
+  int inum;
+  bool success = true;
+
+  if (!gpu_neigh) {
+    // neighbor list built by LAMMPS
+    inum = list->inum;
+    lj96_gpu_compute(update->ntimestep, neighbor->ago, inum, nall, atom->x,
+                     atom->type, list->ilist, list->numneigh, list->firstneigh,
+                     eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time,
+                     success);
+  } else {
+    // the library call returns the neighbor array used by the host loop
+    inum = atom->nlocal;
+    gpulist = lj96_gpu_compute_n(update->ntimestep, neighbor->ago, inum, nall,
+                                 atom->x, atom->type, domain->sublo,
+                                 domain->subhi, atom->tag, atom->nspecial,
+                                 atom->special, eflag, vflag, eflag_atom,
+                                 vflag_atom, host_start, cpu_time, success);
+  }
+
+  if (!success)
+    error->one("Out of memory on GPGPU");
+
+  // nothing left for the host
+
+  if (host_start >= inum) return;
+
+  const double host_begin = MPI_Wtime();
+  if (gpu_neigh)
+    cpu_compute(gpulist, host_start, eflag, vflag);
+  else
+    cpu_compute(host_start, eflag, vflag);
+  cpu_time = MPI_Wtime() - host_begin;
+}
+
+/* ----------------------------------------------------------------------
+   init specific to this pair style
+   fills cutsq, initializes the GPU library and, unless the library
+   selected GPU_NEIGH mode, requests a full neighbor list
+------------------------------------------------------------------------- */
+
+void PairLJ96CutGPU::init_style()
+{
+  cut_respa = NULL;
+
+  if (force->pair_match("gpu",0) == NULL)
+    error->all("Cannot use pair hybrid with multiple GPU pair styles");
+
+  // Repeat cutsq calculation because done after call to init_style
+  // largest squared cutoff seen so far; -1 until a pair has been set
+
+  double maxcutsq = -1.0;
+  for (int itype = 1; itype <= atom->ntypes; itype++) {
+    for (int jtype = itype; jtype <= atom->ntypes; jtype++) {
+      double pair_cutsq = 0.0;
+      if (setflag[itype][jtype] != 0 ||
+          (setflag[itype][itype] != 0 && setflag[jtype][jtype] != 0)) {
+        const double pair_cut = init_one(itype,jtype);
+        pair_cutsq = pair_cut*pair_cut;
+        if (pair_cutsq > maxcutsq)
+          maxcutsq = pair_cutsq;
+      }
+      cutsq[itype][jtype] = cutsq[jtype][itype] = pair_cutsq;
+    }
+  }
+  const double cell_size = sqrt(maxcutsq) + neighbor->skin;
+
+  // maxspecial is only taken from atom for molecular systems
+
+  const int maxspecial = atom->molecular ? atom->maxspecial : 0;
+
+  const bool init_ok = lj96_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4,
+                                     offset, force->special_lj, atom->nlocal,
+                                     atom->nlocal+atom->nghost, 300,
+                                     maxspecial, cell_size, gpu_mode, screen);
+  if (!init_ok)
+    error->one("Insufficient memory on accelerator (or no fix gpu).\n");
+
+  if (force->newton_pair)
+    error->all("Cannot use newton pair with GPU LJ96 pair style");
+
+  // in GPU_NEIGH mode no neighbor list is requested from LAMMPS
+
+  if (gpu_mode == GPU_NEIGH) return;
+
+  const int irequest = neighbor->request(this);
+  neighbor->requests[irequest]->half = 0;
+  neighbor->requests[irequest]->full = 1;
+}
+
+/* ----------------------------------------------------------------------
+   sum of the base-class estimate and the value reported by
+   lj96_gpu_bytes()
+------------------------------------------------------------------------- */
+
+double PairLJ96CutGPU::memory_usage()
+{
+  const double base_bytes = Pair::memory_usage();
+  const double gpu_lib_bytes = lj96_gpu_bytes();
+  return base_bytes + gpu_lib_bytes;
+}
+
+/* ----------------------------------------------------------------------
+   host-side lj96/cut loop over the neighbor list held in "list"
+   processes atoms ilist[start] ... ilist[inum-1]
+   each pair adds force to atom i only and is tallied through
+   ev_tally_full()
+   vflag is not read here; tallying is controlled by evflag
+------------------------------------------------------------------------- */
+
+void PairLJ96CutGPU::cpu_compute(int start, int eflag, int vflag) {
+  int i,j,ii,jj,inum,jnum,itype,jtype;
+  double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
+  double rsq,r2inv,r3inv,r6inv,forcelj,factor_lj;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  // ev_tally_full() receives evdwl whenever evflag is set, even when
+  // eflag is 0 and it is never assigned below, so it must start out with
+  // a defined value
+
+  evdwl = 0.0;
+
+  double **x = atom->x;
+  double **f = atom->f;
+  int *type = atom->type;
+  int nlocal = atom->nlocal;
+  int nall = nlocal + atom->nghost;
+  double *special_lj = force->special_lj;
+
+  inum = list->inum;
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+
+  for (ii = start; ii < inum; ii++) {
+    i = ilist[ii];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+
+      // indices >= nall encode a special_lj slot in the quotient
+
+      if (j < nall) factor_lj = 1.0;
+      else {
+        factor_lj = special_lj[j/nall];
+        j %= nall;
+      }
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq < cutsq[itype][jtype]) {
+        r2inv = 1.0/rsq;
+        r6inv = r2inv*r2inv*r2inv;
+        r3inv = sqrt(r6inv);
+        forcelj = r6inv * (lj1[itype][jtype]*r3inv - lj2[itype][jtype]);
+        fpair = factor_lj*forcelj*r2inv;
+
+        f[i][0] += delx*fpair;
+        f[i][1] += dely*fpair;
+        f[i][2] += delz*fpair;
+
+        if (eflag) {
+          evdwl = r6inv*(lj3[itype][jtype]*r3inv-lj4[itype][jtype]) -
+            offset[itype][jtype];
+          evdwl *= factor_lj;
+        }
+
+        if (evflag) ev_tally_full(i,evdwl,0.0,fpair,delx,dely,delz);
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   host-side lj96/cut loop over a neighbor array supplied by the caller
+   processes atoms start ... nlocal-1
+   nbors layout as read here, with stride = nlocal-start:
+     nbors[i-start]                  number of neighbors of atom i
+     nbors[i-start + (k+1)*stride]   k-th neighbor of atom i (k = 0,1,...)
+   neighbors j < start get an i-only tally (ev_tally_full); for j >= start
+   the pair is tallied with ev_tally() and, when j is a local atom, the
+   opposite force is applied to j
+   vflag is not read here; tallying is controlled by evflag
+------------------------------------------------------------------------- */
+
+void PairLJ96CutGPU::cpu_compute(int *nbors, int start, int eflag, int vflag) {
+  int i,j,itype,jtype;
+  int nlocal = atom->nlocal;
+  int nall = nlocal + atom->nghost;
+  int stride = nlocal-start;
+  double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
+  double rsq,r2inv,r3inv,r6inv,forcelj,factor_lj;
+  double *special_lj = force->special_lj;
+
+  // the tally calls receive evdwl whenever evflag is set, even when eflag
+  // is 0 and it is never assigned below, so it must start out with a
+  // defined value
+
+  evdwl = 0.0;
+
+  double **x = atom->x;
+  double **f = atom->f;
+  int *type = atom->type;
+
+  // loop over neighbors of my atoms
+
+  for (i = start; i < nlocal; i++) {
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+
+    // first entry for this atom is the neighbor count; the neighbors
+    // themselves follow at intervals of stride
+
+    int *nbor = nbors + i - start;
+    int jnum = *nbor;
+    nbor += stride;
+    int *nbor_end = nbor + stride * jnum;
+
+    for (; nbor<nbor_end; nbor+=stride) {
+      j = *nbor;
+
+      // indices >= nall encode a special_lj slot in the quotient
+
+      if (j < nall) factor_lj = 1.0;
+      else {
+        factor_lj = special_lj[j/nall];
+        j %= nall;
+      }
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq < cutsq[itype][jtype]) {
+        r2inv = 1.0/rsq;
+        r6inv = r2inv*r2inv*r2inv;
+        r3inv = sqrt(r6inv);
+        forcelj = r6inv * (lj1[itype][jtype]*r3inv - lj2[itype][jtype]);
+        fpair = factor_lj*forcelj*r2inv;
+
+        f[i][0] += delx*fpair;
+        f[i][1] += dely*fpair;
+        f[i][2] += delz*fpair;
+
+        if (eflag) {
+          evdwl = r6inv*(lj3[itype][jtype]*r3inv-lj4[itype][jtype]) -
+            offset[itype][jtype];
+          evdwl *= factor_lj;
+        }
+
+        if (j<start) {
+          // atoms below start are not looped over here: tally for i only
+          if (evflag) ev_tally_full(i,evdwl,0.0,fpair,delx,dely,delz);
+        } else {
+          if (j<nlocal) {
+            f[j][0] -= delx*fpair;
+            f[j][1] -= dely*fpair;
+            f[j][2] -= delz*fpair;
+          }
+          if (evflag) ev_tally(i,j,nlocal,0,
+                               evdwl,0.0,fpair,delx,dely,delz);
+        }
+      }
+    }
+  }
+}
--- a/src/GPU/pair_lj96_cut_gpu.h
+++ b/src/GPU/pair_lj96_cut_gpu.h
@ -0,0 +1,48 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj96/cut/gpu,PairLJ96CutGPU)
+
+#else
+
+#ifndef LMP_PAIR_LJ_96_GPU_H
+#define LMP_PAIR_LJ_96_GPU_H
+
+#include "pair_lj96_cut.h"
+
+namespace LAMMPS_NS {
+
+class PairLJ96CutGPU : public PairLJ96Cut {  // lj96/cut/gpu pair style: PairLJ96Cut plus calls into the lj96_gpu_* library routines
+ public:
+  PairLJ96CutGPU(LAMMPS *lmp);
+  ~PairLJ96CutGPU();
+  void cpu_compute(int, int, int);         // (start, eflag, vflag): host loop over the neighbor list in "list"
+  void cpu_compute(int *, int, int, int);  // (nbors, start, eflag, vflag): host loop over a caller-supplied neighbor array
+  void compute(int, int);                  // (eflag, vflag): GPU library call followed by the host loop when needed
+  void init_style();
+  double memory_usage();
+
+ enum { GPU_PAIR, GPU_NEIGH };  // values held by gpu_mode
+
+ private:
+  int gpu_mode;     // GPU_PAIR in the constructor; passed by reference to lj96_gpu_init() in init_style()
+  double cpu_time;  // MPI_Wtime() duration of the last host loop in compute(); passed to the next GPU library call
+  int *gpulist;     // pointer returned by lj96_gpu_compute_n(); only assigned in GPU_NEIGH mode
+};
+
+}
+#endif
+#endif
+
--- a/src/GPU/pair_lj_cut_coul_cut_gpu.cpp
+++ b/src/GPU/pair_lj_cut_coul_cut_gpu.cpp
@ -0,0 +1,364 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+   
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+   
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Mike Brown (SNL)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "stdio.h"
+#include "stdlib.h"
+#include "pair_lj_cut_coul_cut_gpu.h"
+#include "atom.h"
+#include "atom_vec.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "integrate.h"
+#include "memory.h"
+#include "error.h"
+#include "neigh_request.h"
+#include "universe.h"
+#include "update.h"
+#include "domain.h"
+#include "string.h"
+
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+/* External functions from cuda library for atom decomposition.
+   They are defined outside this file; the notes below describe only how
+   this file uses them:
+     ljc_gpu_init       called from init_style(); gpu_mode is passed by
+                        reference; a false return is treated as fatal;
+                        also receives cut_ljsq, cut_coulsq,
+                        force->special_coul and force->qqrd2e
+     ljc_gpu_clear      called from the destructor
+     ljc_gpu_compute_n  called from compute() when gpu_mode == GPU_NEIGH;
+                        the returned pointer is stored in gpulist and
+                        handed to cpu_compute(int *, int, int, int)
+     ljc_gpu_compute    called from compute() otherwise, with the ilist,
+                        numneigh and firstneigh arrays of "list"
+     ljc_gpu_bytes      added to Pair::memory_usage() in memory_usage()
+   Both compute routines receive host_start and success by reference and
+   atom->q as host_q; compute() raises an error when success is false and
+   runs cpu_compute() beginning at host_start when host_start < inum. */
+
+bool ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
+                  double **host_lj2, double **host_lj3, double **host_lj4, 
+                  double **offset, double *special_lj, const int nlocal, 
+                  const int nall, const int max_nbors, const int maxspecial,
+                  const double cell_size, int &gpu_mode, FILE *screen,
+                  double **host_cut_ljsq, double **host_cut_coulsq,
+                  double *host_special_coul, const double qqrd2e);
+void ljc_gpu_clear();
+int * ljc_gpu_compute_n(const int timestep, const int ago, const int inum,
+	 	        const int nall, double **host_x, int *host_type, 
+                        double *boxlo, double *boxhi, int *tag, int **nspecial,
+                        int **special, const bool eflag, const bool vflag,
+                        const bool eatom, const bool vatom, int &host_start,
+                        const double cpu_time, bool &success, double *host_q);
+void ljc_gpu_compute(const int timestep, const int ago, const int inum,
+	 	     const int nall, double **host_x, int *host_type,
+                     int *ilist, int *numj, int **firstneigh,
+		     const bool eflag, const bool vflag, const bool eatom,
+                     const bool vatom, int &host_start, const double cpu_time,
+                     bool &success, double *host_q);
+double ljc_gpu_bytes();
+
+using namespace LAMMPS_NS;
+
+/* ----------------------------------------------------------------------
+   constructor
+   starts in GPU_PAIR mode; init_style() later passes gpu_mode by
+   reference to ljc_gpu_init()
+------------------------------------------------------------------------- */
+
+PairLJCutCoulCutGPU::PairLJCutCoulCutGPU(LAMMPS *lmp) :
+  PairLJCutCoulCut(lmp), gpu_mode(GPU_PAIR), cpu_time(0.0), gpulist(NULL)
+{
+  // gpulist is only assigned in compute() when gpu_mode == GPU_NEIGH;
+  // initializing it above keeps the pointer from being indeterminate
+  // in GPU_PAIR mode
+
+  respa_enable = 0;
+}
+
+/* ----------------------------------------------------------------------
+   destructor
+   the only cleanup done here is the call to ljc_gpu_clear(), an external
+   routine declared at the top of this file; what it releases is not
+   visible from here
+------------------------------------------------------------------------- */
+
+PairLJCutCoulCutGPU::~PairLJCutCoulCutGPU()
+{
+  ljc_gpu_clear();
+}
+
+/* ----------------------------------------------------------------------
+   force computation entry point
+   calls the GPU library routine that matches gpu_mode, then evaluates on
+   the host every atom from host_start up to inum, timing that host part
+   into cpu_time for the next call
+------------------------------------------------------------------------- */
+
+void PairLJCutCoulCutGPU::compute(int eflag, int vflag)
+{
+  if (!(eflag || vflag)) evflag = vflag_fdotr = 0;
+  else ev_setup(eflag,vflag);
+
+  const bool gpu_neigh = (gpu_mode == GPU_NEIGH);
+  const int nall = atom->nlocal + atom->nghost;
+
+  int host_start;
+  int inum;
+  bool success = true;
+
+  if (!gpu_neigh) {
+    // neighbor list built by LAMMPS
+    inum = list->inum;
+    ljc_gpu_compute(update->ntimestep, neighbor->ago, inum, nall, atom->x,
+                    atom->type, list->ilist, list->numneigh, list->firstneigh,
+                    eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time,
+                    success, atom->q);
+  } else {
+    // the library call returns the neighbor array used by the host loop
+    inum = atom->nlocal;
+    gpulist = ljc_gpu_compute_n(update->ntimestep, neighbor->ago, inum, nall,
+                                atom->x, atom->type, domain->sublo,
+                                domain->subhi, atom->tag, atom->nspecial,
+                                atom->special, eflag, vflag, eflag_atom,
+                                vflag_atom, host_start, cpu_time, success,
+                                atom->q);
+  }
+
+  if (!success)
+    error->one("Out of memory on GPGPU");
+
+  // nothing left for the host
+
+  if (host_start >= inum) return;
+
+  const double host_begin = MPI_Wtime();
+  if (gpu_neigh)
+    cpu_compute(gpulist, host_start, eflag, vflag);
+  else
+    cpu_compute(host_start, eflag, vflag);
+  cpu_time = MPI_Wtime() - host_begin;
+}
+
+/* ----------------------------------------------------------------------
+   init specific to this pair style
+   checks for per-atom charge, fills cutsq, initializes the GPU library
+   and, unless the library selected GPU_NEIGH mode, requests a full
+   neighbor list
+------------------------------------------------------------------------- */
+
+void PairLJCutCoulCutGPU::init_style()
+{
+  if (!atom->q_flag)
+    error->all("Pair style lj/cut/coul/cut requires atom attribute q");
+  if (force->pair_match("gpu",0) == NULL)
+    error->all("Cannot use pair hybrid with multiple GPU pair styles");
+
+  // Repeat cutsq calculation because done after call to init_style
+  // largest squared cutoff seen so far; -1 until a pair has been set
+
+  double maxcutsq = -1.0;
+  for (int itype = 1; itype <= atom->ntypes; itype++) {
+    for (int jtype = itype; jtype <= atom->ntypes; jtype++) {
+      double pair_cutsq = 0.0;
+      if (setflag[itype][jtype] != 0 ||
+          (setflag[itype][itype] != 0 && setflag[jtype][jtype] != 0)) {
+        const double pair_cut = init_one(itype,jtype);
+        pair_cutsq = pair_cut*pair_cut;
+        if (pair_cutsq > maxcutsq)
+          maxcutsq = pair_cutsq;
+      }
+      cutsq[itype][jtype] = cutsq[jtype][itype] = pair_cutsq;
+    }
+  }
+  const double cell_size = sqrt(maxcutsq) + neighbor->skin;
+
+  // maxspecial is only taken from atom for molecular systems
+
+  const int maxspecial = atom->molecular ? atom->maxspecial : 0;
+
+  const bool init_ok = ljc_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4,
+                                    offset, force->special_lj, atom->nlocal,
+                                    atom->nlocal+atom->nghost, 300, maxspecial,
+                                    cell_size, gpu_mode, screen, cut_ljsq,
+                                    cut_coulsq, force->special_coul,
+                                    force->qqrd2e);
+  if (!init_ok)
+    error->one("Insufficient memory on accelerator (or no fix gpu).\n");
+
+  if (force->newton_pair)
+    error->all("Cannot use newton pair with GPU LJ pair style");
+
+  // in GPU_NEIGH mode no neighbor list is requested from LAMMPS
+
+  if (gpu_mode == GPU_NEIGH) return;
+
+  const int irequest = neighbor->request(this);
+  neighbor->requests[irequest]->half = 0;
+  neighbor->requests[irequest]->full = 1;
+}
+
+/* ----------------------------------------------------------------------
+   sum of the base-class estimate and the value reported by
+   ljc_gpu_bytes()
+------------------------------------------------------------------------- */
+
+double PairLJCutCoulCutGPU::memory_usage()
+{
+  const double base_bytes = Pair::memory_usage();
+  const double gpu_lib_bytes = ljc_gpu_bytes();
+  return base_bytes + gpu_lib_bytes;
+}
+
+/* ----------------------------------------------------------------------
+   host-side lj/cut/coul/cut loop over the neighbor list held in "list"
+   processes atoms ilist[start] ... ilist[inum-1]
+   each pair adds force to atom i only and is tallied through
+   ev_tally_full()
+   vflag is not read here; tallying is controlled by evflag
+------------------------------------------------------------------------- */
+
+void PairLJCutCoulCutGPU::cpu_compute(int start, int eflag, int vflag)
+{
+  int i,j,ii,jj,inum,jnum,itype,jtype;
+  double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair;
+  double rsq,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  // both energies stay 0.0 when eflag is off but are still passed to the
+  // tally call
+
+  evdwl = ecoul = 0.0;
+
+  double **x = atom->x;
+  double **f = atom->f;
+  double *q = atom->q;
+  int *type = atom->type;
+  int nlocal = atom->nlocal;
+  int nall = nlocal + atom->nghost;
+  double *special_coul = force->special_coul;
+  double *special_lj = force->special_lj;
+  double qqrd2e = force->qqrd2e;
+
+  inum = list->inum;
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+
+  for (ii = start; ii < inum; ii++) {
+    i = ilist[ii];
+    qtmp = q[i];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+
+      // indices >= nall encode a special_coul/special_lj slot in the quotient
+
+      if (j < nall) factor_coul = factor_lj = 1.0;
+      else {
+        factor_coul = special_coul[j/nall];
+        factor_lj = special_lj[j/nall];
+        j %= nall;
+      }
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq < cutsq[itype][jtype]) {
+        r2inv = 1.0/rsq;
+
+        if (rsq < cut_coulsq[itype][jtype])
+          forcecoul = qqrd2e * qtmp*q[j]*sqrt(r2inv);
+        else forcecoul = 0.0;
+
+        if (rsq < cut_ljsq[itype][jtype]) {
+          r6inv = r2inv*r2inv*r2inv;
+          forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]);
+        } else forcelj = 0.0;
+
+        fpair = (factor_coul*forcecoul + factor_lj*forcelj) * r2inv;
+
+        f[i][0] += delx*fpair;
+        f[i][1] += dely*fpair;
+        f[i][2] += delz*fpair;
+
+        if (eflag) {
+          if (rsq < cut_coulsq[itype][jtype])
+            ecoul = factor_coul * qqrd2e * qtmp*q[j]*sqrt(r2inv);
+          else ecoul = 0.0;
+          if (rsq < cut_ljsq[itype][jtype]) {
+            evdwl = r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype]) -
+              offset[itype][jtype];
+            evdwl *= factor_lj;
+          } else evdwl = 0.0;
+        }
+
+        if (evflag) ev_tally_full(i,evdwl,ecoul,fpair,delx,dely,delz);
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   host-side lj/cut/coul/cut loop over a caller-supplied neighbor array
+   processes atoms start ... nlocal-1
+   nbors layout as read here, with pitch = nlocal-start:
+     nbors[i-start]             number of neighbors of atom i
+     nbors[i-start + k*pitch]   k-th neighbor of atom i (k = 1,2,...)
+   neighbors j < start get an i-only tally (ev_tally_full); for j >= start
+   the pair is tallied with ev_tally() and, when j is a local atom, the
+   opposite force is applied to j
+   vflag is not read here; tallying is controlled by evflag
+------------------------------------------------------------------------- */
+
+void PairLJCutCoulCutGPU::cpu_compute(int *nbors, int start, int eflag,
+                                      int vflag)
+{
+  double **pos = atom->x;
+  double **frc = atom->f;
+  double *charge = atom->q;
+  int *species = atom->type;
+  const int nown = atom->nlocal;
+  const int ntotal = nown + atom->nghost;
+  const int pitch = nown - start;
+  double *scale_coul = force->special_coul;
+  double *scale_lj = force->special_lj;
+  const double coul_prefactor = force->qqrd2e;
+
+  // energies keep their 0.0 start value whenever eflag is off
+
+  double e_lj = 0.0;
+  double e_coul = 0.0;
+
+  for (int iatom = start; iatom < nown; iatom++) {
+    const double qi = charge[iatom];
+    const double xi = pos[iatom][0];
+    const double yi = pos[iatom][1];
+    const double zi = pos[iatom][2];
+    const int ti = species[iatom];
+
+    // entry 0 of this atom's column is the neighbor count
+
+    const int *column = nbors + iatom - start;
+    const int count = column[0];
+
+    for (int k = 1; k <= count; k++) {
+      int jatom = column[k*pitch];
+
+      // indices >= ntotal encode a special_coul/special_lj slot
+
+      double w_coul = 1.0;
+      double w_lj = 1.0;
+      if (!(jatom < ntotal)) {
+        w_coul = scale_coul[jatom/ntotal];
+        w_lj = scale_lj[jatom/ntotal];
+        jatom %= ntotal;
+      }
+
+      const double dx = xi - pos[jatom][0];
+      const double dy = yi - pos[jatom][1];
+      const double dz = zi - pos[jatom][2];
+      const double dist2 = dx*dx + dy*dy + dz*dz;
+      const int tj = species[jatom];
+
+      // outside the overall cutoff: nothing to do for this pair
+
+      if (!(dist2 < cutsq[ti][tj])) continue;
+
+      const double inv2 = 1.0/dist2;
+      const bool in_coul = dist2 < cut_coulsq[ti][tj];
+      const bool in_lj = dist2 < cut_ljsq[ti][tj];
+
+      double f_coul = 0.0;
+      if (in_coul)
+        f_coul = coul_prefactor * qi*charge[jatom]*sqrt(inv2);
+
+      double inv6 = 0.0;
+      double f_lj = 0.0;
+      if (in_lj) {
+        inv6 = inv2*inv2*inv2;
+        f_lj = inv6 * (lj1[ti][tj]*inv6 - lj2[ti][tj]);
+      }
+
+      const double fpair = (w_coul*f_coul + w_lj*f_lj) * inv2;
+
+      frc[iatom][0] += dx*fpair;
+      frc[iatom][1] += dy*fpair;
+      frc[iatom][2] += dz*fpair;
+
+      if (eflag) {
+        if (in_coul)
+          e_coul = w_coul * coul_prefactor * qi*charge[jatom]*sqrt(inv2);
+        else
+          e_coul = 0.0;
+
+        if (in_lj) {
+          e_lj = inv6*(lj3[ti][tj]*inv6-lj4[ti][tj]) -
+            offset[ti][tj];
+          e_lj *= w_lj;
+        } else {
+          e_lj = 0.0;
+        }
+      }
+
+      // atoms below start are not looped over here: tally for iatom only
+
+      if (jatom < start) {
+        if (evflag) ev_tally_full(iatom,e_lj,e_coul,fpair,dx,dy,dz);
+        continue;
+      }
+
+      if (jatom < nown) {
+        frc[jatom][0] -= dx*fpair;
+        frc[jatom][1] -= dy*fpair;
+        frc[jatom][2] -= dz*fpair;
+      }
+      if (evflag) ev_tally(iatom,jatom,nown,0,
+                           e_lj,e_coul,fpair,dx,dy,dz);
+    }
+  }
+}
--- a/src/GPU/pair_lj_cut_coul_cut_gpu.h
+++ b/src/GPU/pair_lj_cut_coul_cut_gpu.h
@ -0,0 +1,48 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/cut/coul/cut/gpu,PairLJCutCoulCutGPU)
+
+#else
+
+#ifndef LMP_PAIR_LJ_CUT_COUL_CUT_GPU_H
+#define LMP_PAIR_LJ_CUT_COUL_CUT_GPU_H
+
+#include "pair_lj_cut_coul_cut.h"
+
+namespace LAMMPS_NS {
+
+class PairLJCutCoulCutGPU : public PairLJCutCoulCut {  // registered above as lj/cut/coul/cut/gpu
+ public:
+  PairLJCutCoulCutGPU(LAMMPS *lmp);
+  ~PairLJCutCoulCutGPU();
+  void cpu_compute(int, int, int);         // presumably (start, eflag, vflag) - confirm in the .cpp
+  void cpu_compute(int *, int, int, int);  // presumably (nbors, start, eflag, vflag) - confirm in the .cpp
+  void compute(int, int);
+  void init_style();
+  double memory_usage();
+
+ enum { GPU_PAIR, GPU_NEIGH };  // candidate values for gpu_mode
+
+ private:
+  int gpu_mode;     // one of GPU_PAIR / GPU_NEIGH
+  double cpu_time;  // NOTE(review): looks like a host-side timing value - confirm in the .cpp
+  int *gpulist;     // NOTE(review): ownership is not visible from this header - confirm
+};
+
+}
+#endif
+#endif
+
--- a/src/GPU/pair_lj_cut_coul_long_gpu.cpp
+++ b/src/GPU/pair_lj_cut_coul_long_gpu.cpp
@ -0,0 +1,452 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+   
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+   
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Mike Brown (SNL)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "stdio.h"
+#include "stdlib.h"
+#include "pair_lj_cut_coul_long_gpu.h"
+#include "atom.h"
+#include "atom_vec.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "integrate.h"
+#include "memory.h"
+#include "error.h"
+#include "neigh_request.h"
+#include "universe.h"
+#include "update.h"
+#include "domain.h"
+#include "string.h"
+#include "kspace.h"
+
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+#define EWALD_F   1.12837917
+#define EWALD_P   0.3275911
+#define A1        0.254829592
+#define A2       -0.284496736
+#define A3        1.421413741
+#define A4       -1.453152027
+#define A5        1.061405429
+
+/* External functions from cuda library for atom decomposition.
+   How this file uses them:
+     ljcl_gpu_init      - called from init_style(); a false return is treated
+                          as a fatal error; gpu_mode is passed by reference
+                          and tested again after the call
+     ljcl_gpu_clear     - called from the destructor
+     ljcl_gpu_compute_n - called when gpu_mode == GPU_NEIGH; the returned
+                          int* is handed to cpu_compute(int *, ...)
+     ljcl_gpu_compute   - called otherwise, with the host neighbor list
+     ljcl_gpu_bytes     - added to the total in memory_usage()
+   host_start and success are reference arguments read by compute() after
+   the call: the host loop runs when host_start < inum, and success == false
+   aborts the run. */
+
+bool ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
+                  double **host_lj2, double **host_lj3, double **host_lj4, 
+                  double **offset, double *special_lj, const int nlocal, 
+                  const int nall, const int max_nbors, const int maxspecial,
+                  const double cell_size, int &gpu_mode, FILE *screen,
+                  double **host_cut_ljsq, double host_cut_coulsq,
+                  double *host_special_coul, const double qqrd2e,
+                  const double g_ewald);
+void ljcl_gpu_clear();
+int * ljcl_gpu_compute_n(const int timestep, const int ago, const int inum,
+	 	        const int nall, double **host_x, int *host_type, 
+                        double *boxlo, double *boxhi, int *tag, int **nspecial,
+                        int **special, const bool eflag, const bool vflag,
+                        const bool eatom, const bool vatom, int &host_start,
+                        const double cpu_time, bool &success, double *host_q);
+void ljcl_gpu_compute(const int timestep, const int ago, const int inum,
+	 	     const int nall, double **host_x, int *host_type,
+                     int *ilist, int *numj, int **firstneigh,
+		     const bool eflag, const bool vflag, const bool eatom,
+                     const bool vatom, int &host_start, const double cpu_time,
+                     bool &success, double *host_q);
+double ljcl_gpu_bytes();
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+// Construct the style in GPU_PAIR mode with no host time recorded yet.
+// gpulist is only assigned by compute() in GPU_NEIGH mode, so it starts
+// out as NULL rather than holding an indeterminate pointer.
+PairLJCutCoulLongGPU::PairLJCutCoulLongGPU(LAMMPS *lmp) :
+  PairLJCutCoulLong(lmp), gpu_mode(GPU_PAIR), cpu_time(0.0), gpulist(NULL)
+{
+  // clear the inherited respa_enable flag for this style
+  respa_enable = 0;
+}
+
+/* ----------------------------------------------------------------------
+   hand cleanup to the external GPU library through ljcl_gpu_clear();
+   nothing else is released in this destructor
+   (presumably this undoes what ljcl_gpu_init() set up - confirm in the
+   library)
+------------------------------------------------------------------------- */
+
+PairLJCutCoulLongGPU::~PairLJCutCoulLongGPU()
+{
+  ljcl_gpu_clear();
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Run one force evaluation: call the GPU library, then finish on the host
+// any atoms the library reports as left over (host_start < inum).
+//   eflag / vflag : energy / virial request flags forwarded to ev_setup(),
+//                   the GPU library and cpu_compute()
+void PairLJCutCoulLongGPU::compute(int eflag, int vflag)
+{
+  if (!eflag && !vflag) evflag = vflag_fdotr = 0;
+  else ev_setup(eflag,vflag);
+
+  const int ntotal = atom->nlocal + atom->nghost;
+  const bool gpu_neighbors = (gpu_mode == GPU_NEIGH);
+
+  int ncompute;      // number of atoms handed to the library
+  int first_host;    // written by the library: first atom left to the host
+  bool gpu_ok = true;
+
+  if (!gpu_neighbors) {
+    // library consumes the host-built neighbor list
+    ncompute = list->inum;
+    ljcl_gpu_compute(update->ntimestep, neighbor->ago, ncompute, ntotal,
+                     atom->x, atom->type, list->ilist, list->numneigh,
+                     list->firstneigh, eflag, vflag, eflag_atom, vflag_atom,
+                     first_host, cpu_time, gpu_ok, atom->q);
+  } else {
+    // library returns its own neighbor array, kept for the host loop
+    ncompute = atom->nlocal;
+    gpulist = ljcl_gpu_compute_n(update->ntimestep, neighbor->ago, ncompute,
+                                 ntotal, atom->x, atom->type, domain->sublo,
+                                 domain->subhi, atom->tag, atom->nspecial,
+                                 atom->special, eflag, vflag, eflag_atom,
+                                 vflag_atom, first_host, cpu_time, gpu_ok,
+                                 atom->q);
+  }
+
+  if (!gpu_ok) error->one("Out of memory on GPGPU");
+
+  // nothing left for the host: keep the previous cpu_time untouched
+  if (first_host >= ncompute) return;
+
+  // time the host share; the value is passed to the library on the next call
+  const double t_begin = MPI_Wtime();
+  if (gpu_neighbors) cpu_compute(gpulist, first_host, eflag, vflag);
+  else cpu_compute(first_host, eflag, vflag);
+  cpu_time = MPI_Wtime() - t_begin;
+}
+
+/* ----------------------------------------------------------------------
+   init specific to this pair style
+   - requires the per-atom charge attribute and a KSpace solver
+     (g_ewald is read from force->kspace)
+   - repeats the cutsq calculation and derives the cell size handed to
+     the GPU library from the largest cutoff plus the neighbor skin
+   - initializes the GPU library; gpu_mode is passed by reference
+   - rejects newton pair on
+   - requests a full neighbor list unless gpu_mode == GPU_NEIGH
+------------------------------------------------------------------------- */
+
+void PairLJCutCoulLongGPU::init_style()
+{
+  cut_respa = NULL;
+
+  // the message must name this style (coul/long), not the coul/cut sibling
+  if (!atom->q_flag)
+    error->all("Pair style lj/cut/coul/long requires atom attribute q");
+  if (force->pair_match("gpu",0) == NULL)
+    error->all("Cannot use pair hybrid with multiple GPU pair styles");
+
+  // Repeat cutsq calculation because done after call to init_style
+  // maxcut tracks the largest squared cutoff over all set type pairs
+  double maxcut = -1.0;
+  double cut;
+  for (int i = 1; i <= atom->ntypes; i++) {
+    for (int j = i; j <= atom->ntypes; j++) {
+      if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
+        cut = init_one(i,j);
+        cut *= cut;
+        if (cut > maxcut)
+          maxcut = cut;
+        cutsq[i][j] = cutsq[j][i] = cut;
+      } else
+        cutsq[i][j] = cutsq[j][i] = 0.0;
+    }
+  }
+  double cell_size = sqrt(maxcut) + neighbor->skin;
+
+  cut_coulsq = cut_coul * cut_coul;
+
+  // insure use of KSpace long-range solver, set g_ewald
+
+  if (force->kspace == NULL)
+    error->all("Pair style is incompatible with KSpace style");
+  g_ewald = force->kspace->g_ewald;
+
+  // setup force tables
+
+  if (ncoultablebits) init_tables();
+
+  // maxspecial is only taken from the atom class for molecular systems
+  int maxspecial=0;
+  if (atom->molecular)
+    maxspecial=atom->maxspecial;
+  bool init_ok = ljcl_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4,
+                              offset, force->special_lj, atom->nlocal,
+                              atom->nlocal+atom->nghost, 300, maxspecial,
+                              cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq,
+                              force->special_coul, force->qqrd2e, g_ewald);
+  if (!init_ok)
+    error->one("Insufficient memory on accelerator (or no fix gpu).\n"); 
+
+  if (force->newton_pair) 
+    error->all("Cannot use newton pair with GPU LJ pair style");
+
+  // a host neighbor list is only requested when compute() will use
+  // ljcl_gpu_compute() and cpu_compute(int, int, int)
+  if (gpu_mode != GPU_NEIGH) {
+    int irequest = neighbor->request(this);
+    neighbor->requests[irequest]->half = 0;
+    neighbor->requests[irequest]->full = 1;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Report memory use as the base Pair figure plus whatever the GPU library
+// reports through ljcl_gpu_bytes().
+double PairLJCutCoulLongGPU::memory_usage()
+{
+  const double host_bytes = Pair::memory_usage();
+  const double gpu_bytes = ljcl_gpu_bytes();
+  return host_bytes + gpu_bytes;
+}
+
+/* ----------------------------------------------------------------------
+   host-side pair loop over neighbor-list entries ii = start .. inum-1
+   (called by compute() when gpu_mode != GPU_NEIGH and host_start < inum)
+   start : first index into list->ilist handled here
+   eflag : nonzero to evaluate ecoul / evdwl for each pair
+   vflag : not read in this function; tallying is gated by evflag
+   only f[i] is updated for each pair, and each pair is tallied through
+   ev_tally_full() with atom i
+   the energy section reuses prefactor, erfc, itable, fraction and r6inv
+   computed in the force section, so the statement order matters
+------------------------------------------------------------------------- */
+
+void PairLJCutCoulLongGPU::cpu_compute(int start, int eflag, int vflag)
+{
+  int i,j,ii,jj,inum,jnum,itype,jtype,itable;
+  double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair;
+  double fraction,table;
+  double r,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj;
+  double grij,expm2,prefactor,t,erfc;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+  double rsq;
+
+  evdwl = ecoul = 0.0;  // defined values for the tally when eflag is 0
+
+  double **x = atom->x;
+  double **f = atom->f;
+  double *q = atom->q;
+  int *type = atom->type;
+  int nlocal = atom->nlocal;
+  int nall = nlocal + atom->nghost;
+  double *special_coul = force->special_coul;
+  double *special_lj = force->special_lj;
+  double qqrd2e = force->qqrd2e;
+
+  inum = list->inum;
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+  
+  // loop over neighbors of my atoms, beginning at list entry "start"
+
+  for (ii = start; ii < inum; ii++) {
+    i = ilist[ii];
+    qtmp = q[i];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+
+      if (j < nall) factor_coul = factor_lj = 1.0;  // plain neighbor: no scaling
+      else {
+	factor_coul = special_coul[j/nall];  // j/nall selects the special-bond factor
+	factor_lj = special_lj[j/nall];
+	j %= nall;  // recover the atom index
+      }
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq < cutsq[itype][jtype]) {
+	r2inv = 1.0/rsq;
+
+	if (rsq < cut_coulsq) {
+	  if (!ncoultablebits || rsq <= tabinnersq) {  // analytic branch
+	    r = sqrt(rsq);
+	    grij = g_ewald * r;
+	    expm2 = exp(-grij*grij);
+	    t = 1.0 / (1.0 + EWALD_P*grij);
+	    erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;  // polynomial in t (A1..A5) times expm2
+	    prefactor = qqrd2e * qtmp*q[j]/r;
+	    forcecoul = prefactor * (erfc + EWALD_F*grij*expm2);
+	    if (factor_coul < 1.0) forcecoul -= (1.0-factor_coul)*prefactor;
+	  } else {  // tabulated branch: index taken from the float bits of rsq
+	    union_int_float_t rsq_lookup;
+	    rsq_lookup.f = rsq;
+	    itable = rsq_lookup.i & ncoulmask;
+	    itable >>= ncoulshiftbits;
+	    fraction = (rsq_lookup.f - rtable[itable]) * drtable[itable];
+	    table = ftable[itable] + fraction*dftable[itable];
+	    forcecoul = qtmp*q[j] * table;
+	    if (factor_coul < 1.0) {
+	      table = ctable[itable] + fraction*dctable[itable];
+	      prefactor = qtmp*q[j] * table;  // only set here when factor_coul < 1.0
+	      forcecoul -= (1.0-factor_coul)*prefactor;
+	    }
+	  }
+	} else forcecoul = 0.0;
+
+	if (rsq < cut_ljsq[itype][jtype]) {
+	  r6inv = r2inv*r2inv*r2inv;
+	  forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]);
+	} else forcelj = 0.0;
+
+	fpair = (forcecoul + factor_lj*forcelj) * r2inv;  // factor_coul already applied to forcecoul
+
+	f[i][0] += delx*fpair;
+	f[i][1] += dely*fpair;
+	f[i][2] += delz*fpair;
+
+	if (eflag) {
+	  if (rsq < cut_coulsq) {
+	    if (!ncoultablebits || rsq <= tabinnersq)
+	      ecoul = prefactor*erfc;  // reuses values from the analytic force branch
+	    else {
+	      table = etable[itable] + fraction*detable[itable];
+	      ecoul = qtmp*q[j] * table;
+	    }
+	    if (factor_coul < 1.0) ecoul -= (1.0-factor_coul)*prefactor;  // prefactor was set above in this case
+	  } else ecoul = 0.0;
+
+	  if (rsq < cut_ljsq[itype][jtype]) {
+	    evdwl = r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype]) -
+	      offset[itype][jtype];
+	    evdwl *= factor_lj;
+	  } else evdwl = 0.0;
+	}
+
+	if (evflag) ev_tally_full(i,evdwl,ecoul,fpair,delx,dely,delz);
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   host-side pair loop for atoms i = start .. nlocal-1 using the neighbor
+   array returned by ljcl_gpu_compute_n() (GPU_NEIGH mode)
+   nbors : strided array with stride = nlocal-start; for atom i the entry
+           nbors[i-start] is its neighbor count and the neighbors follow
+           at steps of stride
+   start : first atom index handled here
+   eflag : nonzero to evaluate ecoul / evdwl for each pair
+   vflag : not read in this function; tallying is gated by evflag
+   pairs whose partner index is below start update f[i] only and use
+   ev_tally_full(); otherwise f[j] is also updated when j < nlocal and
+   the pair goes through ev_tally()
+   the energy section reuses prefactor, erfc, itable, fraction and r6inv
+   computed in the force section, so the statement order matters
+------------------------------------------------------------------------- */
+
+void PairLJCutCoulLongGPU::cpu_compute(int *nbors, int start, int eflag,
+                                      int vflag)
+{
+  int i,j,jnum,itype,jtype,itable;
+  double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair;
+  double fraction,table;
+  double r,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj;
+  double grij,expm2,prefactor,t,erfc;
+  double rsq;
+
+  evdwl = ecoul = 0.0;  // defined values for the tally when eflag is 0
+
+  double **x = atom->x;
+  double **f = atom->f;
+  double *q = atom->q;
+  int *type = atom->type;
+  int nlocal = atom->nlocal;
+  int nall = nlocal + atom->nghost;
+  int stride = nlocal-start;  // step between successive entries of one atom in nbors
+  double *special_coul = force->special_coul;
+  double *special_lj = force->special_lj;
+  double qqrd2e = force->qqrd2e;
+
+  // loop over neighbors of my atoms, beginning at atom index "start"
+
+  for (i = start; i < nlocal; i++) {
+    qtmp = q[i];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    int *nbor = nbors + i - start;  // first slot for atom i holds its neighbor count
+    jnum = *nbor;
+    nbor += stride;
+    int *nbor_end = nbor + stride * jnum;
+
+    for (; nbor<nbor_end; nbor+=stride) {
+      j = *nbor;
+
+      if (j < nall) factor_coul = factor_lj = 1.0;  // plain neighbor: no scaling
+      else {
+	factor_coul = special_coul[j/nall];  // j/nall selects the special-bond factor
+	factor_lj = special_lj[j/nall];
+	j %= nall;  // recover the atom index
+      }
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq < cutsq[itype][jtype]) {
+	r2inv = 1.0/rsq;
+
+	if (rsq < cut_coulsq) {
+	  if (!ncoultablebits || rsq <= tabinnersq) {  // analytic branch
+	    r = sqrt(rsq);
+	    grij = g_ewald * r;
+	    expm2 = exp(-grij*grij);
+	    t = 1.0 / (1.0 + EWALD_P*grij);
+	    erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;  // polynomial in t (A1..A5) times expm2
+	    prefactor = qqrd2e * qtmp*q[j]/r;
+	    forcecoul = prefactor * (erfc + EWALD_F*grij*expm2);
+	    if (factor_coul < 1.0) forcecoul -= (1.0-factor_coul)*prefactor;
+	  } else {  // tabulated branch: index taken from the float bits of rsq
+	    union_int_float_t rsq_lookup;
+	    rsq_lookup.f = rsq;
+	    itable = rsq_lookup.i & ncoulmask;
+	    itable >>= ncoulshiftbits;
+	    fraction = (rsq_lookup.f - rtable[itable]) * drtable[itable];
+	    table = ftable[itable] + fraction*dftable[itable];
+	    forcecoul = qtmp*q[j] * table;
+	    if (factor_coul < 1.0) {
+	      table = ctable[itable] + fraction*dctable[itable];
+	      prefactor = qtmp*q[j] * table;  // only set here when factor_coul < 1.0
+	      forcecoul -= (1.0-factor_coul)*prefactor;
+	    }
+	  }
+	} else forcecoul = 0.0;
+
+	if (rsq < cut_ljsq[itype][jtype]) {
+	  r6inv = r2inv*r2inv*r2inv;
+	  forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]);
+	} else forcelj = 0.0;
+
+	fpair = (forcecoul + factor_lj*forcelj) * r2inv;  // factor_coul already applied to forcecoul
+
+	f[i][0] += delx*fpair;
+	f[i][1] += dely*fpair;
+	f[i][2] += delz*fpair;
+
+	if (eflag) {
+	  if (rsq < cut_coulsq) {
+	    if (!ncoultablebits || rsq <= tabinnersq)
+	      ecoul = prefactor*erfc;  // reuses values from the analytic force branch
+	    else {
+	      table = etable[itable] + fraction*detable[itable];
+	      ecoul = qtmp*q[j] * table;
+	    }
+	    if (factor_coul < 1.0) ecoul -= (1.0-factor_coul)*prefactor;  // prefactor was set above in this case
+	  } else ecoul = 0.0;
+
+	  if (rsq < cut_ljsq[itype][jtype]) {
+	    evdwl = r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype]) -
+	      offset[itype][jtype];
+	    evdwl *= factor_lj;
+	  } else evdwl = 0.0;
+	}
+
+        if (j<start) {  // partner below start: only atom i is updated on the host
+  	  if (evflag) ev_tally_full(i,evdwl,ecoul,fpair,delx,dely,delz);
+        } else {
+          if (j<nlocal) {  // owned partner at or above start: apply the opposite force
+	    f[j][0] -= delx*fpair;
+	    f[j][1] -= dely*fpair;
+	    f[j][2] -= delz*fpair;
+  	  }
+	  if (evflag) ev_tally(i,j,nlocal,0,
+			       evdwl,ecoul,fpair,delx,dely,delz);
+        }
+      }
+    }
+  }
+}
+
--- a/src/GPU/pair_lj_cut_coul_long_gpu.h
+++ b/src/GPU/pair_lj_cut_coul_long_gpu.h
@ -0,0 +1,48 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/cut/coul/long/gpu,PairLJCutCoulLongGPU)
+
+#else
+
+#ifndef LMP_PAIR_LJ_CUT_COUL_LONG_GPU_H
+#define LMP_PAIR_LJ_CUT_COUL_LONG_GPU_H
+
+#include "pair_lj_cut_coul_long.h"
+
+namespace LAMMPS_NS {
+
+class PairLJCutCoulLongGPU : public PairLJCutCoulLong {  // registered above as lj/cut/coul/long/gpu
+ public:
+  PairLJCutCoulLongGPU(LAMMPS *lmp);
+  ~PairLJCutCoulLongGPU();                 // calls ljcl_gpu_clear()
+  void cpu_compute(int, int, int);         // (start, eflag, vflag): host loop over the neighbor list
+  void cpu_compute(int *, int, int, int);  // (nbors, start, eflag, vflag): host loop over a strided array
+  void compute(int, int);                  // (eflag, vflag): GPU library call, then host loop if needed
+  void init_style();                       // checks q and KSpace, recomputes cutsq, inits the GPU library
+  double memory_usage();                   // Pair::memory_usage() + ljcl_gpu_bytes()
+
+ enum { GPU_PAIR, GPU_NEIGH };  // gpu_mode values; GPU_NEIGH selects ljcl_gpu_compute_n() in compute()
+
+ private:
+  int gpu_mode;     // GPU_PAIR or GPU_NEIGH; passed by reference to ljcl_gpu_init()
+  double cpu_time;  // wall time of the last host-side cpu_compute(); passed to the next GPU call
+  int *gpulist;     // set from ljcl_gpu_compute_n() in GPU_NEIGH mode, then passed to cpu_compute()
+};
+
+}
+#endif
+#endif
+
--- a/src/GPU/pair_lj_cut_gpu.cpp
+++ b/src/GPU/pair_lj_cut_gpu.cpp
@ -2,28 +2,24 @@
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov
-
+   
   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under 
   the GNU General Public License.
-
+   
   See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */

 /* ----------------------------------------------------------------------
-   Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
-                         Peng Wang (Nvidia), penwang@nvidia.com
-                         Paul Crozier (SNL), pscrozi@sandia.gov
+   Contributing author: Mike Brown (SNL)
 ------------------------------------------------------------------------- */

 #include "math.h"
 #include "stdio.h"
 #include "stdlib.h"
 #include "pair_lj_cut_gpu.h"
-#include "math_extra.h"
 #include "atom.h"
-#include "domain.h"
 #include "atom_vec.h"
 #include "comm.h"
 #include "force.h"
@ -32,79 +28,45 @@
 #include "integrate.h"
 #include "memory.h"
 #include "error.h"
+#include "neigh_request.h"
 #include "universe.h"
-
-#include <string>
+#include "update.h"
+#include "domain.h"
+#include "string.h"

 #define MIN(a,b) ((a) < (b) ? (a) : (b))
 #define MAX(a,b) ((a) > (b) ? (a) : (b))

-#ifndef WINDLL
+// External functions from cuda library for atom decomposition

-// External functions from cuda library for force decomposition
-
-bool lj_gpu_init(int &ij_size, const int ntypes, double **cutsq,double **sigma, 
-                  double **epsilon, double **host_lj1, double **host_lj2, 
-                  double **host_lj3, double **host_lj4, double **offset, 
-                  double *special_lj, double *boxlo, double *boxhi, double cell_len, double skin,
-                  const int max_nbors, const int gpu_id);
-void lj_gpu_clear();
-double lj_gpu_cell(double **force, double *virial, double **host_x, int *host_type, const int inum, const int nall, 
-		   const int ago, const bool eflag, const bool vflag, 
-		   const double *boxlo, const double *boxhi);
-void lj_gpu_name(const int gpu_id, const int max_nbors, char * name);
-void lj_gpu_time();
-double lj_gpu_bytes();
-
-#else
-#include <windows.h>
-
-typedef bool (*_lj_gpu_init)(int &ij_size, const int ntypes, double **cutsq,double **sigma, 
-                  double **epsilon, double **host_lj1, double **host_lj2, 
-                  double **host_lj3, double **host_lj4, double **offset, 
-                  double *special_lj, double *boxlo, double *boxhi, double cell_len, double skin,
-                  const int max_nbors, const int gpu_id);
-typedef void (*_lj_gpu_clear)();
-typedef double (*_lj_gpu_cell)(double **force, double *virial, double **host_x, int *host_type, const int inum, const int nall, 
-		   const int ago, const bool eflag, const bool vflag, 
-		   const double *boxlo, const double *boxhi);
-typedef void (*_lj_gpu_name)(const int gpu_id, const int max_nbors, char * name);
-typedef void (*_lj_gpu_time)();
-typedef double (*_lj_gpu_bytes)();
-
-_lj_gpu_init lj_gpu_init;
-_lj_gpu_clear lj_gpu_clear;
-_lj_gpu_cell lj_gpu_cell;
-_lj_gpu_name lj_gpu_name;
-_lj_gpu_time lj_gpu_time;
-_lj_gpu_bytes lj_gpu_bytes;
-
-#endif
+bool ljl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
+                  double **host_lj2, double **host_lj3, double **host_lj4, 
+                  double **offset, double *special_lj, const int nlocal, 
+                  const int nall, const int max_nbors, const int maxspecial,
+                  const double cell_size, int &gpu_mode, FILE *screen);
+void ljl_gpu_clear();
+int * ljl_gpu_compute_n(const int timestep, const int ago, const int inum,
+	 	        const int nall, double **host_x, int *host_type, 
+                        double *boxlo, double *boxhi, int *tag, int **nspecial,
+                        int **special, const bool eflag, const bool vflag,
+                        const bool eatom, const bool vatom, int &host_start,
+                        const double cpu_time, bool &success);
+void ljl_gpu_compute(const int timestep, const int ago, const int inum,
+	 	     const int nall, double **host_x, int *host_type,
+                     int *ilist, int *numj, int **firstneigh,
+		     const bool eflag, const bool vflag, const bool eatom,
+                     const bool vatom, int &host_start, const double cpu_time,
+                     bool &success);
+double ljl_gpu_bytes();

 using namespace LAMMPS_NS;

 /* ---------------------------------------------------------------------- */

-PairLJCutGPU::PairLJCutGPU(LAMMPS *lmp) : PairLJCut(lmp), multi_gpu_mode(0)
+PairLJCutGPU::PairLJCutGPU(LAMMPS *lmp) : PairLJCut(lmp), gpu_mode(GPU_PAIR)
 {
-  ij_new=NULL;
  respa_enable = 0;
-
-#ifdef WINDLL
-  HINSTANCE hinstLib = LoadLibrary(TEXT("gpu.dll"));
-  if (hinstLib == NULL) {
-    printf("\nUnable to load gpu.dll\n");
-    exit(1);
-  }
-
-  lj_gpu_init=(_lj_gpu_init)GetProcAddress(hinstLib,"lj_gpu_init");
-  lj_gpu_clear=(_lj_gpu_clear)GetProcAddress(hinstLib,"lj_gpu_clear");
-  lj_gpu_cell=(_lj_gpu_cell)GetProcAddress(hinstLib,"lj_gpu_cell");
-  lj_gpu_name=(_lj_gpu_name)GetProcAddress(hinstLib,"lj_gpu_name");
-  lj_gpu_time=(_lj_gpu_time)GetProcAddress(hinstLib,"lj_gpu_time");
-  lj_gpu_bytes=(_lj_gpu_bytes)GetProcAddress(hinstLib,"lj_gpu_bytes");
-#endif
-
+  cpu_time = 0.0;
 }

 /* ----------------------------------------------------------------------
@ -113,17 +75,7 @@ PairLJCutGPU::PairLJCutGPU(LAMMPS *lmp) : PairLJCut(lmp), multi_gpu_mode(0)

 PairLJCutGPU::~PairLJCutGPU()
 {
-  printf("\n\n-------------------------------------");
-  printf("--------------------------------\n");
-  printf("      GPU Time Stamps: ");
-  printf("\n-------------------------------------");
-  printf("--------------------------------\n");
-  lj_gpu_time();
-  printf("-------------------------------------");
-  printf("--------------------------------\n\n");
-  lj_gpu_clear();
-  if (ij_new!=NULL)
-    delete [] ij_new;
+  ljl_gpu_clear();
 }

 /* ---------------------------------------------------------------------- */
@ -132,45 +84,37 @@ void PairLJCutGPU::compute(int eflag, int vflag)
 {
  if (eflag || vflag) ev_setup(eflag,vflag);
  else evflag = vflag_fdotr = 0;
+  
+  int nall = atom->nlocal + atom->nghost;
+  int inum, host_start;
+  
+  bool success = true;
+  
+  if (gpu_mode == GPU_NEIGH) {
+    inum = atom->nlocal;
+    gpulist = ljl_gpu_compute_n(update->ntimestep, neighbor->ago, inum, nall,
+			        atom->x, atom->type, domain->sublo,
+				domain->subhi, atom->tag, atom->nspecial,
+                                atom->special, eflag, vflag, eflag_atom,
+                                vflag_atom, host_start, cpu_time, success);
+  } else {
+    inum = list->inum;
+    ljl_gpu_compute(update->ntimestep, neighbor->ago, inum, nall, atom->x,
+		    atom->type, list->ilist, list->numneigh, list->firstneigh,
+		    eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time,
+                    success);
+  }
+  if (!success)
+    error->one("Out of memory on GPGPU");

-  // compute forces on GPU
-  eng_vdwl = lj_gpu_cell(atom->f, virial, atom->x, atom->type, atom->nlocal, atom->nlocal + atom->nghost, 
-			 neighbor->ago, eflag, vflag, domain->boxlo, domain->boxhi);
-
-  if (vflag_fdotr) virial_compute();
-}
-
-/* ----------------------------------------------------------------------
-   global settings
------------------------------------------------------------------------- */
-
-void PairLJCutGPU::settings(int narg, char **arg)
-{
-  // strip off GPU keyword/value and send remaining args to parent
-
-  if (narg < 2) error->all("Illegal pair_style command");
-
-  // set multi_gpu_mode to one/node for multiple gpus on 1 node
-  // -- param is starting gpu id
-  // set multi_gpu_mode to one/gpu to select the same gpu id on every node
-  // -- param is id of gpu
-  // set multi_gpu_mode to multi/gpu to get ma
-  // -- param is number of gpus per node
-
-  if (strcmp(arg[0],"one/node") == 0)
-    multi_gpu_mode = ONE_NODE;
-  else if (strcmp(arg[0],"one/gpu") == 0)
-    multi_gpu_mode = ONE_GPU;
-  else if (strcmp(arg[0],"multi/gpu") == 0)
-    multi_gpu_mode = MULTI_GPU;
-  else error->all("Illegal pair_style command");
-
-  multi_gpu_param = atoi(arg[1]);
-
-  if (multi_gpu_mode == MULTI_GPU && multi_gpu_param < 1)
-    error->all("Illegal pair_style command");
-
-  PairLJCut::settings(narg-2,&arg[2]);
+  if (host_start<inum) {
+    cpu_time = MPI_Wtime();
+    if (gpu_mode == GPU_NEIGH)
+      cpu_compute(gpulist, host_start, eflag, vflag);
+    else
+      cpu_compute(host_start, eflag, vflag);
+    cpu_time = MPI_Wtime() - cpu_time;
+  }
 }

 /* ----------------------------------------------------------------------
@ -179,74 +123,45 @@ void PairLJCutGPU::settings(int narg, char **arg)

 void PairLJCutGPU::init_style()
 {
+  cut_respa = NULL;
+
  if (force->pair_match("gpu",0) == NULL)
    error->all("Cannot use pair hybrid with multiple GPU pair styles");

-  // set the GPU ID
-
-  int my_gpu=comm->me+multi_gpu_param;
-  int ngpus=universe->nprocs;
-  if (multi_gpu_mode==ONE_GPU) {
-    my_gpu=multi_gpu_param;
-    ngpus=1;
-  } else if (multi_gpu_mode==MULTI_GPU) {
-    ngpus=multi_gpu_param;
-    my_gpu=comm->me%ngpus;
-    if (ngpus>universe->nprocs)
-      ngpus=universe->nprocs;
-  }
-
-  cut_respa=NULL;
-
  // Repeat cutsq calculation because done after call to init_style
-  double cut;
-  for (int i = 1; i <= atom->ntypes; i++)
-    for (int j = i; j <= atom->ntypes; j++) {
-      cut = init_one(i,j);
-      cutsq[i][j] = cutsq[j][i] = cut*cut;
-    }
-
-  // use the max cutoff length as the cell length
  double maxcut = -1.0;
-  for (int i = 1; i <= atom->ntypes; i++)
-    for (int j = i; j <= atom->ntypes; j++)
-      if (cutsq[i][j] > maxcut) maxcut = cutsq[i][j];
-
-  // for this problem, adding skin results in better perf
-  // this may be a parameter in the future
-  double cell_len = sqrt(maxcut) + neighbor->skin;
-
-  if (!lj_gpu_init(ij_size, atom->ntypes+1, cutsq, sigma, epsilon, lj1, lj2, lj3, 
-                 lj4, offset, force->special_lj, domain->boxlo, domain->boxhi, 
-                 cell_len, neighbor->skin, neighbor->oneatom, my_gpu))
-    error->one("At least one process could not allocate a CUDA-enabled gpu");
-
-  if (ij_new!=NULL)
-    delete [] ij_new;
-  ij_new=new int[ij_size];
-  
-  if (force->newton_pair) 
-    error->all("Cannot use newton pair with GPU lj/cut pair style");
-
-  if (comm->me == 0 && screen) {
-    printf("\n-------------------------------------");
-    printf("-------------------------------------\n");
-    printf("- Using GPGPU acceleration for LJ-Cut:\n");
-    printf("-------------------------------------");
-    printf("-------------------------------------\n");
-
-    for (int i=0; i<ngpus; i++) {
-      int gpui=my_gpu;
-      if (multi_gpu_mode==ONE_NODE)
-        gpui=i+multi_gpu_param;
-      else if (multi_gpu_mode==MULTI_GPU)
-        gpui=i;
-      char gpu_string[500];
-      lj_gpu_name(gpui,neighbor->oneatom,gpu_string);
-      printf("GPU %d: %s\n",gpui,gpu_string);   
+  double cut;
+  for (int i = 1; i <= atom->ntypes; i++) {
+    for (int j = i; j <= atom->ntypes; j++) {
+      if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
+        cut = init_one(i,j);
+        cut *= cut;
+        if (cut > maxcut)
+          maxcut = cut;
+        cutsq[i][j] = cutsq[j][i] = cut;
+      } else
+        cutsq[i][j] = cutsq[j][i] = 0.0;
    }
-    printf("-------------------------------------");
-    printf("-------------------------------------\n\n");
+  }
+  double cell_size = sqrt(maxcut) + neighbor->skin;
+
+  int maxspecial=0;
+  if (atom->molecular)
+    maxspecial=atom->maxspecial;
+  bool init_ok = ljl_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4,
+                              offset, force->special_lj, atom->nlocal,
+                              atom->nlocal+atom->nghost, 300, maxspecial,
+                              cell_size, gpu_mode, screen);
+  if (!init_ok)
+    error->one("Insufficient memory on accelerator (or no fix gpu).\n"); 
+
+  if (force->newton_pair) 
+    error->all("Cannot use newton pair with GPU LJ pair style");
+
+  if (gpu_mode != GPU_NEIGH) {
+    int irequest = neighbor->request(this);
+    neighbor->requests[irequest]->half = 0;
+    neighbor->requests[irequest]->full = 1;
  }
 }

@ -254,6 +169,149 @@ void PairLJCutGPU::init_style()

 double PairLJCutGPU::memory_usage()
 {
-  double bytes=Pair::memory_usage()+ij_size*sizeof(int);
-  return bytes+lj_gpu_bytes();
+  double bytes = Pair::memory_usage();
+  return bytes + ljl_gpu_bytes();
 }
+
+/* ----------------------------------------------------------------------
+   host-side LJ pair loop over neighbor-list entries ii = start .. inum-1
+   start : first index into list->ilist handled here
+   eflag : nonzero to evaluate the pair energy evdwl
+   vflag : not read in this function; tallying is gated by evflag
+   only f[i] is updated for each pair, and each pair is tallied through
+   ev_tally_full() with atom i
+------------------------------------------------------------------------- */
+
+void PairLJCutGPU::cpu_compute(int start, int eflag, int vflag) {
+  int i,j,ii,jj,inum,jnum,itype,jtype;
+  double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
+  double rsq,r2inv,r6inv,forcelj,factor_lj;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  // evdwl is handed to ev_tally_full() whenever evflag is set, including
+  // calls with eflag == 0, so it must hold a defined value
+  evdwl = 0.0;
+
+  double **x = atom->x;
+  double **f = atom->f;
+  int *type = atom->type;
+  int nlocal = atom->nlocal;
+  int nall = nlocal + atom->nghost;
+  double *special_lj = force->special_lj;
+
+  inum = list->inum;
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms, beginning at list entry "start"
+
+  for (ii = start; ii < inum; ii++) {
+    i = ilist[ii];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+
+      // an index >= nall carries a special-bond factor selector in j/nall
+      if (j < nall) factor_lj = 1.0;
+      else {
+        factor_lj = special_lj[j/nall];
+        j %= nall;
+      }
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq < cutsq[itype][jtype]) {
+        r2inv = 1.0/rsq;
+        r6inv = r2inv*r2inv*r2inv;
+        forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]);
+        fpair = factor_lj*forcelj*r2inv;
+
+        f[i][0] += delx*fpair;
+        f[i][1] += dely*fpair;
+        f[i][2] += delz*fpair;
+
+        if (eflag) {
+          evdwl = r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype]) -
+            offset[itype][jtype];
+          evdwl *= factor_lj;
+        }
+
+        if (evflag) ev_tally_full(i,evdwl,0.0,fpair,delx,dely,delz);
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   host-side LJ pair loop for atoms i = start .. nlocal-1 using the
+   neighbor array returned by ljl_gpu_compute_n() (GPU_NEIGH mode)
+   nbors : strided array with stride = nlocal-start; for atom i the entry
+           nbors[i-start] is its neighbor count and the neighbors follow
+           at steps of stride
+   start : first atom index handled here
+   eflag : nonzero to evaluate the pair energy evdwl
+   vflag : not read in this function; tallying is gated by evflag
+   pairs whose partner index is below start update f[i] only and use
+   ev_tally_full(); otherwise f[j] is also updated when j < nlocal and
+   the pair goes through ev_tally()
+------------------------------------------------------------------------- */
+
+void PairLJCutGPU::cpu_compute(int *nbors, int start, int eflag, int vflag) {
+  int i,j,itype,jtype;
+  int nlocal = atom->nlocal;
+  int nall = nlocal + atom->nghost;
+  int stride = nlocal-start;
+  double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
+  double rsq,r2inv,r6inv,forcelj,factor_lj;
+  double *special_lj = force->special_lj;
+
+  // evdwl is handed to the tally routines whenever evflag is set, including
+  // calls with eflag == 0, so it must hold a defined value
+  evdwl = 0.0;
+
+  double **x = atom->x;
+  double **f = atom->f;
+  int *type = atom->type;
+
+  // loop over neighbors of my atoms, beginning at atom index "start"
+
+  for (i = start; i < nlocal; i++) {
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    int *nbor = nbors + i - start;   // first slot for atom i: neighbor count
+    int jnum = *nbor;
+    nbor += stride;
+    int *nbor_end = nbor + stride * jnum;
+
+    for (; nbor<nbor_end; nbor+=stride) {
+      j = *nbor;
+
+      // an index >= nall carries a special-bond factor selector in j/nall
+      if (j < nall) factor_lj = 1.0;
+      else {
+        factor_lj = special_lj[j/nall];
+        j %= nall;
+      }
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq < cutsq[itype][jtype]) {
+        r2inv = 1.0/rsq;
+        r6inv = r2inv*r2inv*r2inv;
+        forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]);
+        fpair = factor_lj*forcelj*r2inv;
+
+        f[i][0] += delx*fpair;
+        f[i][1] += dely*fpair;
+        f[i][2] += delz*fpair;
+
+        if (eflag) {
+          evdwl = r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype]) -
+            offset[itype][jtype];
+          evdwl *= factor_lj;
+        }
+
+        if (j<start) {
+          // partner below start: only atom i is updated on the host
+          if (evflag) ev_tally_full(i,evdwl,0.0,fpair,delx,dely,delz);
+        } else {
+          // owned partner at or above start: apply the opposite force
+          if (j<nlocal) {
+            f[j][0] -= delx*fpair;
+            f[j][1] -= dely*fpair;
+            f[j][2] -= delz*fpair;
+          }
+          if (evflag) ev_tally(i,j,nlocal,0,
+                               evdwl,0.0,fpair,delx,dely,delz);
+        }
+      }
+    }
+  }
+}
+
--- a/src/GPU/pair_lj_cut_gpu.h
+++ b/src/GPU/pair_lj_cut_gpu.h
@ -17,8 +17,8 @@ PairStyle(lj/cut/gpu,PairLJCutGPU)

 #else

-#ifndef LMP_PAIR_LJ_CUT_GPU_H
-#define LMP_PAIR_LJ_CUT_GPU_H
+#ifndef LMP_PAIR_LJ_LIGHT_GPU_H
+#define LMP_PAIR_LJ_LIGHT_GPU_H

 #include "pair_lj_cut.h"

@ -28,20 +28,21 @@ class PairLJCutGPU : public PairLJCut {
 public:
  PairLJCutGPU(LAMMPS *lmp);
  ~PairLJCutGPU();
+  void cpu_compute(int, int, int);
+  void cpu_compute(int *, int, int, int);
  void compute(int, int);
-  void settings(int, char **);
  void init_style();
  double memory_usage();

-  enum { ONE_NODE, ONE_GPU, MULTI_GPU };
+ enum { GPU_PAIR, GPU_NEIGH };

- private:  
-  int ij_size;
-  int *ij_new;
- 
-  int last_neighbor, multi_gpu_mode, multi_gpu_param;
+ private:
+  int gpu_mode;
+  double cpu_time;
+  int *gpulist;
 };

 }
 #endif
 #endif
+