From 93e0e00793c3dfff9a48639f46897bb2e91ce2cd Mon Sep 17 00:00:00 2001
From: sjplimp <sjplimp@f3b2605a-c512-4ea7-a41b-209d697bcdaa>
Date: Tue, 11 Aug 2009 18:59:27 +0000
Subject: [PATCH] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@3047
 f3b2605a-c512-4ea7-a41b-209d697bcdaa

---
 src/GPU/Install.csh           |  39 ++++
 src/GPU/pair_gayberne_gpu.cpp | 369 ++++++++++++++++++++++++++++++++++
 src/GPU/pair_gayberne_gpu.h   |  43 ++++
 src/GPU/pair_lj_cut_gpu.cpp   | 270 +++++++++++++++++++++++++
 src/GPU/pair_lj_cut_gpu.h     |  40 ++++
 src/GPU/style_gpu.h           |  41 ++++
 src/Makefile                  |   2 +-
 src/OPT/Install.csh           |  38 +++-
 src/style.h                   |   1 +
 9 files changed, 831 insertions(+), 12 deletions(-)
 create mode 100644 src/GPU/Install.csh
 create mode 100644 src/GPU/pair_gayberne_gpu.cpp
 create mode 100644 src/GPU/pair_gayberne_gpu.h
 create mode 100644 src/GPU/pair_lj_cut_gpu.cpp
 create mode 100644 src/GPU/pair_lj_cut_gpu.h
 create mode 100644 src/GPU/style_gpu.h

diff --git a/src/GPU/Install.csh b/src/GPU/Install.csh
new file mode 100644
index 0000000000..80f0783fd8
--- /dev/null
+++ b/src/GPU/Install.csh
@@ -0,0 +1,39 @@
+# Install/unInstall package classes in LAMMPS
+# edit Makefile.package to include/exclude GPU library
+# do not copy gayberne files if non-GPU version does not exist
+
+if ($1 == 1) then
+
+  sed -i 's/\S*gpu //' ../Makefile.package
+  sed -i 's|^PKGPATH =\s*|&-L../../lib/gpu |' ../Makefile.package
+  sed -i 's|^PKGLIB =\s*|&-lgpu |' ../Makefile.package
+
+  cp style_gpu.h tmp.h
+  if (! -e ../pair_gayberne.cpp) then
+    grep -v gayberne tmp.h > tmp1.h
+    mv tmp1.h tmp.h
+  endif
+  mv tmp.h ../style_gpu.h
+
+  if (-e ../pair_gayberne.cpp) then
+    cp pair_gayberne_gpu.cpp ..
+    cp pair_gayberne_gpu.h ..
+  endif
+
+  cp pair_lj_cut_gpu.cpp ..
+  cp pair_lj_cut_gpu.h ..
+
+else if ($1 == 0) then
+
+  sed -i 's/\S*gpu //' ../Makefile.package
+
+  rm ../style_gpu.h
+  touch ../style_gpu.h
+
+  rm ../pair_gayberne_gpu.cpp
+  rm ../pair_lj_cut_gpu.cpp
+
+  rm ../pair_gayberne_gpu.h
+  rm ../pair_lj_cut_gpu.h
+
+endif
diff --git a/src/GPU/pair_gayberne_gpu.cpp b/src/GPU/pair_gayberne_gpu.cpp
new file mode 100644
index 0000000000..23ddff98dd
--- /dev/null
+++ b/src/GPU/pair_gayberne_gpu.cpp
@@ -0,0 +1,369 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   cetain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Mike Brown (SNL)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "stdio.h"
+#include "stdlib.h"
+#include "string.h"
+#include "pair_gayberne_gpu.h"
+#include "math_extra.h"
+#include "atom.h"
+#include "atom_vec.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "integrate.h"
+#include "memory.h"
+#include "error.h"
+#include "neigh_request.h"
+#include "universe.h"
+
+#ifdef GB_GPU_OMP
+#include "omp.h"
+#endif
+
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+// External functions from cuda library for atom decomposition
+int * gb_gpu_init(int &ij_size, const int ntypes, const double gamma,
+                  const double upsilon, const double mu, double **shape,
+                  double **well, double **cutsq, double **sigma, 
+                  double **epsilon, double *host_lshape, int **form,
+                  double **host_lj1, double **host_lj2, double **host_lj3, 
+                  double **host_lj4, double **offset, double *special_lj, 
+                  const int max_nbors, const int thread, const int gpu_id);
+void gb_gpu_clear(const int thread);
+int * gb_gpu_reset_nbors(const int nall, const int nlocal, const int inum, 
+                         int *ilist, const int *numj, const int *type,
+                         const int thread, bool &success);
+void gb_gpu_nbors(const int num_ij, const bool eflag, const int thread);
+void gb_gpu_atom(double **host_x, double **host_quat, const int *host_type, 
+                 const bool rebuild, const int thread);
+void gb_gpu_gayberne(const bool eflag, const bool vflag, const bool rebuild, 
+                     const int thread);
+double gb_gpu_forces(double **f, double **tor, const int *ilist,
+                     const bool eflag, const bool vflag, const bool eflag_atom,
+                     const bool vflag_atom, double *eatom, double **vatom,
+                     double *virial, const int thread);
+std::string gb_gpu_name(const int i, const int max_nbors);
+void gb_gpu_time(const int thread);
+int gb_gpu_num_devices();
+double gb_gpu_bytes();
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+PairGayBerneGPU::PairGayBerneGPU(LAMMPS *lmp) : PairGayBerne(lmp), my_thread(0),
+                                                omp_chunk(0), nthreads(1),
+                                                multi_gpu_mode(ONE_NODE),
+                                                multi_gpu_param(0)
+{
+  ij_new[0]=NULL;
+}
+
+/* ----------------------------------------------------------------------
+   free all arrays
+------------------------------------------------------------------------- */
+
+PairGayBerneGPU::~PairGayBerneGPU()
+{
+  if (comm->me == 0 && screen) {
+    printf("\n\n-------------------------------------");
+    printf("--------------------------------\n");
+    printf("      GPU Time Stamps: ");
+    printf("\n-------------------------------------");
+    printf("--------------------------------\n");
+    gb_gpu_time(my_thread);
+    std::cout << "Procs: " << universe->nprocs << std::endl;
+    printf("-------------------------------------");
+    printf("--------------------------------\n\n");
+  }
+  #pragma omp parallel
+  {
+    #ifdef GB_GPU_OMP
+    int my_thread=omp_get_thread_num();
+    #endif
+    gb_gpu_clear(my_thread);
+    if (ij_new[my_thread]!=NULL) {
+      ij_new[my_thread]=NULL;
+      delete [] ij_new[my_thread];
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairGayBerneGPU::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  else evflag = vflag_fdotr = 0;
+  if (vflag_atom) 
+    error->all("Per-atom virial not available with GPU Gay-Berne.");
+
+  int nlocal = atom->nlocal;
+  int nall = nlocal + atom->nghost;
+  int inum = list->inum;
+  
+  bool rebuild=false;
+  if (neighbor->ncalls > last_neighbor) {
+    last_neighbor=neighbor->ncalls;
+    rebuild=true;
+  }
+  
+  #pragma omp parallel
+  {
+    
+  bool success=true;
+  #ifdef GB_GPU_OMP
+  int my_thread=omp_get_thread_num();
+  if (rebuild) {
+    omp_chunk=static_cast<int>(ceil(static_cast<double>(inum)/nthreads));
+    if (my_thread==nthreads-1)
+      thread_inum[my_thread]=inum-(nthreads-1)*omp_chunk;
+    else
+      thread_inum[my_thread]=omp_chunk;
+    olist[my_thread]=gb_gpu_reset_nbors(nall, atom->nlocal, 
+                                        thread_inum[my_thread],  
+                                        list->ilist+omp_chunk*my_thread, 
+                                        list->numneigh, atom->type, my_thread,
+                                        success);
+  }
+  #else
+  if (rebuild)
+    olist[my_thread]=gb_gpu_reset_nbors(nall, atom->nlocal, inum, list->ilist, 
+                                        list->numneigh, atom->type, my_thread,
+                                        success);
+  #endif
+  if (!success)
+    error->one("Total # of atoms exceeds maximum allowed per GPGPU.\n");
+  
+  // copy atom data to GPU
+  gb_gpu_atom(atom->x,atom->quat,atom->type,rebuild,my_thread);
+
+  int i,j,ii,jj,jnum;
+  double factor_lj;
+  int *jlist;
+
+  if (rebuild==true) {
+    int num_ij = 0;
+
+    // loop over neighbors of my atoms
+    int *ijp=ij_new[my_thread];
+    #ifdef GB_GPU_OMP
+    int mgo=my_thread*omp_chunk;
+    int mgot=mgo+thread_inum[my_thread];
+    #else
+    int mgo=0, mgot=inum;
+    #endif
+    for (ii = mgo; ii<mgot; ii++) {
+      i = olist[my_thread][ii];
+      jlist = list->firstneigh[i];
+      jnum = list->numneigh[i];
+      
+      for (jj = 0; jj < jnum; jj++) {
+        j = jlist[jj];
+
+        *ijp=j;
+        ijp++;
+        num_ij++;
+          
+        if (num_ij==ij_size) {
+          memcpy(ij[my_thread],ij_new[my_thread],num_ij*sizeof(int));
+          gb_gpu_nbors(num_ij,eflag,my_thread);
+          ijp=ij_new[my_thread];
+          num_ij=0;
+        }
+      }
+    }
+    if (num_ij>0) {
+      memcpy(ij[my_thread],ij_new[my_thread],num_ij*sizeof(int));
+      gb_gpu_nbors(num_ij,eflag,my_thread);
+    }
+  }
+  
+  gb_gpu_gayberne(eflag,vflag,rebuild,my_thread);
+  double lvirial[6];
+  for (int i=0; i<6; i++) lvirial[i]=0.0;
+  double my_eng=gb_gpu_forces(atom->f,atom->torque,olist[my_thread],eflag,vflag,
+                              eflag_atom, vflag_atom, eatom, vatom, lvirial,
+                              my_thread);
+  #pragma omp critical
+  {
+    eng_vdwl+=my_eng;
+    virial[0]+=lvirial[0];
+    virial[1]+=lvirial[1];
+    virial[2]+=lvirial[2];
+    virial[3]+=lvirial[3];
+    virial[4]+=lvirial[4];
+    virial[5]+=lvirial[5];
+  }
+  
+  } //End parallel
+  
+  if (vflag_fdotr) virial_compute();
+}
+
+/* ----------------------------------------------------------------------
+   global settings
+------------------------------------------------------------------------- */
+
+void PairGayBerneGPU::settings(int narg, char **arg)
+{
+  if (narg != 4 && narg != 6) error->all("Illegal pair_style command");
+  
+  // Set multi_gpu_mode to one_node for multiple gpus on 1 node
+  // -- param is starting gpu id
+  // Set multi_gpu_mode to one_gpu to select the same gpu id on every node
+  // -- param is id of gpu
+  // Set multi_gpu_mode to multi_gpu to get ma
+  // -- param is number of gpus per node
+  multi_gpu_mode=ONE_NODE;
+  multi_gpu_param=0;
+  if (narg==6) {
+    multi_gpu_param=atoi(arg[5]);
+    if (strcmp("one_node",arg[4])==0)
+      multi_gpu_mode=ONE_NODE;
+    else if (strcmp("one_gpu",arg[4])==0)
+      multi_gpu_mode=ONE_GPU;
+    else if (strcmp("multi_gpu",arg[4])==0) {
+      multi_gpu_mode=MULTI_GPU;
+      if (multi_gpu_param<1)
+        error->all("Illegal pair_style command");
+    } else
+      error->all("Illegal pair_style command");
+    narg-=2;
+  }
+
+  PairGayBerne::settings(narg,arg);
+}
+
+/* ----------------------------------------------------------------------
+   init specific to this pair style
+------------------------------------------------------------------------- */
+
+void PairGayBerneGPU::init_style()
+{
+  // Set the GPU ID
+  int my_gpu=comm->me+multi_gpu_param;
+  int ngpus=universe->nprocs;
+  if (multi_gpu_mode==ONE_GPU) {
+    my_gpu=multi_gpu_param;
+    ngpus=1;
+  } else if (multi_gpu_mode==MULTI_GPU) {
+    ngpus=multi_gpu_param;
+    my_gpu=comm->me%ngpus;
+    if (ngpus>universe->nprocs)
+      ngpus=universe->nprocs;
+  }
+  
+  if (!atom->quat_flag || !atom->torque_flag || !atom->avec->shape_type)
+    error->all("Pair gayberne requires atom attributes quat, torque, shape");
+  if (atom->radius_flag)
+    error->all("Pair gayberne cannot be used with atom attribute diameter");
+
+  int irequest = neighbor->request(this);
+
+  // per-type shape precalculations
+
+  for (int i = 1; i <= atom->ntypes; i++) {
+    if (setwell[i]) {
+      double *one = atom->shape[i];
+      shape[i][0] = one[0]*one[0];
+      shape[i][1] = one[1]*one[1];
+      shape[i][2] = one[2]*one[2];
+      lshape[i] = (one[0]*one[1]+one[2]*one[2])*sqrt(one[0]*one[1]);
+    }
+  }
+
+  // Repeat cutsq calculation because done after call to init_style
+  double cut;
+  for (int i = 1; i <= atom->ntypes; i++)
+    for (int j = i; j <= atom->ntypes; j++) {
+      cut = init_one(i,j);
+      cutsq[i][j] = cutsq[j][i] = cut*cut;
+    }
+
+  // If compiled with OpenMP and only 1 proc, try to use multiple GPUs w/threads
+  #ifdef GB_GPU_OMP
+  if (multi_gpu_mode!=ONE_GPU)
+    nthreads=ngpus=gb_gpu_num_devices();
+  else
+    nthreads=ngpus=1;
+  if (nthreads>MAX_GPU_THREADS)
+    nthreads=MAX_GPU_THREADS;
+  omp_set_num_threads(nthreads);
+  #endif
+    
+  #pragma omp parallel firstprivate(my_gpu)
+  {
+    #ifdef GB_GPU_OMP
+    int my_thread = omp_get_thread_num();
+    if (multi_gpu_mode!=ONE_GPU)
+      my_gpu=my_thread;
+    if (multi_gpu_mode==ONE_NODE)
+      my_gpu+=multi_gpu_param;
+    #endif
+    
+    ij[my_thread]=gb_gpu_init(ij_size, atom->ntypes+1, gamma, upsilon, mu, 
+                              shape, well, cutsq, sigma, epsilon, lshape, form,
+                              lj1, lj2, lj3, lj4, offset, force->special_lj, 
+                              neighbor->oneatom, my_thread, my_gpu);
+    if (ij[my_thread]==0)
+      error->one("AT LEAST ONE PROCESS COULD NOT ALLOCATE A CUDA-ENABLED GPU.");
+    
+    if (ij_new[my_thread]!=NULL)
+      delete [] ij_new[my_thread];
+    ij_new[my_thread]=new int[ij_size];
+  }
+  
+  last_neighbor = -1;
+  neighbor->requests[irequest]->half = 0;
+  neighbor->requests[irequest]->full = 1;
+  if (force->newton_pair) 
+    error->all("Cannot use newton with GPU GayBerne pair style.");
+
+  if (comm->me == 0 && screen) {
+    printf("\n-------------------------------------");
+    printf("-------------------------------------\n");
+    printf("- Using GPGPU acceleration for Gay-Berne:\n");
+    printf("-------------------------------------");
+    printf("-------------------------------------\n");
+
+    for (int i=0; i<ngpus; i++) {
+      int gpui=my_gpu;
+      if (multi_gpu_mode==ONE_NODE)
+        gpui=i+multi_gpu_param;
+      else if (multi_gpu_mode==MULTI_GPU)
+        gpui=i;
+      std::string gpu_string=gb_gpu_name(gpui,neighbor->oneatom);
+      printf("GPU %d: %s\n",gpui,gpu_string.c_str());  
+    }
+    printf("-------------------------------------");
+    printf("-------------------------------------\n\n");
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairGayBerneGPU::memory_usage()
+{
+  double bytes=Pair::memory_usage()+nthreads*ij_size*sizeof(int);
+  return bytes+gb_gpu_bytes();
+}
diff --git a/src/GPU/pair_gayberne_gpu.h b/src/GPU/pair_gayberne_gpu.h
new file mode 100644
index 0000000000..7be34c3531
--- /dev/null
+++ b/src/GPU/pair_gayberne_gpu.h
@@ -0,0 +1,43 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifndef PAIR_GPU_H
+#define PAIR_GPU_H
+
+#include "pair_gayberne.h"
+#define MAX_GPU_THREADS 4
+
+namespace LAMMPS_NS {
+
+class PairGayBerneGPU : public PairGayBerne {
+ public:
+  PairGayBerneGPU(LAMMPS *lmp);
+  ~PairGayBerneGPU();
+  void compute(int, int);
+  void settings(int, char **);
+  void init_style();
+  double memory_usage();
+ 
+  enum { ONE_NODE, ONE_GPU, MULTI_GPU };
+
+ private:  
+  int ij_size;
+  int *ij[MAX_GPU_THREADS], *ij_new[MAX_GPU_THREADS], *olist[MAX_GPU_THREADS];
+ 
+  int my_thread, nthreads, thread_inum[MAX_GPU_THREADS], omp_chunk;
+ 
+  int last_neighbor, multi_gpu_mode, multi_gpu_param;
+};
+
+}
+#endif
diff --git a/src/GPU/pair_lj_cut_gpu.cpp b/src/GPU/pair_lj_cut_gpu.cpp
new file mode 100644
index 0000000000..d82f99e87d
--- /dev/null
+++ b/src/GPU/pair_lj_cut_gpu.cpp
@@ -0,0 +1,270 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   cetain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Mike Brown (SNL)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "stdio.h"
+#include "stdlib.h"
+#include "string.h"
+#include "pair_lj_cut_gpu.h"
+#include "math_extra.h"
+#include "atom.h"
+#include "atom_vec.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "integrate.h"
+#include "memory.h"
+#include "error.h"
+#include "neigh_request.h"
+#include "universe.h"
+
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+// External functions from cuda library for force decomposition
+int * lj_gpu_init(int &ij_size, const int ntypes, double **cutsq, 
+                  double **sigma, double **epsilon, double **host_lj1, 
+                  double **host_lj2, double **host_lj3, double **host_lj4, 
+                  double **offset, double *special_lj, const int max_nbors, 
+                  const int gpu_id);
+void lj_gpu_clear();
+bool lj_gpu_reset_nbors(const int nall, const int inum, int *ilist, 
+                        const int *numj);
+void lj_gpu_nbors(const int num_ij);
+void lj_gpu_atom(double **host_x, const int *host_type, const bool rebuild);
+void lj_gpu(const bool eflag, const bool vflag, const bool rebuild);
+double lj_gpu_forces(double **f, const int *ilist, const bool eflag, 
+                     const bool vflag, const bool eflag_atom, 
+                     const bool vflag_atom, double *eatom, double **vatom,
+                     double *virial);
+std::string lj_gpu_name(const int gpu_id, const int max_nbors);
+void lj_gpu_time();
+int lj_gpu_num_devices();
+double lj_gpu_bytes();
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+PairLJCutGPU::PairLJCutGPU(LAMMPS *lmp) : PairLJCut(lmp), multi_gpu_mode(0)
+{
+  ij_new=NULL;
+  respa_enable = 0;
+}
+
+/* ----------------------------------------------------------------------
+   free all arrays
+------------------------------------------------------------------------- */
+
+PairLJCutGPU::~PairLJCutGPU()
+{
+  if (comm->me == 0 && screen) {
+    printf("\n\n-------------------------------------");
+    printf("--------------------------------\n");
+    printf("      GPU Time Stamps: ");
+    printf("\n-------------------------------------");
+    printf("--------------------------------\n");
+    lj_gpu_time();
+    std::cout << "Procs: " << universe->nprocs << std::endl;
+    printf("-------------------------------------");
+    printf("--------------------------------\n\n");
+  }
+  lj_gpu_clear();
+  if (ij_new!=NULL) {
+    ij_new=NULL;
+    delete [] ij_new;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJCutGPU::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  else evflag = vflag_fdotr = 0;
+  if (vflag_atom) 
+    error->all("Per-atom virial not available with GPU Gay-Berne.");
+
+  int nlocal = atom->nlocal;
+  int nall = nlocal + atom->nghost;
+  int inum = list->inum;
+  int *ilist = list->ilist;
+  
+  bool rebuild=false;
+  if (neighbor->ncalls > last_neighbor) {
+    last_neighbor=neighbor->ncalls;
+    rebuild=true;
+  }
+  
+  // copy nbors to GPU
+  if (rebuild)
+    if (!lj_gpu_reset_nbors(nall, inum, ilist, list->numneigh))
+      error->one("Total # of atoms exceed maximum allowed per GPGPU.\n");
+  
+  // copy atom data to GPU
+  lj_gpu_atom(atom->x,atom->type,rebuild);
+
+  int i,j,ii,jj,jnum;
+  double factor_lj;
+  int *jlist;
+
+  if (rebuild==true) {
+    int num_ij = 0;
+
+    // loop over neighbors of my atoms
+    int *ijp=ij_new;
+    for (ii = 0; ii<inum; ii++) {
+      i = ilist[ii];
+      jlist = list->firstneigh[i];
+      jnum = list->numneigh[i];
+      
+      for (jj = 0; jj < jnum; jj++) {
+        j = jlist[jj];
+
+        *ijp=j;
+        ijp++;
+        num_ij++;
+          
+        if (num_ij==ij_size) {
+          memcpy(ij,ij_new,num_ij*sizeof(int));
+          lj_gpu_nbors(num_ij);
+          ijp=ij_new;
+          num_ij=0;
+        }
+      }
+    }
+    if (num_ij>0) {
+      memcpy(ij,ij_new,num_ij*sizeof(int));
+      lj_gpu_nbors(num_ij);
+    }
+  }
+  
+  lj_gpu(eflag,vflag,rebuild);
+  eng_vdwl=lj_gpu_forces(atom->f,ilist,eflag,vflag, eflag_atom, vflag_atom, 
+                         eatom, vatom, virial);
+
+  if (vflag_fdotr) virial_compute();
+}
+
+/* ----------------------------------------------------------------------
+   global settings
+------------------------------------------------------------------------- */
+
+void PairLJCutGPU::settings(int narg, char **arg)
+{
+  if (narg != 1 && narg != 3) error->all("Illegal pair_style command");
+  
+  // Set multi_gpu_mode to one_node for multiple gpus on 1 node
+  // -- param is starting gpu id
+  // Set multi_gpu_mode to one_gpu to select the same gpu id on every node
+  // -- param is id of gpu
+  // Set multi_gpu_mode to multi_gpu to get ma
+  // -- param is number of gpus per node
+  multi_gpu_mode=ONE_NODE;
+  multi_gpu_param=0;
+  if (narg==3) {
+    multi_gpu_param=atoi(arg[2]);
+    if (strcmp("one_node",arg[1])==0)
+      multi_gpu_mode=ONE_NODE;
+    else if (strcmp("one_gpu",arg[1])==0)
+      multi_gpu_mode=ONE_GPU;
+    else if (strcmp("multi_gpu",arg[1])==0) {
+      multi_gpu_mode=MULTI_GPU;
+      if (multi_gpu_param<1)
+        error->all("Illegal pair_style command");
+    } else
+      error->all("Illegal pair_style command");
+    narg-=2;
+  }
+
+  PairLJCut::settings(narg,arg);
+}
+
+/* ----------------------------------------------------------------------
+   init specific to this pair style
+------------------------------------------------------------------------- */
+
+void PairLJCutGPU::init_style()
+{
+  // Set the GPU ID
+  int my_gpu=comm->me+multi_gpu_param;
+  int ngpus=universe->nprocs;
+  if (multi_gpu_mode==ONE_GPU) {
+    my_gpu=multi_gpu_param;
+    ngpus=1;
+  } else if (multi_gpu_mode==MULTI_GPU) {
+    ngpus=multi_gpu_param;
+    my_gpu=comm->me%ngpus;
+    if (ngpus>universe->nprocs)
+      ngpus=universe->nprocs;
+  }
+  
+  int irequest = neighbor->request(this);
+  cut_respa=NULL;
+
+  // Repeat cutsq calculation because done after call to init_style
+  double cut;
+  for (int i = 1; i <= atom->ntypes; i++)
+    for (int j = i; j <= atom->ntypes; j++) {
+      cut = init_one(i,j);
+      cutsq[i][j] = cutsq[j][i] = cut*cut;
+    }
+
+  ij=lj_gpu_init(ij_size, atom->ntypes+1, cutsq, sigma, epsilon, lj1, lj2, lj3, 
+                 lj4, offset, force->special_lj, neighbor->oneatom, my_gpu);
+  if (ij==0)
+    error->one("AT LEAST ONE PROCESS COULD NOT ALLOCATE A CUDA-ENABLED GPU.");
+    
+  if (ij_new!=NULL)
+    delete [] ij_new;
+  ij_new=new int[ij_size];
+  
+  last_neighbor = -1;
+  neighbor->requests[irequest]->half = 0;
+  neighbor->requests[irequest]->full = 1;
+  if (force->newton_pair) 
+    error->all("Cannot use newton with GPU LJCut pair style.");
+
+  if (comm->me == 0 && screen) {
+    printf("\n-------------------------------------");
+    printf("-------------------------------------\n");
+    printf("- Using GPGPU acceleration for LJ-Cut:\n");
+    printf("-------------------------------------");
+    printf("-------------------------------------\n");
+
+    for (int i=0; i<ngpus; i++) {
+      int gpui=my_gpu;
+      if (multi_gpu_mode==ONE_NODE)
+        gpui=i+multi_gpu_param;
+      else if (multi_gpu_mode==MULTI_GPU)
+        gpui=i;
+      std::string gpu_string=lj_gpu_name(gpui,neighbor->oneatom);
+      printf("GPU %d: %s\n",gpui,gpu_string.c_str());  
+    }
+    printf("-------------------------------------");
+    printf("-------------------------------------\n\n");
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairLJCutGPU::memory_usage()
+{
+  double bytes=Pair::memory_usage()+ij_size*sizeof(int);
+  return bytes+lj_gpu_bytes();
+}
diff --git a/src/GPU/pair_lj_cut_gpu.h b/src/GPU/pair_lj_cut_gpu.h
new file mode 100644
index 0000000000..2774e723ed
--- /dev/null
+++ b/src/GPU/pair_lj_cut_gpu.h
@@ -0,0 +1,40 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifndef PAIR_LJ_CUT_GPU_H
+#define PAIR_LJ_CUT_GPU_H
+
+#include "pair_lj_cut.h"
+
+namespace LAMMPS_NS {
+
+class PairLJCutGPU : public PairLJCut {
+ public:
+  PairLJCutGPU(LAMMPS *lmp);
+  ~PairLJCutGPU();
+  void compute(int, int);
+  void settings(int, char **);
+  void init_style();
+  double memory_usage();
+
+  enum { ONE_NODE, ONE_GPU, MULTI_GPU };
+
+ private:  
+  int ij_size;
+  int *ij, *ij_new;
+ 
+  int last_neighbor, multi_gpu_mode, multi_gpu_param;
+};
+
+}
+#endif
diff --git a/src/GPU/style_gpu.h b/src/GPU/style_gpu.h
new file mode 100644
index 0000000000..1be999ccdb
--- /dev/null
+++ b/src/GPU/style_gpu.h
@@ -0,0 +1,41 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef AtomInclude
+#endif
+
+#ifdef AtomClass
+# endif
+
+#ifdef ComputeInclude
+#endif
+
+#ifdef ComputeClass
+#endif
+
+#ifdef FixInclude
+#endif
+
+#ifdef FixClass
+#endif
+
+#ifdef PairInclude
+#include "pair_lj_cut_gpu.h"
+#include "pair_gayberne_gpu.h"
+#endif
+
+#ifdef PairClass
+PairStyle(lj/cut/gpu,PairLJCutGPU)
+PairStyle(gayberne/gpu,PairGayBerneGPU)
+#endif
+
diff --git a/src/Makefile b/src/Makefile
index f84ef210f5..55f00f28db 100755
--- a/src/Makefile
+++ b/src/Makefile
@@ -13,7 +13,7 @@ OBJ = 	$(SRC:.cpp=.o)
 
 # Package variables
 
-PACKAGE = asphere class2 colloid dipole dpd granular \
+PACKAGE = asphere class2 colloid dipole dpd gpu granular \
 	  kspace manybody meam molecule opt peri poems reax xtc
 
 PACKUSER = user-ackland user-cg-cmm user-ewaldn user-smd
diff --git a/src/OPT/Install.csh b/src/OPT/Install.csh
index 0a42aeff25..2e0d6f6dd0 100644
--- a/src/OPT/Install.csh
+++ b/src/OPT/Install.csh
@@ -1,21 +1,37 @@
 # Install/unInstall package classes in LAMMPS
+# do not copy eam and charmm files if non-OPT versions do not exist
 
 if ($1 == 1) then
 
-  cp style_opt.h ..
+  if (-e ../pair_eam.cpp) then
+    cp pair_eam_opt.cpp ..
+    cp pair_eam_alloy_opt.cpp ..
+    cp pair_eam_fs_opt.cpp ..
+    cp pair_eam_opt.h ..
+    cp pair_eam_alloy_opt.h ..
+    cp pair_eam_fs_opt.h ..
+  endif
+
+  cp style_opt.h tmp.h
+  if (! -e ../pair_eam.cpp) then
+    grep -v eam tmp.h > tmp1.h
+    mv tmp1.h tmp.h
+  endif
+  if (! -e ../pair_lj_charmm_coul_long.cpp) then
+    grep -v charmm tmp.h > tmp1.h
+    mv tmp1.h tmp.h
+  endif
+  mv tmp.h ../style_opt.h
+
+  if (-e ../pair_lj_charmm_coul_long.cpp) then
+    cp pair_lj_charmm_coul_long_opt.cpp ..
+    cp pair_lj_charmm_coul_long_opt.h ..
+  endif
 
-  cp pair_eam_opt.cpp ..
-  cp pair_eam_alloy_opt.cpp ..
-  cp pair_eam_fs_opt.cpp ..
-  cp pair_lj_charmm_coul_long_opt.cpp ..
   cp pair_lj_cut_opt.cpp ..
-  cp pair_morse_opt.cpp ..
-
-  cp pair_eam_opt.h ..
-  cp pair_eam_alloy_opt.h ..
-  cp pair_eam_fs_opt.h ..
-  cp pair_lj_charmm_coul_long_opt.h ..
   cp pair_lj_cut_opt.h ..
+
+  cp pair_morse_opt.cpp ..
   cp pair_morse_opt.h ..
 
 else if ($1 == 0) then
diff --git a/src/style.h b/src/style.h
index e5cdf8c1d1..3df9044334 100644
--- a/src/style.h
+++ b/src/style.h
@@ -372,6 +372,7 @@ RegionStyle(union,RegUnion)
 #include "style_colloid.h"
 #include "style_dipole.h"
 #include "style_dpd.h"
+#include "style_gpu.h"
 #include "style_granular.h"
 #include "style_kspace.h"
 #include "style_manybody.h"