forked from lijiext/lammps
Added GPU library and package directories.
git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@3036 f3b2605a-c512-4ea7-a41b-209d697bcdaa
This commit is contained in:
parent
15b6eeb011
commit
2c6dc60a30
|
@ -0,0 +1,75 @@
|
|||
#***************************************************************************
# Makefile
# -------------------
# W. Michael Brown
#
# _________________________________________________________________________
# Build for the LAMMPS GPU Force Library
#
# _________________________________________________________________________
#
# begin : Tue June 23 2009
# copyright : (C) 2009 by W. Michael Brown
# email : wmbrown@sandia.gov
# ***************************************************************************/

# Output locations for the helper executable and object files
BIN_DIR = .
OBJ_DIR = .
AR = ar

# Compiler invocation, register/architecture flags, and precision mode.
# CUDA_PREC selects one of _SINGLE_SINGLE / _DOUBLE_DOUBLE / _SINGLE_DOUBLE
# (see the README for details).
CUDA_CPP = nvcc -I/usr/local/cuda/include -DUNIX -O3 -DDEBUG -Xptxas -v --use_fast_math
CUDA_ARCH = -maxrregcount 128 #-arch=sm_13
CUDA_PREC = -D_SINGLE_SINGLE
CUDA_LINK = -L/usr/local/cuda/lib -lcudart $(CUDA_LIB)

CUDA = $(CUDA_CPP) $(CUDA_ARCH) $(CUDA_PREC)

CUDA_LIB = $(OBJ_DIR)/pair_gpu_lib.a

# Headers for CUDA Stuff
NVC_H = nvc_macros.h nvc_device.h nvc_timer.h nvc_memory.h
# Headers for Pair Stuff
PAIR_H = pair_gpu_texture.h pair_gpu_atom.h pair_gpu_nbor.h

ALL_H = $(NVC_H) $(PAIR_H)

EXECS = $(BIN_DIR)/nvc_get_devices
OBJS = $(OBJ_DIR)/nvc_device.o $(OBJ_DIR)/gb_gpu.cu_o \
       $(OBJ_DIR)/gb_gpu_memory.cu_o $(OBJ_DIR)/lj_gpu.cu_o \
       $(OBJ_DIR)/lj_gpu_memory.cu_o $(OBJ_DIR)/pair_gpu_atom.cu_o \
       $(OBJ_DIR)/pair_gpu_nbor.cu_o

# all/clean/veryclean never produce files of those names; mark them phony so
# a stray file cannot mask the target.
.PHONY: all clean veryclean

all: $(CUDA_LIB) $(EXECS)

$(OBJ_DIR)/nvc_device.o: nvc_device.cu $(NVC_H)
	$(CUDA) -o $@ -c nvc_device.cu

$(OBJ_DIR)/pair_gpu_atom.cu_o: pair_gpu_atom.cu pair_gpu_texture.h pair_gpu_atom.h $(NVC_H)
	$(CUDA) -o $@ -c pair_gpu_atom.cu

$(OBJ_DIR)/pair_gpu_nbor.cu_o: pair_gpu_nbor.cu pair_gpu_texture.h pair_gpu_nbor.h $(NVC_H)
	$(CUDA) -o $@ -c pair_gpu_nbor.cu

$(OBJ_DIR)/lj_gpu_memory.cu_o: lj_gpu_memory.cu lj_gpu_memory.h $(ALL_H)
	$(CUDA) -o $@ -c lj_gpu_memory.cu

$(OBJ_DIR)/lj_gpu.cu_o: lj_gpu.cu $(ALL_H) lj_gpu_memory.h lj_gpu_kernel.h
	$(CUDA) -o $@ -c lj_gpu.cu

$(OBJ_DIR)/gb_gpu_memory.cu_o: gb_gpu_memory.cu gb_gpu_memory.h $(ALL_H)
	$(CUDA) -o $@ -c gb_gpu_memory.cu

$(OBJ_DIR)/gb_gpu.cu_o: gb_gpu.cu $(ALL_H) gb_gpu_memory.h gb_gpu_kernel.h gb_gpu_extra.h
	$(CUDA) -o $@ -c gb_gpu.cu

# FIX: the link line previously used $(CUDALNK), which is never defined;
# the intended variable is $(CUDA_LINK) defined above.
$(BIN_DIR)/nvc_get_devices: nvc_get_devices.cu $(NVC_H) $(OBJ_DIR)/nvc_device.o
	$(CUDA) -o $@ nvc_get_devices.cu $(CUDA_LINK) $(OBJ_DIR)/nvc_device.o

$(CUDA_LIB): $(OBJS)
	$(AR) -crusv $(CUDA_LIB) $(OBJS)

clean:
	rm -rf $(EXECS) $(CUDA_LIB) $(OBJS) *.linkinfo

veryclean: clean
	rm -rf *~ *.linkinfo
|
||||
|
|
@ -0,0 +1,94 @@
|
|||
/***************************************************************************
|
||||
README
|
||||
-------------------
|
||||
W. Michael Brown
|
||||
|
||||
README for building LAMMPS GPU Library
|
||||
|
||||
__________________________________________________________________________
|
||||
This file is part of the LAMMPS GPU Library
|
||||
__________________________________________________________________________
|
||||
|
||||
begin : Thu Jun 25 2009
|
||||
copyright : (C) 2009 by W. Michael Brown
|
||||
email : wmbrown@sandia.gov
|
||||
***************************************************************************/
|
||||
|
||||
/* -----------------------------------------------------------------------
|
||||
Copyright (2009) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
|
||||
GENERAL NOTES
|
||||
|
||||
This library, pair_gpu_lib.a, provides routines for GPGPU acceleration of
|
||||
LAMMPS pair styles. Currently, only CUDA enabled GPUs are supported.
|
||||
Compilation of this library requires installing the CUDA GPU driver and
|
||||
CUDA toolkit for your operating system. In addition to the LAMMPS library,
|
||||
the binary nvc_get_devices will also be built. This can be used to query
|
||||
the names and properties of GPU devices on your system.
|
||||
|
||||
NOTE: Installation of the CUDA SDK is not required.
|
||||
|
||||
Current pair styles supporting GPU Acceleration:
|
||||
|
||||
1. lj/cut/gpu
|
||||
2. gayberne/gpu
|
||||
|
||||
|
||||
|
||||
MULTIPLE LAMMPS PROCESSES
|
||||
|
||||
When using GPGPU acceleration, you are restricted to one physical GPU per
|
||||
LAMMPS process. This can be multiple GPUs on a single node or across multiple
|
||||
nodes. Instructions on GPU assignment can be found in the LAMMPS documentation.
|
||||
|
||||
|
||||
|
||||
SPEEDUPS
|
||||
|
||||
The speedups that can be obtained using this library are highly
|
||||
dependent on the GPU architecture and the computational expense of
|
||||
the pair potential. When comparing a single precision Tesla C1060 run to a
|
||||
serial Intel Xeon 5140 2.33 GHz serial run, the speedup is ~4.42x for lj/cut
|
||||
with a cutoff of 2.5. For gayberne with a cutoff of 7, the speedup is >103x
|
||||
for 8000 particles. The speedup will improve with an increase in the
|
||||
number of particles or an increase in the cutoff.
|
||||
|
||||
|
||||
|
||||
BUILDING AND PRECISION MODES
|
||||
|
||||
To build, edit the CUDA_CPP, CUDA_ARCH, CUDA_PREC, and CUDA_LINK files for
|
||||
your machine. Type make. Additionally, the GPU package must be installed and
|
||||
compiled for LAMMPS. The library supports 3 precision modes as determined by
|
||||
the CUDA_PREC variable:
|
||||
|
||||
CUDA_PREC = -D_SINGLE_SINGLE # Single precision for all calculations
|
||||
CUDA_PREC = -D_DOUBLE_DOUBLE # Double precision for all calculations
|
||||
CUDA_PREC = -D_SINGLE_DOUBLE # Accumulation of forces, etc. in double
|
||||
|
||||
NOTE: Double precision is only supported on certain GPUS (with
|
||||
compute capability>=1.3).
|
||||
|
||||
NOTE: The gayberne/gpu pair style will only be installed if the ASPHERE
|
||||
package has been installed before installing the GPU package in LAMMPS.
|
||||
|
||||
|
||||
|
||||
GPU MEMORY
|
||||
|
||||
Upon initialization of the pair style, the library will reserve memory for
|
||||
64K atoms per GPU or 70% of each card's GPU memory, whichever value is limiting.
|
||||
The value of 70% can be changed by editing the PERCENT_GPU_MEMORY definition
|
||||
in the source file. The value of 64K cannot be increased and is the maximum
|
||||
number of atoms allowed per GPU. Using the 'neigh_modify one' modifier in
|
||||
your LAMMPS input script can help to increase maximum number of atoms per
|
||||
GPU for cards with limited memory.
|
||||
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,518 @@
|
|||
/***************************************************************************
|
||||
gb_gpu.cu
|
||||
-------------------
|
||||
W. Michael Brown
|
||||
|
||||
Gay-Berne anisotropic potential GPU calcultation
|
||||
|
||||
*** Force decomposition by Atom Version ***
|
||||
|
||||
__________________________________________________________________________
|
||||
This file is part of the LAMMPS GPU Library
|
||||
__________________________________________________________________________
|
||||
|
||||
begin : Tue Jun 23 2009
|
||||
copyright : (C) 2009 by W. Michael Brown
|
||||
email : wmbrown@sandia.gov
|
||||
***************************************************************************/
|
||||
|
||||
/* -----------------------------------------------------------------------
|
||||
Copyright (2009) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
#include <iostream>
|
||||
#include <cassert>
|
||||
#include "nvc_macros.h"
|
||||
#include "nvc_timer.h"
|
||||
#include "nvc_device.h"
|
||||
#include "gb_gpu_memory.h"
|
||||
#include "gb_gpu_kernel.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
// Per-host-thread GPU state (device handle, buffers, timers); indexed by the
// 'thread' argument of the gb_gpu_* entry points below.
static GB_GPU_Memory<PRECISION,ACC_PRECISION> GBMF[MAX_GPU_THREADS];
// Shorthand used inside the templated helpers below.
#define GBMT GB_GPU_Memory<numtyp,acctyp>
|
||||
|
||||
// ---------------------------------------------------------------------------
// Pack neighbors from dev_ij array into dev_nbor matrix for coalesced access
// -- Only pack neighbors matching the specified inclusive range of forms
// -- Only pack neighbors within cutoff
//
// dev_nbor layout (one column per atom, rows separated by nbor_pitch):
//   row 0: atom index i, row 1: original neighbor count, row 2: offset into
//   dev_ij, row 3: packed neighbor count (written here), rows 4+: packed
//   neighbor indices (written here).
// _x_/_form_/_cutsq_ are presumably texture/global fetches declared in
// pair_gpu_texture.h -- confirm against that header.
// ---------------------------------------------------------------------------
template<class numtyp>
__global__ void kernel_pack_nbor(int *dev_nbor, const int nbor_pitch,
                                 const int start, const int inum,
                                 const int *dev_ij, const int form_low,
                                 const int form_high, const int nall) {

  // ii indexes the two interacting particles in gi
  int ii=threadIdx.x+INT_MUL(blockIdx.x,blockDim.x)+start;

  if (ii<inum) {
    int *nbor=dev_nbor+ii;                 // walk down column ii
    int i=*nbor;                           // row 0: atom index
    nbor+=nbor_pitch;
    int numj=*nbor;                        // row 1: unpacked neighbor count
    nbor+=nbor_pitch;
    const int *list=dev_ij+*nbor;          // row 2: offset into dev_ij
    const int *list_end=list+numj;
    nbor+=nbor_pitch;
    int *nbor_newj=nbor;                   // row 3: packed count, filled last
    nbor+=nbor_pitch;

    numtyp ix=_x_<numtyp>(i,0);
    numtyp iy=_x_<numtyp>(i,1);
    numtyp iz=_x_<numtyp>(i,2);
    int itype=_x_<numtyp>(i,7);            // row 7 of dev_x holds the type

    int newj=0;
    for ( ; list<list_end; list++) {
      int j=*list;
      if (j>=nall)                         // fold special-bond encoded index
        j%=nall;
      int jtype=_x_<numtyp>(j,7);

      if (_form_(itype,jtype)>=form_low && _form_(itype,jtype)<=form_high) {
        // Compute r12;
        numtyp rsq=_x_<numtyp>(j,0)-ix;
        rsq*=rsq;
        numtyp t=_x_<numtyp>(j,1)-iy;
        rsq+=t*t;
        t=_x_<numtyp>(j,2)-iz;
        rsq+=t*t;

        if (rsq< _cutsq_<numtyp>(itype,jtype)) {
          *nbor=j;                         // append to packed list
          nbor+=nbor_pitch;
          newj++;
        }
      }
    }
    *nbor_newj=newj;                       // record packed neighbor count
  }
}
|
||||
|
||||
// ---------------------------------------------------------------------------
// Pack neighbors from dev_ij array into dev_nbor matrix for coalesced access
// -- Only pack neighbors matching the specified inclusive range of forms
// -- Only pack neighbors within cutoff
// -- Fast version of routine that uses shared memory for LJ constants
//
// Same dev_nbor layout as kernel_pack_nbor; the per-type form/cutsq tables
// are staged in shared memory (valid when all types fit in MAX_SHARED_TYPES).
// ---------------------------------------------------------------------------
template<class numtyp>
__global__ void kernel_pack_nbor_fast(int *dev_nbor, const int nbor_pitch,
                                      const int start, const int inum,
                                      const int *dev_ij, const int form_low,
                                      const int form_high, const int nall) {

  int ii=threadIdx.x;
  __shared__ int form[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __shared__ numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  // First MAX_SHARED_TYPES^2 threads of the block stage the type tables.
  if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
    int itype=ii/MAX_SHARED_TYPES;
    int jtype=ii%MAX_SHARED_TYPES;
    cutsq[ii]=_cutsq_<numtyp>(itype,jtype);
    form[ii]=_form_(itype,jtype);
  }
  // FIX: barrier added -- every thread below reads form[]/cutsq[] entries
  // written by other threads; without a block-wide sync this is a data race.
  // (Safe: reached unconditionally by all threads of the block.)
  __syncthreads();
  ii+=INT_MUL(blockIdx.x,blockDim.x)+start;

  if (ii<inum) {
    int *nbor=dev_nbor+ii;                 // walk down column ii
    int i=*nbor;                           // row 0: atom index
    nbor+=nbor_pitch;
    int numj=*nbor;                        // row 1: unpacked neighbor count
    nbor+=nbor_pitch;
    const int *list=dev_ij+*nbor;          // row 2: offset into dev_ij
    const int *list_end=list+numj;
    nbor+=nbor_pitch;
    int *nbor_newj=nbor;                   // row 3: packed count, filled last
    nbor+=nbor_pitch;

    numtyp ix=_x_<numtyp>(i,0);
    numtyp iy=_x_<numtyp>(i,1);
    numtyp iz=_x_<numtyp>(i,2);
    // Pre-scale itype so itype+jtype indexes the flattened shared tables.
    int itype=INT_MUL(MAX_SHARED_TYPES,_x_<numtyp>(i,7));

    int newj=0;
    for ( ; list<list_end; list++) {
      int j=*list;
      if (j>=nall)                         // fold special-bond encoded index
        j%=nall;
      int mtype=itype+_x_<numtyp>(j,7);

      if (form[mtype]>=form_low && form[mtype]<=form_high) {
        // Compute r12;
        numtyp rsq=_x_<numtyp>(j,0)-ix;
        rsq*=rsq;
        numtyp t=_x_<numtyp>(j,1)-iy;
        rsq+=t*t;
        t=_x_<numtyp>(j,2)-iz;
        rsq+=t*t;

        if (rsq<cutsq[mtype]) {
          *nbor=j;                         // append to packed list
          nbor+=nbor_pitch;
          newj++;
        }
      }
    }
    *nbor_newj=newj;                       // record packed neighbor count
  }
}
|
||||
|
||||
// Launch the neighbor-packing kernel on gbm's pair stream, choosing the
// shared-memory variant when all types fit in the shared tables.
// GX/BX are the grid/block sizes; [start,inum) is the atom range;
// [form_low,form_high] is the inclusive interaction-form filter.
template<class numtyp, class acctyp>
void pack_nbors(GBMT &gbm, const int GX, const int BX, const int start,
                const int inum, const int form_low, const int form_high) {
  if (gbm.shared_types)
    kernel_pack_nbor_fast<numtyp><<<GX,BX,0,gbm.pair_stream>>>
      (gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(), start, inum,
       gbm.nbor.ij.begin(),form_low,form_high,gbm.atom.nall());
  else
    kernel_pack_nbor<numtyp><<<GX,BX,0,gbm.pair_stream>>>
      (gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(), start, inum,
       gbm.nbor.ij.begin(),form_low,form_high,gbm.atom.nall());
}
|
||||
|
||||
// ---------------------------------------------------------------------------
// Convert something to a string
// ---------------------------------------------------------------------------
#include <sstream>
// Stream-format any value with 2 significant digits and return it as a string.
template <class t>
inline std::string gb_gpu_toa(const t& in) {
  std::ostringstream out;
  out.precision(2);
  out << in;
  return out.str();
}
|
||||
|
||||
// ---------------------------------------------------------------------------
// Return string with GPU info
// ---------------------------------------------------------------------------
// Build a human-readable description of device 'id' (name, core count,
// memory, clock, and the maximum atom count given max_nbors neighbors).
string gb_gpu_name(const int id, const int max_nbors) {
  string name=GBMF[0].gpu.name(id)+", "+
              gb_gpu_toa(GBMF[0].gpu.cores(id))+" cores, "+
              gb_gpu_toa(GBMF[0].gpu.gigabytes(id))+" GB, "+
              gb_gpu_toa(GBMF[0].gpu.clock_rate(id))+" GHZ, "+
              gb_gpu_toa(GBMF[0].get_max_atoms(GBMF[0].gpu.bytes(id),
                                               max_nbors))+" Atoms";
  return name;
}
|
||||
|
||||
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
// Initialize the per-thread GPU state with the Gay-Berne/LJ coefficient
// tables. ij_size is an out-parameter set to the host neighbor buffer size.
// Returns 0 when no CUDA device is available; otherwise forwards the result
// of GB_GPU_Memory::init (see gb_gpu_memory.h for its return contract).
int * gb_gpu_init(int &ij_size, const int ntypes, const double gamma,
                  const double upsilon, const double mu, double **shape,
                  double **well, double **cutsq, double **sigma,
                  double **epsilon, double *host_lshape, int **form,
                  double **host_lj1, double **host_lj2, double **host_lj3,
                  double **host_lj4, double **offset, double *special_lj,
                  const int max_nbors, const int thread, const int gpu_id) {
  assert(thread<MAX_GPU_THREADS);

  // No usable GPU: signal failure to the caller.
  if (GBMF[thread].gpu.num_devices()==0)
    return 0;

  ij_size=IJ_SIZE;
  return GBMF[thread].init(ij_size, ntypes, gamma, upsilon, mu, shape,
                           well, cutsq, sigma, epsilon, host_lshape, form,
                           host_lj1, host_lj2, host_lj3, host_lj4, offset,
                           special_lj, max_nbors, false, gpu_id);
}
|
||||
|
||||
// ---------------------------------------------------------------------------
// Clear memory on host and device
// ---------------------------------------------------------------------------
// Release all host and device allocations owned by this thread's GPU state.
void gb_gpu_clear(const int thread) {
  GBMF[thread].clear();
}
|
||||
|
||||
// ---------------------------------------------------------------------------
// copy atom positions, quaternions, and optionally types to device
// ---------------------------------------------------------------------------
// Stage per-atom data in the write buffer (interleaved per coordinate) and
// start the async copy on 'stream'. host_x/host_quat are assumed to be
// LAMMPS-style contiguous 2D arrays (row 0 pointer spans all atoms) --
// the +1/+2 offsets index y/z within the first atom's row.
template <class numtyp, class acctyp>
inline void _gb_gpu_atom(PairGPUAtom<numtyp,acctyp> &atom, double **host_x,
                         double **host_quat, const int *host_type,
                         const bool rebuild, cudaStream_t &stream) {
  atom.time_atom.start();
  atom.reset_write_buffer();

  // Rows 1-3 of dev_x are position; rows 4-7 are quaternion
  atom.add_atom_data(host_x[0],3);
  atom.add_atom_data(host_x[0]+1,3);
  atom.add_atom_data(host_x[0]+2,3);
  atom.add_atom_data(host_quat[0],4);
  atom.add_atom_data(host_quat[0]+1,4);
  atom.add_atom_data(host_quat[0]+2,4);
  atom.add_atom_data(host_quat[0]+3,4);

  int csize=7;                  // rows staged so far

  // If a rebuild occurred, copy type data (types only change on rebuild)
  if (rebuild) {
    atom.add_atom_data(host_type);
    csize++;
  }

  atom.copy_atom_data(csize,stream);   // async copy; not synchronized here
  atom.time_atom.stop();
}
|
||||
|
||||
// Public entry point: stage atom data for the given host thread's GPU state
// on that thread's pair stream.
void gb_gpu_atom(double **host_x, double **host_quat,
                 const int *host_type, const bool rebuild, const int thread) {
  _gb_gpu_atom(GBMF[thread].atom, host_x, host_quat, host_type, rebuild,
               GBMF[thread].pair_stream);
}
|
||||
|
||||
// ---------------------------------------------------------------------------
// Signal that we need to transfer a new neighbor list
// ---------------------------------------------------------------------------
// Reset the device neighbor list for a rebuilt host list. When multiple
// interaction forms are in use, atoms are reordered so all ellipse-ellipse
// particles come first (gbm.last_ellipse marks the boundary); the reordered
// list is returned so the caller iterates in device order. Otherwise the
// caller's ilist is returned unchanged.
// success is set false (and 0 returned) when nall exceeds device capacity.
// NOTE(review): nlocal is currently unused here.
template <class gbmtyp>
int * _gb_gpu_reset_nbors(gbmtyp &gbm, const int nall, const int nlocal,
                          const int inum, int *ilist, const int *numj,
                          const int *type, bool &success) {
  if (nall>gbm.max_atoms) {
    success=false;
    return 0;
  }
  success=true;

  gbm.nbor.time_nbor.start();

  gbm.atom.nall(nall);
  gbm.atom.inum(inum);

  if (gbm.multiple_forms) {
    int p=0, acc=0;   // p = output slot, acc = running offset into dev_ij
    // First pass: ellipse-ellipse particles.
    for (int i=0; i<inum; i++) {
      int itype=type[ilist[i]];
      if (gbm.host_form[itype][itype]==ELLIPSE_ELLIPSE) {
        gbm.host_olist[p]=ilist[i];
        gbm.nbor.host_ij[p]=numj[ilist[i]];         // neighbor count
        gbm.nbor.host_ij[p+inum]=acc;               // offset into dev_ij
        acc+=numj[ilist[i]];
        p++;
      }
    }
    gbm.last_ellipse=p;
    // Second pass: everything else, appended after the ellipses.
    for (int i=0; i<inum; i++) {
      int itype=type[ilist[i]];
      if (gbm.host_form[itype][itype]!=ELLIPSE_ELLIPSE) {
        gbm.host_olist[p]=ilist[i];
        gbm.nbor.host_ij[p]=numj[ilist[i]];
        gbm.nbor.host_ij[p+inum]=acc;
        acc+=numj[ilist[i]];
        p++;
      }
    }
    gbm.nbor.ij_total=0;
    // Row 0 of dev_nbor: reordered atom indices; rows 1-2: counts/offsets.
    gbm.nbor.dev_nbor.copy_from_host(gbm.host_olist.begin(),inum);
    gbm.nbor.host_ij.copy_to_2Ddevice(gbm.nbor.dev_nbor.begin()+
                                      gbm.nbor.dev_nbor.row_size(),
                                      gbm.nbor.dev_nbor.row_size(),2,inum,
                                      gbm.pair_stream);
  } else {
    // Single form: no reordering needed.
    gbm.nbor.reset(inum,ilist,numj,gbm.pair_stream);
    gbm.last_ellipse=inum;
  }

  gbm.nbor.time_nbor.stop();

  if (gbm.multiple_forms)
    return gbm.host_olist.begin();
  return ilist;
}
|
||||
|
||||
// Public entry point: forward to _gb_gpu_reset_nbors for this thread's state.
// Returns the (possibly reordered) atom list; see _gb_gpu_reset_nbors.
int * gb_gpu_reset_nbors(const int nall, const int nlocal, const int inum,
                         int *ilist, const int *numj, const int *type,
                         const int thread, bool &success) {
  return _gb_gpu_reset_nbors(GBMF[thread],nall,nlocal,inum,ilist,numj,type,
                             success);
}
|
||||
|
||||
// ---------------------------------------------------------------------------
// Copy a set of ij_size ij interactions to device and compute energies,
// forces, and torques for those interactions
// ---------------------------------------------------------------------------
// Queue an async copy of num_ij neighbor entries to the device.
// NOTE(review): eflag is currently unused in this routine.
template <class gbmtyp>
void _gb_gpu_nbors(gbmtyp &gbm, const int num_ij, const bool eflag) {
  // Fold the previous interval into the running total before restarting.
  gbm.nbor.time_nbor.add_to_total();
  // CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream)); // Not if timed

  gbm.nbor.time_nbor.start();
  gbm.nbor.add(num_ij,gbm.pair_stream);
  gbm.nbor.time_nbor.stop();
}
|
||||
|
||||
// Public entry point: queue num_ij neighbor entries for this thread's state.
void gb_gpu_nbors(const int num_ij, const bool eflag, const int thread) {
  _gb_gpu_nbors(GBMF[thread],num_ij,eflag);
}
|
||||
|
||||
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques for all ij interactions
// ---------------------------------------------------------------------------
// Launch the force kernels on gbm.pair_stream. With multiple interaction
// forms, atoms are already ordered ellipse-first (see _gb_gpu_reset_nbors):
// the Gay-Berne kernel handles [0,last_ellipse), the sphere-GB kernel
// handles the sphere range against ellipses, and an LJ kernel handles the
// remaining sphere-sphere work. Every timer is started/stopped on every
// path (including empty start/stop pairs) so later add_to_total() calls in
// _gb_gpu_forces always see a completed interval.
// NOTE(review): 'rebuild' is currently unused in this routine.
template <class numtyp, class acctyp>
void _gb_gpu_gayberne(GBMT &gbm, const bool eflag, const bool vflag,
                      const bool rebuild) {
  // Compute the block size and grid size to keep all cores busy
  const int BX=BLOCK_1D;

  int GX=static_cast<int>(ceil(static_cast<double>(gbm.atom.inum())/BX));

  if (gbm.multiple_forms) {
    gbm.time_kernel.start();
    if (gbm.last_ellipse>0) {
      // ------------ ELLIPSE_ELLIPSE and ELLIPSE_SPHERE ---------------
      GX=static_cast<int>(ceil(static_cast<double>(gbm.last_ellipse)/
                               static_cast<double>(BX)));
      pack_nbors(gbm,GX,BX, 0, gbm.last_ellipse,SPHERE_ELLIPSE,ELLIPSE_ELLIPSE);
      gbm.time_kernel.stop();

      gbm.time_gayberne.start();
      kernel_gayberne<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
        (gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(),
         gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(),
         gbm.atom.ans.begin(), gbm.atom.ans.row_size(),gbm.dev_error.begin(),
         eflag, vflag, gbm.last_ellipse, gbm.atom.nall());
      gbm.time_gayberne.stop();

      // All atoms are ellipses: run the remaining timers empty and return.
      if (gbm.last_ellipse==gbm.atom.inum()) {
        gbm.time_kernel2.start();
        gbm.time_kernel2.stop();
        gbm.time_gayberne2.start();
        gbm.time_gayberne2.stop();
        gbm.time_pair.start();
        gbm.time_pair.stop();
        return;
      }

      // ------------ SPHERE_ELLIPSE ---------------

      gbm.time_kernel2.start();
      GX=static_cast<int>(ceil(static_cast<double>(gbm.atom.inum()-
                                                   gbm.last_ellipse)/BX));
      pack_nbors(gbm,GX,BX,gbm.last_ellipse,gbm.atom.inum(),ELLIPSE_SPHERE,
                 ELLIPSE_SPHERE);
      gbm.time_kernel2.stop();

      gbm.time_gayberne2.start();
      kernel_sphere_gb<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
        (gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(),
         gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(),
         gbm.atom.ans.begin(), gbm.atom.ans.row_size(),gbm.dev_error.begin(),
         eflag, vflag, gbm.last_ellipse, gbm.atom.inum(), gbm.atom.nall());
      gbm.time_gayberne2.stop();
    } else {
      // No ellipses at all: zero the answers and tick the timers.
      gbm.atom.ans.zero();
      gbm.time_kernel.stop();
      gbm.time_gayberne.start();
      gbm.time_gayberne.stop();
      gbm.time_kernel2.start();
      gbm.time_kernel2.stop();
      gbm.time_gayberne2.start();
      gbm.time_gayberne2.stop();
    }

    // ------------ LJ ---------------
    gbm.time_pair.start();
    if (gbm.last_ellipse<gbm.atom.inum()) {
      if (gbm.shared_types)
        kernel_lj_fast<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
          (gbm.special_lj.begin(), gbm.nbor.dev_nbor.begin(),
           gbm.nbor.ij.begin(), gbm.nbor.dev_nbor.row_size(),
           gbm.atom.ans.begin(), gbm.atom.ans.row_size(),
           gbm.dev_error.begin(), eflag, vflag, gbm.last_ellipse,
           gbm.atom.inum(), gbm.atom.nall());
      else
        kernel_lj<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
          (gbm.special_lj.begin(), gbm.nbor.dev_nbor.begin(),
           gbm.nbor.ij.begin(), gbm.nbor.dev_nbor.row_size(),
           gbm.atom.ans.begin(), gbm.atom.ans.row_size(),gbm.dev_error.begin(),
           eflag, vflag, gbm.last_ellipse, gbm.atom.inum(), gbm.atom.nall());
    }
    gbm.time_pair.stop();
  } else {
    // Single interaction form: one pack + one Gay-Berne launch over all atoms.
    gbm.time_kernel.start();
    pack_nbors(gbm, GX, BX, 0, gbm.atom.inum(),SPHERE_SPHERE,ELLIPSE_ELLIPSE);
    gbm.time_kernel.stop();

    gbm.time_gayberne.start();
    kernel_gayberne<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
      (gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(),
       gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(),
       gbm.atom.ans.begin(), gbm.atom.ans.row_size(), gbm.dev_error.begin(),
       eflag, vflag, gbm.atom.inum(), gbm.atom.nall());
    gbm.time_gayberne.stop();
  }
}
|
||||
|
||||
// Public entry point: run the force calculation for this thread's state at
// the library's compiled precision.
void gb_gpu_gayberne(const bool eflag, const bool vflag, const bool rebuild,
                     const int thread) {
  _gb_gpu_gayberne<PRECISION,ACC_PRECISION>(GBMF[thread],eflag,vflag,rebuild);
}
|
||||
|
||||
// ---------------------------------------------------------------------------
// Get energies, forces, and torques to host
// ---------------------------------------------------------------------------
// Start the async answer copy, fold all per-step timers into their totals,
// then accumulate energy/virial/forces/torques into the caller's arrays.
// Returns the van der Waals energy. Torques are only accumulated for the
// ellipse range [0,last_ellipse).
template<class numtyp, class acctyp>
double _gb_gpu_forces(GBMT &gbm, double **f, double **tor, const int *ilist,
                      const bool eflag, const bool vflag, const bool eflag_atom,
                      const bool vflag_atom, double *eatom, double **vatom,
                      double *virial) {
  double evdw;

  gbm.atom.time_answer.start();
  gbm.atom.copy_answers(eflag,vflag,gbm.pair_stream);

  // Fold this step's timing intervals into the running totals.
  gbm.atom.time_atom.add_to_total();
  gbm.nbor.time_nbor.add_to_total();
  gbm.time_kernel.add_to_total();
  gbm.time_gayberne.add_to_total();
  if (gbm.multiple_forms) {
    gbm.time_kernel2.add_to_total();
    gbm.time_gayberne2.add_to_total();
    gbm.time_pair.add_to_total();
  }
  // CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream)); // Not if timed

  evdw=gbm.atom.energy_virial(ilist,eflag_atom,vflag_atom,eatom,vatom,virial);
  gbm.atom.add_forces(ilist,f);
  gbm.atom.add_torques(ilist,tor,gbm.last_ellipse);
  gbm.atom.time_answer.stop();
  gbm.atom.time_answer.add_to_total();
  return evdw;
}
|
||||
|
||||
// Public entry point: retrieve results for this thread's state at the
// library's compiled precision. Returns the van der Waals energy.
double gb_gpu_forces(double **f, double **tor, const int *ilist,
                     const bool eflag, const bool vflag, const bool eflag_atom,
                     const bool vflag_atom, double *eatom, double **vatom,
                     double *virial, const int thread) {
  return _gb_gpu_forces<PRECISION,ACC_PRECISION>
    (GBMF[thread],f,tor,ilist,eflag,vflag,eflag_atom,
     vflag_atom,eatom,vatom,virial);
}
|
||||
|
||||
// Print the accumulated timing breakdown for thread i's GPU state to stdout.
void gb_gpu_time(const int i) {
  cout.precision(4);
  cout << "Atom copy: " << GBMF[i].atom.time_atom.total_seconds()
       << " s.\n"
       << "Neighbor copy: " << GBMF[i].nbor.time_nbor.total_seconds()
       << " s.\n"
       << "Neighbor pack: " << GBMF[i].time_kernel.total_seconds()+
          GBMF[i].time_kernel2.total_seconds() << " s.\n"
       << "Force calc: " << GBMF[i].time_gayberne.total_seconds()+
          GBMF[i].time_gayberne2.total_seconds()<< " s.\n";
  // The LJ kernel only runs when multiple interaction forms are in use.
  if (GBMF[i].multiple_forms)
    cout << "LJ calc: " << GBMF[i].time_pair.total_seconds() << " s.\n";
  cout << "Answer copy: " << GBMF[i].atom.time_answer.total_seconds()
       << " s.\n";
}
|
||||
|
||||
// Number of CUDA devices visible to this library.
int gb_gpu_num_devices() {
  return GBMF[0].gpu.num_devices();
}
|
||||
|
||||
// Host memory (bytes) currently used by the library's GPU state.
double gb_gpu_bytes() {
  return GBMF[0].host_memory_usage();
}
|
|
@ -0,0 +1,337 @@
|
|||
/***************************************************************************
|
||||
gb_gpu_extra.h
|
||||
-------------------
|
||||
W. Michael Brown
|
||||
|
||||
Inline GPU kernel routines ala math_extra for the CPU.
|
||||
|
||||
__________________________________________________________________________
|
||||
This file is part of the LAMMPS GPU Library
|
||||
__________________________________________________________________________
|
||||
|
||||
begin : Tue Jun 23 2009
|
||||
copyright : (C) 2009 by W. Michael Brown
|
||||
email : wmbrown@sandia.gov
|
||||
***************************************************************************/
|
||||
|
||||
/* -----------------------------------------------------------------------
|
||||
Copyright (2009) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
#ifndef GB_GPU_EXTRA_H
|
||||
#define GB_GPU_EXTRA_H
|
||||
|
||||
#include "math.h"
|
||||
#include "stdio.h"
|
||||
#include "string.h"
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Atomic update of global memory
|
||||
------------------------------------------------------------------------- */
|
||||
/*
|
||||
template <class numtyp> __device__
|
||||
inline void atomicAdd(numtyp *address, numtyp val);
|
||||
|
||||
template <>
|
||||
__device__ inline void atomicAdd<float>(float *address, float val)
|
||||
{
|
||||
int i_val = __float_as_int(val);
|
||||
int tmp0 = 0;
|
||||
int tmp1;
|
||||
|
||||
while( (tmp1 = atomicCAS((int *)address, tmp0, i_val)) != tmp0) {
|
||||
tmp0 = tmp1;
|
||||
i_val = __float_as_int(val + __int_as_float(tmp1));
|
||||
}
|
||||
}*/
|
||||
|
||||
/* ----------------------------------------------------------------------
   dot product of 2 vectors
------------------------------------------------------------------------- */

template <class numtyp>
static __inline__ __device__ numtyp gpu_dot3(const numtyp *v1, const numtyp *v2)
{
  // Accumulate term by term; identical floating-point association order
  // to (v1[0]*v2[0]+v1[1]*v2[1])+v1[2]*v2[2].
  numtyp sum=v1[0]*v2[0];
  sum+=v1[1]*v2[1];
  sum+=v1[2]*v2[2];
  return sum;
}
|
||||
|
||||
/* ----------------------------------------------------------------------
   cross product of 2 vectors
   ans = v1 x v2; ans must not alias v1 or v2 for a conventional cross
   product (components are written in order 0,1,2).
------------------------------------------------------------------------- */

template <class numtyp>
static __inline__ __device__ void gpu_cross3(const numtyp *v1,
                                             const numtyp *v2, numtyp *ans)
{
  ans[0] = v1[1]*v2[2]-v1[2]*v2[1];
  ans[1] = v1[2]*v2[0]-v1[0]*v2[2];
  ans[2] = v1[0]*v2[1]-v1[1]*v2[0];
}
|
||||
|
||||
/* ----------------------------------------------------------------------
   determinant of a matrix
   m is a 3x3 matrix in row-major order; expansion kept in this exact
   form so the floating-point evaluation order is unchanged.
------------------------------------------------------------------------- */

template <class numtyp>
static __inline__ __device__ numtyp gpu_det3(const numtyp m[9])
{
  numtyp ans = m[0]*m[4]*m[8] - m[0]*m[5]*m[7] -
               m[3]*m[1]*m[8] + m[3]*m[2]*m[7] +
               m[6]*m[1]*m[5] - m[6]*m[2]*m[4];
  return ans;
}
|
||||
|
||||
/* ----------------------------------------------------------------------
   diagonal matrix times a full matrix
   Row r of ans is row r of m scaled by the r-th well parameter of
   particle i; elements written in order 0..8 as before.
------------------------------------------------------------------------- */

template <class numtyp>
static __inline__ __device__ void gpu_well_times3(const int i, const numtyp m[9],
                                                  numtyp ans[9])
{
  for (int r=0; r<3; r++) {
    const numtyp w=_well_<numtyp>(i,r);
    ans[3*r]  =w*m[3*r];
    ans[3*r+1]=w*m[3*r+1];
    ans[3*r+2]=w*m[3*r+2];
  }
}
|
||||
|
||||
/* ----------------------------------------------------------------------
   diagonal matrix times a full matrix
   Row r of ans is row r of m scaled by the r-th shape parameter of
   particle i; elements written in order 0..8 as before.
------------------------------------------------------------------------- */

template <class numtyp>
static __inline__ __device__ void gpu_shape_times3(const int i, const numtyp m[9],
                                                   numtyp ans[9])
{
  for (int r=0; r<3; r++) {
    const numtyp s=_shape_<numtyp>(i,r);
    ans[3*r]  =s*m[3*r];
    ans[3*r+1]=s*m[3*r+1];
    ans[3*r+2]=s*m[3*r+2];
  }
}
|
||||
|
||||
/* ----------------------------------------------------------------------
   add two matrices
   Element-wise sum of two row-major 3x3 matrices, written in order 0..8.
------------------------------------------------------------------------- */

template <class numtyp>
static __inline__ __device__ void gpu_plus3(const numtyp m[9],
                                            const numtyp m2[9], numtyp ans[9])
{
  for (int k=0; k<9; k++)
    ans[k] = m[k]+m2[k];
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
multiply the transpose of mat1 times mat2
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
template <class numtyp>
|
||||
static __inline__ __device__ void gpu_transpose_times3(const numtyp m[9],
|
||||
const numtyp m2[9],
|
||||
numtyp ans[9])
|
||||
{
|
||||
ans[0] = m[0]*m2[0]+m[3]*m2[3]+m[6]*m2[6];
|
||||
ans[1] = m[0]*m2[1]+m[3]*m2[4]+m[6]*m2[7];
|
||||
ans[2] = m[0]*m2[2]+m[3]*m2[5]+m[6]*m2[8];
|
||||
ans[3] = m[1]*m2[0]+m[4]*m2[3]+m[7]*m2[6];
|
||||
ans[4] = m[1]*m2[1]+m[4]*m2[4]+m[7]*m2[7];
|
||||
ans[5] = m[1]*m2[2]+m[4]*m2[5]+m[7]*m2[8];
|
||||
ans[6] = m[2]*m2[0]+m[5]*m2[3]+m[8]*m2[6];
|
||||
ans[7] = m[2]*m2[1]+m[5]*m2[4]+m[8]*m2[7];
|
||||
ans[8] = m[2]*m2[2]+m[5]*m2[5]+m[8]*m2[8];
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
row vector times matrix
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
template <class numtyp>
|
||||
static __inline__ __device__ void gpu_row_times3(const numtyp *v,
|
||||
const numtyp m[9], numtyp *ans)
|
||||
{
|
||||
ans[0] = m[0]*v[0]+v[1]*m[3]+v[2]*m[6];
|
||||
ans[1] = v[0]*m[1]+m[4]*v[1]+v[2]*m[7];
|
||||
ans[2] = v[0]*m[2]+v[1]*m[5]+m[8]*v[2];
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
solve Ax = b or M ans = v
|
||||
use gaussian elimination & partial pivoting on matrix
|
||||
error_flag set to 2 if bad matrix inversion attempted
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
template <class numtyp>
|
||||
static __inline__ __device__ void gpu_mldivide3(const numtyp m[9],
|
||||
const numtyp *v, numtyp *ans,
|
||||
int *error_flag)
|
||||
{
|
||||
// create augmented matrix for pivoting
|
||||
|
||||
numtyp aug[12], t;
|
||||
|
||||
aug[3] = v[0];
|
||||
aug[0] = m[0];
|
||||
aug[1] = m[1];
|
||||
aug[2] = m[2];
|
||||
aug[7] = v[1];
|
||||
aug[4] = m[3];
|
||||
aug[5] = m[4];
|
||||
aug[6] = m[5];
|
||||
aug[11] = v[2];
|
||||
aug[8] = m[6];
|
||||
aug[9] = m[7];
|
||||
aug[10] = m[8];
|
||||
|
||||
if (fabs(aug[4]) > fabs(aug[0])) {
|
||||
numtyp swapt;
|
||||
swapt=aug[0]; aug[0]=aug[4]; aug[4]=swapt;
|
||||
swapt=aug[1]; aug[1]=aug[5]; aug[5]=swapt;
|
||||
swapt=aug[2]; aug[2]=aug[6]; aug[6]=swapt;
|
||||
swapt=aug[3]; aug[3]=aug[7]; aug[7]=swapt;
|
||||
}
|
||||
if (fabs(aug[8]) > fabs(aug[0])) {
|
||||
numtyp swapt;
|
||||
swapt=aug[0]; aug[0]=aug[8]; aug[8]=swapt;
|
||||
swapt=aug[1]; aug[1]=aug[9]; aug[9]=swapt;
|
||||
swapt=aug[2]; aug[2]=aug[10]; aug[10]=swapt;
|
||||
swapt=aug[3]; aug[3]=aug[11]; aug[11]=swapt;
|
||||
}
|
||||
|
||||
if (aug[0] != (numtyp)0.0) {
|
||||
if (0!=0) {
|
||||
numtyp swapt;
|
||||
swapt=aug[0]; aug[0]=aug[0]; aug[0]=swapt;
|
||||
swapt=aug[1]; aug[1]=aug[1]; aug[1]=swapt;
|
||||
swapt=aug[2]; aug[2]=aug[2]; aug[2]=swapt;
|
||||
swapt=aug[3]; aug[3]=aug[3]; aug[3]=swapt;
|
||||
}
|
||||
} else if (aug[4] != (numtyp)0.0) {
|
||||
if (1!=0) {
|
||||
numtyp swapt;
|
||||
swapt=aug[0]; aug[0]=aug[4]; aug[4]=swapt;
|
||||
swapt=aug[1]; aug[1]=aug[5]; aug[5]=swapt;
|
||||
swapt=aug[2]; aug[2]=aug[6]; aug[6]=swapt;
|
||||
swapt=aug[3]; aug[3]=aug[7]; aug[7]=swapt;
|
||||
}
|
||||
} else if (aug[8] != (numtyp)0.0) {
|
||||
if (2!=0) {
|
||||
numtyp swapt;
|
||||
swapt=aug[0]; aug[0]=aug[8]; aug[8]=swapt;
|
||||
swapt=aug[1]; aug[1]=aug[9]; aug[9]=swapt;
|
||||
swapt=aug[2]; aug[2]=aug[10]; aug[10]=swapt;
|
||||
swapt=aug[3]; aug[3]=aug[11]; aug[11]=swapt;
|
||||
}
|
||||
} else
|
||||
*error_flag=2;
|
||||
|
||||
t = aug[4]/aug[0];
|
||||
aug[5]-=t*aug[1];
|
||||
aug[6]-=t*aug[2];
|
||||
aug[7]-=t*aug[3];
|
||||
t = aug[8]/aug[0];
|
||||
aug[9]-=t*aug[1];
|
||||
aug[10]-=t*aug[2];
|
||||
aug[11]-=t*aug[3];
|
||||
|
||||
if (fabs(aug[9]) > fabs(aug[5])) {
|
||||
numtyp swapt;
|
||||
swapt=aug[4]; aug[4]=aug[8]; aug[8]=swapt;
|
||||
swapt=aug[5]; aug[5]=aug[9]; aug[9]=swapt;
|
||||
swapt=aug[6]; aug[6]=aug[10]; aug[10]=swapt;
|
||||
swapt=aug[7]; aug[7]=aug[11]; aug[11]=swapt;
|
||||
}
|
||||
|
||||
if (aug[5] != (numtyp)0.0) {
|
||||
if (1!=1) {
|
||||
numtyp swapt;
|
||||
swapt=aug[4]; aug[4]=aug[4]; aug[4]=swapt;
|
||||
swapt=aug[5]; aug[5]=aug[5]; aug[5]=swapt;
|
||||
swapt=aug[6]; aug[6]=aug[6]; aug[6]=swapt;
|
||||
swapt=aug[7]; aug[7]=aug[7]; aug[7]=swapt;
|
||||
}
|
||||
} else if (aug[9] != (numtyp)0.0) {
|
||||
if (2!=1) {
|
||||
numtyp swapt;
|
||||
swapt=aug[4]; aug[4]=aug[8]; aug[8]=swapt;
|
||||
swapt=aug[5]; aug[5]=aug[9]; aug[9]=swapt;
|
||||
swapt=aug[6]; aug[6]=aug[10]; aug[10]=swapt;
|
||||
swapt=aug[7]; aug[7]=aug[11]; aug[11]=swapt;
|
||||
}
|
||||
}
|
||||
|
||||
t = aug[9]/aug[5];
|
||||
aug[10]-=t*aug[6];
|
||||
aug[11]-=t*aug[7];
|
||||
|
||||
if (aug[10] == (numtyp)0.0)
|
||||
*error_flag=2;
|
||||
|
||||
ans[2] = aug[11]/aug[10];
|
||||
t = (numtyp)0.0;
|
||||
t += aug[6]*ans[2];
|
||||
ans[1] = (aug[7]-t) / aug[5];
|
||||
t = (numtyp)0.0;
|
||||
t += aug[1]*ans[1];
|
||||
t += aug[2]*ans[2];
|
||||
ans[0] = (aug[3]-t) / aug[0];
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
compute rotation matrix from quaternion conjugate
|
||||
quat = [w i j k]
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
template <class numtyp>
|
||||
static __inline__ __device__ void gpu_quat_to_mat_trans(const int qi,
|
||||
numtyp mat[9])
|
||||
{
|
||||
numtyp qi3=_x_<numtyp>(qi,3);
|
||||
numtyp qi4=_x_<numtyp>(qi,4);
|
||||
numtyp qi5=_x_<numtyp>(qi,5);
|
||||
numtyp qi6=_x_<numtyp>(qi,6);
|
||||
|
||||
numtyp w2 = qi3*qi3;
|
||||
numtyp i2 = qi4*qi4;
|
||||
numtyp j2 = qi5*qi5;
|
||||
numtyp k2 = qi6*qi6;
|
||||
numtyp twoij = 2.0*qi4*qi5;
|
||||
numtyp twoik = 2.0*qi4*qi6;
|
||||
numtyp twojk = 2.0*qi5*qi6;
|
||||
numtyp twoiw = 2.0*qi4*qi3;
|
||||
numtyp twojw = 2.0*qi5*qi3;
|
||||
numtyp twokw = 2.0*qi6*qi3;
|
||||
|
||||
mat[0] = w2+i2-j2-k2;
|
||||
mat[3] = twoij-twokw;
|
||||
mat[6] = twojw+twoik;
|
||||
|
||||
mat[1] = twoij+twokw;
|
||||
mat[4] = w2-i2+j2-k2;
|
||||
mat[7] = twojk-twoiw;
|
||||
|
||||
mat[2] = twoik-twojw;
|
||||
mat[5] = twojk+twoiw;
|
||||
mat[8] = w2-i2-j2+k2;
|
||||
}
|
||||
|
||||
#endif
|
|
@ -0,0 +1,861 @@
|
|||
/***************************************************************************
|
||||
gb_gpu_kernel.cu
|
||||
-------------------
|
||||
W. Michael Brown
|
||||
|
||||
Routines that actually perform the force/torque computation
|
||||
|
||||
*** Force Decomposition by Atom Version ***
|
||||
__________________________________________________________________________
|
||||
This file is part of the LAMMPS GPU Library
|
||||
__________________________________________________________________________
|
||||
|
||||
begin : Tue Jun 23 2009
|
||||
copyright : (C) 2009 by W. Michael Brown
|
||||
email : wmbrown@sandia.gov
|
||||
***************************************************************************/
|
||||
|
||||
/* -----------------------------------------------------------------------
|
||||
Copyright (2009) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
#ifndef GB_GPU_KERNEL
|
||||
#define GB_GPU_KERNEL
|
||||
|
||||
#include "gb_gpu_extra.h"
|
||||
|
||||
// Torque terms arising from the derivative of the Gay-Berne eta factor.
// Each row k of ans is shape_k(i) times a cofactor-style polynomial in the
// entries of m and m2, divided by det(m).  m is modified only through reads;
// ans receives the 3x3 result row-major.
// NOTE(review): presumably mirrors the CPU pair_gayberne compute_eta_torque;
// the exact analytic form is taken on faith from the expressions below --
// verify against the CPU implementation before editing any term.
template <class numtyp>
static __inline__ __device__ void compute_eta_torque(numtyp m[9],
                                                     numtyp m2[9],
                                                     const int i,
                                                     numtyp ans[9])
{
  // den = 1/det(m): the six terms are the same as gpu_det3, in a
  // different (but algebraically equal) order.
  numtyp den = m[3]*m[2]*m[7]-m[0]*m[5]*m[7]-
    m[2]*m[6]*m[4]+m[1]*m[6]*m[5]-
    m[3]*m[1]*m[8]+m[0]*m[4]*m[8];
  den = (numtyp)1.0/den;

  // Per-axis shape parameters of particle type i scale each result row.
  numtyp shapex=_shape_<numtyp>(i,0);
  numtyp shapey=_shape_<numtyp>(i,1);
  numtyp shapez=_shape_<numtyp>(i,2);

  // Row 0: scaled by shapex, uses m2 columns 0..2.
  ans[0] = shapex*(m[5]*m[1]*m2[2]+(numtyp)2.0*m[4]*m[8]*m2[0]-
                   m[4]*m2[2]*m[2]-(numtyp)2.0*m[5]*m2[0]*m[7]+
                   m2[1]*m[2]*m[7]-m2[1]*m[1]*m[8]-
                   m[3]*m[8]*m2[1]+m[6]*m[5]*m2[1]+
                   m[3]*m2[2]*m[7]-m2[2]*m[6]*m[4])*den;

  ans[1] = shapex*(m[2]*m2[0]*m[7]-m[8]*m2[0]*m[1]+
                   (numtyp)2.0*m[0]*m[8]*m2[1]-m[0]*m2[2]*m[5]-
                   (numtyp)2.0*m[6]*m[2]*m2[1]+m2[2]*m[3]*m[2]-
                   m[8]*m[3]*m2[0]+m[6]*m2[0]*m[5]+
                   m[6]*m2[2]*m[1]-m2[2]*m[0]*m[7])*den;

  ans[2] = shapex*(m[1]*m[5]*m2[0]-m[2]*m2[0]*m[4]-
                   m[0]*m[5]*m2[1]+m[3]*m[2]*m2[1]-
                   m2[1]*m[0]*m[7]-m[6]*m[4]*m2[0]+
                   (numtyp)2.0*m[4]*m[0]*m2[2]-(numtyp)2.0*m[3]*m2[2]*m[1]+
                   m[3]*m[7]*m2[0]+m[6]*m2[1]*m[1])*den;

  // Row 1: scaled by shapey, uses m2 columns 3..5.
  ans[3] = shapey*(-m[4]*m2[5]*m[2]+(numtyp)2.0*m[4]*m[8]*m2[3]+
                   m[5]*m[1]*m2[5]-(numtyp)2.0*m[5]*m2[3]*m[7]+
                   m2[4]*m[2]*m[7]-m2[4]*m[1]*m[8]-
                   m[3]*m[8]*m2[4]+m[6]*m[5]*m2[4]-
                   m2[5]*m[6]*m[4]+m[3]*m2[5]*m[7])*den;

  ans[4] = shapey*(m[2]*m2[3]*m[7]-m[1]*m[8]*m2[3]+
                   (numtyp)2.0*m[8]*m[0]*m2[4]-m2[5]*m[0]*m[5]-
                   (numtyp)2.0*m[6]*m2[4]*m[2]-m[3]*m[8]*m2[3]+
                   m[6]*m[5]*m2[3]+m[3]*m2[5]*m[2]-
                   m[0]*m2[5]*m[7]+m2[5]*m[1]*m[6])*den;

  ans[5] = shapey*(m[1]*m[5]*m2[3]-m[2]*m2[3]*m[4]-
                   m[0]*m[5]*m2[4]+m[3]*m[2]*m2[4]+
                   (numtyp)2.0*m[4]*m[0]*m2[5]-m[0]*m2[4]*m[7]+
                   m[1]*m[6]*m2[4]-m2[3]*m[6]*m[4]-
                   (numtyp)2.0*m[3]*m[1]*m2[5]+m[3]*m2[3]*m[7])*den;

  // Row 2: scaled by shapez, uses m2 columns 6..8.
  ans[6] = shapez*(-m[4]*m[2]*m2[8]+m[1]*m[5]*m2[8]+
                   (numtyp)2.0*m[4]*m2[6]*m[8]-m[1]*m2[7]*m[8]+
                   m[2]*m[7]*m2[7]-(numtyp)2.0*m2[6]*m[7]*m[5]-
                   m[3]*m2[7]*m[8]+m[5]*m[6]*m2[7]-
                   m[4]*m[6]*m2[8]+m[7]*m[3]*m2[8])*den;

  // Note the leading unary minus applied to the whole parenthesized sum.
  ans[7] = shapez*-(m[1]*m[8]*m2[6]-m[2]*m2[6]*m[7]-
                   (numtyp)2.0*m2[7]*m[0]*m[8]+m[5]*m2[8]*m[0]+
                   (numtyp)2.0*m2[7]*m[2]*m[6]+m[3]*m2[6]*m[8]-
                   m[3]*m[2]*m2[8]-m[5]*m[6]*m2[6]+
                   m[0]*m2[8]*m[7]-m2[8]*m[1]*m[6])*den;

  ans[8] = shapez*(m[1]*m[5]*m2[6]-m[2]*m2[6]*m[4]-
                   m[0]*m[5]*m2[7]+m[3]*m[2]*m2[7]-
                   m[4]*m[6]*m2[6]-m[7]*m2[7]*m[0]+
                   (numtyp)2.0*m[4]*m2[8]*m[0]+m[7]*m[3]*m2[6]+
                   m[6]*m[1]*m2[7]-(numtyp)2.0*m2[8]*m[3]*m[1])*den;
}
|
||||
|
||||
// Gay-Berne ellipsoid-ellipsoid kernel: one thread computes the full
// energy, force, torque, and virial contribution for one particle i by
// looping over its packed neighbor list (force decomposition by atom).
//   gum        - global Gay-Berne constants; gum[0..2] used here
//                (presumably gamma/upsilon/mu -- TODO confirm ordering)
//   special_lj - 4 special-bond LJ scale factors (staged in shared memory)
//   dev_nbor   - neighbor data: columns per particle, rows nbor_pitch apart
//   ans        - per-particle output written as
//                [energy?][virial x6?][fx fy fz][torx tory torz],
//                each field ans_pitch apart (energy/virial only when
//                eflag/vflag are set)
//   err_flag   - set to 2 by gpu_mldivide3 on a singular matrix
template<class numtyp, class acctyp>
__global__ void kernel_gayberne(const numtyp *gum, const numtyp *special_lj,
                                const int *dev_nbor, const int nbor_pitch,
                                acctyp *ans, size_t ans_pitch, int *err_flag,
                                const bool eflag, const bool vflag,
                                const int inum, const int nall) {

  __shared__ numtyp sp_lj[4];

  // ii indexes the two interacting particles in gi
  int ii=threadIdx.x;
  if (ii<4)
    sp_lj[ii]=special_lj[ii];
  // NOTE(review): sp_lj is read by all threads below but no __syncthreads()
  // follows the store -- presumably safe for the targeted launch
  // configuration; verify.
  ii+=INT_MUL(blockIdx.x,blockDim.x);

  if (ii<inum) {

    // Per-thread accumulators for this particle's totals.
    acctyp energy=(numtyp)0;
    acctyp fx=(numtyp)0;
    acctyp fy=(numtyp)0;
    acctyp fz=(numtyp)0;
    acctyp torx=(numtyp)0;
    acctyp tory=(numtyp)0;
    acctyp torz=(numtyp)0;
    acctyp virial[6];
    for (int i=0; i<6; i++)
      virial[i]=(numtyp)0;

    // Walk the neighbor-list header: atom index, (two skipped rows),
    // neighbor count, then the neighbor entries themselves.
    const int *nbor=dev_nbor+ii;
    int i=*nbor;
    nbor+=nbor_pitch;
    nbor+=nbor_pitch;
    nbor+=nbor_pitch;
    int numj=*nbor;
    nbor+=nbor_pitch;
    const int *nbor_end=nbor+INT_MUL(nbor_pitch,numj);

    numtyp ix=_x_<numtyp>(i,0);
    numtyp iy=_x_<numtyp>(i,1);
    numtyp iz=_x_<numtyp>(i,2);
    int itype=_x_<numtyp>(i,7);   // type stored in a numtyp field; truncated to int

    // a1 = rotation matrix (transpose form) for i; g1 = a1^T*S*a1;
    // b1 = a1^T*W*a1 (S, W = per-type shape/well diagonal matrices).
    numtyp a1[9], b1[9], g1[9];
    {
      numtyp t[9];
      gpu_quat_to_mat_trans(i,a1);
      gpu_shape_times3(itype,a1,t);
      gpu_transpose_times3(a1,t,g1);
      gpu_well_times3(itype,a1,t);
      gpu_transpose_times3(a1,t,b1);
    }

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=nbor_pitch) {

      int j=*nbor;
      // Special-bond neighbors are encoded as j + n*nall; decode the
      // scale factor and the true index.
      if (j < nall)
        factor_lj = 1.0;
      else {
        factor_lj = sp_lj[j/nall];
        j %= nall;
      }
      int jtype=_x_<numtyp>(j,7);

      // Compute r12
      numtyp r12[3];
      r12[0] = _x_<numtyp>(j,0)-ix;
      r12[1] = _x_<numtyp>(j,1)-iy;
      r12[2] = _x_<numtyp>(j,2)-iz;
      numtyp ir = gpu_dot3(r12,r12);   // |r12|^2 here ...

      ir = rsqrt(ir);                  // ... now 1/|r12|
      numtyp r = (numtyp)1.0/ir;

      numtyp a2[9];
      gpu_quat_to_mat_trans(j,a2);

      numtyp u_r, dUr[3], tUr[3], eta, teta[3];
      { // Compute U_r, dUr, eta, and teta
        // Compute g12
        numtyp g12[9];
        {
          numtyp g2[9];
          {
            gpu_shape_times3(jtype,a2,g12);
            gpu_transpose_times3(a2,g12,g2);
            gpu_plus3(g1,g2,g12);      // g12 = g1 + a2^T*S_j*a2
          }

          { // Compute U_r and dUr

            // Compute kappa
            numtyp kappa[3];
            gpu_mldivide3(g12,r12,kappa,err_flag);

            // -- replace r12 with r12 hat
            r12[0]*=ir;
            r12[1]*=ir;
            r12[2]*=ir;

            // -- kappa is now / r
            kappa[0]*=ir;
            kappa[1]*=ir;
            kappa[2]*=ir;

            // energy

            // compute u_r and dUr
            numtyp uslj_rsq;
            {
              // Compute distance of closest approach
              numtyp h12, sigma12;
              sigma12 = gpu_dot3(r12,kappa);
              sigma12 = rsqrt((numtyp)0.5*sigma12);
              h12 = r-sigma12;

              // -- kappa is now ok
              kappa[0]*=r;
              kappa[1]*=r;
              kappa[2]*=r;

              // Shifted LJ part: U_r = 4*eps*(rho^12 - rho^6) with
              // rho = sigma/(h12 + gamma*sigma).
              numtyp sigma = _sigma_<numtyp>(itype,jtype);
              numtyp epsilon = _epsilon_<numtyp>(itype,jtype);
              numtyp varrho = sigma/(h12+gum[0]*sigma);
              numtyp varrho6 = varrho*varrho*varrho;
              varrho6*=varrho6;
              numtyp varrho12 = varrho6*varrho6;
              u_r = (numtyp)4.0*epsilon*(varrho12-varrho6);

              numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma;
              temp1 = temp1*(numtyp)24.0*epsilon;
              uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5;
              numtyp temp2 = gpu_dot3(kappa,r12);
              uslj_rsq = uslj_rsq*ir*ir;

              dUr[0] = temp1*r12[0]+uslj_rsq*(kappa[0]-temp2*r12[0]);
              dUr[1] = temp1*r12[1]+uslj_rsq*(kappa[1]-temp2*r12[1]);
              dUr[2] = temp1*r12[2]+uslj_rsq*(kappa[2]-temp2*r12[2]);
            }

            // torque for particle 1
            {
              numtyp tempv[3], tempv2[3];
              tempv[0] = -uslj_rsq*kappa[0];
              tempv[1] = -uslj_rsq*kappa[1];
              tempv[2] = -uslj_rsq*kappa[2];
              gpu_row_times3(kappa,g1,tempv2);
              gpu_cross3(tempv,tempv2,tUr);
            }
          }
        }

        // Compute eta
        {
          eta = (numtyp)2.0*_lshape_<numtyp>(itype)*_lshape_<numtyp>(jtype);
          numtyp det_g12 = gpu_det3(g12);
          eta = pow(eta/det_g12,gum[1]);
        }

        // Compute teta: accumulate cross products of the rows of a1 with
        // the corresponding rows of -eta*gum[1]*compute_eta_torque(...).
        numtyp temp[9], tempv[3], tempv2[3];
        compute_eta_torque(g12,a1,itype,temp);
        numtyp temp1 = -eta*gum[1];

        tempv[0] = temp1*temp[0];
        tempv[1] = temp1*temp[1];
        tempv[2] = temp1*temp[2];
        gpu_cross3(a1,tempv,tempv2);
        teta[0] = tempv2[0];
        teta[1] = tempv2[1];
        teta[2] = tempv2[2];

        tempv[0] = temp1*temp[3];
        tempv[1] = temp1*temp[4];
        tempv[2] = temp1*temp[5];
        gpu_cross3(a1+3,tempv,tempv2);
        teta[0] += tempv2[0];
        teta[1] += tempv2[1];
        teta[2] += tempv2[2];

        tempv[0] = temp1*temp[6];
        tempv[1] = temp1*temp[7];
        tempv[2] = temp1*temp[8];
        gpu_cross3(a1+6,tempv,tempv2);
        teta[0] += tempv2[0];
        teta[1] += tempv2[1];
        teta[2] += tempv2[2];
      }

      numtyp chi, dchi[3], tchi[3];
      { // Compute chi and dchi

        // Compute b12
        numtyp b2[9], b12[9];
        {
          gpu_well_times3(jtype,a2,b12);
          gpu_transpose_times3(a2,b12,b2);
          gpu_plus3(b1,b2,b12);        // b12 = b1 + a2^T*W_j*a2
        }

        // compute chi_12  (r12 is currently r12hat; rescale to the full
        // displacement for the solve, then back to the unit vector)
        r12[0]*=r;
        r12[1]*=r;
        r12[2]*=r;
        numtyp iota[3];
        gpu_mldivide3(b12,r12,iota,err_flag);
        // -- iota is now iota/r
        iota[0]*=ir;
        iota[1]*=ir;
        iota[2]*=ir;
        r12[0]*=ir;
        r12[1]*=ir;
        r12[2]*=ir;
        chi = gpu_dot3(r12,iota);
        chi = pow(chi*(numtyp)2.0,gum[2]);

        // -- iota is now ok
        iota[0]*=r;
        iota[1]*=r;
        iota[2]*=r;

        numtyp temp1 = gpu_dot3(iota,r12);
        numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*pow(chi,(gum[2]-(numtyp)1.0)/gum[2]);
        dchi[0] = temp2*(iota[0]-temp1*r12[0]);
        dchi[1] = temp2*(iota[1]-temp1*r12[1]);
        dchi[2] = temp2*(iota[2]-temp1*r12[2]);

        // compute t_chi
        numtyp tempv[3];
        gpu_row_times3(iota,b1,tempv);
        gpu_cross3(tempv,iota,tchi);
        temp1 = (numtyp)-4.0*ir*ir;
        tchi[0] *= temp1;
        tchi[1] *= temp1;
        tchi[2] *= temp1;
      }

      // Combine the three factors: U = U_r * eta * chi (scaled by the
      // special-bond factor); forces via the product rule.
      numtyp temp2 = factor_lj*eta*chi;
      if (eflag)
        energy+=u_r*temp2;
      numtyp temp1 = -eta*u_r*factor_lj;
      if (vflag) {
        // Restore the full (negated) displacement for the virial tally.
        r12[0]*=-r;
        r12[1]*=-r;
        r12[2]*=-r;
        numtyp ft=temp1*dchi[0]-temp2*dUr[0];
        fx+=ft;
        virial[0]+=r12[0]*ft;
        ft=temp1*dchi[1]-temp2*dUr[1];
        fy+=ft;
        virial[1]+=r12[1]*ft;
        virial[3]+=r12[0]*ft;
        ft=temp1*dchi[2]-temp2*dUr[2];
        fz+=ft;
        virial[2]+=r12[2]*ft;
        virial[4]+=r12[0]*ft;
        virial[5]+=r12[1]*ft;
      } else {
        fx+=temp1*dchi[0]-temp2*dUr[0];
        fy+=temp1*dchi[1]-temp2*dUr[1];
        fz+=temp1*dchi[2]-temp2*dUr[2];
      }

      // Torque on 1
      temp1 = -u_r*eta*factor_lj;
      temp2 = -u_r*chi*factor_lj;
      numtyp temp3 = -chi*eta*factor_lj;
      torx+=temp1*tchi[0]+temp2*teta[0]+temp3*tUr[0];
      tory+=temp1*tchi[1]+temp2*teta[1]+temp3*tUr[1];
      torz+=temp1*tchi[2]+temp2*teta[2]+temp3*tUr[2];

    } // for nbor

    // Store answers (column ii, fields ans_pitch apart).
    acctyp *ap1=ans+ii;
    if (eflag) {
      *ap1=energy;
      ap1+=ans_pitch;
    }
    if (vflag) {
      for (int i=0; i<6; i++) {
        *ap1=virial[i];
        ap1+=ans_pitch;
      }
    }
    *ap1=fx;
    ap1+=ans_pitch;
    *ap1=fy;
    ap1+=ans_pitch;
    *ap1=fz;
    ap1+=ans_pitch;
    *ap1=torx;
    ap1+=ans_pitch;
    *ap1=tory;
    ap1+=ans_pitch;
    *ap1=torz;

  } // if ii
}
|
||||
|
||||
// Gay-Berne kernel for sphere-ellipsoid pairs: particle i is a sphere
// (isotropic shape/well, so no rotation matrix or torque is needed for i);
// neighbor j is an ellipsoid.  Processes particles [start, inum);
// overwrites (does not accumulate) the per-particle output slots.
// Output layout per particle: [energy?][virial x6?][fx fy fz].
template<class numtyp, class acctyp>
__global__ void kernel_sphere_gb(const numtyp *gum, const numtyp *special_lj,
                                 const int *dev_nbor, const int nbor_pitch,
                                 acctyp *ans, size_t ans_pitch, int *err_flag,
                                 const bool eflag, const bool vflag,
                                 const int start, const int inum,
                                 const int nall) {
  __shared__ numtyp sp_lj[4];

  // ii indexes the two interacting particles in gi
  int ii=threadIdx.x;
  if (ii<4)
    sp_lj[ii]=special_lj[ii];
  // NOTE(review): no __syncthreads() before other threads read sp_lj --
  // presumably safe for the targeted launch configuration; verify.
  ii+=INT_MUL(blockIdx.x,blockDim.x)+start;

  if (ii<inum) {

    acctyp energy=(numtyp)0;
    acctyp fx=(numtyp)0;
    acctyp fy=(numtyp)0;
    acctyp fz=(numtyp)0;
    acctyp virial[6];
    for (int i=0; i<6; i++)
      virial[i]=(numtyp)0;

    // Neighbor-list header: atom index, two skipped rows, count, entries.
    const int *nbor=dev_nbor+ii;
    int i=*nbor;
    nbor+=nbor_pitch;
    nbor+=nbor_pitch;
    nbor+=nbor_pitch;
    int numj=*nbor;
    nbor+=nbor_pitch;
    const int *nbor_end=nbor+INT_MUL(nbor_pitch,numj);

    numtyp ix=_x_<numtyp>(i,0);
    numtyp iy=_x_<numtyp>(i,1);
    numtyp iz=_x_<numtyp>(i,2);
    int itype=_x_<numtyp>(i,7);

    // A sphere's shape/well matrices are scalar multiples of the identity,
    // so a single diagonal entry stands in for the whole matrix.
    numtyp oner=_shape_<numtyp>(itype,0);
    numtyp one_well=_well_<numtyp>(itype,0);

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=nbor_pitch) {

      int j=*nbor;
      // Decode special-bond encoding (j + n*nall).
      if (j < nall)
        factor_lj = 1.0;
      else {
        factor_lj = sp_lj[j/nall];
        j %= nall;
      }
      int jtype=_x_<numtyp>(j,7);

      // Compute r12
      numtyp r12[3];
      r12[0] = _x_<numtyp>(j,0)-ix;
      r12[1] = _x_<numtyp>(j,1)-iy;
      r12[2] = _x_<numtyp>(j,2)-iz;
      numtyp ir = gpu_dot3(r12,r12);   // |r12|^2 here ...

      ir = rsqrt(ir);                  // ... now 1/|r12|
      numtyp r = (numtyp)1.0/ir;

      numtyp r12hat[3];
      r12hat[0]=r12[0]*ir;
      r12hat[1]=r12[1]*ir;
      r12hat[2]=r12[2]*ir;

      numtyp a2[9];
      gpu_quat_to_mat_trans(j,a2);

      numtyp u_r, dUr[3], eta;
      { // Compute U_r, dUr, eta, and teta
        // Compute g12
        numtyp g12[9];
        {
          {
            numtyp g2[9];
            gpu_shape_times3(jtype,a2,g12);
            gpu_transpose_times3(a2,g12,g2);
            // g12 = a2^T*S_j*a2 + oner*I (sphere contribution on the diagonal)
            g12[0]=g2[0]+oner;
            g12[4]=g2[4]+oner;
            g12[8]=g2[8]+oner;
            g12[1]=g2[1];
            g12[2]=g2[2];
            g12[3]=g2[3];
            g12[5]=g2[5];
            g12[6]=g2[6];
            g12[7]=g2[7];
          }

          { // Compute U_r and dUr

            // Compute kappa
            numtyp kappa[3];
            gpu_mldivide3(g12,r12,kappa,err_flag);

            // -- kappa is now / r
            kappa[0]*=ir;
            kappa[1]*=ir;
            kappa[2]*=ir;

            // energy

            // compute u_r and dUr
            numtyp uslj_rsq;
            {
              // Compute distance of closest approach
              numtyp h12, sigma12;
              sigma12 = gpu_dot3(r12hat,kappa);
              sigma12 = rsqrt((numtyp)0.5*sigma12);
              h12 = r-sigma12;

              // -- kappa is now ok
              kappa[0]*=r;
              kappa[1]*=r;
              kappa[2]*=r;

              // Shifted LJ: rho = sigma/(h12 + gamma*sigma).
              numtyp sigma = _sigma_<numtyp>(itype,jtype);
              numtyp epsilon = _epsilon_<numtyp>(itype,jtype);
              numtyp varrho = sigma/(h12+gum[0]*sigma);
              numtyp varrho6 = varrho*varrho*varrho;
              varrho6*=varrho6;
              numtyp varrho12 = varrho6*varrho6;
              u_r = (numtyp)4.0*epsilon*(varrho12-varrho6);

              numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma;
              temp1 = temp1*(numtyp)24.0*epsilon;
              uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5;
              numtyp temp2 = gpu_dot3(kappa,r12hat);
              uslj_rsq = uslj_rsq*ir*ir;

              dUr[0] = temp1*r12hat[0]+uslj_rsq*(kappa[0]-temp2*r12hat[0]);
              dUr[1] = temp1*r12hat[1]+uslj_rsq*(kappa[1]-temp2*r12hat[1]);
              dUr[2] = temp1*r12hat[2]+uslj_rsq*(kappa[2]-temp2*r12hat[2]);
            }
          }
        }

        // Compute eta
        {
          eta = (numtyp)2.0*_lshape_<numtyp>(itype)*_lshape_<numtyp>(jtype);
          numtyp det_g12 = gpu_det3(g12);
          eta = pow(eta/det_g12,gum[1]);
        }
      }

      numtyp chi, dchi[3];
      { // Compute chi and dchi

        // Compute b12
        numtyp b12[9];
        {
          numtyp b2[9];
          gpu_well_times3(jtype,a2,b12);
          gpu_transpose_times3(a2,b12,b2);
          // b12 = a2^T*W_j*a2 + one_well*I
          b12[0]=b2[0]+one_well;
          b12[4]=b2[4]+one_well;
          b12[8]=b2[8]+one_well;
          b12[1]=b2[1];
          b12[2]=b2[2];
          b12[3]=b2[3];
          b12[5]=b2[5];
          b12[6]=b2[6];
          b12[7]=b2[7];
        }

        // compute chi_12
        numtyp iota[3];
        gpu_mldivide3(b12,r12,iota,err_flag);
        // -- iota is now iota/r
        iota[0]*=ir;
        iota[1]*=ir;
        iota[2]*=ir;
        chi = gpu_dot3(r12hat,iota);
        chi = pow(chi*(numtyp)2.0,gum[2]);

        // -- iota is now ok
        iota[0]*=r;
        iota[1]*=r;
        iota[2]*=r;

        numtyp temp1 = gpu_dot3(iota,r12hat);
        numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*pow(chi,(gum[2]-(numtyp)1.0)/gum[2]);
        dchi[0] = temp2*(iota[0]-temp1*r12hat[0]);
        dchi[1] = temp2*(iota[1]-temp1*r12hat[1]);
        dchi[2] = temp2*(iota[2]-temp1*r12hat[2]);
      }

      // Combine factors: U = U_r * eta * chi, scaled by factor_lj.
      numtyp temp2 = factor_lj*eta*chi;
      if (eflag)
        energy+=u_r*temp2;
      numtyp temp1 = -eta*u_r*factor_lj;
      if (vflag) {
        // r12 still holds the raw displacement; negate it for the tally.
        r12[0]*=-1;
        r12[1]*=-1;
        r12[2]*=-1;
        numtyp ft=temp1*dchi[0]-temp2*dUr[0];
        fx+=ft;
        virial[0]+=r12[0]*ft;
        ft=temp1*dchi[1]-temp2*dUr[1];
        fy+=ft;
        virial[1]+=r12[1]*ft;
        virial[3]+=r12[0]*ft;
        ft=temp1*dchi[2]-temp2*dUr[2];
        fz+=ft;
        virial[2]+=r12[2]*ft;
        virial[4]+=r12[0]*ft;
        virial[5]+=r12[1]*ft;
      } else {
        fx+=temp1*dchi[0]-temp2*dUr[0];
        fy+=temp1*dchi[1]-temp2*dUr[1];
        fz+=temp1*dchi[2]-temp2*dUr[2];
      }
    } // for nbor

    // Store answers (assignment, not accumulation; kernel_lj adds to
    // these slots afterwards).
    acctyp *ap1=ans+ii;
    if (eflag) {
      *ap1=energy;
      ap1+=ans_pitch;
    }
    if (vflag) {
      for (int i=0; i<6; i++) {
        *ap1=virial[i];
        ap1+=ans_pitch;
      }
    }
    *ap1=fx;
    ap1+=ans_pitch;
    *ap1=fy;
    ap1+=ans_pitch;
    *ap1=fz;
  } // if ii
}
|
||||
|
||||
template<class numtyp, class acctyp>
|
||||
__global__ void kernel_lj(const numtyp *special_lj, const int *dev_nbor,
|
||||
const int *dev_ij, const int nbor_pitch, acctyp *ans,
|
||||
size_t ans_pitch, int *err_flag, const bool eflag,
|
||||
const bool vflag, const int start, const int inum,
|
||||
const int nall) {
|
||||
__shared__ numtyp sp_lj[4];
|
||||
|
||||
// ii indexes the two interacting particles in gi
|
||||
int ii=threadIdx.x;
|
||||
if (ii<4)
|
||||
sp_lj[ii]=special_lj[ii];
|
||||
ii+=INT_MUL(blockIdx.x,blockDim.x)+start;
|
||||
|
||||
if (ii<inum) {
|
||||
|
||||
acctyp energy=(numtyp)0;
|
||||
acctyp fx=(numtyp)0;
|
||||
acctyp fy=(numtyp)0;
|
||||
acctyp fz=(numtyp)0;
|
||||
acctyp virial[6];
|
||||
for (int i=0; i<6; i++)
|
||||
virial[i]=(numtyp)0;
|
||||
|
||||
const int *nbor=dev_nbor+ii;
|
||||
int i=*nbor;
|
||||
nbor+=nbor_pitch;
|
||||
int numj=*nbor;
|
||||
nbor+=nbor_pitch;
|
||||
const int *list=dev_ij+*nbor;
|
||||
const int *list_end=list+numj;
|
||||
|
||||
numtyp ix=_x_<numtyp>(i,0);
|
||||
numtyp iy=_x_<numtyp>(i,1);
|
||||
numtyp iz=_x_<numtyp>(i,2);
|
||||
int itype=_x_<numtyp>(i,7);
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; list<list_end; list++) {
|
||||
|
||||
int j=*list;
|
||||
if (j < nall)
|
||||
factor_lj = 1.0;
|
||||
else {
|
||||
factor_lj = sp_lj[j/nall];
|
||||
j %= nall;
|
||||
}
|
||||
int jtype=_x_<numtyp>(j,7);
|
||||
|
||||
// Compute r12
|
||||
numtyp delx = ix-_x_<numtyp>(j,0);
|
||||
numtyp dely = iy-_x_<numtyp>(j,1);
|
||||
numtyp delz = iz-_x_<numtyp>(j,2);
|
||||
numtyp r2inv = delx*delx+dely*dely+delz*delz;
|
||||
|
||||
if (r2inv<_cutsq_<numtyp>(itype,jtype) &&
|
||||
_form_(itype,jtype)==SPHERE_SPHERE) {
|
||||
r2inv=(numtyp)1.0/r2inv;
|
||||
numtyp r6inv = r2inv*r2inv*r2inv;
|
||||
numtyp force = r2inv*r6inv*(_lj1_<numtyp>(itype,jtype).x*r6inv-
|
||||
_lj1_<numtyp>(itype,jtype).y);
|
||||
force*=factor_lj;
|
||||
|
||||
fx+=delx*force;
|
||||
fy+=dely*force;
|
||||
fz+=delz*force;
|
||||
|
||||
if (eflag) {
|
||||
numtyp e=r6inv*(_lj3_<numtyp>(itype,jtype).x*r6inv-
|
||||
_lj3_<numtyp>(itype,jtype).y);
|
||||
energy+=factor_lj*(e-_offset_<numtyp>(1,1));
|
||||
}
|
||||
if (vflag) {
|
||||
virial[0] += delx*delx*force;
|
||||
virial[1] += dely*dely*force;
|
||||
virial[2] += delz*delz*force;
|
||||
virial[3] += delx*dely*force;
|
||||
virial[4] += delx*delz*force;
|
||||
virial[5] += dely*delz*force;
|
||||
}
|
||||
}
|
||||
|
||||
} // for nbor
|
||||
|
||||
// Store answers
|
||||
acctyp *ap1=ans+ii;
|
||||
if (eflag) {
|
||||
*ap1+=energy;
|
||||
ap1+=ans_pitch;
|
||||
}
|
||||
if (vflag) {
|
||||
for (int i=0; i<6; i++) {
|
||||
*ap1+=virial[i];
|
||||
ap1+=ans_pitch;
|
||||
}
|
||||
}
|
||||
*ap1+=fx;
|
||||
ap1+=ans_pitch;
|
||||
*ap1+=fy;
|
||||
ap1+=ans_pitch;
|
||||
*ap1+=fz;
|
||||
|
||||
} // if ii
|
||||
}
|
||||
|
||||
// Fast Lennard-Jones kernel: identical physics to kernel_lj, but all
// per-type-pair coefficient tables are staged into shared memory (valid
// when the number of types fits MAX_SHARED_TYPES).  Accumulates (+=) into
// the output slots.
template<class numtyp, class acctyp>
__global__ void kernel_lj_fast(const numtyp *special_lj, const int *dev_nbor,
                               const int *dev_ij, const int nbor_pitch,
                               acctyp *ans, size_t ans_pitch,int *err_flag,
                               const bool eflag, const bool vflag,
                               const int start, const int inum, const int nall){
  // ii indexes the two interacting particles in gi
  int ii=threadIdx.x;
  __shared__ numtyp sp_lj[4];
  __shared__ int form[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __shared__ numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __shared__ numtyp lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __shared__ numtyp lj2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __shared__ numtyp lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __shared__ numtyp lj4[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __shared__ numtyp offset[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  if (ii<4)
    sp_lj[ii]=special_lj[ii];
  // Each of the first MAX_SHARED_TYPES^2 threads loads one (itype,jtype)
  // table entry; lj3/lj4/offset only matter when energies are requested.
  if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
    int itype=ii/MAX_SHARED_TYPES;
    int jtype=ii%MAX_SHARED_TYPES;
    cutsq[ii]=_cutsq_<numtyp>(itype,jtype);
    form[ii]=_form_(itype,jtype);
    lj1[ii]=_lj1_<numtyp>(itype,jtype).x;
    lj2[ii]=_lj1_<numtyp>(itype,jtype).y;
    if (eflag) {
      lj3[ii]=_lj3_<numtyp>(itype,jtype).x;
      lj4[ii]=_lj3_<numtyp>(itype,jtype).y;
      offset[ii]=_offset_<numtyp>(itype,jtype);
    }
  }
  // NOTE(review): the shared tables are read below by threads other than
  // the ones that wrote them, with no __syncthreads() in between --
  // presumably safe for the targeted launch configuration; verify.
  ii+=INT_MUL(blockIdx.x,blockDim.x)+start;

  if (ii<inum) {

    acctyp energy=(numtyp)0;
    acctyp fx=(numtyp)0;
    acctyp fy=(numtyp)0;
    acctyp fz=(numtyp)0;
    acctyp virial[6];
    for (int i=0; i<6; i++)
      virial[i]=(numtyp)0;

    // Header: atom index, neighbor count, offset into the flat ij list.
    const int *nbor=dev_nbor+ii;
    int i=*nbor;
    nbor+=nbor_pitch;
    int numj=*nbor;
    nbor+=nbor_pitch;
    const int *list=dev_ij+*nbor;
    const int *list_end=list+numj;

    numtyp ix=_x_<numtyp>(i,0);
    numtyp iy=_x_<numtyp>(i,1);
    numtyp iz=_x_<numtyp>(i,2);
    // Pre-multiply the row index so mtype = itype*MAX_SHARED_TYPES + jtype.
    int itype=INT_MUL(MAX_SHARED_TYPES,_x_<numtyp>(i,7));

    numtyp factor_lj;
    for ( ; list<list_end; list++) {

      int j=*list;
      // Decode special-bond encoding (j + n*nall).
      if (j < nall)
        factor_lj = 1.0;
      else {
        factor_lj = sp_lj[j/nall];
        j %= nall;
      }
      int mtype=itype+_x_<numtyp>(j,7);

      // Compute r12 (r2inv holds the squared distance until inverted below)
      numtyp delx = ix-_x_<numtyp>(j,0);
      numtyp dely = iy-_x_<numtyp>(j,1);
      numtyp delz = iz-_x_<numtyp>(j,2);
      numtyp r2inv = delx*delx+dely*dely+delz*delz;

      if (r2inv<cutsq[mtype] && form[mtype]==SPHERE_SPHERE) {
        r2inv=(numtyp)1.0/r2inv;
        numtyp r6inv = r2inv*r2inv*r2inv;
        numtyp force = factor_lj*r2inv*r6inv*(lj1[mtype]*r6inv-lj2[mtype]);

        fx+=delx*force;
        fy+=dely*force;
        fz+=delz*force;

        if (eflag) {
          numtyp e=r6inv*(lj3[mtype]*r6inv-lj4[mtype]);
          energy+=factor_lj*(e-offset[mtype]);
        }
        if (vflag) {
          virial[0] += delx*delx*force;
          virial[1] += dely*dely*force;
          virial[2] += delz*delz*force;
          virial[3] += delx*dely*force;
          virial[4] += delx*delz*force;
          virial[5] += dely*delz*force;
        }
      }

    } // for nbor

    // Store answers: accumulate on top of the Gay-Berne kernel's results.
    acctyp *ap1=ans+ii;
    if (eflag) {
      *ap1+=energy;
      ap1+=ans_pitch;
    }
    if (vflag) {
      for (int i=0; i<6; i++) {
        *ap1+=virial[i];
        ap1+=ans_pitch;
      }
    }
    *ap1+=fx;
    ap1+=ans_pitch;
    *ap1+=fy;
    ap1+=ans_pitch;
    *ap1+=fz;

  } // if ii
}
|
||||
|
||||
#endif
|
|
@ -0,0 +1,146 @@
|
|||
/***************************************************************************
|
||||
gb_gpu_memory.cu
|
||||
-------------------
|
||||
W. Michael Brown
|
||||
|
||||
Global variables for GPU Gayberne Library
|
||||
|
||||
__________________________________________________________________________
|
||||
This file is part of the LAMMPS GPU Library
|
||||
__________________________________________________________________________
|
||||
|
||||
begin : Thu Jun 25 2009
|
||||
copyright : (C) 2009 by W. Michael Brown
|
||||
email : wmbrown@sandia.gov
|
||||
***************************************************************************/
|
||||
|
||||
/* -----------------------------------------------------------------------
|
||||
Copyright (2009) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
#include "gb_gpu_memory.h"
|
||||
#define GB_GPU_MemoryT GB_GPU_Memory<numtyp, acctyp>
|
||||
|
||||
template <class numtyp, class acctyp>
GB_GPU_MemoryT::GB_GPU_Memory() : LJ_GPU_MemoryT() {
  // Gay-Berne needs more per-atom storage than plain LJ: 8 atom fields
  // (the GB kernels read positions from fields 0-2 and the type from
  // field 7) and 13 answer fields. Neighbor packing is enabled because
  // the GB kernels consume a packed neighbor layout.
  this->atom.atom_fields(8);
  this->atom.ans_fields(13);
  this->nbor.packing(true);
}
|
||||
|
||||
template <class numtyp, class acctyp>
GB_GPU_MemoryT::~GB_GPU_Memory() {
  // clear() checks this->allocated first, so destruction after an explicit
  // clear() (or a failed init) is safe.
  clear();
}
|
||||
|
||||
// Allocate device storage, upload the Gay-Berne + LJ coefficient tables to
// the selected GPU and bind the constant tables to textures.  Returns the
// host-side ij buffer the caller fills with neighbor data.
// NOTE(review): the return value of LJ_GPU_MemoryT::init() (0 on failure)
// is ignored here -- confirm failure is handled by the caller.
template <class numtyp, class acctyp>
int* GB_GPU_MemoryT::init(const int ij_size, const int ntypes,
                          const double gamma, const double upsilon,
                          const double mu, double **host_shape,
                          double **host_well, double **host_cutsq,
                          double **host_sigma, double **host_epsilon,
                          double *host_lshape, int **h_form, double **host_lj1,
                          double **host_lj2, double **host_lj3,
                          double **host_lj4, double **host_offset,
                          double *host_special_lj, const int max_nbors,
                          const bool force_d, const int me) {
  if (this->allocated)
    clear();

  // Base-class init selects the device, sizes atom/nbor storage and uploads
  // the plain LJ tables.
  LJ_GPU_MemoryT::init(ij_size,ntypes,host_cutsq,host_sigma,host_epsilon,
                       host_lj1, host_lj2, host_lj3, host_lj4, host_offset,
                       host_special_lj, max_nbors, me);

  // Keep a host-side pointer to the form table for multiple_forms logic.
  host_form=h_form;

  // Initialize timers for the selected GPU
  time_kernel.init();
  time_gayberne.init();
  time_kernel2.init();
  time_gayberne2.init();

  // Use the write buffer from atom for data initialization
  NVC_HostT &host_write=this->atom.host_write;
  assert(host_write.numel()>4 && host_write.numel()>ntypes*ntypes*2);

  // Allocate, cast and asynchronous memcpy of constant data
  gamma_upsilon_mu.safe_alloc(3);
  host_write[0]=static_cast<numtyp>(gamma);
  host_write[1]=static_cast<numtyp>(upsilon);
  host_write[2]=static_cast<numtyp>(mu);
  gamma_upsilon_mu.copy_from_host(host_write.begin());

  lshape.safe_alloc(ntypes);
  lshape.cast_copy(host_lshape,host_write);
  // NOTE(review): cast_copy appears to stage-and-copy already; the extra
  // copy_from_host below looks redundant -- confirm against NVC_VecT.
  lshape.copy_from_host(host_write.begin());

  // Copy shape, well, sigma, epsilon, and cutsq onto GPU
  shape.safe_alloc(ntypes,3);
  shape.cast_copy(host_shape[0],host_write);
  well.safe_alloc(ntypes,3);
  well.cast_copy(host_well[0],host_write);

  // Copy LJ data onto GPU
  // The form table is padded to MAX_SHARED_TYPES so the shared-memory LJ
  // kernels can index it with itype*MAX_SHARED_TYPES+jtype.
  int lj_types=ntypes;
  if (lj_types<=MAX_SHARED_TYPES)
    lj_types=MAX_SHARED_TYPES;
  form.safe_alloc(lj_types,lj_types);
  form.copy_2Dfrom_host(host_form[0],ntypes,ntypes);

  // See if we want fast GB-sphere or sphere-sphere calculations
  multiple_forms=false;
  for (int i=1; i<ntypes; i++)
    for (int j=i; j<ntypes; j++)
      if (host_form[i][j]!=ELLIPSE_ELLIPSE)
        multiple_forms=true;

  // Memory for ilist ordered by particle type
  host_olist.safe_alloc_rw(this->max_atoms);

  // Bind constant data to textures
  lshape_bind_texture<numtyp>(lshape);
  shape_bind_texture<numtyp>(shape);
  well_bind_texture<numtyp>(well);
  form_bind_texture(form);

  return this->nbor.host_ij.begin();
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void GB_GPU_MemoryT::clear() {
|
||||
if (!this->allocated)
|
||||
return;
|
||||
|
||||
int err_flag;
|
||||
this->dev_error.copy_to_host(&err_flag);
|
||||
if (err_flag == 1)
|
||||
std::cerr << "COLLISION BUFFER OVERFLOW OCCURED. INCREASE COLLISION_N "
|
||||
<< "and RECOMPILE.\n";
|
||||
else if (err_flag == 2)
|
||||
std::cerr << "BAD MATRIX INVERSION IN FORCE COMPUTATION.\n";
|
||||
|
||||
LJ_GPU_MemoryT::clear();
|
||||
|
||||
shape_unbind_texture<numtyp>();
|
||||
well_unbind_texture<numtyp>();
|
||||
form_unbind_texture();
|
||||
|
||||
shape.clear();
|
||||
well.clear();
|
||||
form.clear();
|
||||
lshape.clear();
|
||||
gamma_upsilon_mu.clear();
|
||||
host_olist.clear();
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
double GB_GPU_MemoryT::host_memory_usage() {
|
||||
return this->atom.host_memory_usage(this->max_atoms)+
|
||||
this->nbor.host_memory_usage()+4*sizeof(numtyp)+
|
||||
sizeof(GB_GPU_Memory<numtyp,acctyp>)+this->max_atoms*sizeof(int);
|
||||
}
|
||||
|
||||
template class GB_GPU_Memory<PRECISION,ACC_PRECISION>;
|
|
@ -0,0 +1,74 @@
|
|||
/***************************************************************************
|
||||
gb_gpu_memory.h
|
||||
-------------------
|
||||
W. Michael Brown
|
||||
|
||||
Global variables for GPU Gayberne Library
|
||||
|
||||
__________________________________________________________________________
|
||||
This file is part of the LAMMPS GPU Library
|
||||
__________________________________________________________________________
|
||||
|
||||
begin : Thu Jun 25 2009
|
||||
copyright : (C) 2009 by W. Michael Brown
|
||||
email : wmbrown@sandia.gov
|
||||
***************************************************************************/
|
||||
|
||||
/* -----------------------------------------------------------------------
|
||||
Copyright (2009) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
#ifndef GB_GPU_MEMORY_H
#define GB_GPU_MEMORY_H

// Maximum number of GPUs the Gay-Berne library will manage.
#define MAX_GPU_THREADS 4
#include "lj_gpu_memory.h"
#define LJ_GPU_MemoryT LJ_GPU_Memory<numtyp,acctyp>

// Interaction form for a type pair (shape of particle i vs shape of j).
enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};

// Device/host storage for the Gay-Berne pair style.  Extends the plain LJ
// storage with per-type shape/well matrices, the lshape vector, the form
// table and extra kernel timers.
template <class numtyp, class acctyp>
class GB_GPU_Memory : public LJ_GPU_MemoryT {
 public:
  GB_GPU_Memory();
  ~GB_GPU_Memory();

  // Allocate device storage and upload all coefficient tables; returns the
  // host-side ij buffer the caller fills with neighbor data.
  int* init(const int ij_size, const int ntypes, const double gamma,
            const double upsilon, const double mu, double **host_shape,
            double **host_well, double **host_cutsq, double **host_sigma,
            double **host_epsilon, double *host_lshape, int **h_form,
            double **host_lj1, double **host_lj2, double **host_lj3,
            double **host_lj4, double **host_offset, double *host_special_lj,
            const int max_nbors, const bool force_d, const int me);

  // Free all device/host storage (also reports kernel error flags).
  void clear();

  // Total host memory used by the library, in bytes.
  double host_memory_usage();

  // ---------------------------- DATA ----------------------------

  // ilist with particles sorted by type
  NVC_HostI host_olist;

  // --------------- Const Data for Atoms
  NVC_ConstMatT shape, well;
  NVC_ConstMatI form;
  NVC_VecT lshape, gamma_upsilon_mu;


  // --------------- Timing Stuff
  NVCTimer time_kernel, time_gayberne, time_kernel2, time_gayberne2;

  // True if we want to use fast GB-sphere or sphere-sphere calculations
  bool multiple_forms;
  // Host-side copy of the per-type-pair form table (set in init()).
  int **host_form;
  // NOTE(review): not assigned anywhere in this translation unit --
  // presumably maintained by the solver code; confirm before relying on it.
  int last_ellipse;

 private:
};

#endif
|
||||
|
|
@ -0,0 +1,234 @@
|
|||
/***************************************************************************
|
||||
lj_gpu.cu
|
||||
-------------------
|
||||
W. Michael Brown
|
||||
|
||||
Lennard-Jones potential GPU calcultation
|
||||
|
||||
__________________________________________________________________________
|
||||
This file is part of the LAMMPS GPU Library
|
||||
__________________________________________________________________________
|
||||
|
||||
begin : Tue Aug 4 2009
|
||||
copyright : (C) 2009 by W. Michael Brown
|
||||
email : wmbrown@sandia.gov
|
||||
***************************************************************************/
|
||||
|
||||
/* -----------------------------------------------------------------------
|
||||
Copyright (2009) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
#include <iostream>
|
||||
#include <cassert>
|
||||
#include "nvc_macros.h"
|
||||
#include "nvc_timer.h"
|
||||
#include "nvc_device.h"
|
||||
#include "lj_gpu_memory.h"
|
||||
#include "lj_gpu_kernel.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
static LJ_GPU_Memory<PRECISION,ACC_PRECISION> LJMF;
|
||||
#define LJMT LJ_GPU_Memory<numtyp,acctyp>
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Convert something to a string
|
||||
// ---------------------------------------------------------------------------
|
||||
#include <sstream>
|
||||
// ---------------------------------------------------------------------------
// Render any streamable value as text using 2 significant digits.
// ---------------------------------------------------------------------------
template <class t>
inline string lj_gpu_toa(const t& in) {
  ostringstream stream;
  stream.precision(2);
  stream << in;
  return stream.str();
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Return string with GPU info
|
||||
// ---------------------------------------------------------------------------
|
||||
// Human-readable description of device `id`: name, core count, memory,
// clock and how many atoms fit given `max_nbors` neighbors per atom.
string lj_gpu_name(const int id, const int max_nbors) {
  string name=LJMF.gpu.name(id)+", "+
              lj_gpu_toa(LJMF.gpu.cores(id))+" cores, "+
              lj_gpu_toa(LJMF.gpu.gigabytes(id))+" GB, "+
              lj_gpu_toa(LJMF.gpu.clock_rate(id))+" GHZ, "+
              lj_gpu_toa(LJMF.get_max_atoms(LJMF.gpu.bytes(id),
                                            max_nbors))+" Atoms";
  return name;
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Allocate memory on host and device and copy constants to device
|
||||
// ---------------------------------------------------------------------------
|
||||
// Allocate memory on host and device and copy constants to device.
// Returns the host-side ij buffer (0 when no GPU is available); ij_size is
// set to the chunk size the caller must use when staging neighbor data.
int * lj_gpu_init(int &ij_size, const int ntypes, double **cutsq,double **sigma,
                  double **epsilon, double **host_lj1, double **host_lj2,
                  double **host_lj3, double **host_lj4, double **offset,
                  double *special_lj, const int max_nbors, const int gpu_id) {
  if (LJMF.gpu.num_devices()==0)
    return 0;

  ij_size=IJ_SIZE;
  return LJMF.init(ij_size, ntypes, cutsq, sigma, epsilon, host_lj1, host_lj2,
                   host_lj3, host_lj4, offset, special_lj, max_nbors, gpu_id);
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Clear memory on host and device
|
||||
// ---------------------------------------------------------------------------
|
||||
// Free all host/device memory held by the file-static LJMF object.
void lj_gpu_clear() {
  LJMF.clear();
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// copy atom positions and optionally types to device
|
||||
// ---------------------------------------------------------------------------
|
||||
// Stage x/y/z (and on neighbor rebuild, the type array) into the pinned
// write buffer and start an asynchronous copy to the device on `stream`.
template <class numtyp, class acctyp>
inline void _lj_gpu_atom(PairGPUAtom<numtyp,acctyp> &atom, double **host_x,
                         const int *host_type, const bool rebuild,
                         cudaStream_t &stream) {
  atom.time_atom.start();
  atom.reset_write_buffer();

  // First row of dev_x is x position, second is y, third is z
  // (host_x is an n x 3 array, so a stride of 3 walks one coordinate).
  atom.add_atom_data(host_x[0],3);
  atom.add_atom_data(host_x[0]+1,3);
  atom.add_atom_data(host_x[0]+2,3);

  // Number of columns staged so far.
  int csize=3;

  // If a rebuild occured, copy type data
  if (rebuild) {
    atom.add_atom_data(host_type);
    csize++;
  }

  atom.copy_atom_data(csize,stream);
  atom.time_atom.stop();
}
|
||||
|
||||
// C-callable wrapper: stage positions/types for the file-static LJMF object
// on its pair stream.
void lj_gpu_atom(double **host_x, const int *host_type, const bool rebuild) {
  _lj_gpu_atom(LJMF.atom, host_x, host_type, rebuild, LJMF.pair_stream);
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Signal that we need to transfer a new neighbor list
|
||||
// ---------------------------------------------------------------------------
|
||||
// Upload a fresh neighbor-list header (ilist + per-atom counts) for inum
// particles.  Returns false when the system no longer fits in the
// preallocated device storage, signalling the caller to fall back.
template <class LJMTyp>
bool _lj_gpu_reset_nbors(LJMTyp &ljm, const int nall, const int inum,
                         int *ilist, const int *numj) {
  if (nall>ljm.max_atoms)
    return false;

  ljm.nbor.time_nbor.start();

  ljm.atom.nall(nall);
  ljm.atom.inum(inum);
  ljm.nbor.reset(inum,ilist,numj,ljm.pair_stream);

  ljm.nbor.time_nbor.stop();
  return true;
}
|
||||
|
||||
// C-callable wrapper for _lj_gpu_reset_nbors on the file-static LJMF object.
bool lj_gpu_reset_nbors(const int nall, const int inum, int *ilist,
                        const int *numj) {
  return _lj_gpu_reset_nbors(LJMF,nall,inum,ilist,numj);
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Copy a set of ij_size ij interactions to device and compute energies,
|
||||
// forces, and torques for those interactions
|
||||
// ---------------------------------------------------------------------------
|
||||
// Queue an asynchronous copy of num_ij packed ij entries to the device.
// The previous chunk's time is folded into the running total first.
template <class LJMTyp>
void _lj_gpu_nbors(LJMTyp &ljm, const int num_ij) {
  ljm.nbor.time_nbor.add_to_total();
  // CUDA_SAFE_CALL(cudaStreamSynchronize(ljm.pair_stream)); // Not if timed

  ljm.nbor.time_nbor.start();
  ljm.nbor.add(num_ij,ljm.pair_stream);
  ljm.nbor.time_nbor.stop();
}
|
||||
|
||||
// C-callable wrapper for _lj_gpu_nbors on the file-static LJMF object.
void lj_gpu_nbors(const int num_ij) {
  _lj_gpu_nbors(LJMF,num_ij);
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Calculate energies and forces for all ij interactions
|
||||
// ---------------------------------------------------------------------------
|
||||
// Launch the LJ pair kernel for all local particles.  The fast variant
// caches the per-type coefficients in shared memory and is used when every
// type fits (ljm.shared_types, decided in init()).
// NOTE(review): the `rebuild` argument is currently unused here.
template <class numtyp, class acctyp>
void _lj_gpu(LJMT &ljm, const bool eflag, const bool vflag, const bool rebuild){
  // Compute the block size and grid size to keep all cores busy
  const int BX=BLOCK_1D;

  int GX=static_cast<int>(ceil(static_cast<double>(ljm.atom.inum())/BX));

  ljm.time_pair.start();
  if (ljm.shared_types)
    kernel_lj_fast<numtyp,acctyp><<<GX,BX,0,ljm.pair_stream>>>
      (ljm.special_lj.begin(), ljm.nbor.dev_nbor.begin(),
       ljm.nbor.ij.begin(), ljm.nbor.dev_nbor.row_size(),
       ljm.atom.ans.begin(), ljm.atom.ans.row_size(), eflag,
       vflag, ljm.atom.inum(), ljm.atom.nall());
  else
    kernel_lj<numtyp,acctyp><<<GX,BX,0,ljm.pair_stream>>>
      (ljm.special_lj.begin(), ljm.nbor.dev_nbor.begin(),
       ljm.nbor.ij.begin(), ljm.nbor.dev_nbor.row_size(),
       ljm.atom.ans.begin(), ljm.atom.ans.row_size(), eflag,
       vflag, ljm.atom.inum(), ljm.atom.nall());
  ljm.time_pair.stop();
}
|
||||
|
||||
// C-callable wrapper: launch the pair kernel on the file-static LJMF object.
void lj_gpu(const bool eflag, const bool vflag, const bool rebuild) {
  _lj_gpu<PRECISION,ACC_PRECISION>(LJMF,eflag,vflag,rebuild);
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Get energies and forces to host
|
||||
// ---------------------------------------------------------------------------
|
||||
// Copy per-particle answers back from the device, accumulate them into the
// caller's force/energy/virial arrays and return the total vdW energy.
template<class numtyp, class acctyp>
double _lj_gpu_forces(LJMT &ljm, double **f, const int *ilist,
                      const bool eflag, const bool vflag, const bool eflag_atom,
                      const bool vflag_atom, double *eatom, double **vatom,
                      double *virial) {
  double evdw;

  ljm.atom.time_answer.start();
  ljm.atom.copy_answers(eflag,vflag,ljm.pair_stream);

  // Fold this step's timers into the running totals while the copy runs.
  ljm.atom.time_atom.add_to_total();
  ljm.nbor.time_nbor.add_to_total();
  ljm.time_pair.add_to_total();
  // CUDA_SAFE_CALL(cudaStreamSynchronize(ljm.pair_stream)); // not if timed

  evdw=ljm.atom.energy_virial(ilist,eflag_atom,vflag_atom,eatom,vatom,virial);
  ljm.atom.add_forces(ilist,f);
  ljm.atom.time_answer.stop();
  ljm.atom.time_answer.add_to_total();
  return evdw;
}
|
||||
|
||||
// C-callable wrapper for _lj_gpu_forces on the file-static LJMF object.
double lj_gpu_forces(double **f, const int *ilist, const bool eflag,
                     const bool vflag, const bool eflag_atom,
                     const bool vflag_atom, double *eatom, double **vatom,
                     double *virial) {
  return _lj_gpu_forces<PRECISION,ACC_PRECISION>
    (LJMF,f,ilist,eflag,vflag,eflag_atom,vflag_atom,eatom,vatom,virial);
}
|
||||
|
||||
// Print the accumulated per-phase timings (copies and kernel) to stdout.
void lj_gpu_time() {
  cout.precision(4);
  cout << "Atom copy:     " << LJMF.atom.time_atom.total_seconds() << " s.\n";
  cout << "Neighbor copy: " << LJMF.nbor.time_nbor.total_seconds() << " s.\n";
  cout << "LJ calc:       " << LJMF.time_pair.total_seconds() << " s.\n";
  cout << "Answer copy:   " << LJMF.atom.time_answer.total_seconds() << " s.\n";
}
|
||||
|
||||
// Number of CUDA devices visible to the library.
int lj_gpu_num_devices() {
  return LJMF.gpu.num_devices();
}
|
||||
|
||||
// Host memory (bytes) currently used by the library.
double lj_gpu_bytes() {
  return LJMF.host_memory_usage();
}
|
|
@ -0,0 +1,248 @@
|
|||
/***************************************************************************
|
||||
lj_gpu_kernel.cu
|
||||
-------------------
|
||||
W. Michael Brown
|
||||
|
||||
Routines that actually perform the force computation
|
||||
|
||||
__________________________________________________________________________
|
||||
This file is part of the LAMMPS GPU Library
|
||||
__________________________________________________________________________
|
||||
|
||||
begin : Tue Aug 4 2009
|
||||
copyright : (C) 2009 by W. Michael Brown
|
||||
email : wmbrown@sandia.gov
|
||||
***************************************************************************/
|
||||
|
||||
/* -----------------------------------------------------------------------
|
||||
Copyright (2009) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
#ifndef LJ_GPU_KERNEL
|
||||
#define LJ_GPU_KERNEL
|
||||
|
||||
// General LJ pair kernel: one thread per local particle i.  Energy, force
// and virial are accumulated in registers and written to the column-major
// answer array (columns ans_pitch apart) at the end.
template<class numtyp, class acctyp>
__global__ void kernel_lj(const numtyp *special_lj, const int *dev_nbor,
                          const int *dev_ij, const int nbor_pitch, acctyp *ans,
                          size_t ans_pitch, const bool eflag,
                          const bool vflag, const int inum, const int nall) {
  __shared__ numtyp sp_lj[4];

  // ii indexes the two interacting particles in gi
  int ii=threadIdx.x;
  if (ii<4)
    sp_lj[ii]=special_lj[ii];
  // Bug fix: every thread reads sp_lj below, but only threads 0-3 wrote
  // it; synchronize so the shared data is visible block-wide.
  __syncthreads();
  ii+=INT_MUL(blockIdx.x,blockDim.x);

  if (ii<inum) {

    acctyp energy=(numtyp)0;
    acctyp fx=(numtyp)0;
    acctyp fy=(numtyp)0;
    acctyp fz=(numtyp)0;
    acctyp virial[6];
    for (int i=0; i<6; i++)
      virial[i]=(numtyp)0;

    // Neighbor record for this thread: atom index, count, ij-list offset,
    // stored nbor_pitch apart.
    const int *nbor=dev_nbor+ii;
    int i=*nbor;
    nbor+=nbor_pitch;
    int numj=*nbor;
    nbor+=nbor_pitch;
    const int *list=dev_ij+*nbor;
    const int *list_end=list+numj;

    numtyp ix=_x_<numtyp>(i,0);
    numtyp iy=_x_<numtyp>(i,1);
    numtyp iz=_x_<numtyp>(i,2);
    int itype=_x_<numtyp>(i,3);

    numtyp factor_lj;
    for ( ; list<list_end; list++) {

      int j=*list;
      // Special (bonded) neighbors are encoded as j+n*nall; decode the
      // special-LJ factor and the real index.
      if (j < nall)
        factor_lj = 1.0;
      else {
        factor_lj = sp_lj[j/nall];
        j %= nall;
      }
      int jtype=_x_<numtyp>(j,3);

      // Compute r12
      numtyp delx = ix-_x_<numtyp>(j,0);
      numtyp dely = iy-_x_<numtyp>(j,1);
      numtyp delz = iz-_x_<numtyp>(j,2);
      numtyp r2inv = delx*delx+dely*dely+delz*delz;

      if (r2inv<_cutsq_<numtyp>(itype,jtype)) {
        r2inv=(numtyp)1.0/r2inv;
        numtyp r6inv =r2inv*r2inv*r2inv;
        numtyp force =factor_lj*r2inv*r6inv*(_lj1_<numtyp>(itype,jtype).x*r6inv-
                                             _lj1_<numtyp>(itype,jtype).y);

        fx+=delx*force;
        fy+=dely*force;
        fz+=delz*force;

        if (eflag) {
          numtyp e=r6inv*(_lj3_<numtyp>(itype,jtype).x*r6inv-
                          _lj3_<numtyp>(itype,jtype).y);
          // Bug fix: the energy shift must be looked up for this type
          // pair; it was hard-coded to the constant pair (1,1).
          energy+=factor_lj*(e-_offset_<numtyp>(itype,jtype));
        }
        if (vflag) {
          virial[0] += delx*delx*force;
          virial[1] += dely*dely*force;
          virial[2] += delz*delz*force;
          virial[3] += delx*dely*force;
          virial[4] += delx*delz*force;
          virial[5] += dely*delz*force;
        }
      }

    } // for nbor

    // Store answers
    acctyp *ap1=ans+ii;
    if (eflag) {
      *ap1=energy;
      ap1+=ans_pitch;
    }
    if (vflag) {
      for (int i=0; i<6; i++) {
        *ap1=virial[i];
        ap1+=ans_pitch;
      }
    }
    *ap1=fx;
    ap1+=ans_pitch;
    *ap1=fy;
    ap1+=ans_pitch;
    *ap1=fz;

  } // if ii
}
|
||||
|
||||
// Shared-memory LJ pair kernel, used when all type pairs fit in
// MAX_SHARED_TYPES^2: per-type coefficients are cached in shared memory and
// indexed with mtype = itype*MAX_SHARED_TYPES + jtype.
template<class numtyp, class acctyp>
__global__ void kernel_lj_fast(const numtyp *special_lj, const int *dev_nbor,
                               const int *dev_ij, const int nbor_pitch,
                               acctyp *ans, size_t ans_pitch,const bool eflag,
                               const bool vflag, const int inum,
                               const int nall) {

  // ii indexes the two interacting particles in gi
  int ii=threadIdx.x;
  __shared__ numtyp sp_lj[4];
  __shared__ numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __shared__ numtyp lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __shared__ numtyp lj2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __shared__ numtyp lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __shared__ numtyp lj4[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __shared__ numtyp offset[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  if (ii<4)
    sp_lj[ii]=special_lj[ii];
  if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
    int itype=ii/MAX_SHARED_TYPES;
    int jtype=ii%MAX_SHARED_TYPES;
    cutsq[ii]=_cutsq_<numtyp>(itype,jtype);
    lj1[ii]=_lj1_<numtyp>(itype,jtype).x;
    lj2[ii]=_lj1_<numtyp>(itype,jtype).y;
    if (eflag) {
      lj3[ii]=_lj3_<numtyp>(itype,jtype).x;
      lj4[ii]=_lj3_<numtyp>(itype,jtype).y;
      offset[ii]=_offset_<numtyp>(itype,jtype);
    }
  }
  // Bug fix: each thread loads at most one table entry above but reads
  // arbitrary entries below; synchronize so the shared tables are complete
  // before any thread uses them.
  __syncthreads();
  ii+=INT_MUL(blockIdx.x,blockDim.x);

  if (ii<inum) {

    acctyp energy=(numtyp)0;
    acctyp fx=(numtyp)0;
    acctyp fy=(numtyp)0;
    acctyp fz=(numtyp)0;
    acctyp virial[6];
    for (int i=0; i<6; i++)
      virial[i]=(numtyp)0;

    // Neighbor record for this thread: atom index, count, ij-list offset.
    const int *nbor=dev_nbor+ii;
    int i=*nbor;
    nbor+=nbor_pitch;
    int numj=*nbor;
    nbor+=nbor_pitch;
    const int *list=dev_ij+*nbor;
    const int *list_end=list+numj;

    numtyp ix=_x_<numtyp>(i,0);
    numtyp iy=_x_<numtyp>(i,1);
    numtyp iz=_x_<numtyp>(i,2);
    int itype=INT_MUL(MAX_SHARED_TYPES,_x_<numtyp>(i,3));

    numtyp factor_lj;
    for ( ; list<list_end; list++) {

      int j=*list;
      // Special (bonded) neighbors are encoded as j+n*nall.
      if (j < nall)
        factor_lj = 1.0;
      else {
        factor_lj = sp_lj[j/nall];
        j %= nall;
      }
      int mtype=itype+_x_<numtyp>(j,3);

      // Compute r12
      numtyp delx = ix-_x_<numtyp>(j,0);
      numtyp dely = iy-_x_<numtyp>(j,1);
      numtyp delz = iz-_x_<numtyp>(j,2);
      numtyp r2inv = delx*delx+dely*dely+delz*delz;

      if (r2inv<cutsq[mtype]) {
        r2inv=(numtyp)1.0/r2inv;
        numtyp r6inv = r2inv*r2inv*r2inv;
        numtyp force = factor_lj*r2inv*r6inv*(lj1[mtype]*r6inv-lj2[mtype]);

        fx+=delx*force;
        fy+=dely*force;
        fz+=delz*force;

        if (eflag) {
          numtyp e=r6inv*(lj3[mtype]*r6inv-lj4[mtype]);
          energy+=factor_lj*(e-offset[mtype]);
        }
        if (vflag) {
          virial[0] += delx*delx*force;
          virial[1] += dely*dely*force;
          virial[2] += delz*delz*force;
          virial[3] += delx*dely*force;
          virial[4] += delx*delz*force;
          virial[5] += dely*delz*force;
        }
      }

    } // for nbor

    // Store answers
    acctyp *ap1=ans+ii;
    if (eflag) {
      *ap1=energy;
      ap1+=ans_pitch;
    }
    if (vflag) {
      for (int i=0; i<6; i++) {
        *ap1=virial[i];
        ap1+=ans_pitch;
      }
    }
    *ap1=fx;
    ap1+=ans_pitch;
    *ap1=fy;
    ap1+=ans_pitch;
    *ap1=fz;

  } // if ii
}
|
||||
|
||||
#endif
|
|
@ -0,0 +1,167 @@
|
|||
/***************************************************************************
|
||||
lj_gpu_memory.cu
|
||||
-------------------
|
||||
W. Michael Brown
|
||||
|
||||
Global variables for GPU Lennard-Jones Library
|
||||
|
||||
__________________________________________________________________________
|
||||
This file is part of the LAMMPS GPU Library
|
||||
__________________________________________________________________________
|
||||
|
||||
begin : Tue Aug 4 2009
|
||||
copyright : (C) 2009 by W. Michael Brown
|
||||
email : wmbrown@sandia.gov
|
||||
***************************************************************************/
|
||||
|
||||
/* -----------------------------------------------------------------------
|
||||
Copyright (2009) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
#include "lj_gpu_memory.h"
|
||||
#define LJ_GPU_MemoryT LJ_GPU_Memory<numtyp, acctyp>
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int LJ_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
|
||||
return atom.bytes_per_atom()+nbor.bytes_per_atom(max_nbors);
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int LJ_GPU_MemoryT::get_max_atoms(const int gpu_bytes, const int max_nbors) {
|
||||
int matoms=static_cast<int>(PERCENT_GPU_MEMORY*gpu_bytes/
|
||||
bytes_per_atom(max_nbors));
|
||||
if (matoms>MAX_ATOMS)
|
||||
matoms=MAX_ATOMS;
|
||||
return matoms;
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int* LJ_GPU_MemoryT::init(const int ij_size, const int ntypes,
|
||||
double **host_cutsq, double **host_sigma,
|
||||
double **host_epsilon, double **host_lj1,
|
||||
double **host_lj2, double **host_lj3,
|
||||
double **host_lj4, double **host_offset,
|
||||
double *host_special_lj, const int max_nbors,
|
||||
const int me) {
|
||||
if (allocated)
|
||||
clear();
|
||||
|
||||
if (me>=gpu.num_devices())
|
||||
return 0;
|
||||
gpu.set(me);
|
||||
if (gpu.revision()<1.0)
|
||||
return 0;
|
||||
|
||||
// Initialize timers for the selected GPU
|
||||
time_pair.init();
|
||||
|
||||
// Initialize atom and nbor data
|
||||
max_atoms=get_max_atoms(gpu.bytes(),max_nbors);
|
||||
atom.init(max_atoms);
|
||||
nbor.init(ij_size,max_atoms,max_nbors);
|
||||
|
||||
// Get a stream for computing pair potentials
|
||||
CUDA_SAFE_CALL(cudaStreamCreate(&pair_stream));
|
||||
|
||||
// Use the write buffer from atom for data initialization
|
||||
NVC_HostT &host_write=atom.host_write;
|
||||
assert(host_write.numel()>4 && host_write.numel()>ntypes*ntypes*2);
|
||||
|
||||
// Copy data for bonded interactions
|
||||
special_lj.safe_alloc(4);
|
||||
special_lj.cast_copy(host_special_lj,host_write);
|
||||
|
||||
// Copy sigma, epsilon, and cutsq onto GPU
|
||||
sigma.safe_alloc(ntypes,ntypes);
|
||||
sigma.cast_copy(host_sigma[0],host_write);
|
||||
epsilon.safe_alloc(ntypes,ntypes);
|
||||
epsilon.cast_copy(host_epsilon[0],host_write);
|
||||
cutsq.safe_alloc(ntypes,ntypes);
|
||||
cutsq.cast_copy(host_cutsq[0],host_write);
|
||||
|
||||
// If atom type constants fit in shared memory use fast kernel
|
||||
int lj_types=ntypes;
|
||||
shared_types=false;
|
||||
if (lj_types<=MAX_SHARED_TYPES) {
|
||||
lj_types=MAX_SHARED_TYPES;
|
||||
shared_types=true;
|
||||
}
|
||||
offset.safe_alloc(lj_types,lj_types);
|
||||
offset.cast_copy2D(host_offset[0],host_write,ntypes,ntypes);
|
||||
double *t1=host_lj1[0];
|
||||
double *t2=host_lj2[0];
|
||||
for (int i=0; i<lj_types*lj_types; i++) {
|
||||
host_write[i*2]=t1[i];
|
||||
host_write[i*2+1]=t2[i];
|
||||
}
|
||||
lj1.safe_alloc(lj_types,lj_types);
|
||||
lj1.copy_2Dfrom_host(reinterpret_cast<typename cu_vec_traits<numtyp>::vec2 *> (host_write.begin()),
|
||||
ntypes,ntypes);
|
||||
t1=host_lj3[0];
|
||||
t2=host_lj4[0];
|
||||
for (int i=0; i<lj_types*lj_types; i++) {
|
||||
host_write[i*2]=t1[i];
|
||||
host_write[i*2+1]=t2[i];
|
||||
}
|
||||
lj3.safe_alloc(lj_types,lj_types);
|
||||
lj3.copy_2Dfrom_host(reinterpret_cast<typename cu_vec_traits<numtyp>::vec2 *> (host_write.begin()),
|
||||
ntypes,ntypes);
|
||||
|
||||
// Bind constant data to textures
|
||||
sigma_bind_texture<numtyp>(sigma);
|
||||
epsilon_bind_texture<numtyp>(epsilon);
|
||||
cutsq_bind_texture<numtyp>(cutsq);
|
||||
offset_bind_texture<numtyp>(offset);
|
||||
lj1_bind_texture<typename cu_vec_traits<numtyp>::vec2>(lj1);
|
||||
lj3_bind_texture<typename cu_vec_traits<numtyp>::vec2>(lj3);
|
||||
|
||||
dev_error.safe_alloc(1);
|
||||
dev_error.zero();
|
||||
|
||||
allocated=true;
|
||||
|
||||
return nbor.host_ij.begin();
|
||||
}
|
||||
|
||||
// Free all device/host storage and destroy the pair stream.  Safe to call
// more than once (guarded by `allocated`).
template <class numtyp, class acctyp>
void LJ_GPU_MemoryT::clear() {
  if (!allocated)
    return;
  allocated=false;

  // Check for any pair style specific errors here
  // NOTE(review): err_flag is fetched but never examined in this class;
  // derived classes (e.g. GB) read it before calling this clear().
  int err_flag;
  dev_error.copy_to_host(&err_flag);

  atom.clear();
  nbor.clear();

  sigma_unbind_texture<numtyp>();
  epsilon_unbind_texture<numtyp>();
  cutsq_unbind_texture<numtyp>();
  offset_unbind_texture<numtyp>();
  lj1_unbind_texture<typename cu_vec_traits<numtyp>::vec2>();
  lj3_unbind_texture<typename cu_vec_traits<numtyp>::vec2>();

  CUDA_SAFE_CALL(cudaStreamDestroy(pair_stream));

  dev_error.clear();
  sigma.clear();
  epsilon.clear();
  special_lj.clear();
  cutsq.clear();
  offset.clear();
  lj1.clear();
  lj3.clear();
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
double LJ_GPU_MemoryT::host_memory_usage() const {
|
||||
return atom.host_memory_usage(max_atoms)+nbor.host_memory_usage()+
|
||||
sizeof(LJ_GPU_Memory<numtyp,acctyp>);
|
||||
}
|
||||
|
||||
template class LJ_GPU_Memory<PRECISION,ACC_PRECISION>;
|
|
@ -0,0 +1,88 @@
|
|||
/***************************************************************************
|
||||
lj_gpu_memory.h
|
||||
-------------------
|
||||
W. Michael Brown
|
||||
|
||||
Global variables for GPU Lennard-Jones Library
|
||||
|
||||
__________________________________________________________________________
|
||||
This file is part of the LAMMPS GPU Library
|
||||
__________________________________________________________________________
|
||||
|
||||
begin : Tue Aug 4 2009
|
||||
copyright : (C) 2009 by W. Michael Brown
|
||||
email : wmbrown@sandia.gov
|
||||
***************************************************************************/
|
||||
|
||||
/* -----------------------------------------------------------------------
|
||||
Copyright (2009) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
#ifndef LJ_GPU_MEMORY_H
|
||||
#define LJ_GPU_MEMORY_H
|
||||
|
||||
#include "nvc_device.h"
|
||||
#include "pair_gpu_atom.h"
|
||||
#include "pair_gpu_nbor.h"
|
||||
|
||||
#define BLOCK_1D 64
|
||||
#define MAX_SHARED_TYPES 8
|
||||
#define PERCENT_GPU_MEMORY 0.7
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
class LJ_GPU_Memory {
|
||||
public:
|
||||
LJ_GPU_Memory() : allocated(false) {}
|
||||
~LJ_GPU_Memory() { clear(); }
|
||||
|
||||
/// Allocate memory on host and device
|
||||
int* init(const int ij_size, const int ntypes, double **host_cutsq,
|
||||
double **host_sigma, double **host_epsilon,
|
||||
double **host_lj1, double **host_lj2, double **host_lj3,
|
||||
double **host_lj4, double **host_offset, double *host_special_lj,
|
||||
const int max_nbors, const int me);
|
||||
/// Free any memory on host and device
|
||||
void clear();
|
||||
|
||||
/// Returns memory usage on GPU per atom
|
||||
int bytes_per_atom(const int max_nbors) const;
|
||||
/// Maximum number of atoms that can be stored on GPU
|
||||
int get_max_atoms(const int gpu_bytes, const int max_nbors);
|
||||
/// Total host memory used by library
|
||||
double host_memory_usage() const;
|
||||
|
||||
// ------------------------- DATA -----------------------------
|
||||
|
||||
// Device Properties
|
||||
NVCDevice gpu;
|
||||
// Device Error Flag
|
||||
NVC_VecI dev_error;
|
||||
// Stream for asynchronous work
|
||||
cudaStream_t pair_stream;
|
||||
|
||||
// Atom Data
|
||||
PairGPUAtom<numtyp,acctyp> atom;
|
||||
// Neighbor Data
|
||||
PairGPUNbor nbor;
|
||||
|
||||
// --------------- Const Data for Atoms
|
||||
NVC_ConstMatT sigma, epsilon, cutsq, offset;
|
||||
NVC_ConstMat< typename cu_vec_traits<numtyp>::vec2 > lj1, lj3;
|
||||
NVC_VecT special_lj;
|
||||
|
||||
size_t max_atoms;
|
||||
|
||||
// Timing for pair calculation
|
||||
NVCTimer time_pair;
|
||||
|
||||
// If atom type constants fit in shared memory, use fast kernels
|
||||
bool shared_types;
|
||||
|
||||
protected:
|
||||
bool allocated;
|
||||
};
|
||||
|
||||
#endif
|
|
@ -0,0 +1,96 @@
|
|||
/***************************************************************************
|
||||
nvc_device.cu
|
||||
-------------------
|
||||
W. Michael Brown
|
||||
|
||||
Utilities for dealing with cuda devices
|
||||
|
||||
__________________________________________________________________________
|
||||
This file is part of the NVC Library
|
||||
__________________________________________________________________________
|
||||
|
||||
begin : Wed Jan 28 2009
|
||||
copyright : (C) 2009 by W. Michael Brown
|
||||
email : wmbrown@sandia.gov
|
||||
***************************************************************************/
|
||||
|
||||
/* -----------------------------------------------------------------------
|
||||
Copyright (2009) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include "nvc_macros.h"
|
||||
#include "nvc_device.h"
|
||||
|
||||
// Grabs the properties for all devices
|
||||
// Query every CUDA device and cache its properties; device 0 becomes current.
NVCDevice::NVCDevice() {
  CUDA_SAFE_CALL(cudaGetDeviceCount(&_num_devices));
  for (int dev=0; dev<_num_devices; ++dev) {
    cudaDeviceProp prop;
    CUDA_SAFE_CALL(cudaGetDeviceProperties(&prop, dev));
    // A 9999.9999 revision marks the emulation-only pseudo-device; stop there.
    if (prop.major == 9999 && prop.minor == 9999)
      break;
    _properties.push_back(prop);
  }
  _device = 0;
}
|
||||
|
||||
// Set the CUDA device to the specified device number
|
||||
// Make device `num` current.  A no-op when it already is; otherwise the
// existing context is torn down before switching.
void NVCDevice::set(int num) {
  if (_device == num)
    return;                          // already active
  cudaThreadExit();                  // release resources on the old device
  CUDA_SAFE_CALL(cudaSetDevice(num));
  _device = num;
}
|
||||
|
||||
// List all devices along with all properties
|
||||
// List all devices along with all properties.
// NOTE(review): `out` is accepted but all output goes to stdout via printf,
// as in the original; parameter kept for interface compatibility -- confirm
// whether callers expect output on `out`.
// Fixes: size_t device-property fields were printed with %u (undefined
// behavior on LP64 hosts) -- now cast and printed with %lu; cores() returns
// unsigned and is printed with %u.
void NVCDevice::print_all(ostream &out) {
  if (num_devices() == 0)
    printf("There is no device supporting CUDA\n");
  for (int i=0; i<num_devices(); ++i) {
    printf("\nDevice %d: \"%s\"\n", i, name(i).c_str());
    printf("  Revision number: %.1f\n", revision(i));
    printf("  Total amount of global memory: %.2f GB\n",
           gigabytes(i));
#if CUDART_VERSION >= 2000
    printf("  Number of multiprocessors: %d\n",
           _properties[i].multiProcessorCount);
    printf("  Number of cores: %u\n",cores(i));
#endif
    printf("  Total amount of constant memory: %lu bytes\n",
           (unsigned long)_properties[i].totalConstMem);
    printf("  Total amount of shared memory per block: %lu bytes\n",
           (unsigned long)_properties[i].sharedMemPerBlock);
    printf("  Total number of registers available per block: %d\n",
           _properties[i].regsPerBlock);
    printf("  Warp size: %d\n",
           _properties[i].warpSize);
    printf("  Maximum number of threads per block: %d\n",
           _properties[i].maxThreadsPerBlock);
    printf("  Maximum sizes of each dimension of a block: %d x %d x %d\n",
           _properties[i].maxThreadsDim[0],
           _properties[i].maxThreadsDim[1],
           _properties[i].maxThreadsDim[2]);
    printf("  Maximum sizes of each dimension of a grid: %d x %d x %d\n",
           _properties[i].maxGridSize[0],
           _properties[i].maxGridSize[1],
           _properties[i].maxGridSize[2]);
    printf("  Maximum memory pitch: %lu bytes\n",
           (unsigned long)_properties[i].memPitch);
    printf("  Texture alignment: %lu bytes\n",
           (unsigned long)_properties[i].textureAlignment);
    printf("  Clock rate: %.2f GHz\n",
           clock_rate(i));
#if CUDART_VERSION >= 2000
    printf("  Concurrent copy and execution: %s\n",
           _properties[i].deviceOverlap ? "Yes" : "No");
#endif
  }
}
|
||||
|
|
@ -0,0 +1,92 @@
|
|||
/***************************************************************************
|
||||
nvc_device.h
|
||||
-------------------
|
||||
W. Michael Brown
|
||||
|
||||
Utilities for dealing with cuda devices
|
||||
|
||||
__________________________________________________________________________
|
||||
This file is part of the NVC Library
|
||||
__________________________________________________________________________
|
||||
|
||||
begin : Wed Jan 28 2009
|
||||
copyright : (C) 2009 by W. Michael Brown
|
||||
email : wmbrown@sandia.gov
|
||||
***************************************************************************/
|
||||
|
||||
/* -----------------------------------------------------------------------
|
||||
Copyright (2009) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
#ifndef NVC_DEVICE
|
||||
#define NVC_DEVICE
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
|
||||
using namespace std;
|
||||
|
||||
/// Class for looking at device properties
|
||||
/** \note Calls to change the device outside of the class results in incorrect
|
||||
* behavior
|
||||
* \note There is no error checking for indexing past the number of devices **/
|
||||
class NVCDevice {
|
||||
public:
|
||||
/// Grabs the properties for all devices
|
||||
NVCDevice();
|
||||
|
||||
/// Return the number of devices that support CUDA
|
||||
inline int num_devices() { return _properties.size(); }
|
||||
|
||||
/// Set the CUDA device to the specified device number
|
||||
void set(int num);
|
||||
|
||||
/// Get the current device number
|
||||
inline int device_num() { return _device; }
|
||||
|
||||
/// Get the current CUDA device name
|
||||
inline string name() { return name(_device); }
|
||||
/// Get the CUDA device name
|
||||
inline string name(const int i) { return string(_properties[i].name); }
|
||||
|
||||
/// Get the number of cores in the current device
|
||||
inline unsigned cores() { return cores(_device); }
|
||||
/// Get the number of cores
|
||||
inline unsigned cores(const int i)
|
||||
{ return _properties[i].multiProcessorCount*8; }
|
||||
|
||||
/// Get the gigabytes of global memory in the current device
|
||||
inline double gigabytes() { return gigabytes(_device); }
|
||||
/// Get the gigabytes of global memory
|
||||
inline double gigabytes(const int i)
|
||||
{ return static_cast<double>(_properties[i].totalGlobalMem)/1073741824; }
|
||||
|
||||
/// Get the bytes of global memory in the current device
|
||||
inline size_t bytes() { return bytes(_device); }
|
||||
/// Get the bytes of global memory
|
||||
inline size_t bytes(const int i) { return _properties[i].totalGlobalMem; }
|
||||
|
||||
/// Return the GPGPU revision number for current device
|
||||
inline double revision() { return revision(_device); }
|
||||
/// Return the GPGPU revision number
|
||||
inline double revision(const int i)
|
||||
{ return static_cast<double>(_properties[i].minor)/10+_properties[i].major;}
|
||||
|
||||
/// Clock rate in GHz for current device
|
||||
inline double clock_rate() { return clock_rate(_device); }
|
||||
/// Clock rate in GHz
|
||||
inline double clock_rate(const int i) { return _properties[i].clockRate*1e-6;}
|
||||
|
||||
/// List all devices along with all properties
|
||||
void print_all(ostream &out);
|
||||
|
||||
private:
|
||||
int _device, _num_devices;
|
||||
vector<cudaDeviceProp> _properties;
|
||||
};
|
||||
|
||||
#endif
|
|
@ -0,0 +1,31 @@
|
|||
/***************************************************************************
|
||||
nvc_get_devices.h
|
||||
-------------------
|
||||
W. Michael Brown
|
||||
|
||||
List properties of cuda devices
|
||||
|
||||
__________________________________________________________________________
|
||||
This file is part of the NVC Library
|
||||
__________________________________________________________________________
|
||||
|
||||
begin : Wed Jan 28 2009
|
||||
copyright : (C) 2009 by W. Michael Brown
|
||||
email : wmbrown@sandia.gov
|
||||
***************************************************************************/
|
||||
|
||||
/* -----------------------------------------------------------------------
|
||||
Copyright (2009) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
#include "nvc_device.h"
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
NVCDevice gpu;
|
||||
gpu.print_all(cout);
|
||||
return 0;
|
||||
}
|
||||
|
|
@ -0,0 +1,121 @@
|
|||
#ifndef NVC_MACROS_H
#define NVC_MACROS_H

#include <stdio.h>
#include <stdlib.h>  /* exit(), EXIT_FAILURE -- used by the checked-call macros */
#include "math_constants.h"

/* 24-bit multiply; faster than a full 32-bit multiply on early CUDA parts. */
#define INT_MUL(x,y) (__mul24(x,y))
//#define INT_MUL(x,y) ((x)*(y))

/* Positive infinity constant for the device numeric type in use. */
template <class numbr>
static __inline__ __device__ numbr cuda_inf() { return CUDART_INF_F; }

#ifdef CUDA_DOUBLE
template <>
static __inline__ __device__ double cuda_inf<double>() { return CUDART_INF; }
#endif

/* Zero constant for the device numeric type in use. */
template <class numbr>
static __inline__ __device__ numbr cuda_zero() { return 0.0; }

template <>
static __inline__ __device__ float cuda_zero<float>() { return 0.0f; }

#ifdef DEBUG

/* All checked-call macros are wrapped in do { } while (0) so they behave
 * as a single statement inside if/else bodies.  Fixes: CUT_SAFE_CALL and
 * CUT_CONDITION were bare blocks, and CUT_SAFE_MALLOC carried a trailing
 * semicolon after while(0) -- both break `if (x) MACRO(y); else ...`. */

# define CU_SAFE_CALL_NO_SYNC( call ) do {                                   \
    CUresult err = call;                                                     \
    if( CUDA_SUCCESS != err) {                                               \
        fprintf(stderr, "Cuda driver error %x in file '%s' in line %i.\n",   \
                err, __FILE__, __LINE__ );                                   \
        exit(EXIT_FAILURE);                                                  \
    } } while (0)

# define CU_SAFE_CALL( call ) do {                                           \
    CU_SAFE_CALL_NO_SYNC(call);                                              \
    CUresult err = cuCtxSynchronize();                                       \
    if( CUDA_SUCCESS != err) {                                               \
        fprintf(stderr, "Cuda driver error %x in file '%s' in line %i.\n",   \
                err, __FILE__, __LINE__ );                                   \
        exit(EXIT_FAILURE);                                                  \
    } } while (0)

# define CUDA_SAFE_CALL_NO_SYNC( call) do {                                  \
    cudaError err = call;                                                    \
    if( cudaSuccess != err) {                                                \
        fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n",        \
                __FILE__, __LINE__, cudaGetErrorString( err) );              \
        exit(EXIT_FAILURE);                                                  \
    } } while (0)

# define CUDA_SAFE_CALL( call) do {                                          \
    CUDA_SAFE_CALL_NO_SYNC(call);                                            \
    cudaError err = cudaThreadSynchronize();                                 \
    if( cudaSuccess != err) {                                                \
        fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n",        \
                __FILE__, __LINE__, cudaGetErrorString( err) );              \
        exit(EXIT_FAILURE);                                                  \
    } } while (0)

# define CUFFT_SAFE_CALL( call) do {                                         \
    cufftResult err = call;                                                  \
    if( CUFFT_SUCCESS != err) {                                              \
        fprintf(stderr, "CUFFT error in file '%s' in line %i.\n",            \
                __FILE__, __LINE__);                                         \
        exit(EXIT_FAILURE);                                                  \
    } } while (0)

# define CUT_SAFE_CALL( call) do {                                           \
    if( CUTTrue != call) {                                                   \
        fprintf(stderr, "Cut error in file '%s' in line %i.\n",              \
                __FILE__, __LINE__);                                         \
        exit(EXIT_FAILURE);                                                  \
    } } while (0)

//! Check for CUDA error
# define CUT_CHECK_ERROR(errorMessage) do {                                  \
    cudaError_t err = cudaGetLastError();                                    \
    if( cudaSuccess != err) {                                                \
        fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n",    \
                errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
        exit(EXIT_FAILURE);                                                  \
    }                                                                        \
    err = cudaThreadSynchronize();                                           \
    if( cudaSuccess != err) {                                                \
        fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n",    \
                errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
        exit(EXIT_FAILURE);                                                  \
    } } while (0)

//! Check for malloc error
# define CUT_SAFE_MALLOC( mallocCall ) do{                                   \
    if( !(mallocCall)) {                                                     \
        fprintf(stderr, "Host malloc failure in file '%s' in line %i\n",     \
                __FILE__, __LINE__);                                         \
        exit(EXIT_FAILURE);                                                  \
    } } while(0)

//! Check if conditon is true (flexible assert)
# define CUT_CONDITION( val) do {                                            \
    if( CUTFalse == cutCheckCondition( val, __FILE__, __LINE__)) {           \
        exit(EXIT_FAILURE);                                                  \
    } } while (0)

#else  // not DEBUG

#define CUT_BANK_CHECKER( array, index) array[index]

// void macros for performance reasons
# define CUT_CHECK_ERROR(errorMessage)
# define CUT_CHECK_ERROR_GL()
# define CUT_CONDITION( val)
# define CU_SAFE_CALL_NO_SYNC( call) call
# define CU_SAFE_CALL( call) call
# define CUDA_SAFE_CALL_NO_SYNC( call) call
# define CUDA_SAFE_CALL( call) call
# define CUT_SAFE_CALL( call) call
# define CUFFT_SAFE_CALL( call) call
# define CUT_SAFE_MALLOC( mallocCall ) mallocCall

#endif

#endif
|
|
@ -0,0 +1,447 @@
|
|||
/***************************************************************************
|
||||
nvc_memory.h
|
||||
-------------------
|
||||
W. Michael Brown
|
||||
|
||||
Routines for memory management on CUDA devices
|
||||
|
||||
__________________________________________________________________________
|
||||
This file is part of the NVC Library
|
||||
__________________________________________________________________________
|
||||
|
||||
begin : Thu Jun 25 2009
|
||||
copyright : (C) 2009 by W. Michael Brown
|
||||
email : wmbrown@sandia.gov
|
||||
***************************************************************************/
|
||||
|
||||
/* -----------------------------------------------------------------------
|
||||
Copyright (2009) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
#ifndef NVC_MEMORY_H
|
||||
#define NVC_MEMORY_H
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#define NVC_HostT NVC_Host<numtyp>
|
||||
#define NVC_HostD NVC_Host<double>
|
||||
#define NVC_HostS NVC_Host<float>
|
||||
#define NVC_HostI NVC_Host<int>
|
||||
|
||||
#define NVC_VecT NVC_Vec<numtyp>
|
||||
#define NVC_VecD NVC_Vec<double>
|
||||
#define NVC_VecS NVC_Vec<float>
|
||||
#define NVC_VecI NVC_Vec<int>
|
||||
#define NVC_VecI2 NVC_Vec<int2>
|
||||
#define NVC_VecU2 NVC_Vec<uint2>
|
||||
|
||||
#define NVC_MatT NVC_Mat<numtyp>
|
||||
#define NVC_MatD NVC_Mat<double>
|
||||
#define NVC_MatS NVC_Mat<float>
|
||||
#define NVC_MatI NVC_Mat<int>
|
||||
|
||||
#define NVC_ConstMatT NVC_ConstMat<numtyp>
|
||||
#define NVC_ConstMatD NVC_ConstMat<double>
|
||||
#define NVC_ConstMatS NVC_ConstMat<float>
|
||||
#define NVC_ConstMatI NVC_ConstMat<int>
|
||||
#define NVC_ConstMatD2 NVC_ConstMat<double2>
|
||||
|
||||
namespace NVC {
|
||||
|
||||
// Get a channel for float array
|
||||
template <class numtyp>
|
||||
inline void cuda_gb_get_channel(cudaChannelFormatDesc &channel) {
|
||||
channel = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
|
||||
}
|
||||
|
||||
// Get a channel for float2 array
|
||||
template <>
|
||||
inline void cuda_gb_get_channel<float2>(cudaChannelFormatDesc &channel) {
|
||||
channel = cudaCreateChannelDesc(32, 32, 0, 0, cudaChannelFormatKindFloat);
|
||||
}
|
||||
|
||||
// Get a channel for double array
|
||||
template <>
|
||||
inline void cuda_gb_get_channel<double>(cudaChannelFormatDesc &channel) {
|
||||
channel = cudaCreateChannelDesc(32, 32, 0, 0, cudaChannelFormatKindSigned);
|
||||
}
|
||||
|
||||
// Get a channel for double array
|
||||
template <>
|
||||
inline void cuda_gb_get_channel<double2>(cudaChannelFormatDesc &channel) {
|
||||
channel = cudaCreateChannelDesc(32, 32, 32, 32, cudaChannelFormatKindSigned);
|
||||
}
|
||||
|
||||
// Get a channel for int array
|
||||
template <>
|
||||
inline void cuda_gb_get_channel<int>(cudaChannelFormatDesc &channel) {
|
||||
channel = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindSigned);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/// Page-locked Row Vector on Host
|
||||
template <class numtyp>
|
||||
class NVC_Host {
|
||||
public:
|
||||
NVC_Host() { _cols=0; }
|
||||
~NVC_Host() { if (_cols>0) CUDA_SAFE_CALL(cudaFreeHost(_array)); }
|
||||
|
||||
// Allocate page-locked memory with fast write/slow read on host
|
||||
inline void safe_alloc_w(const size_t cols) {
|
||||
_cols=cols;
|
||||
_row_bytes=cols*sizeof(numtyp);
|
||||
CUDA_SAFE_CALL(cudaHostAlloc((void **)&_array,_row_bytes,
|
||||
cudaHostAllocWriteCombined));
|
||||
_end=_array+cols;
|
||||
}
|
||||
|
||||
// Allocate page-locked memory with fast read/write on host
|
||||
inline void safe_alloc_rw(const size_t cols) {
|
||||
_cols=cols;
|
||||
_row_bytes=cols*sizeof(numtyp);
|
||||
CUDA_SAFE_CALL(cudaMallocHost((void **)&_array,_row_bytes));
|
||||
_end=_array+cols;
|
||||
}
|
||||
|
||||
/// Free any memory associated with device
|
||||
inline void clear()
|
||||
{ if (_cols>0) { _cols=0; CUDA_SAFE_CALL(cudaFreeHost(_array)); } }
|
||||
|
||||
/// Set each element to zero
|
||||
inline void zero() { memset(_array,0,row_bytes()); }
|
||||
|
||||
/// Set first n elements to zero
|
||||
inline void zero(const int n) { memset(_array,0,n*sizeof(numtyp)); }
|
||||
|
||||
inline numtyp * begin() { return _array; }
|
||||
inline const numtyp * begin() const { return _array; }
|
||||
inline numtyp * end() { return _end; }
|
||||
inline const numtyp * end() const { return _end; }
|
||||
|
||||
inline size_t numel() const { return _cols; }
|
||||
inline size_t rows() const { return 1; }
|
||||
inline size_t cols() const { return _cols; }
|
||||
inline size_t row_size() const { return _cols; }
|
||||
inline size_t row_bytes() const { return _row_bytes; }
|
||||
|
||||
inline numtyp & operator[](const int i) { return _array[i]; }
|
||||
inline const numtyp & operator[](const int i) const { return _array[i]; }
|
||||
|
||||
/// Copy from device (numel is not bytes)
|
||||
inline void copy_from_device(const numtyp *device_p, size_t numel) {
|
||||
CUDA_SAFE_CALL(cudaMemcpy(_array,device_p,numel*sizeof(numtyp),
|
||||
cudaMemcpyDeviceToHost));
|
||||
}
|
||||
|
||||
/// Copy to device (numel is not bytes)
|
||||
inline void copy_to_device(numtyp *device_p, size_t numel) {
|
||||
CUDA_SAFE_CALL(cudaMemcpy(device_p,_array,numel*sizeof(numtyp),
|
||||
cudaMemcpyHostToDevice));
|
||||
}
|
||||
|
||||
/// Copy to 2D matrix on device (numel is not bytes)
|
||||
inline void copy_to_2Ddevice(numtyp *device_p, const size_t dev_row_size,
|
||||
const size_t rows, const size_t cols) {
|
||||
CUDA_SAFE_CALL(cudaMemcpy2D(device_p,dev_row_size*sizeof(numtyp),
|
||||
_array,cols*sizeof(numtyp),
|
||||
cols*sizeof(numtyp),rows,
|
||||
cudaMemcpyHostToDevice));
|
||||
}
|
||||
|
||||
/// Asynchronous copy from device (numel is not bytes)
|
||||
inline void copy_from_device(const numtyp *device_p, size_t numel,
|
||||
cudaStream_t &stream) {
|
||||
CUDA_SAFE_CALL(cudaMemcpyAsync(_array,device_p,numel*sizeof(numtyp),
|
||||
cudaMemcpyDeviceToHost,stream));
|
||||
}
|
||||
|
||||
/// Asynchronous copy to device (numel is not bytes)
|
||||
inline void copy_to_device(numtyp *device_p, size_t numel,
|
||||
cudaStream_t &stream) {
|
||||
CUDA_SAFE_CALL(cudaMemcpyAsync(device_p,_array,numel*sizeof(numtyp),
|
||||
cudaMemcpyHostToDevice,stream));
|
||||
}
|
||||
|
||||
/// Asynchronous copy to 2D matrix on device (numel is not bytes)
|
||||
inline void copy_to_2Ddevice(numtyp *device_p, const size_t dev_row_size,
|
||||
const size_t rows, const size_t cols,
|
||||
cudaStream_t &stream) {
|
||||
CUDA_SAFE_CALL(cudaMemcpy2DAsync(device_p,dev_row_size*sizeof(numtyp),
|
||||
_array,cols*sizeof(numtyp),
|
||||
cols*sizeof(numtyp),rows,
|
||||
cudaMemcpyHostToDevice,stream));
|
||||
}
|
||||
|
||||
private:
|
||||
numtyp *_array, *_end;
|
||||
size_t _row_bytes, _row_size, _rows, _cols;
|
||||
};
|
||||
|
||||
/// Row vector on device
|
||||
template <class numtyp>
|
||||
class NVC_Vec {
|
||||
public:
|
||||
NVC_Vec() { _cols=0; }
|
||||
~NVC_Vec() { if (_cols>0) CUDA_SAFE_CALL(cudaFree(_array)); }
|
||||
|
||||
// Row vector on device
|
||||
inline void safe_alloc(const size_t cols) {
|
||||
_cols=cols;
|
||||
_row_bytes=cols*sizeof(numtyp);
|
||||
CUDA_SAFE_CALL(cudaMalloc((void **)&_array,_row_bytes));
|
||||
_end=_array+cols;
|
||||
}
|
||||
|
||||
/// Free any memory associated with device
|
||||
inline void clear()
|
||||
{ if (_cols>0) { _cols=0; CUDA_SAFE_CALL(cudaFree(_array)); } }
|
||||
|
||||
/// Set each element to zero
|
||||
inline void zero() { CUDA_SAFE_CALL(cudaMemset(_array,0,row_bytes())); }
|
||||
|
||||
inline numtyp * begin() { return _array; }
|
||||
inline const numtyp * begin() const { return _array; }
|
||||
inline numtyp * end() { return _end; }
|
||||
inline const numtyp * end() const { return _end; }
|
||||
|
||||
inline size_t numel() const { return _cols; }
|
||||
inline size_t rows() const { return 1; }
|
||||
inline size_t cols() const { return _cols; }
|
||||
inline size_t row_size() const { return _cols; }
|
||||
inline size_t row_bytes() const { return _row_bytes; }
|
||||
|
||||
/// Copy from host
|
||||
inline void copy_from_host(const numtyp *host_p)
|
||||
{ CUDA_SAFE_CALL(cudaMemcpy(_array,host_p,row_bytes(),
|
||||
cudaMemcpyHostToDevice)); }
|
||||
|
||||
/// Asynchronous copy from host
|
||||
inline void copy_from_host(const numtyp *host_p, cudaStream_t &stream)
|
||||
{ CUDA_SAFE_CALL(cudaMemcpyAsync(_array,host_p,row_bytes(),
|
||||
cudaMemcpyHostToDevice, stream)); }
|
||||
|
||||
/// Copy to host
|
||||
inline void copy_to_host(numtyp *host_p)
|
||||
{ CUDA_SAFE_CALL(cudaMemcpy(host_p,_array,row_bytes(),
|
||||
cudaMemcpyDeviceToHost)); }
|
||||
|
||||
/// Copy n elements to host
|
||||
inline void copy_to_host(numtyp *host_p, const int n)
|
||||
{ CUDA_SAFE_CALL(cudaMemcpy(host_p,_array,n*sizeof(numtyp),
|
||||
cudaMemcpyDeviceToHost)); }
|
||||
|
||||
/// Cast and then copy to device
|
||||
template <class numtyp2>
|
||||
inline void cast_copy(const numtyp2 *buffer, NVC_HostT &host_write) {
|
||||
for (int i=0; i<numel(); i++)
|
||||
host_write[i]=static_cast<numtyp>(buffer[i]);
|
||||
copy_from_host(host_write.begin());
|
||||
}
|
||||
|
||||
/// Bind to texture
|
||||
template <class texture>
|
||||
inline void bind_texture(texture &texi, cudaChannelFormatDesc &channel) {
|
||||
NVC::cuda_gb_get_channel<numtyp>(channel);
|
||||
texi.addressMode[0] = cudaAddressModeClamp;
|
||||
texi.addressMode[1] = cudaAddressModeClamp;
|
||||
texi.filterMode = cudaFilterModePoint;
|
||||
texi.normalized = false;
|
||||
CUDA_SAFE_CALL(cudaBindTexture(NULL,&texi,_array,&channel));
|
||||
}
|
||||
|
||||
/// Output the vector (debugging)
|
||||
inline void print(std::ostream &out) { print (out, numel()); }
|
||||
|
||||
// Output first n elements of vector
|
||||
inline void print(std::ostream &out, const int n) {
|
||||
numtyp *t=new numtyp[n];
|
||||
copy_to_host(t,n);
|
||||
for (int i=0; i<n; i++)
|
||||
out << t[i] << " ";
|
||||
delete []t;
|
||||
}
|
||||
|
||||
private:
|
||||
numtyp *_array, *_end;
|
||||
size_t _row_bytes, _row_size, _rows, _cols;
|
||||
};
|
||||
|
||||
/// 2D Matrix on device (can have extra column storage to get correct alignment)
|
||||
template <class numtyp>
|
||||
class NVC_Mat {
|
||||
public:
|
||||
NVC_Mat() { _rows=0; }
|
||||
~NVC_Mat() { if (_rows>0) CUDA_SAFE_CALL(cudaFree(_array)); }
|
||||
|
||||
// Row major matrix on device
|
||||
// - Coalesced access using adjacent cols on same row
|
||||
// - NVC_Mat(row,col) given by array[row*row_size()+col]
|
||||
inline void safe_alloc(const size_t rows, const size_t cols) {
|
||||
_rows=rows;
|
||||
_cols=cols;
|
||||
CUDA_SAFE_CALL(cudaMallocPitch((void **)&_array,&_pitch,
|
||||
cols*sizeof(numtyp),rows));
|
||||
_row_size=_pitch/sizeof(numtyp);
|
||||
_end=_array+_row_size*cols;
|
||||
}
|
||||
|
||||
/// Free any memory associated with device
|
||||
inline void clear()
|
||||
{ if (_rows>0) { _rows=0; CUDA_SAFE_CALL(cudaFree(_array)); } }
|
||||
|
||||
/// Set each element to zero
|
||||
inline void zero() { CUDA_SAFE_CALL(cudaMemset(_array,0, _pitch*_rows)); }
|
||||
|
||||
inline numtyp * begin() { return _array; }
|
||||
inline const numtyp * begin() const { return _array; }
|
||||
inline numtyp * end() { return _end; }
|
||||
inline const numtyp * end() const { return _end; }
|
||||
|
||||
|
||||
inline size_t numel() const { return _cols*_rows; }
|
||||
inline size_t rows() const { return _rows; }
|
||||
inline size_t cols() const { return _cols; }
|
||||
inline size_t row_size() const { return _row_size; }
|
||||
inline size_t row_bytes() const { return _pitch; }
|
||||
|
||||
/// Copy from host (elements not bytes)
|
||||
inline void copy_from_host(const numtyp *host_p, const size_t numel)
|
||||
{ CUDA_SAFE_CALL(cudaMemcpy(_array,host_p,numel*sizeof(numtyp),
|
||||
cudaMemcpyHostToDevice)); }
|
||||
|
||||
/// Asynchronous copy from host (elements not bytes)
|
||||
inline void copy_from_host(const numtyp *host_p, const size_t numel,
|
||||
cudaStream_t &stream)
|
||||
{ CUDA_SAFE_CALL(cudaMemcpyAsync(_array,host_p,numel*sizeof(numtyp),
|
||||
cudaMemcpyHostToDevice, stream)); }
|
||||
|
||||
/// Asynchronous Copy from Host
|
||||
/** \note Used when the number of columns/rows allocated on host smaller than
|
||||
* on device **/
|
||||
inline void copy_2Dfrom_host(const numtyp *host_p, const size_t rows,
|
||||
const size_t cols, cudaStream_t &stream) {
|
||||
CUDA_SAFE_CALL(cudaMemcpy2DAsync(_array, _pitch, host_p,cols*sizeof(numtyp),
|
||||
cols*sizeof(numtyp), rows,
|
||||
cudaMemcpyHostToDevice,stream));
|
||||
}
|
||||
|
||||
private:
|
||||
numtyp *_array, *_end;
|
||||
size_t _pitch, _row_size, _rows, _cols;
|
||||
};
|
||||
|
||||
/// Const 2D Matrix on device (requires texture binding)
|
||||
template <class numtyp>
|
||||
class NVC_ConstMat {
|
||||
public:
|
||||
NVC_ConstMat() { _rows=0; }
|
||||
~NVC_ConstMat() { if (_rows>0) CUDA_SAFE_CALL(cudaFreeArray(_array)); }
|
||||
|
||||
/// Row major matrix on device
|
||||
inline void safe_alloc(const size_t rows, const size_t cols) {
|
||||
_rows=rows;
|
||||
_cols=cols;
|
||||
|
||||
NVC::cuda_gb_get_channel<numtyp>(_channel);
|
||||
CUDA_SAFE_CALL(cudaMallocArray(&_array, &_channel, cols, rows));
|
||||
}
|
||||
|
||||
/// Bind to texture
|
||||
template <class texture>
|
||||
inline void bind_texture(texture &texi) {
|
||||
texi.addressMode[0] = cudaAddressModeClamp;
|
||||
texi.addressMode[1] = cudaAddressModeClamp;
|
||||
texi.filterMode = cudaFilterModePoint;
|
||||
texi.normalized = false;
|
||||
CUDA_SAFE_CALL(cudaBindTextureToArray(&texi,_array,&_channel));
|
||||
}
|
||||
|
||||
/// Free any memory associated with device
|
||||
inline void clear()
|
||||
{ if (_rows>0) { _rows=0; CUDA_SAFE_CALL(cudaFreeArray(_array)); } }
|
||||
|
||||
inline size_t numel() const { return _cols*_rows; }
|
||||
inline size_t rows() const { return _rows; }
|
||||
inline size_t cols() const { return _cols; }
|
||||
inline size_t row_size() const { return _cols; }
|
||||
inline size_t row_bytes() const { return _cols*sizeof(numtyp); }
|
||||
|
||||
/// Copy from Host
|
||||
inline void copy_from_host(const numtyp *host_p) {
|
||||
CUDA_SAFE_CALL(cudaMemcpyToArray(_array, 0, 0, host_p,
|
||||
numel()*sizeof(numtyp),
|
||||
cudaMemcpyHostToDevice));
|
||||
}
|
||||
|
||||
/// Copy from Host
|
||||
/** \note Used when the number of columns/rows allocated on host smaller than
|
||||
* on device **/
|
||||
inline void copy_2Dfrom_host(const numtyp *host_p, const size_t rows,
|
||||
const size_t cols) {
|
||||
CUDA_SAFE_CALL(cudaMemcpy2DToArray(_array, 0, 0, host_p,
|
||||
cols*sizeof(numtyp), cols*sizeof(numtyp), rows,
|
||||
cudaMemcpyHostToDevice));
|
||||
}
|
||||
|
||||
/// Asynchronous Copy from Host
|
||||
inline void copy_from_host(const numtyp *host_p, cudaStream_t &stream) {
|
||||
CUDA_SAFE_CALL(cudaMemcpyToArrayAsync(_array, 0, 0, host_p,
|
||||
numel()*sizeof(numtyp),
|
||||
cudaMemcpyHostToDevice,stream));
|
||||
}
|
||||
|
||||
/// Asynchronous Copy from Host
|
||||
/** \note Used when the number of columns/rows allocated on host smaller than
|
||||
* on device **/
|
||||
inline void copy_2Dfrom_host(const numtyp *host_p, const size_t rows,
|
||||
const size_t cols, cudaStream_t &stream) {
|
||||
CUDA_SAFE_CALL(cudaMemcpy2DToArrayAsync(_array, 0, 0, host_p,
|
||||
cols*sizeof(numtyp), cols*sizeof(numtyp), rows,
|
||||
cudaMemcpyHostToDevice,stream));
|
||||
}
|
||||
|
||||
/// Cast buffer to numtyp in host_write and copy to array
|
||||
template <class numtyp2>
|
||||
inline void cast_copy(const numtyp2 *buffer, NVC_HostT &host_write) {
|
||||
int n=numel();
|
||||
for (int i=0; i<n; i++) {
|
||||
host_write[i]=static_cast<numtyp>(*buffer); buffer++;
|
||||
}
|
||||
copy_from_host(host_write.begin());
|
||||
}
|
||||
|
||||
/// Cast buffer to numtyp in host_write and copy to array
|
||||
/** \note Used when the number of columns/rows allocated on host smaller than
|
||||
* on device **/
|
||||
template <class numtyp2>
|
||||
inline void cast_copy2D(const numtyp2 *buffer, NVC_HostT &host_write,
|
||||
const size_t rows, const size_t cols) {
|
||||
int n=rows*cols;
|
||||
for (int i=0; i<n; i++) {
|
||||
host_write[i]=static_cast<numtyp>(*buffer); buffer++;
|
||||
}
|
||||
copy_2Dfrom_host(host_write.begin(),rows,cols);
|
||||
}
|
||||
|
||||
  /// Cast buffer to numtyp in host_write and copy to array asynchronously
  /** The cast into host_write happens synchronously on the CPU; only the
    * host-to-device transfer is enqueued on stream. **/
  template <class numtyp2>
  inline void cast_copy(const numtyp2 *buffer, NVC_HostT &host_write,
                        cudaStream_t &stream) {
    int n=numel();
    for (int i=0; i<n; i++) {
      host_write[i]=static_cast<numtyp>(*buffer); buffer++;
    }
    copy_from_host(host_write.begin(),stream);
  }
|
||||
|
||||
 private:
  size_t _rows, _cols;             // Allocated matrix dimensions
  cudaArray *_array;               // Underlying CUDA array (texture-bindable)
  cudaChannelFormatDesc _channel;  // Channel format used when binding textures
};
|
||||
|
||||
#endif
|
|
@ -0,0 +1,83 @@
|
|||
/***************************************************************************
|
||||
nvc_timer.h
|
||||
-------------------
|
||||
W. Michael Brown
|
||||
|
||||
Class for timing CUDA routines
|
||||
|
||||
__________________________________________________________________________
|
||||
This file is part of the NVC Library
|
||||
__________________________________________________________________________
|
||||
|
||||
begin : Tue Feb 3 2009
|
||||
copyright : (C) 2009 by W. Michael Brown
|
||||
email : wmbrown@sandia.gov
|
||||
***************************************************************************/
|
||||
|
||||
/* -----------------------------------------------------------------------
|
||||
Copyright (2009) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
#ifndef NVC_TIMER_H
|
||||
#define NVC_TIMER_H
|
||||
|
||||
#include "nvc_macros.h"

// NOTE(review): this macro compiles cudaEventDestroy calls out entirely, so
// NVCTimer's destructor intentionally leaks its event handles.  Presumably a
// workaround for destroy-after-context-teardown issues during static
// destruction — confirm before removing.
#define cudaEventDestroy(a)
|
||||
|
||||
/// Class for timing CUDA events
|
||||
class NVCTimer {
|
||||
public:
|
||||
NVCTimer() : _total_time(0.0f), initialized(false) { }
|
||||
|
||||
~NVCTimer() {
|
||||
if (initialized)
|
||||
{ cudaEventDestroy(start_event); cudaEventDestroy(stop_event); }
|
||||
}
|
||||
|
||||
inline void init() {
|
||||
if (!initialized) {
|
||||
initialized=true;
|
||||
CUDA_SAFE_CALL( cudaEventCreate(&start_event) );
|
||||
CUDA_SAFE_CALL( cudaEventCreate(&stop_event) );
|
||||
}
|
||||
}
|
||||
|
||||
/// Start timing
|
||||
inline void start() { cudaEventRecord(start_event,0); }
|
||||
|
||||
/// Stop timing and store event time
|
||||
inline void stop() { cudaEventRecord(stop_event,0); }
|
||||
|
||||
/// Set the time elapsed to zero (not the total_time)
|
||||
inline void zero()
|
||||
{ cudaEventRecord(start_event,0); cudaEventRecord(stop_event,0); }
|
||||
|
||||
/// Add time from previous start and stop to total
|
||||
/** Forces synchronization **/
|
||||
inline void add_to_total() { _total_time+=time(); }
|
||||
|
||||
/// Return the time (ms) of last start to stop - Forces synchronization
|
||||
inline double time() {
|
||||
float timer;
|
||||
cudaEventSynchronize(stop_event);
|
||||
CUDA_SAFE_CALL( cudaEventElapsedTime(&timer,start_event,stop_event) );
|
||||
return timer;
|
||||
}
|
||||
|
||||
/// Return the total time in ms
|
||||
inline double total_time() { return _total_time; }
|
||||
|
||||
/// Return the total time in seconds
|
||||
inline double total_seconds() { return _total_time/1000.0; }
|
||||
|
||||
private:
|
||||
cudaEvent_t start_event, stop_event;
|
||||
double _total_time;
|
||||
bool initialized;
|
||||
};
|
||||
|
||||
#endif
|
|
@ -0,0 +1,173 @@
|
|||
/***************************************************************************
|
||||
pair_gpu_atom.cu
|
||||
-------------------
|
||||
W. Michael Brown
|
||||
|
||||
Memory routines for moving atom and force data between host and gpu
|
||||
|
||||
__________________________________________________________________________
|
||||
This file is part of the LAMMPS GPU Library
|
||||
__________________________________________________________________________
|
||||
|
||||
begin : Tue Aug 4 2009
|
||||
copyright : (C) 2009 by W. Michael Brown
|
||||
email : wmbrown@sandia.gov
|
||||
***************************************************************************/
|
||||
|
||||
/* -----------------------------------------------------------------------
|
||||
Copyright (2009) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
#include "pair_gpu_atom.h"
|
||||
#define PairGPUAtomT PairGPUAtom<numtyp,acctyp>
|
||||
|
||||
// Device bytes needed per atom: one numtyp per atom field (input data) plus
// one acctyp per answer field (energy/virial/force output).
template <class numtyp, class acctyp>
int PairGPUAtomT::bytes_per_atom() const {
  return atom_fields()*sizeof(numtyp)+ans_fields()*sizeof(acctyp);
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void PairGPUAtomT::init(const int max_atoms) {
|
||||
if (allocated)
|
||||
clear();
|
||||
|
||||
_max_atoms=max_atoms;
|
||||
|
||||
// Initialize timers for the selected GPU
|
||||
time_atom.init();
|
||||
time_answer.init();
|
||||
|
||||
// Device matrices for atom and force data
|
||||
dev_x.safe_alloc(atom_fields(),max_atoms);
|
||||
x_bind_texture<numtyp>(dev_x);
|
||||
ans.safe_alloc(ans_fields(),max_atoms);
|
||||
|
||||
// Get a host write only buffer
|
||||
host_write.safe_alloc_w(max_atoms*4);
|
||||
// Get a host read/write buffer
|
||||
host_read.safe_alloc_rw(ans.row_size()*ans_fields());
|
||||
|
||||
allocated=true;
|
||||
}
|
||||
|
||||
// Release all host and device memory and unbind the position texture.
// Safe to call when nothing is allocated.
template <class numtyp, class acctyp>
void PairGPUAtomT::clear() {
  if (!allocated)
    return;
  allocated=false;

  // Texture must be unbound before the backing array is freed
  x_unbind_texture<numtyp>();
  ans.clear();
  host_write.clear();
  host_read.clear();
  dev_x.clear();
}
|
||||
|
||||
// Estimate of host memory used for max_atoms atoms: the write staging
// buffer, the read staging buffer, and the object itself.
template <class numtyp, class acctyp>
double PairGPUAtomT::host_memory_usage(const int max_atoms) const {
  return max_atoms*atom_fields()*sizeof(numtyp)+
         ans_fields()*(max_atoms)*sizeof(acctyp)+
         sizeof(PairGPUAtom<numtyp,acctyp>);
}
|
||||
|
||||
// Queue an asynchronous copy of the answer matrix from device to the host
// read buffer.  Only the rows actually produced are copied: 1 energy row is
// skipped when eflag is false and 6 virial rows when vflag is false.  The
// flags are latched for the later energy_virial()/add_forces() unpacking.
template <class numtyp, class acctyp>
void PairGPUAtomT::copy_answers(const bool eflag, const bool vflag,
                                cudaStream_t &s) {
  _eflag=eflag;
  _vflag=vflag;

  int csize=ans_fields();
  if (!eflag)
    csize--;          // no energy row
  if (!vflag)
    csize-=6;         // no virial rows

  host_read.copy_from_device(ans.begin(),ans.row_size()*csize,s);
}
|
||||
|
||||
// Unpack energy and virial contributions from the host read buffer into
// LAMMPS arrays.  Layout of host_read (row-major, rows padded to
// ans.row_size()): [energy row if _eflag][6 virial rows if _vflag][forces].
// Leaves _read_loc pointing at the first force row for add_forces().
// Returns the accumulated vdW energy.
template <class numtyp, class acctyp>
double PairGPUAtomT::energy_virial(const int *ilist, const bool eflag_atom,
                                   const bool vflag_atom, double *eatom,
                                   double **vatom, double *virial) {
  double evdwl=0.0;
  // Padding after the _inum valid entries of the energy row
  int gap=ans.row_size()-_inum;

  acctyp *ap=host_read.begin();
  if (_eflag) {
    if (eflag_atom) {
      for (int i=0; i<_inum; i++) {
        evdwl+=*ap;
        eatom[ilist[i]]+=*ap*0.5;    // half of pair energy per atom
        ap++;
      }
    } else
      for (int i=0; i<_inum; i++) {
        evdwl+=*ap;
        ap++;
      }
    ap+=gap;                         // skip row padding
    evdwl*=0.5;                      // device sums count each pair twice
  }
  _read_loc=ap;                      // start of virial (or force) rows
  gap=ans.row_size();                // full row stride from here on
  if (_vflag) {
    if (vflag_atom) {
      for (int ii=0; ii<_inum; ii++) {
        int i=ilist[ii];
        ap=_read_loc+ii;             // column ii, walk down the 6 rows
        for (int j=0; j<6; j++) {
          vatom[i][j]+=*ap*0.5;
          virial[j]+=*ap;
          ap+=gap;
        }
      }
    } else {
      for (int ii=0; ii<_inum; ii++) {
        ap=_read_loc+ii;
        for (int j=0; j<6; j++) {
          virial[j]+=*ap;
          ap+=gap;
        }
      }
    }
    for (int j=0; j<6; j++)
      virial[j]*=0.5;                // same double-count correction
    _read_loc+=gap*6;                // forces follow the 6 virial rows
  }

  return evdwl;
}
|
||||
|
||||
// Accumulate forces from the host read buffer into the LAMMPS force array.
// Assumes _read_loc points at the first of 3 force rows (x, y, z), as left
// by energy_virial() after copy_answers().
template <class numtyp, class acctyp>
void PairGPUAtomT::add_forces(const int *ilist, double **f) {
  int gap=ans.row_size();            // stride between component rows
  for (int ii=0; ii<_inum; ii++) {
    acctyp *ap=_read_loc+ii;         // column ii of the force block
    int i=ilist[ii];
    f[i][0]+=*ap;
    ap+=gap;
    f[i][1]+=*ap;
    ap+=gap;
    f[i][2]+=*ap;
  }
}
|
||||
|
||||
// Accumulate torques from the host read buffer into a LAMMPS array.
// Advances _read_loc past the 3 force rows first, so this must be called
// after add_forces()' data has been laid out (i.e. torques follow forces).
// n is the number of entries to unpack (may differ from _inum).
template <class numtyp, class acctyp>
void PairGPUAtomT::add_torques(const int *ilist, double **tor, const int n) {
  int gap=ans.row_size();
  _read_loc+=gap*3;                  // skip the 3 force rows
  for (int ii=0; ii<n; ii++) {
    acctyp *ap=_read_loc+ii;
    int i=ilist[ii];
    tor[i][0]+=*ap;
    ap+=gap;
    tor[i][1]+=*ap;
    ap+=gap;
    tor[i][2]+=*ap;
  }
}
|
||||
|
||||
template class PairGPUAtom<PRECISION,ACC_PRECISION>;
|
|
@ -0,0 +1,151 @@
|
|||
/***************************************************************************
|
||||
pair_gpu_atom.h
|
||||
-------------------
|
||||
W. Michael Brown
|
||||
|
||||
Memory routines for moving atom and force data between host and gpu
|
||||
|
||||
__________________________________________________________________________
|
||||
This file is part of the LAMMPS GPU Library
|
||||
__________________________________________________________________________
|
||||
|
||||
begin : Tue Aug 4 2009
|
||||
copyright : (C) 2009 by W. Michael Brown
|
||||
email : wmbrown@sandia.gov
|
||||
***************************************************************************/
|
||||
|
||||
/* -----------------------------------------------------------------------
|
||||
Copyright (2009) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
#ifndef PAIR_GPU_ATOM_H
|
||||
#define PAIR_GPU_ATOM_H
|
||||
|
||||
// PRECISION - Precision for rsq, energy, force, and torque calculation
|
||||
// ACC_PRECISION - Precision for accumulation of energies, forces, and torques
|
||||
// Single precision compute with double precision accumulation
#ifdef _SINGLE_DOUBLE
#define PRECISION float
#define ACC_PRECISION double
#endif

// Full double precision
#ifdef _DOUBLE_DOUBLE
#define PRECISION double
#define ACC_PRECISION double
#endif

// Default: single precision compute, double precision accumulation
#ifndef PRECISION
#define PRECISION float
#define ACC_PRECISION double
#endif

// Hard limit on atoms handled per allocation
#define MAX_ATOMS 65536
|
||||
#include "nvc_timer.h"
|
||||
#include "pair_gpu_texture.h"
|
||||
|
||||
/// Manages atom input data and per-atom answers (energy/virial/force/torque)
/// moving between LAMMPS host memory and the GPU.
/** numtyp is the compute precision, acctyp the accumulation precision. **/
template <class numtyp, class acctyp>
class PairGPUAtom {
 public:
  // Defaults: 4 atom fields (x,y,z,type), 10 answer fields
  // (1 energy + 6 virial + 3 force)
  PairGPUAtom() : _atom_fields(4), _ans_fields(10), allocated(false) {}
  ~PairGPUAtom() { clear(); }

  // Accessors
  inline int atom_fields() const { return _atom_fields; }
  inline int ans_fields() const { return _ans_fields; }
  inline int max_atoms() const { return _max_atoms; }
  inline int nall() const { return _nall; }
  inline int inum() const { return _inum; }

  /// Set number of atoms for future copy operations
  inline void nall(const int n) { _nall=n; }
  /// Set number of inum for future copy operations
  inline void inum(const int n) { _inum=n; }
  /// Set the number of atom fields (x, y, z, type, etc)
  inline void atom_fields(const int n) { _atom_fields=n; }
  /// Set the number of answer fields (energy, virial, force, etc.)
  inline void ans_fields(const int n) { _ans_fields=n; }

  /// Memory usage per atom in this class
  /** \note atom_fields and ans_fields should be set for correct answer **/
  int bytes_per_atom() const;

  /// Must be called once to allocate host and device memory
  /** \note atom_fields and ans_fields should be set first if not default **/
  void init(const int max_atoms);

  /// Free all memory on host and device
  void clear();

  /// Return the total amount of host memory used by class
  double host_memory_usage(const int max_atoms) const;


  // -------------------------COPY TO GPU ----------------------------------

  /// Reset the write buffer pointer (Start copying new atom data)
  inline void reset_write_buffer() { _write_loc=host_write.begin(); }

  /// Add a row to write buffer with unit stride
  /** Copies nall() elements **/
  template<class cpytyp>
  inline void add_atom_data(const cpytyp *host_ptr)
    { for (int i=0; i<_nall; i++) { *_write_loc=host_ptr[i]; _write_loc++; } }

  /// Add a row to write buffer with non-unit stride
  /** Copies nall() elements **/
  template<class cpytyp>
  inline void add_atom_data(const cpytyp *hostptr, const int stride) {
    int t=_nall*stride;
    for (int i=0; i<t; i+=stride) { *_write_loc=hostptr[i]; _write_loc++; }
  }

  /// Copy num_rows x nall() write buffer to x in GPU
  /** num_rows<=atom_fields() **/
  inline void copy_atom_data(const int num_rows, cudaStream_t &stream)
    { dev_x.copy_2Dfrom_host(host_write.begin(),num_rows,nall(),stream); }


  // -------------------------COPY FROM GPU -------------------------------

  /// Copy answers from GPU into read buffer
  void copy_answers(const bool eflag, const bool vflag, cudaStream_t &s);

  /// Copy energy and virial data into LAMMPS memory
  double energy_virial(const int *ilist, const bool eflag_atom,
                       const bool vflag_atom, double *eatom, double **vatom,
                       double *virial);

  /// Add forces from the GPU into a LAMMPS pointer
  void add_forces(const int *ilist, double **f);

  /// Add torques from the GPU into a LAMMPS pointer
  void add_torques(const int *ilist, double **tor, const int n);

  // ------------------------------ DATA ----------------------------------

  // atom_fields() x n (example: rows 1-3 position, 4 is type)
  NVC_ConstMatT dev_x;
  // ans_fields() x n Storage for Forces, etc.
  // example: if (eflag and vflag) 1 is energy, 2-7 is virial, 8-10 is force
  // example: if (!eflag) 1-3 is force
  NVC_Mat<acctyp> ans;

  // Buffer for moving floating point data to GPU
  NVC_HostT host_write;
  // Buffer for moving floating point data to CPU
  NVC_Host<acctyp> host_read;

  // Timing Stuff
  NVCTimer time_atom, time_answer;

 private:
  bool allocated, _eflag, _vflag;  // allocation state and latched flags
  int _atom_fields, _ans_fields;   // row counts for input/answer matrices
  int _max_atoms, _nall, _inum;    // capacity and current atom counts
  numtyp * _write_loc;             // cursor into host_write
  acctyp * _read_loc;              // cursor into host_read during unpacking
};
|
||||
|
||||
#endif
|
|
@ -0,0 +1,80 @@
|
|||
/***************************************************************************
|
||||
pair_gpu_nbor.cu
|
||||
-------------------
|
||||
W. Michael Brown
|
||||
|
||||
Neighbor memory operations for LAMMPS GPU Library
|
||||
|
||||
__________________________________________________________________________
|
||||
This file is part of the LAMMPS GPU Library
|
||||
__________________________________________________________________________
|
||||
|
||||
begin : Tue Aug 4 2009
|
||||
copyright : (C) 2009 by W. Michael Brown
|
||||
email : wmbrown@sandia.gov
|
||||
***************************************************************************/
|
||||
|
||||
/* -----------------------------------------------------------------------
|
||||
Copyright (2009) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
#include "pair_gpu_nbor.h"
|
||||
|
||||
// Device bytes needed per atom for neighbor storage.  Packing mode reserves
// a second copy of the neighbor list (for coalesced re-packing) plus the
// 4-row per-atom header; otherwise one list plus the 3-row header.
int PairGPUNbor::bytes_per_atom(const int max_nbors) const {
  if (_use_packing)
    return (max_nbors*2+4)*sizeof(int);
  else
    return (max_nbors+3)*sizeof(int);
}
|
||||
|
||||
// Allocate device neighbor storage and the host staging buffer.
// ij_size is the host staging capacity; max_nbors the per-atom neighbor cap.
// Re-entrant: frees any previous allocation first.
void PairGPUNbor::init(const int ij_size, const int max_atoms,
                       const int max_nbors) {
  if (allocated)
    clear();

  // Initialize timers for the selected GPU
  time_nbor.init();

  // Packing mode needs room for the re-packed neighbor lists in dev_nbor;
  // otherwise dev_nbor only holds the 3-row per-atom header (see header)
  if (_use_packing)
    dev_nbor.safe_alloc(max_nbors+4,max_atoms);
  else
    dev_nbor.safe_alloc(3,max_atoms);

  ij.safe_alloc(max_nbors*max_atoms);
  host_ij.safe_alloc_w(ij_size);

  allocated=true;
}
|
||||
|
||||
// Release all host and device neighbor memory.  Safe when unallocated.
void PairGPUNbor::clear() {
  if (!allocated)
    return;
  allocated=false;

  ij.clear();
  host_ij.clear();
  dev_nbor.clear();
}
|
||||
|
||||
// Host memory used by this class.
// NOTE(review): reports the compile-time IJ_SIZE constant, while init() sizes
// host_ij from its ij_size argument — these agree only when callers pass
// IJ_SIZE; confirm against call sites.
double PairGPUNbor::host_memory_usage() const {
  return IJ_SIZE*sizeof(int)+sizeof(PairGPUNbor);
}
|
||||
|
||||
// Reset neighbor data for a (re)build: upload the atom indices, then the
// per-atom neighbor counts and their starting offsets into the ij list.
// ij_total is reset so subsequent add() calls append from the start.
void PairGPUNbor::reset(const int inum, int *ilist, const int *numj,
                        cudaStream_t &s) {
  ij_total=0;

  // Row 0 of dev_nbor: the i indices for this build (blocking copy)
  dev_nbor.copy_from_host(ilist,inum);
  int acc=0;
  for (int i=0; i<inum; i++) {
    host_ij[i]=numj[ilist[i]];       // row 1: number of neighbors of i
    host_ij[i+inum]=acc;             // row 2: start offset of i's neighbors
    acc+=numj[ilist[i]];
  }

  // Copy the 2 staged rows into dev_nbor starting at its second row
  host_ij.copy_to_2Ddevice(dev_nbor.begin()+dev_nbor.row_size(),
                           dev_nbor.row_size(),2,inum, s);
}
|
|
@ -0,0 +1,91 @@
|
|||
/***************************************************************************
|
||||
pair_gpu_nbor.h
|
||||
-------------------
|
||||
W. Michael Brown
|
||||
|
||||
Neighbor memory operations for LAMMPS GPU Library
|
||||
|
||||
__________________________________________________________________________
|
||||
This file is part of the LAMMPS GPU Library
|
||||
__________________________________________________________________________
|
||||
|
||||
begin : Tue Aug 4 2009
|
||||
copyright : (C) 2009 by W. Michael Brown
|
||||
email : wmbrown@sandia.gov
|
||||
***************************************************************************/
|
||||
|
||||
/* -----------------------------------------------------------------------
|
||||
Copyright (2009) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
#ifndef PAIR_GPU_NBOR_H
|
||||
#define PAIR_GPU_NBOR_H
|
||||
|
||||
#include "nvc_macros.h"
|
||||
#include "nvc_timer.h"
|
||||
#include "nvc_memory.h"
|
||||
|
||||
// Threads per block for 1D neighbor kernels
#define BLOCK_1D 64
// Capacity (ints) of the host ij staging buffer
#define IJ_SIZE 131072
|
||||
|
||||
/// Manages neighbor-list data moving between LAMMPS host memory and the GPU.
class PairGPUNbor {
 public:
  PairGPUNbor() : _use_packing(false), allocated(false) {}
  ~PairGPUNbor() { clear(); }

  /// Determine whether neighbor packing should be used
  /** If true, twice as much memory is reserved to allow packing neighbors by
    * atom for coalesced access after cutoff evaluation. This can be used
    * for expensive potentials where it is more efficient to evaluate the
    * cutoff separately from the potential in order to reduce thread divergence
    * for expensive routines **/
  void packing(const bool use_packing) { _use_packing=use_packing; }

  /// Called once to allocate memory
  void init(const int ij_size, const int max_atoms, const int max_nbors);

  /// Free all memory on host and device
  void clear();

  /// Bytes per atom used on device
  int bytes_per_atom(const int max_nbors) const;
  /// Total host memory used by class
  double host_memory_usage() const;

  /// Reset neighbor data (first time or from a rebuild)
  void reset(const int inum, int *ilist, const int *numj, cudaStream_t &s);
  /// Add neighbor data from host
  /** Appends num_ij entries from host_ij to the device ij list. **/
  inline void add(const int num_ij, cudaStream_t &s)
    { host_ij.copy_to_device(ij.begin()+ij_total,num_ij,s); ij_total+=num_ij; }

  /// Pack neighbors satisfying cutoff by atom for coalesced access
  void pack_nbors(const int GX, const int BX, const int start,
                  const int inum, const int form_low, const int form_high);


  // ------------------------------- Data -------------------------------

  // Store IJ interactions on device
  NVC_VecI ij;
  // Buffer for moving ij data to GPU
  NVC_HostI host_ij;

  // --------------- Atom neighbors
  // 3 x n
  // - 1st row is i
  // - 2nd row is numj (number of neighbors)
  // - 3rd row is starting address in host_ij of neighbors
  NVC_MatI dev_nbor;

  // --------------- Timing Stuff
  NVCTimer time_nbor;

  // Number of ij entries currently stored on device (advanced by add())
  int ij_total;
 private:
  bool allocated, _use_packing;
};
|
||||
|
||||
#endif
|
|
@ -0,0 +1,292 @@
|
|||
/***************************************************************************
|
||||
pair_gpu_texture.h
|
||||
-------------------
|
||||
W. Michael Brown
|
||||
|
||||
Tricks for templating textures
|
||||
|
||||
__________________________________________________________________________
|
||||
This file is part of the LAMMPS GPU Library
|
||||
__________________________________________________________________________
|
||||
|
||||
begin : Tue Jun 23 2009
|
||||
copyright : (C) 2009 by W. Michael Brown
|
||||
email : wmbrown@sandia.gov
|
||||
***************************************************************************/
|
||||
|
||||
/* -----------------------------------------------------------------------
|
||||
Copyright (2009) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
#include "nvc_memory.h"
|
||||
|
||||
#ifndef PAIR_GPU_TEXTURE_H
|
||||
#define PAIR_GPU_TEXTURE_H
|
||||
|
||||
// Maps a scalar type to its 2-component CUDA vector type (float->float2,
// double->double2) so templated device code can name the paired type.
template <class numtyp> class cu_vec_traits;
template <> class cu_vec_traits<float> { public: typedef float2 vec2; };
template <> class cu_vec_traits<double> { public: typedef double2 vec2; };
|
||||
|
||||
// ------------------------------- x ------------------------------------
// Per-quantity pattern used throughout this header: a float texture plus an
// int2/int4 texture for doubles (doubles are stored as integer pairs and
// reassembled with __hiloint2double, since texture fetches of double are not
// supported), bind/unbind helpers templated on the scalar type, and a
// device-side fetch function.  Double fetch specializations are only
// compiled under GB_GPU_DOUBLE.

static texture<float, 2, cudaReadModeElementType> x_float_tex;
static texture<int2, 2, cudaReadModeElementType> x_double_tex;
template <class numtyp> inline void x_bind_texture(NVC_ConstMatT &mat)
  { mat.bind_texture(x_float_tex); }

template <> inline void x_bind_texture<double>(NVC_ConstMatD &mat)
  { mat.bind_texture(x_double_tex); }
template <class numtyp> inline void x_unbind_texture()
  { cudaUnbindTexture(x_float_tex); }
template <> inline void x_unbind_texture<double>()
  { cudaUnbindTexture(x_double_tex); }
// Fetch atom datum at (i,j).  NOTE(review): argument order is (i,j) here but
// (j,i) in the per-type tables below — presumably x is stored transposed
// relative to them; confirm against the kernels.
template <class numtyp>
static __inline__ __device__ numtyp _x_(const int i, const int j) {
  return tex2D(x_float_tex,i,j);
}
#ifdef GB_GPU_DOUBLE
template <>
static __inline__ __device__ double _x_<double>(const int i,const int j) {
  int2 t=tex2D(x_double_tex,i,j);
  return __hiloint2double(t.y, t.x);
}
#endif

// ------------------------------- form ------------------------------------
// Interaction form table (int); no double variant needed.

static texture<int, 2, cudaReadModeElementType> form_tex;
inline void form_bind_texture(NVC_ConstMatI &mat)
  { mat.bind_texture(form_tex); }
inline void form_unbind_texture()
  { cudaUnbindTexture(form_tex); }
static __inline__ __device__ int _form_(const int i, const int j) {
  return tex2D(form_tex,i,j);
}

// ------------------------------- lshape ------------------------------------
// Per-type 1D table; 1D textures need an explicit channel descriptor.

static texture<float, 1, cudaReadModeElementType> lshape_float_tex;
static texture<int2, 1, cudaReadModeElementType> lshape_double_tex;
static cudaChannelFormatDesc channel_lshape;
template <class numtyp> inline void lshape_bind_texture(NVC_VecT &vec)
  { vec.bind_texture(lshape_float_tex,channel_lshape); }
template <> inline void lshape_bind_texture<double>(NVC_VecD &vec)
  { vec.bind_texture(lshape_double_tex,channel_lshape); }
template <class numtyp> inline void lshape_unbind_texture()
  { cudaUnbindTexture(lshape_float_tex); }
template <> inline void lshape_unbind_texture<double>()
  { cudaUnbindTexture(lshape_double_tex); }
template <class numtyp>
static __inline__ __device__ numtyp _lshape_(const int i)
  { return tex1Dfetch(lshape_float_tex,i); }
#ifdef GB_GPU_DOUBLE
template <>
static __inline__ __device__ double _lshape_<double>(const int i) {
  int2 t=tex1Dfetch(lshape_double_tex,i);
  return __hiloint2double(t.y, t.x);
}
#endif
|
||||
|
||||
// ------------------------------- shape ------------------------------------
// 2D per-type-pair tables (shape, well, sigma, epsilon, cutsq) all follow the
// same float/int2 texture pattern described at the x section above.  Fetches
// use tex2D(...,j,i) — column/row swapped relative to _x_.

static texture<float, 2, cudaReadModeElementType> shape_float_tex;
static texture<int2, 2, cudaReadModeElementType> shape_double_tex;
template <class numtyp> inline void shape_bind_texture(NVC_ConstMatT &mat)
  { mat.bind_texture(shape_float_tex); }
template <> inline void shape_bind_texture<double>(NVC_ConstMatD &mat)
  { mat.bind_texture(shape_double_tex); }
template <class numtyp> inline void shape_unbind_texture()
  { cudaUnbindTexture(shape_float_tex); }
template <> inline void shape_unbind_texture<double>()
  { cudaUnbindTexture(shape_double_tex); }
template <class numtyp>
static __inline__ __device__ numtyp _shape_(const int i, const int j) {
  return tex2D(shape_float_tex,j,i);
}
#ifdef GB_GPU_DOUBLE
template <>
static __inline__ __device__ double _shape_<double>(const int i, const int j) {
  int2 t=tex2D(shape_double_tex,j,i);
  return __hiloint2double(t.y, t.x);
}
#endif

// ------------------------------- well ------------------------------------

static texture<float, 2, cudaReadModeElementType> well_float_tex;
static texture<int2, 2, cudaReadModeElementType> well_double_tex;
template <class numtyp> inline void well_bind_texture(NVC_ConstMatT &mat)
  { mat.bind_texture(well_float_tex); }
template <> inline void well_bind_texture<double>(NVC_ConstMatD &mat)
  { mat.bind_texture(well_double_tex); }
template <class numtyp> inline void well_unbind_texture()
  { cudaUnbindTexture(well_float_tex); }
template <> inline void well_unbind_texture<double>()
  { cudaUnbindTexture(well_double_tex); }
template <class numtyp>
static __inline__ __device__ numtyp _well_(const int i, const int j)
  { return tex2D(well_float_tex,j,i); }
#ifdef GB_GPU_DOUBLE
template <>
static __inline__ __device__ double _well_<double>(const int i,const int j) {
  int2 t=tex2D(well_double_tex,j,i);
  return __hiloint2double(t.y, t.x);
}
#endif

// ------------------------------- sigma ------------------------------------

static texture<float, 2, cudaReadModeElementType> sigma_float_tex;
static texture<int2, 2, cudaReadModeElementType> sigma_double_tex;
template <class numtyp> inline void sigma_bind_texture(NVC_ConstMatT &mat)
  { mat.bind_texture(sigma_float_tex); }
template <> inline void sigma_bind_texture<double>(NVC_ConstMatD &mat)
  { mat.bind_texture(sigma_double_tex); }
template <class numtyp> inline void sigma_unbind_texture()
  { cudaUnbindTexture(sigma_float_tex); }
template <> inline void sigma_unbind_texture<double>()
  { cudaUnbindTexture(sigma_double_tex); }
template <class numtyp>
static __inline__ __device__ numtyp _sigma_(const int i, const int j) {
  return tex2D(sigma_float_tex,j,i);
}
#ifdef GB_GPU_DOUBLE
template <>
static __inline__ __device__ double _sigma_<double>(const int i,const int j) {
  int2 t=tex2D(sigma_double_tex,j,i);
  return __hiloint2double(t.y, t.x);
}
#endif

// ------------------------------- epsilon ------------------------------------

static texture<float, 2, cudaReadModeElementType> epsilon_float_tex;
static texture<int2, 2, cudaReadModeElementType> epsilon_double_tex;
template <class numtyp> inline void epsilon_bind_texture(NVC_ConstMatT &mat)
  { mat.bind_texture(epsilon_float_tex); }
template <> inline void epsilon_bind_texture<double>(NVC_ConstMatD &mat)
  { mat.bind_texture(epsilon_double_tex); }
template <class numtyp> inline void epsilon_unbind_texture()
  { cudaUnbindTexture(epsilon_float_tex); }
template <> inline void epsilon_unbind_texture<double>()
  { cudaUnbindTexture(epsilon_double_tex); }
template <class numtyp>
static __inline__ __device__ numtyp _epsilon_(const int i, const int j) {
  return tex2D(epsilon_float_tex,j,i);
}
#ifdef GB_GPU_DOUBLE
template <>
static __inline__ __device__ double _epsilon_<double>(const int i,const int j) {
  int2 t=tex2D(epsilon_double_tex,j,i);
  return __hiloint2double(t.y, t.x);
}
#endif

// ------------------------------- cutsq ------------------------------------

static texture<float, 2, cudaReadModeElementType> cutsq_float_tex;
static texture<int2, 2, cudaReadModeElementType> cutsq_double_tex;
template <class numtyp> inline void cutsq_bind_texture(NVC_ConstMatT &mat)
  { mat.bind_texture(cutsq_float_tex); }
template <> inline void cutsq_bind_texture<double>(NVC_ConstMatD &mat)
  { mat.bind_texture(cutsq_double_tex); }
template <class numtyp> inline void cutsq_unbind_texture()
  { cudaUnbindTexture(cutsq_float_tex); }
template <> inline void cutsq_unbind_texture<double>()
  { cudaUnbindTexture(cutsq_double_tex); }
template <class numtyp>
static __inline__ __device__ numtyp _cutsq_(const int i, const int j) {
  return tex2D(cutsq_float_tex,j,i);
}
#ifdef GB_GPU_DOUBLE
template <>
static __inline__ __device__ double _cutsq_<double>(const int i,const int j) {
  int2 t=tex2D(cutsq_double_tex,j,i);
  return __hiloint2double(t.y, t.x);
}
#endif
|
||||
|
||||
// ------------------------------- lj1 ------------------------------------
|
||||
|
||||
static texture<float2, 2, cudaReadModeElementType> lj1_float_tex;
|
||||
static texture<int4, 2, cudaReadModeElementType> lj1_double_tex;
|
||||
template <class numtyp> inline void lj1_bind_texture(NVC_ConstMatT &mat)
|
||||
{ mat.bind_texture(lj1_float_tex); }
|
||||
template <> inline void lj1_bind_texture<double2>(NVC_ConstMatD2 &mat)
|
||||
{ mat.bind_texture(lj1_double_tex); }
|
||||
template <class numtyp> inline void lj1_unbind_texture()
|
||||
{ cudaUnbindTexture(lj1_float_tex); }
|
||||
template <> inline void lj1_unbind_texture<double2>()
|
||||
{ cudaUnbindTexture(lj1_double_tex); }
|
||||
template <class numtyp>
|
||||
static __inline__ __device__
|
||||
typename cu_vec_traits<numtyp>::vec2 _lj1_(const int i, const int j) {
|
||||
return tex2D(lj1_float_tex,j,i);
|
||||
}
|
||||
#ifdef GB_GPU_DOUBLE
|
||||
template <>
|
||||
static __inline__ __device__ double2 _lj1_<double>(const int i,const int j) {
|
||||
int4 t=tex2D(lj1_double_tex,j,i);
|
||||
double2 ans;
|
||||
ans.x=__hiloint2double(t.y, t.x);
|
||||
ans.y=__hiloint2double(t.w, t.z);
|
||||
return ans;
|
||||
}
|
||||
#endif
|
||||
|
||||
// ------------------------------- lj3 ------------------------------------

// Textures holding the per-type-pair lj3/lj4 coefficient vectors; same
// float2 / int4-encoded-double2 scheme as the lj1 section above.
static texture<float2, 2, cudaReadModeElementType> lj3_float_tex;
static texture<int4, 2, cudaReadModeElementType> lj3_double_tex;

// Bind the constant coefficient matrix to the texture for the active
// precision.
template <class numtyp> inline void lj3_bind_texture(NVC_ConstMatT &mat)
{ mat.bind_texture(lj3_float_tex); }
template <> inline void lj3_bind_texture<double2>(NVC_ConstMatD2 &mat)
{ mat.bind_texture(lj3_double_tex); }

// Release the texture binding for the active precision.
template <class numtyp> inline void lj3_unbind_texture()
{ cudaUnbindTexture(lj3_float_tex); }
template <> inline void lj3_unbind_texture<double2>()
{ cudaUnbindTexture(lj3_double_tex); }

// Fetch the lj3[i][j] 2-vector from texture memory; tex2D is addressed
// (x=j, y=i).
template <class numtyp>
static __inline__ __device__
typename cu_vec_traits<numtyp>::vec2 _lj3_(const int i, const int j) {
  return tex2D(lj3_float_tex,j,i);
}

#ifdef GB_GPU_DOUBLE
// Double-precision specialization: reassemble both doubles from the
// hi/lo 32-bit words of the fetched int4.
template <>
static __inline__ __device__ double2 _lj3_<double>(const int i,const int j) {
  int4 t=tex2D(lj3_double_tex,j,i);
  double2 ans;
  ans.x=__hiloint2double(t.y, t.x);
  ans.y=__hiloint2double(t.w, t.z);
  return ans;
}
#endif
|
||||
|
||||
// ------------------------------- offset ------------------------------------

// Textures holding the scalar per-type-pair energy offset; doubles are
// stored as int2 hi/lo word pairs.
static texture<float, 2, cudaReadModeElementType> offset_float_tex;
static texture<int2, 2, cudaReadModeElementType> offset_double_tex;

// Bind the constant offset matrix to the texture for the active
// precision.
template <class numtyp> inline void offset_bind_texture(NVC_ConstMatT &mat)
{ mat.bind_texture(offset_float_tex); }
template <> inline void offset_bind_texture<double>(NVC_ConstMatD &mat)
{ mat.bind_texture(offset_double_tex); }

// Release the texture binding for the active precision.
template <class numtyp> inline void offset_unbind_texture()
{ cudaUnbindTexture(offset_float_tex); }
template <> inline void offset_unbind_texture<double>()
{ cudaUnbindTexture(offset_double_tex); }

// Fetch offset[i][j] from texture memory; tex2D is addressed (x=j, y=i).
template <class numtyp>
static __inline__ __device__ numtyp _offset_(const int i, const int j) {
  return tex2D(offset_float_tex,j,i);
}

#ifdef GB_GPU_DOUBLE
// Double-precision specialization: rebuild the double from the hi/lo
// 32-bit words of the fetched int2.
template <>
static __inline__ __device__ double _offset_<double>(const int i,const int j) {
  int2 t=tex2D(offset_double_tex,j,i);
  return __hiloint2double(t.y, t.x);
}
#endif
|
||||
|
||||
#endif
|
|
@ -0,0 +1,34 @@
|
|||
# Install/unInstall package classes in LAMMPS
# edit Makefile.package to include/exclude GPU library
# Invoked by the LAMMPS package machinery with $1 = 1 (install the GPU
# package sources into src/) or $1 = 0 (remove them again).

if ($1 == 1) then

# sed -i 's/\S*gpu //' ../Makefile.package
# sed -i 's|^PKGPATH =\s*|&-L../../lib/gpu |' ../Makefile.package
# sed -i 's|^PKGLIB =\s*|&-lpair_gpu_lib |' ../Makefile.package

  # the LJ-Cut GPU pair style is always installed
  cp -f pair_lj_cut_gpu.h ../
  cp -f pair_lj_cut_gpu.cpp ../

  # the Gay-Berne GPU style requires the CPU gayberne style to be
  # present; otherwise strip its entries from the style header
  if (-e ../pair_gayberne.cpp) then
    cp -f style_gpu.h ../
    cp -f pair_gayberne_gpu.h ../
    cp -f pair_gayberne_gpu.cpp ../
  else
    grep -v erne style_gpu.h > ../style_gpu.h
  endif

else if ($1 == 0) then

# sed -i 's/\S*gpu //' ../Makefile.package

  # leave an empty style header behind and remove installed sources
  rm -f ../style_gpu.h
  touch ../style_gpu.h

  rm -f ../pair_lj_cut_gpu.h
  rm -f ../pair_lj_cut_gpu.cpp

  rm -f ../pair_gayberne_gpu.h
  rm -f ../pair_gayberne_gpu.cpp

endif
|
|
@ -0,0 +1,369 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Contributing author: Mike Brown (SNL)
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include "math.h"
|
||||
#include "stdio.h"
|
||||
#include "stdlib.h"
|
||||
#include "string.h"
|
||||
#include "pair_gayberne_gpu.h"
|
||||
#include "math_extra.h"
|
||||
#include "atom.h"
|
||||
#include "atom_vec.h"
|
||||
#include "comm.h"
|
||||
#include "force.h"
|
||||
#include "neighbor.h"
|
||||
#include "neigh_list.h"
|
||||
#include "integrate.h"
|
||||
#include "memory.h"
|
||||
#include "error.h"
|
||||
#include "neigh_request.h"
|
||||
#include "universe.h"
|
||||
|
||||
#ifdef GB_GPU_OMP
|
||||
#include "omp.h"
|
||||
#endif
|
||||
|
||||
#define MIN(a,b) ((a) < (b) ? (a) : (b))
|
||||
#define MAX(a,b) ((a) > (b) ? (a) : (b))
|
||||
|
||||
// External functions from cuda library for atom decomposition
|
||||
int * gb_gpu_init(int &ij_size, const int ntypes, const double gamma,
|
||||
const double upsilon, const double mu, double **shape,
|
||||
double **well, double **cutsq, double **sigma,
|
||||
double **epsilon, double *host_lshape, int **form,
|
||||
double **host_lj1, double **host_lj2, double **host_lj3,
|
||||
double **host_lj4, double **offset, double *special_lj,
|
||||
const int max_nbors, const int thread, const int gpu_id);
|
||||
void gb_gpu_clear(const int thread);
|
||||
int * gb_gpu_reset_nbors(const int nall, const int nlocal, const int inum,
|
||||
int *ilist, const int *numj, const int *type,
|
||||
const int thread, bool &success);
|
||||
void gb_gpu_nbors(const int num_ij, const bool eflag, const int thread);
|
||||
void gb_gpu_atom(double **host_x, double **host_quat, const int *host_type,
|
||||
const bool rebuild, const int thread);
|
||||
void gb_gpu_gayberne(const bool eflag, const bool vflag, const bool rebuild,
|
||||
const int thread);
|
||||
double gb_gpu_forces(double **f, double **tor, const int *ilist,
|
||||
const bool eflag, const bool vflag, const bool eflag_atom,
|
||||
const bool vflag_atom, double *eatom, double **vatom,
|
||||
double *virial, const int thread);
|
||||
std::string gb_gpu_name(const int i, const int max_nbors);
|
||||
void gb_gpu_time(const int thread);
|
||||
int gb_gpu_num_devices();
|
||||
double gb_gpu_bytes();
|
||||
|
||||
using namespace LAMMPS_NS;
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
// Construct the GPU Gay-Berne style; GPU resources themselves are
// acquired later in init_style().
PairGayBerneGPU::PairGayBerneGPU(LAMMPS *lmp) : PairGayBerne(lmp), my_thread(0),
                                                omp_chunk(0), nthreads(1),
                                                multi_gpu_mode(ONE_NODE),
                                                multi_gpu_param(0)
{
  // Fix: initialize every per-thread staging-buffer pointer, not just
  // slot 0.  init_style() and the destructor test ij_new[my_thread]
  // against NULL for thread indices up to MAX_GPU_THREADS, so leaving
  // slots 1..MAX_GPU_THREADS-1 uninitialized was undefined behavior in
  // the OpenMP (GB_GPU_OMP) build.
  for (int i = 0; i < MAX_GPU_THREADS; i++)
    ij_new[i] = NULL;
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
free all arrays
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
// Tear down: print accumulated GPU timing (rank 0 only), release the
// device state for each thread, and free the host staging buffers.
PairGayBerneGPU::~PairGayBerneGPU()
{
  if (comm->me == 0 && screen) {
    printf("\n\n-------------------------------------");
    printf("--------------------------------\n");
    printf(" GPU Time Stamps: ");
    printf("\n-------------------------------------");
    printf("--------------------------------\n");
    gb_gpu_time(my_thread);
    std::cout << "Procs: " << universe->nprocs << std::endl;
    printf("-------------------------------------");
    printf("--------------------------------\n\n");
  }
  #pragma omp parallel
  {
    #ifdef GB_GPU_OMP
    // shadow the member: each OpenMP thread tears down its own device
    int my_thread=omp_get_thread_num();
    #endif
    gb_gpu_clear(my_thread);
    if (ij_new[my_thread]!=NULL) {
      // Fix: the original set the pointer to NULL *before* delete[],
      // which deleted NULL and leaked the buffer.  Free first, then
      // clear the pointer.
      delete [] ij_new[my_thread];
      ij_new[my_thread]=NULL;
    }
  }
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
// Compute Gay-Berne forces/torques for this step on the GPU(s).  With
// GB_GPU_OMP each OpenMP thread drives its own device on a contiguous
// chunk of the neighbor list; otherwise one host thread drives one GPU.
void PairGayBerneGPU::compute(int eflag, int vflag)
{
  if (eflag || vflag) ev_setup(eflag,vflag);
  else evflag = vflag_fdotr = 0;
  // Per-atom virial accumulation is not implemented in the GPU kernels.
  if (vflag_atom)
    error->all("Per-atom virial not available with GPU Gay-Berne.");

  int nlocal = atom->nlocal;
  int nall = nlocal + atom->nghost;
  int inum = list->inum;

  // Neighbor data is repacked and re-uploaded only when the neighbor
  // lists were rebuilt since the previous call.
  bool rebuild=false;
  if (neighbor->ncalls > last_neighbor) {
    last_neighbor=neighbor->ncalls;
    rebuild=true;
  }

  #pragma omp parallel
  {

  bool success=true;
  #ifdef GB_GPU_OMP
  // shadow the member: each thread works on its own chunk/device
  int my_thread=omp_get_thread_num();
  if (rebuild) {
    omp_chunk=static_cast<int>(ceil(static_cast<double>(inum)/nthreads));
    // last thread takes the (possibly smaller) remainder chunk
    if (my_thread==nthreads-1)
      thread_inum[my_thread]=inum-(nthreads-1)*omp_chunk;
    else
      thread_inum[my_thread]=omp_chunk;
    olist[my_thread]=gb_gpu_reset_nbors(nall, atom->nlocal,
                                        thread_inum[my_thread],
                                        list->ilist+omp_chunk*my_thread,
                                        list->numneigh, atom->type, my_thread,
                                        success);
  }
  #else
  if (rebuild)
    olist[my_thread]=gb_gpu_reset_nbors(nall, atom->nlocal, inum, list->ilist,
                                        list->numneigh, atom->type, my_thread,
                                        success);
  #endif
  if (!success)
    error->one("Total # of atoms exceeds maximum allowed per GPGPU.\n");

  // copy atom data to GPU
  gb_gpu_atom(atom->x,atom->quat,atom->type,rebuild,my_thread);

  int i,j,ii,jj,jnum;
  double factor_lj;  // NOTE(review): unused here; special_lj is applied on the GPU
  int *jlist;

  if (rebuild==true) {
    int num_ij = 0;

    // loop over neighbors of my atoms, staging neighbor indices in the
    // host buffer and flushing each full batch to the device
    int *ijp=ij_new[my_thread];
    #ifdef GB_GPU_OMP
    int mgo=my_thread*omp_chunk;
    int mgot=mgo+thread_inum[my_thread];
    #else
    int mgo=0, mgot=inum;
    #endif
    for (ii = mgo; ii<mgot; ii++) {
      i = olist[my_thread][ii];
      jlist = list->firstneigh[i];
      jnum = list->numneigh[i];

      for (jj = 0; jj < jnum; jj++) {
        j = jlist[jj];

        *ijp=j;
        ijp++;
        num_ij++;

        // staging buffer full -- ship this batch to the device
        if (num_ij==ij_size) {
          memcpy(ij[my_thread],ij_new[my_thread],num_ij*sizeof(int));
          gb_gpu_nbors(num_ij,eflag,my_thread);
          ijp=ij_new[my_thread];
          num_ij=0;
        }
      }
    }
    // flush any remaining partial batch
    if (num_ij>0) {
      memcpy(ij[my_thread],ij_new[my_thread],num_ij*sizeof(int));
      gb_gpu_nbors(num_ij,eflag,my_thread);
    }
  }

  // launch the Gay-Berne kernel, then copy forces/torques/energy back
  gb_gpu_gayberne(eflag,vflag,rebuild,my_thread);
  double lvirial[6];
  for (int i=0; i<6; i++) lvirial[i]=0.0;
  double my_eng=gb_gpu_forces(atom->f,atom->torque,olist[my_thread],eflag,vflag,
                              eflag_atom, vflag_atom, eatom, vatom, lvirial,
                              my_thread);
  // serialize accumulation of per-thread energy/virial into the globals
  #pragma omp critical
  {
    eng_vdwl+=my_eng;
    virial[0]+=lvirial[0];
    virial[1]+=lvirial[1];
    virial[2]+=lvirial[2];
    virial[3]+=lvirial[3];
    virial[4]+=lvirial[4];
    virial[5]+=lvirial[5];
  }

  } //End parallel

  if (vflag_fdotr) virial_compute();
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
global settings
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
void PairGayBerneGPU::settings(int narg, char **arg)
|
||||
{
|
||||
if (narg != 4 && narg != 6) error->all("Illegal pair_style command");
|
||||
|
||||
// Set multi_gpu_mode to one_node for multiple gpus on 1 node
|
||||
// -- param is starting gpu id
|
||||
// Set multi_gpu_mode to one_gpu to select the same gpu id on every node
|
||||
// -- param is id of gpu
|
||||
// Set multi_gpu_mode to multi_gpu to get ma
|
||||
// -- param is number of gpus per node
|
||||
multi_gpu_mode=ONE_NODE;
|
||||
multi_gpu_param=0;
|
||||
if (narg==6) {
|
||||
multi_gpu_param=atoi(arg[5]);
|
||||
if (strcmp("one_node",arg[4])==0)
|
||||
multi_gpu_mode=ONE_NODE;
|
||||
else if (strcmp("one_gpu",arg[4])==0)
|
||||
multi_gpu_mode=ONE_GPU;
|
||||
else if (strcmp("multi_gpu",arg[4])==0) {
|
||||
multi_gpu_mode=MULTI_GPU;
|
||||
if (multi_gpu_param<1)
|
||||
error->all("Illegal pair_style command");
|
||||
} else
|
||||
error->all("Illegal pair_style command");
|
||||
narg-=2;
|
||||
}
|
||||
|
||||
PairGayBerne::settings(narg,arg);
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
init specific to this pair style
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
// Per-run initialization: pick a GPU per rank/thread, validate the atom
// style, precompute per-type shape factors, initialize the CUDA library
// on each device, and request a full neighbor list.
void PairGayBerneGPU::init_style()
{
  // Set the GPU ID for this MPI rank according to the selected mode.
  int my_gpu=comm->me+multi_gpu_param;
  int ngpus=universe->nprocs;
  if (multi_gpu_mode==ONE_GPU) {
    my_gpu=multi_gpu_param;
    ngpus=1;
  } else if (multi_gpu_mode==MULTI_GPU) {
    ngpus=multi_gpu_param;
    my_gpu=comm->me%ngpus;
    if (ngpus>universe->nprocs)
      ngpus=universe->nprocs;
  }

  if (!atom->quat_flag || !atom->torque_flag || !atom->avec->shape_type)
    error->all("Pair gayberne requires atom attributes quat, torque, shape");
  if (atom->radius_flag)
    error->all("Pair gayberne cannot be used with atom attribute diameter");

  int irequest = neighbor->request(this);

  // per-type shape precalculations (squared semi-axes and lshape factor)
  for (int i = 1; i <= atom->ntypes; i++) {
    if (setwell[i]) {
      double *one = atom->shape[i];
      shape[i][0] = one[0]*one[0];
      shape[i][1] = one[1]*one[1];
      shape[i][2] = one[2]*one[2];
      lshape[i] = (one[0]*one[1]+one[2]*one[2])*sqrt(one[0]*one[1]);
    }
  }

  // Repeat cutsq calculation because done after call to init_style
  double cut;
  for (int i = 1; i <= atom->ntypes; i++)
    for (int j = i; j <= atom->ntypes; j++) {
      cut = init_one(i,j);
      cutsq[i][j] = cutsq[j][i] = cut*cut;
    }

  // If compiled with OpenMP and only 1 proc, try to use multiple GPUs w/threads
  #ifdef GB_GPU_OMP
  if (multi_gpu_mode!=ONE_GPU)
    nthreads=ngpus=gb_gpu_num_devices();
  else
    nthreads=ngpus=1;
  if (nthreads>MAX_GPU_THREADS)
    nthreads=MAX_GPU_THREADS;
  omp_set_num_threads(nthreads);
  #endif

  // Each thread initializes its own device and (re)allocates its
  // host-side neighbor staging buffer.
  #pragma omp parallel firstprivate(my_gpu)
  {
    #ifdef GB_GPU_OMP
    int my_thread = omp_get_thread_num();
    if (multi_gpu_mode!=ONE_GPU)
      my_gpu=my_thread;
    if (multi_gpu_mode==ONE_NODE)
      my_gpu+=multi_gpu_param;
    #endif

    ij[my_thread]=gb_gpu_init(ij_size, atom->ntypes+1, gamma, upsilon, mu,
                              shape, well, cutsq, sigma, epsilon, lshape, form,
                              lj1, lj2, lj3, lj4, offset, force->special_lj,
                              neighbor->oneatom, my_thread, my_gpu);
    if (ij[my_thread]==0)
      error->one("AT LEAST ONE PROCESS COULD NOT ALLOCATE A CUDA-ENABLED GPU.");

    if (ij_new[my_thread]!=NULL)
      delete [] ij_new[my_thread];
    ij_new[my_thread]=new int[ij_size];
  }

  last_neighbor = -1;
  // GPU kernels need a full (non-half) neighbor list and no newton pair
  neighbor->requests[irequest]->half = 0;
  neighbor->requests[irequest]->full = 1;
  if (force->newton_pair)
    error->all("Cannot use newton with GPU GayBerne pair style.");

  // Report the devices in use (rank 0 only).
  if (comm->me == 0 && screen) {
    printf("\n-------------------------------------");
    printf("-------------------------------------\n");
    printf("- Using GPGPU acceleration for Gay-Berne:\n");
    printf("-------------------------------------");
    printf("-------------------------------------\n");

    for (int i=0; i<ngpus; i++) {
      int gpui=my_gpu;
      if (multi_gpu_mode==ONE_NODE)
        gpui=i+multi_gpu_param;
      else if (multi_gpu_mode==MULTI_GPU)
        gpui=i;
      std::string gpu_string=gb_gpu_name(gpui,neighbor->oneatom);
      printf("GPU %d: %s\n",gpui,gpu_string.c_str());
    }
    printf("-------------------------------------");
    printf("-------------------------------------\n\n");
  }
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
double PairGayBerneGPU::memory_usage()
|
||||
{
|
||||
double bytes=Pair::memory_usage()+nthreads*ij_size*sizeof(int);
|
||||
return bytes+gb_gpu_bytes();
|
||||
}
|
|
@ -0,0 +1,43 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#ifndef PAIR_GPU_H
|
||||
#define PAIR_GPU_H
|
||||
|
||||
#include "pair_gayberne.h"
|
||||
#define MAX_GPU_THREADS 4
|
||||
|
||||
namespace LAMMPS_NS {

// GPU-accelerated Gay-Berne pair style: inherits coefficient handling
// from the CPU PairGayBerne and offloads the force/torque computation
// to one or more CUDA devices (one per OpenMP thread with GB_GPU_OMP).
class PairGayBerneGPU : public PairGayBerne {
 public:
  PairGayBerneGPU(LAMMPS *lmp);
  ~PairGayBerneGPU();
  void compute(int, int);
  void settings(int, char **);
  void init_style();
  double memory_usage();

  // GPU-to-rank assignment modes selected in settings().
  enum { ONE_NODE, ONE_GPU, MULTI_GPU };

 private:
  int ij_size;  // capacity (in ints) of each host neighbor staging buffer
  // per-thread device neighbor buffer, host staging buffer, and the
  // ordered atom list returned by the CUDA library
  int *ij[MAX_GPU_THREADS], *ij_new[MAX_GPU_THREADS], *olist[MAX_GPU_THREADS];

  // OpenMP bookkeeping: this thread's id, thread count, per-thread list
  // sizes, and the chunk size used to split the neighbor list
  int my_thread, nthreads, thread_inum[MAX_GPU_THREADS], omp_chunk;

  // neighbor-rebuild tracking and GPU-assignment configuration
  int last_neighbor, multi_gpu_mode, multi_gpu_param;
};

}
|
||||
#endif
|
|
@ -0,0 +1,270 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Contributing author: Mike Brown (SNL)
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include "math.h"
|
||||
#include "stdio.h"
|
||||
#include "stdlib.h"
|
||||
#include "string.h"
|
||||
#include "pair_lj_cut_gpu.h"
|
||||
#include "math_extra.h"
|
||||
#include "atom.h"
|
||||
#include "atom_vec.h"
|
||||
#include "comm.h"
|
||||
#include "force.h"
|
||||
#include "neighbor.h"
|
||||
#include "neigh_list.h"
|
||||
#include "integrate.h"
|
||||
#include "memory.h"
|
||||
#include "error.h"
|
||||
#include "neigh_request.h"
|
||||
#include "universe.h"
|
||||
|
||||
#define MIN(a,b) ((a) < (b) ? (a) : (b))
|
||||
#define MAX(a,b) ((a) > (b) ? (a) : (b))
|
||||
|
||||
// External functions from cuda library for force decomposition
|
||||
int * lj_gpu_init(int &ij_size, const int ntypes, double **cutsq,
|
||||
double **sigma, double **epsilon, double **host_lj1,
|
||||
double **host_lj2, double **host_lj3, double **host_lj4,
|
||||
double **offset, double *special_lj, const int max_nbors,
|
||||
const int gpu_id);
|
||||
void lj_gpu_clear();
|
||||
bool lj_gpu_reset_nbors(const int nall, const int inum, int *ilist,
|
||||
const int *numj);
|
||||
void lj_gpu_nbors(const int num_ij);
|
||||
void lj_gpu_atom(double **host_x, const int *host_type, const bool rebuild);
|
||||
void lj_gpu(const bool eflag, const bool vflag, const bool rebuild);
|
||||
double lj_gpu_forces(double **f, const int *ilist, const bool eflag,
|
||||
const bool vflag, const bool eflag_atom,
|
||||
const bool vflag_atom, double *eatom, double **vatom,
|
||||
double *virial);
|
||||
std::string lj_gpu_name(const int gpu_id, const int max_nbors);
|
||||
void lj_gpu_time();
|
||||
int lj_gpu_num_devices();
|
||||
double lj_gpu_bytes();
|
||||
|
||||
using namespace LAMMPS_NS;
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
// Construct the GPU LJ-Cut style; device resources are acquired later
// in init_style().
PairLJCutGPU::PairLJCutGPU(LAMMPS *lmp) : PairLJCut(lmp), multi_gpu_mode(0)
{
  // rRESPA inner/outer loops are not supported by the GPU kernel
  respa_enable = 0;
  // host staging buffer is allocated in init_style()
  ij_new = NULL;
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
free all arrays
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
// Tear down: print accumulated GPU timing (rank 0 only), release the
// device state, and free the host staging buffer.
PairLJCutGPU::~PairLJCutGPU()
{
  if (comm->me == 0 && screen) {
    printf("\n\n-------------------------------------");
    printf("--------------------------------\n");
    printf(" GPU Time Stamps: ");
    printf("\n-------------------------------------");
    printf("--------------------------------\n");
    lj_gpu_time();
    std::cout << "Procs: " << universe->nprocs << std::endl;
    printf("-------------------------------------");
    printf("--------------------------------\n\n");
  }
  lj_gpu_clear();
  if (ij_new!=NULL) {
    // Fix: the original set ij_new to NULL *before* delete[], which
    // deleted NULL and leaked the buffer.  Free first, then clear.
    delete [] ij_new;
    ij_new=NULL;
  }
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
void PairLJCutGPU::compute(int eflag, int vflag)
|
||||
{
|
||||
if (eflag || vflag) ev_setup(eflag,vflag);
|
||||
else evflag = vflag_fdotr = 0;
|
||||
if (vflag_atom)
|
||||
error->all("Per-atom virial not available with GPU Gay-Berne.");
|
||||
|
||||
int nlocal = atom->nlocal;
|
||||
int nall = nlocal + atom->nghost;
|
||||
int inum = list->inum;
|
||||
int *ilist = list->ilist;
|
||||
|
||||
bool rebuild=false;
|
||||
if (neighbor->ncalls > last_neighbor) {
|
||||
last_neighbor=neighbor->ncalls;
|
||||
rebuild=true;
|
||||
}
|
||||
|
||||
// copy nbors to GPU
|
||||
if (rebuild)
|
||||
if (!lj_gpu_reset_nbors(nall, inum, ilist, list->numneigh))
|
||||
error->one("Total # of atoms exceed maximum allowed per GPGPU.\n");
|
||||
|
||||
// copy atom data to GPU
|
||||
lj_gpu_atom(atom->x,atom->type,rebuild);
|
||||
|
||||
int i,j,ii,jj,jnum;
|
||||
double factor_lj;
|
||||
int *jlist;
|
||||
|
||||
if (rebuild==true) {
|
||||
int num_ij = 0;
|
||||
|
||||
// loop over neighbors of my atoms
|
||||
int *ijp=ij_new;
|
||||
for (ii = 0; ii<inum; ii++) {
|
||||
i = ilist[ii];
|
||||
jlist = list->firstneigh[i];
|
||||
jnum = list->numneigh[i];
|
||||
|
||||
for (jj = 0; jj < jnum; jj++) {
|
||||
j = jlist[jj];
|
||||
|
||||
*ijp=j;
|
||||
ijp++;
|
||||
num_ij++;
|
||||
|
||||
if (num_ij==ij_size) {
|
||||
memcpy(ij,ij_new,num_ij*sizeof(int));
|
||||
lj_gpu_nbors(num_ij);
|
||||
ijp=ij_new;
|
||||
num_ij=0;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (num_ij>0) {
|
||||
memcpy(ij,ij_new,num_ij*sizeof(int));
|
||||
lj_gpu_nbors(num_ij);
|
||||
}
|
||||
}
|
||||
|
||||
lj_gpu(eflag,vflag,rebuild);
|
||||
eng_vdwl=lj_gpu_forces(atom->f,ilist,eflag,vflag, eflag_atom, vflag_atom,
|
||||
eatom, vatom, virial);
|
||||
|
||||
if (vflag_fdotr) virial_compute();
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
global settings
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
void PairLJCutGPU::settings(int narg, char **arg)
|
||||
{
|
||||
if (narg != 1 && narg != 3) error->all("Illegal pair_style command");
|
||||
|
||||
// Set multi_gpu_mode to one_node for multiple gpus on 1 node
|
||||
// -- param is starting gpu id
|
||||
// Set multi_gpu_mode to one_gpu to select the same gpu id on every node
|
||||
// -- param is id of gpu
|
||||
// Set multi_gpu_mode to multi_gpu to get ma
|
||||
// -- param is number of gpus per node
|
||||
multi_gpu_mode=ONE_NODE;
|
||||
multi_gpu_param=0;
|
||||
if (narg==3) {
|
||||
multi_gpu_param=atoi(arg[2]);
|
||||
if (strcmp("one_node",arg[1])==0)
|
||||
multi_gpu_mode=ONE_NODE;
|
||||
else if (strcmp("one_gpu",arg[1])==0)
|
||||
multi_gpu_mode=ONE_GPU;
|
||||
else if (strcmp("multi_gpu",arg[1])==0) {
|
||||
multi_gpu_mode=MULTI_GPU;
|
||||
if (multi_gpu_param<1)
|
||||
error->all("Illegal pair_style command");
|
||||
} else
|
||||
error->all("Illegal pair_style command");
|
||||
narg-=2;
|
||||
}
|
||||
|
||||
PairLJCut::settings(narg,arg);
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
init specific to this pair style
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
// Per-run initialization: pick a GPU for this rank, recompute cutoffs,
// initialize the CUDA library on the device, allocate the host staging
// buffer, and request a full neighbor list.
void PairLJCutGPU::init_style()
{
  // Set the GPU ID for this MPI rank according to the selected mode.
  int my_gpu=comm->me+multi_gpu_param;
  int ngpus=universe->nprocs;
  if (multi_gpu_mode==ONE_GPU) {
    my_gpu=multi_gpu_param;
    ngpus=1;
  } else if (multi_gpu_mode==MULTI_GPU) {
    ngpus=multi_gpu_param;
    my_gpu=comm->me%ngpus;
    if (ngpus>universe->nprocs)
      ngpus=universe->nprocs;
  }

  int irequest = neighbor->request(this);
  cut_respa=NULL;  // rRESPA cutoffs unused by the GPU path

  // Repeat cutsq calculation because done after call to init_style
  double cut;
  for (int i = 1; i <= atom->ntypes; i++)
    for (int j = i; j <= atom->ntypes; j++) {
      cut = init_one(i,j);
      cutsq[i][j] = cutsq[j][i] = cut*cut;
    }

  // Initialize the device and obtain the device-side neighbor buffer.
  ij=lj_gpu_init(ij_size, atom->ntypes+1, cutsq, sigma, epsilon, lj1, lj2, lj3,
                 lj4, offset, force->special_lj, neighbor->oneatom, my_gpu);
  if (ij==0)
    error->one("AT LEAST ONE PROCESS COULD NOT ALLOCATE A CUDA-ENABLED GPU.");

  // (Re)allocate the host-side staging buffer to match ij_size.
  if (ij_new!=NULL)
    delete [] ij_new;
  ij_new=new int[ij_size];

  last_neighbor = -1;
  // GPU kernel needs a full (non-half) neighbor list and no newton pair
  neighbor->requests[irequest]->half = 0;
  neighbor->requests[irequest]->full = 1;
  if (force->newton_pair)
    error->all("Cannot use newton with GPU LJCut pair style.");

  // Report the devices in use (rank 0 only).
  if (comm->me == 0 && screen) {
    printf("\n-------------------------------------");
    printf("-------------------------------------\n");
    printf("- Using GPGPU acceleration for LJ-Cut:\n");
    printf("-------------------------------------");
    printf("-------------------------------------\n");

    for (int i=0; i<ngpus; i++) {
      int gpui=my_gpu;
      if (multi_gpu_mode==ONE_NODE)
        gpui=i+multi_gpu_param;
      else if (multi_gpu_mode==MULTI_GPU)
        gpui=i;
      std::string gpu_string=lj_gpu_name(gpui,neighbor->oneatom);
      printf("GPU %d: %s\n",gpui,gpu_string.c_str());
    }
    printf("-------------------------------------");
    printf("-------------------------------------\n\n");
  }
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
double PairLJCutGPU::memory_usage()
|
||||
{
|
||||
double bytes=Pair::memory_usage()+ij_size*sizeof(int);
|
||||
return bytes+lj_gpu_bytes();
|
||||
}
|
|
@ -0,0 +1,40 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#ifndef PAIR_LJ_CUT_GPU_H
|
||||
#define PAIR_LJ_CUT_GPU_H
|
||||
|
||||
#include "pair_lj_cut.h"
|
||||
|
||||
namespace LAMMPS_NS {

// GPU-accelerated LJ-Cut pair style: inherits coefficient handling
// from the CPU PairLJCut and offloads the force computation to a
// single CUDA device per MPI rank.
class PairLJCutGPU : public PairLJCut {
 public:
  PairLJCutGPU(LAMMPS *lmp);
  ~PairLJCutGPU();
  void compute(int, int);
  void settings(int, char **);
  void init_style();
  double memory_usage();

  // GPU-to-rank assignment modes selected in settings().
  enum { ONE_NODE, ONE_GPU, MULTI_GPU };

 private:
  int ij_size;  // capacity (in ints) of the host neighbor staging buffer
  // device-side neighbor buffer and host-side staging buffer
  int *ij, *ij_new;

  // neighbor-rebuild tracking and GPU-assignment configuration
  int last_neighbor, multi_gpu_mode, multi_gpu_param;
};

}
|
||||
#endif
|
|
@ -0,0 +1,41 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
// Style registration stubs for the GPU package.  LAMMPS includes every
// style_*.h file once per category macro; all sections are empty here
// except the Pair sections, which register this package's two styles.

#ifdef AtomInclude
#endif

#ifdef AtomClass
# endif

#ifdef ComputeInclude
#endif

#ifdef ComputeClass
#endif

#ifdef FixInclude
#endif

#ifdef FixClass
#endif

#ifdef PairInclude
#include "pair_lj_cut_gpu.h"
#include "pair_gayberne_gpu.h"
#endif

#ifdef PairClass
PairStyle(lj/cut/gpu,PairLJCutGPU)
PairStyle(gayberne/gpu,PairGayBerneGPU)
#endif
|
||||
|
Loading…
Reference in New Issue