git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@7280 f3b2605a-c512-4ea7-a41b-209d697bcdaa

This commit is contained in:
sjplimp 2011-12-02 16:01:04 +00:00
parent 4fe24446de
commit 00dc2b891f
110 changed files with 0 additions and 28102 deletions

View File

@@ -1,41 +0,0 @@
# /* ----------------------------------------------------------------------
# LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
# http://lammps.sandia.gov, Sandia National Laboratories
# Steve Plimpton, sjplimp@sandia.gov
#
# Copyright (2003) Sandia Corporation. Under the terms of Contract
# DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
# certain rights in this software. This software is distributed under
# the GNU General Public License.
#
# See the README file in the top-level LAMMPS directory.
# ------------------------------------------------------------------------- */
#
# /* ----------------------------------------------------------------------
# Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
# Peng Wang (Nvidia), penwang@nvidia.com
# Paul Crozier (SNL), pscrozi@sandia.gov
# ------------------------------------------------------------------------- */
CUDA_HOME = /usr/local/cuda
NVCC = nvcc
CUDA_ARCH = -arch=sm_11
CUDA_PRECISION = -D_SINGLE_SINGLE
CUDA_INCLUDE = -I$(CUDA_HOME)/include
CUDA_LIB = -L$(CUDA_HOME)/lib64
CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math
#CUDA_OPTS = -DUNIX -g -G
CUDR_CPP = mpic++ -DMPI_GERYON -DMPICH_IGNORE_CXX_SEEK -fopenmp
CUDR_OPTS = -g -Wall -O2 -DUCL_NO_EXIT # -xHost -no-prec-div -ansi-alias
#CUDR_OPTS = -g -Wall -DUCL_SYNC_DEBUG
BIN_DIR = /home/wb8/bin
OBJ_DIR = /home/wb8/obj/lammps
LIB_DIR = /home/wb8/obj/lammps
AR = ar
BSH = /bin/sh
include Nvidia.makefile

View File

@@ -1,31 +0,0 @@
# /* ----------------------------------------------------------------------
# LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
# http://lammps.sandia.gov, Sandia National Laboratories
# Steve Plimpton, sjplimp@sandia.gov
#
# Copyright (2003) Sandia Corporation. Under the terms of Contract
# DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
# certain rights in this software. This software is distributed under
# the GNU General Public License.
#
# See the README file in the top-level LAMMPS directory.
# ------------------------------------------------------------------------- */
#
# /* ----------------------------------------------------------------------
# Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
# Peng Wang (Nvidia), penwang@nvidia.com
# Paul Crozier (SNL), pscrozi@sandia.gov
# ------------------------------------------------------------------------- */
OCL_CPP = mpic++ -O3 -DMPI_GERYON -DMPICH_IGNORE_CXX_SEEK -I/usr/local/cuda/include/
OCL_LINK = -lOpenCL
OCL_PREC = -D_SINGLE_SINGLE
BIN_DIR = /home/wb8/bin
OBJ_DIR = /home/wb8/obj/lammps
LIB_DIR = /home/wb8/obj/lammps
AR = ar
BSH = /bin/sh
include Opencl.makefile

View File

@@ -1,289 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#include "atomic_gpu_memory.h"
#define AtomicGPUMemoryT AtomicGPUMemory<numtyp, acctyp>
extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
template <class numtyp, class acctyp>
AtomicGPUMemoryT::AtomicGPUMemory() : _compiled(false), _max_bytes(0) {
device=&pair_gpu_device;
ans=new PairGPUAns<numtyp,acctyp>();
nbor=new PairGPUNbor();
}
template <class numtyp, class acctyp>
AtomicGPUMemoryT::~AtomicGPUMemory() {
delete ans;
delete nbor;
}
template <class numtyp, class acctyp>
int AtomicGPUMemoryT::bytes_per_atom_atomic(const int max_nbors) const {
return device->atom.bytes_per_atom()+ans->bytes_per_atom()+
nbor->bytes_per_atom(max_nbors);
}
template <class numtyp, class acctyp>
int AtomicGPUMemoryT::init_atomic(const int nlocal, const int nall,
const int max_nbors, const int maxspecial,
const double cell_size,
const double gpu_split, FILE *_screen,
const char *pair_program) {
nbor_time_avail=false;
screen=_screen;
bool gpu_nbor=false;
if (device->gpu_mode()==PairGPUDevice<numtyp,acctyp>::GPU_NEIGH)
gpu_nbor=true;
int _gpu_host=0;
int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
if (host_nlocal>0)
_gpu_host=1;
_threads_per_atom=device->threads_per_atom();
if (_threads_per_atom>1 && gpu_nbor==false) {
nbor->packing(true);
_nbor_data=&(nbor->dev_packed);
} else
_nbor_data=&(nbor->dev_nbor);
int success=device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor,
maxspecial,_gpu_host,max_nbors,cell_size,false);
if (success!=0)
return success;
ucl_device=device->gpu;
atom=&device->atom;
_block_size=device->pair_block_size();
compile_kernels(*ucl_device,pair_program);
// Initialize host-device load balancer
hd_balancer.init(device,gpu_nbor,gpu_split);
// Initialize timers for the selected GPU
time_pair.init(*ucl_device);
time_pair.zero();
pos_tex.bind_float(atom->dev_x,4);
_max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
return 0;
}
template <class numtyp, class acctyp>
void AtomicGPUMemoryT::estimate_gpu_overhead() {
device->estimate_gpu_overhead(1,_gpu_overhead,_driver_overhead);
}
template <class numtyp, class acctyp>
void AtomicGPUMemoryT::clear_atomic() {
// Output any timing information
acc_timers();
double avg_split=hd_balancer.all_avg_split();
_gpu_overhead*=hd_balancer.timestep();
_driver_overhead*=hd_balancer.timestep();
device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes,
_gpu_overhead,_driver_overhead,_threads_per_atom,screen);
if (_compiled) {
k_pair_fast.clear();
k_pair.clear();
delete pair_program;
_compiled=false;
}
time_pair.clear();
hd_balancer.clear();
nbor->clear();
ans->clear();
device->clear();
}
// ---------------------------------------------------------------------------
// Copy neighbor list from host
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int * AtomicGPUMemoryT::reset_nbors(const int nall, const int inum, int *ilist,
int *numj, int **firstneigh, bool &success) {
success=true;
nbor_time_avail=true;
int mn=nbor->max_nbor_loop(inum,numj,ilist);
resize_atom(inum,nall,success);
resize_local(inum,mn,success);
if (!success)
return NULL;
nbor->get_host(inum,ilist,numj,firstneigh,block_size());
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
return ilist;
}
// ---------------------------------------------------------------------------
// Build neighbor list on device
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
inline void AtomicGPUMemoryT::build_nbor_list(const int inum,
const int host_inum,
const int nall, double **host_x,
int *host_type, double *sublo,
double *subhi, int *tag,
int **nspecial, int **special,
bool &success) {
nbor_time_avail=true;
success=true;
resize_atom(inum,nall,success);
resize_local(inum,host_inum,nbor->max_nbors(),success);
if (!success)
return;
atom->cast_copy_x(host_x,host_type);
int mn;
nbor->build_nbor_list(inum, host_inum, nall, *atom, sublo, subhi, tag,
nspecial, special, success, mn);
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
}
// ---------------------------------------------------------------------------
// Copy nbor list from host if necessary and then calculate forces, virials,..
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void AtomicGPUMemoryT::compute(const int f_ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag,
const bool eatom, const bool vatom,
int &host_start, const double cpu_time,
bool &success) {
acc_timers();
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
resize_atom(0,nall,success);
zero_timers();
return;
}
int ago=hd_balancer.ago_first(f_ago);
int inum=hd_balancer.balance(ago,inum_full,cpu_time);
ans->inum(inum);
host_start=inum;
if (ago==0) {
reset_nbors(nall, inum, ilist, numj, firstneigh, success);
if (!success)
return;
}
atom->cast_x_data(host_x,host_type);
hd_balancer.start_timer();
atom->add_x_data(host_x,host_type);
loop(eflag,vflag);
ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
device->add_ans_object(ans);
hd_balancer.stop_timer();
}
// ---------------------------------------------------------------------------
// Reneighbor on GPU if necessary and then compute forces, virials, energies
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int ** AtomicGPUMemoryT::compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag,
int **nspecial, int **special, const bool eflag,
const bool vflag, const bool eatom,
const bool vatom, int &host_start,
int **ilist, int **jnum,
const double cpu_time, bool &success) {
acc_timers();
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
resize_atom(0,nall,success);
zero_timers();
return NULL;
}
hd_balancer.balance(cpu_time);
int inum=hd_balancer.get_gpu_count(ago,inum_full);
ans->inum(inum);
host_start=inum;
// Build neighbor list on GPU if necessary
if (ago==0) {
build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
sublo, subhi, tag, nspecial, special, success);
if (!success)
return NULL;
hd_balancer.start_timer();
} else {
atom->cast_x_data(host_x,host_type);
hd_balancer.start_timer();
atom->add_x_data(host_x,host_type);
}
*ilist=nbor->host_ilist.begin();
*jnum=nbor->host_acc.begin();
loop(eflag,vflag);
ans->copy_answers(eflag,vflag,eatom,vatom);
device->add_ans_object(ans);
hd_balancer.stop_timer();
return nbor->host_jlist.begin()-host_start;
}
template <class numtyp, class acctyp>
double AtomicGPUMemoryT::host_memory_usage_atomic() const {
return device->atom.host_memory_usage()+nbor->host_memory_usage()+
4*sizeof(numtyp)+sizeof(AtomicGPUMemory<numtyp,acctyp>);
}
template <class numtyp, class acctyp>
void AtomicGPUMemoryT::compile_kernels(UCL_Device &dev, const char *pair_str) {
if (_compiled)
return;
std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
std::string(OCL_PRECISION_COMPILE);
pair_program=new UCL_Program(dev);
pair_program->load_string(pair_str,flags.c_str());
k_pair_fast.set_function(*pair_program,"kernel_pair_fast");
k_pair.set_function(*pair_program,"kernel_pair");
pos_tex.get_texture(*pair_program,"pos_tex");
_compiled=true;
}
template class AtomicGPUMemory<PRECISION,ACC_PRECISION>;
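A minimal sketch of how a concrete pair style might specialize this base class; the class name LJGPUMemory and the kernel source string lj_kernel_str are hypothetical, and only init_atomic(), loop() and the kernel handles come from the interface above.
template <class numtyp, class acctyp>
class LJGPUMemory : public AtomicGPUMemory<numtyp,acctyp> {
 public:
  int init(const int nlocal, const int nall, const int max_nbors,
           const int maxspecial, const double cell_size,
           const double gpu_split, FILE *screen, const char *lj_kernel_str) {
    // forward to the base-class setup with the style's kernel source
    return this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,
                             gpu_split,screen,lj_kernel_str);
  }
 protected:
  virtual void loop(const bool eflag, const bool vflag) {
    // enqueue this->k_pair_fast or this->k_pair here with the style's
    // coefficient arrays and this->_nbor_data
  }
};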

View File

@@ -1,206 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef ATOMIC_GPU_MEMORY_H
#define ATOMIC_GPU_MEMORY_H
#include "pair_gpu_device.h"
#include "pair_gpu_balance.h"
#include "mpi.h"
#ifdef USE_OPENCL
#include "geryon/ocl_texture.h"
#else
#include "geryon/nvd_texture.h"
#endif
template <class numtyp, class acctyp>
class AtomicGPUMemory {
public:
AtomicGPUMemory();
virtual ~AtomicGPUMemory();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 if double precision is not supported on the card **/
int init_atomic(const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen,
const char *pair_program);
/// Estimate the overhead for GPU context changes and CPU driver
void estimate_gpu_overhead();
/// Check if there is enough storage for atom arrays and realloc if not
/** \param success set to false if insufficient memory **/
inline void resize_atom(const int inum, const int nall, bool &success) {
if (atom->resize(nall, success))
pos_tex.bind_float(atom->dev_x,4);
ans->resize(inum,success);
}
/// Check if there is enough storage for neighbors and realloc if not
/** \param nlocal number of particles whose nbors must be stored on device
* \param host_inum number of particles whose nbors need to be copied to host
* \param max_nbors current maximum number of neighbors
* \note olist_size=total number of local particles **/
inline void resize_local(const int inum, const int max_nbors, bool &success) {
nbor->resize(inum,max_nbors,success);
}
/// Check if there is enough storage for neighbors and realloc if not
/** \param nlocal number of particles whose nbors must be stored on device
* \param host_inum number of particles whose nbors need to be copied to host
* \param max_nbors current maximum number of neighbors
* \note host_inum is 0 if the host is performing neighboring
* \note nlocal+host_inum=total number of local particles
* \note olist_size=0 **/
inline void resize_local(const int inum, const int host_inum,
const int max_nbors, bool &success) {
nbor->resize(inum,host_inum,max_nbors,success);
}
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear_atomic();
/// Returns memory usage on device per atom
int bytes_per_atom_atomic(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage_atomic() const;
/// Accumulate timers
inline void acc_timers() {
if (device->time_device()) {
if (nbor_time_avail) {
nbor->time_nbor.add_to_total();
nbor->time_kernel.add_to_total();
nbor_time_avail=false;
}
time_pair.add_to_total();
atom->acc_timers();
ans->acc_timers();
}
}
/// Zero timers
inline void zero_timers() {
nbor_time_avail=false;
time_pair.zero();
atom->zero_timers();
ans->zero_timers();
}
/// Copy neighbor list from host
int * reset_nbors(const int nall, const int inum, int *ilist, int *numj,
int **firstneigh, bool &success);
/// Build neighbor list on device
void build_nbor_list(const int inum, const int host_inum,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, bool &success);
/// Pair loop with host neighboring
void compute(const int f_ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh, const bool eflag,
const bool vflag, const bool eatom, const bool vatom,
int &host_start, const double cpu_time, bool &success);
/// Pair loop with device neighboring
int * compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type, double *sublo,
double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success);
/// Pair loop with device neighboring
int ** compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type, double *sublo,
double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **numj, const double cpu_time, bool &success);
// -------------------------- DEVICE DATA -------------------------
/// Device Properties and Atom and Neighbor storage
PairGPUDevice<numtyp,acctyp> *device;
/// Geryon device
UCL_Device *ucl_device;
/// Device Timers
UCL_Timer time_pair;
/// Host device load balancer
PairGPUBalance<numtyp,acctyp> hd_balancer;
/// LAMMPS pointer for screen output
FILE *screen;
// --------------------------- ATOM DATA --------------------------
/// Atom Data
PairGPUAtom<numtyp,acctyp> *atom;
// ------------------------ FORCE/ENERGY DATA -----------------------
PairGPUAns<numtyp,acctyp> *ans;
// --------------------------- NBOR DATA ----------------------------
/// Neighbor data
PairGPUNbor *nbor;
/// True if we need to accumulate time for neighboring
bool nbor_time_avail;
// ------------------------- DEVICE KERNELS -------------------------
UCL_Program *pair_program;
UCL_Kernel k_pair_fast, k_pair;
inline int block_size() { return _block_size; }
// --------------------------- TEXTURES -----------------------------
UCL_Texture pos_tex;
protected:
bool _compiled;
int _block_size, _threads_per_atom;
double _max_bytes, _max_an_bytes;
double _gpu_overhead, _driver_overhead;
UCL_D_Vec<int> *_nbor_data;
void compile_kernels(UCL_Device &dev, const char *pair_string);
virtual void loop(const bool _eflag, const bool _vflag) = 0;
};
#endif
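A hedged sketch of the per-timestep call order this interface implies when the host builds the neighbor list; the object lj and the loose locals are placeholders, not part of the library.
// One timestep with CPU neighboring (placeholder object and locals):
bool success=true;
int host_start;
lj.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,
           eflag,vflag,eatom,vatom,host_start,cpu_time,success);
// atoms in [host_start,inum_full) stay on the CPU under the current split
if (!success) { /* device out of memory: abort or fall back to the CPU */ }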

View File

@@ -1,470 +0,0 @@
/***************************************************************************
base_ellipsoid.cpp
-------------------
W. Michael Brown
Base class for acceleration of ellipsoid potentials
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin : Thu May 5 2011
email : brownw@ornl.gov
***************************************************************************/
#include "base_ellipsoid.h"
using namespace LAMMPS_AL;
#ifdef USE_OPENCL
#include "ellipsoid_nbor_cl.h"
#else
#include "ellipsoid_nbor_ptx.h"
#endif
#define BaseEllipsoidT BaseEllipsoid<numtyp, acctyp>
extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
template <class numtyp, class acctyp>
BaseEllipsoidT::BaseEllipsoid() : _compiled(false), _max_bytes(0) {
device=&pair_gpu_device;
ans=new PairGPUAns<numtyp,acctyp>();
nbor=new PairGPUNbor();
}
template <class numtyp, class acctyp>
BaseEllipsoidT::~BaseEllipsoid() {
delete ans;
delete nbor;
}
template <class numtyp, class acctyp>
int BaseEllipsoidT::bytes_per_atom(const int max_nbors) const {
return device->atom.bytes_per_atom()+ans->bytes_per_atom()+
nbor->bytes_per_atom(max_nbors);
}
template <class numtyp, class acctyp>
int BaseEllipsoidT::init_base(const int nlocal, const int nall,
const int max_nbors, const int maxspecial,
const double cell_size, const double gpu_split,
FILE *_screen, const int ntypes, int **h_form,
const char *ellipsoid_program,
const char *lj_program, const bool ellip_sphere) {
nbor_time_avail=false;
screen=_screen;
_ellipsoid_sphere=ellip_sphere;
bool gpu_nbor=false;
if (device->gpu_mode()==PairGPUDevice<numtyp,acctyp>::GPU_NEIGH)
gpu_nbor=true;
int _gpu_host=0;
int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
if (host_nlocal>0)
_gpu_host=1;
_threads_per_atom=device->threads_per_charge();
int success=device->init(*ans,false,true,nlocal,host_nlocal,nall,nbor,
maxspecial,_gpu_host,max_nbors,cell_size,true);
if (success!=0)
return success;
ucl_device=device->gpu;
atom=&device->atom;
_block_size=device->pair_block_size();
compile_kernels(*ucl_device,ellipsoid_program,lj_program,ellip_sphere);
// Initialize host-device load balancer
hd_balancer.init(device,gpu_nbor,gpu_split);
// Initialize timers for the selected GPU
time_lj.init(*ucl_device);
time_nbor1.init(*ucl_device);
time_ellipsoid.init(*ucl_device);
time_nbor2.init(*ucl_device);
time_ellipsoid2.init(*ucl_device);
time_nbor3.init(*ucl_device);
time_ellipsoid3.init(*ucl_device);
time_lj.zero();
time_nbor1.zero();
time_ellipsoid.zero();
time_nbor2.zero();
time_ellipsoid2.zero();
time_nbor3.zero();
time_ellipsoid3.zero();
// See if we want fast GB-sphere or sphere-sphere calculations
_host_form=h_form;
_multiple_forms=false;
for (int i=1; i<ntypes; i++)
for (int j=i; j<ntypes; j++)
if (_host_form[i][j]!=ELLIPSE_ELLIPSE)
_multiple_forms=true;
if (_multiple_forms && host_nlocal>0) {
std::cerr << "Cannot use Gayberne with multiple forms and GPU neighbor.\n";
exit(1);
}
if (_multiple_forms)
ans->dev_ans.zero();
// Memory for ilist ordered by particle type
if (host_olist.alloc(nbor->max_atoms(),*ucl_device)!=UCL_SUCCESS)
return -3;
_max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
return 0;
}
template <class numtyp, class acctyp>
void BaseEllipsoidT::estimate_gpu_overhead() {
device->estimate_gpu_overhead(2,_gpu_overhead,_driver_overhead);
}
template <class numtyp, class acctyp>
void BaseEllipsoidT::clear_base() {
// Output any timing information
output_times();
host_olist.clear();
if (_compiled) {
k_nbor_fast.clear();
k_nbor.clear();
k_ellipsoid.clear();
k_ellipsoid_sphere.clear();
k_sphere_ellipsoid.clear();
k_lj_fast.clear();
k_lj.clear();
delete nbor_program;
delete ellipsoid_program;
delete lj_program;
_compiled=false;
}
time_nbor1.clear();
time_ellipsoid.clear();
time_nbor2.clear();
time_ellipsoid2.clear();
time_nbor3.clear();
time_ellipsoid3.clear();
time_lj.clear();
hd_balancer.clear();
nbor->clear();
ans->clear();
device->clear();
}
template <class numtyp, class acctyp>
void BaseEllipsoidT::output_times() {
// Output any timing information
acc_timers();
double single[9], times[9];
single[0]=atom->transfer_time()+ans->transfer_time();
single[1]=nbor->time_nbor.total_seconds();
single[2]=time_nbor1.total_seconds()+time_nbor2.total_seconds()+
time_nbor3.total_seconds()+nbor->time_nbor.total_seconds();
single[3]=time_ellipsoid.total_seconds()+time_ellipsoid2.total_seconds()+
time_ellipsoid3.total_seconds();
if (_multiple_forms)
single[4]=time_lj.total_seconds();
else
single[4]=0;
single[5]=atom->cast_time()+ans->cast_time();
single[6]=_gpu_overhead;
single[7]=_driver_overhead;
single[8]=ans->cpu_idle_time();
MPI_Reduce(single,times,9,MPI_DOUBLE,MPI_SUM,0,device->replica());
double avg_split=hd_balancer.all_avg_split();
_max_bytes+=atom->max_gpu_bytes();
double mpi_max_bytes;
MPI_Reduce(&_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,
device->replica());
double max_mb=mpi_max_bytes/(1024*1024);
if (device->replica_me()==0)
if (screen && times[5]>0.0) {
int replica_size=device->replica_size();
fprintf(screen,"\n\n-------------------------------------");
fprintf(screen,"--------------------------------\n");
fprintf(screen," GPU Time Info (average): ");
fprintf(screen,"\n-------------------------------------");
fprintf(screen,"--------------------------------\n");
if (device->procs_per_gpu()==1) {
fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/replica_size);
fprintf(screen,"Data Cast/Pack: %.4f s.\n",times[5]/replica_size);
fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/replica_size);
if (nbor->gpu_nbor())
fprintf(screen,"Neighbor build: %.4f s.\n",times[2]/replica_size);
else
fprintf(screen,"Neighbor unpack: %.4f s.\n",times[2]/replica_size);
fprintf(screen,"Force calc: %.4f s.\n",times[3]/replica_size);
fprintf(screen,"LJ calc: %.4f s.\n",times[4]/replica_size);
}
fprintf(screen,"GPU Overhead: %.4f s.\n",times[6]/replica_size);
fprintf(screen,"Average split: %.4f.\n",avg_split);
fprintf(screen,"Threads / atom: %d.\n",_threads_per_atom);
fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
fprintf(screen,"CPU Driver_Time: %.4f s.\n",times[7]/replica_size);
fprintf(screen,"CPU Idle_Time: %.4f s.\n",times[8]/replica_size);
fprintf(screen,"-------------------------------------");
fprintf(screen,"--------------------------------\n\n");
fprintf(screen,"Average split: %.4f.\n",avg_split);
fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
}
_max_bytes=0.0;
}
// ---------------------------------------------------------------------------
// Pack neighbors to limit thread divergence for lj-lj and ellipse
// ---------------------------------------------------------------------------
template<class numtyp, class acctyp>
void BaseEllipsoidT::pack_nbors(const int GX, const int BX, const int start,
const int inum, const int form_low,
const int form_high, const bool shared_types,
int ntypes) {
int stride=nbor->nbor_pitch();
if (shared_types) {
k_nbor_fast.set_size(GX,BX);
k_nbor_fast.run(&atom->dev_x.begin(), &cut_form.begin(),
&nbor->dev_nbor.begin(), &stride, &start, &inum,
&nbor->dev_packed.begin(), &form_low, &form_high);
} else {
k_nbor.set_size(GX,BX);
k_nbor.run(&atom->dev_x.begin(), &cut_form.begin(), &ntypes,
&nbor->dev_nbor.begin(), &stride, &start, &inum,
&nbor->dev_packed.begin(), &form_low, &form_high);
}
}
// ---------------------------------------------------------------------------
// Copy neighbor list from host
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BaseEllipsoidT::reset_nbors(const int nall, const int inum,
const int osize, int *ilist,
int *numj, int *type, int **firstneigh,
bool &success) {
success=true;
nbor_time_avail=true;
int mn=nbor->max_nbor_loop(osize,numj,ilist);
resize_atom(nall,success);
resize_local(inum,0,mn,osize,success);
if (!success)
return;
if (_multiple_forms) {
int p=0;
for (int i=0; i<osize; i++) {
int itype=type[ilist[i]];
if (_host_form[itype][itype]==ELLIPSE_ELLIPSE) {
host_olist[p]=ilist[i];
p++;
}
}
_max_last_ellipse=p;
_last_ellipse=std::min(inum,_max_last_ellipse);
for (int i=0; i<osize; i++) {
int itype=type[ilist[i]];
if (_host_form[itype][itype]!=ELLIPSE_ELLIPSE) {
host_olist[p]=ilist[i];
p++;
}
}
nbor->get_host(inum,host_olist.begin(),numj,firstneigh,block_size());
nbor->copy_unpacked(inum,mn);
return;
}
_last_ellipse=inum;
_max_last_ellipse=inum;
nbor->get_host(inum,ilist,numj,firstneigh,block_size());
nbor->copy_unpacked(inum,mn);
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
}
// ---------------------------------------------------------------------------
// Build neighbor list on device
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
inline void BaseEllipsoidT::build_nbor_list(const int inum, const int host_inum,
const int nall, double **host_x,
int *host_type, double *sublo,
double *subhi, int *tag,
int **nspecial, int **special,
bool &success) {
nbor_time_avail=true;
success=true;
resize_atom(nall,success);
resize_local(inum,host_inum,nbor->max_nbors(),0,success);
if (!success)
return;
atom->cast_copy_x(host_x,host_type);
int mn;
nbor->build_nbor_list(inum, host_inum, nall, *atom, sublo, subhi, tag,
nspecial, special, success, mn);
nbor->copy_unpacked(inum,mn);
_last_ellipse=inum;
_max_last_ellipse=inum;
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
}
// ---------------------------------------------------------------------------
// Copy nbor list from host if necessary and then calculate forces, virials,..
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int* BaseEllipsoidT::compute(const int f_ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag,
const bool eatom, const bool vatom,
int &host_start, const double cpu_time,
bool &success, double **host_quat) {
acc_timers();
if (inum_full==0) {
host_start=0;
zero_timers();
return NULL;
}
int ago=hd_balancer.ago_first(f_ago);
int inum=hd_balancer.balance(ago,inum_full,cpu_time);
ans->inum(inum);
_last_ellipse=std::min(inum,_max_last_ellipse);
host_start=inum;
if (ago==0) {
reset_nbors(nall, inum, inum_full, ilist, numj, host_type, firstneigh,
success);
if (!success)
return NULL;
}
int *list;
if (_multiple_forms)
list=host_olist.begin();
else
list=ilist;
atom->cast_x_data(host_x,host_type);
atom->cast_quat_data(host_quat[0]);
hd_balancer.start_timer();
atom->add_x_data(host_x,host_type);
atom->add_quat_data();
loop(eflag,vflag);
ans->copy_answers(eflag,vflag,eatom,vatom,list);
device->add_ans_object(ans);
hd_balancer.stop_timer();
return list;
}
// ---------------------------------------------------------------------------
// Reneighbor on GPU if necessary and then compute forces, virials, energies
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int** BaseEllipsoidT::compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, double *sublo,
double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom,
int &host_start, int **ilist, int **jnum,
const double cpu_time, bool &success,
double **host_quat) {
acc_timers();
if (inum_full==0) {
host_start=0;
zero_timers();
return NULL;
}
hd_balancer.balance(cpu_time);
int inum=hd_balancer.get_gpu_count(ago,inum_full);
ans->inum(inum);
_last_ellipse=std::min(inum,_max_last_ellipse);
host_start=inum;
// Build neighbor list on GPU if necessary
if (ago==0) {
build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
sublo, subhi, tag, nspecial, special, success);
if (!success)
return NULL;
atom->cast_quat_data(host_quat[0]);
hd_balancer.start_timer();
} else {
atom->cast_x_data(host_x,host_type);
atom->cast_quat_data(host_quat[0]);
hd_balancer.start_timer();
atom->add_x_data(host_x,host_type);
}
atom->add_quat_data();
*ilist=nbor->host_ilist.begin();
*jnum=nbor->host_acc.begin();
loop(eflag,vflag);
ans->copy_answers(eflag,vflag,eatom,vatom);
device->add_ans_object(ans);
hd_balancer.stop_timer();
return nbor->host_jlist.begin()-host_start;
}
template <class numtyp, class acctyp>
double BaseEllipsoidT::host_memory_usage_base() const {
return device->atom.host_memory_usage()+nbor->host_memory_usage()+
4*sizeof(numtyp)+sizeof(BaseEllipsoid<numtyp,acctyp>);
}
template <class numtyp, class acctyp>
void BaseEllipsoidT::compile_kernels(UCL_Device &dev,
const char *ellipsoid_string,
const char *lj_string, const bool e_s) {
if (_compiled)
return;
std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
std::string(OCL_PRECISION_COMPILE);
nbor_program=new UCL_Program(dev);
nbor_program->load_string(ellipsoid_nbor,flags.c_str());
k_nbor_fast.set_function(*nbor_program,"kernel_nbor_fast");
k_nbor.set_function(*nbor_program,"kernel_nbor");
ellipsoid_program=new UCL_Program(dev);
ellipsoid_program->load_string(ellipsoid_string,flags.c_str());
k_ellipsoid.set_function(*ellipsoid_program,"kernel_ellipsoid");
lj_program=new UCL_Program(dev);
lj_program->load_string(lj_string,flags.c_str());
k_sphere_ellipsoid.set_function(*lj_program,"kernel_sphere_ellipsoid");
k_lj_fast.set_function(*lj_program,"kernel_lj_fast");
k_lj.set_function(*lj_program,"kernel_lj");
if (e_s)
k_ellipsoid_sphere.set_function(*lj_program,"kernel_ellipsoid_sphere");
_compiled=true;
}
template class BaseEllipsoid<PRECISION,ACC_PRECISION>;
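A hedged fragment of what a derived loop() might do with pack_nbors to keep a thread block on one branch; shared_types and _lj_types are assumed names, while pack_nbors, block_size() and _last_ellipse come from this class.
const int BX=this->block_size();
const int GX=static_cast<int>(ceil(static_cast<double>
                              (this->_last_ellipse)/BX));
// pack the ellipse-ellipse pairs first to limit thread divergence
this->pack_nbors(GX,BX,0,this->_last_ellipse,ELLIPSE_ELLIPSE,
                 ELLIPSE_ELLIPSE,shared_types,_lj_types);
// ...then enqueue k_ellipsoid on the packed list; when _multiple_forms is
// set, the sphere-ellipsoid and lj neighbors get their own pack_nbors pass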

View File

@@ -1,255 +0,0 @@
/***************************************************************************
base_ellipsoid.h
-------------------
W. Michael Brown
Base class for acceleration of ellipsoid potentials
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin : Thu May 5 2011
email : brownw@ornl.gov
***************************************************************************/
#ifndef BASE_ELLIPSOID_H
#define BASE_ELLIPSOID_H
#include "pair_gpu_device.h"
#include "pair_gpu_balance.h"
#include "mpi.h"
#ifdef USE_OPENCL
#include "geryon/ocl_texture.h"
#else
#include "geryon/nvd_texture.h"
#endif
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class BaseEllipsoid {
public:
BaseEllipsoid();
virtual ~BaseEllipsoid();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
* \param ellipsoid_sphere true if ellipsoid-sphere case handled separately
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 if double precision is not supported on the card **/
int init_base(const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, const int ntypes,
int **h_form, const char *ellipsoid_program,
const char *lj_program, const bool ellipsoid_sphere=false);
/// Estimate the overhead for GPU context changes and CPU driver
void estimate_gpu_overhead();
/// Check if there is enough storage for atom arrays and realloc if not
/** \param success set to false if insufficient memory **/
inline void resize_atom(const int nall, bool &success) {
atom->resize(nall, success);
}
/// Check if there is enough storage for neighbors and realloc if not
/** \param nlocal number of particles whose nbors must be stored on device
* \param host_inum number of particles whose nbors need to be copied to host
* \param max_nbors current maximum number of neighbors
* \param olist_size size of list of particles from CPU neighboring
* \note host_inum is 0 if the host is performing neighboring
* \note if GPU is neighboring nlocal+host_inum=total number of local particles
* \note if CPU is neighboring olist_size=total number of local particles
* \note if GPU is neighboring olist_size=0 **/
inline void resize_local(const int nlocal, const int host_inum,
const int max_nbors, const int olist_size,
bool &success) {
ans->resize(nlocal, success);
if (_multiple_forms) ans->dev_ans.zero();
if (olist_size>static_cast<int>(host_olist.numel())) {
host_olist.clear();
int new_size=static_cast<int>(static_cast<double>(olist_size)*1.10);
success=success && (host_olist.alloc(new_size,*ucl_device)==UCL_SUCCESS);
}
nbor->resize(nlocal,host_inum,max_nbors,success);
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_bytes)
_max_bytes=bytes;
}
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear_base();
/// Output any timing information
void output_times();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage_base() const;
/// Accumulate timers
inline void acc_timers() {
if (device->time_device()) {
if (nbor_time_avail) {
nbor->time_nbor.add_to_total();
nbor->time_nbor.add_to_total();
nbor_time_avail=false;
}
time_nbor1.add_to_total();
time_ellipsoid.add_to_total();
if (_multiple_forms) {
time_nbor2.add_to_total();
time_ellipsoid2.add_to_total();
if (_ellipsoid_sphere) {
time_nbor3.add_to_total();
time_ellipsoid3.add_to_total();
}
time_lj.add_to_total();
}
atom->acc_timers();
ans->acc_timers();
}
}
/// Zero timers
inline void zero_timers() {
nbor_time_avail=false;
time_nbor1.zero();
time_ellipsoid.zero();
if (_multiple_forms) {
time_nbor2.zero();
time_ellipsoid2.zero();
if (_ellipsoid_sphere) {
time_nbor3.zero();
time_ellipsoid3.zero();
}
time_lj.zero();
}
atom->zero_timers();
ans->zero_timers();
}
/// Pack neighbors to limit thread divergence for lj-lj and ellipse
void pack_nbors(const int GX, const int BX, const int start, const int inum,
const int form_low, const int form_high,
const bool shared_types, int ntypes);
/// Copy neighbor list from host
void reset_nbors(const int nall, const int inum, const int osize, int *ilist,
int *numj, int *type, int **firstneigh, bool &success);
/// Build neighbor list on device
void build_nbor_list(const int inum, const int host_inum,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, bool &success);
/// Pair loop with host neighboring
int* compute(const int f_ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double **quat);
/// Pair loop with device neighboring
int** compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, double *sublo,
double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **numj, const double cpu_time, bool &success,
double **host_quat);
/// Build neighbor list on accelerator
void build_nbor_list(const int inum, const int host_inum, const int nall,
double **host_x, int *host_type, double *sublo,
double *subhi, bool &success);
// -------------------------- DEVICE DATA -------------------------
/// Device Properties and Atom and Neighbor storage
PairGPUDevice<numtyp,acctyp> *device;
/// Geryon device
UCL_Device *ucl_device;
/// Device Timers
UCL_Timer time_nbor1, time_ellipsoid, time_nbor2, time_ellipsoid2, time_lj;
UCL_Timer time_nbor3, time_ellipsoid3;
/// Host device load balancer
PairGPUBalance<numtyp,acctyp> hd_balancer;
/// LAMMPS pointer for screen output
FILE *screen;
// --------------------------- ATOM DATA --------------------------
/// Atom Data
PairGPUAtom<numtyp,acctyp> *atom;
// --------------------------- TYPE DATA --------------------------
/// cut_form.x = cutsq, cut_form.y = form
UCL_D_Vec<numtyp2> cut_form;
// ------------------------ FORCE/ENERGY DATA -----------------------
PairGPUAns<numtyp,acctyp> *ans;
// --------------------------- NBOR DATA ----------------------------
/// Neighbor data
PairGPUNbor *nbor;
/// ilist with particles sorted by type
UCL_H_Vec<int> host_olist;
/// True if we need to accumulate time for neighboring
bool nbor_time_avail;
// ------------------------- DEVICE KERNELS -------------------------
UCL_Program *nbor_program, *ellipsoid_program, *lj_program;
UCL_Kernel k_nbor_fast, k_nbor;
UCL_Kernel k_ellipsoid, k_ellipsoid_sphere, k_sphere_ellipsoid;
UCL_Kernel k_lj_fast, k_lj;
inline int block_size() { return _block_size; }
// --------------------------- TEXTURES -----------------------------
UCL_Texture pos_tex;
UCL_Texture q_tex;
protected:
bool _compiled, _ellipsoid_sphere;
int _block_size, _threads_per_atom;
double _max_bytes, _max_an_bytes;
double _gpu_overhead, _driver_overhead;
UCL_D_Vec<int> *_nbor_data;
// True if we want to use fast GB-sphere or sphere-sphere calculations
bool _multiple_forms;
int **_host_form;
int _last_ellipse, _max_last_ellipse;
void compile_kernels(UCL_Device &dev, const char *ellipsoid_string,
const char *lj_string, const bool e_s);
virtual void loop(const bool _eflag, const bool _vflag) = 0;
};
}
#endif
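For completeness, an assumed caller-side pattern for the return codes documented on init_base(); the object name gb and the recovery comments are illustrative.
int err=gb.init_base(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
                     screen,ntypes,h_form,ellipsoid_str,lj_str);
if (err==-3) {        // out of memory on the device
  // reduce atoms per GPU or disable acceleration
} else if (err==-5) { // card lacks double precision support
  // rebuild the library in single precision
} else if (err!=0) {  // -1, -4, or any other failure
  // report the error and fall back to the CPU pair style
}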

View File

@@ -1,306 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#include "charge_gpu_memory.h"
#define ChargeGPUMemoryT ChargeGPUMemory<numtyp, acctyp>
extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
template <class numtyp, class acctyp>
ChargeGPUMemoryT::ChargeGPUMemory() : _compiled(false), _max_bytes(0) {
device=&pair_gpu_device;
ans=new PairGPUAns<numtyp,acctyp>();
nbor=new PairGPUNbor();
}
template <class numtyp, class acctyp>
ChargeGPUMemoryT::~ChargeGPUMemory() {
delete ans;
delete nbor;
}
template <class numtyp, class acctyp>
int ChargeGPUMemoryT::bytes_per_atom_atomic(const int max_nbors) const {
return device->atom.bytes_per_atom()+ans->bytes_per_atom()+
nbor->bytes_per_atom(max_nbors);
}
template <class numtyp, class acctyp>
int ChargeGPUMemoryT::init_atomic(const int nlocal, const int nall,
const int max_nbors, const int maxspecial,
const double cell_size,
const double gpu_split, FILE *_screen,
const char *pair_program) {
nbor_time_avail=false;
screen=_screen;
bool gpu_nbor=false;
if (device->gpu_mode()==PairGPUDevice<numtyp,acctyp>::GPU_NEIGH)
gpu_nbor=true;
int _gpu_host=0;
int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
if (host_nlocal>0)
_gpu_host=1;
_threads_per_atom=device->threads_per_charge();
if (_threads_per_atom>1 && gpu_nbor==false) {
nbor->packing(true);
_nbor_data=&(nbor->dev_packed);
} else
_nbor_data=&(nbor->dev_nbor);
int success=device->init(*ans,true,false,nlocal,host_nlocal,nall,nbor,
maxspecial,_gpu_host,max_nbors,cell_size,false);
if (success!=0)
return success;
ucl_device=device->gpu;
atom=&device->atom;
_block_size=device->pair_block_size();
_block_bio_size=device->block_bio_pair();
compile_kernels(*ucl_device,pair_program);
// Initialize host-device load balancer
hd_balancer.init(device,gpu_nbor,gpu_split);
// Initialize timers for the selected GPU
time_pair.init(*ucl_device);
time_pair.zero();
pos_tex.bind_float(atom->dev_x,4);
q_tex.bind_float(atom->dev_q,1);
_max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
return success;
}
template <class numtyp, class acctyp>
void ChargeGPUMemoryT::estimate_gpu_overhead() {
device->estimate_gpu_overhead(1,_gpu_overhead,_driver_overhead);
}
template <class numtyp, class acctyp>
void ChargeGPUMemoryT::clear_atomic() {
// Output any timing information
acc_timers();
double avg_split=hd_balancer.all_avg_split();
_gpu_overhead*=hd_balancer.timestep();
_driver_overhead*=hd_balancer.timestep();
device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes,
_gpu_overhead,_driver_overhead,_threads_per_atom,screen);
if (_compiled) {
k_pair_fast.clear();
k_pair.clear();
delete pair_program;
_compiled=false;
}
time_pair.clear();
hd_balancer.clear();
nbor->clear();
ans->clear();
device->clear();
}
// ---------------------------------------------------------------------------
// Copy neighbor list from host
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int * ChargeGPUMemoryT::reset_nbors(const int nall, const int inum, int *ilist,
int *numj, int **firstneigh, bool &success) {
success=true;
nbor_time_avail=true;
int mn=nbor->max_nbor_loop(inum,numj,ilist);
resize_atom(inum,nall,success);
resize_local(inum,mn,success);
if (!success)
return NULL;
nbor->get_host(inum,ilist,numj,firstneigh,block_size());
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
return ilist;
}
// ---------------------------------------------------------------------------
// Build neighbor list on device
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
inline void ChargeGPUMemoryT::build_nbor_list(const int inum,
const int host_inum,
const int nall, double **host_x,
int *host_type, double *sublo,
double *subhi, int *tag,
int **nspecial, int **special,
bool &success) {
nbor_time_avail=true;
success=true;
resize_atom(inum,nall,success);
resize_local(inum,host_inum,nbor->max_nbors(),success);
if (!success)
return;
atom->cast_copy_x(host_x,host_type);
int mn;
nbor->build_nbor_list(inum, host_inum, nall, *atom, sublo, subhi, tag,
nspecial, special, success, mn);
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
}
// ---------------------------------------------------------------------------
// Copy nbor list from host if necessary and then calculate forces, virials,..
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void ChargeGPUMemoryT::compute(const int f_ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag,
const bool eatom, const bool vatom,
int &host_start, const double cpu_time,
bool &success, double *host_q,
const int nlocal, double *boxlo, double *prd) {
acc_timers();
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
resize_atom(0,nall,success);
zero_timers();
return;
}
int ago=hd_balancer.ago_first(f_ago);
int inum=hd_balancer.balance(ago,inum_full,cpu_time);
ans->inum(inum);
host_start=inum;
if (ago==0) {
reset_nbors(nall, inum, ilist, numj, firstneigh, success);
if (!success)
return;
}
atom->cast_x_data(host_x,host_type);
atom->cast_q_data(host_q);
hd_balancer.start_timer();
atom->add_x_data(host_x,host_type);
atom->add_q_data();
device->precompute(f_ago,nlocal,nall,host_x,host_type,success,host_q,
boxlo, prd);
loop(eflag,vflag);
ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
device->add_ans_object(ans);
hd_balancer.stop_timer();
}
// ---------------------------------------------------------------------------
// Reneighbor on GPU if necessary and then compute forces, virials, energies
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int** ChargeGPUMemoryT::compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag,
int **nspecial, int **special, const bool eflag,
const bool vflag, const bool eatom,
const bool vatom, int &host_start,
int **ilist, int **jnum,
const double cpu_time, bool &success,
double *host_q, double *boxlo, double *prd) {
acc_timers();
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
resize_atom(0,nall,success);
zero_timers();
return NULL;
}
hd_balancer.balance(cpu_time);
int inum=hd_balancer.get_gpu_count(ago,inum_full);
ans->inum(inum);
host_start=inum;
// Build neighbor list on GPU if necessary
if (ago==0) {
build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
sublo, subhi, tag, nspecial, special, success);
if (!success)
return NULL;
atom->cast_q_data(host_q);
hd_balancer.start_timer();
} else {
atom->cast_x_data(host_x,host_type);
atom->cast_q_data(host_q);
hd_balancer.start_timer();
atom->add_x_data(host_x,host_type);
}
atom->add_q_data();
*ilist=nbor->host_ilist.begin();
*jnum=nbor->host_acc.begin();
device->precompute(ago,inum_full,nall,host_x,host_type,success,host_q,
boxlo, prd);
loop(eflag,vflag);
ans->copy_answers(eflag,vflag,eatom,vatom);
device->add_ans_object(ans);
hd_balancer.stop_timer();
return nbor->host_jlist.begin()-host_start;
}
template <class numtyp, class acctyp>
double ChargeGPUMemoryT::host_memory_usage_atomic() const {
return device->atom.host_memory_usage()+nbor->host_memory_usage()+
4*sizeof(numtyp)+sizeof(ChargeGPUMemory<numtyp,acctyp>);
}
template <class numtyp, class acctyp>
void ChargeGPUMemoryT::compile_kernels(UCL_Device &dev, const char *pair_str) {
if (_compiled)
return;
std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
std::string(OCL_PRECISION_COMPILE);
pair_program=new UCL_Program(dev);
pair_program->load_string(pair_str,flags.c_str());
k_pair_fast.set_function(*pair_program,"kernel_pair_fast");
k_pair.set_function(*pair_program,"kernel_pair");
pos_tex.get_texture(*pair_program,"pos_tex");
q_tex.get_texture(*pair_program,"q_tex");
_compiled=true;
}
template class ChargeGPUMemory<PRECISION,ACC_PRECISION>;
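Relative to the atomic base class, the charge path adds one more host-to-device stream per step; the calls below are condensed from compute() above, with the purpose of precompute() stated as an assumption.
atom->cast_q_data(host_q);   // pack per-atom charges on the host
atom->add_q_data();          // copy to device; kernels read them via q_tex
device->precompute(ago,nlocal,nall,host_x,host_type,success,host_q,
                   boxlo,prd); // hook run before the pair loop, presumably
                               // for long-range (k-space) style setup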

View File

@@ -1,203 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef CHARGE_GPU_MEMORY_H
#define CHARGE_GPU_MEMORY_H
#include "pair_gpu_device.h"
#include "pair_gpu_balance.h"
#include "mpi.h"
#ifdef USE_OPENCL
#include "geryon/ocl_texture.h"
#else
#include "geryon/nvd_texture.h"
#endif
template <class numtyp, class acctyp>
class ChargeGPUMemory {
public:
ChargeGPUMemory();
virtual ~ChargeGPUMemory();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 if double precision is not supported on the card **/
int init_atomic(const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen,
const char *pair_program);
/// Estimate the overhead for GPU context changes and CPU driver
void estimate_gpu_overhead();
/// Check if there is enough storage for atom arrays and realloc if not
/** \param success set to false if insufficient memory **/
inline void resize_atom(const int inum, const int nall, bool &success) {
if (atom->resize(nall, success)) {
pos_tex.bind_float(atom->dev_x,4);
q_tex.bind_float(atom->dev_q,1);
}
ans->resize(inum,success);
}
/// Check if there is enough storage for neighbors and realloc if not
/** \param nlocal number of particles whose nbors must be stored on device
* \param host_inum number of particles whose nbors need to be copied to host
* \param max_nbors current maximum number of neighbors
* \note olist_size=total number of local particles **/
inline void resize_local(const int inum, const int max_nbors, bool &success) {
nbor->resize(inum,max_nbors,success);
}
/// Check if there is enough storage for neighbors and realloc if not
/** \param nlocal number of particles whose nbors must be stored on device
* \param host_inum number of particles whose nbors need to be copied to host
* \param max_nbors current maximum number of neighbors
* \note host_inum is 0 if the host is performing neighboring
* \note nlocal+host_inum=total number of local particles
* \note olist_size=0 **/
inline void resize_local(const int inum, const int host_inum,
const int max_nbors, bool &success) {
nbor->resize(inum,host_inum,max_nbors,success);
}
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear_atomic();
/// Returns memory usage on device per atom
int bytes_per_atom_atomic(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage_atomic() const;
/// Accumulate timers
inline void acc_timers() {
if (device->time_device()) {
if (nbor_time_avail) {
nbor->time_nbor.add_to_total();
nbor->time_kernel.add_to_total();
nbor_time_avail=false;
}
time_pair.add_to_total();
atom->acc_timers();
ans->acc_timers();
}
}
/// Zero timers
inline void zero_timers() {
nbor_time_avail=false;
time_pair.zero();
atom->zero_timers();
ans->zero_timers();
}
/// Copy neighbor list from host
int * reset_nbors(const int nall, const int inum, int *ilist, int *numj,
int **firstneigh, bool &success);
/// Build neighbor list on device
void build_nbor_list(const int inum, const int host_inum,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, bool &success);
/// Pair loop with host neighboring
void compute(const int f_ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *charge,
const int nlocal, double *boxlo, double *prd);
/// Pair loop with device neighboring
int** compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, double *sublo,
double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **numj, const double cpu_time, bool &success,
double *charge, double *boxlo, double *prd);
// -------------------------- DEVICE DATA -------------------------
/// Device Properties and Atom and Neighbor storage
PairGPUDevice<numtyp,acctyp> *device;
/// Geryon device
UCL_Device *ucl_device;
/// Device Timers
UCL_Timer time_pair;
/// Host device load balancer
PairGPUBalance<numtyp,acctyp> hd_balancer;
/// LAMMPS pointer for screen output
FILE *screen;
// --------------------------- ATOM DATA --------------------------
/// Atom Data
PairGPUAtom<numtyp,acctyp> *atom;
// ------------------------ FORCE/ENERGY DATA -----------------------
PairGPUAns<numtyp,acctyp> *ans;
// --------------------------- NBOR DATA ----------------------------
/// Neighbor data
PairGPUNbor *nbor;
/// True if we need to accumulate time for neighboring
bool nbor_time_avail;
// ------------------------- DEVICE KERNELS -------------------------
UCL_Program *pair_program;
UCL_Kernel k_pair_fast, k_pair;
inline int block_size() { return _block_size; }
// --------------------------- TEXTURES -----------------------------
UCL_Texture pos_tex;
UCL_Texture q_tex;
protected:
bool _compiled;
int _block_size, _block_bio_size, _threads_per_atom;
double _max_bytes, _max_an_bytes;
double _gpu_overhead, _driver_overhead;
UCL_D_Vec<int> *_nbor_data;
void compile_kernels(UCL_Device &dev, const char *pair_string);
virtual void loop(const bool _eflag, const bool _vflag) = 0;
};
#endif

View File

@@ -1,122 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#include <iostream>
#include <cassert>
#include <math.h>
#include "cmm_cut_gpu_memory.h"
using namespace std;
static CMM_GPU_Memory<PRECISION,ACC_PRECISION> CMMMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **offset, double *special_lj,
const int inum, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, int &gpu_mode,
FILE *screen) {
CMMMF.clear();
gpu_mode=CMMMF.device->gpu_mode();
double gpu_split=CMMMF.device->particle_split();
int first_gpu=CMMMF.device->first_device();
int last_gpu=CMMMF.device->last_device();
int world_me=CMMMF.device->world_me();
int gpu_rank=CMMMF.device->gpu_rank();
int procs_per_gpu=CMMMF.device->procs_per_gpu();
CMMMF.device->init_message(screen,"cg/cmm",first_gpu,last_gpu);
bool message=false;
if (CMMMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
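// Staged setup: process 0 initializes (and compiles the kernels) first,
// all ranks meet at the world barrier, then the ranks sharing each GPU
// initialize one at a time between gpu_barrier() calls.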
int init_ok=0;
if (world_me==0)
init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen);
CMMMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen);
CMMMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
CMMMF.estimate_gpu_overhead();
return init_ok;
}
void cmm_gpu_clear() {
CMMMF.clear();
}
int** cmm_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success) {
return CMMMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success);
}
void cmm_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success) {
CMMMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success);
}
double cmm_gpu_bytes() {
return CMMMF.host_memory_usage();
}
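/* Usage sketch (hypothetical caller; in LAMMPS the GPU variant of the
   cg/cmm pair style plays this role): call cmm_gpu_init() once during
   setup and check its return code, then each timestep call
   cmm_gpu_compute_n() when neighboring is done on the device or
   cmm_gpu_compute() with a host-built neighbor list, and finally
   cmm_gpu_clear() at teardown. cmm_gpu_bytes() reports host memory use. */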

View File

@ -1,403 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef CMM_GPU_KERNEL
#define CMM_GPU_KERNEL
#ifdef NV_KERNEL
#include "nv_kernel_def.h"
texture<float4> pos_tex;
#ifdef _DOUBLE_DOUBLE
__inline double4 fetch_pos(const int& i, const double4 *pos)
{
return pos[i];
}
#else
__inline float4 fetch_pos(const int& i, const float4 *pos)
{
return tex1Dfetch(pos_tex, i);
}
#endif
#else
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#define GLOBAL_ID_X get_global_id(0)
#define THREAD_ID_X get_local_id(0)
#define BLOCK_ID_X get_group_id(0)
#define BLOCK_SIZE_X get_local_size(0)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#define __inline inline
#define fetch_pos(i,y) x_[i]
#define BLOCK_PAIR 64
#define MAX_SHARED_TYPES 8
#endif
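// The block above is a portability shim: under NV_KERNEL the kernels
// compile as CUDA with texture fetches for positions, otherwise as
// OpenCL, where the fetch macros degrade to plain global-memory reads.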
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
#define numtyp4 double4
#define acctyp double
#define acctyp4 double4
#endif
#ifdef _SINGLE_DOUBLE
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp double
#define acctyp4 double4
#endif
#ifndef numtyp
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp float
#define acctyp4 float4
#endif
#define SBBITS 30
#define NEIGHMASK 0x3FFFFFFF
__inline int sbmask(int j) { return j >> SBBITS & 3; }
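// Neighbor entries are bit-packed: bits 30-31 give the special-bond slot
// (0 = plain pair; 1-3 = 1-2/1-3/1-4 neighbors in the LAMMPS convention)
// and NEIGHMASK clears them to recover the bare atom index.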
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp sp_lj[4];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
int n_stride;
__global int *list_end;
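// Two neighbor layouts: if the list is already packed (dev_nbor ==
// dev_packed), the neighbors of atom ii sit in a column with row pitch
// nbor_pitch, so each of the t_per_atom cooperating threads strides by
// t_per_atom*nbor_pitch; otherwise the row holds an offset into
// dev_packed, where this atom's neighbors are contiguous.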
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (r2inv<lj1[mtype].x) {
r2inv=(numtyp)1.0/r2inv;
numtyp inv1,inv2;
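// lj1[mtype].y is the CG-CMM form flag: 2 -> LJ 12-4, 1 -> LJ 9-6,
// anything else -> LJ 12-6; inv1/inv2 are the matching powers of 1/r^2
// so that energy ~ inv1*(a*inv2 - b).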
if (lj1[mtype].y == 2) {
inv1=r2inv*r2inv;
inv2=inv1*inv1;
} else if (lj1[mtype].y == 1) {
inv2=r2inv*sqrt(r2inv);
inv1=inv2*inv2;
} else {
inv1=r2inv*r2inv*r2inv;
inv2=inv1;
}
numtyp force = factor_lj*r2inv*inv1*(lj1[mtype].z*inv2-lj1[mtype].w);
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0)
energy += factor_lj*inv1*(lj3[mtype].x*inv2-lj3[mtype].y)-
lj3[mtype].z;
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
} // if ii
// Reduce answers
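// Binary-tree reduction in local memory across the t_per_atom threads
// cooperating on atom ii; the thread with offset 0 ends up with the totals.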
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<4; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,__global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[4];
if (tid<4)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[tid]=lj1_in[tid];
if (eflag>0)
lj3[tid]=lj3_in[tid];
}
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
if (r2inv<lj1[mtype].x) {
r2inv=(numtyp)1.0/r2inv;
numtyp inv1,inv2;
if (lj1[mtype].y == (numtyp)2) {
inv1=r2inv*r2inv;
inv2=inv1*inv1;
} else if (lj1[mtype].y == (numtyp)1) {
inv2=r2inv*sqrt(r2inv);
inv1=inv2*inv2;
} else {
inv1=r2inv*r2inv*r2inv;
inv2=inv1;
}
numtyp force = factor_lj*r2inv*inv1*(lj1[mtype].z*inv2-lj1[mtype].w);
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0)
energy += factor_lj*inv1*(lj3[mtype].x*inv2-lj3[mtype].y)-
lj3[mtype].z;
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<4; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
#endif
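As a quick sanity check on the exponent selection above, here is a small
standalone C++ program (not part of the library; all names are
illustrative) that reproduces the inv1/inv2 trick on the CPU and compares
it against direct pow() evaluation:

// cg_form_check.cpp -- standalone, hypothetical test harness.
#include <cstdio>
#include <cmath>

// Same exponent trick as kernel_pair above: build r^-n from powers of 1/r^2.
double cg_energy(int cg_type, double r, double a, double b) {
  double r2inv = 1.0/(r*r), inv1, inv2;
  if (cg_type == 2) {          // LJ 12-4: inv1 = r^-4, inv1*inv2 = r^-12
    inv1 = r2inv*r2inv;
    inv2 = inv1*inv1;
  } else if (cg_type == 1) {   // LJ 9-6: inv1 = r^-6, inv1*inv2 = r^-9
    inv2 = r2inv*std::sqrt(r2inv);
    inv1 = inv2*inv2;
  } else {                     // LJ 12-6: inv1 = r^-6, inv1*inv2 = r^-12
    inv1 = r2inv*r2inv*r2inv;
    inv2 = inv1;
  }
  return inv1*(a*inv2 - b);
}

int main() {
  double r = 1.3, a = 4.0, b = 4.0;
  printf("12-4: %.10g vs %.10g\n", cg_energy(2,r,a,b),
         a*std::pow(r,-12.0)-b*std::pow(r,-4.0));
  printf("9-6 : %.10g vs %.10g\n", cg_energy(1,r,a,b),
         a*std::pow(r,-9.0)-b*std::pow(r,-6.0));
  printf("12-6: %.10g vs %.10g\n", cg_energy(0,r,a,b),
         a*std::pow(r,-12.0)-b*std::pow(r,-6.0));
  return 0;
}

Each printed pair should agree, confirming that the flag values map to the
12-4, 9-6, and 12-6 forms.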

View File

@ -1,155 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifdef USE_OPENCL
#include "cmm_cut_gpu_cl.h"
#else
#include "cmm_cut_gpu_ptx.h"
#endif
#include "cmm_cut_gpu_memory.h"
#include <cassert>
#define CMM_GPU_MemoryT CMM_GPU_Memory<numtyp, acctyp>
extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
template <class numtyp, class acctyp>
CMM_GPU_MemoryT::CMM_GPU_Memory() : AtomicGPUMemory<numtyp,acctyp>(),
_allocated(false) {
}
template <class numtyp, class acctyp>
CMM_GPU_MemoryT::~CMM_GPU_Memory() {
clear();
}
template <class numtyp, class acctyp>
int CMM_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int CMM_GPU_MemoryT::init(const int ntypes, double **host_cutsq,
int **host_cg_type, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,cmm_cut_gpu_kernel);
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int cmm_types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (cmm_types<=max_shared_types && this->_block_size>=max_shared_types) {
cmm_types=max_shared_types;
shared_types=true;
}
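// Padding the type count up to max_shared_types lets kernel_pair_fast
// index the pair tables as itype*MAX_SHARED_TYPES+jtype directly from
// shared memory.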
_cmm_types=cmm_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(cmm_types*cmm_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<cmm_types*cmm_types; i++)
host_write[i]=0.0;
lj1.alloc(cmm_types*cmm_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,cmm_types,lj1,host_write,host_cutsq,
host_cg_type,host_lj1,host_lj2);
lj3.alloc(cmm_types*cmm_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,cmm_types,lj3,host_write,host_lj3,host_lj4,
host_offset);
UCL_H_Vec<double> dview;
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
dview.view(host_special_lj,4,*(this->ucl_device));
ucl_copy(sp_lj,dview,false);
_allocated=true;
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void CMM_GPU_MemoryT::clear() {
if (!_allocated)
return;
_allocated=false;
lj1.clear();
lj3.clear();
sp_lj.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double CMM_GPU_MemoryT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(CMM_GPU_Memory<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate per-atom energies, forces, and virials
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void CMM_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
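// Each atom gets _threads_per_atom threads, so one block of BX threads
// covers BX/_threads_per_atom atoms and GX blocks cover all inum atoms.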
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_cmm_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom);
}
this->time_pair.stop();
}
template class CMM_GPU_Memory<PRECISION,ACC_PRECISION>;

View File

@ -1,78 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef CMM_GPU_MEMORY_H
#define CMM_GPU_MEMORY_H
#include "atomic_gpu_memory.h"
template <class numtyp, class acctyp>
class CMM_GPU_Memory : public AtomicGPUMemory<numtyp, acctyp> {
public:
CMM_GPU_Memory();
~CMM_GPU_Memory();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 if double precision is not supported on the card **/
int init(const int ntypes, double **host_cutsq, int **host_cg_type,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// lj1.x = cutsq, lj1.y=cg_type, lj1.z = lj1, lj1.w = lj2
UCL_D_Vec<numtyp4> lj1;
/// lj3.x = lj3, lj3.y = lj4, lj3.z = offset
UCL_D_Vec<numtyp4> lj3;
/// Special LJ values
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _cmm_types;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
#endif

View File

@ -1,130 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#include <iostream>
#include <cassert>
#include <math.h>
#include "cmmc_long_gpu_memory.h"
using namespace std;
static CMML_GPU_Memory<PRECISION,ACC_PRECISION> CMMLMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **offset, double *special_lj,
const int inum, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, int &gpu_mode,
FILE *screen, double **host_cut_ljsq, double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double g_ewald) {
CMMLMF.clear();
gpu_mode=CMMLMF.device->gpu_mode();
double gpu_split=CMMLMF.device->particle_split();
int first_gpu=CMMLMF.device->first_device();
int last_gpu=CMMLMF.device->last_device();
int world_me=CMMLMF.device->world_me();
int gpu_rank=CMMLMF.device->gpu_rank();
int procs_per_gpu=CMMLMF.device->procs_per_gpu();
CMMLMF.device->init_message(screen,"cg/cmm/coul/long",first_gpu,last_gpu);
bool message=false;
if (CMMLMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e,g_ewald);
CMMLMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen,
host_cut_ljsq, host_cut_coulsq, host_special_coul,
qqrd2e, g_ewald);
CMMLMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
CMMLMF.estimate_gpu_overhead();
return init_ok;
}
void cmml_gpu_clear() {
CMMLMF.clear();
}
int** cmml_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success, double *host_q, double *boxlo,
double *prd) {
return CMMLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success,
host_q,boxlo,prd);
}
void cmml_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q,
const int nlocal, double *boxlo, double *prd) {
CMMLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
host_q,nlocal,boxlo,prd);
}
double cmml_gpu_bytes() {
return CMMLMF.host_memory_usage();
}

View File

@ -1,485 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef CMML_GPU_KERNEL
#define CMML_GPU_KERNEL
#ifdef NV_KERNEL
#include "nv_kernel_def.h"
texture<float4> pos_tex;
texture<float> q_tex;
#ifdef _DOUBLE_DOUBLE
__inline double4 fetch_pos(const int& i, const double4 *pos)
{
return pos[i];
}
__inline double fetch_q(const int& i, const double *q)
{
return q[i];
}
#else
__inline float4 fetch_pos(const int& i, const float4 *pos)
{
return tex1Dfetch(pos_tex, i);
}
__inline float fetch_q(const int& i, const float *q)
{
return tex1Dfetch(q_tex, i);
}
#endif
#else
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#define GLOBAL_ID_X get_global_id(0)
#define THREAD_ID_X get_local_id(0)
#define BLOCK_ID_X get_group_id(0)
#define BLOCK_SIZE_X get_local_size(0)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#define __inline inline
#define fetch_pos(i,y) x_[i]
#define fetch_q(i,y) q_[i]
#define BLOCK_PAIR 64
#define MAX_SHARED_TYPES 8
#endif
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
#define numtyp4 double4
#define acctyp double
#define acctyp4 double4
#endif
#ifdef _SINGLE_DOUBLE
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp double
#define acctyp4 double4
#endif
#ifndef numtyp
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp float
#define acctyp4 float4
#endif
#define EWALD_F (numtyp)1.12837917
#define EWALD_P (numtyp)0.3275911
#define A1 (numtyp)0.254829592
#define A2 (numtyp)-0.284496736
#define A3 (numtyp)1.421413741
#define A4 (numtyp)-1.453152027
#define A5 (numtyp)1.061405429
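// EWALD_F = 2/sqrt(pi); EWALD_P and A1-A5 are the coefficients of the
// Abramowitz & Stegun 7.1.26 rational approximation
// erfc(x) ~ t*(A1+t*(A2+t*(A3+t*(A4+t*A5))))*exp(-x*x), t = 1/(1+EWALD_P*x),
// evaluated inline in the kernels below for the real-space Ewald term.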
#define SBBITS 30
#define NEIGHMASK 0x3FFFFFFF
__inline int sbmask(int j) { return j >> SBBITS & 3; }
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_ ,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp sp_lj[8];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
sp_lj[4]=sp_lj_in[4];
sp_lj[5]=sp_lj_in[5];
sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7];
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
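// sp_lj[4..7] hold special_coul; storing factor_coul as 1 - special_coul
// lets the special-bond correction be subtracted directly inside the
// erfc term below.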
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (rsq<lj1[mtype].x) {
numtyp forcecoul, force_lj, force, inv1, inv2, prefactor, _erfc;
numtyp r2inv=(numtyp)1.0/rsq;
if (rsq < lj1[mtype].y) {
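// Here lj3[mtype].x carries the CG-CMM form flag (2 -> LJ 12-4,
// 1 -> LJ 9-6, else LJ 12-6) and lj1[mtype].y is the LJ cutoff squared.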
if (lj3[mtype].x == (numtyp)2) {
inv1=r2inv*r2inv;
inv2=inv1*inv1;
} else if (lj3[mtype].x == (numtyp)1) {
inv2=r2inv*sqrt(r2inv);
inv1=inv2*inv2;
} else {
inv1=r2inv*r2inv*r2inv;
inv2=inv1;
}
force_lj = factor_lj*inv1*(lj1[mtype].z*inv2-lj1[mtype].w);
} else
force_lj = (numtyp)0.0;
if (rsq < cut_coulsq) {
numtyp r = sqrt(rsq);
numtyp grij = g_ewald * r;
numtyp expm2 = exp(-grij*grij);
numtyp t = (numtyp)1.0 / ((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
} else
forcecoul = (numtyp)0.0;
force = (force_lj + forcecoul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
if (rsq < cut_coulsq)
e_coul += prefactor*(_erfc-factor_coul);
if (rsq < lj1[mtype].y) {
energy += factor_lj*inv1*(lj3[mtype].y*inv2-lj3[mtype].z)-
lj3[mtype].w;
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
red_acc[4][tid]=e_coul;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<5; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
e_coul=red_acc[4][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
*ap1=e_coul;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[8];
if (tid<8)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[tid]=lj1_in[tid];
lj3[tid]=lj3_in[tid];
}
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<lj1[mtype].x) {
numtyp forcecoul, force_lj, force, inv1, inv2, prefactor, _erfc;
numtyp r2inv=(numtyp)1.0/rsq;
if (rsq < lj1[mtype].y) {
if (lj3[mtype].x == (numtyp)2) {
inv1=r2inv*r2inv;
inv2=inv1*inv1;
} else if (lj3[mtype].x == (numtyp)1) {
inv2=r2inv*sqrt(r2inv);
inv1=inv2*inv2;
} else {
inv1=r2inv*r2inv*r2inv;
inv2=inv1;
}
force_lj = factor_lj*inv1*(lj1[mtype].z*inv2-lj1[mtype].w);
} else
force_lj = (numtyp)0.0;
if (rsq < cut_coulsq) {
numtyp r = sqrt(rsq);
numtyp grij = g_ewald * r;
numtyp expm2 = exp(-grij*grij);
numtyp t = (numtyp)1.0 / ((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
} else
forcecoul = (numtyp)0.0;
force = (force_lj + forcecoul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
if (rsq < cut_coulsq)
e_coul += prefactor*(_erfc-factor_coul);
if (rsq < lj1[mtype].y) {
energy += factor_lj*inv1*(lj3[mtype].y*inv2-lj3[mtype].z)-
lj3[mtype].w;
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
red_acc[4][tid]=e_coul;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<5; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
e_coul=red_acc[4][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
*ap1=e_coul;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
#endif
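The erfc evaluation above can be checked in isolation; a minimal
standalone program (illustrative only, assuming nothing beyond <cmath>)
compares the inlined approximation against std::erfc:

// erfc_check.cpp -- standalone verification of the A&S 7.1.26 fit.
#include <cstdio>
#include <cmath>

int main() {
  const double EWALD_P = 0.3275911;
  const double A1 =  0.254829592, A2 = -0.284496736, A3 = 1.421413741,
               A4 = -1.453152027, A5 =  1.061405429;
  for (double x = 0.5; x <= 2.51; x += 0.5) {
    double t = 1.0/(1.0 + EWALD_P*x);
    double approx = t*(A1 + t*(A2 + t*(A3 + t*(A4 + t*A5))))*std::exp(-x*x);
    printf("x=%.1f  approx=%.8f  std::erfc=%.8f\n", x, approx, std::erfc(x));
  }
  return 0;
}

The two columns should agree to roughly 1e-7, the stated accuracy of this
approximation.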

View File

@ -1,170 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifdef USE_OPENCL
#include "cmmc_long_gpu_cl.h"
#else
#include "cmmc_long_gpu_ptx.h"
#endif
#include "cmmc_long_gpu_memory.h"
#include <cassert>
#define CMML_GPU_MemoryT CMML_GPU_Memory<numtyp, acctyp>
extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
template <class numtyp, class acctyp>
CMML_GPU_MemoryT::CMML_GPU_Memory() : ChargeGPUMemory<numtyp,acctyp>(),
_allocated(false) {
}
template <class numtyp, class acctyp>
CMML_GPU_MemoryT::~CMML_GPU_Memory() {
clear();
}
template <class numtyp, class acctyp>
int CMML_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int CMML_GPU_MemoryT::init(const int ntypes, double **host_cutsq,
int **host_cg_type, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen,
double **host_cut_ljsq,
const double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double g_ewald) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,cmmc_long_gpu_kernel);
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_cutsq,
host_cut_ljsq,host_lj1,host_lj2);
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_cg_type,host_lj3,
host_lj4,host_offset);
sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<4; i++) {
host_write[i]=host_special_lj[i];
host_write[i+4]=host_special_coul[i];
}
ucl_copy(sp_lj,host_write,8,false);
_cut_coulsq=host_cut_coulsq;
_qqrd2e=qqrd2e;
_g_ewald=g_ewald;
_allocated=true;
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void CMML_GPU_MemoryT::clear() {
if (!_allocated)
return;
_allocated=false;
lj1.clear();
lj3.clear();
sp_lj.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double CMML_GPU_MemoryT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(CMML_GPU_Memory<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate per-atom energies, forces, and virials
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void CMML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch,
&this->atom->dev_q.begin(), &_cut_coulsq,
&_qqrd2e, &_g_ewald, &this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->atom->dev_q.begin(),
&_cut_coulsq, &_qqrd2e, &_g_ewald,
&this->_threads_per_atom);
}
this->time_pair.stop();
}
template class CMML_GPU_Memory<PRECISION,ACC_PRECISION>;

View File

@ -1,82 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef CMML_GPU_MEMORY_H
#define CMML_GPU_MEMORY_H
#include "charge_gpu_memory.h"
template <class numtyp, class acctyp>
class CMML_GPU_Memory : public ChargeGPUMemory<numtyp, acctyp> {
public:
CMML_GPU_Memory();
~CMML_GPU_Memory();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 if double precision is not supported on the card **/
int init(const int ntypes, double **host_cutsq, int **cg_type,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, double **host_cut_ljsq,
const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double g_ewald);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// lj1.x = cutsq, lj1.y = cutsq_vdw, lj1.z = lj1, lj1.w = lj2
UCL_D_Vec<numtyp4> lj1;
/// lj3.x = cg_type, lj3.y = lj3, lj3.z = lj4, lj3.w = offset
UCL_D_Vec<numtyp4> lj3;
/// Special LJ values [0-3] and Special Coul values [4-7]
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
numtyp _cut_coulsq, _qqrd2e, _g_ewald;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
#endif

View File

@ -1,131 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#include <iostream>
#include <cassert>
#include <math.h>
#include "cmmc_msm_gpu_memory.h"
using namespace std;
static CMMM_GPU_Memory<PRECISION,ACC_PRECISION> CMMMMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int cmmm_gpu_init(const int ntypes, double **cutsq, int **cg_type,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **offset, double *special_lj,
const int inum, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, int &gpu_mode,
FILE *screen, double **host_cut_ljsq, double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const int smooth) {
CMMMMF.clear();
gpu_mode=CMMMMF.device->gpu_mode();
double gpu_split=CMMMMF.device->particle_split();
int first_gpu=CMMMMF.device->first_device();
int last_gpu=CMMMMF.device->last_device();
int world_me=CMMMMF.device->world_me();
int gpu_rank=CMMMMF.device->gpu_rank();
int procs_per_gpu=CMMMMF.device->procs_per_gpu();
CMMMMF.device->init_message(screen,"cg/cmm/coul/msm",first_gpu,last_gpu);
bool message=false;
if (CMMMMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=CMMMMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e,smooth);
CMMMMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=CMMMMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen,
host_cut_ljsq, host_cut_coulsq, host_special_coul,
qqrd2e,smooth);
CMMMMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
CMMMMF.estimate_gpu_overhead();
return init_ok;
}
void cmmm_gpu_clear() {
CMMMMF.clear();
}
int** cmmm_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success, double *host_q, double *boxlo,
double *prd) {
return CMMMMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success,
host_q, boxlo, prd);
}
void cmmm_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q,
const int nlocal, double *boxlo, double *prd) {
CMMMMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
host_q,nlocal,boxlo,prd);
}
double cmmm_gpu_bytes() {
return CMMMMF.host_memory_usage();
}

View File

@ -1,531 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef CMMM_GPU_KERNEL
#define CMMM_GPU_KERNEL
#ifdef NV_KERNEL
#include "nv_kernel_def.h"
texture<float4> pos_tex;
texture<float> q_tex;
#ifdef _DOUBLE_DOUBLE
__inline double4 fetch_pos(const int& i, const double4 *pos)
{
return pos[i];
}
__inline double fetch_q(const int& i, const double *q)
{
return q[i];
}
#else
__inline float4 fetch_pos(const int& i, const float4 *pos)
{
return tex1Dfetch(pos_tex, i);
}
__inline float fetch_q(const int& i, const float *q)
{
return tex1Dfetch(q_tex, i);
}
#endif
#else
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#define GLOBAL_ID_X get_global_id(0)
#define THREAD_ID_X get_local_id(0)
#define BLOCK_ID_X get_group_id(0)
#define BLOCK_SIZE_X get_local_size(0)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#define __inline inline
#define fetch_pos(i,y) x_[i]
#define fetch_q(i,y) q_[i]
#define BLOCK_PAIR 64
#define MAX_SHARED_TYPES 8
#endif
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
#define numtyp4 double4
#define acctyp double
#define acctyp4 double4
#endif
#ifdef _SINGLE_DOUBLE
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp double
#define acctyp4 double4
#endif
#ifndef numtyp
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp float
#define acctyp4 float4
#endif
#define SBBITS 30
#define NEIGHMASK 0x3FFFFFFF
__inline int sbmask(int j) { return j >> SBBITS & 3; }
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const int smooth, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp sp_lj[8];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
sp_lj[4]=sp_lj_in[4];
sp_lj[5]=sp_lj_in[5];
sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7];
__local numtyp _ia;
__local numtyp _ia2;
__local numtyp _ia3;
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
_ia=(numtyp)-1.0/sqrt(cut_coulsq);
// Sign fix: _ia2 must be positive (as in kernel_pair_fast below) so that
// r2_ia2 equals rsq/cut_coulsq; a negative _ia2 flips the polynomial
// terms and breaks the energy zero at the cutoff.
_ia2=(numtyp)1.0/cut_coulsq;
_ia3=_ia2*_ia;
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (rsq<lj1[mtype].x) {
numtyp forcecoul, force_lj, force, inv1, inv2, prefactor;
numtyp r2inv=(numtyp)1.0/rsq;
if (rsq < lj1[mtype].y) {
if (lj3[mtype].x == (numtyp)2) {
inv1=r2inv*r2inv;
inv2=inv1*inv1;
} else if (lj3[mtype].x == (numtyp)1) {
inv2=r2inv*sqrt(r2inv);
inv1=inv2*inv2;
} else {
inv1=r2inv*r2inv*r2inv;
inv2=inv1;
}
force_lj = factor_lj*inv1*(lj1[mtype].z*inv2-lj1[mtype].w);
} else
force_lj = (numtyp)0.0;
numtyp ir, r2_ia2, r4_ia4, r6_ia6;
if (rsq < cut_coulsq) {
ir = (numtyp)1.0/sqrt(rsq);
prefactor = qqrd2e*qtmp*fetch_q(j,q_);
r2_ia2 = rsq*_ia2;
r4_ia4 = r2_ia2*r2_ia2;
if (smooth==0)
forcecoul = prefactor*(_ia3*((numtyp)-4.375+(numtyp)5.25*r2_ia2-
(numtyp)1.875*r4_ia4)-ir/rsq-
factor_coul*ir);
else {
r6_ia6 = r2_ia2*r4_ia4;
forcecoul = prefactor*(_ia3*((numtyp)-6.5625+(numtyp)11.8125*
r2_ia2-(numtyp)8.4375*r4_ia4+
(numtyp)2.1875*r6_ia6)-ir/rsq-
factor_coul*ir);
}
} else {
forcecoul = (numtyp)0.0;
prefactor = (numtyp)0.0;
}
force = forcecoul + force_lj * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
if (rsq < cut_coulsq)
if (smooth==0)
e_coul += prefactor*(ir+_ia*((numtyp)2.1875-(numtyp)2.1875*r2_ia2+
(numtyp)1.3125*r4_ia4-
(numtyp)0.3125*r4_ia4*r2_ia2)-
factor_coul*ir);
else
e_coul += prefactor*(ir+_ia*((numtyp)2.4609375-(numtyp)3.28125*
r2_ia2+(numtyp)2.953125*r4_ia4-
(numtyp)1.40625*r6_ia6+
(numtyp)0.2734375*r4_ia4*r4_ia4));
if (rsq < lj1[mtype].y) {
energy += factor_lj*inv1*(lj3[mtype].y*inv2-lj3[mtype].z)-
lj3[mtype].w;
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
red_acc[4][tid]=e_coul;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<5; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
e_coul=red_acc[4][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
*ap1=e_coul;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const int smooth, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[8];
if (tid<8)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[tid]=lj1_in[tid];
lj3[tid]=lj3_in[tid];
}
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__local numtyp _ia;
__local numtyp _ia2;
__local numtyp _ia3;
_ia=(numtyp)-1.0/sqrt(cut_coulsq);
_ia2=(numtyp)1.0/cut_coulsq;
_ia3=_ia2*_ia;
__syncthreads();
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<lj1[mtype].x) {
numtyp forcecoul, force_lj, force, inv1, inv2, prefactor;
numtyp r2inv=(numtyp)1.0/rsq;
if (rsq < lj1[mtype].y) {
if (lj3[mtype].x == (numtyp)2) {
inv1=r2inv*r2inv;
inv2=inv1*inv1;
} else if (lj3[mtype].x == (numtyp)1) {
inv2=r2inv*sqrt(r2inv);
inv1=inv2*inv2;
} else {
inv1=r2inv*r2inv*r2inv;
inv2=inv1;
}
force_lj = factor_lj*inv1*(lj1[mtype].z*inv2-lj1[mtype].w);
} else
force_lj = (numtyp)0.0;
numtyp ir, r2_ia2, r4_ia4, r6_ia6;
if (rsq < cut_coulsq) {
ir = (numtyp)1.0/sqrt(rsq);
prefactor = qqrd2e*qtmp*fetch_q(j,q_);
r2_ia2 = rsq*_ia2;
r4_ia4 = r2_ia2*r2_ia2;
if (smooth==0)
forcecoul = prefactor*(_ia3*((numtyp)-4.375+(numtyp)5.25*r2_ia2-
(numtyp)1.875*r4_ia4)-ir/rsq-
factor_coul*ir);
else {
r6_ia6 = r2_ia2*r4_ia4;
forcecoul = prefactor*(_ia3*((numtyp)-6.5625+(numtyp)11.8125*
r2_ia2-(numtyp)8.4375*r4_ia4+
(numtyp)2.1875*r6_ia6)-ir/rsq-
factor_coul*ir);
}
} else {
forcecoul = (numtyp)0.0;
prefactor = (numtyp)0.0;
}
force = forcecoul + force_lj * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
if (rsq < cut_coulsq)
if (smooth==0)
e_coul += prefactor*(ir+_ia*((numtyp)2.1875-(numtyp)2.1875*r2_ia2+
(numtyp)1.3125*r4_ia4-
(numtyp)0.3125*r4_ia4*r2_ia2)-
factor_coul*ir);
else
e_coul += prefactor*(ir+_ia*((numtyp)2.4609375-(numtyp)3.28125*
r2_ia2+(numtyp)2.953125*r4_ia4-
(numtyp)1.40625*r6_ia6+
(numtyp)0.2734375*r4_ia4*r4_ia4));
if (rsq < lj1[mtype].y) {
energy += factor_lj*inv1*(lj3[mtype].y*inv2-lj3[mtype].z)-
lj3[mtype].w;
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
red_acc[4][tid]=e_coul;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<5; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
e_coul=red_acc[4][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
*ap1=e_coul;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
#endif
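A standalone check (illustrative only) that the smooth==0 MSM energy
expression above vanishes at the Coulomb cutoff when _ia2 is taken
positive, matching the kernel_pair_fast convention:

// msm_cutoff_check.cpp -- standalone; evaluates the smoothed 1/r kernel
// without the charge prefactor or special-bond correction.
#include <cstdio>
#include <cmath>

int main() {
  double rc = 10.0, cut_coulsq = rc*rc;
  double _ia  = -1.0/std::sqrt(cut_coulsq);   // -1/rc
  double _ia2 =  1.0/cut_coulsq;              // +1/rc^2
  for (double r = 2.5; r <= rc; r += 2.5) {
    double ir = 1.0/r;
    double r2_ia2 = r*r*_ia2, r4_ia4 = r2_ia2*r2_ia2;
    double e = ir + _ia*(2.1875 - 2.1875*r2_ia2 + 1.3125*r4_ia4
                         - 0.3125*r4_ia4*r2_ia2);
    printf("r = %5.2f   e = % .6e\n", r, e);  // tends to 0 as r -> rc
  }
  return 0;
}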

View File

@ -1,169 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifdef USE_OPENCL
#include "cmmc_msm_gpu_cl.h"
#else
#include "cmmc_msm_gpu_ptx.h"
#endif
#include "cmmc_msm_gpu_memory.h"
#include <cassert>
#define CMMM_GPU_MemoryT CMMM_GPU_Memory<numtyp, acctyp>
extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
template <class numtyp, class acctyp>
CMMM_GPU_MemoryT::CMMM_GPU_Memory() : ChargeGPUMemory<numtyp,acctyp>(),
_allocated(false) {
}
template <class numtyp, class acctyp>
CMMM_GPU_MemoryT::~CMMM_GPU_Memory() {
clear();
}
template <class numtyp, class acctyp>
int CMMM_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int CMMM_GPU_MemoryT::init(const int ntypes, double **host_cutsq,
int **host_cg_type, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen,
double **host_cut_ljsq,
const double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const int smooth) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,cmmc_msm_gpu_kernel);
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_cutsq,
host_cut_ljsq,host_lj1,host_lj2);
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_cg_type,host_lj3,
host_lj4,host_offset);
sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<4; i++) {
host_write[i]=host_special_lj[i];
host_write[i+4]=host_special_coul[i];
}
ucl_copy(sp_lj,host_write,8,false);
_cut_coulsq=host_cut_coulsq;
_qqrd2e=qqrd2e;
_smooth=smooth;
_allocated=true;
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void CMMM_GPU_MemoryT::clear() {
if (!_allocated)
return;
_allocated=false;
lj1.clear();
lj3.clear();
sp_lj.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double CMMM_GPU_MemoryT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(CMMM_GPU_Memory<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and virials
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void CMMM_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
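// each block of BX threads handles BX/t_per_atom atoms, so launch
// enough blocks to cover all inum atoms owned by this process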
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch,
&this->atom->dev_q.begin(), &_cut_coulsq,
&_qqrd2e, &_smooth, &this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->atom->dev_q.begin(), &_cut_coulsq,
&_qqrd2e, &_smooth, &this->_threads_per_atom);
}
this->time_pair.stop();
}
template class CMMM_GPU_Memory<PRECISION,ACC_PRECISION>;

View File

@ -1,83 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef CMMM_GPU_MEMORY_H
#define CMMM_GPU_MEMORY_H
#include "charge_gpu_memory.h"
template <class numtyp, class acctyp>
class CMMM_GPU_Memory : public ChargeGPUMemory<numtyp, acctyp> {
public:
CMMM_GPU_Memory();
~CMMM_GPU_Memory();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for this GPU
* - -5 if double precision is not supported on the card **/
int init(const int ntypes, double **host_cutsq, int **host_cg_type,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, double **host_cut_ljsq,
const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const int smooth);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// lj1.x = cutsq, lj1.y = cutsq_vdw, lj1.z = lj1, lj1.w = lj2
UCL_D_Vec<numtyp4> lj1;
/// lj3.x = cg_type, lj3.y = lj3, lj3.z = lj4, lj3.w = offset
UCL_D_Vec<numtyp4> lj3;
/// Special LJ values [0-3] and Special Coul values [4-7]
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
numtyp _cut_coulsq, _qqrd2e;
private:
bool _allocated;
int _smooth;
void loop(const bool _eflag, const bool _vflag);
};
#endif

View File

@ -1,124 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#include <iostream>
#include <cassert>
#include <math.h>
#include "coul_long_gpu_memory.h"
using namespace std;
static CL_GPU_Memory<PRECISION,ACC_PRECISION> CLMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int cl_gpu_init(const int inum, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, int &gpu_mode,
FILE *screen, double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double g_ewald) {
CLMF.clear();
gpu_mode=CLMF.device->gpu_mode();
double gpu_split=CLMF.device->particle_split();
int first_gpu=CLMF.device->first_device();
int last_gpu=CLMF.device->last_device();
int world_me=CLMF.device->world_me();
int gpu_rank=CLMF.device->gpu_rank();
int procs_per_gpu=CLMF.device->procs_per_gpu();
CLMF.device->init_message(screen,"coul/long",first_gpu,last_gpu);
bool message=false;
if (CLMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=CLMF.init(inum, nall, 300, maxspecial, cell_size, gpu_split,
screen, host_cut_coulsq, host_special_coul, qqrd2e,
g_ewald);
CLMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=CLMF.init(inum, nall, 300, maxspecial, cell_size, gpu_split,
screen, host_cut_coulsq, host_special_coul,
qqrd2e, g_ewald);
CLMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
CLMF.estimate_gpu_overhead();
return init_ok;
}
void cl_gpu_clear() {
CLMF.clear();
}
int** cl_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success, double *host_q, double *boxlo,
double *prd) {
return CLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success,
host_q, boxlo, prd);
}
void cl_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q,
const int nlocal, double *boxlo, double *prd) {
CLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
host_q,nlocal,boxlo,prd);
}
double cl_gpu_bytes() {
return CLMF.host_memory_usage();
}

View File

@ -1,411 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef CL_GPU_KERNEL
#define CL_GPU_KERNEL
#ifdef NV_KERNEL
#include "nv_kernel_def.h"
texture<float4> pos_tex;
texture<float> q_tex;
#ifdef _DOUBLE_DOUBLE
__inline double4 fetch_pos(const int& i, const double4 *pos)
{
return pos[i];
}
__inline double fetch_q(const int& i, const double *q)
{
return q[i];
}
#else
__inline float4 fetch_pos(const int& i, const float4 *pos)
{
return tex1Dfetch(pos_tex, i);
}
__inline float fetch_q(const int& i, const float *q)
{
return tex1Dfetch(q_tex, i);
}
#endif
#else
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#define GLOBAL_ID_X get_global_id(0)
#define THREAD_ID_X get_local_id(0)
#define BLOCK_ID_X get_group_id(0)
#define BLOCK_SIZE_X get_local_size(0)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#define __inline inline
#define fetch_pos(i,y) x_[i]
#define fetch_q(i,y) q_[i]
#define BLOCK_PAIR 64
#define MAX_SHARED_TYPES 8
#endif
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
#define numtyp4 double4
#define acctyp double
#define acctyp4 double4
#endif
#ifdef _SINGLE_DOUBLE
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp double
#define acctyp4 double4
#endif
#ifndef numtyp
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp float
#define acctyp4 float4
#endif
#define EWALD_F (numtyp)1.12837917
#define EWALD_P (numtyp)0.3275911
#define A1 (numtyp)0.254829592
#define A2 (numtyp)-0.284496736
#define A3 (numtyp)1.421413741
#define A4 (numtyp)-1.453152027
#define A5 (numtyp)1.061405429
#define SBBITS 30
#define NEIGHMASK 0x3FFFFFFF
__inline int sbmask(int j) { return j >> SBBITS & 3; }
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_cl_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp sp_cl[4];
sp_cl[0]=sp_cl_in[0];
sp_cl[1]=sp_cl_in[1];
sp_cl[2]=sp_cl_in[2];
sp_cl[3]=sp_cl_in[3];
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_coul;
factor_coul = (numtyp)1.0-sp_cl[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq < cut_coulsq) {
numtyp r2inv=(numtyp)1.0/rsq;
numtyp force, prefactor, _erfc;
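// real-space Ewald term: erfc(g_ewald*r) is evaluated with the
// Abramowitz & Stegun 7.1.26 polynomial approximation (A1-A5, EWALD_P)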
numtyp r = sqrt(rsq);
numtyp grij = g_ewald * r;
numtyp expm2 = exp(-grij*grij);
numtyp t = (numtyp)1.0 / ((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
force = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
e_coul += prefactor*(_erfc-factor_coul);
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=e_coul;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<4; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
e_coul=red_acc[3][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=(acctyp)0;
ap1+=inum;
*ap1=e_coul;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_cl_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp sp_cl[4];
if (tid<4)
sp_cl[tid]=sp_cl_in[tid];
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
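// barrier so every thread sees the sp_cl values loaded into shared
// memory above before they are used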
__syncthreads();
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_coul;
factor_coul = (numtyp)1.0-sp_cl[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq < cut_coulsq) {
numtyp r2inv=(numtyp)1.0/rsq;
numtyp force, prefactor, _erfc;
numtyp r = sqrt(rsq);
numtyp grij = g_ewald * r;
numtyp expm2 = exp(-grij*grij);
numtyp t = (numtyp)1.0 / ((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
force = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
e_coul += prefactor*(_erfc-factor_coul);
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=e_coul;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<4; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
e_coul=red_acc[3][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=(acctyp)0;
ap1+=inum;
*ap1=e_coul;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
#endif

View File

@ -1,158 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifdef USE_OPENCL
#include "coul_long_gpu_cl.h"
#else
#include "coul_long_gpu_ptx.h"
#endif
#include "coul_long_gpu_memory.h"
#include <cassert>
#define CL_GPU_MemoryT CL_GPU_Memory<numtyp, acctyp>
extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
template <class numtyp, class acctyp>
CL_GPU_MemoryT::CL_GPU_Memory() : ChargeGPUMemory<numtyp,acctyp>(),
_allocated(false) {
}
template <class numtyp, class acctyp>
CL_GPU_MemoryT::~CL_GPU_Memory() {
clear();
}
template <class numtyp, class acctyp>
int CL_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int CL_GPU_MemoryT::init(const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen,
const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double g_ewald) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,
gpu_split,_screen,coul_long_gpu_kernel);
if (success!=0)
return success;
// we don't have atom types for coulomb only,
// but go with the minimum so that we can use
// the same infrastructure as lj/cut/coul/long/gpu.
int lj_types=1;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
sp_cl.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<4; i++) {
host_write[i]=host_special_coul[i];
}
ucl_copy(sp_cl,host_write,4,false);
_cut_coulsq=host_cut_coulsq;
_qqrd2e=qqrd2e;
_g_ewald=g_ewald;
_allocated=true;
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_cl.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void CL_GPU_MemoryT::clear() {
if (!_allocated)
return;
_allocated=false;
lj1.clear();
lj3.clear();
sp_cl.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double CL_GPU_MemoryT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(CL_GPU_Memory<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and virials
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void CL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_cl.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->dev_q.begin(),
&_cut_coulsq, &_qqrd2e, &_g_ewald,
&this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_cl.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->atom->dev_q.begin(), &_cut_coulsq,
&_qqrd2e, &_g_ewald, &this->_threads_per_atom);
}
this->time_pair.stop();
}
template class CL_GPU_Memory<PRECISION,ACC_PRECISION>;

View File

@ -1,79 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef CL_GPU_MEMORY_H
#define CL_GPU_MEMORY_H
#include "charge_gpu_memory.h"
template <class numtyp, class acctyp>
class CL_GPU_Memory : public ChargeGPUMemory<numtyp, acctyp> {
public:
CL_GPU_Memory();
~CL_GPU_Memory();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for this GPU
* - -5 if double precision is not supported on the card **/
int init(const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen,
const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double g_ewald);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// lj1 dummy
UCL_D_Vec<numtyp4> lj1;
/// lj3 dummy
UCL_D_Vec<numtyp4> lj3;
/// Special Coul values [0-3]
UCL_D_Vec<numtyp> sp_cl;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
numtyp _cut_coulsq, _qqrd2e, _g_ewald;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
#endif

View File

@ -1,136 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#include <iostream>
#include <cassert>
#include <math.h>
#include "crml_gpu_memory.h"
using namespace std;
static CRML_GPU_Memory<PRECISION,ACC_PRECISION> CRMLMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen,
double host_cut_ljsq, double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double g_ewald, const double cut_lj_innersq,
const double denom_lj, double **epsilon,
double **sigma, const bool mix_arithmetic) {
CRMLMF.clear();
gpu_mode=CRMLMF.device->gpu_mode();
double gpu_split=CRMLMF.device->particle_split();
int first_gpu=CRMLMF.device->first_device();
int last_gpu=CRMLMF.device->last_device();
int world_me=CRMLMF.device->world_me();
int gpu_rank=CRMLMF.device->gpu_rank();
int procs_per_gpu=CRMLMF.device->procs_per_gpu();
CRMLMF.device->init_message(screen,"lj/charmm/coul/long",first_gpu,last_gpu);
bool message=false;
if (CRMLMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=CRMLMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e, g_ewald,
cut_lj_innersq, denom_lj, epsilon, sigma, mix_arithmetic);
CRMLMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=CRMLMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen,
host_cut_ljsq, host_cut_coulsq, host_special_coul,
qqrd2e, g_ewald, cut_lj_innersq, denom_lj, epsilon,
sigma, mix_arithmetic);
CRMLMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
CRMLMF.estimate_gpu_overhead();
return init_ok;
}
void crml_gpu_clear() {
CRMLMF.clear();
}
int** crml_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success, double *host_q, double *boxlo,
double *prd) {
return CRMLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success,
host_q, boxlo, prd);
}
void crml_gpu_compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const double cpu_time,
bool &success, double *host_q, const int nlocal,
double *boxlo, double *prd) {
CRMLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,
eflag,vflag,eatom,vatom,host_start,cpu_time,success,host_q,
nlocal,boxlo,prd);
}
double crml_gpu_bytes() {
return CRMLMF.host_memory_usage();
}

View File

@ -1,499 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef CRML_GPU_KERNEL
#define CRML_GPU_KERNEL
#ifdef NV_KERNEL
#include "nv_kernel_def.h"
texture<float4> pos_tex;
texture<float> q_tex;
#ifdef _DOUBLE_DOUBLE
__inline double4 fetch_pos(const int& i, const double4 *pos)
{
return pos[i];
}
__inline double fetch_q(const int& i, const double *q)
{
return q[i];
}
#else
__inline float4 fetch_pos(const int& i, const float4 *pos)
{
return tex1Dfetch(pos_tex, i);
}
__inline float fetch_q(const int& i, const float *q)
{
return tex1Dfetch(q_tex, i);
}
#endif
#else
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#define GLOBAL_ID_X get_global_id(0)
#define THREAD_ID_X get_local_id(0)
#define BLOCK_ID_X get_group_id(0)
#define BLOCK_SIZE_X get_local_size(0)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#define __inline inline
#define fetch_pos(i,y) x_[i]
#define fetch_q(i,y) q_[i]
#define BLOCK_BIO_PAIR 64
#endif
#define MAX_BIO_SHARED_TYPES 128
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
#define numtyp4 double4
#define acctyp double
#define acctyp4 double4
#endif
#ifdef _SINGLE_DOUBLE
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp double
#define acctyp4 double4
#endif
#ifndef numtyp
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp float
#define acctyp4 float4
#endif
#define EWALD_F (numtyp)1.12837917
#define EWALD_P (numtyp)0.3275911
#define A1 (numtyp)0.254829592
#define A2 (numtyp)-0.284496736
#define A3 (numtyp)1.421413741
#define A4 (numtyp)-1.453152027
#define A5 (numtyp)1.061405429
#define SBBITS 30
#define NEIGHMASK 0x3FFFFFFF
__inline int sbmask(int j) { return j >> SBBITS & 3; }
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
const int lj_types, __global numtyp *sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const numtyp denom_lj,
const numtyp cut_bothsq, const numtyp cut_ljsq,
const numtyp cut_lj_innersq, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp sp_lj[8];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
sp_lj[4]=sp_lj_in[4];
sp_lj[5]=sp_lj_in[5];
sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7];
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (rsq<cut_bothsq) {
numtyp r2inv=(numtyp)1.0/rsq;
numtyp forcecoul, force_lj, force, r6inv, prefactor, _erfc, switch1;
if (rsq < cut_ljsq) {
r6inv = r2inv*r2inv*r2inv;
force_lj = factor_lj*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
if (rsq > cut_lj_innersq) {
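// CHARMM switching region: switch1 smoothly tapers the LJ energy
// between cut_lj_innersq and cut_ljsq; switch2 carries the force
// contribution from the derivative of the switching function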
switch1 = (cut_ljsq-rsq);
numtyp switch2 = (numtyp)12.0*rsq*switch1*(rsq-cut_lj_innersq)/
denom_lj;
switch1 *= switch1;
switch1 *= (cut_ljsq+(numtyp)2.0*rsq-(numtyp)3.0*cut_lj_innersq)/
denom_lj;
switch2 *= r6inv*(lj1[mtype].z*r6inv-lj1[mtype].w);
force_lj = force_lj*switch1+switch2;
}
} else
force_lj = (numtyp)0.0;
if (rsq < cut_coulsq) {
numtyp r = sqrt(rsq);
numtyp grij = g_ewald * r;
numtyp expm2 = exp(-grij*grij);
numtyp t = (numtyp)1.0 / ((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
} else
forcecoul = (numtyp)0.0;
force = (force_lj + forcecoul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
if (rsq < cut_coulsq)
e_coul += prefactor*(_erfc-factor_coul);
if (rsq < cut_ljsq) {
numtyp e=r6inv*(lj1[mtype].z*r6inv-lj1[mtype].w);
if (rsq > cut_lj_innersq)
e *= switch1;
energy+=factor_lj*e;
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_BIO_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
red_acc[4][tid]=e_coul;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<5; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
e_coul=red_acc[4][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
*ap1=e_coul;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
__global numtyp* sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const numtyp denom_lj,
const numtyp cut_bothsq, const numtyp cut_ljsq,
const numtyp cut_lj_innersq,
const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp2 ljd[MAX_BIO_SHARED_TYPES];
__local numtyp sp_lj[8];
if (tid<8)
sp_lj[tid]=sp_lj_in[tid];
ljd[tid]=ljd_in[tid];
if (tid+BLOCK_BIO_PAIR<MAX_BIO_SHARED_TYPES)
ljd[tid+BLOCK_BIO_PAIR]=ljd_in[tid+BLOCK_BIO_PAIR];
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<cut_bothsq) {
numtyp r2inv=(numtyp)1.0/rsq;
numtyp forcecoul, force_lj, force, prefactor, _erfc, switch1;
numtyp lj3, lj4;
if (rsq < cut_ljsq) {
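// mix per-type CHARMM parameters on the fly: geometric mean for
// epsilon (ljd.x), arithmetic mean for sigma (ljd.y); lj3 and lj4
// are then the 12- and 6-power terms of the potential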
numtyp eps = sqrt(ljd[itype].x*ljd[jtype].x);
numtyp sig6 = (numtyp)0.5 * (ljd[itype].y+ljd[jtype].y);
numtyp sig_r_6 = sig6*sig6*r2inv;
sig_r_6 = sig_r_6*sig_r_6*sig_r_6;
lj4 = (numtyp)4.0*eps*sig_r_6;
lj3 = lj4*sig_r_6;
force_lj = factor_lj*((numtyp)12.0 * lj3 - (numtyp)6.0 * lj4);
if (rsq > cut_lj_innersq) {
switch1 = (cut_ljsq-rsq);
numtyp switch2 = (numtyp)12.0*rsq*switch1*(rsq-cut_lj_innersq)/
denom_lj;
switch1 *= switch1;
switch1 *= (cut_ljsq+(numtyp)2.0*rsq-(numtyp)3.0*cut_lj_innersq)/
denom_lj;
switch2 *= lj3-lj4;
force_lj = force_lj*switch1+switch2;
}
} else
force_lj = (numtyp)0.0;
if (rsq < cut_coulsq) {
numtyp r = sqrt(rsq);
numtyp grij = g_ewald * r;
numtyp expm2 = exp(-grij*grij);
numtyp t = (numtyp)1.0 / ((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
} else
forcecoul = (numtyp)0.0;
force = (force_lj + forcecoul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
if (rsq < cut_coulsq)
e_coul += prefactor*(_erfc-factor_coul);
if (rsq < cut_ljsq) {
numtyp e=lj3-lj4;
if (rsq > cut_lj_innersq)
e *= switch1;
energy+=factor_lj*e;
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_BIO_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
red_acc[4][tid]=e_coul;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<5; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
e_coul=red_acc[4][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
*ap1=e_coul;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
#endif

View File

@ -1,175 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifdef USE_OPENCL
#include "crml_gpu_cl.h"
#else
#include "crml_gpu_ptx.h"
#endif
#include "crml_gpu_memory.h"
#include <cassert>
#define CRML_GPU_MemoryT CRML_GPU_Memory<numtyp, acctyp>
extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
template <class numtyp, class acctyp>
CRML_GPU_MemoryT::CRML_GPU_Memory() : ChargeGPUMemory<numtyp,acctyp>(),
_allocated(false) {
}
template <class numtyp, class acctyp>
CRML_GPU_MemoryT::~CRML_GPU_Memory() {
clear();
}
template <class numtyp, class acctyp>
int CRML_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int CRML_GPU_MemoryT::init(const int ntypes,
double host_cut_bothsq, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen,
double host_cut_ljsq, const double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double g_ewald, const double cut_lj_innersq,
const double denom_lj, double **epsilon,
double **sigma, const bool mix_arithmetic) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,crml_gpu_kernel);
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
if (this->_block_bio_size>=64 && mix_arithmetic)
shared_types=true;
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
int h_size=lj_types*lj_types;
int max_bio_shared_types=this->device->max_bio_shared_types();
if (h_size<max_bio_shared_types)
h_size=max_bio_shared_types;
UCL_H_Vec<numtyp> host_write(h_size*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<h_size*32; i++)
host_write[i]=0.0;
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
host_lj3,host_lj4);
ljd.alloc(max_bio_shared_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->self_pack2(ntypes,ljd,host_write,epsilon,sigma);
sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<4; i++) {
host_write[i]=host_special_lj[i];
host_write[i+4]=host_special_coul[i];
}
ucl_copy(sp_lj,host_write,8,false);
_cut_bothsq = host_cut_bothsq;
_cut_coulsq = host_cut_coulsq;
_cut_ljsq = host_cut_ljsq;
_cut_lj_innersq = cut_lj_innersq;
_qqrd2e=qqrd2e;
_g_ewald=g_ewald;
_denom_lj=denom_lj;
_allocated=true;
this->_max_bytes=lj1.row_bytes()+ljd.row_bytes()+sp_lj.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void CRML_GPU_MemoryT::clear() {
if (!_allocated)
return;
_allocated=false;
lj1.clear();
ljd.clear();
sp_lj.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double CRML_GPU_MemoryT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(CRML_GPU_Memory<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and virials
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void CRML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->_block_bio_size;
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &ljd.begin(),
&sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->dev_q.begin(),
&_cut_coulsq, &_qqrd2e, &_g_ewald, &_denom_lj,
&_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq,
&this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->atom->dev_q.begin(),
&_cut_coulsq, &_qqrd2e, &_g_ewald, &_denom_lj,
&_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq,
&this->_threads_per_atom);
}
this->time_pair.stop();
}
template class CRML_GPU_Memory<PRECISION,ACC_PRECISION>;

View File

@ -1,86 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef CRML_GPU_MEMORY_H
#define CRML_GPU_MEMORY_H
#include "charge_gpu_memory.h"
template <class numtyp, class acctyp>
class CRML_GPU_Memory : public ChargeGPUMemory<numtyp, acctyp> {
public:
CRML_GPU_Memory();
~CRML_GPU_Memory();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for this GPU
* - -5 if double precision is not supported on the card **/
int init(const int ntypes, double host_cut_bothsq,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, double host_cut_ljsq,
const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double g_ewald,
const double cut_lj_innersq, const double denom_lj,
double **epsilon, double **sigma, const bool mix_arithmetic);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// x = lj1, y = lj2, z = lj3, w = lj4
UCL_D_Vec<numtyp4> lj1;
/// x = epsilon, y = sigma
UCL_D_Vec<numtyp2> ljd;
/// Special LJ values [0-3] and Special Coul values [4-7]
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
numtyp _qqrd2e, _g_ewald, _denom_lj;
numtyp _cut_coulsq, _cut_bothsq, _cut_ljsq, _cut_lj_innersq;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
#endif

View File

@ -1,424 +0,0 @@
// **************************************************************************
// ellipsoid_extra.h
// -------------------
// W. Michael Brown
//
// Device code for Ellipsoid math routines
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : brownw@ornl.gov
// ***************************************************************************/
#ifndef ELLIPSOID_EXTRA_H
#define ELLIPSOID_EXTRA_H
enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
#define numtyp4 double4
#define acctyp double
#define acctyp4 double4
#endif
#ifdef _SINGLE_DOUBLE
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp double
#define acctyp4 double4
#endif
#ifndef numtyp
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp float
#define acctyp4 float4
#endif
#ifdef NV_KERNEL
#include "nv_kernel_def.h"
#else
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#define GLOBAL_ID_X get_global_id(0)
#define THREAD_ID_X get_local_id(0)
#define BLOCK_ID_X get_group_id(0)
#define BLOCK_SIZE_X get_local_size(0)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#define __inline inline
#define BLOCK_PAIR 64
#define MAX_SHARED_TYPES 8
#endif
/* ----------------------------------------------------------------------
dot product of 2 vectors
------------------------------------------------------------------------- */
__inline numtyp gpu_dot3(const numtyp *v1, const numtyp *v2)
{
return v1[0]*v2[0]+v1[1]*v2[1]+v1[2]*v2[2];
}
/* ----------------------------------------------------------------------
cross product of 2 vectors
------------------------------------------------------------------------- */
__inline void gpu_cross3(const numtyp *v1, const numtyp *v2, numtyp *ans)
{
ans[0] = v1[1]*v2[2]-v1[2]*v2[1];
ans[1] = v1[2]*v2[0]-v1[0]*v2[2];
ans[2] = v1[0]*v2[1]-v1[1]*v2[0];
}
/* ----------------------------------------------------------------------
determinant of a matrix
------------------------------------------------------------------------- */
__inline numtyp gpu_det3(const numtyp m[9])
{
numtyp ans = m[0]*m[4]*m[8] - m[0]*m[5]*m[7] -
m[3]*m[1]*m[8] + m[3]*m[2]*m[7] +
m[6]*m[1]*m[5] - m[6]*m[2]*m[4];
return ans;
}
/* ----------------------------------------------------------------------
diagonal matrix times a full matrix
------------------------------------------------------------------------- */
__inline void gpu_diag_times3(const numtyp4 shape, const numtyp m[9],
numtyp ans[9])
{
ans[0] = shape.x*m[0];
ans[1] = shape.x*m[1];
ans[2] = shape.x*m[2];
ans[3] = shape.y*m[3];
ans[4] = shape.y*m[4];
ans[5] = shape.y*m[5];
ans[6] = shape.z*m[6];
ans[7] = shape.z*m[7];
ans[8] = shape.z*m[8];
}
/* ----------------------------------------------------------------------
add two matrices
------------------------------------------------------------------------- */
__inline void gpu_plus3(const numtyp m[9], const numtyp m2[9], numtyp ans[9])
{
ans[0] = m[0]+m2[0];
ans[1] = m[1]+m2[1];
ans[2] = m[2]+m2[2];
ans[3] = m[3]+m2[3];
ans[4] = m[4]+m2[4];
ans[5] = m[5]+m2[5];
ans[6] = m[6]+m2[6];
ans[7] = m[7]+m2[7];
ans[8] = m[8]+m2[8];
}
/* ----------------------------------------------------------------------
multiply the transpose of mat1 times mat2
------------------------------------------------------------------------- */
__inline void gpu_transpose_times3(const numtyp m[9], const numtyp m2[9],
numtyp ans[9])
{
ans[0] = m[0]*m2[0]+m[3]*m2[3]+m[6]*m2[6];
ans[1] = m[0]*m2[1]+m[3]*m2[4]+m[6]*m2[7];
ans[2] = m[0]*m2[2]+m[3]*m2[5]+m[6]*m2[8];
ans[3] = m[1]*m2[0]+m[4]*m2[3]+m[7]*m2[6];
ans[4] = m[1]*m2[1]+m[4]*m2[4]+m[7]*m2[7];
ans[5] = m[1]*m2[2]+m[4]*m2[5]+m[7]*m2[8];
ans[6] = m[2]*m2[0]+m[5]*m2[3]+m[8]*m2[6];
ans[7] = m[2]*m2[1]+m[5]*m2[4]+m[8]*m2[7];
ans[8] = m[2]*m2[2]+m[5]*m2[5]+m[8]*m2[8];
}
/* ----------------------------------------------------------------------
row vector times matrix
------------------------------------------------------------------------- */
__inline void gpu_row_times3(const numtyp *v, const numtyp m[9], numtyp *ans)
{
ans[0] = m[0]*v[0]+v[1]*m[3]+v[2]*m[6];
ans[1] = v[0]*m[1]+m[4]*v[1]+v[2]*m[7];
ans[2] = v[0]*m[2]+v[1]*m[5]+m[8]*v[2];
}
/* ----------------------------------------------------------------------
solve Ax = b or M ans = v
use gaussian elimination & partial pivoting on matrix
error_flag set to 2 if bad matrix inversion attempted
------------------------------------------------------------------------- */
__inline void gpu_mldivide3(const numtyp m[9], const numtyp *v, numtyp *ans,
__global int *error_flag)
{
// create augmented matrix for pivoting
numtyp aug[12], t;
aug[3] = v[0];
aug[0] = m[0];
aug[1] = m[1];
aug[2] = m[2];
aug[7] = v[1];
aug[4] = m[3];
aug[5] = m[4];
aug[6] = m[5];
aug[11] = v[2];
aug[8] = m[6];
aug[9] = m[7];
aug[10] = m[8];
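  // partial pivoting: move the row with the largest leading magnitude into
  // the pivot position (rows are aug[0..3], aug[4..7], aug[8..11])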
if (fabs(aug[4]) > fabs(aug[0])) {
numtyp swapt;
swapt=aug[0]; aug[0]=aug[4]; aug[4]=swapt;
swapt=aug[1]; aug[1]=aug[5]; aug[5]=swapt;
swapt=aug[2]; aug[2]=aug[6]; aug[6]=swapt;
swapt=aug[3]; aug[3]=aug[7]; aug[7]=swapt;
}
if (fabs(aug[8]) > fabs(aug[0])) {
numtyp swapt;
swapt=aug[0]; aug[0]=aug[8]; aug[8]=swapt;
swapt=aug[1]; aug[1]=aug[9]; aug[9]=swapt;
swapt=aug[2]; aug[2]=aug[10]; aug[10]=swapt;
swapt=aug[3]; aug[3]=aug[11]; aug[11]=swapt;
}
  // fallback row selection when the pivot is zero (the original code
  // expressed this as unrolled branches with constant conditions such as
  // "if (0!=0)"; the dead branches and self-swaps are removed here)
  if (aug[0] == (numtyp)0.0) {
    if (aug[4] != (numtyp)0.0) {
      numtyp swapt;
      swapt=aug[0]; aug[0]=aug[4]; aug[4]=swapt;
      swapt=aug[1]; aug[1]=aug[5]; aug[5]=swapt;
      swapt=aug[2]; aug[2]=aug[6]; aug[6]=swapt;
      swapt=aug[3]; aug[3]=aug[7]; aug[7]=swapt;
    } else if (aug[8] != (numtyp)0.0) {
      numtyp swapt;
      swapt=aug[0]; aug[0]=aug[8]; aug[8]=swapt;
      swapt=aug[1]; aug[1]=aug[9]; aug[9]=swapt;
      swapt=aug[2]; aug[2]=aug[10]; aug[10]=swapt;
      swapt=aug[3]; aug[3]=aug[11]; aug[11]=swapt;
    } else
      *error_flag=2;
  }
t = aug[4]/aug[0];
aug[5]-=t*aug[1];
aug[6]-=t*aug[2];
aug[7]-=t*aug[3];
t = aug[8]/aug[0];
aug[9]-=t*aug[1];
aug[10]-=t*aug[2];
aug[11]-=t*aug[3];
if (fabs(aug[9]) > fabs(aug[5])) {
numtyp swapt;
swapt=aug[4]; aug[4]=aug[8]; aug[8]=swapt;
swapt=aug[5]; aug[5]=aug[9]; aug[9]=swapt;
swapt=aug[6]; aug[6]=aug[10]; aug[10]=swapt;
swapt=aug[7]; aug[7]=aug[11]; aug[11]=swapt;
}
  // fallback row selection for the second pivot (again with the constant
  // "if (1!=1)"-style dead branches removed; behavior is unchanged)
  if (aug[5] == (numtyp)0.0 && aug[9] != (numtyp)0.0) {
    numtyp swapt;
    swapt=aug[4]; aug[4]=aug[8]; aug[8]=swapt;
    swapt=aug[5]; aug[5]=aug[9]; aug[9]=swapt;
    swapt=aug[6]; aug[6]=aug[10]; aug[10]=swapt;
    swapt=aug[7]; aug[7]=aug[11]; aug[11]=swapt;
  }
t = aug[9]/aug[5];
aug[10]-=t*aug[6];
aug[11]-=t*aug[7];
if (aug[10] == (numtyp)0.0)
*error_flag=2;
ans[2] = aug[11]/aug[10];
t = (numtyp)0.0;
t += aug[6]*ans[2];
ans[1] = (aug[7]-t) / aug[5];
t = (numtyp)0.0;
t += aug[1]*ans[1];
t += aug[2]*ans[2];
ans[0] = (aug[3]-t) / aug[0];
}
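/* ----------------------------------------------------------------------
   [sketch] a minimal host-side check of the solver above -- not part of
   the library.  Assumes a plain C++ build where numtyp is double and
   __global expands to nothing, so gpu_mldivide3 can be called directly.
------------------------------------------------------------------------- */
#ifdef MLDIVIDE3_SELF_TEST
#include <cstdio>
#include <cmath>
int main() {
  // row-major A, chosen nonsingular; b = A*x_ref so the exact answer is known
  numtyp A[9] = {4.0, 1.0, 0.0,
                 1.0, 3.0, 1.0,
                 0.0, 1.0, 2.0};
  numtyp x_ref[3] = {1.0, -2.0, 3.0};
  numtyp b[3];
  for (int r = 0; r < 3; r++)
    b[r] = A[3*r]*x_ref[0] + A[3*r+1]*x_ref[1] + A[3*r+2]*x_ref[2];
  numtyp x[3];
  int err = 0;
  gpu_mldivide3(A, b, x, &err);                  // solve A x = b
  for (int r = 0; r < 3; r++)
    if (err == 2 || fabs(x[r]-x_ref[r]) > 1e-10) {
      printf("mldivide3: FAIL\n");
      return 1;
    }
  printf("mldivide3: OK\n");
  return 0;
}
#endif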
/* ----------------------------------------------------------------------
compute rotation matrix from quaternion conjugate
quat = [w i j k]
------------------------------------------------------------------------- */
__inline void gpu_quat_to_mat_trans(__global const numtyp4 *qif, const int qi,
numtyp mat[9])
{
numtyp4 q=qif[qi];
numtyp w2 = q.x*q.x;
numtyp i2 = q.y*q.y;
numtyp j2 = q.z*q.z;
numtyp k2 = q.w*q.w;
numtyp twoij = (numtyp)2.0*q.y*q.z;
numtyp twoik = (numtyp)2.0*q.y*q.w;
numtyp twojk = (numtyp)2.0*q.z*q.w;
numtyp twoiw = (numtyp)2.0*q.y*q.x;
numtyp twojw = (numtyp)2.0*q.z*q.x;
numtyp twokw = (numtyp)2.0*q.w*q.x;
mat[0] = w2+i2-j2-k2;
mat[3] = twoij-twokw;
mat[6] = twojw+twoik;
mat[1] = twoij+twokw;
mat[4] = w2-i2+j2-k2;
mat[7] = twojk-twoiw;
mat[2] = twoik-twojw;
mat[5] = twojk+twoiw;
mat[8] = w2-i2-j2+k2;
}
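/* ----------------------------------------------------------------------
   [sketch] quick host-side check of the convention above -- not library
   code.  With quat = [w i j k] = [1 0 0 0] the transposed rotation matrix
   must be the identity.  Assumes a host build where numtyp4 is a plain
   struct with .x/.y/.z/.w fields and __global expands to nothing.
------------------------------------------------------------------------- */
#ifdef QUAT_SELF_TEST
inline bool quat_identity_ok() {
  numtyp4 q[1];
  q[0].x = (numtyp)1.0;                          // w
  q[0].y = q[0].z = q[0].w = (numtyp)0.0;        // i, j, k
  numtyp m[9];
  gpu_quat_to_mat_trans(q, 0, m);
  for (int r = 0; r < 3; r++)
    for (int c = 0; c < 3; c++)
      if (m[3*r+c] != (r == c ? (numtyp)1.0 : (numtyp)0.0))
        return false;
  return true;
}
#endif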
/* ----------------------------------------------------------------------
transposed matrix times diagonal matrix
------------------------------------------------------------------------- */
__inline void gpu_transpose_times_diag3(const numtyp m[9],
const numtyp4 d, numtyp ans[9])
{
ans[0] = m[0]*d.x;
ans[1] = m[3]*d.y;
ans[2] = m[6]*d.z;
ans[3] = m[1]*d.x;
ans[4] = m[4]*d.y;
ans[5] = m[7]*d.z;
ans[6] = m[2]*d.x;
ans[7] = m[5]*d.y;
ans[8] = m[8]*d.z;
}
/* ----------------------------------------------------------------------
multiply mat1 times mat2
------------------------------------------------------------------------- */
__inline void gpu_times3(const numtyp m[9], const numtyp m2[9],
numtyp ans[9])
{
ans[0] = m[0]*m2[0] + m[1]*m2[3] + m[2]*m2[6];
ans[1] = m[0]*m2[1] + m[1]*m2[4] + m[2]*m2[7];
ans[2] = m[0]*m2[2] + m[1]*m2[5] + m[2]*m2[8];
ans[3] = m[3]*m2[0] + m[4]*m2[3] + m[5]*m2[6];
ans[4] = m[3]*m2[1] + m[4]*m2[4] + m[5]*m2[7];
ans[5] = m[3]*m2[2] + m[4]*m2[5] + m[5]*m2[8];
ans[6] = m[6]*m2[0] + m[7]*m2[3] + m[8]*m2[6];
ans[7] = m[6]*m2[1] + m[7]*m2[4] + m[8]*m2[7];
ans[8] = m[6]*m2[2] + m[7]*m2[5] + m[8]*m2[8];
}
/* ----------------------------------------------------------------------
Apply principal rotation generator about x to rotation matrix m
------------------------------------------------------------------------- */
__inline void gpu_rotation_generator_x(const numtyp m[9], numtyp ans[9])
{
ans[0] = 0;
ans[1] = -m[2];
ans[2] = m[1];
ans[3] = 0;
ans[4] = -m[5];
ans[5] = m[4];
ans[6] = 0;
ans[7] = -m[8];
ans[8] = m[7];
}
/* ----------------------------------------------------------------------
Apply principal rotation generator about y to rotation matrix m
------------------------------------------------------------------------- */
__inline void gpu_rotation_generator_y(const numtyp m[9], numtyp ans[9])
{
ans[0] = m[2];
ans[1] = 0;
ans[2] = -m[0];
ans[3] = m[5];
ans[4] = 0;
ans[5] = -m[3];
ans[6] = m[8];
ans[7] = 0;
ans[8] = -m[6];
}
/* ----------------------------------------------------------------------
Apply principal rotation generator about z to rotation matrix m
------------------------------------------------------------------------- */
__inline void gpu_rotation_generator_z(const numtyp m[9], numtyp ans[9])
{
ans[0] = -m[1];
ans[1] = m[0];
ans[2] = 0;
ans[3] = -m[4];
ans[4] = m[3];
ans[5] = 0;
ans[6] = -m[7];
ans[7] = m[6];
ans[8] = 0;
}
/* ----------------------------------------------------------------------
matrix times vector
------------------------------------------------------------------------- */
__inline void gpu_times_column3(const numtyp m[9], const numtyp v[3],
numtyp ans[3])
{
ans[0] = m[0]*v[0] + m[1]*v[1] + m[2]*v[2];
ans[1] = m[3]*v[0] + m[4]*v[1] + m[5]*v[2];
ans[2] = m[6]*v[0] + m[7]*v[1] + m[8]*v[2];
}
#endif


@ -1,165 +0,0 @@
// **************************************************************************
// ellipsoid_nbor.cu
// -------------------
// W. Michael Brown
//
// Device code for Ellipsoid neighbor routines
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : brownw@ornl.gov
// ***************************************************************************/
#ifndef ELLIPSOID_NBOR_H
#define ELLIPSOID_NBOR_H
#ifdef NV_KERNEL
#include "nv_kernel_def.h"
#else
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#define GLOBAL_ID_X get_global_id(0)
#define THREAD_ID_X get_local_id(0)
#define BLOCK_ID_X get_group_id(0)
#define BLOCK_SIZE_X get_local_size(0)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#define MAX_SHARED_TYPES 8
#endif
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
#define numtyp4 double4
#else
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#endif
#define SBBITS 30
#define NEIGHMASK 0x3FFFFFFF
// ---------------------------------------------------------------------------
// Unpack neighbors from dev_ij array into dev_nbor matrix for coalesced access
// -- Only unpack neighbors matching the specified inclusive range of forms
// -- Only unpack neighbors within cutoff
// ---------------------------------------------------------------------------
__kernel void kernel_nbor(__global numtyp4 *x_, __global numtyp2 *cut_form,
const int ntypes, __global int *dev_nbor,
const int nbor_pitch, const int start, const int inum,
__global int *dev_ij, const int form_low,
const int form_high) {
  // ii indexes the atom whose neighbor list is being unpacked
int ii=GLOBAL_ID_X+start;
if (ii<inum) {
__global int *nbor=dev_ij+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+mul24(numj,nbor_pitch);
__global int *packed=dev_nbor+ii+nbor_pitch+nbor_pitch;
numtyp4 ix=x_[i];
int iw=ix.w;
int itype=mul24(iw,ntypes);
int newj=0;
for ( ; nbor<list_end; nbor+=nbor_pitch) {
int j=*nbor;
j &= NEIGHMASK;
numtyp4 jx=x_[j];
int jtype=jx.w;
int mtype=itype+jtype;
numtyp2 cf=cut_form[mtype];
if (cf.y>=form_low && cf.y<=form_high) {
        // Compute r12
numtyp rsq=jx.x-ix.x;
rsq*=rsq;
numtyp t=jx.y-ix.y;
rsq+=t*t;
t=jx.z-ix.z;
rsq+=t*t;
if (rsq<cf.x) {
*packed=j;
packed+=nbor_pitch;
newj++;
}
}
}
dev_nbor[ii+nbor_pitch]=newj;
}
}
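// ---------------------------------------------------------------------------
// [sketch] the layout assumed above, for reference: neighbor data is stored
// column-major with row pitch nbor_pitch (in ints) so that threads with
// consecutive ii make coalesced reads.  For atom slot ii:
//   dev_nbor[ii]                    -> atom index i
//   dev_nbor[ii +       nbor_pitch] -> neighbor count numj
//   dev_nbor[ii + (2+k)*nbor_pitch] -> k-th packed neighbor of i
// A hypothetical host-side accessor with the same addressing:
// ---------------------------------------------------------------------------
inline int packed_neighbor(const int *dev_nbor, const int ii,
                           const int nbor_pitch, const int k) {
  return dev_nbor[ii+(2+k)*nbor_pitch];
}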
// ---------------------------------------------------------------------------
// Unpack neighbors from dev_ij array into dev_nbor matrix for coalesced access
// -- Only unpack neighbors matching the specified inclusive range of forms
// -- Only unpack neighbors within cutoff
// -- Fast version of routine that uses shared memory for LJ constants
// ---------------------------------------------------------------------------
__kernel void kernel_nbor_fast(__global numtyp4 *x_, __global numtyp2 *cut_form,
__global int *dev_nbor, const int nbor_pitch,
const int start, const int inum,
__global int *dev_ij, const int form_low,
const int form_high) {
int ii=THREAD_ID_X;
__local int form[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
cutsq[ii]=cut_form[ii].x;
form[ii]=cut_form[ii].y;
}
ii+=mul24((int)BLOCK_SIZE_X,(int)BLOCK_ID_X)+start;
__syncthreads();
if (ii<inum) {
__global int *nbor=dev_ij+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+mul24(numj,nbor_pitch);
__global int *packed=dev_nbor+ii+nbor_pitch+nbor_pitch;
numtyp4 ix=x_[i];
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
int newj=0;
for ( ; nbor<list_end; nbor+=nbor_pitch) {
int j=*nbor;
j &= NEIGHMASK;
numtyp4 jx=x_[j];
int jtype=jx.w;
int mtype=itype+jtype;
if (form[mtype]>=form_low && form[mtype]<=form_high) {
        // Compute r12
numtyp rsq=jx.x-ix.x;
rsq*=rsq;
numtyp t=jx.y-ix.y;
rsq+=t*t;
t=jx.z-ix.z;
rsq+=t*t;
if (rsq<cutsq[mtype]) {
*packed=j;
packed+=nbor_pitch;
newj++;
}
}
}
dev_nbor[ii+nbor_pitch]=newj;
}
}
#endif


@ -1,307 +0,0 @@
/***************************************************************************
gayberne.cpp
-------------------
W. Michael Brown
Host code for Gay-Berne potential acceleration
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#include "gayberne_cl.h"
#else
#include "gayberne_ptx.h"
#endif
#include "gayberne.h"
#include <cassert>
using namespace LAMMPS_AL;
#define GayBerneT GayBerne<numtyp, acctyp>
extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
template <class numtyp, class acctyp>
GayBerneT::GayBerne() : BaseEllipsoid<numtyp,acctyp>(),
_allocated(false) {
}
template <class numtyp, class acctyp>
GayBerneT::~GayBerne() {
clear();
}
template <class numtyp, class acctyp>
int GayBerneT::bytes_per_atom(const int max_nbors) const {
  // the original line returned this->bytes_per_atom(max_nbors), which
  // recurses forever; qualifying the call forwards it to the base-class
  // accounting (assuming BaseEllipsoid provides it under the same name)
  return BaseEllipsoid<numtyp,acctyp>::bytes_per_atom(max_nbors);
}
template <class numtyp, class acctyp>
int GayBerneT::init(const int ntypes, const double gamma,
const double upsilon, const double mu,
double **host_shape, double **host_well,
double **host_cutsq, double **host_sigma,
double **host_epsilon, double *host_lshape,
int **h_form, double **host_lj1, double **host_lj2,
double **host_lj3, double **host_lj4,
double **host_offset, const double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen) {
int success;
success=this->init_base(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,ntypes,h_form,gayberne,gayberne_lj);
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
_shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->block_size()>=max_shared_types) {
lj_types=max_shared_types;
_shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for copying type data
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
sigma_epsilon.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack2(ntypes,lj_types,sigma_epsilon,host_write,
host_sigma,host_epsilon);
this->cut_form.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack2(ntypes,lj_types,this->cut_form,host_write,
host_cutsq,h_form);
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
host_cutsq,h_form);
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
host_offset);
dev_error.alloc(1,*(this->ucl_device));
dev_error.zero();
// Allocate, cast and asynchronous memcpy of constant data
// Copy data for bonded interactions
gamma_upsilon_mu.alloc(7,*(this->ucl_device),UCL_READ_ONLY);
host_write[0]=static_cast<numtyp>(gamma);
host_write[1]=static_cast<numtyp>(upsilon);
host_write[2]=static_cast<numtyp>(mu);
host_write[3]=static_cast<numtyp>(host_special_lj[0]);
host_write[4]=static_cast<numtyp>(host_special_lj[1]);
host_write[5]=static_cast<numtyp>(host_special_lj[2]);
host_write[6]=static_cast<numtyp>(host_special_lj[3]);
ucl_copy(gamma_upsilon_mu,host_write,7,false);
lshape.alloc(ntypes,*(this->ucl_device),UCL_READ_ONLY);
UCL_H_Vec<double> d_view;
d_view.view(host_lshape,lshape.numel(),*(this->ucl_device));
ucl_copy(lshape,d_view,false);
// Copy shape, well, sigma, epsilon, and cutsq onto GPU
// - cast if necessary
shape.alloc(ntypes,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<ntypes; i++) {
host_write[i*4]=host_shape[i][0];
host_write[i*4+1]=host_shape[i][1];
host_write[i*4+2]=host_shape[i][2];
}
UCL_H_Vec<numtyp4> view4;
view4.view((numtyp4*)host_write.begin(),shape.numel(),*(this->ucl_device));
ucl_copy(shape,view4,false);
well.alloc(ntypes,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<ntypes; i++) {
host_write[i*4]=host_well[i][0];
host_write[i*4+1]=host_well[i][1];
host_write[i*4+2]=host_well[i][2];
}
view4.view((numtyp4*)host_write.begin(),well.numel(),*(this->ucl_device));
ucl_copy(well,view4,false);
_allocated=true;
this->_max_bytes=sigma_epsilon.row_bytes()+this->cut_form.row_bytes()+
lj1.row_bytes()+lj3.row_bytes()+gamma_upsilon_mu.row_bytes()+
lshape.row_bytes()+shape.row_bytes()+well.row_bytes();
return 0;
}
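/* ----------------------------------------------------------------------
   [sketch] what the type_pack2 calls above amount to, spelled out in
   plain C++: an ntypes x ntypes pair table is flattened row-major into an
   lj_types x lj_types buffer (zero-padded when lj_types > ntypes).
   "Pair2" is a stand-in for numtyp2, and this simplified version ignores
   the 1-based indexing of LAMMPS type arrays.
------------------------------------------------------------------------- */
struct Pair2 { float x, y; };
inline void pack2_sketch(const int ntypes, const int lj_types,
                         double **a, double **b, Pair2 *flat) {
  for (int i = 0; i < lj_types; i++)
    for (int j = 0; j < lj_types; j++) {
      Pair2 p = {0.0f, 0.0f};
      if (i < ntypes && j < ntypes) {
        p.x = (float)a[i][j];                    // e.g. sigma
        p.y = (float)b[i][j];                    // e.g. epsilon
      }
      flat[i*lj_types+j] = p;                    // row-major flatten
    }
}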
template <class numtyp, class acctyp>
void GayBerneT::clear() {
if (!_allocated)
return;
UCL_H_Vec<int> err_flag(1,*(this->ucl_device));
ucl_copy(err_flag,dev_error,false);
if (err_flag[0] == 2)
std::cerr << "BAD MATRIX INVERSION IN FORCE COMPUTATION.\n";
err_flag.clear();
_allocated=false;
dev_error.clear();
lj1.clear();
lj3.clear();
sigma_epsilon.clear();
this->cut_form.clear();
shape.clear();
well.clear();
lshape.clear();
gamma_upsilon_mu.clear();
this->clear_base();
}
template <class numtyp, class acctyp>
double GayBerneT::host_memory_usage() const {
return this->host_memory_usage_base()+sizeof(GayBerneT)+
4*sizeof(numtyp);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void GayBerneT::loop(const bool _eflag, const bool _vflag) {
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=0, NGX;
int stride=this->nbor->nbor_pitch();
int ainum=this->ans->inum();
if (this->_multiple_forms) {
this->time_nbor1.start();
if (this->_last_ellipse>0) {
// ------------ ELLIPSE_ELLIPSE and ELLIPSE_SPHERE ---------------
GX=static_cast<int>(ceil(static_cast<double>(this->_last_ellipse)/
(BX/this->_threads_per_atom)));
NGX=static_cast<int>(ceil(static_cast<double>(this->_last_ellipse)/BX));
this->pack_nbors(NGX,BX, 0, this->_last_ellipse,ELLIPSE_SPHERE,
ELLIPSE_ELLIPSE,_shared_types,_lj_types);
this->time_nbor1.stop();
this->time_ellipsoid.start();
this->k_ellipsoid.set_size(GX,BX);
this->k_ellipsoid.run(&this->atom->dev_x.begin(),
&this->atom->dev_quat.begin(), &this->shape.begin(), &this->well.begin(),
&this->gamma_upsilon_mu.begin(), &this->sigma_epsilon.begin(),
&this->_lj_types, &this->lshape.begin(), &this->nbor->dev_nbor.begin(),
&stride, &this->ans->dev_ans.begin(),&ainum,&this->ans->dev_engv.begin(),
&this->dev_error.begin(), &eflag, &vflag, &this->_last_ellipse,
&this->_threads_per_atom);
this->time_ellipsoid.stop();
if (this->_last_ellipse==this->ans->inum()) {
this->time_nbor2.start();
this->time_nbor2.stop();
this->time_ellipsoid2.start();
this->time_ellipsoid2.stop();
this->time_lj.start();
this->time_lj.stop();
return;
}
// ------------ SPHERE_ELLIPSE ---------------
this->time_nbor2.start();
GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum()-
this->_last_ellipse)/
(BX/this->_threads_per_atom)));
NGX=static_cast<int>(ceil(static_cast<double>(this->ans->inum()-
this->_last_ellipse)/BX));
this->pack_nbors(NGX,BX,this->_last_ellipse,this->ans->inum(),
SPHERE_ELLIPSE,SPHERE_ELLIPSE,_shared_types,_lj_types);
this->time_nbor2.stop();
this->time_ellipsoid2.start();
this->k_sphere_ellipsoid.set_size(GX,BX);
this->k_sphere_ellipsoid.run(&this->atom->dev_x.begin(),
&this->atom->dev_quat.begin(), &this->shape.begin(),
&this->well.begin(), &this->gamma_upsilon_mu.begin(),
&this->sigma_epsilon.begin(), &this->_lj_types, &this->lshape.begin(),
&this->nbor->dev_nbor.begin(), &stride, &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &this->dev_error.begin(), &eflag,
&vflag, &this->_last_ellipse, &ainum, &this->_threads_per_atom);
this->time_ellipsoid2.stop();
} else {
this->ans->dev_ans.zero();
this->ans->dev_engv.zero();
this->time_nbor1.stop();
this->time_ellipsoid.start();
this->time_ellipsoid.stop();
this->time_nbor2.start();
this->time_nbor2.stop();
this->time_ellipsoid2.start();
this->time_ellipsoid2.stop();
}
// ------------ LJ ---------------
this->time_lj.start();
if (this->_last_ellipse<this->ans->inum()) {
if (this->_shared_types) {
this->k_lj_fast.set_size(GX,BX);
this->k_lj_fast.run(&this->atom->dev_x.begin(), &this->lj1.begin(),
&this->lj3.begin(), &this->gamma_upsilon_mu.begin(), &stride,
&this->nbor->dev_packed.begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &this->dev_error.begin(),
&eflag, &vflag, &this->_last_ellipse, &ainum,
&this->_threads_per_atom);
} else {
this->k_lj.set_size(GX,BX);
this->k_lj.run(&this->atom->dev_x.begin(), &this->lj1.begin(),
&this->lj3.begin(), &this->_lj_types, &this->gamma_upsilon_mu.begin(),
&stride, &this->nbor->dev_packed.begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &this->dev_error.begin(), &eflag,
&vflag, &this->_last_ellipse, &ainum, &this->_threads_per_atom);
}
}
this->time_lj.stop();
} else {
GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
NGX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
this->time_nbor1.start();
this->pack_nbors(NGX, BX, 0, this->ans->inum(),SPHERE_SPHERE,
ELLIPSE_ELLIPSE,_shared_types,_lj_types);
this->time_nbor1.stop();
this->time_ellipsoid.start();
this->k_ellipsoid.set_size(GX,BX);
this->k_ellipsoid.run(&this->atom->dev_x.begin(),
&this->atom->dev_quat.begin(), &this->shape.begin(), &this->well.begin(),
&this->gamma_upsilon_mu.begin(), &this->sigma_epsilon.begin(),
&this->_lj_types, &this->lshape.begin(), &this->nbor->dev_nbor.begin(),
&stride, &this->ans->dev_ans.begin(), &ainum,
&this->ans->dev_engv.begin(), &this->dev_error.begin(),
&eflag, &vflag, &ainum, &this->_threads_per_atom);
this->time_ellipsoid.stop();
}
}
template class GayBerne<PRECISION,ACC_PRECISION>;


@ -1,429 +0,0 @@
// **************************************************************************
// gayberne.cu
// -------------------
// W. Michael Brown
//
// Device code for Gay-Berne potential acceleration
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : brownw@ornl.gov
// ***************************************************************************/
#ifndef GAYBERNE_CU
#define GAYBERNE_CU
#ifdef NV_KERNEL
#include "ellipsoid_extra.h"
#endif
#define SBBITS 30
#define NEIGHMASK 0x3FFFFFFF
__inline int sbmask(int j) { return j >> SBBITS & 3; }
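// [sketch] how the mask works: the top two bits (30-31) of a packed
// neighbor index carry the special-bond code and the low 30 bits the atom
// index.  sb = 0 is a plain neighbor; 1..3 select factor_lj = sp_lj[sb]
// in the kernels below.  Hypothetical round trip, for illustration:
//   int packed = (1 << SBBITS) | 42;   // neighbor 42, special code 1
//   sbmask(packed)     == 1
//   packed & NEIGHMASK == 42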
__inline void compute_eta_torque(numtyp m[9],numtyp m2[9], const numtyp4 shape,
numtyp ans[9])
{
numtyp den = m[3]*m[2]*m[7]-m[0]*m[5]*m[7]-
m[2]*m[6]*m[4]+m[1]*m[6]*m[5]-
m[3]*m[1]*m[8]+m[0]*m[4]*m[8];
den = (numtyp)1.0/den;
ans[0] = shape.x*(m[5]*m[1]*m2[2]+(numtyp)2.0*m[4]*m[8]*m2[0]-
m[4]*m2[2]*m[2]-(numtyp)2.0*m[5]*m2[0]*m[7]+
m2[1]*m[2]*m[7]-m2[1]*m[1]*m[8]-
m[3]*m[8]*m2[1]+m[6]*m[5]*m2[1]+
m[3]*m2[2]*m[7]-m2[2]*m[6]*m[4])*den;
ans[1] = shape.x*(m[2]*m2[0]*m[7]-m[8]*m2[0]*m[1]+
(numtyp)2.0*m[0]*m[8]*m2[1]-m[0]*m2[2]*m[5]-
(numtyp)2.0*m[6]*m[2]*m2[1]+m2[2]*m[3]*m[2]-
m[8]*m[3]*m2[0]+m[6]*m2[0]*m[5]+
m[6]*m2[2]*m[1]-m2[2]*m[0]*m[7])*den;
ans[2] = shape.x*(m[1]*m[5]*m2[0]-m[2]*m2[0]*m[4]-
m[0]*m[5]*m2[1]+m[3]*m[2]*m2[1]-
m2[1]*m[0]*m[7]-m[6]*m[4]*m2[0]+
(numtyp)2.0*m[4]*m[0]*m2[2]-(numtyp)2.0*m[3]*m2[2]*m[1]+
m[3]*m[7]*m2[0]+m[6]*m2[1]*m[1])*den;
ans[3] = shape.y*(-m[4]*m2[5]*m[2]+(numtyp)2.0*m[4]*m[8]*m2[3]+
m[5]*m[1]*m2[5]-(numtyp)2.0*m[5]*m2[3]*m[7]+
m2[4]*m[2]*m[7]-m2[4]*m[1]*m[8]-
m[3]*m[8]*m2[4]+m[6]*m[5]*m2[4]-
m2[5]*m[6]*m[4]+m[3]*m2[5]*m[7])*den;
ans[4] = shape.y*(m[2]*m2[3]*m[7]-m[1]*m[8]*m2[3]+
(numtyp)2.0*m[8]*m[0]*m2[4]-m2[5]*m[0]*m[5]-
(numtyp)2.0*m[6]*m2[4]*m[2]-m[3]*m[8]*m2[3]+
m[6]*m[5]*m2[3]+m[3]*m2[5]*m[2]-
m[0]*m2[5]*m[7]+m2[5]*m[1]*m[6])*den;
ans[5] = shape.y*(m[1]*m[5]*m2[3]-m[2]*m2[3]*m[4]-
m[0]*m[5]*m2[4]+m[3]*m[2]*m2[4]+
(numtyp)2.0*m[4]*m[0]*m2[5]-m[0]*m2[4]*m[7]+
m[1]*m[6]*m2[4]-m2[3]*m[6]*m[4]-
(numtyp)2.0*m[3]*m[1]*m2[5]+m[3]*m2[3]*m[7])*den;
ans[6] = shape.z*(-m[4]*m[2]*m2[8]+m[1]*m[5]*m2[8]+
(numtyp)2.0*m[4]*m2[6]*m[8]-m[1]*m2[7]*m[8]+
m[2]*m[7]*m2[7]-(numtyp)2.0*m2[6]*m[7]*m[5]-
m[3]*m2[7]*m[8]+m[5]*m[6]*m2[7]-
m[4]*m[6]*m2[8]+m[7]*m[3]*m2[8])*den;
ans[7] = shape.z*-(m[1]*m[8]*m2[6]-m[2]*m2[6]*m[7]-
(numtyp)2.0*m2[7]*m[0]*m[8]+m[5]*m2[8]*m[0]+
(numtyp)2.0*m2[7]*m[2]*m[6]+m[3]*m2[6]*m[8]-
m[3]*m[2]*m2[8]-m[5]*m[6]*m2[6]+
m[0]*m2[8]*m[7]-m2[8]*m[1]*m[6])*den;
ans[8] = shape.z*(m[1]*m[5]*m2[6]-m[2]*m2[6]*m[4]-
m[0]*m[5]*m2[7]+m[3]*m[2]*m2[7]-
m[4]*m[6]*m2[6]-m[7]*m2[7]*m[0]+
(numtyp)2.0*m[4]*m2[8]*m[0]+m[7]*m[3]*m2[6]+
m[6]*m[1]*m2[7]-(numtyp)2.0*m2[8]*m[3]*m[1])*den;
}
__kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q,
__global numtyp4* shape, __global numtyp4* well,
__global numtyp *gum, __global numtyp2* sig_eps,
const int ntypes, __global numtyp *lshape,
__global int *dev_nbor, const int stride,
__global acctyp4 *ans, const int astride,
__global acctyp *engv, __global int *err_flag,
const int eflag, const int vflag, const int inum,
const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp sp_lj[4];
sp_lj[0]=gum[3];
sp_lj[1]=gum[4];
sp_lj[2]=gum[5];
sp_lj[3]=gum[6];
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp4 tor;
tor.x=(acctyp)0;
tor.y=(acctyp)0;
tor.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=stride;
int numj=*nbor;
nbor+=stride;
__global int *nbor_end=nbor+mul24(stride,numj);
nbor+=mul24(offset,stride);
int n_stride=mul24(t_per_atom,stride);
numtyp4 ix=x_[i];
int itype=ix.w;
numtyp a1[9], b1[9], g1[9];
numtyp4 ishape=shape[itype];
{
numtyp t[9];
gpu_quat_to_mat_trans(q,i,a1);
gpu_diag_times3(ishape,a1,t);
gpu_transpose_times3(a1,t,g1);
gpu_diag_times3(well[itype],a1,t);
gpu_transpose_times3(a1,t,b1);
}
numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=x_[j];
int jtype=jx.w;
// Compute r12
numtyp r12[3];
r12[0] = jx.x-ix.x;
r12[1] = jx.y-ix.y;
r12[2] = jx.z-ix.z;
numtyp ir = gpu_dot3(r12,r12);
ir = rsqrt(ir);
numtyp r = (numtyp)1.0/ir;
numtyp a2[9];
gpu_quat_to_mat_trans(q,j,a2);
numtyp u_r, dUr[3], tUr[3], eta, teta[3];
{ // Compute U_r, dUr, eta, and teta
// Compute g12
numtyp g12[9];
{
numtyp g2[9];
{
gpu_diag_times3(shape[jtype],a2,g12);
gpu_transpose_times3(a2,g12,g2);
gpu_plus3(g1,g2,g12);
}
{ // Compute U_r and dUr
// Compute kappa
numtyp kappa[3];
gpu_mldivide3(g12,r12,kappa,err_flag);
// -- replace r12 with r12 hat
r12[0]*=ir;
r12[1]*=ir;
r12[2]*=ir;
        // -- kappa now holds kappa/r
kappa[0]*=ir;
kappa[1]*=ir;
kappa[2]*=ir;
// energy
// compute u_r and dUr
numtyp uslj_rsq;
{
// Compute distance of closest approach
numtyp h12, sigma12;
sigma12 = gpu_dot3(r12,kappa);
sigma12 = rsqrt((numtyp)0.5*sigma12);
h12 = r-sigma12;
// -- kappa is now ok
kappa[0]*=r;
kappa[1]*=r;
kappa[2]*=r;
int mtype=mul24(ntypes,itype)+jtype;
numtyp sigma = sig_eps[mtype].x;
numtyp epsilon = sig_eps[mtype].y;
numtyp varrho = sigma/(h12+gum[0]*sigma);
numtyp varrho6 = varrho*varrho*varrho;
varrho6*=varrho6;
numtyp varrho12 = varrho6*varrho6;
u_r = (numtyp)4.0*epsilon*(varrho12-varrho6);
numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma;
temp1 = temp1*(numtyp)24.0*epsilon;
uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5;
numtyp temp2 = gpu_dot3(kappa,r12);
uslj_rsq = uslj_rsq*ir*ir;
dUr[0] = temp1*r12[0]+uslj_rsq*(kappa[0]-temp2*r12[0]);
dUr[1] = temp1*r12[1]+uslj_rsq*(kappa[1]-temp2*r12[1]);
dUr[2] = temp1*r12[2]+uslj_rsq*(kappa[2]-temp2*r12[2]);
}
// torque for particle 1
{
numtyp tempv[3], tempv2[3];
tempv[0] = -uslj_rsq*kappa[0];
tempv[1] = -uslj_rsq*kappa[1];
tempv[2] = -uslj_rsq*kappa[2];
gpu_row_times3(kappa,g1,tempv2);
gpu_cross3(tempv,tempv2,tUr);
}
}
}
// Compute eta
{
eta = (numtyp)2.0*lshape[itype]*lshape[jtype];
numtyp det_g12 = gpu_det3(g12);
eta = pow(eta/det_g12,gum[1]);
}
// Compute teta
numtyp temp[9], tempv[3], tempv2[3];
compute_eta_torque(g12,a1,ishape,temp);
numtyp temp1 = -eta*gum[1];
tempv[0] = temp1*temp[0];
tempv[1] = temp1*temp[1];
tempv[2] = temp1*temp[2];
gpu_cross3(a1,tempv,tempv2);
teta[0] = tempv2[0];
teta[1] = tempv2[1];
teta[2] = tempv2[2];
tempv[0] = temp1*temp[3];
tempv[1] = temp1*temp[4];
tempv[2] = temp1*temp[5];
gpu_cross3(a1+3,tempv,tempv2);
teta[0] += tempv2[0];
teta[1] += tempv2[1];
teta[2] += tempv2[2];
tempv[0] = temp1*temp[6];
tempv[1] = temp1*temp[7];
tempv[2] = temp1*temp[8];
gpu_cross3(a1+6,tempv,tempv2);
teta[0] += tempv2[0];
teta[1] += tempv2[1];
teta[2] += tempv2[2];
}
numtyp chi, dchi[3], tchi[3];
{ // Compute chi and dchi
// Compute b12
numtyp b2[9], b12[9];
{
gpu_diag_times3(well[jtype],a2,b12);
gpu_transpose_times3(a2,b12,b2);
gpu_plus3(b1,b2,b12);
}
// compute chi_12
r12[0]*=r;
r12[1]*=r;
r12[2]*=r;
numtyp iota[3];
gpu_mldivide3(b12,r12,iota,err_flag);
// -- iota is now iota/r
iota[0]*=ir;
iota[1]*=ir;
iota[2]*=ir;
r12[0]*=ir;
r12[1]*=ir;
r12[2]*=ir;
chi = gpu_dot3(r12,iota);
chi = pow(chi*(numtyp)2.0,gum[2]);
// -- iota is now ok
iota[0]*=r;
iota[1]*=r;
iota[2]*=r;
numtyp temp1 = gpu_dot3(iota,r12);
numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*pow(chi,(gum[2]-(numtyp)1.0)/
gum[2]);
dchi[0] = temp2*(iota[0]-temp1*r12[0]);
dchi[1] = temp2*(iota[1]-temp1*r12[1]);
dchi[2] = temp2*(iota[2]-temp1*r12[2]);
// compute t_chi
numtyp tempv[3];
gpu_row_times3(iota,b1,tempv);
gpu_cross3(tempv,iota,tchi);
temp1 = (numtyp)-4.0*ir*ir;
tchi[0] *= temp1;
tchi[1] *= temp1;
tchi[2] *= temp1;
}
numtyp temp2 = factor_lj*eta*chi;
if (eflag>0)
energy+=u_r*temp2;
numtyp temp1 = -eta*u_r*factor_lj;
if (vflag>0) {
r12[0]*=-r;
r12[1]*=-r;
r12[2]*=-r;
numtyp ft=temp1*dchi[0]-temp2*dUr[0];
f.x+=ft;
virial[0]+=r12[0]*ft;
ft=temp1*dchi[1]-temp2*dUr[1];
f.y+=ft;
virial[1]+=r12[1]*ft;
virial[3]+=r12[0]*ft;
ft=temp1*dchi[2]-temp2*dUr[2];
f.z+=ft;
virial[2]+=r12[2]*ft;
virial[4]+=r12[0]*ft;
virial[5]+=r12[1]*ft;
} else {
f.x+=temp1*dchi[0]-temp2*dUr[0];
f.y+=temp1*dchi[1]-temp2*dUr[1];
f.z+=temp1*dchi[2]-temp2*dUr[2];
}
// Torque on 1
temp1 = -u_r*eta*factor_lj;
temp2 = -u_r*chi*factor_lj;
numtyp temp3 = -chi*eta*factor_lj;
tor.x+=temp1*tchi[0]+temp2*teta[0]+temp3*tUr[0];
tor.y+=temp1*tchi[1]+temp2*teta[1]+temp3*tUr[1];
tor.z+=temp1*tchi[2]+temp2*teta[2]+temp3*tUr[2];
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[7][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=tor.x;
red_acc[4][tid]=tor.y;
red_acc[5][tid]=tor.z;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
tor.x=red_acc[3][tid];
tor.y=red_acc[4][tid];
tor.z=red_acc[5][tid];
if (eflag>0 || vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
red_acc[6][tid]=energy;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<7; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
energy=red_acc[6][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=astride;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=astride;
}
}
ans[ii]=f;
ans[ii+astride]=tor;
} // if ii
}
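// ---------------------------------------------------------------------------
// [sketch] the t_per_atom reduction above, written as serial C++ for
// illustration: each of the t_per_atom slices of an atom's neighbor list
// accumulates a partial sum, and halving strides fold them into slice 0,
// which alone stores the answer (t_per_atom is a power of two).
// ---------------------------------------------------------------------------
inline void tree_reduce_sketch(float *partial, const int t_per_atom) {
  for (int s = t_per_atom/2; s > 0; s >>= 1)
    for (int offset = 0; offset < s; offset++)
      partial[offset] += partial[offset+s];      // slice 0 ends with the sum
}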
#endif


@ -1,95 +0,0 @@
/***************************************************************************
gayberne.h
-------------------
W. Michael Brown
Host code for Gay-Berne potential acceleration
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
#ifndef GAYBERNE_H
#define GAYBERNE_H
#include "base_ellipsoid.h"
#include "mpi.h"
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class GayBerne : public BaseEllipsoid<numtyp, acctyp> {
public:
GayBerne();
~GayBerne();
/// Clear any previous data and set up for a new LAMMPS run
/** \param gpu_nbor true if neighboring performed on device
* \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
    * \return 0 if successful, or a negative code on insufficient memory
    *         or a device initialization problem:
    * - -1 if fix gpu not found
    * - -3 if there is an out of memory error
    * - -4 if the GPU library was not compiled for GPU
    * - -5 if double precision is not supported on the card **/
int init(const int ntypes, const double gamma,
const double upsilon, const double mu, double **host_shape,
double **host_well, double **host_cutsq, double **host_sigma,
double **host_epsilon, double *host_lshape, int **h_form,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
const double *host_special_lj, const int nlocal, const int nall,
const int max_nbors, const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
/// Device Error Flag - Set if a bad matrix inversion occurs
UCL_D_Vec<int> dev_error;
// --------------------------- TYPE DATA --------------------------
/// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq, lj1.w = form
UCL_D_Vec<numtyp4> lj1;
/// lj3.x = lj3, lj3.y = lj4, lj3.z = offset
UCL_D_Vec<numtyp4> lj3;
/// sigma_epsilon.x = sigma, sigma_epsilon.y = epsilon
UCL_D_Vec<numtyp2> sigma_epsilon;
// 0 - gamma, 1-upsilon, 2-mu, 3-special_lj[0], 4-special_lj[1], ...
UCL_D_Vec<numtyp> gamma_upsilon_mu;
/// If atom type constants fit in shared memory, use fast kernels
bool _shared_types;
int _lj_types;
// --------------------------- ATOM DATA --------------------------
/// Aspherical Const Data for Atoms
UCL_D_Vec<numtyp4> shape, well;
/// Aspherical Const Data for Atoms
UCL_D_Vec<numtyp> lshape;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
}
#endif


@ -1,141 +0,0 @@
/***************************************************************************
gayberne_ext.cpp
-------------------
W. Michael Brown
LAMMPS Wrappers for Gay-Berne Acceleration
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
#include <iostream>
#include <cassert>
#include <math.h>
#include "gayberne.h"
using namespace std;
using namespace LAMMPS_AL;
static GayBerne<PRECISION,ACC_PRECISION> GBMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int gb_gpu_init(const int ntypes, const double gamma,
const double upsilon, const double mu, double **shape,
double **well, double **cutsq, double **sigma,
double **epsilon, double *host_lshape, int **form,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **offset, double *special_lj,
const int inum, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, int &gpu_mode,
FILE *screen) {
GBMF.clear();
gpu_mode=GBMF.device->gpu_mode();
double gpu_split=GBMF.device->particle_split();
int first_gpu=GBMF.device->first_device();
int last_gpu=GBMF.device->last_device();
int world_me=GBMF.device->world_me();
int gpu_rank=GBMF.device->gpu_rank();
int procs_per_gpu=GBMF.device->procs_per_gpu();
GBMF.device->init_message(screen,"gayberne",first_gpu,last_gpu);
bool message=false;
if (GBMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq,
sigma, epsilon, host_lshape, form, host_lj1,
host_lj2, host_lj3, host_lj4, offset, special_lj,
inum, nall, max_nbors, maxspecial, cell_size, gpu_split,
screen);
GBMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq, sigma,
epsilon, host_lshape, form, host_lj1, host_lj2,
host_lj3, host_lj4, offset, special_lj, inum, nall,
max_nbors, maxspecial, cell_size, gpu_split, screen);
GBMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
GBMF.estimate_gpu_overhead();
return init_ok;
}
// ---------------------------------------------------------------------------
// Clear memory on host and device
// ---------------------------------------------------------------------------
void gb_gpu_clear() {
GBMF.clear();
}
int** compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, double *sublo,
double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **numj, const double cpu_time, bool &success,
double **host_quat);
int** gb_gpu_compute_n(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, double *sublo,
double *subhi, int *tag, int **nspecial, int **special,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, int **ilist,
int **jnum, const double cpu_time, bool &success,
double **host_quat) {
return GBMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi,
tag, nspecial, special, eflag, vflag, eatom, vatom,
host_start, ilist, jnum, cpu_time, success, host_quat);
}
int * gb_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double **host_quat) {
return GBMF.compute(ago, inum_full, nall, host_x, host_type, ilist,
numj, firstneigh, eflag, vflag, eatom, vatom, host_start,
cpu_time, success, host_quat);
}
// ---------------------------------------------------------------------------
// Return memory usage
// ---------------------------------------------------------------------------
double gb_gpu_bytes() {
return GBMF.host_memory_usage();
}
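// ---------------------------------------------------------------------------
// [sketch] the calling sequence the LAMMPS pair style follows, with all
// arguments elided; the wrapper names are the ones defined above,
// everything else here is illustrative pseudocode.
//
//   int gpu_mode;
//   int ok = gb_gpu_init(..., gpu_mode, screen);   // once, during init()
//   // each timestep:
//   //   device neighboring:  firstneigh = gb_gpu_compute_n(...);
//   //   host neighbor lists: ilist      = gb_gpu_compute(...);
//   double bytes = gb_gpu_bytes();                 // memory reporting
//   gb_gpu_clear();                                // at destruction
// ---------------------------------------------------------------------------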


@ -1,594 +0,0 @@
// **************************************************************************
// gayberne_lj.cu
// -------------------
// W. Michael Brown
//
// Device code for Gay-Berne - Lennard-Jones potential acceleration
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : brownw@ornl.gov
// ***************************************************************************/
#ifndef GAYBERNE_LJ_CU
#define GAYBERNE_LJ_CU
#ifdef NV_KERNEL
#include "ellipsoid_extra.h"
#endif
#define SBBITS 30
#define NEIGHMASK 0x3FFFFFFF
__inline int sbmask(int j) { return j >> SBBITS & 3; }
__kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q,
__global numtyp4* shape,__global numtyp4* well,
__global numtyp *gum, __global numtyp2* sig_eps,
const int ntypes, __global numtyp *lshape,
__global int *dev_nbor, const int stride,
__global acctyp4 *ans, __global acctyp *engv,
__global int *err_flag, const int eflag,
const int vflag,const int start, const int inum,
const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom+start;
int offset=tid%t_per_atom;
__local numtyp sp_lj[4];
sp_lj[0]=gum[3];
sp_lj[1]=gum[4];
sp_lj[2]=gum[5];
sp_lj[3]=gum[6];
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=stride;
int numj=*nbor;
nbor+=stride;
__global int *nbor_end=nbor+stride*numj;
nbor+=mul24(offset,stride);
int n_stride=mul24(t_per_atom,stride);
numtyp4 ix=x_[i];
int itype=ix.w;
numtyp oner=shape[itype].x;
numtyp one_well=well[itype].x;
numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=x_[j];
int jtype=jx.w;
// Compute r12
numtyp r12[3];
r12[0] = jx.x-ix.x;
r12[1] = jx.y-ix.y;
r12[2] = jx.z-ix.z;
numtyp ir = gpu_dot3(r12,r12);
ir = rsqrt(ir);
numtyp r = (numtyp)1.0/ir;
numtyp r12hat[3];
r12hat[0]=r12[0]*ir;
r12hat[1]=r12[1]*ir;
r12hat[2]=r12[2]*ir;
numtyp a2[9];
gpu_quat_to_mat_trans(q,j,a2);
numtyp u_r, dUr[3], eta;
{ // Compute U_r, dUr, eta, and teta
// Compute g12
numtyp g12[9];
{
{
numtyp g2[9];
gpu_diag_times3(shape[jtype],a2,g12);
gpu_transpose_times3(a2,g12,g2);
g12[0]=g2[0]+oner;
g12[4]=g2[4]+oner;
g12[8]=g2[8]+oner;
g12[1]=g2[1];
g12[2]=g2[2];
g12[3]=g2[3];
g12[5]=g2[5];
g12[6]=g2[6];
g12[7]=g2[7];
}
{ // Compute U_r and dUr
// Compute kappa
numtyp kappa[3];
gpu_mldivide3(g12,r12,kappa,err_flag);
          // -- kappa now holds kappa/r
kappa[0]*=ir;
kappa[1]*=ir;
kappa[2]*=ir;
// energy
// compute u_r and dUr
numtyp uslj_rsq;
{
// Compute distance of closest approach
numtyp h12, sigma12;
sigma12 = gpu_dot3(r12hat,kappa);
sigma12 = rsqrt((numtyp)0.5*sigma12);
h12 = r-sigma12;
// -- kappa is now ok
kappa[0]*=r;
kappa[1]*=r;
kappa[2]*=r;
int mtype=mul24(ntypes,itype)+jtype;
numtyp sigma = sig_eps[mtype].x;
numtyp epsilon = sig_eps[mtype].y;
numtyp varrho = sigma/(h12+gum[0]*sigma);
numtyp varrho6 = varrho*varrho*varrho;
varrho6*=varrho6;
numtyp varrho12 = varrho6*varrho6;
u_r = (numtyp)4.0*epsilon*(varrho12-varrho6);
numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma;
temp1 = temp1*(numtyp)24.0*epsilon;
uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5;
numtyp temp2 = gpu_dot3(kappa,r12hat);
uslj_rsq = uslj_rsq*ir*ir;
dUr[0] = temp1*r12hat[0]+uslj_rsq*(kappa[0]-temp2*r12hat[0]);
dUr[1] = temp1*r12hat[1]+uslj_rsq*(kappa[1]-temp2*r12hat[1]);
dUr[2] = temp1*r12hat[2]+uslj_rsq*(kappa[2]-temp2*r12hat[2]);
}
}
}
// Compute eta
{
eta = (numtyp)2.0*lshape[itype]*lshape[jtype];
numtyp det_g12 = gpu_det3(g12);
eta = pow(eta/det_g12,gum[1]);
}
}
numtyp chi, dchi[3];
{ // Compute chi and dchi
// Compute b12
numtyp b12[9];
{
numtyp b2[9];
gpu_diag_times3(well[jtype],a2,b12);
gpu_transpose_times3(a2,b12,b2);
b12[0]=b2[0]+one_well;
b12[4]=b2[4]+one_well;
b12[8]=b2[8]+one_well;
b12[1]=b2[1];
b12[2]=b2[2];
b12[3]=b2[3];
b12[5]=b2[5];
b12[6]=b2[6];
b12[7]=b2[7];
}
// compute chi_12
numtyp iota[3];
gpu_mldivide3(b12,r12,iota,err_flag);
// -- iota is now iota/r
iota[0]*=ir;
iota[1]*=ir;
iota[2]*=ir;
chi = gpu_dot3(r12hat,iota);
chi = pow(chi*(numtyp)2.0,gum[2]);
// -- iota is now ok
iota[0]*=r;
iota[1]*=r;
iota[2]*=r;
numtyp temp1 = gpu_dot3(iota,r12hat);
numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*pow(chi,(gum[2]-(numtyp)1.0)/
gum[2]);
dchi[0] = temp2*(iota[0]-temp1*r12hat[0]);
dchi[1] = temp2*(iota[1]-temp1*r12hat[1]);
dchi[2] = temp2*(iota[2]-temp1*r12hat[2]);
}
numtyp temp2 = factor_lj*eta*chi;
if (eflag>0)
energy+=u_r*temp2;
numtyp temp1 = -eta*u_r*factor_lj;
if (vflag>0) {
r12[0]*=-1;
r12[1]*=-1;
r12[2]*=-1;
numtyp ft=temp1*dchi[0]-temp2*dUr[0];
f.x+=ft;
virial[0]+=r12[0]*ft;
ft=temp1*dchi[1]-temp2*dUr[1];
f.y+=ft;
virial[1]+=r12[1]*ft;
virial[3]+=r12[0]*ft;
ft=temp1*dchi[2]-temp2*dUr[2];
f.z+=ft;
virial[2]+=r12[2]*ft;
virial[4]+=r12[0]*ft;
virial[5]+=r12[1]*ft;
} else {
f.x+=temp1*dchi[0]-temp2*dUr[0];
f.y+=temp1*dchi[1]-temp2*dUr[1];
f.z+=temp1*dchi[2]-temp2*dUr[2];
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<4; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
__kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *gum,
const int stride, __global int *dev_ij,
__global acctyp4 *ans, __global acctyp *engv,
__global int *err_flag, const int eflag,
const int vflag, const int start, const int inum,
const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom+start;
int offset=tid%t_per_atom;
__local numtyp sp_lj[4];
sp_lj[0]=gum[3];
sp_lj[1]=gum[4];
sp_lj[2]=gum[5];
sp_lj[3]=gum[6];
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor=dev_ij+ii;
int i=*nbor;
nbor+=stride;
int numj=*nbor;
nbor+=stride;
__global int *list_end=nbor+mul24(stride,numj);
nbor+=mul24(offset,stride);
int n_stride=mul24(t_per_atom,stride);
numtyp4 ix=x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
      int mtype=itype*lj_types+jtype; // was "ii", which shadowed the atom index
      if (r2inv<lj1[mtype].z && lj1[mtype].w==SPHERE_SPHERE) {
        r2inv=(numtyp)1.0/r2inv;
        numtyp r6inv = r2inv*r2inv*r2inv;
        numtyp force = r2inv*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
        force*=factor_lj;
        f.x+=delx*force;
        f.y+=dely*force;
        f.z+=delz*force;
        if (eflag>0) {
          numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
          energy+=factor_lj*(e-lj3[mtype].z);
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<4; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1+=energy;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1+=virial[i];
ap1+=inum;
}
}
acctyp4 old=ans[ii];
old.x+=f.x;
old.y+=f.y;
old.z+=f.z;
ans[ii]=old;
} // if ii
}
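// ---------------------------------------------------------------------------
// [sketch] the algebra in kernel_lj above with the usual LAMMPS-style
// prefactors spelled out (an assumption here, not stated in this file):
//   lj1.x = 48*eps*sigma^12   lj1.y = 24*eps*sigma^6
//   lj3.x =  4*eps*sigma^12   lj3.y =  4*eps*sigma^6   lj3.z = energy shift
// Plain C++ for illustration; "force" is F/r, as in f.x += delx*force.
// ---------------------------------------------------------------------------
inline void lj_pair_sketch(const double eps, const double sigma,
                           const double rsq, double *force, double *energy) {
  const double r2inv = 1.0/rsq;
  const double r6inv = r2inv*r2inv*r2inv;
  const double s6 = sigma*sigma*sigma*sigma*sigma*sigma;
  const double s12 = s6*s6;
  *force = r2inv*r6inv*(48.0*eps*s12*r6inv-24.0*eps*s6);
  *energy = r6inv*(4.0*eps*s12*r6inv-4.0*eps*s6);  // unshifted pair energy
}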
__kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in, __global numtyp *gum,
const int stride, __global int *dev_ij,
__global acctyp4 *ans, __global acctyp *engv,
__global int *err_flag, const int eflag,
const int vflag, const int start, const int inum,
const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom+start;
int offset=tid%t_per_atom;
__local numtyp sp_lj[4];
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
if (tid<4)
sp_lj[tid]=gum[tid+3];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[tid]=lj1_in[tid];
if (eflag>0)
lj3[tid]=lj3_in[tid];
}
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
__global int *nbor=dev_ij+ii;
int i=*nbor;
nbor+=stride;
int numj=*nbor;
nbor+=stride;
__global int *list_end=nbor+mul24(stride,numj);
nbor+=mul24(offset,stride);
int n_stride=mul24(t_per_atom,stride);
numtyp4 ix=x_[i];
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
if (r2inv<lj1[mtype].z && lj1[mtype].w==SPHERE_SPHERE) {
r2inv=(numtyp)1.0/r2inv;
numtyp r6inv = r2inv*r2inv*r2inv;
numtyp force = factor_lj*r2inv*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<4; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1+=energy;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1+=virial[i];
ap1+=inum;
}
}
acctyp4 old=ans[ii];
old.x+=f.x;
old.y+=f.y;
old.z+=f.z;
ans[ii]=old;
} // if ii
}
#endif


@ -1,456 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#include <iostream>
#include <cassert>
#include <math.h>
#include "gb_gpu_memory.h"
using namespace std;
static GB_GPU_Memory<PRECISION,ACC_PRECISION> GBMF;
#define GBMT GB_GPU_Memory<numtyp,acctyp>
template<class numtyp, class acctyp>
void gb_gpu_pack_nbors(GBMT &gbm, const int GX, const int BX, const int start,
const int inum, const int form_low, const int form_high) {
int stride=gbm.nbor->nbor_pitch();
int anall=gbm.atom->nall();
if (gbm.shared_types) {
GBMF.k_gb_nbor_fast.set_size(GX,BX);
GBMF.k_gb_nbor_fast.run(&gbm.atom->dev_x.begin(),
&gbm.cut_form.begin(), &gbm.nbor->dev_nbor.begin(), &stride,
&start, &inum, &gbm.nbor->dev_packed.begin(), &form_low,
&form_high, &anall);
} else {
GBMF.k_gb_nbor.set_size(GX,BX);
GBMF.k_gb_nbor.run(&gbm.atom->dev_x.begin(), &gbm.cut_form.begin(),
&gbm._lj_types, &gbm.nbor->dev_nbor.begin(), &stride,
&start, &inum, &gbm.nbor->dev_packed.begin(), &form_low,
&form_high, &anall);
}
}
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int gb_gpu_init(const int ntypes, const double gamma,
const double upsilon, const double mu, double **shape,
double **well, double **cutsq, double **sigma,
double **epsilon, double *host_lshape, int **form,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **offset, double *special_lj,
const int inum, const int nall, const int max_nbors,
const double cell_size, int &gpu_mode, FILE *screen) {
GBMF.clear();
gpu_mode=GBMF.device->gpu_mode();
double gpu_split=GBMF.device->particle_split();
int first_gpu=GBMF.device->first_device();
int last_gpu=GBMF.device->last_device();
int world_me=GBMF.device->world_me();
int gpu_rank=GBMF.device->gpu_rank();
int procs_per_gpu=GBMF.device->procs_per_gpu();
GBMF.device->init_message(screen,"gayberne",first_gpu,last_gpu);
bool message=false;
if (GBMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq,
sigma, epsilon, host_lshape, form, host_lj1,
host_lj2, host_lj3, host_lj4, offset, special_lj,
inum, nall, max_nbors, cell_size, gpu_split, screen);
GBMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq, sigma,
epsilon, host_lshape, form, host_lj1, host_lj2,
host_lj3, host_lj4, offset, special_lj, inum, nall,
max_nbors, cell_size, gpu_split, screen);
GBMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
GBMF.estimate_gpu_overhead();
return init_ok;
}
// ---------------------------------------------------------------------------
// Clear memory on host and device
// ---------------------------------------------------------------------------
void gb_gpu_clear() {
GBMF.clear();
}
// ---------------------------------------------------------------------------
// Build neighbor list on device
// ---------------------------------------------------------------------------
template <class gbmtyp>
inline void _gb_gpu_build_nbor_list(gbmtyp &gbm, const int inum,
const int host_inum, const int nall,
double **host_x, double **host_quat,
int *host_type, double *sublo,
double *subhi, bool &success) {
gbm.nbor_time_avail=true;
success=true;
gbm.resize_atom(inum,nall,success);
gbm.resize_local(inum,host_inum,gbm.nbor->max_nbors(),0,success);
if (!success)
return;
gbm.atom->cast_copy_x(host_x,host_type);
int mn;
gbm.nbor->build_nbor_list(inum, host_inum, nall, *gbm.atom,
sublo, subhi, NULL, NULL, NULL, success, mn);
gbm.nbor->copy_unpacked(inum,mn);
gbm.last_ellipse=inum;
gbm.max_last_ellipse=inum;
}
// ---------------------------------------------------------------------------
// Copy neighbor list from host and (if spheres) reorder so ellipses first
// ---------------------------------------------------------------------------
template <class gbmtyp>
void _gb_gpu_reset_nbors(gbmtyp &gbm, const int nall,
const int inum, const int osize,
int *ilist, int *numj,
int *type, int **firstneigh,
bool &success) {
success=true;
gbm.nbor_time_avail=true;
int mn=gbm.nbor->max_nbor_loop(inum,numj,ilist);
gbm.resize_atom(inum,nall,success);
gbm.resize_local(inum,0,mn,osize,success);
if (!success)
return;
if (gbm.multiple_forms) {
int p=0;
for (int i=0; i<osize; i++) {
int itype=type[ilist[i]];
if (gbm.host_form[itype][itype]==ELLIPSE_ELLIPSE) {
gbm.host_olist[p]=ilist[i];
p++;
}
}
gbm.max_last_ellipse=p;
gbm.last_ellipse=std::min(inum,gbm.max_last_ellipse);
for (int i=0; i<osize; i++) {
int itype=type[ilist[i]];
if (gbm.host_form[itype][itype]!=ELLIPSE_ELLIPSE) {
gbm.host_olist[p]=ilist[i];
p++;
}
}
gbm.nbor->get_host(inum,gbm.host_olist.begin(),numj,firstneigh,
gbm.block_size());
gbm.nbor->copy_unpacked(inum,mn);
return;
}
gbm.last_ellipse=inum;
gbm.max_last_ellipse=inum;
gbm.nbor->get_host(inum,ilist,numj,firstneigh,gbm.block_size());
gbm.nbor->copy_unpacked(inum,mn);
}
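// ---------------------------------------------------------------------------
// [sketch] the reordering above in isolation: a stable two-pass partition
// that lists ellipsoids before spheres, so a single kernel launch can cover
// the contiguous range [0, last_ellipse).  "is_ellipse" stands in for the
// host_form[itype][itype]==ELLIPSE_ELLIPSE test used above.
// ---------------------------------------------------------------------------
inline int partition_ellipses(int *olist, const int *ilist, const int osize,
                              const int *type, const bool *is_ellipse) {
  int p=0;
  for (int i=0; i<osize; i++)          // pass 1: ellipsoids first
    if (is_ellipse[type[ilist[i]]])
      olist[p++]=ilist[i];
  const int last_ellipse=p;
  for (int i=0; i<osize; i++)          // pass 2: everything else
    if (!is_ellipse[type[ilist[i]]])
      olist[p++]=ilist[i];
  return last_ellipse;
}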
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=gbm.block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(gbm.ans->inum())/
(BX/gbm._threads_per_atom)));
int stride=gbm.nbor->nbor_pitch();
int ainum=gbm.ans->inum();
int anall=gbm.atom->nall();
if (gbm.multiple_forms) {
gbm.time_kernel.start();
if (gbm.last_ellipse>0) {
// ------------ ELLIPSE_ELLIPSE and ELLIPSE_SPHERE ---------------
GX=static_cast<int>(ceil(static_cast<double>(gbm.last_ellipse)/
(BX/gbm._threads_per_atom)));
gb_gpu_pack_nbors(gbm,GX,BX, 0, gbm.last_ellipse,ELLIPSE_SPHERE,
ELLIPSE_ELLIPSE);
gbm.time_kernel.stop();
gbm.time_gayberne.start();
GBMF.k_gayberne.set_size(GX,BX);
GBMF.k_gayberne.run(&gbm.atom->dev_x.begin(),
&gbm.atom->dev_quat.begin(), &gbm.shape.begin(), &gbm.well.begin(),
&gbm.gamma_upsilon_mu.begin(), &gbm.sigma_epsilon.begin(),
&gbm._lj_types, &gbm.lshape.begin(), &gbm.nbor->dev_nbor.begin(),
&stride, &gbm.ans->dev_ans.begin(),&ainum,&gbm.ans->dev_engv.begin(),
&gbm.dev_error.begin(), &eflag, &vflag, &gbm.last_ellipse, &anall,
&gbm._threads_per_atom);
gbm.time_gayberne.stop();
if (gbm.last_ellipse==gbm.ans->inum()) {
gbm.time_kernel2.start();
gbm.time_kernel2.stop();
gbm.time_gayberne2.start();
gbm.time_gayberne2.stop();
gbm.time_pair.start();
gbm.time_pair.stop();
return;
}
// ------------ SPHERE_ELLIPSE ---------------
gbm.time_kernel2.start();
GX=static_cast<int>(ceil(static_cast<double>(gbm.ans->inum()-
gbm.last_ellipse)/
(BX/gbm._threads_per_atom)));
gb_gpu_pack_nbors(gbm,GX,BX,gbm.last_ellipse,gbm.ans->inum(),
SPHERE_ELLIPSE,SPHERE_ELLIPSE);
gbm.time_kernel2.stop();
gbm.time_gayberne2.start();
GBMF.k_sphere_gb.set_size(GX,BX);
GBMF.k_sphere_gb.run(&gbm.atom->dev_x.begin(),&gbm.atom->dev_quat.begin(),
&gbm.shape.begin(), &gbm.well.begin(),
&gbm.gamma_upsilon_mu.begin(), &gbm.sigma_epsilon.begin(),
&gbm._lj_types, &gbm.lshape.begin(),
&gbm.nbor->dev_nbor.begin(), &stride, &gbm.ans->dev_ans.begin(),
&gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(), &eflag,
&vflag, &gbm.last_ellipse, &ainum, &anall,
&gbm._threads_per_atom);
gbm.time_gayberne2.stop();
} else {
gbm.ans->dev_ans.zero();
gbm.ans->dev_engv.zero();
gbm.time_kernel.stop();
gbm.time_gayberne.start();
gbm.time_gayberne.stop();
gbm.time_kernel2.start();
gbm.time_kernel2.stop();
gbm.time_gayberne2.start();
gbm.time_gayberne2.stop();
}
// ------------ LJ ---------------
gbm.time_pair.start();
if (gbm.last_ellipse<gbm.ans->inum()) {
if (gbm.shared_types) {
GBMF.k_lj_fast.set_size(GX,BX);
GBMF.k_lj_fast.run(&gbm.atom->dev_x.begin(), &gbm.lj1.begin(),
&gbm.lj3.begin(), &gbm.gamma_upsilon_mu.begin(),
&stride, &gbm.nbor->dev_packed.begin(),
&gbm.ans->dev_ans.begin(),
&gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(),
&eflag, &vflag, &gbm.last_ellipse, &ainum, &anall,
&gbm._threads_per_atom);
} else {
GBMF.k_lj.set_size(GX,BX);
GBMF.k_lj.run(&gbm.atom->dev_x.begin(), &gbm.lj1.begin(),
&gbm.lj3.begin(), &gbm._lj_types,
&gbm.gamma_upsilon_mu.begin(), &stride,
&gbm.nbor->dev_packed.begin(), &gbm.ans->dev_ans.begin(),
&gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(),
&eflag, &vflag, &gbm.last_ellipse, &ainum, &anall,
&gbm._threads_per_atom);
}
}
gbm.time_pair.stop();
} else {
gbm.time_kernel.start();
gb_gpu_pack_nbors(gbm, GX, BX, 0, gbm.ans->inum(),SPHERE_SPHERE,
ELLIPSE_ELLIPSE);
gbm.time_kernel.stop();
gbm.time_gayberne.start();
GBMF.k_gayberne.set_size(GX,BX);
GBMF.k_gayberne.run(&gbm.atom->dev_x.begin(), &gbm.atom->dev_quat.begin(),
&gbm.shape.begin(), &gbm.well.begin(),
&gbm.gamma_upsilon_mu.begin(), &gbm.sigma_epsilon.begin(),
&gbm._lj_types, &gbm.lshape.begin(), &gbm.nbor->dev_nbor.begin(),
&stride, &gbm.ans->dev_ans.begin(), &ainum,
&gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(),
&eflag, &vflag, &ainum, &anall, &gbm._threads_per_atom);
gbm.time_gayberne.stop();
}
}
// ---------------------------------------------------------------------------
// Reneighbor on GPU if necessary and then compute forces, torques, energies
// ---------------------------------------------------------------------------
template <class gbmtyp>
inline int** _gb_gpu_compute_n(gbmtyp &gbm, const int ago,
const int inum_full, const int nall,
double **host_x, int *host_type,
double *sublo, double *subhi, const bool eflag,
const bool vflag, const bool eatom,
const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success, double **host_quat) {
gbm.acc_timers();
if (inum_full==0) {
host_start=0;
gbm.zero_timers();
return NULL;
}
gbm.hd_balancer.balance(cpu_time);
int inum=gbm.hd_balancer.get_gpu_count(ago,inum_full);
gbm.ans->inum(inum);
gbm.last_ellipse=std::min(inum,gbm.max_last_ellipse);
host_start=inum;
// Build neighbor list on GPU if necessary
if (ago==0) {
_gb_gpu_build_nbor_list(gbm, inum, inum_full-inum, nall, host_x,
host_quat, host_type, sublo, subhi, success);
if (!success)
return NULL;
gbm.atom->cast_quat_data(host_quat[0]);
gbm.hd_balancer.start_timer();
} else {
gbm.atom->cast_x_data(host_x,host_type);
gbm.atom->cast_quat_data(host_quat[0]);
gbm.hd_balancer.start_timer();
gbm.atom->add_x_data(host_x,host_type);
}
gbm.atom->add_quat_data();
*ilist=gbm.nbor->host_ilist.begin();
*jnum=gbm.nbor->host_acc.begin();
_gb_gpu_gayberne<PRECISION,ACC_PRECISION>(gbm,eflag,vflag);
gbm.ans->copy_answers(eflag,vflag,eatom,vatom);
gbm.device->add_ans_object(gbm.ans);
gbm.hd_balancer.stop_timer();
return gbm.nbor->host_jlist.begin()-host_start;
}
int** gb_gpu_compute_n(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, double *sublo,
double *subhi, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success, double **host_quat) {
return _gb_gpu_compute_n(GBMF, ago, inum_full, nall, host_x, host_type, sublo,
subhi, eflag, vflag, eatom, vatom, host_start, ilist,
jnum, cpu_time, success, host_quat);
}
// ---------------------------------------------------------------------------
// Copy neighbor list from host if necessary, then calculate forces, torques, and energies
// ---------------------------------------------------------------------------
template <class gbmtyp>
inline int * _gb_gpu_compute(gbmtyp &gbm, const int f_ago, const int inum_full,
const int nall,double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag,
const bool eatom, const bool vatom,
int &host_start, const double cpu_time,
bool &success, double **host_quat) {
gbm.acc_timers();
if (inum_full==0) {
host_start=0;
gbm.zero_timers();
return NULL;
}
int ago=gbm.hd_balancer.ago_first(f_ago);
int inum=gbm.hd_balancer.balance(ago,inum_full,cpu_time);
gbm.ans->inum(inum);
gbm.last_ellipse=std::min(inum,gbm.max_last_ellipse);
host_start=inum;
if (ago==0) {
_gb_gpu_reset_nbors(gbm, nall, inum, inum_full, ilist, numj, host_type,
firstneigh, success);
if (!success)
return NULL;
}
int *list;
if (gbm.multiple_forms)
list=gbm.host_olist.begin();
else
list=ilist;
gbm.atom->cast_x_data(host_x,host_type);
gbm.atom->cast_quat_data(host_quat[0]);
gbm.hd_balancer.start_timer();
gbm.atom->add_x_data(host_x,host_type);
gbm.atom->add_quat_data();
_gb_gpu_gayberne<PRECISION,ACC_PRECISION>(gbm,eflag,vflag);
gbm.ans->copy_answers(eflag,vflag,eatom,vatom,list);
gbm.device->add_ans_object(gbm.ans);
gbm.hd_balancer.stop_timer();
return list;
}
int * gb_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double **host_quat) {
return _gb_gpu_compute(GBMF, ago, inum_full, nall, host_x,
host_type, ilist, numj, firstneigh, eflag, vflag,
eatom, vatom, host_start, cpu_time, success,
host_quat);
}
// ---------------------------------------------------------------------------
// Return host memory usage in bytes
// ---------------------------------------------------------------------------
double gb_gpu_bytes() {
return GBMF.host_memory_usage();
}
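/* ----------------------------------------------------------------------
   Illustrative host-side call sequence (sketch only; arguments are
   hypothetical and error handling is omitted):
     // once per run, after coefficients are set (init wrapper above)
     // each step, with CPU-built neighbor lists:
     //   int *list=gb_gpu_compute(ago,inum,nall,x,type,ilist,numj,
     //                            firstneigh,eflag,vflag,eatom,vatom,
     //                            host_start,cpu_time,success,quat);
     // or, with device neighbor builds, gb_gpu_compute_n(...);
     // at teardown: gb_gpu_clear();
------------------------------------------------------------------------- */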

View File

@ -1,315 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef GB_GPU_EXTRA_H
#define GB_GPU_EXTRA_H
enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
#define numtyp4 double4
#define acctyp double
#define acctyp4 double4
#endif
#ifdef _SINGLE_DOUBLE
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp double
#define acctyp4 double4
#endif
#ifndef numtyp
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp float
#define acctyp4 float4
#endif
#ifdef NV_KERNEL
#include "nv_kernel_def.h"
#else
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#define GLOBAL_ID_X get_global_id(0)
#define THREAD_ID_X get_local_id(0)
#define BLOCK_ID_X get_group_id(0)
#define BLOCK_SIZE_X get_local_size(0)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#define __inline inline
#define BLOCK_PAIR 64
#define MAX_SHARED_TYPES 8
#endif
/* ----------------------------------------------------------------------
dot product of 2 vectors
------------------------------------------------------------------------- */
__inline numtyp gpu_dot3(const numtyp *v1, const numtyp *v2)
{
return v1[0]*v2[0]+v1[1]*v2[1]+v1[2]*v2[2];
}
/* ----------------------------------------------------------------------
cross product of 2 vectors
------------------------------------------------------------------------- */
__inline void gpu_cross3(const numtyp *v1, const numtyp *v2, numtyp *ans)
{
ans[0] = v1[1]*v2[2]-v1[2]*v2[1];
ans[1] = v1[2]*v2[0]-v1[0]*v2[2];
ans[2] = v1[0]*v2[1]-v1[1]*v2[0];
}
/* ----------------------------------------------------------------------
determinant of a matrix
------------------------------------------------------------------------- */
__inline numtyp gpu_det3(const numtyp m[9])
{
numtyp ans = m[0]*m[4]*m[8] - m[0]*m[5]*m[7] -
m[3]*m[1]*m[8] + m[3]*m[2]*m[7] +
m[6]*m[1]*m[5] - m[6]*m[2]*m[4];
return ans;
}
/* ----------------------------------------------------------------------
diagonal matrix times a full matrix
------------------------------------------------------------------------- */
__inline void gpu_times3(const numtyp4 shape, const numtyp m[9],
numtyp ans[9])
{
ans[0] = shape.x*m[0];
ans[1] = shape.x*m[1];
ans[2] = shape.x*m[2];
ans[3] = shape.y*m[3];
ans[4] = shape.y*m[4];
ans[5] = shape.y*m[5];
ans[6] = shape.z*m[6];
ans[7] = shape.z*m[7];
ans[8] = shape.z*m[8];
}
/* ----------------------------------------------------------------------
add two matrices
------------------------------------------------------------------------- */
__inline void gpu_plus3(const numtyp m[9], const numtyp m2[9], numtyp ans[9])
{
ans[0] = m[0]+m2[0];
ans[1] = m[1]+m2[1];
ans[2] = m[2]+m2[2];
ans[3] = m[3]+m2[3];
ans[4] = m[4]+m2[4];
ans[5] = m[5]+m2[5];
ans[6] = m[6]+m2[6];
ans[7] = m[7]+m2[7];
ans[8] = m[8]+m2[8];
}
/* ----------------------------------------------------------------------
multiply the transpose of mat1 times mat2
------------------------------------------------------------------------- */
__inline void gpu_transpose_times3(const numtyp m[9], const numtyp m2[9],
numtyp ans[9])
{
ans[0] = m[0]*m2[0]+m[3]*m2[3]+m[6]*m2[6];
ans[1] = m[0]*m2[1]+m[3]*m2[4]+m[6]*m2[7];
ans[2] = m[0]*m2[2]+m[3]*m2[5]+m[6]*m2[8];
ans[3] = m[1]*m2[0]+m[4]*m2[3]+m[7]*m2[6];
ans[4] = m[1]*m2[1]+m[4]*m2[4]+m[7]*m2[7];
ans[5] = m[1]*m2[2]+m[4]*m2[5]+m[7]*m2[8];
ans[6] = m[2]*m2[0]+m[5]*m2[3]+m[8]*m2[6];
ans[7] = m[2]*m2[1]+m[5]*m2[4]+m[8]*m2[7];
ans[8] = m[2]*m2[2]+m[5]*m2[5]+m[8]*m2[8];
}
/* ----------------------------------------------------------------------
row vector times matrix
------------------------------------------------------------------------- */
__inline void gpu_row_times3(const numtyp *v, const numtyp m[9], numtyp *ans)
{
ans[0] = m[0]*v[0]+v[1]*m[3]+v[2]*m[6];
ans[1] = v[0]*m[1]+m[4]*v[1]+v[2]*m[7];
ans[2] = v[0]*m[2]+v[1]*m[5]+m[8]*v[2];
}
/* ----------------------------------------------------------------------
solve Ax = b or M ans = v
use gaussian elimination & partial pivoting on matrix
error_flag set to 2 if bad matrix inversion attempted
------------------------------------------------------------------------- */
__inline void gpu_mldivide3(const numtyp m[9], const numtyp *v, numtyp *ans,
__global int *error_flag)
{
// create augmented matrix for pivoting
numtyp aug[12], t;
aug[3] = v[0];
aug[0] = m[0];
aug[1] = m[1];
aug[2] = m[2];
aug[7] = v[1];
aug[4] = m[3];
aug[5] = m[4];
aug[6] = m[5];
aug[11] = v[2];
aug[8] = m[6];
aug[9] = m[7];
aug[10] = m[8];
if (fabs(aug[4]) > fabs(aug[0])) {
numtyp swapt;
swapt=aug[0]; aug[0]=aug[4]; aug[4]=swapt;
swapt=aug[1]; aug[1]=aug[5]; aug[5]=swapt;
swapt=aug[2]; aug[2]=aug[6]; aug[6]=swapt;
swapt=aug[3]; aug[3]=aug[7]; aug[7]=swapt;
}
if (fabs(aug[8]) > fabs(aug[0])) {
numtyp swapt;
swapt=aug[0]; aug[0]=aug[8]; aug[8]=swapt;
swapt=aug[1]; aug[1]=aug[9]; aug[9]=swapt;
swapt=aug[2]; aug[2]=aug[10]; aug[10]=swapt;
swapt=aug[3]; aug[3]=aug[11]; aug[11]=swapt;
}
  // if the leading pivot is still zero, promote the first row with a
  // nonzero leading element; if none exists the matrix is singular
  if (aug[0] == (numtyp)0.0) {
    if (aug[4] != (numtyp)0.0) {
      numtyp swapt;
      swapt=aug[0]; aug[0]=aug[4]; aug[4]=swapt;
      swapt=aug[1]; aug[1]=aug[5]; aug[5]=swapt;
      swapt=aug[2]; aug[2]=aug[6]; aug[6]=swapt;
      swapt=aug[3]; aug[3]=aug[7]; aug[7]=swapt;
    } else if (aug[8] != (numtyp)0.0) {
      numtyp swapt;
      swapt=aug[0]; aug[0]=aug[8]; aug[8]=swapt;
      swapt=aug[1]; aug[1]=aug[9]; aug[9]=swapt;
      swapt=aug[2]; aug[2]=aug[10]; aug[10]=swapt;
      swapt=aug[3]; aug[3]=aug[11]; aug[11]=swapt;
    } else
      *error_flag=2;
  }
t = aug[4]/aug[0];
aug[5]-=t*aug[1];
aug[6]-=t*aug[2];
aug[7]-=t*aug[3];
t = aug[8]/aug[0];
aug[9]-=t*aug[1];
aug[10]-=t*aug[2];
aug[11]-=t*aug[3];
if (fabs(aug[9]) > fabs(aug[5])) {
numtyp swapt;
swapt=aug[4]; aug[4]=aug[8]; aug[8]=swapt;
swapt=aug[5]; aug[5]=aug[9]; aug[9]=swapt;
swapt=aug[6]; aug[6]=aug[10]; aug[10]=swapt;
swapt=aug[7]; aug[7]=aug[11]; aug[11]=swapt;
}
  // promote the second pivot if it is zero
  if (aug[5] == (numtyp)0.0 && aug[9] != (numtyp)0.0) {
    numtyp swapt;
    swapt=aug[4]; aug[4]=aug[8]; aug[8]=swapt;
    swapt=aug[5]; aug[5]=aug[9]; aug[9]=swapt;
    swapt=aug[6]; aug[6]=aug[10]; aug[10]=swapt;
    swapt=aug[7]; aug[7]=aug[11]; aug[11]=swapt;
  }
t = aug[9]/aug[5];
aug[10]-=t*aug[6];
aug[11]-=t*aug[7];
if (aug[10] == (numtyp)0.0)
*error_flag=2;
ans[2] = aug[11]/aug[10];
t = (numtyp)0.0;
t += aug[6]*ans[2];
ans[1] = (aug[7]-t) / aug[5];
t = (numtyp)0.0;
t += aug[1]*ans[1];
t += aug[2]*ans[2];
ans[0] = (aug[3]-t) / aug[0];
}
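/* ----------------------------------------------------------------------
   illustrative usage sketch (not part of the original source; numtyp is
   assumed to be float and the __global qualifier is ignored here):
     numtyp A[9] = {4,1,0, 1,3,1, 0,1,2};   // row-major, nonsingular
     numtyp b[3] = {1,2,3};
     numtyp x[3];
     int err = 0;
     gpu_mldivide3(A,b,x,&err);             // err stays 0 and A*x == b
   a singular A leaves err set to 2 and x unusable
------------------------------------------------------------------------- */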
/* ----------------------------------------------------------------------
compute rotation matrix from quaternion conjugate
quat = [w i j k]
------------------------------------------------------------------------- */
__inline void gpu_quat_to_mat_trans(__global const numtyp4 *qif, const int qi,
numtyp mat[9])
{
numtyp4 q=qif[qi];
numtyp w2 = q.x*q.x;
numtyp i2 = q.y*q.y;
numtyp j2 = q.z*q.z;
numtyp k2 = q.w*q.w;
numtyp twoij = (numtyp)2.0*q.y*q.z;
numtyp twoik = (numtyp)2.0*q.y*q.w;
numtyp twojk = (numtyp)2.0*q.z*q.w;
numtyp twoiw = (numtyp)2.0*q.y*q.x;
numtyp twojw = (numtyp)2.0*q.z*q.x;
numtyp twokw = (numtyp)2.0*q.w*q.x;
mat[0] = w2+i2-j2-k2;
mat[3] = twoij-twokw;
mat[6] = twojw+twoik;
mat[1] = twoij+twokw;
mat[4] = w2-i2+j2-k2;
mat[7] = twojk-twoiw;
mat[2] = twoik-twojw;
mat[5] = twojk+twoiw;
mat[8] = w2-i2-j2+k2;
}
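/* ----------------------------------------------------------------------
   note (illustrative, not part of the original source): for a unit
   quaternion q = [w i j k] the usual body-to-lab rotation matrix is
     R = [ w2+i2-j2-k2  2(ij-kw)     2(ik+jw)
           2(ij+kw)     w2-i2+j2-k2  2(jk-iw)
           2(ik-jw)     2(jk+iw)     w2-i2-j2+k2 ]
   the routine above stores the transpose R^T = R^-1, i.e. the lab-to-body
   rotation, which is why it is named quat_to_mat_trans
------------------------------------------------------------------------- */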
#endif

View File

@ -1,431 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef GB_GPU_KERNEL
#define GB_GPU_KERNEL
#ifdef NV_KERNEL
#include "gb_gpu_extra.h"
#endif
#define SBBITS 30
#define NEIGHMASK 0x3FFFFFFF
__inline int sbmask(int j) { return j >> SBBITS & 3; }
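// Packed-neighbor convention (explanatory note, following the standard
// LAMMPS encoding): the top two bits of j flag special-bonded neighbors
// (0 = plain pair, 1/2/3 = 1-2/1-3/1-4 partner) and select the sp_lj
// scale factor; the low 30 bits hold the atom index.  For example,
// j = (2<<SBBITS)|17 gives sbmask(j)==2 and (j & NEIGHMASK)==17.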
__inline void compute_eta_torque(numtyp m[9],numtyp m2[9], const numtyp4 shape,
numtyp ans[9])
{
numtyp den = m[3]*m[2]*m[7]-m[0]*m[5]*m[7]-
m[2]*m[6]*m[4]+m[1]*m[6]*m[5]-
m[3]*m[1]*m[8]+m[0]*m[4]*m[8];
den = (numtyp)1.0/den;
ans[0] = shape.x*(m[5]*m[1]*m2[2]+(numtyp)2.0*m[4]*m[8]*m2[0]-
m[4]*m2[2]*m[2]-(numtyp)2.0*m[5]*m2[0]*m[7]+
m2[1]*m[2]*m[7]-m2[1]*m[1]*m[8]-
m[3]*m[8]*m2[1]+m[6]*m[5]*m2[1]+
m[3]*m2[2]*m[7]-m2[2]*m[6]*m[4])*den;
ans[1] = shape.x*(m[2]*m2[0]*m[7]-m[8]*m2[0]*m[1]+
(numtyp)2.0*m[0]*m[8]*m2[1]-m[0]*m2[2]*m[5]-
(numtyp)2.0*m[6]*m[2]*m2[1]+m2[2]*m[3]*m[2]-
m[8]*m[3]*m2[0]+m[6]*m2[0]*m[5]+
m[6]*m2[2]*m[1]-m2[2]*m[0]*m[7])*den;
ans[2] = shape.x*(m[1]*m[5]*m2[0]-m[2]*m2[0]*m[4]-
m[0]*m[5]*m2[1]+m[3]*m[2]*m2[1]-
m2[1]*m[0]*m[7]-m[6]*m[4]*m2[0]+
(numtyp)2.0*m[4]*m[0]*m2[2]-(numtyp)2.0*m[3]*m2[2]*m[1]+
m[3]*m[7]*m2[0]+m[6]*m2[1]*m[1])*den;
ans[3] = shape.y*(-m[4]*m2[5]*m[2]+(numtyp)2.0*m[4]*m[8]*m2[3]+
m[5]*m[1]*m2[5]-(numtyp)2.0*m[5]*m2[3]*m[7]+
m2[4]*m[2]*m[7]-m2[4]*m[1]*m[8]-
m[3]*m[8]*m2[4]+m[6]*m[5]*m2[4]-
m2[5]*m[6]*m[4]+m[3]*m2[5]*m[7])*den;
ans[4] = shape.y*(m[2]*m2[3]*m[7]-m[1]*m[8]*m2[3]+
(numtyp)2.0*m[8]*m[0]*m2[4]-m2[5]*m[0]*m[5]-
(numtyp)2.0*m[6]*m2[4]*m[2]-m[3]*m[8]*m2[3]+
m[6]*m[5]*m2[3]+m[3]*m2[5]*m[2]-
m[0]*m2[5]*m[7]+m2[5]*m[1]*m[6])*den;
ans[5] = shape.y*(m[1]*m[5]*m2[3]-m[2]*m2[3]*m[4]-
m[0]*m[5]*m2[4]+m[3]*m[2]*m2[4]+
(numtyp)2.0*m[4]*m[0]*m2[5]-m[0]*m2[4]*m[7]+
m[1]*m[6]*m2[4]-m2[3]*m[6]*m[4]-
(numtyp)2.0*m[3]*m[1]*m2[5]+m[3]*m2[3]*m[7])*den;
ans[6] = shape.z*(-m[4]*m[2]*m2[8]+m[1]*m[5]*m2[8]+
(numtyp)2.0*m[4]*m2[6]*m[8]-m[1]*m2[7]*m[8]+
m[2]*m[7]*m2[7]-(numtyp)2.0*m2[6]*m[7]*m[5]-
m[3]*m2[7]*m[8]+m[5]*m[6]*m2[7]-
m[4]*m[6]*m2[8]+m[7]*m[3]*m2[8])*den;
ans[7] = shape.z*-(m[1]*m[8]*m2[6]-m[2]*m2[6]*m[7]-
(numtyp)2.0*m2[7]*m[0]*m[8]+m[5]*m2[8]*m[0]+
(numtyp)2.0*m2[7]*m[2]*m[6]+m[3]*m2[6]*m[8]-
m[3]*m[2]*m2[8]-m[5]*m[6]*m2[6]+
m[0]*m2[8]*m[7]-m2[8]*m[1]*m[6])*den;
ans[8] = shape.z*(m[1]*m[5]*m2[6]-m[2]*m2[6]*m[4]-
m[0]*m[5]*m2[7]+m[3]*m[2]*m2[7]-
m[4]*m[6]*m2[6]-m[7]*m2[7]*m[0]+
(numtyp)2.0*m[4]*m2[8]*m[0]+m[7]*m[3]*m2[6]+
m[6]*m[1]*m2[7]-(numtyp)2.0*m2[8]*m[3]*m[1])*den;
}
__kernel void kernel_gayberne(__global numtyp4* x_,__global numtyp4 *q,
__global numtyp4* shape, __global numtyp4* well,
__global numtyp *gum, __global numtyp2* sig_eps,
const int ntypes, __global numtyp *lshape,
__global int *dev_nbor, const int stride,
__global acctyp4 *ans, const int astride,
__global acctyp *engv, __global int *err_flag,
const int eflag, const int vflag, const int inum,
const int nall, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp sp_lj[4];
sp_lj[0]=gum[3];
sp_lj[1]=gum[4];
sp_lj[2]=gum[5];
sp_lj[3]=gum[6];
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp4 tor;
tor.x=(acctyp)0;
tor.y=(acctyp)0;
tor.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=stride;
int numj=*nbor;
nbor+=stride;
__global int *nbor_end=nbor+mul24(stride,numj);
nbor+=mul24(offset,stride);
int n_stride=mul24(t_per_atom,stride);
numtyp4 ix=x_[i];
int itype=ix.w;
numtyp a1[9], b1[9], g1[9];
numtyp4 ishape=shape[itype];
{
numtyp t[9];
gpu_quat_to_mat_trans(q,i,a1);
gpu_times3(ishape,a1,t);
gpu_transpose_times3(a1,t,g1);
gpu_times3(well[itype],a1,t);
gpu_transpose_times3(a1,t,b1);
}
numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=x_[j];
int jtype=jx.w;
// Compute r12
numtyp r12[3];
r12[0] = jx.x-ix.x;
r12[1] = jx.y-ix.y;
r12[2] = jx.z-ix.z;
numtyp ir = gpu_dot3(r12,r12);
ir = rsqrt(ir);
numtyp r = (numtyp)1.0/ir;
numtyp a2[9];
gpu_quat_to_mat_trans(q,j,a2);
numtyp u_r, dUr[3], tUr[3], eta, teta[3];
{ // Compute U_r, dUr, eta, and teta
// Compute g12
numtyp g12[9];
{
numtyp g2[9];
{
gpu_times3(shape[jtype],a2,g12);
gpu_transpose_times3(a2,g12,g2);
gpu_plus3(g1,g2,g12);
}
{ // Compute U_r and dUr
// Compute kappa
numtyp kappa[3];
gpu_mldivide3(g12,r12,kappa,err_flag);
// -- replace r12 with r12 hat
r12[0]*=ir;
r12[1]*=ir;
r12[2]*=ir;
          // -- kappa now holds kappa/r
kappa[0]*=ir;
kappa[1]*=ir;
kappa[2]*=ir;
// energy
// compute u_r and dUr
numtyp uslj_rsq;
{
// Compute distance of closest approach
numtyp h12, sigma12;
sigma12 = gpu_dot3(r12,kappa);
sigma12 = rsqrt((numtyp)0.5*sigma12);
h12 = r-sigma12;
            // -- restore kappa to its full value
kappa[0]*=r;
kappa[1]*=r;
kappa[2]*=r;
int mtype=mul24(ntypes,itype)+jtype;
numtyp sigma = sig_eps[mtype].x;
numtyp epsilon = sig_eps[mtype].y;
numtyp varrho = sigma/(h12+gum[0]*sigma);
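            // gum[0], gum[1], gum[2] hold gamma, upsilon and mu, packed
            // into gamma_upsilon_mu at initialization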
numtyp varrho6 = varrho*varrho*varrho;
varrho6*=varrho6;
numtyp varrho12 = varrho6*varrho6;
u_r = (numtyp)4.0*epsilon*(varrho12-varrho6);
numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma;
temp1 = temp1*(numtyp)24.0*epsilon;
uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5;
numtyp temp2 = gpu_dot3(kappa,r12);
uslj_rsq = uslj_rsq*ir*ir;
dUr[0] = temp1*r12[0]+uslj_rsq*(kappa[0]-temp2*r12[0]);
dUr[1] = temp1*r12[1]+uslj_rsq*(kappa[1]-temp2*r12[1]);
dUr[2] = temp1*r12[2]+uslj_rsq*(kappa[2]-temp2*r12[2]);
}
// torque for particle 1
{
numtyp tempv[3], tempv2[3];
tempv[0] = -uslj_rsq*kappa[0];
tempv[1] = -uslj_rsq*kappa[1];
tempv[2] = -uslj_rsq*kappa[2];
gpu_row_times3(kappa,g1,tempv2);
gpu_cross3(tempv,tempv2,tUr);
}
}
}
// Compute eta
{
eta = (numtyp)2.0*lshape[itype]*lshape[jtype];
numtyp det_g12 = gpu_det3(g12);
eta = pow(eta/det_g12,gum[1]);
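        // i.e. eta = [2*lshape_i*lshape_j / det(G12)]^upsilon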
}
// Compute teta
numtyp temp[9], tempv[3], tempv2[3];
compute_eta_torque(g12,a1,ishape,temp);
numtyp temp1 = -eta*gum[1];
tempv[0] = temp1*temp[0];
tempv[1] = temp1*temp[1];
tempv[2] = temp1*temp[2];
gpu_cross3(a1,tempv,tempv2);
teta[0] = tempv2[0];
teta[1] = tempv2[1];
teta[2] = tempv2[2];
tempv[0] = temp1*temp[3];
tempv[1] = temp1*temp[4];
tempv[2] = temp1*temp[5];
gpu_cross3(a1+3,tempv,tempv2);
teta[0] += tempv2[0];
teta[1] += tempv2[1];
teta[2] += tempv2[2];
tempv[0] = temp1*temp[6];
tempv[1] = temp1*temp[7];
tempv[2] = temp1*temp[8];
gpu_cross3(a1+6,tempv,tempv2);
teta[0] += tempv2[0];
teta[1] += tempv2[1];
teta[2] += tempv2[2];
}
numtyp chi, dchi[3], tchi[3];
{ // Compute chi and dchi
// Compute b12
numtyp b2[9], b12[9];
{
gpu_times3(well[jtype],a2,b12);
gpu_transpose_times3(a2,b12,b2);
gpu_plus3(b1,b2,b12);
}
// compute chi_12
r12[0]*=r;
r12[1]*=r;
r12[2]*=r;
numtyp iota[3];
gpu_mldivide3(b12,r12,iota,err_flag);
// -- iota is now iota/r
iota[0]*=ir;
iota[1]*=ir;
iota[2]*=ir;
r12[0]*=ir;
r12[1]*=ir;
r12[2]*=ir;
chi = gpu_dot3(r12,iota);
chi = pow(chi*(numtyp)2.0,gum[2]);
        // -- restore iota to its full value
iota[0]*=r;
iota[1]*=r;
iota[2]*=r;
numtyp temp1 = gpu_dot3(iota,r12);
numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*pow(chi,(gum[2]-(numtyp)1.0)/
gum[2]);
dchi[0] = temp2*(iota[0]-temp1*r12[0]);
dchi[1] = temp2*(iota[1]-temp1*r12[1]);
dchi[2] = temp2*(iota[2]-temp1*r12[2]);
// compute t_chi
numtyp tempv[3];
gpu_row_times3(iota,b1,tempv);
gpu_cross3(tempv,iota,tchi);
temp1 = (numtyp)-4.0*ir*ir;
tchi[0] *= temp1;
tchi[1] *= temp1;
tchi[2] *= temp1;
}
numtyp temp2 = factor_lj*eta*chi;
if (eflag>0)
energy+=u_r*temp2;
numtyp temp1 = -eta*u_r*factor_lj;
if (vflag>0) {
r12[0]*=-r;
r12[1]*=-r;
r12[2]*=-r;
numtyp ft=temp1*dchi[0]-temp2*dUr[0];
f.x+=ft;
virial[0]+=r12[0]*ft;
ft=temp1*dchi[1]-temp2*dUr[1];
f.y+=ft;
virial[1]+=r12[1]*ft;
virial[3]+=r12[0]*ft;
ft=temp1*dchi[2]-temp2*dUr[2];
f.z+=ft;
virial[2]+=r12[2]*ft;
virial[4]+=r12[0]*ft;
virial[5]+=r12[1]*ft;
} else {
f.x+=temp1*dchi[0]-temp2*dUr[0];
f.y+=temp1*dchi[1]-temp2*dUr[1];
f.z+=temp1*dchi[2]-temp2*dUr[2];
}
// Torque on 1
temp1 = -u_r*eta*factor_lj;
temp2 = -u_r*chi*factor_lj;
numtyp temp3 = -chi*eta*factor_lj;
tor.x+=temp1*tchi[0]+temp2*teta[0]+temp3*tUr[0];
tor.y+=temp1*tchi[1]+temp2*teta[1]+temp3*tUr[1];
tor.z+=temp1*tchi[2]+temp2*teta[2]+temp3*tUr[2];
} // for nbor
} // if ii
// Reduce answers
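  // tree reduction over the t_per_atom threads sharing one atom: stride s
  // halves each step (e.g. t_per_atom=4: lanes 0..3 fold to 0..1, then 0);
  // no barrier is used, which assumes the sub-group runs in lockstep
  // (t_per_atom at most a warp/wavefront wide)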
if (t_per_atom>1) {
__local acctyp red_acc[7][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=tor.x;
red_acc[4][tid]=tor.y;
red_acc[5][tid]=tor.z;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
tor.x=red_acc[3][tid];
tor.y=red_acc[4][tid];
tor.z=red_acc[5][tid];
if (eflag>0 || vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
red_acc[6][tid]=energy;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<7; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
energy=red_acc[6][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=astride;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=astride;
}
}
ans[ii]=f;
ans[ii+astride]=tor;
} // if ii
}
#endif

View File

@ -1,594 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef GB_GPU_KERNEL_LJ
#define GB_GPU_KERNEL_LJ
#ifdef NV_KERNEL
#include "gb_gpu_extra.h"
#endif
#define SBBITS 30
#define NEIGHMASK 0x3FFFFFFF
__inline int sbmask(int j) { return j >> SBBITS & 3; }
__kernel void kernel_sphere_gb(__global numtyp4 *x_,__global numtyp4 *q,
__global numtyp4* shape,__global numtyp4* well,
__global numtyp *gum, __global numtyp2* sig_eps,
const int ntypes, __global numtyp *lshape,
__global int *dev_nbor, const int stride,
__global acctyp4 *ans, __global acctyp *engv,
__global int *err_flag, const int eflag,
const int vflag,const int start, const int inum,
const int nall, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom+start;
int offset=tid%t_per_atom;
__local numtyp sp_lj[4];
sp_lj[0]=gum[3];
sp_lj[1]=gum[4];
sp_lj[2]=gum[5];
sp_lj[3]=gum[6];
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=stride;
int numj=*nbor;
nbor+=stride;
__global int *nbor_end=nbor+stride*numj;
nbor+=mul24(offset,stride);
int n_stride=mul24(t_per_atom,stride);
numtyp4 ix=x_[i];
int itype=ix.w;
numtyp oner=shape[itype].x;
numtyp one_well=well[itype].x;
numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=x_[j];
int jtype=jx.w;
// Compute r12
numtyp r12[3];
r12[0] = jx.x-ix.x;
r12[1] = jx.y-ix.y;
r12[2] = jx.z-ix.z;
numtyp ir = gpu_dot3(r12,r12);
ir = rsqrt(ir);
numtyp r = (numtyp)1.0/ir;
numtyp r12hat[3];
r12hat[0]=r12[0]*ir;
r12hat[1]=r12[1]*ir;
r12hat[2]=r12[2]*ir;
numtyp a2[9];
gpu_quat_to_mat_trans(q,j,a2);
numtyp u_r, dUr[3], eta;
      { // Compute U_r, dUr, and eta
// Compute g12
numtyp g12[9];
{
{
numtyp g2[9];
gpu_times3(shape[jtype],a2,g12);
gpu_transpose_times3(a2,g12,g2);
g12[0]=g2[0]+oner;
g12[4]=g2[4]+oner;
g12[8]=g2[8]+oner;
g12[1]=g2[1];
g12[2]=g2[2];
g12[3]=g2[3];
g12[5]=g2[5];
g12[6]=g2[6];
g12[7]=g2[7];
}
{ // Compute U_r and dUr
// Compute kappa
numtyp kappa[3];
gpu_mldivide3(g12,r12,kappa,err_flag);
          // -- kappa now holds kappa/r
kappa[0]*=ir;
kappa[1]*=ir;
kappa[2]*=ir;
// energy
// compute u_r and dUr
numtyp uslj_rsq;
{
// Compute distance of closest approach
numtyp h12, sigma12;
sigma12 = gpu_dot3(r12hat,kappa);
sigma12 = rsqrt((numtyp)0.5*sigma12);
h12 = r-sigma12;
            // -- restore kappa to its full value
kappa[0]*=r;
kappa[1]*=r;
kappa[2]*=r;
int mtype=mul24(ntypes,itype)+jtype;
numtyp sigma = sig_eps[mtype].x;
numtyp epsilon = sig_eps[mtype].y;
numtyp varrho = sigma/(h12+gum[0]*sigma);
numtyp varrho6 = varrho*varrho*varrho;
varrho6*=varrho6;
numtyp varrho12 = varrho6*varrho6;
u_r = (numtyp)4.0*epsilon*(varrho12-varrho6);
numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma;
temp1 = temp1*(numtyp)24.0*epsilon;
uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5;
numtyp temp2 = gpu_dot3(kappa,r12hat);
uslj_rsq = uslj_rsq*ir*ir;
dUr[0] = temp1*r12hat[0]+uslj_rsq*(kappa[0]-temp2*r12hat[0]);
dUr[1] = temp1*r12hat[1]+uslj_rsq*(kappa[1]-temp2*r12hat[1]);
dUr[2] = temp1*r12hat[2]+uslj_rsq*(kappa[2]-temp2*r12hat[2]);
}
}
}
// Compute eta
{
eta = (numtyp)2.0*lshape[itype]*lshape[jtype];
numtyp det_g12 = gpu_det3(g12);
eta = pow(eta/det_g12,gum[1]);
}
}
numtyp chi, dchi[3];
{ // Compute chi and dchi
// Compute b12
numtyp b12[9];
{
numtyp b2[9];
gpu_times3(well[jtype],a2,b12);
gpu_transpose_times3(a2,b12,b2);
b12[0]=b2[0]+one_well;
b12[4]=b2[4]+one_well;
b12[8]=b2[8]+one_well;
b12[1]=b2[1];
b12[2]=b2[2];
b12[3]=b2[3];
b12[5]=b2[5];
b12[6]=b2[6];
b12[7]=b2[7];
}
// compute chi_12
numtyp iota[3];
gpu_mldivide3(b12,r12,iota,err_flag);
// -- iota is now iota/r
iota[0]*=ir;
iota[1]*=ir;
iota[2]*=ir;
chi = gpu_dot3(r12hat,iota);
chi = pow(chi*(numtyp)2.0,gum[2]);
        // -- restore iota to its full value
iota[0]*=r;
iota[1]*=r;
iota[2]*=r;
numtyp temp1 = gpu_dot3(iota,r12hat);
numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*pow(chi,(gum[2]-(numtyp)1.0)/gum[2]);
dchi[0] = temp2*(iota[0]-temp1*r12hat[0]);
dchi[1] = temp2*(iota[1]-temp1*r12hat[1]);
dchi[2] = temp2*(iota[2]-temp1*r12hat[2]);
}
numtyp temp2 = factor_lj*eta*chi;
if (eflag>0)
energy+=u_r*temp2;
numtyp temp1 = -eta*u_r*factor_lj;
if (vflag>0) {
r12[0]*=-1;
r12[1]*=-1;
r12[2]*=-1;
numtyp ft=temp1*dchi[0]-temp2*dUr[0];
f.x+=ft;
virial[0]+=r12[0]*ft;
ft=temp1*dchi[1]-temp2*dUr[1];
f.y+=ft;
virial[1]+=r12[1]*ft;
virial[3]+=r12[0]*ft;
ft=temp1*dchi[2]-temp2*dUr[2];
f.z+=ft;
virial[2]+=r12[2]*ft;
virial[4]+=r12[0]*ft;
virial[5]+=r12[1]*ft;
} else {
f.x+=temp1*dchi[0]-temp2*dUr[0];
f.y+=temp1*dchi[1]-temp2*dUr[1];
f.z+=temp1*dchi[2]-temp2*dUr[2];
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<4; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
__kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *gum,
const int stride, __global int *dev_ij,
__global acctyp4 *ans, __global acctyp *engv,
__global int *err_flag, const int eflag,
const int vflag, const int start, const int inum,
const int nall, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom+start;
int offset=tid%t_per_atom;
__local numtyp sp_lj[4];
sp_lj[0]=gum[3];
sp_lj[1]=gum[4];
sp_lj[2]=gum[5];
sp_lj[3]=gum[6];
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor=dev_ij+ii;
int i=*nbor;
nbor+=stride;
int numj=*nbor;
nbor+=stride;
__global int *list_end=nbor+mul24(stride,numj);
nbor+=mul24(offset,stride);
int n_stride=mul24(t_per_atom,stride);
numtyp4 ix=x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
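        // assuming the usual LAMMPS packing (lj1.x = 48*eps*sigma^12,
        // lj1.y = 24*eps*sigma^6), the force computed below is -(dU/dr)/r
        // for the 12-6 potential, so scaling by delx/dely/delz yields F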
        int mtype=itype*lj_types+jtype;
        if (r2inv<lj1[mtype].z && lj1[mtype].w==SPHERE_SPHERE) {
          r2inv=(numtyp)1.0/r2inv;
          numtyp r6inv = r2inv*r2inv*r2inv;
          numtyp force = r2inv*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
          force*=factor_lj;
          f.x+=delx*force;
          f.y+=dely*force;
          f.z+=delz*force;
          if (eflag>0) {
            numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
            energy+=factor_lj*(e-lj3[mtype].z);
          }
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<4; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1+=energy;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1+=virial[i];
ap1+=inum;
}
}
acctyp4 old=ans[ii];
old.x+=f.x;
old.y+=f.y;
old.z+=f.z;
ans[ii]=old;
} // if ii
}
__kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in, __global numtyp *gum,
const int stride, __global int *dev_ij,
__global acctyp4 *ans, __global acctyp *engv,
__global int *err_flag, const int eflag,
const int vflag, const int start, const int inum,
const int nall, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom+start;
int offset=tid%t_per_atom;
__local numtyp sp_lj[4];
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
if (tid<4)
sp_lj[tid]=gum[tid+3];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[tid]=lj1_in[tid];
if (eflag>0)
lj3[tid]=lj3_in[tid];
}
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
__global int *nbor=dev_ij+ii;
int i=*nbor;
nbor+=stride;
int numj=*nbor;
nbor+=stride;
__global int *list_end=nbor+mul24(stride,numj);
nbor+=mul24(offset,stride);
int n_stride=mul24(t_per_atom,stride);
numtyp4 ix=x_[i];
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
if (r2inv<lj1[mtype].z && lj1[mtype].w==SPHERE_SPHERE) {
r2inv=(numtyp)1.0/r2inv;
numtyp r6inv = r2inv*r2inv*r2inv;
numtyp force = factor_lj*r2inv*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<4; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1+=energy;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1+=virial[i];
ap1+=inum;
}
}
acctyp4 old=ans[ii];
old.x+=f.x;
old.y+=f.y;
old.z+=f.z;
ans[ii]=old;
} // if ii
}
#endif

View File

@ -1,169 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef PAIR_GPU_KERNEL_H
#define PAIR_GPU_KERNEL_H
#ifdef NV_KERNEL
#include "nv_kernel_def.h"
#else
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#define GLOBAL_ID_X get_global_id(0)
#define THREAD_ID_X get_local_id(0)
#define BLOCK_ID_X get_group_id(0)
#define BLOCK_SIZE_X get_local_size(0)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#define MAX_SHARED_TYPES 8
#endif
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
#define numtyp4 double4
#else
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#endif
// ---------------------------------------------------------------------------
// Unpack neighbors from dev_ij array into dev_nbor matrix for coalesced access
// -- Only unpack neighbors matching the specified inclusive range of forms
// -- Only unpack neighbors within cutoff
// ---------------------------------------------------------------------------
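// Layout note (illustrative): dev_nbor is a column-per-atom matrix with
// row pitch nbor_pitch; row 0 holds the atom index, row 1 the packed
// neighbor count, and rows 2..newj+1 the packed neighbor indices, so
// adjacent threads touch adjacent words in each row (coalesced access).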
__kernel void kernel_gb_nbor(__global numtyp4 *x_, __global numtyp2 *cut_form,
const int ntypes, __global int *dev_nbor,
const int nbor_pitch,
const int start, const int inum,
__global int *dev_ij, const int form_low,
const int form_high, const int nall) {
  // ii indexes the atom whose neighbor list this thread unpacks
int ii=GLOBAL_ID_X+start;
if (ii<inum) {
__global int *nbor=dev_ij+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+mul24(numj,nbor_pitch);
__global int *packed=dev_nbor+ii+nbor_pitch+nbor_pitch;
numtyp4 ix=x_[i];
int iw=ix.w;
int itype=mul24(iw,ntypes);
int newj=0;
for ( ; nbor<list_end; nbor+=nbor_pitch) {
int j=*nbor;
if (j>=nall)
j%=nall;
numtyp4 jx=x_[j];
int jtype=jx.w;
int mtype=itype+jtype;
numtyp2 cf=cut_form[mtype];
if (cf.y>=form_low && cf.y<=form_high) {
// Compute r12;
numtyp rsq=jx.x-ix.x;
rsq*=rsq;
numtyp t=jx.y-ix.y;
rsq+=t*t;
t=jx.z-ix.z;
rsq+=t*t;
if (rsq<cf.x) {
*packed=j;
packed+=nbor_pitch;
newj++;
}
}
}
dev_nbor[ii+nbor_pitch]=newj;
}
}
// ---------------------------------------------------------------------------
// Unpack neighbors from dev_ij array into dev_nbor matrix for coalesced access
// -- Only unpack neighbors matching the specified inclusive range of forms
// -- Only unpack neighbors within cutoff
// -- Fast version of routine that uses shared memory for LJ constants
// ---------------------------------------------------------------------------
__kernel void kernel_gb_nbor_fast(__global numtyp4 *x_,
__global numtyp2 *cut_form,
__global int *dev_nbor,
const int nbor_pitch,
const int start, const int inum,
__global int *dev_ij, const int form_low,
const int form_high, const int nall) {
int ii=THREAD_ID_X;
__local int form[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
cutsq[ii]=cut_form[ii].x;
form[ii]=cut_form[ii].y;
}
ii+=mul24((int)BLOCK_SIZE_X,(int)BLOCK_ID_X)+start;
__syncthreads();
if (ii<inum) {
__global int *nbor=dev_ij+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+mul24(numj,nbor_pitch);
__global int *packed=dev_nbor+ii+nbor_pitch+nbor_pitch;
numtyp4 ix=x_[i];
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
int newj=0;
for ( ; nbor<list_end; nbor+=nbor_pitch) {
int j=*nbor;
if (j>=nall)
j%=nall;
numtyp4 jx=x_[j];
int jtype=jx.w;
int mtype=itype+jtype;
if (form[mtype]>=form_low && form[mtype]<=form_high) {
// Compute r12;
numtyp rsq=jx.x-ix.x;
rsq*=rsq;
numtyp t=jx.y-ix.y;
rsq+=t*t;
t=jx.z-ix.z;
rsq+=t*t;
if (rsq<cutsq[mtype]) {
*packed=j;
packed+=nbor_pitch;
newj++;
}
}
}
dev_nbor[ii+nbor_pitch]=newj;
}
}
#endif

View File

@ -1,361 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifdef USE_OPENCL
#include "gb_gpu_cl.h"
#include "gb_gpu_nbor_cl.h"
#else
#include "gb_gpu_ptx.h"
#endif
#include "gb_gpu_memory.h"
#include <cassert>
#define GB_GPU_MemoryT GB_GPU_Memory<numtyp, acctyp>
extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
template <class numtyp, class acctyp>
GB_GPU_MemoryT::GB_GPU_Memory() : _allocated(false), _compiled(false),
_max_bytes(0.0) {
device=&pair_gpu_device;
ans=new PairGPUAns<numtyp,acctyp>();
nbor=new PairGPUNbor;
}
template <class numtyp, class acctyp>
GB_GPU_MemoryT::~GB_GPU_Memory() {
clear();
delete ans;
delete nbor;
}
template <class numtyp, class acctyp>
int GB_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
return device->atom.bytes_per_atom()+ans->bytes_per_atom()+
nbor->bytes_per_atom(max_nbors);
}
template <class numtyp, class acctyp>
int GB_GPU_MemoryT::init(const int ntypes, const double gamma,
const double upsilon, const double mu,
double **host_shape, double **host_well,
double **host_cutsq, double **host_sigma,
double **host_epsilon, double *host_lshape,
int **h_form, double **host_lj1, double **host_lj2,
double **host_lj3, double **host_lj4,
double **host_offset, const double *host_special_lj,
const int nlocal, const int nall,
const int max_nbors, const double cell_size,
const double gpu_split, FILE *_screen) {
nbor_time_avail=false;
screen=_screen;
bool gpu_nbor=false;
if (device->gpu_mode()==PairGPUDevice<numtyp,acctyp>::GPU_NEIGH)
gpu_nbor=true;
int _gpu_host=0;
int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
if (host_nlocal>0)
_gpu_host=1;
_threads_per_atom=device->threads_per_atom();
int success=device->init(*ans,false,true,nlocal,host_nlocal,nall,nbor,0,
_gpu_host,max_nbors,cell_size,true);
if (success!=0)
return success;
ucl_device=device->gpu;
atom=&device->atom;
_block_size=device->pair_block_size();
compile_kernels(*ucl_device);
// Initialize host-device load balancer
hd_balancer.init(device,gpu_nbor,gpu_split);
// Initialize timers for the selected GPU
time_pair.init(*ucl_device);
time_pair.zero();
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
int max_shared_types=device->max_shared_types();
if (lj_types<=max_shared_types && _block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for copying type data
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*ucl_device,
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
sigma_epsilon.alloc(lj_types*lj_types,*ucl_device,UCL_READ_ONLY);
this->atom->type_pack2(ntypes,lj_types,sigma_epsilon,host_write,
host_sigma,host_epsilon);
cut_form.alloc(lj_types*lj_types,*ucl_device,UCL_READ_ONLY);
this->atom->type_pack2(ntypes,lj_types,cut_form,host_write,
host_cutsq,h_form);
lj1.alloc(lj_types*lj_types,*ucl_device,UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
host_cutsq,h_form);
lj3.alloc(lj_types*lj_types,*ucl_device,UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
host_offset);
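  // type_pack2/type_pack4 flatten the ntypes x ntypes host tables into
  // lj_types x lj_types device arrays indexed as itype*lj_types+jtype,
  // matching the mtype lookups in the kernels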
dev_error.alloc(1,*ucl_device);
dev_error.zero();
_allocated=true;
host_form=h_form;
// Initialize timers for the selected GPU
time_kernel.init(*ucl_device);
time_gayberne.init(*ucl_device);
time_kernel2.init(*ucl_device);
time_gayberne2.init(*ucl_device);
time_kernel.zero();
time_gayberne.zero();
time_kernel2.zero();
time_gayberne2.zero();
// Allocate, cast and asynchronous memcpy of constant data
// Copy data for bonded interactions
gamma_upsilon_mu.alloc(7,*ucl_device,UCL_READ_ONLY);
host_write[0]=static_cast<numtyp>(gamma);
host_write[1]=static_cast<numtyp>(upsilon);
host_write[2]=static_cast<numtyp>(mu);
host_write[3]=static_cast<numtyp>(host_special_lj[0]);
host_write[4]=static_cast<numtyp>(host_special_lj[1]);
host_write[5]=static_cast<numtyp>(host_special_lj[2]);
host_write[6]=static_cast<numtyp>(host_special_lj[3]);
ucl_copy(gamma_upsilon_mu,host_write,7,false);
lshape.alloc(ntypes,*ucl_device,UCL_READ_ONLY);
UCL_H_Vec<double> d_view;
d_view.view(host_lshape,lshape.numel(),*ucl_device);
ucl_copy(lshape,d_view,false);
// Copy shape, well, sigma, epsilon, and cutsq onto GPU
// - cast if necessary
shape.alloc(ntypes,*ucl_device,UCL_READ_ONLY);
for (int i=0; i<ntypes; i++) {
host_write[i*4]=host_shape[i][0];
host_write[i*4+1]=host_shape[i][1];
host_write[i*4+2]=host_shape[i][2];
}
UCL_H_Vec<numtyp4> view4;
view4.view((numtyp4*)host_write.begin(),shape.numel(),*ucl_device);
ucl_copy(shape,view4,false);
well.alloc(ntypes,*ucl_device,UCL_READ_ONLY);
for (int i=0; i<ntypes; i++) {
host_write[i*4]=host_well[i][0];
host_write[i*4+1]=host_well[i][1];
host_write[i*4+2]=host_well[i][2];
}
view4.view((numtyp4*)host_write.begin(),well.numel(),*ucl_device);
ucl_copy(well,view4,false);
  // Determine whether any type pair is not ELLIPSE_ELLIPSE, in which case
  // the separate GB-sphere and sphere-sphere kernels are needed
multiple_forms=false;
for (int i=1; i<ntypes; i++)
for (int j=i; j<ntypes; j++)
if (host_form[i][j]!=ELLIPSE_ELLIPSE)
multiple_forms=true;
if (multiple_forms && host_nlocal>0) {
std::cerr << "Cannot use Gayberne with multiple forms and GPU neighbor.\n";
exit(1);
}
if (multiple_forms)
ans->dev_ans.zero();
_max_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
// Memory for ilist ordered by particle type
if (host_olist.alloc(nbor->max_atoms(),*ucl_device)==UCL_SUCCESS)
return 0;
else return -3;
}
template <class numtyp, class acctyp>
void GB_GPU_MemoryT::estimate_gpu_overhead() {
device->estimate_gpu_overhead(2,_gpu_overhead,_driver_overhead);
}
template <class numtyp, class acctyp>
void GB_GPU_MemoryT::clear() {
if (!_allocated)
return;
UCL_H_Vec<int> err_flag(1,*ucl_device);
ucl_copy(err_flag,dev_error,false);
if (err_flag[0] == 2)
std::cerr << "BAD MATRIX INVERSION IN FORCE COMPUTATION.\n";
err_flag.clear();
_allocated=false;
// Output any timing information
acc_timers();
double single[9], times[9];
single[0]=atom->transfer_time()+ans->transfer_time();
single[1]=nbor->time_nbor.total_seconds();
single[2]=time_kernel.total_seconds()+time_kernel2.total_seconds()+
nbor->time_kernel.total_seconds();
single[3]=time_gayberne.total_seconds()+time_gayberne2.total_seconds();
if (multiple_forms)
single[4]=time_pair.total_seconds();
else
single[4]=0;
single[5]=atom->cast_time()+ans->cast_time();
single[6]=_gpu_overhead;
single[7]=_driver_overhead;
single[8]=ans->cpu_idle_time();
MPI_Reduce(single,times,9,MPI_DOUBLE,MPI_SUM,0,device->replica());
double avg_split=hd_balancer.all_avg_split();
_max_bytes+=dev_error.row_bytes()+lj1.row_bytes()+lj3.row_bytes()+
sigma_epsilon.row_bytes()+cut_form.row_bytes()+
shape.row_bytes()+well.row_bytes()+lshape.row_bytes()+
gamma_upsilon_mu.row_bytes()+atom->max_gpu_bytes();
double mpi_max_bytes;
MPI_Reduce(&_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,
device->replica());
double max_mb=mpi_max_bytes/(1024*1024);
if (device->replica_me()==0)
if (screen && times[3]>0.0) {
int replica_size=device->replica_size();
fprintf(screen,"\n\n-------------------------------------");
fprintf(screen,"--------------------------------\n");
fprintf(screen," GPU Time Info (average): ");
fprintf(screen,"\n-------------------------------------");
fprintf(screen,"--------------------------------\n");
if (device->procs_per_gpu()==1) {
fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/replica_size);
fprintf(screen,"Data Cast/Pack: %.4f s.\n",times[5]/replica_size);
fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/replica_size);
if (nbor->gpu_nbor())
fprintf(screen,"Neighbor build: %.4f s.\n",times[2]/replica_size);
else
fprintf(screen,"Neighbor unpack: %.4f s.\n",times[2]/replica_size);
fprintf(screen,"Force calc: %.4f s.\n",times[3]/replica_size);
fprintf(screen,"LJ calc: %.4f s.\n",times[4]/replica_size);
}
fprintf(screen,"GPU Overhead: %.4f s.\n",times[6]/replica_size);
fprintf(screen,"Average split: %.4f.\n",avg_split);
fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
fprintf(screen,"CPU Driver_Time: %.4f s.\n",times[7]/replica_size);
fprintf(screen,"CPU Idle_Time: %.4f s.\n",times[8]/replica_size);
fprintf(screen,"-------------------------------------");
fprintf(screen,"--------------------------------\n\n");
fprintf(screen,"Average split: %.4f.\n",avg_split);
fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
}
_max_bytes=0.0;
dev_error.clear();
lj1.clear();
lj3.clear();
sigma_epsilon.clear();
cut_form.clear();
shape.clear();
well.clear();
lshape.clear();
gamma_upsilon_mu.clear();
host_olist.clear();
time_kernel.clear();
time_gayberne.clear();
time_kernel2.clear();
time_gayberne2.clear();
time_pair.clear();
hd_balancer.clear();
if (_compiled) {
k_gb_nbor_fast.clear();
k_gb_nbor.clear();
k_gayberne.clear();
k_sphere_gb.clear();
k_lj_fast.clear();
k_lj.clear();
delete pair_program;
delete gb_program;
delete gb_lj_program;
_compiled=false;
}
nbor->clear();
ans->clear();
device->clear();
}
template <class numtyp, class acctyp>
double GB_GPU_MemoryT::host_memory_usage() const {
return device->atom.host_memory_usage()+nbor->host_memory_usage()+
4*sizeof(numtyp)+sizeof(GB_GPU_Memory<numtyp,acctyp>)+
nbor->max_atoms()*sizeof(int);
}
template <class numtyp, class acctyp>
void GB_GPU_MemoryT::compile_kernels(UCL_Device &dev) {
if (_compiled)
return;
std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
std::string(OCL_PRECISION_COMPILE);
pair_program=new UCL_Program(dev);
pair_program->load_string(gb_gpu_kernel_nbor,flags.c_str());
k_gb_nbor_fast.set_function(*pair_program,"kernel_gb_nbor_fast");
k_gb_nbor.set_function(*pair_program,"kernel_gb_nbor");
gb_program=new UCL_Program(dev);
gb_program->load_string(gb_gpu_kernel,flags.c_str());
k_gayberne.set_function(*gb_program,"kernel_gayberne");
gb_lj_program=new UCL_Program(dev);
gb_lj_program->load_string(gb_gpu_kernel_lj,flags.c_str());
k_sphere_gb.set_function(*gb_lj_program,"kernel_sphere_gb");
k_lj_fast.set_function(*gb_lj_program,"kernel_lj_fast");
k_lj.set_function(*gb_lj_program,"kernel_lj");
_compiled=true;
}
template class GB_GPU_Memory<PRECISION,ACC_PRECISION>;

View File

@ -1,214 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef GB_GPU_MEMORY_H
#define GB_GPU_MEMORY_H
#include "pair_gpu_device.h"
#include "pair_gpu_balance.h"
#include "mpi.h"
template <class numtyp, class acctyp>
class GB_GPU_Memory {
public:
GB_GPU_Memory();
~GB_GPU_Memory();
/// Clear any previous data and set up for a new LAMMPS run
/** \param gpu_nbor true if neighboring performed on device
* \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
   * \return 0 on success or an error code on failure
   *
   * Returns:
   * - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 Double precision is not supported on card **/
int init(const int ntypes, const double gamma,
const double upsilon, const double mu, double **host_shape,
double **host_well, double **host_cutsq, double **host_sigma,
double **host_epsilon, double *host_lshape, int **h_form,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
const double *host_special_lj, const int nlocal, const int nall,
const int max_nbors, const double cell_size,
const double gpu_split, FILE *screen);
/// Estimate the overhead for GPU context changes and CPU driver
void estimate_gpu_overhead();
/// Check if there is enough storage for atom arrays and realloc if not
/** \param success set to false if insufficient memory **/
inline void resize_atom(const int inum, const int nall, bool &success) {
atom->resize(nall, success);
ans->resize(inum, success);
if (multiple_forms) ans->dev_ans.zero();
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_bytes)
_max_bytes=bytes;
}
/// Check if there is enough storage for neighbors and realloc if not
/** \param nlocal number of particles whose nbors must be stored on device
* \param host_inum number of particles whose nbors need to copied to host
   * \param max_nbors current maximum number of neighbors
* \param olist_size size of list of particles from CPU neighboring
* \note host_inum is 0 if the host is performing neighboring
   * \note if GPU is neighboring, nlocal+host_inum = total number of local particles
* \note if CPU is neighboring olist_size=total number of local particles
* \note if GPU is neighboring olist_size=0 **/
inline void resize_local(const int nlocal, const int host_inum,
const int max_nbors, const int olist_size,
bool &success) {
if (olist_size>static_cast<int>(host_olist.numel())) {
host_olist.clear();
int new_size=static_cast<int>(static_cast<double>(olist_size)*1.10);
success=success && (host_olist.alloc(new_size,*ucl_device)==UCL_SUCCESS);
}
nbor->resize(nlocal,host_inum,max_nbors,success);
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_bytes)
_max_bytes=bytes;
}
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
/// Accumulate timers
inline void acc_timers() {
if (device->time_device()) {
if (nbor_time_avail) {
nbor->time_nbor.add_to_total();
nbor->time_kernel.add_to_total();
nbor_time_avail=false;
}
time_kernel.add_to_total();
time_gayberne.add_to_total();
if (multiple_forms) {
time_kernel2.add_to_total();
time_gayberne2.add_to_total();
time_pair.add_to_total();
}
atom->acc_timers();
ans->acc_timers();
}
}
  /// Zero timers
inline void zero_timers() {
nbor_time_avail=false;
time_kernel.zero();
time_gayberne.zero();
if (multiple_forms) {
time_kernel2.zero();
time_gayberne2.zero();
time_pair.zero();
}
atom->zero_timers();
ans->zero_timers();
}
// -------------------------- DEVICE DATA -------------------------
/// Device Properties and Atom and Neighbor storage
PairGPUDevice<numtyp,acctyp> *device;
/// Geryon device
UCL_Device *ucl_device;
/// Device Error Flag - Set if a bad matrix inversion occurs
UCL_D_Vec<int> dev_error;
/// Device timers
UCL_Timer time_kernel, time_gayberne, time_kernel2, time_gayberne2, time_pair;
/// Host device load balancer
PairGPUBalance<numtyp,acctyp> hd_balancer;
/// LAMMPS pointer for screen output
FILE *screen;
// --------------------------- TYPE DATA --------------------------
/// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq, lj1.w = form
UCL_D_Vec<numtyp4> lj1;
/// lj3.x = lj3, lj3.y = lj4, lj3.z = offset
UCL_D_Vec<numtyp4> lj3;
/// sigma_epsilon.x = sigma, sigma_epsilon.y = epsilon
UCL_D_Vec<numtyp2> sigma_epsilon;
/// cut_form.x = cutsq, cut_form.y = form
UCL_D_Vec<numtyp2> cut_form;
  /// 0 = gamma, 1 = upsilon, 2 = mu, 3 = special_lj[0], 4 = special_lj[1], ...
  UCL_D_Vec<numtyp> gamma_upsilon_mu;
  /// True if we want to use fast GB-sphere or sphere-sphere calculations
  bool multiple_forms;
int **host_form;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
int _lj_types;
// --------------------------- ATOM DATA --------------------------
/// Atom Data
PairGPUAtom<numtyp,acctyp> *atom;
/// Aspherical Const Data for Atoms
UCL_D_Vec<numtyp4> shape, well;
/// Aspherical Const Data for Atoms
UCL_D_Vec<numtyp> lshape;
int last_ellipse, max_last_ellipse;
// ------------------------ FORCE/ENERGY DATA -----------------------
PairGPUAns<numtyp,acctyp> *ans;
// --------------------------- NBOR DATA ----------------------------
/// Neighbor data
PairGPUNbor *nbor;
/// ilist with particles sorted by type
UCL_H_Vec<int> host_olist;
/// True if we should accumulate the neighbor timer
bool nbor_time_avail;
// ------------------------- DEVICE KERNELS -------------------------
UCL_Program *pair_program, *gb_program, *gb_lj_program;
UCL_Kernel k_gb_nbor_fast, k_gb_nbor;
UCL_Kernel k_gayberne, k_sphere_gb, k_lj_fast, k_lj;
inline int block_size() { return _block_size; }
int _threads_per_atom;
private:
bool _allocated, _compiled;
int _block_size;
double _max_bytes;
double _gpu_overhead, _driver_overhead;
void compile_kernels(UCL_Device &dev);
};
#endif

View File

@@ -1,997 +0,0 @@
/*******************************************************************************
* Copyright (c) 2008-2010 The Khronos Group Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and/or associated documentation files (the
* "Materials"), to deal in the Materials without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Materials, and to
* permit persons to whom the Materials are furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Materials.
*
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
******************************************************************************/
/* $Revision: 11708 $ on $Date: 2010-06-14 12:06:24 +0530 (Mon, 14 Jun 2010) $ */
#ifndef __OPENCL_CL_H
#define __OPENCL_CL_H
#ifdef __APPLE__
#include <OpenCL/cl_platform.h>
#else
#include <CL/cl_platform.h>
#endif
#ifdef __cplusplus
extern "C" {
#endif
/******************************************************************************/
typedef struct _cl_platform_id * cl_platform_id;
typedef struct _cl_device_id * cl_device_id;
typedef struct _cl_context * cl_context;
typedef struct _cl_command_queue * cl_command_queue;
typedef struct _cl_mem * cl_mem;
typedef struct _cl_program * cl_program;
typedef struct _cl_kernel * cl_kernel;
typedef struct _cl_event * cl_event;
typedef struct _cl_sampler * cl_sampler;
typedef cl_uint cl_bool; /* WARNING! Unlike cl_ types in cl_platform.h, cl_bool is not guaranteed to be the same size as the bool in kernels. */
typedef cl_ulong cl_bitfield;
typedef cl_bitfield cl_device_type;
typedef cl_uint cl_platform_info;
typedef cl_uint cl_device_info;
typedef cl_bitfield cl_device_fp_config;
typedef cl_uint cl_device_mem_cache_type;
typedef cl_uint cl_device_local_mem_type;
typedef cl_bitfield cl_device_exec_capabilities;
typedef cl_bitfield cl_command_queue_properties;
typedef intptr_t cl_context_properties;
typedef cl_uint cl_context_info;
typedef cl_uint cl_command_queue_info;
typedef cl_uint cl_channel_order;
typedef cl_uint cl_channel_type;
typedef cl_bitfield cl_mem_flags;
typedef cl_uint cl_mem_object_type;
typedef cl_uint cl_mem_info;
typedef cl_uint cl_image_info;
typedef cl_uint cl_buffer_create_type;
typedef cl_uint cl_addressing_mode;
typedef cl_uint cl_filter_mode;
typedef cl_uint cl_sampler_info;
typedef cl_bitfield cl_map_flags;
typedef cl_uint cl_program_info;
typedef cl_uint cl_program_build_info;
typedef cl_int cl_build_status;
typedef cl_uint cl_kernel_info;
typedef cl_uint cl_kernel_work_group_info;
typedef cl_uint cl_event_info;
typedef cl_uint cl_command_type;
typedef cl_uint cl_profiling_info;
typedef struct _cl_image_format {
cl_channel_order image_channel_order;
cl_channel_type image_channel_data_type;
} cl_image_format;
typedef struct _cl_buffer_region {
size_t origin;
size_t size;
} cl_buffer_region;
/******************************************************************************/
/* Error Codes */
#define CL_SUCCESS 0
#define CL_DEVICE_NOT_FOUND -1
#define CL_DEVICE_NOT_AVAILABLE -2
#define CL_COMPILER_NOT_AVAILABLE -3
#define CL_MEM_OBJECT_ALLOCATION_FAILURE -4
#define CL_OUT_OF_RESOURCES -5
#define CL_OUT_OF_HOST_MEMORY -6
#define CL_PROFILING_INFO_NOT_AVAILABLE -7
#define CL_MEM_COPY_OVERLAP -8
#define CL_IMAGE_FORMAT_MISMATCH -9
#define CL_IMAGE_FORMAT_NOT_SUPPORTED -10
#define CL_BUILD_PROGRAM_FAILURE -11
#define CL_MAP_FAILURE -12
#define CL_MISALIGNED_SUB_BUFFER_OFFSET -13
#define CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST -14
#define CL_INVALID_VALUE -30
#define CL_INVALID_DEVICE_TYPE -31
#define CL_INVALID_PLATFORM -32
#define CL_INVALID_DEVICE -33
#define CL_INVALID_CONTEXT -34
#define CL_INVALID_QUEUE_PROPERTIES -35
#define CL_INVALID_COMMAND_QUEUE -36
#define CL_INVALID_HOST_PTR -37
#define CL_INVALID_MEM_OBJECT -38
#define CL_INVALID_IMAGE_FORMAT_DESCRIPTOR -39
#define CL_INVALID_IMAGE_SIZE -40
#define CL_INVALID_SAMPLER -41
#define CL_INVALID_BINARY -42
#define CL_INVALID_BUILD_OPTIONS -43
#define CL_INVALID_PROGRAM -44
#define CL_INVALID_PROGRAM_EXECUTABLE -45
#define CL_INVALID_KERNEL_NAME -46
#define CL_INVALID_KERNEL_DEFINITION -47
#define CL_INVALID_KERNEL -48
#define CL_INVALID_ARG_INDEX -49
#define CL_INVALID_ARG_VALUE -50
#define CL_INVALID_ARG_SIZE -51
#define CL_INVALID_KERNEL_ARGS -52
#define CL_INVALID_WORK_DIMENSION -53
#define CL_INVALID_WORK_GROUP_SIZE -54
#define CL_INVALID_WORK_ITEM_SIZE -55
#define CL_INVALID_GLOBAL_OFFSET -56
#define CL_INVALID_EVENT_WAIT_LIST -57
#define CL_INVALID_EVENT -58
#define CL_INVALID_OPERATION -59
#define CL_INVALID_GL_OBJECT -60
#define CL_INVALID_BUFFER_SIZE -61
#define CL_INVALID_MIP_LEVEL -62
#define CL_INVALID_GLOBAL_WORK_SIZE -63
/* OpenCL Version */
#define CL_VERSION_1_0 1
#define CL_VERSION_1_1 1
/* cl_bool */
#define CL_FALSE 0
#define CL_TRUE 1
/* cl_platform_info */
#define CL_PLATFORM_PROFILE 0x0900
#define CL_PLATFORM_VERSION 0x0901
#define CL_PLATFORM_NAME 0x0902
#define CL_PLATFORM_VENDOR 0x0903
#define CL_PLATFORM_EXTENSIONS 0x0904
/* cl_device_type - bitfield */
#define CL_DEVICE_TYPE_DEFAULT (1 << 0)
#define CL_DEVICE_TYPE_CPU (1 << 1)
#define CL_DEVICE_TYPE_GPU (1 << 2)
#define CL_DEVICE_TYPE_ACCELERATOR (1 << 3)
#define CL_DEVICE_TYPE_ALL 0xFFFFFFFF
/* cl_device_info */
#define CL_DEVICE_TYPE 0x1000
#define CL_DEVICE_VENDOR_ID 0x1001
#define CL_DEVICE_MAX_COMPUTE_UNITS 0x1002
#define CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS 0x1003
#define CL_DEVICE_MAX_WORK_GROUP_SIZE 0x1004
#define CL_DEVICE_MAX_WORK_ITEM_SIZES 0x1005
#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR 0x1006
#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT 0x1007
#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT 0x1008
#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG 0x1009
#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT 0x100A
#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE 0x100B
#define CL_DEVICE_MAX_CLOCK_FREQUENCY 0x100C
#define CL_DEVICE_ADDRESS_BITS 0x100D
#define CL_DEVICE_MAX_READ_IMAGE_ARGS 0x100E
#define CL_DEVICE_MAX_WRITE_IMAGE_ARGS 0x100F
#define CL_DEVICE_MAX_MEM_ALLOC_SIZE 0x1010
#define CL_DEVICE_IMAGE2D_MAX_WIDTH 0x1011
#define CL_DEVICE_IMAGE2D_MAX_HEIGHT 0x1012
#define CL_DEVICE_IMAGE3D_MAX_WIDTH 0x1013
#define CL_DEVICE_IMAGE3D_MAX_HEIGHT 0x1014
#define CL_DEVICE_IMAGE3D_MAX_DEPTH 0x1015
#define CL_DEVICE_IMAGE_SUPPORT 0x1016
#define CL_DEVICE_MAX_PARAMETER_SIZE 0x1017
#define CL_DEVICE_MAX_SAMPLERS 0x1018
#define CL_DEVICE_MEM_BASE_ADDR_ALIGN 0x1019
#define CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE 0x101A
#define CL_DEVICE_SINGLE_FP_CONFIG 0x101B
#define CL_DEVICE_GLOBAL_MEM_CACHE_TYPE 0x101C
#define CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE 0x101D
#define CL_DEVICE_GLOBAL_MEM_CACHE_SIZE 0x101E
#define CL_DEVICE_GLOBAL_MEM_SIZE 0x101F
#define CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE 0x1020
#define CL_DEVICE_MAX_CONSTANT_ARGS 0x1021
#define CL_DEVICE_LOCAL_MEM_TYPE 0x1022
#define CL_DEVICE_LOCAL_MEM_SIZE 0x1023
#define CL_DEVICE_ERROR_CORRECTION_SUPPORT 0x1024
#define CL_DEVICE_PROFILING_TIMER_RESOLUTION 0x1025
#define CL_DEVICE_ENDIAN_LITTLE 0x1026
#define CL_DEVICE_AVAILABLE 0x1027
#define CL_DEVICE_COMPILER_AVAILABLE 0x1028
#define CL_DEVICE_EXECUTION_CAPABILITIES 0x1029
#define CL_DEVICE_QUEUE_PROPERTIES 0x102A
#define CL_DEVICE_NAME 0x102B
#define CL_DEVICE_VENDOR 0x102C
#define CL_DRIVER_VERSION 0x102D
#define CL_DEVICE_PROFILE 0x102E
#define CL_DEVICE_VERSION 0x102F
#define CL_DEVICE_EXTENSIONS 0x1030
#define CL_DEVICE_PLATFORM 0x1031
/* 0x1032 reserved for CL_DEVICE_DOUBLE_FP_CONFIG */
/* 0x1033 reserved for CL_DEVICE_HALF_FP_CONFIG */
#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF 0x1034
#define CL_DEVICE_HOST_UNIFIED_MEMORY 0x1035
#define CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR 0x1036
#define CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT 0x1037
#define CL_DEVICE_NATIVE_VECTOR_WIDTH_INT 0x1038
#define CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG 0x1039
#define CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT 0x103A
#define CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE 0x103B
#define CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF 0x103C
#define CL_DEVICE_OPENCL_C_VERSION 0x103D
/* cl_device_fp_config - bitfield */
#define CL_FP_DENORM (1 << 0)
#define CL_FP_INF_NAN (1 << 1)
#define CL_FP_ROUND_TO_NEAREST (1 << 2)
#define CL_FP_ROUND_TO_ZERO (1 << 3)
#define CL_FP_ROUND_TO_INF (1 << 4)
#define CL_FP_FMA (1 << 5)
#define CL_FP_SOFT_FLOAT (1 << 6)
/* cl_device_mem_cache_type */
#define CL_NONE 0x0
#define CL_READ_ONLY_CACHE 0x1
#define CL_READ_WRITE_CACHE 0x2
/* cl_device_local_mem_type */
#define CL_LOCAL 0x1
#define CL_GLOBAL 0x2
/* cl_device_exec_capabilities - bitfield */
#define CL_EXEC_KERNEL (1 << 0)
#define CL_EXEC_NATIVE_KERNEL (1 << 1)
/* cl_command_queue_properties - bitfield */
#define CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE (1 << 0)
#define CL_QUEUE_PROFILING_ENABLE (1 << 1)
/* cl_context_info */
#define CL_CONTEXT_REFERENCE_COUNT 0x1080
#define CL_CONTEXT_DEVICES 0x1081
#define CL_CONTEXT_PROPERTIES 0x1082
#define CL_CONTEXT_NUM_DEVICES 0x1083
/* cl_context_info + cl_context_properties */
#define CL_CONTEXT_PLATFORM 0x1084
/* cl_command_queue_info */
#define CL_QUEUE_CONTEXT 0x1090
#define CL_QUEUE_DEVICE 0x1091
#define CL_QUEUE_REFERENCE_COUNT 0x1092
#define CL_QUEUE_PROPERTIES 0x1093
/* cl_mem_flags - bitfield */
#define CL_MEM_READ_WRITE (1 << 0)
#define CL_MEM_WRITE_ONLY (1 << 1)
#define CL_MEM_READ_ONLY (1 << 2)
#define CL_MEM_USE_HOST_PTR (1 << 3)
#define CL_MEM_ALLOC_HOST_PTR (1 << 4)
#define CL_MEM_COPY_HOST_PTR (1 << 5)
/* cl_channel_order */
#define CL_R 0x10B0
#define CL_A 0x10B1
#define CL_RG 0x10B2
#define CL_RA 0x10B3
#define CL_RGB 0x10B4
#define CL_RGBA 0x10B5
#define CL_BGRA 0x10B6
#define CL_ARGB 0x10B7
#define CL_INTENSITY 0x10B8
#define CL_LUMINANCE 0x10B9
#define CL_Rx 0x10BA
#define CL_RGx 0x10BB
#define CL_RGBx 0x10BC
/* cl_channel_type */
#define CL_SNORM_INT8 0x10D0
#define CL_SNORM_INT16 0x10D1
#define CL_UNORM_INT8 0x10D2
#define CL_UNORM_INT16 0x10D3
#define CL_UNORM_SHORT_565 0x10D4
#define CL_UNORM_SHORT_555 0x10D5
#define CL_UNORM_INT_101010 0x10D6
#define CL_SIGNED_INT8 0x10D7
#define CL_SIGNED_INT16 0x10D8
#define CL_SIGNED_INT32 0x10D9
#define CL_UNSIGNED_INT8 0x10DA
#define CL_UNSIGNED_INT16 0x10DB
#define CL_UNSIGNED_INT32 0x10DC
#define CL_HALF_FLOAT 0x10DD
#define CL_FLOAT 0x10DE
/* cl_mem_object_type */
#define CL_MEM_OBJECT_BUFFER 0x10F0
#define CL_MEM_OBJECT_IMAGE2D 0x10F1
#define CL_MEM_OBJECT_IMAGE3D 0x10F2
/* cl_mem_info */
#define CL_MEM_TYPE 0x1100
#define CL_MEM_FLAGS 0x1101
#define CL_MEM_SIZE 0x1102
#define CL_MEM_HOST_PTR 0x1103
#define CL_MEM_MAP_COUNT 0x1104
#define CL_MEM_REFERENCE_COUNT 0x1105
#define CL_MEM_CONTEXT 0x1106
#define CL_MEM_ASSOCIATED_MEMOBJECT 0x1107
#define CL_MEM_OFFSET 0x1108
/* cl_image_info */
#define CL_IMAGE_FORMAT 0x1110
#define CL_IMAGE_ELEMENT_SIZE 0x1111
#define CL_IMAGE_ROW_PITCH 0x1112
#define CL_IMAGE_SLICE_PITCH 0x1113
#define CL_IMAGE_WIDTH 0x1114
#define CL_IMAGE_HEIGHT 0x1115
#define CL_IMAGE_DEPTH 0x1116
/* cl_addressing_mode */
#define CL_ADDRESS_NONE 0x1130
#define CL_ADDRESS_CLAMP_TO_EDGE 0x1131
#define CL_ADDRESS_CLAMP 0x1132
#define CL_ADDRESS_REPEAT 0x1133
#define CL_ADDRESS_MIRRORED_REPEAT 0x1134
/* cl_filter_mode */
#define CL_FILTER_NEAREST 0x1140
#define CL_FILTER_LINEAR 0x1141
/* cl_sampler_info */
#define CL_SAMPLER_REFERENCE_COUNT 0x1150
#define CL_SAMPLER_CONTEXT 0x1151
#define CL_SAMPLER_NORMALIZED_COORDS 0x1152
#define CL_SAMPLER_ADDRESSING_MODE 0x1153
#define CL_SAMPLER_FILTER_MODE 0x1154
/* cl_map_flags - bitfield */
#define CL_MAP_READ (1 << 0)
#define CL_MAP_WRITE (1 << 1)
/* cl_program_info */
#define CL_PROGRAM_REFERENCE_COUNT 0x1160
#define CL_PROGRAM_CONTEXT 0x1161
#define CL_PROGRAM_NUM_DEVICES 0x1162
#define CL_PROGRAM_DEVICES 0x1163
#define CL_PROGRAM_SOURCE 0x1164
#define CL_PROGRAM_BINARY_SIZES 0x1165
#define CL_PROGRAM_BINARIES 0x1166
/* cl_program_build_info */
#define CL_PROGRAM_BUILD_STATUS 0x1181
#define CL_PROGRAM_BUILD_OPTIONS 0x1182
#define CL_PROGRAM_BUILD_LOG 0x1183
/* cl_build_status */
#define CL_BUILD_SUCCESS 0
#define CL_BUILD_NONE -1
#define CL_BUILD_ERROR -2
#define CL_BUILD_IN_PROGRESS -3
/* cl_kernel_info */
#define CL_KERNEL_FUNCTION_NAME 0x1190
#define CL_KERNEL_NUM_ARGS 0x1191
#define CL_KERNEL_REFERENCE_COUNT 0x1192
#define CL_KERNEL_CONTEXT 0x1193
#define CL_KERNEL_PROGRAM 0x1194
/* cl_kernel_work_group_info */
#define CL_KERNEL_WORK_GROUP_SIZE 0x11B0
#define CL_KERNEL_COMPILE_WORK_GROUP_SIZE 0x11B1
#define CL_KERNEL_LOCAL_MEM_SIZE 0x11B2
#define CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE 0x11B3
#define CL_KERNEL_PRIVATE_MEM_SIZE 0x11B4
/* cl_event_info */
#define CL_EVENT_COMMAND_QUEUE 0x11D0
#define CL_EVENT_COMMAND_TYPE 0x11D1
#define CL_EVENT_REFERENCE_COUNT 0x11D2
#define CL_EVENT_COMMAND_EXECUTION_STATUS 0x11D3
#define CL_EVENT_CONTEXT 0x11D4
/* cl_command_type */
#define CL_COMMAND_NDRANGE_KERNEL 0x11F0
#define CL_COMMAND_TASK 0x11F1
#define CL_COMMAND_NATIVE_KERNEL 0x11F2
#define CL_COMMAND_READ_BUFFER 0x11F3
#define CL_COMMAND_WRITE_BUFFER 0x11F4
#define CL_COMMAND_COPY_BUFFER 0x11F5
#define CL_COMMAND_READ_IMAGE 0x11F6
#define CL_COMMAND_WRITE_IMAGE 0x11F7
#define CL_COMMAND_COPY_IMAGE 0x11F8
#define CL_COMMAND_COPY_IMAGE_TO_BUFFER 0x11F9
#define CL_COMMAND_COPY_BUFFER_TO_IMAGE 0x11FA
#define CL_COMMAND_MAP_BUFFER 0x11FB
#define CL_COMMAND_MAP_IMAGE 0x11FC
#define CL_COMMAND_UNMAP_MEM_OBJECT 0x11FD
#define CL_COMMAND_MARKER 0x11FE
#define CL_COMMAND_ACQUIRE_GL_OBJECTS 0x11FF
#define CL_COMMAND_RELEASE_GL_OBJECTS 0x1200
#define CL_COMMAND_READ_BUFFER_RECT 0x1201
#define CL_COMMAND_WRITE_BUFFER_RECT 0x1202
#define CL_COMMAND_COPY_BUFFER_RECT 0x1203
#define CL_COMMAND_USER 0x1204
/* command execution status */
#define CL_COMPLETE 0x0
#define CL_RUNNING 0x1
#define CL_SUBMITTED 0x2
#define CL_QUEUED 0x3
/* cl_buffer_create_type */
#define CL_BUFFER_CREATE_TYPE_REGION 0x1220
/* cl_profiling_info */
#define CL_PROFILING_COMMAND_QUEUED 0x1280
#define CL_PROFILING_COMMAND_SUBMIT 0x1281
#define CL_PROFILING_COMMAND_START 0x1282
#define CL_PROFILING_COMMAND_END 0x1283
/********************************************************************************************************/
/* Platform API */
extern CL_API_ENTRY cl_int CL_API_CALL
clGetPlatformIDs(cl_uint /* num_entries */,
cl_platform_id * /* platforms */,
cl_uint * /* num_platforms */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetPlatformInfo(cl_platform_id /* platform */,
cl_platform_info /* param_name */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
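/* Illustrative sketch (not part of the original header): the two-call idiom
 * used by most query functions in this API -- pass NULL storage first to
 * learn the count, then call again with storage of that size:
 *
 *   cl_uint n;
 *   clGetPlatformIDs(0, NULL, &n);
 *   cl_platform_id *plats = (cl_platform_id *) malloc(n * sizeof(*plats));
 *   clGetPlatformIDs(n, plats, NULL);
 */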
/* Device APIs */
extern CL_API_ENTRY cl_int CL_API_CALL
clGetDeviceIDs(cl_platform_id /* platform */,
cl_device_type /* device_type */,
cl_uint /* num_entries */,
cl_device_id * /* devices */,
cl_uint * /* num_devices */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetDeviceInfo(cl_device_id /* device */,
cl_device_info /* param_name */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
/* Context APIs */
extern CL_API_ENTRY cl_context CL_API_CALL
clCreateContext(const cl_context_properties * /* properties */,
cl_uint /* num_devices */,
const cl_device_id * /* devices */,
void (CL_CALLBACK * /* pfn_notify */)(const char *, const void *, size_t, void *),
void * /* user_data */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_context CL_API_CALL
clCreateContextFromType(const cl_context_properties * /* properties */,
cl_device_type /* device_type */,
void (CL_CALLBACK * /* pfn_notify*/ )(const char *, const void *, size_t, void *),
void * /* user_data */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clRetainContext(cl_context /* context */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clReleaseContext(cl_context /* context */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetContextInfo(cl_context /* context */,
cl_context_info /* param_name */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
/* Command Queue APIs */
extern CL_API_ENTRY cl_command_queue CL_API_CALL
clCreateCommandQueue(cl_context /* context */,
cl_device_id /* device */,
cl_command_queue_properties /* properties */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clRetainCommandQueue(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clReleaseCommandQueue(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetCommandQueueInfo(cl_command_queue /* command_queue */,
cl_command_queue_info /* param_name */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS
#warning CL_USE_DEPRECATED_OPENCL_1_0_APIS is defined. These APIs are unsupported and untested in OpenCL 1.1!
/*
* WARNING:
* This API introduces mutable state into the OpenCL implementation. It has been REMOVED
* to better facilitate thread safety. The 1.0 API is not thread safe. It is not tested by the
* OpenCL 1.1 conformance test, and consequently may not work or may not work dependably.
* It is likely to be non-performant. Use of this API is not advised. Use at your own risk.
*
* Software developers previously relying on this API are instructed to set the command queue
* properties when creating the queue, instead.
*/
extern CL_API_ENTRY cl_int CL_API_CALL
clSetCommandQueueProperty(cl_command_queue /* command_queue */,
cl_command_queue_properties /* properties */,
cl_bool /* enable */,
cl_command_queue_properties * /* old_properties */) CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED;
#endif /* CL_USE_DEPRECATED_OPENCL_1_0_APIS */
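/* Illustrative sketch (not part of the original header): the replacement
 * pattern the deprecation note above describes -- request queue properties
 * at creation time instead of toggling them later.  `context` and `device`
 * are assumed to already exist:
 *
 *   cl_int err;
 *   cl_command_queue queue =
 *       clCreateCommandQueue(context, device,
 *                            CL_QUEUE_PROFILING_ENABLE, &err);
 */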
/* Memory Object APIs */
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateBuffer(cl_context /* context */,
cl_mem_flags /* flags */,
size_t /* size */,
void * /* host_ptr */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateSubBuffer(cl_mem /* buffer */,
cl_mem_flags /* flags */,
cl_buffer_create_type /* buffer_create_type */,
const void * /* buffer_create_info */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1;
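/* Illustrative sketch (not part of the original header): carving a 1 KiB
 * sub-buffer out of an existing buffer using the cl_buffer_region type
 * defined above.  `parent` is an assumed, already-created cl_mem:
 *
 *   cl_buffer_region region = { 0, 1024 };  // origin and size in bytes
 *   cl_int err;
 *   cl_mem sub = clCreateSubBuffer(parent, CL_MEM_READ_WRITE,
 *                                  CL_BUFFER_CREATE_TYPE_REGION,
 *                                  &region, &err);
 */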
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateImage2D(cl_context /* context */,
cl_mem_flags /* flags */,
const cl_image_format * /* image_format */,
size_t /* image_width */,
size_t /* image_height */,
size_t /* image_row_pitch */,
void * /* host_ptr */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateImage3D(cl_context /* context */,
cl_mem_flags /* flags */,
const cl_image_format * /* image_format */,
size_t /* image_width */,
size_t /* image_height */,
size_t /* image_depth */,
size_t /* image_row_pitch */,
size_t /* image_slice_pitch */,
void * /* host_ptr */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clRetainMemObject(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clReleaseMemObject(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetSupportedImageFormats(cl_context /* context */,
cl_mem_flags /* flags */,
cl_mem_object_type /* image_type */,
cl_uint /* num_entries */,
cl_image_format * /* image_formats */,
cl_uint * /* num_image_formats */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetMemObjectInfo(cl_mem /* memobj */,
cl_mem_info /* param_name */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetImageInfo(cl_mem /* image */,
cl_image_info /* param_name */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clSetMemObjectDestructorCallback( cl_mem /* memobj */,
void (CL_CALLBACK * /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/),
void * /*user_data */ ) CL_API_SUFFIX__VERSION_1_1;
/* Sampler APIs */
extern CL_API_ENTRY cl_sampler CL_API_CALL
clCreateSampler(cl_context /* context */,
cl_bool /* normalized_coords */,
cl_addressing_mode /* addressing_mode */,
cl_filter_mode /* filter_mode */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clRetainSampler(cl_sampler /* sampler */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clReleaseSampler(cl_sampler /* sampler */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetSamplerInfo(cl_sampler /* sampler */,
cl_sampler_info /* param_name */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
/* Program Object APIs */
extern CL_API_ENTRY cl_program CL_API_CALL
clCreateProgramWithSource(cl_context /* context */,
cl_uint /* count */,
const char ** /* strings */,
const size_t * /* lengths */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_program CL_API_CALL
clCreateProgramWithBinary(cl_context /* context */,
cl_uint /* num_devices */,
const cl_device_id * /* device_list */,
const size_t * /* lengths */,
const unsigned char ** /* binaries */,
cl_int * /* binary_status */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clRetainProgram(cl_program /* program */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clReleaseProgram(cl_program /* program */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clBuildProgram(cl_program /* program */,
cl_uint /* num_devices */,
const cl_device_id * /* device_list */,
const char * /* options */,
void (CL_CALLBACK * /* pfn_notify */)(cl_program /* program */, void * /* user_data */),
void * /* user_data */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clUnloadCompiler(void) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetProgramInfo(cl_program /* program */,
cl_program_info /* param_name */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetProgramBuildInfo(cl_program /* program */,
cl_device_id /* device */,
cl_program_build_info /* param_name */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
/* Kernel Object APIs */
extern CL_API_ENTRY cl_kernel CL_API_CALL
clCreateKernel(cl_program /* program */,
const char * /* kernel_name */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clCreateKernelsInProgram(cl_program /* program */,
cl_uint /* num_kernels */,
cl_kernel * /* kernels */,
cl_uint * /* num_kernels_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clRetainKernel(cl_kernel /* kernel */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clReleaseKernel(cl_kernel /* kernel */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clSetKernelArg(cl_kernel /* kernel */,
cl_uint /* arg_index */,
size_t /* arg_size */,
const void * /* arg_value */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetKernelInfo(cl_kernel /* kernel */,
cl_kernel_info /* param_name */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetKernelWorkGroupInfo(cl_kernel /* kernel */,
cl_device_id /* device */,
cl_kernel_work_group_info /* param_name */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
/* Event Object APIs */
extern CL_API_ENTRY cl_int CL_API_CALL
clWaitForEvents(cl_uint /* num_events */,
const cl_event * /* event_list */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetEventInfo(cl_event /* event */,
cl_event_info /* param_name */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_event CL_API_CALL
clCreateUserEvent(cl_context /* context */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1;
extern CL_API_ENTRY cl_int CL_API_CALL
clRetainEvent(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clReleaseEvent(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clSetUserEventStatus(cl_event /* event */,
cl_int /* execution_status */) CL_API_SUFFIX__VERSION_1_1;
extern CL_API_ENTRY cl_int CL_API_CALL
clSetEventCallback( cl_event /* event */,
cl_int /* command_exec_callback_type */,
void (CL_CALLBACK * /* pfn_notify */)(cl_event, cl_int, void *),
void * /* user_data */) CL_API_SUFFIX__VERSION_1_1;
/* Profiling APIs */
extern CL_API_ENTRY cl_int CL_API_CALL
clGetEventProfilingInfo(cl_event /* event */,
cl_profiling_info /* param_name */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
/* Flush and Finish APIs */
extern CL_API_ENTRY cl_int CL_API_CALL
clFlush(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clFinish(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
/* Enqueued Commands APIs */
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueReadBuffer(cl_command_queue /* command_queue */,
cl_mem /* buffer */,
cl_bool /* blocking_read */,
size_t /* offset */,
size_t /* cb */,
void * /* ptr */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueReadBufferRect(cl_command_queue /* command_queue */,
cl_mem /* buffer */,
cl_bool /* blocking_read */,
const size_t * /* buffer_offset */,
const size_t * /* host_offset */,
const size_t * /* region */,
size_t /* buffer_row_pitch */,
size_t /* buffer_slice_pitch */,
size_t /* host_row_pitch */,
size_t /* host_slice_pitch */,
void * /* ptr */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_1;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueWriteBuffer(cl_command_queue /* command_queue */,
cl_mem /* buffer */,
cl_bool /* blocking_write */,
size_t /* offset */,
size_t /* cb */,
const void * /* ptr */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueWriteBufferRect(cl_command_queue /* command_queue */,
cl_mem /* buffer */,
cl_bool /* blocking_read */,
const size_t * /* buffer_offset */,
const size_t * /* host_offset */,
const size_t * /* region */,
size_t /* buffer_row_pitch */,
size_t /* buffer_slice_pitch */,
size_t /* host_row_pitch */,
size_t /* host_slice_pitch */,
const void * /* ptr */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_1;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueCopyBuffer(cl_command_queue /* command_queue */,
cl_mem /* src_buffer */,
cl_mem /* dst_buffer */,
size_t /* src_offset */,
size_t /* dst_offset */,
size_t /* cb */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueCopyBufferRect(cl_command_queue /* command_queue */,
cl_mem /* src_buffer */,
cl_mem /* dst_buffer */,
const size_t * /* src_origin */,
const size_t * /* dst_origin */,
const size_t * /* region */,
size_t /* src_row_pitch */,
size_t /* src_slice_pitch */,
size_t /* dst_row_pitch */,
size_t /* dst_slice_pitch */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_1;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueReadImage(cl_command_queue /* command_queue */,
cl_mem /* image */,
cl_bool /* blocking_read */,
const size_t * /* origin[3] */,
const size_t * /* region[3] */,
size_t /* row_pitch */,
size_t /* slice_pitch */,
void * /* ptr */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueWriteImage(cl_command_queue /* command_queue */,
cl_mem /* image */,
cl_bool /* blocking_write */,
const size_t * /* origin[3] */,
const size_t * /* region[3] */,
size_t /* input_row_pitch */,
size_t /* input_slice_pitch */,
const void * /* ptr */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueCopyImage(cl_command_queue /* command_queue */,
cl_mem /* src_image */,
cl_mem /* dst_image */,
const size_t * /* src_origin[3] */,
const size_t * /* dst_origin[3] */,
const size_t * /* region[3] */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueCopyImageToBuffer(cl_command_queue /* command_queue */,
cl_mem /* src_image */,
cl_mem /* dst_buffer */,
const size_t * /* src_origin[3] */,
const size_t * /* region[3] */,
size_t /* dst_offset */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueCopyBufferToImage(cl_command_queue /* command_queue */,
cl_mem /* src_buffer */,
cl_mem /* dst_image */,
size_t /* src_offset */,
const size_t * /* dst_origin[3] */,
const size_t * /* region[3] */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY void * CL_API_CALL
clEnqueueMapBuffer(cl_command_queue /* command_queue */,
cl_mem /* buffer */,
cl_bool /* blocking_map */,
cl_map_flags /* map_flags */,
size_t /* offset */,
size_t /* cb */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY void * CL_API_CALL
clEnqueueMapImage(cl_command_queue /* command_queue */,
cl_mem /* image */,
cl_bool /* blocking_map */,
cl_map_flags /* map_flags */,
const size_t * /* origin[3] */,
const size_t * /* region[3] */,
size_t * /* image_row_pitch */,
size_t * /* image_slice_pitch */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueUnmapMemObject(cl_command_queue /* command_queue */,
cl_mem /* memobj */,
void * /* mapped_ptr */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueNDRangeKernel(cl_command_queue /* command_queue */,
cl_kernel /* kernel */,
cl_uint /* work_dim */,
const size_t * /* global_work_offset */,
const size_t * /* global_work_size */,
const size_t * /* local_work_size */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueTask(cl_command_queue /* command_queue */,
cl_kernel /* kernel */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueNativeKernel(cl_command_queue /* command_queue */,
void (*user_func)(void *),
void * /* args */,
size_t /* cb_args */,
cl_uint /* num_mem_objects */,
const cl_mem * /* mem_list */,
const void ** /* args_mem_loc */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueMarker(cl_command_queue /* command_queue */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueWaitForEvents(cl_command_queue /* command_queue */,
cl_uint /* num_events */,
const cl_event * /* event_list */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueBarrier(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
/* Extension function access
*
* Returns the extension function address for the given function name,
* or NULL if a valid function can not be found. The client must
* check to make sure the address is not NULL, before using or
* calling the returned function address.
*/
extern CL_API_ENTRY void * CL_API_CALL clGetExtensionFunctionAddress(const char * /* func_name */) CL_API_SUFFIX__VERSION_1_0;
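/* Illustrative sketch (not part of the original header): the NULL check the
 * comment above requires, using the clIcdGetPlatformIDsKHR_fn pointer type
 * that cl_ext.h declares:
 *
 *   clIcdGetPlatformIDsKHR_fn pfn = (clIcdGetPlatformIDsKHR_fn)
 *       clGetExtensionFunctionAddress("clIcdGetPlatformIDsKHR");
 *   if (pfn != NULL) {
 *     cl_uint n;
 *     pfn(0, NULL, &n);
 *   }
 */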
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_H */

View File

@@ -1,213 +0,0 @@
/*******************************************************************************
* Copyright (c) 2008-2010 The Khronos Group Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and/or associated documentation files (the
* "Materials"), to deal in the Materials without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Materials, and to
* permit persons to whom the Materials are furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Materials.
*
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
******************************************************************************/
/* $Revision: 11687 $ on $Date: 2010-06-12 03:47:22 +0530 (Sat, 12 Jun 2010) $ */
/* cl_ext.h contains OpenCL extensions which don't have external */
/* (OpenGL, D3D) dependencies. */
#ifndef __CL_EXT_H
#define __CL_EXT_H
#ifdef __cplusplus
extern "C" {
#endif
#ifdef __APPLE__
#include <OpenCL/cl.h>
#include <AvailabilityMacros.h>
#else
#include <CL/cl.h>
#endif
/* cl_khr_fp64 extension - no extension #define since it has no functions */
#define CL_DEVICE_DOUBLE_FP_CONFIG 0x1032
/* cl_khr_fp16 extension - no extension #define since it has no functions */
#define CL_DEVICE_HALF_FP_CONFIG 0x1033
/* Memory object destruction
*
* Apple extension for use to manage externally allocated buffers used with cl_mem objects with CL_MEM_USE_HOST_PTR
*
* Registers a user callback function that will be called when the memory object is deleted and its resources
* freed. Each call to clSetMemObjectCallbackFn registers the specified user callback function on a callback
* stack associated with memobj. The registered user callback functions are called in the reverse order in
* which they were registered. The user callback functions are called and then the memory object is deleted
* and its resources freed. This provides a mechanism for the application (and libraries) using memobj to be
* notified when the memory referenced by host_ptr, specified when the memory object is created and used as
* the storage bits for the memory object, can be reused or freed.
*
 * The application may not call CL APIs with the cl_mem object passed to the pfn_notify.
*
* Please check for the "cl_APPLE_SetMemObjectDestructor" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS)
* before using.
*/
#define cl_APPLE_SetMemObjectDestructor 1
cl_int CL_API_ENTRY clSetMemObjectDestructorAPPLE( cl_mem /* memobj */,
void (* /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/),
void * /*user_data */ ) CL_EXT_SUFFIX__VERSION_1_0;
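/* Illustrative sketch (not part of the original header): registering a
 * destructor so externally allocated host memory can be reclaimed once the
 * cl_mem is deleted.  `buf` and `host_ptr` are assumed to already exist:
 *
 *   static void notify(cl_mem memobj, void *user_data) {
 *     free(user_data);               // the host_ptr registered below
 *   }
 *   ...
 *   clSetMemObjectDestructorAPPLE(buf, notify, host_ptr);
 */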
/* Context Logging Functions
*
* The next three convenience functions are intended to be used as the pfn_notify parameter to clCreateContext().
* Please check for the "cl_APPLE_ContextLoggingFunctions" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS)
* before using.
*
 * clLogMessagesToSystemLog forwards all log messages to the Apple System Logger
*/
#define cl_APPLE_ContextLoggingFunctions 1
extern void CL_API_ENTRY clLogMessagesToSystemLogAPPLE( const char * /* errstr */,
const void * /* private_info */,
size_t /* cb */,
void * /* user_data */ ) CL_EXT_SUFFIX__VERSION_1_0;
/* clLogMessagesToStdout sends all log messages to the file descriptor stdout */
extern void CL_API_ENTRY clLogMessagesToStdoutAPPLE( const char * /* errstr */,
const void * /* private_info */,
size_t /* cb */,
void * /* user_data */ ) CL_EXT_SUFFIX__VERSION_1_0;
/* clLogMessagesToStderr sends all log messages to the file descriptor stderr */
extern void CL_API_ENTRY clLogMessagesToStderrAPPLE( const char * /* errstr */,
const void * /* private_info */,
size_t /* cb */,
void * /* user_data */ ) CL_EXT_SUFFIX__VERSION_1_0;
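/* Illustrative sketch (not part of the original header): these loggers share
 * clCreateContext's pfn_notify signature, so they can be passed directly.
 * `props` and `devices` are assumed to already exist:
 *
 *   cl_int err;
 *   cl_context ctx = clCreateContext(props, 1, devices,
 *                                    clLogMessagesToStderrAPPLE,
 *                                    NULL, &err);
 */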
/************************
* cl_khr_icd extension *
************************/
#define cl_khr_icd 1
/* cl_platform_info */
#define CL_PLATFORM_ICD_SUFFIX_KHR 0x0920
/* Additional Error Codes */
#define CL_PLATFORM_NOT_FOUND_KHR -1001
extern CL_API_ENTRY cl_int CL_API_CALL
clIcdGetPlatformIDsKHR(cl_uint /* num_entries */,
cl_platform_id * /* platforms */,
cl_uint * /* num_platforms */);
typedef CL_API_ENTRY cl_int (CL_API_CALL *clIcdGetPlatformIDsKHR_fn)(
cl_uint /* num_entries */,
cl_platform_id * /* platforms */,
cl_uint * /* num_platforms */);
/******************************************
* cl_nv_device_attribute_query extension *
******************************************/
/* cl_nv_device_attribute_query extension - no extension #define since it has no functions */
#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000
#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001
#define CL_DEVICE_REGISTERS_PER_BLOCK_NV 0x4002
#define CL_DEVICE_WARP_SIZE_NV 0x4003
#define CL_DEVICE_GPU_OVERLAP_NV 0x4004
#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV 0x4005
#define CL_DEVICE_INTEGRATED_MEMORY_NV 0x4006
/*********************************
* cl_amd_device_attribute_query *
*********************************/
#define CL_DEVICE_PROFILING_TIMER_OFFSET_AMD 0x4036
#ifdef CL_VERSION_1_1
/***********************************
* cl_ext_device_fission extension *
***********************************/
#define cl_ext_device_fission 1
extern CL_API_ENTRY cl_int CL_API_CALL
clReleaseDeviceEXT( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
typedef CL_API_ENTRY cl_int
(CL_API_CALL *clReleaseDeviceEXT_fn)( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
extern CL_API_ENTRY cl_int CL_API_CALL
clRetainDeviceEXT( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
typedef CL_API_ENTRY cl_int
(CL_API_CALL *clRetainDeviceEXT_fn)( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
typedef cl_ulong cl_device_partition_property_ext;
extern CL_API_ENTRY cl_int CL_API_CALL
clCreateSubDevicesEXT( cl_device_id /*in_device*/,
const cl_device_partition_property_ext * /* properties */,
cl_uint /*num_entries*/,
cl_device_id * /*out_devices*/,
cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1;
typedef CL_API_ENTRY cl_int
( CL_API_CALL * clCreateSubDevicesEXT_fn)( cl_device_id /*in_device*/,
const cl_device_partition_property_ext * /* properties */,
cl_uint /*num_entries*/,
cl_device_id * /*out_devices*/,
cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1;
/* cl_device_partition_property_ext */
#define CL_DEVICE_PARTITION_EQUALLY_EXT 0x4050
#define CL_DEVICE_PARTITION_BY_COUNTS_EXT 0x4051
#define CL_DEVICE_PARTITION_BY_NAMES_EXT 0x4052
#define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN_EXT 0x4053
/* clDeviceGetInfo selectors */
#define CL_DEVICE_PARENT_DEVICE_EXT 0x4054
#define CL_DEVICE_PARTITION_TYPES_EXT 0x4055
#define CL_DEVICE_AFFINITY_DOMAINS_EXT 0x4056
#define CL_DEVICE_REFERENCE_COUNT_EXT 0x4057
#define CL_DEVICE_PARTITION_STYLE_EXT 0x4058
/* error codes */
#define CL_DEVICE_PARTITION_FAILED_EXT -1057
#define CL_INVALID_PARTITION_COUNT_EXT -1058
#define CL_INVALID_PARTITION_NAME_EXT -1059
/* CL_AFFINITY_DOMAINs */
#define CL_AFFINITY_DOMAIN_L1_CACHE_EXT 0x1
#define CL_AFFINITY_DOMAIN_L2_CACHE_EXT 0x2
#define CL_AFFINITY_DOMAIN_L3_CACHE_EXT 0x3
#define CL_AFFINITY_DOMAIN_L4_CACHE_EXT 0x4
#define CL_AFFINITY_DOMAIN_NUMA_EXT 0x10
#define CL_AFFINITY_DOMAIN_NEXT_FISSIONABLE_EXT 0x100
/* cl_device_partition_property_ext list terminators */
#define CL_PROPERTIES_LIST_END_EXT ((cl_device_partition_property_ext) 0)
#define CL_PARTITION_BY_COUNTS_LIST_END_EXT ((cl_device_partition_property_ext) 0)
#define CL_PARTITION_BY_NAMES_LIST_END_EXT ((cl_device_partition_property_ext) 0 - 1)
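/* Illustrative sketch (not part of the original header): splitting a parent
 * device into two equal sub-devices with the terminated property-list
 * format above.  `dev` is an assumed parent cl_device_id:
 *
 *   cl_device_partition_property_ext props[] =
 *       { CL_DEVICE_PARTITION_EQUALLY_EXT, 2, CL_PROPERTIES_LIST_END_EXT };
 *   cl_device_id sub[2];
 *   cl_uint n;
 *   clCreateSubDevicesEXT(dev, props, 2, sub, &n);
 */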
#endif /* CL_VERSION_1_1 */
#ifdef __cplusplus
}
#endif
#endif /* __CL_EXT_H */

View File

@@ -1,155 +0,0 @@
/**********************************************************************************
* Copyright (c) 2008-2010 The Khronos Group Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and/or associated documentation files (the
* "Materials"), to deal in the Materials without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Materials, and to
* permit persons to whom the Materials are furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Materials.
*
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
**********************************************************************************/
/* $Revision: 11708 $ on $Date: 2010-06-14 12:06:24 +0530 (Mon, 14 Jun 2010) $ */
/*
* cl_gl.h contains Khronos-approved (KHR) OpenCL extensions which have
* OpenGL dependencies. The application is responsible for #including
* OpenGL or OpenGL ES headers before #including cl_gl.h.
*/
#ifndef __OPENCL_CL_GL_H
#define __OPENCL_CL_GL_H
#ifdef __APPLE__
#include <OpenCL/cl.h>
#include <OpenGL/CGLDevice.h>
#else
#include <CL/cl.h>
#endif
#ifdef __cplusplus
extern "C" {
#endif
typedef cl_uint cl_gl_object_type;
typedef cl_uint cl_gl_texture_info;
typedef cl_uint cl_gl_platform_info;
typedef struct __GLsync *cl_GLsync;
/* cl_gl_object_type */
#define CL_GL_OBJECT_BUFFER 0x2000
#define CL_GL_OBJECT_TEXTURE2D 0x2001
#define CL_GL_OBJECT_TEXTURE3D 0x2002
#define CL_GL_OBJECT_RENDERBUFFER 0x2003
/* cl_gl_texture_info */
#define CL_GL_TEXTURE_TARGET 0x2004
#define CL_GL_MIPMAP_LEVEL 0x2005
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromGLBuffer(cl_context /* context */,
cl_mem_flags /* flags */,
cl_GLuint /* bufobj */,
int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromGLTexture2D(cl_context /* context */,
cl_mem_flags /* flags */,
cl_GLenum /* target */,
cl_GLint /* miplevel */,
cl_GLuint /* texture */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromGLTexture3D(cl_context /* context */,
cl_mem_flags /* flags */,
cl_GLenum /* target */,
cl_GLint /* miplevel */,
cl_GLuint /* texture */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromGLRenderbuffer(cl_context /* context */,
cl_mem_flags /* flags */,
cl_GLuint /* renderbuffer */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetGLObjectInfo(cl_mem /* memobj */,
cl_gl_object_type * /* gl_object_type */,
cl_GLuint * /* gl_object_name */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetGLTextureInfo(cl_mem /* memobj */,
cl_gl_texture_info /* param_name */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueAcquireGLObjects(cl_command_queue /* command_queue */,
cl_uint /* num_objects */,
const cl_mem * /* mem_objects */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueReleaseGLObjects(cl_command_queue /* command_queue */,
cl_uint /* num_objects */,
const cl_mem * /* mem_objects */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
/* cl_khr_gl_sharing extension */
#define cl_khr_gl_sharing 1
typedef cl_uint cl_gl_context_info;
/* Additional Error Codes */
#define CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR -1000
/* cl_gl_context_info */
#define CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR 0x2006
#define CL_DEVICES_FOR_GL_CONTEXT_KHR 0x2007
/* Additional cl_context_properties */
#define CL_GL_CONTEXT_KHR 0x2008
#define CL_EGL_DISPLAY_KHR 0x2009
#define CL_GLX_DISPLAY_KHR 0x200A
#define CL_WGL_HDC_KHR 0x200B
#define CL_CGL_SHAREGROUP_KHR 0x200C
extern CL_API_ENTRY cl_int CL_API_CALL
clGetGLContextInfoKHR(const cl_context_properties * /* properties */,
cl_gl_context_info /* param_name */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetGLContextInfoKHR_fn)(
const cl_context_properties * properties,
cl_gl_context_info param_name,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret);
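/* Illustrative sketch (not part of the original header): as a KHR extension
 * this entry point is normally fetched by name at run time.  `props` is an
 * assumed cl_context_properties list naming the GL context and display:
 *
 *   clGetGLContextInfoKHR_fn pfn = (clGetGLContextInfoKHR_fn)
 *       clGetExtensionFunctionAddress("clGetGLContextInfoKHR");
 *   cl_device_id dev;
 *   if (pfn != NULL)
 *     pfn(props, CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR,
 *         sizeof(dev), &dev, NULL);
 */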
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_GL_H */

View File

@@ -1,69 +0,0 @@
/**********************************************************************************
* Copyright (c) 2008-2010 The Khronos Group Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and/or associated documentation files (the
* "Materials"), to deal in the Materials without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Materials, and to
* permit persons to whom the Materials are furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Materials.
*
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
**********************************************************************************/
/* $Revision: 11708 $ on $Date: 2010-06-14 12:06:24 +0530 (Mon, 14 Jun 2010) $ */
/* cl_gl_ext.h contains vendor (non-KHR) OpenCL extensions which have */
/* OpenGL dependencies. */
#ifndef __OPENCL_CL_GL_EXT_H
#define __OPENCL_CL_GL_EXT_H
#ifdef __cplusplus
extern "C" {
#endif
#ifdef __APPLE__
#include <OpenCL/cl_gl.h>
#else
#include <CL/cl_gl.h>
#endif
/*
 * For each extension, follow this template:
 *
 *   // cl_VEN_extname extension
 *   #define cl_VEN_extname 1
 *   ... define new types, if any
 *   ... define new tokens, if any
 *   ... define new APIs, if any
 *
 * If you need GLtypes here, mirror them with a cl_GLtype, rather than
 * including a GL header.  This allows us to avoid having to decide whether
 * to include GL headers or GLES here.
 */
/*
* cl_khr_gl_event extension
* See section 9.9 in the OpenCL 1.1 spec for more information
*/
#define CL_COMMAND_GL_FENCE_SYNC_OBJECT_KHR 0x200D
extern CL_API_ENTRY cl_event CL_API_CALL
clCreateEventFromGLsyncKHR(cl_context /* context */,
cl_GLsync /* cl_GLsync */,
cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1;
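/* Illustrative sketch (not part of the original header): wrapping a GL fence
 * in a CL event so an enqueue can wait on pending GL work.  `context`,
 * `queue`, `mem`, and the GL fence `sync` are assumed to already exist:
 *
 *   cl_int err;
 *   cl_event ev = clCreateEventFromGLsyncKHR(context, (cl_GLsync) sync, &err);
 *   clEnqueueAcquireGLObjects(queue, 1, &mem, 1, &ev, NULL);
 */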
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_GL_EXT_H */

File diff suppressed because it is too large

View File

@@ -1,22 +0,0 @@
/*
** Copyright 1998-2002, NVIDIA Corporation.
** All Rights Reserved.
**
** THE INFORMATION CONTAINED HEREIN IS PROPRIETARY AND CONFIDENTIAL TO
** NVIDIA, CORPORATION. USE, REPRODUCTION OR DISCLOSURE TO ANY THIRD PARTY
** IS SUBJECT TO WRITTEN PRE-APPROVAL BY NVIDIA, CORPORATION.
**
**
*/
#ifndef __CLEXT_H
#define __CLEXT_H
#define CL_NV_DEVICE_COMPUTE_CAPABILITY_MAJOR 0x4000
#define CL_NV_DEVICE_COMPUTE_CAPABILITY_MINOR 0x4001
#define CL_NV_DEVICE_REGISTERS_PER_BLOCK 0x4002
#define CL_NV_DEVICE_WARP_SIZE 0x4003
#define CL_NV_DEVICE_GPU_OVERLAP 0x4004
#define CL_NV_DEVICE_KERNEL_EXEC_TIMEOUT 0x4005
#define CL_NV_DEVICE_INTEGRATED_MEMORY 0x4006
#endif

View File

@@ -1,54 +0,0 @@
/*******************************************************************************
* Copyright (c) 2008-2010 The Khronos Group Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and/or associated documentation files (the
* "Materials"), to deal in the Materials without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Materials, and to
* permit persons to whom the Materials are furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Materials.
*
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
******************************************************************************/
/* $Revision: 11708 $ on $Date: 2010-06-14 12:06:24 +0530 (Mon, 14 Jun 2010) $ */
#ifndef __OPENCL_H
#define __OPENCL_H
#ifdef __cplusplus
extern "C" {
#endif
#ifdef __APPLE__
#include <OpenCL/cl.h>
#include <OpenCL/cl_gl.h>
#include <OpenCL/cl_gl_ext.h>
#include <OpenCL/cl_ext.h>
#else
#include <CL/cl.h>
#include <CL/cl_gl.h>
#include <CL/cl_gl_ext.h>
#include <CL/cl_ext.h>
#endif
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_H */

View File

@@ -1,876 +0,0 @@
/*******************************************************************************
* Copyright (c) 2008-2009 The Khronos Group Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and/or associated documentation files (the
* "Materials"), to deal in the Materials without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Materials, and to
* permit persons to whom the Materials are furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Materials.
*
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
******************************************************************************/
/* $Revision: 10424 $ on $Date: 2010-02-17 14:34:49 -0800 (Wed, 17 Feb 2010) $ */
#ifndef __OPENCL_CL_H
#define __OPENCL_CL_H
#ifdef __APPLE__
#include <OpenCL/cl_platform.h>
#else
#include <CL/cl_platform.h>
#endif
#ifdef __cplusplus
extern "C" {
#endif
/******************************************************************************/
typedef struct _cl_platform_id * cl_platform_id;
typedef struct _cl_device_id * cl_device_id;
typedef struct _cl_context * cl_context;
typedef struct _cl_command_queue * cl_command_queue;
typedef struct _cl_mem * cl_mem;
typedef struct _cl_program * cl_program;
typedef struct _cl_kernel * cl_kernel;
typedef struct _cl_event * cl_event;
typedef struct _cl_sampler * cl_sampler;
typedef cl_uint cl_bool; /* WARNING! Unlike cl_ types in cl_platform.h, cl_bool is not guaranteed to be the same size as the bool in kernels. */
typedef cl_ulong cl_bitfield;
typedef cl_bitfield cl_device_type;
typedef cl_uint cl_platform_info;
typedef cl_uint cl_device_info;
typedef cl_bitfield cl_device_address_info;
typedef cl_bitfield cl_device_fp_config;
typedef cl_uint cl_device_mem_cache_type;
typedef cl_uint cl_device_local_mem_type;
typedef cl_bitfield cl_device_exec_capabilities;
typedef cl_bitfield cl_command_queue_properties;
typedef intptr_t cl_context_properties;
typedef cl_uint cl_context_info;
typedef cl_uint cl_command_queue_info;
typedef cl_uint cl_channel_order;
typedef cl_uint cl_channel_type;
typedef cl_bitfield cl_mem_flags;
typedef cl_uint cl_mem_object_type;
typedef cl_uint cl_mem_info;
typedef cl_uint cl_image_info;
typedef cl_uint cl_addressing_mode;
typedef cl_uint cl_filter_mode;
typedef cl_uint cl_sampler_info;
typedef cl_bitfield cl_map_flags;
typedef cl_uint cl_program_info;
typedef cl_uint cl_program_build_info;
typedef cl_int cl_build_status;
typedef cl_uint cl_kernel_info;
typedef cl_uint cl_kernel_work_group_info;
typedef cl_uint cl_event_info;
typedef cl_uint cl_command_type;
typedef cl_uint cl_profiling_info;
typedef struct _cl_image_format {
cl_channel_order image_channel_order;
cl_channel_type image_channel_data_type;
} cl_image_format;
/******************************************************************************/
/* Error Codes */
#define CL_SUCCESS 0
#define CL_DEVICE_NOT_FOUND -1
#define CL_DEVICE_NOT_AVAILABLE -2
#define CL_COMPILER_NOT_AVAILABLE -3
#define CL_MEM_OBJECT_ALLOCATION_FAILURE -4
#define CL_OUT_OF_RESOURCES -5
#define CL_OUT_OF_HOST_MEMORY -6
#define CL_PROFILING_INFO_NOT_AVAILABLE -7
#define CL_MEM_COPY_OVERLAP -8
#define CL_IMAGE_FORMAT_MISMATCH -9
#define CL_IMAGE_FORMAT_NOT_SUPPORTED -10
#define CL_BUILD_PROGRAM_FAILURE -11
#define CL_MAP_FAILURE -12
#define CL_INVALID_VALUE -30
#define CL_INVALID_DEVICE_TYPE -31
#define CL_INVALID_PLATFORM -32
#define CL_INVALID_DEVICE -33
#define CL_INVALID_CONTEXT -34
#define CL_INVALID_QUEUE_PROPERTIES -35
#define CL_INVALID_COMMAND_QUEUE -36
#define CL_INVALID_HOST_PTR -37
#define CL_INVALID_MEM_OBJECT -38
#define CL_INVALID_IMAGE_FORMAT_DESCRIPTOR -39
#define CL_INVALID_IMAGE_SIZE -40
#define CL_INVALID_SAMPLER -41
#define CL_INVALID_BINARY -42
#define CL_INVALID_BUILD_OPTIONS -43
#define CL_INVALID_PROGRAM -44
#define CL_INVALID_PROGRAM_EXECUTABLE -45
#define CL_INVALID_KERNEL_NAME -46
#define CL_INVALID_KERNEL_DEFINITION -47
#define CL_INVALID_KERNEL -48
#define CL_INVALID_ARG_INDEX -49
#define CL_INVALID_ARG_VALUE -50
#define CL_INVALID_ARG_SIZE -51
#define CL_INVALID_KERNEL_ARGS -52
#define CL_INVALID_WORK_DIMENSION -53
#define CL_INVALID_WORK_GROUP_SIZE -54
#define CL_INVALID_WORK_ITEM_SIZE -55
#define CL_INVALID_GLOBAL_OFFSET -56
#define CL_INVALID_EVENT_WAIT_LIST -57
#define CL_INVALID_EVENT -58
#define CL_INVALID_OPERATION -59
#define CL_INVALID_GL_OBJECT -60
#define CL_INVALID_BUFFER_SIZE -61
#define CL_INVALID_MIP_LEVEL -62
#define CL_INVALID_GLOBAL_WORK_SIZE -63
/* OpenCL Version */
#define CL_VERSION_1_0 1
/* cl_bool */
#define CL_FALSE 0
#define CL_TRUE 1
/* cl_platform_info */
#define CL_PLATFORM_PROFILE 0x0900
#define CL_PLATFORM_VERSION 0x0901
#define CL_PLATFORM_NAME 0x0902
#define CL_PLATFORM_VENDOR 0x0903
#define CL_PLATFORM_EXTENSIONS 0x0904
/* cl_device_type - bitfield */
#define CL_DEVICE_TYPE_DEFAULT (1 << 0)
#define CL_DEVICE_TYPE_CPU (1 << 1)
#define CL_DEVICE_TYPE_GPU (1 << 2)
#define CL_DEVICE_TYPE_ACCELERATOR (1 << 3)
#define CL_DEVICE_TYPE_ALL 0xFFFFFFFF
/* cl_device_info */
#define CL_DEVICE_TYPE 0x1000
#define CL_DEVICE_VENDOR_ID 0x1001
#define CL_DEVICE_MAX_COMPUTE_UNITS 0x1002
#define CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS 0x1003
#define CL_DEVICE_MAX_WORK_GROUP_SIZE 0x1004
#define CL_DEVICE_MAX_WORK_ITEM_SIZES 0x1005
#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR 0x1006
#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT 0x1007
#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT 0x1008
#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG 0x1009
#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT 0x100A
#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE 0x100B
#define CL_DEVICE_MAX_CLOCK_FREQUENCY 0x100C
#define CL_DEVICE_ADDRESS_BITS 0x100D
#define CL_DEVICE_MAX_READ_IMAGE_ARGS 0x100E
#define CL_DEVICE_MAX_WRITE_IMAGE_ARGS 0x100F
#define CL_DEVICE_MAX_MEM_ALLOC_SIZE 0x1010
#define CL_DEVICE_IMAGE2D_MAX_WIDTH 0x1011
#define CL_DEVICE_IMAGE2D_MAX_HEIGHT 0x1012
#define CL_DEVICE_IMAGE3D_MAX_WIDTH 0x1013
#define CL_DEVICE_IMAGE3D_MAX_HEIGHT 0x1014
#define CL_DEVICE_IMAGE3D_MAX_DEPTH 0x1015
#define CL_DEVICE_IMAGE_SUPPORT 0x1016
#define CL_DEVICE_MAX_PARAMETER_SIZE 0x1017
#define CL_DEVICE_MAX_SAMPLERS 0x1018
#define CL_DEVICE_MEM_BASE_ADDR_ALIGN 0x1019
#define CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE 0x101A
#define CL_DEVICE_SINGLE_FP_CONFIG 0x101B
#define CL_DEVICE_GLOBAL_MEM_CACHE_TYPE 0x101C
#define CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE 0x101D
#define CL_DEVICE_GLOBAL_MEM_CACHE_SIZE 0x101E
#define CL_DEVICE_GLOBAL_MEM_SIZE 0x101F
#define CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE 0x1020
#define CL_DEVICE_MAX_CONSTANT_ARGS 0x1021
#define CL_DEVICE_LOCAL_MEM_TYPE 0x1022
#define CL_DEVICE_LOCAL_MEM_SIZE 0x1023
#define CL_DEVICE_ERROR_CORRECTION_SUPPORT 0x1024
#define CL_DEVICE_PROFILING_TIMER_RESOLUTION 0x1025
#define CL_DEVICE_ENDIAN_LITTLE 0x1026
#define CL_DEVICE_AVAILABLE 0x1027
#define CL_DEVICE_COMPILER_AVAILABLE 0x1028
#define CL_DEVICE_EXECUTION_CAPABILITIES 0x1029
#define CL_DEVICE_QUEUE_PROPERTIES 0x102A
#define CL_DEVICE_NAME 0x102B
#define CL_DEVICE_VENDOR 0x102C
#define CL_DRIVER_VERSION 0x102D
#define CL_DEVICE_PROFILE 0x102E
#define CL_DEVICE_VERSION 0x102F
#define CL_DEVICE_EXTENSIONS 0x1030
#define CL_DEVICE_PLATFORM 0x1031
/* 0x1032 reserved for CL_DEVICE_DOUBLE_FP_CONFIG */
/* 0x1033 reserved for CL_DEVICE_HALF_FP_CONFIG */
/* cl_device_fp_config - bitfield */
#define CL_FP_DENORM (1 << 0)
#define CL_FP_INF_NAN (1 << 1)
#define CL_FP_ROUND_TO_NEAREST (1 << 2)
#define CL_FP_ROUND_TO_ZERO (1 << 3)
#define CL_FP_ROUND_TO_INF (1 << 4)
#define CL_FP_FMA (1 << 5)
/* cl_device_mem_cache_type */
#define CL_NONE 0x0
#define CL_READ_ONLY_CACHE 0x1
#define CL_READ_WRITE_CACHE 0x2
/* cl_device_local_mem_type */
#define CL_LOCAL 0x1
#define CL_GLOBAL 0x2
/* cl_device_exec_capabilities - bitfield */
#define CL_EXEC_KERNEL (1 << 0)
#define CL_EXEC_NATIVE_KERNEL (1 << 1)
/* cl_command_queue_properties - bitfield */
#define CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE (1 << 0)
#define CL_QUEUE_PROFILING_ENABLE (1 << 1)
/* cl_context_info */
#define CL_CONTEXT_REFERENCE_COUNT 0x1080
#define CL_CONTEXT_DEVICES 0x1081
#define CL_CONTEXT_PROPERTIES 0x1082
/* cl_context_info + cl_context_properties */
#define CL_CONTEXT_PLATFORM 0x1084
/* cl_command_queue_info */
#define CL_QUEUE_CONTEXT 0x1090
#define CL_QUEUE_DEVICE 0x1091
#define CL_QUEUE_REFERENCE_COUNT 0x1092
#define CL_QUEUE_PROPERTIES 0x1093
/* cl_mem_flags - bitfield */
#define CL_MEM_READ_WRITE (1 << 0)
#define CL_MEM_WRITE_ONLY (1 << 1)
#define CL_MEM_READ_ONLY (1 << 2)
#define CL_MEM_USE_HOST_PTR (1 << 3)
#define CL_MEM_ALLOC_HOST_PTR (1 << 4)
#define CL_MEM_COPY_HOST_PTR (1 << 5)
/* cl_channel_order */
#define CL_R 0x10B0
#define CL_A 0x10B1
#define CL_RG 0x10B2
#define CL_RA 0x10B3
#define CL_RGB 0x10B4
#define CL_RGBA 0x10B5
#define CL_BGRA 0x10B6
#define CL_ARGB 0x10B7
#define CL_INTENSITY 0x10B8
#define CL_LUMINANCE 0x10B9
/* cl_channel_type */
#define CL_SNORM_INT8 0x10D0
#define CL_SNORM_INT16 0x10D1
#define CL_UNORM_INT8 0x10D2
#define CL_UNORM_INT16 0x10D3
#define CL_UNORM_SHORT_565 0x10D4
#define CL_UNORM_SHORT_555 0x10D5
#define CL_UNORM_INT_101010 0x10D6
#define CL_SIGNED_INT8 0x10D7
#define CL_SIGNED_INT16 0x10D8
#define CL_SIGNED_INT32 0x10D9
#define CL_UNSIGNED_INT8 0x10DA
#define CL_UNSIGNED_INT16 0x10DB
#define CL_UNSIGNED_INT32 0x10DC
#define CL_HALF_FLOAT 0x10DD
#define CL_FLOAT 0x10DE
/* cl_mem_object_type */
#define CL_MEM_OBJECT_BUFFER 0x10F0
#define CL_MEM_OBJECT_IMAGE2D 0x10F1
#define CL_MEM_OBJECT_IMAGE3D 0x10F2
/* cl_mem_info */
#define CL_MEM_TYPE 0x1100
#define CL_MEM_FLAGS 0x1101
#define CL_MEM_SIZE 0x1102
#define CL_MEM_HOST_PTR 0x1103
#define CL_MEM_MAP_COUNT 0x1104
#define CL_MEM_REFERENCE_COUNT 0x1105
#define CL_MEM_CONTEXT 0x1106
/* cl_image_info */
#define CL_IMAGE_FORMAT 0x1110
#define CL_IMAGE_ELEMENT_SIZE 0x1111
#define CL_IMAGE_ROW_PITCH 0x1112
#define CL_IMAGE_SLICE_PITCH 0x1113
#define CL_IMAGE_WIDTH 0x1114
#define CL_IMAGE_HEIGHT 0x1115
#define CL_IMAGE_DEPTH 0x1116
/* cl_addressing_mode */
#define CL_ADDRESS_NONE 0x1130
#define CL_ADDRESS_CLAMP_TO_EDGE 0x1131
#define CL_ADDRESS_CLAMP 0x1132
#define CL_ADDRESS_REPEAT 0x1133
/* cl_filter_mode */
#define CL_FILTER_NEAREST 0x1140
#define CL_FILTER_LINEAR 0x1141
/* cl_sampler_info */
#define CL_SAMPLER_REFERENCE_COUNT 0x1150
#define CL_SAMPLER_CONTEXT 0x1151
#define CL_SAMPLER_NORMALIZED_COORDS 0x1152
#define CL_SAMPLER_ADDRESSING_MODE 0x1153
#define CL_SAMPLER_FILTER_MODE 0x1154
/* cl_map_flags - bitfield */
#define CL_MAP_READ (1 << 0)
#define CL_MAP_WRITE (1 << 1)
/* cl_program_info */
#define CL_PROGRAM_REFERENCE_COUNT 0x1160
#define CL_PROGRAM_CONTEXT 0x1161
#define CL_PROGRAM_NUM_DEVICES 0x1162
#define CL_PROGRAM_DEVICES 0x1163
#define CL_PROGRAM_SOURCE 0x1164
#define CL_PROGRAM_BINARY_SIZES 0x1165
#define CL_PROGRAM_BINARIES 0x1166
/* cl_program_build_info */
#define CL_PROGRAM_BUILD_STATUS 0x1181
#define CL_PROGRAM_BUILD_OPTIONS 0x1182
#define CL_PROGRAM_BUILD_LOG 0x1183
/* cl_build_status */
#define CL_BUILD_SUCCESS 0
#define CL_BUILD_NONE -1
#define CL_BUILD_ERROR -2
#define CL_BUILD_IN_PROGRESS -3
/* cl_kernel_info */
#define CL_KERNEL_FUNCTION_NAME 0x1190
#define CL_KERNEL_NUM_ARGS 0x1191
#define CL_KERNEL_REFERENCE_COUNT 0x1192
#define CL_KERNEL_CONTEXT 0x1193
#define CL_KERNEL_PROGRAM 0x1194
/* cl_kernel_work_group_info */
#define CL_KERNEL_WORK_GROUP_SIZE 0x11B0
#define CL_KERNEL_COMPILE_WORK_GROUP_SIZE 0x11B1
#define CL_KERNEL_LOCAL_MEM_SIZE 0x11B2
/* cl_event_info */
#define CL_EVENT_COMMAND_QUEUE 0x11D0
#define CL_EVENT_COMMAND_TYPE 0x11D1
#define CL_EVENT_REFERENCE_COUNT 0x11D2
#define CL_EVENT_COMMAND_EXECUTION_STATUS 0x11D3
/* cl_command_type */
#define CL_COMMAND_NDRANGE_KERNEL 0x11F0
#define CL_COMMAND_TASK 0x11F1
#define CL_COMMAND_NATIVE_KERNEL 0x11F2
#define CL_COMMAND_READ_BUFFER 0x11F3
#define CL_COMMAND_WRITE_BUFFER 0x11F4
#define CL_COMMAND_COPY_BUFFER 0x11F5
#define CL_COMMAND_READ_IMAGE 0x11F6
#define CL_COMMAND_WRITE_IMAGE 0x11F7
#define CL_COMMAND_COPY_IMAGE 0x11F8
#define CL_COMMAND_COPY_IMAGE_TO_BUFFER 0x11F9
#define CL_COMMAND_COPY_BUFFER_TO_IMAGE 0x11FA
#define CL_COMMAND_MAP_BUFFER 0x11FB
#define CL_COMMAND_MAP_IMAGE 0x11FC
#define CL_COMMAND_UNMAP_MEM_OBJECT 0x11FD
#define CL_COMMAND_MARKER 0x11FE
#define CL_COMMAND_ACQUIRE_GL_OBJECTS 0x11FF
#define CL_COMMAND_RELEASE_GL_OBJECTS 0x1200
/* command execution status */
#define CL_COMPLETE 0x0
#define CL_RUNNING 0x1
#define CL_SUBMITTED 0x2
#define CL_QUEUED 0x3
/* cl_profiling_info */
#define CL_PROFILING_COMMAND_QUEUED 0x1280
#define CL_PROFILING_COMMAND_SUBMIT 0x1281
#define CL_PROFILING_COMMAND_START 0x1282
#define CL_PROFILING_COMMAND_END 0x1283
/********************************************************************************************************/
/* Platform API */
extern CL_API_ENTRY cl_int CL_API_CALL
clGetPlatformIDs(cl_uint /* num_entries */,
cl_platform_id * /* platforms */,
cl_uint * /* num_platforms */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetPlatformInfo(cl_platform_id /* platform */,
cl_platform_info /* param_name */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
/* Device APIs */
extern CL_API_ENTRY cl_int CL_API_CALL
clGetDeviceIDs(cl_platform_id /* platform */,
cl_device_type /* device_type */,
cl_uint /* num_entries */,
cl_device_id * /* devices */,
cl_uint * /* num_devices */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetDeviceInfo(cl_device_id /* device */,
cl_device_info /* param_name */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
/* Context APIs */
extern CL_API_ENTRY cl_context CL_API_CALL
clCreateContext(const cl_context_properties * /* properties */,
cl_uint /* num_devices */,
const cl_device_id * /* devices */,
void (*pfn_notify)(const char *, const void *, size_t, void *) /* pfn_notify */,
void * /* user_data */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_context CL_API_CALL
clCreateContextFromType(const cl_context_properties * /* properties */,
cl_device_type /* device_type */,
void (*pfn_notify)(const char *, const void *, size_t, void *) /* pfn_notify */,
void * /* user_data */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clRetainContext(cl_context /* context */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clReleaseContext(cl_context /* context */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetContextInfo(cl_context /* context */,
cl_context_info /* param_name */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
/* Command Queue APIs */
extern CL_API_ENTRY cl_command_queue CL_API_CALL
clCreateCommandQueue(cl_context /* context */,
cl_device_id /* device */,
cl_command_queue_properties /* properties */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clRetainCommandQueue(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clReleaseCommandQueue(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetCommandQueueInfo(cl_command_queue /* command_queue */,
cl_command_queue_info /* param_name */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clSetCommandQueueProperty(cl_command_queue /* command_queue */,
cl_command_queue_properties /* properties */,
cl_bool /* enable */,
cl_command_queue_properties * /* old_properties */) CL_API_SUFFIX__VERSION_1_0;
/* Memory Object APIs */
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateBuffer(cl_context /* context */,
cl_mem_flags /* flags */,
size_t /* size */,
void * /* host_ptr */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateImage2D(cl_context /* context */,
cl_mem_flags /* flags */,
const cl_image_format * /* image_format */,
size_t /* image_width */,
size_t /* image_height */,
size_t /* image_row_pitch */,
void * /* host_ptr */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateImage3D(cl_context /* context */,
cl_mem_flags /* flags */,
const cl_image_format * /* image_format */,
size_t /* image_width */,
size_t /* image_height */,
size_t /* image_depth */,
size_t /* image_row_pitch */,
size_t /* image_slice_pitch */,
void * /* host_ptr */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clRetainMemObject(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clReleaseMemObject(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetSupportedImageFormats(cl_context /* context */,
cl_mem_flags /* flags */,
cl_mem_object_type /* image_type */,
cl_uint /* num_entries */,
cl_image_format * /* image_formats */,
cl_uint * /* num_image_formats */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetMemObjectInfo(cl_mem /* memobj */,
cl_mem_info /* param_name */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetImageInfo(cl_mem /* image */,
cl_image_info /* param_name */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
/* Sampler APIs */
extern CL_API_ENTRY cl_sampler CL_API_CALL
clCreateSampler(cl_context /* context */,
cl_bool /* normalized_coords */,
cl_addressing_mode /* addressing_mode */,
cl_filter_mode /* filter_mode */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clRetainSampler(cl_sampler /* sampler */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clReleaseSampler(cl_sampler /* sampler */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetSamplerInfo(cl_sampler /* sampler */,
cl_sampler_info /* param_name */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
/* Program Object APIs */
extern CL_API_ENTRY cl_program CL_API_CALL
clCreateProgramWithSource(cl_context /* context */,
cl_uint /* count */,
const char ** /* strings */,
const size_t * /* lengths */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_program CL_API_CALL
clCreateProgramWithBinary(cl_context /* context */,
cl_uint /* num_devices */,
const cl_device_id * /* device_list */,
const size_t * /* lengths */,
const unsigned char ** /* binaries */,
cl_int * /* binary_status */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clRetainProgram(cl_program /* program */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clReleaseProgram(cl_program /* program */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clBuildProgram(cl_program /* program */,
cl_uint /* num_devices */,
const cl_device_id * /* device_list */,
const char * /* options */,
void (*pfn_notify)(cl_program /* program */, void * /* user_data */),
void * /* user_data */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clUnloadCompiler(void) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetProgramInfo(cl_program /* program */,
cl_program_info /* param_name */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetProgramBuildInfo(cl_program /* program */,
cl_device_id /* device */,
cl_program_build_info /* param_name */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
/* Kernel Object APIs */
extern CL_API_ENTRY cl_kernel CL_API_CALL
clCreateKernel(cl_program /* program */,
const char * /* kernel_name */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clCreateKernelsInProgram(cl_program /* program */,
cl_uint /* num_kernels */,
cl_kernel * /* kernels */,
cl_uint * /* num_kernels_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clRetainKernel(cl_kernel /* kernel */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clReleaseKernel(cl_kernel /* kernel */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clSetKernelArg(cl_kernel /* kernel */,
cl_uint /* arg_index */,
size_t /* arg_size */,
const void * /* arg_value */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetKernelInfo(cl_kernel /* kernel */,
cl_kernel_info /* param_name */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetKernelWorkGroupInfo(cl_kernel /* kernel */,
cl_device_id /* device */,
cl_kernel_work_group_info /* param_name */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
/* Event Object APIs */
extern CL_API_ENTRY cl_int CL_API_CALL
clWaitForEvents(cl_uint /* num_events */,
const cl_event * /* event_list */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetEventInfo(cl_event /* event */,
cl_event_info /* param_name */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clRetainEvent(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clReleaseEvent(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0;
/* Profiling APIs */
extern CL_API_ENTRY cl_int CL_API_CALL
clGetEventProfilingInfo(cl_event /* event */,
cl_profiling_info /* param_name */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
/* Flush and Finish APIs */
extern CL_API_ENTRY cl_int CL_API_CALL
clFlush(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clFinish(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
/* Enqueued Commands APIs */
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueReadBuffer(cl_command_queue /* command_queue */,
cl_mem /* buffer */,
cl_bool /* blocking_read */,
size_t /* offset */,
size_t /* cb */,
void * /* ptr */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueWriteBuffer(cl_command_queue /* command_queue */,
cl_mem /* buffer */,
cl_bool /* blocking_write */,
size_t /* offset */,
size_t /* cb */,
const void * /* ptr */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueCopyBuffer(cl_command_queue /* command_queue */,
cl_mem /* src_buffer */,
cl_mem /* dst_buffer */,
size_t /* src_offset */,
size_t /* dst_offset */,
size_t /* cb */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueReadImage(cl_command_queue /* command_queue */,
cl_mem /* image */,
cl_bool /* blocking_read */,
const size_t * /* origin[3] */,
const size_t * /* region[3] */,
size_t /* row_pitch */,
size_t /* slice_pitch */,
void * /* ptr */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueWriteImage(cl_command_queue /* command_queue */,
cl_mem /* image */,
cl_bool /* blocking_write */,
const size_t * /* origin[3] */,
const size_t * /* region[3] */,
size_t /* input_row_pitch */,
size_t /* input_slice_pitch */,
const void * /* ptr */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueCopyImage(cl_command_queue /* command_queue */,
cl_mem /* src_image */,
cl_mem /* dst_image */,
const size_t * /* src_origin[3] */,
const size_t * /* dst_origin[3] */,
const size_t * /* region[3] */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueCopyImageToBuffer(cl_command_queue /* command_queue */,
cl_mem /* src_image */,
cl_mem /* dst_buffer */,
const size_t * /* src_origin[3] */,
const size_t * /* region[3] */,
size_t /* dst_offset */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueCopyBufferToImage(cl_command_queue /* command_queue */,
cl_mem /* src_buffer */,
cl_mem /* dst_image */,
size_t /* src_offset */,
const size_t * /* dst_origin[3] */,
const size_t * /* region[3] */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY void * CL_API_CALL
clEnqueueMapBuffer(cl_command_queue /* command_queue */,
cl_mem /* buffer */,
cl_bool /* blocking_map */,
cl_map_flags /* map_flags */,
size_t /* offset */,
size_t /* cb */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY void * CL_API_CALL
clEnqueueMapImage(cl_command_queue /* command_queue */,
cl_mem /* image */,
cl_bool /* blocking_map */,
cl_map_flags /* map_flags */,
const size_t * /* origin[3] */,
const size_t * /* region[3] */,
size_t * /* image_row_pitch */,
size_t * /* image_slice_pitch */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueUnmapMemObject(cl_command_queue /* command_queue */,
cl_mem /* memobj */,
void * /* mapped_ptr */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueNDRangeKernel(cl_command_queue /* command_queue */,
cl_kernel /* kernel */,
cl_uint /* work_dim */,
const size_t * /* global_work_offset */,
const size_t * /* global_work_size */,
const size_t * /* local_work_size */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueTask(cl_command_queue /* command_queue */,
cl_kernel /* kernel */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueNativeKernel(cl_command_queue /* command_queue */,
void (*user_func)(void *),
void * /* args */,
size_t /* cb_args */,
cl_uint /* num_mem_objects */,
const cl_mem * /* mem_list */,
const void ** /* args_mem_loc */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueMarker(cl_command_queue /* command_queue */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueWaitForEvents(cl_command_queue /* command_queue */,
cl_uint /* num_events */,
const cl_event * /* event_list */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueBarrier(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
/* Extension function access
*
* Returns the extension function address for the given function name,
 * or NULL if a valid function cannot be found. The client must check
 * that the address is not NULL before using or calling the returned
 * function address (see the sketch after this header).
*/
extern CL_API_ENTRY void * CL_API_CALL clGetExtensionFunctionAddress(const char * /* func_name */) CL_API_SUFFIX__VERSION_1_0;
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_H */
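The Platform API declared above, together with the extension-function lookup just described, is enough to bootstrap OpenCL. A minimal, self-contained sketch (not part of the header; error handling trimmed, file name hypothetical) that lists the first platform and NULL-checks an extension entry point:

/* build with e.g.: cc probe_cl.c -lOpenCL */
#include <stdio.h>
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/opencl.h>
#endif

int main(void) {
  cl_platform_id platform;
  cl_uint num_platforms;
  char name[256];
  if (clGetPlatformIDs(1, &platform, &num_platforms) != CL_SUCCESS)
    return 1;
  clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(name), name, NULL);
  printf("platform: %s (%u total)\n", name, num_platforms);
  /* Extension entry points must be looked up and NULL-checked before use. */
  void *fp = clGetExtensionFunctionAddress("clIcdGetPlatformIDsKHR");
  if (fp == NULL)
    printf("clIcdGetPlatformIDsKHR not exported by this implementation\n");
  return 0;
}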

View File

@ -1,60 +0,0 @@
/*******************************************************************************
* Copyright (c) 2008-2009 The Khronos Group Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and/or associated documentation files (the
* "Materials"), to deal in the Materials without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Materials, and to
* permit persons to whom the Materials are furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Materials.
*
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
******************************************************************************/
/* $Revision: 10424 $ on $Date: 2010-02-17 14:34:49 -0800 (Wed, 17 Feb 2010) $ */
#ifndef __CL_EXT_H
#define __CL_EXT_H
#ifdef __cplusplus
extern "C" {
#endif
/* cl_khr_fp64 extension - no extension #define since it has no functions */
#define CL_DEVICE_DOUBLE_FP_CONFIG 0x1032
/* cl_khr_fp16 extension - no extension #define since it has no functions */
#define CL_DEVICE_HALF_FP_CONFIG 0x1033
/* cl_khr_icd extension */
#define cl_khr_icd 1
/* cl_platform_info */
#define CL_PLATFORM_ICD_SUFFIX_KHR 0x0920
/* Additional Error Codes */
#define CL_PLATFORM_NOT_FOUND_KHR -1001
extern CL_API_ENTRY cl_int CL_API_CALL
clIcdGetPlatformIDsKHR(cl_uint /* num_entries */,
cl_platform_id * /* platforms */,
cl_uint * /* num_platforms */);
#ifdef __cplusplus
}
#endif
#endif /* __CL_EXT_H */

View File

@ -1,146 +0,0 @@
/**********************************************************************************
* Copyright (c) 2008-2009 The Khronos Group Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and/or associated documentation files (the
* "Materials"), to deal in the Materials without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Materials, and to
* permit persons to whom the Materials are furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Materials.
*
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
**********************************************************************************/
/* $Revision: 10424 $ on $Date: 2010-02-17 14:34:49 -0800 (Wed, 17 Feb 2010) $ */
/*
* cl_gl.h contains Khronos-approved (KHR) OpenCL extensions which have
* OpenGL dependencies. The application is responsible for #including
 * OpenGL or OpenGL ES headers before #including cl_gl.h (see the
 * include-order sketch after this header).
*/
#ifndef __OPENCL_CL_GL_H
#define __OPENCL_CL_GL_H
#ifdef __APPLE__
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
#ifdef __cplusplus
extern "C" {
#endif
typedef cl_uint cl_gl_object_type;
typedef cl_uint cl_gl_texture_info;
typedef cl_uint cl_gl_platform_info;
/* cl_gl_object_type */
#define CL_GL_OBJECT_BUFFER 0x2000
#define CL_GL_OBJECT_TEXTURE2D 0x2001
#define CL_GL_OBJECT_TEXTURE3D 0x2002
#define CL_GL_OBJECT_RENDERBUFFER 0x2003
/* cl_gl_texture_info */
#define CL_GL_TEXTURE_TARGET 0x2004
#define CL_GL_MIPMAP_LEVEL 0x2005
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromGLBuffer(cl_context /* context */,
cl_mem_flags /* flags */,
cl_GLuint /* bufobj */,
int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromGLTexture2D(cl_context /* context */,
cl_mem_flags /* flags */,
cl_GLenum /* target */,
cl_GLint /* miplevel */,
cl_GLuint /* texture */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromGLTexture3D(cl_context /* context */,
cl_mem_flags /* flags */,
cl_GLenum /* target */,
cl_GLint /* miplevel */,
cl_GLuint /* texture */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromGLRenderbuffer(cl_context /* context */,
cl_mem_flags /* flags */,
cl_GLuint /* renderbuffer */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetGLObjectInfo(cl_mem /* memobj */,
cl_gl_object_type * /* gl_object_type */,
cl_GLuint * /* gl_object_name */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetGLTextureInfo(cl_mem /* memobj */,
cl_gl_texture_info /* param_name */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueAcquireGLObjects(cl_command_queue /* command_queue */,
cl_uint /* num_objects */,
const cl_mem * /* mem_objects */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueReleaseGLObjects(cl_command_queue /* command_queue */,
cl_uint /* num_objects */,
const cl_mem * /* mem_objects */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
/* cl_khr_gl_sharing extension */
#define cl_khr_gl_sharing 1
typedef cl_uint cl_gl_context_info;
/* Additional Error Codes */
#define CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR -1000
/* cl_gl_context_info */
#define CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR 0x2006
#define CL_DEVICES_FOR_GL_CONTEXT_KHR 0x2007
/* Additional cl_context_properties */
#define CL_GL_CONTEXT_KHR 0x2008
#define CL_EGL_DISPLAY_KHR 0x2009
#define CL_GLX_DISPLAY_KHR 0x200A
#define CL_WGL_HDC_KHR 0x200B
#define CL_CGL_SHAREGROUP_KHR 0x200C
extern CL_API_ENTRY cl_int CL_API_CALL
clGetGLContextInfoKHR(const cl_context_properties * /* properties */,
cl_gl_context_info /* param_name */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_GL_H */
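Per the note at the top of this header, the application must bring in the OpenGL (or OpenGL ES) headers itself before cl_gl.h; a minimal sketch of the expected include order:

/* GL headers first, then the CL/GL interop header. */
#ifdef __APPLE__
#include <OpenGL/gl.h>
#include <OpenCL/cl_gl.h>
#else
#include <GL/gl.h>
#include <CL/cl_gl.h>
#endif
/* clCreateFromGLBuffer(), clEnqueueAcquireGLObjects(), etc. are now
   declared and can be used against an existing GL context. */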

File diff suppressed because it is too large

View File

@ -1,22 +0,0 @@
/*
** Copyright 1998-2002, NVIDIA Corporation.
** All Rights Reserved.
**
** THE INFORMATION CONTAINED HEREIN IS PROPRIETARY AND CONFIDENTIAL TO
** NVIDIA, CORPORATION. USE, REPRODUCTION OR DISCLOSURE TO ANY THIRD PARTY
** IS SUBJECT TO WRITTEN PRE-APPROVAL BY NVIDIA, CORPORATION.
**
**
*/
#ifndef __CLEXT_H
#define __CLEXT_H
#define CL_NV_DEVICE_COMPUTE_CAPABILITY_MAJOR 0x4000
#define CL_NV_DEVICE_COMPUTE_CAPABILITY_MINOR 0x4001
#define CL_NV_DEVICE_REGISTERS_PER_BLOCK 0x4002
#define CL_NV_DEVICE_WARP_SIZE 0x4003
#define CL_NV_DEVICE_GPU_OVERLAP 0x4004
#define CL_NV_DEVICE_KERNEL_EXEC_TIMEOUT 0x4005
#define CL_NV_DEVICE_INTEGRATED_MEMORY 0x4006
#endif
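These tokens are meant to be passed as the param_name argument of clGetDeviceInfo() on devices exposing NVIDIA's cl_nv_device_attribute_query extension, which reports each value as a cl_uint. A minimal sketch, assuming an NVIDIA GPU is the first device found:

#include <stdio.h>
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/opencl.h>
#endif
#include "clext.h"   /* the header above */

int main(void) {
  cl_platform_id platform;
  cl_device_id device;
  cl_uint major, minor, warp;
  if (clGetPlatformIDs(1, &platform, NULL) != CL_SUCCESS ||
      clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL)
        != CL_SUCCESS)
    return 1;
  clGetDeviceInfo(device, CL_NV_DEVICE_COMPUTE_CAPABILITY_MAJOR,
                  sizeof(major), &major, NULL);
  clGetDeviceInfo(device, CL_NV_DEVICE_COMPUTE_CAPABILITY_MINOR,
                  sizeof(minor), &minor, NULL);
  clGetDeviceInfo(device, CL_NV_DEVICE_WARP_SIZE, sizeof(warp), &warp, NULL);
  printf("compute capability %u.%u, warp size %u\n", major, minor, warp);
  return 0;
}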

View File

@ -1,121 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#include <iostream>
#include <cassert>
#include <cmath>
#include "lj96_cut_gpu_memory.h"
using namespace std;
static LJ96_GPU_Memory<PRECISION,ACC_PRECISION> LJ96MF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen) {
LJ96MF.clear();
gpu_mode=LJ96MF.device->gpu_mode();
double gpu_split=LJ96MF.device->particle_split();
int first_gpu=LJ96MF.device->first_device();
int last_gpu=LJ96MF.device->last_device();
int world_me=LJ96MF.device->world_me();
int gpu_rank=LJ96MF.device->gpu_rank();
int procs_per_gpu=LJ96MF.device->procs_per_gpu();
LJ96MF.device->init_message(screen,"lj96/cut",first_gpu,last_gpu);
bool message=false;
if (LJ96MF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=LJ96MF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen);
LJ96MF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=LJ96MF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
offset, special_lj, inum, nall, 300, maxspecial,
cell_size, gpu_split, screen);
LJ96MF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
LJ96MF.estimate_gpu_overhead();
return init_ok;
}
void lj96_gpu_clear() {
LJ96MF.clear();
}
int** lj96_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success) {
return LJ96MF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success);
}
void lj96_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success) {
LJ96MF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,
eflag,vflag,eatom,vatom,host_start,cpu_time,success);
}
double lj96_gpu_bytes() {
return LJ96MF.host_memory_usage();
}

View File

@ -1,387 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef LJ96_GPU_KERNEL
#define LJ96_GPU_KERNEL
#ifdef NV_KERNEL
#include "nv_kernel_def.h"
texture<float4> pos_tex;
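// Positions are fetched through the texture cache in single precision;
// the double-precision build falls back to a direct global load because
// tex1Dfetch() has no double4 variant.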
#ifdef _DOUBLE_DOUBLE
__inline double4 fetch_pos(const int& i, const double4 *pos)
{
return pos[i];
}
#else
__inline float4 fetch_pos(const int& i, const float4 *pos)
{
return tex1Dfetch(pos_tex, i);
}
#endif
#else
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#define GLOBAL_ID_X get_global_id(0)
#define THREAD_ID_X get_local_id(0)
#define BLOCK_ID_X get_group_id(0)
#define BLOCK_SIZE_X get_local_size(0)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#define __inline inline
#define fetch_pos(i,y) x_[i]
#define BLOCK_PAIR 64
#define MAX_SHARED_TYPES 8
#endif
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
#define numtyp4 double4
#define acctyp double
#define acctyp4 double4
#endif
#ifdef _SINGLE_DOUBLE
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp double
#define acctyp4 double4
#endif
#ifndef numtyp
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp float
#define acctyp4 float4
#endif
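// Neighbor indices pack the special-bond type into their top two bits:
// sbmask() extracts those bits (an index into sp_lj) and NEIGHMASK
// clears them to recover the actual neighbor index.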
#define SBBITS 30
#define NEIGHMASK 0x3FFFFFFF
__inline int sbmask(int j) { return j >> SBBITS & 3; }
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp sp_lj[4];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
int n_stride;
__global int *list_end;
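// If the list is unpacked (dev_nbor==dev_packed), neighbors are stored
// strided by nbor_pitch in dev_nbor itself; otherwise the next entry read
// is an offset into the densely packed dev_packed array.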
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (r2inv<lj1[mtype].z) {
r2inv=(numtyp)1.0/r2inv;
numtyp r6inv = r2inv*r2inv*r2inv;
numtyp r3inv = sqrt(r6inv);
numtyp force = r2inv*r6inv*(lj1[mtype].x*r3inv-lj1[mtype].y);
force*=factor_lj;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=r6inv*(lj3[mtype].x*r3inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
} // if ii
// Reduce answers
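// Tree reduction in shared memory across the t_per_atom threads assigned
// to atom ii; no barrier is used, so this relies on t_per_atom not
// exceeding the warp/wavefront size.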
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<4; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[4];
if (tid<4)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[tid]=lj1_in[tid];
if (eflag>0)
lj3[tid]=lj3_in[tid];
}
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
if (r2inv<lj1[mtype].z) {
r2inv=(numtyp)1.0/r2inv;
numtyp r6inv = r2inv*r2inv*r2inv;
numtyp r3inv = sqrt(r6inv);
numtyp force = r2inv*r6inv*(lj1[mtype].x*r3inv-lj1[mtype].y);
force*=factor_lj;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=r6inv*(lj3[mtype].x*r3inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<4; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
#endif

View File

@ -1,155 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifdef USE_OPENCL
#include "lj96_cut_gpu_cl.h"
#else
#include "lj96_cut_gpu_ptx.h"
#endif
#include "lj96_cut_gpu_memory.h"
#include <cassert>
#define LJ96_GPU_MemoryT LJ96_GPU_Memory<numtyp, acctyp>
extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
template <class numtyp, class acctyp>
LJ96_GPU_MemoryT::LJ96_GPU_Memory() : AtomicGPUMemory<numtyp,acctyp>(), _allocated(false) {
}
template <class numtyp, class acctyp>
LJ96_GPU_MemoryT::~LJ96_GPU_Memory() {
clear();
}
template <class numtyp, class acctyp>
int LJ96_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int LJ96_GPU_MemoryT::init(const int ntypes,
double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,lj96_cut_gpu_kernel);
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
host_cutsq);
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
host_offset);
UCL_H_Vec<double> dview;
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
dview.view(host_special_lj,4,*(this->ucl_device));
ucl_copy(sp_lj,dview,false);
_allocated=true;
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void LJ96_GPU_MemoryT::clear() {
if (!_allocated)
return;
_allocated=false;
lj1.clear();
lj3.clear();
sp_lj.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double LJ96_GPU_MemoryT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(LJ96_GPU_Memory<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void LJ96_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
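// One block of BX threads handles BX/t_per_atom atoms, so
// ceil(inum/(BX/t_per_atom)) blocks cover all atoms in the answer list.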
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom);
}
this->time_pair.stop();
}
template class LJ96_GPU_Memory<PRECISION,ACC_PRECISION>;

View File

@ -1,78 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef LJ96_GPU_MEMORY_H
#define LJ96_GPU_MEMORY_H
#include "atomic_gpu_memory.h"
template <class numtyp, class acctyp>
class LJ96_GPU_Memory : public AtomicGPUMemory<numtyp, acctyp> {
public:
LJ96_GPU_Memory();
~LJ96_GPU_Memory();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 if double precision is not supported on the card
* (a sketch translating these codes into messages follows this header) **/
int init(const int ntypes, double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq
UCL_D_Vec<numtyp4> lj1;
/// lj3.x = lj3, lj3.y = lj4, lj3.z = offset
UCL_D_Vec<numtyp4> lj3;
/// Special LJ values
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
#endif
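The init() return codes documented above lend themselves to a simple host-side check. check_gpu_init() below is a hypothetical helper, not part of the library; it only translates the documented codes into messages:

#include <cstdio>

// Hypothetical helper mapping the documented init() return codes to text.
inline void check_gpu_init(int flag) {
  switch (flag) {
    case  0: return;                                              // success
    case -1: std::fprintf(stderr, "fix gpu not found\n"); break;
    case -3: std::fprintf(stderr, "out of memory on device\n"); break;
    case -4: std::fprintf(stderr, "library not compiled for GPU\n"); break;
    case -5: std::fprintf(stderr, "double precision unsupported\n"); break;
    default: std::fprintf(stderr, "unknown GPU init error %d\n", flag);
  }
}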

View File

@ -1,168 +0,0 @@
/***************************************************************************
lj_class2_long.cpp
-------------------
W. Michael Brown
Host code for COMPASS LJ long potential acceleration
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin : Mon May 16 2011
email : brownw@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#include "lj_class2_long_cl.h"
#else
#include "lj_class2_long_ptx.h"
#endif
#include "lj_class2_long.h"
#include <cassert>
using namespace LAMMPS_AL;
#define LJClass2LongT LJClass2Long<numtyp, acctyp>
extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
template <class numtyp, class acctyp>
LJClass2LongT::LJClass2Long() : ChargeGPUMemory<numtyp,acctyp>(),
_allocated(false) {
}
template <class numtyp, class acctyp>
LJClass2LongT::~LJClass2Long() {
clear();
}
template <class numtyp, class acctyp>
int LJClass2LongT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int LJClass2LongT::init(const int ntypes, double **host_cutsq,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen,
double **host_cut_ljsq, const double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double g_ewald) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,lj_class2_long);
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
host_cutsq, host_cut_ljsq);
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
host_offset);
sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<4; i++) {
host_write[i]=host_special_lj[i];
host_write[i+4]=host_special_coul[i];
}
ucl_copy(sp_lj,host_write,8,false);
_cut_coulsq=host_cut_coulsq;
_qqrd2e=qqrd2e;
_g_ewald=g_ewald;
_allocated=true;
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void LJClass2LongT::clear() {
if (!_allocated)
return;
_allocated=false;
lj1.clear();
lj3.clear();
sp_lj.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double LJClass2LongT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(LJClass2Long<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void LJClass2LongT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
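// Example: with BX=64 and _threads_per_atom=4, each block covers
// 64/4=16 atoms, so inum=1000 atoms launch GX=ceil(1000/16)=63 blocks.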
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->dev_q.begin(),
&_cut_coulsq, &_qqrd2e, &_g_ewald,
&this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->atom->dev_q.begin(), &_cut_coulsq,
&_qqrd2e, &_g_ewald, &this->_threads_per_atom);
}
this->time_pair.stop();
}
template class LJClass2Long<PRECISION,ACC_PRECISION>;


@ -1,470 +0,0 @@
// **************************************************************************
// lj_class2_long.cu
// -------------------
// W. Michael Brown
//
// Device code for COMPASS LJ long acceleration
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin : Mon May 16 2011
// email : brownw@ornl.gov
// ***************************************************************************/
#ifndef LJCL_GPU_KERNEL
#define LJCL_GPU_KERNEL
#ifdef NV_KERNEL
#include "nv_kernel_def.h"
texture<float4> pos_tex;
texture<float> q_tex;
#ifdef _DOUBLE_DOUBLE
__inline double4 fetch_pos(const int& i, const double4 *pos)
{
return pos[i];
}
__inline double fetch_q(const int& i, const double *q)
{
return q[i];
}
#else
__inline float4 fetch_pos(const int& i, const float4 *pos)
{
return tex1Dfetch(pos_tex, i);
}
__inline float fetch_q(const int& i, const float *q)
{
return tex1Dfetch(q_tex, i);
}
#endif
#else
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#define GLOBAL_ID_X get_global_id(0)
#define THREAD_ID_X get_local_id(0)
#define BLOCK_ID_X get_group_id(0)
#define BLOCK_SIZE_X get_local_size(0)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#define __inline inline
#define fetch_pos(i,y) x_[i]
#define fetch_q(i,y) q_[i]
#define BLOCK_PAIR 64
#define MAX_SHARED_TYPES 8
#endif
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
#define numtyp4 double4
#define acctyp double
#define acctyp4 double4
#endif
#ifdef _SINGLE_DOUBLE
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp double
#define acctyp4 double4
#endif
#ifndef numtyp
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp float
#define acctyp4 float4
#endif
#define EWALD_F (numtyp)1.12837917
#define EWALD_P (numtyp)0.3275911
#define A1 (numtyp)0.254829592
#define A2 (numtyp)-0.284496736
#define A3 (numtyp)1.421413741
#define A4 (numtyp)-1.453152027
#define A5 (numtyp)1.061405429
#define SBBITS 30
#define NEIGHMASK 0x3FFFFFFF
__inline int sbmask(int j) { return j >> SBBITS & 3; }
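/* The constants above implement the Abramowitz & Stegun 7.1.26 polynomial
   approximation of the complementary error function used in the real-space
   Ewald term:
     erfc(x) ~= t*(A1 + t*(A2 + t*(A3 + t*(A4 + t*A5)))) * exp(-x*x),
     with t = 1/(1 + EWALD_P*x), and EWALD_F = 2/sqrt(pi).
   SBBITS/NEIGHMASK decode packed neighbor entries: the top two bits of a
   neighbor index select the special-bond scaling factor (sbmask), and
   NEIGHMASK strips those bits to recover the atom index j. */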
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp sp_lj[8];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
sp_lj[4]=sp_lj_in[4];
sp_lj[5]=sp_lj_in[5];
sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7];
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
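    /* Two neighbor layouts are handled above: if the list is unpacked
       (dev_nbor==dev_packed), the t_per_atom threads assigned to atom i
       walk column-strided entries nbor_pitch apart; otherwise they walk a
       packed per-atom sublist with stride t_per_atom, each thread starting
       at its own offset. */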
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (rsq<lj1[mtype].z) {
numtyp r2inv=(numtyp)1.0/rsq;
numtyp forcecoul, force_lj, force, r6inv, r3inv, prefactor, _erfc;
if (rsq < lj1[mtype].w) {
numtyp rinv=sqrt(r2inv);
r3inv=r2inv*rinv;
r6inv = r3inv*r3inv;
force_lj = factor_lj*r6inv*(lj1[mtype].x*r3inv-lj1[mtype].y);
} else
force_lj = (numtyp)0.0;
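      /* The branch above is the class2 (COMPASS) 9-6 Lennard-Jones form,
         E = eps*(2*(sigma/r)^9 - 3*(sigma/r)^6); lj1.x and lj1.y are
         assumed to hold the precomputed 9- and 6-term force prefactors
         (18*eps*sigma^9 and 18*eps*sigma^6 in the CPU pair style). */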
if (rsq < cut_coulsq) {
numtyp r = sqrt(rsq);
numtyp grij = g_ewald * r;
numtyp expm2 = exp(-grij*grij);
numtyp t = (numtyp)1.0 / ((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
} else
forcecoul = (numtyp)0.0;
force = (force_lj + forcecoul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
if (rsq < cut_coulsq)
e_coul += prefactor*(_erfc-factor_coul);
if (rsq < lj1[mtype].w) {
numtyp e=r6inv*(lj3[mtype].x*r3inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
red_acc[4][tid]=e_coul;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<5; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
e_coul=red_acc[4][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
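  /* Note: this tree reduction uses no barrier between steps, which appears
     to rely on t_per_atom never exceeding the warp/wavefront size so the
     cooperating threads execute in lockstep. */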
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
*ap1=e_coul;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[8];
if (tid<8)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[tid]=lj1_in[tid];
if (eflag>0)
lj3[tid]=lj3_in[tid];
}
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
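  // The barrier above ensures the shared lj1/lj3/sp_lj tables loaded by
  // the first threads are visible to the whole work-group before use.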
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<lj1[mtype].z) {
numtyp r2inv=(numtyp)1.0/rsq;
numtyp forcecoul, force_lj, force, r6inv, r3inv, prefactor, _erfc;
if (rsq < lj1[mtype].w) {
numtyp rinv=sqrt(r2inv);
r3inv=r2inv*rinv;
r6inv = r3inv*r3inv;
force_lj = factor_lj*r6inv*(lj1[mtype].x*r3inv-lj1[mtype].y);
} else
force_lj = (numtyp)0.0;
if (rsq < cut_coulsq) {
numtyp r = sqrt(rsq);
numtyp grij = g_ewald * r;
numtyp expm2 = exp(-grij*grij);
numtyp t = (numtyp)1.0 / ((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
} else
forcecoul = (numtyp)0.0;
force = (force_lj + forcecoul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
if (rsq < cut_coulsq)
e_coul += prefactor*(_erfc-factor_coul);
if (rsq < lj1[mtype].w) {
numtyp e=r6inv*(lj3[mtype].x*r3inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
red_acc[4][tid]=e_coul;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<5; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
e_coul=red_acc[4][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
*ap1=e_coul;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
#endif


@ -1,84 +0,0 @@
/***************************************************************************
lj_class2_long.h
-------------------
W. Michael Brown
Host code for COMPASS LJ long potential acceleration
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin : Mon May 16 2011
email : brownw@ornl.gov
***************************************************************************/
#ifndef LJ_CLASS2_LONG_H
#define LJ_CLASS2_LONG_H
#include "charge_gpu_memory.h"
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class LJClass2Long : public ChargeGPUMemory<numtyp, acctyp> {
public:
LJClass2Long();
~LJClass2Long();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for this GPU
* - -5 if double precision is not supported on the card **/
int init(const int ntypes, double **host_cutsq,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, double **host_cut_ljsq,
const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double g_ewald);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq, lj1.w = cutsq_vdw
UCL_D_Vec<numtyp4> lj1;
/// lj3.x = lj3, lj3.y = lj4, lj3.z = offset
UCL_D_Vec<numtyp4> lj3;
/// Special LJ values [0-3] and Special Coul values [4-7]
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
numtyp _cut_coulsq, _qqrd2e, _g_ewald;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
}
#endif


@ -1,129 +0,0 @@
/***************************************************************************
lj_class2_long_ext.cpp
-------------------
W. Michael Brown
LAMMPS Wrappers for COMPASS LJ long Acceleration
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin : Mon May 16 2011
email : brownw@ornl.gov
***************************************************************************/
#include <iostream>
#include <cassert>
#include <math.h>
#include "lj_class2_long.h"
using namespace std;
using namespace LAMMPS_AL;
static LJClass2Long<PRECISION,ACC_PRECISION> C2CLMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int c2cl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen,
double **host_cut_ljsq, double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double g_ewald) {
C2CLMF.clear();
gpu_mode=C2CLMF.device->gpu_mode();
double gpu_split=C2CLMF.device->particle_split();
int first_gpu=C2CLMF.device->first_device();
int last_gpu=C2CLMF.device->last_device();
int world_me=C2CLMF.device->world_me();
int gpu_rank=C2CLMF.device->gpu_rank();
int procs_per_gpu=C2CLMF.device->procs_per_gpu();
C2CLMF.device->init_message(screen,"lj/class2/coul/long",first_gpu,last_gpu);
bool message=false;
if (C2CLMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=C2CLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
offset, special_lj, inum, nall, 300, maxspecial,
cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e, g_ewald);
C2CLMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=C2CLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
offset, special_lj, inum, nall, 300, maxspecial,
cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e, g_ewald);
C2CLMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
C2CLMF.estimate_gpu_overhead();
return init_ok;
}
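/* Initialization above is staged: process 0 initializes and compiles the
   kernels first (presumably so the build happens once and can be reused),
   then one rank per GPU attaches in turn behind the barriers. */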
void c2cl_gpu_clear() {
C2CLMF.clear();
}
int** c2cl_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success, double *host_q, double *boxlo,
double *prd) {
return C2CLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success,
host_q, boxlo, prd);
}
void c2cl_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q,
const int nlocal, double *boxlo, double *prd) {
C2CLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
host_q,nlocal,boxlo,prd);
}
double c2cl_gpu_bytes() {
return C2CLMF.host_memory_usage();
}
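A hedged sketch (hypothetical driver code, not part of the library) of the
call order these wrappers expect: initialize once, call a compute wrapper
every timestep, then release device data:

// Hypothetical driver for the c2cl_gpu_* wrappers above; all variables
// are assumed to be set up by the calling pair style.
int gpu_mode;
int ok = c2cl_gpu_init(ntypes, cutsq, lj1, lj2, lj3, lj4, offset, special_lj,
                       inum, nall, 300, maxspecial, cell_size, gpu_mode,
                       screen, cut_ljsq, cut_coulsq, special_coul, qqrd2e,
                       g_ewald);
if (ok == 0) {
  // per timestep: c2cl_gpu_compute_n() when the device builds the
  // neighbor list, or c2cl_gpu_compute() to reuse a host-built list
  c2cl_gpu_clear();   // free host and device data at the end of the run
}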


@ -1,121 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#include <iostream>
#include <cassert>
#include <math.h>
#include "lj_cut_gpu_memory.h"
using namespace std;
static LJL_GPU_Memory<PRECISION,ACC_PRECISION> LJLMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int ljl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen) {
LJLMF.clear();
gpu_mode=LJLMF.device->gpu_mode();
double gpu_split=LJLMF.device->particle_split();
int first_gpu=LJLMF.device->first_device();
int last_gpu=LJLMF.device->last_device();
int world_me=LJLMF.device->world_me();
int gpu_rank=LJLMF.device->gpu_rank();
int procs_per_gpu=LJLMF.device->procs_per_gpu();
LJLMF.device->init_message(screen,"lj/cut",first_gpu,last_gpu);
bool message=false;
if (LJLMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=LJLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen);
LJLMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=LJLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
offset, special_lj, inum, nall, 300, maxspecial,
cell_size, gpu_split, screen);
LJLMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
LJLMF.estimate_gpu_overhead();
return init_ok;
}
void ljl_gpu_clear() {
LJLMF.clear();
}
int ** ljl_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success) {
return LJLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success);
}
void ljl_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success) {
LJLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success);
}
double ljl_gpu_bytes() {
return LJLMF.host_memory_usage();
}


@ -1,385 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef LJ_GPU_KERNEL
#define LJ_GPU_KERNEL
#ifdef NV_KERNEL
#include "nv_kernel_def.h"
texture<float4> pos_tex;
#ifdef _DOUBLE_DOUBLE
__inline double4 fetch_pos(const int& i, const double4 *pos)
{
return pos[i];
}
#else
__inline float4 fetch_pos(const int& i, const float4 *pos)
{
return tex1Dfetch(pos_tex, i);
}
#endif
#else
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#define GLOBAL_ID_X get_global_id(0)
#define THREAD_ID_X get_local_id(0)
#define BLOCK_ID_X get_group_id(0)
#define BLOCK_SIZE_X get_local_size(0)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#define __inline inline
#define fetch_pos(i,y) x_[i]
#define BLOCK_PAIR 64
#define MAX_SHARED_TYPES 8
#endif
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
#define numtyp4 double4
#define acctyp double
#define acctyp4 double4
#endif
#ifdef _SINGLE_DOUBLE
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp double
#define acctyp4 double4
#endif
#ifndef numtyp
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp float
#define acctyp4 float4
#endif
#define SBBITS 30
#define NEIGHMASK 0x3FFFFFFF
__inline int sbmask(int j) { return j >> SBBITS & 3; }
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp sp_lj[4];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (r2inv<lj1[mtype].z) {
r2inv=(numtyp)1.0/r2inv;
numtyp r6inv = r2inv*r2inv*r2inv;
numtyp force = r2inv*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
force*=factor_lj;
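        /* Standard 12-6 LJ: in the usual LAMMPS convention lj1.x=48*eps*
           sigma^12 and lj1.y=24*eps*sigma^6, so force is F/r scaled by
           factor_lj. Note r2inv held r^2 until the inversion above. */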
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<4; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[4];
if (tid<4)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[tid]=lj1_in[tid];
if (eflag>0)
lj3[tid]=lj3_in[tid];
}
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
if (r2inv<lj1[mtype].z) {
r2inv=(numtyp)1.0/r2inv;
numtyp r6inv = r2inv*r2inv*r2inv;
numtyp force = factor_lj*r2inv*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<4; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
#endif


@ -1,155 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifdef USE_OPENCL
#include "lj_cut_gpu_cl.h"
#else
#include "lj_cut_gpu_ptx.h"
#endif
#include "lj_cut_gpu_memory.h"
#include <cassert>
#define LJL_GPU_MemoryT LJL_GPU_Memory<numtyp, acctyp>
extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
template <class numtyp, class acctyp>
LJL_GPU_MemoryT::LJL_GPU_Memory() : AtomicGPUMemory<numtyp,acctyp>(), _allocated(false) {
}
template <class numtyp, class acctyp>
LJL_GPU_MemoryT::~LJL_GPU_Memory() {
clear();
}
template <class numtyp, class acctyp>
int LJL_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int LJL_GPU_MemoryT::init(const int ntypes,
double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,lj_cut_gpu_kernel);
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
host_cutsq);
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
host_offset);
UCL_H_Vec<double> dview;
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
dview.view(host_special_lj,4,*(this->ucl_device));
ucl_copy(sp_lj,dview,false);
_allocated=true;
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void LJL_GPU_MemoryT::clear() {
if (!_allocated)
return;
_allocated=false;
lj1.clear();
lj3.clear();
sp_lj.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double LJL_GPU_MemoryT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(LJL_GPU_Memory<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void LJL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom);
}
this->time_pair.stop();
}
template class LJL_GPU_Memory<PRECISION,ACC_PRECISION>;


@ -1,78 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef LJL_GPU_MEMORY_H
#define LJL_GPU_MEMORY_H
#include "atomic_gpu_memory.h"
template <class numtyp, class acctyp>
class LJL_GPU_Memory : public AtomicGPUMemory<numtyp, acctyp> {
public:
LJL_GPU_Memory();
~LJL_GPU_Memory();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for this GPU
* - -5 if double precision is not supported on the card **/
int init(const int ntypes, double **host_cutsq,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq
UCL_D_Vec<numtyp4> lj1;
/// lj3.x = lj3, lj3.y = lj4, lj3.z = offset
UCL_D_Vec<numtyp4> lj3;
/// Special LJ values
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
#endif


@ -1,122 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: Inderaj Bains (NVIDIA), ibains@nvidia.com
------------------------------------------------------------------------- */
#include <iostream>
#include <cassert>
#include <math.h>
#include "lj_expand_gpu_memory.h"
using namespace std;
static LJE_GPU_Memory<PRECISION,ACC_PRECISION> LJEMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int lje_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double **shift, double *special_lj,
const int inum, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, int &gpu_mode,
FILE *screen) {
LJEMF.clear();
gpu_mode=LJEMF.device->gpu_mode();
double gpu_split=LJEMF.device->particle_split();
int first_gpu=LJEMF.device->first_device();
int last_gpu=LJEMF.device->last_device();
int world_me=LJEMF.device->world_me();
int gpu_rank=LJEMF.device->gpu_rank();
int procs_per_gpu=LJEMF.device->procs_per_gpu();
LJEMF.device->init_message(screen,"lj/expand",first_gpu,last_gpu);
bool message=false;
if (LJEMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=LJEMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, shift, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen);
LJEMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=LJEMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
offset, shift, special_lj, inum, nall, 300, maxspecial,
cell_size, gpu_split,screen);
LJEMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
LJEMF.estimate_gpu_overhead();
return init_ok;
}
void lje_gpu_clear() {
LJEMF.clear();
}
int** lje_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success) {
return LJEMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success);
}
void lje_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success) {
LJEMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success);
}
double lje_gpu_bytes() {
return LJEMF.host_memory_usage();
}


@ -1,392 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: Inderaj Bains (NVIDIA), ibains@nvidia.com
------------------------------------------------------------------------- */
#ifndef LJE_GPU_KERNEL
#define LJE_GPU_KERNEL
#ifdef NV_KERNEL
#include "nv_kernel_def.h"
texture<float4> pos_tex;
#ifdef _DOUBLE_DOUBLE
__inline double4 fetch_pos(const int& i, const double4 *pos)
{
return pos[i];
}
#else
__inline float4 fetch_pos(const int& i, const float4 *pos)
{
return tex1Dfetch(pos_tex, i);
}
#endif
#else
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#define GLOBAL_ID_X get_global_id(0)
#define THREAD_ID_X get_local_id(0)
#define BLOCK_ID_X get_group_id(0)
#define BLOCK_SIZE_X get_local_size(0)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#define __inline inline
#define fetch_pos(i,y) x_[i]
#define BLOCK_PAIR 64
#define MAX_SHARED_TYPES 8
#endif
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
#define numtyp4 double4
#define acctyp double
#define acctyp4 double4
#endif
#ifdef _SINGLE_DOUBLE
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp double
#define acctyp4 double4
#endif
#ifndef numtyp
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp float
#define acctyp4 float4
#endif
#define SBBITS 30
#define NEIGHMASK 0x3FFFFFFF
__inline int sbmask(int j) { return j >> SBBITS & 3; }
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp sp_lj[4];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (r2inv<lj1[mtype].z) {
numtyp r = sqrt(r2inv);
numtyp rshift = r - lj1[mtype].w;
numtyp rshiftsq = rshift*rshift;
r2inv = (numtyp) 1.0/rshiftsq;
numtyp r6inv = r2inv*r2inv*r2inv;
numtyp force = r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
force*=factor_lj/rshift/r;
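        /* lj/expand evaluates the 12-6 interaction at the shifted
           separation rshift = r - delta (lj1.w holds the shift delta);
           the extra 1/rshift and 1/r factors convert dE/drshift into
           Cartesian force components along delx, dely, delz. */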
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<4; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[4];
if (tid<4)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[tid]=lj1_in[tid];
if (eflag>0)
lj3[tid]=lj3_in[tid];
}
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
if (r2inv<lj1[mtype].z) {
numtyp r = sqrt(r2inv);
numtyp rshift = r - lj1[mtype].w;
numtyp rshiftsq = rshift*rshift;
r2inv = (numtyp)1.0/rshiftsq;
numtyp r6inv = r2inv*r2inv*r2inv;
numtyp force = r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
force*=factor_lj/rshift/r;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<4; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
#endif


@ -1,155 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: Inderaj Bains (NVIDIA), ibains@nvidia.com
------------------------------------------------------------------------- */
#ifdef USE_OPENCL
#include "lj_expand_gpu_cl.h"
#else
#include "lj_expand_gpu_ptx.h"
#endif
#include "lj_expand_gpu_memory.h"
#include <cassert>
#define LJE_GPU_MemoryT LJE_GPU_Memory<numtyp, acctyp>
extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
template <class numtyp, class acctyp>
LJE_GPU_MemoryT::LJE_GPU_Memory() : AtomicGPUMemory<numtyp,acctyp>(), _allocated(false) {
}
template <class numtyp, class acctyp>
LJE_GPU_MemoryT::~LJE_GPU_Memory() {
clear();
}
template <class numtyp, class acctyp>
int LJE_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int LJE_GPU_MemoryT::init(const int ntypes, double **host_cutsq,
double **host_lj1, double **host_lj2,
double **host_lj3, double **host_lj4,
double **host_offset, double **host_shift,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,lj_expand_gpu_kernel);
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
host_cutsq, host_shift);
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
host_offset);
UCL_H_Vec<double> dview;
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
dview.view(host_special_lj,4,*(this->ucl_device));
ucl_copy(sp_lj,dview,false);
_allocated=true;
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void LJE_GPU_MemoryT::clear() {
if (!_allocated)
return;
_allocated=false;
lj1.clear();
lj3.clear();
sp_lj.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double LJE_GPU_MemoryT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(LJE_GPU_Memory<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void LJE_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom);
}
this->time_pair.stop();
}
template class LJE_GPU_Memory<PRECISION,ACC_PRECISION>;


@ -1,78 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: Inderaj Bains (NVIDIA), ibains@nvidia.com
------------------------------------------------------------------------- */
#ifndef LJE_GPU_MEMORY_H
#define LJE_GPU_MEMORY_H
#include "atomic_gpu_memory.h"
template <class numtyp, class acctyp>
class LJE_GPU_Memory : public AtomicGPUMemory<numtyp, acctyp> {
public:
LJE_GPU_Memory();
~LJE_GPU_Memory();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 if double precision is not supported on the card **/
int init(const int ntypes, double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **host_offset, double **host_shift, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq, lj1.w = shift
UCL_D_Vec<numtyp4> lj1;
/// lj3.x = lj3, lj3.y = lj4, lj3.z = offset
UCL_D_Vec<numtyp4> lj3;
/// Special LJ values
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
#endif


@ -1,129 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#include <iostream>
#include <cassert>
#include <math.h>
#include "ljc_cut_gpu_memory.h"
using namespace std;
static LJC_GPU_Memory<PRECISION,ACC_PRECISION> LJCMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen,
double **host_cut_ljsq, double **host_cut_coulsq,
double *host_special_coul, const double qqrd2e) {
LJCMF.clear();
gpu_mode=LJCMF.device->gpu_mode();
double gpu_split=LJCMF.device->particle_split();
int first_gpu=LJCMF.device->first_device();
int last_gpu=LJCMF.device->last_device();
int world_me=LJCMF.device->world_me();
int gpu_rank=LJCMF.device->gpu_rank();
int procs_per_gpu=LJCMF.device->procs_per_gpu();
LJCMF.device->init_message(screen,"lj/cut/coul/cut",first_gpu,last_gpu);
bool message=false;
if (LJCMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=LJCMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e);
LJCMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=LJCMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
offset, special_lj, inum, nall, 300, maxspecial,
cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e);
LJCMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
LJCMF.estimate_gpu_overhead();
return init_ok;
}
void ljc_gpu_clear() {
LJCMF.clear();
}
int** ljc_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success, double *host_q, double *boxlo,
double *prd) {
return LJCMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success,
host_q, boxlo, prd);
}
void ljc_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q,
const int nlocal, double *boxlo, double *prd) {
LJCMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,eflag,
vflag,eatom,vatom,host_start,cpu_time,success,host_q,
nlocal,boxlo,prd);
}
double ljc_gpu_bytes() {
return LJCMF.host_memory_usage();
}
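/* Usage sketch (annotation, not part of the original file): a host pair
   style would drive this C interface roughly as below. Every argument name
   is a placeholder for data LAMMPS already maintains on the host; only the
   call order is the point.

   int gpu_mode; bool success; int host_start;
   int err = ljc_gpu_init(ntypes, cutsq, lj1, lj2, lj3, lj4, offset,
                          special_lj, inum, nall, max_nbors, maxspecial,
                          cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq,
                          special_coul, qqrd2e);     // once per run, 0 on success
   // Each timestep, either let the library build the neighbor list:
   int **firstneigh = ljc_gpu_compute_n(ago, inum, nall, x, type, sublo,
                                        subhi, tag, nspecial, special, eflag,
                                        vflag, eatom, vatom, host_start,
                                        &ilist, &jnum, cpu_time, success, q,
                                        boxlo, prd);
   // ...or pass a host-built list through ljc_gpu_compute(); then once at
   // the end of the run:
   ljc_gpu_clear();
*/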


@ -1,448 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef LJC_GPU_KERNEL
#define LJC_GPU_KERNEL
#ifdef NV_KERNEL
#include "nv_kernel_def.h"
texture<float4> pos_tex;
texture<float> q_tex;
#ifdef _DOUBLE_DOUBLE
__inline double4 fetch_pos(const int& i, const double4 *pos)
{
return pos[i];
}
__inline double fetch_q(const int& i, const double *q)
{
return q[i];
}
#else
__inline float4 fetch_pos(const int& i, const float4 *pos)
{
return tex1Dfetch(pos_tex, i);
}
__inline float fetch_q(const int& i, const float *q)
{
return tex1Dfetch(q_tex, i);
}
#endif
#else
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#define GLOBAL_ID_X get_global_id(0)
#define THREAD_ID_X get_local_id(0)
#define BLOCK_ID_X get_group_id(0)
#define BLOCK_SIZE_X get_local_size(0)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#define __inline inline
#define fetch_pos(i,y) x_[i]
#define fetch_q(i,y) q_[i]
#define BLOCK_PAIR 64
#define MAX_SHARED_TYPES 8
#endif
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
#define numtyp4 double4
#define acctyp double
#define acctyp4 double4
#endif
#ifdef _SINGLE_DOUBLE
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp double
#define acctyp4 double4
#endif
#ifndef numtyp
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp float
#define acctyp4 float4
#endif
#define SBBITS 30
#define NEIGHMASK 0x3FFFFFFF
__inline int sbmask(int j) { return j >> SBBITS & 3; }
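// Annotation: neighbor indices arrive with the LAMMPS special-bond class
// packed into their top two bits (SBBITS=30): 0 for a plain pair, 1-3 for
// 1-2/1-3/1-4 neighbors. sbmask() extracts that class to select the sp_lj
// scale factor; NEIGHMASK strips the bits to recover the atom index.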
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_ ,
__global numtyp *cutsq, const numtyp qqrd2e,
const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp sp_lj[8];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
sp_lj[4]=sp_lj_in[4];
sp_lj[5]=sp_lj_in[5];
sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7];
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
int n_stride;
__global int *list_end;
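// Annotation: two neighbor layouts are handled below. When
// dev_nbor==dev_packed the indices are stored column-wise with row pitch
// nbor_pitch, so each of the t_per_atom threads sharing atom i starts
// `offset` rows in and strides by t_per_atom rows; otherwise dev_nbor holds
// an offset into the dense dev_packed array and threads stride by
// t_per_atom elements.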
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (rsq<cutsq[mtype]) {
numtyp r2inv=(numtyp)1.0/rsq;
numtyp forcecoul, force_lj, force, r6inv;
if (rsq < lj1[mtype].z) {
r6inv = r2inv*r2inv*r2inv;
force_lj = factor_lj*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
} else
force_lj = (numtyp)0.0;
if (rsq < lj1[mtype].w)
forcecoul = qqrd2e*qtmp*fetch_q(j,q_)*sqrt(r2inv)*factor_coul;
else
forcecoul = (numtyp)0.0;
force = (force_lj + forcecoul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
e_coul += forcecoul;
if (rsq < lj1[mtype].z) {
numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
} // if ii
// Reduce answers
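// Annotation: the t_per_atom threads sharing one atom fold their partial
// force/energy/virial sums together with a shared-memory tree reduction,
// halving the stride s each pass until thread offset 0 holds the total.
// No barrier is issued between passes; the code relies on the cooperating
// threads residing in the same warp (warp-synchronous execution).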
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
red_acc[4][tid]=e_coul;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<5; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
e_coul=red_acc[4][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
*ap1=e_coul;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
__global numtyp *_cutsq, const numtyp qqrd2e,
const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[8];
if (tid<8)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[tid]=lj1_in[tid];
cutsq[tid]=_cutsq[tid];
if (eflag>0)
lj3[tid]=lj3_in[tid];
}
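// Annotation: the first MAX_SHARED_TYPES*MAX_SHARED_TYPES threads stage the
// padded coefficient tables into __local memory; the __syncthreads() below
// publishes them to the whole work-group before the force loop reads them.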
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<cutsq[mtype]) {
numtyp r2inv=(numtyp)1.0/rsq;
numtyp forcecoul, force_lj, force, r6inv;
if (rsq < lj1[mtype].z) {
r6inv = r2inv*r2inv*r2inv;
force_lj = factor_lj*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
} else
force_lj = (numtyp)0.0;
if (rsq < lj1[mtype].w)
forcecoul = qqrd2e*qtmp*fetch_q(j,q_)*sqrt(r2inv)*factor_coul;
else
forcecoul = (numtyp)0.0;
force = (force_lj + forcecoul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
e_coul += forcecoul;
if (rsq < lj1[mtype].z) {
numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
red_acc[4][tid]=e_coul;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<5; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
e_coul=red_acc[4][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
*ap1=e_coul;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
#endif


@ -1,170 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifdef USE_OPENCL
#include "ljc_cut_gpu_cl.h"
#else
#include "ljc_cut_gpu_ptx.h"
#endif
#include "ljc_cut_gpu_memory.h"
#include <cassert>
#define LJC_GPU_MemoryT LJC_GPU_Memory<numtyp, acctyp>
extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
template <class numtyp, class acctyp>
LJC_GPU_MemoryT::LJC_GPU_Memory() : ChargeGPUMemory<numtyp,acctyp>(),
_allocated(false) {
}
template <class numtyp, class acctyp>
LJC_GPU_MemoryT::~LJC_GPU_Memory() {
clear();
}
template <class numtyp, class acctyp>
int LJC_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int LJC_GPU_MemoryT::init(const int ntypes,
double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen,
double **host_cut_ljsq, double **host_cut_coulsq,
double *host_special_coul, const double qqrd2e) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,ljc_cut_gpu_kernel);
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
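// Annotation: padding lj_types up to max_shared_types lets kernel_pair_fast
// cache the entire coefficient table in its fixed-size __local arrays.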
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
host_cut_ljsq, host_cut_coulsq);
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
host_offset);
cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack1(ntypes,lj_types,cutsq,host_write,host_cutsq);
sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<4; i++) {
host_write[i]=host_special_lj[i];
host_write[i+4]=host_special_coul[i];
}
ucl_copy(sp_lj,host_write,8,false);
_qqrd2e=qqrd2e;
_allocated=true;
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+cutsq.row_bytes()+
sp_lj.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void LJC_GPU_MemoryT::clear() {
if (!_allocated)
return;
_allocated=false;
lj1.clear();
lj3.clear();
cutsq.clear();
sp_lj.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double LJC_GPU_MemoryT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(LJC_GPU_Memory<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and virials
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void LJC_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch,
&this->atom->dev_q.begin(), &cutsq.begin(),
&_qqrd2e, &this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->atom->dev_q.begin(),
&cutsq.begin(), &_qqrd2e, &this->_threads_per_atom);
}
this->time_pair.stop();
}
template class LJC_GPU_Memory<PRECISION,ACC_PRECISION>;


@ -1,84 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef LJC_GPU_MEMORY_H
#define LJC_GPU_MEMORY_H
#include "charge_gpu_memory.h"
template <class numtyp, class acctyp>
class LJC_GPU_Memory : public ChargeGPUMemory<numtyp, acctyp> {
public:
LJC_GPU_Memory();
~LJC_GPU_Memory();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 if double precision is not supported on the card **/
int init(const int ntypes, double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, double **host_cut_ljsq,
double **host_cut_coulsq, double *host_special_coul,
const double qqrd2e);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq_vdw, lj1.w = cutsq_coul
UCL_D_Vec<numtyp4> lj1;
/// lj3.x = lj3, lj3.y = lj4, lj3.z = offset
UCL_D_Vec<numtyp4> lj3;
/// cutsq
UCL_D_Vec<numtyp> cutsq;
/// Special LJ values [0-3] and Special Coul values [4-7]
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
numtyp _qqrd2e;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
#endif


@ -1,130 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#include <iostream>
#include <cassert>
#include <math.h>
#include "ljcl_cut_gpu_memory.h"
using namespace std;
static LJCL_GPU_Memory<PRECISION,ACC_PRECISION> LJCLMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen,
double **host_cut_ljsq, double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double g_ewald) {
LJCLMF.clear();
gpu_mode=LJCLMF.device->gpu_mode();
double gpu_split=LJCLMF.device->particle_split();
int first_gpu=LJCLMF.device->first_device();
int last_gpu=LJCLMF.device->last_device();
int world_me=LJCLMF.device->world_me();
int gpu_rank=LJCLMF.device->gpu_rank();
int procs_per_gpu=LJCLMF.device->procs_per_gpu();
LJCLMF.device->init_message(screen,"lj/cut/coul/long",first_gpu,last_gpu);
bool message=false;
if (LJCLMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=LJCLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
offset, special_lj, inum, nall, 300, maxspecial,
cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e, g_ewald);
LJCLMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=LJCLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
offset, special_lj, inum, nall, 300, maxspecial,
cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e, g_ewald);
LJCLMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
LJCLMF.estimate_gpu_overhead();
return init_ok;
}
void ljcl_gpu_clear() {
LJCLMF.clear();
}
int** ljcl_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success, double *host_q, double *boxlo,
double *prd) {
return LJCLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success,
host_q, boxlo, prd);
}
void ljcl_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q,
const int nlocal, double *boxlo, double *prd) {
LJCLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
host_q,nlocal,boxlo,prd);
}
double ljcl_gpu_bytes() {
return LJCLMF.host_memory_usage();
}


@ -1,468 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef LJCL_GPU_KERNEL
#define LJCL_GPU_KERNEL
#ifdef NV_KERNEL
#include "nv_kernel_def.h"
texture<float4> pos_tex;
texture<float> q_tex;
#ifdef _DOUBLE_DOUBLE
__inline double4 fetch_pos(const int& i, const double4 *pos)
{
return pos[i];
}
__inline double fetch_q(const int& i, const double *q)
{
return q[i];
}
#else
__inline float4 fetch_pos(const int& i, const float4 *pos)
{
return tex1Dfetch(pos_tex, i);
}
__inline float fetch_q(const int& i, const float *q)
{
return tex1Dfetch(q_tex, i);
}
#endif
#else
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#define GLOBAL_ID_X get_global_id(0)
#define THREAD_ID_X get_local_id(0)
#define BLOCK_ID_X get_group_id(0)
#define BLOCK_SIZE_X get_local_size(0)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#define __inline inline
#define fetch_pos(i,y) x_[i]
#define fetch_q(i,y) q_[i]
#define BLOCK_PAIR 64
#define MAX_SHARED_TYPES 8
#endif
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
#define numtyp4 double4
#define acctyp double
#define acctyp4 double4
#endif
#ifdef _SINGLE_DOUBLE
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp double
#define acctyp4 double4
#endif
#ifndef numtyp
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp float
#define acctyp4 float4
#endif
#define EWALD_F (numtyp)1.12837917
#define EWALD_P (numtyp)0.3275911
#define A1 (numtyp)0.254829592
#define A2 (numtyp)-0.284496736
#define A3 (numtyp)1.421413741
#define A4 (numtyp)-1.453152027
#define A5 (numtyp)1.061405429
#define SBBITS 30
#define NEIGHMASK 0x3FFFFFFF
__inline int sbmask(int j) { return j >> SBBITS & 3; }
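// Annotation: erfc is evaluated with the Abramowitz & Stegun 7.1.26 fit,
//   erfc(x) ~= t*(A1 + t*(A2 + t*(A3 + t*(A4 + t*A5)))) * exp(-x*x),
//   t = 1/(1 + EWALD_P*x),
// with EWALD_F = 2/sqrt(pi). The kernels below use it for the real-space
// Ewald force
//   F(r) = qqrd2e*qi*qj*(erfc(g*r) + EWALD_F*g*r*exp(-(g*r)^2))/r^3,
// applied to each displacement component.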
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp sp_lj[8];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
sp_lj[4]=sp_lj_in[4];
sp_lj[5]=sp_lj_in[5];
sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7];
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
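// Annotation: sp_lj[4..7] hold special_coul, so factor_coul here is
// 1-special_coul. It is subtracted inside the damped term below because
// the k-space sum already includes the full 1/r interaction; only the
// scaled real-space portion of an excluded pair must be removed.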
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (rsq<lj1[mtype].z) {
numtyp r2inv=(numtyp)1.0/rsq;
numtyp forcecoul, force_lj, force, r6inv, prefactor, _erfc;
if (rsq < lj1[mtype].w) {
r6inv = r2inv*r2inv*r2inv;
force_lj = factor_lj*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
} else
force_lj = (numtyp)0.0;
if (rsq < cut_coulsq) {
numtyp r = sqrt(rsq);
numtyp grij = g_ewald * r;
numtyp expm2 = exp(-grij*grij);
numtyp t = (numtyp)1.0 / ((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
} else
forcecoul = (numtyp)0.0;
force = (force_lj + forcecoul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
if (rsq < cut_coulsq)
e_coul += prefactor*(_erfc-factor_coul);
if (rsq < lj1[mtype].w) {
numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
red_acc[4][tid]=e_coul;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<5; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
e_coul=red_acc[4][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
*ap1=e_coul;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[8];
if (tid<8)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[tid]=lj1_in[tid];
if (eflag>0)
lj3[tid]=lj3_in[tid];
}
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<lj1[mtype].z) {
numtyp r2inv=(numtyp)1.0/rsq;
numtyp forcecoul, force_lj, force, r6inv, prefactor, _erfc;
if (rsq < lj1[mtype].w) {
r6inv = r2inv*r2inv*r2inv;
force_lj = factor_lj*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
} else
force_lj = (numtyp)0.0;
if (rsq < cut_coulsq) {
numtyp r = sqrt(rsq);
numtyp grij = g_ewald * r;
numtyp expm2 = exp(-grij*grij);
numtyp t = (numtyp)1.0 / ((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
} else
forcecoul = (numtyp)0.0;
force = (force_lj + forcecoul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
if (rsq < cut_coulsq)
e_coul += prefactor*(_erfc-factor_coul);
if (rsq < lj1[mtype].w) {
numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
red_acc[4][tid]=e_coul;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<5; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
e_coul=red_acc[4][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
*ap1=e_coul;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
#endif


@ -1,168 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifdef USE_OPENCL
#include "ljcl_cut_gpu_cl.h"
#else
#include "ljcl_cut_gpu_ptx.h"
#endif
#include "ljcl_cut_gpu_memory.h"
#include <cassert>
#define LJCL_GPU_MemoryT LJCL_GPU_Memory<numtyp, acctyp>
extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
template <class numtyp, class acctyp>
LJCL_GPU_MemoryT::LJCL_GPU_Memory() : ChargeGPUMemory<numtyp,acctyp>(),
_allocated(false) {
}
template <class numtyp, class acctyp>
LJCL_GPU_MemoryT::~LJCL_GPU_Memory() {
clear();
}
template <class numtyp, class acctyp>
int LJCL_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int LJCL_GPU_MemoryT::init(const int ntypes,
double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen,
double **host_cut_ljsq, const double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double g_ewald) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,ljcl_cut_gpu_kernel);
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
host_cutsq, host_cut_ljsq);
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
host_offset);
sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<4; i++) {
host_write[i]=host_special_lj[i];
host_write[i+4]=host_special_coul[i];
}
ucl_copy(sp_lj,host_write,8,false);
_cut_coulsq=host_cut_coulsq;
_qqrd2e=qqrd2e;
_g_ewald=g_ewald;
_allocated=true;
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void LJCL_GPU_MemoryT::clear() {
if (!_allocated)
return;
_allocated=false;
lj1.clear();
lj3.clear();
sp_lj.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double LJCL_GPU_MemoryT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(LJCL_GPU_Memory<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and virials
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void LJCL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->dev_q.begin(),
&_cut_coulsq, &_qqrd2e, &_g_ewald,
&this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->atom->dev_q.begin(), &_cut_coulsq,
&_qqrd2e, &_g_ewald, &this->_threads_per_atom);
}
this->time_pair.stop();
}
template class LJCL_GPU_Memory<PRECISION,ACC_PRECISION>;


@ -1,82 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef LJCL_GPU_MEMORY_H
#define LJCL_GPU_MEMORY_H
#include "charge_gpu_memory.h"
template <class numtyp, class acctyp>
class LJCL_GPU_Memory : public ChargeGPUMemory<numtyp, acctyp> {
public:
LJCL_GPU_Memory();
~LJCL_GPU_Memory();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 if double precision is not supported on the card **/
int init(const int ntypes, double **host_cutsq,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, double **host_cut_ljsq,
const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double g_ewald);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq, lj1.w = cutsq_vdw
UCL_D_Vec<numtyp4> lj1;
/// lj3.x = lj3, lj3.y = lj4, lj3.z = offset
UCL_D_Vec<numtyp4> lj3;
/// Special LJ values [0-3] and Special Coul values [4-7]
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
numtyp _cut_coulsq, _qqrd2e, _g_ewald;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
#endif


@ -1,122 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#include <iostream>
#include <cassert>
#include <math.h>
#include "morse_gpu_memory.h"
using namespace std;
static MOR_GPU_Memory<PRECISION,ACC_PRECISION> MORMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int mor_gpu_init(const int ntypes, double **cutsq,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **offset, double *special_lj,
const int inum, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, int &gpu_mode,
FILE *screen) {
MORMF.clear();
gpu_mode=MORMF.device->gpu_mode();
double gpu_split=MORMF.device->particle_split();
int first_gpu=MORMF.device->first_device();
int last_gpu=MORMF.device->last_device();
int world_me=MORMF.device->world_me();
int gpu_rank=MORMF.device->gpu_rank();
int procs_per_gpu=MORMF.device->procs_per_gpu();
MORMF.device->init_message(screen,"morse",first_gpu,last_gpu);
bool message=false;
if (MORMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=MORMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen);
MORMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=MORMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
offset, special_lj, inum, nall, 300, maxspecial,
cell_size, gpu_split, screen);
MORMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
MORMF.estimate_gpu_overhead();
return init_ok;
}
void mor_gpu_clear() {
MORMF.clear();
}
int** mor_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success) {
return MORMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success);
}
void mor_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success) {
MORMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success);
}
double mor_gpu_bytes() {
return MORMF.host_memory_usage();
}


@ -1,388 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef MORSE_GPU_KERNEL
#define MORSE_GPU_KERNEL
#ifdef NV_KERNEL
#include "nv_kernel_def.h"
texture<float4> pos_tex;
#ifdef _DOUBLE_DOUBLE
__inline double4 fetch_pos(const int& i, const double4 *pos)
{
return pos[i];
}
#else
__inline float4 fetch_pos(const int& i, const float4 *pos)
{
return tex1Dfetch(pos_tex, i);
}
#endif
#else
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#define GLOBAL_ID_X get_global_id(0)
#define THREAD_ID_X get_local_id(0)
#define BLOCK_ID_X get_group_id(0)
#define BLOCK_SIZE_X get_local_size(0)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#define __inline inline
#define fetch_pos(i,y) x_[i]
#define BLOCK_PAIR 64
#define MAX_SHARED_TYPES 8
#endif
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
#define numtyp4 double4
#define acctyp double
#define acctyp4 double4
#endif
#ifdef _SINGLE_DOUBLE
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp double
#define acctyp4 double4
#endif
#ifndef numtyp
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp float
#define acctyp4 float4
#endif
#define SBBITS 30
#define NEIGHMASK 0x3FFFFFFF
__inline int sbmask(int j) { return j >> SBBITS & 3; }
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *mor1,
__global numtyp2* mor2, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp sp_lj[4];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (r<mor1[mtype].x) {
r=sqrt(r);
numtyp dexp=r-mor1[mtype].z;
dexp=exp(-mor1[mtype].w*dexp);
numtyp dm=dexp*dexp-dexp;
numtyp force = mor1[mtype].y*dm/r*factor_lj;
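// Annotation: mor1 packs {cutsq, morse1, r0, alpha} per type pair, where
// morse1 is precomputed on the host as 2*D0*alpha (as in the CPU pair
// style). With dexp = exp(-alpha*(r-r0)), dm = dexp*dexp-dexp and
// force = morse1*dm/r is -dE/dr * 1/r for E = D0*(dexp*dexp-2*dexp),
// ready to scale delx/dely/delz. Note r holds the squared distance until
// the sqrt above.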
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=mor2[mtype].x*(dexp*dexp - 2.0*dexp) - mor2[mtype].y;
energy+=e*factor_lj;
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<4; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *mor1_in,
__global numtyp2* mor2_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp4 mor1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp2 mor2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[4];
if (tid<4)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
mor1[tid]=mor1_in[tid];
if (eflag>0)
mor2[tid]=mor2_in[tid];
}
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r = delx*delx+dely*dely+delz*delz;
if (r<mor1[mtype].x) {
r=sqrt(r);
numtyp dexp=r-mor1[mtype].z;
dexp=exp(-mor1[mtype].w*dexp);
numtyp dm=dexp*dexp-dexp;
numtyp force = mor1[mtype].y*dm/r*factor_lj;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=mor2[mtype].x*(dm-dexp)-mor2[mtype].y;
energy+=e*factor_lj;
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<4; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
#endif

View File

@ -1,155 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifdef USE_OPENCL
#include "morse_gpu_cl.h"
#else
#include "morse_gpu_ptx.h"
#endif
#include "morse_gpu_memory.h"
#include <cassert>
#define MOR_GPU_MemoryT MOR_GPU_Memory<numtyp, acctyp>
extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
template <class numtyp, class acctyp>
MOR_GPU_MemoryT::MOR_GPU_Memory() : AtomicGPUMemory<numtyp,acctyp>(), _allocated(false) {
}
template <class numtyp, class acctyp>
MOR_GPU_MemoryT::~MOR_GPU_Memory() {
clear();
}
template <class numtyp, class acctyp>
int MOR_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int MOR_GPU_MemoryT::init(const int ntypes,
double **host_cutsq, double **host_morse1,
double **host_r0, double **host_alpha,
double **host_d0, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,morse_gpu_kernel);
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
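// Pad the type count up to max_shared_types so the fast kernel can
// index the type-pair tables with a fixed MAX_SHARED_TYPES row stride.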
if (types<=max_shared_types && this->_block_size>=max_shared_types) {
types=max_shared_types;
shared_types=true;
}
_types=types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(types*types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<types*types; i++)
host_write[i]=0.0;
mor1.alloc(types*types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,types,mor1,host_write,host_cutsq,host_morse1,
host_r0,host_alpha);
mor2.alloc(types*types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack2(ntypes,types,mor2,host_write,host_d0,host_offset);
UCL_H_Vec<double> dview;
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
dview.view(host_special_lj,4,*(this->ucl_device));
ucl_copy(sp_lj,dview,false);
_allocated=true;
this->_max_bytes=mor1.row_bytes()+mor2.row_bytes()+sp_lj.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void MOR_GPU_MemoryT::clear() {
if (!_allocated)
return;
_allocated=false;
mor1.clear();
mor2.clear();
sp_lj.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double MOR_GPU_MemoryT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(MOR_GPU_Memory<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void MOR_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
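// One block of BX threads covers BX/t_per_atom atoms, so the grid is
// sized to span all inum atoms assigned to the device.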
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &mor1.begin(),
&mor2.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &mor1.begin(), &mor2.begin(),
&_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom);
}
this->time_pair.stop();
}
template class MOR_GPU_Memory<PRECISION,ACC_PRECISION>;

View File

@ -1,78 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef MOR_GPU_MEMORY_H
#define MOR_GPU_MEMORY_H
#include "atomic_gpu_memory.h"
template <class numtyp, class acctyp>
class MOR_GPU_Memory : public AtomicGPUMemory<numtyp, acctyp> {
public:
MOR_GPU_Memory();
~MOR_GPU_Memory();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for the device
* - -5 if double precision is not supported on the card **/
int init(const int ntypes, double **host_cutsq,
double **host_morse1, double **host_r0, double **host_alpha,
double **host_d0, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// mor1.x = cutsq, mor1.y = morse1, mor1.z = r0, mor1.w = alpha
UCL_D_Vec<numtyp4> mor1;
/// mor2.x = d0, mor2.y = offset
UCL_D_Vec<numtyp2> mor2;
/// Special LJ values
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _types;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
#endif

View File

@ -1,55 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
/*************************************************************************
See pair_gpu_dev_kernel.cu for definitions
of preprocessor constants
*************************************************************************/
#ifndef NV_KERNEL_DEF
#define NV_KERNEL_DEF
#include "geryon/ucl_nv_kernel.h"
#ifdef __CUDA_ARCH__
#define ARCH __CUDA_ARCH__
#else
#define ARCH 100
#endif
#if (ARCH < 200)
#define THREADS_PER_ATOM 1
#define THREADS_PER_CHARGE 8
#define BLOCK_NBOR_BUILD 64
#define BLOCK_PAIR 64
#define BLOCK_BIO_PAIR 64
#define MAX_SHARED_TYPES 8
#else
#define THREADS_PER_ATOM 1
#define THREADS_PER_CHARGE 8
#define BLOCK_NBOR_BUILD 128
#define BLOCK_PAIR 128
#define BLOCK_BIO_PAIR 128
#define MAX_SHARED_TYPES 11
#endif
#define WARP_SIZE 32
#endif

View File

@ -1,407 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#include "pair_gpu_ans.h"
#define PairGPUAnsT PairGPUAns<numtyp,acctyp>
template <class numtyp, class acctyp>
PairGPUAnsT::PairGPUAns() : _allocated(false),_eflag(false),_vflag(false),
_inum(0),_ilist(NULL),_newton(false) {
}
template <class numtyp, class acctyp>
int PairGPUAnsT::bytes_per_atom() const {
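// 4 accumulators for force (x, y, z plus padding) and 7 for
// energy/virial (1 energy + 6 virial components) = 11 per atom.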
int bytes=11*sizeof(acctyp);
if (_rot)
bytes+=4*sizeof(acctyp);
if (_charge)
bytes+=sizeof(acctyp);
return bytes;
}
template <class numtyp, class acctyp>
bool PairGPUAnsT::alloc(const int inum) {
_max_local=static_cast<int>(static_cast<double>(inum)*1.10);
bool success=true;
int ans_elements=4;
if (_rot)
ans_elements+=4;
// Ignore host/device transfers?
bool cpuview=false;
if (dev->device_type()==UCL_CPU)
cpuview=true;
// -------------------------- Host allocations
success=success &&(host_ans.alloc(ans_elements*_max_local,*dev)==UCL_SUCCESS);
success=success &&(host_engv.alloc(_ev_fields*_max_local,*dev)==UCL_SUCCESS);
// --------------------------- Device allocations
if (cpuview) {
dev_engv.view(host_engv);
dev_ans.view(host_ans);
} else {
success=success && (dev_engv.alloc(_ev_fields*_max_local,*dev,
UCL_WRITE_ONLY)==UCL_SUCCESS);
success=success && (dev_ans.alloc(ans_elements*_max_local,
*dev,UCL_WRITE_ONLY)==UCL_SUCCESS);
}
_gpu_bytes=dev_engv.row_bytes()+dev_ans.row_bytes();
_allocated=true;
return success;
}
template <class numtyp, class acctyp>
bool PairGPUAnsT::init(const int inum, const bool charge, const bool rot,
UCL_Device &devi) {
clear();
bool success=true;
_charge=charge;
_rot=rot;
_other=_charge || _rot;
dev=&devi;
_e_fields=1;
if (_charge)
_e_fields++;
_ev_fields=6+_e_fields;
// Initialize atom and nbor data
int ef_inum=inum;
if (ef_inum==0)
ef_inum=1000;
// Initialize timers for the selected device
time_answer.init(*dev);
time_answer.zero();
_time_cast=0.0;
_time_cpu_idle=0.0;
return success && alloc(ef_inum);
}
template <class numtyp, class acctyp>
bool PairGPUAnsT::add_fields(const bool charge, const bool rot) {
bool realloc=false;
if (charge && _charge==false) {
_charge=true;
_e_fields++;
_ev_fields++;
realloc=true;
}
if (rot && _rot==false) {
_rot=true;
realloc=true;
}
if (realloc) {
_other=_charge || _rot;
int inum=_max_local;
clear_resize();
return alloc(inum);
}
return true;
}
template <class numtyp, class acctyp>
void PairGPUAnsT::clear_resize() {
if (!_allocated)
return;
_allocated=false;
dev_ans.clear();
dev_engv.clear();
host_ans.clear();
host_engv.clear();
}
template <class numtyp, class acctyp>
void PairGPUAnsT::clear() {
_gpu_bytes=0;
if (!_allocated)
return;
time_answer.clear();
clear_resize();
_inum=0;
_ilist=NULL;
_eflag=false;
_vflag=false;
}
template <class numtyp, class acctyp>
double PairGPUAnsT::host_memory_usage() const {
int atom_bytes=4;
if (_charge)
atom_bytes+=1;
if (_rot)
atom_bytes+=4;
int ans_bytes=atom_bytes+_ev_fields;
return ans_bytes*(_max_local)*sizeof(acctyp)+
sizeof(PairGPUAns<numtyp,acctyp>);
}
template <class numtyp, class acctyp>
void PairGPUAnsT::copy_answers(const bool eflag, const bool vflag,
const bool ef_atom, const bool vf_atom) {
time_answer.start();
_eflag=eflag;
_vflag=vflag;
_ef_atom=ef_atom;
_vf_atom=vf_atom;
int csize=_ev_fields;
if (!eflag)
csize-=_e_fields;
if (!vflag)
csize-=6;
if (csize>0)
ucl_copy(host_engv,dev_engv,_inum*csize,true);
if (_rot)
ucl_copy(host_ans,dev_ans,_inum*4*2,true);
else
ucl_copy(host_ans,dev_ans,_inum*4,true);
time_answer.stop();
}
template <class numtyp, class acctyp>
void PairGPUAnsT::copy_answers(const bool eflag, const bool vflag,
const bool ef_atom, const bool vf_atom,
int *ilist) {
_ilist=ilist;
copy_answers(eflag,vflag,ef_atom,vf_atom);
}
template <class numtyp, class acctyp>
double PairGPUAnsT::energy_virial(double *eatom, double **vatom,
double *virial) {
if (_eflag==false && _vflag==false)
return 0.0;
double evdwl=0.0;
double virial_acc[6];
for (int i=0; i<6; i++) virial_acc[i]=0.0;
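// host_engv is laid out field-major: all energies first, then each
// virial component in turn, so ap advances by _inum to step between
// fields. The 0.5 factors below account for each pair interaction
// being accumulated once from each atom of the pair.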
if (_ilist==NULL) {
for (int i=0; i<_inum; i++) {
acctyp *ap=host_engv.begin()+i;
if (_eflag) {
if (_ef_atom) {
evdwl+=*ap;
eatom[i]+=*ap*0.5;
ap+=_inum;
} else {
evdwl+=*ap;
ap+=_inum;
}
}
if (_vflag) {
if (_vf_atom) {
for (int j=0; j<6; j++) {
vatom[i][j]+=*ap*0.5;
virial_acc[j]+=*ap;
ap+=_inum;
}
} else {
for (int j=0; j<6; j++) {
virial_acc[j]+=*ap;
ap+=_inum;
}
}
}
}
for (int j=0; j<6; j++)
virial[j]+=virial_acc[j]*0.5;
} else {
for (int i=0; i<_inum; i++) {
acctyp *ap=host_engv.begin()+i;
int ii=_ilist[i];
if (_eflag) {
if (_ef_atom) {
evdwl+=*ap;
eatom[ii]+=*ap*0.5;
ap+=_inum;
} else {
evdwl+=*ap;
ap+=_inum;
}
}
if (_vflag) {
if (_vf_atom) {
for (int j=0; j<6; j++) {
vatom[ii][j]+=*ap*0.5;
virial_acc[j]+=*ap;
ap+=_inum;
}
} else {
for (int j=0; j<6; j++) {
virial_acc[j]+=*ap;
ap+=_inum;
}
}
}
}
for (int j=0; j<6; j++)
virial[j]+=virial_acc[j]*0.5;
}
evdwl*=0.5;
return evdwl;
}
template <class numtyp, class acctyp>
double PairGPUAnsT::energy_virial(double *eatom, double **vatom,
double *virial, double &ecoul) {
if (_eflag==false && _vflag==false)
return 0.0;
if (_charge==false)
return energy_virial(eatom,vatom,virial);
double evdwl=0.0;
double _ecoul=0.0;
double virial_acc[6];
for (int i=0; i<6; i++) virial_acc[i]=0.0;
if (_ilist==NULL) {
for (int i=0; i<_inum; i++) {
acctyp *ap=host_engv.begin()+i;
if (_eflag) {
if (_ef_atom) {
evdwl+=*ap;
eatom[i]+=*ap*0.5;
ap+=_inum;
_ecoul+=*ap;
eatom[i]+=*ap*0.5;
ap+=_inum;
} else {
evdwl+=*ap;
ap+=_inum;
_ecoul+=*ap;
ap+=_inum;
}
}
if (_vflag) {
if (_vf_atom) {
for (int j=0; j<6; j++) {
vatom[i][j]+=*ap*0.5;
virial_acc[j]+=*ap;
ap+=_inum;
}
} else {
for (int j=0; j<6; j++) {
virial_acc[j]+=*ap;
ap+=_inum;
}
}
}
}
for (int j=0; j<6; j++)
virial[j]+=virial_acc[j]*0.5;
} else {
for (int i=0; i<_inum; i++) {
acctyp *ap=host_engv.begin()+i;
int ii=_ilist[i];
if (_eflag) {
if (_ef_atom) {
evdwl+=*ap;
eatom[ii]+=*ap*0.5;
ap+=_inum;
_ecoul+=*ap;
eatom[ii]+=*ap*0.5;
ap+=_inum;
} else {
evdwl+=*ap;
ap+=_inum;
_ecoul+=*ap;
ap+=_inum;
}
}
if (_vflag) {
if (_vf_atom) {
for (int j=0; j<6; j++) {
vatom[ii][j]+=*ap*0.5;
virial_acc[j]+=*ap;
ap+=_inum;
}
} else {
for (int j=0; j<6; j++) {
virial_acc[j]+=*ap;
ap+=_inum;
}
}
}
}
for (int j=0; j<6; j++)
virial[j]+=virial_acc[j]*0.5;
}
evdwl*=0.5;
ecoul+=_ecoul*0.5;
return evdwl;
}
template <class numtyp, class acctyp>
void PairGPUAnsT::get_answers(double **f, double **tor) {
acctyp *ap=host_ans.begin();
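// Answers are packed 4 acctyps per atom (x, y, z plus padding), so
// after reading the z component the pointer advances by 2 to skip the
// pad and land on the next atom.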
if (_ilist==NULL) {
for (int i=0; i<_inum; i++) {
f[i][0]+=*ap;
ap++;
f[i][1]+=*ap;
ap++;
f[i][2]+=*ap;
ap+=2;
}
if (_rot) {
for (int i=0; i<_inum; i++) {
tor[i][0]+=*ap;
ap++;
tor[i][1]+=*ap;
ap++;
tor[i][2]+=*ap;
ap+=2;
}
}
} else {
for (int i=0; i<_inum; i++) {
int ii=_ilist[i];
f[ii][0]+=*ap;
ap++;
f[ii][1]+=*ap;
ap++;
f[ii][2]+=*ap;
ap+=2;
}
if (_rot) {
for (int i=0; i<_inum; i++) {
int ii=_ilist[i];
tor[ii][0]+=*ap;
ap++;
tor[ii][1]+=*ap;
ap++;
tor[ii][2]+=*ap;
ap+=2;
}
}
}
}
template class PairGPUAns<PRECISION,ACC_PRECISION>;

View File

@ -1,170 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef PAIR_GPU_ANS_H
#define PAIR_GPU_ANS_H
#include <math.h>
#include "mpi.h"
#ifdef USE_OPENCL
#include "geryon/ocl_timer.h"
#include "geryon/ocl_mat.h"
using namespace ucl_opencl;
#else
#include "geryon/nvd_timer.h"
#include "geryon/nvd_mat.h"
using namespace ucl_cudadr;
#endif
#include "pair_gpu_precision.h"
template <class numtyp, class acctyp>
class PairGPUAns {
public:
PairGPUAns();
~PairGPUAns() { clear(); }
/// Current number of local atoms stored
inline int inum() const { return _inum; }
/// Set number of local atoms for future copy operations
inline void inum(const int n) { _inum=n; }
/// Memory usage per atom in this class
int bytes_per_atom() const;
/// Clear any previous data and set up for a new LAMMPS run
/** \param rot True if atom storage needs quaternions
* \param gpu_nbor True if neighboring will be performed on device **/
bool init(const int inum, const bool charge, const bool rot, UCL_Device &dev);
/// Check if we have enough device storage and realloc if not
inline void resize(const int inum, bool &success) {
_inum=inum;
if (inum>_max_local) {
clear_resize();
success = success && alloc(inum);
}
}
/// If already initialized by another LAMMPS style, add fields as necessary
/** \param rot True if atom storage needs quaternions
* \param gpu_nbor True if neighboring will be performed on device **/
bool add_fields(const bool charge, const bool rot);
/// Free all memory on host and device needed to realloc for more atoms
void clear_resize();
/// Free all memory on host and device
void clear();
/// Return the total amount of host memory used by class in bytes
double host_memory_usage() const;
/// Add copy times to timers
inline void acc_timers() {
time_answer.add_to_total();
}
/// Add copy times to timers
inline void zero_timers() {
time_answer.zero();
}
/// Return the total time for host/device data transfer
inline double transfer_time() {
return time_answer.total_seconds();
}
/// Return the total time for data cast/pack
inline double cast_time() { return _time_cast; }
/// Return number of bytes used on device
inline double gpu_bytes() { return _gpu_bytes; }
// -------------------------COPY FROM GPU -------------------------------
/// Copy answers from device into read buffer asynchronously
void copy_answers(const bool eflag, const bool vflag,
const bool ef_atom, const bool vf_atom);
/// Copy answers from device into read buffer asynchronously
void copy_answers(const bool eflag, const bool vflag,
const bool ef_atom, const bool vf_atom, int *ilist);
/// Copy energy and virial data into LAMMPS memory
double energy_virial(double *eatom, double **vatom, double *virial);
/// Copy energy and virial data into LAMMPS memory
double energy_virial(double *eatom, double **vatom, double *virial,
double &ecoul);
/// Add forces and torques from the GPU into a LAMMPS pointer
void get_answers(double **f, double **tor);
inline double get_answers(double **f, double **tor, double *eatom,
double **vatom, double *virial, double &ecoul) {
double ta=MPI_Wtime();
time_answer.sync_stop();
_time_cpu_idle+=MPI_Wtime()-ta;
double ts=MPI_Wtime();
double evdw=energy_virial(eatom,vatom,virial,ecoul);
get_answers(f,tor);
_time_cast+=MPI_Wtime()-ts;
return evdw;
}
/// Return the time the CPU was idle waiting for GPU
inline double cpu_idle_time() { return _time_cpu_idle; }
// ------------------------------ DATA ----------------------------------
/// Force and possibly torque
UCL_D_Vec<acctyp> dev_ans;
/// Energy and virial per-atom storage
UCL_D_Vec<acctyp> dev_engv;
/// Force and possibly torque data on host
UCL_H_Vec<acctyp> host_ans;
/// Energy/virial data on host
UCL_H_Vec<acctyp> host_engv;
/// Device timers
UCL_Timer time_answer;
/// Geryon device
UCL_Device *dev;
private:
bool alloc(const int inum);
bool _allocated, _eflag, _vflag, _ef_atom, _vf_atom, _rot, _charge, _other;
int _max_local, _inum, _e_fields, _ev_fields;
int *_ilist;
double _time_cast, _time_cpu_idle;
double _gpu_bytes;
bool _newton;
};
#endif

View File

@ -1,335 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#include "pair_gpu_atom.h"
#define PairGPUAtomT PairGPUAtom<numtyp,acctyp>
#ifdef WINDLL
#include <windows.h>
typedef bool (*__win_sort_alloc)(const int max_atoms);
typedef void (*__win_sort)(const int max_atoms, unsigned *cell_begin,
int *particle_begin);
__win_sort_alloc _win_sort_alloc;
__win_sort _win_sort;
#endif
template <class numtyp, class acctyp>
PairGPUAtomT::PairGPUAtom() : _compiled(false),_allocated(false),
_max_gpu_bytes(0) {
#ifndef USE_OPENCL
sort_config.op = CUDPP_ADD;
sort_config.datatype = CUDPP_UINT;
sort_config.algorithm = CUDPP_SORT_RADIX;
sort_config.options = CUDPP_OPTION_KEY_VALUE_PAIRS;
#ifdef WINDLL
HINSTANCE hinstLib = LoadLibrary(TEXT("gpu.dll"));
if (hinstLib == NULL) {
printf("\nUnable to load gpu.dll\n");
exit(1);
}
_win_sort_alloc=(__win_sort_alloc)GetProcAddress(hinstLib,"_win_sort_alloc");
_win_sort=(__win_sort)GetProcAddress(hinstLib,"_win_sort");
#endif
#endif
}
template <class numtyp, class acctyp>
int PairGPUAtomT::bytes_per_atom() const {
int id_space=0;
if (_gpu_nbor)
id_space=2;
int bytes=4*sizeof(numtyp)+id_space;
if (_rot)
bytes+=4*sizeof(numtyp);
if (_charge)
bytes+=sizeof(numtyp);
return bytes;
}
template <class numtyp, class acctyp>
bool PairGPUAtomT::alloc(const int nall) {
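// Allocate with 10% headroom so modest growth in the atom count does
// not trigger an immediate reallocation in resize().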
_max_atoms=static_cast<int>(static_cast<double>(nall)*1.10);
bool success=true;
// Ignore host/device transfers?
bool cpuview=false;
if (dev->device_type()==UCL_CPU)
cpuview=true;
// Allocate storage for CUDPP sort
#ifndef USE_OPENCL
#ifdef WINDLL
_win_sort_alloc(_max_atoms);
#else
if (_gpu_nbor) {
CUDPPResult result = cudppPlan(&sort_plan, sort_config, _max_atoms, 1, 0);
if (CUDPP_SUCCESS != result)
return false;
}
#endif
#endif
// -------------------------- Host allocations
// Get a host write only buffer
#ifdef GPU_CAST
success=success && (host_x_cast.alloc(_max_atoms*3,*dev,
UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
success=success && (host_type_cast.alloc(_max_atoms,*dev,
UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
#else
success=success && (host_x.alloc(_max_atoms*4,*dev,
UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
#endif
// Buffer for casting only if different precisions
if (_charge)
success=success && (host_q.alloc(_max_atoms,*dev,
UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
// Buffer for casting only if different precisions
if (_rot)
success=success && (host_quat.alloc(_max_atoms*4,*dev,
UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
// --------------------------- Device allocations
int gpu_bytes=0;
if (cpuview) {
#ifdef GPU_CAST
assert(0==1);
#else
dev_x.view(host_x);
#endif
if (_rot)
dev_quat.view(host_quat);
if (_charge)
dev_q.view(host_q);
} else {
#ifdef GPU_CAST
success=success && (UCL_SUCCESS==dev_x.alloc(_max_atoms*4,*dev));
success=success && (UCL_SUCCESS==
dev_x_cast.alloc(_max_atoms*3,*dev,UCL_READ_ONLY));
success=success && (UCL_SUCCESS==
dev_type_cast.alloc(_max_atoms,*dev,UCL_READ_ONLY));
gpu_bytes+=dev_x_cast.row_bytes()+dev_type_cast.row_bytes();
#else
success=success && (UCL_SUCCESS==
dev_x.alloc(_max_atoms*4,*dev,UCL_READ_ONLY));
#endif
if (_charge) {
success=success && (dev_q.alloc(_max_atoms,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
gpu_bytes+=dev_q.row_bytes();
}
if (_rot) {
success=success && (dev_quat.alloc(_max_atoms*4,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
gpu_bytes+=dev_quat.row_bytes();
}
}
if (_gpu_nbor) {
success=success && (dev_cell_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
success=success && (dev_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
gpu_bytes+=dev_cell_id.row_bytes()+dev_particle_id.row_bytes();
if (_bonds) {
success=success && (dev_tag.alloc(_max_atoms,*dev)==UCL_SUCCESS);
gpu_bytes+=dev_tag.row_bytes();
}
}
gpu_bytes+=dev_x.row_bytes();
if (gpu_bytes>_max_gpu_bytes)
_max_gpu_bytes=gpu_bytes;
_allocated=true;
return success;
}
template <class numtyp, class acctyp>
bool PairGPUAtomT::add_fields(const bool charge, const bool rot,
const bool gpu_nbor, const bool bonds) {
bool realloc=false;
if (charge && _charge==false) {
_charge=true;
realloc=true;
}
if (rot && _rot==false) {
_rot=true;
realloc=true;
}
if (gpu_nbor && _gpu_nbor==false) {
_gpu_nbor=true;
realloc=true;
}
if (bonds && _bonds==false) {
_bonds=true;
realloc=true;
}
if (realloc) {
_other=_charge || _rot;
int max_atoms=_max_atoms;
clear_resize();
return alloc(max_atoms);
}
return true;
}
template <class numtyp, class acctyp>
bool PairGPUAtomT::init(const int nall, const bool charge, const bool rot,
UCL_Device &devi, const bool gpu_nbor,
const bool bonds) {
clear();
bool success=true;
_x_avail=false;
_q_avail=false;
_quat_avail=false;
_resized=false;
_gpu_nbor=gpu_nbor;
_bonds=bonds;
_charge=charge;
_rot=rot;
_other=_charge || _rot;
dev=&devi;
// Initialize atom and nbor data
int ef_nall=nall;
if (ef_nall==0)
ef_nall=2000;
// Initialize timers for the selected device
time_pos.init(*dev);
time_q.init(*dev);
time_quat.init(*dev);
time_pos.zero();
time_q.zero();
time_quat.zero();
_time_cast=0.0;
#ifdef GPU_CAST
compile_kernels(*dev);
#endif
return success && alloc(ef_nall);
}
template <class numtyp, class acctyp>
void PairGPUAtomT::clear_resize() {
if (!_allocated)
return;
_allocated=false;
dev_x.clear();
if (_charge) {
dev_q.clear();
host_q.clear();
}
if (_rot) {
dev_quat.clear();
host_quat.clear();
}
#ifndef GPU_CAST
host_x.clear();
#else
host_x_cast.clear();
host_type_cast.clear();
#endif
dev_cell_id.clear();
dev_particle_id.clear();
dev_tag.clear();
#ifdef GPU_CAST
dev_x_cast.clear();
dev_type_cast.clear();
#endif
#ifndef USE_OPENCL
#ifndef WINDLL
if (_gpu_nbor) cudppDestroyPlan(sort_plan);
#endif
#endif
}
template <class numtyp, class acctyp>
void PairGPUAtomT::clear() {
_max_gpu_bytes=0;
if (!_allocated)
return;
time_pos.clear();
time_q.clear();
time_quat.clear();
clear_resize();
#ifdef GPU_CAST
if (_compiled) {
k_cast_x.clear();
delete atom_program;
_compiled=false;
}
#endif
}
template <class numtyp, class acctyp>
double PairGPUAtomT::host_memory_usage() const {
int atom_bytes=4;
if (_charge)
atom_bytes+=1;
if (_rot)
atom_bytes+=4;
return _max_atoms*atom_bytes*sizeof(numtyp)+
sizeof(PairGPUAtom<numtyp,acctyp>);
}
// Sort arrays for neighbor list calculation
template <class numtyp, class acctyp>
void PairGPUAtomT::sort_neighbor(const int num_atoms) {
#ifndef USE_OPENCL
#ifdef WINDLL
_win_sort(num_atoms,(unsigned *)dev_cell_id.begin(),
(int *)dev_particle_id.begin());
#else
CUDPPResult result = cudppSort(sort_plan, (unsigned *)dev_cell_id.begin(),
(int *)dev_particle_id.begin(),
8*sizeof(unsigned), num_atoms);
if (CUDPP_SUCCESS != result) {
printf("Error in cudppSort\n");
NVD_GERYON_EXIT;
}
#endif
#endif
}
#ifdef GPU_CAST
#ifdef USE_OPENCL
#include "pair_gpu_atom_cl.h"
#else
#include "pair_gpu_atom_ptx.h"
#endif
template <class numtyp, class acctyp>
void PairGPUAtomT::compile_kernels(UCL_Device &dev) {
atom_program=new UCL_Program(dev);
atom_program->load_string(pair_gpu_atom_kernel,"");
k_cast_x.set_function(*atom_program,"kernel_cast_x");
_compiled=true;
}
#endif
template class PairGPUAtom<PRECISION,ACC_PRECISION>;

View File

@ -1,417 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef PAIR_GPU_ATOM_H
#define PAIR_GPU_ATOM_H
#include <math.h>
#include "mpi.h"
#ifdef USE_OPENCL
#include "geryon/ocl_timer.h"
#include "geryon/ocl_mat.h"
#include "geryon/ocl_kernel.h"
using namespace ucl_opencl;
#else
#include "cudpp.h"
#include "geryon/nvd_timer.h"
#include "geryon/nvd_mat.h"
#include "geryon/nvd_kernel.h"
using namespace ucl_cudadr;
#endif
#include "pair_gpu_precision.h"
template <class numtyp, class acctyp>
class PairGPUAtom {
public:
PairGPUAtom();
~PairGPUAtom() { clear(); }
/// Maximum number of atoms that can be stored with current allocation
inline int max_atoms() const { return _max_atoms; }
/// Current number of local+ghost atoms stored
inline int nall() const { return _nall; }
/// Set number of local+ghost atoms for future copy operations
inline void nall(const int n) { _nall=n; }
/// Memory usage per atom in this class
int bytes_per_atom() const;
/// Clear any previous data and set up for a new LAMMPS run
/** \param rot True if atom storage needs quaternions
* \param gpu_nbor True if neighboring will be performed on device **/
bool init(const int nall, const bool charge, const bool rot,
UCL_Device &dev, const bool gpu_nbor=false, const bool bonds=false);
/// Check if we have enough device storage and realloc if not
/** Returns true if resized with any call during this timestep **/
inline bool resize(const int nall, bool &success) {
_nall=nall;
if (nall>_max_atoms) {
clear_resize();
success = success && alloc(nall);
_resized=true;
}
return _resized;
}
/// If already initialized by another LAMMPS style, add fields as necessary
/** \param rot True if atom storage needs quaternions
* \param gpu_nbor True if neighboring will be performed on device **/
bool add_fields(const bool charge, const bool rot, const bool gpu_nbor,
const bool bonds);
/// Returns true if GPU is using charges
bool charge() { return _charge; }
/// Returns true if GPU is using quaternions
bool quat() { return _rot; }
/// Only free matrices of length inum or nall for resizing
void clear_resize();
/// Free all memory on host and device
void clear();
/// Return the total amount of host memory used by class in bytes
double host_memory_usage() const;
/// Sort arrays for neighbor list calculation on device
void sort_neighbor(const int num_atoms);
/// Add copy times to timers
inline void acc_timers() {
time_pos.add_to_total();
if (_charge)
time_q.add_to_total();
if (_rot)
time_quat.add_to_total();
}
/// Add copy times to timers
inline void zero_timers() {
time_pos.zero();
if (_charge)
time_q.zero();
if (_rot)
time_quat.zero();
}
/// Return the total time for host/device data transfer
/** Zeros the total so that the atom times are only included once **/
inline double transfer_time() {
double total=time_pos.total_seconds();
time_pos.zero_total();
if (_charge) {
total+=time_q.total_seconds();
time_q.zero_total();
}
if (_rot) {
total+=time_quat.total_seconds();
time_quat.zero_total();
}
return total;
}
/// Return the total time for data cast/pack
/** Zeros the time so that atom times are only included once **/
inline double cast_time()
{ double t=_time_cast; _time_cast=0.0; return t; }
/// Pack LAMMPS atom type constants into matrix and copy to device
template <class dev_typ, class t1>
inline void type_pack1(const int n, const int m_size,
UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer,
t1 **one) {
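// Copy the n x n host matrix into a buffer with row stride m_size;
// the ii+=m_size-n step skips the padding entries added when the type
// count is padded up to the shared-memory type count.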
int ii=0;
for (int i=0; i<n; i++) {
for (int j=0; j<n; j++) {
buffer[ii]=static_cast<numtyp>(one[i][j]);
ii++;
}
ii+=m_size-n;
}
UCL_H_Vec<dev_typ> view;
view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev);
ucl_copy(dev_v,view,false);
}
/// Pack LAMMPS atom type constants into 2 vectors and copy to device
template <class dev_typ, class t1, class t2>
inline void type_pack2(const int n, const int m_size,
UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer,
t1 **one, t2 **two) {
int ii=0;
for (int i=0; i<n; i++) {
for (int j=0; j<n; j++) {
buffer[ii*2]=static_cast<numtyp>(one[i][j]);
buffer[ii*2+1]=static_cast<numtyp>(two[i][j]);
ii++;
}
ii+=m_size-n;
}
UCL_H_Vec<dev_typ> view;
view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev);
ucl_copy(dev_v,view,false);
}
/// Pack LAMMPS atom type constants (3) into 4 vectors and copy to device
template <class dev_typ, class t1, class t2, class t3>
inline void type_pack4(const int n, const int m_size,
UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer,
t1 **one, t2 **two, t3 **three) {
int ii=0;
for (int i=0; i<n; i++) {
for (int j=0; j<n; j++) {
buffer[ii*4]=static_cast<numtyp>(one[i][j]);
buffer[ii*4+1]=static_cast<numtyp>(two[i][j]);
buffer[ii*4+2]=static_cast<numtyp>(three[i][j]);
ii++;
}
ii+=m_size-n;
}
UCL_H_Vec<dev_typ> view;
view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev);
ucl_copy(dev_v,view,false);
}
/// Pack LAMMPS atom type constants (4) into 4 vectors and copy to device
template <class dev_typ, class t1, class t2, class t3, class t4>
inline void type_pack4(const int n, const int m_size,
UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer,
t1 **one, t2 **two, t3 **three, t4 **four) {
int ii=0;
for (int i=0; i<n; i++) {
for (int j=0; j<n; j++) {
buffer[ii*4]=static_cast<numtyp>(one[i][j]);
buffer[ii*4+1]=static_cast<numtyp>(two[i][j]);
buffer[ii*4+2]=static_cast<numtyp>(three[i][j]);
buffer[ii*4+3]=static_cast<numtyp>(four[i][j]);
ii++;
}
ii+=m_size-n;
}
UCL_H_Vec<dev_typ> view;
view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev);
ucl_copy(dev_v,view,false);
}
/// Pack LAMMPS atom "self" type constants into 2 vectors and copy to device
template <class dev_typ, class t1, class t2>
inline void self_pack2(const int n, UCL_D_Vec<dev_typ> &dev_v,
UCL_H_Vec<numtyp> &buffer, t1 **one, t2 **two) {
for (int i=0; i<n; i++) {
buffer[i*2]=static_cast<numtyp>(one[i][i]);
buffer[i*2+1]=static_cast<numtyp>(two[i][i]);
}
UCL_H_Vec<dev_typ> view;
view.view((dev_typ*)buffer.begin(),n,*dev);
ucl_copy(dev_v,view,false);
}
// -------------------------COPY TO GPU ----------------------------------
/// Signal that we need to transfer atom data for next timestep
inline void data_unavail()
{ _x_avail=false; _q_avail=false; _quat_avail=false; _resized=false; }
/// Cast positions and types to write buffer
inline void cast_x_data(double **host_ptr, const int *host_type) {
if (_x_avail==false) {
double t=MPI_Wtime();
#ifdef GPU_CAST
memcpy(host_x_cast.begin(),host_ptr[0],_nall*3*sizeof(double));
memcpy(host_type_cast.begin(),host_type,_nall*sizeof(int));
#else
numtyp *_write_loc=host_x.begin();
for (int i=0; i<_nall; i++) {
*_write_loc=host_ptr[i][0];
_write_loc++;
*_write_loc=host_ptr[i][1];
_write_loc++;
*_write_loc=host_ptr[i][2];
_write_loc++;
*_write_loc=host_type[i];
_write_loc++;
}
#endif
_time_cast+=MPI_Wtime()-t;
}
}
/// Copy positions and types to device asynchronously
/** Copies nall() elements **/
inline void add_x_data(double **host_ptr, int *host_type) {
time_pos.start();
if (_x_avail==false) {
#ifdef GPU_CAST
ucl_copy(dev_x_cast,host_x_cast,_nall*3,true);
ucl_copy(dev_type_cast,host_type_cast,_nall,true);
int block_size=64;
int GX=static_cast<int>(ceil(static_cast<double>(_nall)/block_size));
k_cast_x.set_size(GX,block_size);
k_cast_x.run(&dev_x.begin(), &dev_x_cast.begin(), &dev_type_cast.begin(),
&_nall);
#else
ucl_copy(dev_x,host_x,_nall*4,true);
#endif
_x_avail=true;
}
time_pos.stop();
}
/// Calls cast_x_data and add_x_data and times the routines
inline void cast_copy_x(double **host_ptr, int *host_type) {
cast_x_data(host_ptr,host_type);
add_x_data(host_ptr,host_type);
}
// Cast charges to write buffer
template<class cpytyp>
inline void cast_q_data(cpytyp *host_ptr) {
if (_q_avail==false) {
double t=MPI_Wtime();
if (dev->device_type()==UCL_CPU) {
if (sizeof(numtyp)==sizeof(double)) {
host_q.view((numtyp*)host_ptr,_nall,*dev);
dev_q.view(host_q);
} else
for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i];
} else {
if (sizeof(numtyp)==sizeof(double))
memcpy(host_q.begin(),host_ptr,_nall*sizeof(numtyp));
else
for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i];
}
_time_cast+=MPI_Wtime()-t;
}
}
// Copy charges to device asynchronously
inline void add_q_data() {
if (_q_avail==false) {
ucl_copy(dev_q,host_q,_nall,true);
_q_avail=true;
}
}
// Cast quaternions to write buffer
template<class cpytyp>
inline void cast_quat_data(cpytyp *host_ptr) {
if (_quat_avail==false) {
double t=MPI_Wtime();
if (dev->device_type()==UCL_CPU) {
if (sizeof(numtyp)==sizeof(double)) {
host_quat.view((numtyp*)host_ptr,_nall*4,*dev);
dev_quat.view(host_quat);
} else
for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i];
} else {
if (sizeof(numtyp)==sizeof(double))
memcpy(host_quat.begin(),host_ptr,_nall*4*sizeof(numtyp));
else
for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i];
}
_time_cast+=MPI_Wtime()-t;
}
}
// Copy quaternions to device
/** Copies nall()*4 elements **/
inline void add_quat_data() {
if (_quat_avail==false) {
ucl_copy(dev_quat,host_quat,_nall*4,true);
_quat_avail=true;
}
}
/// Return number of bytes used on device
inline double max_gpu_bytes()
{ double m=_max_gpu_bytes; _max_gpu_bytes=0.0; return m; }
// ------------------------------ DATA ----------------------------------
/// Atom coordinates and types ([0] is x, [1] is y, [2] is z, [3] is type)
UCL_D_Vec<numtyp> dev_x;
/// Charges
UCL_D_Vec<numtyp> dev_q;
/// Quaternions
UCL_D_Vec<numtyp> dev_quat;
#ifdef GPU_CAST
UCL_D_Vec<double> dev_x_cast;
UCL_D_Vec<int> dev_type_cast;
UCL_H_Vec<double> host_x_cast;
UCL_H_Vec<int> host_type_cast;
#endif
/// Buffer for moving positions to device
UCL_H_Vec<numtyp> host_x;
/// Buffer for moving charge data to GPU
UCL_H_Vec<numtyp> host_q;
/// Buffer for moving quat data to GPU
UCL_H_Vec<numtyp> host_quat;
/// Cell list identifiers for device nbor builds
UCL_D_Vec<unsigned> dev_cell_id;
/// Cell list identifiers for device nbor builds
UCL_D_Vec<int> dev_particle_id;
/// Atom tag information for device nbor builds
UCL_D_Vec<int> dev_tag;
/// Device timers
UCL_Timer time_pos, time_q, time_quat;
/// Geryon device
UCL_Device *dev;
private:
#ifdef GPU_CAST
UCL_Program *atom_program;
UCL_Kernel k_cast_x;
void compile_kernels(UCL_Device &dev);
#endif
bool _compiled;
// True if data has been copied to device already
bool _x_avail, _q_avail, _quat_avail, _resized;
bool alloc(const int nall);
bool _allocated, _rot, _charge, _other;
int _max_atoms, _nall;
bool _gpu_nbor, _bonds;
double _time_cast;
double _max_gpu_bytes;
#ifndef USE_OPENCL
CUDPPConfiguration sort_config;
CUDPPHandle sort_plan;
#endif
};
#endif

View File

@ -1,46 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifdef NV_KERNEL
#include "geryon/ucl_nv_kernel.h"
#else
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#define GLOBAL_ID_X get_global_id(0)
#endif
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp4 double4
#else
#define numtyp float
#define numtyp4 float4
#endif
__kernel void kernel_cast_x(__global numtyp4 *x_type, __global double *x,
__global int *type, const int nall) {
int ii=GLOBAL_ID_X;
if (ii<nall) {
numtyp4 xt;
xt.w=type[ii];
int i=ii*3;
xt.x=x[i];
xt.y=x[i+1];
xt.z=x[i+2];
x_type[ii]=xt;
} // if ii
}

View File

@ -1,206 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef PAIR_GPU_BALANCE_H
#define PAIR_GPU_BALANCE_H
#include "pair_gpu_device.h"
#include <math.h>
#define _HD_BALANCE_EVERY 25
#define _HD_BALANCE_WEIGHT 0.5
#define _HD_BALANCE_GAP 1.10
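// Rebalance every _HD_BALANCE_EVERY timesteps (plus the first few);
// _HD_BALANCE_GAP biases the desired split 10% toward the device so
// the GPU stays saturated rather than exactly balanced.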
/// Host/device load balancer
template<class numtyp, class acctyp>
class PairGPUBalance {
public:
inline PairGPUBalance() : _init_done(false), _measure_this_step(false) {}
inline ~PairGPUBalance() { clear(); }
/// Clear any old data and setup for new LAMMPS run
inline void init(PairGPUDevice<numtyp, acctyp> *gpu, const bool gpu_nbor,
const double split);
/// Clear all host and device data
inline void clear() {
if (_init_done) {
_device_time.clear();
_measure_this_step=false;
_init_done=false;
}
}
/// Return the timestep since initialization
inline int timestep() { return _timestep; }
/// Get a count of the number of particles host will handle for initial alloc
inline int first_host_count(const int nlocal, const double gpu_split,
const bool gpu_nbor) const {
int host_nlocal=0;
if (gpu_nbor && gpu_split!=1.0) {
if (gpu_split>0)
host_nlocal=static_cast<int>(ceil((1.0-gpu_split)*nlocal));
else
host_nlocal=static_cast<int>(ceil(0.05*nlocal));
}
return host_nlocal;
}
/// Return the number of particles the device will handle this timestep
inline int get_gpu_count(const int ago, const int inum_full);
/// Return the average fraction of particles handled by device on all procs
inline double all_avg_split() {
if (_load_balance) {
double _all_avg_split=0.0;
MPI_Reduce(&_avg_split,&_all_avg_split,1,MPI_DOUBLE,MPI_SUM,0,
_device->replica());
_all_avg_split/=_device->replica_size();
return _all_avg_split/_avg_count;
} else
return _actual_split;
}
/// If CPU neighboring, allow the device fraction to increase on 2nd timestep
inline int ago_first(int ago) const
{ if (_avg_count==1 && _actual_split<_desired_split) ago=0; return ago; }
/// Start the timer for asynchronous device execution
inline void start_timer() {
if (_measure_this_step) {
_device->gpu->sync();
_device->gpu_barrier();
_device->start_host_timer();
_device_time.start();
_device->gpu->sync();
_device->gpu_barrier();
}
}
/// Stop the timer for asynchronous device execution
inline void stop_timer() { if (_measure_this_step) { _device_time.stop(); } }
/// Calculate the new host/device split based on the cpu and device times
/** \note Only does calculation every _HD_BALANCE_EVERY timesteps
(and first 10) **/
inline void balance(const double cpu_time);
/// Calls balance() and then get_gpu_count()
inline int balance(const int ago,const int inum_full,const double cpu_time) {
balance(cpu_time);
return get_gpu_count(ago,inum_full);
}
private:
PairGPUDevice<numtyp,acctyp> *_device;
UCL_Timer _device_time;
bool _init_done, _gpu_nbor;
bool _load_balance;
double _actual_split, _avg_split, _desired_split, _max_split;
int _avg_count;
bool _measure_this_step;
int _inum, _inum_full, _timestep;
};
#define PairGPUBalanceT PairGPUBalance<numtyp,acctyp>
template <class numtyp, class acctyp>
void PairGPUBalanceT::init(PairGPUDevice<numtyp, acctyp> *gpu,
const bool gpu_nbor, const double split) {
clear();
_gpu_nbor=gpu_nbor;
_init_done=true;
_device=gpu;
_device_time.init(*gpu->gpu);
if (split<0.0) {
_load_balance=true;
_desired_split=0.90;
} else {
_load_balance=false;
_desired_split=split;
}
_actual_split=_desired_split;
_avg_split=0.0;
_avg_count=0;
_timestep=0;
}
template <class numtyp, class acctyp>
int PairGPUBalanceT::get_gpu_count(const int ago, const int inum_full) {
_measure_this_step=false;
if (_load_balance) {
if (_avg_count<11 || _timestep%_HD_BALANCE_EVERY==0) {
_measure_this_step=true;
_inum_full=inum_full;
}
if (ago==0) {
_actual_split=_desired_split;
_max_split=_desired_split;
}
}
_inum=static_cast<int>(floor(_actual_split*inum_full));
if (_inum==0) _inum++;
_timestep++;
return _inum;
}
template <class numtyp, class acctyp>
void PairGPUBalanceT::balance(const double cpu_time) {
if (_measure_this_step) {
_measure_this_step=false;
double gpu_time=_device_time.seconds();
double max_gpu_time;
MPI_Allreduce(&gpu_time,&max_gpu_time,1,MPI_DOUBLE,MPI_MAX,
_device->gpu_comm());
if (_inum_full==_inum) {
_desired_split=1.0;
return;
}
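// Estimate how many atoms the host can process in the wall time the
// device needs for its share: divide the device time left after the
// host's other (non-pair) work by the host's time per atom.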
double cpu_time_per_atom=cpu_time/(_inum_full-_inum);
double cpu_other_time=_device->host_time()-cpu_time;
int host_inum=static_cast<int>((max_gpu_time-cpu_other_time)/
cpu_time_per_atom);
double split=static_cast<double>(_inum_full-host_inum)/_inum_full;
_desired_split=split*_HD_BALANCE_GAP;
if (_desired_split>1.0)
_desired_split=1.0;
if (_desired_split<0.0)
_desired_split=0.0;
if (!_gpu_nbor) {
if (_desired_split<_max_split)
_actual_split=_desired_split;
else
_actual_split=_max_split;
}
//std::cout << gpu_time << " " << max_gpu_time << " " << cpu_other_time << " " << cpu_time_per_atom << " " << cpu_time << " " << _desired_split << " " << host_inum << std::endl;
}
_avg_split+=_desired_split;
_avg_count++;
}
#endif

View File

@ -1,300 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Peng Wang (Nvidia), penwang@nvidia.com
Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifdef NV_KERNEL
#include "nv_kernel_def.h"
texture<float4> neigh_tex;
#ifdef _DOUBLE_DOUBLE
__inline double4 fetch_pos(const int i, const double4 *pos)
{
return pos[i];
}
#else
__inline float4 fetch_pos(const int& i, const float4 *pos)
{
return tex1Dfetch(neigh_tex, i);
}
#endif
#else
#define fetch_pos(i,y) x_[i]
#define BLOCK_NBOR_BUILD 64
#endif
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp4 double4
#endif
#ifdef _SINGLE_DOUBLE
#define numtyp float
#define numtyp4 float4
#endif
#ifndef numtyp
#define numtyp float
#define numtyp4 float4
#endif
#define BLOCK_CELL_2D 8
#define SBBITS 30
__kernel void transpose(int *out, int *in, int columns_in, int rows_in)
{
__local float block[BLOCK_CELL_2D][BLOCK_CELL_2D+1];
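// The +1 column pad places consecutive rows on different shared-memory
// banks, avoiding bank conflicts on the transposed reads below.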
unsigned ti=THREAD_ID_X;
unsigned tj=THREAD_ID_Y;
unsigned bi=BLOCK_ID_X;
unsigned bj=BLOCK_ID_Y;
unsigned i=bi*BLOCK_CELL_2D+ti;
unsigned j=bj*BLOCK_CELL_2D+tj;
if ((i<columns_in) && (j<rows_in))
block[tj][ti]=in[j*columns_in+i];
__syncthreads();
i=bj*BLOCK_CELL_2D+ti;
j=bi*BLOCK_CELL_2D+tj;
if ((i<rows_in) && (j<columns_in))
out[j*rows_in+i] = block[ti][tj];
}
__kernel void calc_cell_id(numtyp4 *pos, unsigned *cell_id, int *particle_id,
numtyp boxlo0,
numtyp boxlo1, numtyp boxlo2, numtyp boxhi0,
numtyp boxhi1, numtyp boxhi2, numtyp cell_size,
int ncellx, int ncelly, int nall) {
int i = threadIdx.x + blockIdx.x*blockDim.x;
if (i < nall) {
numtyp4 p = fetch_pos(i,pos); //pos[i];
p.x -= boxlo0;
p.y -= boxlo1;
p.z -= boxlo2;
p.x = fmaxf(p.x, -cell_size);
p.x = fminf(p.x, boxhi0-boxlo0+cell_size);
p.y = fmaxf(p.y, -cell_size);
p.y = fminf(p.y, boxhi1-boxlo1+cell_size);
p.z = fmaxf(p.z, -cell_size);
p.z = fminf(p.z, boxhi2-boxlo2+cell_size);
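// Positions are clamped to one cell beyond the box so ghost atoms fall
// in the boundary layer of cells; the +1.0 shift below reserves index
// 0 in each dimension for that ghost layer.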
unsigned int id = (unsigned int)(p.x/cell_size + 1.0)
+ (unsigned int)(p.y/cell_size + 1.0) * ncellx
+ (unsigned int)(p.z/cell_size + 1.0) * ncellx * ncelly;
cell_id[i] = id;
particle_id[i] = i;
}
}
__kernel void kernel_calc_cell_counts(unsigned *cell_id,
int *cell_counts, int nall, int ncell) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
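// cell_id is sorted, so each thread checks whether its id differs from
// its left neighbor and, if so, records idx as the starting offset for
// every cell index in between; the first and last threads fill the
// leading and trailing boundary entries.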
if (idx < nall) {
int id = cell_id[idx];
// handle boundary cases
if (idx == 0) {
for (int i = 0; i < id + 1; i++)
cell_counts[i] = 0;
}
if (idx == nall - 1) {
for (int i = id+1; i <= ncell; i++)
cell_counts[i] = nall;
}
if (idx > 0 && idx < nall) {
int id_l = cell_id[idx-1];
if (id != id_l) {
for (int i = id_l+1; i <= id; i++)
cell_counts[i] = idx;
}
}
}
}
__kernel void calc_neigh_list_cell(numtyp4 *pos,
int *cell_particle_id,
int *cell_counts,
int *nbor_list,
int *host_nbor_list,
int *host_numj,
int neigh_bin_size,
numtyp cell_size,
int ncellx, int ncelly, int ncellz,
int inum, int nt, int nall)
{
int tid = threadIdx.x;
int ix = blockIdx.x;
int iy = blockIdx.y % ncelly;
int iz = blockIdx.y / ncelly;
int icell = ix + iy*ncellx + iz*ncellx*ncelly;
__shared__ int cell_list_sh[BLOCK_NBOR_BUILD];
__shared__ numtyp4 pos_sh[BLOCK_NBOR_BUILD];
int icell_begin = cell_counts[icell];
int icell_end = cell_counts[icell+1];
int nborz0 = max(iz-1,0), nborz1 = min(iz+1, ncellz-1),
nbory0 = max(iy-1,0), nbory1 = min(iy+1, ncelly-1),
nborx0 = max(ix-1,0), nborx1 = min(ix+1, ncellx-1);
numtyp4 diff;
numtyp r2;
for (int ii = 0; ii < ceil((numtyp)(icell_end - icell_begin)/blockDim.x); ii++) {
int i = icell_begin + tid + ii*blockDim.x;
int pid_i = nall, pid_j, stride;
numtyp4 atom_i, atom_j;
int cnt = 0;
int *neigh_counts, *neigh_list;
if (i < icell_end)
pid_i = cell_particle_id[i];
if (pid_i < nt) {
atom_i = fetch_pos(pid_i,pos); //pos[pid_i];
}
if (pid_i < inum) {
stride=inum;
neigh_counts=nbor_list+stride+pid_i;
neigh_list=neigh_counts+stride;
nbor_list[pid_i]=pid_i;
} else {
stride=1;
neigh_counts=host_numj+pid_i-inum;
neigh_list=host_nbor_list+(pid_i-inum)*neigh_bin_size;
}
// loop through neighbors
for (int nborz = nborz0; nborz <= nborz1; nborz++) {
for (int nbory = nbory0; nbory <= nbory1; nbory++) {
for (int nborx = nborx0; nborx <= nborx1; nborx++) {
int jcell = nborx + nbory*ncellx + nborz*ncellx*ncelly;
int jcell_begin = cell_counts[jcell];
int jcell_end = cell_counts[jcell+1];
int num_atom_cell = jcell_end - jcell_begin;
// load jcell to shared memory
int num_iter = (int)ceil((numtyp)num_atom_cell/BLOCK_NBOR_BUILD);
for (int k = 0; k < num_iter; k++) {
int end_idx = min(BLOCK_NBOR_BUILD, num_atom_cell-k*BLOCK_NBOR_BUILD);
if (tid < end_idx) {
pid_j = cell_particle_id[tid+k*BLOCK_NBOR_BUILD+jcell_begin];
cell_list_sh[tid] = pid_j;
atom_j = fetch_pos(pid_j,pos); //[pid_j];
pos_sh[tid].x = atom_j.x;
pos_sh[tid].y = atom_j.y;
pos_sh[tid].z = atom_j.z;
}
__syncthreads();
if (pid_i < nt) {
for (int j = 0; j < end_idx; j++) {
int pid_j = cell_list_sh[j]; // gather from shared memory
diff.x = atom_i.x - pos_sh[j].x;
diff.y = atom_i.y - pos_sh[j].y;
diff.z = atom_i.z - pos_sh[j].z;
r2 = diff.x*diff.x + diff.y*diff.y + diff.z*diff.z;
if (r2 < cell_size*cell_size && r2 > 1e-5) {
if (cnt < neigh_bin_size) {
*neigh_list = pid_j;
neigh_list+=stride;
}
cnt++;
}
}
}
__syncthreads();
} // for (k)
}
}
}
if (pid_i < nt)
*neigh_counts = cnt;
} // for (i)
}
__kernel void kernel_special(__global int *dev_nbor,
__global int *host_nbor_list,
__global int *host_numj, __global int *tag,
__global int *nspecial, __global int *special,
int inum, int nt, int max_nbors) {
// ii indexes the two interacting particles in gi
int ii=GLOBAL_ID_X;
if (ii<nt) {
int stride;
__global int *list, *list_end;
int n1=nspecial[ii*3];
int n2=nspecial[ii*3+1];
int n3=nspecial[ii*3+2];
int numj;
if (ii < inum) {
stride=inum;
list=dev_nbor+stride+ii;
numj=*list;
list+=stride;
} else {
stride=1;
list=host_nbor_list+(ii-inum)*max_nbors;
numj=host_numj[ii-inum];
}
list_end=list+numj*stride;
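// Each neighbor whose tag matches a special-bond partner gets its 1-2,
// 1-3, or 1-4 class (which = 1, 2, or 3) packed into the top bits of
// the stored index via SBBITS; pair kernels later unpack it with
// sbmask() to apply the special_lj scaling.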
for ( ; list<list_end; list+=stride) {
int nbor=*list;
int jtag=tag[nbor];
int offset=ii;
for (int i=0; i<n3; i++) {
if (special[offset]==jtag) {
int which = 1;
if (i>=n1)
which++;
if (i>=n2)
which++;
nbor=nbor ^ (which << SBBITS);
*list=nbor;
}
offset+=nt;
}
}
} // if ii
}

View File

@ -1,120 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
/*************************************************************************
Preprocessor Definitions
Note: It is assumed that constants with the same names are defined with
the same values in all files.
ARCH
Definition: Architecture number for accelerator
MEM_THREADS
Definition: Number of threads with sequential ids accessing memory
simultaneously on a multiprocessor
WARP_SIZE
Definition: Number of threads guaranteed to be on the same instruction
THREADS_PER_ATOM
Definition: Default number of threads assigned per atom for pair styles
Restrictions: Must be power of 2; THREADS_PER_ATOM<=WARP_SIZE
THREADS_PER_CHARGE
Definition: Default number of threads assigned per atom for pair styles
with charge
Restrictions: Must be power of 2; THREADS_PER_CHARGE<=WARP_SIZE
PPPM_MAX_SPLINE
Definition: Maximum order for splines in PPPM
PPPM_BLOCK_1D
Definition: Thread block size for PPPM kernels
Restrictions: PPPM_BLOCK_1D>=PPPM_MAX_SPLINE*PPPM_MAX_SPLINE
PPPM_BLOCK_1D%32==0
BLOCK_PAIR
Definition: Default thread block size for pair styles
Restrictions:
MAX_SHARED_TYPES
Definition: Max number of atom type params that can be stored in shared memory
Restrictions: MAX_SHARED_TYPES*MAX_SHARED_TYPES<=BLOCK_PAIR
BLOCK_CELL_2D
Definition: Default block size in each dimension for cell list builds
and matrix transpose
BLOCK_CELL_ID
Definition: Default block size for binning atoms in cell list builds
BLOCK_NBOR_BUILD
Definition: Default block size for neighbor list builds
BLOCK_BIO_PAIR
Definition: Default thread block size for "bio" pair styles
MAX_BIO_SHARED_TYPES
Definition: Max number of atom type params that can be stored in shared memory
Restrictions: MAX_BIO_SHARED_TYPES<=BLOCK_BIO_PAIR*2 &&
MAX_BIO_SHARED_TYPES>=BLOCK_BIO_PAIR
*************************************************************************/
#ifndef PAIR_GPU_DEV_KERNEL
#define PAIR_GPU_DEV_KERNEL
#ifdef NV_KERNEL
#include "nv_kernel_def.h"
#else
#define GLOBAL_ID_X get_global_id(0)
#define ARCH 0
#define DRIVER 0
#define MEM_THREADS 16
#define WARP_SIZE 1
#define THREADS_PER_ATOM 1
#define THREADS_PER_CHARGE 1
#define BLOCK_PAIR 64
#define MAX_SHARED_TYPES 8
#define BLOCK_NBOR_BUILD 64
#define BLOCK_BIO_PAIR 64
#endif
#define PPPM_MAX_SPLINE 8
#define PPPM_BLOCK_1D 64
#define BLOCK_CELL_2D 8
#define BLOCK_CELL_ID 128
#define MAX_BIO_SHARED_TYPES 128
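/* Hedged compile-time sanity checks (not in the original source): these
   mirror the restrictions documented above; flip the "#if 0" to enable. */
#if 0
#if PPPM_BLOCK_1D < PPPM_MAX_SPLINE*PPPM_MAX_SPLINE
#error "PPPM_BLOCK_1D must be >= PPPM_MAX_SPLINE*PPPM_MAX_SPLINE"
#endif
#if MAX_SHARED_TYPES*MAX_SHARED_TYPES > BLOCK_PAIR
#error "MAX_SHARED_TYPES*MAX_SHARED_TYPES must be <= BLOCK_PAIR"
#endif
#if MAX_BIO_SHARED_TYPES > BLOCK_BIO_PAIR*2 || MAX_BIO_SHARED_TYPES < BLOCK_BIO_PAIR
#error "MAX_BIO_SHARED_TYPES must lie in [BLOCK_BIO_PAIR, 2*BLOCK_BIO_PAIR]"
#endif
#endif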
__kernel void kernel_zero(__global int *mem, int numel) {
int ii=GLOBAL_ID_X;
if (ii<numel)
mem[ii]=0;
}
__kernel void kernel_info(__global int *info) {
info[0]=ARCH;
info[1]=MEM_THREADS;
info[2]=WARP_SIZE;
info[3]=THREADS_PER_ATOM;
info[4]=PPPM_MAX_SPLINE;
info[5]=PPPM_BLOCK_1D;
info[6]=BLOCK_PAIR;
info[7]=MAX_SHARED_TYPES;
info[8]=BLOCK_CELL_2D;
info[9]=BLOCK_CELL_ID;
info[10]=BLOCK_NBOR_BUILD;
info[11]=BLOCK_BIO_PAIR;
info[12]=MAX_BIO_SHARED_TYPES;
info[13]=THREADS_PER_CHARGE;
}
#endif

View File

@ -1,614 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#include "pair_gpu_device.h"
#include "pair_gpu_precision.h"
#include <map>
#include <math.h>
#ifdef _OPENMP
#include <omp.h>
#endif
#ifdef USE_OPENCL
#include "pair_gpu_dev_cl.h"
#else
#include "pair_gpu_dev_ptx.h"
#endif
#define PairGPUDeviceT PairGPUDevice<numtyp, acctyp>
template <class numtyp, class acctyp>
PairGPUDeviceT::PairGPUDevice() : _init_count(0), _device_init(false),
_gpu_mode(GPU_FORCE), _first_device(0),
_last_device(0), _compiled(false) {
}
template <class numtyp, class acctyp>
PairGPUDeviceT::~PairGPUDevice() {
clear_device();
}
template <class numtyp, class acctyp>
int PairGPUDeviceT::init_device(MPI_Comm world, MPI_Comm replica,
const int first_gpu, const int last_gpu,
const int gpu_mode, const double p_split,
const int nthreads, const int t_per_atom) {
_nthreads=nthreads;
#ifdef _OPENMP
omp_set_num_threads(nthreads);
#endif
_threads_per_atom=t_per_atom;
_threads_per_charge=t_per_atom;
if (_device_init)
return 0;
_device_init=true;
_comm_world=world;
_comm_replica=replica;
_first_device=first_gpu;
_last_device=last_gpu;
_gpu_mode=gpu_mode;
_particle_split=p_split;
// Get the rank/size within the world
MPI_Comm_rank(_comm_world,&_world_me);
MPI_Comm_size(_comm_world,&_world_size);
// Get the rank/size within the replica
MPI_Comm_rank(_comm_replica,&_replica_me);
MPI_Comm_size(_comm_replica,&_replica_size);
// Get the names of all nodes
int name_length;
char node_name[MPI_MAX_PROCESSOR_NAME];
char node_names[MPI_MAX_PROCESSOR_NAME*_world_size];
MPI_Get_processor_name(node_name,&name_length);
MPI_Allgather(&node_name,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,&node_names,
MPI_MAX_PROCESSOR_NAME,MPI_CHAR,_comm_world);
std::string node_string=std::string(node_name);
// Get the number of procs per node
std::map<std::string,int> name_map;
std::map<std::string,int>::iterator np;
for (int i=0; i<_world_size; i++) {
std::string i_string=std::string(&node_names[i*MPI_MAX_PROCESSOR_NAME]);
np=name_map.find(i_string);
if (np==name_map.end())
name_map[i_string]=1;
else
np->second++;
}
int procs_per_node=name_map.begin()->second;
// Assign a unique id to each node
int split_num=0, split_id=0;
for (np=name_map.begin(); np!=name_map.end(); ++np) {
if (np->first==node_string)
split_id=split_num;
split_num++;
}
// Set up a per node communicator and find rank within
MPI_Comm node_comm;
MPI_Comm_split(_comm_world, split_id, 0, &node_comm);
int node_rank;
MPI_Comm_rank(node_comm,&node_rank);
// set the device ID
_procs_per_gpu=static_cast<int>(ceil(static_cast<double>(procs_per_node)/
(last_gpu-first_gpu+1)));
int my_gpu=node_rank/_procs_per_gpu+first_gpu;
// Time on the device only if 1 proc per gpu
_time_device=true;
if (_procs_per_gpu>1)
_time_device=false;
// Set up a per device communicator
MPI_Comm_split(node_comm,my_gpu,0,&_comm_gpu);
MPI_Comm_rank(_comm_gpu,&_gpu_rank);
gpu=new UCL_Device();
if (my_gpu>=gpu->num_devices())
return -2;
gpu->set(my_gpu);
_long_range_precompute=0;
int flag=compile_kernels();
return flag;
}
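// Worked assignment example (illustrative): with 8 procs on a node and
// devices 0-1 requested (first_gpu=0, last_gpu=1), _procs_per_gpu =
// ceil(8/2) = 4, so node ranks 0-3 share device 0 and ranks 4-7 share
// device 1; device timing is then disabled because _procs_per_gpu > 1.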
template <class numtyp, class acctyp>
int PairGPUDeviceT::init(PairGPUAns<numtyp,acctyp> &ans, const bool charge,
const bool rot, const int nlocal,
const int host_nlocal, const int nall,
PairGPUNbor *nbor, const int maxspecial,
const int gpu_host, const int max_nbors,
const double cell_size, const bool pre_cut) {
if (!_device_init)
return -1;
if (sizeof(acctyp)==sizeof(double) && gpu->double_precision()==false)
return -5;
// Counts of data transfers for timing overhead estimates
_data_in_estimate=0;
_data_out_estimate=1;
// Initial number of local particles
int ef_nlocal=nlocal;
if (_particle_split<1.0 && _particle_split>0.0)
ef_nlocal=static_cast<int>(_particle_split*nlocal);
bool gpu_nbor=false;
if (_gpu_mode==GPU_NEIGH)
gpu_nbor=true;
if (_init_count==0) {
// Initialize atom and nbor data
if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor && maxspecial>0))
return -3;
_data_in_estimate++;
if (charge)
_data_in_estimate++;
if (rot)
_data_in_estimate++;
} else {
if (atom.charge()==false && charge)
_data_in_estimate++;
if (atom.quat()==false && rot)
_data_in_estimate++;
if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor && maxspecial))
return -3;
}
if (!ans.init(ef_nlocal,charge,rot,*gpu))
return -3;
if (!nbor->init(&_nbor_shared,ef_nlocal,host_nlocal,max_nbors,maxspecial,
*gpu,gpu_nbor,gpu_host,pre_cut, _block_cell_2d,
_block_cell_id, _block_nbor_build))
return -3;
nbor->cell_size(cell_size);
_init_count++;
return 0;
}
template <class numtyp, class acctyp>
int PairGPUDeviceT::init(PairGPUAns<numtyp,acctyp> &ans, const int nlocal,
const int nall) {
if (!_device_init)
return -1;
if (sizeof(acctyp)==sizeof(double) && gpu->double_precision()==false)
return -5;
if (_init_count==0) {
// Initialize atom and nbor data
if (!atom.init(nall,true,false,*gpu,false,false))
return -3;
} else
if (!atom.add_fields(true,false,false,false))
return -3;
if (!ans.init(nlocal,true,false,*gpu))
return -3;
_init_count++;
return 0;
}
template <class numtyp, class acctyp>
void PairGPUDeviceT::set_single_precompute
(PPPMGPUMemory<numtyp,acctyp,float,_lgpu_float4> *pppm) {
_long_range_precompute=1;
pppm_single=pppm;
}
template <class numtyp, class acctyp>
void PairGPUDeviceT::set_double_precompute
(PPPMGPUMemory<numtyp,acctyp,double,_lgpu_double4> *pppm) {
_long_range_precompute=2;
pppm_double=pppm;
}
template <class numtyp, class acctyp>
void PairGPUDeviceT::init_message(FILE *screen, const char *name,
const int first_gpu, const int last_gpu) {
#ifdef USE_OPENCL
std::string fs="";
#else
std::string fs=toa(gpu->free_gigabytes())+"/";
#endif
if (_replica_me == 0 && screen) {
fprintf(screen,"\n-------------------------------------");
fprintf(screen,"-------------------------------------\n");
fprintf(screen,"- Using GPGPU acceleration for %s:\n",name);
fprintf(screen,"- with %d proc(s) per device.\n",_procs_per_gpu);
#ifdef _OPENMP
fprintf(screen,"- with %d thread(s) per proc.\n",_nthreads);
#endif
fprintf(screen,"-------------------------------------");
fprintf(screen,"-------------------------------------\n");
int last=last_gpu+1;
if (last>gpu->num_devices())
last=gpu->num_devices();
for (int i=first_gpu; i<last; i++) {
std::string sname=gpu->name(i)+", "+toa(gpu->cores(i))+" cores, "+fs+
toa(gpu->gigabytes(i))+" GB, "+toa(gpu->clock_rate(i))+
" GHZ (";
if (sizeof(PRECISION)==4) {
if (sizeof(ACC_PRECISION)==4)
sname+="Single Precision)";
else
sname+="Mixed Precision)";
} else
sname+="Double Precision)";
fprintf(screen,"GPU %d: %s\n",i,sname.c_str());
}
fprintf(screen,"-------------------------------------");
fprintf(screen,"-------------------------------------\n\n");
}
}
template <class numtyp, class acctyp>
void PairGPUDeviceT::estimate_gpu_overhead(const int kernel_calls,
double &gpu_overhead,
double &gpu_driver_overhead) {
UCL_H_Vec<int> *host_data_in=NULL, *host_data_out=NULL;
UCL_D_Vec<int> *dev_data_in=NULL, *dev_data_out=NULL, *kernel_data=NULL;
UCL_Timer *timers_in=NULL, *timers_out=NULL, *timers_kernel=NULL;
UCL_Timer over_timer(*gpu);
if (_data_in_estimate>0) {
host_data_in=new UCL_H_Vec<int>[_data_in_estimate];
dev_data_in=new UCL_D_Vec<int>[_data_in_estimate];
timers_in=new UCL_Timer[_data_in_estimate];
}
if (_data_out_estimate>0) {
host_data_out=new UCL_H_Vec<int>[_data_out_estimate];
dev_data_out=new UCL_D_Vec<int>[_data_out_estimate];
timers_out=new UCL_Timer[_data_out_estimate];
}
if (kernel_calls>0) {
kernel_data=new UCL_D_Vec<int>[kernel_calls];
timers_kernel=new UCL_Timer[kernel_calls];
}
for (int i=0; i<_data_in_estimate; i++) {
host_data_in[i].alloc(1,*gpu);
dev_data_in[i].alloc(1,*gpu);
timers_in[i].init(*gpu);
}
for (int i=0; i<_data_out_estimate; i++) {
host_data_out[i].alloc(1,*gpu);
dev_data_out[i].alloc(1,*gpu);
timers_out[i].init(*gpu);
}
for (int i=0; i<kernel_calls; i++) {
kernel_data[i].alloc(1,*gpu);
timers_kernel[i].init(*gpu);
}
gpu_overhead=0.0;
gpu_driver_overhead=0.0;
for (int i=0; i<10; i++) {
gpu->sync();
gpu_barrier();
over_timer.start();
gpu->sync();
gpu_barrier();
double driver_time=MPI_Wtime();
for (int i=0; i<_data_in_estimate; i++) {
timers_in[i].start();
ucl_copy(dev_data_in[i],host_data_in[i],true);
timers_in[i].stop();
}
for (int i=0; i<kernel_calls; i++) {
timers_kernel[i].start();
zero(kernel_data[i],1);
timers_kernel[i].stop();
}
for (int i=0; i<_data_out_estimate; i++) {
timers_out[i].start();
ucl_copy(host_data_out[i],dev_data_out[i],true);
timers_out[i].stop();
}
over_timer.stop();
double time=over_timer.seconds();
driver_time=MPI_Wtime()-driver_time;
if (time_device()) {
for (int i=0; i<_data_in_estimate; i++)
timers_in[i].add_to_total();
for (int i=0; i<kernel_calls; i++)
timers_kernel[i].add_to_total();
for (int i=0; i<_data_out_estimate; i++)
timers_out[i].add_to_total();
}
double mpi_time, mpi_driver_time;
MPI_Allreduce(&time,&mpi_time,1,MPI_DOUBLE,MPI_MAX,gpu_comm());
MPI_Allreduce(&driver_time,&mpi_driver_time,1,MPI_DOUBLE,MPI_MAX,gpu_comm());
gpu_overhead+=mpi_time;
gpu_driver_overhead+=mpi_driver_time;
}
gpu_overhead/=10.0;
gpu_driver_overhead/=10.0;
if (_data_in_estimate>0) {
delete [] host_data_in;
delete [] dev_data_in;
delete [] timers_in;
}
if (_data_out_estimate>0) {
delete [] host_data_out;
delete [] dev_data_out;
delete [] timers_out;
}
if (kernel_calls>0) {
delete [] kernel_data;
delete [] timers_kernel;
}
}
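// Estimation note (illustrative): each of the 10 trials times a full
// round trip (async host-to-device copies, one zero kernel per expected
// kernel call, device-to-host copies); the MPI_MAX reduction over the
// procs sharing a device lets the slowest proc define the per-step
// overhead, and the final division by 10.0 reports the trial average.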
template <class numtyp, class acctyp>
void PairGPUDeviceT::output_times(UCL_Timer &time_pair,
PairGPUAns<numtyp,acctyp> &ans,
PairGPUNbor &nbor, const double avg_split,
const double max_bytes,
const double gpu_overhead,
const double driver_overhead,
const int threads_per_atom, FILE *screen) {
double single[8], times[8];
single[0]=atom.transfer_time()+ans.transfer_time();
single[1]=nbor.time_nbor.total_seconds();
single[2]=nbor.time_kernel.total_seconds();
single[3]=time_pair.total_seconds();
single[4]=atom.cast_time()+ans.cast_time();
single[5]=gpu_overhead;
single[6]=driver_overhead;
single[7]=ans.cpu_idle_time();
MPI_Reduce(single,times,8,MPI_DOUBLE,MPI_SUM,0,_comm_replica);
double my_max_bytes=max_bytes+atom.max_gpu_bytes();
double mpi_max_bytes;
MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,_comm_replica);
double max_mb=mpi_max_bytes/(1024.0*1024.0);
if (replica_me()==0)
if (screen && times[5]>0.0) {
fprintf(screen,"\n\n-------------------------------------");
fprintf(screen,"--------------------------------\n");
fprintf(screen," GPU Time Info (average): ");
fprintf(screen,"\n-------------------------------------");
fprintf(screen,"--------------------------------\n");
if (time_device()) {
fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/_replica_size);
fprintf(screen,"Data Cast/Pack: %.4f s.\n",times[4]/_replica_size);
fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/_replica_size);
if (nbor.gpu_nbor())
fprintf(screen,"Neighbor build: %.4f s.\n",times[2]/_replica_size);
else
fprintf(screen,"Neighbor unpack: %.4f s.\n",times[2]/_replica_size);
fprintf(screen,"Force calc: %.4f s.\n",times[3]/_replica_size);
}
fprintf(screen,"GPU Overhead: %.4f s.\n",times[5]/_replica_size);
fprintf(screen,"Average split: %.4f.\n",avg_split);
fprintf(screen,"Threads / atom: %d.\n",threads_per_atom);
fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
fprintf(screen,"CPU Driver_Time: %.4f s.\n",times[6]/_replica_size);
fprintf(screen,"CPU Idle_Time: %.4f s.\n",times[7]/_replica_size);
fprintf(screen,"-------------------------------------");
fprintf(screen,"--------------------------------\n\n");
}
}
template <class numtyp, class acctyp>
void PairGPUDeviceT::output_kspace_times(UCL_Timer &time_in,
UCL_Timer &time_out,
UCL_Timer &time_map,
UCL_Timer &time_rho,
UCL_Timer &time_interp,
PairGPUAns<numtyp,acctyp> &ans,
const double max_bytes,
const double cpu_time,
const double idle_time, FILE *screen) {
double single[8], times[8];
single[0]=time_out.total_seconds();
single[1]=time_in.total_seconds()+atom.transfer_time()+atom.cast_time();
single[2]=time_map.total_seconds();
single[3]=time_rho.total_seconds();
single[4]=time_interp.total_seconds();
single[5]=ans.transfer_time()+ans.cast_time();
single[6]=cpu_time;
single[7]=idle_time;
MPI_Reduce(single,times,8,MPI_DOUBLE,MPI_SUM,0,_comm_replica);
double my_max_bytes=max_bytes+atom.max_gpu_bytes();
double mpi_max_bytes;
MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,_comm_replica);
double max_mb=mpi_max_bytes/(1024.0*1024.0);
if (replica_me()==0)
if (screen && times[6]>0.0) {
fprintf(screen,"\n\n-------------------------------------");
fprintf(screen,"--------------------------------\n");
fprintf(screen," GPU Time Info (average): ");
fprintf(screen,"\n-------------------------------------");
fprintf(screen,"--------------------------------\n");
if (time_device()) {
fprintf(screen,"Data Out: %.4f s.\n",times[0]/_replica_size);
fprintf(screen,"Data In: %.4f s.\n",times[1]/_replica_size);
fprintf(screen,"Kernel (map): %.4f s.\n",times[2]/_replica_size);
fprintf(screen,"Kernel (rho): %.4f s.\n",times[3]/_replica_size);
fprintf(screen,"Force interp: %.4f s.\n",times[4]/_replica_size);
fprintf(screen,"Total rho: %.4f s.\n",
(times[0]+times[2]+times[3])/_replica_size);
fprintf(screen,"Total interp: %.4f s.\n",
(times[1]+times[4])/_replica_size);
fprintf(screen,"Force copy/cast: %.4f s.\n",times[5]/_replica_size);
fprintf(screen,"Total: %.4f s.\n",
(times[0]+times[1]+times[2]+times[3]+times[4]+times[5])/
_replica_size);
}
fprintf(screen,"CPU Poisson: %.4f s.\n",times[6]/_replica_size);
fprintf(screen,"CPU Idle Time: %.4f s.\n",times[7]/_replica_size);
fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
fprintf(screen,"-------------------------------------");
fprintf(screen,"--------------------------------\n\n");
}
}
template <class numtyp, class acctyp>
void PairGPUDeviceT::clear() {
if (_init_count>0) {
_long_range_precompute=0;
_init_count--;
if (_init_count==0) {
atom.clear();
_nbor_shared.clear();
if (_compiled) {
k_zero.clear();
k_info.clear();
delete dev_program;
_compiled=false;
}
}
}
}
template <class numtyp, class acctyp>
void PairGPUDeviceT::clear_device() {
while (_init_count>0)
clear();
if (_device_init) {
delete gpu;
_device_init=false;
}
}
template <class numtyp, class acctyp>
int PairGPUDeviceT::compile_kernels() {
int flag=0;
if (_compiled)
return flag;
std::string flags="-cl-mad-enable";
dev_program=new UCL_Program(*gpu);
int success=dev_program->load_string(pair_gpu_dev_kernel,flags.c_str());
if (success!=UCL_SUCCESS)
return -4;
k_zero.set_function(*dev_program,"kernel_zero");
k_info.set_function(*dev_program,"kernel_info");
_compiled=true;
UCL_H_Vec<int> h_gpu_lib_data(14,*gpu,UCL_NOT_PINNED);
UCL_D_Vec<int> d_gpu_lib_data(14,*gpu);
k_info.set_size(1,1);
k_info.run(&d_gpu_lib_data.begin());
ucl_copy(h_gpu_lib_data,d_gpu_lib_data,false);
_ptx_arch=static_cast<double>(h_gpu_lib_data[0])/100.0;
#ifndef USE_OPENCL
if (_ptx_arch>gpu->arch())
return -4;
#endif
_num_mem_threads=h_gpu_lib_data[1];
_warp_size=h_gpu_lib_data[2];
if (_threads_per_atom<1)
_threads_per_atom=h_gpu_lib_data[3];
if (_threads_per_charge<1)
_threads_per_charge=h_gpu_lib_data[13];
_pppm_max_spline=h_gpu_lib_data[4];
_pppm_block=h_gpu_lib_data[5];
_block_pair=h_gpu_lib_data[6];
_max_shared_types=h_gpu_lib_data[7];
_block_cell_2d=h_gpu_lib_data[8];
_block_cell_id=h_gpu_lib_data[9];
_block_nbor_build=h_gpu_lib_data[10];
_block_bio_pair=h_gpu_lib_data[11];
_max_bio_shared_types=h_gpu_lib_data[12];
if (static_cast<size_t>(_block_pair)>gpu->group_size())
_block_pair=gpu->group_size();
if (static_cast<size_t>(_block_bio_pair)>gpu->group_size())
_block_bio_pair=gpu->group_size();
if (_threads_per_atom>_warp_size)
_threads_per_atom=_warp_size;
if (_warp_size%_threads_per_atom!=0)
_threads_per_atom=1;
if (_threads_per_charge>_warp_size)
_threads_per_charge=_warp_size;
if (_warp_size%_threads_per_charge!=0)
_threads_per_charge=1;
return flag;
}
template <class numtyp, class acctyp>
double PairGPUDeviceT::host_memory_usage() const {
return atom.host_memory_usage()+4*sizeof(numtyp)+
sizeof(PairGPUDevice<numtyp,acctyp>);
}
template class PairGPUDevice<PRECISION,ACC_PRECISION>;
PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
int lmp_init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
const int last_gpu, const int gpu_mode,
const double particle_split, const int nthreads,
const int t_per_atom) {
return pair_gpu_device.init_device(world,replica,first_gpu,last_gpu,gpu_mode,
particle_split,nthreads,t_per_atom);
}
void lmp_clear_device() {
pair_gpu_device.clear_device();
}
double lmp_gpu_forces(double **f, double **tor, double *eatom,
double **vatom, double *virial, double &ecoul) {
return pair_gpu_device.fix_gpu(f,tor,eatom,vatom,virial,ecoul);
}
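/* Hedged usage sketch (assumed caller, not part of this file): a LAMMPS
   fix would drive these C bindings roughly as

     int err = lmp_init_device(world, replica, 0, ngpu-1, gpu_mode,
                               1.0, nthreads, 1);
     if (err != 0) ...           // map the error code to a message
     ...                         // per-step pair computations
     double evdwl = lmp_gpu_forces(f, torque, eatom, vatom, virial, ecoul);
     lmp_clear_device();

   where world, replica, ngpu, gpu_mode, nthreads, and the force arrays
   stand in for the real fix-gpu plumbing. */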

View File

@ -1,311 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef PAIR_GPU_DEVICE_H
#define PAIR_GPU_DEVICE_H
#include "pair_gpu_atom.h"
#include "pair_gpu_ans.h"
#include "pair_gpu_nbor.h"
#include "pppm_gpu_memory.h"
#include "mpi.h"
#include <sstream>
#include "stdio.h"
#include <string>
#include <queue>
template <class numtyp, class acctyp,
class grdtyp, class grdtyp4> class PPPMGPUMemory;
template <class numtyp, class acctyp>
class PairGPUDevice {
public:
PairGPUDevice();
~PairGPUDevice();
/// Initialize the device for use by this process
/** Sets up a per-device MPI communicator for load balancing and initializes
* the device (>=first_gpu and <=last_gpu) that this proc will be using
* Returns:
* - 0 if successful
* - -2 if GPU not found
* - -4 if the GPU library was not compiled for this GPU **/
int init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
const int last_gpu, const int gpu_mode,
const double particle_split, const int nthreads,
const int t_per_atom);
/// Initialize the device for Atom and Neighbor storage
/** \param rot True if quaternions need to be stored
* \param nlocal Total number of local particles to allocate memory for
* \param host_nlocal Initial number of host particles to allocate memory for
* \param nall Total number of local+ghost particles
* \param gpu_nbor True if neighboring is performed on device
* \param gpu_host 0 if host will not perform force calculations,
* 1 if gpu_nbor is true, and host needs a half nbor list,
* 2 if gpu_nbor is true, and host needs a full nbor list
* \param max_nbors Initial number of rows in the neighbor matrix
* \param cell_size cutoff+skin
* \param pre_cut True if cutoff test will be performed in separate kernel
* than the force kernel
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for this GPU
* - -5 if double precision is not supported on the card **/
int init(PairGPUAns<numtyp,acctyp> &a, const bool charge, const bool rot,
const int nlocal, const int host_nlocal, const int nall,
PairGPUNbor *nbor, const int maxspecial, const int gpu_host,
const int max_nbors, const double cell_size, const bool pre_cut);
/// Initialize the device for Atom storage only
/** \param nlocal Total number of local particles to allocate memory for
* \param nall Total number of local+ghost particles
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for this GPU
* - -5 if double precision is not supported on the card **/
int init(PairGPUAns<numtyp,acctyp> &ans, const int nlocal, const int nall);
/// Output a message for pair_style acceleration with device stats
void init_message(FILE *screen, const char *name,
const int first_gpu, const int last_gpu);
/// Perform charge assignment asynchronously for PPPM
void set_single_precompute(PPPMGPUMemory<numtyp,acctyp,
float,_lgpu_float4> *pppm);
/// Perform charge assignment asynchronously for PPPM
void set_double_precompute(PPPMGPUMemory<numtyp,acctyp,
double,_lgpu_double4> *pppm);
/// Estimate the overhead from GPU calls from multiple procs
/** \param kernel_calls Number of kernel calls/timestep for timing estimated
* overhead
* \param gpu_overhead Estimated gpu overhead per timestep (sec)
* \param driver_overhead Estimated overhead from driver per timestep (s) **/
void estimate_gpu_overhead(const int kernel_calls, double &gpu_overhead,
double &gpu_driver_overhead);
/// Returns true if double precision is supported on card
inline bool double_precision() { return gpu->double_precision(); }
/// Output a message with timing information
void output_times(UCL_Timer &time_pair, PairGPUAns<numtyp,acctyp> &ans,
PairGPUNbor &nbor, const double avg_split,
const double max_bytes, const double gpu_overhead,
const double driver_overhead,
const int threads_per_atom, FILE *screen);
/// Output a message with timing information
void output_kspace_times(UCL_Timer &time_in, UCL_Timer &time_out,
UCL_Timer & time_map, UCL_Timer & time_rho,
UCL_Timer &time_interp,
PairGPUAns<numtyp,acctyp> &ans,
const double max_bytes, const double cpu_time,
const double cpu_idle_time, FILE *screen);
/// Clear all memory on host and device associated with atom and nbor data
void clear();
/// Clear all memory on host and device
void clear_device();
/// Add an answer object for putting forces, energies, etc from GPU to LAMMPS
inline void add_ans_object(PairGPUAns<numtyp,acctyp> *ans)
{ ans_queue.push(ans); }
/// Add "answers" (force,energies,etc.) into LAMMPS structures
inline double fix_gpu(double **f, double **tor, double *eatom,
double **vatom, double *virial, double &ecoul) {
atom.data_unavail();
if (ans_queue.empty()==false) {
stop_host_timer();
double evdw=0.0;
while (ans_queue.empty()==false) {
evdw+=ans_queue.front()->get_answers(f,tor,eatom,vatom,virial,ecoul);
ans_queue.pop();
}
return evdw;
}
return 0.0;
}
/// Start timer on host
inline void start_host_timer()
{ _cpu_full=MPI_Wtime(); _host_timer_started=true; }
/// Stop timer on host
inline void stop_host_timer() {
if (_host_timer_started) {
_cpu_full=MPI_Wtime()-_cpu_full;
_host_timer_started=false;
}
}
/// Return host time
inline double host_time() { return _cpu_full; }
/// Return host memory usage in bytes
double host_memory_usage() const;
/// Return the number of procs sharing a device (size of device communicator)
inline int procs_per_gpu() const { return _procs_per_gpu; }
/// Return the number of threads per proc
inline int num_threads() const { return _nthreads; }
/// My rank within all processes
inline int world_me() const { return _world_me; }
/// Total number of processes
inline int world_size() const { return _world_size; }
/// MPI Barrier for world
inline void world_barrier() { MPI_Barrier(_comm_world); }
/// Return the replica MPI communicator
inline MPI_Comm & replica() { return _comm_replica; }
/// My rank within replica communicator
inline int replica_me() const { return _replica_me; }
/// Number of procs in replica communicator
inline int replica_size() const { return _replica_size; }
/// Return the per-GPU MPI communicator
inline MPI_Comm & gpu_comm() { return _comm_gpu; }
/// Return my rank in the device communicator
inline int gpu_rank() const { return _gpu_rank; }
/// MPI Barrier for gpu
inline void gpu_barrier() { MPI_Barrier(_comm_gpu); }
/// Return the 'mode' for acceleration: GPU_FORCE or GPU_NEIGH
inline int gpu_mode() const { return _gpu_mode; }
/// Index of first device used by a node
inline int first_device() const { return _first_device; }
/// Index of last device used by a node
inline int last_device() const { return _last_device; }
/// Particle split defined in fix
inline double particle_split() const { return _particle_split; }
/// Return the initialization count for the device
inline int init_count() const { return _init_count; }
/// True if device is being timed
inline bool time_device() const { return _time_device; }
/// Return the number of threads accessing memory simultaneously
inline int num_mem_threads() const { return _num_mem_threads; }
/// Return the number of threads per atom for pair styles
inline int threads_per_atom() const { return _threads_per_atom; }
/// Return the number of threads per atom for pair styles using charge
inline int threads_per_charge() const { return _threads_per_charge; }
/// Return the min of the pair block size or the device max block size
inline int pair_block_size() const { return _block_pair; }
/// Return the maximum number of atom types that can be used with shared mem
inline int max_shared_types() const { return _max_shared_types; }
/// Return the maximum order for PPPM splines
inline int pppm_max_spline() const { return _pppm_max_spline; }
/// Return the block size for PPPM kernels
inline int pppm_block() const { return _pppm_block; }
/// Return the block size for neighbor binning
inline int block_cell_2d() const { return _block_cell_2d; }
/// Return the block size for atom mapping for neighbor builds
inline int block_cell_id() const { return _block_cell_id; }
/// Return the block size for neighbor build kernel
inline int block_nbor_build() const { return _block_nbor_build; }
/// Return the block size for "bio" pair styles
inline int block_bio_pair() const { return _block_bio_pair; }
/// Return the maximum number of atom types for shared mem with "bio" styles
inline int max_bio_shared_types() const { return _max_bio_shared_types; }
/// Architecture gpu code compiled for (returns 0 for OpenCL)
inline double ptx_arch() const { return _ptx_arch; }
// -------------------- SHARED DEVICE ROUTINES --------------------
// Perform asynchronous zero of integer array
void zero(UCL_D_Vec<int> &mem, const int numel) {
int num_blocks=static_cast<int>(ceil(static_cast<double>(numel)/
_block_pair));
k_zero.set_size(num_blocks,_block_pair);
k_zero.run(&mem.begin(),&numel);
}
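// Launch-geometry example (illustrative): zeroing numel=1000 ints with
// _block_pair=64 gives ceil(1000/64)=16 blocks of 64 threads; the
// ii<numel guard in kernel_zero discards the 24 surplus threads in the
// last block.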
// -------------------------- DEVICE DATA -------------------------
/// Geryon Device
UCL_Device *gpu;
enum{GPU_FORCE, GPU_NEIGH};
// --------------------------- ATOM DATA --------------------------
/// Atom Data
PairGPUAtom<numtyp,acctyp> atom;
// --------------------------- NBOR DATA ----------------------------
/// Neighbor Data
PairGPUNborShared _nbor_shared;
// ------------------------ LONG RANGE DATA -------------------------
// Long Range Data
int _long_range_precompute;
PPPMGPUMemory<numtyp,acctyp,float,_lgpu_float4> *pppm_single;
PPPMGPUMemory<numtyp,acctyp,double,_lgpu_double4> *pppm_double;
/// Precomputations for long range charge assignment (asynchronously)
inline void precompute(const int ago, const int nlocal, const int nall,
double **host_x, int *host_type, bool &success,
double *charge, double *boxlo, double *prd) {
if (_long_range_precompute==1)
pppm_single->precompute(ago,nlocal,nall,host_x,host_type,success,charge,
boxlo,prd);
else if (_long_range_precompute==2)
pppm_double->precompute(ago,nlocal,nall,host_x,host_type,success,charge,
boxlo,prd);
}
private:
std::queue<PairGPUAns<numtyp,acctyp> *> ans_queue;
int _init_count;
bool _device_init, _host_timer_started, _time_device;
MPI_Comm _comm_world, _comm_replica, _comm_gpu;
int _procs_per_gpu, _gpu_rank, _world_me, _world_size, _replica_me,
_replica_size;
int _gpu_mode, _first_device, _last_device, _nthreads;
double _particle_split;
double _cpu_full;
double _ptx_arch;
int _num_mem_threads, _warp_size, _threads_per_atom, _threads_per_charge;
int _pppm_max_spline, _pppm_block;
int _block_pair, _max_shared_types;
int _block_cell_2d, _block_cell_id, _block_nbor_build;
int _block_bio_pair, _max_bio_shared_types;
UCL_Program *dev_program;
UCL_Kernel k_zero, k_info;
bool _compiled;
int compile_kernels();
int _data_in_estimate, _data_out_estimate;
template <class t>
inline std::string toa(const t& in) {
std::ostringstream o;
o.precision(2);
o << in;
return o.str();
}
};
#endif

View File

@ -1,406 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
Peng Wang (Nvidia), penwang@nvidia.com
------------------------------------------------------------------------- */
#include "pair_gpu_precision.h"
#include "pair_gpu_nbor.h"
#include "pair_gpu_device.h"
#include "math.h"
int PairGPUNbor::bytes_per_atom(const int max_nbors) const {
if (_gpu_nbor)
return (max_nbors+2)*sizeof(int);
else if (_use_packing)
return ((max_nbors+2)*2)*sizeof(int);
else
return (max_nbors+3)*sizeof(int);
}
bool PairGPUNbor::init(PairGPUNborShared *shared, const int inum,
const int host_inum, const int max_nbors,
const int maxspecial, UCL_Device &devi,
const bool gpu_nbor, const int gpu_host,
const bool pre_cut, const int block_cell_2d,
const int block_cell_id, const int block_nbor_build) {
clear();
_block_cell_2d=block_cell_2d;
_block_cell_id=block_cell_id;
_block_nbor_build=block_nbor_build;
_shared=shared;
dev=&devi;
_gpu_nbor=gpu_nbor;
if (gpu_host==0)
_gpu_host=false;
else if (gpu_host==1)
_gpu_host=true;
else
// Not yet implemented
assert(0==1);
if (pre_cut || gpu_nbor==false)
_alloc_packed=true;
else
_alloc_packed=false;
bool success=true;
// Initialize timers for the selected GPU
time_nbor.init(*dev);
time_kernel.init(*dev);
time_nbor.zero();
time_kernel.zero();
_max_atoms=static_cast<int>(static_cast<double>(inum)*1.10);
if (_max_atoms==0)
_max_atoms=1000;
_max_host=static_cast<int>(static_cast<double>(host_inum)*1.10);
_max_nbors=max_nbors;
_maxspecial=maxspecial;
if (gpu_nbor==false)
_maxspecial=0;
if (gpu_nbor==false)
success=success && (host_packed.alloc(2*IJ_SIZE,*dev,
UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
alloc(success);
if (!success)
return false;
if (_use_packing==false)
_shared->compile_kernels(devi,gpu_nbor);
return success;
}
void PairGPUNbor::alloc(bool &success) {
dev_nbor.clear();
host_acc.clear();
int nt=_max_atoms+_max_host;
if (_use_packing==false || _gpu_nbor)
success=success && (dev_nbor.alloc((_max_nbors+2)*_max_atoms,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
else
success=success && (dev_nbor.alloc(3*_max_atoms,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
success=success && (host_acc.alloc(nt*2,*dev,
UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
_c_bytes=dev_nbor.row_bytes();
if (_alloc_packed) {
dev_packed.clear();
success=success && (dev_packed.alloc((_max_nbors+2)*_max_atoms,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
_c_bytes+=dev_packed.row_bytes();
}
if (_max_host>0) {
host_nbor.clear();
dev_host_nbor.clear();
dev_host_numj.clear();
host_ilist.clear();
host_jlist.clear();
success=success && (host_nbor.alloc(_max_nbors*_max_host,*dev,
UCL_RW_OPTIMIZED)==UCL_SUCCESS);
success=success && (dev_host_nbor.alloc(_max_nbors*_max_host,
*dev,UCL_WRITE_ONLY)==UCL_SUCCESS);
success=success && (dev_host_numj.alloc(_max_host,*dev,
UCL_WRITE_ONLY)==UCL_SUCCESS);
success=success && (host_ilist.alloc(nt,*dev,UCL_NOT_PINNED)==UCL_SUCCESS);
if (!success)
return;
for (int i=0; i<nt; i++)
host_ilist[i]=i;
success=success && (host_jlist.alloc(_max_host,*dev,
UCL_NOT_PINNED)==UCL_SUCCESS);
if (!success)
return;
int *ptr=host_nbor.begin();
for (int i=0; i<_max_host; i++) {
host_jlist[i]=ptr;
ptr+=_max_nbors;
}
_c_bytes+=dev_host_nbor.row_bytes()+dev_host_numj.row_bytes();
}
if (_maxspecial>0) {
dev_nspecial.clear();
dev_special.clear();
dev_special_t.clear();
int at=_max_atoms+_max_host;
success=success && (dev_nspecial.alloc(3*at,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
success=success && (dev_special.alloc(_maxspecial*at,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
success=success && (dev_special_t.alloc(_maxspecial*at,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
_gpu_bytes+=dev_nspecial.row_bytes()+dev_special.row_bytes()+
dev_special_t.row_bytes();
}
_allocated=true;
}
void PairGPUNbor::clear() {
_gpu_bytes=0.0;
_cell_bytes=0.0;
_c_bytes=0.0;
if (_allocated) {
_allocated=false;
host_packed.clear();
host_acc.clear();
dev_nbor.clear();
dev_host_nbor.clear();
dev_packed.clear();
host_nbor.clear();
dev_host_numj.clear();
host_ilist.clear();
host_jlist.clear();
dev_nspecial.clear();
dev_special.clear();
dev_special_t.clear();
time_kernel.clear();
time_nbor.clear();
}
}
double PairGPUNbor::host_memory_usage() const {
if (_gpu_nbor) {
if (_gpu_host)
return host_nbor.row_bytes()*host_nbor.rows()+host_ilist.row_bytes()+
host_jlist.row_bytes();
else
return 0;
} else
return host_packed.row_bytes()*host_packed.rows()+host_acc.row_bytes()+
sizeof(PairGPUNbor);
}
void PairGPUNbor::get_host(const int inum, int *ilist, int *numj,
int **firstneigh, const int block_size) {
time_nbor.start();
UCL_H_Vec<int> ilist_view;
ilist_view.view(ilist,inum,*dev);
ucl_copy(dev_nbor,ilist_view,false);
UCL_D_Vec<int> nbor_offset;
UCL_H_Vec<int> host_offset;
int copy_count=0;
int ij_count=0;
int acc_count=0;
int dev_count=0;
int *h_ptr=host_packed.begin();
_nbor_pitch=inum;
for (int ii=0; ii<inum; ii++) {
int i=ilist[ii];
int nj=numj[i];
host_acc[ii]=nj;
host_acc[ii+inum]=acc_count;
acc_count+=nj;
int *jlist=firstneigh[i];
for (int jj=0; jj<nj; jj++) {
*h_ptr=jlist[jj];
h_ptr++;
ij_count++;
if (ij_count==IJ_SIZE) {
dev_nbor.sync();
host_offset.view_offset(IJ_SIZE*(copy_count%2),host_packed,IJ_SIZE);
nbor_offset.view_offset(dev_count,dev_packed,IJ_SIZE);
ucl_copy(nbor_offset,host_offset,true);
copy_count++;
ij_count=0;
dev_count+=IJ_SIZE;
h_ptr=host_packed.begin()+(IJ_SIZE*(copy_count%2));
}
}
}
if (ij_count!=0) {
dev_nbor.sync();
host_offset.view_offset(IJ_SIZE*(copy_count%2),host_packed,ij_count);
nbor_offset.view_offset(dev_count,dev_packed,ij_count);
ucl_copy(nbor_offset,host_offset,true);
}
UCL_D_Vec<int> acc_view;
acc_view.view_offset(inum,dev_nbor,inum*2);
ucl_copy(acc_view,host_acc,true);
time_nbor.stop();
if (_use_packing==false) {
time_kernel.start();
int GX=static_cast<int>(ceil(static_cast<double>(inum)/block_size));
_shared->k_nbor.set_size(GX,block_size);
_shared->k_nbor.run(&dev_nbor.begin(), &dev_packed.begin(), &inum);
time_kernel.stop();
}
}
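/* Double-buffering note (illustrative): host_packed was allocated with
   2*IJ_SIZE elements, and copy_count%2 alternates between its halves, so
   the asynchronous ucl_copy of one full half can overlap the CPU packing
   of the other; the partial copy above flushes whatever remains. */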
template <class numtyp, class acctyp>
void PairGPUNbor::build_nbor_list(const int inum, const int host_inum,
const int nall,
PairGPUAtom<numtyp,acctyp> &atom,
double *sublo, double *subhi, int *tag,
int **nspecial, int **special, bool &success,
int &mn) {
const int nt=inum+host_inum;
if (_maxspecial>0) {
time_nbor.start();
UCL_H_Vec<int> view_nspecial, view_special, view_tag;
view_nspecial.view(nspecial[0],nt*3,*dev);
view_special.view(special[0],nt*_maxspecial,*dev);
view_tag.view(tag,nall,*dev);
ucl_copy(dev_nspecial,view_nspecial,nt*3,false);
ucl_copy(dev_special_t,view_special,nt*_maxspecial,false);
ucl_copy(atom.dev_tag,view_tag,nall,false);
time_nbor.stop();
time_nbor.add_to_total();
time_kernel.start();
const int b2x=_block_cell_2d;
const int b2y=_block_cell_2d;
const int g2x=static_cast<int>(ceil(static_cast<double>(_maxspecial)/b2x));
const int g2y=static_cast<int>(ceil(static_cast<double>(nt)/b2y));
_shared->k_transpose.set_size(g2x,g2y,b2x,b2y);
_shared->k_transpose.run(&dev_special.begin(),&dev_special_t.begin(),
&_maxspecial,&nt);
} else
time_kernel.start();
_nbor_pitch=inum;
_shared->neigh_tex.bind_float(atom.dev_x,4);
int ncellx, ncelly, ncellz, ncell_3d;
ncellx = static_cast<int>(ceil(((subhi[0] - sublo[0]) +
2.0*_cell_size)/_cell_size));
ncelly = static_cast<int>(ceil(((subhi[1] - sublo[1]) +
2.0*_cell_size)/_cell_size));
ncellz = static_cast<int>(ceil(((subhi[2] - sublo[2]) +
2.0*_cell_size)/_cell_size));
ncell_3d = ncellx * ncelly * ncellz;
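// Worked example (illustrative): for a subdomain 10.0 wide with
// _cell_size=2.5, ncellx = ceil((10.0 + 2*2.5)/2.5) = 6, i.e. four
// interior cells plus one ghost-cell layer on each side.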
UCL_D_Vec<int> cell_counts;
cell_counts.alloc(ncell_3d+1,dev_nbor);
_cell_bytes=cell_counts.row_bytes();
/* build cell list on GPU */
const int neigh_block=_block_cell_id;
const int GX=(int)ceil((float)nall/neigh_block);
const numtyp sublo0=static_cast<numtyp>(sublo[0]);
const numtyp sublo1=static_cast<numtyp>(sublo[1]);
const numtyp sublo2=static_cast<numtyp>(sublo[2]);
const numtyp subhi0=static_cast<numtyp>(subhi[0]);
const numtyp subhi1=static_cast<numtyp>(subhi[1]);
const numtyp subhi2=static_cast<numtyp>(subhi[2]);
const numtyp cell_size_cast=static_cast<numtyp>(_cell_size);
_shared->k_cell_id.set_size(GX,neigh_block);
_shared->k_cell_id.run(&atom.dev_x.begin(), &atom.dev_cell_id.begin(),
&atom.dev_particle_id.begin(),
&sublo0, &sublo1, &sublo2, &subhi0, &subhi1,
&subhi2, &cell_size_cast, &ncellx, &ncelly, &nall);
atom.sort_neighbor(nall);
/* calculate cell count */
_shared->k_cell_counts.set_size(GX,neigh_block);
_shared->k_cell_counts.run(&atom.dev_cell_id.begin(), &cell_counts.begin(),
&nall, &ncell_3d);
/* build the neighbor list */
const int cell_block=_block_nbor_build;
_shared->k_build_nbor.set_size(ncellx, ncelly*ncellz, cell_block, 1);
_shared->k_build_nbor.run(&atom.dev_x.begin(), &atom.dev_particle_id.begin(),
&cell_counts.begin(), &dev_nbor.begin(),
&dev_host_nbor.begin(), &dev_host_numj.begin(),
&_max_nbors,&cell_size_cast,
&ncellx, &ncelly, &ncellz, &inum, &nt, &nall);
/* Get the maximum number of nbors and realloc if necessary */
UCL_D_Vec<int> numj;
numj.view_offset(inum,dev_nbor,inum);
ucl_copy(host_acc,numj,inum,false);
if (nt>inum) {
UCL_H_Vec<int> host_offset;
host_offset.view_offset(inum,host_acc,nt-inum);
ucl_copy(host_offset,dev_host_numj,nt-inum,false);
}
mn=host_acc[0];
for (int i=1; i<nt; i++)
mn=std::max(mn,host_acc[i]);
if (mn>_max_nbors) {
mn=static_cast<int>(static_cast<double>(mn)*1.10);
dev_nbor.clear();
success=success && (dev_nbor.alloc((mn+1)*_max_atoms,atom.dev_cell_id,
UCL_READ_ONLY)==UCL_SUCCESS);
_gpu_bytes=dev_nbor.row_bytes();
if (_max_host>0) {
host_nbor.clear();
dev_host_nbor.clear();
success=success && (host_nbor.alloc(mn*_max_host,dev_nbor,
UCL_RW_OPTIMIZED)==UCL_SUCCESS);
success=success && (dev_host_nbor.alloc(mn*_max_host,
dev_nbor,UCL_WRITE_ONLY)==UCL_SUCCESS);
int *ptr=host_nbor.begin();
for (int i=0; i<_max_host; i++) {
host_jlist[i]=ptr;
ptr+=mn;
}
_gpu_bytes+=dev_host_nbor.row_bytes();
}
if (_alloc_packed) {
dev_packed.clear();
success=success && (dev_packed.alloc((mn+2)*_max_atoms,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
_gpu_bytes+=dev_packed.row_bytes();
}
if (!success)
return;
_max_nbors=mn;
time_kernel.stop();
time_kernel.add_to_total();
build_nbor_list(inum, host_inum, nall, atom, sublo, subhi, tag, nspecial,
special, success, mn);
return;
}
if (_maxspecial>0) {
const int GX2=static_cast<int>(ceil(static_cast<double>(nt)/cell_block));
_shared->k_special.set_size(GX2,cell_block);
_shared->k_special.run(&dev_nbor.begin(), &dev_host_nbor.begin(),
&dev_host_numj.begin(), &atom.dev_tag.begin(),
&dev_nspecial.begin(), &dev_special.begin(),
&inum, &nt, &_max_nbors);
}
time_kernel.stop();
time_nbor.start();
if (_gpu_host)
ucl_copy(host_nbor,dev_host_nbor,false);
time_nbor.stop();
}
template void PairGPUNbor::build_nbor_list<PRECISION,ACC_PRECISION>
(const int inum, const int host_inum, const int nall,
PairGPUAtom<PRECISION,ACC_PRECISION> &atom, double *sublo, double *subhi,
int *, int **, int **, bool &success, int &mn);

View File

@ -1,204 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef PAIR_GPU_NBOR_H
#define PAIR_GPU_NBOR_H
#include "pair_gpu_atom.h"
#include "pair_gpu_nbor_shared.h"
#define IJ_SIZE 131072
#ifdef USE_OPENCL
#include "geryon/ocl_timer.h"
#include "geryon/ocl_mat.h"
using namespace ucl_opencl;
#else
#include "geryon/nvd_timer.h"
#include "geryon/nvd_mat.h"
using namespace ucl_cudadr;
#endif
class PairGPUNbor {
public:
PairGPUNbor() : _allocated(false), _use_packing(false) {}
~PairGPUNbor() { clear(); }
/// Determine whether neighbor unpacking should be used
/** If false, twice as much memory is reserved to allow unpacking neighbors by
* atom for coalesced access. **/
void packing(const bool use_packing) { _use_packing=use_packing; }
/// Clear any old data and setup for new LAMMPS run
/** \param inum Initial number of particles whose neighbors stored on device
* \param host_inum Initial number of particles whose nbors copied to host
* \param max_nbors Initial number of rows in the neighbor matrix
* \param gpu_nbor True if device will perform neighboring
* \param gpu_host 0 if host will not perform force calculations,
* 1 if gpu_nbor is true, and host needs a half nbor list,
* 2 if gpu_nbor is true, and host needs a full nbor list
* \param pre_cut True if cutoff test will be performed in separate kernel
* than the force kernel **/
bool init(PairGPUNborShared *shared, const int inum, const int host_inum,
const int max_nbors, const int maxspecial, UCL_Device &dev,
const bool gpu_nbor, const int gpu_host, const bool pre_cut,
const int block_cell_2d, const int block_cell_id,
const int block_nbor_build);
/// Set the size of the cutoff+skin
inline void cell_size(const double size) { _cell_size=size; }
/// Get the size of the cutoff+skin
inline double cell_size() const { return _cell_size; }
/// Check if there is enough memory for neighbor data and realloc if not
/** \param inum Number of particles whose nbors will be stored on device
* \param max_nbor Current max number of neighbors for a particle
* \param success False if insufficient memory **/
inline void resize(const int inum, const int max_nbor, bool &success) {
if (inum>_max_atoms || max_nbor>_max_nbors) {
_max_atoms=static_cast<int>(static_cast<double>(inum)*1.10);
if (max_nbor>_max_nbors)
_max_nbors=static_cast<int>(static_cast<double>(max_nbor)*1.10);
alloc(success);
}
}
/// Check if there is enough memory for neighbor data and realloc if not
/** \param inum Number of particles whose nbors will be stored on device
* \param host_inum Number of particles whose nbors will be copied to host
* \param max_nbor Current max number of neighbors for a particle
* \param success False if insufficient memory **/
inline void resize(const int inum, const int host_inum, const int max_nbor,
bool &success) {
if (inum>_max_atoms || max_nbor>_max_nbors || host_inum>_max_host) {
_max_atoms=static_cast<int>(static_cast<double>(inum)*1.10);
_max_host=static_cast<int>(static_cast<double>(host_inum)*1.10);
if (max_nbor>_max_nbors)
_max_nbors=static_cast<int>(static_cast<double>(max_nbor)*1.10);
alloc(success);
}
}
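// Growth note (illustrative): both resize() overloads keep 10% slack
// (the 1.10 factors, matching init()) so small fluctuations in atom or
// neighbor counts between reneighborings do not force a reallocation
// every step.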
/// Free all memory on host and device
void clear();
/// Bytes per atom used on device
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by class
double host_memory_usage() const;
/// True if neighboring performed on GPU
inline bool gpu_nbor() const { return _gpu_nbor; }
/// Make a copy of unpacked nbor lists in the packed storage area (for gb)
inline void copy_unpacked(const int inum, const int maxj)
{ ucl_copy(dev_packed,dev_nbor,inum*(maxj+2),true); }
/// Copy neighbor list from host (first time or from a rebuild)
void get_host(const int inum, int *ilist, int *numj,
int **firstneigh, const int block_size);
/// Return the stride in elements for each nbor row
inline int nbor_pitch() const { return _nbor_pitch; }
/// Return the maximum number of atoms that can currently be stored
inline int max_atoms() const { return _max_atoms; }
/// Return the maximum number of nbors for a particle based on current alloc
inline int max_nbors() const { return _max_nbors; }
/// Loop through neighbor count array and return maximum nbors for a particle
inline int max_nbor_loop(const int inum, int *numj, int *ilist) const {
int mn=0;
for (int i=0; i<inum; i++)
mn=std::max(mn,numj[ilist[i]]);
return mn;
}
/// Build nbor list on the device
template <class numtyp, class acctyp>
void build_nbor_list(const int inum, const int host_inum, const int nall,
PairGPUAtom<numtyp,acctyp> &atom, double *sublo,
double *subhi, int *tag, int **nspecial, int **special,
bool &success, int &max_nbors);
/// Return the number of bytes used on device
inline double gpu_bytes() {
double res = _gpu_bytes + _c_bytes + _cell_bytes;
if (_gpu_nbor==false)
res += 2*IJ_SIZE*sizeof(int);
return res;
}
// ------------------------------- Data -------------------------------
/// Device neighbor matrix
/** - 1st row is i (index into atom data)
* - 2nd row is numj (number of neighbors)
* - 3rd row is starting location in packed nbors
* - Remaining rows are the neighbors arranged for coalesced access **/
UCL_D_Vec<int> dev_nbor;
/// Packed storage for neighbor lists copied from host
UCL_D_Vec<int> dev_packed;
/// Host buffer for copying neighbor lists
UCL_H_Vec<int> host_packed;
/// Host storage for nbor counts (row 1) & accumulated neighbor counts (row 2)
UCL_H_Vec<int> host_acc;
// ----------------- Data for GPU Neighbor Calculation ---------------
/// Host storage for device calculated neighbor lists
/** Same storage format as device matrix **/
UCL_H_Vec<int> host_nbor;
/// Device storage for neighbor list matrix that will be copied to host
/** - 1st row is numj
* - Remaining rows are by atom, columns are nbors **/
UCL_D_Vec<int> dev_host_nbor;
UCL_D_Vec<int> dev_host_numj;
UCL_H_Vec<int> host_ilist;
UCL_H_Vec<int*> host_jlist;
/// Device storage for special neighbor counts
UCL_D_Vec<int> dev_nspecial;
/// Device storage for special neighbors
UCL_D_Vec<int> dev_special, dev_special_t;
/// Device timers
UCL_Timer time_nbor, time_kernel;
private:
PairGPUNborShared *_shared;
UCL_Device *dev;
bool _allocated, _use_packing;
int _max_atoms, _max_nbors, _max_host, _nbor_pitch, _maxspecial;
bool _gpu_nbor, _gpu_host, _alloc_packed;
double _cell_size;
double _gpu_bytes, _c_bytes, _cell_bytes;
void alloc(bool &success);
int _block_cell_2d, _block_cell_id, _block_nbor_build;
};
#endif

View File

@ -1,46 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifdef NV_KERNEL
#include "geryon/ucl_nv_kernel.h"
#else
#define GLOBAL_ID_X get_global_id(0)
#endif
__kernel void kernel_unpack(__global int *dev_nbor, __global int *dev_ij,
const int inum) {
// ii indexes the particle whose neighbor list is being unpacked
int ii=GLOBAL_ID_X;
if (ii<inum) {
__global int *nbor=dev_nbor+ii+inum;
int numj=*nbor;
nbor+=inum;
__global int *list=dev_ij+*nbor;
__global int *list_end=list+numj;
for ( ; list<list_end; list++) {
*nbor=*list;
nbor+=inum;
}
} // if ii
}
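/* Worked layout example (illustrative): with inum=2, the unpacked
   neighbors of particle ii=0 are written to dev_nbor[2*inum+0],
   dev_nbor[3*inum+0], ..., one column per particle, so threads handling
   consecutive ii read neighbor k from consecutive addresses (coalesced
   access). */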

View File

@ -1,71 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#include "pair_gpu_nbor_shared.h"
#ifdef USE_OPENCL
#include "pair_gpu_nbor_cl.h"
#else
#include "pair_gpu_nbor_ptx.h"
#include "pair_gpu_build_ptx.h"
#endif
void PairGPUNborShared::clear() {
if (_compiled) {
if (_gpu_nbor) {
k_cell_id.clear();
k_cell_counts.clear();
k_build_nbor.clear();
k_transpose.clear();
k_special.clear();
delete build_program;
} else {
k_nbor.clear();
delete nbor_program;
}
_compiled=false;
}
}
void PairGPUNborShared::compile_kernels(UCL_Device &dev, const bool gpu_nbor) {
if (_compiled)
return;
_gpu_nbor=gpu_nbor;
std::string flags="-cl-fast-relaxed-math -cl-mad-enable";
if (gpu_nbor==false) {
nbor_program=new UCL_Program(dev);
nbor_program->load_string(pair_gpu_nbor_kernel,flags.c_str());
k_nbor.set_function(*nbor_program,"kernel_unpack");
} else {
build_program=new UCL_Program(dev);
#ifdef USE_OPENCL
std::cerr << "CANNOT CURRENTLY USE GPU NEIGHBORING WITH OPENCL\n";
exit(1);
#else
build_program->load_string(pair_gpu_build_kernel,flags.c_str());
#endif
k_cell_id.set_function(*build_program,"calc_cell_id");
k_cell_counts.set_function(*build_program,"kernel_calc_cell_counts");
k_build_nbor.set_function(*build_program,"calc_neigh_list_cell");
k_transpose.set_function(*build_program,"transpose");
k_special.set_function(*build_program,"kernel_special");
neigh_tex.get_texture(*build_program,"neigh_tex");
}
_compiled=true;
}

View File

@ -1,58 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef PAIR_GPU_NBOR_SHARED_H
#define PAIR_GPU_NBOR_SHARED_H
#ifdef USE_OPENCL
#include "geryon/ocl_kernel.h"
#include "geryon/ocl_texture.h"
using namespace ucl_opencl;
#else
#include "geryon/nvd_kernel.h"
#include "geryon/nvd_texture.h"
using namespace ucl_cudadr;
#endif
class PairGPUNborShared {
public:
PairGPUNborShared() : _compiled(false) {}
~PairGPUNborShared() { clear(); }
/// Free all memory on host and device
void clear();
/// Texture for cached position/type access with CUDA
UCL_Texture neigh_tex;
/// Compile kernels for neighbor lists
void compile_kernels(UCL_Device &dev, const bool gpu_nbor);
// ----------------------------- Kernels
UCL_Program *nbor_program, *build_program;
UCL_Kernel k_nbor, k_cell_id, k_cell_counts, k_build_nbor;
UCL_Kernel k_transpose, k_special;
private:
bool _compiled, _gpu_nbor;
};
#endif

View File

@ -1,90 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef PAIR_PRECISION_H
#define PAIR_PRECISION_H
struct _lgpu_float2 {
float x; float y;
};
struct _lgpu_float4 {
float x; float y; float z; float w;
};
struct _lgpu_double2 {
double x; double y;
};
struct _lgpu_double4 {
double x; double y; double z; double w;
};
#include <iostream>
inline std::ostream & operator<<(std::ostream &out, const _lgpu_float2 &v) {
out << v.x << " " << v.y;
return out;
}
inline std::ostream & operator<<(std::ostream &out, const _lgpu_float4 &v) {
out << v.x << " " << v.y << " " << v.z;
return out;
}
inline std::ostream & operator<<(std::ostream &out, const _lgpu_double2 &v) {
out << v.x << " " << v.y;
return out;
}
inline std::ostream & operator<<(std::ostream &out, const _lgpu_double4 &v) {
out << v.x << " " << v.y << " " << v.z;
return out;
}
// PRECISION - Precision for rsq, energy, force, and torque calculation
// ACC_PRECISION - Precision for accumulation of energies, forces, and torques
#ifdef _SINGLE_DOUBLE
#define OCL_PRECISION_COMPILE "-D_SINGLE_DOUBLE"
#define PRECISION float
#define ACC_PRECISION double
#define numtyp2 _lgpu_float2
#define numtyp4 _lgpu_float4
#define acctyp4 _lgpu_double4
#endif
#ifdef _DOUBLE_DOUBLE
#define OCL_PRECISION_COMPILE "-D_DOUBLE_DOUBLE"
#define PRECISION double
#define ACC_PRECISION double
#define numtyp2 _lgpu_double2
#define numtyp4 _lgpu_double4
#define acctyp4 _lgpu_double4
#endif
#ifndef PRECISION
#define OCL_PRECISION_COMPILE "-D_SINGLE_SINGLE"
#define PRECISION float
#define ACC_PRECISION float
#define numtyp2 _lgpu_float2
#define numtyp4 _lgpu_float4
#define acctyp4 _lgpu_float4
#endif
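// Selection example (illustrative): building with -D_SINGLE_DOUBLE gives
// PRECISION=float for the pairwise math and ACC_PRECISION=double for the
// accumulators ("Mixed Precision" in the device init message); with no
// flag the fallback below is all-float ("Single Precision").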
enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
#endif

View File

@ -1,82 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#include <cstdio>
#include <cassert>
#ifndef USE_OPENCL
#include "cudpp.h"
#endif
class PairWinSort {
public:
inline PairWinSort() : _allocated(false) {
#ifndef USE_OPENCL
sort_config.op = CUDPP_ADD;
sort_config.datatype = CUDPP_UINT;
sort_config.algorithm = CUDPP_SORT_RADIX;
sort_config.options = CUDPP_OPTION_KEY_VALUE_PAIRS;
#endif
}
inline ~PairWinSort() { clear(); }
/// Free all memory on host and device
inline void clear() {
#ifndef USE_OPENCL
if (_allocated) { cudppDestroyPlan(sort_plan); _allocated=false; }
#endif
}
inline bool alloc(const int max_atoms) {
#ifndef USE_OPENCL
clear();
CUDPPResult result = cudppPlan(&sort_plan, sort_config, max_atoms, 1, 0);
if (CUDPP_SUCCESS != result)
return false;
#endif
return true;
}
/// Sort arrays for neighbor list calculation
void sort_neighbor(const int num_atoms, unsigned *cell_begin, int *particle_begin) {
#ifndef USE_OPENCL
CUDPPResult result = cudppSort(sort_plan, cell_begin, particle_begin,
8*sizeof(unsigned), num_atoms);
if (CUDPP_SUCCESS != result) {
printf("Error in cudppSort\n");
assert(1==0);
}
#endif
}
private:
bool _allocated;
#ifndef USE_OPENCL
CUDPPConfiguration sort_config;
CUDPPHandle sort_plan;
#endif
};
static PairWinSort win_sort;
extern "C" __declspec(dllexport) bool _win_sort_alloc(const int max_atoms) {
win_sort.alloc(max_atoms);
}
extern "C" __declspec(dllexport) bool _win_sort(const int max_atoms, unsigned *cell_begin,
int *particle_begin) {
win_sort.sort(num_atoms,cell_begin,particle_begin);
}

Some files were not shown because too many files have changed in this diff.